update

tcsvn · tcsvn · commit d3cf6300d02f · 2023-08-14T20:30:56.000+02:00
diff --git a/pe2 copy.ipynb b/pe2 copy.ipynb
diff --git a/pe2.ipynb b/pe2.ipynb
diff --git a/pyadlml/dataset/_representations/lastfired.py b/pyadlml/dataset/_representations/lastfired.py
@@ -20,7 +20,7 @@ def resample_last_fired(df_devs, dt=None, n_jobs=None):
 
     """
     use_dask = n_jobs is not None
-    df = df_devs.sort_values(by=TIME).copy()
+    df = df_devs.sort_values(by=TIME).copy().reset_index(drop=True)
     origin = df.at[0, TIME].floor(freq=dt)
 
     # Only keep last device to have fired in a bin
diff --git a/pyadlml/dataset/_representations/state.py b/pyadlml/dataset/_representations/state.py
@@ -59,9 +59,12 @@ def create_state(df_dev, dataset_info=None, dev_pre_values={}, n_jobs=None):
     for dev in dev_bool:
         fvi = df[dev].first_valid_index()
         if fvi != 0:
-            if dev_pre_values:
-                df.loc[0, dev] = dev_pre_values[dev]
-            else:
+            try:
+                if dev_pre_values:
+                    df.loc[0, dev] = dev_pre_values[dev]
+                else:
+                    raise
+            except:
                 value = df[dev].iloc[fvi]
                 df.loc[0, dev] = not value
 
diff --git a/pyadlml/dataset/plot/plotly/discrete.py b/pyadlml/dataset/plot/plotly/discrete.py
@@ -396,7 +396,8 @@ def acts_and_devs(X, y_true=None, y_pred=None, y_conf=None, act_order=None, dev_
         device_order = dev_order
 
     if act_order is None:
-        act_order = list({*np.unique(y_pred).tolist(), *y_true[ACTIVITY].unique().tolist()})
+        y_pred_acts = np.unique(y_pred).tolist() if y_pred is not None else {}
+        act_order = list({*y_pred_acts, *y_true[ACTIVITY].unique().tolist()})
         act_order.sort()
 
     error_text = 'parameter act_order has to be set if y_conf is given'
diff --git a/pyadlml/metrics.py b/pyadlml/metrics.py
@@ -167,11 +167,9 @@ def _slice_categorical_stream(df1, df2, first_ts=None, last_ts=None):
 
     if first_ts is None:
         first_ts = max(df2.loc[0, TIME], df1.loc[0, TIME])
-        first_ts -= pd.Timedelta('1ms')
     if last_ts is None:
         last_ts = min(df2.loc[df2.index[-1], TIME],
                       df1.loc[df1.index[-1], TIME])
-        last_ts += pd.Timedelta('1ms')
 
     df1_tmp = df1.copy()
     df2_tmp = df2.copy()
@@ -189,8 +187,8 @@ def _slice_categorical_stream(df1, df2, first_ts=None, last_ts=None):
     df1[ACTIVITY] = df1[ACTIVITY].ffill()
     df2[ACTIVITY] = df2[ACTIVITY].ffill()
 
-    df1 = df1[(first_ts < df1[TIME]) & (df1[TIME] < last_ts)]
-    df2 = df2[(first_ts < df2[TIME]) & (df2[TIME] < last_ts)]
+    df1 = df1[(first_ts <= df1[TIME]) & (df1[TIME] <= last_ts)]
+    df2 = df2[(first_ts <= df2[TIME]) & (df2[TIME] <= last_ts)]
 
     df = df1.copy()
     df[lbl2_col] = df2[ACTIVITY]
@@ -211,7 +209,7 @@ def online_confusion_matrix(y_true: pd.DataFrame=None, y_pred: np.ndarray=None,
     Parameters
     ----------
     y_true : pd.DataFrame
-    y_true : pd.DataFrame
+    y_pred : np.ndarray
     times : pd.DataFrame
     df : pd.DataFrame
         The already prepared dataframe     
@@ -299,7 +297,7 @@ def add_other(df_acts, add_offset=False):
 
 
 
-def _prepare_cat_stream(y_true: pd.DataFrame, y_pred: np.ndarray, times:np.ndarray) -> pd.DataFrame:
+def _prepare_cat_stream(y_true: pd.DataFrame, y_pred: np.ndarray, y_times:np.ndarray) -> pd.DataFrame:
     """
 
     CAVE add the 'other' activity
@@ -313,6 +311,10 @@ def _prepare_cat_stream(y_true: pd.DataFrame, y_pred: np.ndarray, times:np.ndarr
     times : np.ndarray datetime64[ns], shape (N, )
         Contains the times the predictions where made
 
+    Attention
+    ---------
+    The last prediction is not included since for the last prediction the duration is not known.
+
     Example
     -------
                                      time     y_pred   y_true                   diff  y_pred_idx
@@ -332,9 +334,9 @@ def _prepare_cat_stream(y_true: pd.DataFrame, y_pred: np.ndarray, times:np.ndarr
 
     if is_activity_df(y_true):
 
-        y_pred, times = y_pred.squeeze(), times.squeeze()
+        y_pred, y_times = y_pred.squeeze(), y_times.squeeze()
 
-        df_y_pred = pd.DataFrame({TIME: times, 'y_pred': y_pred})
+        df_y_pred = pd.DataFrame({TIME: y_times, 'y_pred': y_pred})
         df_y_pred = df_y_pred.sort_values(by=TIME)[[TIME, 'y_pred']] \
                              .reset_index(drop=True)
 
@@ -352,12 +354,13 @@ def _prepare_cat_stream(y_true: pd.DataFrame, y_pred: np.ndarray, times:np.ndarr
 
 
         # Clip Ground truth to predictions or pad GT with other such
-        # That both series start and end at the same time
+        # that the ground truth envelops the predictions by an epsilon amount of time
         df_sel_y_true, df_sel_y_pred = df_y_true.copy(), df_y_pred.copy()
         if df_sel_y_pred[TIME].iat[-1] < df_sel_y_true[TIME].iat[-1]:
             # Preds end before GT -> clip GT to preds
             mask = (df_sel_y_true[TIME] < df_sel_y_pred[TIME].iat[-1]).shift(fill_value=True)
             df_sel_y_true = df_sel_y_true[mask].reset_index(drop=True)
+            df_sel_y_true[TIME].iat[-1] = df_sel_y_pred[TIME].iat[-1] + epsilon
         else:
             # GT ends before preds -> add 'other' activity to GT
             df_sel_y_true = pd.concat([df_sel_y_true, pd.DataFrame({
@@ -372,7 +375,7 @@ def _prepare_cat_stream(y_true: pd.DataFrame, y_pred: np.ndarray, times:np.ndarr
                 'y_true': [OTHER]
             }), df_sel_y_true]).reset_index(drop=True)
             clipped_true_to_preds = False
-        else:
+        else: 
             # GT starts before Preds -> clip GT to preds
             mask = (df_sel_y_pred[TIME].iat[0] < df_sel_y_true[TIME]).shift(-1, fill_value=True)
             df_sel_y_true = df_sel_y_true[mask].reset_index(drop=True)
@@ -382,17 +385,16 @@ def _prepare_cat_stream(y_true: pd.DataFrame, y_pred: np.ndarray, times:np.ndarr
         df = _slice_categorical_stream(df_sel_y_pred,  df_sel_y_true)
 
     else:
-        y_true, y_pred, times = y_true.squeeze(), y_pred.squeeze(), times.squeeze()
+        y_true, y_pred, y_times = y_true.squeeze(), y_pred.squeeze(), y_times.squeeze()
 
-        df = pd.DataFrame(data=[times, y_true, y_pred],
+        df = pd.DataFrame(data=[y_times, y_true, y_pred],
                           index=[TIME, 'y_true', 'y_pred']).T
         df[TIME] = pd.to_datetime(df[TIME])
         raise
 
     df['diff'] = df[TIME].shift(-1) - df[TIME]
     # Remove last prediction since there is no td and remove first if GT was clipped 
-    s_idx = 1 if clipped_true_to_preds else 0
-    df = df.iloc[s_idx:-1]
+    df = df.iloc[:-1]
     df.reset_index(inplace=True)
 
     # Create the new column using the index from df_y_pred
@@ -410,6 +412,8 @@ def _prepare_cat_stream(y_true: pd.DataFrame, y_pred: np.ndarray, times:np.ndarr
         assert prev_idx > 0
         df['y_true_idx'] = df['y_true_idx'].fillna(prev_idx-1)\
                                            .astype(int)
+    else:
+        df['y_true_idx'] = df['y_true_idx'].astype(int)
 
     return df.drop(columns=['index'])
 
@@ -424,18 +428,27 @@ def online_max_calibration_error(y_true, y_pred, y_conf, y_times, num_bins):
     return bin_data['max_calibration_error']
 
 
-def relative_rate(df_y_true: pd.DataFrame, y_pred:np.ndarray, y_times: np.ndarray, average: str ='micro'):
-    """ Calculates how often 
+def relative_prediction_rate(y_true: pd.DataFrame, y_pred:np.ndarray, y_times: np.ndarray, average: str ='micro'):
+    """ Calculates how often a prediction changes over the course of a true activity.
 
     Parameters
     ----------
-    df_y_true: pd.DataFrame
+    y_true: pd.DataFrame
+        An activity dataframe with columns ['start_time', 'end_time', 'activity']
+    y_pred: np.ndarray
+        The predicted activity labels.
+    y_times: np.ndarray
+        The times at which the predictions were made.
+    average: str, one of ['micro', 'macro'], default='micro'
+
+    Returns
+    ------- 
 
     """
 
     assert average in ['micro', 'macro']
 
-    df = _prepare_cat_stream(df_y_true, y_pred, y_times)
+    df = _prepare_cat_stream(y_true, y_pred, y_times)
     df['y_pred_changes'] = (df['y_pred'] != df['y_pred'].shift())\
                          & (df['y_true'] == df['y_true'].shift())
     counts_per_activity = df.groupby(['y_true', 'y_true_idx'])['y_pred_changes']\
diff --git a/pyadlml/preprocessing/__init__.py b/pyadlml/preprocessing/__init__.py
@@ -6,7 +6,7 @@
     DropTimeIndex, \
     LabelMatcher, \
     DropDuplicates,\
-    Timestamp2Seqtime, \
+    Time2UnitFromOrigin, \
     KeepOnlyDevices, \
     DropDevices, \
     Identity, \
diff --git a/pyadlml/preprocessing/preprocessing.py b/pyadlml/preprocessing/preprocessing.py
@@ -294,7 +294,7 @@ def to_dataframe(self, X, onehot=False):
         raise NotImplementedError
 
 
-class IndexEncoder():
+class IndexEncoder(BaseEstimator):
 
     def __repr__(self):
         return f'{self.__class__.__name__}()'
@@ -303,6 +303,7 @@ def fit(self, X, y=None):
         self.n_features_in_ = 3
         self.lbl_enc = preprocessing.LabelEncoder()
         self.lbl_enc.fit(X[DEVICE])
+        self.n_indices_out_ = len(self.lbl_enc.classes_)
 
     def transform(self, X, y=None):
         X[DEVICE] = self.lbl_enc.transform(X[DEVICE])
@@ -739,34 +740,54 @@ def transform(self, X, y):
         return X.loc[idxs, :], y
 
 
-class Timestamp2Seqtime(BaseEstimator, TransformerMixin, XOrYTransformer, FinalTimeTransformer):
-    def __init__(self, dt='s'):
+class Time2UnitFromOrigin(BaseEstimator, TransformerMixin, XOrYTransformer, FinalTimeTransformer):
+
+    SEC2UNIT_FACTOR = dict(
+        ms=1e-3,
+        s=1,
+        m=60,
+        h=3600,
+        d=3600*24,
+    )
+
+    def __init__(self, unit='s', round_to='D'):
         super().__init__()
-        self.dt = dt
+        self.unit = unit
+        self.round_to = round_to
 
     @XOrYTransformer.x_or_y_transform
     def fit_transform(self, X, y=None):
         X, y = self.fit(X, y)
         return self.transform(X, y)
 
     def fit(self, X, y=None):
-        assert self.dt in ['ms', 's', 'm', 'h']
+        assert self.unit in ['ms', 's', 'm', 'h', 'd']
+        self.orig_time_ = X[TIME].iloc[0].floor(self.round_to)
         return X, y
 
+    def _td2num(self, x):
+        if isinstance(x, str):
+            x = pd.Timedelta(x)
+        elif isinstance(x, pd.Series):
+            x = x.dt
+        x_sec  = x.total_seconds()
+        return x_sec*self.SEC2UNIT_FACTOR[self.unit]
+
+    def _num2td(self, x):
+        x_sec = (1/self.SEC2UNIT_FACTOR[self.unit])*x
+        return pd.Timedelta('1s')*x_sec
+
+
+
     def transform(self, X, y=None):
         """ Change every timestamp into unit i.e. seconds relative
             to the first timestamp in the sequence.
         """
-        self.time = X[TIME]
-        X[TIME] -= X[TIME].iloc[0]
-        if self.dt == 'ms':
-            X[TIME] = X[TIME]/pd.Timedelta('1milli')
-        elif self.dt == 's':
-            X[TIME] = X[TIME]/pd.Timedelta('1sec')
-        elif self.dt == 'm' or self.dt == 'min':
-            X[TIME] = X[TIME]/pd.Timedelta('1min')
-        elif self.dt == 'h':
-            X[TIME] = X[TIME]/pd.Timedelta('1hr')
+        assert self.orig_time_ <= X[TIME].iloc[0]
+        self.times_ = X[TIME].copy()
+        td = (X[TIME] - self.orig_time_)
+        X[TIME] = self._td2num(td)
+
         return X, y
 
 
@@ -1169,10 +1190,24 @@ def plotly_waves(self, end='06:30:00', res='200ms'):
         )
         return fig
 
+
     def plotly_wave_length_per_dim(self):
         from plotly import graph_objects as go
+        import numpy as np
         fig = go.Figure()
-        x = self._num2td(self.get_periods())
-        x += pd.Timestamp('01.01.2000 00:00:00')
-        fig.add_trace(go.Scatter(y=np.arange(0, self.d_dim), x=x))
-        return fig
+        ws = self.get_angular_freqs()
+        f = ws/(2*np.pi)
+        lmbd = 1/f
+        periods = self._num2td(lmbd)
+        print(periods.dtype)
+        x = pd.Timestamp('01.01.2000 00:00:00') + periods
+        fig.add_trace(
+            go.Scatter(
+                y=np.arange(0, self.d_dim), 
+                x=x,
+                customdata=pd.to_timedelta(periods).map(str),
+                hovertemplate="Dim: %{y}<br>Period: %{customdata}<br><extra></extra>"
+            )
+        )
+        fig.update_layout(title='Period per dim')
+        return fig
diff --git a/pyadlml/preprocessing/windows.py b/pyadlml/preprocessing/windows.py
@@ -1006,7 +1006,7 @@ def plot_winsize_vs_lengths(cls, X, y, window_sizes=None, z_scale='linear', y_sc
         dur_min = float('inf') 
         dur_max = float('-inf')
         for s in window_sizes:
-            Xt = cls(window_size=s).fit_transform(x_times)
+            Xt = cls(window_size=s, return_view=False).fit_transform(x_times)
             # Get normalized time lengths
             Xt_dur = (Xt[:,-1] - Xt[:,0])/np.timedelta64(1, time_unit)