Skip to content

Commit d3cf630

Browse files
committed
update
1 parent d02839d commit d3cf630

File tree

9 files changed

+3541
-906
lines changed

9 files changed

+3541
-906
lines changed

pe2 copy.ipynb

+1,724
Large diffs are not rendered by default.

pe2.ipynb

+1,721-862
Large diffs are not rendered by default.

pyadlml/dataset/_representations/lastfired.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -20,7 +20,7 @@ def resample_last_fired(df_devs, dt=None, n_jobs=None):
2020
2121
"""
2222
use_dask = n_jobs is not None
23-
df = df_devs.sort_values(by=TIME).copy()
23+
df = df_devs.sort_values(by=TIME).copy().reset_index(drop=True)
2424
origin = df.at[0, TIME].floor(freq=dt)
2525

2626
# Only keep last device to have fired in a bin

pyadlml/dataset/_representations/state.py

+6-3
Original file line numberDiff line numberDiff line change
@@ -59,9 +59,12 @@ def create_state(df_dev, dataset_info=None, dev_pre_values={}, n_jobs=None):
5959
for dev in dev_bool:
6060
fvi = df[dev].first_valid_index()
6161
if fvi != 0:
62-
if dev_pre_values:
63-
df.loc[0, dev] = dev_pre_values[dev]
64-
else:
62+
try:
63+
if dev_pre_values:
64+
df.loc[0, dev] = dev_pre_values[dev]
65+
else:
66+
raise
67+
except:
6568
value = df[dev].iloc[fvi]
6669
df.loc[0, dev] = not value
6770

pyadlml/dataset/plot/plotly/discrete.py

+2-1
Original file line numberDiff line numberDiff line change
@@ -396,7 +396,8 @@ def acts_and_devs(X, y_true=None, y_pred=None, y_conf=None, act_order=None, dev_
396396
device_order = dev_order
397397

398398
if act_order is None:
399-
act_order = list({*np.unique(y_pred).tolist(), *y_true[ACTIVITY].unique().tolist()})
399+
y_pred_acts = np.unique(y_pred).tolist() if y_pred is not None else {}
400+
act_order = list({*y_pred_acts, *y_true[ACTIVITY].unique().tolist()})
400401
act_order.sort()
401402

402403
error_text = 'parameter act_order has to be set if y_conf is given'

pyadlml/metrics.py

+31-18
Original file line numberDiff line numberDiff line change
@@ -167,11 +167,9 @@ def _slice_categorical_stream(df1, df2, first_ts=None, last_ts=None):
167167

168168
if first_ts is None:
169169
first_ts = max(df2.loc[0, TIME], df1.loc[0, TIME])
170-
first_ts -= pd.Timedelta('1ms')
171170
if last_ts is None:
172171
last_ts = min(df2.loc[df2.index[-1], TIME],
173172
df1.loc[df1.index[-1], TIME])
174-
last_ts += pd.Timedelta('1ms')
175173

176174
df1_tmp = df1.copy()
177175
df2_tmp = df2.copy()
@@ -189,8 +187,8 @@ def _slice_categorical_stream(df1, df2, first_ts=None, last_ts=None):
189187
df1[ACTIVITY] = df1[ACTIVITY].ffill()
190188
df2[ACTIVITY] = df2[ACTIVITY].ffill()
191189

192-
df1 = df1[(first_ts < df1[TIME]) & (df1[TIME] < last_ts)]
193-
df2 = df2[(first_ts < df2[TIME]) & (df2[TIME] < last_ts)]
190+
df1 = df1[(first_ts <= df1[TIME]) & (df1[TIME] <= last_ts)]
191+
df2 = df2[(first_ts <= df2[TIME]) & (df2[TIME] <= last_ts)]
194192

195193
df = df1.copy()
196194
df[lbl2_col] = df2[ACTIVITY]
@@ -211,7 +209,7 @@ def online_confusion_matrix(y_true: pd.DataFrame=None, y_pred: np.ndarray=None,
211209
Parameters
212210
----------
213211
y_true : pd.DataFrame
214-
y_true : pd.DataFrame
212+
y_true : np.ndarray
215213
times : pd.DataFrame
216214
df : pd.DataFrame
217215
The already prepared dataframe
@@ -299,7 +297,7 @@ def add_other(df_acts, add_offset=False):
299297

300298

301299

302-
def _prepare_cat_stream(y_true: pd.DataFrame, y_pred: np.ndarray, times:np.ndarray) -> pd.DataFrame:
300+
def _prepare_cat_stream(y_true: pd.DataFrame, y_pred: np.ndarray, y_times:np.ndarray) -> pd.DataFrame:
303301
"""
304302
305303
CAVE add the 'other' activity
@@ -313,6 +311,10 @@ def _prepare_cat_stream(y_true: pd.DataFrame, y_pred: np.ndarray, times:np.ndarr
313311
times : np.ndarray datetime64[ns], shape (N, )
314312
Contains the times the predictions where made
315313
314+
Attention
315+
---------
316+
The last prediction is not included since for the last prediction the duration is not known.
317+
316318
Example
317319
-------
318320
time y_pred y_true diff y_pred_idx
@@ -332,9 +334,9 @@ def _prepare_cat_stream(y_true: pd.DataFrame, y_pred: np.ndarray, times:np.ndarr
332334

333335
if is_activity_df(y_true):
334336

335-
y_pred, times = y_pred.squeeze(), times.squeeze()
337+
y_pred, y_times = y_pred.squeeze(), y_times.squeeze()
336338

337-
df_y_pred = pd.DataFrame({TIME: times, 'y_pred': y_pred})
339+
df_y_pred = pd.DataFrame({TIME: y_times, 'y_pred': y_pred})
338340
df_y_pred = df_y_pred.sort_values(by=TIME)[[TIME, 'y_pred']] \
339341
.reset_index(drop=True)
340342

@@ -352,12 +354,13 @@ def _prepare_cat_stream(y_true: pd.DataFrame, y_pred: np.ndarray, times:np.ndarr
352354

353355

354356
# Clip Ground truth to predictions or pad GT with other such
355-
# That both series start and end at the same time
357+
# that the ground truth envelopes the predictions by epsilon amount of time
356358
df_sel_y_true, df_sel_y_pred = df_y_true.copy(), df_y_pred.copy()
357359
if df_sel_y_pred[TIME].iat[-1] < df_sel_y_true[TIME].iat[-1]:
358360
# Preds end before GT -> clip GT to preds
359361
mask = (df_sel_y_true[TIME] < df_sel_y_pred[TIME].iat[-1]).shift(fill_value=True)
360362
df_sel_y_true = df_sel_y_true[mask].reset_index(drop=True)
363+
df_sel_y_true[TIME].iat[-1] = df_sel_y_pred[TIME].iat[-1] + epsilon
361364
else:
362365
# GT ends before preds -> add 'other' activity to GT
363366
df_sel_y_true = pd.concat([df_sel_y_true, pd.DataFrame({
@@ -372,7 +375,7 @@ def _prepare_cat_stream(y_true: pd.DataFrame, y_pred: np.ndarray, times:np.ndarr
372375
'y_true': [OTHER]
373376
}), df_sel_y_true]).reset_index(drop=True)
374377
clipped_true_to_preds = False
375-
else:
378+
else:
376379
# GT starts before Preds -> clip GT to preds
377380
mask = (df_sel_y_pred[TIME].iat[0] < df_sel_y_true[TIME]).shift(-1, fill_value=True)
378381
df_sel_y_true = df_sel_y_true[mask].reset_index(drop=True)
@@ -382,17 +385,16 @@ def _prepare_cat_stream(y_true: pd.DataFrame, y_pred: np.ndarray, times:np.ndarr
382385
df = _slice_categorical_stream(df_sel_y_pred, df_sel_y_true)
383386

384387
else:
385-
y_true, y_pred, times = y_true.squeeze(), y_pred.squeeze(), times.squeeze()
388+
y_true, y_pred, y_times = y_true.squeeze(), y_pred.squeeze(), y_times.squeeze()
386389

387-
df = pd.DataFrame(data=[times, y_true, y_pred],
390+
df = pd.DataFrame(data=[y_times, y_true, y_pred],
388391
index=[TIME, 'y_true', 'y_pred']).T
389392
df[TIME] = pd.to_datetime(df[TIME])
390393
raise
391394

392395
df['diff'] = df[TIME].shift(-1) - df[TIME]
393396
# Remove last prediction since there is no td and remove first if GT was clipped
394-
s_idx = 1 if clipped_true_to_preds else 0
395-
df = df.iloc[s_idx:-1]
397+
df = df.iloc[:-1]
396398
df.reset_index(inplace=True)
397399

398400
# Create the new column using the index from df_y_pred
@@ -410,6 +412,8 @@ def _prepare_cat_stream(y_true: pd.DataFrame, y_pred: np.ndarray, times:np.ndarr
410412
assert prev_idx > 0
411413
df['y_true_idx'] = df['y_true_idx'].fillna(prev_idx-1)\
412414
.astype(int)
415+
else:
416+
df['y_true_idx'] = df['y_true_idx'].astype(int)
413417

414418
return df.drop(columns=['index'])
415419

@@ -424,18 +428,27 @@ def online_max_calibration_error(y_true, y_pred, y_conf, y_times, num_bins):
424428
return bin_data['max_calibration_error']
425429

426430

427-
def relative_rate(df_y_true: pd.DataFrame, y_pred:np.ndarray, y_times: np.ndarray, average: str ='micro'):
428-
""" Calculates how often
431+
def relative_prediction_rate(y_true: pd.DataFrame, y_pred:np.ndarray, y_times: np.ndarray, average: str ='micro'):
432+
""" Calculates how often a prediction changes over the course of a true activity.
429433
430434
Parameters
431435
----------
432-
df_y_true: pd.DataFrame
436+
y_true: pd.DataFrame
437+
An activity dataframe with columns ['start_time', 'end_time', 'activity']
438+
y_pred: np.ndarray
439+
440+
y_times: np.ndarray
441+
442+
average: str, one of ['micro', 'macro'], default='micro'
443+
444+
Returns
445+
-------
433446
434447
"""
435448

436449
assert average in ['micro', 'macro']
437450

438-
df = _prepare_cat_stream(df_y_true, y_pred, y_times)
451+
df = _prepare_cat_stream(y_true, y_pred, y_times)
439452
df['y_pred_changes'] = (df['y_pred'] != df['y_pred'].shift())\
440453
& (df['y_true'] == df['y_true'].shift())
441454
counts_per_activity = df.groupby(['y_true', 'y_true_idx'])['y_pred_changes']\

pyadlml/preprocessing/__init__.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,7 @@
66
DropTimeIndex, \
77
LabelMatcher, \
88
DropDuplicates,\
9-
Timestamp2Seqtime, \
9+
Time2UnitFromOrigin, \
1010
KeepOnlyDevices, \
1111
DropDevices, \
1212
Identity, \

pyadlml/preprocessing/preprocessing.py

+54-19
Original file line numberDiff line numberDiff line change
@@ -294,7 +294,7 @@ def to_dataframe(self, X, onehot=False):
294294
raise NotImplementedError
295295

296296

297-
class IndexEncoder():
297+
class IndexEncoder(BaseEstimator):
298298

299299
def __repr__(self):
300300
return f'{self.__class__.__name__}()'
@@ -303,6 +303,7 @@ def fit(self, X, y=None):
303303
self.n_features_in_ = 3
304304
self.lbl_enc = preprocessing.LabelEncoder()
305305
self.lbl_enc.fit(X[DEVICE])
306+
self.n_indices_out_ = len(self.lbl_enc.classes_)
306307

307308
def transform(self, X, y=None):
308309
X[DEVICE] = self.lbl_enc.transform(X[DEVICE])
@@ -739,34 +740,54 @@ def transform(self, X, y):
739740
return X.loc[idxs, :], y
740741

741742

742-
class Timestamp2Seqtime(BaseEstimator, TransformerMixin, XOrYTransformer, FinalTimeTransformer):
743-
def __init__(self, dt='s'):
743+
class Time2UnitFromOrigin(BaseEstimator, TransformerMixin, XOrYTransformer, FinalTimeTransformer):
744+
745+
SEC2UNIT_FACTOR = dict(
746+
ms=1e-3,
747+
s=1,
748+
m=60,
749+
h=3600,
750+
d=3600*24,
751+
)
752+
753+
def __init__(self, unit='s', round_to='D'):
744754
super().__init__()
745-
self.dt = dt
755+
self.unit = unit
756+
self.round_to = round_to
746757

747758
@XOrYTransformer.x_or_y_transform
748759
def fit_transform(self, X, y=None):
749760
X, y = self.fit(X, y)
750761
return self.transform(X, y)
751762

752763
def fit(self, X, y=None):
753-
assert self.dt in ['ms', 's', 'm', 'h']
764+
assert self.unit in ['ms', 's', 'm', 'h', 'd']
765+
self.orig_time_ = X[TIME].iloc[0].floor(self.round_to)
754766
return X, y
755767

768+
def _td2num(self, x):
769+
if isinstance(x, str):
770+
x = pd.Timedelta(x)
771+
elif isinstance(x, pd.Series):
772+
x = x.dt
773+
x_sec = x.total_seconds()
774+
return x_sec*self.SEC2UNIT_FACTOR[self.unit]
775+
776+
def _num2td(self, x):
777+
x_sec = (1/self.SEC2UNIT_FACTOR[self.unit])*x
778+
return pd.Timedelta('1s')*x_sec
779+
780+
781+
756782
def transform(self, X, y=None):
757783
""" Change every timestamp into unit i.e. seconds relative
758784
to the first timestamp in the sequence.
759785
"""
760-
self.time = X[TIME]
761-
X[TIME] -= X[TIME].iloc[0]
762-
if self.dt == 'ms':
763-
X[TIME] = X[TIME]/pd.Timedelta('1milli')
764-
elif self.dt == 's':
765-
X[TIME] = X[TIME]/pd.Timedelta('1sec')
766-
elif self.dt == 'm' or self.dt == 'min':
767-
X[TIME] = X[TIME]/pd.Timedelta('1min')
768-
elif self.dt == 'h':
769-
X[TIME] = X[TIME]/pd.Timedelta('1hr')
786+
assert self.orig_time_ <= X[TIME].iloc[0]
787+
self.times_ = X[TIME].copy()
788+
td = (X[TIME] - self.orig_time_)
789+
X[TIME] = self._td2num(td)
790+
770791
return X, y
771792

772793

@@ -1169,10 +1190,24 @@ def plotly_waves(self, end='06:30:00', res='200ms'):
11691190
)
11701191
return fig
11711192

1193+
11721194
def plotly_wave_length_per_dim(self):
11731195
from plotly import graph_objects as go
1196+
import numpy as np
11741197
fig = go.Figure()
1175-
x = self._num2td(self.get_periods())
1176-
x += pd.Timestamp('01.01.2000 00:00:00')
1177-
fig.add_trace(go.Scatter(y=np.arange(0, self.d_dim), x=x))
1178-
return fig
1198+
ws = self.get_angular_freqs()
1199+
f = ws/(2*np.pi)
1200+
lmbd = 1/f
1201+
periods = self._num2td(lmbd)
1202+
print(periods.dtype)
1203+
x = pd.Timestamp('01.01.2000 00:00:00') + periods
1204+
fig.add_trace(
1205+
go.Scatter(
1206+
y=np.arange(0, self.d_dim),
1207+
x=x,
1208+
customdata=pd.to_timedelta(periods).map(str),
1209+
hovertemplate="Dim: %{y}<br>Period: %{customdata}<br><extra></extra>"
1210+
)
1211+
)
1212+
fig.update_layout(title='Period per dim')
1213+
return fig

pyadlml/preprocessing/windows.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -1006,7 +1006,7 @@ def plot_winsize_vs_lengths(cls, X, y, window_sizes=None, z_scale='linear', y_sc
10061006
dur_min = float('inf')
10071007
dur_max = float('-inf')
10081008
for s in window_sizes:
1009-
Xt = cls(window_size=s).fit_transform(x_times)
1009+
Xt = cls(window_size=s, return_view=False).fit_transform(x_times)
10101010
# Get normalized time lengths
10111011
Xt_dur = (Xt[:,-1] - Xt[:,0])/np.timedelta64(1, time_unit)
10121012

0 commit comments

Comments
 (0)