Source code for timeseriesflattener.feature_specs.single_specs

 from dataclasses import dataclass
-from typing import Union
+from typing import Tuple, Union
 
 import pandas as pd
 from timeseriesflattener.aggregation_fns import AggregationFunType
 from timeseriesflattener.utils.pydantic_basemodel import BaseModel
 
 
+
[docs]@dataclass(frozen=True) +class LookPeriod: + min_days: float + max_days: float + + def __post_init__(self): + if self.min_days > self.max_days: + raise ValueError( + f"Invalid LookPeriod. The min_days ({self.min_days}) must be smaller than the max_days {self.max_days}.", + )
+ +
[docs]@dataclass(frozen=True) class CoercedFloats: - lookwindow: Union[float, int] + lookperiod: LookPeriod fallback: Union[float, int]
@@ -245,17 +257,25 @@

Source code for timeseriesflattener.feature_specs.single_specs

return False
-
[docs]def coerce_floats(lookwindow: float, fallback: float) -> CoercedFloats: - lookwindow = ( - lookwindow - if not can_be_coerced_losslessly_to_int(lookwindow) - else int(lookwindow) +
[docs]def coerce_floats(lookperiod: LookPeriod, fallback: float) -> CoercedFloats: + min_days = ( + lookperiod.min_days + if not can_be_coerced_losslessly_to_int(lookperiod.min_days) + else int(lookperiod.min_days) + ) + max_days = ( + lookperiod.max_days + if not can_be_coerced_losslessly_to_int(lookperiod.max_days) + else int(lookperiod.max_days) ) + + coerced_lookperiod = LookPeriod(min_days=min_days, max_days=max_days) + fallback = ( fallback if not can_be_coerced_losslessly_to_int(fallback) else int(fallback) ) - return CoercedFloats(lookwindow=lookwindow, fallback=fallback)
+ return CoercedFloats(lookperiod=coerced_lookperiod, fallback=fallback)
[docs]class StaticSpec(BaseModel): @@ -284,13 +304,18 @@

Source code for timeseriesflattener.feature_specs.single_specs

[docs]def get_temporal_col_name( prefix: str, feature_base_name: str, - lookwindow: Union[float, int], + lookperiod: LookPeriod, aggregation_fn: AggregationFunType, fallback: Union[float, int], ) -> str: """Get the column name for the temporal feature.""" - coerced = coerce_floats(lookwindow=lookwindow, fallback=fallback) - col_str = f"{prefix}_{feature_base_name}_within_{coerced.lookwindow!s}_days_{aggregation_fn.__name__}_fallback_{coerced.fallback}" + coerced = coerce_floats(lookperiod=lookperiod, fallback=fallback) + lookperiod_str = ( + f"{coerced.lookperiod.max_days!s}" + if coerced.lookperiod.min_days == 0 + else f"{coerced.lookperiod.min_days!s}_to_{coerced.lookperiod.max_days!s}" + ) + col_str = f"{prefix}_{feature_base_name}_within_{lookperiod_str}_days_{aggregation_fn.__name__}_fallback_{coerced.fallback}" return col_str
@@ -305,7 +330,8 @@

Source code for timeseriesflattener.feature_specs.single_specs

NOTE: Column names can be overridden when initialising TimeSeriesFlattener. feature_base_name: The name of the feature. Used for column name generation, e.g. <prefix>_<feature_baase_name>_<metadata>. - lookahead_days: How far ahead from the prediction time to look for outcome values. + lookahead_days: In which interval from the prediction time to look for outcome values. + Can be tuple of two floats specifying (min_days, max_days) or float | int which will resolve to (0, value). aggregation_fn: How to aggregate multiple values within lookahead days. Should take a grouped dataframe as input and return a single value. fallback: Value to return if no values is found within window. incident: Whether the outcome is incident or not. E.g. type 2 diabetes is incident because you can only experience it once. @@ -317,18 +343,27 @@

Source code for timeseriesflattener.feature_specs.single_specs

timeseries_df: pd.DataFrame feature_base_name: str - lookahead_days: float + lookahead_days: Union[float, Tuple[float, float]] aggregation_fn: AggregationFunType fallback: Union[float, int] incident: bool prefix: str = "outc" + @property + def lookahead_period(self) -> LookPeriod: + if isinstance(self.lookahead_days, (float, int)): + return LookPeriod(min_days=0, max_days=self.lookahead_days) + return LookPeriod( + min_days=self.lookahead_days[0], + max_days=self.lookahead_days[1], + ) +
[docs] def get_output_col_name(self) -> str: """Get the column name for the output column.""" col_str = get_temporal_col_name( prefix=self.prefix, feature_base_name=self.feature_base_name, - lookwindow=self.lookahead_days, + lookperiod=self.lookahead_period, aggregation_fn=self.aggregation_fn, fallback=self.fallback, ) @@ -354,12 +389,10 @@

Source code for timeseriesflattener.feature_specs.single_specs

NOTE: Column names can be overridden when initialising TimeSeriesFlattener. feature_base_name: The name of the feature. Used for column name generation, e.g. <prefix>_<feature_baase_name>_<metadata>. - lookbehind_days: How far behind from the prediction time to look for predictor values. - aggregation_fn: How to aggregate multiple values within lookahead days. Should take a grouped dataframe as input and return a single value. + lookbehind_days: In which interval from the prediction time to look for predictor values. + Can be tuple of two floats specifying (min_days, max_days) or float | int which will resolve to (0, value). + aggregation_fn: How to aggregate multiple values within lookbehind days. Should take a grouped dataframe as input and return a single value. fallback: Value to return if no values is found within window. - incident: Whether the outcome is incident or not. E.g. type 2 diabetes is incident because you can only experience it once. - Incident outcomes can be handled in a vectorised way during resolution, which is faster than non-incident outcomes. - Requires that each entity only occurs once in the timeseries_df. prefix: The prefix used for column name generation, e.g. <prefix>_<feature_name>_<metadata>. Defaults to "pred". """ @@ -368,15 +401,24 @@

Source code for timeseriesflattener.feature_specs.single_specs

feature_base_name: str aggregation_fn: AggregationFunType fallback: Union[float, int] - lookbehind_days: float + lookbehind_days: Union[float, Tuple[float, float]] prefix: str = "pred" + @property + def lookbehind_period(self) -> LookPeriod: + if isinstance(self.lookbehind_days, (float, int)): + return LookPeriod(min_days=0, max_days=self.lookbehind_days) + return LookPeriod( + min_days=self.lookbehind_days[0], + max_days=self.lookbehind_days[1], + ) +
[docs] def get_output_col_name(self) -> str: """Generate the column name for the output column.""" return get_temporal_col_name( prefix=self.prefix, feature_base_name=self.feature_base_name, - lookwindow=self.lookbehind_days, + lookperiod=self.lookbehind_period, aggregation_fn=self.aggregation_fn, fallback=self.fallback, )
diff --git a/_modules/timeseriesflattener/flattened_dataset.html b/_modules/timeseriesflattener/flattened_dataset.html index 62d194ef..826839f7 100644 --- a/_modules/timeseriesflattener/flattened_dataset.html +++ b/_modules/timeseriesflattener/flattened_dataset.html @@ -245,6 +245,7 @@

Source code for timeseriesflattener.flattened_dataset

from timeseriesflattener.feature_cache.abstract_feature_cache import FeatureCache from timeseriesflattener.feature_specs.single_specs import ( AnySpec, + LookPeriod, OutcomeSpec, PredictorSpec, StaticSpec, @@ -482,7 +483,7 @@

Source code for timeseriesflattener.flattened_dataset

def _drop_records_outside_interval_days( df: DataFrame, direction: str, - interval_days: float, + lookperiod: LookPeriod, timestamp_pred_colname: str, timestamp_value_colname: str, ) -> DataFrame: @@ -492,7 +493,7 @@

Source code for timeseriesflattener.flattened_dataset

Args: direction (str): Whether to look ahead or behind. - interval_days (float): How far to look + lookperiod (LookPeriod): Interval to look within. df (DataFrame): Source dataframe timestamp_pred_colname (str): Name of timestamp column for predictions in df. timestamp_value_colname (str): Name of timestamp column for values in df. @@ -512,12 +513,12 @@

Source code for timeseriesflattener.flattened_dataset

if direction == "ahead": df["is_in_interval"] = ( - df["time_from_pred_to_val_in_days"] <= interval_days - ) & (df["time_from_pred_to_val_in_days"] > 0) + df["time_from_pred_to_val_in_days"] <= lookperiod.max_days + ) & (df["time_from_pred_to_val_in_days"] > lookperiod.min_days) elif direction == "behind": df["is_in_interval"] = ( - df["time_from_pred_to_val_in_days"] >= -interval_days - ) & (df["time_from_pred_to_val_in_days"] < 0) + df["time_from_pred_to_val_in_days"] >= -lookperiod.max_days + ) & (df["time_from_pred_to_val_in_days"] < -lookperiod.min_days) else: raise ValueError("direction can only be 'ahead' or 'behind'") @@ -574,17 +575,17 @@

Source code for timeseriesflattener.flattened_dataset

# Drop prediction times without event times within interval days if isinstance(output_spec, OutcomeSpec): direction = "ahead" - interval_days = output_spec.lookahead_days + lookperiod = output_spec.lookahead_period elif isinstance(output_spec, PredictorSpec): direction = "behind" - interval_days = output_spec.lookbehind_days + lookperiod = output_spec.lookbehind_period else: raise ValueError(f"Unknown output_spec type {type(output_spec)}") df = TimeseriesFlattener._drop_records_outside_interval_days( df, direction=direction, - interval_days=interval_days, + lookperiod=lookperiod, timestamp_pred_colname=timestamp_pred_col_name, timestamp_value_colname=timestamp_val_col_name, ) @@ -883,8 +884,12 @@

Source code for timeseriesflattener.flattened_dataset

if outcome_spec.is_dichotomous(): outcome_is_within_lookahead = ( df[prediction_timestamp_col_name] # type: ignore - + timedelta(days=outcome_spec.lookahead_days) + + timedelta(days=outcome_spec.lookahead_period.max_days) > df[outcome_timestamp_col_name] + ) & ( + df[prediction_timestamp_col_name] # type: ignore + + timedelta(days=outcome_spec.lookahead_period.min_days) + <= df[outcome_timestamp_col_name] ) df[outcome_spec.get_output_col_name()] = outcome_is_within_lookahead.astype( @@ -915,11 +920,11 @@

Source code for timeseriesflattener.flattened_dataset

if isinstance(spec, PredictorSpec): min_val_date = spec.timeseries_df[self.timestamp_col_name].min() # type: ignore - return min_val_date + pd.Timedelta(days=spec.lookbehind_days) + return min_val_date + pd.Timedelta(days=spec.lookbehind_period.max_days) if isinstance(spec, OutcomeSpec): max_val_date = spec.timeseries_df[self.timestamp_col_name].max() # type: ignore - return max_val_date - pd.Timedelta(days=spec.lookahead_days) + return max_val_date - pd.Timedelta(days=spec.lookahead_period.max_days) raise ValueError(f"Spec type {type(spec)} not recognised.") diff --git a/_sources/tutorials/01_basic.ipynb.txt b/_sources/tutorials/01_basic.ipynb.txt index 9bdd35dc..a354fae2 100644 --- a/_sources/tutorials/01_basic.ipynb.txt +++ b/_sources/tutorials/01_basic.ipynb.txt @@ -52,6 +52,15 @@ "execution_count": 1, "metadata": {}, "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/Users/au554730/Desktop/Projects/timeseriesflattener/.venv/lib/python3.10/site-packages/pydantic/_internal/_config.py:269: UserWarning: Valid config keys have changed in V2:\n", + "* 'allow_mutation' has been removed\n", + " warnings.warn(message, UserWarning)\n" + ] + }, { "data": { "text/html": [ @@ -877,7 +886,7 @@ }, { "cell_type": "code", - "execution_count": 11, + "execution_count": 5, "metadata": {}, "outputs": [], "source": [ @@ -908,6 +917,8 @@ "\n", "We also specify that the outcome is not incident. This means that patient ID (dw_ek_borger) can experience the outcome more than once. If the outcome was marked as incident, all prediction times after the patient experiences the outcome are dropped. This is useful for cases where an event is permanent - for example, whether a patient has type 1 diabetes or not.\n", "\n", + "Here, we specifiy that we want to look 365 days forward from the prediction time to search for outcomes. 
If we wanted to require a certain period of time from the prediction time before we look for outcome values, we can specify `lookahead_days` as an interval of (min_days, max_days) as a tuple instead. \n", + "\n", "Lastly, we specify a name of the outcome which'll be used when generating its column." ] }, @@ -956,6 +967,30 @@ "Values within the *lookbehind* window are aggregated using `aggregation_fn`, for example the mean as shown in this example, or max/min etc. " ] }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Temporal predictors can also be specified to look for values within a certain time range from the prediction time, similar to outcome specifications. For instance, you might want to create multiple predictors, where one looks for values within (0, 30) days, and another within (31, 182) days. \n", + "\n", + "This can easily be specified by passing a tuple[min_days, max_days] to the lookbehind_days parameter." + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [], + "source": [ + "temporal_interval_predictor_spec = PredictorSpec(\n", + " timeseries_df=df_synth_predictors,\n", + " lookbehind_days=(30, 90),\n", + " fallback=np.nan,\n", + " aggregation_fn=mean,\n", + " feature_base_name=\"predictor_interval_name\",\n", + ")" + ] + }, { "attachments": {}, "cell_type": "markdown", @@ -1146,30 +1181,33 @@ }, { "cell_type": "code", - "execution_count": 12, + "execution_count": 10, "metadata": {}, "outputs": [], "source": [ - "ts_flattener.add_spec([sex_predictor_spec, temporal_predictor_spec, outcome_spec])" + "ts_flattener.add_spec([sex_predictor_spec, temporal_predictor_spec, temporal_interval_predictor_spec, outcome_spec])" ] }, { "cell_type": "code", - "execution_count": 13, + "execution_count": 11, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ - "2023-06-14 16:11:40 [INFO] There were unprocessed specs, computing...\n", - "2023-06-14 16:11:40 [INFO] 
_drop_pred_time_if_insufficient_look_distance: Dropped 5999 (59.99%) rows\n", - "2023-06-14 16:11:40 [INFO] Processing 2 temporal features in parallel with 1 workers. Chunksize is 2. If this is above 1, it may take some time for the progress bar to move, as processing is batched. However, this makes for much faster total performance.\n", - "100%|██████████| 2/2 [00:00<00:00, 2.14it/s]\n", - "2023-06-14 16:11:41 [INFO] Checking alignment of dataframes - this might take a little while (~2 minutes for 1.000 dataframes with 2.000.000 rows).\n", - "2023-06-14 16:11:41 [INFO] Starting concatenation. Will take some time on performant systems, e.g. 30s for 100 features and 2_000_000 prediction times. This is normal.\n", - "2023-06-14 16:11:41 [INFO] Concatenation took 0.003 seconds\n", - "2023-06-14 16:11:41 [INFO] Merging with original df\n" + "2024-01-18 11:34:22 [INFO] There were unprocessed specs, computing...\n", + "2024-01-18 11:34:22 [INFO] _drop_pred_time_if_insufficient_look_distance: Dropped 5999 (59.99%) rows\n", + "2024-01-18 11:34:22 [INFO] Processing 3 temporal features in parallel with 1 workers. Chunksize is 3. If this is above 1, it may take some time for the progress bar to move, as processing is batched. 
However, this makes for much faster total performance.\n", + " 0%| | 0/3 [00:00 dataframe ┃ Values ┃ ┃ Column Type Count ┃ │\n", "│ ┡━━━━━━━━━━━━━━━━━━━╇━━━━━━━━┩ ┡━━━━━━━━━━━━━╇━━━━━━━┩ │\n", - "│ │ Number of rows │ 4001 │ │ int64 │ 2 │ │\n", - "│ │ Number of columns │ 6 │ │ float64 │ 2 │ │\n", + "│ │ Number of rows │ 4001 │ │ float64 │ 3 │ │\n", + "│ │ Number of columns │ 7 │ │ int64 │ 2 │ │\n", "│ └───────────────────┴────────┘ │ datetime64 │ 1 │ │\n", "│ │ string │ 1 │ │\n", "│ └─────────────┴───────┘ │\n", "│ number │\n", - "│ ┏━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━┳━━━━━━━━┳━━━━━━━━━┳━━━━━━━━┳━━━━━━━━━┳━━━━━━━━┳━━━━━━━┳━━━━━━━━┳━━━━━━━━━┓ │\n", - "│ ┃ column_name NA NA % mean sd p0 p25 p75 p100 hist ┃ │\n", - "│ ┡━━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━╇━━━━━━━━╇━━━━━━━━━╇━━━━━━━━╇━━━━━━━━━╇━━━━━━━━╇━━━━━━━╇━━━━━━━━╇━━━━━━━━━┩ │\n", - "│ │ entity_id 0 0 5000 2900 3 2600 7500 10000█████▇ │ │\n", - "│ │ pred_predictor_name_ 72 1.8 5 1.6 0.097 3.9 6 9.9▁▃██▃▁ │ │\n", - "│ │ outc_outcome_name_wi 0 0 0.064 0.25 0 0 0 1█ ▁ │ │\n", - "│ │ pred_female 0 0 0.49 0.5 0 0 1 1█ █ │ │\n", - "│ └────────────────────────────┴─────┴────────┴─────────┴────────┴─────────┴────────┴───────┴────────┴─────────┘ │\n", + "│ ┏━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━┳━━━━━━━━┳━━━━━━━━━┳━━━━━━━━┳━━━━━━━━━┳━━━━━━━┳━━━━━━━┳━━━━━━━━┳━━━━━━━━┓ │\n", + "│ ┃ column_name NA NA % mean sd p0 p25 p75 p100 hist ┃ │\n", + "│ ┡━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━╇━━━━━━━━╇━━━━━━━━━╇━━━━━━━━╇━━━━━━━━━╇━━━━━━━╇━━━━━━━╇━━━━━━━━╇━━━━━━━━┩ │\n", + "│ │ entity_id 0 0 5000 2900 3 2600 7500 10000█████▇ │ │\n", + "│ │ pred_predictor_name_ 72 1.8 5 1.6 0.097 3.9 6 9.9▁▃██▃▁ │ │\n", + "│ │ pred_predictor_inter 2900 72 5 2.8 0.02 2.6 7.4 10▇▇▇██▇ │ │\n", + "│ │ outc_outcome_name_wi 0 0 0.064 0.25 0 0 0 1█ ▁ │ │\n", + "│ │ pred_female 0 0 0.49 0.5 0 0 1 1█ █ │ │\n", + "│ └───────────────────────────┴────────┴────────┴─────────┴────────┴─────────┴───────┴───────┴────────┴────────┘ │\n", "│ datetime │\n", "│ 
┏━━━━━━━━━━━━━━━━━━┳━━━━━━┳━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━┓ │\n", "│ ┃ column_name NA NA % first last frequency ┃ │\n", @@ -1215,20 +1254,21 @@ "│ ┏━━━━━━━━━━━━━━━━━━━┳━━━━━━━━┓ ┏━━━━━━━━━━━━━┳━━━━━━━┓ │\n", "│ ┃\u001b[1;36m \u001b[0m\u001b[1;36mdataframe \u001b[0m\u001b[1;36m \u001b[0m┃\u001b[1;36m \u001b[0m\u001b[1;36mValues\u001b[0m\u001b[1;36m \u001b[0m┃ ┃\u001b[1;36m \u001b[0m\u001b[1;36mColumn Type\u001b[0m\u001b[1;36m \u001b[0m┃\u001b[1;36m \u001b[0m\u001b[1;36mCount\u001b[0m\u001b[1;36m \u001b[0m┃ │\n", "│ ┡━━━━━━━━━━━━━━━━━━━╇━━━━━━━━┩ ┡━━━━━━━━━━━━━╇━━━━━━━┩ │\n", - "│ │ Number of rows │ 4001 │ │ int64 │ 2 │ │\n", - "│ │ Number of columns │ 6 │ │ float64 │ 2 │ │\n", + "│ │ Number of rows │ 4001 │ │ float64 │ 3 │ │\n", + "│ │ Number of columns │ 7 │ │ int64 │ 2 │ │\n", "│ └───────────────────┴────────┘ │ datetime64 │ 1 │ │\n", "│ │ string │ 1 │ │\n", "│ └─────────────┴───────┘ │\n", "│ \u001b[3m number \u001b[0m │\n", - "│ ┏━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━┳━━━━━━━━┳━━━━━━━━━┳━━━━━━━━┳━━━━━━━━━┳━━━━━━━━┳━━━━━━━┳━━━━━━━━┳━━━━━━━━━┓ │\n", - "│ ┃\u001b[1m \u001b[0m\u001b[1mcolumn_name \u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1mNA \u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1mNA % \u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1mmean \u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1msd \u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1mp0 \u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1mp25 \u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1mp75 \u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1mp100 \u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1mhist \u001b[0m\u001b[1m \u001b[0m┃ │\n", - "│ ┡━━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━╇━━━━━━━━╇━━━━━━━━━╇━━━━━━━━╇━━━━━━━━━╇━━━━━━━━╇━━━━━━━╇━━━━━━━━╇━━━━━━━━━┩ │\n", - "│ │ \u001b[38;5;141mentity_id \u001b[0m │ \u001b[36m 0\u001b[0m │ \u001b[36m 0\u001b[0m │ \u001b[36m 5000\u001b[0m │ \u001b[36m 
2900\u001b[0m │ \u001b[36m 3\u001b[0m │ \u001b[36m 2600\u001b[0m │ \u001b[36m 7500\u001b[0m │ \u001b[36m 10000\u001b[0m │ \u001b[32m█████▇ \u001b[0m │ │\n", - "│ │ \u001b[38;5;141mpred_predictor_name_ \u001b[0m │ \u001b[36m 72\u001b[0m │ \u001b[36m 1.8\u001b[0m │ \u001b[36m 5\u001b[0m │ \u001b[36m 1.6\u001b[0m │ \u001b[36m 0.097\u001b[0m │ \u001b[36m 3.9\u001b[0m │ \u001b[36m 6\u001b[0m │ \u001b[36m 9.9\u001b[0m │ \u001b[32m▁▃██▃▁ \u001b[0m │ │\n", - "│ │ \u001b[38;5;141moutc_outcome_name_wi \u001b[0m │ \u001b[36m 0\u001b[0m │ \u001b[36m 0\u001b[0m │ \u001b[36m 0.064\u001b[0m │ \u001b[36m 0.25\u001b[0m │ \u001b[36m 0\u001b[0m │ \u001b[36m 0\u001b[0m │ \u001b[36m 0\u001b[0m │ \u001b[36m 1\u001b[0m │ \u001b[32m█ ▁ \u001b[0m │ │\n", - "│ │ \u001b[38;5;141mpred_female \u001b[0m │ \u001b[36m 0\u001b[0m │ \u001b[36m 0\u001b[0m │ \u001b[36m 0.49\u001b[0m │ \u001b[36m 0.5\u001b[0m │ \u001b[36m 0\u001b[0m │ \u001b[36m 0\u001b[0m │ \u001b[36m 1\u001b[0m │ \u001b[36m 1\u001b[0m │ \u001b[32m█ █ \u001b[0m │ │\n", - "│ └────────────────────────────┴─────┴────────┴─────────┴────────┴─────────┴────────┴───────┴────────┴─────────┘ │\n", + "│ ┏━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━┳━━━━━━━━┳━━━━━━━━━┳━━━━━━━━┳━━━━━━━━━┳━━━━━━━┳━━━━━━━┳━━━━━━━━┳━━━━━━━━┓ │\n", + "│ ┃\u001b[1m \u001b[0m\u001b[1mcolumn_name \u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1mNA \u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1mNA % \u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1mmean \u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1msd \u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1mp0 \u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1mp25 \u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1mp75 \u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1mp100 \u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1mhist \u001b[0m\u001b[1m \u001b[0m┃ │\n", + "│ 
┡━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━╇━━━━━━━━╇━━━━━━━━━╇━━━━━━━━╇━━━━━━━━━╇━━━━━━━╇━━━━━━━╇━━━━━━━━╇━━━━━━━━┩ │\n", + "│ │ \u001b[38;5;141mentity_id \u001b[0m │ \u001b[36m 0\u001b[0m │ \u001b[36m 0\u001b[0m │ \u001b[36m 5000\u001b[0m │ \u001b[36m 2900\u001b[0m │ \u001b[36m 3\u001b[0m │ \u001b[36m 2600\u001b[0m │ \u001b[36m 7500\u001b[0m │ \u001b[36m 10000\u001b[0m │ \u001b[32m█████▇\u001b[0m │ │\n", + "│ │ \u001b[38;5;141mpred_predictor_name_ \u001b[0m │ \u001b[36m 72\u001b[0m │ \u001b[36m 1.8\u001b[0m │ \u001b[36m 5\u001b[0m │ \u001b[36m 1.6\u001b[0m │ \u001b[36m 0.097\u001b[0m │ \u001b[36m 3.9\u001b[0m │ \u001b[36m 6\u001b[0m │ \u001b[36m 9.9\u001b[0m │ \u001b[32m▁▃██▃▁\u001b[0m │ │\n", + "│ │ \u001b[38;5;141mpred_predictor_inter \u001b[0m │ \u001b[36m 2900\u001b[0m │ \u001b[36m 72\u001b[0m │ \u001b[36m 5\u001b[0m │ \u001b[36m 2.8\u001b[0m │ \u001b[36m 0.02\u001b[0m │ \u001b[36m 2.6\u001b[0m │ \u001b[36m 7.4\u001b[0m │ \u001b[36m 10\u001b[0m │ \u001b[32m▇▇▇██▇\u001b[0m │ │\n", + "│ │ \u001b[38;5;141moutc_outcome_name_wi \u001b[0m │ \u001b[36m 0\u001b[0m │ \u001b[36m 0\u001b[0m │ \u001b[36m 0.064\u001b[0m │ \u001b[36m 0.25\u001b[0m │ \u001b[36m 0\u001b[0m │ \u001b[36m 0\u001b[0m │ \u001b[36m 0\u001b[0m │ \u001b[36m 1\u001b[0m │ \u001b[32m█ ▁\u001b[0m │ │\n", + "│ │ \u001b[38;5;141mpred_female \u001b[0m │ \u001b[36m 0\u001b[0m │ \u001b[36m 0\u001b[0m │ \u001b[36m 0.49\u001b[0m │ \u001b[36m 0.5\u001b[0m │ \u001b[36m 0\u001b[0m │ \u001b[36m 0\u001b[0m │ \u001b[36m 1\u001b[0m │ \u001b[36m 1\u001b[0m │ \u001b[32m█ █\u001b[0m │ │\n", + "│ └───────────────────────────┴────────┴────────┴─────────┴────────┴─────────┴───────┴───────┴────────┴────────┘ │\n", "│ \u001b[3m datetime \u001b[0m │\n", "│ ┏━━━━━━━━━━━━━━━━━━┳━━━━━━┳━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━┓ │\n", "│ ┃\u001b[1m \u001b[0m\u001b[1mcolumn_name \u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1mNA \u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1mNA 
% \u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1mfirst \u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1mlast \u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1mfrequency \u001b[0m\u001b[1m \u001b[0m┃ │\n", @@ -1253,12 +1293,13 @@ "['entity_id',\n", " 'timestamp',\n", " 'prediction_time_uuid',\n", - " 'pred_predictor_name_within_730_days_mean_fallback_nan',\n", - " 'outc_outcome_name_within_365_days_maximum_fallback_0_dichotomous',\n", + " 'pred_predictor_name_within_0_to_730_days_mean_fallback_nan',\n", + " 'pred_predictor_interval_name_within_30_to_90_days_mean_fallback_nan',\n", + " 'outc_outcome_name_within_0_to_365_days_maximum_fallback_0_dichotomous',\n", " 'pred_female']" ] }, - "execution_count": 13, + "execution_count": 11, "metadata": {}, "output_type": "execute_result" } @@ -1273,7 +1314,7 @@ }, { "cell_type": "code", - "execution_count": 14, + "execution_count": 12, "metadata": {}, "outputs": [ { @@ -1281,117 +1322,128 @@ "text/html": [ "\n", - "\n", + "
\n", " \n", " \n", " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", " \n", " \n", " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", " \n", "
 entity_idtimestampprediction_time_uuidpred_Xoutc_Ypred_femaleentity_idtimestampprediction_time_uuidpred_Xpred_X_30_to_90outc_Ypred_female
099031968-05-09 21:24:009903-1968-05-09-21-24-000.9907630.0000000
164471967-09-25 18:08:006447-1967-09-25-18-08-005.5827450.0000001
249271968-06-30 12:13:004927-1968-06-30-12-13-004.9572510.0000000
354751967-01-09 03:09:005475-1967-01-09-03-09-005.9993360.0000000
497931968-12-15 12:59:009793-1968-12-15-12-59-007.2940380.0000000
597681967-07-04 23:09:009768-1967-07-04-23-09-004.3262860.0000001
679161968-12-20 03:38:007916-1968-12-20-03-38-004.6295020.0000000
7331967-07-28 03:16:0033-1967-07-28-03-16-004.6285000.0000000
828831968-01-28 21:50:002883-1968-01-28-21-50-008.2577420.0000001
915151968-07-18 08:28:001515-1968-07-18-08-28-002.9730840.0000000099031968-05-09 21:24:009903-1968-05-09-21-24-000.990763nan0.0000000
164471967-09-25 18:08:006447-1967-09-25-18-08-005.5827457.5771000.0000001
249271968-06-30 12:13:004927-1968-06-30-12-13-004.957251nan0.0000000
354751967-01-09 03:09:005475-1967-01-09-03-09-005.9993369.4972290.0000000
497931968-12-15 12:59:009793-1968-12-15-12-59-007.2940388.1823480.0000000
597681967-07-04 23:09:009768-1967-07-04-23-09-004.326286nan0.0000001
679161968-12-20 03:38:007916-1968-12-20-03-38-004.629502nan0.0000000
7331967-07-28 03:16:0033-1967-07-28-03-16-004.628500nan0.0000000
828831968-01-28 21:50:002883-1968-01-28-21-50-008.257742nan0.0000001
915151968-07-18 08:28:001515-1968-07-18-08-28-002.9730840.6710100.0000000
\n" ], "text/plain": [ - "" + "" ] }, - "execution_count": 14, + "execution_count": 12, "metadata": {}, "output_type": "execute_result" } @@ -1399,12 +1451,14 @@ "source": [ "# For displayability, shorten col names\n", "shortened_pred = \"pred_X\"\n", + "shortened_pred_interval = \"pred_X_30_to_90\"\n", "shortened_outcome = \"outc_Y\"\n", "\n", "df = df.rename(\n", " {\n", - " \"pred_predictor_name_within_730_days_mean_fallback_nan\": shortened_pred,\n", - " \"outc_outcome_name_within_365_days_maximum_fallback_0_dichotomous\": shortened_outcome,\n", + " \"pred_predictor_name_within_0_to_730_days_mean_fallback_nan\": shortened_pred,\n", + " \"pred_predictor_interval_name_within_30_to_90_days_mean_fallback_nan\": shortened_pred_interval,\n", + " \"outc_outcome_name_within_0_to_365_days_maximum_fallback_0_dichotomous\": shortened_outcome,\n", " },\n", " axis=1,\n", ")\n", @@ -1424,6 +1478,11 @@ "4. Our predictor columns, prefixed with `pred_` and\n", "5. Our outcome columns, prefixed with `outc_`" ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [] } ], "metadata": { @@ -1442,7 +1501,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.9.17" + "version": "3.10.13" }, "orig_nbformat": 4, "vscode": { diff --git a/_sources/tutorials/02_advanced.ipynb.txt b/_sources/tutorials/02_advanced.ipynb.txt index 9c2a725e..1147e119 100644 --- a/_sources/tutorials/02_advanced.ipynb.txt +++ b/_sources/tutorials/02_advanced.ipynb.txt @@ -32,7 +32,7 @@ }, { "cell_type": "code", - "execution_count": 24, + "execution_count": 9, "metadata": {}, "outputs": [], "source": [ @@ -47,7 +47,7 @@ }, { "cell_type": "code", - "execution_count": 25, + "execution_count": 10, "metadata": {}, "outputs": [], "source": [ @@ -55,7 +55,7 @@ " named_dataframes=[\n", " NamedDataframe(df=load_synth_predictor_float(), name=\"synth_predictor_float\")\n", " ],\n", - " lookbehind_days=[365, 730],\n", + " lookbehind_days=[(0, 365), (365, 730), 
1095],\n", " fallback=[np.nan],\n", " aggregation_fns=[mean, maximum],\n", ").create_combinations()" @@ -76,26 +76,32 @@ }, { "cell_type": "code", - "execution_count": 26, + "execution_count": 11, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "––––––––– We created 4 combinations of predictors. ––––––––––\n", + "––––––––– We created 6 combinations of predictors. ––––––––––\n", "[{'aggregation_fn': 'mean',\n", " 'feature_name': 'synth_predictor_float',\n", - " 'lookbehind_days': 365.0},\n", + " 'lookbehind_days': LookPeriod(min_days=0.0, max_days=365.0)},\n", " {'aggregation_fn': 'maximum',\n", " 'feature_name': 'synth_predictor_float',\n", - " 'lookbehind_days': 365.0},\n", + " 'lookbehind_days': LookPeriod(min_days=0.0, max_days=365.0)},\n", " {'aggregation_fn': 'mean',\n", " 'feature_name': 'synth_predictor_float',\n", - " 'lookbehind_days': 730.0},\n", + " 'lookbehind_days': LookPeriod(min_days=365.0, max_days=730.0)},\n", " {'aggregation_fn': 'maximum',\n", " 'feature_name': 'synth_predictor_float',\n", - " 'lookbehind_days': 730.0}]\n" + " 'lookbehind_days': LookPeriod(min_days=365.0, max_days=730.0)},\n", + " {'aggregation_fn': 'mean',\n", + " 'feature_name': 'synth_predictor_float',\n", + " 'lookbehind_days': LookPeriod(min_days=0, max_days=1095.0)},\n", + " {'aggregation_fn': 'maximum',\n", + " 'feature_name': 'synth_predictor_float',\n", + " 'lookbehind_days': LookPeriod(min_days=0, max_days=1095.0)}]\n" ] } ], @@ -104,7 +110,7 @@ "pred_spec_batch_summary = [\n", " {\n", " \"feature_name\": pred_spec.feature_base_name,\n", - " \"lookbehind_days\": pred_spec.lookbehind_days,\n", + " \"lookbehind_days\": pred_spec.lookbehind_period,\n", " \"aggregation_fn\": pred_spec.aggregation_fn.__name__,\n", " }\n", " for pred_spec in pred_spec_batch\n", @@ -141,7 +147,7 @@ }, { "cell_type": "code", - "execution_count": 27, + "execution_count": 12, "metadata": {}, "outputs": [], "source": [ @@ -154,14 +160,14 @@ }, { "cell_type": 
"code", - "execution_count": 28, + "execution_count": 13, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ - "2023-06-14 16:19:04 [INFO] Overriding pred_time_uuid_col_name in cache with pred_time_uuid_col_name passed to init of flattened dataset\n" + "2024-01-18 11:38:02 [INFO] Overriding pred_time_uuid_col_name in cache with pred_time_uuid_col_name passed to init of flattened dataset\n" ] } ], @@ -192,7 +198,7 @@ }, { "cell_type": "code", - "execution_count": 29, + "execution_count": 14, "metadata": {}, "outputs": [], "source": [ @@ -201,21 +207,30 @@ }, { "cell_type": "code", - "execution_count": 30, + "execution_count": 15, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ - "2023-06-14 16:19:04 [INFO] There were unprocessed specs, computing...\n", - "2023-06-14 16:19:04 [INFO] _drop_pred_time_if_insufficient_look_distance: Dropped 4038 (40.38%) rows\n", - "2023-06-14 16:19:04 [INFO] Processing 4 temporal features in parallel with 4 workers. Chunksize is 1. If this is above 1, it may take some time for the progress bar to move, as processing is batched. However, this makes for much faster total performance.\n", - "100%|██████████| 4/4 [00:01<00:00, 2.75it/s]\n", - "2023-06-14 16:19:05 [INFO] Checking alignment of dataframes - this might take a little while (~2 minutes for 1.000 dataframes with 2.000.000 rows).\n", - "2023-06-14 16:19:05 [INFO] Starting concatenation. Will take some time on performant systems, e.g. 30s for 100 features and 2_000_000 prediction times. This is normal.\n", - "2023-06-14 16:19:05 [INFO] Concatenation took 0.007 seconds\n", - "2023-06-14 16:19:05 [INFO] Merging with original df\n" + "2024-01-18 11:38:03 [INFO] There were unprocessed specs, computing...\n", + "2024-01-18 11:38:03 [INFO] _drop_pred_time_if_insufficient_look_distance: Dropped 6053 (60.53%) rows\n", + "2024-01-18 11:38:03 [INFO] Processing 6 temporal features in parallel with 4 workers. Chunksize is 2. 
If this is above 1, it may take some time for the progress bar to move, as processing is batched. However, this makes for much faster total performance.\n", + " 0%| | 0/6 [00:00 dataframe ┃ Values ┃ ┃ Column Type Count ┃ │\n", "│ ┡━━━━━━━━━━━━━━━━━━━╇━━━━━━━━┩ ┡━━━━━━━━━━━━━╇━━━━━━━┩ │\n", - "│ │ Number of rows │ 5962 │ │ float64 │ 4 │ │\n", - "│ │ Number of columns │ 7 │ │ int64 │ 1 │ │\n", + "│ │ Number of rows │ 3947 │ │ float64 │ 6 │ │\n", + "│ │ Number of columns │ 9 │ │ int64 │ 1 │ │\n", "│ └───────────────────┴────────┘ │ datetime64 │ 1 │ │\n", "│ │ string │ 1 │ │\n", "│ └─────────────┴───────┘ │\n", "│ number │\n", - "│ ┏━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━┳━━━━━━━━┳━━━━━━━━┳━━━━━━━┳━━━━━━━━━━━┳━━━━━━━┳━━━━━━━┳━━━━━━━━┳━━━━━━━━━┓ │\n", - "│ ┃ column_name NA NA % mean sd p0 p25 p75 p100 hist ┃ │\n", - "│ ┡━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━╇━━━━━━━━╇━━━━━━━━╇━━━━━━━╇━━━━━━━━━━━╇━━━━━━━╇━━━━━━━╇━━━━━━━━╇━━━━━━━━━┩ │\n", - "│ │ entity_id 0 0 5000 2900 0 2500 7400 10000█▇███▇ │ │\n", - "│ │ pred_synth_predictor 820 14 5 2.1 0.00039 3.5 6.4 10▂▄██▄▂ │ │\n", - "│ │ pred_synth_predictor 110 1.8 7.7 2.1 0.058 6.7 9.3 10 ▁▁▂▄█ │ │\n", - "│ │ pred_synth_predictor 820 14 6.6 2.6 0.00039 4.8 8.8 10▁▂▃▄▆█ │ │\n", - "│ │ pred_synth_predictor 110 1.8 5 1.7 0.058 3.9 6.1 9.9▁▃██▃▁ │ │\n", - "│ └───────────────────────────┴───────┴────────┴────────┴───────┴───────────┴───────┴───────┴────────┴─────────┘ │\n", + "│ ┏━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━┳━━━━━━━━┳━━━━━━━━┳━━━━━━━━┳━━━━━━━━━━┳━━━━━━━┳━━━━━━━┳━━━━━━━━┳━━━━━━━━┓ │\n", + "│ ┃ column_name NA NA % mean sd p0 p25 p75 p100 hist ┃ │\n", + "│ ┡━━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━╇━━━━━━━━╇━━━━━━━━╇━━━━━━━━╇━━━━━━━━━━╇━━━━━━━╇━━━━━━━╇━━━━━━━━╇━━━━━━━━┩ │\n", + "│ │ entity_id 0 0 5000 2900 0 2600 7400 10000█████▇ │ │\n", + "│ │ pred_synth_predictor 7 0.18 5 1.3 0.29 4.1 5.8 9.9 ▂█▇▁ │ │\n", + "│ │ pred_synth_predictor 510 13 6.6 2.6 0.024 4.8 8.8 10▂▂▃▄▆█ │ │\n", + "│ │ pred_synth_predictor 530 14 6.6 2.6 0.0084 4.8 8.8 
10▁▂▃▄▆█ │ │\n", + "│ │ pred_synth_predictor 7 0.18 8.4 1.5 0.29 7.8 9.5 10 ▁▃█ │ │\n", + "│ │ pred_synth_predictor 510 13 5.1 2.2 0.024 3.6 6.5 10▂▄██▅▂ │ │\n", + "│ │ pred_synth_predictor 530 14 5 2.1 0.0084 3.6 6.4 9.9▂▄██▄▂ │ │\n", + "│ └────────────────────────────┴───────┴────────┴────────┴────────┴──────────┴───────┴───────┴────────┴────────┘ │\n", "│ datetime │\n", "│ ┏━━━━━━━━━━━━━━━━━━┳━━━━━━┳━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━┓ │\n", "│ ┃ column_name NA NA % first last frequency ┃ │\n", "│ ┡━━━━━━━━━━━━━━━━━━╇━━━━━━╇━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━┩ │\n", - "│ │ timestamp 0 0 1967-01-02 01:16:00 1969-12-31 21:42:00 None │ │\n", + "│ │ timestamp 0 0 1968-01-02 05:12:00 1969-12-31 21:42:00 None │ │\n", "│ └──────────────────┴──────┴─────────┴────────────────────────────┴────────────────────────────┴──────────────┘ │\n", "│ string │\n", "│ ┏━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━┳━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━┓ │\n", "│ ┃ column_name NA NA % words per row total words ┃ │\n", "│ ┡━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━╇━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━┩ │\n", - "│ │ prediction_time_uuid 0 0 1 6000 │ │\n", + "│ │ prediction_time_uuid 0 0 1 3900 │ │\n", "│ └───────────────────────────────────────┴───────┴───────────┴──────────────────────────┴─────────────────────┘ │\n", "╰────────────────────────────────────────────────────── End ──────────────────────────────────────────────────────╯\n", "
\n" @@ -272,32 +289,34 @@ "│ ┏━━━━━━━━━━━━━━━━━━━┳━━━━━━━━┓ ┏━━━━━━━━━━━━━┳━━━━━━━┓ │\n", "│ ┃\u001b[1;36m \u001b[0m\u001b[1;36mdataframe \u001b[0m\u001b[1;36m \u001b[0m┃\u001b[1;36m \u001b[0m\u001b[1;36mValues\u001b[0m\u001b[1;36m \u001b[0m┃ ┃\u001b[1;36m \u001b[0m\u001b[1;36mColumn Type\u001b[0m\u001b[1;36m \u001b[0m┃\u001b[1;36m \u001b[0m\u001b[1;36mCount\u001b[0m\u001b[1;36m \u001b[0m┃ │\n", "│ ┡━━━━━━━━━━━━━━━━━━━╇━━━━━━━━┩ ┡━━━━━━━━━━━━━╇━━━━━━━┩ │\n", - "│ │ Number of rows │ 5962 │ │ float64 │ 4 │ │\n", - "│ │ Number of columns │ 7 │ │ int64 │ 1 │ │\n", + "│ │ Number of rows │ 3947 │ │ float64 │ 6 │ │\n", + "│ │ Number of columns │ 9 │ │ int64 │ 1 │ │\n", "│ └───────────────────┴────────┘ │ datetime64 │ 1 │ │\n", "│ │ string │ 1 │ │\n", "│ └─────────────┴───────┘ │\n", "│ \u001b[3m number \u001b[0m │\n", - "│ ┏━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━┳━━━━━━━━┳━━━━━━━━┳━━━━━━━┳━━━━━━━━━━━┳━━━━━━━┳━━━━━━━┳━━━━━━━━┳━━━━━━━━━┓ │\n", - "│ ┃\u001b[1m \u001b[0m\u001b[1mcolumn_name \u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1mNA \u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1mNA % \u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1mmean \u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1msd \u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1mp0 \u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1mp25 \u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1mp75 \u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1mp100 \u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1mhist \u001b[0m\u001b[1m \u001b[0m┃ │\n", - "│ ┡━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━╇━━━━━━━━╇━━━━━━━━╇━━━━━━━╇━━━━━━━━━━━╇━━━━━━━╇━━━━━━━╇━━━━━━━━╇━━━━━━━━━┩ │\n", - "│ │ \u001b[38;5;141mentity_id \u001b[0m │ \u001b[36m 0\u001b[0m │ \u001b[36m 0\u001b[0m │ \u001b[36m 5000\u001b[0m │ \u001b[36m 2900\u001b[0m │ \u001b[36m 0\u001b[0m │ \u001b[36m 2500\u001b[0m │ \u001b[36m 7400\u001b[0m │ \u001b[36m 10000\u001b[0m │ \u001b[32m█▇███▇ \u001b[0m │ │\n", - "│ │ 
\u001b[38;5;141mpred_synth_predictor \u001b[0m │ \u001b[36m 820\u001b[0m │ \u001b[36m 14\u001b[0m │ \u001b[36m 5\u001b[0m │ \u001b[36m 2.1\u001b[0m │ \u001b[36m 0.00039\u001b[0m │ \u001b[36m 3.5\u001b[0m │ \u001b[36m 6.4\u001b[0m │ \u001b[36m 10\u001b[0m │ \u001b[32m▂▄██▄▂ \u001b[0m │ │\n", - "│ │ \u001b[38;5;141mpred_synth_predictor \u001b[0m │ \u001b[36m 110\u001b[0m │ \u001b[36m 1.8\u001b[0m │ \u001b[36m 7.7\u001b[0m │ \u001b[36m 2.1\u001b[0m │ \u001b[36m 0.058\u001b[0m │ \u001b[36m 6.7\u001b[0m │ \u001b[36m 9.3\u001b[0m │ \u001b[36m 10\u001b[0m │ \u001b[32m ▁▁▂▄█ \u001b[0m │ │\n", - "│ │ \u001b[38;5;141mpred_synth_predictor \u001b[0m │ \u001b[36m 820\u001b[0m │ \u001b[36m 14\u001b[0m │ \u001b[36m 6.6\u001b[0m │ \u001b[36m 2.6\u001b[0m │ \u001b[36m 0.00039\u001b[0m │ \u001b[36m 4.8\u001b[0m │ \u001b[36m 8.8\u001b[0m │ \u001b[36m 10\u001b[0m │ \u001b[32m▁▂▃▄▆█ \u001b[0m │ │\n", - "│ │ \u001b[38;5;141mpred_synth_predictor \u001b[0m │ \u001b[36m 110\u001b[0m │ \u001b[36m 1.8\u001b[0m │ \u001b[36m 5\u001b[0m │ \u001b[36m 1.7\u001b[0m │ \u001b[36m 0.058\u001b[0m │ \u001b[36m 3.9\u001b[0m │ \u001b[36m 6.1\u001b[0m │ \u001b[36m 9.9\u001b[0m │ \u001b[32m▁▃██▃▁ \u001b[0m │ │\n", - "│ └───────────────────────────┴───────┴────────┴────────┴───────┴───────────┴───────┴───────┴────────┴─────────┘ │\n", + "│ ┏━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━┳━━━━━━━━┳━━━━━━━━┳━━━━━━━━┳━━━━━━━━━━┳━━━━━━━┳━━━━━━━┳━━━━━━━━┳━━━━━━━━┓ │\n", + "│ ┃\u001b[1m \u001b[0m\u001b[1mcolumn_name \u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1mNA \u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1mNA % \u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1mmean \u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1msd \u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1mp0 \u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1mp25 \u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1mp75 \u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1mp100 \u001b[0m\u001b[1m 
\u001b[0m┃\u001b[1m \u001b[0m\u001b[1mhist \u001b[0m\u001b[1m \u001b[0m┃ │\n", + "│ ┡━━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━╇━━━━━━━━╇━━━━━━━━╇━━━━━━━━╇━━━━━━━━━━╇━━━━━━━╇━━━━━━━╇━━━━━━━━╇━━━━━━━━┩ │\n", + "│ │ \u001b[38;5;141mentity_id \u001b[0m │ \u001b[36m 0\u001b[0m │ \u001b[36m 0\u001b[0m │ \u001b[36m 5000\u001b[0m │ \u001b[36m 2900\u001b[0m │ \u001b[36m 0\u001b[0m │ \u001b[36m 2600\u001b[0m │ \u001b[36m 7400\u001b[0m │ \u001b[36m 10000\u001b[0m │ \u001b[32m█████▇\u001b[0m │ │\n", + "│ │ \u001b[38;5;141mpred_synth_predictor \u001b[0m │ \u001b[36m 7\u001b[0m │ \u001b[36m 0.18\u001b[0m │ \u001b[36m 5\u001b[0m │ \u001b[36m 1.3\u001b[0m │ \u001b[36m 0.29\u001b[0m │ \u001b[36m 4.1\u001b[0m │ \u001b[36m 5.8\u001b[0m │ \u001b[36m 9.9\u001b[0m │ \u001b[32m ▂█▇▁ \u001b[0m │ │\n", + "│ │ \u001b[38;5;141mpred_synth_predictor \u001b[0m │ \u001b[36m 510\u001b[0m │ \u001b[36m 13\u001b[0m │ \u001b[36m 6.6\u001b[0m │ \u001b[36m 2.6\u001b[0m │ \u001b[36m 0.024\u001b[0m │ \u001b[36m 4.8\u001b[0m │ \u001b[36m 8.8\u001b[0m │ \u001b[36m 10\u001b[0m │ \u001b[32m▂▂▃▄▆█\u001b[0m │ │\n", + "│ │ \u001b[38;5;141mpred_synth_predictor \u001b[0m │ \u001b[36m 530\u001b[0m │ \u001b[36m 14\u001b[0m │ \u001b[36m 6.6\u001b[0m │ \u001b[36m 2.6\u001b[0m │ \u001b[36m 0.0084\u001b[0m │ \u001b[36m 4.8\u001b[0m │ \u001b[36m 8.8\u001b[0m │ \u001b[36m 10\u001b[0m │ \u001b[32m▁▂▃▄▆█\u001b[0m │ │\n", + "│ │ \u001b[38;5;141mpred_synth_predictor \u001b[0m │ \u001b[36m 7\u001b[0m │ \u001b[36m 0.18\u001b[0m │ \u001b[36m 8.4\u001b[0m │ \u001b[36m 1.5\u001b[0m │ \u001b[36m 0.29\u001b[0m │ \u001b[36m 7.8\u001b[0m │ \u001b[36m 9.5\u001b[0m │ \u001b[36m 10\u001b[0m │ \u001b[32m ▁▃█\u001b[0m │ │\n", + "│ │ \u001b[38;5;141mpred_synth_predictor \u001b[0m │ \u001b[36m 510\u001b[0m │ \u001b[36m 13\u001b[0m │ \u001b[36m 5.1\u001b[0m │ \u001b[36m 2.2\u001b[0m │ \u001b[36m 0.024\u001b[0m │ \u001b[36m 3.6\u001b[0m │ \u001b[36m 6.5\u001b[0m │ \u001b[36m 10\u001b[0m │ \u001b[32m▂▄██▅▂\u001b[0m │ │\n", + "│ │ 
\u001b[38;5;141mpred_synth_predictor \u001b[0m │ \u001b[36m 530\u001b[0m │ \u001b[36m 14\u001b[0m │ \u001b[36m 5\u001b[0m │ \u001b[36m 2.1\u001b[0m │ \u001b[36m 0.0084\u001b[0m │ \u001b[36m 3.6\u001b[0m │ \u001b[36m 6.4\u001b[0m │ \u001b[36m 9.9\u001b[0m │ \u001b[32m▂▄██▄▂\u001b[0m │ │\n", + "│ └────────────────────────────┴───────┴────────┴────────┴────────┴──────────┴───────┴───────┴────────┴────────┘ │\n", "│ \u001b[3m datetime \u001b[0m │\n", "│ ┏━━━━━━━━━━━━━━━━━━┳━━━━━━┳━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━┓ │\n", "│ ┃\u001b[1m \u001b[0m\u001b[1mcolumn_name \u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1mNA \u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1mNA % \u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1mfirst \u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1mlast \u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1mfrequency \u001b[0m\u001b[1m \u001b[0m┃ │\n", "│ ┡━━━━━━━━━━━━━━━━━━╇━━━━━━╇━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━┩ │\n", - "│ │ \u001b[38;5;141mtimestamp \u001b[0m │ \u001b[36m 0\u001b[0m │ \u001b[36m 0\u001b[0m │ \u001b[31m 1967-01-02 01:16:00 \u001b[0m │ \u001b[31m 1969-12-31 21:42:00 \u001b[0m │ \u001b[38;5;141mNone \u001b[0m │ │\n", + "│ │ \u001b[38;5;141mtimestamp \u001b[0m │ \u001b[36m 0\u001b[0m │ \u001b[36m 0\u001b[0m │ \u001b[31m 1968-01-02 05:12:00 \u001b[0m │ \u001b[31m 1969-12-31 21:42:00 \u001b[0m │ \u001b[38;5;141mNone \u001b[0m │ │\n", "│ └──────────────────┴──────┴─────────┴────────────────────────────┴────────────────────────────┴──────────────┘ │\n", "│ \u001b[3m string \u001b[0m │\n", "│ ┏━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━┳━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━┓ │\n", "│ ┃\u001b[1m \u001b[0m\u001b[1mcolumn_name \u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1mNA \u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1mNA % \u001b[0m\u001b[1m 
\u001b[0m┃\u001b[1m \u001b[0m\u001b[1mwords per row \u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1mtotal words \u001b[0m\u001b[1m \u001b[0m┃ │\n", "│ ┡━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━╇━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━┩ │\n", - "│ │ \u001b[38;5;141mprediction_time_uuid \u001b[0m │ \u001b[36m 0\u001b[0m │ \u001b[36m 0\u001b[0m │ \u001b[36m 1\u001b[0m │ \u001b[36m 6000\u001b[0m │ │\n", + "│ │ \u001b[38;5;141mprediction_time_uuid \u001b[0m │ \u001b[36m 0\u001b[0m │ \u001b[36m 0\u001b[0m │ \u001b[36m 1\u001b[0m │ \u001b[36m 3900\u001b[0m │ │\n", "│ └───────────────────────────────────────┴───────┴───────────┴──────────────────────────┴─────────────────────┘ │\n", "╰────────────────────────────────────────────────────── End ──────────────────────────────────────────────────────╯\n" ] @@ -311,13 +330,15 @@ "['entity_id',\n", " 'timestamp',\n", " 'prediction_time_uuid',\n", - " 'pred_synth_predictor_float_within_365_days_mean_fallback_nan',\n", - " 'pred_synth_predictor_float_within_730_days_maximum_fallback_nan',\n", - " 'pred_synth_predictor_float_within_365_days_maximum_fallback_nan',\n", - " 'pred_synth_predictor_float_within_730_days_mean_fallback_nan']" + " 'pred_synth_predictor_float_within_0_to_1095_days_mean_fallback_nan',\n", + " 'pred_synth_predictor_float_within_365_to_730_days_maximum_fallback_nan',\n", + " 'pred_synth_predictor_float_within_0_to_365_days_maximum_fallback_nan',\n", + " 'pred_synth_predictor_float_within_0_to_1095_days_maximum_fallback_nan',\n", + " 'pred_synth_predictor_float_within_365_to_730_days_mean_fallback_nan',\n", + " 'pred_synth_predictor_float_within_0_to_365_days_mean_fallback_nan']" ] }, - "execution_count": 31, + "execution_count": 16, "metadata": {}, "output_type": "execute_result" } @@ -330,7 +351,7 @@ }, { "cell_type": "code", - "execution_count": 32, + "execution_count": 17, "metadata": {}, "outputs": [ { @@ -338,128 +359,150 @@ "text/html": [ "\n", - "\n", + "
\n", " \n", " \n", " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", " \n", " \n", " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", " \n", "
 entity_idtimestampprediction_time_uuidpred_1pred_2pred_3pred_4entity_idtimestampprediction_time_uuidpred_1pred_2pred_3pred_4pred_5pred_6
099031968-05-09 21:24:009903-1968-05-09-21-24-000.1549812.1943190.1549810.990763099031968-05-09 21:24:009903-1968-05-09-21-24-002.8646262.1943190.1549815.9315531.4086550.154981
164471967-09-25 18:08:006447-1967-09-25-18-08-005.3960179.7740508.9302565.582745149271968-06-30 12:13:004927-1968-06-30-12-13-004.466599nan6.7306948.630901nan4.957251
249271968-06-30 12:13:004927-1968-06-30-12-13-004.9572516.7306946.7306944.957251231571969-10-07 05:01:003157-1969-10-07-05-01-004.168456nan5.2431765.243176nan5.068323
354751967-01-09 03:09:005475-1967-01-09-03-09-006.0815399.4972299.4972295.999336397931968-12-15 12:59:009793-1968-12-15-12-59-007.1449598.2932669.7089769.7271826.2304178.091755
431571969-10-07 05:01:003157-1969-10-07-05-01-005.0683235.2431765.2431765.068323498611969-01-22 17:34:009861-1969-01-22-17-34-003.6696355.4914153.1302836.2171613.3091973.130283
597931968-12-15 12:59:009793-1968-12-15-12-59-008.0917559.7089769.7089767.29403856571969-04-14 15:47:00657-1969-04-14-15-47-007.3915147.903614nan7.9036147.903614nan
697681967-07-04 23:09:009768-1967-07-04-23-09-004.9594195.7294415.7294414.326286679161968-12-20 03:38:007916-1968-12-20-03-38-004.2517046.0845234.3185866.9791566.0845233.901992
798611969-01-22 17:34:009861-1969-01-22-17-34-003.1302835.4914153.1302833.279378728831968-01-28 21:50:002883-1968-01-28-21-50-004.712403nan8.2577428.257742nan8.257742
86571969-04-14 15:47:00657-1969-04-14-15-47-00nan7.903614nan7.903614815151968-07-18 08:28:001515-1968-07-18-08-28-003.1127003.6846148.6548398.6548393.1046742.907289
979161968-12-20 03:38:007916-1968-12-20-03-38-003.9019926.0845234.3185864.629502967541968-09-21 01:27:006754-1968-09-21-01-27-005.0829183.1021322.3466449.6577552.3249132.346644
\n" ], "text/plain": [ - "" + "" ] }, - "execution_count": 32, + "execution_count": 17, "metadata": {}, "output_type": "execute_result" } @@ -496,7 +539,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.9.17" + "version": "3.10.13" }, "orig_nbformat": 4, "vscode": { diff --git a/feature_specifications.html b/feature_specifications.html index 61d6c739..4cee81db 100644 --- a/feature_specifications.html +++ b/feature_specifications.html @@ -249,7 +249,7 @@

Feature specifications

timeseriesflattener.feature_specs.single_specs#

-class CoercedFloats(lookwindow: Union[float, int], fallback: Union[float, int])[source]#
+class CoercedFloats(lookperiod: timeseriesflattener.feature_specs.single_specs.LookPeriod, fallback: Union[float, int])[source]#

Bases: object

@@ -257,15 +257,31 @@

Feature specifications

-
-lookwindow: Union[float, int]#
+
+lookperiod: LookPeriod#
+
+ + + +
+
+class LookPeriod(min_days: float, max_days: float)[source]#
+

Bases: object

+
+
+max_days: float#
+
+ +
+
+min_days: float#
-class OutcomeSpec(*, timeseries_df: DataFrame, feature_base_name: str, lookahead_days: float, aggregation_fn: Callable[[DataFrameGroupBy], DataFrame], fallback: Union[float, int], incident: bool, prefix: str = 'outc')[source]#
+class OutcomeSpec(*, timeseries_df: DataFrame, feature_base_name: str, lookahead_days: Union[float, Tuple[float, float]], aggregation_fn: Callable[[DataFrameGroupBy], DataFrame], fallback: Union[float, int], incident: bool, prefix: str = 'outc')[source]#

Bases: BaseModel

Specification for an outcome feature.

@@ -278,7 +294,8 @@

Feature specifications
  • feature_base_name – The name of the feature. Used for column name generation, e.g. <prefix>_<feature_base_name>_<metadata>.

  • -
  • lookahead_days – How far ahead from the prediction time to look for outcome values.

  • +
  • lookahead_days – In which interval from the prediction time to look for outcome values. +Can be a tuple of two floats specifying (min_days, max_days), or a float | int, which will resolve to (0, value).

  • aggregation_fn – How to aggregate multiple values within lookahead days. Should take a grouped dataframe as input and return a single value.

  • fallback – Value to return if no values are found within the window.

  • incident – Whether the outcome is incident or not. E.g. type 2 diabetes is incident because you can only experience it once. @@ -323,7 +340,12 @@

    Feature specifications
    -lookahead_days: float#
    +lookahead_days: Union[float, Tuple[float, float]]# +

  • + +
    +
    +property lookahead_period: LookPeriod#
    @@ -334,7 +356,7 @@

    Feature specifications
    -model_fields: ClassVar[dict[str, FieldInfo]] = {'aggregation_fn': FieldInfo(annotation=Callable[list, DataFrame], required=True), 'fallback': FieldInfo(annotation=Union[float, int], required=True), 'feature_base_name': FieldInfo(annotation=str, required=True), 'incident': FieldInfo(annotation=bool, required=True), 'lookahead_days': FieldInfo(annotation=float, required=True), 'prefix': FieldInfo(annotation=str, required=False, default='outc'), 'timeseries_df': FieldInfo(annotation=DataFrame, required=True)}#
    +model_fields: ClassVar[dict[str, FieldInfo]] = {'aggregation_fn': FieldInfo(annotation=Callable[list, DataFrame], required=True), 'fallback': FieldInfo(annotation=Union[float, int], required=True), 'feature_base_name': FieldInfo(annotation=str, required=True), 'incident': FieldInfo(annotation=bool, required=True), 'lookahead_days': FieldInfo(annotation=Union[float, Tuple[float, float]], required=True), 'prefix': FieldInfo(annotation=str, required=False, default='outc'), 'timeseries_df': FieldInfo(annotation=DataFrame, required=True)}#

    Metadata about the fields defined on the model, mapping of field names to [FieldInfo][pydantic.fields.FieldInfo].

    This replaces Model.__fields__ from Pydantic V1.

    @@ -354,7 +376,7 @@

    Feature specifications
    -class PredictorSpec(*, timeseries_df: DataFrame, feature_base_name: str, aggregation_fn: Callable[[DataFrameGroupBy], DataFrame], fallback: Union[float, int], lookbehind_days: float, prefix: str = 'pred')[source]#
    +class PredictorSpec(*, timeseries_df: DataFrame, feature_base_name: str, aggregation_fn: Callable[[DataFrameGroupBy], DataFrame], fallback: Union[float, int], lookbehind_days: Union[float, Tuple[float, float]], prefix: str = 'pred')[source]#

    Bases: BaseModel

    Specification for predictor feature.

    @@ -367,12 +389,10 @@

    Feature specifications
  • feature_base_name – The name of the feature. Used for column name generation, e.g. <prefix>_<feature_base_name>_<metadata>.

  • -
  • lookbehind_days – How far behind from the prediction time to look for predictor values.

  • -
  • aggregation_fn – How to aggregate multiple values within lookahead days. Should take a grouped dataframe as input and return a single value.

  • +
  • lookbehind_days – In which interval from the prediction time to look for predictor values. +Can be a tuple of two floats specifying (min_days, max_days), or a float | int, which will resolve to (0, value).

  • +
  • aggregation_fn – How to aggregate multiple values within lookbehind days. Should take a grouped dataframe as input and return a single value.

  • fallback – Value to return if no values are found within the window.

  • -
  • incident – Whether the outcome is incident or not. E.g. type 2 diabetes is incident because you can only experience it once. -Incident outcomes can be handled in a vectorised way during resolution, which is faster than non-incident outcomes. -Requires that each entity only occurs once in the timeseries_df.

  • prefix – The prefix used for column name generation, e.g. <prefix>_<feature_name>_<metadata>. Defaults to “pred”.

  • @@ -401,7 +421,12 @@

    Feature specifications
    -lookbehind_days: float#
    +lookbehind_days: Union[float, Tuple[float, float]]# +

    + +
    +
    +property lookbehind_period: LookPeriod#
    @@ -412,7 +437,7 @@

    Feature specifications
    -model_fields: ClassVar[dict[str, FieldInfo]] = {'aggregation_fn': FieldInfo(annotation=Callable[list, DataFrame], required=True), 'fallback': FieldInfo(annotation=Union[float, int], required=True), 'feature_base_name': FieldInfo(annotation=str, required=True), 'lookbehind_days': FieldInfo(annotation=float, required=True), 'prefix': FieldInfo(annotation=str, required=False, default='pred'), 'timeseries_df': FieldInfo(annotation=DataFrame, required=True)}#
    +model_fields: ClassVar[dict[str, FieldInfo]] = {'aggregation_fn': FieldInfo(annotation=Callable[list, DataFrame], required=True), 'fallback': FieldInfo(annotation=Union[float, int], required=True), 'feature_base_name': FieldInfo(annotation=str, required=True), 'lookbehind_days': FieldInfo(annotation=Union[float, Tuple[float, float]], required=True), 'prefix': FieldInfo(annotation=str, required=False, default='pred'), 'timeseries_df': FieldInfo(annotation=DataFrame, required=True)}#

    Metadata about the fields defined on the model, mapping of field names to [FieldInfo][pydantic.fields.FieldInfo].

    This replaces Model.__fields__ from Pydantic V1.

    @@ -493,12 +518,12 @@

    Feature specifications
    -coerce_floats(lookwindow: float, fallback: float) CoercedFloats[source]#
    +coerce_floats(lookperiod: LookPeriod, fallback: float) CoercedFloats[source]#

    -get_temporal_col_name(prefix: str, feature_base_name: str, lookwindow: Union[float, int], aggregation_fn: Callable[[DataFrameGroupBy], DataFrame], fallback: Union[float, int]) str[source]#
    +get_temporal_col_name(prefix: str, feature_base_name: str, lookperiod: LookPeriod, aggregation_fn: Callable[[DataFrameGroupBy], DataFrame], fallback: Union[float, int]) str[source]#

    Get the column name for the temporal feature.

    @@ -571,7 +596,12 @@

    Feature specificationstimeseriesflattener.feature_specs.single_specs