# Source code for sefef.postprocessing

# -*- coding: utf-8 -*-
"""
sefef.postprocessing
--------------------

This module contains functions to process individual predicted probabilities into a unified forecast according to the desired forecast horizon.
Author: Ana Sofia Carmo

:copyright: (c) 2024 by Ana Sofia Carmo
:license: BSD 3-clause License, see LICENSE for more details.
"""

# third-party
import pandas as pd
import numpy as np



# [docs]
class Forecast:
    ''' Stores the forecasts made by the model and processes them.

    Attributes
    ----------
    pred_proba : array-like, shape (#samples, ), dtype "float64"
        Contains the probability predicted by the model for each sample belonging to the pre-ictal class.
    timestamps :  array-like, shape (#samples, ), dtype "int64"
        Contains the unix timestamps (in seconds) corresponding to the start-time of each sample.

    Methods
    -------
    append(pred_proba, timestamps) :
        Appends new predicted probabilities to the ones already in the Forecast object.
    postprocess(forecast_horizon, smooth_win, smooth_sliding=False, origin='clock-time') :
        Applies postprocessing methodology to the predictions stored in "pred_proba", according to "forecast_horizon" (in seconds). Returns an array with the new probabilities and an array with the corresponding timestamps.

    Raises
    -------
    ValueError :
        Raised when "pred_proba" and "timestamps" do not have the same shape.
    '''

    def __init__(self, pred_proba, timestamps):
        self.pred_proba = np.array(pred_proba)
        self.timestamps = np.array(timestamps)
        # Explicit raise instead of "assert", so the check is not stripped when running with "python -O".
        if self.pred_proba.shape != self.timestamps.shape:
            raise ValueError(
                f'The provided timestamps and predicted probabilities do not have the same shape {self.timestamps.shape} vs {self.pred_proba.shape}.')

    def append(self, pred_proba, timestamps):
        ''' Appends new predicted probabilities (and the corresponding timestamps) to the ones already stored in the Forecast object. The stored arrays are replaced by the concatenation of the existing and the new samples, in the order provided (no sorting or de-duplication is performed).

        Parameters
        ----------
        pred_proba : array-like, shape (#new_samples, )
            Predicted probabilities to add to "pred_proba".
        timestamps : array-like, shape (#new_samples, )
            Unix timestamps (in seconds) corresponding to the start-time of each new sample.

        Raises
        ------
        ValueError :
            Raised when "pred_proba" and "timestamps" do not have the same shape. In that case the object is left unchanged.
        '''
        # "atleast_1d" allows a single (scalar) sample to be appended.
        new_pred_proba = np.atleast_1d(np.asarray(pred_proba))
        new_timestamps = np.atleast_1d(np.asarray(timestamps))
        if new_pred_proba.shape != new_timestamps.shape:
            raise ValueError(
                f'The provided timestamps and predicted probabilities do not have the same shape {new_timestamps.shape} vs {new_pred_proba.shape}.')

        self.pred_proba = np.concatenate((np.atleast_1d(self.pred_proba), new_pred_proba))
        self.timestamps = np.concatenate((np.atleast_1d(self.timestamps), new_timestamps))

    def postprocess(self, forecast_horizon, smooth_win, smooth_sliding=False, origin='clock-time'):
        ''' Applies post-processing methodology to the predictions stored in "pred_proba". For each time period with duration equal to "forecast_horizon", mean predicted probabilities are calculated for groups of consecutive samples (with a window of duration "smooth_win", in seconds), with or without overlap, and the maximum across the full period is obtained.

        Parameters
        ----------
        forecast_horizon : int
            Forecast horizon in seconds, i.e. time in the future for which the forecasts will be issued.
        smooth_win : int
            Duration of window, in seconds, used to smooth the predicted probabilities. If "smooth_sliding" is set to False, "forecast_horizon" must be an integer multiple of this duration.
        smooth_sliding : bool, defaults to False
            Whether to use a sliding-window approach during smoothing (with a step of 1 sample), or to use non-overlapping smoothing windows. When True, not yet implemented.
        origin : str, defaults to "clock-time"
            Determines if the forecasts are issued at clock-time (e.g. at the start of each hour) or according to the start-time of the first sample. Options are "clock-time" and "sample-time", respectively.

        Returns
        -------
        result1 : array-like, shape (#forecasts, ), dtype "float64"
            Contains the predicted probabilities of seizure occurrence for the period with duration "forecast_horizon" and starting at the timestamps in "result2".
        result2 : array-like, shape (#forecasts, ), dtype "int64"
            Contains the Unix timestamps, in seconds, for the start of the period for which the forecasts (in "result1") are valid.

        Raises
        ------
        NotImplementedError :
            Raised when "smooth_sliding" is True.
        ValueError :
            Raised when "smooth_sliding" is False and "forecast_horizon" is not a multiple of "smooth_win".
        KeyError :
            Raised when "origin" is not one of "clock-time" or "sample-time".
        '''

        if smooth_sliding:
            raise NotImplementedError('Sliding-window smoothing ("smooth_sliding"=True) is not yet implemented.')
        # Explicit raise instead of "assert", so the check is not stripped when running with "python -O".
        if forecast_horizon % smooth_win != 0:
            raise ValueError(
                'With "smooth_sliding"=False, the duration of "smooth_win" should sum up to "forecast_horizon".')

        # Maps the user-facing option to the "origin" argument of pandas' resample.
        origin2param = {'clock-time': 'start_day', 'sample-time': 'start'}

        pred_proba = pd.DataFrame(self.pred_proba, index=pd.to_datetime(
            self.timestamps, unit='s', utc=True), columns=['pred_proba'])

        # Mean probability within each smoothing window. Windows are labelled with their right edge and then
        # shifted back by one window, so that each smoothed value is indexed by the start of its window.
        smooth_proba = pred_proba.resample(f'{smooth_win}s', origin=origin2param[origin], label='right').mean()
        smooth_proba.index = smooth_proba.index - pd.Timedelta(f'{smooth_win}s')

        # Maximum smoothed probability within each forecast-horizon bin. The right-edge label is kept, i.e. each
        # forecast is indexed by the end of the bin of samples it was computed from.
        final_proba = smooth_proba.resample(f'{forecast_horizon}s', origin=origin2param[origin], label='right').max()

        # Convert to whole seconds since the Unix epoch through timedelta arithmetic, so the result does not
        # depend on the internal resolution (ns, us, s, ...) of the datetime index.
        epoch = pd.Timestamp('1970-01-01', tz='UTC')
        forecast_timestamps = (final_proba.index - epoch) // pd.Timedelta(seconds=1)

        return final_proba.pred_proba.to_numpy(), forecast_timestamps.to_numpy()