# Source code for tsad.utils.preproc

import pandas as pd 

def value_counts_interval(array,itervals):
    """
    Count how many values of the input fall into each interval.

    The boundaries in ``itervals`` split the number line into
    ``len(itervals) + 1`` buckets: everything below the first boundary,
    each half-open range ``[itervals[i], itervals[i+1])``, and everything
    greater than or equal to the last boundary.

    Parameters:
    ----------
    array : numpy.ndarray | pandas.Series | list of values
        Input values. Anything that is not already an ndarray or a Series
        (e.g. a plain list or tuple) is converted with ``numpy.asarray``
        so that element-wise comparison works.
    itervals : list
        Interval boundaries. Must contain at least one boundary; the
        bucket definitions only make sense for boundaries given in
        ascending order.

    Returns:
    -------
    ts : pandas.Series
        Counts per bucket. The index holds the bucket labels built from
        the boundaries (the label text is in Russian).
    """
    # Local import: numpy is not imported at module level in this file.
    import numpy as np

    # A plain list does not support `list < scalar`; ndarray/Series inputs
    # are left untouched so their own masking semantics are preserved.
    if not isinstance(array, (np.ndarray, pd.Series)):
        array = np.asarray(array)

    names = [f"до {itervals[0]}"]
    quantity = [len(array[array < itervals[0]])]
    for low, high in zip(itervals[:-1], itervals[1:]):
        quantity.append(len(array[(array >= low) & (array < high)]))
        names.append(f'c {low} до {high}')
    names.append(f"от {itervals[-1]}")
    quantity.append(len(array[array >= itervals[-1]]))
    return pd.Series(quantity, index=names)



        
def split_by_repeated(series,df=None):
    """
    Split a pandas Series into runs of consecutive equal values.

    Missing values are dropped first, so two equal values separated only
    by NaN end up in the same run.

    Parameters:
    ----------
    series : pandas.Series
        The series to be split. It is copied; the caller's object is not
        modified.
    df : pandas.DataFrame, optional (default None)
        If given, every run is replaced by the rows of ``df`` selected by
        the run's index labels (``df.loc[run.index]``). In this mode runs
        containing one element or fewer are discarded.

    Returns:
    -------
    dict
        Keys are the unique values of the series (in order of first
        appearance); each value is the list of runs for that key, in the
        order they occur. Runs are sub-Series when ``df`` is None and
        DataFrames otherwise.

    Raises:
    ------
    NameError
        If the series is empty after dropping missing values. The
        exception type is kept as-is for backward compatibility.
    """
    series = series.copy().dropna()
    if len(series) == 0:
        raise NameError('0 series')

    values = series.values
    result = {uni: [] for uni in series.unique()}

    # Walk the values once and close a run whenever the value changes.
    # `.iloc` is used so the cut is always positional, regardless of the
    # index type (plain `series[a:b]` is label-based on a float index).
    run_start = 0
    for pos in range(1, len(values)):
        if not (values[pos] == values[run_start]):
            result[values[run_start]].append(series.iloc[run_start:pos])
            run_start = pos
    # The last run is still open when the loop ends.
    result[values[run_start]].append(series.iloc[run_start:])

    if df is None:
        return result

    new_result = {key: [] for key in result}
    for key, runs in result.items():
        for run in runs:
            # Runs of length <= 1 are skipped when mapping back to df.
            if len(run) > 1:
                new_result[key].append(df.loc[run.index])
    return new_result
        
        
def df2dfs(df,  # NOTE (translated): "the authors do not recommend doing it this way"
            resample_freq = None, # (translated) "requirements"
            thereshold_gap = None, 
            koef_freq_of_gap = 1.2, # (translated) 1.2 relates to an issue the author reported on 02.09.2021
            plot = False,
            col = None):
    """
    Split a DataFrame into a list of DataFrames at gaps in its index.

    Rows and columns that are entirely NaN are dropped first. The step
    between consecutive index values is then computed, and every step
    strictly greater than ``thereshold_gap`` starts a new segment. No
    resampling is performed; steps shorter than the threshold are left
    as they are.

    Parameters
    ----------
    df : pd.DataFrame
        The original time series for the entire history. Presumably
        indexed by timestamps (the threshold is built with
        ``pd.Timedelta``) - confirm against callers.
    resample_freq : pd.Timedelta (optional, default=None)
        The sampling step of the series. If None, the most frequent step
        is used, provided it occurs more often than all other steps
        combined; otherwise an exception is raised.
    thereshold_gap : pd.Timedelta (optional, default=None)
        A step longer than this is treated as a gap. If None it is
        computed as ``koef_freq_of_gap * resample_freq``.
    koef_freq_of_gap : float or int (optional, default=1.2)
        Multiplier used only when ``thereshold_gap`` is None.
    plot : bool (optional, default=False)
        If True, draw column ``col`` of every segment on one shared axes.
        Requires matplotlib and can be slow.
    col : int or str (optional, default=None)
        Column to draw. A value that is an existing column label is used
        as a label; otherwise an integer is taken as a column position.
        If None, the first column is used.

    Returns
    -------
    dfs : list of pd.DataFrame
        The segments, in original order. Empty list if no rows remain
        (only reachable when ``resample_freq`` is given).

    Raises
    ------
    Exception
        If ``resample_freq`` is None and there is no prevailing sampling
        step (including the case where fewer than two rows remain).
    """

    df = df.dropna(how='all').dropna(axis=1,how='all')
    dts = df.index.to_series().diff()
    if resample_freq is None:
        dts_dist = dts.value_counts()
        # value_counts() is sorted by frequency, so position 0 is the most
        # common step; it must outweigh all the others together.
        # `.iloc` is required: the index holds steps, not integer positions.
        if len(dts_dist) > 0 and dts_dist.iloc[0] > dts_dist.iloc[1:].sum():
            resample_freq = dts_dist.index[0]
        else:
            print(dts_dist)
            raise Exception("It is necessary to process the function yourself since there is no prevailing sampling frequency")
    if thereshold_gap is None:
        thereshold_gap = pd.Timedelta(resample_freq)*koef_freq_of_gap
    # Each gap bumps the running counter, giving every row a segment id.
    gaps = (dts > thereshold_gap).astype(int).cumsum()
    # Group positionally (via the raw array) so that duplicated index
    # labels do not duplicate rows, and the frame is scanned only once.
    dfs = [part for _, part in df.groupby(gaps.to_numpy())]

    if plot:
        # Imported lazily: matplotlib is only needed for plotting and is
        # not imported at module level in this file.
        import matplotlib.pyplot as plt

        _, ax = plt.subplots()
        if col is None:
            col = df.columns[0]
        elif isinstance(col, int) and col not in df.columns:
            # An integer that is not a column label is a column position.
            col = df.columns[col]
        for part in dfs:
            part[col].plot(ax=ax)
    return dfs