Source code for statistics

import numpy as np
import pandas as pd
from sklearn.linear_model import LinearRegression, Ridge
from sklearn import metrics


[docs]def mape1(y_true, y_pred):

    """
    Computes the Mean Absolute Percentage Error between the 2 given time series

    Args:
        y_true: A numpy array that contains the actual values of the time series.
        y_pred: A numpy array that contains the predicted values of the time series.
    
    Return: 
        Mean Absolute Percentage Error value.
            
        
    """
    mape1 = (np.mean(np.abs(y_true-y_pred)/np.mean(y_true)))
    return mape1


[docs]def mpe1(y_true, y_pred):

    """
    Computes the Mean Percentage Error between the 2 given time series.

    Args:
        y_true: A numpy array that contains the actual values of the time series.
        y_pred: A numpy array that contains the predicted values of the time series.
    
    Return: 
        Mean Absolute Error value.
            
        
    """
    mpe1 = (np.mean(y_true-y_pred)/np.mean(y_true))
    return mpe1


[docs]def score(y_true, y_pred):

    """
    Computes a set of values that measure how well a predicted time series matches the actual time series.

    Args:
        y_true: A numpy array that contains the actual values of the time series.
        y_pred: A numpy array that contains the predicted values of the time series.
    
    Return: 
        Returns a value for each of the following measures: 
        r-squared, mean absolute error, mean error, mean absolute percentage error, mean percentage error
        
            
        
    """
    r_sq = metrics.r2_score(y_true, y_pred)
    mae = metrics.mean_absolute_error(y_true, y_pred)
    me = np.mean(y_true-y_pred)
    mape = mape1(y_true, y_pred)
    mpe = mpe1(y_true, y_pred)
    return r_sq, mae, me, mape, mpe


[docs]def get_top_deviations(scores, metric='mpe', n=5):

    """
    Given a matrix that each row contains scores of how well a segment fits a model, find the indices of the top most
    deviant segments.
    
    Args:
        scores: A 2-D numpy array (NxM) that contains M scores for each one of the N segments.
        metric: A string that specifies which score to consider.
        n: number of the deviant segments
    
    Return: 
        The indices of the segments.
            
        
    """
    metrics = {'mpe': 4, 'me': 2}
    score_column = metrics[metric]
    indices = np.argsort(scores[:, score_column])[:n]
    return indices


[docs]def multi_corr(df, dep_column):

    """
    Computation of the coefficient of multiple correlation.
    The input consists of a dataframe and the column corresponding to the dependent variable.
    
    Args:
        df: Date/Time DataFrame or any Given DataFrame.
        dep_column: The corresponding the column to the dependent variable.
    
    Return: 
        The coefficient of multiple correlation between the dependant column and the rest.
            
        
    """
    df_str_corr = df.corr(method='pearson')
    df_str_corr_ind_temp = df_str_corr.drop(index=dep_column)
    df_str_corr_ind = df_str_corr_ind_temp.drop(columns=dep_column)
    df_str_corr_ind_inv = inv(df_str_corr_ind.values)
    df_str_corr_dep = df_str_corr_ind_temp.loc[:, dep_column]
    return np.matmul(np.matmul(np.transpose(df_str_corr_dep.values), df_str_corr_ind_inv), df_str_corr_dep.values)