# Source code for statistics
import numpy as np
import pandas as pd
from sklearn.linear_model import LinearRegression, Ridge
from sklearn import metrics
def mape1(y_true, y_pred):
    """
    Computes a mean-normalised Mean Absolute Percentage Error between the 2 given time series.

    The absolute errors are divided by the mean of ``y_true`` (not element-wise
    by ``y_true``), so the result equals the mean absolute error expressed as a
    fraction of the mean actual value. The value is a ratio; it is not
    multiplied by 100.

    Args:
        y_true: A numpy array that contains the actual values of the time series.
        y_pred: A numpy array that contains the predicted values of the time series.
    Return:
        Mean Absolute Percentage Error value (as a fraction of the mean of ``y_true``).
    """
    abs_errors = np.abs(y_true - y_pred)
    # A single scalar denominator is used for every element: individual zeros
    # in y_true do not divide by zero, only a zero mean does.
    return np.mean(abs_errors / np.mean(y_true))
def mpe1(y_true, y_pred):
    """
    Computes the Mean Percentage Error between the 2 given time series.

    The signed mean error ``mean(y_true - y_pred)`` is divided by the mean of
    ``y_true``. The value is a ratio (not multiplied by 100); it is positive
    when the predictions are, on average, below the actual values and negative
    when they are above.

    Args:
        y_true: A numpy array that contains the actual values of the time series.
        y_pred: A numpy array that contains the predicted values of the time series.
    Return:
        Mean Percentage Error value (as a fraction of the mean of ``y_true``).
    """
    mean_error = np.mean(y_true - y_pred)
    return mean_error / np.mean(y_true)
def score(y_true, y_pred):
    """
    Computes a set of values that measure how well a predicted time series matches the actual time series.

    Args:
        y_true: A numpy array that contains the actual values of the time series.
        y_pred: A numpy array that contains the predicted values of the time series.
    Return:
        A 5-tuple, in this order:
        r-squared, mean absolute error, mean error, mean absolute percentage error, mean percentage error.
        The mean error is ``mean(y_true - y_pred)``; the two percentage errors
        are the values returned by ``mape1`` and ``mpe1``.
    """
    r_sq = metrics.r2_score(y_true, y_pred)
    mae = metrics.mean_absolute_error(y_true, y_pred)
    # Signed error: positive when the predictions are on average too low.
    me = np.mean(y_true - y_pred)
    mape = mape1(y_true, y_pred)
    mpe = mpe1(y_true, y_pred)
    return r_sq, mae, me, mape, mpe
def get_top_deviations(scores, metric='mpe', n=5):
    """
    Given a matrix that each row contains scores of how well a segment fits a model, find the indices of the top most
    deviant segments.

    Segments are ranked by the chosen score in ascending order and the first
    ``n`` are returned, i.e. the segments with the smallest (most negative)
    values of that score.

    Args:
        scores: A 2-D numpy array (NxM), or any array-like convertible to one, that contains M scores for each
            one of the N segments.
        metric: A string that specifies which score to consider. Supported values are 'mpe' and 'me'.
        n: number of the deviant segments
    Return:
        A numpy array with the indices of the segments. It holds fewer than ``n`` entries when N < n.
    Raises:
        KeyError: If ``metric`` is not one of the supported values.
    """
    # Column positions of 'me' and 'mpe' match their positions in the tuple
    # returned by score(): (r_sq, mae, me, mape, mpe).
    score_columns = {'mpe': 4, 'me': 2}
    score_column = score_columns[metric]
    # Accept nested lists as well as arrays; a no-op for numpy input.
    score_matrix = np.asarray(scores)
    return np.argsort(score_matrix[:, score_column])[:n]
def multi_corr(df, dep_column):
    """
    Computation of the coefficient of multiple correlation.

    The input consists of a dataframe and the column corresponding to the dependent variable.
    The value is computed from the Pearson correlation matrix of ``df`` as the
    quadratic form ``c^T * R^-1 * c``, where ``R`` is the correlation matrix of
    the independent columns and ``c`` holds the correlations between each
    independent column and the dependent one. This quadratic form is the
    *squared* coefficient of multiple correlation (R^2); no square root is taken.

    Args:
        df: Date/Time DataFrame or any Given DataFrame.
        dep_column: The column corresponding to the dependent variable.
    Return:
        The squared coefficient of multiple correlation between the dependent column and the rest.
    Raises:
        numpy.linalg.LinAlgError: If the correlation matrix of the independent columns is singular.
    """
    corr_matrix = df.corr(method='pearson')
    # Rows of the independent variables only; the dependent column is still
    # present so that its correlations can be read off below.
    independent_rows = corr_matrix.drop(index=dep_column)
    independent_corr = independent_rows.drop(columns=dep_column)
    # np.linalg.inv is used explicitly: a bare `inv` is not imported by this module.
    independent_corr_inv = np.linalg.inv(independent_corr.values)
    dep_corr = independent_rows.loc[:, dep_column].values
    return np.matmul(np.matmul(np.transpose(dep_corr), independent_corr_inv), dep_corr)