# Source code for learning

import numpy as np
import pandas as pd
import statistics as st
from sklearn.linear_model import LinearRegression, Ridge
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import PolynomialFeatures
from sklearn import metrics

def predict(df_test, model, feats, target):
    """
    Applies a regression model to predict values of a dependent variable for a given dataframe and
    given features.

    Args:
        df_test: The input dataframe.
        model: The regression model. Any object exposing ``predict(x)``; in this module an
        instance of Pipeline.
        feats: List of strings: each string is the name of a column of df_test.
        target: The name of the column corresponding to the dependent variable. It is not read
        here (prediction only needs the feature columns) and is kept so that existing callers
        keep working; df_test is therefore not required to contain this column.
    Returns:
        y_pred: Array of predicted values, as returned by ``model.predict``.
    """

    # Only the feature matrix is needed; the target column is deliberately not touched.
    x = df_test[feats].values
    return model.predict(x)

def fit_linear_model(df, feats, target, a=1e-4, deg=3):
    """
    Fits a regression model on a given dataframe, and returns the model, the predicted values and the
    associated scores. Applies Ridge Regression with polynomial features.

    Args:
        df: The input dataframe.
        feats: List of names of columns of df. These are the feature variables.
        target: The name of a column of df corresponding to the dependent variable.
        a: A positive float. Regularization strength parameter for the linear least squares function
        (the loss function) where regularization is given by the l2-norm. Passed to Ridge as ``alpha``.
        deg: The degree of the regression polynomial. Passed to PolynomialFeatures as ``degree``.

    Returns:
        pipeline: The fitted regression model. This is an instance of Pipeline.
        y_pred: An array with the values predicted on the training data itself.
        r_sq: The coefficient of determination "R squared".
        mae: The mean absolute error.
        me: The mean error.
        mape: The mean absolute percentage error.
        mpe: The mean percentage error.
    """

    X = df[feats].values
    y = df[target].values

    # Polynomial feature expansion followed by an l2-regularized linear fit.
    pipeline = Pipeline([
        ("polynomial_features", PolynomialFeatures(degree=deg)),
        ("linear_regression", Ridge(alpha=a)),
    ])
    pipeline.fit(X, y)

    # Scores are computed in-sample (same rows that were used for fitting).
    y_pred = pipeline.predict(X)
    # NOTE(review): `st` comes from `import statistics as st`; the standard-library module has no
    # `score`, so this presumably resolves to a project-local module -- confirm. It is expected to
    # return the five values unpacked below, in this order.
    r_sq, mae, me, mape, mpe = st.score(y, y_pred)
    return pipeline, y_pred, r_sq, mae, me, mape, mpe

def get_line_and_slope(values):
    """
    Fits a line on the 2-dimensional graph of a regular time series, defined by a sequence of real values.
    The x-coordinates are the positions 0, 1, ..., len(values) - 1.

    Args:
        values: A sequence of real values (list, tuple, numpy array or pandas Series).

    Returns:
        line: The values predicted by the linear model, as an array of shape (len(values), 1).
        slope: Slope of the line (float).
        intercept: Intercept of the line (float).
    """

    # Coerce to an ndarray first: plain lists and pandas Series have no usable `.reshape`.
    y = np.asarray(values, dtype=float).reshape(-1, 1)
    X = np.arange(len(values)).reshape(-1, 1)

    ols = LinearRegression()
    ols.fit(X, y)
    line = ols.predict(X)
    # With a single feature and a single output, coef_ and intercept_ each hold exactly one number.
    slope = ols.coef_.item()
    intercept = ols.intercept_.item()
    return line, slope, intercept

def train_on_reference_points(df, w_train, ref_points, feats, target, random_state=0):
    """
    Trains a regression model on a training set defined by segments of a dataframe.
    These segments are defined by a set of starting points and a parameter indicating their duration.
    In each segment, a random 80% of the points is chosen as the training set and the remaining points
    define the validation set.

    Args:
        df: Input dataframe, indexed by date (segments are selected with label slicing on the index).
        w_train: The duration, given as a number of days, of the segments where the model is trained.
        ref_points: An iterable (list, array, ...) containing the starting date of each segment where
        the model is trained.
        feats: A list of names of columns of df corresponding to the feature variables.
        target: A name of a column of df corresponding to the dependent variable.
        random_state: Seed for a random number generator, which is used in randomly selecting the validation
        set among the points in a fixed segment.

    Returns:
        model: The regression model. This is an instance of Pipeline.
        training_scores: An array containing scores for the training set. It contains the coefficient
        of determination "R squared", the mean absolute error, the mean error, the mean absolute percentage error.
        validation_scores: An array containing scores for the validation set. It contains the coefficient
        of determination "R squared", the mean absolute error, the mean error, the mean absolute percentage
        error, the mean percentage error.

    Raises:
        ValueError: If ref_points contains no starting point.

    Side effects:
        Prints the training and validation metrics to standard output.
    """

    train_parts = []
    val_parts = []
    for ref_point in ref_points:
        d_train_stop = pd.to_datetime(ref_point) + pd.Timedelta(days=w_train)
        segment = df.loc[ref_point:str(d_train_stop)]
        # Shuffle the whole segment; a fixed random state keeps experiments reproducible.
        shuffled = segment.sample(frac=1, random_state=random_state)
        size_train = int(len(shuffled) * 0.80)
        train_parts.append(shuffled[:size_train])
        val_parts.append(shuffled[size_train:])

    if not train_parts:
        raise ValueError("ref_points is empty: there is no segment to train on")

    # DataFrame.append was removed in pandas 2.0; concatenate all pieces once instead.
    df_train = pd.concat(train_parts)
    df_val = pd.concat(val_parts)

    # fit_linear_model returns seven values; the in-sample predictions and the training MPE
    # are not reported by this function.
    model, _, r_sq_train, mae_train, me_train, mape_train, _ = fit_linear_model(df_train, feats, target)
    y_pred_val = predict(df_val, model, feats, target)
    r_sq_val, mae_val, me_val, mape_val, mpe_val = st.score(df_val[target].values, y_pred_val)
    training_scores = np.array([r_sq_train, mae_train, me_train, mape_train])
    validation_scores = np.array([r_sq_val, mae_val, me_val, mape_val, mpe_val])

    print('Training Metrics:')
    print(f'MAE:{training_scores[1]:.3f} \nME(true-pred):{training_scores[2]:.3f} \nMAPE:{training_scores[3]:.3f} \nR2: {training_scores[0]:.3f}\n')
    print('Validation Metrics:')
    print(f'MAE:{validation_scores[1]:.3f} \nME(true-pred):{validation_scores[2]:.3f} \nMAPE:{validation_scores[3]:.3f} \nMPE:{validation_scores[4]:.3f} \nR2: {validation_scores[0]:.3f}\n')
    return model, training_scores, validation_scores

def predict_on_sliding_windows(df, win_size, step, model, feats, target):
    """
    Given a regression model, predicts values on a sliding window in a dataframe
    and outputs scores, a list of predictions and a list of windows.

    Args:
        df: The input dataframe. Windows are selected with label slicing on its index.
        win_size: The size of the sliding window, as a number of days.
        step: The sliding step. Forwarded unchanged to ``enumerate2``.
        model: The regression model.
        feats: A list of names of columns of df indicating the feature variables.
        target: The name of a column of df indicating the dependent variable.

    Returns:
        scores: An array of arrays of scores: one array for each non-empty window containing the
        coefficient of determination "R squared", the mean absolute error, the mean error, the mean
        absolute percentage error, the mean percentage error.
        preds_test: A list of predictions: one array of predicted values for each non-empty window.
        windows: A list of (start, end) pairs: one for each non-empty window.
    """

    windows = []
    preds_test = []
    scores_list = []
    win_length = pd.Timedelta(days=win_size)
    # NOTE(review): `enumerate2` is not defined in the visible part of this file; it presumably
    # yields (position, window start) pairs from the first to the last index value -- confirm.
    for _, start in enumerate2(min(df.index), max(df.index), step=step):
        stop = pd.to_datetime(start) + win_length
        df_test = df.loc[start:stop]
        if df_test.shape[0] == 0:
            # Nothing to predict or score in an empty window; it is left out of all outputs.
            continue
        y_pred = predict(df_test, model, feats, target)
        r_sq, mae, me, mape, mpe = st.score(df_test[target].values, y_pred)
        scores_list.append([r_sq, mae, me, mape, mpe])
        preds_test.append(y_pred)
        windows.append((start, stop))
    scores = np.array(scores_list)
    return scores, preds_test, windows

def changepoint_scores(df, feats, target, d1, d2, w_train, w_val, w_test):
    """
    Given as input a dataframe and a reference interval where a changepoint may lie, trains a regression model in
    a window before the reference interval, validates the model in a window before the reference interval and tests
    the model in a window after the reference interval.

    The windows are, in chronological order: training [d1 - w_val - w_train, d1 - w_val],
    validation [d1 - w_val, d1] and test [d2, d2 + w_test].

    Args:
        df: The input dataframe. Windows are selected with label slicing on its index.
        feats: A list of names of columns of df indicating the feature variables.
        target: The name of a column of df indicating the dependent variable.
        d1: The first date in the reference interval.
        d2: The last date in the reference interval.
        w_train: The number of days defining the training set.
        w_val: The number of days defining the validation set.
        w_test: The number of days defining the test set.
    Returns:
        y_pred_train: The array of predicted values in the training set.
        score_train: An array containing scores for the training set:
        the negated coefficient of determination "R squared", the mean absolute error, the mean error,
        the mean absolute percentage error, the mean percentage error.
        y_pred_val: The array of predicted values in the validation set.
        score_val: An array containing scores for the validation set, in the same layout as score_train.
        y_pred_test: The array of predicted values in the test set.
        score_test: An array containing scores for the test set, in the same layout as score_train.

    Raises:
        ValueError: If the training window or the test window selects no rows of df.
    """

    d1_ts = pd.to_datetime(d1)
    d_train_stop = d1_ts - pd.Timedelta(days=w_val)
    d_train_start = d_train_stop - pd.Timedelta(days=w_train)
    d_test_stop = pd.to_datetime(d2) + pd.Timedelta(days=w_test)

    df_train = df.loc[str(d_train_start):str(d_train_stop)]
    df_val = df.loc[str(d_train_stop):str(d1)]
    df_test = df.loc[str(d2):str(d_test_stop)]

    # NOTE(review): the validation window is not checked for emptiness here.
    if len(df_train) == 0 or len(df_test) == 0:
        raise ValueError("Either the training set is empty or the test set is empty")

    # Train on the caller-supplied columns so that fitting, prediction and scoring agree.
    model, y_pred_train, r_sq_train, mae_train, me_train, mape_train, mpe_train = fit_linear_model(
        df_train, feats, target)
    y_pred_val = predict(df_val, model, feats, target)
    y_pred_test = predict(df_test, model, feats, target)

    r_sq_val, mae_val, me_val, mape_val, mpe_val = st.score(df_val[target].values, y_pred_val)
    r_sq_test, mae_test, me_test, mape_test, mpe_test = st.score(df_test[target].values, y_pred_test)

    # R squared is stored with its sign flipped in the first slot of every score array.
    score_train = np.array([-r_sq_train, mae_train, me_train, mape_train, mpe_train])
    score_val = np.array([-r_sq_val, mae_val, me_val, mape_val, mpe_val])
    score_test = np.array([-r_sq_test, mae_test, me_test, mape_test, mpe_test])
    return y_pred_train, score_train, y_pred_val, score_val, y_pred_test, score_test