机器学习-时间序列预测(一)——从copy开始

写在最前面:

       原理先不谈,先搭起环境跑一下

       等我弄清楚在写。。。

       python3.6

       windows 7

import matplotlib
matplotlib.use('TkAgg')
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
from sklearn.model_selection import TimeSeriesSplit  # you have everything done for you
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LassoCV, RidgeCV
 
 
# for time-series cross-validation set 5 folds
tscv = TimeSeriesSplit(n_splits=5)
 
 
def mean_absolute_percentage_error(y_true, y_pred):
    return np.mean(np.abs((y_true - y_pred) / y_true)) * 100
 
 
def timeseries_train_test_split(X, y, test_size):
    """
        Perform train-test split with respect to time series structure
    """
 
    # get the index after which test set starts
    test_index = int(len(X) * (1 - test_size))
 
    X_train = X.iloc[:test_index]
    y_train = y.iloc[:test_index]
    X_test = X.iloc[test_index:]
    y_test = y.iloc[test_index:]
 
    return X_train, X_test, y_train, y_test
 
 
def plotModelResults(model, X_train, X_test, y_train, y_test, plot_intervals=False, plot_anomalies=False):
    """
        Plots modelled vs fact values, prediction intervals and anomalies
 
    """
    prediction = model.predict(X_test)
 
    plt.figure(figsize=(15, 7))
    plt.plot(prediction, "g", label="prediction", linewidth=2.0)
    plt.plot(y_test.values, label="actual", linewidth=2.0)
 
    if plot_intervals:
        cv = cross_val_score(model, X_train, y_train,
                             cv=tscv,
                             scoring="neg_mean_absolute_error")
        mae = cv.mean() * (-1)
        deviation = cv.std()
 
        scale = 20
        lower = prediction - (mae + scale * deviation)
        upper = prediction + (mae + scale * deviation)
 
        plt.plot(lower, "r--", label="upper bond / lower bond", alpha=0.5)
        plt.plot(upper, "r--", alpha=0.5)
 
        if plot_anomalies:
            anomalies = np.array([np.NaN] * len(y_test))
            anomalies[y_test < lower] = y_test[y_test < lower]
            anomalies[y_test > upper] = y_test[y_test > upper]
            plt.plot(anomalies, "o", markersize=10, label="Anomalies")
 
    error = mean_absolute_percentage_error(prediction, y_test)
    plt.title("Mean absolute percentage error {0:.2f}%".format(error))
    plt.legend(loc="best")
    plt.tight_layout()
    plt.grid(True)
    plt.savefig("linear.png")
 
 
def plotCoefficients(model, X_train):
    """
        Plots sorted coefficient values of the model
    """
    coefs = pd.DataFrame(model.coef_, X_train.columns)
    coefs.columns = ["coef"]
    coefs["abs"] = coefs.coef.apply(np.abs)
    coefs = coefs.sort_values(by="abs", ascending=False).drop(["abs"], axis=1)
 
    plt.figure(figsize=(15, 9))
    coefs.coef.plot(kind='bar')
    plt.grid(True, axis='y')
    plt.hlines(y=0, xmin=0, xmax=len(coefs), linestyles='dashed')
    plt.savefig("linear-cov.png")
 
 
def code_mean(data, cat_feature, real_feature):
    """
    Returns a dictionary where keys are unique categories of the cat_feature,
    and values are means over real_feature
    """
    return dict(data.groupby(cat_feature)[real_feature].mean())
 
 
def prepareData(series, lag_start, lag_end, test_size, target_encoding=False):
    """
        series: pd.DataFrame
            dataframe with timeseries
        lag_start: int
            initial step back in time to slice target variable
            example - lag_start = 1 means that the model
                      will see yesterday's values to predict today
        lag_end: int
            final step back in time to slice target variable
            example - lag_end = 4 means that the model
                      will see up to 4 days back in time to predict today
        test_size: float
            size of the test dataset after train/test split as percentage of dataset
        target_encoding: boolean
            if True - add target averages to the dataset
 
    """
    # copy of the initial dataset
    data = pd.DataFrame(series.copy())
    data.columns = ["y"]
 
    # lags of series
    for i in range(lag_start, lag_end):
        data["lag_{}".format(i)] = data.y.shift(i)
 
    # datetime features
    # data.index = data.index.to_datetime()
    data["hour"] = data.index.hour
    data["weekday"] = data.index.weekday
    data['is_weekend'] = data.weekday.isin([5, 6]) * 1
 
    if target_encoding:
        # calculate averages on train set only
        test_index = int(len(data.dropna()) * (1 - test_size))
        data['weekday_average'] = list(map(
            code_mean(data[:test_index], 'weekday', "y").get, data.weekday))
        data["hour_average"] = list(map(
            code_mean(data[:test_index], 'hour', "y").get, data.hour))
 
        # frop encoded variables
        data.drop(["hour", "weekday"], axis=1, inplace=True)
 
    # train-test split
    y = data.dropna().y
    X = data.dropna().drop(['y'], axis=1)
    X_train, X_test, y_train, y_test = \
        timeseries_train_test_split(X, y, test_size=test_size)
 
    return X_train, X_test, y_train, y_test
 
 
def plt_linear():
    data = pd.read_csv('F:\\vs_for_python\\Jia\\Test\\raw_data.csv',
                                       usecols=['timestamp', 'count'])
    data['timestamp'] = pd.to_datetime(data['timestamp'])
    data.set_index("timestamp", drop=True, inplace=True)
    data.rename(columns={'count': 'y'}, inplace=True)
 
    X_train, X_test, y_train, y_test = \
        prepareData(data, lag_start=6, lag_end=25, test_size=0.3, target_encoding=True)
 
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)
 
 
    # lr = LinearRegression()
    lr = LassoCV(cv=tscv)
    # lr = RidgeCV(cv=tscv)
    # lr.fit(X_train_scaled, y_train)
 
    """
    from xgboost import XGBRegressor
    lr = XGBRegressor()
    # lr = xgb.XGBRegressor(max_depth=5, learning_rate=0.1, n_estimators=160, silent=False, objective='reg:gamma')
    """
 
    lr.fit(X_train_scaled, y_train)
 
    """
    IMPORTANT
    Generally tree-based models poorly handle trends in data, compared to linear models,
    so you have to detrend your series first or use some tricks to make the magic happen.
    Ideally make the series stationary and then use XGBoost, for example, you can forecast
    trend separately with a linear model and then add predictions from xgboost to get final forecast.
    """
 
    plotModelResults(lr, X_train=X_train_scaled, X_test=X_test_scaled,  y_train=y_train, y_test=y_test, plot_intervals=True, plot_anomalies=True)
    plotCoefficients(lr, X_train=X_train)
 
 
plt_linear()
plt.show()

这里注意两个点,是我自己踩的坑

1、报错Matplotlib is currently using agg, which is a non-GUI backend, so cannot show the figure

import matplotlib
matplotlib.use('TkAgg')  #这里来是Agg,会报错,改为TkAgg即可

 

2、报错FileNotFoundError: File b'E:\titanicdata\train.csv' does not exist

这里路径一律加\\。

原因如下:反斜杠\是转义字符,想表达\请用\\

P.S. Windows的路径中用反斜杠\,是因为DOS 有一个传统是用斜杠(“/”)表示命令行参数,既然斜杠已经被DOS的命令行占领了,只好找一个最接的,那就是反斜杠\。而Linux等目录路径用斜杠/。

数据集地址:https://github.com/nicolasmiller/pyculiarity/blob/master/tests/raw_data.csv

 

效果如下:

机器学习-时间序列预测(一)——从copy开始

机器学习-时间序列预测(一)——从copy开始