"""Regression analysis on the Capital Bikeshare daily data set.

Task (from the assignment hand-out): run a regression analysis on the bike
data published by Capital Bikeshare (a bike-sharing company in Washington,
D.C., USA).  The 2011 records are the training data; the goal is to predict
the number of shared bikes for every day of 2012.

Field description
-----------------
instant     record number
dteday      date
season      1 = spring, 2 = summer, 3 = autumn, 4 = winter
yr          year (0: 2011, 1: 2012)
mnth        month (1 to 12)
hr          hour (0 to 23); only present in hour.csv, ignored by this
            assignment
holiday     whether the day is a holiday
weekday     day of the week, 0-6
workingday  1 = working day (not a weekend or holiday), 0 = weekend
weathersit  1: clear / cloudy, 2: mist / overcast,
            3: light snow / light rain, 4: heavy rain / heavy snow / thick fog
temp        temperature in degrees Celsius
atemp       "feels like" temperature
hum         humidity
windspeed   wind speed

Targets (y)
-----------
casual      number of non-registered users
registered  number of registered users
cnt         total number of rentals for the given day (per hour in the
            hourly file); the response variable y

All three of these are values to be predicted, but the assignment only asks
for ``cnt``.  The hand-out highlights the input features x in black; that
emphasis is lost in this plain-text copy.

Pipeline
--------
1. One-hot encode the categorical columns, expand the numeric columns with
   polynomial features, and write the assembled table to ``final.csv``.
2. Split by year (2011 = train, 2012 = test) and standardise features and
   target using statistics of the training year only.
3. Fit RidgeCV and LassoCV, report RMSE / R^2, plot the CV curves and write
   the Lasso predictions for 2012 to ``result.csv``.
"""

import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as seb
from sklearn import preprocessing
from sklearn.linear_model import LassoCV
from sklearn.linear_model import RidgeCV
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score
from sklearn.preprocessing import PolynomialFeatures

# Directory that holds day.csv and receives every generated CSV.
dpath = "F:/"

# ---------------------------------------------------------------------------
# Load the raw data
# ---------------------------------------------------------------------------
data = pd.read_csv(dpath + "day.csv")

# ---------------------------------------------------------------------------
# Categorical features: few distinct values each, so one-hot encode them
# ---------------------------------------------------------------------------
cate_features = ["season", "weathersit", "weekday"]
for col in cate_features:
    print("%s属性的不同取值和次数:" % col)
    print(data[col].value_counts())
    # get_dummies only expands object/category columns, so cast the integer
    # codes before encoding.
    data[col] = data[col].astype('object')

x_train_cat = pd.get_dummies(data[cate_features])
x_train_cat.to_csv(dpath + "meng.csv")
print(x_train_cat.head())

# ---------------------------------------------------------------------------
# Numerical features: degree-4 polynomial expansion
# ---------------------------------------------------------------------------
numerical_features = ["temp", "hum", "windspeed"]
poly = PolynomialFeatures(degree=4, include_bias=False, interaction_only=False)
X_ploly = poly.fit_transform(data[numerical_features])

# get_feature_names() is no longer available in current scikit-learn
# releases; prefer get_feature_names_out() and fall back for old installs.
# Passing the input names makes both variants label columns "temp",
# "temp hum", "temp^2", ... instead of the anonymous "x0", "x0 x1", ...
if hasattr(poly, "get_feature_names_out"):
    poly_names = poly.get_feature_names_out(numerical_features)
else:
    poly_names = poly.get_feature_names(numerical_features)
X_ploly_df = pd.DataFrame(X_ploly, columns=poly_names)
print(X_ploly_df)

# ---------------------------------------------------------------------------
# Assemble the feature table and persist it
# ---------------------------------------------------------------------------
x_train = pd.concat(
    [x_train_cat, X_ploly_df, data['holiday'], data['workingday']],
    axis=1,
    ignore_index=False,
)
x_train.to_csv(dpath + "meng2.csv")

# instant is kept as a row id, yr for the train/test split, cnt as the target.
final_train = pd.concat(
    [data['instant'], x_train, data['yr'], data['cnt']],
    axis=1,
    ignore_index=False,
)
final_train.to_csv(dpath + "final.csv", index=False)

# ---------------------------------------------------------------------------
# Reload the generated features and split by year
# ---------------------------------------------------------------------------
tz_data = pd.read_csv(dpath + "final.csv")

# 2011 -> training data
train = tz_data[tz_data.yr == 0]
train = train.drop(columns=['instant', 'yr'])
print("train(训练):" + str(train.shape))

# 2012 -> test data; keep the ids and true counts for the final report
test = tz_data[tz_data.yr == 1]
testID = test['instant']
testCNT = test['cnt']
test = test.drop(columns=['instant', 'yr'])
print("test(测试):" + str(test.shape))
print(test.head())

y_train = train['cnt']
X_train = train.drop(columns=['cnt'])

y_test_real = test['cnt']
y_test = test['cnt']
X_test = test.drop(columns=['cnt'])
print(X_train.shape)
print(X_test.shape)

# ---------------------------------------------------------------------------
# Standardisation
# ---------------------------------------------------------------------------
# Fit the scaler on the training year only and reuse it for the test year.
# Scaling each split with its own mean/std (as preprocessing.scale on both
# would do) puts train and test features on different scales and leaks
# test-set statistics into the evaluation.
# The explicit float cast keeps the one-hot columns numeric even when they
# come back from the CSV as booleans.
scaler = preprocessing.StandardScaler()
X_train = scaler.fit_transform(X_train.astype(float))
X_test = scaler.transform(X_test.astype(float))

mean_y = y_train.mean()  # mean of the training target
print("train_mean_y = ", mean_y)
std_y = y_train.std()  # standard deviation of the training target
print("train_std_y = ", std_y)
y_train = (y_train - mean_y) / std_y
y_test = (y_test - mean_y) / std_y  # test target on the training scale
print(y_train.head())
print(y_test.head())

# Diagnostic only: how far the 2012 target mean sits from the 2011 mean, in
# training standard deviations.  It is derived from the test labels, so it
# must NOT be added to the predictions -- doing so would leak the answer
# into the reported test metrics.
mean_test_y = y_test.mean()
mean_diff = mean_test_y
print("标准化后的均值为", mean_diff)

# ---------------------------------------------------------------------------
# Ridge regression
# ---------------------------------------------------------------------------
# The ridge penalty must be strictly positive; the grid below starts just
# above zero and still covers the previously observed optimum (~0.14).
alphas = np.arange(0.01, 10.0, 0.01)

# Newer scikit-learn spells the option store_cv_results / cv_results_;
# older releases only know store_cv_values / cv_values_.
try:
    ridge = RidgeCV(alphas=alphas, store_cv_results=True)
except TypeError:
    ridge = RidgeCV(alphas=alphas, store_cv_values=True)

ridge.fit(X_train, y_train)
alpha = ridge.alpha_
print("最佳的alpha = ", alpha)

# Cross-validation error for every candidate alpha (averaged over samples).
if hasattr(ridge, "cv_results_"):
    cv_errors = ridge.cv_results_
else:
    cv_errors = ridge.cv_values_
mse_cv = np.mean(cv_errors, axis=0)
rmse = np.sqrt(mse_cv)
print("cv of rmse", min(rmse))

# Training error
y_train_pred = ridge.predict(X_train)
rmse_train = np.sqrt(mean_squared_error(y_train, y_train_pred))
print("训练集rmse", rmse_train)

# Test error
y_test_pred = ridge.predict(X_test)
rmse_test = np.sqrt(mean_squared_error(y_test, y_test_pred))
print("测试集rmse", rmse_test)

r2_score_train = r2_score(y_train, y_train_pred)
r2_score_test = r2_score(y_test, y_test_pred)
print("score of r2 train is ", r2_score_train)
print("score of r2 test is ", r2_score_test)

fig = plt.figure(figsize=(10, 5))
plt.plot(alphas, mse_cv)
plt.xlabel("alphas")
plt.ylabel("mse")
plt.show()

# ---------------------------------------------------------------------------
# Lasso regression
# ---------------------------------------------------------------------------
lasso = LassoCV()
lasso.fit(X_train, y_train)
alpha = lasso.alpha_
print("最佳alpha_: ", alpha)

# mse_path_ holds one row per alpha and one column per fold.
mses = np.mean(lasso.mse_path_, axis=1)
fig = plt.figure(figsize=(10, 5))
plt.plot(np.log10(lasso.alphas_), mses)
plt.xlabel("log(alpha)")
plt.ylabel("mse")
plt.show()

y_train_pred = lasso.predict(X_train)
rmse_train = np.sqrt(mean_squared_error(y_train, y_train_pred))
print("训练集rmse:", rmse_train)

y_test_pred = lasso.predict(X_test)
rmse_test = np.sqrt(mean_squared_error(y_test, y_test_pred))
print("测试集rmse:", rmse_test)

r2_score_train = r2_score(y_train, y_train_pred)
r2_score_test = r2_score(y_test, y_test_pred)
print("score of r2 train ", r2_score_train)
print("score of r2 test ", r2_score_test)

# ---------------------------------------------------------------------------
# Report the Lasso predictions on the original count scale
# ---------------------------------------------------------------------------
y_test_pred = lasso.predict(X_test) * std_y + mean_y

fig = plt.figure()
plt.plot(testID, y_test_pred, c="red", label="pred")
plt.plot(testID, y_test_real, c="blue", label="real value")
plt.xlabel("instant")
plt.ylabel("count")
plt.legend(loc="best")
plt.show()

df = pd.DataFrame({"instant": testID, "cnt": y_test_real, 'pre_cnt': y_test_pred})
df.to_csv(dpath + 'result.csv')
df.info()