请在Capital Bikeshare (美国Washington, D.C.的一个共享单车公司)提供的自行车数据上进行回归分析。训练数据为2011年的数据,要求预测2012年每天的单车共享数量。

字段说明
instant:记录号
dteday:日期
season:季节
    1=春天
    2=夏天
    3=秋天
    4=冬天
yr:年份,(0: 2011, 1:2012)
mnth:月份( 1 to 12)
hr:小时 (0 to 23) (只在 hour.csv 有,作业忽略此字段)
holiday:是否是节假日
weekday:星期中的哪天,取值为 0~6
workingday:是否工作日
    1=工作日 (非周末和节假日)
    0=周末或节假日
weathersit:天气
    1:晴天,多云
    2:雾天,阴天
    3:小雪,小雨
    4:大雨,大雪,大雾
temp:气温摄氏度
atemp:体感温度
hum:湿度
windspeed:风速

y值

casual:非注册用户个数
registered:注册用户个数
cnt:给定日期(day.csv,按天)或给定小时(hour.csv,按小时)的总租车人数(casual 与 registered 之和),响应变量 y


注意:后三个特征均为要预测的 y,作业里只需对 cnt 进行预测
黑色标记的特征为输入特征 x


import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as seb

# Directory that holds the input CSV and receives every generated file.
dpath = "F:/"

# Load the daily bike-sharing records.
data = pd.read_csv(dpath + "day.csv")
# Quick-look helpers, left disabled:
# print(data.head())
# print(data.info())
# print(data.describe())

# Columns that are treated as categorical by the rest of the script.
cate_features = ["season", "weathersit", "weekday"]

# Report how often each level of every categorical column occurs ...
for feature in cate_features:
    print("%s属性的不同取值和次数:" % feature)
    print(data[feature].value_counts())

# ... then retype those columns as object in a single pass.
data = data.astype({feature: "object" for feature in cate_features})

# The categorical features have only a few distinct values each, so they are
# one-hot encoded.

# Feature processing: build the indicator columns straight from the
# categorical slice of the data.
x_train_cat = pd.get_dummies(data[cate_features])
# Dump the encoded columns to disk for inspection, then preview them.
x_train_cat.to_csv("F:/meng.csv")
print(x_train_cat.head())

# Processing of the numeric variables.

# Alternative kept for reference: min-max normalisation of the numeric columns.
# from  sklearn.preprocessing import MinMaxScaler
# mn_x  = MinMaxScaler()
# numerical_features = ["temp","hum","windspeed"]
# temp = mn_x.fit_transform(data[numerical_features])
# x_train_num = pd.DataFrame(data=temp,columns=numerical_features)
# print(x_train_num.head())
########################
from sklearn.preprocessing import PolynomialFeatures  # polynomial expansion of the numeric columns
numerical_features = ["temp", "hum", "windspeed"]
# Every monomial of the three columns up to degree 4, without the constant term.
poly = PolynomialFeatures(degree=4, include_bias=False, interaction_only=False)
X_ploly = poly.fit_transform(data[numerical_features])
# get_feature_names() was removed in scikit-learn 1.2. Prefer its replacement
# and fall back to the old method only on releases that predate it. Passing
# the source column names labels the output "temp", "temp hum", "hum^2", ...
# on every version.
if hasattr(poly, "get_feature_names_out"):
    poly_columns = poly.get_feature_names_out(numerical_features)
else:
    poly_columns = poly.get_feature_names(numerical_features)
X_ploly_df = pd.DataFrame(X_ploly, columns=poly_columns)
print(X_ploly_df)


########################

# Join the one-hot columns, the polynomial columns and the two binary flags
# side by side into a single feature table.
x_train = pd.concat(
    [x_train_cat, X_ploly_df, data['holiday'], data['workingday']],
    axis=1,
)
x_train.to_csv("F:/meng2.csv")

# Add the record id, the year flag and the target, then persist the complete
# table without the index column.
final_train = pd.concat(
    [data['instant'], x_train, data['yr'], data['cnt']],
    axis=1,
)
final_train.to_csv(dpath + "final.csv", index=False)

# Load the generated feature CSV.
tz_data = pd.read_csv(dpath + "final.csv")

# Rows with yr == 0 form the training set; the id and year columns are not
# model inputs.
is_first_year = tz_data["yr"] == 0
train = tz_data.loc[is_first_year].drop(columns=['instant', 'yr'])
print("train(训练):" + str(train.shape))

# Rows with yr == 1 form the test set.
test = tz_data.loc[tz_data["yr"] == 1]
# Keep the record ids and the true counts before the id column is dropped.
testID = test['instant']
testCNT = test['cnt']

test = test.drop(columns=['instant', 'yr'])
print("test(测试):" + str(test.shape))
print(test.head())



# Separate each partition into a feature matrix and a target vector.
# Training partition
X_train = train.drop(columns=['cnt'])
y_train = train['cnt']
# Test partition: two handles on the raw counts, one kept as the reference
# values and one used as the working target.
y_test_real = test['cnt']
y_test = test['cnt']
X_test = test.drop(columns=['cnt'])

print(X_train.shape)
print(X_test.shape)

# Standardise the data.

from sklearn import preprocessing
# Fit the scaler on the training rows only and reuse those statistics for the
# test rows. Scaling the test matrix with its own mean/std would put the two
# sets in different feature spaces and use test-set statistics at prediction
# time.
feature_scaler = preprocessing.StandardScaler()
X_train = feature_scaler.fit_transform(X_train)
X_test = feature_scaler.transform(X_test)

mean_y = y_train.mean()  # mean of the training target
print("train_mean_y = ", mean_y)
std_y = y_train.std()  # standard deviation of the training target
print("train_std_y = ", std_y)

# Both targets are standardised with the TRAINING statistics.
y_train = (y_train - mean_y) / std_y  # standardised training target
y_test = (y_test - mean_y) / std_y  # standardised test target

print(y_train.head())
print(y_test.head())

# NOTE(review): mean_diff is the mean of the standardised TEST target, and the
# model sections add it to their test predictions. That leaks test labels into
# the predictions; it is kept unchanged here because later code reads it.
mean_test_y = y_test.mean()
mean_diff = mean_test_y
print("标准化后的均值为", mean_diff)

# Ridge regression: training and evaluation.
from sklearn.linear_model import RidgeCV  # ridge regression with built-in CV
from sklearn.metrics import r2_score  # goodness-of-fit metric for regression

# Grids tried earlier:
# alphas = [0.01,0.1,1,10,100,1000]    -> best alpha = 1.0
# alphas = np.arange(0.09,0.15,0.01)   -> best alpha = 0.13999999999999996

# Regularisation strengths must be strictly positive. The previous grid,
# np.arange(-10.0, 10.0, 0.0001), contained negative and near-zero values
# (rejected by recent scikit-learn releases) and 200,000 candidates. This grid
# stays positive and still contains 0.14, the optimum noted for the old grid.
alphas = np.arange(0.01, 10.0, 0.01)

# The per-sample CV errors were exposed as store_cv_values / cv_values_ before
# scikit-learn 1.5 and as store_cv_results / cv_results_ afterwards; support
# both spellings.
try:
    ridge = RidgeCV(alphas=alphas, store_cv_results=True)
    cv_attr = "cv_results_"
except TypeError:
    ridge = RidgeCV(alphas=alphas, store_cv_values=True)
    cv_attr = "cv_values_"

# Train the model.
ridge.fit(X_train, y_train)

alpha = ridge.alpha_
print("最佳的alpha = ", alpha)

# Cross-validated error: average the stored per-sample errors for each alpha.
mse_cv = np.mean(getattr(ridge, cv_attr), axis=0)
rmse = np.sqrt(mse_cv)
print("cv of rmse", rmse.min())

# Training error.
from sklearn.metrics import mean_squared_error
y_train_pred = ridge.predict(X_train)
rmse_train = np.sqrt(mean_squared_error(y_train, y_train_pred))
print("训练集rmse", rmse_train)

# Test error.
y_test_pred = ridge.predict(X_test)
# NOTE(review): mean_diff is computed from the test targets, so this offset
# leaks label information into the reported test scores.
y_test_pred = y_test_pred + mean_diff
rmse_test = np.sqrt(mean_squared_error(y_test, y_test_pred))
print("测试集rmse", rmse_test)

r2_score_train = r2_score(y_train, y_train_pred)
r2_score_test = r2_score(y_test, y_test_pred)

print("score of r2 train is ", r2_score_train)
print("score of r2 test is ", r2_score_test)

# Mean CV error as a function of alpha (reuses the averages computed above).
fig = plt.figure(figsize=(10, 5))
plt.plot(alphas, mse_cv)
plt.xlabel("alphas")
plt.ylabel("mse")
plt.show()



# Lasso regression: training and evaluation.
from sklearn.linear_model import LassoCV
lasso = LassoCV().fit(X_train, y_train)

alpha = lasso.alpha_
print("最佳alpha_: ", alpha)

# Mean CV error along the regularisation path, plotted against log10(alpha).
mses = lasso.mse_path_.mean(axis=1)
fig = plt.figure(figsize=(10, 5))
plt.plot(np.log10(lasso.alphas_), mses)
plt.xlabel("log(alpha)")
plt.ylabel("mse")
plt.show()

# Predictions on both partitions; the test predictions are shifted by
# mean_diff.
y_train_pred = lasso.predict(X_train)
y_test_pred = lasso.predict(X_test) + mean_diff

# Root-mean-squared error on each partition.
rmse_train = np.sqrt(mean_squared_error(y_train, y_train_pred))
print("训练集rmse:", rmse_train)
rmse_test = np.sqrt(mean_squared_error(y_test, y_test_pred))
print("测试集rmse:", rmse_test)

# Coefficient of determination on each partition.
r2_score_train = r2_score(y_train, y_train_pred)
r2_score_test = r2_score(y_test, y_test_pred)
print("score of r2 train ", r2_score_train)
print("score of r2 test ", r2_score_test)



# Present the results of the Lasso model.
# Standardised predictions (including the mean_diff shift), mapped back to the
# original count scale with the training mean and standard deviation.
y_test_pred = (lasso.predict(X_test) + mean_diff) * std_y + mean_y

# Predicted versus real counts, plotted against the record id.
fig = plt.figure()
curves = (
    (y_test_pred, "red", "pred"),
    (y_test_real, "blue", "real value"),
)
for values, colour, caption in curves:
    plt.plot(testID, values, c=colour, label=caption)
plt.xlabel("instant")
plt.ylabel("count")
plt.legend(loc="best")
plt.show()

# Save id, real count and predicted count side by side, then summarise.
df = pd.DataFrame({"instant": testID, "cnt": y_test_real, 'pre_cnt': y_test_pred})
df.to_csv(dpath + 'result.csv')
df.info()

 

请在Capital Bikeshare (美国Washington, D.C.的一个共享单车公司)提供的自行车数据上进行回归分析。训练数据为2011年的数据,要求预测2012年每天的单车共享数量。