2018年“达观杯”复盘——任务6
任务6:
1)进一步通过网格搜索法对3个模型进行调优(用5000条数据,调参时采用五折交叉验证的方式),并进行模型评估,展示代码的运行结果。(可以尝试使用其他模型)
2)模型融合,模型融合方式任意,并结合之前的任务给出你的最优结果。
例如Stacking融合,用你目前评分最高的模型作为基准模型,和其他模型进行stacking融合,得到最终模型及评分结果。
一、网格搜索
- 网格搜索(Grid Search)用简单的话来说,就是手动给出模型中你想要调节的所有参数及其候选取值,程序自动用穷举法把所有参数组合都运行一遍,从中选出效果最好的一组。例如在决策树中,我们常常将最大树深作为需要调节的参数;
- K折交叉验证:将训练数据平均分成K份,每次取其中一份作验证集、其余K-1份作训练集,重复K次后取K次评估结果的平均值作为该组参数的得分,从而降低单次划分带来的偶然性。
二、模型优化
实现
#1. 导入所需包
import lightgbm as lgb
import numpy as np
import pandas as pd
from sklearn.datasets import make_classification
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score,f1_score,precision_score,recall_score,roc_auc_score,roc_curve,auc
from sklearn.model_selection import train_test_split
from sklearn.model_selection import train_test_split,GridSearchCV
from sklearn.svm import SVC
from sklearn.svm import LinearSVC
数据预处理
## Load data (5000 rows of the competition training set).
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer

train_data = pd.read_csv('datalab/14936/train_set.csv', nrows=5000)
## Drop the character-level 'article' column; only 'word_seg' is used below.
train_data.drop(columns='article', inplace=True)
## TF-IDF features from the word-segmented text.
tfidf = TfidfVectorizer()
# Bug fix: the full TF-IDF matrix was originally also named x_train and then
# immediately overwritten by the split result — use X for the full matrix.
X = tfidf.fit_transform(train_data['word_seg'])
## Split into train/test (70/30), fixed seed for reproducibility.
y = train_data['class']
x_train, x_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=123)
网格搜索
#网格搜索 5折交叉验证。
def gride_search(model, para):
    """Tune `model` over the parameter grid `para` with 5-fold CV and
    compare it against the already-fitted default `model`.

    Relies on the module-level split: x_train, x_test, y_train, y_test.
    `model` must already be fitted (its default-parameter scores are printed
    for comparison). Prints best CV score, default vs. best parameters, and
    train/test accuracy for both models.
    """
    grid = GridSearchCV(model, para, cv=5, scoring='accuracy', n_jobs=4)
    grid = grid.fit(x_train, y_train)
    # NOTE: the original also computed decision_function/predict_proba
    # outputs here but never used them — dead code, removed.
    print('参数调整前后对比:')
    print('best score:', grid.best_score_)
    # Bug fix: get_params was printed without calling it (bound-method repr).
    print('默认参数:', model.get_params())
    print('最优参数:', grid.best_params_)
    # Bug fix: .score() returns accuracy, not error — labels corrected.
    print('默认参数 训练准确率:', model.score(x_train, y_train))
    print('最优参数 训练准确率:', grid.score(x_train, y_train))
    print('默认参数 测试准确率:', model.score(x_test, y_test))
    print('最优参数 测试准确率:', grid.score(x_test, y_test))
LR模型
# --- Logistic regression: baseline fit, then grid search over penalty and C ---
clf_lr = LogisticRegression(random_state=2018)
clf_lr.fit(x_train, y_train)
# NOTE(review): penalty='l1' requires a compatible solver ('liblinear'/'saga');
# with the default solver, newer scikit-learn raises — confirm sklearn version.
para = {'penalty': ['l1', 'l2'], 'C': [1e-2, 1e-1, 1, 10]}
gride_search(clf_lr, para)

# Refit with the best parameters found and report micro-averaged F1.
lg = LogisticRegression(C=10, penalty='l2')
lg.fit(x_train, y_train)
lg_y_prediction = lg.predict(x_test)
label = list(range(1, 20))  # class ids 1..19 (replaces manual append loop)
f1 = f1_score(y_test, lg_y_prediction, labels=label, average='micro')
print('lg/The F1 Score: ' + str("%.2f" % f1))
svm模型
## SVM: baseline fit, then grid search over C and kernel.
# Bug fix: the file imports SVC/LinearSVC directly (not the `svm` module),
# so the original `svm.SVC(...)` / `svm.LinearSVC(...)` raised NameError.
clf_svm = SVC(random_state=2018)
clf_svm.fit(x_train, y_train)
# Parameters tuned: C and kernel.
para = {'C': [1e-2, 1e-1, 1, 10], 'kernel': ['linear', 'poly', 'rbf', 'sigmoid']}
gride_search(clf_svm, para)

# Refit with the best parameters (linear kernel -> LinearSVC is faster).
# Bug fix: the result no longer shadows the name `svm`.
svm_best = LinearSVC(C=10)
svm_best.fit(x_train, y_train)
svm_y_prediction = svm_best.predict(x_test)
label = list(range(1, 20))  # class ids 1..19 (replaces manual append loop)
f1 = f1_score(y_test, svm_y_prediction, labels=label, average='micro')
print('SVM/The F1 Score: ' + str("%.2f" % f1))
lgb模型
# --- LightGBM: baseline fit, then grid search ---
clf_lgb = lgb.LGBMClassifier(random_state=2018)
clf_lgb.fit(x_train, y_train)
# Parameters tuned: learning_rate, max_depth, n_estimators.
para = {'learning_rate': [0.2, 0.5, 0.7],
        'max_depth': range(1, 10, 2),
        'n_estimators': range(20, 100, 10)}
gride_search(clf_lgb, para)

# Refit with the chosen parameters (lgb.sklearn.LGBMClassifier is the same
# class as lgb.LGBMClassifier — use the shorter idiomatic alias).
estimator = lgb.LGBMClassifier(num_leaves=31, learning_rate=0.1, n_estimators=40)
estimator.fit(x_train, y_train)
lgb_y_pred = estimator.predict(x_test)
# Model evaluation: micro-averaged F1 over class ids 1..19.
label = list(range(1, 20))  # replaces manual append loop
f1 = f1_score(y_test, lgb_y_pred, labels=label, average='micro')
print('lightgbm模型的The F1 Score: ' + str("%.2f" % f1))
结果:
模型 | 最优参数 | F1评分 |
---|---|---|
LR | C=10, penalty='l2' | F1=0.67 |
SVM | C=10, kernel='linear' | F1=0.68 |
LightGBM |
结果后续补上,还在跑程序