用户贷款风险预测
作者介绍:皮吉斯,热爱R语言,知乎号:皮吉斯
本次的分析数据来自kaggle数据竞赛平台的“give me some credit”竞赛项目。任务是提高模型精度AUC。
本次分析用到了多种算法,分别有:逻辑回归,cart决策树,神经网络,xgboost,随机森林。通过多种模型相互对比,最终根据auc选出最好的模型。
分析步骤:
1. 导入数据
2. 数据清洗及准备
3. 模型建立,
4. 模型评估
5. 多模型对比
下面正式开始吧!
一. 导入数据
# Load the Kaggle "Give Me Some Credit" training data; keep two working
# copies so the raw import stays untouched.
cs_training <- read.csv("cs_training.csv")
names(cs_training)
a <- cs_training
a1 <- cs_training
# Shorten the original column names to make later code easier to read.
colnames(a) <- c('id', 'response', 'x1', 'age', 'x2', 'debtratio',
                 'monthlyincome', 'x3', 'x4', 'x5', 'x6', 'x7')
导入数据,并对列名重命名,方便分析。
各字段的解释如下:
通过summary了解数据的整体情况,可以看到monthlyincome和x7变量有缺失值
二. 数据清洗
# age: basic distribution summary (mean, median, key quantiles, max, NA count)
age <- a$age
var_age <- c(var = "age",
             mean = mean(age, na.rm = TRUE),    # na.rm = TRUE ignores NAs
             median = median(age, na.rm = TRUE),
             # FIX: original called quatile() (typo, would error);
             # quantile() is the intended function.
             quantile(age, c(0, 0.01, 0.1, 0.25, 0.5, 0.75, 0.9, 0.99, 1),
                      na.rm = TRUE),
             max = max(age, na.rm = TRUE),
             missing = sum(is.na(age)))
这样可以看到各个变量的数据分布情况
# Visualize outliers: age distribution split by the response classes
boxplot(age ~ response, data = a, horizontal = TRUE, frame = FALSE,
        col = "lightgray", main = "Distribution")
上图可以看到,数据存在异常值。
处理异常值:通常采用盖帽法(缩尾处理),即用第1百分位数的值替换低于该分位数的数据,用第99百分位数的值替换高于该分位数的数据。
# Winsorize ("cap") a numeric vector: values at or below the 1st percentile
# are raised to it, values above the 99th percentile are lowered to it.
#
# x     : numeric vector (NAs would make quantile() error here)
# lower : cap the low tail at the 1% quantile?
# upper : cap the high tail at the 99% quantile?
# Returns the capped vector, same length as x.
block <- function(x, lower = TRUE, upper = TRUE) {
  if (lower) {
    q1 <- quantile(x, 0.01)
    x[x <= q1] <- q1
  }
  if (upper) {
    q99 <- quantile(x, 0.99)
    x[x > q99] <- q99
  }
  return(x)
}
经过处理,异常值大量减少
# x1: distribution summary, outlier check, then cap at 1%/99% quantiles
xa1 <- a$x1
var_x1 <- c(var = 'xa1',
            mean = mean(xa1, na.rm = TRUE),
            median = median(xa1, na.rm = TRUE),
            quantile(xa1, c(0, 0.01, 0.1, 0.5, 0.75, 0.9, 0.99, 1),
                     na.rm = TRUE),
            max = max(xa1, na.rm = TRUE),
            miss = sum(is.na(xa1)))
boxplot(x1 ~ response, data = a, horizontal = TRUE, frame = FALSE,
        col = "lightgray", main = "Distribution")
# Cap x1 outliers
a$x1 <- block(a$x1)
# x2: inspect, summarize, cap outliers; also round x1 to 2 decimals
a$x2
summary(a$x2)
xa2 <- a$x2
var_x2 <- c(var = 'xa2',
            mean = mean(xa2, na.rm = TRUE),
            median = median(xa2, na.rm = TRUE),
            quantile(xa2, c(0, 0.01, 0.1, 0.5, 0.75, 0.9, 0.99, 1),
                     na.rm = TRUE),
            max = max(xa2, na.rm = TRUE),
            miss = sum(is.na(xa2)))
boxplot(x2 ~ response, data = a, horizontal = TRUE, frame = FALSE,
        col = "lightgray", main = "Distribution")
# Cap x2 outliers at the 1%/99% quantiles
a$x2 <- block(a$x2)
a$x1 <- round(a$x1, 2)
# debtratio: distribution summary
summary(a$debtratio)
ratio <- a$debtratio
var_ratio <- c(var = 'debtratio',
               mean = mean(ratio, na.rm = TRUE),
               median = median(ratio, na.rm = TRUE),
               quantile(ratio, c(0, 0.01, 0.1, 0.5, 0.75, 0.9, 0.99, 1),
                        na.rm = TRUE),
               max = max(ratio, na.rm = TRUE),
               miss = sum(is.na(ratio)))
hist(a$debtratio)
# debtratio is a ratio, so clamp values above 1 to 1 rather than
# quantile-capping (domain-driven outlier handling)
a$debtratio <- ifelse(a$debtratio > 1, 1, a$debtratio)
# monthlyincome: summarize, impute missing values, then cap outliers
income <- a$monthlyincome
var_income <- c(var = 'income',
                mean = mean(income, na.rm = TRUE),
                median = median(income, na.rm = TRUE),
                quantile(income, c(0, 0.01, 0.1, 0.5, 0.75, 0.9, 0.99, 1),
                         na.rm = TRUE),
                max = max(income, na.rm = TRUE),
                miss = sum(is.na(income)))
hist(income)
boxplot(monthlyincome ~ response, data = a, horizontal = TRUE, frame = FALSE,
        col = "lightgray", main = "Distribution")
# Impute missing incomes with the observed mean.
# FIX: the original hard-coded 6670.2 (a pre-computed mean), which silently
# drifts if the data changes; compute it from the data instead.
mean_income <- mean(a$monthlyincome, na.rm = TRUE)
a$monthlyincome <- ifelse(is.na(a$monthlyincome), mean_income,
                          a$monthlyincome)
# Cap outliers at the 1%/99% quantiles (after imputation, as the original did)
a$monthlyincome <- block(a$monthlyincome)
# x3: distribution summary, then cap outliers
summary(a$x3)
x3 <- a$x3
var_x3 <- c(var = 'x3',
            mean = mean(x3, na.rm = TRUE),
            median = median(x3, na.rm = TRUE),
            quantile(x3, c(0, 0.01, 0.1, 0.5, 0.75, 0.9, 0.99, 1),
                     na.rm = TRUE),
            max = max(x3, na.rm = TRUE),
            miss = sum(is.na(x3)))
names(a)
# Cap x3 outliers at the 1%/99% quantiles
a$x3 <- block(a$x3)
# x4 (number of times 90+ days past due): summary, then cap outliers
summary(a$x4)
x4 <- a$x4
var_x4 <- c(var = 'x4',
            mean = mean(x4, na.rm = TRUE),
            median = median(x4, na.rm = TRUE),
            quantile(x4, c(0, 0.01, 0.1, 0.5, 0.75, 0.9, 0.99, 1),
                     na.rm = TRUE),
            max = max(x4, na.rm = TRUE),
            miss = sum(is.na(x4)))
# Cap x4 outliers at the 1%/99% quantiles
a$x4 <- block(a$x4)
# x5 (number of loans / credit lines): summary, cap outliers, re-check
summary(a$x5)
x5 <- a$x5
var_x5 <- c(var = 'x5',
            mean = mean(x5, na.rm = TRUE),
            median = median(x5, na.rm = TRUE),
            quantile(x5, c(0, 0.01, 0.1, 0.5, 0.75, 0.9, 0.99, 1),
                     na.rm = TRUE),
            max = max(x5, na.rm = TRUE),
            miss = sum(is.na(x5)))
# Winsorize, then confirm the effect visually
a$x5 <- block(a$x5)
boxplot(a$x5)
boxplot(x5 ~ response, data = a, horizontal = TRUE)
# x6: distribution summary, then cap outliers
summary(a$x6)
x6 <- a$x6
var_x6 <- c(var = 'x6',
            mean = mean(x6, na.rm = TRUE),
            median = median(x6, na.rm = TRUE),
            quantile(x6, c(0, 0.01, 0.1, 0.5, 0.75, 0.9, 0.99, 1),
                     na.rm = TRUE),
            max = max(x6, na.rm = TRUE),
            miss = sum(is.na(x6)))
# Cap x6 outliers at the 1%/99% quantiles
a$x6 <- block(a$x6)
# x7: summary, impute missing values, then cap outliers
summary(a$x7)
x7 <- a$x7
var_x7 <- c(var = 'x7',
            mean = mean(x7, na.rm = TRUE),
            median = median(x7, na.rm = TRUE),
            quantile(x7, c(0, 0.01, 0.1, 0.5, 0.75, 0.9, 0.99, 1),
                     na.rm = TRUE),
            max = max(x7, na.rm = TRUE),
            miss = sum(is.na(x7)))
hist(a$x7, freq = FALSE)
# Impute missing x7 with 0.76.
# NOTE(review): presumably a pre-computed central value of x7 -- confirm
# its origin; a computed mean/median would be more maintainable.
a$x7 <- ifelse(is.na(a$x7), 0.76, a$x7)
# Cap x7 outliers at the 1%/99% quantiles
a$x7 <- block(a$x7)
summary(a)
# Flip the 0/1 coding of the response.
# NOTE(review): in the raw Kaggle data, 1 already marks serious delinquency;
# this flip makes 1 = non-delinquent, which contradicts the stated intent of
# "1 = default" -- confirm which coding the downstream AUCs assume.
a$response <- as.numeric(!as.logical(a$response))
三. 建模
1. 数据分组
table(a$response)
数据正负比例不平衡,我们要对数据进行smote处理,smote算法的思想是合成新的少数类样本,合成的策略是对每个少数类样本a,从它的最近邻中随机选一个样本b,然后在a、b之间的连线上随机选一点作为新合成的少数类样本。
# Oversample the minority class with SMOTE (DMwR); lattice and grid are
# loaded as DMwR dependencies.
library(DMwR)
library(lattice)
library(grid)
# SMOTE needs a factor response; label the two classes "yes"/"no"
a$response <- factor(a$response)
a$response <- as.factor(ifelse(a$response == 0, "yes", "no"))
newData <- SMOTE(response ~ ., a)
table(newData$response)
# Map the class labels back to numeric 0/1 for modelling
newData$response <- ifelse(newData$response == "yes", 0, 1)
newData$response <- as.numeric(newData$response)
str(newData)
# 接下来进行分组
# 70/30 train/test split.
# FIX: seed the RNG so the split -- and therefore every AUC reported
# below -- is reproducible; the original split changed on every run.
set.seed(123)
index <- sample(seq_len(nrow(newData)), nrow(newData) * 0.7)
train <- newData[index, ]
test <- newData[-index, ]
table(train$response)
table(test$response)
这样的数据就平衡多了
2. 逻辑回归
2.1. 建立模型
# Logistic regression on all predictors.
# FIX: the original had glm() and summary() fused on one line (syntax error).
# NOTE(review): response ~ . also includes the id column as a predictor --
# consider excluding it.
glm_model <- glm(response ~ ., data = train, family = binomial())
summary(glm_model)
# all coefficients come out significant
2.2. 使用逐步法剔除变量
step(glm_model , direction = "both")
可以看到没有变量被剔除
2.3. VIF多重共线性检验
library(car)
library(carData)
# Multicollinearity check; rule of thumb used here: VIF > 2 flags
# collinearity. No value exceeds 2, so predictors are treated as independent.
vif(glm_model)
2.4. 预测
# Predicted default probabilities on both partitions
train_pred <- predict(glm_model, newdata = train, type = "response")
test_pred <- predict(glm_model, newdata = test, type = "response")
2.5. 模型评估(auc=0.842466)
# ROC/AUC evaluation with ROCR.
# FIX: ROCR was only loaded much later in the original script; load it here
# so prediction()/performance() exist on first use.
library(ROCR)
test_prob <- prediction(test_pred, test$response)
test_perf <- performance(test_prob, "tpr", "fpr")
test_auc <- performance(test_prob, "auc")
plot(test_perf)
abline(a = 0, b = 1)  # diagonal = random-classifier reference line
测试集的auc是0.842466
3. cart决策树模型
3.1. 模型建立
library(rpart)
library(rpart.plot)
library(RWeka)
# Depth-limited CART on the 0/1 target.
# NOTE(review): method = "anova" fits a regression tree to the 0/1 values;
# for a binary target method = "class" is the usual choice. The reported
# AUC of 0.367 (below 0.5) suggests predictions/labels may be inverted.
cart_dt <- rpart.control(minsplit = 50, maxdepth = 4, xval = 10, cp = 0)
cart_model <- rpart(response ~ ., data = train, method = "anova",
                    control = cart_dt)
rpart.plot(cart_model, digits = 3)
3.2. 预测
test_pred <- predict(cart_model , newdata = test)
3.3. 模型评估(auc=0.3670627;AUC低于0.5说明预测方向与标签编码相反,需检查response的0/1编码方向)
# ROC curve and AUC for the CART model (via ROCR)
test_prob <- prediction(test_pred, test$response)
test_perf <- performance(test_prob , "tpr" , "fpr")
test_auc <- performance(test_prob ,"auc")
plot(test_perf)
# Diagonal reference line for a random classifier
abline(a=0,b=1)
4. nnet包的单隐层BP网络
4.1. 对数据进行标准化
# Standardize the predictor columns before fitting the neural network.
# FIX: the original scaled columns 2:11, which standardizes the response
# (column 2 -- breaking the 0/1 comparisons used later for accuracy) and
# skips x7 (column 12). The predictors are columns 3:12.
names(train)
train[, 3:12] <- scale(train[, 3:12])
test[, 3:12] <- scale(test[, 3:12])
4.2. 模型建立
library(nnet)
# Single-hidden-layer BP network: size = hidden units, decay = weight decay,
# linout = FALSE gives a logistic output unit
train_nnet <- nnet(response ~ ., data = train, linout = FALSE, size = 10,
                   decay = 0.0076, maxit = 200)
4.3. 预测并将概率调整为0和1
# Hard-threshold the network's predicted scores at 0.5
test_pred <- predict(train_nnet, test)
test_pred[test_pred < 0.5] <- 0
test_pred[test_pred >= 0.5] <- 1
table(test_pred)
4.4. 计算准确率
test_pred_rate<-sum(test_pred==test$response)/length(test$response)
4.5. 模型评估(auc=0.7848754)
library(gplots)
library(ROCR)
# ROC objects for the baseline nnet.
# NOTE(review): test_pred was thresholded to 0/1 above, so this is the AUC
# of hard labels, not of the probability ranking -- it understates the model.
test_prob <- prediction(test_pred, test$response)
test_perf <- performance(test_prob , "tpr" , "fpr")
test_auc <- performance(test_prob ,"auc")
4.6. 模型调参
##输入数据的预测变量必须是二分类的,且数据只包含模型的输入输出变量。 ##调参时如果变量过多,size不宜过大。 ##下面构建调参函数network()。
# Tune a single-hidden-layer nnet over one hyperparameter ("size" or "decay")
# and report train/test AUC for each candidate value.
#
# formula    : model formula; the response must be binary
# data       : data frame containing only model input/output variables
# size       : hidden-unit count (a vector when adjust == "size")
# adjust     : which parameter to sweep: "size" or "decay"
# decay      : weight decay (a vector when adjust == "decay")
# maxit      : maximum nnet iterations
# scale      : standardize the predictor columns?
# samplerate : fraction of rows used for the training split
# seed       : RNG seed for the split
# linout     : linear output unit (FALSE = logistic)
# ifplot     : draw a dual-axis plot of train/test AUC vs the parameter?
# Returns a data frame: parameter value, Train_auc, Test_auc.
# Relies on an external ROC() helper to compute the AUC values.
#
# FIX: the original had several statements fused onto single lines (syntax
# errors) and selected the swept parameter via eval(parse(text = adjust));
# an explicit switch() is safer and clearer.
network <- function(formula, data, size, adjust, decay = 0, maxit = 200,
                    scale = TRUE, samplerate = 0.7, seed = 1,
                    linout = FALSE, ifplot = TRUE) {
  library(nnet)
  # Normalize the response factor levels to 0/1
  yvar <- colnames(data) == (all.vars(formula)[1])
  levels(data[, yvar]) <- c(0, 1)
  # Reproducible train/test split
  set.seed(seed)
  select <- sample(seq_len(nrow(data)), nrow(data) * samplerate)
  train <- data[select, ]
  test <- data[-select, ]
  # Optionally standardize the predictor columns
  if (scale) {
    xvar <- colnames(data) != (all.vars(formula)[1])
    train[, xvar] <- scale(train[, xvar])
    test[, xvar] <- scale(test[, xvar])
  }
  # Candidate values of the swept parameter
  obj <- switch(adjust, size = size, decay = decay)
  auc <- data.frame()
  for (i in obj) {
    if (adjust == "size") {
      mynnet <- nnet(formula, size = i, linout = linout, decay = decay,
                     maxit = maxit, trace = FALSE, data = train)
    } else if (adjust == "decay") {
      mynnet <- nnet(formula, size = size, linout = linout, decay = i,
                     maxit = maxit, trace = FALSE, data = train)
    }
    # AUC on both splits via the external ROC() helper
    objcolname <- all.vars(formula)[1]
    auc0 <- ROC(model = mynnet, train = train, test = test,
                objcolname = objcolname, ifplot = FALSE)
    # Accumulate one row per candidate value
    auc <- rbind(auc, data.frame(i, auc0))
  }
  names(auc) <- c(adjust, "Train_auc", "Test_auc")
  if (ifplot) {
    library(plotrix)
    twoord.plot(auc[, 1], auc$Train_auc, auc[, 1], auc$Test_auc,
                lcol = 4, rcol = 2, xlab = adjust, ylab = "Train_auc",
                rylab = "Test_auc", type = c("l", "b"), lab = c(15, 5, 10))
  }
  return(auc)
}
# Sweep the hidden-layer size; per the author, AUC peaks around size 6
auc <- network(response ~ ., data = train, size = 4:16, adjust = "size",
               decay = 0.0001, maxit = 200, scale = TRUE)
# With size fixed at 11, sweep the weight decay; AUC peaks near decay 0.0025
auc <- network(response ~ ., data = train, size = 11, adjust = "decay",
               decay = c(0, seq(0.0001, 0.01, 0.0003)), maxit = 200)
4.7. 建模2.0
#调参后的模型 #size = 4 , decay = 0.0031 , maxit = 200(注:正文提到的size=11/decay=0.0013与此处代码不一致)
library(nnet)
# Refit with "tuned" hyperparameters.
# NOTE(review): the surrounding text mentions size = 11 / decay = 0.0013 and
# "best decay = 0.0025", yet the code uses size = 4 / decay = 0.0031 --
# confirm which setting produced the reported AUC of 0.792.
train_nnet <- nnet(response ~ ., data = train, linout = FALSE, size = 4,
                   decay = 0.0031, maxit = 200)
4.8. 模型评估(auc = 0.7923195)
# Threshold the tuned network's predictions, compute accuracy, evaluate ROC.
# FIX: the original fused several statements onto single lines (syntax
# errors); one statement per line.
test_pred <- predict(train_nnet, test)
test_pred[test_pred < 0.5] <- 0
test_pred[test_pred >= 0.5] <- 1
table(test_pred)
# Accuracy of the thresholded predictions
test_pred_rate <- sum(test_pred == test$response) / length(test$response)
# ROC/AUC (note: computed on hard 0/1 labels, not probabilities)
library(gplots)
library(ROCR)
test_prob <- prediction(test_pred, test$response)
test_perf <- performance(test_prob, "tpr", "fpr")
test_auc <- performance(test_prob, "auc")
plot(test_perf)
abline(a = 0, b = 1)
5. xgboost算法
5.1. 模型建立
library(xgboost)
# Gradient-boosted trees (logistic objective) on the training partition.
# NOTE(review): train[,-1] drops only the id column, so the response
# (column 2) is still inside the feature matrix -- the model is trained on
# its own label (leakage), inflating the reported AUC. The feature set
# should be train[,-c(1,2)], matched by the same change at prediction time.
xgb_model <- xgboost(data = data.matrix(train[,-1]) ,
label=data.matrix(train$response) ,
max_depth = 6, eta= 0.3 , nthread = 2 , nrounds = 15 ,
objective = "binary:logistic" , seed = 123)
5.2. 预测
test_prob <- predict(xgb_model , data.matrix(test[,-1]))
5.3. 模型评估(auc = 0.860617)
# ROC/AUC for xgboost on the held-out split (via ROCR)
test_pred <- prediction(test_prob , test$response)
test_perf <- performance(test_pred , "tpr" , "fpr")
test_auc <- performance(test_pred , "auc")
6. 随机森林
6.1. 模型建立
library(randomForest)
# Random forest on all predictors; response is numeric 0/1 at this point,
# so randomForest fits a regression forest and predictions are continuous
# scores rather than class labels.
random_model <- randomForest(response ~ ., data = train)
random_model
6.2. 模型预测
test_prob <- predict(random_model , test , type= "response")
6.3. 模型评估(auc = 0.8531173)
# ROC/AUC for the random forest (via ROCR)
test_pred <- prediction(test_prob , test$response)
test_perf <- performance(test_pred , "tpr" , "fpr")
test_auc <- performance(test_pred , "auc")
综上,通过多种模型对比可以看到xgboost算法的模型精确度是最高的达到0.860617。
公众号后台回复关键字即可学习
回复 爬虫 爬虫三大案例实战
回复 Python 1小时破冰入门回复 数据挖掘 R语言入门及数据挖掘
回复 人工智能 三个月入门人工智能
回复 数据分析师 数据分析师成长之路
回复 机器学习 机器学习的商业应用
回复 数据科学 数据科学实战
回复 常用算法 常用数据挖掘算法