机器学习实战:逻辑回归示例——从疝气病症预测病马的死亡率
1. 准备数据:处理缺失值
2. 逻辑回归算法:
算法来自:机器学习实战:逻辑回归 Logistic Regression
def stocGradAscent1(dataMatrix, classLabels, numIter=150):
    """Train logistic-regression weights via stochastic gradient ascent.

    Each of the `numIter` passes visits every sample exactly once in a
    random order (sampling without replacement), using a step size that
    decays over time but never reaches zero.

    Args:
        dataMatrix: 2-D np.ndarray of shape (m, n), one sample per row.
        classLabels: sequence of m labels, each 0.0 or 1.0.
        numIter: number of full passes over the training data.

    Returns:
        1-D np.ndarray of n trained weights.
    """
    def _sigmoid(z):
        # Numerically stable logistic function, identical in value to
        # 1/(1+exp(-z)) but avoiding overflow for large |z|.
        if z >= 0:
            return 1.0 / (1.0 + np.exp(-z))
        ez = np.exp(z)
        return ez / (1.0 + ez)

    m, n = np.shape(dataMatrix)
    weights = np.ones(n)
    for j in range(numIter):
        dataIndex = list(range(m))
        for i in range(m):
            # alpha decays with j and i but the +0.01 floor keeps new data
            # from having zero influence late in training.
            alpha = 4 / (1.0 + j + i) + 0.01
            # Draw a *position* in the remaining index list, then map it to
            # the actual sample index.  The book's original code indexed
            # dataMatrix with the position itself, which biased updates
            # toward low-numbered rows and skipped later rows entirely.
            randPos = int(np.random.uniform(0, len(dataIndex)))
            sampleIdx = dataIndex[randPos]
            h = _sigmoid(np.sum(dataMatrix[sampleIdx] * weights))
            error = classLabels[sampleIdx] - h
            weights = weights + alpha * error * dataMatrix[sampleIdx]
            del dataIndex[randPos]
    return weights
3. 用 Logistic 回归进行分类
def classifyVector(inX, weights):
    """Classify one feature vector with a trained logistic model.

    Args:
        inX: feature vector (np.ndarray), same length as `weights`.
        weights: trained logistic-regression weight vector.

    Returns:
        1.0 when the predicted probability exceeds 0.5, otherwise 0.0.
    """
    prob = sigmoid(sum(inX * weights))
    # Threshold the probability at 0.5 to get a hard class label.
    return 1.0 if prob > 0.5 else 0.0
def colicTest(trainPath='/Users/xxxx/Downloads/machinelearninginaction-master/Ch05/horseColicTraining.txt',
              testPath='/Users/xxxx/Downloads/machinelearninginaction-master/Ch05/horseColicTest.txt'):
    """Train on the horse-colic training file and report the test error rate.

    Each tab-separated line holds the feature columns followed by the class
    label in the last column (the original code hard-coded 21 features; the
    count is now derived from the line itself).

    Args:
        trainPath: training-set path; defaults to the original hard-coded
            location so existing callers are unaffected.
        testPath: test-set path, same format as the training file.

    Returns:
        float error rate observed on the test set.
    """
    def _parseLine(line):
        # Split one tab-separated record into (features, label).
        fields = line.strip().split('\t')
        return [float(v) for v in fields[:-1]], float(fields[-1])

    trainingSet = []
    trainingLabels = []
    # 'with' guarantees the handle is closed; the original leaked both files.
    with open(trainPath) as frTrain:
        for line in frTrain:
            features, label = _parseLine(line)
            trainingSet.append(features)
            trainingLabels.append(label)
    trainWeights = stocGradAscent1(np.array(trainingSet), trainingLabels, 500)

    errorCount = 0
    numTestVec = 0.0
    with open(testPath) as frTest:
        for line in frTest:
            numTestVec += 1.0
            features, label = _parseLine(line)
            # int(float(...)) tolerates labels written as "1.0" as well as "1"
            # (the original int(currLine[21]) would raise on "1.0").
            if int(classifyVector(np.array(features), trainWeights)) != int(label):
                errorCount += 1
    errorRate = float(errorCount) / numTestVec
    print("the error rate of this test is:%f" % errorRate)
    return errorRate
def multiTest():
    """Run colicTest ten times and print the average error rate.

    Repeated runs smooth out the randomness of stochastic gradient ascent.
    """
    numTests = 10
    totalError = sum(colicTest() for _ in range(numTests))
    print("after %d iterations the average error rate is: %f" % (numTests, totalError / float(numTests)))