1. 原理推导
直观上去理解朴素贝叶斯算法
![机器学习03-朴素贝叶斯-python 机器学习03-朴素贝叶斯-python](/default/index/img?u=aHR0cHM6Ly9waWFuc2hlbi5jb20vaW1hZ2VzLzI4NS9mMWNkNjQwMDE2YWJiNzJlZDEyNzAwZTUyNjNhN2M3ZC5KUEVH)
![机器学习03-朴素贝叶斯-python 机器学习03-朴素贝叶斯-python](/default/index/img?u=aHR0cHM6Ly9waWFuc2hlbi5jb20vaW1hZ2VzLzk3OC85YjY1NWY4NGM1OTQ4MjliNTI4OTVmMzdkMmFlODA0YS5KUEVH)
2. 代码实现
1. 计算概率
def trainNB0(trainMat , trainCategory):
'''
计算文本每个词出现概率
para:
trainMat: 文本词向量
trainCategory: 分类向量
return:
p1Vec: 分类1的词向量概率
p0Vec: 分类0的词向量概率
pAbusive: 分类1的先验概率
'''
m,n = trainMat.shape
pAbusive = np.sum(trainCategory) / float(m)
p0Num = np.ones(n)
p1Num = np.ones(n)
p0Denom = 2.0
p1Denom = 2.0
for i in range(m):
if trainCategory[i] == 1:
p1Num = p1Num + trainMat[i]
p1Denom = p1Denom + np.sum(trainCategory[i])
else:
p0Num += trainMat[i]
p0Denom += np.sum(trainCategory[i])
p1Vec = np.log(p1Num / p1Denom)
p0Vec = np.log(p0Num / p0Denom)
return p1Vec , p0Vec , pAbusive
2. 比较概率大小
def classifyNB(vec2Classify , p0Vec , p1Vec , pClass1):
'''
para:
vec2Classify:我们需要检测的词向量
p0Vec: 负类词向量的概率
p1Vec: 正类词向量的概率
pClass1: 正负的概率
return: 1 正类
0 负类
'''
p1 = sum(vec2Classify * p1Vec) + np.log(pClass1)
p0 = sum(vec2Classify * p0Vec) + np.log(1-pClass1)
if p1>p0:
return 1
else:
return 0
3. sklearn实现
from sklearn.naive_bayes import GaussianNB , MultinomialNB , BernoulliNB
'''
GaussianNB
针对连续值
MultinomialNB
离散值 可以多次出现
BernoulliNB
离散值 要么存在要么不存在
'''
import numpy as np
import beiyesi as bys
if __name__ == "__main__":
data , target = bys.loadData()
worldList = bys.createWorldVec(data)
trainMat = []
for line in data:
trainMat.append(bys.setOfWords2Vec(worldList , line))
clf0 = GaussianNB()
clf1 = MultinomialNB()
clf2 = BernoulliNB()
clfVec =[]
clfVec.append(clf0)
clfVec.append(clf1)
clfVec.append(clf2)
for clf in clfVec:
clf.fit(np.array(trainMat) , np.array(target))
testDoc = ['love' , 'my' , 'dalmation']
testVec = np.array(bys.setOfWords2Vec(worldList , testDoc))
ret = clf.predict(testVec.reshape((1,-1)))
ret1 = clf.predict_proba(testVec.reshape((1,-1)))
ret2 = clf.predict_log_proba(testVec.reshape((1,-1)))
print ret , ret1 , ret2