机器学习实战(三)朴素贝叶斯分类实例

 一、问题描述

 

机器学习实战(三)朴素贝叶斯分类实例

机器学习实战(三)朴素贝叶斯分类实例

机器学习实战(三)朴素贝叶斯分类实例

 

二、问题解决 

贝叶斯分类:参考上篇博客

 

三、代码实现 

# -*- coding: utf-8 -*-
# @Time    : 19-4-10 上午8:34
# @Author  : MRB
# @File    : demo2.py
# @Software: PyCharm Community Edition
from numpy import *

def str2list(data_str):
    """Parse a whitespace-separated string of integers.

    :param data_str: text such as ``"1 0 25 180 12"``; leading and trailing
        whitespace is ignored
    :return: list of the parsed integers, in input order
    """
    tokens = data_str.strip().split()
    return list(map(int, tokens))

# Read the labelled training data and the sample to classify from stdin
def load_data(n):
    """Read the labels, ``n`` training rows and one test row from standard input.

    Lines are consumed in this order: one line holding the integer labels,
    then ``n`` lines of integer features (one training sample per line),
    then one line holding the sample to classify. The parsed labels are
    echoed with ``print``.

    :param n: number of training samples expected
    :return: ``(labels, train_set, test_set)``, or ``-1`` when the label
        line does not contain exactly ``n`` values
    """
    labels = list(map(int, input().strip().split()))
    print(labels)
    if len(labels) != n:
        return -1
    train_set = [str2list(input()) for _ in range(n)]
    test_set = str2list(input())
    return labels, train_set, test_set

# Tally feature values per class so that probabilities can be derived later
def count(data_mat, labels):
    """Count how often each feature value occurs within each class.

    :param data_mat: feature matrix, one row per sample
    :param labels: class label of each row; a label equal to 0 is tallied
        under ``"0"``, every other label under ``"1"``
    :return: ``(result, pA)`` where
        ``result[class][feature_index][value]`` is the number of rows of
        that class whose feature has that value (all keys are strings), and
        ``pA`` is ``sum(labels)`` divided by the number of rows
    """
    sample_total = len(data_mat)
    feature_total = len(data_mat[0])

    # With 0/1 labels this is the share of rows labelled 1.
    prior_positive = sum(labels) / float(sample_total)

    tallies = {"0": {}, "1": {}}
    for column in range(feature_total):
        negatives, positives = {}, {}
        for row_number, row in enumerate(data_mat):
            bucket = negatives if labels[row_number] == 0 else positives
            value_key = str(row[column])
            bucket[value_key] = bucket.get(value_key, 0) + 1
        tallies['0'][str(column)] = negatives
        tallies['1'][str(column)] = positives
    return tallies, prior_positive

def _to_bin(value, low, width):
    """Return the index (0, 1 or 2) of the equal-width bin that holds ``value``.

    Bin 0 is ``value < low + width``, bin 1 is ``value < low + 2 * width``
    and everything else lands in bin 2.
    """
    if value < low + width:
        return 0
    if value < low + width * 2:
        return 1
    return 2


def main():
    """Run the naive Bayes demo on the built-in data set and print the result.

    Steps:
      1. Discretise the numeric columns (indices 2-4) into three equal-width
         bins. The bin boundaries come from the training data only and the
         test sample is binned with the same boundaries.
      2. Count feature values per class with ``count`` and turn the counts
         into conditional probabilities.
      3. Multiply the class prior by the conditional probability of every
         feature of the test sample and print both scores and their ratio.

    NOTE: no smoothing is applied, so a test value that never occurs for a
    class in the training data raises ``KeyError``.
    """
    # The data could instead be read from stdin via
    # ``load_data(int(input('n:')))``; the demo uses a fixed sample.
    labels = [0, 0, 0, 0, 1, 1, 1, 1, 1]
    train_set = [
        [0, 0, 30, 450, 7],
        [1, 1, 5, 500, 3],
        [1, 0, 10, 150, 1],
        [0, 1, 40, 300, 6],
        [1, 0, 20, 100, 10],
        [0, 1, 25, 180, 12],
        [0, 0, 32, 50, 11],
        [1, 0, 23, 120, 9],
        [0, 0, 27, 200, 8],
    ]
    test_set = [0, 0, 40, 180, 8]

    # Columns 0 and 1 are already categorical; 2..4 hold numeric values.
    for index in range(2, 5):
        # A list (not a generator) is passed on purpose: the file star-imports
        # numpy, which may shadow min/max with the array versions.
        column = [row[index] for row in train_set]
        low, high = min(column), max(column)
        width = (high - low) / 3.0
        for row in train_set:
            row[index] = _to_bin(row[index], low, width)
        test_set[index] = _to_bin(test_set[index], low, width)

    print("labels: ", labels)
    print("train_set: ", train_set)
    print("test_set: ", test_set)

    # train
    result, prior_play = count(train_set, labels)
    print("*" * 100)

    # likelihood[(class, feature_index, value)] = P(value | class).
    # Tuple keys keep the three parts separate; concatenating them into one
    # string would be ambiguous once an index or value has several digits.
    likelihood = {}
    for class_key, features in result.items():
        for feature_key, value_counts in features.items():
            # Keep the list: numpy's sum does not accept a dict view.
            total = sum([int(occurrences) for occurrences in value_counts.values()])
            for value_key, occurrences in value_counts.items():
                likelihood[(class_key, feature_key, value_key)] = occurrences / total

    # test
    p0 = 1 - prior_play
    p1 = prior_play
    for index, value in enumerate(test_set):
        p0 *= likelihood[('0', str(index), str(value))]
        p1 *= likelihood[('1', str(index), str(value))]
    print("去打篮球:", p1)
    print("不去打篮球:", p0)
    print('去打篮球/不去打篮球:', round(p1 / p0, 3))

# Run the demo only when this file is executed as a script, not when imported.
if __name__ == '__main__':
    main()