K-近邻算法之手写数字识别系统
定义将图像转换为向量的函数
# 导入程序所需要的模块
import numpy as np
import operator
from os import listdir
读取文件
def img2vector(filename):
    """Read a 32x32 text image of digit characters into a 1x1024 row vector.

    Args:
        filename: Path to a text file. Only the first 32 lines are read, and
            only the first 32 characters of each line; every such character
            must be parseable by ``int`` (e.g. '0' / '1' pixels).

    Returns:
        A NumPy float array of shape (1, 1024). The character at row ``i``,
        column ``j`` of the file is stored at index ``32 * i + j``.

    Raises:
        IndexError or ValueError: If the file has fewer than 32 lines, or a
            line has fewer than 32 digit characters.
    """
    returnVect = np.zeros((1, 1024))
    # The context manager closes the file even when a malformed line raises,
    # so repeated calls over a whole directory do not leak file handles.
    with open(filename) as fr:
        for i in range(32):
            lineStr = fr.readline()
            for j in range(32):
                returnVect[0, 32 * i + j] = int(lineStr[j])
    return returnVect
定义 k 近邻算法
def classify0(inX, dataSet, labels, k):
    """Classify one sample by majority vote among its k nearest neighbours.

    Args:
        inX: The sample to classify; a vector (or single-row array) that
            broadcasts against the rows of ``dataSet``.
        dataSet: 2-D NumPy array of training samples, one sample per row.
        labels: Sequence of training labels; ``labels[i]`` belongs to
            ``dataSet[i]``.
        k: Number of nearest neighbours that vote. Values larger than the
            number of training samples are clamped to that number.

    Returns:
        The label that occurs most often among the k nearest training
        samples (Euclidean distance). On a tie in vote counts, the label
        that was encountered first while walking the neighbours from
        nearest to farthest wins.

    Raises:
        IndexError: If no neighbour can vote (``k < 1`` or an empty
            ``dataSet``).
    """
    dataSetSize = dataSet.shape[0]  # number of training samples
    # Asking for more neighbours than exist would index past the end of the
    # sorted distances; use every available sample instead.
    k = min(k, dataSetSize)

    # Broadcasting subtracts inX from every training row without building a
    # tiled copy of inX first.
    diffMat = inX - dataSet
    sqDistances = (diffMat ** 2).sum(axis=1)
    distances = sqDistances ** 0.5  # Euclidean distance to each training row
    sortedDistIndicies = distances.argsort()  # indices, nearest first

    classCount = {}  # label -> number of votes among the k nearest
    for i in range(k):
        voteIlabel = labels[sortedDistIndicies[i]]
        classCount[voteIlabel] = classCount.get(voteIlabel, 0) + 1

    # sorted() is stable, so equal vote counts keep their insertion order.
    sortedClassCount = sorted(classCount.items(), key=operator.itemgetter(1), reverse=True)
    return sortedClassCount[0][0]
定义手写数字识别系统函数
def handwritingClassTest(trainingDir='./digits/trainingDigits', testDir='./digits/testDigits', k=3):
    """Train a k-NN digit classifier from one directory and evaluate it on another.

    Every file name in both directories is expected to look like
    ``<label>_<index>.txt`` (e.g. ``0_13.txt``); the integer before the first
    underscore is taken as the true class label. Each file is converted to a
    1x1024 vector with ``img2vector`` and classified with ``classify0``.

    Args:
        trainingDir: Directory holding the training images.
        testDir: Directory holding the test images.
        k: Number of nearest neighbours passed to ``classify0``.

    Returns:
        None. Prints one line per test file, then the total number of
        errors and the error rate.
    """
    def _labelFromFileName(fileNameStr):
        # "0_13.txt" -> "0_13" -> 0
        fileStr = fileNameStr.split('.')[0]
        return int(fileStr.split('_')[0])

    # Build the training matrix: one 1024-wide row per training file.
    trainingFileList = listdir(trainingDir)
    m = len(trainingFileList)
    hwLabels = []
    trainingMat = np.zeros((m, 1024))
    for i, fileNameStr in enumerate(trainingFileList):
        hwLabels.append(_labelFromFileName(fileNameStr))
        trainingMat[i, :] = img2vector('%s/%s' % (trainingDir, fileNameStr))

    # Classify every test file and count the misclassifications.
    testFileList = listdir(testDir)
    mTest = len(testFileList)
    errorCount = 0
    for fileNameStr in testFileList:
        classNumStr = _labelFromFileName(fileNameStr)
        vectorUnderTest = img2vector('%s/%s' % (testDir, fileNameStr))
        classifierResult = classify0(vectorUnderTest, trainingMat, hwLabels, k)
        print("the classifier came back with: %d, the real answer is: %d" % (classifierResult, classNumStr))
        if classifierResult != classNumStr:
            errorCount += 1

    print("\nthe total number of errors is: %d" % errorCount)
    if mTest == 0:
        # An empty test directory has no meaningful error rate; avoid
        # dividing by zero.
        print("\nthe total error rate is undefined: no test files found")
        return
    print("\nthe total error rate is: %f" % (errorCount / float(mTest)))
运行实例函数
# Demo: convert a single sample test image into its 1x1024 vector.
# NOTE(review): this absolute path looks specific to the original author's
# machine — point it at your own copy of the digits data before running.
img2vector('D:/360安全浏览器下载/MachineLearningInAction-Camp-master/Week1/Reference Code/digits/testDigits/0_13.txt')
结果为:
array([[0., 0., 0., ..., 0., 0., 0.]])
主函数为:
# Run the full train-and-evaluate pass; it prints each prediction followed by
# the total number of errors and the error rate.
handwritingClassTest()
结果如下: