












import numpy as np
import operator
import matplotlib
import matplotlib.pyplot as plt #KNN 分类器
def classify0(inX, dataSet, labels, k):
    """Classify ``inX`` by majority vote among its ``k`` nearest neighbours.

    Distances are Euclidean, computed between ``inX`` and every row of
    ``dataSet``.

    Args:
        inX: Query vector; it is tiled once per row of ``dataSet``.
        dataSet: Array whose rows are the reference samples.
        labels: Sequence indexed by row position in ``dataSet``.
        k: Number of nearest neighbours that vote.

    Returns:
        The label with the most votes among the ``k`` closest rows. On a
        tied vote the label that was encountered first (i.e. belonging to
        the nearer neighbour) wins, because the sort is stable.
    """
    sampleCount = dataSet.shape[0]

    # Repeat the query once per sample so the subtraction is row-by-row.
    offsets = np.tile(inX, (sampleCount, 1)) - dataSet
    distances = (offsets ** 2).sum(axis=1) ** 0.5

    # argsort yields sample indices ordered from nearest to farthest.
    nearestFirst = distances.argsort()

    votes = {}
    for rank in range(k):
        neighbourLabel = labels[nearestFirst[rank]]
        votes[neighbourLabel] = votes.get(neighbourLabel, 0) + 1

    ranked = sorted(votes.items(), key=operator.itemgetter(1), reverse=True)
    return ranked[0][0]

def file2matrix(fileName):
    """Load a tab-separated sample file into a feature matrix and label list.

    Each non-blank line is expected to hold three numeric feature columns
    followed by a class label in the last column.

    Args:
        fileName: Path of the text file to read.

    Returns:
        A tuple ``(returnMat, classLabelVector)`` where ``returnMat`` is an
        ``(n, 3)`` float array of the first three columns and
        ``classLabelVector`` is a list of ``n`` integer labels.

    Raises:
        ValueError: If a feature or label field cannot be parsed as a number.
    """
    # The context manager guarantees the file handle is closed.
    with open(fileName) as fr:
        # Strip surrounding whitespace/newlines and skip blank lines so a
        # trailing empty line does not break the numeric conversion.
        rows = [line.strip().split('\t') for line in fr if line.strip()]

    returnMat = np.zeros((len(rows), 3))
    classLabelVector = []
    for index, listFromLine in enumerate(rows):
        returnMat[index, :] = [float(value) for value in listFromLine[0:3]]
        # NOTE(review): labels are assumed to be integers in the last column
        # (callers print them with %d) - confirm against the data file.
        classLabelVector.append(int(listFromLine[-1]))
    return returnMat, classLabelVector

def autoNorm(dataSet):
    """Min-max scale every column of ``dataSet`` into the range [0, 1].

    Args:
        dataSet: 2-D array with one sample per row and one feature per
            column.

    Returns:
        A tuple ``(normDataSet, ranges, minVals)``: the scaled data, the
        per-column ``max - min`` values and the per-column minimums.
        A constant column (range 0) is scaled to all zeros instead of
        producing NaN/inf; ``ranges`` still reports the true value 0 for it.
    """
    minVals = dataSet.min(0)
    maxVals = dataSet.max(0)
    ranges = maxVals - minVals

    # Substitute 1 for zero ranges so the division below stays finite; the
    # numerator of such a column is already all zeros.
    safeRanges = np.where(ranges == 0, 1, ranges)

    # Broadcasting applies the per-column vectors to every row.
    normDataSet = (dataSet - minVals) / safeRanges
    return normDataSet, ranges, minVals

def datingClassTest(hoRatio=0.10, pathName="./datingTestSet2.txt", k=5):
    """Measure the kNN error rate on a held-out slice of the dating data.

    The first ``hoRatio`` fraction of the (normalised) samples is used as
    the test set; the remaining samples serve as the reference set passed
    to ``classify0``. Each prediction and the final error rate are printed.

    Args:
        hoRatio: Fraction of samples, taken from the start of the file,
            that is held out for testing.
        pathName: Path of the tab-separated data file read by
            ``file2matrix``.
        k: Number of neighbours passed to ``classify0``.
    """
    datingDataMat, datingLabels = file2matrix(pathName)
    normMat, _, _ = autoNorm(datingDataMat)
    row = normMat.shape[0]
    numTestVecs = int(hoRatio * row)

    # Rows [numTestVecs, row) are the reference samples; rows before that
    # are the queries being evaluated.
    referenceMat = normMat[numTestVecs:row, :]
    referenceLabels = datingLabels[numTestVecs:row]

    errorCount = 0.0
    for i in range(numTestVecs):
        classifierResult = classify0(normMat[i, :], referenceMat, referenceLabels, k)
        print("The classifier came back with: %d, the real answer is: %d" % (classifierResult, datingLabels[i]))
        if classifierResult != datingLabels[i]:
            errorCount += 1.0
    print("The total error rate is : %f" % (errorCount / float(numTestVecs)))


datingClassTest()


from os import listdir
def img2vector(fileName):
    """Read a 32x32 text image of digits into a 1x1024 row vector.

    The first 32 characters of each of the first 32 lines are converted
    with ``int`` and stored row after row.

    Args:
        fileName: Path of the text image file.

    Returns:
        A ``(1, 1024)`` float array; element ``i * 32 + j`` holds the value
        of character ``j`` on line ``i``.

    Raises:
        IndexError: If one of the first 32 lines has fewer than 32 characters.
        ValueError: If a character is not a decimal digit.
    """
    returnVect = np.zeros((1, 32 * 32))
    # The context manager closes the file even if parsing fails.
    with open(fileName) as fr:
        for i in range(32):
            lineStr = fr.readline()
            for j in range(32):
                returnVect[0, i * 32 + j] = int(lineStr[j])
    return returnVect


# Handwritten-digit test: error rate
def handwritingClassTest(trainingDir='trainingDigits', testDir='testDigits', k=3):
    """Measure the kNN error rate on the handwritten-digit text images.

    Every file in ``trainingDir`` is loaded with ``img2vector`` to form the
    reference set; every file in ``testDir`` is then classified with
    ``classify0``. The true digit of a file is the integer before the first
    underscore in its name. Each prediction and the final error rate are
    printed.

    Args:
        trainingDir: Directory holding the reference images.
        testDir: Directory holding the images to classify.
        k: Number of neighbours passed to ``classify0``.
    """
    trainingFileList = listdir(trainingDir)
    m = len(trainingFileList)
    trainingMat = np.zeros((m, 32 * 32))
    hwLabels = []
    for i, fileNameStr in enumerate(trainingFileList):
        fileStr = fileNameStr.split('.')[0]
        classNum = int(fileStr.split('_')[0])
        # Record the label so hwLabels stays aligned with trainingMat rows;
        # without this classify0 receives an empty label list.
        hwLabels.append(classNum)
        trainingMat[i, :] = img2vector('%s/%s' % (trainingDir, fileNameStr))

    testFileList = listdir(testDir)
    mTest = len(testFileList)
    errorCount = 0.0
    for fileNameStr in testFileList:
        fileStr = fileNameStr.split('.')[0]
        realClassNum = int(fileStr.split('_')[0])
        testVect = img2vector('%s/%s' % (testDir, fileNameStr))
        testClassNum = classify0(testVect, trainingMat, hwLabels, k)
        print("The classifier came back with: %d, the real answer is: %d" % (testClassNum, realClassNum))
        if testClassNum != realClassNum:
            errorCount += 1.0
    print("The total error rate is: %f" % (errorCount / float(mTest)))


handwritingClassTest()

