机器学习之python: kNN
##################################################
# kNN : k Nearest Neighbour
# Author : Monne
# Date : 2015-01-24
# Email : 416606639@qq.com
##################################################
import numpy as np
import time
# Timestamp taken once at import time; datingwebsite()/handwriting() use it
# to report the total elapsed "time cost".
starttime = time.time()

# NOTE(review): an earlier, scalar implementation of the classifier
# (distance(), disttest(), sort(), majorcount(), kNN()) used to sit here
# inside a triple-quoted string. The author's own note said it was
# "too long, equal to classify()" -- it was dead code fully superseded by
# the vectorised classify() below, so it has been removed.
# step 1. data input

def testsample():
    """Return the tiny hard-coded sample set (4 points, labels A/B)."""
    trainx = [[1.0, 1.1],
              [1.0, 1.0],
              [0, 0],
              [0, 0.1]]
    trainy = ['A', 'A', 'B', 'B']
    return trainx, trainy


def txt2trainxy(filename):
    """Load <filename>.txt into (trainx, trainy).

    Each line holds whitespace-separated numbers; all columns but the last
    are float features, the last column is an int label.

    filename -- path WITHOUT the '.txt' suffix (it is appended here).
    """
    trainx = []
    trainy = []
    # 'with' guarantees the handle is closed (the original leaked it)
    with open(filename + '.txt') as fr:
        for line in fr:
            cols = line.split()
            # explicit list build: Python 3's map() is lazy, so appending
            # the map object itself would break callers that index the rows
            trainx.append([float(v) for v in cols[:-1]])
            trainy.append(int(cols[-1]))
    return trainx, trainy


def img2trainxy(filename):
    """Load digit-image text files from directory *filename*.

    Each file is named like '0_2.txt' -- the leading character is the digit
    label -- and contains rows of '0'/'1' characters; every file is
    flattened into one feature vector of ints.
    """
    from os import listdir
    trainx = []
    trainy = []
    for name in listdir(filename):          # e.g. '0_2.txt'
        trainy.append(int(name[0]))         # leading char is the label
        with open(filename + '/' + name) as fr:
            chars = []
            for line in fr:                 # line like '001100\n'
                chars.extend(line.strip())  # -> ['0','0','1','1','0','0']
            trainx.append([int(c) for c in chars])
    return trainx, trainy
# step 2. data transform

def norm(trainx):
    """Min-max normalise every feature column of trainx to [0, 1].

    Returns (ntrainx, minvals, span):
      ntrainx -- normalised rows as a list of lists
      minvals -- per-column minima (numpy array), for de-normalising later
      span    -- per-column (max - min) as a list of floats

    Fixes vs original: dividing by a lazy map() object fails on Python 3;
    the locals no longer shadow the builtins min/max; a constant column
    (span == 0) now yields 0.0 instead of a divide-by-zero nan.
    """
    x = np.array(trainx, dtype=float)
    minvals = x.min(0)                  # min(0) == min(axis=0), per column
    span = x.max(0) - minvals
    # guard internal division only; the *returned* span stays the true diff
    safe = np.where(span == 0.0, 1.0, span)
    ntrainx = (x - minvals) / safe
    return ntrainx.tolist(), minvals, span.tolist()
# step 3. classify function

def classify(testx, trainx, trainy, k):
    """Predict the label of testx by majority vote of its k nearest
    training points under Euclidean distance.

    testx  -- one feature vector
    trainx -- list of training vectors, same length as testx
    trainy -- labels parallel to trainx
    k      -- number of neighbours to vote (assumed <= len(trainx))

    Returns the most frequent label among the k nearest neighbours
    (ties broken arbitrarily, as in the original).
    """
    diff = np.array(trainx) - np.array(testx)   # broadcast over all rows
    sqdist = (diff ** 2).sum(axis=1)            # squared distances
    # sqrt is monotonic, so ranking squared distances gives the same order
    order = sqdist.argsort()
    votes = {}
    for i in range(k):
        label = trainy[order[i]]
        votes[label] = votes.get(label, 0) + 1
    # dict.iteritems() was Python-2-only; items() is the Python 3 spelling
    return sorted(votes.items(), key=lambda kv: kv[1], reverse=True)[0][0]
# step 4. test for error rate

def testkNN(testratio, trainx, trainy, k):
    """Hold-out test: classify the first len(trainx)*testratio rows against
    the remaining rows, print the error rate and return it.

    testratio      -- fraction of the data (taken from the front) to test
    trainx, trainy -- the full data set (features, labels)
    k              -- neighbour count passed to classify()
    """
    l = int(len(trainx) * testratio)
    errorcount = 0
    for i in range(l):
        c = classify(trainx[i], trainx[l:], trainy[l:], k)
        if c != trainy[i]:
            errorcount += 1
    rate = errorcount / float(l)
    print("the total error rate is: %f." % rate)
    # the original had this return commented out; returning the rate lets
    # callers aggregate results, and callers that ignore it are unaffected
    return rate


def randomtestkNN(testratio, trainx, trainy, k):
    """Like testkNN(), but holds out a RANDOM testratio fraction.

    BUG FIX: the original computed the test size as int(m * 0.1), silently
    ignoring its testratio parameter; it now honours it. (A commented-out
    randint-based draft that could pick repeated indices was removed.)
    """
    import random
    m = len(trainx)
    l = int(m * testratio)
    picked = random.sample(range(m), l)           # test indices, no repeats
    rest = list(set(range(m)) - set(picked))      # the remainder trains
    testx = [trainx[i] for i in picked]
    testy = [trainy[i] for i in picked]
    trainx = [trainx[i] for i in rest]
    trainy = [trainy[i] for i in rest]
    errorcount = 0
    for i in range(l):
        if classify(testx[i], trainx, trainy, k) != testy[i]:
            errorcount += 1
    rate = errorcount / float(l)
    print("the total error rate is: %f." % rate)
    return rate


def avg():
    """Run handwriting() for k = 1..9, print all error rates and the
    indices that sort them (best k first)."""
    rates = []
    for k in range(1, 10):
        rates.append(handwriting('trainingDigits', 'testDigits', k))
    rates = np.array(rates)
    print(rates)
    print(rates.argsort())
    # on the author's data k = 4 gave the lowest error (~0.03)
# step 5_1. small hard-coded sample
def sample(k, testratio=0.5):
    """Run the hold-out test on the tiny built-in sample set.

    BUG FIX: the original called testkNN(trainx, trainy, k), omitting the
    required testratio argument (TypeError at runtime). testratio is now a
    backward-compatible keyword parameter forwarded to testkNN().
    """
    trainx, trainy = testsample()
    testkNN(testratio, trainx, trainy, k)
# step 5_2. use for dating web site
def datingwebsite(filename, k):
    """End-to-end demo on the dating data: load + normalise, estimate the
    error rate on a random 10% hold-out, then classify one person typed in
    interactively.

    filename -- data file path without '.txt' (a string, e.g. 'datingTestSet2')
    k        -- neighbour count for classify()
    """
    ## step 1: load data
    print("step 1: load data...")
    trainx, trainy = txt2trainxy(filename)
    trainx, minvals, span = norm(trainx)
    ## step 2: training (kNN is lazy -- nothing to train)
    print("step 2: training...")
    ## step 3: testing
    print("step 3: testing...")
    randomtestkNN(0.10, trainx, trainy, k)
    print("time cost: ", (time.time() - starttime))
    ## step 4: show the result for interactively entered features
    print("step 4: show the result...")
    resultList = ['not at all', 'in small doses', 'in large doses']
    percentTats = float(input(
        "percentage of time spent playing video games?> "))
    ffMiles = float(input("frequent flier miles earned per year?> "))
    iceCream = float(input("liters of ice cream consumed per year?> "))
    # normalise the new point with the SAME min/span as the training data
    classx = (np.array([ffMiles, percentTats, iceCream]) - minvals) / span
    classy = classify(classx, trainx, trainy, k)
    # labels are 1-based ints, hence the -1 when indexing resultList
    print("You will probably like this person: ", resultList[classy - 1])
    # BUG FIX: the original ended with 'return (errorcount / float(l))',
    # referencing names that do not exist in this scope (NameError); removed.
# step 5_3. use for hand writing
def handwriting(trainfile, testfile, k):
    """End-to-end demo on the digit-image data: load train/test directories,
    classify every test image with kNN on a random subset of the training
    data, print and return the error rate.

    trainfile, testfile -- directories of digit text files (see img2trainxy)
    k                   -- neighbour count for classify()
    """
    ## step 1: load data
    print("step 1: load data...")
    print("---Getting training set...")
    trainx, trainy = img2trainxy(trainfile)
    print("---Getting testing set...")   # (typo 'Geting' fixed)
    testx, testy = img2trainxy(testfile)
    m = len(trainx)
    print(m, len(trainx[0]))
    print(len(testx), len(testx[0]))
    # randomly sub-sample the training set (as in the original experiment)
    print("---Random choosing the training data...")
    import random
    # BUG FIX: randint(0, m - 1) could pick n == 0, leaving an empty
    # training set and crashing classify(); keep at least one sample
    n = max(1, random.randint(0, m - 1))
    chosen = random.sample(range(m), n)
    trainx = [trainx[i] for i in chosen]
    trainy = [trainy[i] for i in chosen]
    print("---the numbers of training data is: ", n)
    ## step 2: training (kNN is lazy -- nothing to train)
    print("step 2: training...")
    ## step 3: testing
    print("step 3: testing...")
    l = len(testx)
    errorcount = 0
    for i in range(l):
        if classify(testx[i], trainx, trainy, k) != testy[i]:
            errorcount += 1
    rate = errorcount / float(l)
    print("the total error rate is: %f." % rate)
    print("time cost: ", (time.time() - starttime))
    ## step 4: show the result
    print("step 4: show the result...")
    return rate


# guard the demo calls so importing this module has no side effects
if __name__ == '__main__':
    #datingwebsite('datingTestSet2', 4)
    handwriting('trainingDigits', 'testDigits', 3)
    #avg()
机器学习之python: kNN的更多相关文章
- 机器学习之路--KNN算法
机器学习实战之kNN算法 机器学习实战这本书是基于python的，如果我们想要完成python开发，那么python的开发环境必不可少： (1)python 3.5.2，64位，这是我用的python ...
- 可能是史上最全的机器学习和Python(包括数学)速查表
新手学习机器学习很难,就是收集资料也很费劲.所幸Robbie Allen从不同来源收集了目前最全的有关机器学习.Python和相关数学知识的速查表大全.强烈建议收藏! 机器学习有很多方面. 当我开始刷 ...
- 基于Python的机器学习实战:KNN
1.KNN原理: 存在一个样本数据集合,也称作训练样本集,并且样本集中每个数据都存在标签,即我们知道样本集中每一个数据与所属分类的对应关系.输入没有标签的新数据后,将新数据的每个特征与样本集中数据对应 ...
- 吴裕雄--天生自然python机器学习实战:K-NN算法约会网站好友喜好预测以及手写数字预测分类实验
实验设备与软件环境 硬件环境:内存ddr3 4G及以上的x86架构主机一部 系统环境:windows 软件环境:Anaconda2(64位),python3.5,jupyter 内核版本:window ...
- python机器学习一:KNN算法实现
所谓的KNN算法,或者说K最近邻(kNN,k-NearestNeighbor)分类算法是数据挖掘分类技术中最简单的方法之一.所谓K最近邻,就是k个最近的邻居的意思,说的是每个样本都可以用它最接近的k个 ...
- 机器学习常用Python扩展包
在Ubuntu下安装Python模块通常有3种方法:1)使用apt-get:2)使用pip命令(推荐);3)easy_instal 可安装方法参考:[转]linux和windows下安装python集 ...
- Python KNN算法
机器学习新手,接触的是<机器学习实战>这本书,感觉书中描述简单易懂,但对于python语言不熟悉的我,也有很大的空间.今天学习的是k-近邻算法. 1. 简述机器学习 在日常生活中,人们很难 ...
- 大数据分析与机器学习领域Python兵器谱
http://www.thebigdata.cn/JieJueFangAn/13317.html 曾经因为NLTK的缘故开始学习Python,之后渐渐成为我工作中的第一辅助脚本语言,虽然开发语言是C/ ...
- 算法代码[置顶] 机器学习实战之KNN算法详解
改章节笔者在深圳喝咖啡的时候突然想到的...之前就有想写几篇关于算法代码的文章,所以回家到以后就奋笔疾书的写出来发表了 前一段时间介绍了Kmeans聚类,而KNN这个算法刚好是聚类以后经常使用的匹配技 ...
随机推荐
- HDU5039--Hilarity DFS序+线段树区间更新 14年北京网络赛
题意:n个点的树,每个条边权值为0或者1, q次操作 Q 路径边权抑或和为1的点对数, (u, v)(v, u)算2个. M i修改第i条边的权值 如果是0则变成1, 否则变成0 作法: 我们可以求出 ...
- Java学习日记-3 Character和字符串
(先说几个小问题 1.在main方法中调用主类的其他方法时,必须先生成主类的对象 2.String s = new String("Hello") 生成了两个对象 3.熟用布尔+f ...
- 安全控件开发原理分析 支付宝安全控件开发 C++
浏览器安全控件是如果支付宝一样结合web程序密码数据安全处理的程序,采用C++语言开发 通常的安全控件分为两种,一种是指支持IE内核的浏览器,一种支持所有内核的浏览器,支付宝采用的是支持所有内核的浏览 ...
- openStack 云平台使用一般杂记
1. To launch an instance,you must at least specify the flavor,image name,network,security group,keyp ...
- Java WeakHashMap 源码解析
前面把基于特定数据结构的Map介绍完了,它们分别利用了相应数据结构的特点来实现特殊的目的,像HashMap利用哈希表的快速插入.查找实现O(1)的增删改查,TreeMap则利用了红黑树来保证key的有 ...
- 基于Equinox构建OSGi项目
几种OSGi框架 Several independently implemented OSGi frameworks exist today, including four that are avai ...
- Java并发编程:CopyOnWrite容器的实现
Java并发编程:并发容器之CopyOnWriteArrayList(转载) 原文链接: http://ifeve.com/java-copy-on-write/ Copy-On-Write简称COW ...
- Android中自定义Activity和Dialog的位置大小背景和透明度等
1.自定义Activity显示样式 先在res/values下建colors.xml文件,写入: view plainprint? 1. <?xml version="1.0" ...
- ipcs, ipcrm 命令
ipcs命令 是linux/uinx上提供关于一些进程间通信方式的信息,包括共享内存,消息队列,信号 ipcs用法 ipcs -a 是默认的输出信息 打印出当前系统中所有的进程间通信方式的信息 ip ...
- String,StringBuffer与StringBuilder的差别??
String 字符串常量StringBuffer 字符串变量(线程安全)StringBuilder 字符串变量(非线程安全) 简要的说, String 类型和 StringBuffer 类型的主要性能 ...