机器学习之python: kNN
##################################################
# kNN : k Nearest Neighbour
# Author : Monne
# Date : 2015-01-24
# Email : 416606639@qq.com
##################################################
import numpy as np
import time
# Wall-clock timestamp taken at import time; the "time cost" prints in
# datingwebsite()/handwriting() report elapsed time relative to this.
starttime = time.time()

# NOTE(review): a large module-level triple-quoted string of dead code was
# removed here. It held an earlier step-by-step implementation (distance,
# disttest, sort, majorcount, kNN) that the author had already disabled with
# the remark "too long, equal to classify()" -- all of it is superseded by
# the vectorized classify() below.
def testsample():
    """Return a tiny hard-coded 2-D dataset for sanity-checking kNN.

    Returns (trainx, trainy): four 2-D points and their class labels,
    two labelled 'A' and two labelled 'B'.
    """
    features = [[1.0, 1.1],
                [1.0, 1.0],
                [0, 0],
                [0, 0.1]]
    labels = ['A'] * 2 + ['B'] * 2
    return features, labels
def txt2trainxy(filename):
    """Load a whitespace-separated dataset from '<filename>.txt'.

    Each line holds feature values followed by an integer class label,
    e.g. '40920 8.32 0.95 3'.

    Returns (trainx, trainy): list of float feature rows, list of int labels.
    """
    trainx = []
    trainy = []
    # 'with' guarantees the file handle is closed (the original leaked it)
    with open(filename + '.txt') as fr:
        for line in fr:
            fields = line.split()
            if not fields:          # skip blank lines instead of crashing
                continue
            trainx.append([float(v) for v in fields[:-1]])
            trainy.append(int(fields[-1]))  # last column is the class label
    return trainx, trainy
def img2trainxy(filename):
    """Load a digit-image dataset from directory `filename`.

    Each file is named '<label>_<index>.txt' (e.g. '0_2.txt') and contains a
    grid of '0'/'1' characters, one row per line.

    Returns (trainx, trainy): flattened 0/1 int vectors and int labels.
    """
    from os import listdir
    trainx = []
    trainy = []
    # sorted() makes the sample order deterministic (listdir order is not)
    for name in sorted(listdir(filename)):
        trainy.append(int(name[0]))          # leading char encodes the label
        bits = []
        # 'with' closes every file (the original leaked one handle per file)
        with open(filename + '/' + name) as fr:
            for line in fr:                  # line like '001100\n'
                bits.extend(line.strip())    # -> ['0','0','1','1','0','0']
        trainx.append([int(b) for b in bits])
    return trainx, trainy
def norm(trainx):
    """Min-max normalize each column of trainx to [0, 1].

    Returns (ntrainx, colmin, colrange):
      ntrainx  - normalized data as a list of lists
      colmin   - per-column minima (numpy array), to rescale new samples
      colrange - per-column (max - min) as a list of floats
    """
    data = np.asarray(trainx, dtype=float)
    colmin = data.min(0)              # min(0) == min(axis=0): column minima
    colrange = data.max(0) - colmin
    # NOTE(review): a constant column makes colrange 0 -> division by zero,
    # matching the original behaviour; callers must avoid constant columns.
    ntrainx = (data - colmin) / colrange
    return ntrainx.tolist(), colmin, [float(d) for d in colrange]
def classify(testx, trainx, trainy, k):
    """Classify testx by majority vote among its k nearest training samples.

    testx  - one feature vector
    trainx - list/array of training feature vectors
    trainy - labels aligned with trainx
    k      - number of neighbours to vote (clamped to len(trainx))

    Returns the winning label.
    """
    diff = np.asarray(trainx) - np.asarray(testx)  # broadcast over all rows
    dist = (diff ** 2).sum(axis=1) ** 0.5          # Euclidean distances
    order = dist.argsort()                         # indices, nearest first
    votes = {}
    for i in range(min(k, len(order))):            # clamp k to dataset size
        label = trainy[order[i]]
        votes[label] = votes.get(label, 0) + 1
    # label with the most votes (ties broken arbitrarily, as before);
    # .items() replaces py2-only .iteritems()
    return sorted(votes.items(), key=lambda kv: kv[1], reverse=True)[0][0]
def testkNN(testratio, trainx, trainy, k):
    """Hold-out test: the first testratio fraction of the data is the test
    set, the remainder the training set.

    Prints and returns the error rate (returning it is new but backward
    compatible -- previous callers ignored the None return).
    """
    l = int(len(trainx) * testratio)
    if l == 0:
        # nothing to test (original raised ZeroDivisionError here)
        print("the total error rate is: 0.000000.")
        return 0.0
    errorcount = 0
    for i in range(l):
        c = classify(trainx[i], trainx[l:], trainy[l:], k)
        if c != trainy[i]:
            errorcount += 1
    rate = errorcount / float(l)
    print("the total error rate is: %f." % rate)
    return rate
def randomtestkNN(testratio, trainx, trainy, k):
    """Randomly hold out a testratio fraction of the data and measure the
    kNN error rate on it; the rest is the training set.

    Prints and returns the error rate.
    """
    import random
    m = len(trainx)
    # BUG FIX(review): was hard-coded to int(m * 0.1), silently ignoring
    # the testratio parameter.
    l = int(m * testratio)
    if l == 0:
        print("the total error rate is: 0.000000.")
        return 0.0
    test_ids = random.sample(range(m), l)            # l distinct indices
    train_ids = list(set(range(m)) - set(test_ids))  # the complement
    testx = [trainx[i] for i in test_ids]
    testy = [trainy[i] for i in test_ids]
    trainx = [trainx[i] for i in train_ids]
    trainy = [trainy[i] for i in train_ids]
    errorcount = 0
    for i in range(l):
        c = classify(testx[i], trainx, trainy, k)
        if c != testy[i]:
            errorcount += 1
    rate = errorcount / float(l)
    print("the total error rate is: %f." % rate)
    return rate
def avg():
    """Run handwriting() for k = 1..9 and report the error rates.

    Prints the rates and their argsort (first index = best k minus 1).
    Author's note: k = 4 gave the minimum observed error (0.03).
    """
    rates = []
    for k in range(1, 10):
        rates.append(handwriting('trainingDigits', 'testDigits', k))
    rates = np.array(rates)
    print(rates)
    print(rates.argsort())   # ascending: position 0 holds the best k - 1
def sample(k):
    """Sanity-check kNN on the tiny hard-coded dataset from testsample().

    BUG FIX(review): testkNN() was called without its leading testratio
    argument, which raised a TypeError. 0.25 holds out one of the four
    samples as the test set.
    """
    trainx, trainy = testsample()
    testkNN(0.25, trainx, trainy, k)
def datingwebsite(filename, k):
    """End-to-end kNN demo on the dating-website dataset.

    filename - base name of the data file, a string like 'datingTestSet2'
               (txt2trainxy appends '.txt')
    k        - number of neighbours

    Loads and normalizes the data, measures the error rate on a random
    10% hold-out, then interactively classifies one user-supplied sample.
    """
    ## step 1: load data
    print("step 1: load data...")
    trainx, trainy = txt2trainxy(filename)
    trainx, colmin, colrange = norm(trainx)
    ## step 2: training -- kNN is lazy, so there is nothing to train
    print("step 2: training...")
    ## step 3: testing
    print("step 3: testing...")
    randomtestkNN(0.10, trainx, trainy, k)
    print("time cost: %s" % (time.time() - starttime))
    ## step 4: show the result
    print("step 4: show the result...")
    resultList = ['not at all', 'in small doses', 'in large doses']
    try:
        read = raw_input       # Python 2
    except NameError:
        read = input           # Python 3 (raw_input was removed)
    percentTats = float(read(
        "percentage of time spent playing video games?> "))
    ffMiles = float(read("frequent flier miles earned per year?> "))
    iceCream = float(read("liters of ice cream consumed per year?> "))
    # scale the new sample with the same min/range as the training data
    classx = (np.array([ffMiles, percentTats, iceCream]) - colmin) / colrange
    classy = classify(classx, trainx, trainy, k)
    # assumes labels are 1..3 so classy - 1 indexes resultList -- matches
    # the dataset loaded via txt2trainxy (int labels); verify for new data
    print("You will probably like this person: %s" % resultList[classy - 1])
    # NOTE(review): a stray 'return (errorcount / float(l))' referencing
    # undefined names (a scraping artifact) was removed from this function.
def handwriting(trainfile, testfile, k):
    """Digit-recognition demo: train on images in directory `trainfile`,
    test on every image in directory `testfile`.

    A random-sized random subset of the training images is used (this is
    the original author's experiment, kept as-is).

    Prints progress and returns the error rate on the test set.
    """
    ## step 1: load data
    print("step 1: load data...")
    print("---Getting training set...")
    trainx, trainy = img2trainxy(trainfile)
    print("---Getting testing set...")   # typo 'Geting' fixed
    testx, testy = img2trainxy(testfile)
    m = len(trainx)
    print(m, len(trainx[0]))
    print(len(testx), len(testx[0]))
    # randomly subsample the training data (the subset size is itself random)
    print("---Random choosing the training data...")
    import random
    # BUG FIX(review): lower bound was 0, which could select an empty
    # training set and crash classify()
    n = random.randint(1, m - 1)
    chosen = random.sample(range(m), n)
    trainx = [trainx[i] for i in chosen]
    trainy = [trainy[i] for i in chosen]
    print("---the numbers of training data is: %s" % n)
    ## step 2: training -- kNN is lazy, so there is nothing to train
    print("step 2: training...")
    ## step 3: testing
    print("step 3: testing...")
    l = len(testx)
    errorcount = 0
    for i in range(l):
        c = classify(testx[i], trainx, trainy, k)
        if c != testy[i]:
            errorcount += 1
    rate = errorcount / float(l)
    print("the total error rate is: %f." % rate)
    print("time cost: %s" % (time.time() - starttime))
    ## step 4: show the result
    print("step 4: show the result...")
    return rate


if __name__ == "__main__":
    # guard keeps importing this module from running the experiment
    # datingwebsite('datingTestSet2', 4)
    handwriting('trainingDigits', 'testDigits', 3)
    # avg()
机器学习之python: kNN的更多相关文章
- 机器学习之路--KNN算法
机器学习实战之kNN算法 机器学习实战这本书是基于python的,如果我们想要完成python开发,那么python的开发环境必不可少: (1)python3.52,64位,这是我用的python ...
- 可能是史上最全的机器学习和Python(包括数学)速查表
新手学习机器学习很难,就是收集资料也很费劲.所幸Robbie Allen从不同来源收集了目前最全的有关机器学习.Python和相关数学知识的速查表大全.强烈建议收藏! 机器学习有很多方面. 当我开始刷 ...
- 基于Python的机器学习实战:KNN
1.KNN原理: 存在一个样本数据集合,也称作训练样本集,并且样本集中每个数据都存在标签,即我们知道样本集中每一个数据与所属分类的对应关系.输入没有标签的新数据后,将新数据的每个特征与样本集中数据对应 ...
- 吴裕雄--天生自然python机器学习实战:K-NN算法约会网站好友喜好预测以及手写数字预测分类实验
实验设备与软件环境 硬件环境:内存ddr3 4G及以上的x86架构主机一部 系统环境:windows 软件环境:Anaconda2(64位),python3.5,jupyter 内核版本:window ...
- python机器学习一:KNN算法实现
所谓的KNN算法,或者说K最近邻(kNN,k-NearestNeighbor)分类算法是数据挖掘分类技术中最简单的方法之一.所谓K最近邻,就是k个最近的邻居的意思,说的是每个样本都可以用它最接近的k个 ...
- 机器学习常用Python扩展包
在Ubuntu下安装Python模块通常有3种方法:1)使用apt-get:2)使用pip命令(推荐);3)easy_instal 可安装方法参考:[转]linux和windows下安装python集 ...
- Python KNN算法
机器学习新手,接触的是<机器学习实战>这本书,感觉书中描述简单易懂,但对于python语言不熟悉的我,也有很大的空间.今天学习的是k-近邻算法. 1. 简述机器学习 在日常生活中,人们很难 ...
- 大数据分析与机器学习领域Python兵器谱
http://www.thebigdata.cn/JieJueFangAn/13317.html 曾经因为NLTK的缘故开始学习Python,之后渐渐成为我工作中的第一辅助脚本语言,虽然开发语言是C/ ...
- 算法代码[置顶] 机器学习实战之KNN算法详解
改章节笔者在深圳喝咖啡的时候突然想到的...之前就有想写几篇关于算法代码的文章,所以回家到以后就奋笔疾书的写出来发表了 前一段时间介绍了Kmeans聚类,而KNN这个算法刚好是聚类以后经常使用的匹配技 ...
随机推荐
- 【HDOJ】1316 How Many Fibs?
Java水了. import java.util.Scanner; import java.math.BigInteger; public class Main { public static voi ...
- CSDN总结的面试中的十大可视化工具
1. D3.js 基于JavaScript的数据可视化库,允许绑定任意数据到DOM,然后将数据驱动转换应用到Document中. 2. Data.js Data.js是一个JavaScript数据表示 ...
- Linux环境下使用图形化界面的SVN客户端软件-RabbitVCS
如果想在Linux环境下使用图形化界面的SVN客户端软件,那么RabbitVCS绝对是首选,可以媲美Windows环境下用的TortoiseSVN,甚至连操作都基本一样,所以强烈推荐给各位童鞋. Ra ...
- Hbase 计数器
Hbase计数器可以用于统计用户数,点击量等信息 基本操作 可以使用incr操作计数器,incr语法格式如下: incr '<table>', '<row>', '<co ...
- oracle锁表解决方法
SQL> select session_id from v$locked_object; SESSION_ID----------142 SQL> SELECT sid, serial#, ...
- QFII
QFII(Qualified Foreign Institutional Investors)合格的境外机构投资者的英文简称,中文“酋匪”,QFII机制是指外国专业投资机构到境内投资的资格认定制度. ...
- SKTextureAtlas类
继承自 NSObject 符合 NSCodingNSObject(NSObject) 框架 /System/Library/Frameworks/SpriteKit.framework 可用性 可用 ...
- [置顶] 我的Android进阶之旅------>介绍一款集录制与剪辑为一体的屏幕GIF 动画制作工具 GifCam
由于上一篇文章:我的Android进阶之旅------>Android之动画之Frame Animation实例 中展示的是Frame动画效果,但是之前我是将图片截取下来,不好说明确切的动画过程 ...
- 如何为Myeclipse手工添加dtd支持
一.引言 在MyEclipse中开发三大框架的项目时候经常会写一些配置的xml文件,例如:Struts2的struts.xml和Hibernate的hibernate.cfg.xml.Spring的a ...
- Linux磁盘分区实战案例
一.查看新添加磁盘 [root@localhost /]# fdisk -l 磁盘 /dev/sda:53.7 GB, 53687091200 字节,104857600 个扇区 Units = ...