Classification
kNN1
# -*- coding: utf-8 -*-
"""
kNN: k-nearest-neighbour classification on a tiny hand-made data set.

Four labelled points (two in group 'A', two in group 'B') are used to
classify a fifth, unlabelled point by Euclidean distance and majority vote.

The data, the helper functions and the computed results are available on
import; plotting and printing only happen when the file is run as a script,
so other modules can ``from Step01_kNN import data_set`` without opening a
plot window.
"""
import numpy as np  # n-dimensional arrays, linear-algebra operations

# 2. Data and function definitions
p1 = [1.2, 1.1]  # group A
p2 = [1.0, 1.0]
p3 = [1.8, 0.8]  # group B
p4 = [2, 0.9]
category = ['A', 'A', 'B', 'B']  # class labels of the known points (Y variable)
p5 = [1.6, 0.85]  # point to classify


def data_set():
    """Return the sample data as numpy arrays.

    Returns:
        A tuple ``(know_group, not_know_group, class_category)``: the known
        points built from p1..p4, the point to classify built from p5, and
        the class labels of the known points.
    """
    know_group = np.array([p1, p2, p3, p4])  # known group
    not_know_group = np.array(p5)  # unknown group
    class_category = np.array(category)  # answers (class labels)
    return know_group, not_know_group, class_category


def classfy(know, not_know, cate, k):
    """Count the class labels of the ``k`` known points nearest to ``not_know``.

    Args:
        know: array of known points, one point per row.
        not_know: the point to classify (broadcast against every row).
        cate: class labels, indexable by the row positions of ``know``.
        k: number of nearest neighbours to count.

    Returns:
        dict mapping each label found among the k nearest points to the
        number of times it occurs.

    Raises:
        ValueError: if ``k`` is smaller than 1 or larger than the number of
            known points.
    """
    # 1. Euclidean distance: difference -> square -> row sum -> square root
    diff = know - not_know
    distance = np.sqrt((diff ** 2).sum(axis=1))

    if not 1 <= k <= len(distance):
        raise ValueError(
            "k must be between 1 and %d, got %r" % (len(distance), k)
        )

    # 2. Row indices ordered by ascending distance, e.g. [2 3 0 1]
    sortDist = distance.argsort()

    # 3. Tally the labels of the k nearest neighbours
    class_result = {}
    for idx in sortDist[:k]:
        key = cate[idx]
        class_result[key] = class_result.get(key, 0) + 1
    return class_result


def class_vote(class_result):
    """Return the label with the highest count in ``class_result``.

    Raises:
        ValueError: if ``class_result`` is empty.
    """
    if not class_result:
        raise ValueError("class_result is empty: nothing to vote on")
    return max(class_result, key=class_result.get)


know_group, not_know_group, class_category = data_set()

# Step-by-step distance: difference(-) -> square(**) -> sum -> sqrt
diff = know_group - not_know_group  # 2-D minus 1-D (broadcast)
sq_diff = diff ** 2
sq_sum = sq_diff.sum(axis=1)  # row-wise sum
distance = np.sqrt(sq_sum)

class_result = classfy(know_group, not_know_group, class_category, 3)
vote_result = class_vote(class_result)


if __name__ == "__main__":
    # Imported here so that importing this module does not require a
    # plotting backend.
    import matplotlib.pyplot as plt

    # 1. Scatter plot of the two known groups and the point to classify
    plt.scatter(1.2, 1.1)  # group A
    plt.scatter(1.0, 1.0)
    plt.scatter(1.8, 0.8)  # group B
    plt.scatter(2, 0.9)
    plt.scatter(1.6, 0.85, color='r')  # point to classify (unknown group)
    plt.show()

    print('알려진 집단')
    # [[1.2 1.1]
    #  [1.  1. ]
    #  [1.8 0.8]
    #  [2.  0.9]]
    print(know_group)
    print('알려지지 않은 집단')
    print(not_know_group)  # [1.6 0.85]
    print('정답')
    print(class_category)  # ['A' 'A' 'B' 'B']

    print('차=\n', diff)
    # 차=
    #  [[-0.4   0.25]
    #  [-0.6   0.15]
    #  [ 0.2  -0.05]
    #  [ 0.4   0.05]]
    print(sq_sum)  # [0.2225 0.3825 0.0425 0.1625]
    print(distance)  # [0.47169906 0.61846584 0.20615528 0.40311289]
    # Distance ranks are [3 4 1 2]; with k=3, B (2 votes) beats A (1 vote)
    print(class_category)  # ['A' 'A' 'B' 'B']

    print(class_result)  # {'B': 2, 'A': 1}
    print("분류결과=", vote_result)  # 분류결과= B
kNN Class
# -*- coding: utf-8 -*-
"""
Class-based implementation of the kNN classifier.
"""
import numpy as np


class kNNclassify:
    """k-nearest-neighbour classifier that keeps the latest neighbour tally.

    ``classfy`` stores the label counts of the k nearest known points in
    ``self.class_result``; ``class_vote`` returns the most frequent label.
    """

    def __init__(self):
        # Label -> count for the most recent classfy() call; empty until
        # classfy() has been run.
        self.class_result = {}

    def classfy(self, know, not_know, cate, k):
        """Count the class labels of the ``k`` known points nearest to ``not_know``.

        Args:
            know: array of known points, one point per row.
            not_know: the point to classify (broadcast against every row).
            cate: class labels, indexable by the row positions of ``know``.
            k: number of nearest neighbours to count.

        Returns:
            The label -> count dict, which is also stored in
            ``self.class_result``.

        Raises:
            ValueError: if ``k`` is smaller than 1 or larger than the number
                of known points.
        """
        # 1. Euclidean distance: difference -> square -> row sum -> square root
        diff = know - not_know
        distance = np.sqrt((diff ** 2).sum(axis=1))

        if not 1 <= k <= len(distance):
            raise ValueError(
                "k must be between 1 and %d, got %r" % (len(distance), k)
            )

        # 2. Row indices ordered by ascending distance, e.g. [2 3 0 1]
        sortDist = distance.argsort()

        # 3. Tally the labels of the k nearest neighbours
        counts = {}
        for idx in sortDist[:k]:
            key = cate[idx]
            counts[key] = counts.get(key, 0) + 1
        self.class_result = counts
        return counts

    def class_vote(self):
        """Return the label with the highest count from the last ``classfy`` call.

        Raises:
            ValueError: if ``classfy`` has not produced a result yet.
        """
        # getattr keeps this safe even for instances created without __init__.
        result = getattr(self, "class_result", None)
        if not result:
            raise ValueError("no classification result: call classfy() first")
        return max(result, key=result.get)


if __name__ == "__main__":
    from Step01_kNN import data_set

    know_group, not_know_group, class_category = data_set()

    # Create the classifier object
    obj = kNNclassify()

    # object.member: self.class_result
    obj.classfy(know_group, not_know_group, class_category, 3)

    vote_result = obj.class_vote()
    print('kNN 분류결과=', vote_result)  # kNN 분류결과= B
NB
# -*- coding: utf-8 -*-
"""
Statistical classifier - Gaussian Naive Bayes (NB) on the iris data set.
"""
import pandas as pd
from sklearn import model_selection  # train/test split
from sklearn.naive_bayes import GaussianNB

# 1. Load the data
iris = pd.read_csv("../data/iris.csv")
print(iris.head())
#    Sepal.Length  Sepal.Width  Petal.Length  Petal.Width Species
# 0           5.1          3.5           1.4          0.2  setosa
# 1           4.9          3.0           1.4          0.2  setosa
# 2           4.7          3.2           1.3          0.2  setosa
# 3           4.6          3.1           1.5          0.2  setosa
# 4           5.0          3.6           1.4          0.2  setosa

# 2. Select the x, y variables
cols = list(iris.columns)
x_cols = cols[:4]  # X: columns 1~4 (continuous)
y_cols = cols[-1]  # y: column 5 (categorical)

# 3. train/test split
iris_df = iris
print(iris_df.shape)  # (150, 5)
train_iris, test_iris = model_selection.train_test_split(
    iris_df, test_size=0.3, random_state=123)
print(train_iris.shape)  # (105, 5)
print(test_iris.shape)  # (45, 5)

# 4. Fit the model on the train set
obj = GaussianNB()  # object
model = obj.fit(train_iris[x_cols], train_iris[y_cols])

# 5. Evaluate the model on the test set
pred = model.predict(test_iris[x_cols])  # predicted Y
Y = test_iris[y_cols]  # answers

# Confusion matrix: rows are predictions, columns are the true labels
matrix = pd.crosstab(pred, Y)
print(matrix)
# Species     setosa  versicolor  virginica
# row_0
# setosa          18           0          0
# versicolor       0          10          2
# virginica        0           0         15

# DataFrame.ix was removed in pandas 1.0; .iloc is the positional accessor.
# NOTE(review): summing the diagonal by position assumes all three classes
# appear in both the predictions and the answers - confirm for other splits.
acc = (matrix.iloc[0, 0] + matrix.iloc[1, 1] + matrix.iloc[2, 2]) / len(Y)
print('분류정확도=', acc)  # 분류정확도= 0.9555555555555556
SVM
# -*- coding: utf-8 -*-
"""
SVM model on the iris data set.
"""
import pandas as pd
from sklearn import model_selection  # train/test split
from sklearn import svm  # model

# 1. Load the data
iris = pd.read_csv("../data/iris.csv")
print(iris.head())
#    Sepal.Length  Sepal.Width  Petal.Length  Petal.Width Species
# 0           5.1          3.5           1.4          0.2  setosa
# 1           4.9          3.0           1.4          0.2  setosa
# 2           4.7          3.2           1.3          0.2  setosa
# 3           4.6          3.1           1.5          0.2  setosa
# 4           5.0          3.6           1.4          0.2  setosa

# 2. Select the x, y variables
cols = list(iris.columns)
x_cols = cols[:4]  # X: columns 1~4 (continuous)
y_cols = cols[-1]  # y: column 5 (categorical)

# 3. train/test split
iris_df = iris
print(iris_df.shape)  # (150, 5)
train_iris, test_iris = model_selection.train_test_split(
    iris_df, test_size=0.3, random_state=123)
print(train_iris.shape)  # (105, 5)
print(test_iris.shape)  # (45, 5)

# 4. Model - SVM
obj = svm.SVC()
model = obj.fit(train_iris[x_cols], train_iris[y_cols])

# 5. Evaluate the model on the test set
pred = model.predict(test_iris[x_cols])
Y = test_iris[y_cols]

# Confusion matrix: rows are predictions, columns are the true labels
matrix = pd.crosstab(pred, Y)
print(matrix)
# Species     setosa  versicolor  virginica
# row_0
# setosa          18           0          0
# versicolor       0          10          1
# virginica        0           0         16

# DataFrame.ix was removed in pandas 1.0; .iloc is the positional accessor.
# NOTE(review): summing the diagonal by position assumes all three classes
# appear in both the predictions and the answers - confirm for other splits.
acc = (matrix.iloc[0, 0] + matrix.iloc[1, 1] + matrix.iloc[2, 2]) / len(Y)
print('분류정확도=', acc)  # 분류정확도= 0.9777777777777777
spam_train_test
# -*- coding: utf-8 -*-
"""
NB vs SVM
 - data set : uses a sparse matrix
 - file name: ../data/spam_tran_test.npy
"""
from sklearn.naive_bayes import GaussianNB
from sklearn import svm
import numpy as np
import pandas as pd

# 1. File loading
# The file unpacks into four objects, so it has to be unpickled; NumPy >=
# 1.16.3 refuses to do that unless allow_pickle=True is passed.
# NOTE: unpickling can execute arbitrary code - only load trusted files.
X_train, X_test, y_train, y_test = np.load(
    "../data/spam_tran_test.npy", allow_pickle=True)
print(X_train.shape)  # (3901, 4000)
print(X_test.shape)  # (1673, 4000)
print(type(y_train))  # <class 'list'>

# list -> numpy array, so that array operations can be applied
y_train = np.array(y_train)
y_test = np.array(y_test)
print(type(y_train))  # <class 'numpy.ndarray'>

# 2. NB model
obj = GaussianNB()
nb_model = obj.fit(X_train, y_train)

pred = nb_model.predict(X_test)
Y = y_test

# Confusion matrix: rows are predictions, columns are the true labels
matrix = pd.crosstab(pred, Y)
print("nb matrix\n", matrix)
# col_0  0(ham)  1(spam)
# row_0
# 0        1264       28
# 1         167      214

# DataFrame.ix was removed in pandas 1.0; .iloc is the positional accessor.
# 1) accuracy: correct predictions / all predictions
acc = (matrix.iloc[0, 0] + matrix.iloc[1, 1]) / len(Y)
print("NB acc=", acc)  # NB acc= 0.8834429169157203

# 2) precision: predicted yes -> actually yes
precision = matrix.iloc[1, 1] / (matrix.iloc[1, 0] + matrix.iloc[1, 1])
print("정확률=", precision)  # 정확률= 0.5616797900262467

# 3) recall: actually yes -> predicted yes
recall = matrix.iloc[1, 1] / (matrix.iloc[0, 1] + matrix.iloc[1, 1])
print("재현률=", recall)  # 재현률= 0.8842975206611571

# 4) f1 score: harmonic mean of precision and recall
f1_score = 2 * (precision * recall) / (precision + recall)
print('f1_score=', f1_score)  # f1_score= 0.6869983948635634

# 3. SVM model
svm_obj = svm.SVC(kernel='linear')  # kernel function
svm_model = svm_obj.fit(X_train, y_train)

svm_pred = svm_model.predict(X_test)
svm_Y = y_test

svm_matrix = pd.crosstab(svm_pred, svm_Y)
print("svm matrix\n", svm_matrix)
# svm matrix
# col_0     0    1
# row_0
# 0      1428   36
# 1         3  206

svm_acc = (svm_matrix.iloc[0, 0] + svm_matrix.iloc[1, 1]) / len(svm_Y)
print("svm acc=", svm_acc)  # svm acc= 0.976688583383144
sms_spam_data
# -*- coding: utf-8 -*-
"""
Created on Sat Feb 23 15:52:23 2019

@author: 502-03

NB vs SVM on the SMS spam data set (../data/sms_spam_data.npy).
"""
from sklearn.naive_bayes import GaussianNB
from sklearn import svm
import numpy as np
import pandas as pd

# 1. File loading
# The file unpacks into four objects (y_train is a pandas Series), so it has
# to be unpickled; NumPy >= 1.16.3 refuses to do that unless
# allow_pickle=True is passed.
# NOTE: unpickling can execute arbitrary code - only load trusted files.
X_train, X_test, y_train, y_test = np.load(
    "../data/sms_spam_data.npy", allow_pickle=True)
print(X_train.shape)  # (4446, 6000)
print(X_test.shape)  # (1112, 6000)
print(type(y_train))  # <class 'pandas.core.series.Series'>

# 2. NB model
obj = GaussianNB()
nb_model = obj.fit(X_train, y_train)

nb_pred = nb_model.predict(X_test)
nb_Y = y_test

# Confusion matrix: rows are predictions, columns are the true labels
nb_tab = pd.crosstab(nb_pred, nb_Y)
print("nb_tab=\n", nb_tab)
# nb_tab=
# type   ham  spam
# row_0
# ham    812    10
# spam   156   134

# DataFrame.ix was removed in pandas 1.0; .iloc is the positional accessor.
nb_acc = (nb_tab.iloc[0, 0] + nb_tab.iloc[1, 1]) / len(nb_Y)
print("nb_acc=", nb_acc)  # nb_acc= 0.8507194244604317

# 3. SVM model
obj = svm.SVC(kernel='linear')
svc_model = obj.fit(X_train, y_train)

svc_pred = svc_model.predict(X_test)
svc_Y = y_test

svc_tab = pd.crosstab(svc_pred, svc_Y)
print("svc_tab=\n", svc_tab)
# svc_tab=
# type   ham  spam
# row_0
# ham    964    20
# spam     4   124

svc_acc = (svc_tab.iloc[0, 0] + svc_tab.iloc[1, 1]) / len(svc_Y)
print("svc_acc=", svc_acc)  # svc_acc= 0.9784172661870504

# precision: predicted spam -> actually spam
precision = svc_tab.iloc[1, 1] / (svc_tab.iloc[1, 0] + svc_tab.iloc[1, 1])
print("정확률", precision)  # 정확률 0.96875

# recall: actually spam -> predicted spam
recall = svc_tab.iloc[1, 1] / (svc_tab.iloc[0, 1] + svc_tab.iloc[1, 1])
print("재현률", recall)  # 재현률 0.8611111111111112

# f1 score: harmonic mean of precision and recall
f1_score = 2 * (precision * recall) / (precision + recall)
print("f1_score", f1_score)  # f1_score 0.911764705882353
Classification的更多相关文章
- W3School-CSS 分类 (Classification) 实例
CSS 分类 (Classification) 实例 CSS 实例 CSS 背景实例 CSS 文本实例 CSS 字体(font)实例 CSS 边框(border)实例 CSS 外边距 (margin) ...
- Large Margin DAGs for Multiclass Classification
Abstract We present a new learning architecture: the Decision Directed Acyclic Graph (DDAG), which i ...
- 《ImageNet Classification with Deep Convolutional Neural Networks》 剖析
《ImageNet Classification with Deep Convolutional Neural Networks》 剖析 CNN 领域的经典之作, 作者训练了一个面向数量为 ...
- 自然语言23_Text Classification with NLTK
QQ:231469242 欢迎喜欢nltk朋友交流 https://www.pythonprogramming.net/text-classification-nltk-tutorial/?compl ...
- MATLAB 图像分类 Image Category Classification Using Bag of Features
使用MATLAB实现图像的识别,这是MATLAB官网上面的例子,学习一下. http://cn.mathworks.com/help/vision/examples/image-category-cl ...
- Galaxy Classification
10.3 Data Preparation After removing a large number of the columns from the raw SDSS dataset, introd ...
- Kaiju: Fast and sensitive taxonomic classification for metagenomics
Kaiju: Fast and sensitive taxonomic classification for metagenomics 问题描述:However, nucleotide comp ...
- 《Automatic Face Classification of Cushing’s Syndrome in Women – A Novel Screening Approach》学习笔记
《针对女性库欣综合征患者的自动面部分类-一种新颖的筛查方法》 Abstract 目的:库兴氏综合征对身体造成相当大的伤害如果不及时治疗,还经常是诊断的时间太长.在这项研究中,我们旨在测试面 ...
- [CS231n-CNN] Image classification and the data-driven approach, k-nearest neighbor, Linear classification I
课程主页:http://cs231n.stanford.edu/ Task: Challenges: _________________________________________________ ...
- [ML] Naive Bayes for Text Classification
TF-IDF Algorithm From http://www.ruanyifeng.com/blog/2013/03/tf-idf.html Chapter 1, 知道了"词频" ...
随机推荐
- kubernetes 报错汇总
一. pod的报错: 1. pod的容器无法启动报错: 报错信息: Normal SandboxChanged 4m9s (x12 over 5m18s) kubelet, k8sn1 Pod san ...
- 【UOJ386】【UNR #3】鸽子固定器 链表
题目描述 有 \(n\) 个物品,每个物品有两个属性:权值 \(v\) 和大小 \(s\). 你要选出 \(m\) 个物品,使得你选出的物品的权值的和的 \(d_v\) 次方减掉大小的极差的 \(d_ ...
- Thinkphp5.1 ORM UML
Thinkphp5.1 ORM UML think-orm
- ADO.NET中的五大内置对象
ADO.NET中的五大内置对象 学习链接:https://blog.csdn.net/wxr15732623310/article/details/51828677
- Mycat的读写分离
1. Mycat实现读写分离的部署: https://www.cnblogs.com/softidea/p/5447566.html springboot动态数据源的原理以及配置: Spring内置了 ...
- 剑指Offer_编程题_24
题目描述 输入一颗二叉树和一个整数,打印出二叉树中结点值的和为输入整数的所有路径.路径定义为从树的根结点开始往下一直到叶结点所经过的结点形成一条路径. /* struct TreeNode { int ...
- 第二节: 比较EF的Lambda查询和Linq查询写法的区别
简介 在前面EF的介绍中,曾多次提到过EF可以使用Lambda和Linq来完成对数据库的访问,这两种的语法的具体使用和注意事项在前面的DotNet进阶的系列章节中已经详细介绍过了,本次借着EF章节,重 ...
- css中的数学表达式calc()
前言 数学表达式calc()是CSS中的函数,主要用于数学运算.使用calc()为页面元素布局提供了便利和新的思路. 概念 数学表达式calc()是calculate计算的缩写,它允许使用+.-.*. ...
- JDBC Template
JDBC Template 1. Spring JDBC Spring框架对JDBC的简单封装,提供了一个JDBCTemplate对象用来简化JDBC的开发 步骤: 导入jar包 创建JDBCTemp ...
- 淘宝网站上的 HTTP 缓存问题两则
在阅读本文前推荐你先阅读我的前两篇文章< 扼杀 304,Cache-Control: immutable>和<关于缓存和 Chrome 的“新版刷新”>:下面要说的两个问题是在 ...