大数据挖掘算法篇之K-Means实例
一、引言
K-Means算法是聚类算法中,应用最为广泛的一种。本文基于欧几里得距离公式:d = sqrt((x1-x2)^2+(y1-y2)^2)计算二维向量间的距离,作为聚类划分的依据,输入数据为二维数据两列数据,输出结果为聚类中心和元素划分结果。输入数据格式如下:
0.0 0.0
1.0 0.0
0.0 1.0
2.0 1.0
1.0 2.0
2.0 2.0
2.0 0.0
0.0 2.0
7.0 6.0
7.0 7.0
7.0 8.0
8.0 6.0
8.0 7.0
8.0 8.0
8.0 9.0
9.0 7.0
9.0 8.0
9.0 9.0
二、欧几里得距离:
/****************************************************************************
* *
* KMEANS *
* *
*****************************************************************************/ #include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <conio.h>
#include <math.h> // FUNCTION PROTOTYPES // DEFINES
#define SUCCESS 1
#define FAILURE 0
#define TRUE 1
#define FALSE 0
#define MAXVECTDIM 20
#define MAXPATTERN 20
#define MAXCLUSTER 10

/// f2a -- format a double as fixed-point text with `width` digits after the
/// decimal point, prefixed by ' ' for non-negative values or '-' otherwise.
///
/// Rewritten from the original fcvt()-based code: fcvt() is non-standard and
/// deprecated, the original buffer had no size, and a pointer to a *local*
/// buffer was returned (dangling).  The buffer is now static, so each call
/// overwrites the previous result -- callers must copy if they need to keep it.
char *f2a(double x, int width){
   static char cbuf[64];
   // "% .*f": the space flag prints ' ' for non-negative numbers ('-' for
   // negative), and the '*' precision gives `width` fractional digits --
   // matching the shape of the original fcvt()-assembled output.
   snprintf(cbuf, sizeof cbuf, "% .*f", width, x);
   return cbuf;
}

// ***** Defined structures & classes *****
// One cluster: its centroid plus the list of patterns currently assigned to it.
struct aCluster {
   double Center[MAXVECTDIM];   // centroid, one coordinate per dimension
   int Member[MAXPATTERN];      // indices of patterns belonging to this cluster
   int NumMembers;              // number of valid entries in Member[]
};

struct aVector {
   double Center[MAXVECTDIM];
   int Size;
};

class System {
private:
   double Pattern[MAXPATTERN][MAXVECTDIM+1];  // input data, one pattern per row
   aCluster Cluster[MAXCLUSTER];
   int NumPatterns;             // number of patterns read from the input file
   int SizeVector;              // number of dimensions in each pattern vector
   int NumClusters;             // K -- number of clusters
   void DistributeSamples();    // Step 2 of K-means: assign patterns to clusters
   int CalcNewClustCenters();   // Step 3 of K-means: recompute centroids;
                                // returns TRUE when no center moved (converged)
   double EucNorm(int, int);    // squared Euclidean distance pattern<->center
   int FindClosestCluster(int); // ret indx of clust closest to pattern
                                // whose index is arg
public:
   void system();               // NOTE(review): lower-case 's', never defined or
                                // called -- presumably meant as a constructor;
                                // kept so the declared interface is unchanged
   int LoadPatterns(char *fname);  // read pattern data to be clustered
   void InitClusters();            // Step 1 of K-means: seed the K centers
   void RunKMeans();               // overall control of the K-means process
   void ShowClusters();            // show results on screen
   void SaveClusters(char *fname); // save results to file (unimplemented)
   void ShowCenters();             // print the current cluster centers
};
// Print the current cluster centers (first two coordinates) to stdout,
// then wait for a key press so the user can read the output.
void System::ShowCenters(){
   int i;
   printf("Cluster centers:\n");
   for (i=0; i<NumClusters; i++) {
      // NOTE(review): the original also assigned Cluster[i].Member[...]=i here,
      // a stray copy from InitClusters(); display code must not mutate cluster
      // membership, so that line was dropped.
      printf("ClusterCenter[%d]=(%f,%f)\n",i,Cluster[i].Center[0],Cluster[i].Center[1]);
   } /* endfor */
   printf("\n");
   getchar();
} // next: LoadPatterns -- reads the input data file
int System::LoadPatterns(char *fname)
{
   // Read the pattern data to be clustered from `fname`.  Expected layout:
   //   NumPatterns SizeVector NumClusters       (e.g. 18 2 2 for the sample)
   //   followed by NumPatterns rows of SizeVector doubles.
   // Returns SUCCESS, or FAILURE if the file cannot be opened.
   FILE *InFilePtr;
   int i,j;
   double x;
   if((InFilePtr = fopen(fname, "r")) == NULL)
      return FAILURE;
   fscanf(InFilePtr, "%d", &NumPatterns);  // number of patterns (18 in sample)
   fscanf(InFilePtr, "%d", &SizeVector);   // dimension of each vector (2)
   fscanf(InFilePtr, "%d", &NumClusters);  // K, number of clusters (2)
   for (i=0; i<NumPatterns; i++) {         // for each vector
      for (j=0; j<SizeVector; j++) {       // read every element
         fscanf(InFilePtr,"%lg",&x);
         Pattern[i][j]=x;
      } /* endfor */
   } /* endfor */
   fclose(InFilePtr);                      // original leaked the FILE handle
   // Echo all the patterns just read
   printf("Input patterns:\n");
   for (i=0; i<NumPatterns; i++) {
      printf("Pattern[%d]=(%2.3f,%2.3f)\n",i,Pattern[i][0],Pattern[i][1]);
   } /* endfor */
   printf("\n--------------------\n");
   getchar();
   return SUCCESS;
}
//***************************************************************************
// InitClusters                                                             *
// Arbitrarily assign a vector to each of the K clusters                    *
// We choose the first K vectors to do this                                 *
//***************************************************************************
// Step 1 of K-means: seed cluster i's center with pattern i.
void System::InitClusters(){
   int i,j;
   printf("Initial cluster centers:\n");
   for (i=0; i<NumClusters; i++) {
      Cluster[i].Member[0]=i;          // pattern i is the seed member of cluster i
      for (j=0; j<SizeVector; j++) {
         Cluster[i].Center[j]=Pattern[i][j];
      } /* endfor */
   } /* endfor */
   for (i=0; i<NumClusters; i++) {
      printf("ClusterCenter[%d]=(%f,%f)\n",i,Cluster[i].Center[0],Cluster[i].Center[1]);
   } /* endfor */
   printf("\n");
   getchar();
}
// Top-level K-means driver: alternate pattern assignment (step 2) and
// centroid recomputation (step 3) until no center moves.
void System::RunKMeans(){
   int converged;
   int pass;
   pass=0;
   converged=FALSE;
   // One pass = one full assign/update cycle over all patterns.
   while (converged==FALSE) {
      printf("PASS=%d\n",pass++);
      DistributeSamples();              // step 2: assign each pattern to a cluster
      converged=CalcNewClustCenters();  // step 3: recompute centers; TRUE if unchanged
      ShowCenters();
      getchar();
   } /* endwhile */
}
// Squared Euclidean distance between pattern p and cluster center c:
//   d^2 = sum_i (Center[i] - Pattern[i])^2
// In 2-D this is the familiar d = sqrt((x1-x2)^2 + (y1-y2)^2); summing over
// all dimensions brings every coordinate into the comparison.
// NOTE: returns the SQUARED distance; callers take sqrt() for display only.
// Also prints the expanded calculation, for teaching purposes.
double System::EucNorm(int p, int c){
   double dist,x;
   int i;
   char zout[256];   // human-readable expansion of the calculation
   strcpy(zout,"d=sqrt(");
   printf("The distance from pattern %d to cluster %d is calculated as:\n",p,c);
   dist=0;
   for (i=0; i<SizeVector ;i++){
      // One squared-difference term, both accumulated and pretty-printed.
      x=(Cluster[c].Center[i]-Pattern[p][i])*(Cluster[c].Center[i]-Pattern[p][i]);
      strcat(zout,f2a(x,4));
      if (i<SizeVector-1)   // '+' between terms (original hard-coded i==0 for 2-D)
         strcat(zout,"+");
      dist += x;            // reuse x instead of recomputing the product
   } /* endfor */
   printf("%s)\n",zout);
   return dist;
}
// Return the index of the cluster whose center is nearest (by squared
// Euclidean distance) to pattern `pat`.  Aborts the program if no cluster
// was examined (NumClusters <= 0).
int System::FindClosestCluster(int pat){
   int i, ClustID;
   double MinDist, d;
   MinDist =9.9e+99;   // sentinel larger than any real distance
   ClustID=-1;
   for (i=0; i<NumClusters; i++) {
      d=EucNorm(pat,i);
      printf("Distance from pattern %d to cluster %d is %f\n\n",pat,i,sqrt(d));
      if (d<MinDist) {
         MinDist=d;
         ClustID=i;
      } /* endif */
   } /* endfor */
   if (ClustID<0) {
      printf("Aaargh");
      exit(EXIT_FAILURE);
   } /* endif */
   return ClustID;
}
// Step 2 of K-means: clear all membership lists, then assign every pattern
// to the cluster whose center is closest.
void System::DistributeSamples(){
   int i,pat,Clustid,MemberIndex;
   // Clear membership list for all current clusters
   for (i=0; i<NumClusters;i++){
      Cluster[i].NumMembers=0;
   }
   for (pat=0; pat<NumPatterns; pat++) {
      // Find the cluster center to which this pattern is closest
      Clustid= FindClosestCluster(pat);
      printf("patern %d assigned to cluster %d\n\n",pat,Clustid);
      // Append this pattern to that cluster's membership list
      MemberIndex=Cluster[Clustid].NumMembers;
      Cluster[Clustid].Member[MemberIndex]=pat;
      Cluster[Clustid].NumMembers++;
   } /* endfor */
}
// Step 3 of K-means: recompute every cluster center as the mean of its member
// patterns.  Returns TRUE if no center moved (algorithm has converged), FALSE
// otherwise.  As a side effect, builds and prints the averaging formula as
// text, e.g. "Cluster Center0(1/3)( 0.000+ 0.000+ 0.000)".
int System::CalcNewClustCenters(){
   int ConvFlag,VectID,i,j,k;
   double tmp[MAXVECTDIM];
   char xs[512];   // textual formula for the x (k==0) coordinate
   char ys[512];   // textual formula for the y (k==1) coordinate
   char nc1[32];   // member count, as text
   char nc2[32];   // cluster index, as text
   ConvFlag=TRUE;
   printf("The new cluster centers are now calculated as:\n");
   for (i=0; i<NumClusters; i++) {           // for each cluster
      if (Cluster[i].NumMembers==0)          // guard: an empty cluster would
         continue;                           // divide by zero below
      // snprintf replaces the non-standard itoa() of the original
      snprintf(nc1, sizeof nc1, "%d", Cluster[i].NumMembers);
      snprintf(nc2, sizeof nc2, "%d", i);
      strcpy(xs,"Cluster Center");
      strcat(xs,nc2);
      strcat(xs,"(1/");
      strcpy(ys,"(1/");
      strcat(xs,nc1);
      strcat(ys,nc1);
      strcat(xs,")(");
      strcat(ys,")(");
      for (j=0; j<SizeVector; j++) {         // clear workspace
         tmp[j]=0.0;
      } /* endfor */
      for (j=0; j<Cluster[i].NumMembers; j++) { // traverse member vectors
         VectID=Cluster[i].Member[j];
         for (k=0; k<SizeVector; k++) {      // traverse elements of vector
            tmp[k] += Pattern[VectID][k];    // accumulate member pattern element
            if (k==0) {
               strcat(xs,f2a(Pattern[VectID][k],3));
            } else {
               strcat(ys,f2a(Pattern[VectID][k],3));
            } /* endif */
         } /* endfor */
         if(j<Cluster[i].NumMembers-1){      // '+' between members, ')' after last
            strcat(xs,"+");
            strcat(ys,"+");
         }
         else {
            strcat(xs,")");
            strcat(ys,")");
         }
      } /* endfor */
      for (k=0; k<SizeVector; k++) {         // mean = sum / member count
         tmp[k]=tmp[k]/Cluster[i].NumMembers;
         if (tmp[k] != Cluster[i].Center[k]) // any movement => not converged
            ConvFlag=FALSE;
         Cluster[i].Center[k]=tmp[k];
      } /* endfor */
      printf("%s,\n",xs);
      printf("%s\n",ys);
   } /* endfor */
   return ConvFlag;
}
// Print the final cluster centers (first two coordinates).
void System::ShowClusters(){
   int cl;
   for (cl=0; cl<NumClusters; cl++) {
      printf("\nCLUSTER %d ==>[%f,%f]\n", cl,Cluster[cl].Center[0],Cluster[cl].Center[1]);
   } /* endfor */
}

// Persist clustering results to `fname` -- not yet implemented.
void System::SaveClusters(char *fname){
}
四、主调程序
// Program entry point: load the sample data file, then run K-means end to end.
// Returns 0 on success, EXIT_FAILURE if the data file cannot be read.
int main(int argc, char *argv[])
{
   System kmeans;
   /* Command-line usage check kept disabled, as in the original:
   if (argc<2) {
      printf("USAGE: KMEANS PATTERN_FILE\n");
      exit(0);
   }*/
   (void)argc; (void)argv;   // unused while the filename is hard-coded
   // Original printed argv[...] here, but with the argc check disabled argv[1]
   // may be null -- report the hard-coded filename actually used instead.
   if (kmeans.LoadPatterns((char *)"KM2.DAT")==FAILURE ){
      printf("UNABLE TO READ PATTERN_FILE:%s\n","KM2.DAT");
      exit(EXIT_FAILURE);
   }
   kmeans.InitClusters();
   kmeans.RunKMeans();
   kmeans.ShowClusters();
   return 0;
}
五、输出结果
Input patterns:
Pattern[0]=(0.000,0.000)
Pattern[1]=(1.000,0.000)
Pattern[2]=(0.000,1.000)
Pattern[3]=(2.000,1.000)
Pattern[4]=(1.000,2.000)
Pattern[5]=(2.000,2.000)
Pattern[6]=(2.000,0.000)
Pattern[7]=(0.000,2.000)
Pattern[8]=(7.000,6.000)
Pattern[9]=(7.000,7.000)
Pattern[10]=(7.000,8.000)
Pattern[11]=(8.000,6.000)
Pattern[12]=(8.000,7.000)
Pattern[13]=(8.000,8.000)
Pattern[14]=(8.000,9.000)
Pattern[15]=(9.000,7.000)
Pattern[16]=(9.000,8.000)
Pattern[17]=(9.000,9.000) -------------------- Initial cluster centers:
ClusterCenter[0]=(0.000000,0.000000)
ClusterCenter[1]=(1.000000,0.000000) PASS=0
The distance from pattern to cluster is calculated as:
d=sqrt( .+ .)
Distance from pattern to cluster is 0.000000 The distance from pattern to cluster is calculated as:
d=sqrt( 1.0000+ .)
Distance from pattern to cluster is 1.000000 patern assigned to cluster The distance from pattern to cluster is calculated as:
d=sqrt( 1.0000+ .)
Distance from pattern to cluster is 1.000000 The distance from pattern to cluster is calculated as:
d=sqrt( .+ .)
Distance from pattern to cluster is 0.000000 patern assigned to cluster The distance from pattern to cluster is calculated as:
d=sqrt( .+ 1.0000)
Distance from pattern to cluster is 1.000000 The distance from pattern to cluster is calculated as:
d=sqrt( 1.0000+ 1.0000)
Distance from pattern to cluster is 1.414214 patern assigned to cluster The distance from pattern to cluster is calculated as:
d=sqrt( 4.0000+ 1.0000)
Distance from pattern to cluster is 2.236068 The distance from pattern to cluster is calculated as:
d=sqrt( 1.0000+ 1.0000)
Distance from pattern to cluster is 1.414214 patern assigned to cluster The distance from pattern to cluster is calculated as:
d=sqrt( 1.0000+ 4.0000)
Distance from pattern to cluster is 2.236068 The distance from pattern to cluster is calculated as:
d=sqrt( .+ 4.0000)
Distance from pattern to cluster is 2.000000 patern assigned to cluster The distance from pattern to cluster is calculated as:
d=sqrt( 4.0000+ 4.0000)
Distance from pattern to cluster is 2.828427 The distance from pattern to cluster is calculated as:
d=sqrt( 1.0000+ 4.0000)
Distance from pattern to cluster is 2.236068 patern assigned to cluster The distance from pattern to cluster is calculated as:
d=sqrt( 4.0000+ .)
Distance from pattern to cluster is 2.000000 The distance from pattern to cluster is calculated as:
d=sqrt( 1.0000+ .)
Distance from pattern to cluster is 1.000000 patern assigned to cluster The distance from pattern to cluster is calculated as:
d=sqrt( .+ 4.0000)
Distance from pattern to cluster is 2.000000 The distance from pattern to cluster is calculated as:
d=sqrt( 1.0000+ 4.0000)
Distance from pattern to cluster is 2.236068 patern assigned to cluster The distance from pattern to cluster is calculated as:
d=sqrt( 49.0000+ 36.0000)
Distance from pattern to cluster is 9.219544 The distance from pattern to cluster is calculated as:
d=sqrt( 36.0000+ 36.0000)
Distance from pattern to cluster is 8.485281 patern assigned to cluster The distance from pattern to cluster is calculated as:
d=sqrt( 49.0000+ 49.0000)
Distance from pattern to cluster is 9.899495 The distance from pattern to cluster is calculated as:
d=sqrt( 36.0000+ 49.0000)
Distance from pattern to cluster is 9.219544 patern assigned to cluster The distance from pattern to cluster is calculated as:
d=sqrt( 49.0000+ 64.0000)
Distance from pattern to cluster is 10.630146 The distance from pattern to cluster is calculated as:
d=sqrt( 36.0000+ 64.0000)
Distance from pattern to cluster is 10.000000 patern assigned to cluster The distance from pattern to cluster is calculated as:
d=sqrt( 64.0000+ 36.0000)
Distance from pattern to cluster is 10.000000 The distance from pattern to cluster is calculated as:
d=sqrt( 49.0000+ 36.0000)
Distance from pattern to cluster is 9.219544 patern assigned to cluster The distance from pattern to cluster is calculated as:
d=sqrt( 64.0000+ 49.0000)
Distance from pattern to cluster is 10.630146 The distance from pattern to cluster is calculated as:
d=sqrt( 49.0000+ 49.0000)
Distance from pattern to cluster is 9.899495 patern assigned to cluster The distance from pattern to cluster is calculated as:
d=sqrt( 64.0000+ 64.0000)
Distance from pattern to cluster is 11.313708 The distance from pattern to cluster is calculated as:
d=sqrt( 49.0000+ 64.0000)
Distance from pattern to cluster is 10.630146 patern assigned to cluster The distance from pattern to cluster is calculated as:
d=sqrt( 64.0000+ 81.0000)
Distance from pattern to cluster is 12.041595 The distance from pattern to cluster is calculated as:
d=sqrt( 49.0000+ 81.0000)
Distance from pattern to cluster is 11.401754 patern assigned to cluster The distance from pattern to cluster is calculated as:
d=sqrt( 81.0000+ 49.0000)
Distance from pattern to cluster is 11.401754 The distance from pattern to cluster is calculated as:
d=sqrt( 64.0000+ 49.0000)
Distance from pattern to cluster is 10.630146 patern assigned to cluster The distance from pattern to cluster is calculated as:
d=sqrt( 81.0000+ 64.0000)
Distance from pattern to cluster is 12.041595 The distance from pattern to cluster is calculated as:
d=sqrt( 64.0000+ 64.0000)
Distance from pattern to cluster is 11.313708 patern assigned to cluster The distance from pattern to cluster is calculated as:
d=sqrt( 81.0000+ 81.0000)
Distance from pattern to cluster is 12.727922 The distance from pattern to cluster is calculated as:
d=sqrt( 64.0000+ 81.0000)
Distance from pattern to cluster is 12.041595 patern assigned to cluster The new cluster centers are now calculated as:
Cluster Center0(1/3)( 0.000+ 0.000+ 0.000),
(1/3)( 0.000+ 1.000+ 2.000)
Cluster Center1(1/15)( 1.000+ 2.000+ 1.000+ 2.000+ 2.000+ 7.000+ 7.000+ 7.000+
 8.000+ 8.000+ 8.000+ 8.000+ 9.000+ 9.000+ 9.000),
(1/15)( 0.000+ 1.000+ 2.000+ 2.000+ 0.000+ 6.000+ 7.000+ 8.000+ 6.000+ 7.000+ 8.000
+ 9.000+ 7.000+ 8.000+ 9.000)
Cluster centers:
ClusterCenter[0]=(0.000000,1.000000)
ClusterCenter[1]=(5.866667,5.333333)
大数据挖掘算法篇之K-Means实例的更多相关文章
- Python聚类算法之基本K均值实例详解
Python聚类算法之基本K均值实例详解 本文实例讲述了Python聚类算法之基本K均值运算技巧.分享给大家供大家参考,具体如下: 基本K均值 :选择 K 个初始质心,其中 K 是用户指定的参数,即所 ...
- 图说十大数据挖掘算法(一)K最近邻算法
如果你之前没有学习过K最近邻算法,那今天几张图,让你明白什么是K最近邻算法. 先来一张图,请分辨它是什么水果 很多同学不假思索,直接回答:“菠萝”!!! 仔细看看同学们,这是菠萝么?那再看下边这这张图 ...
- python实现十大核心算法(桶排没实例)
# author:sevenduke # 2019-06-11 # 一.交换排序 # 排序算法的温故:冒泡排序 def dubblesort(arr): for i in range(0, len(a ...
- 详解十大经典数据挖掘算法之——Apriori
本文始发于个人公众号:TechFlow,原创不易,求个关注 今天是机器学习专题的第19篇文章,我们来看经典的Apriori算法. Apriori算法号称是十大数据挖掘算法之一,在大数据时代威风无两,哪 ...
- 机器学习——十大数据挖掘之一的决策树CART算法
本文始发于个人公众号:TechFlow,原创不易,求个关注 今天是机器学习专题的第23篇文章,我们今天分享的内容是十大数据挖掘算法之一的CART算法. CART算法全称是Classification ...
- 【十大经典数据挖掘算法】k
[十大经典数据挖掘算法]系列 C4.5 K-Means SVM Apriori EM PageRank AdaBoost kNN Naïve Bayes CART 1. 引言 k-means与kNN虽 ...
- 【十大经典数据挖掘算法】PageRank
[十大经典数据挖掘算法]系列 C4.5 K-Means SVM Apriori EM PageRank AdaBoost kNN Naïve Bayes CART 我特地把PageRank作为[十大经 ...
- 【十大经典数据挖掘算法】CART
[十大经典数据挖掘算法]系列 C4.5 K-Means SVM Apriori EM PageRank AdaBoost kNN Naïve Bayes CART 1. 前言 分类与回归树(Class ...
- ICDM评选:数据挖掘十大经典算法
原文地址:http://blog.csdn.net/aladdina/article/details/4141177 国际权威的学术组织the IEEE International Conferenc ...
随机推荐
- CDMA LTE FAQ2
1.UE等级 LTE CAT4,应该指的是LTE Category4,字面意思是LTE的ue-Category设置为4.ue-Category指的是UE的接入能力等级.也就是UE能够支持的传输速率的等 ...
- DevExpress v17.2新版亮点—WinForms篇(五)
用户界面套包DevExpress v17.2终于正式发布,本站将以连载的形式为大家介绍各版本新增内容.开篇介绍了DevExpress WinForms v17.2 Data Grid Control ...
- iOS 工程自动化 - OCLint
前言 最近一直在做 iOS 工程自动化方向的事情,所以把自己研究和实践的内容进行记录并分享,希望能给大家一些帮助. 为什么要使用 OCLint 做为一个静态代码分析工具,我们引入 OCLint 的目的 ...
- liunx系统和其它的基本命令
1.su 更换用户 2.sudo 管理员权限 3.PATH 4.sudo shutdown -h now 现在关机 sudo shutdown -r now 现在重启 5.kill ...
- Prime Ring Problem dfs
A ring is compose of n circles as shown in diagram. Put natural number 1, 2, ..., n into each circle ...
- 字符串匹配--扩展KMP模板
对于一个字符串 s 以及子串 t ,扩展KMP可以用来求 t 与 s 的每个子串的最长公共前缀 ext [ i ],当然,如果有某个 ext 值等于 t 串的长度 lent ,那么就说明从其对应的 i ...
- hdu1081 DP类最大子段和(二维压缩+前缀和数组/树状数组计数)
题意:给出一个 n * n 的数字矩阵,问最大子矩阵和是多少. 由于和最长子段和问题类似,一开始想到的就是 DP ,一开始我准备用两个循环进行 DP ,对于每一个 (i,j) ,考察(i - 1,j) ...
- DataFrame 列运算
import pandas as pd import StringIO table_buffer = StringIO.StringIO('''a b 2007-01-08 0.786667 270 ...
- MySQL--”自然键”和”代理键”优缺点
##=====================================================================================## 在数据库表设计中会纠 ...
- MySQL--查询表统计信息
============================================================= 可以用show table status 来查看表的信息,如:show ta ...