kmean算法C++实现
k-means(k均值)算法是最常见的聚类算法之一,实现简单,效果也比较好。k-means算法把n个对象划分成指定的k个簇,每个簇中所有对象的均值即为该簇的聚点(中心)。
k均值算法有如下五个步骤:
- 随机生成最初始k个簇心。可以从样本中随机选择,也可以根据样本中每个特征的取值特点随机生成。
- 对每个样本计算到每个簇心的欧氏距离,将样本划分到欧氏距离最小的簇心(聚点)。
- 对划分到同一个簇心(聚点)的样本计算平均值,用均值更新簇心(聚点)
- 若某些簇心(聚点)发生变化,转到步骤2;若所有的聚点都没有变化,转到步骤5
- 输出划分结果
#include <cassert>
#include <climits>
#include <cmath>
#include <cstddef>
#include <cstdlib>
#include <ctime>
#include <exception>
#include <fstream>
#include <iomanip>
#include <iostream>
#include <stdexcept>
#include <string>
#include <utility>
#include <vector>
using namespace std;
namespace terse {

/// K-means clustering over dense samples stored as rows of doubles.
///
/// Construct the object with the number of clusters and either a
/// comma-separated data file or an in-memory data set, then call
/// kmeansCluster(). The initial centroids are drawn at random from the
/// per-feature [min, max] range of the data, so results may differ between
/// runs.
class Kmeans {
private:
    std::vector<std::vector<double>> m_dataSet;     // one row per sample
    int m_k;                                        // number of clusters
    std::vector<int> m_clusterResult;               // cluster index per sample; -1 = not yet assigned
    std::vector<std::vector<double>> m_clusterCent; // centroids of the k clusters

private:
    /// Splits `s` at every character that occurs in `pattern`.
    ///
    /// Empty fields between two adjacent delimiters are returned as empty
    /// strings; a trailing delimiter does not produce a trailing empty field.
    std::vector<std::string> split(const std::string& s,
                                   const std::string& pattern) const {
        std::vector<std::string> res;
        std::size_t start = 0;
        while (start < s.size()) {
            const std::size_t end = s.find_first_of(pattern, start);
            if (end == std::string::npos) {
                // Last field: take everything up to the end of the string.
                res.push_back(s.substr(start));
                return res;
            }
            res.push_back(s.substr(start, end - start));
            start = end + 1;
        }
        return res;
    }

    /// Appends every non-empty line of `fileName` to the data set, parsing
    /// the line as comma-separated doubles.
    ///
    /// @throws std::runtime_error     if the file cannot be opened.
    /// @throws std::invalid_argument  (from std::stod) if a field is not a number.
    /// @throws std::out_of_range      (from std::stod) if a field does not fit a double.
    void loadDataSet(const char* fileName) {
        std::ifstream dataFile(fileName);
        if (!dataFile.is_open()) {
            throw std::runtime_error(std::string("open file ") + fileName
                                     + " failed!");
        }
        std::string line;
        // Looping on getline() itself (rather than on eof()) avoids pushing a
        // spurious empty sample after the final newline.
        while (std::getline(dataFile, line)) {
            if (!line.empty() && line.back() == '\r') {
                line.pop_back(); // tolerate CRLF line endings
            }
            if (line.empty()) {
                continue; // blank lines carry no sample
            }
            std::vector<double> row;
            const std::vector<std::string> fields = split(line, ",");
            row.reserve(fields.size());
            for (const std::string& field : fields) {
                row.push_back(std::stod(field));
            }
            m_dataSet.push_back(std::move(row));
        }
    }

    /// Checks the invariants every other member relies on: k is positive,
    /// there is at least one sample with at least one feature, and all
    /// samples have the same number of features.
    ///
    /// @throws std::invalid_argument if any of these does not hold.
    void validateInput() const {
        if (m_k <= 0) {
            throw std::invalid_argument("Kmeans: k must be positive");
        }
        if (m_dataSet.empty() || m_dataSet[0].empty()) {
            throw std::invalid_argument("Kmeans: data set is empty");
        }
        const std::size_t numOfFeats = m_dataSet[0].size();
        for (const std::vector<double>& row : m_dataSet) {
            if (row.size() != numOfFeats) {
                throw std::invalid_argument(
                    "Kmeans: all samples must have the same number of features");
            }
        }
    }

    /// Shared constructor tail: validates the input, sizes the result and
    /// centroid storage, and draws the initial centroids.
    void init() {
        validateInput();
        // assign(), not reserve(): the labels are indexed directly later on,
        // so the elements must actually exist.
        m_clusterResult.assign(m_dataSet.size(), -1);
        m_clusterCent.assign(static_cast<std::size_t>(m_k),
                             std::vector<double>(m_dataSet[0].size(), 0.0));
        generateRandCent();
    }

    /// Returns the Euclidean distance between two vectors of equal length.
    double distEclud(const std::vector<double>& v1,
                     const std::vector<double>& v2) const {
        assert(v1.size() == v2.size());
        double dist = 0.0;
        for (std::size_t i = 0; i < v1.size(); ++i) {
            const double diff = v1[i] - v2[i];
            dist += diff * diff;
        }
        return std::sqrt(dist);
    }

    /// Fills every centroid with a random point inside the per-feature
    /// [min, max] bounding box of the data set.
    void generateRandCent() {
        const std::size_t numOfFeats = m_dataSet[0].size();
        // first: min, second: max of each feature
        std::vector<std::pair<double, double>> minMaxOfFeat(numOfFeats);
        for (std::size_t j = 0; j < numOfFeats; ++j) {
            minMaxOfFeat[j].first = m_dataSet[0][j];
            minMaxOfFeat[j].second = m_dataSet[0][j];
        }
        for (const std::vector<double>& row : m_dataSet) {
            for (std::size_t j = 0; j < numOfFeats; ++j) {
                if (row[j] > minMaxOfFeat[j].second) {
                    minMaxOfFeat[j].second = row[j];
                }
                if (row[j] < minMaxOfFeat[j].first) {
                    minMaxOfFeat[j].first = row[j];
                }
            }
        }
        std::srand(static_cast<unsigned>(std::time(nullptr)));
        for (std::vector<double>& cent : m_clusterCent) {
            for (std::size_t j = 0; j < numOfFeats; ++j) {
                const double fraction =
                    static_cast<double>(std::rand()) / RAND_MAX;
                cent[j] = minMaxOfFeat[j].first
                    + (minMaxOfFeat[j].second - minMaxOfFeat[j].first) * fraction;
            }
        }
    }

    /// Prints the iteration number followed by all centroids on one line.
    void printClusterCent(int iter) const {
        std::cout << "iter = " << iter;
        for (const std::vector<double>& cent : m_clusterCent) {
            std::cout << " {";
            for (double value : cent) {
                std::cout << value << ",";
            }
            std::cout << "};";
        }
        std::cout << '\n';
    }

    /// Writes one line per sample to `fileName`: the tab-separated features
    /// followed by the index of the cluster the sample was assigned to.
    /// On failure to open the file an error is reported on stderr and nothing
    /// is written.
    void writeResult(const char* fileName = "res.txt") const {
        std::ofstream fout(fileName);
        if (!fout.is_open()) {
            std::cerr << "open file " << fileName << "failed!";
            return;
        }
        for (std::size_t i = 0; i < m_dataSet.size(); ++i) {
            for (double value : m_dataSet[i]) {
                fout << value << "\t";
            }
            // No precision manipulator here: it has no effect on the integer
            // label but would silently change how the doubles of all later
            // rows are formatted.
            fout << m_clusterResult[i] << "\n";
        }
    }

public:
    /// Builds a clusterer for the comma-separated samples in `fileName`.
    ///
    /// @param k        number of clusters; must be positive.
    /// @param fileName path of a text file with one sample per line.
    /// @throws std::runtime_error    if the file cannot be opened.
    /// @throws std::invalid_argument if k <= 0, the file holds no samples,
    ///                               a field is not numeric, or the rows
    ///                               differ in length.
    Kmeans(int k, const char* fileName) : m_k(k) {
        loadDataSet(fileName);
        init();
    }

    /// Builds a clusterer for an in-memory data set (copied).
    ///
    /// @param k    number of clusters; must be positive.
    /// @param data one row per sample; all rows must have the same length.
    /// @throws std::invalid_argument if k <= 0, `data` is empty, or the rows
    ///                               differ in length.
    Kmeans(int k, const std::vector<std::vector<double>>& data)
        : m_dataSet(data), m_k(k) {
        init();
    }

    /// Cluster index of every sample, in data-set order (-1 before
    /// kmeansCluster() has run).
    const std::vector<int>& clusterResult() const { return m_clusterResult; }

    /// Current centroids, one row per cluster.
    const std::vector<std::vector<double>>& clusterCenters() const {
        return m_clusterCent;
    }

    /// Runs the assign/update iteration until no sample changes cluster, then
    /// writes the result with writeResult() (default file "res.txt").
    ///
    /// @param verbose when non-zero, the centroids are printed after every
    ///                iteration.
    void kmeansCluster(int verbose = 0) {
        const std::size_t numOfFeats = m_dataSet[0].size();
        const std::size_t numOfSamples = m_dataSet.size();
        const std::size_t k = static_cast<std::size_t>(m_k);
        int iter = 0;
        bool isClusterChanged = true;
        while (isClusterChanged) {
            isClusterChanged = false;
            // step 1: find the nearest centroid of each point
            for (std::size_t i = 0; i < numOfSamples; ++i) {
                // Start from centroid 0 so the index is always valid, even if
                // every distance compares false (e.g. NaN in the data).
                int minIndex = 0;
                double minDist = distEclud(m_clusterCent[0], m_dataSet[i]);
                for (int j = 1; j < m_k; ++j) {
                    const double dist = distEclud(m_clusterCent[j], m_dataSet[i]);
                    if (dist < minDist) {
                        minDist = dist;
                        minIndex = j;
                    }
                }
                if (m_clusterResult[i] != minIndex) {
                    isClusterChanged = true;
                    m_clusterResult[i] = minIndex;
                }
            }
            // step 2: update cluster centers
            std::vector<std::size_t> cnt(k);
            std::vector<std::vector<double>> sums(
                k, std::vector<double>(numOfFeats, 0.0));
            for (std::size_t i = 0; i < numOfSamples; ++i) {
                const std::size_t c =
                    static_cast<std::size_t>(m_clusterResult[i]);
                for (std::size_t j = 0; j < numOfFeats; ++j) {
                    sums[c][j] += m_dataSet[i][j];
                }
                ++cnt[c];
            }
            // mean of the vectors that belong to each cluster
            for (std::size_t c = 0; c < k; ++c) {
                if (cnt[c] == 0) {
                    // An empty cluster keeps its previous centroid; dividing
                    // by zero would turn it into NaN for good.
                    continue;
                }
                for (std::size_t j = 0; j < numOfFeats; ++j) {
                    m_clusterCent[c][j] =
                        sums[c][j] / static_cast<double>(cnt[c]);
                }
            }
            if (verbose) {
                printClusterCent(iter++);
            }
        }
        writeResult();
    }
};

} // namespace terse
- int main(){
- terse::Kmeans kmeans(,"datafile.txt");
- kmeans.kmeansCluster();
- return ;
- }
- /*namespace terse*/
kmean算法C++实现的更多相关文章
- <转>与EM相关的两个算法-K-mean算法以及混合高斯模型
转自http://www.cnblogs.com/jerrylead/archive/2011/04/06/2006924.html http://www.cnblogs.com/jerrylead/ ...
- EM相关两个算法 k-mean算法和混合高斯模型
转自http://www.cnblogs.com/jerrylead/archive/2011/04/06/2006924.html http://www.cnblogs.com/jerrylead/ ...
- 机器学习课程-第8周-聚类(Clustering)—K-Mean算法
1. 聚类(Clustering) 1.1 无监督学习: 简介 在一个典型的监督学习中,我们有一个有标签的训练集,我们的目标是找到能够区分正样本和负样本的决策边界,在这里的监督学习中,我们有一系列标签 ...
- K-Means聚类算法原理
K-Means算法是无监督的聚类算法,它实现起来比较简单,聚类效果也不错,因此应用很广泛.K-Means算法有大量的变体,本文就从最传统的K-Means算法讲起,在其基础上讲述K-Means的优化变体 ...
- 学习OpenCV——Kmean(C++)
从前也练习使用过OpenCV的Kmean算法,但是那版本低,而且也是基于C的开发.这两天由于造论文的需要把它重新翻出来在研究一下C++,发现有了些改进 kmeans C++: doublekmeans ...
- 运用三角不等式加速Kmeans聚类算法
运用三角不等式加速Kmeans聚类算法 引言:最近在刷<数据挖掘导论>,第九章, 9.5.1小节有提到,可以用三角不等式,减少不必要的距离计算,从而达到加速聚类算法的目的.这在超大数据量的 ...
- MLlib--PIC算法
转载请标明出处http://www.cnblogs.com/haozhengfei/p/82c3ef86303321055eb10f7e100eb84b.html PIC算法 幂迭代聚类 ...
- ML: 聚类算法-K均值聚类
基于划分方法聚类算法R包: K-均值聚类(K-means) stats::kmeans().fpc::kmeansruns() K-中心点聚类(K-Medoids) ...
- K-SVD算法
它与K-mean算法原理上是类似的: K-mean 算法: (之前写过:http://www.cnblogs.com/yinheyi/p/6132362.html) 对于初始化的类别中心,可以看作初化 ...
随机推荐
- PHP环境的搭建及与nginx的集成
1. 去php官网下载最新稳定版(最新其实是7.0,为了兼容性,使用5.6.16) wget http://cn2.php.net/get/php-5.6.16.tar.gz/from/this/m ...
- 关于java的volatile关键字与线程栈的内容以及单例的DCL
用volatile修饰的变量,线程在每次使用变量的时候,都会读取变量修改后的最新的值.volatile很容易被误用,用来进行原子性操作. package com.guangshan.test; pub ...
- Android-ActionBar-与Menu结合
ActionBar就是一个标题栏,以前Android3.0之前还称为标题栏,Android3.0之后取名为ActionBar 首先必须在AndroidManifest.xml中指定Applicatio ...
- MVC-1.1 BundleConfig-ScriptBundle
App_Start中的BudleCnfig.cs中 bundles.Add(new ScriptBundle("~/bundles/jquery").Include( " ...
- html隐藏元素
<body> <div>display:元素的位置不被占用</div> <div id="div1" style="displa ...
- XPath高级用法(冰山一角)
运算符+内置函数 使用XPath选择元素时,使用运算符+内置函数来进行筛选: .//div[contains(@class,"ec_desc") or contains(@clas ...
- AndroidStudio的一些快捷键的使用
1.返回上一次浏览快捷键的设置 https://blog.csdn.net/yingtian648/article/details/73277388 2.格式化代码的快捷键的设置 htt ...
- OSX - 可以安装任何程序!
在shell里面执行命令: sudo spctl --master-disable 参考: https://www.jianshu.com/p/010cc30228f3
- C - 前m大的数 (结构体)
点击打开链接 还记得Gardon给小希布置的那个作业么?(上次比赛的1005)其实小希已经找回了原来的那张数表,现在她想确认一下她的答案是否正确,但是整个的答案是很庞大的表,小希只想让你把答案中最大的 ...
- flask源码解析之上下文为什么用栈
楔子 我在之前的文章<flask源码解析之上下文>中对flask上下文流程进行了详细的说明,但是在学习的过程中我一直在思考flask上下文中为什么要使用栈完成对请求上下文和应用上下文的入栈 ...