Kmeans中文聚类
中文文本kmeans聚类
原理:
K就是将原始数据分为K类,Means即均值点。K-Means的核心就是将一堆数据聚集为K个簇,每个簇中都有一个中心点称为均值点,簇中所有点到该簇的均值点的距离都较到其他簇的均值点更近。
实现步骤:
1、给出k个初始聚类中心
2、重复执行:
把每一个数据对象重新分配到k个聚类中心处,形成k个簇
重新计算每一个簇的聚类中心
3、直到聚类中心不再发生变化时,聚类结束
两种方法:
①
from sklearn.feature_extraction.text import CountVectorizer,TfidfTransformer
from sklearn.cluster import KMeans
from sklearn import metrics
import numpy as np;
import jieba
from DBUtils import update_keyword
def easy_get_parameter_k_means(path="keyword.txt", n_clusters_start=20, n_clusters_end=20):
    """Cluster the keywords in *path* with TF-IDF features and K-Means.

    Each non-blank line of *path* is one keyword. Keywords are segmented
    with jieba, vectorised with CountVectorizer + TfidfTransformer and
    clustered once for every k in ``[n_clusters_start, n_clusters_end]``.
    The labelling with the highest silhouette score is kept (with the
    default arguments only k=20 is tried, as before), written to the
    database through ``update_keyword`` and returned.

    Args:
        path: UTF-8 text file with one keyword per line.
        n_clusters_start: smallest number of clusters to try.
        n_clusters_end: largest number of clusters to try (inclusive).

    Returns:
        list[int]: cluster label of every keyword, in file order.

    Raises:
        ValueError: if the range of cluster counts is empty.
    """
    if n_clusters_start > n_clusters_end:
        raise ValueError("n_clusters_start must not exceed n_clusters_end")

    # Close the file deterministically and ignore blank lines, so that an
    # empty keyword is never written to the database.
    with open(path, encoding='utf-8') as file:
        keywords = [line.replace('\n', '') for line in file if line.strip()]

    # CountVectorizer expects whitespace-separated tokens, hence the join.
    segmented = [" ".join(jieba.cut(keyword)) for keyword in keywords]
    counts = CountVectorizer().fit_transform(segmented)
    # TfidfTransformer accepts the sparse count matrix directly; only
    # K-Means is fed the dense array, as in the original script.
    tfidf = TfidfTransformer().fit_transform(counts)
    features = tfidf.toarray()

    best_score = None
    clusters = []
    for k in range(n_clusters_start, n_clusters_end + 1):
        km = KMeans(n_clusters=k)
        km.fit(features)
        labels = km.labels_.tolist()
        score = metrics.silhouette_score(X=tfidf, labels=labels)
        if best_score is None or score > best_score:
            best_score = score
            clusters = labels

    # Report cluster by cluster; the stable sort keeps file order within
    # each cluster, and no hard-coded cluster count is needed.
    for label, keyword in sorted(zip(clusters, keywords), key=lambda pair: pair[0]):
        update_keyword(keyword, str(label))
        print("更新关键词为:", keyword, '的分类号为', label)
    return clusters


# Run the clustering once and reuse the result: calling the function for
# each print would repeat the clustering and every database update.
arrs = easy_get_parameter_k_means()
print("arrs", arrs)
print("arrs[length]", len(arrs))
②此方法是读取多个文件构建词库空间,但是当数据文件过多时运行很慢很慢
求特征值 求TF-IDF 利用kmeans算法求聚类中心 和 聚类分类以及各点距离其聚类中心的距离
import os;
import jieba;
import numpy as np;
from numpy import *
import matplotlib.pyplot as plt
import os


def read_from_file(file_name):
    """Return the whole content of the UTF-8 text file *file_name*."""
    with open(file_name, "r", encoding='UTF8') as fp:
        return fp.read()


def stop_words(stop_word_file):
    """Return the set of tokens jieba produces for *stop_word_file*."""
    return set(jieba.cut(read_from_file(stop_word_file)))


def del_stop_words(words, stop_words_set):
    """Segment *words* with jieba and drop tokens found in *stop_words_set*.

    Returns the remaining tokens as a list, in their original order.
    """
    return [token for token in jieba.cut(words) if token not in stop_words_set]


def build_count_matrix(docs):
    """Return the term-count matrix (documents x vocabulary) for *docs*.

    *docs* is a list of token lists. Columns follow the sorted vocabulary,
    so the layout is reproducible between runs.
    """
    vocabulary = sorted({token for doc in docs for token in doc})
    column_of = {token: col for col, token in enumerate(vocabulary)}
    counts = np.zeros((len(docs), len(vocabulary)))
    for row, doc in enumerate(docs):
        for token in doc:
            counts[row, column_of[token]] += 1.0
    return counts


def compute_tfidf(counts):
    """Return the TF-IDF matrix for a term-count matrix.

    TF is the count divided by the document length (all-zero rows stay
    zero); IDF is ``log(n_docs / document_frequency)``.
    """
    counts = np.asarray(counts, dtype=float)
    if counts.size == 0:
        return counts
    doc_freq = np.count_nonzero(counts, axis=0)
    # A term that occurs in no document keeps a finite IDF; its TF is 0.
    idf = np.log(counts.shape[0] / np.maximum(doc_freq, 1))
    row_totals = counts.sum(axis=1, keepdims=True)
    row_totals[row_totals == 0] = 1.0
    # The original loop rebound its loop variable and therefore never
    # normalised anything; the division below really applies the TF step.
    # Broadcasting replaces the dense V x V diagonal IDF matrix.
    return (counts / row_totals) * idf


def get_all_vector(stop_words_set):
    """Write the TF-IDF features of ``keyword.txt`` to ``tezheng.txt``.

    Every line of ``keyword.txt`` is one document. The output holds one
    line per document, each value formatted with two decimals and
    followed by a tab.
    """
    with open("keyword.txt", encoding='utf-8') as file:
        docs = [del_stop_words(post, stop_words_set) for post in file]

    docs_matrix = build_count_matrix(docs)
    print("docs_matrix:", docs_matrix)
    tfidf = compute_tfidf(docs_matrix)
    print("idf:", tfidf)

    # Mode "w": the feature file is overwritten on every run.
    with open("tezheng.txt", "w", encoding='utf8') as file:
        for row in tfidf:
            file.write("".join(format(float(value), '.2f') + "\t" for value in row))
            file.write("\n")


def loadDataSet(fileName):
    """Load a tab-separated file of floats and return it as a matrix.

    Blank lines are skipped; the trailing tab written by
    ``get_all_vector`` is removed by ``strip`` before splitting.
    """
    dataSet = []
    with open(fileName, encoding='utf8') as fr:
        for line in fr:
            line = line.strip()
            if not line:
                continue
            dataSet.append([float(value) for value in line.split('\t')])
    # np.asmatrix instead of the `mat` alias, which NumPy 2.0 removed.
    return np.asmatrix(dataSet)


def randCent(dataSet, k):
    """Return *k* random centroids inside the bounding box of *dataSet*.

    The result is a (k, n) matrix; column j is drawn uniformly between
    the minimum and maximum of column j of *dataSet*.
    """
    n = np.shape(dataSet)[1]
    centroids = np.asmatrix(np.zeros((k, n)))
    for j in range(n):
        minJ = np.min(dataSet[:, j])
        rangeJ = float(np.max(dataSet[:, j]) - minJ)
        centroids[:, j] = np.asmatrix(minJ + rangeJ * np.random.rand(k, 1))
    return centroids


def distEclud(vecA, vecB):
    """Return the Euclidean distance between *vecA* and *vecB* as a float."""
    # `math` is not imported in this script; the original only worked if
    # the name leaked in through `from numpy import *`, so use np instead.
    return float(np.sqrt(np.sum(np.power(vecA - vecB, 2))))


def kMeans(dataSet, k, distMeas=distEclud, createCent=randCent):
    """Cluster the rows of *dataSet* into *k* clusters.

    Args:
        dataSet: matrix of samples, one row per sample.
        k: number of clusters.
        distMeas: distance function taking two row vectors.
        createCent: function ``(dataSet, k)`` returning the initial centroids.

    Returns:
        ``(centroids, clusterAssment)``: the (k, n) centroid matrix and an
        (m, 2) matrix holding, per sample, the index of its cluster and
        the squared distance to that cluster's centroid.
    """
    m = np.shape(dataSet)[0]
    clusterAssment = np.asmatrix(np.zeros((m, 2)))
    centroids = createCent(dataSet, k)
    clusterChanged = True
    while clusterChanged:  # stop once no sample changes its cluster
        clusterChanged = False
        for i in range(m):
            minDist = np.inf
            minIndex = -1
            for j in range(k):  # find the closest centroid
                distJI = distMeas(centroids[j, :], dataSet[i, :])
                if distJI < minDist:
                    minDist = distJI
                    minIndex = j
            if clusterAssment[i, 0] != minIndex:
                clusterChanged = True
            clusterAssment[i, :] = minIndex, minDist ** 2
        print(centroids)
        for cent in range(k):  # move every centroid to the mean of its members
            ptsInClust = dataSet[np.nonzero(clusterAssment[:, 0].A == cent)[0]]
            # An empty cluster keeps its centroid: the mean of zero rows
            # is NaN and would poison all later distance comparisons.
            if ptsInClust.shape[0] > 0:
                centroids[cent, :] = np.mean(ptsInClust, axis=0)
    return centroids, clusterAssment


if __name__ == '__main__':
    wenzhang = read_from_file('keyword.txt')
    wenzhang1 = stop_words('stopword.txt')
    # wenzhang2 is not used below; the features come from get_all_vector.
    wenzhang2 = del_stop_words(wenzhang, wenzhang1)
    wenzhang3 = get_all_vector(wenzhang1)
    dataSet = loadDataSet('tezheng.txt')
    centroids, clusterAssment = kMeans(dataSet, 10, distMeas=distEclud, createCent=randCent)
    print("centroids:", centroids)
    print("clusterAssment :", clusterAssment)
print("clusterAssmentlengh :", len(clusterAssment)) '''
import os;
import jieba;
import numpy as np;
from numpy import *
import matplotlib.pyplot as plt
import os def file_name(file_dir):
filesname = []
for root, dirs, files in os.walk(file_dir):
for file in files:
filename = 'keywordfile/' + file
filesname.append(filename)
print("filesname length:", len(filesname))
return filesname def read_from_file(file_name): # 读取原文章 with open(file_name, "r", encoding='UTF8') as fp:
words = fp.read()
return words def stop_words(stop_word_file):
words = read_from_file(stop_word_file)
result = jieba.cut(words)
new_words = []
for r in result:
new_words.append(r)
return set(new_words) def del_stop_words(words, stop_words_set):
# words是已经切词但是没有去除停用词的文档。
# 返回的会是去除停用词后的文档
result = jieba.cut(words)
new_words = []
for r in result:
if r not in stop_words_set:
new_words.append(r)
return new_words def get_all_vector(file_path, stop_words_set):
# names = [os.path.join(file_path, f) for f in os.listdir(file_path)]
names = file_name('keyfile')
posts = [open(name, encoding='utf-8').read() for name in names]
docs = []
word_set = set()
for post in posts:
print('post', post)
doc = del_stop_words(post, stop_words_set)
docs.append(doc)
word_set |= set(doc)
# print len(doc),len(word_set)
# print("word_Set:",word_set)
# print("docs:", docs)
word_set = list(word_set)
docs_vsm = []
# for word in word_set[:30]:
# print word.encode("utf-8"),
for doc in docs:
temp_vector = []
for word in word_set:
temp_vector.append(doc.count(word) * 1.0)
# print temp_vector[-30:-1]
docs_vsm.append(temp_vector) docs_matrix = np.array(docs_vsm)
print("docs_matrix:", docs_matrix) column_sum = [float(len(np.nonzero(docs_matrix[:, i])[0])) for i in range(docs_matrix.shape[1])]
column_sum = np.array(column_sum)
column_sum = docs_matrix.shape[0] / column_sum
idf = np.log(column_sum)
idf = np.diag(idf)
# 注意一下计算都是矩阵运算,不是单个变量的运算。
for doc_v in docs_matrix:
if doc_v.sum() == 0:
doc_v = doc_v / 1
else:
doc_v = doc_v / (doc_v.sum())
tfidf = np.dot(docs_matrix, idf)
# return names, tfidf
print("idf:", tfidf)
f = "tezheng.txt"
with open(f, "w", encoding='utf8') as file: # ”w"代表着每次运行都覆盖内容
for i in tfidf:
for j in i:
datafl = str(format(float(j), '.2f')) file.write(datafl + "\t")
file.write("\n") def loadDataSet(fileName):
dataSet = [] # 初始化一个空列表
fr = open(fileName)
for line in fr.readlines():
# 切割每一行的数据
curLine = line.strip().split('\t')
# 将数据追加到dataMat,映射所有的元素为 float类型
fltLine = list(map(float, curLine))
dataSet.append(fltLine)
return mat(dataSet) def randCent(dataSet, k):
n = shape(dataSet)[1]
centroids = mat(zeros((k,n)))#用mat函数转换为矩阵之后可以才进行一些线性代数的操作
for j in range(n):#在每个维度的边界内,创建簇心。
minJ = min(dataSet[:,j])
rangeJ = float(max(dataSet[:,j]) - minJ)
centroids[:,j] = mat(minJ + rangeJ * random.rand(k,1))
return centroids def randCent(dataSet,k):
m,n = dataSet.shape
centroids = np.zeros((k,n))
for i in range(k):
index = int(np.random.uniform(0,m)) # centroids[i,:] = dataSet[index,:]
return centroids def randCent(dataSet, k):
n = shape(dataSet)[1]
centroids = mat(zeros((k, n))) # create centroid mat
for j in range(n): # create random cluster centers, within bounds of each dimension
minJ = min(dataSet[:, j])
rangeJ = float(max(dataSet[:, j]) - minJ)
centroids[:, j] = mat(minJ + rangeJ * random.rand(k, 1))
return centroids def distEclud(vecA, vecB):
return math.sqrt(sum(power(vecA - vecB, 2))) # dataSet样本点,k 簇的个数
# disMeas距离量度,默认为欧几里得距离
# createCent,初始点的选取 def K_means(dataSet,k,distMeas = distEclud,createCent = randCent):
print("样本点:::",dataSet)
m = shape(dataSet)[0]#样本数
print('样本数:',m)
clusterAssment = mat(zeros((m,2)))#m*2的矩阵
centroids = createCent(dataSet,k)#初始化k个中心
clusterChanged = True
while clusterChanged:#当聚类不再变化
clusterChanged = False
for i in range(m):
minDist = math.inf;minIndex = -1
for j in range(k):#找到最近的质心
distJI = distMeas(centroids[j,:],dataSet[i,:])
if distJI < minDist:
minDist = distJI;minIndex = j
if clusterAssment[i,0] !=minIndex:clusterChanged = True
#第一列为所属质心,第二列为距离
clusterAssment[i,:] = minIndex,minDist**2
print(centroids) #更改质心位置
for cent in range(k):
ptsInClust = dataSet[nonzero(clusterAssment[:,0].A==cent)[0]]
centroids[cent,:] = mean(ptsInClust,axis=0)
return centroids,clusterAssment def K_Means(dataSet, k, distMeas=distEclud, createCent=randCent):
m = shape(dataSet)[0] # 样本数
clusterAssment = mat(zeros((m, 2))) # m*2的矩阵
centroids = createCent(dataSet, k) # 初始化k个中心
clusterChanged = True
while clusterChanged: # 当聚类不再变化
clusterChanged = False
for i in range(m):
minDist = inf;
minIndex = -1
for j in range(k): # 找到最近的质心
distJI = distMeas(centroids[j, :], dataSet[i, :])
if distJI < minDist:
minDist = distJI;
minIndex = j
if clusterAssment[i, 0] != minIndex: clusterChanged = True
# 第1列为所属质心,第2列为距离
clusterAssment[i, :] = minIndex, minDist ** 2
print(centroids) # 更改质心位置
for cent in range(k):
ptsInClust = dataSet[nonzero(clusterAssment[:, 0].A == cent)[0]]
centroids[cent, :] = mean(ptsInClust, axis=0)
return centroids, clusterAssment if __name__ == '__main__':
wenzhang = read_from_file('input.txt')
# print(wenzhang)
wenzhang1 = stop_words('stopword.txt')
# print(wenzhang1)
wenzhang2 = del_stop_words(wenzhang, wenzhang1)
# print(wenzhang2)
wenzhang3 = get_all_vector('D:/Pycharm/项目存储/input/', wenzhang1)
# kMeans(dataSet, k, distMeas=gen_sim, createCent=randCent)
dataSet = loadDataSet('tezheng.txt')
centroids, clusterAssment = K_Means(dataSet, 3, distMeas=distEclud, createCent=randCent) print("centroids:", centroids)
print("clusterAssment :", clusterAssment)
print("clusterAssmentlengh :", len(clusterAssment))
'''
Kmeans中文聚类的更多相关文章
- 机器学习之K均值算法(K-means)聚类
K均值算法(K-means)聚类 [关键词]K个种子,均值 一.K-means算法原理 聚类的概念:一种无监督的学习,事先不知道类别,自动将相似的对象归到同一个簇中. K-Means算法是一种聚类分析 ...
- 利用模拟退火提高Kmeans的聚类精度
http://www.cnblogs.com/LBSer/p/4605904.html Kmeans算法是一种非监督聚类算法,由于原理简单而在业界被广泛使用,一般在实践中遇到聚类问题往往会优先使用Km ...
- Python_sklearn机器学习库学习笔记(五)k-means(聚类)
# K的选择:肘部法则 如果问题中没有指定 的值,可以通过肘部法则这一技术来估计聚类数量.肘部法则会把不同 值的成本函数值画出来.随着 值的增大,平均畸变程度会减小:每个类包含的样本数会减少,于是样本 ...
- k-means均值聚类算法(转)
4.1.摘要 在前面的文章中,介绍了三种常见的分类算法.分类作为一种监督学习方法,要求必须事先明确知道各个类别的信息,并且断言所有待分类项都有一个类别与之对应.但是很多时候上述条件得不到满足,尤其是在 ...
- K-means算法[聚类算法]
聚类算法k-Means的实现 <?php /* *Kmeans法(聚类算法的实现) */ /* *求误差平方和J */ //----------------------------------- ...
- 使用K-means进行聚类,用calinski_harabaz_score评价聚类效果
代码如下: """ 下面的方法是用kmeans方法进行聚类,用calinski_harabaz_score方法评价聚类效果的好坏 大概是类间距除以类内距,因此这个值越大越 ...
- kmeans均值聚类算法实现
这个算法中文名为k均值聚类算法,首先我们在二维的特殊条件下讨论其实现的过程,方便大家理解. 第一步.随机生成质心 由于这是一个无监督学习的算法,因此我们首先在一个二维的坐标轴下随机给定一堆点,并随即给 ...
- 基于K-means Clustering聚类算法对电商商户进行级别划分(含Octave仿真)
在从事电商做频道运营时,每到关键时间节点,大促前,季度末等等,我们要做的一件事情就是品牌池打分,更新所有店铺的等级.例如,所以的商户分入SKA,KA,普通店铺,新店铺这4个级别,对于不同级别的商户,会 ...
- 【机器学习】:Kmeans均值聚类算法原理(附带Python代码实现)
这个算法中文名为k均值聚类算法,首先我们在二维的特殊条件下讨论其实现的过程,方便大家理解. 第一步.随机生成质心 由于这是一个无监督学习的算法,因此我们首先在一个二维的坐标轴下随机给定一堆点,并随即给 ...
- K-means算法-聚类
算法过程如下: 1)从N个文档随机选取K个文档作为质心 2)对剩余的每个文档测量其到每个质心的距离,并把它归到最近的质心的类 3)重新计算已经得到的个各类的质心 4)迭代2~3步直至新的质心与原质心相 ...
随机推荐
- CH9120/CH9121 WCH-ETH透传芯片(持续更新)
网络变压器中心抽头: 如果使用网络变压器,变压器的中心抽头需要看PHY芯片时电流型还是电压型. 如果是电压型,则需要通过一个电容直接接到GND. 如果时电流型的PHY,那么就需要根据PHY芯片来看,从 ...
- [OpenCV实战]12 使用深度学习和OpenCV进行手部关键点检测
目录 1 背景 2 实现 3 结果和代码 4 参考 手部关键点检测是在手指上找到关节以及在给定图像中找到指尖的过程.它类似于在脸部(面部关键点检测)或身体(人体姿势估计)上找到关键点.但是手部检测不同 ...
- python进阶之路4基本运算符、格式化输出
内容回顾 PEP8规范 代码编写规范及美观 python注释语法 平时养成写注释的习惯 1.警号 2.三个单引号 3.三个双引号 常量与变量 1.变量语法结构 变量名 赋值符合 数据值 2.底层原理 ...
- 微服务框架——MybatisPlus
MybatisPlus 一.快速入门 1.mybatisPlus特性 无侵入:只增强,不改变. 损耗小:启动的时候直接注入基本CRUD 强大的CRUD操作:提供通用Mapper,通用service,条 ...
- [阿里云]Datahub测试使用记录
由于需要测试阿里云Datahub功能,因此测了一下Datahub的一些功能 DATAHUB: 简介: 阿里云的流式数据(streaming)处理平台 对流式数据的发布(publish)订阅(subsc ...
- vue3实现一个抽奖小项目
前言 在公司年会期间我做了个抽奖小项目,我把它分享出来,有用得着的可以看下. 浏览链接:http://xisite.top/original/luck-draw/index.html 项目链接:htt ...
- C#中检查null的语法糖,非常实用
c#处理null的几个语法糖,非常实用.(尤其是文末Dictionary那个案例,记得收藏) ??如果左边是的null,那么返回右边的操作数,否则就返回左边的操作数,这个在给变量赋予默认值非常好用. ...
- Win10环境下yolov8(ultralytics) 快速配置与测试
win10下亲测有效!(如果想在tensorrt+cuda下部署,直接看第五5章) 一.win10下创建yolov8环境 # 注:python其他版本在win10下,可能有坑,我已经替你踩坑了,这里p ...
- java进阶P-2.7
类函数 函数 用于按指定字符(串)或正则去分割某个字符串,结果以字符串数组形式返回:对某些特殊字符,如果字符(串)正好是正则的一部分,则需要转义才能使用 字符有 | , + , * , ^ , $ , ...
- 读Java8函数式编程笔记02_流
1. 外部迭代 1.1. for循环是一个封装了迭代的语法糖 1.1.1. 本质上来讲是一种串行化操作 1.2. 很难抽象出不同操作 2. 内部迭代 2.1. 内部迭代中的相应接口:Stream 2. ...