【Python】统计个人新浪微博词频并给出相应的柱状图

Python爬虫视频教程零基础小白到scrapy爬虫高手-轻松入门

https://item.taobao.com/item.htm?spm=a1z38n.10677092.0.0.482434a6EmUbbW&id=564564604865

本文介绍如何进行个人新浪微博词频统计，并给出相应的柱状图分析，编程环境为Python 2.7。该文主要包括三个部分：新浪微博API的使用、文本过滤及分词和词频统计。

一、新浪微博API的使用

首先在新浪微博开放平台http://open.weibo.com/development/上申请开发者账号，获取个人APP_KEY和APP_SECRET，下载并安装Python SDK。本文介绍的方法无需每次验证，直接运行即可。

# -*- coding: UTF-8 -*-

from weibo import APIClient

from re import split

import urllib,httplib

import webbrowser

import operator

import numpy as np

import matplotlib.pyplot as plt

class iWInsightor(object):
    """Thin wrapper around the Sina Weibo SDK client used by this script.

    Creating an instance logs in with the given account, obtains an OAuth2
    access token and stores an authorized ``APIClient`` in ``self.client``.
    The remaining methods fetch data for a user id and either return it,
    append it to a file, or plot it.
    """

    def __init__(self, ID, PW):
        """Store the credentials, build the SDK client and authorize it.

        :param ID: Weibo account name used to log in.
        :param PW: password of that account.
        """
        self.ACCOUNT = ID
        self.PASSWORD = PW
        self.CALLBACK_URL = 'https://api.weibo.com/oauth2/default.html'
        self.APP_KEY = 'XXXXXXX'  # Yours
        self.APP_SECRET = 'XXXXXX'  # Yours
        self.client = APIClient(app_key=self.APP_KEY,
                                app_secret=self.APP_SECRET,
                                redirect_uri=self.CALLBACK_URL)
        self.url = self.client.get_authorize_url()
        self.get_Authorization()

    @staticmethod
    def _extract_code(location):
        """Return the ``code`` query parameter of the redirect URL *location*.

        :raises ValueError: if *location* is empty/``None`` or carries no
            non-empty ``code`` parameter (for example after a failed login).
        """
        if not location:
            raise ValueError('authorization failed: no redirect location')
        # Keep only the query string, dropping any fragment.
        query = location.partition('?')[2].partition('#')[0]
        for pair in query.split('&'):
            name, _, value = pair.partition('=')
            if name == 'code' and value:
                return value
        raise ValueError('authorization failed: no code in %r' % (location,))

    def get_code(self):
        """Submit the login form and return the OAuth2 authorization code.

        Posts the credentials to ``/oauth2/authorize`` on ``api.weibo.com``
        and reads the code from the ``location`` response header.

        :raises ValueError: if the response carries no usable redirect.
        """
        conn = httplib.HTTPSConnection('api.weibo.com')
        try:
            postdata = urllib.urlencode({
                'client_id': self.APP_KEY,
                'response_type': 'code',
                'redirect_uri': self.CALLBACK_URL,
                'action': 'submit',
                'userId': self.ACCOUNT,
                'passwd': self.PASSWORD,
                'isLoginSina': 0,
                'from': '',
                'regCallback': '',
                'state': '',
                'ticket': '',
                'withOfficalFlag': 0,
            })
            conn.request('POST', '/oauth2/authorize', postdata,
                         {'Referer': self.url,
                          'Content-Type': 'application/x-www-form-urlencoded'})
            res = conn.getresponse()
            return self._extract_code(res.getheader('location'))
        finally:
            # Always release the socket, also when the request or parsing fails.
            conn.close()

    def get_Authorization(self):
        """Exchange the authorization code for a token and install it on the client."""
        code = self.get_code()
        r = self.client.request_access_token(code)
        self.client.set_access_token(r.access_token, r.expires_in)

    # Post a Weibo status.
    def post_weibo(self, message):
        """Publish *message*, which is decoded from GBK before being sent."""
        self.client.post.statuses__update(status=message.decode('gbk'))

    # Get the id of the current user.
    def getCurrentUid(self):
        """Return the uid of the authorized account, or ``None`` on failure.

        On failure ``'get userid failed'`` is printed.
        """
        try:
            return self.client.account.get_uid.get()['uid']
        except Exception:
            print('get userid failed')
            return

    # Get the list of accounts the user follows.
    def getFocus(self, userid):
        """Return ``(screen_name, gender)`` tuples for accounts *userid* follows.

        The request is made with ``count=200``.  If any entry cannot be read,
        ``'get focus failed'`` is printed and ``None`` is returned instead of
        a partial list.
        """
        focuses = self.client.get.friendships__friends(uid=userid, count=200)
        Resfocus = []
        for focus in focuses["users"]:
            try:
                Resfocus.append((focus["screen_name"], focus["gender"]))
            except Exception:
                print('get focus failed')
                return
        return Resfocus

    # Get the user's tags.
    def getTags(self, userid):
        """Return the tag values of *userid*, heaviest tags first.

        Tag entries are sorted by their ``weight`` attribute in descending
        order; every value stored under a key other than ``'weight'`` is
        collected.  Returns ``None`` (after printing ``'get tags failed'``)
        when the request fails.
        """
        try:
            tags = self.client.tags.get(uid=userid)
        except Exception:
            print('get tags failed')
            return
        userTags = []
        sortedT = sorted(tags, key=operator.attrgetter('weight'), reverse=True)
        for tag in sortedT:
            for item in tag:
                if item != 'weight':
                    userTags.append(tag[item])
        return userTags

    # Get the statuses published by the user.
    def getWeibo(self, uesrid, infile):
        """Append the text of the user's timeline statuses to *infile*.

        The timeline is requested with ``count=100``.  One status is written
        per line, encoded as UTF-8.  A status whose text cannot be written is
        skipped after printing ``'get text failed'``; failing to open *infile*
        raises the underlying ``IOError``.

        :param uesrid: uid whose timeline is fetched.
        :param infile: path of the file the texts are appended to.
        """
        import io  # local import: only this method needs an encoding-aware file

        contents = self.client.get.statuses__user_timeline(uid=uesrid, count=100)
        # Open the file once with an explicit encoding.  Writing unicode text
        # to a plain byte-mode file relies on the default ASCII codec and
        # fails for Chinese text.
        with io.open(infile, 'a', encoding='utf-8') as out:
            for content in contents.statuses:
                try:
                    text = content.text
                    if isinstance(text, bytes):
                        text = text.decode('utf-8')
                    out.write(text + u'\n')
                except Exception:
                    print('get text failed')

    def autolabel(self, rects):
        """Write each bar's height as a float label slightly above the bar."""
        for rect in rects:
            height = rect.get_height()
            plt.text(rect.get_x() + rect.get_width() / 2., 1.03 * height,
                     '%s' % float(height))

    # Plot the gender split of the accounts the user follows.
    def getSexplot(self, userid, m, f, n):
        """Show a bar chart of male / female / unknown counts.

        :param userid: uid whose screen name is shown in the legend.
        :param m: number of followed accounts counted as male.
        :param f: number of followed accounts counted as female.
        :param n: number of followed accounts with unknown gender.
        """
        res = self.client.get.users__show(uid=userid)
        ind = np.arange(1, 4)
        width = 0.25
        plt.subplot(111)
        # Pass the x positions positionally: the ``left`` keyword was renamed
        # to ``x`` in newer matplotlib releases, positional works on both.
        rects1 = plt.bar(ind, (m, f, n), width=width, align='center')
        plt.ylabel('The Focus Number')
        plt.title('Sex Analysis(effective samples:%d)' % (m + f + n))
        plt.xticks(ind, ("Male", "Female", "Unknown"))
        self.autolabel(rects1)
        plt.legend((rects1,), ("User:%s" % res["screen_name"],))
        plt.show()

if __name__ == '__main__':
    import getpass  # local import: only the interactive entry point needs it

    usrID = raw_input('请输入新浪微博用户名：')
    # Read the password without echoing it to the console.
    usrPW = getpass.getpass('请输入新浪微博密码:')
    AppClient = iWInsightor(usrID, usrPW)
    userid = AppClient.getCurrentUid()
    if userid is None:
        # getCurrentUid() has already printed the failure; going on would
        # query the timeline with uid=None.
        raise SystemExit(1)
    # Path and file name the statuses are appended to.
    # NOTE(review): the filtering script later in this article reads
    # 'F://NLP/iWInsightor/weibo.dat' - make the two paths agree before
    # running the whole pipeline.
    infile = "E://data/weibo.dat"
    AppClient.getWeibo(userid, infile)

    # Optional demo, disabled in the article: count the genders of the
    # followed accounts and plot them.
    #Focus = AppClient.getFocus(userid)
    #m = 0
    #f = 0
    #n = 0
    #for i in Focus:
    #    if i[1] == "m":
    #        m = m+1
    #    elif i[1] == "f":
    #        f = f+1
    #    else:
    #        n = n+1
    #AppClient.getSexplot(userid,m,f,n)

二、文本过滤及分词

微博中常常含有一些对词频统计没有任何作用的字符，例如英文字母、数字、汉语标点符号以及其他个性符号，我们需要在分词前将其滤除。此外，你还可以添加自己想滤除的符号或者字词。

中文与英文句子比较而言，有一个非常有趣的现象，那就是英文单词之间是有空格的，而中文则不然。因此，分词也成了中文信息处理中的一个基本步骤。我用的是结巴分词，可以添加自定义词典（因为分词字典很多词可能没涉及到），下载地址为https://github.com/fxsjy/jieba。

# -*- coding: UTF-8-*-

import string

import jieba

# Path of the user-defined dictionary handed to jieba.
extra_dict = 'F://NLP/iWInsightor/jieba/mydict.dict'

# Load the custom dictionary so that words missing from jieba's built-in
# dictionary can still be segmented as whole words.
jieba.load_userdict(extra_dict)

# Characters removed before segmentation: ASCII punctuation, the space, ASCII
# digits and letters, plus a set of common Chinese punctuation marks.  The
# Chinese part is a unicode literal so that membership tests against decoded
# characters never fall back to an implicit ASCII decode.
_FILTERED_CHARS = frozenset(
    string.punctuation + ' ' + string.digits + string.ascii_letters
    + u'，。《》【】（）！？★”“、：…'
)


def filter_str(instr):
    """Return *instr* with every character in ``_FILTERED_CHARS`` removed.

    :param instr: UTF-8 encoded byte string, or already-decoded text.
    :returns: the remaining characters as unicode text.  Characters that are
        not in the filter set, including the line break, are kept.
    :raises UnicodeDecodeError: if *instr* is a byte string that is not
        valid UTF-8.
    """
    if isinstance(instr, bytes):
        instr = instr.decode('utf-8')
    return u''.join(char for char in instr if char not in _FILTERED_CHARS)

import io  # encoding-aware file objects for the segmented output

# Filter every line of the raw text, segment it with jieba in full mode and
# append the space-separated tokens to the output file.
# - the input is only read, so it is opened 'rb' (filter_str decodes it);
# - the output is opened with an explicit UTF-8 encoding because the joined
#   tokens are unicode text and cannot be written through the ASCII default;
# - ``with`` closes both files even if segmentation raises.
with open('F://NLP/iWInsightor/weibo.dat', 'rb') as fp_in, \
        io.open('F://NLP/iWInsightor/weibo_filter.dat', 'a',
                encoding='utf-8') as fp_out:
    for line in fp_in:
        str_delete = filter_str(line)
        seg_list = jieba.cut(str_delete, cut_all=True)
        str_join = u' '.join(seg_list)
        fp_out.write(str_join)

三、词频统计

词频统计就是指统计出某个文本中各个词出现的次数，使用 Python 中的字典（dict）数据结构很容易实现。我用的是 matplotlib 画柱状图，画出出现次数超过阈值的高频词。这里需要注意的是图中的中文显示问题，在使用之前，需要修改相应的设置，具体方法不妨去 google 一下，我就不详细介绍了。

# -*- coding: UTF-8-*-

import string

import numpy

import pylab

def getstr(word, count):
    """Return *word* and *count* joined by a comma, e.g. ``'foo,3'``.

    :param word: the word, as a string.
    :param count: its number of occurrences; converted with ``str()``.
    """
    countstr = word + ',' + str(count)
    return countstr

def get_wordlist(infile):
    """Return the whitespace-separated words of *infile* longer than one unit.

    Words are split on any whitespace, so the line break at the end of a line
    is not glued to its last word.  Words whose ``len()`` is 1 or less are
    dropped; the length is measured on the strings as read from the file.

    :param infile: path of the segmented text file.
    :returns: list of words in file order, duplicates included.
    """
    wordlist = []
    # ``with`` closes the file even if reading fails part-way.
    with open(infile) as handle:
        for line in handle:
            for word in line.split():
                if len(word) > 1:
                    wordlist.append(word)
    return wordlist

def get_wordcount(wordlist, outfile):
    """Count the words in *wordlist* and write the ranking to *outfile*.

    One ``word,count`` line is written per distinct word, most frequent
    first, encoded as GBK.  Characters GBK cannot represent are written as
    ``?`` instead of aborting the whole report.

    :param wordlist: iterable of words (UTF-8 byte strings or text).
    :param outfile: path of the report file; it is overwritten.
    :returns: dict mapping each word, as given, to its count.
    """
    import io  # local import: only the report writer needs an encoding-aware file

    wordcnt = {}
    for word in wordlist:
        wordcnt[word] = wordcnt.get(word, 0) + 1

    # sorted() works on any items() view; a stable descending sort keeps
    # equally frequent words in the dictionary's own order.
    ranked = sorted(wordcnt.items(), key=lambda item: item[1], reverse=True)

    with io.open(outfile, 'w', encoding='gbk', errors='replace') as out:
        for word, cnt in ranked:
            # Words read from the segmented file are UTF-8 bytes; decode them
            # so they can be re-encoded as GBK by the file object.
            text = word.decode('utf-8') if isinstance(word, bytes) else word
            out.write(getstr(text, cnt) + u'\n')
    return wordcnt

def barGraph(wcDict, min_count=5, min_key_len=3):
    """Show a bar chart of the frequent words in *wcDict*.

    A word is plotted when its count is greater than *min_count* and the
    ``len()`` of its key is greater than *min_key_len*.  Bars are ordered by
    word, not by count.

    :param wcDict: dict mapping word to count.
    :param min_count: counts must exceed this value (default 5).
    :param min_key_len: key length must exceed this value (default 3).
        NOTE(review): the length is measured on the key as stored; for UTF-8
        byte-string keys a Chinese character presumably takes 3 bytes, so the
        default keeps words of at least two characters - confirm against the
        data actually passed in.
    """
    wordlist = []
    for key, val in wcDict.items():
        if val > min_count and len(key) > min_key_len:
            # Decode byte-string keys for display; text keys are used as is.
            label = key.decode('utf-8') if isinstance(key, bytes) else key
            wordlist.append((label, val))
    wordlist.sort()
    keylist = [key for key, _ in wordlist]
    vallist = [val for _, val in wordlist]
    barwidth = 0.5
    xVal = numpy.arange(len(keylist))
    # Ticks sit at left edge + half a bar width, so the bars are explicitly
    # edge-aligned; newer matplotlib centres bars by default, which would
    # shift every label off its bar.
    pylab.xticks(xVal + barwidth / 2.0, keylist, rotation=45)
    pylab.bar(xVal, vallist, width=barwidth, color='y', align='edge')
    pylab.title(u'微博词频分析图')
    pylab.show()

if __name__ == '__main__':
    # Segmented text produced by the filtering script.
    myfile = 'F://NLP/iWInsightor/weibo_filter.dat'
    # Report file receiving the "word,count" ranking.
    outfile = 'F://NLP/iWInsightor/result.dat'
    wordlist = get_wordlist(myfile)
    wordcnt = get_wordcount(wordlist, outfile)
    barGraph(wordcnt)

至此，我们的工作就完成了。下面是我的微博词频的一个柱状图。这些仅是业余时间之作，尚有诸多不足之处。

【Python】统计个人新浪微博词频并给出相应的柱状图的更多相关文章

使用Python 统计nginx日志前十ip访问量并以柱状图显示
脚本内容: import matplotlib.pyplot as plt # nginx_file = '10.6.11.91_access.log-2018-12-27' ip = {} #筛选n ...
python统计文档中词频
python统计文档中词频的小程序 python版本2.7 效果如下: 程序如下,测试文件与完整程序在我的github中 #统计空格数与单词数本函数只返回了空格数需要的可以自己返回多个值 def ...
Python词云（词频统计，掩膜显示）
Python2.7 anaconda.安装Wordcloud,网上有许多下载路径,说一下掩模,就是在这个膜的区域才会有东西,当然这个与实际的掩模还有一定区别,这个词频显示是把所有统计的词,显示在这个掩 ...
如何用Python统计《论语》中每个字的出现次数？10行代码搞定--用计算机学国学
编者按: 上学时听过山师王志民先生一场讲座,说每个人不论干什么,都应该学习国学(原谅我学了计算机专业)!王先生讲得很是吸引我这个工科男,可能比我的后来的那些同学听课还要认真些,当然一方面是兴趣.一方面 ...
Hadoop的改进实验（中文分词词频统计及英文词频统计）（4/4）
声明: 1)本文由我bitpeach原创撰写,转载时请注明出处,侵权必究. 2)本小实验工作环境为Windows系统下的百度云(联网),和Ubuntu系统的hadoop1-2-1(自己提前配好).如不 ...
python统计字符串里每个字符的次数
方法一: 推导式 dd="ewq4aewtaSDDSFDTFDSWQrtewtyufashas" print {i:dd.count(i) for i in dd} 方法二: co ...
Python模拟登陆新浪微博
上篇介绍了新浪微博的登陆过程,这节使用Python编写一个模拟登陆的程序.讲解与程序如下: 1.主函数(WeiboMain.py): import urllib2 import cookielib i ...
python统计元素重复次数
python统计元素重复次数 # !/usr/bin/python3.4 # -*- coding: utf-8 -*- from collections import Counter arr = [ ...
Pig + Ansj 统计中文文本词频
最近特别喜欢用Pig,拥有能满足大部分需求的内置函数(built-in functions),支持自定义函数(user defined functions, UDF),能load 纯文本.avro等格 ...

随机推荐

Java实验报告一：Java开发环境的熟悉
实验要求: 1. 使用JDK编译.运行简单的Java程序 2.使用Eclipse 编辑.编译.运行.调试Java程序实验内容 (一) 命令行下Java程序开发 (二)Eclipse下Java程序 ...
第三个Sprint ------第七天
APP.java代码 package com.app.senior_calculator; import java.io.Serializable; import java.util.ArrayLis ...
使用a标签实现文件的下载与保存
<a>标签的常规使用是定义超链接,用于从一个页面链接到另一个页面,并且需要指定链接目标href,除了定义超链接外,<a>还可以实现文件的保存,直接设置a标签的href属性,但是 ...
[福大软工] Z班——Alpha现场答辩情况汇总
Alpha现场答辩小组互评(文字版) 各组对于麻瓜制造者的评价与建议队伍名评价与建议 *** 界面较友好,安全性不足,功能基本完整.希望能留下卖家的联系方式而不是在APP上直接联系,APP上 ...
ajax跨域请求数据
最近开始接触ajax的跨域请求问题,相比网上说的一大堆,我这里就说得比较浅显了. 关于为什么要跨域这个问题,实际的需求是当网站项目部署在一个域名上的时候,分域可以很好地解决网站卡顿问题(拥有多台服务器 ...
windows的cmd下的find命令比bash（win10下的Ubuntu的bash）下的grep比较
同样的一个catalina文件,windows的cmd下的find命令比bash下的grep要慢,windows确实占下风啊
[转帖]TMD为你揭秘中国互联网下半场所有秘密
https://www.iyiou.com/p/35099.html 李安说,<比利.林恩的中场战事>是“一个成长的故事”.中国互联网也行至中场,下半场如何走,成长的方向在哪里,成当下关键 ...
[转载] Oracle在windows下面的自动备份以及删除今天的脚本..
@echo off echo ================================================ echo Windows环境下Oracle数据库的自动备份脚本 echo ...
K3CLOUD替代方案
路径 [生产制造]->[工程数据]->[替代方案]->[替代方案] 应用场景实际业务处理中,由于订单取消.工程变更.客户需求变化.预测或计划不准确等原因造成原材料库存积压.呆滞,使 ...
rhel6+apache2.4+mysql5.7+php5.6部署LAMP架构
rhel6+apache2.4+mysql5.7+php5.6部署LAMP架构 2017年10月01日 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~准备阶段~~~~~~~~~~~~~ ...

【Python】统计个人新浪微博词频并给出相应的柱状图

【Python】统计个人新浪微博词频并给出相应的柱状图的更多相关文章

随机推荐

热门专题