出于好奇,我想知道那些10w+的公众号都写了些什么,于是写了几个脚本,爬取了各行业Top公众号的文章,并进行了关键词统计。

抓取数据、分析用到了3种语言:Node.js,Java,Python。废话不多说,直接上代码。

1(NODEJS)

puppeteer模拟登录,抓取微信公众号链接:

/**
 * Load WeChat article URLs from newrank.cn.
 *
 * Logs in, walks every category tab of the monthly ranking and, for each
 * account listed there, stores the links of its "top" and "latest" articles.
 * Files written under `workPath`:
 *   <category>.txt                      raw text of the ranking rows
 *   id_<category>.txt                   one account id per line
 *   <category>/<id>_top_<name>.txt      article URLs, one per line
 *   <category>/<id>_lastest_<name>.txt  article URLs, one per line
 **/
const puppeteer = require('puppeteer');
const fs = require("fs");

// Desktop Chrome user agent sent by every tab.
const userAgent = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.87 Safari/537.36';
const workPath = './newrank_cn1111';
// Login credentials. The environment variables, when set, take precedence so
// that real credentials do not have to be committed with the script.
const userName = process.env.NEWRANK_USER || "公众号";
const ppwwdd = process.env.NEWRANK_PASSWORD || "caiyongji";

if (!fs.existsSync(workPath)) {
  fs.mkdirSync(workPath);
}

const loginUrl = 'https://www.newrank.cn/public/login/login.html?back=https%3A//www.newrank.cn/';
const monthlyRankUrl = "https://www.newrank.cn/public/info/list.html?period=month&type=data";
const detailUrl = "https://www.newrank.cn/public/info/detail.html?account=";

// `icon` attribute of every category link on the monthly ranking page.
const categoryIcons = [
  'ss', 'mgs', 'cf', 'kj', 'cy', 'qc', 'ls', 'zc', 'jy', 'xs', 'zw', 'qy',
  'wh', 'bk', 'jk', 'shs', 'ms', 'sj', 'lx', 'ym', 'qg', 'ty', 'mt', 'zs',
];
// Selectors clicked one after another; the overall ranking tab comes last.
const rankSelectors = categoryIcons
  .map((icon) => `.wx-right-type-list-spe a[icon=${icon}]`)
  .concat('#wx_month_all');

/**
 * Resolve after `ms` milliseconds.
 * @param {number} ms
 * @returns {Promise<void>}
 */
function delay(ms) {
  return new Promise((resolve) => setTimeout(resolve, ms));
}

/**
 * Replace characters that are not allowed in file names (path separators,
 * Windows-reserved characters, line breaks) so page text can be used in a path.
 * '-' is used because '_' is the field separator of the generated file names.
 * @param {string} text
 * @returns {string}
 */
function sanitizeFileName(text) {
  return String(text).replace(/[\\/:*?"<>|\r\n]+/g, '-').trim();
}

/**
 * Read one DOM property from every element matching `selector`.
 * @param {object} tab - puppeteer page
 * @param {string} selector - CSS selector evaluated inside the page
 * @param {string} prop - property name, e.g. 'innerText' or 'href'
 * @returns {Promise<string[]>}
 */
function collect(tab, selector, prop) {
  return tab.evaluate(
    (sel, key) => Array.from(document.querySelectorAll(sel), (el) => el[key]),
    selector,
    prop
  );
}

/**
 * Scroll to the bottom of the page `n` times (default 5), pausing 500 ms after
 * each scroll so lazily loaded rows can arrive.
 * @param {object} p - puppeteer page
 * @param {number} [n]
 */
async function scrollWait(p, n) {
  const rounds = n == null ? 5 : n;
  for (let i = 0; i < rounds; i++) {
    try {
      await p.evaluate(() => window.scrollTo(0, document.body.scrollHeight));
    } catch (err) {
      console.log('scroll failed: ' + err.message);
    }
    console.log('scroll to bottom and then wait 500 ms.');
    await delay(500);
  }
}

/**
 * Pause for two seconds to let the page settle.
 * @param {object} p - puppeteer page (unused; kept for call-site compatibility)
 */
async function waitSecond(p) {
  await delay(2000);
}

(async () => {
  const browser = await puppeteer.launch({ headless: false }); // headless: true hides the Chromium UI
  try {
    const page = await openTab();
    await page.goto(loginUrl);
    await loginOperate(page);

    for (const selector of rankSelectors) {
      try {
        await processMonthlyRank(selector);
      } catch (err) {
        // One broken category must not stop the remaining ones.
        console.log('processMonthlyRank failed for ' + selector + ': ' + err.message);
      }
    }
  } finally {
    // Always release Chromium, otherwise the node process never exits.
    await browser.close();
  }

  /**
   * Open a new tab with the shared user agent and viewport; image requests
   * are aborted to save bandwidth.
   * @returns {Promise<object>} puppeteer page
   */
  async function openTab() {
    const tab = await browser.newPage();
    await tab.setUserAgent(userAgent);
    await tab.setViewport({ width: 1920, height: 1000 });
    await tab.setRequestInterception(true);
    tab.on('request', (request) => {
      if (request.resourceType() === 'image') {
        request.abort();
      } else {
        request.continue();
      }
    });
    return tab;
  }

  /**
   * Fill in and submit the password login form. Each step is best-effort:
   * a failure is logged and the next step is still attempted.
   * @param {object} page - tab already showing `loginUrl`
   */
  async function loginOperate(page) {
    try {
      await page.click('div[data-type=pwd]');
    } catch (err) {
      console.log('login#1', err.message);
    }
    try {
      await page.type('#account_input', userName);
      await page.type('#password_input', ppwwdd);
    } catch (err) {
      console.log('login#2', err.message);
    }
    try {
      await Promise.all([
        // Best effort: give a post-login redirect up to 5 s to finish before
        // ranking tabs are opened; a timeout here is not an error.
        page.waitForNavigation({ timeout: 5000, waitUntil: 'networkidle0' }).catch(() => {}),
        page.click('#pwd_confirm'),
      ]);
    } catch (err) {
      console.log('login#3', err.message);
    }
  }

  /**
   * Open the monthly ranking, switch to the tab matched by `btn`, then save
   * the ranking rows, the account ids, and the article links of every account.
   * @param {string} btn - CSS selector of the category link to click
   */
  async function processMonthlyRank(btn) {
    const tab = await openTab();
    try {
      await tab.goto(monthlyRankUrl);
      try {
        await tab.click(btn);
      } catch (err) {
        console.log('processMonthlyRank#1');
      }

      // The link text names the category and is used for file/folder names.
      const label = await tab.evaluate((sel) => {
        const el = document.querySelector(sel);
        return el ? el.textContent : null;
      }, btn);
      const fileName = label == null ? '' : sanitizeFileName(label);
      if (fileName === '') {
        console.log('processMonthlyRank: no category label found for ' + btn);
        return;
      }
      console.log('-------------------------' + fileName + '-------------------------');

      await scrollWait(tab);
      await waitSecond(tab);

      const texts = await collect(tab, '.wx_main tr', 'innerText');
      console.log('total rows: ' + texts.length);
      // The first row is the table header, hence the "- 1" and the slice.
      const contents = '记录条数' + (texts.length - 1) + '\n\n'
        + texts.slice(1).map((row) => row + '\n\n').join('');
      fs.writeFileSync(workPath + '/' + fileName + '.txt', contents);
      console.log(fileName + " has been extracted to local.");

      // Detail links alternate: even index = account name, odd index = account id.
      const ids = await collect(tab, '.wx_main tr a[href^="detail.html"]', 'innerText');
      let idContents = '';
      for (let i = 1; i < ids.length; i += 2) {
        const accountName = ids[i - 1];
        const accountId = ids[i];
        idContents += accountId + '\n';
        try {
          await getDetail(fileName, accountName, accountId);
        } catch (err) {
          // Keep going with the next account.
          console.log('getDetail failed for ' + accountId + ': ' + err.message);
        }
      }

      const idFile = 'id_' + fileName;
      fs.writeFileSync(workPath + '/' + idFile + '.txt', idContents);
      console.log(idFile + " has been extracted to local.");
    } finally {
      await tab.close();
    }
  }

  /**
   * Open the detail page of one account and store the links of its top and
   * latest articles in two text files inside the category folder.
   * @param {string} cat - category name (folder under `workPath`)
   * @param {string} name - account display name
   * @param {string} id - account id appended to `detailUrl`
   */
  async function getDetail(cat, name, id) {
    const tab = await openTab();
    try {
      await tab.goto(detailUrl + encodeURIComponent(id));
      await waitSecond(tab);

      const folder = workPath + '/' + cat;
      if (!fs.existsSync(folder)) {
        fs.mkdirSync(folder);
      }
      const safeId = sanitizeFileName(id);
      const safeName = sanitizeFileName(name);

      const topLinks = await collect(tab, '#info_detail_article_top li .title a', 'href');
      fs.writeFileSync(
        folder + '/' + safeId + '_top_' + safeName + '.txt',
        topLinks.map((href) => href + "\n").join('')
      );

      // "lastest" (sic) matches the element id used by the page and the file
      // naming that downstream tooling already relies on.
      const latestLinks = await collect(tab, '#info_detail_article_lastest li .title a', 'href');
      fs.writeFileSync(
        folder + '/' + safeId + '_lastest_' + safeName + '.txt',
        latestLinks.map((href) => href + "\n").join('')
      );

      console.log(id + ' ' + name + ' has been extracted to local.');
    } finally {
      await tab.close();
    }
  }
})().catch((err) => {
  console.error(err);
  process.exitCode = 1;
});

  

 

2(JAVA)

Jsoup抓取微信文章文本:

package com;

import java.io.BufferedReader;
import java.io.BufferedWriter;
import java.io.File;
import java.io.FileReader;
import java.io.FileWriter;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
import java.util.concurrent.ThreadLocalRandom;

import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;

/**
 * Downloads the body text of every WeChat article whose URL is listed under
 * {@code READ_URLS_FOLDER} and stores it under {@code WORK_FOLDER}.
 *
 * <p>Expected input layout:
 * <ul>
 * <li>{@code id_<category>.txt} - one account id per line;</li>
 * <li>{@code <category>/<id>_top_<name>.txt} and
 * {@code <category>/<id>_lastest_<name>.txt} - one article URL per line.</li>
 * </ul>
 * Output: {@code <category>/(top|latest)/<id>_<name>_<n>.txt}.
 *
 * <p>Files are read and written with the platform default charset, as before.
 */
public class WeChatUrls extends Thread {
    final static Integer ThreadNum = 1;
    /** Content written for an article that could not be downloaded. */
    final String ERROR = "ERROR";
    private final static String USER_AGENT = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.87 Safari/537.36";
    private final static String WORK_FOLDER = "T:\\Developer\\puppeteerTestCase\\newrank_cn_articles";
    private final static String READ_URLS_FOLDER = "T:\\Developer\\puppeteerTestCase\\newrank_cn";

    private static final String ID_FILE_PREFIX = "id_";
    private static final String TOP_MARK = "_top_";
    /** "lastest" (sic) is the spelling used in the url file names. */
    private static final String LATEST_MARK = "_lastest_";

    /** The {@code id_<category>.txt} file this worker handles. */
    private File catFile;

    public WeChatUrls(File cat) {
        this.catFile = cat;
    }

    /**
     * Fetches {@code url} and returns the text of its {@code <body>}.
     *
     * @return the body text, or {@link #ERROR} when the page cannot be fetched
     */
    private String getUrlProxyContent(String url) {
        String body = ERROR;
        try {
            Document doc = Jsoup.connect(url).userAgent(USER_AGENT).get();
            // select() never returns null; text() is "" when nothing matches.
            body = doc.select("body").text();
        } catch (IOException | IllegalArgumentException e) {
            // IllegalArgumentException: jsoup rejects malformed URLs.
            System.out.println("ERROR URL: " + url);
            e.printStackTrace();
        }
        return body;
    }

    /** Writes {@code content} to {@code fileName}, replacing any existing file. */
    private void write(String content, String fileName) {
        File f = new File(fileName);
        File parent = f.getAbsoluteFile().getParentFile();
        if (parent != null) {
            parent.mkdirs();
        }
        // try-with-resources closes the writer even when write() fails.
        try (BufferedWriter bw = new BufferedWriter(new FileWriter(f.getAbsoluteFile()))) {
            bw.write(content);
        } catch (IOException e) {
            e.printStackTrace();
        }
    }

    /** Returns the trimmed, non-blank lines of {@code file}. */
    private static List<String> readLines(File file) throws IOException {
        List<String> lines = new ArrayList<>();
        try (BufferedReader reader = new BufferedReader(new FileReader(file))) {
            String line;
            while ((line = reader.readLine()) != null) {
                String trimmed = line.trim();
                if (!trimmed.isEmpty()) {
                    lines.add(trimmed);
                }
            }
        }
        return lines;
    }

    /** Returns {@code name} without its last {@code .ext} suffix, if any. */
    private static String stripExtension(String name) {
        int dot = name.lastIndexOf('.');
        return dot > 0 ? name.substring(0, dot) : name;
    }

    public static void main(String[] args) throws Exception {
        File[] cataFiles = new File(READ_URLS_FOLDER).listFiles();
        if (cataFiles == null) {
            System.out.println("Cannot list folder: " + READ_URLS_FOLDER);
            return;
        }
        ExecutorService service = Executors.newFixedThreadPool(ThreadNum);
        for (File candidate : cataFiles) {
            if (candidate.isFile() && candidate.getName().startsWith(ID_FILE_PREFIX)) {
                service.execute(new WeChatUrls(candidate));
            }
        }
        service.shutdown();
    }

    /** Downloads every article listed in the url files of this worker's category. */
    private void process() {
        String idFileName = catFile.getName();
        if (!idFileName.startsWith(ID_FILE_PREFIX)) {
            return;
        }
        String catagory = stripExtension(idFileName).substring(ID_FILE_PREFIX.length());
        if (catagory.isEmpty()) {
            return;
        }
        File[] urlFiles = new File(READ_URLS_FOLDER, catagory).listFiles();
        if (urlFiles == null) {
            return;
        }

        List<String> wechatIds;
        try {
            // Read the id list once instead of once per url file.
            wechatIds = readLines(catFile);
        } catch (IOException e) {
            e.printStackTrace();
            return;
        }

        for (File urlFile : urlFiles) {
            if (!urlFile.isFile()) {
                continue;
            }
            try {
                processUrlFile(urlFile, catagory, wechatIds);
            } catch (InterruptedException e) {
                // Preserve the interrupt and stop this worker.
                Thread.currentThread().interrupt();
                return;
            } catch (IOException e) {
                e.printStackTrace();
            }
        }
    }

    /**
     * Downloads all URLs of one url file when it belongs to a known account id.
     * The id is matched together with the {@code _top_} / {@code _lastest_}
     * marker so that ids which are prefixes of other ids, or which contain
     * underscores, are attributed correctly.
     */
    private void processUrlFile(File urlFile, String catagory, List<String> wechatIds)
            throws IOException, InterruptedException {
        String stem = stripExtension(urlFile.getName());
        List<String> urls = null;
        int count = 1;
        for (String wechatId : wechatIds) {
            boolean top = stem.startsWith(wechatId + TOP_MARK);
            if (!top && !stem.startsWith(wechatId + LATEST_MARK)) {
                continue;
            }
            String mark = top ? TOP_MARK : LATEST_MARK;
            String wechatName = stem.substring(wechatId.length() + mark.length());
            if (urls == null) {
                urls = readLines(urlFile);
            }
            String folder = WORK_FOLDER + File.separator + catagory + File.separator
                    + (top ? "top" : "latest");
            for (String wechatUrl : urls) {
                String writePath = folder + File.separator + wechatId + "_" + wechatName + "_"
                        + (count++) + ".txt";
                write(getUrlProxyContent(wechatUrl), writePath);
                System.out.println(writePath);
                // Random pause between requests.
                Thread.sleep(ThreadLocalRandom.current().nextInt(500, 3000));
            }
        }
    }

    @Override
    public void run() {
        process();
    }
}

  

 

3(PYTHON)

wordcloud生成词云:

# -*- coding: utf-8 -*-
"""Build one word-cloud image per category folder from downloaded article text.

For every sub-folder of the working directory, the text files in
``<folder>/top`` are tokenised with jieba and rendered as a word cloud that is
masked and coloured by ``back.jpg``; the image is saved as
``top_<folder>.jpg`` in the working directory.
"""
import json
import random
import time
import os
from pyecharts import Bar,Geo,Line,Overlap
import jieba
import matplotlib.pyplot as plt
try:
    from scipy.misc import imread
except ImportError:
    # scipy.misc.imread is not available in current SciPy releases;
    # matplotlib's reader also returns the image as an array.
    imread = plt.imread
from wordcloud import WordCloud, ImageColorGenerator
from collections import Counter

os.chdir('T:/Developer/puppeteerTestCase/newrank_cn_articles')

# Tokens excluded from the word frequency count.
stopWords = ['微信','二维码','二维','扫一','一扫','公众','赞赏','转账','关注','打开','阅读','图片','关闭','取消','程序']


def proc(folder, type):
    """Render, show and save the word cloud for ``./<folder>/<type>``.

    :param folder: category folder name, relative to the working directory
    :param type: sub-folder holding the article text files (e.g. ``'top'``)
    """
    rootdir = './'+folder+'/'+type
    if not os.path.isdir(rootdir):
        print('skip, no such directory: ' + rootdir)
        return

    fileLines = []
    for entry in os.listdir(rootdir):
        path = os.path.join(rootdir, entry)
        if not os.path.isfile(path):
            continue
        try:
            # Read-only, platform default encoding; closed by the with-block.
            with open(path, 'r') as fo:
                fileLines += fo.readlines()
        except (OSError, UnicodeError):
            print('error while processing file: ' + path)

    _str = ' '.join(fileLines)
    # Keep tokens longer than one character that are not stop words.
    words_list = [k for k in jieba.cut_for_search(_str)
                  if len(k) > 1 and k not in stopWords]
    _count = Counter(words_list)
    if not _count:
        # WordCloud cannot be generated from an empty frequency table.
        print('skip, no words found in: ' + rootdir)
        return

    back_color = imread('back.jpg')
    wc = WordCloud(background_color='white',
                   max_words=2000,
                   mask=back_color,
                   max_font_size=300,
                   font_path="C:/Windows/Fonts/msyh.ttc",
                   random_state=42
                   )
    wc.generate_from_frequencies(_count)
    image_colors = ImageColorGenerator(back_color)
    wc.recolor(color_func=image_colors)

    # The PIL way (works without a matplotlib figure).
    image = wc.to_image()
    image.show()
    jpgFile = './'+type+'_'+folder+'.jpg'
    image.save(jpgFile)
    print('image File saved:' + jpgFile)


basedir = './'
baselist = os.listdir(basedir)
for name in baselist:
    p = os.path.join(basedir, name)
    if os.path.isdir(p):
        proc(os.path.basename(p), 'top')

  

 

4

词云结果涉及23个维度,得出结果如下:

TOP500公众号文章

创业

健康

教育

乐活

企业

情感

体育娱乐

文化

文摘

幽默

政务

旅行

时事

时尚

民生

汽车

百科

科技

美体

美食

职场

财富

文章转自:https://segmentfault.com/r/1250000015997077?shareId=1210000015997081

那些10w+的公众号都在写什么?的更多相关文章

  1. 一个人的公众号,我写了1w+

    大家好,我是Bypass,一个人一直保持着写博客的习惯,为此维护了一个技术公众号,致力于分享原创高质量干货,写的内容主要围绕:渗透测试.WAF绕过.代码审计.应急响应.企业安全. 一直以来,我把它当成 ...

  2. 手机QQ公众号亿级消息实时群发架构

    编者按:高可用架构分享及传播在架构领域具有典型意义的文章,本文由孙子荀分享.转载请注明来自高可用架构公众号 ArchNotes.   孙子荀,2009 年在华为从事内核和分布式系统的开发工作:2011 ...

  3. 微信公众号发送消息模板(java)

    这段时间接触公众号开发,写下向用户发送消息模板的接口调用 先上接口代码 public static JSONObject sendModelMessage(ServletContext context ...

  4. PHP开发微信公众号(二)消息接受与推送

    上一篇文章我们知道怎么获取二维码,这样别人就可以扫描二维码来关注我们,但是别人关注后,发送消息,我们怎么进行相关处理? 这里我们就来学习下怎么处理处理这些消息,以及推送消息. 学习之前首先你需要有一个 ...

  5. spring-boot-route(二十三)开发微信公众号

    在讲微信公众号开发之前,先来大概了解一下微信公众号.微信公众号大体上可以分为服务号和订阅号,订阅号和服务号的区别如下: 服务号可以申请微信支付功能. 服务号只能由企业申请,订阅号可以有企业或个人申请. ...

  6. php开发微信公众号获取信息LBS

    1.一般的公众号都可以在微信公众平台里面设置自定义菜单和自动回复消息,如果需要获取用户位置,则必须开启 服务器配置,当次功能开启后,微信公众平台的自定义菜单和自动回复则失效. 需要通过接口开发来实现微 ...

  7. 微信公众号开发C#系列-9、多公众号集中管理

    1.概述 通过前面8篇关于微信开发相关文章的学习,我们已经对微信常用开发有了一个比较深入的了解.前面的文章都是基于某一特定公众号的,在现实业务中同一单位个体运营着不至一个公众号,此时就需要对多个公众号 ...

  8. 微信公众号UX分析—— 学生作业小结

    1. 不足: 1. 权威性:个人帐号,显得不够正式. 2. 排版问题: + 没有必要的外接端口,界面设计极度缺少排版.哪怕是个人公众号都不至于如此,更何况这是一个学校的教务平台. 3. 反应不及时或无 ...

  9. 如何玩转小程序+公众号?手把手教你JeeWx小程序CMS与公众号关联

    随着微信小程序新功能.新入口的不断更新,小程序的商业价值逐步增强,特别是小程序与公众号的深度融合,已经让小程序成为各行业新的营销渠道.Jeewx平台专注小程序的开发,逐步完善小程序生态圈,通过简单操作 ...

随机推荐

  1. 在kali2.0中使用msf图形界面可能会遇到的问题

    kali版本:Linux kali 4.9.0-kali3-amd64 #1 SMP Debian 4.9.18-1kali1 (2017-04-04) x86_64 GNU/Linux 编写日期:2 ...

  2. springboot - 应用实践(3)springboot的核心

    1.springboot的启动类与核心注解@SpringBootApplication 2.springboot基本配置 3.springboot自动配置原理

  3. 小记---------FLUM负载均衡配置

    sink group允许组织多个sink到一个实体上,sink processors能够提供在组内所有sink之间实现负载均衡的能力,而且在失败的情况下能够进行故障转移从一个sink到另一个sink, ...

  4. C++中函数模板的概念和意义

    1,对泛型编程进行学习,泛型编程是实际工程开发中必用的技术,大型公司的通用 库都是采用泛型编程的技术完成的,C++ 中支持泛型编程技术,C++ 中的函数  模板和类模板就是 C++ 中泛型编程技术,本 ...

  5. Linux目录机构及目录管理

    Linux的目录结构与目录管理 Linux目录结构: 1 目录创建规则 FHS 文件系统层次化标准 指定Linux操作系统哪些目录必须具备 /boot /bin /sbin /etc /sys /pr ...

  6. 082、数据收集利器 cAdvisor (2019-04-30 周二)

    参考https://www.cnblogs.com/CloudMan6/p/7683190.html   cAdvisor 是google 开发的容器监控工具,下面我们开始安装和体验 cAdvisor ...

  7. lamp项目上线流程简述 (ubuntu16.04 )

    1  新建一个sudo用户,而不是直接用root操作 ①  新建用户可参考 https://www.cnblogs.com/bushuwei/p/10880182.html ②  赋予sudo权限: ...

  8. CSS hack(过滤器)

    CSS hack概念: 是针对不同浏览器对同一段代码解析不同的处理方案:<解决兼容性问题> 属性设置在不同版本的IE里会出现不兼容问题,css hack解决兼容主流浏览器和IE 常见的过滤 ...

  9. Smoke Testing

    [Smoke Testing 释义] Smoke Testing 的概念最早源于制造业,用于测试管道.测试时,用鼓风机往管道里灌烟,看管壁外面是否有烟冒出来,以便检验管道是否有缝隙.这一测试显然比较初 ...

  10. Linux RAID磁盘阵列

    RAID磁盘阵列 什么是RAID RAID是磁盘阵列的英文缩写,多块磁盘组成了一个组合,一起完成存储任务,就是磁盘阵列. RAID几种常用的类别(组合) RAID0:条带卷:最低磁盘个数2+,空间利用 ...