出于好奇,那些10w+的公众号都写了些什么,于是我写了几个脚本爬取了各行业Top的公众号文章,进行了关键词统计。

抓取数据、分析用到了3中语言:Node.js,Java,Python。废话不多说,直接上代码。

1(NODEJS)

puppeteer模拟登陆,抓取微信公众号链接:

/**
* load wechat article urls on newrank.cn
**/
const puppeteer = require('puppeteer');
//emulate iphone
const userAgent = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.87 Safari/537.36';
const workPath = './newrank_cn1111';
const fs = require("fs");
const userName = "公众号";
const ppwwdd = "caiyongji";
if (!fs.existsSync(workPath)) {
fs.mkdirSync(workPath)
}
const loginUrl = 'https://www.newrank.cn/public/login/login.html?back=https%3A//www.newrank.cn/'; const monthlyRankUrl = "https://www.newrank.cn/public/info/list.html?period=month&type=data"; const detailUrl = "https://www.newrank.cn/public/info/detail.html?account="; (async () => { const browser = await puppeteer.launch({headless: false});//set headless: true will hide chromium UI
const page = await browser.newPage();
await page.setUserAgent(userAgent);
await page.setViewport({width:1920, height:1000});
await page.setRequestInterception(true); //filter to block images
page.on('request', request => {
if (request.resourceType() === 'image')
request.abort();
else
request.continue();
});
await page.goto(loginUrl);
//login
await loginOperate();
//await page.close(); await processMonthlyRank('.wx-right-type-list-spe a[icon=ss]');
await processMonthlyRank('.wx-right-type-list-spe a[icon=mgs]');
await processMonthlyRank('.wx-right-type-list-spe a[icon=cf]');
await processMonthlyRank('.wx-right-type-list-spe a[icon=kj]');
await processMonthlyRank('.wx-right-type-list-spe a[icon=cy]');
await processMonthlyRank('.wx-right-type-list-spe a[icon=qc]');
await processMonthlyRank('.wx-right-type-list-spe a[icon=ls]');
await processMonthlyRank('.wx-right-type-list-spe a[icon=zc]');
await processMonthlyRank('.wx-right-type-list-spe a[icon=jy]');
await processMonthlyRank('.wx-right-type-list-spe a[icon=xs]');
await processMonthlyRank('.wx-right-type-list-spe a[icon=zw]');
await processMonthlyRank('.wx-right-type-list-spe a[icon=qy]'); await processMonthlyRank('.wx-right-type-list-spe a[icon=wh]');
await processMonthlyRank('.wx-right-type-list-spe a[icon=bk]');
await processMonthlyRank('.wx-right-type-list-spe a[icon=jk]');
await processMonthlyRank('.wx-right-type-list-spe a[icon=shs]');
await processMonthlyRank('.wx-right-type-list-spe a[icon=ms]');
await processMonthlyRank('.wx-right-type-list-spe a[icon=sj]');
await processMonthlyRank('.wx-right-type-list-spe a[icon=lx]');
await processMonthlyRank('.wx-right-type-list-spe a[icon=ym]');
await processMonthlyRank('.wx-right-type-list-spe a[icon=qg]');
await processMonthlyRank('.wx-right-type-list-spe a[icon=ty]');
await processMonthlyRank('.wx-right-type-list-spe a[icon=mt]');
await processMonthlyRank('.wx-right-type-list-spe a[icon=zs]'); await processMonthlyRank('#wx_month_all'); async function loginOperate(){
try{
await page.click('div[data-type=pwd]');
}catch(err){
console.log('login#1');
} try{
await page.type('#account_input',userName);
await page.type('#password_input',ppwwdd);
}catch(err){
console.log('login#2');
} try{
await page.click('#pwd_confirm');
}catch(err){
console.log('login#3');
} } async function processMonthlyRank(btn){
const tab = await browser.newPage();
await tab.setUserAgent(userAgent);
await tab.setViewport({width:1920, height:1000});
await tab.setRequestInterception(true); //filter to block images
tab.on('request', request => {
if (request.resourceType() === 'image')
request.abort();
else
request.continue();
});
await tab.goto(monthlyRankUrl);
try{
await tab.click(btn);
}catch(err){
console.log('processMonthlyRank#1');
}
let fileName = await tab.evaluate(function(param){
return document.querySelector(param).innerHTML;
},btn);
console.log('-------------------------'+fileName+'-------------------------');
await scrollWait(tab);
await waitSecond(tab); const sel = '.wx_main tr';
const texts = await tab.evaluate((sel) => {
let elements = Array.from(document.querySelectorAll(sel));
let txt = elements.map(element => {
return element.innerText
})
return txt;
}, sel);
console.log('total rows: '+texts.length);
let contents='记录条数'+(texts.length-1)+'\n\n';
texts.forEach(function(c,index){
if(index>0){
contents+=c+'\n\n';
}
}); const fs = require("fs");
fs.writeFileSync(workPath+'/'+fileName+'.txt',contents);
console.log(fileName + " has been extracted to local."); const idSel = '.wx_main tr a[href^="detail.html"]';
const ids = await tab.evaluate((idSel) => {
let elements = Array.from(document.querySelectorAll(idSel));
let txt = elements.map(element => {
return element.innerText
})
return txt;
}, idSel);
let idContents='';
let w_name;
let flag =true;
/*ids.forEach(async function(id,index){
if(index%2!=0){
idContents+=id+'\n';
await getDetail(fileName,w_name,id);
w_name =null;
}else{
w_name=id;
}
});*/
await (async ()=>{
for(let i=0;i<ids.length;i++){
if(i%2!=0){
idContents+=ids[i]+'\n';
await getDetail(fileName,w_name,ids[i]);
w_name =null;
}else{
w_name=ids[i];
}
}
})();
let idFile = 'id_'+fileName;
fs.writeFileSync(workPath+'/'+idFile+'.txt',idContents);
console.log(idFile + " has been extracted to local.");
await tab.close();
} async function scrollWait(p, n){
if(n==null) n=5;
for(let i= 0; i<n;i++){
try{
await p.evaluate(()=>window.scrollTo(0, document.body.scrollHeight));
await p.waitForNavigation({timeout:500,waitUntil: ['networkidle0']});
}catch(err){
console.log('scroll to bottom and then wait 500 ms.');
}
}
} async function waitSecond(p){
try{
await p.waitForNavigation({timeout:2000,waitUntil: ['networkidle0']});
}catch(err){
//console.log('wait 1 sec.');
}
} async function getDetail(cat,name,id){
const tab = await browser.newPage();
await tab.setUserAgent(userAgent);
await tab.setViewport({width:1920, height:1000});
await tab.setRequestInterception(true); //filter to block images
tab.on('request', request => {
if (request.resourceType() === 'image')
request.abort();
else
request.continue();
});
await tab.goto(detailUrl+id);
await waitSecond(tab);
const sel = '#info_detail_article_top li .title a';
const hrefs = await tab.evaluate((sel) => {
let elements = Array.from(document.querySelectorAll(sel));
let links = elements.map(element => {
return element.href
})
return links;
}, sel);
let urlList='';
hrefs.forEach(function(href,index){
urlList+=href+"\n";
});
const fs = require("fs");
if (!fs.existsSync(workPath+'/'+cat)) {
fs.mkdirSync(workPath+'/'+cat)
}
fs.writeFileSync(workPath+'/'+cat+'/'+id+'_top_'+name+'.txt',urlList); const sel1 = '#info_detail_article_lastest li .title a';
const hrefs1 = await tab.evaluate((sel1) => {
let elements = Array.from(document.querySelectorAll(sel1));
let links = elements.map(element => {
return element.href
})
return links;
}, sel1);
let urlList1='';
hrefs1.forEach(function(href,index){
urlList1+=href+"\n";
});
fs.writeFileSync(workPath+'/'+cat+'/'+id+'_lastest_'+name+'.txt',urlList1);
console.log(id+' '+name+' has been extracted to local.');
await tab.close();
} })();

  

 

2(JAVA)

Jsoup抓取微信文章文本:Vps 安全设置 Win2003中IIS的安全设置技巧

package com;

import java.io.BufferedReader;
import java.io.BufferedWriter;
import java.io.File;
import java.io.FileReader;
import java.io.FileWriter;
import java.io.IOException;
import java.util.Arrays;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
import java.util.concurrent.ThreadLocalRandom; import org.jsoup.Jsoup;
import org.jsoup.nodes.Document; public class WeChatUrls extends Thread {
private File catFile;
final static Integer ThreadNum = 1;
final String ERROR = "ERROR";
private final static String USER_AGENT = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.87 Safari/537.36";
private final static String WORK_FOLDER = "T:\\Developer\\puppeteerTestCase\\newrank_cn_articles";
private final static String READ_URLS_FOLDER = "T:\\Developer\\puppeteerTestCase\\newrank_cn"; public WeChatUrls(File cat) {
this.catFile = cat;
} private String getUrlProxyContent(String url) {
String body = ERROR;
try {
Document doc = Jsoup.connect(url).userAgent(USER_AGENT).get();
if (doc.select("body") != null) {
body = doc.select("body").text();
}
} catch (IOException e) {
System.out.println("ERROR URL: " + url);
e.printStackTrace();
} return body;
} private void write(String content, String fileName) {
File f = new File(fileName);
FileWriter fw = null;
BufferedWriter bw = null;
try {
if (!f.exists()) {
f.getParentFile().mkdirs();
f.createNewFile();
}
// fw = new FileWriter(f.getAbsoluteFile(), true); // true表示可以追加新内容
fw = new FileWriter(f.getAbsoluteFile()); // 表示不追加
bw = new BufferedWriter(fw);
bw.write(content);
bw.close();
} catch (Exception e) {
e.printStackTrace();
}
} public static void main(String[] args) throws Exception {
File baseFolder = new File(READ_URLS_FOLDER);
File[] cataFiles = baseFolder.listFiles();
ExecutorService service = Executors.newFixedThreadPool(ThreadNum);
Arrays.asList(cataFiles).stream().forEach(catFile -> {
if (catFile.isFile() && catFile.getName().startsWith("id")) {
service.execute(new WeChatUrls(catFile));
}
});
service.shutdown();
} private void process() {
// Set<String> redoSet = new HashSet<>();
String catagory = catFile.getName().split("\\.")[0].split("_")[1];
File urlFolder = new File(READ_URLS_FOLDER + "\\" + catagory);
File[] urlFiles = urlFolder.listFiles();
if (urlFiles != null) {
Arrays.asList(urlFiles).stream().forEach(urlFile -> {
try {
BufferedReader reader = new BufferedReader(new FileReader(catFile));
String wechatId = null;
int countLatest = 1;
int countTop = 1;
while ((wechatId = reader.readLine()) != null) {
if (urlFile.getName().startsWith(wechatId)) {
String wechatName = urlFile.getName().split("\\.")[0].split("_")[2];
// if (urlFile.length() == 0) {
// redoSet.add("\"" + catagory + "\",\"" + wechatName + "\",\"" + wechatId + "\"");
// }
BufferedReader r = new BufferedReader(new FileReader(urlFile));
String wechatUrl = null;
while ((wechatUrl = r.readLine()) != null) {
String writePath = WORK_FOLDER + "\\" + catagory + "\\"
+ (urlFile.getName().contains("top") ? "top" : "latest") + "\\" + wechatId
+ "_" + wechatName + "_"
+ (urlFile.getName().contains("top") ? countTop++ : countLatest++)+".txt";
String content = getUrlProxyContent(wechatUrl);
write(content, writePath);
System.out.println(writePath);
Thread.sleep(ThreadLocalRandom.current().nextInt(500, 3000));
}
r.close();
}
}
reader.close();
} catch (Exception e) {
e.printStackTrace();
}
});
}
// redoSet.stream().forEach(System.out::println); } @Override
public void run() {
process();
}
}

  

 

3(PYTHON)

wordcloud生成词云:

# -*- coding: utf-8 -*-
import json
import random
import time
import os
from pyecharts import Bar,Geo,Line,Overlap
import jieba
from scipy.misc import imread
from wordcloud import WordCloud, ImageColorGenerator
import matplotlib.pyplot as plt
from collections import Counter
os.chdir('T:/Developer/puppeteerTestCase/newrank_cn_articles') stopWords = ['微信','二维码','二维','扫一','一扫','公众','赞赏','转账','关注','打开','阅读','图片','关闭','取消','程序'] def proc(folder, type):
fileLines = []
rootdir = './'+folder+'/'+type
list = os.listdir(rootdir)
for i in range(0,len(list)):
path = os.path.join(rootdir,list[i])
if os.path.isfile(path):
try:
fo = open(path, 'r+')
fileLines += fo.readlines()
except:
print('error while processing file: ' + path) _str = ' '.join(fileLines)
words_list = []
word_generator = jieba.cut_for_search(_str)
for word in word_generator:
words_list.append(word)
words_list = [k for k in words_list if len(k)>1 and k not in stopWords]
back_color = imread('back.jpg')
wc = WordCloud(background_color='white',
max_words=2000,
mask=back_color,
max_font_size=300,
font_path="C:/Windows/Fonts/msyh.ttc",
random_state=42
)
_count = Counter(words_list)
wc.generate_from_frequencies(_count)
image_colors = ImageColorGenerator(back_color)
wc.recolor(color_func=image_colors)
#plt.figure()
#plt.imshow(wc.recolor(color_func=image_colors))
#plt.axis('off') # The pil way (if you don't have matplotlib)
image = wc.to_image()
image.show()
jpgFile = './'+type+'_'+folder+'.jpg'
image.save(jpgFile)
print('image File saved:' + jpgFile) basedir = './'
baselist = os.listdir(basedir)
for l in range(0,len(baselist)):
p = os.path.join(basedir,baselist[l])
if os.path.isdir(p):
proc(os.path.basename(p), 'top')

  

 

4

词云结果涉及23个维度,得出结果如下:

TOP500公众号文章

创业

健康

教育

乐活

企业

情感

体育娱乐

文化

文摘

幽默

政务

旅行

时事

时尚

民生

汽车

百科

科技

美体

美食

职场

财富

文章转自:https://segmentfault.com/r/1250000015997077?shareId=1210000015997081

那些10w+的公众号都在写什么?的更多相关文章

  1. 一个人的公众号,我写了1w+

    大家好,我是Bypass,一个人一直保持着写博客的习惯,为此维护了一个技术公众号,致力于分享原创高质量干货,写的内容主要围绕:渗透测试.WAF绕过.代码审计.应急响应.企业安全. 一直以来,我把它当成 ...

  2. 手机QQ公众号亿级消息实时群发架构

    编者按:高可用架构分享及传播在架构领域具有典型意义的文章,本文由孙子荀分享.转载请注明来自高可用架构公众号 ArchNotes.   孙子荀,2009 年在华为从事内核和分布式系统的开发工作:2011 ...

  3. 微信公众号发送消息模板(java)

    这段时间接触公众号开发,写下向用户发送消息模板的接口调用 先上接口代码 public static JSONObject sendModelMessage(ServletContext context ...

  4. PHP开发微信公众号(二)消息接受与推送

    上一篇文章我们知道怎么获取二维码,这样别人就可以扫描二维码来关注我们,但是别人关注后,发送消息,我们怎么进行相关处理? 这里我们就来学习下怎么处理处理这些消息,以及推送消息. 学习之前首先你需要有一个 ...

  5. spring-boot-route(二十三)开发微信公众号

    在讲微信公众号开发之前,先来大概了解一下微信公众号.微信公众号大体上可以分为服务号和订阅号,订阅号和服务号的区别如下: 服务号可以申请微信支付功能. 服务号只能由企业申请,订阅号可以有企业或个人申请. ...

  6. php开发微信公众号获取信息LBS

    1.一般的公众号都可以在微信公众平台里面设置自定义菜单和自动回复消息,如果需要获取用户位置,则必须开启 服务器配置,当次功能开启后,微信公众平台的自定义菜单和自动回复则失效. 需要通过接口开发来实现微 ...

  7. 微信公众号开发C#系列-9、多公众号集中管理

    1.概述 通过前面8篇关于微信开发相关文章的学习,我们已经对微信常用开发有了一个比较深入的了解.前面的文章都是基于某一特定公众号的,在现实业务中同一单位个体运营着不至一个公众号,此时就需要对多个公众号 ...

  8. 微信公众号UX分析—— 学生作业小结

    1. 不足: 1. 权威性:个人帐号,显得不够正式. 2. 排版问题: + 没有必要的外接端口,界面设计极度缺少排版.哪怕是个人公众号都不至于如此,更何况这是一个学校的教务平台. 3. 反应不及时或无 ...

  9. 如何玩转小程序+公众号?手把手教你JeeWx小程序CMS与公众号关联

    随着微信小程序新功能.新入口的不断更新,小程序的商业价值逐步增强,特别是小程序与公众号的深度融合,已经让小程序成为各行业新的营销渠道.Jeewx平台专注小程序的开发,逐步完善小程序生态圈,通过简单操作 ...

随机推荐

  1. JAVA 编程思想一

    1: 动态绑定和静态绑定 使用private或static或final修饰的变量或者方法,使用静态绑定.而虚方法(可以被子类重写的方法)则会根据运行时的对象进行动态绑定: 静态绑定使用类信息来完成,而 ...

  2. USACO4.3 Street Race【分析】

    这道题,感觉不是很难,分析清楚之后非常简单.(标签都不知道怎么加) 读完题首先想到了分割点一定是必经点的一种特殊情况,如果分割点不是必经点的话,那么它就不能把这个图分成两半(存在不经过它的边沟通两半) ...

  3. 深入理解java:2.3.3. 并发编程concurrent包 之容器ConcurrentHashMap

    线程不安全的HashMap 因为多线程环境下,使用Hashmap进行put操作会引起死循环,导致CPU利用率接近100%,所以在并发情况下不能使用HashMap. 效率低下的HashTable容器 H ...

  4. Java学习开发第二阶段总结

    第二阶段的学习总结: 在这次学习中虽说任务量是比上次提升了不少.但大部分的内容都于C语言相同或者类似.学习起来相对来说很轻松.但也在这次学习中学到新的知识 ①Jshell 在cmd中运行Jshell脚 ...

  5. A9-USART2_RX_BUF 串口2收发异常

    a9_send_cmd(); //退出透传模式,和前一次发送时间超过 2 秒,输入+++,就可以退出透传模式 delay_ms(); delay_ms(); delay_ms(); a9_quit_t ...

  6. python-day13(正式学习)

    闭包函数 闭包 闭包:闭是封闭(函数内部函数),包是包含(该内部函数对外部作用域而非全局作用域的变量的引用).闭包指的是:函数内部函数对外部作用域而非全局作用域的引用. 额...这里提示一下闭包!=自 ...

  7. java 工具类使用

    BigDecimalUtil 金额计算工具类 import java.math.BigDecimal; public class BigDecimalUtil { private BigDecimal ...

  8. RocketMQ吐血总结

    RocketMQ吐血总结 架构 概念模型 最基本的概念模型与扩展后段概念模型 存储模型 RocketMQ吐血总结 User Guide RocketMQ是一款分布式消息中间件,最初是由阿里巴巴消息中间 ...

  9. 1.基础CRUD

    在ef中,CUD都使用Datacontext.SaveChange()进行保存. SavaChange方法在保存之前会自动调用DetectChanges方法检查DataContext中做了什么更改,以 ...

  10. [转载]Jupyter Notebook中自动补全代码

    原文地址:https://yq.aliyun.com/articles/667928 在公众号之前的文章中,已经介绍了在Jupyter Notebook中设置主题以及输出代码文件到pdf文件中,本文来 ...