

import werobot
import pymongo
class Gongzhonghao():
    """Small wrapper around a ``werobot.WeRoBot`` instance.

    The wrapper stores the account credentials in the robot's config and
    exposes helpers that read the account's "news" media through
    ``self.robot.client``.
    """

    def __init__(self,token,APP_ID,ENCODING_AES_KEY,APP_SECRET):
        """Create the robot and copy the credentials into its config.

        :param token: token passed to ``werobot.WeRoBot``.
        :param APP_ID: value stored under ``config['APP_ID']``.
        :param ENCODING_AES_KEY: value stored under ``config['ENCODING_AES_KEY']``.
        :param APP_SECRET: value stored under ``config['APP_SECRET']``.
        """
        self.robot = werobot.WeRoBot(token=token)
        self.robot.config['HOST'] = ''
        self.robot.config['PORT'] = 80
        self.robot.config['APP_ID'] = APP_ID
        self.robot.config['ENCODING_AES_KEY'] = ENCODING_AES_KEY
        self.robot.config['APP_SECRET'] = APP_SECRET

    def _getNews_Count(self):
        """Return the ``news_count`` field of ``client.get_media_count()``.

        :return: Int
        """
        mediacount = self.robot.client.get_media_count()
        return mediacount['news_count']

    def getNews(self):
        """Collect all "news" media entries, 20 per request.

        :return: Json -- ``{'total_count': <news_count>, 'items': [...]}``
        """
        items = []
        news_count = self._getNews_Count()
        for offset in range(0, news_count, 20):
            page = self.robot.client.get_media_list('news', offset, 20)
            # Each later page is placed in FRONT of the pages fetched so far;
            # this ordering is kept exactly as the original loop produced it.
            items = page['item'] + items
        return {
            'total_count': news_count,
            'items': items,
        }

    def echo(self):
        """Placeholder that performs no work.

        :return: null
        """
        return None
if __name__ == '__main__':
    # NOTE(review): '1'..'4' and 'ip' look like placeholder credentials/host;
    # replace them with real values before running.
    g = Gongzhonghao('1', '2', '3','4')
    j = g.getNews()
    client = pymongo.MongoClient('ip', 27017)
    db = client.gongzhonghao
    xxx= db.xxx
    # Store the fetched snapshot. Without this the news list and the
    # collection handle were both unused, while the processing script reads
    # find_one()["items"] from this same database/collection.
    xxx.insert_one(j)


# -*- coding:utf-8 -*-
import os
import urllib.parse
from html.parser import HTMLParser
import requests
from bs4 import BeautifulSoup
from pymongo import MongoClient
class ContentHtmlParser(HTMLParser):
    """HTML parser that accumulates the text nodes of the fed document."""

    def __init__(self):
        # The base initializer sets up the parser's internal buffers; without
        # it feed() fails with AttributeError.
        super().__init__()
        self.text = ""

    def handle_data(self, data):
        """Append one chunk of character data to the collected text."""
        self.text += data

    def get_text(self):
        """Return all text collected so far, in document order."""
        return self.text
# Module-wide MongoDB handles used by get_articles().
# NOTE(review): "ip" looks like a placeholder host name -- confirm.
mongo_client = MongoClient(host="ip", port=27017)
mongo_db = mongo_client.gongzhonghao
def get_words():
    """Load the word list from ``words.txt`` in the current directory.

    Blank lines are skipped. A line may hold several words separated by the
    ideographic comma "、"; otherwise the whole line is one word. Surrounding
    whitespace is stripped and empty entries are dropped, because an empty
    word would match every text it is searched in.

    :return: list of words in file order.
    """
    words = []
    with open("words.txt", encoding="utf-8") as words_file:
        for line in words_file:
            if not line.strip():
                continue
            # str.split returns the whole line when the separator is absent,
            # so both the multi-word and single-word cases are covered.
            for piece in line.split("、"):
                word = piece.strip()
                if word:
                    words.append(word)
    return words
def get_articles(clt):
    """Return the first ``news_item`` of every entry stored in collection ``clt``.

    Reads the first document of ``mongo_db[clt]`` and walks its ``"items"``
    list. The caller reads "title", "thumb_url" and "content" from each
    returned element.

    :param clt: collection name.
    :return: list of news-item dicts; empty when the collection has no document.
    """
    collection = mongo_db[clt]
    doc = collection.find_one()
    if not doc:
        # find_one() returns None for an empty collection.
        return []
    # Only the first news_item of each entry is used, as in the original code.
    return [it["content"]["news_item"][0] for it in doc["items"]]
def download(dir, file_name, url):
    """Download ``url`` to ``dir/file_name`` unless that file already exists.

    Best effort: network and file-system errors are printed and otherwise
    ignored, so one failed download does not abort the caller's loop.

    :param dir: target directory; created when missing.
    :param file_name: file name inside ``dir``.
    :param url: address to fetch.
    :return: None
    """
    # os.path.join instead of a hard-coded "\\" keeps the path valid on
    # non-Windows systems as well.
    path = os.path.join(dir, file_name)
    try:
        if dir:
            os.makedirs(dir, exist_ok=True)
        if os.path.exists(path):
            # Already downloaded; skip before making the request.
            return
        resp = requests.get(url, timeout=30)
        # Do not store an HTTP error page as if it were the file.
        resp.raise_for_status()
        with open(path, "wb") as f:
            f.write(resp.content)
    except (requests.RequestException, OSError) as err:
        print("download failed: " + str(url) + " -> " + path + " (" + str(err) + ")")
def find_images(content):
    """Return the ``data-src`` value of every ``<img>`` tag in ``content``.

    ``content`` is percent-decoded before being parsed as HTML. Tags without
    a (non-empty) ``data-src`` attribute are skipped.

    :param content: article HTML, possibly percent-encoded.
    :return: list of image URLs in document order.
    """
    imgs = []
    decoded = urllib.parse.unquote(content)
    for tag in BeautifulSoup(decoded, "html.parser").find_all("img"):
        src = tag.get("data-src")
        if src:
            imgs.append(src)
    return imgs
def get_suffix(url):
        suffix = url[url.rindex("=") + 1:]
        if suffix == "jpeg" or suffix == "other":
            return ".jpg"
        return "." + suffix
        return ".jpg"
def filter_content(content):
    """Return the plain text of ``content`` with the HTML markup removed.

    :param content: HTML string.
    :return: concatenated text nodes of the document.
    """
    # NOTE(review): find_images() percent-decodes content before parsing;
    # confirm whether the same decoding is needed here.
    parser = ContentHtmlParser()
    # Running the base initializer again is harmless and guarantees the
    # parser's internal state exists even if the subclass constructor does
    # not chain to it.
    HTMLParser.__init__(parser)
    parser.feed(content)
    # close() flushes any text still held in the parser's buffer.
    parser.close()
    return parser.get_text()
def check_jinyongci(content):
    """Return the words from the word list that occur in the text of ``content``.

    The HTML is reduced to plain text with ``filter_content`` and each word
    from ``get_words`` is searched as a substring.

    :param content: article HTML.
    :return: list of matching words, in word-list order.
    """
    fc = filter_content(content)
    # Empty words are skipped: "" is a substring of every string.
    return [w for w in get_words() if w and w in fc]
def save_jinyongci(clt, title, invalids):
    """Append the words found for one article to ``<clt>/invalid.txt``.

    Nothing is written when ``invalids`` is empty. Otherwise the record is a
    "标题:<title>" line, one line per word, then a blank separator line.

    :param clt: directory that holds the report; created when missing.
    :param title: article title.
    :param invalids: words found in the article.
    :return: None
    """
    if not invalids:
        return
    if clt:
        os.makedirs(clt, exist_ok=True)
    # os.path.join instead of a hard-coded "\\" keeps the path portable.
    report_path = os.path.join(clt, "invalid.txt")
    with open(report_path, "a", encoding="utf-8") as f:
        f.write("标题:" + title + "\n")
        for iv in invalids:
            f.write(iv + "\n")
        f.write("\n")
if __name__ == "__main__":
    clt = "xxx"
    # Output directory for the downloaded images and the report file.
    os.makedirs(clt, exist_ok=True)
    articles = get_articles(clt)
    print(clt + ": 共" + str(len(articles)) + "个")
    # Characters that are not allowed in Windows file names are removed from
    # titles before they are used as file names.
    forbidden = str.maketrans("", "", '\\/:*?"<>|')
    for i, article in enumerate(articles):
        print("正在处理第 " + str(i) + " 个")
        title = article["title"]
        thumb_url = article["thumb_url"]
        content = article["content"]
        # Download the cover image.
        fname = str(i) + "_" + title.translate(forbidden)
        download(clt, fname + get_suffix(thumb_url), thumb_url)
        # Download every image referenced in the article body.
        for index, img in enumerate(find_images(content)):
            download(clt, fname + "_" + str(index) + get_suffix(img), img)
        # Record the listed words that occur in the article.
        invalids = check_jinyongci(content)
        save_jinyongci(clt, title, invalids)





