python 微信爬虫实例

单线程版：

 import  urllib.request

 import urllib.parse

 import urllib.error

 import re,time

 # Browser-like User-Agent header, stored as the (name, value) pair that an
 # opener's ``addheaders`` list expects.
 headers = (
     "User-Agent",
     "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3107.4 Safari/537.36",
 )

 # Shared list that get_url() appends the discovered article URLs to.
 list_url = []

 # Install a process-wide opener so that plain urllib.request.urlopen()
 # calls send the User-Agent defined above.
 operner = urllib.request.build_opener()
 operner.addheaders = [headers]
 urllib.request.install_opener(operner)

 ### Fetch the content of a URL (despite the name, no proxy is configured)
 def use_proxy(url, timeout=30):
     """Download ``url`` and return its body decoded as UTF-8.

     A private opener carrying a browser-like User-Agent is used for the
     request, so the call does not re-install a process-wide opener every
     time it runs.  To go through a proxy, pass
     ``urllib.request.ProxyHandler({'http': proxy_addr})`` to
     ``build_opener()`` below.

     Args:
         url: Address to fetch.
         timeout: Seconds to wait on blocking network operations, so that a
             stalled server cannot hang the crawl forever.

     Returns:
         The decoded page text, or ``None`` when the request or the decoding
         fails (the error is printed, not raised).
     """
     user_agent = ("User-Agent",
                   "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3107.4 Safari/537.36")
     try:
         opener = urllib.request.build_opener()
         opener.addheaders = [user_agent]
         # The context manager closes the connection even if read/decode fails.
         with opener.open(url, timeout=timeout) as response:
             return response.read().decode('utf-8')
     except urllib.error.URLError as e:
         # HTTPError carries a status code; a plain URLError only has a reason.
         if hasattr(e, "code"):
             print(e.code)
         elif hasattr(e, "reason"):
             print(e.reason)
     except Exception as e:
         print("exception" + str(e))
         # Brief pause before the caller moves on to its next request.
         time.sleep(1)
     return None

 ## Collect the article URLs to crawl
 def get_url(key, pagestart, pageend):
     """Search Sogou's WeChat index for ``key`` and collect article links.

     Each result page from ``pagestart`` to ``pageend`` (inclusive) is
     downloaded with ``use_proxy`` and every link found inside an ``<h3>``
     result title is appended to the module-level ``list_url``.

     Args:
         key: Search keyword; it is URL-quoted before being sent.
         pagestart: First result page to request.
         pageend: Last result page to request (inclusive).

     Returns:
         The module-level ``list_url`` list, including entries added by
         earlier calls.  Pages that could not be downloaded are skipped.
     """
     keycode = urllib.parse.quote(key)
     # Captures everything from the opening quote of the href up to </h3>;
     # the URL itself is cut out of that capture below.
     listurl_pattern = re.compile(r'<h3>.*?("http://.*?)</h3>', re.S)

     for page in range(pagestart, pageend + 1):
         # The loop counter belongs in `page`.  The previous template put it
         # into `type` and hard-coded page=1, so the first page was requested
         # on every iteration.  type=2 is taken to be Sogou's article search.
         url = "http://weixin.sogou.com/weixin?query=%s&_sug_type_=&s_from=input&_sug_=n&type=2&page=%d&ie=utf8" % (
             keycode, page)
         data1 = use_proxy(url)
         if not data1:
             # use_proxy() already printed the error; a failed page must not
             # abort the remaining pages.
             continue
         for match in listurl_pattern.findall(data1):
             # Undo "&amp;" escaping, keep only the href (text before the
             # first space) and strip the surrounding double quotes.
             res = match.replace("amp;", "").split(" ")[0].replace("\"", "")
             list_url.append(res)

     return list_url

 ## Crawl every collected URL and write the extracted articles to one HTML file
 def get_url_content(list_url, output_path="D:\\python-script\\1.html"):
     """Download each article and write title plus body into an HTML file.

     The output file is (re)created with a small HTML header, then for every
     URL the article title (first ``<h2>``) and the ``js_content`` body are
     appended, and finally the document is closed with ``</body></html>``.

     Args:
         list_url: Iterable of article URLs to download with ``use_proxy``.
         output_path: Destination file; defaults to the path the script has
             always written to.

     URLs that cannot be downloaded, or whose text cannot be encoded as
     UTF-8, are skipped.
     """
     html1 = '''<!DOCTYPE html>\n<html xmlns="http://www.w3.org/1999/xhtml">\n<head>\n<meta http-equiv="Content-Type" content="text/html; charset=utf-8">\n<title>微信文章</title></head>\n<body>'''
     # [^>]* keeps the match inside the opening tag; the former greedy ".*"
     # (with re.S) could run on to the last </h2> of the page.
     title_pattern = re.compile(r'<h2[^>]*>(.*?)</h2>', re.S)
     content_pattern = re.compile(
         r'id="js_content">(.*?)<div class="rich_media_tool" id="js_sg_bar">', re.S)

     # A single handle, closed by the context manager even if a page fails.
     with open(output_path, 'wb') as fh:
         fh.write(html1.encode("utf-8"))

         for url in list_url:
             data_content = use_proxy(url)
             if not data_content:
                 # Download failed (already reported by use_proxy).
                 continue

             title_match = title_pattern.search(data_content)
             ## Title (str); empty when the page has no <h2>
             res_title = title_match.group(1).strip() if title_match else ""
             content = content_pattern.findall(data_content)

             try:
                 # Encode everything first so that an article is written
                 # either completely or not at all.
                 chunks = [res_title.encode("utf-8")]
                 chunks.extend(part.strip().encode("utf-8") for part in content)
             except UnicodeEncodeError:
                 continue
             fh.write(b"".join(chunks))

         fh.write("</body></html>".encode("utf-8"))

 if __name__ == '__main__':
     # Search keyword and the inclusive range of result pages to crawl.
     key = "人工智能"
     pagestart, pageend = 1, 2

     # get_url() fills the module-level list_url, which is then crawled and
     # written out by get_url_content().
     get_url(key, pagestart, pageend)
     get_url_content(list_url)

多线程版：

import  urllib.request

import urllib.parse

import urllib.error

import re,time

import queue

import threading

# Browser-like User-Agent header, stored as the (name, value) pair that an
# opener's ``addheaders`` list expects.
headers = (
    "User-Agent",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3107.4 Safari/537.36",
)

# Queue that carries article URLs between the crawler threads.
urlque = queue.Queue()

# Plain list of URLs (kept from the single-threaded version).
list_url = []

# Install a process-wide opener so that plain urllib.request.urlopen()
# calls send the User-Agent defined above.
operner = urllib.request.build_opener()
operner.addheaders = [headers]
urllib.request.install_opener(operner)

### Fetch the content of a URL (despite the name, no proxy is configured)
def use_proxy(url, timeout=30):
    """Download ``url`` and return its body decoded as UTF-8.

    A private opener carrying a browser-like User-Agent is used for the
    request.  Unlike re-installing a global opener on every call, this does
    not mutate process-wide state while several threads are fetching.  To go
    through a proxy, pass ``urllib.request.ProxyHandler({'http': proxy_addr})``
    to ``build_opener()`` below.

    Args:
        url: Address to fetch.
        timeout: Seconds to wait on blocking network operations, so that a
            stalled server cannot hang the calling thread forever.

    Returns:
        The decoded page text, or ``None`` when the request or the decoding
        fails (the error is printed, not raised).
    """
    user_agent = ("User-Agent",
                  "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3107.4 Safari/537.36")
    try:
        opener = urllib.request.build_opener()
        opener.addheaders = [user_agent]
        # The context manager closes the connection even if read/decode fails.
        with opener.open(url, timeout=timeout) as response:
            return response.read().decode('utf-8')
    except urllib.error.URLError as e:
        # HTTPError carries a status code; a plain URLError only has a reason.
        if hasattr(e, "code"):
            print(e.code)
        elif hasattr(e, "reason"):
            print(e.reason)
    except Exception as e:
        print("exception" + str(e))
        # Brief pause before the caller moves on to its next request.
        time.sleep(1)
    return None

### Producer thread: find article URLs and put them on the queue
class get_url(threading.Thread):
    """Thread that searches Sogou's WeChat index and queues article links.

    Args:
        key: Search keyword; it is URL-quoted before being sent.
        pagestart: First result page to request.
        pageend: Last result page to request (inclusive).
        urlque: Queue that receives one entry per article URL found.
    """

    def __init__(self, key, pagestart, pageend, urlque):
        super().__init__()
        self.pagestart = pagestart
        self.pageend = pageend
        self.key = key
        self.urlque = urlque

    def run(self):
        """Request each result page and enqueue every article link on it.

        A page that cannot be downloaded is skipped.  The thread stops as
        soon as a downloaded page contains no links.
        """
        keycode = urllib.parse.quote(self.key)
        # Captures everything from the opening quote of the href up to </h3>;
        # the URL itself is cut out of that capture below.
        listurl_pattern = re.compile(r'<h3>.*?("http://.*?)</h3>', re.S)

        for page in range(self.pagestart, self.pageend + 1):
            # The loop counter belongs in `page`.  The previous template put
            # it into `type` and hard-coded page=1, so the first page was
            # requested on every iteration.  type=2 is taken to be Sogou's
            # article search.
            url = "http://weixin.sogou.com/weixin?query=%s&_sug_type_=&s_from=input&_sug_=n&type=2&page=%d&ie=utf8" % (keycode, page)
            data = use_proxy(url)
            if not data:
                # use_proxy() already printed the error.
                continue

            result = listurl_pattern.findall(data)
            print(result)
            if not result:
                print("没有可用的url")
                # Returning ends this thread.  The former sys.exit() raised
                # NameError because `sys` was never imported.
                return

            for match in result:
                # Undo "&amp;" escaping, keep only the href (text before the
                # first space) and strip the surrounding double quotes.
                res = match.replace("amp;", "").split(" ")[0].replace("\"", "")
                ## Add to the queue.  task_done() is deliberately not called
                ## here: it is meant to follow a get(), and calling it right
                ## after put() marks the item as processed before anything
                ## has fetched it.
                self.urlque.put(res)

## Consumer thread: download every queued URL and write the article to a file
class get_url_content(threading.Thread):
    """Thread that turns queued article URLs into a single HTML file.

    Args:
        urlque: Queue the article URLs are read from.
        output_path: Destination file; defaults to the path the script has
            always written to.
        idle_timeout: Seconds to wait for another URL before assuming that
            no more are coming and finishing the file.
    """

    def __init__(self, urlque, output_path="D:\\python-script\\1.html", idle_timeout=30):
        super().__init__()
        self.urlque = urlque
        self.output_path = output_path
        self.idle_timeout = idle_timeout

    def run(self):
        """Write the HTML header, one entry per queued article, then the footer.

        URLs that cannot be downloaded, or whose text cannot be encoded as
        UTF-8, are skipped.
        """
        html1 = '''<!DOCTYPE html>\n<html xmlns="http://www.w3.org/1999/xhtml">\n<head>\n<meta http-equiv="Content-Type" content="text/html; charset=utf-8">\n<title>微信文章</title></head>\n<body>'''
        # [^>]* keeps the match inside the opening tag; the former greedy
        # ".*" (with re.S) could run on to the last </h2> of the page.
        title_pattern = re.compile(r'<h2[^>]*>(.*?)</h2>', re.S)
        content_pattern = re.compile(
            r'id="js_content">(.*?)<div class="rich_media_tool" id="js_sg_bar">', re.S)

        # One handle for the whole run.  It used to be closed inside the
        # loop, which made every write after the first article fail.
        with open(self.output_path, 'wb') as fh:
            fh.write(html1.encode("utf-8"))

            while True:
                try:
                    url = self.urlque.get(timeout=self.idle_timeout)
                except queue.Empty:
                    # Nothing arrived for idle_timeout seconds: stop instead
                    # of blocking forever.
                    break

                data_content = use_proxy(url)
                if not data_content:
                    # Download failed (already reported by use_proxy).
                    continue

                title_match = title_pattern.search(data_content)
                ## Title; empty when the page has no <h2>
                res_title = title_match.group(1).strip() if title_match else ""
                content = content_pattern.findall(data_content)

                try:
                    # Encode everything first so that an article is written
                    # either completely or not at all.
                    chunks = [res_title.encode("utf-8")]
                    chunks.extend(part.strip().encode("utf-8") for part in content)
                except UnicodeEncodeError:
                    continue
                fh.write(b"".join(chunks))

            fh.write("</body></html>".encode("utf-8"))

class contrl(threading.Thread):
    """Monitor thread that reports progress until the URL queue stays empty.

    Args:
        urlqueue: Queue to watch.
        poll_interval: Seconds to sleep between two checks of the queue.
    """

    def __init__(self, urlqueue, poll_interval=3):
        # Only store state here.  The polling loop used to live in the
        # constructor, which blocked the caller before start() was reached.
        super().__init__()
        self.urlqueue = urlqueue
        self.poll_interval = poll_interval

    def run(self):
        """Print a status line per poll; finish once the queue stays empty."""
        while True:
            print("程序正在执行")
            was_empty = self.urlqueue.empty()
            # Sleeping on every pass avoids a busy loop while URLs are queued.
            time.sleep(self.poll_interval)
            # Require two consecutive empty checks so that a queue which is
            # only momentarily empty does not end the monitor immediately.
            if was_empty and self.urlqueue.empty():
                print("程序执行完毕")
                # Returning ends this thread; exit() is not needed for that.
                return

if __name__ == '__main__':
    # Search keyword and the inclusive range of result pages to crawl.
    pagestart = 1
    pageend = 2
    key = "人工智能"

    # Keep the thread instance under its own name.  Assigning it to
    # `get_url` would replace the class with the instance, so the class
    # could not be instantiated again.
    url_thread = get_url(key, pagestart, pageend, urlque)
    url_thread.start()

    # Consumer: downloads the queued URLs and writes the HTML file.
    get_content = get_url_content(urlque)
    get_content.start()

    # Monitor: reports progress while URLs are being processed.
    cntrol = contrl(urlque)
    cntrol.start()

python 微信爬虫实例的更多相关文章

Python 多进程爬虫实例
Python 多进程爬虫实例 import json import re import time from multiprocessing import Pool import requests f ...
python scrapy 爬虫实例
1 创建一个项目 scrapy startproject basicbudejie 2 编写爬虫 import scrapy class Basicbudejie(scrapy.Spider): na ...
python 多线程爬虫实例
多进程 Multiprocessing 模块 Process 类用来描述一个进程对象.创建子进程的时候,只需要传入一个执行函数和函数的参数即可完成 Process 实例的创建. start() 方法启动 ...
Python小爬虫实例
有几个注意点: # -*- coding: utf-8 -*- # func passport jw.qdu.edu.cn import re import urllib# python3后urlli ...
Python 爬虫实例
下面是我写的一个简单爬虫实例 1.定义函数读取html网页的源代码 2.从源代码通过正则表达式挑选出自己需要获取的内容 3.序列中的htm依次写到d盘 #!/usr/bin/python import ...
如何利用Python网络爬虫抓取微信朋友圈的动态（上）
今天小编给大家分享一下如何利用Python网络爬虫抓取微信朋友圈的动态信息,实际上如果单独地去爬取朋友圈的话,难度会非常大,因为微信没有提供像网易云音乐这样的API接口,所以很容易找不到门.不过不要慌 ...
如何利用Python网络爬虫爬取微信朋友圈动态--附代码（下）
前天给大家分享了如何利用Python网络爬虫爬取微信朋友圈数据的上篇(理论篇),今天给大家分享一下代码实现(实战篇),接着上篇往下继续深入. 一.代码实现 1.修改Scrapy项目中的items.py ...
Python爬虫实例：爬取B站《工作细胞》短评——异步加载信息的爬取
很多网页的信息都是通过异步加载的,本文就举例讨论下此类网页的抓取. <工作细胞>最近比较火,bilibili 上目前的短评已经有17000多条. 先看分析下页面右边 li 标签中的就是短 ...
Python爬虫实例：爬取猫眼电影——破解字体反爬
字体反爬字体反爬也就是自定义字体反爬,通过调用自定义的字体文件来渲染网页中的文字,而网页中的文字不再是文字,而是相应的字体编码,通过复制或者简单的采集是无法采集到编码后的文字内容的. 现在貌似不少网 ...

随机推荐

JarvisOJ Misc 炫酷的战队logo
欣赏过了实验室logo,有人觉得我们战队logo直接盗图比较丑,于是我就重新设计了一个,大家再欣赏下? 一开始拿到的BMP文件就打不开,用010打开发现文件头被抹去了,补上了BMP,与文件大小后,发现 ...
ES系列十五、ES常用Java Client API
一.简介 1.先看ES的架构图二.ES支持的客户端连接方式 1.REST API http请求,例如,浏览器请求get方法:利用Postman等工具发起REST请求:java 发起httpClien ...
M3U8文件
M3U本质上说不是音频文件,它是音频文件的列表文件,是纯文本文件.你下载下来打开它,播放软件并不是播放它,而是根据它的记录找到网络地址进行在线播放. M3U文件的大小很小,也就是因为它里面没有任何音频 ...
IntelliJ IDEA 导航的 20 大特性
本文由 ImportNew - elviskang 翻译自 dzone.欢迎加入翻译小组.转载请见文末要求. 在前面的文章里,我介绍了IntelliJ IDEA(以下称IntelliJ)中与代码补全及 ...
性能测试工具 Locust
https://docs.locust.io/en/latest/quickstart.html
BZOJ5063旅游——非旋转treap
题目描述小奇成功打开了大科学家的电脑. 大科学家打算前往n处景点旅游,他用一个序列来维护它们之间的顺序.初始时,序列为1,2,...,n. 接着,大科学家进行m次操作来打乱顺序.每次操作有6步: ...
第三十八天 GIL 进程池与线程池
今日内容: 1.GIL 全局解释器锁 2.Cpython解释器并发效率验证 3.线程互斥锁和GIL对比 4.进程池与线程池一.全局解释器锁 1.GIL:全局解释器锁 GIL本质就是一把互斥锁,是夹在 ...
Docker基本使用（一）
使用docker输入hello world Docker 允许你在容器内运行应用程序, 使用 docker run 命令来在容器内运行一个应用程序. 输出Hello world $ docker ru ...
Marriage Match IV HDU - 3416（最短路 + 最大流）
题意: 求有多少条最短路解析: 正着求一遍最短路得dis1 反着求一遍得 dis2 然后遍历所有的边如果 dis1[u] + dis2[v] + w == dis1[B], 则说明这是一 ...
jQuery 方式模拟提交表单
//add test moudle define(function(require , exports , module) { //=========== 不使用模块化只使用如下代码即可 start ...

python 微信爬虫实例

python 微信爬虫实例的更多相关文章

随机推荐

热门专题