aiohttp_spider_def:

import asyncio
import re
import aiohttp
import aiomysql
from pyquery import PyQuery
from lxml import etree

start_url = 'http://news.baidu.com/'
waitting_urs = []
seen_uels = set()
stoppint = False
sem = asyncio.Semaphore(10)  # 现在并发为3个
headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64; Trident/7.0; rv:11.0) like Gecko"}

async def fetch(url, session):
    async with sem:
        #     await asyncio.sleep(1)
        try:
            async with session.get(url, headers=headers, timeout=1) as resp:
                print('url status:{}'.format(resp.status))
                # if resp.status in [200, 201]:
                data = etree.HTML(await resp.read())
                return data
        except Exception as e:
            print('错误为:{}  url:{}'.format(e, url))

def extract_urls(html):
    try:
        for url in html.xpath('//a/@href'):
            if url and url.startswith("http") and url not in seen_uels:
                if re.findall(r'baidu', url):
                    waitting_urs.append(url)
    except:
        pass

async def init_urls(url, session):
    html = await fetch(url, session)
    seen_uels.add(url)
    extract_urls(html)

async def article_handler(url, session, pool):
    # 获取文章详情
    html = await fetch(url, session)
    seen_uels.add(url)
    extract_urls(html)
    try:
        title = html.xpath('//title/text()')[0].strip()
        print('title:{}'.format(title))
        async with pool.acquire() as conn:
            async with conn.cursor() as cursor:
                try:
                    # 插入
                    await cursor.execute('insert into async_test_async(title) values("{}")'.format(title))

                    # 插入数据
                    await cursor.execute("insert into async_test_async(title) values('{}')".format(title))

                    # 查询数据
                    await cursor.execute("select * from async_test_async")
                    data = await cursor.fetchall()
                    print("data:", data)

                    # 更新数据
                    await cursor.execute("update async_test_async set title='{}' where id={}".format('update', 10168))

                    # 删除数据
                    await cursor.execute("delete from async_test_async where id={}".format(10174))
                except:
                    pass
    except:
        pass

async def consumer(pool):
    async with aiohttp.ClientSession() as session:
        while not stoppint:
            if len(waitting_urs) < 10:
                if url not in seen_uels:
                    asyncio.ensure_future(init_urls(url, session))

            url = waitting_urs.pop()
            print('start get url:{}'.format(url))
            if re.findall(r'baidu', url):
                if url not in seen_uels:
                    print('waitting_urs:{}'.format(waitting_urs[0: 3]))
                    asyncio.ensure_future(article_handler(url, session, pool))
                    await asyncio.sleep(0.1)

async def main(loop):
    pool = await aiomysql.create_pool(host='127.0.0.1', port=3306, user='root', password='root', db='cfda', loop=loop,
                                      charset='utf8', autocommit=True)
    async with aiohttp.ClientSession() as session:
        html = await fetch(start_url, session)
        seen_uels.add(start_url)
        extract_urls(html)

    asyncio.ensure_future(consumer(pool))

if __name__ == "__main__":
    loop = asyncio.get_event_loop()
    loop.run_until_complete(main(loop))
    loop.run_forever() 

aiohttp_spider_class:

import asyncio
import re
import aiohttp
import aiomysql
from pyquery import PyQuery
from lxml import etree

start_url = 'http://news.baidu.com/'
waitting_urs = []
seen_uels = set()
stoppint = False
sem = asyncio.Semaphore(10)  # 现在并发为3个
headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64; Trident/7.0; rv:11.0) like Gecko"}

class async_text(object):

    async def fetch(self, url, session):
        print("self:", self)
        async with sem:
            #     await asyncio.sleep(1)
            try:
                async with session.get(url, headers=headers, timeout=1) as resp:
                    print('url status:{}'.format(resp.status))
                    # if resp.status in [200, 201]:
                    data = etree.HTML(await resp.read())
                    return data
            except Exception as e:
                print('错误为:{}  url:{}'.format(e, url))

    def extract_urls(self, html):
        try:
            for url in html.xpath('//a/@href'):
                if url and url.startswith("http") and url not in seen_uels:
                    if re.findall(r'baidu', url):
                        waitting_urs.append(url)
        except:
            pass

    async def init_urls(self, url, session):
        html = await self.fetch(self, url, session)
        seen_uels.add(url)
        self.extract_urls(self, html)

    async def article_handler(self, url, session, pool):
        # 获取文章详情
        html = await self.fetch(self, url, session)
        seen_uels.add(url)
        self.extract_urls(self, html)
        try:
            title = html.xpath('//title/text()')[0].strip()
            print('title:{}'.format(title))
            async with pool.acquire() as conn:
                async with conn.cursor() as cur:
                    try:
                        # 插入
                        await cur.execute('insert into async_test_async(title) values("{}")'.format(title))
                    except:
                        pass
        except:
            pass

    async def consumer(self, pool):
        async with aiohttp.ClientSession() as session:
            while not stoppint:
                if len(waitting_urs) < 10:
                    if url not in seen_uels:
                        asyncio.ensure_future(self.init_urls(self, url, session))

                url = waitting_urs.pop()
                print('start get url:{}'.format(url))
                if re.findall(r'baidu', url):
                    if url not in seen_uels:
                        print('waitting_urs:{}'.format(waitting_urs[0: 3]))
                        asyncio.ensure_future(self.article_handler(self, url, session, pool))
                        await asyncio.sleep(0.1)

    @classmethod
    async def main(self, loop):
        pool = await aiomysql.create_pool(host='127.0.0.1', port=3306, user='root', password='root', db='cfda',
                                          loop=loop,
                                          charset='utf8', autocommit=True)
        async with aiohttp.ClientSession() as session:
            html = await self.fetch(self, start_url, session)
            seen_uels.add(start_url)
            self.extract_urls(self, html)

        asyncio.ensure_future(self.consumer(self, pool))

if __name__ == "__main__":
    loop = asyncio.get_event_loop()
    loop.run_until_complete(async_text.main(loop))
    loop.run_forever()

  

aiohttp_spider的更多相关文章

随机推荐

  1. linux系统安全加固

    版权声明:本文为博主原创文章,支持原创,转载请附上原文出处链接和本声明. 本文地址:https://www.cnblogs.com/wannengachao/p/12068256.html 1.文件上 ...

  2. Python函数名做参数,闭包,装饰器

    简单讲解闭包的写法和应用,在这之前,先声明,你定义的任意一个函数都可以作为其他函数的参数.就像下面这段代码的参数func,接收的参数就是一个函数名,在函数体内部使用了func()调用执行函数. 请看下 ...

  3. div里面的元素垂直均匀分布 按钮引发地址栏出现问号 判断一个数组是否为空 div底部居中 路由传参接受参数

    一个固定高度的div的子元素 在垂直 方向上平均分布 .important-dec{ height: 121px; flex-direction: column; display: flex; jus ...

  4. Codeforces Round #598 (Div. 3)

    传送门 A. Payment Without Change 签到. Code /* * Author: heyuhhh * Created Time: 2019/11/4 21:19:19 */ #i ...

  5. 【poj2661】Factstone Benchmark(斯特林公式)

    传送门 题意: 给出\(x,x\leq 12\),求最大的\(n\),满足\(n!\leq 2^{2^x}\). 思路: 通过斯特林公式: \[ n!\approx \sqrt{2\pi n}\cdo ...

  6. 【转】C++ - 结构体构造函数使用总结

    声明 转载自:https://www.cnblogs.com/wlw-x/p/11566191.html 关于结构体构造函数使用总结 三种结构体初始化方法 1.利用结构体自带的默认构造函数 2.利用带 ...

  7. 数据嵌入js的关系图

    参照echarts官网,改了一下效果图: 数据放在了js里. 代码: <%@ page language="java" contentType="text/html ...

  8. linux编程fcntl获取和设置文件锁

    #include <stdio.h> #include <stdlib.h> #include <unistd.h> #include <sys/types. ...

  9. 推荐|MathType的使用技巧

    前言 持续更新中,敬请期待... 数学学科 制作新的数学符号 不包含于符号:输入$\not\subseteq,然后按回车键enter即可: 分式\(\cfrac{3-x}{2x-1}\)符号:输入$\ ...

  10. 关于group by的用法 原理

    转载: https://blog.csdn.net/u014717572/article/details/80687042. 写在前面的话:用了好久group by,今天早上一觉醒来,突然感觉grou ...