aiohttp_spider
aiohttp_spider_def:
import asyncio
import re

import aiohttp
import aiomysql
from pyquery import PyQuery
from lxml import etree

start_url = 'http://news.baidu.com/'
waitting_urs = []            # queue of discovered URLs still to be crawled
seen_uels = set()            # URLs already fetched (deduplication)
stoppint = False             # set to True to stop the consumer loop
sem = asyncio.Semaphore(10)  # cap concurrent HTTP requests at 10
headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64; Trident/7.0; rv:11.0) like Gecko"}


async def fetch(url, session):
    """Fetch *url* through the shared semaphore.

    Returns the parsed lxml element tree, or None when the request or
    parse fails (timeouts are expected with timeout=1, so failures are
    logged rather than raised).
    """
    async with sem:
        try:
            async with session.get(url, headers=headers, timeout=1) as resp:
                print('url status:{}'.format(resp.status))
                return etree.HTML(await resp.read())
        except Exception as e:
            print('错误为:{} url:{}'.format(e, url))
            return None


def extract_urls(html):
    """Queue every absolute, not-yet-seen 'baidu' link found in *html*."""
    if html is None:
        # fetch() failed for this page; nothing to extract.
        return
    for url in html.xpath('//a/@href'):
        if url and url.startswith("http") and url not in seen_uels:
            if re.findall(r'baidu', url):
                waitting_urs.append(url)


async def init_urls(url, session):
    """Seed the crawl: fetch *url*, mark it seen, queue its out-links."""
    html = await fetch(url, session)
    seen_uels.add(url)
    extract_urls(html)


async def article_handler(url, session, pool):
    """Fetch an article page, persist its title, and queue new links.

    Demonstrates insert/select/update/delete against the aiomysql pool.
    All statements are parameterized — page content must never be
    interpolated into SQL text.
    """
    html = await fetch(url, session)
    seen_uels.add(url)
    extract_urls(html)
    if html is None:
        return
    titles = html.xpath('//title/text()')
    if not titles:
        # Page has no <title>; nothing to store.
        return
    title = titles[0].strip()
    print('title:{}'.format(title))
    async with pool.acquire() as conn:
        async with conn.cursor() as cursor:
            try:
                # insert
                await cursor.execute(
                    'insert into async_test_async(title) values(%s)', (title,))
                # select
                await cursor.execute("select * from async_test_async")
                data = await cursor.fetchall()
                print("data:", data)
                # update
                await cursor.execute(
                    "update async_test_async set title=%s where id=%s",
                    ('update', 10168))
                # delete
                await cursor.execute(
                    "delete from async_test_async where id=%s", (10174,))
            except Exception as e:
                # Log DB problems instead of silently swallowing them.
                print('错误为:{} url:{}'.format(e, url))


async def consumer(pool):
    """Drain the waiting queue, dispatching one article_handler per URL."""
    async with aiohttp.ClientSession() as session:
        while not stoppint:
            if not waitting_urs:
                # Nothing queued yet — yield to the event loop and retry.
                await asyncio.sleep(0.1)
                continue
            url = waitting_urs.pop()
            print('start get url:{}'.format(url))
            if re.findall(r'baidu', url) and url not in seen_uels:
                print('waitting_urs:{}'.format(waitting_urs[0: 3]))
                asyncio.ensure_future(article_handler(url, session, pool))
            await asyncio.sleep(0.1)


async def main(loop):
    """Create the DB pool, seed the crawl with start_url, start the consumer."""
    pool = await aiomysql.create_pool(host='127.0.0.1', port=3306,
                                      user='root', password='root',
                                      db='cfda', loop=loop,
                                      charset='utf8', autocommit=True)
    async with aiohttp.ClientSession() as session:
        html = await fetch(start_url, session)
        seen_uels.add(start_url)
        extract_urls(html)
    # consumer() opens its own session, so it may outlive this one.
    asyncio.ensure_future(consumer(pool))


if __name__ == "__main__":
    loop = asyncio.get_event_loop()
    loop.run_until_complete(main(loop))
    loop.run_forever()
aiohttp_spider_class:
import asyncio
import re

import aiohttp
import aiomysql
from pyquery import PyQuery
from lxml import etree

start_url = 'http://news.baidu.com/'
waitting_urs = []            # queue of discovered URLs still to be crawled
seen_uels = set()            # URLs already fetched (deduplication)
stoppint = False             # set to True to stop the consumer loop
sem = asyncio.Semaphore(10)  # cap concurrent HTTP requests at 10
headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64; Trident/7.0; rv:11.0) like Gecko"}


class async_text(object):
    """Class-based variant of the aiohttp news spider.

    NOTE(review): methods are called as bound methods (self.fetch(url,
    session)); the original passed ``self`` explicitly as well, which
    raised TypeError at runtime.
    """

    async def fetch(self, url, session):
        """Fetch *url*; return the parsed lxml tree or None on failure."""
        print("self:", self)
        async with sem:
            try:
                async with session.get(url, headers=headers, timeout=1) as resp:
                    print('url status:{}'.format(resp.status))
                    return etree.HTML(await resp.read())
            except Exception as e:
                print('错误为:{} url:{}'.format(e, url))
                return None

    def extract_urls(self, html):
        """Queue every absolute, not-yet-seen 'baidu' link in *html*."""
        if html is None:
            return
        for url in html.xpath('//a/@href'):
            if url and url.startswith("http") and url not in seen_uels:
                if re.findall(r'baidu', url):
                    waitting_urs.append(url)

    async def init_urls(self, url, session):
        """Seed the crawl: fetch *url*, mark it seen, queue its out-links."""
        html = await self.fetch(url, session)
        seen_uels.add(url)
        self.extract_urls(html)

    async def article_handler(self, url, session, pool):
        """Fetch an article page, store its title, and queue new links."""
        html = await self.fetch(url, session)
        seen_uels.add(url)
        self.extract_urls(html)
        if html is None:
            return
        titles = html.xpath('//title/text()')
        if not titles:
            return
        title = titles[0].strip()
        print('title:{}'.format(title))
        async with pool.acquire() as conn:
            async with conn.cursor() as cur:
                try:
                    # Parameterized insert — never format page content into SQL.
                    await cur.execute(
                        'insert into async_test_async(title) values(%s)',
                        (title,))
                except Exception as e:
                    print('错误为:{} url:{}'.format(e, url))

    async def consumer(self, pool):
        """Drain the waiting queue, dispatching one handler per URL."""
        async with aiohttp.ClientSession() as session:
            while not stoppint:
                if not waitting_urs:
                    # Queue empty — yield to the event loop and retry.
                    await asyncio.sleep(0.1)
                    continue
                url = waitting_urs.pop()
                print('start get url:{}'.format(url))
                if re.findall(r'baidu', url) and url not in seen_uels:
                    print('waitting_urs:{}'.format(waitting_urs[0: 3]))
                    asyncio.ensure_future(
                        self.article_handler(url, session, pool))
                await asyncio.sleep(0.1)

    @classmethod
    async def main(cls, loop):
        """Entry point: build the pool, seed the crawl, start the consumer."""
        spider = cls()
        pool = await aiomysql.create_pool(host='127.0.0.1', port=3306,
                                          user='root', password='root',
                                          db='cfda', loop=loop,
                                          charset='utf8', autocommit=True)
        async with aiohttp.ClientSession() as session:
            html = await spider.fetch(start_url, session)
            seen_uels.add(start_url)
            spider.extract_urls(html)
        # consumer() opens its own session, so it may outlive this one.
        asyncio.ensure_future(spider.consumer(pool))


if __name__ == "__main__":
    loop = asyncio.get_event_loop()
    loop.run_until_complete(async_text.main(loop))
    loop.run_forever()
aiohttp_spider的更多相关文章
随机推荐
- Python爬虫(requests模块)
Requests是唯一的一个非转基因的Python HTTP库,人类可以安全享用. Requests基础学习 使用方法: 1.导入Requests模块: import requests 2.尝试用g ...
- itest(爱测试) 4.2.1 发布,开源BUG 跟踪管理 & 敏捷测试管理软件
itest 入选 2019 年度最受欢迎开源中国软件 开源工具的发展,离不开你我的支持,需要您投上宝贵的一票 去投票 itest 简介:查看简介 itest 开源敏捷测试管理,testOps 践行者 ...
- Java Web 学习(6) —— Spring MVC 之校验器
Spring MVC 之校验器 数据验证 一个典型的 Spring MVC 应用会同时应用到 formatters/converters 和 validators. 在调用 controller 期间 ...
- 《移动WEB前端高级开发实践@www.java1234.com.pdf》
HTTP服务器: http-server 3.6.4 利用 Performance API 分析网站性能 页面加载生命周期 4. CSS3 伪类.伪元素, 看https://www.runoob.co ...
- 解决:Unable to acquire the dpkg frontend lock (/var/lib/dpkg/lock-frontend), is another process using it?
简单粗暴法 删除锁 $ sudo rm /var/cache/apt/archives/lock $ sudo rm /var/lib/dpkg/lock 如果还不行,重启虚拟机 $ reboot
- DirectShow 常用函数总结
本文准备总结一些 Direct Show 常用的API接口函数,方便以后查询回忆.如果这里没有你想了解的函数,你可以自行搜索MSDN + 函数名去 MSDN 查找你想要了解的函数,也可以查看百度百科相 ...
- 06-Django视图
什么是视图? 视图就是应用中views.py文件中的函数,视图函数的第一个参数必须是request(HttpRequest)对象.返回的时候必须返回一个HttpResponse对象或子对象(包含Htt ...
- Kubernetes V1.15 二进制部署集群
1. 架构篇 1.1 kubernetes 架构说明 1.2 Flannel网络架构图 1.3 Kubernetes工作流程 2. 组件介绍 2.1 ...
- Docker学习——基本使用
最近公司项目要用docker部署,第一次接触,记录一下,方便使用时查阅. 你有没有遇到过这种情况,在本地运行良好的代码,在另一台电脑或者另一个环境里一堆bug,可以说是水土不服,本质上是两个电脑的运行 ...
- MySQL(9)---记录一次实际开发过程中用到的复杂存储过程
Mysql(9)---记录一次实际开发过程中用到的复杂存储过程 为了尽可能的还原当时为什么需要用到存储过程,下面我写了个详细的文档,我们可以从需求文档出发来分析. 有关存储过程之前也写了两篇文章来做铺 ...