aiohttp_spider
aiohttp_spider_def:
import asyncio
import re

import aiohttp
import aiomysql
from pyquery import PyQuery
from lxml import etree

# Crawl seed and shared crawl state (module-level by design: single-file script).
start_url = 'http://news.baidu.com/'
waitting_urs = []       # frontier: URLs waiting to be crawled (original name kept)
seen_uels = set()       # URLs already scheduled/fetched (original name kept)
stoppint = False        # set True to stop the consumer loop (original name kept)
sem = asyncio.Semaphore(10)  # at most 10 concurrent fetches
headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64; Trident/7.0; rv:11.0) like Gecko"}


async def fetch(url, session):
    """Fetch *url* and return the parsed lxml HTML tree, or None on any error.

    The semaphore bounds overall fetch concurrency; the 1-second timeout is
    aggressive and will surface as a printed error rather than a crash.
    """
    async with sem:
        try:
            async with session.get(url, headers=headers, timeout=1) as resp:
                print('url status:{}'.format(resp.status))
                return etree.HTML(await resp.read())
        except Exception as e:
            # Best-effort crawl: log and return None so callers can skip.
            print('错误为:{} url:{}'.format(e, url))
            return None


def extract_urls(html):
    """Append unseen absolute baidu links found in *html* to the frontier."""
    if html is None:
        # fetch() failed — nothing to extract (original bare except hid this).
        return
    for url in html.xpath('//a/@href'):
        if url and url.startswith("http") and url not in seen_uels:
            if re.findall(r'baidu', url):
                waitting_urs.append(url)


async def init_urls(url, session):
    """Seed the frontier from *url*, marking it as seen."""
    html = await fetch(url, session)
    seen_uels.add(url)
    extract_urls(html)


async def article_handler(url, session, pool):
    """Fetch an article page, harvest its links, and store its title in MySQL."""
    html = await fetch(url, session)
    seen_uels.add(url)
    extract_urls(html)
    if html is None:
        return
    titles = html.xpath('//title/text()')
    if not titles:
        # Page without a <title>: original code raised IndexError here and
        # swallowed it with a bare except.
        return
    title = titles[0].strip()
    print('title:{}'.format(title))
    async with pool.acquire() as conn:
        async with conn.cursor() as cursor:
            try:
                # Parameterized queries: the original interpolated the page
                # title with str.format(), which is SQL injection, and it
                # executed the same INSERT twice.
                await cursor.execute(
                    "insert into async_test_async(title) values(%s)", (title,))
                # Read back the table contents (demo query from the original).
                await cursor.execute("select * from async_test_async")
                data = await cursor.fetchall()
                print("data:", data)
                # Demo update/delete against fixed ids (kept from original).
                await cursor.execute(
                    "update async_test_async set title=%s where id=%s",
                    ('update', 10168))
                await cursor.execute(
                    "delete from async_test_async where id=%s", (10174,))
            except Exception as e:
                # Log DB failures instead of silently discarding them.
                print('db error:{} url:{}'.format(e, url))


async def consumer(pool):
    """Drain the frontier, scheduling an article_handler per unseen baidu URL.

    The original referenced `url` before its first assignment and popped from
    a possibly-empty list; both are fixed by waiting when the frontier is empty.
    """
    async with aiohttp.ClientSession() as session:
        while not stoppint:
            if not waitting_urs:
                await asyncio.sleep(0.1)
                continue
            url = waitting_urs.pop()
            print('start get url:{}'.format(url))
            if re.findall(r'baidu', url) and url not in seen_uels:
                print('waitting_urs:{}'.format(waitting_urs[0: 3]))
                asyncio.ensure_future(article_handler(url, session, pool))
            await asyncio.sleep(0.1)


async def main(loop):
    """Create the DB pool, seed the frontier from start_url, start the consumer."""
    pool = await aiomysql.create_pool(
        host='127.0.0.1', port=3306, user='root', password='root',
        db='cfda', loop=loop, charset='utf8', autocommit=True)
    async with aiohttp.ClientSession() as session:
        html = await fetch(start_url, session)
        seen_uels.add(start_url)
        extract_urls(html)
    asyncio.ensure_future(consumer(pool))


if __name__ == "__main__":
    loop = asyncio.get_event_loop()
    loop.run_until_complete(main(loop))
    loop.run_forever()
aiohttp_spider_class:
import asyncio
import re

import aiohttp
import aiomysql
from pyquery import PyQuery
from lxml import etree

# Crawl seed and shared crawl state (module-level by design: single-file script).
start_url = 'http://news.baidu.com/'
waitting_urs = []       # frontier: URLs waiting to be crawled (original name kept)
seen_uels = set()       # URLs already scheduled/fetched (original name kept)
stoppint = False        # set True to stop the consumer loop (original name kept)
sem = asyncio.Semaphore(10)  # at most 10 concurrent fetches
headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64; Trident/7.0; rv:11.0) like Gecko"}


class async_text(object):
    """Async crawler: fetch pages, follow baidu links, store titles in MySQL.

    The original called every method as ``self.method(self, ...)`` (passing
    ``self`` twice), which only happened to work because ``main`` circulated
    the class object itself as ``self``.  Methods are now invoked normally on
    an instance created in ``main``.
    """

    async def fetch(self, url, session):
        """Fetch *url* and return the parsed lxml HTML tree, or None on error."""
        print("self:", self)
        async with sem:
            try:
                async with session.get(url, headers=headers, timeout=1) as resp:
                    print('url status:{}'.format(resp.status))
                    return etree.HTML(await resp.read())
            except Exception as e:
                # Best-effort crawl: log and return None so callers can skip.
                print('错误为:{} url:{}'.format(e, url))
                return None

    def extract_urls(self, html):
        """Append unseen absolute baidu links found in *html* to the frontier."""
        if html is None:
            # fetch() failed — nothing to extract (original bare except hid this).
            return
        for url in html.xpath('//a/@href'):
            if url and url.startswith("http") and url not in seen_uels:
                if re.findall(r'baidu', url):
                    waitting_urs.append(url)

    async def init_urls(self, url, session):
        """Seed the frontier from *url*, marking it as seen."""
        html = await self.fetch(url, session)
        seen_uels.add(url)
        self.extract_urls(html)

    async def article_handler(self, url, session, pool):
        """Fetch an article page, harvest its links, and store its title."""
        html = await self.fetch(url, session)
        seen_uels.add(url)
        self.extract_urls(html)
        if html is None:
            return
        titles = html.xpath('//title/text()')
        if not titles:
            # Page without a <title>: original raised IndexError and hid it
            # behind a bare except.
            return
        title = titles[0].strip()
        print('title:{}'.format(title))
        async with pool.acquire() as conn:
            async with conn.cursor() as cur:
                try:
                    # Parameterized query: the original interpolated the page
                    # title with str.format(), which is SQL injection.
                    await cur.execute(
                        "insert into async_test_async(title) values(%s)",
                        (title,))
                except Exception as e:
                    # Log DB failures instead of silently discarding them.
                    print('db error:{} url:{}'.format(e, url))

    async def consumer(self, pool):
        """Drain the frontier, scheduling an article_handler per unseen URL.

        The original referenced `url` before its first assignment and popped
        from a possibly-empty list; both are fixed by waiting when empty.
        """
        async with aiohttp.ClientSession() as session:
            while not stoppint:
                if not waitting_urs:
                    await asyncio.sleep(0.1)
                    continue
                url = waitting_urs.pop()
                print('start get url:{}'.format(url))
                if re.findall(r'baidu', url) and url not in seen_uels:
                    print('waitting_urs:{}'.format(waitting_urs[0: 3]))
                    asyncio.ensure_future(
                        self.article_handler(url, session, pool))
                await asyncio.sleep(0.1)

    @classmethod
    async def main(cls, loop):
        """Create the DB pool, seed the frontier, and start the consumer."""
        spider = cls()
        pool = await aiomysql.create_pool(
            host='127.0.0.1', port=3306, user='root', password='root',
            db='cfda', loop=loop, charset='utf8', autocommit=True)
        async with aiohttp.ClientSession() as session:
            html = await spider.fetch(start_url, session)
            seen_uels.add(start_url)
            spider.extract_urls(html)
        asyncio.ensure_future(spider.consumer(pool))


if __name__ == "__main__":
    loop = asyncio.get_event_loop()
    loop.run_until_complete(async_text.main(loop))
    loop.run_forever()
aiohttp_spider的更多相关文章
随机推荐
- 【解决 FTP】windows访问Linux的vsftpd(FTP服务器)问题200 Switching to ASCII mode,227 Entering Passive Mode
转载:关于FTP主动模式(active mode)与被动模式(passive mode)的工作原理: 主动模式(服务器向客户端敲门,然后客户端开门)FTP:客户机与服务器之间建立连接时,客户机是大于1 ...
- airtest自动化测试工具的环境安装与使用
AirtestIDE的下载与安装 AirtestIDE已经帮你集成了所有的环境,自带录制脚本栏,自动生成脚本,自带编辑器还自带模拟器. 下载地址是Airtest的官网:http://airtest.n ...
- element-ui 中dialog居中
.el-dialog{ display: flex; flex-direction: column; margin:0 !important; ...
- npm --save-dev 和 --save 的区别
转载 >>> 1. npm install 在安装 npm 包时,有两种命令参数可以把它们的信息写入 package.json 文件, 一个是npm install--save ...
- AcWing 791. 高精度加法 解题记录
题目地址 https://www.acwing.com/problem/content/description/793/ 题目描述给定两个正整数,计算它们的和. 输入格式共两行,每行包含一个整数. 输 ...
- 微信小程序开发练习
微信小程序开发工具git管理 https://blog.csdn.net/qq_36672905/article/details/82887102 这个开发工具的界面和交互真的是熟悉又友好,吹爆他
- CSP2019游记(翻车记)
Preface 也许是人生中最重要的一场比赛了(再进不了冬令营我就没了) 结果不论怎样,想必也都是人生中的一次分水岭吧 从暑假开始到今天的一段时间,自己似乎终于找到了学OI的动力与乐趣.能认识到更多志 ...
- LOJ6033「雅礼集训 2017 Day2」棋盘游戏 (博弈论,二分图,匈牙利算法)
什么神仙思路啊-- 看到棋盘就去想二分图.(smg啊)(其实是校内模拟赛有基本一样的题,只不过直接给了个二分图) 看到二分图就去想最大匹配.(我怎么想偶环的性质去了) (以下内容摘自这里) 这个二分图 ...
- 模拟ssh远程执行命令
目录 一.服务端 二.客户端 一.服务端 from socket import * import subprocess server = socket(AF_INET, SOCK_STREAM) se ...
- 【Collect】免费图片库网站推荐(国外高清可商用)
#国外高清可商用免费图片库 1.https://unsplash.com/2.https://pixabay.com/3.https://www.sitebuilderreport.com/stock ...