简介

aiohttp 需要 Python 3.5.3 及更高的版本。它不但能做客户端爬虫,也能做服务器端;基于 asyncio 协程实现,十分高效。

官方文档

采集模板

一批,一次性采集

  1. import asyncio
  2. import logging
  3. import time
  4. from aiohttp import ClientSession, ClientTimeout
  5. logging.basicConfig(level=logging.INFO, format='[%(asctime)s] - %(levelname)s in %(filename)s.%(funcName)s: %(message)s')
  6. # 默认请求头
  7. HEADERS = {
  8. 'accept': 'text/javascript, text/html, application/xml, text/xml, */*',
  9. 'accept-encoding': 'gzip, deflate, br',
  10. 'accept-language': 'zh-CN,zh;q=0.9',
  11. 'user-agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) '
  12. 'Chrome/69.0.3497.100 Safari/537.36',
  13. }
  14. # 默认超时时间
  15. TIMEOUT = 15
  16. class AioCrawl:
  17. def __init__(self):
  18. self.logger = logging.getLogger(__name__)
  19. async def fetch(self, url, method='GET', headers=None, timeout=TIMEOUT, cookies=None, data=None):
  20. """采集纤程"""
  21. method = 'POST' if method.upper() == 'POST' else 'GET'
  22. headers = headers if headers else HEADERS
  23. timeout = ClientTimeout(total=timeout)
  24. cookies = cookies if cookies else None
  25. data = data if data and isinstance(data, dict) else {}
  26. async with ClientSession(headers=headers, timeout=timeout, cookies=cookies) as session:
  27. try:
  28. if method == 'GET':
  29. async with session.get(url) as response:
  30. return await response.read()
  31. else:
  32. async with session.post(url, data=data) as response:
  33. return await response.read()
  34. except Exception as e:
  35. raise e
  36. def prepare_fetch(self, urls):
  37. """准备future_list"""
  38. return [asyncio.ensure_future(self.fetch(url)) for url in urls]
  39. def crawl_batch_urls(self, urls):
  40. """执行采集"""
  41. future_list = self.prepare_fetch(urls)
  42. loop = asyncio.get_event_loop()
  43. loop.run_until_complete(asyncio.wait(future_list))
  44. self.logger.info('采集完一批: {}'.format(len(urls)))
  45. return future_list
  46. if __name__ == '__main__':
  47. a = AioCrawl()
  48. # 2-4秒
  49. t0 = time.time()
  50. future_list = a.crawl_batch_urls(['https://www.sina.com.cn' for _ in range(5)])
  51. print(time.time() - t0)
  52. for future in future_list:
  53. if future.exception():
  54. print(future.exception())
  55. else:
  56. print(len(future.result()))

动态添加任务

  1. import asyncio
  2. import time
  3. from threading import Thread
  4. from aiohttp import ClientSession, ClientTimeout
  5. # 默认请求头
  6. HEADERS = {
  7. 'accept': 'text/javascript, text/html, application/xml, text/xml, */*',
  8. 'accept-encoding': 'gzip, deflate, br',
  9. 'accept-language': 'zh-CN,zh;q=0.9',
  10. 'user-agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) '
  11. 'Chrome/69.0.3497.100 Safari/537.36',
  12. }
  13. # 默认超时时间
  14. TIMEOUT = 15
  15. def start_loop(loop):
  16. """驱动事件循环"""
  17. asyncio.set_event_loop(loop)
  18. loop.run_forever()
  19. async def fetch(url, method='GET', headers=None, timeout=TIMEOUT, cookies=None, data=None):
  20. """采集纤程"""
  21. print(url)
  22. method = 'POST' if method.upper() == 'POST' else 'GET'
  23. headers = headers if headers else HEADERS
  24. timeout = ClientTimeout(total=timeout)
  25. cookies = cookies if cookies else None
  26. data = data if data and isinstance(data, dict) else {}
  27. async with ClientSession(headers=headers, timeout=timeout, cookies=cookies) as session:
  28. try:
  29. if method == 'GET':
  30. async with session.get(url) as response:
  31. content = await response.read()
  32. return response.status, content
  33. else:
  34. async with session.post(url, data=data) as response:
  35. content = await response.read()
  36. return response.status, content
  37. except Exception as e:
  38. raise e
  39. def callback(future):
  40. """回调函数"""
  41. try:
  42. print(future.result())
  43. except Exception as e:
  44. print(e)
  45. print(type(future))
  46. print(future)
  47. if __name__ == '__main__':
  48. # 启动事件循环
  49. loop = asyncio.new_event_loop()
  50. t = Thread(target=start_loop, args=(loop,))
  51. t.setDaemon(True)
  52. t.start()
  53. f = asyncio.run_coroutine_threadsafe(fetch('https://www.sina.com.cn'), loop)
  54. f.add_done_callback(callback) # 给future对象添加回调函数
  55. time.sleep(5) # 否则看不到结果

动态添加任务,封装成类

  1. import asyncio
  2. import logging
  3. import time
  4. from threading import Thread
  5. from aiohttp import ClientSession, ClientTimeout, TCPConnector
  6. # 默认请求头
  7. HEADERS = {
  8. 'accept': 'text/javascript, text/html, application/xml, text/xml, */*',
  9. 'accept-encoding': 'gzip, deflate, br',
  10. 'accept-language': 'zh-CN,zh;q=0.9',
  11. 'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 '
  12. '(KHTML, like Gecko) Chrome/73.0.3683.86 Safari/537.36',
  13. }
  14. # 默认超时时间
  15. TIMEOUT = 15
  16. def start_loop(loop):
  17. asyncio.set_event_loop(loop)
  18. loop.run_forever()
  19. class AioCrawl:
  20. def __init__(self):
  21. self.logger = logging.getLogger(__name__)
  22. # 启动事件循环
  23. self.event_loop = asyncio.new_event_loop()
  24. self.t = Thread(target=start_loop, args=(self.event_loop,))
  25. self.t.setDaemon(True)
  26. self.t.start()
  27. self.concurrent = 0 # 记录并发数
  28. async def fetch(self, url, method='GET', headers=None, timeout=TIMEOUT, cookies=None, data=None, proxy=None):
  29. """采集纤程
  30. :param url: str
  31. :param method: 'GET' or 'POST'
  32. :param headers: dict()
  33. :param timeout: int
  34. :param cookies:
  35. :param data: dict()
  36. :param proxy: str
  37. :return: (status, content)
  38. """
  39. method = 'POST' if method.upper() == 'POST' else 'GET'
  40. headers = headers if headers else HEADERS
  41. timeout = ClientTimeout(total=timeout)
  42. cookies = cookies if cookies else None
  43. data = data if data and isinstance(data, dict) else {}
  44. tcp_connector = TCPConnector(verify_ssl=False) # 禁用证书验证
  45. async with ClientSession(headers=headers, timeout=timeout, cookies=cookies, connector=tcp_connector) as session:
  46. try:
  47. if method == 'GET':
  48. async with session.get(url, proxy=proxy) as response:
  49. content = await response.read()
  50. return response.status, content
  51. else:
  52. async with session.post(url, data=data, proxy=proxy) as response:
  53. content = await response.read()
  54. return response.status, content
  55. except Exception as e:
  56. raise e
  57. def callback(self, future):
  58. """回调函数
  59. 1.处理并转换成Result对象
  60. 2.写数据库
  61. """
  62. msg = str(future.exception()) if future.exception() else 'success'
  63. code = 1 if msg == 'success' else 0
  64. status = future.result()[0] if code == 1 else None
  65. data = future.result()[1] if code == 1 else b'' # 空串
  66. data_len = len(data) if data else 0
  67. if code == 0 or (status is not None and status != 200): # 打印小异常
  68. self.logger.warning('<url="{}", code={}, msg="{}", status={}, data(len):{}>'.format(
  69. future.url, code, msg, status, data_len))
  70. self.concurrent -= 1 # 并发数-1
  71. print(len(data))
  72. def add_tasks(self, tasks):
  73. """添加任务
  74. :param tasks: list <class Task>
  75. :return: future
  76. """
  77. for task in tasks:
  78. # asyncio.run_coroutine_threadsafe 接收一个协程对象和,事件循环对象
  79. future = asyncio.run_coroutine_threadsafe(self.fetch(task), self.event_loop)
  80. future.add_done_callback(self.callback) # 给future对象添加回调函数
  81. self.concurrent += 1 # 并发数加 1
  82. if __name__ == '__main__':
  83. a = AioCrawl()
  84. for _ in range(5):
  85. a.add_tasks(['https://www.sina.com.cn' for _ in range(2)]) # 模拟动态添加任务
  86. time.sleep(1)

aiohttp笔记的更多相关文章

  1. aiohttp的笔记之TCPConnector

    TCPConnector维持链接池,限制并行连接的总量,当池满了,有请求退出再加入新请求.默认是100,limit=0的时候是无限制 1.use_dns_cache: 使用内部DNS映射缓存用以查询D ...

  2. Python开发【笔记】:aiohttp搭建简易聊天室

    简易聊天室: 1.入口main.py import logging import jinja2 import aiohttp_jinja2 from aiohttp import web from a ...

  3. python 学习笔记 aiohttp

    asyncio可以实现单进程并发IO操作,如果仅用在客户端,发挥的威力并不大,如果把asyncio用在服务器端,由于http链接就是IO操作, 因此可以用单线程+coroutine实现多客户的高并发支 ...

  4. 《用OpenResty搭建高性能服务端》笔记

    概要 <用OpenResty搭建高性能服务端>是OpenResty系列课程中的入门课程,主讲人:温铭老师.课程分为10个章节,侧重于OpenResty的基本概念和主要特点的介绍,包括它的指 ...

  5. DAY7-Python学习笔记

    前记: 这几天在弄小程序,view页面的开发很简单,但是在加载图片上遇到了问题,小程序的大小不能超过2M,所以大部分的图片内容要通过request请求服务器来获取,这里之前学习小程序的时候是通过网站A ...

  6. git-简单流程(学习笔记)

    这是阅读廖雪峰的官方网站的笔记,用于自己以后回看 1.进入项目文件夹 初始化一个Git仓库,使用git init命令. 添加文件到Git仓库,分两步: 第一步,使用命令git add <file ...

  7. js学习笔记:webpack基础入门(一)

    之前听说过webpack,今天想正式的接触一下,先跟着webpack的官方用户指南走: 在这里有: 如何安装webpack 如何使用webpack 如何使用loader 如何使用webpack的开发者 ...

  8. SQL Server技术内幕笔记合集

    SQL Server技术内幕笔记合集 发这一篇文章主要是方便大家找到我的笔记入口,方便大家o(∩_∩)o Microsoft SQL Server 6.5 技术内幕 笔记http://www.cnbl ...

  9. PHP-自定义模板-学习笔记

    1.  开始 这几天,看了李炎恢老师的<PHP第二季度视频>中的“章节7:创建TPL自定义模板”,做一个学习笔记,通过绘制架构图.UML类图和思维导图,来对加深理解. 2.  整体架构图 ...

随机推荐

  1. hibernate 中HQL查询

    由于比较简单,在此处只写一些HQL语言. 表关系,多对一. CREATE TABLE `user` ( `id` ) NOT NULL AUTO_INCREMENT, `uname` varchar( ...

  2. apache虚拟主机设置泛域名的方法

    在apache虚拟主机中设置泛域名解析,主要是用到ServerAlias 的配置. 1.支持多域名 例如,让mail.jbxue.org.smtp.jbxue.org.pop3.jbxue.org 都 ...

  3. TIM—基本定时器

    本章参考资料:< STM32F4xx 参考手册>.< STM32F4xx 规格书>.库帮助文档< stm32f4xx_dsp_stdperiph_lib_um.chm&g ...

  4. 一款基于jQuery外观优雅带遮罩弹出层对话框

    今天我们要来分享一款基于jQuery的弹出层对话框插件,该插件包含多种对话框类型,比如提示框.确认框等.更为实用的是,这款jQuery对话框中的按钮事件也可以被我们所捕获,从而相应对话框按钮的各种事件 ...

  5. love2d--glsl02变量和语句

    Shader分为顶点着色器和片段着色器,GPU先处理顶点再处理片段,大概可以这么理解, 顶点着色器处理模型里的点,输出处理后的数据,这些数据经过GPU其它模块处理后传入 片段着色器,经片段着色器综合后 ...

  6. [mmc/sdio]Linux下的sdio和mmc

    http://www.cnblogs.com/RandyQ/p/3607107.html

  7. Android——UI和View——控制方式

    控制方式 只用xml实现 <?xml version="1.0" encoding="utf-8"?> <LinearLayout xmlns ...

  8. TF和SD

    TF卡又称T-Flash卡,全名:TransFLash,又名:Micro SD SD卡(Secure Digital Memory Card,安全数码卡)

  9. jQuery开发中容易忽视的错误

    1.引用jQuery库文件的<script>标签,必须放在引用自定义脚本文件的<script>标签之前,否则,就会发生找不到对象:最好在<head>元素中,把引入样 ...

  10. cpio -H newc参数详解

    -H format 其中个format可以是: ‘bin’ The obsolete binary format. (2147483647 bytes) ‘odc’ The old (POSIX.1) ...