import requests
from pyquery import PyQuery as pq

base_headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/54.0.2840.71 Safari/537.36',
    'Accept-Encoding': 'gzip, deflate, sdch',
    'Accept-Language': 'zh-CN,zh;q=0.8'
}

def get_page(url):
    headers = dict(base_headers)
    print('Getting', url)
    try:
        r = requests.get(url, headers=headers)
        print('Getting result', url, r.status_code)
        if r.status_code == 200:
            return r.text
    except requests.ConnectionError:
        print('Crawling Failed', url)
        return None

# The Tao begets One: create the metaclass that collects the proxy crawlers
class ProxyMetaclass(type):
    """
    Metaclass that adds three attributes to the class that uses it:
    __CrawlName__, __CrawlFunc__ and __CrawlFuncCount__, holding the
    crawler function names, the function objects and the number of
    crawler functions respectively.
    """
    # __new__ runs before __init__ and controls how the new class is built
    # cls: the class being created here (supplied automatically by the Python interpreter)
    # name: the class name
    # bases: the tuple of parent classes
    # attrs: the dict of the class's methods and attributes
    def __new__(cls, name, bases, attrs):
        count = 0
        # Add two keys to the attrs dict; each value is a list
        attrs['__CrawlFunc__'] = []
        attrs['__CrawlName__'] = []
        for k, v in attrs.items():
            if 'crawl_' in k:
                attrs['__CrawlName__'].append(k)  # collect each crawler's function name
                attrs['__CrawlFunc__'].append(v)  # collect each crawler's function object
                print(k, v)
                # print(attrs['__CrawlName__'])
                count += 1
        for k in attrs['__CrawlName__']:
            # remove the original method entries from the class dict
            attrs.pop(k)
        attrs['__CrawlFuncCount__'] = count
        print(attrs)
        return type.__new__(cls, name, bases, attrs)

# One begets Two: create the proxy-getter class

class ProxyGetter(object, metaclass=ProxyMetaclass):
    def get_raw_proxies(self, site):
        proxies = []
        print('Site', site)
        for func in self.__CrawlFunc__:
            if func.__name__ == site:
                this_page_proxies = func(self)
                for proxy in this_page_proxies:
                    print('Getting', proxy, 'from', site)
                    proxies.append(proxy)
        print(proxies)
        return proxies

    def crawl_daili66(self, page_count=4):
        start_url = 'http://www.66ip.cn/{}.html'
        urls = [start_url.format(page) for page in range(1, page_count + 1)]  # format works like %s
        for url in urls:
            print('Crawling', url)
            html = get_page(url)
            if html:
                doc = pq(html)
                trs = doc('.containerbox table tr:gt(0)').items()
                for tr in trs:
                    ip = tr.find('td:nth-child(1)').text()
                    port = tr.find('td:nth-child(2)').text()
                    yield ':'.join([ip, port])

    '''
    def crawl_proxy360(self):
        start_url = 'http://www.proxy360.cn/Region/China'
        print('Crawling', start_url)
        html = get_page(start_url)
        if html:
            doc = pq(html)
            lines = doc('div[name="list_proxy_ip"]').items()
            for line in lines:
                ip = line.find('.tbBottomLine:nth-child(1)').text()
                port = line.find('.tbBottomLine:nth-child(2)').text()
                yield ':'.join([ip, port])
    '''

    def crawl_goubanjia(self):
        start_url = 'http://www.goubanjia.com/free/gngn/index.shtml'
        html = get_page(start_url)
        if html:
            doc = pq(html)
            tds = doc('td.ip').items()
            for td in tds:
                td.find('p').remove()
                yield td.text().replace(' ', '')

if __name__ == '__main__':
    # Two begets Three: instantiate ProxyGetter
    crawler = ProxyGetter()
    print(crawler.__CrawlName__)
    # Three begets all things
    for site_label in range(crawler.__CrawlFuncCount__):
        site = crawler.__CrawlName__[site_label]  # site_label is an index into the name list
        myProxies = crawler.get_raw_proxies(site)
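
The core trick above is that the metaclass's __new__ receives the class body as a plain dict, so it can register every method whose name starts with crawl_ before the class even exists. A minimal standalone sketch of the same registration pattern (RegisterMeta and Demo are illustrative names, not part of the code above):

# Minimal sketch of metaclass-based method registration; unlike the full
# version above it does not pop the methods out of the class dict.
class RegisterMeta(type):
    def __new__(cls, name, bases, attrs):
        # attrs is the class body as a dict: collect every 'crawl_*' method
        attrs['__CrawlFunc__'] = [v for k, v in attrs.items() if k.startswith('crawl_')]
        attrs['__CrawlFuncCount__'] = len(attrs['__CrawlFunc__'])
        return type.__new__(cls, name, bases, attrs)

class Demo(metaclass=RegisterMeta):
    def crawl_a(self):
        return 'a'
    def crawl_b(self):
        return 'b'

print(Demo.__CrawlFuncCount__)                    # 2
print([f.__name__ for f in Demo.__CrawlFunc__])   # ['crawl_a', 'crawl_b']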

Run results

D:\pythontest>python proxy_ip.py
crawl_goubanjia <function ProxyGetter.crawl_goubanjia at 0x00000000035D2510>
crawl_daili66 <function ProxyGetter.crawl_daili66 at 0x00000000035D2488>
{'__qualname__': 'ProxyGetter', '__module__': '__main__', '__CrawlName__': ['crawl_goubanjia', 'crawl_daili66'], '__CrawlFunc__': [<function ProxyGetter.crawl_goubanjia at 0x00000000035D2510>, <function ProxyGetter.crawl_daili66 at 0x00000000035D2488>], 'get_raw_proxies': <function ProxyGetter.get_raw_proxies at 0x000000000035D2400>, '__CrawlFuncCount__': 2}
['crawl_goubanjia', 'crawl_daili66']
Site crawl_goubanjia
Getting http://www.goubanjia.com/free/gngn/index.shtml
Getting result http://www.goubanjia.com/free/gngn/index.shtml 403
[]
Site crawl_daili66
Crawling http://www.66ip.cn/1.html
Getting http://www.66ip.cn/1.html
Getting result http://www.66ip.cn/1.html 200
Getting 123.163.97.198: from crawl_daili66
Getting 36.249.109.21: from crawl_daili66
Getting 163.204.245.52: from crawl_daili66
Getting 222.189.247.207: from crawl_daili66
Getting 87.250.218.12: from crawl_daili66
Getting 118.172.176.61: from crawl_daili66
Getting 134.119.214.206: from crawl_daili66
Getting 110.74.208.154: from crawl_daili66
Crawling http://www.66ip.cn/2.html
Getting http://www.66ip.cn/2.html
Getting result http://www.66ip.cn/2.html 200
Getting 120.234.138.102: from crawl_daili66
Getting 110.86.136.127: from crawl_daili66
Getting 59.57.38.197: from crawl_daili66
Getting 202.62.86.94: from crawl_daili66
Getting 210.22.176.146: from crawl_daili66
Getting 180.183.136.212: from crawl_daili66
Getting 183.87.153.98: from crawl_daili66
Getting 222.124.2.186: from crawl_daili66
Getting 123.169.126.9: from crawl_daili66
Getting 123.169.126.93: from crawl_daili66
Getting 158.255.249.58: from crawl_daili66
Getting 1.198.72.242: from crawl_daili66
Crawling http://www.66ip.cn/3.html
Getting http://www.66ip.cn/3.html
Getting result http://www.66ip.cn/3.html 200
Getting 163.204.246.10: from crawl_daili66
Getting 186.159.112.6: from crawl_daili66
Getting 163.204.246.102: from crawl_daili66
Getting 88.87.72.72: from crawl_daili66
Getting 193.169.118.6: from crawl_daili66
Getting 196.216.220.204: from crawl_daili66
Getting 185.109.62.124: from crawl_daili66
Getting 1.193.246.78: from crawl_daili66
Getting 188.131.239.119: from crawl_daili66
Getting 1.10.188.93: from crawl_daili66
Getting 182.116.237.203: from crawl_daili66
Getting 139.99.223.230: from crawl_daili66
Crawling http://www.66ip.cn/4.html
Getting http://www.66ip.cn/4.html
Getting result http://www.66ip.cn/4.html 200
Getting 163.204.246.232: from crawl_daili66
Getting 117.28.96.105: from crawl_daili66
Getting 202.29.220.34: from crawl_daili66
Getting 123.169.114.80: from crawl_daili66
Getting 115.42.34.3: from crawl_daili66
Getting 41.84.131.78: from crawl_daili66
Getting 123.163.96.207: from crawl_daili66
Getting 182.35.83.12: from crawl_daili66
Getting 191.241.226.230: from crawl_daili66
Getting 202.138.236.35: from crawl_daili66
Getting 194.1.193.226: from crawl_daili66
Getting 202.158.77.122: from crawl_daili66

['123.163.97.198:9999', '36.249.109.21:9999', '163.204.245.52:9999', '222.189.247.207:9999', '87.250.218.12:44168',
'118.172.176.61:8080', '134.119.214.206:1080', '110.74.208.154:21776', '120.234.138.102:53779', '110.86.136.127:9999',
'59.57.38.197:9999', '202.62.86.94:83', '210.22.176.146:37299', '180.183.136.212:8080', '183.87.153.98:49602',
'222.124.2.186:8080', '123.169.126.9:3', '123.169.126.93:9999', '158.255.249.58:50100', '1.198.72.242:9999',
'163.204.246.10:2', '186.159.112.6:53281', '163.204.246.102:9999', '88.87.72.72:8080', '193.169.118.6:53281',
'185.109.62.124:808', '1.193.246.78:9999', '188.131.239.119:8118', '1.10.188.93:34871', '182.116.237.203:9999',
'139.99.223.230:8080', '163.204.246.232:9999', '117.28.96.105:9999', '202.29.220.34:38961', '123.169.114.80:9999',
'115.42.34.3:8080', '41.84.131.78:53281', '123.163.96.207:9999', '182.35.83.12:9999', '191.241.226.230:53281',
'202.138.236.35:56413', '194.1.193.226:35646','196.216.220.204:36739', '202.158.77.122:47284']

// Looks like only one of the proxy sites can actually be crawled for data
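
Free proxies like these tend to die quickly, so it is worth filtering the collected list before using it. A rough sketch of a checker; the test endpoint (httpbin.org) and the 5-second timeout are my own choices, not part of the original post:

import requests

def check_proxy(proxy, test_url='http://httpbin.org/ip', timeout=5):
    # Returns True if the proxy answers a simple GET within the timeout.
    proxies = {'http': 'http://' + proxy, 'https': 'http://' + proxy}
    try:
        r = requests.get(test_url, proxies=proxies, timeout=timeout)
        return r.status_code == 200
    except requests.exceptions.RequestException:
        return False

raw = ['123.163.97.198:9999', '36.249.109.21:9999']  # e.g. entries from the list above
print([p for p in raw if check_proxy(p)])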
