Python爬虫 | IP池的使用
- 爬虫中为什么需要使用代理
- 代理的分类:
- 免费代理ip提供网站
- 匿名度:
- 透明:知道是代理ip,也会知道你的真实ip
- 匿名:知道是代理ip,不会知道你的真实ip
- 高匿:不知道是代理ip,不会知道你的真实ip
- 类型:
- http:只能请求http开头的url
- https:只能请求https开头的url
import random

import requests

# Browser-like User-Agent sent with the request.
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.103 Safari/537.36'
}
# NOTE(review): the target URL and the proxy addresses below are empty
# placeholders in this snippet; fill them in before running.
url = ''
# Candidate proxies. The key of each entry (the proxy type) must match the
# scheme of the requested URL.
proxy_list = [
    {"http": ""},
    {'http': ''}
]
# Pick one proxy at random for this request.
proxy = random.choice(proxy_list)
# A timeout keeps a dead proxy from hanging the script forever.
page_text = requests.get(url=url, headers=headers, proxies=proxy, timeout=5).text
with open('ip.html', 'w', encoding='utf-8') as fp:
    fp.write(page_text)
print('over!')
- import requests
- from lxml import etree
- import time
- import random
- from fake_useragent import UserAgent
class GetProxyIP(object):
    """Scrape proxy candidates from listing pages and keep the ones that work.

    Every ``ip:port`` pair that answers a test request with HTTP 200 is
    appended to ``proxies.txt``.
    """

    def __init__(self):
        # Page URL template; main() fills in the page number via str.format.
        # NOTE(review): empty placeholder here -- set the real listing URL.
        self.url = ''
        # Proxies used when fetching the listing pages themselves.
        self.proxies = {
            'http': '',
            'https': ''}

    def get_random_ua(self):
        """Return a random User-Agent string produced by fake_useragent."""
        ua = UserAgent()
        useragent = ua.random
        return useragent

    def get_ip_file(self, url):
        """Fetch one listing page and test every ip/port row found in it.

        Args:
            url: Address of the listing page to download.
        """
        headers = {'User-Agent': self.get_random_ua()}
        html = requests.get(url=url, proxies=self.proxies, headers=headers, timeout=5).content.decode('utf-8', 'ignore')
        parse_html = etree.HTML(html)
        # One <tr> per proxy entry.
        tr_list = parse_html.xpath('//tr')
        # The first row is skipped (presumably the table header).
        for tr in tr_list[1:]:
            ip_cells = tr.xpath('./td[2]/text()')
            port_cells = tr.xpath('./td[3]/text()')
            # Rows without both cells (ads, separators, ...) are ignored
            # instead of raising IndexError and aborting the whole page.
            if not ip_cells or not port_cells:
                continue
            self.test_proxy_ip(ip_cells[0].strip(), port_cells[0].strip())

    def test_proxy_ip(self, ip, port):
        """Probe one proxy and append it to ``proxies.txt`` when it works.

        Args:
            ip: Proxy host as text.
            port: Proxy port as text.
        """
        proxies = {
            'http': 'http://{}:{}'.format(ip, port),
            'https': 'https://{}:{}'.format(ip, port), }
        # NOTE(review): empty placeholder -- set the URL used for probing.
        test_url = ''
        try:
            res = requests.get(url=test_url, proxies=proxies, timeout=8)
        except Exception:
            # Best-effort probe: whatever went wrong, this proxy is unusable.
            print(ip, port, 'Failed')
            return
        # Saving happens outside the try so that a file error is not
        # misreported as a proxy failure.
        if res.status_code == 200:
            print(ip, ":", port, 'Success')
            with open('proxies.txt', 'a') as f:
                f.write(ip + ':' + port + '\n')

    def main(self):
        """Walk listing pages 1..1000, pausing 5-10 seconds between pages."""
        for i in range(1, 1001):
            url = self.url.format(i)
            self.get_ip_file(url)
            time.sleep(random.randint(5, 10))
if __name__ == '__main__':
    # Start the crawl only when executed as a script, not on import.
    spider = GetProxyIP()
    spider.main()
- import random
- import requests
class BaiduSpider(object):
    """Download a page through a random proxy taken from ``proxies.txt``."""

    def __init__(self):
        # NOTE(review): empty placeholder -- set the page to download.
        self.url = ''
        self.headers = {'User-Agent': 'Mozilla/5.0'}
        # 1-based attempt counter; get_html() stops once it exceeds 3.
        self.flag = 1

    def get_proxies(self):
        """Return a requests-style proxies dict for one randomly chosen proxy.

        Reads ``proxies.txt`` (one ``ip:port`` per line). Blank lines and
        lines without a colon are ignored, and surrounding whitespace is
        stripped, so a missing trailing newline no longer cuts off the last
        digit of the port.

        Raises:
            IndexError: if the file holds no usable ``ip:port`` entry.
        """
        with open('proxies.txt', 'r') as f:
            candidates = [line.strip() for line in f if ':' in line]
        # random.choice raises IndexError on an empty list.
        host, port = random.choice(candidates).split(':')[:2]
        return {
            'http': 'http://{}:{}'.format(host, port),
            'https': 'https://{}:{}'.format(host, port)
        }

    def get_html(self):
        """Fetch ``self.url`` through random proxies, at most three attempts.

        ``self.flag`` counts failed attempts and is not reset here, so once
        three attempts have failed, later calls return ``None`` immediately.

        Returns:
            The page text on success, otherwise ``None``.
        """
        while self.flag <= 3:
            # A fresh random proxy for every attempt.
            proxies = self.get_proxies()
            try:
                html = requests.get(url=self.url, proxies=proxies, headers=self.headers, timeout=5).text
            except Exception:
                # Best-effort: any failure just means "try another proxy".
                print('Retry')
                self.flag += 1
                continue
            print(html)
            return html
        return None
if __name__ == '__main__':
    # Run the download only when executed as a script, not on import.
    spider = BaiduSpider()
    spider.get_html()
- import requests
- from fake_useragent import UserAgent
# Build one random User-Agent header at import time; the requests made by
# ip_test() reuse it.
ua = UserAgent()
useragent = ua.random
headers = {'User-Agent': useragent}
def ip_test(ip):
    """Return True if a request sent through proxy ``ip`` answers HTTP 200.

    Args:
        ip: Proxy address as ``"host:port"`` text.

    Returns:
        True when the test request returns status 200. False for a blank or
        malformed address, for any request failure, and for any other status.
    """
    # NOTE(review): empty placeholder -- set the URL used for probing.
    url = ''
    ip_port = ip.strip().split(':')
    # Blank lines and entries without a port cannot be used as a proxy.
    if len(ip_port) < 2 or not ip_port[0] or not ip_port[1]:
        return False
    proxies = {
        'http': 'http://{}:{}'.format(ip_port[0], ip_port[1]),
        'https': 'https://{}:{}'.format(ip_port[0], ip_port[1]),
    }
    try:
        res = requests.get(url=url, headers=headers, proxies=proxies, timeout=5)
    except requests.RequestException:
        # A dead or slow proxy is an expected outcome, not an error.
        return False
    return res.status_code == 200
# Extract proxy IPs
def get_ip_list():
    """Download an ``ip:port`` list from the proxy API and save working ones.

    Every entry for which ``ip_test`` returns True is appended to
    ``proxy_ip.txt``, one per line.
    """
    # Kuaidaili API endpoint.
    # NOTE(review): empty placeholder -- set the real API URL.
    api_url = ''
    html = requests.get(api_url, timeout=5).content.decode('utf-8', 'ignore')
    # Open the output file once instead of once per candidate.
    with open('proxy_ip.txt', 'a') as f:
        for ip in html.splitlines():
            ip = ip.strip()
            # Skip blank lines (e.g. the trailing one) before probing.
            if ip and ip_test(ip):
                f.write(ip + '\n')
if __name__ == '__main__':
    # Collect proxies only when executed as a script, not on import.
    get_ip_list()
# Private (authenticated) proxies carry the credentials inside the proxy URL.
# General form -- the strings are placeholders meaning
# scheme://username:password@IP:port, not runnable values.
proxies = {
    '协议':'协议://用户名:密码@IP:端口号'
}
# The same template spelled out for both schemes.
proxies = {
    'http':'http://用户名:密码@IP:端口号',
    'https':'https://用户名:密码@IP:端口号'
}
# Concrete example.
# NOTE(review): the credentials are hard-coded and the host:port part after
# '@' is missing; load real values from configuration instead of committing
# them to the source.
proxies = {
    'http': 'http://309435365:szayclhp@',
    'https':'https://309435365:szayclhp@',
}
- # 获取开放代理的接口
- import requests
- from fake_useragent import UserAgent
# Build one random User-Agent header at import time; the requests made by
# ip_test() reuse it.
ua = UserAgent()
useragent = ua.random
headers = {'User-Agent': useragent}
def ip_test(ip):
    """Return True if an authenticated proxy ``ip`` answers HTTP 200.

    Args:
        ip: Proxy address as ``"host:port"`` text.

    Returns:
        True when the test request returns status 200. False for a blank or
        malformed address, for any request failure, and for any other status.
        The outcome is also printed.
    """
    # NOTE(review): empty placeholder -- set the URL used for probing.
    url = ''
    ip_port = ip.strip().split(':')
    # Blank lines and entries without a port cannot be used as a proxy.
    if len(ip_port) < 2 or not ip_port[0] or not ip_port[1]:
        print("错误")
        return False
    # NOTE(review): credentials are hard-coded; load them from configuration
    # instead of committing them to the source.
    # Both schemes are routed through the same http:// proxy URL.
    proxies = {
        'http': 'http://1786088386:b95djiha@{}:{}'.format(ip_port[0], ip_port[1]),
        'https': 'http://1786088386:b95djiha@{}:{}'.format(ip_port[0], ip_port[1]),
    }
    try:
        res = requests.get(url=url, headers=headers, proxies=proxies, timeout=5)
    except requests.RequestException:
        # A dead or slow proxy is an expected outcome, not an error.
        print("错误")
        return False
    if res.status_code == 200:
        print("OK")
        return True
    print(res.status_code)
    print("错误")
    return False
# Extract proxy IPs
def get_ip_list():
    """Download an ``ip:port`` list from the proxy API and save working ones.

    Every entry for which ``ip_test`` returns True is appended to
    ``proxy_ip.txt``, one per line.
    """
    # Kuaidaili API endpoint.
    # NOTE(review): empty placeholder -- set the real API URL.
    api_url = ''
    html = requests.get(api_url, timeout=5).content.decode('utf-8', 'ignore')
    # Open the output file once instead of once per candidate.
    with open('proxy_ip.txt', 'a') as f:
        for ip in html.splitlines():
            ip = ip.strip()
            # Skip blank lines (e.g. the trailing one) before probing.
            if ip and ip_test(ip):
                f.write(ip + '\n')
if __name__ == '__main__':
    # Collect proxies only when executed as a script, not on import.
    get_ip_list()
- 写一个类;
- get_ip() requests请求接口,得到ip和port;
- test_ip() 请求某一网站,根据状态码,或用 in 判断响应中是否包含某一内容,来判断此ip是否可用,返回True或False即可;
- save_ip()测试成功后保存;
