- import requests
- from bs4 import BeautifulSoup
- import lxml
- import telnetlib #验证代理的可用性
- import pymysql.cursors
- import random
- import threading
# Base URL of the Xici proxy-listing site (home page).
BASEURL = ''
# Category ("more") listing URLs on that site: nn/, nt/, wn/ and wt/.
urls = [
    BASEURL + 'nn/',
    BASEURL + 'nt/',
    BASEURL + 'wn/',
    BASEURL + 'wt/',
]
# Request headers; a User-Agent entry is required.
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36',
}
# Shape of a proxies mapping, kept for reference:
# proxies = {'https': '', 'http': ''}
def get_cc():
    """Open a new MySQL connection and return it together with a cursor.

    Returns:
        A ``(connection, cursor)`` tuple. The cursor is a ``DictCursor``, so
        fetched rows are dictionaries keyed by column name. The caller is
        responsible for closing the connection.
    """
    # NOTE(review): host and credentials are hard-coded here; consider moving
    # them to configuration.
    connection = pymysql.connect(
        host='',
        port=3306,
        user='root',
        password='root',
        db='iptables',
        charset='utf8',
        cursorclass=pymysql.cursors.DictCursor,
    )
    cursor = connection.cursor()
    return connection, cursor
def save_ip_port(ip_port):
    """Insert one ``host:port`` string into the ``iptable`` table.

    A failed insert is reported on stdout and otherwise ignored; the
    connection is closed in every case.

    Args:
        ip_port: Proxy address as ``'host:port'``.
    """
    connection, cursor = get_cc()
    try:
        # ip_port is scraped from a web page, so it is passed as a query
        # parameter instead of being concatenated into the SQL text.
        cursor.execute('insert into iptable(ip_port) values(%s)', (ip_port,))
    except Exception:
        print('保存' + str(ip_port) + '失败!!!!!')
    else:
        connection.commit()
    finally:
        connection.close()
def get_ip_port():
    """Pick one stored proxy at random from the ``iptable`` table.

    Returns:
        An ``(id, ip_port)`` tuple taken from a randomly chosen row.

    Raises:
        ValueError: If the table contains no rows.
    """
    connection, cursor = get_cc()
    try:
        cursor.execute('select id,ip_port from iptable')
        rows = cursor.fetchall()
    finally:
        # The rows are already in memory, so the connection can be released
        # before a row is chosen.
        connection.close()
    if not rows:
        raise ValueError('iptable contains no proxies to choose from')
    chosen = random.choice(rows)
    return chosen['id'], chosen['ip_port']
def del_ip_port(id_num):
    """Delete the proxy row with the given primary key from ``iptable``.

    A failed delete is reported on stdout and otherwise ignored; the
    connection is closed in every case.

    Args:
        id_num: Value of the row's ``id`` column. It must be convertible to
            ``int``; anything else is reported as a failure.
    """
    connection, cursor = get_cc()
    try:
        # int() rejects non-numeric input (for example an 'ip:port' string)
        # before it reaches the database, where a string compared with a
        # numeric column could match an unrelated row.
        cursor.execute('delete from iptable where id = %s', (int(id_num),))
    except Exception:
        print('删除' + str(id_num) + '失败!!!!!')
    else:
        connection.commit()
    finally:
        connection.close()
def get_proxies(ip_port):
    """Build a proxies mapping for ``requests`` from a ``host:port`` string.

    Args:
        ip_port: Proxy address as ``'host:port'``.

    Returns:
        A dict with the keys ``'https'`` and ``'http'`` whose values are
        *ip_port* prefixed with ``https://`` and ``http://`` respectively.
    """
    return {
        'https': 'https://' + ip_port,
        'http': 'http://' + ip_port,
    }
def get_max_pagenum(url):
    """Return the highest page number shown on a category listing page.

    The number is read from the second-to-last link inside the page's
    ``<div class="pagination">`` element.

    Args:
        url: Category ("more") listing URL, e.g. the ``nn/`` or ``nt/`` page.

    Returns:
        The page number as an ``int``.

    Raises:
        ValueError: If the page has no pagination element, or the link text
            is not an integer.
    """
    # Same timeout as the page requests in main(), so a stalled server cannot
    # hang the calling thread forever.
    response = requests.get(url, headers=headers, timeout=5.0)
    soup = BeautifulSoup(response.content, 'lxml')
    pagination = soup.find('div', attrs={'class': 'pagination'})
    if pagination is None:
        raise ValueError(
            'no pagination block found at ' + url
            + ' (HTTP ' + str(response.status_code) + ')'
        )
    page_links = pagination.find_all('a')
    return int(page_links[-2].string)
def verifyProxyList(ip_port):
    """Check that a proxy accepts TCP connections and store it if it does.

    Every scraped ``host:port`` goes through this check: a reachable proxy is
    passed to ``save_ip_port``; an unreachable or malformed one is reported
    on stdout and dropped.

    Args:
        ip_port: Proxy address as ``'host:port'``.
    """
    # Imported locally so this function does not rely on telnetlib, which
    # PEP 594 deprecated and Python 3.13 removed. telnetlib.Telnet(host,
    # port, timeout) performed the same socket.create_connection call.
    import socket

    try:
        host, port = ip_port.split(':')
        probe = socket.create_connection((host, int(port)), timeout=5)
    except (ValueError, OSError):
        # ValueError: not exactly 'host:port' or a non-numeric port.
        # OSError: refused, timed out, or unresolvable host.
        print('---Failur:' + ip_port)
    else:
        # Only reachability matters; release the socket before saving.
        probe.close()
        save_ip_port(ip_port)
def _delete_proxy_by_ip_port(ip_port):
    """Delete the ``iptable`` row(s) whose ``ip_port`` column equals *ip_port*."""
    connection, cursor = get_cc()
    try:
        cursor.execute('delete from iptable where ip_port = %s', (ip_port,))
        connection.commit()
    finally:
        connection.close()


def main(url, proxies):
    """Scrape one listing page and verify every proxy found on it.

    The page is fetched through *proxies*. On an unexpected HTTP status the
    current proxy is removed from the database, a replacement is drawn with
    ``get_ip_port`` and written into *proxies* in place, and the page is
    skipped. Otherwise each table row after the header is turned into a
    ``host:port`` string and handed to ``verifyProxyList``.

    Args:
        url: Listing page URL (a category URL followed by a page number).
        proxies: Mapping with ``'http'`` and ``'https'`` proxy URLs, as built
            by ``get_proxies``. It is updated in place when the proxy is
            rotated so that the caller's later requests use the new proxy.
    """
    try:
        response = requests.get(url, headers=headers, proxies=proxies, timeout=5.0)
        # NOTE(review): the comparison constant was lost in the original
        # line; 200 is assumed from the adjacent comments ("503 means the IP
        # is blocked", "response is not a normal status") - confirm.
        if response.status_code != 200:
            # The stored value is 'http://host:port'; strip the scheme to get
            # the ip_port string held in the database. The row is deleted by
            # that value because only the address, not the row id, is known
            # here.
            old_ip_port = proxies['http'][len('http://'):]
            _delete_proxy_by_ip_port(old_ip_port)
            _, ip_port = get_ip_port()
            proxies.update(get_proxies(ip_port))
            print(str(proxies))
            return
        soup = BeautifulSoup(response.content, 'lxml')
        rows = soup.find_all('tr')
        # The first <tr> holds <th> header cells, not data.
        for row in rows[1:]:
            cells = row.find_all('td')
            if len(cells) < 3:
                continue
            ip_text, port_text = cells[1].string, cells[2].string
            if ip_text is None or port_text is None:
                # Skip a malformed row rather than abandoning the whole page.
                continue
            verifyProxyList(ip_text + ':' + port_text)
    except Exception:
        print('请求异常......')
class myThread(threading.Thread):
    """Thread that crawls every page of one proxy-list category."""

    def __init__(self, threadID, name, url):
        """Store the thread's identity and the category URL it will crawl.

        Args:
            threadID: Caller-chosen identifier, kept as ``self.threadID``.
            name: Thread name, printed when the thread starts.
            url: Category listing URL whose pages are crawled.
        """
        threading.Thread.__init__(self, name=name)
        self.threadID = threadID
        self.url = url

    def run(self):
        """Crawl pages 1 through the category's last page, in order."""
        print('正在执行线程:' + self.name)
        _, ip_port = get_ip_port()
        proxies = get_proxies(ip_port)
        max_pagenum = get_max_pagenum(self.url)
        # The category URLs already end in '/', so strip it before appending
        # the page number to avoid a doubled slash.
        base = self.url.rstrip('/')
        # max_pagenum is itself a valid page, hence the inclusive upper bound.
        for page in range(1, max_pagenum + 1):
            main(base + '/' + str(page), proxies)
# Crawl the Xici proxy listings with four threads, one per category URL.
if __name__ == '__main__':
    workers = [
        myThread(1, "Thread-1", urls[0]),
        myThread(2, "Thread-2", urls[1]),
        myThread(3, "Thread-3", urls[2]),
        myThread(4, "Thread-4", urls[3]),
    ]
    # Start every thread before joining any, so the categories are crawled
    # concurrently.
    for worker in workers:
        worker.start()
    for worker in workers:
        worker.join()
