搭建免费的代理ip池

需要解决的问题:

使用什么方式存储ip
- 文件存储
  
  缺点: 打开文件修改文件操作较麻烦
- mysql
  
  缺点: 查询速度较慢
- mongodb
  
  缺点: 查询速度较慢. 没有查重功能
- redis --> 使用redis存储最为合适
所以 -> 数据结构采用redis中的zset有序集合
获取ip的网站
- https://ip.jiangxianli.com/
- https://free.kuaidaili.com/free/intr/
项目架构如何设计

项目架构

获取ip
筛选ip
验证ip的有效性
提供api

项目结构图

项目结构如下:

项目代码

code文件夹

redis_proxy.py

# -*- encoding:utf-8 -*-

# @time: 2022/7/4 11:32

# @author: Maxs_hu

"""

这里用来做redis中间商. 去控制redis和ip之间的调用关系

"""

from redis import Redis

import random

class RedisProxy:
    """Scoreboard for proxy addresses, stored in one Redis sorted set.

    Every proxy is a member of the ``proxy_ip`` sorted set and its score
    encodes how trustworthy it currently is:

    * 10   -- freshly collected, not verified yet (see ``add_ip``)
    * 100  -- last verification succeeded (see ``set_max_score``)
    * each failed verification subtracts 1; a proxy whose score is no
      longer positive is removed on the next failure (see ``deduct_score``)
    """

    KEY = 'proxy_ip'
    INITIAL_SCORE = 10
    MAX_SCORE = 100

    def __init__(self, host='localhost', port=6379, db=9, password=123456):
        """Create the Redis client.

        The defaults reproduce the previously hard-coded connection
        settings, so ``RedisProxy()`` behaves exactly as before.
        """
        self.red = Redis(
            host=host,
            port=port,
            db=db,
            password=password,
            decode_responses=True
        )

    def add_ip(self, ip):
        """Store a newly collected proxy without touching an existing score.

        ``ZADD ... NX`` only inserts missing members, so the existence check
        and the insert are a single atomic command. The previous
        ``if not zscore(...)`` test treated an existing score of 0 as
        "missing" and reset that proxy back to the initial score.
        """
        added = self.red.zadd(self.KEY, {ip: self.INITIAL_SCORE}, nx=True)
        if added:
            print('proxy_ip存储完毕', ip)
        else:
            print('存在重复', ip)

    def get_all_proxy(self):
        """Return every stored proxy (the whole sorted set, index 0 to -1)."""
        return self.red.zrange(self.KEY, 0, -1)

    def set_max_score(self, ip):
        """Mark a proxy as verified by setting its score to the maximum."""
        # zadd takes a {member: score} mapping.
        self.red.zadd(self.KEY, {ip: self.MAX_SCORE})

    def deduct_score(self, ip):
        """Penalise a proxy after a failed verification.

        A positive score is decremented by one; a proxy whose score is
        already zero or below is removed. A proxy that is no longer in the
        set (``zscore`` returns ``None``) is ignored instead of raising
        ``TypeError`` on the ``None > 0`` comparison.
        """
        score = self.red.zscore(self.KEY, ip)
        if score is None:
            return
        if score > 0:
            self.red.zincrby(self.KEY, -1, ip)
        else:
            self.red.zrem(self.KEY, ip)

    def effect_ip(self):
        """Return one random usable proxy, or ``None`` when there is none.

        Proxies with the maximum score are preferred. The fallback range
        11..99 holds proxies that were verified once and have lost points
        since; never-verified proxies sit at 10 or below and are excluded.
        """
        for low, high in ((self.MAX_SCORE, self.MAX_SCORE), (11, 99)):
            ips = self.red.zrangebyscore(self.KEY, low, high, 0, -1)
            if ips:
                return random.choice(ips)
        print('无可用ip')
        return None

ip_collection.py

# -*- encoding:utf-8 -*-

# @time: 2022/7/4 11:32

# @author: Maxs_hu

"""

这里用来收集ip

"""

from redis_proxy import RedisProxy

import requests

from lxml import html

from multiprocessing import Process

import time

import random

def get_kuai_ip(red):
    """Scrape free.kuaidaili.com and store every ``ip:port`` table row.

    Args:
        red: object exposing ``add_ip(str)``; each proxy found is passed
            to it.

    Network errors and timeouts raised by ``requests`` propagate to the
    caller.
    """
    url = "https://free.kuaidaili.com/free/intr/"
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/102.0.0.0 Safari/537.36"
    }
    # Without a timeout a stalled site would block the collector forever.
    resp = requests.get(url, headers=headers, timeout=10)
    et = html.etree.HTML(resp.text)
    if et is None:  # nothing parseable came back
        return
    for tr in et.xpath('//table//tr'):
        ip = tr.xpath('./td[1]/text()')
        port = tr.xpath('./td[2]/text()')
        # Header rows have no <td>; also skip rows lacking a port cell
        # instead of failing on port[0].
        if not ip or not port:
            continue
        red.add_ip(ip[0].strip() + ":" + port[0].strip())

def get_unknown_ip(red):
    """Scrape ip.jiangxianli.com and store every ``ip:port`` table row.

    Args:
        red: object exposing ``add_ip(str)``; each proxy found is passed
            to it.

    Network errors and timeouts raised by ``requests`` propagate to the
    caller.
    """
    url = "https://ip.jiangxianli.com/"
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/102.0.0.0 Safari/537.36"
    }
    # Without a timeout a stalled site would block the collector forever.
    resp = requests.get(url, headers=headers, timeout=10)
    et = html.etree.HTML(resp.text)
    if et is None:  # nothing parseable came back
        return
    for tr in et.xpath('//table//tr'):
        ip = tr.xpath('./td[1]/text()')
        port = tr.xpath('./td[2]/text()')
        # Header rows have no <td>; also skip rows lacking a port cell
        # instead of failing on port[0].
        if not ip or not port:
            continue
        red.add_ip(ip[0].strip() + ":" + port[0].strip())

def get_happy_ip(red):
    """Scrape one random page (1-5) of kxdaili.com and store its proxies.

    Args:
        red: object exposing ``add_ip(str)``; each ``ip:port`` found is
            passed to it.

    Network errors and timeouts raised by ``requests`` propagate to the
    caller.
    """
    page = random.randint(1, 5)
    url = f'http://www.kxdaili.com/dailiip/2/{page}.html'
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/102.0.0.0 Safari/537.36"
    }
    # Without a timeout a stalled site would block the collector forever.
    resp = requests.get(url, headers=headers, timeout=10)
    et = html.etree.HTML(resp.text)
    if et is None:  # nothing parseable came back
        return
    for tr in et.xpath('//table//tr'):
        ip = tr.xpath('./td[1]/text()')
        port = tr.xpath('./td[2]/text()')
        # Header rows have no <td>; also skip rows lacking a port cell
        # instead of failing on port[0].
        if not ip or not port:
            continue
        red.add_ip(ip[0].strip() + ":" + port[0].strip())

def get_nima_ip(red):
    """Scrape nimadaili.com and store the first cell of every table row.

    Args:
        red: object exposing ``add_ip(str)``; each value found is passed
            to it.

    Network errors and timeouts raised by ``requests`` propagate to the
    caller.
    """
    url = 'http://www.nimadaili.com/'
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/102.0.0.0 Safari/537.36"
    }
    # Without a timeout a stalled site would block the collector forever.
    resp = requests.get(url, headers=headers, timeout=10)
    et = html.etree.HTML(resp.text)
    if et is None:  # nothing parseable came back
        return
    for tr in et.xpath('//table//tr'):
        # Some rows have no <td> (empty result), so index only after the
        # emptiness check.
        # NOTE(review): presumably this cell already holds "ip:port" --
        # confirm against the live page.
        ip = tr.xpath('./td[1]/text()')
        if not ip:
            continue
        red.add_ip(ip[0].strip())

def get_89_ip(red):
    """Scrape one random page (1-26) of 89ip.cn and store its first column.

    Args:
        red: object exposing ``add_ip(str)``; each value found is passed
            to it.

    Network errors and timeouts raised by ``requests`` propagate to the
    caller.
    """
    page = random.randint(1, 26)
    url = f'https://www.89ip.cn/index_{page}.html'
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/102.0.0.0 Safari/537.36"
    }
    # Without a timeout a stalled site would block the collector forever.
    resp = requests.get(url, headers=headers, timeout=10)
    et = html.etree.HTML(resp.text)
    if et is None:  # nothing parseable came back
        return
    for tr in et.xpath('//table//tr'):
        # NOTE(review): only the first column is stored. If this site
        # lists the port in a separate column, the stored value has no
        # port -- verify against the live page.
        ip = tr.xpath('./td[1]/text()')
        if not ip:
            continue
        red.add_ip(ip[0].strip())

def main():
    """Run the collector loop forever: scrape every enabled source, sleep 60 s.

    Each source runs inside its own try/except, so one failing site no
    longer prevents the remaining sources from being scraped in the same
    round. Start messages are printed only for sources that actually run.
    """
    red = RedisProxy()

    # (start message, collector) pairs -- add new sources here.
    # get_unknown_ip and get_nima_ip exist but are currently disabled.
    collectors = (
        ('>>>开始收集快代理ip', get_kuai_ip),
        ('>>>开始收集开心代理ip', get_happy_ip),
        ('>>>开始收集89代理ip', get_89_ip),
    )

    print("开始采集数据")
    while True:
        for message, collect in collectors:
            print(message)
            try:
                collect(red)
            except Exception as e:
                # Top-level boundary: report the failure and move on.
                print('ip储存出错了', e)
        time.sleep(60)

if __name__ == '__main__':

    main()

    # Alternative kept for reference: run the collector in a child process

    # p = Process(target=main)

    # p.start()

ip_verify.py

# -*- encoding:utf-8 -*-

# @time: 2022/7/4 11:34

# @author: Maxs_hu

"""

这里用来验证ip的可用性: 使用协程发送请求增加效率

"""

from redis_proxy import RedisProxy

from multiprocessing import Process

import asyncio

import aiohttp

import time

async def verify_ip(ip, red, sem):
    """Probe one proxy and update its score.

    Fetches http://www.baidu.com/ through ``http://<ip>`` with a total
    timeout of 10 seconds while holding ``sem``. A 200/302 response sets
    the proxy to the maximum score via ``red.set_max_score``; any other
    status, or any exception, costs it one point via ``red.deduct_score``.
    """
    limit = aiohttp.ClientTimeout(total=10)  # give up after ten seconds
    try:
        async with sem, aiohttp.ClientSession() as session:
            async with session.get(url='http://www.baidu.com/',
                                   proxy='http://' + ip,
                                   timeout=limit) as resp:
                await resp.text()  # read the whole body through the proxy
                if resp.status not in [200, 302]:
                    # Unusable: take a point off
                    red.deduct_score(ip)
                    print('问题ip. 扣一分', ip)
                else:
                    # Usable: raise to the maximum score
                    red.set_max_score(ip)
                    print('验证没有问题. 分值拉满~', ip)
    except Exception as e:
        print('出错了', e)
        red.deduct_score(ip)
        print('问题ip. 扣一分', ip)

async def task(red):
    """Verify every proxy returned by ``red.get_all_proxy()`` concurrently.

    One ``verify_ip`` task is created per proxy; a semaphore of 30 is
    shared by all of them. Returns once all tasks have finished, or
    immediately when there are no proxies.
    """
    sem = asyncio.Semaphore(30)  # shared limit handed to every verify_ip
    pending = [
        asyncio.create_task(verify_ip(addr, red, sem))
        for addr in red.get_all_proxy()
    ]
    if pending:
        await asyncio.wait(pending)

def main():
    """Run the verification loop forever.

    Waits 5 seconds so the collector can store some proxies first, then
    repeatedly runs one full verification pass followed by a 100-second
    pause. A failing pass is reported and followed by the same pause.
    """
    red = RedisProxy()
    time.sleep(5)  # initial delay: let the collector gather some data
    print("开始验证可用性")

    pause = 100
    while True:
        try:
            asyncio.run(task(red))
            time.sleep(pause)
        except Exception as e:
            print("ip_verify出错了", e)
            time.sleep(pause)

if __name__ == '__main__':

    main()

    # Alternative kept for reference: run the verifier in a child process

    # p = Process(target=main())

    # p.start()

ip_api.py

# -*- encoding:utf-8 -*-

# @time: 2022/7/4 11:35

# @author: Maxs_hu

"""

这里用来提供给用户ip接口. 通过写后台服务器. 用户访问我们的服务器就可以得到可用的代理ip:

   1. flask

   2. sanic --> 今天使用这个要稍微简单一点

"""

from redis_proxy import RedisProxy

from sanic import Sanic, json

from sanic_cors import CORS

from multiprocessing import Process

# Create the Sanic application

app = Sanic('ip')  # the application name is arbitrary

# Enable CORS on the app to solve cross-origin request problems

CORS(app)

red = RedisProxy()

@app.route('maxs_hu_ip')  # register the route
def api(req):
    """Respond with ``{"ip": <proxy or null>}`` taken from ``red.effect_ip()``.

    ``req`` is the request object passed as the first handler argument;
    it is not used.
    """
    return json({"ip": red.effect_ip()})

def main():

   """Start the Sanic application on 127.0.0.1:1234."""

   # Start serving requests

   app.run(host='127.0.0.1', port=1234)

if __name__ == '__main__':

   main()

   # Alternative kept for reference: run the API server in a child process

   # p = Process(target=main())

   # p.start()

runner.py

# -*- encoding:utf-8 -*-

# @time: 2022/7/5 17:36

# @author: Maxs_hu

from ip_api import main as api_run

from ip_collection import main as coll_run

from ip_verify import main as veri_run

from multiprocessing import Process

def main():
    """Launch the API server, the collector and the verifier as three processes.

    All three ``Process`` objects are created first and then started in
    the order api, collector, verifier.
    """
    # Pass the function objects themselves as targets, not their results.
    workers = [Process(target=entry) for entry in (api_run, coll_run, veri_run)]
    for worker in workers:
        worker.start()

# Start the three processes only when this file is run as a script.
if __name__ == '__main__':

    main()

测试ip是否可用.py

# -*- encoding:utf-8 -*-

# @time: 2022/7/5 18:15

# @author: Maxs_hu

import requests

def get_proxy():
    """Fetch one proxy from the local pool API.

    Returns:
        The decoded JSON body of ``http://127.0.0.1:1234/maxs_hu_ip``;
        the API handler builds it as ``{"ip": <value>}``.

    Raises:
        requests.RequestException: on connection failure, timeout or a
            non-2xx response.
    """
    url = "http://127.0.0.1:1234/maxs_hu_ip"
    # Bounded wait so a dead API server cannot hang the caller.
    resp = requests.get(url, timeout=5)
    resp.raise_for_status()
    return resp.json()

def main():
    """Look up the location of one pooled proxy by requesting through it.

    The proxy is fetched exactly once, so the address in the query string
    and the proxy used for the request are always the same. Previously
    three separate ``get_proxy()`` calls could return three different
    addresses.
    """
    ip = get_proxy()["ip"]
    if not ip:
        # The API returned no address; concatenating it would raise TypeError.
        print('无可用ip')
        return

    url = 'http://mip.chinaz.com/?query=' + ip
    proxy_url = 'http://' + ip
    proxies = {
        "http": proxy_url,
        "https": proxy_url  # the free proxies only support plain http
    }
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/102.0.0.0 Safari/537.36",
    }
    # Free proxies are often dead; bound the wait instead of hanging.
    resp = requests.get(url, proxies=proxies, headers=headers, timeout=10)
    resp.encoding = 'utf-8'
    print(resp.text)  # page containing the physical location

# Run the proxy check only when this file is executed as a script.
if __name__ == '__main__':

    main()

运行效果

项目运行截图:

redis储存截图:

总结

免费代理ip只支持http的网页操作. 并不好用. 如果有需求可以进行购买然后加入ip代理池
网页部署到自己的服务器上. 别人访问自己的服务器. 以后学了全栈可以加上登录和付费功能, 实现功能的进一步拓展
项目架构是生产者消费者模型. 三个模块同时运行. 每个模块为一个进程. 互不影响
代理设计细节有待处理. 但总体运行效果还可以. 遇到问题再修改

基于后端和爬虫创建的代理ip池的更多相关文章

构建一个给爬虫使用的代理IP池
做网络爬虫时,一般对代理IP的需求量比较大.因为在爬取网站信息的过程中,很多网站做了反爬虫策略,可能会对每个IP做频次控制.这样我们在爬取网站时就需要很多代理IP. 代理IP的获取,可以从以下几个途径 ...
爬虫爬取代理IP池及代理IP的验证
最近项目内容需要引入代理IP去爬取内容. 为了项目持续运行,需要不断构造.维护.验证代理IP. 为了绕过服务端对IP 和频率的限制,为了阻止服务端获取真正的主机IP. 一.服务器如何获取客户端IP ...
Python爬虫教程-11-proxy代理IP，隐藏地址（猫眼电影）
Python爬虫教程-11-proxy代理IP,隐藏地址(猫眼电影) ProxyHandler处理(代理服务器),使用代理IP,是爬虫的常用手段,通常使用UserAgent 伪装浏览器爬取仍然可能被网 ...
反爬虫2（代理ip）
在进行爬虫访问时,被访问主机除了会校验访问身份,还会校验访问者的ip, 当短时间同ip大量访问时,主机有可能会拒绝返回,所以就现需要代理ip, 百度中可以获取到大量的免费的代理ip(ps:注意在访问 ...
Python爬虫篇（代理IP）--lizaza.cn
在做网络爬虫的过程中经常会遇到请求次数过多无法访问的现象,这种情况下就可以使用代理IP来解决.但是网上的代理IP要么收费,要么没有API接口.秉着能省则省的原则,自己创建一个代理IP库. 废话不多说, ...
python爬虫构建代理ip池抓取数据库的示例代码
爬虫的小伙伴,肯定经常遇到ip被封的情况,而现在网络上的代理ip免费的已经很难找了,那么现在就用python的requests库从爬取代理ip,创建一个ip代理池,以备使用. 本代码包括ip的爬取,检 ...
爬虫入门到放弃系列05：从程序模块设计到代理IP池
前言上篇文章吧啦吧啦讲了一些有的没的,现在还是回到主题写点技术相关的.本篇文章作为基础爬虫知识的最后一篇,将以爬虫程序的模块设计来完结. 在我漫(liang)长(nian)的爬虫开发生涯中,我通常将 ...
【python3】如何建立爬虫代理ip池
一.为什么需要建立爬虫代理ip池在众多的网站防爬措施中,有一种是根据ip的访问频率进行限制的,在某段时间内,当某个ip的访问量达到一定的阀值时,该ip会被拉黑.在一段时间内被禁止访问. 这种时候,可 ...
Python爬虫代理IP池
目录[-] 1.问题 2.代理池设计 3.代码模块 4.安装 5.使用 6.最后在公司做分布式深网爬虫,搭建了一套稳定的代理池服务,为上千个爬虫提供有效的代理,保证各个爬虫拿到的都是对应网站有效的代 ...

随机推荐

Day 007：PAT训练--1108 Finding Average (20 分)
话不多说: 该题要求将给定的所有数分为两类,其中这两类的个数差距最小,且这两类分别的和差距最大. 可以发现,针对第一个要求,个数差距最小,当给定个数为偶数时,二分即差距为0,最小:若给定个数为奇数时, ...
Jwt验证登录
练习模板:https://gitee.com/zh1446802857/swagger-multi-version-api.git Jwt在我的认知里,是一套门锁.别人(用户)需要用到你的接口的时 ...
使用CreateThreadPool创建线程池
使用Windows API函数来创建线程池,可以极大的方便了自己编写线程池的繁琐步骤. 使用CreateThreadPool来创建一个线程池,需要在创建完成后,初始化线程池的状态,并且在不需要的时候清 ...
一文带你速懂虚拟化KVM和XEN
来源: 蛋蛋团 前言 "云计算"这个技术经过十余年的普及到如今已经可以称得上是家喻户晓了,基于云计算平台,在多个领域内创造了一个又一个的记录:电子商务里亿万人同时在线抢购的 ...
以rem为单位，数值较小，border-radius：50%部分浏览器渲染不圆方法
元素使用rem做单位且较小时,对于border-radius:50%在部分浏览器不圆解决方法: 1.将原来宽高扩大至两倍(.1rem --> .2rem),再使用transform:scale( ...
ElasticSearch7.3学习(二十二)----Text字段排序、Scroll分批查询场景解析
1.Text字段排序场景:数据库中按照某个字段排序,sql只需写order by 字段名即可,如果es对一个text field进行排序,es中无法排序.因为文档入倒排索引表时,分词存入,es无法知 ...
Java学习笔记-基础语法Ⅷ-泛型、Map
泛型泛型本质上是参数化类型,也就是说所操作的数据类型被指定为一个参数,即将类型由原来的具体的类型参数化,然后在使用/调用时传入具体的类型,这种参数类型可以用在类.方法和接口中,分别为泛型类.泛型方法 ...
函数式接口和@FunctionalInterface
函数式接口的特点接口有且仅有一个抽象方法允许定义静态方法和默认方法(这两个都不是抽象方法) 允许java.lang.Object中的public方法(因为任何一个函数式接口的实现,默认都继承了Ob ...
Web Api源码(路由注册)
这篇文章只是我学习Web API框架的输出,学习方法还是输出倒逼输入比较行得通,所以不管写的好不好,坚持下去,肯定有收获.篇幅比较长,仔细思考阅读下来大约需要几分钟. 做.NET开发有好几年时间了,从 ...
Nacos源码系列—订阅机制的前因后果(下)
点赞再看,养成习惯,微信搜索[牧小农]关注我获取更多资讯,风里雨里,小农等你,很高兴能够成为你的朋友. 项目源码地址:公众号回复 nacos,即可免费获取源码事件发布在上一节中我们讲解了在Noti ...

基于后端和爬虫创建的代理ip池

搭建免费的代理ip池

项目架构

项目代码

运行效果

总结

基于后端和爬虫创建的代理ip池的更多相关文章

随机推荐

热门专题