# coding=gbk

from bs4 import BeautifulSoup
import requests
import urllib
import urllib.request  # "import urllib" alone does not guarantee the submodule is loaded

# Running counters used to number the saved images (x) and the page dumps (y).
x = 1
y = 1


def crawl(url):
    """Fetch *url*, dump the page to a numbered .txt file and download every <img>.

    First attempt: the page is requested without any headers and each image
    ``src`` is passed unchanged to ``urllib.request.urlretrieve``.

    Args:
        url: address of the gallery page to scrape.

    Side effects:
        Writes ``F:/pachong/xnt/<y>.txt`` and ``F:/pachong/xnt/<x>.jpg`` and
        advances the module-level counters ``x`` and ``y``.
    """
    global x, y
    res = requests.get(url)
    soup = BeautifulSoup(res.text, 'html.parser')
    with open(f'F:/pachong/xnt/{y}.txt', 'w', encoding="utf-8") as f:
        # NOTE(review): the original statement inside this "with" was lost;
        # saving the raw page text is an assumption -- confirm.
        f.write(res.text)
    y += 1
    yinhuns = soup.select('img')
    for yh in yinhuns:
        link = yh.get('src')
        urllib.request.urlretrieve(link, f'F:/pachong/xnt/{x}.jpg')
        x += 1


# Scrape pages 1-4; report a failure and carry on with the next page.
for i in range(1, 5):
    url = "https://acg.fi/hentai/23643.htm/" + str(i)
    try:
        crawl(url)
    except ValueError as f:
        print(f)
    except Exception as e:
        print(e)
  • 运行程序过程中返回下面结果
<img alt="A区(ACG.Fi)" class="logo" src="https://acg.fi/logo.png"/>
HTTP Error 403: Forbidden
  • 问题有三个

    • 搜索src值的时候,没有搜索到全部符合要找的图片网址
    • 返回的第一个网址出现了403错误,拒绝访问
    • soup.select返回的不是正确的list
  • 思考

    • 有可能所要找的网址中包含中文,无法正确编码
    • 可以尝试用正则对请求返回的页面文本(res.text)进行筛选
from bs4 import BeautifulSoup
import requests
import urllib
import urllib.request  # "import urllib" alone does not guarantee the submodule is loaded

# Running counter used to number the saved images.
x = 1


def crawl(url, header):
    """Download the first images found inside the page's content container.

    Second attempt: the request now carries *header*, and the search is
    narrowed to at most four <img> tags inside the
    ``<div id="content-innerText">`` element.

    Args:
        url: address of the gallery page to scrape.
        header: dict of HTTP request headers passed to ``requests.get``.

    Raises:
        AttributeError: if the page has no ``div#content-innerText`` element
            (``soup.find`` returns ``None`` in that case).

    Side effects:
        Prints the matched tags, writes ``F:/pachong/xnt/<x>.jpg`` files and
        advances the module-level counter ``x``.
    """
    global x
    res = requests.get(url, headers=header)
    soup = BeautifulSoup(res.text, 'html.parser')
    container = soup.find('div', attrs={'id': "content-innerText"})
    yinhuns = container.find_all('img', limit=4)
    print(yinhuns)
    for yh in yinhuns:
        link = yh.get('src')
        urllib.request.urlretrieve(link, 'F:/pachong/xnt/{}.jpg'.format(x))
        x += 1


header = {
    "User-Agent":"Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36 SE 2.X MetaSr 1.0"
}

# Scrape pages 1-4; report a failure and carry on with the next page.
for i in range(1, 5):
    url = "https://acg.fi/hentai/23643.htm/" + str(i)
    try:
        crawl(url, header)
    except ValueError as f:
        print(f)
    except Exception as e:
        print(e)
  • 这个过程用了find(),find_all()方法,依旧没有解决list的问题
  • 后续过程使用urllib.parse.quote对中文部分重新编码,但是urllib.request.urlretrieve依然报错
  • 重新修改后

import requests
import urllib
import re
from PIL import Image
from io import BytesIO
x = 1  # running counter used to number the saved images


# Collect the source URLs of the images on a page.
def crawl(url, header, timeout=10):
    """Return every image URL found in the raw text of the page at *url*.

    Args:
        url: address of the gallery page to scrape.
        header: dict of HTTP request headers passed to ``requests.get``.
        timeout: seconds to wait for the server before giving up.

    Returns:
        list of str: every non-overlapping match of ``http.*?apic.*?jpg`` in
        the page text, in document order (duplicates included).
    """
    # Read the text and release the connection right away, as the original
    # comment intended (close after opening to avoid anti-scraping resets).
    with requests.get(url, headers=header, timeout=timeout) as res:
        page_text = res.text
    pattern = re.compile('http.*?apic.*?jpg')
    return pattern.findall(page_text)


# Download the images from the re-encoded URLs.
def down(outs, folder_path, timeout=10):
    """Download each URL in *outs* and save it as ``<folder_path><x>.jpg``.

    The image bytes are fetched with ``requests`` and decoded from memory
    with PIL instead of being handed to ``urllib.request.urlretrieve``.

    Args:
        outs: iterable of (already percent-encoded) image URLs.
        folder_path: destination prefix; the file name is appended directly,
            so it should end with a path separator.
        timeout: seconds to wait for the server before giving up.

    Side effects:
        Writes one .jpg per URL and advances the module-level counter ``x``.
    """
    global x
    for out in outs:
        # Fetch the re-encoded URL, then close the connection immediately
        # (close after opening to avoid anti-scraping resets).
        with requests.get(out, timeout=timeout) as res:
            payload = res.content
        # The buffer must hold the downloaded bytes before PIL can decode it.
        with Image.open(BytesIO(payload)) as img:
            img.save(folder_path + f"{x}.jpg")
        x += 1


# Re-encode the scraped image source URLs.
def bianma(results):
    """Percent-encode the Chinese parts of each URL and drop duplicates.

    Args:
        results: iterable of URL strings.

    Returns:
        list of str: the URLs with every run of characters in the range
        U+4E00..U+9FA5 replaced by its own percent-encoding, de-duplicated
        while keeping first-occurrence order. URLs without such characters
        are returned unchanged.
    """
    # Import the submodule explicitly: "import urllib" does not load it.
    from urllib.parse import quote

    # Matches each run of Chinese characters.
    pattern = re.compile('[\u4e00-\u9fa5]+')
    # Encode every run separately so different runs in one URL are not all
    # replaced by the encoding of the first one.
    encoded = (pattern.sub(lambda m: quote(m.group(0)), s) for s in results)
    # dict.fromkeys removes duplicates and keeps the original order.
    return list(dict.fromkeys(encoded))


def main():
    """Scrape gallery pages 1-4: collect image URLs, re-encode, download."""
    header = {
        "User-Agent":"Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36 SE 2.X MetaSr 1.0"
    }
    folder_path = 'F:/pachong/xnt/'
    for i in range(1, 5):
        url = "https://acg.fi/hentai/23643.htm/" + str(i)
        # Top-level boundary: report the failure and go on to the next page.
        try:
            results = crawl(url, header)
            outs = bianma(results)
            down(outs, folder_path)
        except Exception as e:
            print(e)


if __name__ == '__main__':
    main()
  • 对于图片路径中有中文的,可以使用BytesIO和PIL下载图片,证实可以有效解决
  • 几次试验出现[Errno 10054] 远程主机强迫关闭了一个现有的连接,可以在requests.get()后使用close()
  • 程序运行无误,就是有点慢,后期可以使用多线程尝试

