python3爬虫-通过requests爬取图虫网
import requests
from fake_useragent import UserAgent
from requests.exceptions import Timeout
from urllib.parse import quote, unquote
import re, json, os, hashlib
from lxml import etree
import time
from multiprocessing import Process, Queue, Pool # 之前想使用多进程,通过队列处理图片下载。没有实现 userAgent = UserAgent()
headers = {
"User-Agent": userAgent.random,
"Host": "tuchong.com",
"Referer": "https://tuchong.com/explore/"
}
baseUrl = "https://tuchong.com/rest/tag-categories/"
baseTagUrl = "https://tuchong.com/rest/tags/"
tagReferer = "https://tuchong.com/tags/" timeout = 5
s = requests.Session() dic = {
"subject": [],
"style": [],
"equipment": [],
"location": [],
} categoriesDict = {
"subject": "题材",
"style": "风格",
"equipment": "器材",
"location": "地区",
} def getCategoryPage(url, category, page=1):
try:
url = url + category
params = {
"page": page,
"count": 20
}
response = s.get(url=url, headers=headers, timeout=timeout, params=params)
if response.status_code == 200:
response.category = category
return response
except Timeout as e:
print(e)
return None def getTagNameUrl(response):
if not response:
return None
data_dict = response.json()
tag_list = data_dict.get("data").get("tag_list")
tag_name_list = [tag.get("tag_name") for tag in tag_list]
return tag_name_list def getNextPageUrl(response):
if not response:
return []
data_dict = response.json()
pages = int(data_dict.get("data").get("pages"))
for page in range(2, pages + 1):
yield page def getAllTag():
global dic
s.get(url="https://tuchong.com/explore/", headers=headers, timeout=timeout)
for category in categoriesDict.keys():
print("获取 -{}- 第 <{}> 页tagName信息.........".format(categoriesDict.get(category), 1))
response = getCategoryPage(url=baseUrl, category=category)
tag_name_list = getTagNameUrl(response) or []
dic.get(category).extend(tag_name_list)
time.sleep(1)
for page in getNextPageUrl(response):
print("获取 -{}- 第 <{}> 页tagName信息.........".format(categoriesDict.get(category), page))
response = getCategoryPage(url=baseUrl, category=category, page=page)
tag_name_list = getTagNameUrl(response) or []
dic.get(category).extend(tag_name_list)
time.sleep(1) def getTagPage(url, tag, page):
tag = quote(tag)
url = url + tag + "/posts"
params = {
"page": page,
"count": 20,
"order": "weekly"
}
headers["Referer"] = tagReferer + tag + "/"
try:
response = requests.get(url=url, params=params, headers=headers, timeout=timeout)
if response.status_code == 200:
return response
except Timeout as e:
print(e)
return None def getImagesInfo(response):
print('---')
if not response:
return None
result = response.json().get("result")
if result == "INVALID":
print("数据取完了")
return None
postList = response.json().get("postList")
imageUrlList = [dic.get("url") for dic in postList]
titleList = [dic.get("title").strip() for dic in postList]
for img_url_title in zip(titleList, imageUrlList):
img_url_title = list(img_url_title)
yield img_url_title def get_md5(img_url):
m = hashlib.md5()
m.update(bytes(img_url, encoding="utf-8"))
return m.hexdigest() def download(imgsUrl):
if imgsUrl:
for img_url in imgsUrl:
response = requests.get(url=img_url)
name = get_md5(img_url)
print("正在下载{}...".format(img_url))
with open(os.path.join(BASE_PATH, name) + ".jpg", "wb") as f:
f.write(response.content) def gogo(tagname):
page = 1
while True:
response = getTagPage(url=baseTagUrl, tag=tagname, page=page)
print("开始爬取 {} 第 {} 页...".format(tagname, page))
info = getImagesInfo(response) or []
if not response:
return
for info_tuple in info:
imgsUrl = putImageUrl(info_tuple)
download(imgsUrl)
page += 1
time.sleep(5) def putImageUrl(img_url_title_list):
if img_url_title_list:
img_url = img_url_title_list[1]
try:
response = s.get(url=img_url, headers=headers, timeout=timeout)
html = etree.HTML(response.text)
imgsUrl = html.xpath("//article[@class='post-content']/img/@src")
return imgsUrl
except requests.exceptions.ConnectionError as e:
print(e)
return None def downloadImage():
for key in dic:
tagname_list = dic.get(key)
for tagname in tagname_list:
gogo(tagname) def run():
getAllTag()
print("所有tag信息获取完毕.........")
print("开始获取每个tag的内容.........")
downloadImage() if __name__ == '__main__':
BASE_PATH = r"D:\tuchong"
run()
python3爬虫-通过requests爬取图虫网的更多相关文章
- 爬取图虫网 示例网址 https://wangxu.tuchong.com/23892889/
#coding=gbk import requests from fake_useragent import UserAgent from lxml import etree import urlli ...
- python3爬虫-使用requests爬取起点小说
import requests from lxml import etree from urllib import parse import os, time def get_page_html(ur ...
- python3爬虫-通过requests爬取西刺代理
import requests from fake_useragent import UserAgent from lxml import etree from urllib.parse import ...
- Python3爬虫使用requests爬取lol英雄皮肤
本人博客:https://xiaoxiablogs.top 此次爬取lol英雄皮肤一共有两个版本,分别是多线程版本和非多线程版本. 多线程版本 # !/usr/bin/env python # -*- ...
- 爬虫 Scrapy框架 爬取图虫图片并下载
items.py,根据需求确定自己的数据要求 # -*- coding: utf-8 -*- # Define here the models for your scraped items # # S ...
- Python爬虫训练:爬取酷燃网视频数据
前言 本文的文字及图片来源于网络,仅供学习.交流使用,不具有任何商业用途,版权归原作者所有,如有问题请及时联系我们以作处理 项目目标 爬取酷燃网视频数据 https://krcom.cn/ 环境 Py ...
- python爬虫基础应用----爬取校花网视频
一.爬虫简单介绍 爬虫是什么? 爬虫是首先使用模拟浏览器访问网站获取数据,然后通过解析过滤获得有价值的信息,最后保存到到自己库中的程序. 爬虫程序包括哪些模块? python中的爬虫程序主要包括,re ...
- python3 爬虫教学之爬取链家二手房(最下面源码) //以更新源码
前言 作为一只小白,刚进入Python爬虫领域,今天尝试一下爬取链家的二手房,之前已经爬取了房天下的了,看看链家有什么不同,马上开始. 一.分析观察爬取网站结构 这里以广州链家二手房为例:http:/ ...
- 【Python3爬虫】我爬取了七万条弹幕,看看RNG和SKT打得怎么样
一.写在前面 直播行业已经火热几年了,几个大平台也有了各自独特的“弹幕文化”,不过现在很多平台直播比赛时的弹幕都基本没法看的,主要是因为网络上的喷子还是挺多的,尤其是在观看比赛的时候,很多弹幕不是喷选 ...
随机推荐
- WinForm实现Rabbitmq官网6个案例-Hello World
先上代码 namespace RabbitMQDemo { public partial class HelloWorld : Form { string queueName1 = "hel ...
- [算法练习]Add Two Numbers
题目说明: You are given two linked lists representing two non-negative numbers. The digits are stored in ...
- The pit of an if statement in Java
package the.pit.of.an.ifstatement.injava; public class ThePitOfAnIfStatementInJava { public static v ...
- Python爬虫教程-08-post介绍(百度翻译)(下)
Python爬虫教程-08-post介绍(下) 为了更多的设置请求信息,单纯的通过urlopen已经不太能满足需求,此时需要使用request.Request类 构造Request 实例 req = ...
- OFDM正交频分复用---基础入门图示
@(162 - 信号处理) 整理转载自:给小白图示讲解OFDM 下面以图示为主讲解OFDM,以"易懂"为第一要义. 注:下面的讨论如果不做说明,均假设为理想信道. *** 一张原理 ...
- 线性表的顺序存储结构之顺序表类的实现_Java
在上一篇博文——线性表接口的实现_Java中,我们实现了线性表的接口,今天让我们来实现线性表的顺序存储结构——顺序表类. 首先让我们来看下顺序表的定义: 线性表的顺序存储是用一组连续的内存单元依次存放 ...
- windows更新文件和windows.old文件夹清理
在对Windows系统进行升级之后会发现C盘多出一个Windows.old文件夹,直接点击删除缺没有权限,不能进行删除. 在对window进行修复补丁时会产生大量的补丁更新文件,有时不清楚文件存在在哪 ...
- Django路由系统---django重点之url命名分组
django重点之url命名分组[参数无顺序要求]. settigs.py:增加STATICFILES_DIRS静态资源路径配置,名称为创建的文件夹名称 'DIRS': [os.path.join(B ...
- Jquery 获取Checkbox值,prop 和 attr 函数区别
总结: 版本 1.6 1.6 1.4 1.4 函数 勾选 取消勾选 勾选 取消勾选 attr('checked') checked undefined true false .prop('checke ...
- August 13th 2017 Week 33rd Sunday
The best accessory a girl can own is confidence. 女生最好的饰品就是自信. Only when we have our own ideas and on ...