python--爬取豆瓣热门国产电视剧保存为文件

# -*- coding: utf-8 -*-

__author__ = 'Frank Li'

import requests

import json

class HotSpider(object):
    """Download Douban's "hot domestic TV" collection and append it to a file.

    The mobile API is paged through ``start``/``count`` query parameters.
    Every returned item is written as one JSON document per line.
    """

    # Items requested per call; must stay in step with ``count=18`` in the URL.
    PAGE_SIZE = 18

    # Seconds to wait for the server; without a timeout a stalled connection
    # would block the crawl forever.
    TIMEOUT = 10

    def __init__(self):
        # ``{}`` is filled with the zero-based offset of the first item wanted.
        self.url = "https://m.douban.com/rexxar/api/v2/subject_collection/filter_tv_domestic_hot/items?os=android&for_mobile=1&start={}&count=18&loc_id=108288"
        self.session = requests.session()
        # Sent with every request: a Referer plus an Android Chrome User-Agent.
        self.headers = {"Referer": "https://m.douban.com/tv/chinese",
                        "User-Agent": "Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.106 Mobile Safari/537.36"}

    def parse_2_list_from_str(self, url):
        """Fetch *url* and return the list under ``subject_collection_items``.

        Raises:
            requests.HTTPError: the server answered with a 4xx/5xx status.
            KeyError: the JSON payload has no ``subject_collection_items``.
        """
        response = self.session.get(url, headers=self.headers,
                                    timeout=self.TIMEOUT)
        # Fail loudly on an error status instead of trying to parse the
        # error body as a result page.
        response.raise_for_status()
        payload = json.loads(response.content.decode())
        return payload['subject_collection_items']

    def save_as_file(self, content_list, file):
        """Append each element of *content_list* to *file* as one JSON line.

        Non-ASCII text is written verbatim (UTF-8) rather than escaped.
        """
        with open(file, 'a', encoding='utf-8') as f:
            f.writelines(json.dumps(content, ensure_ascii=False) + '\n'
                         for content in content_list)

    def run(self, total=500, file='hot.json'):
        """Crawl page by page and append every item to *file*.

        Args:
            total: upper bound on the number of items to ask for. One extra
                page past this bound is still requested, as before.
            file: path of the output file (opened in append mode).

        The crawl ends early as soon as a page comes back empty, so a
        collection smaller than *total* no longer triggers useless requests.
        """
        for start in range(0, total + self.PAGE_SIZE, self.PAGE_SIZE):
            url = self.url.format(start)
            print(url)
            items = self.parse_2_list_from_str(url)
            if not items:
                # An empty page means the collection is exhausted.
                break
            self.save_as_file(items, file)

if __name__ == '__main__':
    # Script entry point: build a spider and crawl with its defaults.
    HotSpider().run()

使用 xpath 爬取正在热映的电影保存为 json 文件

# -*- coding: utf-8 -*-

__author__ = 'Frank Li'

import requests

from lxml import etree

import json

# Scrape the Douban "now playing" listing for Changsha and append one
# pretty-printed JSON object per film to movie.json.
url = "https://movie.douban.com/cinema/nowplaying/changsha/"

headers = {"Referer":"https://movie.douban.com/cinema/nowplaying/changsha/",
           "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.106 Safari/537.36"}

sess = requests.session()

# A timeout keeps a stalled connection from hanging the script, and an error
# status is raised instead of being parsed as if it were the listing page.
response = sess.get(url, headers=headers, timeout=10)
response.raise_for_status()

html_str = response.content.decode()
element = etree.HTML(html_str)

# Walk the title links one film at a time and read poster and rating relative
# to each link. Collecting four page-wide lists and zipping them by position
# pairs films with the wrong score as soon as one entry lacks a rating.
# NOTE(review): assumes the poster/stitle/srating <li> elements of one film
# share the same parent -- confirm against the live markup.
title_links = element.xpath("//div[@class='mod-bd']/ul[@class='lists']//li[@class='stitle']/a")

# Open the output once rather than once per film.
with open('movie.json','a',encoding='utf-8') as f:
    for link in title_links:
        name = link.get('title')
        addr = link.get('href')
        img = link.xpath("../../li[@class='poster']//img/@src")
        score = link.xpath("../../li[@class='srating']/span[@class='subject-rate']/text()")

        # Entries missing any field (e.g. not yet rated) are skipped; the
        # positional zip likewise only ever emitted films that had a score.
        if name is None or addr is None or not img or not score:
            continue

        item = {
            'name': name,
            'img': img[0],
            'addr': addr,
            'score': score[0],
        }
        item_json = json.dumps(item, ensure_ascii=False, indent=2)
        print(item_json)
        f.write(item_json)
        f.write('\n')

保存下来的 movie.json 文件

{

  "name": "碟中谍6：全面瓦解",

  "img": "https://img3.doubanio.com/view/photo/s_ratio_poster/public/p2529365085.jpg",

  "addr": "https://movie.douban.com/subject/26336252/?from=playing_poster",

  "score": "8.3"

}

{

  "name": "阿尔法：狼伴归途",

  "img": "https://img1.doubanio.com/view/photo/s_ratio_poster/public/p2530871439.jpg",

  "addr": "https://movie.douban.com/subject/26810318/?from=playing_poster",

  "score": "6.5"

}

{

  "name": "蚁人2：黄蜂女现身",

  "img": "https://img1.doubanio.com/view/photo/s_ratio_poster/public/p2529389608.jpg",

  "addr": "https://movie.douban.com/subject/26636712/?from=playing_poster",

  "score": "7.5"

}

{

  "name": "传奇的诞生",

  "img": "https://img1.doubanio.com/view/photo/s_ratio_poster/public/p2531286907.jpg",

  "addr": "https://movie.douban.com/subject/3073268/?from=playing_poster",

  "score": "7.6"

}

{

  "name": "快把我哥带走",

  "img": "https://img3.doubanio.com/view/photo/s_ratio_poster/public/p2531080870.jpg",

  "addr": "https://movie.douban.com/subject/30122633/?from=playing_poster",

  "score": "7.0"

}

{

  "name": "道高一丈",

  "img": "https://img1.doubanio.com/view/photo/s_ratio_poster/public/p2530863118.jpg",

  "addr": "https://movie.douban.com/subject/26954268/?from=playing_poster",

  "score": "5.7"

}

{

  "name": "李宗伟：败者为王",

  "img": "https://img3.doubanio.com/view/photo/s_ratio_poster/public/p2530870325.jpg",

  "addr": "https://movie.douban.com/subject/27195119/?from=playing_poster",

  "score": "7.1"

}

{

  "name": "西虹市首富",

  "img": "https://img1.doubanio.com/view/photo/s_ratio_poster/public/p2529206747.jpg",

  "addr": "https://movie.douban.com/subject/27605698/?from=playing_poster",

  "score": "6.7"

}

{

  "name": "一出好戏",

  "img": "https://img3.doubanio.com/view/photo/s_ratio_poster/public/p2529571873.jpg",

  "addr": "https://movie.douban.com/subject/26985127/?from=playing_poster",

  "score": "7.3"

}

{

  "name": "精灵旅社3：疯狂假期",

  "img": "https://img3.doubanio.com/view/photo/s_ratio_poster/public/p2530591543.jpg",

  "addr": "https://movie.douban.com/subject/26630714/?from=playing_poster",

  "score": "6.9"

}

{

  "name": "苏丹",

  "img": "https://img3.doubanio.com/view/photo/s_ratio_poster/public/p2529570494.jpg",

  "addr": "https://movie.douban.com/subject/26728641/?from=playing_poster",

  "score": "7.0"

}

{

  "name": "巨齿鲨",

  "img": "https://img3.doubanio.com/view/photo/s_ratio_poster/public/p2530572643.jpg",

  "addr": "https://movie.douban.com/subject/26426194/?from=playing_poster",

  "score": "6.0"

}

{

  "name": "藏北秘岭-重返无人区",

  "img": "https://img3.doubanio.com/view/photo/s_ratio_poster/public/p2532522676.jpg",

  "addr": "https://movie.douban.com/subject/30208007/?from=playing_poster",

  "score": "6.2"

}

{

  "name": "那些女人",

  "img": "https://img3.doubanio.com/view/photo/s_ratio_poster/public/p2530146643.jpg",

  "addr": "https://movie.douban.com/subject/26574965/?from=playing_poster",

  "score": "5.3"

}

{

  "name": "草戒指",

  "img": "https://img1.doubanio.com/view/photo/s_ratio_poster/public/p2531782507.jpg",

  "addr": "https://movie.douban.com/subject/27204180/?from=playing_poster",

  "score": "5.6"

}

{

  "name": "吻隐者",

  "img": "https://img3.doubanio.com/view/photo/s_ratio_poster/public/p2531980221.jpg",

  "addr": "https://movie.douban.com/subject/26928809/?from=playing_poster",

  "score": "7.6"

}

{

  "name": "禹神传之寻找神力",

  "img": "https://img3.doubanio.com/view/photo/s_ratio_poster/public/p2532781444.jpg",

  "addr": "https://movie.douban.com/subject/30227727/?from=playing_poster",

  "score": "6.6"

}

{

  "name": "大师兄",

  "img": "https://img1.doubanio.com/view/photo/s_ratio_poster/public/p2528842218.jpg",

  "addr": "https://movie.douban.com/subject/27201353/?from=playing_poster",

  "score": "6.2"

}

简单多线程图片下载

import requests

from bs4 import BeautifulSoup

import os

import threading

def download_img(src,target=None):
    """Stream the image at *src* into the ``./img`` directory.

    Args:
        src: URL of the image.
        target: file name to save under ``./img``. When omitted, the last
            ``/``-separated segment of *src* is used.

    Raises:
        requests.HTTPError: the server answered with a 4xx/5xx status, so an
            error page is never written to disk as if it were an image.
    """
    parent_dir = './img'
    os.makedirs(parent_dir,exist_ok=True)

    # Honour a caller-supplied name; previously it was always overwritten.
    if target is None:
        target = src.split('/')[-1]
    target = os.path.join(parent_dir,target)

    print(threading.current_thread(),' start to download img: ',target)

    # The context manager releases the streamed connection even on failure,
    # and the timeout stops one stalled download from blocking its thread.
    with requests.get(src,stream=True,timeout=10) as r:
        r.raise_for_status()
        with open(target,'wb') as tar_file:
            for chunk in r.iter_content(chunk_size=8192):
                tar_file.write(chunk)

    print('saved {}'.format(target))

if __name__ == '__main__':
    # Fetch one Tieba thread and download every post image it contains,
    # using one thread per image.
    URL = 'https://tieba.baidu.com/p/6034793219'
    html = requests.get(URL).text
    soup = BeautifulSoup(html,'lxml')

    # Post images are the <img> tags carrying the BDE_Image class.
    imgs = [tag['src'] for tag in soup.find_all('img',{'class':'BDE_Image'})]

    threads = [
        threading.Thread(target=download_img,args=(img,),name='Thread-{}'.format(i))
        for i,img in enumerate(imgs)
    ]

    for worker in threads:
        worker.start()

    # Wait for every download to finish before the script exits.
    for worker in threads:
        worker.join()

python--爬取豆瓣热门国产电视剧保存为文件的更多相关文章

requests库爬取豆瓣热门国产电视剧数据并保存到本地
首先要做的就是去豆瓣网找对应的接口,这里就不赘述了,谷歌浏览器抓包即可,然后要做的就是分析返回的json数据的结构: https://movie.douban.com/j/search_subject ...
利用Python爬取豆瓣电影
目标:使用Python爬取豆瓣电影并保存MongoDB数据库中我们先来看一下通过浏览器的方式来筛选某些特定的电影: 我们把URL来复制出来分析分析: https://movie.douban.com ...
Python爬取豆瓣指定书籍的短评
Python爬取豆瓣指定书籍的短评 #!/usr/bin/python # coding=utf-8 import re import sys import time import random im ...
Python爬取豆瓣《复仇者联盟3》评论并生成乖萌的格鲁特
代码地址如下:http://www.demodashi.com/demo/13257.html 1. 需求说明本项目基于Python爬虫,爬取豆瓣电影上关于复仇者联盟3的所有影评,并保存至本地文件. ...
Python爬取豆瓣电影top
Python爬取豆瓣电影top250 下面以四种方法去解析数据,前面三种以插件库来解析,第四种以正则表达式去解析. xpath pyquery beautifulsoup re 爬取信息:名称评分 ...
python爬取豆瓣首页热门栏目详细流程
记录一下爬取豆瓣热门专栏的经过,通过这篇文章,你能学会requests,HTMLParser,json的基本使用,以及爬取网页内容的基本思路. 使用模块 1,获取豆瓣首页代码:首先我们需要访问豆瓣页面 ...
python 爬取豆瓣的美剧
pc版大概有500条记录,mobile大概是50部,只有热门的,所以少一点 url构造很简单,主要参数就是page_limit与page_start,每翻一页,start+=20即可,tag是&quo ...
python爬取豆瓣电影信息数据
题外话+ 大家好啊,最近自己在做一个属于自己的博客网站(准备辞职回家养老了,明年再战)在家里琐事也很多, 加上自己一回到家就懒了(主要是家里冷啊! 广东十几度,老家几度,躲在被窝瑟瑟发抖,) 由于 ...
python 爬取豆瓣电影短评并wordcloud生成词云图
最近学到数据可视化到了词云图,正好学到爬虫,各种爬网站 [实验名称] 爬取豆瓣电影<千与千寻>的评论并生成词云 1. 利用爬虫获得电影评论的文本数据 2. 处理文本数据生成词云图第一步, ...

随机推荐

PHP获取网络图片并保存在本地目录
PHP获取网络图片并保存在本地目录思路: 代码如下: function file_exists_S3($url) { $state = @file_get_contents($url,0,null,0 ...
MT【308】投影的定义
已知向量$\overrightarrow{a},\overrightarrow{b}$满足:$|\overrightarrow{a}|=2$,向量$\overrightarrow{b}$与$\over ...
自学Aruba集锦
自学Aruba集锦 01 自学Aruba之功率单位和相对单位 02 自学Aruba之无线频段---ISM频段及UNII频段 03 自学Aruba之2.4GHz及5GHz无线信道 04 自学Aruba之 ...
【dfs】P1331 海战
题目描述在峰会期间,武装部队得处于高度戒备.警察将监视每一条大街,军队将保卫建筑物,领空将布满了F-2003飞机.此外,巡洋船只和舰队将被派去保护海岸线.不幸的是因为种种原因,国防海军部仅有很少的几 ...
nginx日志文件的定时切割与归纳
应用环境:生产环境中的Nginx服务器,由于访问日志文件增长速度非常快,日志太大会严重影响服务器效率.同时,为了方便对日志进行分析计算,须要对日志文件进行定时切割.定时切割的方式有按月切割.按天切割 ...
蓝桥杯试题 k倍区间(dp)
问题描述给定一个长度为N的数列,A1, A2, ... AN,如果其中一段连续的子序列Ai, Ai+1, ... Aj(i <= j)之和是K的倍数,我们就称这个区间[i, j]是K倍区间. ...
Codeforces -- 511 Div. 2 C. Enlarge GCD
题目链接:C. Enlarge GCD 给你一个序列删除一些数看可以让他们之间的gcd变大如果可以输出删除数量最小的个数先求出共同 gcd 然后除去找出出现最多的质数然后减去就可以了 #inc ...
JAVA中循环删除list中元素
文章来源: https://www.cnblogs.com/pcheng/p/5336903.html JAVA中循环遍历list有三种方式for循环.增强for循环(也就是常说的foreach循环) ...
vue router获取整条路径参数
$route.path 当前路由对象的路径,如'/vi$route.query 请求参数,如/foo?user=1获取到query.user = 1$route.router 所属路由器以及所属组件信 ...
CF1080F Katya and Segments Sets
题意:给定n个区间,每个区间有颜色.m次询问,每次询问:这n个区间中所有被包含在[x, y]这一区间中的区间,它们的颜色是否取遍了[l, r]中的所有颜色. 强制在线. 解:第一步是大家都熟悉的套路⑧ ...

python--爬取豆瓣热门国产电视剧保存为文件

使用 xpath 爬取正在热映的电影保存为 json 文件

保存下来的 movie.json 文件

简单多线程图片下载

python--爬取豆瓣热门国产电视剧保存为文件的更多相关文章

随机推荐

热门专题

使用 xpath 爬取正在热映的电影保存为 json 文件

简单多线程图片下载