scrapy全栈抓xpc练习

#  spider文件

# -*- coding: utf-8 -*-

import scrapy

import re

from scrapy import Request

import json

import string

import random

from xpc.items import PostItem, CommentItem, CopyItem  # 多个item

def strip(s):
    """Return ``s`` without surrounding whitespace, or "" when ``s`` is falsy.

    A falsy ``s`` (``None``, empty string, ...) yields the empty string, so
    the result can be used without a separate ``None`` check.
    """
    return s.strip() if s else ""

# When requests are sent with scrapy.Request / scrapy.FormRequest, the cookies
# are saved by default.

# For the simulated login step, do not go through the Scrapy framework; use the
# requests module directly.

# Base cookies sent by the spider on list-page (pagination) requests.
cookies = dict(

    Authorization='4F635191B0602B5D3B06024483B0602AAF8B06023C2F6259656D'

)

# The cookie above was returned by the website: log in once first to obtain it.

# Build a random 26-character id out of lowercase letters and digits.
def gen_sessionid():
    """Return a random 26-character string of lowercase ASCII letters and digits."""
    alphabet = string.ascii_lowercase + string.digits
    picked = random.choices(alphabet, k=26)
    return "".join(picked)

class XinpianchangSpider(scrapy.Spider):
    """Crawl xinpianchang.com list pages and, for every post, its video, comments and creators.

    Flow of callbacks:

    * ``parse``          - list page: follows every post and every pagination link.
    * ``parse_post``     - post detail page: builds a ``PostItem``, then requests the
      video API and the comment API, and yields one ``CopyItem`` per creator.
    * ``parse_video``    - video API (JSON): completes and yields the ``PostItem``.
    * ``parse_comment``  - comment API (JSON): yields ``CommentItem`` objects and
      follows the next comment page.
    * ``parse_composer`` - creator page: not implemented yet.
    """

    name = 'XinPianChang'
    allowed_domains = ['xinpianchang.com', 'openapi-vtom.vmovier.com']
    start_urls = ['https://www.xinpianchang.com/channel/index/sort-like?from=tabArticle']

    # Suppose the crawl starts from page 21: that request has to carry cookies,
    # and the cookies defined at module level are no longer enough (the site
    # returns 4 cookies). In that case start_requests has to be overridden.
    # start_urls = ['https://www.xinpianchang.com/channel/index/type-/sort-like/duration_type-0/resolution_type-/page-21']

    # Number of list pages parsed since the PHPSESSID cookie was last rotated.
    page_count = 0

    # Overriding the parent start_requests method (which by default sends a GET
    # request for every URL in start_urls). Kept disabled, for reference:
    #
    # def start_requests(self):
    #     for url in self.start_urls:
    #         # To send a POST request instead, use FormRequest:
    #         # data = {
    #         #     "kw": "cat"
    #         # }
    #         # yield scrapy.FormRequest(url=url, callback=self.parse, formdata=data)
    #
    #         c = cookies.copy()
    #         c.update(PHPSESSID=gen_sessionid(),
    #                  SERVER_ID='b52601c8-285bdd26',
    #                  channel_page='apU%3D')
    #         yield Request(url, cookies=c, dont_filter=True)

    def parse(self, response):
        """Parse a list page: yield one detail request per post plus the pagination requests.

        Every 100 list pages the module-level ``cookies`` dict receives a fresh
        ``PHPSESSID`` and the page counter is reset.
        """
        # Debugging aid:
        # from scrapy.shell import inspect_response
        # inspect_response(response, self)

        self.page_count += 1
        if self.page_count >= 100:
            cookies.update(PHPSESSID=gen_sessionid())
            self.page_count = 0

        post_ids = response.xpath('//ul[@class="video-list"]/li/@data-articleid').getall()
        for pid in post_ids:
            detail_url = 'https://www.xinpianchang.com/a{}?from=ArticleList'.format(pid)
            # Request the detail page of the post; pid travels with the request.
            yield response.follow(detail_url, callback=self.parse_post, meta={'pid': pid})

        # The pagination hrefs are relative paths; response.follow resolves them.
        for page_url in response.xpath('//div[@class="page"]/a/@href').getall():
            yield response.follow(page_url, self.parse, cookies=cookies)

    def parse_post(self, response):
        """Parse a post detail page.

        Yields, in this order: the video API request (or the bare ``PostItem``
        when the page exposes no ``vid``), the first comment-page request, and
        for every creator a creator-page request followed by a ``CopyItem``.
        """
        pid = response.meta['pid']

        post = PostItem()
        post['pid'] = pid
        post['title'] = response.xpath('//div[@class="title-wrap"]/h3/text()').get()
        cates = response.xpath('//span[@class="cate v-center"]/a/text()').getall()
        post['category'] = ''.join(cate.strip() for cate in cates)
        post['create_time'] = response.xpath('//span[contains(@class,"update-time")]/i/text()').get()
        post['play_count'] = response.xpath('//i[contains(@class,"play-counts")]/text()').get()
        desc_lst = response.xpath('//p[contains(@class,"desc")]//text()').getall()
        post['desc'] = ' '.join(part.strip() for part in desc_lst)

        # The video id is embedded in the page source (response.text), e.g.
        # https://openapi-vtom.vmovier.com/v3/video/5E34203E92450?expand=resource&usage=xpc_web
        vid_match = re.search(r'vid: "(.*?)",', response.text)
        if vid_match:
            video_url = 'https://openapi-vtom.vmovier.com/v3/video/{}?expand=resource&usage=xpc_web'.format(
                vid_match.group(1))
            # One extra request is needed for the video URL; the partially
            # filled post is handed to the next callback through meta.
            yield Request(video_url, callback=self.parse_video, meta={'post': post})
        else:
            # Without a vid there is no video API call to make: emit what we
            # have instead of crashing and losing comments/creators as well.
            self.logger.warning('No vid found on %s; yielding post without video_url', response.url)
            yield post

        # Comment API, e.g.
        # https://app.xinpianchang.com/comments?resource_id=10664352&type=article&page=1&per_page=24
        comment_url = "https://app.xinpianchang.com/comments?resource_id={}&type=article&page=1&per_page=24".format(
            pid)
        yield Request(comment_url, callback=self.parse_comment, meta={'pid': pid})

        # Creator pages and the post <-> creator relation.
        composer_url_tpl = 'https://www.xinpianchang.com/u{}?from=articleList'
        creator_list = response.xpath('//div[@class="filmplay-creator right-section"]/ul[@class="creator-list"]/li')
        for creator in creator_list:
            cid = creator.xpath('./a/@data-userid').get()
            if not cid:
                # No user id on this entry: neither a URL nor a key can be built.
                continue

            # dont_merge_cookies keeps the (periodically refreshed) cookies from
            # piling up in the headers of these requests.
            yield response.follow(
                composer_url_tpl.format(cid),
                self.parse_composer,
                meta={'cid': cid, 'dont_merge_cookies': True},
            )

            # Relation between the creator and the video.
            cr = CopyItem()
            cr['pid'] = pid
            cr['cid'] = cid
            cr['pcid'] = pid + cid
            cr['role'] = creator.xpath('./div[@class="creator-info"]/span/text()').get()
            yield cr

    def parse_video(self, response):
        """Parse the video API response (JSON) and yield the completed ``PostItem``.

        When the payload is not valid JSON or lacks the expected
        ``data.resource.default.url`` path, the post is still yielded, without
        ``video_url``.
        """
        post = response.meta['post']
        try:
            # The body is JSON text and has to be decoded first.
            result = json.loads(response.text)
            post['video_url'] = result['data']['resource']['default']['url']
        except (ValueError, KeyError, TypeError):
            self.logger.warning('Unexpected video payload at %s', response.url)
        # Handed straight to the item pipelines.
        yield post

    def parse_comment(self, response):
        """Parse one page of the comment API (JSON) and follow the next page, if any.

        The post id received through ``meta['pid']`` is stored on every
        ``CommentItem`` and forwarded to the next-page request.
        """
        pid = response.meta.get('pid')
        data = json.loads(response.text)['data']

        for c in data['list']:
            comment = CommentItem()
            comment['pid'] = pid
            comment['uname'] = c['userInfo']['username']
            comment['user_id'] = c['userInfo']['id']
            # comment['user_page'] = c['userInfo']['web_url']
            comment['content'] = c['content']
            comment['content_id'] = c['id']
            self.logger.debug('comment: %r', comment)
            yield comment

        # Follow the next page when there is one. response.follow resolves the
        # value against the current URL, so relative and absolute both work.
        next_page_url = data.get('next_page_url')
        if next_page_url:
            yield response.follow(next_page_url, self.parse_comment, meta={'pid': pid})

    def parse_composer(self, response):
        """Placeholder callback for creator pages; nothing is extracted yet."""
        pass

# settings文件

# -*- coding: utf-8 -*-

# Scrapy settings for xpc project

#

# For simplicity, this file contains only settings considered important or

# commonly used. You can find more settings consulting the documentation:

#

#     https://docs.scrapy.org/en/latest/topics/settings.html

#     https://docs.scrapy.org/en/latest/topics/downloader-middleware.html

#     https://docs.scrapy.org/en/latest/topics/spider-middleware.html

BOT_NAME = 'xpc'

SPIDER_MODULES = ['xpc.spiders']

NEWSPIDER_MODULE = 'xpc.spiders'

# Crawl responsibly by identifying yourself (and your website) on the user-agent

# USER_AGENT = 'xpc (+http://www.yourdomain.com)'

# Obey robots.txt rules

ROBOTSTXT_OBEY = False

# Configure maximum concurrent requests performed by Scrapy (default: 16)

# CONCURRENT_REQUESTS = 32

# Configure a delay for requests for the same website (default: 0)

# See https://docs.scrapy.org/en/latest/topics/settings.html#download-delay

# See also autothrottle settings and docs

# DOWNLOAD_DELAY = 3

# The download delay setting will honor only one of:

# CONCURRENT_REQUESTS_PER_DOMAIN = 16

# CONCURRENT_REQUESTS_PER_IP = 16

# Disable cookies (enabled by default)

# To use custom cookies (passed on the requests), set COOKIES_ENABLED to True.

# To use the cookie configured in settings, set COOKIES_ENABLED to False.

COOKIES_ENABLED = True

COOKIES_DEBUG = True  # prints detailed cookie information

# Disable Telnet Console (enabled by default)

# TELNETCONSOLE_ENABLED = False

# Override the default request headers:

DEFAULT_REQUEST_HEADERS = {

    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',

    'Accept-Language': 'en',

    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko)',

}

# Enable or disable spider middlewares

# See https://docs.scrapy.org/en/latest/topics/spider-middleware.html

# SPIDER_MIDDLEWARES = {

#    'xpc.middlewares.XpcSpiderMiddleware': 543,

# }

# Enable or disable downloader middlewares

# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html

# DOWNLOADER_MIDDLEWARES = {

#    'xpc.middlewares.XpcDownloaderMiddleware': 543,

# }

# Enable or disable extensions

# See https://docs.scrapy.org/en/latest/topics/extensions.html

# EXTENSIONS = {

#    'scrapy.extensions.telnet.TelnetConsole': None,

# }

# Configure item pipelines

# See https://docs.scrapy.org/en/latest/topics/item-pipeline.html

ITEM_PIPELINES = {

   'xpc.pipelines.XpcPipeline': 300,  # highest priority of the pipelines listed here

   # 'xpc.pipelines.MysqlPipeline': 301,

   # 'xpc.pipelines.RedisPipeline': 302,

}

# Enable and configure the AutoThrottle extension (disabled by default)

# See https://docs.scrapy.org/en/latest/topics/autothrottle.html

# AUTOTHROTTLE_ENABLED = True

# The initial download delay

# AUTOTHROTTLE_START_DELAY = 5

# The maximum download delay to be set in case of high latencies

# AUTOTHROTTLE_MAX_DELAY = 60

# The average number of requests Scrapy should be sending in parallel to

# each remote server

# AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0

# Enable showing throttling stats for every response received:

# AUTOTHROTTLE_DEBUG = False

# Enable and configure HTTP caching (disabled by default)

# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings

HTTPCACHE_ENABLED = False   # when True, visited pages are cached and no real request is sent for them

# HTTPCACHE_ENABLED = True

# HTTPCACHE_EXPIRATION_SECS = 0

# HTTPCACHE_DIR = 'httpcache'

# HTTPCACHE_IGNORE_HTTP_CODES = []

# HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'

# Log level: INFO, DEBUG, ERROR

LOG_LEVEL = 'DEBUG'

# item文件

# -*- coding: utf-8 -*-

import scrapy

class PostItem(scrapy.Item):
    """Item holding the information scraped for one video post."""

    # Custom (non-Field) attribute: a table_name is needed when there are
    # several tables.

    table_name = 'posts'

    # Data fields follow.

    pid = scrapy.Field()

    title = scrapy.Field()

    category = scrapy.Field()

    create_time = scrapy.Field()

    play_count = scrapy.Field()

    desc = scrapy.Field()

    video_url = scrapy.Field()

class CommentItem(scrapy.Item):
    """Item holding the information scraped for one comment."""

    # Custom (non-Field) attribute naming the target table.
    table_name = 'comments'

    content_id = scrapy.Field()

    pid = scrapy.Field()

    # NOTE(review): presumably a creator/user id like CopyItem.cid - confirm.
    cid = scrapy.Field()

    uname = scrapy.Field()

    user_id = scrapy.Field()

    content = scrapy.Field()

    user_page = scrapy.Field()

class CopyItem(scrapy.Item):
    """Item for one row of the 'copyrights' table, carrying a pid, a cid and a role."""

    # Custom (non-Field) attribute naming the target table.
    table_name = 'copyrights'

    pcid = scrapy.Field()  # primary key of the table

    pid = scrapy.Field()

    cid = scrapy.Field()

    role = scrapy.Field()

# pipeline文件

# -*- coding: utf-8 -*-

import csv

from xpc.items import PostItem, CommentItem, CopyItem

import pymysql

from redis import Redis

import os

class XpcPipeline(object):
    """Pipeline that opens ``xpc.csv`` next to this module and prints every item by type.

    The CSV writer is created but no rows are written by this class yet; the
    file is opened on construction and closed in ``close_spider``.
    """

    def __init__(self):
        # Directory containing this file. The path is made absolute first so
        # that an empty dirname can never turn the target into '/xpc.csv'.
        base_dir = os.path.dirname(os.path.abspath(__file__))
        store_file = os.path.join(base_dir, 'xpc.csv')

        # Explicit UTF-8 so non-ASCII text does not depend on the platform's
        # default encoding; newline="" is what the csv module expects.
        self.file = open(store_file, 'w', newline="", encoding='utf-8')
        self.writer = csv.writer(self.file)

    def open_spider(self, spider):
        """Called when the spider starts; only announces it."""
        print("pipeline 开始爬虫......")

    # Handling several different item types in one pipeline.
    def process_item(self, item, spider):
        """Print ``item`` with a label matching its type and pass it on unchanged.

        Returns:
            The same ``item``, so the next pipeline class can process it.
        """
        if isinstance(item, PostItem):
            print("这是发布信息：", item)
        elif isinstance(item, CommentItem):
            print("这是评论信息：", item)
        elif isinstance(item, CopyItem):
            print("这是版权信息：", item)
        return item

    def close_spider(self, spider):
        """Called when the spider finishes; announces it and releases the CSV file."""
        print("pipeline 结束爬虫......")
        self.file.close()

# Store items in a MySQL database.
class MysqlPipeline(object):
    """Pipeline that inserts the ``author`` and ``content`` of every item into MySQL.

    The connection is opened in ``open_spider``; a single cursor is created on
    first use and reused; both are released in ``close_spider``.

    NOTE(review): the item is read through the keys ``author`` and ``content``;
    confirm that the items routed to this pipeline define those fields. A
    missing key is reported and skipped like any other insert failure.
    """

    conn = None
    cursor = None

    def open_spider(self, spider):
        """Open the MySQL connection when the spider starts."""
        self.conn = pymysql.Connect(
            host='127.0.0.1',
            port=3306,
            user='root',
            password='',
            db='test_db',
            charset='utf8'
        )
        print("数据库连接成功")

    def process_item(self, item, spider):
        """Insert one item; on any failure report it, roll back and keep going.

        Returns:
            The same ``item``, so the next pipeline class can process it.
        """
        # One cursor for the whole crawl instead of a new, never-closed cursor
        # per item.
        if self.cursor is None:
            self.cursor = self.conn.cursor()

        try:
            # Values are passed separately so the driver escapes them: scraped
            # text may contain quotes or SQL fragments.
            self.cursor.execute(
                'insert into test_db values(%s, %s)',
                (item['author'], item['content']),
            )
            self.conn.commit()
        except Exception as e:
            # Deliberately best-effort: one bad item must not stop the crawl.
            print("数据库插入异常：", e)
            print("数据库执行回滚")
            self.conn.rollback()

        return item

    def close_spider(self, spider):
        """Close the cursor and the connection, tolerating ones that were never opened."""
        print("断开数据库连接")
        if self.cursor is not None:
            self.cursor.close()
            self.cursor = None
        if self.conn is not None:
            self.conn.close()
            self.conn = None

# Store items in Redis.
class RedisPipeline(object):
    """Pipeline that pushes the ``author`` and ``content`` of every item onto a Redis list.

    NOTE(review): the item is read through the keys ``author`` and ``content``;
    confirm that the items routed to this pipeline define those fields.
    """

    conn = None
    # Not used by this class; kept so the attribute stays available to callers.
    cursor = None

    def open_spider(self, spider):
        """Create the Redis client when the spider starts."""
        self.conn = Redis(
            host='127.0.0.1',
            port=6379
        )
        print("数据库连接成功")

    def process_item(self, item, spider):
        """Push one item, serialized as JSON, onto the Redis list.

        Returns:
            The same ``item``, so the next pipeline class can process it.
        """
        # Local import: json is not among the imports of the pipelines module.
        import json

        dic = {
            "author": item["author"],
            "content": item["content"]
        }
        # A dict is serialized to a JSON string before being pushed, since
        # redis-py 3+ rejects dict values. ensure_ascii=False keeps non-ASCII
        # text readable in the stored value.
        self.conn.lpush("队列名字", json.dumps(dic, ensure_ascii=False))
        return item

    def close_spider(self, spider):
        """Close the Redis client, tolerating one that was never opened."""
        print("断开数据库连接")
        if self.cursor is not None:
            self.cursor.close()
            self.cursor = None
        if self.conn is not None:
            self.conn.close()
            self.conn = None

scrapy全栈抓xpc练习的更多相关文章

爬虫系列---scrapy全栈数据爬取框架(Crawlspider)
一简介 crawlspider 是Spider的一个子类,除了继承spider的功能特性外,还派生了自己更加强大的功能. LinkExtractors链接提取器,Rule规则解析器. 二强大的链接 ...
大数据全栈式开发语言 – Python
前段时间,ThoughtWorks在深圳举办一次社区活动上,有一个演讲主题叫做“Fullstack JavaScript”,是关于用JavaScript进行前端.服务器端,甚至数据库(MongoDB) ...
为什么说Python 是大数据全栈式开发语言
欢迎大家访问我的个人网站<刘江的博客和教程>:www.liujiangblog.com 主要分享Python 及Django教程以及相关的博客交流QQ群:453131687 原文链接 h ...
《从零开始做一个MEAN全栈项目》（2）
欢迎关注本人的微信公众号"前端小填填",专注前端技术的基础和项目开发的学习. 上一节简单介绍了什么是MEAN全栈项目,这一节将简要介绍三个内容:(1)一个通用的MEAN项目的技 ...
《web全栈工程师的自我修养》阅读笔记
在买之前以为这本书是教你怎么去做一个web全栈工程师,以及介绍需要掌握的哪些技术的书,然而看的过程中才发现,是一本方法论的书.读起来的感觉有点像红衣教主的<我的互联网方法论>,以一些自己的 ...
Win10构建Python全栈开发环境With WSL
目录 Win10构建Python全栈开发环境With WSL 启动WSL 总结对<Dev on Windows with WSL>的补充 Win10构建Python全栈开发环境With ...
python全栈开发中级班全程笔记（第二模块、第四章）（常用模块导入）
python全栈开发笔记第二模块第四章 :常用模块(第二部分) 一.os 模块的详解 1.os.getcwd() :得到当前工作目录,即当前python解释器所在目录路径 impor ...
学习笔记之Python全栈开发/人工智能公开课_腾讯课堂
Python全栈开发/人工智能公开课_腾讯课堂 https://ke.qq.com/course/190378 https://github.com/haoran119/ke.qq.com.pytho ...
Python全栈面试题
Mr.Seven 博客园首页新随笔联系订阅管理随笔-132 文章-153 评论-516 不吹不擂,你想要的Python面试都在这里了[315+道题] 写在前面近日恰逢学生毕 ...

随机推荐

python语法基础-面向对象-进阶-长期维护
############### @property定义属性 ############## # 所以对于定义属性你有好几种方式了和种类了,# 静态属性,动态属性, # property # ...
discount the possibility|pessimistic|bankrupt|
Nor can we discount the possibility that some factor in the diet itself has harmful effects. ADJ-GRA ...
Postman接口测试学习笔记
1.postman界面下载安装postman工具,以下是postman的界面快捷区:提供常用的操作入口,新建请求,执行器,导入别人共享的收藏夹测试数据,包括运行收藏夹的一组测试数据: 侧边栏:搜索 ...
Docker：发布镜像问题denied: requested access to the resource is denied的解决方法
问题: 发布镜像的时候,按照教程执行的,结果没有成功,搜了下,找到解决方法了,记录一下. denied: requested access to the resource is denied 解决方法 ...
深入JVM内核--JVM简介
JVM概念 jvm是指通过软件模拟的具有完整硬件系统功能的.运行在一个完全隔离环境中的完成计算机系统. 目前主要有vmMare.visual Box和JVM三款虚拟机. JVM使用软件模拟java字节 ...
MOOC（7）- case依赖、读取json配置文件进行多个接口请求-测试类中调用封装的mock（10）
封装mock后,在单元测试中调用 # -*- coding: utf-8 -*- # @Time : 2020/2/11 8:35 # @File : test_class_10.py.py # @A ...
take office|boast|think twice|dispose of|level|stuff|'s mature for|a green hand|'s a slave to|
One reporter wrote that Dewey was acting like a man who had already been elected and was only passin ...
Nginx笔记总结三:内核参数优化
net.ipv4.netfilter.ip_conntrack_tcp_timeout_established = 1800 net.ipv4.ip_conntrack_max = 16777216 ...
在JavaScript里的“对象字面量”是什么意思？
字面量表示如何表达这个值,一般除去表达式,给变量赋值时,等号右边都可以认为是字面量.字面量分为字符串字面量(string literal ).数组字面量(array literal)和对象字面量(ob ...
Acwing 843. n-皇后问题
n-皇后问题是指将 n 个皇后放在 n∗n 的国际象棋棋盘上,使得皇后不能相互攻击到,即任意两个皇后都不能处于同一行.同一列或同一斜线上. 现在给定整数n,请你输出所有的满足条件的棋子摆法. 输入格式 ...

scrapy全栈抓xpc练习

scrapy全栈抓xpc练习的更多相关文章

随机推荐

热门专题