scrapy爬取《坏蛋是怎样炼成的4》

scrapy具体介绍就不用说了，自己百度一下。或者参考以下文档

https://blog.csdn.net/u011054333/article/details/70165401

直接在cmd里运行

scrapy startproject huaidan

scrapy genspider huaidan huaidan4.com

然后贴代码放到spiders文件夹里

 1 # -*- coding: utf-8 -*-

 2 import scrapy

 3 from scrapy.http import Request

 4 from urllib import parse

 5 import re

 6

 7 class huaidan(scrapy.Spider):

 8     name = "huaidan"

 9     allowed_domains = ["www.huaidan4.com"]

10     start_urls = ["http://www.huaidan4.com/di-yi-fen-lei.html",

11                   "http://www.huaidan4.com/di-er-juan.html",

12                   "http://www.huaidan4.com"]

13

14

15     #提取下一页文章url交给scrpy进行下载

16     def parse(self, response):

17         #获取文章url

18         all_article=response.css('.container ul li a::attr(href)').extract()

19         all_url=[]

20         for article_url in all_article:

21             if article_url in all_url:

22                 pass

23             else:

24                 all_url.append(article_url)

25                 yield Request(url=article_url,encoding='utf-8',callback=self.parse_detail)

26

27

28

29

30     #提取文章的具体字段

31     def parse_detail(self,response):

32         #获取文章标题

33         article_title = response.xpath('//*[@id="content"]/div[1]/div[1]/h2/text()').extract_first()

34

35         #获取创建时间

36         create_time = response.xpath('//*[@id="content"]/div[1]/div[1]/span/text()[2]').extract_first().strip()

37

38         #获取文章正文

39         article_text = response.css('.post_entry,p::text').extract_first()

40         #处理正文标点符号和无用的信息

41         article_text = re.sub('</?\w+[^>]*>','',article_text)

42         article_text = article_text.replace("\', \'","")

43         article_text = article_text.replace("\\u3000","").strip()

44         article_text = article_text.replace("\\xa0\\xa0\\xa0\\xa0","")

45         article_text = article_text.replace("(新书上传，求收藏，推荐!!!!!!!!!!!!!!!!!!!!)","")

46         article_text = article_text.replace("\\r\\n", "\n")

47         article_text = article_text.replace("免费小说", "")

48         article_text = article_text.replace("www.huaidan4.com", "")

49         article_text = article_text.replace("neirong_2();", "")

50         article_text = article_text.replace("dibutuijian();", "")

51         article_text = article_text.replace("◎欢迎参与讨论，请在这里发表您的看法、交流您的观点。", "")

52         article_text = article_text.replace("《坏蛋是怎样炼成的4》是继曹三少坏蛋是怎样炼成的3的又一作品，作者是曹三少，如果你喜欢坏蛋是怎样炼成的4，请收藏本站以便下次阅读。","")

53         article_text = re.sub('/?\s+', '', article_text)

54

55         #保存文件

56         self.save_article(article_title,create_time,str(article_text))

57

58     #保存文件的方法

59     def save_article(self,article_title,create_time,article_text):

60         biaoti = re.sub('\W+','-',article_title)

61         with open(biaoti+'.txt','w',encoding='utf-8') as file:

62             neirong = (article_title+'\n'+create_time+'\n'+article_text)

63             file.write(neirong)

64             file.close()

以上内容初步完成了把文章保存在本地

---------------------------------------------------------------------------------------------------------------------------------------------------------------

下面内容完成把文章保存到mysql数据库

items.py负责存放爬取节点数据

import scrapy

class HuaidanItem(scrapy.Item):
    """Fields collected for one scraped chapter."""

    # Used by this project's MySQL pipelines as the name of the target table.
    catalogues = scrapy.Field()
    # Chapter identifier; the MySQL pipelines convert it with int().
    id = scrapy.Field()
    # Chapter title.
    article_title = scrapy.Field()
    # Chapter body text.
    article_text = scrapy.Field()
    # Creation time of the chapter.
    create_time = scrapy.Field()

pipelines.py负责处理items里的内容

# -*- coding: utf-8 -*-

import pymysql

from twisted.enterprise import adbapi

# Define your item pipelines here

#

# Don't forget to add your pipeline to the ITEM_PIPELINES setting

# See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html

#from scrapy.pipelines.images import ImagesPipeline

#from scrapy.pipelines.files import FilesPipeline

class HuaidanPipeline:
    """Default no-op pipeline: every item is handed on unchanged."""

    def process_item(self, item, spider):
        """Return *item* untouched; *spider* is not used."""
        return item

# Synchronous pipeline: writes every item straight into MySQL.
class MysqlPiplines(object):
    """Item pipeline that inserts each chapter into MySQL with a blocking call.

    A chapter is inserted into the table named by ``item["catalogues"]``
    unless its id already exists in one of the tables ``diyijuan``,
    ``dierjuan``, ``disanjuan`` or ``other``.
    """

    def __init__(self, host="192.168.7.5", user="huaidan",
                 password="huaidan123", database="huaidan"):
        """Open the connection; the defaults are the original fixed values."""
        self.db = pymysql.connect(host=host, user=user, password=password,
                                  database=database, charset='utf8')
        self.cursor = self.db.cursor()

    @classmethod
    def from_crawler(cls, crawler):
        """Build the pipeline from the MYSQL_* settings, if they are defined."""
        settings = crawler.settings
        return cls(
            host=settings.get("MYSQL_HOST", "192.168.7.5"),
            user=settings.get("MYSQL_USER", "huaidan"),
            password=settings.get("MYSQL_PASSWORD", "huaidan123"),
            database=settings.get("MYSQL_DBNAME", "huaidan"),
        )

    def process_item(self, item, spider):
        """Store *item* in MySQL and return it for the next pipeline."""
        self.insert(item["catalogues"], int(item["id"]), item["article_title"],
                    item["create_time"], item["article_text"])
        return item

    def insert(self, catalogues, id, article_title, create_time, article_text):
        """Insert one chapter into table *catalogues* unless *id* already exists.

        Values are passed as query parameters so quotes in the text cannot
        break or alter the statement.  A table name cannot be a parameter,
        so it is only accepted when it consists of letters, digits and
        underscores.  Database errors roll the transaction back and are
        logged; they are not raised.
        """
        import logging
        logger = logging.getLogger(__name__)

        table = str(catalogues)
        if not table.replace("_", "").isalnum():
            logger.error("Refusing to insert into invalid table name %r", catalogues)
            return

        row_id = int(id)
        selectsql = ("select id from diyijuan where id = %s "
                     " union select id from dierjuan where id = %s"
                     " union select id from disanjuan where id = %s"
                     " union select id from other where id = %s ")
        self.cursor.execute(selectsql, (row_id, row_id, row_id, row_id))
        if self.cursor.fetchone() is not None:
            # The chapter is already stored in one of the tables.
            return

        insertsql = "insert into " + table + " values (%s,%s,%s,%s);"
        try:
            self.cursor.execute(insertsql, (row_id, article_title, create_time, article_text))
            self.db.commit()
        except pymysql.MySQLError:
            self.db.rollback()
            logger.exception("Insert of chapter %s into %s failed", row_id, table)

    def close_spider(self, spider):
        """Close the connection; Scrapy calls this hook when the spider closes."""
        self.db.close()

    def spider_closed(self, spider):
        """Kept for callers of the original method name; same as close_spider."""
        self.close_spider(spider)

# Asynchronous pipeline: inserts into MySQL through a Twisted connection pool.
class MysqlTwisted(object):
    """Item pipeline that stores chapters in MySQL without blocking the crawl.

    Inserts run through ``adbapi.ConnectionPool.runInteraction``; failures
    are reported by ``handle_error``.
    """

    def __init__(self, dbpool):
        self.dbpool = dbpool

    @classmethod
    def from_settings(cls, settings):
        """Create the connection pool from the MYSQL_* project settings."""
        dbparms = dict(
            host=settings["MYSQL_HOST"],
            user=settings["MYSQL_USER"],
            password=settings["MYSQL_PASSWORD"],
            database=settings["MYSQL_DBNAME"],
            charset='utf8',
            cursorclass=pymysql.cursors.DictCursor,
            use_unicode=True,
        )
        dbpool = adbapi.ConnectionPool("pymysql", **dbparms)
        return cls(dbpool)

    def process_item(self, item, spider):
        """Schedule the insert of *item* and return the item.

        The item is returned so that later pipelines still receive it; the
        insert itself completes asynchronously.
        """
        query = self.dbpool.runInteraction(self.do_insert, item)
        query.addErrback(self.handle_error)
        return item

    def handle_error(self, faileure):
        """Log a failed asynchronous insert."""
        import logging
        logging.getLogger(__name__).error("MySQL insert failed: %s", faileure)

    def do_insert(self, cursor, item):
        """Insert *item* into table ``item["catalogues"]`` unless its id exists.

        Values are passed as query parameters and are therefore stored
        exactly as scraped.  A table name cannot be a parameter, so it is
        only accepted when it consists of letters, digits and underscores.

        Raises:
            ValueError: if the table name contains any other character.
        """
        table = str(item["catalogues"])
        if not table.replace("_", "").isalnum():
            raise ValueError("invalid table name: %r" % (item["catalogues"],))

        # Check whether the id is already present in any of the tables.
        row_id = int(item["id"])
        selectsql = ("select id from diyijuan where id = %s "
                     " union select id from dierjuan where id = %s"
                     " union select id from disanjuan where id = %s"
                     " union select id from other where id = %s ")
        cursor.execute(selectsql, (row_id, row_id, row_id, row_id))

        # No row means the chapter is not stored yet, so insert it.
        if cursor.fetchone() is None:
            insertsql = "insert into " + table + " values (%s,%s,%s,%s);"
            cursor.execute(insertsql, (row_id, item["article_title"],
                                       item["create_time"], item["article_text"]))

class myarticlepipline:
    """Placeholder pipeline that passes every item through unchanged."""

    def process_item(self, item, spider):
        """Return *item* as received; *spider* is not used."""
        return item

settings.py负责存放整体设置

# -*- coding: utf-8 -*-

import os

# Scrapy settings for huaidan project

#

# For simplicity, this file contains only settings considered important or

# commonly used. You can find more settings consulting the documentation:

#

#     https://doc.scrapy.org/en/latest/topics/settings.html

#     https://doc.scrapy.org/en/latest/topics/downloader-middleware.html

#     https://doc.scrapy.org/en/latest/topics/spider-middleware.html

# Name of this Scrapy project/bot.
BOT_NAME = "huaidan"

# Package that holds the spiders, also used as the module for new spiders.
SPIDER_MODULES = ["huaidan.spiders"]
NEWSPIDER_MODULE = "huaidan.spiders"

# Crawl responsibly by identifying yourself (and your website) on the user-agent
#USER_AGENT = 'huaidan (+http://www.yourdomain.com)'

# robots.txt rules are not obeyed by this project.
ROBOTSTXT_OBEY = False

# Configure maximum concurrent requests performed by Scrapy (default: 16)

#CONCURRENT_REQUESTS = 32

# Configure a delay for requests for the same website (default: 0)

# See https://doc.scrapy.org/en/latest/topics/settings.html#download-delay

# See also autothrottle settings and docs

#DOWNLOAD_DELAY = 3

# The download delay setting will honor only one of:

#CONCURRENT_REQUESTS_PER_DOMAIN = 16

#CONCURRENT_REQUESTS_PER_IP = 16

# Disable cookies (enabled by default)

#COOKIES_ENABLED = False

# Disable Telnet Console (enabled by default)

#TELNETCONSOLE_ENABLED = False

# Override the default request headers:

#DEFAULT_REQUEST_HEADERS = {

#   'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',

#   'Accept-Language': 'en',

#}

# Enable or disable spider middlewares

# See https://doc.scrapy.org/en/latest/topics/spider-middleware.html

#SPIDER_MIDDLEWARES = {

#    'huaidan.middlewares.HuaidanSpiderMiddleware': 543,

#}

# Enable or disable downloader middlewares

# See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html

#DOWNLOADER_MIDDLEWARES = {

#    'huaidan.middlewares.HuaidanDownloaderMiddleware': 543,

#}

# Enable or disable extensions

# See https://doc.scrapy.org/en/latest/topics/extensions.html

#EXTENSIONS = {

#    'scrapy.extensions.telnet.TelnetConsole': None,

#}

# Configure item pipelines

# See https://doc.scrapy.org/en/latest/topics/item-pipeline.html

# Pipelines to run and their order: the lower the number, the earlier it runs.
ITEM_PIPELINES = {
    #'huaidan.pipelines.HuaidanPipeline': 300,
    #'scrapy.pipelines.files.FilesPipeline': 2,
    #'huaidan.pipelines.myarticlepipline': 1,
    #'huaidan.pipelines.MysqlPiplines': 2,    # synchronous insert into MySQL
    'huaidan.pipelines.MysqlTwisted': 1,      # asynchronous insert into MySQL
}

# Directory that contains this settings file.
project_dir = os.path.abspath(os.path.dirname(__file__))

# Settings for the FilesPipeline, which is commented out above.
# This line used to be the chained assignment `FILES_URLS=FIELD =""`, which
# defined two names nothing reads; the setting name is FILES_URLS_FIELD.
# NOTE(review): the empty value is kept from the original — set it to the
# item field that holds the file URLs before enabling FilesPipeline.
FILES_URLS_FIELD = ""
FILES_STORE = os.path.join(project_dir, 'files')

# Enable and configure the AutoThrottle extension (disabled by default)

# See https://doc.scrapy.org/en/latest/topics/autothrottle.html

#AUTOTHROTTLE_ENABLED = True

# The initial download delay

#AUTOTHROTTLE_START_DELAY = 5

# The maximum download delay to be set in case of high latencies

#AUTOTHROTTLE_MAX_DELAY = 60

# The average number of requests Scrapy should be sending in parallel to

# each remote server

#AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0

# Enable showing throttling stats for every response received:

#AUTOTHROTTLE_DEBUG = False

# Enable and configure HTTP caching (disabled by default)

# See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings

#HTTPCACHE_ENABLED = True

#HTTPCACHE_EXPIRATION_SECS = 0

#HTTPCACHE_DIR = 'httpcache'

#HTTPCACHE_IGNORE_HTTP_CODES = []

#HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'

# Database connection details are kept in the settings so that the pipelines
# can read them.  Each value can be overridden with an environment variable
# of the same name, which keeps real credentials out of the source file; the
# defaults are the original values.
MYSQL_HOST = os.environ.get("MYSQL_HOST", "192.168.7.5")
MYSQL_DBNAME = os.environ.get("MYSQL_DBNAME", "huaidan")
MYSQL_USER = os.environ.get("MYSQL_USER", "huaidan")
MYSQL_PASSWORD = os.environ.get("MYSQL_PASSWORD", "huaidan123")

scrapy爬取《坏蛋是怎样练成的4》的更多相关文章

【转载】教你分分钟学会用python爬虫框架Scrapy爬取心目中的女神
原文:教你分分钟学会用python爬虫框架Scrapy爬取心目中的女神本博文将带领你从入门到精通爬虫框架Scrapy,最终具备爬取任何网页的数据的能力.本文以校花网为例进行爬取,校花网:http:/ ...
第三百三十四节，web爬虫讲解2—Scrapy框架爬虫—Scrapy爬取百度新闻，爬取Ajax动态生成的信息
第三百三十四节,web爬虫讲解2—Scrapy框架爬虫—Scrapy爬取百度新闻,爬取Ajax动态生成的信息 crapy爬取百度新闻,爬取Ajax动态生成的信息,抓取百度新闻首页的新闻rul地址有多 ...
Scrapy爬取美女图片续集 (原创)
上一篇咱们讲解了Scrapy的工作机制和如何使用Scrapy爬取美女图片,而今天接着讲解Scrapy爬取美女图片,不过采取了不同的方式和代码实现,对Scrapy的功能进行更深入的运用.(我的新书< ...
python scrapy爬取HBS 汉堡南美航运公司柜号信息
下面分享个scrapy的例子利用scrapy爬取HBS 船公司柜号信息 1.前期准备查询提单号下的柜号有哪些,主要是在下面的网站上,输入提单号,然后点击查询 https://www.hamburg ...
scrapy 爬取纵横网实战
前言闲来无事就要练练代码,不知道最近爬取什么网站好,就拿纵横网爬取我最喜欢的雪中悍刀行练手吧准备 python3 scrapy 项目创建: cmd命令行切换到工作目录创建scrapy项目两条命 ...
如何提升scrapy爬取数据的效率
在配置文件中修改相关参数: 增加并发默认的scrapy开启的并发线程为32个,可以适当的进行增加,再配置文件中修改CONCURRENT_REQUESTS = 100值为100,并发设置成了为100. ...
提高Scrapy爬取效率
1.增加并发: 默认scrapy开启的并发线程为32个,可以适当进行增加.在settings配置文件中修改CONCURRENT_REQUESTS = 100值为100,并发设置成了为100. 2.降低 ...
scrapy爬取效率提升配置
增加并发: 默认scrapy开启的并发线程为32个,可以适当进行增加.在settings配置文件中修改CONCURRENT_REQUESTS = 100值为100,并发设置成了为100. 降低日志级别 ...
提高scrapy爬取效率配置
提高scrapy爬取效率配置 #增加并发: 默认scrapy开启的并发线程为32个,可以适当进行增加.在settings配置文件中修改CONCURRENT_REQUESTS = 100值为100,并发 ...
Scrapy爬取美女图片 (原创)
有半个月没有更新了,最近确实有点忙.先是华为的比赛,接着实验室又有项目,然后又学习了一些新的知识,所以没有更新文章.为了表达我的歉意,我给大家来一波福利... 今天咱们说的是爬虫框架.之前我使用pyt ...

随机推荐

不会一致性hash算法，劝你简历别写搞过负载均衡
大家好,我是小富~ 个人公众号:程序员内点事,欢迎学习交流这两天看到技术群里,有小伙伴在讨论一致性hash算法的问题,正愁没啥写的题目就来了,那就简单介绍下它的原理.下边我们以分布式缓存中经典场景举 ...
【漏洞复现】CVE-2022–21661 WordPress核心框架WP_Query SQL注入漏洞原理分析与复现
影响版本 wordpress < 5.8.3 分析参考:https://blog.csdn.net/qq_46717339/article/details/122431779 在 5.8.3 ...
微服务架构 | 3.3 Apache Zookeeper 注册中心
@ 目录前言 1. Zookeeper 基础知识 1.1 Zookeeper 是什么 1.2 Zookeeper 的数据结构 1.3 Watcher 机制 1.4 常见应用场景分析 1.5 Zook ...
【刷题-LeetCode】221. Maximal Square
Maximal Square Given a 2D binary matrix filled with 0's and 1's, find the largest square containing ...
微服务架构 | 12.1 使用 Apache Dubbo 实现远程通信
目录前言 1. Dubbo 基础知识 1.1 Dubbo 是什么 1.2 Dubbo 的架构图 1.3 Spring Cloud 与 Dubbo 的区别 1.4 Dubbo 的特点 1.5 Dubb ...
「Python实用秘技04」为pdf文件批量添加文字水印
本文完整示例代码及文件已上传至我的Github仓库https://github.com/CNFeffery/PythonPracticalSkills 这是我的系列文章「Python实用秘技」的第4期 ...
new实例化和反射实例化有什么区别？
在工厂设计模式中,使用反射实例化,子类可以随便增加,工厂类不需要做任何的修改使用反射之后最大的好处就是解耦合
redis一主两从搭建
一主两从搭建: 主配: daemonize yes port 6379 logfile ./redis6379.log dir ./ bind 10.131.156.170 从1配: daemoniz ...
python利用正则表达式提取文本中特定内容
正则表达式是一个特殊的字符序列,它能帮助你方便的检查一个字符串是否与某种模式匹配. Python 自1.5版本起增加了re 模块,它提供 Perl 风格的正则表达式模式. re 模块使 Python ...
码风QwQ
注:卡常.压行时怎么有效怎么来QwQ 快读真香.( 不喜欢用字符数组,使用string. 此时cin cout输入前会加这样三句以优化: ios::sync_with_stdio(0); cin.ti ...

scrapy爬取《坏蛋是怎样练成的4》

scrapy爬取《坏蛋是怎样练成的4》的更多相关文章

随机推荐

热门专题