18.scrapy_maitian

ershoufang.py

# -*- coding: utf-8 -*-

import scrapy

class ErshoufangSpider(scrapy.Spider):
    """Placeholder spider registered under the name ``ershoufang``.

    No extraction logic is implemented yet: ``parse`` receives the response
    for the start URL and produces neither items nor follow-up requests.
    """

    name = 'ershoufang'
    allowed_domains = ['maitian.com']
    start_urls = ['http://maitian.com/']

    def parse(self, response):
        """Accept the downloaded page and return nothing."""
        return None

zufang_spider.py

import scrapy

from maitian.items import MaitianItem

class MaitianSpider(scrapy.Spider):
    """Spider ``zufang``: scrapes listing blocks and follows pagination.

    Each ``<div class="list_title">`` on a result page becomes one dict with
    the keys ``title``, ``price``, ``area`` and ``district``.  A field whose
    node is absent from the page is emitted as ``None`` instead of raising,
    so a single malformed listing can no longer abort the page and drop the
    next-page request.
    """

    name = "zufang"

    start_urls = ['http://bj.maitian.cn/zfall/PG1']

    # District names searched for in the text of a listing's <p> element.
    DISTRICT_PATTERN = r'昌平|朝阳|东城|大兴|丰台|海淀|石景山|顺义|通州|西城'

    @staticmethod
    def _first_text(selector, query):
        """Return the stripped first result of *query*, or None if no node matches."""
        value = selector.xpath(query).extract_first()
        if value is None:
            return None
        return value.strip()

    def parse(self, response):
        """Yield one dict per listing, then a request for the next page.

        :param response: the downloaded result page.
        :returns: a generator of item dicts followed by at most one
            ``scrapy.Request`` for the "down_page" link.
        """
        for listing in response.xpath('//div[@class="list_title"]'):
            area = self._first_text(listing, './p/span/text()')
            if area is not None:
                # Drop the unit suffix, then any whitespace it leaves behind.
                area = area.replace('㎡', '').strip()

            yield {
                'title': self._first_text(listing, './h1/a/text()'),
                'price': self._first_text(
                    listing,
                    './div[@class="the_price"]/ol/strong/span/text()',
                ),
                'area': area,
                # re_first() returns None when no district name is present,
                # where indexing the result of re() would raise IndexError.
                'district': listing.xpath('./p//text()').re_first(
                    self.DISTRICT_PATTERN),
            }

        next_page_url = response.xpath(
            '//div[@id="paging"]/a[@class="down_page"]/@href').extract_first()
        if next_page_url is not None:
            # The href may be relative; resolve it against the current page.
            yield scrapy.Request(response.urljoin(next_page_url))

items.py

# -*- coding: utf-8 -*-

# Define here the models for your scraped items

# See documentation in:

# https://doc.scrapy.org/en/latest/topics/items.html

import scrapy

class MaitianItem(scrapy.Item):
    """Container for one scraped listing; declares the four fields it carries."""

    # Field names mirror the keys of the dicts produced by the "zufang" spider.
    title = scrapy.Field()
    price = scrapy.Field()
    area = scrapy.Field()
    district = scrapy.Field()

middlewares.py

# -*- coding: utf-8 -*-

# Define here the models for your spider middleware

#

# See documentation in:

# https://doc.scrapy.org/en/latest/topics/spider-middleware.html

from scrapy import signals

class MaitianSpiderMiddleware(object):
    """Pass-through spider middleware.

    Every hook is optional: when one is not defined, Scrapy behaves as if the
    middleware leaves the passed objects untouched.  The hooks defined here
    forward their input unchanged.
    """

    @classmethod
    def from_crawler(cls, crawler):
        """Create the middleware and subscribe it to ``spider_opened``."""
        middleware = cls()
        crawler.signals.connect(
            middleware.spider_opened,
            signal=signals.spider_opened,
        )
        return middleware

    def process_spider_input(self, response, spider):
        """Hook for each response on its way into the spider.

        Should return None or raise an exception; this one always returns None.
        """
        return None

    def process_spider_output(self, response, result, spider):
        """Hook for what the spider returned after processing *response*.

        Must produce an iterable of Request, dict or Item objects; every
        element of *result* is re-yielded as is.
        """
        yield from result

    def process_spider_exception(self, response, exception, spider):
        """Hook for exceptions raised by a spider or by process_spider_input().

        Should return None or an iterable of Response, dict or Item objects;
        this one returns None.
        """
        return None

    def process_start_requests(self, start_requests, spider):
        """Hook for the spider's start requests (no response is associated).

        Must produce only requests, not items; each one is re-yielded as is.
        """
        yield from start_requests

    def spider_opened(self, spider):
        """Log the name of the spider that was just opened."""
        message = 'Spider opened: %s' % spider.name
        spider.logger.info(message)

class MaitianDownloaderMiddleware(object):
    """Pass-through downloader middleware.

    Every hook is optional: when one is not defined, Scrapy behaves as if the
    middleware leaves the passed objects untouched.  The hooks defined here
    do not alter requests or responses.
    """

    @classmethod
    def from_crawler(cls, crawler):
        """Create the middleware and subscribe it to ``spider_opened``."""
        middleware = cls()
        crawler.signals.connect(
            middleware.spider_opened,
            signal=signals.spider_opened,
        )
        return middleware

    def process_request(self, request, spider):
        """Hook for each request passing through the downloader middleware.

        Allowed outcomes:
        - return None: continue processing this request
        - return a Response object
        - return a Request object
        - raise IgnoreRequest: process_exception() methods of installed
          downloader middleware will be called

        This implementation always returns None.
        """
        return None

    def process_response(self, request, response, spider):
        """Hook for the response returned from the downloader.

        Allowed outcomes:
        - return a Response object
        - return a Request object
        - raise IgnoreRequest

        This implementation hands back *response* unchanged.
        """
        return response

    def process_exception(self, request, exception, spider):
        """Hook for exceptions from a download handler or process_request().

        Allowed outcomes:
        - return None: continue processing this exception
        - return a Response object: stops process_exception() chain
        - return a Request object: stops process_exception() chain

        This implementation returns None.
        """
        return None

    def spider_opened(self, spider):
        """Log the name of the spider that was just opened."""
        message = 'Spider opened: %s' % spider.name
        spider.logger.info(message)

pipelines.py

# -*- coding: utf-8 -*-

# Define your item pipelines here

# Don't forget to add your pipeline to the ITEM_PIPELINES setting

# See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html

import pymongo

from scrapy.conf import settings

class MaitianPipeline(object):
    """Item pipeline that writes every scraped item to a MongoDB collection.

    Connection parameters come from the project settings ``MONGODB_HOST``,
    ``MONGODB_PORT``, ``MONGODB_DBNAME`` and ``MONGODB_DOCNAME``.
    """

    def __init__(self):
        """Open the MongoDB connection and select the target collection."""
        # NOTE(review): `settings` is the module-level object imported from
        # scrapy.conf, a legacy global; confirm it is still available in the
        # installed Scrapy release (newer code reads crawler.settings instead).
        host = settings['MONGODB_HOST']
        port = settings['MONGODB_PORT']
        db_name = settings['MONGODB_DBNAME']

        # Keep a handle on the client so close_spider() can release it.
        self.client = pymongo.MongoClient(host=host, port=port)
        db = self.client[db_name]
        self.post = db[settings['MONGODB_DOCNAME']]

    def process_item(self, item, spider):
        """Insert a copy of *item* into the collection and pass *item* on.

        :param item: the scraped item (dict or Item).
        :param spider: the spider that produced the item (unused).
        :returns: *item* itself, so later pipelines still receive it.
        """
        # Copy first: the insert adds an ``_id`` key to the document it is
        # given, and the item handed to later pipelines should not gain it.
        zufang = dict(item)
        # insert_one() replaces Collection.insert(), which PyMongo deprecated
        # in 3.0 and removed in 4.0.
        self.post.insert_one(zufang)
        return item

    def close_spider(self, spider):
        """Close the MongoDB connection when the spider finishes."""
        self.client.close()

settings.py

# -*- coding: utf-8 -*-

# Scrapy settings for maitian project

#

# For simplicity, this file contains only settings considered important or

# commonly used. You can find more settings consulting the documentation:

#

#     https://doc.scrapy.org/en/latest/topics/settings.html

#     https://doc.scrapy.org/en/latest/topics/downloader-middleware.html

#     https://doc.scrapy.org/en/latest/topics/spider-middleware.html

BOT_NAME = "maitian"

# The project keeps its spiders in a single package, so the module used for
# newly generated spiders is also the only entry in the lookup list.
NEWSPIDER_MODULE = "maitian.spiders"
SPIDER_MODULES = [NEWSPIDER_MODULE]

# Crawl responsibly by identifying yourself (and your website) on the user-agent
#USER_AGENT = 'maitian (+http://www.yourdomain.com)'

# Obey robots.txt rules
ROBOTSTXT_OBEY = True

# Configure maximum concurrent requests performed by Scrapy (default: 16)

#CONCURRENT_REQUESTS = 32

# Configure a delay for requests for the same website (default: 0)

# See https://doc.scrapy.org/en/latest/topics/settings.html#download-delay

# See also autothrottle settings and docs

#DOWNLOAD_DELAY = 3

# The download delay setting will honor only one of:

#CONCURRENT_REQUESTS_PER_DOMAIN = 16

#CONCURRENT_REQUESTS_PER_IP = 16

# Disable cookies (enabled by default)

#COOKIES_ENABLED = False

# Disable Telnet Console (enabled by default)

#TELNETCONSOLE_ENABLED = False

# Override the default request headers:

#DEFAULT_REQUEST_HEADERS = {

#   'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',

#   'Accept-Language': 'en',

#}

# Enable or disable spider middlewares

# See https://doc.scrapy.org/en/latest/topics/spider-middleware.html

#SPIDER_MIDDLEWARES = {

#    'maitian.middlewares.MaitianSpiderMiddleware': 543,

#}

# Enable or disable downloader middlewares

# See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html

#DOWNLOADER_MIDDLEWARES = {

#    'maitian.middlewares.MaitianDownloaderMiddleware': 543,

#}

# Enable or disable extensions

# See https://doc.scrapy.org/en/latest/topics/extensions.html

#EXTENSIONS = {

#    'scrapy.extensions.telnet.TelnetConsole': None,

#}

# Configure item pipelines

# See https://doc.scrapy.org/en/latest/topics/item-pipeline.html

#ITEM_PIPELINES = {

#    'maitian.pipelines.MaitianPipeline': 300,

#}

# Single definition of ITEM_PIPELINES. A second assignment anywhere else in
# this file would silently replace this one, so keep it unique.
ITEM_PIPELINES = {
    'maitian.pipelines.MaitianPipeline': 300,
}

# Enable and configure the AutoThrottle extension (disabled by default)
# See https://doc.scrapy.org/en/latest/topics/autothrottle.html
#AUTOTHROTTLE_ENABLED = True
# The initial download delay
#AUTOTHROTTLE_START_DELAY = 5
# The maximum download delay to be set in case of high latencies
#AUTOTHROTTLE_MAX_DELAY = 60
# The average number of requests Scrapy should be sending in parallel to
# each remote server
#AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
# Enable showing throttling stats for every response received:
#AUTOTHROTTLE_DEBUG = False

# Enable and configure HTTP caching (disabled by default)
# See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
#HTTPCACHE_ENABLED = True
#HTTPCACHE_EXPIRATION_SECS = 0
#HTTPCACHE_DIR = 'httpcache'
#HTTPCACHE_IGNORE_HTTP_CODES = []
#HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'

# MongoDB connection parameters read by maitian.pipelines.MaitianPipeline.
MONGODB_HOST = '127.0.0.1'
MONGODB_PORT = 27017
MONGODB_DBNAME = 'maitian'
MONGODB_DOCNAME = 'zufang'

18.scrapy_maitian的更多相关文章

CSharpGL(18)分别处理glDrawArrays()和glDrawElements()两种方式下的拾取(ColorCodedPicking)
CSharpGL(18)分别处理glDrawArrays()和glDrawElements()两种方式下的拾取(ColorCodedPicking) 我在(Modern OpenGL用Shader拾取 ...
ABP(现代ASP.NET样板开发框架)系列之18、ABP应用层——权限验证
点这里进入ABP系列文章总目录 ABP(现代ASP.NET样板开发框架)系列之18.ABP应用层——权限验证 ABP是“ASP.NET Boilerplate Project (ASP.NET样板项目 ...
ASP.NET MVC5+EF6+EasyUI 后台管理系统（18）-权限管理系统-表数据
系列目录这一节,我们插入数据来看看数据流,让各位同学,知道这个权限表交互是怎么一个流程,免得大家后天雾里来雾里去首先我再解释一些表,SysUser和SysRole表不用解释了. SysRoleSys ...
C#开发微信门户及应用(18)-微信企业号的通讯录管理开发之成员管理
在上篇随笔<C#开发微信门户及应用(17)-微信企业号的通讯录管理开发之部门管理>介绍了通讯录的部门的相关操作管理,通讯录管理包括部门管理.成员管理.标签管理三个部分,本篇主要介绍成员的管 ...
[MySQL Reference Manual] 18 复制
18 复制 18 复制 18.1 复制配置 18.1.1 基于Binary Log的数据库复制配置 18.1.2 配置基于Binary log的复制 18.1.2.1 设置复制master的配置 18 ...
Hihocoder 太阁最新面经算法竞赛18
Hihocoder 太阁最新面经算法竞赛18 source: https://hihocoder.com/contest/hihointerview27/problems 题目1 : Big Plus ...
grep-2.26 sed-4.2.2 awk-4.1.4 wget-1.18 pcregrep-8.39 pcre2grep-10.22 for windows 最新版本静态编译
-------------------------------------------------------------------------------------------- grep (G ...
《C#本质论》读书笔记（18）多线程处理
.NET Framework 4.0 看(本质论第3版) .NET Framework 4.5 看(本质论第4版) .NET 4.0为多线程引入了两组新API:TPL(Task Parallel Li ...
Java随机生成18位身份证号
package com.ihome.data; import java.text.SimpleDateFormat; import java.util.Calendar; import java.ut ...

随机推荐

Java&Quartz实现任务调度
目录 Java&Quartz实现任务调度 1.Quartz的作用 2.预备 3.Quartz核心 3.1.Job接口 3.2.JobDetail类 3.3 JobExecutionContex ...
nginx、php-fpm安装mongodb及驱动扩展
1.安装mongodb linux下安装mongodb很简单,执行如下命令完成安装 wget http://downloads.mongodb.org/linux/mongodb-linux-i686 ...
P1910 L国的战斗之间谍
P1910 L国的战斗之间谍题目背景 L国即将与I国发动战争!! 题目描述俗话说的好:“知己知彼,百战不殆”.L国的指挥官想派出间谍前往I国,于是,选人工作就落到了你身上. 你现在有N个人选,每个 ...
Python 数据结构_队列
目录目录队列队列 Queue 队列是一种先进先出(FIFO)的数据类型, 新的元素通过入队的方式添加进 Queue 的末尾, 出队就是从 Queue 的头部删除元素. 用列表来做 Queu ...
Delphi中文件名函数-路径、名称、子目录、驱动器、扩展名
文件名函数文件名函数可以对文件的名称.所在子目录.驱动器和扩展名等进行操作.下表列出这些函数及其功能. 函数说明 ExpandFileName() //返回文件的全路径(含驱动器.路径) Extra ...
day 88 DjangoRestFramework学习二之序列化组件、视图组件
DjangoRestFramework学习二之序列化组件.视图组件本节目录一序列化组件二视图组件三 xxx 四 xxx 五 xxx 六 xxx 七 xxx 八 xxx 一序列化组件 ...
创建第一个spirngmvc小项目
题外: 设置目录为源代码目录 1.进入:file->project structure->modules->soures 进入这个里面,选择相应的文件夹.例如src/java里的ja ...
nutch2.2.1+mysql抓取数据
基本环境:linux centos6.5 nutch2.2.1 源码包, mysql 5.5 ,elasticsearch1.1.1, jdk1.7 1.下载地址http://mirror.bjtu. ...
Spark DataFrame中的join使用说明
spark sql 中join的类型 Spark DataFrame中join与SQL很像,都有inner join, left join, right join, full join; 类型说明 ...
Parallels Desktop Centos 设置IP
参考链接 Parallels Desktop虚拟的Centos系统设置静态IP连网 https://blog.csdn.net/hotdust/article/details/53812953#com ...

18.scrapy_maitian

18.scrapy_maitian的更多相关文章

随机推荐

热门专题