Scrapy spiders
a. Configuration file
# settings.py
DEPTH_LIMIT = 1         # maximum "recursion" depth when following links
ROBOTSTXT_OBEY = False  # the target site's robots.txt declares which urls may be crawled; False means do not obey it
b. Selectors
.//                  # among the node's descendants
./                   # among the node's direct children
./div                # div tags among the children
./div[@id='i1']      # div tags among the children with id='i1'
obj.extract()        # convert every object in the list to a string => list of strings
obj.extract_first()  # convert to string and return only the first element of the list
//div/text()         # get a tag's text
#!/usr/bin/env python
# -*- coding:utf-8 -*-
from scrapy.selector import Selector, HtmlXPathSelector
from scrapy.http import HtmlResponse
html = """<!DOCTYPE html>
<html>
<head lang="en">
<meta charset="UTF-8">
<title></title>
</head>
<body>
<ul>
<li class="item-"><a id='i1' href="link.html">first item</a></li>
<li class="item-0"><a id='i2' href="llink.html">first item</a></li>
<li class="item-1"><a href="llink2.html">second item<span>vv</span></a></li>
</ul>
<div><a href="llink2.html">second item</a></div>
</body>
</html>
"""
response = HtmlResponse(url='http://example.com', body=html,encoding='utf-8')
# hxs = HtmlXPathSelector(response)
# print(hxs)
# hxs = Selector(response=response).xpath('//a')
# print(hxs)
# hxs = Selector(response=response).xpath('//a[2]')
# print(hxs)
# hxs = Selector(response=response).xpath('//a[@id]')
# print(hxs)
# hxs = Selector(response=response).xpath('//a[@id="i1"]')
# print(hxs)
# hxs = Selector(response=response).xpath('//a[@href="link.html"][@id="i1"]')
# print(hxs)
# hxs = Selector(response=response).xpath('//a[contains(@href, "link")]')
# print(hxs)
# hxs = Selector(response=response).xpath('//a[starts-with(@href, "link")]')
# print(hxs)
# hxs = Selector(response=response).xpath('//a[re:test(@id, "i\d+")]')
# print(hxs)
# hxs = Selector(response=response).xpath('//a[re:test(@id, "i\d+")]/text()').extract()
# print(hxs)
# hxs = Selector(response=response).xpath('//a[re:test(@id, "i\d+")]/@href').extract()
# print(hxs)
# hxs = Selector(response=response).xpath('/html/body/ul/li/a/@href').extract()
# print(hxs)
# hxs = Selector(response=response).xpath('//body/ul/li/a/@href').extract_first()
# print(hxs)

# ul_list = Selector(response=response).xpath('//body/ul/li')
# for item in ul_list:
#     v = item.xpath('./a/span')
#     # or
#     # v = item.xpath('a/span')
#     # or
#     # v = item.xpath('*/a/span')
#     print(v)
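As a quick illustration of the extract()/extract_first() note in the selector list above, here is a minimal sketch that reuses the response built in this script and prints what the two calls return:
# extract() vs extract_first(), reusing the response constructed above
hxs = Selector(response=response).xpath('//a/@href')
print(hxs.extract())        # every match as a string: ['link.html', 'llink.html', 'llink2.html', 'llink2.html']
print(hxs.extract_first())  # only the first match: 'link.html'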
c. Structured processing
# settings.py
ITEM_PIPELINES = {
    'day96.pipelines.Day96Pipeline': 300,
}
DB = "....."

# -*- coding: utf-8 -*-

# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html
from scrapy.exceptions import DropItem


class Day96Pipeline(object):
    def __init__(self, conn_str):
        self.conn_str = conn_str

    @classmethod
    def from_crawler(cls, crawler):
        """
        Called at startup to create the pipeline object.
        :param crawler:
        :return:
        """
        conn_str = crawler.settings.get('DB')
        return cls(conn_str)

    def open_spider(self, spider):
        """
        Called when the spider starts running.
        :param spider:
        :return:
        """
        self.conn = open(self.conn_str, 'a')

    def close_spider(self, spider):
        """
        Called when the spider is closed.
        :param spider:
        :return:
        """
        self.conn.close()

    def process_item(self, item, spider):
        """
        Called every time an item needs to be persisted.
        :param item:
        :param spider:
        :return:
        """
        # if spider.name == 'chouti'
        tpl = "%s\n%s\n\n" % (item['title'], item['href'])
        self.conn.write(tpl)
        # hand the item on to the next pipeline
        return item
        # to discard the item and stop it from reaching later pipelines:
        # raise DropItem()
pipelines.py
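The pipeline above reads item['title'] and item['href'], but the matching item class and the spider code that yields it are not shown in these notes; a minimal sketch of what they could look like (the Day96Item name and the field values are assumptions, not from the original):
# items.py (hypothetical item class; field names match what Day96Pipeline expects)
import scrapy

class Day96Item(scrapy.Item):
    title = scrapy.Field()
    href = scrapy.Field()

# inside the spider's parse(), each scraped entry reaches the pipeline by yielding an item:
# from day96.items import Day96Item
# yield Day96Item(title=text.strip(), href=link)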
d. Common commands
scrapy startproject sp1
cd sp1
scrapy genspider baidu baidu.com  # create a spider
scrapy crawl baidu
scrapy crawl baidu --nolog
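For reference, scrapy genspider baidu baidu.com writes a spider skeleton roughly like the one below into sp1/spiders/baidu.py (the exact template text varies a little between Scrapy versions):
# sp1/spiders/baidu.py -- generated skeleton (approximate)
import scrapy


class BaiduSpider(scrapy.Spider):
    name = 'baidu'
    allowed_domains = ['baidu.com']
    start_urls = ['http://baidu.com/']

    def parse(self, response):
        pass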
e. Project layout
sp1
- scrapy.cfg          # bootstrap configuration file
- sp1
  - spiders           # directory holding the spiders
  - items.py          # item definitions (structured data)
  - pipelines.py      # persistence
  - middlewares.py    # middlewares
  - settings.py       # settings
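For completeness, the scrapy.cfg generated by scrapy startproject sp1 is just a small INI file that points at the settings module, roughly:
# scrapy.cfg (approximate contents)
[settings]
default = sp1.settings

[deploy]
project = sp1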
Example
# -*- coding: utf-8 -*-
import scrapy
import sys
import io
from scrapy.selector import Selector, HtmlXPathSelector


class ChoutiSpider(scrapy.Spider):
    name = 'chouti'
    allowed_domains = ['chouti.com']
    start_urls = ['http://dig.chouti.com/']

    def parse(self, response):
        hxs = Selector(response=response).xpath('//div[@id="content-list"]/div[@class="item"]')
        for obj in hxs:
            a = obj.xpath('.//a[@class="show-content color-chag"]/text()').extract_first()
            print(a.strip())
Get the Chouti news titles
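To run this spider, cd into the project directory and use the crawl command from section d (the project name is assumed to be the day96 project referenced in the pipeline settings above):
scrapy crawl chouti --nolog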
# -*- coding: utf-8 -*-
import scrapy
import sys
import io
from scrapy.selector import Selector, HtmlXPathSelector


class ChoutiSpider(scrapy.Spider):
    name = 'chouti'
    allowed_domains = ['chouti.com']
    start_urls = ['http://dig.chouti.com/']

    visited_urls = set()

    def parse(self, response):
        # get the urls of all page-number links on the current page
        hxs = Selector(response=response).xpath('//div[@id="dig_lcpage"]//a/@href').extract()
        for url in hxs:
            md5_url = self.md5(url)
            if md5_url in self.visited_urls:
                print('already seen', url)
            else:
                self.visited_urls.add(md5_url)
                print(url)

    def md5(self, url):
        import hashlib
        obj = hashlib.md5()
        obj.update(bytes(url, encoding='utf-8'))
        return obj.hexdigest()
Get all the page-number links on the current Chouti page
# -*- coding: utf-8 -*-
import scrapy
import sys
import io
from scrapy.http import Request
from scrapy.selector import Selector, HtmlXPathSelector


class ChoutiSpider(scrapy.Spider):
    name = 'chouti'
    allowed_domains = ['chouti.com']
    start_urls = ['http://dig.chouti.com/']

    visited_urls = set()

    def parse(self, response):
        # get the urls of all page-number links on the current page
        hxs = Selector(response=response).xpath('//div[@id="dig_lcpage"]//a/@href').extract()
        for url in hxs:
            md5_url = self.md5(url)
            if md5_url in self.visited_urls:
                pass
            else:
                print(url)
                self.visited_urls.add(md5_url)
                url = "http://dig.chouti.com%s" % url
                # hand the new url to the scheduler
                yield Request(url=url, callback=self.parse)

    def md5(self, url):
        import hashlib
        obj = hashlib.md5()
        obj.update(bytes(url, encoding='utf-8'))
        return obj.hexdigest()
Get all Chouti page numbers (following pagination recursively)
a. Avoiding duplicate urls
# settings.py
DUPEFILTER_CLASS = "day96.duplication.RepeatFilter"

class RepeatFilter(object):
    def __init__(self):
        self.visited_set = set()

    @classmethod
    def from_settings(cls, settings):
        print('...')
        return cls()

    def request_seen(self, request):
        if request.url in self.visited_set:
            return True
        self.visited_set.add(request.url)
        return False

    def open(self):  # can return deferred
        print('open')

    def close(self, reason):  # can return a deferred
        print('close')

    def log(self, request, spider):  # log that a request has been filtered
        # print('log....')
        pass
duplication.py
# -*- coding: utf-8 -*-
import scrapy
import sys
import io
from scrapy.http import Request
from scrapy.selector import Selector, HtmlXPathSelector
from scrapy.dupefilter import RFPDupeFilter  # the built-in dupe filter that RepeatFilter replaces


class ChoutiSpider(scrapy.Spider):
    name = 'chouti'
    allowed_domains = ['chouti.com']
    start_urls = ['http://dig.chouti.com/']

    def parse(self, response):
        print(response.url)
        # get the urls of all page-number links on the current page
        hxs = Selector(response=response).xpath('//div[@id="dig_lcpage"]//a/@href').extract()
        for url in hxs:
            url = "http://dig.chouti.com%s" % url
            # hand the new url to the scheduler
            yield Request(url=url, callback=self.parse)

    def md5(self, url):
        import hashlib
        obj = hashlib.md5()
        obj.update(bytes(url, encoding='utf-8'))
        return obj.hexdigest()
chouti.py
传智播客 (itcast)
# Crawl the teachers' names from 传智播客
# scrapy startproject mySpider

# cat /Users/huaixiaozi/PycharmProjects/mySpider/mySpider/items.py
import scrapy


class ItcastItem(scrapy.Item):  # the item class imported by the spider below
    name = scrapy.Field()
    title = scrapy.Field()
    info = scrapy.Field()

# cat /Users/huaixiaozi/PycharmProjects/mySpider/mySpider/spiders/itcastspider.py
import scrapy
from mySpider.items import ItcastItem


# define a spider class
class ItcastSpider(scrapy.Spider):
    # spider name
    name = "itcast"
    # domains (not full urls) the spider is allowed to crawl
    allowed_domains = ["itcast.cn"]
    # start url of the spider
    start_urls = ["http://www.itcast.cn/channel/teacher.shtml#"]

    def parse(self, response):
        # use scrapy's built-in xpath to get the list of root nodes, one per teacher
        teacher_list = response.xpath('//div[@class="li_txt"]')
        teacherItem = []
        # iterate over the root nodes
        for each in teacher_list:
            item = ItcastItem()
            name = each.xpath('./h3/text()').extract()
            title = each.xpath('./h4/text()').extract()
            info = each.xpath('./p/text()').extract()
            print("--------------", type(name))
            item['name'] = name[0]
            item['title'] = title[0]
            item['info'] = info[0]
            teacherItem.append(item)
        return teacherItem

# save to a json file
scrapy crawl itcast -o itcast.json
# save to a csv file
scrapy crawl itcast -o itcast.csv
Crawl the teachers' names from 传智播客
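The -o exports above can also be configured once in settings.py through the feed-export settings instead of being passed on every run; a sketch, assuming a pre-2.1 Scrapy (newer versions use the FEEDS dict instead):
# settings.py -- feed export (FEED_URI / FEED_FORMAT apply to Scrapy versions before 2.1)
FEED_URI = 'itcast.json'
FEED_FORMAT = 'json'
FEED_EXPORT_ENCODING = 'utf-8'  # keep non-ascii characters (e.g. Chinese names) readable in the output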