Analysis of a Scrapy spider that crawls WooYun's public vulnerability disclosures
# -*- coding: utf-8 -*-
from datetime import datetime

import pymongo
import scrapy

from wooyun.items import WooyunItem
from scrapy.conf import settings


class WooyunSpider(scrapy.Spider):
    name = "wooyun"  # spider name; run it with: scrapy crawl wooyun
    allowed_domains = ["wooyun.org"]
    start_urls = [
        'http://wooyun.org/bugs/new_public/'
    ]  # the Spider base class iterates this list (for url in self.start_urls: yield Request(url, dont_filter=True)) -- the starting point of the crawl

    def __init__(self, page_max=settings['PAGE_MAX_DEFAULT'], local_store=settings['LOCAL_STORE_DEFAULT'],
                 update=settings['UPDATE_DEFAULT'], *args, **kwargs):
        super(WooyunSpider, self).__init__(*args, **kwargs)
        self.page_max = int(page_max)
        self.local_store = 'true' == local_store.lower()
        self.update = 'true' == update.lower()
        self.connection_string = "mongodb://%s:%d" % (settings['MONGODB_SERVER'], settings['MONGODB_PORT'])
        self.client = pymongo.MongoClient(self.connection_string)
        self.db = self.client[settings['MONGODB_DB']]
        self.collection = self.db[settings['MONGODB_COLLECTION']]

    def closed(self, reason):
        self.client.close()

    def parse(self, response):
        # default callback, called with the response of the first listing page
        total_pages = response.xpath("//p[@class='page']/text()").re('\d+')[1]
        if self.page_max == 0:
            end_page = int(total_pages)
        else:
            end_page = self.page_max
        for n in range(1, end_page + 1):
            page = "/bugs/new_public/page/%d" % n  # one page of WooYun's public vulnerability list
            url = response.urljoin(page)
            yield scrapy.Request(url, self.parse_list)  # parse one page of the list

    def parse_list(self, response):
        # collect the report links from one listing page
        links = response.xpath('//tbody/tr/td/a/@href').extract()
        for url in links:
            wooyun_id = url.split('/')[2]
            if self.update or not self.__search_mongodb(wooyun_id):
                url = response.urljoin(url)
                yield scrapy.Request(url, self.parse_detail)

    def parse_detail(self, response):
        # extract the content of a single vulnerability report
        item = WooyunItem()
        item['wooyun_id'] = response.xpath('//*[@id="bugDetail"]/div[5]/h3[1]/a/@href').extract()[0].split('/')[2]
        item['title'] = response.xpath('//title/text()').extract()[0].split("|")[0]
        item['bug_type'] = response.xpath("//h3[@class='wybug_type']/text()").extract()[0].split(u':')[1].strip()
        #item['bug_type'] = response.xpath('//*[@id="bugDetail"]/div[5]/h3[7]/text()').extract()[0].split(u':')[1].strip()
        # Some reports have no plain-text author, for example:
        # http://wooyun.org/bugs/wooyun-2010-01010
        # Parsing the author there raises an exception, hence the try/except.
        try:
            #item['author'] = response.xpath("//h3[@class='wybug_author']/a/text()").extract()[0]
            item['author'] = response.xpath('//*[@id="bugDetail"]/div[5]/h3[4]/a/text()').extract()[0]
        except Exception:
            item['author'] = '<Parse Error>'
        # response.body is a str; convert it to UTF-8 first,
        # otherwise saving it to MongoDB may cause trouble.
        item['html'] = response.body.decode('utf-8', 'ignore')
        #dt = response.xpath("//h3[@class='wybug_date']/text()").re("[\d+]{4}-[\d+]{2}-[\d+]{2}")[0].split('-')
        dt = response.xpath('//*[@id="bugDetail"]/div[5]/h3[5]/text()').re("[\d+]{4}-[\d+]{2}-[\d+]{2}")[0].split('-')
        item['datetime'] = datetime(int(dt[0]), int(dt[1]), int(dt[2]))
        #dt = response.xpath("//h3[@class='wybug_open_date']/text()").re("[\d+]{4}-[\d+]{2}-[\d+]{2}")[0].split('-')
        dt = response.xpath('//*[@id="bugDetail"]/div[5]/h3[6]/text()').re("[\d+]{4}-[\d+]{2}-[\d+]{2}")[0].split('-')
        item['datetime_open'] = datetime(int(dt[0]), int(dt[1]), int(dt[2]))
        # image URLs queued for download
        item['image_urls'] = []
        if self.local_store:
            # WooYun serves images in two URL forms: absolute ones under
            # http://static.wooyun.org/wooyun/upload/ and relative ones under /upload/...
            # For the latter, prepend http://www.wooyun.org to form a complete URL;
            # pipelines.py applies the corresponding reverse mapping when storing them.
            image_urls = response.xpath("//img[contains(@src, '/upload/')]/@src").extract()
            for u in image_urls:
                if self.__check_ingnored_image(u):
                    continue
                if u.startswith('/'):
                    u = 'http://www.wooyun.org' + u
                item['image_urls'].append(u)
        return item  # produce one item

    def __check_ingnored_image(self, image_url):
        for ignored_url in settings['IMAGE_DOWLOAD_IGNORED']:
            if ignored_url in image_url:
                return True
        return False

    def __search_mongodb(self, wooyun_id):
        # True if this wooyun_id is already stored in MongoDB
        wooyun_id_exsist = self.collection.find({'wooyun_id': wooyun_id}).count() > 0
        return wooyun_id_exsist
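The WooyunItem imported above is not shown in this post. A plausible sketch of wooyun/items.py, inferred only from the fields the spider fills in (the actual project may declare it differently):

import scrapy

class WooyunItem(scrapy.Item):
    wooyun_id = scrapy.Field()
    title = scrapy.Field()
    bug_type = scrapy.Field()
    author = scrapy.Field()
    html = scrapy.Field()
    datetime = scrapy.Field()
    datetime_open = scrapy.Field()
    image_urls = scrapy.Field()  # filled by the spider when local_store is enabled
    images = scrapy.Field()      # assumption: companion field if Scrapy's ImagesPipeline is used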
In scrapy.Request(url, self.parse_detail), the Request object carries a callback. Request objects are placed in the scheduler's queue, and the engine later pulls each request back out of the scheduler to have it downloaded.
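To make the callback flow concrete, here is a minimal self-contained sketch (a demo spider, not part of the WooYun project) of the same pattern: every yielded Request is queued by the scheduler, and the engine later calls that Request's callback with the downloaded response.

import scrapy

class CallbackDemoSpider(scrapy.Spider):
    name = "callback_demo"  # hypothetical spider, for illustration only
    start_urls = ["http://example.com/"]

    def parse(self, response):
        # Each Request yielded here enters the scheduler's queue; the engine
        # dequeues it and invokes parse_item with the downloaded response.
        for href in response.xpath("//a/@href").extract():
            yield scrapy.Request(response.urljoin(href), callback=self.parse_item)

    def parse_item(self, response):
        yield {
            "url": response.url,
            "title": response.xpath("//title/text()").extract_first(),
        }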
The function invoked by Scrapy's crawl command (scrapy/commands/crawl.py):
def run(self, args, opts):
    if len(args) < 1:
        raise UsageError()
    elif len(args) > 1:
        raise UsageError("running 'scrapy crawl' with more than one spider is no longer supported")
    spname = args[0]

    self.crawler_process.crawl(spname, **opts.spargs)  # crawler_process is the CrawlerProcess instance attached in cmdline.execute() below
    self.crawler_process.start()
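Note that opts.spargs is filled from the -a options of scrapy crawl, and every value arrives as a string; this is why WooyunSpider.__init__ converts page_max with int() and compares the other flags against the string 'true'. A rough sketch of the mapping (the dictionary below is illustrative, not taken from the project):

# Roughly what "scrapy crawl wooyun -a page_max=2 -a local_store=true -a update=false" produces:
spargs = {'page_max': '2', 'local_store': 'true', 'update': 'false'}  # all values are strings
# crawler_process.crawl('wooyun', **spargs) forwards these keyword arguments
# to WooyunSpider's constructor (via the spider's from_crawler hook)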
C:\Python27\Lib\site-packages\scrapy\cmdline.py
def execute(argv=None, settings=None):
    ......
    cmd.settings = settings
    cmd.add_options(parser)
    opts, args = parser.parse_args(args=argv[1:])
    _run_print_help(parser, cmd.process_options, args, opts)
    cmd.crawler_process = CrawlerProcess(settings)  # attach a CrawlerProcess to the command object
    _run_print_help(parser, _run_command, cmd, args, opts)
    sys.exit(cmd.exitcode)
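execute() is also what the scrapy console script itself runs, so the whole command-line tool reduces to roughly the following (a hedged sketch, not the literal entry-point file):

from scrapy.cmdline import execute

if __name__ == '__main__':
    execute()  # parse argv, build CrawlerProcess(settings), dispatch to the chosen command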
CrawlerProcess(settings) [C:\Python27\Lib\site-packages\scrapy\crawler.py]
class CrawlerProcess(CrawlerRunner):
    """
    A class to run multiple scrapy crawlers in a process simultaneously.

    This class extends :class:`~scrapy.crawler.CrawlerRunner` by adding support
    for starting a Twisted `reactor`_ and handling shutdown signals, like the
    keyboard interrupt command Ctrl-C. It also configures top-level logging.

    This utility should be a better fit than
    :class:`~scrapy.crawler.CrawlerRunner` if you aren't running another
    Twisted `reactor`_ within your application.

    The CrawlerProcess object must be instantiated with a
    :class:`~scrapy.settings.Settings` object.

    This class shouldn't be needed (since Scrapy is responsible of using it
    accordingly) unless writing scripts that manually handle the crawling
    process. See :ref:`run-from-script` for an example.
    """

    def __init__(self, settings=None):
        super(CrawlerProcess, self).__init__(settings)
        install_shutdown_handlers(self._signal_shutdown)
        configure_logging(self.settings)
        log_scrapy_info(self.settings)

    def _signal_shutdown(self, signum, _):
        install_shutdown_handlers(self._signal_kill)
        signame = signal_names[signum]
        logger.info("Received %(signame)s, shutting down gracefully. Send again to force ",
                    {'signame': signame})
        reactor.callFromThread(self._graceful_stop_reactor)

    def _signal_kill(self, signum, _):
        install_shutdown_handlers(signal.SIG_IGN)
        signame = signal_names[signum]
        logger.info('Received %(signame)s twice, forcing unclean shutdown',
                    {'signame': signame})
        reactor.callFromThread(self._stop_reactor)

    def start(self, stop_after_crawl=True):
        """
        This method starts a Twisted `reactor`_, adjusts its pool size to
        :setting:`REACTOR_THREADPOOL_MAXSIZE`, and installs a DNS cache based
        on :setting:`DNSCACHE_ENABLED` and :setting:`DNSCACHE_SIZE`.

        If `stop_after_crawl` is True, the reactor will be stopped after all
        crawlers have finished, using :meth:`join`.

        :param boolean stop_after_crawl: stop or not the reactor when all
            crawlers have finished
        """
        if stop_after_crawl:
            d = self.join()
            # Don't start the reactor if the deferreds are already fired
            if d.called:
                return
            d.addBoth(self._stop_reactor)

        reactor.installResolver(self._get_dns_resolver())
        tp = reactor.getThreadPool()
        tp.adjustPoolsize(maxthreads=self.settings.getint('REACTOR_THREADPOOL_MAXSIZE'))
        reactor.addSystemEventTrigger('before', 'shutdown', self.stop)
        reactor.run(installSignalHandlers=False)  # blocking call

    def _get_dns_resolver(self):
        if self.settings.getbool('DNSCACHE_ENABLED'):
            cache_size = self.settings.getint('DNSCACHE_SIZE')
        else:
            cache_size = 0
        return CachingThreadedResolver(
            reactor=reactor,
            cache_size=cache_size,
            timeout=self.settings.getfloat('DNS_TIMEOUT')
        )

    def _graceful_stop_reactor(self):
        d = self.stop()
        d.addBoth(self._stop_reactor)
        return d

    def _stop_reactor(self, _=None):
        try:
            reactor.stop()
        except RuntimeError:  # raised if already stopped or in shutdown stage
            pass
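As the docstring notes, CrawlerProcess is mainly useful when you run a crawl from your own script instead of the scrapy command. A minimal sketch for this project, assuming it is executed inside the wooyun project directory so that get_project_settings() can locate the project's settings module:

from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings

process = CrawlerProcess(get_project_settings())
# spider arguments are passed exactly as "scrapy crawl wooyun -a ..." would pass them
process.crawl('wooyun', page_max='1', local_store='false', update='false')
process.start()  # starts the Twisted reactor and blocks until the crawl finishes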