A simple example of crawling 网DAI之家 (wdzj.com) with requests and BeautifulSoup.

This is just for my own notes.
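
Before the full script, here is a minimal warm-up sketch of the fetch-and-parse pattern it repeats everywhere: request the page with a browser-like User-Agent, decode the raw bytes as UTF-8 by hand (the site's declared encoding is unreliable, which is why the script avoids response.text), and feed the decoded text to BeautifulSoup with the lxml parser. This assumes requests, bs4 and lxml are installed; fetch_soup is only an illustrative name, not something used in the script below.

import requests
from bs4 import BeautifulSoup

HEADERS = {'User-Agent': "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/22.0.1207.1 Safari/537.1"}


def fetch_soup(url):
    # fetch with a browser-like User-Agent, then decode the bytes as UTF-8
    # manually instead of trusting the encoding requests guesses for .text
    resp = requests.get(url, headers=HEADERS)
    return BeautifulSoup(str(resp.content, 'utf-8'), "lxml")


if __name__ == '__main__':
    # the archive search page used as the entry point in the script below
    soup = fetch_soup('https://www.wdzj.com/dangan/search?filter=&sort=1&currentPage=1')
    print(soup.title.string if soup.title else soup.get_text()[:100])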


#!/usr/bin/python3

import requests
from bs4 import BeautifulSoup
import re
import xlwt


class wdzj_spider:
    pingTaiInfo = []

    def request(self, url):
        headers = {'User-Agent': "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/22.0.1207.1 Safari/537.1"}
        return requests.get(url, headers=headers)

    def saveToExcel(self, infoList, fileName='resultme.xls', sheetName='www.wdzj.com'):
        # For each field to write, look up which column its title maps to
        # (adding a new column if needed), then write the value into that column
        rowNo = 1
        excelTitle = {}
        book = xlwt.Workbook()  # create a workbook
        if book is None:
            print("Failed to create file {0}".format(fileName))
            return None
        sheet = book.add_sheet(sheetName)  # add a sheet
        if sheet is None:
            print("Failed to create sheet {0}".format(sheetName))
            return None
        for info in infoList:
            for item in info.items():
                if str(item[0]) in excelTitle:
                    colNo = excelTitle[item[0]]
                else:
                    colNo = len(excelTitle)
                    excelTitle[str(item[0])] = colNo
                sheet.write(rowNo, colNo, str(item[1]))
            rowNo = rowNo + 1
        for m in excelTitle.items():
            sheet.write(0, m[1], m[0])  # write the title row
        book.save(fileName)
        return rowNo

    def getDataplaneFromPage(self, Link):
        mainHtml = self.request(Link)
        dataInfo = {}
        # the default encoding detection is unreliable, so decode manually
        txtUTF8 = str(mainHtml.content, 'utf-8')
        mainBS = BeautifulSoup(txtUTF8, "lxml", from_encoding="utf8")
        lis = mainBS.find_all('li', attrs={'class': 'normal'})
        for tr in lis:
            divs = tr.find_all('div')
            dataInfo[divs[1].text.strip()] = divs[0].text.strip()
        # print("data={0}".format(dataInfo))
        return dataInfo

    def getGongshangFromPage(self, Link):
        mainHtml = self.request(Link)
        gongshangInfo = {}
        # the default encoding detection is unreliable, so decode manually
        txtUTF8 = str(mainHtml.content, 'utf-8')
        mainBS = BeautifulSoup(txtUTF8, "lxml", from_encoding="utf8")
        trs = mainBS.find('div', attrs={'class': 'lcen'}).find_all('tr')
        for tr in trs:
            tdTitles = tr.find_all('td', attrs={'class': 't'})
            tds = tr.find_all('td')
            index = 1
            for td in tdTitles:
                gongshangInfo[td.text] = tds[index].text.strip(' \n\r')
                index = index + 2
        # print("business registration info={0}".format(gongshangInfo))
        return gongshangInfo

    def getLinkFromPage(self, pingtaiName, pingtaiLink):
        shujuInfo = {}
        lianxifangshiInfo = {}
        pingtaiHtml = self.request(pingtaiLink)
        # the default encoding detection is unreliable, so decode manually
        txtUTF8 = str(pingtaiHtml.content, 'utf-8')
        mainBS = BeautifulSoup(txtUTF8, "lxml", from_encoding="utf8")
        briefText = mainBS.find("div", class_="cen-zk").get_text().strip(' \n\r')
        briefInfo = {"P2P平台名称": pingtaiName, "简介": briefText}
        # print("brief={0}".format(briefText))
        gongshangLink = 'https://' + mainBS.find('a', text='工商/备案')['href'].lstrip('/')
        dataA = mainBS.find('div', attrs={'class': 'common-header-nav'}).find('a', text='数据')
        lianxifangshiTitles = mainBS.find('div', attrs={'class': "da-lxfs zzfwbox"}).find_all('div', class_='l')
        lianxifangshiContents = mainBS.find('div', attrs={'class': "da-lxfs zzfwbox"}).find_all('div', class_='r')
        for i in range(0, len(lianxifangshiTitles)):
            lianxifangshiInfo[lianxifangshiTitles[i].get_text().strip(' \n\r')] = lianxifangshiContents[i].get_text().strip(' \n\r')
        # print("contact info={0}".format(lianxifangshiInfo))
        if dataA:
            dataLink = 'https://' + dataA['href'].lstrip('/')
            shujuInfo = self.getDataplaneFromPage(dataLink)
        gongshangInfo = self.getGongshangFromPage(gongshangLink)
        self.pingTaiInfo.append({**briefInfo, **gongshangInfo, **shujuInfo, **lianxifangshiInfo})

    def getAllPage(self):
        startUrl = 'https://www.wdzj.com/dangan/search?filter=&sort=1&currentPage=1'
        mainHtml = self.request(startUrl)
        pageStr = BeautifulSoup(mainHtml.text, "lxml").find("span", class_="all").text
        searchObj = re.search(r'1/([0-9]+)', pageStr, re.M | re.I)
        pageCount = searchObj.group(1)
        startUrl = 'https://www.wdzj.com/dangan/search?filter=&sort=1&currentPage='
        baseUrl = 'https://www.wdzj.com'
        print("Total pages: {0}".format(pageCount))
        for i in range(1, int(pageCount) + 1):
            # for i in range(1, 2):
            urlPage = startUrl + str(i)
            pageHtml = self.request(urlPage)
            pageStrs = BeautifulSoup(pageHtml.text, "lxml").find('ul', attrs={'class': 'terraceList'}).find_all('h2')
            print("---------------------------------")
            print("Start crawling page {0}: {1} platforms listed".format(i, len(pageStrs)))
            for p in pageStrs:
                a = p.find('a')
                self.getLinkFromPage(a.get_text(), baseUrl + a['href'])
                print("#", end='', flush=True)
            print("\nFinished page {0}: crawled {1} platforms".format(i, len(pageStrs)))
        self.saveToExcel(self.pingTaiInfo, fileName='p2p.xls')


if __name__ == '__main__':
    w = wdzj_spider()
    w.getAllPage()
 

A revised version follows. The main changes: the workbook is opened once up front (openExcel) and the title row is written only when it is closed (closeExcel); the column map and row counter become instance state so results can be written to the sheet page by page (saveToExcel is called after each page and pingTaiInfo is cleared to keep memory down); the contact-info block is checked before use; and each platform crawl is wrapped in try/except so a single failure does not abort the whole run.

#!/usr/bin/python3

import requests
from bs4 import BeautifulSoup
import re
import xlwt


class wdzj_spider:
    pingTaiInfo = []
    book = None
    sheet = None
    excelTitle = {}
    rowNo = 1

    def request(self, url):
        headers = {'User-Agent': "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/22.0.1207.1 Safari/537.1"}
        return requests.get(url, headers=headers)

    def openExcel(self, fileName='resultme.xls', sheetName='www.wdzj.com'):
        self.book = xlwt.Workbook()  # create a workbook
        if self.book is None:
            print("Failed to create file {0}".format(fileName))
            return None
        self.sheet = self.book.add_sheet(sheetName)  # add a sheet
        if self.sheet is None:
            print("Failed to create sheet {0}".format(sheetName))
            return None

    def closeExcel(self, fileName='resultme.xls'):
        # write the title row collected so far, then save the workbook
        for m in self.excelTitle.items():
            self.sheet.write(0, m[1], m[0])
        self.book.save(fileName)

    def saveToExcel(self, infoList, fileName='resultme.xls'):
        # For each field to write, look up which column its title maps to
        # (adding a new column if needed), then write the value into that column
        for info in infoList:
            for item in info.items():
                if str(item[0]) in self.excelTitle:
                    colNo = self.excelTitle[item[0]]
                else:
                    colNo = len(self.excelTitle)
                    self.excelTitle[str(item[0])] = colNo
                self.sheet.write(self.rowNo, colNo, str(item[1]))
            self.rowNo = self.rowNo + 1
        self.book.save(fileName)
        return self.rowNo

    def getDataplaneFromPage(self, Link):
        mainHtml = self.request(Link)
        dataInfo = {}
        # the default encoding detection is unreliable, so decode manually
        txtUTF8 = str(mainHtml.content, 'utf-8')
        mainBS = BeautifulSoup(txtUTF8, "lxml", from_encoding="utf8")
        lis = mainBS.find_all('li', attrs={'class': 'normal'})
        for tr in lis:
            divs = tr.find_all('div')
            dataInfo[divs[1].text.strip()] = divs[0].text.strip()
        # print("data={0}".format(dataInfo))
        return dataInfo

    def getGongshangFromPage(self, Link):
        mainHtml = self.request(Link)
        gongshangInfo = {}
        # the default encoding detection is unreliable, so decode manually
        txtUTF8 = str(mainHtml.content, 'utf-8')
        mainBS = BeautifulSoup(txtUTF8, "lxml", from_encoding="utf8")
        trs = mainBS.find('div', attrs={'class': 'lcen'}).find_all('tr')
        for tr in trs:
            tdTitles = tr.find_all('td', attrs={'class': 't'})
            tds = tr.find_all('td')
            index = 1
            for td in tdTitles:
                gongshangInfo[td.text] = tds[index].text.strip(' \n\r')
                index = index + 2
        # print("business registration info={0}".format(gongshangInfo))
        return gongshangInfo

    def getLinkFromPage(self, pingtaiName, pingtaiLink):
        shujuInfo = {}
        lianxifangshiInfo = {}
        pingtaiHtml = self.request(pingtaiLink)
        # the default encoding detection is unreliable, so decode manually
        txtUTF8 = str(pingtaiHtml.content, 'utf-8')
        mainBS = BeautifulSoup(txtUTF8, "lxml", from_encoding="utf8")
        briefText = mainBS.find("div", class_="cen-zk").get_text().strip(' \n\r')
        briefInfo = {"P2P平台名称": pingtaiName, "简介": briefText}
        # print("brief={0}".format(briefInfo))
        gongshangLink = 'https://' + mainBS.find('a', text='工商/备案')['href'].lstrip('/')
        dataA = mainBS.find('div', attrs={'class': 'common-header-nav'}).find('a', text='数据')
        lianxifangshiBox = mainBS.find('div', attrs={'class': "da-lxfs zzfwbox"})
        lianxifangshiTitles = []
        lianxifangshiContents = []
        if lianxifangshiBox:
            lianxifangshiTitles = lianxifangshiBox.find_all('div', class_='l')
            lianxifangshiContents = lianxifangshiBox.find_all('div', class_='r')
        for i in range(0, len(lianxifangshiTitles)):
            lianxifangshiInfo[lianxifangshiTitles[i].get_text().strip(' \n\r')] = lianxifangshiContents[i].get_text().strip(' \n\r')
        # print("contact info={0}".format(lianxifangshiInfo))
        if dataA:
            dataLink = 'https://' + dataA['href'].lstrip('/')
            shujuInfo = self.getDataplaneFromPage(dataLink)
        gongshangInfo = self.getGongshangFromPage(gongshangLink)
        self.pingTaiInfo.append({**briefInfo, **gongshangInfo, **shujuInfo, **lianxifangshiInfo})

    def getAllPage(self):
        startUrl = 'https://www.wdzj.com/dangan/search?filter=&sort=1&currentPage=1'
        mainHtml = self.request(startUrl)
        pageStr = BeautifulSoup(mainHtml.text, "lxml").find("span", class_="all").text
        searchObj = re.search(r'1/([0-9]+)', pageStr, re.M | re.I)
        pageCount = searchObj.group(1)
        startUrl = 'https://www.wdzj.com/dangan/search?filter=&sort=1&currentPage='
        baseUrl = 'https://www.wdzj.com'
        print("Total pages: {0}".format(pageCount))
        fileName = 'p2p.xls'
        self.openExcel(fileName)
        for i in range(1, int(pageCount) + 1):
            # for i in range(1, 2):
            urlPage = startUrl + str(i)
            pageHtml = self.request(urlPage)
            pageStrs = BeautifulSoup(pageHtml.text, "lxml").find('ul', attrs={'class': 'terraceList'}).find_all('h2')
            print("---------------------------------")
            print("Start crawling page {0}: {1} platforms listed".format(i, len(pageStrs)))
            count = 0
            for p in pageStrs:
                a = p.find('a')
                try:
                    self.getLinkFromPage(a.get_text(), baseUrl + a['href'])
                    count = count + 1
                except:
                    print("Failed to crawl item {0}, name <{1}>".format(count + 1, a.get_text()))
                print("#", end='', flush=True)
            print("\nFinished page {0}: crawled {1} platforms".format(i, count))
            self.saveToExcel(self.pingTaiInfo, fileName=fileName)
            self.pingTaiInfo.clear()
        self.closeExcel(fileName)


if __name__ == '__main__':
    w = wdzj_spider()
    w.getAllPage()
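
After a run, the resulting p2p.xls can be spot-checked without opening Excel. A minimal sketch, assuming pandas and the xlrd engine (needed for the old .xls format) are installed; this is not part of the crawler itself:

import pandas as pd

# read the sheet written by the spider (sheet name comes from openExcel's default)
df = pd.read_excel('p2p.xls', sheet_name='www.wdzj.com')
print(df.shape)               # rows = platforms crawled, columns = collected field titles
print(df.columns.tolist())    # the title row written by closeExcel()
print(df.head())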
