Python crawler examples: a case collection
1. Douban book scraper
#coding:utf-8
# Scrape Douban book info and cover images, then write them to the database

from urllib import request
from lxml import etree
import pymysql

url = "https://book.douban.com/tag/%E5%B0%8F%E8%AF%B4"
headers = {
    'Host': 'book.douban.com',
    'Upgrade-Insecure-Requests': '1',
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/64.0.3282.140 Safari/537.36'
}
req = request.Request(url=url, headers=headers, method="GET")
content = request.urlopen(req).read().decode("utf-8")
content_dict = etree.HTML(content)  # parse the HTML
content_dict_allli = content_dict.xpath(r'//*[@id="subject_list"]/ul/li')  # all book <li> nodes

# open the database connection once, outside the loop
db = pymysql.connect(host='localhost', port=3306, user="root", password='root', db='douban', charset='utf8')
cur = db.cursor()

for li in content_dict_allli:
    # book title (note: the XPath is relative to the <li>, unlike bs4 you cannot reuse an absolute path here)
    title = li.xpath(r'div[2]/h2/a/@title')[0].replace(" ", '')
    print(title)
    # author and publisher
    info_list = li.xpath(r'div[2]/div[1]/text()')
    author = info_list[0].split('/')[0].replace('\n', '').replace(" ", '')
    chubanshe = info_list[0].split('/')[1]
    print(author)
    print(chubanshe)
    # rating
    pingfen = li.xpath(r'div[2]/div[2]/span[2]/text()')[0]
    print(pingfen)

    # cover image
    img_net_addr = li.xpath(r'div[1]/a/img/@src')[0]
    print(img_net_addr)
    data = request.urlopen(img_net_addr).read()
    img_name = 'douban/' + title + '.jpg'
    with open(img_name, 'wb') as f:
        f.write(data)

    # save the record
    sql = "insert into douban(title,author,chubanshe,pingfen) values('%s','%s','%s','%s')" % (title, author, chubanshe, pingfen)
    cur.execute(sql)
    db.commit()

db.close()
Scrapes Douban book listings with request headers; cover images go to disk, and each record (title, author, publisher, rating) is written to MySQL.
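The insert above is built with % string formatting, so a title containing a quote breaks the query and the code is open to SQL injection. A minimal sketch of the same insert using pymysql's parameterized execute, assuming the same douban table as above:

# sketch: let pymysql escape the values instead of formatting them into the SQL string
sql = "insert into douban(title, author, chubanshe, pingfen) values (%s, %s, %s, %s)"
cur.execute(sql, (title, author, chubanshe, pingfen))
db.commit()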
2. Lianjia
#coding:utf-8
# Pull Lianjia second-hand listing data and save it to the database (optionally to a file)
from urllib import request, error
from bs4 import BeautifulSoup
import pymysql

# connect to the database
db = pymysql.connect(host='localhost', user='root', password='root', db='lianjia', charset='utf8')
cur = db.cursor()  # cursor instance

for i in range(1, 33):
    req = request.urlopen('https://xa.lianjia.com/ershoufang/pg' + str(i)).read().decode('utf-8')
    req_bs4 = BeautifulSoup(req, 'html.parser')  # build the BeautifulSoup object
    body_ul = req_bs4.find('ul', class_="sellListContent")
    try:
        s = ''
        for li in body_ul:
            tit = li.find('div', class_="title").get_text()        # title
            addr = li.find('div', class_="houseInfo").get_text()   # address
            pric = li.find('div', class_="totalPrice").get_text()  # price
            s += tit + addr + pric + '\n\n'
            print(i)  # show which page is being scraped

            # download the listing image ++++++++++++++++++++
            img = li.find("img", class_='lj-lazy')['data-original']  # image URL
            img_format = img.split('.')[-1]  # file extension after the last dot
            img_name = 'lianjia/images/' + li.find("img", class_='lj-lazy')['alt'] + '.' + img_format
            adr = request.urlopen(img).read()  # fetch the image bytes
            try:  # skip if the image is missing or the name is invalid
                with open(img_name, 'wb') as f:
                    f.write(adr)
            except:
                pass
            # image download done ----------------------------

            # save to the database
            sql = "insert into lianjia_hotel(title,address) values ('%s','%s')" % (tit, addr)
            cur.execute(sql)
            db.commit()
    except:
        print("page done")

# close the database at the end
db.close()
# to write everything to a txt file instead:
# with open('lianjia/lianjia.txt', 'w', encoding="utf-8") as f:
#     f.write(s)
Lianjia scraper: listing text and images, parsed with bs4.
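The Lianjia pages are fetched with a bare urlopen, which sites that check for a browser-like User-Agent may reject. A small sketch of the same fetch with headers attached (the header value is only illustrative, not taken from the original):

from urllib import request

# example: fetch page 1 with a browser-like User-Agent
headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/64.0.3282.140 Safari/537.36'}
req = request.Request('https://xa.lianjia.com/ershoufang/pg1', headers=headers)
html = request.urlopen(req).read().decode('utf-8')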
3. Toutiao
from selenium import webdriver
from lxml import etree
from pyquery import PyQuery as pq
import time

driver = webdriver.Chrome()
driver.maximize_window()
driver.get('https://www.toutiao.com/')
driver.implicitly_wait(10)
driver.find_element_by_link_text('科技').click()  # open the "科技" (tech) channel
driver.implicitly_wait(10)
# scroll down a few times so more feed items load
for x in range(3):
    js = "var q=document.documentElement.scrollTop=" + str(x * 500)
    driver.execute_script(js)
    time.sleep(2)
time.sleep(5)
page = driver.page_source
doc = pq(page)
doc = etree.HTML(str(doc))
contents = doc.xpath('//div[@class="wcommonFeed"]/ul/li')
print(contents)
for x in contents:
    title = x.xpath('div/div[1]/div/div[1]/a/text()')
    if title:
        title = title[0]
        with open('toutiao.txt', 'a+', encoding='utf8') as f:
            f.write(title + '\n')
        print(title)
    else:
        pass
Toutiao: Selenium drives the scrolling to load the feed, then the titles are extracted with XPath.
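find_element_by_link_text works on the Selenium 3 releases this post targets but has been removed in Selenium 4, which uses By locators instead. A sketch of the equivalent click with an explicit wait, assuming Selenium 4 and the same page structure:

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

driver = webdriver.Chrome()
driver.get('https://www.toutiao.com/')
# wait until the channel link is clickable, then click it
WebDriverWait(driver, 10).until(EC.element_to_be_clickable((By.LINK_TEXT, '科技'))).click()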
4. WeChat group info (including members) and contacts
# -*- coding:utf-8 -*-
'''
After scanning the QR code to log in, fetch this WeChat account's group chats
(including members) and contact-list entries. [Note: the list may be incomplete.]
'''
import os
import re
import time
import sys
import subprocess
import requests
import xml.dom.minidom
import json


# Web WeChat login
class WebwxLogin(object):
    def __init__(self):
        self.session = requests.session()
        self.headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 5.1; rv:33.0) Gecko/20100101 Firefox/33.0'}
        self.QRImgPath = os.path.split(os.path.realpath(__file__))[0] + os.sep + 'webWeixinQr.jpg'
        self.uuid = ''
        self.tip = 0
        self.base_uri = ''
        self.redirect_uri = ''
        self.skey = ''
        self.wxsid = ''
        self.wxuin = ''
        self.pass_ticket = ''
        self.deviceId = 'e000000000000000'
        self.BaseRequest = {}
        self.ContactList = []
        self.My = []
        self.SyncKey = ''

    def getUUID(self):
        url = 'https://login.weixin.qq.com/jslogin'
        params = {
            'appid': 'wx782c26e4c19acffb',
            'redirect_uri': 'https://wx.qq.com/cgi-bin/mmwebwx-bin/webwxnewloginpage',
            'fun': 'new',
            'lang': 'zh_CN',
            '_': int(time.time() * 1000),  # timestamp
        }
        response = self.session.get(url, params=params)
        target = response.content.decode('utf-8')
        pattern = r'window.QRLogin.code = (\d+); window.QRLogin.uuid = "(\S+?)"'
        ob = re.search(pattern, target)  # extract the uuid with a regex
        code = ob.group(1)
        self.uuid = ob.group(2)
        if code == '200':  # request succeeded
            return True
        return False

    def showQRImage(self):
        url = 'https://login.weixin.qq.com/qrcode/' + self.uuid
        response = self.session.get(url)
        self.tip = 1
        with open(self.QRImgPath, 'wb') as f:
            f.write(response.content)
        # open the QR code image
        if sys.platform.find('darwin') >= 0:
            subprocess.call(['open', self.QRImgPath])  # macOS
        elif sys.platform.find('linux') >= 0:
            subprocess.call(['xdg-open', self.QRImgPath])  # Linux
        else:
            os.startfile(self.QRImgPath)  # Windows
        print('Scan the QR code with WeChat to log in')

    def checkLogin(self):
        url = 'https://login.weixin.qq.com/cgi-bin/mmwebwx-bin/login?tip=%s&uuid=%s&_=%s' % (
            self.tip, self.uuid, int(time.time() * 1000))
        response = self.session.get(url)
        target = response.content.decode('utf-8')
        pattern = r'window.code=(\d+);'
        ob = re.search(pattern, target)
        code = ob.group(1)
        if code == '201':  # QR code scanned
            print('Scanned; confirm the login on your phone')
            self.tip = 0
        elif code == '200':  # logged in
            print('Logging in...')
            regx = r'window.redirect_uri="(\S+?)";'
            ob = re.search(regx, target)
            self.redirect_uri = ob.group(1) + '&fun=new'
            self.base_uri = self.redirect_uri[:self.redirect_uri.rfind('/')]
        elif code == '408':  # timed out
            pass
        return code

    def login(self):
        response = self.session.get(self.redirect_uri, verify=False)
        data = response.content.decode('utf-8')
        doc = xml.dom.minidom.parseString(data)
        root = doc.documentElement
        # extract the parameters from the response
        for node in root.childNodes:
            if node.nodeName == 'skey':
                self.skey = node.childNodes[0].data
            elif node.nodeName == 'wxsid':
                self.wxsid = node.childNodes[0].data
            elif node.nodeName == 'wxuin':
                self.wxuin = node.childNodes[0].data
            elif node.nodeName == 'pass_ticket':
                self.pass_ticket = node.childNodes[0].data
        if not all((self.skey, self.wxsid, self.wxuin, self.pass_ticket)):
            return False
        self.BaseRequest = {
            'Uin': int(self.wxuin),
            'Sid': self.wxsid,
            'Skey': self.skey,
            'DeviceID': self.deviceId,
        }
        return True

    def webwxinit(self):
        url = self.base_uri + '/webwxinit?pass_ticket=%s&skey=%s&r=%s' % (
            self.pass_ticket, self.skey, int(time.time() * 1000))
        params = {
            'BaseRequest': self.BaseRequest
        }
        h = self.headers
        h['ContentType'] = 'application/json; charset=UTF-8'
        response = self.session.post(url, data=json.dumps(params), headers=h, verify=False)
        data = response.content.decode('utf-8')
        print(data)
        dic = json.loads(data)
        self.ContactList = dic['ContactList']
        self.My = dic['User']
        SyncKeyList = []
        for item in dic['SyncKey']['List']:
            SyncKeyList.append('%s_%s' % (item['Key'], item['Val']))
        self.SyncKey = '|'.join(SyncKeyList)
        ErrMsg = dic['BaseResponse']['ErrMsg']
        Ret = dic['BaseResponse']['Ret']
        if Ret != 0:
            return False
        return True

    def webwxgetcontact(self):
        url = self.base_uri + '/webwxgetcontact?pass_ticket=%s&skey=%s&r=%s' % (
            self.pass_ticket, self.skey, int(time.time()))
        h = self.headers
        h['ContentType'] = 'application/json; charset=UTF-8'
        response = self.session.get(url, headers=h, verify=False)
        data = response.content.decode('utf-8')
        # print(data)
        dic = json.loads(data)
        MemberList = dic['MemberList']
        # iterate in reverse, otherwise removing items breaks the traversal
        SpecialUsers = ["newsapp", "fmessage", "filehelper", "weibo", "qqmail", "tmessage", "qmessage", "qqsync",
                        "floatbottle", "lbsapp", "shakeapp", "medianote", "qqfriend", "readerapp", "blogapp",
                        "facebookapp", "masssendapp",
                        "meishiapp", "feedsapp", "voip", "blogappweixin", "weixin", "brandsessionholder",
                        "weixinreminder", "wxid_novlwrv3lqwv11", "gh_22b87fa7cb3c", "officialaccounts",
                        "notification_messages", "wxitil", "userexperience_alarm"]
        for i in range(len(MemberList) - 1, -1, -1):
            Member = MemberList[i]
            if Member['VerifyFlag'] & 8 != 0:  # official/service accounts
                MemberList.remove(Member)
            elif Member['UserName'] in SpecialUsers:  # special accounts
                MemberList.remove(Member)
            elif Member['UserName'].find('@@') != -1:  # group chats
                MemberList.remove(Member)
            elif Member['UserName'] == self.My['UserName']:  # the account itself
                MemberList.remove(Member)
        return MemberList

    def main(self):
        if not self.getUUID():
            print('Failed to get uuid')
            return
        self.showQRImage()
        time.sleep(1)
        while self.checkLogin() != '200':
            pass
        os.remove(self.QRImgPath)
        if not self.login():
            print('Login failed')
            return
        # logged in; now query the contacts
        if not self.webwxinit():
            print('Initialization failed')
            return
        MemberList = self.webwxgetcontact()
        print('The contact list has %s friends' % len(MemberList))
        for x in MemberList:
            sex = 'unknown' if x['Sex'] == 0 else 'male' if x['Sex'] == 1 else 'female'
            print('Nickname:%s, Gender:%s, Remark:%s, Signature:%s' % (x['NickName'], sex, x['RemarkName'], x['Signature']))


if __name__ == '__main__':
    print('Start')
    wx = WebwxLogin()
    wx.main()
Scrapes a WeChat account's group info (including members) and contact list after QR-code login.
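webwxgetcontact walks MemberList backwards so that remove() does not skip entries; the same filter reads more simply as a list comprehension. A sketch under the assumption that SpecialUsers and self.My are defined as above:

# keep only ordinary friends: drop official accounts, special accounts, group chats, and the account itself
MemberList = [m for m in MemberList
              if m['VerifyFlag'] & 8 == 0
              and m['UserName'] not in SpecialUsers
              and '@@' not in m['UserName']
              and m['UserName'] != self.My['UserName']]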
5. Scrape Taobao product info for a fixed category and save it to MySQL (nicely structured)
import requests
import re
import pymysql


def getHTMLtext(url):
    try:
        r = requests.get(url, timeout=100)
        r.raise_for_status()
        r.encoding = r.apparent_encoding
        return r.text
    except:
        return ""


def getpage(itl, html):
    try:
        plt = re.findall(r'"view_price":"[\d.]*"', html)
        nlt = re.findall(r'"raw_title":".*?"', html)
        for i in range(len(plt)):
            price = eval(plt[i].split(':')[1])  # eval strips the surrounding quotes
            title = eval(nlt[i].split(':')[1])
            itl.append([price, title])
    except:
        print("")


def printgoods(itl):
    tplt = "{:2}\t{:8}\t{:16}"
    print(tplt.format("No.", "Price", "Title"))
    count = 0
    conn = pymysql.connect(host='127.0.0.1', user='root', password='123456', db='company', charset="utf8")
    cur = conn.cursor()
    sqlc = '''
        create table coffee(
            id int(11) not null auto_increment primary key,
            name varchar(255) not null,
            price float not null)DEFAULT CHARSET=utf8;
    '''
    try:
        A = cur.execute(sqlc)
        conn.commit()
        print('ok')
    except:
        print("error")
    for g in itl:
        count = count + 1
        b = tplt.format(count, g[0], g[1])
        sqla = '''
            insert into coffee(name,price) values(%s,%s);
        '''
        try:
            B = cur.execute(sqla, (g[1], g[0]))
            conn.commit()
            print('ok')
        except:
            print("error")
        # to also append each row to a text file:
        # save_path = 'D:/taobao.txt'
        # f = open(save_path, 'a')
        # f.write(b + '\n')
        # f.close()
    conn.commit()
    cur.close()
    conn.close()


def main():
    goods = "咖啡"   # search keyword (coffee)
    depth = 2        # number of result pages to fetch
    start_url = 'https://s.taobao.com/search?q=' + goods
    List = []
    for i in range(depth):
        try:
            url = start_url + "&s=" + str(i * 44)
            html = getHTMLtext(url)
            getpage(List, html)
        except:
            continue
    printgoods(List)
    # savefiles(data)


main()
Taobao product info collection, saved to a MySQL database.
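getpage uses eval only to strip the quotes around each matched value, which means text from the page gets executed. A safer sketch that captures the values directly with regex groups (getpage_safe is a hypothetical name; same regexes, same output format):

import re

def getpage_safe(itl, html):
    # capture the values inside the quotes instead of eval-ing the quoted strings
    prices = re.findall(r'"view_price":"([\d.]*)"', html)
    titles = re.findall(r'"raw_title":"(.*?)"', html)
    for price, title in zip(prices, titles):
        itl.append([price, title])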