import csv
import logging
import os
import random
import time

import requests
from bs4 import BeautifulSoup
from fake_useragent import UserAgent
from threadpool import ThreadPool, makeRequests
logger = logging.getLogger(__name__)

# File that records, one code per line, every city whose crawl has started.
FINISHED_FILE = 'has_elong.json'
# Input file: one "city_code,city_name,city_letter" record per line.
CITY_FILE = 'elong.json'
# Directory that receives one CSV file per city.
OUTPUT_DIR = '艺龙'
# Last listing page requested per city (pages are numbered from 1).
LAST_PAGE = 89
# Upper bound on the number of hotels taken from a single listing page.
MAX_HOTELS_PER_PAGE = 25
# Seconds to wait for any single HTTP response before giving up.
REQUEST_TIMEOUT = 15


def load_finished_codes(path=FINISHED_FILE):
    """Return the set of city codes (as ints) recorded in the progress file.

    A missing file is treated as "nothing finished yet"; blank or
    non-numeric lines are skipped instead of aborting the whole run.
    """
    finished = set()
    try:
        with open(path, encoding='utf-8') as progress:
            for raw in progress:
                raw = raw.strip()
                if not raw:
                    continue
                try:
                    finished.add(int(raw))
                except ValueError:
                    logger.warning('Ignoring malformed line in %s: %r', path, raw)
    except FileNotFoundError:
        pass
    return finished


def _build_list_payload(city_code, city_name, page):
    """Build the form fields sent with a hotel-list request for one page."""
    return {
        "code": "",
        "listRequest.areaID": "",
        "listRequest.bookingChannel": "",
        "listRequest.cardNo": "",
        "listRequest.checkInDate": "2019-03-02 00:00:00",  # check-in date
        "listRequest.checkOutDate": "2019-03-03 00:00:00",  # check-out date
        "listRequest.cityID": city_code,
        "listRequest.cityName": city_name,  # e.g. 北京
        "listRequest.customLevel": "",
        "listRequest.distance": "",
        "listRequest.endLat": "",
        "listRequest.endLng": "",
        "listRequest.facilityIds": "",
        "listRequest.highPrice": "",
        "listRequest.hotelBrandIDs": "",
        "listRequest.isAdvanceSave": "false",
        "listRequest.isAfterCouponPrice": "true",
        "listRequest.isCoupon": "false",
        "listRequest.isDebug": "false",
        "listRequest.isLimitTime": "false",
        "listRequest.isLogin": "false",
        "listRequest.isMobileOnly": "true",
        "listRequest.isNeed5Discount": "true",
        "listRequest.isNeedNotContractedHotel": "false",
        "listRequest.isNeedSimilarPrice": "false",
        "listRequest.isReturnNoRoomHotel": "true",
        "listRequest.isStaySave": "false",
        "listRequest.isTrace": "false",
        "listRequest.isUnionSite": "false",
        "listRequest.keywords": "",
        "listRequest.keywordsType": "",
        "listRequest.language": "cn",
        "listRequest.listType": "",
        "listRequest.lowPrice": "",
        "listRequest.orderFromID": "",
        "listRequest.pageIndex": page,  # pagination
        "listRequest.pageSize": "",
        "listRequest.payMethod": "",
        "listRequest.personOfRoom": "",
        "listRequest.poiId": "",
        "listRequest.promotionChannelCode": "",
        "listRequest.proxyID": "ZD",
        "listRequest.rankType": "",
        "listRequest.returnFilterItem": "true",
        "listRequest.sellChannel": "",
        "listRequest.seoHotelStar": "",
        "listRequest.sortDirection": "",
        "listRequest.sortMethod": "",
        "listRequest.starLevels": "",
        "listRequest.startLat": "",
        "listRequest.startLng": "",
        "listRequest.taRecommend": "false",
        "listRequest.themeIds": "",
        "listRequest.ctripToken": "1c06a555-04ce-4884-aa05-e6f92ad0e84e",
        "listRequest.elongToken": "jc94shhj-d5a1-4092-8060-828b168dbb61",
    }


def _build_headers(city_letter, user_agent):
    """Build the HTTP headers sent with a hotel-list request."""
    return {
        'Accept': 'application/json, text/javascript, */*; q=0.01',
        'Accept-Encoding': 'gzip, deflate',
        'Accept-Language': 'zh-CN,zh;q=0.8',
        'Cache-Control': 'no-cache',
        'Content-Length': '',
        'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8',
        # NOTE(review): the original carried a commented-out, truncated
        # Cookie header here; add a real one if the site starts requiring it.
        'Host': 'hotel.elong.com',
        'Origin': 'http://hotel.elong.com',
        'Pragma': 'no-cache',
        'Proxy-Connection': 'keep-alive',
        'Referer': 'http://hotel.elong.com/%s/' % city_letter,
        'User-Agent': user_agent,
        'X-Requested-With': 'XMLHttpRequest',
    }


def request_url(city_code, city_name, city_letter):
    """Crawl the hotel listing pages of one city and save them as a CSV file.

    The city code is appended to the progress file first, then pages
    1..LAST_PAGE of ``http://hotel.elong.com/<city_letter>/`` are requested
    and every parsed hotel row is written to ``艺龙/<city_name>.csv``.
    A page that fails to download or parse is logged and skipped.

    Args:
        city_code: City identifier sent as ``listRequest.cityID``.
        city_name: City name; also used as the CSV file name.
        city_letter: URL path segment of the city's listing page.
    """
    # The city is recorded before crawling starts, so an interrupted crawl is
    # still treated as finished on the next run (original behaviour, kept).
    with open(FINISHED_FILE, 'a', encoding='utf-8') as hs:
        hs.write(str(city_code) + '\n')

    # NOTE(review): the original branched on int(city_code) < 1000 but both
    # branches produced str(city_code); possibly a zero-padding prefix was
    # lost. Confirm against the site before adding one.
    city_code = str(city_code)

    os.makedirs(OUTPUT_DIR, exist_ok=True)
    csv_path = '%s/%s.csv' % (OUTPUT_DIR, city_name)
    url = 'http://hotel.elong.com/%s/' % city_letter
    # Built once per city; a fresh random User-Agent is still drawn per page.
    user_agents = UserAgent(verify_ssl=False)

    # newline='' is what the csv module requires; without it Windows output
    # contains an empty row after every record.
    with open(csv_path, 'w', encoding='utf-8-sig', newline='') as f:
        cs = csv.writer(f, dialect='excel')
        # Columns: name, price, address, star rating, themes, services, info.
        cs.writerow(['酒店名称', '价格', '地址', '星级', '主题', '可供服务', '酒店信息'])
        # Pages 1..LAST_PAGE inclusive (the original range stopped one short
        # of the 89 pages its comment promised).
        for n in range(1, LAST_PAGE + 1):
            payload = _build_list_payload(city_code, city_name, n)
            headers = _build_headers(city_letter, user_agents.random)
            # Random pause between pages to avoid hammering the site.
            time.sleep(random.randint(1, 4))
            try:
                res = requests.get(url, data=payload, headers=headers,
                                   timeout=REQUEST_TIMEOUT)
                cs.writerows(get_info_and_req_details(res.text))
            except Exception:
                # Best effort: one bad page must not abort the whole city.
                logger.warning('Skipping page %s of %s', n, city_name,
                               exc_info=True)
                continue


def _parse_hotel_item(hotel):
    """Turn one ``div.h_item`` element into a CSV row, or None if unusable.

    The row is [name, price, address, star rating, themes, services, info].
    Name, price and address are mandatory; rating, themes and the detail-page
    fields fall back to defaults when they cannot be obtained.
    """
    try:
        pic = hotel.find('div', attrs={'class': 'h_info_pic'})
        hotel_name = pic.find('img').get('alt')
        hotel_price = str(hotel.find('span', attrs={'class': 'h_pri_num'}).get_text()) + '元起'
        hotel_add = (hotel.find('p', attrs={'class': 'h_info_b2'}).find('a')
                     .get_text().replace('[', '').replace(']', ''))
        hotel_ress = hotel.find('span', attrs={'class': 'l1'}).get('data-hoteladdress')
    except AttributeError:
        # find() returned None: a mandatory element is missing from this item.
        return None

    stars = hotel.find('b', attrs={'class': 'icon_stars'})
    hotel_grade = stars.get('title') if stars is not None else '经济型'

    tags = hotel.find('div', attrs={'class': 'tagList'})
    hotel_theme = tags.get_text().replace('\n', ',') if tags is not None else ''

    server = ''
    hotel_info = ''
    link_tag = pic.find('a')
    hotel_link = link_tag.get('href') if link_tag is not None else None
    if hotel_link:
        # Random pause before each detail-page request.
        time.sleep(random.randint(1, 3))
        try:
            detail_page = requests.get(
                'http://hotel.elong.com%s#hotelContent' % hotel_link,
                timeout=REQUEST_TIMEOUT)
            server, hotel_info = get_details(detail_page.text)
        except Exception:
            # Best effort: keep the listing data even if the detail page fails.
            logger.warning('Detail page failed for %s', hotel_link, exc_info=True)
            server = ''
            hotel_info = ''

    # A missing data-hoteladdress attribute contributes nothing rather than
    # the literal text "None".
    address = str(hotel_add) + (hotel_ress or '')
    return [hotel_name, hotel_price, address, hotel_grade, hotel_theme,
            server, hotel_info]


def get_info_and_req_details(html):
    """Parse one listing page and fetch the detail page of each hotel on it.

    Args:
        html: HTML text of a hotel listing page.

    Returns:
        A list of at most MAX_HOTELS_PER_PAGE rows, each
        [name, price, address, star rating, themes, services, info].
    """
    bs = BeautifulSoup(html, "lxml")
    page_list = []
    for hotel in bs.find_all('div', attrs={'class': 'h_item'}):
        if len(page_list) >= MAX_HOTELS_PER_PAGE:
            break
        row = _parse_hotel_item(hotel)
        if row is not None:
            page_list.append(row)
    return page_list


def get_details(detail_html):
    """Extract the services list and description text from a detail page.

    Args:
        detail_html: HTML text of a hotel detail page.

    Returns:
        A ``(server, hotel_info)`` tuple of strings; each is '' when the
        corresponding element is absent. The two fields are extracted
        independently, so a missing services list does not discard the
        description.
    """
    detail = BeautifulSoup(detail_html, 'lxml')
    server = ''
    hotel_info = ''
    icon_list = detail.find('ul', attrs={'class': 'dview_icon_list'})
    if icon_list is not None:
        server = icon_list.get_text().replace('\n', ',')
    info_box = detail.find('div', attrs={'class': 'dview_info'})
    if info_box is not None:
        hotel_info = info_box.get_text().replace('\n', ',').replace('\t', ',')
    return server, hotel_info


if __name__ == '__main__':
    logging.basicConfig(level=logging.INFO)
    # NOTE: for how the city/address list is crawled, refer to the companion
    # article on scraping Ctrip hotel information.
    # The progress file is read once into a set; nothing modifies it until
    # the worker pool starts below.
    finished = load_finished_codes(FINISHED_FILE)
    req_list = []
    with open(CITY_FILE, encoding='utf-8') as city_file:
        for line in city_file:
            line_list = line.replace("\n", "").split(',')
            if len(line_list) < 3:
                if line.strip():
                    logger.warning('Ignoring malformed city line: %r', line)
                continue
            if int(line_list[0]) in finished:
                continue
            # threadpool unpacks (args_list, kwargs) into
            # request_url(city_code, city_name, city_letter).
            req_list.append((line_list[:3], None))
    pool = ThreadPool(3)
    for req in makeRequests(request_url, req_list):
        pool.putRequest(req)
    pool.wait()

使用requests、BeautifulSoup、线程池爬取艺龙酒店信息并保存到Excel中的更多相关文章

  1. 使用requests、re、BeautifulSoup、线程池爬取携程酒店信息并保存到Excel中

    import requests import json import re import csv import threadpool import time, random from bs4 impo ...

  2. Python+Requests+异步线程池爬取视频到本地

    1.本次项目为获取梨视频中的视频,再使用异步线程池下载视频到本地 2.获取视频时,其地址中的Url是会动态变化,不播放时src值为图片的地址,播放时src值为mp4格式 3.查看视频链接是否存在aja ...

  3. Python爬取猫眼电影100榜并保存到excel表格

    首先我们前期要导入的第三方类库有; 通过猫眼电影100榜的源码可以看到很有规律 如: 亦或者是: 根据规律我们可以得到非贪婪的正则表达式 """<div class ...

  4. 爬取拉勾网所有python职位并保存到excel表格 对象方式

    # 1.把之前案例,使用bs4,正则,xpath,进行数据提取. # 2.爬取拉勾网上的所有python职位. from urllib import request,parse import json ...

  5. 爬取淘宝商品数据并保存在excel中

    1.re实现 import requests from requests.exceptions import RequestException import re,json import xlwt,x ...

  6. 基于requests模块的cookie,session和线程池爬取

    目录 基于requests模块的cookie,session和线程池爬取 基于requests模块的cookie操作 基于requests模块的代理操作 基于multiprocessing.dummy ...

  7. 【原创】py3+requests+json+xlwt,爬取拉勾招聘信息

    在拉勾搜索职位时,通过谷歌F12抓取请求信息 发现请求是一个post请求,参数为: 返回的是json数据 有了上面的基础,我们就可以构造请求了 然后对获取到的响应反序列化,这样就获取到了json格式的 ...

  8. py3+requests+json+xlwt,爬取拉勾招聘信息

    在拉勾搜索职位时,通过谷歌F12抓取请求信息 发现请求是一个post请求,参数为: 返回的是json数据 有了上面的基础,我们就可以构造请求了 然后对获取到的响应反序列化,这样就获取到了json格式的 ...

  9. python爬取数据保存到Excel中

    # -*- coding:utf-8 -*- # 1.两页的内容 # 2.抓取每页title和URL # 3.根据title创建文件,发送URL请求,提取数据 import requests fro ...

随机推荐

  1. <s:select>自动加标签

    在使用<s:select>标签时,发现页面位置不对,查看页面源码发现 <tr> <td class="tdLabel"></td> ...

  2. Nginx代理服务——常用的配置语法

    可以到官方查看所有代理的配置语法http://nginx.org/en/docs/http/ngx_http_proxy_module.html 缓存区 Syntax:proxy_buffering ...

  3. Spring Boot自动装配

    前言 一些朋友问我怎么读源码,这篇文章结合我看源码时候一些思路给大家聊聊,我主要从这三个方向出发: 确定目标,这个目标要是一个具体,不要一上来我要看懂Spring,这是不可能的,目标要这么来定,比如看 ...

  4. Scrapy去重

    一.原生 1.模块 from scrapy.dupefilters import RFPDupeFilter 2.RFPDupeFilter方法 a.request_seen 核心:爬虫每执行一次yi ...

  5. 7.JavaSE之类型转换

    类型转换: 由于Java是强类型语言,所以要进行运算的时候,需要用到类型转换. 图中优先级从低到高,小数优先级大于整数. 运算中,不同类型的数据需要转换为同一类型,然后进行运算. 强制类型转换:(类型 ...

  6. Python环境搭建(win)——Python官方解释器

    Python官方解释器搭建方法: 本文以当前最新的3.8.1为例 1.在电脑上打开Python的官网https://www.python.org/ 2.找到Download下的All releases ...

  7. NOI2.5 1490:A Knight's Journey

    描述 Background The knight is getting bored of seeing the same black and white squares again and again ...

  8. robotframework,移动端(小程序)自动化,滚动屏幕的方法

    场景描述: 小程序端定位元素有无法定位弹出层内容的问题(自动化工具只能识别到背景主层,无法识别到弹出层) 解决思路: 1.弹出层元素与背景主层元素位置一致,当点击出弹出层时,在定位背景主层即可定位到弹 ...

  9. [集训]FWT基础练习题

    题意 给出n个长度为20的二进制数和数字k,每次询问给出一个二进制数,问从n个数中挑k个数(不能重复)的按位或能包含询问的组合有多少个.数字均小于等于5E5,1s. 思考 强行算出2^20个答案,再O ...

  10. [计算几何+图论]doge

    题意 在平面直角坐标系上,你有一只doge在原点处.doge被绳子拴住了,绳子不会打结,没有弹性(但很柔软),并且长度为L.平面上有一些目标,因此你的doge会按照顺序去捡起它们,但是doge只能走直 ...