使用requests、BeautifulSoup、线程池爬取艺龙酒店信息并保存到Excel中

import requests

import time, random, csv

from fake_useragent import UserAgent

from bs4 import BeautifulSoup

from threadpool import ThreadPool, makeRequests

def _elong_list_form(city_code, city_name, page_index):
    """Build the form payload for one eLong hotel-list page request."""
    return {
        "code": "",
        "listRequest.areaID": "",
        "listRequest.bookingChannel": "",
        "listRequest.cardNo": "",
        # NOTE(review): check-in / check-out dates are hard-coded — confirm
        # whether the site still accepts dates in the past.
        "listRequest.checkInDate": "2019-03-02 00:00:00",  # check-in date
        "listRequest.checkOutDate": "2019-03-03 00:00:00",  # check-out date
        "listRequest.cityID": city_code,
        "listRequest.cityName": city_name,  # city name, e.g. Beijing
        "listRequest.customLevel": "",
        "listRequest.distance": "",
        "listRequest.endLat": "",
        "listRequest.endLng": "",
        "listRequest.facilityIds": "",
        "listRequest.highPrice": "",
        "listRequest.hotelBrandIDs": "",
        "listRequest.isAdvanceSave": "false",
        "listRequest.isAfterCouponPrice": "true",
        "listRequest.isCoupon": "false",
        "listRequest.isDebug": "false",
        "listRequest.isLimitTime": "false",
        "listRequest.isLogin": "false",
        "listRequest.isMobileOnly": "true",
        "listRequest.isNeed5Discount": "true",
        "listRequest.isNeedNotContractedHotel": "false",
        "listRequest.isNeedSimilarPrice": "false",
        "listRequest.isReturnNoRoomHotel": "true",
        "listRequest.isStaySave": "false",
        "listRequest.isTrace": "false",
        "listRequest.isUnionSite": "false",
        "listRequest.keywords": "",
        "listRequest.keywordsType": "",
        "listRequest.language": "cn",
        "listRequest.listType": "",
        "listRequest.lowPrice": "",
        "listRequest.orderFromID": "",
        "listRequest.pageIndex": page_index,  # page number
        "listRequest.pageSize": "",
        "listRequest.payMethod": "",
        "listRequest.personOfRoom": "",
        "listRequest.poiId": "",
        "listRequest.promotionChannelCode": "",
        "listRequest.proxyID": "ZD",
        "listRequest.rankType": "",
        "listRequest.returnFilterItem": "true",
        "listRequest.sellChannel": "",
        "listRequest.seoHotelStar": "",
        "listRequest.sortDirection": "",
        "listRequest.sortMethod": "",
        "listRequest.starLevels": "",
        "listRequest.startLat": "",
        "listRequest.startLng": "",
        "listRequest.taRecommend": "false",
        "listRequest.themeIds": "",
        # NOTE(review): tokens are hard-coded; presumably captured from a
        # browser session — confirm they do not expire.
        "listRequest.ctripToken": "1c06a555-04ce-4884-aa05-e6f92ad0e84e",
        "listRequest.elongToken": "jc94shhj-d5a1-4092-8060-828b168dbb61"
    }


def _elong_list_headers(city_letter, user_agent):
    """Build the HTTP headers for one eLong hotel-list page request."""
    return {
        'Accept': 'application/json, text/javascript, */*; q=0.01',
        'Accept-Encoding': 'gzip, deflate',
        'Accept-Language': 'zh-CN,zh;q=0.8',
        'Cache-Control': 'no-cache',
        # NOTE(review): an empty Content-Length is passed on purpose as in the
        # original; requests is expected to recompute it — confirm.
        'Content-Length': '',
        'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8',
        # 'Cookie':'……61b8-48a1-b398-8b9ec1903f05……',
        'Host': 'hotel.elong.com',
        'Origin': 'http://hotel.elong.com',
        'Pragma': 'no-cache',
        'Proxy-Connection': 'keep-alive',
        'Referer': 'http://hotel.elong.com/%s/' % city_letter,
        'User-Agent': user_agent,
        'X-Requested-With': 'XMLHttpRequest'
    }


def request_url(city_code, city_name, city_letter):
    """Scrape the eLong hotel list pages of one city into a CSV file.

    The city code is first appended to ``has_elong.json``. Listing pages
    1 through 88 are then requested one at a time, and every row returned by
    ``get_info_and_req_details`` is written to ``艺龙/<city_name>.csv``.
    A page whose request or parsing fails is logged and skipped.

    Args:
        city_code: City identifier sent as ``listRequest.cityID``. It is
            concatenated with a newline, so it must be a string.
        city_name: City name; used for the CSV file name and sent as
            ``listRequest.cityName``.
        city_letter: Path segment of the city in the list-page URL.

    Raises:
        ValueError: If ``city_code`` is non-empty and not an integer literal.
        OSError: If either file cannot be opened (for example when the
            ``艺龙`` directory does not exist).
    """
    import logging  # function-scope import so the file's import header is untouched

    logger = logging.getLogger(__name__)

    # The city is recorded as handled BEFORE it is scraped.
    # NOTE(review): a crash mid-city therefore leaves it marked as done — confirm intent.
    with open('has_elong.json', 'a+', encoding='utf-8') as hs:
        hs.write(city_code + '\n')

    # NOTE(review): both branches yield str(city_code); the '' prefix looks
    # like a lost zero-padding ('0') for codes below 1000 — confirm.
    if city_code and int(city_code) < 1000:
        city_code = '' + str(city_code)
    else:
        city_code = str(city_code)

    url = 'http://hotel.elong.com/%s/' % city_letter
    # Build the generator once; a fresh random User-Agent is still drawn per page.
    user_agent = UserAgent(verify_ssl=False)

    # newline='' is what the csv module requires; without it every row is
    # followed by a blank line on Windows.
    with open('艺龙/%s.csv' % city_name, 'w+', encoding='utf-8-sig', newline='') as f:
        cs = csv.writer(f, dialect='excel')
        # Columns: [name, price, address, star rating, theme, services, hotel info]
        cs.writerow(['酒店名称', '价格', '地址', '星级', '主题', '可供服务', '酒店信息'])

        # Pages 1..88 (the range end is exclusive).
        # NOTE(review): the original comment said "1-89" — confirm the intended last page.
        for page in range(1, 89):
            form = _elong_list_form(city_code, city_name, page)
            headers = _elong_list_headers(city_letter, user_agent.random)
            try:
                # Random pause between pages to throttle the crawl.
                time.sleep(random.randint(1, 4))
                # NOTE(review): form data is sent with GET as in the original;
                # the Content-Type header suggests a POST may be intended — confirm.
                # The timeout keeps a stalled connection from blocking the worker forever.
                res = requests.get(url, data=form, headers=headers, timeout=30)
                for row in get_info_and_req_details(res.text):
                    cs.writerow(row)
            except Exception:
                # Best effort: one bad page must not abort the whole city,
                # but the failure is no longer swallowed silently.
                logger.exception('page %s of %s failed, skipping', page, city_name)
                continue

def get_info_and_req_details(html):
    """Parse one hotel list page and fetch the detail page of each hotel.

    Every ``div.h_item`` element is treated as one hotel. Items whose
    required fields (name, price, address) cannot be extracted are skipped.
    At most 25 rows are collected per page.

    Args:
        html: Markup of a list page.

    Returns:
        A list of rows, each
        ``[name, price, address, star rating, theme, services, hotel info]``.
        Star rating falls back to ``'经济型'`` and theme to ``''`` when the
        corresponding element is absent; services and hotel info are ``''``
        when the detail page cannot be fetched or parsed.
    """
    soup = BeautifulSoup(html, "lxml")
    page_list = []

    for hotel in soup.find_all('div', attrs={'class': 'h_item'}):
        # Only successfully parsed hotels count towards the cap, so the row
        # count doubles as the counter; stop as soon as the cap is reached.
        if len(page_list) >= 25:
            break

        try:
            # Required fields: a missing element raises and skips this hotel.
            hotel_name = hotel.find('div', attrs={'class': 'h_info_pic'}).find('img').get('alt')
            hotel_price = str(hotel.find('span', attrs={'class': 'h_pri_num'}).get_text()) + '元起'
            hotel_add = hotel.find('p', attrs={'class': 'h_info_b2'}).find('a').get_text().replace('[', '').replace(']', '')
            hotel_ress = hotel.find('span', attrs={'class': 'l1'}).get('data-hoteladdress')

            # Optional fields: use a default when the element is absent.
            grade_tag = hotel.find('b', attrs={'class': 'icon_stars'})
            hotel_grade = grade_tag.get('title') if grade_tag is not None else '经济型'

            theme_tag = hotel.find('div', attrs={'class': 'tagList'})
            hotel_theme = theme_tag.get_text().replace('\n', ',') if theme_tag is not None else ''

            try:
                hotel_link = hotel.find('div', attrs={'class': 'h_info_pic'}).find('a').get('href')
                # Random pause before each detail request to throttle the crawl.
                time.sleep(random.randint(1, 3))
                # The timeout keeps a stalled connection from blocking the caller forever.
                detail_html = requests.get(
                    'http://hotel.elong.com%s#hotelContent' % hotel_link,
                    timeout=30,
                )
                server, hotel_info = get_details(detail_html.text)
            except Exception:
                # Detail data is best effort: keep the row with empty detail columns.
                server = ''
                hotel_info = ''
        except Exception:
            # Malformed list item: skip it without counting it towards the cap.
            continue

        page_list.append([
            hotel_name,
            hotel_price,
            str(hotel_add) + str(hotel_ress),
            hotel_grade,
            hotel_theme,
            server,
            hotel_info,
        ])

    return page_list

def get_details(detail_html):
    """Extract the services list and the description from a hotel detail page.

    Args:
        detail_html: Markup of a hotel detail page.

    Returns:
        A ``(server, hotel_info)`` tuple of strings. ``server`` is the text of
        ``ul.dview_icon_list`` with newlines turned into commas; ``hotel_info``
        is the text of ``div.dview_info`` with newlines and tabs turned into
        commas. A value that could not be extracted stays ``''``; extraction
        stops at the first failure, so ``hotel_info`` is ``''`` whenever
        ``server`` could not be read.
    """
    soup = BeautifulSoup(detail_html, 'lxml')
    server, hotel_info = '', ''

    try:
        icon_list = soup.find('ul', attrs={'class': 'dview_icon_list'})
        server = icon_list.get_text().replace('\n', ',')

        info_box = soup.find('div', attrs={'class': 'dview_info'})
        info_text = info_box.get_text().replace('\n', ',')
        hotel_info = info_text.replace('\t', ',')
    except Exception:
        # Keep whatever was extracted before the failure.
        pass

    return server, hotel_info

if __name__ == '__main__':
    # For how the city list (elong.json) is crawled, refer to the companion
    # article on scraping Ctrip hotel information.

    # City codes already handled in earlier runs. Read once into a set: the
    # file does not change while the request list is being built.
    has_num = set()
    try:
        with open("has_elong.json", encoding='utf-8') as done_file:
            for has in done_file:
                has = has.strip()
                if has:  # tolerate blank lines
                    has_num.add(int(has))
    except FileNotFoundError:
        # First run: nothing has been scraped yet.
        pass

    # Each line of elong.json is split on commas and passed positionally to
    # request_url, i.e. presumably "city_code,city_name,city_letter".
    req_list = []
    with open('elong.json', encoding='utf-8') as city_file:
        for line in city_file:
            if not line.strip():
                continue  # tolerate blank lines
            line_list = line.replace("\n", "").split(',')
            if int(line_list[0]) in has_num:
                continue
            # Single-threaded alternative:
            #   request_url(line_list[0], line_list[1], line_list[2])
            # A tuple item is handed to makeRequests as (positional args, keyword args).
            req_list.append((line_list, None))

    pool = ThreadPool(3)
    for req in makeRequests(request_url, req_list):
        pool.putRequest(req)
    pool.wait()

使用requests、BeautifulSoup、线程池爬取艺龙酒店信息并保存到Excel中的更多相关文章

使用requests、re、BeautifulSoup、线程池爬取携程酒店信息并保存到Excel中
import requests import json import re import csv import threadpool import time, random from bs4 impo ...
Python+Requests+异步线程池爬取视频到本地
1.本次项目为获取梨视频中的视频,再使用异步线程池下载视频到本地 2.获取视频时,其地址中的Url是会动态变化,不播放时src值为图片的地址,播放时src值为mp4格式 3.查看视频链接是否存在aja ...
Python爬取猫眼电影100榜并保存到excel表格
首先我们前期要导入的第三方类库有; 通过猫眼电影100榜的源码可以看到很有规律如: 亦或者是: 根据规律我们可以得到非贪婪的正则表达式 """<div class ...
爬取拉勾网所有python职位并保存到excel表格对象方式
# 1.把之前案例,使用bs4,正则,xpath,进行数据提取. # 2.爬取拉勾网上的所有python职位. from urllib import request,parse import json ...
爬取淘宝商品数据并保存在excel中
1.re实现 import requests from requests.exceptions import RequestException import re,json import xlwt,x ...
基于requests模块的cookie,session和线程池爬取
目录基于requests模块的cookie,session和线程池爬取基于requests模块的cookie操作基于requests模块的代理操作基于multiprocessing.dummy ...
【原创】py3+requests+json+xlwt，爬取拉勾招聘信息
在拉勾搜索职位时,通过谷歌F12抓取请求信息发现请求是一个post请求,参数为: 返回的是json数据有了上面的基础,我们就可以构造请求了然后对获取到的响应反序列化,这样就获取到了json格式的 ...
py3+requests+json+xlwt，爬取拉勾招聘信息
在拉勾搜索职位时,通过谷歌F12抓取请求信息发现请求是一个post请求,参数为: 返回的是json数据有了上面的基础,我们就可以构造请求了然后对获取到的响应反序列化,这样就获取到了json格式的 ...
python爬取数据保存到Excel中
# -*- coding:utf-8 -*- # 1.两页的内容 # 2.抓取每页title和URL # 3.根据title创建文件,发送URL请求,提取数据 import requests fro ...

随机推荐

使用zipwithindex 算子给dataframe增加自增列 row_number函数实现自增，udf函数实现自增
DataFrame df = ...StructType schema = df.schema().add(DataTypes.createStructField("id", Da ...
ElasticSearch 倒排索引简析
内容概要倒排索引是什么?为什么需要倒排索引? 倒排索引是怎么工作的? 1. 倒排索引是什么? 假设有一个交友网站,信息表如下: 美女1:"我要找在上海做 PHP 的哥哥." 需要 ...
如何构建可伸缩的Web应用？
为什么要构建可伸缩的Web应用? 想象一下,你的营销活动吸引了很多用户,在某个时候,应用必须同时为成千上万的用户提供服务,这么大的并发量,服务器的负载会很大,如果设计不当,系统将无法处理. 接下来发生 ...
jmeter使用—远程分布式
今天要说的是在远程服务器上使用多台服务器进行noGUI分布式使用jmeter压测. 1.首先准备几台服务器,服务器上都需要安装同一个版本的jmeter. 2.在服务器上启动jmeter的方式是在jme ...
AVLtree（C++实现）有统一的旋转操作
在学习完AVLtree之后,我发现,左旋,右旋均可以采用统一的旋转方式来实现,所以把代码贴在下面代码是完整的AVLTree实现 C++标准为C++11 在ubuntu 18.04下通过编译和调试 / ...
谈谈模型融合之三 —— GBDT
前言本来应该是年后就要写的一篇博客,因为考完试后忙了一段时间课设和实验,然后回家后又在摸鱼,就一直没开动.趁着这段时间只能呆在家里来把这些博客补上.在之前的文章中介绍了 Random Forest ...
pku-3321 Apple Tree（dfs序+树状数组）
Description There is an apple tree outside of kaka's house. Every autumn, a lot of apples will grow ...
关于远程办公，微软MVP 15年研发团队的经验分享
今天是2月5日,春节假期结束后的第三天了.为了能够应对来势汹汹的疫情,众多互联网企业纷纷开启了远程办公模式.不知道各团队前两天的远程办公效果如何,我们 Worktile 管理层在大年初四就开始讨论远程 ...
MYSQL-innobackupex备份脚本
自动化运维,是数据库管理员的不懈追求.目前出现了一些数据库自动管理平台,但出现时间较短,不够稳定.最常用的工具依然是shell脚本. 现在需要管理的某mysql数据库,数据量在800G,使用dump备 ...
linux实用指令 | 程序员线上排查必知必会linux指令(持续更新中)
Linux线上排查程序员实用指南一.乱码问题二.帮助指令 1. help命令 2. man命令 3. info命令三.性能监测与优化 1. top命令参考资源 Linux线上排查程序员实用指南 ...

使用requests、BeautifulSoup、线程池爬取艺龙酒店信息并保存到Excel中

使用requests、BeautifulSoup、线程池爬取艺龙酒店信息并保存到Excel中的更多相关文章

随机推荐

热门专题