Python——安居客租房信息爬取（以南昌为例）

前言：

提前安装好所需要的库。
本代码的输入仅需要某个城市的租房地址首页即可，其他自会生成。
使用前请先创建所需的目录，或者在代码中添加 os.makedirs() 来自动创建。
支持断点续爬，中断后重新运行即可。
headers等随运行环境不同，可能需要进行修改。
本代码使用了高德API key，用于获取地理坐标，但发布时已略去，如需使用，请注册高德api开发者。
内容原创，引用请注明出处。Note: http://www.cnblogs.com/shadrach; author: shadrach@yeah.net。

# author: shadrach@yeah.net

# blog: http://www.cnblogs.com/shadrach

# NOTE: original article, indicate the source if reprint.

# Thanks.

# Update: 2018/1/24

import glob
import math
import os
import time
import urllib.parse
import urllib.request

import xlrd
import xlsxwriter
from bs4 import BeautifulSoup

# coordinate convert: from gcj(amap) to wgs(gps)

def GCJ2WGS(location):
    """Convert a GCJ-02 (AMap) coordinate string to WGS-84 (GPS).

    The GCJ-02 offset is evaluated at the given point and subtracted from it.

    Args:
        location: Text of the form "lon,lat", e.g. "113.923745,22.530824".

    Returns:
        A ``(wgsLon, wgsLat)`` tuple of floats.

    Raises:
        ValueError: If ``location`` is not two comma-separated numbers.
    """
    # Split on the first comma; without this check a comma-less string would
    # be sliced into two meaningless numbers instead of being rejected.
    lon_text, comma, lat_text = location.partition(",")
    if not comma:
        raise ValueError("expected 'lon,lat', got %r" % (location,))
    lon = float(lon_text)
    lat = float(lat_text)

    a = 6378245.0  # Krasovsky ellipsoid: semi-major axis
    ee = 0.00669342162296594323  # Krasovsky ellipsoid: first eccentricity squared
    PI = math.pi

    # Conversion formula, evaluated relative to (105E, 35N).
    x = lon - 105.0
    y = lat - 35.0

    # Longitude offset
    dLon = 300.0 + x + 2.0 * y + 0.1 * x * x + 0.1 * x * y + 0.1 * math.sqrt(abs(x))
    dLon += (20.0 * math.sin(6.0 * x * PI) + 20.0 * math.sin(2.0 * x * PI)) * 2.0 / 3.0
    dLon += (20.0 * math.sin(x * PI) + 40.0 * math.sin(x / 3.0 * PI)) * 2.0 / 3.0
    dLon += (150.0 * math.sin(x / 12.0 * PI) + 300.0 * math.sin(x / 30.0 * PI)) * 2.0 / 3.0

    # Latitude offset
    dLat = -100.0 + 2.0 * x + 3.0 * y + 0.2 * y * y + 0.1 * x * y + 0.2 * math.sqrt(abs(x))
    dLat += (20.0 * math.sin(6.0 * x * PI) + 20.0 * math.sin(2.0 * x * PI)) * 2.0 / 3.0
    dLat += (20.0 * math.sin(y * PI) + 40.0 * math.sin(y / 3.0 * PI)) * 2.0 / 3.0
    dLat += (160.0 * math.sin(y / 12.0 * PI) + 320 * math.sin(y * PI / 30.0)) * 2.0 / 3.0

    # Scale both offsets to degrees using the ellipsoid at this latitude.
    radLat = lat / 180.0 * PI
    magic = math.sin(radLat)
    magic = 1 - ee * magic * magic
    sqrtMagic = math.sqrt(magic)
    dLat = (dLat * 180.0) / ((a * (1 - ee)) / (magic * sqrtMagic) * PI)
    dLon = (dLon * 180.0) / (a / sqrtMagic * math.cos(radLat) * PI)

    wgsLon = lon - dLon
    wgsLat = lat - dLat
    return wgsLon, wgsLat

# xlsx files merge

def xlsx_merge(fileLocation,header,filename):
    """Merge the first sheet of every .xlsx file in a directory into one file.

    Row 0 of each source sheet is treated as its header and skipped; the
    remaining rows are appended, file after file, under a single header row
    in a new workbook ``fileLocation + filename + ".xlsx"`` (sheet "merged").

    NOTE: reading .xlsx through xlrd requires xlrd < 2.0; xlrd 2.0 and later
    only read the legacy .xls format.

    Args:
        fileLocation: Directory prefix, including the trailing separator.
        header: Sequence of column titles written to row 0 of the output.
        filename: Output file name without the ".xlsx" extension.
    """
    output_path = fileLocation + filename + ".xlsx"
    output_key = os.path.normcase(os.path.abspath(output_path))

    # The merged file is written into the directory it is built from, so the
    # output of a previous run must be excluded or its rows get merged again.
    # Sorting makes the row order reproducible between runs.
    source_paths = sorted(
        path for path in glob.glob(fileLocation + "*.xlsx")
        if os.path.normcase(os.path.abspath(path)) != output_key
    )

    merged_rows = []
    merged_count = 0
    for path in source_paths:
        workBook = xlrd.open_workbook(path)
        try:
            source_sheet = workBook.sheet_by_index(0)
        except Exception as e:
            # Report the problem and skip this file rather than continuing
            # with an undefined (or the previous file's) sheet.
            print(e)
            continue
        # Row 0 is the per-file header; only data rows are collected.
        for row_number in range(1, source_sheet.nrows):
            merged_rows.append(source_sheet.row_values(row_number))
        merged_count += 1

    with xlsxwriter.Workbook(output_path) as workbook:
        merged_sheet = workbook.add_worksheet("merged")
        merged_sheet.write_row(0, 0, header)
        for rowIndex, row in enumerate(merged_rows, start=1):
            merged_sheet.write_row(rowIndex, 0, row)

    print(time.strftime('%Y-%m-%d %H:%M:%S') + ": "+ "已完成%d个文件的合并"%merged_count)

# uniform request

def soup_form(url,referer):
    """Download ``url`` with browser-like headers and parse it.

    Args:
        url: Address of the page to fetch.
        referer: Value sent in the ``Referer`` request header.

    Returns:
        A ``BeautifulSoup`` tree built with the "lxml" parser from the
        response body decoded as UTF-8.

    Errors raised by ``urlopen`` (60-second timeout) or by the UTF-8 decode
    are not handled here and propagate to the caller.
    """
    # Headers and cookie were captured from a browser session; they may need
    # to be refreshed for a different environment.
    # "Accept-Encoding" is deliberately not sent: with it enabled the body
    # could not be decoded.
    headers = {
            "Accept":"text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
            "Accept-Language":"zh-CN,zh;q=0.8",
            "Cache-Control":"max-age=0",
            "Connection":"keep-alive",
            "Cookie":"lps=http%3A%2F%2Fwww.anjuke.com%2F%3Fpi%3DPZ-baidu-pc-all-biaoti%7Chttp%3A%2F%2Fbzclk.baidu.com%2Fadrc.php%3Ft%3D06KL00c00fDgzw60mUFU00PpAs0Mhyup00000PkqW-b00000uN71Vj.THvs_oeHEtY0UWdBmy-bIfK15yNBnHfkrjfLnj0sn1bdmWD0IHYLfbcsnYuKwj-7f1KKfHT4nj0sPYRvwj0dPDFanYFKfsK95gTqFhdWpyfqn103nWfLP1ndniusThqbpyfqnHm0uHdCIZwsT1CEQLILIz49UhGdpvR8mvqVQ1qspHdfyBdBmy-bIidsmzd9UAsVmh-9ULwG0APzm1YkrH6dP0%26tpl%3Dtpl_10085_16624_12226%26l%3D1502510556%26attach%3Dlocation%253D%2526linkName%253D%2525E6%2525A0%252587%2525E9%2525A2%252598%2526linkText%253D%2525E5%2525AE%252589%2525E5%2525B1%252585%2525E5%2525AE%2525A2-%2525E5%25259B%2525BD%2525E5%252586%252585%2525E9%2525A2%252586%2525E5%252585%252588%2525E6%252589%2525BE%2525E6%252588%2525BF%2525E5%2525B9%2525B3%2525E5%25258F%2525B0%2525EF%2525BC%25258C%2525E5%2525AE%252589%2525E5%2525BF%252583%2525E6%25258C%252591%2526xp%253Did%28%252522m4ce5ae35%252522%29%25252FDIV%25255B1%25255D%25252FDIV%25255B1%25255D%25252FDIV%25255B1%25255D%25252FDIV%25255B1%25255D%25252FH2%25255B1%25255D%25252FA%25255B1%25255D%2526linkType%253D%2526checksum%253D54%26ie%3DUTF-8%26f%3D8%26tn%3Dbaidu%26wd%3D%25E5%25AE%2589%25E5%25B1%2585%25E5%25AE%25A2%26oq%3D%25E5%25AE%2589%25E5%25B1%2585%25E5%25AE%25A2%26rqlang%3Dcn; sessid=CE9A95AF-043B-90B5-A2E4-5F5D39B41EC4; als=0; ctid=41; ANJUKE_BUCKET=pc-home%3AErshou_Web_Home_Home-a; _ga=GA1.2.113488767.1516673325; _gid=GA1.2.255451285.1516673325; __xsptplusUT_8=1; __xsptplus8=8.2.1516678573.1516678593.4%232%7Cbzclk.baidu.com%7C%7C%7C%25E5%25AE%2589%25E5%25B1%2585%25E5%25AE%25A2%7C%23%23249u729XL4J3ZAGKQyEZUyuV4myBLtSZ%23; 58tj_uuid=8a65130f-1085-403a-9e02-5c07dba15641; new_session=0; init_refer=https%253A%252F%252Fnc.zu.anjuke.com%252F%253Ffrom%253Dnavigation; new_uv=2; aQQ_ajkguid=BC9AF129-431B-1C4F-BB91-A27203DE8341; twe=2; Hm_lvt_ed38609fc79dd16e428d5a06610cfeb9=1516673382; Hm_lpvt_ed38609fc79dd16e428d5a06610cfeb9=1516678594",
            "Referer":referer,
            "Upgrade-Insecure-Requests":"",
            "User-Agent":"Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.87 Safari/537.36"
           }
    request = urllib.request.Request(url = url, headers = headers)
    # The context manager closes the HTTP response once the body is read,
    # instead of leaving the connection to the garbage collector.
    with urllib.request.urlopen(request,timeout=60) as response:
        html = response.read().decode("utf-8")
    soup = BeautifulSoup(html,"lxml")
    return soup

# Column titles used for row 0 of every per-page workbook and of the merged
# workbook.
header = ["名称","房型","面积","层数","中介人","小区","高德地址","高德坐标","wgs坐标","wgs经度","wgs纬度","路段","地址","特点一","特点二","特点三","价格","房源链接"]

# Step1: get and save or read level_1 and level_2 links

links_file = "E:/20180123安居客_南昌租房/links/links.xlsx"

if os.path.exists(links_file):
    # Resume: reuse the links saved by an earlier run instead of re-crawling.
    workbook_links = xlrd.open_workbook(links_file)
    sheet_links = workbook_links.sheet_by_index(0)
    level2_link = sheet_links.col_values(0)
    level2 = sheet_links.col_values(1)
else:
    # Make sure the target directory exists before crawling; otherwise the
    # failure would only show up when the workbook is saved, after all the
    # requests have already been made.
    os.makedirs(os.path.dirname(links_file), exist_ok=True)

    # get sub_level1
    url_level_0 = "https://nc.zu.anjuke.com/fangyuan/p1/"  # change this address for the city you want to crawl
    level1_block = soup_form(url_level_0,"https://nc.zu.anjuke.com/").find("div", class_ = "sub-items sub-level1")
    if level1_block is None:
        # Fail with a readable message instead of an AttributeError on None.
        raise RuntimeError("sub-level1 link list not found on " + url_level_0)
    level1_link = []
    level1 = []
    for a in level1_block.find_all("a"):
        level1_link.append(a.get("href"))
        level1.append(a.text)

    # get sub_level2
    level2_link = []
    level2 = []
    # Index 0 is skipped and only serves as the Referer of the first request
    # (presumably it is the "all areas" entry -- TODO confirm).
    for i in range(1,len(level1_link)):
        level2_block = soup_form(level1_link[i],level1_link[i-1]).find("div", class_ = "sub-items sub-level2")
        if level2_block is None:
            # Stop here rather than saving an incomplete link list that later
            # runs would silently reuse.
            raise RuntimeError("sub-level2 link list not found on " + str(level1_link[i]))
        for a in level2_block.find_all("a"):
            # "全部" is the entry for the whole area; only its parts are kept.
            if a.text != "全部":
                level2_link.append(a.get("href"))
                level2.append(a.text)

    # Column 0 holds the links, column 1 the matching names.
    with xlsxwriter.Workbook(links_file) as workbook_links:
        sheet_links = workbook_links.add_worksheet("level_2")
        sheet_links.write_column(0, 0, level2_link)
        sheet_links.write_column(0, 1, level2)

print("Step 1 Done!\nStep 2 Start!")

# Step 2: get every level 2 links' rent information

split_data_dir = "E:/20180123安居客_南昌租房/split_data/"

# AMap (Gaode) web-service key used for geocoding. The published code contains
# only this placeholder; put your own key in ``amap_key`` to fill columns 6-10.
amap_key_placeholder = "【你的key】"
amap_key = amap_key_placeholder

# Create the output directory up front so saving the first page cannot fail
# because the directory is missing.
os.makedirs(split_data_dir, exist_ok=True)


def _split_details(text):
    """Split a listing's detail line into (room details, agent name or None).

    NOTE(review): the published source split on an empty string, which always
    raises ValueError, so columns 1-4 were never filled. The lost separator
    was presumably an icon-font (Unicode private-use) character placed between
    the room details and the agent name -- TODO confirm against the live page.
    When no such character is present the whole text is returned as room
    details and the agent is None.
    """
    for position, char in enumerate(text):
        if "\ue000" <= char <= "\uf8ff":
            return text[:position], text[position + 1:]
    return text, None


def _write_geocode(sheet, row_index, xiaoqu):
    """Geocode the community name via AMap and write columns 6-10.

    Best effort: any failure is reported and leaves the remaining cells empty.
    """
    if amap_key == amap_key_placeholder:
        # No usable key configured: the request could not succeed, skip it.
        return
    # city=0791 restricts the lookup to one city (presumably Nanchang's area
    # code -- adjust it when crawling another city).
    url_amap = "http://restapi.amap.com/v3/geocode/geo?address=" + urllib.parse.quote(xiaoqu) + "&output=xml&city=0791&key=" + amap_key
    try:
        with urllib.request.urlopen(url_amap, timeout=60) as response:
            soup_amap = BeautifulSoup(response.read(),"xml")
        sheet.write(row_index,6,soup_amap.find("formatted_address").get_text())  # AMap address
        location_amap = soup_amap.find("location").get_text()
        sheet.write(row_index,7,location_amap)  # AMap (GCJ-02) coordinate
        longitude, latitude = GCJ2WGS(location_amap)
        sheet.write(row_index,8,str(longitude) + "," + str(latitude))  # WGS coordinate
        sheet.write(row_index,9,longitude)  # WGS longitude
        sheet.write(row_index,10,latitude)  # WGS latitude
    except Exception as e:
        # Network, parsing and conversion errors must not abort the page.
        print("geocoding failed for " + xiaoqu + ": " + str(e))


def _write_listing(sheet, row_index, div):
    """Write the fields of one listing ``div`` into row ``row_index``.

    Fields that are not present in the markup are skipped so their cells stay
    empty, matching the original per-field best-effort behaviour.
    """
    cells = {}

    first_link = div.find("a")
    if first_link is not None:
        cells[0] = first_link.get("title")  # name
        cells[17] = first_link.get("href")  # listing link

    details = div.find("p")
    if details is not None:
        room_text, agent = _split_details(details.text)
        room_parts = room_text.split("|")
        cells[1] = room_parts[0].replace(" ","").replace("\n","")  # layout
        if len(room_parts) > 1:
            cells[2] = room_parts[1]  # floor area
        if len(room_parts) > 2:
            cells[3] = room_parts[2]  # storey
        cells[4] = agent  # agent

    address = div.find("address")
    address_parts = address.text.split() if address is not None else []
    # Whitespace-separated address parts: community, road section, address.
    for col, part in zip((5, 11, 12), address_parts):
        cells[col] = part

    # Feature tags one to three.
    for col, css_class in ((13, "cls-1"), (14, "cls-2"), (15, "cls-3")):
        feature = div.find("span", class_ = css_class)
        if feature is not None:
            cells[col] = feature.text

    price = div.find("strong")
    if price is not None:
        cells[16] = price.text  # price

    for col, value in cells.items():
        if value is not None:
            sheet.write(row_index, col, value)

    if address_parts:
        _write_geocode(sheet, row_index, address_parts[0])


for area_name, area_link in zip(level2, level2_link):  # at every level 2 page
    page_index = 1
    for _ in range(50):  # max loop
        # at every page, get the max page, and compare to the current page.
        # if more than current page, continue
        rent_info_file = split_data_dir + area_name + "_info_page" + str(page_index) + ".xlsx"
        if os.path.exists(rent_info_file):
            # Resume support: this page was saved by an earlier run. Report
            # the page that exists before moving on to the next one.
            print(area_name + "_info_page" + str(page_index) + ".xlsx already exists")
            page_index += 1
            continue

        url = area_link + "p" + str(page_index)
        soup = soup_form(url, area_link)

        pager = soup.find("div",class_ = "multi-page")
        if pager is None:
            # No pager on the page: stop this area, as the original did.
            # NOTE(review): if single-page areas render no pager they are
            # skipped entirely -- confirm against the live site.
            break
        try:
            pages = [int(a.text) for a in pager.find_all("a")
                     if a.text != "下一页 >" and a.text != "上一页"]
        except ValueError:
            # A pager link that is not a page number: give up on this area.
            break
        max_page = pages[-1] if pages else 1

        # One page beyond the largest linked number is still fetched; the
        # bound is kept exactly as in the original.
        if page_index >= max_page + 2:
            break

        workbook_page = xlsxwriter.Workbook(rent_info_file)
        sheet = workbook_page.add_worksheet("page" + str(page_index))
        sheet.write_row(0, 0, header)
        for row_index, div in enumerate(soup.find_all("div", class_ = "zu-itemmod"), start=1):
            _write_listing(sheet, row_index, div)
        # close() is only reached after the whole page was processed, so an
        # interrupted page leaves no file behind and is re-crawled on re-run.
        workbook_page.close()
        print(area_name + " page" + str(page_index) + " finished")
        page_index += 1

    print(area_name + "finished")

print("Step 2 Done!\nStep 3 Start!")

# Step 3: merge all xlsx files

# Every per-page workbook under split_data is combined into
# nanchang_rent_info.xlsx, written into that same directory.
merge_args = ("E:/20180123安居客_南昌租房/split_data/", header, "nanchang_rent_info")
xlsx_merge(*merge_args)

print("All work done")

Python——安居客租房信息爬取（以南昌为例）的更多相关文章

PyCharm+Scrapy爬取安居客楼盘信息
一.说明 1.1 开发环境说明开发环境--PyCharm 爬虫框架--Scrapy 开发语言--Python 3.6 安装第三方库--Scrapy.pymysql.matplotlib 数据库--M ...
安居客scrapy房产信息爬取到数据可视化(下)-可视化代码
接上篇:安居客scrapy房产信息爬取到数据可视化(下)-可视化代码,可视化的实现~ 先看看保存的数据吧~ 本人之前都是习惯把爬到的数据保存到本地json文件, 这次保存到数据库后发现使用mongod ...
Python 招聘信息爬取及可视化
自学python的大四狗发现校招招python的屈指可数,全是C++.Java.PHP,但看了下社招岗位还是有的.于是为了更加确定有多少可能找到工作,就用python写了个爬虫爬取招聘信息,数据处理, ...
Python网络爬虫与如何爬取段子的项目实例
一.网络爬虫 Python爬虫开发工程师,从网站某一个页面(通常是首页)开始,读取网页的内容,找到在网页中的其它链接地址,然后通过这些链接地址寻找下一个网页,这样一直循环下去,直到把这个网站所有的网页 ...
Python爬虫实战一之爬取QQ音乐
一.前言前段时间尝试爬取了网易云音乐的歌曲,这次打算爬取QQ音乐的歌曲信息.网易云音乐歌曲列表是通过iframe展示的,可以借助Selenium获取到iframe的页面元素, 而QQ音乐采用的是 ...
豆瓣电影信息爬取(json)
豆瓣电影信息爬取(json) # a = "hello world" # 字符串数据类型# b = {"name":"python"} # ...
Python爬虫实战二之爬取百度贴吧帖子
大家好,上次我们实验了爬取了糗事百科的段子,那么这次我们来尝试一下爬取百度贴吧的帖子.与上一篇不同的是,这次我们需要用到文件的相关操作. 前言亲爱的们,教程比较旧了,百度贴吧页面可能改版,可能代码不 ...
Python爬虫实战一之爬取糗事百科段子
大家好,前面入门已经说了那么多基础知识了,下面我们做几个实战项目来挑战一下吧.那么这次为大家带来,Python爬取糗事百科的小段子的例子. 首先,糗事百科大家都听说过吧?糗友们发的搞笑的段子一抓一大把 ...
转 Python爬虫实战二之爬取百度贴吧帖子
静觅 » Python爬虫实战二之爬取百度贴吧帖子大家好,上次我们实验了爬取了糗事百科的段子,那么这次我们来尝试一下爬取百度贴吧的帖子.与上一篇不同的是,这次我们需要用到文件的相关操作. 本篇目标 ...

随机推荐

SQLSTATE[HY000]: General error: 1030 Got error 28 from storage engine
今天上课程化平台考试,输入平台网址突然报这个错误可以先df -h 发现/tmp文件使用满了 ,清理下不需要的临时文件即可
java 易错选择题编辑中
1 System.out.println(int(a+b)); 编译错误应该是(int)(a+b) 2 String s="john"+3; 是正确的,结果就是 john3 3 ...
top命令用法详解
top命令可以实时动态地查看系统的整体运行情况,是一个综合了多方信息监测系统性能和运行信息的实用工具.通过top命令所提供的互动式界面,用热键可以管理. 语法 top(选项) 选项 -b:以批处理模式 ...
process.nextTick
回调函数同步执行 function asyncFake(data, callback) { if(data === 'foo') { callback(true); }else{ callback(f ...
java网络编程基本知识
1.基本概念网络:一组相互连接的计算机,多台计算机组成,使用物理线路进行连接网络连接的功能:交换数据.共享资源网络编程3要素: IP 地址:唯一标识网络上的每一台计算机,两台计算机之间通信的必备 ...
分布式存储ceph——（1）部署ceph
前言: 很多朋友想学ceph,但是开始ceph部署就让初学者举步为艰,ceph部署时由于国外源的问题(具体大家应该懂得),下载和安装软件便会卡住,停止不前.即使配置搭建了国内源后,执行ceph-dep ...
centos 6.8 配置 Redis3.2.5
配置Redis3.2.5 与 php-redis 一.配置Redis 1.下载Redis3.2.5安装包 [root@zhangsan /] wget http://download.redis.io ...
webpack优化相关操作
1.缩小文件搜索的范围 • 优化loader配置尽量精确使用 include 只命中需要的文件. module.exports = { module: { rules: ...
P1119 灾后重建（floyd进阶）
思路:这道题看n的范围很小(n<=200),显然就用floyd可以解决的问题,但又并不是简单的floyd算法,还是需要一些小小的变化.一开始我的思路是先跑一次弗洛伊德最短路,这样子显然复杂度很高 ...
Excel提取字符串示例
1.提取两个字符中间的字

Python——安居客租房信息爬取（以南昌为例）

Python——安居客租房信息爬取（以南昌为例）的更多相关文章

随机推荐

热门专题