Scraping 58.com (58同城) Second-Hand Listings with a Python Crawler
About 140,000 records were crawled and stored in MongoDB, and the statistics were rendered with the Charts library; a sample result is shown below.
Module 1: Build the category URL list
```python
from bs4 import BeautifulSoup
import requests, pymongo

main_url = 'http://bj.58.com/sale.shtml'

client = pymongo.MongoClient('localhost', 27017)
tc_58 = client['58tc']
tab_link_list = tc_58['link_list']

web_data = requests.get(main_url)
soup = BeautifulSoup(web_data.text, 'lxml')
sub_menu_link = soup.select('ul.ym-submnu > li > b > a')

link_list = []
count = 0
for link in sub_menu_link:
    link = 'http://bj.58.com' + link.get('href')
    # print(link)
    if link == 'http://bj.58.com/shoujihao/':      # skip phone-number listings
        pass
    elif link == 'http://bj.58.com/tongxunyw/':    # skip telecom-service listings
        pass
    elif link == 'http://bj.58.com/tiaozao/':      # this link appears several times in the menu; keep it once
        count += 1
        if count == 1:
            data = {'link': link}
            link_list.append(data)
    else:
        data = {'link': link}
        link_list.append(data)

for i in link_list:
    tab_link_list.insert(i)    # pymongo's legacy insert(); insert_one() on newer versions
```
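As a quick sanity check (my own addition, not part of the original post), the stored category links can be read back and counted, assuming a local MongoDB on the default port with the `58tc` database and `link_list` collection used above:

```python
# Read back what Module 1 stored (uses the same legacy pymongo API as the post).
import pymongo

client = pymongo.MongoClient('localhost', 27017)
link_list = client['58tc']['link_list']
print(link_list.find({}).count())                      # number of category URLs saved
for doc in link_list.find({}, {'link': 1, '_id': 0}).limit(5):
    print(doc['link'])                                 # spot-check a few links
```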
Module 2: Fetch the detail information of every listing
```python
from bs4 import BeautifulSoup
import requests, re, pymongo, sys
from multiprocessing import Pool

client = pymongo.MongoClient('localhost', 27017)
tc_58 = client['58tc']
tab_link_list = tc_58['link_list']

def getDetailUrl(page_url, tab):
    """Collect the detail-page URLs on one list page and store them in the <tab>_list collection."""
    url_list = []
    web_data = requests.get(page_url)
    soup = BeautifulSoup(web_data.text, 'lxml')
    detail_url = soup.select('div.infocon > table > tbody > tr > td.t > a[onclick]')
    # collect the detail-page URLs
    for url in detail_url:
        url_list.append(url.get('href').split('?')[0])
    # write them into MongoDB
    count = 0
    client = pymongo.MongoClient('localhost', 27017)
    tc_58 = client['58tc']
    tab_list = tc_58[tab + '_list']
    for i in url_list:
        count += 1
        tab_list.insert({'link': i})
    return count

original_price_patt = re.compile('原价:(.+)')

def getInfo(detail_url):
    """Parse one detail page and return its fields as a dict; return None on failure."""
    try:
        web_data = requests.get(detail_url)
        soup = BeautifulSoup(web_data.text, 'lxml')
        title = soup.title.text.strip()
        view_count = soup.select('body > div.content > div > div.box_left > div.info_lubotu.clearfix > div.box_left_top > p > span.look_time')[0].text
        want_count = soup.select('body > div.content > div > div.box_left > div.info_lubotu.clearfix > div.box_left_top > p > span.want_person')[0].text
        current_price = soup.select('body > div.content > div > div.box_left > div.info_lubotu.clearfix > div.info_massege.left > div.price_li > span > i')
        current_price = current_price[0].text if current_price else None
        original_price = soup.select('body > div.content > div > div.box_left > div.info_lubotu.clearfix > div.info_massege.left > div.price_li > span > b')
        original_price = original_price[0].text if original_price else None
        original_price = re.findall(original_price_patt, original_price) if original_price else None
        location = soup.select('body > div.content > div > div.box_left > div.info_lubotu.clearfix > div.info_massege.left > div.palce_li > span > i')[0].text
        tag = soup.select('body > div.content > div > div.box_left > div.info_lubotu.clearfix > div.info_massege.left > div.biaoqian_li')
        tag = list(tag[0].stripped_strings) if tag else None
        seller_name = soup.select('body > div.content > div > div.box_right > div.personal.jieshao_div > div.personal_jieshao > p.personal_name')[0].text
        # seller rating: count the full and half star icons
        full_count = len(soup.find_all('span', class_='icon_png '))
        half_count = len(soup.find_all('span', class_='icon_png smallScore'))
        level_count = {'full': full_count, 'half': half_count}
        desc = soup.select('body > div.content > div > div.box_left > div:nth-of-type(3) > div > div > p')
        desc = desc[0].text if desc else None
        data = {
            'title': title,
            'view_count': view_count,
            'want_count': want_count,
            'current_price': current_price,
            'original_price': original_price,
            'location': location,
            'tag': tag,
            'seller_name': seller_name,
            'level_count': level_count,
            'desc': desc,
            'link': detail_url
        }
        return data
    except:
        print(sys.exc_info()[0], sys.exc_info()[1])
        return None

# observed pattern: each category's list view shows at most 70 pages
def insertDetailLin(sub_menu_list):
    """Walk pn1..pn70 of every category and store the detail URLs; return the populated collection names."""
    patt = re.compile('.+?com/([a-z]+)/')
    tab_list = []
    for i in sub_menu_list.find({}, {'link': 1, '_id': 0}):
        i = i['link']
        sub_menu_name = re.findall(patt, i)[0]
        print(sub_menu_name + ': ', end='')
        url_list = []
        for j in range(1, 71):
            link = i + 'pn' + str(j)
            url_list.append(link)
        cnt = 0
        for k in url_list:
            cnt = cnt + getDetailUrl(k, sub_menu_name)
        print(str(cnt) + ' lines inserted')
        if cnt != 0:
            tab_list.append(sub_menu_name + '_list')
    return tab_list

# insertDetailLin(tab_link_list)    # step 1: run once to collect all detail URLs

allMenCollectionName = tc_58.collection_names()
allMenCollectionName.remove('link_list')

def insertData(tab_name):
    """Crawl every link stored in <tab_name> and write the parsed data into the 58tcDataNew database."""
    client = pymongo.MongoClient('localhost', 27017)
    tc_58 = client['58tc']
    tc_58_data = client['58tcDataNew']
    fenLei = tab_name[:-5]                      # strip the trailing '_list'
    fenLei = tc_58_data[fenLei + '_data']
    tab_name = tc_58[tab_name]
    for i in tab_name.find({}, {'link': 1, '_id': 0}):
        data = getInfo(i['link'])
        fenLei.insert(data)

def getContinuingly(fenlei):
    """Return the detail URLs that are listed but not yet crawled, for resuming an interrupted run."""
    client = pymongo.MongoClient('localhost', 27017)
    tc_58_data = client['58tcDataNew']
    tc_58 = client['58tc']
    fenlei_data = tc_58_data[fenlei + '_data']
    fenlei_list = tc_58[fenlei + '_list']
    db_urls = [item['link'] for item in fenlei_data.find()]
    index_url = [item['link'] for item in fenlei_list.find()]
    x = set(db_urls)
    y = set(index_url)
    rest_of_urls = y - x
    return list(rest_of_urls)

def startgetContinuingly(fenlei):
    """Crawl only the URLs that are still missing from <fenlei>_data."""
    client = pymongo.MongoClient('localhost', 27017)
    tc_58_data = client['58tcDataNew']
    fenLei = tc_58_data[fenlei + '_data']
    rest_of_urls = getContinuingly(fenlei)
    for i in rest_of_urls:
        data = getInfo(i)
        fenLei.insert(data)

# startgetContinuingly('bijiben')
pool = Pool()
pool.map(insertData, allMenCollectionName)
# pool.map(insertData, ['chuang_list'])
```
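The module-level `pool.map` call above is how the post drives the crawl. As an alternative reading of the intended run order, the following driver sketch is my own addition (it assumes it is appended to the Module 2 script, replacing the module-level `Pool` lines); the `if __name__ == '__main__'` guard keeps multiprocessing safe on platforms that spawn new interpreters instead of forking:

```python
# Hedged driver sketch: step 1 collects the detail URLs, step 2 crawls them in parallel.
if __name__ == '__main__':
    tab_names = insertDetailLin(tab_link_list)   # step 1: populate the per-category *_list collections
    with Pool() as pool:
        pool.map(insertData, tab_names)          # step 2: crawl detail pages in parallel
    # if a run is interrupted, resume a single category with:
    # startgetContinuingly('bijiben')
```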
Module 3: Analysis
```python
from collections import Counter
import pymongo, charts

def getTotalCount(database, host=None, port=None):
    """Total number of documents across all collections in the database."""
    client = pymongo.MongoClient(host, port)
    db = client[database]
    tab_list = db.collection_names()
    count = 0
    for i in tab_list:
        count = count + db[i].find({}).count()
    print(count)
    return count

# getTotalCount('58tcDataNew')
# 14700

def getAreaByClassify(classify, database='58tcDataNew', host=None, port=None):
    """Count listings per Beijing district for one category."""
    client = pymongo.MongoClient(host, port)
    db = client[database]
    classify = classify + '_data'
    location_list = [i['location'][3:] for i in db[classify].find(filter={}, projection={'location': 1, '_id': 0})
                     if i['location'] != '' and i['location'][:2] == '北京' and i['location'][3:] != '']
    loc_name = list(set(location_list))
    dic_count = {}
    for i in loc_name:
        dic_count[i] = location_list.count(i)
    return dic_count

# bijiben_area_count = getAreaByClassify(classify='bijiben')
# print(bijiben_area_count)
# danche_area_count = getAreaByClassify(classify='danche')
# sum_area_count = Counter(bijiben_area_count) + Counter(danche_area_count)
# print(sum_area_count)

def myCounter(L, database='58tcDataNew', host=None, port=None):
    """Recursively sum a list of per-category count dicts into one Counter."""
    client = pymongo.MongoClient(host, port)
    db = client[database]
    tab_list = db.collection_names()
    dic_0 = {}
    for i in tab_list:
        loc = i[:-5] + '_area_count'
        dic_0[loc] = 0
    if not L:
        return Counter(dic_0)
    else:
        return Counter(L[0]) + myCounter(L[1:])

def getAllCount(database='58tcDataNew', host=None, port=None):
    """Per-category district counts plus the overall total."""
    client = pymongo.MongoClient(host, port)
    db = client[database]
    tab_list = db.collection_names()
    dic_all_count = {}
    for i in tab_list:
        dic = getAreaByClassify(i[:-5])          # strip the trailing '_data'
        loc = i[:-5] + '_area_count'
        dic_all_count[loc] = dic
    dic_val = [dic_all_count[x] for x in dic_all_count]
    my = myCounter(dic_val)
    dic_all_count['total_area_count'] = dict(my)
    return dic_all_count

dic_all_count = getAllCount()
# print(dic_all_count['bijiben_area_count'])
# print(dic_all_count['total_area_count'])

# build one Highcharts column series per district
tmp_list = []
for i in dic_all_count['total_area_count']:
    data = {
        'name': i,
        'data': [dic_all_count['total_area_count'][i]],
        'type': 'column'
    }
    tmp_list.append(data)

options = {
    'chart': {'zoomType': 'xy'},
    'title': {'text': '北京58同城二手交易信息发布区域分布图'},   # "District distribution of 58.com Beijing second-hand listings"
    'subtitle': {'text': '数据来源: 58.com'},                   # "Data source: 58.com"
    'xAxis': {'categories': ['']},
    'yAxis': {'title': {'text': '数量'}},                       # "count"
    'plotOptions': {'column': {'dataLabels': {'enabled': True}}}
}
charts.plot(tmp_list, show='inline', options=options)
```
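Pulling every document into Python, as `getAreaByClassify` does, is fine at this scale. As an alternative (my own sketch, not part of the original code; the helper name and pipeline are assumptions), MongoDB's aggregation pipeline can do the grouping server-side; note it groups by the raw `location` string rather than stripping the `北京` prefix first:

```python
# Alternative sketch: group-and-count per location with the aggregation pipeline.
import pymongo

client = pymongo.MongoClient('localhost', 27017)
db = client['58tcDataNew']

def area_count_aggregate(classify):
    pipeline = [
        {'$match': {'location': {'$regex': '^北京'}}},             # keep Beijing listings only
        {'$group': {'_id': '$location', 'count': {'$sum': 1}}},    # count per location string
        {'$sort': {'count': -1}},
    ]
    return list(db[classify + '_data'].aggregate(pipeline))

# e.g. area_count_aggregate('bijiben')
```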