Writing a Python crawler to scrape second-hand listing data from 58.com (58同城)
The crawler collected roughly 140,000 records (14W), stored them in MongoDB, and used the Charts library to plot the statistics; one sample analysis is shown at the end.
Module 1: fetch the category URL list
from bs4 import BeautifulSoup
import requests, pymongo

main_url = 'http://bj.58.com/sale.shtml'

client = pymongo.MongoClient('localhost', 27017)
tc_58 = client['58tc']
tab_link_list = tc_58['link_list']

web_data = requests.get(main_url)
soup = BeautifulSoup(web_data.text, 'lxml')
sub_menu_link = soup.select('ul.ym-submnu > li > b > a')

link_list = []
count = 0
for link in sub_menu_link:
    link = 'http://bj.58.com' + link.get('href')
    # print(link)
    if link == 'http://bj.58.com/shoujihao/':       # skip this category
        pass
    elif link == 'http://bj.58.com/tongxunyw/':     # skip this category
        pass
    elif link == 'http://bj.58.com/tiaozao/':       # this link repeats in the menu; keep only the first occurrence
        count += 1
        if count == 1:
            data = {'link': link}
            link_list.append(data)
    else:
        data = {'link': link}
        link_list.append(data)

for i in link_list:
    tab_link_list.insert(i)
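A quick way to check that Module 1 worked is to read the stored links back out of MongoDB. The snippet below is only a sketch: it assumes the same local MongoDB instance (localhost:27017) and the 58tc / link_list names used above, and uses the older pymongo API (count(), matching the insert() calls in this post).

# Sanity check (sketch): print a few of the category links saved by Module 1.
import pymongo

client = pymongo.MongoClient('localhost', 27017)
link_list = client['58tc']['link_list']

print(link_list.count())                              # number of category links stored
for doc in link_list.find({}, {'link': 1, '_id': 0}).limit(5):
    print(doc['link'])                                # e.g. http://bj.58.com/shouji/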
Module 2: fetch the detail information for each listing
from bs4 import BeautifulSoup
import requests, re, pymongo, sys
from multiprocessing import Pool

client = pymongo.MongoClient('localhost', 27017)
tc_58 = client['58tc']
tab_link_list = tc_58['link_list']

def getDetailUrl(page_url, tab):
    """Collect the detail-page URLs on one list page and store them in the <tab>_list collection."""
    url_list = []
    web_data = requests.get(page_url)
    soup = BeautifulSoup(web_data.text, 'lxml')
    # detail-page links on the list page
    detail_url = soup.select('div.infocon > table > tbody > tr > td.t > a[onclick]')
    for url in detail_url:
        url_list.append(url.get('href').split('?')[0])
    # insert into MongoDB
    count = 0
    client = pymongo.MongoClient('localhost', 27017)
    tc_58 = client['58tc']
    tab_list = tc_58[tab + '_list']
    for i in url_list:
        count += 1
        tab_list.insert({'link': i})
    return count

original_price_patt = re.compile('原价:(.+)')

def getInfo(detail_url):
    """Parse one detail page and return a dict of fields, or None on failure."""
    try:
        web_data = requests.get(detail_url)
        soup = BeautifulSoup(web_data.text, 'lxml')
        title = soup.title.text.strip()
        view_count = soup.select('body > div.content > div > div.box_left > div.info_lubotu.clearfix > div.box_left_top > p > span.look_time')[0].text
        want_count = soup.select('body > div.content > div > div.box_left > div.info_lubotu.clearfix > div.box_left_top > p > span.want_person')[0].text
        current_price = soup.select('body > div.content > div > div.box_left > div.info_lubotu.clearfix > div.info_massege.left > div.price_li > span > i')
        current_price = current_price[0].text if current_price else None
        original_price = soup.select('body > div.content > div > div.box_left > div.info_lubotu.clearfix > div.info_massege.left > div.price_li > span > b')
        original_price = original_price[0].text if original_price else None
        original_price = re.findall(original_price_patt, original_price) if original_price else None
        location = soup.select('body > div.content > div > div.box_left > div.info_lubotu.clearfix > div.info_massege.left > div.palce_li > span > i')[0].text
        tag = soup.select('body > div.content > div > div.box_left > div.info_lubotu.clearfix > div.info_massege.left > div.biaoqian_li')
        tag = list(tag[0].stripped_strings) if tag else None
        seller_name = soup.select('body > div.content > div > div.box_right > div.personal.jieshao_div > div.personal_jieshao > p.personal_name')[0].text
        # seller credit level: count the full-star and half-star icons
        full_count = len(soup.find_all('span', class_='icon_png '))
        half_count = len(soup.find_all('span', class_='icon_png smallScore'))
        level_count = {'full': full_count, 'half': half_count}
        desc = soup.select('body > div.content > div > div.box_left > div:nth-of-type(3) > div > div > p')
        desc = desc[0].text if desc else None
        data = {
            'title': title,
            'view_count': view_count,
            'want_count': want_count,
            'current_price': current_price,
            'original_price': original_price,
            'location': location,
            'tag': tag,
            'seller_name': seller_name,
            'level_count': level_count,
            'desc': desc,
            'link': detail_url
        }
        return data
    except:
        print(sys.exc_info()[0], sys.exc_info()[1])
        return None

# each category list has at most 70 pages
def insertDetailLin(sub_menu_list):
    """For every category link, walk its 70 list pages and store all detail URLs."""
    patt = re.compile('.+?com/([a-z]+)/')
    tab_list = []
    for i in sub_menu_list.find({}, {'link': 1, '_id': 0}):
        i = i['link']
        sub_menu_name = re.findall(patt, i)[0]
        print(sub_menu_name + ': ', end='')
        url_list = []
        for j in range(1, 71):
            link = i + 'pn' + str(j)
            url_list.append(link)
        cnt = 0
        for k in url_list:
            cnt = cnt + getDetailUrl(k, sub_menu_name)
        print(str(cnt) + ' lines inserted')
        if cnt != 0:
            tab_list.append(sub_menu_name + '_list')
    return tab_list

# insertDetailLin(tab_link_list)

allMenCollectionName = tc_58.collection_names()
allMenCollectionName.remove('link_list')

def insertData(tab_name):
    """Crawl every detail URL stored in <tab_name> and save the parsed data to 58tcDataNew."""
    client = pymongo.MongoClient('localhost', 27017)
    tc_58 = client['58tc']
    tc_58_data = client['58tcDataNew']
    fenLei = tab_name[:-5]                      # strip the trailing '_list'
    fenLei = tc_58_data[fenLei + '_data']
    tab_name = tc_58[tab_name]
    for i in tab_name.find({}, {'link': 1, '_id': 0}):
        data = getInfo(i['link'])
        if data:                                # getInfo returns None on failure; skip those pages
            fenLei.insert(data)

def getContinuingly(fenlei):
    """Return the detail URLs of one category that have not been crawled yet."""
    client = pymongo.MongoClient('localhost', 27017)
    tc_58_data = client['58tcDataNew']
    tc_58 = client['58tc']
    fenlei_data = tc_58_data[fenlei + '_data']
    fenlei_list = tc_58[fenlei + '_list']
    db_urls = [item['link'] for item in fenlei_data.find()]
    index_url = [item['link'] for item in fenlei_list.find()]
    x = set(db_urls)
    y = set(index_url)
    rest_of_urls = y - x
    return list(rest_of_urls)

def startgetContinuingly(fenlei):
    """Resume crawling: fetch only the URLs that are still missing for one category."""
    client = pymongo.MongoClient('localhost', 27017)
    tc_58_data = client['58tcDataNew']
    fenLei = tc_58_data[fenlei + '_data']
    rest_of_urls = getContinuingly(fenlei)
    for i in rest_of_urls:
        data = getInfo(i)
        if data:
            fenLei.insert(data)

# startgetContinuingly('bijiben')
pool = Pool()
pool.map(insertData, allMenCollectionName)
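The module-level Pool call above is how the original script kicks off the crawl. As a minimal sketch of a more portable entry point (an assumption, not part of the original code), the two steps could be wrapped in a __main__ guard so worker processes start cleanly on any platform:

# Hypothetical entry point sketching the intended run order of Module 2
# (assumes the functions defined above are available in this module).
if __name__ == '__main__':
    # step 1: fill the per-category *_list collections with detail-page URLs
    crawled_tabs = insertDetailLin(tab_link_list)   # e.g. ['shouji_list', 'bijiben_list', ...]
    # step 2: crawl every detail page of every category in parallel
    pool = Pool()
    pool.map(insertData, crawled_tabs)
    pool.close()
    pool.join()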
Module 3: analysis
from collections import Counter
import pymongo, charts

def getTotalCount(database, host=None, port=None):
    """Count all documents across every collection in the given database."""
    client = pymongo.MongoClient(host, port)
    db = client[database]
    tab_list = db.collection_names()
    count = 0
    for i in tab_list:
        count = count + db[i].find({}).count()
    print(count)
    return count

# getTotalCount('58tcDataNew')    # 14700

def getAreaByClassify(classify, database='58tcDataNew', host=None, port=None):
    """Count listings per Beijing district for one category."""
    client = pymongo.MongoClient(host, port)
    db = client[database]
    classify = classify + '_data'
    # keep only Beijing listings and strip the leading '北京' prefix from the location
    location_list = [i['location'][3:]
                     for i in db[classify].find(filter={}, projection={'location': 1, '_id': 0})
                     if i['location'] != '' and i['location'][:2] == '北京' and i['location'][3:] != '']
    loc_name = list(set(location_list))
    dic_count = {}
    for i in loc_name:
        dic_count[i] = location_list.count(i)
    return dic_count

def myCounter(L, database='58tcDataNew', host=None, port=None):
    """Recursively sum a list of per-category area-count dicts with collections.Counter."""
    client = pymongo.MongoClient(host, port)
    db = client[database]
    tab_list = db.collection_names()
    dic_0 = {}
    for i in tab_list:
        loc = i[:-5] + '_area_count'
        dic_0[loc] = 0
    if not L:
        return Counter(dic_0)
    else:
        return Counter(L[0]) + myCounter(L[1:])

def getAllCount(database='58tcDataNew', host=None, port=None):
    """Build per-category area counts plus an overall total_area_count."""
    client = pymongo.MongoClient(host, port)
    db = client[database]
    tab_list = db.collection_names()
    dic_all_count = {}
    for i in tab_list:
        dic = getAreaByClassify(i[:-5])
        loc = i[:-5] + '_area_count'
        dic_all_count[loc] = dic
    dic_val = [dic_all_count[x] for x in dic_all_count]
    my = myCounter(dic_val)
    dic_all_count['total_area_count'] = dict(my)
    return dic_all_count

dic_all_count = getAllCount()

tmp_list = []
for i in dic_all_count['total_area_count']:
    data = {
        'name': i,
        'data': [dic_all_count['total_area_count'][i]],
        'type': 'column'
    }
    tmp_list.append(data)

options = {
    'chart': {'zoomType': 'xy'},
    'title': {'text': '北京58同城二手交易信息发布区域分布图'},
    'subtitle': {'text': '数据来源: 58.com'},
    'xAxis': {'categories': ['']},
    'yAxis': {'title': {'text': '数量'}},
    'plotOptions': {'column': {'dataLabels': {'enabled': True}}}
}

charts.plot(tmp_list, show='inline', options=options)
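The merge step inside myCounter and getAllCount relies on the fact that adding collections.Counter objects sums values key by key. A tiny standalone example makes the behavior concrete; the district counts below are invented for illustration and are not from the crawl:

# Toy illustration of the Counter-based merge used above (made-up numbers).
from collections import Counter

bijiben_area = {'朝阳': 120, '海淀': 95}   # hypothetical counts for one category
danche_area = {'朝阳': 60, '丰台': 30}     # hypothetical counts for another category

total = Counter(bijiben_area) + Counter(danche_area)
print(dict(total))   # {'朝阳': 180, '海淀': 95, '丰台': 30}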