Crawled roughly 140,000 records, stored them in MongoDB, and used the Charts library to plot the statistics; a sample of the result is shown below.

Module 1: Get the list of category URLs

    from bs4 import BeautifulSoup
    import requests, pymongo

    main_url = 'http://bj.58.com/sale.shtml'
    client = pymongo.MongoClient('localhost', 27017)
    tc_58 = client['58tc']
    tab_link_list = tc_58['link_list']

    web_data = requests.get(main_url)
    soup = BeautifulSoup(web_data.text, 'lxml')
    sub_menu_link = soup.select('ul.ym-submnu > li > b > a')

    link_list = []
    count = 0
    for link in sub_menu_link:
        link = 'http://bj.58.com' + link.get('href')
        # print(link)
        if link == 'http://bj.58.com/shoujihao/':
            pass
        elif link == 'http://bj.58.com/tongxunyw/':
            pass
        elif link == 'http://bj.58.com/tiaozao/':
            count += 1
            if count == 1:
                data = {'link': link}
                link_list.append(data)
        else:
            data = {'link': link}
            link_list.append(data)

    for i in link_list:
        tab_link_list.insert(i)
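To sanity-check what Module 1 wrote, the link_list collection can be queried directly. This is a minimal sketch, assuming the same local MongoDB instance and the pymongo 3.x API used above:

    import pymongo

    client = pymongo.MongoClient('localhost', 27017)
    tab_link_list = client['58tc']['link_list']
    print(tab_link_list.count())                      # number of category links stored
    for doc in tab_link_list.find({}, {'link': 1, '_id': 0}).limit(5):
        print(doc['link'])                            # e.g. http://bj.58.com/shouji/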

Module 2: Get the detail information for each item

    from bs4 import BeautifulSoup
    import requests, re, pymongo, sys
    from multiprocessing import Pool

    client = pymongo.MongoClient('localhost', 27017)
    tc_58 = client['58tc']
    # detail_link = tc_58['detail_link']
    tab_link_list = tc_58['link_list']
    # tc_58_data = client['58tcData']

    def getDetailUrl(page_url, tab):
        url_list = []
        web_data = requests.get(page_url)
        soup = BeautifulSoup(web_data.text, 'lxml')
        detail_url = soup.select('div.infocon > table > tbody > tr > td.t > a[onclick]')

        # collect the detail-page URLs on this listing page
        for url in detail_url:
            url_list.append(url.get('href').split('?')[0])

        # insert them into MongoDB, one collection per category
        count = 0
        client = pymongo.MongoClient('localhost', 27017)
        tc_58 = client['58tc']
        tab_list = tc_58[tab + '_list']
        for i in url_list:
            count += 1
            tab_list.insert({'link': i})
        return count

    original_price_patt = re.compile('原价:(.+)')

    def getInfo(detail_url):
        try:
            web_data = requests.get(detail_url)
            soup = BeautifulSoup(web_data.text, 'lxml')
            title = soup.title.text.strip()
            view_count = soup.select('body > div.content > div > div.box_left > div.info_lubotu.clearfix > div.box_left_top > p > span.look_time')[0].text
            want_count = soup.select('body > div.content > div > div.box_left > div.info_lubotu.clearfix > div.box_left_top > p > span.want_person')[0].text
            current_price = soup.select('body > div.content > div > div.box_left > div.info_lubotu.clearfix > div.info_massege.left > div.price_li > span > i')
            current_price = current_price[0].text if current_price else None
            original_price = soup.select('body > div.content > div > div.box_left > div.info_lubotu.clearfix > div.info_massege.left > div.price_li > span > b')
            original_price = original_price[0].text if original_price else None
            original_price = re.findall(original_price_patt, original_price) if original_price else None
            location = soup.select('body > div.content > div > div.box_left > div.info_lubotu.clearfix > div.info_massege.left > div.palce_li > span > i')[0].text
            tag = soup.select('body > div.content > div > div.box_left > div.info_lubotu.clearfix > div.info_massege.left > div.biaoqian_li')
            tag = list(tag[0].stripped_strings) if tag else None
            seller_name = soup.select('body > div.content > div > div.box_right > div.personal.jieshao_div > div.personal_jieshao > p.personal_name')[0].text
            # level = soup.select('body > div.content > div > div.box_right > div.personal.jieshao_div > div.personal_jieshao > span')
            # level = str(level[0]).split('\n')
            #
            # full_count = 0
            # half_count = 0
            # for j in level:
            #     if '<span class="icon_png "></span>' == j:
            #         full_count += 1
            #     elif '<span class="icon_png smallScore"></span>' == j:
            #         half_count += 1
            # count the seller's full and half rating stars directly
            full_count = len(soup.find_all('span', class_='icon_png '))
            half_count = len(soup.find_all('span', class_='icon_png smallScore'))

            level_count = {'full': full_count, 'half': half_count}
            desc = soup.select('body > div.content > div > div.box_left > div:nth-of-type(3) > div > div > p')
            desc = desc[0].text if desc else None
            data = {
                'title': title,
                'view_count': view_count,
                'want_count': want_count,
                'current_price': current_price,
                'original_price': original_price,
                'location': location,
                'tag': tag,
                'seller_name': seller_name,
                # 'level': level,
                'level_count': level_count,
                'desc': desc,
                'link': detail_url
            }
            return data
        except:
            print(sys.exc_info()[0], sys.exc_info()[1])
            return None

    # for i in tab_link_list.find({}, {'link': 1, '_id': 0}):
    #     print(i['link'])
    #     getDetailUrl(i['link'])

    # each category listing has at most 70 pages
    def insertDetailLin(sub_menu_list):
        patt = re.compile('.+?com/([a-z]+)/')
        tab_list = []
        # for i in [{'link': 'http://bj.58.com/shouji/'}]:
        for i in sub_menu_list.find({}, {'link': 1, '_id': 0}):
            i = i['link']
            sub_menu_name = re.findall(patt, i)[0]
            print(sub_menu_name + ': ', end='')
            url_list = []
            for j in range(1, 71):
                link = i + 'pn' + str(j)
                url_list.append(link)

            cnt = 0
            for k in url_list:
                cnt = cnt + getDetailUrl(k, sub_menu_name)
            print(str(cnt) + ' lines inserted')
            if cnt != 0:
                tab_list.append(sub_menu_name + '_list')
        return tab_list

    # for i in tab_link_list.find({}, {'link': 1, '_id': 0}):
    #     print(i)

    # insertDetailLin(tab_link_list)

    allMenCollectionName = tc_58.collection_names()
    # allMenCollectionName.remove('detail_link')
    allMenCollectionName.remove('link_list')

    def insertData(tab_name):
        client = pymongo.MongoClient('localhost', 27017)
        tc_58 = client['58tc']
        tc_58_data = client['58tcDataNew']
        fenLei = tab_name[:-5]                    # strip the '_list' suffix
        fenLei = tc_58_data[fenLei + '_data']
        tab_name = tc_58[tab_name]
        # print(tab_name)
        for i in tab_name.find({}, {'link': 1, '_id': 0}):
            data = getInfo(i['link'])
            if data:                              # getInfo returns None on failure; skip those
                fenLei.insert(data)

    def getContinuingly(fenlei):
        client = pymongo.MongoClient('localhost', 27017)
        tc_58_data = client['58tcDataNew']
        tc_58 = client['58tc']
        fenlei_data = tc_58_data[fenlei + '_data']
        fenlei_list = tc_58[fenlei + '_list']
        db_urls = [item['link'] for item in fenlei_data.find()]
        index_url = [item['link'] for item in fenlei_list.find()]
        x = set(db_urls)
        y = set(index_url)
        rest_of_urls = y - x                      # URLs not yet scraped
        return list(rest_of_urls)

    def startgetContinuingly(fenlei):
        client = pymongo.MongoClient('localhost', 27017)
        tc_58_data = client['58tcDataNew']
        fenLei = tc_58_data[fenlei + '_data']
        # rest_of_urls = getContinuingly('chuang')
        rest_of_urls = getContinuingly(fenlei)
        # print(rest_of_urls)
        for i in rest_of_urls:
            data = getInfo(i)
            if data:
                fenLei.insert(data)

    # startgetContinuingly('bijiben')
    pool = Pool()
    pool.map(insertData, allMenCollectionName)
    # pool.map(insertData, ['chuang_list'])
    # insertData(allMenCollectionName)
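Note that the Pool is created at module level here. With Python's multiprocessing, the worker-spawning code is normally placed behind a main guard so the module can be re-imported safely in child processes (required on Windows, good practice elsewhere). A minimal sketch of that adjustment, reusing insertData and allMenCollectionName from above:

    if __name__ == '__main__':
        pool = Pool()
        pool.map(insertData, allMenCollectionName)
        pool.close()
        pool.join()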

Module 3: Analysis

    from collections import Counter
    import pymongo, charts

    def getTotalCount(database, host=None, port=None):
        client = pymongo.MongoClient(host, port)
        db = client[database]
        tab_list = db.collection_names()
        # print(tab_list)
        count = 0
        for i in tab_list:
            count = count + db[i].find({}).count()
        print(count)
        return count

    # getTotalCount('58tcDataNew')
    # 14700

    def getAreaByClassify(classify, database='58tcDataNew', host=None, port=None):
        client = pymongo.MongoClient(host, port)
        db = client[database]
        classify = classify + '_data'
        # location_list = [i['location'][3:] if i['location'] != '' and i['location'][:2] == '北京' else None for i in db['bijiben_data'].find(filter={},projection={'location':1,'_id':0})]
        # keep only Beijing locations, dropping the leading '北京' and empty district names
        location_list = [i['location'][3:] for i in db[classify].find(filter={}, projection={'location': 1, '_id': 0})
                         if i['location'] != '' and i['location'][:2] == '北京' and i['location'][3:] != '']
        loc_name = list(set(location_list))
        dic_count = {}
        for i in loc_name:
            dic_count[i] = location_list.count(i)
        return dic_count

    # bijiben_area_count = getAreaByClassify(classify='yueqi')
    # print(bijiben_area_count)
    # danche_area_count = getAreaByClassify(classify='danche')
    # sum_area_count = Counter(bijiben_area_count) + Counter(danche_area_count)
    # print(sum_area_count)

    def myCounter(L, database='58tcDataNew', host=None, port=None):
        client = pymongo.MongoClient(host, port)
        db = client[database]
        tab_list = db.collection_names()
        dic_0 = {}
        for i in tab_list:
            loc = i[:-5] + '_area_count'
            dic_0[loc] = 0

        if not L:
            return Counter(dic_0)
        else:
            # recursively sum the per-category counters
            return Counter(L[0]) + myCounter(L[1:])

    def getAllCount(database='58tcDataNew', host=None, port=None):
        client = pymongo.MongoClient(host, port)
        db = client[database]
        tab_list = db.collection_names()
        dic_all_count = {}
        for i in tab_list:
            dic = getAreaByClassify(i[:-5])
            loc = i[:-5] + '_area_count'
            dic_all_count[loc] = dic

        dic_val = [dic_all_count[x] for x in dic_all_count]
        my = myCounter(dic_val)

        dic_all_count['total_area_count'] = dict(my)
        return dic_all_count

    dic_all_count = getAllCount()
    # print(dic_all_count['bijiben_area_count'])
    # print(dic_all_count['total_area_count'])

    # one Highcharts column series per district
    tmp_list = []
    for i in dic_all_count['total_area_count']:
        data = {
            'name': i,
            'data': [dic_all_count['total_area_count'][i]],
            'type': 'column'
        }
        tmp_list.append(data)

    options = {
        'chart': {'zoomType': 'xy'},
        'title': {'text': '北京58同城二手交易信息发布区域分布图'},
        'subtitle': {'text': '数据来源: 58.com'},
        'xAxis': {'categories': ['']},
        'yAxis': {'title': {'text': '数量'}},
        'plotOptions': {'column': {'dataLabels': {'enabled': True}}}
    }
    charts.plot(tmp_list, show='inline', options=options)
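The total_area_count series comes from adding the per-category dictionaries with collections.Counter, which sums values key by key. A tiny illustration of that behaviour with hypothetical district counts (not real data):

    from collections import Counter

    counts_a = Counter({'朝阳': 3, '海淀': 1})   # hypothetical per-category area counts
    counts_b = Counter({'朝阳': 2, '昌平': 4})
    print(counts_a + counts_b)                   # Counter({'朝阳': 5, '昌平': 4, '海淀': 1})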

 
 
