最近在学习 MongoDB，学会了一些最基本的操作，于是想结合股票行情数据的抓取，把抓到的数据存储到 MongoDB 中。MongoDB 和关系型数据库最大的区别是不需要事先建表，可以直接存储。需要注意的是，插入数据时必须先把 str 格式的字符串转换成 JSON（字典）格式再插入——这一点我调试了十多分钟，一直以为是自己拼的字符串有问题，对照了插入数据的格式并百度之后才发现。数据插入在本机的 test.Share 表（集合）中，其他就没有什么特别需要注意的了~代码写得很丑，冗余也很大，后面还会继续更新~另外程序目前是单进程抓取数据的~嗯~很蠢~
- #-*-coding:utf-8 -*-
- import urllib
- import re
- import json
- import urllib2
- from lxml import etree
- import requests
- import time
- from Queue import Queue
- from pymongo import MongoClient
- import matplotlib.pyplot as plt
# Address of the listing page fetched by get_type_url(); it is an empty
# string in this copy of the script.
URL = ''

# Work queue holding the trailing path segment of every link found on the
# listing page; get_precent() drains it.
nation_que = Queue()

# Documents are written to the "Share" collection of the "test" database on
# the MongoDB server listening on localhost:27017.
client = MongoClient(host='localhost', port=27017)
db = client['test']
Share = db['Share']
def sub_sort(array, array1, low, high):
    """Partition ``array[low..high]`` around ``array[low]``, in place.

    ``array1`` is a companion list: every move applied to ``array`` is
    mirrored on ``array1`` so the two lists stay aligned element by element.

    Args:
        array: list of sort keys; modified in place.
        array1: list of values paired with ``array``; modified in place.
        low: index of the first element of the range (also the pivot).
        high: index of the last element of the range.

    Returns:
        The final index of the pivot. Elements before it are smaller than the
        pivot, elements after it are greater than or equal to it.
    """
    pivot, pivot_partner = array[low], array1[low]
    left, right = low, high
    while left < right:
        # Skip, from the right, everything that may stay right of the pivot.
        while left < right and array[right] >= pivot:
            right -= 1
        # array[right] is smaller than the pivot: drop it into the hole at
        # ``left``, then refill the slot at ``right`` with the next candidate
        # coming from the left side.
        while left < right and array[right] < pivot:
            array[left], array1[left] = array[right], array1[right]
            left += 1
            array[right], array1[right] = array[left], array1[left]
    # The cursors have met on the hole; the pivot pair goes there.
    array[left], array1[left] = pivot, pivot_partner
    return left
def quick_sort(array, array1, low, high):
    """Sort ``array[low..high]`` ascending in place, keeping ``array1`` aligned.

    Args:
        array: list of sort keys; modified in place.
        array1: companion list reordered with the same moves as ``array``.
        low: index of the first element of the range to sort.
        high: index of the last element of the range to sort.
    """
    # Ranges of zero or one element are already sorted.
    if low >= high:
        return
    pivot_at = sub_sort(array, array1, low, high)
    quick_sort(array, array1, low, pivot_at)
    quick_sort(array, array1, pivot_at + 1, high)
def download(url, headers, num_try=2, timeout=10):
    """Fetch ``url`` with an HTTP GET and return the response body as text.

    Args:
        url: address to request.
        headers: dict of HTTP headers sent with the request.
        num_try: maximum number of attempts before giving up.
        timeout: seconds to wait for the server on each attempt, so a stalled
            connection cannot block the crawl forever.

    Returns:
        The decoded response body, or ``None`` when every attempt failed.
    """
    for _ in range(num_try):
        try:
            response = requests.get(url, headers=headers, timeout=timeout)
        except requests.RequestException as e:
            # requests reports failures through its own exception hierarchy,
            # not urllib2.URLError, so this is the type that must be caught
            # for the retry to happen.
            print('Download error %s' % (e,))
        else:
            return response.text
    return None
# One FIFO queue per quote field. get_type_url() deals the table cells of the
# listing page into them in exactly this order, and get_precent() pops one
# value from each queue per instrument.
(current_quto, open_quto, high_quto,
 low_quto, close_quto, update_time) = (Queue() for _ in range(6))
def get_type_url():
    """Scrape the listing page at ``URL`` and start the per-instrument crawl.

    The text of every ``<td>`` cell is dealt round-robin into the six field
    queues (current, open, high, low, close, update time), and the last path
    segment of every ``a.mar_name`` link is queued in ``nation_que``. Once the
    queues are filled, ``get_precent()`` is called to process them.

    If the page cannot be downloaded or parsed, a message is printed and
    nothing is queued.
    """
    headers = {
        # The header name is "User-Agent"; the previous "User_agent" spelling
        # was sent as an unrelated header, so no browser user agent reached
        # the server.
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/61.0.3163.100 Safari/537.36',
        'Referer': '',
        'Cookie': 'io=-voMclEjiizK9nWKALqB; UM_distinctid=15f5938ddc72db-089cf9ba58d9e5-31657c00-fa000-15f5938ddc8b24; Hm_lvt_d25bd1db5bca2537d34deae7edca67d3=1509030420; Hm_lpvt_d25bd1db5bca2537d34deae7edca67d3=1509031023',
        'Accept-Language': 'zh-CN,zh;q=0.8',
        'Accept-Encoding': 'gzip, deflate',
        'Accept': '*/*'
    }
    content = download(URL, headers)
    # download() returns None when every attempt failed; an empty body cannot
    # be parsed either.
    if not content:
        print('Download error %s' % ('no content for the listing page',))
        return
    html = etree.HTML(content)
    if html is None:
        print('Download error %s' % ('listing page could not be parsed',))
        return
    result = html.xpath('//a[@class="mar_name"]/@href')
    result1 = html.xpath('//td/text()')

    # Cells come in groups of six; the position inside the group selects the
    # queue that receives the cell.
    field_queues = (current_quto, open_quto, high_quto,
                    low_quto, close_quto, update_time)
    for num, each in enumerate(result1):
        field_queues[num % 6].put(each)

    for each in result:
        # Only the trailing path segment of the link is kept.
        nation_que.put(each.split('/')[-1])
    get_precent()
def _next_or(queue, fallback):
    """Pop and return the next item of ``queue``, or ``fallback`` if it is empty."""
    if queue.empty():
        return fallback
    item = queue.get(False)
    queue.task_done()
    return item


def get_precent():
    """Fetch the time series of every queued instrument and hand it to draw_pict().

    For each code in ``nation_que`` one value is popped from each of the six
    field queues. When a field queue has run dry the value of the previous
    instrument is reused, and an empty string is used if there was none. The
    series is then requested as JSON, with at most two attempts. When both
    the ``'h'`` and ``'t'`` arrays of the response are non-empty they are
    passed, together with the popped fields, to ``draw_pict()``.

    Network errors and malformed responses are printed and retried; after the
    last attempt the instrument is skipped so the rest of the queue is still
    processed.
    """
    headers = {'Accept':'application/json, text/javascript, */*; q=0.01',
               'Accept-Encoding':'gzip, deflate',
               'Accept-Language':'zh-CN,zh;q=0.8',
               'Connection':'keep-alive',
               'Host':'',
               'Origin':'',
               'Referer':'',
               'User-Agent':'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/61.0.3163.100 Safari/537.36'
               }
    # Defined up front so an empty field queue on the very first instrument
    # cannot leave a name unbound.
    time_update = new_rates = opening = high = low = closing = ''
    while not nation_que.empty():
        time_update = _next_or(update_time, time_update)
        new_rates = _next_or(current_quto, new_rates)
        opening = _next_or(open_quto, opening)
        high = _next_or(high_quto, high)
        low = _next_or(low_quto, low)
        closing = _next_or(close_quto, closing)
        ss = nation_que.get(False)
        print(ss)
        print(low)
        print(high)
        print(time_update)
        print(new_rates)
        print(opening)
        url = '' + ss +'&limit=288&resolution=5&codeType=8100&st=0.8274405615006541'
        print(url)
        for _ in range(2):
            try:
                response = requests.get(url, headers=headers, timeout=10)
                payload = json.loads(response.text)
                st = payload['h']
                T_time = payload['t']
            except (requests.RequestException, ValueError, KeyError, TypeError) as e:
                # requests raises RequestException subclasses (not
                # urllib2.URLError); ValueError/KeyError/TypeError cover a
                # body that is not the expected JSON object.
                print('Download error %s' % (e,))
                continue
            if len(st) > 0 and len(T_time) > 0:
                draw_pict(ss, T_time, st, time_update, new_rates, opening, high, low, closing)
            # A response was received and parsed: no further attempt needed.
            break
        nation_que.task_done()
# Series collected by draw_pict(), one list per instrument, plotted later by
# draw_picture().
List = []


def draw_pict(name,T_time1,high_rate,time_update,new_rate,opening,high,low,closing):
    """Store one instrument's quote in MongoDB and queue its series for plotting.

    A document with the summary fields and a ``Real_TIme_infor`` mapping of
    timestamp -> value is inserted into the ``Share`` collection. All values
    are stored as strings. The samples whose local-time minute is 0 are
    collected, ordered by their hour with ``quick_sort()``, and appended to
    the module-level ``List``.

    Args:
        name: instrument code, stored under ``Type``.
        T_time1: sequence of timestamps (seconds since the epoch, as numbers
            or numeric strings).
        high_rate: sequence of values, parallel to ``T_time1``.
        time_update: stored under ``Update_Time``.
        new_rate: stored under ``Current_quto``.
        opening: stored under ``Opening_quto``.
        high: stored under ``High_quto``.
        low: stored under ``low_quto``.
        closing: stored under ``Closing_quto``.
    """
    real_time = {}
    High_Rate = []
    T_time = []
    for each, high1 in zip(T_time1, high_rate):
        real_time["%s" % (each,)] = "%s" % (high1,)
        st = time.localtime(float(each))
        # Keep only the samples that fall exactly on the hour.
        if st.tm_min == 0:
            T_time.append(st.tm_hour)
            High_Rate.append(high1)

    # The document is built as a dict rather than by concatenating a JSON
    # string and parsing it back, which failed on any value containing a
    # quote or a backslash.
    document = {
        "Type": "%s" % (name,),
        "Current_quto": "%s" % (new_rate,),
        "Opening_quto": "%s" % (opening,),
        "High_quto": "%s" % (high,),
        "low_quto": "%s" % (low,),
        "Closing_quto": "%s" % (closing,),
        "Update_Time": "%s" % (time_update,),
        "Real_TIme_infor": real_time,
    }
    print(document)
    Share.insert(document)

    # T_time and High_Rate always have the same length (they are appended
    # together), so the values can be ordered by hour directly.
    quick_sort(T_time, High_Rate, 0, len(High_Rate) - 1)
    List.append(High_Rate)
def draw_picture():
    """Plot every series stored in ``List`` on a two-column grid of subplots.

    Each series is drawn against its sample index (0, 1, 2, ...) with a
    ``*`` marker and the title 'Share Message'.
    """
    colu = len(List)
    # Two plots per row, rounded up, so no empty extra row is allocated;
    # floor division keeps the row count an integer.
    rows = (colu + 1) // 2
    for num, series in enumerate(List, 1):
        plt.subplot(rows, 2, num)
        positions = list(range(len(series)))
        print(len(series))
        print(len(positions))
        plt.plot(positions, series, marker='*')
        plt.title('Share Message')
if __name__ == '__main__':
    # Crawl and store the quotes, then build the figure from the collected
    # series.
    get_type_url()
    draw_picture()
    # The figure is only built by draw_picture(); show() is what actually
    # displays it when the script is run non-interactively.
    plt.show()
