1. import math
  2. import random
  3. import re
  4. import sys
  5. import threading
  6. from time import ctime, sleep
  7. from lxml import etree
  8. import pprint
  9. import requests
  10. from selenium import webdriver
  11.  
  12. f = open('spider_北上广深_district.txt', 'r', encoding='utf-8')
  13. f.closed
  14. POOL_URL_DISTRICT_LIST = []
  15. for i in f:
  16. d = i.replace('\n', '').replace(' ', '').split('"')
  17. for ii in d:
  18. if ii.find('http') > -1:
  19. POOL_URL_DISTRICT_LIST.append(ii)
  20.  
  21. POOL_URL_DISTRICT_MAXPAGE_NUM_DIC = {}
  22.  
  23. res_dic = {}
  24.  
  25. # https://m.lianjia.com/gz/ershoufang/tianhequ/pg23/
  26. # https://m.lianjia.com/sz/ershoufang/longhuaqu/pg23/
  27. # https://m.lianjia.com/bj/ershoufang/haidianqu/pg34/
  28. # 'https://sz.lianjia.com/ershoufang/futianqu/'
  29. # 'https://gz.lianjia.com/ershoufang/panyu/'
  30. # 'https://bj.lianjia.com/ershoufang/miyun/'
  31.  
  32. POOL_URL_DISTRICT_LIST_B = []
  33. MAX_PAGE_NUM = 100
  34.  
  35. def gen_url(num=MAX_PAGE_NUM):
  36. for url in POOL_URL_DISTRICT_LIST:
  37. l = url.split('//')[1].split('lianjia')
  38. [city, district] = l
  39. city = city[:-1]
  40. district = district.split('.com/')[1]
  41. if city != 'sh':
  42. url_ = '%s%s/%s' % ('https://m.lianjia.com/', city, district)
  43. else:
  44. url_ = '%s%s/' % ('http://m.sh.lianjia.com/', district)
  45.  
  46. POOL_URL_DISTRICT_MAXPAGE_NUM_DIC[url_] = num
  47. POOL_URL_DISTRICT_LIST_B.append(url_)
  48.  
  49. gen_url()
  50. exception_url_list = []
  51.  
  52. URL_NUM_EACH_THREAD = 100 * 0.6 * 4
  53. res_dic = {}
  54.  
  55. # QPS_TIME_UNIT_B = 2
  56.  
  57. todo_url_list = []
  58. for url_ in POOL_URL_DISTRICT_LIST_B:
  59. if url_.find('sh.') > -1:
  60. page_addition = 'd'
  61. else:
  62. page_addition = 'pg'
  63. for page_num in range(1, MAX_PAGE_NUM, 1):
  64. url = '%s%s%s/' % (url_, page_addition, page_num)
  65. todo_url_list.append(url)
  66.  
  67. LEN = len(todo_url_list)
  68.  
  69. browser = webdriver.Firefox()
  70.  
  71. def grab_todo_url_list(browser):
  72. global res_dic, todo_url_list
  73.  
  74. d = random.randint(1, 2)
  75. if d % 2 == 0:
  76. todo_url_list.reverse()
  77.  
  78. my_control = len(todo_url_list)
  79. my_control_start = random.randint(0, my_control)
  80. for i in range(my_control_start, my_control, 1):
  81. if len(todo_url_list) > i:
  82. url = todo_url_list[i]
  83. if url not in todo_url_list:
  84. continue
  85. sleep(1)
  86. browser.get(url)
  87. html = browser.page_source
  88. web_site = ''
  89. url_pass_flag = 0
  90. if html.find('price_total') > -1:
  91. selector = etree.HTML(html)
  92. url_l = selector.xpath('//a[@class="a_mask"]/@href')
  93. des_l = selector.xpath('//div[@class="item_other text_cut"]/text()')
  94. price_total_l = selector.xpath('//span[@class="price_total"]/em/text()')
  95. unit_price_l = selector.xpath('//span[@class="unit_price"]/text()')
  96. url_pass_flag = 1
  97. elif html.find('xiaoquname') > -1:
  98. web_site = 'sh'
  99. selector = etree.HTML(html)
  100. url_l = selector.xpath('//ul[@class="fang-list"]/li/a/@href')
  101. xiaoquname_l = selector.xpath('//span[@class="xiaoquname"]/text()')
  102. area_l = selector.xpath('//p[@class="f-area"]/text()')
  103. price_total_l = selector.xpath('//span[@class="f-price"]/text()')
  104. url_pass_flag = 1
  105. # https://m.lianjia.com/bj/ershoufang/yizhuangkaifaqu/pg87
  106. # http://m.sh.lianjia.com/ershoufang/jinshan/d78
  107. elif html.find('搜索条件') > -1 or url.find('/lf/') > -1:
  108. print(111, url)
  109. url_pass_flag = 2
  110. if url_pass_flag == 1:
  111. res_dic[url] = {}
  112. len_l = len(url_l)
  113. res_dic[url]['items_list'] = []
  114. len_l_ = len_l - 1
  115. for i in range(0, len_l_, 1):
  116. d = {}
  117. d['spider_url'] = url
  118. d['item_url'] = url_l[i]
  119. d['des'] = des_l[i] if web_site == '' else '%s||%s' % (
  120. area_l[i].replace('\n', '').replace(',', '').replace(' ', ''),
  121. xiaoquname_l[i].replace(',', '').replace(' ', ''))
  122. d['price_total'] = price_total_l[i]
  123. d['unit_price'] = unit_price_l[i] if web_site == '' else 'sh'
  124. res_dic[url]['items_list'].append(d)
  125. if url_pass_flag != 0:
  126. if url in todo_url_list:
  127. l_index = todo_url_list.index(url)
  128. del todo_url_list[l_index]
  129. else:
  130. if url not in todo_url_list:
  131. todo_url_list.append(url)
  132. browser.close()
  133.  
  134. class MyThread(threading.Thread):
  135. def __init__(self, func, args, name=''):
  136. threading.Thread.__init__(self)
  137. self.name = name
  138. self.func = func
  139. self.args = args
  140.  
  141. def run(self):
  142. self.func(self.args)
  143.  
  144. MAX_EXCEPTION_URL_NUM = 0
  145.  
  146. def deal_exception_url_list():
  147. global todo_url_list
  148. browser = webdriver.Firefox()
  149. if len(todo_url_list) > MAX_EXCEPTION_URL_NUM:
  150. grab_todo_url_list(browser)
  151. else:
  152. return
  153. deal_exception_url_list()
  154.  
  155. POOL_URL_LEN_B = len(POOL_URL_DISTRICT_LIST_B)
  156.  
  157. def main():
  158. print('starting at:', ctime())
  159. threads_list = []
  160. thread_sum = math.ceil(LEN / URL_NUM_EACH_THREAD)
  161. for nloop in range(0, thread_sum, 1):
  162. browser = webdriver.Firefox()
  163. thread_instance = MyThread(grab_todo_url_list, (browser), grab_todo_url_list.__name__)
  164. threads_list.append(thread_instance)
  165. # 主进程将在所有非守护进程退出后,退出
  166. for t in threads_list:
  167. t.setDaemon = False
  168. t.start()
  169. # wait for all thrades to finish
  170. for t in threads_list:
  171. t.join()
  172. # pprint.pprint(res_dic)
  173. deal_exception_url_list()
  174. print('end_r:', ctime())
  175. f_name = 'mobile_lianjia_ershoufang_BSGS.csv'
  176. f = open(f_name, 'w', encoding='utf-8-sig')
  177. str = 'spider_url,item_url,des,price_total,unit_price\n'
  178. f.write(str)
  179. f.closed
  180. f = open(f_name, 'a', encoding='utf-8-sig')
  181.  
  182. for url in res_dic:
  183. try:
  184. for d in res_dic[url]['items_list']:
  185. str = '%s,%s,%s,%s,%s\n' % (d['spider_url'], d['item_url'], d['des'], d['price_total'], d['unit_price'])
  186. f.write(str)
  187. except Exception:
  188. print(Exception)
  189. f.closed
  190. print('end_w:', ctime())
  191.  
  192. if __name__ == '__main__':
  193. main()

  1. # -*- coding: UTF-8 -*-
  2. import math
  3. import random
  4. import sys
  5. import threading
  6. from time import ctime, sleep
  7.  
  8. import requests
  9.  
  10. MAX_PAGINATION = 100
  11. pagination = MAX_PAGINATION
  12. QPS = 50
  13. QPS_TIME_UNIT = 1
  14. # http://lbs.amap.com/api/webservice/guide/tools/info
  15. INFOCODE_OK = ''
  16. file_name_key_pool = 'key_pool.pool'
  17. KEY_POOL_LIST = []
  18. touse_key = ''
  19. f = open(file_name_key_pool, 'r', encoding='utf-8')
  20. for i in f:
  21. try:
  22. list_ = i.split('\t')
  23. key = i.split('\t')[1].split()
  24. KEY_POOL_LIST.append(key[0])
  25. except Exception:
  26. print(Exception)
  27. KEY_POOL_NUM_INDICATOR = len(KEY_POOL_LIST)
  28.  
  29. URL_TYPE = 'http://restapi.amap.com/v3/place/text'
  30. touse_key = ''
  31. keywords = '&keywords='
  32. OFFSET = '&offset=2'
  33. CITYLIMIT = '&citylimit=false'
  34. # 060100 购物服务 商场 商场
  35. # 060101 购物服务 商场 购物中心
  36. # 060102 购物服务 商场 普通商场
  37. # 060400 购物服务 超级市场 超市
  38.  
  39. POI_TYPES = '&types=060100|060101|060102|060400'
  40. URL_FOR_CHANGE_KEY = 'http://restapi.amap.com/v3/place/text?key=%s&types=060100&city=010&OFFSET=1'
  41.  
  42. change_key_qps = 0
  43.  
  44. def change_key():
  45. global touse_key, change_key_qps
  46. # 高德没有遵守自己的QPS/日限策略;所不能通过其返回码,来控制key的使用;
  47. mean_use_key = random.randint(0, KEY_POOL_NUM_INDICATOR)
  48. for i in range(mean_use_key, KEY_POOL_NUM_INDICATOR, 1):
  49. key = KEY_POOL_LIST[i]
  50. if key == touse_key:
  51. if i == KEY_POOL_NUM_INDICATOR:
  52. change_key()
  53. return
  54. else:
  55. continue
  56. touse_key = key
  57. url = URL_FOR_CHANGE_KEY % (touse_key)
  58. try:
  59. change_key_qps += 1
  60. if change_key_qps % QPS == 0:
  61. sleep(QPS_TIME_UNIT)
  62. r = requests.get(url)
  63. json_ = r.json()
  64. except Exception:
  65. print('requests.get(url)', Exception)
  66. change_key()
  67. return
  68. infocode = json_['infocode']
  69. if not infocode == INFOCODE_OK:
  70. if i == KEY_POOL_NUM_INDICATOR:
  71. sys.exit('NOInvalidKEY')
  72. change_key()
  73. return
  74. return
  75.  
  76. FNAME = '【商场任务】28个城市_任务列表_20170727 - 副本.csv'
  77. tosupply_dic = {}
  78. todo_list = []
  79. fo = open(FNAME, 'r', encoding='gbk')
  80. file_line_num = 0
  81. for i in fo:
  82. file_line_num += 1
  83. if file_line_num == 1:
  84. continue
  85. todo_list.append(file_line_num)
  86. tosupply_dic[file_line_num] = {}
  87. l = i.replace('\n', '').replace(',,,,,,,', '').split(',')
  88. dic_ = {}
  89. dic_['sequence_number'] = l[0]
  90. dic_['type'] = l[1]
  91. dic_['city'] = l[2]
  92. dic_['district'] = l[3]
  93. dic_['address'] = l[4]
  94. dic_['name'] = l[5]
  95. dic_['gd_type_1'], dic_['gd_type_2'], dic_['gd_type_3'], dic_['gd_name'], dic_['gd_province'], dic_['gd_city'], \
  96. dic_['gd_district'], dic_['gd_address'] = ['', '', '', '', '', '', '', '']
  97. tosupply_dic[file_line_num] = dic_
  98. LEN = len(todo_list)
  99. EACH_THREAD_REQUEST_NUM = 30
  100.  
  101. requests_counter = 0
  102. tosupply_dic_len = len(tosupply_dic)
  103. tosupply_dic_len_ = tosupply_dic_len - 1
  104.  
  105. def supply_dic(nloop):
  106. global tosupply_dic, requests_counter, todo_list
  107. print(len(todo_list))
  108. d = random.randint(1, 2)
  109. if d % 2 == 0:
  110. todo_list.reverse()
  111.  
  112. for file_line_num in todo_list:
  113. if file_line_num not in todo_list:
  114. continue
  115. t = threading.current_thread()
  116. print('nloop=',nloop)
  117. print(' t._ident=',t._ident)
  118. dic_ = tosupply_dic[file_line_num]
  119. city = dic_['district']
  120. name = dic_['name']
  121. url = '%s?key=%s&keywords=%s&city=%s%s%s%s' % (URL_TYPE, touse_key, name, city, POI_TYPES, OFFSET, CITYLIMIT)
  122. if requests_counter % QPS == 0:
  123. sleep(QPS_TIME_UNIT)
  124. try:
  125. r = requests.get(url)
  126. r_json = r.json()
  127. except Exception:
  128. if file_line_num not in todo_list:
  129. todo_list.append(file_line_num)
  130. continue
  131. infocode = r_json['infocode']
  132. if infocode == '':
  133. count = r_json['count']
  134. if int(count) > 0:
  135. pois_list = r_json['pois']
  136. pos_dic = pois_list[0]
  137. tosupply_dic[file_line_num]['gd_type_one'] = pos_dic['type']
  138. tosupply_dic[file_line_num]['gd_type_1'], tosupply_dic[file_line_num]['gd_type_2'], \
  139. tosupply_dic[file_line_num]['gd_type_3'] = pos_dic['type'].split('|')[0].split(';')
  140. tosupply_dic[file_line_num]['gd_province'] = pos_dic['pname']
  141. tosupply_dic[file_line_num]['gd_city'] = pos_dic['cityname']
  142. tosupply_dic[file_line_num]['gd_district'] = pos_dic['adname']
  143. tosupply_dic[file_line_num]['gd_address'] = pos_dic['address']
  144. elif int(count) == 0:
  145. tosupply_dic[file_line_num]['gd_name'] = 'GD-NO-DATA'
  146. if file_line_num in todo_list:
  147. list_index = todo_list.index(file_line_num)
  148. del todo_list[list_index]
  149. print(file_line_num)
  150. else:
  151. if file_line_num not in todo_list:
  152. todo_list.append(file_line_num)
  153. change_key()
  154.  
  155. MAX_EXCEPTION_URL_NUM = 0
  156.  
  157. def deal_exception_list():
  158. global todo_list
  159. print(todo_list)
  160. if len(todo_list) > MAX_EXCEPTION_URL_NUM:
  161. supply_dic()
  162. else:
  163. return
  164. deal_exception_list()
  165.  
  166. class MyThread(threading.Thread):
  167. def __init__(self, func, args, name=''):
  168. threading.Thread.__init__(self)
  169. self.name = name
  170. self.func = func
  171. self.args = args
  172.  
  173. def run(self):
  174. self.func(self.args)
  175.  
  176. def main():
  177. print('starting at:', ctime())
  178. threads_list = []
  179. thread_sum = math.ceil(tosupply_dic_len / EACH_THREAD_REQUEST_NUM)
  180. print(185, thread_sum)
  181. for nloop in range(0, thread_sum, 1):
  182. thread_instance = MyThread(supply_dic,(nloop),supply_dic.__name__)
  183. threads_list.append(thread_instance)
  184. # 主进程将在所有非守护进程退出后,退出
  185. for t in threads_list:
  186. t.setDaemon = False
  187. t.start()
  188. # wait for all thrades to finish
  189. for t in threads_list:
  190. t.join()
  191. deal_exception_list()
  192.  
  193. FGEN = 'GEN_28.csv'
  194. fo = open(FGEN, 'w', encoding='utf-8-sig')
  195. fo.write(
  196. '序号,类别编号,城市名称,区域 地址,商圈名,gd_type_one,gd_type_1,gd_type_2,gd_type_3,gd_name,gd_province,gd_city,gd_district,gd_address\n')
  197. fo.closed
  198. fo = open(FGEN, 'a', encoding='utf-8-sig')
  199. for file_line_num in tosupply_dic:
  200. if file_line_num == 1:
  201. continue
  202. dic_ = tosupply_dic[file_line_num]
  203. str = '%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s\n' % (
  204. dic_['sequence_number'], dic_['type'], dic_['city'], dic_['district'], dic_['address'],
  205. dic_['name'], dic_['gd_type_one'], dic_['gd_type_1'], dic_['gd_type_2'], dic_['gd_type_3'], dic_['gd_name'],
  206. dic_['gd_province'],
  207. dic_['gd_city'],
  208. dic_['gd_district'],
  209. dic_['gd_address'])
  210. fo.write(str)
  211. fo.closed
  212.  
  213. if __name__ == '__main__':
  214. main()

Memory note: runs fine on a machine with 16 GB RAM; fails on 4 GB.

  1. # -*- coding: UTF-8 -*-
  2. import math
  3. import random
  4. import sys
  5. import threading
  6. from time import ctime, sleep
  7.  
  8. import requests
  9.  
  10. # fo = open('电影院任务列表_20170724_新增列 - 副本 (12).csv', 'r', encoding='utf-8')
  11. # file_line_num = 0
  12. # for i in fo:
  13. # if file_line_num == 0:
  14. # continue
  15. # l = i.replace('\n','').split(',')
  16. #
  17. # ddd = 5
  18. # MAX_OFFSET = 25
  19. # OFFSET = MAX_OFFSET - 1
  20. MAX_PAGINATION = 100
  21. pagination = MAX_PAGINATION
  22. QPS = 50
  23. QPS_TIME_UNIT = 1
  24. # http://lbs.amap.com/api/webservice/guide/tools/info
  25. INFOCODE_OK = ''
  26. file_name_key_pool = 'key_pool.pool'
  27. KEY_POOL_LIST = []
  28. touse_key = ''
  29. f = open(file_name_key_pool, 'r', encoding='utf-8')
  30. for i in f:
  31. try:
  32. list_ = i.split('\t')
  33. key = i.split('\t')[1].split()
  34. KEY_POOL_LIST.append(key[0])
  35. except Exception:
  36. print(Exception)
  37. KEY_POOL_NUM_INDICATOR = len(KEY_POOL_LIST)
  38.  
  39. # http://restapi.amap.com/v3/place/text?key=&keywords=奥斯卡漯河店&types=080601&city=漯河&offset=1
  40. # http://restapi.amap.com/v3/place/text?key=&keywords=奥斯卡漯河店&types=080601&city=漯河&offset=2
  41. # 2得到的结果更接近期望值
  42. URL_TYPE = 'http://restapi.amap.com/v3/place/text'
  43. touse_key = ''
  44. keywords = '&keywords='
  45. OFFSET = '&offset=2'
  46. ## 己方数据 city-name 不匹配 citylimit true-->false
  47. # 华中区 湖北 钟祥 电影院 横店荆门店 横店影视股份有限公司 0 GD-NO-DATA
  48. CITYLIMIT = '&citylimit=false'
  49. # 080601 体育休闲服务 影剧院 电影院
  50. POI_TYPES = '&types=080601'
  51. URL_FOR_CHANGE_KEY = 'http://restapi.amap.com/v3/place/text?key=%s&types=060100&city=010&OFFSET=1'
  52.  
  53. change_key_qps = 0
  54.  
  55. def change_key():
  56. global touse_key, change_key_qps
  57. # 高德没有遵守自己的QPS/日限策略;所不能通过其返回码,来控制key的使用;
  58. mean_use_key = random.randint(0, KEY_POOL_NUM_INDICATOR)
  59. for i in range(mean_use_key, KEY_POOL_NUM_INDICATOR, 1):
  60. key = KEY_POOL_LIST[i]
  61. if key == touse_key:
  62. if i == KEY_POOL_NUM_INDICATOR:
  63. change_key()
  64. return
  65. else:
  66. continue
  67. touse_key = key
  68. url = URL_FOR_CHANGE_KEY % (touse_key)
  69. try:
  70. change_key_qps += 1
  71. if change_key_qps % QPS == 0:
  72. sleep(QPS_TIME_UNIT)
  73. r = requests.get(url)
  74. json_ = r.json()
  75. except Exception:
  76. print('requests.get(url)', Exception)
  77. change_key()
  78. return
  79. infocode = json_['infocode']
  80. if not infocode == INFOCODE_OK:
  81. if i == KEY_POOL_NUM_INDICATOR:
  82. sys.exit('NOInvalidKEY')
  83. change_key()
  84. return
  85. return
  86.  
  87. FNAME = '电影院任务列表_20170724.csv'
  88. tosupply_dic = {}
  89. fo = open(FNAME, 'r', encoding='gbk')
  90. file_line_num = 0
  91. for i in fo:
  92. file_line_num += 1
  93. if file_line_num == 1:
  94. continue
  95. tosupply_dic[file_line_num] = {}
  96. is_from_past_line = 1 if len(i.split(',,,,,,,,')) > 1 else 0
  97. l = i.replace('\n', '').replace(',,,,,,,', '').split(',')
  98. dic_ = {}
  99. dic_['sequence_number'] = l[0]
  100. dic_['area'] = l[1] if is_from_past_line == 0 else tosupply_dic[file_line_num - 1]['area']
  101. dic_['province'] = l[2] if is_from_past_line == 0 else tosupply_dic[file_line_num - 1][
  102. 'province']
  103. dic_['city'] = l[3] if is_from_past_line == 0 else tosupply_dic[file_line_num - 1]['city']
  104. dic_['district'] = ''
  105. dic_['address'] = ''
  106. dic_['buliding'] = ''
  107. dic_['longitude_latitude'] = ''
  108. dic_['busniess_type'] = l[8] if is_from_past_line == 0 else l[1]
  109. dic_['name'] = l[9] if is_from_past_line == 0 else l[2]
  110. dic_['parent_company'] = l[10] if is_from_past_line == 0 else tosupply_dic[file_line_num - 1]['parent_company']
  111. dic_['is_from_past_line'] = is_from_past_line
  112. dic_['gd_name'] = ''
  113. dic_['gd_city'] = ''
  114. tosupply_dic[file_line_num] = dic_
  115.  
  116. EACH_THREAD_REQUEST_NUM = 30
  117. exception_line_num_list = []
  118.  
  119. # keywords = '&keywords='
  120. # OFFSET = '&offset=1'
  121. # CITYLIMIT = '&citylimit=true'
  122. # # 080601 体育休闲服务 影剧院 电影院
  123. # POI_TYPES = '&types=080601'
  124. #
  125. # http://restapi.amap.com/v3/place/text?key=&keywords=重庆万盛DAV国际影城&city=重庆&types=080601&offset=1&citylimit=true
  126. requests_counter = 0
  127. tosupply_dic_len = len(tosupply_dic)
  128. tosupply_dic_len_ = tosupply_dic_len - 1
  129.  
  130. # thread_strat_file_line_num
  131. def supply_dic(thread_strat_file_line_num):
  132. global tosupply_dic, requests_counter, tosupply_dic_len_, exception_line_num_list
  133. for loop in range(0, EACH_THREAD_REQUEST_NUM, 1):
  134. file_line_num = thread_strat_file_line_num + loop
  135. if file_line_num - 2 > tosupply_dic_len_:
  136. return
  137. if file_line_num < 2 or len(tosupply_dic[file_line_num]['district']) > 0:
  138. if file_line_num in exception_line_num_list:
  139. list_index = exception_line_num_list.index(file_line_num)
  140. del exception_line_num_list[list_index]
  141. continue
  142. dic_ = tosupply_dic[file_line_num]
  143. city = dic_['city']
  144. name = dic_['name']
  145. url = '%s?key=%s&keywords=%s&city=%s%s%s%s' % (URL_TYPE, touse_key, name, city, POI_TYPES, OFFSET, CITYLIMIT)
  146. if requests_counter % QPS == 0:
  147. sleep(QPS_TIME_UNIT)
  148. try:
  149. r = requests.get(url)
  150. r_json = r.json()
  151. except Exception:
  152. if file_line_num not in exception_line_num_list:
  153. exception_line_num_list.append(file_line_num)
  154. continue
  155. infocode = r_json['infocode']
  156. if infocode == '':
  157. count = r_json['count']
  158. if int(count) > 0:
  159. pois_list = r_json['pois']
  160. pos_dic = pois_list[0]
  161. tosupply_dic[file_line_num]['district'] = pos_dic['adname']
  162.  
  163. if len(pos_dic['address']) <= 2:
  164. print(pos_dic)
  165. print(pos_dic['address'])
  166. tosupply_dic[file_line_num]['address'] = pos_dic['address'] if len(pos_dic['address']) > 2 else '高德缺地址'
  167. if len(pos_dic['address']) <= 2:
  168. print(tosupply_dic[file_line_num]['address'])
  169. tosupply_dic[file_line_num]['longitude_latitude'] = pos_dic['location']
  170. tosupply_dic[file_line_num]['gd_name'] = pos_dic['name']
  171. tosupply_dic[file_line_num]['gd_city'] = pos_dic['cityname']
  172.  
  173. elif int(count) == 0:
  174. tosupply_dic[file_line_num]['gd_name'] = 'GD-NO-DATA'
  175. if file_line_num in exception_line_num_list:
  176. list_index = exception_line_num_list.index(file_line_num)
  177. del exception_line_num_list[list_index]
  178. else:
  179. if file_line_num not in exception_line_num_list:
  180. exception_line_num_list.append(file_line_num)
  181. change_key()
  182.  
  183. MAX_EXCEPTION_URL_NUM = 0
  184.  
  185. def deal_exception_list():
  186. global exception_line_num_list
  187. print(exception_line_num_list)
  188. if len(exception_line_num_list) > MAX_EXCEPTION_URL_NUM:
  189. for thread_strat_file_line_num in exception_line_num_list:
  190. supply_dic(thread_strat_file_line_num)
  191. else:
  192. return
  193. deal_exception_list()
  194.  
  195. class MyThread(threading.Thread):
  196. def __init__(self, func, args, name=''):
  197. threading.Thread.__init__(self)
  198. self.name = name
  199. self.func = func
  200. self.args = args
  201.  
  202. def run(self):
  203. self.func(self.args)
  204.  
  205. # 通灌北路58号苏宁广场7层701
  206. # 林源路创基尚城B区3层
  207. # 民治书香门第上河坊2栋2层
  208. # 东欣大道东欣广场城市综合体E01四楼
  209. # 横栏镇茂辉工业区乐丰四路21号永耀商业广场B幢之四(永耀人才市场旁)
  210. # 信江路西侧金峰城市广场1栋5层502(鹏泰购物广场5楼、迪欧咖啡楼上)
  211. # 上陡门学院中路人才大厦一层(东瓯影城)
  212. # 丰庆路710号(世纪联华超市4楼)
  213. # 风度中路13号百老汇商业城5层(美特斯邦威楼上)
  214. # 龙阳路2000号(龙阳广场5层)
  215. # 容城大道东12号(容城天骄写字楼3层)
  216. # 解放大道387号(汉口宗关水厂)南国西汇城市广场二期5层
  217. # 后沙峪镇安泰大街9号院(中粮祥云小镇)7号楼2层
  218. # 华强新天地3楼华时代美食城门口(横店电影院门口)
  219. # 南三环西路16号1号楼首地大峡谷购物中心5层
  220. # 永兴路7号院1号楼龙湖北京大兴天街购物中心L3层Z2
  221. def gen_building(str):
  222. start_ = 0
  223. end_ = len(str)
  224. res = ''
  225. start_flag = 0
  226. end_flag = 0
  227. if str.find('号') > -1:
  228. start_ = str.find('号') + 1
  229. start_flag = 1
  230. elif str.find('交汇处') > -1:
  231. start_ = str.find('交汇处') + 1 + 2
  232. start_flag = 1
  233. elif str.find('交叉口') > -1:
  234. start_ = str.find('交叉口') + 1 + 2
  235. start_flag = 1
  236. elif str.find('路') > -1:
  237. start_ = str.find('路') + 1
  238. start_flag = 1
  239. elif str.find('道') > -1:
  240. start_ = str.find('道') + 1
  241. start_flag = 1
  242.  
  243. if str.find('层') > -1:
  244. end_ = str.find('层') + 1
  245. end_flag = 1
  246. elif str.find('楼') > -1:
  247. end_ = str.find('楼') + 1
  248. end_flag = 1
  249. if start_flag == 1 or end_flag == 1:
  250. res = ''.join((list(str)[start_:end_]))
  251. if res.find('(') > -1 or res.find('(') > -1:
  252. # new rule
  253. res = res.replace('(', '').replace(')', '').replace('(', '').replace(')', '')
  254. return res
  255.  
  256. def main():
  257. print('starting at:', ctime())
  258. threads_list = []
  259. thread_sum = math.ceil(tosupply_dic_len / EACH_THREAD_REQUEST_NUM)
  260. print(185, thread_sum)
  261. for nloop in range(0, thread_sum, 1):
  262. thread_strat_file_line_num = nloop * EACH_THREAD_REQUEST_NUM
  263. print(thread_strat_file_line_num)
  264. thread_instance = MyThread(supply_dic, (thread_strat_file_line_num), supply_dic.__name__)
  265. threads_list.append(thread_instance)
  266. # 主进程将在所有非守护进程退出后,退出
  267. for t in threads_list:
  268. t.setDaemon = False
  269. t.start()
  270. # wait for all thrades to finish
  271. for t in threads_list:
  272. t.join()
  273. # pprint.pprint(res_dic)
  274.  
  275. deal_exception_list()
  276. for i in exception_line_num_list:
  277. print('EXCEPTION', i)
  278. FGEN = '电影院任务列表_20170724_新增列_已计算楼宇.csv'
  279. fo = open(FGEN, 'w', encoding='utf-8-sig')
  280. fo.write('序号,地理区域,省份,城市,区域,地址,所属楼宇,经纬度,商圈属性,店铺名,所属院线,is_from_past_line,gd_name,gd_city\n')
  281. fo.closed
  282. fo = open(FGEN, 'a', encoding='utf-8-sig')
  283. for file_line_num in tosupply_dic:
  284. if file_line_num == 1:
  285. continue
  286. dic_ = tosupply_dic[file_line_num]
  287. str = '%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s\n' % (
  288. dic_['sequence_number'], dic_['area'], dic_['province'], dic_['city'], dic_['district'],
  289. dic_['address'].replace(',', ' '),
  290. gen_building(dic_['address']), dic_['longitude_latitude'].replace(',', ' '), dic_['busniess_type'],
  291. dic_['name'],
  292. dic_['parent_company'],
  293. dic_['is_from_past_line'], dic_['gd_name'], dic_['gd_city'])
  294. fo.write(str)
  295. fo.closed
  296.  
  297. if __name__ == '__main__':
  298. main()
  1. # -*- coding: UTF-8 -*-
  2. import math
  3. import random
  4. import sys
  5. import threading
  6. from time import ctime, sleep
  7.  
  8. import requests
  9.  
  10. # fo = open('电影院任务列表_20170724_新增列 - 副本 (12).csv', 'r', encoding='utf-8')
  11. # file_line_num = 0
  12. # for i in fo:
  13. # if file_line_num == 0:
  14. # continue
  15. # l = i.replace('\n','').split(',')
  16. #
  17. # ddd = 5
  18. # MAX_OFFSET = 25
  19. # OFFSET = MAX_OFFSET - 1
  20. MAX_PAGINATION = 100
  21. pagination = MAX_PAGINATION
  22. QPS = 50
  23. QPS_TIME_UNIT = 1
  24. # http://lbs.amap.com/api/webservice/guide/tools/info
  25. INFOCODE_OK = '10000'
  26. file_name_key_pool = 'key_pool.pool'
  27. KEY_POOL_LIST = []
  28. touse_key = ''
  29. f = open(file_name_key_pool, 'r', encoding='utf-8')
  30. for i in f:
  31. try:
  32. list_ = i.split('\t')
  33. key = i.split('\t')[1].split()
  34. KEY_POOL_LIST.append(key[0])
  35. except Exception:
  36. print(Exception)
  37. KEY_POOL_NUM_INDICATOR = len(KEY_POOL_LIST)
  38.  
  39. # http://restapi.amap.com/v3/place/text?key=&keywords=奥斯卡漯河店&types=080601&city=漯河&offset=1
  40. # http://restapi.amap.com/v3/place/text?key=&keywords=奥斯卡漯河店&types=080601&city=漯河&offset=2
  41. # 2得到的结果更接近期望值
  42. URL_TYPE = 'http://restapi.amap.com/v3/place/text'
  43. touse_key = ''
  44. keywords = '&keywords='
  45. OFFSET = '&offset=2'
  46. ## 己方数据 city-name 不匹配 citylimit true-->false
  47. # 华中区 湖北 钟祥 电影院 横店荆门店 横店影视股份有限公司 0 GD-NO-DATA
  48. CITYLIMIT = '&citylimit=false'
  49. # 080601 体育休闲服务 影剧院 电影院
  50. POI_TYPES = '&types=080601'
  51. URL_FOR_CHANGE_KEY = 'http://restapi.amap.com/v3/place/text?key=%s&types=060100&city=010&OFFSET=1'
  52.  
  53. change_key_qps = 0
  54.  
  55. def change_key():
  56. global touse_key, change_key_qps
  57. # 高德没有遵守自己的QPS/日限策略;所不能通过其返回码,来控制key的使用;
  58. mean_use_key = random.randint(0, KEY_POOL_NUM_INDICATOR)
  59. for i in range(mean_use_key, KEY_POOL_NUM_INDICATOR, 1):
  60. key = KEY_POOL_LIST[i]
  61. if key == touse_key:
  62. if i == KEY_POOL_NUM_INDICATOR:
  63. change_key()
  64. return
  65. else:
  66. continue
  67. touse_key = key
  68. url = URL_FOR_CHANGE_KEY % (touse_key)
  69. try:
  70. change_key_qps += 1
  71. if change_key_qps % QPS == 0:
  72. sleep(QPS_TIME_UNIT)
  73. r = requests.get(url)
  74. json_ = r.json()
  75. except Exception:
  76. print('requests.get(url)', Exception)
  77. change_key()
  78. return
  79. infocode = json_['infocode']
  80. if not infocode == INFOCODE_OK:
  81. if i == KEY_POOL_NUM_INDICATOR:
  82. sys.exit('NOInvalidKEY')
  83. change_key()
  84. return
  85. return
  86.  
  87. FNAME = '电影院任务列表_20170724.csv'
  88. tosupply_dic = {}
  89. fo = open(FNAME, 'r', encoding='gbk')
  90. file_line_num = 0
  91. for i in fo:
  92. file_line_num += 1
  93. if file_line_num == 1:
  94. continue
  95. tosupply_dic[file_line_num] = {}
  96. is_from_past_line = 1 if len(i.split(',,,,,,,,')) > 1 else 0
  97. l = i.replace('\n', '').replace(',,,,,,,', '').split(',')
  98. dic_ = {}
  99. dic_['sequence_number'] = l[0]
  100. dic_['area'] = l[1] if is_from_past_line == 0 else tosupply_dic[file_line_num - 1]['area']
  101. dic_['province'] = l[2] if is_from_past_line == 0 else tosupply_dic[file_line_num - 1][
  102. 'province']
  103. dic_['city'] = l[3] if is_from_past_line == 0 else tosupply_dic[file_line_num - 1]['city']
  104. dic_['district'] = ''
  105. dic_['address'] = ''
  106. dic_['buliding'] = ''
  107. dic_['longitude_latitude'] = ''
  108. dic_['busniess_type'] = l[8] if is_from_past_line == 0 else l[1]
  109. dic_['name'] = l[9] if is_from_past_line == 0 else l[2]
  110. dic_['parent_company'] = l[10] if is_from_past_line == 0 else tosupply_dic[file_line_num - 1]['parent_company']
  111. dic_['is_from_past_line'] = is_from_past_line
  112. dic_['gd_name'] = ''
  113. dic_['gd_city'] = ''
  114. tosupply_dic[file_line_num] = dic_
  115.  
  116. EACH_THREAD_REQUEST_NUM = 30
  117. exception_line_num_list = []
  118.  
  119. # keywords = '&keywords='
  120. # OFFSET = '&offset=1'
  121. # CITYLIMIT = '&citylimit=true'
  122. # # 080601 体育休闲服务 影剧院 电影院
  123. # POI_TYPES = '&types=080601'
  124. #
  125. # http://restapi.amap.com/v3/place/text?key=&keywords=重庆万盛DAV国际影城&city=重庆&types=080601&offset=1&citylimit=true
  126. requests_counter = 0
  127. tosupply_dic_len = len(tosupply_dic)
  128. tosupply_dic_len_ = tosupply_dic_len - 1
  129.  
  130. # thread_strat_file_line_num
  131. def supply_dic(thread_strat_file_line_num):
  132. global tosupply_dic, requests_counter, tosupply_dic_len_, exception_line_num_list
  133. for loop in range(0, EACH_THREAD_REQUEST_NUM, 1):
  134. file_line_num = thread_strat_file_line_num + loop
  135. if file_line_num - 2 > tosupply_dic_len_:
  136. return
  137. if file_line_num < 2 or len(tosupply_dic[file_line_num]['district']) > 0:
  138. if file_line_num in exception_line_num_list:
  139. list_index = exception_line_num_list.index(file_line_num)
  140. del exception_line_num_list[list_index]
  141. continue
  142. dic_ = tosupply_dic[file_line_num]
  143. city = dic_['city']
  144. name = dic_['name']
  145. url = '%s?key=%s&keywords=%s&city=%s%s%s%s' % (URL_TYPE, touse_key, name, city, POI_TYPES, OFFSET, CITYLIMIT)
  146. if requests_counter % QPS == 0:
  147. sleep(QPS_TIME_UNIT)
  148. try:
  149. r = requests.get(url)
  150. r_json = r.json()
  151. except Exception:
  152. if file_line_num not in exception_line_num_list:
  153. exception_line_num_list.append(file_line_num)
  154. continue
  155. infocode = r_json['infocode']
  156. if infocode == '10000':
  157. count = r_json['count']
  158. if int(count) > 0:
  159. pois_list = r_json['pois']
  160. pos_dic = pois_list[0]
  161. tosupply_dic[file_line_num]['district'] = pos_dic['adname']
  162.  
  163. if len(pos_dic['address']) <= 2:
  164. print(pos_dic)
  165. print(pos_dic['address'])
  166. tosupply_dic[file_line_num]['address'] = pos_dic['address'] if len(pos_dic['address']) > 2 else '高德缺地址'
  167. if len(pos_dic['address']) <= 2:
  168. print(tosupply_dic[file_line_num]['address'])
  169. tosupply_dic[file_line_num]['longitude_latitude'] = pos_dic['location']
  170. tosupply_dic[file_line_num]['gd_name'] = pos_dic['name']
  171. tosupply_dic[file_line_num]['gd_city'] = pos_dic['cityname']
  172.  
  173. elif int(count) == 0:
  174. tosupply_dic[file_line_num]['gd_name'] = 'GD-NO-DATA'
  175. if file_line_num in exception_line_num_list:
  176. list_index = exception_line_num_list.index(file_line_num)
  177. del exception_line_num_list[list_index]
  178. else:
  179. if file_line_num not in exception_line_num_list:
  180. exception_line_num_list.append(file_line_num)
  181. change_key()
  182.  
  183. MAX_EXCEPTION_URL_NUM = 0
  184.  
  185. def deal_exception_list():
  186. global exception_line_num_list
  187. print(exception_line_num_list)
  188. if len(exception_line_num_list) > MAX_EXCEPTION_URL_NUM:
  189. for thread_strat_file_line_num in exception_line_num_list:
  190. supply_dic(thread_strat_file_line_num)
  191. else:
  192. return
  193. deal_exception_list()
  194.  
  195. class MyThread(threading.Thread):
  196. def __init__(self, func, args, name=''):
  197. threading.Thread.__init__(self)
  198. self.name = name
  199. self.func = func
  200. self.args = args
  201.  
  202. def run(self):
  203. self.func(self.args)
  204.  
  205. # 通灌北路58号苏宁广场7层701
  206. # 林源路创基尚城B区3层
  207. # 民治书香门第上河坊2栋2层
  208. # 东欣大道东欣广场城市综合体E01四楼
  209. # 横栏镇茂辉工业区乐丰四路21号永耀商业广场B幢之四(永耀人才市场旁)
  210. # 信江路西侧金峰城市广场1栋5层502(鹏泰购物广场5楼、迪欧咖啡楼上)
  211. # 上陡门学院中路人才大厦一层(东瓯影城)
  212. # 丰庆路710号(世纪联华超市4楼)
  213. # 风度中路13号百老汇商业城5层(美特斯邦威楼上)
  214. # 龙阳路2000号(龙阳广场5层)
  215. # 容城大道东12号(容城天骄写字楼3层)
  216. # 解放大道387号(汉口宗关水厂)南国西汇城市广场二期5层
  217. # 后沙峪镇安泰大街9号院(中粮祥云小镇)7号楼2层
  218. # 华强新天地3楼华时代美食城门口(横店电影院门口)
  219. # 南三环西路16号1号楼首地大峡谷购物中心5层
  220. # 永兴路7号院1号楼龙湖北京大兴天街购物中心L3层Z2
  221. def gen_building(str):
  222. start_ = 0
  223. end_ = len(str)
  224. res = ''
  225. start_flag = 0
  226. end_flag = 0
  227. if str.find('号') > -1:
  228. start_ = str.find('号') + 1
  229. start_flag = 1
  230. elif str.find('交汇处') > -1:
  231. start_ = str.find('交汇处') + 1 + 2
  232. start_flag = 1
  233. elif str.find('交叉口') > -1:
  234. start_ = str.find('交叉口') + 1 + 2
  235. start_flag = 1
  236. elif str.find('路') > -1:
  237. start_ = str.find('路') + 1
  238. start_flag = 1
  239. elif str.find('道') > -1:
  240. start_ = str.find('道') + 1
  241. start_flag = 1
  242.  
  243. if str.find('层') > -1:
  244. end_ = str.find('层') + 1
  245. end_flag = 1
  246. elif str.find('楼') > -1:
  247. end_ = str.find('楼') + 1
  248. end_flag = 1
  249. if start_flag == 1 or end_flag == 1:
  250. res = ''.join((list(str)[start_:end_]))
  251. if res.find('('):
  252. # new rule
  253. res = res.replace('(', '').replace(')', '')
  254. return res
  255.  
  256. def main():
  257. print('starting at:', ctime())
  258. threads_list = []
  259. thread_sum = math.ceil(tosupply_dic_len / EACH_THREAD_REQUEST_NUM)
  260. print(185, thread_sum)
  261. for nloop in range(0, thread_sum, 1):
  262. thread_strat_file_line_num = nloop * EACH_THREAD_REQUEST_NUM
  263. print(thread_strat_file_line_num)
  264. thread_instance = MyThread(supply_dic, (thread_strat_file_line_num), supply_dic.__name__)
  265. threads_list.append(thread_instance)
  266. # 主进程将在所有非守护进程退出后,退出
  267. for t in threads_list:
  268. t.setDaemon = False
  269. t.start()
  270. # wait for all thrades to finish
  271. for t in threads_list:
  272. t.join()
  273. # pprint.pprint(res_dic)
  274.  
  275. deal_exception_list()
  276. for i in exception_line_num_list:
  277. print('EXCEPTION', i)
  278. FGEN = '电影院任务列表_20170724_新增列_已计算楼宇.csv'
  279. fo = open(FGEN, 'w', encoding='utf-8-sig')
  280. fo.write('序号,地理区域,省份,城市,区域,地址,所属楼宇,经纬度,商圈属性,店铺名,所属院线,is_from_past_line,gd_name,gd_city\n')
  281. fo.closed
  282. fo = open(FGEN, 'a', encoding='utf-8-sig')
  283. for file_line_num in tosupply_dic:
  284. if file_line_num == 1:
  285. continue
  286. dic_ = tosupply_dic[file_line_num]
  287. str = '%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s\n' % (
  288. dic_['sequence_number'], dic_['area'], dic_['province'], dic_['city'], dic_['district'],
  289. dic_['address'].replace(',', ' '),
  290. gen_building(dic_['address']), dic_['longitude_latitude'].replace(',', ' '), dic_['busniess_type'],
  291. dic_['name'],
  292. dic_['parent_company'],
  293. dic_['is_from_past_line'], dic_['gd_name'], dic_['gd_city'])
  294. fo.write(str)
  295. fo.closed
  296.  
  297. if __name__ == '__main__':
  298. main()

  

  1. 高德缺地址
  2. {'id': 'B0FFGJYCPN', 'name': '横店电影城(原阳店)', 'type': '体育休闲服务;影剧院;电影院', 'typecode': '', 'biz_type': 'cinema', 'address': [], 'location': '113.960307,35.065736', 'tel': '0373-5911199', 'distance': [], 'biz_ext': [], 'pname': '河南省', 'cityname': '新乡市', 'adname': '原阳县', 'importance': [], 'shopid': [], 'shopinfo': '', 'poiweight': []}
  3. []
  4. 高德缺地址
  5. {'id': 'B0FFH0368R', 'name': '横店电影城', 'type': '体育休闲服务;影剧院;电影院', 'typecode': '', 'biz_type': 'cinema', 'address': [], 'location': '118.430053,29.862110', 'tel': [], 'distance': [], 'biz_ext': [], 'pname': '安徽省', 'cityname': '黄山市', 'adname': '歙县', 'importance': [], 'shopid': [], 'shopinfo': '', 'poiweight': []}
  6. []
  7. 高德缺地址
  8. [300, 30, 2, 390, 330, 480, 570, 900, 840, 450, 1140, 630, 990, 180, 1050, 90, 240, 360, 720, 750, 690, 1170, 60, 1230, 960, 210, 1200, 930, 510, 150, 600, 870, 1080, 810, 660, 540, 270, 420, 1110, 120, 780, 1020, 1113, 813, 544, 69, 94, 119]
  9. [390, 330, 570, 840, 1140, 990, 1050, 240, 720, 690, 60, 960, 1200, 510, 600, 1080, 660, 270, 1110, 780, 1113]
  10. [330, 840, 990, 240, 690, 960, 510, 1080, 270, 780]
  11. [840, 240, 960, 1080, 780]
  12. [240, 1080]
  13. [1080]
  14. []
  1. # -*- coding: UTF-8 -*-
  2. import re
  3. import pprint
  4. import json
  5. import time
  6. import math
  7. import sys
  8. import requests
  9. import threading
  10. from time import ctime, sleep
  11. import random
  12.  
  13. # fo = open('电影院任务列表_20170724_新增列 - 副本 (12).csv', 'r', encoding='utf-8')
  14. # file_line_num = 0
  15. # for i in fo:
  16. # if file_line_num == 0:
  17. # continue
  18. # l = i.replace('\n','').split(',')
  19. #
  20. # ddd = 5
  21. # MAX_OFFSET = 25
  22. # OFFSET = MAX_OFFSET - 1
# Pagination / rate-limit configuration.
MAX_PAGINATION = 100
pagination = MAX_PAGINATION
QPS = 50            # requests allowed before each sleep window
QPS_TIME_UNIT = 1   # seconds slept once per QPS requests
# http://lbs.amap.com/api/webservice/guide/tools/info
# NOTE(review): AMap's documented success infocode is '10000'; an empty
# string here means a success comparison can never match — confirm.
INFOCODE_OK = ''
file_name_key_pool = 'key_pool.pool'
KEY_POOL_LIST = []
touse_key = ''
# The key pool file is TAB-separated; the API key is the second field.
f = open(file_name_key_pool, 'r', encoding='utf-8')
for i in f:
    try:
        list_ = i.split('\t')
        key = i.split('\t')[1].split()
        KEY_POOL_LIST.append(key[0])
    except Exception:
        print(Exception)
KEY_POOL_NUM_INDICATOR = len(KEY_POOL_LIST)

# http://restapi.amap.com/v3/place/text?key=&keywords=奥斯卡漯河店&types=080601&city=漯河&offset=1
# http://restapi.amap.com/v3/place/text?key=&keywords=奥斯卡漯河店&types=080601&city=漯河&offset=2
# offset=2 gives results closer to the expected ones (original note).
URL_TYPE = 'http://restapi.amap.com/v3/place/text'
touse_key = ''
keywords = '&keywords='
OFFSET = '&offset=2'
## Our data's city/name pairs do not always match; citylimit was switched
## from true to false, e.g.:
# 华中区 湖北 钟祥 电影院 横店荆门店 横店影视股份有限公司 0 GD-NO-DATA
CITYLIMIT = '&citylimit=false'
# 080601 = sports & leisure; theatre; cinema (AMap POI type code)
POI_TYPES = '&types=080601'
URL_FOR_CHANGE_KEY = 'http://restapi.amap.com/v3/place/text?key=%s&types=060100&city=010&OFFSET=1'

change_key_qps = 0
  57.  
  58. def change_key():
  59. global touse_key, change_key_qps
  60. # 高德没有遵守自己的QPS/日限策略;所不能通过其返回码,来控制key的使用;
  61. mean_use_key = random.randint(0, KEY_POOL_NUM_INDICATOR)
  62. for i in range(mean_use_key, KEY_POOL_NUM_INDICATOR, 1):
  63. key = KEY_POOL_LIST[i]
  64. if key == touse_key:
  65. if i == KEY_POOL_NUM_INDICATOR:
  66. change_key()
  67. return
  68. else:
  69. continue
  70. touse_key = key
  71. url = URL_FOR_CHANGE_KEY % (touse_key)
  72. try:
  73. change_key_qps += 1
  74. if change_key_qps % QPS == 0:
  75. sleep(QPS_TIME_UNIT)
  76. r = requests.get(url)
  77. json_ = r.json()
  78. except Exception:
  79. print('requests.get(url)', Exception)
  80. change_key()
  81. return
  82. infocode = json_['infocode']
  83. if not infocode == INFOCODE_OK:
  84. if i == KEY_POOL_NUM_INDICATOR:
  85. sys.exit('NOInvalidKEY')
  86. change_key()
  87. return
  88. return
  89.  
FNAME = '电影院任务列表_20170724.csv'
# {file_line_num: {column_name: value}} — task rows keyed by 1-based
# physical line number; line 1 is the header and is skipped.
tosupply_dic = {}
fo = open(FNAME, 'r', encoding='gbk')
file_line_num = 0
for i in fo:
    file_line_num += 1
    if file_line_num == 1:
        continue
    tosupply_dic[file_line_num] = {}
    # Continuation rows contain a long run of commas; they inherit
    # area/province/city/parent_company from the previous row.
    is_from_past_line = 1 if len(i.split(',,,,,,,,')) > 1 else 0
    l = i.replace('\n', '').replace(',,,,,,,', '').split(',')
    dic_ = {}
    dic_['sequence_number'] = l[0]
    dic_['area'] = l[1] if is_from_past_line == 0 else tosupply_dic[file_line_num - 1]['area']
    dic_['province'] = l[2] if is_from_past_line == 0 else tosupply_dic[file_line_num - 1][
        'province']
    dic_['city'] = l[3] if is_from_past_line == 0 else tosupply_dic[file_line_num - 1]['city']
    # The next four fields are filled in later by supply_dic().
    dic_['district'] = ''
    dic_['address'] = ''
    dic_['buliding'] = ''  # sic — misspelled key kept for compatibility
    dic_['longitude_latitude'] = ''
    dic_['busniess_type'] = l[8] if is_from_past_line == 0 else l[1]  # sic
    dic_['name'] = l[9] if is_from_past_line == 0 else l[2]
    dic_['parent_company'] = l[10] if is_from_past_line == 0 else tosupply_dic[file_line_num - 1]['parent_company']
    dic_['is_from_past_line'] = is_from_past_line
    dic_['gd_name'] = ''
    dic_['gd_city'] = ''
    tosupply_dic[file_line_num] = dic_

# Number of task rows each worker thread processes.
EACH_THREAD_REQUEST_NUM = 30
exception_line_num_list = []

# Historical alternative query parameters, kept for reference:
# keywords = '&keywords='
# OFFSET = '&offset=1'
# CITYLIMIT = '&citylimit=true'
# # 080601 体育休闲服务 影剧院 电影院
# POI_TYPES = '&types=080601'
#
# http://restapi.amap.com/v3/place/text?key=&keywords=重庆万盛DAV国际影城&city=重庆&types=080601&offset=1&citylimit=true
requests_counter = 0
tosupply_dic_len = len(tosupply_dic)
tosupply_dic_len_ = tosupply_dic_len - 1
  132.  
  133. # thread_strat_file_line_num
def supply_dic(thread_strat_file_line_num):
    """Worker: enrich up to EACH_THREAD_REQUEST_NUM rows of tosupply_dic.

    Starting at ``thread_strat_file_line_num``, queries the AMap place
    API with each row's cinema name/city and fills in district, address,
    location, gd_name and gd_city.  Line numbers that fail are queued in
    ``exception_line_num_list`` for a later retry pass.
    """
    global tosupply_dic, requests_counter, tosupply_dic_len_, exception_line_num_list
    for loop in range(0, EACH_THREAD_REQUEST_NUM, 1):
        file_line_num = thread_strat_file_line_num + loop
        if file_line_num - 2 > tosupply_dic_len_:
            return  # past the last data row
        # Skip the header and rows already enriched; also clear them
        # from the retry queue.
        if file_line_num < 2 or len(tosupply_dic[file_line_num]['district']) > 0:
            if file_line_num in exception_line_num_list:
                list_index = exception_line_num_list.index(file_line_num)
                del exception_line_num_list[list_index]
            continue
        dic_ = tosupply_dic[file_line_num]
        city = dic_['city']
        name = dic_['name']
        url = '%s?key=%s&keywords=%s&city=%s%s%s%s' % (URL_TYPE, touse_key, name, city, POI_TYPES, OFFSET, CITYLIMIT)
        # NOTE(review): requests_counter is never incremented, so
        # ``0 % QPS == 0`` holds on every iteration and this sleeps
        # before each request — confirm whether that was intended.
        if requests_counter % QPS == 0:
            sleep(QPS_TIME_UNIT)
        try:
            r = requests.get(url)
            r_json = r.json()
        except Exception:
            if file_line_num not in exception_line_num_list:
                exception_line_num_list.append(file_line_num)
            continue
        infocode = r_json['infocode']
        # NOTE(review): AMap returns infocode '10000' on success, so this
        # empty-string comparison looks like it can never be true and all
        # rows would fall into the retry branch — confirm.
        if infocode == '':
            count = r_json['count']
            if int(count) > 0:
                pois_list = r_json['pois']
                pos_dic = pois_list[0]
                tosupply_dic[file_line_num]['district'] = pos_dic['adname']

                # AMap sometimes returns an empty list instead of an
                # address string; substitute a placeholder in that case.
                if len(pos_dic['address']) <= 2:
                    print(pos_dic)
                    print(pos_dic['address'])
                tosupply_dic[file_line_num]['address'] = pos_dic['address'] if len(pos_dic['address']) > 2 else '高德缺地址'
                if len(pos_dic['address']) <= 2:
                    print(tosupply_dic[file_line_num]['address'])
                tosupply_dic[file_line_num]['longitude_latitude'] = pos_dic['location']
                tosupply_dic[file_line_num]['gd_name'] = pos_dic['name']
                tosupply_dic[file_line_num]['gd_city'] = pos_dic['cityname']

            elif int(count) == 0:
                # No POI found: mark the row so it is not retried.
                tosupply_dic[file_line_num]['gd_name'] = 'GD-NO-DATA'
                if file_line_num in exception_line_num_list:
                    list_index = exception_line_num_list.index(file_line_num)
                    del exception_line_num_list[list_index]
        else:
            # Bad infocode: queue for retry and rotate the API key.
            if file_line_num not in exception_line_num_list:
                exception_line_num_list.append(file_line_num)
            change_key()
  185.  
  186. MAX_EXCEPTION_URL_NUM = 0
  187.  
  188. def deal_exception_list():
  189. global exception_line_num_list
  190. print(exception_line_num_list)
  191. if len(exception_line_num_list) > MAX_EXCEPTION_URL_NUM:
  192. for thread_strat_file_line_num in exception_line_num_list:
  193. supply_dic(thread_strat_file_line_num)
  194. else:
  195. return
  196. deal_exception_list()
  197.  
  198. class MyThread(threading.Thread):
  199. def __init__(self, func, args, name=''):
  200. threading.Thread.__init__(self)
  201. self.name = name
  202. self.func = func
  203. self.args = args
  204.  
  205. def run(self):
  206. self.func(self.args)
  207.  
  208. def gen_building(str):
  209. start_ = 0
  210. end_ = 0
  211. if str.find('号') > -1:
  212. start_ = str.find('号') + 1
  213. elif str.find('路') > -1:
  214. start_ = str.find('路') + 1
  215. elif str.find('道') > -1:
  216. start_ = str.find('道') + 1
  217.  
  218. if str.find('层') > -1:
  219. end_ = str.find('层') + 1
  220. elif str.find('楼') > -1:
  221. end_ = str.find('楼') + 1
  222.  
  223. if end_ - start_ > 3:
  224. return ''.join((list(str)[start_:end_]))
  225. else:
  226. return ''
  227.  
  228. def main():
  229. print('starting at:', ctime())
  230. threads_list = []
  231. thread_sum = math.ceil(tosupply_dic_len / EACH_THREAD_REQUEST_NUM)
  232. print(185, thread_sum)
  233. for nloop in range(0, thread_sum, 1):
  234. thread_strat_file_line_num = nloop * EACH_THREAD_REQUEST_NUM
  235. print(thread_strat_file_line_num)
  236. thread_instance = MyThread(supply_dic, (thread_strat_file_line_num), supply_dic.__name__)
  237. threads_list.append(thread_instance)
  238. # 主进程将在所有非守护进程退出后,退出
  239. for t in threads_list:
  240. t.setDaemon = False
  241. t.start()
  242. # wait for all thrades to finish
  243. for t in threads_list:
  244. t.join()
  245. # pprint.pprint(res_dic)
  246.  
  247. deal_exception_list()
  248. for i in exception_line_num_list:
  249. print('EXCEPTION', i)
  250. FGEN = '电影院任务列表_20170724_新增列_已计算楼宇.csv'
  251. fo = open(FGEN, 'w', encoding='utf-8-sig')
  252. fo.write('序号,地理区域,省份,城市,区域,地址,所属楼宇,经纬度,商圈属性,店铺名,所属院线,is_from_past_line,gd_name,gd_city\n')
  253. fo.closed
  254. fo = open(FGEN, 'a', encoding='utf-8-sig')
  255. for file_line_num in tosupply_dic:
  256. if file_line_num == 1:
  257. continue
  258. dic_ = tosupply_dic[file_line_num]
  259. str = '%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s\n' % (
  260. dic_['sequence_number'], dic_['area'], dic_['province'], dic_['city'], dic_['district'],
  261. dic_['address'].replace(',', ' '),
  262. gen_building(dic_['address']), dic_['longitude_latitude'].replace(',', ' '), dic_['busniess_type'],
  263. dic_['name'],
  264. dic_['parent_company'],
  265. dic_['is_from_past_line'], dic_['gd_name'], dic_['gd_city'])
  266. fo.write(str)
  267. fo.closed
  268.  
  269. if __name__ == '__main__':
  270. main()
  1. {'id': 'B02DD0R6M6', 'name': '横店电影城(大汉店)', 'type': '体育休闲服务;影剧院;电影院', 'typecode': '', 'biz_type': 'cinema', 'address': [], 'location': '113.149267,27.838133', 'tel': '0731-22915555', 'distance': [], 'biz_ext': [], 'pname': '湖南省', 'cityname': '株洲市', 'adname': '芦淞区', 'importance': [], 'shopid': [], 'shopinfo': '', 'poiweight': []}
  2. []
  3. 高德缺地址
  4. {'id': 'B0FFGJYCPN', 'name': '横店电影城(原阳店)', 'type': '体育休闲服务;影剧院;电影院', 'typecode': '', 'biz_type': 'cinema', 'address': [], 'location': '113.960307,35.065736', 'tel': '0373-5911199', 'distance': [], 'biz_ext': [], 'pname': '河南省', 'cityname': '新乡市', 'adname': '原阳县', 'importance': [], 'shopid': [], 'shopinfo': '', 'poiweight': []}
  5. []
  6. 高德缺地址
  7. {'id': 'B0FFH0368R', 'name': '横店电影城', 'type': '体育休闲服务;影剧院;电影院', 'typecode': '', 'biz_type': 'cinema', 'address': [], 'location': '118.430053,29.862110', 'tel': [], 'distance': [], 'biz_ext': [], 'pname': '安徽省', 'cityname': '黄山市', 'adname': '歙县', 'importance': [], 'shopid': [], 'shopinfo': '', 'poiweight': []}
  8. []
  9. 高德缺地址
  10. [, , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , ]
  11. [, , , , , , , , , , , , , , , , , , , , ]
  1. if url in exception_url_list:
  2. l_index = exception_url_list.index(url)
  3. print(139, 'del')
  4. del exception_url_list[l_index]
  5.  
  6. class MyThread(threading.Thread):
  7. def __init__(self, func, args, name=''):
  8. threading.Thread.__init__(self)
  9. self.name = name
  10. self.func = func
  11. self.args = args
  12.  
  13. def run(self):
  14. self.func(self.args)
  15.  
  16. MAX_EXCEPTION_URL_NUM = 60
  17.  
  18. def deal_exception_url_list():
  19. global exception_url_list
  20. if len(exception_url_list) > MAX_EXCEPTION_URL_NUM:
  21. for url in exception_url_list:
  22. grab_one_url(url)
  23. else:
  24. return
  25. deal_exception_url_list()
  1. # -*- coding: UTF-8 -*-
  2.  
  3. import math
  4. import random
  5. import re
  6. import sys
  7. import threading
  8. from time import ctime, sleep
  9. from lxml import etree
  10. import pprint
  11. import requests
  12.  
BASE_URL = 'https://www.dianping.com/'
url_district_list = []
GLUE = 'search'
# Pre-crawled district/business-area URL fragments, joined on one line
# by the literal separator 'SCALA'.
f = open('spider_深圳_district_bussi-nav_url_list - 副本.txt', 'r', encoding='utf-8')
f.closed  # NOTE(review): attribute read only — the file is never closed
SCALA = 'SCALA'
for i in f:
    URL_POOL = i.split(SCALA)
    break  # only the first line carries the pool

URL_POOL_LEN = len(URL_POOL)
URL_NUM_EACH_THREAD = 1
# {url: {page metadata + 'items_list' of restaurant dicts}}
res_dic = {}

MAX_PAGE_NUM = 50
QPS = 30             # requests allowed before each sleep window
request_counter = 0
QPS_TIME_UNIT = 1    # seconds slept once per QPS requests
  32.  
  33. # url = '%s%s%sp%s' % (BASE_URL, GLUE, URL_POOL[1], 3)
  34. # print(url)
  35. # r = requests.get(url)
  36. # html = r.text
  37. # selector = etree.HTML(html)
  38. # page_title = selector.xpath('//title/text()')
  39. # page_Keywords = selector.xpath('//meta[@name="Keywords"]')[0].attrib['content']
  40. # data_ga_index_1 = selector.xpath('.//a[@data-ga-index="1"]/span/text()')[0]
  41. # data_ga_index_2 = selector.xpath('.//a[@data-ga-index="2"]/span/text()')[0]
  42. # data_ga_index_3 = selector.xpath('.//a[@data-ga-index="3"]/span/text()')[0]
  43. # res_dic[url] = {}
  44. # res_dic[url]['page_title'] = page_title
  45. # res_dic[url]['page_Keywords'] = page_Keywords
  46. # res_dic[url]['data_ga_index_1'] = data_ga_index_1
  47. # res_dic[url]['data_ga_index_2'] = data_ga_index_2
  48. # res_dic[url]['data_ga_index_3'] = data_ga_index_3
  49. #
  50. # name_l = selector.xpath('.//li[@class=""]//h4/text()')
  51. #
  52. # mean_price_l = selector.xpath('.//li[@class=""]//a[@class="mean-price"]/b/text()')
  53. #
  54. # flavour_l = selector.xpath('.//li[@class=""]//div[@class="tag-addr"]/a[1]/span/text()')
  55. #
  56. # position_l = selector.xpath('.//li[@class=""]//div[@class="tag-addr"]/a[2]/span/text()')
  57. #
  58. # address_l = selector.xpath('.//li[@class=""]//div[@class="tag-addr"]/span/text()')
  59. #
  60. # len_l = len(name_l)
  61. # res_dic[url]['items_num'] = len_l
  62. # res_dic[url]['items_list'] = []
  63. # len_l_ = len_l - 1
  64. # for i in range(0, len_l_, 1):
  65. # d = {}
  66. # d['name'] = name_l[i]
  67. # d['mean_price'] = mean_price_l[i]
  68. # d['flavour'] = flavour_l[i]
  69. # d['position'] = position_l[i]
  70. # d['address'] = address_l[i]
  71. # res_dic[url]['items_list'].append(d)
  72. #
  73. # d = 4
  74.  
exception_url_list = []

def grab_one_url(url):
    """Crawl one dianping search-result page into ``res_dic[url]``.

    Extracts the page title, meta keywords, three breadcrumb labels and,
    for every restaurant <li>, its name, mean price, cuisine, area and
    address.  On request or parse failure the URL is queued in
    ``exception_url_list`` for a later retry.
    """
    global res_dic, request_counter, exception_url_list
    # Crude rate limiting: sleep once every QPS requests.
    if request_counter % QPS == 0:
        print(36, 'sleep', request_counter)
        sleep(QPS_TIME_UNIT)

    request_counter += 1
    try:
        r = requests.get(url)
    except Exception:
        if url not in exception_url_list:
            exception_url_list.append(url)
        print(exception_url_list)
        return
    html = r.text
    selector = etree.HTML(html)
    page_title = selector.xpath('//title/text()')[0]

    try:
        # Commas replaced so the value stays in a single CSV column.
        page_Keywords = selector.xpath('//meta[@name="Keywords"]')[0].attrib['content'].replace(',', '、')
    except Exception:
        if url not in exception_url_list:
            exception_url_list.append(url)
        print(exception_url_list)
        return

    # Breadcrumb navigation labels.
    data_ga_index_1 = selector.xpath('.//a[@data-ga-index="1"]/span/text()')[0]
    data_ga_index_2 = selector.xpath('.//a[@data-ga-index="2"]/span/text()')[0]
    data_ga_index_3 = selector.xpath('.//a[@data-ga-index="3"]/span/text()')[0]
    res_dic[url] = {}
    res_dic[url]['page_title'] = page_title
    res_dic[url]['page_Keywords'] = page_Keywords
    res_dic[url]['data_ga_index_1'] = data_ga_index_1
    res_dic[url]['data_ga_index_2'] = data_ga_index_2
    res_dic[url]['data_ga_index_3'] = data_ga_index_3

    # Parallel lists, one entry per restaurant <li>.
    name_l = selector.xpath('.//li[@class=""]//h4/text()')

    mean_price_l = selector.xpath('.//li[@class=""]//a[@class="mean-price"]/b/text()')

    flavour_l = selector.xpath('.//li[@class=""]//div[@class="tag-addr"]/a[1]/span/text()')

    position_l = selector.xpath('.//li[@class=""]//div[@class="tag-addr"]/a[2]/span/text()')

    address_l = selector.xpath('.//li[@class=""]//div[@class="tag-addr"]/span/text()')

    len_l = len(name_l)
    res_dic[url]['items_num'] = len_l
    res_dic[url]['items_list'] = []
    # NOTE(review): the loop stops at len_l - 1, so the last item on each
    # page is never appended even though items_num counts it — looks like
    # an off-by-one; confirm before relying on the counts.
    len_l_ = len_l - 1
    for i in range(0, len_l_, 1):
        d = {}
        d['name'] = name_l[i].replace(',', '、')
        # Some listings lack a mean price; pad with '' to keep alignment.
        d['mean_price'] = mean_price_l[i] if len(mean_price_l) - 1 >= i else ''
        d['flavour'] = flavour_l[i]
        d['position'] = position_l[i]
        # https://www.dianping.com/search/category/7/10/r12335p1
        d['address'] = address_l[i].replace(',', '、')
        res_dic[url]['items_list'].append(d)

    # Successful crawl: remove the URL from the retry queue.
    if url in exception_url_list:
        l_index = exception_url_list.index(url)
        print(139, 'del')
        del exception_url_list[l_index]
  141.  
  142. class MyThread(threading.Thread):
  143. def __init__(self, func, args, name=''):
  144. threading.Thread.__init__(self)
  145. self.name = name
  146. self.func = func
  147. self.args = args
  148.  
  149. def run(self):
  150. self.func(self.args)
  151.  
  152. MAX_EXCEPTION_URL_NUM = 60
  153.  
  154. def deal_exception_url_list():
  155. global exception_url_list
  156. if len(exception_url_list) > MAX_EXCEPTION_URL_NUM:
  157. for url in exception_url_list:
  158. grab_one_url(url)
  159. else:
  160. return
  161. deal_exception_url_list()
  162.  
  163. def main():
  164. print('starting at:', ctime())
  165. threads_list = []
  166. thread_sum = math.ceil(URL_POOL_LEN / URL_NUM_EACH_THREAD)
  167. for nloop in range(0, URL_POOL_LEN, 1):
  168. for nnloop in range(1, MAX_PAGE_NUM, 1):
  169. url = URL_POOL[nloop]
  170. url = '%s%s%sp%s' % (BASE_URL, GLUE, url, nnloop)
  171. print(62, url)
  172. thread_instance = MyThread(grab_one_url, (url), grab_one_url.__name__)
  173. threads_list.append(thread_instance)
  174. # 主进程将在所有非守护进程退出后,退出
  175. for t in threads_list:
  176. print(70, t)
  177. t.setDaemon = False
  178. t.start()
  179. # wait for all thrades to finish
  180. for t in threads_list:
  181. t.join()
  182. # pprint.pprint(res_dic)
  183.  
  184. deal_exception_url_list()
  185.  
  186. f_name = 'dzdp_基于区-大商圈的餐馆列表-深圳.csv'
  187. f = open(f_name, 'w', encoding='utf-8-sig')
  188. f.write('')
  189. f.closed
  190. f = open(f_name, 'a', encoding='utf-8-sig')
  191. str = 'name,mean_price, flavour, position,address,url,page_title, page_Keywords, data_ga_index_1, data_ga_index_2, data_ga_index_3,\n'
  192. f.write(str)
  193. for url in res_dic:
  194. page_title = res_dic[url]['page_title']
  195. page_Keywords = res_dic[url]['page_Keywords']
  196. data_ga_index_1 = res_dic[url]['data_ga_index_1']
  197. data_ga_index_2 = res_dic[url]['data_ga_index_2']
  198. data_ga_index_3 = res_dic[url]['data_ga_index_3']
  199. for d in res_dic[url]['items_list']:
  200. name = d['name']
  201. mean_price = d['mean_price']
  202. flavour = d['flavour']
  203. position = d['position']
  204. address = d['address']
  205. str = '%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s\n' % (
  206. name, mean_price, flavour, position, address, url, page_title, page_Keywords, data_ga_index_1,
  207. data_ga_index_2, data_ga_index_3)
  208. f.write(str)
  209. f.closed
  210.  
  211. f_name = 'dzdp_基于区-大商圈的餐馆列表-深圳_EXCEPTION_URL.csv'
  212. f = open(f_name, 'w', encoding='utf-8-sig')
  213. f.write('')
  214. f.closed
  215. f = open(f_name, 'a', encoding='utf-8-sig')
  216. for url in exception_url_list:
  217. f.write(url + '\n')
  218. f.closed
  219.  
  220. if __name__ == '__main__':
  221. main()
  222. #
  223. #
  224. # d = 3
  225. # url = ('%s%s%sp%s') % (BASE_URL, GLUE, '/category/7/10/r1949', 100)
  226. # print(url)
  227. #
  228. #
  229.  
  230. #
  231. #
  232. # class MyThread(threading.Thread):
  233. # def __init__(self, func, args, name=''):
  234. # threading.Thread.__init__(self)
  235. # self.name = name
  236. # self.func = func
  237. # self.args = args
  238. #
  239. # def run(self):
  240. # self.func(self.args)
  241. #
  242. #
  243. # def main():
  244. # print('starting at:', ctime())
  245. # threads_list = []
  246. # thread_sum = math.ceil(file_line_list_len / file_jump_step_num)
  247. # print(thread_sum)
  248. # for nloop in range(1, thread_sum, 1):
  249. # print(nloop)
  250. # thread_instance = MyThread(get_exception_logic_split_loop, (nloop), get_exception_logic_split_loop.__name__)
  251. #
  252. # threads_list.append(thread_instance)
  253. # # 主进程将在所有非守护进程退出后,退出
  254. # for t in threads_list:
  255. # print(t)
  256. # t.setDaemon = False
  257. # t.start()
  258. # # wait for all thrades to finish
  259. # for t in threads_list:
  260. # t.join()
  261. # f_name = 'ALL.csv'
  262. # f = open(f_name, 'w', encoding='utf-8-sig')
  263. # f.write('')
  264. # f.closed
  265. # f = open(f_name, 'a', encoding='utf-8-sig')
  266. # str = 'city, district, name, address, if_in_business_area, if_in_business_area_criterion,catering_kind, average_price, data_from\n'
  267. # f.write(str)
  268. # ## city,district,address,name,catering_kind,average_price,data_from
  269. # count_write_rows = 0
  270. # for i in cater_dic:
  271. # city = i
  272. # if city == '城市':
  273. # continue
  274. # for ii in cater_dic[i]:
  275. # district = ii
  276. # for iii in cater_dic[i][ii]:
  277. # name = iii
  278. # for iv in cater_dic[i][ii][iii]:
  279. # address = iv
  280. # catering_kind = cater_dic[i][ii][iii][iv]['catering_kind']
  281. # average_price = cater_dic[i][ii][iii][iv]['average_price']
  282. # if_in_business_area = cater_dic[i][ii][iii][iv]['if_in_business_area']
  283. # if_in_business_area_criterion = cater_dic[i][ii][iii][iv]['if_in_business_area_criterion']
  284. # data_from = cater_dic[i][ii][iii][iv]['data_from']
  285. # str = '%s,%s,%s,%s,%s,%s,%s,%s,%s\n' % (
  286. # city, district, name, address, if_in_business_area, if_in_business_area_criterion,
  287. # catering_kind, average_price, data_from)
  288. # f.write(str)
  289. # count_write_rows += 1
  290. # f.closed
  291. # print(count_write_rows)
  292. #
  293. #
  294. # if __name__ == '__main__':
  295. # main()
  296. #
  297. #
  298. #
  299. #
  300. #
  301. #
  302. #
  303. #
  304. # d = 4
  305. # URL_POOL = f.detach()
  306. #
  307. # # {district:bussi-nav:[page]}
  308. # SPIDER_URL_DISTRICT_DIC = {}
  309. # for i in f:
  310. # d = i.replace('\n', '').replace(' ', '')
  311. # if d.find(GLUE) > -1:
  312. # dd = d.split(GLUE)[1].split('\"')[0]
  313. # SPIDER_URL_DISTRICT_DIC[dd] = []
  314. # QPS = 50
  315. # TIME_UNIT = 1
  316. # qps_counter = 0
  317. # for k in SPIDER_URL_DISTRICT_DIC:
  318. # url = '%s%s%s' % (BASE_URL, GLUE, k)
  319. # print(url)
  320. # qps_counter += 1
  321. # if qps_counter % QPS == 0:
  322. # sleep(TIME_UNIT)
  323. # try:
  324. # r = requests.get(url)
  325. # print(r.status_code)
  326. # txt = r.text.replace('\r', '').replace(' ', '').split('\n')
  327. # start_flag = 0
  328. # for d in txt:
  329. # if d.find('id="bussi-nav') > -1:
  330. # start_flag = 1
  331. # else:
  332. # if start_flag == 1:
  333. # if d.find('/div') > -1:
  334. # start_flag = 0
  335. # else:
  336. # dd = d.split(GLUE)[1].split('\"')[0]
  337. # SPIDER_URL_DISTRICT_DIC[k].append(dd)
  338. # print(dd)
  339. # except Exception:
  340. # # 修改为,2个函数,递归请求
  341. # print('EXCEPTION', url)
  342. # print(Exception)
  343. #
  344. # SCALA = 'SCALA'
  345. # str = ''
  346. # for k in SPIDER_URL_DISTRICT_DIC:
  347. # for url in SPIDER_URL_DISTRICT_DIC[k]:
  348. # str += SCALA + url
  349. # print(str)
  350. # str = str[5:]
  351. # print(str)
  352. # f = open('spider_深圳_district_bussi-nav_url_list.txt', 'w', encoding='utf-8')
  353. # f.write(str)
  354. # f.closed
  355. #
  356. # d = 5
  357. #
  358. # d = 3
  359. #
  360. # url = 'https://www.dianping.com/search/category/7/10/r29'
  361. # r = requests.get(url)
  362. #
  363. # d = 5
  364. # ZHITONGZI_CITY_DIC = {}
  365. # f = open('直筒子市_东莞中山.txt', 'r', encoding='utf-8')
  366. # ZHITONGZI_CITY_DIC['东莞市'] = []
  367. # ZHITONGZI_CITY_DIC['中山市'] = []
  368. # c = 0
  369. # for i in f:
  370. # ii = i.split(';')
  371. # for iii in ii:
  372. # iv = iii.split('、')
  373. # if len(iv) > 2:
  374. # c += 1
  375. # for v in iv:
  376. # if v.find('(') > -1:
  377. # v_ = v.split('(')[1]
  378. # elif v.find(')') > -1:
  379. # v_ = v.split(')')[0]
  380. # else:
  381. # v_ = v
  382. # if c == 1 or c == 2:
  383. # ZHITONGZI_CITY_DIC['东莞市'].append(v_)
  384. # elif c == 3 or c == 4:
  385. # ZHITONGZI_CITY_DIC['中山市'].append(v_)
  386. # f.closed
  387. #
  388. #
  389. # def chk_is_coffee(str):
  390. # l_ = ['coffee', 'coffe', 'cafe', 'café', 'starbucks', 'caffé']
  391. # # 上岛花式铁板烧 日本菜
  392. # # 泛太平洋大酒店面馆 其他
  393. # l_b = ['咖啡', '星巴克']
  394. # # 星巴克
  395. # for i in l_:
  396. # if str.upper().find(i.upper()) != -1:
  397. # return True
  398. # for i in l_b:
  399. # if str.find(i) != -1:
  400. # return True
  401. # return False
  402. #
  403. #
  404. # def chk_kfc_mdl(str):
  405. # if str.find(u"麦当劳") != -1:
  406. # return 1
  407. # elif str.find(u"肯德基") != -1 or str.upper().find(u"KFC") != -1:
  408. # return 0
  409. # else:
  410. # return 2
  411. #
  412. #
  413. # def get_name(str):
  414. # if str.find("麦当劳") != -1:
  415. # return '麦当劳'
  416. # elif str.find("肯德基") != -1 or str.upper().find(u"KFC") != -1:
  417. # return '肯德基'
  418. # else:
  419. # # str = '狗不理包子(前门店)'
  420. # # str = '(清真)三羊水饺(新民路店)'
  421. # # | 添椒 | 潮涮三国IP火锅
  422. # if str.find('(') == -1 and str.find('(') == -1:
  423. # return str
  424. # res = str.strip(' ').split('(')[0].strip(' ')
  425. # if len(res) == 0:
  426. # try:
  427. # res = str.split(')')[1].split('(')[0]
  428. # except Exception:
  429. # print(Exception)
  430. # # 一锅两头牛(烟青路店)
  431. # res_b = res
  432. # try:
  433. # res_b = res.split('(')[0]
  434. # except Exception:
  435. # print(Exception)
  436. #
  437. # return res_b
  438. #
  439. #
  440. # def chk_city_district(str):
  441. # city_district = str.replace(' ', '')
  442. # if re.match(r".*[a-zA-Z0-9]", city_district) is not None:
  443. # return False
  444. # elif str.find("[") != -1 or str.find("(") != -1 or str.find(")") != -1:
  445. # return False
  446. # else:
  447. # return city_district
  448. #
  449. #
  450. # def chk_catering_kind(str):
  451. # catering_kind = str.replace(' ', '')
  452. # if re.match(r".*[0-9]", catering_kind) is not None:
  453. # return False
  454. # else:
  455. # return catering_kind
  456. #
  457. #
  458. # # ['a','',' ']
  459. # def chk_list_thickness(list_):
  460. # if len(list_) == 0:
  461. # return False
  462. # res_list = []
  463. # for i in list_:
  464. # i_b = i.replace(' ', '')
  465. # if i.replace(' ', '') == '':
  466. # return False
  467. # else:
  468. # res_list.append(i_b)
  469. # return res_list
  470. #
  471. #
  472. # business_area_tag_list = ['大厦', '大楼', '大厦', '百货', '购物中心', '商业中心', 'MALL', '广场', '商场', '单元', '栋', '座', '楼', '层', '底商']
  473. #
  474. #
  475. # def chk_in_business_area(str):
  476. # global business_area_tag_list
  477. # for i in business_area_tag_list:
  478. # if str.find(i) > -1:
  479. # return 1
  480. # return 0
  481. #
  482. #
  483. # # MAX_OFFSET = 25
  484. # # OFFSET = MAX_OFFSET - 1
  485. # MAX_PAGINATION = 100
  486. # pagination = MAX_PAGINATION
  487. # QPS = 50
  488. # TIME_UNIT = 1
  489. # # http://lbs.amap.com/api/webservice/guide/tools/info
  490. # INFOCODE_OK = '10000'
  491. # file_name_key_pool = 'key_pool.pool'
  492. # KEY_POOL_LIST = []
  493. # touse_key = ''
  494. # f = open(file_name_key_pool, 'r', encoding='utf-8')
  495. # for i in f:
  496. # try:
  497. # list_ = i.split('\t')
  498. # key = i.split('\t')[1].split()
  499. # KEY_POOL_LIST.append(key[0])
  500. # except Exception:
  501. # print(Exception)
  502. # KEY_POOL_NUM_INDICATOR = len(KEY_POOL_LIST) - 1
  503. #
  504. # # 北京市 西城区 金堂羊蝎子火锅 真武庙四条1号
  505. # # http://restapi.amap.com/v3/place/around?parameters
  506. # URL_TYPE = 'http://restapi.amap.com/v3/place/text'
  507. # # URL_TYPE = 'http://restapi.amap.com/v3/around'
  508. # touse_key = ''
  509. # RADIUS = '&radius=20'
  510. # keywords = '&keywords='
  511. # OFFSET = '&offset=10'
  512. # CITYLIMIT = '&citylimit=true'
  513. #
  514. # URL_FOR_CHANGE_KEY = 'http://restapi.amap.com/v3/place/text?key=%s&types=060100&city=010&OFFSET=1'
  515. #
  516. # change_key_qps = 0
  517. #
  518. #
  519. # def change_key():
  520. # global touse_key, change_key_qps
  521. #
  522. # # 高德没有遵守自己的QPS/日限策略;所不能通过其返回码,来控制key的使用;
  523. # mean_use_key = random.randint(0, KEY_POOL_NUM_INDICATOR)
  524. # for i in range(mean_use_key, KEY_POOL_NUM_INDICATOR, 1):
  525. # key = KEY_POOL_LIST[i]
  526. # if key == touse_key:
  527. # if i == KEY_POOL_NUM_INDICATOR:
  528. # change_key()
  529. # return
  530. # else:
  531. # continue
  532. # touse_key = key
  533. # url = URL_FOR_CHANGE_KEY % (touse_key)
  534. # try:
  535. # change_key_qps += 1
  536. # if change_key_qps % QPS == 0:
  537. # sleep(TIME_UNIT)
  538. # r = requests.get(url)
  539. # json_ = r.json()
  540. # except Exception:
  541. # print('requests.get(url)', Exception)
  542. # change_key()
  543. # return
  544. # infocode = json_['infocode']
  545. # if not infocode == INFOCODE_OK:
  546. # if i == KEY_POOL_NUM_INDICATOR:
  547. # sys.exit('NOInvalidKEY')
  548. # change_key()
  549. # return
  550. # return
  551. #
  552. #
  553. #
  554. # # 060101 购物服务 商场 购物中心
  555. #
  556. #
  557. # FILTER_GD_BUSINESS_AREA_TYPE_LIST = ['购物中心']
  558. #
  559. #
  560. # def fliter_gd_business_area_type(url):
  561. # global FILTER_GD_BUSINESS_AREA_TYPE_LIST
  562. # # {"suggestion":{"keywords":{},"cities":{}},"info":"OK","status":"1","count":"1","pois":[{"typecode":"050118","adname":"西城区","biz_type":"diner","id":"B0FFG8RYC7","pname":"北京市","importance":{},"location":"116.393715,39.957242","distance":{},"tel":"18600185618","biz_ext":{},"shopid":{},"address":"德胜街道六铺炕北小街8-1号","poiweight":{},"cityname":"北京市","type":"餐饮服务;中餐厅;特色\/地方风味餐厅","shopinfo":"0","name":"炙热青春"}],"infocode":"10000"
  563. # try:
  564. # r = requests.get(url)
  565. # r_json = r.json()
  566. # except Exception:
  567. # print(203, Exception)
  568. # # 返回数据解析json异常
  569. # return 3
  570. # infocode = r_json['infocode']
  571. # if infocode == '10000':
  572. # count = r_json['count']
  573. # if int(count) > 0:
  574. # pois_list = r_json['pois']
  575. # for l in pois_list:
  576. # type = l['type']
  577. # for chk_type in FILTER_GD_BUSINESS_AREA_TYPE_LIST:
  578. # if type.find(chk_type) > -1:
  579. # return 1
  580. # else:
  581. # change_key()
  582. # return 0
  583. #
  584. #
  585. # # f = open('MEITUAN_DAZHONG_20170704.csv', 'r', encoding='gbk')
  586. # # f_2 = open('MEITUAN_DAZHONG_20170705.csv', 'r', encoding='gbk')
  587. # # f_gd = open('GD_POI_KFC_MC.csv', 'r', encoding='utf-8-sig')
  588. #
  589. # # 市key-区key-品牌名key-地址key-{菜类,均价,data_from}
  590. # # {city:{district:{name:{address:{catering_kind,average_price,data_from}}}}}
  591. # cater_dic = {}
  592. # # [{city,district,address,name,catering_kind,average_price,data_from}]
  593. # cater_exception_list = []
  594. # count_catering = 0
  595. # count_catering_exception = 0
  596. #
  597. # coffee_list = []
  598. # count_coffee = 0
  599. #
  600. # fo = open('MEITUAN_DAZHONG_20170705 - 副本.csv', 'r')
  601. #
  602. # file_line_list = []
  603. # for i in fo:
  604. # file_line_list.append(i)
  605. # fo.closed
  606. # #
  607. # # fo = open('MEITUAN_DAZHONG_20170705 - 副本.csv', 'r')
  608. # # while fo.readline():
  609. # # file_line_list_b.append(fo.readline())
  610. # file_line_list_len = len(file_line_list)
  611. # file_jump_step_num = 2000
  612. # count_catering_exception = 0
  613. # count_coffee = 0
  614. # count_catering = 0
  615. #
  616. #
  617. # def get_exception_logic_split_loop(nloop):
  618. # global touse_key, cater_dic, file_line_list, file_line_list_len, file_jump_step_num, count_catering_exception, count_coffee, count_catering
  619. # start_line = nloop * file_jump_step_num
  620. # if start_line >= file_line_list_len:
  621. # print('last-line')
  622. # return
  623. # else:
  624. # start_line_count = 0
  625. # end_line = start_line + file_jump_step_num
  626. # if end_line >= file_line_list_len:
  627. # end_line = file_line_list_len - 1
  628. # for i in range(start_line, end_line, 1):
  629. # l_ = file_line_list[i].replace('\n', '').split(',')
  630. # city = l_[0]
  631. # district = l_[1]
  632. # address = l_[2]
  633. # name = l_[3]
  634. # average_price = l_[4]
  635. # catering_kind = l_[5]
  636. # data_from = 'mtdz_5'
  637. # # 数据准备层
  638. # # 数据运算层
  639. # # 该层处理从目标文件取出的字段列表
  640. # focus_list = [city, district, address, name, catering_kind, average_price, data_from]
  641. # dic_exception = {}
  642. # dic_exception['data_from'] = data_from
  643. # dic_exception['city'] = city
  644. # dic_exception['district'] = district
  645. # dic_exception['name'] = name
  646. # dic_exception['address'] = address
  647. # dic_exception['catering_kind'] = catering_kind
  648. # dic_exception['average_price'] = average_price
  649. #
  650. # if not chk_list_thickness(focus_list) or not chk_city_district(city) or not chk_city_district(
  651. # district) or not chk_catering_kind(catering_kind):
  652. # count_catering_exception += 1
  653. # cater_exception_list.append(dic_exception)
  654. # else:
  655. # name = get_name(name)
  656. #
  657. # m = chk_is_coffee(name)
  658. # # if m:
  659. # # print(list_)
  660. # if not m:
  661. # m = chk_is_coffee(catering_kind)
  662. # if m:
  663. # count_coffee += 1
  664. # coffee_list.append(dic_exception)
  665. #
  666. # if not m:
  667. # dic_details = {}
  668. # dic_details['data_from'] = data_from
  669. # dic_details['catering_kind'] = catering_kind
  670. # dic_details['average_price'] = average_price
  671. # if_in_business_area = chk_in_business_area(address)
  672. # if_in_business_area_criterion = 'str_match'
  673. # if if_in_business_area == 0:
  674. # city_r = '&city=' + district
  675. # keywords = '&keywords=' + address + '|' + name
  676. # start_line_count += 1
  677. # print(start_line, start_line_count)
  678. # if start_line_count % QPS == 0:
  679. # print('sleep')
  680. # sleep(1)
  681. # url = URL_TYPE + '?' + 'key=' + touse_key + RADIUS + keywords + city_r + CITYLIMIT
  682. #
  683. # if_in_business_area = fliter_gd_business_area_type(url)
  684. # if_in_business_area_criterion = 'str_match+request_api'
  685. #
  686. # dic_details['if_in_business_area_criterion'] = if_in_business_area_criterion
  687. # dic_details['if_in_business_area'] = if_in_business_area
  688. #
  689. # if city not in cater_dic:
  690. # cater_dic[city] = {}
  691. # if district not in cater_dic[city]:
  692. # cater_dic[city][district] = {}
  693. # if name not in cater_dic[city][district]:
  694. # cater_dic[city][district][name] = {}
  695. # if address not in cater_dic[city][district][name]:
  696. # cater_dic[city][district][name][address] = {}
  697. #
  698. # cater_dic[city][district][name][address] = dic_details
  699. # count_catering += 1
  700. #
  701. #
  702. # class MyThread(threading.Thread):
  703. # def __init__(self, func, args, name=''):
  704. # threading.Thread.__init__(self)
  705. # self.name = name
  706. # self.func = func
  707. # self.args = args
  708. #
  709. # def run(self):
  710. # self.func(self.args)
  711. #
  712. #
  713. # def main():
  714. # print('starting at:', ctime())
  715. # threads_list = []
  716. # thread_sum = math.ceil(file_line_list_len / file_jump_step_num)
  717. # print(thread_sum)
  718. # for nloop in range(1, thread_sum, 1):
  719. # print(nloop)
  720. # thread_instance = MyThread(get_exception_logic_split_loop, (nloop), get_exception_logic_split_loop.__name__)
  721. #
  722. # threads_list.append(thread_instance)
  723. # # 主进程将在所有非守护进程退出后,退出
  724. # for t in threads_list:
  725. # print(t)
  726. # t.setDaemon = False
  727. # t.start()
  728. # # wait for all thrades to finish
  729. # for t in threads_list:
  730. # t.join()
  731. # f_name = 'ALL.csv'
  732. # f = open(f_name, 'w', encoding='utf-8-sig')
  733. # f.write('')
  734. # f.closed
  735. # f = open(f_name, 'a', encoding='utf-8-sig')
  736. # str = 'city, district, name, address, if_in_business_area, if_in_business_area_criterion,catering_kind, average_price, data_from\n'
  737. # f.write(str)
  738. # ## city,district,address,name,catering_kind,average_price,data_from
  739. # count_write_rows = 0
  740. # for i in cater_dic:
  741. # city = i
  742. # if city == '城市':
  743. # continue
  744. # for ii in cater_dic[i]:
  745. # district = ii
  746. # for iii in cater_dic[i][ii]:
  747. # name = iii
  748. # for iv in cater_dic[i][ii][iii]:
  749. # address = iv
  750. # catering_kind = cater_dic[i][ii][iii][iv]['catering_kind']
  751. # average_price = cater_dic[i][ii][iii][iv]['average_price']
  752. # if_in_business_area = cater_dic[i][ii][iii][iv]['if_in_business_area']
  753. # if_in_business_area_criterion = cater_dic[i][ii][iii][iv]['if_in_business_area_criterion']
  754. # data_from = cater_dic[i][ii][iii][iv]['data_from']
  755. # str = '%s,%s,%s,%s,%s,%s,%s,%s,%s\n' % (
  756. # city, district, name, address, if_in_business_area, if_in_business_area_criterion,
  757. # catering_kind, average_price, data_from)
  758. # f.write(str)
  759. # count_write_rows += 1
  760. # f.closed
  761. # print(count_write_rows)
  762. #
  763. #
  764. # if __name__ == '__main__':
  765. # main()

82000 277
186000 345
42000 251
186000 346
186000 347
42000 252
82000 278
42000 253
42000 254
40000 346
40000 347
42000 255
40000 348
42000 256
40000 349
82000 279
40000 350
sleep
72000 279
12000 350
sleep
72000 280
72000 281
72000 282
96000 274
72000 283
96000 275
186000 348
72000 284
186000 349
106000 275
132000 328
166000 298
188000 372
60000 336
60000 337
60000 338
60000 339
60000 340
82000 280
42000 257
82000 281
82000 282
60000 341
186000 350
sleep
96000 276
72000 285
72000 286
40000 351
72000 287
96000 277
96000 278
72000 288
72000 289
96000 279
72000 290
96000 280
72000 291
96000 281
72000 292
2000 371
96000 282
102000 255

# -*- coding: UTF-8 -*-

import re
import pprint
import json
import time
import math
import sys
import requests
import threading
from time import ctime, sleep
import random

# city -> list of sub-division names for the two "直筒子" (district-free) cities.
# Reconstructed from a whitespace-mangled paste; the loop nesting below is
# assumed from the counter logic — TODO confirm against the original file.
ZHITONGZI_CITY_DIC = {}
f = open('直筒子市_东莞中山.txt', 'r', encoding='utf-8')
ZHITONGZI_CITY_DIC['东莞市'] = []
ZHITONGZI_CITY_DIC['中山市'] = []
c = 0
for i in f:
    ii = i.split(';')
    for iii in ii:
        iv = iii.split('、')
        # only segments listing more than two names are real division lists
        if len(iv) > 2:
            c += 1
            for v in iv:
                # strip a trailing "(…" / leading "…)" annotation if present
                if v.find('(') > -1:
                    v_ = v.split('(')[1]
                elif v.find(')') > -1:
                    v_ = v.split(')')[0]
                else:
                    v_ = v
                # segments 1-2 belong to Dongguan, 3-4 to Zhongshan
                if c == 1 or c == 2:
                    ZHITONGZI_CITY_DIC['东莞市'].append(v_)
                elif c == 3 or c == 4:
                    ZHITONGZI_CITY_DIC['中山市'].append(v_)
f.close()  # fixed: original wrote `f.closed`, an attribute access that never closed the file


def chk_is_coffee(str):
    """Return True if the string looks like a coffee brand/category.

    NOTE: the parameter name shadows the builtin `str`; kept for
    backward compatibility with possible keyword callers.
    """
    l_ = ['coffee', 'coffe', 'cafe', 'café', 'starbucks', 'caffé']
    # examples that must NOT match:
    # 上岛花式铁板烧 日本菜
    # 泛太平洋大酒店面馆 其他
    l_b = ['咖啡', '星巴克']
    upper = str.upper()  # hoisted out of the loop
    for i in l_:
        if upper.find(i.upper()) != -1:
            return True
    for i in l_b:
        if str.find(i) != -1:
            return True
    return False


def chk_kfc_mdl(str):
    """Classify a shop name: 1 = McDonald's, 0 = KFC, 2 = anything else."""
    if str.find(u"麦当劳") != -1:
        return 1
    elif str.find(u"肯德基") != -1 or str.upper().find(u"KFC") != -1:
        return 0
    else:
        return 2


def get_name(str):
    """Normalize a shop name: canonical brand for KFC/McDonald's, otherwise
    the name with a trailing "(branch)" annotation stripped.

    Examples the stripping logic targets:
        狗不理包子(前门店)       -> 狗不理包子
        (清真)三羊水饺(新民路店) -> 三羊水饺
    """
    if str.find("麦当劳") != -1:
        return '麦当劳'
    elif str.find("肯德基") != -1 or str.upper().find(u"KFC") != -1:
        return '肯德基'
    else:
        # no parenthesis (full- or half-width): nothing to strip
        if str.find('(') == -1 and str.find('(') == -1:
            return str
        res = str.strip(' ').split('(')[0].strip(' ')
        if len(res) == 0:
            # name started with "(…)": take the part after the closing paren
            try:
                res = str.split(')')[1].split('(')[0]
            except Exception as e:
                print(e)  # fixed: original printed the Exception class itself
        # also strip a half-width "(" suffix, e.g. 一锅两头牛(烟青路店)
        res_b = res
        try:
            res_b = res.split('(')[0]
        except Exception as e:
            print(e)  # fixed: original printed the Exception class itself
        return res_b


def chk_city_district(str):
    """Return the blank-stripped name if it looks like a valid CN city/district,
    else False (rejects Latin letters, digits and bracket characters)."""
    city_district = str.replace(' ', '')
    if re.match(r".*[a-zA-Z0-9]", city_district) is not None:
        return False
    elif str.find("[") != -1 or str.find("(") != -1 or str.find(")") != -1:
        return False
    else:
        return city_district


def chk_catering_kind(str):
    """Return the blank-stripped category if it contains no digits, else False."""
    catering_kind = str.replace(' ', '')
    if re.match(r".*[0-9]", catering_kind) is not None:
        return False
    else:
        return catering_kind


def chk_list_thickness(list_):
    """Return a blank-stripped copy of list_ if every element is non-empty
    after removing spaces; return False for an empty list or any blank cell.
    e.g. ['a', '', ' '] -> False
    """
    if len(list_) == 0:
        return False
    res_list = []
    for i in list_:
        i_b = i.replace(' ', '')  # hoisted: computed once instead of twice
        if i_b == '':
            return False
        res_list.append(i_b)
    return res_list


# substrings whose presence in an address marks it as inside a business/mall
# building ('大厦' appears twice in the original list; kept for fidelity)
business_area_tag_list = ['大厦', '大楼', '大厦', '百货', '购物中心', '商业中心', 'MALL', '广场', '商场', '单元', '栋', '座', '楼', '层', '底商']


def chk_in_business_area(str):
    """Return 1 if the address contains any business-area tag, else 0."""
    global business_area_tag_list
    for i in business_area_tag_list:
        if str.find(i) > -1:
            return 1
    return 0


# MAX_OFFSET = 25
# OFFSET = MAX_OFFSET - 1
MAX_PAGINATION = 100
pagination = MAX_PAGINATION
QPS = 50        # number of requests before throttling
TIME_UNIT = 1   # seconds to sleep when the QPS budget is hit
# http://lbs.amap.com/api/webservice/guide/tools/info
# fixed: the pasted source had '' here, which makes every key-validation
# comparison fail; both the amap docs and the sample response use '10000'
INFOCODE_OK = '10000'
file_name_key_pool = 'key_pool.pool'
KEY_POOL_LIST = []
touse_key = ''
# key_pool.pool format: tab-separated, key in the second column
f = open(file_name_key_pool, 'r', encoding='utf-8')
for i in f:
    try:
        key = i.split('\t')[1].split()
        KEY_POOL_LIST.append(key[0])
    except Exception as e:
        print(e)  # fixed: original printed the Exception class itself
f.close()  # fixed: the pool file was never closed
KEY_POOL_NUM_INDICATOR = len(KEY_POOL_LIST) - 1  # highest valid index

# sample row: 北京市 西城区 金堂羊蝎子火锅 真武庙四条1号
# http://restapi.amap.com/v3/place/around?parameters
URL_TYPE = 'http://restapi.amap.com/v3/place/text'
# URL_TYPE = 'http://restapi.amap.com/v3/around'
touse_key = ''
RADIUS = '&radius=20'
keywords = '&keywords='
OFFSET = '&offset=10'
CITYLIMIT = '&citylimit=true'

URL_FOR_CHANGE_KEY = 'http://restapi.amap.com/v3/place/text?key=%s&types=060100&city=010&OFFSET=1'

change_key_qps = 0


def change_key():
    """Rotate `touse_key` to a different key from KEY_POOL_LIST and validate
    it with a probe request; recurses until a working key is found, or exits
    via sys.exit('NOInvalidKEY') when the last candidate is also invalid.
    """
    global touse_key, change_key_qps

    # amap does not honor its own QPS / daily-limit policy, so key rotation
    # cannot rely solely on its return codes (translated from the original)
    mean_use_key = random.randint(0, KEY_POOL_NUM_INDICATOR)
    # fixed off-by-one: the original `range(..., KEY_POOL_NUM_INDICATOR)`
    # excluded the last index, so the last key could never be chosen and the
    # `i == KEY_POOL_NUM_INDICATOR` guards below were unreachable
    for i in range(mean_use_key, KEY_POOL_NUM_INDICATOR + 1, 1):
        key = KEY_POOL_LIST[i]
        if key == touse_key:
            if i == KEY_POOL_NUM_INDICATOR:
                # current key is the only remaining candidate — re-roll
                change_key()
                return
            else:
                continue
        touse_key = key
        url = URL_FOR_CHANGE_KEY % (touse_key)
        try:
            change_key_qps += 1
            if change_key_qps % QPS == 0:
                sleep(TIME_UNIT)
            r = requests.get(url)
            json_ = r.json()
        except Exception as e:
            print('requests.get(url)', e)
            change_key()
            return
        infocode = json_['infocode']
        if not infocode == INFOCODE_OK:
            if i == KEY_POOL_NUM_INDICATOR:
                sys.exit('NOInvalidKEY')
            change_key()
            return
        # probe succeeded: keep this key
        return
# amap typecode 060101: 购物服务 / 商场 / 购物中心 (shopping mall / shopping centre)
FILTER_GD_BUSINESS_AREA_TYPE_LIST = ['购物中心']


def fliter_gd_business_area_type(url):
    """Query the amap place-text API and classify the result.

    Returns 1 if any returned POI's type contains a business-area tag,
    0 otherwise (rotating the key first when infocode is not OK),
    3 on a request / JSON-decode failure.
    """
    global FILTER_GD_BUSINESS_AREA_TYPE_LIST
    # sample response: {"suggestion":{"keywords":{},"cities":{}},"info":"OK","status":"1","count":"1","pois":[{"typecode":"050118","adname":"西城区","biz_type":"diner","id":"B0FFG8RYC7","pname":"北京市","importance":{},"location":"116.393715,39.957242","distance":{},"tel":"18600185618","biz_ext":{},"shopid":{},"address":"德胜街道六铺炕北小街8-1号","poiweight":{},"cityname":"北京市","type":"餐饮服务;中餐厅;特色\/地方风味餐厅","shopinfo":"0","name":"炙热青春"}],"infocode":"10000"
    try:
        r = requests.get(url)
        r_json = r.json()
    except Exception as e:
        print(203, e)
        # request failed or response was not valid JSON
        return 3
    infocode = r_json['infocode']
    # fixed: the pasted source compared against '', which can never match —
    # the success infocode (see the sample response above) is '10000'
    if infocode == '10000':
        count = r_json['count']
        if int(count) > 0:
            pois_list = r_json['pois']
            for poi in pois_list:
                poi_type = poi['type']  # renamed: original shadowed builtin `type`
                for chk_type in FILTER_GD_BUSINESS_AREA_TYPE_LIST:
                    if poi_type.find(chk_type) > -1:
                        return 1
    else:
        # key exhausted / rejected — rotate before the caller retries
        change_key()
    return 0
# f_2 = open('MEITUAN_DAZHONG_20170705.csv', 'r', encoding='gbk')
# f_gd = open('GD_POI_KFC_MC.csv', 'r', encoding='utf-8-sig')

# nested result dict keyed city -> district -> brand name -> address,
# value: {catering_kind, average_price, data_from, ...}
cater_dic = {}
# rows that failed validation: [{city, district, address, name, catering_kind, average_price, data_from}]
cater_exception_list = []
count_catering = 0
count_catering_exception = 0

coffee_list = []
count_coffee = 0

fo = open('MEITUAN_DAZHONG_20170705 - 副本.csv', 'r')
file_line_list = fo.readlines()
fo.close()  # fixed: original wrote `fo.closed`, a no-op that never closed the file
#
# fo = open('MEITUAN_DAZHONG_20170705 - 副本.csv', 'r')
# while fo.readline():
#     file_line_list_b.append(fo.readline())
file_line_list_len = len(file_line_list)
file_jump_step_num = 2000  # rows handled per worker thread
count_catering_exception = 0
count_coffee = 0
count_catering = 0


def get_exception_logic_split_loop(nloop):
    """Worker: validate and classify one chunk of the CSV rows.

    Processes rows [nloop * file_jump_step_num, +file_jump_step_num):
    invalid rows go to cater_exception_list, coffee shops to coffee_list,
    everything else into the nested cater_dic (querying amap when the
    address alone cannot decide business-area membership).

    NOTE(review): the shared counters/dicts are mutated from several threads
    without a lock — counts may race; verify whether that is acceptable here.
    """
    global touse_key, cater_dic, file_line_list, file_line_list_len, file_jump_step_num, count_catering_exception, count_coffee, count_catering
    start_line = nloop * file_jump_step_num
    if start_line >= file_line_list_len:
        print('last-line')
        return
    start_line_count = 0
    end_line = start_line + file_jump_step_num
    if end_line >= file_line_list_len:
        # fixed off-by-one: original used file_line_list_len - 1, which
        # silently dropped the final row (range() already excludes its stop)
        end_line = file_line_list_len
    for i in range(start_line, end_line, 1):
        # CSV columns: city, district, address, name, average_price, catering_kind
        l_ = file_line_list[i].replace('\n', '').split(',')
        city = l_[0]
        district = l_[1]
        address = l_[2]
        name = l_[3]
        average_price = l_[4]
        catering_kind = l_[5]
        data_from = 'mtdz_5'
        focus_list = [city, district, address, name, catering_kind, average_price, data_from]
        dic_exception = {
            'data_from': data_from,
            'city': city,
            'district': district,
            'name': name,
            'address': address,
            'catering_kind': catering_kind,
            'average_price': average_price,
        }

        if not chk_list_thickness(focus_list) or not chk_city_district(city) or not chk_city_district(
                district) or not chk_catering_kind(catering_kind):
            # any blank cell, non-CJK city/district or digit-bearing category
            count_catering_exception += 1
            cater_exception_list.append(dic_exception)
        else:
            name = get_name(name)

            # coffee shops are collected separately, matched on name first,
            # then on category
            m = chk_is_coffee(name)
            if not m:
                m = chk_is_coffee(catering_kind)
            if m:
                count_coffee += 1
                coffee_list.append(dic_exception)

            if not m:
                dic_details = {}
                dic_details['data_from'] = data_from
                dic_details['catering_kind'] = catering_kind
                dic_details['average_price'] = average_price
                if_in_business_area = chk_in_business_area(address)
                if_in_business_area_criterion = 'str_match'
                if if_in_business_area == 0:
                    # string match failed: fall back to the amap API
                    city_r = '&city=' + district
                    keywords = '&keywords=' + address + '|' + name
                    start_line_count += 1
                    print(start_line, start_line_count)
                    if start_line_count % QPS == 0:
                        print('sleep')
                        sleep(1)
                    url = URL_TYPE + '?' + 'key=' + touse_key + RADIUS + keywords + city_r + CITYLIMIT
                    if_in_business_area = fliter_gd_business_area_type(url)
                    if_in_business_area_criterion = 'str_match+request_api'

                dic_details['if_in_business_area_criterion'] = if_in_business_area_criterion
                dic_details['if_in_business_area'] = if_in_business_area

                if city not in cater_dic:
                    cater_dic[city] = {}
                if district not in cater_dic[city]:
                    cater_dic[city][district] = {}
                if name not in cater_dic[city][district]:
                    cater_dic[city][district][name] = {}
                cater_dic[city][district][name][address] = dic_details
                count_catering += 1


class MyThread(threading.Thread):
    """Thread wrapper calling func(args) — args is passed through as-is."""

    def __init__(self, func, args, name=''):
        threading.Thread.__init__(self)
        self.name = name
        self.func = func
        self.args = args

    def run(self):
        self.func(self.args)


def main():
    """Fan the CSV chunks out to worker threads, then dump cater_dic to ALL.csv."""
    print('starting at:', ctime())
    threads_list = []
    thread_sum = math.ceil(file_line_list_len / file_jump_step_num)
    print(thread_sum)
    # fixed: original started at 1, which skipped chunk 0 — the first
    # file_jump_step_num rows were never processed (the CSV header row is
    # filtered separately via the '城市' check below)
    for nloop in range(0, thread_sum, 1):
        print(nloop)
        thread_instance = MyThread(get_exception_logic_split_loop, nloop, get_exception_logic_split_loop.__name__)
        threads_list.append(thread_instance)
    # the main process exits only after all non-daemon threads finish
    for t in threads_list:
        print(t)
        # fixed: original `t.setDaemon = False` shadowed the method and had
        # no effect; non-daemon is the intent (and the default)
        t.daemon = False
        t.start()
    # wait for all threads to finish
    for t in threads_list:
        t.join()
    f_name = 'ALL.csv'
    f = open(f_name, 'w', encoding='utf-8-sig')
    header = 'city, district, name, address, if_in_business_area, if_in_business_area_criterion,catering_kind, average_price, data_from\n'
    f.write(header)
    count_write_rows = 0
    for i in cater_dic:
        city = i
        if city == '城市':  # skip the CSV header row that slipped through
            continue
        for ii in cater_dic[i]:
            district = ii
            for iii in cater_dic[i][ii]:
                name = iii
                for iv in cater_dic[i][ii][iii]:
                    address = iv
                    details = cater_dic[i][ii][iii][iv]
                    row = '%s,%s,%s,%s,%s,%s,%s,%s,%s\n' % (
                        city, district, name, address, details['if_in_business_area'],
                        details['if_in_business_area_criterion'],
                        details['catering_kind'], details['average_price'],
                        details['data_from'])
                    f.write(row)
                    count_write_rows += 1
    f.close()  # fixed: original wrote `f.closed`, leaving the file unflushed/open
    print(count_write_rows)


if __name__ == '__main__':
    main()
# -*- coding: UTF-8 -*-
# NOTE(review): this is a second pasted revision of the script above —
# identical except for file_jump_step_num = 4000 and extra numbered debug
# prints (247/417/353/467). The same fixes are applied: f.close() instead of
# the no-op f.closed, INFOCODE_OK '10000', off-by-one in change_key and in
# the chunk range, and t.daemon instead of the shadowed setDaemon.

import re
import pprint
import json
import time
import math
import sys
import requests
import threading
from time import ctime, sleep
import random

# city -> list of sub-division names for the two district-free cities
ZHITONGZI_CITY_DIC = {}
f = open('直筒子市_东莞中山.txt', 'r', encoding='utf-8')
ZHITONGZI_CITY_DIC['东莞市'] = []
ZHITONGZI_CITY_DIC['中山市'] = []
c = 0
for i in f:
    ii = i.split(';')
    for iii in ii:
        iv = iii.split('、')
        if len(iv) > 2:
            c += 1
            for v in iv:
                if v.find('(') > -1:
                    v_ = v.split('(')[1]
                elif v.find(')') > -1:
                    v_ = v.split(')')[0]
                else:
                    v_ = v
                if c == 1 or c == 2:
                    ZHITONGZI_CITY_DIC['东莞市'].append(v_)
                elif c == 3 or c == 4:
                    ZHITONGZI_CITY_DIC['中山市'].append(v_)
f.close()  # fixed: was the no-op `f.closed`


def chk_is_coffee(str):
    """Return True if the string looks like a coffee brand/category."""
    l_ = ['coffee', 'coffe', 'cafe', 'café', 'starbucks', 'caffé']
    l_b = ['咖啡', '星巴克']
    upper = str.upper()
    for i in l_:
        if upper.find(i.upper()) != -1:
            return True
    for i in l_b:
        if str.find(i) != -1:
            return True
    return False


def chk_kfc_mdl(str):
    """1 = McDonald's, 0 = KFC, 2 = anything else."""
    if str.find(u"麦当劳") != -1:
        return 1
    elif str.find(u"肯德基") != -1 or str.upper().find(u"KFC") != -1:
        return 0
    else:
        return 2


def get_name(str):
    """Normalize a shop name (canonical KFC/McDonald's, strip "(branch)")."""
    if str.find("麦当劳") != -1:
        return '麦当劳'
    elif str.find("肯德基") != -1 or str.upper().find(u"KFC") != -1:
        return '肯德基'
    else:
        if str.find('(') == -1 and str.find('(') == -1:
            return str
        res = str.strip(' ').split('(')[0].strip(' ')
        if len(res) == 0:
            try:
                res = str.split(')')[1].split('(')[0]
            except Exception as e:
                print(e)
        res_b = res
        try:
            res_b = res.split('(')[0]
        except Exception as e:
            print(e)
        return res_b


def chk_city_district(str):
    """Blank-stripped name if it looks like a valid CN city/district, else False."""
    city_district = str.replace(' ', '')
    if re.match(r".*[a-zA-Z0-9]", city_district) is not None:
        return False
    elif str.find("[") != -1 or str.find("(") != -1 or str.find(")") != -1:
        return False
    else:
        return city_district


def chk_catering_kind(str):
    """Blank-stripped category if it contains no digits, else False."""
    catering_kind = str.replace(' ', '')
    if re.match(r".*[0-9]", catering_kind) is not None:
        return False
    else:
        return catering_kind


def chk_list_thickness(list_):
    """Blank-stripped copy of list_, or False on empty list / blank cell."""
    if len(list_) == 0:
        return False
    res_list = []
    for i in list_:
        i_b = i.replace(' ', '')
        if i_b == '':
            return False
        res_list.append(i_b)
    return res_list


business_area_tag_list = ['大厦', '大楼', '大厦', '百货', '购物中心', '商业中心', 'MALL', '广场', '商场', '单元', '栋', '座', '楼', '层', '底商']


def chk_in_business_area(str):
    """1 if the address contains any business-area tag, else 0."""
    global business_area_tag_list
    for i in business_area_tag_list:
        if str.find(i) > -1:
            return 1
    return 0


# MAX_OFFSET = 25
# OFFSET = MAX_OFFSET - 1
MAX_PAGINATION = 100
pagination = MAX_PAGINATION
QPS = 50
TIME_UNIT = 1
# http://lbs.amap.com/api/webservice/guide/tools/info
INFOCODE_OK = '10000'  # fixed: was '' in the paste; amap success code is '10000'
file_name_key_pool = 'key_pool.pool'
KEY_POOL_LIST = []
touse_key = ''
f = open(file_name_key_pool, 'r', encoding='utf-8')
for i in f:
    try:
        key = i.split('\t')[1].split()
        KEY_POOL_LIST.append(key[0])
    except Exception as e:
        print(e)
f.close()
KEY_POOL_NUM_INDICATOR = len(KEY_POOL_LIST) - 1  # highest valid index

# sample row: 北京市 西城区 金堂羊蝎子火锅 真武庙四条1号
# http://restapi.amap.com/v3/place/around?parameters
URL_TYPE = 'http://restapi.amap.com/v3/place/text'
# URL_TYPE = 'http://restapi.amap.com/v3/around'
touse_key = ''
RADIUS = '&radius=20'
keywords = '&keywords='
OFFSET = '&offset=10'
CITYLIMIT = '&citylimit=true'

URL_FOR_CHANGE_KEY = 'http://restapi.amap.com/v3/place/text?key=%s&types=060100&city=010&OFFSET=1'

change_key_qps = 0


def change_key():
    """Rotate touse_key to a different, validated key (see revision above)."""
    global touse_key, change_key_qps
    # amap does not honor its own QPS/daily limits, so rotation cannot rely
    # solely on return codes (translated from the original comment)
    mean_use_key = random.randint(0, KEY_POOL_NUM_INDICATOR)
    # fixed off-by-one: include the last index
    for i in range(mean_use_key, KEY_POOL_NUM_INDICATOR + 1, 1):
        key = KEY_POOL_LIST[i]
        if key == touse_key:
            if i == KEY_POOL_NUM_INDICATOR:
                change_key()
                return
            else:
                continue
        touse_key = key
        url = URL_FOR_CHANGE_KEY % (touse_key)
        try:
            change_key_qps += 1
            if change_key_qps % QPS == 0:
                sleep(TIME_UNIT)
            r = requests.get(url)
            json_ = r.json()
        except Exception as e:
            print('requests.get(url)', e)
            change_key()
            return
        infocode = json_['infocode']
        if not infocode == INFOCODE_OK:
            if i == KEY_POOL_NUM_INDICATOR:
                sys.exit('NOInvalidKEY')
            change_key()
            return
        return


# amap typecode 060101: shopping service / mall / shopping centre
FILTER_GD_BUSINESS_AREA_TYPE_LIST = ['购物中心']


def fliter_gd_business_area_type(url):
    """1 if any POI type matches, 0 otherwise, 3 on request/JSON failure."""
    global FILTER_GD_BUSINESS_AREA_TYPE_LIST
    try:
        r = requests.get(url)
        r_json = r.json()
    except Exception as e:
        print(203, e)
        return 3
    infocode = r_json['infocode']
    if infocode == '10000':  # fixed: paste compared against ''
        count = r_json['count']
        if int(count) > 0:
            pois_list = r_json['pois']
            for poi in pois_list:
                poi_type = poi['type']
                for chk_type in FILTER_GD_BUSINESS_AREA_TYPE_LIST:
                    if poi_type.find(chk_type) > -1:
                        return 1
    else:
        change_key()
    return 0


# {city: {district: {name: {address: {catering_kind, average_price, data_from}}}}}
cater_dic = {}
cater_exception_list = []
count_catering = 0
count_catering_exception = 0

coffee_list = []
count_coffee = 0

fo = open('MEITUAN_DAZHONG_20170705 - 副本.csv', 'r')
file_line_list = fo.readlines()
fo.close()  # fixed: was the no-op `fo.closed`
file_line_list_len = len(file_line_list)
file_jump_step_num = 4000  # this revision doubled the chunk size
count_catering_exception = 0
count_coffee = 0
count_catering = 0


def get_exception_logic_split_loop(nloop):
    """Worker: validate/classify one 4000-row chunk (see revision above)."""
    print(247, nloop)
    global touse_key, cater_dic, file_line_list, file_line_list_len, file_jump_step_num, count_catering_exception, count_coffee, count_catering
    start_line = nloop * file_jump_step_num
    if start_line >= file_line_list_len:
        print('last-line')
        return
    start_line_count = 0
    end_line = start_line + file_jump_step_num
    if end_line >= file_line_list_len:
        # fixed off-by-one: original dropped the final row
        end_line = file_line_list_len
    for i in range(start_line, end_line, 1):
        l_ = file_line_list[i].replace('\n', '').split(',')
        city = l_[0]
        district = l_[1]
        address = l_[2]
        name = l_[3]
        average_price = l_[4]
        catering_kind = l_[5]
        data_from = 'mtdz_5'
        focus_list = [city, district, address, name, catering_kind, average_price, data_from]
        dic_exception = {
            'data_from': data_from,
            'city': city,
            'district': district,
            'name': name,
            'address': address,
            'catering_kind': catering_kind,
            'average_price': average_price,
        }

        if not chk_list_thickness(focus_list) or not chk_city_district(city) or not chk_city_district(
                district) or not chk_catering_kind(catering_kind):
            count_catering_exception += 1
            cater_exception_list.append(dic_exception)
        else:
            name = get_name(name)

            m = chk_is_coffee(name)
            if not m:
                m = chk_is_coffee(catering_kind)
            if m:
                count_coffee += 1
                coffee_list.append(dic_exception)

            if not m:
                dic_details = {}
                dic_details['data_from'] = data_from
                dic_details['catering_kind'] = catering_kind
                dic_details['average_price'] = average_price
                if_in_business_area = chk_in_business_area(address)
                if_in_business_area_criterion = 'str_match'
                if if_in_business_area == 0:
                    city_r = '&city=' + district
                    keywords = '&keywords=' + address + '|' + name
                    start_line_count += 1
                    print(417, start_line, start_line_count)
                    if start_line_count % QPS == 0:
                        print('sleep')
                        sleep(1)
                    url = URL_TYPE + '?' + 'key=' + touse_key + RADIUS + keywords + city_r + CITYLIMIT
                    if_in_business_area = fliter_gd_business_area_type(url)
                    if_in_business_area_criterion = 'str_match+request_api'

                dic_details['if_in_business_area_criterion'] = if_in_business_area_criterion
                dic_details['if_in_business_area'] = if_in_business_area

                if city not in cater_dic:
                    cater_dic[city] = {}
                if district not in cater_dic[city]:
                    cater_dic[city][district] = {}
                if name not in cater_dic[city][district]:
                    cater_dic[city][district][name] = {}
                cater_dic[city][district][name][address] = dic_details
                count_catering += 1


class MyThread(threading.Thread):
    """Thread wrapper calling func(args)."""

    def __init__(self, func, args, name=''):
        threading.Thread.__init__(self)
        self.name = name
        self.func = func
        self.args = args

    def run(self):
        self.func(self.args)


def main():
    """Fan the CSV chunks out to threads, then dump cater_dic to ALL.csv."""
    print('starting at:', ctime())
    threads_list = []
    thread_sum = math.ceil(file_line_list_len / file_jump_step_num)
    print(thread_sum)
    # fixed: original started at 1 and skipped chunk 0 entirely
    for nloop in range(0, thread_sum, 1):
        print(nloop)
        thread_instance = MyThread(get_exception_logic_split_loop, nloop, get_exception_logic_split_loop.__name__)
        print(353, '')
        threads_list.append(thread_instance)
    # the main process exits only after all non-daemon threads finish
    for t in threads_list:
        print(t)
        t.daemon = False  # fixed: `t.setDaemon = False` shadowed the method
        t.start()
    for t in threads_list:
        t.join()
    print(467, cater_dic)
    f_name = 'ALL.csv'
    f = open(f_name, 'w', encoding='utf-8-sig')
    header = 'city, district, name, address, if_in_business_area, if_in_business_area_criterion,catering_kind, average_price, data_from\n'
    f.write(header)
    count_write_rows = 0
    for i in cater_dic:
        city = i
        if city == '城市':  # skip the header row that slipped through
            continue
        for ii in cater_dic[i]:
            district = ii
            for iii in cater_dic[i][ii]:
                name = iii
                for iv in cater_dic[i][ii][iii]:
                    address = iv
                    details = cater_dic[i][ii][iii][iv]
                    row = '%s,%s,%s,%s,%s,%s,%s,%s,%s\n' % (
                        city, district, name, address, details['if_in_business_area'],
                        details['if_in_business_area_criterion'],
                        details['catering_kind'], details['average_price'],
                        details['data_from'])
                    f.write(row)
                    count_write_rows += 1
    f.close()  # fixed: was the no-op `f.closed`
    print(count_write_rows)


if __name__ == '__main__':
    main()
change_key_qps = 0


def change_key():
    """Rotate `touse_key` to a different key and validate it with a probe request.

    Throttles itself every QPS calls; recurses on failure; exits via
    sys.exit('NOInvalidKEY') when the last candidate is also invalid.
    """
    global touse_key, change_key_qps
    change_key_qps += 1
    if change_key_qps % QPS == 0:
        sleep(TIME_UNIT)
    # amap does not honor its own QPS / daily-limit policy, so key rotation
    # cannot rely solely on its return codes (translated from the original)
    mean_use_key = random.randint(0, KEY_POOL_NUM_INDICATOR)
    print(mean_use_key)
    for i in range(mean_use_key, KEY_POOL_NUM_INDICATOR, 1):
        key = KEY_POOL_LIST[i]
        if key == touse_key:
            if i == KEY_POOL_NUM_INDICATOR:
                change_key()
                return  # fixed: without this the loop re-used the rejected key
            else:
                continue
        print(172, 'present_key', touse_key)
        touse_key = key
        url = URL_FOR_CHANGE_KEY % (touse_key)
        print(175, 'touse_key', touse_key)
        try:
            r = requests.get(url)
            try:
                json_ = r.json()
            except Exception as e:
                print(' r.json()', e)
                change_key()
                return  # fixed: falling through left json_ unbound -> NameError
        except Exception as e:
            print('requests.get(url)', e)
            change_key()
            return  # fixed: falling through left json_ unbound -> NameError
        infocode = json_['infocode']
        if not infocode == INFOCODE_OK:
            if i == KEY_POOL_NUM_INDICATOR:
                sys.exit('NOInvalidKEY')
            change_key()
            return
        # fixed: without this return a validated key was immediately
        # overwritten on the next loop iteration (visible in the logs below)
        return

  

高德没有遵守自己的QPS/日限策略;所不能通过其返回码,来控制key的使用;

2
172 present_key c5ef87ab7efe0d76b970fd330bf9e7f2
175 touse_key adf8e13d1b170fcef7132ea3178a2d6c
172 present_key adf8e13d1b170fcef7132ea3178a2d6c
175 touse_key 2f3d41dfbce352fc4d82009c552505fe
172 present_key 2f3d41dfbce352fc4d82009c552505fe
175 touse_key c5ef87ab7efe0d76b970fd330bf9e7f2
172 present_key c5ef87ab7efe0d76b970fd330bf9e7f2
175 touse_key 2f3d41dfbce352fc4d82009c552505fe
172 present_key 2f3d41dfbce352fc4d82009c552505fe
175 touse_key c5ef87ab7efe0d76b970fd330bf9e7f2
172 present_key c5ef87ab7efe0d76b970fd330bf9e7f2
175 touse_key 6d95ab3f63c494911002c1734089548a
6
172 present_key 6d95ab3f63c494911002c1734089548a
175 touse_key adf8e13d1b170fcef7132ea3178a2d6c
6
172 present_key adf8e13d1b170fcef7132ea3178a2d6c
175 touse_key 6d95ab3f63c494911002c1734089548a
6
172 present_key 6d95ab3f63c494911002c1734089548a
175 touse_key adf8e13d1b170fcef7132ea3178a2d6c
4
172 present_key adf8e13d1b170fcef7132ea3178a2d6c
175 touse_key c0d76e9fa950d0ff1761d56bd78a902e

def change_key():
    """Rotate the global `touse_key` to a different pool key.

    Starts at a random index, takes the first key that differs from the
    current one, and validates it with a probe request; recurses until a key
    answers with INFOCODE_OK.
    """
    global touse_key
    mean_use_key = random.randint(0, KEY_POOL_NUM_INDICATOR)
    print(mean_use_key)
    for i in range(mean_use_key, KEY_POOL_NUM_INDICATOR, 1):
        key = KEY_POOL_LIST[i]
        if key == touse_key:
            continue
        print(172, 'present_key', touse_key)
        touse_key = key
        url = URL_FOR_CHANGE_KEY % (touse_key)
        print(175, 'touse_key', touse_key)
        try:
            r = requests.get(url)
            # BUG FIX: r.json() inside the try -- `json_` was unbound after a
            # request failure yet still read below.
            json_ = r.json()
        except Exception as e:
            print('change_key probe failed', e)
            change_key()
            return
        infocode = json_['infocode']
        if infocode != INFOCODE_OK:
            # NOTE(review): the original `i == KEY_POOL_NUM_INDICATOR` exit
            # guard is unreachable (range stop is exclusive); retry instead.
            change_key()
        # BUG FIX: stop after one candidate instead of rotating onward.
        return

  

# -*- coding: UTF-8 -*-

import re
import pprint
import json
import time
import math
import sys
import requests
import threading
from time import ctime, sleep

# city -> list of town names for the two "zhitongzi" (district-less) cities.
ZHITONGZI_CITY_DIC = {}
f = open('直筒子市_东莞中山.txt', 'r', encoding='utf-8')
ZHITONGZI_CITY_DIC['东莞市'] = []
ZHITONGZI_CITY_DIC['中山市'] = []
c = 0
for i in f:
    ii = i.split(';')
    for iii in ii:
        iv = iii.split('、')
        if len(iv) > 2:
            c += 1
            # NOTE(review): indentation was lost in the source; towns are
            # processed only for runs of >2 items, the reading under which the
            # c == 1..4 bucketing below is reachable -- confirm against data.
            for v in iv:
                # strip one parenthesised annotation if present
                if v.find('(') > -1:
                    v_ = v.split('(')[1]
                elif v.find(')') > -1:
                    v_ = v.split(')')[0]
                else:
                    v_ = v
                # runs 1-2 of the file are Dongguan, runs 3-4 Zhongshan
                if c == 1 or c == 2:
                    ZHITONGZI_CITY_DIC['东莞市'].append(v_)
                elif c == 3 or c == 4:
                    ZHITONGZI_CITY_DIC['中山市'].append(v_)
f.close()  # BUG FIX: was `f.closed`, an attribute read that never closed the file


def chk_is_coffee(str):
    """Return True if a shop name / category string looks like a coffee shop."""
    latin_tags = ['coffee', 'coffe', 'cafe', 'café', 'starbucks', 'caffé']
    cjk_tags = ['咖啡', '星巴克']
    for tag in latin_tags:
        if str.upper().find(tag.upper()) != -1:
            return True
    for tag in cjk_tags:
        if str.find(tag) != -1:
            return True
    return False


def chk_kfc_mdl(str):
    """Classify a shop name: 1 = McDonald's, 0 = KFC, 2 = neither."""
    if str.find(u"麦当劳") != -1:
        return 1
    elif str.find(u"肯德基") != -1 or str.upper().find(u"KFC") != -1:
        return 0
    else:
        return 2


def get_name(str):
    """Normalize a shop name to its brand part.

    Maps KFC / McDonald's variants to canonical names and strips trailing
    branch parentheticals, e.g. '狗不理包子(前门店)' -> '狗不理包子' and
    '(清真)三羊水饺(新民路店)' -> '三羊水饺'.
    """
    if str.find("麦当劳") != -1:
        return '麦当劳'
    elif str.find("肯德基") != -1 or str.upper().find(u"KFC") != -1:
        return '肯德基'
    if str.find('(') == -1 and str.find('(') == -1:
        return str
    res = str.strip(' ').split('(')[0].strip(' ')
    if len(res) == 0:
        # name starts with a parenthetical: take the part after the first ')'
        try:
            res = str.split(')')[1].split('(')[0]
        except Exception as e:
            print(e)  # BUG FIX: was print(Exception), which printed the class object
    res_b = res
    try:
        res_b = res.split('(')[0]
    except Exception as e:
        print(e)
    return res_b


def chk_city_district(str):
    """Return the space-stripped city/district name, or False when the field
    contains Latin letters, digits or bracket characters (likely malformed)."""
    city_district = str.replace(' ', '')
    if re.match(r".*[a-zA-Z0-9]", city_district) is not None:
        return False
    elif str.find("[") != -1 or str.find("(") != -1 or str.find(")") != -1:
        return False
    else:
        return city_district


def chk_catering_kind(str):
    """Return the space-stripped category, or False if it contains digits."""
    catering_kind = str.replace(' ', '')
    if re.match(r".*[0-9]", catering_kind) is not None:
        return False
    else:
        return catering_kind


# e.g. ['a', '', ' '] -> False
def chk_list_thickness(list_):
    """Return the list with spaces stripped from every item, or False when the
    list is empty or any item is blank."""
    if len(list_) == 0:
        return False
    res_list = []
    for item in list_:
        item_ = item.replace(' ', '')
        if item_ == '':
            return False
        res_list.append(item_)
    return res_list


# Substrings that mark an address as inside a mall / office building.
business_area_tag_list = ['大厦', '大楼', '大厦', '百货', '购物中心', '商业中心', 'MALL', '广场', '商场', '单元', '栋', '座', '楼', '层', '底商']


def chk_in_business_area(str):
    """Return 1 if the address contains any business-area tag, else 0."""
    global business_area_tag_list
    for tag in business_area_tag_list:
        if str.find(tag) > -1:
            return 1
    return 0
# MAX_OFFSET = 25
# OFFSET = MAX_OFFSET - 1
MAX_PAGINATION = 100
pagination = MAX_PAGINATION
QPS = 50       # requests allowed per TIME_UNIT before self-throttling
TIME_UNIT = 1  # seconds
# Return-code reference: http://lbs.amap.com/api/webservice/guide/tools/info
# BUG FIX: was '' -- AMap signals success with infocode '10000' (see the
# sample payload elsewhere in this file), so '' made every reply look bad.
INFOCODE_OK = '10000'
file_name_key_pool = 'key_pool.pool'
KEY_POOL_LIST = []
touse_key = ''
f = open(file_name_key_pool, 'r', encoding='utf-8')
for line in f:
    # each line: "<label>\t<key>[ ...]" -- keep only the key token
    try:
        key = line.split('\t')[1].split()
        KEY_POOL_LIST.append(key[0])
    except Exception as e:
        print(e)  # was print(Exception): printed the class, not the error
f.close()
KEY_POOL_NUM_INDICATOR = len(KEY_POOL_LIST) - 1  # index of the last key
# sample row: 北京市 西城区 金堂羊蝎子火锅 真武庙四条1号
# http://restapi.amap.com/v3/place/around?parameters
URL_TYPE = 'http://restapi.amap.com/v3/place/text'
# URL_TYPE = 'http://restapi.amap.com/v3/around'
touse_key = ''
RADIUS = '&radius=20'
keywords = '&keywords='
OFFSET = '&offset=10'
CITYLIMIT = '&citylimit=true'
# Cheap probe query (types=060100, city=010/Beijing) used only to validate a key.
URL_FOR_CHANGE_KEY = 'http://restapi.amap.com/v3/place/text?key=%s&types=060100&city=010&OFFSET=1'


def change_key():
    """Switch the global `touse_key` to the first pool key different from the
    current one, validating it with a probe request; recurse on failure.

    Fixes over the original: the probe's r.json() runs inside the try (its
    result was read even after a request failure, where `r` was unbound), and
    the function returns after handling one candidate instead of continuing
    to rotate through the rest of the pool after a success.
    """
    global touse_key
    for i in range(0, KEY_POOL_NUM_INDICATOR, 1):
        key = KEY_POOL_LIST[i]
        if key == touse_key:
            continue
        touse_key = key
        url = URL_FOR_CHANGE_KEY % (touse_key)
        print(62, 'chk_key', url)
        print(62, 'touse_key', touse_key)
        try:
            r = requests.get(url)
            json_ = r.json()
        except Exception as e:
            print(e)
            change_key()
            return
        infocode = json_['infocode']
        if infocode != INFOCODE_OK:
            # NOTE(review): the original sys.exit('NOInvalidKEY') guard tested
            # i == KEY_POOL_NUM_INDICATOR, which range() never yields; kept as
            # a plain retry.
            change_key()
        return
# 060101: shopping service / mall / shopping centre type codes
FILTER_GD_BUSINESS_AREA_TYPE_LIST = ['购物中心']


def fliter_gd_business_area_type(url):
    """Ask AMap whether the queried place sits in a shopping centre.

    Returns 1 when any returned POI's `type` field contains a tag from
    FILTER_GD_BUSINESS_AREA_TYPE_LIST, otherwise 0.  On a non-success
    infocode the API key is rotated before returning 0.
    """
    global FILTER_GD_BUSINESS_AREA_TYPE_LIST
    # sample reply: {"info":"OK","status":"1","count":"1","pois":[{...,
    #   "type":"餐饮服务;中餐厅;特色/地方风味餐厅",...}],"infocode":"10000"}
    try:
        r = requests.get(url)
        r_json = r.json()
    except Exception as e:
        print(e)
        print(195, url)
        return 0
    infocode = r_json['infocode']
    # BUG FIX: was `infocode == ''` -- no AMap reply ever matched, so every
    # call fell through to change_key().  Success infocode is '10000'.
    if infocode == '10000':
        count = r_json['count']
        if int(count) > 0:
            for poi in r_json['pois']:  # `l`/`type` renamed: `type` shadowed the builtin
                poi_type = poi['type']
                for chk_type in FILTER_GD_BUSINESS_AREA_TYPE_LIST:
                    if poi_type.find(chk_type) > -1:
                        return 1
    else:
        change_key()
    return 0
# f = open('MEITUAN_DAZHONG_20170704.csv', 'r', encoding='gbk')
# f_2 = open('MEITUAN_DAZHONG_20170705.csv', 'r', encoding='gbk')
# f_gd = open('GD_POI_KFC_MC.csv', 'r', encoding='utf-8-sig')
# nested result: {city: {district: {name: {address: {catering_kind, average_price, data_from}}}}}
cater_dic = {}
# rows that failed validation: [{city, district, address, name, catering_kind, average_price, data_from}]
cater_exception_list = []
count_catering = 0
count_catering_exception = 0
coffee_list = []
count_coffee = 0
fo = open('MEITUAN_DAZHONG_20170705 - 副本.csv', 'r')
file_line_list = []
for line in fo:
    file_line_list.append(line)
fo.close()  # BUG FIX: was `fo.closed`, a no-op attribute read
file_line_list_len = len(file_line_list)
file_jump_step_num = 10000  # rows handled by each worker thread
count_catering_exception = 0
count_coffee = 0
count_catering = 0


def get_exception_logic_split_loop(nloop):
    """Worker: process rows [nloop*step, nloop*step + step) of file_line_list.

    Validates each CSV row, diverts malformed rows to cater_exception_list and
    coffee shops to coffee_list, tags the address as in/out of a business area
    (falling back to an AMap query when the string match says "out"), and
    stores the row's details in the nested cater_dic.
    """
    print(247, nloop)
    global touse_key, cater_dic, file_line_list, file_line_list_len, file_jump_step_num, count_catering_exception, count_coffee, count_catering
    start_line = nloop * file_jump_step_num
    if start_line >= file_line_list_len:
        print('last-line')
        return
    start_line_count = 0
    end_line = start_line + file_jump_step_num
    if end_line >= file_line_list_len:
        # BUG FIX: was file_line_list_len - 1; range() is exclusive, so the
        # final row of the file was silently skipped.
        end_line = file_line_list_len
    for i in range(start_line, end_line, 1):
        l_ = file_line_list[i].replace('\n', '').split(',')
        city = l_[0]
        district = l_[1]
        address = l_[2]
        name = l_[3]
        average_price = l_[4]
        catering_kind = l_[5]
        data_from = 'mtdz_5'
        focus_list = [city, district, address, name, catering_kind, average_price, data_from]
        dic_exception = {
            'data_from': data_from,
            'city': city,
            'district': district,
            'name': name,
            'address': address,
            'catering_kind': catering_kind,
            'average_price': average_price,
        }
        # validation layer: divert malformed rows
        if not chk_list_thickness(focus_list) or not chk_city_district(city) \
                or not chk_city_district(district) or not chk_catering_kind(catering_kind):
            count_catering_exception += 1
            cater_exception_list.append(dic_exception)
            continue
        name = get_name(name)
        m = chk_is_coffee(name)
        if not m:
            m = chk_is_coffee(catering_kind)
        if m:
            count_coffee += 1
            coffee_list.append(dic_exception)
            continue
        dic_details = {
            'data_from': data_from,
            'catering_kind': catering_kind,
            'average_price': average_price,
        }
        if_in_business_area = chk_in_business_area(address)
        if_in_business_area_criterion = 'str_match'
        if if_in_business_area == 0:
            # string match said "not in a mall": double-check via AMap
            city_r = '&city=' + district
            keywords = '&keywords=' + address + '|' + name
            start_line_count += 1
            print(417, start_line, start_line_count)
            if start_line_count % QPS == 0:
                print('sleep')
                sleep(1)
            url = URL_TYPE + '?' + 'key=' + touse_key + RADIUS + keywords + city_r + CITYLIMIT
            if_in_business_area = fliter_gd_business_area_type(url)
            if_in_business_area_criterion = 'str_match+request_api'
        dic_details['if_in_business_area_criterion'] = if_in_business_area_criterion
        dic_details['if_in_business_area'] = if_in_business_area
        cater_dic.setdefault(city, {}).setdefault(district, {}) \
            .setdefault(name, {})[address] = dic_details
        count_catering += 1


class MyThread(threading.Thread):
    """Minimal thread wrapper: run() calls func(args)."""

    def __init__(self, func, args, name=''):
        threading.Thread.__init__(self)
        self.name = name
        self.func = func
        self.args = args

    def run(self):
        self.func(self.args)


def main():
    """Fan the input file out to one thread per chunk, join them all, then
    flatten cater_dic into ALL.csv."""
    print('starting at:', ctime())
    threads_list = []
    thread_sum = math.ceil(file_line_list_len / file_jump_step_num)
    print(thread_sum)
    # BUG FIX: was range(1, thread_sum), which never processed chunk 0
    # (the first file_jump_step_num rows of the file).
    for nloop in range(0, thread_sum, 1):
        print(nloop)
        thread_instance = MyThread(get_exception_logic_split_loop, (nloop), get_exception_logic_split_loop.__name__)
        print(353, '')
        threads_list.append(thread_instance)
    # non-daemon threads: the process only exits after all of them finish
    for t in threads_list:
        print(t)
        t.daemon = False  # BUG FIX: was `t.setDaemon = False`, clobbering the method instead of calling it
        t.start()
    # wait for all threads to finish
    for t in threads_list:
        t.join()
    print(467, cater_dic)
    f_name = 'ALL.csv'
    f = open(f_name, 'w', encoding='utf-8-sig')
    f.write('city, district, name, address, if_in_business_area, if_in_business_area_criterion,catering_kind, average_price, data_from\n')
    count_write_rows = 0
    for city in cater_dic:
        if city == '城市':
            # skip the ingested CSV header row
            continue
        for district in cater_dic[city]:
            for name in cater_dic[city][district]:
                for address in cater_dic[city][district][name]:
                    details = cater_dic[city][district][name][address]
                    row = '%s,%s,%s,%s,%s,%s,%s,%s,%s\n' % (
                        city, district, name, address,
                        details['if_in_business_area'],
                        details['if_in_business_area_criterion'],
                        details['catering_kind'],
                        details['average_price'],
                        details['data_from'])
                    f.write(row)
                    count_write_rows += 1
    f.close()  # BUG FIX: was `f.closed`, which never flushed/closed the handle
    print(count_write_rows)


if __name__ == '__main__':
    main()

while fo.readline() 少统计了?—— 是的:while 条件本身读掉一行,循环体又 append 下一行,每次迭代丢掉一半数据;应改用 for line in fo 逐行读取。

# -*- coding: UTF-8 -*-

import re
import pprint
import json
import time
import math
import sys
import requests
import threading
from time import ctime, sleep

# city -> list of town names for the two "zhitongzi" (district-less) cities.
ZHITONGZI_CITY_DIC = {}
f = open('直筒子市_东莞中山.txt', 'r', encoding='utf-8')
ZHITONGZI_CITY_DIC['东莞市'] = []
ZHITONGZI_CITY_DIC['中山市'] = []
c = 0
for i in f:
    ii = i.split(';')
    for iii in ii:
        iv = iii.split('、')
        if len(iv) > 2:
            c += 1
            # NOTE(review): indentation was lost in the source; towns are
            # processed only for runs of >2 items, the reading under which the
            # c == 1..4 bucketing below is reachable -- confirm against data.
            for v in iv:
                # strip one parenthesised annotation if present
                if v.find('(') > -1:
                    v_ = v.split('(')[1]
                elif v.find(')') > -1:
                    v_ = v.split(')')[0]
                else:
                    v_ = v
                # runs 1-2 of the file are Dongguan, runs 3-4 Zhongshan
                if c == 1 or c == 2:
                    ZHITONGZI_CITY_DIC['东莞市'].append(v_)
                elif c == 3 or c == 4:
                    ZHITONGZI_CITY_DIC['中山市'].append(v_)
f.close()  # BUG FIX: was `f.closed`, an attribute read that never closed the file


def chk_is_coffee(str):
    """Return True if a shop name / category string looks like a coffee shop."""
    latin_tags = ['coffee', 'coffe', 'cafe', 'café', 'starbucks', 'caffé']
    cjk_tags = ['咖啡', '星巴克']
    for tag in latin_tags:
        if str.upper().find(tag.upper()) != -1:
            return True
    for tag in cjk_tags:
        if str.find(tag) != -1:
            return True
    return False


def chk_kfc_mdl(str):
    """Classify a shop name: 1 = McDonald's, 0 = KFC, 2 = neither."""
    if str.find(u"麦当劳") != -1:
        return 1
    elif str.find(u"肯德基") != -1 or str.upper().find(u"KFC") != -1:
        return 0
    else:
        return 2


def get_name(str):
    """Normalize a shop name to its brand part.

    Maps KFC / McDonald's variants to canonical names and strips trailing
    branch parentheticals, e.g. '狗不理包子(前门店)' -> '狗不理包子' and
    '(清真)三羊水饺(新民路店)' -> '三羊水饺'.
    """
    if str.find("麦当劳") != -1:
        return '麦当劳'
    elif str.find("肯德基") != -1 or str.upper().find(u"KFC") != -1:
        return '肯德基'
    if str.find('(') == -1 and str.find('(') == -1:
        return str
    res = str.strip(' ').split('(')[0].strip(' ')
    if len(res) == 0:
        # name starts with a parenthetical: take the part after the first ')'
        try:
            res = str.split(')')[1].split('(')[0]
        except Exception as e:
            print(e)  # BUG FIX: was print(Exception), which printed the class object
    res_b = res
    try:
        res_b = res.split('(')[0]
    except Exception as e:
        print(e)
    return res_b


def chk_city_district(str):
    """Return the space-stripped city/district name, or False when the field
    contains Latin letters, digits or bracket characters (likely malformed)."""
    city_district = str.replace(' ', '')
    if re.match(r".*[a-zA-Z0-9]", city_district) is not None:
        return False
    elif str.find("[") != -1 or str.find("(") != -1 or str.find(")") != -1:
        return False
    else:
        return city_district


def chk_catering_kind(str):
    """Return the space-stripped category, or False if it contains digits."""
    catering_kind = str.replace(' ', '')
    if re.match(r".*[0-9]", catering_kind) is not None:
        return False
    else:
        return catering_kind


# e.g. ['a', '', ' '] -> False
def chk_list_thickness(list_):
    """Return the list with spaces stripped from every item, or False when the
    list is empty or any item is blank."""
    if len(list_) == 0:
        return False
    res_list = []
    for item in list_:
        item_ = item.replace(' ', '')
        if item_ == '':
            return False
        res_list.append(item_)
    return res_list


# Substrings that mark an address as inside a mall / office building.
business_area_tag_list = ['大厦', '大楼', '大厦', '百货', '购物中心', '商业中心', 'MALL', '广场', '商场', '单元', '栋', '座', '楼', '层', '底商']


def chk_in_business_area(str):
    """Return 1 if the address contains any business-area tag, else 0."""
    global business_area_tag_list
    for tag in business_area_tag_list:
        if str.find(tag) > -1:
            return 1
    return 0
# MAX_OFFSET = 25
# OFFSET = MAX_OFFSET - 1
MAX_PAGINATION = 100
pagination = MAX_PAGINATION
QPS = 50       # requests allowed per TIME_UNIT before self-throttling
TIME_UNIT = 1  # seconds
# Return-code reference: http://lbs.amap.com/api/webservice/guide/tools/info
INFOCODE_OK = '10000'  # AMap success infocode
file_name_key_pool = 'key_pool.pool'
KEY_POOL_LIST = []
touse_key = ''
f = open(file_name_key_pool, 'r', encoding='utf-8')
for line in f:
    # each line: "<label>\t<key>[ ...]" -- keep only the key token
    try:
        key = line.split('\t')[1].split()
        KEY_POOL_LIST.append(key[0])
    except Exception as e:
        print(e)  # was print(Exception): printed the class, not the error
f.close()
KEY_POOL_NUM_INDICATOR = len(KEY_POOL_LIST) - 1  # index of the last key
# sample row: 北京市 西城区 金堂羊蝎子火锅 真武庙四条1号
# http://restapi.amap.com/v3/place/around?parameters
URL_TYPE = 'http://restapi.amap.com/v3/place/text'
# URL_TYPE = 'http://restapi.amap.com/v3/around'
touse_key = ''
RADIUS = '&radius=20'
keywords = '&keywords='
OFFSET = '&offset=10'
CITYLIMIT = '&citylimit=true'
# Cheap probe query (types=060100, city=010/Beijing) used only to validate a key.
URL_FOR_CHANGE_KEY = 'http://restapi.amap.com/v3/place/text?key=%s&types=060100&city=010&OFFSET=1'


def change_key():
    """Switch the global `touse_key` to the first pool key different from the
    current one, validating it with a probe request; recurse on failure.

    Fixes over the original: the probe's r.json() runs inside the try (its
    result was read even after a request failure, where `r` was unbound), and
    the function returns after handling one candidate instead of continuing
    to rotate through the rest of the pool after a success.
    """
    global touse_key
    for i in range(0, KEY_POOL_NUM_INDICATOR, 1):
        key = KEY_POOL_LIST[i]
        if key == touse_key:
            continue
        touse_key = key
        url = URL_FOR_CHANGE_KEY % (touse_key)
        print(62, 'chk_key', url)
        print(62, 'touse_key', touse_key)
        try:
            r = requests.get(url)
            json_ = r.json()
        except Exception as e:
            print(e)
            change_key()
            return
        infocode = json_['infocode']
        if infocode != INFOCODE_OK:
            # NOTE(review): the original sys.exit('NOInvalidKEY') guard tested
            # i == KEY_POOL_NUM_INDICATOR, which range() never yields; kept as
            # a plain retry.
            change_key()
        return
# 060101: shopping service / mall / shopping centre type codes
FILTER_GD_BUSINESS_AREA_TYPE_LIST = ['购物中心']


def fliter_gd_business_area_type(url):
    """Ask AMap whether the queried place sits in a shopping centre.

    Returns 1 when any returned POI's `type` field contains a tag from
    FILTER_GD_BUSINESS_AREA_TYPE_LIST, otherwise 0.  On a non-success
    infocode the API key is rotated before returning 0.
    """
    global FILTER_GD_BUSINESS_AREA_TYPE_LIST
    # sample reply: {"info":"OK","status":"1","count":"1","pois":[{...,
    #   "type":"餐饮服务;中餐厅;特色/地方风味餐厅",...}],"infocode":"10000"}
    try:
        r = requests.get(url)
        r_json = r.json()
    except Exception as e:
        print(e)
        print(195, url)
        return 0
    infocode = r_json['infocode']
    if infocode == '10000':
        count = r_json['count']
        if int(count) > 0:
            for poi in r_json['pois']:  # `l`/`type` renamed: `type` shadowed the builtin
                poi_type = poi['type']
                for chk_type in FILTER_GD_BUSINESS_AREA_TYPE_LIST:
                    if poi_type.find(chk_type) > -1:
                        return 1
    else:
        change_key()
    return 0
# f = open('MEITUAN_DAZHONG_20170704.csv', 'r', encoding='gbk')
# f_2 = open('MEITUAN_DAZHONG_20170705.csv', 'r', encoding='gbk')
# f_gd = open('GD_POI_KFC_MC.csv', 'r', encoding='utf-8-sig')
# nested result: {city: {district: {name: {address: {catering_kind, average_price, data_from}}}}}
cater_dic = {}
# rows that failed validation: [{city, district, address, name, catering_kind, average_price, data_from}]
cater_exception_list = []
count_catering = 0
count_catering_exception = 0
coffee_list = []
count_coffee = 0
fo = open('MEITUAN_DAZHONG_20170705 - 副本.csv', 'r', encoding='gbk')
file_line_list = []
# BUG FIX: the original `while fo.readline(): file_line_list.append(fo.readline())`
# consumed one line in the loop condition and another in the body, silently
# dropping every other row (the undercount the author noticed).
for line in fo:
    file_line_list.append(line)
fo.close()
file_line_list_len = len(file_line_list)
file_jump_step_num = 5000  # rows handled by each worker thread
count_catering_exception = 0
count_coffee = 0
count_catering = 0


def get_exception_logic_split_loop(nloop):
    """Worker: process rows [nloop*step, nloop*step + step) of file_line_list.

    Validates each CSV row, diverts malformed rows to cater_exception_list and
    coffee shops to coffee_list, tags the address as in/out of a business area
    (falling back to an AMap query when the string match says "out"), and
    stores the row's details in the nested cater_dic.
    """
    print(247, nloop)
    global touse_key, cater_dic, file_line_list, file_line_list_len, file_jump_step_num, count_catering_exception, count_coffee, count_catering
    start_line = nloop * file_jump_step_num
    if start_line >= file_line_list_len:
        print('last-line')
        return
    start_line_count = 0
    end_line = start_line + file_jump_step_num
    if end_line >= file_line_list_len:
        # BUG FIX: was file_line_list_len - 1; range() is exclusive, so the
        # final row of the file was silently skipped.
        end_line = file_line_list_len
    for i in range(start_line, end_line, 1):
        l_ = file_line_list[i].replace('\n', '').split(',')
        city = l_[0]
        district = l_[1]
        address = l_[2]
        name = l_[3]
        average_price = l_[4]
        catering_kind = l_[5]
        data_from = 'mtdz_5'
        focus_list = [city, district, address, name, catering_kind, average_price, data_from]
        dic_exception = {
            'data_from': data_from,
            'city': city,
            'district': district,
            'name': name,
            'address': address,
            'catering_kind': catering_kind,
            'average_price': average_price,
        }
        # validation layer: divert malformed rows
        if not chk_list_thickness(focus_list) or not chk_city_district(city) \
                or not chk_city_district(district) or not chk_catering_kind(catering_kind):
            count_catering_exception += 1
            cater_exception_list.append(dic_exception)
            continue
        name = get_name(name)
        m = chk_is_coffee(name)
        if not m:
            m = chk_is_coffee(catering_kind)
        if m:
            count_coffee += 1
            coffee_list.append(dic_exception)
            continue
        dic_details = {
            'data_from': data_from,
            'catering_kind': catering_kind,
            'average_price': average_price,
        }
        if_in_business_area = chk_in_business_area(address)
        if_in_business_area_criterion = 'str_match'
        if if_in_business_area == 0:
            # string match said "not in a mall": double-check via AMap
            city_r = '&city=' + district
            keywords = '&keywords=' + address + '|' + name
            start_line_count += 1
            print(417, start_line, start_line_count)
            if start_line_count % QPS == 0:
                print('sleep')
                sleep(1)
            url = URL_TYPE + '?' + 'key=' + touse_key + RADIUS + keywords + city_r + CITYLIMIT
            if_in_business_area = fliter_gd_business_area_type(url)
            if if_in_business_area == 1:
                if_in_business_area_criterion = 'request_api'
        dic_details['if_in_business_area_criterion'] = if_in_business_area_criterion
        dic_details['if_in_business_area'] = if_in_business_area
        cater_dic.setdefault(city, {}).setdefault(district, {}) \
            .setdefault(name, {})[address] = dic_details
        count_catering += 1


class MyThread(threading.Thread):
    """Minimal thread wrapper: run() calls func(args)."""

    def __init__(self, func, args, name=''):
        threading.Thread.__init__(self)
        self.name = name
        self.func = func
        self.args = args

    def run(self):
        self.func(self.args)


def main():
    """Fan the input file out to one thread per chunk, join them all, then
    flatten cater_dic into ALL.csv."""
    print('starting at:', ctime())
    threads_list = []
    # BUG FIX: was math.floor(...) with range(1, thread_sum): chunk 0 and the
    # trailing partial chunk were never processed.
    thread_sum = math.ceil(file_line_list_len / file_jump_step_num)
    print(thread_sum)
    for nloop in range(0, thread_sum, 1):
        print(nloop)
        thread_instance = MyThread(get_exception_logic_split_loop, (nloop), get_exception_logic_split_loop.__name__)
        print(353, '123')
        threads_list.append(thread_instance)
    # non-daemon threads: the process only exits after all of them finish
    for t in threads_list:
        print(t)
        t.daemon = False  # BUG FIX: was `t.setDaemon = False`, clobbering the method instead of calling it
        t.start()
    # wait for all threads to finish
    for t in threads_list:
        t.join()
    print(467, cater_dic)
    f_name = 'ALL.csv'
    f = open(f_name, 'w', encoding='utf-8-sig')
    f.write('市,区,品牌名,地址,是否在商场,菜别(类型),均价,data_from,\n')
    count_write_rows = 0
    for city in cater_dic:
        if city == '城市':
            # skip the ingested CSV header row
            continue
        for district in cater_dic[city]:
            for name in cater_dic[city][district]:
                for address in cater_dic[city][district][name]:
                    details = cater_dic[city][district][name][address]
                    row = '%s,%s,%s,%s,%s,%s,%s,%s,%s\n' % (
                        city, district, name, address,
                        details['if_in_business_area'],
                        details['if_in_business_area_criterion'],
                        details['catering_kind'],
                        details['average_price'],
                        details['data_from'])
                    f.write(row)
                    count_write_rows += 1
    f.close()  # BUG FIX: was `f.closed`, which never flushed/closed the handle
    print(count_write_rows)


if __name__ == '__main__':
    main()

  

thread_process_action的更多相关文章

随机推荐

  1. JavaScript文件中; !function (win, undefined) {}(window);的意义

    +function (){}-function (){}!function (){}~function (){}(function (){})() 这种写法可以保证匿名函数立即运行且运行一次 传入的 ...

  2. 医院里的CR、DR、CT、磁共振、B超都是什么?

    转自 百度知道MR CT CR DR DSA X线 都是医学影像疾病诊断的一种. MRI 是磁共振影像检查,可以获得横断面,矢状面和冠状面的影像.空间分辩率好. CT 是一种X线诊断设备,是一种复杂的 ...

  3. 加密算法:DES、AES等

    指标:运算速度.安全性.资源消耗 对称加密算法(加解密密钥相同): 非对称算法(加密密钥和解密密钥不同): 散列算法比较: 对称与非对称算法比较: 算法选择(从性能和安全性综合) 对称加密: AES( ...

  4. 并发问题java

    两个重要的概念:同步和异步 同步,按照流程顺序一步一步的执行,等待获取单步的返回结果并执行下一步:发送方发出数据后,等接收方发回响应以后才发下一个数据包的通讯方式. 同步在一定程度上可以看做是单线程, ...

  5. 在WebStorm中使用editorConfig插件

    在webStorm中默认是支持editorConfig插件的,那么我们需要在webStorm中自定义editorConfig的配置怎么来做? 第一步:打开webStrome > File > ...

  6. Vue+elementui 实现复杂表头和动态增加列的二维表格

    先上完成的效果图:列是根据查询结果增加的 数据格式: 表头的数据取出: data.data.forEach(element => { this.thead.push({ 品名: element. ...

  7. 用Node.js原生代码实现静态服务器

    ---恢复内容开始--- 后端中服务器类型有两种 1. web服务器[ 静态服务器 ] - 举例: wamp里面www目录 - 目的是为了展示页面内容 - 前端: nginx 2. 应用级服务器[ a ...

  8. python常用模块学习1

    import time time.sleep(1)#暂停时间 time.time()#显示当前系统时间戳 t=time.localtime()#结构化当地时间,可以将结构化时间想象成一个类 print ...

  9. ASPOSE的示例下载地址

    ftp://112.124.7.170/ASPOSE/Aspose.Words_16.3.0.zip http://blog.163.com/haolongqin@126/blog/static/10 ...

  10. OGG replicat复制进程的拆分

    参考资料: 1.https://blog.csdn.net/datingting1/article/details/79583690