# (stray fragment from an editing accident; originally read:
#  api_res = r.data.decode('utf-8'); json_ = json.loads(api_res))
东莞市
东莞城市标志
东莞城市标志
1985年,广东省东莞县经国务院批准列为珠江三角洲经济开发区,同年9月撤消东莞县,建立(县级)东莞市,1988年1月升格为地级市。东莞市是全国五个不设市辖区的地级市之一,直属广东省。行政区划代码:441900,区号:0769 。截至2011年10月10日,东莞市辖32个镇(街道);其中包括4个街道(莞城街道、南城街道、东城街道、万江街道);28个镇(石碣镇、石龙镇、茶山镇、石排镇、企石镇、横沥镇、桥头镇、谢岗镇、东坑镇、常平镇、寮步镇、大朗镇、黄江镇、清溪镇、塘厦镇、凤岗镇、长安镇、 虎门镇、厚街镇、沙田镇、道滘镇、洪梅镇、麻涌镇、中堂镇、高埗镇、樟木头镇、大岭山镇、望牛墩镇)。
中山市
中山城市各标志
中山城市各标志
中山市是全国五个不设市辖区的地级市之一,行政区划代码442000,区号0760。中山市下辖6个街道、18个镇;镇(街道)下辖若干个行政村和社区;行政村下辖若干村民小组,社区下辖若干居民小组。镇办事机构为镇人民政府,街道办事机构为街道办事处;行政村办事机构为村民委员会,社区办事机构为社区居民委员会。
截至2011年9月1日,广东省中山市辖24个镇(街道);其中包括6个街道(石岐街道、东区街道、西区街道、南区街道、五桂山街道、中山港街道);18个镇(黄圃镇、南头镇、东凤镇、阜沙镇、小榄镇、东升镇、古镇镇、横栏镇、三角镇、民众镇、南朗镇、港口镇、大涌镇、沙溪镇、三乡镇、板芙镇、神湾镇、坦洲镇)。
# Parse the two "直筒子市" (district-less prefecture-level city) descriptions
# into ZHITONGZI_CITY_DIC = {city_name: [town/street names]}.
# The source text separates towns with 、 inside segments separated by ;.
# Counter `c` numbers the qualifying segments (those with more than two
# items): segments 1-2 belong to 东莞市, segments 3-4 to 中山市 — i.e. the
# order of the source file is relied upon.
ZHITONGZI_CITY_DIC = {'东莞市': [], '中山市': []}
c = 0
with open('直筒子市_东莞中山.txt', 'r', encoding='utf-8') as f:
    for line in f:
        for segment in line.split(';'):
            parts = segment.split('、')
            if len(parts) > 2:
                c += 1
                for part in parts:
                    # Strip surrounding brackets: keep the text after an
                    # opening "(", or before a closing ")"; otherwise as-is.
                    if part.find('(') > -1:
                        token = part.split('(')[1]
                    elif part.find(')') > -1:
                        token = part.split(')')[0]
                    else:
                        token = part
                    if c == 1 or c == 2:
                        ZHITONGZI_CITY_DIC['东莞市'].append(token)
                    elif c == 3 or c == 4:
                        ZHITONGZI_CITY_DIC['中山市'].append(token)
# NOTE(review): the original ended with `f.closed` — an attribute read, not a
# call — so the handle was never explicitly closed; `with` now closes it.
# -*- coding: UTF-8 -*-
import re # mac_detail_dic
# {mac:{city_list,district_list,name_list,mac_detail_dic_list:[mac_detail_dic]}}
count_file_line = 0      # total accepted (well-formed, 9-column) rows over all inputs
source_file_line_a = 0   # last line number consumed from file 1
source_file_line_b = 0   # last line number consumed from file 2


def main(file_, file_id):
    """Parse one wifi-scan CSV into the global indexes.

    Expected row: mac,ssid,num,type,name,city,district,longitude,latitude.
    Rows whose first column is not MAC-shaped are reported via print; rows
    with a column count other than 9 are silently skipped (as in the
    original). Populates mac_dic ({mac: {city_list, district_list, name_list,
    mac_detail_dic_list}}) and place_mac_dic ({city: {district: {name:
    [detail]}}}); file_id (1 or 2) is recorded as each detail's source.
    """
    global count_file_line, mac_dic, mac_num_dic, source_file_line_a, source_file_line_b
    if file_id == 1:
        source_file_line = source_file_line_a
    elif file_id == 2:
        source_file_line = source_file_line_b
    else:
        source_file_line = 0  # defensive: the original left the name unbound here
    for raw in file_:
        source_file_line += 1
        cols = raw.replace('\n', '').split(',')
        mac = cols[0].replace(' ', '')
        if re.match(r"\w{2}:\w{2}:\w{2}:\w{2}:\w{2}:\w{2}", mac):
            if len(cols) == 9:
                # strip full-width commas and quotes from every data column
                clean = [c.replace(',', '').replace('"', '') for c in cols[1:9]]
                ssid, num, type_, name, city, district, longitude, latitude = clean
                mac_detail_dic = {
                    'ssid': ssid, 'num': num, 'type': type_, 'name': name,
                    'city': city, 'district': district,
                    'longitude': longitude, 'latitude': latitude,
                    'source_file_line': source_file_line,
                    'source_file': file_id,
                }
                count_file_line += 1
                if mac not in mac_dic:
                    mac_dic[mac] = {'city_list': [], 'district_list': [],
                                    'name_list': [], 'mac_detail_dic_list': []}
                if city not in mac_dic[mac]['city_list']:
                    mac_dic[mac]['city_list'].append(city)
                if district not in mac_dic[mac]['district_list']:
                    mac_dic[mac]['district_list'].append(district)
                if name not in mac_dic[mac]['name_list']:
                    mac_dic[mac]['name_list'].append(name)
                mac_dic[mac]['mac_detail_dic_list'].append(mac_detail_dic)
                if city not in place_mac_dic:
                    place_mac_dic[city] = {}
                if district not in place_mac_dic[city]:
                    place_mac_dic[city][district] = {}
                if name not in place_mac_dic[city][district]:
                    place_mac_dic[city][district][name] = []
                mac_detail_dic['mac'] = mac
                place_mac_dic[city][district][name].append(mac_detail_dic)
        else:
            print('mac字符串格式错误', mac)
    # Bug fix: the original declared the per-file counters global but never
    # wrote the advanced value back, so they always stayed 0.
    if file_id == 1:
        source_file_line_a = source_file_line
    elif file_id == 2:
        source_file_line_b = source_file_line


f = open('mac_1.csv', 'r', encoding='utf8')
f_2 = open('mac_2.csv', 'r', encoding='utf8')
# {mac: {city_list, district_list, name_list, mac_detail_dic_list: [detail]}}
mac_dic = {}
# {occurrence_count: {'mac_num': number of macs with that count, 'mac_list': [mac]}}
mac_num_dic = {}
# {city: {district: {name: [detail]}}}
place_mac_dic = {}
main(f, 1)
main(f_2, 2)
# Bucket macs by how many detail rows each produced.
for mac in mac_dic:
    num = len(mac_dic[mac]['mac_detail_dic_list'])
    if num not in mac_num_dic:
        mac_num_dic[num] = {'mac_num': 0, 'mac_list': []}
    mac_num_dic[num]['mac_num'] += 1
    mac_num_dic[num]['mac_list'].append(mac)
# 统计mac出现次数占数据总量的比值 — report, per occurrence count, the share
# of all rows (expressed per 10 000 rows). Renamed the accumulator from
# `str`, which shadowed the builtin.
summary = ''
for times in sorted(mac_num_dic, reverse=False):
    summary += '%s:%s /10K, \n ' % (times, mac_num_dic[times]['mac_num'] / count_file_line * 10000)
print(summary)
# 同一mac出现的市-区-name列表——不在同一市,即标注为异常mac
# macs seen in more than one city or district get flagged as questionable.
mac_questionable_list = []
FNAME_GEN = '标记地理位置异常的mac.csv'
# Checks performed below:
# 同一Mac地址出现在相同商圈,但SSID不同
# 同一Mac地址出现在不同城市的不同商圈
# 同一Mac地址出现在相同城市、相同区域的不同商圈
# 同一Mac地址出现在相同城市、不同区域的不同商圈
chk_diff_city_mac_list = []
chk_one_city_diff_district_mac_list = []
chk_one_district_diff_name_mac_list = []
chk_one_name_diff_ssid_mac_list = []
chk_res_dic = {}
# ---- classify each mac by geographic consistency --------------------------
for mac in mac_dic:
    d = mac_dic[mac]
    len_city = len(d['city_list'])
    len_district = len(d['district_list'])
    len_name = len(d['name_list'])
    len_details = len(d['mac_detail_dic_list'])
    if len_city > 1:
        chk_diff_city_mac_list.append(mac)
    if len_city == 1 and len_district > 1:
        chk_one_city_diff_district_mac_list.append(mac)
    if len_city == 1 and len_district == 1 and len_name > 1:
        chk_one_district_diff_name_mac_list.append(mac)
    if len_city == 1 and len_district == 1 and len_name == 1 and len_details > 1:
        chk_one_name_diff_ssid_mac_list.append(mac)
chk_res_dic['chk_diff_city_mac_list'] = chk_diff_city_mac_list
chk_res_dic['chk_one_city_diff_district_mac_list'] = chk_one_city_diff_district_mac_list
chk_res_dic['chk_one_district_diff_name_mac_list'] = chk_one_district_diff_name_mac_list
chk_res_dic['chk_one_name_diff_ssid_mac_list'] = chk_one_name_diff_ssid_mac_list

# ---- one CSV per category (category name is the file name) ----------------
# Original reopened the file per row and never closed it; one handle each now.
for k in chk_res_dic:
    with open(k + '.csv', 'w', encoding='utf-8-sig') as f_gen:
        f_gen.write('mac, city, district, name, type, ssid, longitude, latitude, source_file_line,source_file\n')
        for mac in chk_res_dic[k]:
            for dd in mac_dic[mac]['mac_detail_dic_list']:
                f_gen.write('%s,%s,%s,%s,%s,%s,%s,%s,%s,%s\n' % (
                    mac, dd['city'], dd['district'], dd['name'], dd['type'],
                    dd['ssid'], dd['longitude'], dd['latitude'],
                    dd['source_file_line'], dd['source_file']))

# ---- mark geographically questionable macs --------------------------------
# (The per-mac summary write that used these flags was already commented out
# in the original; only the list is populated. Dead `d = 4` marker removed.)
for mac in mac_dic:
    d = mac_dic[mac]
    if len(d['city_list']) > 1 or len(d['district_list']) > 1:
        if mac not in mac_questionable_list:
            mac_questionable_list.append(mac)


def _questionable(mac):
    # 1 if this mac was flagged as appearing in >1 city or district.
    return 1 if mac in mac_questionable_list else 0


FNAME_GEN_B = '市-区-name-mac.csv'
FNAME_GEN_C = 'mac_市-区-name-ssid.csv'

# NOTE(review): this first pass over FNAME_GEN_C is written with a header
# that does not match its rows, and the whole file is truncated and rewritten
# further below; it is kept only to reproduce the original behaviour.
with open(FNAME_GEN_C, 'w', encoding='utf-8-sig') as f_gen:
    f_gen.write('mac,questionable,city_list,district_list,name_list\n')
    for mac in mac_dic:
        for i in mac_dic[mac]['mac_detail_dic_list']:
            f_gen.write('%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s\n' % (
                i['city'], i['district'], i['name'], i['type'], i['mac'],
                _questionable(i['mac']), i['ssid'], i['longitude'],
                i['latitude'], i['source_file_line'], i['source_file']))

# place-keyed output: every detail row grouped by city/district/name
with open(FNAME_GEN_B, 'w', encoding='utf-8-sig') as f_gen:
    f_gen.write('city,district,name,type,mac,questionable,ssid,longitude,latitude,source_file_line,source_file\n')
    for city in place_mac_dic:
        for district in place_mac_dic[city]:
            for name in place_mac_dic[city][district]:
                for i in place_mac_dic[city][district][name]:
                    f_gen.write('%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s\n' % (
                        city, district, i['name'], i['type'], i['mac'],
                        _questionable(i['mac']), i['ssid'], i['longitude'],
                        i['latitude'], i['source_file_line'], i['source_file']))

# final mac-keyed output (overwrites the first FNAME_GEN_C pass)
with open(FNAME_GEN_C, 'w', encoding='utf-8-sig') as f_gen:
    f_gen.write('mac, questionable, ssid,city, district, name, type, longitude, latitude, source_file_line, source_file\n')
    for mac in mac_dic:
        for i in mac_dic[mac]['mac_detail_dic_list']:
            f_gen.write('%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s\n' % (
                i['mac'], _questionable(i['mac']), i['ssid'], i['city'],
                i['district'], i['name'], i['type'], i['longitude'],
                i['latitude'], i['source_file_line'], i['source_file']))
# for mac in mac_num_dic:
# times = mac_num_dic[mac]
# if times in mac_num_dic_count:
# mac_num_dic_count[times]['num'] += 1
# mac_num_dic_count[times]['mac_list'].append(mac)
# else:
# dic_ = {}
# dic_['num'] = 1
# dic_['mac_list'] = []
# dic_['mac_list'].append(mac)
# mac_num_dic_count[times] = dic_ # -------write
# f = open('mac_appear_times_sorted.csv', 'a', encoding='utf-8-sig')
# str = 'mac,mac出现次数的类别,该类别下mac数,该类别下mac数,mac数与csv总数据的比值,mac数与csv总数据中不同的mac数的比值\n'
# f.write(str)
# # 中文 NotePad++ BOM
# f_summary = open('mac_appear_times_sorted_summary.csv', 'a', encoding='utf-8-sig')
# # str_summary = 'mac_appear_times_category,category_count,count/csv_whole_data,category_count/discount_mac_count\n'
# str_summary = 'mac出现次数的类别,该类别下mac数,mac数与csv总数据的比值,mac数与csv总数据中不同的mac数的比值\n'
# f_summary.write(str_summary)
#
# mac_num_dic_count_sorted_list = sorted(mac_num_dic_count.items(), key=lambda d: d[1]['num'], reverse=False)
# for i in mac_num_dic_count_sorted_list:
# times = i[0]
# num = i[1]['num']
# str_summary = '%i,%i,%f,%f\n' % (times, num, num / count_mac, num / distinct_mac_num)
# f_summary.write(str_summary)
# mac_list = i[1]['mac_list']
# for mac in mac_list:
# str = '%s,%i,%i,%f,%f\n' % (mac, times, num, num / count_mac, num / distinct_mac_num)
# f.write(str)
# f_summary.closed
# f.closed
#
# OUR_OFFICE_SSID = ['SW_MobilePhone', 'SW_MobilePhone2', 'SW-Guest', 'kmz', 'lamciu', 'lamciu_5G', 'SW_MobilePhone_5G',
# 'wyf的iMac', 'my3579', 'my3579_5G']
#
# f_our_office = open('mac_our_judgefrom_ssid.csv', 'a', encoding='utf-8-sig')
# for mac in mac_dic:
# for i in mac_dic[mac]:
# ssid = i[mac]['ssid']
# for our_ssid in OUR_OFFICE_SSID:
# if ssid == our_ssid:
# num = i[mac]['num']
# type = i[mac]['type']
# name = i[mac]['name']
# city = i[mac]['city']
# district = i[mac]['district']
# longitude = i[mac]['longitude']
# latitude = i[mac]['latitude']
# str = '%s,%s,%s,%s,%s,%s,%s,%s,%s\n' % (
# mac, ssid, num, type, name, city, district, longitude, latitude)
# f_our_office.write(str)
# f_our_office.closed
#
# d = 5
# NOTE (author's remark, commented out so the file stays parseable):
# 另外,希望你考虑一个问题,"处理数据的时候,某些场景,试着在一个全数据表中处理,可以加列(比如,加二元列,'是、否'),而不要生成'子表',以方便数据逻辑回溯、逻辑综合校验、及时调错、优化"
# (i.e. prefer adding flag columns to one full table over spawning sub-tables,
#  so data lineage, cross-checking, debugging and tuning stay easy.)
# -*- coding: UTF-8 -*-
# Mall-list dedup: merges the JMTool mall export with an external (Amap) dump,
# flags suspected duplicates per city/district, and writes one combined CSV.
import urllib3
import json
import time
import math
import threading
import sys
import re

start_time = time.time()
print(start_time)
print(time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(start_time)))

# Active input/output file names (historical commented variants dropped).
FNAME_JVM = '【JMTool】商场列表_1442条_20170710SIMPLE.csv'
FNAME_EXTERNALDATA = '商场_externaldata.csv'
f_name_teamtask = '副本执行组负责区域城市明细_20170708.csv'
FNAME_GEN = '商场列表_JMTool1442_28除北上广深_已去重.csv'
FNAME_GEN_ONLY_API = '商场列表_JMTool1442_28除北上广深_待手检.csv'

# teamtask: {important_flag: [city]}; CITY_28 collects the "重点" cities,
# which are then filtered against the four first-tier cities in CITY_4.
teamtask_dic = {}
f = open(f_name_teamtask, 'r', encoding='utf-8-sig')
CITY_28 = []
CITY_4 = ['深圳市', '广州市', '北京市', '上海市']
for i in f:
    list_ = i.replace('\n', '').split(',')
    try:
        important_flag = list_[4]
        if important_flag in ['备注', '暂不做']:
            continue
        city = list_[2]
        if city.find('市') == -1:
            city = city + '市'
        if important_flag not in teamtask_dic:
            teamtask_dic[important_flag] = []
        teamtask_dic[important_flag].append(city)
        if city not in CITY_28 and important_flag == '重点':
            CITY_28.append(city)
    except Exception as e:
        # bug fix: the original printed the Exception class, not the error
        print(e)

# All Dongguan town names (Dongguan is district-less) for district inference.
DONGGUAN_DISTRICT_LIST = []
f = open('东莞行政区域.txt', 'r', encoding='utf-8')
for i in f:
    for town in i.replace('。', '').replace('\n', '').split('、'):
        DONGGUAN_DISTRICT_LIST.append(town)


def gen_dongguan_district(address):
    """Return the Dongguan town whose name stem occurs in *address*.

    The stem is the town name minus its last character (e.g. '虎门镇' ->
    '虎门'); returns '' when no town matches. Parameter renamed from `str`,
    which shadowed the builtin (all call sites pass positionally).
    """
    for district in DONGGUAN_DISTRICT_LIST:
        if address.find(district[:-1]) > -1:
            return district
    return ''


# {city: {district: {name: [record]}}} for the JMTool export
jm_dic = {}
counter_jm_a = 0
counter_jm_b = 0
counter_jm_c = 0

# Duplicate patterns the cross-check must catch (within the same 市-区):
#   A- 壹街购物中心B区 / 壹街购物中心A区  — same address, name variants
#   B- 新城市广场 / 百盛(西大街店)       — one address contains the other
#   C- 燕莎(和信时尚商城) twice          — identical name, floor-level address variants
#   D- 全都超市 / 全都(临潼北环路店)     — shared name stem, different address
# 交叉判断: 名/地址 交叉综合判断   # {city:district:name:[dic]}
def todo_if_distinct(city_district_dic, name_, dic_, split_str_list=('(', '(', '-')):
    """Flag suspected duplicates of (name_, dic_) inside one city+district bucket.

    city_district_dic: {name: [record]}; each record carries an 'address' and
    a 'todo_if_distinct' flag (0/1). Sets the flag to 1 (mutates in place,
    returns nothing) for:
      * every record of any name that has more than one record;
      * records whose name contains the stem of name_ (name_ cut at the first
        '(' / '(' / '-'), plus dic_ itself;
      * records whose address contains, or is contained in, dic_'s address stem.
    split_str_list was a mutable default list in the original; now a tuple.
    """
    # any name listed more than once is automatically suspect
    for name in city_district_dic:
        dic_list = city_district_dic[name]
        if len(dic_list) > 1:
            for one in dic_list:
                one['todo_if_distinct'] = 1
    for split_str in split_str_list:
        # --- name stem comparison ---
        name_sub = name_.split(split_str)[0]
        name_list = sorted(city_district_dic, reverse=True)
        for one_name in name_list:
            if one_name.find(name_sub) > -1 and one_name != name_:
                for one in city_district_dic[one_name]:
                    one['todo_if_distinct'] = 1
                dic_['todo_if_distinct'] = 1
        if name_sub != name_:
            # the bare stem itself may exist as a name
            if name_sub in name_list:
                for one in city_district_dic[name_sub]:
                    one['todo_if_distinct'] = 1
                dic_['todo_if_distinct'] = 1
            # NOTE(review): kept from the original — re-sorting ascending and
            # re-testing membership is redundant (membership does not depend
            # on order) but harmless.
            name_list = sorted(city_district_dic, reverse=False)
            if name_sub in name_list:
                for one in city_district_dic[name_sub]:
                    one['todo_if_distinct'] = 1
                dic_['todo_if_distinct'] = 1
        # --- address stem comparison ---
        addres_sub = dic_['address'].split(split_str)[0]
        for name in city_district_dic:
            if name != name_:
                for one in city_district_dic[name]:
                    chk_dic_address = one['address']
                    if addres_sub.find(chk_dic_address) > -1 or chk_dic_address.find(addres_sub) > -1:
                        dic_['todo_if_distinct'] = 1
                        one['todo_if_distinct'] = 1
def todo_if_distinct_whole_reverse(whole_dic):
    """Run todo_if_distinct over every (city, district) bucket twice — with
    names iterated descending, then ascending — so the order-sensitive stem
    matching inside todo_if_distinct sees both directions.
    Returns the same dict (records are mutated in place)."""
    for city in whole_dic:
        for district in whole_dic[city]:
            for name in sorted(whole_dic[city][district], reverse=True):
                for dic_ in whole_dic[city][district][name]:
                    todo_if_distinct(whole_dic[city][district], name, dic_)
            for name in sorted(whole_dic[city][district], reverse=False):
                for dic_ in whole_dic[city][district][name]:
                    todo_if_distinct(whole_dic[city][district], name, dic_)
    return whole_dic


def dic_cp_field(dic_, source_key='todo_if_distinct', new_key='todo_if_distinct_self'):
    """Copy source_key to new_key on every leaf record of a
    {city: {district: {name: [record]}}} structure; returns dic_."""
    for city in dic_:
        for district in dic_[city]:
            for name in dic_[city][district]:
                for d in dic_[city][district][name]:
                    d[new_key] = d[source_key]
    return dic_


def dic_flag_field(dic_, flag_='JMTool', new_key='from_flag'):
    """Stamp every leaf record with new_key=flag_ (data provenance); returns dic_."""
    for city in dic_:
        for district in dic_[city]:
            for name in dic_[city][district]:
                for d in dic_[city][district][name]:
                    d[new_key] = flag_
    return dic_


def gen_sorted_namelist(whole_dic):
    """Rebuild {city: {district: {name: [record]}}} with each bucket's names
    inserted in descending sort order (dict insertion order is preserved);
    record objects are reused, not copied. Names with empty record lists are
    dropped, matching the original's flag-guarded rebuild."""
    res = {}
    for city in whole_dic:
        for district in whole_dic[city]:
            for name in sorted(whole_dic[city][district], reverse=True):
                for dic_ in whole_dic[city][district][name]:
                    res.setdefault(city, {}).setdefault(district, {}).setdefault(name, []).append(dic_)
    return res


cityclass_list = ['A', 'B', 'C']
# Parse the JMTool export (SIMPLE layout: col 1 type, 2 name, 3 city,
# 5 district, 6/7 address). The original kept a commented parser for the
# full 14+-column layout; dropped here. Rows from CITY_4 or outside CITY_28
# are skipped, as are short rows and rows with a too-short address.
f = open(FNAME_JVM, 'r', encoding='gbk')
sourcefile_line = 0
for i in f:
    sourcefile_line += 1
    l_ = i.replace('\n', '').replace('"', '').replace('"', '').replace(' ', '').split(',')
    city = l_[3].replace(',', '').replace('"', '')
    if city in CITY_4:
        continue
    if city not in CITY_28:
        continue
    if len(l_) < 8:
        print(l_)
        continue
    # NOTE(review): `type` intentionally keeps its original (builtin-shadowing)
    # name — later top-level code reads the leaked module-level binding.
    type = l_[1].replace(',', '').replace('"', '')
    name = l_[2].replace(',', '').replace('"', '')
    district = l_[5]
    address = l_[6] if len(l_[6]) > 0 else l_[7]
    if len(address) < 2:
        print(232, address)
        continue
    dic_ = {
        'type': type.replace(',', ''),
        'address': address.replace(',', '').replace('"', '').replace('、', ''),
        'sourcefile_line': sourcefile_line,
        'todo_if_distinct': 0,
    }
    if city not in jm_dic:
        jm_dic[city] = {}
    if district not in jm_dic[city]:
        jm_dic[city][district] = {}
    if name not in jm_dic[city][district]:
        jm_dic[city][district][name] = []
    jm_dic[city][district][name].append(dic_)
jm_dic = gen_sorted_namelist(jm_dic)
jm_dic = todo_if_distinct_whole_reverse(jm_dic)
jm_dic = dic_cp_field(jm_dic)
jm_dic = dic_flag_field(jm_dic, flag_='JMTool')
# {city: {district: {name: [record]}}} built from the external (Amap) dump.
# Source columns: name type tel locationx locationy addr province city
# citycode district street adcode number typecode address gpsx gpsy bdx bdy
api_dic = {}
# Only POIs of this exact type are kept (earlier runs used wider lists).
EXTERNALDATA_NEEDED_TYPE_ONLY = '购物服务;商场;购物中心'
f = open(FNAME_EXTERNALDATA, 'r', encoding='utf-8')
# NOTE(review): starts at 1 (not 0) so the first data row is numbered 2 —
# presumably accounting for a header line; confirm against the dump.
sourcefile_line = 1
MUNICIPALITY_LIST = ['北京市', '上海市', '重庆市', '天津市']
for i in f:
    sourcefile_line += 1
    l_ = i.replace('\n', '').split(',')
    province = l_[6]
    city = l_[7]
    if province in MUNICIPALITY_LIST:
        city = province  # municipalities report the province as the city
    if city in CITY_4:
        continue
    if city not in CITY_28:
        continue
    type_list = l_[1].split('|')
    if EXTERNALDATA_NEEDED_TYPE_ONLY not in type_list:
        continue
    type = EXTERNALDATA_NEEDED_TYPE_ONLY
    district = l_[9]
    name = l_[0].replace('"', '')
    address = l_[14].replace('"', '')
    if city == '东莞市' and district.find('[]') > -1:
        # Dongguan has no districts in the dump; infer the town from the address.
        district = gen_dongguan_district(address)
    if district == '':
        continue
    if len(address) < 3:
        continue
    dic_ = {
        'sourcefile_line': sourcefile_line,
        'address': address,
        'todo_if_distinct': 0,
        'type': type,
    }
    if city not in api_dic:
        api_dic[city] = {}
    if district not in api_dic[city]:
        api_dic[city][district] = {}
    if name not in api_dic[city][district]:
        api_dic[city][district][name] = []
    api_dic[city][district][name].append(dic_)
api_dic = gen_sorted_namelist(api_dic)
api_dic = todo_if_distinct_whole_reverse(api_dic)
api_dic = dic_cp_field(api_dic)
api_dic = dic_flag_field(api_dic, flag_='UdiskGD')
# Merge the JMTool records into the external-data index.
# NOTE(review): distinct_dic aliases api_dic (no copy), so api_dic is mutated.
distinct_dic = api_dic
for city in jm_dic:
    if city not in distinct_dic:
        distinct_dic[city] = jm_dic[city]
        continue
    for district in jm_dic[city]:
        if district not in distinct_dic[city]:
            distinct_dic[city][district] = jm_dic[city][district]
            continue
        for name in jm_dic[city][district]:
            if name not in distinct_dic[city][district]:
                # adopt the whole name bucket (the original assigned an empty
                # list first and immediately overwrote it — redundant, removed)
                distinct_dic[city][district][name] = jm_dic[city][district][name]
            else:
                # same city/district/name in both sources: tag the JM records
                # and append them beside the external ones
                for i in jm_dic[city][district][name]:
                    i['both'] = 1
                    i['sourcefile_line'] = 'jm' + str(i['sourcefile_line'])
                    distinct_dic[city][district][name].append(i)
distinct_dic = todo_if_distinct_whole_reverse(distinct_dic)
# Write the combined ALL file: one row per merged record with provenance and
# duplicate flags. (The original's `d = 8` debug marker is dropped; the file
# is opened once instead of being reopened per row via a never-called
# `f_gen_all.closed`.)
FNAME_GEN_ALL = '商场列表_JMTool1442_28除28除北上广深_ALL.csv'
with open(FNAME_GEN_ALL, 'w', encoding='utf-8-sig') as f_gen_all:
    f_gen_all.write('来源,综合重复,自重复,是否上传,备注,city,district, name,address, type,行号\n')
    for city in distinct_dic:
        for district in distinct_dic[city]:
            for name in distinct_dic[city][district]:
                for dic_ in distinct_dic[city][district][name]:
                    # Bug fix: the original had `type = dic_['type']` commented
                    # out and wrote whatever module-level `type` was left over
                    # from the parsing loops, so every row carried the same
                    # type value. Each record's own type is written now.
                    row_type = dic_['type'].replace(',', '、')
                    address = dic_['address'].replace(',', '、')
                    from_flag = 'BOTH' if 'both' in dic_ else dic_['from_flag']
                    f_gen_all.write('%s,%s,%s,0,,%s,%s,%s,%s,%s,%s\n' % (
                        from_flag, dic_['todo_if_distinct'],
                        dic_['todo_if_distinct_self'], city, district, name,
                        address, row_type, dic_['sourcefile_line']))
end_time = time.time()
print(end_time)
print(time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(end_time)))
print('start_time-end_time', start_time - end_time)
# -*- coding: UTF-8 -*-
# Amap (高德) place-text crawler: pulls mall POIs per team-task city and
# writes them to one CSV, rotating API keys when a key's quota is rejected.
import urllib3
import json
import time
import math
import sys
import re

start_time = time.time()
print(start_time)
print(time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(start_time)))

MAX_OFFSET = 25
OFFSET = MAX_OFFSET - 1     # superseded by OFFSET = 23 below (kept as in original)
MAX_PAGINATION = 100
pagination = MAX_PAGINATION
QPS = 50                    # request budget per sleep window
TIME_UNIT = 1               # seconds slept by the throttle
# http://lbs.amap.com/api/webservice/guide/tools/info
INFOCODE_OK = '10000'
HTTP = urllib3.PoolManager()

f_name_teamtask = '副本执行组负责区域城市明细_20170708.csv'
f_name_gdapi_place_type_subset = 'gdapi_place_type_购物服务_真子集.text'
f_name_gdapi_city_code = '高德地图API城市编码对照表forcsv.csv'
f_name_gen = '商场名单列表DATAfrom高德api购物服务_副本执行组负责区域城市明细_20170708.csv'
# v003 supersedes the name above (both kept, as in the original)
f_name_gen = '商场名单列表DATAfrom高德api购物服务_副本执行组负责区域城市明细_20170708_v003.csv'
# {city_district: {'district': code, 'city': code}}
gdapi_city_district_code_dic = {}

# API keys: one per line, key in the second tab-separated column.
file_name_key_pool = 'key_pool.pool'
KEY_POOL_LIST = []
touse_key = ''
f = open(file_name_key_pool, 'r', encoding='utf-8')
for i in f:
    try:
        key = i.split('\t')[1].split()
        KEY_POOL_LIST.append(key[0])
    except Exception as e:
        # bug fix: the original printed the Exception class, not the error
        print(e)
KEY_POOL_NUM_INDICATOR = len(KEY_POOL_LIST) - 1

# e.g. http://restapi.amap.com/v3/place/text?key=...&types=060100|...&city=北京
BASE_URL = 'http://restapi.amap.com/v3/place/text?key=%s&page=%s&offset=%s&city=%s&types=%s'
# Amap notes (2017-07-13): `count` caps at 1000; page=0 equals page=1; offsets
# above 25 may cause request errors, hence:
OFFSET = 23
# Quota (original observation): personal keys 1000 req/day @ 50 QPS, enterprise
# 400k/day @ 300 QPS; in practice Amap kept answering INFOCODE_OK past the
# quota, so key rotation is driven purely by the returned infocode.
# Cheap probe URL used only to test whether a key is still accepted.
URL_FOR_CHANGE_KEY = 'http://restapi.amap.com/v3/place/text?key=%s&types=060100&city=010&OFFSET=1'


def change_key():
    """Rotate the global `touse_key` to the next key Amap still accepts.

    Bug fixes vs. the original: it looped over range(0, KEY_POOL_NUM_INDICATOR)
    so the last key was never tried and its `i == KEY_POOL_NUM_INDICATOR`
    exhaustion test could never fire; it recursed on failure instead of trying
    the next key, and never returned early on success. Now every key is probed
    once, the function returns on the first INFOCODE_OK, and the process exits
    when no key works.
    """
    global touse_key
    for key in KEY_POOL_LIST:
        if key == touse_key:
            continue  # skip the key that just failed
        touse_key = key
        url = URL_FOR_CHANGE_KEY % (touse_key)
        print(62, 'chk_key', url)
        print(62, 'touse_key', touse_key)
        r = HTTP.urlopen('GET', url, redirect=False)
        json_ = json.loads(r.data.decode('utf-8'))
        if json_['infocode'] == INFOCODE_OK:
            return
    sys.exit('NOInvalidKEY')


def conv_api_res_tofile_strlist(json_, add_filefield=''):
    """Convert one Amap place-text response dict into a list of CSV row strings.

    Skips POIs whose address is a dict (Amap returns {} for a missing
    address), whose adname contains a digit, or whose id is not a 10-char
    [0-9A-Z] code; commas inside fields are replaced so the CSV stays
    one-value-per-column. add_filefield is written as the first column.
    """
    strlist = []
    for poi in json_['pois']:
        if isinstance(poi['address'], dict):
            continue  # {} has no .replace(); drop the record
        address = poi['address'].replace(',', '/')
        adname = poi['adname']
        if re.match(r'.{0,}\d.{0,}', adname) is not None:
            print(adname)
            continue
        if re.match(r'[0-9A-Z]{10}', poi['id']) is None:
            print(poi['id'])
            continue
        location = poi['location'].replace(',', ' ')
        strlist.append('%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s\n' % (
            add_filefield, poi['pname'], poi['cityname'], poi['name'],
            poi['type'], poi['typecode'], address, adname, location,
            poi['id'], poi['shopinfo']))
    return strlist


# Load the city/district code table: col 0 combined name, col 1 district
# code, col 2 city code.
f = open(f_name_gdapi_city_code, 'r', encoding='gbk')
for i in f:
    list_ = i.replace('\n', '').split(',')
    try:
        gdapi_city_district_code_dic[list_[0]] = {
            'district': list_[1],
            'city': list_[2],
        }
    except Exception as e:
        print(e)
def gen_district_code_list(city_zh):
    """Return all Amap district codes whose entry shares city_zh's city code.

    Returns False when city_zh is unknown. NOTE(review): callers apply
    `len(...)` to the result, which would raise on False — they are protected
    only by checking membership in gdapi_city_district_code_dic first.
    The print() calls are the original's progress/debug output, kept as-is.
    """
    print(140, city_zh)
    global gdapi_city_district_code_dic
    if city_zh not in gdapi_city_district_code_dic:
        return False
    city_code = gdapi_city_district_code_dic[city_zh]['city']
    print(146, city_code)
    codes = []
    for entry in gdapi_city_district_code_dic:
        print(149, entry)
        dic_ = gdapi_city_district_code_dic[entry]
        if dic_['city'] == city_code:
            codes.append(dic_['district'])
    return codes


d = 9  # original debug marker (unused)
# teamtask: {important_flag: [city]} — same parse as in the dedup script.
teamtask_dic = {}
f = open(f_name_teamtask, 'r', encoding='utf-8-sig')
for i in f:
    list_ = i.replace('\n', '').split(',')
    try:
        important_flag = list_[4]
        if important_flag in ['备注', '暂不做']:
            continue
        city = list_[2]
        if city.find('市') == -1:
            city = city + '市'
        if important_flag not in teamtask_dic:
            teamtask_dic[important_flag] = []
        teamtask_dic[important_flag].append(city)
    except Exception as e:
        # bug fix: the original printed the Exception class, not the error
        print(e)

# Build the '|'-joined place-type code string from the subset file …
gdapi_place_type_code_str = ''
f = open(f_name_gdapi_place_type_subset, 'r', encoding='utf-8-sig')
for i in f:
    try:
        gdapi_place_type_code_str += i.split('\t')[0] + '|'
    except Exception as e:
        print(e)
gdapi_place_type_code_str = gdapi_place_type_code_str[:-1]  # drop trailing '|'
# … then hard-override it, as the original did (the file-derived value is
# discarded; kept to preserve behaviour).
gdapi_place_type_code_str = '060100|060101|060102|060103'
# http://restapi.amap.com/v3/place/text?key=7eca72ff53f467e17faf9cf2d7fa6cc7&types=060000|060100|060101|060102|060103|060200&city=北京
#
def TOMODIFY_qps_sleep(request_count, info=''):
    """Crude QPS throttle: after every QPS-th request, print progress and
    sleep one TIME_UNIT. (Named TOMODIFY in the original — a proper rate
    limiter is the intended replacement.)"""
    if request_count % QPS == 0:
        print(request_count, 'sleep')
        print(info)
        time.sleep(TIME_UNIT)


count_write_rows = 0
# Write the CSV header, then reopen in append mode for the crawl loop.
f_gen = open(f_name_gen, 'w', encoding='utf-8-sig')
f_gen.write('important_flag, pname, cityname, name, type, typecode, address, adname, location, id, shopinfo\n')
f_gen.close()  # bug fix: the original used `f_gen.closed` (attribute read, no-op)
f_gen = open(f_name_gen, 'a', encoding='utf-8-sig')
request_count = 0
# Crawl: for every team-task city, query Amap per *district* code — passing
# the whole city returns truncated results (the original's note:
# 血!!!传南山区,而非深圳市!!!!). First a probe request reads the result count,
# then the pages are fetched. Keys are rotated via change_key() whenever the
# API answers with a non-OK infocode.
for important_flag in teamtask_dic:
    for city_zh in teamtask_dic[important_flag]:
        if city_zh not in gdapi_city_district_code_dic:
            print(city_zh)
            continue
        district_code_list = gen_district_code_list(city_zh)
        if len(district_code_list) > 0:
            for district_code in district_code_list:
                city_code = district_code
                # probe request: page 0, offset 1, only to read `count`
                request_count += 1
                TOMODIFY_qps_sleep(request_count, count_write_rows)
                url = BASE_URL % (touse_key, '0', '1', city_code, gdapi_place_type_code_str)
                r = HTTP.urlopen('GET', url, redirect=False)
                api_res = r.data.decode('utf-8')
                json_ = json.loads(api_res)
                if not json_['infocode'] == INFOCODE_OK:
                    # key rejected: rotate and retry the probe once
                    change_key()
                    print('187', url)
                    url = BASE_URL % (touse_key, '0', '1', city_code, gdapi_place_type_code_str)
                    print('189')
                    print(url)
                    request_count += 1
                    TOMODIFY_qps_sleep(request_count, count_write_rows)
                    r = HTTP.urlopen('GET', url, redirect=False)
                    api_res = r.data.decode('utf-8')
                    json_ = json.loads(api_res)
                json_count = json_['count']
                if not int(json_count) > 0:
                    continue
                page_sum = math.ceil(int(json_count) / OFFSET)
                print('204', page_sum)
                for page_num in range(0, page_sum, 1):
                    url = BASE_URL % (touse_key, page_num, OFFSET, city_code, gdapi_place_type_code_str)
                    try:
                        r = HTTP.urlopen('GET', url, redirect=False)
                    except Exception as e:
                        # NOTE(review): kept from the original — on failure, r
                        # keeps the previous response and processing continues
                        print(url)
                        print(e)
                    request_count += 1
                    TOMODIFY_qps_sleep(request_count, count_write_rows)
                    try:
                        # seen in production: truncated utf-8 payloads
                        api_res = r.data.decode('utf-8')
                        json_ = json.loads(api_res)
                    except Exception as e:
                        print(api_res)
                        print(url)
                        print(e)
                        continue
                    if not json_['infocode'] == INFOCODE_OK:
                        change_key()
                        url = BASE_URL % (touse_key, page_num, OFFSET, city_code, gdapi_place_type_code_str)
                        r = HTTP.urlopen('GET', url, redirect=False)
                        request_count += 1
                        TOMODIFY_qps_sleep(request_count, count_write_rows)
                        api_res = r.data.decode('utf-8')
                        print(api_res)
                        json_ = json.loads(api_res)
                    # Write rows through the single append handle opened above;
                    # the original reopened the file per row and only ever did
                    # `f_gen.closed` (a no-op), leaking handles.
                    for row in conv_api_res_tofile_strlist(json_, important_flag):
                        f_gen.write(row)
                        count_write_rows += 1
f_gen.close()
print(count_write_rows)
end_time = time.time()
print(end_time)
print(time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(end_time)))
print('start_time-end_time', start_time - end_time)
# -*- coding: UTF-8 -*-
# (Second copy of the crawler's header/config, kept as in the source
# concatenation; the change_key() definition that follows it is truncated.)
import urllib3
import json
import time
import math
import sys
import re

start_time = time.time()
print(start_time)
print(time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(start_time)))

MAX_OFFSET = 25
OFFSET = MAX_OFFSET - 1     # superseded by OFFSET = 23 below (kept as in original)
MAX_PAGINATION = 100
pagination = MAX_PAGINATION
QPS = 50
TIME_UNIT = 1
# http://lbs.amap.com/api/webservice/guide/tools/info
INFOCODE_OK = '10000'
HTTP = urllib3.PoolManager()

f_name_teamtask = '副本执行组负责区域城市明细_20170708.csv'
f_name_gdapi_place_type_subset = 'gdapi_place_type_购物服务_真子集.text'
f_name_gdapi_city_code = '高德地图API城市编码对照表forcsv.csv'
f_name_gen = '商场名单列表DATAfrom高德api购物服务_副本执行组负责区域城市明细_20170708.csv'
# {city_district: {'district': code, 'city': code}}
gdapi_city_district_code_dic = {}

# API keys: one per line, key in the second tab-separated column.
file_name_key_pool = 'key_pool.pool'
KEY_POOL_LIST = []
touse_key = ''
f = open(file_name_key_pool, 'r', encoding='utf-8')
for i in f:
    try:
        key = i.split('\t')[1].split()
        KEY_POOL_LIST.append(key[0])
    except Exception as e:
        # bug fix: the original printed the Exception class, not the error
        print(e)
KEY_POOL_NUM_INDICATOR = len(KEY_POOL_LIST) - 1

BASE_URL = 'http://restapi.amap.com/v3/place/text?key=%s&page=%s&offset=%s&city=%s&types=%s'
# Amap notes: `count` caps at 1000; page=0 equals page=1; offsets above 25
# may cause request errors, hence:
OFFSET = 23
# so change_key() does not force rotation by request count; it relies solely on
# the infocode returned by AMap.
URL_FOR_CHANGE_KEY = 'http://restapi.amap.com/v3/place/text?key=%s&types=060100&city=010&OFFSET=1'


def change_key():
    """Rotate the module-global `touse_key` to the next working API key.

    Probes every key in KEY_POOL_LIST other than the current one with a cheap
    request and keeps the first key whose response carries INFOCODE_OK.
    Exits the process when no key in the pool is valid.
    """
    global touse_key
    # BUG fix: the original iterated range(0, KEY_POOL_NUM_INDICATOR), which
    # skips the last key, and its `i == KEY_POOL_NUM_INDICATOR` exit condition
    # was therefore unreachable — an all-invalid pool recursed forever.
    for candidate in KEY_POOL_LIST:
        if candidate == touse_key:
            continue
        touse_key = candidate
        url = URL_FOR_CHANGE_KEY % (touse_key,)
        print(62, 'chk_key', url)
        print(62, 'touse_key', touse_key)
        r = HTTP.urlopen('GET', url, redirect=False)
        json_ = json.loads(r.data.decode('utf-8'))
        if json_['infocode'] == INFOCODE_OK:
            return
    sys.exit('NOInvalidKEY')


def conv_api_res_tofile_strlist(json_, add_filefield=''):
    """Convert one AMap place/text response into a list of CSV row strings.

    Each row is '\\n'-terminated: add_filefield, pname, cityname, name, type,
    typecode, address, adname, location, id, shopinfo.  POIs with a dict-typed
    address, a digit-containing district name, or a malformed id are skipped.
    """
    strlist = []
    for poi in json_['pois']:
        # Some POIs carry an empty dict instead of an address string
        # ('dict' object has no attribute 'replace').
        if isinstance(poi['address'], dict):
            continue
        # Commas inside fields would break the CSV; replace them.
        address = poi['address'].replace(',', '/')
        adname = poi['adname']
        # Skip district names containing digits (garbage rows).
        if re.search(r'\d', adname) is not None:
            print(adname)
            continue
        cityname = poi['cityname']
        # BUG fix: renamed locals that shadowed builtins (str/type/id) and
        # raw-stringed the regexes (r'\d' was a non-raw escape).
        poi_id = poi['id']
        # Valid ids look like 'B000A856LJ'.
        if re.match(r'[0-9A-Z]{10}', poi_id) is None:
            print(poi_id)
            continue
        location = poi['location'].replace(',', ' ')
        row = '%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s\n' % (
            add_filefield, poi['pname'], cityname, poi['name'], poi['type'],
            poi['typecode'], address, adname, location, poi_id, poi['shopinfo'])
        strlist.append(row)
    return strlist


f = open(f_name_gdapi_city_code, 'r', encoding='gbk')
# Load the AMap city/district code table into gdapi_city_district_code_dic.
for i in f:
    list_ = i.replace('\n', '').split(',')
    try:
        city_district = list_[0]
        district = list_[1]
        city = list_[2]
        gdapi_city_district_code_dic[city_district] = {'district': district, 'city': city}
    except Exception as e:
        # BUG fix: the original printed the Exception class, not the error.
        print(e)
f.close()

# Load the team-task table: {important_flag: [city, ...]}.
teamtask_dic = {}
f = open(f_name_teamtask, 'r', encoding='utf-8-sig')
for i in f:
    list_ = i.replace('\n', '').split(',')
    try:
        important_flag = list_[4]
        # '备注' (remark) / '暂不做' (skip for now) rows are not tasks.
        if important_flag in ['备注', '暂不做']:
            continue
        city = list_[2]
        # Normalize city names to end with '市'.
        if city.find('市') == -1:
            city = city + '市'
        if important_flag not in teamtask_dic:
            teamtask_dic[important_flag] = []
        teamtask_dic[important_flag].append(city)
    except Exception as e:
        print(e)
f.close()

# Build the '|'-separated AMap place-type code string from the subset file.
gdapi_place_type_code_str = ''
f = open(f_name_gdapi_place_type_subset, 'r', encoding='utf-8-sig')
for i in f:
    list_ = i.split('\t')
    try:
        gdapi_place_type_code_str += list_[0] + '|'
    except Exception as e:
        print(e)
f.close()
gdapi_place_type_code_str = gdapi_place_type_code_str[:-1]  # drop the trailing '|'
# {important_flag:{city:[url]}}
# http://restapi.amap.com/v3/place/text?key=...&types=060000|060100|...&city=北京
#
def TOMODIFY_qps_sleep(request_count, info=''):
    """Throttle: sleep TIME_UNIT seconds after every QPS-th request.

    `info` is extra context (e.g. rows written so far) echoed for progress.
    """
    if request_count % QPS == 0:
        print(request_count, 'sleep')
        print(info)
        time.sleep(TIME_UNIT)


count_write_rows = 0
# Write the CSV header, then reopen the output in append mode for data rows.
f_gen = open(f_name_gen, 'w', encoding='utf-8-sig')
header = 'important_flag, pname, cityname, name, type, typecode, address, adname, location, id, shopinfo\n'
f_gen.write(header)
f_gen.close()  # BUG fix: original used `f_gen.closed`, an attribute access that never closes the file
f_gen = open(f_name_gen, 'a', encoding='utf-8-sig')
request_count = 0
# gdapi_place_type_code_str = '060100'
# Crawl each (important_flag, city): one probe request to learn the total
# result count, then page through the results, appending rows to the CSV.
for important_flag in teamtask_dic:
    city_list = teamtask_dic[important_flag]
    for city_zh in city_list:
        if city_zh not in gdapi_city_district_code_dic:
            print(city_zh)
            continue
        city_code = gdapi_city_district_code_dic[city_zh]['city']
        request_count += 1
        TOMODIFY_qps_sleep(request_count, count_write_rows)
        # Probe with page=0, offset=1 only to read the total count.
        url = BASE_URL % (touse_key, '0', '1', city_code, gdapi_place_type_code_str)
        r = HTTP.urlopen('GET', url, redirect=False)
        json_ = json.loads(r.data.decode('utf-8'))
        if not json_['infocode'] == INFOCODE_OK:
            # Key invalid/exhausted: rotate keys and retry the probe once.
            change_key()
            print('187', url)
            url = BASE_URL % (touse_key, '0', '1', city_code, gdapi_place_type_code_str)
            print('189')
            print(url)
            request_count += 1
            TOMODIFY_qps_sleep(request_count, count_write_rows)
            r = HTTP.urlopen('GET', url, redirect=False)
            json_ = json.loads(r.data.decode('utf-8'))
        json_count = json_['count']
        if not int(json_count) > 0:
            continue
        page_sum = math.ceil(int(json_count) / OFFSET)
        print('204', page_sum)
        for page_num in range(0, page_sum, 1):
            url = BASE_URL % (touse_key, page_num, OFFSET, city_code, gdapi_place_type_code_str)
            r = HTTP.urlopen('GET', url, redirect=False)
            request_count += 1
            TOMODIFY_qps_sleep(request_count, count_write_rows)
            api_res = r.data.decode('utf-8')
            try:
                json_ = json.loads(api_res)
            except Exception as e:
                # Occasionally the response is truncated/undecodable; skip the page.
                print(api_res)
                print(url)
                print(e)
                continue
            if not json_['infocode'] == INFOCODE_OK:
                change_key()
                url = BASE_URL % (touse_key, page_num, OFFSET, city_code, gdapi_place_type_code_str)
                r = HTTP.urlopen('GET', url, redirect=False)
                request_count += 1
                TOMODIFY_qps_sleep(request_count, count_write_rows)
                api_res = r.data.decode('utf-8')
                print(api_res)
                json_ = json.loads(api_res)
            file_strlist = conv_api_res_tofile_strlist(json_, important_flag)
            # TODO: batching strategy for disk writes is undecided; for now
            # open-append-close per row (total data volume unknown).
            # TODO: some URLs error here yet work when requested manually.
            for row in file_strlist:
                f_gen = open(f_name_gen, 'a', encoding='utf-8-sig')
                f_gen.write(row)
                count_write_rows += 1
                f_gen.close()  # BUG fix: `.closed` never closed the handle

print(count_write_rows)
end_time = time.time()
print(end_time)
print(time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(end_time)))
# BUG fix: the original printed start_time - end_time, a negative duration.
print('start_time-end_time', end_time - start_time)
# -*- coding: UTF-8 -*-
import urllib3
import json
import time

f_name_teamtask = '副本执行组负责区域城市明细_20170708.csv'
f_name_gdapi_place_type_subset = 'gdapi_place_type_购物服务_真子集.text'
f_name_gdapi_city_code = '高德地图API城市编码对照表forcsv.csv'
f_name_gen = '商场名单列表DATAfrom高德api购物服务_副本执行组负责区域城市明细_20170708.csv'

# {city_district: {'district': ..., 'city': ...}}
gdapi_city_district_code_dic = {}

# Load the AMap city/district code table.
f = open(f_name_gdapi_city_code, 'r', encoding='gbk')
for i in f:
    list_ = i.replace('\n', '').split(',')
    try:
        city_district = list_[0]
        district = list_[1]
        city = list_[2]
        gdapi_city_district_code_dic[city_district] = {'district': district, 'city': city}
    except Exception as e:
        # BUG fix: the original printed the Exception class, not the error.
        print(e)
f.close()

# Load the team-task table: {important_flag: [city, ...]}.
teamtask_dic = {}
f = open(f_name_teamtask, 'r', encoding='utf-8-sig')
for i in f:
    list_ = i.replace('\n', '').split(',')
    try:
        important_flag = list_[4]
        if important_flag in ['备注', '暂不做']:
            continue
        city = list_[2]
        if city.find('市') == -1:
            city = city + '市'
        if important_flag not in teamtask_dic:
            teamtask_dic[important_flag] = []
        teamtask_dic[important_flag].append(city)
    except Exception as e:
        print(e)
f.close()

# {important_flag: {city: [url]}} — every page URL, precomputed.
api_url_dic = {}

# Build the '|'-separated place-type code string.
gdapi_place_type_code_str = ''
f = open(f_name_gdapi_place_type_subset, 'r', encoding='utf-8-sig')
for i in f:
    list_ = i.split('\t')
    try:
        gdapi_place_type_code_str += list_[0] + '|'
    except Exception as e:
        print(e)
f.close()
gdapi_place_type_code_str = gdapi_place_type_code_str[:-1]  # drop trailing '|'

# http://restapi.amap.com/v3/place/text?key=...&types=...&city=北京
KEY = '7eca72ff53f467e17faf9cf2d7fa6cc7'
# offset = 24
pagination = 100
base_url = 'http://restapi.amap.com/v3/place/text?&offset=24&key=' + KEY + '&types=' + gdapi_place_type_code_str + '&city='

# Precompute every page URL per (important_flag, city).
for important_flag in teamtask_dic:
    if important_flag not in api_url_dic:
        api_url_dic[important_flag] = {}
    for city in teamtask_dic[important_flag]:
        city_code = gdapi_city_district_code_dic[city]['city']
        url = base_url + city_code
        for page_num in range(0, pagination, 1):
            url_page = url + '&page=%i' % (page_num)
            if city not in api_url_dic[important_flag]:
                api_url_dic[important_flag][city] = []
            api_url_dic[important_flag][city].append(url_page)

count_write_rows = 0
f_gen = open(f_name_gen, 'w', encoding='utf-8-sig')
header = 'important_flag, pname, cityname, name, type, typecode, address, adname, location, id, shopinfo\n'
f_gen.write(header)
f_gen.close()  # BUG fix: `.closed` is an attribute access and never closed the file
# f_gen = open(f_name_gen, 'a', encoding='utf-8-sig')

http = urllib3.PoolManager()
for important_flag in api_url_dic:
    city_dic = api_url_dic[important_flag]
    for city in city_dic:
        url_list = city_dic[city]
        for page_num in range(0, pagination, 1):
            url = url_list[page_num]
            r = http.urlopen('GET', url, redirect=False)
            json_ = json.loads(r.data.decode('utf-8'))
            json_count = json_['count']
            if int(json_count) > 0:
                for poi in json_['pois']:
                    location = poi['location'].replace(',', ' ')
                    row = '%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s\n' % (
                        important_flag, poi['pname'], poi['cityname'], poi['name'],
                        poi['type'], poi['typecode'], poi['address'], poi['adname'],
                        location, poi['id'], poi['shopinfo'])
                    f_gen = open(f_name_gen, 'a', encoding='utf-8-sig')
                    f_gen.write(row)
                    count_write_rows += 1
                    print(count_write_rows)
                    time.sleep(1)
                    f_gen.close()  # BUG fix: `.closed` never closed the handle
print(count_write_rows)
# -*- coding: UTF-8 -*-
import re
import pprint


def deal_exception(str):
    """Record one malformed area-name row in the module-level exception list."""
    global area_name_exception_list, count_area_name_exception
    count_area_name_exception += 1
    area_name_exception_list.append(str)


# {areacode: address}
areacode_addres_dic = {}
# city + region + address + area_name + cp_name + brand:
# {city:{district:{area_name:{address:{brand_name:{cp_name_list,area_code,cp_code}}}}}}
area_name_dic = {}
# {city_pinyin: zh}
city_pinyin_zh_dic = {}
# {dbid: district_pinyin}
district_dbid_pinyin_dic = {}
# {district_pinyin: district_zh}
district_pinyin_zh_dic = {}

area_name_exception_list = []
count_area_name = 0
count_area_name_exception = 0

f_name_db = 'test_area.csv'
f_name_csv = 'business_20170705.csv'
f_name_db_city_pinyin = 'test_city.csv'
f_name_db_tbl_city_region_standard = 'test_city_region_standard.csv'
datafrom_ = f_name_db + f_name_db_city_pinyin + f_name_db_tbl_city_region_standard
datafrom = f_name_csv + '--' + datafrom_.replace('.csv', '--')
f_name_gen = '成功_商场名单表_DATAfrom' + datafrom + '.csv'
f_name_gen_fail = '失败_商场名单表_DATAfromtest_area.csv+business_20170705.csv'

# Pass 1: area table -> {area_code: address} and {district_dbid: district_pinyin}.
f = open(f_name_db, 'r', encoding='utf-8-sig')
for i in f:
    list_ = i.replace('\n', '').split(',')
    try:
        area_code = list_[1]
        # Prefer the longer of the two address columns.
        address = list_[9] if len(list_[8]) < len(list_[9]) else list_[8]
        district_pinyin = list_[1].split('-')[3]
        district_dbid = list_[17]
        # Hard rule: district and address must both be present.
        if len(address) < 7 or len(district_dbid) < 1 or district_pinyin == 'USERADD':
            continue
        district_dbid_pinyin_dic[district_dbid] = district_pinyin
        if area_code not in areacode_addres_dic:
            areacode_addres_dic[area_code] = address
    except Exception:
        print(list_)

# Pass 2: city pinyin -> Chinese name.
f = open(f_name_db_city_pinyin, 'r', encoding='utf-8-sig')
for i in f:
    list_ = i.replace('\n', '').split(',')
    try:
        zh = list_[1]
        pinyin = list_[2]
        city_pinyin_zh_dic[pinyin] = zh
    except Exception:
        print(list_)

# Pass 3: district pinyin -> Chinese name (via the dbid collected in pass 1).
f = open(f_name_db_tbl_city_region_standard, 'r', encoding='utf-8-sig')
for i in f:
    list_ = i.replace('\n', '').split(',')
    try:
        district_dbid = list_[0]
        district_zh = list_[2]
        district_pinyin = district_dbid_pinyin_dic[district_dbid]
        district_pinyin_zh_dic[district_pinyin] = district_zh
    except Exception:
        print(list_)


def get_brand_name(str):
    """Extract the value of the '品牌' field from a ';'-separated 'k:v' list."""
    for part in str.split(';'):
        kv = part.split(':')
        if kv[0] == '品牌':
            return kv[1]
    return ''


# Main pass: build the nested area_name_dic from the business CSV.
f = open(f_name_csv, 'r', encoding='utf-8-sig')
for i in f:
    list_ = i.replace('\n', '').split(',')
    try:
        area_code = list_[3]
        cp_code = list_[7]
        address = areacode_addres_dic[area_code]
        city_district = area_code.split('-')
        city = city_district[2]
        district = city_district[3]
        area_name = list_[4].split('(')[0]
        brand_name = get_brand_name(list_[10])
        if brand_name == '':
            continue
        cp_name = list_[6]
        dic_detail = {}
        dic_detail['area_code'] = area_code
        dic_detail['cp_code'] = cp_code
        # BUG fix: removed the dead placeholder assignment ('reerr') that was
        # immediately overwritten in the original.
        dic_detail['cp_name_list'] = [cp_name]
        if city in area_name_dic:
            if district in area_name_dic[city]:
                if area_name in area_name_dic[city][district]:
                    if address in area_name_dic[city][district][area_name]:
                        if brand_name in area_name_dic[city][district][area_name][address]:
                            if cp_name not in area_name_dic[city][district][area_name][address][brand_name][
                                    'cp_name_list']:
                                area_name_dic[city][district][area_name][address][brand_name]['cp_name_list'].append(
                                    cp_name)
                        else:
                            area_name_dic[city][district][area_name][address][brand_name] = dic_detail
                    else:
                        area_name_dic[city][district][area_name][address] = {}
                        area_name_dic[city][district][area_name][address][brand_name] = dic_detail
                else:
                    area_name_dic[city][district][area_name] = {}
                    area_name_dic[city][district][area_name][address] = {}
                    area_name_dic[city][district][area_name][address][brand_name] = dic_detail
            else:
                area_name_dic[city][district] = {}
                area_name_dic[city][district][area_name] = {}
                area_name_dic[city][district][area_name][address] = {}
                area_name_dic[city][district][area_name][address][brand_name] = dic_detail
        else:
            area_name_dic[city] = {}
            area_name_dic[city][district] = {}
            area_name_dic[city][district][area_name] = {}
            area_name_dic[city][district][area_name][address] = {}
            area_name_dic[city][district][area_name][address][brand_name] = dic_detail
    except Exception:
        # Deliberate best-effort: malformed rows are silently skipped.
        pass
        # print(list_)

# Write the report: one CSV row per (city, district, area_name, address, brand).
f_gen = open(f_name_gen, 'w', encoding='utf-8-sig')
f_gen.write('')
header = 'DATAfrom,城市,区域,地址,商圈名,店铺(品牌),点名称,area_code,cp_code\n'
f_gen.write(header)
f_gen.close()  # BUG fix: `.closed` never closed the file
f_gen = open(f_name_gen, 'a', encoding='utf-8-sig')
count_write_rows = 0
for city in area_name_dic:
    # Only cities/districts that can be translated to Chinese are reported.
    if city in city_pinyin_zh_dic:
        city_zh = city_pinyin_zh_dic[city]
    else:
        continue
    for district in area_name_dic[city]:
        if district in district_pinyin_zh_dic:
            district_zh = district_pinyin_zh_dic[district]
        else:
            continue
        for area_name in area_name_dic[city][district]:
            for address in area_name_dic[city][district][area_name]:
                for brand_name in area_name_dic[city][district][area_name][address]:
                    detail = area_name_dic[city][district][area_name][address][brand_name]
                    cp_name_str = '+'.join(detail['cp_name_list'])
                    row = '%s,%s,%s,%s,%s,%s,%s,%s,%s\n' % (
                        datafrom, city_zh, district_zh, address, area_name, brand_name,
                        cp_name_str, detail['area_code'], detail['cp_code'])
                    f_gen.write(row)
                    count_write_rows += 1
f_gen.close()
print(count_write_rows)
# -*- coding: UTF- -*-
import re
import pprint


# BUG fix (whole block): the mangled source lost every numeric literal in this
# copy (`!= -`, `[]`, `+= `); the digits below are restored from the intact
# copy of these functions later in the file.
def chk_is_coffee(str):
    """Return True when the name/category looks like a coffee shop."""
    l_ = ['coffee', 'coffe', 'cafe', 'café', 'starbucks', 'caffé']
    # e.g. 上岛花式铁板烧 日本菜 / 泛太平洋大酒店面馆 其他
    l_b = ['咖啡', '星巴克']
    for i in l_:
        if str.upper().find(i.upper()) != -1:
            return True
    for i in l_b:
        if str.find(i) != -1:
            return True
    return False


def chk_kfc_mdl(str):
    """Classify a shop name: 1 = McDonald's, 0 = KFC, 2 = neither."""
    if str.find(u"麦当劳") != -1:
        return 1
    elif str.find(u"肯德基") != -1 or str.upper().find(u"KFC") != -1:
        return 0
    else:
        return 2


def get_brand_name(str):
    """Strip a branch suffix like '(前门店)' and return the bare brand name.

    Examples: '狗不理包子(前门店)' -> '狗不理包子',
              '(清真)三羊水饺(新民路店)' -> '三羊水饺'.
    """
    if str.find("麦当劳") != -1:
        return '麦当劳'
    elif str.find("肯德基") != -1 or str.upper().find(u"KFC") != -1:
        return '肯德基'
    else:
        # No parenthesis of either width: already a bare name, e.g. '添椒'.
        if str.find('(') == -1 and str.find('(') == -1:
            return str
        res = str.strip(' ').split('(')[0].strip(' ')
        if len(res) == 0:
            # Name starts with a qualifier, e.g. '(清真)三羊水饺(新民路店)'.
            try:
                res = str.split(')')[1].split('(')[0]
            except Exception as e:
                print(e)
        # Also strip a half-width branch suffix, e.g. '一锅两头牛(烟青路店)'.
        res_b = res
        try:
            res_b = res.split('(')[0]
        except Exception as e:
            print(e)
        return res_b


def chk_city_district(str):
    """Return the space-stripped city/district name, or False when invalid."""
    city_district = str.replace(' ', '')
    if re.match(r".*[a-zA-Z0-9]", city_district) is not None:
        return False
    elif str.find("[") != -1 or str.find("(") != -1 or str.find(")") != -1:
        return False
    else:
        return city_district


def chk_catering_kind(str):
    """Return the space-stripped cuisine kind, or False when it has digits."""
    catering_kind = str.replace(' ', '')
    if re.match(r".*[0-9]", catering_kind) is not None:
        return False
    else:
        return catering_kind


# e.g. ['a','',' '] -> False
def chk_list_thickness(list_):
    """Return the space-stripped list, or False if empty or any item is blank."""
    if len(list_) == 0:
        return False
    res_list = []
    for i in list_:
        i_b = i.replace(' ', '')
        if i_b == '':
            return False
        res_list.append(i_b)
    return res_list


def get_exception(file_, data_from='m_d_4'):
    """Classify rows of `file_` into cater_dic / coffee_list / cater_exception_list.

    Mutates the module-level counters and collections; `data_from` selects the
    column layout of the source file.
    """
    global count_catering, cater_dic, cater_exception_list, count_catering_exception, coffee_list, count_coffee
    for i in file_:
        list_ = i.replace('\n', '').split(',')
        # --- data-preparation layer: map each file layout to common fields ---
        # NOTE(review): column indices restored from the intact copy of this
        # function later in the file — confirm against the source CSVs.
        if data_from == 'm_d_4':
            city = list_[1]
            district = list_[2]
            address = list_[3]
            brand_name = list_[4]
            catering_kind = list_[6]
            average_price = list_[5]
        elif data_from == 'm_d_5':
            city = list_[0]
            district = list_[1]
            address = list_[2]
            brand_name = list_[3]
            catering_kind = list_[5]
            average_price = list_[4]
        elif data_from == 'GD_map':
            city = list_[7] if not list_[7].find('[') != -1 else list_[6]
            district = list_[9]
            address = list_[10] + list_[11]
            brand_name = list_[0]
            average_price = ''
            catering_kind = '快餐厅'
        # --- computation layer: validate, then bucket the row ---
        focus_list = [city, district, address, brand_name, catering_kind, average_price, data_from]
        dic_exception = {
            'data_from': data_from,
            'city': city,
            'district': district,
            'brand_name': brand_name,
            'address': address,
            'catering_kind': catering_kind,
            'average_price': average_price,
        }
        if not chk_list_thickness(focus_list) or not chk_city_district(city) or not chk_city_district(
                district) or not chk_catering_kind(catering_kind):
            count_catering_exception += 1
            cater_exception_list.append(dic_exception)
        else:
            brand_name = get_brand_name(brand_name)
            m = chk_is_coffee(brand_name)
            if not m:
                m = chk_is_coffee(catering_kind)
            if m:
                count_coffee += 1
                coffee_list.append(dic_exception)
            if not m:
                dic_details = {
                    'data_from': data_from,
                    'catering_kind': catering_kind,
                    'average_price': average_price,
                }
                if city in cater_dic:
                    if district in cater_dic[city]:
                        if brand_name in cater_dic[city][district]:
                            if address in cater_dic[city][district][brand_name]:
                                continue
                            else:
                                cater_dic[city][district][brand_name][address] = dic_details
                        else:
                            cater_dic[city][district][brand_name] = {}
                            cater_dic[city][district][brand_name][address] = dic_details
                    else:
                        cater_dic[city][district] = {}
                        cater_dic[city][district][brand_name] = {}
                        cater_dic[city][district][brand_name][address] = dic_details
                else:
                    cater_dic[city] = {}
                    cater_dic[city][district] = {}
                    cater_dic[city][district][brand_name] = {}
                    cater_dic[city][district][brand_name][address] = dic_details
                count_catering += 1
# f = open('MEITUAN_DAZHONG_20170704.csv', 'r', encoding='gbk')
f_2 = open('MEITUAN_DAZHONG_20170705.csv', 'r', encoding='gbk')
f_gd = open('GD_POI_KFC_MC.csv', 'r', encoding='utf-8-sig') # 城市 城市编码 区域 区域编码 街道 地址 楼宇名(任务名/商圈名/area_name) 楼宇名编码 店铺名 店铺编码 店铺类型
# 城市 城市编码 区域 区域编码 街道 地址 楼宇名(任务名/商圈名/area_name) 楼宇名编码 店铺名 "采集点名称 cp_name" "采集点编码cp_code" 店铺类型
# city,citycode,distric,districtcode,street,,address,city,area,longitude,latitude
# 城市 区域 街道 地址 楼宇名(任务名/商圈名/area_name) 楼宇名编码 店铺名 "采集点名称 cp_name" "采集点编码cp_code" 店铺类型
# 城市 区 地址 品牌名
# city,distric,address,brand_name,catering_kind # 市key-区key-品牌名key-地址key-{菜类,均价,data_from}
# {city:{district:{brand_name:{address:{catering_kind,average_price,data_from}}}}}
cater_dic = {}
# [{city,district,address,brand_name,catering_kind,average_price,data_from}]
cater_exception_list = []
count_catering =
count_catering_exception = coffee_list = []
count_coffee = get_exception(f_2, 'm_d_5')
d5 = count_catering
get_exception(f_gd, 'GD_map')
d6 = count_catering
print(d5, d6) # pp = pprint.PrettyPrinter(depth=)
# pp.pprint(cater_exception_list) # pp = pprint.PrettyPrinter(depth=)
# pp.pprint(cater_dic)
# f_name = '成功_美团大众_高德_去咖啡_餐饮_DATAfrom美团大众20170705和未标日期高德数据.csv'
f = open(f_name, 'w', encoding='utf-8-sig')
f.write('')
f.closed
f = open(f_name, 'a', encoding='utf-8-sig')
str = '市,区,品牌名,地址,菜别(类型),均价,data_from\n'
f.write(str)
## city,district,address,brand_name,catering_kind,average_price,data_from
count_write_rows =
for i in cater_dic:
city = i
if city == '城市':
continue
for ii in cater_dic[i]:
district = ii
for iii in cater_dic[i][ii]:
brand_name = iii
for iv in cater_dic[i][ii][iii]:
address = iv
catering_kind = cater_dic[i][ii][iii][iv]['catering_kind']
average_price = cater_dic[i][ii][iii][iv]['average_price']
data_from = cater_dic[i][ii][iii][iv]['data_from']
str = '%s,%s,%s,%s,%s,%s,%s\n' % (
city, district, brand_name, address, catering_kind, average_price, data_from)
f.write(str)
count_write_rows +=
f.closed
print(count_write_rows) f_name = '失败_美团大众_高德_DATAfrom美团大众20170705和未标日期高德数据.csv'
f = open(f_name, 'w', encoding='utf-8-sig')
f.write('')
f.closed
f = open(f_name, 'a', encoding='utf-8-sig')
str = '市,区,品牌名,地址,菜别(类型),均价,data_from\n'
f.write(str)
## city,district,address,brand_name,catering_kind,average_price
count_write_rows =
for i in cater_exception_list:
city = i['city']
district = i['district']
brand_name = i['brand_name']
address = i['address']
catering_kind = i['catering_kind']
average_price = i['average_price']
data_from = i['data_from']
str = '%s,%s,%s,%s,%s,%s,%s\n' % (
city, district, brand_name, address, catering_kind, average_price, data_from)
f.write(str)
count_write_rows +=
f.closed
print(count_write_rows) f_name = '成功_美团大众_高德_仅咖啡_DATAfrom美团大众20170705和未标日期高德数据.csv'
f = open(f_name, 'w', encoding='utf-8-sig')
f.write('')
f.closed
f = open(f_name, 'a', encoding='utf-8-sig')
str = '市,区,品牌名,地址,菜别(咖啡),均价,data_from\n'
f.write(str)
## city,district,address,brand_name,catering_kind,average_price
count_write_rows =
for i in coffee_list:
city = i['city']
district = i['district']
brand_name = i['brand_name'].split('(')[]
address = i['address']
catering_kind = i['catering_kind']
average_price = i['average_price']
data_from = i['data_from']
str = '%s,%s,%s,%s,%s,%s,%s\n' % (
city, district, brand_name, address, catering_kind, average_price, data_from)
f.write(str)
count_write_rows +=
f.closed
print(count_write_rows)
# -*- coding: UTF-8 -*-
import re
import pprint # 城市 城市编码 区域 区域编码 街道 地址 楼宇名(任务名/商圈名/area_name) 楼宇名编码 店铺名 店铺编码 店铺类型
# 城市 城市编码 区域 区域编码 街道 地址 楼宇名(任务名/商圈名/area_name) 楼宇名编码 店铺名 "采集点名称 cp_name" "采集点编码cp_code" 店铺类型
# city,citycode,distric,districtcode,street,,address,city,area,longitude,latitude
# 城市 区域 街道 地址 楼宇名(任务名/商圈名/area_name) 楼宇名编码 店铺名 "采集点名称 cp_name" "采集点编码cp_code" 店铺类型
# 城市 区 地址 品牌名
# city,distric,address,brand_name,catering_kind


def chk_is_coffee(str):
    """Return True when the name/category looks like a coffee shop."""
    l_ = ['coffee', 'coffe', 'cafe']
    # e.g. 上岛花式铁板烧 日本菜 / 泛太平洋大酒店面馆 其他
    l_b = ['咖啡', '星巴克']
    for i in l_:
        if str.upper().find(i.upper()) != -1:
            return True
    for i in l_b:
        if str.find(i) != -1:
            return True
    return False


def chk_kfc_mdl(str):
    """Classify a shop name: 1 = McDonald's, 0 = KFC, 2 = neither."""
    if str.find(u"麦当劳") != -1:
        return 1
    elif str.find(u"肯德基") != -1 or str.upper().find(u"KFC") != -1:
        return 0
    else:
        return 2


def get_brand_name(str):
    """Strip a branch suffix like '(前门店)' and return the bare brand name."""
    if str.find("麦当劳") != -1:
        return '麦当劳'
    elif str.find("肯德基") != -1 or str.upper().find(u"KFC") != -1:
        return '肯德基'
    else:
        # No parenthesis of either width: already a bare name.
        # e.g. '狗不理包子(前门店)', '(清真)三羊水饺(新民路店)', '| 添椒 | 潮涮三国IP火锅'
        if str.find('(') == -1 and str.find('(') == -1:
            return str
        res = str.strip(' ').split('(')[0].strip(' ')
        if len(res) == 0:
            # Name starts with a qualifier, e.g. '(清真)三羊水饺(新民路店)'.
            try:
                res = str.split(')')[1].split('(')[0]
            except Exception as e:
                print(e)
        # Also strip a half-width branch suffix, e.g. '一锅两头牛(烟青路店)'.
        res_b = res
        try:
            res_b = res.split('(')[0]
        except Exception as e:
            print(e)
        return res_b


def chk_city_district(str):
    """Return the space-stripped city/district name, or False when invalid."""
    city_district = str.replace(' ', '')
    if re.match(r".*[a-zA-Z0-9]", city_district) is not None:
        return False
    elif str.find("[") != -1 or str.find("(") != -1 or str.find(")") != -1:
        return False
    else:
        return city_district


def chk_catering_kind(str):
    """Return the space-stripped cuisine kind, or False when it has digits."""
    catering_kind = str.replace(' ', '')
    if re.match(r".*[0-9]", catering_kind) is not None:
        return False
    else:
        return catering_kind


# e.g. ['a','',' '] -> False
def chk_list_thickness(list_):
    """Return the space-stripped list, or False if empty or any item is blank."""
    if len(list_) == 0:
        return False
    res_list = []
    for i in list_:
        i_b = i.replace(' ', '')
        if i_b == '':
            return False
        res_list.append(i_b)
    return res_list


def get_exception(file_, data_from='m_d_4'):
    """Classify rows of `file_` into cater_dic / coffee_list / cater_exception_list.

    Mutates the module-level counters and collections; `data_from` selects the
    column layout of the source file.  cater_dic here is
    {city: {district: [detail dict, ...]}}.
    """
    global count_catering, cater_dic, cater_exception_list, count_catering_exception, coffee_list, count_coffee
    for i in file_:
        list_ = i.replace('\n', '').split(',')
        # Map each file layout to the common field set.
        if data_from == 'm_d_4':
            city = list_[1]
            district = list_[2]
            address = list_[3]
            brand_name = list_[4]
            catering_kind = list_[6]
            average_price = list_[5]
        elif data_from == 'm_d_5':
            city = list_[0]
            district = list_[1]
            address = list_[2]
            brand_name = list_[3]
            catering_kind = list_[5]
            average_price = list_[4]
        elif data_from == 'GD_map':
            city = list_[7] if not list_[7].find('[') != -1 else list_[6]
            district = list_[9]
            address = list_[10] + list_[11]
            brand_name = list_[0]
            average_price = '1'
            catering_kind = '快餐厅'
        # Validate, then bucket the row.
        focus_list = [city, district, address, brand_name, catering_kind, average_price, data_from]
        dic_exception = {
            'data_from': data_from,
            'city': city,
            'district': district,
            'brand_name': brand_name,
            'address': address,
            'catering_kind': catering_kind,
            'average_price': average_price,
        }
        if not chk_list_thickness(focus_list) or not chk_city_district(city) or not chk_city_district(
                district) or not chk_catering_kind(catering_kind):
            count_catering_exception += 1
            cater_exception_list.append(dic_exception)
        else:
            brand_name = get_brand_name(brand_name)
            m = chk_is_coffee(brand_name)
            if not m:
                m = chk_is_coffee(catering_kind)
            if m:
                count_coffee += 1
                coffee_list.append(dic_exception)
            if not m:
                dic_city_district_details = {
                    'data_from': data_from,
                    'brand_name': brand_name,
                    'address': address,
                    'catering_kind': catering_kind,
                    'average_price': average_price,
                }
                if city in cater_dic:
                    if district in cater_dic[city]:
                        cater_dic[city][district].append(dic_city_district_details)
                    else:
                        cater_dic[city][district] = [dic_city_district_details]
                else:
                    cater_dic[city] = {district: [dic_city_district_details]}
                count_catering += 1


s = '数据来源MEITUAN_DAZHONG_20170704.csv、MEITUAN_DAZHONG_20170705.csv、GD_POI_KFC_MC.csv'
print(s)

f = open('MEITUAN_DAZHONG_20170704.csv', 'r', encoding='gbk')
f_2 = open('MEITUAN_DAZHONG_20170705.csv', 'r', encoding='gbk')
f_gd = open('GD_POI_KFC_MC.csv', 'r', encoding='utf-8-sig')

# {city: {district: [{address,brand_name,catering_kind,average_price,data_from}]}}
cater_dic = {}
# [{city,district,address,brand_name,catering_kind,average_price,data_from}]
cater_exception_list = []
count_catering = 0
count_catering_exception = 0
# [{city,district,address,brand_name,catering_kind,average_price,data_from}]
coffee_list = []
count_coffee = 0

get_exception(f)
d4 = count_catering
get_exception(f_2, 'm_d_5')
d5 = count_catering
get_exception(f_gd, 'GD_map')
d6 = count_catering
print(d4, d5, d6)

# pp = pprint.PrettyPrinter(depth=6)
# pp.pprint(cater_exception_list)
# pp.pprint(cater_dic)
# BUG fix: this assignment had been fused onto a comment line, leaving f_name
# undefined for the writer below.
f_name = '成功_美团大众_高德_去咖啡_餐饮_DATAfrom美团大众20170704-0705和未标日期高德数据.csv'
f = open(f_name, 'w', encoding='utf-8-sig')
f.write('')
f.close()  # BUG fix: `.closed` is an attribute access and never closed the file
f = open(f_name, 'a', encoding='utf-8-sig')
f.write('市,区,品牌名,地址,菜别(类型),均价,data_from\n')
## city,district,address,brand_name,catering_kind,average_price,data_from
count_write_rows = 0
for city in cater_dic:
    for district in cater_dic[city]:
        for detail in cater_dic[city][district]:
            row = '%s,%s,%s,%s,%s,%s,%s\n' % (
                city, district, detail['brand_name'], detail['address'],
                detail['catering_kind'], detail['average_price'], detail['data_from'])
            f.write(row)
            count_write_rows += 1
f.close()
print(count_write_rows)

f_name = '失败_美团大众_高德_DATAfrom美团大众20170704-0705和未标日期高德数据.csv'
f = open(f_name, 'w', encoding='utf-8-sig')
f.write('')
f.close()
f = open(f_name, 'a', encoding='utf-8-sig')
f.write('市,区,品牌名,地址,菜别(类型),均价,data_from\n')
## city,district,address,brand_name,catering_kind,average_price
count_write_rows = 0
for rec in cater_exception_list:
    row = '%s,%s,%s,%s,%s,%s,%s\n' % (
        rec['city'], rec['district'], rec['brand_name'], rec['address'],
        rec['catering_kind'], rec['average_price'], rec['data_from'])
    f.write(row)
    count_write_rows += 1
f.close()
print(count_write_rows)

f_name = '成功_美团大众_高德_仅咖啡_DATAfrom美团大众20170704-0705和未标日期高德数据.csv'
f = open(f_name, 'w', encoding='utf-8-sig')
f.write('')
f.close()
f = open(f_name, 'a', encoding='utf-8-sig')
f.write('市,区,品牌名,地址,菜别(咖啡),均价,data_from\n')
## city,district,address,brand_name,catering_kind,average_price
count_write_rows = 0
for rec in coffee_list:
    row = '%s,%s,%s,%s,%s,%s,%s\n' % (
        rec['city'], rec['district'], rec['brand_name'], rec['address'],
        rec['catering_kind'], rec['average_price'], rec['data_from'])
    f.write(row)
    count_write_rows += 1
f.close()
print(count_write_rows)


def fun4(file_, data_from='m_d_4'):
    """Normalize rows of `file_` and accumulate them into cater_dic.

    Unlike get_exception, recoverable problems get placeholder values
    ('值缺失_*') instead of routing the row to an exception list; rows with
    required fields missing are dropped.  `data_from` selects the column
    layout of the source file.
    """
    global count_catering, cater_dic
    for i in file_:
        list_ = i.replace('\n', '').split(',')
        if data_from == 'm_d_4':
            city = list_[1]
            # e.g. 大众点评 南京市 烧仙草) 浦口区-江浦街道珠江路1-11号 悸动(奶茶 0 面包甜点
            district = list_[2]
            address = list_[3].strip(' ')
            brand_name = get_brand_name(list_[4])
            catering_kind = list_[6] if not isinstance(list_[6], int) and len(list_[6]) > 0 else '值缺失_菜类'
            average_price = list_[5]
        elif data_from == 'm_d_5':
            city = list_[0]
            district = list_[1]
            address = list_[2].strip(' ')
            brand_name = get_brand_name(list_[3])
            catering_kind = list_[5] if not isinstance(list_[5], int) and len(list_[5]) > 0 else '值缺失_菜类'
            average_price = list_[4]
        elif data_from == 'GD_map':
            city = list_[7] if not list_[7].find('[') != -1 else list_[6]
            district = list_[9]
            address = list_[10] + list_[11]
            brand_name = list_[0] if len(list_[0]) > 0 else ''
            average_price = ''
            if len(brand_name) > 0:
                chk_kfc_mdl_res = chk_kfc_mdl(brand_name)
                if chk_kfc_mdl_res == 1:
                    brand_name = '麦当劳'
                elif chk_kfc_mdl_res == 0:
                    brand_name = '肯德基'
            else:
                brand_name = '值缺失_品牌名'
            catering_kind = '快餐厅'
        # Normalize: strip spaces and commas (commas would break the CSV output).
        city = city.replace(' ', '').replace(',', '').replace(',', '')
        district = district.replace(' ', '').replace(',', '')
        address = address.replace(' ', '').replace(',', '')
        brand_name = brand_name.replace(' ', '').replace(',', '')
        catering_kind = catering_kind.replace(' ', '').replace(',', '')
        average_price = average_price.replace(' ', '').replace(',', '')
        if average_price == '':
            average_price = '值缺失_均价'
        if catering_kind == '':
            # BUG fix: the original assigned this placeholder to average_price,
            # clobbering the price instead of marking the missing cuisine kind.
            catering_kind = '值缺失_菜类'
        if len(city) == 0 or len(district) == 0 or len(address) == 0 or len(brand_name) == 0:
            # A required field is missing: drop the row.
            # print(list_)
            continue
        if re.match(r".*\d", city) is not None or city == 'city':
            print(city)
            continue
        # Skip non-city region names, e.g. 新疆维吾尔自治区 / 香港特別行政區.
        # NOTE(review): keeps the original operators verbatim; the mixed ==/!=
        # looks suspicious — confirm the intended rule before changing it.
        if city.find('市') == -1 and city.find('州') == -1 and city.find('區') != -1 and city.find('区') != -1:
            print(city)
            continue
        # BUG fix: this check was duplicated verbatim in the original; the
        # second copy was a no-op and has been removed.
        if re.match(r".*\[", district) is not None or re.match(r".*[a-zA-Z0-9]", district) is not None or district.find(
                '店') != -1:
            district = '值缺失_区'
        # e.g. 长沙县长沙理工大学云塘校区 -> 长沙县
        if len(district.split('县')) >= 2:
            print(district)
            district = district.split('县')[0] + '县'
            print(district)
        # e.g. 安宁市连然街道办事处光明社区 -> 安宁市
        if len(district.split('市')) >= 2:
            print(district)
            district = district.split('市')[0] + '市'
            print(district)
        m = chk_is_coffee(brand_name)
        if not m:
            m = chk_is_coffee(catering_kind)
        if not m:
            dic_city_district_details = {
                'brand_name': brand_name,
                'address': address,
                'catering_kind': catering_kind,
                'average_price': average_price,
            }
            if city in cater_dic:
                if district in cater_dic[city]:
                    cater_dic[city][district].append(dic_city_district_details)
                else:
                    cater_dic[city][district] = [dic_city_district_details]
            else:
                cater_dic[city] = {district: [dic_city_district_details]}
            count_catering += 1
排查原料数据,发现其中的“区”字段值存在的问题:
0-缺失;
1-有值但是无法仅仅通过“区”字段值提取出“区”字段值;
2-有值但是可以仅仅通过“区”字段值提取出“区”字段值。

处理思路:
对于原料数据:
0-先取出100%合格的一手数据,将其他数据单独选出;
1-针对每个文件,去对单个文件建立规则:一个文件一个文件地去处理。

排查原料数据,未见异常
# -*- coding: UTF-8 -*-
import re
import pprint # 城市 城市编码 区域 区域编码 街道 地址 楼宇名(任务名/商圈名/area_name) 楼宇名编码 店铺名 店铺编码 店铺类型
# 城市 城市编码 区域 区域编码 街道 地址 楼宇名(任务名/商圈名/area_name) 楼宇名编码 店铺名 "采集点名称 cp_name" "采集点编码cp_code" 店铺类型
# city,citycode,distric,districtcode,street,,address,city,area,longitude,latitude
# 城市 区域 街道 地址 楼宇名(任务名/商圈名/area_name) 楼宇名编码 店铺名 "采集点名称 cp_name" "采集点编码cp_code" 店铺类型
# 城市 区 地址 品牌名
# city,distric,address,brand_name,catering_kind


def chk_is_coffee(str):
    """Return True if a brand name / cuisine string looks coffee-related."""
    # Latin keywords, compared case-insensitively ('coffe' also catches typos).
    l_ = ['coffee', 'coffe', 'cafe']
    # 上岛花式铁板烧 日本菜
    # 泛太平洋大酒店面馆 其他
    # Chinese keywords (咖啡 = coffee, 星巴克 = Starbucks), matched literally.
    l_b = ['咖啡', '星巴克']
    for i in l_:
        if str.upper().find(i.upper()) != -1:
            return True
    for i in l_b:
        if str.find(i) != -1:
            return True
    return False


def chk_kfc_mdl(str):
    """Return 1 if the name contains 麦当劳 (McDonald's), else 0."""
    if str.find(u"麦当劳") != -1:
        return 1
    return 0


def get_brand_name(str):
    """Strip a parenthesized branch suffix from a shop name.

    e.g. '狗不理包子(前门店)' -> '狗不理包子';
         '(清真)三羊水饺(新民路店)' -> '三羊水饺'.
    Handles both halfwidth '(' and fullwidth '(' parentheses, including names
    that START with a parenthesized tag.
    """
    if str.find('(') == -1 and str.find('(') == -1:
        return str
    res = str.strip(' ').split('(')[0].strip(' ')
    if len(res) == 0:
        # Name begins with a halfwidth '(...)' tag: take the part between the
        # closing ')' and the next '('.
        try:
            res = str.split(')')[1].split('(')[0]
        except IndexError:
            pass  # malformed input; fall through with res == ''
    res_b = res.split('(')[0]
    if len(res_b) == 0:
        # BUGFIX: a name beginning with a fullwidth '(...)' tag previously
        # came out as '' here; extract the part after the closing ')'.
        try:
            res_b = str.split(')')[1].split('(')[0].split('(')[0]
        except IndexError:
            res_b = res
    return res_b


# Module-level defaults so fun4() is importable/callable on its own;
# the driver section below re-initializes these before use.
cater_dic = {}
count_catering = 0


def fun4(file_, file_name='m_d_4'):
    """Parse one restaurant CSV into the global cater_dic.

    cater_dic layout: {city: {district: [{'brand_name', 'address',
    'catering_kind', 'average_price'}, ...]}}.  Coffee shops and rows with
    missing or garbled city data are skipped; count_catering counts kept rows.

    file_name selects the column layout:
      'm_d_4'  - Meituan/Dazhong dump 20170704
      'm_d_5'  - Meituan/Dazhong dump 20170705 (same columns shifted left)
      'GD_map' - Gaode POI dump of KFC/McDonald's only
    """
    global count_catering, cater_dic
    for i in file_:
        list_ = i.replace('\n', '').split(',')
        if file_name == 'm_d_4':
            city = list_[1]
            district = list_[2]
            address = list_[3].strip(' ')
            brand_name = get_brand_name(list_[4])
            catering_kind = list_[6] if not isinstance(list_[6], int) and len(list_[6]) > 0 else '未知'
            average_price = list_[5]
        elif file_name == 'm_d_5':
            city = list_[0]
            district = list_[1]
            address = list_[2].strip(' ')
            brand_name = get_brand_name(list_[3])
            catering_kind = list_[5] if not isinstance(list_[5], int) and len(list_[5]) > 0 else '未知'
            average_price = list_[4]
        elif file_name == 'GD_map':
            # Column 7 sometimes holds a '[...]' artifact; fall back to col 6.
            city = list_[6] if list_[7].find('[') != -1 else list_[7]
            district = list_[9]
            address = list_[14]
            brand_name = list_[0] if len(list_[0]) > 0 else ''
            average_price = ''
            if len(brand_name) > 0:
                # This dump contains only the two brands; normalize the name.
                brand_name = '麦当劳' if chk_kfc_mdl(brand_name) == 1 else '肯德基'
            catering_kind = '快餐厅'
        city = city.replace(' ', '')
        district = district.replace(' ', '')
        address = address.replace(' ', '')
        brand_name = brand_name.replace(' ', '')
        # Drop rows whose city field contains digits (garbled data).
        if re.match(r".*\d", city) is not None:
            print(city)
            continue
        # 数据缺失,则直接剔除 (any required field missing -> drop row)
        if len(city) == 0 or len(district) == 0 or len(address) == 0 or len(brand_name) == 0:
            continue
        # Keep only proper city names (containing 市 or 州).
        if city.find('市') == -1 and city.find('州') == -1:
            continue
        # Skip coffee shops entirely, whether flagged by name or cuisine type.
        if chk_is_coffee(brand_name) or chk_is_coffee(catering_kind):
            continue
        dic_city_district_details = {
            'brand_name': brand_name,
            'address': address,
            'catering_kind': catering_kind,
            'average_price': average_price,
        }
        cater_dic.setdefault(city, {}).setdefault(district, []).append(dic_city_district_details)
        count_catering += 1


s = '数据来源MEITUAN_DAZHONG_20170704.csv、MEITUAN_DAZHONG_20170705.csv、GD_POI_KFC_MC.csv'
print(s)

# 市key-区key-[品牌名,地址,菜类(类),均价]
# city,district,brand_name,address,catering_kind,average_price
cater_dic = {}
count_catering = 0

# Parse the three source dumps; count_catering accumulates across files.
# BUGFIX: the input files were opened and never closed; use context managers.
with open('MEITUAN_DAZHONG_20170704.csv', 'r', encoding='gbk') as f:
    fun4(f)
d4 = count_catering
with open('MEITUAN_DAZHONG_20170705.csv', 'r', encoding='gbk') as f_2:
    fun4(f_2, 'm_d_5')
d5 = count_catering
with open('GD_POI_KFC_MC.csv', 'r', encoding='utf-8-sig') as f_gd:
    fun4(f_gd, 'GD_map')
d6 = count_catering
print(d4, d5, d6)

# pp = pprint.PrettyPrinter(depth=6)
# pp.pprint(cater_dic)

# utf-8-sig writes a BOM so Excel/Notepad++ render the Chinese header correctly.
with open('美团_大众_高德__去咖啡_餐饮_DATAfrom美团大众20170704-0705和未标日期高德数据.csv', 'a', encoding='utf-8-sig') as f_out:
    # BUGFIX: the header promised 品牌名,地址 but rows were written with
    # address before brand_name; rows now follow the header column order.
    f_out.write('市,区,品牌名,地址,菜别(类型),均价\n')
    for city in cater_dic:
        for district in cater_dic[city]:
            for detail in cater_dic[city][district]:
                line = '%s,%s,%s,%s,%s,%s\n' % (
                    city, district, detail['brand_name'], detail['address'],
                    detail['catering_kind'], detail['average_price'])
                f_out.write(line)
# 201200 397178 405444
# 201200 397178 405106
# 201184 397144 405072
# 201184 397144 405072
.... cater_dic = {} .... if not m:
dic_city_district_details = {}
dic_city_district_details['brand_name'] = brand_name
dic_city_district_details['address'] = address
dic_city_district_details['catering_kind'] = catering_kind
dic_city_district_details['average_price'] = average_price
if city in cater_dic:
if district in cater_dic[city]:
cater_dic[city][district].append(dic_city_district_details)
else:
dic_city_district = []
dic_city_district.append(dic_city_district_details)
cater_dic[city][district] = dic_city_district
else:
dic_city = {}
dic_city_district = []
dic_city_district.append(dic_city_district_details)
dic_city[district] = dic_city_district
cater_dic[city] = dic_city
'utf-8-sig csv
# -*- coding: UTF-8 -*-
import re

# Module-level defaults so main_() can be called on its own; the driver
# section below re-initializes these before use.
# dic_mac: {mac: [{mac: {ssid,num,business_type,business_name,city,area,longitude,latitude}}, ...]}
dic_mac = {}
# total number of well-formed csv rows
count_mac = 0
# dic_mac_num: {mac: number of rows seen for that mac}
dic_mac_num = {}


def main_(file_):
    """Load one wifi-mac CSV into the global dic_mac / dic_mac_num / count_mac.

    Expected columns (9): mac,ssid,num,business_type,business_name,city,area,
    longitude,latitude.  Rows with a malformed mac are reported on stdout;
    rows with the wrong column count are silently dropped.
    """
    global count_mac, dic_mac, dic_mac_num
    for i in file_:
        list_ = i.replace('\n', '').split(',')
        # mac地址,ssid,个数,商圈类型,商圈名,城市,区域,经度,纬度
        mac = list_[0].replace(' ', '')
        # NOTE(review): \w also accepts non-hex word characters and the end is
        # unanchored; kept as-is so the same historical rows stay accepted.
        m = re.match(r"\w{2}:\w{2}:\w{2}:\w{2}:\w{2}:\w{2}", mac)
        if not m:
            print('mac字符串格式错误', mac)
            continue
        if len(list_) != 9:
            # 数据缺失,则直接剔除 (incomplete row -> drop)
            continue
        dic_inner = {
            'ssid': list_[1],
            'num': list_[2],
            'business_type': list_[3],
            'business_name': list_[4],
            'city': list_[5],
            'area': list_[6],
            'longitude': list_[7],
            'latitude': list_[8],
        }
        count_mac += 1
        dic_outer = {mac: dic_inner}
        if mac in dic_mac:
            dic_mac[mac].append(dic_outer)
        else:
            dic_mac[mac] = [dic_outer]
        if mac in dic_mac_num:
            dic_mac_num[mac] += 1
        else:
            dic_mac_num[mac] = 1


s = '数据来源mac_1.csv、mac_2.csv'
print(s)

# (Re)initialize the module-level accumulators before loading.
# 以mac为key,以其他字段为value
dic_mac = {}
# 对mac计数
count_mac = 0
# 以mac为key,以该mac的数据条数为value
dic_mac_num = {}
# 以mac的数据条数为key,以该种类别的mac数目和各个mac值组成的list为value
dic_mac_num_count = {}

# BUGFIX: the input files were opened and never closed; use context managers.
with open('mac_1.csv', 'r', encoding='utf8') as f:
    main_(f)
with open('mac_2.csv', 'r', encoding='utf8') as f_2:
    main_(f_2)

s = 'csv数据总条数='
print(s, count_mac)
# 不相同的mac数目
s = '不相同的mac数目='
distinct_mac_num = len(dic_mac)
print(s, distinct_mac_num)
s = '不相同的mac数目/csv数据总条数='
print(s, distinct_mac_num / count_mac)
s = 'csv数据总条数/不相同的mac数目='
print(s, count_mac / distinct_mac_num)

# Invert dic_mac_num: bucket macs by how many rows each one appeared in.
for mac in dic_mac_num:
    times = dic_mac_num[mac]
    if times in dic_mac_num_count:
        dic_mac_num_count[times]['num'] += 1
        dic_mac_num_count[times]['mac_list'].append(mac)
    else:
        dic_mac_num_count[times] = {'num': 1, 'mac_list': [mac]}
# -------write
# f = open('mac_appear_times_sorted.csv', 'a', encoding='utf-8-sig')
# str = 'mac,mac出现次数的类别,该类别下mac数,该类别下mac数,mac数与csv总数据的比值,mac数与csv总数据中不同的mac数的比值\n'
# f.write(str)
# # 中文 NotePad++ BOM
# f_summary = open('mac_appear_times_sorted_summary.csv', 'a', encoding='utf-8-sig')
# # str_summary = 'mac_appear_times_category,category_count,count/csv_whole_data,category_count/discount_mac_count\n'
# str_summary = 'mac出现次数的类别,该类别下mac数,mac数与csv总数据的比值,mac数与csv总数据中不同的mac数的比值\n'
# f_summary.write(str_summary)
#
# dic_mac_num_count_sorted_list = sorted(dic_mac_num_count.items(), key=lambda d: d[1]['num'], reverse=False)
# for i in dic_mac_num_count_sorted_list:
# times = i[0]
# num = i[1]['num']
# str_summary = '%i,%i,%f,%f\n' % (times, num, num / count_mac, num / distinct_mac_num)
# f_summary.write(str_summary)
# mac_list = i[1]['mac_list']
# for mac in mac_list:
# str = '%s,%i,%i,%f,%f\n' % (mac, times, num, num / count_mac, num / distinct_mac_num)
# f.write(str)
# f_summary.closed
# f.closed # OUR_OFFICE_SSID = ['SW_MobilePhone', 'SW_MobilePhone2', 'SW-Guest', 'kmz', 'lamciu', 'lamciu_5G', 'SW_MobilePhone_5G',
# 'wyf的iMac']
#
# f_our_office = open('mac_our_office_judgefrom_ssid.csv', 'a', encoding='utf-8-sig')
# for mac in dic_mac:
# for i in dic_mac[mac]:
# ssid = i[mac]['ssid']
# for our_ssid in OUR_OFFICE_SSID:
# if ssid == our_ssid:
# num = i[mac]['num']
# business_type = i[mac]['business_type']
# business_name = i[mac]['business_name']
# city = i[mac]['city']
# area = i[mac]['area']
# longitude = i[mac]['longitude']
# latitude = i[mac]['latitude']
# str = '%s,%s,%s,%s,%s,%s,%s,%s,%s\n' % (
# mac, ssid, num, business_type, business_name, city, area, longitude, latitude)
# f_our_office.write(str)
# f_our_office.closed # 同一mac的longitude latitude欧式距离:mac_1.csv、mac_2.csv中longitude latitude缺失值比例(?)
# 同一mac的longitude latitude欧式距离:mac_1.csv、mac_2.csv中longitude latitude缺失值比例(?)
# “南山区”、“福田区”在大陆有互异性,即同一个mac的area不为同一区即判定为错误mac
# A fixed AP cannot appear in two different areas, so a mac whose rows
# disagree on 'area' is flagged and both conflicting rows are written out.
f_diff = open('mac_error_appear_in_diff_cityarea_businessname.csv', 'a', encoding='utf-8-sig')
mac_error_list = []
row_fmt = '%s,%s,%s,%s,%s,%s,%s,%s,%s\n'
for mac in dic_mac_num:
    if dic_mac_num[mac] <= 1:
        continue
    # BUGFIX: the original first-record flag (flag_) was never updated after
    # the first record, so every record took the "first" branch and the
    # comparison/error-output branch was unreachable.
    first_seen = False
    area_0 = ''
    str_0 = ''
    for i in dic_mac[mac]:
        rec = i[mac]
        if not first_seen:
            # Remember the first record's area and its formatted row.
            first_seen = True
            area_0 = rec['area']
            str_0 = row_fmt % (mac, rec['ssid'], rec['num'], rec['business_type'],
                               rec['business_name'], rec['city'], area_0,
                               rec['longitude'], rec['latitude'])
        elif rec['area'] != area_0:
            # Conflict: write the first row and the conflicting row together.
            if mac not in mac_error_list:  # BUGFIX: avoid duplicate entries
                mac_error_list.append(mac)
            str_1 = row_fmt % (mac, rec['ssid'], rec['num'], rec['business_type'],
                               rec['business_name'], rec['city'], rec['area'],
                               rec['longitude'], rec['latitude'])
            f_diff.write(str_0)
            f_diff.write(str_1)
# BUGFIX: 'f_diff.closed' only read an attribute; close() actually flushes.
f_diff.close()
“utf-8-sig / api_res = r.data.decode('utf-8') / json_ = json.loads(api_res)”的更多相关文章
- html = data.decode('gbk').encode('utf-8')
html = data.decode('gbk').encode('utf-8')此处encode编码要与html文件内charset=utf-8的格式一致,如果不一致,浏览器打开乱码,文本编辑器正常 ...
- 将基因组数据分类并写出文件,python,awk,R data.table速度PK
由于基因组数据过大,想进一步用R语言处理担心系统内存不够,因此想着将文件按染色体拆分,发现python,awk,R 语言都能够非常简单快捷的实现,那么速度是否有差距呢,因此在跑几个50G的大文件之前, ...
- R Data Frame
https://www.datamentor.io/r-programming/data-frame/ Check if a variable is a data frame or not We ca ...
- R data formats
R data formats: Rdata Rda Rds 1.概念 rds是R语言中利用二进制保存的源文件,加载readr包以后,使用write_rds(x,file='x.rds')保存文件,re ...
- 使用macaca抓页面元素,执行命令后报安装失败处理Error: Command failed: ……pm install -r "/data/local/tmp/com.macaca.android.testing"
最近换了小米手机做自动化测试,执行命令的时候报安装失败错误,错误如下 解决:设置小米允许USB安装就好了 pm install -r "/data/local/tmp/com.macaca. ...
- Guzzle Unable to parse JSON data: JSON_ERROR_SYNTAX - Syntax error, malformed JSON
项目更新到正式平台时,出现Guzzle(5.3) client get请求出现:Unable to parse JSON data: JSON_ERROR_SYNTAX - Syntax error, ...
- R: data.frame 数据框的:查询位置、排序(sort、order)、筛选满足条件的子集。。
################################################### 问题:数据框 data.frame 查.排序等, 18.4.27 怎么对数据框 data.f ...
- R: data.frame 生成、操作数组。重命名、增、删、改
################################################### 问题:生成.操作数据框 18.4.27 怎么生成数据框 data.frame.,,及其相关操 ...
- ajax data数据里面传一个json到后台的写法
$.ajax({ url:url+"/crm/contact", type:'PUT', ...
随机推荐
- Spring Boot系列(三) Spring Boot 之 JDBC
数据源 类型 javax.sql.DataSource javax.sql.XADataSource org.springframework.jdbc.datasource.embedded,Enbe ...
- (一) MySQL架构
1.MySQL架构 MySQL拥有分层的架构,上层是服务器层的服务和查询执行引擎,下层是存储引擎,真正负责数据的存储和提取. 服务层包含了连接和线程处理,以及大多数MySQL的核心服务,如对SQL的解 ...
- Vue2+VueRouter2+webpack 构建项目实战(四)接通api,先渲染个列表
Vue2+VueRouter2+webpack 构建项目实战(四)接通api,先渲染个列表: Vue2+VueRouter2+Webpack+Axios 构建项目实战2017重制版(一)基础知识概述
- APMServ升级PHP至5.3
APMServ5.2.6 的php版本是php5.2.6,所以需要升级一下PHP版本:1.到 php下载地址下载PHP5.3的VC6版本的zip文件,我下载的是:php-5.3.23-Win32-VC ...
- mysql索引与补充
一, 什么是索引 为什么要有索引? 一般的应用系统,读写比例在10:1左右,而且插入操作和一般的更新操作很少出现性能问题,在生产环境中,我们遇到最多的,也是最容易出问题的,还是一些复杂的查询操作,因此 ...
- 十二、支持向量机(Support Vector Machines)
12.1 优化目标 参考视频: 12 - 1 - Optimization Objective (15 min).mkv 到目前为止,你已经见过一系列不同的学习算法.在监督学习中,许多学习算法的性能都 ...
- 【数据库运维】数据库(server)的时区设置及世界主要地区的时区
[时区设置不当会有什么问题] 当进行海外项目运维的时候,常常会遇到时区设置的问题.假设时区设置不当 或者 同样项目的server之间的时区不一致,都会有导致项目的数据异常的风险. 假设数据表的字段使用 ...
- mysql中,表与表之间的关系
""" 1.字段的修改.添加.删除 2.多表关系(外键) 3.单表详细操作:增删改,查(各种条件) """ 字段操作 create ta ...
- 从0构建webpack开发环境(一) 一个简单webpack.config.js
本文基于webpack4.X,使用的包管理工具是yarn 概念相关就不搬运了,直接开始 首先项目初始化 mkdir webpack-demo && cd webpack-demo ya ...
- HashMap、Hashtable和ConcurrentHashMap的区别
HashTable 底层数组+链表实现,无论key还是value都不能为null,线程安全,实现线程安全的方式是在修改数据时锁住整个HashTable,效率低,ConcurrentHashMap做了相 ...