def gen_file_data(fodir, fname, sheet_index=0, ):
    """Load an ``.xlsx`` or ``.csv`` file into a list of rows.

    Args:
        fodir: Directory that contains the file.
        fname: File name. A name containing ``.xlsx`` is read with ``xlrd``;
            otherwise a name containing ``.csv`` is read with ``csv.reader``.
        sheet_index: Zero-based worksheet index (used for ``.xlsx`` only).

    Returns:
        A list of rows, each row being a list of strings.

    Raises:
        ValueError: If ``fname`` contains neither ``.xlsx`` nor ``.csv``.
    """
    # os.path.join yields the same backslash-separated path on Windows and
    # also works on platforms where '\\' is not a separator.
    fname_open = os.path.join(fodir, fname)
    if '.xlsx' in fname:
        book = xlrd.open_workbook(fname_open, on_demand=True)
        try:
            sheet = book.sheet_by_index(sheet_index)
            return [[str(c.value) for c in sheet.row(i)] for i in range(sheet.nrows)]
        finally:
            # Release the workbook even when reading a sheet fails.
            book.release_resources()
    if '.csv' in fname:
        # newline='' lets the csv module handle line endings inside quoted
        # fields itself; the ``with`` block closes the file.
        with open(fname_open, 'r', encoding='utf-8', newline='') as csvfile:
            return list(csv.reader(csvfile, delimiter=','))
    # Previously this fell through to ``return data`` with ``data`` unbound.
    raise ValueError('unsupported file type (expected .xlsx or .csv): %s' % fname)

  

import xlrd
import time
import sys
import os
import requests
import sqlite3
import threading
import math
import csv

# Absolute directory of this script; the status database and the data
# folders used below are located relative to it.
curPath = os.path.abspath(os.path.dirname(__file__))
# Parent directory of the script, appended to the module search path.
rootPath = os.path.split(curPath)[0]
sys.path.append(rootPath)

# Upper bound compared against a key's ``today_used`` counter.
MAX_USED_TIMES = 1900
overrun_str = '天配额超限,限制访问'
# Sentinel returned when no key is left under MAX_USED_TIMES.
DB_KEY_EXHAUST = 'DB_KEY_EXHAUST'
# Compared against the current time formatted as %H%M%S.
next_day_tag = '000003'

# File name of the SQLite status database.
db = 'py_bdspider_status.db'
# Turn the bare database file name into a path inside the script directory.
db = '%s\\%s' % (curPath, db)


def db_chk_one_exist(key):
    """Report whether ``key`` is present in table ``baidu_map_key_used``.

    Args:
        key: Value to look up in the ``key`` column.

    Returns:
        1 if a row with that key exists, otherwise 0.
    """
    conn = sqlite3.connect(db)
    try:
        # Bind the key as a parameter so quotes inside it cannot break or
        # alter the statement.
        res = conn.execute(
            'SELECT key FROM baidu_map_key_used WHERE key=?', (key,)
        ).fetchone()
    finally:
        # The earlier code referenced ``conn.close`` without calling it, so
        # the connection was never closed.
        conn.close()
    return 0 if res is None else 1


# def db_init_key_table():
# conn = sqlite3.connect(db)
# c = conn.cursor()
# k_file = '%s\\%s' % (curPath, 'bdmap_key.txt')
# with open(k_file, 'r', encoding='utf-8') as pf:
# for i in pf:
# if len(i) < 4:
# continue
# author, key = i.replace(' ', '').replace('\n', '').replace('\t', '').split(';')
# r = db_chk_one_exist(key)
# if r == 0:
# localtime_ = time.strftime("%y%m%d%H%M%S", time.localtime())
# sql = 'INSERT INTO baidu_map_key_used (author,key,update_time,today_used) VALUES ("%s","%s","%s",%s) ' % (
# author, key, localtime_, 0)
# c.execute(sql)
# conn.commit()
# conn.close()
# pf.close()
#
#
# db_init_key_table()


def db_recovery_bdkeynum():
    """Reset every key's ``today_used`` counter when the clock hits ``next_day_tag``.

    The reset runs only if the local time, formatted as ``%H%M%S``, equals
    ``next_day_tag`` at the moment of the call; otherwise nothing happens.
    NOTE(review): a call has to land in that exact second for the reset to
    fire — confirm this is the intended trigger.

    Returns:
        None.
    """
    if time.strftime("%H%M%S", time.localtime()) != next_day_tag:
        return
    localtime_ = time.strftime("%y%m%d%H%M%S", time.localtime())
    conn = sqlite3.connect(db)
    try:
        # The timestamp is bound as a parameter; it used to be pasted into
        # the SQL unquoted, i.e. as a numeric literal.
        conn.execute(
            'UPDATE baidu_map_key_used SET today_used = 0 ,update_time=?',
            (localtime_,),
        )
        conn.commit()
    finally:
        conn.close()


def db_get_one_effective():
    """Return the least-used key whose ``today_used`` is within the limit.

    Calls ``db_recovery_bdkeynum()`` first, then selects the key with the
    smallest ``today_used`` not exceeding ``MAX_USED_TIMES``.

    Returns:
        The key string, or ``DB_KEY_EXHAUST`` when no row qualifies.
    """
    db_recovery_bdkeynum()
    conn = sqlite3.connect(db)
    try:
        res = conn.execute(
            'SELECT key FROM baidu_map_key_used WHERE today_used<=? ORDER BY today_used ASC',
            (MAX_USED_TIMES,),
        ).fetchone()
    finally:
        conn.close()
    return DB_KEY_EXHAUST if res is None else res[0]


def db_update_one_today_used(key):
    """Increment ``today_used`` and refresh ``update_time`` for ``key``.

    Args:
        key: Value of the ``key`` column identifying the row to update.
    """
    localtime_ = time.strftime("%y%m%d%H%M%S", time.localtime())
    conn = sqlite3.connect(db)
    try:
        # Parameter binding keeps keys containing quote characters working.
        conn.execute(
            'UPDATE baidu_map_key_used SET today_used = today_used+1 ,update_time=? WHERE key=?',
            (localtime_, key),
        )
        conn.commit()
    finally:
        conn.close()


# Result folder, exception folder, and the list of names already requested.
dir_, dir_exception, requested_file_list = 'baidu_map_uid', 'baidu_map_uid_exception', []
# Folder paths (with trailing separator) for saved results and for exceptions.
requested_file_dir_str = '%s\\%s\\' % (curPath, dir_)
requested_file_dir_exception_str = '%s\\%s\\' % (curPath, dir_exception)
requested_file_dir = os.listdir(requested_file_dir_str)


def gen_requested_file_list(file_postfix='.html'):
    """Add the names of already-saved result files to ``requested_file_list``.

    Each entry of the ``dir_`` folder under ``curPath`` is cut at the first
    ``&`` and at the first ``file_postfix``; the remaining prefix is appended
    to the module-level ``requested_file_list`` unless already present.

    Args:
        file_postfix: Suffix marking the end of the name part of an entry.
    """
    filepath = os.path.join(curPath, dir_)
    # Track membership in a set so each entry is checked in O(1) instead of
    # rescanning the list.
    seen = set(requested_file_list)
    for entry in os.listdir(filepath):
        # Work on the entry name itself. The earlier code glued it onto the
        # folder path and split on ``dir_`` again, which picks the wrong
        # piece whenever the script path itself contains ``dir_``.
        requested_file = entry.split('&')[0].split(file_postfix)[0]
        if requested_file not in seen:
            seen.add(requested_file)
            requested_file_list.append(requested_file)


def gen_file_data(fodir, fname, sheet_index=0, ):
    """Load an ``.xlsx`` or ``.csv`` file into a list of rows.

    Args:
        fodir: Directory that contains the file.
        fname: File name. A name containing ``.xlsx`` is read with ``xlrd``;
            otherwise a name containing ``.csv`` is read with ``csv.reader``.
        sheet_index: Zero-based worksheet index (used for ``.xlsx`` only).

    Returns:
        A list of rows, each row being a list of strings.

    Raises:
        ValueError: If ``fname`` contains neither ``.xlsx`` nor ``.csv``.
    """
    # os.path.join yields the same backslash-separated path on Windows and
    # also works on platforms where '\\' is not a separator.
    fname_open = os.path.join(fodir, fname)
    if '.xlsx' in fname:
        book = xlrd.open_workbook(fname_open, on_demand=True)
        try:
            sheet = book.sheet_by_index(sheet_index)
            return [[str(c.value) for c in sheet.row(i)] for i in range(sheet.nrows)]
        finally:
            # Release the workbook even when reading a sheet fails.
            book.release_resources()
    if '.csv' in fname:
        # newline='' lets the csv module handle line endings inside quoted
        # fields itself; the ``with`` block closes the file.
        with open(fname_open, 'r', encoding='utf-8', newline='') as csvfile:
            return list(csv.reader(csvfile, delimiter=','))
    # Previously this fell through to ``return data`` with ``data`` unbound.
    raise ValueError('unsupported file type (expected .xlsx or .csv): %s' % fname)


# 3 9
# Starts empty; the row-grouping loop further down fills it, keyed by city
# and then by district.
request_dic = {}
# The two lists below are not referenced anywhere else in the visible code.
target_type_list = ['北京市', '上海市', '广州市']
target_type_except_list = [
    '火车站', '高铁站', '汽车站', '飞机场', '小学',
    '幼儿园', '中学',
    '综合医院', '商场',
]
# ['4s店','餐饮','家电','酒店','咖啡馆','售楼处','专科医院']
# ['住宅小区','写字楼'] # file_postfix_l = ['.html', '.txt']
# for i in file_postfix_l:
#     gen_requested_file_list(i)

# Source CSV, read from the script directory. This assignment had been
# swallowed by the comment line above it, leaving ``fname_source`` undefined.
fname_source = 'jfinder_public_jmtool_old_data.csv'
data_file = gen_file_data(curPath, fname_source)


def replace_illeagl_tag(str_):
    """Return ``str_`` with all spaces, newlines and tabs removed.

    Args:
        str_: Input string.

    Returns:
        The string without ``' '``, ``'\\n'`` and ``'\\t'`` characters.
    """
    for ch in (' ', '\n', '\t'):
        str_ = str_.replace(ch, '')
    return str_


# Example name: 碧海富通城三期(3栋) ok
# =碧海富通城-三期(3栋) ok
# Characters stripped from a place name before it is used in a query.
# NOTE(review): '?' appears twice; presumably one entry was a full-width
# character lost in an encoding round-trip — confirm against the original.
replace_to_empty_l = [' ', '|', '\t', '\n', '/', '?', '?', '·', '.']


def gen_bd_query_origin_name(name_):
    """Strip the characters in ``replace_to_empty_l`` (and ``?``) from a name.

    Args:
        name_: Raw place name.

    Returns:
        The cleaned name used for the query and for the result file name.
    """
    for ch in replace_to_empty_l:
        name_ = name_.replace(ch, '')
    # The earlier code also chained .replace('(', '(') and .replace(')', ')'),
    # which replace a character with itself and were dropped as no-ops.
    # NOTE(review): they look like full-width -> ASCII parenthesis
    # conversions whose source characters were lost — confirm.
    return name_.replace('?', '')


# Build the membership set once instead of scanning the list for every row.
_already_requested = set(requested_file_list)
for row in data_file:
    # db_from, db_id, db_area_code, db_name, db_type_, db_city, db_district, db_address, db_street, db_uid, db_submit_time = l
    # db_from, id, area_code, name, type_, city, district, address, street, uid, submit_time = l
    if not row:
        # csv.reader yields [] for blank lines; unpacking one would raise.
        continue
    if len(row) != 12:
        # Report rows with an unexpected column count instead of crashing.
        print('skip malformed row', row)
        continue
    dbid, area_code, uid, name_, type_en, city, district, address, street, db_from, submit_time, type_ = row

    if db_from == 'db_from':
        # Header row.
        continue
    request_name = gen_bd_query_origin_name(name_)
    input_ = '%s%s%s' % (city, district, request_name)
    if input_ in _already_requested:
        print('requested', input_)
        continue
    bucket = request_dic.setdefault(city, {}).setdefault(district, {
        'request_name_list': [],
        'request_uid_list': [],
        'file_row_list': [],
    })
    # NOTE(review): the indentation of this script was lost; the nesting of
    # the next statements is reconstructed (names and uids de-duplicated,
    # every accepted row recorded) — confirm against the original.
    if request_name not in bucket['request_name_list']:
        bucket['request_name_list'].append(request_name)
    uid = uid.replace(' ', '')
    if len(uid) > 0 and uid not in bucket['request_uid_list']:
        bucket['request_uid_list'].append(uid)
    bucket['file_row_list'].append(row)
del data_file

# URL template; R-QUERY, R-CITY and R-AK are substituted per request.
base_url = 'http://api.map.baidu.com/place/v2/suggestion?query=R-QUERY&region=R-CITY&city_limit=true&output=json&ak=R-AK'
# Substrings that mark a response as an error; such responses are not saved.
ex_l = ['Proxy Error', 'APP IP校验失败', 'APP不存在,AK有误请检查再重试', 'The requested URL could not be retrieved',
        'Address already in use', '天配额超限,限制访问', 'Parameter Invalid']
write_res_file_dir = '%s\\%s\\' % (curPath, dir_)


def write_res_file(str_, input_, ak, dir_=write_res_file_dir, file_postfix='.txt'):
    """Save a response body to disk unless it contains a known error marker.

    Args:
        str_: Response text.
        input_: Name used as the file's base name.
        ak: API key used for the request (only printed on error).
        dir_: Target folder prefix, concatenated directly with ``input_``.
        file_postfix: File extension appended to ``input_``.

    Returns:
        None. If any entry of ``ex_l`` occurs in ``str_`` the marker is
        printed and no file is written.
    """
    for ex in ex_l:
        if ex in str_:
            print('EXCEPTION-', ex, 'AK-', ak, 'STR-', str_)
            return
    fname = '%s%s%s' % (dir_, input_, file_postfix)
    # The ``with`` block closes the file; no explicit close() is needed.
    with open(fname, 'w', encoding='utf-8') as ft:
        ft.write(str_)
    print('ok', threading.get_ident(), input_)


class MyThread(threading.Thread):
    """Thread that calls ``func(args)`` with ``args`` passed as ONE argument."""

    def __init__(self, func, args, name):
        threading.Thread.__init__(self)
        self.name, self.func, self.args = name, func, args

    def run(self):
        # ``args`` is handed over as a single positional value, not unpacked.
        self.func(self.args)


def fun_(city):
    """Query the suggestion API for every pending name of ``city``.

    For each district in ``request_dic[city]`` and each name in its
    ``request_name_list``, fetch a key, request the URL, count the key usage
    and store the response via ``write_res_file``.

    Args:
        city: Key into ``request_dic``.
    """
    for district in request_dic[city]:
        for request_name in request_dic[city][district]['request_name_list']:
            ak = db_get_one_effective()
            if ak == DB_KEY_EXHAUST:
                print(DB_KEY_EXHAUST)
                # Stop the whole city. A ``break`` here would only leave the
                # inner loop and repeat the lookup for every later district.
                return
            url_ = base_url.replace('R-QUERY', request_name).replace('R-CITY', city).replace('R-AK', ak)
            print(url_)
            input_ = '%s%s%s' % (city, district, request_name)

            try:
                # requests has no default timeout; without one a stalled
                # connection would block this worker thread indefinitely.
                bd_res_json_str = requests.get(url_, timeout=30).text
            except requests.RequestException as exc:
                # One failed request should not end the worker thread.
                print('REQUEST-FAILED', input_, exc)
                continue
            db_update_one_today_used(ak)
            write_res_file(bd_res_json_str, input_, ak)


# try:
# # gen_requested_file_list()
# # gen_requested_file_list('.txt')
# # if input_ in requested_file_list:
# # continue
# bd_res_json_str = requests.get(url_).text
# db_update_one_today_used(ak)
# write_res_file(bd_res_json_str, input_)
# except Exception:
# bd_res_json_str = '请求百度-异常'
# write_res_file(bd_res_json_str, input_, requested_file_dir_exception_str)
#     print(bd_res_json_str, input_)

# Optional command-line range of 1-based city positions to process.
try:
    start_loop, stop_loop = int(sys.argv[1]), int(sys.argv[2])
except (IndexError, ValueError):
    # Missing or non-numeric arguments: fall back to the default range.
    start_loop, stop_loop = -1, 200


def main():
    """Start one worker thread per selected city and wait for all of them.

    Cities are the sorted keys of ``request_dic``; only those whose 1-based
    position lies within ``start_loop``..``stop_loop`` get a thread.
    """
    threads_list = []
    for nloop, city in enumerate(sorted(request_dic), start=1):
        if nloop < start_loop or nloop > stop_loop:
            continue
        # MyThread.run() passes ``args`` on as a single value, so the city
        # is given directly (``(city)`` was never a tuple anyway).
        threads_list.append(MyThread(fun_, city, fun_.__name__))
    for t in threads_list:
        # ``t.setDaemon = False`` replaced the method with False instead of
        # setting the flag; assign the attribute instead.
        t.daemon = False
        t.start()
    for t in threads_list:
        t.join()


if __name__ == '__main__':
    main()

  

csv .xlsx的更多相关文章

  1. 读取 csv , xlsx 表格并添加总分列

    import pandas as pd import numpy as np data = pd.read_excel('学生成绩表.csv',columns = ['学号','姓名','高数','英 ...

  2. Linux-各种姿势(less\vi等)打开各种类型的文件(txt/csv/xlsx等)出现不能打开(全乱码、部分乱码、二进制文件等)的问题

    (一)linux各种中文乱码解决办法整理 远程登录服务器用vim在终端下编辑查看文件经常会遇见各种中文乱码问题. 做如下设置可基本解决vim中文乱码问题,首先查看系统对中文的支持locale -a | ...

  3. SAS学习笔记15 SAS导入数据(import txt csv xlsx spss)

  4. C#:将.csv格式文件转换成.xlsx格式文件

    using System; using System.Collections.Generic; using System.ComponentModel; using System.Data; usin ...

  5. Data_r_and_w(csv,json,xlsx)

    import os; import sys; import argparse; try: import cStringIO as StringIO; except: import StringIO; impo ...

  6. 使用 WeihanLi.Npoi 操作 CSV

    使用 WeihanLi.Npoi 操作 CSV Intro 最近发现 csv 文件在很多情况下都在使用,而且经过大致了解,csv 格式简单,相比 excel 文件要小很多,读取也很是方便,而且也很通用 ...

  7. SSIS 中将csv 文件批量导出到excel 文件,并设置excel 文件中某些列的data column format 为Text

    csv 文件是文本文件类型,但是打开csv 文件后(默认使用本地已经安装的excel 来打开excel 文件),默认显示出来的是general 类型(column data format)的数据, 这 ...

  8. PHP 导出 Excel 兼容 CSV XlS格式

    class ExcelRead { /** * 获取Excel文件内容 * @param $file * @return mixed * @throws PHPExcel_Reader_Excepti ...

  9. Coursera-Getting and Cleaning Data-week4-R语言中的正则表达式以及文本处理

    博客总目录:http://www.cnblogs.com/weibaar/p/4507801.html Thursday, January 29, 2015 补上第四周笔记,以及本次课程总结. 第四周 ...

随机推荐

  1. Scala和Java二种方式实战Spark Streaming开发

    一.Java方式开发 1.开发前准备:假定您以搭建好了Spark集群. 2.开发环境采用eclipse maven工程,需要添加Spark Streaming依赖. 3.Spark streaming ...

  2. 【Consul】 分布式环境中的服务注册和发现利器

    参考资料: http://www.cnblogs.com/shanyou/p/4695131.html http://blog.csdn.net/viewcode/article/details/45 ...

  3. 如何安装Android模拟器到VM虚拟机

    1 像普通安装一样找到ISO镜像文件,该镜像文件名称为"android-x86-2.2-generic.iso",该镜像文件可以从谷歌官网得到 http://code.google ...

  4. Elasticsearch 2014年10月简报

    1. Elasticsearch Updates 1.1 公布了Kibana 4 Beta 1 和Beta 1.1 Kibana 4不管是在界面的布局,使用配置方法,还是底层绘制图表的方式都与Kiba ...

  5. 基于Android Classic Bluetooth的蓝牙聊天软件

    代码地址如下:http://www.demodashi.com/demo/12133.html BluetoothChat 基于Android Classic Bluetooth的蓝牙聊天软件,目前仅 ...

  6. mongoDB DOS窗口显示中文

    http://zhidao.baidu.com/question/157276582 由于mongodb后台的字符编码都是utf-8的,而中文windows cmd窗口使用的字符编码是GBK(属性-& ...

  7. iOS端App的icon和Launch Image规格实时更新

    启动影像 : iPhone :320 x 480 640 x 960 640*1136 750*1334 1242*2208  iPad :768 x 1004 1536 x 2008 APP图标: ...

  8. Spark高速上手之交互式分析

    1.1  Spark交互式分析 执行Spark脚本前,启动Hadoop的HDFS和YARN.Spark的shell提供 了简单方式去识别API.相同也有一个强大的工具去交互式地分析数据. 两种语言有这 ...

  9. JavaScript 中的命名空间

    全局变量应该由有系统范围相关性的对象们保留,并且它们的命名应该避免含糊并尽量减少命名冲突的风险.在实践中,这意味着你应该避免创建全局对象,除非它们是绝对必须的. 所以你对此是怎么做的?传统方法告诉我们 ...

  10. Android SDK环境搭建

    方法有二 方法一: Android SDK开发包国内下载地址 http://www.cnblogs.com/bjzhanghao/archive/2012/11/14/android-platform ...