python 读取mysql存储的文件路径下载文件,内容解析,上传七牛云,内容入es
#!/usr/bin/env python
# -*- coding: utf-8 -*-
import ConfigParser
import json
import os
import re
from re import sub
import sys
import time
import requests
from pdfminer.converter import PDFPageAggregator
from pdfminer.layout import LTTextBoxHorizontal, LAParams
from pdfminer.pdfdocument import PDFDocument
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfminer.pdfpage import PDFPage
from pdfminer.pdfpage import PDFTextExtractionNotAllowed
from pdfminer.pdfparser import PDFParser
from qiniu import Auth
from qiniu import etag
from qiniu import put_file
import log_config
from OP_Mysql import get_connection
from HTMLParser import HTMLParser
import random

# NOTE(review): the scrape fused "import random" and "reload(sys)" onto one
# line; they are split here.  reload()/setdefaultencoding() is a Python 2
# idiom, consistent with the ConfigParser/HTMLParser imports above.
reload(sys)
sys.setdefaultencoding('utf-8')

# Module-wide logger and configuration (mysql.conf must be in the CWD).
logger = log_config.getlogger('analysis_neeq_content', 'analysis_neeq_content.log')
conf = ConfigParser.ConfigParser()
conf.read("mysql.conf")

# Sharding parameters: this worker only processes rows whose
# neeq_id % neeq_server_num == neeq_remainder, with neeq_id <= neeq_start_id.
neeq_remainder = conf.get("basic_config", "neeq_remainder")
neeq_server_num = conf.get("basic_config", "neeq_server_num")
neeq_start_id = conf.get("basic_config", "neeq_start_id")
# Output locations: ES bulk-json directory/base name and downloaded files.
neeq_json_path = conf.get("basic_config", "neeq_json_path")
neeq_json = conf.get("basic_config", "neeq_json")
json_suffix = '.json'
neeq_id = conf.get("basic_config", "neeq_id")
neeq_file_path = conf.get("basic_config", "neeq_file_path")
# Qiniu cloud credentials.
access_key = conf.get("basic_config", "access_key")
secret_key = conf.get("basic_config", "secret_key")
bucket = conf.get("basic_config", "bucket")


class analysis:
    """Downloads announcement files listed in MySQL, extracts their text,
    uploads the originals to Qiniu and writes ES bulk-import json files."""

    def __init__(self):
        # Counter appended to the json file name when a file exceeds
        # 100000 lines (used by write_json_file for rollover).
        self.count = 0
        self.neeq_json = neeq_json
        # Browser-like headers required by www.neeq.com.cn downloads.
        self.headers = {
            'Host': 'www.neeq.com.cn',
            'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.79 Safari/537.36',
        }
        self.create_init_dirtory()  # create the output directories up front

    def create_init_dirtory(self):
        """Create the json output and file download directories if missing."""
        if not os.path.exists(neeq_json_path):
            os.makedirs(neeq_json_path)
        if not os.path.exists(neeq_file_path):
            os.makedirs(neeq_file_path)
# NOTE(review): the blog scrape lost all indentation and merged many lines;
# the defs below are methods of class `analysis` and belong at class-body
# indentation in the real file.
def get_data(self):
    """Process unhandled ssb_insight_neeq rows in pages of 1000.

    For every row assigned to this worker (neeq_id % neeq_server_num ==
    neeq_remainder) the announcement file is downloaded, its text extracted,
    the original uploaded to Qiniu and deleted locally, and the row appended
    to the ES bulk json file plus an es_id/file bookkeeping file.
    """
    with get_connection() as db:
        # Count the rows assigned to this worker shard.
        count = r"SELECT COUNT(*) as num FROM ssb_insight_neeq WHERE pro_status = 0 AND neeq_id <= %s and %s = (neeq_id %% %s)"
        logger.info("now execute sql script sql = %s" % count)
        try:
            db.cursor.execute(count, [neeq_start_id, neeq_remainder, neeq_server_num])
            counts = db.cursor.fetchall()
            num = counts[0]['num']
            logger.info("now rows num = %s" % num)
            # Ceiling division into pages of 1000 (Python 2 integer '/').
            if 0 != num % 1000:
                pages = num / 1000 + 1
            else:
                pages = num / 1000
            start_rows = 1000
            for i in range(0, pages):
                start_page = i * 1000
                sql = "SELECT t.sec_code,t.sec_name,t.title,t.doc_type,t.doc_type_key,c.industry1,c.industry2," \
                      "t.url,t.public_time,t.content,t.pro_status,t.module,t.es_id FROM ssb_insight_neeq t " \
                      "LEFT JOIN ssb_d_listed_company c ON t.sec_code = c.secCode WHERE t.pro_status = 0 and t.neeq_id <= %s " \
                      "AND %s = (t.neeq_id %% %s) ORDER BY t.neeq_id DESC LIMIT %s ,%s"
                db.cursor.execute(sql, [neeq_start_id, neeq_remainder, neeq_server_num, start_page, start_rows])
                result_datas = db.cursor.fetchall()
                json_data = []        # ES bulk items for this page
                es_id_file_addr = []  # es_id -> uploaded-file-key bookkeeping
                for row in result_datas:
                    item = {}
                    es_obj = {}
                    result = {'secCode': row['sec_code'],
                              'secName': row['sec_name'],
                              'title': row['title'],
                              'docType': row['doc_type'].split(','),
                              'docTypeKey': row['doc_type_key'].split(','),
                              'url': row['url'],
                              'publicTime': row['public_time'],
                              'industry1': row['industry1'],
                              'industry2': row['industry2'],
                              'content': row['content'],
                              'proStatus': bool(row['pro_status']),
                              'module': row['module'],
                              }
                    file_url = row['url']
                    self.download_file(file_url)
                    # File name = last path component of the URL.
                    file_name = re.findall(r".*/(.*)", file_url)[0]
                    file_paths = neeq_file_path + file_name
                    if os.path.exists(file_paths):
                        content = self.analysis_file_content(file_paths)
                        self.upload_qiniu(file_paths)
                        self.del_file(file_paths)
                        if content == '':
                            # Nothing extractable -- skip this row entirely.
                            continue
                        result['content'] = content
                    else:
                        logger.warn("file_url %s download fail" % file_url)
                        continue
                    item['id'] = row['es_id']
                    item['data'] = result
                    json_data.append(item)
                    es_obj['es_id'] = row['es_id']
                    es_obj['file_addr'] = file_paths
                    es_id_file_addr.append(es_obj)
                self.write_json_file(json_data)
                self.write_es_id_file_addr(es_id_file_addr)
        except Exception as e:
            logger.error("Error: unable to fetch data Exception %s" % e)


def write_json_file(self, json_data):
    """Append ES bulk items to the current json file, rolling over to a new
    numbered file once the current one exceeds 100000 lines."""
    json_path = neeq_json_path + self.neeq_json + json_suffix
    rows = self.get_json_rows(json_path)
    if rows > 100000:
        self.count = self.count + 1
        self.neeq_json = neeq_json + str(self.count)
        json_path = neeq_json_path + self.neeq_json + json_suffix
    with open(json_path, 'a') as es_file:
        for jsonitem in json_data:
            jsondatar = json.dumps(jsonitem, ensure_ascii=True)
            es_file.write(jsondatar + "\n")


def write_es_id_file_addr(self, es_id_data):
    """Append "es_id,file_addr;" lines for later reconciliation with ES."""
    with open(neeq_id, 'a') as es_id_file:
        for jsonitem in es_id_data:
            es_id_file.write(jsonitem['es_id'] + "," + jsonitem['file_addr'] + ";" + "\n")
def get_json_rows(self, json_path):
    """Return the number of newline-terminated rows in json_path, or 0 when
    the file does not exist.  Used to decide when to roll over to a new
    json output file."""
    if not os.path.exists(json_path):
        return 0
    count = 0
    # Count newlines in large binary chunks.  b'\n' (not '\n') keeps the
    # count correct on both Python 2 and Python 3, since the file is opened
    # in binary mode; the context manager closes it even on error.
    with open(json_path, 'rb') as json_file:
        while True:
            chunk = json_file.read(8192 * 1024)
            if not chunk:
                break
            count += chunk.count(b'\n')
    return count
def upload_qiniu(self, file_path_name):
    """Upload the local file to Qiniu, keyed by its local path."""
    q = Auth(access_key, secret_key)
    # Upload token valid for one hour.
    token = q.upload_token(bucket, file_path_name, 3600)
    ret, info = put_file(token, file_path_name, file_path_name)
    if info.status_code != 200:
        # Upload failures are real errors, not informational events.
        logger.error("file upload qiniuyun fail %s" % file_path_name)


def del_file(self, file_path_name):
    """Remove a local file; log when it is already gone."""
    if os.path.exists(file_path_name):
        os.remove(file_path_name)
    else:
        logger.info("%s 文件不存在" % file_path_name)
def download_file(self, file_url):
    """Download file_url into neeq_file_path, retrying up to 3 times.

    A short random delay keeps the crawler polite.  Failures are only
    logged; the caller detects success by checking that the file exists.

    BUG FIX: in the original, the try/except wrapped the whole while loop,
    so the first exception incremented retry and then *left* the loop (no
    retry ever happened), and a non-200 response never incremented retry,
    looping forever.  The try/except now sits inside the loop and non-200
    responses count as failed attempts.
    """
    time.sleep(random.uniform(1, 2))
    # File name = last path component of the URL (invariant, hoisted).
    file_name = re.findall(r".*/(.*)", file_url)[0]
    retry = 0
    while retry < 3:
        try:
            response = requests.get(file_url, stream=True, headers=self.headers, timeout=5)
            if response.status_code == requests.codes.ok:
                with open(neeq_file_path + file_name, "wb") as code:
                    for chunk in response.iter_content(chunk_size=1024):
                        if chunk:
                            code.write(chunk)
                break
            retry += 1
        except Exception as e:
            logger.exception(e)
            retry += 1
def analysis_file_content(self, filename):
    """Dispatch on file extension and return extracted plain text.

    Returns '' for unsupported extensions or when extraction fails.
    .lower() also accepts mixed-case suffixes like '.Pdf' (the original
    only matched all-lower/all-upper).
    """
    content = ''
    # Extension including the dot; the greedy match takes the last dot.
    suffix = re.findall(r'.*(\..*)', str(filename))[0].lower()
    if suffix == '.pdf':
        content = self.analysis_pdf_file_content(filename)
    elif suffix == '.html':
        content = self.analysi_html_file_content(filename)
    return content


def analysis_pdf_file_content(self, filename):
    """Extract the text of all horizontal text boxes from a PDF via pdfminer.

    Returns '' on any failure (logged).  The file handle is now closed even
    when parsing raises (the original leaked it on error)."""
    content = ''
    try:
        fileobject = open(filename, 'rb')
        try:
            parser = PDFParser(fileobject)
            document = PDFDocument(parser)
            if not document.is_extractable:
                raise PDFTextExtractionNotAllowed
            rsrcmgr = PDFResourceManager()
            laparams = LAParams()
            device = PDFPageAggregator(rsrcmgr, laparams=laparams)
            interpreter = PDFPageInterpreter(rsrcmgr, device)
            for page in PDFPage.create_pages(document):
                interpreter.process_page(page)
                layout = device.get_result()
                for box in layout:
                    if isinstance(box, LTTextBoxHorizontal):
                        content += box.get_text().encode('utf-8')
        finally:
            fileobject.close()
    except Exception as e:
        logger.error("analysis pdf file fail : %s" % e)
    return content


def analysi_html_file_content(self, filename):
    """Strip tags from an HTML file and return its plain text.

    BUG FIX: the original never returned the parsed text (it implicitly
    returned None, so the caller stored None as content), leaked the file
    handle, and left a stray debug `print` — all fixed here.
    """
    with open(filename, 'rb') as html_file:
        contents = html_file.read()
    return dehtml(contents)


class pythonNToTxt(HTMLParser):
    """HTMLParser subclass that collects text content, mapping <p> to a
    blank line and <br> to a newline."""

    def __init__(self):
        HTMLParser.__init__(self)
        self.__text = []

    def handle_data(self, data):
        text = data.strip()
        if len(text) > 0:
            # Collapse internal whitespace runs to single spaces.
            text = sub('[ \t\r\n]+', ' ', text)
            self.__text.append(text + ' ')

    def handle_starttag(self, tag, attrs):
        if tag == 'p':
            self.__text.append('\n\n')
        elif tag == 'br':
            self.__text.append('\n')

    def handle_startendtag(self, tag, attrs):
        if tag == 'br':
            self.__text.append('\n\n')

    def text(self):
        return ''.join(self.__text).strip()


def dehtml(text):
    """Best-effort HTML-to-text conversion; returns the raw input on error."""
    try:
        parser = pythonNToTxt()
        parser.feed(text)
        parser.close()
        return parser.text()
    except Exception as e:
        logger.error("html analysis exception : %s" % e)
        return text


if __name__ == "__main__":
    # Guarded so importing this module does not start a crawl.
    logger.info("analysis neeq content start,now params neeq_remainder=%s,neeq_start_id =%s,neeq_json = %s,neeq_id = %s ,neeq_file_path = %s" % (neeq_remainder, neeq_start_id, neeq_json, neeq_id, neeq_file_path))
    worker = analysis()  # renamed from `analysis` to avoid rebinding the class name
    worker.get_data()
#!/usr/bin/env python
# -*- coding: utf-8 -*-
import sys
import log_config
import ConfigParser
import pymysql
from DBUtils.PooledDB import PooledDB

# Python 2 default-encoding setup, matching the rest of the project.
reload(sys)
sys.setdefaultencoding('utf-8')

# Connection parameters are read once at import time from mysql.conf.
conf = ConfigParser.ConfigParser()
conf.read("mysql.conf")
user = conf.get("mysql", "user")
password = conf.get("mysql", "password")
database = conf.get("mysql", "database")
host = conf.get("mysql", "host")
port = conf.get("mysql", "port")
charset = "utf8"


class OPMysql(object):
    """Context-manager wrapper around a pooled pymysql connection.

    Usage: ``with get_connection() as db: db.cursor.execute(...)``.
    NOTE(review): __exit__ neither commits nor rolls back and does not
    suppress exceptions — callers must commit writes themselves; confirm
    the intended autocommit setting against the callers.
    """

    # Class-level connection pool, shared by all instances, created lazily.
    __pool = None

    def __init__(self):
        # Connection and cursor are opened in __enter__, not here.
        pass

    def __enter__(self):
        # Borrow a pooled connection and open a dict-returning cursor.
        self.conn = self.getmysqlconn()
        self.cursor = self.conn.cursor(cursor=pymysql.cursors.DictCursor)
        return self

    def __exit__(self, typeq, value, trace):
        # Close the cursor and return the connection to the pool.
        self.cursor.close()
        self.conn.close()

    @staticmethod
    def getmysqlconn():
        """Return a connection from the shared pool, creating it on first use."""
        if OPMysql.__pool is None:
            OPMysql.__pool = PooledDB(creator=pymysql, mincached=1, maxcached=10, host=host, user=user, passwd=password, db=database, port=int(port), charset=charset)
        return OPMysql.__pool.connection()


def get_connection():
    """Factory used by callers: ``with get_connection() as db: ...``."""
    return OPMysql()
日志模块(log_config)的实现见前面的随笔。
#------mysql basic config
[mysql]
user=用户名
password=密码
database=数据库
host=你的mysqlIp
port = 3306

[basic_config]
#---------------neeq config
#余数为0
neeq_remainder = 0
#服务器台数
neeq_server_num = 6
neeq_start_id = 1000
neeq_json_path = neeq/json/
neeq_json = neeq
neeq_id = neeq/neeq_id.txt
neeq_file_path = neeq/file/
bucket = 七牛云bucket
access_key =你的七牛云access_key
secret_key = 你的七牛云secret_key
python 读取mysql存储的文件路径下载文件,内容解析,上传七牛云,内容入es的更多相关文章
- JAVA中实现根据文件路径下载文件
import javax.servlet.http.HttpServletResponse; import java.io.File; import java.io.FileInputStream; ...
- 由ASP.NET Core根据路径下载文件异常引发的探究
前言 最近在开发新的项目,使用的是ASP.NET Core6.0版本的框架.由于项目中存在文件下载功能,没有使用类似MinIO或OSS之类的分布式文件系统,而是下载本地文件,也就是根据本地文件路径进行 ...
- Delphi阿里云对象存储OSS【支持上传文件、下载文件、删除文件、创建目录、删除目录、Bucket操作等】
作者QQ:(648437169) 点击下载➨Delphi阿里云对象存储OSS 阿里云api文档 [Delphi阿里云对象存储OSS]支持 获取Bucket列表.设置Bucket ...
- js上传文件带参数,并且,返回给前台文件路径,解析上传的xml文件,存储到数据库中
ajaxfileupload.js jQuery.extend({ createUploadIframe: function(id, uri) { //create frame var frameId ...
- 【python】用python脚本Paramiko实现远程执行命令、下载、推送/上传文件功能
Paramiko: paramiko模块,基于SSH用于连接远程服务器并执行相关操作. SSHClient: 用于连接远程服务器并执行基本命令 SFTPClient: 用于连接远程服务器并执行上传下载 ...
- 文件上传和下载(可批量上传)——Spring(二)
针对SpringMVC的文件上传和下载.下载用之前“文件上传和下载——基础(一)”的依然可以,但是上传功能要修改,这是因为springMVC 都为我们封装好成自己的文件对象了,转换的过程就在我们所配置 ...
- 基于SpringMVC的文件(增删改查)上传、下载、更新、删除
一.项目背景 摘要:最近一直在忙着项目的事,3个项目过去了,发现有一个共同的业务,那就是附件的处理,附件包括各种文档,当然还有图片等特殊文件,由于时间的关系,每次都是匆匆忙忙的搞定上线,称这项目的空档 ...
- 微信小程序开发技巧总结(二) -- 文件的选取、移动、上传和下载
微信小程序开发技巧总结(二) -- 文件的选取.移动.上传和下载 1.不同类型文件的选取 1.1 常用的图片 视频 对于大部分开发者来说,需要上传的文件形式主要为图片,微信为此提供了接口. wx.ch ...
- 文件上传和下载(可批量上传)——Spring(三)
在文件上传和下载(可批量上传)——Spring(二)的基础上,发现了文件下载时,只有在Chrome浏览器下文件名正常显示,还有发布到服务器后,不能上传到指定的文件夹目录,如上传20160310.txt ...
随机推荐
- 图像处理笔记(1): bmp文件结构处理与显示
1.1图和调色板的概念 如今Windows(3.x以及95,98,NT)系列已经成为绝大多数用户使用的操作系统,它比DOS成功的一个重要因素是它可视化的漂亮界面.那么Windows是如何显示图象的呢? ...
- AppCan使用注意问题
1.文件上传的时候尽量使用uexUploadMsg,然后注意文件名,文件名一定要正确才能传上去.
- keytool生成JKS证书的详细步骤及截图
注:防止有不必要的空格,尽量不要复制粘贴 1. 依据CFCA所提供的CN生成密钥存储文件和密钥对(创建JKS证书库) keytool -genkey -v -alias slserver -keyal ...
- Nginx启动提示找不到libpcre.so.1解决方法
如果是32位系统 [root@lee ~]# ln -s /usr/local/lib/libpcre.so.1 /lib 如果是64位系统 [root@lee ~]# ln -s /usr/lo ...
- Py修行路 python基础 (三)字典
names=["zhang"]names2=["jack","jie"]names.clear()print(names) #清空整个列表 ...
- File类的使用:遍历目录
- GO语言文件的创建与打开实例分析
本文实例分析了GO语言文件的创建与打开用法.分享给大家供大家参考.具体分析如下: 文件操作是个很重要的话题,使用也非常频繁,熟悉如何操作文件是必不可少的.Golang 对文件的支持是在 os pack ...
- 【bzoj1024】[SCOI2009]生日快乐
1024: [SCOI2009]生日快乐 Time Limit: 1 Sec Memory Limit: 162 MBSubmit: 2372 Solved: 1717[Submit][Statu ...
- oracle级联更新与级联删除
Oracle级联删除:可以使用外键约束来实现,建立表的主外键关系,给列设置级联删除.如下: ——创建了CLASS表,并设置ID字段为主键. -- Create tablecreate table CL ...
- 扩展卡尔曼滤波EKF与多传感器融合
参考:https://blog.csdn.net/young_gy/article/details/78468153 Extended Kalman Filter(扩展卡尔曼滤波)是卡尔曼滤波的非线性 ...