python 读取mysql存储的文件路径下载文件，内容解析，上传七牛云，内容入es

#!/usr/bin/env python

# -*- coding: utf-8 -*-

import ConfigParser

import json

import os

import re

from re import sub

import sys

import time

import requests

from pdfminer.converter import PDFPageAggregator

from pdfminer.layout import LTTextBoxHorizontal, LAParams

from pdfminer.pdfdocument import PDFDocument

from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter

from pdfminer.pdfpage import PDFPage

from pdfminer.pdfpage import PDFTextExtractionNotAllowed

from pdfminer.pdfparser import PDFParser

from qiniu import Auth

from qiniu import etag

from qiniu import put_file

import log_config

from OP_Mysql import get_connection

from HTMLParser import HTMLParser

import random

reload(sys)

sys.setdefaultencoding('utf-8')

logger = log_config.getlogger('analysis_neeq_content', 'analysis_neeq_content.log')

conf = ConfigParser.ConfigParser()

conf.read("mysql.conf")

neeq_remainder = conf.get("basic_config", "neeq_remainder")

neeq_server_num = conf.get("basic_config", "neeq_server_num")

neeq_start_id = conf.get("basic_config", "neeq_start_id")

neeq_json_path = conf.get("basic_config", "neeq_json_path")

neeq_json = conf.get("basic_config", "neeq_json")

json_suffix = '.json'

neeq_id = conf.get("basic_config", "neeq_id")

neeq_file_path = conf.get("basic_config", "neeq_file_path")

access_key = conf.get("basic_config", "access_key")

secret_key = conf.get("basic_config", "secret_key")

bucket = conf.get("basic_config", "bucket")

class analysis:

    def __init__(self):

        # 用于文件追加

        self.count = 0

        self.neeq_json = neeq_json

        self.headers = {'Host': 'www.neeq.com.cn',

                        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.79 Safari/537.36'

                        }

        self.create_init_dirtory()

    # 创建初始文件夹

    def create_init_dirtory(self):

        if not os.path.exists(neeq_json_path):

            os.makedirs(neeq_json_path)

        if not os.path.exists(neeq_file_path):

            os.makedirs(neeq_file_path)

    # mysql 获取数据

    def get_data(self):

        with get_connection() as db:

            # SQL 查询语句

            count = r"SELECT COUNT(*) as num FROM ssb_insight_neeq WHERE pro_status = 0 AND neeq_id <= %s and %s = (neeq_id %% %s)"

            logger.info("now excute sql script sql = %s" % count)

            try:

                # 获取所有记录列表

                db.cursor.execute(count, [neeq_start_id, neeq_remainder, neeq_server_num])

                counts = db.cursor.fetchall()

                num = counts[0]['num']

                logger.info("now rows num = %s" % num)

                if 0 != num % 1000:

                    pages = num / 1000 + 1

                else:

                    pages = num / 1000

                start_rows = 1000

                for i in range(0, pages):

                    start_page = i * 1000

                    sql = "SELECT t.sec_code,t.sec_name,t.title,t.doc_type,t.doc_type_key,c.industry1,c.industry2," \

                          "t.url,t.public_time,t.content,t.pro_status,t.module,t.es_id FROM ssb_insight_neeq t " \

                          "LEFT JOIN ssb_d_listed_company c ON t.sec_code = c.secCode WHERE t.pro_status = 0 and t.neeq_id <= %s " \

                          "AND %s = (t.neeq_id %% %s) ORDER BY t.neeq_id DESC LIMIT %s ,%s"

                    db.cursor.execute(sql, [neeq_start_id, neeq_remainder, neeq_server_num, start_page, start_rows])

                    result_datas = db.cursor.fetchall()

                    # 1000 数据放入此数组

                    json_data = []

                    es_id_file_addr = []

                    for row in result_datas:

                        item = {}

                        es_obj = {}

                        result = {'secCode': row['sec_code'],

                                  'secName': row['sec_name'],

                                  'title': row['title'],

                                  'docType': row['doc_type'].split(','),

                                  'docTypeKey': row['doc_type_key'].split(','),

                                  'url': row['url'],

                                  'publicTime': row['public_time'],

                                  'industry1': row['industry1'],

                                  'industry2': row['industry2'],

                                  'content': row['content'],

                                  'proStatus': bool(row['pro_status']),

                                  'module': row['module'],

                                  }

                        file_url = row['url']

                        self.download_file(file_url)

                        file_name = re.findall(r".*/(.*)", file_url)[0]

                        file_paths = neeq_file_path + file_name

                        if os.path.exists(file_paths):

                            content = self.analysis_file_content(file_paths)

                            self.upload_qiniu(file_paths)

                            self.del_file(file_paths)

                            if content == '':

                                continue

                            result['content'] = content

                        else:

                            logger.warn("file_url %s download fail" % file_url)

                            continue

                        item['id'] = row['es_id']

                        item['data'] = result

                        json_data.append(item)

                        es_obj['es_id'] = row['es_id']

                        es_obj['file_addr'] = file_paths

                        es_id_file_addr.append(es_obj)

                    self.write_json_file(json_data)

                    self.write_es_id_file_addr(es_id_file_addr)

            except Exception as e:

                logger.error("Error: unable to fecth data Exception %s" % e)

    def write_json_file(self, json_data):

        # 写数据

        json_path = neeq_json_path + self.neeq_json + json_suffix

        rows = self.get_json_rows(json_path)

        if rows > 100000:

            self.count = self.count + 1

            self.neeq_json = neeq_json + str(self.count)

            json_path = neeq_json_path + self.neeq_json + json_suffix

        with open(json_path, 'a') as es_file:

            for jsonitem in json_data:

                jsondatar = json.dumps(jsonitem, ensure_ascii=True)

                es_file.write(jsondatar+"\n")

    def write_es_id_file_addr(self, es_id_data):

        # 写入es_id，以及 七牛云 地址

        with open(neeq_id, 'a') as es_id_file:

            for jsonitem in es_id_data:

                es_id_file.write(jsonitem['es_id']+","+jsonitem['file_addr']+";"+"\n")

    # 获取json文件行数，用于分文件存储

    def get_json_rows(self, json_path):

        count = 0

        if not os.path.exists(json_path):

            return 0

        thefile = open(json_path, 'rb')

        while True:

            buffer = thefile.read(8192 * 1024)

            if not buffer:

                break

            count += buffer.count('\n')

        thefile.close()

        return count

    # 上传文件

    def upload_qiniu(self, file_path_name):

        q = Auth(access_key, secret_key)

        # 生成上传 Token，可以指定过期时间等

        token = q.upload_token(bucket, file_path_name, 3600)

        # 要上传文件的本地路径

        ret, info = put_file(token, file_path_name, file_path_name)

        # logger.info(info)

        if info.status_code != 200:

            logger.info("file upload qiniuyun fail %s" % file_path_name)

    # 删除文件

    def del_file(self, file_path_name):

        if os.path.exists(file_path_name):

            os.remove(file_path_name)

        else:

            logger.info("%s 文件不存在" % file_path_name)

    # 下载文件

    def download_file(self, file_url):

        time.sleep(random.uniform(1, 2))

        retry = 0

        try:

            while retry < 3:

                file_name = re.findall(r".*/(.*)", file_url)[0]

                response = requests.get(file_url, stream=True, headers=self.headers, timeout=5)

                if response.status_code == requests.codes.ok:

                    with open(neeq_file_path + file_name, "wb") as code:

                        for chunk in response.iter_content(chunk_size=1024):

                            if chunk:

                                code.write(chunk)

                break

        except Exception as e:

            logger.exception(e)

            retry += 1

    # 解析文件

    def analysis_file_content(self, filename):

        content = ''

        fenzhihouzhui = re.findall(r'.*(\..*)', str(filename))[0]

        if fenzhihouzhui == '.pdf' or fenzhihouzhui == '.PDF':

            content = self.analysis_pdf_file_content(filename)

        elif fenzhihouzhui == '.html' or fenzhihouzhui == '.HTML':

            content = self.analysi_html_file_content(filename)

        return content

    def analysis_pdf_file_content(self, filename):

        content = ''

        try:

            fileobject = open(filename, 'rb')

            parser = PDFParser(fileobject)

            document = PDFDocument(parser)

            if not document.is_extractable:

                raise PDFTextExtractionNotAllowed

            else:

                rsrcmgr = PDFResourceManager()

                laparams = LAParams()

                device = PDFPageAggregator(rsrcmgr, laparams=laparams)

                interpreter = PDFPageInterpreter(rsrcmgr, device)

                for page in PDFPage.create_pages(document):

                    interpreter.process_page(page)

                    layout = device.get_result()

                    for x in layout:

                        if isinstance(x, LTTextBoxHorizontal):

                            results = x.get_text().encode('utf-8')

                            content += results

            fileobject.close()

        except Exception as e:

            logger.error("analysis pdf file fail : %s" % e)

        return content

    def analysi_html_file_content(self, filename):

        content_open = open(filename, 'rb')

        contents = content_open.read()

        print contents

        contents = dehtml(contents)

class pythonNToTxt(HTMLParser):

    def __init__(self):

        HTMLParser.__init__(self)

        self.__text = []

    def handle_data(self, data):

        text = data.strip()

        if len(text) > 0:

            text = sub('[ \t\r\n]+', ' ', text)

            self.__text.append(text + ' ')

    def handle_starttag(self, tag, attrs):

        if tag == 'p':

            self.__text.append('\n\n')

        elif tag == 'br':

            self.__text.append('\n')

    def handle_startendtag(self, tag, attrs):

        if tag == 'br':

            self.__text.append('\n\n')

    def text(self):

        return ''.join(self.__text).strip()

def dehtml(text):

    try:

        parser = pythonNToTxt()

        parser.feed(text)

        parser.close()

        return parser.text()

    except Exception as e:

        logger.error("html analysis excepiton : %s" % e)

        return text

logger.info("analysis neeq content start,now params neeq_remainder=%s,neeq_start_id =%s,neeq_json = %s,neeq_id = %s ,neeq_file_path = %s" % (neeq_remainder, neeq_start_id, neeq_json, neeq_id, neeq_file_path))

analysis = analysis()

analysis.get_data()

#!/usr/bin/env python

# -*- coding: utf-8 -*

import sys

import log_config

import ConfigParser

import pymysql

from DBUtils.PooledDB import PooledDB

reload(sys)

sys.setdefaultencoding('utf-8')

conf = ConfigParser.ConfigParser()

conf.read("mysql.conf")

user = conf.get("mysql", "user")

password = conf.get("mysql", "password")

database = conf.get("mysql", "database")

host = conf.get("mysql", "host")

port = conf.get("mysql", "port")

charset = "utf8"

class OPMysql(object):

    __pool = None

    def __init__(self):

        # 构造函数，创建数据库连接、游标

        pass

    def __enter__(self):

        self.conn = self.getmysqlconn()

        self.cursor = self.conn.cursor(cursor=pymysql.cursors.DictCursor)

        return self

    def __exit__(self, typeq, value, trace):

        self.cursor.close()

        self.conn.close()

    # 数据库连接池连接

    @staticmethod

    def getmysqlconn():

        if OPMysql.__pool is None:

            __pool_a = PooledDB(creator=pymysql, mincached=1, maxcached=10, host=host, user=user, passwd=password, db=database, port=int(port), charset=charset)

            OPMysql.__pool = __pool_a

        return OPMysql.__pool.connection()

def get_connection():

    return OPMysql()

日志模块在前面随笔中

#------mysql basic config

[mysql]

user=用户名

password=密码

database=数据库

host=你的mysqlIp

port =3306

[basic_config]

#---------------neeq config

#余数为0

neeq_remainder = 0

#服务器台数

neeq_server_num = 6

neeq_start_id = 1000

neeq_json_path = neeq/json/

neeq_json = neeq

neeq_id = neeq/neeq_id.txt

neeq_file_path = neeq/file/

bucket = 七牛云bucket

access_key =你的七牛云access_key

secret_key = 你的七牛云secret_key

python 读取mysql存储的文件路径下载文件，内容解析，上传七牛云，内容入es的更多相关文章

JAVA中实现根据文件路径下载文件
import javax.servlet.http.HttpServletResponse; import java.io.File; import java.io.FileInputStream; ...
由ASP.NET Core根据路径下载文件异常引发的探究
前言最近在开发新的项目,使用的是ASP.NET Core6.0版本的框架.由于项目中存在文件下载功能,没有使用类似MinIO或OSS之类的分布式文件系统,而是下载本地文件,也就是根据本地文件路径进行 ...
Delphi阿里云对象存储OSS【支持上传文件、下载文件、删除文件、创建目录、删除目录、Bucket操作等】
作者QQ:(648437169) 点击下载➨Delphi阿里云对象存储OSS 阿里云api文档 [Delphi阿里云对象存储OSS]支持获取Bucket列表.设置Bucket ...
js上传文件带参数，并且，返回给前台文件路径，解析上传的xml文件，存储到数据库中
ajaxfileupload.js jQuery.extend({ createUploadIframe: function(id, uri) { //create frame var frameId ...
【python】用python脚本Paramiko实现远程执行命令、下载、推送/上传文件功能
Paramiko: paramiko模块,基于SSH用于连接远程服务器并执行相关操作. SSHClient: 用于连接远程服务器并执行基本命令 SFTPClient: 用于连接远程服务器并执行上传下载 ...
文件上传和下载（可批量上传）——Spring（二）
针对SpringMVC的文件上传和下载.下载用之前“文件上传和下载——基础(一)”的依然可以,但是上传功能要修改,这是因为springMVC 都为我们封装好成自己的文件对象了,转换的过程就在我们所配置 ...
基于SpringMVC的文件（增删改查）上传、下载、更新、删除
一.项目背景摘要:最近一直在忙着项目的事,3个项目过去了,发现有一个共同的业务,那就是附件的处理,附件包括各种文档,当然还有图片等特殊文件,由于时间的关系,每次都是匆匆忙忙的搞定上线,称这项目的空档 ...
微信小程序开发技巧总结（二） -- 文件的选取、移动、上传和下载
微信小程序开发技巧总结(二) -- 文件的选取.移动.上传和下载 1.不同类型文件的选取 1.1 常用的图片视频对于大部分开发者来说,需要上传的文件形式主要为图片,微信为此提供了接口. wx.ch ...
文件上传和下载（可批量上传）——Spring（三）
在文件上传和下载(可批量上传)——Spring(二)的基础上,发现了文件下载时,只有在Chrome浏览器下文件名正常显示,还有发布到服务器后,不能上传到指定的文件夹目录,如上传20160310.txt ...

随机推荐

谷歌浏览器无法播放QQ空间视频动画的解决方案
https://qzonestyle.gtimg.cn/qzone/photo/v7/js/module/flashDetector/flash_tutorial.pdf Chrome开启⽅法 1. ...
安装ElasticSearch客户端Kibana
安装Kibana Kibana是一个为 ElasticSearch 提供的数据分析的 Web 接口.可使用它对日志进行高效的搜索.可视化.分析等各种操作. wget https://artifacts ...
小程序mina框架与配置
小程序是采用MINA框架  <view> Hello {{name}}</view> <button bindtap=&quo ...
python开发socket套接字：粘包问题&udp套接字&socketserver
一,发生粘包服务器端 from socket import * phone=socket(AF_INET,SOCK_STREAM) #套接字 phone.setsockopt(SOL_SOCKET, ...
STL的一些技巧函数使用
1.emplace() 函数和 emplace_back() 函数 C++11的STL中新增加了emplace() 函数和 emplace_back() 函数,用来实现insert() 函数和 pus ...
highcharts钻取例子
<!doctype html> <html lang="en"> <head> <script type="text/javas ...
Linux性能监测：网络篇
网络的监测是所有 Linux 子系统里面最复杂的,有太多的因素在里面,比如:延迟.阻塞.冲突.丢包等,更糟的是与 Linux 主机相连的路由器.交换机.无线信号都会影响到整体网络并且很难判断是因为 L ...
storm集群配置以及java编写拓扑例子
storm集群配置安装修改配置文件使用java编写拓扑 storm集群配置 storm配置相当简单安装 tar -zxvf apache-storm-1.2.2.tar.gz rm apach ...
minerd.exe 处理
:top TASKKILL /F /IM "minerd.exe" Goto top and then searching the file system for minerd.e ...
y3pP5nCr攀科汲野奶园 O8XY02cm脱罕谘诜驮仆补殖沦ltGLD71R
{字母=2}谘们土毁低聊临禄霉{字母=3}焚派匠莆胺慷{字母=3}孔毡沃卮肪{字母=1}}{字母=1}尚澈心于逃丫导九壮何前僚九粤绦剖逃仲寺椿澈裳枚盟裳鹊酱滥食孤罕胤狼鞘孜跋柿悸菇沽惫菇卮认鹿锤敦擞众 ...

python 读取mysql存储的文件路径下载文件，内容解析，上传七牛云，内容入es

python 读取mysql存储的文件路径下载文件，内容解析，上传七牛云，内容入es的更多相关文章

随机推荐

热门专题