一点资讯视频抓取 phantomjs

# _*_ coding: utf-8 _*_

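"""
Yidian Zixun video scraper (PhantomJS).

Approach:

1. Render each list page with PhantomJS (simulated browsing). Only the first
   page of every link is scraped (9-10 items); items are de-duplicated by title.

2. Run as a scheduled job, once a day at 8 o'clock.
"""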

import MySQLdb

import redis

import sys

import os

import re

import urllib

import requests

import time

import hashlib

import traceback

import urlparse

import random

import signal

# import multiprocessing

import matplotlib

matplotlib.use("Agg")

import shutil

import socket  # global socket timeout, added because of slow image downloads

# NOTE(review): the timeout argument was missing from the source as received
# (``socket.setdefaulttimeout()`` raises TypeError). 30 seconds is an assumed
# value - confirm against the deployed script.
socket.setdefaulttimeout(30)

import multiprocessing

from config import IConfig

from video_list import ydzx_url_list

from bs4 import BeautifulSoup

from upload_images import UploadFile

from moviepy.editor import VideoFileClip

from selenium import webdriver

from selenium.webdriver.common.desired_capabilities import DesiredCapabilities

# Python 2 only: site.py deletes sys.setdefaultencoding at start-up, so the
# module is reloaded to get it back, then the implicit str<->unicode codec is
# switched to UTF-8. Several str(...) calls on scraped text below rely on this.
reload(sys)

sys.setdefaultencoding('utf-8')

class WxpnVideo(multiprocessing.Process):
    """Worker process that scrapes Yidian Zixun video pages into the CMS.

    Pipeline (see ``run``): seed a Redis set with list-page URLs, render each
    list page with PhantomJS, and for every title not seen before download the
    thumbnail and video, upload both to OSS, insert the topic/content/video
    rows into MySQL and call the publish API.

    NOTE(review): ``__init__`` stores the publish API URL in the instance
    attribute ``video_publish``, which shadows the ``video_publish`` method on
    instances. The method is therefore only reachable as
    ``WxpnVideo.video_publish(instance)``. Left as is to keep the attribute
    name stable for existing readers.
    """

    # NOTE(review): the numeric literals behind the constants below were
    # missing from the source as received (e.g. ``timeout=`` and
    # ``random.randrange(, )``), which made the module unparseable. These
    # values are assumed stand-ins - confirm each against the deployed script.
    REQUEST_TIMEOUT = 30        # seconds, article page request
    PAGE_SLEEP_RANGE = (3, 6)   # random.randrange bounds around the page load
    ITEM_SLEEP_RANGE = (1, 3)   # random.uniform bounds between two downloads
    CONTENT_WEIGHT = 60         # literal `weight` value in the cmstop_content insert
    VIDEO_AID = 0               # `aid` value in the cmstop_video insert

    # The original inline comment states that a run is capped at three hours.
    MAX_RUNTIME = 3 * 60 * 60

    PHANTOMJS_PATH = '/usr/local/phantomjs/bin/phantomjs'
    PHANTOMJS_USER_AGENT = (
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/66.0.3359.139 Safari/537.36"
    )

    # (marker looked for, pattern extracting the count, seconds per unit),
    # checked in this order. A month is counted as 30 days.
    _RELATIVE_UNITS = (
        ('天', re.compile('(.*?)天'), 24 * 3600),
        ('小时', re.compile('(.*?)小时'), 60 * 60),
        ('分', re.compile('(.*?)分'), 60),
        ('月', re.compile('(.*?)个月'), 30 * 24 * 3600),
    )

    def __init__(self):
        """Load configuration and open the Redis, MySQL and OSS handles."""
        self.redisConf = IConfig.load('resource.redis')
        self.redisServer = redis.Redis(host=self.redisConf['host'], port=self.redisConf['port'], db=self.redisConf['db'], password=self.redisConf['passwd'])
        self.dbConfig = IConfig.load('resource.mysql')
        self.conn = MySQLdb.connect(
            user=self.dbConfig['user'],
            passwd=self.dbConfig['password'],
            db=self.dbConfig['dbname'],
            host=self.dbConfig['host'],
            charset="utf8",
            use_unicode=True)
        self.conn.ping(True)
        self.cursor = self.conn.cursor()
        # Headers sent with the article page request; 'Referer' is filled in
        # per article by download_video.
        self.headers = {
            'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.181 Safari/537.36',
            'Host': 'www.yidianzixun.com',
            'X-Requested-With': 'XMLHttpRequest',
            'Connection': 'keep-alive',
            'Accept': '*/*',
            'Accept-Encoding': 'gzip, deflate',
            'Accept-Language': 'zh-CN,zh;q=0.9',
        }
        self.domain = IConfig.load('resource.domain')
        self.apiConf = IConfig.load('resource.apiurl')
        # Redis set of list-page URLs still to visit / set of title hashes already stored.
        self.key_video_list = 'wxpn:video:list'
        self.key_title = 'wxpn:video:title'
        self.storeConfig = IConfig.load('resource.store')
        self.thumb_path = self.storeConfig['images_path']
        self.ossConf = IConfig.load('resource.oss')
        self.key_id = self.ossConf['access_key_id']
        self.key_secret = self.ossConf['access_key_secret']
        # 'endponit' (sic) is the key name used by the configuration file.
        self.endponit = self.ossConf['endponit']
        self.img_upload = UploadFile()
        self.auth = self.img_upload.auth_oss(self.key_id, self.key_secret)
        self.videoConf = IConfig.load('resource.apiurl')
        # NOTE: this attribute shadows the video_publish method (see class docstring).
        self.video_publish = self.videoConf['video_publish_api']
        self.ydzx_page_api = self.videoConf['ydzx_page_api']
        self.start_time = int(time.time())
        multiprocessing.Process.__init__(self)

    @staticmethod
    def _md5_hex(text):
        """Return the hex MD5 of *text*, UTF-8 encoding it first if it is not bytes."""
        if not isinstance(text, bytes):
            text = text.encode('utf-8')
        return hashlib.md5(text).hexdigest()

    def store_video_list_redis(self, video_list):
        """Add every URL in *video_list* to the pending list-page set.

        Returns False when *video_list* is empty, None otherwise.
        """
        if not video_list:
            return False
        for per_list in video_list:
            # SADD ignores members that already exist, so no SISMEMBER
            # round-trip is needed first.
            self.redisServer.sadd(self.key_video_list, per_list)

    def _shutdown_driver(self, driver):
        """Terminate the PhantomJS child process and close the driver session."""
        try:
            # Same order as the original: signal the child first, then quit().
            driver.service.process.send_signal(signal.SIGTERM)
            driver.quit()
        except Exception:
            print(traceback.format_exc())

    def _render_page(self, link):
        """Load *link* in PhantomJS and return the page source, or None on failure."""
        dcap = dict(DesiredCapabilities.PHANTOMJS)
        dcap["phantomjs.page.settings.userAgent"] = self.PHANTOMJS_USER_AGENT
        driver = None
        try:
            driver = webdriver.PhantomJS(desired_capabilities=dcap, executable_path=self.PHANTOMJS_PATH)
            time.sleep(random.randrange(*self.PAGE_SLEEP_RANGE))
            driver.get(link)
            time.sleep(random.randrange(*self.PAGE_SLEEP_RANGE))
            return driver.page_source
        except Exception:
            print(traceback.format_exc())
            return None
        finally:
            # Always release the browser, also when get() raised; the original
            # leaked the PhantomJS process on that path.
            if driver is not None:
                self._shutdown_driver(driver)

    def get_video_para(self):
        """Yield ``(title_md5, docid)`` for every unseen video on the pending list pages.

        List-page URLs are popped from the Redis set ``key_video_list`` until
        it is empty. A title counts as seen when its MD5 is a member of the
        Redis set ``key_title``.
        """
        while True:
            # SPOP returns None on an empty set, which also covers the case
            # where another worker drained the set in the meantime.
            link = self.redisServer.spop(self.key_video_list)
            if link is None:
                break
            print(link)
            text = self._render_page(link)
            if text is None:
                continue
            soup = BeautifulSoup(text, 'lxml')
            title_list = soup.select('div.channel-news div.doc-title')
            itemid_list = soup.select('div.channel-news a.style-content-middle')
            if not (title_list and itemid_list):
                print('一点资讯视频列表{0}页链接请求失败,请及时查看原因'.format(link))
                continue
            try:
                # zip() stops at the shorter list, so a mismatch between the
                # two selections no longer raises IndexError part-way.
                for title, anchor in zip(title_list, itemid_list):
                    # Hash the UTF-8 bytes with ASCII whitespace stripped:
                    # this is what str(title.text).strip() produced under the
                    # UTF-8 default encoding, so stored hashes keep matching.
                    psw = self._md5_hex(title.text.encode('utf-8').strip())
                    print(title.text)
                    itemid = anchor['data-docid']
                    if not self.redisServer.sismember(self.key_title, psw):
                        yield psw, itemid
            except Exception:
                print(traceback.format_exc())

    def time_cycle(self, origin_time):
        """Convert a publish-time label into a Unix timestamp.

        Accepted forms: '昨天' (yesterday), '<n>天…' (days), '<n>小时…' (hours),
        '<n>分…' (minutes), '<n>个月…' (months of 30 days), all relative to
        now, or an absolute date formatted as ``%Y.%m.%d``.

        Returns the timestamp as int, or None (after printing the traceback)
        when the label cannot be parsed.
        """
        now = int(time.time())
        try:
            if origin_time == '昨天':
                return now - 24 * 3600
            for marker, pattern, seconds in self._RELATIVE_UNITS:
                if marker in origin_time:
                    return now - int(pattern.findall(origin_time)[0]) * seconds
            return int(time.mktime(time.strptime(origin_time, "%Y.%m.%d")))
        except (ValueError, IndexError, TypeError):
            print(traceback.format_exc())
            return None

    def _store_thumbnail(self, psw, thumb_src):
        """Download the poster image, upload it to OSS and return its public URL."""
        img_url_query = urlparse.parse_qs(urlparse.urlparse(thumb_src).query, True)
        if 'wx_fmt' in img_url_query:
            ext_name = '.' + img_url_query['wx_fmt'][0]
        else:
            ext_name = '.png'
        thumb_p = self.thumb_path + 'video/thumb'
        if not os.path.exists(thumb_p):
            # makedirs: the parent 'video' directory may not exist yet either.
            os.makedirs(thumb_p)
        # NOTE(review): the source read ``psw[:]``; slice bounds may have been
        # lost in extraction. As received it is the full digest - confirm.
        file_name = psw[:] + ext_name
        img_down_local_path = thumb_p + '/' + file_name
        urllib.urlretrieve(thumb_src, img_down_local_path)
        if os.path.exists(img_down_local_path):
            images_path = self.ossConf['video_thumb_path']
            self.img_upload.upload_to_oss(self.auth, self.endponit, images_path, file_name, img_down_local_path)
        return self.domain['img_url_oss'] + 'Cmstop/ydzx/' + file_name

    def _store_video(self, video_src):
        """Download the video, check it opens, upload it; return its file name or None."""
        # NOTE(review): the source read ``[-][:]``; any slice bounds on the
        # file name were lost. The full last path segment is used - confirm.
        video_name = str(video_src).split('/')[-1]
        video_path = self.thumb_path + 'video/' + video_name
        video_res = requests.get(video_src).content
        with open(video_path, 'wb') as f:
            f.write(video_res)
        try:
            # Opening the file with moviepy rejects truncated or non-video downloads.
            clip = VideoFileClip(video_path)
            print(clip.duration)
        except Exception:
            print(traceback.format_exc())
            return None
        if os.path.exists(video_path):
            images_path = self.ossConf['video_path']
            status = self.img_upload.upload_to_oss(self.auth, self.endponit, images_path, video_name, video_path)
            print('')
            if status != 'success':
                return None
        print('视频上传成功')
        return video_name

    def _insert_video_rows(self, title, thumb_src, now, timestamp, sourceid, source, video_link, playtime, video_id):
        """Insert the topic, content and video rows in one transaction; return the content id or None."""
        try:
            self.cursor.execute(
                "insert into cmstop_comment_topic(title, description, thumb, created, url_md5, url) values(%s, '', %s, %s, %s, '')",
                (title, thumb_src, now, self._md5_hex(thumb_src)))
            topicid = self.cursor.lastrowid
            self.cursor.execute("""
                insert into cmstop_content(topicid, sourceid, catid, modelid, title, subtitle, source_title, source_link, weight, status, created, score, published, thumb, createdby)
                values(%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s)
                """, (topicid, sourceid, 47, 4, title, None, title, '', self.CONTENT_WEIGHT, 3, now, 0, timestamp, thumb_src, 0))
            contentid = self.cursor.lastrowid
            self.cursor.execute(
                "insert into cmstop_video(contentid, video, playtime, author, video_id, aid) values(%s, %s, %s, %s, %s, %s)",
                (contentid, video_link, playtime, source, video_id, self.VIDEO_AID))
            # Single commit: either all three rows exist or none does. The
            # original committed each insert separately and then crashed with
            # NameError when an earlier insert had failed.
            self.conn.commit()
            return contentid
        except Exception:
            print(traceback.format_exc())
            self.conn.rollback()
            return None

    def _call_publish_api(self, contentid):
        """Request the publish API for *contentid*; failures are only reported."""
        api_url = self.video_publish + str(contentid)
        try:
            resp = urllib.urlopen(api_url)
            try:
                resp.read()
            finally:
                resp.close()
        except Exception:
            print('connect failed')

    def download_video(self, psw, itemid):
        """Fetch one article page and store its video in OSS and the CMS.

        Args:
            psw: MD5 hex digest of the title; used as the thumbnail file name
                and as the de-duplication key added to Redis on success.
            itemid: Yidian Zixun document id, appended to the article URL.

        Returns:
            False when the item was skipped or failed (request error, non-200
            response, page without a video, unreadable video, upload failure,
            database error); None after a successful run. Missing page
            elements other than the video tag still raise, and ``run``
            catches that.
        """
        now = int(time.time())
        url = 'http://www.yidianzixun.com/article/' + itemid
        print(url)
        self.headers['Referer'] = url
        try:
            res = requests.get(url=url, headers=self.headers, timeout=self.REQUEST_TIMEOUT)
            print(res.status_code)
        except Exception:
            print('小链接连接失败')
            # The original fell through here and hit NameError on `res`.
            return False
        if res.status_code != 200:
            print('一点资讯视频主链接请求失败,请及时查看原因')
            return False

        soup = BeautifulSoup(res.text, 'lxml')
        title = soup.select('div.left-wrapper > h2')[0].text
        video_tags = soup.select('div.video-wrapper > video')
        if not video_tags or not video_tags[0].get('src'):
            print('此篇为文章,不是视频')
            return False
        video_src = video_tags[0]['src']
        thumb_src = video_tags[0]['poster']

        # NOTE(review): the indexes into the `.meta > span` selections were
        # lost in extraction; 0 is assumed for both - confirm.
        try:
            source = soup.select('body.page-article .left-wrapper > .meta > a')[0].text
        except IndexError:
            source = soup.select('body.page-article .left-wrapper > .meta > span')[0].text
            source = re.sub('来源：', '', str(source))
        publishtime = soup.select('body.page-article .left-wrapper > .meta > span')[0].text
        # time_cycle returns None on failure instead of raising, so the
        # fallback has to test the value.
        timestamp = self.time_cycle(str(publishtime))
        if timestamp is None:
            timestamp = now

        thumb_src = self._store_thumbnail(psw, thumb_src)

        duration = re.search(r'"duration":(\d+)', res.text)
        playtime = duration.group(1) if duration else None

        video_name = self._store_video(video_src)
        if video_name is None:
            return False
        video_link = self.domain['img_url_oss'] + 'Cmstop/video/ydzx/' + video_name
        # NOTE(review): the source read ``[-][:-]``; presumably it cut the
        # file extension off the name - confirm.
        video_id = os.path.splitext(str(video_src).split('/')[-1])[0]

        # Resolved before the inserts because it commits on its own.
        sourceid = self.get_article_sourceid(source)
        contentid = self._insert_video_rows(title, thumb_src, now, timestamp, sourceid, source, video_link, playtime, video_id)
        if contentid is None:
            return False

        self.redisServer.sadd(self.key_title, psw)
        self._call_publish_api(contentid)

    def get_article_sourceid(self, source, medias=None):
        """Return the ``cmstop_source`` id for *source*, creating the row if needed.

        Args:
            source: source (publisher) name; surrounding whitespace is ignored.
            medias: optional collection of UTF-8 encoded names that have a
                signed contract.

        Returns:
            The source id, or 0 when the row could not be created.

        NOTE(review): when *medias* is not given, an existing row whose
        ``has_signed_contract`` is not 0 is reset to 0, as in the original -
        confirm that this is intended.
        """
        source = source.strip()
        sourceid = 0
        # Parameterised: the name comes from scraped HTML and must not be
        # concatenated into the statement.
        found = self.cursor.execute(
            'select `sourceid`, `name`, `has_signed_contract` from `cmstop_source` where `name`=%s',
            (source,))
        has_signed_contract = 1 if (medias and source.encode('utf-8') in medias) else 0
        if found:
            data = self.cursor.fetchone()
            sourceid = data[0]
            if data[2] != has_signed_contract:
                try:
                    self.cursor.execute("""
                        update `cmstop_source` set `has_signed_contract`=%s where sourceid=%s
                        """, (has_signed_contract, sourceid))
                    self.conn.commit()
                except Exception:
                    self.conn.rollback()
        else:
            try:
                self.cursor.execute("""
                    insert into `cmstop_source`(`name`, `logo`, `url`, `initial`, `has_signed_contract`)
                    values(%s, %s, %s, %s, %s)
                    """, (source, '', '', '', has_signed_contract))
                self.conn.commit()
                sourceid = self.cursor.lastrowid
            except Exception:
                self.conn.rollback()
        return sourceid

    def run(self):
        """Process entry point: scrape every pending list page once.

        A lock file (path from the 'resource.lock' configuration) prevents
        overlapping runs; when it already exists the method returns False.
        The run stops once it has lasted ``MAX_RUNTIME`` seconds. The
        download directory is emptied and the lock removed on every exit path.
        """
        os.system('pkill phantomjs')
        lockConf = IConfig.load('resource.lock')
        lock_file = lockConf['lock_path_ydzx']
        if os.path.exists(lock_file):
            print('lock file exists')
            return False
        # Create the lock without going through a shell.
        open(lock_file, 'a').close()
        try:
            self.store_video_list_redis(ydzx_url_list)
            for psw, itemid in self.get_video_para():
                print(psw)
                if int(time.time()) - self.start_time >= self.MAX_RUNTIME:
                    # Leave the loop instead of os._exit() so the cleanup
                    # below runs; the process ends when run() returns.
                    break
                try:
                    self.download_video(psw=psw, itemid=itemid)
                    time.sleep(random.uniform(*self.ITEM_SLEEP_RANGE))
                    os.system('pkill ffmpeg-osx-v3.2.4')
                except Exception:
                    print(traceback.format_exc())
                    continue
        finally:
            try:
                self.del_file(self.thumb_path + 'video')
            finally:
                if os.path.exists(lock_file):
                    os.remove(lock_file)

    def video_publish(self):
        """Re-call the publish API for a fixed range of content ids.

        Unreachable through an instance because ``__init__`` assigns a string
        to the attribute of the same name; call it as
        ``WxpnVideo.video_publish(instance)``.
        """
        self.cursor.execute('select contentid from cmstop_video where contentid<=3528920 and contentid>=3430851')
        for row in self.cursor.fetchall():
            self._call_publish_api(row[0])

    def del_file(self, path):
        """Delete every file and sub-directory inside *path*, keeping *path* itself.

        Does nothing when *path* is not an existing directory. Unlike the
        original, the process working directory is left untouched.
        """
        if not os.path.isdir(path):
            return
        for name in os.listdir(path):
            target = os.path.join(path, name)
            if os.path.isdir(target) and not os.path.islink(target):
                shutil.rmtree(target)
            else:
                os.remove(target)

if __name__ == '__main__':
    # Script entry point: run a single scraper process and wait for it to
    # finish. More WxpnVideo workers can be started the same way before join().
    worker = WxpnVideo()
    worker.start()
    worker.join()

一点资讯视频抓取 phantomjs的更多相关文章

寻找丢失的微服务-HAProxy热加载问题的发现与分析原创：单既喜一点大数据技术团队 4月8日在一点资讯的容器计算平台中，我们通过HAProxy进行Marathon服务发现。本文记录HAProxy服务热加载后某微服务50%概率失效的问题。设计3组对比实验，验证了陈旧配置的HAProxy在Reload时没有退出进而导致微服务丢失，并给出了解决方案. Keywords：HAProxy热加
寻找丢失的微服务-HAProxy热加载问题的发现与分析原创: 单既喜一点大数据技术团队 4月8日在一点资讯的容器计算平台中,我们通过HAProxy进行Marathon服务发现.本文记录HAPro ...
UC脱茧蜕变，移动资讯市场格局再生变
日前,UC浏览器正式更名为UC,同时正式发布大数据驱动的独立资讯应用“UC头条”.而整个UC品牌也从工具类升级为优质资讯内容平台,并吹响了向“大数据新型媒体平台”进军的冲锋号.根据UC官方公布的数据显 ...
2016年我们重新思考移动互联网创业的风险, 微信还是APP？
感觉这两年前端开发又火起来了,很多做内容创业和做微电商创业的人,往往都选择了运营微信号.对于做纯技术开发的人来说,一般是看不上微信号的,感觉没什么技术含量,或者说没什么技术壁垒.也有另一批人观点相反的 ...
wifi万能钥匙自媒体平台开放注册（附注册流程）
12月13日,有网友爆料,wifi万能钥匙自媒体开放注册,看来自媒体还没有达到饱和阶段,也印证了自媒体时代才刚刚到来.现在这个自媒体的时代,几乎大多互联网企业都开通了自己的自媒体,比较知名的像今日头条 ...
长文丨papi、咪蒙、罗胖之后，内容创业的机会在哪儿
一.内容的一年 app死了,内容永生! 2016年,创业圈画风突变,昨天还在激辩java和PHP谁更好的技术宅们.还在梦想着成为第二个乔布斯改变世界的产品狗们.还在忽悠着用O2O横扫传统行业的小老板们 ...
2016中国APP分类排行榜参选入围产品公示
2016中国APP分类排行榜参选入围产品公示 由中国科学院《互联网周刊》、中国社会科学院信息化研究中心、eNet硅谷动力共同主办的2016中国APP分类排行榜发布暨颁奖晚宴即将举行.此 ...
O2O模式成功案例分享汲取精华化为己用
本文通过分享一些公司的o2o成功案例让您了解什么是O2O,o2o的优势,o2o模式有哪些,未来我们要如何做o2o才更有竞争力,学牛人的o2o创新玩法,摸索适合自己的o2o思路.拥抱o2o - 传统企业 ...
Android APP安全评估工具 Drozer - 使用介绍
一.列出drozer当前可用的所有模块dz> list dz> list app.activity.forintent Find activities that can handle th ...
C#入门教程（一）–.Net平台技术介绍、C#语言及开发工具介绍-打造C#学习教程
一.什么是.Net平台? .Net平台是微软搭建的技术平台,技术人员在此平台上进行应用的搭建与开发.它提供了运行所必须的环境.NET Framework类库以及CLR(公共语言运行时).好比我们人类的 ...

随机推荐

iOS开发——解决UICollectionView的cell间距与设置不符问题
在用UICollectionView展示数据时,有时我们希望将cell的间距调成一个我们想要的值,然后查API可以看到有这么一个属性: - (CGFloat)minimumInteritemSpaci ...
MyBatis的基本用法
MyBatis MyBatis 是一款优秀的持久层框架,它支持定制化 SQL.存储过程以及高级映射.MyBatis 避免了几乎所有的 JDBC 代码和手动设置参数以及获取结果集.MyBatis 可以使 ...
iOS如何才能在招聘中表现得靠谱？
http://www.cocoachina.com/programmer/20150707/12414.html 近一年内陆续面试了不少人了,从面试者到面试官的转变让我对 iOS 招聘有了更多的感受. ...
Laravel 下的伪造跨站请求保护 CSRF#
简介# Laravel 可以轻松地保护应用程序免受跨站请求伪造(CSRF) 的攻击.跨站请求伪造是一种恶意的攻击, 他凭借已通过身份验证的用户身份来运行未经过授权的命令. Laravel 会自动为每个 ...
oracle函数 LPAD(c1,n[,c2])
[功能]在字符串c1的左边用字符串c2填充,直到长度为n时为止 [参数]C1 字符串 n 追加后字符总长度 c2 追加字符串,默认为空格 [返回]字符型 [说明]如果c1长度大于n,则返回c1左边n个 ...
@loj - 2290@ 「THUWC 2017」随机二分图
目录 @description@ @solution@ @accepted code@ @details@ @description@ 一个左右各 n 个点的二分图,图中的边会按照一定的规律随机出现. ...
getopt、getopt_long和getopt_long_only解析命令行参数
一:posix约定: 下面是POSIX标准中关于程序名.参数的约定: 程序名不宜少于2个字符且不多于9个字符: 程序名应只包含小写字母和阿拉伯数字: 选项名应该是单字符或单数字,且以短横 '-' 为前 ...
HTML5有哪些新特性？移除了哪些元素？
HTML5新特性: 拖放(Drag and drop)API 语义化标签(header.nav.footer.section.article.aside) 音频.视频(audio.video)API ...
uva 100 The 3n + 1 problem (RMQ)
uva.onlinejudge.org/index.php?option=com_onlinejudge&Itemid=8&page=show_problem&problem= ...
[转载] linux、Solaris下xdmcp远程桌面服务
原文链接 http://youlvconglin.blog.163.com/blog/static/52320420106243857254/ 使用图形界面远程登录linux和Solaris,首先要在 ...

一点资讯 视频抓取 phantomjs

一点资讯 视频抓取 phantomjs的更多相关文章

随机推荐

热门专题

一点资讯视频抓取 phantomjs

一点资讯视频抓取 phantomjs的更多相关文章