pressmuSpiderr

#!/usr/bin/env python

# encoding: utf-8

import requests

from random import choice

from lxml import html

from urllib.parse import urljoin,quote

import os

import time

# Category (tag) name -> absolute URL of that category's first listing page.
NAMEURLDIC = {}

# Video title -> absolute URL of that video's detail page.
NAMEURLDIC_L2 = {}

# Candidate User-Agent strings; one is picked at random per run.
ualist = [
    "Mozilla/5.0 (compatible; Baiduspider/2.0; +http://www.baidu.com/search/spider.html)",
    "Mozilla/5.0 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)",
    "Sogou web spider/4.0(+http://www.sogou.com/docs/help/webmasters.htm#07)",
    "Mozilla/5.0 (compatible; Yahoo! Slurp/3.0; http://help.yahoo.com/help/us/ysearch/slurp)",
]

ua = choice(ualist)

# The HTTP header name is "User-Agent" (hyphen). The previous "User_Agent"
# key was sent as an unrelated header, so the chosen UA was never applied.
header = {"User-Agent": ua}

# Site root, used as the base when resolving relative links.
mailurl = "https://press.mu"

# Tag index page: the crawl starts here.
url = "https://press.mu/tag"

# Search/listing URL template: first slot is the URL-quoted keyword,
# second slot is the page number.
searc_url = "https://press.mu/search/{}?p={}"

def getpage(url, timeout=30):
    """Fetch *url* with the module-level ``header`` and return the response.

    Args:
        url: Address to request.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout a stalled connection would block the crawl forever.

    Returns:
        The ``requests`` response object, or ``None`` when the request
        itself failed (connection error, timeout, malformed URL, ...).
    """
    req = None
    try:
        req = requests.get(url=url, headers=header, stream=True, timeout=timeout)
        # Make ``.text`` decode with the encoding detected from the body.
        req.encoding = req.apparent_encoding
    except requests.RequestException as err:
        # Best effort: report the failure and let the caller see ``None``
        # (or the partially prepared response) instead of aborting.
        print(f"request failed for {url}: {err}")
    return req

def parse(url):
    """Download *url* and parse its body into an lxml HTML tree.

    Args:
        url: Address of the page to fetch via ``getpage``.

    Returns:
        The root element produced by ``html.fromstring``, or ``None`` when
        the page could not be fetched or its body is empty/blank.
    """
    resp = getpage(url)
    if resp is None:
        # getpage signals a failed request with None.
        return None
    source = resp.text
    if not source or not source.strip():
        # Nothing to parse; previously this path hit an unbound ``root``.
        return None
    return html.fromstring(source)

def buff(url):
    """Return whatever ``getpage`` yields for *url* (used for raw downloads)."""
    return getpage(url)

def save_file(title,url,type="m3u8"):
    """Download *url* and store it as ``./pressimg/<title>.<type>``.

    Args:
        title: Base name of the output file (without extension).
        url: Address of the resource to download.
        type: File extension to use. The name shadows the builtin but is
            kept because callers may pass it by keyword.

    The download happens before the file is opened, so a failed request no
    longer leaves an empty file behind; in that case the item is skipped.
    """
    resp = buff(url)
    if resp is None:
        print(f"download failed, skipping: {url}")
        return

    # A path separator inside the title would point outside ./pressimg
    # (or at a missing sub-directory), so neutralise it.
    for sep in (os.sep, os.altsep):
        if sep:
            title = title.replace(sep, "_")

    # exist_ok avoids the check-then-create race of exists() + mkdir().
    os.makedirs("pressimg", exist_ok=True)

    with open(f'./pressimg/{title}.{type}',"wb") as fs:
        fs.write(resp.content)

# Step 1: collect every tag (category) link from the tag index page.
root = parse(url)
# Treat a missing tree as "no tags" rather than crashing on ``.xpath``.
taglist = root.xpath("//section[@id='tag']/ul/li/a") if root is not None else []

for tag in taglist:
    tag_texts = tag.xpath("./text()")
    tag_hrefs = tag.xpath("./@href")
    if not tag_texts or not tag_hrefs:
        # Anchor without a label or target: nothing to record.
        continue
    title = tag_texts[0]
    href = urljoin(mailurl, tag_hrefs[0])
    NAMEURLDIC.setdefault(title, href)

# Step 2: walk every listing page of every category and download each video.
for k, v in NAMEURLDIC.items():
    # First page
    root = parse(v)
    if root is None:
        continue

    # Number of videos
    hits = root.xpath("//p[@id='hit']/strong/text()")
    v_count = hits[0] if hits else "?"

    # The second-to-last pager entry carries the highest page number; a
    # listing without a pager is treated as a single page.
    pager = root.xpath("//nav[@id='pager']/ul/li[last()-1]/a/text()")
    try:
        v_max_page_num = int(pager[0]) if pager else 1
    except ValueError:
        v_max_page_num = 1

    print(f'当前分类为{k}:，视频件数为：{v_count}')

    for item in range(1, v_max_page_num + 1):
        print(f"获取第{item}页")
        if item != 1:
            # Use the current category ``k``. The old code used ``title``,
            # which still held the last tag seen by the first loop, so every
            # category paged through the same (wrong) search results.
            root = parse(searc_url.format(quote(k.strip()), item))
            if root is None:
                continue

        level2list = root.xpath("//section[@class='items']//h2/a")
        for level2 in level2list:
            l2_texts = level2.xpath("./text()")
            l2_hrefs = level2.xpath("./@href")
            if not l2_texts or not l2_hrefs:
                continue
            title_level2 = l2_texts[0]
            href_level2 = urljoin(mailurl, l2_hrefs[0])
            NAMEURLDIC_L2.setdefault(title_level2, href_level2)
            print(title_level2, href_level2)

            root2 = parse(href_level2)
            if root2 is None:
                continue

            videos = root2.xpath("//div[@id='player']//video/source/@src")
            posters = root2.xpath("//div[@id='player']//video/@poster")

            if videos:
                # urljoin leaves absolute URLs alone and completes
                # protocol-relative or site-relative ones.
                videourl = urljoin(mailurl, videos[0])
                print("videourl", videourl)
                save_file(title_level2, videourl)

            if posters:
                # Same result as prefixing "https:" to a "//host/..." value,
                # but also correct for absolute and site-relative posters.
                imgurl = urljoin(mailurl, posters[0])
                print("imgurl", imgurl)
                # Announce the download before it starts, not after.
                print("开始下载",f"{title_level2}.jpg")
                save_file(title_level2, imgurl, "jpg")

pressmuSpiderr的更多相关文章

Thymeleaf3.0内容
Thymeleaf简介什么是Thymeleaf Thymeleaf是网站或者独立应用程序的新式的服务端java模板引擎,可以执行HTML,XML,JavaScript,CSS甚至纯文本模板. Thy ...

随机推荐

【app.js】配置及App函数说明
app.js中的App函数用来注册一个小程序或设置全局变量. App函数: 语法:App(Object) 参数: Object json对象说明: App函数必须在app.js中调用 ...
LightGBM的算法介绍
LightGBM算法的特别之处自从微软推出了LightGBM,其在工业界表现的越来越好,很多比赛的Top选手也掏出LightGBM上分.所以,本文介绍下LightGBM的特别之处. LightGBM ...
Ubuntu16.04 + CUDA9.0 + cuDNN7.3 + Tensorflow-gpu-1.12 + Jupyter Notebook 深度学习环境配置
目录一.Ubuntu16.04 LTS系统的安装二.设置软件源的国内镜像 1. 设置方法 2.关于ubuntu镜像的小知识三.Nvidia显卡驱动的安装 1. 首先查看显卡型号和推荐的显卡驱动 ...
[C/C++] new/delete和malloc/free基本区别
/**便于遗忘时复习**/ 区别一:本质 new/delete 在C++中是运算符不是函数,需要编译器支持.malloc/free是库函数,需要头文件支持,在C语言中使用. 区别二:开辟内存大小用 ...
数据结构11——KMP
一.博客导航 KMP算法扩展KMP算法
详细讲解Java中方法的重载和重写
首先讲讲方法的重载: Java的重载就是在类中可以创建多个方法,它们具有相同的名字,但是却有不同的参数. 判断是否重载只有两个条件: 1)相同的方法名 2)不同的参数具体为: A.方法参数类型不同 ...
3GPP规范命名规则解读
http://blog.sina.com.cn/s/blog_6b10255301012co6.html 学习了解电信技术知识的一个很好的手段是阅读3GPP的规范.但是3GPP有大量的规范,我们可能经 ...
如何使用 window.open（）处理ajax请求返回的url：在本页面打开并防止浏览器拦截
ajax请求中用window.open()打开请求返回url(例如实现下载功能时),可能会因为跨域问题导致浏览器拦截解决办法是:在请求前,打开一个窗口,请求成功后将返回的url直接赋值给该窗口的hr ...
每个分组函数相当于一个for循环将集合的变量不断遍历
每个分组函数相当于一个for循环将集合的变量不断遍历
hihocoder 1320 压缩字符串（字符串+dp）
题解: 其实就是对应三种dp的转移方式 1.拼接类型 dp[i][j] = dp[i][c] + dp[c][j] 2.不变类型 dp[i][j] = j-i+1 3.重复类型(必须满足有k个循环节) ...

pressmuSpiderr

pressmuSpiderr的更多相关文章

随机推荐

热门专题