定向爬取极客学院视频。原本只有年费VIP才能下载，经过分析，只要找一个免费体验VIP账号即可爬取所有视频
涉及的基本技术：python xpath 正则 com+
通过python调用迅雷COM组件，实现自动创建文件夹和自动添加批量下载任务，前提是要成功安装迅雷和迅雷组件
思路：path路径爬取所有标签-》搜索页面所有该课程分类-》课程页面获取课程明细-》正则分析视频地址
极客学院的网站一直在改版，代码可能需要自己相应调整

import glob
import os
import re
import sys
import time

import requests
import scrapy
from lxml import etree

# Python 2 only: make UTF-8 the implicit str<->unicode codec.  ``reload(sys)``
# is needed because site.py deletes ``sys.setdefaultencoding`` at start-up.
# Python 3 already defaults to UTF-8 and has no builtin ``reload``, so the
# hack is skipped there.
if sys.version_info[0] == 2:
    reload(sys)  # noqa: F821 (builtin on Python 2)
    sys.setdefaultencoding("utf-8")

#baesurl = "http://www.jikexueyuan.com/search/s/q_"

#base_path = "f:/jike/"

# Request headers sent with every authenticated request.
# The Cookie must be captured from your own logged-in (VIP) browser session;
# without a valid one only the free courses can be fetched.
# NOTE(review): the Cookie below is a hard-coded session credential taken from
# the original post - replace it with your own and keep it out of version
# control.
headers = {
    "Host": "www.jikexueyuan.com",
    "User-Agent": "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:39.0) Gecko/20100101 Firefox/39.0",
    # "*/*" is the catch-all media range; the previous value ",/;q=0.8" was
    # not a valid Accept entry.
    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
    "Accept-Language": "zh-CN,zh;q=0.8,en-US;q=0.5,en;q=0.3",
    "Accept-Encoding": "gzip, deflate",
    "Cookie": "ga=GA1.2.1700377703.1438173034; Hmlvtf3c68d41bda15331608595c98e9c3915=1438173034; MECHATLVTime=1438179151498; MECHATCKID=cookieVal=006600143817303272961295; statssid=1438985023415; statuuid=1438173038588973692017; connect.sid=s%3AWt8IWWxkVZ6zlhop7HpbG-vtXqtwIAs.QC1tYy4qV1bHOMDN0UTUfScLKFncl4NY5zAk1SS17Kw; QINGCLOUDELB=37e16e60f0cd051b754b0acf9bdfd4b5d562b81daa2a899c46d3a1e304c7eb2b|VbjfT|VbjfT; Hmlpvtf3c68d41bda15331608595c98e9c3915=1438179151; statisNew=0; statfromWebUrl=; gat=1; uname=jike76; uid=2992598; code=SMapFI; authcode=d572TzIvHFXNIVNXcNf4vI5lv1tQlyEknAG4m0mDQmvMRPa4VhDOtJXOSfO%2BeVFVPzra8M1sEkEzxqLX9qRgS6nWhd5VMobbDpeqvJ726i54TqMoDo81P4OlhQ",
    "Connection": "keep-alive",
}

class jikeautodown(object):
    """Crawl jikexueyuan.com course videos and queue them in Xunlei (Thunder).

    Pipeline: learning-path page -> tag names -> paged search results for each
    tag -> course pages -> lesson pages -> ``<source src="...">`` video URL ->
    Xunlei download task saved under ``base_path/<tag>/<course id>``.

    Authenticated requests use the module-level ``headers`` dict, whose Cookie
    must belong to a logged-in VIP session.
    """

    # State shared between the crawl steps.
    base_path = ""
    base_url = ""
    course_tag = ""
    course_id = ""
    # Legacy attribute spellings, kept so existing reads do not break.
    basepath = baseurl = coursetag = courseid = ""

    # Seconds to wait for the site before a request is abandoned; without a
    # timeout a stalled connection would block the crawl forever.
    request_timeout = 30

    def __init__(self, base_path, base_url):
        """Store the download root and the search-URL prefix.

        The crawl is started by :meth:`run`, not by the constructor, so the
        site is no longer crawled twice (once here and once in ``run``).

        :param base_path: directory the videos are downloaded into.
        :param base_url: search URL prefix that a tag name is appended to.
        """
        if not (base_path and base_url):
            print("base_path and base_url is all must needed!")
            return
        self.base_path = base_path
        self.base_url = base_url

    def run(self):
        """Start the crawl from the learning-path page."""
        if not (self.base_path and self.base_url):
            print("base_path and base_url is all must needed!")
            return
        self.get_tags()

    def _fetch(self, url, request_headers=None):
        """Return the response for *url*, or None when the request fails."""
        try:
            return requests.get(url, headers=request_headers,
                                timeout=self.request_timeout)
        except requests.RequestException as err:
            print("request error :%s %s" % (url, err))
            return None

    @staticmethod
    def _course_id_from_url(course_page_url):
        """Return the last path segment of the URL without query or extension."""
        segment = str(course_page_url).rsplit("/", 1)[-1].split("?", 1)[0]
        return segment.rsplit(".", 1)[0]

    def get_tags(self):
        """Collect every tag from the learning-path page and crawl each one."""
        response = self._fetch("http://www.jikexueyuan.com/path/")
        if response is None:
            return
        # Hand lxml the raw bytes so it detects the page encoding itself.
        tag_tree = etree.HTML(response.content)
        if tag_tree is None:
            return
        hrefs = tag_tree.xpath('/html/body/div[1]/div[4]/div/div[3]/div/a/@href')
        # The tag name is the last path segment of each link.
        tag_lists = [str(href).rstrip("/").rsplit("/", 1)[-1]
                     for href in hrefs if href]
        for tag in tag_lists:
            print(tag)
            self.course_tag = tag
            self.get_total_page(tag)

    def get_total_page(self, tag, max_pages=49):
        """Walk the result pages of *tag* until the site reports no results.

        The pager is generated by JavaScript and cannot be scraped directly,
        so page numbers are simply probed in order.

        :param tag: tag name appended to ``base_url``.
        :param max_pages: highest page number that is probed.
        """
        if not tag:
            return
        for page in range(1, max_pages + 1):
            page_url = self.base_url + tag + "?pageNum=%d" % page
            response = self._fetch(page_url, headers)
            if response is None:
                return
            page_html = response.text
            # A logged-in page always contains the user menu.
            if "userMenu" not in page_html:
                print("please check the cookies")
                return
            if "no-search" in page_html:
                print("the tag ;%s,%d is biggest page" % (tag, page - 1))
                break
            self.get_course_pages(page_url)

    def get_course_pages(self, tag_url):
        """Open one result page and crawl every course linked from it."""
        if not tag_url:
            return
        print("the tag_url:%s " % tag_url)
        course_page_lists = self.get_xpath_lists(
            tag_url, headers,
            '//*[@id="changeid"]/ul/li/div/div[2]/h5/a/@href')
        for course_page_url in course_page_lists or []:
            self.get_down_urls(course_page_url)

    def get_down_urls(self, course_page_url):
        """Extract each lesson's video URL by regex and queue it in Xunlei."""
        if not course_page_url:
            return
        self.course_id = self._course_id_from_url(course_page_url)
        print("             course_id:%s %s" % (self.course_id, course_page_url))
        course_down_lists = self.get_xpath_lists(
            course_page_url, headers,
            '//*[@class="video-list"]/div[2]/ul/li/div/h2/a/@href')
        for course_down_url in course_down_lists or []:
            response = self._fetch(course_down_url, headers)
            if response is None:
                continue
            course_down = re.findall(r'source src="(.*?)"', response.text, re.S)
            if not course_down:
                continue
            print("                     %s" % course_down[0])
            if self.addTasktoXunlei(course_down[0]):
                print("                     is add success!")
                # Pause between tasks, as the original script did.
                time.sleep(5)

    def get_file_lists(self, course_tag, course_id):
        """Create ``base_path/<course_tag>/<course_id>`` and return its path.

        :returns: the directory path, or None when it could not be created.
        """
        course_path = os.path.join(self.base_path, course_tag, str(course_id))
        try:
            # makedirs also creates the base and tag directories when missing.
            os.makedirs(course_path)
        except OSError as err:
            # An already existing directory is fine; anything else is fatal.
            if not os.path.isdir(course_path):
                print("dir is create error,the error is %s" % err)
                return None
        return course_path

    def get_xpath_lists(self, url, headers, xpath):
        """Fetch *url* and return the non-empty *xpath* matches as strings.

        :returns: list of strings, or None when fetching or parsing fails.
        """
        try:
            response = requests.get(url, headers=headers,
                                    timeout=self.request_timeout)
            tree = etree.HTML(response.content)
            return [str(node) for node in tree.xpath(xpath) if node]
        except Exception as err:
            print("get xpath list is error is :%s" % err)
            return None

    def addTasktoXunlei(self, down_url):
        """Add *down_url* as a Xunlei download task.

        Xunlei and its COM component must be installed, and Xunlei should be
        configured not to prompt for new tasks, otherwise each task has to be
        confirmed by hand.  A typical URL looks like
        ``http://cv3.jikexueyuan.com/<timestamp>/<hash>/python/course_776/01/video/c776b_01_h264_sd_960_540.mp4``.

        :returns: True when the task was committed, False otherwise.
        """
        if not down_url:
            return False
        from win32com.client import Dispatch
        agent = Dispatch("ThunderAgent.Agent.1")
        course_path = self.get_file_lists(self.course_tag, self.course_id)
        if not course_path:
            print("                     AddTask is fail!")
            return False
        # The file name is the last path segment of the URL.
        file_name = str(down_url).replace(" ", "").replace("http://", "").rsplit("/", 1)[-1]
        try:
            agent.AddTask(down_url, file_name, course_path, "",
                          "http://cv3.jikexueyuan.com", 1, 0, 5)
            agent.CommitTasks()
        except Exception as err:
            print(err)
            print("                     AddTask is fail!")
            return False
        return True


# The script's entry point refers to the class by this spelling.
jike_auto_down = jikeautodown

if __name__ == "__main__":
    # First argument: download root directory (with trailing separator).
    # Second argument: search URL prefix that each tag name is appended to.
    # Uses the class name as declared in this file; ``jike_auto_down`` was
    # not defined anywhere and raised NameError.
    myjike = jikeautodown("f:\\jike\\", "http://www.jikexueyuan.com/search/s/q_")

    myjike.run()

python极客学院爬虫V1的更多相关文章

python scrapy版极客学院爬虫V2
python scrapy版极客学院爬虫V2 1 基本技术使用scrapy 2 这个爬虫的难点是 Request中的headers和cookies 尝试过好多次才成功(模拟登录),否则只能抓免费课 ...
基于requests实现极客学院课程爬虫
背景本文主要是为了完成极客学院课程<Python 单线程爬虫>中讲师布置的实战作业. 开发环境操作系统:windows 10 Python :Python 2.7 IDE:PyChar ...
【极客学院出品】Cocos2d-X系列课程之九-BOX2D物理引擎
Cocos2d-x 是时下最热门的手游引擎,在国内和国外手机游戏开发使用的份额各自是70%和25%,在App Store的top10中,有7个是用它开发的. 本节课程为Cocos2d-x系列课程之九, ...
scrapy爬取极客学院全部课程
# -*- coding: utf-8 -*- # scrapy爬取极客学院全部课程 import scrapy from pyquery import PyQuery as pq from jike ...
国内第一本micropython的书出版《机器人Python极客编程入门与实战》
第一本micropython的书<机器人Python极客编程入门与实战>. 购买地址:https://item.taobao.com/item.htm?spm=2013.1.w4018-1 ...
【极客学院-idea教程】
极客学院idea教程: http://whudoc.qiniudn.com/2016/IntelliJ-IDEA-Tutorial/index.html
maven介绍极客学院
来自极客学院 Apache Maven 是一套软件工程管理和整合工具.基于工程对象模型(POM)的概念,通过一个中央信息管理模块,Maven 能够管理项目的构建.报告和文档. Maven - 概述 M ...
极客学院年VIP卡原价260的F码,200出售
F码是中国最大的IT职业在线教育平台——极客学院推出的VIP时间兑换码,凭此可在极客学院官网兑换年VIP,畅享平台上所有IT技术课程. 购买请点击 http://www.bejson.com/othe ...
极客学院免费VIP
[手快福利]用我的链接注册极客学院,你我都能免费得30天VIP!6500+编程开发视频教程随便学,还能下载资料和源码 http://e.jikexueyuan.com/invite/index.htm ...

随机推荐

Oracle数据库，内置函数小结
1.聚合函数 count(字段) // 求非空行的数量 max(字段) // 获取最大值 sum(字段) //求和 avg(字段) // 平均值 min(字段) // 最小值 2.转换函数 to_da ...
pm2 配置
---恢复内容开始--- 1. ecosystem.json { "apps": [ { "name": "name", // 项目名 &q ...
[小北De编程手记] : Lesson 05 玩转 xUnit.Net 之从Assert谈UT框架实践
这一篇,本文会介绍一下基本的断言概念,但重点会放在企业级单元测试的相关功能上面.下面来跟大家分享一下xUnit.Net的断言,主要涉及到以下内容: 关于断言的概念 xUnit.Net常用的断言关于单 ...
Atitit.excel导出功能解决方案 php java C#.net版总集合.doc
Atitit.excel导出功能解决方案 php java C#.net版总集合.docx 1.1. Excel的保存格式office2003 office2007/2010格式1 1.2. 类库选 ...
Nibbler – 免费的网站测试和指标评分工具
Nibbler 是一款免费的工具,用于测试网站的各个方面指标.输入任意网站的地址,Nibbler 会给你一份报告,列出网站的10个关键领域的分数,包括可访问性,用户体验,搜索引擎优化,社交媒体和技术等 ...
经典网页设计：20个与众不同的国外 HTML5 网站
大家都都知道, HTML5 具备所有最新的技术和功能,帮助我们创造平滑过渡,花式图像滑块和动画.如果你正在考虑使用HTML5 来设计自己的网站,那么这个集合能够帮助你. 在过去的10年里,网页设计师使 ...
html4基础知识梳理
基础的html知识,只放Xmind的截图. 第一部分: 第二部分: 某些标签的使用示例及注意事项,在印象笔记里.
原生JS：Array对象详解
Array对象本文参考MDN做的详细整理,方便大家参考[MDN](https://developer.mozilla.org/en-US/docs/Web/JavaScript/Reference/ ...
记STM32F030多通道ADC DMA读取乱序问题
问题描述通过 uint16_t ConvData[8]保存DMA搬运的ADC转换数值,但是这个数组数值的顺序总是和ADC不是顺序对应的.比如用7个通道的ADC,当设置ADC_InitStructure ...
通过组策略实现Firefox自动以当前域账号登录MOSS站点---（原创）
忘忧草原创,转发请保留本人的大名,谢谢,如果需要文档的请找我索取前言通过组策略实现基于AD的windows验证的sharepoint站点在火狐下自动以当前域账号登录. 操作步骤-在服务器添加策略工 ...

python极客学院爬虫V1