How to backup your blogs on cnblogs
This is an alternative to OfflineExplorer.
Thanks to the article [1] listed in the Reference section; I modified several lines to adapt it to my blog. Here is the change list:
1. L193: change "homepage1_BottomPager" to "homepage1_HomePageDays_BottomPager", because "homepage1_BottomPager" no longer appears anywhere in the source of my cnblogs pages.
2. L394: set url to your last list page.
3. L396: set output to a directory on your local disk (both settings are shown in the snippet below).
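For reference, the two variables from steps 2 and 3 sit near the bottom of the script; the values here are from my own setup, so substitute yours:

    url = "http://www.cnblogs.com/yaoyansi/default.html?page=4"  # your last list page
    output = "/tmp/my_tmp/cnblogs"                               # local output directory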
Enjoy it!
#! encoding=utf-8
# cnblogs blog backup. Usage: edit the url and output variables at the
# bottom of this script, then run it.
import urllib2
import re
import os
import html5lib
from urlparse import urlparse
import codecs
import time

# Number of download attempts per resource before giving up.
gTestTime = 5
def DownloadFile(url, output):
    # Download one resource (image/css/script) and mirror it on disk as
    # output/<host>/<path>. Returns (responseText, url, output).
    responseText = None
    dirssPath = None
    try:
        res = urlparse(url)
        url = res.scheme + "://" + res.netloc + res.path
        path = res.path
        index = path.rfind('/')
        dirss = "/"
        if index != -1:
            dirss = output + "/" + res.netloc.encode("utf-8") + path[0:index].encode("utf-8")
        dirssPath = output + "/" + res.netloc.encode("utf-8") + path.encode("utf-8")
        dirss_ansi = dirss.decode('utf-8')
        if not os.path.exists(dirss_ansi):
            os.makedirs(dirss_ansi)
        global gTestTime
        count = gTestTime
        while True:
            if count < 0:
                break
            count = count - 1
            header = {"User-Agent": "Mozilla-Firefox5.0"}
            if not url.startswith("http://"):
                break
            try:
                time.sleep(0.5)
                request = urllib2.Request(url, None, header)
                response = urllib2.urlopen(request)
                dirssPath_ansi = dirssPath.decode("utf-8")
                if not os.path.exists(dirssPath_ansi):
                    resourceFile = open(dirssPath_ansi, "wb")
                    responseText = response.read()
                    if url.endswith(".js"):
                        # Strip absolute scheme prefixes inside scripts so the
                        # references resolve relatively when browsed offline.
                        responseText = responseText.replace("http://", "")
                        responseText = responseText.replace("https://", "")
                    resourceFile.write(responseText)
                    resourceFile.close()
                break
            except Exception, e:
                print "DownloadFile: %s:%s:%d" % (e, url, count)
    except Exception, e:
        pass
    return (responseText, url, output)
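# A quick sanity check of the on-disk layout (paths hypothetical): downloading
# "http://images.cnblogs.com/logo.gif" with output "/tmp/my_tmp/cnblogs/GnagWang"
# writes the file to "/tmp/my_tmp/cnblogs/GnagWang/images.cnblogs.com/logo.gif".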
def ReadCss(css):
    # css is the (text, url, output) tuple returned by DownloadFile; download
    # every resource referenced with url(...) inside the stylesheet.
    # Note: '"' is excluded from the character class; with a greedy [^)]+
    # the closing quote of url("a.png") would be captured as well.
    mode = r'url\("?([^)"]+)"?\)'
    pattern = re.compile(mode)
    try:
        text = css[0]
        if text == None:
            return
        strMatch = pattern.findall(text)
        for one in strMatch:
            newurl = GetConcatUrl(css[1], one)
            DownloadFile(newurl, css[2])
    except Exception, e:
        pass
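# Illustration (hypothetical input): on a stylesheet line like
#   background: url("../images/f_icon.png");
# the pattern captures "../images/f_icon.png", which GetConcatUrl below
# then resolves against the stylesheet's own url.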
def Download(url, output):
    # Save one article page plus the images, stylesheets and scripts it references.
    header = {"User-Agent": "Mozilla-Firefox5.0"}
    namespace = "{http://www.w3.org/1999/xhtml}"
    request = urllib2.Request(url, None, header)
    response = urllib2.urlopen(request)
    data = response.read()
    document = html5lib.parse(data)
    imgElements = document.findall('.//{0}img'.format(namespace))
    for img in imgElements:
        src = img.attrib["src"]
        try:
            res = urlparse(src)
            # Skip images that are not hosted on cnblogs.
            if not res.netloc.endswith(".cnblogs.com"):
                print "image not downloaded: %s:%s" % (src, res.netloc)
                continue
        except Exception, e:
            pass
        DownloadFile(src, output)
    linkElements = document.findall('.//{0}link'.format(namespace))
    for link in linkElements:
        href = link.attrib["href"]
        text = DownloadFile(href, output)
        if link.attrib.has_key("rel") and link.attrib["rel"].lower() == "stylesheet":
            ReadCss(text)
    scriptElements = document.findall('.//{0}script'.format(namespace))
    for script in scriptElements:
        if script.attrib.has_key("src"):
            DownloadFile(script.attrib["src"], output)
    htmlName = GetHtmlName(url)
    output = output.decode("utf-8") + "/" + htmlName + ".htm"
    # Turn absolute resource urls into relative ones so the saved page works
    # offline, then restore the xhtml namespace declaration the replace broke.
    data = data.replace("http://", "")
    data = data.replace("https://", "")
    data = data.replace("www.w3.org/1999/xhtml", "http://www.w3.org/1999/xhtml")
    resourceFile = open(output, "wb")
    resourceFile.write(data)
    resourceFile.close()
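# How the rewrite fits together (hypothetical values): an article tag such as
#   <img src="http://images.cnblogs.com/logo.gif">
# becomes <img src="images.cnblogs.com/logo.gif">, which the browser resolves
# relative to the saved .htm's directory -- exactly where DownloadFile put the file.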
def GetConcatUrl(url, png):
    # Resolve a relative reference from a CSS file (e.g. "../images/f_icon.png")
    # against the url the CSS file was downloaded from.
    # (Only handles references that start with "..".)
    count = 0
    index = png.find("..")
    startindex = None
    while index != -1:
        count = count + 1
        startindex = index + 2
        index = png.find("..", startindex)
    second = png[startindex:]
    index = url.rfind("/")
    endindex = 0
    while count >= 0 and index != -1:
        endindex = index
        index = url.rfind("/", 0, endindex)
        count = count - 1
    first = url[0:endindex]
    return first + second
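# Worked example (values from the comment above, result checked by hand):
#   GetConcatUrl("http://static.csdn.net/public/common/toolbar/css/index.css",
#                "../images/f_icon.png")
#   -> "http://static.csdn.net/public/common/toolbar/images/f_icon.png"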
def getAllListUrl(url):
    # Walk the pager on one list page to build the urls of every list page.
    header = {"User-Agent": "Mozilla-Firefox5.0"}
    request = urllib2.Request(url, None, header)
    response = urllib2.urlopen(request)
    data = response.read()
    # By default the document is an xml.etree element instance; whenever
    # possible html5lib chooses the accelerated ElementTree implementation
    # (i.e. xml.etree.cElementTree on Python 2.x).
    document = html5lib.parse(data)
    namespace = "{http://www.w3.org/1999/xhtml}"
    # Find <div id="homepage1_HomePageDays_BottomPager" class="topicListFooter">
    pageList = document.findall('.//{0}div[@id=\'homepage1_HomePageDays_BottomPager\']'.format(namespace))
    print "Debug> len(pageList)=%d" % len(pageList)
    # Step into the nested <div class="pager">
    alinks = list(pageList[0])
    # Its children are links like <a href="http://www.cnblogs.com/GnagWang/default.html?page=1">
    alinks1 = list(alinks[0])
    lastArticle = alinks1[len(alinks1) - 1]
    # e.g. lastArticleHref = u'http://www.cnblogs.com/GnagWang/default.html?page=20'
    lastArticleHref = lastArticle.attrib["href"]
    lastPageIndex = lastArticleHref.rfind("=")
    lastPageNum = int(lastArticleHref[lastPageIndex + 1:])
    urlInfo = lastArticleHref[0:lastPageIndex]
    urlList = []
    for x in xrange(1, lastPageNum + 1):
        urlList.append(urlInfo + "=" + str(x))
    return urlList
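# Shape of the result (following the example href above): a list such as
#   ["http://www.cnblogs.com/GnagWang/default.html?page=1", ...,
#    "http://www.cnblogs.com/GnagWang/default.html?page=20"]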
def getArticleList(url):
    # Collect [href, title] for every article across all list pages.
    urlList = getAllListUrl(url)
    print "Number of list pages: ", len(urlList)
    header = {"User-Agent": "Mozilla-Firefox5.0"}
    namespace = "{http://www.w3.org/1999/xhtml}"
    allLists = []
    pageNum = 0
    global gTestTime
    for one in urlList:
        tryCount = gTestTime
        pageNum = pageNum + 1
        print "Parsing list page {0}".format(pageNum)
        while tryCount > 0:
            try:
                tryCount = tryCount - 1
                time.sleep(0.5)  # the server stops responding if requests come too fast
                request = urllib2.Request(one, None, header)
                response = urllib2.urlopen(request)
                data = response.read()
                document = html5lib.parse(data, encoding="utf-8")
                # Each article title on a cnblogs list page sits in <div class="postTitle">
                articleLists = document.findall('.//{0}div[@class=\'postTitle\']'.format(namespace))
                allLists = allLists + articleLists
                break
            except Exception, e:
                print "getArticleList %s:%s:%d" % (e, one, tryCount)
    artices = []
    for article in allLists:
        alink = article.find(".//{0}a".format(namespace))
        # e.g. href = u'http://www.cnblogs.com/GnagWang/archive/2010/04/02/1702721.html'
        href = alink.attrib["href"]
        title = "".encode("utf-8")
        for x in alink.itertext():
            title = title + x.strip().encode("utf-8")
        artices.append([href, title])
    return artices
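# The return value is a list of [href, title] pairs, e.g. (hypothetical title):
#   [[u'http://www.cnblogs.com/GnagWang/archive/2010/04/02/1702721.html',
#     'article title as utf-8 bytes'], ...]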
def GetUserName(url):
    # Second-to-last path component: the blog's user name.
    htmlNameIndex = url.rfind("/")
    htmlNameIndex1 = url.rfind("/", 0, htmlNameIndex)
    return url[htmlNameIndex1 + 1:htmlNameIndex]

def GetHtmlName(url):
    # Last path component, used as the local file name.
    htmlNameIndex = url.rfind("/")
    urlLen = len(url)
    if htmlNameIndex + 1 == urlLen:
        # url ends with "/": take the component before the trailing slash.
        htmlNameIndex = url.rfind("/", 0, htmlNameIndex)
        return url[htmlNameIndex + 1:urlLen - 1]
    return url[htmlNameIndex + 1:]
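# Quick examples (checked by hand against the code above):
#   GetUserName("http://www.cnblogs.com/GnagWang/default.html?page=19") -> "GnagWang"
#   GetHtmlName("http://www.cnblogs.com/GnagWang/archive/2010/04/02/1702721.html") -> "1702721.html"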
# url must look like http://www.cnblogs.com/GnagWang/default.html?page=19, and that
# page must contain a link to the last list page. For example, if GnagWang has 20
# pages in total, a url near the end like the one above is recommended.
def Start(url, output):
    print "Backup started"
    lists = getArticleList(url)
    username = GetUserName(url)
    output_username = output + "/" + username
    output_username = output_username.replace("\\", "/")
    if not os.path.exists(output_username.decode("utf-8")):
        os.mkdir(output_username.decode("utf-8"))
    totalNum = len(lists)
    print "Number of articles: %d" % totalNum
    # Generate the index page: a frameset with the navigation on the left
    # and the articles on the right.
    doctype = '<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd">\n'
    charset = '<meta http-equiv="Content-Type" content="text/html;charset=utf-8" />'
    indexHtml = output_username + ".htm"
    f = open(indexHtml.decode("utf-8"), "w")
    print >> f, doctype
    print >> f, '<html>'
    print >> f, '<head>'
    print >> f, charset
    print >> f, '</head>'
    print >> f, '<frameset cols="20%,*">'
    navigationHtmlName = username + '-navigation.htm'
    print >> f, '<frame src="' + navigationHtmlName + '" />'
    firstHtmlName = GetHtmlName(lists[0][0])
    print >> f, '<frame src="' + username + '/' + firstHtmlName + '.htm" name="showframe">'
    print >> f, '</frameset>'
    print >> f, '</html>'
    f.close()
    # Generate the navigation page listing every article.
    navigationHtml = output + "/" + navigationHtmlName
    f = codecs.open(navigationHtml.decode("utf-8"), "w", "utf-8-sig")
    print >> f, doctype
    print >> f, '<html>'
    print >> f, '<head>'
    print >> f, charset
    print >> f, '<style> body{font: 12px Verdana, Arial, Helvetica, sans-serif;}a{color: #808080;}</style>'
    print >> f, '</head>'
    print >> f, '<body>'
    count = 0
    for x in lists:
        count = count + 1
        articleIdHtml = username + "/" + GetHtmlName(x[0]) + ".htm"
        print >> f, '<a href="' + articleIdHtml + '" target="showframe">' + str(count) + '.' + x[1].decode("utf-8") + '</a><br /><br />'
    print >> f, '</body>'
    print >> f, '</html>'
    f.close()
    print "Downloading articles"
    currentNum = 0
    global gTestTime
    for x in lists:
        count = gTestTime
        currentNum = currentNum + 1
        while True:
            if count < 0:
                break
            count = count - 1
            try:
                time.sleep(1)  # requesting too fast makes the server return 503
                # This print occasionally fails with an "output is not utf-8" error.
                print "{0}:{1}.".format(totalNum, currentNum) + x[1]
                print x[0]
                print "\n"
                Download(x[0], output_username)
                break
            except Exception, e:
                pass
if __name__ == '__main__':
    url = "http://www.cnblogs.com/yaoyansi/default.html?page=4"
    # output = "C:/Users/apple/Desktop/新建文件夹"
    output = "/tmp/my_tmp/cnblogs"
    Start(url, output)
Reference:
[1] http://blog.csdn.net/llrraa2010/article/details/35540845