#coding:utf-8
import re,os,shutil,sys
import urllib2,socket,cookielib
from threading import Thread,stack_size,Lock
from Queue import Queue
import time
from gzip import GzipFile
from StringIO import StringIO

class ContentEncodingProcessor(urllib2.BaseHandler):
    """A handler to add gzip capabilities to urllib2 requests"""

    # add headers to requests
    def http_request(self, req):
        req.add_header("Accept-Encoding", "gzip, deflate")
        return req

    # decode
    def http_response(self, req, resp):
        old_resp = resp
        # gzip
        if resp.headers.get("content-encoding") == "gzip":
            gz = GzipFile(
                fileobj=StringIO(resp.read()),
                mode="r"
            )
            resp = urllib2.addinfourl(gz, old_resp.headers, old_resp.url, old_resp.code)
            resp.msg = old_resp.msg
        # deflate
        if resp.headers.get("content-encoding") == "deflate":
            gz = StringIO(deflate(resp.read()))
            # addinfourl wraps the file object and adds info() and geturl() methods
            resp = urllib2.addinfourl(gz, old_resp.headers, old_resp.url, old_resp.code)
            resp.msg = old_resp.msg
        return resp

# deflate support
import zlib
def deflate(data):   # zlib only provides the zlib compress format, not the deflate format;
    try:             # so on top of all there's this workaround:
        return zlib.decompress(data, -zlib.MAX_WBITS)
    except zlib.error:
        return zlib.decompress(data)
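
# Minimal usage sketch (illustrative, not part of the original script): the
# handler can be installed on any opener to get transparent decompression.
#   opener = urllib2.build_opener(ContentEncodingProcessor())
#   html = opener.open('http://example.com').read()   # already decompressed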

class Fetcher:
    '''
    html Fetcher

    basic usage
    -----------
    from fetcher import Fetcher
    f = Fetcher()
    f.get(url)

    post
    ----
    req = urllib2.Request(...)
    f.post(req)

    multi-thread
    ------------
    f = Fetcher(threads=10)
    for url in urls:
        f.push(url)
    while f.taskleft():
        url,html = f.pop()
        deal_with(url,html)
    '''
    def __init__(self, timeout=10, threads=None, stacksize=32768*16, loginfunc=None):
        #proxy_support = urllib2.ProxyHandler({'http':'http://localhost:3128'})
        cookie_support = urllib2.HTTPCookieProcessor(cookielib.CookieJar())
        encoding_support = ContentEncodingProcessor()
        #self.opener = urllib2.build_opener(cookie_support,encoding_support,proxy_support,urllib2.HTTPHandler)
        self.opener = urllib2.build_opener(cookie_support, encoding_support, urllib2.HTTPHandler)
        self.req = urllib2.Request('http://www.hsbc.com')
        socket.setdefaulttimeout(timeout)
        self.q_req = Queue()
        self.q_ans = Queue()
        self.lock = Lock()
        self.running = 0
        self.threads = threads  # always set, so push() can test it in single-thread mode too
        if loginfunc:
            self.opener = loginfunc(self.opener)
        if threads:
            stack_size(stacksize)
            for i in range(threads):
                t = Thread(target=self.threadget)
                t.setDaemon(True)
                t.start()

    def __del__(self):
        time.sleep(0.5)
        self.q_req.join()
        self.q_ans.join()

    def taskleft(self):
        return self.q_req.qsize() + self.q_ans.qsize() + self.running

    def push(self, req, repeat=3):
        if not self.threads:
            print 'no thread, using get instead'
            return self.get(req, repeat)  # fall back to a blocking fetch
        self.q_req.put(req)

    def pop(self):
        try:
            data = self.q_ans.get(block=True, timeout=10)
            self.q_ans.task_done()
        except:
            data = ['', '']
        return data

    def threadget(self):
        while True:
            req = self.q_req.get()
            with self.lock:
                self.running += 1
            ans = self.get(req)
            print 'got', req
            self.q_ans.put((req, ans))
            try:
                self.q_req.task_done()
            except:
                pass
            with self.lock:
                self.running -= 1
            time.sleep(0.1)  # don't spam the server

    def proxyisworking(self):
        try:
            self.opener.open('http://www.hsbc.com').read(1024)
            return True
        except Exception, what:
            print what
            return False

    def get(self, req, repeat=3):
        '''
        http GET req, retrying up to `repeat` times on failure;
        the html text is returned on success,
        '' is returned on failure
        '''
        try:
            response = self.opener.open(req)
            data = response.read()
        except Exception, what:
            print what, req
            if repeat > 0:
                return self.get(req, repeat-1)
            else:
                print 'GET Failed', req
                return ''
        return data

    def post(self, req, repeat=3):
        '''
        http POST req, retrying up to `repeat` times on failure;
        the response text is returned on success (True when the body is empty),
        False is returned when req is not a urllib2.Request
        '''
        if not isinstance(req, urllib2.Request):
            print 'post method needs a urllib2.Request as argument'
            return False
        else:
            r = self.get(req, repeat)
            if r:
                return r
            else:
                return True

class SiteCopyer:
    def __init__(self, url):
        self.baseurl = url
        self.home = self.baseurl.split('/')[2]  # use the host name as the output directory
        self.f = Fetcher(threads=10)
        self.create_dir()

    def create_dir(self):
        try:
            shutil.rmtree(self.home)  # start from a clean directory
        except Exception, what:
            print what
        try:
            os.mkdir(self.home)
            os.mkdir(self.home+'/media')
            os.mkdir(self.home+'/media/js')
            os.mkdir(self.home+'/media/css')
            os.mkdir(self.home+'/media/image')
        except Exception, what:
            print what

    def full_link(self, link, baseurl=None):
        if not baseurl:
            baseurl = self.baseurl
        if '?' in link:
            link = link.rsplit('?', 1)[0]  # drop the query string
        if not link.startswith('http://'):
            if link.startswith('/'):
                # site-absolute path: join it to the scheme+host part of baseurl
                link = '/'.join(baseurl.split('/', 3)[:3]) + link
            elif link.startswith('../'):
                # climb one directory per leading '../'
                while link.startswith('../'):
                    baseurl = baseurl.rsplit('/', 2)[0]
                    link = link[3:]
                link = baseurl + '/' + link
            else:
                # relative path: resolve against baseurl's directory
                link = baseurl.rsplit('/', 1)[0] + '/' + link
        return link
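
    # Worked examples (illustrative, assuming baseurl = 'http://example.com/blog/index.html'):
    #   full_link('/site.css')    -> 'http://example.com/site.css'
    #   full_link('../img/a.png') -> 'http://example.com/img/a.png'
    #   full_link('b.js?v=2')     -> 'http://example.com/blog/b.js'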

    def link_alias(self, link):
        link = self.full_link(link)
        name = link.rsplit('/', 1)[1]
        if '.css' in name:
            name = name[:name.find('.css')+4]  # cut anything after '.css'
            alias = '/media/css/' + name
        elif '.js' in name:
            name = name[:name.find('.js')+3]   # cut anything after '.js'
            alias = '/media/js/' + name
        else:
            alias = '/media/image/' + name
        return alias
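
    # Illustrative mapping (example URLs, not from the original script):
    #   link_alias('http://example.com/static/site.css') -> '/media/css/site.css'
    #   link_alias('http://example.com/js/app.js')       -> '/media/js/app.js'
    #   link_alias('http://example.com/logo.png')        -> '/media/image/logo.png'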

    def strip_link(self, link):
        # strip surrounding quotes and trailing slashes
        if link and (link[0] in ['"', "'"]):
            link = link[1:]
        while link and (link[-1] in ['"', "'"]):
            link = link[:-1]
        while link.endswith('/'):
            link = link[:-1]
        if link and (link[0] not in ["<", "'", '"']) and ('feed' not in link):
            return link
        else:
            return ''

    def copy(self):
        page = self.f.get(self.baseurl)
        # collect stylesheet, script and image references from the page
        links = re.compile(r'<link[^>]*href=(.*?)[ >]', re.I).findall(page)
        links.extend(re.compile(r'<script[^>]*src=(.*?)[ >]', re.I).findall(page))
        links.extend(re.compile(r'<img[^>]*src=(.*?)[ >]', re.I).findall(page))
        templinks = []
        for link in links:
            slink = self.strip_link(link)
            if slink:
                templinks.append(slink)
        links = templinks
        for link in set(links):
            page = page.replace(link, self.link_alias(link)[1:])
            self.f.push(self.full_link(link))
        open(self.home+'/index.html', 'w').write(page)
        while self.f.taskleft():
            url, page = self.f.pop()
            if url.endswith('.css'):
                # rewrite url(...) references inside stylesheets as well
                links = re.compile(r'url\([\'"]?(.*?)[\'"]?\)').findall(page)
                templinks = []
                for link in links:
                    slink = self.strip_link(link)
                    if slink:
                        templinks.append(slink)
                links = templinks
                for link in set(links):
                    self.f.push(self.full_link(link, url))
                    # css files live in media/css/, so their asset paths start with ../
                    page = page.replace(link, self.link_alias(link)[1:].replace("media", ".."))
            print 'write to', self.home + self.link_alias(url)
            try:
                # binary mode, since most of these assets are images
                open(self.home + self.link_alias(url), 'wb').write(page)
            except Exception, what:
                print what

if __name__ == "__main__":
    if len(sys.argv) == 2:
        url = sys.argv[1]
        SiteCopyer(url).copy()
    else:
        print "Usage: python " + sys.argv[0] + " url"
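
Example invocation (the file name and URL are illustrative): save the script as sitecopy.py and run

    python sitecopy.py http://www.example.com/index.html

The page is mirrored into a directory named after the host ('www.example.com' here), with its stylesheets, scripts and images downloaded in parallel and rewritten to point at the local media/ tree.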
