#coding:utf-8
import re, os, shutil, sys
import urllib2, socket, cookielib
from threading import Thread, stack_size, Lock
from Queue import Queue, Empty
import time
import zlib
from gzip import GzipFile
from StringIO import StringIO

class ContentEncodingProcessor(urllib2.BaseHandler):
    """A handler that adds gzip/deflate support to urllib2 requests."""

    # add the Accept-Encoding header to outgoing requests
    def http_request(self, req):
        req.add_header("Accept-Encoding", "gzip, deflate")
        return req

    # transparently decode compressed responses
    def http_response(self, req, resp):
        old_resp = resp
        # gzip
        if resp.headers.get("content-encoding") == "gzip":
            gz = GzipFile(fileobj=StringIO(resp.read()), mode="r")
            resp = urllib2.addinfourl(gz, old_resp.headers, old_resp.url, old_resp.code)
            resp.msg = old_resp.msg
        # deflate
        if resp.headers.get("content-encoding") == "deflate":
            gz = StringIO(deflate(resp.read()))
            resp = urllib2.addinfourl(gz, old_resp.headers, old_resp.url, old_resp.code)
            resp.msg = old_resp.msg
        return resp

# deflate support: servers that say "deflate" may send either a raw deflate
# stream or a zlib-wrapped one, so try the raw form first and fall back
def deflate(data):
    try:
        return zlib.decompress(data, -zlib.MAX_WBITS)  # raw deflate stream
    except zlib.error:
        return zlib.decompress(data)                   # zlib-wrapped stream

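# Minimal usage sketch: the handler can also be installed into any opener on
# its own, outside of Fetcher (the URL below is a placeholder):
#
#   opener = urllib2.build_opener(ContentEncodingProcessor())
#   html = opener.open('http://example.com').read()  # decompressed transparently
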
class Fetcher:
    '''
    html Fetcher

    basic usage
    -----------
    from fetcher import Fetcher
    f = Fetcher()
    f.get(url)

    post
    ----
    req = urllib2.Request(...)
    f.post(req)

    multi-thread
    ------------
    f = Fetcher(threads=10)
    for url in urls:
        f.push(url)
    while f.taskleft():
        url,html = f.pop()
        deal_with(url,html)
    '''
    def __init__(self, timeout=10, threads=None, stacksize=32768*16, loginfunc=None):
        #proxy_support = urllib2.ProxyHandler({'http':'http://localhost:3128'})
        cookie_support = urllib2.HTTPCookieProcessor(cookielib.CookieJar())
        encoding_support = ContentEncodingProcessor()
        #self.opener = urllib2.build_opener(cookie_support,encoding_support,proxy_support,urllib2.HTTPHandler)
        self.opener = urllib2.build_opener(cookie_support, encoding_support, urllib2.HTTPHandler)
        socket.setdefaulttimeout(timeout)
        self.q_req = Queue()
        self.q_ans = Queue()
        self.lock = Lock()
        self.running = 0
        self.threads = threads  # keep it even when None; push() checks it
        if loginfunc:
            self.opener = loginfunc(self.opener)
        if threads:
            stack_size(stacksize)
            for i in range(threads):
                t = Thread(target=self.threadget)
                t.setDaemon(True)
                t.start()

    def __del__(self):
        time.sleep(0.5)
        self.q_req.join()
        self.q_ans.join()

    def taskleft(self):
        # requests still queued + answers not yet popped + requests in flight
        return self.q_req.qsize() + self.q_ans.qsize() + self.running

    def push(self, req, repeat=3):
        if not self.threads:
            print 'no thread, return get instead'
            return self.get(req, repeat)
        self.q_req.put(req)

    def pop(self):
        try:
            data = self.q_ans.get(block=True, timeout=10)
            self.q_ans.task_done()
        except Empty:
            data = ('', '')
        return data

    def threadget(self):
        while True:
            req = self.q_req.get()
            with self.lock:
                self.running += 1
            ans = self.get(req)
            print 'got', req
            self.q_ans.put((req, ans))
            try:
                self.q_req.task_done()
            except ValueError:
                pass
            with self.lock:
                self.running -= 1
            time.sleep(0.1)  # don't hammer the server

    def proxyisworking(self):
        try:
            self.opener.open('http://www.hsbc.com').read(1024)
            return True
        except Exception as what:
            print what
            return False

    def get(self, req, repeat=3):
        '''
        http GET req, retrying up to `repeat` more times on failure;
        the html text is returned on success, '' on failure
        '''
        try:
            response = self.opener.open(req)
            data = response.read()
        except Exception as what:
            print what, req
            if repeat > 0:
                return self.get(req, repeat-1)
            else:
                print 'GET Failed', req
                return ''
        return data

    def post(self, req, repeat=3):
        '''
        http POST req, retrying up to `repeat` more times on failure;
        the response text (or True when the body is empty) is returned on
        success, False when the argument is not a urllib2.Request
        '''
        if not isinstance(req, urllib2.Request):
            print 'post method needs a urllib2.Request as argument'
            return False
        else:
            r = self.get(req, repeat)
            if r:
                return r
            else:
                return True

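# A sketch of a loginfunc hook (URL and field names are illustrative only):
# Fetcher hands its opener to loginfunc and uses whatever comes back, so a
# login step can seed the shared CookieJar before any pages are fetched:
#
#   def loginfunc(opener):
#       import urllib
#       postdata = urllib.urlencode({'user': 'me', 'passwd': 'secret'})
#       opener.open('http://example.com/login', postdata)  # cookies land in the jar
#       return opener
#
#   f = Fetcher(threads=10, loginfunc=loginfunc)
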
class SiteCopyer:
    def __init__(self, url):
        self.baseurl = url
        self.home = self.baseurl.split('/')[2]  # host part, used as target dir
        self.f = Fetcher(threads=10)
        self.create_dir()

    def create_dir(self):
        try:
            shutil.rmtree(self.home)
        except Exception as what:
            print what
        try:
            os.mkdir(self.home)
            os.mkdir(self.home+'/media')
            os.mkdir(self.home+'/media/js')
            os.mkdir(self.home+'/media/css')
            os.mkdir(self.home+'/media/image')
        except Exception as what:
            print what

    def full_link(self, link, baseurl=None):
        if not baseurl:
            baseurl = self.baseurl
        if '?' in link:
            link = link.rsplit('?', 1)[0]  # drop the query string
        if not link.startswith('http://'):
            if link.startswith('/'):
                link = '/'.join(baseurl.split('/', 3)[:3]) + link
            elif link.startswith('../'):
                while link.startswith('../'):
                    baseurl = baseurl.rsplit('/', 2)[0]
                    link = link[3:]
                link = baseurl + '/' + link
            else:
                link = baseurl.rsplit('/', 1)[0] + '/' + link
        return link
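
    # e.g. (assuming baseurl 'http://host/a/b/page.html'):
    #   full_link('/css/site.css')    -> 'http://host/css/site.css'
    #   full_link('../img/logo.png')  -> 'http://host/a/img/logo.png'
    #   full_link('img/logo.png')     -> 'http://host/a/b/img/logo.png'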

    def link_alias(self, link):
        link = self.full_link(link)
        name = link.rsplit('/', 1)[1]
        if '.css' in name:
            name = name[:name.find('.css')+4]
            alias = '/media/css/' + name
        elif '.js' in name:
            name = name[:name.find('.js')+3]
            alias = '/media/js/' + name
        else:
            alias = '/media/image/' + name
        return alias
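
    # e.g. link_alias('/static/site.css?v=3') -> '/media/css/site.css'
    #      link_alias('logo.png')             -> '/media/image/logo.png'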

    def strip_link(self, link):
        if link and (link[0] in ['"', "'"]):
            link = link[1:]
        while link and (link[-1] in ['"', "'"]):
            link = link[:-1]
        while link.endswith('/'):
            link = link[:-1]
        if link and (link[0] not in ['<', "'", '"']) and ('feed' not in link):
            return link
        else:
            return ''

    def copy(self):
        page = self.f.get(self.baseurl)
        links = re.compile(r'<link[^>]*href=(.*?)[ >]', re.I).findall(page)
        links.extend(re.compile(r'<script[^>]*src=(.*?)[ >]', re.I).findall(page))
        links.extend(re.compile(r'<img[^>]*src=(.*?)[ >]', re.I).findall(page))
        links = filter(None, map(self.strip_link, links))  # normalize, drop junk
        for link in set(links):
            page = page.replace(link, self.link_alias(link)[1:])
            self.f.push(self.full_link(link))
        open(self.home+'/index.html', 'w').write(page)
        while self.f.taskleft():
            url, page = self.f.pop()
            if url.endswith('.css'):
                # css files may pull in further resources via url(...)
                links = re.compile(r'url\([\'"]?(.*?)[\'"]?\)').findall(page)
                links = filter(None, map(self.strip_link, links))
                for link in set(links):
                    self.f.push(self.full_link(link, url))
                    page = page.replace(link, self.link_alias(link)[1:].replace("media", ".."))
            print 'write to', self.home + self.link_alias(url)
            try:
                open(self.home + self.link_alias(url), 'w').write(page)
            except Exception as what:
                print what

if __name__ == "__main__":
    if len(sys.argv) == 2:
        url = sys.argv[1]
        SiteCopyer(url).copy()
    else:
        print "Usage: python " + sys.argv[0] + " url"
