[py web] sitecopy code
#coding:utf-8
import re,os,shutil,sys
import urllib2,socket,cookielib
from threading import Thread,stack_size,Lock
from Queue import Queue,Empty
import time
from gzip import GzipFile
from StringIO import StringIO

class ContentEncodingProcessor(urllib2.BaseHandler):
    """A handler to add gzip/deflate capabilities to urllib2 requests"""

    # add headers to requests
    def http_request(self, req):
        req.add_header("Accept-Encoding", "gzip, deflate")
        return req

    # decode
    def http_response(self, req, resp):
        old_resp = resp
        # gzip
        if resp.headers.get("content-encoding") == "gzip":
            gz = GzipFile(fileobj=StringIO(resp.read()), mode="r")
            # addinfourl wraps the decompressed stream so it still offers
            # info() and geturl() like the original response
            resp = urllib2.addinfourl(gz, old_resp.headers, old_resp.url, old_resp.code)
            resp.msg = old_resp.msg
        # deflate
        if resp.headers.get("content-encoding") == "deflate":
            gz = StringIO( deflate(resp.read()) )
            resp = urllib2.addinfourl(gz, old_resp.headers, old_resp.url, old_resp.code)
            resp.msg = old_resp.msg
        return resp

# deflate support
import zlib
def deflate(data):
    # zlib only provides the zlib compress format, not the raw deflate format;
    # so on top of all there's this workaround:
    try:
        return zlib.decompress(data, -zlib.MAX_WBITS)
    except zlib.error:
        return zlib.decompress(data)

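# A minimal sketch of using the handler on its own (kept commented out so the
# script's behaviour is unchanged; the URL is a placeholder):
#
#   opener = urllib2.build_opener(ContentEncodingProcessor())
#   html = opener.open('http://example.com').read()  # decompressed transparently
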
class Fetcher:
    '''
    html Fetcher

    basic usage
    -----------
    from fetcher import Fetcher
    f = Fetcher()
    f.get(url)

    post
    ----
    req = urllib2.Request(...)
    f.post(req)

    multi-thread
    ------------
    f = Fetcher(threads=10)
    for url in urls:
        f.push(url)
    while f.taskleft():
        url,html = f.pop()
        deal_with(url,html)
    '''
    def __init__(self,timeout=10,threads=None,stacksize=32768*16,loginfunc=None):
        #proxy_support = urllib2.ProxyHandler({'http':'http://localhost:3128'})
        cookie_support = urllib2.HTTPCookieProcessor(cookielib.CookieJar())
        encoding_support = ContentEncodingProcessor()
        #self.opener = urllib2.build_opener(cookie_support,encoding_support,proxy_support,urllib2.HTTPHandler)
        self.opener = urllib2.build_opener(cookie_support,encoding_support,urllib2.HTTPHandler)
        self.req = urllib2.Request('http://www.hsbc.com')
        socket.setdefaulttimeout(timeout)
        self.q_req = Queue()
        self.q_ans = Queue()
        self.lock = Lock()
        self.running = 0
        self.threads = threads  # set even when None so push() can test it
        if loginfunc:
            self.opener = loginfunc(self.opener)
        if threads:
            stack_size(stacksize)
            for i in range(threads):
                t = Thread(target=self.threadget)
                t.setDaemon(True)
                t.start()

    def __del__(self):
        time.sleep(0.5)
        self.q_req.join()
        self.q_ans.join()

    def taskleft(self):
        return self.q_req.qsize()+self.q_ans.qsize()+self.running

    def push(self,req,repeat=3):
        if not self.threads:
            print 'no thread, return get instead'
            return self.get(req,repeat)
        self.q_req.put(req)

    def pop(self):
        try:
            data = self.q_ans.get(block=True,timeout=10)
            self.q_ans.task_done()
        except Empty:
            data = ['','']
        return data

    def threadget(self):
        while True:
            req = self.q_req.get()
            with self.lock:
                self.running += 1
            ans = self.get(req)
            print 'got',req
            self.q_ans.put((req,ans))
            try:
                self.q_req.task_done()
            except ValueError:  # task_done() called more times than items queued
                pass
            with self.lock:
                self.running -= 1
            time.sleep(0.1) # don't spam

    def proxyisworking(self):
        try:
            self.opener.open('http://www.hsbc.com').read(1024)
            return True
        except Exception,what:
            print what
            return False

    def get(self,req,repeat=3):
        '''
        http GET req and repeat 3 times if failed
        html text is returned when succeeded
        '' is returned when failed
        '''
        try:
            response = self.opener.open(req)
            data = response.read()
        except Exception,what:
            print what,req
            if repeat>0:
                return self.get(req,repeat-1)
            else:
                print 'GET Failed',req
                return ''
        return data

    def post(self,req,repeat=3):
        '''
        http POST req and repeat 3 times if failed
        html text/True is returned when succeeded (True if the body is empty)
        False is returned when req is not a urllib2.Request
        '''
        if not isinstance(req,urllib2.Request):
            print 'post method needs a urllib2.Request as argument'
            return False
        else:
            r = self.get(req,repeat)
            if r:
                return r
            else:
                return True

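# A minimal multi-thread sketch of the Fetcher above (kept commented out;
# 'urls' is a hypothetical list of page addresses):
#
#   f = Fetcher(threads=10)
#   for url in urls:
#       f.push(url)
#   while f.taskleft():
#       url,html = f.pop()
#       print url,len(html)
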
class SiteCopyer:
    def __init__(self,url):
        self.baseurl = url
        self.home = self.baseurl.split('/')[2]  # host part, e.g. 'www.example.com'
        self.f = Fetcher(threads=10)
        self.create_dir()

    def create_dir(self):
        try:
            shutil.rmtree(self.home)
        except Exception,what:
            print what
        try:
            os.mkdir(self.home)
            os.mkdir(self.home+'/media')
            os.mkdir(self.home+'/media/js')
            os.mkdir(self.home+'/media/css')
            os.mkdir(self.home+'/media/image')
        except Exception,what:
            print what

    def full_link(self,link,baseurl=None):
        if not baseurl:
            baseurl = self.baseurl
        if '?' in link:
            link = link.rsplit('?',1)[0]
        if not link.startswith('http://'):
            if link.startswith('/'):
                link = '/'.join(baseurl.split('/',3)[:3]) + link
            elif link.startswith('../'):
                while link.startswith('../'):
                    baseurl = baseurl.rsplit('/',2)[0]
                    link = link[3:]
                link = baseurl+'/'+link
            else:
                link = baseurl.rsplit('/',1)[0]+'/'+link
        return link

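# For illustration, with a hypothetical baseurl 'http://example.com/a/b/page.html':
#   full_link('/x.css')   -> 'http://example.com/x.css'
#   full_link('../x.css') -> 'http://example.com/a/x.css'
#   full_link('x.css')    -> 'http://example.com/a/b/x.css'
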
    def link_alias(self,link):
        link = self.full_link(link)
        name = link.rsplit('/',1)[1]
        if '.css' in name:
            name = name[:name.find('.css')+4]
            alias = '/media/css/'+name
        elif '.js' in name:
            name = name[:name.find('.js')+3]
            alias = '/media/js/'+name
        else:
            alias = '/media/image/'+name
        return alias

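# For illustration (hypothetical links; full_link drops the '?v=3' query):
#   link_alias('http://example.com/static/site.css?v=3') -> '/media/css/site.css'
#   link_alias('http://example.com/js/app.js')           -> '/media/js/app.js'
#   link_alias('http://example.com/img/logo.png')        -> '/media/image/logo.png'
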
    def strip_link(self,link):
        if link and (link[0] in ['"',"'"]):
            link = link[1:]
        while link and (link[-1] in ['"',"'"]):
            link = link[:-1]
        while link.endswith('/'):
            link = link[:-1]
        if link and (link[0] not in ["<","'",'"']) and ('feed' not in link):
            return link
        else:
            return ''

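# For illustration (hypothetical attribute values as the regexes capture them):
#   strip_link('"/media/x.css"') -> '/media/x.css'  (surrounding quotes removed)
#   strip_link('"/rss/feed/"')   -> ''              ('feed' links are skipped)
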
    def copy(self):
        page = self.f.get(self.baseurl)
        links = re.compile(r'<link[^>]*href=(.*?)[ >]',re.I).findall(page)
        links.extend( re.compile(r'<script[^>]*src=(.*?)[ >]',re.I).findall(page) )
        links.extend( re.compile(r'<img[^>]*src=(.*?)[ >]',re.I).findall(page) )
        templinks = []
        for link in links:
            slink = self.strip_link(link)
            if slink:
                templinks.append(slink)
        links = templinks
        for link in set(links):
            page = page.replace(link,self.link_alias(link)[1:])
            self.f.push( self.full_link(link) )
        open(self.home+'/index.html','w').write(page)
        while self.f.taskleft():
            url,page = self.f.pop()
            if url.endswith('.css'):
                links = re.compile(r'url\([\'"]?(.*?)[\'"]?\)').findall(page)
                templinks = []
                for link in links:
                    slink = self.strip_link(link)
                    if slink:
                        templinks.append(slink)
                links = templinks
                for link in set(links):
                    self.f.push( self.full_link(link,url) )
                    page = page.replace(link,self.link_alias(link)[1:].replace("media",".."))
            print 'write to',self.home+self.link_alias(url)
            try:
                open(self.home+self.link_alias(url),'w').write(page)
            except Exception,what:
                print what

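# For illustration of the rewriting: in index.html a captured link such as
# 'http://example.com/static/site.css' becomes 'media/css/site.css', while
# inside a downloaded CSS file the same alias becomes '../css/site.css'
# (the replace("media","..") above), since CSS files are saved in media/css/.
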
if __name__ == "__main__":
    if len(sys.argv) == 2:
        url = sys.argv[1]
        SiteCopyer(url).copy()
    else:
        print "Usage: python "+sys.argv[0]+" url"
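
To mirror a site, pass a single URL on the command line (a sketch, assuming the listing is saved as sitecopy.py; the target URL is a placeholder):

    python sitecopy.py http://www.example.com

This writes www.example.com/index.html plus the rewritten assets under www.example.com/media/.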