Lately I've been learning Python like a man possessed, so while the enthusiasm lasts I'm continuing with a beginner-level taste of web crawling. I wrote a crawler in Java before; this is still the most primitive kind of HTML fetching, with no HTML parser, no regex, and so on... and since everything runs in one sequential loop, efficiency is surely terrible (see the concurrency sketch after the first script below).

import urllib.request as urllib2
import random

ua_list = [
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.6; rv2.0.1) Gecko/20100101 Firefox/4.0.1",
    "Mozilla/5.0 (Windows NT 6.1; rv2.0.1) Gecko/20100101 Firefox/4.0.1",
    "Opera/9.80 (Macintosh; Intel Mac OS X 10.6.8; U; en) Presto/2.8.131 Version/11.11",
    "Opera/9.80 (Windows NT 6.1; U; en) Presto/2.8.131 Version/11.11",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_0) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.56 Safari/535.11"
]
ua_agent = random.choice(ua_list)
# ua_agent_dict = {'User-Agent': ua_agent}
# print(ua_agent_dict)
# request = urllib2.Request(url=url)
# request.add_header(**ua_agent_dict)  # add_header() takes (key, val); **-unpacking this dict would raise TypeError

def checkPageExists(url, ua_agent):
    """Return True if the URL answers with HTTP 200."""
    request = urllib2.Request(url=url)
    request.add_header('User-Agent', ua_agent)  # the header name is 'User-Agent', not 'User_Agent'
    try:
        code = urllib2.urlopen(request).code
    except IOError:
        return False
    return code == 200

checkPageExists('https://www.cnblogs.com/Frank99/p/91111024.html', ua_agent=ua_agent)  # quick smoke test

url_prefix = 'https://www.cnblogs.com/Frank99/p/'
url_subfix = '.html'
# https://www.cnblogs.com/Frank99/p/

def getHtml(url, ua_agent):
    request = urllib2.Request(url=url)
    request.add_header('User-Agent', ua_agent)
    print('Reading data from page {} ......'.format(url))
    response = urllib2.urlopen(request)
    print('Finished reading page {} ......'.format(url))
    return response.read()

def write_html2file(html, file_name):
    with open(file_name, 'w', encoding='utf-8') as f:
        print('Saving file {} ......'.format(file_name))
        f.write(html.decode())
        print('Finished saving file {} ......'.format(file_name))

if __name__ == '__main__':
    list(map(lambda i: write_html2file(getHtml(url_prefix + str(i) + url_subfix, ua_agent=ua_agent),
                                       str(i) + url_subfix),
             [i for i in range(9111123, 9111125)
              if checkPageExists(url_prefix + str(i) + url_subfix, ua_agent=ua_agent)]))
    # The explicit-loop equivalent:
    # for i in range(9111123, 9111125):
    #     url = url_prefix + str(i) + url_subfix
    #     file_name = str(i) + url_subfix
    #     if checkPageExists(url, ua_agent=ua_agent):
    #         html = getHtml(url, ua_agent=ua_agent)
    #         write_html2file(html, file_name)
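The loop above is strictly sequential, so each blocking urlopen stalls everything. A minimal sketch of running the same downloads concurrently with the standard library's concurrent.futures, reusing checkPageExists, getHtml, and write_html2file from the script above (the fetch_one helper and the pool size of 5 are my own choices, not from the original):

from concurrent.futures import ThreadPoolExecutor

def fetch_one(i):
    # Build the URL, skip missing pages, and save the rest.
    url = url_prefix + str(i) + url_subfix
    if checkPageExists(url, ua_agent=ua_agent):
        write_html2file(getHtml(url, ua_agent=ua_agent), str(i) + url_subfix)

with ThreadPoolExecutor(max_workers=5) as pool:
    # map() submits all page numbers; up to 5 downloads run at once.
    pool.map(fetch_one, range(9111123, 9111125))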
The second version reuses the same ua_list, ua_agent, checkPageExists, getHtml, and write_html2file definitions verbatim; only the entry point changes, taking the URL prefix and suffix from user input:

if __name__ == '__main__':
    # url_prefix = 'https://www.cnblogs.com/Frank99/p/'
    # url_subfix = '.html'
    url_prefix = input('Enter the URL prefix of the resources to crawl... ')
    url_subfix = input('Enter the URL suffix of the resources to crawl... ')
    list(map(lambda i: write_html2file(getHtml(url_prefix + str(i) + url_subfix, ua_agent=ua_agent),
                                       str(i) + url_subfix),
             (i for i in range(5400017, 9111125)
              if checkPageExists(url_prefix + str(i) + url_subfix, ua_agent=ua_agent))))
    # The explicit-loop equivalent, over a smaller range:
    for i in range(9111123, 9111125):
        url = url_prefix + str(i) + url_subfix
        file_name = str(i) + url_subfix
        if checkPageExists(url, ua_agent=ua_agent):
            html = getHtml(url, ua_agent=ua_agent)
            write_html2file(html, file_name)
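Since the intro admits to skipping an HTML parser entirely, here is a minimal sketch of what a first parsing step could look like using only the standard library's html.parser, pulling the page title out of one saved file (the file name 9111123.html is just an example from the range above):

from html.parser import HTMLParser

class TitleParser(HTMLParser):
    """Collect the text inside the first <title> tag."""
    def __init__(self):
        super().__init__()
        self.in_title = False
        self.title = ''
    def handle_starttag(self, tag, attrs):
        if tag == 'title':
            self.in_title = True
    def handle_endtag(self, tag):
        if tag == 'title':
            self.in_title = False
    def handle_data(self, data):
        if self.in_title:
            self.title += data

parser = TitleParser()
with open('9111123.html', encoding='utf-8') as f:
    parser.feed(f.read())
print(parser.title)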

刚虾米…… ("say what?!" in Hokkien, fitting the glove-puppetry tieba that's up next)

# https://tieba.baidu.com/f?kw=%E5%B8%83%E8%A2%8B%E6%88%8F&ie=utf-8&pn=100
# https://tieba.baidu.com/f?kw=%E5%B8%83%E8%A2%8B%E6%88%8F&ie=utf-8&pn=10
import urllib.request as urllib2
from urllib import parse
import random

class TieBa(object):
    def __init__(self, **kw):
        for key in kw:
            if key == 'name':
                self.__name = kw[key]
            elif key == 'start':
                self.__start = kw[key]
            elif key == 'end':
                self.__end = kw[key]
            # elif key == 'url':
            #     self.__url = kw[key]

    def set_name(self, name):
        self.__name = name

    def get_name(self):
        return self.__name

    def set_start(self, start):
        self.__start = start

    def get_start(self):
        return self.__start

    def set_end(self, end):
        self.__end = end

    def get_end(self):
        return self.__end

    def spider_html(self):
        '''Crawl the tieba's listing pages.'''
        name = self.__name
        start = int(self.__start)
        end = int(self.__end)
        words = {'kw': name}
        name = parse.urlencode(words)
        url_prefix = r'https://tieba.baidu.com/f?'
        url_suffix = r'&ie=utf-8&pn='
        base_url = url_prefix + name + url_suffix
        for page in range(start, end):
            url = base_url + str(page)  # build from the base each time; the original kept appending to the same url
            print(url)
            html = self.__get_html(page, url)
            file_name = '{}-{}.html'.format(words['kw'], page)
            self.__write2file(file_name, html)

    def __get_html(self, page, url):
        ua_list = ["Mozilla/5.0 (Macintosh; Intel Mac OS X 10.6; rv2.0.1) Gecko/20100101 Firefox/4.0.1",
                   "Mozilla/5.0 (Windows NT 6.1; rv2.0.1) Gecko/20100101 Firefox/4.0.1",
                   "Opera/9.80 (Macintosh; Intel Mac OS X 10.6.8; U; en) Presto/2.8.131 Version/11.11",
                   "Opera/9.80 (Windows NT 6.1; U; en) Presto/2.8.131 Version/11.11",
                   "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_0) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.56 Safari/535.11"]
        request = urllib2.Request(url)
        request.add_header('User-Agent', random.choice(ua_list))
        response = urllib2.urlopen(request)
        print('Crawling page {} ...'.format(page))
        html = response.read()
        print('Finished crawling page {} ...'.format(page))
        return html

    def __write2file(self, file_name, html):
        print('Saving html to file...')
        with open(file_name, 'w', encoding='utf-8') as f:
            f.write(html.decode())
        print('Saved html to file...')

if __name__ == '__main__':
    tb = TieBa()
    tb.set_name(input('Enter the tieba name: '))
    tb.set_start(input('Enter the start page: '))
    tb.set_end(input('Enter the end page: '))
    tb.spider_html()
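The %E5%B8%83%E8%A2%8B%E6%88%8F in the sample URLs above is just the percent-encoded tieba name. A quick sketch of the round trip with urllib.parse:

from urllib import parse

# urlencode() builds a query string from a dict; unquote() reverses the percent-encoding.
print(parse.urlencode({'kw': '布袋戏'}))               # kw=%E5%B8%83%E8%A2%8B%E6%88%8F
print(parse.unquote('%E5%B8%83%E8%A2%8B%E6%88%8F'))   # 布袋戏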

Condition is used to synchronize threads; under the hood it is still built on a Lock or RLock (an RLock can be re-acquired multiple times by the same thread). A small RLock sketch follows the dialogue example below.

from threading import (Thread, Condition)

class XiaoAI(Thread):
    def __init__(self, cond, name='小爱'):
        super().__init__(name=name)
        self.cond = cond

    def run(self):
        with self.cond:
            self.cond.wait()
            print('{name}: Here'.format(name=self.name))
            self.cond.notify()
            self.cond.wait()
            print('{name}: Sure!'.format(name=self.name))
            self.cond.notify()

class TianMao(Thread):
    def __init__(self, cond, name='天猫'):
        super().__init__(name=name)
        self.cond = cond

    def run(self):
        with self.cond:  # the original said 'with cond:', which only worked by accident via the global
            print('{name}: Hey XiaoAI'.format(name=self.name))
            self.cond.notify()
            self.cond.wait()
            print("{name}: Let's trade lines of classical poetry.".format(name=self.name))
            self.cond.notify()
            self.cond.wait()

if __name__ == '__main__':
    cond = Condition()
    xiao = XiaoAI(cond)
    tian = TianMao(cond)
    xiao.start()  # the start order really matters here: XiaoAI must be waiting before TianMao notifies
    tian.start()
    xiao.join()
    tian.join()
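To make the Lock vs RLock point above concrete, a minimal sketch: re-acquiring a plain Lock in the same thread deadlocks, while an RLock counts its acquisitions and just needs a matching number of releases:

from threading import RLock

rlock = RLock()
with rlock:
    # The same thread may re-enter an RLock; a plain Lock would block here forever.
    with rlock:
        print('re-entered the RLock in the same thread')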
The tieba spider again, this time with a URL-producer thread and a Semaphore capping how many spider threads run at once:

from threading import (Thread, Semaphore)
from urllib.parse import urlencode
import requests
import chardet
import logging
from os import path
import random
import re

logging.basicConfig(level=logging.DEBUG)

# https://tieba.baidu.com/f?kw=%E5%B8%83%E8%A2%8B%E6%88%8F&ie=utf-8&pn=100
class TieBaSpider(Thread):
    def __init__(self, url, sem, name='TieBaSpider'):
        super(TieBaSpider, self).__init__(name=name)
        self.url = url
        self.sem = sem

    def _save(self, text):
        parent_dir = r'D:\tieba'
        # Strip characters that are illegal in Windows file names before joining.
        file_name = path.join(parent_dir, path.split(re.sub(r'[%=&?]', '', self.url))[1]) + '.html'
        with open(file_name, 'w', encoding='utf-8') as fw:
            fw.write(text)
            fw.flush()
        return 1

    def run(self):
        # ua_list = [...]  # same five UA strings as above
        # header = {'User-Agent': random.choice(ua_list)}
        response = requests.get(self.url)  # requests.get(self.url, headers=header)
        content = response.content
        logging.info(response.encoding)
        # result = chardet.detect(content)
        # logging.info(result)
        # code = result.get('encoding', 'utf-8')
        self._save(content.decode(response.encoding))
        self.sem.release()  # free a slot so the producer can start the next spider

class UrlProducer(Thread):
    def __init__(self, tb_name, sem, pages_once=3, start_index=1, end_index=9):  # (end_index - start_index) % pages_once == 0
        super(UrlProducer, self).__init__(name=str(tb_name))
        self.tb_name = urlencode(tb_name)
        self.sem = sem
        logging.info(self.tb_name)
        self.pages_once = pages_once
        self.start_index = start_index
        self.end_index = end_index

    def run(self):
        for page_idx in range(self.start_index, self.end_index + 1):
            self.sem.acquire()  # blocks once 3 spiders are in flight
            url_prefix = r'https://tieba.baidu.com/f?'
            url_suffix = r'&fr=ala0&tpl='
            self.url = url_prefix + self.tb_name + url_suffix + str(page_idx)
            tb_spider = TieBaSpider(self.url, self.sem)
            tb_spider.start()

if __name__ == '__main__':
    kw_dict = dict(kw=r'国家地理')
    sem = Semaphore(3)  # allow at most 3 concurrent spider threads
    url_producer = UrlProducer(kw_dict, sem=sem)
    url_producer.start()
    url_producer.join()
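A design note on the acquire/release split above: the producer acquires a slot before starting each spider and the spider releases it when done, so the semaphore's count tracks downloads in flight. A stray extra release would silently raise the limit; threading.BoundedSemaphore turns that bug into an exception. A toy sketch:

from threading import BoundedSemaphore

sem = BoundedSemaphore(3)
sem.acquire()
sem.release()
# sem.release()  # releasing beyond the initial value of 3 would raise ValueError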

Free IP proxy pools

站大爷 (Zhandaye) is one such source; a sketch of actually wiring a proxy into urllib follows.
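A minimal sketch using urllib's ProxyHandler; the proxy address 1.2.3.4:8080 is a placeholder, not a real endpoint from any pool:

import urllib.request

# Route both http and https traffic through the (hypothetical) proxy.
proxy = urllib.request.ProxyHandler({'http': 'http://1.2.3.4:8080',
                                     'https': 'http://1.2.3.4:8080'})
opener = urllib.request.build_opener(proxy)
response = opener.open('https://www.cnblogs.com/')
print(response.code)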
