python爬虫前提技术

1、BeautifulSoup 解析html如何使用

转自：http://blog.csdn.net/u013372487/article/details/51734047

#!/usr/bin/python

# -*- coding: UTF-8 -*-

from bs4 import BeautifulSoup

import re

#待分析字符串

html_doc = """

<html>

<head>

    <title>The Dormouse's story</title>

</head>

<body>

<p class="title aq">

    <b>

        The Dormouse's story

    </b>

</p>

<p class="story">Once upon a time there were three little sisters; and their names were

    <a href="http://example.com/elsie" class="sister" id="link1">Elsie</a>,

    <a href="http://example.com/lacie" class="sister" id="link2">Lacie</a>

    and

    <a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>;

    and they lived at the bottom of a well.

</p>

<p class="story">...</p>

"""

# Build the BeautifulSoup tree from the HTML string. No from_encoding hint is
# given: on Python 3 html_doc is already decoded text, and bs4 ignores the
# hint (with a warning) for text input.
soup = BeautifulSoup(html_doc, 'html.parser')

# First <title> tag
print(soup.title)

# Tag name of the first <title> tag
print(soup.title.name)

# Text contained in the first <title> tag
print(soup.title.string)

# Tag name of the parent of the first <title> tag
print(soup.title.parent.name)

# First <p> tag
print(soup.p)

# class attribute of the first <p> tag
print(soup.p['class'])

# href attribute of the first <a> tag
print(soup.a['href'])

# Tag attributes can be added, removed or modified with the same syntax as
# dictionary items.

# Point the first <a> tag's href at http://www.baidu.com/
soup.a['href'] = 'http://www.baidu.com/'

# Add a name attribute to the first <a> tag
soup.a['name'] = u'百度'

# Remove the class attribute from the first <a> tag
del soup.a['class']

# All child nodes of the first <p> tag
print(soup.p.contents)

# First <a> tag
print(soup.a)

# Every <a> tag, as a list
print(soup.find_all('a'))

# First tag whose id attribute equals link3
print(soup.find(id="link3"))

# All text content of the document
print(soup.get_text())

# All attributes of the first <a> tag
print(soup.a.attrs)

for link in soup.find_all('a'):
    # href attribute of each link
    print(link.get('href'))

# Print each child node of the first <p> tag
for child in soup.p.children:
    print(child)

# Regex match: tags whose name contains the letter b
for tag in soup.find_all(re.compile("b")):
    print(tag.name)

2、cookie等使用方法以及函数爬虫

参照： https://cuiqingcai.com/968.html

3、header,代理,超时,认证,异常处理

参照：  http://blog.csdn.net/m_buddy/article/details/55193762

4、错误异常处理

1.URLError

# -*- coding: UTF-8 -*-

import urllib

import urllib

from urllib import request

import re

import requests

import urllib.parse

import urllib.request

from urllib.request import Request, urlopen

from urllib.error import URLError, HTTPError

if __name__ == "__main__":
    # A link to a page that does not exist
    url = "http://www.douyu.com/Jack_Cui.html"
    # Named req so it does not shadow the `request` module alias imported above
    req = urllib.request.Request(url)
    try:
        # The body is not needed; the context manager only guarantees that
        # the connection is closed again.
        with urllib.request.urlopen(req):
            pass
    except urllib.error.HTTPError as e:
        # The server answered, but with an error status code
        print(e.code)
    except urllib.error.URLError as e:
        # No HTTP response at all (e.g. DNS failure, refused connection)
        print(e.reason)

运行结果：

C:\Python34\python.exe G:/xiaoshuo2.py

403

Process finished with exit code 0

# -*- coding: UTF-8 -*-

import urllib

import urllib

from urllib import request

import re

import requests

import urllib.parse

import urllib.request

from urllib.request import Request, urlopen

from urllib.error import URLError, HTTPError

if __name__ == "__main__":
    # A link to a page that does not exist
    url = "http://www.douyu.com/Jack_Cui.html"
    # Named req so it does not shadow the `request` module alias imported above
    req = urllib.request.Request(url)
    try:
        # The context manager closes the connection once the body is read
        with urllib.request.urlopen(req) as response:
            html = response.read().decode('utf-8')
        print(html)
    except urllib.error.HTTPError as e:
        # The server answered, but with an error status code
        print(e.code)
    except urllib.error.URLError as e:
        # No HTTP response at all (e.g. DNS failure, refused connection)
        print(e.reason)

运行结果：

C:\Python34\python.exe G:/xiaoshuo2.py

403

Process finished with exit code 0

import urllib

import urllib

from urllib import request

import re

import requests

import urllib.parse

import urllib.request

from urllib.request import Request, urlopen

from urllib.error import URLError, HTTPError

# Page requested by the error-handling demo
url = "http://www.douyu.com/Jack_Cui.html"
rep = urllib.request.Request(url)

try:
    data = urllib.request.urlopen(rep)
except urllib.error.URLError as err:
    # HTTPError is a subclass of URLError, so it lands here as well. Each
    # attribute is checked independently: an error that carries both `code`
    # and `reason` prints both labels.
    for attr, label in (('code', "HTTPError"), ('reason', "URLError")):
        if hasattr(err, attr):
            print(label)
            print(getattr(err, attr))

输出结果：

C:\Python34\python.exe G:/xiaoshuo2.py

HTTPError

403

URLError

Forbidden

Process finished with exit code 0

5、python打印防止换行和换行

https://www.cnblogs.com/kfx2007/p/5970784.html

实例：

# coding=utf-8

import re

language = '''''

<table class="infobox bordered vcard" style="width: 21em; font-size: 89%; text-align: left;" cellpadding="3">

<caption style="text-align: center; font-size: larger;" class="fn"><b>jenkins</b></caption>

<tr>

<th>性別：</th>

<td>男</td>d

</tr>

<tr>

<th>異名：</th>

<td><span class="nickname">(字) 翔宇</span></td>

</tr>

<tr>

<th>爱好：</th>

<td><span class="org"><a href="../articles/%E4%B8%AD9A.html" title="篮球">篮球</a></span></td>

</tr>

<tr>

<th>籍貫：</th>

<td><a href="../articles/%E6%B5%9981.html" title="广西省">广西省</a><a href="../articles/%E7%BB%8D82.html" title="桂林市">桂林市</a></td>

</tr>

</table>

'''

# Patterns are compiled once up front; all of them let "." span newlines.
_FLAGS = re.S | re.M
row_re = re.compile(r'<tr>(.*?)</tr>', _FLAGS)                 # one table row
th_re = re.compile(r'<th>(.*?)</th>', _FLAGS)                  # first column (th)
th_link_re = re.compile(r'<a href=.*?>(.*?)</a>', _FLAGS)      # link text inside a th
td_re = re.compile(r'<td>(.*?)</td>', _FLAGS)                  # second column (td)
td_link_re = re.compile(r'<a .*?>(.*?)</a>', _FLAGS)           # link text inside a td
td_span_re = re.compile(r'<span .*?>(.*?)</span>', _FLAGS)     # span text inside a td

for row in row_re.findall(language):
    # Header cell(s): print the link text when the cell contains a hyperlink,
    # otherwise the raw cell content. end=' ' keeps the output on one line.
    for header in th_re.findall(row):
        if "href" in header:
            print(th_link_re.findall(header)[0], end=' ')
        else:
            print(header, end=' ')

    # Value cell(s): unwrap hyperlinks first, then <span> wrappers, else
    # print the cell content as it is.
    for cell in td_re.findall(row):
        if "href" in cell:
            values = td_link_re.findall(cell)
        elif "span" in cell:
            values = td_span_re.findall(cell)
        else:
            values = [cell]
        for value in values:
            print(value, end=' ')
        print(' ')  # finish the line after each value cell

C:\Python34\python.exe G:/xiaoshuo2.py

性別： 男

異名： (字) 翔宇

爱好： 篮球

籍貫： 广西省 桂林市  

6、python打印如何不换行

https://www.cnblogs.com/hwd9654/p/5707920.html

# -*- coding:utf-8 -*-

import urllib

import re

#import requests

import urllib.parse

import urllib.request

from urllib.request import Request, urlopen

from urllib.error import URLError, HTTPError

class Tool:
    """Convert an HTML fragment into plain text by regex substitution."""

    # <img> tags and runs of seven spaces are dropped. (The original pattern
    # ended in an empty alternative, which additionally matched the empty
    # string at every position.)
    removeImg = re.compile(r'<img.*?>| {7}')
    # Opening and closing <a> tags are dropped; the link text stays.
    removeAddr = re.compile(r'<a.*?>|</a>')
    # Tags that are turned into a line break.
    replaceLine = re.compile(r'<tr>|<div>|</div>|</p>')
    # <td> is turned into a tab.
    replaceTD = re.compile(r'<td>')
    # Opening <p ...> tags are turned into a line break.
    replacePara = re.compile(r'<p.*?>')
    # A double <br> collapses into one line break, same as a single <br>.
    replaceBR = re.compile(r'<br><br>|<br>')
    # Whatever tag is still left is dropped.
    removeExtraTag = re.compile(r'<.*?>')

    def replace(self, x):
        """Return *x* with its HTML markup removed.

        The substitutions run in a fixed order (specific tags first, the
        catch-all tag remover last); leading and trailing whitespace of the
        result is stripped.
        """
        steps = (
            (self.removeImg, ""),
            (self.removeAddr, ""),
            (self.replaceLine, "\n"),
            (self.replaceTD, "\t"),
            (self.replacePara, "\n"),
            (self.replaceBR, "\n"),
            (self.removeExtraTag, ""),
        )
        for pattern, replacement in steps:
            x = pattern.sub(replacement, x)
        return x.strip()

class BDTB:
    """Download pages of one Baidu Tieba thread and print the posts as text."""

    def __init__(self, baseUrl, seeLZ):
        """Remember the thread URL and build the query-string prefix.

        baseUrl: URL of the thread, without query string.
        seeLZ:   value for the ``see_lz`` query parameter (presumably 1 shows
                 only the thread starter's posts -- TODO confirm).
        """
        self.baseURL = baseUrl
        self.seeLZ = '?see_lz=' + str(seeLZ)
        self.tool = Tool()

    def getPage(self, pageNum):
        """Return the decoded HTML of page *pageNum*, or None on a URL error."""
        url = self.baseURL + self.seeLZ + '&pn=' + str(pageNum)
        try:
            # The context manager closes the connection once the body is read
            with urllib.request.urlopen(urllib.request.Request(url)) as response:
                return response.read().decode("utf8")
        except urllib.error.URLError as e:
            if hasattr(e, "reason"):
                print("连接百度贴吧失败,错误原因", e.reason)
            return None

    def _searchFirstPage(self, pattern):
        """Return the stripped first group of *pattern* on page 1, or None.

        None is returned both when the page could not be downloaded and when
        the pattern does not match.
        """
        page = self.getPage(1)
        if page is None:
            return None
        result = pattern.search(page)
        if result is None:
            return None
        return result.group(1).strip()

    def getTitle(self):
        """Return the thread title taken from page 1, or None if unavailable."""
        pattern = re.compile('<h3 class="core_title_txt.*?>(.*?)</h3>', re.S)
        return self._searchFirstPage(pattern)

    def getPageNum(self):
        """Return the page count shown on page 1 (as a string), or None."""
        pattern = re.compile('<li class="l_reply_num.*?</span>.*?<span.*?>(.*?)</span>', re.S)
        return self._searchFirstPage(pattern)

    def getContent(self, page):
        """Print every post found in *page*, numbered from 1.

        *page* is the HTML returned by getPage(); when it is None (download
        failed) nothing is printed.
        """
        if page is None:
            return
        pattern = re.compile('<div id="post_content_.*?>(.*?)</div>', re.S)
        for floor, item in enumerate(pattern.findall(page), 1):
            print (floor,"楼-------------------------------------------------------------------------------------\n",end='')
            print(self.tool.replace(item))

# Fetch page 1 of the thread with see_lz=1 and print its posts
bdtb = BDTB('http://tieba.baidu.com/p/3138733512', 1)
first_page = bdtb.getPage(1)
bdtb.getContent(first_page)

打印结果：

C:\Python34\python.exe C:/Users/Administrator/ceshi.py

1 楼-------------------------------------------------------------------------------------

很多媒体都在每赛季之前给球员排个名，我也有这个癖好…………，我会尽量理性的分析球队地位，个人能力等因素，评出我心目中的下赛季50大现役球员，这个50大是指预估他本赛季在篮球场上对球队的影响力……不是过去的荣誉什么的，所以难免有一定的主观性……如果把你喜欢的球星排低了，欢迎理性讨论！

状元维金斯镇楼

P.S 1 我每天都至少更新一个，不TJ。

      2 今年的新秀我就不考虑了，没上赛季参照

2 楼-------------------------------------------------------------------------------------

50 惊喜新人王 迈卡威

上赛季数据

篮板 6.2  助攻 6.3  抢断 1.9 盖帽  0.6 失误 3.5 犯规  3  得分 16.7

新赛季第50位，我给上赛季的新人王迈卡威。 上赛季迈卡威在彻底重建的76人中迅速掌握了球队，一开始就三双搞定了热火赢得了万千眼球。后来也屡屡有经验的表现，新秀赛季就拿过三双的球员不多，迈卡威现在可以说在76人站稳了脚跟。

7、python爬虫xpath的语法

http://www.cnblogs.com/lonenysky/p/4649455.html

//*[@id="AD_4586850"]/div[1]/strong/i

//*[@id="shop_list"]/div[1]/strong/i

//*[@id="shop_list"]

8、requests用法

http://cuiqingcai.com/2556.html

# -*- coding: utf-8 -*-

from lxml import etree

import requests

import re

#编码转换

import sys

#headers构造一个字典，里面保存了user-agent

#headers= { 'User-Agent' : 'User-Agent:Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/53.0.2785.143 Safari/537.36' }

# Download the page. requests applies no timeout unless one is given, so an
# explicit limit keeps the script from waiting forever on a silent server.
html = requests.get('http://cuiqingcai.com', timeout=10)
print(html.text)

9、sub使用

http://blog.csdn.net/lovemianmian/article/details/8867613

1、去除img标签

import re

# Sample input: an "<img...>" span followed by a run of spaces and plain text
text = '<imgJGood is a handsome boy,>         he is cool, clever, and so on...'

# Non-greedy match from "<img" up to the first ">"
removeImg = re.compile('<img.*?>')

# Drop the matched span, then trim the surrounding whitespace
s = removeImg.sub("", text).strip()
print(s)

C:\Python34\python.exe G:/xiaoshuo2.py

he is cool, clever, and so on...

1、1 单独去除7位长空格

import re

text ='<imgJGood is a handsome boy,>         he is cool, clever, and so on...'

removeImg = re.compile('| {7}|')

s=re.sub(removeImg,"",text).strip()

print (s)

打印

C:\Python34\python.exe G:/xiaoshuo2.py

<imgJGood is a handsome boy,>         he is cool, clever, and so on...

2、去除img标签 + 去除7位长空格

import re

# Sample input: an "<img...>" span followed by a run of spaces and plain text
text = '<imgJGood is a handsome boy,>         he is cool, clever, and so on...'

# Remove "<img...>" spans and runs of seven spaces. The trailing empty
# alternative of the original pattern is dropped: it only added empty matches
# at every position and never changed the result of the substitution.
removeImg = re.compile(r'<img.*?>| {7}')

# The spaces left over after the seven-space match are trimmed by strip()
s = removeImg.sub("", text).strip()
print(s)

打印：

C:\Python34\python.exe G:/xiaoshuo2.py

he is cool, clever, and so on...

Process finished with exit code 0

3、去除img标签 + 保留7位长空格

import re

# Sample input: an "<img...>" span followed by a run of spaces and plain text
text = '<imgJGood is a handsome boy,>         he is cool, clever, and so on...'

# NOTE: {7} applies to the ">" right before it, so this pattern needs seven
# consecutive ">" characters. The sample contains a single ">", hence nothing
# matches and the text comes back unchanged.
removeImg = re.compile('<img.*?>{7}')

s = removeImg.sub("", text).strip()
print(s)

打印：

C:\Python34\python.exe G:/xiaoshuo2.py

<imgJGood is a handsome boy,>         he is cool, clever, and so on...

Process finished with exit code 0

4、把<a>标签本身去掉，保留两个标签中间的内容

import re

# Sample input: a hyperlink followed by plain text
text = '<a href="http://jump2.bdimg.com/safecheck/index?url=x+Z5)">迈卡威</a>刷出了不错的数据'

# Matches the opening <a ...> tag or the closing </a> tag, not the link text
removeImg = re.compile('<a.*?>|</a>')

# Only the tags disappear; the text between them is kept
s = removeImg.sub("", text).strip()
print(s)

打印：

C:\Python34\python.exe G:/xiaoshuo2.py

迈卡威刷出了不错的数据

5，把<br>换行符换成\n 换行符

import re

# Sample input: leftover markup with several <br> tags between text lines
text = 'height="510"><br><br><br><br>状元维金斯镇楼<br>P.S 1 我每天都至少更新一个，不TJ。<br>      2 今年的新秀我就不考虑了，没上赛季参照'

# "<br><br>" is listed first, so a pair of <br> tags becomes ONE line break
removeImg = re.compile('<br><br>|<br>')

s = removeImg.sub("\n", text).strip()
print(s)

C:\Python34\python.exe G:/xiaoshuo2.py

height="510">

状元维金斯镇楼

P.S 1 我每天都至少更新一个，不TJ。

      2 今年的新秀我就不考虑了，没上赛季参照

5.1，把<br>换行符换成\n 换行符

import re

# Sample input: leftover markup with several <br> tags between text lines
text = 'height="510"><br><br><br><br>状元维金斯镇楼<br>P.S 1 我每天都至少更新一个，不TJ。<br>      2 今年的新秀我就不考虑了，没上赛季参照'

# Every single <br> becomes its own line break
removeImg = re.compile('<br>')

s = removeImg.sub("\n", text).strip()
print(s)

C:\Python34\python.exe G:/xiaoshuo2.py

height="510">

状元维金斯镇楼

P.S 1 我每天都至少更新一个，不TJ。

      2 今年的新秀我就不考虑了，没上赛季参照

10、正则表达式

<div class="list-item">

            <div class="personal-info">

                <div class="pic-word">

                    <div class="pic s60">

                        <a href="//mm.taobao.com/687471686.htm" target="_blank" class="lady-avatar">        <img src="//gtd.alicdn.com/sns_logo/i2/TB1XZ1PQVXXXXaJXpXXSutbFXXX.jpg_60x60.jpg" alt="" width="" height=""/>

</a>

                    </div>

                    <p class="top">

                    <a class="lady-name" href="//mm.taobao.com/self/model_card.htm?user_id=687471686" target="_blank">jenkins</a>

                    <em><strong>27</strong>岁</em>

                    <span>广州市</span>

pattern = re.compile('<div class="list-item">.*? href="(.*?)".*? src="(.*?)".*? target="_blank">(.*?)</a>.*?<strong>(.*?)</strong>.*?<span>(.*?)</span>',re.S)

或者

pattern = re.compile('<div class="list-item">.*?<a href="(.*?)".*?<img src="(.*?)".*?<a class="lady-name".*?>(.*?)</a>.*?<strong>(.*?)</strong>.*?<span>(.*?)</span>',re.S)

https://image.baidu.com/search/index?tn=baiduimage&ct=201326592&lm=-1&cl=2&ie=gbk&word=%C3%C0%C5%AE%CD%BC%C6%AC&fr=ala&ala=1&alatpl=cover&pos=0&hs=2&xthttps=111111

python爬虫前提技术的更多相关文章

python爬虫伪装技术应用
版权声明:本文为博主原创文章,转载请注明出处: https://blog.csdn.net/sc2079/article/details/82423865 -写在前面本篇博客主要是爬虫伪装技术的应 ...
Python爬虫入门教程 58-100 python爬虫高级技术之验证码篇4-极验证识别技术之一
目录验证码类型官网最新效果找个用极验证的网站拼接验证码图片编写自动化代码核心run方法模拟拖动方法图片处理方法初步运行结果拼接图图片存储到本地 @ 验证码类型今天要搞定的验证码 ...
Python爬虫入门教程 57-100 python爬虫高级技术之验证码篇3-滑动验证码识别技术
滑动验证码介绍本篇博客涉及到的验证码为滑动验证码,不同于极验证,本验证码难度略低,需要的将滑块拖动到矩形区域右侧即可完成. 这类验证码不常见了,官方介绍地址为:https://promotion.a ...
Python爬虫入门教程 56-100 python爬虫高级技术之验证码篇2-开放平台OCR技术
今日的验证码之旅今天你要学习的验证码采用通过第三方AI平台开放的OCR接口实现,OCR文字识别技术目前已经比较成熟了,而且第三方比较多,今天采用的是百度的. 注册百度AI平台官方网址:http:/ ...
Python爬虫入门教程 55-100 python爬虫高级技术之验证码篇
验证码探究如果你是一个数据挖掘爱好者,那么验证码是你避免不过去的一个天坑,和各种验证码斗争,必然是你成长的一条道路,接下来的几篇文章,我会尽量的找到各种验证码,并且去尝试解决掉它,中间有些技术甚至我 ...
Python爬虫入门教程 59-100 python爬虫高级技术之验证码篇5-极验证识别技术之二
图片比对昨天的博客已经将图片存储到了本地,今天要做的第一件事情,就是需要在两张图片中进行比对,将图片缺口定位出来缺口图片完整图片计算缺口坐标对比两张图片的所有RBG像素点,得到不一样像素点的 ...
《Python爬虫技术：深入理解原理、技术与开发》已经出版，送Python基础视频课程
好消息,<Python爬虫技术:深入理解原理.技术与开发>已经出版!!! JetBrains官方推荐图书!JetBrains官大中华区市场部经理赵磊作序!送Python基础视频课程!J ...
Python爬虫帮你打包下载所有抖音好听的背景音乐，还不快收藏一起听歌【华为云技术分享】
版权声明:本文为博主原创文章,遵循CC 4.0 BY-SA版权协议,转载请附上原文出处链接和本声明. 本文链接:https://blog.csdn.net/devcloud/article/detai ...
Python爬虫技术：爬虫时如何知道是否代理ip伪装成功？
前言本文的文字及图片来源于网络,仅供学习.交流使用,不具有任何商业用途,版权归原作者所有,如有问题请及时联系我们以作处理. python爬虫时如何知道是否代理ip伪装成功: 有时候我们的爬虫程序添加了 ...

随机推荐

Python——列表赋值的若干用例
原创声明:本文系博主原创文章,转载或引用请注明出处. 1. 直接赋值 >>> a = [1,2,3,4,5] >>> b = a >>> id(a ...
html2canvas-html图片合成-canvas生成图片
作用 html2canvas可以通过纯JS对浏览器端经行截屏,但截图的精确度还有待提高,部分css不可识别,所以在canvas中不能完美呈现原画面样式支持的浏览器 Firefox 3.5+ Goog ...
Java 包装类及其与String转换、进制转换
一.包装类 1.基本类型和引用类型 Java中的基本类型我们都知道有8种,但是作为基本类型限制功能的发挥,例如整形转String类型等可能需要类方法实现会更加简便.那么八个基本类型对应八个包装类,即引 ...
java面试题(目录版)
在https://www.cnblogs.com/marsitman/p/9539369.html 根据自己以往的面试经验,在该基础上做了补充和删减,均链接到相应的地址(链接失效请留言评论). 一. ...
爱搞事情的webpack
webpack 是一个现代 JavaScript 应用程序的静态模块打包器(module bundler). 当 webpack 处理应用程序时,它会递归地构建一个依赖关系图(dependency g ...
phpStudy环境下composer的安装
前言原来是做php开发的,现在转行前端工程师,因为很久没有接触了,可能会有其他问题,这里简单记录一下,后期遇到什么问题再进行更新~ 话说下载特别慢所以这里给个网盘链接Composer-Setup.e ...
什么是http协议(一)
http协议是大家在互联网中最为熟悉的协议,只要上网大家都会遇到,但是,很多人被问道什么是http协议,http协议的内容是什么就懵了.这里,我们随便聊聊http协议. 首先,我们说说协议.我一直觉得 ...
java——ThreadLocal、ThreadLocalMap、Thread三者的关系
https://www.jianshu.com/p/377bb840802f Thread类中维护了一个成员变量:ThreadLocalMap 每个Thread有一个自己的ThreadLocalMap ...
C# 常用方法——base64字符串转图片
其他常用方法详见:https://www.cnblogs.com/zhuanjiao/p/12060937.html /// <summary> /// base64编码的文本转为图片 / ...
[Python之路] ORM（对象关系映射）
一.概念 ORM是Python后端Web框架Django的核心思想,"Object Relational Mapping",即对象-关系映射,简称ORM. 一句话理解就是: 创建一 ...

python爬虫前提技术

python爬虫前提技术的更多相关文章

随机推荐

热门专题