python之HTMLParser解析HTML文档

HTMLParser是Python自带的模块，使用简单，能够很容易地实现HTML文件的分析。
本文主要简单讲一下HTMLParser的用法。

使用时需要定义一个从类HTMLParser继承的类，重定义函数：

handle_starttag( tag, attrs)
handle_startendtag( tag, attrs)
handle_endtag( tag)
handle_data(data)

更多属性及方法请查看源代码：

"""A parser for HTML and XHTML."""

# This file is based on sgmllib.py, but the API is slightly different.

# XXX There should be a way to distinguish between PCDATA (parsed

# character data -- the normal case), RCDATA (replaceable character

# data -- only char and entity references and end tags are special)

# and CDATA (character data -- only end tags are special).

import markupbase

import re

# Regular expressions used for parsing

# Matches '&' or '<'.  goahead() searches with it (through self.interesting)
# to find where the next tag or reference may begin.
interesting_normal = re.compile('[&<]')

# '&' followed by a letter or '#'.  goahead() uses it to recognise a
# reference that may simply be unfinished at the end of the buffer.
incomplete = re.compile('&[a-zA-Z#]')

# Named entity reference; group 1 is the name.  The match also swallows one
# terminating character (anything that is not an ASCII letter or digit).
entityref = re.compile('&([a-zA-Z][-.a-zA-Z0-9]*)[^a-zA-Z0-9]')

# Numeric character reference (decimal or hexadecimal) plus one terminating
# character.
charref = re.compile('&#(?:[0-9]+|[xX][0-9a-fA-F]+)[^0-9a-fA-F]')

# '<' immediately followed by an ASCII letter: the opening of a start tag.
starttagopen = re.compile('<[a-zA-Z]')

# '>' -- parse_pi() searches for it to find the end of a processing
# instruction.
piclose = re.compile('>')

# '--', optional whitespace, then '>'.  Not referenced by the code shown in
# this module.
commentclose = re.compile(r'--\s*>')

# see http://www.w3.org/TR/html5/tokenization.html#tag-open-state

# and http://www.w3.org/TR/html5/tokenization.html#tag-name-state

# note: if you change tagfind/attrfind remember to update locatestarttagend too

# Tag name (group 1) followed by any run of whitespace and of '/' characters
# that are not directly followed by '>'.
tagfind = re.compile('([a-zA-Z][^\t\n\r\f />\x00]*)(?:\s|/(?!>))*')

# this regex is currently unused, but left for backward compatibility

tagfind_tolerant = re.compile('[a-zA-Z][^\t\n\r\f />\x00]*')

# One attribute: group 1 is the name, group 2 the optional "= value" part,
# group 3 the value itself (still wrapped in its quotes when it was quoted).
attrfind = re.compile(

    r'((?<=[\'"\s/])[^\s/>][^\s/=>]*)(\s*=+\s*'

    r'(\'[^\']*\'|"[^"]*"|(?![\'"])[^>\s]*))?(?:\s|/(?!>))*')

# Matches a start tag from '<' through its attributes and trailing
# whitespace; check_for_whole_start_tag() then inspects the character that
# follows the match.
locatestarttagend = re.compile(r"""

  <[a-zA-Z][^\t\n\r\f />\x00]*       # tag name

  (?:[\s/]*                          # optional whitespace before attribute name

    (?:(?<=['"\s/])[^\s/>][^\s/=>]*  # attribute name

      (?:\s*=+\s*                    # value indicator

        (?:'[^']*'                   # LITA-enclosed value

          |"[^"]*"                   # LIT-enclosed value

          |(?!['"])[^>\s]*           # bare value

         )

       )?(?:\s|/(?!>))*

     )*

   )?

  \s*                                # trailing whitespace

""", re.VERBOSE)

# '>' -- parse_endtag() searches for it to make sure the end tag is complete.
endendtag = re.compile('>')

# the HTML 5 spec, section 8.1.2.2, doesn't allow spaces between

# </ and the tag name, so maybe this should be fixed

# Well-formed end tag; group 1 is the tag name.
endtagfind = re.compile('</\s*([a-zA-Z][-.a-zA-Z0-9:_]*)\s*>')

class HTMLParseError(Exception):
    """Parse error carrying a message and an optional source position.

    Attributes:
        msg: the error text; must be truthy.
        lineno: first element of *position*, or None when unknown.
        offset: second element of *position* (0-based), or None when unknown.
    """

    def __init__(self, msg, position=(None, None)):
        assert msg
        self.msg = msg
        self.lineno, self.offset = position[0], position[1]

    def __str__(self):
        """Return the message, followed by the line and 1-based column if known."""
        pieces = [self.msg]
        if self.lineno is not None:
            pieces.append(", at line %d" % self.lineno)
        if self.offset is not None:
            # offset is stored 0-based; report it as a 1-based column
            pieces.append(", column %d" % (self.offset + 1))
        return "".join(pieces)

class HTMLParser(markupbase.ParserBase):
    """Find tags and other markup and call handler functions.

    Usage:
        p = HTMLParser()
        p.feed(data)
        ...
        p.close()

    Start tags are handled by calling self.handle_starttag() or
    self.handle_startendtag(); end tags by self.handle_endtag().  The
    data between tags is passed from the parser to the derived class
    by calling self.handle_data() with the data as argument (the data
    may be split up in arbitrary chunks).  Entity references are
    passed by calling self.handle_entityref() with the entity
    reference as the argument.  Numeric character references are
    passed to self.handle_charref() with the string containing the
    reference as the argument.
    """

    # After the start tag of one of these elements, everything up to the
    # matching end tag is passed on as data (see set_cdata_mode()).
    CDATA_CONTENT_ELEMENTS = ("script", "style")

    def __init__(self):
        """Initialize and reset this instance."""
        self.reset()

    def reset(self):
        """Reset this instance.  Loses all unprocessed data."""
        self.rawdata = ''
        self.lasttag = '???'
        self.interesting = interesting_normal
        self.cdata_elem = None
        markupbase.ParserBase.reset(self)

    def feed(self, data):
        r"""Feed data to the parser.

        Call this as often as you want, with as little or as much text
        as you want (may include '\n').
        """
        self.rawdata = self.rawdata + data
        self.goahead(0)

    def close(self):
        """Handle any buffered data."""
        self.goahead(1)

    def error(self, message):
        """Raise HTMLParseError for *message* at the current position."""
        raise HTMLParseError(message, self.getpos())

    # Source text of the start tag most recently accepted by parse_starttag().
    __starttag_text = None

    def get_starttag_text(self):
        """Return full source of start tag: '<...>'."""
        return self.__starttag_text

    def set_cdata_mode(self, elem):
        """Treat everything up to the end tag of *elem* as plain data."""
        self.cdata_elem = elem.lower()
        self.interesting = re.compile(r'</\s*%s\s*>' % self.cdata_elem, re.I)

    def clear_cdata_mode(self):
        """Return to normal scanning, where '&' and '<' are significant."""
        self.interesting = interesting_normal
        self.cdata_elem = None

    def goahead(self, end):
        """Internal -- handle data as far as reasonable.

        May leave state and data to be processed by a subsequent call.
        If *end* is true, force handling all data as if followed by an
        EOF marker.
        """
        rawdata = self.rawdata
        i = 0
        n = len(rawdata)
        while i < n:
            match = self.interesting.search(rawdata, i)  # < or &
            if match:
                j = match.start()
            else:
                if self.cdata_elem:
                    # inside <script>/<style>: wait for the closing tag
                    break
                j = n
            if i < j:
                self.handle_data(rawdata[i:j])
            i = self.updatepos(i, j)
            if i == n:
                break
            startswith = rawdata.startswith
            if startswith('<', i):
                if starttagopen.match(rawdata, i):  # < + letter
                    k = self.parse_starttag(i)
                elif startswith("</", i):
                    k = self.parse_endtag(i)
                elif startswith("<!--", i):
                    k = self.parse_comment(i)
                elif startswith("<?", i):
                    k = self.parse_pi(i)
                elif startswith("<!", i):
                    k = self.parse_html_declaration(i)
                elif (i + 1) < n:
                    self.handle_data("<")
                    k = i + 1
                else:
                    break
                if k < 0:
                    # The construct is not terminated in the buffer.
                    if not end:
                        break
                    # At EOF: pass the unterminated construct on as data.
                    k = rawdata.find('>', i + 1)
                    if k < 0:
                        k = rawdata.find('<', i + 1)
                        if k < 0:
                            k = i + 1
                    else:
                        k += 1
                    self.handle_data(rawdata[i:k])
                i = self.updatepos(i, k)
            elif startswith("&#", i):
                match = charref.match(rawdata, i)
                if match:
                    name = match.group()[2:-1]
                    self.handle_charref(name)
                    k = match.end()
                    # charref also matched one terminating character; give
                    # it back unless it is the ';' that ends the reference
                    if not startswith(';', k-1):
                        k = k - 1
                    i = self.updatepos(i, k)
                    continue
                else:
                    if ";" in rawdata[i:]:  # bail by consuming '&#'
                        self.handle_data(rawdata[i:i+2])
                        i = self.updatepos(i, i+2)
                    break
            elif startswith('&', i):
                match = entityref.match(rawdata, i)
                if match:
                    name = match.group(1)
                    self.handle_entityref(name)
                    k = match.end()
                    # same give-back rule as for character references
                    if not startswith(';', k-1):
                        k = k - 1
                    i = self.updatepos(i, k)
                    continue
                match = incomplete.match(rawdata, i)
                if match:
                    # match.group() will contain at least 2 chars
                    if end and match.group() == rawdata[i:]:
                        self.error("EOF in middle of entity or char ref")
                    # incomplete
                    break
                elif (i + 1) < n:
                    # not the end of the buffer, and can't be confused
                    # with some other construct
                    self.handle_data("&")
                    i = self.updatepos(i, i + 1)
                else:
                    break
            else:
                assert 0, "interesting.search() lied"
        # end while
        if end and i < n and not self.cdata_elem:
            self.handle_data(rawdata[i:n])
            i = self.updatepos(i, n)
        # keep whatever could not be processed yet for the next call
        self.rawdata = rawdata[i:]

    # See w3.org/TR/html5/tokenization.html#markup-declaration-open-state
    # See also parse_declaration in _markupbase
    def parse_html_declaration(self, i):
        """Internal -- parse html declarations.

        Returns the index just past the declaration, or -1 if it is not
        terminated.
        """
        rawdata = self.rawdata
        if rawdata[i:i+2] != '<!':
            self.error('unexpected call to parse_html_declaration()')
        if rawdata[i:i+4] == '<!--':
            # this case is actually already handled in goahead()
            return self.parse_comment(i)
        elif rawdata[i:i+3] == '<![':
            return self.parse_marked_section(i)
        elif rawdata[i:i+9].lower() == '<!doctype':
            # find the closing >
            gtpos = rawdata.find('>', i+9)
            if gtpos == -1:
                return -1
            self.handle_decl(rawdata[i+2:gtpos])
            return gtpos+1
        else:
            return self.parse_bogus_comment(i)

    # see http://www.w3.org/TR/html5/tokenization.html#bogus-comment-state
    def parse_bogus_comment(self, i, report=1):
        """Internal -- parse bogus comment.

        Everything between the two-character opener ('<!' or '</') and the
        next '>' is passed to handle_comment() when *report* is true.
        Returns the index just past the '>', or -1 if not terminated.
        """
        rawdata = self.rawdata
        if rawdata[i:i+2] not in ('<!', '</'):
            # The message used to name parse_comment(), a different method.
            self.error('unexpected call to parse_bogus_comment()')
        pos = rawdata.find('>', i+2)
        if pos == -1:
            return -1
        if report:
            self.handle_comment(rawdata[i+2:pos])
        return pos + 1

    def parse_pi(self, i):
        """Internal -- parse processing instr, return end or -1 if not terminated."""
        rawdata = self.rawdata
        assert rawdata[i:i+2] == '<?', 'unexpected call to parse_pi()'
        match = piclose.search(rawdata, i+2)  # >
        if not match:
            return -1
        j = match.start()
        self.handle_pi(rawdata[i+2: j])
        j = match.end()
        return j

    def parse_starttag(self, i):
        """Internal -- handle starttag, return end or -1 if not terminated."""
        self.__starttag_text = None
        endpos = self.check_for_whole_start_tag(i)
        if endpos < 0:
            return endpos
        rawdata = self.rawdata
        self.__starttag_text = rawdata[i:endpos]
        # Now parse the data between i+1 and endpos into a tag and attrs
        attrs = []
        match = tagfind.match(rawdata, i+1)
        assert match, 'unexpected call to parse_starttag()'
        k = match.end()
        self.lasttag = tag = match.group(1).lower()
        while k < endpos:
            m = attrfind.match(rawdata, k)
            if not m:
                break
            attrname, rest, attrvalue = m.group(1, 2, 3)
            if not rest:
                # attribute given without "= value"
                attrvalue = None
            elif attrvalue[:1] == '\'' == attrvalue[-1:] or \
                 attrvalue[:1] == '"' == attrvalue[-1:]:
                # strip the surrounding quotes
                attrvalue = attrvalue[1:-1]
            if attrvalue:
                attrvalue = self.unescape(attrvalue)
            attrs.append((attrname.lower(), attrvalue))
            k = m.end()
        end = rawdata[k:endpos].strip()
        if end not in (">", "/>"):
            # Junk before the closing bracket: hand the whole tag on as data.
            # (A line/column computation that used to sit here produced
            # values nobody read; it has been dropped.)
            self.handle_data(rawdata[i:endpos])
            return endpos
        if end.endswith('/>'):
            # XHTML-style empty tag: <span attr="value" />
            self.handle_startendtag(tag, attrs)
        else:
            self.handle_starttag(tag, attrs)
            if tag in self.CDATA_CONTENT_ELEMENTS:
                self.set_cdata_mode(tag)
        return endpos

    def check_for_whole_start_tag(self, i):
        """Internal -- check to see if we have a complete starttag.

        Returns the index just past the tag, or -1 if it is incomplete.
        """
        rawdata = self.rawdata
        m = locatestarttagend.match(rawdata, i)
        if m:
            j = m.end()
            nextchar = rawdata[j:j+1]
            if nextchar == ">":
                return j + 1
            if nextchar == "/":
                if rawdata.startswith("/>", j):
                    return j + 2
                if rawdata.startswith("/", j):
                    # buffer boundary
                    return -1
                # else bogus input
                # NOTE: nextchar == "/" implies startswith("/", j), so the
                # two statements below cannot be reached.
                self.updatepos(i, j + 1)
                self.error("malformed empty start tag")
            if nextchar == "":
                # end of input
                return -1
            if nextchar in ("abcdefghijklmnopqrstuvwxyz=/"
                            "ABCDEFGHIJKLMNOPQRSTUVWXYZ"):
                # end of input in or before attribute value, or we have the
                # '/' from a '/>' ending
                return -1
            if j > i:
                return j
            else:
                return i + 1
        raise AssertionError("we should not get here!")

    def parse_endtag(self, i):
        """Internal -- parse endtag, return end or -1 if incomplete."""
        rawdata = self.rawdata
        assert rawdata[i:i+2] == "</", "unexpected call to parse_endtag"
        match = endendtag.search(rawdata, i+1)  # >
        if not match:
            return -1
        gtpos = match.end()
        match = endtagfind.match(rawdata, i)  # </ + tag + >
        if not match:
            if self.cdata_elem is not None:
                # inside <script>/<style> a malformed end tag is just data
                self.handle_data(rawdata[i:gtpos])
                return gtpos
            # find the name: w3.org/TR/html5/tokenization.html#tag-name-state
            namematch = tagfind.match(rawdata, i+2)
            if not namematch:
                # w3.org/TR/html5/tokenization.html#end-tag-open-state
                if rawdata[i:i+3] == '</>':
                    return i+3
                else:
                    return self.parse_bogus_comment(i)
            tagname = namematch.group(1).lower()
            # consume and ignore other stuff between the name and the >
            # Note: this is not 100% correct, since we might have things like
            # </tag attr=">">, but looking for > after the name should cover
            # most of the cases and is much simpler
            gtpos = rawdata.find('>', namematch.end())
            self.handle_endtag(tagname)
            return gtpos+1

        elem = match.group(1).lower()  # script or style
        if self.cdata_elem is not None:
            if elem != self.cdata_elem:
                # an end tag for some other element is data in CDATA mode
                self.handle_data(rawdata[i:gtpos])
                return gtpos

        self.handle_endtag(elem)
        self.clear_cdata_mode()
        return gtpos

    def handle_startendtag(self, tag, attrs):
        """Overridable -- finish processing of start+end tag: <tag.../>.

        The default reports the tag as a start tag followed by an end tag.
        """
        self.handle_starttag(tag, attrs)
        self.handle_endtag(tag)

    def handle_starttag(self, tag, attrs):
        """Overridable -- handle start tag.

        *tag* is the lowercased tag name; *attrs* is a list of
        (lowercased name, value) pairs, with value None when the attribute
        was given without a value.
        """
        pass

    def handle_endtag(self, tag):
        """Overridable -- handle end tag (*tag* is the lowercased name)."""
        pass

    def handle_charref(self, name):
        """Overridable -- handle character reference (text after '&#')."""
        pass

    def handle_entityref(self, name):
        """Overridable -- handle entity reference (name after '&')."""
        pass

    def handle_data(self, data):
        """Overridable -- handle data."""
        pass

    def handle_comment(self, data):
        """Overridable -- handle comment."""
        pass

    def handle_decl(self, decl):
        """Overridable -- handle declaration."""
        pass

    def handle_pi(self, data):
        """Overridable -- handle processing instruction."""
        pass

    def unknown_decl(self, data):
        """Overridable -- does nothing by default."""
        pass

    # Cache of entity name -> character, filled on first use by unescape()
    # and stored on the HTMLParser class itself.
    entitydefs = None

    def unescape(self, s):
        """Internal -- helper to remove special character quoting.

        Replaces '&name;' and '&#number;' references in *s* with the
        character they stand for; references that cannot be resolved are
        left in place.
        """
        if '&' not in s:
            return s

        def replaceEntities(s):
            s = s.groups()[0]
            try:
                if s[0] == "#":
                    s = s[1:]
                    if s[0] in ['x', 'X']:
                        c = int(s[1:], 16)
                    else:
                        c = int(s)
                    return unichr(c)
            except ValueError:
                return '&#'+s+';'
            else:
                # Reached only for named references (no leading '#').
                # Cannot use name2codepoint directly, because HTMLParser supports apos,
                # which is not part of HTML 4
                import htmlentitydefs
                if HTMLParser.entitydefs is None:
                    entitydefs = HTMLParser.entitydefs = {'apos': u"'"}
                    for k, v in htmlentitydefs.name2codepoint.iteritems():
                        entitydefs[k] = unichr(v)
                try:
                    return self.entitydefs[s]
                except KeyError:
                    return '&'+s+';'

        return re.sub(r"&(#?[xX]?(?:[0-9a-fA-F]+|\w{1,8}));", replaceEntities, s)

可以看出，源代码中handle_xxxxxx函数体均是空的，需要自己继承并添加处理内容；否则函数不作任何处理。

1. 获取标签属性

tag是HTML标签的名称，attrs是由(属性名, 属性值)元组(tuple)组成的列表(list)。

如一个标签为：<input type="hidden" name="NXX" id="IDXX" value="VXX" />

那么它的attrs列表为[('type', 'hidden'), ('name', 'NXX'), ('id', 'IDXX'), ('value', 'VXX')]
HTMLParser会自动将标签名和属性名转为小写，属性值保持原样。

下面给出的例子抽取了html中的所有链接：

from HTMLParser import HTMLParser

class MyHTMLParser(HTMLParser):
    """Collect the href value of every <a> start tag fed to the parser."""

    def __init__(self):
        HTMLParser.__init__(self)
        # href values, in the order the anchors were encountered
        self.links = []

    def handle_starttag(self, tag, attrs):
        """Record every href attribute of an <a> tag; ignore other tags."""
        if tag != "a":
            return
        # An anchor without attributes simply contributes nothing.
        self.links.extend(value for name, value in attrs if name == "href")

if __name__ == "__main__":

    # Three anchors whose tag and attribute names differ only in letter case.
    html_code = """

    <a href="www.google.com"> google.com</a>

    <A Href="www.pythonclub.org"> PythonClub </a>

    <A HREF = "www.sina.com.cn"> Sina </a>

    """

    link_parser = MyHTMLParser()
    link_parser.feed(html_code)
    link_parser.close()

    # Show the collected href values.
    print(link_parser.links)

输出为：

['www.google.com', 'www.pythonclub.org', 'www.sina.com.cn']

如果想抽取图形链接：

<img src='http://www.google.com/intl/zh-CN_ALL/images/logo.gif' />

可以重定义 handle_startendtag(tag, attrs) 函数。注意该函数的默认实现会依次调用 handle_starttag() 和 handle_endtag()，因此也可以直接在 handle_starttag() 中处理 img 标签。

2. 获取标签内容　　

test1.html文件内容如下：

<html>

<head>

<title> XHTML 与 HTML 4.01 标准没有太多的不同</title>

</head>

<body>

i love you

</body>

</html>

2.1 第一个例子

import HTMLParser

class TitleParser(HTMLParser.HTMLParser):

    def __init__(self):

        HTMLParser.HTMLParser.__init__(self)

        # self.taglevels=[]

        self.handledtags = ['title','body']

        self.processing = None

    def handle_starttag(self,tag,attrs):

        print '--------------'

        print 'handle start func',tag

    def handle_endtag(self,tag):

        print '================'

        print 'handle end func',tag

if __name__ == '__main__':

    # Read the whole sample document; the with-block closes the file even if
    # reading fails (the handle used to be left open).
    with open('test1.html') as fd:
        html_text = fd.read()

    tp = TitleParser()
    tp.feed(html_text)
    # Let the parser process anything it is still buffering.
    tp.close()

运行结果：

--------------

handle start func html

--------------

handle start func head

--------------

handle start func title

================

handle end func title

================

handle end func head

--------------

handle start func body

================

handle end func body

================

handle end func html

相信大家已经看出来了，解析时碰到<***>，自动调用handle_starttag()；碰到</***>，自动调用handle_endtag()

2.2 添加handle_data方法

import HTMLParser

class TitleParser(HTMLParser.HTMLParser):

    def __init__(self):

        HTMLParser.HTMLParser.__init__(self)

        # self.taglevels=[]

        self.handledtags = ['title','body']

        self.processing = None

    def handle_starttag(self,tag,attrs):

        print '--------------'

        print 'handle start func',tag

    def handle_data(self,data):

        print '####'

        print 'handle data func'

        if data == '\n':

            print r'\n'

        else:

            print data,

    def handle_endtag(self,tag):

        print '======================='

        print 'handle end func',tag

if __name__ == '__main__':

    # Read the whole sample document; the with-block closes the file even if
    # reading fails (the handle used to be left open).
    with open('test1.html') as fd:
        html_text = fd.read()

    tp = TitleParser()
    tp.feed(html_text)
    # Let the parser process anything it is still buffering.
    tp.close()

运行结果：

--------------

handle start func html

####

handle data func

\n

--------------

handle start func head

####

handle data func

\n

--------------

handle start func title

####

handle data func

 XHTML 与 HTML 4.01 标准没有太多的不同 =======================

handle end func title

####

handle data func

\n

=======================

handle end func head

####

handle data func

\n

--------------

handle start func body

####

handle data func

i love you

=======================

handle end func body

####

handle data func

\n

=======================

handle end func html

说明：

handle_data() 处理的是标签之间的文本：解析器每遇到一段位于标签之外的文本（哪怕只是一个换行符），就会调用一次 handle_data()。
html中第一行、第二行分别为<html>和<head>，后面无具体数据，只有回车换行，所以也会调用handle_data()，打印结果为\n；</title>、</head>、</body>之后的换行同理。

2.2 解析需要的内容

import HTMLParser

class TitleParser(HTMLParser.HTMLParser):
    """Collect the text found inside <title> and <body> elements.

    The text chunks are appended to ``self.data`` in document order.
    """

    def __init__(self):
        HTMLParser.HTMLParser.__init__(self)
        self.data = []
        # Name of the tracked tag currently open, or None when outside one.
        self.processing = None
        # Tags whose text should be collected.
        self.handledtags = ['title', 'body']

    def handle_starttag(self, tag, attrs):
        """Start collecting when one of the tracked tags opens."""
        if tag not in self.handledtags:
            return
        self.processing = tag

    def handle_data(self, data):
        """Keep *data* only while a tracked tag is open."""
        if not self.processing:
            return
        self.data.append(data)

    def handle_endtag(self, tag):
        """Stop collecting when the tracked tag that is open gets closed."""
        if self.processing == tag:
            self.processing = None

if __name__ == '__main__':

    # Read the whole sample document; the with-block closes the file even if
    # reading fails (the handle used to be left open).
    with open('test1.html') as fd:
        html_text = fd.read()

    tp = TitleParser()
    tp.feed(html_text)
    # Let the parser process anything it is still buffering.
    tp.close()

    # One collected text chunk per line.  The single-argument call form
    # prints the same text as the print statement did.
    for each in tp.data:
        print(each)

运行结果：

 XHTML 与 HTML 4.01 标准没有太多的不同

i love you

2.3 解析豆瓣热门电影实例

#encoding=utf8

import urllib2

from HTMLParser import HTMLParser

'''

<li class="ui-slide-item s" data-rater="6802" data-enough="True" data-intro="" data-actors="朴灿烈 / 袁姗姗 / 姜潮" data-director="金帝荣" data-region="中国大陆" data-duration="99分钟" data-ticket="https://movie.douban.com/subject/26564988/cinema/" data-trailer="https://movie.douban.com/subject/26564988/trailer" data-star="30" data-rate="5.3" data-release="2016" data-title="所以……和黑粉结婚了" data-dstat-viewport=".screening-bd" data-dstat-watch=".ui-slide-content" data-dstat-mode="click,expose" data-dstat-areaid="70_4">

'''

class MYPARSER(HTMLParser):
    """Collect movie records from <li> start tags that carry data-* attributes.

    Every <li> tag with a non-empty data-title attribute adds one dict to
    ``self.movies`` with the keys 'actors', 'director', 'duration', 'title'
    and 'rate'.  A key maps to None when the tag lacks that attribute.
    """

    # (record key, HTML attribute name) pairs copied into every record.
    # 'data-duration' used to be misspelled 'data-dutation', which made the
    # duration of every movie come out as None.
    _FIELDS = (
        ('actors', 'data-actors'),
        ('director', 'data-director'),
        ('duration', 'data-duration'),
        ('title', 'data-title'),
        ('rate', 'data-rate'),
    )

    def __init__(self):
        HTMLParser.__init__(self)
        self.movies = []

    def handle_starttag(self, tag, attrs):
        """Append a movie record if *tag* is an <li> describing a movie."""
        if tag != 'li':
            return
        found = {}
        for name, value in attrs:
            # setdefault keeps the first occurrence of a repeated attribute
            found.setdefault(name, value)
        if not found.get('data-title'):
            return
        self.movies.append(
            dict((key, found.get(attr)) for key, attr in self._FIELDS))

def movieparser(url):
    """Download *url* and return the movie records found in the page.

    The page is parsed with MYPARSER; the result is its ``movies`` list.
    """
    # Extra request headers go here.
    headers = {}

    # The second positional parameter of urllib2.Request is the request body
    # (``data``), so the header dict has to be passed by keyword.
    req = urllib2.Request(url, headers=headers)

    s = urllib2.urlopen(req)
    try:
        page = s.read()
    finally:
        # release the connection even if reading fails
        s.close()

    myparser = MYPARSER()
    myparser.feed(page)
    myparser.close()
    return myparser.movies

if __name__ == '__main__':

    # One line per movie: title|rate|actors|director|duration
    row_format = '%(title)s|%(rate)s|%(actors)s|%(director)s|%(duration)s'

    for movie in movieparser('https://movie.douban.com/'):
        print(row_format % movie)

运行结果（注意：最后一列时长显示为 None，是因为取属性时把 data-duration 误写成了 data-dutation；改正后该列会显示实际时长）：

寒战2|7.2|郭富城 / 梁家辉 / 杨采妮|梁乐民|None

致青春·原来你还在这里|3.9|吴亦凡 / 刘亦菲 / 金世佳|周拓如|None

大鱼海棠|6.6|季冠霖 / 苏尚卿 / 许魏洲|梁旋|None

忍者神龟2：破影而出 Teenage Mutant Ninja Turtles: Out of the Shadows|6.4|梅根·福克斯 / 斯蒂芬·阿美尔 / 威尔·阿奈特|戴夫·格林|None

摇滚藏獒|6.8|郭德纲 / 郭麒麟 / 于谦|艾什·布兰农|None

发条城市|6.4|王宁 / 修睿 / 王自健|江涛|None

赏金猎人|5.5|李敏镐 / 钟汉良 / 唐嫣|申太罗|None

张震讲故事之合租屋|4.8|卢杉 / 傅亨 / 吴谨西|战越|None

惊天魔盗团2 Now You See Me 2|6.6|杰西·艾森伯格 / 伍迪·哈里森 / 戴夫·弗兰科|朱浩伟|None

海底总动员2：多莉去哪儿 Finding Dory|7.4|艾伦·德杰尼勒斯 / 艾伯特·布鲁克斯 / 艾德·奥尼尔|安德鲁·斯坦顿|None

独立日：卷土重来 Independence Day: Resurgence|5.9|利亚姆·海姆斯沃斯 / 杰夫·高布伦 / 比尔·普尔曼|罗兰·艾默里奇|None

丑小鸭历险记|3.3|朱可可 / 阿飞 / 夏倚轩|郑义|None

所以……和黑粉结婚了|5.3|朴灿烈 / 袁姗姗 / 姜潮|金帝荣|None

筷仙|2.7|胡影怡 / 朱璇 / 周骏|姬雨|None

古田会议|2.9|许铂岑 / 王韦智 / 王怡苏|陈健|None

魔轮|4.8|林心如 / 何润东 / 金世佳|王早|None

代码说明：

代码开头的三引号字符串是待解析标签的一个样例，取自豆瓣网页面
抓取的内容包括：标题、评分、演员、导演、时长