re模块详解

 #!/usr/bin/env python

 #-*- coding:UTF-8 -*-

 #####################################################

 # Author: sunfx   xingrhce@163.com

 # Last modified:  2014/11/18

 # Filename:  re.py

 # Q  Q  群:  236147801

 #####################################################

 import re

 #1.查找文本中的字符

 # Locate the first occurrence of a literal word and report its position.
 pattern = 'this'
 text = 'Does this text match the pattern?'
 match = re.search(pattern, text)
 s, e = match.start(), match.end()
 # match.re.pattern is the pattern source; match.string is the searched text.
 print('Found "%s"\nin "%s"\nfrom %d to %d ("%s")' % (
     match.re.pattern, match.string, s, e, text[s:e]))

 '''

 match.re.pattern 要匹配的内容

 match.string 被搜索的原始文本

 s  匹配到内容开始索引

 e  匹配到内容结束索引

 text[s:e] 匹配字符

 '''

 #2.编译表达式

 # Pre-compile each literal so the resulting pattern objects can be reused.
 regexes = [re.compile(word) for word in ('this', 'that')]
 print('Text: %r\n' % text)
 for regex in regexes:
     # regex.pattern is the source text of the compiled expression.
     verdict = 'match!' if regex.search(text) else 'no match'
     print('Seeking "%s"-> %s' % (regex.pattern, verdict))

 #3.多重匹配

 text = 'abbaaabbbbaaaaa'
 pattern = 'ab'
 # findall() hands back the matched substrings themselves.
 for match in re.findall(pattern, text):
     print('Found: "%s"' % match)
 # finditer() yields match objects, which also carry the positions.
 for match in re.finditer(pattern, text):
     s, e = match.start(), match.end()
     print('Found "%s" at %d:%d' % (text[s:e], s, e))

 #finditer 返回原输入文字在字符串的位置

 #4.模式语法

 def test_patterns(text,patterns=[]):

     for pattern,desc in patterns:

         print 'Pattern %r (%s) \n' %(pattern,desc)

         print '   %r' % text

         for match in re.finditer(pattern,text):

             s = match.start()

             e = match.end()

             substr = text[s:e] #匹配到的字符

             n_backslashes = text[:s].count('\\') #查找文本:s坐标之前的包含多少\\

             prefix = '.' * ( s + n_backslashes )

             print '    %s%r' % (prefix,substr)

         print

     return

 # Demo: report where the literal 'ab' occurs in the sample string.
 test_patterns('abbaaabbbbaaaaa',

             [('ab',"'a' followed by 'b'")]

     )

 #贪婪模式(默认)会尽可能多地匹配;在量词后加 ? 则变为非贪婪模式,尽可能少地匹配

 '''

      *                '匹配零次到多次'

      +                '至少匹配一次到多次'

      ?                '匹配零次或一次'

      ab*,             'a followerd by zero or more b'),  #匹配0次或者更多次

      ab+,             'a followerd by one or mrore b'),  #最少匹配一次或者更多次

      ab?,             'a followerd by zero or one b'),   #匹配0最多一次

      ab{3},           'a followed by three b'),         #恰好匹配三次

      ab{2,3},           'a followerd by two to three b')   #匹配两至三次

      ab*?,             'a followerd by zero or more b'),  #匹配0次或者更多次

      ab+?,             'a followerd by one or mrore b'),  #最少匹配一次或者更多次

      ab??,             'a followerd by zero or one b'),   #匹配0最多一次

      ab{3}?,           'a followed by three b'),         #恰好匹配三次

      ab{2,3}?,           'a followerd by two to three b')   #匹配两至三次

 '''

 # Usage example.  The sample is not named `str`, which would shadow the
 # built-in type for the rest of the script.
 sample = 'absdsdsdsdsd'
 print(re.findall('ab*', sample))   # greedy: ['ab']
 print(re.findall('ab*?', sample))  # non-greedy: ['a']

 #5.字符集

 '''

 [ab]     'either a or b 匹配a或者b'

 a[ab]+   'a followerd by 1 more a or b 匹配一次a、b或者多次 '

 a[ab]+?  'a followerd by 1 or more a or b,not greedy 匹配1一次可以匹配多次'

 [^]      '不包含内容'

 [a-z]    '所有小写ASCII字母'

 [A-Z]    '所有大写ASCII字母'

 [a-zA-Z] '一个小写和大写的序列'

 [A-Za-z] '一个大写小写的序列'

 '''

 # Character-set demos.  The samples are not named `str`, which would
 # shadow the built-in type for the rest of the script.
 letters = 'aaaaaaaaaaaaaaaaaabbbbbbbbbbbbbbbbbbbbabbbbbbbbbbbasbsbab,a_baba'
 print(re.findall('[ab]', letters))     # every single a or b
 print(re.findall('a[ab]+', letters))   # a followed by a greedy run of a/b
 print(re.findall('a[ab]+?', letters))  # a followed by the shortest run of a/b
 print(re.findall('[^_]', letters))     # every character except '_'

 words = 'China,lovE'
 print(re.findall('[a-z][A-Z]', words))   # ['vE']
 print(re.findall('[A-Z][a-z]', words))   # ['Ch']
 print(re.findall('[A-Z][a-z]+', words))  # ['China']
 print(re.findall('[a-z][A-Z]+', words))  # ['vE']
 print(re.findall('[A-Z][a-z]*', words))  # ['China', 'E']
 print(re.findall('[a-z][A-Z]*', words))  # ['h', 'i', 'n', 'a', 'l', 'o', 'vE']
 print(re.findall('[A-Z][a-z]?', words))  # ['Ch', 'E']
 print(re.findall('[a-z][A-Z]?', words))  # ['h', 'i', 'n', 'a', 'l', 'o', 'vE']

 '''

 .      元字符匹配一个字符

 a.

 b.

 a.*b

 a.*?b

 '''

 c = 'woaizhongguoawsb,wasssssssssssssdsdsdsdbsdddddddbaaabbbbbbbsd'
 # Expected results, in order:
 #   'a.'    -> ['ai', 'aw', 'as', 'aa', 'ab']
 #   'b.'    -> ['b,', 'bs', 'ba', 'bb', 'bb', 'bb', 'bs']
 #   'a.*b'  -> one greedy match running from the first a to the last b
 #   'a.*?b' -> ['aizhongguoawsb', 'asssssssssssssdsdsdsdb', 'aaab']
 #              (the ? makes * stop at the nearest b instead of the last)
 for dot_pattern in ('a.', 'b.', 'a.*b', 'a.*?b'):
     print(re.findall(dot_pattern, c))

 #6.转义码

 '''

 转义码                                   含义

  \d                                    一个数字

  \D                                    一个非数字

  \s                                    空白符(制表符、空格、换行符)

  \S                                    非空白符(符号、字母、数字)

  \w                                    字母数字

  \W                                    非字母数字(符号、制表符、空格、换行符)

 '''

 #7.锚定

 '''

 锚定码                               含义

   ^                              字符串或行的开始

   $                              字符串或行结束

   \A                             字符串开始

   \Z                             字符串结束

   \b                             一个单词开头或者末尾的空串

   \B                             不在一个单词的开头或末尾的空串

 '''

 #8.限制搜索 match、search

 text = 'This is some text --with punctuation.'
 pattern = 'is'
 print('Text    : %s' % text)
 print('pattern: %s' % pattern)
 # match() is anchored at the start of the string, where 'is' does not
 # occur, so it yields None.
 m = re.match(pattern, text)
 print('Match : %s' % (m,))
 # search() scans the whole string and finds the first 'is'.
 s = re.search(pattern, text)
 print('Search : %s' % (s,))

 # Visit every word containing 'is' by restarting each search at the end
 # of the previous match.
 pattern = re.compile(r'\b\w*is\w*\b')
 print('Text: %s' % text)
 pos = 0
 match = pattern.search(text, pos)
 while match:
     s, e = match.span()
     print('  %d : %d = "%s"' % (s, e - 1, text[s:e]))
     pos = e
     match = pattern.search(text, pos)

 #9 用户组解析匹配(任何一个正则都可以为组并嵌套在一个更大的表达式中)

 # Any sub-expression can be wrapped in parentheses to form a group.
 regex = re.compile(r'(\bt\w+)\W+(\w+)')
 print('Input  text      : %s' % text)
 print('Pattern          : %s' % regex.pattern)
 match = regex.search(text)
 # group(0) is the whole match; numbered sub-groups start at 1.
 print('Entire match     : %s' % match.group(0))
 print('World start with "t": %s' % match.group(1))
 print('World after "t" word : %s' % match.group(2))

 # Python extends basic groups with names: (?P<name>pattern).
 print(text)
 print('')
 named_patterns = (
     r'^(?P<first_word>\w+)',
     r'(?P<last_word>\w+)\S*$',
     r'(?P<t_word>\bt\w+)\W+(?P<other_word>\w+)',
     r'(?P<ends_with_t>\w+t)\b',
 )
 for pattern in named_patterns:
     regex = re.compile(pattern)
     match = regex.search(text)
     print('Matching "%s"' % pattern)
     print('  %s' % (match.groups(),))     # values of all groups
     print('  %s' % (match.groupdict(),))  # group name -> matched text
     print('')

 def test_patterns(text,patterns=[]):

     '''Given source text and a list of patterns,look for

     matches for each pattern within the text and print

     them to stdout.

     '''

     #look for each pattern in the text and print the resuls

     for pattern,desc in patterns:

         print 'Pattern %r (%s)\n' % (pattern,desc)

         print '   %r' % text

     for match in re.finditer(pattern,text):

         s = match.start()

         e = match.end()

         prefix = ' ' * (s) #'空格 X 次数'

         print '   %s%r%s' % (prefix,text[s:e],' '*(len(text)-e)),

         print match.groups()

         if match.groupdict():

             print '%s%s' % (' ' * (len(text) -s),match,groupdict())

             print

     return

 # test_patterns() prints its own report and returns None, so it is called
 # as a statement; printing its result only emitted a stray "None".
 test_patterns(text, [(r'(a(a*)(b*))', 'a followerd by 0-n a and 0-n b')])

 '''

 |       代表左右表达式任意匹配一个,他总是先尝试匹配左边的表达式,一旦成功匹配则

 跳过匹配右边的表达式。如果|没有被包括()中,则它的范围是整个正则表达式

 ?:pattern

 '''

 #10.搜索选项 - 不区分大小写的匹配

 '''

 re.IGNORECASE 忽略大小写

 '''

 text = 'This is some text  -- with punctuation.'
 pattern = r'\bT\w+'
 with_case = re.compile(pattern)
 # re.IGNORECASE makes the leading T match either case.
 whitout_case = re.compile(pattern, re.IGNORECASE)
 print('Text: \n  %r' % text)
 print('Pattern:\n %s' % pattern)
 for heading, compiled, line_format in (
         ('Case-sensitive:', with_case, '  %r'),
         ('Case-insensitive:', whitout_case, ' %r')):
     print(heading)
     for match in compiled.findall(text):
         print(line_format % match)

 #11.多行输入

 '''

 MULTILINE  多行匹配

 '''

 text = 'This is some text  -- with punctuation.\nA secone lines.'
 pattern = r'(^\w+)|(\w+\S*$)'
 single_line = re.compile(pattern)
 multiline = re.compile(pattern, re.MULTILINE)
 print('Text:\n %r' % text)
 print('Pattern:\n  %s' % pattern)
 # Without MULTILINE, ^ and $ anchor to the whole string only; with it
 # they anchor to every line.
 for heading, compiled in (('Single Line :', single_line),
                           ('MULTILINE  :', multiline)):
     print(heading)
     for match in compiled.findall(text):
         print('  %r' % (match,))

 # DOTALL lets the dot match newline characters as well.
 pattern = r'.+'
 no_newlines = re.compile(pattern)
 dotall = re.compile(pattern, re.DOTALL)
 print('Text :\n   %r' % text)
 print('Pattern:\n %s' % pattern)
 for heading, compiled in (('No newlines :', no_newlines),
                           ('Dotall    :', dotall)):
     print(heading)
     for match in compiled.findall(text):
         print('  %r' % match)

 #12 Unicode匹配

 '''

 re.UNICODE 匹配Unicode

 '''

 import codecs

 import sys

 #set standard output encoding to UTF-8

 sys.output = codecs.getwriter('UTF-8')(sys.stdout)

 pattern = ur'\w+'

 ascii_pattern = re.compile(pattern)

 unicde_pattern = re.compile(pattern,re.UNICODE)

 print 'Text    :',text

 print 'Pattern :',pattern

 print 'ASCII   :',u', '.join(ascii_pattern.findall(text))

 print 'Unicode :',u', '.join(unicde_pattern.findall(text))

 '''

 re.VERBOSE 让正则更容易读

 '''

 # The pattern is a raw string so the regex escapes (\w, \d, \.) reach the
 # re module untouched instead of relying on Python keeping unknown
 # string escapes as-is.
 address = re.compile(
     r'''
     [\w\d.+-]+    # username
     @
     ([\w\d.]+\.)+ # domain name prefix
     (com|org|edu) # TODO: support more top-level domains
     ''',
     re.UNICODE | re.VERBOSE)

 candidates = [
     u'first.last@example.com',
     u'first.last+category@gmail.com',
     u'valid-address@mail.example.com',
     u'not-valid@example.foo',
 ]

 # Report, per candidate, whether an address was found anywhere in it.
 for candidate in candidates:
     match = address.search(candidate)
     print('%-30s %s' % (candidate, 'Matche' if match else 'no match'))

 # Fixes relative to the earlier version of this demo:
 #   * the words of a name are separated by whitespace (\s+); \S+ made
 #     multi-word names impossible to match as a whole;
 #   * the pattern is a raw string so regex escapes reach re untouched;
 #   * every candidate is followed by a comma -- two missing commas used
 #     to fuse three candidates into one string.
 address = re.compile(
     r'''
     # A name is made up of letters, and may include "."
     # for title abbreviations and middle initials.
     ((?P<name>
         ([\w.,]+\s+)*[\w.,]+)
         \s*
         # Email addresses are wrapped in angle
         # brackets: <> but only if a name is
         # found, so keep the start bracket in this
         # group.
         <
     )?  # the entire name is optional
     # the address itself: username@domain.tld
     (?P<email>
         [\w\d.+-]+    # username
         @
         ([\w\d.]+\.)+ # domain name prefix
         (com|org|edu) # TODO: support more top-level domains
     )
     >? # optional closing angle bracket
     ''',
     re.UNICODE | re.VERBOSE)

 candidates = [
     u'first.last@example.com',
     u'first.last+category@gmail.com',
     u'valid-address@mail.example.com',
     u'not-valid@example.foo',
     u'Fist Last <first.last@example.com>',
     u'NO Brackets first.last@example',
     u'First Last',
     u'First Middle Last <first.last@example.com>',
     u'First M. Last <first.last@example.com>',
     u'<first.last@example.com>',
 ]

 for candidate in candidates:
     print('candidate: %s' % candidate)
     match = address.search(candidate)
     if match:
         # 'name' is None when only a bare address was found.
         print(' Name: %s' % match.groupdict()['name'])
         print(' Email: %s' % match.groupdict()['email'])
     else:
         print('   No match')

 '''

                     正则表达式标志缩写表

     标志                  缩写               描述

   IGNORECASE              i           忽略大小写

   MULTILINE                 m           多行匹配

   DOTALL                    s          让点字符也可以匹配换行符

   UNICODE                  u          匹配Unicode

   VERBOSE                 x          让正则更容易读

 在模式中嵌入标签(?imu)会打开相应的选项

 '''

 text = 'This is  some text -- with punctuation.'
 # (?i) inside the pattern switches on IGNORECASE without a flags argument.
 pattern = r'(?i)\bT\w+'
 regex = re.compile(pattern)
 for label, value in (('Text   :', text),
                      ('Pattern    :', pattern),
                      ('Matches   :', regex.findall(text))):
     print('%s %s' % (label, value))

 #13 前向或后向

 # The look-ahead only accepts an address when its angle brackets are both
 # present or both absent.  The pattern is a raw string so regex escapes
 # reach re untouched.
 address = re.compile(
     r'''
     # A name is made up of letters, and may include "."
     # for title abbreviations and middle initials
     ((?P<name>
         ([\w.,]+\s+)*[\w.,]+
         )
     \s+
     )  # name is no longer optional
     # LOOKAHEAD
     # Email addresses are wrapped in angle brackets, but only
     # if they are both present or neither is.
     (?= (<.*>$)
         |
         ([^<].*[^>]$)
     )
     <? # optional opening angle bracket
     # The address itself: username@domain.tld
     (?P<email>
         [\w\d.+-]+
         @
         ([\w\d.]+\.)+
         (com|org|edu)
     )
     >?
     ''',
     re.UNICODE | re.VERBOSE)

 # The last two candidates carry an unbalanced bracket, as their labels
 # say; with balanced or absent brackets the look-ahead never rejected
 # anything and the demo showed no difference.
 candidates = [
     u'First Last <first.last@example.com>',
     u'No Brackets first.last@example.com',
     u'Open Brackets <first.last@example.com',
     u'Close Brackets first.last@example.com>',
 ]

 for candidate in candidates:
     print('Candidate: %s' % candidate)
     match = address.search(candidate)
     if match:
         print(' Name : %s' % match.groupdict()['name'])
         print(' Email : %s' % match.groupdict()['email'])
     else:
         print('  No match')

 #自动忽略系统常用的noreply邮件地址

 '''

 (?!noreply@.*$) 忽略这个邮件地址

 (?<!noreply>)  两种模式 写在username之前不会向后断言

 (?<=pattern)   用肯定向后断言查找符合某个模式的文本

 '''

 # The negative look-ahead right after ^ rejects any address whose user
 # name is "noreply".  The pattern is a raw string so regex escapes reach
 # re untouched.
 address = re.compile(
     r'''
     ^
     # An address: username@domain.tld
     # Ignore noreply addresses
     (?!noreply@.*$)
     [\w\d.+-]+     # username
     @
     ([\w\d.]+\.)+  # domain name prefix
     (com|org|edu)  # limit the allowed top-level domains
     $
     ''',
     re.UNICODE | re.VERBOSE)

 candidates = [
     u'first.last@example.com',
     u'noreply@example.com',
 ]

 for candidate in candidates:
     print('Candidate: %s' % candidate)
     match = address.search(candidate)
     if match:
         print('  Match: %s' % candidate[match.start():match.end()])
     else:
         print('  No match')

 # The positive look-behind (?<=@) requires an '@' immediately before the
 # handle without making it part of the match.  The pattern is a raw
 # string so regex escapes reach re untouched.
 twitter = re.compile(
     r'''
     # A twitter handle: @username
     (?<=@)
     ([\w\d_]+)   # username
     ''',
     re.UNICODE | re.VERBOSE)

 text = ''' This text includes two Twitter handles.

 One for @TheSF,and one for the author,@doughellmann.

 '''
 print(text)
 for match in twitter.findall(text):
     print('handle: %s' % match)

 #14 自引用表达式 #可以把表达式编号后面来引用

 # Groups are numbered in order of their opening parenthesis and can be
 # referred to later in the same expression as \1, \2, ...
 #
 # The pattern must be a raw string: in an ordinary literal '\1' and '\4'
 # are octal escapes (the characters \x01 and \x04), not back-references.
 address = re.compile(
     r'''
     (\w+)           # first name
     \s+
     (([\w.]+)\s+)?  # optional middle name or initial
     (\w+)           # last name
     \s+
     <
     # The address: first_name.last_name@domain.tld
     (?P<email>
         \1          # first name
         \.
         \4          # last name
         @
         ([\w\d.]+\.)+
         (com|org|edu)
         )
     >
     ''',
     re.UNICODE | re.VERBOSE | re.IGNORECASE)

 candidates = [
     u'First Last <first.last@example.com>',
     u'Different Name <first.last.example.com>',
     u'First Middle Last <first.last@example.com>',
 ]

 for candidate in candidates:
     print('Candidate: %s' % candidate)
     match = address.search(candidate)
     # The report sits inside the loop so every candidate gets a result,
     # not only the last one.
     if match:
         print('  Match name: %s %s' % (match.group(1), match.group(4)))
     else:
         print(' No match')

 #正则表达式解析包括一个扩展,可以使用(?P=name)指示表达式先前匹配的一个命名组的值.

 # (?P=name) refers back to the text matched earlier by the named group.
 # The pattern is a raw string so regex escapes reach re untouched.
 address = re.compile(
     r'''
     # The regular name
     (?P<first_name>\w+)
     \s+
     (([\w.]+)\s+)?
     (?P<last_name>\w+)
     \s+
     <
     # The address: first_name.last_name@domain.tld
     (?P<email>
         (?P=first_name)
         \.
         (?P=last_name)
         @
         ([\w\d.]+\.)+
         (com|org|edu)
         )
     >
     ''',
     re.UNICODE | re.VERBOSE | re.IGNORECASE)

 # The last candidate needs the space before '<' (the pattern requires
 # whitespace there); without it the middle-initial case could not match.
 candidates = [
     u'First last <first.last@example.com>',
     u'Different Name <first.last@example.com>',
     u'First Middle last <first.last@example.com>',
     u'First M. Last <first.last@example.com>',
 ]

 for candidate in candidates:
     print('Candidate: %s' % candidate)
     match = address.search(candidate)
     if match:
         found = match.groupdict()
         # First and last name are reported together on one line.
         print('  Match name: %s %s' % (found['first_name'],
                                        found['last_name']))
         print('  Match email: %s' % found['email'])
     else:
         print('No match')

 #15 用模式修改字符串

 '''

 re支持使用正则表达式作为搜索机制来修改文本，而且可以替换可以引用正则表达式中的匹配组作为替换文本的一部分。

 '''

 # Replace **text** with <b>text</b>, referring to the captured group by
 # number in the replacement.
 bold = re.compile(r'\*{2}(.*?)\*{2}')
 text = 'Make this **bold**. This **too**.'
 print('Text: %s' % text)
 numbered_result = bold.sub(r'<b>\1</b>', text)
 print('Bold: %s' % numbered_result)

 # The replacement can also refer to a named group with \g<name>.
 # count limits the number of substitutions.  subn() works like sub() but
 # returns both the modified string and the number of substitutions made.
 bold = re.compile(r'\*{2}(?P<bold_text>.*?)\*{2}', re.UNICODE)
 print('Text: %s' % text)
 named_result = bold.sub(r'<b>\g<bold_text></b>', text, count=1)
 print('Bold: %s' % named_result)

 #16 利用模式拆分

 '''

 str.split() 是分解字符串来完成解析的最常用方法之一,它只是支持字面值得作为分隔符

 '''

 text = '''Paragraph one

 one tuo lines.

 Paragraph two.

 Paragraph three.'''

 print('With findall:')
 # A paragraph ends at a run of two or more newlines or at the end of the
 # input; only the paragraph text itself is captured.  The earlier pattern
 # r'.+?\n{2,}|$' parsed as "paragraph plus newlines" OR "end of input",
 # so the final paragraph was dropped and an empty string reported instead.
 paragraphs = re.findall(r'(.+?)(?:\n{2,}|$)', text, flags=re.DOTALL)
 for num, para in enumerate(paragraphs):
     print('%d %s' % (num, repr(para)))
     print('')
 print('')

 print('With split:')
 # re.split() yields the same paragraphs with a much simpler pattern.
 for num, para in enumerate(re.split(r'\n{2,}', text)):
     print('%d %s' % (num, repr(para)))
     print('')

re模块详解的更多相关文章

Python中操作mysql的pymysql模块详解
Python中操作mysql的pymysql模块详解前言 pymsql是Python中操作MySQL的模块,其使用方法和MySQLdb几乎相同.但目前pymysql支持python3.x而后者不支持 ...
python之OS模块详解
python之OS模块详解 ^_^,步入第二个模块世界----->OS 常见函数列表 os.sep:取代操作系统特定的路径分隔符 os.name:指示你正在使用的工作平台.比如对于Windows ...
python之sys模块详解
python之sys模块详解 sys模块功能多,我们这里介绍一些比较实用的功能,相信你会喜欢的,和我一起走进python的模块吧! sys模块的常见函数列表 sys.argv: 实现从程序外部向程序传 ...
python中threading模块详解（一）
python中threading模块详解(一) 来源 http://blog.chinaunix.net/uid-27571599-id-3484048.html threading提供了一个比thr ...
python time 模块详解
Python中time模块详解发表于2011年5月5日 12:58 a.m. 位于分类我爱Python 在平常的代码中,我们常常需要与时间打交道.在Python中,与时间处理有关的模块就包括: ...
python time模块详解
python time模块详解转自:http://blog.csdn.net/kiki113/article/details/4033017 python 的内嵌time模板翻译及说明一.简介 ...
小白的Python之路 day5 time,datatime模块详解
一.模块的分类可以分成三大类: 1.标准库 2.开源模块 3.自定义模块二.标准库模块详解 1.time与datetime 在Python中,通常有这几种方式来表示时间:1)时间戳 2)格式化的时 ...
小白的Python之路 day5 random模块和string模块详解
random模块详解一.概述首先我们看到这个单词是随机的意思,他在python中的主要用于一些随机数,或者需要写一些随机数的代码,下面我们就来整理他的一些用法二.常用方法 1. random.r ...
Python中time模块详解
Python中time模块详解在平常的代码中,我们常常需要与时间打交道.在Python中,与时间处理有关的模块就包括:time,datetime以及calendar.这篇文章,主要讲解time模块. ...
Ansible安装部署及常用模块详解
Ansible命令使用 Ansible语法使用ansible <pattern_goes_here> -m <module_name> -a <arguments> ...

随机推荐

a链接中套a链接
<a href="baidu.com"> <div> <div class="title">百度</div> & ...
Linux下搭建DNS服务器
1. 安装需要的软件由于实验过程是在自己电脑进行的,所以需要安装bind bind-chroot,以下为安装过程. 图1-1 安装bind 图1-2 安装bind-chroot 2.修改DNS主配置 ...
sass中中文注释报错
最近项目中用到了sass来编译css,但是scss代码中写了中文注释后编译报错, 经过查找文档和资料,终于找到了解决办法,即在scss文件顶部加上@charset "utf-8"; ...
判断Sql Server2008中ntext不为空
select * from 表名 where datalength(列名)=0 or datalength(列名) is null
C#关于winforms窗体大小、边框、移动、动画等属性
1.窗体的边框设置为无: 2.把窗体高度调整为25,发现跑的时候总在40左右,这时需要改改属性, 把MinimumSize(0, 0)改成不为0的,最好改成和自己想要一样的大小, 最大高宽也应该这么调 ...
多线程完成socket
//服务器端代码 public class Service { //服务器 public static void main(String[] args) { ServerSocket serverSo ...
STEP模块——电子琴
电子琴原理什么是声音?上过初中的朋友都知道声音是由震动所产生的.一定频率的震动就产生了一定频率的声音. 理论研究第一步,让喇叭发出do re mi fa sol la si的音,我们先不管do的频率 ...
[转载]Three Trending Computer Vision Research Areas，从CVPR看接下来几年的CV的发展趋势
As I walked through the large poster-filled hall at CVPR 2013, I asked myself, “Quo vadis Computer V ...
Can't update: no tracked branch
git更新错误:Can't update: no tracked branch No tracked branch configured for branch master. To make your ...
C/C++中的NULL讨论和总结
代码如下 #include <stdio.h> int main(){ int *p; p = NULL; printf("p=0x%x\n",p); typedef ...

re模块详解

re模块详解的更多相关文章

随机推荐

热门专题