
# normalize()的第一个参数指定字符串标准化的方式,常用的有NFC和NFD(另外还有兼容形式NFKC和NFKD)

  1. >>> s1 = 'Spicy Jalape\u00f1o'
  2. >>> s2 = 'Spicy Jalapen\u0303o'
  3. >>> import unicodedata
  4. # NFC表示字符应该是整体组成(可能是使用单一编码)
  5. >>> t1 = unicodedata.normalize('NFC', s1)
  6. >>> t2 = unicodedata.normalize('NFC', s2)
  7. >>> t1 == t2
  8. True
  9. # NFD表示字符应该分解为多个组合字符表示
  10. >>> t1 = unicodedata.normalize('NFD', s1)
  11. >>> t2 = unicodedata.normalize('NFD', s2)
  12. >>> t1 == t2
  13. True



  1. >>> s1
  2. 'Spicy Jalapeño'
  3. >>> t1 = unicodedata.normalize('NFD', s1)
  4. >>> ''.join(c for c in t1 if not unicodedata.combining(c)) # 去除和音字符
  5. 'Spicy Jalapeno'


  1. >>> s = ' hello world \n'
  2. # 去除左右空白字符
  3. >>> s.strip()
  4. 'hello world'
  5. # 去除右边空白字符
  6. >>> s.rstrip()
  7. ' hello world'
  8. # 去除左边空白字符
  9. >>> s.lstrip()
  10. 'hello world \n'
  11. >>> t = '-----hello====='
  12. # 去除左边指定字段('-')
  13. >>> t.lstrip('-')
  14. 'hello====='
  15. # 去除右边指定字符('=')
  16. >>> t.rstrip('=')
  17. '-----hello'

# 值得注意的是,strip()、lstrip()、rstrip()只处理字符串两端,不能去除中间的空白字符;要处理中间的空白字符,可以使用下面的方法

  1. >>> s = ' hello world \n'
  2. # 使用replace()会把所有空格全部删除("一个不留")
  3. >>> s.replace(' ', '')
  4. 'helloworld\n'
  5. # 使用正则,将连续的空白字符合并为一个空格
  6. >>> import re
  7. >>> re.sub(r'\s+', ' ', s)
  8. ' hello world '


# 处理和音字符(即Unicode组合字符,如变音符号)

  1. >>> s = 'pýtĥöñ\fis\tawesome\r\n'
  2. >>> remap = {ord('\r'): None, ord('\t'): ' ', ord('\f'): ' '} # 构造映射字典:'\t'和'\f'替换为空格,'\r'映射为None(即删除该字符)
  3. >>> a = s.translate(remap) # 进行字典转换
  4. >>> a
  5. 'pýtĥöñ is awesome\n'
  6. >>> import unicodedata
  7. >>> import sys
  8. >>> cmb_chrs = dict.fromkeys(c for c in range(sys.maxunicode) if unicodedata.combining(chr(c))) # 查找系统的和音字符,并将其设置为字典的键,值设置为空
  9. >>> b = unicodedata.normalize('NFD', a) # 将原始输入标准化为分解形式字符
  10. >>> b
  11. 'pýtĥöñ is awesome\n'
  12. >>> b.translate(cmb_chrs)
  13. 'python is awesome\n'

# 将所有的Unicode数字字符映射到对应的ASCII字符上

  1. # unicodedata.digit(chr(c)) # 将Unicode数字字符转换为对应的十进制数值,再加上'0'的ASCII码就得到了“0~9”的ASCII码
  2. >>> digitmap = {c: ord('0')+unicodedata.digit(chr(c)) for c in range(sys.maxunicode) if unicodedata.category(chr(c)) == 'Nd'} # (unicodedata.category(chr(c)) == 'Nd')表示该字符属于Unicode十进制数字类别(各种文字中的“0~9”)
  3. >>> len(digitmap)
  4. 610
  5. >>> x = '\u0661\u0662\u0663'
  6. >>> x.translate(digitmap)
  7. '123'


  1. >>> a
  2. 'pýtĥöñ is awesome\n'
  3. >>> b = unicodedata.normalize('NFD', a)
  4. >>> b.encode('ascii', 'ignore').decode('ascii')
  5. 'python is awesome\n'

