【Python】 xml转json

　　虽然python有解析xml的模块，也有生成json的模块，但是没有把这两者连接起来的模块。

　　下面是Martin Blech大神写的一个方便的模块xmltodict（以MIT许可证发布），它可以把XML解析成和JSON结构一致的字典，供大家参考。也别忘了在用之前先拜谢作者三次ww

#!/usr/bin/env python

"Makes working with XML feel like you are working with JSON"

try:

    from defusedexpat import pyexpat as expat

except ImportError:

    from xml.parsers import expat

from xml.sax.saxutils import XMLGenerator

from xml.sax.xmlreader import AttributesImpl

try:  # pragma no cover

    from cStringIO import StringIO

except ImportError:  # pragma no cover

    try:

        from StringIO import StringIO

    except ImportError:

        from io import StringIO

try:  # pragma no cover

    from collections import OrderedDict

except ImportError:  # pragma no cover

    try:

        from ordereddict import OrderedDict

    except ImportError:

        OrderedDict = dict

# Python 2/3 compatibility alias: use `basestring` when the interpreter
# defines it; otherwise the lookup raises NameError and `str` is used.
try:  # pragma no cover

    _basestring = basestring

except NameError:  # pragma no cover

    _basestring = str

# Same fallback for the text type: `unicode` when it is defined, else `str`.
try:  # pragma no cover

    _unicode = unicode

except NameError:  # pragma no cover

    _unicode = str

__author__ = 'Martin Blech'

__version__ = '0.10.2'

__license__ = 'MIT'

class ParsingInterrupted(Exception):
    """Raised to stop parsing when an item callback returns a false-ish value."""

class _DictSAXHandler(object):
    """Build nested dictionaries from start/end/character parse callbacks.

    `startElement`, `endElement` and `characters` are meant to be installed
    as the element and character-data handlers of a parser. With the default
    `item_depth=0` the finished document is left in `self.item`. With a
    non-zero `item_depth`, `item_callback(path, item)` is called each time an
    element at exactly that depth is closed.

    Constructor arguments:

    * `item_depth` -- depth at which `item_callback` fires.
    * `item_callback` -- called as `item_callback(path, item)`; a false-ish
      return value raises `ParsingInterrupted`.
    * `xml_attribs` -- when false, attributes are left out of the element's
      dictionary (they are still recorded in `path`).
    * `attr_prefix` -- prefix prepended to attribute names used as keys.
    * `cdata_key` -- key under which an element's text is stored when the
      element is represented as a dictionary.
    * `force_cdata` -- when true, text-only elements are also represented as
      a dictionary holding `cdata_key`.
    * `cdata_separator` -- string used to join the collected text chunks.
    * `postprocessor` -- optional `postprocessor(path, key, value)` returning
      a `(key, value)` pair; returning `None` drops the entry.
    * `dict_constructor` -- factory used for every mapping that is created.
    * `strip_whitespace` -- when true, element text is stripped and text that
      becomes empty is discarded.
    * `namespace_separator` -- separator between namespace and local name.
    * `namespaces` -- optional mapping from namespace to the short name that
      replaces it; a false-ish value leaves names untouched.
    * `force_list` -- `True` (wrap every new key's value in a list), a
      container of keys to wrap, or a callable `(path, key, value)` that
      decides per key.
    """

    def __init__(self,
                 item_depth=0,
                 item_callback=lambda *args: True,
                 xml_attribs=True,
                 attr_prefix='@',
                 cdata_key='#text',
                 force_cdata=False,
                 cdata_separator='',
                 postprocessor=None,
                 dict_constructor=OrderedDict,
                 strip_whitespace=True,
                 namespace_separator=':',
                 namespaces=None,
                 force_list=None):
        # (name, attrs-or-None) pairs from the root down to the open element.
        self.path = []
        # Saved (item, data) pairs of the enclosing elements.
        self.stack = []
        # Text chunks collected for the element currently being parsed.
        self.data = []
        # Dictionary built so far for the element currently being parsed.
        self.item = None
        self.item_depth = item_depth
        self.xml_attribs = xml_attribs
        self.item_callback = item_callback
        self.attr_prefix = attr_prefix
        self.cdata_key = cdata_key
        self.force_cdata = force_cdata
        self.cdata_separator = cdata_separator
        self.postprocessor = postprocessor
        self.dict_constructor = dict_constructor
        self.strip_whitespace = strip_whitespace
        self.namespace_separator = namespace_separator
        self.namespaces = namespaces
        self.force_list = force_list

    def _build_name(self, full_name):
        """Return `full_name` with its namespace replaced by its short name.

        The name is returned unchanged when no `namespaces` mapping is set or
        when it contains no `namespace_separator`. A namespace mapped to a
        false-ish short name is removed altogether.
        """
        if not self.namespaces:
            return full_name
        i = full_name.rfind(self.namespace_separator)
        if i == -1:
            return full_name
        namespace, name = full_name[:i], full_name[i+1:]
        # Namespaces without a mapping entry are kept as they are.
        short_namespace = self.namespaces.get(namespace, namespace)
        if not short_namespace:
            return name
        return self.namespace_separator.join((short_namespace, name))

    def _attrs_to_dict(self, attrs):
        """Return `attrs` as a mapping.

        A dict is returned as is; any other sequence is read as a flat
        `[name, value, name, value, ...]` list.
        """
        if isinstance(attrs, dict):
            return attrs
        return self.dict_constructor(zip(attrs[0::2], attrs[1::2]))

    def startElement(self, full_name, attrs):
        """Handle an opening tag: extend the path and start a new item."""
        name = self._build_name(full_name)
        attrs = self._attrs_to_dict(attrs)
        self.path.append((name, attrs or None))
        # Elements at or above `item_depth` are not accumulated.
        if len(self.path) > self.item_depth:
            self.stack.append((self.item, self.data))
            if self.xml_attribs:
                attr_entries = []
                for key, value in attrs.items():
                    key = self.attr_prefix+self._build_name(key)
                    if self.postprocessor:
                        entry = self.postprocessor(self.path, key, value)
                    else:
                        entry = (key, value)
                    # A false-ish postprocessor result drops the attribute.
                    if entry:
                        attr_entries.append(entry)
                attrs = self.dict_constructor(attr_entries)
            else:
                attrs = None
            self.item = attrs or None
            self.data = []

    def endElement(self, full_name):
        """Handle a closing tag: emit or attach the finished element.

        Raises `ParsingInterrupted` when the element sits at `item_depth` and
        `item_callback` returns a false-ish value.
        """
        name = self._build_name(full_name)
        if len(self.path) == self.item_depth:
            item = self.item
            if item is None:
                item = (None if not self.data
                        else self.cdata_separator.join(self.data))
            should_continue = self.item_callback(self.path, item)
            if not should_continue:
                raise ParsingInterrupted()
        if self.stack:
            data = (None if not self.data
                    else self.cdata_separator.join(self.data))
            item = self.item
            # Switch back to the enclosing element before attaching to it.
            self.item, self.data = self.stack.pop()
            if self.strip_whitespace and data:
                data = data.strip() or None
            if data and self.force_cdata and item is None:
                item = self.dict_constructor()
            if item is not None:
                # The element has attributes or children (or force_cdata is
                # set): its text goes under `cdata_key` inside its dict.
                if data:
                    self.push_data(item, self.cdata_key, data)
                self.item = self.push_data(self.item, name, item)
            else:
                # Text-only element: its value is the text itself (or None).
                self.item = self.push_data(self.item, name, data)
        else:
            self.item = None
            self.data = []
        self.path.pop()

    def characters(self, data):
        """Collect a chunk of character data for the current element."""
        if not self.data:
            self.data = [data]
        else:
            self.data.append(data)

    def push_data(self, item, key, data):
        """Store `data` under `key` in `item` and return the mapping.

        `item` may be `None`, in which case a new mapping is created. The
        `postprocessor`, if any, runs first and may rewrite the pair or drop
        it by returning `None`. A key that is already present is turned into
        (or extended as) a list of values.
        """
        if self.postprocessor is not None:
            result = self.postprocessor(self.path, key, data)
            if result is None:
                return item
            key, data = result
        if item is None:
            item = self.dict_constructor()
        try:
            value = item[key]
            if isinstance(value, list):
                value.append(data)
            else:
                item[key] = [value, data]
        except KeyError:
            if self._should_force_list(key, data):
                item[key] = [data]
            else:
                item[key] = data
        return item

    def _should_force_list(self, key, value):
        """Tell whether the first value stored under `key` must be a list."""
        if not self.force_list:
            return False
        if isinstance(self.force_list, bool):
            # `force_list=True` forces a list for every key. Without this
            # branch the membership test below raises TypeError and the
            # fallback then tries to call the bool.
            return self.force_list
        try:
            return key in self.force_list
        except TypeError:
            # Not a container: treat it as a callable predicate.
            return self.force_list(self.path[:-1], key, value)

def parse(xml_input, encoding=None, expat=expat, process_namespaces=False,
          namespace_separator=':', **kwargs):
    """Parse the given XML input and convert it into a dictionary.

    `xml_input` can either be a `string` or a file-like object. Any object
    with a `read` attribute is parsed as a file; everything else is parsed
    as a single chunk of data. Text input is encoded with `encoding`
    (UTF-8 when not given) before it is parsed.

    If `process_namespaces` is false (the default), the parser is created
    without a namespace separator. All remaining keyword arguments are
    passed on to the handler that builds the dictionary.

    If `xml_attribs` is `True`, element attributes are put in the dictionary
    among regular child elements, using `@` as a prefix to avoid collisions. If
    set to `False`, they are just ignored.

    Simple example::

        >>> import xmltodict
        >>> doc = xmltodict.parse(\"\"\"
        ... <a prop="x">
        ...   <b>1</b>
        ...   <b>2</b>
        ... </a>
        ... \"\"\")
        >>> doc['a']['@prop']
        u'x'
        >>> doc['a']['b']
        [u'1', u'2']

    If `item_depth` is `0`, the function returns a dictionary for the root
    element (default behavior). Otherwise, it calls `item_callback` every time
    an item at the specified depth is found and returns `None` in the end
    (streaming mode).

    The callback function receives two parameters: the `path` from the document
    root to the item (name-attribs pairs), and the `item` (dict). If the
    callback's return value is false-ish, parsing will be stopped with the
    :class:`ParsingInterrupted` exception.

    Streaming example::

        >>> def handle(path, item):
        ...     print('path:%s item:%s' % (path, item))
        ...     return True
        ...
        >>> xmltodict.parse(\"\"\"
        ... <a prop="x">
        ...   <b>1</b>
        ...   <b>2</b>
        ... </a>\"\"\", item_depth=2, item_callback=handle)
        path:[(u'a', {u'prop': u'x'}), (u'b', None)] item:1
        path:[(u'a', {u'prop': u'x'}), (u'b', None)] item:2

    The optional argument `postprocessor` is a function that takes `path`,
    `key` and `value` as positional arguments and returns a new `(key, value)`
    pair where both `key` and `value` may have changed. Usage example::

        >>> def postprocessor(path, key, value):
        ...     try:
        ...         return key + ':int', int(value)
        ...     except (ValueError, TypeError):
        ...         return key, value
        >>> xmltodict.parse('<a><b>1</b><b>2</b><b>x</b></a>',
        ...                 postprocessor=postprocessor)
        OrderedDict([(u'a', OrderedDict([(u'b:int', [1, 2]), (u'b', u'x')]))])

    You can pass an alternate version of `expat` (such as `defusedexpat`) by
    using the `expat` parameter. E.g:

        >>> import defusedexpat
        >>> xmltodict.parse('<a>hello</a>', expat=defusedexpat.pyexpat)
        OrderedDict([(u'a', u'hello')])

    You can use the force_list argument to force lists to be created even
    when there is only a single child of a given level of hierarchy. The
    force_list argument is a tuple of keys. If the key for a given level
    of hierarchy is in the force_list argument, that level of hierarchy
    will have a list as a child (even if there is only one sub-element).
    This is applied after any user-supplied postprocessor has already run.

        For example, given this input:

        <servers>
          <server>
            <name>host1</name>
            <os>Linux</os>
            <interfaces>
              <interface>
                <name>em0</name>
                <ip_address>10.0.0.1</ip_address>
              </interface>
            </interfaces>
          </server>
        </servers>

        If called with force_list=('interface',), it will produce
        this dictionary:

        {'servers':
          {'server':
            {'name': 'host1',
             'os': 'Linux',
             'interfaces':
              {'interface':
                [ {'name': 'em0', 'ip_address': '10.0.0.1' } ] } } } }

        `force_list` can also be a callable that receives `path`, `key` and
        `value`. This is helpful in cases where the logic that decides whether
        a list should be forced is more complex.
    """
    handler = _DictSAXHandler(namespace_separator=namespace_separator,
                              **kwargs)
    if isinstance(xml_input, _unicode):
        # The parser is fed bytes, so text is encoded up front.
        if not encoding:
            encoding = 'utf-8'
        xml_input = xml_input.encode(encoding)
    if not process_namespaces:
        # A separator of None creates the parser without namespace handling.
        namespace_separator = None
    parser = expat.ParserCreate(
        encoding,
        namespace_separator
    )
    try:
        parser.ordered_attributes = True
    except AttributeError:
        # Jython's expat does not support ordered_attributes
        pass
    parser.StartElementHandler = handler.startElement
    parser.EndElementHandler = handler.endElement
    parser.CharacterDataHandler = handler.characters
    parser.buffer_text = True
    # Choose the entry point by inspecting the input rather than by catching
    # TypeError/AttributeError around ParseFile: those exceptions can also
    # come from user callbacks during parsing and must not trigger a second
    # parse attempt that hides the original error.
    if hasattr(xml_input, 'read'):
        parser.ParseFile(xml_input)
    else:
        parser.Parse(xml_input, True)
    return handler.item

def _emit(key, value, content_handler,
          attr_prefix='@',
          cdata_key='#text',
          depth=0,
          preprocessor=None,
          pretty=False,
          newl='\n',
          indent='\t',
          full_document=True):
    """Write the element(s) described by `key`/`value` to `content_handler`.

    `value` may be a single value or an iterable of values; each entry
    produces one `key` element. A dict entry contributes attributes (keys
    starting with `attr_prefix`), character data (the `cdata_key` entry) and
    child elements (every other key, emitted recursively). `None` produces
    an empty element and any other value is converted to text.

    `preprocessor`, if given, is called as `preprocessor(key, value)` and
    must return a `(key, value)` pair, or `None` to skip the element.
    With `pretty` set, `newl` and `indent` control the whitespace written
    around nested elements.

    Raises `ValueError` when `full_document` is true and more than one
    element would be written at depth 0.
    """
    if preprocessor is not None:
        result = preprocessor(key, value)
        if result is None:
            return
        key, value = result
    # Strings and dicts are iterable but stand for a single element.
    if (not hasattr(value, '__iter__')
            or isinstance(value, _basestring)
            or isinstance(value, dict)):
        value = [value]
    for index, v in enumerate(value):
        if full_document and depth == 0 and index > 0:
            raise ValueError('document with multiple roots')
        if v is None:
            v = OrderedDict()
        elif not isinstance(v, dict):
            v = _unicode(v)
        if isinstance(v, _basestring):
            v = OrderedDict(((cdata_key, v),))
        cdata = None
        attrs = OrderedDict()
        children = []
        for ik, iv in v.items():
            if ik == cdata_key:
                # Stringify non-text values, as is already done for scalar
                # element values and for attributes. Passed through raw,
                # an int makes the content handler raise and a falsy value
                # such as 0 is silently dropped.
                if iv is not None and not isinstance(iv, _basestring):
                    iv = _unicode(iv)
                cdata = iv
                continue
            if ik.startswith(attr_prefix):
                if not isinstance(iv, _unicode):
                    iv = _unicode(iv)
                attrs[ik[len(attr_prefix):]] = iv
                continue
            children.append((ik, iv))
        if pretty:
            content_handler.ignorableWhitespace(depth * indent)
        content_handler.startElement(key, AttributesImpl(attrs))
        if pretty and children:
            content_handler.ignorableWhitespace(newl)
        for child_key, child_value in children:
            _emit(child_key, child_value, content_handler,
                  attr_prefix, cdata_key, depth+1, preprocessor,
                  pretty, newl, indent)
        if cdata is not None:
            content_handler.characters(cdata)
        if pretty and children:
            content_handler.ignorableWhitespace(depth * indent)
        content_handler.endElement(key)
        if pretty and depth:
            content_handler.ignorableWhitespace(newl)

def unparse(input_dict, output=None, encoding='utf-8', full_document=True,
            **kwargs):
    """Serialize `input_dict` as XML (the inverse operation of `parse`).

    When `output` (a file-like object) is given, the XML is written to it
    and nothing is returned; otherwise the XML is returned as a string.

    Keys that start with `attr_prefix` (default=`'@'`) become attributes of
    the XML node, and the key equal to `cdata_key` (default=`'#text'`)
    supplies the node's character data.

    Pass `pretty=True` (default=`False`) for pretty-printed output. Lines
    then end with `'\n'` and are indented with `'\t'`; use the `newl` and
    `indent` parameters to change either.

    Raises `ValueError` if `full_document` is true and `input_dict` does not
    hold exactly one root.
    """
    if full_document and len(input_dict) != 1:
        raise ValueError('Document must have exactly one root.')

    # Without a caller-supplied sink, collect the XML in memory and return it.
    return_text = output is None
    if return_text:
        output = StringIO()

    generator = XMLGenerator(output, encoding)
    if full_document:
        generator.startDocument()
    for root_key, root_value in input_dict.items():
        _emit(root_key, root_value, generator, full_document=full_document,
              **kwargs)
    if full_document:
        generator.endDocument()

    if not return_text:
        return None

    text = output.getvalue()
    try:  # pragma no cover
        # Byte strings are decoded; text has no `decode` and is kept as is.
        text = text.decode(encoding)
    except AttributeError:  # pragma no cover
        pass
    return text

if __name__ == '__main__':  # pragma: no cover
    # Command-line mode: parse XML from standard input and write every
    # (path, item) pair found at the requested depth to standard output
    # as a marshal record.
    import marshal
    import sys

    try:
        # Use the `buffer` attribute of both streams when they provide one.
        stdin, stdout = sys.stdin.buffer, sys.stdout.buffer
    except AttributeError:
        stdin, stdout = sys.stdin, sys.stdout

    # Exactly one command-line argument is expected: the item depth.
    (depth_arg,) = sys.argv[1:]
    item_depth = int(depth_arg)

    def handle_item(path, item):
        marshal.dump((path, item), stdout)
        return True

    try:
        root = parse(stdin,
                     item_depth=item_depth,
                     item_callback=handle_item,
                     dict_constructor=dict)
        # At depth 0 the callback is never invoked during parsing, so the
        # whole document is dumped once parsing has finished.
        if item_depth == 0:
            handle_item([], root)
    except KeyboardInterrupt:
        pass

【Python】 xml转json的更多相关文章

Python: xml转json
1,引言 GooSeeker早在9年前就开始了Semantic Web领域的产品化,MS谋数台和DS打数机是其中两个产品.对web内容做结构化转换和语义处理的主要路线是 XML -> RDF - ...
Python 解析构建数据大杂烩 -- csv、xml、json、excel
Python 可以通过各种库去解析我们常见的数据.其中 csv 文件以纯文本形式存储表格数据,以某字符作为分隔值,通常为逗号:xml 可拓展标记语言,很像超文本标记语言 Html ,但主要对文档和数据 ...
Python解析xml与JSON
xml与json是常用的文件交换格式,常用来表示网页的html则是xml的变种.解析xml和json在web开发中有着重要应用. DOM解析XML 文件对象模型(Document Object Mod ...
python入门（十）：XML和JSON解析
一.python解析XML 1.xml.dom.*模块,它是W3C DOM API的实现,若需要处理DOM API则该模块很适合,注意xml.dom包里面有许多模块,须区分它们间的不同: 2.xml. ...
python cookbook第三版学习笔记七：python解析csv,json,xml文件
CSV文件读取: Csv文件格式如下:分别有2行三列. 访问代码如下: f=open(r'E:\py_prj\test.csv','rb') f_csv=csv.reader(f) for f in ...
[Network] HTML、XML和JSON学习汇总
写在前面:楼主也是刚刚接触这方面的知识,之前完全是零基础,后来经朋友推荐了几个不错的博文,看完以后豁然开朗.但是此博文更加偏重于基础知识介绍(其实更深的楼主也还不了解,这方面的大神请绕道),只是分享个 ...
xml和json的区别
本文转自SanMaoSpace的博客链接地址如下:http://www.cnblogs.com/SanMaoSpace/p/3139186.html 1.定义介绍 (1).XML定义扩展标记语言 ( ...
数据解析（XML和JSON数据结构）
一解析二 XML数据结构三 JSON 数据结构一解析 1 定义: 从事先规定好的格式中提取数据解析的前提:提前约定好格式,数据提供方按照格式提供数据.数据获取方则按照 ...
【原】iOS学习之XML与JSON两种数据结构比较和各自底层实现
1.XML与JSON两种数据结构的优缺点 1> XML 优点:  格式统一, 符合标准  容易与其他系统进行远程交互, 数据共享比较方便   缺点: XML文件格式文件庞大, 格式复杂, 传输占 ...
XML与JSON的对比
XML与JSON的对比 1.各自定义 XML 扩展标记语言 (Extensible Markup Language, XML) ,用于标记电子文件使其具有结构性的标记语言,可以用来标记数据.定义数据类 ...

随机推荐

工作中常用的linux命令（1）
1.cd :进入一个目录,例如进入/home/admin目录:cd /home/admin 2.pwd :查看当前所在目录:如图: 3.ls :列出当前目录下的所有文件: 4.ll :列出当前目录下的 ...
Java求素数时出现错误
Java求素数时出现错误 1.具体错误如下 No enclosing instance of type Prime is accessible. Must qualify the allocation ...
dojo柱形图
dojo柱形图添加属性 1.给柱状图的柱子填充颜色 .addSeries("A",[45,56,12,23,78,67],{stroke:{color:"#FF0000" ...
关于json.parse和json.stringify的区别
json.parse是将字符串解析成json格式而json.stringify是将json解析成字符串格式
idea好用插件（一）
代码规范插件 Alibaba Java Coding Guidelines 安装后可以在文件.文件夹邮件,显示编码规约扫描,点击后显示可以通过双击定位问题代码,对某些问题可以进行快速的修复比如: ...
NgRx/Store 4 + Angular 5使用教程
这篇文章将会示范如何使用NgRx/Store 4和Angular5.@ngrx/store是基于RxJS的状态管理库,其灵感来源于Redux.在NgRx中,状态是由一个包含action和reducer ...
自己用的reset.css，大部分转载，加上自己常用的设置
@charset "UTF-8";/*KISSY CSS Reset理念:清除和重置是紧密不可分的特色:1.适应中文 2.基于最新主流浏览器维护:玉伯(lifesinger@gma ...
[BZOJ1022] [SHOI2008] 小约翰的游戏John (SJ定理)
Description 小约翰经常和他的哥哥玩一个非常有趣的游戏:桌子上有n堆石子,小约翰和他的哥哥轮流取石子,每个人取的时候,可以随意选择一堆石子,在这堆石子中取走任意多的石子,但不能一粒石子也不取 ...
浅析Unity中的Enlighten与混合光照
0x00 前言在Unity的5.6版本之前的5.x中,主要使用了Geomerics公司的Enlighten[1]来提供实时全局照明以及烘焙全局照明,在5.6之后Unity引入了新的Lightmapp ...
如何将生产环境的字段类型从INT修改为BIGINT
介绍改变数据类型是一个看起来很简单的事情,但是如果表非常大或者有最小停机时间的要求,又该如何处理那?这里我提供一个思路来解决这个问题. 背景在一个常规SQL Server heath检查中,使用s ...

【Python】 xml转json

【Python】 xml转json的更多相关文章

随机推荐

热门专题