【Python】 xml转json
虽然python有解析xml的模块,也有生成json的模块,但是没有把这两者连接起来的模块。
下面是以来自MIT的大神Martin Blech写的一个方便的模块,供大家参考。也别忘了在用之前先拜谢作者三次ww
#!/usr/bin/env python
"Makes working with XML feel like you are working with JSON" try:
from defusedexpat import pyexpat as expat
except ImportError:
from xml.parsers import expat
from xml.sax.saxutils import XMLGenerator
from xml.sax.xmlreader import AttributesImpl
try: # pragma no cover
from cStringIO import StringIO
except ImportError: # pragma no cover
try:
from StringIO import StringIO
except ImportError:
from io import StringIO
try: # pragma no cover
from collections import OrderedDict
except ImportError: # pragma no cover
try:
from ordereddict import OrderedDict
except ImportError:
OrderedDict = dict try: # pragma no cover
_basestring = basestring
except NameError: # pragma no cover
_basestring = str
try: # pragma no cover
_unicode = unicode
except NameError: # pragma no cover
_unicode = str __author__ = 'Martin Blech'
__version__ = '0.10.2'
__license__ = 'MIT' class ParsingInterrupted(Exception):
pass class _DictSAXHandler(object):
def __init__(self,
item_depth=0,
item_callback=lambda *args: True,
xml_attribs=True,
attr_prefix='@',
cdata_key='#text',
force_cdata=False,
cdata_separator='',
postprocessor=None,
dict_constructor=OrderedDict,
strip_whitespace=True,
namespace_separator=':',
namespaces=None,
force_list=None):
self.path = []
self.stack = []
self.data = []
self.item = None
self.item_depth = item_depth
self.xml_attribs = xml_attribs
self.item_callback = item_callback
self.attr_prefix = attr_prefix
self.cdata_key = cdata_key
self.force_cdata = force_cdata
self.cdata_separator = cdata_separator
self.postprocessor = postprocessor
self.dict_constructor = dict_constructor
self.strip_whitespace = strip_whitespace
self.namespace_separator = namespace_separator
self.namespaces = namespaces
self.force_list = force_list def _build_name(self, full_name):
if not self.namespaces:
return full_name
i = full_name.rfind(self.namespace_separator)
if i == -1:
return full_name
namespace, name = full_name[:i], full_name[i+1:]
short_namespace = self.namespaces.get(namespace, namespace)
if not short_namespace:
return name
else:
return self.namespace_separator.join((short_namespace, name)) def _attrs_to_dict(self, attrs):
if isinstance(attrs, dict):
return attrs
return self.dict_constructor(zip(attrs[0::2], attrs[1::2])) def startElement(self, full_name, attrs):
name = self._build_name(full_name)
attrs = self._attrs_to_dict(attrs)
self.path.append((name, attrs or None))
if len(self.path) > self.item_depth:
self.stack.append((self.item, self.data))
if self.xml_attribs:
attr_entries = []
for key, value in attrs.items():
key = self.attr_prefix+self._build_name(key)
if self.postprocessor:
entry = self.postprocessor(self.path, key, value)
else:
entry = (key, value)
if entry:
attr_entries.append(entry)
attrs = self.dict_constructor(attr_entries)
else:
attrs = None
self.item = attrs or None
self.data = [] def endElement(self, full_name):
name = self._build_name(full_name)
if len(self.path) == self.item_depth:
item = self.item
if item is None:
item = (None if not self.data
else self.cdata_separator.join(self.data)) should_continue = self.item_callback(self.path, item)
if not should_continue:
raise ParsingInterrupted()
if len(self.stack):
data = (None if not self.data
else self.cdata_separator.join(self.data))
item = self.item
self.item, self.data = self.stack.pop()
if self.strip_whitespace and data:
data = data.strip() or None
if data and self.force_cdata and item is None:
item = self.dict_constructor()
if item is not None:
if data:
self.push_data(item, self.cdata_key, data)
self.item = self.push_data(self.item, name, item)
else:
self.item = self.push_data(self.item, name, data)
else:
self.item = None
self.data = []
self.path.pop() def characters(self, data):
if not self.data:
self.data = [data]
else:
self.data.append(data) def push_data(self, item, key, data):
if self.postprocessor is not None:
result = self.postprocessor(self.path, key, data)
if result is None:
return item
key, data = result
if item is None:
item = self.dict_constructor()
try:
value = item[key]
if isinstance(value, list):
value.append(data)
else:
item[key] = [value, data]
except KeyError:
if self._should_force_list(key, data):
item[key] = [data]
else:
item[key] = data
return item def _should_force_list(self, key, value):
if not self.force_list:
return False
try:
return key in self.force_list
except TypeError:
return self.force_list(self.path[:-1], key, value) def parse(xml_input, encoding=None, expat=expat, process_namespaces=False,
namespace_separator=':', **kwargs):
"""Parse the given XML input and convert it into a dictionary. `xml_input` can either be a `string` or a file-like object. If `xml_attribs` is `True`, element attributes are put in the dictionary
among regular child elements, using `@` as a prefix to avoid collisions. If
set to `False`, they are just ignored. Simple example:: >>> import xmltodict
>>> doc = xmltodict.parse(\"\"\"
... <a prop="x">
... <b>1</b>
... <b>2</b>
... </a>
... \"\"\")
>>> doc['a']['@prop']
u'x'
>>> doc['a']['b']
[u'1', u'2'] If `item_depth` is `0`, the function returns a dictionary for the root
element (default behavior). Otherwise, it calls `item_callback` every time
an item at the specified depth is found and returns `None` in the end
(streaming mode). The callback function receives two parameters: the `path` from the document
root to the item (name-attribs pairs), and the `item` (dict). If the
callback's return value is false-ish, parsing will be stopped with the
:class:`ParsingInterrupted` exception. Streaming example:: >>> def handle(path, item):
... print 'path:%s item:%s' % (path, item)
... return True
...
>>> xmltodict.parse(\"\"\"
... <a prop="x">
... <b>1</b>
... <b>2</b>
... </a>\"\"\", item_depth=2, item_callback=handle)
path:[(u'a', {u'prop': u'x'}), (u'b', None)] item:1
path:[(u'a', {u'prop': u'x'}), (u'b', None)] item:2 The optional argument `postprocessor` is a function that takes `path`,
`key` and `value` as positional arguments and returns a new `(key, value)`
pair where both `key` and `value` may have changed. Usage example:: >>> def postprocessor(path, key, value):
... try:
... return key + ':int', int(value)
... except (ValueError, TypeError):
... return key, value
>>> xmltodict.parse('<a><b>1</b><b>2</b><b>x</b></a>',
... postprocessor=postprocessor)
OrderedDict([(u'a', OrderedDict([(u'b:int', [1, 2]), (u'b', u'x')]))]) You can pass an alternate version of `expat` (such as `defusedexpat`) by
using the `expat` parameter. E.g: >>> import defusedexpat
>>> xmltodict.parse('<a>hello</a>', expat=defusedexpat.pyexpat)
OrderedDict([(u'a', u'hello')]) You can use the force_list argument to force lists to be created even
when there is only a single child of a given level of hierarchy. The
force_list argument is a tuple of keys. If the key for a given level
of hierarchy is in the force_list argument, that level of hierarchy
will have a list as a child (even if there is only one sub-element).
The index_keys operation takes precendence over this. This is applied
after any user-supplied postprocessor has already run. For example, given this input:
<servers>
<server>
<name>host1</name>
<os>Linux</os>
<interfaces>
<interface>
<name>em0</name>
<ip_address>10.0.0.1</ip_address>
</interface>
</interfaces>
</server>
</servers> If called with force_list=('interface',), it will produce
this dictionary:
{'servers':
{'server':
{'name': 'host1',
'os': 'Linux'},
'interfaces':
{'interface':
[ {'name': 'em0', 'ip_address': '10.0.0.1' } ] } } } `force_list` can also be a callable that receives `path`, `key` and
`value`. This is helpful in cases where the logic that decides whether
a list should be forced is more complex.
"""
handler = _DictSAXHandler(namespace_separator=namespace_separator,
**kwargs)
if isinstance(xml_input, _unicode):
if not encoding:
encoding = 'utf-8'
xml_input = xml_input.encode(encoding)
if not process_namespaces:
namespace_separator = None
parser = expat.ParserCreate(
encoding,
namespace_separator
)
try:
parser.ordered_attributes = True
except AttributeError:
# Jython's expat does not support ordered_attributes
pass
parser.StartElementHandler = handler.startElement
parser.EndElementHandler = handler.endElement
parser.CharacterDataHandler = handler.characters
parser.buffer_text = True
try:
parser.ParseFile(xml_input)
except (TypeError, AttributeError):
parser.Parse(xml_input, True)
return handler.item def _emit(key, value, content_handler,
attr_prefix='@',
cdata_key='#text',
depth=0,
preprocessor=None,
pretty=False,
newl='\n',
indent='\t',
full_document=True):
if preprocessor is not None:
result = preprocessor(key, value)
if result is None:
return
key, value = result
if (not hasattr(value, '__iter__')
or isinstance(value, _basestring)
or isinstance(value, dict)):
value = [value]
for index, v in enumerate(value):
if full_document and depth == 0 and index > 0:
raise ValueError('document with multiple roots')
if v is None:
v = OrderedDict()
elif not isinstance(v, dict):
v = _unicode(v)
if isinstance(v, _basestring):
v = OrderedDict(((cdata_key, v),))
cdata = None
attrs = OrderedDict()
children = []
for ik, iv in v.items():
if ik == cdata_key:
cdata = iv
continue
if ik.startswith(attr_prefix):
if not isinstance(iv, _unicode):
iv = _unicode(iv)
attrs[ik[len(attr_prefix):]] = iv
continue
children.append((ik, iv))
if pretty:
content_handler.ignorableWhitespace(depth * indent)
content_handler.startElement(key, AttributesImpl(attrs))
if pretty and children:
content_handler.ignorableWhitespace(newl)
for child_key, child_value in children:
_emit(child_key, child_value, content_handler,
attr_prefix, cdata_key, depth+1, preprocessor,
pretty, newl, indent)
if cdata is not None:
content_handler.characters(cdata)
if pretty and children:
content_handler.ignorableWhitespace(depth * indent)
content_handler.endElement(key)
if pretty and depth:
content_handler.ignorableWhitespace(newl) def unparse(input_dict, output=None, encoding='utf-8', full_document=True,
**kwargs):
"""Emit an XML document for the given `input_dict` (reverse of `parse`). The resulting XML document is returned as a string, but if `output` (a
file-like object) is specified, it is written there instead. Dictionary keys prefixed with `attr_prefix` (default=`'@'`) are interpreted
as XML node attributes, whereas keys equal to `cdata_key`
(default=`'#text'`) are treated as character data. The `pretty` parameter (default=`False`) enables pretty-printing. In this
mode, lines are terminated with `'\n'` and indented with `'\t'`, but this
can be customized with the `newl` and `indent` parameters. """
if full_document and len(input_dict) != 1:
raise ValueError('Document must have exactly one root.')
must_return = False
if output is None:
output = StringIO()
must_return = True
content_handler = XMLGenerator(output, encoding)
if full_document:
content_handler.startDocument()
for key, value in input_dict.items():
_emit(key, value, content_handler, full_document=full_document,
**kwargs)
if full_document:
content_handler.endDocument()
if must_return:
value = output.getvalue()
try: # pragma no cover
value = value.decode(encoding)
except AttributeError: # pragma no cover
pass
return value if __name__ == '__main__': # pragma: no cover
import sys
import marshal
try:
stdin = sys.stdin.buffer
stdout = sys.stdout.buffer
except AttributeError:
stdin = sys.stdin
stdout = sys.stdout (item_depth,) = sys.argv[1:]
item_depth = int(item_depth) def handle_item(path, item):
marshal.dump((path, item), stdout)
return True try:
root = parse(stdin,
item_depth=item_depth,
item_callback=handle_item,
dict_constructor=dict)
if item_depth == 0:
handle_item([], root)
except KeyboardInterrupt:
pass
【Python】 xml转json的更多相关文章
- Python: xml转json
1,引言 GooSeeker早在9年前就开始了Semantic Web领域的产品化,MS谋数台和DS打数机是其中两个产品.对web内容做结构化转换和语义处理的主要路线是 XML -> RDF - ...
- Python 解析构建数据大杂烩 -- csv、xml、json、excel
Python 可以通过各种库去解析我们常见的数据.其中 csv 文件以纯文本形式存储表格数据,以某字符作为分隔值,通常为逗号:xml 可拓展标记语言,很像超文本标记语言 Html ,但主要对文档和数据 ...
- Python解析xml与JSON
xml与json是常用的文件交换格式,常用来表示网页的html则是xml的变种.解析xml和json在web开发中有着重要应用. DOM解析XML 文件对象模型(Document Object Mod ...
- python入门(十):XML和JSON解析
一.python解析XML 1.xml.dom.*模块,它是W3C DOM API的实现,若需要处理DOM API则该模块很适合,注意xml.dom包里面有许多模块,须区分它们间的不同: 2.xml. ...
- python cookbook第三版学习笔记七:python解析csv,json,xml文件
CSV文件读取: Csv文件格式如下:分别有2行三列. 访问代码如下: f=open(r'E:\py_prj\test.csv','rb') f_csv=csv.reader(f) for f in ...
- [Network] HTML、XML和JSON学习汇总
写在前面:楼主也是刚刚接触这方面的知识,之前完全是零基础,后来经朋友推荐了几个不错的博文,看完以后豁然开朗.但是此博文更加偏重于基础知识介绍(其实更深的楼主也还不了解,这方面的大神请绕道),只是分享个 ...
- xml和json的区别
本文转自SanMaoSpace的博客 链接地址如下:http://www.cnblogs.com/SanMaoSpace/p/3139186.html 1.定义介绍 (1).XML定义扩展标记语言 ( ...
- 数据解析(XML和JSON数据结构)
一 解析 二 XML数据结构 三 JSON 数据结构 一 解析 1 定义: 从事先规定好的格式中提取数据 解析的前提:提前约定好格式,数据提供方按照格式提供数据.数据获取方则按照 ...
- 【原】iOS学习之XML与JSON两种数据结构比较和各自底层实现
1.XML与JSON两种数据结构的优缺点 1> XML 优点: 格式统一, 符合标准 容易与其他系统进行远程交互, 数据共享比较方便 缺点: XML文件格式文件庞大, 格式复杂, 传输占 ...
- XML与JSON的对比
XML与JSON的对比 1.各自定义 XML 扩展标记语言 (Extensible Markup Language, XML) ,用于标记电子文件使其具有结构性的标记语言,可以用来标记数据.定义数据类 ...
随机推荐
- Shell脚本编程学习入门 01
从程序员的角度来看, Shell本身是一种用C语言编写的程序,从用户的角度来看,Shell是用户与Linux操作系统沟通的桥梁.用户既可以输入命令执行,又可以利用 Shell脚本编程,完成更加复杂的操 ...
- FusionCharts 2D柱状图和折线图的组合图调试错误
在设计FusionCharts 2D柱状图和折线图的组合图的时候,我发现不管怎么重启服务器,组合图就是不出来.后来,我通过调试发现我犯了一个致命的错误,运用平常一贯的思维,认为3D图有这种类型,那么2 ...
- R语言︱集合运算——小而美法则
每每以为攀得众山小,可.每每又切实来到起点,大牛们,缓缓脚步来俺笔记葩分享一下吧,please~ --------------------------- 集合运算的一般规则如下: union(x ...
- 图像处理------透明混合 - Alpha Blending效果
基本原理: 图像的透明混合有个专属名词– Alpha Blending 对任意两张图像可以合成为一张图像,合成图像的像素取值根据数学公式: RGB3 = (1- a) * RGB1 + a * RGB ...
- zTree实现地市县三级级联封装类
zTree实现地市县三级级联封装类 Province.java: /** * @Title:Province.java * @Package:com.gwtjs.model * @Descriptio ...
- Python与Mongodb交互
MongoDB 是由C++语言编写的,是一个基于分布式文件存储的开源数据库系统 MongoDB 旨在为WEB应用提供可扩展的高性能数据存储解决方案 MongoDB 将数据存储为一个文档,数据结构由键值 ...
- RHEL简单管理SELINUX
Security Enhanced Linux(SELinux)是一个额外的系统安全层,主要目的是防止已遭泄露的系统服务访问用户数据. 对于一个服务来说,要关注SELinux的三个方面,一是文件SEL ...
- Java冒泡排序法升级版
/* * 冒泡排序之升级版,可比较整型数组.小数型数组 * * */ public static <T extends Comparable<T>> void Bubb ...
- css补充
(一)水平对齐1.使用margin属性水平对齐可通过将左和右外边距设置为 "auto",来对齐块元素.除非已经声明了 !DOCTYPE,否则使用 margin:auto 在 IE8 ...
- 一个仿3D的平面游戏页面
package com.totoo.TouhouMassLight;import android.os.Bundle;import android.view.MotionEvent;import an ...