XML解析

Categories: Python

XML

# document.xml为docx文件中的

from xml.etree.ElementTree import parse

from xml.etree.ElementTree import XMLParser

# Open in binary mode so the XML parser decodes the raw bytes itself (a
# text-mode handle would first decode them with the platform's default
# encoding), and use a context manager so the file is closed once parsing
# is done.
with open('document.xml', 'rb') as f:
    doc = parse(f, XMLParser(encoding="utf-8"))

# 1: search with the fully qualified tag name, "{namespace-uri}localname".
t_elems = doc.findall('.//{http://schemas.openxmlformats.org/wordprocessingml/2006/main}t')

# 2: same search, using a prefix -> namespace-URI mapping so the path can
# use the short "w:t" form.
ns = {'w': 'http://schemas.openxmlformats.org/wordprocessingml/2006/main'}
t_elems = doc.findall('.//w:t', ns)

# 3: iterate lazily over every matching element in document order.
# This rebinds t_elems, so the loop below consumes this iterator.
t_elems = doc.iter('{http://schemas.openxmlformats.org/wordprocessingml/2006/main}t')

for elem in t_elems:
    print(elem.text)

LXML

from lxml import etree

# Open in binary mode so the parser handles byte decoding itself, and use a
# context manager so the file is closed once parsing is done.
with open('document.xml', 'rb') as f:
    doc_lxml = etree.parse(f, etree.XMLParser(encoding="utf-8"))

# 1: iterate over every element with the fully qualified tag name,
# "{namespace-uri}localname".
p_lxml = doc_lxml.iter('{http://schemas.openxmlformats.org/wordprocessingml/2006/main}t')

# 2: search with a prefix -> namespace-URI mapping. The path is written in
# the relative './/' form, the same form used with xml.etree.ElementTree,
# instead of an absolute '//' path.
ns = {'w': 'http://schemas.openxmlformats.org/wordprocessingml/2006/main'}
find_results = doc_lxml.findall('.//w:t', ns)

# Namespace prefix->URI mapping known in the context of this Element. This
# includes all namespace declarations of the parents.
# Note that changing the returned dict has no effect on the Element.
print(doc_lxml.getroot().nsmap)

See also