XML # document.xml为docx文件中的 from xml.etree.ElementTree import parse from xml.etree.ElementTree import XMLParser f = open('document.xml') doc = parse(f, XMLParser(encoding="utf-8")) # 1 t_elems = doc.findall('.//{http://schemas.openxmlformats.org/wordprocessingml/2006/main}t') # 2 ns = {'w': 'http://schemas.openxmlformats.org/wordprocessingml/2006/main'} t_elems = doc.findall('.//w:t', ns) # 3 t_elems = doc.iter('{http://schemas.openxmlformats.org/wordprocessingml/2006/main}t') for elem in t_elems: print(elem.text) LXML from lxml import etree f = open('document.xml') doc_lxml = etree.parse(f, etree.XMLParser(encoding="utf-8")) # 1 p_lxml = doc_lxml.iter('{http://schemas.openxmlformats.org/wordprocessingml/2006/main}t') # 2 ns = {'w': 'http://schemas.openxmlformats.org/wordprocessingml/2006/main'} find_results = doc_lxml.findall('//w:t', ns) # Namespace prefix->URI mapping known in the context of this Element.
Read More →
from lxml import etree html = etree.HTML(resHtml, parser=etree.HTMLParser(encoding='utf-8')) # 处理源文件的时候,由于没有指定编码,所以它使用了一个默认编码,从而导致和UTF-8冲突,产生乱码 # http://lxml.
Read More →