Python XML DOM Minidom
Contents
Usage
Parsing a file
from xml.dom import minidom

# Parse the XML file into a DOM Document; `filename` is the path (or an open
# file object) of the XML to load.
document = minidom.parse(filename)
If the XML file uses namespaces, it can be easier to disable namespace processing in the parser.
from xml.dom import minidom, expatbuilder

# The second argument of expatbuilder.parse() is `namespaces`; passing it by
# keyword makes it clear that namespace processing is being switched off.
document = expatbuilder.parse(filename, namespaces=False)
Scrape HTML tables
# data[0] collects the header cells; every later entry is one body row.
# Initialised here so the snippet runs on its own; drop this line if `data`
# has already been set up elsewhere.
data = [[]]


def _cell_text(node):
    """Return the concatenated text found at or below *node*."""
    if node.nodeType in (node.TEXT_NODE, node.CDATA_SECTION_NODE):
        return node.data
    return "".join(_cell_text(child) for child in node.childNodes)


for table in document.getElementsByTagName("table"):
    for row in table.getElementsByTagName("tr"):
        # Keep element children only: whitespace between tags is parsed into
        # text nodes, which would otherwise be mistaken for cells.
        cells = [c for c in row.childNodes if c.nodeType == c.ELEMENT_NODE]
        if cells and cells[0].nodeName == "th":
            # An element's own nodeValue is always None; the visible text
            # lives in its descendant text nodes.
            data[0].extend(_cell_text(c) for c in cells)
        else:
            data.append([_cell_text(c) for c in cells])
Traverse all nodes
def recurse_print(node):
    """Print the data of every text node at or below *node*, in document order.

    Args:
        node: Any minidom node; pass the ``Document`` itself to walk a whole
            parsed file.
    """
    if node.nodeType == minidom.Node.TEXT_NODE:
        print(node.data)
    else:
        # Not a text node: descend into its children, depth first.
        for child in node.childNodes:
            recurse_print(child)
# Start the walk at the parsed Document node so the whole tree is visited.
recurse_print(document)
Scrubbing the DOM
It can be useful to scrub the DOM of unhelpful or useless components.
To remove attributes, try:
if node.hasAttribute("hidden"):
node.removeAttribute("hidden")To remove nodes, try:
for child in node.childNodes:
if child.hasAttribute("hidden"):
node.removeChild(child)
child.unlink()To replace nodes, as with comments, try:
# Iterate over a copy so that edits to node.childNodes cannot disturb the loop.
for child in list(node.childNodes):
    # Only element nodes have attributes; text and comment children do not
    # provide hasAttribute().
    if child.nodeType == child.ELEMENT_NODE and child.hasAttribute("hidden"):
        # Create a fresh placeholder for each match: a node has a single
        # parent, so reusing one comment would just move it each time.
        replacement = document.createComment("scrubbed useless node")
        # alternatively, createTextNode or createElement
        # replaceChild() takes the new node first, then the child it replaces.
        node.replaceChild(replacement, child)