| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374 |
- #!/usr/bin/env python3
- """
- Utilities for editing OOXML documents.
- This module provides XMLEditor, a tool for manipulating XML files with support for
- line-number-based node finding and DOM manipulation. Each element is automatically
- annotated with its original line and column position during parsing.
- Example usage:
- editor = XMLEditor("document.xml")
- # Find node by line number or range
- elem = editor.get_node(tag="w:r", line_number=519)
- elem = editor.get_node(tag="w:p", line_number=range(100, 200))
- # Find node by text content
- elem = editor.get_node(tag="w:p", contains="specific text")
- # Find node by attributes
- elem = editor.get_node(tag="w:r", attrs={"w:id": "target"})
- # Combine filters
- elem = editor.get_node(tag="w:p", line_number=range(1, 50), contains="text")
- # Replace, insert, or manipulate
- new_nodes = editor.replace_node(elem, "<w:r><w:t>new text</w:t></w:r>")
- editor.insert_after(new_nodes[-1], "<w:r><w:t>more</w:t></w:r>")
- # Save changes
- editor.save()
- """
- import html
- from pathlib import Path
- from typing import Optional, Union
- import defusedxml.minidom
- import defusedxml.sax
class XMLEditor:
    """
    Editor for manipulating OOXML XML files with line-number-based node finding.

    The file is parsed with a SAX parser that records the (line, column)
    position of every element in the original file on the element itself, as a
    ``parse_position`` attribute. Nodes can then be looked up by the line
    number they had in the file on disk.

    Attributes:
        xml_path: Path to the XML file being edited
        encoding: Detected encoding of the XML file ('ascii' or 'utf-8')
        dom: Parsed DOM tree with parse_position attributes on elements
    """

    def __init__(self, xml_path):
        """
        Initialize with path to XML file and parse with line number tracking.

        Args:
            xml_path: Path to XML file to edit (str or Path)

        Raises:
            ValueError: If the XML file does not exist
        """
        self.xml_path = Path(xml_path)
        if not self.xml_path.exists():
            raise ValueError(f"XML file not found: {xml_path}")

        # Only the first 200 bytes are inspected; the XML declaration, when
        # present, sits at the very start of the file.
        with open(self.xml_path, "rb") as f:
            header = f.read(200).decode("utf-8", errors="ignore")
        self.encoding = self._detect_encoding(header)

        parser = _create_line_tracking_parser()
        self.dom = defusedxml.minidom.parse(str(self.xml_path), parser)

    @staticmethod
    def _detect_encoding(header):
        """
        Decide which encoding to use when saving, based on the file header.

        The check is case-insensitive, accepts single or double quotes and
        tolerates whitespace around ``=``, all of which the XML declaration
        syntax allows. When a declaration terminator (``?>``) is present only
        the text before it is examined.

        Args:
            header: Decoded text from the start of the XML file

        Returns:
            str: "ascii" if the header declares an ASCII encoding, else "utf-8"
        """
        declaration = "".join(header.partition("?>")[0].split()).lower()
        if 'encoding="ascii"' in declaration or "encoding='ascii'" in declaration:
            return "ascii"
        return "utf-8"

    def get_node(
        self,
        tag: str,
        attrs: Optional[dict[str, str]] = None,
        line_number: Optional[Union[int, range]] = None,
        contains: Optional[str] = None,
    ):
        """
        Get a DOM element by tag and identifier.

        Finds an element with the given tag that satisfies every supplied
        filter (line number, attribute values, contained text). Exactly one
        match must be found.

        Args:
            tag: The XML tag name (e.g., "w:del", "w:ins", "w:r")
            attrs: Dictionary of attribute name-value pairs to match (e.g., {"w:id": "1"})
            line_number: Line number (int) or line range (range) in original XML file (1-indexed)
            contains: Text string that must appear in the concatenated text of the element.
                Supports both entity notation (&#8220;) and Unicode characters (\\u201c).

        Returns:
            The single matching DOM element

        Raises:
            ValueError: If node not found or multiple matches found

        Example:
            elem = editor.get_node(tag="w:r", line_number=519)
            elem = editor.get_node(tag="w:r", line_number=range(100, 200))
            elem = editor.get_node(tag="w:del", attrs={"w:id": "1"})
            elem = editor.get_node(tag="w:p", attrs={"w14:paraId": "12345678"})
            elem = editor.get_node(tag="w:commentRangeStart", attrs={"w:id": "0"})
            elem = editor.get_node(tag="w:p", contains="specific text")
            elem = editor.get_node(tag="w:t", contains="&#8220;Agreement")  # Entity notation
            elem = editor.get_node(tag="w:t", contains="\\u201cAgreement")  # Unicode character
        """
        # Convert HTML entities in the search string to Unicode characters once,
        # so "&#8220;Rowan" and "\u201cRowan" search for the same text.
        needle = html.unescape(contains) if contains is not None else None

        matches = []
        for elem in self.dom.getElementsByTagName(tag):
            if line_number is not None:
                # Elements without a recorded position never match a line filter.
                elem_line = getattr(elem, "parse_position", (None,))[0]
                if isinstance(line_number, range):
                    if elem_line not in line_number:
                        continue
                elif elem_line != line_number:
                    continue

            if attrs is not None and not all(
                elem.getAttribute(attr_name) == attr_value
                for attr_name, attr_value in attrs.items()
            ):
                continue

            if needle is not None and needle not in self._get_element_text(elem):
                continue

            matches.append(elem)

        if not matches:
            # Build descriptive error message
            filters = []
            if line_number is not None:
                if isinstance(line_number, range):
                    line_str = f"lines {line_number.start}-{line_number.stop - 1}"
                else:
                    line_str = f"line {line_number}"
                filters.append(f"at {line_str}")
            if attrs is not None:
                filters.append(f"with attributes {attrs}")
            if contains is not None:
                filters.append(f"containing '{contains}'")

            filter_desc = " ".join(filters)
            base_msg = f"Node not found: <{tag}> {filter_desc}".strip()

            # Pick the hint for the most specific filter that actually
            # constrained the search. An empty `contains` or `attrs` matches
            # everything, so truthiness is right for those. A line filter of 0
            # or an empty range excludes everything, so it is tested against
            # None rather than by truthiness.
            if contains:
                hint = "Text may be split across elements or use different wording."
            elif line_number is not None:
                hint = "Line numbers may have changed if document was modified."
            elif attrs:
                hint = "Verify attribute values are correct."
            else:
                hint = "Try adding filters (attrs, line_number, or contains)."

            raise ValueError(f"{base_msg}. {hint}")

        if len(matches) > 1:
            raise ValueError(
                f"Multiple nodes found: <{tag}>. "
                "Add more filters (attrs, line_number, or contains) to narrow the search."
            )

        return matches[0]

    def _get_element_text(self, elem):
        """
        Recursively extract all text content from an element.

        Skips text nodes that contain only whitespace (spaces, tabs, newlines),
        which typically represent XML formatting rather than document content.

        Args:
            elem: DOM element to extract text from

        Returns:
            str: Concatenated text from all non-whitespace text nodes within the element
        """
        text_parts = []
        for node in elem.childNodes:
            if node.nodeType == node.TEXT_NODE:
                # Skip whitespace-only text nodes (XML formatting)
                if node.data.strip():
                    text_parts.append(node.data)
            elif node.nodeType == node.ELEMENT_NODE:
                text_parts.append(self._get_element_text(node))
        return "".join(text_parts)

    def replace_node(self, elem, new_content):
        """
        Replace a DOM element with new XML content.

        The fragment is parsed before the tree is touched, so a malformed
        fragment leaves the document unchanged.

        Args:
            elem: DOM element to replace
            new_content: String containing XML to replace the node with

        Returns:
            list: All inserted nodes, in document order

        Example:
            new_nodes = editor.replace_node(old_elem, "<w:r><w:t>text</w:t></w:r>")
        """
        parent = elem.parentNode
        nodes = self._parse_fragment(new_content)
        for node in nodes:
            parent.insertBefore(node, elem)
        parent.removeChild(elem)
        return nodes

    def insert_after(self, elem, xml_content):
        """
        Insert XML content after a DOM element.

        Args:
            elem: DOM element to insert after
            xml_content: String containing XML to insert

        Returns:
            list: All inserted nodes, in document order

        Example:
            new_nodes = editor.insert_after(elem, "<w:r><w:t>text</w:t></w:r>")
        """
        parent = elem.parentNode
        next_sibling = elem.nextSibling
        nodes = self._parse_fragment(xml_content)
        for node in nodes:
            # Inserting each node before the same sibling (or appending when
            # elem was the last child) keeps the fragment's original order.
            if next_sibling is not None:
                parent.insertBefore(node, next_sibling)
            else:
                parent.appendChild(node)
        return nodes

    def insert_before(self, elem, xml_content):
        """
        Insert XML content before a DOM element.

        Args:
            elem: DOM element to insert before
            xml_content: String containing XML to insert

        Returns:
            list: All inserted nodes, in document order

        Example:
            new_nodes = editor.insert_before(elem, "<w:r><w:t>text</w:t></w:r>")
        """
        parent = elem.parentNode
        nodes = self._parse_fragment(xml_content)
        for node in nodes:
            parent.insertBefore(node, elem)
        return nodes

    def append_to(self, elem, xml_content):
        """
        Append XML content as a child of a DOM element.

        Args:
            elem: DOM element to append to
            xml_content: String containing XML to append

        Returns:
            list: All inserted nodes, in document order

        Example:
            new_nodes = editor.append_to(elem, "<w:r><w:t>text</w:t></w:r>")
        """
        nodes = self._parse_fragment(xml_content)
        for node in nodes:
            elem.appendChild(node)
        return nodes

    def get_next_rid(self):
        """
        Get the next available rId for relationships files.

        Scans every <Relationship> element, takes the largest numeric suffix
        among Ids of the form "rId<number>", and returns the next one. Ids
        with a non-numeric suffix are ignored.

        Returns:
            str: "rId<N+1>", or "rId1" when no numeric rId exists
        """
        max_id = 0
        for rel_elem in self.dom.getElementsByTagName("Relationship"):
            rel_id = rel_elem.getAttribute("Id")
            if rel_id.startswith("rId"):
                try:
                    max_id = max(max_id, int(rel_id[3:]))
                except ValueError:
                    # Non-numeric suffix (e.g. "rIdFoo"): not part of the sequence.
                    pass
        return f"rId{max_id + 1}"

    def save(self):
        """
        Save the edited XML back to the file.

        Serializes the DOM tree and writes it back to the original file path,
        using the encoding detected at load time (ascii or utf-8).
        """
        content = self.dom.toxml(encoding=self.encoding)
        self.xml_path.write_bytes(content)

    def _parse_fragment(self, xml_content):
        """
        Parse XML fragment and return list of imported nodes.

        The fragment is wrapped in a temporary <root> element that repeats the
        namespace declarations of this document's root element, so prefixed
        names such as "w:r" parse successfully.

        Args:
            xml_content: String containing XML fragment

        Returns:
            List of DOM nodes imported into this document (elements and any
            text/comment nodes between them)

        Raises:
            AssertionError: If fragment contains no element nodes
        """
        # Copy the xmlns declarations from the root document element. Values
        # are escaped because the DOM stores them decoded; a raw "&" or quote
        # would make the wrapper malformed.
        root_elem = self.dom.documentElement
        ns_decl = ""
        if root_elem is not None and root_elem.attributes:
            ns_decl = " ".join(
                f'{name}="{html.escape(value, quote=True)}"'
                for name, value in root_elem.attributes.items()
                if name.startswith("xmlns")
            )

        wrapper = f"<root {ns_decl}>{xml_content}</root>"
        fragment_doc = defusedxml.minidom.parseString(wrapper)
        children = fragment_doc.documentElement.childNodes  # type: ignore

        # Raised explicitly rather than via `assert`, so the check still runs
        # under `python -O`; the exception type is unchanged for callers.
        if not any(child.nodeType == child.ELEMENT_NODE for child in children):
            raise AssertionError("Fragment must contain at least one element")

        return [self.dom.importNode(child, deep=True) for child in children]
def _create_line_tracking_parser():
    """
    Build a SAX parser that records where each element starts in the source.

    The parser's ``setContentHandler`` is replaced with a wrapper. When a
    content handler is installed, the wrapper swaps that handler's
    ``startElementNS`` for a version that first calls the original callback and
    then stamps the element on top of the handler's ``elementStack`` with a
    ``parse_position`` attribute: a ``(line, column)`` tuple read from the
    parser's underlying ``_parser`` object.

    Returns:
        The configured SAX parser, as produced by defusedxml.sax.make_parser()
    """
    sax_parser = defusedxml.sax.make_parser()
    install_original = sax_parser.setContentHandler

    def install_tracking_handler(handler):
        wrapped_start = handler.startElementNS

        def tracking_start(name, tagName, attrs):
            # Let the handler create and push the element first, then
            # annotate whatever now sits on top of its stack.
            wrapped_start(name, tagName, attrs)
            handler.elementStack[-1].parse_position = (
                sax_parser._parser.CurrentLineNumber,  # type: ignore
                sax_parser._parser.CurrentColumnNumber,  # type: ignore
            )

        handler.startElementNS = tracking_start
        install_original(handler)

    sax_parser.setContentHandler = install_tracking_handler  # type: ignore
    return sax_parser
|