utilities.py 13 KB


  1. #!/usr/bin/env python3
  2. """
  3. Utilities for editing OOXML documents.
  4. This module provides XMLEditor, a tool for manipulating XML files with support for
  5. line-number-based node finding and DOM manipulation. Each element is automatically
  6. annotated with its original line and column position during parsing.
  7. Example usage:
  8. editor = XMLEditor("document.xml")
  9. # Find node by line number or range
  10. elem = editor.get_node(tag="w:r", line_number=519)
  11. elem = editor.get_node(tag="w:p", line_number=range(100, 200))
  12. # Find node by text content
  13. elem = editor.get_node(tag="w:p", contains="specific text")
  14. # Find node by attributes
  15. elem = editor.get_node(tag="w:r", attrs={"w:id": "target"})
  16. # Combine filters
  17. elem = editor.get_node(tag="w:p", line_number=range(1, 50), contains="text")
  18. # Replace, insert, or manipulate
  19. new_nodes = editor.replace_node(elem, "<w:r><w:t>new text</w:t></w:r>")
  20. editor.insert_after(new_nodes[-1], "<w:r><w:t>more</w:t></w:r>")
  21. # Save changes
  22. editor.save()
  23. """
  24. import html
  25. from pathlib import Path
  26. from typing import Optional, Union
  27. import defusedxml.minidom
  28. import defusedxml.sax
  29. class XMLEditor:
  30. """
  31. Editor for manipulating OOXML XML files with line-number-based node finding.
  32. This class parses XML files and tracks the original line and column position
  33. of each element. This enables finding nodes by their line number in the original
  34. file, which is useful when working with Read tool output.
  35. Attributes:
  36. xml_path: Path to the XML file being edited
  37. encoding: Detected encoding of the XML file ('ascii' or 'utf-8')
  38. dom: Parsed DOM tree with parse_position attributes on elements
  39. """
  40. def __init__(self, xml_path):
  41. """
  42. Initialize with path to XML file and parse with line number tracking.
  43. Args:
  44. xml_path: Path to XML file to edit (str or Path)
  45. Raises:
  46. ValueError: If the XML file does not exist
  47. """
  48. self.xml_path = Path(xml_path)
  49. if not self.xml_path.exists():
  50. raise ValueError(f"XML file not found: {xml_path}")
  51. with open(self.xml_path, "rb") as f:
  52. header = f.read(200).decode("utf-8", errors="ignore")
  53. self.encoding = "ascii" if 'encoding="ascii"' in header else "utf-8"
  54. parser = _create_line_tracking_parser()
  55. self.dom = defusedxml.minidom.parse(str(self.xml_path), parser)
  56. def get_node(
  57. self,
  58. tag: str,
  59. attrs: Optional[dict[str, str]] = None,
  60. line_number: Optional[Union[int, range]] = None,
  61. contains: Optional[str] = None,
  62. ):
  63. """
  64. Get a DOM element by tag and identifier.
  65. Finds an element by either its line number in the original file or by
  66. matching attribute values. Exactly one match must be found.
  67. Args:
  68. tag: The XML tag name (e.g., "w:del", "w:ins", "w:r")
  69. attrs: Dictionary of attribute name-value pairs to match (e.g., {"w:id": "1"})
  70. line_number: Line number (int) or line range (range) in original XML file (1-indexed)
  71. contains: Text string that must appear in any text node within the element.
  72. Supports both entity notation (&#8220;) and Unicode characters (\u201c).
  73. Returns:
  74. defusedxml.minidom.Element: The matching DOM element
  75. Raises:
  76. ValueError: If node not found or multiple matches found
  77. Example:
  78. elem = editor.get_node(tag="w:r", line_number=519)
  79. elem = editor.get_node(tag="w:r", line_number=range(100, 200))
  80. elem = editor.get_node(tag="w:del", attrs={"w:id": "1"})
  81. elem = editor.get_node(tag="w:p", attrs={"w14:paraId": "12345678"})
  82. elem = editor.get_node(tag="w:commentRangeStart", attrs={"w:id": "0"})
  83. elem = editor.get_node(tag="w:p", contains="specific text")
  84. elem = editor.get_node(tag="w:t", contains="&#8220;Agreement") # Entity notation
  85. elem = editor.get_node(tag="w:t", contains="\u201cAgreement") # Unicode character
  86. """
  87. matches = []
  88. for elem in self.dom.getElementsByTagName(tag):
  89. # Check line_number filter
  90. if line_number is not None:
  91. parse_pos = getattr(elem, "parse_position", (None,))
  92. elem_line = parse_pos[0]
  93. # Handle both single line number and range
  94. if isinstance(line_number, range):
  95. if elem_line not in line_number:
  96. continue
  97. else:
  98. if elem_line != line_number:
  99. continue
  100. # Check attrs filter
  101. if attrs is not None:
  102. if not all(
  103. elem.getAttribute(attr_name) == attr_value
  104. for attr_name, attr_value in attrs.items()
  105. ):
  106. continue
  107. # Check contains filter
  108. if contains is not None:
  109. elem_text = self._get_element_text(elem)
  110. # Normalize the search string: convert HTML entities to Unicode characters
  111. # This allows searching for both "&#8220;Rowan" and ""Rowan"
  112. normalized_contains = html.unescape(contains)
  113. if normalized_contains not in elem_text:
  114. continue
  115. # If all applicable filters passed, this is a match
  116. matches.append(elem)
  117. if not matches:
  118. # Build descriptive error message
  119. filters = []
  120. if line_number is not None:
  121. line_str = (
  122. f"lines {line_number.start}-{line_number.stop - 1}"
  123. if isinstance(line_number, range)
  124. else f"line {line_number}"
  125. )
  126. filters.append(f"at {line_str}")
  127. if attrs is not None:
  128. filters.append(f"with attributes {attrs}")
  129. if contains is not None:
  130. filters.append(f"containing '{contains}'")
  131. filter_desc = " ".join(filters) if filters else ""
  132. base_msg = f"Node not found: <{tag}> {filter_desc}".strip()
  133. # Add helpful hint based on filters used
  134. if contains:
  135. hint = "Text may be split across elements or use different wording."
  136. elif line_number:
  137. hint = "Line numbers may have changed if document was modified."
  138. elif attrs:
  139. hint = "Verify attribute values are correct."
  140. else:
  141. hint = "Try adding filters (attrs, line_number, or contains)."
  142. raise ValueError(f"{base_msg}. {hint}")
  143. if len(matches) > 1:
  144. raise ValueError(
  145. f"Multiple nodes found: <{tag}>. "
  146. f"Add more filters (attrs, line_number, or contains) to narrow the search."
  147. )
  148. return matches[0]
  149. def _get_element_text(self, elem):
  150. """
  151. Recursively extract all text content from an element.
  152. Skips text nodes that contain only whitespace (spaces, tabs, newlines),
  153. which typically represent XML formatting rather than document content.
  154. Args:
  155. elem: defusedxml.minidom.Element to extract text from
  156. Returns:
  157. str: Concatenated text from all non-whitespace text nodes within the element
  158. """
  159. text_parts = []
  160. for node in elem.childNodes:
  161. if node.nodeType == node.TEXT_NODE:
  162. # Skip whitespace-only text nodes (XML formatting)
  163. if node.data.strip():
  164. text_parts.append(node.data)
  165. elif node.nodeType == node.ELEMENT_NODE:
  166. text_parts.append(self._get_element_text(node))
  167. return "".join(text_parts)
  168. def replace_node(self, elem, new_content):
  169. """
  170. Replace a DOM element with new XML content.
  171. Args:
  172. elem: defusedxml.minidom.Element to replace
  173. new_content: String containing XML to replace the node with
  174. Returns:
  175. List[defusedxml.minidom.Node]: All inserted nodes
  176. Example:
  177. new_nodes = editor.replace_node(old_elem, "<w:r><w:t>text</w:t></w:r>")
  178. """
  179. parent = elem.parentNode
  180. nodes = self._parse_fragment(new_content)
  181. for node in nodes:
  182. parent.insertBefore(node, elem)
  183. parent.removeChild(elem)
  184. return nodes
  185. def insert_after(self, elem, xml_content):
  186. """
  187. Insert XML content after a DOM element.
  188. Args:
  189. elem: defusedxml.minidom.Element to insert after
  190. xml_content: String containing XML to insert
  191. Returns:
  192. List[defusedxml.minidom.Node]: All inserted nodes
  193. Example:
  194. new_nodes = editor.insert_after(elem, "<w:r><w:t>text</w:t></w:r>")
  195. """
  196. parent = elem.parentNode
  197. next_sibling = elem.nextSibling
  198. nodes = self._parse_fragment(xml_content)
  199. for node in nodes:
  200. if next_sibling:
  201. parent.insertBefore(node, next_sibling)
  202. else:
  203. parent.appendChild(node)
  204. return nodes
  205. def insert_before(self, elem, xml_content):
  206. """
  207. Insert XML content before a DOM element.
  208. Args:
  209. elem: defusedxml.minidom.Element to insert before
  210. xml_content: String containing XML to insert
  211. Returns:
  212. List[defusedxml.minidom.Node]: All inserted nodes
  213. Example:
  214. new_nodes = editor.insert_before(elem, "<w:r><w:t>text</w:t></w:r>")
  215. """
  216. parent = elem.parentNode
  217. nodes = self._parse_fragment(xml_content)
  218. for node in nodes:
  219. parent.insertBefore(node, elem)
  220. return nodes
  221. def append_to(self, elem, xml_content):
  222. """
  223. Append XML content as a child of a DOM element.
  224. Args:
  225. elem: defusedxml.minidom.Element to append to
  226. xml_content: String containing XML to append
  227. Returns:
  228. List[defusedxml.minidom.Node]: All inserted nodes
  229. Example:
  230. new_nodes = editor.append_to(elem, "<w:r><w:t>text</w:t></w:r>")
  231. """
  232. nodes = self._parse_fragment(xml_content)
  233. for node in nodes:
  234. elem.appendChild(node)
  235. return nodes
  236. def get_next_rid(self):
  237. """Get the next available rId for relationships files."""
  238. max_id = 0
  239. for rel_elem in self.dom.getElementsByTagName("Relationship"):
  240. rel_id = rel_elem.getAttribute("Id")
  241. if rel_id.startswith("rId"):
  242. try:
  243. max_id = max(max_id, int(rel_id[3:]))
  244. except ValueError:
  245. pass
  246. return f"rId{max_id + 1}"
  247. def save(self):
  248. """
  249. Save the edited XML back to the file.
  250. Serializes the DOM tree and writes it back to the original file path,
  251. preserving the original encoding (ascii or utf-8).
  252. """
  253. content = self.dom.toxml(encoding=self.encoding)
  254. self.xml_path.write_bytes(content)
  255. def _parse_fragment(self, xml_content):
  256. """
  257. Parse XML fragment and return list of imported nodes.
  258. Args:
  259. xml_content: String containing XML fragment
  260. Returns:
  261. List of defusedxml.minidom.Node objects imported into this document
  262. Raises:
  263. AssertionError: If fragment contains no element nodes
  264. """
  265. # Extract namespace declarations from the root document element
  266. root_elem = self.dom.documentElement
  267. namespaces = []
  268. if root_elem and root_elem.attributes:
  269. for i in range(root_elem.attributes.length):
  270. attr = root_elem.attributes.item(i)
  271. if attr.name.startswith("xmlns"): # type: ignore
  272. namespaces.append(f'{attr.name}="{attr.value}"') # type: ignore
  273. ns_decl = " ".join(namespaces)
  274. wrapper = f"<root {ns_decl}>{xml_content}</root>"
  275. fragment_doc = defusedxml.minidom.parseString(wrapper)
  276. nodes = [
  277. self.dom.importNode(child, deep=True)
  278. for child in fragment_doc.documentElement.childNodes # type: ignore
  279. ]
  280. elements = [n for n in nodes if n.nodeType == n.ELEMENT_NODE]
  281. assert elements, "Fragment must contain at least one element"
  282. return nodes
  283. def _create_line_tracking_parser():
  284. """
  285. Create a SAX parser that tracks line and column numbers for each element.
  286. Monkey patches the SAX content handler to store the current line and column
  287. position from the underlying expat parser onto each element as a parse_position
  288. attribute (line, column) tuple.
  289. Returns:
  290. defusedxml.sax.xmlreader.XMLReader: Configured SAX parser
  291. """
  292. def set_content_handler(dom_handler):
  293. def startElementNS(name, tagName, attrs):
  294. orig_start_cb(name, tagName, attrs)
  295. cur_elem = dom_handler.elementStack[-1]
  296. cur_elem.parse_position = (
  297. parser._parser.CurrentLineNumber, # type: ignore
  298. parser._parser.CurrentColumnNumber, # type: ignore
  299. )
  300. orig_start_cb = dom_handler.startElementNS
  301. dom_handler.startElementNS = startElementNS
  302. orig_set_content_handler(dom_handler)
  303. parser = defusedxml.sax.make_parser()
  304. orig_set_content_handler = parser.setContentHandler
  305. parser.setContentHandler = set_content_handler # type: ignore
  306. return parser