document.py 49 KB


  1. #!/usr/bin/env python3
  2. """
  3. Library for working with Word documents: comments, tracked changes, and editing.
  4. Usage:
  5. from skills.docx.scripts.document import Document
  6. # Initialize
  7. doc = Document('workspace/unpacked')
  8. doc = Document('workspace/unpacked', author="John Doe", initials="JD")
  9. # Find nodes
  10. node = doc["word/document.xml"].get_node(tag="w:del", attrs={"w:id": "1"})
  11. node = doc["word/document.xml"].get_node(tag="w:p", line_number=10)
  12. # Add comments
  13. doc.add_comment(start=node, end=node, text="Comment text")
  14. doc.reply_to_comment(parent_comment_id=0, text="Reply text")
  15. # Suggest tracked changes
  16. doc["word/document.xml"].suggest_deletion(node) # Delete content
  17. doc["word/document.xml"].revert_insertion(ins_node) # Reject insertion
  18. doc["word/document.xml"].revert_deletion(del_node) # Reject deletion
  19. # Save
  20. doc.save()
  21. """
  22. import html
  23. import random
  24. import shutil
  25. import tempfile
  26. from datetime import datetime, timezone
  27. from pathlib import Path
  28. from defusedxml import minidom
  29. from ooxml.scripts.pack import pack_document
  30. from ooxml.scripts.validation.docx import DOCXSchemaValidator
  31. from ooxml.scripts.validation.redlining import RedliningValidator
  32. from .utilities import XMLEditor
  33. # Path to template files
  34. TEMPLATE_DIR = Path(__file__).parent / "templates"
  35. class DocxXMLEditor(XMLEditor):
  36. """XMLEditor that automatically applies RSID, author, and date to new elements.
  37. Automatically adds attributes to elements that support them when inserting new content:
  38. - w:rsidR, w:rsidRDefault, w:rsidP (for w:p and w:r elements)
  39. - w:author and w:date (for w:ins, w:del, w:comment elements)
  40. - w:id (for w:ins and w:del elements)
  41. Attributes:
  42. dom (defusedxml.minidom.Document): The DOM document for direct manipulation
  43. """
  44. def __init__(
  45. self, xml_path, rsid: str, author: str = "Claude", initials: str = "C"
  46. ):
  47. """Initialize with required RSID and optional author.
  48. Args:
  49. xml_path: Path to XML file to edit
  50. rsid: RSID to automatically apply to new elements
  51. author: Author name for tracked changes and comments (default: "Claude")
  52. initials: Author initials (default: "C")
  53. """
  54. super().__init__(xml_path)
  55. self.rsid = rsid
  56. self.author = author
  57. self.initials = initials
  58. def _get_next_change_id(self):
  59. """Get the next available change ID by checking all tracked change elements."""
  60. max_id = -1
  61. for tag in ("w:ins", "w:del"):
  62. elements = self.dom.getElementsByTagName(tag)
  63. for elem in elements:
  64. change_id = elem.getAttribute("w:id")
  65. if change_id:
  66. try:
  67. max_id = max(max_id, int(change_id))
  68. except ValueError:
  69. pass
  70. return max_id + 1
  71. def _ensure_w16du_namespace(self):
  72. """Ensure w16du namespace is declared on the root element."""
  73. root = self.dom.documentElement
  74. if not root.hasAttribute("xmlns:w16du"): # type: ignore
  75. root.setAttribute( # type: ignore
  76. "xmlns:w16du",
  77. "http://schemas.microsoft.com/office/word/2023/wordml/word16du",
  78. )
  79. def _ensure_w16cex_namespace(self):
  80. """Ensure w16cex namespace is declared on the root element."""
  81. root = self.dom.documentElement
  82. if not root.hasAttribute("xmlns:w16cex"): # type: ignore
  83. root.setAttribute( # type: ignore
  84. "xmlns:w16cex",
  85. "http://schemas.microsoft.com/office/word/2018/wordml/cex",
  86. )
  87. def _ensure_w14_namespace(self):
  88. """Ensure w14 namespace is declared on the root element."""
  89. root = self.dom.documentElement
  90. if not root.hasAttribute("xmlns:w14"): # type: ignore
  91. root.setAttribute( # type: ignore
  92. "xmlns:w14",
  93. "http://schemas.microsoft.com/office/word/2010/wordml",
  94. )
  95. def _inject_attributes_to_nodes(self, nodes):
  96. """Inject RSID, author, and date attributes into DOM nodes where applicable.
  97. Adds attributes to elements that support them:
  98. - w:r: gets w:rsidR (or w:rsidDel if inside w:del)
  99. - w:p: gets w:rsidR, w:rsidRDefault, w:rsidP, w14:paraId, w14:textId
  100. - w:t: gets xml:space="preserve" if text has leading/trailing whitespace
  101. - w:ins, w:del: get w:id, w:author, w:date, w16du:dateUtc
  102. - w:comment: gets w:author, w:date, w:initials
  103. - w16cex:commentExtensible: gets w16cex:dateUtc
  104. Args:
  105. nodes: List of DOM nodes to process
  106. """
  107. from datetime import datetime, timezone
  108. timestamp = datetime.now(timezone.utc).strftime("%Y-%m-%dT%H:%M:%SZ")
  109. def is_inside_deletion(elem):
  110. """Check if element is inside a w:del element."""
  111. parent = elem.parentNode
  112. while parent:
  113. if parent.nodeType == parent.ELEMENT_NODE and parent.tagName == "w:del":
  114. return True
  115. parent = parent.parentNode
  116. return False
  117. def add_rsid_to_p(elem):
  118. if not elem.hasAttribute("w:rsidR"):
  119. elem.setAttribute("w:rsidR", self.rsid)
  120. if not elem.hasAttribute("w:rsidRDefault"):
  121. elem.setAttribute("w:rsidRDefault", self.rsid)
  122. if not elem.hasAttribute("w:rsidP"):
  123. elem.setAttribute("w:rsidP", self.rsid)
  124. # Add w14:paraId and w14:textId if not present
  125. if not elem.hasAttribute("w14:paraId"):
  126. self._ensure_w14_namespace()
  127. elem.setAttribute("w14:paraId", _generate_hex_id())
  128. if not elem.hasAttribute("w14:textId"):
  129. self._ensure_w14_namespace()
  130. elem.setAttribute("w14:textId", _generate_hex_id())
  131. def add_rsid_to_r(elem):
  132. # Use w:rsidDel for <w:r> inside <w:del>, otherwise w:rsidR
  133. if is_inside_deletion(elem):
  134. if not elem.hasAttribute("w:rsidDel"):
  135. elem.setAttribute("w:rsidDel", self.rsid)
  136. else:
  137. if not elem.hasAttribute("w:rsidR"):
  138. elem.setAttribute("w:rsidR", self.rsid)
  139. def add_tracked_change_attrs(elem):
  140. # Auto-assign w:id if not present
  141. if not elem.hasAttribute("w:id"):
  142. elem.setAttribute("w:id", str(self._get_next_change_id()))
  143. if not elem.hasAttribute("w:author"):
  144. elem.setAttribute("w:author", self.author)
  145. if not elem.hasAttribute("w:date"):
  146. elem.setAttribute("w:date", timestamp)
  147. # Add w16du:dateUtc for tracked changes (same as w:date since we generate UTC timestamps)
  148. if elem.tagName in ("w:ins", "w:del") and not elem.hasAttribute(
  149. "w16du:dateUtc"
  150. ):
  151. self._ensure_w16du_namespace()
  152. elem.setAttribute("w16du:dateUtc", timestamp)
  153. def add_comment_attrs(elem):
  154. if not elem.hasAttribute("w:author"):
  155. elem.setAttribute("w:author", self.author)
  156. if not elem.hasAttribute("w:date"):
  157. elem.setAttribute("w:date", timestamp)
  158. if not elem.hasAttribute("w:initials"):
  159. elem.setAttribute("w:initials", self.initials)
  160. def add_comment_extensible_date(elem):
  161. # Add w16cex:dateUtc for comment extensible elements
  162. if not elem.hasAttribute("w16cex:dateUtc"):
  163. self._ensure_w16cex_namespace()
  164. elem.setAttribute("w16cex:dateUtc", timestamp)
  165. def add_xml_space_to_t(elem):
  166. # Add xml:space="preserve" to w:t if text has leading/trailing whitespace
  167. if (
  168. elem.firstChild
  169. and elem.firstChild.nodeType == elem.firstChild.TEXT_NODE
  170. ):
  171. text = elem.firstChild.data
  172. if text and (text[0].isspace() or text[-1].isspace()):
  173. if not elem.hasAttribute("xml:space"):
  174. elem.setAttribute("xml:space", "preserve")
  175. for node in nodes:
  176. if node.nodeType != node.ELEMENT_NODE:
  177. continue
  178. # Handle the node itself
  179. if node.tagName == "w:p":
  180. add_rsid_to_p(node)
  181. elif node.tagName == "w:r":
  182. add_rsid_to_r(node)
  183. elif node.tagName == "w:t":
  184. add_xml_space_to_t(node)
  185. elif node.tagName in ("w:ins", "w:del"):
  186. add_tracked_change_attrs(node)
  187. elif node.tagName == "w:comment":
  188. add_comment_attrs(node)
  189. elif node.tagName == "w16cex:commentExtensible":
  190. add_comment_extensible_date(node)
  191. # Process descendants (getElementsByTagName doesn't return the element itself)
  192. for elem in node.getElementsByTagName("w:p"):
  193. add_rsid_to_p(elem)
  194. for elem in node.getElementsByTagName("w:r"):
  195. add_rsid_to_r(elem)
  196. for elem in node.getElementsByTagName("w:t"):
  197. add_xml_space_to_t(elem)
  198. for tag in ("w:ins", "w:del"):
  199. for elem in node.getElementsByTagName(tag):
  200. add_tracked_change_attrs(elem)
  201. for elem in node.getElementsByTagName("w:comment"):
  202. add_comment_attrs(elem)
  203. for elem in node.getElementsByTagName("w16cex:commentExtensible"):
  204. add_comment_extensible_date(elem)
  205. def replace_node(self, elem, new_content):
  206. """Replace node with automatic attribute injection."""
  207. nodes = super().replace_node(elem, new_content)
  208. self._inject_attributes_to_nodes(nodes)
  209. return nodes
  210. def insert_after(self, elem, xml_content):
  211. """Insert after with automatic attribute injection."""
  212. nodes = super().insert_after(elem, xml_content)
  213. self._inject_attributes_to_nodes(nodes)
  214. return nodes
  215. def insert_before(self, elem, xml_content):
  216. """Insert before with automatic attribute injection."""
  217. nodes = super().insert_before(elem, xml_content)
  218. self._inject_attributes_to_nodes(nodes)
  219. return nodes
  220. def append_to(self, elem, xml_content):
  221. """Append to with automatic attribute injection."""
  222. nodes = super().append_to(elem, xml_content)
  223. self._inject_attributes_to_nodes(nodes)
  224. return nodes
  225. def revert_insertion(self, elem):
  226. """Reject an insertion by wrapping its content in a deletion.
  227. Wraps all runs inside w:ins in w:del, converting w:t to w:delText.
  228. Can process a single w:ins element or a container element with multiple w:ins.
  229. Args:
  230. elem: Element to process (w:ins, w:p, w:body, etc.)
  231. Returns:
  232. list: List containing the processed element(s)
  233. Raises:
  234. ValueError: If the element contains no w:ins elements
  235. Example:
  236. # Reject a single insertion
  237. ins = doc["word/document.xml"].get_node(tag="w:ins", attrs={"w:id": "5"})
  238. doc["word/document.xml"].revert_insertion(ins)
  239. # Reject all insertions in a paragraph
  240. para = doc["word/document.xml"].get_node(tag="w:p", line_number=42)
  241. doc["word/document.xml"].revert_insertion(para)
  242. """
  243. # Collect insertions
  244. ins_elements = []
  245. if elem.tagName == "w:ins":
  246. ins_elements.append(elem)
  247. else:
  248. ins_elements.extend(elem.getElementsByTagName("w:ins"))
  249. # Validate that there are insertions to reject
  250. if not ins_elements:
  251. raise ValueError(
  252. f"revert_insertion requires w:ins elements. "
  253. f"The provided element <{elem.tagName}> contains no insertions. "
  254. )
  255. # Process all insertions - wrap all children in w:del
  256. for ins_elem in ins_elements:
  257. runs = list(ins_elem.getElementsByTagName("w:r"))
  258. if not runs:
  259. continue
  260. # Create deletion wrapper
  261. del_wrapper = self.dom.createElement("w:del")
  262. # Process each run
  263. for run in runs:
  264. # Convert w:t → w:delText and w:rsidR → w:rsidDel
  265. if run.hasAttribute("w:rsidR"):
  266. run.setAttribute("w:rsidDel", run.getAttribute("w:rsidR"))
  267. run.removeAttribute("w:rsidR")
  268. elif not run.hasAttribute("w:rsidDel"):
  269. run.setAttribute("w:rsidDel", self.rsid)
  270. for t_elem in list(run.getElementsByTagName("w:t")):
  271. del_text = self.dom.createElement("w:delText")
  272. # Copy ALL child nodes (not just firstChild) to handle entities
  273. while t_elem.firstChild:
  274. del_text.appendChild(t_elem.firstChild)
  275. for i in range(t_elem.attributes.length):
  276. attr = t_elem.attributes.item(i)
  277. del_text.setAttribute(attr.name, attr.value)
  278. t_elem.parentNode.replaceChild(del_text, t_elem)
  279. # Move all children from ins to del wrapper
  280. while ins_elem.firstChild:
  281. del_wrapper.appendChild(ins_elem.firstChild)
  282. # Add del wrapper back to ins
  283. ins_elem.appendChild(del_wrapper)
  284. # Inject attributes to the deletion wrapper
  285. self._inject_attributes_to_nodes([del_wrapper])
  286. return [elem]
  287. def revert_deletion(self, elem):
  288. """Reject a deletion by re-inserting the deleted content.
  289. Creates w:ins elements after each w:del, copying deleted content and
  290. converting w:delText back to w:t.
  291. Can process a single w:del element or a container element with multiple w:del.
  292. Args:
  293. elem: Element to process (w:del, w:p, w:body, etc.)
  294. Returns:
  295. list: If elem is w:del, returns [elem, new_ins]. Otherwise returns [elem].
  296. Raises:
  297. ValueError: If the element contains no w:del elements
  298. Example:
  299. # Reject a single deletion - returns [w:del, w:ins]
  300. del_elem = doc["word/document.xml"].get_node(tag="w:del", attrs={"w:id": "3"})
  301. nodes = doc["word/document.xml"].revert_deletion(del_elem)
  302. # Reject all deletions in a paragraph - returns [para]
  303. para = doc["word/document.xml"].get_node(tag="w:p", line_number=42)
  304. nodes = doc["word/document.xml"].revert_deletion(para)
  305. """
  306. # Collect deletions FIRST - before we modify the DOM
  307. del_elements = []
  308. is_single_del = elem.tagName == "w:del"
  309. if is_single_del:
  310. del_elements.append(elem)
  311. else:
  312. del_elements.extend(elem.getElementsByTagName("w:del"))
  313. # Validate that there are deletions to reject
  314. if not del_elements:
  315. raise ValueError(
  316. f"revert_deletion requires w:del elements. "
  317. f"The provided element <{elem.tagName}> contains no deletions. "
  318. )
  319. # Track created insertion (only relevant if elem is a single w:del)
  320. created_insertion = None
  321. # Process all deletions - create insertions that copy the deleted content
  322. for del_elem in del_elements:
  323. # Clone the deleted runs and convert them to insertions
  324. runs = list(del_elem.getElementsByTagName("w:r"))
  325. if not runs:
  326. continue
  327. # Create insertion wrapper
  328. ins_elem = self.dom.createElement("w:ins")
  329. for run in runs:
  330. # Clone the run
  331. new_run = run.cloneNode(True)
  332. # Convert w:delText → w:t
  333. for del_text in list(new_run.getElementsByTagName("w:delText")):
  334. t_elem = self.dom.createElement("w:t")
  335. # Copy ALL child nodes (not just firstChild) to handle entities
  336. while del_text.firstChild:
  337. t_elem.appendChild(del_text.firstChild)
  338. for i in range(del_text.attributes.length):
  339. attr = del_text.attributes.item(i)
  340. t_elem.setAttribute(attr.name, attr.value)
  341. del_text.parentNode.replaceChild(t_elem, del_text)
  342. # Update run attributes: w:rsidDel → w:rsidR
  343. if new_run.hasAttribute("w:rsidDel"):
  344. new_run.setAttribute("w:rsidR", new_run.getAttribute("w:rsidDel"))
  345. new_run.removeAttribute("w:rsidDel")
  346. elif not new_run.hasAttribute("w:rsidR"):
  347. new_run.setAttribute("w:rsidR", self.rsid)
  348. ins_elem.appendChild(new_run)
  349. # Insert the new insertion after the deletion
  350. nodes = self.insert_after(del_elem, ins_elem.toxml())
  351. # If processing a single w:del, track the created insertion
  352. if is_single_del and nodes:
  353. created_insertion = nodes[0]
  354. # Return based on input type
  355. if is_single_del and created_insertion:
  356. return [elem, created_insertion]
  357. else:
  358. return [elem]
  359. @staticmethod
  360. def suggest_paragraph(xml_content: str) -> str:
  361. """Transform paragraph XML to add tracked change wrapping for insertion.
  362. Wraps runs in <w:ins> and adds <w:ins/> to w:rPr in w:pPr for numbered lists.
  363. Args:
  364. xml_content: XML string containing a <w:p> element
  365. Returns:
  366. str: Transformed XML with tracked change wrapping
  367. """
  368. wrapper = f'<root xmlns:w="http://schemas.openxmlformats.org/wordprocessingml/2006/main">{xml_content}</root>'
  369. doc = minidom.parseString(wrapper)
  370. para = doc.getElementsByTagName("w:p")[0]
  371. # Ensure w:pPr exists
  372. pPr_list = para.getElementsByTagName("w:pPr")
  373. if not pPr_list:
  374. pPr = doc.createElement("w:pPr")
  375. para.insertBefore(
  376. pPr, para.firstChild
  377. ) if para.firstChild else para.appendChild(pPr)
  378. else:
  379. pPr = pPr_list[0]
  380. # Ensure w:rPr exists in w:pPr
  381. rPr_list = pPr.getElementsByTagName("w:rPr")
  382. if not rPr_list:
  383. rPr = doc.createElement("w:rPr")
  384. pPr.appendChild(rPr)
  385. else:
  386. rPr = rPr_list[0]
  387. # Add <w:ins/> to w:rPr
  388. ins_marker = doc.createElement("w:ins")
  389. rPr.insertBefore(
  390. ins_marker, rPr.firstChild
  391. ) if rPr.firstChild else rPr.appendChild(ins_marker)
  392. # Wrap all non-pPr children in <w:ins>
  393. ins_wrapper = doc.createElement("w:ins")
  394. for child in [c for c in para.childNodes if c.nodeName != "w:pPr"]:
  395. para.removeChild(child)
  396. ins_wrapper.appendChild(child)
  397. para.appendChild(ins_wrapper)
  398. return para.toxml()
  399. def suggest_deletion(self, elem):
  400. """Mark a w:r or w:p element as deleted with tracked changes (in-place DOM manipulation).
  401. For w:r: wraps in <w:del>, converts <w:t> to <w:delText>, preserves w:rPr
  402. For w:p (regular): wraps content in <w:del>, converts <w:t> to <w:delText>
  403. For w:p (numbered list): adds <w:del/> to w:rPr in w:pPr, wraps content in <w:del>
  404. Args:
  405. elem: A w:r or w:p DOM element without existing tracked changes
  406. Returns:
  407. Element: The modified element
  408. Raises:
  409. ValueError: If element has existing tracked changes or invalid structure
  410. """
  411. if elem.nodeName == "w:r":
  412. # Check for existing w:delText
  413. if elem.getElementsByTagName("w:delText"):
  414. raise ValueError("w:r element already contains w:delText")
  415. # Convert w:t → w:delText
  416. for t_elem in list(elem.getElementsByTagName("w:t")):
  417. del_text = self.dom.createElement("w:delText")
  418. # Copy ALL child nodes (not just firstChild) to handle entities
  419. while t_elem.firstChild:
  420. del_text.appendChild(t_elem.firstChild)
  421. # Preserve attributes like xml:space
  422. for i in range(t_elem.attributes.length):
  423. attr = t_elem.attributes.item(i)
  424. del_text.setAttribute(attr.name, attr.value)
  425. t_elem.parentNode.replaceChild(del_text, t_elem)
  426. # Update run attributes: w:rsidR → w:rsidDel
  427. if elem.hasAttribute("w:rsidR"):
  428. elem.setAttribute("w:rsidDel", elem.getAttribute("w:rsidR"))
  429. elem.removeAttribute("w:rsidR")
  430. elif not elem.hasAttribute("w:rsidDel"):
  431. elem.setAttribute("w:rsidDel", self.rsid)
  432. # Wrap in w:del
  433. del_wrapper = self.dom.createElement("w:del")
  434. parent = elem.parentNode
  435. parent.insertBefore(del_wrapper, elem)
  436. parent.removeChild(elem)
  437. del_wrapper.appendChild(elem)
  438. # Inject attributes to the deletion wrapper
  439. self._inject_attributes_to_nodes([del_wrapper])
  440. return del_wrapper
  441. elif elem.nodeName == "w:p":
  442. # Check for existing tracked changes
  443. if elem.getElementsByTagName("w:ins") or elem.getElementsByTagName("w:del"):
  444. raise ValueError("w:p element already contains tracked changes")
  445. # Check if it's a numbered list item
  446. pPr_list = elem.getElementsByTagName("w:pPr")
  447. is_numbered = pPr_list and pPr_list[0].getElementsByTagName("w:numPr")
  448. if is_numbered:
  449. # Add <w:del/> to w:rPr in w:pPr
  450. pPr = pPr_list[0]
  451. rPr_list = pPr.getElementsByTagName("w:rPr")
  452. if not rPr_list:
  453. rPr = self.dom.createElement("w:rPr")
  454. pPr.appendChild(rPr)
  455. else:
  456. rPr = rPr_list[0]
  457. # Add <w:del/> marker
  458. del_marker = self.dom.createElement("w:del")
  459. rPr.insertBefore(
  460. del_marker, rPr.firstChild
  461. ) if rPr.firstChild else rPr.appendChild(del_marker)
  462. # Convert w:t → w:delText in all runs
  463. for t_elem in list(elem.getElementsByTagName("w:t")):
  464. del_text = self.dom.createElement("w:delText")
  465. # Copy ALL child nodes (not just firstChild) to handle entities
  466. while t_elem.firstChild:
  467. del_text.appendChild(t_elem.firstChild)
  468. # Preserve attributes like xml:space
  469. for i in range(t_elem.attributes.length):
  470. attr = t_elem.attributes.item(i)
  471. del_text.setAttribute(attr.name, attr.value)
  472. t_elem.parentNode.replaceChild(del_text, t_elem)
  473. # Update run attributes: w:rsidR → w:rsidDel
  474. for run in elem.getElementsByTagName("w:r"):
  475. if run.hasAttribute("w:rsidR"):
  476. run.setAttribute("w:rsidDel", run.getAttribute("w:rsidR"))
  477. run.removeAttribute("w:rsidR")
  478. elif not run.hasAttribute("w:rsidDel"):
  479. run.setAttribute("w:rsidDel", self.rsid)
  480. # Wrap all non-pPr children in <w:del>
  481. del_wrapper = self.dom.createElement("w:del")
  482. for child in [c for c in elem.childNodes if c.nodeName != "w:pPr"]:
  483. elem.removeChild(child)
  484. del_wrapper.appendChild(child)
  485. elem.appendChild(del_wrapper)
  486. # Inject attributes to the deletion wrapper
  487. self._inject_attributes_to_nodes([del_wrapper])
  488. return elem
  489. else:
  490. raise ValueError(f"Element must be w:r or w:p, got {elem.nodeName}")
  491. def _generate_hex_id() -> str:
  492. """Generate random 8-character hex ID for para/durable IDs.
  493. Values are constrained to be less than 0x7FFFFFFF per OOXML spec:
  494. - paraId must be < 0x80000000
  495. - durableId must be < 0x7FFFFFFF
  496. We use the stricter constraint (0x7FFFFFFF) for both.
  497. """
  498. return f"{random.randint(1, 0x7FFFFFFE):08X}"
  499. def _generate_rsid() -> str:
  500. """Generate random 8-character hex RSID."""
  501. return "".join(random.choices("0123456789ABCDEF", k=8))
  502. class Document:
  503. """Manages comments in unpacked Word documents."""
  504. def __init__(
  505. self,
  506. unpacked_dir,
  507. rsid=None,
  508. track_revisions=False,
  509. author="Claude",
  510. initials="C",
  511. ):
  512. """
  513. Initialize with path to unpacked Word document directory.
  514. Automatically sets up comment infrastructure (people.xml, RSIDs).
  515. Args:
  516. unpacked_dir: Path to unpacked DOCX directory (must contain word/ subdirectory)
  517. rsid: Optional RSID to use for all comment elements. If not provided, one will be generated.
  518. track_revisions: If True, enables track revisions in settings.xml (default: False)
  519. author: Default author name for comments (default: "Claude")
  520. initials: Default author initials for comments (default: "C")
  521. """
  522. self.original_path = Path(unpacked_dir)
  523. if not self.original_path.exists() or not self.original_path.is_dir():
  524. raise ValueError(f"Directory not found: {unpacked_dir}")
  525. # Create temporary directory with subdirectories for unpacked content and baseline
  526. self.temp_dir = tempfile.mkdtemp(prefix="docx_")
  527. self.unpacked_path = Path(self.temp_dir) / "unpacked"
  528. shutil.copytree(self.original_path, self.unpacked_path)
  529. # Pack original directory into temporary .docx for validation baseline (outside unpacked dir)
  530. self.original_docx = Path(self.temp_dir) / "original.docx"
  531. pack_document(self.original_path, self.original_docx, validate=False)
  532. self.word_path = self.unpacked_path / "word"
  533. # Generate RSID if not provided
  534. self.rsid = rsid if rsid else _generate_rsid()
  535. print(f"Using RSID: {self.rsid}")
  536. # Set default author and initials
  537. self.author = author
  538. self.initials = initials
  539. # Cache for lazy-loaded editors
  540. self._editors = {}
  541. # Comment file paths
  542. self.comments_path = self.word_path / "comments.xml"
  543. self.comments_extended_path = self.word_path / "commentsExtended.xml"
  544. self.comments_ids_path = self.word_path / "commentsIds.xml"
  545. self.comments_extensible_path = self.word_path / "commentsExtensible.xml"
  546. # Load existing comments and determine next ID (before setup modifies files)
  547. self.existing_comments = self._load_existing_comments()
  548. self.next_comment_id = self._get_next_comment_id()
  549. # Convenient access to document.xml editor (semi-private)
  550. self._document = self["word/document.xml"]
  551. # Setup tracked changes infrastructure
  552. self._setup_tracking(track_revisions=track_revisions)
  553. # Add author to people.xml
  554. self._add_author_to_people(author)
  555. def __getitem__(self, xml_path: str) -> DocxXMLEditor:
  556. """
  557. Get or create a DocxXMLEditor for the specified XML file.
  558. Enables lazy-loaded editors with bracket notation:
  559. node = doc["word/document.xml"].get_node(tag="w:p", line_number=42)
  560. Args:
  561. xml_path: Relative path to XML file (e.g., "word/document.xml", "word/comments.xml")
  562. Returns:
  563. DocxXMLEditor instance for the specified file
  564. Raises:
  565. ValueError: If the file does not exist
  566. Example:
  567. # Get node from document.xml
  568. node = doc["word/document.xml"].get_node(tag="w:del", attrs={"w:id": "1"})
  569. # Get node from comments.xml
  570. comment = doc["word/comments.xml"].get_node(tag="w:comment", attrs={"w:id": "0"})
  571. """
  572. if xml_path not in self._editors:
  573. file_path = self.unpacked_path / xml_path
  574. if not file_path.exists():
  575. raise ValueError(f"XML file not found: {xml_path}")
  576. # Use DocxXMLEditor with RSID, author, and initials for all editors
  577. self._editors[xml_path] = DocxXMLEditor(
  578. file_path, rsid=self.rsid, author=self.author, initials=self.initials
  579. )
  580. return self._editors[xml_path]
  581. def add_comment(self, start, end, text: str) -> int:
  582. """
  583. Add a comment spanning from one element to another.
  584. Args:
  585. start: DOM element for the starting point
  586. end: DOM element for the ending point
  587. text: Comment content
  588. Returns:
  589. The comment ID that was created
  590. Example:
  591. start_node = cm.get_document_node(tag="w:del", id="1")
  592. end_node = cm.get_document_node(tag="w:ins", id="2")
  593. cm.add_comment(start=start_node, end=end_node, text="Explanation")
  594. """
  595. comment_id = self.next_comment_id
  596. para_id = _generate_hex_id()
  597. durable_id = _generate_hex_id()
  598. timestamp = datetime.now(timezone.utc).strftime("%Y-%m-%dT%H:%M:%SZ")
  599. # Add comment ranges to document.xml immediately
  600. self._document.insert_before(start, self._comment_range_start_xml(comment_id))
  601. # If end node is a paragraph, append comment markup inside it
  602. # Otherwise insert after it (for run-level anchors)
  603. if end.tagName == "w:p":
  604. self._document.append_to(end, self._comment_range_end_xml(comment_id))
  605. else:
  606. self._document.insert_after(end, self._comment_range_end_xml(comment_id))
  607. # Add to comments.xml immediately
  608. self._add_to_comments_xml(
  609. comment_id, para_id, text, self.author, self.initials, timestamp
  610. )
  611. # Add to commentsExtended.xml immediately
  612. self._add_to_comments_extended_xml(para_id, parent_para_id=None)
  613. # Add to commentsIds.xml immediately
  614. self._add_to_comments_ids_xml(para_id, durable_id)
  615. # Add to commentsExtensible.xml immediately
  616. self._add_to_comments_extensible_xml(durable_id)
  617. # Update existing_comments so replies work
  618. self.existing_comments[comment_id] = {"para_id": para_id}
  619. self.next_comment_id += 1
  620. return comment_id
  621. def reply_to_comment(
  622. self,
  623. parent_comment_id: int,
  624. text: str,
  625. ) -> int:
  626. """
  627. Add a reply to an existing comment.
  628. Args:
  629. parent_comment_id: The w:id of the parent comment to reply to
  630. text: Reply text
  631. Returns:
  632. The comment ID that was created for the reply
  633. Example:
  634. cm.reply_to_comment(parent_comment_id=0, text="I agree with this change")
  635. """
  636. if parent_comment_id not in self.existing_comments:
  637. raise ValueError(f"Parent comment with id={parent_comment_id} not found")
  638. parent_info = self.existing_comments[parent_comment_id]
  639. comment_id = self.next_comment_id
  640. para_id = _generate_hex_id()
  641. durable_id = _generate_hex_id()
  642. timestamp = datetime.now(timezone.utc).strftime("%Y-%m-%dT%H:%M:%SZ")
  643. # Add comment ranges to document.xml immediately
  644. parent_start_elem = self._document.get_node(
  645. tag="w:commentRangeStart", attrs={"w:id": str(parent_comment_id)}
  646. )
  647. parent_ref_elem = self._document.get_node(
  648. tag="w:commentReference", attrs={"w:id": str(parent_comment_id)}
  649. )
  650. self._document.insert_after(
  651. parent_start_elem, self._comment_range_start_xml(comment_id)
  652. )
  653. parent_ref_run = parent_ref_elem.parentNode
  654. self._document.insert_after(
  655. parent_ref_run, f'<w:commentRangeEnd w:id="{comment_id}"/>'
  656. )
  657. self._document.insert_after(
  658. parent_ref_run, self._comment_ref_run_xml(comment_id)
  659. )
  660. # Add to comments.xml immediately
  661. self._add_to_comments_xml(
  662. comment_id, para_id, text, self.author, self.initials, timestamp
  663. )
  664. # Add to commentsExtended.xml immediately (with parent)
  665. self._add_to_comments_extended_xml(
  666. para_id, parent_para_id=parent_info["para_id"]
  667. )
  668. # Add to commentsIds.xml immediately
  669. self._add_to_comments_ids_xml(para_id, durable_id)
  670. # Add to commentsExtensible.xml immediately
  671. self._add_to_comments_extensible_xml(durable_id)
  672. # Update existing_comments so replies work
  673. self.existing_comments[comment_id] = {"para_id": para_id}
  674. self.next_comment_id += 1
  675. return comment_id
  676. def __del__(self):
  677. """Clean up temporary directory on deletion."""
  678. if hasattr(self, "temp_dir") and Path(self.temp_dir).exists():
  679. shutil.rmtree(self.temp_dir)
  680. def validate(self) -> None:
  681. """
  682. Validate the document against XSD schema and redlining rules.
  683. Raises:
  684. ValueError: If validation fails.
  685. """
  686. # Create validators with current state
  687. schema_validator = DOCXSchemaValidator(
  688. self.unpacked_path, self.original_docx, verbose=False
  689. )
  690. redlining_validator = RedliningValidator(
  691. self.unpacked_path, self.original_docx, verbose=False
  692. )
  693. # Run validations
  694. if not schema_validator.validate():
  695. raise ValueError("Schema validation failed")
  696. if not redlining_validator.validate():
  697. raise ValueError("Redlining validation failed")
  def save(self, destination=None, validate=True) -> None:
      """
      Write every open XML editor to disk, then copy the result to its target.

      When word/comments.xml exists, the comment relationships and content
      types are ensured first. All editors are then saved into the working
      directory, the document is optionally validated, and the working
      directory is copied over the target directory.

      Args:
          destination: Optional path to save to. If None (or otherwise falsy),
              saves back to the original directory.
          validate: If True, validates document before copying (default: True).
      """
      # Comment bookkeeping is only relevant once a comments part exists.
      if self.comments_path.exists():
          self._ensure_comment_relationships()
          self._ensure_comment_content_types()

      # Flush all pending edits into the working directory.
      for xml_editor in self._editors.values():
          xml_editor.save()

      if validate:
          self.validate()

      if destination:
          output_dir = Path(destination)
      else:
          output_dir = self.original_path
      shutil.copytree(self.unpacked_path, output_dir, dirs_exist_ok=True)
  719. # ==================== Private: Initialization ====================
  720. def _get_next_comment_id(self):
  721. """Get the next available comment ID."""
  722. if not self.comments_path.exists():
  723. return 0
  724. editor = self["word/comments.xml"]
  725. max_id = -1
  726. for comment_elem in editor.dom.getElementsByTagName("w:comment"):
  727. comment_id = comment_elem.getAttribute("w:id")
  728. if comment_id:
  729. try:
  730. max_id = max(max_id, int(comment_id))
  731. except ValueError:
  732. pass
  733. return max_id + 1
  734. def _load_existing_comments(self):
  735. """Load existing comments from files to enable replies."""
  736. if not self.comments_path.exists():
  737. return {}
  738. editor = self["word/comments.xml"]
  739. existing = {}
  740. for comment_elem in editor.dom.getElementsByTagName("w:comment"):
  741. comment_id = comment_elem.getAttribute("w:id")
  742. if not comment_id:
  743. continue
  744. # Find para_id from the w:p element within the comment
  745. para_id = None
  746. for p_elem in comment_elem.getElementsByTagName("w:p"):
  747. para_id = p_elem.getAttribute("w14:paraId")
  748. if para_id:
  749. break
  750. if not para_id:
  751. continue
  752. existing[int(comment_id)] = {"para_id": para_id}
  753. return existing
  754. # ==================== Private: Setup Methods ====================
  def _setup_tracking(self, track_revisions=False):
      """Prepare people.xml, its package metadata, and settings.xml.

      Runs, in order: people.xml creation, the people content-type entry, the
      people relationship entry, and the settings.xml update.

      Args:
          track_revisions: If True, enables track revisions in settings.xml
      """
      word_dir = self.word_path

      # people.xml itself
      self._update_people_xml(word_dir / "people.xml")

      # Package metadata that points at people.xml
      content_types_file = self.unpacked_path / "[Content_Types].xml"
      self._add_content_type_for_people(content_types_file)
      document_rels_file = self.word_path / "_rels" / "document.xml.rels"
      self._add_relationship_for_people(document_rels_file)

      # settings.xml always gets the RSID; trackRevisions only on request
      settings_file = self.word_path / "settings.xml"
      self._update_settings(settings_file, track_revisions=track_revisions)
  772. def _update_people_xml(self, path):
  773. """Create people.xml if it doesn't exist."""
  774. if not path.exists():
  775. # Copy from template
  776. shutil.copy(TEMPLATE_DIR / "people.xml", path)
  777. def _add_content_type_for_people(self, path):
  778. """Add people.xml content type to [Content_Types].xml if not already present."""
  779. editor = self["[Content_Types].xml"]
  780. if self._has_override(editor, "/word/people.xml"):
  781. return
  782. # Add Override element
  783. root = editor.dom.documentElement
  784. override_xml = '<Override PartName="/word/people.xml" ContentType="application/vnd.openxmlformats-officedocument.wordprocessingml.people+xml"/>'
  785. editor.append_to(root, override_xml)
  786. def _add_relationship_for_people(self, path):
  787. """Add people.xml relationship to document.xml.rels if not already present."""
  788. editor = self["word/_rels/document.xml.rels"]
  789. if self._has_relationship(editor, "people.xml"):
  790. return
  791. root = editor.dom.documentElement
  792. root_tag = root.tagName # type: ignore
  793. prefix = root_tag.split(":")[0] + ":" if ":" in root_tag else ""
  794. next_rid = editor.get_next_rid()
  795. # Create the relationship entry
  796. rel_xml = f'<{prefix}Relationship Id="{next_rid}" Type="http://schemas.microsoft.com/office/2011/relationships/people" Target="people.xml"/>'
  797. editor.append_to(root, rel_xml)
  798. def _update_settings(self, path, track_revisions=False):
  799. """Add RSID and optionally enable track revisions in settings.xml.
  800. Args:
  801. path: Path to settings.xml
  802. track_revisions: If True, adds trackRevisions element
  803. Places elements per OOXML schema order:
  804. - trackRevisions: early (before defaultTabStop)
  805. - rsids: late (after compat)
  806. """
  807. editor = self["word/settings.xml"]
  808. root = editor.get_node(tag="w:settings")
  809. prefix = root.tagName.split(":")[0] if ":" in root.tagName else "w"
  810. # Conditionally add trackRevisions if requested
  811. if track_revisions:
  812. track_revisions_exists = any(
  813. elem.tagName == f"{prefix}:trackRevisions"
  814. for elem in editor.dom.getElementsByTagName(f"{prefix}:trackRevisions")
  815. )
  816. if not track_revisions_exists:
  817. track_rev_xml = f"<{prefix}:trackRevisions/>"
  818. # Try to insert before documentProtection, defaultTabStop, or at start
  819. inserted = False
  820. for tag in [f"{prefix}:documentProtection", f"{prefix}:defaultTabStop"]:
  821. elements = editor.dom.getElementsByTagName(tag)
  822. if elements:
  823. editor.insert_before(elements[0], track_rev_xml)
  824. inserted = True
  825. break
  826. if not inserted:
  827. # Insert as first child of settings
  828. if root.firstChild:
  829. editor.insert_before(root.firstChild, track_rev_xml)
  830. else:
  831. editor.append_to(root, track_rev_xml)
  832. # Always check if rsids section exists
  833. rsids_elements = editor.dom.getElementsByTagName(f"{prefix}:rsids")
  834. if not rsids_elements:
  835. # Add new rsids section
  836. rsids_xml = f'''<{prefix}:rsids>
  837. <{prefix}:rsidRoot {prefix}:val="{self.rsid}"/>
  838. <{prefix}:rsid {prefix}:val="{self.rsid}"/>
  839. </{prefix}:rsids>'''
  840. # Try to insert after compat, before clrSchemeMapping, or before closing tag
  841. inserted = False
  842. compat_elements = editor.dom.getElementsByTagName(f"{prefix}:compat")
  843. if compat_elements:
  844. editor.insert_after(compat_elements[0], rsids_xml)
  845. inserted = True
  846. if not inserted:
  847. clr_elements = editor.dom.getElementsByTagName(
  848. f"{prefix}:clrSchemeMapping"
  849. )
  850. if clr_elements:
  851. editor.insert_before(clr_elements[0], rsids_xml)
  852. inserted = True
  853. if not inserted:
  854. editor.append_to(root, rsids_xml)
  855. else:
  856. # Check if this rsid already exists
  857. rsids_elem = rsids_elements[0]
  858. rsid_exists = any(
  859. elem.getAttribute(f"{prefix}:val") == self.rsid
  860. for elem in rsids_elem.getElementsByTagName(f"{prefix}:rsid")
  861. )
  862. if not rsid_exists:
  863. rsid_xml = f'<{prefix}:rsid {prefix}:val="{self.rsid}"/>'
  864. editor.append_to(rsids_elem, rsid_xml)
  865. # ==================== Private: XML File Creation ====================
  866. def _add_to_comments_xml(
  867. self, comment_id, para_id, text, author, initials, timestamp
  868. ):
  869. """Add a single comment to comments.xml."""
  870. if not self.comments_path.exists():
  871. shutil.copy(TEMPLATE_DIR / "comments.xml", self.comments_path)
  872. editor = self["word/comments.xml"]
  873. root = editor.get_node(tag="w:comments")
  874. escaped_text = (
  875. text.replace("&", "&amp;").replace("<", "&lt;").replace(">", "&gt;")
  876. )
  877. # Note: w:rsidR, w:rsidRDefault, w:rsidP on w:p, w:rsidR on w:r,
  878. # and w:author, w:date, w:initials on w:comment are automatically added by DocxXMLEditor
  879. comment_xml = f'''<w:comment w:id="{comment_id}">
  880. <w:p w14:paraId="{para_id}" w14:textId="77777777">
  881. <w:r><w:rPr><w:rStyle w:val="CommentReference"/></w:rPr><w:annotationRef/></w:r>
  882. <w:r><w:rPr><w:color w:val="000000"/><w:sz w:val="20"/><w:szCs w:val="20"/></w:rPr><w:t>{escaped_text}</w:t></w:r>
  883. </w:p>
  884. </w:comment>'''
  885. editor.append_to(root, comment_xml)
  886. def _add_to_comments_extended_xml(self, para_id, parent_para_id):
  887. """Add a single comment to commentsExtended.xml."""
  888. if not self.comments_extended_path.exists():
  889. shutil.copy(
  890. TEMPLATE_DIR / "commentsExtended.xml", self.comments_extended_path
  891. )
  892. editor = self["word/commentsExtended.xml"]
  893. root = editor.get_node(tag="w15:commentsEx")
  894. if parent_para_id:
  895. xml = f'<w15:commentEx w15:paraId="{para_id}" w15:paraIdParent="{parent_para_id}" w15:done="0"/>'
  896. else:
  897. xml = f'<w15:commentEx w15:paraId="{para_id}" w15:done="0"/>'
  898. editor.append_to(root, xml)
  899. def _add_to_comments_ids_xml(self, para_id, durable_id):
  900. """Add a single comment to commentsIds.xml."""
  901. if not self.comments_ids_path.exists():
  902. shutil.copy(TEMPLATE_DIR / "commentsIds.xml", self.comments_ids_path)
  903. editor = self["word/commentsIds.xml"]
  904. root = editor.get_node(tag="w16cid:commentsIds")
  905. xml = f'<w16cid:commentId w16cid:paraId="{para_id}" w16cid:durableId="{durable_id}"/>'
  906. editor.append_to(root, xml)
  907. def _add_to_comments_extensible_xml(self, durable_id):
  908. """Add a single comment to commentsExtensible.xml."""
  909. if not self.comments_extensible_path.exists():
  910. shutil.copy(
  911. TEMPLATE_DIR / "commentsExtensible.xml", self.comments_extensible_path
  912. )
  913. editor = self["word/commentsExtensible.xml"]
  914. root = editor.get_node(tag="w16cex:commentsExtensible")
  915. xml = f'<w16cex:commentExtensible w16cex:durableId="{durable_id}"/>'
  916. editor.append_to(root, xml)
  917. # ==================== Private: XML Fragments ====================
  918. def _comment_range_start_xml(self, comment_id):
  919. """Generate XML for comment range start."""
  920. return f'<w:commentRangeStart w:id="{comment_id}"/>'
  921. def _comment_range_end_xml(self, comment_id):
  922. """Generate XML for comment range end with reference run.
  923. Note: w:rsidR is automatically added by DocxXMLEditor.
  924. """
  925. return f'''<w:commentRangeEnd w:id="{comment_id}"/>
  926. <w:r>
  927. <w:rPr><w:rStyle w:val="CommentReference"/></w:rPr>
  928. <w:commentReference w:id="{comment_id}"/>
  929. </w:r>'''
  930. def _comment_ref_run_xml(self, comment_id):
  931. """Generate XML for comment reference run.
  932. Note: w:rsidR is automatically added by DocxXMLEditor.
  933. """
  934. return f'''<w:r>
  935. <w:rPr><w:rStyle w:val="CommentReference"/></w:rPr>
  936. <w:commentReference w:id="{comment_id}"/>
  937. </w:r>'''
  938. # ==================== Private: Metadata Updates ====================
  939. def _has_relationship(self, editor, target):
  940. """Check if a relationship with given target exists."""
  941. for rel_elem in editor.dom.getElementsByTagName("Relationship"):
  942. if rel_elem.getAttribute("Target") == target:
  943. return True
  944. return False
  945. def _has_override(self, editor, part_name):
  946. """Check if an override with given part name exists."""
  947. for override_elem in editor.dom.getElementsByTagName("Override"):
  948. if override_elem.getAttribute("PartName") == part_name:
  949. return True
  950. return False
  951. def _has_author(self, editor, author):
  952. """Check if an author already exists in people.xml."""
  953. for person_elem in editor.dom.getElementsByTagName("w15:person"):
  954. if person_elem.getAttribute("w15:author") == author:
  955. return True
  956. return False
  957. def _add_author_to_people(self, author):
  958. """Add author to people.xml (called during initialization)."""
  959. people_path = self.word_path / "people.xml"
  960. # people.xml should already exist from _setup_tracking
  961. if not people_path.exists():
  962. raise ValueError("people.xml should exist after _setup_tracking")
  963. editor = self["word/people.xml"]
  964. root = editor.get_node(tag="w15:people")
  965. # Check if author already exists
  966. if self._has_author(editor, author):
  967. return
  968. # Add author with proper XML escaping to prevent injection
  969. escaped_author = html.escape(author, quote=True)
  970. person_xml = f'''<w15:person w15:author="{escaped_author}">
  971. <w15:presenceInfo w15:providerId="None" w15:userId="{escaped_author}"/>
  972. </w15:person>'''
  973. editor.append_to(root, person_xml)
  974. def _ensure_comment_relationships(self):
  975. """Ensure word/_rels/document.xml.rels has comment relationships."""
  976. editor = self["word/_rels/document.xml.rels"]
  977. if self._has_relationship(editor, "comments.xml"):
  978. return
  979. root = editor.dom.documentElement
  980. root_tag = root.tagName # type: ignore
  981. prefix = root_tag.split(":")[0] + ":" if ":" in root_tag else ""
  982. next_rid_num = int(editor.get_next_rid()[3:])
  983. # Add relationship elements
  984. rels = [
  985. (
  986. next_rid_num,
  987. "http://schemas.openxmlformats.org/officeDocument/2006/relationships/comments",
  988. "comments.xml",
  989. ),
  990. (
  991. next_rid_num + 1,
  992. "http://schemas.microsoft.com/office/2011/relationships/commentsExtended",
  993. "commentsExtended.xml",
  994. ),
  995. (
  996. next_rid_num + 2,
  997. "http://schemas.microsoft.com/office/2016/09/relationships/commentsIds",
  998. "commentsIds.xml",
  999. ),
  1000. (
  1001. next_rid_num + 3,
  1002. "http://schemas.microsoft.com/office/2018/08/relationships/commentsExtensible",
  1003. "commentsExtensible.xml",
  1004. ),
  1005. ]
  1006. for rel_id, rel_type, target in rels:
  1007. rel_xml = f'<{prefix}Relationship Id="rId{rel_id}" Type="{rel_type}" Target="{target}"/>'
  1008. editor.append_to(root, rel_xml)
  1009. def _ensure_comment_content_types(self):
  1010. """Ensure [Content_Types].xml has comment content types."""
  1011. editor = self["[Content_Types].xml"]
  1012. if self._has_override(editor, "/word/comments.xml"):
  1013. return
  1014. root = editor.dom.documentElement
  1015. # Add Override elements
  1016. overrides = [
  1017. (
  1018. "/word/comments.xml",
  1019. "application/vnd.openxmlformats-officedocument.wordprocessingml.comments+xml",
  1020. ),
  1021. (
  1022. "/word/commentsExtended.xml",
  1023. "application/vnd.openxmlformats-officedocument.wordprocessingml.commentsExtended+xml",
  1024. ),
  1025. (
  1026. "/word/commentsIds.xml",
  1027. "application/vnd.openxmlformats-officedocument.wordprocessingml.commentsIds+xml",
  1028. ),
  1029. (
  1030. "/word/commentsExtensible.xml",
  1031. "application/vnd.openxmlformats-officedocument.wordprocessingml.commentsExtensible+xml",
  1032. ),
  1033. ]
  1034. for part_name, content_type in overrides:
  1035. override_xml = (
  1036. f'<Override PartName="{part_name}" ContentType="{content_type}"/>'
  1037. )
  1038. editor.append_to(root, override_xml)