| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653654655656657658659660661662663664665666667668669670671672673674675676677678679680681682683684685686687688689690691692693694695696697698699700701702
7037047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276 
|
- #!/usr/bin/env python3
- """
- Library for working with Word documents: comments, tracked changes, and editing.
- Usage:
- from skills.docx.scripts.document import Document
- # Initialize
- doc = Document('workspace/unpacked')
- doc = Document('workspace/unpacked', author="John Doe", initials="JD")
- # Find nodes
- node = doc["word/document.xml"].get_node(tag="w:del", attrs={"w:id": "1"})
- node = doc["word/document.xml"].get_node(tag="w:p", line_number=10)
- # Add comments
- doc.add_comment(start=node, end=node, text="Comment text")
- doc.reply_to_comment(parent_comment_id=0, text="Reply text")
- # Suggest tracked changes
- doc["word/document.xml"].suggest_deletion(node) # Delete content
- doc["word/document.xml"].revert_insertion(ins_node) # Reject insertion
- doc["word/document.xml"].revert_deletion(del_node) # Reject deletion
- # Save
- doc.save()
- """
- import html
- import random
- import shutil
- import tempfile
- from datetime import datetime, timezone
- from pathlib import Path
- from defusedxml import minidom
- from ooxml.scripts.pack import pack_document
- from ooxml.scripts.validation.docx import DOCXSchemaValidator
- from ooxml.scripts.validation.redlining import RedliningValidator
- from .utilities import XMLEditor
- # Path to template files
- TEMPLATE_DIR = Path(__file__).parent / "templates"
- class DocxXMLEditor(XMLEditor):
- """XMLEditor that automatically applies RSID, author, and date to new elements.
- Automatically adds attributes to elements that support them when inserting new content:
- - w:rsidR, w:rsidRDefault, w:rsidP (for w:p and w:r elements)
- - w:author and w:date (for w:ins, w:del, w:comment elements)
- - w:id (for w:ins and w:del elements)
- Attributes:
- dom (defusedxml.minidom.Document): The DOM document for direct manipulation
- """
def __init__(
    self, xml_path, rsid: str, author: str = "Claude", initials: str = "C"
):
    """Create the editor and remember the identity stamped onto new elements.

    Args:
        xml_path: Path to the XML file to edit; forwarded unchanged to the
            parent class initializer.
        rsid: RSID stored for later injection into new elements.
        author: Author name stored for tracked changes and comments
            (default: "Claude").
        initials: Author initials stored for comments (default: "C").
    """
    # The parent initializer only receives the path; the identity values are
    # kept on this instance for the attribute-injection helpers.
    super().__init__(xml_path)
    self.rsid, self.author, self.initials = rsid, author, initials
def _get_next_change_id(self):
    """Return one more than the largest integer w:id on any w:ins or w:del.

    Elements whose w:id is missing or not an integer are ignored. When no
    usable id exists the result is 0.
    """
    highest = -1
    for tag_name in ("w:ins", "w:del"):
        for tracked in self.dom.getElementsByTagName(tag_name):
            raw_id = tracked.getAttribute("w:id")
            if not raw_id:
                continue
            try:
                numeric_id = int(raw_id)
            except ValueError:
                # Non-numeric ids cannot collide with the integers we assign.
                continue
            if numeric_id > highest:
                highest = numeric_id
    return highest + 1
def _ensure_w16du_namespace(self):
    """Declare the w16du prefix on the root element unless already declared.

    An existing xmlns:w16du attribute is left untouched.
    """
    document_root = self.dom.documentElement
    if document_root.hasAttribute("xmlns:w16du"):  # type: ignore
        return
    document_root.setAttribute(  # type: ignore
        "xmlns:w16du",
        "http://schemas.microsoft.com/office/word/2023/wordml/word16du",
    )
def _ensure_w16cex_namespace(self):
    """Declare the w16cex prefix on the root element unless already declared.

    An existing xmlns:w16cex attribute is left untouched.
    """
    document_root = self.dom.documentElement
    if document_root.hasAttribute("xmlns:w16cex"):  # type: ignore
        return
    document_root.setAttribute(  # type: ignore
        "xmlns:w16cex",
        "http://schemas.microsoft.com/office/word/2018/wordml/cex",
    )
def _ensure_w14_namespace(self):
    """Declare the w14 prefix on the root element unless already declared.

    An existing xmlns:w14 attribute is left untouched.
    """
    document_root = self.dom.documentElement
    if document_root.hasAttribute("xmlns:w14"):  # type: ignore
        return
    document_root.setAttribute(  # type: ignore
        "xmlns:w14",
        "http://schemas.microsoft.com/office/word/2010/wordml",
    )
def _inject_attributes_to_nodes(self, nodes):
    """Inject RSID, author, and date attributes into DOM nodes where applicable.

    Every element node in ``nodes`` and each of its matching descendants is
    visited. Attributes that are already present are never overwritten.

    - w:p: gets w:rsidR, w:rsidRDefault, w:rsidP, w14:paraId, w14:textId
    - w:r: gets w:rsidDel when it has a w:del ancestor, otherwise w:rsidR
    - w:t, w:delText: get xml:space="preserve" when the first text child
      starts or ends with whitespace
    - w:ins, w:del: get w:id, w:author, w:date, w16du:dateUtc
    - w:comment: gets w:author, w:date, w:initials
    - w16cex:commentExtensible: gets w16cex:dateUtc

    Args:
        nodes: Iterable of DOM nodes to process; non-element nodes are skipped.
    """
    # One timestamp per call so everything stamped together agrees.
    # Relies on the module-level ``datetime``/``timezone`` import.
    timestamp = datetime.now(timezone.utc).strftime("%Y-%m-%dT%H:%M:%SZ")

    def has_deletion_ancestor(elem):
        """Return True when any ancestor of ``elem`` is a w:del element."""
        ancestor = elem.parentNode
        while ancestor:
            if (
                ancestor.nodeType == ancestor.ELEMENT_NODE
                and ancestor.tagName == "w:del"
            ):
                return True
            ancestor = ancestor.parentNode
        return False

    def stamp_paragraph(elem):
        for name in ("w:rsidR", "w:rsidRDefault", "w:rsidP"):
            if not elem.hasAttribute(name):
                elem.setAttribute(name, self.rsid)
        for name in ("w14:paraId", "w14:textId"):
            if not elem.hasAttribute(name):
                # The w14 prefix must be declared before it is used.
                self._ensure_w14_namespace()
                elem.setAttribute(name, _generate_hex_id())

    def stamp_run(elem):
        # Runs inside a deletion carry w:rsidDel instead of w:rsidR.
        name = "w:rsidDel" if has_deletion_ancestor(elem) else "w:rsidR"
        if not elem.hasAttribute(name):
            elem.setAttribute(name, self.rsid)

    def stamp_tracked_change(elem):
        if not elem.hasAttribute("w:id"):
            # Looked up lazily so ids assigned earlier in this call are seen.
            elem.setAttribute("w:id", str(self._get_next_change_id()))
        if not elem.hasAttribute("w:author"):
            elem.setAttribute("w:author", self.author)
        if not elem.hasAttribute("w:date"):
            elem.setAttribute("w:date", timestamp)
        # Same value as w:date because the timestamp is generated in UTC.
        if not elem.hasAttribute("w16du:dateUtc"):
            self._ensure_w16du_namespace()
            elem.setAttribute("w16du:dateUtc", timestamp)

    def stamp_comment(elem):
        for name, value in (
            ("w:author", self.author),
            ("w:date", timestamp),
            ("w:initials", self.initials),
        ):
            if not elem.hasAttribute(name):
                elem.setAttribute(name, value)

    def stamp_comment_extensible(elem):
        if not elem.hasAttribute("w16cex:dateUtc"):
            self._ensure_w16cex_namespace()
            elem.setAttribute("w16cex:dateUtc", timestamp)

    def preserve_whitespace(elem):
        # Leading/trailing whitespace is only kept by consumers when
        # xml:space="preserve" is set; this applies to deleted text as well.
        first = elem.firstChild
        if not first or first.nodeType != first.TEXT_NODE:
            return
        text = first.data
        if text and (text[0].isspace() or text[-1].isspace()):
            if not elem.hasAttribute("xml:space"):
                elem.setAttribute("xml:space", "preserve")

    # Insertion order matters: w:ins descendants receive ids before w:del.
    handlers = {
        "w:p": stamp_paragraph,
        "w:r": stamp_run,
        "w:t": preserve_whitespace,
        "w:delText": preserve_whitespace,
        "w:ins": stamp_tracked_change,
        "w:del": stamp_tracked_change,
        "w:comment": stamp_comment,
        "w16cex:commentExtensible": stamp_comment_extensible,
    }

    for node in nodes:
        if node.nodeType != node.ELEMENT_NODE:
            continue
        # getElementsByTagName never returns the element itself, so the
        # node is handled first and its descendants afterwards.
        own_handler = handlers.get(node.tagName)
        if own_handler is not None:
            own_handler(node)
        for tag_name, handler in handlers.items():
            for elem in node.getElementsByTagName(tag_name):
                handler(elem)
def replace_node(self, elem, new_content):
    """Replace ``elem`` through the parent editor, then stamp the new nodes.

    Returns:
        The nodes returned by the parent class ``replace_node``.
    """
    replacement_nodes = super().replace_node(elem, new_content)
    self._inject_attributes_to_nodes(replacement_nodes)
    return replacement_nodes
def insert_after(self, elem, xml_content):
    """Insert content after ``elem`` through the parent editor, then stamp it.

    Returns:
        The nodes returned by the parent class ``insert_after``.
    """
    inserted_nodes = super().insert_after(elem, xml_content)
    self._inject_attributes_to_nodes(inserted_nodes)
    return inserted_nodes
def insert_before(self, elem, xml_content):
    """Insert content before ``elem`` through the parent editor, then stamp it.

    Returns:
        The nodes returned by the parent class ``insert_before``.
    """
    inserted_nodes = super().insert_before(elem, xml_content)
    self._inject_attributes_to_nodes(inserted_nodes)
    return inserted_nodes
def append_to(self, elem, xml_content):
    """Append content to ``elem`` through the parent editor, then stamp it.

    Returns:
        The nodes returned by the parent class ``append_to``.
    """
    appended_nodes = super().append_to(elem, xml_content)
    self._inject_attributes_to_nodes(appended_nodes)
    return appended_nodes
def revert_insertion(self, elem):
    """Reject insertions by marking their content as deleted.

    For every targeted w:ins that contains at least one w:r:
    each run's w:t children become w:delText (child nodes and attributes
    carried over); the run's w:rsidR is renamed to w:rsidDel, or w:rsidDel is
    set to this editor's RSID when the run has neither; finally all children
    of the w:ins are moved into a new w:del that becomes its only child.
    A w:ins without runs is left untouched.

    Args:
        elem: A w:ins element, or any element whose w:ins descendants should
            all be processed (w:p, w:body, etc.).

    Returns:
        list: ``[elem]``, the element that was passed in.

    Raises:
        ValueError: If no w:ins element is found.

    Example:
        # Reject a single insertion
        ins = doc["word/document.xml"].get_node(tag="w:ins", attrs={"w:id": "5"})
        doc["word/document.xml"].revert_insertion(ins)
        # Reject all insertions in a paragraph
        para = doc["word/document.xml"].get_node(tag="w:p", line_number=42)
        doc["word/document.xml"].revert_insertion(para)
    """
    if elem.tagName == "w:ins":
        targets = [elem]
    else:
        # Snapshot the matches before the tree is modified.
        targets = list(elem.getElementsByTagName("w:ins"))

    if not targets:
        raise ValueError(
            f"revert_insertion requires w:ins elements. "
            f"The provided element <{elem.tagName}> contains no insertions. "
        )

    for insertion in targets:
        run_nodes = list(insertion.getElementsByTagName("w:r"))
        if not run_nodes:
            continue

        for run_node in run_nodes:
            # w:rsidR -> w:rsidDel
            if run_node.hasAttribute("w:rsidR"):
                run_node.setAttribute("w:rsidDel", run_node.getAttribute("w:rsidR"))
                run_node.removeAttribute("w:rsidR")
            elif not run_node.hasAttribute("w:rsidDel"):
                run_node.setAttribute("w:rsidDel", self.rsid)

            # w:t -> w:delText, keeping every child node and attribute
            for text_node in list(run_node.getElementsByTagName("w:t")):
                deleted_text = self.dom.createElement("w:delText")
                for child in list(text_node.childNodes):
                    deleted_text.appendChild(child)
                for attr_name, attr_value in text_node.attributes.items():
                    deleted_text.setAttribute(attr_name, attr_value)
                text_node.parentNode.replaceChild(deleted_text, text_node)

        # Re-home everything under a w:del that stays inside the w:ins.
        deletion = self.dom.createElement("w:del")
        for child in list(insertion.childNodes):
            deletion.appendChild(child)
        insertion.appendChild(deletion)

        self._inject_attributes_to_nodes([deletion])

    return [elem]
def revert_deletion(self, elem):
    """Reject deletions by re-inserting a copy of the deleted content.

    For every targeted w:del that contains at least one w:r, the runs are
    deep-cloned, their w:delText children become w:t (child nodes and
    attributes carried over), w:rsidDel is renamed to w:rsidR (or w:rsidR is
    set to this editor's RSID when the clone has neither), and the clones are
    wrapped in a w:ins whose serialized XML is passed to ``insert_after`` with
    the w:del as anchor. The original w:del is not modified.

    Args:
        elem: A w:del element, or any element whose w:del descendants should
            all be processed (w:p, w:body, etc.).

    Returns:
        list: ``[elem, new_ins]`` when ``elem`` is a w:del and an insertion
        was created; otherwise ``[elem]``.

    Raises:
        ValueError: If no w:del element is found.

    Example:
        # Reject a single deletion - returns [w:del, w:ins]
        del_elem = doc["word/document.xml"].get_node(tag="w:del", attrs={"w:id": "3"})
        nodes = doc["word/document.xml"].revert_deletion(del_elem)
        # Reject all deletions in a paragraph - returns [para]
        para = doc["word/document.xml"].get_node(tag="w:p", line_number=42)
        nodes = doc["word/document.xml"].revert_deletion(para)
    """
    single_target = elem.tagName == "w:del"
    # Snapshot the matches first: inserting siblings changes the tree.
    targets = [elem] if single_target else list(elem.getElementsByTagName("w:del"))

    if not targets:
        raise ValueError(
            f"revert_deletion requires w:del elements. "
            f"The provided element <{elem.tagName}> contains no deletions. "
        )

    new_insertion = None
    for deletion in targets:
        deleted_runs = list(deletion.getElementsByTagName("w:r"))
        if not deleted_runs:
            continue

        insertion = self.dom.createElement("w:ins")
        for deleted_run in deleted_runs:
            restored_run = deleted_run.cloneNode(True)

            # w:delText -> w:t, keeping every child node and attribute
            for deleted_text in list(restored_run.getElementsByTagName("w:delText")):
                text_node = self.dom.createElement("w:t")
                for child in list(deleted_text.childNodes):
                    text_node.appendChild(child)
                for attr_name, attr_value in deleted_text.attributes.items():
                    text_node.setAttribute(attr_name, attr_value)
                deleted_text.parentNode.replaceChild(text_node, deleted_text)

            # w:rsidDel -> w:rsidR
            if restored_run.hasAttribute("w:rsidDel"):
                restored_run.setAttribute(
                    "w:rsidR", restored_run.getAttribute("w:rsidDel")
                )
                restored_run.removeAttribute("w:rsidDel")
            elif not restored_run.hasAttribute("w:rsidR"):
                restored_run.setAttribute("w:rsidR", self.rsid)

            insertion.appendChild(restored_run)

        # The scratch w:ins is serialized; insert_after builds the real nodes.
        created = self.insert_after(deletion, insertion.toxml())
        if single_target and created:
            new_insertion = created[0]

    if single_target and new_insertion:
        return [elem, new_insertion]
    return [elem]
- @staticmethod
- def suggest_paragraph(xml_content: str) -> str:
- """Transform paragraph XML to add tracked change wrapping for insertion.
- Wraps runs in <w:ins> and adds <w:ins/> to w:rPr in w:pPr for numbered lists.
- Args:
- xml_content: XML string containing a <w:p> element
- Returns:
- str: Transformed XML with tracked change wrapping
- """
- wrapper = f'<root xmlns:w="http://schemas.openxmlformats.org/wordprocessingml/2006/main">{xml_content}</root>'
- doc = minidom.parseString(wrapper)
- para = doc.getElementsByTagName("w:p")[0]
- # Ensure w:pPr exists
- pPr_list = para.getElementsByTagName("w:pPr")
- if not pPr_list:
- pPr = doc.createElement("w:pPr")
- para.insertBefore(
- pPr, para.firstChild
- ) if para.firstChild else para.appendChild(pPr)
- else:
- pPr = pPr_list[0]
- # Ensure w:rPr exists in w:pPr
- rPr_list = pPr.getElementsByTagName("w:rPr")
- if not rPr_list:
- rPr = doc.createElement("w:rPr")
- pPr.appendChild(rPr)
- else:
- rPr = rPr_list[0]
- # Add <w:ins/> to w:rPr
- ins_marker = doc.createElement("w:ins")
- rPr.insertBefore(
- ins_marker, rPr.firstChild
- ) if rPr.firstChild else rPr.appendChild(ins_marker)
- # Wrap all non-pPr children in <w:ins>
- ins_wrapper = doc.createElement("w:ins")
- for child in [c for c in para.childNodes if c.nodeName != "w:pPr"]:
- para.removeChild(child)
- ins_wrapper.appendChild(child)
- para.appendChild(ins_wrapper)
- return para.toxml()
def suggest_deletion(self, elem):
    """Mark a w:r or w:p element as deleted with tracked changes (in place).

    For w:r: converts <w:t> to <w:delText>, renames w:rsidR to w:rsidDel,
    and wraps the run in a new <w:del> that takes its place in the parent.
    For w:p: converts <w:t> to <w:delText> in all runs, renames w:rsidR to
    w:rsidDel on every run, and moves every child except w:pPr into a single
    <w:del> appended to the paragraph.
    For w:p whose first w:pPr contains w:numPr (numbered list): additionally
    puts a <w:del/> marker first in the w:rPr of that w:pPr, creating the
    w:rPr when missing.

    Args:
        elem: A w:r or w:p DOM element without existing tracked changes.

    Returns:
        Element: For w:r, the new w:del wrapper that now contains the run.
        For w:p, the paragraph itself.

    Raises:
        ValueError: If a w:r already contains w:delText, if a w:p already
            contains w:ins or w:del, or if ``elem`` is neither w:r nor w:p.
    """

    def convert_text_to_deleted(scope):
        """Replace each w:t under ``scope`` with an equivalent w:delText."""
        for t_elem in list(scope.getElementsByTagName("w:t")):
            del_text = self.dom.createElement("w:delText")
            # Move ALL child nodes (not just the first) to handle entities.
            while t_elem.firstChild:
                del_text.appendChild(t_elem.firstChild)
            # Preserve attributes like xml:space.
            for name, value in t_elem.attributes.items():
                del_text.setAttribute(name, value)
            t_elem.parentNode.replaceChild(del_text, t_elem)

    def mark_run_deleted(run):
        """Rename w:rsidR to w:rsidDel, or stamp w:rsidDel when absent."""
        if run.hasAttribute("w:rsidR"):
            run.setAttribute("w:rsidDel", run.getAttribute("w:rsidR"))
            run.removeAttribute("w:rsidR")
        elif not run.hasAttribute("w:rsidDel"):
            run.setAttribute("w:rsidDel", self.rsid)

    if elem.nodeName == "w:r":
        if elem.getElementsByTagName("w:delText"):
            raise ValueError("w:r element already contains w:delText")

        convert_text_to_deleted(elem)
        mark_run_deleted(elem)

        # Put the wrapper where the run was, then move the run inside it.
        del_wrapper = self.dom.createElement("w:del")
        elem.parentNode.replaceChild(del_wrapper, elem)
        del_wrapper.appendChild(elem)

        self._inject_attributes_to_nodes([del_wrapper])
        return del_wrapper

    if elem.nodeName == "w:p":
        if elem.getElementsByTagName("w:ins") or elem.getElementsByTagName("w:del"):
            raise ValueError("w:p element already contains tracked changes")

        pPr_list = elem.getElementsByTagName("w:pPr")
        is_numbered = pPr_list and pPr_list[0].getElementsByTagName("w:numPr")
        if is_numbered:
            pPr = pPr_list[0]
            rPr_list = pPr.getElementsByTagName("w:rPr")
            if rPr_list:
                rPr = rPr_list[0]
            else:
                rPr = self.dom.createElement("w:rPr")
                # The schema places w:rPr before w:sectPr and w:pPrChange;
                # insertBefore with a None reference appends.
                successor = next(
                    (
                        child
                        for child in pPr.childNodes
                        if child.nodeName in ("w:sectPr", "w:pPrChange")
                    ),
                    None,
                )
                pPr.insertBefore(rPr, successor)
            # <w:del/> marker goes first in w:rPr.
            rPr.insertBefore(self.dom.createElement("w:del"), rPr.firstChild)

        convert_text_to_deleted(elem)
        for run in elem.getElementsByTagName("w:r"):
            mark_run_deleted(run)

        # Wrap all non-pPr children in <w:del>.
        del_wrapper = self.dom.createElement("w:del")
        for child in [c for c in elem.childNodes if c.nodeName != "w:pPr"]:
            elem.removeChild(child)
            del_wrapper.appendChild(child)
        elem.appendChild(del_wrapper)

        self._inject_attributes_to_nodes([del_wrapper])
        return elem

    raise ValueError(f"Element must be w:r or w:p, got {elem.nodeName}")
def _generate_hex_id() -> str:
    """Return a random ID as 8 uppercase, zero-padded hex digits.

    Used for para/durable IDs. Values are kept below 0x7FFFFFFF per OOXML spec:
    - paraId must be < 0x80000000
    - durableId must be < 0x7FFFFFFF
    The stricter bound (0x7FFFFFFF) is applied to both; zero is never produced.
    """
    largest_allowed = 0x7FFFFFFE  # inclusive upper bound for randint
    return format(random.randint(1, largest_allowed), "08X")
def _generate_rsid() -> str:
    """Return a random RSID made of 8 uppercase hexadecimal characters."""
    hex_alphabet = "0123456789ABCDEF"
    picked = random.choices(hex_alphabet, k=8)
    return "".join(picked)
- class Document:
- """Manages comments in unpacked Word documents."""
def __init__(
    self,
    unpacked_dir,
    rsid=None,
    track_revisions=False,
    author="Claude",
    initials="C",
):
    """
    Open an unpacked Word document directory for editing.

    The directory is copied into a fresh temporary directory and all edits
    happen on that copy. The original directory is also packed into a
    temporary ``original.docx`` (without validation) to serve as the
    validation baseline. Existing comments are loaded, the next comment ID
    is computed, and the tracking and author setup helpers are run.

    Args:
        unpacked_dir: Path to unpacked DOCX directory (must contain word/ subdirectory)
        rsid: Optional RSID to use for all comment elements. If not provided, one will be generated.
        track_revisions: Passed to the tracking setup (default: False)
        author: Default author name for comments (default: "Claude")
        initials: Default author initials for comments (default: "C")

    Raises:
        ValueError: If ``unpacked_dir`` does not exist or is not a directory.
    """
    self.original_path = Path(unpacked_dir)
    if not (self.original_path.exists() and self.original_path.is_dir()):
        raise ValueError(f"Directory not found: {unpacked_dir}")

    # Scratch area: <temp>/unpacked is the working copy, <temp>/original.docx
    # is the baseline and deliberately lives outside the working copy.
    self.temp_dir = tempfile.mkdtemp(prefix="docx_")
    scratch_root = Path(self.temp_dir)
    self.unpacked_path = scratch_root / "unpacked"
    shutil.copytree(self.original_path, self.unpacked_path)

    self.original_docx = scratch_root / "original.docx"
    pack_document(self.original_path, self.original_docx, validate=False)

    self.word_path = self.unpacked_path / "word"

    # A falsy rsid (None or empty) means "generate one".
    self.rsid = rsid or _generate_rsid()
    print(f"Using RSID: {self.rsid}")

    self.author = author
    self.initials = initials

    # Editors are created lazily by __getitem__ and cached here.
    self._editors = {}

    word_dir = self.word_path
    self.comments_path = word_dir / "comments.xml"
    self.comments_extended_path = word_dir / "commentsExtended.xml"
    self.comments_ids_path = word_dir / "commentsIds.xml"
    self.comments_extensible_path = word_dir / "commentsExtensible.xml"

    # Must happen before the setup below, which may modify files.
    self.existing_comments = self._load_existing_comments()
    self.next_comment_id = self._get_next_comment_id()

    # Convenient access to the document.xml editor (semi-private).
    self._document = self["word/document.xml"]

    self._setup_tracking(track_revisions=track_revisions)
    self._add_author_to_people(author)
def __getitem__(self, xml_path: str) -> DocxXMLEditor:
    """
    Return the editor for an XML file of the working copy, creating it once.

    Editors are cached per ``xml_path``, so repeated lookups return the same
    instance:
        node = doc["word/document.xml"].get_node(tag="w:p", line_number=42)

    Args:
        xml_path: Relative path to XML file (e.g., "word/document.xml", "word/comments.xml")

    Returns:
        DocxXMLEditor instance for the specified file

    Raises:
        ValueError: If the file does not exist

    Example:
        # Get node from document.xml
        node = doc["word/document.xml"].get_node(tag="w:del", attrs={"w:id": "1"})
        # Get node from comments.xml
        comment = doc["word/comments.xml"].get_node(tag="w:comment", attrs={"w:id": "0"})
    """
    try:
        return self._editors[xml_path]
    except KeyError:
        pass

    target_file = self.unpacked_path / xml_path
    if not target_file.exists():
        raise ValueError(f"XML file not found: {xml_path}")

    # Every editor shares this document's RSID, author, and initials.
    new_editor = DocxXMLEditor(
        target_file, rsid=self.rsid, author=self.author, initials=self.initials
    )
    self._editors[xml_path] = new_editor
    return self._editors[xml_path]
def add_comment(self, start, end, text: str) -> int:
    """
    Add a comment spanning from one element to another.

    The range-start markup is inserted before ``start``. The range-end
    markup is appended inside ``end`` when ``end`` is a w:p, otherwise it is
    inserted after ``end``. The comment is then recorded in comments.xml,
    commentsExtended.xml, commentsIds.xml and commentsExtensible.xml, using
    this document's author and initials.

    Args:
        start: DOM element for the starting point
        end: DOM element for the ending point
        text: Comment content

    Returns:
        The comment ID that was created

    Example:
        start_node = doc["word/document.xml"].get_node(tag="w:del", attrs={"w:id": "1"})
        end_node = doc["word/document.xml"].get_node(tag="w:ins", attrs={"w:id": "2"})
        doc.add_comment(start=start_node, end=end_node, text="Explanation")
    """
    new_id = self.next_comment_id
    paragraph_id = _generate_hex_id()
    durable = _generate_hex_id()
    created_at = datetime.now(timezone.utc).strftime("%Y-%m-%dT%H:%M:%SZ")

    document = self._document
    document.insert_before(start, self._comment_range_start_xml(new_id))

    # Paragraph anchors take the end markup inside the paragraph;
    # run-level anchors take it as the following sibling.
    place_end = document.append_to if end.tagName == "w:p" else document.insert_after
    place_end(end, self._comment_range_end_xml(new_id))

    # Record the comment in each of the comment part files.
    self._add_to_comments_xml(
        new_id, paragraph_id, text, self.author, self.initials, created_at
    )
    self._add_to_comments_extended_xml(paragraph_id, parent_para_id=None)
    self._add_to_comments_ids_xml(paragraph_id, durable)
    self._add_to_comments_extensible_xml(durable)

    # Remember the paragraph id so reply_to_comment can find this comment.
    self.existing_comments[new_id] = {"para_id": paragraph_id}
    self.next_comment_id += 1
    return new_id
def reply_to_comment(
    self,
    parent_comment_id: int,
    text: str,
) -> int:
    """
    Add a reply to an existing comment.

    The reply's range start is inserted after the parent's
    w:commentRangeStart; its range end and reference run are inserted after
    the run that holds the parent's w:commentReference. The reply is linked
    to the parent through the parent's paragraph id in commentsExtended.xml.

    Args:
        parent_comment_id: The w:id of the parent comment to reply to
        text: Reply text

    Returns:
        The comment ID that was created for the reply

    Raises:
        ValueError: If ``parent_comment_id`` is not a known comment.

    Example:
        doc.reply_to_comment(parent_comment_id=0, text="I agree with this change")
    """
    parent_info = self.existing_comments.get(parent_comment_id)
    if parent_info is None:
        raise ValueError(f"Parent comment with id={parent_comment_id} not found")

    new_id = self.next_comment_id
    paragraph_id = _generate_hex_id()
    durable = _generate_hex_id()
    created_at = datetime.now(timezone.utc).strftime("%Y-%m-%dT%H:%M:%SZ")

    document = self._document
    parent_attrs = {"w:id": str(parent_comment_id)}
    parent_range_start = document.get_node(
        tag="w:commentRangeStart", attrs=parent_attrs
    )
    parent_reference = document.get_node(
        tag="w:commentReference", attrs={"w:id": str(parent_comment_id)}
    )

    document.insert_after(parent_range_start, self._comment_range_start_xml(new_id))

    # Both insertions use the same anchor, so the one inserted second (the
    # reference run) ends up directly after the anchor, ahead of the range end.
    anchor_run = parent_reference.parentNode
    document.insert_after(anchor_run, f'<w:commentRangeEnd w:id="{new_id}"/>')
    document.insert_after(anchor_run, self._comment_ref_run_xml(new_id))

    # Record the reply in each of the comment part files.
    self._add_to_comments_xml(
        new_id, paragraph_id, text, self.author, self.initials, created_at
    )
    self._add_to_comments_extended_xml(
        paragraph_id, parent_para_id=parent_info["para_id"]
    )
    self._add_to_comments_ids_xml(paragraph_id, durable)
    self._add_to_comments_extensible_xml(durable)

    # Remember the paragraph id so the reply can itself be replied to.
    self.existing_comments[new_id] = {"para_id": paragraph_id}
    self.next_comment_id += 1
    return new_id
def __del__(self):
    """Best-effort removal of the temporary working directory.

    Does nothing when ``temp_dir`` was never set (for example when
    ``__init__`` raised before creating it). Removal errors are ignored:
    a destructor has no caller that could handle them.
    """
    temp_dir = getattr(self, "temp_dir", None)
    if temp_dir:
        shutil.rmtree(temp_dir, ignore_errors=True)
def validate(self) -> None:
    """
    Validate the working copy against XSD schema and redlining rules.

    Both validators are constructed from the working copy and the packed
    original before either one runs; the schema check runs first.

    Raises:
        ValueError: If validation fails.
    """
    checks = (
        (
            "Schema validation failed",
            DOCXSchemaValidator(
                self.unpacked_path, self.original_docx, verbose=False
            ),
        ),
        (
            "Redlining validation failed",
            RedliningValidator(
                self.unpacked_path, self.original_docx, verbose=False
            ),
        ),
    )
    for failure_message, validator in checks:
        if not validator.validate():
            raise ValueError(failure_message)
def save(self, destination=None, validate=True) -> None:
    """
    Write all loaded editors to the working copy and copy it to the target.

    Steps, in order: ensure comment relationships and content types (only
    when comments.xml exists), save every cached editor, optionally
    validate, then copy the working copy over the target directory.

    Args:
        destination: Optional path to save to. If None, saves back to original directory.
        validate: If True, validates document before saving (default: True).
    """
    comments_present = self.comments_path.exists()
    if comments_present:
        self._ensure_comment_relationships()
        self._ensure_comment_content_types()

    # Flush every editor that was opened, into the temp working copy.
    for open_editor in self._editors.values():
        open_editor.save()

    if validate:
        self.validate()

    if destination:
        target_path = Path(destination)
    else:
        target_path = self.original_path
    # Existing files in the target are overwritten; extra files are kept.
    shutil.copytree(self.unpacked_path, target_path, dirs_exist_ok=True)
- # ==================== Private: Initialization ====================
def _get_next_comment_id(self):
    """Return one more than the largest integer w:id in comments.xml.

    Returns 0 when comments.xml does not exist or holds no comment with an
    integer id.
    """
    if not self.comments_path.exists():
        return 0

    comments_dom = self["word/comments.xml"].dom
    highest = -1
    for node in comments_dom.getElementsByTagName("w:comment"):
        raw_id = node.getAttribute("w:id")
        if not raw_id:
            continue
        try:
            numeric_id = int(raw_id)
        except ValueError:
            # Ids that are not integers do not take part in the numbering.
            continue
        if numeric_id > highest:
            highest = numeric_id
    return highest + 1
def _load_existing_comments(self):
    """Index the comments already present in comments.xml so they can be replied to.

    Returns:
        dict mapping integer comment id -> ``{"para_id": <w14:paraId>}``.
        Empty when comments.xml does not exist.

    A comment is skipped when its w:id is missing or not an integer (the
    same tolerance ``_get_next_comment_id`` applies), or when none of its
    w:p descendants carries a non-empty w14:paraId.
    """
    if not self.comments_path.exists():
        return {}

    editor = self["word/comments.xml"]
    existing = {}
    for comment_elem in editor.dom.getElementsByTagName("w:comment"):
        try:
            # int("") also raises, which covers a missing w:id attribute.
            comment_id = int(comment_elem.getAttribute("w:id"))
        except ValueError:
            continue

        # First non-empty paraId among the paragraphs inside the comment.
        candidate_ids = (
            p_elem.getAttribute("w14:paraId")
            for p_elem in comment_elem.getElementsByTagName("w:p")
        )
        para_id = next((pid for pid in candidate_ids if pid), None)
        if para_id is None:
            continue

        existing[comment_id] = {"para_id": para_id}
    return existing
- # ==================== Private: Setup Methods ====================
def _setup_tracking(self, track_revisions=False):
    """Prepare the unpacked directory's people/settings parts for commenting.

    Runs, in order: people.xml creation, its content-type registration, its
    relationship registration, and the settings.xml update.

    Args:
        track_revisions: Forwarded to ``_update_settings``; if True,
            track revisions is enabled in settings.xml.
    """
    word_dir = self.word_path

    # people.xml itself
    self._update_people_xml(word_dir / "people.xml")

    # Package-level registration of people.xml
    content_types_file = self.unpacked_path / "[Content_Types].xml"
    self._add_content_type_for_people(content_types_file)
    document_rels_file = word_dir / "_rels" / "document.xml.rels"
    self._add_relationship_for_people(document_rels_file)

    # The RSID is always recorded; trackRevisions only on request.
    settings_file = word_dir / "settings.xml"
    self._update_settings(settings_file, track_revisions=track_revisions)
def _update_people_xml(self, path):
    """Make sure people.xml exists at ``path``.

    An existing file is left untouched; otherwise the people.xml template
    from TEMPLATE_DIR is copied into place.
    """
    if path.exists():
        return
    template_file = TEMPLATE_DIR / "people.xml"
    shutil.copy(template_file, path)
def _add_content_type_for_people(self, path):
    """Register /word/people.xml in [Content_Types].xml unless already listed.

    Args:
        path: Not read by this method; the editor is looked up by the
            fixed name "[Content_Types].xml".
    """
    part_name = "/word/people.xml"
    content_types = self["[Content_Types].xml"]
    if not self._has_override(content_types, part_name):
        people_type = (
            "application/vnd.openxmlformats-officedocument"
            ".wordprocessingml.people+xml"
        )
        content_types.append_to(
            content_types.dom.documentElement,
            f'<Override PartName="{part_name}" ContentType="{people_type}"/>',
        )
def _add_relationship_for_people(self, path):
    """Add a people.xml relationship to document.xml.rels unless one exists.

    The new element reuses the namespace prefix of the root element (if the
    root tag has one) and takes its Id from the editor's ``get_next_rid()``.

    Args:
        path: Not read by this method; the editor is looked up by the
            fixed name "word/_rels/document.xml.rels".
    """
    rels = self["word/_rels/document.xml.rels"]
    if self._has_relationship(rels, "people.xml"):
        return

    rels_root = rels.dom.documentElement
    ns_prefix, colon, _ = rels_root.tagName.partition(":")  # type: ignore
    tag_prefix = ns_prefix + ":" if colon else ""

    rid = rels.get_next_rid()
    people_type = "http://schemas.microsoft.com/office/2011/relationships/people"
    rels.append_to(
        rels_root,
        f'<{tag_prefix}Relationship Id="{rid}" Type="{people_type}" Target="people.xml"/>',
    )
def _update_settings(self, path, track_revisions=False):
    """Record this session's RSID in settings.xml and optionally enable track revisions.

    Args:
        path: Path to settings.xml. Not read by this method; the editor is
            looked up by the fixed name "word/settings.xml".
        track_revisions: If True, ensure a trackRevisions element exists.

    Element placement (intended to follow OOXML schema order):
        - trackRevisions: before documentProtection, else before
          defaultTabStop, else as the first child of the settings root.
        - rsids: after compat, else before clrSchemeMapping, else appended
          to the settings root. If an rsids element already exists, only a
          missing rsid entry is appended to it.
    """
    settings = self["word/settings.xml"]
    settings_root = settings.get_node(tag="w:settings")
    root_tag = settings_root.tagName
    prefix = root_tag.split(":")[0] if ":" in root_tag else "w"

    def find(local_name):
        # Elements anywhere in the document named <prefix>:<local_name>.
        return settings.dom.getElementsByTagName(f"{prefix}:{local_name}")

    # ---- trackRevisions (only on request, only if absent) ----
    if track_revisions and not find("trackRevisions"):
        track_rev_xml = f"<{prefix}:trackRevisions/>"
        anchors = find("documentProtection") or find("defaultTabStop")
        if anchors:
            settings.insert_before(anchors[0], track_rev_xml)
        elif settings_root.firstChild:
            settings.insert_before(settings_root.firstChild, track_rev_xml)
        else:
            settings.append_to(settings_root, track_rev_xml)

    # ---- rsids (always) ----
    existing_rsids = find("rsids")
    if existing_rsids:
        container = existing_rsids[0]
        already_listed = any(
            node.getAttribute(f"{prefix}:val") == self.rsid
            for node in container.getElementsByTagName(f"{prefix}:rsid")
        )
        if not already_listed:
            settings.append_to(
                container, f'<{prefix}:rsid {prefix}:val="{self.rsid}"/>'
            )
        return

    rsids_xml = f'''<{prefix}:rsids>
  <{prefix}:rsidRoot {prefix}:val="{self.rsid}"/>
  <{prefix}:rsid {prefix}:val="{self.rsid}"/>
</{prefix}:rsids>'''

    compat_nodes = find("compat")
    if compat_nodes:
        settings.insert_after(compat_nodes[0], rsids_xml)
        return

    scheme_nodes = find("clrSchemeMapping")
    if scheme_nodes:
        settings.insert_before(scheme_nodes[0], rsids_xml)
        return

    settings.append_to(settings_root, rsids_xml)
- # ==================== Private: XML File Creation ====================
def _add_to_comments_xml(
    self, comment_id, para_id, text, author, initials, timestamp
):
    """Append one w:comment element to comments.xml.

    comments.xml is created from the template in TEMPLATE_DIR if it does
    not exist yet.

    Args:
        comment_id: Value written to the comment's w:id attribute.
        para_id: Value written to the paragraph's w14:paraId attribute.
        text: Plain comment text. The XML markup characters ``&``, ``<``
            and ``>`` are escaped before insertion, so arbitrary text
            cannot break the document or inject elements.
        author, initials, timestamp: Not written by this method.
    """
    if not self.comments_path.exists():
        shutil.copy(TEMPLATE_DIR / "comments.xml", self.comments_path)

    editor = self["word/comments.xml"]
    root = editor.get_node(tag="w:comments")

    # Element content only needs &, < and > escaped; quotes stay literal.
    escaped_text = html.escape(text, quote=False)

    # Note: w:rsidR, w:rsidRDefault, w:rsidP on w:p, w:rsidR on w:r,
    # and w:author, w:date, w:initials on w:comment are automatically added by DocxXMLEditor
    comment_xml = f'''<w:comment w:id="{comment_id}">
  <w:p w14:paraId="{para_id}" w14:textId="77777777">
    <w:r><w:rPr><w:rStyle w:val="CommentReference"/></w:rPr><w:annotationRef/></w:r>
    <w:r><w:rPr><w:color w:val="000000"/><w:sz w:val="20"/><w:szCs w:val="20"/></w:rPr><w:t>{escaped_text}</w:t></w:r>
  </w:p>
</w:comment>'''
    editor.append_to(root, comment_xml)
def _add_to_comments_extended_xml(self, para_id, parent_para_id):
    """Append one w15:commentEx entry to commentsExtended.xml.

    The file is created from the template in TEMPLATE_DIR if missing.

    Args:
        para_id: paraId of the comment being registered.
        parent_para_id: paraId of the comment this one replies to; when
            falsy, no w15:paraIdParent attribute is written.
    """
    if not self.comments_extended_path.exists():
        template_file = TEMPLATE_DIR / "commentsExtended.xml"
        shutil.copy(template_file, self.comments_extended_path)

    extended = self["word/commentsExtended.xml"]
    container = extended.get_node(tag="w15:commentsEx")

    parent_attr = f' w15:paraIdParent="{parent_para_id}"' if parent_para_id else ""
    extended.append_to(
        container,
        f'<w15:commentEx w15:paraId="{para_id}"{parent_attr} w15:done="0"/>',
    )
def _add_to_comments_ids_xml(self, para_id, durable_id):
    """Append one w16cid:commentId entry to commentsIds.xml.

    The file is created from the template in TEMPLATE_DIR if missing.

    Args:
        para_id: Value for the w16cid:paraId attribute.
        durable_id: Value for the w16cid:durableId attribute.
    """
    if not self.comments_ids_path.exists():
        template_file = TEMPLATE_DIR / "commentsIds.xml"
        shutil.copy(template_file, self.comments_ids_path)

    ids_editor = self["word/commentsIds.xml"]
    container = ids_editor.get_node(tag="w16cid:commentsIds")
    entry = (
        f'<w16cid:commentId w16cid:paraId="{para_id}"'
        f' w16cid:durableId="{durable_id}"/>'
    )
    ids_editor.append_to(container, entry)
def _add_to_comments_extensible_xml(self, durable_id):
    """Append one w16cex:commentExtensible entry to commentsExtensible.xml.

    The file is created from the template in TEMPLATE_DIR if missing.

    Args:
        durable_id: Value for the w16cex:durableId attribute.
    """
    if not self.comments_extensible_path.exists():
        template_file = TEMPLATE_DIR / "commentsExtensible.xml"
        shutil.copy(template_file, self.comments_extensible_path)

    extensible = self["word/commentsExtensible.xml"]
    container = extensible.get_node(tag="w16cex:commentsExtensible")
    extensible.append_to(
        container,
        f'<w16cex:commentExtensible w16cex:durableId="{durable_id}"/>',
    )
- # ==================== Private: XML Fragments ====================
def _comment_range_start_xml(self, comment_id):
    """Return the w:commentRangeStart marker for ``comment_id``."""
    marker = '<w:commentRangeStart w:id="{}"/>'
    return marker.format(comment_id)
def _comment_range_end_xml(self, comment_id):
    """Return the w:commentRangeEnd marker followed by the comment's reference run.

    Note: w:rsidR is automatically added by DocxXMLEditor.
    """
    fragment_lines = (
        f'<w:commentRangeEnd w:id="{comment_id}"/>',
        "<w:r>",
        '  <w:rPr><w:rStyle w:val="CommentReference"/></w:rPr>',
        f'  <w:commentReference w:id="{comment_id}"/>',
        "</w:r>",
    )
    return "\n".join(fragment_lines)
def _comment_ref_run_xml(self, comment_id):
    """Return a run holding the w:commentReference for ``comment_id``.

    Note: w:rsidR is automatically added by DocxXMLEditor.
    """
    run_lines = (
        "<w:r>",
        '  <w:rPr><w:rStyle w:val="CommentReference"/></w:rPr>',
        f'  <w:commentReference w:id="{comment_id}"/>',
        "</w:r>",
    )
    return "\n".join(run_lines)
- # ==================== Private: Metadata Updates ====================
def _has_relationship(self, editor, target):
    """Return True if the rels document has a Relationship whose Target equals ``target``.

    Elements are matched by local name, so a namespace-prefixed tag
    (e.g. ``r:Relationship``) is recognised as well as a plain
    ``Relationship``. The methods that add relationships emit prefixed
    tags when the root element is prefixed; without this, such files
    would never be detected and would receive duplicate entries.

    Args:
        editor: Editor whose ``dom`` is the parsed .rels document.
        target: Exact Target attribute value to look for.
    """
    return any(
        elem.tagName.rpartition(":")[2] == "Relationship"
        and elem.getAttribute("Target") == target
        for elem in editor.dom.getElementsByTagName("*")
    )
def _has_override(self, editor, part_name):
    """Return True if an Override element with PartName ``part_name`` exists."""
    overrides = editor.dom.getElementsByTagName("Override")
    return any(node.getAttribute("PartName") == part_name for node in overrides)
def _has_author(self, editor, author):
    """Return True if people.xml already lists a w15:person for ``author``."""
    people = editor.dom.getElementsByTagName("w15:person")
    return any(node.getAttribute("w15:author") == author for node in people)
def _add_author_to_people(self, author):
    """Register ``author`` as a w15:person in people.xml unless already listed.

    The author name is XML-escaped (quotes included) before being written
    into the attribute values.

    Raises:
        ValueError: If word/people.xml does not exist; it is expected to
            have been created by ``_setup_tracking`` beforehand.
    """
    people_file = self.word_path / "people.xml"
    if not people_file.exists():
        raise ValueError("people.xml should exist after _setup_tracking")

    people = self["word/people.xml"]
    container = people.get_node(tag="w15:people")

    if not self._has_author(people, author):
        # Escape so the name cannot break out of the attribute value.
        safe_name = html.escape(author, quote=True)
        people.append_to(
            container,
            f'''<w15:person w15:author="{safe_name}">
  <w15:presenceInfo w15:providerId="None" w15:userId="{safe_name}"/>
</w15:person>''',
        )
def _ensure_comment_relationships(self):
    """Add the four comment-part relationships to document.xml.rels.

    Nothing is added when a relationship targeting comments.xml already
    exists. Otherwise the four parts receive consecutive ids starting at
    the number taken from the editor's ``get_next_rid()``, and the new
    elements reuse the root element's namespace prefix (if any).
    """
    rels = self["word/_rels/document.xml.rels"]
    if self._has_relationship(rels, "comments.xml"):
        return

    rels_root = rels.dom.documentElement
    ns_prefix, colon, _ = rels_root.tagName.partition(":")  # type: ignore
    tag_prefix = ns_prefix + ":" if colon else ""

    # get_next_rid() yields "rId<N>"; strip the 3-character "rId" prefix.
    first_rid = int(rels.get_next_rid()[3:])

    comment_parts = (
        (
            "http://schemas.openxmlformats.org/officeDocument/2006/relationships/comments",
            "comments.xml",
        ),
        (
            "http://schemas.microsoft.com/office/2011/relationships/commentsExtended",
            "commentsExtended.xml",
        ),
        (
            "http://schemas.microsoft.com/office/2016/09/relationships/commentsIds",
            "commentsIds.xml",
        ),
        (
            "http://schemas.microsoft.com/office/2018/08/relationships/commentsExtensible",
            "commentsExtensible.xml",
        ),
    )
    for offset, (rel_type, target) in enumerate(comment_parts):
        rels.append_to(
            rels_root,
            f'<{tag_prefix}Relationship Id="rId{first_rid + offset}" Type="{rel_type}" Target="{target}"/>',
        )
def _ensure_comment_content_types(self):
    """Add Override entries for the four comment parts to [Content_Types].xml.

    Nothing is added when an Override for /word/comments.xml already exists.
    """
    content_types = self["[Content_Types].xml"]
    if self._has_override(content_types, "/word/comments.xml"):
        return

    types_root = content_types.dom.documentElement
    # Each part lives at /word/<stem>.xml, and its content type ends in
    # "wordprocessingml.<stem>+xml".
    type_base = "application/vnd.openxmlformats-officedocument.wordprocessingml."
    part_stems = (
        "comments",
        "commentsExtended",
        "commentsIds",
        "commentsExtensible",
    )
    for stem in part_stems:
        part_name = f"/word/{stem}.xml"
        content_type = f"{type_base}{stem}+xml"
        content_types.append_to(
            types_root,
            f'<Override PartName="{part_name}" ContentType="{content_type}"/>',
        )
|