redlining.py 11 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279
  1. """
  2. Validator for tracked changes in Word documents.
  3. """
  4. import subprocess
  5. import tempfile
  6. import zipfile
  7. from pathlib import Path
  8. class RedliningValidator:
  9. """Validator for tracked changes in Word documents."""
  10. def __init__(self, unpacked_dir, original_docx, verbose=False):
  11. self.unpacked_dir = Path(unpacked_dir)
  12. self.original_docx = Path(original_docx)
  13. self.verbose = verbose
  14. self.namespaces = {
  15. "w": "http://schemas.openxmlformats.org/wordprocessingml/2006/main"
  16. }
  17. def validate(self):
  18. """Main validation method that returns True if valid, False otherwise."""
  19. # Verify unpacked directory exists and has correct structure
  20. modified_file = self.unpacked_dir / "word" / "document.xml"
  21. if not modified_file.exists():
  22. print(f"FAILED - Modified document.xml not found at {modified_file}")
  23. return False
  24. # First, check if there are any tracked changes by Claude to validate
  25. try:
  26. import xml.etree.ElementTree as ET
  27. tree = ET.parse(modified_file)
  28. root = tree.getroot()
  29. # Check for w:del or w:ins tags authored by Claude
  30. del_elements = root.findall(".//w:del", self.namespaces)
  31. ins_elements = root.findall(".//w:ins", self.namespaces)
  32. # Filter to only include changes by Claude
  33. claude_del_elements = [
  34. elem
  35. for elem in del_elements
  36. if elem.get(f"{{{self.namespaces['w']}}}author") == "Claude"
  37. ]
  38. claude_ins_elements = [
  39. elem
  40. for elem in ins_elements
  41. if elem.get(f"{{{self.namespaces['w']}}}author") == "Claude"
  42. ]
  43. # Redlining validation is only needed if tracked changes by Claude have been used.
  44. if not claude_del_elements and not claude_ins_elements:
  45. if self.verbose:
  46. print("PASSED - No tracked changes by Claude found.")
  47. return True
  48. except Exception:
  49. # If we can't parse the XML, continue with full validation
  50. pass
  51. # Create temporary directory for unpacking original docx
  52. with tempfile.TemporaryDirectory() as temp_dir:
  53. temp_path = Path(temp_dir)
  54. # Unpack original docx
  55. try:
  56. with zipfile.ZipFile(self.original_docx, "r") as zip_ref:
  57. zip_ref.extractall(temp_path)
  58. except Exception as e:
  59. print(f"FAILED - Error unpacking original docx: {e}")
  60. return False
  61. original_file = temp_path / "word" / "document.xml"
  62. if not original_file.exists():
  63. print(
  64. f"FAILED - Original document.xml not found in {self.original_docx}"
  65. )
  66. return False
  67. # Parse both XML files using xml.etree.ElementTree for redlining validation
  68. try:
  69. import xml.etree.ElementTree as ET
  70. modified_tree = ET.parse(modified_file)
  71. modified_root = modified_tree.getroot()
  72. original_tree = ET.parse(original_file)
  73. original_root = original_tree.getroot()
  74. except ET.ParseError as e:
  75. print(f"FAILED - Error parsing XML files: {e}")
  76. return False
  77. # Remove Claude's tracked changes from both documents
  78. self._remove_claude_tracked_changes(original_root)
  79. self._remove_claude_tracked_changes(modified_root)
  80. # Extract and compare text content
  81. modified_text = self._extract_text_content(modified_root)
  82. original_text = self._extract_text_content(original_root)
  83. if modified_text != original_text:
  84. # Show detailed character-level differences for each paragraph
  85. error_message = self._generate_detailed_diff(
  86. original_text, modified_text
  87. )
  88. print(error_message)
  89. return False
  90. if self.verbose:
  91. print("PASSED - All changes by Claude are properly tracked")
  92. return True
  93. def _generate_detailed_diff(self, original_text, modified_text):
  94. """Generate detailed word-level differences using git word diff."""
  95. error_parts = [
  96. "FAILED - Document text doesn't match after removing Claude's tracked changes",
  97. "",
  98. "Likely causes:",
  99. " 1. Modified text inside another author's <w:ins> or <w:del> tags",
  100. " 2. Made edits without proper tracked changes",
  101. " 3. Didn't nest <w:del> inside <w:ins> when deleting another's insertion",
  102. "",
  103. "For pre-redlined documents, use correct patterns:",
  104. " - To reject another's INSERTION: Nest <w:del> inside their <w:ins>",
  105. " - To restore another's DELETION: Add new <w:ins> AFTER their <w:del>",
  106. "",
  107. ]
  108. # Show git word diff
  109. git_diff = self._get_git_word_diff(original_text, modified_text)
  110. if git_diff:
  111. error_parts.extend(["Differences:", "============", git_diff])
  112. else:
  113. error_parts.append("Unable to generate word diff (git not available)")
  114. return "\n".join(error_parts)
  115. def _get_git_word_diff(self, original_text, modified_text):
  116. """Generate word diff using git with character-level precision."""
  117. try:
  118. with tempfile.TemporaryDirectory() as temp_dir:
  119. temp_path = Path(temp_dir)
  120. # Create two files
  121. original_file = temp_path / "original.txt"
  122. modified_file = temp_path / "modified.txt"
  123. original_file.write_text(original_text, encoding="utf-8")
  124. modified_file.write_text(modified_text, encoding="utf-8")
  125. # Try character-level diff first for precise differences
  126. result = subprocess.run(
  127. [
  128. "git",
  129. "diff",
  130. "--word-diff=plain",
  131. "--word-diff-regex=.", # Character-by-character diff
  132. "-U0", # Zero lines of context - show only changed lines
  133. "--no-index",
  134. str(original_file),
  135. str(modified_file),
  136. ],
  137. capture_output=True,
  138. text=True,
  139. )
  140. if result.stdout.strip():
  141. # Clean up the output - remove git diff header lines
  142. lines = result.stdout.split("\n")
  143. # Skip the header lines (diff --git, index, +++, ---, @@)
  144. content_lines = []
  145. in_content = False
  146. for line in lines:
  147. if line.startswith("@@"):
  148. in_content = True
  149. continue
  150. if in_content and line.strip():
  151. content_lines.append(line)
  152. if content_lines:
  153. return "\n".join(content_lines)
  154. # Fallback to word-level diff if character-level is too verbose
  155. result = subprocess.run(
  156. [
  157. "git",
  158. "diff",
  159. "--word-diff=plain",
  160. "-U0", # Zero lines of context
  161. "--no-index",
  162. str(original_file),
  163. str(modified_file),
  164. ],
  165. capture_output=True,
  166. text=True,
  167. )
  168. if result.stdout.strip():
  169. lines = result.stdout.split("\n")
  170. content_lines = []
  171. in_content = False
  172. for line in lines:
  173. if line.startswith("@@"):
  174. in_content = True
  175. continue
  176. if in_content and line.strip():
  177. content_lines.append(line)
  178. return "\n".join(content_lines)
  179. except (subprocess.CalledProcessError, FileNotFoundError, Exception):
  180. # Git not available or other error, return None to use fallback
  181. pass
  182. return None
  183. def _remove_claude_tracked_changes(self, root):
  184. """Remove tracked changes authored by Claude from the XML root."""
  185. ins_tag = f"{{{self.namespaces['w']}}}ins"
  186. del_tag = f"{{{self.namespaces['w']}}}del"
  187. author_attr = f"{{{self.namespaces['w']}}}author"
  188. # Remove w:ins elements
  189. for parent in root.iter():
  190. to_remove = []
  191. for child in parent:
  192. if child.tag == ins_tag and child.get(author_attr) == "Claude":
  193. to_remove.append(child)
  194. for elem in to_remove:
  195. parent.remove(elem)
  196. # Unwrap content in w:del elements where author is "Claude"
  197. deltext_tag = f"{{{self.namespaces['w']}}}delText"
  198. t_tag = f"{{{self.namespaces['w']}}}t"
  199. for parent in root.iter():
  200. to_process = []
  201. for child in parent:
  202. if child.tag == del_tag and child.get(author_attr) == "Claude":
  203. to_process.append((child, list(parent).index(child)))
  204. # Process in reverse order to maintain indices
  205. for del_elem, del_index in reversed(to_process):
  206. # Convert w:delText to w:t before moving
  207. for elem in del_elem.iter():
  208. if elem.tag == deltext_tag:
  209. elem.tag = t_tag
  210. # Move all children of w:del to its parent before removing w:del
  211. for child in reversed(list(del_elem)):
  212. parent.insert(del_index, child)
  213. parent.remove(del_elem)
  214. def _extract_text_content(self, root):
  215. """Extract text content from Word XML, preserving paragraph structure.
  216. Empty paragraphs are skipped to avoid false positives when tracked
  217. insertions add only structural elements without text content.
  218. """
  219. p_tag = f"{{{self.namespaces['w']}}}p"
  220. t_tag = f"{{{self.namespaces['w']}}}t"
  221. paragraphs = []
  222. for p_elem in root.findall(f".//{p_tag}"):
  223. # Get all text elements within this paragraph
  224. text_parts = []
  225. for t_elem in p_elem.findall(f".//{t_tag}"):
  226. if t_elem.text:
  227. text_parts.append(t_elem.text)
  228. paragraph_text = "".join(text_parts)
  229. # Skip empty paragraphs - they don't affect content validation
  230. if paragraph_text:
  231. paragraphs.append(paragraph_text)
  232. return "\n".join(paragraphs)
if __name__ == "__main__":
    # Direct execution is refused: running this file as a script raises
    # immediately instead of performing any validation.
    raise RuntimeError("This module should not be run directly.")