"""
Validator for tracked changes (redlining) in Word (.docx) documents.
"""
- import subprocess
- import tempfile
- import zipfile
- from pathlib import Path
class RedliningValidator:
    """Check that an author's edits to a Word document are all tracked changes.

    The edited document is read from ``<unpacked_dir>/word/document.xml`` and
    the baseline from the ``word/document.xml`` member of ``original_docx``.
    The author's tracked changes are reverted in both trees (``<w:ins>``
    elements dropped, ``<w:del>`` elements unwrapped with their
    ``<w:delText>`` turned back into ``<w:t>``) and the remaining paragraph
    text is compared. Any difference means text was changed outside of a
    tracked change by that author.

    Results are reported with ``print`` and through the boolean returned by
    :meth:`validate`.
    """

    # Default author whose tracked changes are validated. Defined at class
    # level so subclasses overriding ``__init__`` still get a default.
    author = "Claude"

    def __init__(self, unpacked_dir, original_docx, verbose=False, author="Claude"):
        """Store the locations to compare.

        Args:
            unpacked_dir: Directory holding the unpacked, edited document
                (must contain ``word/document.xml``).
            original_docx: Path to the original ``.docx`` archive.
            verbose: When true, also print a message on success.
            author: Value of the ``w:author`` attribute identifying the
                tracked changes to validate. Defaults to ``"Claude"``.
        """
        self.unpacked_dir = Path(unpacked_dir)
        self.original_docx = Path(original_docx)
        self.verbose = verbose
        self.author = author
        self.namespaces = {
            "w": "http://schemas.openxmlformats.org/wordprocessingml/2006/main"
        }

    def validate(self) -> bool:
        """Run the validation.

        Returns:
            True if the edited document contains no tracked changes by the
            author, or if its text equals the original's once the author's
            tracked changes are reverted in both. False otherwise; the reason
            is printed with a ``FAILED -`` prefix.
        """
        # Imported locally: the module's top-level imports do not include it.
        import xml.etree.ElementTree as ET

        modified_file = self.unpacked_dir / "word" / "document.xml"
        if not modified_file.exists():
            print(f"FAILED - Modified document.xml not found at {modified_file}")
            return False

        # Fast path: nothing to validate when the author made no tracked
        # changes. A parse failure here is deferred, not hidden: the same
        # parse is retried below, after the original archive has been
        # checked, where the error is reported.
        try:
            modified_root = ET.parse(modified_file).getroot()
        except Exception:
            modified_root = None

        if modified_root is not None and not self._has_author_changes(modified_root):
            if self.verbose:
                print(f"PASSED - No tracked changes by {self.author} found.")
            return True

        original_xml = self._read_original_document_xml()
        if original_xml is None:
            return False

        try:
            if modified_root is None:
                modified_root = ET.parse(modified_file).getroot()
            original_root = ET.fromstring(original_xml)
        except ET.ParseError as e:
            print(f"FAILED - Error parsing XML files: {e}")
            return False

        # Revert the author's tracked changes in both documents, then the
        # visible text must match exactly.
        self._remove_claude_tracked_changes(original_root)
        self._remove_claude_tracked_changes(modified_root)

        modified_text = self._extract_text_content(modified_root)
        original_text = self._extract_text_content(original_root)

        if modified_text != original_text:
            print(self._generate_detailed_diff(original_text, modified_text))
            return False

        if self.verbose:
            print(f"PASSED - All changes by {self.author} are properly tracked")
        return True

    def _has_author_changes(self, root) -> bool:
        """Return True if *root* contains a ``w:del``/``w:ins`` by the author."""
        author_attr = f"{{{self.namespaces['w']}}}author"
        return any(
            elem.get(author_attr) == self.author
            for path in (".//w:del", ".//w:ins")
            for elem in root.iterfind(path, self.namespaces)
        )

    def _read_original_document_xml(self):
        """Read ``word/document.xml`` from the original archive.

        Only that single member is read, so the rest of the archive (media,
        embedded objects) is never written to disk.

        Returns:
            The member's raw bytes, or None after printing a ``FAILED -``
            message when the archive cannot be read or lacks the member.
        """
        try:
            with zipfile.ZipFile(self.original_docx, "r") as zip_ref:
                try:
                    return zip_ref.read("word/document.xml")
                except KeyError:
                    # ZipFile.read raises KeyError for a missing member.
                    print(
                        f"FAILED - Original document.xml not found in {self.original_docx}"
                    )
                    return None
        except Exception as e:
            print(f"FAILED - Error unpacking original docx: {e}")
            return None

    def _generate_detailed_diff(self, original_text, modified_text) -> str:
        """Build the failure message, including a git word diff when available.

        Args:
            original_text: Text extracted from the original document.
            modified_text: Text extracted from the edited document.

        Returns:
            A multi-line message explaining likely causes, followed by the
            diff or a note that no diff could be produced.
        """
        error_parts = [
            f"FAILED - Document text doesn't match after removing {self.author}'s tracked changes",
            "",
            "Likely causes:",
            " 1. Modified text inside another author's <w:ins> or <w:del> tags",
            " 2. Made edits without proper tracked changes",
            " 3. Didn't nest <w:del> inside <w:ins> when deleting another's insertion",
            "",
            "For pre-redlined documents, use correct patterns:",
            " - To reject another's INSERTION: Nest <w:del> inside their <w:ins>",
            " - To restore another's DELETION: Add new <w:ins> AFTER their <w:del>",
            "",
        ]

        git_diff = self._get_git_word_diff(original_text, modified_text)
        if git_diff:
            error_parts.extend(["Differences:", "============", git_diff])
        else:
            error_parts.append("Unable to generate word diff (git not available)")

        return "\n".join(error_parts)

    def _get_git_word_diff(self, original_text, modified_text):
        """Diff two texts with ``git diff --word-diff``.

        A character-level diff is tried first; if it yields no content, a
        plain word-level diff is tried.

        Returns:
            The diff body without git's header lines, or None when git is
            unavailable or produced no usable output.
        """
        try:
            with tempfile.TemporaryDirectory() as temp_dir:
                temp_path = Path(temp_dir)
                original_file = temp_path / "original.txt"
                modified_file = temp_path / "modified.txt"
                original_file.write_text(original_text, encoding="utf-8")
                modified_file.write_text(modified_text, encoding="utf-8")

                # --no-color: a user's color.ui=always would otherwise put
                # escape codes in front of the "@@" markers parsed below.
                head = ["git", "diff", "--no-color", "--word-diff=plain"]
                # -U0: zero lines of context, show only changed lines.
                tail = ["-U0", "--no-index", str(original_file), str(modified_file)]

                # Character-by-character first, then git's default word split.
                for granularity in (["--word-diff-regex=."], []):
                    # No check=True: git diff exits with status 1 whenever
                    # the files differ, which is the expected case here.
                    # The files are written as UTF-8, so decode the output as
                    # UTF-8 rather than with the locale's codec.
                    result = subprocess.run(
                        head + granularity + tail,
                        capture_output=True,
                        encoding="utf-8",
                        errors="replace",
                    )
                    diff_body = self._strip_diff_header(result.stdout)
                    if diff_body:
                        return diff_body
        except (OSError, subprocess.SubprocessError, UnicodeError):
            # Git missing or not runnable, or the temp files could not be
            # written: the caller falls back to a generic message.
            pass
        return None

    @staticmethod
    def _strip_diff_header(diff_output) -> str:
        """Return the non-blank lines of *diff_output* that follow an ``@@`` line.

        Everything before the first hunk marker (``diff --git``, ``index``,
        ``---``, ``+++``) and the hunk markers themselves are dropped.
        """
        content_lines = []
        in_content = False
        for line in diff_output.split("\n"):
            if line.startswith("@@"):
                in_content = True
                continue
            if in_content and line.strip():
                content_lines.append(line)
        return "\n".join(content_lines)

    def _remove_claude_tracked_changes(self, root):
        """Revert the author's tracked changes in place.

        ``<w:ins>`` elements by the author are removed together with their
        content. ``<w:del>`` elements by the author are replaced by their
        children, with every ``<w:delText>`` inside renamed to ``<w:t>`` so
        the deleted text counts as regular text again.

        Args:
            root: Root element of a parsed ``document.xml``; modified in place.
        """
        w_ns = self.namespaces["w"]
        ins_tag = f"{{{w_ns}}}ins"
        del_tag = f"{{{w_ns}}}del"
        deltext_tag = f"{{{w_ns}}}delText"
        t_tag = f"{{{w_ns}}}t"
        author_attr = f"{{{w_ns}}}author"
        author = self.author

        # Drop the author's insertions. The matches are collected into a list
        # first so a parent's children are not removed while iterating them.
        for parent in root.iter():
            inserted = [
                child
                for child in parent
                if child.tag == ins_tag and child.get(author_attr) == author
            ]
            for elem in inserted:
                parent.remove(elem)

        # Unwrap the author's deletions.
        for parent in root.iter():
            deletions = [
                (index, child)
                for index, child in enumerate(parent)
                if child.tag == del_tag and child.get(author_attr) == author
            ]
            # Last to first, so the recorded indices of earlier matches stay
            # valid while children are being inserted.
            for del_index, del_elem in reversed(deletions):
                for elem in del_elem.iter():
                    if elem.tag == deltext_tag:
                        elem.tag = t_tag
                # Inserting in reverse at a fixed index keeps document order.
                for child in reversed(list(del_elem)):
                    parent.insert(del_index, child)
                parent.remove(del_elem)

    def _extract_text_content(self, root) -> str:
        """Extract text content from Word XML, preserving paragraph structure.

        Each ``<w:p>`` below *root* contributes the concatenated text of its
        ``<w:t>`` descendants. Empty paragraphs are skipped to avoid false
        positives when tracked insertions add only structural elements
        without text content.

        Returns:
            The non-empty paragraph texts joined with newlines.
        """
        w_ns = self.namespaces["w"]
        p_path = f".//{{{w_ns}}}p"
        t_path = f".//{{{w_ns}}}t"

        paragraphs = []
        for p_elem in root.findall(p_path):
            paragraph_text = "".join(
                t_elem.text for t_elem in p_elem.iterfind(t_path) if t_elem.text
            )
            if paragraph_text:
                paragraphs.append(paragraph_text)
        return "\n".join(paragraphs)
if __name__ == "__main__":
    # This file only provides importable definitions; running it as a script
    # is a usage error, so fail loudly instead of silently doing nothing.
    raise RuntimeError("This module should not be run directly.")
|