| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274 |
- """
- Validator for Word document XML files against XSD schemas.
- """
- import re
- import tempfile
- import zipfile
- import lxml.etree
- from .base import BaseSchemaValidator
class DOCXSchemaValidator(BaseSchemaValidator):
    """Validator for Word document XML files against XSD schemas.

    On top of the generic checks inherited from ``BaseSchemaValidator``,
    this class adds Word-specific checks that operate on files named
    ``document.xml``: whitespace preservation on ``w:t`` elements, and
    correct text element usage inside tracked deletions and insertions.
    """

    # Word-specific namespace
    WORD_2006_NAMESPACE = "http://schemas.openxmlformats.org/wordprocessingml/2006/main"

    # Word-specific element to relationship type mappings.
    # Start with an empty mapping - add specific cases as we discover them.
    ELEMENT_RELATIONSHIP_TYPES = {}

    # File name the Word-specific checks restrict themselves to.
    _DOCUMENT_XML_NAME = "document.xml"

    # Archive member of the original .docx that holds the main document part.
    _ORIGINAL_DOCUMENT_MEMBER = "word/document.xml"

    # Maximum length of the repr() preview shown in error messages.
    _PREVIEW_LIMIT = 50

    def validate(self) -> bool:
        """Run all validation checks and return True if all pass.

        A well-formedness failure aborts immediately, since the remaining
        checks need parseable XML. Every other check is always executed, even
        after an earlier one failed, so that one run reports all problems.
        The paragraph count comparison is informational only and does not
        affect the result.
        """
        # Test 0: XML well-formedness
        if not self.validate_xml():
            return False

        checks = (
            self.validate_namespaces,  # Test 1: Namespace declarations
            self.validate_unique_ids,  # Test 2: Unique IDs
            self.validate_file_references,  # Test 3: Relationships / file refs
            self.validate_content_types,  # Test 4: Content type declarations
            self.validate_against_xsd,  # Test 5: XSD schema validation
            self.validate_whitespace_preservation,  # Test 6: Whitespace
            self.validate_deletions,  # Test 7: Deletion validation
            self.validate_insertions,  # Test 8: Insertion validation
            self.validate_all_relationship_ids,  # Test 9: Relationship IDs
        )

        all_valid = True
        for check in checks:
            # Deliberately no short-circuit: each check prints its own report.
            if not check():
                all_valid = False

        # Count and compare paragraphs
        self.compare_paragraph_counts()

        return all_valid

    # ------------------------------------------------------------------
    # Private helpers shared by the Word-specific checks
    # ------------------------------------------------------------------

    def _iter_document_xml_files(self):
        """Yield the entries of ``self.xml_files`` named ``document.xml``."""
        for xml_file in self.xml_files:
            if xml_file.name == self._DOCUMENT_XML_NAME:
                yield xml_file

    @classmethod
    def _text_preview(cls, text) -> str:
        """Return ``repr(text)``, truncated with '...' when it is too long."""
        shown = repr(text)
        if len(shown) > cls._PREVIEW_LIMIT:
            return shown[: cls._PREVIEW_LIMIT] + "..."
        return shown

    def _finish_check(self, errors, failure_summary, success_message) -> bool:
        """Print the outcome of one check and return True when it passed.

        Args:
            errors: Pre-formatted error lines collected by the check.
            failure_summary: Noun phrase used in the FAILED headline.
            success_message: Text printed after "PASSED - " in verbose mode.
        """
        if errors:
            print(f"FAILED - Found {len(errors)} {failure_summary}:")
            for error in errors:
                print(error)
            return False

        if self.verbose:
            print(f"PASSED - {success_message}")
        return True

    # ------------------------------------------------------------------
    # Word-specific checks
    # ------------------------------------------------------------------

    def validate_whitespace_preservation(self) -> bool:
        """
        Validate that w:t elements with whitespace have xml:space='preserve'.

        A w:t element is reported when its text starts or ends with a
        whitespace character and it does not carry xml:space="preserve".

        Returns:
            True when no violation (and no parse error) was found.
        """
        errors = []
        text_tag = f"{{{self.WORD_2006_NAMESPACE}}}t"
        xml_space_attr = f"{{{self.XML_NAMESPACE}}}space"

        for xml_file in self._iter_document_xml_files():
            try:
                root = lxml.etree.parse(str(xml_file)).getroot()

                for elem in root.iter(text_tag):
                    text = elem.text
                    if not text:
                        continue

                    # Inspect the first/last character directly. A regex such
                    # as r".*\s$" misses trailing whitespace whenever the text
                    # contains a newline, because "." does not match "\n".
                    if not (text[0].isspace() or text[-1].isspace()):
                        continue

                    if elem.get(xml_space_attr) == "preserve":
                        continue

                    errors.append(
                        f" {xml_file.relative_to(self.unpacked_dir)}: "
                        f"Line {elem.sourceline}: w:t element with whitespace missing xml:space='preserve': {self._text_preview(text)}"
                    )
            except Exception as e:
                # Covers lxml.etree.XMLSyntaxError as well; a file that cannot
                # be checked is reported as a failure rather than skipped.
                errors.append(
                    f" {xml_file.relative_to(self.unpacked_dir)}: Error: {e}"
                )

        return self._finish_check(
            errors,
            "whitespace preservation violations",
            "All whitespace is properly preserved",
        )

    def validate_deletions(self) -> bool:
        """
        Validate that w:t elements are not within w:del elements.
        For some reason, XSD validation does not catch this, so we do it manually.

        Only w:t elements with non-empty text are reported.

        Returns:
            True when no violation (and no parse error) was found.
        """
        errors = []
        namespaces = {"w": self.WORD_2006_NAMESPACE}

        for xml_file in self._iter_document_xml_files():
            try:
                root = lxml.etree.parse(str(xml_file)).getroot()

                # All w:t elements that are descendants of a w:del element
                for t_elem in root.xpath(".//w:del//w:t", namespaces=namespaces):
                    if not t_elem.text:
                        continue
                    errors.append(
                        f" {xml_file.relative_to(self.unpacked_dir)}: "
                        f"Line {t_elem.sourceline}: <w:t> found within <w:del>: {self._text_preview(t_elem.text)}"
                    )
            except Exception as e:
                errors.append(
                    f" {xml_file.relative_to(self.unpacked_dir)}: Error: {e}"
                )

        return self._finish_check(
            errors,
            "deletion validation violations",
            "No w:t elements found within w:del elements",
        )

    def count_paragraphs_in_unpacked(self) -> int:
        """Count the number of paragraphs in the unpacked document.

        Returns:
            The number of w:p elements in document.xml, or 0 when no such
            file is present or it cannot be parsed (the error is printed).
        """
        count = 0
        paragraph_path = f".//{{{self.WORD_2006_NAMESPACE}}}p"

        for xml_file in self._iter_document_xml_files():
            try:
                root = lxml.etree.parse(str(xml_file)).getroot()
                # NOTE(review): if several document.xml files are present, the
                # count of the last one parsed wins (not the sum).
                count = len(root.findall(paragraph_path))
            except Exception as e:
                print(f"Error counting paragraphs in unpacked document: {e}")

        return count

    def count_paragraphs_in_original(self) -> int:
        """Count the number of paragraphs in the original docx file.

        The main document part is read straight out of the archive, so the
        package does not have to be extracted to disk first.

        Returns:
            The number of w:p elements in word/document.xml, or 0 when the
            archive or that member cannot be read or parsed (the error is
            printed).
        """
        count = 0

        try:
            with zipfile.ZipFile(self.original_file, "r") as zip_ref:
                document_bytes = zip_ref.read(self._ORIGINAL_DOCUMENT_MEMBER)

            root = lxml.etree.fromstring(document_bytes)
            # Count all w:p elements
            count = len(root.findall(f".//{{{self.WORD_2006_NAMESPACE}}}p"))
        except Exception as e:
            print(f"Error counting paragraphs in original document: {e}")

        return count

    def validate_insertions(self) -> bool:
        """
        Validate that w:delText elements are not within w:ins elements.
        w:delText is only allowed in w:ins if nested within a w:del.

        Returns:
            True when no violation (and no parse error) was found.
        """
        errors = []
        namespaces = {"w": self.WORD_2006_NAMESPACE}

        for xml_file in self._iter_document_xml_files():
            try:
                root = lxml.etree.parse(str(xml_file)).getroot()

                # Find w:delText in w:ins that are NOT within w:del
                invalid_elements = root.xpath(
                    ".//w:ins//w:delText[not(ancestor::w:del)]",
                    namespaces=namespaces,
                )

                for elem in invalid_elements:
                    errors.append(
                        f" {xml_file.relative_to(self.unpacked_dir)}: "
                        f"Line {elem.sourceline}: <w:delText> within <w:ins>: {self._text_preview(elem.text or '')}"
                    )
            except Exception as e:
                errors.append(
                    f" {xml_file.relative_to(self.unpacked_dir)}: Error: {e}"
                )

        return self._finish_check(
            errors,
            "insertion validation violations",
            "No w:delText elements within w:ins elements",
        )

    def compare_paragraph_counts(self) -> None:
        """Compare paragraph counts between original and new document.

        Prints one summary line with both counts and the signed difference;
        nothing is returned and the validation result is not affected.
        """
        original_count = self.count_paragraphs_in_original()
        new_count = self.count_paragraphs_in_unpacked()

        diff = new_count - original_count
        # Show an explicit "+" for growth; negative numbers carry their sign.
        diff_str = f"+{diff}" if diff > 0 else str(diff)
        print(f"\nParagraphs: {original_count} → {new_count} ({diff_str})")
# This module only provides DOCXSchemaValidator for import; executing it as a
# script is a usage error and is rejected explicitly.
if __name__ == "__main__":
    _direct_run_message = "This module should not be run directly."
    raise RuntimeError(_direct_run_message)
|