base.py 39 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
3704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829830831832833834835836837838839840841842843844845846847848849850851852853854855856857858859860861862863864865866867868869870871872873874875876877878879880881882883884885886887888889890891892893894895896897898899900901902903904905906907908909910911912913914915916917918919920921922923924925926927928929930931932933934935936937938939940941942943944945946947948949950951
  1. """
  2. Base validator with common validation logic for document files.
  3. """
  4. import re
  5. from pathlib import Path
  6. import lxml.etree
class BaseSchemaValidator:
    """Base validator with common validation logic for document files."""

    # Elements whose 'id' attributes must be unique within their file
    # Format: element_name (lowercased, namespace stripped) -> (attribute_name, scope)
    # scope can be 'file' (unique within file) or 'global' (unique across all files)
    UNIQUE_ID_REQUIREMENTS = {
        # Word elements
        "comment": ("id", "file"),  # Comment IDs in comments.xml
        "commentrangestart": ("id", "file"),  # Must match comment IDs
        "commentrangeend": ("id", "file"),  # Must match comment IDs
        "bookmarkstart": ("id", "file"),  # Bookmark start IDs
        "bookmarkend": ("id", "file"),  # Bookmark end IDs
        # Note: ins and del (track changes) can share IDs when part of same revision
        # PowerPoint elements
        "sldid": ("id", "file"),  # Slide IDs in presentation.xml
        "sldmasterid": ("id", "global"),  # Slide master IDs must be globally unique
        "sldlayoutid": ("id", "global"),  # Slide layout IDs must be globally unique
        "cm": ("authorid", "file"),  # Comment author IDs
        # Excel elements
        "sheet": ("sheetid", "file"),  # Sheet IDs in workbook.xml
        "definedname": ("id", "file"),  # Named range IDs
        # Drawing/Shape elements (all formats)
        "cxnsp": ("id", "file"),  # Connection shape IDs
        "sp": ("id", "file"),  # Shape IDs
        "pic": ("id", "file"),  # Picture IDs
        "grpsp": ("id", "file"),  # Group shape IDs
    }

    # Mapping of element names to expected relationship types
    # Subclasses should override this with format-specific mappings
    ELEMENT_RELATIONSHIP_TYPES = {}

    # Unified schema mappings for all Office document types.
    # Values are paths relative to self.schemas_dir.
    # NOTE(review): "fouth-edition" looks like a typo for "fourth-edition", but it
    # presumably matches the actual on-disk schemas directory name - confirm before renaming.
    SCHEMA_MAPPINGS = {
        # Document type specific schemas (keyed by main content folder name)
        "word": "ISO-IEC29500-4_2016/wml.xsd",  # Word documents
        "ppt": "ISO-IEC29500-4_2016/pml.xsd",  # PowerPoint presentations
        "xl": "ISO-IEC29500-4_2016/sml.xsd",  # Excel spreadsheets
        # Common file types (keyed by exact filename or suffix)
        "[Content_Types].xml": "ecma/fouth-edition/opc-contentTypes.xsd",
        "app.xml": "ISO-IEC29500-4_2016/shared-documentPropertiesExtended.xsd",
        "core.xml": "ecma/fouth-edition/opc-coreProperties.xsd",
        "custom.xml": "ISO-IEC29500-4_2016/shared-documentPropertiesCustom.xsd",
        ".rels": "ecma/fouth-edition/opc-relationships.xsd",
        # Word-specific files
        "people.xml": "microsoft/wml-2012.xsd",
        "commentsIds.xml": "microsoft/wml-cid-2016.xsd",
        "commentsExtensible.xml": "microsoft/wml-cex-2018.xsd",
        "commentsExtended.xml": "microsoft/wml-2012.xsd",
        # Chart files (common across document types)
        "chart": "ISO-IEC29500-4_2016/dml-chart.xsd",
        # Theme files (common across document types)
        "theme": "ISO-IEC29500-4_2016/dml-main.xsd",
        # Drawing and media files
        "drawing": "ISO-IEC29500-4_2016/dml-main.xsd",
    }

    # Unified namespace constants
    MC_NAMESPACE = "http://schemas.openxmlformats.org/markup-compatibility/2006"
    XML_NAMESPACE = "http://www.w3.org/XML/1998/namespace"

    # Common OOXML namespaces used across validators
    PACKAGE_RELATIONSHIPS_NAMESPACE = (
        "http://schemas.openxmlformats.org/package/2006/relationships"
    )
    OFFICE_RELATIONSHIPS_NAMESPACE = (
        "http://schemas.openxmlformats.org/officeDocument/2006/relationships"
    )
    CONTENT_TYPES_NAMESPACE = (
        "http://schemas.openxmlformats.org/package/2006/content-types"
    )

    # Folders where we should clean ignorable namespaces before XSD validation
    MAIN_CONTENT_FOLDERS = {"word", "ppt", "xl"}

    # All allowed OOXML namespaces (superset of all document types); elements and
    # attributes outside this set are stripped by _clean_ignorable_namespaces
    OOXML_NAMESPACES = {
        "http://schemas.openxmlformats.org/officeDocument/2006/math",
        "http://schemas.openxmlformats.org/officeDocument/2006/relationships",
        "http://schemas.openxmlformats.org/schemaLibrary/2006/main",
        "http://schemas.openxmlformats.org/drawingml/2006/main",
        "http://schemas.openxmlformats.org/drawingml/2006/chart",
        "http://schemas.openxmlformats.org/drawingml/2006/chartDrawing",
        "http://schemas.openxmlformats.org/drawingml/2006/diagram",
        "http://schemas.openxmlformats.org/drawingml/2006/picture",
        "http://schemas.openxmlformats.org/drawingml/2006/spreadsheetDrawing",
        "http://schemas.openxmlformats.org/drawingml/2006/wordprocessingDrawing",
        "http://schemas.openxmlformats.org/wordprocessingml/2006/main",
        "http://schemas.openxmlformats.org/presentationml/2006/main",
        "http://schemas.openxmlformats.org/spreadsheetml/2006/main",
        "http://schemas.openxmlformats.org/officeDocument/2006/sharedTypes",
        "http://www.w3.org/XML/1998/namespace",
    }
  94. def __init__(self, unpacked_dir, original_file, verbose=False):
  95. self.unpacked_dir = Path(unpacked_dir).resolve()
  96. self.original_file = Path(original_file)
  97. self.verbose = verbose
  98. # Set schemas directory
  99. self.schemas_dir = Path(__file__).parent.parent.parent / "schemas"
  100. # Get all XML and .rels files
  101. patterns = ["*.xml", "*.rels"]
  102. self.xml_files = [
  103. f for pattern in patterns for f in self.unpacked_dir.rglob(pattern)
  104. ]
  105. if not self.xml_files:
  106. print(f"Warning: No XML files found in {self.unpacked_dir}")
    def validate(self):
        """Run all validation checks and return True if all pass.

        Abstract hook: concrete subclasses override this to run their
        format-specific sequence of checks.

        Raises:
            NotImplementedError: always, in this base class.
        """
        raise NotImplementedError("Subclasses must implement the validate method")
  110. def validate_xml(self):
  111. """Validate that all XML files are well-formed."""
  112. errors = []
  113. for xml_file in self.xml_files:
  114. try:
  115. # Try to parse the XML file
  116. lxml.etree.parse(str(xml_file))
  117. except lxml.etree.XMLSyntaxError as e:
  118. errors.append(
  119. f" {xml_file.relative_to(self.unpacked_dir)}: "
  120. f"Line {e.lineno}: {e.msg}"
  121. )
  122. except Exception as e:
  123. errors.append(
  124. f" {xml_file.relative_to(self.unpacked_dir)}: "
  125. f"Unexpected error: {str(e)}"
  126. )
  127. if errors:
  128. print(f"FAILED - Found {len(errors)} XML violations:")
  129. for error in errors:
  130. print(error)
  131. return False
  132. else:
  133. if self.verbose:
  134. print("PASSED - All XML files are well-formed")
  135. return True
  136. def validate_namespaces(self):
  137. """Validate that namespace prefixes in Ignorable attributes are declared."""
  138. errors = []
  139. for xml_file in self.xml_files:
  140. try:
  141. root = lxml.etree.parse(str(xml_file)).getroot()
  142. declared = set(root.nsmap.keys()) - {None} # Exclude default namespace
  143. for attr_val in [
  144. v for k, v in root.attrib.items() if k.endswith("Ignorable")
  145. ]:
  146. undeclared = set(attr_val.split()) - declared
  147. errors.extend(
  148. f" {xml_file.relative_to(self.unpacked_dir)}: "
  149. f"Namespace '{ns}' in Ignorable but not declared"
  150. for ns in undeclared
  151. )
  152. except lxml.etree.XMLSyntaxError:
  153. continue
  154. if errors:
  155. print(f"FAILED - {len(errors)} namespace issues:")
  156. for error in errors:
  157. print(error)
  158. return False
  159. if self.verbose:
  160. print("PASSED - All namespace prefixes properly declared")
  161. return True
  162. def validate_unique_ids(self):
  163. """Validate that specific IDs are unique according to OOXML requirements."""
  164. errors = []
  165. global_ids = {} # Track globally unique IDs across all files
  166. for xml_file in self.xml_files:
  167. try:
  168. root = lxml.etree.parse(str(xml_file)).getroot()
  169. file_ids = {} # Track IDs that must be unique within this file
  170. # Remove all mc:AlternateContent elements from the tree
  171. mc_elements = root.xpath(
  172. ".//mc:AlternateContent", namespaces={"mc": self.MC_NAMESPACE}
  173. )
  174. for elem in mc_elements:
  175. elem.getparent().remove(elem)
  176. # Now check IDs in the cleaned tree
  177. for elem in root.iter():
  178. # Get the element name without namespace
  179. tag = (
  180. elem.tag.split("}")[-1].lower()
  181. if "}" in elem.tag
  182. else elem.tag.lower()
  183. )
  184. # Check if this element type has ID uniqueness requirements
  185. if tag in self.UNIQUE_ID_REQUIREMENTS:
  186. attr_name, scope = self.UNIQUE_ID_REQUIREMENTS[tag]
  187. # Look for the specified attribute
  188. id_value = None
  189. for attr, value in elem.attrib.items():
  190. attr_local = (
  191. attr.split("}")[-1].lower()
  192. if "}" in attr
  193. else attr.lower()
  194. )
  195. if attr_local == attr_name:
  196. id_value = value
  197. break
  198. if id_value is not None:
  199. if scope == "global":
  200. # Check global uniqueness
  201. if id_value in global_ids:
  202. prev_file, prev_line, prev_tag = global_ids[
  203. id_value
  204. ]
  205. errors.append(
  206. f" {xml_file.relative_to(self.unpacked_dir)}: "
  207. f"Line {elem.sourceline}: Global ID '{id_value}' in <{tag}> "
  208. f"already used in {prev_file} at line {prev_line} in <{prev_tag}>"
  209. )
  210. else:
  211. global_ids[id_value] = (
  212. xml_file.relative_to(self.unpacked_dir),
  213. elem.sourceline,
  214. tag,
  215. )
  216. elif scope == "file":
  217. # Check file-level uniqueness
  218. key = (tag, attr_name)
  219. if key not in file_ids:
  220. file_ids[key] = {}
  221. if id_value in file_ids[key]:
  222. prev_line = file_ids[key][id_value]
  223. errors.append(
  224. f" {xml_file.relative_to(self.unpacked_dir)}: "
  225. f"Line {elem.sourceline}: Duplicate {attr_name}='{id_value}' in <{tag}> "
  226. f"(first occurrence at line {prev_line})"
  227. )
  228. else:
  229. file_ids[key][id_value] = elem.sourceline
  230. except (lxml.etree.XMLSyntaxError, Exception) as e:
  231. errors.append(
  232. f" {xml_file.relative_to(self.unpacked_dir)}: Error: {e}"
  233. )
  234. if errors:
  235. print(f"FAILED - Found {len(errors)} ID uniqueness violations:")
  236. for error in errors:
  237. print(error)
  238. return False
  239. else:
  240. if self.verbose:
  241. print("PASSED - All required IDs are unique")
  242. return True
    def validate_file_references(self):
        """
        Validate that all .rels files properly reference files and that all files are referenced.

        Two checks in one pass:
          1. Every local (non-http/mailto) Target in every .rels file must
             resolve to an existing file.
          2. Every content file (excluding [Content_Types].xml and .rels files
             themselves) must be referenced by at least one .rels file.

        Returns:
            bool: True when there are no broken or missing references.
        """
        errors = []
        # Find all .rels files
        rels_files = list(self.unpacked_dir.rglob("*.rels"))
        if not rels_files:
            if self.verbose:
                print("PASSED - No .rels files found")
            return True
        # Get all files in the unpacked directory (excluding reference files)
        all_files = []
        for file_path in self.unpacked_dir.rglob("*"):
            if (
                file_path.is_file()
                and file_path.name != "[Content_Types].xml"
                and not file_path.name.endswith(".rels")
            ):  # This file is not referenced by .rels
                all_files.append(file_path.resolve())
        # Track all files that are referenced by any .rels file
        all_referenced_files = set()
        if self.verbose:
            print(
                f"Found {len(rels_files)} .rels files and {len(all_files)} target files"
            )
        # Check each .rels file
        for rels_file in rels_files:
            try:
                # Parse relationships file
                rels_root = lxml.etree.parse(str(rels_file)).getroot()
                # Get the directory where this .rels file is located
                rels_dir = rels_file.parent
                # Find all relationships and their targets
                referenced_files = set()
                broken_refs = []
                for rel in rels_root.findall(
                    ".//ns:Relationship",
                    namespaces={"ns": self.PACKAGE_RELATIONSHIPS_NAMESPACE},
                ):
                    target = rel.get("Target")
                    # "http" also covers "https" targets here
                    if target and not target.startswith(
                        ("http", "mailto:")
                    ):  # Skip external URLs
                        # Resolve the target path relative to the .rels file location
                        if rels_file.name == ".rels":
                            # Root .rels file - targets are relative to unpacked_dir
                            target_path = self.unpacked_dir / target
                        else:
                            # Other .rels files - targets are relative to their parent's parent
                            # e.g., word/_rels/document.xml.rels -> targets relative to word/
                            base_dir = rels_dir.parent
                            target_path = base_dir / target
                        # Normalize the path and check if it exists
                        try:
                            target_path = target_path.resolve()
                            if target_path.exists() and target_path.is_file():
                                referenced_files.add(target_path)
                                all_referenced_files.add(target_path)
                            else:
                                broken_refs.append((target, rel.sourceline))
                        except (OSError, ValueError):
                            # Unresolvable path counts as a broken reference
                            broken_refs.append((target, rel.sourceline))
                # Report broken references
                if broken_refs:
                    rel_path = rels_file.relative_to(self.unpacked_dir)
                    for broken_ref, line_num in broken_refs:
                        errors.append(
                            f" {rel_path}: Line {line_num}: Broken reference to {broken_ref}"
                        )
            except Exception as e:
                rel_path = rels_file.relative_to(self.unpacked_dir)
                errors.append(f" Error parsing {rel_path}: {e}")
        # Check for unreferenced files (files that exist but are not referenced anywhere)
        unreferenced_files = set(all_files) - all_referenced_files
        if unreferenced_files:
            for unref_file in sorted(unreferenced_files):
                unref_rel_path = unref_file.relative_to(self.unpacked_dir)
                errors.append(f" Unreferenced file: {unref_rel_path}")
        if errors:
            print(f"FAILED - Found {len(errors)} relationship validation errors:")
            for error in errors:
                print(error)
            print(
                "CRITICAL: These errors will cause the document to appear corrupt. "
                + "Broken references MUST be fixed, "
                + "and unreferenced files MUST be referenced or removed."
            )
            return False
        else:
            if self.verbose:
                print(
                    "PASSED - All references are valid and all files are properly referenced"
                )
            return True
  338. def validate_all_relationship_ids(self):
  339. """
  340. Validate that all r:id attributes in XML files reference existing IDs
  341. in their corresponding .rels files, and optionally validate relationship types.
  342. """
  343. import lxml.etree
  344. errors = []
  345. # Process each XML file that might contain r:id references
  346. for xml_file in self.xml_files:
  347. # Skip .rels files themselves
  348. if xml_file.suffix == ".rels":
  349. continue
  350. # Determine the corresponding .rels file
  351. # For dir/file.xml, it's dir/_rels/file.xml.rels
  352. rels_dir = xml_file.parent / "_rels"
  353. rels_file = rels_dir / f"{xml_file.name}.rels"
  354. # Skip if there's no corresponding .rels file (that's okay)
  355. if not rels_file.exists():
  356. continue
  357. try:
  358. # Parse the .rels file to get valid relationship IDs and their types
  359. rels_root = lxml.etree.parse(str(rels_file)).getroot()
  360. rid_to_type = {}
  361. for rel in rels_root.findall(
  362. f".//{{{self.PACKAGE_RELATIONSHIPS_NAMESPACE}}}Relationship"
  363. ):
  364. rid = rel.get("Id")
  365. rel_type = rel.get("Type", "")
  366. if rid:
  367. # Check for duplicate rIds
  368. if rid in rid_to_type:
  369. rels_rel_path = rels_file.relative_to(self.unpacked_dir)
  370. errors.append(
  371. f" {rels_rel_path}: Line {rel.sourceline}: "
  372. f"Duplicate relationship ID '{rid}' (IDs must be unique)"
  373. )
  374. # Extract just the type name from the full URL
  375. type_name = (
  376. rel_type.split("/")[-1] if "/" in rel_type else rel_type
  377. )
  378. rid_to_type[rid] = type_name
  379. # Parse the XML file to find all r:id references
  380. xml_root = lxml.etree.parse(str(xml_file)).getroot()
  381. # Find all elements with r:id attributes
  382. for elem in xml_root.iter():
  383. # Check for r:id attribute (relationship ID)
  384. rid_attr = elem.get(f"{{{self.OFFICE_RELATIONSHIPS_NAMESPACE}}}id")
  385. if rid_attr:
  386. xml_rel_path = xml_file.relative_to(self.unpacked_dir)
  387. elem_name = (
  388. elem.tag.split("}")[-1] if "}" in elem.tag else elem.tag
  389. )
  390. # Check if the ID exists
  391. if rid_attr not in rid_to_type:
  392. errors.append(
  393. f" {xml_rel_path}: Line {elem.sourceline}: "
  394. f"<{elem_name}> references non-existent relationship '{rid_attr}' "
  395. f"(valid IDs: {', '.join(sorted(rid_to_type.keys())[:5])}{'...' if len(rid_to_type) > 5 else ''})"
  396. )
  397. # Check if we have type expectations for this element
  398. elif self.ELEMENT_RELATIONSHIP_TYPES:
  399. expected_type = self._get_expected_relationship_type(
  400. elem_name
  401. )
  402. if expected_type:
  403. actual_type = rid_to_type[rid_attr]
  404. # Check if the actual type matches or contains the expected type
  405. if expected_type not in actual_type.lower():
  406. errors.append(
  407. f" {xml_rel_path}: Line {elem.sourceline}: "
  408. f"<{elem_name}> references '{rid_attr}' which points to '{actual_type}' "
  409. f"but should point to a '{expected_type}' relationship"
  410. )
  411. except Exception as e:
  412. xml_rel_path = xml_file.relative_to(self.unpacked_dir)
  413. errors.append(f" Error processing {xml_rel_path}: {e}")
  414. if errors:
  415. print(f"FAILED - Found {len(errors)} relationship ID reference errors:")
  416. for error in errors:
  417. print(error)
  418. print("\nThese ID mismatches will cause the document to appear corrupt!")
  419. return False
  420. else:
  421. if self.verbose:
  422. print("PASSED - All relationship ID references are valid")
  423. return True
  424. def _get_expected_relationship_type(self, element_name):
  425. """
  426. Get the expected relationship type for an element.
  427. First checks the explicit mapping, then tries pattern detection.
  428. """
  429. # Normalize element name to lowercase
  430. elem_lower = element_name.lower()
  431. # Check explicit mapping first
  432. if elem_lower in self.ELEMENT_RELATIONSHIP_TYPES:
  433. return self.ELEMENT_RELATIONSHIP_TYPES[elem_lower]
  434. # Try pattern detection for common patterns
  435. # Pattern 1: Elements ending in "Id" often expect a relationship of the prefix type
  436. if elem_lower.endswith("id") and len(elem_lower) > 2:
  437. # e.g., "sldId" -> "sld", "sldMasterId" -> "sldMaster"
  438. prefix = elem_lower[:-2] # Remove "id"
  439. # Check if this might be a compound like "sldMasterId"
  440. if prefix.endswith("master"):
  441. return prefix.lower()
  442. elif prefix.endswith("layout"):
  443. return prefix.lower()
  444. else:
  445. # Simple case like "sldId" -> "slide"
  446. # Common transformations
  447. if prefix == "sld":
  448. return "slide"
  449. return prefix.lower()
  450. # Pattern 2: Elements ending in "Reference" expect a relationship of the prefix type
  451. if elem_lower.endswith("reference") and len(elem_lower) > 9:
  452. prefix = elem_lower[:-9] # Remove "reference"
  453. return prefix.lower()
  454. return None
    def validate_content_types(self):
        """Validate that all content files are properly declared in [Content_Types].xml.

        Two checks:
          1. XML parts whose root element is in declarable_roots must have an
             Override declaration (matched by part path).
          2. Non-XML files with a known media extension must have a Default
             declaration (matched by extension).

        Returns:
            bool: True when all required declarations are present.
        """
        errors = []
        # Find [Content_Types].xml file
        content_types_file = self.unpacked_dir / "[Content_Types].xml"
        if not content_types_file.exists():
            print("FAILED - [Content_Types].xml file not found")
            return False
        try:
            # Parse and get all declared parts and extensions
            root = lxml.etree.parse(str(content_types_file)).getroot()
            declared_parts = set()
            declared_extensions = set()
            # Get Override declarations (specific files)
            for override in root.findall(
                f".//{{{self.CONTENT_TYPES_NAMESPACE}}}Override"
            ):
                part_name = override.get("PartName")
                if part_name is not None:
                    # PartName starts with "/" in the package; strip for comparison
                    declared_parts.add(part_name.lstrip("/"))
            # Get Default declarations (by extension)
            for default in root.findall(
                f".//{{{self.CONTENT_TYPES_NAMESPACE}}}Default"
            ):
                extension = default.get("Extension")
                if extension is not None:
                    declared_extensions.add(extension.lower())
            # Root elements that require content type declaration
            declarable_roots = {
                "sld",
                "sldLayout",
                "sldMaster",
                "presentation",  # PowerPoint
                "document",  # Word
                "workbook",
                "worksheet",  # Excel
                "theme",  # Common
            }
            # Common media file extensions that should be declared
            media_extensions = {
                "png": "image/png",
                "jpg": "image/jpeg",
                "jpeg": "image/jpeg",
                "gif": "image/gif",
                "bmp": "image/bmp",
                "tiff": "image/tiff",
                "wmf": "image/x-wmf",
                "emf": "image/x-emf",
            }
            # Get all files in the unpacked directory
            all_files = list(self.unpacked_dir.rglob("*"))
            all_files = [f for f in all_files if f.is_file()]
            # Check all XML files for Override declarations
            for xml_file in self.xml_files:
                # Normalize to forward slashes so paths compare on Windows too
                path_str = str(xml_file.relative_to(self.unpacked_dir)).replace(
                    "\\", "/"
                )
                # Skip non-content files
                if any(
                    skip in path_str
                    for skip in [".rels", "[Content_Types]", "docProps/", "_rels/"]
                ):
                    continue
                try:
                    root_tag = lxml.etree.parse(str(xml_file)).getroot().tag
                    root_name = root_tag.split("}")[-1] if "}" in root_tag else root_tag
                    if root_name in declarable_roots and path_str not in declared_parts:
                        errors.append(
                            f" {path_str}: File with <{root_name}> root not declared in [Content_Types].xml"
                        )
                except Exception:
                    continue  # Skip unparseable files
            # Check all non-XML files for Default extension declarations
            for file_path in all_files:
                # Skip XML files and metadata files (already checked above)
                if file_path.suffix.lower() in {".xml", ".rels"}:
                    continue
                if file_path.name == "[Content_Types].xml":
                    continue
                if "_rels" in file_path.parts or "docProps" in file_path.parts:
                    continue
                extension = file_path.suffix.lstrip(".").lower()
                if extension and extension not in declared_extensions:
                    # Check if it's a known media extension that should be declared
                    if extension in media_extensions:
                        relative_path = file_path.relative_to(self.unpacked_dir)
                        errors.append(
                            f' {relative_path}: File with extension \'{extension}\' not declared in [Content_Types].xml - should add: <Default Extension="{extension}" ContentType="{media_extensions[extension]}"/>'
                        )
        except Exception as e:
            errors.append(f" Error parsing [Content_Types].xml: {e}")
        if errors:
            print(f"FAILED - Found {len(errors)} content type declaration errors:")
            for error in errors:
                print(error)
            return False
        else:
            if self.verbose:
                print(
                    "PASSED - All content files are properly declared in [Content_Types].xml"
                )
            return True
  557. def validate_file_against_xsd(self, xml_file, verbose=False):
  558. """Validate a single XML file against XSD schema, comparing with original.
  559. Args:
  560. xml_file: Path to XML file to validate
  561. verbose: Enable verbose output
  562. Returns:
  563. tuple: (is_valid, new_errors_set) where is_valid is True/False/None (skipped)
  564. """
  565. # Resolve both paths to handle symlinks
  566. xml_file = Path(xml_file).resolve()
  567. unpacked_dir = self.unpacked_dir.resolve()
  568. # Validate current file
  569. is_valid, current_errors = self._validate_single_file_xsd(
  570. xml_file, unpacked_dir
  571. )
  572. if is_valid is None:
  573. return None, set() # Skipped
  574. elif is_valid:
  575. return True, set() # Valid, no errors
  576. # Get errors from original file for this specific file
  577. original_errors = self._get_original_file_errors(xml_file)
  578. # Compare with original (both are guaranteed to be sets here)
  579. assert current_errors is not None
  580. new_errors = current_errors - original_errors
  581. if new_errors:
  582. if verbose:
  583. relative_path = xml_file.relative_to(unpacked_dir)
  584. print(f"FAILED - {relative_path}: {len(new_errors)} new error(s)")
  585. for error in list(new_errors)[:3]:
  586. truncated = error[:250] + "..." if len(error) > 250 else error
  587. print(f" - {truncated}")
  588. return False, new_errors
  589. else:
  590. # All errors existed in original
  591. if verbose:
  592. print(
  593. f"PASSED - No new errors (original had {len(current_errors)} errors)"
  594. )
  595. return True, set()
  596. def validate_against_xsd(self):
  597. """Validate XML files against XSD schemas, showing only new errors compared to original."""
  598. new_errors = []
  599. original_error_count = 0
  600. valid_count = 0
  601. skipped_count = 0
  602. for xml_file in self.xml_files:
  603. relative_path = str(xml_file.relative_to(self.unpacked_dir))
  604. is_valid, new_file_errors = self.validate_file_against_xsd(
  605. xml_file, verbose=False
  606. )
  607. if is_valid is None:
  608. skipped_count += 1
  609. continue
  610. elif is_valid and not new_file_errors:
  611. valid_count += 1
  612. continue
  613. elif is_valid:
  614. # Had errors but all existed in original
  615. original_error_count += 1
  616. valid_count += 1
  617. continue
  618. # Has new errors
  619. new_errors.append(f" {relative_path}: {len(new_file_errors)} new error(s)")
  620. for error in list(new_file_errors)[:3]: # Show first 3 errors
  621. new_errors.append(
  622. f" - {error[:250]}..." if len(error) > 250 else f" - {error}"
  623. )
  624. # Print summary
  625. if self.verbose:
  626. print(f"Validated {len(self.xml_files)} files:")
  627. print(f" - Valid: {valid_count}")
  628. print(f" - Skipped (no schema): {skipped_count}")
  629. if original_error_count:
  630. print(f" - With original errors (ignored): {original_error_count}")
  631. print(
  632. f" - With NEW errors: {len(new_errors) > 0 and len([e for e in new_errors if not e.startswith(' ')]) or 0}"
  633. )
  634. if new_errors:
  635. print("\nFAILED - Found NEW validation errors:")
  636. for error in new_errors:
  637. print(error)
  638. return False
  639. else:
  640. if self.verbose:
  641. print("\nPASSED - No new XSD validation errors introduced")
  642. return True
  643. def _get_schema_path(self, xml_file):
  644. """Determine the appropriate schema path for an XML file."""
  645. # Check exact filename match
  646. if xml_file.name in self.SCHEMA_MAPPINGS:
  647. return self.schemas_dir / self.SCHEMA_MAPPINGS[xml_file.name]
  648. # Check .rels files
  649. if xml_file.suffix == ".rels":
  650. return self.schemas_dir / self.SCHEMA_MAPPINGS[".rels"]
  651. # Check chart files
  652. if "charts/" in str(xml_file) and xml_file.name.startswith("chart"):
  653. return self.schemas_dir / self.SCHEMA_MAPPINGS["chart"]
  654. # Check theme files
  655. if "theme/" in str(xml_file) and xml_file.name.startswith("theme"):
  656. return self.schemas_dir / self.SCHEMA_MAPPINGS["theme"]
  657. # Check if file is in a main content folder and use appropriate schema
  658. if xml_file.parent.name in self.MAIN_CONTENT_FOLDERS:
  659. return self.schemas_dir / self.SCHEMA_MAPPINGS[xml_file.parent.name]
  660. return None
  661. def _clean_ignorable_namespaces(self, xml_doc):
  662. """Remove attributes and elements not in allowed namespaces."""
  663. # Create a clean copy
  664. xml_string = lxml.etree.tostring(xml_doc, encoding="unicode")
  665. xml_copy = lxml.etree.fromstring(xml_string)
  666. # Remove attributes not in allowed namespaces
  667. for elem in xml_copy.iter():
  668. attrs_to_remove = []
  669. for attr in elem.attrib:
  670. # Check if attribute is from a namespace other than allowed ones
  671. if "{" in attr:
  672. ns = attr.split("}")[0][1:]
  673. if ns not in self.OOXML_NAMESPACES:
  674. attrs_to_remove.append(attr)
  675. # Remove collected attributes
  676. for attr in attrs_to_remove:
  677. del elem.attrib[attr]
  678. # Remove elements not in allowed namespaces
  679. self._remove_ignorable_elements(xml_copy)
  680. return lxml.etree.ElementTree(xml_copy)
  681. def _remove_ignorable_elements(self, root):
  682. """Recursively remove all elements not in allowed namespaces."""
  683. elements_to_remove = []
  684. # Find elements to remove
  685. for elem in list(root):
  686. # Skip non-element nodes (comments, processing instructions, etc.)
  687. if not hasattr(elem, "tag") or callable(elem.tag):
  688. continue
  689. tag_str = str(elem.tag)
  690. if tag_str.startswith("{"):
  691. ns = tag_str.split("}")[0][1:]
  692. if ns not in self.OOXML_NAMESPACES:
  693. elements_to_remove.append(elem)
  694. continue
  695. # Recursively clean child elements
  696. self._remove_ignorable_elements(elem)
  697. # Remove collected elements
  698. for elem in elements_to_remove:
  699. root.remove(elem)
  700. def _preprocess_for_mc_ignorable(self, xml_doc):
  701. """Preprocess XML to handle mc:Ignorable attribute properly."""
  702. # Remove mc:Ignorable attributes before validation
  703. root = xml_doc.getroot()
  704. # Remove mc:Ignorable attribute from root
  705. if f"{{{self.MC_NAMESPACE}}}Ignorable" in root.attrib:
  706. del root.attrib[f"{{{self.MC_NAMESPACE}}}Ignorable"]
  707. return xml_doc
  708. def _validate_single_file_xsd(self, xml_file, base_path):
  709. """Validate a single XML file against XSD schema. Returns (is_valid, errors_set)."""
  710. schema_path = self._get_schema_path(xml_file)
  711. if not schema_path:
  712. return None, None # Skip file
  713. try:
  714. # Load schema
  715. with open(schema_path, "rb") as xsd_file:
  716. parser = lxml.etree.XMLParser()
  717. xsd_doc = lxml.etree.parse(
  718. xsd_file, parser=parser, base_url=str(schema_path)
  719. )
  720. schema = lxml.etree.XMLSchema(xsd_doc)
  721. # Load and preprocess XML
  722. with open(xml_file, "r") as f:
  723. xml_doc = lxml.etree.parse(f)
  724. xml_doc, _ = self._remove_template_tags_from_text_nodes(xml_doc)
  725. xml_doc = self._preprocess_for_mc_ignorable(xml_doc)
  726. # Clean ignorable namespaces if needed
  727. relative_path = xml_file.relative_to(base_path)
  728. if (
  729. relative_path.parts
  730. and relative_path.parts[0] in self.MAIN_CONTENT_FOLDERS
  731. ):
  732. xml_doc = self._clean_ignorable_namespaces(xml_doc)
  733. # Validate
  734. if schema.validate(xml_doc):
  735. return True, set()
  736. else:
  737. errors = set()
  738. for error in schema.error_log:
  739. # Store normalized error message (without line numbers for comparison)
  740. errors.add(error.message)
  741. return False, errors
  742. except Exception as e:
  743. return False, {str(e)}
  744. def _get_original_file_errors(self, xml_file):
  745. """Get XSD validation errors from a single file in the original document.
  746. Args:
  747. xml_file: Path to the XML file in unpacked_dir to check
  748. Returns:
  749. set: Set of error messages from the original file
  750. """
  751. import tempfile
  752. import zipfile
  753. # Resolve both paths to handle symlinks (e.g., /var vs /private/var on macOS)
  754. xml_file = Path(xml_file).resolve()
  755. unpacked_dir = self.unpacked_dir.resolve()
  756. relative_path = xml_file.relative_to(unpacked_dir)
  757. with tempfile.TemporaryDirectory() as temp_dir:
  758. temp_path = Path(temp_dir)
  759. # Extract original file
  760. with zipfile.ZipFile(self.original_file, "r") as zip_ref:
  761. zip_ref.extractall(temp_path)
  762. # Find corresponding file in original
  763. original_xml_file = temp_path / relative_path
  764. if not original_xml_file.exists():
  765. # File didn't exist in original, so no original errors
  766. return set()
  767. # Validate the specific file in original
  768. is_valid, errors = self._validate_single_file_xsd(
  769. original_xml_file, temp_path
  770. )
  771. return errors if errors else set()
  772. def _remove_template_tags_from_text_nodes(self, xml_doc):
  773. """Remove template tags from XML text nodes and collect warnings.
  774. Template tags follow the pattern {{ ... }} and are used as placeholders
  775. for content replacement. They should be removed from text content before
  776. XSD validation while preserving XML structure.
  777. Returns:
  778. tuple: (cleaned_xml_doc, warnings_list)
  779. """
  780. warnings = []
  781. template_pattern = re.compile(r"\{\{[^}]*\}\}")
  782. # Create a copy of the document to avoid modifying the original
  783. xml_string = lxml.etree.tostring(xml_doc, encoding="unicode")
  784. xml_copy = lxml.etree.fromstring(xml_string)
  785. def process_text_content(text, content_type):
  786. if not text:
  787. return text
  788. matches = list(template_pattern.finditer(text))
  789. if matches:
  790. for match in matches:
  791. warnings.append(
  792. f"Found template tag in {content_type}: {match.group()}"
  793. )
  794. return template_pattern.sub("", text)
  795. return text
  796. # Process all text nodes in the document
  797. for elem in xml_copy.iter():
  798. # Skip processing if this is a w:t element
  799. if not hasattr(elem, "tag") or callable(elem.tag):
  800. continue
  801. tag_str = str(elem.tag)
  802. if tag_str.endswith("}t") or tag_str == "t":
  803. continue
  804. elem.text = process_text_content(elem.text, "text content")
  805. elem.tail = process_text_content(elem.tail, "tail content")
  806. return lxml.etree.ElementTree(xml_copy), warnings
# Library-only module: it exposes validation helpers for import and has no
# CLI entry point, so direct execution is refused explicitly.
if __name__ == "__main__":
    raise RuntimeError("This module should not be run directly.")