# pptx.py
"""
Validator for PowerPoint presentation XML files against XSD schemas.
"""
import re

from .base import BaseSchemaValidator
  6. class PPTXSchemaValidator(BaseSchemaValidator):
  7. """Validator for PowerPoint presentation XML files against XSD schemas."""
  8. # PowerPoint presentation namespace
  9. PRESENTATIONML_NAMESPACE = (
  10. "http://schemas.openxmlformats.org/presentationml/2006/main"
  11. )
  12. # PowerPoint-specific element to relationship type mappings
  13. ELEMENT_RELATIONSHIP_TYPES = {
  14. "sldid": "slide",
  15. "sldmasterid": "slidemaster",
  16. "notesmasterid": "notesmaster",
  17. "sldlayoutid": "slidelayout",
  18. "themeid": "theme",
  19. "tablestyleid": "tablestyles",
  20. }
  21. def validate(self):
  22. """Run all validation checks and return True if all pass."""
  23. # Test 0: XML well-formedness
  24. if not self.validate_xml():
  25. return False
  26. # Test 1: Namespace declarations
  27. all_valid = True
  28. if not self.validate_namespaces():
  29. all_valid = False
  30. # Test 2: Unique IDs
  31. if not self.validate_unique_ids():
  32. all_valid = False
  33. # Test 3: UUID ID validation
  34. if not self.validate_uuid_ids():
  35. all_valid = False
  36. # Test 4: Relationship and file reference validation
  37. if not self.validate_file_references():
  38. all_valid = False
  39. # Test 5: Slide layout ID validation
  40. if not self.validate_slide_layout_ids():
  41. all_valid = False
  42. # Test 6: Content type declarations
  43. if not self.validate_content_types():
  44. all_valid = False
  45. # Test 7: XSD schema validation
  46. if not self.validate_against_xsd():
  47. all_valid = False
  48. # Test 8: Notes slide reference validation
  49. if not self.validate_notes_slide_references():
  50. all_valid = False
  51. # Test 9: Relationship ID reference validation
  52. if not self.validate_all_relationship_ids():
  53. all_valid = False
  54. # Test 10: Duplicate slide layout references validation
  55. if not self.validate_no_duplicate_slide_layouts():
  56. all_valid = False
  57. return all_valid
  58. def validate_uuid_ids(self):
  59. """Validate that ID attributes that look like UUIDs contain only hex values."""
  60. import lxml.etree
  61. errors = []
  62. # UUID pattern: 8-4-4-4-12 hex digits with optional braces/hyphens
  63. uuid_pattern = re.compile(
  64. r"^[\{\(]?[0-9A-Fa-f]{8}-?[0-9A-Fa-f]{4}-?[0-9A-Fa-f]{4}-?[0-9A-Fa-f]{4}-?[0-9A-Fa-f]{12}[\}\)]?$"
  65. )
  66. for xml_file in self.xml_files:
  67. try:
  68. root = lxml.etree.parse(str(xml_file)).getroot()
  69. # Check all elements for ID attributes
  70. for elem in root.iter():
  71. for attr, value in elem.attrib.items():
  72. # Check if this is an ID attribute
  73. attr_name = attr.split("}")[-1].lower()
  74. if attr_name == "id" or attr_name.endswith("id"):
  75. # Check if value looks like a UUID (has the right length and pattern structure)
  76. if self._looks_like_uuid(value):
  77. # Validate that it contains only hex characters in the right positions
  78. if not uuid_pattern.match(value):
  79. errors.append(
  80. f" {xml_file.relative_to(self.unpacked_dir)}: "
  81. f"Line {elem.sourceline}: ID '{value}' appears to be a UUID but contains invalid hex characters"
  82. )
  83. except (lxml.etree.XMLSyntaxError, Exception) as e:
  84. errors.append(
  85. f" {xml_file.relative_to(self.unpacked_dir)}: Error: {e}"
  86. )
  87. if errors:
  88. print(f"FAILED - Found {len(errors)} UUID ID validation errors:")
  89. for error in errors:
  90. print(error)
  91. return False
  92. else:
  93. if self.verbose:
  94. print("PASSED - All UUID-like IDs contain valid hex values")
  95. return True
  96. def _looks_like_uuid(self, value):
  97. """Check if a value has the general structure of a UUID."""
  98. # Remove common UUID delimiters
  99. clean_value = value.strip("{}()").replace("-", "")
  100. # Check if it's 32 hex-like characters (could include invalid hex chars)
  101. return len(clean_value) == 32 and all(c.isalnum() for c in clean_value)
  102. def validate_slide_layout_ids(self):
  103. """Validate that sldLayoutId elements in slide masters reference valid slide layouts."""
  104. import lxml.etree
  105. errors = []
  106. # Find all slide master files
  107. slide_masters = list(self.unpacked_dir.glob("ppt/slideMasters/*.xml"))
  108. if not slide_masters:
  109. if self.verbose:
  110. print("PASSED - No slide masters found")
  111. return True
  112. for slide_master in slide_masters:
  113. try:
  114. # Parse the slide master file
  115. root = lxml.etree.parse(str(slide_master)).getroot()
  116. # Find the corresponding _rels file for this slide master
  117. rels_file = slide_master.parent / "_rels" / f"{slide_master.name}.rels"
  118. if not rels_file.exists():
  119. errors.append(
  120. f" {slide_master.relative_to(self.unpacked_dir)}: "
  121. f"Missing relationships file: {rels_file.relative_to(self.unpacked_dir)}"
  122. )
  123. continue
  124. # Parse the relationships file
  125. rels_root = lxml.etree.parse(str(rels_file)).getroot()
  126. # Build a set of valid relationship IDs that point to slide layouts
  127. valid_layout_rids = set()
  128. for rel in rels_root.findall(
  129. f".//{{{self.PACKAGE_RELATIONSHIPS_NAMESPACE}}}Relationship"
  130. ):
  131. rel_type = rel.get("Type", "")
  132. if "slideLayout" in rel_type:
  133. valid_layout_rids.add(rel.get("Id"))
  134. # Find all sldLayoutId elements in the slide master
  135. for sld_layout_id in root.findall(
  136. f".//{{{self.PRESENTATIONML_NAMESPACE}}}sldLayoutId"
  137. ):
  138. r_id = sld_layout_id.get(
  139. f"{{{self.OFFICE_RELATIONSHIPS_NAMESPACE}}}id"
  140. )
  141. layout_id = sld_layout_id.get("id")
  142. if r_id and r_id not in valid_layout_rids:
  143. errors.append(
  144. f" {slide_master.relative_to(self.unpacked_dir)}: "
  145. f"Line {sld_layout_id.sourceline}: sldLayoutId with id='{layout_id}' "
  146. f"references r:id='{r_id}' which is not found in slide layout relationships"
  147. )
  148. except (lxml.etree.XMLSyntaxError, Exception) as e:
  149. errors.append(
  150. f" {slide_master.relative_to(self.unpacked_dir)}: Error: {e}"
  151. )
  152. if errors:
  153. print(f"FAILED - Found {len(errors)} slide layout ID validation errors:")
  154. for error in errors:
  155. print(error)
  156. print(
  157. "Remove invalid references or add missing slide layouts to the relationships file."
  158. )
  159. return False
  160. else:
  161. if self.verbose:
  162. print("PASSED - All slide layout IDs reference valid slide layouts")
  163. return True
  164. def validate_no_duplicate_slide_layouts(self):
  165. """Validate that each slide has exactly one slideLayout reference."""
  166. import lxml.etree
  167. errors = []
  168. slide_rels_files = list(self.unpacked_dir.glob("ppt/slides/_rels/*.xml.rels"))
  169. for rels_file in slide_rels_files:
  170. try:
  171. root = lxml.etree.parse(str(rels_file)).getroot()
  172. # Find all slideLayout relationships
  173. layout_rels = [
  174. rel
  175. for rel in root.findall(
  176. f".//{{{self.PACKAGE_RELATIONSHIPS_NAMESPACE}}}Relationship"
  177. )
  178. if "slideLayout" in rel.get("Type", "")
  179. ]
  180. if len(layout_rels) > 1:
  181. errors.append(
  182. f" {rels_file.relative_to(self.unpacked_dir)}: has {len(layout_rels)} slideLayout references"
  183. )
  184. except Exception as e:
  185. errors.append(
  186. f" {rels_file.relative_to(self.unpacked_dir)}: Error: {e}"
  187. )
  188. if errors:
  189. print("FAILED - Found slides with duplicate slideLayout references:")
  190. for error in errors:
  191. print(error)
  192. return False
  193. else:
  194. if self.verbose:
  195. print("PASSED - All slides have exactly one slideLayout reference")
  196. return True
  197. def validate_notes_slide_references(self):
  198. """Validate that each notesSlide file is referenced by only one slide."""
  199. import lxml.etree
  200. errors = []
  201. notes_slide_references = {} # Track which slides reference each notesSlide
  202. # Find all slide relationship files
  203. slide_rels_files = list(self.unpacked_dir.glob("ppt/slides/_rels/*.xml.rels"))
  204. if not slide_rels_files:
  205. if self.verbose:
  206. print("PASSED - No slide relationship files found")
  207. return True
  208. for rels_file in slide_rels_files:
  209. try:
  210. # Parse the relationships file
  211. root = lxml.etree.parse(str(rels_file)).getroot()
  212. # Find all notesSlide relationships
  213. for rel in root.findall(
  214. f".//{{{self.PACKAGE_RELATIONSHIPS_NAMESPACE}}}Relationship"
  215. ):
  216. rel_type = rel.get("Type", "")
  217. if "notesSlide" in rel_type:
  218. target = rel.get("Target", "")
  219. if target:
  220. # Normalize the target path to handle relative paths
  221. normalized_target = target.replace("../", "")
  222. # Track which slide references this notesSlide
  223. slide_name = rels_file.stem.replace(
  224. ".xml", ""
  225. ) # e.g., "slide1"
  226. if normalized_target not in notes_slide_references:
  227. notes_slide_references[normalized_target] = []
  228. notes_slide_references[normalized_target].append(
  229. (slide_name, rels_file)
  230. )
  231. except (lxml.etree.XMLSyntaxError, Exception) as e:
  232. errors.append(
  233. f" {rels_file.relative_to(self.unpacked_dir)}: Error: {e}"
  234. )
  235. # Check for duplicate references
  236. for target, references in notes_slide_references.items():
  237. if len(references) > 1:
  238. slide_names = [ref[0] for ref in references]
  239. errors.append(
  240. f" Notes slide '{target}' is referenced by multiple slides: {', '.join(slide_names)}"
  241. )
  242. for slide_name, rels_file in references:
  243. errors.append(f" - {rels_file.relative_to(self.unpacked_dir)}")
  244. if errors:
  245. print(
  246. f"FAILED - Found {len([e for e in errors if not e.startswith(' ')])} notes slide reference validation errors:"
  247. )
  248. for error in errors:
  249. print(error)
  250. print("Each slide may optionally have its own slide file.")
  251. return False
  252. else:
  253. if self.verbose:
  254. print("PASSED - All notes slide references are unique")
  255. return True
  256. if __name__ == "__main__":
  257. raise RuntimeError("This module should not be run directly.")