inventory.py 37 KB


#!/usr/bin/env python3
"""
Extract structured text content from PowerPoint presentations.

This module provides functionality to:
- Extract all text content from PowerPoint shapes
- Preserve paragraph formatting (alignment, bullets, fonts, spacing)
- Handle nested GroupShapes recursively with correct absolute positions
- Sort shapes by visual position on slides
- Filter out slide numbers and non-content placeholders
- Detect text overflow, slide overflow, overlapping shapes and manual bullets
- Export to JSON with clean, structured data

Classes:
    ParagraphData: Represents a text paragraph with formatting
    ShapeData: Represents a shape with position and text content

Main Functions:
    extract_text_inventory: Extract all text from a presentation
    save_inventory: Save extracted data to JSON

Usage:
    python inventory.py input.pptx output.json
    python inventory.py input.pptx output.json --issues-only
"""
  20. import argparse
  21. import json
  22. import platform
  23. import sys
  24. from dataclasses import dataclass
  25. from pathlib import Path
  26. from typing import Any, Dict, List, Optional, Tuple, Union
  27. from PIL import Image, ImageDraw, ImageFont
  28. from pptx import Presentation
  29. from pptx.enum.text import PP_ALIGN
  30. from pptx.shapes.base import BaseShape
  31. # Type aliases for cleaner signatures
  32. JsonValue = Union[str, int, float, bool, None]
  33. ParagraphDict = Dict[str, JsonValue]
  34. ShapeDict = Dict[
  35. str, Union[str, float, bool, List[ParagraphDict], List[str], Dict[str, Any], None]
  36. ]
  37. InventoryData = Dict[
  38. str, Dict[str, "ShapeData"]
  39. ] # Dict of slide_id -> {shape_id -> ShapeData}
  40. InventoryDict = Dict[str, Dict[str, ShapeDict]] # JSON-serializable inventory
  41. def main():
  42. """Main entry point for command-line usage."""
  43. parser = argparse.ArgumentParser(
  44. description="Extract text inventory from PowerPoint with proper GroupShape support.",
  45. formatter_class=argparse.RawDescriptionHelpFormatter,
  46. epilog="""
  47. Examples:
  48. python inventory.py presentation.pptx inventory.json
  49. Extracts text inventory with correct absolute positions for grouped shapes
  50. python inventory.py presentation.pptx inventory.json --issues-only
  51. Extracts only text shapes that have overflow or overlap issues
  52. The output JSON includes:
  53. - All text content organized by slide and shape
  54. - Correct absolute positions for shapes in groups
  55. - Visual position and size in inches
  56. - Paragraph properties and formatting
  57. - Issue detection: text overflow and shape overlaps
  58. """,
  59. )
  60. parser.add_argument("input", help="Input PowerPoint file (.pptx)")
  61. parser.add_argument("output", help="Output JSON file for inventory")
  62. parser.add_argument(
  63. "--issues-only",
  64. action="store_true",
  65. help="Include only text shapes that have overflow or overlap issues",
  66. )
  67. args = parser.parse_args()
  68. input_path = Path(args.input)
  69. if not input_path.exists():
  70. print(f"Error: Input file not found: {args.input}")
  71. sys.exit(1)
  72. if not input_path.suffix.lower() == ".pptx":
  73. print("Error: Input must be a PowerPoint file (.pptx)")
  74. sys.exit(1)
  75. try:
  76. print(f"Extracting text inventory from: {args.input}")
  77. if args.issues_only:
  78. print(
  79. "Filtering to include only text shapes with issues (overflow/overlap)"
  80. )
  81. inventory = extract_text_inventory(input_path, issues_only=args.issues_only)
  82. output_path = Path(args.output)
  83. output_path.parent.mkdir(parents=True, exist_ok=True)
  84. save_inventory(inventory, output_path)
  85. print(f"Output saved to: {args.output}")
  86. # Report statistics
  87. total_slides = len(inventory)
  88. total_shapes = sum(len(shapes) for shapes in inventory.values())
  89. if args.issues_only:
  90. if total_shapes > 0:
  91. print(
  92. f"Found {total_shapes} text elements with issues in {total_slides} slides"
  93. )
  94. else:
  95. print("No issues discovered")
  96. else:
  97. print(
  98. f"Found text in {total_slides} slides with {total_shapes} text elements"
  99. )
  100. except Exception as e:
  101. print(f"Error processing presentation: {e}")
  102. import traceback
  103. traceback.print_exc()
  104. sys.exit(1)
  105. @dataclass
  106. class ShapeWithPosition:
  107. """A shape with its absolute position on the slide."""
  108. shape: BaseShape
  109. absolute_left: int # in EMUs
  110. absolute_top: int # in EMUs
  111. class ParagraphData:
  112. """Data structure for paragraph properties extracted from a PowerPoint paragraph."""
  113. def __init__(self, paragraph: Any):
  114. """Initialize from a PowerPoint paragraph object.
  115. Args:
  116. paragraph: The PowerPoint paragraph object
  117. """
  118. self.text: str = paragraph.text.strip()
  119. self.bullet: bool = False
  120. self.level: Optional[int] = None
  121. self.alignment: Optional[str] = None
  122. self.space_before: Optional[float] = None
  123. self.space_after: Optional[float] = None
  124. self.font_name: Optional[str] = None
  125. self.font_size: Optional[float] = None
  126. self.bold: Optional[bool] = None
  127. self.italic: Optional[bool] = None
  128. self.underline: Optional[bool] = None
  129. self.color: Optional[str] = None
  130. self.theme_color: Optional[str] = None
  131. self.line_spacing: Optional[float] = None
  132. # Check for bullet formatting
  133. if (
  134. hasattr(paragraph, "_p")
  135. and paragraph._p is not None
  136. and paragraph._p.pPr is not None
  137. ):
  138. pPr = paragraph._p.pPr
  139. ns = "{http://schemas.openxmlformats.org/drawingml/2006/main}"
  140. if (
  141. pPr.find(f"{ns}buChar") is not None
  142. or pPr.find(f"{ns}buAutoNum") is not None
  143. ):
  144. self.bullet = True
  145. if hasattr(paragraph, "level"):
  146. self.level = paragraph.level
  147. # Add alignment if not LEFT (default)
  148. if hasattr(paragraph, "alignment") and paragraph.alignment is not None:
  149. alignment_map = {
  150. PP_ALIGN.CENTER: "CENTER",
  151. PP_ALIGN.RIGHT: "RIGHT",
  152. PP_ALIGN.JUSTIFY: "JUSTIFY",
  153. }
  154. if paragraph.alignment in alignment_map:
  155. self.alignment = alignment_map[paragraph.alignment]
  156. # Add spacing properties if set
  157. if hasattr(paragraph, "space_before") and paragraph.space_before:
  158. self.space_before = paragraph.space_before.pt
  159. if hasattr(paragraph, "space_after") and paragraph.space_after:
  160. self.space_after = paragraph.space_after.pt
  161. # Extract font properties from first run
  162. if paragraph.runs:
  163. first_run = paragraph.runs[0]
  164. if hasattr(first_run, "font"):
  165. font = first_run.font
  166. if font.name:
  167. self.font_name = font.name
  168. if font.size:
  169. self.font_size = font.size.pt
  170. if font.bold is not None:
  171. self.bold = font.bold
  172. if font.italic is not None:
  173. self.italic = font.italic
  174. if font.underline is not None:
  175. self.underline = font.underline
  176. # Handle color - both RGB and theme colors
  177. try:
  178. # Try RGB color first
  179. if font.color.rgb:
  180. self.color = str(font.color.rgb)
  181. except (AttributeError, TypeError):
  182. # Fall back to theme color
  183. try:
  184. if font.color.theme_color:
  185. self.theme_color = font.color.theme_color.name
  186. except (AttributeError, TypeError):
  187. pass
  188. # Add line spacing if set
  189. if hasattr(paragraph, "line_spacing") and paragraph.line_spacing is not None:
  190. if hasattr(paragraph.line_spacing, "pt"):
  191. self.line_spacing = round(paragraph.line_spacing.pt, 2)
  192. else:
  193. # Multiplier - convert to points
  194. font_size = self.font_size if self.font_size else 12.0
  195. self.line_spacing = round(paragraph.line_spacing * font_size, 2)
  196. def to_dict(self) -> ParagraphDict:
  197. """Convert to dictionary for JSON serialization, excluding None values."""
  198. result: ParagraphDict = {"text": self.text}
  199. # Add optional fields only if they have values
  200. if self.bullet:
  201. result["bullet"] = self.bullet
  202. if self.level is not None:
  203. result["level"] = self.level
  204. if self.alignment:
  205. result["alignment"] = self.alignment
  206. if self.space_before is not None:
  207. result["space_before"] = self.space_before
  208. if self.space_after is not None:
  209. result["space_after"] = self.space_after
  210. if self.font_name:
  211. result["font_name"] = self.font_name
  212. if self.font_size is not None:
  213. result["font_size"] = self.font_size
  214. if self.bold is not None:
  215. result["bold"] = self.bold
  216. if self.italic is not None:
  217. result["italic"] = self.italic
  218. if self.underline is not None:
  219. result["underline"] = self.underline
  220. if self.color:
  221. result["color"] = self.color
  222. if self.theme_color:
  223. result["theme_color"] = self.theme_color
  224. if self.line_spacing is not None:
  225. result["line_spacing"] = self.line_spacing
  226. return result
  227. class ShapeData:
  228. """Data structure for shape properties extracted from a PowerPoint shape."""
  229. @staticmethod
  230. def emu_to_inches(emu: int) -> float:
  231. """Convert EMUs (English Metric Units) to inches."""
  232. return emu / 914400.0
  233. @staticmethod
  234. def inches_to_pixels(inches: float, dpi: int = 96) -> int:
  235. """Convert inches to pixels at given DPI."""
  236. return int(inches * dpi)
  237. @staticmethod
  238. def get_font_path(font_name: str) -> Optional[str]:
  239. """Get the font file path for a given font name.
  240. Args:
  241. font_name: Name of the font (e.g., 'Arial', 'Calibri')
  242. Returns:
  243. Path to the font file, or None if not found
  244. """
  245. system = platform.system()
  246. # Common font file variations to try
  247. font_variations = [
  248. font_name,
  249. font_name.lower(),
  250. font_name.replace(" ", ""),
  251. font_name.replace(" ", "-"),
  252. ]
  253. # Define font directories and extensions by platform
  254. if system == "Darwin": # macOS
  255. font_dirs = [
  256. "/System/Library/Fonts/",
  257. "/Library/Fonts/",
  258. "~/Library/Fonts/",
  259. ]
  260. extensions = [".ttf", ".otf", ".ttc", ".dfont"]
  261. else: # Linux
  262. font_dirs = [
  263. "/usr/share/fonts/truetype/",
  264. "/usr/local/share/fonts/",
  265. "~/.fonts/",
  266. ]
  267. extensions = [".ttf", ".otf"]
  268. # Try to find the font file
  269. from pathlib import Path
  270. for font_dir in font_dirs:
  271. font_dir_path = Path(font_dir).expanduser()
  272. if not font_dir_path.exists():
  273. continue
  274. # First try exact matches
  275. for variant in font_variations:
  276. for ext in extensions:
  277. font_path = font_dir_path / f"{variant}{ext}"
  278. if font_path.exists():
  279. return str(font_path)
  280. # Then try fuzzy matching - find files containing the font name
  281. try:
  282. for file_path in font_dir_path.iterdir():
  283. if file_path.is_file():
  284. file_name_lower = file_path.name.lower()
  285. font_name_lower = font_name.lower().replace(" ", "")
  286. if font_name_lower in file_name_lower and any(
  287. file_name_lower.endswith(ext) for ext in extensions
  288. ):
  289. return str(file_path)
  290. except (OSError, PermissionError):
  291. continue
  292. return None
  293. @staticmethod
  294. def get_slide_dimensions(slide: Any) -> tuple[Optional[int], Optional[int]]:
  295. """Get slide dimensions from slide object.
  296. Args:
  297. slide: Slide object
  298. Returns:
  299. Tuple of (width_emu, height_emu) or (None, None) if not found
  300. """
  301. try:
  302. prs = slide.part.package.presentation_part.presentation
  303. return prs.slide_width, prs.slide_height
  304. except (AttributeError, TypeError):
  305. return None, None
  306. @staticmethod
  307. def get_default_font_size(shape: BaseShape, slide_layout: Any) -> Optional[float]:
  308. """Extract default font size from slide layout for a placeholder shape.
  309. Args:
  310. shape: Placeholder shape
  311. slide_layout: Slide layout containing the placeholder definition
  312. Returns:
  313. Default font size in points, or None if not found
  314. """
  315. try:
  316. if not hasattr(shape, "placeholder_format"):
  317. return None
  318. shape_type = shape.placeholder_format.type # type: ignore
  319. for layout_placeholder in slide_layout.placeholders:
  320. if layout_placeholder.placeholder_format.type == shape_type:
  321. # Find first defRPr element with sz (size) attribute
  322. for elem in layout_placeholder.element.iter():
  323. if "defRPr" in elem.tag and (sz := elem.get("sz")):
  324. return float(sz) / 100.0 # Convert EMUs to points
  325. break
  326. except Exception:
  327. pass
  328. return None
  329. def __init__(
  330. self,
  331. shape: BaseShape,
  332. absolute_left: Optional[int] = None,
  333. absolute_top: Optional[int] = None,
  334. slide: Optional[Any] = None,
  335. ):
  336. """Initialize from a PowerPoint shape object.
  337. Args:
  338. shape: The PowerPoint shape object (should be pre-validated)
  339. absolute_left: Absolute left position in EMUs (for shapes in groups)
  340. absolute_top: Absolute top position in EMUs (for shapes in groups)
  341. slide: Optional slide object to get dimensions and layout information
  342. """
  343. self.shape = shape # Store reference to original shape
  344. self.shape_id: str = "" # Will be set after sorting
  345. # Get slide dimensions from slide object
  346. self.slide_width_emu, self.slide_height_emu = (
  347. self.get_slide_dimensions(slide) if slide else (None, None)
  348. )
  349. # Get placeholder type if applicable
  350. self.placeholder_type: Optional[str] = None
  351. self.default_font_size: Optional[float] = None
  352. if hasattr(shape, "is_placeholder") and shape.is_placeholder: # type: ignore
  353. if shape.placeholder_format and shape.placeholder_format.type: # type: ignore
  354. self.placeholder_type = (
  355. str(shape.placeholder_format.type).split(".")[-1].split(" ")[0] # type: ignore
  356. )
  357. # Get default font size from layout
  358. if slide and hasattr(slide, "slide_layout"):
  359. self.default_font_size = self.get_default_font_size(
  360. shape, slide.slide_layout
  361. )
  362. # Get position information
  363. # Use absolute positions if provided (for shapes in groups), otherwise use shape's position
  364. left_emu = (
  365. absolute_left
  366. if absolute_left is not None
  367. else (shape.left if hasattr(shape, "left") else 0)
  368. )
  369. top_emu = (
  370. absolute_top
  371. if absolute_top is not None
  372. else (shape.top if hasattr(shape, "top") else 0)
  373. )
  374. self.left: float = round(self.emu_to_inches(left_emu), 2) # type: ignore
  375. self.top: float = round(self.emu_to_inches(top_emu), 2) # type: ignore
  376. self.width: float = round(
  377. self.emu_to_inches(shape.width if hasattr(shape, "width") else 0),
  378. 2, # type: ignore
  379. )
  380. self.height: float = round(
  381. self.emu_to_inches(shape.height if hasattr(shape, "height") else 0),
  382. 2, # type: ignore
  383. )
  384. # Store EMU positions for overflow calculations
  385. self.left_emu = left_emu
  386. self.top_emu = top_emu
  387. self.width_emu = shape.width if hasattr(shape, "width") else 0
  388. self.height_emu = shape.height if hasattr(shape, "height") else 0
  389. # Calculate overflow status
  390. self.frame_overflow_bottom: Optional[float] = None
  391. self.slide_overflow_right: Optional[float] = None
  392. self.slide_overflow_bottom: Optional[float] = None
  393. self.overlapping_shapes: Dict[
  394. str, float
  395. ] = {} # Dict of shape_id -> overlap area in sq inches
  396. self.warnings: List[str] = []
  397. self._estimate_frame_overflow()
  398. self._calculate_slide_overflow()
  399. self._detect_bullet_issues()
  400. @property
  401. def paragraphs(self) -> List[ParagraphData]:
  402. """Calculate paragraphs from the shape's text frame."""
  403. if not self.shape or not hasattr(self.shape, "text_frame"):
  404. return []
  405. paragraphs = []
  406. for paragraph in self.shape.text_frame.paragraphs: # type: ignore
  407. if paragraph.text.strip():
  408. paragraphs.append(ParagraphData(paragraph))
  409. return paragraphs
  410. def _get_default_font_size(self) -> int:
  411. """Get default font size from theme text styles or use conservative default."""
  412. try:
  413. if not (
  414. hasattr(self.shape, "part") and hasattr(self.shape.part, "slide_layout")
  415. ):
  416. return 14
  417. slide_master = self.shape.part.slide_layout.slide_master # type: ignore
  418. if not hasattr(slide_master, "element"):
  419. return 14
  420. # Determine theme style based on placeholder type
  421. style_name = "bodyStyle" # Default
  422. if self.placeholder_type and "TITLE" in self.placeholder_type:
  423. style_name = "titleStyle"
  424. # Find font size in theme styles
  425. for child in slide_master.element.iter():
  426. tag = child.tag.split("}")[-1] if "}" in child.tag else child.tag
  427. if tag == style_name:
  428. for elem in child.iter():
  429. if "sz" in elem.attrib:
  430. return int(elem.attrib["sz"]) // 100
  431. except Exception:
  432. pass
  433. return 14 # Conservative default for body text
  434. def _get_usable_dimensions(self, text_frame) -> Tuple[int, int]:
  435. """Get usable width and height in pixels after accounting for margins."""
  436. # Default PowerPoint margins in inches
  437. margins = {"top": 0.05, "bottom": 0.05, "left": 0.1, "right": 0.1}
  438. # Override with actual margins if set
  439. if hasattr(text_frame, "margin_top") and text_frame.margin_top:
  440. margins["top"] = self.emu_to_inches(text_frame.margin_top)
  441. if hasattr(text_frame, "margin_bottom") and text_frame.margin_bottom:
  442. margins["bottom"] = self.emu_to_inches(text_frame.margin_bottom)
  443. if hasattr(text_frame, "margin_left") and text_frame.margin_left:
  444. margins["left"] = self.emu_to_inches(text_frame.margin_left)
  445. if hasattr(text_frame, "margin_right") and text_frame.margin_right:
  446. margins["right"] = self.emu_to_inches(text_frame.margin_right)
  447. # Calculate usable area
  448. usable_width = self.width - margins["left"] - margins["right"]
  449. usable_height = self.height - margins["top"] - margins["bottom"]
  450. # Convert to pixels
  451. return (
  452. self.inches_to_pixels(usable_width),
  453. self.inches_to_pixels(usable_height),
  454. )
  455. def _wrap_text_line(self, line: str, max_width_px: int, draw, font) -> List[str]:
  456. """Wrap a single line of text to fit within max_width_px."""
  457. if not line:
  458. return [""]
  459. # Use textlength for efficient width calculation
  460. if draw.textlength(line, font=font) <= max_width_px:
  461. return [line]
  462. # Need to wrap - split into words
  463. wrapped = []
  464. words = line.split(" ")
  465. current_line = ""
  466. for word in words:
  467. test_line = current_line + (" " if current_line else "") + word
  468. if draw.textlength(test_line, font=font) <= max_width_px:
  469. current_line = test_line
  470. else:
  471. if current_line:
  472. wrapped.append(current_line)
  473. current_line = word
  474. if current_line:
  475. wrapped.append(current_line)
  476. return wrapped
  477. def _estimate_frame_overflow(self) -> None:
  478. """Estimate if text overflows the shape bounds using PIL text measurement."""
  479. if not self.shape or not hasattr(self.shape, "text_frame"):
  480. return
  481. text_frame = self.shape.text_frame # type: ignore
  482. if not text_frame or not text_frame.paragraphs:
  483. return
  484. # Get usable dimensions after accounting for margins
  485. usable_width_px, usable_height_px = self._get_usable_dimensions(text_frame)
  486. if usable_width_px <= 0 or usable_height_px <= 0:
  487. return
  488. # Set up PIL for text measurement
  489. dummy_img = Image.new("RGB", (1, 1))
  490. draw = ImageDraw.Draw(dummy_img)
  491. # Get default font size from placeholder or use conservative estimate
  492. default_font_size = self._get_default_font_size()
  493. # Calculate total height of all paragraphs
  494. total_height_px = 0
  495. for para_idx, paragraph in enumerate(text_frame.paragraphs):
  496. if not paragraph.text.strip():
  497. continue
  498. para_data = ParagraphData(paragraph)
  499. # Load font for this paragraph
  500. font_name = para_data.font_name or "Arial"
  501. font_size = int(para_data.font_size or default_font_size)
  502. font = None
  503. font_path = self.get_font_path(font_name)
  504. if font_path:
  505. try:
  506. font = ImageFont.truetype(font_path, size=font_size)
  507. except Exception:
  508. font = ImageFont.load_default()
  509. else:
  510. font = ImageFont.load_default()
  511. # Wrap all lines in this paragraph
  512. all_wrapped_lines = []
  513. for line in paragraph.text.split("\n"):
  514. wrapped = self._wrap_text_line(line, usable_width_px, draw, font)
  515. all_wrapped_lines.extend(wrapped)
  516. if all_wrapped_lines:
  517. # Calculate line height
  518. if para_data.line_spacing:
  519. # Custom line spacing explicitly set
  520. line_height_px = para_data.line_spacing * 96 / 72
  521. else:
  522. # PowerPoint default single spacing (1.0x font size)
  523. line_height_px = font_size * 96 / 72
  524. # Add space_before (except first paragraph)
  525. if para_idx > 0 and para_data.space_before:
  526. total_height_px += para_data.space_before * 96 / 72
  527. # Add paragraph text height
  528. total_height_px += len(all_wrapped_lines) * line_height_px
  529. # Add space_after
  530. if para_data.space_after:
  531. total_height_px += para_data.space_after * 96 / 72
  532. # Check for overflow (ignore negligible overflows <= 0.05")
  533. if total_height_px > usable_height_px:
  534. overflow_px = total_height_px - usable_height_px
  535. overflow_inches = round(overflow_px / 96.0, 2)
  536. if overflow_inches > 0.05: # Only report significant overflows
  537. self.frame_overflow_bottom = overflow_inches
  538. def _calculate_slide_overflow(self) -> None:
  539. """Calculate if shape overflows the slide boundaries."""
  540. if self.slide_width_emu is None or self.slide_height_emu is None:
  541. return
  542. # Check right overflow (ignore negligible overflows <= 0.01")
  543. right_edge_emu = self.left_emu + self.width_emu
  544. if right_edge_emu > self.slide_width_emu:
  545. overflow_emu = right_edge_emu - self.slide_width_emu
  546. overflow_inches = round(self.emu_to_inches(overflow_emu), 2)
  547. if overflow_inches > 0.01: # Only report significant overflows
  548. self.slide_overflow_right = overflow_inches
  549. # Check bottom overflow (ignore negligible overflows <= 0.01")
  550. bottom_edge_emu = self.top_emu + self.height_emu
  551. if bottom_edge_emu > self.slide_height_emu:
  552. overflow_emu = bottom_edge_emu - self.slide_height_emu
  553. overflow_inches = round(self.emu_to_inches(overflow_emu), 2)
  554. if overflow_inches > 0.01: # Only report significant overflows
  555. self.slide_overflow_bottom = overflow_inches
  556. def _detect_bullet_issues(self) -> None:
  557. """Detect bullet point formatting issues in paragraphs."""
  558. if not self.shape or not hasattr(self.shape, "text_frame"):
  559. return
  560. text_frame = self.shape.text_frame # type: ignore
  561. if not text_frame or not text_frame.paragraphs:
  562. return
  563. # Common bullet symbols that indicate manual bullets
  564. bullet_symbols = ["•", "●", "○"]
  565. for paragraph in text_frame.paragraphs:
  566. text = paragraph.text.strip()
  567. # Check for manual bullet symbols
  568. if text and any(text.startswith(symbol + " ") for symbol in bullet_symbols):
  569. self.warnings.append(
  570. "manual_bullet_symbol: use proper bullet formatting"
  571. )
  572. break
  573. @property
  574. def has_any_issues(self) -> bool:
  575. """Check if shape has any issues (overflow, overlap, or warnings)."""
  576. return (
  577. self.frame_overflow_bottom is not None
  578. or self.slide_overflow_right is not None
  579. or self.slide_overflow_bottom is not None
  580. or len(self.overlapping_shapes) > 0
  581. or len(self.warnings) > 0
  582. )
  583. def to_dict(self) -> ShapeDict:
  584. """Convert to dictionary for JSON serialization."""
  585. result: ShapeDict = {
  586. "left": self.left,
  587. "top": self.top,
  588. "width": self.width,
  589. "height": self.height,
  590. }
  591. # Add optional fields if present
  592. if self.placeholder_type:
  593. result["placeholder_type"] = self.placeholder_type
  594. if self.default_font_size:
  595. result["default_font_size"] = self.default_font_size
  596. # Add overflow information only if there is overflow
  597. overflow_data = {}
  598. # Add frame overflow if present
  599. if self.frame_overflow_bottom is not None:
  600. overflow_data["frame"] = {"overflow_bottom": self.frame_overflow_bottom}
  601. # Add slide overflow if present
  602. slide_overflow = {}
  603. if self.slide_overflow_right is not None:
  604. slide_overflow["overflow_right"] = self.slide_overflow_right
  605. if self.slide_overflow_bottom is not None:
  606. slide_overflow["overflow_bottom"] = self.slide_overflow_bottom
  607. if slide_overflow:
  608. overflow_data["slide"] = slide_overflow
  609. # Only add overflow field if there is overflow
  610. if overflow_data:
  611. result["overflow"] = overflow_data
  612. # Add overlap field if there are overlapping shapes
  613. if self.overlapping_shapes:
  614. result["overlap"] = {"overlapping_shapes": self.overlapping_shapes}
  615. # Add warnings field if there are warnings
  616. if self.warnings:
  617. result["warnings"] = self.warnings
  618. # Add paragraphs after placeholder_type
  619. result["paragraphs"] = [para.to_dict() for para in self.paragraphs]
  620. return result
  621. def is_valid_shape(shape: BaseShape) -> bool:
  622. """Check if a shape contains meaningful text content."""
  623. # Must have a text frame with content
  624. if not hasattr(shape, "text_frame") or not shape.text_frame: # type: ignore
  625. return False
  626. text = shape.text_frame.text.strip() # type: ignore
  627. if not text:
  628. return False
  629. # Skip slide numbers and numeric footers
  630. if hasattr(shape, "is_placeholder") and shape.is_placeholder: # type: ignore
  631. if shape.placeholder_format and shape.placeholder_format.type: # type: ignore
  632. placeholder_type = (
  633. str(shape.placeholder_format.type).split(".")[-1].split(" ")[0] # type: ignore
  634. )
  635. if placeholder_type == "SLIDE_NUMBER":
  636. return False
  637. if placeholder_type == "FOOTER" and text.isdigit():
  638. return False
  639. return True
  640. def collect_shapes_with_absolute_positions(
  641. shape: BaseShape, parent_left: int = 0, parent_top: int = 0
  642. ) -> List[ShapeWithPosition]:
  643. """Recursively collect all shapes with valid text, calculating absolute positions.
  644. For shapes within groups, their positions are relative to the group.
  645. This function calculates the absolute position on the slide by accumulating
  646. parent group offsets.
  647. Args:
  648. shape: The shape to process
  649. parent_left: Accumulated left offset from parent groups (in EMUs)
  650. parent_top: Accumulated top offset from parent groups (in EMUs)
  651. Returns:
  652. List of ShapeWithPosition objects with absolute positions
  653. """
  654. if hasattr(shape, "shapes"): # GroupShape
  655. result = []
  656. # Get this group's position
  657. group_left = shape.left if hasattr(shape, "left") else 0
  658. group_top = shape.top if hasattr(shape, "top") else 0
  659. # Calculate absolute position for this group
  660. abs_group_left = parent_left + group_left
  661. abs_group_top = parent_top + group_top
  662. # Process children with accumulated offsets
  663. for child in shape.shapes: # type: ignore
  664. result.extend(
  665. collect_shapes_with_absolute_positions(
  666. child, abs_group_left, abs_group_top
  667. )
  668. )
  669. return result
  670. # Regular shape - check if it has valid text
  671. if is_valid_shape(shape):
  672. # Calculate absolute position
  673. shape_left = shape.left if hasattr(shape, "left") else 0
  674. shape_top = shape.top if hasattr(shape, "top") else 0
  675. return [
  676. ShapeWithPosition(
  677. shape=shape,
  678. absolute_left=parent_left + shape_left,
  679. absolute_top=parent_top + shape_top,
  680. )
  681. ]
  682. return []
  683. def sort_shapes_by_position(shapes: List[ShapeData]) -> List[ShapeData]:
  684. """Sort shapes by visual position (top-to-bottom, left-to-right).
  685. Shapes within 0.5 inches vertically are considered on the same row.
  686. """
  687. if not shapes:
  688. return shapes
  689. # Sort by top position first
  690. shapes = sorted(shapes, key=lambda s: (s.top, s.left))
  691. # Group shapes by row (within 0.5 inches vertically)
  692. result = []
  693. row = [shapes[0]]
  694. row_top = shapes[0].top
  695. for shape in shapes[1:]:
  696. if abs(shape.top - row_top) <= 0.5:
  697. row.append(shape)
  698. else:
  699. # Sort current row by left position and add to result
  700. result.extend(sorted(row, key=lambda s: s.left))
  701. row = [shape]
  702. row_top = shape.top
  703. # Don't forget the last row
  704. result.extend(sorted(row, key=lambda s: s.left))
  705. return result
  706. def calculate_overlap(
  707. rect1: Tuple[float, float, float, float],
  708. rect2: Tuple[float, float, float, float],
  709. tolerance: float = 0.05,
  710. ) -> Tuple[bool, float]:
  711. """Calculate if and how much two rectangles overlap.
  712. Args:
  713. rect1: (left, top, width, height) of first rectangle in inches
  714. rect2: (left, top, width, height) of second rectangle in inches
  715. tolerance: Minimum overlap in inches to consider as overlapping (default: 0.05")
  716. Returns:
  717. Tuple of (overlaps, overlap_area) where:
  718. - overlaps: True if rectangles overlap by more than tolerance
  719. - overlap_area: Area of overlap in square inches
  720. """
  721. left1, top1, w1, h1 = rect1
  722. left2, top2, w2, h2 = rect2
  723. # Calculate overlap dimensions
  724. overlap_width = min(left1 + w1, left2 + w2) - max(left1, left2)
  725. overlap_height = min(top1 + h1, top2 + h2) - max(top1, top2)
  726. # Check if there's meaningful overlap (more than tolerance)
  727. if overlap_width > tolerance and overlap_height > tolerance:
  728. # Calculate overlap area in square inches
  729. overlap_area = overlap_width * overlap_height
  730. return True, round(overlap_area, 2)
  731. return False, 0
  732. def detect_overlaps(shapes: List[ShapeData]) -> None:
  733. """Detect overlapping shapes and update their overlapping_shapes dictionaries.
  734. This function requires each ShapeData to have its shape_id already set.
  735. It modifies the shapes in-place, adding shape IDs with overlap areas in square inches.
  736. Args:
  737. shapes: List of ShapeData objects with shape_id attributes set
  738. """
  739. n = len(shapes)
  740. # Compare each pair of shapes
  741. for i in range(n):
  742. for j in range(i + 1, n):
  743. shape1 = shapes[i]
  744. shape2 = shapes[j]
  745. # Ensure shape IDs are set
  746. assert shape1.shape_id, f"Shape at index {i} has no shape_id"
  747. assert shape2.shape_id, f"Shape at index {j} has no shape_id"
  748. rect1 = (shape1.left, shape1.top, shape1.width, shape1.height)
  749. rect2 = (shape2.left, shape2.top, shape2.width, shape2.height)
  750. overlaps, overlap_area = calculate_overlap(rect1, rect2)
  751. if overlaps:
  752. # Add shape IDs with overlap area in square inches
  753. shape1.overlapping_shapes[shape2.shape_id] = overlap_area
  754. shape2.overlapping_shapes[shape1.shape_id] = overlap_area
  755. def extract_text_inventory(
  756. pptx_path: Path, prs: Optional[Any] = None, issues_only: bool = False
  757. ) -> InventoryData:
  758. """Extract text content from all slides in a PowerPoint presentation.
  759. Args:
  760. pptx_path: Path to the PowerPoint file
  761. prs: Optional Presentation object to use. If not provided, will load from pptx_path.
  762. issues_only: If True, only include shapes that have overflow or overlap issues
  763. Returns a nested dictionary: {slide-N: {shape-N: ShapeData}}
  764. Shapes are sorted by visual position (top-to-bottom, left-to-right).
  765. The ShapeData objects contain the full shape information and can be
  766. converted to dictionaries for JSON serialization using to_dict().
  767. """
  768. if prs is None:
  769. prs = Presentation(str(pptx_path))
  770. inventory: InventoryData = {}
  771. for slide_idx, slide in enumerate(prs.slides):
  772. # Collect all valid shapes from this slide with absolute positions
  773. shapes_with_positions = []
  774. for shape in slide.shapes: # type: ignore
  775. shapes_with_positions.extend(collect_shapes_with_absolute_positions(shape))
  776. if not shapes_with_positions:
  777. continue
  778. # Convert to ShapeData with absolute positions and slide reference
  779. shape_data_list = [
  780. ShapeData(
  781. swp.shape,
  782. swp.absolute_left,
  783. swp.absolute_top,
  784. slide,
  785. )
  786. for swp in shapes_with_positions
  787. ]
  788. # Sort by visual position and assign stable IDs in one step
  789. sorted_shapes = sort_shapes_by_position(shape_data_list)
  790. for idx, shape_data in enumerate(sorted_shapes):
  791. shape_data.shape_id = f"shape-{idx}"
  792. # Detect overlaps using the stable shape IDs
  793. if len(sorted_shapes) > 1:
  794. detect_overlaps(sorted_shapes)
  795. # Filter for issues only if requested (after overlap detection)
  796. if issues_only:
  797. sorted_shapes = [sd for sd in sorted_shapes if sd.has_any_issues]
  798. if not sorted_shapes:
  799. continue
  800. # Create slide inventory using the stable shape IDs
  801. inventory[f"slide-{slide_idx}"] = {
  802. shape_data.shape_id: shape_data for shape_data in sorted_shapes
  803. }
  804. return inventory
  805. def get_inventory_as_dict(pptx_path: Path, issues_only: bool = False) -> InventoryDict:
  806. """Extract text inventory and return as JSON-serializable dictionaries.
  807. This is a convenience wrapper around extract_text_inventory that returns
  808. dictionaries instead of ShapeData objects, useful for testing and direct
  809. JSON serialization.
  810. Args:
  811. pptx_path: Path to the PowerPoint file
  812. issues_only: If True, only include shapes that have overflow or overlap issues
  813. Returns:
  814. Nested dictionary with all data serialized for JSON
  815. """
  816. inventory = extract_text_inventory(pptx_path, issues_only=issues_only)
  817. # Convert ShapeData objects to dictionaries
  818. dict_inventory: InventoryDict = {}
  819. for slide_key, shapes in inventory.items():
  820. dict_inventory[slide_key] = {
  821. shape_key: shape_data.to_dict() for shape_key, shape_data in shapes.items()
  822. }
  823. return dict_inventory
  824. def save_inventory(inventory: InventoryData, output_path: Path) -> None:
  825. """Save inventory to JSON file with proper formatting.
  826. Converts ShapeData objects to dictionaries for JSON serialization.
  827. """
  828. # Convert ShapeData objects to dictionaries
  829. json_inventory: InventoryDict = {}
  830. for slide_key, shapes in inventory.items():
  831. json_inventory[slide_key] = {
  832. shape_key: shape_data.to_dict() for shape_key, shape_data in shapes.items()
  833. }
  834. with open(output_path, "w", encoding="utf-8") as f:
  835. json.dump(json_inventory, f, indent=2, ensure_ascii=False)
  836. if __name__ == "__main__":
  837. main()