| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653654655656657658659660661662663664665666667668669670671672673674675676677678679680681682683684685686687688689690691692693694695696697698699700701702
703704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829830831832833834835836837838839840841842843844845846847848849850851852853854855856857858859860861862863864865866867868869870871872873874875876877878879880881882883884885886887888889890891892893894895896897898899900901902903904905906907908909910911912913914915916917918919920921922923924925926927928929930931932933934935936937938939940941942943944945946947948949950951952953954955956957958959960961962963964965966967968969970971972973974975976977978979980981982983984985986987988989990991992993994995996997998999100010011002100310041005100610071008100910101011101210131014101510161017101810191020 |
- #!/usr/bin/env python3
- """
- Extract structured text content from PowerPoint presentations.
- This module provides functionality to:
- - Extract all text content from PowerPoint shapes
- - Preserve paragraph formatting (alignment, bullets, fonts, spacing)
- - Handle nested GroupShapes recursively with correct absolute positions
- - Sort shapes by visual position on slides
- - Filter out slide numbers and non-content placeholders
- - Export to JSON with clean, structured data
- Classes:
- ParagraphData: Represents a text paragraph with formatting
- ShapeData: Represents a shape with position and text content
- Main Functions:
- extract_text_inventory: Extract all text from a presentation
- save_inventory: Save extracted data to JSON
- Usage:
- python inventory.py input.pptx output.json
- """
- import argparse
- import json
- import platform
- import sys
- from dataclasses import dataclass
- from pathlib import Path
- from typing import Any, Dict, List, Optional, Tuple, Union
- from PIL import Image, ImageDraw, ImageFont
- from pptx import Presentation
- from pptx.enum.text import PP_ALIGN
- from pptx.shapes.base import BaseShape
# Type aliases for cleaner signatures

# Scalar values stored under the keys of a serialized paragraph.
JsonValue = Union[str, int, float, bool, None]
# One paragraph as produced by ParagraphData.to_dict().
ParagraphDict = Dict[str, JsonValue]
# One shape as produced by ShapeData.to_dict().
ShapeDict = Dict[
    str, Union[str, float, bool, List[ParagraphDict], List[str], Dict[str, Any], None]
]
InventoryData = Dict[
    str, Dict[str, "ShapeData"]
]  # Dict of slide_id -> {shape_id -> ShapeData}
InventoryDict = Dict[str, Dict[str, ShapeDict]]  # JSON-serializable inventory
def main():
    """Run the command-line interface: parse arguments, extract, save, report.

    Exits with status 1 when the input path does not exist, when it does not
    end in ``.pptx``, or when extraction or saving raises an exception (the
    traceback is printed before exiting).
    """
    cli = argparse.ArgumentParser(
        description="Extract text inventory from PowerPoint with proper GroupShape support.",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""
Examples:
  python inventory.py presentation.pptx inventory.json
    Extracts text inventory with correct absolute positions for grouped shapes
  python inventory.py presentation.pptx inventory.json --issues-only
    Extracts only text shapes that have overflow or overlap issues
The output JSON includes:
  - All text content organized by slide and shape
  - Correct absolute positions for shapes in groups
  - Visual position and size in inches
  - Paragraph properties and formatting
  - Issue detection: text overflow and shape overlaps
        """,
    )
    cli.add_argument("input", help="Input PowerPoint file (.pptx)")
    cli.add_argument("output", help="Output JSON file for inventory")
    cli.add_argument(
        "--issues-only",
        action="store_true",
        help="Include only text shapes that have overflow or overlap issues",
    )
    options = cli.parse_args()

    # Validate the input path before doing any work.
    source = Path(options.input)
    if not source.exists():
        print(f"Error: Input file not found: {options.input}")
        sys.exit(1)
    if source.suffix.lower() != ".pptx":
        print("Error: Input must be a PowerPoint file (.pptx)")
        sys.exit(1)

    try:
        print(f"Extracting text inventory from: {options.input}")
        if options.issues_only:
            print(
                "Filtering to include only text shapes with issues (overflow/overlap)"
            )
        inventory = extract_text_inventory(source, issues_only=options.issues_only)

        # Create the destination directory on demand, then write the JSON.
        destination = Path(options.output)
        destination.parent.mkdir(parents=True, exist_ok=True)
        save_inventory(inventory, destination)
        print(f"Output saved to: {options.output}")

        # Summarize what was written.
        slide_count = len(inventory)
        shape_count = sum(map(len, inventory.values()))
        if not options.issues_only:
            summary = (
                f"Found text in {slide_count} slides with {shape_count} text elements"
            )
        elif shape_count > 0:
            summary = (
                f"Found {shape_count} text elements with issues in {slide_count} slides"
            )
        else:
            summary = "No issues discovered"
        print(summary)
    except Exception as exc:
        # Top-level boundary: report, show the traceback, signal failure.
        print(f"Error processing presentation: {exc}")
        import traceback

        traceback.print_exc()
        sys.exit(1)
@dataclass
class ShapeWithPosition:
    """A shape with its absolute position on the slide.

    Instances are produced by collect_shapes_with_absolute_positions, which
    adds the offsets of enclosing groups to the shape's own left/top values.
    """

    shape: BaseShape
    absolute_left: int  # in EMUs
    absolute_top: int  # in EMUs
class ParagraphData:
    """Text and formatting captured from one PowerPoint paragraph.

    Every formatting attribute starts out unset (``None`` / ``False``) and is
    only filled in when the source paragraph carries the corresponding value.
    Font properties are read from the first run only.
    """

    # XML namespace prefix of the DrawingML elements that mark bullets.
    _DRAWINGML_NS = "{http://schemas.openxmlformats.org/drawingml/2006/main}"

    def __init__(self, paragraph: Any):
        """Capture text and formatting from ``paragraph``.

        Args:
            paragraph: The PowerPoint paragraph object to read from.
        """
        self.text: str = paragraph.text.strip()
        self.bullet: bool = False
        self.level: Optional[int] = None
        self.alignment: Optional[str] = None
        self.space_before: Optional[float] = None
        self.space_after: Optional[float] = None
        self.font_name: Optional[str] = None
        self.font_size: Optional[float] = None
        self.bold: Optional[bool] = None
        self.italic: Optional[bool] = None
        self.underline: Optional[bool] = None
        self.color: Optional[str] = None
        self.theme_color: Optional[str] = None
        self.line_spacing: Optional[float] = None

        self._read_bullet(paragraph)
        self._read_alignment(paragraph)
        self._read_spacing(paragraph)
        self._read_first_run_font(paragraph)
        # Must run after the font is read: multiplier spacing uses font_size.
        self._read_line_spacing(paragraph)

    def _read_bullet(self, paragraph: Any) -> None:
        """Set ``bullet`` (and ``level``) when the paragraph XML has a bullet marker."""
        p_elem = getattr(paragraph, "_p", None)
        if p_elem is None or p_elem.pPr is None:
            return
        props = p_elem.pPr
        markers = (f"{self._DRAWINGML_NS}buChar", f"{self._DRAWINGML_NS}buAutoNum")
        if any(props.find(tag) is not None for tag in markers):
            self.bullet = True
            if hasattr(paragraph, "level"):
                self.level = paragraph.level

    def _read_alignment(self, paragraph: Any) -> None:
        """Record CENTER / RIGHT / JUSTIFY; any other alignment stays unset."""
        align = getattr(paragraph, "alignment", None)
        if align is None:
            return
        names = {
            PP_ALIGN.CENTER: "CENTER",
            PP_ALIGN.RIGHT: "RIGHT",
            PP_ALIGN.JUSTIFY: "JUSTIFY",
        }
        self.alignment = names.get(align)

    def _read_spacing(self, paragraph: Any) -> None:
        """Record space before/after in points when the value is truthy."""
        for attr in ("space_before", "space_after"):
            length = getattr(paragraph, attr, None)
            if length:
                setattr(self, attr, length.pt)

    def _read_first_run_font(self, paragraph: Any) -> None:
        """Copy name, size, style flags and color from the first run's font."""
        runs = paragraph.runs
        if not runs:
            return
        lead_run = runs[0]
        if not hasattr(lead_run, "font"):
            return
        font = lead_run.font

        if font.name:
            self.font_name = font.name
        if font.size:
            self.font_size = font.size.pt
        for flag in ("bold", "italic", "underline"):
            state = getattr(font, flag)
            if state is not None:
                setattr(self, flag, state)

        # RGB is tried first; the theme color is consulted only when reading
        # the RGB value raises.
        try:
            rgb = font.color.rgb
            if rgb:
                self.color = str(rgb)
        except (AttributeError, TypeError):
            try:
                theme = font.color.theme_color
                if theme:
                    self.theme_color = theme.name
            except (AttributeError, TypeError):
                pass

    def _read_line_spacing(self, paragraph: Any) -> None:
        """Record line spacing in points, converting a multiplier if needed."""
        spacing = getattr(paragraph, "line_spacing", None)
        if spacing is None:
            return
        if hasattr(spacing, "pt"):
            self.line_spacing = round(spacing.pt, 2)
        else:
            # A bare number is a multiple of the font size; 12pt is assumed
            # when the paragraph's own size is unknown.
            base_size = self.font_size or 12.0
            self.line_spacing = round(spacing * base_size, 2)

    def to_dict(self) -> ParagraphDict:
        """Return a JSON-ready dict holding ``text`` plus every field that is set."""
        result: ParagraphDict = {"text": self.text}
        # (attribute, keep_falsy): attributes flagged True are emitted for any
        # non-None value (so False / 0 survive); the others only when truthy.
        fields = (
            ("bullet", False),
            ("level", True),
            ("alignment", False),
            ("space_before", True),
            ("space_after", True),
            ("font_name", False),
            ("font_size", True),
            ("bold", True),
            ("italic", True),
            ("underline", True),
            ("color", False),
            ("theme_color", False),
            ("line_spacing", True),
        )
        for name, keep_falsy in fields:
            value = getattr(self, name)
            if value is None:
                continue
            if keep_falsy or value:
                result[name] = value
        return result
class ShapeData:
    """Position, size, text and detected layout issues of one text-bearing shape.

    Positions and sizes are exposed in inches (rounded to two decimals) and
    also kept in EMUs for overflow arithmetic. Construction immediately runs
    the frame-overflow estimate, the slide-overflow check and the
    manual-bullet check; ``shape_id`` and ``overlapping_shapes`` start empty
    and are meant to be filled in by the caller afterwards.
    """

    # Resolution used for every inch/point -> pixel conversion in this class.
    _DPI = 96
    # font name -> resolved font file (or None). Font directories are walked
    # at most once per distinct name instead of once per paragraph.
    _font_path_cache: Dict[str, Optional[str]] = {}

    @staticmethod
    def emu_to_inches(emu: int) -> float:
        """Convert EMUs (English Metric Units) to inches."""
        return emu / 914400.0

    @staticmethod
    def inches_to_pixels(inches: float, dpi: int = 96) -> int:
        """Convert inches to whole pixels at ``dpi`` (truncating)."""
        return int(inches * dpi)

    @staticmethod
    def get_font_path(font_name: str) -> Optional[str]:
        """Get the font file path for a given font name.

        Results (including misses) are cached per font name.

        Args:
            font_name: Name of the font (e.g., 'Arial', 'Calibri')

        Returns:
            Path to the font file, or None if not found
        """
        if not font_name:
            return None
        cache = ShapeData._font_path_cache
        if font_name not in cache:
            cache[font_name] = ShapeData._find_font_file(font_name)
        return cache[font_name]

    @staticmethod
    def _find_font_file(font_name: str) -> Optional[str]:
        """Search the platform font directories (recursively) for ``font_name``."""
        if platform.system() == "Darwin":  # macOS
            font_dirs = [
                "/System/Library/Fonts/",
                "/Library/Fonts/",
                "~/Library/Fonts/",
            ]
            extensions = (".ttf", ".otf", ".ttc", ".dfont")
        else:  # Linux (and any other platform)
            font_dirs = [
                "/usr/share/fonts/truetype/",
                "/usr/local/share/fonts/",
                "~/.fonts/",
            ]
            extensions = (".ttf", ".otf")

        # File names that count as an exact hit, compared case-insensitively.
        variations = (
            font_name,
            font_name.lower(),
            font_name.replace(" ", ""),
            font_name.replace(" ", "-"),
        )
        exact_names = {
            f"{variant}{ext}".lower() for variant in variations for ext in extensions
        }
        needle = font_name.lower().replace(" ", "")

        for font_dir in font_dirs:
            root = Path(font_dir).expanduser()
            if not root.is_dir():
                continue
            try:
                # Walk subdirectories too: fonts are commonly stored one level
                # below these roots. Shortest name first so the closest match
                # wins and the result does not depend on directory order.
                candidates = sorted(
                    (
                        path
                        for path in root.rglob("*")
                        if path.suffix.lower() in extensions and path.is_file()
                    ),
                    key=lambda path: (len(path.name), str(path)),
                )
            except OSError:
                continue

            # Exact file-name matches take priority over substring matches.
            for candidate in candidates:
                if candidate.name.lower() in exact_names:
                    return str(candidate)
            for candidate in candidates:
                if needle in candidate.name.lower():
                    return str(candidate)
        return None

    @staticmethod
    def _load_font(font_name: str, size_px: float) -> Any:
        """Return a PIL font for ``font_name`` at ``size_px`` pixels.

        Falls back to PIL's built-in default font when no file is found or
        the file cannot be loaded.
        """
        font_path = ShapeData.get_font_path(font_name)
        if font_path:
            try:
                return ImageFont.truetype(font_path, size=max(1, round(size_px)))
            except (OSError, ValueError):
                pass
        return ImageFont.load_default()

    @staticmethod
    def get_slide_dimensions(slide: Any) -> Tuple[Optional[int], Optional[int]]:
        """Get slide dimensions from slide object.

        Args:
            slide: Slide object

        Returns:
            Tuple of (width_emu, height_emu) or (None, None) if not found
        """
        try:
            prs = slide.part.package.presentation_part.presentation
            return prs.slide_width, prs.slide_height
        except (AttributeError, TypeError):
            return None, None

    @staticmethod
    def get_default_font_size(shape: BaseShape, slide_layout: Any) -> Optional[float]:
        """Extract default font size from slide layout for a placeholder shape.

        Only the first layout placeholder whose type matches the shape's
        placeholder type is inspected.

        Args:
            shape: Placeholder shape
            slide_layout: Slide layout containing the placeholder definition

        Returns:
            Default font size in points, or None if not found
        """
        try:
            if not hasattr(shape, "placeholder_format"):
                return None
            shape_type = shape.placeholder_format.type  # type: ignore
            for layout_placeholder in slide_layout.placeholders:
                if layout_placeholder.placeholder_format.type == shape_type:
                    # Find first defRPr element with sz (size) attribute
                    for elem in layout_placeholder.element.iter():
                        # Skip nodes whose tag is not a string.
                        if not isinstance(elem.tag, str):
                            continue
                        if "defRPr" in elem.tag and (sz := elem.get("sz")):
                            # The value is divided by 100 to obtain points.
                            return float(sz) / 100.0
                    break
        except Exception:
            # Best effort: any failure while probing the layout means "unknown".
            pass
        return None

    def __init__(
        self,
        shape: BaseShape,
        absolute_left: Optional[int] = None,
        absolute_top: Optional[int] = None,
        slide: Optional[Any] = None,
    ):
        """Initialize from a PowerPoint shape object.

        Args:
            shape: The PowerPoint shape object (should be pre-validated)
            absolute_left: Absolute left position in EMUs (for shapes in groups)
            absolute_top: Absolute top position in EMUs (for shapes in groups)
            slide: Optional slide object to get dimensions and layout information
        """
        self.shape = shape  # Store reference to original shape
        self.shape_id: str = ""  # Will be set after sorting

        # Get slide dimensions from slide object
        self.slide_width_emu, self.slide_height_emu = (
            self.get_slide_dimensions(slide) if slide else (None, None)
        )

        # Get placeholder type if applicable
        self.placeholder_type: Optional[str] = None
        self.default_font_size: Optional[float] = None
        if hasattr(shape, "is_placeholder") and shape.is_placeholder:  # type: ignore
            if shape.placeholder_format and shape.placeholder_format.type:  # type: ignore
                self.placeholder_type = (
                    str(shape.placeholder_format.type).split(".")[-1].split(" ")[0]  # type: ignore
                )
                # Get default font size from layout
                if slide and hasattr(slide, "slide_layout"):
                    self.default_font_size = self.get_default_font_size(
                        shape, slide.slide_layout
                    )

        # A shape may report None for a geometry attribute; treat a missing
        # or None value as 0 instead of failing in the arithmetic below.
        own_left = getattr(shape, "left", None) or 0
        own_top = getattr(shape, "top", None) or 0
        width_emu = getattr(shape, "width", None) or 0
        height_emu = getattr(shape, "height", None) or 0

        # Use absolute positions if provided (for shapes in groups),
        # otherwise use the shape's own position.
        left_emu = absolute_left if absolute_left is not None else own_left
        top_emu = absolute_top if absolute_top is not None else own_top

        self.left: float = round(self.emu_to_inches(left_emu), 2)
        self.top: float = round(self.emu_to_inches(top_emu), 2)
        self.width: float = round(self.emu_to_inches(width_emu), 2)
        self.height: float = round(self.emu_to_inches(height_emu), 2)

        # Store EMU positions for overflow calculations
        self.left_emu = left_emu
        self.top_emu = top_emu
        self.width_emu = width_emu
        self.height_emu = height_emu

        # Calculate overflow status
        self.frame_overflow_bottom: Optional[float] = None
        self.slide_overflow_right: Optional[float] = None
        self.slide_overflow_bottom: Optional[float] = None
        self.overlapping_shapes: Dict[
            str, float
        ] = {}  # Dict of shape_id -> overlap area in sq inches
        self.warnings: List[str] = []
        self._estimate_frame_overflow()
        self._calculate_slide_overflow()
        self._detect_bullet_issues()

    @property
    def paragraphs(self) -> List[ParagraphData]:
        """Non-blank paragraphs of the shape's text frame, rebuilt on each access."""
        if not self.shape or not hasattr(self.shape, "text_frame"):
            return []
        return [
            ParagraphData(paragraph)
            for paragraph in self.shape.text_frame.paragraphs  # type: ignore
            if paragraph.text.strip()
        ]

    def _get_default_font_size(self) -> int:
        """Get default font size from theme text styles or use conservative default.

        Looks in the slide master for ``titleStyle`` (title placeholders) or
        ``bodyStyle`` (everything else) and returns the first ``sz`` found
        there, in whole points; returns 14 when nothing is found.
        """
        try:
            if not (
                hasattr(self.shape, "part") and hasattr(self.shape.part, "slide_layout")
            ):
                return 14
            slide_master = self.shape.part.slide_layout.slide_master  # type: ignore
            if not hasattr(slide_master, "element"):
                return 14

            # Determine theme style based on placeholder type
            style_name = "bodyStyle"  # Default
            if self.placeholder_type and "TITLE" in self.placeholder_type:
                style_name = "titleStyle"

            # Find font size in theme styles
            for child in slide_master.element.iter():
                # Skip nodes whose tag is not a string.
                if not isinstance(child.tag, str):
                    continue
                tag = child.tag.split("}")[-1] if "}" in child.tag else child.tag
                if tag == style_name:
                    for elem in child.iter():
                        if "sz" in elem.attrib:
                            return int(elem.attrib["sz"]) // 100
        except Exception:
            # Best effort: fall through to the conservative default.
            pass
        return 14  # Conservative default for body text

    def _get_usable_dimensions(self, text_frame) -> Tuple[int, int]:
        """Get usable width and height in pixels after accounting for margins.

        Args:
            text_frame: Object exposing optional ``margin_top`` / ``margin_bottom``
                / ``margin_left`` / ``margin_right`` values in EMUs.

        Returns:
            ``(usable_width_px, usable_height_px)`` at 96 DPI.
        """
        # Insets assumed (in inches) when the text frame reports no margin.
        margins = {"top": 0.05, "bottom": 0.05, "left": 0.1, "right": 0.1}

        # Override with the actual margins. ``is not None`` (rather than
        # truthiness) so an explicit zero margin is honoured instead of being
        # replaced by the default inset.
        for side in margins:
            value = getattr(text_frame, f"margin_{side}", None)
            if value is not None:
                margins[side] = self.emu_to_inches(value)

        usable_width = self.width - margins["left"] - margins["right"]
        usable_height = self.height - margins["top"] - margins["bottom"]
        return (
            self.inches_to_pixels(usable_width),
            self.inches_to_pixels(usable_height),
        )

    def _wrap_text_line(self, line: str, max_width_px: int, draw, font) -> List[str]:
        """Wrap a single line of text to fit within max_width_px.

        Wrapping happens at spaces only; a single word wider than the limit
        is kept whole on its own line.
        """
        if not line:
            return [""]
        # Use textlength for efficient width calculation
        if draw.textlength(line, font=font) <= max_width_px:
            return [line]

        # Need to wrap - split into words and fill lines greedily
        wrapped: List[str] = []
        current_line = ""
        for word in line.split(" "):
            test_line = f"{current_line} {word}" if current_line else word
            if draw.textlength(test_line, font=font) <= max_width_px:
                current_line = test_line
            else:
                if current_line:
                    wrapped.append(current_line)
                current_line = word
        if current_line:
            wrapped.append(current_line)
        return wrapped

    def _estimate_frame_overflow(self) -> None:
        """Estimate if text overflows the shape bounds using PIL text measurement.

        Sets ``frame_overflow_bottom`` (inches) when the estimated text height
        exceeds the usable frame height by more than 0.05 inches.
        """
        if not self.shape or not hasattr(self.shape, "text_frame"):
            return
        text_frame = self.shape.text_frame  # type: ignore
        if not text_frame or not text_frame.paragraphs:
            return

        # Get usable dimensions after accounting for margins
        usable_width_px, usable_height_px = self._get_usable_dimensions(text_frame)
        if usable_width_px <= 0 or usable_height_px <= 0:
            return

        px_per_pt = self._DPI / 72
        # Set up PIL for text measurement
        draw = ImageDraw.Draw(Image.new("RGB", (1, 1)))

        # Prefer the size defined on the matching layout placeholder; fall
        # back to the slide master's style, then to the conservative default.
        fallback_font_size = self.default_font_size or self._get_default_font_size()

        # Calculate total height of all paragraphs
        total_height_px = 0.0
        for para_idx, paragraph in enumerate(text_frame.paragraphs):
            if not paragraph.text.strip():
                continue
            para_data = ParagraphData(paragraph)
            font_size = para_data.font_size or fallback_font_size

            # PIL sizes fonts in pixels, so convert the point size to pixels
            # at the same DPI as the frame width; otherwise glyph widths come
            # out too narrow and wrapping is under-estimated.
            font = self._load_font(
                para_data.font_name or "Arial", font_size * px_per_pt
            )

            # python-pptx reports soft line breaks as "\v"; treat them like "\n".
            all_wrapped_lines: List[str] = []
            for line in paragraph.text.replace("\v", "\n").split("\n"):
                all_wrapped_lines.extend(
                    self._wrap_text_line(line, usable_width_px, draw, font)
                )
            if not all_wrapped_lines:
                continue

            # Explicit line spacing wins; otherwise single spacing is
            # estimated as 1.0x the font size.
            line_height_px = (para_data.line_spacing or font_size) * px_per_pt

            # Add space_before (except first paragraph)
            if para_idx > 0 and para_data.space_before:
                total_height_px += para_data.space_before * px_per_pt
            # Add paragraph text height
            total_height_px += len(all_wrapped_lines) * line_height_px
            # Add space_after
            if para_data.space_after:
                total_height_px += para_data.space_after * px_per_pt

        # Check for overflow (ignore negligible overflows <= 0.05")
        if total_height_px > usable_height_px:
            overflow_px = total_height_px - usable_height_px
            overflow_inches = round(overflow_px / float(self._DPI), 2)
            if overflow_inches > 0.05:  # Only report significant overflows
                self.frame_overflow_bottom = overflow_inches

    def _calculate_slide_overflow(self) -> None:
        """Calculate if shape overflows the slide boundaries.

        Sets ``slide_overflow_right`` / ``slide_overflow_bottom`` (inches) when
        the shape extends past the slide by more than 0.01 inches. Does
        nothing when the slide dimensions are unknown.
        """
        if self.slide_width_emu is None or self.slide_height_emu is None:
            return

        # Check right overflow (ignore negligible overflows <= 0.01")
        right_edge_emu = self.left_emu + self.width_emu
        if right_edge_emu > self.slide_width_emu:
            overflow_emu = right_edge_emu - self.slide_width_emu
            overflow_inches = round(self.emu_to_inches(overflow_emu), 2)
            if overflow_inches > 0.01:  # Only report significant overflows
                self.slide_overflow_right = overflow_inches

        # Check bottom overflow (ignore negligible overflows <= 0.01")
        bottom_edge_emu = self.top_emu + self.height_emu
        if bottom_edge_emu > self.slide_height_emu:
            overflow_emu = bottom_edge_emu - self.slide_height_emu
            overflow_inches = round(self.emu_to_inches(overflow_emu), 2)
            if overflow_inches > 0.01:  # Only report significant overflows
                self.slide_overflow_bottom = overflow_inches

    def _detect_bullet_issues(self) -> None:
        """Detect bullet point formatting issues in paragraphs.

        Appends a single warning when any paragraph starts with a typed
        bullet symbol followed by a space.
        """
        if not self.shape or not hasattr(self.shape, "text_frame"):
            return
        text_frame = self.shape.text_frame  # type: ignore
        if not text_frame or not text_frame.paragraphs:
            return

        # Common bullet symbols that indicate manual bullets
        manual_prefixes = ("• ", "● ", "○ ")
        for paragraph in text_frame.paragraphs:
            if paragraph.text.strip().startswith(manual_prefixes):
                self.warnings.append(
                    "manual_bullet_symbol: use proper bullet formatting"
                )
                break  # One warning per shape is enough

    @property
    def has_any_issues(self) -> bool:
        """Check if shape has any issues (overflow, overlap, or warnings)."""
        return (
            self.frame_overflow_bottom is not None
            or self.slide_overflow_right is not None
            or self.slide_overflow_bottom is not None
            or len(self.overlapping_shapes) > 0
            or len(self.warnings) > 0
        )

    def to_dict(self) -> ShapeDict:
        """Convert to dictionary for JSON serialization.

        Geometry is always present; placeholder, overflow, overlap and warning
        fields are added only when they carry information; ``paragraphs`` is
        always the last key.
        """
        result: ShapeDict = {
            "left": self.left,
            "top": self.top,
            "width": self.width,
            "height": self.height,
        }

        # Add optional fields if present
        if self.placeholder_type:
            result["placeholder_type"] = self.placeholder_type
        if self.default_font_size:
            result["default_font_size"] = self.default_font_size

        # Add overflow information only if there is overflow
        overflow_data: Dict[str, Any] = {}
        if self.frame_overflow_bottom is not None:
            overflow_data["frame"] = {"overflow_bottom": self.frame_overflow_bottom}
        slide_overflow: Dict[str, float] = {}
        if self.slide_overflow_right is not None:
            slide_overflow["overflow_right"] = self.slide_overflow_right
        if self.slide_overflow_bottom is not None:
            slide_overflow["overflow_bottom"] = self.slide_overflow_bottom
        if slide_overflow:
            overflow_data["slide"] = slide_overflow
        if overflow_data:
            result["overflow"] = overflow_data

        # Add overlap field if there are overlapping shapes
        if self.overlapping_shapes:
            result["overlap"] = {"overlapping_shapes": self.overlapping_shapes}

        # Add warnings field if there are warnings
        if self.warnings:
            result["warnings"] = self.warnings

        # Paragraphs go last, after all metadata and issue fields
        result["paragraphs"] = [para.to_dict() for para in self.paragraphs]
        return result
def is_valid_shape(shape: BaseShape) -> bool:
    """Return True when ``shape`` carries text worth including in the inventory.

    A shape is rejected when it has no text frame, when its text is blank,
    when it is a SLIDE_NUMBER placeholder, or when it is a FOOTER placeholder
    whose text consists only of digits.
    """
    frame = getattr(shape, "text_frame", None)
    if not frame:
        return False

    content = frame.text.strip()  # type: ignore
    if not content:
        return False

    # Only placeholders can be slide numbers or numeric footers.
    if getattr(shape, "is_placeholder", False):
        fmt = shape.placeholder_format  # type: ignore
        if fmt and fmt.type:
            # Keep the bare member name, dropping any "Enum." prefix and
            # anything after the first space.
            kind = str(fmt.type).split(".")[-1].split(" ")[0]
            if kind == "SLIDE_NUMBER":
                return False
            if kind == "FOOTER" and content.isdigit():
                return False
    return True
def collect_shapes_with_absolute_positions(
    shape: BaseShape, parent_left: int = 0, parent_top: int = 0
) -> List[ShapeWithPosition]:
    """Recursively collect all shapes with valid text, calculating absolute positions.

    Anything exposing a ``shapes`` attribute is treated as a group: its own
    left/top are added to the accumulated parent offset and the children are
    processed with that sum. Any other shape is returned (with the offset
    added to its own left/top) only if ``is_valid_shape`` accepts it.

    A shape or group that reports ``None`` (or lacks) ``left`` / ``top`` is
    treated as sitting at offset 0 rather than raising.

    NOTE(review): this assumes child coordinates are plain offsets from the
    group's top-left corner. Groups whose child coordinate space has its own
    origin or scale would need an extra transform; confirm against real decks.

    Args:
        shape: The shape to process
        parent_left: Accumulated left offset from parent groups (in EMUs)
        parent_top: Accumulated top offset from parent groups (in EMUs)

    Returns:
        List of ShapeWithPosition objects with absolute positions
    """
    if hasattr(shape, "shapes"):  # GroupShape
        # Offset that applies to every child of this group.
        abs_group_left = parent_left + (getattr(shape, "left", None) or 0)
        abs_group_top = parent_top + (getattr(shape, "top", None) or 0)

        collected: List[ShapeWithPosition] = []
        for child in shape.shapes:  # type: ignore
            collected.extend(
                collect_shapes_with_absolute_positions(
                    child, abs_group_left, abs_group_top
                )
            )
        return collected

    # Regular shape - keep it only if it has valid text
    if not is_valid_shape(shape):
        return []
    return [
        ShapeWithPosition(
            shape=shape,
            absolute_left=parent_left + (getattr(shape, "left", None) or 0),
            absolute_top=parent_top + (getattr(shape, "top", None) or 0),
        )
    ]
def sort_shapes_by_position(shapes: List[ShapeData]) -> List[ShapeData]:
    """Order shapes in reading order: rows top-to-bottom, left-to-right within a row.

    A shape joins the current row when its top is within 0.5 inches of the
    top of the shape that opened the row; otherwise it opens a new row.
    An empty input is returned unchanged.
    """
    if not shapes:
        return shapes

    by_top = sorted(shapes, key=lambda s: (s.top, s.left))

    # Partition into rows, each anchored at the top of its first shape.
    rows = [[by_top[0]]]
    anchor_top = by_top[0].top
    for candidate in by_top[1:]:
        if abs(candidate.top - anchor_top) > 0.5:
            rows.append([candidate])
            anchor_top = candidate.top
        else:
            rows[-1].append(candidate)

    # Flatten, ordering each row by horizontal position.
    return [
        member for row in rows for member in sorted(row, key=lambda s: s.left)
    ]
def calculate_overlap(
    rect1: Tuple[float, float, float, float],
    rect2: Tuple[float, float, float, float],
    tolerance: float = 0.05,
) -> Tuple[bool, float]:
    """Calculate if and how much two rectangles overlap.

    Args:
        rect1: (left, top, width, height) of first rectangle in inches
        rect2: (left, top, width, height) of second rectangle in inches
        tolerance: Minimum overlap in inches, required along BOTH axes, for
            the rectangles to count as overlapping (default: 0.05")

    Returns:
        Tuple of (overlaps, overlap_area) where:
        - overlaps: True if rectangles overlap by more than tolerance
        - overlap_area: Area of overlap in square inches rounded to two
          decimals, or 0.0 when the rectangles do not overlap
    """
    left1, top1, w1, h1 = rect1
    left2, top2, w2, h2 = rect2

    # Extent of the intersection along each axis; negative or zero means the
    # rectangles are apart (or merely touching) on that axis.
    overlap_width = min(left1 + w1, left2 + w2) - max(left1, left2)
    overlap_height = min(top1 + h1, top2 + h2) - max(top1, top2)

    if overlap_width > tolerance and overlap_height > tolerance:
        return True, round(overlap_width * overlap_height, 2)
    # Float, matching the declared return type (the area is always a float).
    return False, 0.0
def detect_overlaps(shapes: List[ShapeData]) -> None:
    """Record, on every shape, which other shapes it overlaps and by how much.

    Each unordered pair is checked once with ``calculate_overlap``; when the
    pair overlaps, both shapes' ``overlapping_shapes`` dictionaries gain an
    entry keyed by the other shape's ID with the overlap area in square
    inches. Shapes are modified in place.

    Args:
        shapes: List of ShapeData objects with shape_id attributes set
    """
    total = len(shapes)
    for i, first in enumerate(shapes):
        for j in range(i + 1, total):
            second = shapes[j]

            # IDs are the dictionary keys below, so they must be assigned.
            assert first.shape_id, f"Shape at index {i} has no shape_id"
            assert second.shape_id, f"Shape at index {j} has no shape_id"

            hit, area = calculate_overlap(
                (first.left, first.top, first.width, first.height),
                (second.left, second.top, second.width, second.height),
            )
            if not hit:
                continue
            # Symmetric bookkeeping: each side learns about the other.
            first.overlapping_shapes[second.shape_id] = area
            second.overlapping_shapes[first.shape_id] = area
def extract_text_inventory(
    pptx_path: Path, prs: Optional[Any] = None, issues_only: bool = False
) -> InventoryData:
    """Build the per-slide inventory of text shapes for a presentation.

    Args:
        pptx_path: Path to the PowerPoint file
        prs: Optional Presentation object to use. If not provided, will load from pptx_path.
        issues_only: If True, only include shapes that have overflow or overlap issues

    Returns:
        A nested dictionary ``{slide-N: {shape-N: ShapeData}}`` where the slide
        number is the zero-based slide index and shape numbers follow visual
        order (top-to-bottom, left-to-right). Slides that end up with no
        shapes are omitted. Use ``ShapeData.to_dict()`` to obtain
        JSON-serializable values.
    """
    presentation = Presentation(str(pptx_path)) if prs is None else prs

    inventory: InventoryData = {}
    for slide_idx, slide in enumerate(presentation.slides):
        # Flatten groups into text shapes carrying slide-absolute positions.
        located = [
            swp
            for top_level in slide.shapes  # type: ignore
            for swp in collect_shapes_with_absolute_positions(top_level)
        ]
        if not located:
            continue

        # Wrap each shape, then put the wrappers into visual order.
        ordered = sort_shapes_by_position(
            [
                ShapeData(swp.shape, swp.absolute_left, swp.absolute_top, slide)
                for swp in located
            ]
        )

        # IDs are assigned before any filtering, so the same shape keeps the
        # same ID whether or not issues_only is set.
        for position, shape_data in enumerate(ordered):
            shape_data.shape_id = f"shape-{position}"

        # Overlap detection needs the IDs and at least two shapes.
        if len(ordered) > 1:
            detect_overlaps(ordered)

        # Filtering happens after overlap detection so overlaps count as issues.
        if issues_only:
            ordered = [sd for sd in ordered if sd.has_any_issues]
        if not ordered:
            continue

        inventory[f"slide-{slide_idx}"] = {sd.shape_id: sd for sd in ordered}
    return inventory
def get_inventory_as_dict(pptx_path: Path, issues_only: bool = False) -> InventoryDict:
    """Extract the text inventory and return it as plain dictionaries.

    Convenience wrapper around ``extract_text_inventory`` that replaces every
    ShapeData object with the result of its ``to_dict()``.

    Args:
        pptx_path: Path to the PowerPoint file
        issues_only: If True, only include shapes that have overflow or overlap issues

    Returns:
        Nested dictionary with all data serialized for JSON
    """
    extracted = extract_text_inventory(pptx_path, issues_only=issues_only)
    return {
        slide_key: {
            shape_key: shape_data.to_dict()
            for shape_key, shape_data in slide_shapes.items()
        }
        for slide_key, slide_shapes in extracted.items()
    }
def save_inventory(inventory: InventoryData, output_path: Path) -> None:
    """Write the inventory to ``output_path`` as indented UTF-8 JSON.

    Every ShapeData value is converted with its ``to_dict()`` first.
    Non-ASCII characters are written as-is rather than escaped.
    """
    serializable: InventoryDict = {
        slide_key: {
            shape_key: shape_data.to_dict()
            for shape_key, shape_data in slide_shapes.items()
        }
        for slide_key, slide_shapes in inventory.items()
    }
    payload = json.dumps(serializable, indent=2, ensure_ascii=False)
    with open(output_path, "w", encoding="utf-8") as handle:
        handle.write(payload)
# Run the command-line interface only when executed as a script, not on import.
if __name__ == "__main__":
    main()
|