| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159 |
- #!/usr/bin/env python3
- """
- Tool to pack a directory into a .docx, .pptx, or .xlsx file with XML formatting undone.
- Example usage:
- python pack.py <input_directory> <office_file> [--force]
- """
- import argparse
- import shutil
- import subprocess
- import sys
- import tempfile
- import defusedxml.minidom
- import zipfile
- from pathlib import Path
def main():
    """Command-line entry point: parse arguments, pack, and report the outcome.

    Validation is requested from ``pack_document`` unless ``--force`` is given.
    The process exits with status 1 when validation rejects the packed file,
    and exits with an ``Error: ...`` message when ``pack_document`` raises
    ``ValueError``.
    """
    cli = argparse.ArgumentParser(description="Pack a directory into an Office file")
    cli.add_argument("input_directory", help="Unpacked Office document directory")
    cli.add_argument("output_file", help="Output Office file (.docx/.pptx/.xlsx)")
    cli.add_argument("--force", action="store_true", help="Skip validation")
    options = cli.parse_args()

    run_validation = not options.force
    try:
        packed_ok = pack_document(
            options.input_directory,
            options.output_file,
            validate=run_validation,
        )
        if not run_validation:
            # --force: nothing was checked, so only warn the user.
            print("Warning: Skipped validation, file may be corrupt", file=sys.stderr)
        elif not packed_ok:
            # Validation ran and rejected the result: explain and fail.
            failure_lines = (
                "Contents would produce a corrupt file.",
                "Please validate XML before repacking.",
                "Use --force to skip validation and pack anyway.",
            )
            for line in failure_lines:
                print(line, file=sys.stderr)
            sys.exit(1)
    except ValueError as err:
        sys.exit(f"Error: {err}")
def pack_document(input_dir, output_file, validate=False):
    """Zip an unpacked Office document directory into a .docx/.pptx/.xlsx file.

    The input tree is copied to a temporary location first, so the caller's
    directory is never modified; XML parts are condensed in that copy before
    being archived.

    Args:
        input_dir: Path to unpacked Office document directory
        output_file: Path to output Office file
        validate: If True, validates with soffice (default: False)

    Returns:
        bool: True if successful, False if validation failed (in which case
        the freshly written output file is removed again)

    Raises:
        ValueError: If input_dir is not a directory or output_file does not
            end in .docx, .pptx, or .xlsx.
    """
    source_root = Path(input_dir)
    target = Path(output_file)

    if not source_root.is_dir():
        raise ValueError(f"{source_root} is not a directory")
    allowed_suffixes = {".docx", ".pptx", ".xlsx"}
    if target.suffix.lower() not in allowed_suffixes:
        raise ValueError(f"{target} must be a .docx, .pptx, or .xlsx file")

    # Operate on a scratch copy so the original directory stays untouched.
    with tempfile.TemporaryDirectory() as scratch:
        staging = Path(scratch) / "content"
        shutil.copytree(source_root, staging)

        # Undo pretty-printing in every XML part and relationship file.
        xml_parts = (
            part for pattern in ("*.xml", "*.rels") for part in staging.rglob(pattern)
        )
        for part in xml_parts:
            condense_xml(part)

        # Write the archive, storing each file under its path relative to the root.
        target.parent.mkdir(parents=True, exist_ok=True)
        with zipfile.ZipFile(target, "w", zipfile.ZIP_DEFLATED) as archive:
            for entry in staging.rglob("*"):
                if not entry.is_file():
                    continue
                archive.write(entry, entry.relative_to(staging))

        # Optionally verify the result; a rejected file is deleted.
        if validate and not validate_document(target):
            target.unlink()
            return False

    return True
def validate_document(doc_path, timeout=10):
    """Validate a packed document by converting it to HTML with soffice.

    The document is considered valid when ``soffice --headless --convert-to``
    leaves ``<stem>.html`` in a temporary output directory.

    Args:
        doc_path: Path (str or Path) of the .docx/.pptx/.xlsx file to check.
        timeout: Seconds to wait for soffice before giving up (default: 10).

    Returns:
        bool: True if the conversion produced an HTML file, or if soffice is
        not installed (validation is skipped with a warning). False if the
        extension is unsupported, the conversion timed out or failed, or no
        HTML output appeared.
    """
    # Accept plain strings as well as Path objects, like pack_document does.
    doc_path = Path(doc_path)

    # Export filter to request from soffice for each supported extension.
    html_filters = {
        ".docx": "html:HTML",
        ".pptx": "html:impress_html_Export",
        ".xlsx": "html:HTML (StarCalc)",
    }
    filter_name = html_filters.get(doc_path.suffix.lower())
    if filter_name is None:
        # Report the real problem instead of failing later on an unbound name.
        print(
            f"Validation error: unsupported file type '{doc_path.suffix}'",
            file=sys.stderr,
        )
        return False

    with tempfile.TemporaryDirectory() as temp_dir:
        try:
            result = subprocess.run(
                [
                    "soffice",
                    "--headless",
                    "--convert-to",
                    filter_name,
                    "--outdir",
                    temp_dir,
                    str(doc_path),
                ],
                capture_output=True,
                timeout=timeout,
                text=True,
            )
            # The presence of the converted file is the success signal.
            if not (Path(temp_dir) / f"{doc_path.stem}.html").exists():
                error_msg = result.stderr.strip() or "Document validation failed"
                print(f"Validation error: {error_msg}", file=sys.stderr)
                return False
            return True
        except FileNotFoundError:
            # No soffice executable: deliberately best-effort, treat as valid.
            print("Warning: soffice not found. Skipping validation.", file=sys.stderr)
            return True
        except subprocess.TimeoutExpired:
            print("Validation error: Timeout during conversion", file=sys.stderr)
            return False
        except Exception as e:
            # Any other failure is reported and counted as a failed validation.
            print(f"Validation error: {e}", file=sys.stderr)
            return False
def condense_xml(xml_file):
    """Strip pretty-printing whitespace and comments from an XML file in place.

    Whitespace-only text nodes and comment nodes are removed from every
    element, with two exceptions that protect real document text:

    * elements whose local name is ``t`` (``w:t``, ``a:t``, or an unprefixed
      ``t`` in a default namespace) are left completely untouched;
    * elements carrying ``xml:space="preserve"`` keep their whitespace-only
      text nodes (comments inside them are still removed).

    Args:
        xml_file: Path of the XML file to rewrite. The result is written back
            as UTF-8.
    """
    with open(xml_file, "r", encoding="utf-8") as f:
        dom = defusedxml.minidom.parse(f)

    for element in dom.getElementsByTagName("*"):
        # Compare the local name so that unprefixed <t> elements are protected
        # as well; the previous ":t" suffix test only matched prefixed tags.
        if element.tagName.rpartition(":")[2] == "t":
            continue

        # Whitespace in an element marked xml:space="preserve" is content,
        # not formatting. Only the element's own attribute is consulted.
        keep_whitespace = element.getAttribute("xml:space") == "preserve"

        # Iterate over a copy because children are removed during the loop.
        for child in list(element.childNodes):
            if child.nodeType == child.COMMENT_NODE:
                element.removeChild(child)
            elif (
                not keep_whitespace
                and child.nodeType == child.TEXT_NODE
                and child.nodeValue
                and child.nodeValue.strip() == ""
            ):
                element.removeChild(child)

    # Write back the condensed XML
    with open(xml_file, "wb") as f:
        f.write(dom.toxml(encoding="UTF-8"))
# Run the command-line interface only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()
|