pack.py 5.5 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159
  1. #!/usr/bin/env python3
  2. """
  3. Tool to pack a directory into a .docx, .pptx, or .xlsx file with XML formatting undone.
  4. Example usage:
  5. python pack.py <input_directory> <office_file> [--force]
  6. """
  7. import argparse
  8. import shutil
  9. import subprocess
  10. import sys
  11. import tempfile
  12. import defusedxml.minidom
  13. import zipfile
  14. from pathlib import Path
  15. def main():
  16. parser = argparse.ArgumentParser(description="Pack a directory into an Office file")
  17. parser.add_argument("input_directory", help="Unpacked Office document directory")
  18. parser.add_argument("output_file", help="Output Office file (.docx/.pptx/.xlsx)")
  19. parser.add_argument("--force", action="store_true", help="Skip validation")
  20. args = parser.parse_args()
  21. try:
  22. success = pack_document(
  23. args.input_directory, args.output_file, validate=not args.force
  24. )
  25. # Show warning if validation was skipped
  26. if args.force:
  27. print("Warning: Skipped validation, file may be corrupt", file=sys.stderr)
  28. # Exit with error if validation failed
  29. elif not success:
  30. print("Contents would produce a corrupt file.", file=sys.stderr)
  31. print("Please validate XML before repacking.", file=sys.stderr)
  32. print("Use --force to skip validation and pack anyway.", file=sys.stderr)
  33. sys.exit(1)
  34. except ValueError as e:
  35. sys.exit(f"Error: {e}")
  36. def pack_document(input_dir, output_file, validate=False):
  37. """Pack a directory into an Office file (.docx/.pptx/.xlsx).
  38. Args:
  39. input_dir: Path to unpacked Office document directory
  40. output_file: Path to output Office file
  41. validate: If True, validates with soffice (default: False)
  42. Returns:
  43. bool: True if successful, False if validation failed
  44. """
  45. input_dir = Path(input_dir)
  46. output_file = Path(output_file)
  47. if not input_dir.is_dir():
  48. raise ValueError(f"{input_dir} is not a directory")
  49. if output_file.suffix.lower() not in {".docx", ".pptx", ".xlsx"}:
  50. raise ValueError(f"{output_file} must be a .docx, .pptx, or .xlsx file")
  51. # Work in temporary directory to avoid modifying original
  52. with tempfile.TemporaryDirectory() as temp_dir:
  53. temp_content_dir = Path(temp_dir) / "content"
  54. shutil.copytree(input_dir, temp_content_dir)
  55. # Process XML files to remove pretty-printing whitespace
  56. for pattern in ["*.xml", "*.rels"]:
  57. for xml_file in temp_content_dir.rglob(pattern):
  58. condense_xml(xml_file)
  59. # Create final Office file as zip archive
  60. output_file.parent.mkdir(parents=True, exist_ok=True)
  61. with zipfile.ZipFile(output_file, "w", zipfile.ZIP_DEFLATED) as zf:
  62. for f in temp_content_dir.rglob("*"):
  63. if f.is_file():
  64. zf.write(f, f.relative_to(temp_content_dir))
  65. # Validate if requested
  66. if validate:
  67. if not validate_document(output_file):
  68. output_file.unlink() # Delete the corrupt file
  69. return False
  70. return True
  71. def validate_document(doc_path):
  72. """Validate document by converting to HTML with soffice."""
  73. # Determine the correct filter based on file extension
  74. match doc_path.suffix.lower():
  75. case ".docx":
  76. filter_name = "html:HTML"
  77. case ".pptx":
  78. filter_name = "html:impress_html_Export"
  79. case ".xlsx":
  80. filter_name = "html:HTML (StarCalc)"
  81. with tempfile.TemporaryDirectory() as temp_dir:
  82. try:
  83. result = subprocess.run(
  84. [
  85. "soffice",
  86. "--headless",
  87. "--convert-to",
  88. filter_name,
  89. "--outdir",
  90. temp_dir,
  91. str(doc_path),
  92. ],
  93. capture_output=True,
  94. timeout=10,
  95. text=True,
  96. )
  97. if not (Path(temp_dir) / f"{doc_path.stem}.html").exists():
  98. error_msg = result.stderr.strip() or "Document validation failed"
  99. print(f"Validation error: {error_msg}", file=sys.stderr)
  100. return False
  101. return True
  102. except FileNotFoundError:
  103. print("Warning: soffice not found. Skipping validation.", file=sys.stderr)
  104. return True
  105. except subprocess.TimeoutExpired:
  106. print("Validation error: Timeout during conversion", file=sys.stderr)
  107. return False
  108. except Exception as e:
  109. print(f"Validation error: {e}", file=sys.stderr)
  110. return False
  111. def condense_xml(xml_file):
  112. """Strip unnecessary whitespace and remove comments."""
  113. with open(xml_file, "r", encoding="utf-8") as f:
  114. dom = defusedxml.minidom.parse(f)
  115. # Process each element to remove whitespace and comments
  116. for element in dom.getElementsByTagName("*"):
  117. # Skip w:t elements and their processing
  118. if element.tagName.endswith(":t"):
  119. continue
  120. # Remove whitespace-only text nodes and comment nodes
  121. for child in list(element.childNodes):
  122. if (
  123. child.nodeType == child.TEXT_NODE
  124. and child.nodeValue
  125. and child.nodeValue.strip() == ""
  126. ) or child.nodeType == child.COMMENT_NODE:
  127. element.removeChild(child)
  128. # Write back the condensed XML
  129. with open(xml_file, "wb") as f:
  130. f.write(dom.toxml(encoding="UTF-8"))
  131. if __name__ == "__main__":
  132. main()