unpack.py 1.0 KB

1234567891011121314151617181920212223242526272829
  1. #!/usr/bin/env python3
  2. """Unpack and format XML contents of Office files (.docx, .pptx, .xlsx)"""
  3. import random
  4. import sys
  5. import defusedxml.minidom
  6. import zipfile
  7. from pathlib import Path
  8. # Get command line arguments
  9. assert len(sys.argv) == 3, "Usage: python unpack.py <office_file> <output_dir>"
  10. input_file, output_dir = sys.argv[1], sys.argv[2]
  11. # Extract and format
  12. output_path = Path(output_dir)
  13. output_path.mkdir(parents=True, exist_ok=True)
  14. zipfile.ZipFile(input_file).extractall(output_path)
  15. # Pretty print all XML files
  16. xml_files = list(output_path.rglob("*.xml")) + list(output_path.rglob("*.rels"))
  17. for xml_file in xml_files:
  18. content = xml_file.read_text(encoding="utf-8")
  19. dom = defusedxml.minidom.parseString(content)
  20. xml_file.write_bytes(dom.toprettyxml(indent=" ", encoding="ascii"))
  21. # For .docx files, suggest an RSID for tracked changes
  22. if input_file.endswith(".docx"):
  23. suggested_rsid = "".join(random.choices("0123456789ABCDEF", k=8))
  24. print(f"Suggested RSID for edit session: {suggested_rsid}")