fill_pdf_form_with_annotations.py 3.5 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108
  1. import json
  2. import sys
  3. from pypdf import PdfReader, PdfWriter
  4. from pypdf.annotations import FreeText
  5. # Fills a PDF by adding text annotations defined in `fields.json`. See forms.md.
  6. def transform_coordinates(bbox, image_width, image_height, pdf_width, pdf_height):
  7. """Transform bounding box from image coordinates to PDF coordinates"""
  8. # Image coordinates: origin at top-left, y increases downward
  9. # PDF coordinates: origin at bottom-left, y increases upward
  10. x_scale = pdf_width / image_width
  11. y_scale = pdf_height / image_height
  12. left = bbox[0] * x_scale
  13. right = bbox[2] * x_scale
  14. # Flip Y coordinates for PDF
  15. top = pdf_height - (bbox[1] * y_scale)
  16. bottom = pdf_height - (bbox[3] * y_scale)
  17. return left, bottom, right, top
  18. def fill_pdf_form(input_pdf_path, fields_json_path, output_pdf_path):
  19. """Fill the PDF form with data from fields.json"""
  20. # `fields.json` format described in forms.md.
  21. with open(fields_json_path, "r") as f:
  22. fields_data = json.load(f)
  23. # Open the PDF
  24. reader = PdfReader(input_pdf_path)
  25. writer = PdfWriter()
  26. # Copy all pages to writer
  27. writer.append(reader)
  28. # Get PDF dimensions for each page
  29. pdf_dimensions = {}
  30. for i, page in enumerate(reader.pages):
  31. mediabox = page.mediabox
  32. pdf_dimensions[i + 1] = [mediabox.width, mediabox.height]
  33. # Process each form field
  34. annotations = []
  35. for field in fields_data["form_fields"]:
  36. page_num = field["page_number"]
  37. # Get page dimensions and transform coordinates.
  38. page_info = next(p for p in fields_data["pages"] if p["page_number"] == page_num)
  39. image_width = page_info["image_width"]
  40. image_height = page_info["image_height"]
  41. pdf_width, pdf_height = pdf_dimensions[page_num]
  42. transformed_entry_box = transform_coordinates(
  43. field["entry_bounding_box"],
  44. image_width, image_height,
  45. pdf_width, pdf_height
  46. )
  47. # Skip empty fields
  48. if "entry_text" not in field or "text" not in field["entry_text"]:
  49. continue
  50. entry_text = field["entry_text"]
  51. text = entry_text["text"]
  52. if not text:
  53. continue
  54. font_name = entry_text.get("font", "Arial")
  55. font_size = str(entry_text.get("font_size", 14)) + "pt"
  56. font_color = entry_text.get("font_color", "000000")
  57. # Font size/color seems to not work reliably across viewers:
  58. # https://github.com/py-pdf/pypdf/issues/2084
  59. annotation = FreeText(
  60. text=text,
  61. rect=transformed_entry_box,
  62. font=font_name,
  63. font_size=font_size,
  64. font_color=font_color,
  65. border_color=None,
  66. background_color=None,
  67. )
  68. annotations.append(annotation)
  69. # page_number is 0-based for pypdf
  70. writer.add_annotation(page_number=page_num - 1, annotation=annotation)
  71. # Save the filled PDF
  72. with open(output_pdf_path, "wb") as output:
  73. writer.write(output)
  74. print(f"Successfully filled PDF form and saved to {output_pdf_path}")
  75. print(f"Added {len(annotations)} text annotations")
  76. if __name__ == "__main__":
  77. if len(sys.argv) != 4:
  78. print("Usage: fill_pdf_form_with_annotations.py [input pdf] [fields.json] [output pdf]")
  79. sys.exit(1)
  80. input_pdf = sys.argv[1]
  81. fields_json = sys.argv[2]
  82. output_pdf = sys.argv[3]
  83. fill_pdf_form(input_pdf, fields_json, output_pdf)