extract_form_field_info.py 6.0 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152
  1. import json
  2. import sys
  3. from pypdf import PdfReader
  4. # Extracts data for the fillable form fields in a PDF and outputs JSON that
  5. # Claude uses to fill the fields. See forms.md.
  6. # This matches the format used by PdfReader `get_fields` and `update_page_form_field_values` methods.
  7. def get_full_annotation_field_id(annotation):
  8. components = []
  9. while annotation:
  10. field_name = annotation.get('/T')
  11. if field_name:
  12. components.append(field_name)
  13. annotation = annotation.get('/Parent')
  14. return ".".join(reversed(components)) if components else None
  15. def make_field_dict(field, field_id):
  16. field_dict = {"field_id": field_id}
  17. ft = field.get('/FT')
  18. if ft == "/Tx":
  19. field_dict["type"] = "text"
  20. elif ft == "/Btn":
  21. field_dict["type"] = "checkbox" # radio groups handled separately
  22. states = field.get("/_States_", [])
  23. if len(states) == 2:
  24. # "/Off" seems to always be the unchecked value, as suggested by
  25. # https://opensource.adobe.com/dc-acrobat-sdk-docs/standards/pdfstandards/pdf/PDF32000_2008.pdf#page=448
  26. # It can be either first or second in the "/_States_" list.
  27. if "/Off" in states:
  28. field_dict["checked_value"] = states[0] if states[0] != "/Off" else states[1]
  29. field_dict["unchecked_value"] = "/Off"
  30. else:
  31. print(f"Unexpected state values for checkbox `${field_id}`. Its checked and unchecked values may not be correct; if you're trying to check it, visually verify the results.")
  32. field_dict["checked_value"] = states[0]
  33. field_dict["unchecked_value"] = states[1]
  34. elif ft == "/Ch":
  35. field_dict["type"] = "choice"
  36. states = field.get("/_States_", [])
  37. field_dict["choice_options"] = [{
  38. "value": state[0],
  39. "text": state[1],
  40. } for state in states]
  41. else:
  42. field_dict["type"] = f"unknown ({ft})"
  43. return field_dict
  44. # Returns a list of fillable PDF fields:
  45. # [
  46. # {
  47. # "field_id": "name",
  48. # "page": 1,
  49. # "type": ("text", "checkbox", "radio_group", or "choice")
  50. # // Per-type additional fields described in forms.md
  51. # },
  52. # ]
  53. def get_field_info(reader: PdfReader):
  54. fields = reader.get_fields()
  55. field_info_by_id = {}
  56. possible_radio_names = set()
  57. for field_id, field in fields.items():
  58. # Skip if this is a container field with children, except that it might be
  59. # a parent group for radio button options.
  60. if field.get("/Kids"):
  61. if field.get("/FT") == "/Btn":
  62. possible_radio_names.add(field_id)
  63. continue
  64. field_info_by_id[field_id] = make_field_dict(field, field_id)
  65. # Bounding rects are stored in annotations in page objects.
  66. # Radio button options have a separate annotation for each choice;
  67. # all choices have the same field name.
  68. # See https://westhealth.github.io/exploring-fillable-forms-with-pdfrw.html
  69. radio_fields_by_id = {}
  70. for page_index, page in enumerate(reader.pages):
  71. annotations = page.get('/Annots', [])
  72. for ann in annotations:
  73. field_id = get_full_annotation_field_id(ann)
  74. if field_id in field_info_by_id:
  75. field_info_by_id[field_id]["page"] = page_index + 1
  76. field_info_by_id[field_id]["rect"] = ann.get('/Rect')
  77. elif field_id in possible_radio_names:
  78. try:
  79. # ann['/AP']['/N'] should have two items. One of them is '/Off',
  80. # the other is the active value.
  81. on_values = [v for v in ann["/AP"]["/N"] if v != "/Off"]
  82. except KeyError:
  83. continue
  84. if len(on_values) == 1:
  85. rect = ann.get("/Rect")
  86. if field_id not in radio_fields_by_id:
  87. radio_fields_by_id[field_id] = {
  88. "field_id": field_id,
  89. "type": "radio_group",
  90. "page": page_index + 1,
  91. "radio_options": [],
  92. }
  93. # Note: at least on macOS 15.7, Preview.app doesn't show selected
  94. # radio buttons correctly. (It does if you remove the leading slash
  95. # from the value, but that causes them not to appear correctly in
  96. # Chrome/Firefox/Acrobat/etc).
  97. radio_fields_by_id[field_id]["radio_options"].append({
  98. "value": on_values[0],
  99. "rect": rect,
  100. })
  101. # Some PDFs have form field definitions without corresponding annotations,
  102. # so we can't tell where they are. Ignore these fields for now.
  103. fields_with_location = []
  104. for field_info in field_info_by_id.values():
  105. if "page" in field_info:
  106. fields_with_location.append(field_info)
  107. else:
  108. print(f"Unable to determine location for field id: {field_info.get('field_id')}, ignoring")
  109. # Sort by page number, then Y position (flipped in PDF coordinate system), then X.
  110. def sort_key(f):
  111. if "radio_options" in f:
  112. rect = f["radio_options"][0]["rect"] or [0, 0, 0, 0]
  113. else:
  114. rect = f.get("rect") or [0, 0, 0, 0]
  115. adjusted_position = [-rect[1], rect[0]]
  116. return [f.get("page"), adjusted_position]
  117. sorted_fields = fields_with_location + list(radio_fields_by_id.values())
  118. sorted_fields.sort(key=sort_key)
  119. return sorted_fields
  120. def write_field_info(pdf_path: str, json_output_path: str):
  121. reader = PdfReader(pdf_path)
  122. field_info = get_field_info(reader)
  123. with open(json_output_path, "w") as f:
  124. json.dump(field_info, f, indent=2)
  125. print(f"Wrote {len(field_info)} fields to {json_output_path}")
  126. if __name__ == "__main__":
  127. if len(sys.argv) != 3:
  128. print("Usage: extract_form_field_info.py [input pdf] [output json]")
  129. sys.exit(1)
  130. write_field_info(sys.argv[1], sys.argv[2])