| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152 |
- import json
- import sys
- from pypdf import PdfReader
- # Extracts data for the fillable form fields in a PDF and outputs JSON that
- # Claude uses to fill the fields. See forms.md.
- # This matches the format used by PdfReader `get_fields` and `update_page_form_field_values` methods.
- def get_full_annotation_field_id(annotation):
- components = []
- while annotation:
- field_name = annotation.get('/T')
- if field_name:
- components.append(field_name)
- annotation = annotation.get('/Parent')
- return ".".join(reversed(components)) if components else None
- def make_field_dict(field, field_id):
- field_dict = {"field_id": field_id}
- ft = field.get('/FT')
- if ft == "/Tx":
- field_dict["type"] = "text"
- elif ft == "/Btn":
- field_dict["type"] = "checkbox" # radio groups handled separately
- states = field.get("/_States_", [])
- if len(states) == 2:
- # "/Off" seems to always be the unchecked value, as suggested by
- # https://opensource.adobe.com/dc-acrobat-sdk-docs/standards/pdfstandards/pdf/PDF32000_2008.pdf#page=448
- # It can be either first or second in the "/_States_" list.
- if "/Off" in states:
- field_dict["checked_value"] = states[0] if states[0] != "/Off" else states[1]
- field_dict["unchecked_value"] = "/Off"
- else:
- print(f"Unexpected state values for checkbox `${field_id}`. Its checked and unchecked values may not be correct; if you're trying to check it, visually verify the results.")
- field_dict["checked_value"] = states[0]
- field_dict["unchecked_value"] = states[1]
- elif ft == "/Ch":
- field_dict["type"] = "choice"
- states = field.get("/_States_", [])
- field_dict["choice_options"] = [{
- "value": state[0],
- "text": state[1],
- } for state in states]
- else:
- field_dict["type"] = f"unknown ({ft})"
- return field_dict
- # Returns a list of fillable PDF fields:
- # [
- # {
- # "field_id": "name",
- # "page": 1,
- # "type": ("text", "checkbox", "radio_group", or "choice")
- # // Per-type additional fields described in forms.md
- # },
- # ]
- def get_field_info(reader: PdfReader):
- fields = reader.get_fields()
- field_info_by_id = {}
- possible_radio_names = set()
- for field_id, field in fields.items():
- # Skip if this is a container field with children, except that it might be
- # a parent group for radio button options.
- if field.get("/Kids"):
- if field.get("/FT") == "/Btn":
- possible_radio_names.add(field_id)
- continue
- field_info_by_id[field_id] = make_field_dict(field, field_id)
- # Bounding rects are stored in annotations in page objects.
- # Radio button options have a separate annotation for each choice;
- # all choices have the same field name.
- # See https://westhealth.github.io/exploring-fillable-forms-with-pdfrw.html
- radio_fields_by_id = {}
- for page_index, page in enumerate(reader.pages):
- annotations = page.get('/Annots', [])
- for ann in annotations:
- field_id = get_full_annotation_field_id(ann)
- if field_id in field_info_by_id:
- field_info_by_id[field_id]["page"] = page_index + 1
- field_info_by_id[field_id]["rect"] = ann.get('/Rect')
- elif field_id in possible_radio_names:
- try:
- # ann['/AP']['/N'] should have two items. One of them is '/Off',
- # the other is the active value.
- on_values = [v for v in ann["/AP"]["/N"] if v != "/Off"]
- except KeyError:
- continue
- if len(on_values) == 1:
- rect = ann.get("/Rect")
- if field_id not in radio_fields_by_id:
- radio_fields_by_id[field_id] = {
- "field_id": field_id,
- "type": "radio_group",
- "page": page_index + 1,
- "radio_options": [],
- }
- # Note: at least on macOS 15.7, Preview.app doesn't show selected
- # radio buttons correctly. (It does if you remove the leading slash
- # from the value, but that causes them not to appear correctly in
- # Chrome/Firefox/Acrobat/etc).
- radio_fields_by_id[field_id]["radio_options"].append({
- "value": on_values[0],
- "rect": rect,
- })
- # Some PDFs have form field definitions without corresponding annotations,
- # so we can't tell where they are. Ignore these fields for now.
- fields_with_location = []
- for field_info in field_info_by_id.values():
- if "page" in field_info:
- fields_with_location.append(field_info)
- else:
- print(f"Unable to determine location for field id: {field_info.get('field_id')}, ignoring")
- # Sort by page number, then Y position (flipped in PDF coordinate system), then X.
- def sort_key(f):
- if "radio_options" in f:
- rect = f["radio_options"][0]["rect"] or [0, 0, 0, 0]
- else:
- rect = f.get("rect") or [0, 0, 0, 0]
- adjusted_position = [-rect[1], rect[0]]
- return [f.get("page"), adjusted_position]
-
- sorted_fields = fields_with_location + list(radio_fields_by_id.values())
- sorted_fields.sort(key=sort_key)
- return sorted_fields
- def write_field_info(pdf_path: str, json_output_path: str):
- reader = PdfReader(pdf_path)
- field_info = get_field_info(reader)
- with open(json_output_path, "w") as f:
- json.dump(field_info, f, indent=2)
- print(f"Wrote {len(field_info)} fields to {json_output_path}")
- if __name__ == "__main__":
- if len(sys.argv) != 3:
- print("Usage: extract_form_field_info.py [input pdf] [output json]")
- sys.exit(1)
- write_field_info(sys.argv[1], sys.argv[2])
|