fill_fillable_fields.py 4.7 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114
  1. import json
  2. import sys
  3. from pypdf import PdfReader, PdfWriter
  4. from extract_form_field_info import get_field_info
  5. # Fills fillable form fields in a PDF. See forms.md.
  6. def fill_pdf_fields(input_pdf_path: str, fields_json_path: str, output_pdf_path: str):
  7. with open(fields_json_path) as f:
  8. fields = json.load(f)
  9. # Group by page number.
  10. fields_by_page = {}
  11. for field in fields:
  12. if "value" in field:
  13. field_id = field["field_id"]
  14. page = field["page"]
  15. if page not in fields_by_page:
  16. fields_by_page[page] = {}
  17. fields_by_page[page][field_id] = field["value"]
  18. reader = PdfReader(input_pdf_path)
  19. has_error = False
  20. field_info = get_field_info(reader)
  21. fields_by_ids = {f["field_id"]: f for f in field_info}
  22. for field in fields:
  23. existing_field = fields_by_ids.get(field["field_id"])
  24. if not existing_field:
  25. has_error = True
  26. print(f"ERROR: `{field['field_id']}` is not a valid field ID")
  27. elif field["page"] != existing_field["page"]:
  28. has_error = True
  29. print(f"ERROR: Incorrect page number for `{field['field_id']}` (got {field['page']}, expected {existing_field['page']})")
  30. else:
  31. if "value" in field:
  32. err = validation_error_for_field_value(existing_field, field["value"])
  33. if err:
  34. print(err)
  35. has_error = True
  36. if has_error:
  37. sys.exit(1)
  38. writer = PdfWriter(clone_from=reader)
  39. for page, field_values in fields_by_page.items():
  40. writer.update_page_form_field_values(writer.pages[page - 1], field_values, auto_regenerate=False)
  41. # This seems to be necessary for many PDF viewers to format the form values correctly.
  42. # It may cause the viewer to show a "save changes" dialog even if the user doesn't make any changes.
  43. writer.set_need_appearances_writer(True)
  44. with open(output_pdf_path, "wb") as f:
  45. writer.write(f)
  46. def validation_error_for_field_value(field_info, field_value):
  47. field_type = field_info["type"]
  48. field_id = field_info["field_id"]
  49. if field_type == "checkbox":
  50. checked_val = field_info["checked_value"]
  51. unchecked_val = field_info["unchecked_value"]
  52. if field_value != checked_val and field_value != unchecked_val:
  53. return f'ERROR: Invalid value "{field_value}" for checkbox field "{field_id}". The checked value is "{checked_val}" and the unchecked value is "{unchecked_val}"'
  54. elif field_type == "radio_group":
  55. option_values = [opt["value"] for opt in field_info["radio_options"]]
  56. if field_value not in option_values:
  57. return f'ERROR: Invalid value "{field_value}" for radio group field "{field_id}". Valid values are: {option_values}'
  58. elif field_type == "choice":
  59. choice_values = [opt["value"] for opt in field_info["choice_options"]]
  60. if field_value not in choice_values:
  61. return f'ERROR: Invalid value "{field_value}" for choice field "{field_id}". Valid values are: {choice_values}'
  62. return None
  63. # pypdf (at least version 5.7.0) has a bug when setting the value for a selection list field.
  64. # In _writer.py around line 966:
  65. #
  66. # if field.get(FA.FT, "/Tx") == "/Ch" and field_flags & FA.FfBits.Combo == 0:
  67. # txt = "\n".join(annotation.get_inherited(FA.Opt, []))
  68. #
  69. # The problem is that for selection lists, `get_inherited` returns a list of two-element lists like
  70. # [["value1", "Text 1"], ["value2", "Text 2"], ...]
  71. # This causes `join` to throw a TypeError because it expects an iterable of strings.
  72. # The horrible workaround is to patch `get_inherited` to return a list of the value strings.
  73. # We call the original method and adjust the return value only if the argument to `get_inherited`
  74. # is `FA.Opt` and if the return value is a list of two-element lists.
  75. def monkeypatch_pydpf_method():
  76. from pypdf.generic import DictionaryObject
  77. from pypdf.constants import FieldDictionaryAttributes
  78. original_get_inherited = DictionaryObject.get_inherited
  79. def patched_get_inherited(self, key: str, default = None):
  80. result = original_get_inherited(self, key, default)
  81. if key == FieldDictionaryAttributes.Opt:
  82. if isinstance(result, list) and all(isinstance(v, list) and len(v) == 2 for v in result):
  83. result = [r[0] for r in result]
  84. return result
  85. DictionaryObject.get_inherited = patched_get_inherited
  86. if __name__ == "__main__":
  87. if len(sys.argv) != 4:
  88. print("Usage: fill_fillable_fields.py [input pdf] [field_values.json] [output pdf]")
  89. sys.exit(1)
  90. monkeypatch_pydpf_method()
  91. input_pdf = sys.argv[1]
  92. fields_json = sys.argv[2]
  93. output_pdf = sys.argv[3]
  94. fill_pdf_fields(input_pdf, fields_json, output_pdf)