check_bounding_boxes.py 3.1 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970
  1. from dataclasses import dataclass
  2. import json
  3. import sys
  4. # Script to check that the `fields.json` file that Claude creates when analyzing PDFs
  5. # does not have overlapping bounding boxes. See forms.md.
  6. @dataclass
  7. class RectAndField:
  8. rect: list[float]
  9. rect_type: str
  10. field: dict
  11. # Returns a list of messages that are printed to stdout for Claude to read.
  12. def get_bounding_box_messages(fields_json_stream) -> list[str]:
  13. messages = []
  14. fields = json.load(fields_json_stream)
  15. messages.append(f"Read {len(fields['form_fields'])} fields")
  16. def rects_intersect(r1, r2):
  17. disjoint_horizontal = r1[0] >= r2[2] or r1[2] <= r2[0]
  18. disjoint_vertical = r1[1] >= r2[3] or r1[3] <= r2[1]
  19. return not (disjoint_horizontal or disjoint_vertical)
  20. rects_and_fields = []
  21. for f in fields["form_fields"]:
  22. rects_and_fields.append(RectAndField(f["label_bounding_box"], "label", f))
  23. rects_and_fields.append(RectAndField(f["entry_bounding_box"], "entry", f))
  24. has_error = False
  25. for i, ri in enumerate(rects_and_fields):
  26. # This is O(N^2); we can optimize if it becomes a problem.
  27. for j in range(i + 1, len(rects_and_fields)):
  28. rj = rects_and_fields[j]
  29. if ri.field["page_number"] == rj.field["page_number"] and rects_intersect(ri.rect, rj.rect):
  30. has_error = True
  31. if ri.field is rj.field:
  32. messages.append(f"FAILURE: intersection between label and entry bounding boxes for `{ri.field['description']}` ({ri.rect}, {rj.rect})")
  33. else:
  34. messages.append(f"FAILURE: intersection between {ri.rect_type} bounding box for `{ri.field['description']}` ({ri.rect}) and {rj.rect_type} bounding box for `{rj.field['description']}` ({rj.rect})")
  35. if len(messages) >= 20:
  36. messages.append("Aborting further checks; fix bounding boxes and try again")
  37. return messages
  38. if ri.rect_type == "entry":
  39. if "entry_text" in ri.field:
  40. font_size = ri.field["entry_text"].get("font_size", 14)
  41. entry_height = ri.rect[3] - ri.rect[1]
  42. if entry_height < font_size:
  43. has_error = True
  44. messages.append(f"FAILURE: entry bounding box height ({entry_height}) for `{ri.field['description']}` is too short for the text content (font size: {font_size}). Increase the box height or decrease the font size.")
  45. if len(messages) >= 20:
  46. messages.append("Aborting further checks; fix bounding boxes and try again")
  47. return messages
  48. if not has_error:
  49. messages.append("SUCCESS: All bounding boxes are valid")
  50. return messages
  51. if __name__ == "__main__":
  52. if len(sys.argv) != 2:
  53. print("Usage: check_bounding_boxes.py [fields.json]")
  54. sys.exit(1)
  55. # Input file should be in the `fields.json` format described in forms.md.
  56. with open(sys.argv[1]) as f:
  57. messages = get_bounding_box_messages(f)
  58. for msg in messages:
  59. print(msg)