import fitz  # PyMuPDF
import argparse
import json
import re

# Regex to detect seat numbers: the whole (stripped) span text must be
# one or two digits.
seat_number_pattern = re.compile(r'^\d{1,2}$')


def extract_seats(doc):
    """Collect every one- or two-digit text span in *doc* with its bounding box.

    Args:
        doc: An iterable of pages (e.g. an open PyMuPDF document). Each page
            must provide ``get_text("dict")`` returning a mapping that has a
            ``"blocks"`` list.

    Returns:
        A list of dicts, in document order, each with the keys
        ``"seat_number"`` (the stripped span text), ``"coordinates"``
        (``x0``/``y0``/``x1``/``y1`` taken from the span's ``bbox``) and
        ``"page"`` (1-based page number).
    """
    seat_data = []

    for page_num, page in enumerate(doc, start=1):
        blocks = page.get_text("dict")["blocks"]

        for block in blocks:
            # Blocks without a "lines" entry contribute nothing.
            for line in block.get("lines", []):
                for span in line.get("spans", []):
                    text = span["text"].strip()
                    if not seat_number_pattern.match(text):
                        continue

                    x0, y0, x1, y1 = span["bbox"]
                    seat_data.append({
                        "seat_number": text,
                        "coordinates": {
                            "x0": x0,
                            "y0": y0,
                            "x1": x1,
                            "y1": y1,
                        },
                        "page": page_num,
                    })

    return seat_data


def main():
    """Parse the PDF path from the command line and print the seats as JSON."""
    parser = argparse.ArgumentParser(description="Extract seat numbers and their coordinates from a PDF.")
    parser.add_argument("pdf_path", help="Path to the PDF file")
    args = parser.parse_args()

    # The context manager closes the document even if extraction raises.
    with fitz.open(args.pdf_path) as doc:
        seat_data = extract_seats(doc)

    # Output result as JSON
    print(json.dumps(seat_data, indent=2))


if __name__ == "__main__":
    main()
