import fitz  # PyMuPDF

def extract_large_numbers_from_pdf(pdf_path, min_font_size=15):  # Adjust the min_font_size for your case
    """Collect digit-only text spans at or above a given font size from a PDF.

    Every page is read via ``page.get_text("dict")`` and each span is kept
    when its ``size`` is at least ``min_font_size`` and its ``text`` passes
    ``str.isdigit()`` (so empty strings, signs, separators and surrounding
    whitespace disqualify a span).

    Args:
        pdf_path: Path of the PDF file, passed straight to ``fitz.open``.
        min_font_size: Smallest span font size to report (inclusive).

    Returns:
        A list of dicts, in document order, each with the keys:
        ``"text"`` (the span text), ``"font_size"`` (the span size),
        ``"coordinates"`` (the span bounding box as ``(x1, y1, x2, y2)``) and
        ``"color"`` (an ``(r, g, b)`` tuple of 0-255 ints decoded from the
        span's integer color; a missing color is treated as 0).
    """
    numbers_with_details = []

    # Open the document as a context manager so it is closed on normal return
    # and when extraction raises; previously it was never closed.
    with fitz.open(pdf_path) as doc:
        for page in doc:
            for block in page.get_text("dict")["blocks"]:
                # Blocks without a "lines" entry have no text spans to inspect.
                for line in block.get("lines", ()):
                    for span in line["spans"]:
                        font_size = span["size"]
                        text = span["text"]

                        # Keep only large, purely numeric spans.
                        if not (font_size >= min_font_size and text.isdigit()):
                            continue

                        # Split the packed 0xRRGGBB integer into channels.
                        color_int = span.get("color", 0)
                        rgb_color = (
                            (color_int >> 16) & 255,
                            (color_int >> 8) & 255,
                            color_int & 255,
                        )

                        x1, y1, x2, y2 = span["bbox"]
                        numbers_with_details.append(
                            {
                                "text": text,
                                "font_size": font_size,
                                "coordinates": (x1, y1, x2, y2),
                                "color": rgb_color,  # RGB color as a tuple
                            }
                        )

    return numbers_with_details

# Usage example: scan a sample PDF and print one record per large number found.
pdf_path = './vaj.pdf'
large_numbers = extract_large_numbers_from_pdf(pdf_path)

for number in large_numbers:
    # One print call per record; sep="\n" puts each field on its own line,
    # followed by the separator row.
    print(
        f"Number: {number['text']}",
        f"Coordinates: {number['coordinates']}",
        f"Font Size: {number['font_size']}",
        f"Color (RGB): {number['color']}",
        "------",
        sep="\n",
    )

