import fitz  # PyMuPDF

# Target stroke color as a normalized RGB triple (each channel in 0.0-1.0),
# equivalent to the hex color #ff1a1a. It has to be a tuple of numbers rather
# than the hex string itself: colors_are_similar() subtracts the channels of
# the two colors pairwise, which fails on string characters.
target_red_color_rgb = (0xFF / 255, 0x1A / 255, 0x1A / 255)  # Red

# Tolerance-based color comparison
def colors_are_similar(color1, color2, tolerance=0.05):
    """Return True when every paired channel differs by at most *tolerance*.

    Channels are paired positionally with ``zip``, so when the two colors have
    different lengths the extra channels of the longer one are not compared.
    Two empty colors compare as similar.
    """
    for channel1, channel2 in zip(color1, color2):
        # Negated "<=" (rather than ">") so a NaN difference counts as a mismatch.
        if not abs(channel1 - channel2) <= tolerance:
            return False
    return True

# Path to the PDF file
pdf_path = './FINAL.pdf'

# Open the PDF file
doc = fitz.open(pdf_path)

# Bounding boxes of every red-stroked drawing, pooled across all pages
red_stroke_bboxes = []

# Iterate through all pages in the PDF
for page_num in range(doc.page_count):
    page = doc.load_page(page_num)

    # Each entry returned by get_drawings() is a dict describing one vector path
    for item in page.get_drawings():
        # PyMuPDF reports the stroke color under the "color" key ("fill" holds
        # the fill color). There is no "stroke" key, so looking that up always
        # yields None and no drawing could ever match. The value is also None
        # for paths that are filled but not stroked.
        stroke_color = item.get('color')
        if not stroke_color:
            continue

        # Compare the RGB channels against the target red (with tolerance)
        if not colors_are_similar(tuple(stroke_color[:3]), target_red_color_rgb):
            continue

        # Keep the drawing's bounding rectangle; skip missing or falsy ones
        bbox = item.get('rect', None)
        if bbox:
            red_stroke_bboxes.append(bbox)

# Compute one bounding box that covers a whole group of boxes
def calculate_combined_bbox(bboxes):
    """Return ``(min x0, min y0, max x1, max y1)`` taken over all *bboxes*.

    Each box must expose ``x0``, ``y0``, ``x1`` and ``y1`` attributes.
    Returns ``None`` when *bboxes* is empty.
    """
    if not bboxes:
        return None
    # Transpose the boxes into one sequence per coordinate, then take the
    # extreme value of each.
    x0s, y0s, x1s, y1s = zip(
        *((box.x0, box.y0, box.x1, box.y1) for box in bboxes)
    )
    return (min(x0s), min(y0s), max(x1s), max(y1s))

# Merge all collected red-stroke rectangles into a single (x0, y0, x1, y1) box.
# red_stroke_bboxes is filled while looping over every page, so the result
# covers rectangles from all pages; it is None when nothing was collected.
combined_red_bbox = calculate_combined_bbox(red_stroke_bboxes)

# Extract the text that lies inside a bounding box on a page
def extract_text_within_bbox(page, bbox):
    """Return the text of *page* inside *bbox*, stripped of surrounding whitespace.

    *bbox* is unpacked into ``fitz.Rect``, so it must be an iterable of the
    rectangle's coordinates (for example an ``(x0, y0, x1, y1)`` tuple).
    """
    clip_rect = fitz.Rect(*bbox)
    return page.get_textbox(clip_rect).strip()

# Report the combined bounding box for red strokes and the text it encloses
if not combined_red_bbox:
    print("No red stroked boxes found.")
else:
    print(f"Combined bounding box for red strokes: {combined_red_bbox}")

    # Only the first page is read here; change the index if the red strokes
    # are on a different page
    page = doc.load_page(0)
    text = extract_text_within_bbox(page, combined_red_bbox)
    print(f"Text within the red stroked box: '{text}'")

# Release the document once all processing is done
doc.close()
