import fitz  # PyMuPDF
from collections import defaultdict

def _normalized_bbox(xa, ya, xb, yb):
    """Return a bbox dict with x0 <= x1 and y0 <= y1 spanning two corner points."""
    return {
        'x0': min(xa, xb),
        'y0': min(ya, yb),
        'x1': max(xa, xb),
        'y1': max(ya, yb),
    }


def extract_blocks_and_shapes_from_pdf(pdf_path):
    """Collect text blocks, straight lines and rectangles from every page of a PDF.

    Args:
        pdf_path: Path of the PDF file; passed unchanged to ``fitz.open``.

    Returns:
        A list of dicts, in page order. Every dict has:

        * ``'type'``: ``'text'``, ``'line'`` or ``'rectangle'``
        * ``'bbox'``: ``{'x0', 'y0', 'x1', 'y1'}`` in page coordinates
        * ``'page'``: 0-based index of the page the item was found on

        Text entries additionally carry ``'content'`` (the stripped block text).
    """
    extracted_data = []

    # The context manager closes the document even if extraction raises.
    with fitz.open(pdf_path) as document:
        for page_number, page in enumerate(document):
            # Each block is a tuple
            # (x0, y0, x1, y1, text, block_no, block_type).
            for block in page.get_text("blocks"):
                # block_type 1 marks an image block whose "text" is only
                # image metadata, so it must not be reported as text.
                if len(block) > 6 and block[6] != 0:
                    continue
                text = block[4].strip()
                if not text:
                    continue
                extracted_data.append({
                    'type': 'text',
                    'content': text,
                    'bbox': {
                        'x0': block[0],
                        'y0': block[1],
                        'x1': block[2],
                        'y1': block[3],
                    },
                    'page': page_number,
                })

            # Each drawing path holds a list of draw commands in "items".
            # Only straight lines ("l") and rectangles ("re") are collected;
            # curves ("c") and quads ("qu") are ignored.
            for path in page.get_drawings():
                for item in path["items"]:
                    kind = item[0]
                    if kind == "l":
                        # ("l", p1, p2): the two end points are Point objects,
                        # not a flat sequence of four coordinates.
                        start, end = item[1], item[2]
                        extracted_data.append({
                            'type': 'line',
                            # Normalised so x0 <= x1 and y0 <= y1 regardless
                            # of the direction the line was drawn in.
                            'bbox': _normalized_bbox(
                                start.x, start.y, end.x, end.y
                            ),
                            'page': page_number,
                        })
                    elif kind == "re":
                        # ("re", rect, ...): rect already stores both corners
                        # (x0, y0, x1, y1), not (x, y, width, height).
                        rect = item[1]
                        extracted_data.append({
                            'type': 'rectangle',
                            'bbox': _normalized_bbox(
                                rect.x0, rect.y0, rect.x1, rect.y1
                            ),
                            'page': page_number,
                        })

    return extracted_data

def overlap(bbox1, bbox2, threshold=10):
    """Tell whether two bounding boxes touch or lie within ``threshold`` of each other.

    Both boxes are dicts with the keys ``'x0'``, ``'y0'``, ``'x1'`` and ``'y1'``.
    Returns ``False`` as soon as a gap larger than ``threshold`` is found along
    either axis, ``True`` otherwise.
    """
    # bbox1 lies entirely to the left of bbox2, beyond the tolerance.
    if bbox1['x1'] + threshold < bbox2['x0']:
        return False
    # bbox1 lies entirely to the right of bbox2, beyond the tolerance.
    if bbox1['x0'] > bbox2['x1'] + threshold:
        return False
    # bbox1 lies entirely above bbox2 (smaller y), beyond the tolerance.
    if bbox1['y1'] + threshold < bbox2['y0']:
        return False
    # bbox1 lies entirely below bbox2 (larger y), beyond the tolerance.
    if bbox1['y0'] > bbox2['y1'] + threshold:
        return False
    return True

def group_blocks(extracted_data, threshold=10):
    """Cluster items whose bounding boxes overlap or lie within ``threshold``.

    Two items end up in the same group when they are connected through a chain
    of pairwise ``overlap`` matches. An item that bridges two previously
    separate clusters therefore merges them, so the result does not depend on
    the order of ``extracted_data``.

    Items that carry different ``'page'`` values are never linked directly;
    items without a ``'page'`` key are all treated as being on the same page.

    Args:
        extracted_data: Iterable of dicts, each with a ``'bbox'`` dict holding
            ``'x0'``, ``'y0'``, ``'x1'`` and ``'y1'``.
        threshold: Maximum gap, passed to ``overlap``, at which two boxes
            still count as neighbours.

    Returns:
        A ``defaultdict(list)`` mapping consecutive integer group ids, starting
        at 0 and numbered by each group's first item, to the group's items in
        their original input order.
    """
    items = list(extracted_data)

    # Union-find forest over item indices; parent[i] == i marks a root.
    parent = list(range(len(items)))

    def find(index):
        # Walk up to the root, halving the path on the way.
        while parent[index] != index:
            parent[index] = parent[parent[index]]
            index = parent[index]
        return index

    for i, item in enumerate(items):
        for j in range(i):
            other = items[j]
            if item.get('page') != other.get('page'):
                continue
            if overlap(item['bbox'], other['bbox'], threshold):
                root_i, root_j = find(i), find(j)
                if root_i != root_j:
                    # Keep the smaller index as root so that every root is the
                    # first item of its group.
                    parent[max(root_i, root_j)] = min(root_i, root_j)

    groups = defaultdict(list)
    group_id_of_root = {}
    for i, item in enumerate(items):
        root = find(i)
        # Roots are met in increasing index order, so ids follow the order in
        # which groups first appear in the input.
        group_id = group_id_of_root.setdefault(root, len(group_id_of_root))
        groups[group_id].append(item)

    return groups

# Extract every text block, line and rectangle from the input PDF.
pdf_path = "./FINAL.pdf"
detected_blocks = extract_blocks_and_shapes_from_pdf(pdf_path)

# Cluster neighbouring items using the default distance threshold.
grouped_blocks = group_blocks(detected_blocks)

# Print one section per group; text items also show their content.
for group_id, group_items in grouped_blocks.items():
    print(f"Group {group_id}:")
    for item in group_items:
        print(
            f"  Text: {item['content']}, Coordinates: {item['bbox']}"
            if item['type'] == 'text'
            else f"  Detected {item['type']}, Coordinates: {item['bbox']}"
        )
    print("\n")
