import fitz
import re
import sys
import json

def extract_specific_numbers_with_color(pdf_path, target_numbers, target_color):
    """
    Extract specific numbers from a PDF along with their coordinates, filtering by a specific color.

    Every text span of every page is scanned for standalone runs of digits.
    A run is reported when it is one of ``target_numbers`` and the span it
    belongs to has exactly the color ``target_color``. A number that occurs
    several times is reported once per occurrence.

    Diagnostics are written to stderr so they never mix with data that a
    caller prints to stdout. If the document cannot be opened an empty list
    is returned; a page that cannot be read is skipped.

    :param pdf_path: Path to the PDF file.
    :param target_numbers: Collection of specific numbers to extract (as strings).
        A single bare string is treated as one number.
    :param target_color: The span color to filter by, compared with ``==``.
        PyMuPDF reports span colors as sRGB integers (0xRRGGBB).
    :return: List of dictionaries with the keys "text" (the matched number),
        "color" (the span color) and "coordinates" (the bounding box of the
        whole span containing the number, not of the number alone).
    """
    # A bare string would otherwise be consumed character by character.
    if isinstance(target_numbers, str):
        target_numbers = [target_numbers]
    # Built once: constant-time membership tests inside the innermost loop.
    wanted = frozenset(target_numbers)
    number_pattern = re.compile(r'\b\d+\b')

    try:
        doc = fitz.open(pdf_path)
    except Exception as e:
        print(f"Error opening PDF: {e}", file=sys.stderr)
        return []

    filtered_data = []

    try:
        for page_num in range(len(doc)):
            try:
                page = doc.load_page(page_num)
                blocks = page.get_text("dict")["blocks"]
            except Exception as e:
                # Best effort: one unreadable page must not discard the rest.
                print(f"Error reading page {page_num}: {e}", file=sys.stderr)
                continue

            for block in blocks:
                if block.get("type") != 0:  # Only type 0 blocks carry text
                    continue

                for line in block.get("lines", []):
                    for span in line.get("spans", []):
                        color = span.get("color", 0)
                        # The color is a property of the whole span, so reject
                        # the span before doing any regex work on its text.
                        if color != target_color:
                            continue

                        bbox = span.get("bbox", [])
                        for number in number_pattern.findall(span.get("text", "")):
                            if number in wanted:
                                filtered_data.append({
                                    "text": number,
                                    "color": color,
                                    "coordinates": bbox
                                })
    finally:
        # Release the document even if something unexpected goes wrong above.
        doc.close()

    return filtered_data

def main(argv=None):
    """
    Command-line entry point: print the matching numbers of a PDF as JSON.

    :param argv: Argument list without the program name; defaults to
        ``sys.argv[1:]``. Expected layout:
        ``<path_to_pdf> <target_numbers_comma_separated> [target_color]``.
        ``target_color`` is an integer in decimal or prefixed notation
        (e.g. ``0xFF0000``) and defaults to 0.
    :return: Process exit status: 0 on success, 1 on a usage error.
    """
    args = sys.argv[1:] if argv is None else list(argv)

    # Usage problems go to stderr; stdout is reserved for the JSON result.
    if len(args) < 2:
        print(
            "Usage: python script.py <path_to_pdf> <target_numbers_comma_separated> [target_color]",
            file=sys.stderr,
        )
        return 1

    pdf_path = args[0]
    # Strip surrounding spaces and drop empty tokens left by stray commas.
    target_numbers = [token.strip() for token in args[1].split(",")]
    target_numbers = [token for token in target_numbers if token]

    target_color = 0  # Default: extract only if the color is exactly 0
    if len(args) > 2:
        try:
            # Base 0 accepts decimal as well as 0x / 0o / 0b prefixed literals.
            target_color = int(args[2], 0)
        except ValueError:
            print(f"Invalid target color: {args[2]!r} (expected an integer)", file=sys.stderr)
            return 1

    # Extract data for specific numbers with the specified color
    specific_data = extract_specific_numbers_with_color(pdf_path, target_numbers, target_color)

    # Output the filtered data in JSON format
    print(json.dumps(specific_data, indent=4))
    return 0


if __name__ == "__main__":
    sys.exit(main())
