import sys
import cv2
import numpy as np
import fitz  # PyMuPDF
import pytesseract
import json
import os

# Point pytesseract at the Tesseract binary if it is not on the system PATH
# (Windows default install location shown; remove or adjust on other platforms)
pytesseract.pytesseract.tesseract_cmd = r'C:\Program Files\Tesseract-OCR\tesseract.exe'

def convert_pdf_to_images(pdf_path):
    """Convert each page of the PDF to an image."""
    doc = fitz.open(pdf_path)
    images = []

    for page_num in range(doc.page_count):
        page = doc[page_num]
        pix = page.get_pixmap()
        img = np.frombuffer(pix.samples, dtype=np.uint8).reshape(pix.height, pix.width, pix.n)
        # PyMuPDF renders RGB; convert to BGR (which also yields a writable copy)
        # so that cv2.cvtColor, the drawing calls, and cv2.imwrite behave as expected
        img = cv2.cvtColor(img, cv2.COLOR_RGB2BGR)
        images.append(img)

    doc.close()
    return images
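
# Note: get_pixmap() renders at 72 DPI by default, which can be too coarse for
# reliably OCR-ing small digits. The variant below is an illustrative sketch (not
# called by process_pdf); the zoom factor of 3 is an assumption to tune per drawing.
def convert_pdf_to_images_hires(pdf_path, zoom=3):
    """Render pages at `zoom` times the default 72 DPI resolution."""
    doc = fitz.open(pdf_path)
    matrix = fitz.Matrix(zoom, zoom)  # scale both axes by `zoom`
    images = []
    for page in doc:
        pix = page.get_pixmap(matrix=matrix)
        img = np.frombuffer(pix.samples, dtype=np.uint8).reshape(pix.height, pix.width, pix.n)
        images.append(cv2.cvtColor(img, cv2.COLOR_RGB2BGR))
    doc.close()
    return images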

def preprocess_image(image):
    """Preprocess the image for better OCR performance."""
    gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
    # Light Gaussian blur to suppress noise before thresholding
    blurred = cv2.GaussianBlur(gray, (5, 5), 0)
    # Binarize with Otsu's method; the fixed threshold value is ignored when
    # THRESH_OTSU is set, so 0 is passed for clarity. THRESH_BINARY_INV produces
    # white text on black; plain THRESH_BINARY is worth trying if recognition is poor.
    _, thresh = cv2.threshold(blurred, 0, 255, cv2.THRESH_BINARY_INV + cv2.THRESH_OTSU)
    return thresh
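
# Scanned drawings with uneven lighting sometimes binarize poorly under a single
# global Otsu threshold. The variant below is an illustrative alternative (not
# called above); block_size and c are assumed starting values to tune.
def preprocess_image_adaptive(image, block_size=31, c=10):
    """Adaptive-threshold variant of preprocess_image for unevenly lit scans."""
    gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
    return cv2.adaptiveThreshold(gray, 255, cv2.ADAPTIVE_THRESH_GAUSSIAN_C,
                                 cv2.THRESH_BINARY_INV, block_size, c)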

def extract_numbers_and_coordinates(image, debug=False):
    """Extract numbers and their coordinates from the image."""
    results = []
    
    # Preprocess the image for better OCR results
    processed_image = preprocess_image(image)

    # Run Tesseract OCR; --psm 6 assumes a single uniform block of text
    # (--psm 11, "sparse text", can work better for scattered annotations)
    data = pytesseract.image_to_data(processed_image, config='--psm 6', output_type=pytesseract.Output.DICT)

    # Debug: Save processed image
    if debug:
        cv2.imwrite("debug_processed_image.png", processed_image)

    # Loop over each element found by Tesseract
    for i in range(len(data['text'])):
        text = data['text'][i].strip()
        if text in {'1', '2'}:  # Filter to extract specific numbers '1' and '2'
            x, y, w, h = data['left'][i], data['top'][i], data['width'][i], data['height'][i]
            results.append({
                'text': text,
                'coordinates': [(x, y), (x + w, y), (x + w, y + h), (x, y + h)]
            })
    
    # Debug: Print OCR results
    if debug:
        for i in range(len(data['text'])):
            print(f"Detected text: {data['text'][i]} at coordinates: ({data['left'][i]}, {data['top'][i]})")

    return results
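
# The coordinates above are pixel positions in the rendered page image. If PDF-point
# coordinates are needed instead, divide by the render zoom factor (1.0 for the
# default 72 DPI get_pixmap() call in convert_pdf_to_images). A minimal helper sketch:
def pixels_to_pdf_points(coordinates, zoom=1.0):
    """Convert pixel-space corner points to PDF points (1 pt = 1/72 inch)."""
    return [(x / zoom, y / zoom) for x, y in coordinates]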

def process_pdf(pdf_path, output_dir, debug=False):
    """Process the PDF to extract specific numbers and their coordinates."""
    # Convert PDF pages to images
    images = convert_pdf_to_images(pdf_path)
    extracted_data = {}

    for page_num, image in enumerate(images):
        # Extract numbers and coordinates from each page image
        results = extract_numbers_and_coordinates(image, debug)
        extracted_data[f'page_{page_num + 1}'] = results

        # Annotate the detections on the page image for visual review
        for result in results:
            (x, y), _, (x2, y2), _ = result['coordinates']
            cv2.rectangle(image, (x, y), (x2, y2), (0, 255, 0), 2)
            cv2.putText(image, result['text'], (x, y - 10), cv2.FONT_HERSHEY_SIMPLEX, 0.5, (0, 255, 0), 2)
        
        # Save the annotated image for review
        output_image_path = os.path.join(output_dir, f"page_{page_num + 1}_annotated.png")
        cv2.imwrite(output_image_path, image)
    
    return extracted_data

if __name__ == "__main__":
    if len(sys.argv) != 3:
        print("Usage: python script.py <pdf_file> <output_dir>")
        sys.exit(1)

    pdf_file = sys.argv[1]
    output_dir = sys.argv[2]

    # Create the output directory if it doesn't exist
    os.makedirs(output_dir, exist_ok=True)

    # Process the PDF and extract the data
    extracted_data = process_pdf(pdf_file, output_dir, debug=True)
    json_output = json.dumps({'extracted_data': extracted_data}, indent=4)
    print(json_output)
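
    # Optionally persist the results next to the annotated images; the file name
    # "extracted_data.json" is an arbitrary choice for this sketch.
    with open(os.path.join(output_dir, "extracted_data.json"), "w") as f:
        f.write(json_output)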
