// Visual Element Locator Tool - Uses vision models to find element coordinates
// This tool bridges the gap when DOM-based tools fail

import { BrowserTool } from "../../../ai_tools_interface.js";
import { z } from "zod";
import { TakeScreenshotTool } from "./MiscTools.js";
import { getValidTabId } from "./utils.js";

/**
 * VisualElementLocatorTool - Finds element coordinates using vision models
 *
 * This tool:
 * 1. Takes a screenshot of the current page
 * 2. Sends it to a vision model (GPT-4o, Claude, Gemini)
 * 3. Asks the model to locate an element based on description
 * 4. Returns coordinates {x, y} that can be used with ClickTool/TypeTool
 *
 * Use this when:
 * - DOM-based selectors fail to find elements
 * - Elements have dynamic or unpredictable attributes
 * - You need to find elements by visual appearance
 * - Dealing with canvas, SVG, or shadow DOM elements
 */
export class VisualElementLocatorTool extends BrowserTool {
  /**
   * @param {Function} [llmInvoker] - Async function that receives
   *   `{ messages, temperature, max_tokens }` and resolves with the model's
   *   reply. May be omitted and supplied later via `setLLMInvoker`.
   */
  constructor(llmInvoker) {
    super(
      "locate_element_visually",
      "Find an element's coordinates using AI vision. Returns {x, y} coordinates for use with click/type tools. Use when selectors fail or element is hard to locate.",
      z.object({
        tabId: z.number().optional().describe("Tab ID (optional - will use active tab)"),
        description: z.string().describe("Clear description of the element to find (e.g., 'blue Submit button', 'email input field', 'Google logo')"),
        reasoning: z.string().describe("Why you're using visual location instead of selectors")
      })
    );

    // LLM invoker to call vision models
    this.llmInvoker = llmInvoker;

    // Screenshot tool instance
    this.screenshotTool = new TakeScreenshotTool();
  }

  /**
   * Set the LLM invoker (for cases where it's not available at construction)
   *
   * @param {Function} llmInvoker - Replacement invoker; see the constructor.
   */
  setLLMInvoker(llmInvoker) {
    this.llmInvoker = llmInvoker;
  }

  /**
   * Locate an element by description and report its coordinates.
   *
   * Never throws: every failure is logged and reported as a JSON string with
   * `found: false`, so the calling agent can fall back to other tools.
   *
   * @param {object} args
   * @param {number} [args.tabId] - Target tab; resolved through `getValidTabId`.
   * @param {string} args.description - What the element looks like.
   * @param {string} args.reasoning - Why visual location is being used (logged only).
   * @returns {Promise<string>} JSON string: `{found: true, coordinates: {x, y}, description, message}`
   *   on success, or `{found: false, error, message}` (plus `disabled: true`
   *   when no LLM invoker is configured) on failure.
   */
  async call({ tabId, description, reasoning }) {
    // Gracefully handle missing LLM invoker (no API key configured)
    if (!this.llmInvoker) {
      console.warn('[VisualElementLocatorTool] Tool disabled: No API key configured');
      return JSON.stringify({
        found: false,
        disabled: true,
        error: 'VisualElementLocatorTool is disabled - API key not configured',
        message: 'Cannot locate element visually: Vision API key is not configured. Please configure your API key in settings or use DOM-based tools instead.'
      });
    }

    try {
      const validTabId = await getValidTabId(tabId);

      console.log(`🔍 [VisualLocator] Finding: "${description}"`);
      console.log(`📝 [VisualLocator] Reasoning: ${reasoning}`);

      // Step 1: Take a high-detail colour screenshot; accuracy of the
      // returned coordinates matters more here than token cost.
      // NOTE(review): the screenshot is requested with maxWidth 1280 while the
      // prompt below reports the viewport size. If the screenshot tool
      // downscales wide viewports, the model may answer in image space rather
      // than viewport space - confirm against TakeScreenshotTool.
      const screenshot = await this.screenshotTool.call({
        tabId: validTabId,
        detail: 'high', // Use high detail for accurate coordinate detection
        maxWidth: 1280,  // Good balance of accuracy and cost
        grayscale: false, // Keep color for better element identification
        quality: 80
      });

      if (!screenshot || screenshot.includes('Screenshot capture failed')) {
        throw new Error(`Failed to capture screenshot: ${screenshot}`);
      }

      console.log(`📸 [VisualLocator] Screenshot captured, analyzing with vision model...`);

      // Step 2: Get viewport dimensions
      const viewport = await chrome.tabs.sendMessage(validTabId, {
        type: 'GET_VIEWPORT_SIZE'
      });

      if (!viewport || viewport.error) {
        throw new Error(`Failed to get viewport size: ${viewport?.error || 'Unknown error'}`);
      }

      const { width: viewportWidth, height: viewportHeight } = viewport;

      // Without usable bounds the clamp in _parseCoordinates would yield NaN,
      // so fail here - before paying for a vision model call.
      const hasUsableViewport =
        Number.isFinite(viewportWidth) && viewportWidth > 0 &&
        Number.isFinite(viewportHeight) && viewportHeight > 0;
      if (!hasUsableViewport) {
        throw new Error(`Invalid viewport size: ${viewportWidth}x${viewportHeight}`);
      }

      // Step 3: Call vision model to locate element
      const prompt = this._buildVisionPrompt(description, viewportWidth, viewportHeight);

      const visionResponse = await this._callVisionModel(screenshot, prompt);

      // Step 4: Parse response to extract coordinates
      const coordinates = this._parseCoordinates(visionResponse, viewportWidth, viewportHeight);

      if (!coordinates) {
        throw new Error(`Vision model could not locate element: ${description}. Response: ${visionResponse}`);
      }

      console.log(`[VisualLocator] Found "${description}" at (${coordinates.x}, ${coordinates.y})`);

      // Return coordinates in a format that ClickTool/TypeTool can use
      return JSON.stringify({
        found: true,
        coordinates: coordinates,
        description: description,
        message: `Located "${description}" at coordinates (${coordinates.x}, ${coordinates.y}). Use click tool with these coordinates.`
      });

    } catch (error) {
      console.error('[VisualElementLocatorTool] Error:', error);
      return JSON.stringify({
        found: false,
        error: error.message,
        message: `Failed to locate "${description}" visually: ${error.message}`
      });
    }
  }

  /**
   * Build the prompt for the vision model
   *
   * @param {string} description - Element description supplied by the caller.
   * @param {number} viewportWidth - Width reported to the model as the screenshot width.
   * @param {number} viewportHeight - Height reported to the model as the screenshot height.
   * @returns {string} Prompt asking for `{"x", "y"}` or `{"found": false, "reason"}` as JSON only.
   */
  _buildVisionPrompt(description, viewportWidth, viewportHeight) {
    return `You are a visual element locator. Your task is to find an element in a screenshot and return its CENTER coordinates.

**Element to find:** ${description}

**Screenshot dimensions:** ${viewportWidth}x${viewportHeight} pixels

**Instructions:**
1. Locate the element described above in the screenshot
2. Find the CENTER point of the element (not corner)
3. Return ONLY the coordinates in this exact format: {"x": <number>, "y": <number>}
4. Coordinates must be within bounds: x ∈ [0, ${viewportWidth}], y ∈ [0, ${viewportHeight}]
5. If element is not found, return: {"found": false, "reason": "<why not found>"}

**Examples:**
- Element: "blue Submit button" → {"x": 450, "y": 320}
- Element: "search input field" → {"x": 640, "y": 120}
- Element not found → {"found": false, "reason": "No blue Submit button visible"}

**Return ONLY JSON, no other text.**`;
  }

  /**
   * Call the vision model with screenshot and prompt
   *
   * @param {string} screenshotDataUrl - Screenshot passed as the `image_url.url` value.
   * @param {string} prompt - Text instruction sent alongside the image.
   * @returns {Promise<*>} Whatever the configured invoker resolves with.
   * @throws {Error} Wraps any invoker failure; the original error is kept as `cause`.
   */
  async _callVisionModel(screenshotDataUrl, prompt) {
    try {
      // Use the LLM invoker to call vision model
      // The invoker should handle model selection (GPT-4o, Claude 3.7, Gemini)
      const response = await this.llmInvoker({
        messages: [
          {
            role: 'user',
            content: [
              {
                type: 'text',
                text: prompt
              },
              {
                type: 'image_url',
                image_url: {
                  url: screenshotDataUrl,
                  detail: 'high' // High detail for accurate coordinate detection
                }
              }
            ]
          }
        ],
        temperature: 0.1, // Low temperature for deterministic results
        max_tokens: 200   // Short response expected
      });

      return response;
    } catch (error) {
      // Keep the underlying error reachable for debugging.
      throw new Error(`Vision model call failed: ${error.message}`, { cause: error });
    }
  }

  /**
   * Pull a JSON value out of a model reply that may carry Markdown fences or
   * surrounding prose.
   *
   * @param {string} text - Raw model reply.
   * @returns {*} The parsed JSON value.
   * @throws {SyntaxError} When no parseable JSON can be found.
   */
  _extractJsonObject(text) {
    // Drop code fences, with or without a language tag.
    const unfenced = text.replace(/```[a-zA-Z]*\s*/g, '').trim();

    try {
      return JSON.parse(unfenced);
    } catch (parseError) {
      // Models sometimes wrap the JSON in a sentence despite the prompt;
      // retry on the outermost {...} span before giving up.
      const start = unfenced.indexOf('{');
      const end = unfenced.lastIndexOf('}');
      if (start === -1 || end <= start) {
        throw parseError;
      }
      return JSON.parse(unfenced.slice(start, end + 1));
    }
  }

  /**
   * Parse coordinates from vision model response
   *
   * @param {string} response - Raw model reply expected to contain JSON.
   * @param {number} maxX - Upper clamp for x (viewport width).
   * @param {number} maxY - Upper clamp for y (viewport height).
   * @returns {{x: number, y: number} | null} Rounded coordinates clamped to
   *   `[0, maxX]` x `[0, maxY]`, or `null` when the model reported
   *   `found: false` or the reply/bounds are unusable (details are logged).
   */
  _parseCoordinates(response, maxX, maxY) {
    try {
      if (typeof response !== 'string') {
        throw new TypeError(`Expected a string response, got ${typeof response}`);
      }

      // Clamping against a non-finite bound would produce NaN coordinates.
      if (!Number.isFinite(maxX) || !Number.isFinite(maxY)) {
        throw new RangeError(`Invalid coordinate bounds: ${maxX}x${maxY}`);
      }

      const parsed = this._extractJsonObject(response);

      if (parsed === null || typeof parsed !== 'object') {
        throw new Error(`Invalid coordinates format: ${JSON.stringify(parsed)}`);
      }

      // Check if element was found
      if (parsed.found === false) {
        console.warn(`[VisualLocator] Element not found: ${parsed.reason}`);
        return null;
      }

      const { x, y } = parsed;

      // Number.isFinite rejects non-numbers as well as NaN/Infinity
      // (JSON such as 1e999 parses to Infinity).
      if (!Number.isFinite(x) || !Number.isFinite(y)) {
        throw new Error(`Invalid coordinates format: ${JSON.stringify(parsed)}`);
      }

      // Clamp coordinates to viewport bounds
      const clampedX = Math.max(0, Math.min(x, maxX));
      const clampedY = Math.max(0, Math.min(y, maxY));

      return { x: Math.round(clampedX), y: Math.round(clampedY) };

    } catch (error) {
      console.error('[VisualLocator] Failed to parse coordinates:', error);
      console.error('[VisualLocator] Response was:', response);
      return null;
    }
  }
}
