/**
 * Unified OCR validation utility
 * Combines functionality from both tests/utils/ocr-verification.js and lib/utils/ocr-validator.js
 * Uses Tesseract to extract text from screenshots and compare with DOM extraction
 */

import Tesseract from 'tesseract.js';
import fs from 'fs';
import path from 'path';

/**
 * Compute the Levenshtein (edit) distance between two strings.
 * Used for fuzzy matching when OCR output contains minor recognition errors.
 *
 * Comparison is done per UTF-16 code unit (via `charAt`) and is case-sensitive.
 *
 * @param {string} str1 - First string.
 * @param {string} str2 - Second string.
 * @returns {number} Minimum number of single-character substitutions,
 *   insertions and deletions needed to turn one string into the other.
 */
function levenshteinDistance(str1, str2) {
  const rowCount = str2.length;
  const colCount = str1.length;

  // table[r][c] = distance between the first r chars of str2 and the first c chars of str1.
  const table = [];
  for (let r = 0; r <= rowCount; r++) {
    table.push([r]);
  }
  for (let c = 0; c <= colCount; c++) {
    table[0][c] = c;
  }

  for (let r = 1; r <= rowCount; r++) {
    const targetChar = str2.charAt(r - 1);
    const above = table[r - 1];
    const current = table[r];

    for (let c = 1; c <= colCount; c++) {
      current[c] = targetChar === str1.charAt(c - 1)
        ? above[c - 1] // characters agree: carry the diagonal cost over unchanged
        : 1 + Math.min(
          above[c - 1],   // substitution
          current[c - 1], // insertion
          above[c]        // deletion
        );
    }
  }

  return table[rowCount][colCount];
}

/**
 * Calculate the case-insensitive similarity between two strings as a percentage.
 *
 * Both strings are lowercased, then the Levenshtein distance between them is
 * normalized by the length of the longer lowercased string.
 *
 * @param {string} str1 - First string.
 * @param {string} str2 - Second string.
 * @returns {number} Similarity in the range 0–100; two empty strings yield 100.
 */
function calculateSimilarity(str1, str2) {
  // Lowercase BEFORE measuring length: case mapping can change a string's
  // length (e.g. 'İ'.toLowerCase() is two code units), and the distance must
  // be normalized by the length of the strings that were actually compared,
  // otherwise the result can fall outside 0–100.
  const lowered1 = str1.toLowerCase();
  const lowered2 = str2.toLowerCase();

  const maxLength = Math.max(lowered1.length, lowered2.length);
  if (maxLength === 0) return 100;

  const distance = levenshteinDistance(lowered1, lowered2);
  return ((maxLength - distance) / maxLength) * 100;
}

/**
 * OCR validator supporting both coverage-based and fuzzy-matching validation
 * of screenshots against expected or extracted text.
 *
 * The Tesseract worker is created lazily by `initialize()` (which every
 * OCR-running method calls) and is kept on `this.worker` until `cleanup()`.
 */
export class OCRValidator {
  /**
   * @param {object} [options]
   * @param {string[]} [options.languages] - Tesseract language codes; defaults to `['eng']`.
   * @param {string} [options.langPath] - Location the language data is loaded from.
   */
  constructor(options = {}) {
    this.worker = null;
    // Default to English only for better accuracy.
    // Multi-language detection causes confusion (e.g., treating 't' as 'т').
    // Use options.languages = ['eng', 'rus'] for Cyrillic text if needed.
    this.languages = options.languages || ['eng'];
    this.langPath = options.langPath || 'https://tessdata.projectnaptha.com/4.0.0';
  }

  /**
   * Create and configure the Tesseract worker if one does not exist yet.
   * Calling this while a worker is already present is a no-op.
   *
   * @returns {Promise<void>}
   */
  async initialize() {
    if (this.worker) {
      return;
    }

    console.log(`🔧 Initializing Tesseract.js OCR engine (${this.languages.join('+')})...`);
    this.worker = await Tesseract.createWorker(this.languages, 1, {
      langPath: this.langPath,
      logger: (m) => {
        // Only log errors, suppress verbose progress output
        if (m.status === 'error') {
          console.error('Tesseract error:', m);
        }
      },
    });

    // Configure Tesseract parameters for better accuracy
    // PSM 3 = Fully automatic page segmentation (default, best for documents)
    // OEM 1 = Neural nets LSTM engine only (best accuracy for modern text)
    await this.worker.setParameters({
      tessedit_pageseg_mode: '3',    // Automatic page segmentation
      tessedit_ocr_engine_mode: '1', // LSTM neural net engine
    });

    console.log(`OCR engine ready with ${this.languages.join(' and ')} support`);
  }

  /**
   * Terminate the Tesseract worker, if any, and forget it so that the next
   * OCR call creates a fresh one.
   *
   * @returns {Promise<void>}
   */
  async cleanup() {
    if (!this.worker) {
      return;
    }
    await this.worker.terminate();
    this.worker = null;
  }

  /**
   * Detect whether text contains Cyrillic characters.
   *
   * @param {string} text
   * @returns {boolean} True if any character is in U+0400–U+04FF.
   */
  hasCyrillic(text) {
    return /[\u0400-\u04FF]/.test(text);
  }

  /**
   * Extract text from a screenshot using OCR.
   *
   * If the validator is configured for English only and the first pass
   * yields Cyrillic characters, the worker is recreated with eng+rus and the
   * image is recognized again. Note that `this.languages` stays switched to
   * `['eng', 'rus']` afterwards.
   *
   * @param {string} screenshotPath - Path to the image file.
   * @returns {Promise<{text: string, confidence: number, wordCount: number}>}
   * @throws {Error} If the screenshot file does not exist.
   */
  async extractTextFromScreenshot(screenshotPath) {
    if (!fs.existsSync(screenshotPath)) {
      throw new Error(`Screenshot not found: ${screenshotPath}`);
    }

    await this.initialize();

    // Shapes a recognition result into this method's return value.
    const toResult = ({ text, confidence }) => ({
      text,
      confidence,
      wordCount: text.split(/\s+/).filter((w) => w.length > 0).length,
    });

    // First pass: use the current language settings
    const { data: firstPass } = await this.worker.recognize(screenshotPath);

    const isEnglishOnly = this.languages.length === 1 && this.languages[0] === 'eng';
    if (isEnglishOnly && this.hasCyrillic(firstPass.text)) {
      console.log(`   🔄 Cyrillic detected, re-running OCR with eng+rus...`);

      // Recreate the worker with both languages
      await this.cleanup();
      this.languages = ['eng', 'rus'];
      await this.initialize();

      const { data: secondPass } = await this.worker.recognize(screenshotPath);
      console.log(`   OCR Confidence: ${secondPass.confidence.toFixed(1)}% (eng+rus)`);
      return toResult(secondPass);
    }

    console.log(`   OCR Confidence: ${firstPass.confidence.toFixed(1)}%`);
    return toResult(firstPass);
  }

  /**
   * Clean and normalize text for comparison: lowercase, replace punctuation
   * and symbols with spaces, collapse whitespace, trim.
   *
   * Letters, combining marks and digits of any script are kept (plus `_`),
   * so Cyrillic and accented text survive normalization. An ASCII-only `\w`
   * class would erase such text entirely.
   *
   * @param {string} text
   * @returns {string}
   */
  cleanText(text) {
    return text
      .toLowerCase()
      .replace(/[^\p{L}\p{M}\p{N}_\s]/gu, ' ') // Replace punctuation/symbols with spaces
      .replace(/\s+/g, ' ')
      .trim();
  }

  /**
   * Calculate text coverage between OCR and extracted content (for main.test.js).
   *
   * Both inputs are normalized with `cleanText`, split into words, and words
   * of two characters or fewer are ignored. Coverage is the share of distinct
   * OCR words that also occur in the extracted content.
   *
   * @param {string} ocrText - Text recognized from the screenshot.
   * @param {string} extractedContent - Text produced by the extractor.
   * @returns {{coverage: (string|number), ocrWords: number, matchedWords: number, missingWords: string[]}}
   *   `coverage` is a one-decimal string (e.g. `'87.5'`), or the number `0`
   *   when the OCR text has no usable words. `missingWords` holds at most 10 entries.
   */
  calculateCoverage(ocrText, extractedContent) {
    // Normalize both sides identically; otherwise an OCR token such as
    // "hello," can never match the cleaned extractor word "hello".
    const significantWords = (text) =>
      new Set(this.cleanText(text).split(/\s+/).filter((w) => w.length > 2));

    const ocrWords = significantWords(ocrText);
    const extractedWords = significantWords(extractedContent);

    if (ocrWords.size === 0) {
      return { coverage: 0, ocrWords: 0, matchedWords: 0, missingWords: [] };
    }

    const unmatched = [...ocrWords].filter((w) => !extractedWords.has(w));
    const matchedWords = ocrWords.size - unmatched.length;
    const coverage = (matchedWords / ocrWords.size) * 100;

    return {
      coverage: coverage.toFixed(1),
      ocrWords: ocrWords.size,
      matchedWords,
      missingWords: unmatched.slice(0, 10),
    };
  }

  /**
   * Calculate fuzzy match with Levenshtein distance (for page-extraction.test.js).
   *
   * Every OCR word of at least three characters is looked up in the markdown
   * words: first exactly, then within an edit distance of 2.
   *
   * @param {string} ocrText - OCR extracted text
   * @param {string} markdownText - Extractor output
   * @returns {{matchPercentage: number, matchedWords: number, exactMatches: number,
   *   fuzzyMatches: number, totalWords: number, missing: string[]}}
   *   Match statistics; `missing` is capped at 30 words.
   */
  calculateFuzzyMatch(ocrText, markdownText) {
    // Split into words (very short OCR words are ignored)
    const ocrWords = this.cleanText(ocrText).split(/\s+/).filter((w) => w.length >= 3);
    const mdWordsSet = new Set(this.cleanText(markdownText).split(/\s+/));
    // Distinct candidates in first-seen order: same outcome as scanning the
    // full word list, without re-scoring duplicates.
    const mdCandidates = [...mdWordsSet];

    const totalWords = ocrWords.length;
    const missing = [];
    let exactMatches = 0;
    let fuzzyMatches = 0;

    for (const word of ocrWords) {
      if (mdWordsSet.has(word)) {
        exactMatches++;
      } else if (this.findBestMatch(word, mdCandidates, 2)) {
        fuzzyMatches++;
      } else if (missing.length < 30) {
        missing.push(word);
      }
    }

    const matchedWords = exactMatches + fuzzyMatches;
    const matchPercentage = totalWords > 0 ? (matchedWords / totalWords) * 100 : 0;

    return {
      matchPercentage,
      matchedWords,
      exactMatches,
      fuzzyMatches,
      totalWords,
      missing,
    };
  }

  /**
   * Find the best fuzzy match for a word using Levenshtein distance.
   *
   * @param {string} word - Word to look for.
   * @param {Iterable<string>} candidates - Words to compare against.
   * @param {number} [maxDistance=2] - Largest accepted edit distance.
   * @returns {{word: string, distance: number}|null} The first candidate with
   *   the smallest distance not exceeding `maxDistance`, or null if none qualifies.
   */
  findBestMatch(word, candidates, maxDistance = 2) {
    let bestMatch = null;
    let bestDistance = Infinity;

    for (const candidate of candidates) {
      // The edit distance is never smaller than the length difference, so
      // such candidates cannot qualify. Tying the window to maxDistance
      // (instead of a fixed 2) keeps the shortcut correct for any threshold.
      if (Math.abs(candidate.length - word.length) > maxDistance) continue;

      const distance = levenshteinDistance(word, candidate);
      if (distance <= maxDistance && distance < bestDistance) {
        bestDistance = distance;
        bestMatch = { word: candidate, distance };
        if (distance === 0) break; // cannot be improved upon
      }
    }

    return bestMatch;
  }

  /**
   * Validate content extraction using OCR (main.test.js style).
   *
   * Never throws: any failure is reported as a result with `passed: false`
   * and an `error` message.
   *
   * @param {string} screenshotPath - Path to the screenshot image.
   * @param {string} extractedContent - Text produced by the extractor.
   * @param {object} [options]
   * @param {number} [options.minCoverage=95] - Required coverage percentage.
   * @param {boolean} [options.saveDebugInfo=false] - Write `ocr-debug.json` when true.
   * @param {string|null} [options.debugPath=null] - Directory for the debug file.
   * @returns {Promise<object>} Result with `passed`, `coverage`, `message` and,
   *   on success of the OCR step, `ocrWords`, `matchedWords`, `missingWords`
   *   and `ocrConfidence`.
   */
  async validateExtraction(screenshotPath, extractedContent, options = {}) {
    const {
      minCoverage = 95,
      saveDebugInfo = false,
      debugPath = null,
    } = options;

    try {
      const ocrResult = await this.extractTextFromScreenshot(screenshotPath);
      const coverageResult = this.calculateCoverage(ocrResult.text, extractedContent);

      // Save debug info if requested
      if (saveDebugInfo && debugPath) {
        const debugInfo = {
          screenshotPath,
          ocrConfidence: ocrResult.confidence,
          ocrWordCount: ocrResult.wordCount,
          extractedWordCount: extractedContent.split(/\s+/).length,
          coverage: coverageResult,
          ocrTextSample: ocrResult.text.substring(0, 500),
          extractedTextSample: extractedContent.substring(0, 500),
        };

        // Create the directory first so a missing debug folder does not turn
        // an otherwise valid run into a validation error.
        await fs.promises.mkdir(debugPath, { recursive: true });
        await fs.promises.writeFile(
          path.join(debugPath, 'ocr-debug.json'),
          JSON.stringify(debugInfo, null, 2)
        );
      }

      const passed = parseFloat(coverageResult.coverage) >= minCoverage;

      return {
        passed,
        coverage: coverageResult.coverage,
        ocrWords: coverageResult.ocrWords,
        matchedWords: coverageResult.matchedWords,
        missingWords: coverageResult.missingWords,
        ocrConfidence: ocrResult.confidence,
        message: passed ?
          `OCR validation passed: ${coverageResult.coverage}% coverage` :
          `OCR validation failed: ${coverageResult.coverage}% coverage (required: ${minCoverage}%)`,
      };
    } catch (error) {
      return {
        passed: false,
        coverage: 0,
        error: error.message,
        message: `OCR validation error: ${error.message}`,
      };
    }
  }

  /**
   * Find the on-screen location of a text in a screenshot.
   *
   * The search text is normalized and split into words; the OCR word list is
   * scanned for the first run of consecutive words that are each at least
   * 80% similar to the corresponding search word.
   *
   * @param {string} screenshotPath - Path to the screenshot image.
   * @param {string} searchText - Text to locate.
   * @returns {Promise<{x: number, y: number, width: number, height: number}|null>}
   *   `x`/`y` are the CENTER of the combined bounding box of the first
   *   matching run and `width`/`height` its size; null when nothing matches,
   *   OCR finds no words, or the search text has no searchable words.
   * @throws {Error} If the screenshot file does not exist.
   */
  async findTextCoordinates(screenshotPath, searchText) {
    if (!fs.existsSync(screenshotPath)) {
      throw new Error(`Screenshot not found: ${screenshotPath}`);
    }

    // An empty term would be "100% similar" to any OCR token that cleans to
    // an empty string (pure punctuation), so empty searches never match.
    const searchTerms = this.cleanText(searchText).split(/\s+/).filter(Boolean);
    if (searchTerms.length === 0) {
      return null;
    }

    await this.initialize();

    const { data } = await this.worker.recognize(screenshotPath);

    let words = data.words;

    // If the flat word list is missing but blocks exist, collect the words
    // from the block → paragraph → line hierarchy.
    if (!words && data.blocks) {
      words = [];
      for (const block of data.blocks) {
        for (const paragraph of block.paragraphs || []) {
          for (const line of paragraph.lines || []) {
            if (line.words) {
              words.push(...line.words);
            }
          }
        }
      }
    }

    if (!words || words.length === 0) {
      console.warn('OCR returned no words for', screenshotPath);
      console.warn('Available data keys:', Object.keys(data));
      console.warn('Blocks count:', data.blocks?.length || 0);
      if (data.blocks && data.blocks.length > 0) {
        console.warn('First block structure:', JSON.stringify(data.blocks[0], null, 2).substring(0, 500));
      }
      return null;
    }

    // Slide a window of searchTerms.length words over the OCR words
    for (let start = 0; start <= words.length - searchTerms.length; start++) {
      const candidate = words.slice(start, start + searchTerms.length);

      const isMatch = candidate.every(
        (word, offset) => calculateSimilarity(this.cleanText(word.text), searchTerms[offset]) >= 80
      );
      if (!isMatch) continue;

      // Union of the bounding boxes of all matched words
      const x0 = Math.min(...candidate.map((w) => w.bbox.x0));
      const y0 = Math.min(...candidate.map((w) => w.bbox.y0));
      const x1 = Math.max(...candidate.map((w) => w.bbox.x1));
      const y1 = Math.max(...candidate.map((w) => w.bbox.y1));

      return {
        x: x0 + (x1 - x0) / 2,
        y: y0 + (y1 - y0) / 2,
        width: x1 - x0,
        height: y1 - y0,
      };
    }

    return null;
  }

  /**
   * Verify that all expected texts appear in a screenshot.
   *
   * Matching mode, in order of precedence:
   *  - `exactMatch`: the expected text must occur delimited by non-word
   *    characters (letters, marks, digits and `_` of any script count as word
   *    characters);
   *  - `fuzzyMatch` (default): some run of OCR words of the same word count
   *    must reach `similarityThreshold` percent similarity (the similarity
   *    measure itself ignores case);
   *  - otherwise: plain substring search.
   *
   * Unless `caseSensitive` is set, both the OCR text and the expected texts
   * are normalized with `cleanText` first.
   *
   * @param {string} screenshotPath - Path to the screenshot image.
   * @param {string[]} expectedTexts - Texts that must be present.
   * @param {string} stepName - Label used in the log output.
   * @param {object} [options]
   * @param {boolean} [options.caseSensitive=false]
   * @param {boolean} [options.exactMatch=false]
   * @param {boolean} [options.fuzzyMatch=true]
   * @param {number} [options.similarityThreshold=80]
   * @returns {Promise<true>} Resolves to true when every text was found.
   * @throws {Error} If the screenshot is missing or any expected text is not found.
   */
  async verifyScreenshotContainsText(screenshotPath, expectedTexts, stepName, options = {}) {
    const {
      caseSensitive = false,
      exactMatch = false,
      fuzzyMatch = true,
      similarityThreshold = 80,
    } = options;

    const ocrResult = await this.extractTextFromScreenshot(screenshotPath);
    const rawText = ocrResult.text.replace(/\s+/g, ' ').trim();
    const searchText = caseSensitive ? rawText : this.cleanText(rawText);
    const textWords = searchText.split(/\s+/);

    const foundTexts = [];
    const missingTexts = [];

    for (const expectedText of expectedTexts) {
      const searchFor = caseSensitive ? expectedText : this.cleanText(expectedText);
      let found = false;
      let matchDetails = '';

      if (exactMatch) {
        // `\b` only understands ASCII word characters and would never match
        // around Cyrillic text, so word boundaries are expressed as
        // Unicode-aware lookarounds instead.
        const escaped = searchFor.replace(/[.*+?^${}()|[\]\\]/g, '\\$&');
        const wordChar = '[\\p{L}\\p{M}\\p{N}_]';
        const pattern = new RegExp(`(?<!${wordChar})${escaped}(?!${wordChar})`, 'u');
        found = pattern.test(searchText);
      } else if (fuzzyMatch) {
        const phraseLength = searchFor.split(/\s+/).length;
        let bestSimilarity = 0;
        let bestMatch = '';

        // Compare against every run of OCR words with the same word count
        for (let i = 0; i <= textWords.length - phraseLength; i++) {
          const phrase = textWords.slice(i, i + phraseLength).join(' ');
          const similarity = calculateSimilarity(searchFor, phrase);
          if (similarity > bestSimilarity) {
            bestSimilarity = similarity;
            bestMatch = phrase;
          }
        }

        if (bestSimilarity >= similarityThreshold) {
          found = true;
          matchDetails = ` (${bestSimilarity.toFixed(1)}% similar to "${bestMatch}")`;
        }
      } else {
        found = searchText.includes(searchFor);
      }

      if (found) {
        foundTexts.push(expectedText + matchDetails);
      } else {
        missingTexts.push(expectedText);
      }
    }

    console.log(`\n🔍 OCR VERIFICATION - ${stepName}:`);
    console.log(`   Screenshot: ${path.basename(screenshotPath)}`);
    console.log(`   Expected: [${expectedTexts.join(', ')}]`);
    console.log(`   Found: [${foundTexts.join(', ')}] `);

    if (missingTexts.length > 0) {
      const errMessage = `Missing: [${missingTexts.join(', ')}] at ${screenshotPath}`;
      console.log(errMessage);
      console.log('OCR text (raw):', rawText);
      console.log('OCR text (cleaned):', searchText);
      console.log('Words array:', textWords);
      throw new Error(errMessage);
    }

    console.log(`OCR Verification PASSED for ${screenshotPath}`);
    return true;
  }
}

// Default export of the class, kept for backward compatibility with both test suites
export default OCRValidator;

// Named exports of the string-distance helper functions defined at the top of this module
export { levenshteinDistance, calculateSimilarity };

// Functional API for tests/utils/ocr-verification.js compatibility.
// Module-level validator instance shared by the functions below:
// set by initOCR() and reset to null by cleanupOCR().
let globalValidator = null;

/**
 * Return the shared module-level validator, creating and initializing it on
 * the first call.
 *
 * `languages` only takes effect when the shared validator is created; while
 * one already exists it is returned as-is and the argument is ignored.
 *
 * @param {string[]} [languages=['eng']] - Tesseract language codes.
 * @returns {Promise<OCRValidator>}
 */
export async function initOCR(languages = ['eng']) {
  if (globalValidator) {
    return globalValidator;
  }

  globalValidator = new OCRValidator({ languages });
  await globalValidator.initialize();
  return globalValidator;
}

/**
 * Shut down the shared module-level validator, if one exists, and clear the
 * reference so that the next `initOCR()` call creates a new one.
 *
 * @returns {Promise<void>}
 */
export async function cleanupOCR() {
  if (!globalValidator) {
    return;
  }

  await globalValidator.cleanup();
  globalValidator = null;
}

/**
 * Run OCR on a screenshot with the shared validator and return only the
 * recognized text (confidence and word count are discarded).
 *
 * @param {string} screenshotPath - Path to the screenshot image.
 * @returns {Promise<string>} The recognized text.
 */
export async function extractTextFromScreenshot(screenshotPath) {
  const sharedValidator = await initOCR();
  const { text } = await sharedValidator.extractTextFromScreenshot(screenshotPath);
  return text;
}

/**
 * Functional wrapper: locate `searchText` in a screenshot by delegating to
 * the shared validator's `findTextCoordinates` method.
 *
 * @param {string} screenshotPath - Path to the screenshot image.
 * @param {string} searchText - Text to locate.
 * @returns {Promise<object|null>} Whatever the validator method resolves to.
 */
export async function findTextCoordinates(screenshotPath, searchText) {
  const sharedValidator = await initOCR();
  const lookup = sharedValidator.findTextCoordinates(screenshotPath, searchText);
  return lookup;
}

/**
 * Functional wrapper: check that the expected texts appear in a screenshot by
 * delegating to the shared validator's `verifyScreenshotContainsText` method.
 *
 * @param {string} screenshotPath - Path to the screenshot image.
 * @param {string[]} expectedTexts - Texts that must be present.
 * @param {string} stepName - Label passed through to the validator.
 * @param {object} [options={}] - Matching options passed through unchanged.
 * @returns {Promise<boolean>} Whatever the validator method resolves to;
 *   rejections from it propagate to the caller.
 */
export async function verifyScreenshotContainsText(screenshotPath, expectedTexts, stepName, options = {}) {
  const sharedValidator = await initOCR();
  const verification = sharedValidator.verifyScreenshotContainsText(
    screenshotPath,
    expectedTexts,
    stepName,
    options
  );
  return verification;
}

/**
 * Compare visual (OCR) content with DOM content to detect rendering issues.
 * Critical for validating that browser automation actually changed the UI.
 *
 * Reads `document.body.innerText` through `page.evaluate`, runs OCR on the
 * screenshot, and reports DOM words longer than three characters that do not
 * occur among the OCR words. Words are compared lowercased and split on
 * whitespace only; punctuation is not stripped.
 *
 * @param {object} page - Object exposing `evaluate(fn)`, which runs `fn` in the
 *   page and resolves to its result.
 * @param {string} screenshotPath - Path to the screenshot image.
 * @param {string} stepName - Label used in the log output.
 * @returns {Promise<{domWords: number, ocrWords: number, invisibleWords: string[]}>}
 *   Distinct word counts for both sources and the DOM words not seen by OCR.
 */
export async function compareVisualWithDOM(page, screenshotPath, stepName) {
  // ''.split(' ') yields [''], so empty tokens are dropped; otherwise an empty
  // DOM or OCR text would be reported as containing one word.
  const toWordSet = (text) =>
    new Set(
      text
        .replace(/\s+/g, ' ')
        .trim()
        .toLowerCase()
        .split(' ')
        .filter(Boolean)
    );

  // Get DOM text
  const domText = await page.evaluate(() => document.body.innerText);
  const domWords = toWordSet(domText);

  // Get OCR text
  const ocrText = await extractTextFromScreenshot(screenshotPath);
  const ocrWords = toWordSet(ocrText);

  // Text that is in the DOM but was not recognized visually (potential rendering issue)
  const invisibleWords = [...domWords].filter(
    (word) => word.length > 3 && !ocrWords.has(word)
  );

  console.log(`\nVisual vs DOM Comparison - ${stepName}:`);
  console.log(`DOM word count: ${domWords.size}`);
  console.log(`OCR word count: ${ocrWords.size}`);

  if (invisibleWords.length > 0) {
    console.log(`⚠️ Text in DOM but not visible: ${invisibleWords.slice(0, 5).join(', ')}`);
    console.log(`This might indicate rendering issues or hidden elements`);
  } else {
    console.log(`Visual rendering matches DOM content`);
  }

  return {
    domWords: domWords.size,
    ocrWords: ocrWords.size,
    invisibleWords,
  };
}
