/**
 * HTML Page Extractor - browser-use inspired approach
 * Extracts complete HTML with indexed interactive elements
 * Skips script/style/noscript/code tags but extracts all other text content
 */

/**
 * Extracts a text rendering of the page in which every interactive element
 * is announced inline as `[index]<tag attr="val" ...>`.
 *
 * The entry point `extractIndexedHtml` reads the global `document` and
 * `window`; the remaining methods only touch the nodes/objects passed to them
 * (plus the global `Node` constants in `buildIndexedHtml`).
 */
class HtmlPageExtractor {
  /**
   * Extract indexed HTML content for the current document.
   *
   * @param {Object} [options] Configuration options
   * @param {number} [options.maxElements=1000] Maximum number of interactive
   *   elements to collect and index
   * @returns {{html: string, elementCount: number, elements: Object[], metadata: {title: string, url: string, timestamp: string}}}
   *   The rendered text, the indexed element descriptors and page metadata
   */
  extractIndexedHtml(options = {}) {
    const {
      maxElements = 1000
    } = options;

    // Collect the interactive elements (descriptor + DOM node, in parallel arrays)
    const { elements, domElements } = this.extractInteractiveElements({
      maxElements
    });

    // Number them and build the node -> descriptor lookup
    const indexedElements = this.assignIndices(elements, domElements, maxElements);

    // Render the body text with the indexed markers inline
    const html = this.buildIndexedHtml(document.body, indexedElements);

    return {
      html,
      elementCount: indexedElements.elements.length,
      elements: indexedElements.elements,
      metadata: {
        title: document.title,
        url: window.location.href,
        timestamp: new Date().toISOString()
      }
    };
  }

  /**
   * Collect interactive elements from the document, in the order returned by
   * `document.querySelectorAll`, stopping once `maxElements` were collected.
   *
   * @param {Object} options
   * @param {number} options.maxElements Upper bound on collected elements
   * @returns {{elements: Object[], domElements: Element[]}} Parallel arrays:
   *   `elements[i]` describes `domElements[i]`
   */
  extractInteractiveElements(options) {
    const { maxElements } = options;
    const elements = [];
    const domElements = [];

    // Selectors for interactive elements (browser-use style)
    const interactiveSelectors = [
      'a[href]',
      'button',
      'input',
      'select',
      'textarea',
      '[role="button"]',
      '[role="link"]',
      '[role="checkbox"]',
      '[role="radio"]',
      '[role="combobox"]',
      '[role="listbox"]',
      '[role="option"]',
      '[role="menuitem"]',
      '[role="tab"]',
      '[onclick]',
      '[tabindex]:not([tabindex="-1"])',
      '[contenteditable="true"]'
    ];

    const allElements = document.querySelectorAll(interactiveSelectors.join(', '));

    for (const element of allElements) {
      if (elements.length >= maxElements) break;

      const elementInfo = this.extractElementInfo(element);
      if (elementInfo) {
        elements.push(elementInfo);
        domElements.push(element);
      }
    }

    return { elements, domElements };
  }

  /**
   * Build a plain descriptor for a single element.
   *
   * The `text` is the current value (or placeholder) for inputs/textareas,
   * the selected option's label for selects, and the trimmed
   * innerText/textContent otherwise. It is capped at 100 characters.
   *
   * @param {Element} element Element to describe
   * @returns {{tagName: string, text: string, attributes: Object, rect: {top: number, left: number, width: number, height: number}}}
   */
  extractElementInfo(element) {
    const tagName = element.tagName.toLowerCase();
    const rect = element.getBoundingClientRect();

    let text = '';
    if (tagName === 'input' || tagName === 'textarea') {
      text = element.value || element.placeholder || '';
    } else if (tagName === 'select') {
      // selectedIndex is -1 when nothing is selected, which yields undefined here
      const selectedOption = element.options[element.selectedIndex];
      text = selectedOption ? selectedOption.text : '';
    } else {
      text = element.innerText?.trim() || element.textContent?.trim() || '';
    }

    // Cap at 100 characters: 97 characters of text plus the ellipsis
    if (text.length > 100) {
      text = text.substring(0, 97) + '...';
    }

    return {
      tagName,
      text,
      attributes: this.extractAttributes(element),
      rect: {
        top: rect.top,
        left: rect.left,
        width: rect.width,
        height: rect.height
      }
    };
  }

  /**
   * Copy the attributes of interest that are present and non-empty.
   *
   * @param {Element} element Element whose attributes are read
   * @returns {Object<string, string>} Attribute name -> value
   */
  extractAttributes(element) {
    const attrs = {};
    const relevantAttrs = [
      'type', 'name', 'id', 'class', 'placeholder', 'value',
      'href', 'src', 'alt', 'title', 'role', 'aria-label',
      'aria-describedby', 'aria-expanded', 'aria-selected',
      'disabled', 'readonly', 'required', 'checked',
      'contenteditable', 'data-testid'
    ];

    for (const attr of relevantAttrs) {
      const value = element.getAttribute(attr);
      if (value !== null && value !== '') {
        attrs[attr] = value;
      }
    }

    return attrs;
  }

  /**
   * Assign sequential indices (starting at 0) to the element descriptors.
   *
   * Note: the descriptors are mutated in place (an `index` property is set).
   *
   * @param {Object[]} elements Element descriptors
   * @param {Array} domElements DOM nodes, parallel to `elements`
   * @param {number} maxElements Maximum number of elements to index
   * @returns {{elements: Object[], domElements: Array, elementByNode: Map}}
   *   Indexed descriptors, their nodes, and a node -> descriptor lookup
   */
  assignIndices(elements, domElements, maxElements) {
    const indexedElements = [];
    const elementByNode = new Map();
    const indexedDomElements = [];

    const limit = Math.min(elements.length, maxElements);
    for (let i = 0; i < limit; i++) {
      const element = elements[i];
      const domElement = domElements[i];

      element.index = i;
      indexedElements.push(element);
      indexedDomElements.push(domElement);
      elementByNode.set(domElement, element);
    }

    return { elements: indexedElements, domElements: indexedDomElements, elementByNode };
  }

  /**
   * Render the subtree under `rootNode` as text with indexed elements inline.
   * browser-use format: [index]<tag attr="val" />
   *
   * Text nodes are emitted trimmed and followed by a space. Block-level
   * elements are surrounded by newlines. Subtrees of script, style, noscript
   * and code elements are skipped. An indexed element emits its marker and
   * then its children are still traversed, so nested indexed elements and
   * text are included.
   *
   * @param {Node} rootNode Node whose descendants are rendered
   * @param {{elementByNode: Map}} indexedData Result of `assignIndices`
   * @returns {string} The rendered text, without leading/trailing whitespace
   */
  buildIndexedHtml(rootNode, indexedData) {
    const { elementByNode } = indexedData;
    const parts = [];

    // Tags that contain code/data rather than page content. Code tags often
    // hold hidden JSON state (e.g., LinkedIn's `<code style="display:none">`).
    const skippedTags = new Set(['script', 'style', 'noscript', 'code']);

    // Tags that get a line of their own. Built once per call instead of once
    // per visited element.
    const blockTags = new Set([
      'div', 'p', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'li', 'section',
      'article', 'header', 'footer', 'main', 'nav', 'br'
    ]);

    // Append a newline unless the output already ends with one
    const ensureTrailingNewline = () => {
      const lastChar = parts.length > 0 ? parts[parts.length - 1].slice(-1) : '';
      if (lastChar !== '\n') {
        parts.push('\n');
      }
    };

    const traverse = (node) => {
      // Emit the marker when this node is one of the indexed elements
      const indexedElement = elementByNode.get(node);
      if (indexedElement) {
        parts.push(this.formatIndexedElement(indexedElement));
        parts.push(' ');
        // Fall through: children are still processed below
      }

      for (const child of node.childNodes) {
        if (child.nodeType === Node.TEXT_NODE) {
          const text = child.textContent.trim();
          if (text) {
            parts.push(text);
            parts.push(' ');
          }
        } else if (child.nodeType === Node.ELEMENT_NODE) {
          const tagName = child.tagName.toLowerCase();

          if (skippedTags.has(tagName)) {
            continue;
          }

          const isBlockElement = blockTags.has(tagName);

          if (isBlockElement) {
            ensureTrailingNewline();
          }

          traverse(child);

          if (isBlockElement) {
            ensureTrailingNewline();
          }
        }
      }
    };

    traverse(rootNode);

    // Collapse runs of 3+ newlines, then strip surrounding whitespace.
    // trim() already removes leading/trailing newlines; the previous
    // `/^\\n+/` pattern matched a literal backslash followed by "n"s and
    // could delete real page text that started with those characters.
    return parts.join('')
      .replace(/\n{3,}/g, '\n\n')
      .trim();
  }

  /**
   * Format an indexed element in browser-use style.
   * Format: [index]<tag attr="val" />
   *
   * Only attributes with a truthy value are printed, in a fixed order.
   * Void-style tags are self-closed; other tags wrap the element's text
   * (possibly empty). Attribute values and text are emitted verbatim
   * (no escaping).
   *
   * @param {{index: number, tagName: string, text: string, attributes: Object}} element
   *   Indexed element descriptor
   * @returns {string} The formatted marker
   */
  formatIndexedElement(element) {
    let html = `[${element.index}]<${element.tagName}`;

    // Add key attributes
    const importantAttrs = ['type', 'placeholder', 'aria-label', 'name', 'id', 'role', 'value', 'href', 'class'];
    for (const attr of importantAttrs) {
      if (element.attributes[attr]) {
        html += ` ${attr}="${element.attributes[attr]}"`;
      }
    }

    // Self-closing tags
    if (['input', 'img', 'br', 'hr', 'meta', 'link'].includes(element.tagName)) {
      html += ' />';
    } else if (element.text) {
      html += `>${element.text}</${element.tagName}>`;
    } else {
      html += `></${element.tagName}>`;
    }

    return html;
  }
}

// Make it available in the global scope for Playwright eval context
globalThis.HtmlPageExtractor = HtmlPageExtractor;
// Backward compatibility alias for existing code
globalThis.IndexedHtmlExtractor = HtmlPageExtractor;

// ES6 exports for module systems
export default HtmlPageExtractor;
export { HtmlPageExtractor, HtmlPageExtractor as IndexedHtmlExtractor };
