package com.speechify.client.helpers.content.standard.html.contentExtractionRules

import com.speechify.client.api.content.view.web.WebPageElementAttribute
import com.speechify.client.helpers.content.standard.html.ElementMatcher

/**
 * The extraction rules built from this file's matchers: the elements to ignore (`ignored`) and the elements to treat
 * as inline (`inline`).
 *
 * `lazy` is used just so that this file reads overview-before-detail / what-before-how, i.e. so that this property
 * can sit at the top, ahead of the properties it refers to.
 */
internal val allPagesRules by lazy {
    /* The matchers are left in their own properties rather than inlined here, especially to minimize nesting and to
     not have to re-break the lines of their comments just to make the opinionated linter (which doesn't like
     auto-wrap) happy. */
    val ignoredElements = ElementMatcher.CompositeAnyOf(
        byTagNameMatchersSet = allPagesSkippedTags,
        otherMatchers = allPagesSkippedMatchers,
    )

    RelevantContentExtractionRules(
        ignored = ignoredElements,
        inline = inlineElementMatcher,
    )
}

/* TODO Combine into `elementsKnowledge` */
/**
 * Tag-name matchers for the elements that are skipped, each with the reason for skipping it (and, for a few tags,
 * the reason for deliberately NOT skipping them).
 */
private val allPagesSkippedTags = setOf(
    /* Not the content for presenting to a human, by definition. */
    "style",
    "script",
    "head",

    /* "noscript" is deliberately not skipped: since we're parsing in a no-Javascript context, we should keep these
     elements! They typically contain things like images that would have otherwise been rendered by JS. */

    /* Not the core of the content by the semantic intent - typically just the content repeating across every page of
     a website, e.g. defined as
     [_"introductory content, typically a group of introductory or navigational aids. It may contain some heading
     elements but also a logo, a search form, an author name, and other elements."_](https://developer.mozilla.org/en-US/docs/Web/HTML/Element/header)
     Sometimes, though, it is used NOT for the website logo but for the main heading of the article with the title -
     for example nytimes.com uses it to display the main title, e.g.
     [here](https://www.nytimes.com/2022/08/30/opinion/trump-barr-justice-department.html)
     (though we do have the page title extracted and presented already) - or with the info on the author, so in
     theory it could be good to extract it, if not even read it. */
    "header",

    /* Not the core of the content by the semantic intent - typically just the content repeating across every page of
     a website. */
    "footer",

    /* [_"purpose is to provide navigation
     links"_](https://developer.mozilla.org/en-US/docs/Web/HTML/Element/nav), so skipping, as they will typically be
     repeated on all pages of a website. */
    "nav",

    /* Skipping because, like `nav`, [_"<menu> was intended for interactive
     items"_](https://developer.mozilla.org/en-US/docs/Web/HTML/Element/menu), so they will typically be repeated on
     all pages of a website. */
    "menu",

    /* [_"portion ... whose content is only indirectly related to the document's main
     content"_](https://developer.mozilla.org/en-US/docs/Web/HTML/Element/aside) */
    "aside",

    /* [Math](https://developer.mozilla.org/en-US/docs/Web/MathML/Element/math) contains text nodes. We skip it
     entirely, because speaking anything in place of mathematical formulas isn't supported (though in some faraway
     future it could be), while speaking the content text nodes would be gibberish. */
    "math",

    /* "figure" is deliberately not skipped: <figure>s usually contain images that we care about. */

    /* Skipping to maintain parity with Chrome Extension (but in future it could make sense to say something for an
     image, and then it would make sense to also speak the caption or its `alt` text). */
    "figcaption",

    /* Skipping to maintain parity with Chrome Extension (although #ChromeSkippingOnlyOutsideOfP: they aren't skipped
     inside <p> there, but only outside, so we could consider reading them in future). */
    "button",

    /* Skipping to maintain parity with Chrome Extension (but see #ChromeSkippingOnlyOutsideOfP). */
    "select",

    /* Skipping to maintain parity with Chrome Extension. It could make sense to skip only when matching
     `:not([open])`, but it's likely going to be a side dialog accompanying the content, rather than the main content
     of the page, and it would be very jarring to read these. */
    "dialog",

    /* Skipping XBRL Header tags from HTML pages: they are non-displayed portions of the document according to the
     [XBRL Definition](https://www.xbrl.org/specification/inlinexbrl-part1/rec-2013-11-18/inlinexbrl-part1-rec-2013-11-18.html#d1e3971) */
    "ix:header",
)
    .map { tagName -> ElementMatcher.ByTagName(tagName) }
    .toSet()

/* TODO Combine into `elementsKnowledge` */
/**
 * Matchers for the skipped elements that are identified by something other than their tag name: the presence of an
 * attribute, or a `role`. Each entry carries the reason for skipping it.
 */
private val allPagesSkippedMatchers = arrayOf<ElementMatcher.ScanRequiringMatcher>(
    /* `hidden` is equivalent to `display: none;`, and pages which optimize for search-engines or robots will
     sometimes add it, so as not to require understanding CSS. Hidden elements will most often not be the important
     content, but rather some aside content that shows conditionally but is rendered into the page for ease of
     implementation or optimization (server-side rendering). */
    ElementMatcher.ByAttributePresence(attributeNameLowercase = "hidden"),

    /* The aria-label attribute defines a string value that labels an interactive element. We *very* rarely want to
     show these things (button labels, galleries, slideshows, etc), so we benefit from ignoring them completely.
     https://developer.mozilla.org/en-US/docs/Web/Accessibility/ARIA/Attributes/aria-label */
    ElementMatcher.ByAttributePresence(
        attributeNameLowercase = "aria-label",
        ignoreForTags = setOf("body", "article", "main", "content", "section"),
        ignoreForElementAttributes = setOf(
            WebPageElementAttribute("aria-label", "content"),
            WebPageElementAttribute("role", "main"),
        ),
    ),

    /* Used on Wikipedia for the table-of-contents of an article and some menus, while by standard
     [_"Like the HTML <nav> element, navigation landmarks ..."_](https://developer.mozilla.org/en-US/docs/Web/Accessibility/ARIA/Roles/navigation_role) */
    ElementMatcher.ByRole("navigation"),

    /* Since we also have `<menu>` excluded - it's most likely the same purpose, despite
     [the docs not mentioning it](https://developer.mozilla.org/en-US/docs/Web/Accessibility/ARIA/Roles/menu_role) */
    ElementMatcher.ByRole("menu"),

    /* Used for 'sharing buttons', e.g. on nytimes.com
     [here](https://www.nytimes.com/2022/08/30/opinion/trump-barr-justice-department.html) */
    ElementMatcher.ByRole("toolbar"),

    /* Not visible by default, as per
     [_"Tooltips provide contextual information about an element when that owning element receives
     focus"_](https://developer.mozilla.org/en-US/docs/Web/Accessibility/ARIA/Roles/tooltip_role) */
    ElementMatcher.ByRole("tooltip"),

    /* Used on Wikipedia for the _"From Wikipedia, the free encyclopedia"_ top note, while by standard
     [_"a section whose content is parenthetic or ancillary to the main content"_](https://developer.mozilla.org/en-US/docs/Web/Accessibility/ARIA/Roles/note_role) */
    ElementMatcher.ByRole("note"),

    /* [_"<header> element has an identical
     meaning"_](https://developer.mozilla.org/en-US/docs/Web/Accessibility/ARIA/Roles/banner_role) */
    ElementMatcher.ByRole("banner"),

    /* Same as <footer>, as per [_"Using the <footer> element instead is
     recommended"_](https://developer.mozilla.org/en-US/docs/Web/Accessibility/ARIA/Roles/contentinfo_role) */
    ElementMatcher.ByRole("contentinfo"),

    /* [_"supporting section ... If possible, use the HTML <aside>
     element"_](https://developer.mozilla.org/en-US/docs/Web/Accessibility/ARIA/Roles/complementary_role) */
    ElementMatcher.ByRole("complementary"),
)

/**
 * One entry of what this file knows about a kind of element: the [matcher] that identifies the element, together
 * with the properties recorded for it (currently only [isInline]).
 */
private class ElementKnowledge(
    /** The matcher identifying the elements that this entry describes. */
    val matcher: ElementMatcher.CompositionMarkerInterfaces.NonCompositeMatcher,
    /**
     * `true` here means that the element does not create its own paragraph but is treated as one part of the
     * paragraph's text.
     */
    val isInline: Boolean,
)

/**
 * The list of [ElementKnowledge] entries. For now every entry is a tag-name matcher flagged as inline.
 */
private val elementsKnowledge = sequence {
    /* TODO(slawomir) - for now I consume this as a `set` of these element names, so back to square 1, but the reason
        for the indirection through [ElementKnowledge] is because I intend to evolve this into a list of
        [ElementKnowledge], where some elements are matched by other ways than a name. Each [ElementKnowledge] would
        appear only once, and would have multiple properties about belonging to different sets, where each such
        decision would come with a justificatory comment where helpful.
     */
    val inlineTagNames = setOf(
        /* This is not a standard, but our own derogation, to make table cells inline. This is in the light of not
         having any tables support and will perhaps change in future. For now, since all we have is paragraphs, this
         'inlining' of `<td>`s avoids making a paragraph for each table cell.
         TODO <td>s often don't have whitespace so need to add one in between if there isn't to prevent joining words */
        "td",

        /* NOTE(review): the remaining table elements - presumably inlined for the same reason as `td`; confirm. */
        "tr",
        "th",
        "table",

        /* Below are inline elements taken from
           [here](https://developer.mozilla.org/en-US/docs/Web/HTML/Inline_elements#list_of_inline_elements)
           Some aren't necessary here as we don't render them in our view, and they have no text-nodes, but we keep the
           full list to be able to easily bring it up-to-date with the source.
           NOTE(review): `pre` does not look like an entry of that list - presumably our own addition; confirm.
         */
        "a",
        "abbr",
        "acronym",
        "audio",
        "b",
        "bdi",
        "bdo",
        "big",
        "br",
        "button",
        "canvas",
        "cite",
        "code",
        "data",
        "datalist",
        "del",
        "dfn",
        "em",
        "embed",
        "i",
        "iframe",
        "img",
        "input",
        "ins",
        "kbd",
        "label",
        "map",
        "mark",
        "meter",
        "object",
        // "picture", // usually contains <img>, so we prefer to pass-thru here
        "output",
        "pre",
        "progress",
        "q",
        "ruby",
        "s",
        "samp",
        "script",
        "select",
        "slot",
        "small",
        "span",
        "strong",
        "sub",
        "sup",
        "svg",
        "template",
        "textarea",
        "time",
        "u",
        "tt",
        "var",
        "video",
        "wbr",

        /* NOTE(review): list elements - these do not look like entries of the inline-elements list referenced above,
         so presumably another derogation of ours; confirm. */
        "ul",
        "ol",
        "li",
    )

    for (tagName in inlineTagNames) {
        yield(
            ElementKnowledge(
                matcher = ElementMatcher.ByTagName(tagName)
                    as ElementMatcher.CompositionMarkerInterfaces.NonCompositeMatcher,
                isInline = true,
            ),
        )
    }
}.toList()

/** Composite matcher built from the matchers of every [elementsKnowledge] entry whose `isInline` is `true`. */
private val inlineElementMatcher = elementsKnowledge
    .asSequence()
    .filter { knowledge -> knowledge.isInline }
    .map { knowledge -> knowledge.matcher }
    .let { inlineMatchers -> ElementMatcher.CompositeAnyOf.fromScalarMatchers(inlineMatchers) }
