package com.speechify.client.helpers.content.standard.html

import com.speechify.client.api.content.ContentCursor
import com.speechify.client.api.content.ContentIndex
import com.speechify.client.api.content.ContentText
import com.speechify.client.api.content.ContentTextUtils
import com.speechify.client.api.content.TextElementContentSlice
import com.speechify.client.api.content.view.standard.StandardBlock
import com.speechify.client.api.content.view.standard.StandardBlocks
import com.speechify.client.api.content.view.standard.StandardElement
import com.speechify.client.api.content.view.standard.StandardView
import com.speechify.client.api.content.view.web.WebPage
import com.speechify.client.api.content.view.web.WebPageNode
import com.speechify.client.api.content.view.web.WebPageView
import com.speechify.client.api.content.view.web.getAttribute
import com.speechify.client.api.util.Callback
import com.speechify.client.api.util.fromCo
import com.speechify.client.api.util.successfully
import com.speechify.client.helpers.content.standard.ContentSequenceCharacteristicsOfImmutableAlwaysLiveNoUserEffectContent
import com.speechify.client.helpers.content.standard.StandardBlockChunking
import com.speechify.client.helpers.content.standard.html.contentExtractionRules.RelevantContentExtractionRules
import com.speechify.client.helpers.content.standard.html.contentExtractionRules.allPagesRules
import com.speechify.client.helpers.content.standard.html.contentExtractionRules.knownPages.getRulesForSourceUrl
import com.speechify.client.internal.sync.coLazy
import com.speechify.client.internal.util.extensions.collections.groupConsecutiveBy
import com.speechify.client.internal.util.extensions.intentSyntax.isNotNullAnd
import com.speechify.client.internal.util.www.parseUrl
import kotlin.js.JsExport

/**
 * Maximum text length allowed in a single text-only paragraph; passed as the limit to
 * [chunkIfItsSingleParagraphWithLargeTextContent] when the blocks of a web page are prepared.
 */
private const val textLengthLimitInSingleParagraph = 3000

/**
 * Adapts a [WebPageView] so that it can be consumed through the [StandardView] interface.
 *
 * Note: We implement [ContentIndex] here instead of in [WebPageView] because [ContentIndex] is simpler to implement
 * given the [StandardBlock] models than the recursive [WebPageNode] models.
 *
 * @param webPageView the view providing the web page and the overall start/end cursors.
 * @param shouldUseRichBlocksParsing when `true` blocks are produced by [getRichBlocksFromWebPage], otherwise by
 *  [getBlocksFromWebPage].
 */
@JsExport
open class WebPageStandardView internal constructor(
    private val webPageView: WebPageView,
    private val shouldUseRichBlocksParsing: Boolean = false,
) :
    StandardView,
    ContentSequenceCharacteristicsOfImmutableAlwaysLiveNoUserEffectContent {
    override val start: ContentCursor = webPageView.start
    override val end: ContentCursor = webPageView.end

    // The Web Page content never changes, so parsing and chunking are computed lazily and memoized.
    private val standardBlockChunking = coLazy {
        val webPage = webPageView.getWebPage()
        val parsedBlocks = if (shouldUseRichBlocksParsing) {
            getRichBlocksFromWebPage(webPage)
        } else {
            getBlocksFromWebPage(webPage)
        }

        // Oversized paragraphs are split here (inside `coLazy`) rather than in `StandardBlockChunking`, so that the
        // splitting happens only once.
        val paragraphSizedBlocks = parsedBlocks.toList().flatMap { block ->
            block.chunkIfItsSingleParagraphWithLargeTextContent(textLengthLimitInSingleParagraph)
        }
        val allStandardBlocks = StandardBlocks(
            blocks = paragraphSizedBlocks.toTypedArray(),
            start = start,
            end = end,
        )

        createStandardBlockChunking(
            chunkedStandardBlocks = allStandardBlocks,
            chunkSize = CHUNK_SIZE,
        )
    }

    /**
     * Factory for the [StandardBlockChunking] backing this view. It is `open` so that subclasses can supply a
     * different chunking.
     */
    internal open fun createStandardBlockChunking(
        chunkedStandardBlocks: StandardBlocks,
        chunkSize: Int,
    ) = StandardBlockChunking(
        allBlocks = chunkedStandardBlocks,
        chunkSize = chunkSize,
    )

    /**
     * Delivers to [callback] the blocks that the underlying [StandardBlockChunking] reports around [cursor].
     */
    override fun getBlocksAroundCursor(
        cursor: ContentCursor,
        callback: Callback<StandardBlocks>,
    ) =
        callback.fromCo {
            val chunking = standardBlockChunking.get()
            chunking.getBlocksAroundCursor(cursor).successfully()
        }

    /**
     * Delivers to [callback] every block that overlaps the `[start, end]` range. The resulting [StandardBlocks] spans
     * from the first matching block's start to the last matching block's end, falling back to the requested cursors
     * when no block matches.
     */
    override fun getBlocksBetweenCursors(start: ContentCursor, end: ContentCursor, callback: Callback<StandardBlocks>) =
        callback.fromCo {
            val overlappingBlocks = standardBlockChunking.get().allBlocks.blocks
                .filter { block -> !(block.end.isBefore(start) || block.start.isAfter(end)) }

            StandardBlocks(
                overlappingBlocks.toTypedArray(),
                overlappingBlocks.firstOrNull()?.start ?: start,
                overlappingBlocks.lastOrNull()?.end ?: end,
            ).successfully()
        }

    override fun destroy() = Unit // NOOP

    companion object {
        internal const val CHUNK_SIZE = 25
    }
}

/**
 * Converts [webPage] into a sequence of "rich" [StandardBlock]s, i.e. blocks whose content is a tree of
 * [StandardElement]s (headings, links, tables, lists, images, ...) rather than flattened text.
 *
 * The algorithm:
 * 1. Walks the node tree and pairs every top-level inline node of interest with its closest ancestor that is not an
 *    inline element (nodes matched by the `ignored` extraction rule are dropped together with their descendants).
 * 2. Groups consecutive inline nodes that share the same ancestor.
 * 3. Cleans up whitespace within each group and maps the nodes to [StandardElement]s.
 * 4. Wraps each non-empty group into a [StandardBlock.Heading] (for `<hN>` ancestors) or a [StandardBlock.Paragraph].
 *
 * @param webPage the page to convert; its `sourceUrl` selects site-specific extraction rules and is used as the base
 *  for resolving relative image and link URLs.
 * @return a sequence of blocks in document order.
 */
internal fun getRichBlocksFromWebPage(webPage: WebPage): Sequence<StandardBlock> {
    val applicableRules: RelevantContentExtractionRules =
        allPagesRules + (webPage.sourceUrl?.let { getRulesForSourceUrl(it) })
    val shouldIgnoreWithAllDescendants = applicableRules.ignored?.let { it::isMatching } ?: { false }
    val isInline = applicableRules.inline?.let { it::isMatching } ?: { false }

    // Inline elements are exactly the ones that must not become the "closest block ancestor", so both predicates
    // are backed by the same rule.
    val shouldIgnoreAsAncestor = isInline

    // The following HTML tags have structural significance, so they can have empty or missing text/children
    val tagsWithStructuralSignificance = setOf("img", "td", "th")

    // Regexes are compiled once here instead of once per visited node.
    val headingTagRegex = Regex("[hH]\\d")
    val whitespaceAroundLineBreakRegex = Regex("[ \t]*[\r\n][ \t]*")
    val lineBreakRegex = Regex("[\r\n]")
    val consecutiveSpacesRegex = Regex(" +")

    /**
     * `true` for non-empty text nodes and for elements that match the `inline` rule without matching the `ignored`
     * rule.
     */
    fun isInlineNodeOfInterest(node: WebPageNode): Boolean {
        return when (node) {
            // Ignore leaf text nodes that are zero-length to make the checks in upper layers a bit easier to reason
            // about in terms of visible content
            is WebPageNode.Text -> node.text.text.isNotEmpty()
            is WebPageNode.Element -> !shouldIgnoreWithAllDescendants(node) && isInline(node)
        }
    }

    // TODO(anson): cleanup the mental model of "inline", "block", "container", "interest", etc
    // TODO(anson): esp find a way to make the interaction between the skip/inline content matching rules and
    // the modeling/text-flattening/skipping in these algorithms
    /** `true` for `<p>` and `<hN>` elements, which are containers but are still allowed inside inline content. */
    fun isContainerNodeThatIsAlsoPermittedInline(node: WebPageNode): Boolean {
        return node is WebPageNode.Element && node.tagName.lowercase().let {
            it == "p" || it.matches(headingTagRegex)
        }
    }

    /**
     * Yields the shallowest nodes under (and including) [root] that are inline nodes of interest or permitted
     * containers; other elements are descended into, and ignored elements are skipped entirely.
     */
    fun getTopLevelInlineNodesOfInterest(root: WebPageNode): Sequence<WebPageNode> = sequence {
        when {
            root is WebPageNode.Element && shouldIgnoreWithAllDescendants(root) -> return@sequence
            isInlineNodeOfInterest(root) || isContainerNodeThatIsAlsoPermittedInline(root) -> yield(root)
            root is WebPageNode.Element -> yieldAll(root.children.flatMap { getTopLevelInlineNodesOfInterest(it) })
            root is WebPageNode.Text -> yield(root)
        }
    }

    /**
     * Reads the list style from the inline `style` attribute of [node].
     *
     * Looks for a `list-style:` declaration first and a `list-style-type:` declaration second. The declared value
     * (up to the next `;`) is trimmed, lowercased, stripped of a `-moz-` prefix and has `-` replaced with `_` before
     * being compared with the lowercased enum constant names.
     *
     * @return the matching style, or [default] when there is no `style` attribute, no declaration, or no enum
     *  constant with that name.
     */
    fun extractListStyle(node: WebPageNode.Element, default: StandardElement.List.ListStyle):
        StandardElement.List.ListStyle {
        val elementStyle = node.getAttribute("style") ?: return default

        // Taking the value relative to the declarator that was actually found avoids hard-coded lengths (the
        // previous length for `list-style:` was off by one, dropping the first character of values written without
        // a space and throwing when the declaration was the last thing in the attribute).
        val declarator = listOf("list-style:", "list-style-type:").firstOrNull { it in elementStyle }
            ?: return default

        val listStyle = elementStyle
            .substringAfter(declarator)
            .substringBefore(';')
            .trim()
            .lowercase()
            .removePrefix("-moz-")
            .replace('-', '_')

        return StandardElement.List.ListStyle.values().firstOrNull { it.name.lowercase() == listStyle } ?: default
    }

    /**
     * Reads a `rowspan`/`colspan`-like attribute. Missing or non-numeric values fall back to `1` instead of
     * aborting the parsing of the whole page with a [NumberFormatException].
     */
    fun readSpanAttribute(node: WebPageNode.Element, attributeName: String): Int =
        node.getAttribute(attributeName)?.trim()?.toIntOrNull() ?: 1

    /**
     * Builds the element for a `<ul>`/`<ol>` [node] from its already converted children.
     *
     * We encountered web pages that have a list tag with no list items
     * (Ex: https://www.brookings.edu/articles/how-artificial-intelligence-is-transforming-the-world/), in which case
     * the children are returned as they are instead of an empty list element.
     */
    fun listElementOrChildren(
        node: WebPageNode.Element,
        inlineBlocks: List<StandardElement>,
        defaultStyle: StandardElement.List.ListStyle,
    ): List<StandardElement> {
        val listItems = inlineBlocks.filterIsInstance<StandardElement.List.ListItem>()
        if (listItems.isEmpty()) {
            return inlineBlocks
        }
        return listOf(StandardElement.List(listItems, extractListStyle(node, defaultStyle)))
    }

    /**
     * Maps an inline [node] (and, recursively, its children) to [StandardElement]s.
     *
     * Elements without a dedicated mapping are "transparent": their converted children are returned directly.
     *
     * @return `emptyList` if this is an Element node that is expected to have children but has none (unless the tag
     *  has structural significance), or an `<img>` without a usable `src`.
     */
    fun inlineNodeToStandardElement(node: WebPageNode): List<StandardElement> {
        return when (node) {
            is WebPageNode.Element -> {
                when (val tagName = node.tagName.lowercase()) {
                    // Handle "void" elements with no children
                    "img" -> {
                        val src = node.getAttribute("src") ?: return emptyList()
                        val altText = node.getAttribute("alt")

                        val url = parseUrl(src, relativeToBaseUrl = webPage.sourceUrl) ?: return emptyList()
                        listOf(
                            StandardElement.Image.Remote(
                                url = url.toString(),
                                altText = altText,
                                reference = node.ref,
                            ),
                        )
                    }

                    // Handle <br> tags. We want to represent these as a newline character in the output
                    "br" -> listOf(StandardElement.Text(TextElementContentSlice.fromTextElement(node.ref, "\n")))

                    // Handle elements that we expect to have children
                    else -> {
                        val inlineBlocks = node.children
                            .flatMap(::getTopLevelInlineNodesOfInterest)
                            .map(::inlineNodeToStandardElement)
                            .flatten()

                        if (inlineBlocks.isEmpty() && !tagsWithStructuralSignificance.contains(tagName)) {
                            return emptyList()
                        }

                        when {
                            tagName.matches(headingTagRegex) -> listOf(
                                StandardElement.Heading(
                                    level = tagName[1].digitToInt(),
                                    inlineBlocks,
                                ),
                            )

                            tagName == "p" -> listOf(StandardElement.Paragraph(inlineBlocks))
                            tagName == "u" -> listOf(StandardElement.Underlined(inlineBlocks))
                            tagName == "i" -> listOf(StandardElement.Italics(inlineBlocks))
                            tagName == "strong" -> listOf(StandardElement.Bold(inlineBlocks))
                            tagName == "a" -> {
                                val href = node.attributes.find { it.name == "href" }?.value ?: ""
                                // A link whose target cannot be parsed is replaced by its content.
                                val url = parseUrl(href, relativeToBaseUrl = webPage.sourceUrl) ?: return inlineBlocks

                                listOf(
                                    StandardElement.Anchor.External(
                                        url = url.toString(),
                                        inlineBlocks,
                                    ),
                                )
                            }

                            tagName == "table" ->
                                listOf(
                                    StandardElement.Table(inlineBlocks.filterIsInstance<StandardElement.Table.Row>()),
                                )

                            tagName == "tr" ->
                                listOf(
                                    StandardElement.Table.Row(
                                        inlineBlocks.filterIsInstance<StandardElement.Table.Cell>(),
                                    ),
                                )

                            tagName == "td" -> listOf(
                                StandardElement.Table.Cell.body(
                                    inlineBlocks,
                                    node.ref,
                                    readSpanAttribute(node, "rowspan"),
                                    readSpanAttribute(node, "colspan"),
                                ),
                            )

                            tagName == "th" -> listOf(
                                StandardElement.Table.Cell.header(
                                    inlineBlocks,
                                    node.ref,
                                    readSpanAttribute(node, "rowspan"),
                                    readSpanAttribute(node, "colspan"),
                                ),
                            )

                            tagName == "code" -> listOf(StandardElement.Code(inlineBlocks))
                            tagName == "ul" ->
                                listElementOrChildren(node, inlineBlocks, StandardElement.List.ListStyle.DISC)

                            tagName == "ol" ->
                                listElementOrChildren(node, inlineBlocks, StandardElement.List.ListStyle.DECIMAL)

                            tagName == "li" -> listOf(StandardElement.List.ListItem(inlineBlocks))

                            else -> inlineBlocks
                        }
                    }
                }
            }

            is WebPageNode.Text -> {
                listOf(StandardElement.Text(node.text))
            }
        }
    }

    /**
     * Walks the tree under [root] and yields every top-level inline node of interest (plus every text node reached
     * while descending) paired with its closest ancestor that does not match the `inline` rule, or `null` when there
     * is no such ancestor. Subtrees matching the `ignored` rule are skipped.
     */
    fun getTopLevelInlineNodesOfInterestWithClosestAncestorBlockNode(root: WebPageNode):
        Sequence<Pair<WebPageNode, WebPageNode.Element?>> =
        sequence {
            when (root) {
                is WebPageNode.Text -> {
                    yield(root to null)
                }

                is WebPageNode.Element -> {
                    fun getResultRecursively(
                        curNode: WebPageNode,
                        closestAncestorBlockNode: WebPageNode.Element?,
                    ): Sequence<Pair<WebPageNode, WebPageNode.Element?>> =
                        sequence innerSequence@{
                            when (curNode) {
                                is WebPageNode.Text -> yield(curNode to closestAncestorBlockNode)
                                is WebPageNode.Element -> {
                                    if (shouldIgnoreWithAllDescendants(curNode)) return@innerSequence

                                    // Inline elements pass their own block ancestor on to their children.
                                    val closestNotIgnoredAncestorOfChildren =
                                        if (shouldIgnoreAsAncestor(curNode)) {
                                            closestAncestorBlockNode
                                        } else {
                                            curNode
                                        }

                                    for (child in curNode.children) {
                                        when {
                                            // Inline nodes are yielded whole; their subtree is converted later.
                                            isInlineNodeOfInterest(child) -> yield(
                                                Pair(child, closestNotIgnoredAncestorOfChildren),
                                            )

                                            else -> yieldAll(
                                                getResultRecursively(child, closestNotIgnoredAncestorOfChildren),
                                            )
                                        }
                                    }
                                }
                            }
                        }
                    yieldAll(getResultRecursively(root, null))
                }
            }
        }

    /**
     * Applies the whitespace processing steps described in
     * https://developer.mozilla.org/en-US/docs/Web/API/Document_Object_Model/Whitespace#how_does_css_process_whitespace
     * to a single text node's raw text.
     */
    fun collapseWhitespace(rawText: String): String =
        rawText
            // 1. First, all spaces and tabs immediately before and after a line break are ignored
            .replace(whitespaceAroundLineBreakRegex, "\n")
            // 2. Next, all tab characters are handled as space characters,
            .replace('\t', ' ')
            // 3. Next, line breaks are converted to spaces:
            .replace(lineBreakRegex, " ")
            // 4. After that, any space immediately following another space is ignored (the "even across two
            //    separate inline elements" part is handled by the caller through leading-whitespace trimming)
            .replace(consecutiveSpacesRegex, " ")

    /**
     * Returns a copy of these nodes with whitespace normalized across the whole group: whitespace is collapsed
     * inside text nodes, removed at the very beginning of the group, after a `<br>`, after text that already ends
     * with whitespace, and at the end of the last top-level text node. Text nodes that end up empty and elements that
     * end up without children are dropped, except for `<br>` and tags with structural significance. `<code>` subtrees
     * are left untouched and `<pre>` elements are replaced by their untouched children.
     */
    fun List<WebPageNode>.cleanupWhitespace(): List<WebPageNode> {
        /**
         * Yields each cleaned node paired with a flag telling whether the text that follows it should have its
         * leading whitespace trimmed.
         */
        fun helper(
            nodes: List<WebPageNode>,
            initialShouldTrimLeadingWhitespace: Boolean,
            shouldTrimTrailingWhitespace: Boolean,
        ): Sequence<Pair<WebPageNode, Boolean>> =
            sequence {
                var shouldTrimLeadingWhitespace = initialShouldTrimLeadingWhitespace
                nodes.forEachIndexed { ix, node ->
                    val isLastChild = ix == nodes.lastIndex
                    when (node) {
                        is WebPageNode.Element -> {
                            val tagName = node.tagName.lowercase()

                            // don't clean the subtree for whitespace-preserving elements
                            if (tagName == "code") {
                                // for <code> elements we don't want to remove the surrounding <code></code> block
                                yield(node to false)
                                return@forEachIndexed
                            }
                            if (tagName == "pre") {
                                yieldAll(node.children.map { it to false })
                                return@forEachIndexed
                            }

                            val children = helper(
                                node.children.toList(),
                                initialShouldTrimLeadingWhitespace = shouldTrimLeadingWhitespace,
                                shouldTrimTrailingWhitespace = false,
                            ).toList()

                            if (children.isEmpty()) {
                                when {
                                    // Don't omit special elements on the basis of having no useful text!!
                                    tagsWithStructuralSignificance.contains(tagName) -> {
                                        yield(node.copy(children = emptyArray()) to shouldTrimLeadingWhitespace)
                                    }

                                    tagName == "br" -> {
                                        yield(node to false)
                                        // Whitespace at the start of the new line is not visible.
                                        shouldTrimLeadingWhitespace = true
                                    }

                                    else -> {}
                                }
                            } else {
                                val shouldTrimAfterLastChild = children.last().second
                                yield(
                                    node.copy(
                                        children = children.map { it.first }.toTypedArray(),
                                    ) to shouldTrimAfterLastChild,
                                )
                                shouldTrimLeadingWhitespace = shouldTrimAfterLastChild
                            }
                        }

                        is WebPageNode.Text -> {
                            val collapsedText = collapseWhitespace(node.rawText)
                            val leadingTrimmedText =
                                if (shouldTrimLeadingWhitespace) collapsedText.trimStart() else collapsedText
                            val cleanText =
                                if (shouldTrimTrailingWhitespace && isLastChild) {
                                    leadingTrimmedText.trimEnd()
                                } else {
                                    leadingTrimmedText
                                }

                            if (cleanText.isNotEmpty()) {
                                val endsWithWhitespace = cleanText.last().isWhitespace()
                                yield(node.copy(rawText = cleanText) to endsWithWhitespace)
                                shouldTrimLeadingWhitespace = endsWithWhitespace
                            }
                        }
                    }
                }
            }

        return helper(
            this@cleanupWhitespace,
            initialShouldTrimLeadingWhitespace = true,
            shouldTrimTrailingWhitespace = true,
        ).map { (cleanNode, _) -> cleanNode }.toList()
    }

    /**
     * Builds the block for a group of consecutive inline nodes sharing [closestAncestorBlockNode].
     *
     * @return `null` when none of the nodes could be mapped to a [StandardElement], a [StandardBlock.Heading] when
     *  the ancestor is an `<hN>` element, and a [StandardBlock.Paragraph] otherwise (including when there is no
     *  ancestor).
     */
    fun inlineNodesAndClosestAncestorBlockNodeToStandardBlock(
        inlineNodes: List<WebPageNode>,
        closestAncestorBlockNode: WebPageNode.Element?,
    ): StandardBlock? {
        val inlineBlocks = inlineNodes
            .cleanupWhitespace()
            .map(::inlineNodeToStandardElement)
            .flatten()

        val headingTagName = closestAncestorBlockNode?.tagName?.takeIf { it.matches(headingTagRegex) }

        return when {
            // Omit this subtree if none of the top-level inline nodes could be mapped to a standard inline block
            inlineBlocks.isEmpty() -> null

            headingTagName != null ->
                StandardBlock.Heading(level = headingTagName.last().digitToInt(), inlineBlocks)

            // default to Paragraph for all other nodes for now, and when there is no semantic parent for this group
            else -> StandardBlock.Paragraph(inlineBlocks)
        }
    }

    return getTopLevelInlineNodesOfInterestWithClosestAncestorBlockNode(webPage.root)
        .groupConsecutiveBy { it.second }
        .mapNotNull { (block, pairs) ->
            inlineNodesAndClosestAncestorBlockNodeToStandardBlock(
                inlineNodes = pairs.map { it.first },
                closestAncestorBlockNode = block,
            )
        }
}

/**
 * Produces the plain-text [StandardBlock]s of [webPage] by delegating to [getBlocksFromRootElement] with the page's
 * root element and source URL.
 */
internal /* #InternalForTests */
fun getBlocksFromWebPage(webPage: WebPage): Sequence<StandardBlock> {
    return getBlocksFromRootElement(root = webPage.root, sourceUrl = webPage.sourceUrl)
}

/**
 * Produces plain-text [StandardBlock]s from the tree under [root].
 *
 * Every descendant text node is paired with its closest ancestor that is not an inline element, consecutive text
 * nodes sharing that ancestor are merged into one text (trimmed, with repeated whitespace collapsed), and each
 * non-empty text becomes a [StandardBlock.Heading] when the shared ancestor is an `<hN>` element, or a
 * [StandardBlock.Paragraph] otherwise.
 *
 * @param root the element whose descendants are converted.
 * @param sourceUrl URL of the page, used to look up site-specific extraction rules; may be `null`.
 */
internal fun getBlocksFromRootElement(
    root: WebPageNode.Element,
    sourceUrl: String?,
): Sequence<StandardBlock> {
    val rulesForPage: RelevantContentExtractionRules =
        allPagesRules + (sourceUrl?.let { getRulesForSourceUrl(it) })
    val headingTagRegex = Regex("h\\d")

    val textNodesWithClosestAncestor = getAllDescendantTextNodesPairedWithClosestMatchingAncestor(
        rootElt = root,
        shouldIgnoreWithAllDescendants = rulesForPage.ignored?.let { it::isMatching },
        shouldIgnoreAsAncestor = rulesForPage.inline?.let { it::isMatching },
        /* Note that it's a whitelist of inline
                    elements, so any unknown element is assumed a block-element */

        /* Another note is that when there's a block element inside an inline element
        (e.g. `<span>text <p> more</p> text2</span>`) we will create 3 paragraphs, but this seems justified, at
        least for known block elements. For custom ones, it's hard to see without CSS. Once found an 'undesirably
        block element', we can consider: explicitly adding it here to make inline, forcing no-blocks under certain
        elements (to be implemented), or developing importing with CSS ([PLT-1944 'Full Web Page in single file' support]
        or using Browser Extension to preprocess import).
        */
    )

    return textNodesWithClosestAncestor
        .groupConsecutiveBy(
            /* Need to use groupConsecutive because `groupBy` would change order if there was ever a
                    `<div>ChildText<p>GrandChildText</p>ChildText</div>` and would put one of `ChildText`s on the other side of GC
                   */
            { (_, closestMatchingAncestor) -> closestMatchingAncestor },
            valueTransform = { (textNode, _) -> textNode },
        )
        .mapNotNull { (sharedClosestMatchingAncestor, textNodes) ->
            val textFragments = textNodes
                .map { it.text }
                .trimmedAtHeadAndTail()
                .withMultiWhitespaceCollapsed()
                .toList()

            // Don't yield elements with only blank nodes
            if (textFragments.isEmpty()) return@mapNotNull null

            val text = ContentTextUtils.concat(textFragments)

            /* TODO - see if <hX>s ever have block children. If so, we could solve by introducing something that
                 forces to be the last block-creating ancestor (forceAllDescendantsTInline), or we could maintain
                 references to parents in nodes and find any header-type ancestor through these links
             */
            val isUnderHeading =
                sharedClosestMatchingAncestor.isNotNullAnd { tagName.lowercase().matches(headingTagRegex) }

            if (isUnderHeading) StandardBlock.Heading(text) else StandardBlock.Paragraph(text)
        }
}

/**
 * Lazily walks the tree under [rootElt] in document order and yields every descendant text node together with its
 * closest ancestor element that is not rejected by [shouldIgnoreAsAncestor].
 *
 * @param rootElt element to start from; it is itself a candidate ancestor.
 * @param shouldIgnoreWithAllDescendants when it returns `true` for an element, that element's whole subtree is
 *  skipped. `null` means nothing is skipped.
 * @param shouldIgnoreAsAncestor when it returns `true` for an element, its children are paired with the ancestor
 *  inherited from above instead of with the element itself. `null` means every element can be an ancestor.
 * @return pairs of text node and ancestor; the ancestor is `null` only when every element from [rootElt] down to the
 *  text node's parent was rejected by [shouldIgnoreAsAncestor].
 */
private fun getAllDescendantTextNodesPairedWithClosestMatchingAncestor(
    rootElt: WebPageNode.Element,
    shouldIgnoreWithAllDescendants: ((elt: WebPageNode.Element) -> Boolean)?,
    shouldIgnoreAsAncestor: ((elt: WebPageNode.Element) -> Boolean)?,
): Sequence<Pair<WebPageNode.Text, WebPageNode.Element?>> {
    fun collectTextNodes(
        element: WebPageNode.Element,
        inheritedAncestor: WebPageNode.Element?,
    ): Sequence<Pair<WebPageNode.Text, WebPageNode.Element?>> =
        sequence {
            if (shouldIgnoreWithAllDescendants?.invoke(element) == true) {
                return@sequence
            }

            val ancestorForChildren =
                if (shouldIgnoreAsAncestor?.invoke(element) == true) inheritedAncestor else element

            for (childNode in element.children) {
                when (childNode) {
                    is WebPageNode.Text -> yield(childNode to ancestorForChildren)
                    is WebPageNode.Element -> yieldAll(collectTextNodes(childNode, ancestorForChildren))
                }
            }
        }

    return collectTextNodes(rootElt, null)
}

/**
 * Collapses strings like `"  "` (double space) and `" \n"` (space and newline) into a single " ", even if the
 * duplication is in two adjacent fragments.
 * As per [HTML behavior](https://developer.mozilla.org/en-US/docs/Web/API/Document_Object_Model/Whitespace)
 *
 * Empty fragments are dropped. When two adjacent fragments meet with whitespace on both sides of the boundary, the
 * trailing whitespace character of the earlier fragment is the one that gets removed.
 */
internal fun Sequence<ContentText>.withMultiWhitespaceCollapsed() = sequence {
    val fragments = this@withMultiWhitespaceCollapsed
        .filter { it.text.isNotEmpty() }
        // Runs of whitespace inside a single fragment
        .map { it.replaceAll("\\s\\s+") { " " } }
        .iterator()

    if (!fragments.hasNext()) return@sequence

    // Each fragment is held back until its successor is known, because the successor decides about trimming.
    var pendingFragment = fragments.next()

    while (fragments.hasNext()) {
        val upcomingFragment = fragments.next()
        val isWhitespaceOnBothSidesOfBoundary =
            pendingFragment.text.last().isWhitespace() && upcomingFragment.text.first().isWhitespace()

        when {
            // Nothing duplicated at the boundary - the fragment goes out unchanged
            !isWhitespaceOnBothSidesOfBoundary -> yield(pendingFragment)

            // Drop the trailing whitespace character of the pending fragment (an arbitrary choice of side)
            pendingFragment.text.length > 1 -> yield(pendingFragment.slice(0, pendingFragment.length - 1))

            // Otherwise the pending fragment is a lone whitespace character: trimming removes it entirely
        }
        pendingFragment = upcomingFragment
    }

    yield(pendingFragment)
}

/**
 * The counterpart of [String.trim] for text that is split into fragments:
 * * blank fragments at the head and at the tail are removed entirely,
 * * the first remaining fragment loses its leading whitespace and the last remaining fragment loses its trailing
 *   whitespace (a single remaining fragment is trimmed on both sides).
 *
 * Fragments in between are yielded unchanged.
 */
internal fun Iterable<ContentText>.trimmedAtHeadAndTail(): Sequence<ContentText> = sequence {
    val fragments = this@trimmedAtHeadAndTail
        .dropWhile { it.text.isBlank() }
        .dropLastWhile { it.text.isBlank() }

    if (fragments.isEmpty()) return@sequence

    fragments.forEachIndexed { index, originalFragment ->
        var fragment = originalFragment

        if (index == 0) {
            val firstNonWhitespaceIndex = fragment.text.indexOfFirst { !it.isWhitespace() }
            if (firstNonWhitespaceIndex != 0) {
                fragment = fragment.slice(startIndex = firstNonWhitespaceIndex, endIndex = fragment.length)
            }
        }

        if (index == fragments.lastIndex) {
            val lastNonWhitespaceIndex = fragment.text.indexOfLast { !it.isWhitespace() }
            if (lastNonWhitespaceIndex != fragment.length - 1) {
                fragment = fragment.slice(startIndex = 0, endIndex = lastNonWhitespaceIndex + 1)
            }
        }

        yield(fragment)
    }
}

/**
 * Splits a text-only Paragraph with very large text content into several smaller paragraphs.
 * There is a known use case where CE imports a huge text file as html with a few flat html text nodes with very
 * large content (tens of thousands of characters), which leads the App on iOS/Android to crash.
 * See [https://linear.app/speechify-inc/issue/CXP-3687/investigate-text-items-imported-via-chrome-extension-dont-play-on-ios]
 *
 * @param textLengthLimit maximum text length handed to the text and paragraph chunking helpers.
 * @return the receiver alone when it is not a [StandardBlock.Paragraph] or contains anything other than
 *  [StandardElement.Text]; otherwise the paragraphs rebuilt from the chunked text elements.
 */
internal fun StandardBlock.chunkIfItsSingleParagraphWithLargeTextContent(textLengthLimit: Int): List<StandardBlock> {
    if (this !is StandardBlock.Paragraph || !_elements.all { it is StandardElement.Text }) {
        return listOf(this)
    }

    // First break every text element down to the limit, then regroup the pieces into paragraphs.
    val sentenceSizedTexts = _elements.flatMap { element ->
        // The guard above established that every element is a Text one.
        (element as StandardElement.Text).chunkByCharLengthLimitAtNearestSentenceBoundary(limit = textLengthLimit)
    }

    return sentenceSizedTexts
        .chunkByLengthLimit(textLengthLimit)
        .map { paragraphTexts -> StandardBlock.Paragraph(paragraphTexts) }
}

/**
 * Groups consecutive [StandardElement.Text]s into chunks, starting a new chunk whenever adding the next element
 * would push the summed text length of the current chunk above [limit]. An element that alone exceeds [limit] still
 * gets a chunk of its own.
 */
private fun List<StandardElement.Text>.chunkByLengthLimit(limit: Int): List<List<StandardElement.Text>> {
    val chunks = mutableListOf<MutableList<StandardElement.Text>>()
    // Summed text length of the last chunk in `chunks`.
    var currentChunkLength = 0

    for (element in this) {
        val elementLength = element.text.length
        val currentChunk = chunks.lastOrNull()

        if (currentChunk == null || currentChunkLength + elementLength > limit) {
            chunks.add(mutableListOf(element))
            currentChunkLength = elementLength
        } else {
            currentChunk.add(element)
            currentChunkLength += elementLength
        }
    }

    return chunks
}

/**
 * Splits this [StandardElement.Text] into consecutive pieces of at most [limit] characters, cutting right after the
 * last `.`, `!` or `?` that still fits into the piece, or exactly at [limit] characters when no such delimiter fits.
 *
 * Rationale:
 * This function was implemented instead of utilizing existing sentence splitting utilities
 * primarily for optimization and performance reasons, particularly to prevent crashes in
 * mobile apps when dealing with large volumes of text. While accuracy is important, in this
 * specific context, the precise segmentation of sentences may not be critical.
 *
 * @param limit maximum length of each piece; must be positive when the text is longer than [limit].
 * @return the receiver alone when its text already fits into [limit], otherwise the pieces in order.
 * @throws IllegalArgumentException when the text needs splitting and [limit] is not positive (previously this case
 *  never advanced the slice boundary and so could not terminate normally).
 */
fun StandardElement.Text.chunkByCharLengthLimitAtNearestSentenceBoundary(limit: Int): List<StandardElement.Text> {
    if (text.length <= limit) return listOf(this)

    // Checked only after the early return, so that calls which never needed splitting keep working as before.
    require(limit > 0) { "limit must be positive to split a non-empty text, but was $limit" }

    val rawText = text.text
    val sentenceDelimiters = charArrayOf('.', '!', '?')

    // Lazily yields the index right after each sentence delimiter found at or after `startIndex`.
    fun getIndexesOfNaturalParagraphBreaks(startIndex: Int): Sequence<Int> = sequence {
        var currentIndex = startIndex
        while (currentIndex < rawText.length) {
            val index = rawText.indexOfAny(sentenceDelimiters, currentIndex)
            if (index == -1) return@sequence // No more delimiters found
            yield(index + 1)
            currentIndex = index + 1 // Move to the next character
        }
    }

    // Apply the "at most _limit_ chars" policy to the breaks, yielding another sequence of indexes
    val indexesOfSliceBoundaries: Sequence<Int> = sequence {
        var currentSliceBoundaryIndex = 0
        while (currentSliceBoundaryIndex < text.length) {
            yield(currentSliceBoundaryIndex)
            // Every break lies beyond the current boundary and `limit` is positive, so the boundary always advances.
            currentSliceBoundaryIndex = getIndexesOfNaturalParagraphBreaks(currentSliceBoundaryIndex)
                .takeWhile { it - currentSliceBoundaryIndex <= limit }.lastOrNull()
                ?: (currentSliceBoundaryIndex + limit)
        }
        // Closing boundary of the last piece; a hard cut may have overshot the end of the text.
        yield(currentSliceBoundaryIndex.coerceAtMost(text.length))
    }

    // Actually slice up the content based on the sequence of break indexes.
    return indexesOfSliceBoundaries
        .windowed(size = 2, step = 1, partialWindows = false)
        .map { (start, end) -> StandardElement.Text(text.slice(start, end)) }
        .toList()
}
