package com.speechify.client.internal.util.text.groupingToWords

/**
 * Matches exactly one character in the range U+4E00..U+9FFF (CJK unified ideographs).
 *
 * An ideogram is a graphic symbol that represents an idea or concept, independent of any particular language
 * and of specific words or phrases. Here we are interested in CJK (Chinese, Japanese, Korean) characters.
 *
 * Several patterns could have been used, but only the explicit range is supported on every platform:
 * 1. `\\p{IsIdeographic}` works on Android but does not work on JS (JavaScript).
 * 2. `\\p{InCJK_UNIFIED_IDEOGRAPHS}` works on the Java platform only; it does not work on JS or Native (iOS).
 * 3. `\\p{Script=Han}` does not work on iOS/macOS.
 * 4. `[\\u4E00-\\u9FFF]` is supported on all platforms. The range was found at
 *    https://github.com/JetBrains/kotlin/blob/75b4469757adef2fef8f7084ede075b48dedeb80/libraries/stdlib/native-wasm/src/kotlin/text/regex/AbstractCharClass.kt#L571
 *    as the value of CJKUNIFIEDIDEOGRAPHS.
 */
private val IDEOGRAM_REGEX = Regex("""[\u4E00-\u9FFF]""")

/**
 * Quotation-mark characters that word splitting treats as separators, in addition to whitespace.
 * Some marks are listed more than once in the literal; the resulting set holds each of them once.
 *
 * NOTE: Other punctuation is not a separator, so the words will include accompanying punctuation,
 * e.g. `(something` or `another,`.
 */
val QUOTATION_MARKS: Set<Char> = "\"«»„“‘’‹›﹃﹄﹁﹂「」『』»«›‹”".toSet()

/**
 * Lazily splits the receiver into words and yields the index range of every word.
 *
 * - Whitespace and the characters in [QUOTATION_MARKS] separate words and are never part of a yielded range.
 * - Every character matching [IDEOGRAM_REGEX] is yielded as a one-character word of its own.
 * - Any other punctuation stays attached to its word, e.g. `(something` or `another,`.
 *
 * Scanning starts at [fromIndex], so a word that began before it is yielded from [fromIndex] onwards.
 * A word still open when the scan stops is yielded only if it is complete, i.e. it is followed by the end of
 * the text, a separator or an ideogram; a word cut in half by [toIndexInclusive] is dropped.
 *
 * @param fromIndex index of the first character to scan.
 * @param toIndexInclusive index of the last character to scan; values past the last index behave like the
 * last index.
 * @return the ranges of the words, in order of appearance.
 */
internal fun CharSequence.getWordsWithPunctuationAsIndexRanges(
    fromIndex: Int = 0,
    toIndexInclusive: Int = this.lastIndex,
): Sequence<IntRange> {
    val text = this

    // Indices outside the text read as whitespace, so they can never throw and always count as a boundary.
    fun charAt(index: Int): Char = text.getOrNull(index) ?: ' '

    fun isSeparator(char: Char): Boolean = char.isWhitespace() || char in QUOTATION_MARKS

    fun isIdeogram(char: Char): Boolean = IDEOGRAM_REGEX.matches(char.toString())

    return sequence {
        // Never scan past the text: out-of-range characters are plain whitespace and add nothing, while an
        // unclamped bound such as Int.MAX_VALUE would overflow the index and loop forever.
        val lastScannedIndex = minOf(toIndexInclusive, text.lastIndex)

        var wordStartIndex = fromIndex
        var index = fromIndex

        while (index <= lastScannedIndex) {
            val char = charAt(index)
            val separator = isSeparator(char)
            val ideogram = !separator && isIdeogram(char)

            if (separator || ideogram) {
                // Both kinds of boundary close the word collected so far, if there is one.
                if (wordStartIndex < index) {
                    yield(wordStartIndex until index)
                }
                // An ideogram is additionally a word in itself.
                if (ideogram) {
                    yield(index until index + 1)
                }
                wordStartIndex = index + 1
            }

            index++
        }

        // A word may still be open when the scan stops. Yield it only if it is complete, using the same
        // boundary rules as the loop above: the next character is the end of the text (read as whitespace),
        // a separator, or an ideogram starting a word of its own. See SpeechWordCountTest.
        if (wordStartIndex < index) {
            val following = charAt(index)
            if (isSeparator(following) || isIdeogram(following)) {
                yield(wordStartIndex until index)
            }
        }
    }
}

/**
 * Lazily yields the words of the receiver as character sequences.
 *
 * The words are the slices of the receiver described by the ranges that
 * [getWordsWithPunctuationAsIndexRanges] produces for the same arguments.
 *
 * @param fromIndex forwarded unchanged to [getWordsWithPunctuationAsIndexRanges].
 * @param toIndexInclusive forwarded unchanged to [getWordsWithPunctuationAsIndexRanges].
 * @return one slice of the receiver per word range, in the same order.
 */
internal fun CharSequence.getWordsWithPunctuation(
    fromIndex: Int = 0,
    toIndexInclusive: Int = this.lastIndex,
): Sequence<CharSequence> {
    val wordRanges = getWordsWithPunctuationAsIndexRanges(
        fromIndex = fromIndex,
        toIndexInclusive = toIndexInclusive,
    )
    return wordRanges.map { wordRange -> slice(wordRange) }
}

/**
 * Counts the words of this string, i.e. the number of ranges that
 * [getWordsWithPunctuationAsIndexRanges] yields for the whole string.
 */
internal fun String.wordCount(): Int {
    val wordRanges = getWordsWithPunctuationAsIndexRanges()
    return wordRanges.count()
}
