package com.speechify.client.internal.util.text.groupingToSentences

import com.speechify.client.api.util.text.currencySigns
import com.speechify.client.internal.util.extensions.numbers.ranges.width
import com.speechify.client.internal.util.text.groupingToSentences.internal.SpanFind
import com.speechify.client.internal.util.text.groupingToSentences.internal.sentenceTerminators.armenianFullStop
import com.speechify.client.internal.util.text.groupingToSentences.internal.sentenceTerminators.burmeseFullStop
import com.speechify.client.internal.util.text.groupingToSentences.internal.sentenceTerminators.chineseAndJapaneseFullStop
import com.speechify.client.internal.util.text.groupingToSentences.internal.sentenceTerminators.chineseAndJapaneseFullWidthExclamation
import com.speechify.client.internal.util.text.groupingToSentences.internal.sentenceTerminators.chineseAndJapaneseFullWidthQuestionMark
import com.speechify.client.internal.util.text.groupingToSentences.internal.sentenceTerminators.greekQuestionMark
import com.speechify.client.internal.util.text.groupingToSentences.internal.sentenceTerminators.nagariFullStop
import com.speechify.client.internal.util.text.groupingToSentences.internal.sentenceTerminators.rightToLeftScriptsQuestionMarkArabic
import com.speechify.client.internal.util.text.groupingToSentences.internal.sentenceTerminators.rightToLeftScriptsQuestionMarkReversed
import com.speechify.client.internal.util.text.groupingToSentences.internal.sentenceTerminators.sanskritFullStop
import com.speechify.client.internal.util.text.groupingToSentences.internal.sentenceTerminators.shahmukhiFullStop
import com.speechify.client.internal.util.text.groupingToWords.QUOTATION_MARKS

/** Characters that [getSentencesAsIndexRanges] treats as the end of a sentence. */
private val terminators = setOf(
    '.',
    '!',
    '?',
    '‽',
    '…',
    armenianFullStop,
    burmeseFullStop,
    greekQuestionMark,
    nagariFullStop,
    rightToLeftScriptsQuestionMarkArabic,
    rightToLeftScriptsQuestionMarkReversed,
    sanskritFullStop,
    shahmukhiFullStop,
    chineseAndJapaneseFullStop,
    chineseAndJapaneseFullWidthQuestionMark,
    chineseAndJapaneseFullWidthExclamation,
)
/**
 * Terminators after which the sentence ends immediately: for these, [getSentencesAsIndexRanges] does not extend the
 * sentence up to the next whitespace character.
 */
private val terminatorsNotUsingWhitespaceAfter =
    setOf(chineseAndJapaneseFullStop, chineseAndJapaneseFullWidthQuestionMark, chineseAndJapaneseFullWidthExclamation)

/**
 * Pattern for a bare host name such as `google.com`: one or more dot-terminated labels made of ASCII letters, digits
 * or hyphens, followed by a final label of at least two ASCII letters. Schemes, ports and paths are not covered.
 */
private val regexOfSimplifiedWebAddress = """\b([a-zA-Z0-9-]+\.)+[a-zA-Z]{2,}\b""".toRegex()

/**
 * Returns the sentences in this [CharSequence] as a sequence of [IntRange]s, with each range representing the index
 * of the first and last character of the sentence. The terminator punctuation marks are also included in the ranges.
 *
 * The function supports multiple terminators, quotations in both British-English and American-English grammar,
 * parentheses and emojis properly.
 *
 * Trailing whitespace of the receiver is ignored; all indices refer to the receiver itself. The returned sequence is
 * lazy and keeps its scanning state per iteration, so it may be iterated more than once.
 */
internal fun CharSequence.getSentencesAsIndexRanges(): Sequence<IntRange> {
    val text = this.trimEnd()
    if (text.isEmpty()) return emptySequence()
    if (text.length == 1) return sequenceOf(0..0)

    return sequence {
        // All cursor state lives inside the builder so that every iteration of the sequence starts from scratch.
        var start: Int = indexOfFirstNonWhitespaceChar(text, 0)
        var end: Int = start
        // Holds the current whitespace-delimited word (or the sentence so far after a quotation/span).
        val wordBuilder = StringBuilder()
        var sentenceStartedWithQuotationOrSpan = false

        while (end < text.length) {
            val char = text[end]
            if (char.isWhitespace()) {
                wordBuilder.setLength(0)
            } else {
                wordBuilder.append(char)
            }

            when {
                isStartOfQuotation(text, end, wordBuilder.toString()) -> {
                    sentenceStartedWithQuotationOrSpan = sentenceStartedWithQuotationOrSpan || (start == end)
                    end = indexOfQuotationEnd(text, end)
                    wordBuilder.setLength(0)
                    wordBuilder.append(text.subSequence(start, end))
                    if ((previousNonWhitespaceCharacterIsTerminator(text, end) && sentenceStartedWithQuotationOrSpan) ||
                        terminatorInsideQuotationAndNewSentenceEnd(text, end)
                    ) {
                        yield(start..end)
                        sentenceStartedWithQuotationOrSpan = false
                        end = indexOfFirstNonWhitespaceChar(text, end + 1)
                        start = end
                    }
                }
                isSpanStart(text, end, wordBuilder.toString()) -> {
                    sentenceStartedWithQuotationOrSpan = sentenceStartedWithQuotationOrSpan || (start == end)
                    end = indexOfSpanEnd(text, end)
                    wordBuilder.setLength(0)
                    wordBuilder.append(text.subSequence(start, end))
                    if (sentenceStartedWithQuotationOrSpan) {
                        yield(start..end)
                        sentenceStartedWithQuotationOrSpan = false

                        // The character right after the span end is skipped, then any further whitespace, so the
                        // next sentence never starts on a whitespace character.
                        // NOTE(review): the skipped character is presumably the separating space; a non-whitespace
                        // character in that position is dropped as well - confirm this is intended.
                        start = indexOfFirstNonWhitespaceChar(text, end + 2)

                        // We have it as start - 1 because we have an end++ further down
                        end = start - 1
                    }
                }
                char in terminators -> {
                    // We reached a terminator and we don't have an abbreviation nor decimal number
                    if (!isAbbreviation(wordBuilder.toString()) &&
                        !isDecimalNumberOrMonetaryAmount(wordBuilder.toString(), text, end + 1)
                    ) {
                        if (char !in terminatorsNotUsingWhitespaceAfter) {
                            // move end until the first whitespace or text end
                            while (end + 1 < text.length && !text[end + 1].isWhitespace()) {
                                end++
                                wordBuilder.append(text[end])
                            }
                        }
                        // check that captured word with terminator is not a simplified web address, e.g. google.com
                        if (!isSimplifiedWebAddress(wordBuilder.toString())) {
                            yield(start..end)
                            sentenceStartedWithQuotationOrSpan = false

                            // Move start until the first non-whitespace character
                            start = indexOfFirstNonWhitespaceChar(text, end + 1)

                            // We have it as start - 1 because we have an end++ further down
                            end = start - 1
                        }
                    }
                    wordBuilder.setLength(0)
                }
            }
            end++
        }

        // Emit whatever is left after the last yielded sentence. Comparing against the text length (rather than the
        // cursor, which may overshoot after a quotation or span) also keeps a final one-character sentence.
        if (start < text.length) {
            yield(start until text.length)
        }
    }
}

/**
 * Tells whether a quotation that ends at [currentIndex] also ends the sentence: the quotation must contain a
 * terminator right before its closing mark, and the text that follows must either be exhausted or continue with an
 * upper-case character (whitespace and further quotation marks in between are skipped).
 *
 * Example: An all-purpose home device called a "plumbus." Let's see how it is made.
 * Here the sentence has to be split after "plumbus."
 */
internal fun terminatorInsideQuotationAndNewSentenceEnd(text: CharSequence, currentIndex: Int): Boolean {
    if (!previousNonWhitespaceCharacterIsTerminator(text, currentIndex)) return false

    var probe = currentIndex + 1
    while (probe < text.length) {
        val candidate = text[probe]
        if (!candidate.isWhitespace() && candidate !in QUOTATION_MARKS) {
            // First "real" character after the quotation decides whether a new sentence begins.
            return candidate.isUpperCase()
        }
        probe++
    }

    // Nothing but whitespace/quotation marks up to the end of the text: the sentence ends here.
    return true
}

/**
 * Returns `true` when the closest non-whitespace character located before [currentIndex] is one of the sentence
 * [terminators].
 *
 * Returns `false` when no such character exists, i.e. when [currentIndex] is at the start of [text] or is preceded
 * only by whitespace.
 */
internal fun previousNonWhitespaceCharacterIsTerminator(text: CharSequence, currentIndex: Int): Boolean {
    var index = currentIndex - 1
    while (index >= 0 && text[index].isWhitespace()) {
        index--
    }
    // index is -1 when the scan ran off the start of the text; there is no character to inspect in that case.
    return index >= 0 && text[index] in terminators
}

/**
 * Returns the index of the first non-whitespace character of [text] at or after [start].
 *
 * When only whitespace remains the result is `text.length`; when [start] already lies at or beyond the end of the
 * text, [start] itself is returned.
 */
internal fun indexOfFirstNonWhitespaceChar(text: CharSequence, start: Int): Int {
    for (position in start until text.length) {
        if (!text[position].isWhitespace()) return position
    }
    return maxOf(start, text.length)
}

/**
 * Checks whether [currentWord], extended by the character at [nextCharIndex] of [fullText], reads as a number, which
 * means that the terminator at the end of [currentWord] is a decimal separator rather than the end of a sentence.
 *
 * Everything up to and including the first currency sign found in the word is ignored for the numeric check.
 *
 * @return `false` when there is no next character or when it is whitespace; otherwise whether the remaining text
 * parses as a number.
 */
internal fun isDecimalNumberOrMonetaryAmount(currentWord: String, fullText: CharSequence, nextCharIndex: Int): Boolean {
    if (nextCharIndex >= fullText.length) return false
    val following = fullText[nextCharIndex]
    if (following.isWhitespace()) return false

    // Position of the first currency sign (in iteration order of currencySigns) that occurs in the word.
    var signPosition = -1
    for (sign in currencySigns) {
        signPosition = currentWord.indexOf(sign)
        if (signPosition != -1) break
    }

    // There are special cases where normal letters are used as currency signs. In this case, it is mandatory that the
    // word starts with that letter and is followed only by numbers. Adding the letter in the normal {currencySigns}
    // lists can have unforeseen consequences, especially for OCR fallback detection.
    val letterCurrencySigns = setOf(
        'R', // South African Rand
    )
    if (signPosition == -1 && letterCurrencySigns.any { currentWord.startsWith(it) }) {
        signPosition = 0
    }

    val numericPart = if (signPosition >= 0) currentWord.substring(signPosition + 1) else currentWord
    return (numericPart + following).toDoubleOrNull() != null
}

/** Returns `true` when the whole of [currentWord] matches [regexOfSimplifiedWebAddress], e.g. `google.com`. */
internal fun isSimplifiedWebAddress(currentWord: String): Boolean =
    regexOfSimplifiedWebAddress.matchEntire(currentWord) != null

/** Abbreviations, written without their trailing full stop, whose full stop does not end a sentence. */
private val knownAbbreviations = setOf(
    "Mr", "Mrs", "Dr", "Ph", "D", "Prof", "St", "Sr", "Jr", "Ms",
    "Mx", "Mt", "Mme", "Mlle", "Mmes", "Messrs", "Msgr", "Mons", "Maj", "Capt",
    "Lt", "Col", "Gen", "Rep", "Sen", "Gov", "Supt", "Det", "Rev", "Hon",
    "Pres", "Inc", "p", "pp", "vs", "v", "vol", "Vol", "No",
    // the full abbreviation is `et al.`, however, we only store one word at a time for lookback
    "al",
)

/**
 * Tells whether [word] should be treated as an abbreviation instead of the last word of a sentence.
 *
 * Any word shorter than three characters counts as an abbreviation. Longer words qualify only when they end with a
 * full stop and the part before that full stop is listed in [knownAbbreviations].
 */
internal fun isAbbreviation(word: String): Boolean = when {
    word.length < 3 -> true
    !word.endsWith('.') -> false
    else -> word.dropLast(1) in knownAbbreviations
}

/** Maps every character that opens a span to the character that closes it. */
private val spanStartToEndPair = mapOf(
    '(' to ')',
    '[' to ']',
    '{' to '}',
    '<' to '>',
)

/** Characters that open a span; exactly the keys of [spanStartToEndPair]. */
private val spanStartCharacters = spanStartToEndPair.keys

/**
 * Tells whether the character at [start] opens a span, i.e. is one of [spanStartCharacters].
 *
 * [word] is the word built up to and including that character; when it forms an emoji (see [isEmoji]) the character
 * is not considered a span start. An index at or beyond the end of [text] never is a span start.
 */
internal fun isSpanStart(text: CharSequence, start: Int, word: String): Boolean =
    start < text.length && !isEmoji(word) && text[start] in spanStartCharacters

/**
 * Returns the index of the character that closes the span opened at [start], or `text.lastIndex` when that span is
 * never closed.
 *
 * Spans opened inside the span are skipped as a whole: every opening character found on the way must be closed by its
 * own counterpart from [spanStartToEndPair] before the search for the outer closing character continues.
 *
 * The scan is iterative, so the nesting depth of the input does not consume call stack.
 */
internal fun indexOfSpanEnd(text: CharSequence, start: Int): Int {
    // Closing characters still awaited, innermost last. The first entry is null when text[start] is not a span
    // opener; nothing can close that level, so the scan then runs to the end of the text.
    val pendingClosers = ArrayDeque<Char?>()
    pendingClosers.addLast(spanStartToEndPair[text[start]])

    for (index in start + 1 until text.length) {
        val current = text[index]
        if (current == pendingClosers.last()) {
            pendingClosers.removeLast()
            // The outermost span has just been closed.
            if (pendingClosers.isEmpty()) return index
        } else if (current in spanStartCharacters) {
            pendingClosers.addLast(spanStartToEndPair[current])
        }
    }
    return text.lastIndex
}

/**
 * Tells whether the character at [start] opens a quotation: it has to be one of [QUOTATION_MARKS] and must either be
 * the very first character of [text] or directly follow a whitespace character.
 *
 * [word] is the word built up to and including that character; when it forms an emoji (see [isEmoji]) the character
 * is not considered a quotation start.
 */
internal fun isStartOfQuotation(text: CharSequence, start: Int, word: String): Boolean = when {
    start >= text.length || isEmoji(word) -> false
    text[start] !in QUOTATION_MARKS -> false
    else -> start == 0 || text[start - 1].isWhitespace()
}

/**
 * Characters that may appear inside a word (e.g. `don't`). [indexOfQuotationEnd] accepts one of them as the end of a
 * quotation only at the end of the text or when whitespace or a sentence terminator follows.
 */
internal val apostrophes = setOf(
    '`',
    '\'',
    '’',
)

/**
 * Returns the index of the quotation mark that ends the quotation opened at [start], or `text.lastIndex` when no
 * closing mark is found.
 *
 * Any character of [QUOTATION_MARKS] after [start] closes the quotation, except for [apostrophes]: those only count
 * when they are the last character of the text or are followed by whitespace or a sentence terminator.
 */
internal fun indexOfQuotationEnd(text: CharSequence, start: Int): Int {
    for (index in start + 1 until text.length) {
        val current = text[index]
        if (current !in QUOTATION_MARKS) continue
        if (current !in apostrophes) return index

        // We don't want to end quotation at words like "don't"
        if (index == text.lastIndex) return index
        val following = text[index + 1]
        if (following.isWhitespace() || following in terminators) return index
    }
    return text.lastIndex
}

/**
 * Pattern for a word that is an ASCII emoticon: an optional leading `(`, one of `: ; = 8 B`, an optional one of
 * `- o * '` and a final character out of `) ] ( [ d D p P / \ : } { @ |`.
 *
 * It is used to recognise emoticons such as `:(`, so that sentences like "I'm not sad :( but also happy :)" are not
 * split at the parenthesis.
 */
private val emojiRegex = """^\(?[:;=8B][\-o*']?[)\](\[dDpP/\\:}{@|]${'$'}""".toRegex()

/** Returns `true` when [word], taken as a whole, matches [emojiRegex]. */
internal fun isEmoji(word: String): Boolean = emojiRegex.matchEntire(word) != null

/** Returns the sentences of this [CharSequence], each one sliced out by a range of [getSentencesAsIndexRanges]. */
internal fun CharSequence.getSentences() =
    getSentencesAsIndexRanges().map { sentenceRange -> slice(sentenceRange) }

/**
 * Returns this [CharSequence] without its first `spanFind.range.width` characters.
 * NOTE(review): `width` is presumably the number of characters covered by the match - confirm against its definition.
 */
private fun CharSequence.trimStartByMatchLength(spanFind: SpanFind): CharSequence {
    val matchedLength = spanFind.range.width
    return drop(matchedLength)
}
