package com.speechify.client.api.content.ocr

import com.speechify.client.api.content.view.book.BookPageTextContentItem
import com.speechify.client.api.diagnostics.DiagnosticEvent
import com.speechify.client.api.diagnostics.Log
import com.speechify.client.api.util.images.BoundingBox
import com.speechify.client.api.util.images.CoordinateTransform
import com.speechify.client.api.util.images.isOverlapWithTolerance
import com.speechify.client.api.util.images.verticalDistanceTo
import com.speechify.client.api.util.text.currencySigns
import com.speechify.client.internal.util.text.groupingToWords.QUOTATION_MARKS
import kotlin.math.floor
import kotlin.math.max
import kotlin.math.min
import kotlin.random.Random

/**
 * Decides whether the text extracted from a page should be replaced by OCR, by running the
 * extracted items through an ordered pipeline of [Check]s.
 */
class ExperimentalStrategyImplementation private constructor() {
    companion object {
        // Evaluated in order; evaluation stops at the first check that requests OCR.
        private val checkPipeline = listOf(
            LegacyCheck(),
            TooFewElementsCheck(),
            CorruptedTextCheck(),
            TextGibberishCheck(),
            WhiteSpaceBetweenCharactersCheck(),
            OnlyHeaderAndFooterCheck(),
            DuplicateContentCheck(),
        )

        /**
         * Returns `true` as soon as one check of the pipeline requests OCR for the given items (the name of
         * that check is logged), and `false` when none of them does.
         *
         * Items whose normalized position lies outside the page are dropped before the checks run.
         */
        fun shouldApplyOcrFallback(
            rawTextContentItems: List<BookPageTextContentItem>,
        ): Boolean {
            val onPageItems = filterPotentiallyHidden(rawTextContentItems)
            val triggeredCheck = checkPipeline.firstOrNull { it.shouldRunOcr(onPageItems) }
                ?: return false

            logCheckFailed(triggeredCheck.name)
            return true
        }

        /** Emits an info-level diagnostic event carrying the name of the check that requested OCR. */
        private fun logCheckFailed(name: String) {
            val event = DiagnosticEvent(
                message = "OCR Fallback detection check failed",
                sourceAreaId = "ExperimentalStrategyImplementation",
                properties = mapOf("check" to name),
            )
            Log.i(event)
        }

        /**
         * Drops the items that are potentially hidden, i.e. not placed on the page.
         *
         * The boxes are expected to be normalized at this stage, so the translation of their transform should
         * be within [0, 1]; a negative value or a value above 1 means the text is not on the page.
         */
        private fun filterPotentiallyHidden(textContent: List<BookPageTextContentItem>): List<BookPageTextContentItem> =
            textContent.filterNot { item ->
                val transform = item.normalizedBox.transform
                transform.tx < 0 || transform.ty < 0 || transform.tx > 1 || transform.ty > 1
            }
    }
}

/**
 * A single OCR-fallback heuristic evaluated against the text items extracted from a page.
 */
internal interface Check {
    /** Identifier of the check; it is what gets logged when the check requests OCR. */
    val name: String
    /** Returns `true` when the given items indicate that the page should be processed with OCR. */
    fun shouldRunOcr(items: List<BookPageTextContentItem>): Boolean
}

/**
 * Adapter exposing [LegacyStrategyImplementation] as a [Check] so it can take part in the pipeline.
 */
private class LegacyCheck : Check {
    override val name = "LegacyStrategy"

    // Pure delegation: the legacy strategy decides on its own whether OCR is needed.
    override fun shouldRunOcr(items: List<BookPageTextContentItem>): Boolean =
        LegacyStrategyImplementation.shouldApplyOcrFallback(items)
}

/**
 * Text extraction check which requests OCR when too little text was extracted from the page.
 *
 * A page with no items at all always requests OCR. Pages that look like a title page (few words, all of them
 * overlapping the central area of the page) are exempt from the minimum word count.
 */
private class TooFewElementsCheck : Check {
    companion object {
        // Central area of the page that every item of a title page is expected to overlap.
        private val TITLE_CONTENT_OVERLAP_BOUNDING_BOX: BoundingBox = BoundingBox(
            0.70,
            0.40,
            CoordinateTransform(1.0, 0.0, 0.0, 1.0, 0.15, 0.15),
        )

        // Title pages usually have very few words; above this count the page is not treated as one.
        private const val MAX_TITLE_PAGE_WORDS = 30

        // Below this count we may only have headers, footers and image captions.
        private const val MIN_EXPECTED_WORDS = 14
    }

    override val name = "TooFewElements"

    override fun shouldRunOcr(items: List<BookPageTextContentItem>): Boolean {
        if (items.isEmpty()) return true

        // Words are obtained by splitting each item's text on single spaces.
        val wordCount = items.sumOf { it.text.text.split(' ').size }

        return when {
            isTitlePage(items, wordCount) -> false
            else -> wordCount < MIN_EXPECTED_WORDS
        }
    }

    /**
     * A page is considered a title page when it has at most [MAX_TITLE_PAGE_WORDS] words and every item
     * overlaps [TITLE_CONTENT_OVERLAP_BOUNDING_BOX], i.e. the text is close to the middle of the page.
     */
    private fun isTitlePage(textContent: List<BookPageTextContentItem>, wordCount: Int): Boolean =
        wordCount <= MAX_TITLE_PAGE_WORDS &&
            textContent.all { TITLE_CONTENT_OVERLAP_BOUNDING_BOX.overlaps(it.normalizedBox) }
}

/**
 * Text extraction check which validates that the extracted text items contain valid characters and plausible words.
 *
 * OCR is requested when any of the following holds:
 *    - an item contains a character from a list of known corrupted code points, or a character below 0x20
 *    - a currency sign is placed inside a word that is not a number (usually indicating corrupted text)
 *    - a math symbol ('>', '<', '=') is placed inside a non-numeric word (usually indicating corrupted text)
 *    - a word has a single digit in its interior (usually indicating corrupted text)
 *    - a '"', '!' or '?' is glued between two other characters (usually indicating corrupted text)
 */
class CorruptedTextCheck : Check {
    companion object {
        // Code points observed in corrupted extractions (replacement character, non-characters and
        // several private-use code points).
        private val CORRUPTED_CHAR_CODES = setOf(
            65533, 65534, 65535, 57442, 57531, 57501, 61468, 61453, 61451,
            61450, 61452, 57403, 57402, 57404,
        )

        // The `-` is NOT included since it is used in other cases.
        // The `+` is NOT included since it can be used in other cases (ex: 0.1 mg/kg+)
        private val MATH_SIGNS = setOf('>', '<', '=')

        // Characters that, together with digits, make up a word in which math signs are legitimate.
        private val SYMBOLS_ACCEPTED_WITH_MATH_SIGNS = setOf(
            '>', '<', '+', '=', '.', ',', '!', '?', ':', ';', '(', ')', '[', ']',
            '{', '}', '-', '%', '/', '\\',
        )

        // Words in which math signs make up at least this share of the characters are not inspected.
        private const val MAX_MATH_SIGN_RATIO = 0.20

        // Characters allowed inside a number that carries a currency sign.
        private val NUMBER_PUNCTUATION = setOf(',', '.', '-', '–')

        // Suffixes that may follow a number (for formats like $100k); checked in this order.
        private val NUMBER_SUFFIXES = listOf("M", "B", "k", "million", "billion", "thousand", "hundred")

        private val MISPLACEABLE_CHARACTERS = setOf('"', '!', '?')
        private val ACCEPTED_BORDERING_CHARACTERS = setOf('!', '?', '.')

        // Compiled once: the URL lookup runs for every inspected word.
        private val URL_REGEX = Regex(
            "https?:\\/\\/(www\\.)?[-a-zA-Z0-9@:%._\\+~#=]{1,256}\\.[a-zA-Z0-9()]{1,6}" +
                "\\b([-a-zA-Z0-9()@:%_\\+.~#?&//=]*)",
        )
    }

    private fun Char.isPrintableUnicode(): Boolean {
        // Characters from 0x20 (blank space) upwards. Below are only non-printable characters
        return this.code >= 0x20
    }

    override val name = "CorruptedText"

    /**
     * Returns `true` when any item contains a corrupted or non-printable character, or when any of the
     * misplaced currency sign / math symbol / digit / punctuation heuristics matches.
     *
     * Words are obtained by splitting each item's text on single spaces.
     */
    override fun shouldRunOcr(items: List<BookPageTextContentItem>): Boolean {
        val words = items.flatMap { it.text.text.split(' ') }
        val hasCorruptedCharacter = items.any { item ->
            item.text.text.any { char -> char.code in CORRUPTED_CHAR_CODES || !char.isPrintableUnicode() }
        }

        return hasCorruptedCharacter ||
            textContainsCurrencySignInWrongLocation(words) ||
            textContainsMathSymbolsWronglyPlaced(words) ||
            containsNumbersWronglyPlaced(words) ||
            textContainsWronglyPlacedCharacters(items)
    }

    /**
     * Detect if there is a currency sign (€, $, £) in the text where it should not be
     */
    private fun textContainsCurrencySignInWrongLocation(textContent: List<String>): Boolean =
        currencySigns.any { sign ->
            textContent.any { word -> word.contains(sign) && isCurrencySignWronglyPlaced(word, sign) }
        }

    /**
     * This method tries to detect if the text has any math symbols ('>', '<', '=') in the middle of words.
     *
     * Only words longer than one character, containing at least one math symbol, and in which math symbols
     * make up less than 20% of the characters are inspected. Such a word is accepted when it consists solely
     * of digits and accepted symbols, or when it contains a URL; otherwise OCR is requested.
     * Ex:
     *      - 10000>22 - NO OCR NEEDED
     *      - scienti>c - OCR needed
     *
     * NOTE(review): an earlier version of this comment listed `3x+4>22` as "no OCR needed", but the letter `x`
     * makes that word fail the digits-and-symbols test, so it is flagged — confirm which one is intended.
     */
    private fun textContainsMathSymbolsWronglyPlaced(textContent: List<String>): Boolean =
        textContent.any { word ->
            word.length > 1 &&
                word.any { it in MATH_SIGNS } &&
                percentageOfMathSymbols(word) < MAX_MATH_SIGN_RATIO &&
                !stringCanContainMathSymbols(word)
        }

    /** True when the word is made only of digits and accepted symbols, or contains a URL. */
    private fun stringCanContainMathSymbols(word: String): Boolean =
        word.all { it in SYMBOLS_ACCEPTED_WITH_MATH_SIGNS || it.isDigit() } || wordContainsUrl(word)

    /**
     * Share of the word's characters that are math signs.
     *
     * Every occurrence is counted, so a word decorated with repeated signs (e.g. `=====Title=====`) gets a
     * high share instead of being scored as if it had a single sign.
     */
    private fun percentageOfMathSymbols(word: String): Double =
        word.count { it in MATH_SIGNS }.toDouble() / word.length

    /**
     * Returns true if any of the words contains a digit somewhere in the middle of the word.
     * This is because some PDFs have a letter or two replaced by a number. To reduce false-positives,
     * we only consider words that have only one digit in the middle since this is the most common scenario
     * and PDFs that suffer from this rarely have the number at the beginning or the end and (at least so far)
     * also have words that will fail this scenario
     */
    private fun containsNumbersWronglyPlaced(textContent: List<String>): Boolean =
        textContent.any { word ->
            word.length >= 3 &&
                !word.first().isDigit() &&
                !word.last().isDigit() &&
                word.count { it.isDigit() } == 1 &&
                word.all { it.isLetterOrDigit() }
        }

    /**
     * Returns `true` when [word] contains [sign] in a position that does not look like a price.
     */
    private fun isCurrencySignWronglyPlaced(word: String, sign: Char): Boolean {
        // The sign is correctly placed if it's alone or the word is too small to contain anything else
        if (word == sign.toString() || word.length <= 1) return false

        // The sign is correctly placed if it's in parentheses
        if (word.startsWith("($sign") && word.endsWith(")")) return false

        // Check if the currency sign is part of a valid number format
        // For this, we remove the sign from the word and check if it is a valid number
        // We remove the trailing suffix in case we have M, B or k (for formats like $100k)
        val wordWithoutSign = word.filter { it.lowercaseChar() != sign }
        val numberSuffix = wordRepresentingNumber(wordWithoutSign)
        val possibleNumber = if (numberSuffix != null) {
            wordWithoutSign.dropLast(numberSuffix.length)
        } else {
            wordWithoutSign
        }
        return !possibleNumber.all { it.isDigit() || it in NUMBER_PUNCTUATION || it.isWhitespace() }
    }

    /** Returns the first suffix of [NUMBER_SUFFIXES] that [possibleNumber] ends with, or `null` if none does. */
    private fun wordRepresentingNumber(possibleNumber: String): String? =
        NUMBER_SUFFIXES.firstOrNull { possibleNumber.endsWith(it) }

    // We search for a URL inside the word instead of checking that the whole word is a URL, since the URL
    // can be between parentheses (see 48decfe0-3097-4977-915c-1cda7f67fd7d-10.pdf)
    private fun wordContainsUrl(word: String): Boolean =
        URL_REGEX.containsMatchIn(word) || word.contains("mailto:")

    /**
     * Returns `true` when an item has a '"', '!' or '?' whose neighbours on both sides are neither whitespace,
     * quotation marks nor one of '!', '?', '.'.
     *
     * Only the first occurrence of each character within an item is inspected, and '?' is ignored for items
     * that contain a URL.
     */
    private fun textContainsWronglyPlacedCharacters(textContent: List<BookPageTextContentItem>): Boolean {
        fun isWhitespaceOrQuotation(text: Char): Boolean {
            // We accept even the searched characters as neighbours since there are cases where the last
            // character is duplicated:
            // EX: "Not done anything?!" (from 48decfe0-3097-4977-915c-1cda7f67fd7d.pdf)
            return text.isWhitespace() ||
                QUOTATION_MARKS.contains(text) ||
                ACCEPTED_BORDERING_CHARACTERS.contains(text)
        }

        return textContent.any { item ->
            val text = item.text.text
            MISPLACEABLE_CHARACTERS.any { sign ->
                if (sign == '?' && wordContainsUrl(text)) {
                    false
                } else {
                    val indexOfCharacter = text.indexOf(sign)
                    // The character must have a neighbour on both sides to be considered "in the middle".
                    indexOfCharacter > 0 && indexOfCharacter < text.length - 1 &&
                        !isWhitespaceOrQuotation(text[indexOfCharacter + 1]) &&
                        !isWhitespaceOrQuotation(text[indexOfCharacter - 1])
                }
            }
        }
    }
}

/**
 * Text extraction check that validates the text is not gibberish. This is done by calculating the percentage of
 * symbols and of acute letters over all the extracted text. A high percentage (more than 50%) will usually indicate
 * that the extracted text is gibberish and can't be read, so OCR is needed.
 *
 * OCR is also requested when a word has a non-standard letter in its interior, or when a word starts with a symbol
 * that can't be a first letter and has no other symbol after it.
 */
class TextGibberishCheck : Check {
    companion object {
        private const val SYMBOLS_CANT_BE_FIRST_LETTER = "!@%^*;,?\\|`~=_+`"
        private const val SYMBOLS = SYMBOLS_CANT_BE_FIRST_LETTER + "-/.:#\$<>&\"'~()[]{}"
        private const val ACUTE_LETTERS = "åáàäçéêëèîíìïóôöúûüû"
        private val NON_STANDARD_LETTERS = setOf('Ɵ', 'Ư', 'Ʃ', '�', '')

        // If more than this share of the text is made of the counted characters, the text is considered gibberish
        private const val MAX_PROPORTION = 0.5
    }

    override val name = "TextGibberish"

    /**
     * Returns `true` when symbols or acute letters make up more than half of all the characters of [items], or
     * when one of the per-word heuristics matches. Words are obtained by splitting each item's text on single
     * spaces.
     */
    override fun shouldRunOcr(items: List<BookPageTextContentItem>): Boolean {
        val texts = items.map { it.text.text }
        val totalLength = texts.sumOf { it.length }
        val symbolCount = texts.sumOf { text -> text.count { it in SYMBOLS } }
        val acuteCount = texts.sumOf { text -> text.count { it in ACUTE_LETTERS } }
        val words = texts.flatMap { it.split(' ') }

        return isProportionTooGreat(symbolCount, totalLength) ||
            isProportionTooGreat(acuteCount, totalLength) ||
            wordsContainWronglyPlacedNonStandardLetters(words) ||
            wordStartsWithSymbol(words)
    }

    // With no text at all the ratio is NaN, which never exceeds the threshold.
    private fun isProportionTooGreat(count: Int, length: Int): Boolean =
        count.toDouble() / length > MAX_PROPORTION

    /** True when a word of at least 3 characters has a non-standard letter anywhere but its first or last position. */
    private fun wordsContainWronglyPlacedNonStandardLetters(words: List<String>): Boolean =
        words.any { word ->
            word.length >= 3 && word.substring(1, word.length - 1).any { it in NON_STANDARD_LETTERS }
        }

    /**
     * True when a word longer than one character starts with one of [SYMBOLS_CANT_BE_FIRST_LETTER] and none of
     * its remaining characters is one of [SYMBOLS].
     *
     * The condition depends only on the word itself, so it is evaluated once per word.
     */
    private fun wordStartsWithSymbol(words: List<String>): Boolean =
        words.any { word ->
            word.length > 1 &&
                word[0] in SYMBOLS_CANT_BE_FIRST_LETTER &&
                word.substring(1).none { it in SYMBOLS }
        }
}

/**
 * Text extraction check which validates that the extracted text does not contain too many white spaces. This is to
 * detect PDFs where the text has a space between each letter (Ex: `T H I S I S A S E N T E N C E`)
 */
class WhiteSpaceBetweenCharactersCheck : Check {
    companion object {
        // More than 3 one-letter words one after the other usually means that OCR is needed
        private const val ONE_LETTER_WORD_RUN_LENGTH = 4
    }

    override val name = "WhiteSpaceBetweenCharacters"

    override fun shouldRunOcr(items: List<BookPageTextContentItem>): Boolean {
        // Elements right at the top of the page (like author and title, present on some books) and right at the
        // bottom (like the page number) are left out, since they can have gaps used for styling
        val mainContentItems = items.filter { it.normalizedBox.top > 0.1 && it.normalizedBox.bottom < 0.9 }

        // A single line with wrongly placed spacing is enough to request OCR
        return groupIntoLines(mainContentItems).any(::lineHasTooManySpaces)
    }

    /**
     * Groups the items into lines, keeping the input order. An item joins the first existing line whose first
     * item is vertically closer to it than a third of the item's height; otherwise it starts a new line.
     */
    private fun groupIntoLines(textContent: List<BookPageTextContentItem>): List<List<BookPageTextContentItem>> {
        val lines = mutableListOf<MutableList<BookPageTextContentItem>>()
        for (item in textContent) {
            val maxVerticalDistance = item.normalizedBox.height / 3
            val matchingLine = lines.firstOrNull { existingLine ->
                existingLine.first().normalizedBox.verticalDistanceTo(item.normalizedBox) < maxVerticalDistance
            }
            when (matchingLine) {
                null -> lines.add(mutableListOf(item))
                else -> matchingLine.add(item)
            }
        }

        return lines
    }

    /**
     * True when the line has a run of consecutive words that are each a single letter.
     *
     * Words that are just a digit do not count: there are PDFs that have a line that is just digits separated by
     * white spaces, and we want to exclude those (see ocr_test_6.pdf as an example).
     */
    private fun lineHasTooManySpaces(line: List<BookPageTextContentItem>): Boolean {
        val words = line.flatMap { it.text.text.split(" ") }
        return words
            .windowed(ONE_LETTER_WORD_RUN_LENGTH)
            .any { run -> run.all { word -> word.length == 1 && word[0].isLetter() } }
    }
}

/**
 * Text extraction check which validates that we don't have only header and footer text.
 *
 * OCR is requested when too few items overlap the main content area of the page, or when the first item of the
 * main content area starts far from the top and the page has far fewer lines than its text area would hold.
 */
class OnlyHeaderAndFooterCheck : Check {
    companion object {
        // Area of the page where the main content is expected.
        // NOTE(review): presumably a 0.90 x 0.90 box offset by 0.05 on each axis — confirm against BoundingBox.
        private val MAIN_CONTENT_OVERLAP_BOUNDING_BOX: BoundingBox = BoundingBox(
            0.90,
            0.90,
            CoordinateTransform(1.0, 0.0, 0.0, 1.0, 0.05, 0.05),
        )

        // We are expecting at least 80% of the text to be inside the bounding box
        private const val MIN_OVERLAP_PERCENTAGE = 0.8

        // We expect the distance to the first paragraph to be small. If it is too big it is probably
        // a caption for an image
        private const val MAX_TOP_DISTANCE = 0.35

        // Number of items (the first one plus randomly picked ones) used to estimate the line height
        private const val LINE_HEIGHT_SAMPLE_SIZE = 7
    }

    override val name = "OnlyHeaderAndFooter"

    /**
     * Returns `true` when the items look like they are only headers, footers or captions.
     *
     * An empty list returns `false`: there is nothing to evaluate, and the line height estimation below needs
     * at least one item.
     */
    override fun shouldRunOcr(items: List<BookPageTextContentItem>): Boolean {
        if (items.isEmpty()) return false

        // Top coordinate of every item that overlaps the main content area
        val topsInsideMainContent = items
            .map { it.normalizedBox }
            .filter { it.overlaps(MAIN_CONTENT_OVERLAP_BOUNDING_BOX) }
            .map { it.top }

        val overlapPercentage = topsInsideMainContent.size.toDouble() / items.size
        val minDistanceTop = topsInsideMainContent.fold(1.0) { smallest, top -> min(smallest, top) }
        val maxDistanceTop = topsInsideMainContent.fold(0.0) { biggest, top -> max(biggest, top) }

        return overlapPercentage < MIN_OVERLAP_PERCENTAGE ||
            (minDistanceTop > MAX_TOP_DISTANCE && tooFewLines(items, minDistanceTop, maxDistanceTop))
    }

    /**
     * Averages the box height of the first item and of up to 6 other items. The random pick is seeded with the
     * number of items, so the same list always produces the same sample.
     *
     * Requires a non-empty list.
     */
    private fun averageLineHeight(textContent: List<BookPageTextContentItem>): Double {
        val randomIndices = (1 until textContent.size)
            .shuffled(Random(textContent.size))
            .take(LINE_HEIGHT_SAMPLE_SIZE - 1)
        val sampledIndices = listOf(0) + randomIndices
        return sampledIndices.map { textContent[it].normalizedBox.height }.average()
    }

    /**
     * Compares the number of distinct top coordinates among the items with the number of lines that would fit
     * between [minDistanceTop] and [maxDistanceTop] at the average line height.
     */
    private fun tooFewLines(
        textContent: List<BookPageTextContentItem>,
        minDistanceTop: Double,
        maxDistanceTop: Double,
    ): Boolean {
        val areaWithPotentialText = maxDistanceTop - minDistanceTop
        val estimatedLines = floor(areaWithPotentialText / averageLineHeight(textContent)).toInt()
        val estimatedLinesFound = textContent.map { it.normalizedBox.top }.distinct().size

        // if we have less than 50% of the number of lines we are expecting, it means that there are huge gaps
        return estimatedLinesFound < estimatedLines / 2
    }
}

/**
 * Text extraction check which validates that we don't have duplicate text. This is needed because some PDFs have the
 * text duplicated (one of the instances is invisible), which results in a bad experience when listening.
 */
class DuplicateContentCheck : Check {
    override val name = "DuplicateContent"

    /**
     * Returns `true` when an item has the same trimmed text as an earlier item and their boxes overlap within
     * a tolerance.
     */
    override fun shouldRunOcr(items: List<BookPageTextContentItem>): Boolean {
        // Box and text of the items inspected so far
        val seenContent = mutableListOf<Pair<BoundingBox, String>>()

        for (candidate in items) {
            val candidateBox = candidate.normalizedBox
            val candidateText = candidate.text.text

            val isDuplicate = seenContent.any { (seenBox, seenText) ->
                // The tolerance is derived from the height of the earlier box, so that the detection
                // adjusts to the font size
                seenText.trim() == candidateText.trim() &&
                    seenBox.isOverlapWithTolerance(candidateBox, tolerance = seenBox.height / 2.25)
            }
            if (isDuplicate) return true

            seenContent.add(candidateBox to candidateText)
        }

        return false
    }
}
