package com.speechify.client.api.audio

import com.speechify.client.api.SpeechifyURI
import com.speechify.client.api.audio.caching.MediaSynthesisCache
import com.speechify.client.api.audio.caching.MediaSynthesisCacheInMemory
import com.speechify.client.api.audio.caching.MediaSynthesisCachePersistent
import com.speechify.client.api.audio.caching.NoOpMediaSynthesisCache
import com.speechify.client.api.content.JoinedTextWithMap
import com.speechify.client.api.content.ValueWithStringRepresentation
import com.speechify.client.api.util.Result
import com.speechify.client.api.util.orThrow
import com.speechify.client.internal.services.db.DbService
import com.speechify.client.internal.util.extensions.collections.BatchTransformResultWithRealignment
import com.speechify.client.internal.util.extensions.collections.MaxSizeOptions
import com.speechify.client.internal.util.extensions.collections.SplitItemByMaxSizeFn
import com.speechify.client.internal.util.extensions.collections.TransformedBatchResult
import com.speechify.client.internal.util.extensions.collections.subListByRemovingCountAtStart
import com.speechify.client.internal.util.extensions.collections.windowedToBatchesOfAimedSizeSumWithMapAndRealignment
import kotlinx.coroutines.flow.Flow
import kotlinx.coroutines.flow.onEach

/**
 * A MediaSynthesisService provides metadata for a set of Voices and the capability to synthesize audio media
 * files together with their corresponding SpeechMarks.
 *
 * Note: This interface exists to decouple the logic of building media-backed Utterances from the logic of
 * producing the audio files and the associated SpeechMarks.
 * The former is implemented in our [MediaVoice], into which we inject a MediaSynthesisService that can abstract
 * our Audio Server, Resemble, Polly, WaveNet, and even on-device AI-powered Speechify voices.
 */
internal interface MediaSynthesisService {
    /**
     * Character-count limit of this service. [SingleSpecsMediaSynthesisService] uses it as the maximum
     * size sum of a single batch of text it sends to [synthesize].
     */
    val maxTextCharactersCount: Int

    /**
     * Synthesizes audio for [text].
     *
     * @param text the text to synthesize.
     * @param precedingContext the text that comes before the text that we're synthesizing, or `null` when
     *  there is none. See
     *  [Speechify Context](https://audio.docs.speechify.dev/synthesis/ssml.html#speechify-context)
     *  for more details.
     * @param options the voice and audio format to synthesize with.
     * @return the [SynthesizeResponse] wrapped in a [Result].
     */
    suspend fun synthesize(
        text: String,
        precedingContext: String?,
        options: SynthesizeOptions,
    ): Result<SynthesizeResponse>
}

/**
 * A pragmatic narrowing-down of the [MediaSynthesisService] interface that allows a simple cache
 * implementation which ties its life to specific specs like Voice and Audio format. Helps to implement a cache
 * that gets garbage collected when the user changes the specs.
 *
 * @param audioMediaFormat the audio format passed with every synthesis request (see [SynthesizeOptions]).
 * @param voiceSpec the voice passed with every synthesis request (see [SynthesizeOptions]).
 * @param getFullTextOfSentences joins a list of sentences into a single text, keeping the mapping back to the
 *  constituent sentences.
 * @param splitSentenceByMaxChars used to split a single sentence that exceeds
 *  [MediaSynthesisService.maxTextCharactersCount].
 * @param mediaSynthesisService the underlying service performing the actual synthesis.
 * @param mediaSynthesisCache consulted before every synthesis, and populated after every live synthesis.
 * @param textToSpeechAudioContextInclusionOption controls whether the preceding context is sent along with
 *  synthesis requests.
 */
internal class SingleSpecsMediaSynthesisService<
    JoinedSentencesType : ValueWithStringRepresentation,
    SentenceType : ValueWithStringRepresentation,
    >(
    audioMediaFormat: AudioMediaFormat?,
    voiceSpec: VoiceSpec.VoiceSpecForMediaVoiceFromAudioServer,
    private val getFullTextOfSentences:
        (sentences: List<SentenceType>) -> JoinedTextWithMap<JoinedSentencesType, SentenceType>,
    private val splitSentenceByMaxChars: SplitItemByMaxSizeFn<SentenceType>,
    private val mediaSynthesisService: MediaSynthesisService,
    private val mediaSynthesisCache: MediaSynthesisCache<JoinedSentencesType, SentenceType>,
    private val textToSpeechAudioContextInclusionOption: TextToSpeechAudioContextInclusionOption,
) {
    /** Hard per-batch limit, taken from the underlying service, plus the strategy for over-long sentences. */
    private val maxTextCharactersCountOptions = MaxSizeOptions(
        maxSizeSumInEachBatch = mediaSynthesisService.maxTextCharactersCount,
        splitItemExceedingSize = splitSentenceByMaxChars,
    )

    /** The fixed specs (audio format and voice) sent with every request made through this instance. */
    private val options = SynthesizeOptions(
        audioMediaFormat = audioMediaFormat,
        voiceSpec = voiceSpec,
    )

    /**
     * Batches [inputContentFlow] and maps every batch either to its cached synthesis result or to a lazily
     * evaluated live synthesis (the request is only made when
     * [SynthesisResponseWithCachingInformation.getSynthesizeResponse] is called).
     *
     * @param inputContentFlow the sentences to synthesize, in reading order.
     * @param shouldOptimizeForQuickStart when `true` and the starting batch is not cached, only the first
     *  sentence is synthesized in the starting batch, so that the payload of the first request is small.
     * @param aimedCharsCountInBatch the aimed number of characters in each batch.
     *  NOTE: The resulting Flow may have also batches of drastically different size than requested here - this
     *  will especially happen when user re-listens, and starts from a different sentence, and is done in order
     *  to realign such a re-listen with the cached content.
     * @return a cold flow of the batches together with their (cached or live) synthesis results.
     */
    fun synthesizingColdFlow(
        inputContentFlow: Flow<SentenceType>,
        shouldOptimizeForQuickStart: Boolean,
        aimedCharsCountInBatch: Int,
    ): Flow<TransformedBatchResult<SentenceType, SynthesisResponseWithCachingInformation>> {
        /**
         * We need to keep track of the last synthesized text representation, so that we can use it as the
         * preceding context for the next batch.
         * The input items text representation is captured only after the realignment in
         * [windowedToBatchesOfAimedSizeSumWithMapAndRealignment].
         *
         * NOTE: this variable lives outside of the returned cold flow, so it is shared by every collection of
         * that flow. The starting batch therefore explicitly ignores it (see below).
         */
        var lastSynthesizingInputItemsTextRepresentation: String? = null
        return inputContentFlow
            /**
             * TODO consider #TODOPersistentCacheBecomesUnusableIfAimedCharsCountInBatchIsReduced to prevent
             *  audio-downloads becoming unusable if `aimedCharsCountInBatch` is reduced
             */
            .windowedToBatchesOfAimedSizeSumWithMapAndRealignment(
                itemSize = { it.textRepresentation.length },
                aimedSizeSumInEachBatch = aimedCharsCountInBatch,
                maxSizeOptions = maxTextCharactersCountOptions,
                mapBatchWithRealignment = { (batchIndex, sentences) ->
                    val isStartingBatch = batchIndex == 0
                    val sentencesWithFullText = getFullTextOfSentences(sentences)
                    val cacheResultWithRealignment = mediaSynthesisCache.getCachedEntryOrNull(
                        inputSentencesAndWhitespaceSeparators = sentencesWithFullText,
                    )
                    when (cacheResultWithRealignment) {
                        null -> {
                            /* Nothing in the cache. Need to synthesize */
                            if (isStartingBatch && shouldOptimizeForQuickStart) {
                                /* We were asked to start quickly, and this is the starting batch (batchIndex=0) so */
                                val sentencesForOptimizedStart = getFullTextOfSentences(
                                    sentences.take(1), /* Get a small number of sentences, so that the payload of
                                    the first request is small */
                                )
                                BatchTransformResultWithRealignment.MatchOfShorterInput(
                                    result = liveSynthesisResponseOf(
                                        sentencesWithFullText = sentencesForOptimizedStart,
                                        precedingContext = null, // batchIndex == 0, so no preceding context
                                    ),
                                    inputItemsConsumedInThisBatch = sentencesForOptimizedStart.constituentParts,
                                    remainingUnmatchedItems = sentences.subListByRemovingCountAtStart(
                                        countOfItemsToRemoveFromStart =
                                        sentencesForOptimizedStart.constituentParts.size,
                                    ),
                                )
                            } else {
                                /*
                                 * The starting batch never has a preceding context. Without this check, a
                                 * re-collection of the returned flow would use the last batch of the previous
                                 * collection as the context of its starting batch.
                                 */
                                val precedingContext =
                                    if (isStartingBatch) null else lastSynthesizingInputItemsTextRepresentation
                                BatchTransformResultWithRealignment.ExactMatch(
                                    result = liveSynthesisResponseOf(
                                        sentencesWithFullText = sentencesWithFullText,
                                        precedingContext = precedingContext,
                                    ),
                                )
                            }
                        }
                        else -> cacheResultWithRealignment.mapResult {
                            SynthesisResponseWithCachingInformation.CachedSynthesisResponse(
                                synthesizeResponse = it,
                            )
                        }
                    }
                },
            ).onEach {
                /* Remember the text of the batch that was just emitted - it is the context of the next one. */
                lastSynthesizingInputItemsTextRepresentation =
                    getFullTextOfSentences(it.inputItems).joinedText.textRepresentation
            }
    }

    /**
     * Synthesizes [text] using the underlying [MediaSynthesisService] and the fixed specs of this instance.
     *
     * @param text the text to synthesize.
     * @param precedingContext the text that comes before [text], or `null` to send no context.
     */
    suspend fun synthesize(
        text: String,
        precedingContext: String?,
    ): Result<SynthesizeResponse> =
        mediaSynthesisService.synthesize(
            text = text,
            precedingContext = precedingContext,
            options = options,
        )

    /**
     * Creates a response that, only when asked for its result, synthesizes [sentencesWithFullText], throws if
     * the synthesis failed, and otherwise stores the result in [mediaSynthesisCache] before returning it.
     *
     * @param precedingContext the context candidate captured for this batch. It is only sent when it is not
     *  `null` and the preceding-context option is enabled at the time the synthesis is actually performed.
     */
    private fun liveSynthesisResponseOf(
        sentencesWithFullText: JoinedTextWithMap<JoinedSentencesType, SentenceType>,
        precedingContext: String?,
    ): SynthesisResponseWithCachingInformation.LiveSynthesisResponse =
        SynthesisResponseWithCachingInformation.LiveSynthesisResponse(
            getSynthesizeResponseFn = {
                val result = synthesize(
                    text = sentencesWithFullText.joinedText.textRepresentation,
                    /**
                     * Use the preceding context only if it's enabled
                     */
                    precedingContext = precedingContext?.takeIf {
                        textToSpeechAudioContextInclusionOption
                            .textToSpeechIncludePrecedingContextForAudioSynthesis.value
                    },
                )
                    .orThrow()

                mediaSynthesisCache.putCachedEntry(
                    inputSentencesAndWhitespaceSeparators = sentencesWithFullText,
                    newSynthesisResult = result,
                )

                result
            },
        )
}

/**
 * A synthesis result for a batch, which either comes straight from a cache ([CachedSynthesisResponse]) or
 * still needs to be produced on demand ([LiveSynthesisResponse]).
 */
internal sealed class SynthesisResponseWithCachingInformation {
    /** `true` when the response was taken from a cache, `false` when it requires a live synthesis. */
    abstract val isCached: Boolean

    /**
     * For cached items this will immediately return the result, while for uncached items,
     * only when this is called, the actual synthesis is performed.
     */
    abstract suspend fun getSynthesizeResponse(): SynthesizeResponse

    /** A response that is already available: [getSynthesizeResponse] just hands back [synthesizeResponse]. */
    internal class CachedSynthesisResponse(
        private val synthesizeResponse: SynthesizeResponse,
    ) : SynthesisResponseWithCachingInformation() {
        override val isCached: Boolean
            get() = true

        override suspend fun getSynthesizeResponse(): SynthesizeResponse {
            return synthesizeResponse
        }
    }

    /**
     * A response that is produced by invoking [getSynthesizeResponseFn].
     * NOTE: the function is invoked on every call of [getSynthesizeResponse] - the result is not memoized here.
     */
    internal class LiveSynthesisResponse(
        private val getSynthesizeResponseFn: suspend () -> SynthesizeResponse,
    ) : SynthesisResponseWithCachingInformation() {
        override val isCached: Boolean
            get() = false

        override suspend fun getSynthesizeResponse(): SynthesizeResponse {
            return getSynthesizeResponseFn.invoke()
        }
    }
}

/**
 * Wraps this [MediaSynthesisService] into a [SingleSpecsMediaSynthesisService] fixed to the given
 * [audioMediaFormat] and [voiceSpec], using a [NoOpMediaSynthesisCache] as its cache.
 *
 * NOTE: unlike the cached variants in this file, [audioMediaFormat] is non-nullable here.
 */
internal fun <
    JoinedSentencesType : ValueWithStringRepresentation,
    SentenceType : ValueWithStringRepresentation,
    > MediaSynthesisService.asSingleSpecsServiceUncached(
    audioMediaFormat: AudioMediaFormat,
    voiceSpec: VoiceSpec.VoiceSpecForMediaVoiceFromAudioServer,
    getFullTextOfSentences: (List<SentenceType>) -> JoinedTextWithMap<JoinedSentencesType, SentenceType>,
    splitSentenceByMaxChars: SplitItemByMaxSizeFn<SentenceType>,
    textToSpeechAudioContextInclusionOption: TextToSpeechAudioContextInclusionOption,
): SingleSpecsMediaSynthesisService<JoinedSentencesType, SentenceType> {
    val noOpCache: MediaSynthesisCache<JoinedSentencesType, SentenceType> = NoOpMediaSynthesisCache()
    return SingleSpecsMediaSynthesisService(
        audioMediaFormat = audioMediaFormat,
        voiceSpec = voiceSpec,
        getFullTextOfSentences = getFullTextOfSentences,
        splitSentenceByMaxChars = splitSentenceByMaxChars,
        mediaSynthesisService = this,
        mediaSynthesisCache = noOpCache,
        textToSpeechAudioContextInclusionOption = textToSpeechAudioContextInclusionOption,
    )
}

/**
 * Wraps this [MediaSynthesisService] into a [SingleSpecsMediaSynthesisService] fixed to the given
 * [audioMediaFormat] and [voiceSpec], using a fresh [MediaSynthesisCacheInMemory] as its cache.
 *
 * @param cacheCapacityInCharsOfText passed to [MediaSynthesisCacheInMemory] as its capacity.
 */
internal fun <
    JoinedSentencesType : ValueWithStringRepresentation,
    SentenceType : ValueWithStringRepresentation,
    > MediaSynthesisService.asSingleSpecsServiceCachedInMemory(
    cacheCapacityInCharsOfText: Int,
    audioMediaFormat: AudioMediaFormat?,
    voiceSpec: VoiceSpec.VoiceSpecForMediaVoiceFromAudioServer,
    getFullTextOfSentences: (List<SentenceType>) -> JoinedTextWithMap<JoinedSentencesType, SentenceType>,
    splitSentenceByMaxChars: SplitItemByMaxSizeFn<SentenceType>,
    textToSpeechAudioContextInclusionOption: TextToSpeechAudioContextInclusionOption,
): SingleSpecsMediaSynthesisService<JoinedSentencesType, SentenceType> {
    val inMemoryCache: MediaSynthesisCache<JoinedSentencesType, SentenceType> =
        MediaSynthesisCacheInMemory(cacheCapacityInCharsOfText)
    return SingleSpecsMediaSynthesisService(
        audioMediaFormat = audioMediaFormat,
        voiceSpec = voiceSpec,
        getFullTextOfSentences = getFullTextOfSentences,
        splitSentenceByMaxChars = splitSentenceByMaxChars,
        mediaSynthesisService = this,
        mediaSynthesisCache = inMemoryCache,
        textToSpeechAudioContextInclusionOption = textToSpeechAudioContextInclusionOption,
    )
}

/**
 * Wraps this [MediaSynthesisService] into a [SingleSpecsMediaSynthesisService] fixed to the given
 * [audioMediaFormat] and [voiceSpec], using a [MediaSynthesisCachePersistent] as its cache.
 *
 * @param speechifyURI passed to [MediaSynthesisCachePersistent], together with [voiceSpec] and [dbService].
 * @param dbService the database service handed to [MediaSynthesisCachePersistent].
 */
internal fun <
    JoinedSentencesType : ValueWithStringRepresentation,
    SentenceType : ValueWithStringRepresentation,
    > MediaSynthesisService.asSingleSpecsServiceCachedPersistently(
    speechifyURI: SpeechifyURI,
    audioMediaFormat: AudioMediaFormat?,
    voiceSpec: VoiceSpec.VoiceSpecForMediaVoiceFromAudioServer,
    getFullTextOfSentences: (List<SentenceType>) -> JoinedTextWithMap<JoinedSentencesType, SentenceType>,
    splitSentenceByMaxChars: SplitItemByMaxSizeFn<SentenceType>,
    dbService: DbService,
    textToSpeechAudioContextInclusionOption: TextToSpeechAudioContextInclusionOption,
): SingleSpecsMediaSynthesisService<JoinedSentencesType, SentenceType> {
    val persistentCache: MediaSynthesisCache<JoinedSentencesType, SentenceType> =
        MediaSynthesisCachePersistent(
            speechifyURI,
            voiceSpec,
            dbService,
        )
    return SingleSpecsMediaSynthesisService(
        audioMediaFormat = audioMediaFormat,
        voiceSpec = voiceSpec,
        getFullTextOfSentences = getFullTextOfSentences,
        splitSentenceByMaxChars = splitSentenceByMaxChars,
        mediaSynthesisService = this,
        mediaSynthesisCache = persistentCache,
        textToSpeechAudioContextInclusionOption = textToSpeechAudioContextInclusionOption,
    )
}

/**
 * The specs passed to [MediaSynthesisService.synthesize] along with the text.
 *
 * @property audioMediaFormat the audio format for the synthesis; nullable - NOTE(review): `null` presumably
 *  means "no specific format requested", confirm against the [MediaSynthesisService] implementations.
 * @property voiceSpec the voice to synthesize with.
 */
internal data class SynthesizeOptions(
    override val audioMediaFormat: AudioMediaFormat?,
    val voiceSpec: VoiceSpec.VoiceSpecForMediaVoiceFromAudioServer,
) : AudioMediaFormatOptions

/**
 * The outcome of a successful [MediaSynthesisService.synthesize] call.
 *
 * @property format the audio format of the synthesized media.
 * @property mediaUrl the location of the synthesized media, as a string - NOTE(review): presumably a URL that
 *  a player can load directly, confirm against the consumers.
 * @property speechMarks the [SpeechMarks] that correspond to the synthesized media.
 */
internal data class SynthesizeResponse(
    val format: AudioMediaFormat,
    val mediaUrl: String,
    val speechMarks: SpeechMarks,
)
