@file:OptIn(ExperimentalSerializationApi::class)

package com.speechify.client.api.services.audio

import com.speechify.client.api.ClientConfig
import com.speechify.client.api.audio.AudioMediaFormat
import com.speechify.client.api.audio.SpeechMarksChunk
import com.speechify.client.api.audio.SpeechMarksImpl
import com.speechify.client.api.audio.SynthesizeOptions
import com.speechify.client.api.audio.SynthesizeResponse
import com.speechify.client.api.telemetry.withTelemetry
import com.speechify.client.api.util.Result
import com.speechify.client.api.util.successfully
import com.speechify.client.internal.http.HttpClient
import com.speechify.client.internal.http.parse
import com.speechify.client.internal.sync.AtomicBool
import com.speechify.client.internal.sync.swap
import com.speechify.client.internal.util.diagnostics.enriching.errorEnrichingWithTags
import com.speechify.client.internal.util.encodeToXmlTextNode
import com.speechify.client.internal.util.extensions.iterators.peekable
import com.speechify.client.internal.util.timeout.withTimeoutThrowingNonCancellationException
import com.speechify.client.internal.util.www.asDataUrl
import kotlinx.serialization.ExperimentalSerializationApi
import kotlinx.serialization.Serializable
import kotlinx.serialization.decodeFromByteArray
import kotlinx.serialization.protobuf.ProtoBuf
import kotlinx.serialization.protobuf.ProtoNumber
import kotlin.js.JsExport

/**
 * Settings consumed by [AudioServer].
 *
 * @property authorizationTokenProvider Supplies the authorization scheme and the token that [AudioServer]
 *  attaches to every synthesis request.
 * @property serviceVersionUrlPrefix Path segment placed between the platform audio service URL and
 *  `/synthesis/get`; it is also reported in telemetry as the audio server version. Defaults to `"v1"`.
 */
@JsExport
class AudioServerConfiguration(
    val authorizationTokenProvider: AuthorizationTokenProvider,
    val serviceVersionUrlPrefix: String = "v1",
)

/**
 * File-level flag, shared by every [AudioServer] instance, that starts as `true`.
 * `AudioServer.synthesize` swaps it to `false` when it fetches an authorization token, and uses a doubled
 * timeout for the single request that observed `true`.
 */
private val isFirstRequest = AtomicBool(true)

/**
 * [AudioService] implementation that talks to the Audio Server, following the spec outlined in the
 * [audio server docs](https://audio.docs.speechify.dev/).
 */
internal class AudioServer internal constructor(
    private val httpClient: HttpClient,
    private val clientConfig: ClientConfig,
    private val configuration: AudioServerConfiguration,
) : AudioService {
    private val authorizationTokenProvider = configuration.authorizationTokenProvider

    /**
     * Synthesizes [text] into audio plus speech marks.
     *
     * The text is wrapped into SSML (XML-escaping it on the way), sent to the `synthesis/get` endpoint together
     * with the voice parameters from [options], and the ProtoBuf response is converted to a [SynthesizeResponse]
     * whose speech mark indices are translated back to indices into the unescaped [text].
     *
     * @param text Plain text to synthesize.
     * @param precedingContext Optional text preceding [text]; forwarded to the server only for voices whose
     *  engine is `speechify`.
     * @param options Voice and audio format selection.
     * @return The synthesized audio on success, or the failure produced while parsing the HTTP response.
     */
    override suspend fun synthesize(
        text: String,
        precedingContext: String?,
        options: SynthesizeOptions,
    ): Result<SynthesizeResponse> = withTelemetry("AudioServer.synthesize") { diagnostics ->
        /* TODO - flatten `options` out: log and request from `SynthesizeOptions` directly */
        val serviceOptions = options.toAudioServiceSynthesizeOptions()
        val voice = serviceOptions.voiceParams
        val requestedFormat = serviceOptions.audioMediaFormat

        /* The version property can be removed when `LegacyAudioService` is removed and there are no other
         versions. */
        diagnostics.addProperty(TELEMETRY_PROP__AUDIO_SERVER_VERSION, configuration.serviceVersionUrlPrefix)
        diagnostics.addProperty("scheme", authorizationTokenProvider.scheme.name)
        diagnostics.addProperty("name", voice.name)
        diagnostics.addProperty("lang", voice.languageCode)
        diagnostics.addProperty("engine", voice.engine)
        diagnostics.addProperty("fmt", requestedFormat)
        diagnostics.addProperty("len", text.length)

        /* NOTE: `maxTotalSsmlCharactersCount` is not checked here, since nothing beyond the outer tags is added
         to the text at the moment. */

        /* The preceding context is passed to the audio server only if the voice supports the feature, and only
         voices with the `speechify` engine do. */
        val contextForServer = if (voice.engine == "speechify") precedingContext else null
        val ssml = Ssml.fromPlainString(
            string = text,
            precedingContext = contextForServer,
        ).toSsmlString()

        val scheme = authorizationTokenProvider.scheme

        /* Be a bit more generous for the first request. */
        val tokenTimeout = if (isFirstRequest.swap(newValue = false)) {
            SpeechSynthesisRequestTimeout * 2
        } else {
            SpeechSynthesisRequestTimeout
        }
        /* A non-CancellationException is thrown on timeout, because hitting this timeout is not expected, and is
         rather a result of software/network failure. */
        val token = withTimeoutThrowingNonCancellationException(
            tokenTimeout,
            actionName = "Get authorization token for AudioServer",
        ) {
            errorEnrichingWithTags("authorizationTokenProvider.getValidToken") {
                authorizationTokenProvider.getValidToken()
            }
        }

        val synthesisUrl =
            "${clientConfig.platformAudioServiceUrl}/${configuration.serviceVersionUrlPrefix}/synthesis/get"
        val requestBody = AudioServerRequest(
            ssml = ssml.text,
            voice = VoiceParamsV1.fromVoiceParams(voice),
            forcedAudioFormat = when (requestedFormat) {
                AudioMediaFormat.MP3 -> "mp3"
                AudioMediaFormat.OGG -> "ogg"
                null -> null
            },
        )

        val response = httpClient.post(
            url = synthesisUrl,
            /* A POST request is normally interpreted as non-idempotent
             ([standard](https://developer.mozilla.org/en-US/docs/Glossary/Idempotent#see_also)), so this hint is
             very much needed. */
            canRetryOnResponseNotReceived = true,
            isErrorResponseAllowingRetry = { errorResponse ->
                /* Retryable statuses as per prescription by Audio Server team
                 [here](https://speechifyworkspace.slack.com/archives/C03JLSQMBEJ/p1677604524061729?thread_ts=1676466650.908299&cid=C03JLSQMBEJ) */
                val statusCode = errorResponse.status.toInt()
                statusCode == 429 || statusCode == 500 || statusCode == 503
            },
            shouldAbortOnCoroutineCancellation = true,
            expectedResponseBodyByteCountBelow = defaultAudioServerExpectedResponseBodyByteCountBelow,
            telemetryEventBuilder = diagnostics,
        ) {
            auth(
                scheme = scheme.name,
                token = token,
            )
            bodyJson(requestBody)
        }

        // Log status code since non-200 will probably manifest as parsing failure below
        response.ifSuccessful { received -> diagnostics.addProperty("status", received.status.toString()) }

        val audioServerResponse = response
            .parse<AudioServerResponse>(ProtoBuf.Default::decodeFromByteArray)
            .orReturn { failure -> return@withTelemetry failure }

        val returnedFormatName = audioServerResponse.audioFormat
        val format = AudioMediaFormat.valueOf(returnedFormatName.uppercase())
        val mediaUrl = audioServerResponse.audioData.asDataUrl(mediaType = "audio/$returnedFormatName")

        /* If for any reason there are no more details than for the entire chunk, return at least that. */
        val entireContentChunk = audioServerResponse.speechMarks
        val serverSpeechMarks: List<AudioServerResponse.TextChunkToAudioChunkMapping> =
            if (entireContentChunk.chunks.isEmpty()) listOf(entireContentChunk) else entireContentChunk.chunks

        SynthesizeResponse(
            format = format,
            mediaUrl = mediaUrl,
            speechMarks = SpeechMarksImpl(ssml.changes.adjustSpeechMarks(serverSpeechMarks)),
        ).successfully()
    }

    /*
     Numbers as per Liam Dyer's [chat](https://speechifyworkspace.slack.com/archives/C03JLSQMBEJ/p1677612298351319?thread_ts=1677608320.875599&cid=C03JLSQMBEJ):
     V1: Text (Excluding SSML): 2000 characters SSML: 5000 characters
     (also to be added to [docs](https://audio.docs.speechify.dev/synthesis/overview.html))
     */
    override val maxTextCharactersCount = 2000
    private val maxTotalSsmlCharactersCount = 5000
}

/**
 * Value that `AudioServer.synthesize` passes as `expectedResponseBodyByteCountBelow` to the HTTP client:
 * 5 * 1024 * 1024 bytes.
 * NOTE(review): how the HTTP client acts on this bound is not visible in this file - confirm in `HttpClient.post`.
 */
internal const val defaultAudioServerExpectedResponseBodyByteCountBelow = (5 * 1024 * 1024) /* 5 MB */

/**
 * ProtoBuf payload returned by the Audio Server for a synthesis request.
 *
 * Internal for testing only, not for use outside this file.
 *
 * @property audioData Encoded audio bytes.
 * @property audioFormat Name of the audio format of [audioData]; it is used both to build the `audio/...` media
 *  type and (upper-cased) to look up the matching `AudioMediaFormat`.
 * @property speechMarks Mapping for the entire content, optionally broken down into finer [NestedChunk.chunks].
 */
@Serializable
internal class AudioServerResponse(
    @ProtoNumber(1) val audioData: ByteArray,
    @ProtoNumber(2) val audioFormat: String,
    @ProtoNumber(3) val speechMarks: NestedChunk,
) {
    /**
     * Common shape of a speech mark: a range of text mapped onto a range of audio.
     * See https://audio.docs.speechify.dev/synthesis/speech-marks.html#speech-marks
     */
    internal interface TextChunkToAudioChunkMapping {
        /** Inclusive index into the text where the chunk starts: the chunk's first character is at [start]. */
        val start: Int

        /** Exclusive index into the text where the chunk ends: the chunk's last character is at [end] - 1. */
        val end: Int

        /** Audio timestamp at which the chunk starts. */
        val startTime: Double

        /** Audio timestamp at which the chunk ends. */
        val endTime: Double
    }

    /**
     * Top-level speech mark, carrying the finer-grained [chunks] it is made of.
     * Every property has a default so that fields omitted from the wire still deserialize.
     */
    @Serializable
    data class NestedChunk(
        @ProtoNumber(1) val type: String? = "sentence",
        @ProtoNumber(2) override val startTime: Double = 0.0,
        @ProtoNumber(3) override val endTime: Double = 0.0,
        @ProtoNumber(4) override val start: Int = 0,
        @ProtoNumber(5) override val end: Int = 0,
        @ProtoNumber(6) val value: String = DEFAULT_FOR_CHUNK_VALUE_PROP,
        @ProtoNumber(7) val chunks: List<Chunk> = emptyList(),
    ) : TextChunkToAudioChunkMapping

    /**
     * Leaf speech mark contained in [NestedChunk.chunks].
     * Every property has a default so that fields omitted from the wire still deserialize.
     */
    @Serializable
    data class Chunk(
        @ProtoNumber(1) val type: String? = "word",
        @ProtoNumber(2) override val startTime: Double = 0.0,
        @ProtoNumber(3) override val endTime: Double = 0.0,
        @ProtoNumber(4) override val start: Int = 0,
        @ProtoNumber(5) override val end: Int = 0,
        @ProtoNumber(6) val value: String = DEFAULT_FOR_CHUNK_VALUE_PROP,
    ) : TextChunkToAudioChunkMapping

    companion object {
        /** Decodes [bytes] as a ProtoBuf-encoded [AudioServerResponse] using the default [ProtoBuf] format. */
        fun fromBytes(bytes: ByteArray): AudioServerResponse =
            ProtoBuf.Default.decodeFromByteArray<AudioServerResponse>(bytes)
    }
}

/**
 * Default used for the `value` property of the speech mark chunks. It is specified explicitly because:
 * 1. AudioServer was seen returning chunks that have an empty string in `value`.
 * 2. When that happened, the case fell into a Protobuf gotcha: a field [_"will be serialized to the wire unless
 *    it is the default value"_](https://protobuf.dev/programming-guides/proto3/), and a non-nullable string has
 *    the implicit rule [_"For strings, the default value is the empty string."_](https://protobuf.dev/programming-guides/proto3/#default),
 *    but Kotlin deserialization treats a missing field as an error unless a default is specified explicitly.
 *    The only mention of this in the Kotlin docs is
 *    [_"The corresponding Serializable class should match the ProtoBuf definition and should use the same default values:"_](https://kotlinlang.org/api/kotlinx.serialization/kotlinx-serialization-protobuf/kotlinx.serialization.protobuf/-proto-buf/),
 *    and because the string's default is implicit (so not present in the ProtoBuf definition), it is easy to miss.
 *
 * While the empty string may be preventable on the service side (it doesn't make sense there), this cannot be
 * guaranteed by the contract types, so it is best to make the SDK resilient to it, especially since the SDK
 * copes with `""` just fine for now.
 */
private const val DEFAULT_FOR_CHUNK_VALUE_PROP = ""

/**
 * Body of the synthesis request; `AudioServer.synthesize` sends it via `bodyJson`.
 *
 * @property ssml The SSML document to synthesize.
 * @property voice The voice to synthesize with.
 * @property forcedAudioFormat `"mp3"` or `"ogg"` when the caller asked for a specific audio format,
 *  `null` when no format was requested.
 */
@Serializable
private data class AudioServerRequest(
    val ssml: String,
    val voice: VoiceParamsV1,
    val forcedAudioFormat: String? = null,
)

/**
 * Voice selection as it is serialized into [AudioServerRequest].
 *
 * @property name Name of the voice.
 * @property engine Engine of the voice.
 * @property language Language code of the voice.
 */
@Serializable
private data class VoiceParamsV1(
    val name: String,
    val engine: String,
    val language: String,
) {
    companion object {
        /** Copies the name, engine and language code of [v] into the request representation. */
        fun fromVoiceParams(v: VoiceParams): VoiceParamsV1 {
            return VoiceParamsV1(
                name = v.name,
                engine = v.engine,
                language = v.languageCode,
            )
        }
    }
}

// internal for testing only, not for use outside this file
/**
 * Minimal SSML document model: a tree of nodes that can be rendered to an SSML string while recording how
 * XML escaping shifted the character indices of the text.
 */
internal sealed class Ssml(val children: List<Ssml>) {

    /**
     * Appends this node's SSML to [builder], registering in [changes] every escaping replacement that altered
     * the length of speakable text.
     */
    protected abstract fun appendToStringBuilder(builder: StringBuilder, changes: Changes)

    /** Renders this node and its descendants, returning the SSML text with the [Changes] recorded meanwhile. */
    fun toSsmlString(): SsmlString {
        val recordedChanges = Changes()
        val rendered = StringBuilder()
            .also { appendToStringBuilder(it, recordedChanges) }
            .toString()
        return SsmlString(rendered, recordedChanges)
    }

    override fun toString() = "${this::class.simpleName}(children=$children)"

    /** The `<speak>` root element wrapping its [children]. */
    class Speak(children: List<Ssml>) : Ssml(children) {
        // The audio server ignores tags when assigning indexes for `SpeechMarks`, so the tags themselves
        // do not need to be recorded in the [changes].
        override fun appendToStringBuilder(builder: StringBuilder, changes: Changes) {
            builder.append("<speak>")
            for (child in children) {
                child.appendToStringBuilder(builder, changes)
            }
            builder.append("</speak>")
        }
    }

    /** A run of plain text, XML-escaped when rendered. */
    class Text(val text: String) : Ssml(listOf()) {
        override fun appendToStringBuilder(builder: StringBuilder, changes: Changes) {
            val escaped = text.encodeToXmlTextNode(
                onReplace = { match, replacement ->
                    /* Record the data that allows translating indices back to the original ones, as per
                     #SpeechMarkIndicesApplyToTextPostXmlEscaping - see [Changes] for more. */
                    val lengthDelta = replacement.length - match.value.length
                    if (lengthDelta != 0) {
                        changes.addChange(match.range.first, lengthDelta)
                    }
                },
            )
            builder.append(escaped)
            changes.finishString(text.length)
        }

        override fun toString() = "Text(text='$text')"
    }

    /**
     * Renders a `<speechify:context>` element holding the (XML-escaped) preceding context.
     * See [Speechify Context](https://audio.docs.speechify.dev/synthesis/ssml.html#speechify-context) for details.
     */
    class Context(private val value: String) : Ssml(listOf()) {
        override fun appendToStringBuilder(builder: StringBuilder, changes: Changes) {
            builder
                .append("<speechify:context>")
                .append(value.encodeToXmlTextNode())
                .append("</speechify:context>")
        }
    }

    /** Result of [toSsmlString]: the rendered SSML [text] and the index [changes] introduced by escaping. */
    data class SsmlString(
        val text: String,
        val changes: Changes,
    )

    /**
     * Bookkeeping for the index shifts caused by escaping text for SSML.
     *
     * Sending text to the audio server as SSML implies escaping special characters as
     * per [encodeToXmlTextNode].
     * #SpeechMarkIndicesApplyToTextPostXmlEscaping - because [_"The speech marks from the audio server will use the
     * indices from the escaped values."_](https://audio.docs.speechify.dev/synthesis/ssml.html#coming-from-paragraphchunks)
     * the indices in the audio server response need to be translated back to the original ones.
     * TODO: Consider following up on [this request](https://speechifyworkspace.slack.com/archives/C03JLSQMBEJ/p1682415992687539)
     *  which responds to the docs' [_“The audio server cannot offer an option to do this transformation for you since text could already be escaped before inserting it into SSML (ex. text in HTML elements). However, if you can make this guarantee, you can request the feature in #audio-server on slack.”_](https://audio.docs.speechify.dev/synthesis/ssml.html#coming-from-paragraphchunks)
     *
     * Each [Change] stores the original index of a modification and by how much it changed the text length, so
     * that the indexes in the SpeechMarks can be readjusted.
     */
    data class Changes(private val _changes: MutableList<Change> = mutableListOf()) {
        /** Read-only view of the changes recorded so far. */
        val changes: List<Change> get() = _changes

        /** A replacement at original index [at] that changed the text length by [diff]; ordered by [at]. */
        data class Change(val at: Int, val diff: Int) : Comparable<Change> {
            override fun compareTo(other: Change): Int = at.compareTo(other.at)
        }

        // Text nodes are independent, so the total length of the already finished ones is kept here and every
        // new change is shifted by it.
        private var offset = 0

        /** Records a length change of [diff] at index [at] of the text node currently being rendered. */
        fun addChange(at: Int, diff: Int) {
            _changes += Change(at = at + offset, diff = diff)
        }

        /** Marks the current text node, of original length [len], as done and keeps the changes sorted. */
        fun finishString(len: Int) {
            offset += len
            _changes.sort()
        }
    }

    companion object {
        /**
         * Builds the SSML tree for a plain [string]: a [Speak] root containing an optional [Context] (only when
         * [precedingContext] is not `null`) followed by the [Text].
         */
        fun fromPlainString(
            string: String,
            precedingContext: String? = null,
        ): Ssml {
            val nodes: List<Ssml> = listOfNotNull(
                precedingContext?.let { Context(it) },
                Text(string),
            )
            return Speak(nodes)
        }
    }
}

/* Function made `internal` only for tests, and not intended for use outside this file */
/**
 * Translates speech marks whose indices refer to the XML-escaped text back to indices into the original text.
 *
 * The recorded changes and [chunks] are walked together in ascending order. For every chunk:
 * - all changes located at or before the chunk's start are consumed, so several escapes preceding a chunk
 *   are all accounted for;
 * - all changes located inside the chunk are consumed before its end is computed, so a chunk containing
 *   escaped characters (for example `AT&T`, or the single chunk spanning the entire content) does not end past
 *   its original text.
 *
 * Assumes [chunks] are ordered by their start and that no chunk boundary falls in the middle of a replacement.
 *
 * @param chunks Speech marks as returned by the audio server, indexed against the escaped text.
 * @return One [SpeechMarksChunk] per input chunk, in the same order, indexed against the original text.
 */
internal fun Ssml.Changes.adjustSpeechMarks(
    chunks: List<AudioServerResponse.TextChunkToAudioChunkMapping>,
): List<SpeechMarksChunk> {
    val changeIter = changes.iterator().peekable()

    // Sum of the length differences of all changes consumed so far, i.e. how far the escaped indices of the
    // chunk being processed are from the original ones.
    var offset = 0
    return chunks.map { chunk ->
        var realStart = chunk.start - offset
        while (true) {
            val change = changeIter.peekNext() ?: break
            // The change lies after the start of this chunk - leave it for the end adjustment below.
            if (realStart < change.at) break
            changeIter.next()
            offset += change.diff
            // A chunk starting exactly at the changed text keeps its start (the replacement belongs to the
            // chunk), otherwise the change precedes the chunk, so the start is shifted back too.
            if (realStart != change.at) {
                realStart -= change.diff
            }
        }

        var realEnd = chunk.end - offset
        while (true) {
            val change = changeIter.peekNext() ?: break
            // The end is exclusive, so a change starting at `realEnd` or later is not part of this chunk.
            if (realEnd <= change.at) break
            changeIter.next()
            offset += change.diff
            realEnd -= change.diff
        }

        SpeechMarksChunk(
            startCharacterIndex = realStart,
            endCharacterIndex = realEnd,
            // the audio server gives us doubles but the unit is milliseconds
            startTimeInMilliseconds = chunk.startTime.toInt(),
            endTimeInMilliseconds = chunk.endTime.toInt(),
        )
    }
}
