UP-Viagg-io/Viagg-io/Assets/afca/ViaggioAI/Scripts/AIServices/SpeechSynthesizerService.cs

using System;
using System.Collections.Generic;
using System.Linq;
using System.Threading.Tasks;
using Microsoft.CognitiveServices.Speech;
using Microsoft.CognitiveServices.Speech.Audio;
using UnityEngine;

#region Enums

/// <summary>
/// Lifecycle states published by <c>SpeechSynthesizerService.SpeechSynthesizerState</c>.
/// The numeric values are explicit and spaced in steps of ten.
/// </summary>
public enum ESpeechSynthesizerState
{
    /// <summary>Initial value of the service; also set while the synthesizer is being (re)created.</summary>
    StartingUp = 0,

    /// <summary>Set once initialization has finished and again after every synthesis run.</summary>
    Ready = 10,

    /// <summary>Set while a synthesis run with output type <c>PlayDirectly</c> is in progress.</summary>
    Speaking = 20,

    /// <summary>Set while a synthesis run with output type <c>GenerateAudioClip</c> is in progress.</summary>
    GeneratingClip = 30,

    /// <summary>Set after the underlying synthesizer has been disposed.</summary>
    Disabled = 90,
}

/// <summary>
/// Selects what <c>SpeechSynthesizerService</c> does with the synthesized audio.
/// </summary>
public enum ESpeechOutputType
{
    /// <summary>The synthesizer is created with the default speaker output, so audio is played immediately.</summary>
    PlayDirectly = 0,

    /// <summary>
    /// The synthesizer is created without an audio output and with a RIFF 24 kHz 16-bit mono PCM
    /// output format; converting the result into an AudioClip is still marked as a TODO in the service.
    /// </summary>
    GenerateAudioClip = 10,
}

#endregion

/// <summary>
/// Unity component that wraps the Azure Cognitive Services <see cref="SpeechSynthesizer"/>.
/// It lazily (re)creates the synthesizer whenever the requested language or voice changes,
/// publishes its lifecycle through <see cref="SpeechSynthesizerState"/> and raises events when
/// direct speech output starts and ends.
/// </summary>
public class SpeechSynthesizerService : MonoBehaviour
{

    #region Inspector Properties

    // NOTE: the names of the serialized fields below are part of the Unity serialization
    // contract; renaming them would drop the values configured in scenes and prefabs.

    [Header("Config Values")]
    [SerializeField]
    private bool debugModeIsActive;

    [SerializeField]
    private ESpeechOutputType speechOutputType;

    [SerializeField]
    private string speechKey;

    [SerializeField]
    private string speechRegion;

    // Currently not read anywhere in this class.
    [SerializeField]
    [Tooltip("Format: 'de-CH'")]
    private string initialSynthesisLanguage;

    [SerializeField]
    [Tooltip("Voice gallery: https://speech.microsoft.com/portal/voicegallery")]
    private List<LanguageVoice> defaultSynthesisVoices;

    [SerializeField]
    private string fallbackMultilingualSynthesisVoiceName;

    #endregion

    #region Public Properties

    #region SpeechSynthesizerState

    private ESpeechSynthesizerState _speechSynthesizerState = ESpeechSynthesizerState.StartingUp;

    /// <summary>
    /// Current lifecycle state of the service. Assigning a different value logs the change
    /// (debug mode only) and raises <see cref="OnSpeechSynthesizerStateChangedEvent"/>;
    /// assigning the current value is a no-op.
    /// </summary>
    public ESpeechSynthesizerState SpeechSynthesizerState
    {
        get { return this._speechSynthesizerState; }
        set
        {
            if (value == this._speechSynthesizerState)
            {
                return;
            }

            this.logIfInDebugMode($"SpeechSynthesizerState changed, new value= {value}");

            this._speechSynthesizerState = value;

            this.OnSpeechSynthesizerStateChangedEvent?.Invoke(this, value);
        }
    }

    /// <summary>Raised with the new state whenever <see cref="SpeechSynthesizerState"/> changes.</summary>
    public event EventHandler<ESpeechSynthesizerState> OnSpeechSynthesizerStateChangedEvent;

    #endregion

    #region GeneratedAudioClip

    private AudioClip _generatedAudioClip = null;

    /// <summary>
    /// Most recently generated clip. Assigning a different value logs the change (debug mode only)
    /// and raises <see cref="OnGeneratedAudioClipChangedEvent"/>. This class does not assign it yet;
    /// the conversion of synthesized audio into a clip is still a TODO in <see cref="Synthesize"/>.
    /// </summary>
    public AudioClip GeneratedAudioClip
    {
        get { return this._generatedAudioClip; }
        set
        {
            if (value == this._generatedAudioClip)
            {
                return;
            }

            this.logIfInDebugMode($"GeneratedAudioClip changed, new value= {value}");

            this._generatedAudioClip = value;

            this.OnGeneratedAudioClipChangedEvent?.Invoke(this, value);
        }
    }

    /// <summary>Raised with the new clip whenever <see cref="GeneratedAudioClip"/> changes.</summary>
    public event EventHandler<AudioClip> OnGeneratedAudioClipChangedEvent;

    #endregion

    #endregion

    #region Private Properties

    private SpeechSynthesizer speechSynthesizer;
    private SpeechConfig speechConfig;

    // Speaker output handed to the synthesizer in PlayDirectly mode. Kept in a field so it can be
    // disposed together with the synthesizer instead of being leaked on every re-initialization.
    private AudioConfig audioConfigOutput;

    // Language / voice the current synthesizer was created with; null while no synthesizer exists.
    private string activeLanguage = null;
    private string activeVoice = null;

    #endregion

    #region Framework Functions

    /// <summary>Unity message: releases the synthesizer when the component gets disabled.</summary>
    async void OnDisable()
    {
        Debug.Log("SpeechSynthesizerService disposing speechSynthesizer from OnDisable");
        await this.disposeSynthesizer();
    }

    #endregion

    #region Private Events

    #endregion

    #region Public Events

    /// <summary>Raised (with <c>true</c>) right before direct speech output is started.</summary>
    public event EventHandler<bool> OnSpeechOutputStartedEvent;

    /// <summary>
    /// Raised (with <c>true</c>) once a direct speech output run has ended, including runs that
    /// were stopped or failed, so every started event is paired with an ended event.
    /// </summary>
    public event EventHandler<bool> OnSpeechOutputEndedEvent;

    #endregion

    #region Public Functions

    /// <summary>
    /// Synthesizes <paramref name="text"/>. An utterance that is still being spoken is stopped
    /// first, and the synthesizer is re-created when the language or voice differs from the
    /// active one. Fire-and-forget: the method returns at its first await.
    /// </summary>
    /// <param name="text">Text to synthesize.</param>
    /// <param name="languageCode">Synthesis language, e.g. 'de-CH'.</param>
    /// <param name="voiceNameOverride">
    /// Optional voice name; when null the configured default voice for the language is used
    /// whenever the synthesizer has to be (re)created.
    /// </param>
    public async void Synthesize(string text, string languageCode, string voiceNameOverride = null)
    {
        // Interrupt an utterance that is still playing before starting the new one.
        if (this.SpeechSynthesizerState == ESpeechSynthesizerState.Speaking && this.speechSynthesizer != null)
        {
            await this.speechSynthesizer.StopSpeakingAsync();
        }

        await this.reInitializeIfNecessary(languageCode, voiceNameOverride);

        SpeechSynthesizer synthesizer = this.speechSynthesizer;

        if (synthesizer == null)
        {
            // Happens e.g. when languageCode is null on the very first call, so that no
            // (re)initialization was triggered. Bail out instead of dereferencing null.
            Debug.LogError("(SpeechSynthesizerService) => Synthesize called but no speech synthesizer is available");
            return;
        }

        // Evaluate the output type once so the start/end handling below stays paired even if
        // the inspector value is changed while the synthesis is in flight.
        bool playsDirectly = this.speechOutputType == ESpeechOutputType.PlayDirectly;
        bool generatesClip = this.speechOutputType == ESpeechOutputType.GenerateAudioClip;

        if (generatesClip)
        {
            this.SpeechSynthesizerState = ESpeechSynthesizerState.GeneratingClip;
        }
        else if (playsDirectly)
        {
            this.SpeechSynthesizerState = ESpeechSynthesizerState.Speaking;

            this.OnSpeechOutputStartedEvent?.Invoke(this, true);
        }

        try
        {
            // The result owns native resources and must be disposed.
            using (SpeechSynthesisResult synthesisResult = await synthesizer.SpeakTextAsync(text))
            {
                this.logIfInDebugMode($"Synthesis finished, reason= {synthesisResult.Reason}");

                if (generatesClip)
                {
                    // Todo add WavUtility
                    // this.GeneratedAudioClip = WavUtility.ToAudioClip(synthesisResult.AudioData);
                }
            }
        }
        finally
        {
            // Always leave the busy state, even when synthesis threw. disposeSynthesizer() polls
            // for the Speaking state to end and would otherwise wait forever.
            if (playsDirectly)
            {
                this.OnSpeechOutputEndedEvent?.Invoke(this, true);
            }

            this.SpeechSynthesizerState = ESpeechSynthesizerState.Ready;
        }
    }

    #endregion

    #region Private Functions

    /// <summary>
    /// Disposes an existing synthesizer (if any) and creates a new one for the given language
    /// and voice, configured according to the selected output type.
    /// </summary>
    /// <param name="languageCode">Synthesis language to configure.</param>
    /// <param name="voiceName">Synthesis voice to configure.</param>
    private async Task initialize(string languageCode, string voiceName)
    {
        this.logIfInDebugMode($"SpeechSynthesizerService initializing using language={languageCode} and voice={voiceName}");

        if (this.speechSynthesizer != null)
        {
            Debug.Log("SpeechSynthesizerService disposing speechSynthesizer from initialize");
            await this.disposeSynthesizer();
        }

        this.SpeechSynthesizerState = ESpeechSynthesizerState.StartingUp;

        this.speechConfig = SpeechConfig.FromSubscription(this.speechKey, this.speechRegion);
        this.speechConfig.SpeechSynthesisLanguage = languageCode;
        this.speechConfig.SpeechSynthesisVoiceName = voiceName;

        if (this.speechOutputType == ESpeechOutputType.PlayDirectly)
        {
            // Only create the speaker output when it is actually used; it is released again
            // in disposeSynthesizer().
            this.audioConfigOutput = AudioConfig.FromDefaultSpeakerOutput();
            this.speechSynthesizer = new SpeechSynthesizer(this.speechConfig, this.audioConfigOutput);
        }
        else if (this.speechOutputType == ESpeechOutputType.GenerateAudioClip)
        {
            this.speechConfig.SetSpeechSynthesisOutputFormat(SpeechSynthesisOutputFormat.Riff24Khz16BitMonoPcm);

            // No audio output device: the audio is only available through the synthesis result.
            AudioConfig noAudioOutput = null;
            this.speechSynthesizer = new SpeechSynthesizer(this.speechConfig, noAudioOutput);
        }

        this.activeLanguage = languageCode;
        this.activeVoice = voiceName;

        this.logIfInDebugMode($"SpeechSynthesizerService initialized using language={languageCode} and voice={voiceName}");

        this.SpeechSynthesizerState = ESpeechSynthesizerState.Ready;
    }

    /// <summary>
    /// Returns the configured default voice for <paramref name="language"/>, or the multilingual
    /// fallback voice when no entry with a non-empty voice name exists for that language.
    /// </summary>
    private string getDefaultVoiceForLanguage(string language)
    {
        // The null-conditional guards against the list not having been set up in the inspector.
        LanguageVoice? languageSpecificVoice = this.defaultSynthesisVoices?.FirstOrDefault(lv => lv.LanguageCode == language);

        string languageSpecificVoiceName = languageSpecificVoice?.VoiceName;

        return string.IsNullOrEmpty(languageSpecificVoiceName)
            ? this.fallbackMultilingualSynthesisVoiceName
            : languageSpecificVoiceName;
    }

    /// <summary>
    /// Re-creates the synthesizer when the requested language differs from the active one, or
    /// when an explicit voice override differs from the active voice. Does nothing otherwise.
    /// </summary>
    private async Task reInitializeIfNecessary(string languageCode, string voiceNameOverride)
    {
        bool languageChanged = languageCode != this.activeLanguage;

        // The override has to be compared with the active *voice*. Comparing it with the active
        // language (as before) was practically always unequal and forced a full teardown and
        // rebuild of the synthesizer on every call that passed a voice override.
        bool voiceChanged = voiceNameOverride != null && voiceNameOverride != this.activeVoice;

        if (!languageChanged && !voiceChanged)
        {
            return;
        }

        // No specific voice defined -> get default voice for this language
        string voiceName = voiceNameOverride ?? this.getDefaultVoiceForLanguage(languageCode);

        await this.initialize(languageCode, voiceName);
    }

    /// <summary>Writes <paramref name="message"/> to the Unity console when debug mode is enabled.</summary>
    private void logIfInDebugMode(string message)
    {
        if (!this.debugModeIsActive)
        {
            return;
        }

        Debug.Log($"(SpeechSynthesizerService) => {message}");
    }

    /// <summary>
    /// Stops any ongoing speech, waits until the pending <see cref="Synthesize"/> run has left
    /// the Speaking state, then disposes the synthesizer and its audio output and switches the
    /// service to <see cref="ESpeechSynthesizerState.Disabled"/>. No-op without a synthesizer.
    /// </summary>
    private async Task disposeSynthesizer()
    {
        if (this.speechSynthesizer == null)
        {
            return;
        }

        // Make sure speaking isn't active anymore, otherwise dispose will throw exception
        await this.speechSynthesizer.StopSpeakingAsync();

        // The Speaking state is left by the continuation of the interrupted Synthesize() call.
        while (this.SpeechSynthesizerState == ESpeechSynthesizerState.Speaking)
        {
            await Task.Delay(10);
        }

        this.activeLanguage = null;
        this.activeVoice = null;

        this.speechSynthesizer.Dispose();
        this.speechSynthesizer = null;

        // Release the speaker output only after the synthesizer that used it is gone.
        this.audioConfigOutput?.Dispose();
        this.audioConfigOutput = null;

        this.SpeechSynthesizerState = ESpeechSynthesizerState.Disabled;
    }

    #endregion

}