// SpeechRecognitionService.cs — Azure Cognitive Services speech-to-text wrapper for Unity.
using System;

using Microsoft.CognitiveServices.Speech;
using Microsoft.CognitiveServices.Speech.Audio;

using UnityEngine;


#region Enums

/// <summary>
/// Lifecycle states of the <c>SpeechRecognitionService</c>.
/// Numeric values are spaced in steps of 10 so intermediate states can be
/// inserted later without renumbering (values may be serialized by Unity).
/// </summary>
public enum ESpeechRecognitionState
{
    /// <summary>Service is initializing the speech config and microphone.</summary>
    StartingUp = 0,

    /// <summary>Initialized and idle; a listening session may be started.</summary>
    Ready = 10,

    /// <summary>A one-shot recognition session is currently in progress.</summary>
    Listening = 20,

    /// <summary>Component was disabled; no recognition will run.</summary>
    Disabled = 90,
}

#endregion

/// <summary>
/// Unity wrapper around the Azure Cognitive Services Speech SDK that performs
/// one-shot speech recognition on the default microphone.
///
/// Threading model: SDK callbacks and the continuation after
/// <c>ConfigureAwait(false)</c> run on background threads, so they only record
/// pending flags/payloads; <see cref="Update"/> drains those on the Unity main
/// thread via <see cref="doMainThreadTasks"/> and raises the public events there.
/// </summary>
public class SpeechRecognitionService : MonoBehaviour
{

    #region Inspector Properties

    [Header("Config Values")]
    [SerializeField]
    private bool debugModeIsActive;

    // NOTE(review): a subscription key in a serialized field ends up in scenes/builds;
    // consider loading it from secure configuration instead.
    [SerializeField]
    private string speechKey;

    [SerializeField]
    private string speechRegion;

    [SerializeField]
    private string initialRecognitionLanguageCode; // Format: "de-CH"

    #endregion

    #region Public Properties

    #region SpeechRecognitionState

    private ESpeechRecognitionState _speechRecognitionState = ESpeechRecognitionState.StartingUp;

    /// <summary>
    /// Current recognizer state. Assigning a different value raises
    /// <see cref="OnSpeechRecognitionStateChangedEvent"/>; assigning the same
    /// value is a no-op. NOTE: the setter can run on a background thread
    /// (after <c>ConfigureAwait(false)</c>), so subscribers of the state-changed
    /// event must not call Unity APIs directly.
    /// </summary>
    public ESpeechRecognitionState SpeechRecognitionState
    {
        get { return this._speechRecognitionState; }
        set
        {
            if (value != this._speechRecognitionState)
            {
                this.logIfInDebugMode($"SpeechRecognitionState changed, new value= {value}");

                this._speechRecognitionState = value;

                this.OnSpeechRecognitionStateChangedEvent?.Invoke(this, value);
            }
        }
    }

    /// <summary>Raised whenever <see cref="SpeechRecognitionState"/> changes (possibly off the main thread).</summary>
    public event EventHandler<ESpeechRecognitionState> OnSpeechRecognitionStateChangedEvent;

    #endregion

    #endregion

    #region Private Properties

    // Guards state transitions performed from the recognition task's background thread.
    private object threadLocker = new object();

    private SpeechConfig speechConfig;

    // The following flags/payloads are written by SDK callbacks (background threads)
    // and consumed once per frame on the main thread in doMainThreadTasks().
    private bool listeningRestartIsPending;

    private bool invokeUserSpeechInputStartedEventIsPending;

    private string pendingPartialTranscription;

    private string pendingFullTranscription;

    // FIX: was named 'invokeUserSpeechInputEndedEventIsPending' although it is a
    // string payload, not a bool flag — renamed for clarity (private, no callers affected).
    private string pendingUserSpeechInputEndedTranscription;

    #endregion

    #region Framework Functions

    void OnEnable()
    {
        this.initialize(this.initialRecognitionLanguageCode);
    }

    void Update()
    {
        this.doMainThreadTasks();
    }

    void OnDisable()
    {
        // Marks the service disabled; an in-flight RecognizeOnceAsync detects this
        // via the state check after awaiting and bails out.
        this.SpeechRecognitionState = ESpeechRecognitionState.Disabled;
    }

    #endregion

    #region Private Events

    /// <summary>
    /// SDK callback (background thread): speech onset detected.
    /// Only records a pending flag; the event is raised on the main thread.
    /// </summary>
    private void speechRecognizer_SpeechStartDetected(object sender, RecognitionEventArgs e)
    {
        // 'this == null' uses Unity's overloaded null check for destroyed objects.
        if (this == null || this.SpeechRecognitionState != ESpeechRecognitionState.Listening)
        {
            // Listener was stopped before speech start was detected
            return;
        }

        // Set invoke pending flag (to later invoke on main thread)
        this.invokeUserSpeechInputStartedEventIsPending = true;
    }

    /// <summary>
    /// SDK callback (background thread): partial (hypothesis) transcription update.
    /// Only records the payload; the event is raised on the main thread.
    /// </summary>
    private void speechRecognizer_Recognizing(object sender, SpeechRecognitionEventArgs e)
    {
        if (this == null || this.SpeechRecognitionState != ESpeechRecognitionState.Listening)
        {
            // Listener was stopped before this
            return;
        }

        // Save pending partial transcription (to later invoke on main thread)
        this.pendingPartialTranscription = e?.Result?.Text;
    }

    #endregion

    #region Public Events

    /// <summary>Raised (main thread) when the SDK detects the user started speaking.</summary>
    public event EventHandler<bool> OnUserSpeechInputStartedEvent;

    /// <summary>Raised (main thread) with each partial transcription hypothesis.</summary>
    public event EventHandler<string> OnPartialTranscriptionChangedEvent;

    /// <summary>Raised (main thread) with the final transcription of a session.</summary>
    public event EventHandler<string> OnFullTranscriptionChangedEvent;

    /// <summary>Raised (main thread) when a session ends with recognized speech.</summary>
    public event EventHandler<string> OnUserSpeechInputEndedEvent;

    #endregion

    #region Public Functions

    /// <summary>
    /// Runs one recognition pass against the default microphone and publishes the
    /// result through the pending fields drained in <see cref="Update"/>.
    /// 'async void' is deliberate: Unity callers invoke this fire-and-forget
    /// (e.g. from UI callbacks); errors are surfaced via logging, not exceptions.
    /// </summary>
    public async void StartListeningOnceAsync()
    {
        if (this.speechConfig == null)
        {
            Debug.LogError("Speech recognizer is not ready yet. Wait for initialize before calling StartListeningOnceAsync");
            return;
        }

        // FIX: the AudioConfig was previously created but neither passed to the
        // recognizer nor disposed (IDisposable leak). It now participates in the
        // recognizer construction and is disposed deterministically. Behavior is
        // unchanged: a recognizer without an explicit AudioConfig uses the default
        // microphone anyway.
        using (AudioConfig audioConfigInput = AudioConfig.FromDefaultMicrophoneInput())
        using (SpeechRecognizer recognizer = new SpeechRecognizer(this.speechConfig, audioConfigInput))
        {
            lock (this.threadLocker)
            {
                this.SpeechRecognitionState = ESpeechRecognitionState.Listening;
            }

            recognizer.SpeechStartDetected += this.speechRecognizer_SpeechStartDetected;
            recognizer.Recognizing += this.speechRecognizer_Recognizing;

            try
            {
                // After this await we are on a thread-pool thread (no Unity API calls!).
                SpeechRecognitionResult result = await recognizer.RecognizeOnceAsync().ConfigureAwait(false);

                if (this.SpeechRecognitionState != ESpeechRecognitionState.Listening)
                {
                    // Listener was stopped (StopListening/OnDisable) before recognition finished.
                    return;
                }

                // Check result
                string fullTranscription = null;

                if (result.Reason == ResultReason.RecognizedSpeech && !string.IsNullOrEmpty(result.Text))
                {
                    fullTranscription = result.Text;
                    this.pendingUserSpeechInputEndedTranscription = fullTranscription;
                }
                else if (result.Reason == ResultReason.NoMatch)
                {
                    this.logIfInDebugMode("SpeechIntentService NoMatch: Speech could not be recognized.");

                    // Restart must happen on the main thread — flag it for doMainThreadTasks().
                    this.listeningRestartIsPending = true;
                }
                else if (result.Reason == ResultReason.Canceled)
                {
                    var cancellation = CancellationDetails.FromResult(result);
                    this.logIfInDebugMode($"SpeechIntentService Canceled: Reason={cancellation.Reason} ErrorDetails={cancellation.ErrorDetails}");
                }

                lock (this.threadLocker)
                {
                    this.SpeechRecognitionState = ESpeechRecognitionState.Ready;

                    this.pendingFullTranscription = fullTranscription;
                }
            }
            finally
            {
                // FIX: unsubscription was duplicated on two exit paths and skipped
                // entirely if RecognizeOnceAsync threw; 'finally' guarantees it runs
                // exactly once on every path.
                recognizer.SpeechStartDetected -= this.speechRecognizer_SpeechStartDetected;
                recognizer.Recognizing -= this.speechRecognizer_Recognizing;
            }
        }
    }

    /// <summary>
    /// Stops the current listening session by flipping the state back to Ready;
    /// the in-flight recognition task observes this and discards its result.
    /// </summary>
    public void StopListening()
    {
        this.SpeechRecognitionState = ESpeechRecognitionState.Ready;
    }

    /// <summary>
    /// Re-initializes the service with a new recognition language (format "de-CH").
    /// </summary>
    public void SetRecognitionLanguage(string languageCode)
    {
        this.initialize(languageCode);
    }

    #endregion

    #region Private Functions

    /// <summary>
    /// Builds the SpeechConfig and warms up the microphone, then transitions to Ready.
    /// Transitions to Disabled when no microphone device is available.
    /// </summary>
    private void initialize(string languageCode)
    {
        this.SpeechRecognitionState = ESpeechRecognitionState.StartingUp;

        this.speechConfig = SpeechConfig.FromSubscription(this.speechKey, this.speechRegion);
        this.speechConfig.SpeechRecognitionLanguage = languageCode;

        // FIX: previously indexed Microphone.devices[0] unconditionally, which throws
        // IndexOutOfRangeException on machines without a microphone.
        if (Microphone.devices.Length == 0)
        {
            Debug.LogError("SpeechRecognitionService: no microphone device found; speech recognition disabled.");
            this.SpeechRecognitionState = ESpeechRecognitionState.Disabled;
            return;
        }

        // Workaround kept from the original ("Is this hack still necessary?"):
        // opening the Unity microphone once appears to trigger the OS mic-permission
        // prompt / device warm-up needed by the Speech SDK. The returned AudioClip
        // was never used, so it is no longer assigned.
        Microphone.Start(Microphone.devices[0], true, 200, 16000);

        this.logIfInDebugMode($"SpeechRecognitionService initialized using microphone: {Microphone.devices[0]}");

        this.SpeechRecognitionState = ESpeechRecognitionState.Ready;
    }

    /// <summary>
    /// Runs every frame on the Unity main thread: drains the pending flags/payloads
    /// set by background threads and raises the public events there.
    /// </summary>
    private void doMainThreadTasks()
    {
        if (this.listeningRestartIsPending)
        {
            this.StartListeningOnceAsync();
            this.listeningRestartIsPending = false;
        }

        if (this.invokeUserSpeechInputStartedEventIsPending)
        {
            this.OnUserSpeechInputStartedEvent?.Invoke(this, true);
            this.invokeUserSpeechInputStartedEventIsPending = false;
        }

        if (this.pendingPartialTranscription != null)
        {
            this.OnPartialTranscriptionChangedEvent?.Invoke(this, this.pendingPartialTranscription);
            this.pendingPartialTranscription = null;
        }

        if (this.pendingFullTranscription != null)
        {
            this.OnFullTranscriptionChangedEvent?.Invoke(this, this.pendingFullTranscription);
            this.pendingFullTranscription = null;
        }

        if (this.pendingUserSpeechInputEndedTranscription != null)
        {
            this.OnUserSpeechInputEndedEvent?.Invoke(this, this.pendingUserSpeechInputEndedTranscription);
            this.pendingUserSpeechInputEndedTranscription = null;
        }
    }

    /// <summary>Logs to the Unity console only when debug mode is enabled in the inspector.</summary>
    private void logIfInDebugMode(string message)
    {
        if (!this.debugModeIsActive)
        {
            return;
        }

        Debug.Log($"(SpeechRecognitionService) => {message}");
    }

    #endregion

}