using System;
using System.Threading;
using Microsoft.CognitiveServices.Speech;
using Microsoft.CognitiveServices.Speech.Audio;
using UnityEngine;

#region Enums

/// <summary>
/// States reported by <see cref="SpeechRecognitionService.SpeechRecognitionState"/>.
/// </summary>
public enum ESpeechRecognitionState
{
    /// <summary>Set while the speech configuration is being (re)created.</summary>
    StartingUp = 0,

    /// <summary>Set after initialization, after a recognition finished, and by <c>StopListening</c>.</summary>
    Ready = 10,

    /// <summary>Set while a single-utterance recognition is running.</summary>
    Listening = 20,

    /// <summary>Set when the component is disabled.</summary>
    Disabled = 90,
}

#endregion

/// <summary>
/// Unity component that runs single-utterance speech recognition against the
/// default microphone and republishes the results as events.
/// </summary>
/// <remarks>
/// Results produced by the recognizer callbacks and by the awaited recognition
/// are stored in "pending" fields and dispatched from <c>Update</c>, so the
/// transcription, input-started/ended and error events are raised from Unity's
/// update loop.
/// NOTE(review): the state change to <see cref="ESpeechRecognitionState.Ready"/>
/// after a recognition is performed in the <c>ConfigureAwait(false)</c>
/// continuation, so <see cref="OnSpeechRecognitionStateChangedEvent"/> may be
/// raised off the main thread in that case - confirm subscribers tolerate this.
/// </remarks>
public class SpeechRecognitionService : MonoBehaviour
{
    #region Inspector Properties

    [Header("Config Values")]
    [SerializeField] private bool debugModeIsActive;
    [SerializeField] private string speechKey;
    [SerializeField] private string speechRegion;
    [SerializeField] private string initialRecognitionLanguageCode; // Format: "de-CH"

    [Header("Asset Objects")]
    [SerializeField] private AudioClip startedListeningClip;

    #endregion

    #region Public Properties

    #region SpeechRecognitionState

    private ESpeechRecognitionState _speechRecognitionState = ESpeechRecognitionState.StartingUp;

    /// <summary>
    /// Current recognition state. Assigning a different value logs the change (debug mode only)
    /// and raises <see cref="OnSpeechRecognitionStateChangedEvent"/>.
    /// </summary>
    public ESpeechRecognitionState SpeechRecognitionState
    {
        get { return this._speechRecognitionState; }
        set
        {
            if (value == this._speechRecognitionState)
            {
                return;
            }

            this.logIfInDebugMode($"SpeechRecognitionState changed, new value= {value}");
            this._speechRecognitionState = value;
            this.OnSpeechRecognitionStateChangedEvent?.Invoke(this, value);
        }
    }

    /// <summary>Raised with the new state whenever <see cref="SpeechRecognitionState"/> changes.</summary>
    public event EventHandler<ESpeechRecognitionState> OnSpeechRecognitionStateChangedEvent;

    #endregion

    #region PartialTranscription

    private string _partialTranscription = "";

    /// <summary>
    /// Latest intermediate transcription. Assigning a different value logs the change (debug mode only)
    /// and raises <see cref="OnPartialTranscriptionChangedEvent"/>.
    /// </summary>
    public string PartialTranscription
    {
        get { return this._partialTranscription; }
        set
        {
            if (value == this._partialTranscription)
            {
                return;
            }

            this.logIfInDebugMode($"PartialTranscription changed, new value= {value}");
            this._partialTranscription = value;
            this.OnPartialTranscriptionChangedEvent?.Invoke(this, value);
        }
    }

    /// <summary>Raised with the new text whenever <see cref="PartialTranscription"/> changes.</summary>
    public event EventHandler<string> OnPartialTranscriptionChangedEvent;

    #endregion

    #region FullTranscription

    private string _fullTranscription = "";

    /// <summary>
    /// Latest final transcription. Assigning a different value logs the change (debug mode only)
    /// and raises <see cref="OnFullTranscriptionChangedEvent"/>.
    /// </summary>
    public string FullTranscription
    {
        get { return this._fullTranscription; }
        set
        {
            if (value == this._fullTranscription)
            {
                return;
            }

            this.logIfInDebugMode($"FullTranscription changed, new value= {value}");
            this._fullTranscription = value;
            this.OnFullTranscriptionChangedEvent?.Invoke(this, value);
        }
    }

    /// <summary>Raised with the new text whenever <see cref="FullTranscription"/> changes.</summary>
    public event EventHandler<string> OnFullTranscriptionChangedEvent;

    #endregion

    #endregion

    #region Private Properties

    private ViaggioAIManager vaim
    {
        get { return ViaggioAIManager.Instance; }
    }

    private readonly object threadLocker = new object();
    private SpeechConfig speechConfig;

    // Values below are written by recognizer callbacks / the recognition continuation
    // and consumed by doMainThreadTasks() from Update().
    private bool listeningRestartIsPending;
    private bool invokeUserSpeechInputStartedEventIsPending;
    private string pendingPartialTranscription;
    private string pendingFullTranscription;
    private string pendingUserSpeechInputEndedTranscription;
    private string pendingRecoError;

    #endregion

    #region Framework Functions

    void OnEnable()
    {
        this.initialize(this.initialRecognitionLanguageCode);
    }

    void Update()
    {
        this.doMainThreadTasks();
    }

    void OnDisable()
    {
        this.listeningRestartIsPending = false;
        this.SpeechRecognitionState = ESpeechRecognitionState.Disabled;
    }

    #endregion

    #region Private Events

    /// <summary>
    /// Recognizer callback: flags that the "user speech input started" event should be raised
    /// on the next <c>Update</c>. Ignored when the service is no longer listening.
    /// </summary>
    private void speechRecognizer_SpeechStartDetected(object sender, RecognitionEventArgs e)
    {
        if (this == null || this.SpeechRecognitionState != ESpeechRecognitionState.Listening)
        {
            // Listener was stopped before speech start was detected
            return;
        }

        // Set invoke pending flag (to later invoke on main thread)
        this.invokeUserSpeechInputStartedEventIsPending = true;
    }

    /// <summary>
    /// Recognizer callback: stores the intermediate text so it is published on the next
    /// <c>Update</c>. Ignored when the service is no longer listening.
    /// </summary>
    private void speechRecognizer_Recognizing(object sender, SpeechRecognitionEventArgs e)
    {
        if (this == null || this.SpeechRecognitionState != ESpeechRecognitionState.Listening)
        {
            // Listener was stopped before this
            return;
        }

        // Save pending partial transcription (to later invoke on main thread)
        this.pendingPartialTranscription = e?.Result?.Text;
    }

    #endregion

    #region Public Events

    /// <summary>Raised (with <c>true</c>) from <c>Update</c> after the recognizer detected the start of speech.</summary>
    public event EventHandler<bool> OnUserSpeechInputStartedEvent;

    /// <summary>Raised from <c>Update</c> with the recognized text after a successful recognition.</summary>
    public event EventHandler<string> OnUserSpeechInputEndedEvent;

    /// <summary>Raised from <c>Update</c> with an error description when a recognition was canceled or failed.</summary>
    public event EventHandler<string> OnSpeechRecognitionErrorChangedEvent;

    #endregion

    #region Public Functions

    /// <summary>
    /// Starts one single-utterance recognition on the default microphone.
    /// </summary>
    /// <remarks>
    /// Sets the state to <see cref="ESpeechRecognitionState.Listening"/>, awaits the result and then:
    /// recognized non-empty text is queued as full transcription and "input ended" payload;
    /// <c>NoMatch</c> queues a restart; <c>Canceled</c> queues an error message. Afterwards the
    /// state returns to <see cref="ESpeechRecognitionState.Ready"/>. If the state is no longer
    /// <c>Listening</c> when the result arrives, the result is discarded.
    /// Being <c>async void</c>, failures cannot be observed by the caller; they are therefore
    /// logged, queued as a recognition error and the state is reset instead of being left at
    /// <c>Listening</c>.
    /// </remarks>
    public async void StartListeningOnceAsync()
    {
        if (this.speechConfig == null)
        {
            Debug.LogError("Speech recognizer is not ready yet. Wait for initialize before calling StartListeningOnceAsync");
            return;
        }

        try
        {
            using (AudioConfig audioConfigInput = AudioConfig.FromDefaultMicrophoneInput())
            using (SpeechRecognizer recognizer = new SpeechRecognizer(this.speechConfig, audioConfigInput))
            {
                lock (this.threadLocker)
                {
                    this.SpeechRecognitionState = ESpeechRecognitionState.Listening;
                }

                recognizer.SpeechStartDetected += this.speechRecognizer_SpeechStartDetected;
                recognizer.Recognizing += this.speechRecognizer_Recognizing;

                try
                {
                    // ConfigureAwait(false): the continuation is not marshalled back to the calling context,
                    // which is why results are handed over through the pending fields.
                    SpeechRecognitionResult result = await recognizer.RecognizeOnceAsync().ConfigureAwait(false);

                    if (this.SpeechRecognitionState != ESpeechRecognitionState.Listening)
                    {
                        // Listener was stopped before recognition finished -> discard the result
                        return;
                    }

                    // Check result
                    string fullTranscription = null;
                    if (result.Reason == ResultReason.RecognizedSpeech && !string.IsNullOrEmpty(result.Text))
                    {
                        fullTranscription = result.Text;
                        this.pendingUserSpeechInputEndedTranscription = fullTranscription;
                    }
                    else if (result.Reason == ResultReason.NoMatch)
                    {
                        this.logIfInDebugMode("SpeechIntentService NoMatch: Speech could not be recognized.");
                        this.listeningRestartIsPending = true;
                    }
                    else if (result.Reason == ResultReason.Canceled)
                    {
                        var cancellation = CancellationDetails.FromResult(result);
                        this.pendingRecoError = $"SpeechIntentService Canceled: Reason={cancellation.Reason} ErrorDetails={cancellation.ErrorDetails}";
                    }

                    lock (this.threadLocker)
                    {
                        this.SpeechRecognitionState = ESpeechRecognitionState.Ready;
                        this.pendingFullTranscription = fullTranscription;
                    }
                }
                finally
                {
                    // Always detach the handlers, whichever way the recognition ended.
                    recognizer.SpeechStartDetected -= this.speechRecognizer_SpeechStartDetected;
                    recognizer.Recognizing -= this.speechRecognizer_Recognizing;
                }
            }
        }
        catch (Exception ex)
        {
            // Top-level handler of an async void method: nothing upstream can observe the exception,
            // so log it, surface it through the error event and leave the Listening state.
            Debug.LogException(ex);
            this.pendingRecoError = $"SpeechRecognitionService Error: {ex.Message}";

            lock (this.threadLocker)
            {
                if (this.SpeechRecognitionState == ESpeechRecognitionState.Listening)
                {
                    this.SpeechRecognitionState = ESpeechRecognitionState.Ready;
                }
            }
        }
    }

    /// <summary>
    /// Cancels any queued restart and sets the state to <see cref="ESpeechRecognitionState.Ready"/>.
    /// </summary>
    /// <remarks>
    /// The recognizer itself is not cancelled here; a running recognition discards its result
    /// only if the state is still not <c>Listening</c> when that result arrives.
    /// </remarks>
    public void StopListening()
    {
        this.listeningRestartIsPending = false;
        this.SpeechRecognitionState = ESpeechRecognitionState.Ready;
    }

    /// <summary>Re-initializes the speech configuration for the given language.</summary>
    /// <param name="languageCode">Recognition language code, e.g. "de-CH".</param>
    public void SetRecognitionLanguage(string languageCode)
    {
        this.initialize(languageCode);
    }

    #endregion

    #region Private Functions

    /// <summary>
    /// Creates the speech configuration from the serialized key/region, applies the language,
    /// and moves the state from <c>StartingUp</c> to <c>Ready</c>.
    /// </summary>
    /// <param name="languageCode">Recognition language code, e.g. "de-CH".</param>
    private void initialize(string languageCode)
    {
        this.SpeechRecognitionState = ESpeechRecognitionState.StartingUp;

        this.speechConfig = SpeechConfig.FromSubscription(this.speechKey, this.speechRegion);
        this.speechConfig.SpeechRecognitionLanguage = languageCode;

        // Only used for the debug log below; the recognizer itself uses the default microphone input.
        string[] devices = Microphone.devices;
        string micName = (devices != null && devices.Length > 0) ? devices[0] : "";
        this.logIfInDebugMode($"SpeechRecognitionService initialized using microphone: {micName}");

        this.SpeechRecognitionState = ESpeechRecognitionState.Ready;
    }

    /// <summary>
    /// Dispatches everything queued in the pending fields. Called from <c>Update</c>.
    /// </summary>
    /// <remarks>
    /// Every pending value is taken and cleared before its subscribers run, so a value written
    /// concurrently is not lost and a throwing subscriber does not cause the same value to be
    /// re-dispatched on every subsequent frame.
    /// </remarks>
    private void doMainThreadTasks()
    {
        if (this.listeningRestartIsPending)
        {
            // Clear first: the new recognition may request another restart.
            this.listeningRestartIsPending = false;
            this.StartListeningOnceAsync();
        }

        if (this.invokeUserSpeechInputStartedEventIsPending)
        {
            this.invokeUserSpeechInputStartedEventIsPending = false;
            this.OnUserSpeechInputStartedEvent?.Invoke(this, true);
        }

        string partialTranscription = Interlocked.Exchange<string>(ref this.pendingPartialTranscription, null);
        if (partialTranscription != null)
        {
            this.PartialTranscription = partialTranscription;
        }

        string fullTranscription = Interlocked.Exchange<string>(ref this.pendingFullTranscription, null);
        if (fullTranscription != null)
        {
            this.FullTranscription = fullTranscription;
        }

        string inputEndedTranscription = Interlocked.Exchange<string>(ref this.pendingUserSpeechInputEndedTranscription, null);
        if (inputEndedTranscription != null)
        {
            this.OnUserSpeechInputEndedEvent?.Invoke(this, inputEndedTranscription);
        }

        string recoError = Interlocked.Exchange<string>(ref this.pendingRecoError, null);
        if (recoError != null)
        {
            this.OnSpeechRecognitionErrorChangedEvent?.Invoke(this, recoError);
            this.vaim.RaiseViaggioAIError(recoError);
        }
    }

    /// <summary>Writes <paramref name="message"/> to the Unity log when debug mode is enabled.</summary>
    private void logIfInDebugMode(string message)
    {
        if (!this.debugModeIsActive)
        {
            return;
        }

        Debug.Log($"(SpeechRecognitionService) => {message}");
    }

    #endregion
}