using System;
using Microsoft.CognitiveServices.Speech;
using Microsoft.CognitiveServices.Speech.Audio;
using UnityEngine;

#region Enums

/// <summary>
/// Lifecycle states reported by <see cref="SpeechRecognitionService"/>.
/// </summary>
public enum ESpeechRecognitionState
{
    StartingUp = 0,
    Ready = 10,
    Listening = 20,
    Disabled = 90,
}

#endregion

/// <summary>
/// Unity component that performs single-shot speech recognition with the
/// Microsoft Cognitive Services Speech SDK.
/// Recognition results are stored in "pending" fields and raised as events
/// from <c>Update</c> (see <c>doMainThreadTasks</c>).
/// </summary>
public class SpeechRecognitionService : MonoBehaviour
{
    #region Inspector Properties

    [Header("Config Values")]
    [SerializeField] private bool debugModeIsActive;
    [SerializeField] private string speechKey;
    [SerializeField] private string speechRegion;
    [SerializeField] private string initialRecognitionLanguageCode; // Format: "de-CH"

    #endregion

    #region Public Properties

    #region SpeechRecognitionState

    private ESpeechRecognitionState _speechRecognitionState = ESpeechRecognitionState.StartingUp;

    /// <summary>
    /// Current state of the service. Assigning a different value logs the change
    /// (debug mode only) and raises <see cref="OnSpeechRecognitionStateChangedEvent"/>.
    /// </summary>
    /// <remarks>
    /// The event is raised synchronously on whichever thread performs the assignment.
    /// NOTE(review): the assignment made after recognition finishes follows an await
    /// with ConfigureAwait(false), so it may not happen on the Unity main thread.
    /// </remarks>
    public ESpeechRecognitionState SpeechRecognitionState
    {
        get
        {
            return this._speechRecognitionState;
        }

        set
        {
            // Serialize all writers; the lock is re-entrant, so callers that already
            // hold it (to publish results together with a state change) are fine.
            lock (this.threadLocker)
            {
                if (value != this._speechRecognitionState)
                {
                    this.logIfInDebugMode($"SpeechRecognitionState changed, new value= {value}");
                    this._speechRecognitionState = value;
                    this.OnSpeechRecognitionStateChangedEvent?.Invoke(this, value);
                }
            }
        }
    }

    /// <summary>Raised whenever <see cref="SpeechRecognitionState"/> changes; the argument is the new state.</summary>
    public event EventHandler<ESpeechRecognitionState> OnSpeechRecognitionStateChangedEvent;

    #endregion

    #endregion

    #region Private Properties

    // Guards the state and every "pending" field below.
    private readonly object threadLocker = new object();

    private SpeechConfig speechConfig;

    // Values queued by recognition code and consumed by doMainThreadTasks().
    private bool listeningRestartIsPending;
    private bool invokeUserSpeechInputStartedEventIsPending;
    private string pendingPartialTranscription;
    private string pendingFullTranscription;

    // Despite the name this holds the recognized text (null = nothing pending).
    private string invokeUserSpeechInputEndedEventIsPending;

    #endregion

    #region Framework Functions

    void OnEnable()
    {
        this.initialize(this.initialRecognitionLanguageCode);
    }

    void Update()
    {
        this.doMainThreadTasks();
    }

    void OnDisable()
    {
        this.SpeechRecognitionState = ESpeechRecognitionState.Disabled;
    }

    #endregion

    #region Private Events

    /// <summary>
    /// Recognizer callback: queues <see cref="OnUserSpeechInputStartedEvent"/> for the next Update.
    /// </summary>
    private void speechRecognizer_SpeechStartDetected(object sender, RecognitionEventArgs e)
    {
        // "this == null" uses Unity's overloaded equality for UnityEngine.Object.
        if (this == null || this.SpeechRecognitionState != ESpeechRecognitionState.Listening)
        {
            // Listener was stopped before speech start was detected
            return;
        }

        // Set invoke pending flag (to later invoke on main thread)
        lock (this.threadLocker)
        {
            this.invokeUserSpeechInputStartedEventIsPending = true;
        }
    }

    /// <summary>
    /// Recognizer callback: stores the latest partial transcription for the next Update.
    /// </summary>
    private void speechRecognizer_Recognizing(object sender, SpeechRecognitionEventArgs e)
    {
        if (this == null || this.SpeechRecognitionState != ESpeechRecognitionState.Listening)
        {
            // Listener was stopped before this
            return;
        }

        // Save pending partial transcription (to later invoke on main thread)
        string partialTranscription = e?.Result?.Text;
        lock (this.threadLocker)
        {
            this.pendingPartialTranscription = partialTranscription;
        }
    }

    #endregion

    #region Public Events

    /// <summary>Raised from Update after the recognizer detected the start of speech. The argument is always true.</summary>
    public event EventHandler<bool> OnUserSpeechInputStartedEvent;

    /// <summary>Raised from Update with the most recent partial transcription.</summary>
    public event EventHandler<string> OnPartialTranscriptionChangedEvent;

    /// <summary>Raised from Update with the final transcription of a recognized utterance.</summary>
    public event EventHandler<string> OnFullTranscriptionChangedEvent;

    /// <summary>Raised from Update, after the full-transcription event, with the recognized text.</summary>
    public event EventHandler<string> OnUserSpeechInputEndedEvent;

    #endregion

    #region Public Functions

    /// <summary>
    /// Starts a single recognition on the default microphone and queues the outcome
    /// for delivery from Update. Logs an error and returns if the service has not
    /// been initialized. On a NoMatch result a restart is queued.
    /// </summary>
    /// <remarks>
    /// Kept as <c>async void</c> for caller compatibility; because such a method cannot
    /// be awaited, exceptions are caught and logged here instead of propagating.
    /// </remarks>
    public async void StartListeningOnceAsync()
    {
        if (this.speechConfig == null)
        {
            Debug.LogError("Speech recognizer is not ready yet. Wait for initialize before calling StartListeningOnceAsync");
            return;
        }

        bool enteredListeningState = false;

        try
        {
            using (AudioConfig audioConfigInput = AudioConfig.FromDefaultMicrophoneInput())
            using (SpeechRecognizer recognizer = new SpeechRecognizer(this.speechConfig, audioConfigInput))
            {
                this.SpeechRecognitionState = ESpeechRecognitionState.Listening;
                enteredListeningState = true;

                recognizer.SpeechStartDetected += this.speechRecognizer_SpeechStartDetected;
                recognizer.Recognizing += this.speechRecognizer_Recognizing;

                try
                {
                    SpeechRecognitionResult result = await recognizer.RecognizeOnceAsync().ConfigureAwait(false);
                    this.publishRecognitionResult(result);
                }
                finally
                {
                    // Always unsubscribe, including on the early-return and error paths.
                    recognizer.SpeechStartDetected -= this.speechRecognizer_SpeechStartDetected;
                    recognizer.Recognizing -= this.speechRecognizer_Recognizing;
                }
            }
        }
        catch (Exception exception)
        {
            Debug.LogException(exception);

            // Do not leave the service stuck in Listening after a failure.
            lock (this.threadLocker)
            {
                if (enteredListeningState && this.SpeechRecognitionState == ESpeechRecognitionState.Listening)
                {
                    this.SpeechRecognitionState = ESpeechRecognitionState.Ready;
                }
            }
        }
    }

    /// <summary>
    /// Marks the service as Ready so that the outcome of a running recognition is discarded.
    /// This only changes the state; it does not call into the recognizer.
    /// </summary>
    public void StopListening()
    {
        this.SpeechRecognitionState = ESpeechRecognitionState.Ready;
    }

    /// <summary>Re-initializes the service for the given language code (format: "de-CH").</summary>
    /// <param name="languageCode">Recognition language passed to the speech configuration.</param>
    public void SetRecognitionLanguage(string languageCode)
    {
        this.initialize(languageCode);
    }

    #endregion

    #region Private Functions

    /// <summary>
    /// Creates the speech configuration for <paramref name="languageCode"/>, starts the
    /// first Unity microphone device (if any) and switches the state to Ready.
    /// </summary>
    private void initialize(string languageCode)
    {
        this.SpeechRecognitionState = ESpeechRecognitionState.StartingUp;

        this.speechConfig = SpeechConfig.FromSubscription(this.speechKey, this.speechRegion);
        this.speechConfig.SpeechRecognitionLanguage = languageCode;

        // Read the device list once and guard against an empty list, which would
        // otherwise throw on the [0] access.
        string[] microphoneDevices = Microphone.devices;
        if (microphoneDevices != null && microphoneDevices.Length > 0)
        {
            // Is this hack still necessary?
            Microphone.Start(microphoneDevices[0], true, 200, 16000);
            this.logIfInDebugMode($"SpeechRecognitionService initialized using microphone: {microphoneDevices[0]}");
        }
        else
        {
            Debug.LogWarning("(SpeechRecognitionService) => No microphone device found, skipping microphone start.");
        }

        this.SpeechRecognitionState = ESpeechRecognitionState.Ready;
    }

    /// <summary>
    /// Evaluates a finished recognition and queues the resulting events.
    /// Does nothing if the service is no longer in the Listening state.
    /// </summary>
    private void publishRecognitionResult(SpeechRecognitionResult result)
    {
        if (this.SpeechRecognitionState != ESpeechRecognitionState.Listening)
        {
            // Listener was stopped before recognition finished
            return;
        }

        // Check result
        string fullTranscription = null;
        bool restartListening = false;

        if (result.Reason == ResultReason.RecognizedSpeech && !string.IsNullOrEmpty(result.Text))
        {
            fullTranscription = result.Text;
        }
        else if (result.Reason == ResultReason.NoMatch)
        {
            this.logIfInDebugMode("SpeechIntentService NoMatch: Speech could not be recognized.");
            restartListening = true;
        }
        else if (result.Reason == ResultReason.Canceled)
        {
            var cancellation = CancellationDetails.FromResult(result);
            this.logIfInDebugMode($"SpeechIntentService Canceled: Reason={cancellation.Reason} ErrorDetails={cancellation.ErrorDetails}");
        }

        // Publish the state change and all pending values as one unit, so Update never
        // sees a restart request or a transcription before the state is back to Ready.
        lock (this.threadLocker)
        {
            if (this.SpeechRecognitionState != ESpeechRecognitionState.Listening)
            {
                // Stopped while the result was being evaluated
                return;
            }

            this.SpeechRecognitionState = ESpeechRecognitionState.Ready;
            this.pendingFullTranscription = fullTranscription;

            if (fullTranscription != null)
            {
                this.invokeUserSpeechInputEndedEventIsPending = fullTranscription;
            }

            if (restartListening)
            {
                this.listeningRestartIsPending = true;
            }
        }
    }

    /// <summary>
    /// Called from Update: takes over all pending values and raises the matching events.
    /// </summary>
    private void doMainThreadTasks()
    {
        bool restartListening;
        bool speechInputStarted;
        string partialTranscription;
        string fullTranscription;
        string speechInputEndedTranscription;

        // Snapshot and clear under the lock so a value written while handlers are
        // running is kept for the next Update instead of being overwritten with null.
        lock (this.threadLocker)
        {
            restartListening = this.listeningRestartIsPending;
            this.listeningRestartIsPending = false;

            speechInputStarted = this.invokeUserSpeechInputStartedEventIsPending;
            this.invokeUserSpeechInputStartedEventIsPending = false;

            partialTranscription = this.pendingPartialTranscription;
            this.pendingPartialTranscription = null;

            fullTranscription = this.pendingFullTranscription;
            this.pendingFullTranscription = null;

            speechInputEndedTranscription = this.invokeUserSpeechInputEndedEventIsPending;
            this.invokeUserSpeechInputEndedEventIsPending = null;
        }

        // Handlers run outside the lock.
        if (restartListening)
        {
            this.StartListeningOnceAsync();
        }

        if (speechInputStarted)
        {
            this.OnUserSpeechInputStartedEvent?.Invoke(this, true);
        }

        if (partialTranscription != null)
        {
            this.OnPartialTranscriptionChangedEvent?.Invoke(this, partialTranscription);
        }

        if (fullTranscription != null)
        {
            this.OnFullTranscriptionChangedEvent?.Invoke(this, fullTranscription);
        }

        if (speechInputEndedTranscription != null)
        {
            this.OnUserSpeechInputEndedEvent?.Invoke(this, speechInputEndedTranscription);
        }
    }

    /// <summary>Writes <paramref name="message"/> to the Unity log when debug mode is enabled.</summary>
    private void logIfInDebugMode(string message)
    {
        if (!this.debugModeIsActive)
        {
            return;
        }

        Debug.Log($"(SpeechRecognitionService) => {message}");
    }

    #endregion
}