UP-Viagg-io/Viagg-io/Assets/afca/ViaggioAI/Scripts/AIServices/SpeechRecognitionService.cs

using System;
using Microsoft.CognitiveServices.Speech;
using Microsoft.CognitiveServices.Speech.Audio;
using UnityEngine;

#region Enums
public enum ESpeechRecognitionState
{
    StartingUp = 0,
    Ready = 10,
    Listening = 20,
    Disabled = 90,
}
#endregion
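
/// <summary>
/// Wraps the Azure Cognitive Services Speech SDK for single-shot speech recognition.
/// SDK callbacks arrive on background threads, so results are buffered in pending
/// fields and dispatched to subscribers from Update() on the Unity main thread.
/// </summary>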
public class SpeechRecognitionService : MonoBehaviour
{
    #region Inspector Properties
    [Header("Config Values")]
    [SerializeField]
    private bool debugModeIsActive;

    [SerializeField]
    private string speechKey;

    [SerializeField]
    private string speechRegion;

    [SerializeField]
    private string initialRecognitionLanguageCode; // Format: "de-CH"
    #endregion

    #region Public Properties

    #region SpeechRecognitionState
    private ESpeechRecognitionState _speechRecognitionState = ESpeechRecognitionState.StartingUp;
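
    /// <summary>
    /// Current state of the speech recognizer. Setting a new value raises
    /// OnSpeechRecognitionStateChangedEvent.
    /// </summary>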
    public ESpeechRecognitionState SpeechRecognitionState
    {
        get { return this._speechRecognitionState; }
        set
        {
            if (value != this._speechRecognitionState)
            {
                this.logIfInDebugMode($"SpeechRecognitionState changed, new value= {value}");
                this._speechRecognitionState = value;
                this.OnSpeechRecognitionStateChangedEvent?.Invoke(this, value);
            }
        }
    }

    public event EventHandler<ESpeechRecognitionState> OnSpeechRecognitionStateChangedEvent;
    #endregion

    #endregion

    #region Private Properties
    private object threadLocker = new object();
    private SpeechConfig speechConfig;

    // The fields below are written from Speech SDK callback threads and consumed
    // on the Unity main thread in doMainThreadTasks().
    private bool listeningRestartIsPending;
    private bool invokeUserSpeechInputStartedEventIsPending;
    private string pendingPartialTranscription;
    private string pendingFullTranscription;
    // Holds the final transcription for the UserSpeechInputEnded event; a non-null value means the invoke is pending.
    private string invokeUserSpeechInputEndedEventIsPending;
    #endregion

    #region Framework Functions
    void OnEnable()
    {
        this.initialize(this.initialRecognitionLanguageCode);
    }

    void Update()
    {
        this.doMainThreadTasks();
    }

    void OnDisable()
    {
        this.SpeechRecognitionState = ESpeechRecognitionState.Disabled;
    }
    #endregion

    #region Private Events
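    // These handlers are raised by the Speech SDK on background threads, so they only
    // record pending work; the corresponding public events are raised later from Update().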
    private void speechRecognizer_SpeechStartDetected(object sender, RecognitionEventArgs e)
    {
        if (this == null || this.SpeechRecognitionState != ESpeechRecognitionState.Listening)
        {
            // Listener was stopped before speech start was detected
            return;
        }

        // Set invoke pending flag (to later invoke on main thread)
        this.invokeUserSpeechInputStartedEventIsPending = true;
    }

    private void speechRecognizer_Recognizing(object sender, SpeechRecognitionEventArgs e)
    {
        if (this == null || this.SpeechRecognitionState != ESpeechRecognitionState.Listening)
        {
            // Listener was stopped before this partial result arrived
            return;
        }

        // Save pending partial transcription (to later invoke on main thread)
        this.pendingPartialTranscription = e?.Result?.Text;
    }
    #endregion

    #region Public Events
    public event EventHandler<bool> OnUserSpeechInputStartedEvent;
    public event EventHandler<string> OnPartialTranscriptionChangedEvent;
    public event EventHandler<string> OnFullTranscriptionChangedEvent;
    public event EventHandler<string> OnUserSpeechInputEndedEvent;
    #endregion

    #region Public Functions
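    /// <summary>
    /// Runs a single recognition pass against the default microphone.
    /// Results are not returned directly; they are surfaced through the public events
    /// (OnPartialTranscriptionChangedEvent, OnFullTranscriptionChangedEvent,
    /// OnUserSpeechInputEndedEvent) on the next Update() after recognition completes.
    /// </summary>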
    public async void StartListeningOnceAsync()
    {
        if (this.speechConfig == null)
        {
            Debug.LogError("Speech recognizer is not ready yet. Wait for initialization to finish before calling StartListeningOnceAsync.");
            return;
        }

        using (AudioConfig audioConfigInput = AudioConfig.FromDefaultMicrophoneInput())
        using (SpeechRecognizer recognizer = new SpeechRecognizer(this.speechConfig, audioConfigInput))
        {
            lock (this.threadLocker)
            {
                this.SpeechRecognitionState = ESpeechRecognitionState.Listening;
            }

            recognizer.SpeechStartDetected += this.speechRecognizer_SpeechStartDetected;
            recognizer.Recognizing += this.speechRecognizer_Recognizing;

            SpeechRecognitionResult result = await recognizer.RecognizeOnceAsync().ConfigureAwait(false);

            if (this.SpeechRecognitionState != ESpeechRecognitionState.Listening)
            {
                // Listener was stopped before recognition finished -> unsubscribe and return
                recognizer.SpeechStartDetected -= this.speechRecognizer_SpeechStartDetected;
                recognizer.Recognizing -= this.speechRecognizer_Recognizing;
                return;
            }

            // Check result
            string fullTranscription = null;
            if (result.Reason == ResultReason.RecognizedSpeech && !string.IsNullOrEmpty(result.Text))
            {
                fullTranscription = result.Text;
                this.invokeUserSpeechInputEndedEventIsPending = fullTranscription;
            }
            else if (result.Reason == ResultReason.NoMatch)
            {
                this.logIfInDebugMode("SpeechRecognitionService NoMatch: Speech could not be recognized.");
                this.listeningRestartIsPending = true;
            }
            else if (result.Reason == ResultReason.Canceled)
            {
                var cancellation = CancellationDetails.FromResult(result);
                this.logIfInDebugMode($"SpeechRecognitionService Canceled: Reason={cancellation.Reason} ErrorDetails={cancellation.ErrorDetails}");
            }

            lock (this.threadLocker)
            {
                this.SpeechRecognitionState = ESpeechRecognitionState.Ready;
                this.pendingFullTranscription = fullTranscription;
            }

            recognizer.SpeechStartDetected -= this.speechRecognizer_SpeechStartDetected;
            recognizer.Recognizing -= this.speechRecognizer_Recognizing;
        }
    }
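
    /// <summary>
    /// Switches the service back to Ready; a recognition pass that is still running
    /// will have its result discarded when it completes.
    /// </summary>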
    public void StopListening()
    {
        this.SpeechRecognitionState = ESpeechRecognitionState.Ready;
    }

    public void SetRecognitionLanguage(string languageCode)
    {
        this.initialize(languageCode);
    }
    #endregion

    #region Private Functions
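    // (Re)creates the speech configuration for the given recognition language (e.g. "de-CH")
    // and switches the service back to the Ready state.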
    private void initialize(string languageCode)
    {
        this.SpeechRecognitionState = ESpeechRecognitionState.StartingUp;
        this.speechConfig = SpeechConfig.FromSubscription(this.speechKey, this.speechRegion);
        this.speechConfig.SpeechRecognitionLanguage = languageCode;

        if (Microphone.devices.Length == 0)
        {
            Debug.LogError("SpeechRecognitionService: No microphone device found.");
            return;
        }

        // Is this hack still necessary? Starting the Unity microphone once appears to be a
        // workaround to make sure the default input device is available before the Speech SDK uses it.
        var audioClip = Microphone.Start(Microphone.devices[0], true, 200, 16000);
        this.logIfInDebugMode($"SpeechRecognitionService initialized using microphone: {Microphone.devices[0]}");

        this.SpeechRecognitionState = ESpeechRecognitionState.Ready;
    }
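
    // Called from Update(): dispatches results recorded by the Speech SDK callbacks
    // to subscribers on the Unity main thread.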
    private void doMainThreadTasks()
    {
        if (this.listeningRestartIsPending)
        {
            this.listeningRestartIsPending = false;
            this.StartListeningOnceAsync();
        }

        if (this.invokeUserSpeechInputStartedEventIsPending)
        {
            this.OnUserSpeechInputStartedEvent?.Invoke(this, true);
            this.invokeUserSpeechInputStartedEventIsPending = false;
        }

        if (this.pendingPartialTranscription != null)
        {
            this.OnPartialTranscriptionChangedEvent?.Invoke(this, this.pendingPartialTranscription);
            this.pendingPartialTranscription = null;
        }

        if (this.pendingFullTranscription != null)
        {
            this.OnFullTranscriptionChangedEvent?.Invoke(this, this.pendingFullTranscription);
            this.pendingFullTranscription = null;
        }

        if (this.invokeUserSpeechInputEndedEventIsPending != null)
        {
            this.OnUserSpeechInputEndedEvent?.Invoke(this, this.invokeUserSpeechInputEndedEventIsPending);
            this.invokeUserSpeechInputEndedEventIsPending = null;
        }
    }

    private void logIfInDebugMode(string message)
    {
        if (!this.debugModeIsActive)
        {
            return;
        }

        Debug.Log($"(SpeechRecognitionService) => {message}");
    }
    #endregion
}
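
/*
 Example usage (illustrative sketch only; "VoiceInputExample" and its wiring are hypothetical
 and not part of this project):

 public class VoiceInputExample : MonoBehaviour
 {
     [SerializeField]
     private SpeechRecognitionService speechRecognitionService;

     void OnEnable()
     {
         this.speechRecognitionService.OnPartialTranscriptionChangedEvent += (sender, text) => Debug.Log($"Partial: {text}");
         this.speechRecognitionService.OnFullTranscriptionChangedEvent += (sender, text) => Debug.Log($"Full: {text}");
     }

     public void OnTalkButtonPressed()
     {
         // Starts a single recognition pass; results arrive via the events above.
         this.speechRecognitionService.StartListeningOnceAsync();
     }
 }
*/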