// SpeechRecognitionService.cs — Azure Cognitive Services speech-to-text wrapper for Unity.
using System;

using Microsoft.CognitiveServices.Speech;
using Microsoft.CognitiveServices.Speech.Audio;

using UnityEngine;


#region Enums

/// <summary>
/// Lifecycle states of the <c>SpeechRecognitionService</c>.
/// Numeric values are spaced in steps of 10 so intermediate states can be
/// inserted later without renumbering (values may be serialized by Unity).
/// </summary>
public enum ESpeechRecognitionState
{
    /// <summary>Service is initializing the speech config and microphone.</summary>
    StartingUp = 0,

    /// <summary>Initialized and idle; a listening session may be started.</summary>
    Ready = 10,

    /// <summary>A one-shot recognition session is currently in progress.</summary>
    Listening = 20,

    /// <summary>Component was disabled; no recognition will run.</summary>
    Disabled = 90,
}

#endregion

/// <summary>
/// Unity wrapper around the Azure Cognitive Services Speech SDK that performs
/// one-shot speech recognition on the default microphone.
///
/// Threading model: SDK callbacks and the continuation after
/// <c>ConfigureAwait(false)</c> run on background threads, so they only record
/// pending flags/payloads; <see cref="Update"/> drains those on the Unity main
/// thread via <see cref="doMainThreadTasks"/> and raises the public events there.
/// </summary>
public class SpeechRecognitionService : MonoBehaviour
{

    #region Inspector Properties

    [Header("Config Values")]
    [SerializeField]
    private bool debugModeIsActive;

    // NOTE(review): a subscription key in a serialized field ends up in scenes/builds;
    // consider loading it from secure configuration instead.
    [SerializeField]
    private string speechKey;

    [SerializeField]
    private string speechRegion;

    [SerializeField]
    private string initialRecognitionLanguageCode; // Format: "de-CH"

    #endregion

    #region Public Properties

    #region SpeechRecognitionState

    private ESpeechRecognitionState _speechRecognitionState = ESpeechRecognitionState.StartingUp;

    /// <summary>
    /// Current recognizer state. Assigning a different value raises
    /// <see cref="OnSpeechRecognitionStateChangedEvent"/>; assigning the same
    /// value is a no-op. NOTE: the setter can run on a background thread
    /// (after <c>ConfigureAwait(false)</c>), so subscribers of the state-changed
    /// event must not call Unity APIs directly.
    /// </summary>
    public ESpeechRecognitionState SpeechRecognitionState
    {
        get { return this._speechRecognitionState; }
        set
        {
            if (value != this._speechRecognitionState)
            {
                this.logIfInDebugMode($"SpeechRecognitionState changed, new value= {value}");

                this._speechRecognitionState = value;

                this.OnSpeechRecognitionStateChangedEvent?.Invoke(this, value);
            }
        }
    }

    /// <summary>Raised whenever <see cref="SpeechRecognitionState"/> changes (possibly off the main thread).</summary>
    public event EventHandler<ESpeechRecognitionState> OnSpeechRecognitionStateChangedEvent;

    #endregion

    #endregion

    #region Private Properties

    // Guards state transitions performed from the recognition task's background thread.
    private object threadLocker = new object();

    private SpeechConfig speechConfig;

    // The following flags/payloads are written by SDK callbacks (background threads)
    // and consumed once per frame on the main thread in doMainThreadTasks().
    private bool listeningRestartIsPending;

    private bool invokeUserSpeechInputStartedEventIsPending;

    private string pendingPartialTranscription;

    private string pendingFullTranscription;

    // FIX: was named 'invokeUserSpeechInputEndedEventIsPending' although it is a
    // string payload, not a bool flag — renamed for clarity (private, no callers affected).
    private string pendingUserSpeechInputEndedTranscription;

    #endregion

    #region Framework Functions

    void OnEnable()
    {
        this.initialize(this.initialRecognitionLanguageCode);
    }

    void Update()
    {
        this.doMainThreadTasks();
    }

    void OnDisable()
    {
        // Marks the service disabled; an in-flight RecognizeOnceAsync detects this
        // via the state check after awaiting and bails out.
        this.SpeechRecognitionState = ESpeechRecognitionState.Disabled;
    }

    #endregion

    #region Private Events

    /// <summary>
    /// SDK callback (background thread): speech onset detected.
    /// Only records a pending flag; the event is raised on the main thread.
    /// </summary>
    private void speechRecognizer_SpeechStartDetected(object sender, RecognitionEventArgs e)
    {
        // 'this == null' uses Unity's overloaded null check for destroyed objects.
        if (this == null || this.SpeechRecognitionState != ESpeechRecognitionState.Listening)
        {
            // Listener was stopped before speech start was detected
            return;
        }

        // Set invoke pending flag (to later invoke on main thread)
        this.invokeUserSpeechInputStartedEventIsPending = true;
    }

    /// <summary>
    /// SDK callback (background thread): partial (hypothesis) transcription update.
    /// Only records the payload; the event is raised on the main thread.
    /// </summary>
    private void speechRecognizer_Recognizing(object sender, SpeechRecognitionEventArgs e)
    {
        if (this == null || this.SpeechRecognitionState != ESpeechRecognitionState.Listening)
        {
            // Listener was stopped before this
            return;
        }

        // Save pending partial transcription (to later invoke on main thread)
        this.pendingPartialTranscription = e?.Result?.Text;
    }

    #endregion

    #region Public Events

    /// <summary>Raised (main thread) when the SDK detects the user started speaking.</summary>
    public event EventHandler<bool> OnUserSpeechInputStartedEvent;

    /// <summary>Raised (main thread) with each partial transcription hypothesis.</summary>
    public event EventHandler<string> OnPartialTranscriptionChangedEvent;

    /// <summary>Raised (main thread) with the final transcription of a session.</summary>
    public event EventHandler<string> OnFullTranscriptionChangedEvent;

    /// <summary>Raised (main thread) when a session ends with recognized speech.</summary>
    public event EventHandler<string> OnUserSpeechInputEndedEvent;

    #endregion

    #region Public Functions

    /// <summary>
    /// Runs one recognition pass against the default microphone and publishes the
    /// result through the pending fields drained in <see cref="Update"/>.
    /// 'async void' is deliberate: Unity callers invoke this fire-and-forget
    /// (e.g. from UI callbacks); errors are surfaced via logging, not exceptions.
    /// </summary>
    public async void StartListeningOnceAsync()
    {
        if (this.speechConfig == null)
        {
            Debug.LogError("Speech recognizer is not ready yet. Wait for initialize before calling StartListeningOnceAsync");
            return;
        }

        // FIX: the AudioConfig was previously created but neither passed to the
        // recognizer nor disposed (IDisposable leak). It now participates in the
        // recognizer construction and is disposed deterministically. Behavior is
        // unchanged: a recognizer without an explicit AudioConfig uses the default
        // microphone anyway.
        using (AudioConfig audioConfigInput = AudioConfig.FromDefaultMicrophoneInput())
        using (SpeechRecognizer recognizer = new SpeechRecognizer(this.speechConfig, audioConfigInput))
        {
            lock (this.threadLocker)
            {
                this.SpeechRecognitionState = ESpeechRecognitionState.Listening;
            }

            recognizer.SpeechStartDetected += this.speechRecognizer_SpeechStartDetected;
            recognizer.Recognizing += this.speechRecognizer_Recognizing;

            try
            {
                // After this await we are on a thread-pool thread (no Unity API calls!).
                SpeechRecognitionResult result = await recognizer.RecognizeOnceAsync().ConfigureAwait(false);

                if (this.SpeechRecognitionState != ESpeechRecognitionState.Listening)
                {
                    // Listener was stopped (StopListening/OnDisable) before recognition finished.
                    return;
                }

                // Check result
                string fullTranscription = null;

                if (result.Reason == ResultReason.RecognizedSpeech && !string.IsNullOrEmpty(result.Text))
                {
                    fullTranscription = result.Text;
                    this.pendingUserSpeechInputEndedTranscription = fullTranscription;
                }
                else if (result.Reason == ResultReason.NoMatch)
                {
                    this.logIfInDebugMode("SpeechIntentService NoMatch: Speech could not be recognized.");

                    // Restart must happen on the main thread — flag it for doMainThreadTasks().
                    this.listeningRestartIsPending = true;
                }
                else if (result.Reason == ResultReason.Canceled)
                {
                    var cancellation = CancellationDetails.FromResult(result);
                    this.logIfInDebugMode($"SpeechIntentService Canceled: Reason={cancellation.Reason} ErrorDetails={cancellation.ErrorDetails}");
                }

                lock (this.threadLocker)
                {
                    this.SpeechRecognitionState = ESpeechRecognitionState.Ready;

                    this.pendingFullTranscription = fullTranscription;
                }
            }
            finally
            {
                // FIX: unsubscription was duplicated on two exit paths and skipped
                // entirely if RecognizeOnceAsync threw; 'finally' guarantees it runs
                // exactly once on every path.
                recognizer.SpeechStartDetected -= this.speechRecognizer_SpeechStartDetected;
                recognizer.Recognizing -= this.speechRecognizer_Recognizing;
            }
        }
    }

    /// <summary>
    /// Stops the current listening session by flipping the state back to Ready;
    /// the in-flight recognition task observes this and discards its result.
    /// </summary>
    public void StopListening()
    {
        this.SpeechRecognitionState = ESpeechRecognitionState.Ready;
    }

    /// <summary>
    /// Re-initializes the service with a new recognition language (format "de-CH").
    /// </summary>
    public void SetRecognitionLanguage(string languageCode)
    {
        this.initialize(languageCode);
    }

    #endregion

    #region Private Functions

    /// <summary>
    /// Builds the SpeechConfig and warms up the microphone, then transitions to Ready.
    /// Transitions to Disabled when no microphone device is available.
    /// </summary>
    private void initialize(string languageCode)
    {
        this.SpeechRecognitionState = ESpeechRecognitionState.StartingUp;

        this.speechConfig = SpeechConfig.FromSubscription(this.speechKey, this.speechRegion);
        this.speechConfig.SpeechRecognitionLanguage = languageCode;

        // FIX: previously indexed Microphone.devices[0] unconditionally, which throws
        // IndexOutOfRangeException on machines without a microphone.
        if (Microphone.devices.Length == 0)
        {
            Debug.LogError("SpeechRecognitionService: no microphone device found; speech recognition disabled.");
            this.SpeechRecognitionState = ESpeechRecognitionState.Disabled;
            return;
        }

        // Workaround kept from the original ("Is this hack still necessary?"):
        // opening the Unity microphone once appears to trigger the OS mic-permission
        // prompt / device warm-up needed by the Speech SDK. The returned AudioClip
        // was never used, so it is no longer assigned.
        Microphone.Start(Microphone.devices[0], true, 200, 16000);

        this.logIfInDebugMode($"SpeechRecognitionService initialized using microphone: {Microphone.devices[0]}");

        this.SpeechRecognitionState = ESpeechRecognitionState.Ready;
    }

    /// <summary>
    /// Runs every frame on the Unity main thread: drains the pending flags/payloads
    /// set by background threads and raises the public events there.
    /// </summary>
    private void doMainThreadTasks()
    {
        if (this.listeningRestartIsPending)
        {
            this.StartListeningOnceAsync();
            this.listeningRestartIsPending = false;
        }

        if (this.invokeUserSpeechInputStartedEventIsPending)
        {
            this.OnUserSpeechInputStartedEvent?.Invoke(this, true);
            this.invokeUserSpeechInputStartedEventIsPending = false;
        }

        if (this.pendingPartialTranscription != null)
        {
            this.OnPartialTranscriptionChangedEvent?.Invoke(this, this.pendingPartialTranscription);
            this.pendingPartialTranscription = null;
        }

        if (this.pendingFullTranscription != null)
        {
            this.OnFullTranscriptionChangedEvent?.Invoke(this, this.pendingFullTranscription);
            this.pendingFullTranscription = null;
        }

        if (this.pendingUserSpeechInputEndedTranscription != null)
        {
            this.OnUserSpeechInputEndedEvent?.Invoke(this, this.pendingUserSpeechInputEndedTranscription);
            this.pendingUserSpeechInputEndedTranscription = null;
        }
    }

    /// <summary>Logs to the Unity console only when debug mode is enabled in the inspector.</summary>
    private void logIfInDebugMode(string message)
    {
        if (!this.debugModeIsActive)
        {
            return;
        }

        Debug.Log($"(SpeechRecognitionService) => {message}");
    }

    #endregion

}