UP-Viagg-io/Viagg-io/Assets/afca/ViaggioAI/Scripts/AIServices/SpeechSynthesizerService.cs

314 lines
8.7 KiB
C#
Executable File

using System;
using System.Collections.Generic;
using System.Linq;
using System.Threading.Tasks;
using Microsoft.CognitiveServices.Speech;
using Microsoft.CognitiveServices.Speech.Audio;
using UnityEngine;
#region Enums
/// <summary>
/// Lifecycle states reported by <c>SpeechSynthesizerService.SpeechSynthesizerState</c>.
/// The numeric values are explicit and spaced out; keep them stable because Unity
/// serializes enums by value.
/// </summary>
public enum ESpeechSynthesizerState
{
    /// <summary>Initial state, also set while a synthesizer instance is being (re)created.</summary>
    StartingUp = 0,

    /// <summary>A synthesizer instance exists and no synthesis is running.</summary>
    Ready = 10,

    /// <summary>Text is being synthesized in the <c>PlayDirectly</c> output mode.</summary>
    Speaking = 20,

    /// <summary>Text is being synthesized in the <c>GenerateAudioClip</c> output mode.</summary>
    GeneratingClip = 30,

    /// <summary>No synthesizer instance is available (it has been disposed).</summary>
    Disabled = 90,
}
/// <summary>
/// Selects what <c>SpeechSynthesizerService</c> does with the synthesized audio.
/// </summary>
public enum ESpeechOutputType
{
    /// <summary>The synthesizer is created with the default speaker as audio output.</summary>
    PlayDirectly = 0,

    /// <summary>
    /// The synthesizer is created without an audio output and configured for
    /// RIFF 24 kHz 16-bit mono PCM, so the audio is only available in the synthesis result.
    /// </summary>
    GenerateAudioClip = 10,
}
#endregion
/// <summary>
/// Unity component that synthesizes speech from text through the Azure Cognitive Services
/// Speech SDK. Depending on the configured <see cref="ESpeechOutputType"/> the audio is either
/// played on the default speaker or only produced in memory (turning it into an
/// <see cref="AudioClip"/> is still a TODO, see <see cref="Synthesize"/>).
/// The synthesizer is created lazily on the first <see cref="Synthesize"/> call and recreated
/// whenever the requested language or voice differs from the active one.
/// </summary>
public class SpeechSynthesizerService : MonoBehaviour
{
    #region Inspector Properties
    [Header("Config Values")]
    // When true, logIfInDebugMode writes diagnostic messages to the Unity console.
    [SerializeField]
    private bool debugModeIsActive;

    // Decides whether audio is played directly or only generated in memory.
    [SerializeField]
    private ESpeechOutputType speechOutputType;

    // Subscription key handed to SpeechConfig.FromSubscription.
    // NOTE(review): this value is serialized with the component - confirm it is not committed to source control.
    [SerializeField]
    private string speechKey;

    // Service region handed to SpeechConfig.FromSubscription.
    [SerializeField]
    private string speechRegion;

    // Currently not read anywhere in this class.
    [SerializeField]
    [Tooltip("Format: 'de-CH'")]
    private string initialSynthesisLanguage;

    // Per-language default voices, looked up when Synthesize is called without a voice override.
    [SerializeField]
    [Tooltip("Voice gallery: https://speech.microsoft.com/portal/voicegallery")]
    private List<LanguageVoice> defaultSynthesisVoices;

    // Voice used when no usable language-specific default voice is configured.
    [SerializeField]
    private string fallbackMultilingualSynthesisVoiceName;
    #endregion

    #region Public Properties
    #region SpeechSynthesizerState
    private ESpeechSynthesizerState _speechSynthesizerState = ESpeechSynthesizerState.StartingUp;

    /// <summary>
    /// Current lifecycle state. Assigning a different value logs the change (debug mode only)
    /// and raises <see cref="OnSpeechSynthesizerStateChangedEvent"/>; assigning the same value is a no-op.
    /// </summary>
    public ESpeechSynthesizerState SpeechSynthesizerState
    {
        get { return this._speechSynthesizerState; }
        set
        {
            if (value != this._speechSynthesizerState)
            {
                this.logIfInDebugMode($"SpeechSynthesizerState changed, new value= {value}");
                this._speechSynthesizerState = value;
                this.OnSpeechSynthesizerStateChangedEvent?.Invoke(this, value);
            }
        }
    }

    /// <summary>Raised with the new state whenever <see cref="SpeechSynthesizerState"/> changes.</summary>
    public event EventHandler<ESpeechSynthesizerState> OnSpeechSynthesizerStateChangedEvent;
    #endregion

    #region GeneratedAudioClip
    private AudioClip _generatedAudioClip = null;

    /// <summary>
    /// Last generated clip. Assigning a different value logs the change (debug mode only)
    /// and raises <see cref="OnGeneratedAudioClipChangedEvent"/>.
    /// Nothing in this class assigns it yet (the clip conversion in <see cref="Synthesize"/> is a TODO).
    /// </summary>
    public AudioClip GeneratedAudioClip
    {
        get { return this._generatedAudioClip; }
        set
        {
            if (value != this._generatedAudioClip)
            {
                this.logIfInDebugMode($"GeneratedAudioClip changed, new value= {value}");
                this._generatedAudioClip = value;
                this.OnGeneratedAudioClipChangedEvent?.Invoke(this, value);
            }
        }
    }

    /// <summary>Raised with the new clip whenever <see cref="GeneratedAudioClip"/> changes.</summary>
    public event EventHandler<AudioClip> OnGeneratedAudioClipChangedEvent;
    #endregion
    #endregion

    #region Private Properties
    private SpeechSynthesizer speechSynthesizer;
    private SpeechConfig speechConfig;

    // Speaker output handed to the synthesizer in PlayDirectly mode; kept so it can be disposed with it.
    private AudioConfig audioConfig;

    // Language and voice the current synthesizer instance was created with (null when there is none).
    private string activeLanguage = null;
    private string activeVoice = null;
    #endregion

    #region Framework Functions
    /// <summary>
    /// Unity message: stops and disposes the synthesizer when the component gets disabled.
    /// </summary>
    async void OnDisable()
    {
        Debug.Log("SpeechSynthesizerService disposing speechSynthesizer from OnDisable");
        try
        {
            await this.disposeSynthesizer();
        }
        catch (Exception ex)
        {
            // async void: nobody can observe this task, so report the failure here.
            Debug.LogException(ex, this);
        }
    }
    #endregion

    #region Private Events
    #endregion

    #region Public Events
    /// <summary>Raised (with <c>true</c>) right before synthesis starts in PlayDirectly mode.</summary>
    public event EventHandler<bool> OnSpeechOutputStartedEvent;

    /// <summary>
    /// Raised (with <c>true</c>) once a PlayDirectly synthesis that raised
    /// <see cref="OnSpeechOutputStartedEvent"/> has finished, also when it failed.
    /// </summary>
    public event EventHandler<bool> OnSpeechOutputEndedEvent;
    #endregion

    #region Public Functions
    /// <summary>
    /// Synthesizes <paramref name="text"/>. Fire-and-forget: progress is reported through
    /// <see cref="SpeechSynthesizerState"/> and the speech output events, failures are logged
    /// to the Unity console instead of being thrown.
    /// </summary>
    /// <param name="text">Text passed to the synthesizer.</param>
    /// <param name="languageCode">Synthesis language; a change recreates the synthesizer.</param>
    /// <param name="voiceNameOverride">
    /// Optional voice name. When null, the active voice is kept if the language is unchanged,
    /// otherwise the configured default voice for the language is used.
    /// </param>
    public async void Synthesize(string text, string languageCode, string voiceNameOverride = null)
    {
        bool outputStartedRaised = false;
        try
        {
            if (this.SpeechSynthesizerState == ESpeechSynthesizerState.Speaking && this.speechSynthesizer != null)
            {
                await this.speechSynthesizer.StopSpeakingAsync();
            }

            await this.reInitializeIfNecessary(languageCode, voiceNameOverride);

            // Work on a local reference: the field can be cleared by a disposal that runs while we await.
            SpeechSynthesizer synthesizer = this.speechSynthesizer;
            if (synthesizer == null)
            {
                Debug.LogError("(SpeechSynthesizerService) => Synthesize aborted, no synthesizer available");
                return;
            }

            if (this.speechOutputType == ESpeechOutputType.GenerateAudioClip)
            {
                this.SpeechSynthesizerState = ESpeechSynthesizerState.GeneratingClip;
            }
            else if (this.speechOutputType == ESpeechOutputType.PlayDirectly)
            {
                this.SpeechSynthesizerState = ESpeechSynthesizerState.Speaking;
                // Flag first so the ended event is still raised if a started-handler throws.
                outputStartedRaised = true;
                this.OnSpeechOutputStartedEvent?.Invoke(this, true);
            }

            // The result wraps a native handle and must be disposed.
            using (SpeechSynthesisResult synthesisResult = await synthesizer.SpeakTextAsync(text))
            {
                this.reportCanceledSynthesis(synthesisResult);

                if (this.speechOutputType == ESpeechOutputType.GenerateAudioClip)
                {
                    // Todo add WavUtility
                    // this.GeneratedAudioClip = WavUtility.ToAudioClip(synthesisResult.AudioData);
                }
            }
        }
        catch (Exception ex)
        {
            // async void: an escaping exception could not be handled by the caller anyway.
            Debug.LogException(ex, this);
        }
        finally
        {
            try
            {
                if (outputStartedRaised)
                {
                    this.OnSpeechOutputEndedEvent?.Invoke(this, true);
                }
            }
            finally
            {
                // Always leave the busy states; disposeSynthesizer waits for "Speaking" to end.
                this.SpeechSynthesizerState = this.speechSynthesizer != null
                    ? ESpeechSynthesizerState.Ready
                    : ESpeechSynthesizerState.Disabled;
            }
        }
    }
    #endregion

    #region Private Functions
    /// <summary>
    /// (Re)creates the synthesizer for the given language and voice, disposing a previous instance first.
    /// On failure all partially created resources are released, the state becomes
    /// <see cref="ESpeechSynthesizerState.Disabled"/> and the exception is rethrown.
    /// </summary>
    private async Task initialize(string languageCode, string voiceName)
    {
        this.logIfInDebugMode($"SpeechSynthesizerService initializing using language={languageCode} and voice={voiceName}");
        if (this.speechSynthesizer != null)
        {
            Debug.Log("SpeechSynthesizerService disposing speechSynthesizer from initialize");
            await this.disposeSynthesizer();
        }

        this.SpeechSynthesizerState = ESpeechSynthesizerState.StartingUp;
        try
        {
            this.speechConfig = SpeechConfig.FromSubscription(this.speechKey, this.speechRegion);
            this.speechConfig.SpeechSynthesisLanguage = languageCode;
            this.speechConfig.SpeechSynthesisVoiceName = voiceName;
            if (this.speechOutputType == ESpeechOutputType.GenerateAudioClip)
            {
                this.speechConfig.SetSpeechSynthesisOutputFormat(SpeechSynthesisOutputFormat.Riff24Khz16BitMonoPcm);
            }

            // Only open the speaker output when it is actually used; a null audio config
            // makes the synthesizer deliver the audio in the result only.
            this.audioConfig = this.speechOutputType == ESpeechOutputType.PlayDirectly
                ? AudioConfig.FromDefaultSpeakerOutput()
                : null;
            this.speechSynthesizer = new SpeechSynthesizer(this.speechConfig, this.audioConfig);
        }
        catch
        {
            this.releaseSynthesizerResources();
            this.SpeechSynthesizerState = ESpeechSynthesizerState.Disabled;
            throw;
        }

        this.activeLanguage = languageCode;
        this.activeVoice = voiceName;
        this.logIfInDebugMode($"SpeechSynthesizerService initialized using language={languageCode} and voice={voiceName}");
        this.SpeechSynthesizerState = ESpeechSynthesizerState.Ready;
    }

    /// <summary>
    /// Returns the voice name of the first configured entry matching <paramref name="language"/>,
    /// or the fallback multilingual voice when there is no entry or its voice name is empty.
    /// </summary>
    private string getDefaultVoiceForLanguage(string language)
    {
        if (this.defaultSynthesisVoices == null)
        {
            return this.fallbackMultilingualSynthesisVoiceName;
        }

        string languageSpecificVoiceName = this.defaultSynthesisVoices
            .Where(lv => lv.LanguageCode == language)
            .Select(lv => lv.VoiceName)
            .FirstOrDefault();

        return string.IsNullOrEmpty(languageSpecificVoiceName)
            ? this.fallbackMultilingualSynthesisVoiceName
            : languageSpecificVoiceName;
    }

    /// <summary>
    /// Recreates the synthesizer when none exists yet, when the language changed, or when an
    /// explicit voice override differs from the active voice. Otherwise does nothing.
    /// </summary>
    private async Task reInitializeIfNecessary(string languageCode, string voiceNameOverride)
    {
        bool synthesizerMissing = this.speechSynthesizer == null;
        bool languageChanged = languageCode != this.activeLanguage;
        // Compare against the active *voice*; comparing against the language would force a
        // re-initialization on every call that passes an override.
        bool voiceChanged = voiceNameOverride != null && voiceNameOverride != this.activeVoice;

        if (!synthesizerMissing && !languageChanged && !voiceChanged)
        {
            return;
        }

        // No specific voice defined -> get default voice for this language
        string voiceName = voiceNameOverride ?? this.getDefaultVoiceForLanguage(languageCode);
        await this.initialize(languageCode, voiceName);
    }

    /// <summary>
    /// Logs a synthesis result whose reason is <c>Canceled</c>: as an error when the
    /// cancellation was caused by an error, otherwise as a debug-mode message.
    /// </summary>
    private void reportCanceledSynthesis(SpeechSynthesisResult synthesisResult)
    {
        if (synthesisResult == null || synthesisResult.Reason != ResultReason.Canceled)
        {
            return;
        }

        SpeechSynthesisCancellationDetails cancellation = SpeechSynthesisCancellationDetails.FromResult(synthesisResult);
        if (cancellation.Reason == CancellationReason.Error)
        {
            Debug.LogError($"(SpeechSynthesizerService) => Synthesis failed, errorCode={cancellation.ErrorCode}, details={cancellation.ErrorDetails}");
        }
        else
        {
            this.logIfInDebugMode($"Synthesis canceled, reason={cancellation.Reason}");
        }
    }

    /// <summary>Writes <paramref name="message"/> to the Unity console when debug mode is active.</summary>
    private void logIfInDebugMode(string message)
    {
        if (!this.debugModeIsActive)
        {
            return;
        }
        Debug.Log($"(SpeechSynthesizerService) => {message}");
    }

    /// <summary>
    /// Stops a running synthesis, waits until the state has left
    /// <see cref="ESpeechSynthesizerState.Speaking"/>, then releases the synthesizer and
    /// switches to <see cref="ESpeechSynthesizerState.Disabled"/>. No-op without a synthesizer.
    /// </summary>
    private async Task disposeSynthesizer()
    {
        if (this.speechSynthesizer == null)
        {
            return;
        }

        // Make sure speaking isn't active anymore, otherwise dispose will throw exception
        await this.speechSynthesizer.StopSpeakingAsync();
        while (this.SpeechSynthesizerState == ESpeechSynthesizerState.Speaking)
        {
            await Task.Delay(10);
        }

        this.releaseSynthesizerResources();
        this.SpeechSynthesizerState = ESpeechSynthesizerState.Disabled;
    }

    /// <summary>
    /// Disposes the synthesizer and its audio output (if any) and clears the active
    /// language/voice so the next <see cref="Synthesize"/> call re-initializes.
    /// Safe to call when nothing has been created.
    /// </summary>
    private void releaseSynthesizerResources()
    {
        this.activeLanguage = null;
        this.activeVoice = null;
        this.speechSynthesizer?.Dispose();
        this.speechSynthesizer = null;
        this.audioConfig?.Dispose();
        this.audioConfig = null;
    }
    #endregion
}