314 lines
8.7 KiB
C#
314 lines
8.7 KiB
C#
|
using System;
|
||
|
using System.Collections.Generic;
|
||
|
using System.Linq;
|
||
|
using System.Threading.Tasks;
|
||
|
using Microsoft.CognitiveServices.Speech;
|
||
|
using Microsoft.CognitiveServices.Speech.Audio;
|
||
|
using UnityEngine;
|
||
|
|
||
|
#region Enums
|
||
|
|
||
|
/// <summary>
/// Lifecycle states reported by SpeechSynthesizerService.SpeechSynthesizerState.
/// </summary>
public enum ESpeechSynthesizerState
{
    /// <summary>Initial value; also set while a synthesizer is being (re)created.</summary>
    StartingUp = 0,

    /// <summary>Set once a synthesizer has been created and after a synthesis call has finished.</summary>
    Ready = 10,

    /// <summary>Set while text is spoken through the default speaker output.</summary>
    Speaking = 20,

    /// <summary>Set while text is synthesized for the audio clip output type.</summary>
    GeneratingClip = 30,

    /// <summary>Set after the synthesizer has been disposed.</summary>
    Disabled = 90
}
|
||
|
|
||
|
/// <summary>
/// Selects how SpeechSynthesizerService delivers synthesized speech.
/// </summary>
public enum ESpeechOutputType
{
    /// <summary>The synthesizer is created with the default speaker output and plays the audio itself.</summary>
    PlayDirectly = 0,

    /// <summary>The synthesizer is created without an audio output and is configured for RIFF 24 kHz 16-bit mono PCM.</summary>
    GenerateAudioClip = 10
}
|
||
|
|
||
|
#endregion
|
||
|
|
||
|
public class SpeechSynthesizerService : MonoBehaviour
|
||
|
{
|
||
|
|
||
|
#region Inspector Properties
|
||
|
|
||
|
// When true, logIfInDebugMode forwards its messages to Debug.Log.
[Header("Config Values"), SerializeField]
private bool debugModeIsActive;

// Decides whether the synthesizer plays through the default speaker or is created without an audio output.
[SerializeField]
private ESpeechOutputType speechOutputType;

// Subscription key handed to SpeechConfig.FromSubscription.
// NOTE(review): this value is serialized with the component - confirm the secret is not committed to source control.
[SerializeField]
private string speechKey;

// Service region handed to SpeechConfig.FromSubscription.
[SerializeField]
private string speechRegion;

// NOTE(review): not read anywhere in the visible part of this class - confirm whether it is still needed.
[SerializeField, Tooltip("Format: 'de-CH'")]
private string initialSynthesisLanguage;

// Per-language default voices; searched by getDefaultVoiceForLanguage.
[SerializeField, Tooltip("Voice gallery: https://speech.microsoft.com/portal/voicegallery")]
private List<LanguageVoice> defaultSynthesisVoices;

// Voice used when defaultSynthesisVoices yields no non-empty voice name for a language.
[SerializeField]
private string fallbackMultilingualSynthesisVoiceName;
|
||
|
|
||
|
#endregion
|
||
|
|
||
|
#region Public Properties
|
||
|
|
||
|
#region SpeechSynthesizerState
|
||
|
|
||
|
// Backing field for SpeechSynthesizerState.
private ESpeechSynthesizerState _speechSynthesizerState = ESpeechSynthesizerState.StartingUp;

/// <summary>
/// Current lifecycle state of the service. Assigning a different value logs the change (debug mode only),
/// stores it and then raises <see cref="OnSpeechSynthesizerStateChangedEvent"/>.
/// Assigning the value that is already stored does nothing.
/// </summary>
public ESpeechSynthesizerState SpeechSynthesizerState
{
    get => this._speechSynthesizerState;
    set
    {
        if (value == this._speechSynthesizerState)
        {
            return;
        }

        this.logIfInDebugMode($"SpeechSynthesizerState changed, new value= {value}");
        this._speechSynthesizerState = value;
        this.OnSpeechSynthesizerStateChangedEvent?.Invoke(this, value);
    }
}

/// <summary>Raised with the new state after <see cref="SpeechSynthesizerState"/> has changed.</summary>
public event EventHandler<ESpeechSynthesizerState> OnSpeechSynthesizerStateChangedEvent;
|
||
|
|
||
|
#endregion
|
||
|
|
||
|
#region GeneratedAudioClip
|
||
|
|
||
|
// Backing field for GeneratedAudioClip.
private AudioClip _generatedAudioClip = null;

/// <summary>
/// Most recently published audio clip. Assigning a different clip logs the change (debug mode only),
/// stores it and then raises <see cref="OnGeneratedAudioClipChangedEvent"/>.
/// Assigning the clip that is already stored does nothing.
/// </summary>
public AudioClip GeneratedAudioClip
{
    get => this._generatedAudioClip;
    set
    {
        if (value == this._generatedAudioClip)
        {
            return;
        }

        this.logIfInDebugMode($"GeneratedAudioClip changed, new value= {value}");
        this._generatedAudioClip = value;
        this.OnGeneratedAudioClipChangedEvent?.Invoke(this, value);
    }
}

/// <summary>Raised with the new clip after <see cref="GeneratedAudioClip"/> has changed.</summary>
public event EventHandler<AudioClip> OnGeneratedAudioClipChangedEvent;
|
||
|
|
||
|
#endregion
|
||
|
|
||
|
#endregion
|
||
|
|
||
|
#region Private Properties
|
||
|
|
||
|
// Speech SDK synthesizer currently in use; created in initialize and reset to null in disposeSynthesizer.
private SpeechSynthesizer speechSynthesizer;

// Configuration the current synthesizer was created from (built in initialize).
private SpeechConfig speechConfig;

// Language code and voice name the current synthesizer was initialized with.
// Both start as null (the field default) and are reset to null in disposeSynthesizer.
private string activeLanguage;
private string activeVoice;
|
||
|
|
||
|
#endregion
|
||
|
|
||
|
#region Framework Functions
|
||
|
|
||
|
/// <summary>
/// Unity message: stops and disposes the synthesizer when the component gets disabled.
/// </summary>
private async void OnDisable()
{
    Debug.Log("SpeechSynthesizerService disposing speechSynthesizer from OnDisable");

    Task disposal = this.disposeSynthesizer();
    await disposal;
}
|
||
|
|
||
|
#endregion
|
||
|
|
||
|
#region Private Events
|
||
|
|
||
|
#endregion
|
||
|
|
||
|
#region Public Events
|
||
|
|
||
|
/// <summary>
/// Raised by Synthesize for the PlayDirectly output type, right after the state switched to Speaking
/// and before the text is handed to the synthesizer. This class always passes <c>true</c> as payload.
/// </summary>
public event EventHandler<bool> OnSpeechOutputStartedEvent;
|
||
|
|
||
|
/// <summary>
/// Raised by Synthesize for the PlayDirectly output type once the SpeakTextAsync call has returned,
/// just before the state goes back to Ready. This class always passes <c>true</c> as payload.
/// </summary>
public event EventHandler<bool> OnSpeechOutputEndedEvent;
|
||
|
|
||
|
#endregion
|
||
|
|
||
|
#region Public Functions
|
||
|
|
||
|
/// <summary>
/// Synthesizes <paramref name="text"/> with the configured output type.
/// An utterance that is still being spoken is stopped first, and the synthesizer is re-created
/// when the requested language or voice differs from the active one.
/// </summary>
/// <param name="text">Plain text to synthesize.</param>
/// <param name="languageCode">Synthesis language, e.g. 'de-CH'.</param>
/// <param name="voiceNameOverride">Explicit voice name; when null the default voice for the language is used.</param>
/// <remarks>
/// The method is fire-and-forget (async void), so failures cannot reach the caller. They are logged
/// instead, and the state plus the ended event are always settled so listeners never stay stuck
/// in Speaking / GeneratingClip.
/// </remarks>
public async void Synthesize(string text, string languageCode, string voiceNameOverride = null)
{
    // Remembers whether the started event went out so that a matching ended event is always sent.
    bool outputStartedRaised = false;

    try
    {
        if (this.SpeechSynthesizerState == ESpeechSynthesizerState.Speaking)
        {
            // Interrupt the utterance that is still playing before starting the next one.
            await this.speechSynthesizer.StopSpeakingAsync();
        }

        await this.reInitializeIfNecessary(languageCode, voiceNameOverride);

        if (this.speechOutputType == ESpeechOutputType.GenerateAudioClip)
        {
            this.SpeechSynthesizerState = ESpeechSynthesizerState.GeneratingClip;
        }
        else if (this.speechOutputType == ESpeechOutputType.PlayDirectly)
        {
            this.SpeechSynthesizerState = ESpeechSynthesizerState.Speaking;

            outputStartedRaised = true;
            this.OnSpeechOutputStartedEvent?.Invoke(this, true);
        }

        // The result wraps a native handle, so release it as soon as it has been inspected.
        using (SpeechSynthesisResult synthesisResult = await this.speechSynthesizer.SpeakTextAsync(text))
        {
            if (synthesisResult.Reason == ResultReason.Canceled)
            {
                SpeechSynthesisCancellationDetails cancellation = SpeechSynthesisCancellationDetails.FromResult(synthesisResult);

                // A user-initiated stop is expected; only real service or configuration errors are reported.
                if (cancellation.Reason == CancellationReason.Error)
                {
                    Debug.LogError($"SpeechSynthesizerService synthesis failed: {cancellation.ErrorCode} - {cancellation.ErrorDetails}");
                }
            }
            else if (this.speechOutputType == ESpeechOutputType.GenerateAudioClip)
            {
                // TODO: add WavUtility and publish the clip:
                // this.GeneratedAudioClip = WavUtility.ToAudioClip(synthesisResult.AudioData);
            }
        }
    }
    catch (Exception exception)
    {
        // async void: nobody can observe a thrown exception, so report it here instead of losing it.
        Debug.LogError($"SpeechSynthesizerService synthesis threw an exception: {exception}");
    }
    finally
    {
        if (outputStartedRaised)
        {
            // Tell listeners that playback is over, whether it completed, was interrupted or failed.
            this.OnSpeechOutputEndedEvent?.Invoke(this, true);
        }

        // Only report Ready while a synthesizer actually exists; otherwise it was disposed or never created.
        this.SpeechSynthesizerState = this.speechSynthesizer != null
            ? ESpeechSynthesizerState.Ready
            : ESpeechSynthesizerState.Disabled;
    }
}
|
||
|
|
||
|
#endregion
|
||
|
|
||
|
#region Private Functions
|
||
|
|
||
|
/// <summary>
/// (Re)creates the speech synthesizer for the given language and voice.
/// An existing synthesizer is disposed first. The state is StartingUp while the new one is built
/// and Ready once it exists.
/// </summary>
/// <param name="languageCode">Synthesis language assigned to the speech configuration.</param>
/// <param name="voiceName">Synthesis voice assigned to the speech configuration.</param>
private async Task initialize(string languageCode, string voiceName)
{
    this.logIfInDebugMode($"SpeechSynthesizerService initializing using language={languageCode} and voice={voiceName}");

    if (this.speechSynthesizer != null)
    {
        Debug.Log("SpeechSynthesizerService disposing speechSynthesizer from initialize");
        await this.disposeSynthesizer();
    }

    this.SpeechSynthesizerState = ESpeechSynthesizerState.StartingUp;

    this.speechConfig = SpeechConfig.FromSubscription(this.speechKey, this.speechRegion);
    this.speechConfig.SpeechSynthesisLanguage = languageCode;
    this.speechConfig.SpeechSynthesisVoiceName = voiceName;

    if (this.speechOutputType == ESpeechOutputType.GenerateAudioClip)
    {
        this.speechConfig.SetSpeechSynthesisOutputFormat(SpeechSynthesisOutputFormat.Riff24Khz16BitMonoPcm);
    }

    // Only open the default speaker when audio is really played directly. Every other output type
    // gets a synthesizer without an audio output, so no unused speaker configuration is allocated
    // and the synthesizer is never left null while the state reports Ready.
    AudioConfig audioOutput = this.speechOutputType == ESpeechOutputType.PlayDirectly
        ? AudioConfig.FromDefaultSpeakerOutput()
        : null;

    this.speechSynthesizer = new SpeechSynthesizer(this.speechConfig, audioOutput);

    this.activeLanguage = languageCode;
    this.activeVoice = voiceName;

    this.logIfInDebugMode($"SpeechSynthesizerService initialized using language={languageCode} and voice={voiceName}");

    this.SpeechSynthesizerState = ESpeechSynthesizerState.Ready;
}
|
||
|
|
||
|
/// <summary>
/// Looks up the voice name for <paramref name="language"/> in the first matching entry of defaultSynthesisVoices.
/// </summary>
/// <param name="language">Language code compared against each entry's LanguageCode.</param>
/// <returns>
/// The voice name of the first matching entry, or the fallback multilingual voice name when
/// there is no match or the matching entry has no voice name.
/// </returns>
private string getDefaultVoiceForLanguage(string language)
{
    LanguageVoice? firstMatch = this.defaultSynthesisVoices.FirstOrDefault(entry => entry.LanguageCode == language);
    string matchedVoiceName = firstMatch?.VoiceName;

    return string.IsNullOrEmpty(matchedVoiceName)
        ? this.fallbackMultilingualSynthesisVoiceName
        : matchedVoiceName;
}
|
||
|
|
||
|
/// <summary>
/// Re-creates the synthesizer when none exists yet, when the language differs from the active one,
/// or when an explicit voice is requested that differs from the active voice.
/// </summary>
/// <param name="languageCode">Requested synthesis language.</param>
/// <param name="voiceNameOverride">
/// Requested voice, or null for "no preference". With an unchanged language a null override keeps
/// whatever voice is currently active.
/// </param>
private async Task reInitializeIfNecessary(string languageCode, string voiceNameOverride)
{
    bool synthesizerMissing = this.speechSynthesizer == null;
    bool languageChanged = languageCode != this.activeLanguage;

    // Compare against the active VOICE. Comparing against the active language (as before) made
    // every call with an explicit voice tear down and rebuild the synthesizer.
    bool voiceChanged = voiceNameOverride != null && voiceNameOverride != this.activeVoice;

    if (!synthesizerMissing && !languageChanged && !voiceChanged)
    {
        return;
    }

    // No specific voice requested -> use the default voice for this language.
    string voiceName = voiceNameOverride ?? this.getDefaultVoiceForLanguage(languageCode);

    await this.initialize(languageCode, voiceName);
}
|
||
|
|
||
|
/// <summary>
/// Writes <paramref name="message"/> to the Unity log with a service prefix, but only while debug mode is active.
/// </summary>
/// <param name="message">Text to log.</param>
private void logIfInDebugMode(string message)
{
    if (this.debugModeIsActive)
    {
        Debug.Log($"(SpeechSynthesizerService) => {message}");
    }
}
|
||
|
|
||
|
/// <summary>
/// Stops any running speech output, disposes the synthesizer and switches the state to Disabled.
/// Does nothing when no synthesizer exists.
/// </summary>
private async Task disposeSynthesizer()
{
    const int pollIntervalMilliseconds = 10;
    const int maxWaitMilliseconds = 2000;

    // Work on a local reference: the field may change while this method is suspended at an await.
    SpeechSynthesizer synthesizer = this.speechSynthesizer;

    if (synthesizer == null)
    {
        return;
    }

    // Make sure speaking isn't active anymore, otherwise dispose will throw exception
    await synthesizer.StopSpeakingAsync();

    // Give the pending synthesis call time to leave the Speaking state. The wait is bounded so a
    // state that never gets reset cannot keep this method polling forever.
    int waitedMilliseconds = 0;

    while (this.SpeechSynthesizerState == ESpeechSynthesizerState.Speaking && waitedMilliseconds < maxWaitMilliseconds)
    {
        await Task.Delay(pollIntervalMilliseconds);
        waitedMilliseconds += pollIntervalMilliseconds;
    }

    if (this.SpeechSynthesizerState == ESpeechSynthesizerState.Speaking)
    {
        Debug.LogWarning("SpeechSynthesizerService still reports Speaking after waiting for speech output to stop; disposing anyway");
    }

    if (!object.ReferenceEquals(this.speechSynthesizer, synthesizer))
    {
        // An overlapping call already disposed this synthesizer while we were waiting.
        return;
    }

    this.activeLanguage = null;
    this.activeVoice = null;

    synthesizer.Dispose();
    this.speechSynthesizer = null;

    this.SpeechSynthesizerState = ESpeechSynthesizerState.Disabled;
}
|
||
|
|
||
|
#endregion
|
||
|
|
||
|
}
|