using System;
using System.Collections.Generic;
using System.Linq;
using System.Threading.Tasks;
using Microsoft.CognitiveServices.Speech;
using Microsoft.CognitiveServices.Speech.Audio;
using UnityEngine;

#region Enums

/// <summary>
/// Lifecycle states reported by <see cref="SpeechSynthesizerService"/>.
/// </summary>
public enum ESpeechSynthesizerState
{
    StartingUp = 0,
    Ready = 10,
    Speaking = 20,
    GeneratingClip = 30,
    Disabled = 90,
}

/// <summary>
/// Selects whether synthesized speech is played on the default speaker or
/// returned in memory so it can be turned into an <see cref="AudioClip"/>.
/// </summary>
public enum ESpeechOutputType
{
    PlayDirectly = 0,
    GenerateAudioClip = 10,
}

#endregion

/// <summary>
/// Unity component that wraps the Azure Speech SDK <see cref="SpeechSynthesizer"/>.
/// The synthesizer is created lazily on the first <see cref="Synthesize"/> call and is
/// re-created whenever the requested language or voice differs from the active one.
/// </summary>
public class SpeechSynthesizerService : MonoBehaviour
{
    #region Inspector Properties

    [Header("Config Values")]
    [SerializeField]
    private bool debugModeIsActive;

    [SerializeField]
    private ESpeechOutputType speechOutputType;

    [SerializeField]
    private string speechKey;

    [SerializeField]
    private string speechRegion;

    // NOTE(review): this value is not read anywhere in this class - confirm whether it is still needed.
    [SerializeField]
    [Tooltip("Format: 'de-CH'")]
    private string initialSynthesisLanguage;

    [SerializeField]
    [Tooltip("Voice gallery: https://speech.microsoft.com/portal/voicegallery")]
    private List<LanguageVoice> defaultSynthesisVoices;

    [SerializeField]
    private string fallbackMultilingualSynthesisVoiceName;

    #endregion

    #region Public Properties

    #region SpeechSynthesizerState

    private ESpeechSynthesizerState _speechSynthesizerState = ESpeechSynthesizerState.StartingUp;

    /// <summary>
    /// Current state of the service. Assigning a different value logs the change (debug mode only)
    /// and raises <see cref="OnSpeechSynthesizerStateChangedEvent"/>.
    /// </summary>
    public ESpeechSynthesizerState SpeechSynthesizerState
    {
        get
        {
            return this._speechSynthesizerState;
        }

        set
        {
            if (value != this._speechSynthesizerState)
            {
                this.logIfInDebugMode($"SpeechSynthesizerState changed, new value= {value}");
                this._speechSynthesizerState = value;
                this.OnSpeechSynthesizerStateChangedEvent?.Invoke(this, value);
            }
        }
    }

    /// <summary>Raised with the new state whenever <see cref="SpeechSynthesizerState"/> changes.</summary>
    public event EventHandler<ESpeechSynthesizerState> OnSpeechSynthesizerStateChangedEvent;

    #endregion

    #region GeneratedAudioClip

    private AudioClip _generatedAudioClip = null;

    /// <summary>
    /// Most recently generated clip. Assigning a different value logs the change (debug mode only)
    /// and raises <see cref="OnGeneratedAudioClipChangedEvent"/>.
    /// </summary>
    public AudioClip GeneratedAudioClip
    {
        get
        {
            return this._generatedAudioClip;
        }

        set
        {
            if (value != this._generatedAudioClip)
            {
                this.logIfInDebugMode($"GeneratedAudioClip changed, new value= {value}");
                this._generatedAudioClip = value;
                this.OnGeneratedAudioClipChangedEvent?.Invoke(this, value);
            }
        }
    }

    /// <summary>Raised with the new clip whenever <see cref="GeneratedAudioClip"/> changes.</summary>
    public event EventHandler<AudioClip> OnGeneratedAudioClipChangedEvent;

    #endregion

    #endregion

    #region Private Properties

    private SpeechSynthesizer speechSynthesizer;
    private SpeechConfig speechConfig;

    // Speaker output handed to the synthesizer in PlayDirectly mode; kept so it can be disposed with it.
    private AudioConfig audioOutputConfig;

    private string activeLanguage = null;
    private string activeVoice = null;

    #endregion

    #region Framework Functions

    /// <summary>
    /// Unity message: releases the synthesizer when the component is disabled.
    /// </summary>
    async void OnDisable()
    {
        Debug.Log("SpeechSynthesizerService disposing speechSynthesizer from OnDisable");

        try
        {
            await this.disposeSynthesizer();
        }
        catch (Exception exception)
        {
            // async void: nothing can await this method, so report the failure here.
            Debug.LogException(exception, this);
        }
    }

    #endregion

    #region Private Events
    #endregion

    #region Public Events

    /// <summary>Raised right before direct playback of a synthesis request starts.</summary>
    public event EventHandler<bool> OnSpeechOutputStartedEvent;

    /// <summary>Raised after a direct playback request has finished, was stopped, or failed.</summary>
    public event EventHandler<bool> OnSpeechOutputEndedEvent;

    #endregion

    #region Public Functions

    /// <summary>
    /// Synthesizes <paramref name="text"/>. If the service is currently speaking, the running
    /// output is stopped first. Depending on the configured output type the speech is either
    /// played directly or converted into <see cref="GeneratedAudioClip"/>.
    /// </summary>
    /// <param name="text">Plain text to synthesize.</param>
    /// <param name="languageCode">Synthesis language, e.g. 'de-CH'.</param>
    /// <param name="voiceNameOverride">
    /// Optional voice name. When null, the default voice configured for the language
    /// (or the multilingual fallback voice) is used on (re-)initialization.
    /// </param>
    /// <remarks>
    /// Fire-and-forget: failures are logged instead of thrown, and the state is always
    /// reset afterwards so the service does not get stuck in Speaking / GeneratingClip.
    /// </remarks>
    public async void Synthesize(string text, string languageCode, string voiceNameOverride = null)
    {
        bool outputStartedRaised = false;

        try
        {
            if (this.SpeechSynthesizerState == ESpeechSynthesizerState.Speaking && this.speechSynthesizer != null)
            {
                await this.speechSynthesizer.StopSpeakingAsync();
            }

            await this.reInitializeIfNecessary(languageCode, voiceNameOverride);

            if (this.speechOutputType == ESpeechOutputType.GenerateAudioClip)
            {
                this.SpeechSynthesizerState = ESpeechSynthesizerState.GeneratingClip;
            }
            else if (this.speechOutputType == ESpeechOutputType.PlayDirectly)
            {
                this.SpeechSynthesizerState = ESpeechSynthesizerState.Speaking;
                outputStartedRaised = true;
                this.OnSpeechOutputStartedEvent?.Invoke(this, true);
            }

            using (SpeechSynthesisResult synthesisResult = await this.speechSynthesizer.SpeakTextAsync(text))
            {
                if (synthesisResult.Reason == ResultReason.Canceled)
                {
                    this.reportCanceledSynthesis(synthesisResult);
                }
                else if (this.speechOutputType == ESpeechOutputType.GenerateAudioClip)
                {
                    AudioClip clip = createAudioClipFromWav(synthesisResult.AudioData);
                    if (clip != null)
                    {
                        this.GeneratedAudioClip = clip;
                    }
                }
            }
        }
        catch (Exception exception)
        {
            Debug.LogException(exception, this);
        }
        finally
        {
            // Keep started/ended notifications balanced even when synthesis failed.
            if (outputStartedRaised)
            {
                this.OnSpeechOutputEndedEvent?.Invoke(this, true);
            }

            // Without a synthesizer (initialization failed) the service is not usable yet.
            this.SpeechSynthesizerState = this.speechSynthesizer != null
                ? ESpeechSynthesizerState.Ready
                : ESpeechSynthesizerState.Disabled;
        }
    }

    #endregion

    #region Private Functions

    /// <summary>
    /// Disposes any existing synthesizer and creates a new one for the given language and voice.
    /// </summary>
    private async Task initialize(string languageCode, string voiceName)
    {
        this.logIfInDebugMode($"SpeechSynthesizerService initializing using language={languageCode} and voice={voiceName}");

        if (this.speechSynthesizer != null)
        {
            Debug.Log("SpeechSynthesizerService disposing speechSynthesizer from initialize");
            await this.disposeSynthesizer();
        }

        this.SpeechSynthesizerState = ESpeechSynthesizerState.StartingUp;

        this.speechConfig = SpeechConfig.FromSubscription(this.speechKey, this.speechRegion);
        this.speechConfig.SpeechSynthesisLanguage = languageCode;
        this.speechConfig.SpeechSynthesisVoiceName = voiceName;

        if (this.speechOutputType == ESpeechOutputType.PlayDirectly)
        {
            // The speaker output is only needed (and only created) for direct playback.
            this.audioOutputConfig = AudioConfig.FromDefaultSpeakerOutput();
            this.speechSynthesizer = new SpeechSynthesizer(this.speechConfig, this.audioOutputConfig);
        }
        else if (this.speechOutputType == ESpeechOutputType.GenerateAudioClip)
        {
            this.speechConfig.SetSpeechSynthesisOutputFormat(SpeechSynthesisOutputFormat.Riff24Khz16BitMonoPcm);

            // No audio output: the audio is only made available through the synthesis result.
            this.speechSynthesizer = new SpeechSynthesizer(this.speechConfig, (AudioConfig)null);
        }

        this.activeLanguage = languageCode;
        this.activeVoice = voiceName;

        this.logIfInDebugMode($"SpeechSynthesizerService initialized using language={languageCode} and voice={voiceName}");
        this.SpeechSynthesizerState = ESpeechSynthesizerState.Ready;
    }

    /// <summary>
    /// Returns the voice configured for <paramref name="language"/> in the inspector list,
    /// or the multilingual fallback voice when there is no (non-empty) entry.
    /// </summary>
    private string getDefaultVoiceForLanguage(string language)
    {
        LanguageVoice? languageSpecificVoice = this.defaultSynthesisVoices?.FirstOrDefault(lv => lv.LanguageCode == language);
        string languageSpecificVoiceName = languageSpecificVoice?.VoiceName;

        return string.IsNullOrEmpty(languageSpecificVoiceName)
            ? this.fallbackMultilingualSynthesisVoiceName
            : languageSpecificVoiceName;
    }

    /// <summary>
    /// (Re-)creates the synthesizer when none exists yet, when the language changed, or when an
    /// explicit voice override differs from the active voice.
    /// </summary>
    private async Task reInitializeIfNecessary(string languageCode, string voiceNameOverride)
    {
        bool synthesizerMissing = this.speechSynthesizer == null;
        bool languageChanged = languageCode != this.activeLanguage;

        // Compare against the active *voice*; comparing against the active language
        // would force a re-initialization on every call that passes an override.
        bool voiceChanged = voiceNameOverride != null && voiceNameOverride != this.activeVoice;

        if (!synthesizerMissing && !languageChanged && !voiceChanged)
        {
            return;
        }

        // No specific voice defined -> get default voice for this language
        string voiceName = voiceNameOverride ?? this.getDefaultVoiceForLanguage(languageCode);

        await this.initialize(languageCode, voiceName);
    }

    /// <summary>
    /// Logs why a synthesis request ended with <see cref="ResultReason.Canceled"/>.
    /// Errors are always logged; other cancellations only in debug mode.
    /// </summary>
    private void reportCanceledSynthesis(SpeechSynthesisResult synthesisResult)
    {
        SpeechSynthesisCancellationDetails cancellation = SpeechSynthesisCancellationDetails.FromResult(synthesisResult);

        if (cancellation.Reason == CancellationReason.Error)
        {
            Debug.LogError($"(SpeechSynthesizerService) => Speech synthesis failed: errorCode={cancellation.ErrorCode}, details={cancellation.ErrorDetails}");
        }
        else
        {
            this.logIfInDebugMode($"Speech synthesis canceled, reason={cancellation.Reason}");
        }
    }

    /// <summary>
    /// Converts a RIFF/WAVE buffer containing 16-bit PCM into an <see cref="AudioClip"/>.
    /// </summary>
    /// <returns>The clip, or null when the buffer is not a usable 16-bit PCM WAV.</returns>
    private static AudioClip createAudioClipFromWav(byte[] wavData)
    {
        const int RiffHeaderSize = 12;
        const int ChunkHeaderSize = 8;
        const int MinFormatChunkSize = 16;
        const int BytesPerSample = 2;

        if (wavData == null
            || wavData.Length < RiffHeaderSize
            || !hasChunkId(wavData, 0, "RIFF")
            || !hasChunkId(wavData, 8, "WAVE"))
        {
            Debug.LogWarning("(SpeechSynthesizerService) => Synthesized audio is not a RIFF/WAVE buffer, no AudioClip generated");
            return null;
        }

        // Defaults match the output format requested in initialize (24 kHz, 16 bit, mono).
        int channelCount = 1;
        int sampleRate = 24000;
        int bitsPerSample = 16;
        int dataOffset = -1;
        int dataLength = 0;

        int offset = RiffHeaderSize;
        while (offset + ChunkHeaderSize <= wavData.Length)
        {
            int chunkSize = readInt32LittleEndian(wavData, offset + 4);
            int chunkBodyOffset = offset + ChunkHeaderSize;
            int remainingBytes = wavData.Length - chunkBodyOffset;

            if (hasChunkId(wavData, offset, "data"))
            {
                dataOffset = chunkBodyOffset;

                // Fall back to "everything that is left" when the declared size is missing or implausible.
                dataLength = (chunkSize > 0 && chunkSize <= remainingBytes) ? chunkSize : remainingBytes;
                break;
            }

            if (chunkSize < 0 || chunkSize > remainingBytes)
            {
                // Corrupt chunk size; stop instead of walking outside the buffer.
                break;
            }

            if (hasChunkId(wavData, offset, "fmt ") && chunkSize >= MinFormatChunkSize)
            {
                channelCount = readUInt16LittleEndian(wavData, chunkBodyOffset + 2);
                sampleRate = readInt32LittleEndian(wavData, chunkBodyOffset + 4);
                bitsPerSample = readUInt16LittleEndian(wavData, chunkBodyOffset + 14);
            }

            // Chunks are word-aligned: odd sizes are followed by one padding byte.
            offset = chunkBodyOffset + chunkSize + (chunkSize & 1);
        }

        if (dataOffset < 0 || bitsPerSample != 16 || channelCount < 1 || sampleRate < 1)
        {
            Debug.LogWarning("(SpeechSynthesizerService) => Synthesized audio is not 16-bit PCM, no AudioClip generated");
            return null;
        }

        int frameCount = (dataLength / BytesPerSample) / channelCount;
        if (frameCount == 0)
        {
            Debug.LogWarning("(SpeechSynthesizerService) => Synthesized audio contains no samples, no AudioClip generated");
            return null;
        }

        float[] samples = new float[frameCount * channelCount];
        for (int sampleIndex = 0; sampleIndex < samples.Length; sampleIndex++)
        {
            int byteIndex = dataOffset + (sampleIndex * BytesPerSample);
            short pcmValue = (short)(wavData[byteIndex] | (wavData[byteIndex + 1] << 8));

            // Scale signed 16-bit PCM to the [-1, 1) float range.
            samples[sampleIndex] = pcmValue / 32768f;
        }

        AudioClip clip = AudioClip.Create("SynthesizedSpeech", frameCount, channelCount, sampleRate, false);
        clip.SetData(samples, 0);
        return clip;
    }

    /// <summary>Returns true when the four bytes at <paramref name="offset"/> equal the ASCII chunk id.</summary>
    private static bool hasChunkId(byte[] buffer, int offset, string chunkId)
    {
        if (offset + chunkId.Length > buffer.Length)
        {
            return false;
        }

        for (int i = 0; i < chunkId.Length; i++)
        {
            if (buffer[offset + i] != (byte)chunkId[i])
            {
                return false;
            }
        }

        return true;
    }

    /// <summary>Reads a little-endian unsigned 16-bit value.</summary>
    private static int readUInt16LittleEndian(byte[] buffer, int offset)
    {
        return buffer[offset] | (buffer[offset + 1] << 8);
    }

    /// <summary>Reads a little-endian signed 32-bit value.</summary>
    private static int readInt32LittleEndian(byte[] buffer, int offset)
    {
        return buffer[offset]
            | (buffer[offset + 1] << 8)
            | (buffer[offset + 2] << 16)
            | (buffer[offset + 3] << 24);
    }

    /// <summary>Writes <paramref name="message"/> to the Unity log when debug mode is enabled.</summary>
    private void logIfInDebugMode(string message)
    {
        if (!this.debugModeIsActive)
        {
            return;
        }

        Debug.Log($"(SpeechSynthesizerService) => {message}");
    }

    /// <summary>
    /// Stops any running output, disposes the synthesizer together with its speaker output,
    /// clears the active language/voice and switches the state to Disabled.
    /// </summary>
    private async Task disposeSynthesizer()
    {
        if (this.speechSynthesizer == null)
        {
            // A speaker output can be left over when creating the synthesizer failed.
            this.releaseAudioOutputConfig();
            return;
        }

        // Make sure speaking isn't active anymore, otherwise dispose will throw exception
        await this.speechSynthesizer.StopSpeakingAsync();
        while (this.SpeechSynthesizerState == ESpeechSynthesizerState.Speaking)
        {
            // Synthesize resets the state once its pending SpeakTextAsync call has returned.
            await Task.Delay(10);
        }

        this.activeLanguage = null;
        this.activeVoice = null;

        this.speechSynthesizer.Dispose();
        this.speechSynthesizer = null;
        this.releaseAudioOutputConfig();

        this.SpeechSynthesizerState = ESpeechSynthesizerState.Disabled;
    }

    /// <summary>Disposes the speaker output created for direct playback, if any.</summary>
    private void releaseAudioOutputConfig()
    {
        this.audioOutputConfig?.Dispose();
        this.audioOutputConfig = null;
    }

    #endregion
}