328 lines
		
	
	
		
			9.2 KiB
		
	
	
	
		
			C#
		
	
	
		
			Executable File
		
	
	
	
	
			
		
		
	
	
			328 lines
		
	
	
		
			9.2 KiB
		
	
	
	
		
			C#
		
	
	
		
			Executable File
		
	
	
	
	
| using System;
 | |
| using System.Collections.Generic;
 | |
| using System.Linq;
 | |
| using System.Threading.Tasks;
 | |
| using Microsoft.CognitiveServices.Speech;
 | |
| using Microsoft.CognitiveServices.Speech.Audio;
 | |
| using UnityEngine;
 | |
| 
 | |
| #region Enums
 | |
| 
 | |
/// <summary>
/// Lifecycle states reported by <c>SpeechSynthesizerService</c>.
/// The numeric values are spaced out and explicit; keep them stable.
/// </summary>
public enum ESpeechSynthesizerState : int
{
    /// <summary>Initial state, and the state while a synthesizer is being (re)created.</summary>
    StartingUp = 0,

    /// <summary>A synthesizer has been set up, or a synthesis request has finished.</summary>
    Ready = 10,

    /// <summary>A request is being played directly to the audio output.</summary>
    Speaking = 20,

    /// <summary>A request is being rendered for an audio clip instead of being played.</summary>
    GeneratingClip = 30,

    /// <summary>The synthesizer has been disposed.</summary>
    Disabled = 90,
}
 | |
| 
 | |
/// <summary>
/// Selects what <c>SpeechSynthesizerService</c> does with synthesized audio.
/// </summary>
public enum ESpeechOutputType : int
{
    /// <summary>Play the synthesized speech through the default speaker output.</summary>
    PlayDirectly = 0,

    /// <summary>Synthesize without a speaker output so the audio can be turned into an audio clip.</summary>
    GenerateAudioClip = 10,
}
 | |
| 
 | |
| #endregion
 | |
| 
 | |
/// <summary>
/// Unity component that wraps the Azure Speech SDK <see cref="SpeechSynthesizer"/>.
/// It (re)creates the synthesizer whenever the requested language or voice changes,
/// either plays the synthesized speech directly or prepares it for an audio clip,
/// and reports progress through <see cref="SpeechSynthesizerState"/> and the public events.
/// </summary>
public class SpeechSynthesizerService : MonoBehaviour
{

    #region Inspector Properties

    [Header("Config Values")]
    [SerializeField]
    private bool debugModeIsActive;

    [SerializeField]
    private ESpeechOutputType speechOutputType;

    [SerializeField]
    private string speechKey;

    [SerializeField]
    private string speechRegion;

    [SerializeField]
    [Tooltip("Format: 'de-CH'")]
    private string initialSynthesisLanguage;

    [SerializeField]
    [Tooltip("Voice gallery: https://speech.microsoft.com/portal/voicegallery")]
    private List<LanguageVoice> defaultSynthesisVoices;

    [SerializeField]
    private string fallbackMultilingualSynthesisVoiceName;

    #endregion

    #region Public Properties

    #region SpeechSynthesizerState

    private ESpeechSynthesizerState _speechSynthesizerState = ESpeechSynthesizerState.StartingUp;

    /// <summary>
    /// Current lifecycle state. Setting a different value logs the change (debug mode only)
    /// and raises <see cref="OnSpeechSynthesizerStateChangedEvent"/>.
    /// </summary>
    public ESpeechSynthesizerState SpeechSynthesizerState
    {
        get { return this._speechSynthesizerState; }
        set
        {
            if (value != this._speechSynthesizerState)
            {
                this.logIfInDebugMode($"SpeechSynthesizerState changed, new value= {value}");

                this._speechSynthesizerState = value;

                this.OnSpeechSynthesizerStateChangedEvent?.Invoke(this, value);
            }
        }
    }

    /// <summary>Raised with the new state whenever <see cref="SpeechSynthesizerState"/> changes.</summary>
    public event EventHandler<ESpeechSynthesizerState> OnSpeechSynthesizerStateChangedEvent;

    #endregion

    #region GeneratedAudioClip

    private AudioClip _generatedAudioClip = null;

    /// <summary>
    /// Most recently generated audio clip. Setting a different value logs the change (debug mode only)
    /// and raises <see cref="OnGeneratedAudioClipChangedEvent"/>.
    /// </summary>
    public AudioClip GeneratedAudioClip
    {
        get { return this._generatedAudioClip; }
        set
        {
            if (value != this._generatedAudioClip)
            {
                this.logIfInDebugMode($"GeneratedAudioClip changed, new value= {value}");

                this._generatedAudioClip = value;

                this.OnGeneratedAudioClipChangedEvent?.Invoke(this, value);
            }
        }
    }

    /// <summary>Raised with the new clip whenever <see cref="GeneratedAudioClip"/> changes.</summary>
    public event EventHandler<AudioClip> OnGeneratedAudioClipChangedEvent;

    #endregion

    #endregion

    #region Private Properties

    private ViaggioAIManager vaim { get { return ViaggioAIManager.Instance; } }

    private SpeechSynthesizer speechSynthesizer;
    private SpeechConfig speechConfig;

    // Speaker output owned by the current synthesizer (PlayDirectly only); disposed together with it.
    private AudioConfig audioConfigOutput;

    private string activeLanguage = null;
    private string activeVoice = null;

    #endregion

    #region Framework Functions

    /// <summary>
    /// Unity callback: releases the synthesizer when the component is disabled.
    /// </summary>
    async void OnDisable()
    {
        this.logIfInDebugMode("SpeechSynthesizerService disposing speechSynthesizer from OnDisable");

        try
        {
            await this.disposeSynthesizer();
        }
        catch (Exception ex)
        {
            // async void: nobody can await this call, so surface the failure in the Unity log.
            Debug.LogException(ex, this);
        }
    }

    #endregion

    #region Private Events

    #endregion

    #region Public Events

    /// <summary>Raised (with <c>true</c>) right before direct playback of a request starts.</summary>
    public event EventHandler<bool> OnSpeechOutputStartedEvent;

    /// <summary>Raised (with <c>true</c>) after direct playback of a request has ended, successfully or not.</summary>
    public event EventHandler<bool> OnSpeechOutputEndedEvent;

    /// <summary>Raised with an error message when a synthesis request was canceled or failed.</summary>
    public event EventHandler<string> OnSynthErrorChangedEvent;

    #endregion

    #region Public Functions

    /// <summary>
    /// Synthesizes <paramref name="text"/>. A request that is currently being spoken is stopped first,
    /// and the synthesizer is re-created if the language or voice differs from the active one.
    /// Failures are reported through <see cref="OnSynthErrorChangedEvent"/> and the Unity log.
    /// </summary>
    /// <param name="text">Text to synthesize.</param>
    /// <param name="languageCode">Synthesis language, e.g. 'de-CH'.</param>
    /// <param name="voiceNameOverride">
    /// Voice to use. When null, a default voice is only looked up if the synthesizer has to be
    /// re-created; otherwise the currently active voice is kept.
    /// </param>
    public async void Synthesize(string text, string languageCode, string voiceNameOverride = null)
    {
        try
        {
            if (this.SpeechSynthesizerState == ESpeechSynthesizerState.Speaking && this.speechSynthesizer != null)
            {
                await this.speechSynthesizer.StopSpeakingAsync();
            }

            await this.reInitializeIfNecessary(languageCode, voiceNameOverride);

            bool playDirectly = this.speechOutputType == ESpeechOutputType.PlayDirectly;
            bool generateAudioClip = this.speechOutputType == ESpeechOutputType.GenerateAudioClip;

            try
            {
                if (generateAudioClip)
                {
                    this.SpeechSynthesizerState = ESpeechSynthesizerState.GeneratingClip;
                }
                else if (playDirectly)
                {
                    this.SpeechSynthesizerState = ESpeechSynthesizerState.Speaking;

                    this.OnSpeechOutputStartedEvent?.Invoke(this, true);
                }

                // The result holds native audio data and must be disposed after use.
                using (SpeechSynthesisResult synthesisResult = await this.speechSynthesizer.SpeakTextAsync(text))
                {
                    if (synthesisResult.Reason == ResultReason.Canceled)
                    {
                        SpeechSynthesisCancellationDetails cancellation = SpeechSynthesisCancellationDetails.FromResult(synthesisResult);
                        this.logIfInDebugMode($"Speech synthesis canceled: reason={cancellation.Reason}, errorCode={cancellation.ErrorCode}, details={cancellation.ErrorDetails}");

                        this.OnSynthErrorChangedEvent?.Invoke(this, "Speech synthesis failed! Check internet connection");
                    }

                    if (generateAudioClip)
                    {
                        // Todo add WavUtility
                        // this.GeneratedAudioClip = WavUtility.ToAudioClip(synthesisResult.AudioData);
                    }
                }
            }
            finally
            {
                // Always leave the Speaking/GeneratingClip state, even on failure. Otherwise listeners
                // would wait forever for the end event and disposeSynthesizer() would never finish.
                if (playDirectly)
                {
                    this.OnSpeechOutputEndedEvent?.Invoke(this, true);
                }

                this.SpeechSynthesizerState = ESpeechSynthesizerState.Ready;
            }
        }
        catch (Exception ex)
        {
            // async void: an exception escaping here could not be observed by the caller.
            Debug.LogException(ex, this);

            this.OnSynthErrorChangedEvent?.Invoke(this, $"Speech synthesis failed! {ex.Message}");
        }
    }

    /// <summary>
    /// Stops the request that is currently being spoken. Does nothing if no synthesizer exists yet.
    /// </summary>
    public async Task Stop()
    {
        if (this.speechSynthesizer == null)
        {
            return;
        }

        await this.speechSynthesizer.StopSpeakingAsync();
    }

    #endregion

    #region Private Functions

    /// <summary>
    /// Disposes any existing synthesizer and creates a new one for the given language and voice,
    /// configured according to <see cref="speechOutputType"/>.
    /// </summary>
    private async Task initialize(string languageCode, string voiceName)
    {
        this.logIfInDebugMode($"SpeechSynthesizerService initializing using language={languageCode} and voice={voiceName}");

        if (this.speechSynthesizer != null)
        {
            this.logIfInDebugMode("SpeechSynthesizerService disposing speechSynthesizer from initialize");
            await this.disposeSynthesizer();
        }

        this.SpeechSynthesizerState = ESpeechSynthesizerState.StartingUp;

        this.speechConfig = SpeechConfig.FromSubscription(this.speechKey, this.speechRegion);
        this.speechConfig.SpeechSynthesisLanguage = languageCode;
        this.speechConfig.SpeechSynthesisVoiceName = voiceName;

        if (this.speechOutputType == ESpeechOutputType.PlayDirectly)
        {
            // Only create the speaker output when it is actually used; it is disposed with the synthesizer.
            this.audioConfigOutput = AudioConfig.FromDefaultSpeakerOutput();
            this.speechSynthesizer = new SpeechSynthesizer(this.speechConfig, this.audioConfigOutput);
        }
        else if (this.speechOutputType == ESpeechOutputType.GenerateAudioClip)
        {
            this.speechConfig.SetSpeechSynthesisOutputFormat(SpeechSynthesisOutputFormat.Riff24Khz16BitMonoPcm);

            // A null AudioConfig means nothing is played; the audio is only available on the result.
            this.speechSynthesizer = new SpeechSynthesizer(this.speechConfig, (AudioConfig)null);
        }

        this.activeLanguage = languageCode;
        this.activeVoice = voiceName;

        this.logIfInDebugMode($"SpeechSynthesizerService initialized using language={languageCode} and voice={voiceName}");

        this.SpeechSynthesizerState = ESpeechSynthesizerState.Ready;
    }

    /// <summary>
    /// Returns the voice configured for <paramref name="language"/> in <see cref="defaultSynthesisVoices"/>,
    /// or <see cref="fallbackMultilingualSynthesisVoiceName"/> when there is no non-empty entry.
    /// </summary>
    private string getDefaultVoiceForLanguage(string language)
    {
        // The list may be null when it was never assigned (e.g. the component was not set up in the inspector).
        LanguageVoice? languageSpecificVoice = this.defaultSynthesisVoices?.FirstOrDefault(lv => lv.LanguageCode == language);

        string languageSpecificVoiceName = languageSpecificVoice?.VoiceName;

        if (!string.IsNullOrEmpty(languageSpecificVoiceName))
        {
            return languageSpecificVoiceName;
        }

        return this.fallbackMultilingualSynthesisVoiceName;
    }

    /// <summary>
    /// Re-creates the synthesizer when none exists, when the language changed, or when an explicit
    /// voice override differs from the active voice.
    /// </summary>
    private async Task reInitializeIfNecessary(string languageCode, string voiceNameOverride)
    {
        bool notInitialized = this.speechSynthesizer == null;
        bool languageChanged = languageCode != this.activeLanguage;

        // Compare against the active VOICE (the original compared against the active language,
        // which forced a re-initialization on every call that passed a voice override).
        bool voiceChanged = voiceNameOverride != null && voiceNameOverride != this.activeVoice;

        if (!notInitialized && !languageChanged && !voiceChanged)
        {
            return;
        }

        // No specific voice defined -> get default voice for this language
        string voiceName = voiceNameOverride ?? this.getDefaultVoiceForLanguage(languageCode);

        await this.initialize(languageCode, voiceName);
    }

    /// <summary>Writes <paramref name="message"/> to the Unity log when debug mode is enabled.</summary>
    private void logIfInDebugMode(string message)
    {
        if (!this.debugModeIsActive)
        {
            return;
        }

        Debug.Log($"(SpeechSynthesizerService) => {message}");
    }

    /// <summary>
    /// Stops and disposes the current synthesizer (and its speaker output), clears the active
    /// language/voice and switches to <see cref="ESpeechSynthesizerState.Disabled"/>.
    /// Does nothing when there is no synthesizer.
    /// </summary>
    private async Task disposeSynthesizer()
    {
        // Work on a local copy: the field can change while this method is awaiting.
        SpeechSynthesizer synthesizerToDispose = this.speechSynthesizer;

        if (synthesizerToDispose == null)
        {
            return;
        }

        // Make sure speaking isn't active anymore, otherwise dispose will throw exception
        await synthesizerToDispose.StopSpeakingAsync();

        while (this.SpeechSynthesizerState == ESpeechSynthesizerState.Speaking)
        {
            await Task.Delay(10);
        }

        if (!object.ReferenceEquals(this.speechSynthesizer, synthesizerToDispose))
        {
            // An overlapping dispose (e.g. OnDisable during initialize) already released this instance.
            return;
        }

        this.activeLanguage = null;
        this.activeVoice = null;

        this.speechSynthesizer = null;
        synthesizerToDispose.Dispose();

        this.audioConfigOutput?.Dispose();
        this.audioConfigOutput = null;

        this.SpeechSynthesizerState = ESpeechSynthesizerState.Disabled;
    }

    #endregion

}
 |