From 90f6a95251570fadb884dc99883ee8e3d0a8e4b4 Mon Sep 17 00:00:00 2001 From: lolodomo Date: Sat, 9 Apr 2022 16:26:09 +0200 Subject: [PATCH] [voice] New feature "listen and answer" (#2793) * New method listenAndAnswer in VoiceManager * New console command "voice listenandanswer" * New REST API * New rule action * Enhanced console command "voice startdialog" Closes #2688 Signed-off-by: Laurent Garnier --- .../io/rest/voice/internal/VoiceResource.java | 60 +++++++++ .../core/model/script/actions/Voice.java | 89 +++++++++++++ .../org/openhab/core/voice/VoiceManager.java | 48 +++++-- .../core/voice/internal/DialogProcessor.java | 118 ++++++++++++------ .../VoiceConsoleCommandExtension.java | 16 +++ .../core/voice/internal/VoiceManagerImpl.java | 42 +++++++ 6 files changed, 325 insertions(+), 48 deletions(-) diff --git a/bundles/org.openhab.core.io.rest.voice/src/main/java/org/openhab/core/io/rest/voice/internal/VoiceResource.java b/bundles/org.openhab.core.io.rest.voice/src/main/java/org/openhab/core/io/rest/voice/internal/VoiceResource.java index 13ed4403b..bb21e0ddd 100644 --- a/bundles/org.openhab.core.io.rest.voice/src/main/java/org/openhab/core/io/rest/voice/internal/VoiceResource.java +++ b/bundles/org.openhab.core.io.rest.voice/src/main/java/org/openhab/core/io/rest/voice/internal/VoiceResource.java @@ -312,4 +312,64 @@ public class VoiceResource implements RESTResource { return JSONResponse.createErrorResponse(Status.BAD_REQUEST, e.getMessage()); } } + + @POST + @Path("/listenandanswer") + @Consumes(MediaType.TEXT_PLAIN) + @Operation(operationId = "listenAndAnswer", summary = "Executes a simple dialog sequence without keyword spotting for a given audio source.", responses = { + @ApiResponse(responseCode = "200", description = "OK"), + @ApiResponse(responseCode = "404", description = "One of the given ids is wrong."), + @ApiResponse(responseCode = "400", description = "Services are missing or language is not supported by services or dialog processing is already 
started for the audio source.") }) + public Response listenAndAnswer( + @HeaderParam(HttpHeaders.ACCEPT_LANGUAGE) @Parameter(description = "language") @Nullable String language, + @QueryParam("sourceId") @Parameter(description = "source ID") @Nullable String sourceId, + @QueryParam("sttId") @Parameter(description = "Speech-to-Text ID") @Nullable String sttId, + @QueryParam("ttsId") @Parameter(description = "Text-to-Speech ID") @Nullable String ttsId, + @QueryParam("hliId") @Parameter(description = "interpreter ID") @Nullable String hliId, + @QueryParam("sinkId") @Parameter(description = "audio sink ID") @Nullable String sinkId, + @QueryParam("listeningItem") @Parameter(description = "listening item") @Nullable String listeningItem) { + AudioSource source = null; + if (sourceId != null) { + source = audioManager.getSource(sourceId); + if (source == null) { + return JSONResponse.createErrorResponse(Status.NOT_FOUND, "Audio source not found"); + } + } + STTService stt = null; + if (sttId != null) { + stt = voiceManager.getSTT(sttId); + if (stt == null) { + return JSONResponse.createErrorResponse(Status.NOT_FOUND, "Speech-to-Text not found"); + } + } + TTSService tts = null; + if (ttsId != null) { + tts = voiceManager.getTTS(ttsId); + if (tts == null) { + return JSONResponse.createErrorResponse(Status.NOT_FOUND, "Text-to-Speech not found"); + } + } + HumanLanguageInterpreter hli = null; + if (hliId != null) { + hli = voiceManager.getHLI(hliId); + if (hli == null) { + return JSONResponse.createErrorResponse(Status.NOT_FOUND, "Interpreter not found"); + } + } + AudioSink sink = null; + if (sinkId != null) { + sink = audioManager.getSink(sinkId); + if (sink == null) { + return JSONResponse.createErrorResponse(Status.NOT_FOUND, "Audio sink not found"); + } + } + final Locale locale = localeService.getLocale(language); + + try { + voiceManager.listenAndAnswer(stt, tts, hli, source, sink, locale, listeningItem); + return Response.ok(null, MediaType.TEXT_PLAIN).build(); + } 
catch (IllegalStateException e) { + return JSONResponse.createErrorResponse(Status.BAD_REQUEST, e.getMessage()); + } + } } diff --git a/bundles/org.openhab.core.model.script/src/org/openhab/core/model/script/actions/Voice.java b/bundles/org.openhab.core.model.script/src/org/openhab/core/model/script/actions/Voice.java index 390939ab2..40961843f 100644 --- a/bundles/org.openhab.core.model.script/src/org/openhab/core/model/script/actions/Voice.java +++ b/bundles/org.openhab.core.model.script/src/org/openhab/core/model/script/actions/Voice.java @@ -320,4 +320,93 @@ public class Voice { logger.warn("Failed stopping dialog processing: {}", e.getMessage()); } } + + /** + * Executes a simple dialog sequence without keyword spotting for a given audio source using default speech-to-text + * service, default text-to-speech service, default human language text interpreter and default locale. + * + * @param source the name of audio source to use or null to use the default source + * @param sink the name of audio sink to use or null to use the default sink + */ + @ActionDoc(text = "executes a simple dialog sequence without keyword spotting for a given audio source") + public static void listenAndAnswer(@ParamDoc(name = "source") @Nullable String source, + @ParamDoc(name = "sink") @Nullable String sink) { + listenAndAnswer(null, null, null, source, sink, null, null); + } + + /** + * Executes a simple dialog sequence without keyword spotting for a given audio source. 
+ * + * @param stt the speech-to-text service to use or null to use the default service + * @param tts the text-to-speech service to use or null to use the default service + * @param interpreter the human language text interpreter to use or null to use the default service + * @param source the name of audio source to use or null to use the default source + * @param sink the name of audio sink to use or null to use the default sink + * @param locale the locale to use or null to use the default locale + * @param listeningItem the item to switch ON while listening to a question + */ + @ActionDoc(text = "executes a simple dialog sequence without keyword spotting for a given audio source") + public static void listenAndAnswer(@ParamDoc(name = "speech-to-text service") @Nullable String stt, + @ParamDoc(name = "text-to-speech service") @Nullable String tts, + @ParamDoc(name = "interpreter") @Nullable String interpreter, + @ParamDoc(name = "source") @Nullable String source, @ParamDoc(name = "sink") @Nullable String sink, + @ParamDoc(name = "locale") @Nullable String locale, + @ParamDoc(name = "listening item") @Nullable String listeningItem) { + AudioSource audioSource = null; + if (source != null) { + audioSource = VoiceActionService.audioManager.getSource(source); + if (audioSource == null) { + logger.warn("Failed executing simple dialog: audio source '{}' not found", source); + return; + } + } + STTService sttService = null; + if (stt != null) { + sttService = VoiceActionService.voiceManager.getSTT(stt); + if (sttService == null) { + logger.warn("Failed executing simple dialog: speech-to-text service '{}' not found", stt); + return; + } + } + TTSService ttsService = null; + if (tts != null) { + ttsService = VoiceActionService.voiceManager.getTTS(tts); + if (ttsService == null) { + logger.warn("Failed executing simple dialog: text-to-speech service '{}' not found", tts); + return; + } + } + HumanLanguageInterpreter hliService = null; + if (interpreter != null) { + 
hliService = VoiceActionService.voiceManager.getHLI(interpreter); + if (hliService == null) { + logger.warn("Failed executing simple dialog: interpreter '{}' not found", interpreter); + return; + } + } + AudioSink audioSink = null; + if (sink != null) { + audioSink = VoiceActionService.audioManager.getSink(sink); + if (audioSink == null) { + logger.warn("Failed executing simple dialog: audio sink '{}' not found", sink); + return; + } + } + Locale loc = null; + if (locale != null) { + String[] split = locale.split("-"); + if (split.length == 2) { + loc = new Locale(split[0], split[1]); + } else { + loc = new Locale(split[0]); + } + } + + try { + VoiceActionService.voiceManager.listenAndAnswer(sttService, ttsService, hliService, audioSource, audioSink, + loc, listeningItem); + } catch (IllegalStateException e) { + logger.warn("Failed executing simple dialog: {}", e.getMessage()); + } + } } diff --git a/bundles/org.openhab.core.voice/src/main/java/org/openhab/core/voice/VoiceManager.java b/bundles/org.openhab.core.voice/src/main/java/org/openhab/core/voice/VoiceManager.java index ddf896de1..ae56765f3 100644 --- a/bundles/org.openhab.core.voice/src/main/java/org/openhab/core/voice/VoiceManager.java +++ b/bundles/org.openhab.core.voice/src/main/java/org/openhab/core/voice/VoiceManager.java @@ -123,20 +123,21 @@ public interface VoiceManager { /** * Starts an infinite dialog sequence using all default services: keyword spotting on the default audio source, - * audio source listening to retrieve the question, speech to text conversion, interpretation, text to speech - * conversion and playback of the answer on the default audio sink + * audio source listening to retrieve a question or a command (default Speech to Text service), interpretation and + * handling of the command, and finally playback of the answer on the default audio sink (default Text to Speech + * service). * * Only one dialog can be started for the default audio source. 
* - * @throws IllegalStateException if required services are not all available or the provided locale is not supported - * by all these services or the dialog is already started for the default audio source + * @throws IllegalStateException if required services are not all available or the default locale is not supported + * by all these services or a dialog is already started for the default audio source */ void startDialog() throws IllegalStateException; /** * Starts an infinite dialog sequence: keyword spotting on the audio source, audio source listening to retrieve - * the question, speech to text conversion, interpretation, text to speech conversion and playback of the answer - * on the audio sink + * a question or a command (Speech to Text service), interpretation and handling of the command, and finally + * playback of the answer on the audio sink (Text to Speech service). * * Only one dialog can be started for an audio source. * @@ -150,7 +151,7 @@ public interface VoiceManager { * @param keyword the keyword to use during keyword spotting or null to use the default keyword * @param listeningItem the item to switch ON while listening to a question * @throws IllegalStateException if required services are not all available or the provided locale is not supported - * by all these services or the dialog is already started for this audio source + * by all these services or a dialog is already started for this audio source */ void startDialog(@Nullable KSService ks, @Nullable STTService stt, @Nullable TTSService tts, @Nullable HumanLanguageInterpreter hli, @Nullable AudioSource source, @Nullable AudioSink sink, @@ -165,6 +166,39 @@ public interface VoiceManager { */ void stopDialog(@Nullable AudioSource source) throws IllegalStateException; + /** + * Executes a simple dialog sequence without keyword spotting using all default services: default audio source + * listening to retrieve a question or a command (default Speech to Text service), interpretation and 
handling of + * the command, and finally playback of the answer on the default audio sink (default Text to Speech service). + * + * Only possible if no dialog processor is already started for the default audio source. + * + * @throws IllegalStateException if required services are not all available or the default locale is not + * supported by all these services or a dialog is already started for the default audio source + */ + void listenAndAnswer() throws IllegalStateException; + + /** + * Executes a simple dialog sequence without keyword spotting: audio source listening to retrieve a question or a + * command (Speech to Text service), interpretation and handling of the command, and finally playback of the + * answer on the audio sink (Text to Speech service). + * + * Only possible if no dialog processor is already started for the audio source. + * + * @param stt the speech-to-text service to use or null to use the default service + * @param tts the text-to-speech service to use or null to use the default service + * @param hli the human language text interpreter to use or null to use the default service + * @param source the audio source to use or null to use the default source + * @param sink the audio sink to use or null to use the default sink + * @param locale the locale to use or null to use the default locale + * @param listeningItem the item to switch ON while listening to a question + * @throws IllegalStateException if required services are not all available or the provided locale is not supported + * by all these services or a dialog is already started for this audio source + */ + void listenAndAnswer(@Nullable STTService stt, @Nullable TTSService tts, @Nullable HumanLanguageInterpreter hli, + @Nullable AudioSource source, @Nullable AudioSink sink, @Nullable Locale locale, + @Nullable String listeningItem) throws IllegalStateException; + /** * Retrieves a TTS service. * If a default name is configured and the service available, this is returned. 
Otherwise, the first available diff --git a/bundles/org.openhab.core.voice/src/main/java/org/openhab/core/voice/internal/DialogProcessor.java b/bundles/org.openhab.core.voice/src/main/java/org/openhab/core/voice/internal/DialogProcessor.java index 0d72ad899..d89024e5d 100644 --- a/bundles/org.openhab.core.voice/src/main/java/org/openhab/core/voice/internal/DialogProcessor.java +++ b/bundles/org.openhab.core.voice/src/main/java/org/openhab/core/voice/internal/DialogProcessor.java @@ -71,7 +71,7 @@ public class DialogProcessor implements KSListener, STTListener { private final Logger logger = LoggerFactory.getLogger(DialogProcessor.class); - private final KSService ks; + private final @Nullable KSService ks; private final STTService stt; private final TTSService tts; private final HumanLanguageInterpreter hli; @@ -124,23 +124,76 @@ public class DialogProcessor implements KSListener, STTListener { this.ttsFormat = VoiceManagerImpl.getBestMatch(tts.getSupportedFormats(), sink.getSupportedFormats()); } + public DialogProcessor(STTService stt, TTSService tts, HumanLanguageInterpreter hli, AudioSource source, + AudioSink sink, Locale locale, @Nullable String listeningItem, EventPublisher eventPublisher, + TranslationProvider i18nProvider, Bundle bundle) { + this.locale = locale; + this.ks = null; + this.hli = hli; + this.stt = stt; + this.tts = tts; + this.source = source; + this.sink = sink; + this.keyword = ""; + this.listeningItem = listeningItem; + this.eventPublisher = eventPublisher; + this.i18nProvider = i18nProvider; + this.bundle = bundle; + this.ksFormat = null; + this.sttFormat = VoiceManagerImpl.getBestMatch(source.getSupportedFormats(), stt.getSupportedFormats()); + this.ttsFormat = VoiceManagerImpl.getBestMatch(tts.getSupportedFormats(), sink.getSupportedFormats()); + } + public void start() { - AudioFormat fmt = ksFormat; + KSService ksService = ks; + if (ksService != null) { + abortKS(); + closeStreamKS(); + AudioFormat fmt = ksFormat; + if (fmt == null) { 
+ logger.warn("No compatible audio format found for ks '{}' and source '{}'", ksService.getId(), + source.getId()); + return; + } + try { + AudioStream stream = source.getInputStream(fmt); + streamKS = stream; + ksServiceHandle = ksService.spot(this, stream, locale, keyword); + } catch (AudioException e) { + logger.warn("Encountered audio error: {}", e.getMessage()); + } catch (KSException e) { + logger.warn("Encountered error calling spot: {}", e.getMessage()); + closeStreamKS(); + } + } else { + executeSimpleDialog(); + } + } + + private void executeSimpleDialog() { + abortSTT(); + closeStreamSTT(); + isSTTServerAborting = false; + AudioFormat fmt = sttFormat; if (fmt == null) { - logger.warn("No compatible audio format found for ks '{}' and source '{}'", ks.getId(), source.getId()); + logger.warn("No compatible audio format found for stt '{}' and source '{}'", stt.getId(), source.getId()); return; } - abortKS(); - closeStreamKS(); try { AudioStream stream = source.getInputStream(fmt); - streamKS = stream; - ksServiceHandle = ks.spot(this, stream, locale, keyword); + streamSTT = stream; + sttServiceHandle = stt.recognize(this, stream, locale, new HashSet<>()); } catch (AudioException e) { - logger.warn("Encountered audio error: {}", e.getMessage()); - } catch (KSException e) { - logger.warn("Encountered error calling spot: {}", e.getMessage()); - closeStreamKS(); + logger.warn("Error creating the audio stream: {}", e.getMessage()); + } catch (STTException e) { + closeStreamSTT(); + String msg = e.getMessage(); + String text = i18nProvider.getText(bundle, "error.stt-exception", null, locale); + if (msg != null) { + say(text == null ? 
msg : text.replace("{0}", msg)); + } else if (text != null) { + say(text.replace("{0}", "")); + } } } @@ -210,32 +263,10 @@ public class DialogProcessor implements KSListener, STTListener { if (!processing) { isSTTServerAborting = false; if (ksEvent instanceof KSpottedEvent) { - abortSTT(); - closeStreamSTT(); - isSTTServerAborting = false; - AudioFormat fmt = sttFormat; - if (fmt != null) { - try { - AudioStream stream = source.getInputStream(fmt); - streamSTT = stream; - sttServiceHandle = stt.recognize(this, stream, locale, new HashSet<>()); - } catch (AudioException e) { - logger.warn("Error creating the audio stream: {}", e.getMessage()); - } catch (STTException e) { - closeStreamSTT(); - String msg = e.getMessage(); - String text = i18nProvider.getText(bundle, "error.stt-exception", null, locale); - if (msg != null) { - say(text == null ? msg : text.replace("{0}", msg)); - } else if (text != null) { - say(text.replace("{0}", "")); - } - } - } else { - logger.warn("No compatible audio format found for stt '{}' and source '{}'", stt.getId(), - source.getId()); - } + logger.debug("KSpottedEvent event received"); + executeSimpleDialog(); } else if (ksEvent instanceof KSErrorEvent) { + logger.debug("KSErrorEvent event received"); KSErrorEvent kse = (KSErrorEvent) ksEvent; String text = i18nProvider.getText(bundle, "error.ks-error", null, locale); say(text == null ? 
kse.getMessage() : text.replace("{0}", kse.getMessage())); @@ -246,25 +277,30 @@ public class DialogProcessor implements KSListener, STTListener { @Override public synchronized void sttEventReceived(STTEvent sttEvent) { if (sttEvent instanceof SpeechRecognitionEvent) { + logger.debug("SpeechRecognitionEvent event received"); if (!isSTTServerAborting) { SpeechRecognitionEvent sre = (SpeechRecognitionEvent) sttEvent; String question = sre.getTranscript(); + logger.debug("Text recognized: {}", question); try { toggleProcessing(false); - say(hli.interpret(locale, question)); + String answer = hli.interpret(locale, question); + logger.debug("Interpretation result: {}", answer); + say(answer); } catch (InterpretationException e) { - String msg = e.getMessage(); - if (msg != null) { - say(msg); - } + logger.debug("Interpretation exception: {}", e.getMessage()); + say(e.getMessage()); } abortSTT(); } } else if (sttEvent instanceof RecognitionStartEvent) { + logger.debug("RecognitionStartEvent event received"); toggleProcessing(true); } else if (sttEvent instanceof RecognitionStopEvent) { + logger.debug("RecognitionStopEvent event received"); toggleProcessing(false); } else if (sttEvent instanceof SpeechRecognitionErrorEvent) { + logger.debug("SpeechRecognitionErrorEvent event received"); if (!isSTTServerAborting) { abortSTT(); toggleProcessing(false); diff --git a/bundles/org.openhab.core.voice/src/main/java/org/openhab/core/voice/internal/VoiceConsoleCommandExtension.java b/bundles/org.openhab.core.voice/src/main/java/org/openhab/core/voice/internal/VoiceConsoleCommandExtension.java index ebc4783a5..bf8affce1 100644 --- a/bundles/org.openhab.core.voice/src/main/java/org/openhab/core/voice/internal/VoiceConsoleCommandExtension.java +++ b/bundles/org.openhab.core.voice/src/main/java/org/openhab/core/voice/internal/VoiceConsoleCommandExtension.java @@ -60,6 +60,7 @@ public class VoiceConsoleCommandExtension extends AbstractConsoleCommandExtensio private static final String 
SUBCMD_VOICES = "voices"; private static final String SUBCMD_START_DIALOG = "startdialog"; private static final String SUBCMD_STOP_DIALOG = "stopdialog"; + private static final String SUBCMD_LISTEN_ANSWER = "listenandanswer"; private static final String SUBCMD_INTERPRETERS = "interpreters"; private static final String SUBCMD_KEYWORD_SPOTTERS = "keywordspotters"; private static final String SUBCMD_STT_SERVICES = "sttservices"; @@ -91,6 +92,8 @@ public class VoiceConsoleCommandExtension extends AbstractConsoleCommandExtensio "start a new dialog processing using the default services or the services identified with provided arguments"), buildCommandUsage(SUBCMD_STOP_DIALOG + " [<source>]", "stop the dialog processing for the default audio source or the audio source identified with provided argument"), + buildCommandUsage(SUBCMD_LISTEN_ANSWER + " [<source> [<sink> [<interpreter> [<tts> [<stt>]]]]]", + "Execute a simple dialog sequence without keyword spotting using the default services or the services identified with provided arguments"), buildCommandUsage(SUBCMD_INTERPRETERS, "lists the interpreters"), buildCommandUsage(SUBCMD_KEYWORD_SPOTTERS, "lists the keyword spotters"), buildCommandUsage(SUBCMD_STT_SERVICES, "lists the Speech-to-Text services"), @@ -151,6 +154,19 @@ public class VoiceConsoleCommandExtension extends AbstractConsoleCommandExtensio "An error occurred while stopping the dialog")); } break; + case SUBCMD_LISTEN_ANSWER: + try { + AudioSource source = args.length < 2 ? null : audioManager.getSource(args[1]); + AudioSink sink = args.length < 3 ? null : audioManager.getSink(args[2]); + HumanLanguageInterpreter hli = args.length < 4 ? null : voiceManager.getHLI(args[3]); + TTSService tts = args.length < 5 ? null : voiceManager.getTTS(args[4]); + STTService stt = args.length < 6 ? 
null : voiceManager.getSTT(args[5]); + voiceManager.listenAndAnswer(stt, tts, hli, source, sink, null, null); + } catch (IllegalStateException e) { + console.println(Objects.requireNonNullElse(e.getMessage(), + "An error occurred while executing the simple dialog sequence")); + } + break; case SUBCMD_INTERPRETERS: listInterpreters(console); return; diff --git a/bundles/org.openhab.core.voice/src/main/java/org/openhab/core/voice/internal/VoiceManagerImpl.java b/bundles/org.openhab.core.voice/src/main/java/org/openhab/core/voice/internal/VoiceManagerImpl.java index 94bd72c60..178c925d4 100644 --- a/bundles/org.openhab.core.voice/src/main/java/org/openhab/core/voice/internal/VoiceManagerImpl.java +++ b/bundles/org.openhab.core.voice/src/main/java/org/openhab/core/voice/internal/VoiceManagerImpl.java @@ -549,6 +549,48 @@ public class VoiceManagerImpl implements VoiceManager, ConfigOptionProvider { dialogProcessors.clear(); } + @Override + public void listenAndAnswer() throws IllegalStateException { + listenAndAnswer(null, null, null, null, null, null, null); + } + + @Override + public void listenAndAnswer(@Nullable STTService stt, @Nullable TTSService tts, + @Nullable HumanLanguageInterpreter hli, @Nullable AudioSource source, @Nullable AudioSink sink, + @Nullable Locale locale, @Nullable String listeningItem) throws IllegalStateException { + // use defaults, if null + STTService sttService = (stt == null) ? getSTT() : stt; + TTSService ttsService = (tts == null) ? getTTS() : tts; + HumanLanguageInterpreter interpreter = (hli == null) ? getHLI() : hli; + AudioSource audioSource = (source == null) ? audioManager.getSource() : source; + AudioSink audioSink = (sink == null) ? audioManager.getSink() : sink; + Locale loc = (locale == null) ? localeProvider.getLocale() : locale; + String item = (listeningItem == null) ? 
this.listeningItem : listeningItem; + Bundle b = bundle; + + if (sttService == null || ttsService == null || interpreter == null || audioSource == null || audioSink == null + || b == null) { + throw new IllegalStateException("Cannot execute a simple dialog as services are missing."); + } else if (!checkLocales(sttService.getSupportedLocales(), loc) + || !checkLocales(interpreter.getSupportedLocales(), loc)) { + throw new IllegalStateException( + "Cannot execute a simple dialog as provided locale is not supported by all services."); + } else { + DialogProcessor processor = dialogProcessors.get(audioSource.getId()); + if (processor == null) { + logger.debug("Executing a simple dialog for source {} ({})", audioSource.getLabel(null), + audioSource.getId()); + processor = new DialogProcessor(sttService, ttsService, interpreter, audioSource, audioSink, loc, item, + this.eventPublisher, this.i18nProvider, b); + processor.start(); + } else { + throw new IllegalStateException(String.format( + "Cannot execute a simple dialog as a dialog is already started for audio source '%s'.", + audioSource.getLabel(null))); + } + } + } + private boolean checkLocales(Set supportedLocales, Locale locale) { if (supportedLocales.isEmpty()) { return true;