[voice] New feature "listen and answer" (#2793)

* New method listenAndAnswer in VoiceManager
* New console command "voice listenandanswer"
* New REST API
* New rule action
* Enhanced console command "voice startdialog"

Closes #2688

Signed-off-by: Laurent Garnier <lg.hc@free.fr>
2025-01-11 05:41:52 +01:00 · 2022-04-09 16:26:09 +02:00 · 2022-04-09 16:26:09 +02:00 · 90f6a95251
commit 90f6a95251
parent 6a75130355
6 changed files with 325 additions and 48 deletions
--- a/bundles/org.openhab.core.io.rest.voice/src/main/java/org/openhab/core/io/rest/voice/internal/VoiceResource.java
+++ b/bundles/org.openhab.core.io.rest.voice/src/main/java/org/openhab/core/io/rest/voice/internal/VoiceResource.java
@ -312,4 +312,64 @@ public class VoiceResource implements RESTResource {
            return JSONResponse.createErrorResponse(Status.BAD_REQUEST, e.getMessage());
        }
    }
+
+    /**
+     * Runs a single listen-and-answer sequence (no keyword spotting). Every id that is given is resolved first; an
+     * unknown id ends the request with a 404, and an {@link IllegalStateException} raised by the voice manager is
+     * reported as a 400. Ids that are not given are passed on as null.
+     */
+    @POST
+    @Path("/listenandanswer")
+    @Consumes(MediaType.TEXT_PLAIN)
+    @Operation(operationId = "listenAndAnswer", summary = "Executes a simple dialog sequence without keyword spotting for a given audio source.", responses = {
+            @ApiResponse(responseCode = "200", description = "OK"),
+            @ApiResponse(responseCode = "404", description = "One of the given ids is wrong."),
+            @ApiResponse(responseCode = "400", description = "Services are missing or language is not supported by services or dialog processing is already started for the audio source.") })
+    public Response listenAndAnswer(
+            @HeaderParam(HttpHeaders.ACCEPT_LANGUAGE) @Parameter(description = "language") @Nullable String language,
+            @QueryParam("sourceId") @Parameter(description = "source ID") @Nullable String sourceId,
+            @QueryParam("sttId") @Parameter(description = "Speech-to-Text ID") @Nullable String sttId,
+            @QueryParam("ttsId") @Parameter(description = "Text-to-Speech ID") @Nullable String ttsId,
+            @QueryParam("hliId") @Parameter(description = "interpreter ID") @Nullable String hliId,
+            @QueryParam("sinkId") @Parameter(description = "audio sink ID") @Nullable String sinkId,
+            @QueryParam("listeningItem") @Parameter(description = "listening item") @Nullable String listeningItem) {
+        // The lookups run in a fixed order, so the first unknown id decides which 404 message is returned.
+        final AudioSource audioSource = sourceId == null ? null : audioManager.getSource(sourceId);
+        if (sourceId != null && audioSource == null) {
+            return JSONResponse.createErrorResponse(Status.NOT_FOUND, "Audio source not found");
+        }
+
+        final STTService sttService = sttId == null ? null : voiceManager.getSTT(sttId);
+        if (sttId != null && sttService == null) {
+            return JSONResponse.createErrorResponse(Status.NOT_FOUND, "Speech-to-Text not found");
+        }
+
+        final TTSService ttsService = ttsId == null ? null : voiceManager.getTTS(ttsId);
+        if (ttsId != null && ttsService == null) {
+            return JSONResponse.createErrorResponse(Status.NOT_FOUND, "Text-to-Speech not found");
+        }
+
+        final HumanLanguageInterpreter interpreter = hliId == null ? null : voiceManager.getHLI(hliId);
+        if (hliId != null && interpreter == null) {
+            return JSONResponse.createErrorResponse(Status.NOT_FOUND, "Interpreter not found");
+        }
+
+        final AudioSink audioSink = sinkId == null ? null : audioManager.getSink(sinkId);
+        if (sinkId != null && audioSink == null) {
+            return JSONResponse.createErrorResponse(Status.NOT_FOUND, "Audio sink not found");
+        }
+
+        final Locale locale = localeService.getLocale(language);
+
+        try {
+            voiceManager.listenAndAnswer(sttService, ttsService, interpreter, audioSource, audioSink, locale,
+                    listeningItem);
+            return Response.ok(null, MediaType.TEXT_PLAIN).build();
+        } catch (IllegalStateException e) {
+            return JSONResponse.createErrorResponse(Status.BAD_REQUEST, e.getMessage());
+        }
+    }
 }
--- a/bundles/org.openhab.core.model.script/src/org/openhab/core/model/script/actions/Voice.java
+++ b/bundles/org.openhab.core.model.script/src/org/openhab/core/model/script/actions/Voice.java
@ -320,4 +320,93 @@ public class Voice {
            logger.warn("Failed stopping dialog processing: {}", e.getMessage());
        }
    }
+
+    /**
+     * Executes a simple dialog sequence without keyword spotting for a given audio source. Delegates to the full
+     * overload with null for the speech-to-text, text-to-speech, interpreter, locale and listening item arguments.
+     *
+     * @param source the name of the audio source to use, or null to use the default source
+     * @param sink the name of the audio sink to use, or null to use the default sink
+     */
+    @ActionDoc(text = "executes a simple dialog sequence without keyword spotting for a given audio source")
+    public static void listenAndAnswer(@ParamDoc(name = "source") @Nullable String source,
+            @ParamDoc(name = "sink") @Nullable String sink) {
+        listenAndAnswer(null, null, null, source, sink, null, null);
+    }
+
+    /**
+     * Executes a simple dialog sequence without keyword spotting for a given audio source.
+     *
+     * Each name that is given is resolved to its service first; when a name cannot be resolved a warning is logged
+     * and nothing is executed. An {@link IllegalStateException} thrown by the voice manager is logged as a warning
+     * too.
+     *
+     * @param stt the speech-to-text service to use or null to use the default service
+     * @param tts the text-to-speech service to use or null to use the default service
+     * @param interpreter the human language text interpreter to use or null to use the default service
+     * @param source the name of audio source to use or null to use the default source
+     * @param sink the name of audio sink to use or null to use the default sink
+     * @param locale the locale to use, written as "language" or "language-country", or null to use the default
+     *            locale
+     * @param listeningItem the item to switch ON while listening to a question
+     */
+    @ActionDoc(text = "executes a simple dialog sequence without keyword spotting for a given audio source")
+    public static void listenAndAnswer(@ParamDoc(name = "speech-to-text service") @Nullable String stt,
+            @ParamDoc(name = "text-to-speech service") @Nullable String tts,
+            @ParamDoc(name = "interpreter") @Nullable String interpreter,
+            @ParamDoc(name = "source") @Nullable String source, @ParamDoc(name = "sink") @Nullable String sink,
+            @ParamDoc(name = "locale") @Nullable String locale,
+            @ParamDoc(name = "listening item") @Nullable String listeningItem) {
+        // Resolve the names in a fixed order; the first one that cannot be found ends the call.
+        final AudioSource resolvedSource = source == null ? null : VoiceActionService.audioManager.getSource(source);
+        if (source != null && resolvedSource == null) {
+            logger.warn("Failed executing simple dialog: audio source '{}' not found", source);
+            return;
+        }
+
+        final STTService resolvedStt = stt == null ? null : VoiceActionService.voiceManager.getSTT(stt);
+        if (stt != null && resolvedStt == null) {
+            logger.warn("Failed executing simple dialog: speech-to-text service '{}' not found", stt);
+            return;
+        }
+
+        final TTSService resolvedTts = tts == null ? null : VoiceActionService.voiceManager.getTTS(tts);
+        if (tts != null && resolvedTts == null) {
+            logger.warn("Failed executing simple dialog: text-to-speech service '{}' not found", tts);
+            return;
+        }
+
+        final HumanLanguageInterpreter resolvedHli = interpreter == null ? null
+                : VoiceActionService.voiceManager.getHLI(interpreter);
+        if (interpreter != null && resolvedHli == null) {
+            logger.warn("Failed executing simple dialog: interpreter '{}' not found", interpreter);
+            return;
+        }
+
+        final AudioSink resolvedSink = sink == null ? null : VoiceActionService.audioManager.getSink(sink);
+        if (sink != null && resolvedSink == null) {
+            logger.warn("Failed executing simple dialog: audio sink '{}' not found", sink);
+            return;
+        }
+
+        // Exactly two dash-separated parts are read as language and country; anything else keeps the first part only.
+        Locale parsedLocale = null;
+        if (locale != null) {
+            final String[] parts = locale.split("-");
+            parsedLocale = parts.length == 2 ? new Locale(parts[0], parts[1]) : new Locale(parts[0]);
+        }
+
+        try {
+            VoiceActionService.voiceManager.listenAndAnswer(resolvedStt, resolvedTts, resolvedHli, resolvedSource,
+                    resolvedSink, parsedLocale, listeningItem);
+        } catch (IllegalStateException e) {
+            logger.warn("Failed executing simple dialog: {}", e.getMessage());
+        }
+    }
 }
--- a/bundles/org.openhab.core.voice/src/main/java/org/openhab/core/voice/VoiceManager.java
+++ b/bundles/org.openhab.core.voice/src/main/java/org/openhab/core/voice/VoiceManager.java
@ -123,20 +123,21 @@ public interface VoiceManager {

    /**
     * Starts an infinite dialog sequence using all default services: keyword spotting on the default audio source,
-     * audio source listening to retrieve the question, speech to text conversion, interpretation, text to speech
-     * conversion and playback of the answer on the default audio sink
+     * audio source listening to retrieve a question or a command (default Speech to Text service), interpretation and
+     * handling of the command, and finally playback of the answer on the default audio sink (default Text to Speech
+     * service).
     *
     * Only one dialog can be started for the default audio source.
     *
-     * @throws IllegalStateException if required services are not all available or the provided locale is not supported
-     *             by all these services or the dialog is already started for the default audio source
+     * @throws IllegalStateException if required services are not all available or the default locale is not supported
+     *             by all these services or a dialog is already started for the default audio source
     */
    void startDialog() throws IllegalStateException;

    /**
     * Starts an infinite dialog sequence: keyword spotting on the audio source, audio source listening to retrieve
-     * the question, speech to text conversion, interpretation, text to speech conversion and playback of the answer
-     * on the audio sink
+     * a question or a command (Speech to Text service), interpretation and handling of the command, and finally
+     * playback of the answer on the audio sink (Text to Speech service).
     *
     * Only one dialog can be started for an audio source.
     *
@ -150,7 +151,7 @@ public interface VoiceManager {
     * @param keyword the keyword to use during keyword spotting or null to use the default keyword
     * @param listeningItem the item to switch ON while listening to a question
     * @throws IllegalStateException if required services are not all available or the provided locale is not supported
-     *             by all these services or the dialog is already started for this audio source
+     *             by all these services or a dialog is already started for this audio source
     */
    void startDialog(@Nullable KSService ks, @Nullable STTService stt, @Nullable TTSService tts,
            @Nullable HumanLanguageInterpreter hli, @Nullable AudioSource source, @Nullable AudioSink sink,
@ -165,6 +166,39 @@ public interface VoiceManager {
     */
    void stopDialog(@Nullable AudioSource source) throws IllegalStateException;

+    /**
+     * Executes a simple dialog sequence without keyword spotting using all default services: default audio source
+     * listening to retrieve a question or a command (default Speech to Text service), interpretation and handling of
+     * the command, and finally playback of the answer on the default audio sink (default Text to Speech service).
+     *
+     * Only possible if no dialog processor is already started for the default audio source.
+     *
+     * @throws IllegalStateException if required services are not all available or the default locale is not
+     *             supported by all these services or a dialog is already started for the default audio source
+     */
+    void listenAndAnswer() throws IllegalStateException;
+
+    /**
+     * Executes a simple dialog sequence without keyword spotting: audio source listening to retrieve a question or a
+     * command (Speech to Text service), interpretation and handling of the command, and finally playback of the
+     * answer on the audio sink (Text to Speech service).
+     *
+     * Only possible if no dialog processor is already started for the audio source.
+     *
+     * @param stt the speech-to-text service to use or null to use the default service
+     * @param tts the text-to-speech service to use or null to use the default service
+     * @param hli the human language text interpreter to use or null to use the default service
+     * @param source the audio source to use or null to use the default source
+     * @param sink the audio sink to use or null to use the default sink
+     * @param locale the locale to use or null to use the default locale
+     * @param listeningItem the item to switch ON while listening to a question
+     * @throws IllegalStateException if required services are not all available or the provided locale is not supported
+     *             by all these services or a dialog is already started for this audio source
+     */
+    void listenAndAnswer(@Nullable STTService stt, @Nullable TTSService tts, @Nullable HumanLanguageInterpreter hli,
+            @Nullable AudioSource source, @Nullable AudioSink sink, @Nullable Locale locale,
+            @Nullable String listeningItem) throws IllegalStateException;
+
    /**
     * Retrieves a TTS service.
     * If a default name is configured and the service available, this is returned. Otherwise, the first available
--- a/bundles/org.openhab.core.voice/src/main/java/org/openhab/core/voice/internal/DialogProcessor.java
+++ b/bundles/org.openhab.core.voice/src/main/java/org/openhab/core/voice/internal/DialogProcessor.java
@ -71,7 +71,7 @@ public class DialogProcessor implements KSListener, STTListener {

    private final Logger logger = LoggerFactory.getLogger(DialogProcessor.class);

-    private final KSService ks;
+    private final @Nullable KSService ks;
    private final STTService stt;
    private final TTSService tts;
    private final HumanLanguageInterpreter hli;
@ -124,23 +124,76 @@ public class DialogProcessor implements KSListener, STTListener {
        this.ttsFormat = VoiceManagerImpl.getBestMatch(tts.getSupportedFormats(), sink.getSupportedFormats());
    }

+    /**
+     * Creates a dialog processor that works without keyword spotting: the keyword spotter and its audio format are
+     * set to null and the keyword to an empty string.
+     *
+     * @param stt the speech-to-text service
+     * @param tts the text-to-speech service
+     * @param hli the human language interpreter
+     * @param source the audio source
+     * @param sink the audio sink
+     * @param locale the locale
+     * @param listeningItem the listening item, may be null
+     * @param eventPublisher the event publisher
+     * @param i18nProvider the translation provider
+     * @param bundle the bundle
+     */
+    public DialogProcessor(STTService stt, TTSService tts, HumanLanguageInterpreter hli, AudioSource source,
+            AudioSink sink, Locale locale, @Nullable String listeningItem, EventPublisher eventPublisher,
+            TranslationProvider i18nProvider, Bundle bundle) {
+        // keyword spotting is not used by this variant
+        this.ks = null;
+        this.keyword = "";
+        this.ksFormat = null;
+
+        // services and audio endpoints of the dialog
+        this.stt = stt;
+        this.tts = tts;
+        this.hli = hli;
+        this.source = source;
+        this.sink = sink;
+
+        // context
+        this.locale = locale;
+        this.listeningItem = listeningItem;
+        this.eventPublisher = eventPublisher;
+        this.i18nProvider = i18nProvider;
+        this.bundle = bundle;
+
+        // audio formats: source -> STT and TTS -> sink
+        this.sttFormat = VoiceManagerImpl.getBestMatch(source.getSupportedFormats(), stt.getSupportedFormats());
+        this.ttsFormat = VoiceManagerImpl.getBestMatch(tts.getSupportedFormats(), sink.getSupportedFormats());
+    }
+
    public void start() {
-        AudioFormat fmt = ksFormat;
+        KSService ksService = ks; // null when built by the constructor that takes no keyword spotter
+        if (ksService != null) {
+            abortKS();
+            closeStreamKS();
+            AudioFormat fmt = ksFormat;
+            if (fmt == null) {
+                logger.warn("No compatible audio format found for ks '{}' and source '{}'", ksService.getId(),
+                        source.getId());
+                return;
+            }
+            try {
+                AudioStream stream = source.getInputStream(fmt);
+                streamKS = stream;
+                ksServiceHandle = ksService.spot(this, stream, locale, keyword);
+            } catch (AudioException e) {
+                logger.warn("Encountered audio error: {}", e.getMessage());
+            } catch (KSException e) {
+                logger.warn("Encountered error calling spot: {}", e.getMessage());
+                closeStreamKS();
+            }
+        } else {
+            executeSimpleDialog(); // no keyword spotter: start the speech recognition right away
+        }
+    }
+
+    private void executeSimpleDialog() { // one speech-to-text pass; also run when a keyword is spotted
+        abortSTT();
+        closeStreamSTT();
+        isSTTServerAborting = false;
+        AudioFormat fmt = sttFormat;
        if (fmt == null) {
-            logger.warn("No compatible audio format found for ks '{}' and source '{}'", ks.getId(), source.getId());
+            logger.warn("No compatible audio format found for stt '{}' and source '{}'", stt.getId(), source.getId());
            return;
        }
-        abortKS();
-        closeStreamKS();
        try {
            AudioStream stream = source.getInputStream(fmt);
-            streamKS = stream;
-            ksServiceHandle = ks.spot(this, stream, locale, keyword);
+            streamSTT = stream;
+            sttServiceHandle = stt.recognize(this, stream, locale, new HashSet<>());
        } catch (AudioException e) {
-            logger.warn("Encountered audio error: {}", e.getMessage());
-        } catch (KSException e) {
-            logger.warn("Encountered error calling spot: {}", e.getMessage());
-            closeStreamKS();
+            logger.warn("Error creating the audio stream: {}", e.getMessage());
+        } catch (STTException e) {
+            closeStreamSTT();
+            String msg = e.getMessage();
+            String text = i18nProvider.getText(bundle, "error.stt-exception", null, locale);
+            if (msg != null) {
+                say(text == null ? msg : text.replace("{0}", msg)); // raw message when no translation is found
+            } else if (text != null) {
+                say(text.replace("{0}", ""));
+            }
        }
    }

@ -210,32 +263,10 @@ public class DialogProcessor implements KSListener, STTListener {
        if (!processing) {
            isSTTServerAborting = false;
            if (ksEvent instanceof KSpottedEvent) {
-                abortSTT();
-                closeStreamSTT();
-                isSTTServerAborting = false;
-                AudioFormat fmt = sttFormat;
-                if (fmt != null) {
-                    try {
-                        AudioStream stream = source.getInputStream(fmt);
-                        streamSTT = stream;
-                        sttServiceHandle = stt.recognize(this, stream, locale, new HashSet<>());
-                    } catch (AudioException e) {
-                        logger.warn("Error creating the audio stream: {}", e.getMessage());
-                    } catch (STTException e) {
-                        closeStreamSTT();
-                        String msg = e.getMessage();
-                        String text = i18nProvider.getText(bundle, "error.stt-exception", null, locale);
-                        if (msg != null) {
-                            say(text == null ? msg : text.replace("{0}", msg));
-                        } else if (text != null) {
-                            say(text.replace("{0}", ""));
-                        }
-                    }
-                } else {
-                    logger.warn("No compatible audio format found for stt '{}' and source '{}'", stt.getId(),
-                            source.getId());
-                }
+                logger.debug("KSpottedEvent event received");
+                executeSimpleDialog();
            } else if (ksEvent instanceof KSErrorEvent) {
+                logger.debug("KSErrorEvent event received");
                KSErrorEvent kse = (KSErrorEvent) ksEvent;
                String text = i18nProvider.getText(bundle, "error.ks-error", null, locale);
                say(text == null ? kse.getMessage() : text.replace("{0}", kse.getMessage()));
@ -246,25 +277,30 @@ public class DialogProcessor implements KSListener, STTListener {
    @Override
    public synchronized void sttEventReceived(STTEvent sttEvent) {
        if (sttEvent instanceof SpeechRecognitionEvent) {
+            logger.debug("SpeechRecognitionEvent event received");
            if (!isSTTServerAborting) {
                SpeechRecognitionEvent sre = (SpeechRecognitionEvent) sttEvent;
                String question = sre.getTranscript();
+                logger.debug("Text recognized: {}", question);
                try {
                    toggleProcessing(false);
-                    say(hli.interpret(locale, question));
+                    String answer = hli.interpret(locale, question);
+                    logger.debug("Interpretation result: {}", answer);
+                    say(answer);
                } catch (InterpretationException e) {
-                    String msg = e.getMessage();
-                    if (msg != null) {
-                        say(msg);
-                    }
+                    logger.debug("Interpretation exception: {}", e.getMessage());
+                    say(e.getMessage());
                }
                abortSTT();
            }
        } else if (sttEvent instanceof RecognitionStartEvent) {
+            logger.debug("RecognitionStartEvent event received");
            toggleProcessing(true);
        } else if (sttEvent instanceof RecognitionStopEvent) {
+            logger.debug("RecognitionStopEvent event received");
            toggleProcessing(false);
        } else if (sttEvent instanceof SpeechRecognitionErrorEvent) {
+            logger.debug("SpeechRecognitionErrorEvent event received");
            if (!isSTTServerAborting) {
                abortSTT();
                toggleProcessing(false);
--- a/bundles/org.openhab.core.voice/src/main/java/org/openhab/core/voice/internal/VoiceConsoleCommandExtension.java
+++ b/bundles/org.openhab.core.voice/src/main/java/org/openhab/core/voice/internal/VoiceConsoleCommandExtension.java
@ -60,6 +60,7 @@ public class VoiceConsoleCommandExtension extends AbstractConsoleCommandExtensio
    private static final String SUBCMD_VOICES = "voices";
    private static final String SUBCMD_START_DIALOG = "startdialog";
    private static final String SUBCMD_STOP_DIALOG = "stopdialog";
+    private static final String SUBCMD_LISTEN_ANSWER = "listenandanswer";
    private static final String SUBCMD_INTERPRETERS = "interpreters";
    private static final String SUBCMD_KEYWORD_SPOTTERS = "keywordspotters";
    private static final String SUBCMD_STT_SERVICES = "sttservices";
@ -91,6 +92,8 @@ public class VoiceConsoleCommandExtension extends AbstractConsoleCommandExtensio
                        "start a new dialog processing using the default services or the services identified with provided arguments"),
                buildCommandUsage(SUBCMD_STOP_DIALOG + " [<source>]",
                        "stop the dialog processing for the default audio source or the audio source identified with provided argument"),
+                buildCommandUsage(SUBCMD_LISTEN_ANSWER + " [<source> [<sink> [<interpreter> [<tts> [<stt>]]]]]",
+                        "Execute a simple dialog sequence without keyword spotting using the default services or the services identified with provided arguments"),
                buildCommandUsage(SUBCMD_INTERPRETERS, "lists the interpreters"),
                buildCommandUsage(SUBCMD_KEYWORD_SPOTTERS, "lists the keyword spotters"),
                buildCommandUsage(SUBCMD_STT_SERVICES, "lists the Speech-to-Text services"),
@ -151,6 +154,19 @@ public class VoiceConsoleCommandExtension extends AbstractConsoleCommandExtensio
                                "An error occurred while stopping the dialog"));
                    }
                    break;
+                case SUBCMD_LISTEN_ANSWER:
+                    try {
+                        AudioSource source = args.length < 2 ? null : audioManager.getSource(args[1]);
+                        AudioSink sink = args.length < 3 ? null : audioManager.getSink(args[2]);
+                        HumanLanguageInterpreter hli = args.length < 4 ? null : voiceManager.getHLI(args[3]);
+                        TTSService tts = args.length < 5 ? null : voiceManager.getTTS(args[4]);
+                        STTService stt = args.length < 6 ? null : voiceManager.getSTT(args[5]);
+                        voiceManager.listenAndAnswer(stt, tts, hli, source, sink, null, null);
+                    } catch (IllegalStateException e) {
+                        console.println(Objects.requireNonNullElse(e.getMessage(),
+                                "An error occurred while executing the simple dialog sequence"));
+                    }
+                    break;
                case SUBCMD_INTERPRETERS:
                    listInterpreters(console);
                    return;
--- a/bundles/org.openhab.core.voice/src/main/java/org/openhab/core/voice/internal/VoiceManagerImpl.java
+++ b/bundles/org.openhab.core.voice/src/main/java/org/openhab/core/voice/internal/VoiceManagerImpl.java
@ -549,6 +549,48 @@ public class VoiceManagerImpl implements VoiceManager, ConfigOptionProvider {
        dialogProcessors.clear();
    }

+    @Override
+    public void listenAndAnswer() throws IllegalStateException {
+        listenAndAnswer(null, null, null, null, null, null, null); // all null: the overload falls back to defaults
+    }
+
+    @Override
+    public void listenAndAnswer(@Nullable STTService stt, @Nullable TTSService tts,
+            @Nullable HumanLanguageInterpreter hli, @Nullable AudioSource source, @Nullable AudioSink sink,
+            @Nullable Locale locale, @Nullable String listeningItem) throws IllegalStateException {
+        // every null argument is replaced by the corresponding default
+        final STTService effectiveStt = stt != null ? stt : getSTT();
+        final TTSService effectiveTts = tts != null ? tts : getTTS();
+        final HumanLanguageInterpreter effectiveHli = hli != null ? hli : getHLI();
+        final AudioSource effectiveSource = source != null ? source : audioManager.getSource();
+        final AudioSink effectiveSink = sink != null ? sink : audioManager.getSink();
+        final Locale effectiveLocale = locale != null ? locale : localeProvider.getLocale();
+        final String effectiveItem = listeningItem != null ? listeningItem : this.listeningItem;
+        final Bundle currentBundle = bundle;
+
+        if (effectiveStt == null || effectiveTts == null || effectiveHli == null || effectiveSource == null
+                || effectiveSink == null || currentBundle == null) {
+            throw new IllegalStateException("Cannot execute a simple dialog as services are missing.");
+        }
+
+        // only the speech-to-text service and the interpreter are checked against the locale
+        final boolean localeSupported = checkLocales(effectiveStt.getSupportedLocales(), effectiveLocale)
+                && checkLocales(effectiveHli.getSupportedLocales(), effectiveLocale);
+        if (!localeSupported) {
+            throw new IllegalStateException(
+                    "Cannot execute a simple dialog as provided locale is not supported by all services.");
+        }
+
+        // refuse to run while a dialog processor is registered for the same audio source
+        if (dialogProcessors.get(effectiveSource.getId()) != null) {
+            throw new IllegalStateException(String.format(
+                    "Cannot execute a simple dialog as a dialog is already started for audio source '%s'.",
+                    effectiveSource.getLabel(null)));
+        }
+
+        logger.debug("Executing a simple dialog for source {} ({})", effectiveSource.getLabel(null),
+                effectiveSource.getId());
+        // the processor is started but not added to dialogProcessors
+        new DialogProcessor(effectiveStt, effectiveTts, effectiveHli, effectiveSource, effectiveSink,
+                effectiveLocale, effectiveItem, this.eventPublisher, this.i18nProvider, currentBundle).start();
+    }
+
    private boolean checkLocales(Set<Locale> supportedLocales, Locale locale) {
        if (supportedLocales.isEmpty()) {
            return true;