STT service improvements (#12453)

* [googlestt|voskstt] change default maxSilenceSeconds to 3
* [watsonstt] add singleUtterance mode, rename inactivityTimeout to maxSilenceSeconds and minor improvements
* [watsonstt] trim transcription

Signed-off-by: Miguel Álvarez Díez <miguelwork92@gmail.com>
This commit is contained in:
GiviMAD 2022-03-12 23:06:51 +01:00 committed by GitHub
parent ce6ef25ac3
commit 480cddbf2c
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
10 changed files with 51 additions and 47 deletions

View File

@ -49,7 +49,7 @@ public class GoogleSTTConfiguration {
* Only works when singleUtteranceMode is disabled, max seconds without getting new transcriptions to stop * Only works when singleUtteranceMode is disabled, max seconds without getting new transcriptions to stop
* listening. * listening.
*/ */
public int maxSilenceSeconds = 5; public int maxSilenceSeconds = 3;
/** /**
* Single phrase mode. * Single phrase mode.
*/ */

View File

@ -46,7 +46,7 @@
<label>Max Silence Seconds</label> <label>Max Silence Seconds</label>
<description>Only works when singleUtteranceMode is disabled, max seconds without getting new transcriptions to stop <description>Only works when singleUtteranceMode is disabled, max seconds without getting new transcriptions to stop
listening.</description> listening.</description>
<default>5</default> <default>3</default>
</parameter> </parameter>
<parameter name="refreshSupportedLocales" type="boolean" groupName="stt"> <parameter name="refreshSupportedLocales" type="boolean" groupName="stt">
<label>Refresh Supported Locales</label> <label>Refresh Supported Locales</label>

View File

@ -33,7 +33,7 @@ public class VoskSTTConfiguration {
* Only works when singleUtteranceMode is disabled, max seconds without getting new transcriptions to stop * Only works when singleUtteranceMode is disabled, max seconds without getting new transcriptions to stop
* listening. * listening.
*/ */
public int maxSilenceSeconds = 5; public int maxSilenceSeconds = 3;
/** /**
* Message to be told when no results. * Message to be told when no results.
*/ */

View File

@ -27,7 +27,7 @@
<label>Max Silence Seconds</label> <label>Max Silence Seconds</label>
<description>Only works when singleUtteranceMode is disabled, max seconds without getting new transcriptions to stop <description>Only works when singleUtteranceMode is disabled, max seconds without getting new transcriptions to stop
listening.</description> listening.</description>
<default>5</default> <default>3</default>
</parameter> </parameter>
<parameter name="preloadModel" type="boolean" groupName="stt"> <parameter name="preloadModel" type="boolean" groupName="stt">
<label>Preload Model</label> <label>Preload Model</label>

View File

@ -26,7 +26,8 @@ Use your favorite configuration UI to edit **Settings / Other Services - IBM Wat
* **Background Audio Suppression** - Use the parameter to suppress side conversations or background noise. * **Background Audio Suppression** - Use the parameter to suppress side conversations or background noise.
* **Speech Detector Sensitivity** - Use the parameter to suppress word insertions from music, coughing, and other non-speech events. * **Speech Detector Sensitivity** - Use the parameter to suppress word insertions from music, coughing, and other non-speech events.
* **Inactivity Timeout** - The time in seconds after which, if only silence (no speech) is detected in the audio, the connection is closed. * **Single Utterance Mode** - When enabled recognition stops listening after a single utterance.
* **Max Silence Seconds** - The time in seconds after which, if only silence (no speech) is detected in the audio, the connection is closed.
* **Opt Out Logging** - By default, all IBM Watson™ services log requests and their results. Logging is done only to improve the services for future users. The logged data is not shared or made public. * **Opt Out Logging** - By default, all IBM Watson™ services log requests and their results. Logging is done only to improve the services for future users. The logged data is not shared or made public.
* **No Results Message** - Message to be told when no results. * **No Results Message** - Message to be told when no results.
* **Smart Formatting** - If true, the service converts dates, times, series of digits and numbers, phone numbers, currency values, and internet addresses into more readable. (Not available for all locales) * **Smart Formatting** - If true, the service converts dates, times, series of digits and numbers, phone numbers, currency values, and internet addresses into more readable. (Not available for all locales)
@ -43,7 +44,8 @@ org.openhab.voice.watsonstt:apiKey=******
org.openhab.voice.watsonstt:instanceUrl=https://api.***.speech-to-text.watson.cloud.ibm.com/instances/***** org.openhab.voice.watsonstt:instanceUrl=https://api.***.speech-to-text.watson.cloud.ibm.com/instances/*****
org.openhab.voice.watsonstt:backgroundAudioSuppression=0.5 org.openhab.voice.watsonstt:backgroundAudioSuppression=0.5
org.openhab.voice.watsonstt:speechDetectorSensitivity=0.5 org.openhab.voice.watsonstt:speechDetectorSensitivity=0.5
org.openhab.voice.watsonstt:inactivityTimeout=2 org.openhab.voice.watsonstt:singleUtteranceMode=true
org.openhab.voice.watsonstt:maxSilenceSeconds=2
org.openhab.voice.watsonstt:optOutLogging=false org.openhab.voice.watsonstt:optOutLogging=false
org.openhab.voice.watsonstt:smartFormatting=false org.openhab.voice.watsonstt:smartFormatting=false
org.openhab.voice.watsonstt:redaction=false org.openhab.voice.watsonstt:redaction=false

View File

@ -27,7 +27,7 @@
<dependency> <dependency>
<groupId>com.ibm.cloud</groupId> <groupId>com.ibm.cloud</groupId>
<artifactId>sdk-core</artifactId> <artifactId>sdk-core</artifactId>
<version>9.15.0</version> <version>9.15.4</version>
<scope>compile</scope> <scope>compile</scope>
</dependency> </dependency>
<dependency> <dependency>
@ -39,13 +39,13 @@
<dependency> <dependency>
<groupId>com.squareup.okhttp3</groupId> <groupId>com.squareup.okhttp3</groupId>
<artifactId>okhttp</artifactId> <artifactId>okhttp</artifactId>
<version>4.9.1</version> <version>4.9.3</version>
<scope>compile</scope> <scope>compile</scope>
</dependency> </dependency>
<dependency> <dependency>
<groupId>com.squareup.okhttp3</groupId> <groupId>com.squareup.okhttp3</groupId>
<artifactId>okhttp-urlconnection</artifactId> <artifactId>okhttp-urlconnection</artifactId>
<version>4.9.1</version> <version>4.9.3</version>
<scope>compile</scope> <scope>compile</scope>
</dependency> </dependency>
<dependency> <dependency>

View File

@ -48,9 +48,13 @@ public class WatsonSTTConfiguration {
*/ */
public boolean redaction = false; public boolean redaction = false;
/** /**
* The time in seconds after which, if only silence (no speech) is detected in the audio, the connection is closed. * Single phrase mode.
*/ */
public int inactivityTimeout = 3; public boolean singleUtteranceMode = true;
/**
* max seconds without getting new transcriptions to stop listening.
*/
public int maxSilenceSeconds = 3;
/** /**
* Message to be told when no results * Message to be told when no results
*/ */

View File

@ -23,8 +23,6 @@ import java.util.concurrent.atomic.AtomicBoolean;
import java.util.concurrent.atomic.AtomicReference; import java.util.concurrent.atomic.AtomicReference;
import java.util.stream.Collectors; import java.util.stream.Collectors;
import javax.net.ssl.SSLPeerUnverifiedException;
import org.eclipse.jdt.annotation.NonNullByDefault; import org.eclipse.jdt.annotation.NonNullByDefault;
import org.eclipse.jdt.annotation.Nullable; import org.eclipse.jdt.annotation.Nullable;
import org.openhab.core.audio.AudioFormat; import org.openhab.core.audio.AudioFormat;
@ -47,6 +45,7 @@ import org.osgi.service.component.annotations.Modified;
import org.slf4j.Logger; import org.slf4j.Logger;
import org.slf4j.LoggerFactory; import org.slf4j.LoggerFactory;
import com.google.gson.JsonObject;
import com.ibm.cloud.sdk.core.http.HttpMediaType; import com.ibm.cloud.sdk.core.http.HttpMediaType;
import com.ibm.cloud.sdk.core.security.IamAuthenticator; import com.ibm.cloud.sdk.core.security.IamAuthenticator;
import com.ibm.watson.speech_to_text.v1.SpeechToText; import com.ibm.watson.speech_to_text.v1.SpeechToText;
@ -130,31 +129,13 @@ public class WatsonSTTService implements STTService {
.contentType(contentType).redaction(config.redaction).smartFormatting(config.smartFormatting) .contentType(contentType).redaction(config.redaction).smartFormatting(config.smartFormatting)
.model(locale.toLanguageTag() + "_BroadbandModel").interimResults(true) .model(locale.toLanguageTag() + "_BroadbandModel").interimResults(true)
.backgroundAudioSuppression(config.backgroundAudioSuppression) .backgroundAudioSuppression(config.backgroundAudioSuppression)
.speechDetectorSensitivity(config.speechDetectorSensitivity).inactivityTimeout(config.inactivityTimeout) .speechDetectorSensitivity(config.speechDetectorSensitivity).inactivityTimeout(config.maxSilenceSeconds)
.build(); .build();
final AtomicReference<@Nullable WebSocket> socketRef = new AtomicReference<>(); final AtomicReference<@Nullable WebSocket> socketRef = new AtomicReference<>();
final AtomicBoolean aborted = new AtomicBoolean(false); final AtomicBoolean aborted = new AtomicBoolean(false);
executor.submit(() -> { executor.submit(() -> {
int retries = 2; socketRef.set(speechToText.recognizeUsingWebSocket(wsOptions,
while (retries > 0) { new TranscriptionListener(socketRef, sttListener, config, aborted)));
try {
socketRef.set(speechToText.recognizeUsingWebSocket(wsOptions,
new TranscriptionListener(sttListener, config, aborted)));
break;
} catch (RuntimeException e) {
var cause = e.getCause();
if (cause instanceof SSLPeerUnverifiedException) {
logger.debug("Retrying on error: {}", cause.getMessage());
retries--;
} else {
var errorMessage = e.getMessage();
logger.warn("Aborting on error: {}", errorMessage);
sttListener.sttEventReceived(
new SpeechRecognitionErrorEvent(errorMessage != null ? errorMessage : "Unknown error"));
break;
}
}
}
}); });
return new STTServiceHandle() { return new STTServiceHandle() {
@Override @Override
@ -162,12 +143,7 @@ public class WatsonSTTService implements STTService {
if (!aborted.getAndSet(true)) { if (!aborted.getAndSet(true)) {
var socket = socketRef.get(); var socket = socketRef.get();
if (socket != null) { if (socket != null) {
socket.close(1000, null); sendStopMessage(socket);
socket.cancel();
try {
Thread.sleep(100);
} catch (InterruptedException ignored) {
}
} }
} }
} }
@ -224,17 +200,26 @@ public class WatsonSTTService implements STTService {
return null; return null;
} }
private static void sendStopMessage(WebSocket ws) {
JsonObject stopMessage = new JsonObject();
stopMessage.addProperty("action", "stop");
ws.send(stopMessage.toString());
}
private static class TranscriptionListener implements RecognizeCallback { private static class TranscriptionListener implements RecognizeCallback {
private final Logger logger = LoggerFactory.getLogger(TranscriptionListener.class); private final Logger logger = LoggerFactory.getLogger(TranscriptionListener.class);
private final StringBuilder transcriptBuilder = new StringBuilder(); private final StringBuilder transcriptBuilder = new StringBuilder();
private final STTListener sttListener; private final STTListener sttListener;
private final WatsonSTTConfiguration config; private final WatsonSTTConfiguration config;
private final AtomicBoolean aborted; private final AtomicBoolean aborted;
private final AtomicReference<@Nullable WebSocket> socketRef;
private float confidenceSum = 0f; private float confidenceSum = 0f;
private int responseCount = 0; private int responseCount = 0;
private boolean disconnected = false; private boolean disconnected = false;
public TranscriptionListener(STTListener sttListener, WatsonSTTConfiguration config, AtomicBoolean aborted) { public TranscriptionListener(AtomicReference<@Nullable WebSocket> socketRef, STTListener sttListener,
WatsonSTTConfiguration config, AtomicBoolean aborted) {
this.socketRef = socketRef;
this.sttListener = sttListener; this.sttListener = sttListener;
this.config = config; this.config = config;
this.aborted = aborted; this.aborted = aborted;
@ -256,6 +241,12 @@ public class WatsonSTTService implements STTService {
transcriptBuilder.append(alternative.getTranscript()); transcriptBuilder.append(alternative.getTranscript());
confidenceSum += confidence != null ? confidence.floatValue() : 0f; confidenceSum += confidence != null ? confidence.floatValue() : 0f;
responseCount++; responseCount++;
if (config.singleUtteranceMode) {
var socket = socketRef.get();
if (socket != null) {
sendStopMessage(socket);
}
}
}); });
} }
@ -272,7 +263,7 @@ public class WatsonSTTService implements STTService {
return; return;
} }
logger.warn("TranscriptionError: {}", errorMessage); logger.warn("TranscriptionError: {}", errorMessage);
if (!aborted.get()) { if (!aborted.getAndSet(true)) {
sttListener.sttEventReceived( sttListener.sttEventReceived(
new SpeechRecognitionErrorEvent(errorMessage != null ? errorMessage : "Unknown error")); new SpeechRecognitionErrorEvent(errorMessage != null ? errorMessage : "Unknown error"));
} }
@ -285,7 +276,7 @@ public class WatsonSTTService implements STTService {
if (!aborted.getAndSet(true)) { if (!aborted.getAndSet(true)) {
sttListener.sttEventReceived(new RecognitionStopEvent()); sttListener.sttEventReceived(new RecognitionStopEvent());
float averageConfidence = confidenceSum / (float) responseCount; float averageConfidence = confidenceSum / (float) responseCount;
String transcript = transcriptBuilder.toString(); String transcript = transcriptBuilder.toString().trim();
if (!transcript.isBlank()) { if (!transcript.isBlank()) {
sttListener.sttEventReceived(new SpeechRecognitionEvent(transcript, averageConfidence)); sttListener.sttEventReceived(new SpeechRecognitionEvent(transcript, averageConfidence));
} else { } else {

View File

@ -32,8 +32,8 @@
<description>Use the parameter to suppress word insertions from music, coughing, and other non-speech events.</description> <description>Use the parameter to suppress word insertions from music, coughing, and other non-speech events.</description>
<default>0.5</default> <default>0.5</default>
</parameter> </parameter>
<parameter name="inactivityTimeout" type="integer" unit="s" groupName="stt"> <parameter name="maxSilenceSeconds" type="integer" unit="s" groupName="stt">
<label>Inactivity Timeout</label> <label>Max Silence Seconds</label>
<description>The time in seconds after which, if only silence (no speech) is detected in the audio, the connection is <description>The time in seconds after which, if only silence (no speech) is detected in the audio, the connection is
closed.</description> closed.</description>
<default>3</default> <default>3</default>
@ -43,6 +43,11 @@
<description>Message to be told when no transcription is done.</description> <description>Message to be told when no transcription is done.</description>
<default>No results</default> <default>No results</default>
</parameter> </parameter>
<parameter name="singleUtteranceMode" type="boolean" groupName="stt">
<label>Single Utterance Mode</label>
<description>When enabled recognition stops listening after a single utterance.</description>
<default>true</default>
</parameter>
<parameter name="optOutLogging" type="boolean" groupName="stt"> <parameter name="optOutLogging" type="boolean" groupName="stt">
<label>Opt Out Logging</label> <label>Opt Out Logging</label>
<description>By default, all IBM Watson™ services log requests and their results. Logging is done only to improve the <description>By default, all IBM Watson™ services log requests and their results. Logging is done only to improve the

View File

@ -6,16 +6,18 @@ voice.config.watsonstt.group.authentication.label = Authentication
voice.config.watsonstt.group.authentication.description = Information for connection to your Watson Speech-to-Text instance. voice.config.watsonstt.group.authentication.description = Information for connection to your Watson Speech-to-Text instance.
voice.config.watsonstt.group.stt.label = STT Configuration voice.config.watsonstt.group.stt.label = STT Configuration
voice.config.watsonstt.group.stt.description = Parameters for Watson Speech-to-Text API. voice.config.watsonstt.group.stt.description = Parameters for Watson Speech-to-Text API.
voice.config.watsonstt.inactivityTimeout.label = Inactivity Timeout
voice.config.watsonstt.inactivityTimeout.description = The time in seconds after which, if only silence (no speech) is detected in the audio, the connection is closed.
voice.config.watsonstt.instanceUrl.label = Instance Url voice.config.watsonstt.instanceUrl.label = Instance Url
voice.config.watsonstt.instanceUrl.description = Url for Speech-to-Text instance created on IBM Cloud. voice.config.watsonstt.instanceUrl.description = Url for Speech-to-Text instance created on IBM Cloud.
voice.config.watsonstt.maxSilenceSeconds.label = Max Silence Seconds
voice.config.watsonstt.maxSilenceSeconds.description = The time in seconds after which, if only silence (no speech) is detected in the audio, the connection is closed.
voice.config.watsonstt.noResultsMessage.label = No Results Message voice.config.watsonstt.noResultsMessage.label = No Results Message
voice.config.watsonstt.noResultsMessage.description = Message to be told when no transcription is done. voice.config.watsonstt.noResultsMessage.description = Message to be told when no transcription is done.
voice.config.watsonstt.optOutLogging.label = Opt Out Logging voice.config.watsonstt.optOutLogging.label = Opt Out Logging
voice.config.watsonstt.optOutLogging.description = By default, all IBM Watson™ services log requests and their results. Logging is done only to improve the services for future users. The logged data is not shared or made public. voice.config.watsonstt.optOutLogging.description = By default, all IBM Watson™ services log requests and their results. Logging is done only to improve the services for future users. The logged data is not shared or made public.
voice.config.watsonstt.redaction.label = Redaction voice.config.watsonstt.redaction.label = Redaction
voice.config.watsonstt.redaction.description = If true, the service redacts, or masks, numeric data from final transcripts. (Not available for all locales) voice.config.watsonstt.redaction.description = If true, the service redacts, or masks, numeric data from final transcripts. (Not available for all locales)
voice.config.watsonstt.singleUtteranceMode.label = Single Utterance Mode
voice.config.watsonstt.singleUtteranceMode.description = When enabled recognition stops listening after a single utterance.
voice.config.watsonstt.smartFormatting.label = Smart Formatting voice.config.watsonstt.smartFormatting.label = Smart Formatting
voice.config.watsonstt.smartFormatting.description = If true, the service converts dates, times, series of digits and numbers, phone numbers, currency values, and internet addresses into more readable. (Not available for all locales) voice.config.watsonstt.smartFormatting.description = If true, the service converts dates, times, series of digits and numbers, phone numbers, currency values, and internet addresses into more readable. (Not available for all locales)
voice.config.watsonstt.speechDetectorSensitivity.label = Speech Detector Sensitivity voice.config.watsonstt.speechDetectorSensitivity.label = Speech Detector Sensitivity