STT service improvements (#12453)

* [googlestt|voskstt] change default maxSilenceSeconds to 3
* [watsonstt] add singleUtterance mode, rename inactivityTimeout to maxSilenceSeconds and minor improvements
* [watsonstt] trim transcription

Signed-off-by: Miguel Álvarez Díez <miguelwork92@gmail.com>
This commit is contained in:
GiviMAD 2022-03-12 23:06:51 +01:00 committed by GitHub
parent ce6ef25ac3
commit 480cddbf2c
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
10 changed files with 51 additions and 47 deletions

View File

@ -49,7 +49,7 @@ public class GoogleSTTConfiguration {
* Only works when singleUtteranceMode is disabled, max seconds without getting new transcriptions to stop * Only works when singleUtteranceMode is disabled, max seconds without getting new transcriptions to stop
* listening. * listening.
*/ */
public int maxSilenceSeconds = 5; public int maxSilenceSeconds = 3;
/** /**
* Single phrase mode. * Single phrase mode.
*/ */

View File

@ -46,7 +46,7 @@
<label>Max Silence Seconds</label> <label>Max Silence Seconds</label>
<description>Only works when singleUtteranceMode is disabled, max seconds without getting new transcriptions to stop <description>Only works when singleUtteranceMode is disabled, max seconds without getting new transcriptions to stop
listening.</description> listening.</description>
<default>5</default> <default>3</default>
</parameter> </parameter>
<parameter name="refreshSupportedLocales" type="boolean" groupName="stt"> <parameter name="refreshSupportedLocales" type="boolean" groupName="stt">
<label>Refresh Supported Locales</label> <label>Refresh Supported Locales</label>

View File

@ -33,7 +33,7 @@ public class VoskSTTConfiguration {
* Only works when singleUtteranceMode is disabled, max seconds without getting new transcriptions to stop * Only works when singleUtteranceMode is disabled, max seconds without getting new transcriptions to stop
* listening. * listening.
*/ */
public int maxSilenceSeconds = 5; public int maxSilenceSeconds = 3;
/** /**
* Message to be told when no results. * Message to be told when no results.
*/ */

View File

@ -27,7 +27,7 @@
<label>Max Silence Seconds</label> <label>Max Silence Seconds</label>
<description>Only works when singleUtteranceMode is disabled, max seconds without getting new transcriptions to stop <description>Only works when singleUtteranceMode is disabled, max seconds without getting new transcriptions to stop
listening.</description> listening.</description>
<default>5</default> <default>3</default>
</parameter> </parameter>
<parameter name="preloadModel" type="boolean" groupName="stt"> <parameter name="preloadModel" type="boolean" groupName="stt">
<label>Preload Model</label> <label>Preload Model</label>

View File

@ -26,7 +26,8 @@ Use your favorite configuration UI to edit **Settings / Other Services - IBM Wat
* **Background Audio Suppression** - Use the parameter to suppress side conversations or background noise. * **Background Audio Suppression** - Use the parameter to suppress side conversations or background noise.
* **Speech Detector Sensitivity** - Use the parameter to suppress word insertions from music, coughing, and other non-speech events. * **Speech Detector Sensitivity** - Use the parameter to suppress word insertions from music, coughing, and other non-speech events.
* **Inactivity Timeout** - The time in seconds after which, if only silence (no speech) is detected in the audio, the connection is closed. * **Single Utterance Mode** - When enabled recognition stops listening after a single utterance.
* **Max Silence Seconds** - The time in seconds after which, if only silence (no speech) is detected in the audio, the connection is closed.
* **Opt Out Logging** - By default, all IBM Watson™ services log requests and their results. Logging is done only to improve the services for future users. The logged data is not shared or made public. * **Opt Out Logging** - By default, all IBM Watson™ services log requests and their results. Logging is done only to improve the services for future users. The logged data is not shared or made public.
* **No Results Message** - Message to be told when no results. * **No Results Message** - Message to be told when no results.
* **Smart Formatting** - If true, the service converts dates, times, series of digits and numbers, phone numbers, currency values, and internet addresses into more readable. (Not available for all locales) * **Smart Formatting** - If true, the service converts dates, times, series of digits and numbers, phone numbers, currency values, and internet addresses into more readable. (Not available for all locales)
@ -43,7 +44,8 @@ org.openhab.voice.watsonstt:apiKey=******
org.openhab.voice.watsonstt:instanceUrl=https://api.***.speech-to-text.watson.cloud.ibm.com/instances/***** org.openhab.voice.watsonstt:instanceUrl=https://api.***.speech-to-text.watson.cloud.ibm.com/instances/*****
org.openhab.voice.watsonstt:backgroundAudioSuppression=0.5 org.openhab.voice.watsonstt:backgroundAudioSuppression=0.5
org.openhab.voice.watsonstt:speechDetectorSensitivity=0.5 org.openhab.voice.watsonstt:speechDetectorSensitivity=0.5
org.openhab.voice.watsonstt:inactivityTimeout=2 org.openhab.voice.watsonstt:singleUtteranceMode=true
org.openhab.voice.watsonstt:maxSilenceSeconds=2
org.openhab.voice.watsonstt:optOutLogging=false org.openhab.voice.watsonstt:optOutLogging=false
org.openhab.voice.watsonstt:smartFormatting=false org.openhab.voice.watsonstt:smartFormatting=false
org.openhab.voice.watsonstt:redaction=false org.openhab.voice.watsonstt:redaction=false

View File

@ -27,7 +27,7 @@
<dependency> <dependency>
<groupId>com.ibm.cloud</groupId> <groupId>com.ibm.cloud</groupId>
<artifactId>sdk-core</artifactId> <artifactId>sdk-core</artifactId>
<version>9.15.0</version> <version>9.15.4</version>
<scope>compile</scope> <scope>compile</scope>
</dependency> </dependency>
<dependency> <dependency>
@ -39,13 +39,13 @@
<dependency> <dependency>
<groupId>com.squareup.okhttp3</groupId> <groupId>com.squareup.okhttp3</groupId>
<artifactId>okhttp</artifactId> <artifactId>okhttp</artifactId>
<version>4.9.1</version> <version>4.9.3</version>
<scope>compile</scope> <scope>compile</scope>
</dependency> </dependency>
<dependency> <dependency>
<groupId>com.squareup.okhttp3</groupId> <groupId>com.squareup.okhttp3</groupId>
<artifactId>okhttp-urlconnection</artifactId> <artifactId>okhttp-urlconnection</artifactId>
<version>4.9.1</version> <version>4.9.3</version>
<scope>compile</scope> <scope>compile</scope>
</dependency> </dependency>
<dependency> <dependency>

View File

@ -48,9 +48,13 @@ public class WatsonSTTConfiguration {
*/ */
public boolean redaction = false; public boolean redaction = false;
/** /**
* The time in seconds after which, if only silence (no speech) is detected in the audio, the connection is closed. * Single phrase mode.
*/ */
public int inactivityTimeout = 3; public boolean singleUtteranceMode = true;
/**
* max seconds without getting new transcriptions to stop listening.
*/
public int maxSilenceSeconds = 3;
/** /**
* Message to be told when no results * Message to be told when no results
*/ */

View File

@ -23,8 +23,6 @@ import java.util.concurrent.atomic.AtomicBoolean;
import java.util.concurrent.atomic.AtomicReference; import java.util.concurrent.atomic.AtomicReference;
import java.util.stream.Collectors; import java.util.stream.Collectors;
import javax.net.ssl.SSLPeerUnverifiedException;
import org.eclipse.jdt.annotation.NonNullByDefault; import org.eclipse.jdt.annotation.NonNullByDefault;
import org.eclipse.jdt.annotation.Nullable; import org.eclipse.jdt.annotation.Nullable;
import org.openhab.core.audio.AudioFormat; import org.openhab.core.audio.AudioFormat;
@ -47,6 +45,7 @@ import org.osgi.service.component.annotations.Modified;
import org.slf4j.Logger; import org.slf4j.Logger;
import org.slf4j.LoggerFactory; import org.slf4j.LoggerFactory;
import com.google.gson.JsonObject;
import com.ibm.cloud.sdk.core.http.HttpMediaType; import com.ibm.cloud.sdk.core.http.HttpMediaType;
import com.ibm.cloud.sdk.core.security.IamAuthenticator; import com.ibm.cloud.sdk.core.security.IamAuthenticator;
import com.ibm.watson.speech_to_text.v1.SpeechToText; import com.ibm.watson.speech_to_text.v1.SpeechToText;
@ -130,31 +129,13 @@ public class WatsonSTTService implements STTService {
.contentType(contentType).redaction(config.redaction).smartFormatting(config.smartFormatting) .contentType(contentType).redaction(config.redaction).smartFormatting(config.smartFormatting)
.model(locale.toLanguageTag() + "_BroadbandModel").interimResults(true) .model(locale.toLanguageTag() + "_BroadbandModel").interimResults(true)
.backgroundAudioSuppression(config.backgroundAudioSuppression) .backgroundAudioSuppression(config.backgroundAudioSuppression)
.speechDetectorSensitivity(config.speechDetectorSensitivity).inactivityTimeout(config.inactivityTimeout) .speechDetectorSensitivity(config.speechDetectorSensitivity).inactivityTimeout(config.maxSilenceSeconds)
.build(); .build();
final AtomicReference<@Nullable WebSocket> socketRef = new AtomicReference<>(); final AtomicReference<@Nullable WebSocket> socketRef = new AtomicReference<>();
final AtomicBoolean aborted = new AtomicBoolean(false); final AtomicBoolean aborted = new AtomicBoolean(false);
executor.submit(() -> { executor.submit(() -> {
int retries = 2; socketRef.set(speechToText.recognizeUsingWebSocket(wsOptions,
while (retries > 0) { new TranscriptionListener(socketRef, sttListener, config, aborted)));
try {
socketRef.set(speechToText.recognizeUsingWebSocket(wsOptions,
new TranscriptionListener(sttListener, config, aborted)));
break;
} catch (RuntimeException e) {
var cause = e.getCause();
if (cause instanceof SSLPeerUnverifiedException) {
logger.debug("Retrying on error: {}", cause.getMessage());
retries--;
} else {
var errorMessage = e.getMessage();
logger.warn("Aborting on error: {}", errorMessage);
sttListener.sttEventReceived(
new SpeechRecognitionErrorEvent(errorMessage != null ? errorMessage : "Unknown error"));
break;
}
}
}
}); });
return new STTServiceHandle() { return new STTServiceHandle() {
@Override @Override
@ -162,12 +143,7 @@ public class WatsonSTTService implements STTService {
if (!aborted.getAndSet(true)) { if (!aborted.getAndSet(true)) {
var socket = socketRef.get(); var socket = socketRef.get();
if (socket != null) { if (socket != null) {
socket.close(1000, null); sendStopMessage(socket);
socket.cancel();
try {
Thread.sleep(100);
} catch (InterruptedException ignored) {
}
} }
} }
} }
@ -224,17 +200,26 @@ public class WatsonSTTService implements STTService {
return null; return null;
} }
private static void sendStopMessage(WebSocket ws) {
JsonObject stopMessage = new JsonObject();
stopMessage.addProperty("action", "stop");
ws.send(stopMessage.toString());
}
private static class TranscriptionListener implements RecognizeCallback { private static class TranscriptionListener implements RecognizeCallback {
private final Logger logger = LoggerFactory.getLogger(TranscriptionListener.class); private final Logger logger = LoggerFactory.getLogger(TranscriptionListener.class);
private final StringBuilder transcriptBuilder = new StringBuilder(); private final StringBuilder transcriptBuilder = new StringBuilder();
private final STTListener sttListener; private final STTListener sttListener;
private final WatsonSTTConfiguration config; private final WatsonSTTConfiguration config;
private final AtomicBoolean aborted; private final AtomicBoolean aborted;
private final AtomicReference<@Nullable WebSocket> socketRef;
private float confidenceSum = 0f; private float confidenceSum = 0f;
private int responseCount = 0; private int responseCount = 0;
private boolean disconnected = false; private boolean disconnected = false;
public TranscriptionListener(STTListener sttListener, WatsonSTTConfiguration config, AtomicBoolean aborted) { public TranscriptionListener(AtomicReference<@Nullable WebSocket> socketRef, STTListener sttListener,
WatsonSTTConfiguration config, AtomicBoolean aborted) {
this.socketRef = socketRef;
this.sttListener = sttListener; this.sttListener = sttListener;
this.config = config; this.config = config;
this.aborted = aborted; this.aborted = aborted;
@ -256,6 +241,12 @@ public class WatsonSTTService implements STTService {
transcriptBuilder.append(alternative.getTranscript()); transcriptBuilder.append(alternative.getTranscript());
confidenceSum += confidence != null ? confidence.floatValue() : 0f; confidenceSum += confidence != null ? confidence.floatValue() : 0f;
responseCount++; responseCount++;
if (config.singleUtteranceMode) {
var socket = socketRef.get();
if (socket != null) {
sendStopMessage(socket);
}
}
}); });
} }
@ -272,7 +263,7 @@ public class WatsonSTTService implements STTService {
return; return;
} }
logger.warn("TranscriptionError: {}", errorMessage); logger.warn("TranscriptionError: {}", errorMessage);
if (!aborted.get()) { if (!aborted.getAndSet(true)) {
sttListener.sttEventReceived( sttListener.sttEventReceived(
new SpeechRecognitionErrorEvent(errorMessage != null ? errorMessage : "Unknown error")); new SpeechRecognitionErrorEvent(errorMessage != null ? errorMessage : "Unknown error"));
} }
@ -285,7 +276,7 @@ public class WatsonSTTService implements STTService {
if (!aborted.getAndSet(true)) { if (!aborted.getAndSet(true)) {
sttListener.sttEventReceived(new RecognitionStopEvent()); sttListener.sttEventReceived(new RecognitionStopEvent());
float averageConfidence = confidenceSum / (float) responseCount; float averageConfidence = confidenceSum / (float) responseCount;
String transcript = transcriptBuilder.toString(); String transcript = transcriptBuilder.toString().trim();
if (!transcript.isBlank()) { if (!transcript.isBlank()) {
sttListener.sttEventReceived(new SpeechRecognitionEvent(transcript, averageConfidence)); sttListener.sttEventReceived(new SpeechRecognitionEvent(transcript, averageConfidence));
} else { } else {

View File

@ -32,8 +32,8 @@
<description>Use the parameter to suppress word insertions from music, coughing, and other non-speech events.</description> <description>Use the parameter to suppress word insertions from music, coughing, and other non-speech events.</description>
<default>0.5</default> <default>0.5</default>
</parameter> </parameter>
<parameter name="inactivityTimeout" type="integer" unit="s" groupName="stt"> <parameter name="maxSilenceSeconds" type="integer" unit="s" groupName="stt">
<label>Inactivity Timeout</label> <label>Max Silence Seconds</label>
<description>The time in seconds after which, if only silence (no speech) is detected in the audio, the connection is <description>The time in seconds after which, if only silence (no speech) is detected in the audio, the connection is
closed.</description> closed.</description>
<default>3</default> <default>3</default>
@ -43,6 +43,11 @@
<description>Message to be told when no transcription is done.</description> <description>Message to be told when no transcription is done.</description>
<default>No results</default> <default>No results</default>
</parameter> </parameter>
<parameter name="singleUtteranceMode" type="boolean" groupName="stt">
<label>Single Utterance Mode</label>
<description>When enabled recognition stops listening after a single utterance.</description>
<default>true</default>
</parameter>
<parameter name="optOutLogging" type="boolean" groupName="stt"> <parameter name="optOutLogging" type="boolean" groupName="stt">
<label>Opt Out Logging</label> <label>Opt Out Logging</label>
<description>By default, all IBM Watson™ services log requests and their results. Logging is done only to improve the <description>By default, all IBM Watson™ services log requests and their results. Logging is done only to improve the

View File

@ -6,16 +6,18 @@ voice.config.watsonstt.group.authentication.label = Authentication
voice.config.watsonstt.group.authentication.description = Information for connection to your Watson Speech-to-Text instance. voice.config.watsonstt.group.authentication.description = Information for connection to your Watson Speech-to-Text instance.
voice.config.watsonstt.group.stt.label = STT Configuration voice.config.watsonstt.group.stt.label = STT Configuration
voice.config.watsonstt.group.stt.description = Parameters for Watson Speech-to-Text API. voice.config.watsonstt.group.stt.description = Parameters for Watson Speech-to-Text API.
voice.config.watsonstt.inactivityTimeout.label = Inactivity Timeout
voice.config.watsonstt.inactivityTimeout.description = The time in seconds after which, if only silence (no speech) is detected in the audio, the connection is closed.
voice.config.watsonstt.instanceUrl.label = Instance Url voice.config.watsonstt.instanceUrl.label = Instance Url
voice.config.watsonstt.instanceUrl.description = Url for Speech-to-Text instance created on IBM Cloud. voice.config.watsonstt.instanceUrl.description = Url for Speech-to-Text instance created on IBM Cloud.
voice.config.watsonstt.maxSilenceSeconds.label = Max Silence Seconds
voice.config.watsonstt.maxSilenceSeconds.description = The time in seconds after which, if only silence (no speech) is detected in the audio, the connection is closed.
voice.config.watsonstt.noResultsMessage.label = No Results Message voice.config.watsonstt.noResultsMessage.label = No Results Message
voice.config.watsonstt.noResultsMessage.description = Message to be told when no transcription is done. voice.config.watsonstt.noResultsMessage.description = Message to be told when no transcription is done.
voice.config.watsonstt.optOutLogging.label = Opt Out Logging voice.config.watsonstt.optOutLogging.label = Opt Out Logging
voice.config.watsonstt.optOutLogging.description = By default, all IBM Watson™ services log requests and their results. Logging is done only to improve the services for future users. The logged data is not shared or made public. voice.config.watsonstt.optOutLogging.description = By default, all IBM Watson™ services log requests and their results. Logging is done only to improve the services for future users. The logged data is not shared or made public.
voice.config.watsonstt.redaction.label = Redaction voice.config.watsonstt.redaction.label = Redaction
voice.config.watsonstt.redaction.description = If true, the service redacts, or masks, numeric data from final transcripts. (Not available for all locales) voice.config.watsonstt.redaction.description = If true, the service redacts, or masks, numeric data from final transcripts. (Not available for all locales)
voice.config.watsonstt.singleUtteranceMode.label = Single Utterance Mode
voice.config.watsonstt.singleUtteranceMode.description = When enabled recognition stops listening after a single utterance.
voice.config.watsonstt.smartFormatting.label = Smart Formatting voice.config.watsonstt.smartFormatting.label = Smart Formatting
voice.config.watsonstt.smartFormatting.description = If true, the service converts dates, times, series of digits and numbers, phone numbers, currency values, and internet addresses into more readable. (Not available for all locales) voice.config.watsonstt.smartFormatting.description = If true, the service converts dates, times, series of digits and numbers, phone numbers, currency values, and internet addresses into more readable. (Not available for all locales)
voice.config.watsonstt.speechDetectorSensitivity.label = Speech Detector Sensitivity voice.config.watsonstt.speechDetectorSensitivity.label = Speech Detector Sensitivity