Mirror of https://github.com/openhab/openhab-addons.git
STT service improvements (#12453)
* [googlestt|voskstt] change default maxSilenceSeconds to 3
* [watsonstt] add singleUtterance mode, rename inactivityTimeout to maxSilenceSeconds and minor improvements
* [watsonstt] trim transcription

Signed-off-by: Miguel Álvarez Díez <miguelwork92@gmail.com>
This commit is contained in:
Parent: ce6ef25ac3
Commit: 480cddbf2c
@@ -49,7 +49,7 @@ public class GoogleSTTConfiguration {
     * Only works when singleUtteranceMode is disabled, max seconds without getting new transcriptions to stop
     * listening.
     */
-    public int maxSilenceSeconds = 5;
+    public int maxSilenceSeconds = 3;
    /**
     * Single phrase mode.
     */

@@ -46,7 +46,7 @@
			<label>Max Silence Seconds</label>
			<description>Only works when singleUtteranceMode is disabled, max seconds without getting new transcriptions to stop
				listening.</description>
-			<default>5</default>
+			<default>3</default>
		</parameter>
		<parameter name="refreshSupportedLocales" type="boolean" groupName="stt">
			<label>Refresh Supported Locales</label>

@@ -33,7 +33,7 @@ public class VoskSTTConfiguration {
     * Only works when singleUtteranceMode is disabled, max seconds without getting new transcriptions to stop
     * listening.
     */
-    public int maxSilenceSeconds = 5;
+    public int maxSilenceSeconds = 3;
    /**
     * Message to be told when no results.
     */

@@ -27,7 +27,7 @@
			<label>Max Silence Seconds</label>
			<description>Only works when singleUtteranceMode is disabled, max seconds without getting new transcriptions to stop
				listening.</description>
-			<default>5</default>
+			<default>3</default>
		</parameter>
		<parameter name="preloadModel" type="boolean" groupName="stt">
			<label>Preload Model</label>

@@ -26,7 +26,8 @@ Use your favorite configuration UI to edit **Settings / Other Services - IBM Wat

 * **Background Audio Suppression** - Use the parameter to suppress side conversations or background noise.
 * **Speech Detector Sensitivity** - Use the parameter to suppress word insertions from music, coughing, and other non-speech events.
-* **Inactivity Timeout** - The time in seconds after which, if only silence (no speech) is detected in the audio, the connection is closed.
+* **Single Utterance Mode** - When enabled recognition stops listening after a single utterance.
+* **Max Silence Seconds** - The time in seconds after which, if only silence (no speech) is detected in the audio, the connection is closed.
 * **Opt Out Logging** - By default, all IBM Watson™ services log requests and their results. Logging is done only to improve the services for future users. The logged data is not shared or made public.
 * **No Results Message** - Message to be told when no results.
 * **Smart Formatting** - If true, the service converts dates, times, series of digits and numbers, phone numbers, currency values, and internet addresses into more readable. (Not available for all locales)

@@ -43,7 +44,8 @@ org.openhab.voice.watsonstt:apiKey=******
 org.openhab.voice.watsonstt:instanceUrl=https://api.***.speech-to-text.watson.cloud.ibm.com/instances/*****
 org.openhab.voice.watsonstt:backgroundAudioSuppression=0.5
 org.openhab.voice.watsonstt:speechDetectorSensitivity=0.5
-org.openhab.voice.watsonstt:inactivityTimeout=2
+org.openhab.voice.watsonstt:singleUtteranceMode=true
+org.openhab.voice.watsonstt:maxSilenceSeconds=2
 org.openhab.voice.watsonstt:optOutLogging=false
 org.openhab.voice.watsonstt:smartFormatting=false
 org.openhab.voice.watsonstt:redaction=false

@@ -27,7 +27,7 @@
		<dependency>
			<groupId>com.ibm.cloud</groupId>
			<artifactId>sdk-core</artifactId>
-			<version>9.15.0</version>
+			<version>9.15.4</version>
			<scope>compile</scope>
		</dependency>
		<dependency>
@@ -39,13 +39,13 @@
		<dependency>
			<groupId>com.squareup.okhttp3</groupId>
			<artifactId>okhttp</artifactId>
-			<version>4.9.1</version>
+			<version>4.9.3</version>
			<scope>compile</scope>
		</dependency>
		<dependency>
			<groupId>com.squareup.okhttp3</groupId>
			<artifactId>okhttp-urlconnection</artifactId>
-			<version>4.9.1</version>
+			<version>4.9.3</version>
			<scope>compile</scope>
		</dependency>
		<dependency>

@@ -48,9 +48,13 @@ public class WatsonSTTConfiguration {
     */
    public boolean redaction = false;
    /**
-     * The time in seconds after which, if only silence (no speech) is detected in the audio, the connection is closed.
+     * Single phrase mode.
     */
-    public int inactivityTimeout = 3;
+    public boolean singleUtteranceMode = true;
+    /**
+     * max seconds without getting new transcriptions to stop listening.
+     */
+    public int maxSilenceSeconds = 3;
    /**
     * Message to be told when no results
     */
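For orientation, here is a minimal sketch of how the two settings introduced above are meant to interact across the STT services: in single utterance mode recognition stops after the first transcription, otherwise it keeps listening until maxSilenceSeconds pass without a new transcription. The field names mirror this commit, but the SilenceStopPolicy class, its shouldStop method and its inputs are hypothetical stand-ins, not part of the openHAB or Watson APIs:

// Hypothetical illustration only; not add-on code.
public class SilenceStopPolicy {

    private final boolean singleUtteranceMode; // mirrors the singleUtteranceMode option
    private final int maxSilenceSeconds;       // mirrors the maxSilenceSeconds option

    public SilenceStopPolicy(boolean singleUtteranceMode, int maxSilenceSeconds) {
        this.singleUtteranceMode = singleUtteranceMode;
        this.maxSilenceSeconds = maxSilenceSeconds;
    }

    /**
     * @param gotTranscription true once a first transcription has been received
     * @param secondsWithoutTranscription seconds elapsed without any new transcription
     */
    public boolean shouldStop(boolean gotTranscription, int secondsWithoutTranscription) {
        if (singleUtteranceMode) {
            // Single utterance mode: stop right after the first transcription arrives.
            return gotTranscription;
        }
        // Otherwise keep listening until the configured silence window elapses.
        return secondsWithoutTranscription >= maxSilenceSeconds;
    }

    public static void main(String[] args) {
        SilenceStopPolicy single = new SilenceStopPolicy(true, 3);
        System.out.println(single.shouldStop(true, 0));  // true: stop after the first utterance
        SilenceStopPolicy continuous = new SilenceStopPolicy(false, 3);
        System.out.println(continuous.shouldStop(true, 1)); // false: still inside the silence window
    }
}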
|
@@ -23,8 +23,6 @@ import java.util.concurrent.atomic.AtomicBoolean;
 import java.util.concurrent.atomic.AtomicReference;
 import java.util.stream.Collectors;

-import javax.net.ssl.SSLPeerUnverifiedException;
-
 import org.eclipse.jdt.annotation.NonNullByDefault;
 import org.eclipse.jdt.annotation.Nullable;
 import org.openhab.core.audio.AudioFormat;
@@ -47,6 +45,7 @@ import org.osgi.service.component.annotations.Modified;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;

+import com.google.gson.JsonObject;
 import com.ibm.cloud.sdk.core.http.HttpMediaType;
 import com.ibm.cloud.sdk.core.security.IamAuthenticator;
 import com.ibm.watson.speech_to_text.v1.SpeechToText;
@@ -130,31 +129,13 @@ public class WatsonSTTService implements STTService {
                .contentType(contentType).redaction(config.redaction).smartFormatting(config.smartFormatting)
                .model(locale.toLanguageTag() + "_BroadbandModel").interimResults(true)
                .backgroundAudioSuppression(config.backgroundAudioSuppression)
-                .speechDetectorSensitivity(config.speechDetectorSensitivity).inactivityTimeout(config.inactivityTimeout)
+                .speechDetectorSensitivity(config.speechDetectorSensitivity).inactivityTimeout(config.maxSilenceSeconds)
                .build();
        final AtomicReference<@Nullable WebSocket> socketRef = new AtomicReference<>();
        final AtomicBoolean aborted = new AtomicBoolean(false);
        executor.submit(() -> {
-            int retries = 2;
-            while (retries > 0) {
-                try {
-                    socketRef.set(speechToText.recognizeUsingWebSocket(wsOptions,
-                            new TranscriptionListener(sttListener, config, aborted)));
-                    break;
-                } catch (RuntimeException e) {
-                    var cause = e.getCause();
-                    if (cause instanceof SSLPeerUnverifiedException) {
-                        logger.debug("Retrying on error: {}", cause.getMessage());
-                        retries--;
-                    } else {
-                        var errorMessage = e.getMessage();
-                        logger.warn("Aborting on error: {}", errorMessage);
-                        sttListener.sttEventReceived(
-                                new SpeechRecognitionErrorEvent(errorMessage != null ? errorMessage : "Unknown error"));
-                        break;
-                    }
-                }
-            }
+            socketRef.set(speechToText.recognizeUsingWebSocket(wsOptions,
+                    new TranscriptionListener(socketRef, sttListener, config, aborted)));
        });
        return new STTServiceHandle() {
            @Override
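Worth noting: the retry loop that caught SSLPeerUnverifiedException around the WebSocket setup is dropped entirely (along with its import), so the recognize call is now made once and any failure is presumably left to the listener's onError path shown further down, whose event wiring this commit also tightens.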
@@ -162,12 +143,7 @@ public class WatsonSTTService implements STTService {
                if (!aborted.getAndSet(true)) {
                    var socket = socketRef.get();
                    if (socket != null) {
-                        socket.close(1000, null);
-                        socket.cancel();
-                        try {
-                            Thread.sleep(100);
-                        } catch (InterruptedException ignored) {
-                        }
+                        sendStopMessage(socket);
                    }
                }
            }
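Instead of closing and cancelling the socket and then sleeping briefly, the abort path now asks the service to stop via the new sendStopMessage helper (defined in the next hunk), which should let the session wind down cleanly rather than having the connection cut mid-stream.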
@@ -224,17 +200,26 @@ public class WatsonSTTService implements STTService {
        return null;
    }

+    private static void sendStopMessage(WebSocket ws) {
+        JsonObject stopMessage = new JsonObject();
+        stopMessage.addProperty("action", "stop");
+        ws.send(stopMessage.toString());
+    }
+
    private static class TranscriptionListener implements RecognizeCallback {
        private final Logger logger = LoggerFactory.getLogger(TranscriptionListener.class);
        private final StringBuilder transcriptBuilder = new StringBuilder();
        private final STTListener sttListener;
        private final WatsonSTTConfiguration config;
        private final AtomicBoolean aborted;
+        private final AtomicReference<@Nullable WebSocket> socketRef;
        private float confidenceSum = 0f;
        private int responseCount = 0;
        private boolean disconnected = false;

-        public TranscriptionListener(STTListener sttListener, WatsonSTTConfiguration config, AtomicBoolean aborted) {
+        public TranscriptionListener(AtomicReference<@Nullable WebSocket> socketRef, STTListener sttListener,
+                WatsonSTTConfiguration config, AtomicBoolean aborted) {
+            this.socketRef = socketRef;
            this.sttListener = sttListener;
            this.config = config;
            this.aborted = aborted;
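Passing socketRef into TranscriptionListener is what lets the listener itself send the {"action": "stop"} frame once a transcription arrives in single-utterance mode, as the next hunk shows; in Watson's WebSocket recognition protocol that stop action signals the end of the audio submission so final results can still be delivered.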
@@ -256,6 +241,12 @@ public class WatsonSTTService implements STTService {
                transcriptBuilder.append(alternative.getTranscript());
                confidenceSum += confidence != null ? confidence.floatValue() : 0f;
                responseCount++;
+                if (config.singleUtteranceMode) {
+                    var socket = socketRef.get();
+                    if (socket != null) {
+                        sendStopMessage(socket);
+                    }
+                }
            });
        }

@@ -272,7 +263,7 @@ public class WatsonSTTService implements STTService {
                return;
            }
            logger.warn("TranscriptionError: {}", errorMessage);
-            if (!aborted.get()) {
+            if (!aborted.getAndSet(true)) {
                sttListener.sttEventReceived(
                        new SpeechRecognitionErrorEvent(errorMessage != null ? errorMessage : "Unknown error"));
            }
@@ -285,7 +276,7 @@ public class WatsonSTTService implements STTService {
            if (!aborted.getAndSet(true)) {
                sttListener.sttEventReceived(new RecognitionStopEvent());
                float averageConfidence = confidenceSum / (float) responseCount;
-                String transcript = transcriptBuilder.toString();
+                String transcript = transcriptBuilder.toString().trim();
                if (!transcript.isBlank()) {
                    sttListener.sttEventReceived(new SpeechRecognitionEvent(transcript, averageConfidence));
                } else {

@@ -32,8 +32,8 @@
			<description>Use the parameter to suppress word insertions from music, coughing, and other non-speech events.</description>
			<default>0.5</default>
		</parameter>
-		<parameter name="inactivityTimeout" type="integer" unit="s" groupName="stt">
-			<label>Inactivity Timeout</label>
+		<parameter name="maxSilenceSeconds" type="integer" unit="s" groupName="stt">
+			<label>Max Silence Seconds</label>
			<description>The time in seconds after which, if only silence (no speech) is detected in the audio, the connection is
				closed.</description>
			<default>3</default>
@@ -43,6 +43,11 @@
			<description>Message to be told when no transcription is done.</description>
			<default>No results</default>
		</parameter>
+		<parameter name="singleUtteranceMode" type="boolean" groupName="stt">
+			<label>Single Utterance Mode</label>
+			<description>When enabled recognition stops listening after a single utterance.</description>
+			<default>true</default>
+		</parameter>
		<parameter name="optOutLogging" type="boolean" groupName="stt">
			<label>Opt Out Logging</label>
			<description>By default, all IBM Watson™ services log requests and their results. Logging is done only to improve the

@@ -6,16 +6,18 @@ voice.config.watsonstt.group.authentication.label = Authentication
 voice.config.watsonstt.group.authentication.description = Information for connection to your Watson Speech-to-Text instance.
 voice.config.watsonstt.group.stt.label = STT Configuration
 voice.config.watsonstt.group.stt.description = Parameters for Watson Speech-to-Text API.
-voice.config.watsonstt.inactivityTimeout.label = Inactivity Timeout
-voice.config.watsonstt.inactivityTimeout.description = The time in seconds after which, if only silence (no speech) is detected in the audio, the connection is closed.
 voice.config.watsonstt.instanceUrl.label = Instance Url
 voice.config.watsonstt.instanceUrl.description = Url for Speech-to-Text instance created on IBM Cloud.
+voice.config.watsonstt.maxSilenceSeconds.label = Max Silence Seconds
+voice.config.watsonstt.maxSilenceSeconds.description = The time in seconds after which, if only silence (no speech) is detected in the audio, the connection is closed.
 voice.config.watsonstt.noResultsMessage.label = No Results Message
 voice.config.watsonstt.noResultsMessage.description = Message to be told when no transcription is done.
 voice.config.watsonstt.optOutLogging.label = Opt Out Logging
 voice.config.watsonstt.optOutLogging.description = By default, all IBM Watson™ services log requests and their results. Logging is done only to improve the services for future users. The logged data is not shared or made public.
 voice.config.watsonstt.redaction.label = Redaction
 voice.config.watsonstt.redaction.description = If true, the service redacts, or masks, numeric data from final transcripts. (Not available for all locales)
+voice.config.watsonstt.singleUtteranceMode.label = Single Utterance Mode
+voice.config.watsonstt.singleUtteranceMode.description = When enabled recognition stops listening after a single utterance.
 voice.config.watsonstt.smartFormatting.label = Smart Formatting
 voice.config.watsonstt.smartFormatting.description = If true, the service converts dates, times, series of digits and numbers, phone numbers, currency values, and internet addresses into more readable. (Not available for all locales)
 voice.config.watsonstt.speechDetectorSensitivity.label = Speech Detector Sensitivity