diff --git a/CODEOWNERS b/CODEOWNERS index 50a78212abf..1e9001fe90f 100644 --- a/CODEOWNERS +++ b/CODEOWNERS @@ -383,6 +383,7 @@ /bundles/org.openhab.voice.pollytts/ @hillmanr /bundles/org.openhab.voice.porcupineks/ @GiviMAD /bundles/org.openhab.voice.voicerss/ @JochenHiller @lolodomo +/bundles/org.openhab.voice.voskstt/ @GiviMAD /bundles/org.openhab.voice.watsonstt/ @GiviMAD /itests/org.openhab.binding.astro.tests/ @gerrieg /itests/org.openhab.binding.avmfritz.tests/ @cweitkamp diff --git a/bom/openhab-addons/pom.xml b/bom/openhab-addons/pom.xml index 930b8d50a74..415aecf3611 100644 --- a/bom/openhab-addons/pom.xml +++ b/bom/openhab-addons/pom.xml @@ -1906,6 +1906,11 @@ org.openhab.voice.voicerss ${project.version} + + org.openhab.addons.bundles + org.openhab.voice.voskstt + ${project.version} + org.openhab.addons.bundles org.openhab.voice.watsonstt diff --git a/bundles/org.openhab.voice.voskstt/NOTICE b/bundles/org.openhab.voice.voskstt/NOTICE new file mode 100644 index 00000000000..4e69b70319b --- /dev/null +++ b/bundles/org.openhab.voice.voskstt/NOTICE @@ -0,0 +1,20 @@ +This content is produced and maintained by the openHAB project. + +* Project home: https://www.openhab.org + +== Declared Project Licenses + +This program and the accompanying materials are made available under the terms +of the Eclipse Public License 2.0 which is available at +https://www.eclipse.org/legal/epl-2.0/. 
+ +== Source Code + +https://github.com/openhab/openhab-addons + +== Third-party Content + +com.alphacephei: vosk +* License: Apache 2.0 License +* Project: https://github.com/alphacep/vosk-api +* Source: https://github.com/alphacep/vosk-api/tree/master/java diff --git a/bundles/org.openhab.voice.voskstt/README.md b/bundles/org.openhab.voice.voskstt/README.md new file mode 100644 index 00000000000..224d56f732b --- /dev/null +++ b/bundles/org.openhab.voice.voskstt/README.md @@ -0,0 +1,60 @@ +# Vosk Speech-to-Text + +Vosk STT Service uses [vosk-api](https://github.com/alphacep/vosk-api) to perform offline speech-to-text in openHAB. + +[Vosk](https://alphacephei.com/vosk/) is an offline open source speech recognition toolkit. +It enables speech recognition for 20+ languages and dialects - English, Indian English, German, French, Spanish, Portuguese, Chinese, Russian, Turkish, Vietnamese, Italian, Dutch, Catalan, Arabic, Greek, Farsi, Filipino, Ukrainian, Kazakh, Swedish, Japanese, Esperanto. +More to come. + +## Configuring the model + +Before you can use this service you should configure your language model. +You can download it from [here](https://alphacephei.com/vosk/models). +You should unzip the contained folder into '\<openHAB userdata\>/vosk/' and rename it to `model` for the add-on to work. + +## Configuration + +### Speech to Text Configuration + +Use your favorite configuration UI to edit **Settings / Other Services - Vosk Speech-to-Text**: + +* **Preload Model** - Keep language model loaded. +* **Single Utterance Mode** - When enabled recognition stops listening after a single utterance. +* **Max Transcription Seconds** - Max seconds to wait to force stop the transcription. +* **Max Silence Seconds** - Only works when singleUtteranceMode is disabled, max seconds without getting new transcriptions to stop listening. 
+ +### Messages Configuration + +Use your favorite configuration UI to edit **Settings / Other Services - Vosk Speech-to-Text**: + +* **No Results Message** - Message to be told when no results. +* **Error Message** - Message to be told when an error has happened. + +### Configuration via a text file + +In case you would like to setup the service via a text file, create a new file in `$OPENHAB_ROOT/conf/services` named `voskstt.cfg` + +Its contents should look similar to: + +``` +org.openhab.voice.voskstt:preloadModel=true +org.openhab.voice.voskstt:singleUtteranceMode=true +org.openhab.voice.voskstt:maxTranscriptionSeconds=60 +org.openhab.voice.voskstt:maxSilenceSeconds=5 +org.openhab.voice.voskstt:noResultsMessage="Sorry, I didn't understand you" +org.openhab.voice.voskstt:errorMessage="Sorry, something went wrong" +``` + +### Default Speech-to-Text Configuration + +You can setup your preferred default Speech-to-Text in the UI: + +* Go to **Settings**. +* Edit **System Services - Voice**. +* Set **Vosk** as **Speech-to-Text**. 
+ +In case you would like to setup these settings via a text file, you can edit the file `runtime.cfg` in `$OPENHAB_ROOT/conf/services` and set the following entries: + +``` +org.openhab.voice:defaultSTT=voskstt +``` diff --git a/bundles/org.openhab.voice.voskstt/pom.xml b/bundles/org.openhab.voice.voskstt/pom.xml new file mode 100644 index 00000000000..761328d6247 --- /dev/null +++ b/bundles/org.openhab.voice.voskstt/pom.xml @@ -0,0 +1,31 @@ + + + + 4.0.0 + + + org.openhab.addons.bundles + org.openhab.addons.reactor.bundles + 3.3.0-SNAPSHOT + + + org.openhab.voice.voskstt + + openHAB Add-ons :: Bundles :: Voice :: Vosk Speech to Text + + + com.alphacephei + vosk + 0.3.33 + compile + + + + net.java.dev.jna + jna + 5.7.0 + compile + + + diff --git a/bundles/org.openhab.voice.voskstt/src/main/feature/feature.xml b/bundles/org.openhab.voice.voskstt/src/main/feature/feature.xml new file mode 100644 index 00000000000..0f68035cdd4 --- /dev/null +++ b/bundles/org.openhab.voice.voskstt/src/main/feature/feature.xml @@ -0,0 +1,9 @@ + + + mvn:org.openhab.core.features.karaf/org.openhab.core.features.karaf.openhab-core/${ohc.version}/xml/features + + + openhab-runtime-base + mvn:org.openhab.addons.bundles/org.openhab.voice.voskstt/${project.version} + + diff --git a/bundles/org.openhab.voice.voskstt/src/main/java/org/openhab/voice/voskstt/internal/VoskSTTConfiguration.java b/bundles/org.openhab.voice.voskstt/src/main/java/org/openhab/voice/voskstt/internal/VoskSTTConfiguration.java new file mode 100644 index 00000000000..1f09cf98ddc --- /dev/null +++ b/bundles/org.openhab.voice.voskstt/src/main/java/org/openhab/voice/voskstt/internal/VoskSTTConfiguration.java @@ -0,0 +1,49 @@ +/** + * Copyright (c) 2010-2022 Contributors to the openHAB project + * + * See the NOTICE file(s) distributed with this work for additional + * information. 
+ * + * This program and the accompanying materials are made available under the + * terms of the Eclipse Public License 2.0 which is available at + * http://www.eclipse.org/legal/epl-2.0 + * + * SPDX-License-Identifier: EPL-2.0 + */ +package org.openhab.voice.voskstt.internal; + +import org.eclipse.jdt.annotation.NonNullByDefault; +
+/**
+ * Holder for the user-configurable settings of the Vosk STT service; field names match the configuration keys.
+ *
+ * @author Miguel Álvarez - Initial contribution
+ */
+@NonNullByDefault
+public class VoskSTTConfiguration {
+    /**
+     * Whether the language model is kept loaded between recognitions.
+     */
+    public boolean preloadModel = true;
+    /**
+     * Whether listening ends as soon as a single utterance has been recognized.
+     */
+    public boolean singleUtteranceMode = true;
+    /**
+     * Upper bound, in seconds, after which a running transcription is forcibly stopped.
+     */
+    public int maxTranscriptionSeconds = 60;
+    /**
+     * Seconds without a new transcription after which listening stops; only used while
+     * singleUtteranceMode is disabled.
+     */
+    public int maxSilenceSeconds = 5;
+    /**
+     * Text reported when the recognition finished without any result.
+     */
+    public String noResultsMessage = "Sorry, I didn't understand you";
+    /**
+     * Text reported when the recognition failed with an error.
+     */
+    public String errorMessage = "Sorry, something went wrong";
+}
diff --git a/bundles/org.openhab.voice.voskstt/src/main/java/org/openhab/voice/voskstt/internal/VoskSTTConstants.java b/bundles/org.openhab.voice.voskstt/src/main/java/org/openhab/voice/voskstt/internal/VoskSTTConstants.java new file mode 100644 index 00000000000..b08776a9e62 --- /dev/null +++ b/bundles/org.openhab.voice.voskstt/src/main/java/org/openhab/voice/voskstt/internal/VoskSTTConstants.java @@ -0,0 +1,42 @@ +/** + * Copyright (c) 2010-2022 Contributors to the openHAB project + * + * See the NOTICE file(s) distributed with this work for additional + * information. 
+ * + * This program and the accompanying materials are made available under the + * terms of the Eclipse Public License 2.0 which is available at + * http://www.eclipse.org/legal/epl-2.0 + * + * SPDX-License-Identifier: EPL-2.0 + */ +package org.openhab.voice.voskstt.internal; + +import org.eclipse.jdt.annotation.NonNullByDefault; +
+/**
+ * Shared compile-time constants that identify the Vosk Speech-to-Text service.
+ *
+ * @author Miguel Álvarez - Initial contribution
+ */
+@NonNullByDefault
+public class VoskSTTConstants {
+    /**
+     * Category of the service, also the middle segment of {@link #SERVICE_PID}
+     */
+    public static final String SERVICE_CATEGORY = "voice";
+    /**
+     * Identifier of the service, also the last segment of {@link #SERVICE_PID}
+     */
+    public static final String SERVICE_ID = "voskstt";
+
+    /**
+     * Human readable name of the service
+     */
+    public static final String SERVICE_NAME = "Vosk";
+
+    /**
+     * Persistent identifier of the service, built from the category and the id
+     */
+    public static final String SERVICE_PID = "org.openhab." + SERVICE_CATEGORY + "." + SERVICE_ID;
+}
diff --git a/bundles/org.openhab.voice.voskstt/src/main/java/org/openhab/voice/voskstt/internal/VoskSTTService.java b/bundles/org.openhab.voice.voskstt/src/main/java/org/openhab/voice/voskstt/internal/VoskSTTService.java new file mode 100644 index 00000000000..532ffbb22c3 --- /dev/null +++ b/bundles/org.openhab.voice.voskstt/src/main/java/org/openhab/voice/voskstt/internal/VoskSTTService.java @@ -0,0 +1,292 @@ +/** + * Copyright (c) 2010-2022 Contributors to the openHAB project + * + * See the NOTICE file(s) distributed with this work for additional + * information. 
+ * + * This program and the accompanying materials are made available under the + * terms of the Eclipse Public License 2.0 which is available at + * http://www.eclipse.org/legal/epl-2.0 + * + * SPDX-License-Identifier: EPL-2.0 + */ +package org.openhab.voice.voskstt.internal; + +import static org.openhab.voice.voskstt.internal.VoskSTTConstants.*; + +import java.io.File; +import java.io.IOException; +import java.io.InputStream; +import java.nio.file.Path; +import java.util.Locale; +import java.util.Map; +import java.util.Set; +import java.util.concurrent.Future; +import java.util.concurrent.ScheduledExecutorService; +import java.util.concurrent.atomic.AtomicBoolean; + +import org.eclipse.jdt.annotation.NonNullByDefault; +import org.eclipse.jdt.annotation.Nullable; +import org.openhab.core.OpenHAB; +import org.openhab.core.audio.AudioFormat; +import org.openhab.core.audio.AudioStream; +import org.openhab.core.common.ThreadPoolManager; +import org.openhab.core.config.core.ConfigurableService; +import org.openhab.core.config.core.Configuration; +import org.openhab.core.io.rest.LocaleService; +import org.openhab.core.voice.RecognitionStartEvent; +import org.openhab.core.voice.RecognitionStopEvent; +import org.openhab.core.voice.STTException; +import org.openhab.core.voice.STTListener; +import org.openhab.core.voice.STTService; +import org.openhab.core.voice.STTServiceHandle; +import org.openhab.core.voice.SpeechRecognitionErrorEvent; +import org.openhab.core.voice.SpeechRecognitionEvent; +import org.osgi.framework.Constants; +import org.osgi.service.component.annotations.Activate; +import org.osgi.service.component.annotations.Component; +import org.osgi.service.component.annotations.Deactivate; +import org.osgi.service.component.annotations.Modified; +import org.osgi.service.component.annotations.Reference; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; +import org.vosk.Model; +import org.vosk.Recognizer; + +import 
com.fasterxml.jackson.databind.ObjectMapper;
+
+/**
+ * The {@link VoskSTTService} class is a service implementation to use Vosk-API for Speech-to-Text.
+ * <p>
+ * The language model is read from the "vosk/model" directory below the openHAB user data folder. Depending on
+ * {@link VoskSTTConfiguration#preloadModel} the model is either kept loaded between recognitions or loaded and
+ * closed again for every single recognition.
+ *
+ * @author Miguel Álvarez - Initial contribution
+ */
+@NonNullByDefault
+@Component(configurationPid = SERVICE_PID, property = Constants.SERVICE_PID + "=" + SERVICE_PID)
+@ConfigurableService(category = SERVICE_CATEGORY, label = SERVICE_NAME
+        + " Speech-to-Text", description_uri = SERVICE_CATEGORY + ":" + SERVICE_ID)
+public class VoskSTTService implements STTService {
+    private static final String VOSK_FOLDER = Path.of(OpenHAB.getUserDataFolder(), "vosk").toString();
+    private static final String MODEL_PATH = Path.of(VOSK_FOLDER, "model").toString();
+    static {
+        // make sure the folder the user has to place the model into exists
+        Logger logger = LoggerFactory.getLogger(VoskSTTService.class);
+        File directory = new File(VOSK_FOLDER);
+        if (!directory.exists()) {
+            if (directory.mkdir()) {
+                logger.info("vosk dir created {}", VOSK_FOLDER);
+            } else {
+                logger.warn("unable to create vosk dir {}", VOSK_FOLDER);
+            }
+        }
+    }
+    private final Logger logger = LoggerFactory.getLogger(VoskSTTService.class);
+    private final ScheduledExecutorService executor = ThreadPoolManager.getScheduledPool("OH-voice-voskstt");
+    private final LocaleService localeService;
+    private VoskSTTConfiguration config = new VoskSTTConfiguration();
+    private @Nullable Model model;
+
+    /**
+     * Creates the service.
+     *
+     * @param localeService used to report the server locale as the supported locale
+     */
+    @Activate
+    public VoskSTTService(@Reference LocaleService localeService) {
+        this.localeService = localeService;
+    }
+
+    /**
+     * Applies the initial configuration.
+     *
+     * @param config the service configuration properties
+     */
+    @Activate
+    protected void activate(Map<String, Object> config) {
+        configChange(config);
+    }
+
+    /**
+     * Applies an updated configuration.
+     *
+     * @param config the service configuration properties
+     */
+    @Modified
+    protected void modified(Map<String, Object> config) {
+        configChange(config);
+    }
+
+    /**
+     * Releases the cached model, if any.
+     *
+     * @param config the service configuration properties (not used)
+     */
+    @Deactivate
+    protected void deactivate(Map<String, Object> config) {
+        try {
+            unloadModel();
+        } catch (IOException e) {
+            logger.warn("IOException unloading model: {}", e.getMessage());
+        }
+    }
+
+    /**
+     * Stores the new configuration and (re)loads or unloads the cached model according to the preloadModel option.
+     * Failures are only logged, a later recognition retries to load the model.
+     */
+    private void configChange(Map<String, Object> config) {
+        this.config = new Configuration(config).as(VoskSTTConfiguration.class);
+        if (this.config.preloadModel) {
+            try {
+                loadModel();
+            } catch (IOException e) {
+                logger.warn("IOException loading model: {}", e.getMessage());
+            }
+        } else {
+            try {
+                unloadModel();
+            } catch (IOException e) {
+                logger.warn("IOException unloading model: {}", e.getMessage());
+            }
+        }
+    }
+
+    @Override
+    public String getId() {
+        return SERVICE_ID;
+    }
+
+    @Override
+    public String getLabel(@Nullable Locale locale) {
+        return SERVICE_NAME;
+    }
+
+    @Override
+    public Set<Locale> getSupportedLocales() {
+        // as it is not possible to determine the language of the model that was downloaded and setup by the user, it is
+        // assumed the language of the model is matching the locale of the openHAB server
+        return Set.of(localeService.getLocale(null));
+    }
+
+    @Override
+    public Set<AudioFormat> getSupportedFormats() {
+        return Set.of(
+                new AudioFormat(AudioFormat.CONTAINER_WAVE, AudioFormat.CODEC_PCM_SIGNED, false, null, null, 16000L));
+    }
+
+    /**
+     * Starts a recognition in the background and returns a handle that aborts it.
+     *
+     * @throws STTException when the audio stream format does not provide a frequency
+     */
+    @Override
+    public STTServiceHandle recognize(STTListener sttListener, AudioStream audioStream, Locale locale, Set<String> set)
+            throws STTException {
+        AtomicBoolean aborted = new AtomicBoolean(false);
+        try {
+            var frequency = audioStream.getFormat().getFrequency();
+            if (frequency == null) {
+                throw new IOException("missing audio stream frequency");
+            }
+            backgroundRecognize(sttListener, audioStream, frequency, aborted);
+        } catch (IOException e) {
+            throw new STTException(e);
+        }
+        return () -> {
+            aborted.set(true);
+        };
+    }
+
+    /**
+     * Returns the cached model or loads it when none is cached.
+     *
+     * @throws IOException when the model directory is missing or the model can not be loaded
+     */
+    private Model getModel() throws IOException {
+        var model = this.model;
+        if (model != null) {
+            return model;
+        }
+        return loadModel();
+    }
+
+    /**
+     * Closes the cached model (if any) and loads the model from the model directory. The new instance is only
+     * cached when the preloadModel option is enabled.
+     *
+     * @throws IOException when the model directory is missing or the model can not be loaded
+     */
+    private Model loadModel() throws IOException {
+        unloadModel();
+        var modelFile = new File(MODEL_PATH);
+        if (!modelFile.exists() || !modelFile.isDirectory()) {
+            throw new IOException("missing model dir: " + MODEL_PATH);
+        }
+        logger.debug("loading model");
+        var model = new Model(MODEL_PATH);
+        if (config.preloadModel) {
+            this.model = model;
+        }
+        return model;
+    }
+
+    /**
+     * Closes and forgets the cached model, does nothing when no model is cached.
+     */
+    private void unloadModel() throws IOException {
+        var model = this.model;
+        if (model != null) {
+            logger.debug("unloading model");
+            model.close();
+            this.model = null;
+        }
+    }
+
+    /**
+     * Runs the transcription on the executor. The audio is read until the recognition is aborted, the stream ends,
+     * one of the configured time limits is exceeded or, in single utterance mode, the first result is available.
+     * Unless aborted, the listener receives a stop event followed by the transcript or the "no results" message;
+     * an {@link IOException} is reported to the listener with the configured error message.
+     *
+     * @param sttListener receiver of the recognition events
+     * @param audioStream audio to transcribe, always closed when the task ends
+     * @param frequency sample rate of the audio passed to the recognizer
+     * @param aborted set to true by the caller to cancel the recognition
+     * @return the future of the submitted task
+     */
+    private Future<?> backgroundRecognize(STTListener sttListener, InputStream audioStream, long frequency,
+            AtomicBoolean aborted) {
+        StringBuilder transcriptBuilder = new StringBuilder();
+        long maxTranscriptionMillis = (config.maxTranscriptionSeconds * 1000L);
+        long maxSilenceMillis = (config.maxSilenceSeconds * 1000L);
+        long startTime = System.currentTimeMillis();
+        return executor.submit(() -> {
+            Recognizer recognizer = null;
+            Model model = null;
+            try {
+                model = getModel();
+                recognizer = new Recognizer(model, frequency);
+                // one mapper is enough for all the results of this recognition
+                ObjectMapper mapper = new ObjectMapper();
+                long lastInputTime = System.currentTimeMillis();
+                int nbytes;
+                byte[] b = new byte[4096];
+                sttListener.sttEventReceived(new RecognitionStartEvent());
+                while (!aborted.get()) {
+                    nbytes = audioStream.read(b);
+                    if (aborted.get()) {
+                        break;
+                    }
+                    if (isExpiredInterval(maxTranscriptionMillis, startTime)) {
+                        logger.debug("Stops listening, max transcription time reached");
+                        break;
+                    }
+                    if (!config.singleUtteranceMode && isExpiredInterval(maxSilenceMillis, lastInputTime)) {
+                        logger.debug("Stops listening, max silence time reached");
+                        break;
+                    }
+                    if (nbytes < 0) {
+                        // read returns -1 at the end of the stream, there is nothing left to feed the recognizer with
+                        logger.debug("Stops listening, end of audio stream reached");
+                        break;
+                    }
+                    if (nbytes == 0) {
+                        if (!trySleep(100)) {
+                            logger.debug("Stops listening, thread interrupted");
+                            break;
+                        }
+                        continue;
+                    }
+                    if (recognizer.acceptWaveForm(b, nbytes)) {
+                        lastInputTime = System.currentTimeMillis();
+                        var result = recognizer.getResult();
+                        logger.debug("Result: {}", result);
+                        var json = mapper.readTree(result);
+                        // path() never returns null, a missing "text" field yields an empty string
+                        transcriptBuilder.append(json.path("text").asText()).append(" ");
+                        if (config.singleUtteranceMode) {
+                            break;
+                        }
+                    } else {
+                        logger.debug("Partial: {}", recognizer.getPartialResult());
+                    }
+                }
+                if (!aborted.get()) {
+                    sttListener.sttEventReceived(new RecognitionStopEvent());
+                    var transcript = transcriptBuilder.toString().trim();
+                    logger.debug("Final: {}", transcript);
+                    if (!transcript.isBlank()) {
+                        sttListener.sttEventReceived(new SpeechRecognitionEvent(transcript, 1F));
+                    } else {
+                        if (!config.noResultsMessage.isBlank()) {
+                            sttListener.sttEventReceived(new SpeechRecognitionErrorEvent(config.noResultsMessage));
+                        } else {
+                            sttListener.sttEventReceived(new SpeechRecognitionErrorEvent("No results"));
+                        }
+                    }
+                }
+            } catch (IOException e) {
+                logger.warn("Error running speech to text: {}", e.getMessage());
+                if (config.errorMessage.isBlank()) {
+                    sttListener.sttEventReceived(new SpeechRecognitionErrorEvent("Error"));
+                } else {
+                    sttListener.sttEventReceived(new SpeechRecognitionErrorEvent(config.errorMessage));
+                }
+            } finally {
+                if (recognizer != null) {
+                    recognizer.close();
+                }
+                if (!config.preloadModel && model != null) {
+                    model.close();
+                }
+                // closed here so the stream is released even when the task ends with an unchecked exception
+                try {
+                    audioStream.close();
+                } catch (IOException e) {
+                    logger.warn("IOException on close: {}", e.getMessage());
+                }
+            }
+        });
+    }
+
+    /**
+     * Sleeps for the given time.
+     *
+     * @param ms milliseconds to sleep
+     * @return false when the sleep was interrupted (the interrupt flag is restored), true otherwise
+     */
+    private boolean trySleep(long ms) {
+        try {
+            Thread.sleep(ms);
+            return true;
+        } catch (InterruptedException e) {
+            Thread.currentThread().interrupt();
+            return false;
+        }
+    }
+
+    /**
+     * @return true when more than interval milliseconds have passed since referenceTime
+     */
+    private boolean isExpiredInterval(long interval, long referenceTime) {
+        return System.currentTimeMillis() - referenceTime > interval;
+    }
+}
diff --git a/bundles/org.openhab.voice.voskstt/src/main/resources/OH-INF/config/config.xml b/bundles/org.openhab.voice.voskstt/src/main/resources/OH-INF/config/config.xml new file mode 100644 index 00000000000..c0266cb9109 --- /dev/null +++ b/bundles/org.openhab.voice.voskstt/src/main/resources/OH-INF/config/config.xml @@ -0,0 +1,50 @@ + + + + + + Configure Speech to Text. + + + + Configure service information messages. + + + + When enabled recognition stops listening after a single utterance. + true + + + + Max seconds to wait to force stop the transcription. + 60 + + + + Only works when singleUtteranceMode is disabled, max seconds without getting new transcriptions to stop + listening. + 5 + + + + Keep language model loaded. If true model is just reload the model on config updates, if not model will + be loaded and offloaded on each execution. It will fallback to try to load the model when executed if it was not + able to load it before. 
+ true + + + + Message to be told when no results. + Sorry, I didn't understand you + + + + Message to be told when an error has happened. (Empty for disabled) + Sorry, something went wrong + + + diff --git a/bundles/org.openhab.voice.voskstt/src/main/resources/OH-INF/i18n/voskstt.properties b/bundles/org.openhab.voice.voskstt/src/main/resources/OH-INF/i18n/voskstt.properties new file mode 100644 index 00000000000..952f0783740 --- /dev/null +++ b/bundles/org.openhab.voice.voskstt/src/main/resources/OH-INF/i18n/voskstt.properties @@ -0,0 +1,20 @@ +voice.config.voskstt.errorMessage.label = Error Message +voice.config.voskstt.errorMessage.description = Message to be told when an error has happened. (Empty for disabled) +voice.config.voskstt.group.messages.label = Info Messages +voice.config.voskstt.group.messages.description = Configure service information messages. +voice.config.voskstt.group.stt.label = STT Configuration +voice.config.voskstt.group.stt.description = Configure Speech to Text. +voice.config.voskstt.maxSilenceSeconds.label = Max Silence Seconds +voice.config.voskstt.maxSilenceSeconds.description = Only works when singleUtteranceMode is disabled, max seconds without getting new transcriptions to stop listening. +voice.config.voskstt.maxTranscriptionSeconds.label = Max Transcription Seconds +voice.config.voskstt.maxTranscriptionSeconds.description = Max seconds to wait to force stop the transcription. +voice.config.voskstt.noResultsMessage.label = No Results Message +voice.config.voskstt.noResultsMessage.description = Message to be told when no results. +voice.config.voskstt.preloadModel.label = Preload Model +voice.config.voskstt.preloadModel.description = Keep language model loaded. If true model is just reload the model on config updates, if not model will be loaded and offloaded on each execution. It will fallback to try to load the model when executed if it was not able to load it before. 
+voice.config.voskstt.singleUtteranceMode.label = Single Utterance Mode +voice.config.voskstt.singleUtteranceMode.description = When enabled recognition stops listening after a single utterance. + +# service + +service.voice.voskstt.label = Vosk Speech-to-Text diff --git a/bundles/pom.xml b/bundles/pom.xml index e1ab1accb9c..191b76beade 100644 --- a/bundles/pom.xml +++ b/bundles/pom.xml @@ -401,6 +401,7 @@ org.openhab.voice.pollytts org.openhab.voice.porcupineks org.openhab.voice.voicerss + org.openhab.voice.voskstt org.openhab.voice.watsonstt