From 5487ef17bc3b51efaef1ce2d73a5628bc467c630 Mon Sep 17 00:00:00 2001 From: Gwendal Roulleau Date: Mon, 30 Dec 2024 11:47:42 +0100 Subject: [PATCH] [whisper] Add OpenAI API compatibility Apply PR comments Signed-off-by: Gwendal Roulleau --- .../internal/WhisperSTTService.java | 4 +--- .../OH-INF/i18n/whisperstt.properties | 22 +++++++++++++++---- 2 files changed, 19 insertions(+), 7 deletions(-) diff --git a/bundles/org.openhab.voice.whisperstt/src/main/java/org/openhab/voice/whisperstt/internal/WhisperSTTService.java b/bundles/org.openhab.voice.whisperstt/src/main/java/org/openhab/voice/whisperstt/internal/WhisperSTTService.java index b6566670222..38d3ea06a03 100644 --- a/bundles/org.openhab.voice.whisperstt/src/main/java/org/openhab/voice/whisperstt/internal/WhisperSTTService.java +++ b/bundles/org.openhab.voice.whisperstt/src/main/java/org/openhab/voice/whisperstt/internal/WhisperSTTService.java @@ -286,7 +286,6 @@ public class WhisperSTTService implements STTService { @Override public STTServiceHandle recognize(STTListener sttListener, AudioStream audioStream, Locale locale, Set set) throws STTException { - AtomicBoolean aborted = new AtomicBoolean(false); try { logger.debug("Creating VAD instance..."); @@ -607,7 +606,6 @@ public class WhisperSTTService implements STTService { } private String recognizeAPI(int audioSamplesOffset, short[] audioStream, String language) throws STTException { - // convert to byte array, Each short has 2 bytes int size = audioSamplesOffset * 2; ByteBuffer byteArrayBuffer = ByteBuffer.allocate(size).order(ByteOrder.LITTLE_ENDIAN); @@ -621,7 +619,7 @@ public class WhisperSTTService implements STTService { try { AudioInputStream audioInputStream = new AudioInputStream(new ByteArrayInputStream(byteArray), jAudioFormat, - size); + audioSamplesOffset); // write stream as a WAV file, in a byte array stream : ByteArrayInputStream byteArrayInputStream = null; diff --git 
a/bundles/org.openhab.voice.whisperstt/src/main/resources/OH-INF/i18n/whisperstt.properties b/bundles/org.openhab.voice.whisperstt/src/main/resources/OH-INF/i18n/whisperstt.properties index 0780316715b..9051bda8e4b 100644 --- a/bundles/org.openhab.voice.whisperstt/src/main/resources/OH-INF/i18n/whisperstt.properties +++ b/bundles/org.openhab.voice.whisperstt/src/main/resources/OH-INF/i18n/whisperstt.properties @@ -3,6 +3,12 @@ addon.whisperstt.name = Whisper Speech-to-Text addon.whisperstt.description = Whisper STT Service uses the whisper.cpp library to transcript audio data to text. +voice.config.whisperstt.apiKey.label = API Key +voice.config.whisperstt.apiKey.description = Key to access the API. +voice.config.whisperstt.apiModelName.label = API Model +voice.config.whisperstt.apiModelName.description = Model name to use (API only). Defaults to the only model available from OpenAI (whisper-1). +voice.config.whisperstt.apiUrl.label = API URL +voice.config.whisperstt.apiUrl.description = OpenAI-compatible API URL. Defaults to the OpenAI transcription service. voice.config.whisperstt.audioContext.label = Audio Context voice.config.whisperstt.audioContext.description = Overwrite the audio context size. (0 to use whisper default context size) voice.config.whisperstt.beamSize.label = Beam Size @@ -24,27 +30,35 @@ voice.config.whisperstt.greedyBestOf.description = Best Of configuration for sam voice.config.whisperstt.group.developer.label = Developer voice.config.whisperstt.group.developer.description = Options added for developers. voice.config.whisperstt.group.grammar.label = Grammar -voice.config.whisperstt.group.grammar.description = Define a grammar to improve transcrptions. +voice.config.whisperstt.group.grammar.description = Define a grammar to improve transcriptions. voice.config.whisperstt.group.messages.label = Info Messages voice.config.whisperstt.group.messages.description = Configure service information messages.
+voice.config.whisperstt.group.openaiapi.label = API Configuration Options +voice.config.whisperstt.group.openaiapi.description = Configure an OpenAI-compatible API, if you don't want to use the local model. voice.config.whisperstt.group.stt.label = STT Configuration voice.config.whisperstt.group.stt.description = Configure Speech to Text. voice.config.whisperstt.group.vad.label = Voice Activity Detection -voice.config.whisperstt.group.vad.description = Configure the VAD mechanisim used to isolate single phrases to feed whisper with. +voice.config.whisperstt.group.vad.description = Configure the VAD mechanism used to isolate single phrases to feed whisper with. voice.config.whisperstt.group.whisper.label = Whisper Options voice.config.whisperstt.group.whisper.description = Configure the whisper.cpp transcription options. voice.config.whisperstt.initSilenceSeconds.label = Initial Silence Seconds voice.config.whisperstt.initSilenceSeconds.description = Max initial seconds of silence to discard transcription. voice.config.whisperstt.initialPrompt.label = Initial Prompt voice.config.whisperstt.initialPrompt.description = Initial prompt to feed whisper with. +voice.config.whisperstt.language.label = Language +voice.config.whisperstt.language.description = If specified, speeds up recognition by avoiding auto-detection. Defaults to the system locale. voice.config.whisperstt.maxSeconds.label = Max Transcription Seconds voice.config.whisperstt.maxSeconds.description = Seconds to force transcription before silence detection. voice.config.whisperstt.maxSilenceSeconds.label = Max Silence Seconds voice.config.whisperstt.maxSilenceSeconds.description = Seconds of silence to trigger transcription. voice.config.whisperstt.minSeconds.label = Min Transcription Seconds voice.config.whisperstt.minSeconds.description = Min transcription seconds passed to whisper. -voice.config.whisperstt.modelName.label = Model Name -voice.config.whisperstt.modelName.description = Model name without extension.
+voice.config.whisperstt.mode.label = Local Mode Or API +voice.config.whisperstt.mode.description = Use the local model or the OpenAI compatible API. +voice.config.whisperstt.mode.option.LOCAL = Local +voice.config.whisperstt.mode.option.API = OpenAI API +voice.config.whisperstt.modelName.label = Local Model Name +voice.config.whisperstt.modelName.description = Model name without extension. Local mode only. voice.config.whisperstt.openvinoDevice.label = OpenVINO Device voice.config.whisperstt.openvinoDevice.description = Initialize OpenVINO encoder. (built-in binaries do not support OpenVINO, this has no effect) voice.config.whisperstt.preloadModel.label = Preload Model