[audio|voice] Add console commands to troubleshoot audio sources and speech-to-text services (#4202)

Signed-off-by: Miguel Álvarez <miguelwork92@gmail.com>
Authored by GiviMAD on 2024-05-12 12:25:46 +02:00, committed by GitHub
parent ec7674752a
commit e14b0a8d0d
7 changed files with 295 additions and 8 deletions

View File: AudioFormat.java

@ -386,8 +386,9 @@ public class AudioFormat {
continue;
}
// Prefer WAVE container
if (!CONTAINER_WAVE.equals(format.getContainer())) {
// Prefer WAVE container or raw SIGNED PCM encoded audio
if (!CONTAINER_WAVE.equals(format.getContainer())
&& !(CONTAINER_NONE.equals(format.getContainer()) && CODEC_PCM_SIGNED.equals(format.getCodec()))) {
continue;
}
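For context, this relaxed preference matters for the new record console command: a source that only exposes raw signed PCM (CONTAINER_NONE with CODEC_PCM_SIGNED) is no longer skipped. The call in AudioManagerImpl.record further down exercises this preference logic, roughly like the following sketch (names taken from this diff):

// Sketch: select a recordable format from the source's capabilities. With this change,
// either a WAVE-container format or a raw signed-PCM format can be chosen.
AudioFormat format = AudioFormat.getBestMatch(audioSource.getSupportedFormats(),
        Set.of(AudioFormat.PCM_SIGNED, AudioFormat.WAV));
if (format == null) {
    throw new AudioException("Unable to find valid audio format");
}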

View File: AudioManager.java

@ -26,6 +26,7 @@ import org.openhab.core.library.types.PercentType;
* @author Kai Kreuzer - removed unwanted dependencies
* @author Christoph Weitkamp - Added parameter to adjust the volume
* @author Wouter Born - Added methods for getting all sinks and sources
* @author Miguel Álvarez - Add record method
*/
@NonNullByDefault
public interface AudioManager {
@ -151,6 +152,15 @@ public interface AudioManager {
*/
void playMelody(String melody, @Nullable String sinkId, @Nullable PercentType volume);
/**
* Record audio as a WAV file of the specified length to the sounds folder.
*
* @param seconds the number of seconds to record.
* @param filename the filename to record to.
* @param sourceId The id of the audio source to use or null for the default.
*/
void record(int seconds, String filename, @Nullable String sourceId) throws AudioException;
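Illustrative use of the new method from code holding an AudioManager reference (a sketch; 'javasound' and the filename are placeholders, and the AudioException must be handled by the caller):

// Record 10 seconds from the "javasound" source into <conf>/sounds/front-door.wav;
// passing null as sourceId records from the default audio source instead.
audioManager.record(10, "front-door.wav", "javasound");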
/**
* Retrieves the current volume of a sink
*

View File: AudioConsoleCommandExtension.java

@ -42,12 +42,14 @@ import org.osgi.service.component.annotations.Reference;
* @author Kai Kreuzer - refactored to match AudioManager implementation
* @author Christoph Weitkamp - Added parameter to adjust the volume
* @author Wouter Born - Sort audio sink and source options
* @author Miguel Álvarez Díez - Add record command
*/
@Component(service = ConsoleCommandExtension.class)
@NonNullByDefault
public class AudioConsoleCommandExtension extends AbstractConsoleCommandExtension {
static final String SUBCMD_PLAY = "play";
static final String SUBCMD_RECORD = "record";
static final String SUBCMD_STREAM = "stream";
static final String SUBCMD_SYNTHESIZE = "synthesize";
static final String SUBCMD_SOURCES = "sources";
@ -71,6 +73,8 @@ public class AudioConsoleCommandExtension extends AbstractConsoleCommandExtensio
"plays a sound file from the sounds folder through the optionally specified audio sink(s)"),
buildCommandUsage(SUBCMD_PLAY + " <sink> <filename> <volume>",
"plays a sound file from the sounds folder through the specified audio sink(s) with the specified volume"),
buildCommandUsage(SUBCMD_RECORD + " [<source>] <seconds> <filename>",
"record an audio file of the specified seconds to the sounds folder. The extension '.wav' will be added to the filename if missed."),
buildCommandUsage(SUBCMD_STREAM + " [<sink>] <url>",
"streams the sound from the url through the optionally specified audio sink(s)"),
buildCommandUsage(SUBCMD_SYNTHESIZE + " [<sink>] \"<melody>\"",
@ -95,6 +99,14 @@ public class AudioConsoleCommandExtension extends AbstractConsoleCommandExtensio
"Specify file to play, and optionally the sink(s) to use (e.g. 'play javasound hello.mp3')");
}
return;
case SUBCMD_RECORD:
if (args.length > 2) {
record(Arrays.copyOfRange(args, 1, args.length), console);
} else {
console.println(
"Specify time to record and the desired filename, and optionally the source to use (e.g. 'record javasound 10 good_morning.wav')");
}
return;
case SUBCMD_STREAM:
if (args.length > 1) {
stream(Arrays.copyOfRange(args, 1, args.length), console);
@ -175,6 +187,21 @@ public class AudioConsoleCommandExtension extends AbstractConsoleCommandExtensio
}
}
private void record(String[] args, Console console) {
try {
@Nullable
String sourceId = args.length > 2 ? args[0] : null;
int seconds = Integer.parseInt(args.length > 2 ? args[1] : args[0]);
String filename = args.length > 2 ? args[2] : args[1];
audioManager.record(seconds, filename, sourceId);
console.println("Recording completed");
} catch (NumberFormatException e) {
console.println("Unable to parse the recording time: " + e.getMessage());
} catch (AudioException e) {
console.println("Recording terminated with audio exception: " + e.getMessage());
}
}
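Putting the usage string and the argument parsing above together, a console session would look roughly like this (assuming the extension is exposed under the usual openhab:audio command scope); the first form records from the default source and gets '.wav' appended, the second names the source explicitly:

openhab> openhab:audio record 10 good_morning
Recording completed
openhab> openhab:audio record javasound 10 good_morning.wav
Recording completed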
private void synthesizeMelody(String[] args, Console console) {
switch (args.length) {
case 1:

View File: AudioManagerImpl.java

@ -14,9 +14,13 @@ package org.openhab.core.audio.internal;
import static java.util.Comparator.comparing;
import java.io.ByteArrayInputStream;
import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;
import java.net.URI;
import java.nio.ByteBuffer;
import java.nio.file.Path;
import java.text.ParseException;
import java.util.Collection;
import java.util.HashSet;
@ -26,6 +30,10 @@ import java.util.Objects;
import java.util.Set;
import java.util.concurrent.ConcurrentHashMap;
import javax.sound.sampled.AudioFileFormat;
import javax.sound.sampled.AudioInputStream;
import javax.sound.sampled.AudioSystem;
import org.eclipse.jdt.annotation.NonNullByDefault;
import org.eclipse.jdt.annotation.Nullable;
import org.openhab.core.OpenHAB;
@ -37,6 +45,7 @@ import org.openhab.core.audio.AudioSource;
import org.openhab.core.audio.AudioStream;
import org.openhab.core.audio.FileAudioStream;
import org.openhab.core.audio.URLAudioStream;
import org.openhab.core.audio.utils.AudioWaveUtils;
import org.openhab.core.audio.utils.ToneSynthesizer;
import org.openhab.core.config.core.ConfigOptionProvider;
import org.openhab.core.config.core.ConfigurableService;
@ -61,6 +70,7 @@ import org.slf4j.LoggerFactory;
* @author Christoph Weitkamp - Added getSupportedStreams() and UnsupportedAudioStreamException
* @author Christoph Weitkamp - Added parameter to adjust the volume
* @author Wouter Born - Sort audio sink and source options
* @author Miguel Álvarez - Add record from source
*/
@NonNullByDefault
@Component(immediate = true, configurationPid = "org.openhab.audio", //
@ -147,8 +157,7 @@ public class AudioManagerImpl implements AudioManager, ConfigOptionProvider {
@Override
public void playFile(String fileName, @Nullable String sinkId, @Nullable PercentType volume) throws AudioException {
Objects.requireNonNull(fileName, "File cannot be played as fileName is null.");
File file = new File(OpenHAB.getConfigFolder() + File.separator + SOUND_DIR + File.separator + fileName);
File file = Path.of(OpenHAB.getConfigFolder(), SOUND_DIR, fileName).toFile();
FileAudioStream is = new FileAudioStream(file);
play(is, sinkId, volume);
}
@ -195,6 +204,67 @@ public class AudioManagerImpl implements AudioManager, ConfigOptionProvider {
}
}
@Override
public void record(int seconds, String filename, @Nullable String sourceId) throws AudioException {
var audioSource = sourceId != null ? getSource(sourceId) : getSource();
if (audioSource == null) {
throw new AudioException("Audio source '" + (sourceId != null ? sourceId : "default") + "' not available");
}
var audioFormat = AudioFormat.getBestMatch(audioSource.getSupportedFormats(),
Set.of(AudioFormat.PCM_SIGNED, AudioFormat.WAV));
if (audioFormat == null) {
throw new AudioException("Unable to find valid audio format");
}
javax.sound.sampled.AudioFormat jAudioFormat = new javax.sound.sampled.AudioFormat(
Objects.requireNonNull(audioFormat.getFrequency()), Objects.requireNonNull(audioFormat.getBitDepth()),
Objects.requireNonNull(audioFormat.getChannels()), true, false);
int secondByteLength = ((int) jAudioFormat.getSampleRate() * jAudioFormat.getFrameSize());
int targetByteLength = secondByteLength * seconds;
ByteBuffer recordBuffer = ByteBuffer.allocate(targetByteLength);
try (var audioStream = audioSource.getInputStream(audioFormat)) {
if (audioFormat.isCompatible(AudioFormat.WAV)) {
AudioWaveUtils.removeFMT(audioStream);
}
while (true) {
try {
var bytes = audioStream.readNBytes(secondByteLength);
if (bytes.length == 0) {
logger.debug("End of input audio stream reached");
break;
}
if (recordBuffer.position() + bytes.length > recordBuffer.limit()) {
logger.debug("Recording limit reached");
break;
}
recordBuffer.put(bytes);
} catch (IOException e) {
logger.warn("Reading audio data failed");
}
}
} catch (IOException e) {
logger.warn("IOException while reading audioStream: {}", e.getMessage());
}
String recordFilename = filename.endsWith(".wav") ? filename : filename + ".wav";
logger.info("Saving record file: {}", recordFilename);
byte[] audioBytes = new byte[recordBuffer.position()];
logger.info("Saving bytes: {}", audioBytes.length);
recordBuffer.rewind();
recordBuffer.get(audioBytes);
File recordFile = new File(
OpenHAB.getConfigFolder() + File.separator + SOUND_DIR + File.separator + recordFilename);
try (FileOutputStream fileOutputStream = new FileOutputStream(recordFile)) {
AudioSystem.write(
new AudioInputStream(new ByteArrayInputStream(audioBytes), jAudioFormat,
(long) Math.ceil(((double) audioBytes.length) / jAudioFormat.getFrameSize())), //
AudioFileFormat.Type.WAVE, //
fileOutputStream //
);
fileOutputStream.flush();
} catch (IOException e) {
logger.warn("IOException while saving record file: {}", e.getMessage());
}
}
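A quick worked example of the buffer sizing above, assuming the source negotiates 16 kHz, 16-bit, mono signed PCM (frame size 2 bytes):

int secondByteLength = 16000 * 2;              // bytes per second of audio
int targetByteLength = secondByteLength * 10;  // 320,000 bytes for a 10 s recording
// The read loop pulls roughly one second per readNBytes call and stops once the buffer
// limit would be exceeded, so the saved WAV payload never exceeds targetByteLength.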
@Override
public PercentType getVolume(@Nullable String sinkId) throws IOException {
AudioSink sink = getSink(sinkId);

View File: VoiceManager.java

@ -14,11 +14,13 @@ package org.openhab.core.voice;
import java.util.Collection;
import java.util.List;
import java.util.Locale;
import java.util.Set;
import org.eclipse.jdt.annotation.NonNullByDefault;
import org.eclipse.jdt.annotation.Nullable;
import org.openhab.core.audio.AudioSource;
import org.openhab.core.audio.AudioStream;
import org.openhab.core.library.types.PercentType;
import org.openhab.core.voice.text.HumanLanguageInterpreter;
import org.openhab.core.voice.text.InterpretationException;
@ -30,6 +32,7 @@ import org.openhab.core.voice.text.InterpretationException;
* @author Christoph Weitkamp - Added parameter to adjust the volume
* @author Laurent Garnier - Updated methods startDialog and added method stopDialog
* @author Miguel Álvarez - New dialog methods using DialogContext
* @author Miguel Álvarez - Add transcribe method
*/
@NonNullByDefault
public interface VoiceManager {
@ -93,6 +96,26 @@ public interface VoiceManager {
*/
void say(String text, @Nullable String voiceId, @Nullable String sinkId, @Nullable PercentType volume);
/**
* Run speech-to-text using the provided audio source.
*
* @param audioSourceId The id of the audio source to listen to, or null to use the default.
* @param sttId The id of the speech-to-text service to use or null to use the default.
* @param locale The locale to use or null to use the default.
* @return a human language transcription, or an empty string if none could be produced.
*/
String transcribe(@Nullable String audioSourceId, @Nullable String sttId, @Nullable Locale locale);
/**
* Run speech-to-text over the provided audio stream.
*
* @param audioStream Audio stream to transcribe.
* @param sttId The id of the speech-to-text service to use or null to use the default.
* @param locale The locale to use or null to use the default.
* @return a human language transcription, or an empty string if none could be produced.
*/
String transcribe(AudioStream audioStream, @Nullable String sttId, @Nullable Locale locale);
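Illustrative calls for the two overloads from code holding a VoiceManager reference (a sketch; the source id, file name and locale are placeholders, and AudioException/IOException handling is left to the caller):

// Live transcription from a named audio source, default STT service and locale.
String fromSource = voiceManager.transcribe("javasound", null, null);

// Transcription of a WAV file from the sounds folder with an explicit locale.
File wav = Path.of(OpenHAB.getConfigFolder(), AudioManager.SOUND_DIR, "good_morning.wav").toFile();
try (AudioStream stream = new FileAudioStream(wav)) {
    String fromFile = voiceManager.transcribe(stream, null, Locale.forLanguageTag("en-US"));
}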
/**
* Interprets the passed string using the default services for HLI and locale.
*

View File: VoiceConsoleCommandExtension.java

@ -14,17 +14,23 @@ package org.openhab.core.voice.internal;
import static java.util.Comparator.comparing;
import java.io.FileNotFoundException;
import java.nio.file.Path;
import java.util.Arrays;
import java.util.Collection;
import java.util.HashMap;
import java.util.List;
import java.util.Locale;
import java.util.MissingResourceException;
import java.util.Objects;
import java.util.stream.Collectors;
import org.eclipse.jdt.annotation.NonNullByDefault;
import org.eclipse.jdt.annotation.Nullable;
import org.openhab.core.OpenHAB;
import org.openhab.core.audio.AudioException;
import org.openhab.core.audio.AudioManager;
import org.openhab.core.audio.FileAudioStream;
import org.openhab.core.i18n.LocaleProvider;
import org.openhab.core.io.console.Console;
import org.openhab.core.io.console.extensions.AbstractConsoleCommandExtension;
@ -52,12 +58,14 @@ import org.osgi.service.component.annotations.Reference;
* @author Kai Kreuzer - Initial contribution
* @author Wouter Born - Sort TTS voices
* @author Laurent Garnier - Added sub-commands startdialog and stopdialog
* @author Miguel Álvarez - Add transcribe command
*/
@Component(service = ConsoleCommandExtension.class)
@NonNullByDefault
public class VoiceConsoleCommandExtension extends AbstractConsoleCommandExtension {
private static final String SUBCMD_SAY = "say";
private static final String SUBCMD_TRANSCRIBE = "transcribe";
private static final String SUBCMD_INTERPRET = "interpret";
private static final String SUBCMD_VOICES = "voices";
private static final String SUBCMD_START_DIALOG = "startdialog";
@ -90,7 +98,9 @@ public class VoiceConsoleCommandExtension extends AbstractConsoleCommandExtensio
@Override
public List<String> getUsages() {
return List.of(buildCommandUsage(SUBCMD_SAY + " <text>", "speaks a text"),
return List.of(buildCommandUsage(SUBCMD_SAY + " <text>", "speaks a text"), buildCommandUsage(
SUBCMD_TRANSCRIBE + " [--source <source>]|[--file <file>] [--stt <stt>] [--locale <locale>]",
"transcribe audio from default source, optionally specify a different source/file, speech-to-text service or locale"),
buildCommandUsage(SUBCMD_INTERPRET + " <command>", "interprets a human language command"),
buildCommandUsage(SUBCMD_VOICES, "lists available voices of the TTS services"),
buildCommandUsage(SUBCMD_DIALOGS, "lists the running dialog and their audio/voice services"),
@ -128,6 +138,10 @@ public class VoiceConsoleCommandExtension extends AbstractConsoleCommandExtensio
}
return;
}
case SUBCMD_TRANSCRIBE -> {
transcribe(args, console);
return;
}
case SUBCMD_INTERPRET -> {
if (args.length > 1) {
interpret(Arrays.copyOfRange(args, 1, args.length), console);
@ -305,6 +319,51 @@ public class VoiceConsoleCommandExtension extends AbstractConsoleCommandExtensio
voiceManager.say(msg.toString());
}
private void transcribe(String[] args, Console console) {
HashMap<String, String> parameters;
try {
parameters = parseNamedParameters(args);
} catch (IllegalStateException e) {
console.println(Objects.requireNonNullElse(e.getMessage(), "An error occurred while parsing the named parameters"));
return;
}
@Nullable
Locale locale;
try {
locale = parameters.containsKey("locale")
? Locale.forLanguageTag(Objects.requireNonNull(parameters.get("locale")))
: null;
} catch (MissingResourceException e) {
console.println("Error: Locale '" + parameters.get("locale") + "' is not correct.");
return;
}
String text;
if (parameters.containsKey("file")) {
FileAudioStream fileAudioStream;
try {
var file = Path.of(OpenHAB.getConfigFolder(), AudioManager.SOUND_DIR, parameters.get("file")).toFile();
if (!file.exists()) {
throw new FileNotFoundException();
}
fileAudioStream = new FileAudioStream(file);
} catch (AudioException e) {
console.println("Error: Unable to open '" + parameters.get("file") + "' file audio stream.");
return;
} catch (FileNotFoundException e) {
console.println("Error: File '" + parameters.get("file") + "' not found in sound folder.");
return;
}
text = voiceManager.transcribe(fileAudioStream, parameters.get("stt"), locale);
} else {
text = voiceManager.transcribe(parameters.get("source"), parameters.get("stt"), locale);
}
if (!text.isBlank()) {
console.println("Transcription: " + text);
} else {
console.println("No transcription generated");
}
}
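Given the usage string and the parameter parsing above, typical invocations would look roughly like this (assuming the extension is exposed under the usual openhab:voice command scope; the source and STT ids are placeholders, and "No transcription generated" is printed when the result is blank):

openhab> openhab:voice transcribe
Transcription: <recognized text>
openhab> openhab:voice transcribe --file good_morning.wav --locale en-US
Transcription: <recognized text>
openhab> openhab:voice transcribe --source javasound --stt <stt-service-id>
Transcription: <recognized text>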
private void listDialogRegistrations(Console console) {
Collection<DialogRegistration> registrations = voiceManager.getDialogRegistrations();
if (!registrations.isEmpty()) {
@ -405,7 +464,7 @@ public class VoiceConsoleCommandExtension extends AbstractConsoleCommandExtensio
.orElse(null);
}
private HashMap<String, String> parseDialogParameters(String[] args) {
private HashMap<String, String> parseNamedParameters(String[] args) {
var parameters = new HashMap<String, String>();
for (int i = 1; i < args.length; i++) {
var arg = args[i].trim();
@ -428,7 +487,7 @@ public class VoiceConsoleCommandExtension extends AbstractConsoleCommandExtensio
if (args.length < 2) {
return dialogContextBuilder;
}
var parameters = parseDialogParameters(args);
var parameters = parseNamedParameters(args);
String sourceId = parameters.remove("source");
if (sourceId != null) {
var source = audioManager.getSource(sourceId);
@ -463,7 +522,7 @@ public class VoiceConsoleCommandExtension extends AbstractConsoleCommandExtensio
}
private DialogRegistration parseDialogRegistration(String[] args) {
var parameters = parseDialogParameters(args);
var parameters = parseNamedParameters(args);
@Nullable
String sourceId = parameters.remove("source");
if (sourceId == null) {

View File: VoiceManagerImpl.java

@ -12,6 +12,7 @@
*/
package org.openhab.core.voice.internal;
import java.io.IOException;
import java.net.URI;
import java.util.ArrayList;
import java.util.Arrays;
@ -28,15 +29,19 @@ import java.util.Map.Entry;
import java.util.Objects;
import java.util.Set;
import java.util.WeakHashMap;
import java.util.concurrent.CompletableFuture;
import java.util.concurrent.ConcurrentHashMap;
import java.util.concurrent.ExecutionException;
import java.util.concurrent.ScheduledExecutorService;
import java.util.concurrent.ScheduledFuture;
import java.util.concurrent.TimeUnit;
import java.util.concurrent.TimeoutException;
import java.util.function.Predicate;
import java.util.stream.Collectors;
import org.eclipse.jdt.annotation.NonNullByDefault;
import org.eclipse.jdt.annotation.Nullable;
import org.openhab.core.audio.AudioException;
import org.openhab.core.audio.AudioFormat;
import org.openhab.core.audio.AudioManager;
import org.openhab.core.audio.AudioSink;
@ -55,7 +60,13 @@ import org.openhab.core.storage.StorageService;
import org.openhab.core.voice.DialogContext;
import org.openhab.core.voice.DialogRegistration;
import org.openhab.core.voice.KSService;
import org.openhab.core.voice.RecognitionStartEvent;
import org.openhab.core.voice.RecognitionStopEvent;
import org.openhab.core.voice.STTException;
import org.openhab.core.voice.STTService;
import org.openhab.core.voice.STTServiceHandle;
import org.openhab.core.voice.SpeechRecognitionErrorEvent;
import org.openhab.core.voice.SpeechRecognitionEvent;
import org.openhab.core.voice.TTSException;
import org.openhab.core.voice.TTSService;
import org.openhab.core.voice.Voice;
@ -85,6 +96,7 @@ import org.slf4j.LoggerFactory;
* @author Wouter Born - Sort TTS options
* @author Laurent Garnier - Updated methods startDialog and added method stopDialog
* @author Miguel Álvarez - Use dialog context
* @author Miguel Álvarez - Add transcribe method
*/
@Component(immediate = true, configurationPid = VoiceManagerImpl.CONFIGURATION_PID, //
property = Constants.SERVICE_PID + "=org.openhab.voice")
@ -288,6 +300,91 @@ public class VoiceManagerImpl implements VoiceManager, ConfigOptionProvider, Dia
}
}
@Override
public String transcribe(@Nullable String audioSourceId, @Nullable String sttId, @Nullable Locale locale) {
var audioSource = audioSourceId != null ? audioManager.getSource(audioSourceId) : audioManager.getSource();
if (audioSource == null) {
logger.warn("Audio source '{}' not available", audioSourceId != null ? audioSourceId : "default");
return "";
}
var sttService = sttId != null ? getSTT(sttId) : getSTT();
if (sttService == null) {
logger.warn("Speech-to-text service '{}' not available", sttId != null ? sttId : "default");
return "";
}
var sttFormat = VoiceManagerImpl.getBestMatch(audioSource.getSupportedFormats(),
sttService.getSupportedFormats());
if (sttFormat == null) {
logger.warn("No compatible audio format found for stt '{}' and the provided audio stream",
sttService.getId());
return "";
}
AudioStream audioStream;
try {
audioStream = audioSource.getInputStream(sttFormat);
} catch (AudioException e) {
logger.warn("AudioException creating source audio stream: {}", e.getMessage());
return "";
}
return transcribe(audioStream, sttService, locale);
}
@Override
public String transcribe(AudioStream audioStream, @Nullable String sttId, @Nullable Locale locale) {
var sttService = sttId != null ? getSTT(sttId) : getSTT();
if (sttService == null) {
logger.warn("Speech-to-text service '{}' not available", sttId != null ? sttId : "default");
return "";
}
var sttFormat = VoiceManagerImpl.getBestMatch(Set.of(audioStream.getFormat()),
sttService.getSupportedFormats());
if (sttFormat == null) {
logger.warn("No compatible audio format found for stt '{}' and the provided audio stream",
sttService.getId());
return "";
}
return transcribe(audioStream, sttService, locale);
}
private String transcribe(AudioStream audioStream, STTService sttService, @Nullable Locale locale) {
Locale nullSafeLocale = locale != null ? locale : localeProvider.getLocale();
CompletableFuture<String> transcriptionResult = new CompletableFuture<>();
STTServiceHandle sttServiceHandle;
try {
sttServiceHandle = sttService.recognize(sttEvent -> {
if (sttEvent instanceof SpeechRecognitionEvent sre) {
logger.debug("SpeechRecognitionEvent event received");
String transcript = sre.getTranscript();
logger.debug("Text recognized: {}", transcript);
transcriptionResult.complete(transcript);
} else if (sttEvent instanceof RecognitionStartEvent) {
logger.debug("RecognitionStartEvent event received");
} else if (sttEvent instanceof RecognitionStopEvent) {
logger.debug("RecognitionStopEvent event received");
} else if (sttEvent instanceof SpeechRecognitionErrorEvent sre) {
logger.debug("SpeechRecognitionErrorEvent event received");
transcriptionResult.completeExceptionally(
new IOException("SpeechRecognitionErrorEvent emitted: " + sre.getMessage()));
}
}, audioStream, nullSafeLocale, new HashSet<>());
} catch (STTException e) {
logger.warn("STTException while running transcription");
return "";
}
try {
return transcriptionResult.get(60, TimeUnit.SECONDS);
} catch (InterruptedException e) {
logger.warn("InterruptedException waiting for transcription: {}", e.getMessage());
sttServiceHandle.abort();
} catch (ExecutionException e) {
logger.warn("ExecutionException running transcription: {}", e.getCause().getMessage());
} catch (TimeoutException e) {
logger.warn("TimeoutException waiting for transcription");
sttServiceHandle.abort();
}
return "";
}
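Combined with the new AudioManager.record, this makes a simple end-to-end check of an audio source plus an STT service possible from code as well as from the console. A hedged sketch (placeholder ids and file name, exception handling omitted; the blocking wait above caps a single transcription at 60 seconds):

// 1. Capture 10 s from a source into <conf>/sounds/check.wav.
audioManager.record(10, "check.wav", "javasound");

// 2. Transcribe the recorded file with the default STT service; an empty string means no result.
File wav = Path.of(OpenHAB.getConfigFolder(), AudioManager.SOUND_DIR, "check.wav").toFile();
String text = voiceManager.transcribe(new FileAudioStream(wav), null, null);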
@Override
public String interpret(String text) throws InterpretationException {
return interpret(text, null);