[voice] New feature "listen and answer" (#2793)

* New method listenAndAnswer in VoiceManager
* New console command "voice listenandanswer"
* New REST API
* New rule action
* Enhanced console command "voice startdialog"

Closes #2688

Signed-off-by: Laurent Garnier <lg.hc@free.fr>
This commit is contained in:
lolodomo 2022-04-09 16:26:09 +02:00 committed by GitHub
parent 6a75130355
commit 90f6a95251
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
6 changed files with 325 additions and 48 deletions

View File

@ -312,4 +312,64 @@ public class VoiceResource implements RESTResource {
return JSONResponse.createErrorResponse(Status.BAD_REQUEST, e.getMessage());
}
}
/**
 * Runs a single "listen and answer" dialog sequence (no keyword spotting) through the voice manager.
 * <p>
 * Every ID that is supplied is resolved to its service first; an ID that cannot be resolved ends the request
 * with a 404 response. An ID that is omitted is forwarded to the voice manager as {@code null}.
 *
 * @param language value of the Accept-Language header, handed to the locale service to obtain the locale
 * @param sourceId ID of the audio source, may be null
 * @param sttId ID of the Speech-to-Text service, may be null
 * @param ttsId ID of the Text-to-Speech service, may be null
 * @param hliId ID of the human language interpreter, may be null
 * @param sinkId ID of the audio sink, may be null
 * @param listeningItem name of the listening item, forwarded unchanged to the voice manager
 * @return 200 when the voice manager accepted the request, 404 for an unknown ID, 400 when the voice manager
 *         rejected the request with an {@link IllegalStateException}
 */
@POST
@Path("/listenandanswer")
@Consumes(MediaType.TEXT_PLAIN)
@Operation(operationId = "listenAndAnswer", summary = "Executes a simple dialog sequence without keyword spotting for a given audio source.", responses = {
        @ApiResponse(responseCode = "200", description = "OK"),
        @ApiResponse(responseCode = "404", description = "One of the given ids is wrong."),
        @ApiResponse(responseCode = "400", description = "Services are missing or language is not supported by services or dialog processing is already started for the audio source.") })
public Response listenAndAnswer(
        @HeaderParam(HttpHeaders.ACCEPT_LANGUAGE) @Parameter(description = "language") @Nullable String language,
        @QueryParam("sourceId") @Parameter(description = "source ID") @Nullable String sourceId,
        @QueryParam("sttId") @Parameter(description = "Speech-to-Text ID") @Nullable String sttId,
        @QueryParam("ttsId") @Parameter(description = "Text-to-Speech ID") @Nullable String ttsId,
        @QueryParam("hliId") @Parameter(description = "interpreter ID") @Nullable String hliId,
        @QueryParam("sinkId") @Parameter(description = "audio sink ID") @Nullable String sinkId,
        @QueryParam("listeningItem") @Parameter(description = "listening item") @Nullable String listeningItem) {
    // Resolve the services in a fixed order; the first unknown ID decides which 404 message is returned.
    final AudioSource source = sourceId == null ? null : audioManager.getSource(sourceId);
    if (sourceId != null && source == null) {
        return JSONResponse.createErrorResponse(Status.NOT_FOUND, "Audio source not found");
    }

    final STTService stt = sttId == null ? null : voiceManager.getSTT(sttId);
    if (sttId != null && stt == null) {
        return JSONResponse.createErrorResponse(Status.NOT_FOUND, "Speech-to-Text not found");
    }

    final TTSService tts = ttsId == null ? null : voiceManager.getTTS(ttsId);
    if (ttsId != null && tts == null) {
        return JSONResponse.createErrorResponse(Status.NOT_FOUND, "Text-to-Speech not found");
    }

    final HumanLanguageInterpreter hli = hliId == null ? null : voiceManager.getHLI(hliId);
    if (hliId != null && hli == null) {
        return JSONResponse.createErrorResponse(Status.NOT_FOUND, "Interpreter not found");
    }

    final AudioSink sink = sinkId == null ? null : audioManager.getSink(sinkId);
    if (sinkId != null && sink == null) {
        return JSONResponse.createErrorResponse(Status.NOT_FOUND, "Audio sink not found");
    }

    final Locale locale = localeService.getLocale(language);
    try {
        voiceManager.listenAndAnswer(stt, tts, hli, source, sink, locale, listeningItem);
        return Response.ok(null, MediaType.TEXT_PLAIN).build();
    } catch (IllegalStateException e) {
        // The voice manager reports missing services, unsupported locale or a running dialog this way.
        return JSONResponse.createErrorResponse(Status.BAD_REQUEST, e.getMessage());
    }
}
}

View File

@ -320,4 +320,93 @@ public class Voice {
logger.warn("Failed stopping dialog processing: {}", e.getMessage());
}
}
/**
* Executes a simple dialog sequence without keyword spotting for a given audio source using default speech-to-text
* service, default text-to-speech service, default human language text interpreter and default locale.
*
* Convenience variant: it calls the seven-argument {@code listenAndAnswer} with null for every argument other
* than source and sink.
*
* @param source the name of audio source to use or null to use the default source
* @param sink the name of audio sink to use or null to use the default sink
*/
@ActionDoc(text = "executes a simple dialog sequence without keyword spotting for a given audio source")
public static void listenAndAnswer(@ParamDoc(name = "source") @Nullable String source,
@ParamDoc(name = "sink") @Nullable String sink) {
// stt, tts, interpreter, locale and listening item are all left unspecified (null)
listenAndAnswer(null, null, null, source, sink, null, null);
}
/**
 * Executes a simple dialog sequence without keyword spotting for a given audio source.
 * <p>
 * Each name that is provided is looked up first. If a lookup fails, a warning is logged and nothing is
 * executed. An {@link IllegalStateException} thrown by the voice manager is logged as a warning as well.
 *
 * @param stt the speech-to-text service to use or null to use the default service
 * @param tts the text-to-speech service to use or null to use the default service
 * @param interpreter the human language text interpreter to use or null to use the default service
 * @param source the name of audio source to use or null to use the default source
 * @param sink the name of audio sink to use or null to use the default sink
 * @param locale the locale to use ("language" or "language-country") or null to use the default locale
 * @param listeningItem the item to switch ON while listening to a question
 */
@ActionDoc(text = "executes a simple dialog sequence without keyword spotting for a given audio source")
public static void listenAndAnswer(@ParamDoc(name = "speech-to-text service") @Nullable String stt,
        @ParamDoc(name = "text-to-speech service") @Nullable String tts,
        @ParamDoc(name = "interpreter") @Nullable String interpreter,
        @ParamDoc(name = "source") @Nullable String source, @ParamDoc(name = "sink") @Nullable String sink,
        @ParamDoc(name = "locale") @Nullable String locale,
        @ParamDoc(name = "listening item") @Nullable String listeningItem) {
    final AudioSource resolvedSource = source == null ? null : VoiceActionService.audioManager.getSource(source);
    if (source != null && resolvedSource == null) {
        logger.warn("Failed executing simple dialog: audio source '{}' not found", source);
        return;
    }

    final STTService resolvedStt = stt == null ? null : VoiceActionService.voiceManager.getSTT(stt);
    if (stt != null && resolvedStt == null) {
        logger.warn("Failed executing simple dialog: speech-to-text service '{}' not found", stt);
        return;
    }

    final TTSService resolvedTts = tts == null ? null : VoiceActionService.voiceManager.getTTS(tts);
    if (tts != null && resolvedTts == null) {
        logger.warn("Failed executing simple dialog: text-to-speech service '{}' not found", tts);
        return;
    }

    final HumanLanguageInterpreter resolvedHli = interpreter == null ? null
            : VoiceActionService.voiceManager.getHLI(interpreter);
    if (interpreter != null && resolvedHli == null) {
        logger.warn("Failed executing simple dialog: interpreter '{}' not found", interpreter);
        return;
    }

    final AudioSink resolvedSink = sink == null ? null : VoiceActionService.audioManager.getSink(sink);
    if (sink != null && resolvedSink == null) {
        logger.warn("Failed executing simple dialog: audio sink '{}' not found", sink);
        return;
    }

    // Exactly two "-"-separated parts are read as language and country; otherwise only the first part is used.
    Locale dialogLocale = null;
    if (locale != null) {
        final String[] parts = locale.split("-");
        dialogLocale = parts.length == 2 ? new Locale(parts[0], parts[1]) : new Locale(parts[0]);
    }

    try {
        VoiceActionService.voiceManager.listenAndAnswer(resolvedStt, resolvedTts, resolvedHli, resolvedSource,
                resolvedSink, dialogLocale, listeningItem);
    } catch (IllegalStateException e) {
        logger.warn("Failed executing simple dialog: {}", e.getMessage());
    }
}
}

View File

@ -123,20 +123,21 @@ public interface VoiceManager {
/**
* Starts an infinite dialog sequence using all default services: keyword spotting on the default audio source,
* audio source listening to retrieve a question or a command (default Speech to Text service), interpretation and
* handling of the command, and finally playback of the answer on the default audio sink (default Text to Speech
* service).
*
* Only one dialog can be started for the default audio source.
*
* @throws IllegalStateException if required services are not all available or the default locale is not supported
* by all these services or a dialog is already started for the default audio source
*/
void startDialog() throws IllegalStateException;
/**
* Starts an infinite dialog sequence: keyword spotting on the audio source, audio source listening to retrieve
* a question or a command (Speech to Text service), interpretation and handling of the command, and finally
* playback of the answer on the audio sink (Text to Speech service).
*
* Only one dialog can be started for an audio source.
*
@ -150,7 +151,7 @@ public interface VoiceManager {
* @param keyword the keyword to use during keyword spotting or null to use the default keyword
* @param listeningItem the item to switch ON while listening to a question
* @throws IllegalStateException if required services are not all available or the provided locale is not supported
* by all these services or a dialog is already started for this audio source
*/
void startDialog(@Nullable KSService ks, @Nullable STTService stt, @Nullable TTSService tts,
@Nullable HumanLanguageInterpreter hli, @Nullable AudioSource source, @Nullable AudioSink sink,
@ -165,6 +166,39 @@ public interface VoiceManager {
*/
void stopDialog(@Nullable AudioSource source) throws IllegalStateException;
/**
* Executes a simple dialog sequence without keyword spotting using all default services: default audio source
* listening to retrieve a question or a command (default Speech to Text service), interpretation and handling of
* the command, and finally playback of the answer on the default audio sink (default Text to Speech service).
*
* Only possible if no dialog processor is already started for the default audio source.
*
* @throws IllegalStateException if required services are not all available or the default locale is not
* supported by all these services or a dialog is already started for the default audio source
*/
void listenAndAnswer() throws IllegalStateException;
/**
* Executes a simple dialog sequence without keyword spotting: audio source listening to retrieve a question or a
* command (Speech to Text service), interpretation and handling of the command, and finally playback of the
* answer on the audio sink (Text to Speech service).
*
* Only possible if no dialog processor is already started for the audio source.
*
* @param stt the speech-to-text service to use or null to use the default service
* @param tts the text-to-speech service to use or null to use the default service
* @param hli the human language text interpreter to use or null to use the default service
* @param source the audio source to use or null to use the default source
* @param sink the audio sink to use or null to use the default sink
* @param locale the locale to use or null to use the default locale
* @param listeningItem the item to switch ON while listening to a question or null to use the voice manager's
* default listening item
* @throws IllegalStateException if required services are not all available or the provided locale is not supported
* by all these services or a dialog is already started for this audio source
*/
void listenAndAnswer(@Nullable STTService stt, @Nullable TTSService tts, @Nullable HumanLanguageInterpreter hli,
@Nullable AudioSource source, @Nullable AudioSink sink, @Nullable Locale locale,
@Nullable String listeningItem) throws IllegalStateException;
/**
* Retrieves a TTS service.
* If a default name is configured and the service available, this is returned. Otherwise, the first available

View File

@ -71,7 +71,7 @@ public class DialogProcessor implements KSListener, STTListener {
private final Logger logger = LoggerFactory.getLogger(DialogProcessor.class);
private final KSService ks;
private final @Nullable KSService ks;
private final STTService stt;
private final TTSService tts;
private final HumanLanguageInterpreter hli;
@ -124,23 +124,76 @@ public class DialogProcessor implements KSListener, STTListener {
this.ttsFormat = VoiceManagerImpl.getBestMatch(tts.getSupportedFormats(), sink.getSupportedFormats());
}
/**
 * Creates a dialog processor that works without keyword spotting: no keyword spotting service is set, the
 * keyword is empty and no keyword spotting audio format is computed.
 *
 * @param stt the speech-to-text service
 * @param tts the text-to-speech service
 * @param hli the human language text interpreter
 * @param source the audio source
 * @param sink the audio sink
 * @param locale the locale
 * @param listeningItem the name of the listening item, may be null
 * @param eventPublisher the event publisher
 * @param i18nProvider the translation provider used for the spoken error messages
 * @param bundle the bundle passed to the translation provider
 */
public DialogProcessor(STTService stt, TTSService tts, HumanLanguageInterpreter hli, AudioSource source,
        AudioSink sink, Locale locale, @Nullable String listeningItem, EventPublisher eventPublisher,
        TranslationProvider i18nProvider, Bundle bundle) {
    // keyword spotting is disabled for this kind of processor
    this.ks = null;
    this.keyword = "";
    this.ksFormat = null;

    // services and audio endpoints taking part in the dialog
    this.stt = stt;
    this.tts = tts;
    this.hli = hli;
    this.source = source;
    this.sink = sink;

    // context
    this.locale = locale;
    this.listeningItem = listeningItem;
    this.eventPublisher = eventPublisher;
    this.i18nProvider = i18nProvider;
    this.bundle = bundle;

    // audio formats shared by source and STT, and by TTS and sink
    this.sttFormat = VoiceManagerImpl.getBestMatch(source.getSupportedFormats(), stt.getSupportedFormats());
    this.ttsFormat = VoiceManagerImpl.getBestMatch(tts.getSupportedFormats(), sink.getSupportedFormats());
}
/**
 * Starts the dialog processing.
 * <p>
 * When a keyword spotting service is set, {@code abortKS()} and {@code closeStreamKS()} are called and keyword
 * spotting is then started on a new input stream of the audio source. When no keyword spotting service is set,
 * the simple dialog (speech recognition without keyword spotting) is executed directly.
 * <p>
 * Failures are logged as warnings; nothing is thrown to the caller.
 */
public void start() {
    KSService ksService = ks;
    if (ksService != null) {
        abortKS();
        closeStreamKS();
        // Declared only here: a second declaration of "fmt" at method level would be an illegal redeclaration.
        AudioFormat fmt = ksFormat;
        if (fmt == null) {
            logger.warn("No compatible audio format found for ks '{}' and source '{}'", ksService.getId(),
                    source.getId());
            return;
        }
        try {
            AudioStream stream = source.getInputStream(fmt);
            streamKS = stream;
            ksServiceHandle = ksService.spot(this, stream, locale, keyword);
        } catch (AudioException e) {
            logger.warn("Encountered audio error: {}", e.getMessage());
        } catch (KSException e) {
            logger.warn("Encountered error calling spot: {}", e.getMessage());
            closeStreamKS();
        }
    } else {
        executeSimpleDialog();
    }
}
/**
 * Runs the speech recognition part of a dialog: calls {@code abortSTT()} and {@code closeStreamSTT()}, then
 * starts a new recognition on a new input stream of the audio source.
 * <p>
 * This method must not touch the keyword spotting service: it is also the entry point used when no keyword
 * spotting service is set, in which case {@code ks} is null.
 * <p>
 * A missing common audio format or an audio error is logged as a warning. An {@link STTException} closes the
 * STT stream and is reported to the user through {@code say}, using the translated "error.stt-exception" text
 * when it is available.
 */
private void executeSimpleDialog() {
    abortSTT();
    closeStreamSTT();
    isSTTServerAborting = false;
    AudioFormat fmt = sttFormat;
    if (fmt == null) {
        logger.warn("No compatible audio format found for stt '{}' and source '{}'", stt.getId(), source.getId());
        return;
    }
    try {
        AudioStream stream = source.getInputStream(fmt);
        streamSTT = stream;
        sttServiceHandle = stt.recognize(this, stream, locale, new HashSet<>());
    } catch (AudioException e) {
        logger.warn("Error creating the audio stream: {}", e.getMessage());
    } catch (STTException e) {
        closeStreamSTT();
        String msg = e.getMessage();
        String text = i18nProvider.getText(bundle, "error.stt-exception", null, locale);
        if (msg != null) {
            // prefer the translated template, fall back to the raw exception message
            say(text == null ? msg : text.replace("{0}", msg));
        } else if (text != null) {
            say(text.replace("{0}", ""));
        }
    }
}
@ -210,32 +263,10 @@ public class DialogProcessor implements KSListener, STTListener {
if (!processing) {
isSTTServerAborting = false;
if (ksEvent instanceof KSpottedEvent) {
abortSTT();
closeStreamSTT();
isSTTServerAborting = false;
AudioFormat fmt = sttFormat;
if (fmt != null) {
try {
AudioStream stream = source.getInputStream(fmt);
streamSTT = stream;
sttServiceHandle = stt.recognize(this, stream, locale, new HashSet<>());
} catch (AudioException e) {
logger.warn("Error creating the audio stream: {}", e.getMessage());
} catch (STTException e) {
closeStreamSTT();
String msg = e.getMessage();
String text = i18nProvider.getText(bundle, "error.stt-exception", null, locale);
if (msg != null) {
say(text == null ? msg : text.replace("{0}", msg));
} else if (text != null) {
say(text.replace("{0}", ""));
}
}
} else {
logger.warn("No compatible audio format found for stt '{}' and source '{}'", stt.getId(),
source.getId());
}
logger.debug("KSpottedEvent event received");
executeSimpleDialog();
} else if (ksEvent instanceof KSErrorEvent) {
logger.debug("KSErrorEvent event received");
KSErrorEvent kse = (KSErrorEvent) ksEvent;
String text = i18nProvider.getText(bundle, "error.ks-error", null, locale);
say(text == null ? kse.getMessage() : text.replace("{0}", kse.getMessage()));
@ -246,25 +277,30 @@ public class DialogProcessor implements KSListener, STTListener {
@Override
public synchronized void sttEventReceived(STTEvent sttEvent) {
if (sttEvent instanceof SpeechRecognitionEvent) {
logger.debug("SpeechRecognitionEvent event received");
if (!isSTTServerAborting) {
SpeechRecognitionEvent sre = (SpeechRecognitionEvent) sttEvent;
String question = sre.getTranscript();
logger.debug("Text recognized: {}", question);
try {
toggleProcessing(false);
say(hli.interpret(locale, question));
String answer = hli.interpret(locale, question);
logger.debug("Interpretation result: {}", answer);
say(answer);
} catch (InterpretationException e) {
String msg = e.getMessage();
if (msg != null) {
say(msg);
}
logger.debug("Interpretation exception: {}", e.getMessage());
say(e.getMessage());
}
abortSTT();
}
} else if (sttEvent instanceof RecognitionStartEvent) {
logger.debug("RecognitionStartEvent event received");
toggleProcessing(true);
} else if (sttEvent instanceof RecognitionStopEvent) {
logger.debug("RecognitionStopEvent event received");
toggleProcessing(false);
} else if (sttEvent instanceof SpeechRecognitionErrorEvent) {
logger.debug("SpeechRecognitionErrorEvent event received");
if (!isSTTServerAborting) {
abortSTT();
toggleProcessing(false);

View File

@ -60,6 +60,7 @@ public class VoiceConsoleCommandExtension extends AbstractConsoleCommandExtensio
private static final String SUBCMD_VOICES = "voices";
private static final String SUBCMD_START_DIALOG = "startdialog";
private static final String SUBCMD_STOP_DIALOG = "stopdialog";
private static final String SUBCMD_LISTEN_ANSWER = "listenandanswer";
private static final String SUBCMD_INTERPRETERS = "interpreters";
private static final String SUBCMD_KEYWORD_SPOTTERS = "keywordspotters";
private static final String SUBCMD_STT_SERVICES = "sttservices";
@ -91,6 +92,8 @@ public class VoiceConsoleCommandExtension extends AbstractConsoleCommandExtensio
"start a new dialog processing using the default services or the services identified with provided arguments"),
buildCommandUsage(SUBCMD_STOP_DIALOG + " [<source>]",
"stop the dialog processing for the default audio source or the audio source identified with provided argument"),
buildCommandUsage(SUBCMD_LISTEN_ANSWER + " [<source> [<sink> [<interpreter> [<tts> [<stt>]]]]]",
"Execute a simple dialog sequence without keyword spotting using the default services or the services identified with provided arguments"),
buildCommandUsage(SUBCMD_INTERPRETERS, "lists the interpreters"),
buildCommandUsage(SUBCMD_KEYWORD_SPOTTERS, "lists the keyword spotters"),
buildCommandUsage(SUBCMD_STT_SERVICES, "lists the Speech-to-Text services"),
@ -151,6 +154,19 @@ public class VoiceConsoleCommandExtension extends AbstractConsoleCommandExtensio
"An error occurred while stopping the dialog"));
}
break;
case SUBCMD_LISTEN_ANSWER:
try {
AudioSource source = args.length < 2 ? null : audioManager.getSource(args[1]);
AudioSink sink = args.length < 3 ? null : audioManager.getSink(args[2]);
HumanLanguageInterpreter hli = args.length < 4 ? null : voiceManager.getHLI(args[3]);
TTSService tts = args.length < 5 ? null : voiceManager.getTTS(args[4]);
STTService stt = args.length < 6 ? null : voiceManager.getSTT(args[5]);
voiceManager.listenAndAnswer(stt, tts, hli, source, sink, null, null);
} catch (IllegalStateException e) {
console.println(Objects.requireNonNullElse(e.getMessage(),
"An error occurred while executing the simple dialog sequence"));
}
break;
case SUBCMD_INTERPRETERS:
listInterpreters(console);
return;

View File

@ -549,6 +549,48 @@ public class VoiceManagerImpl implements VoiceManager, ConfigOptionProvider {
dialogProcessors.clear();
}
/**
* Executes a simple dialog sequence without keyword spotting, leaving every argument unspecified.
*/
@Override
public void listenAndAnswer() throws IllegalStateException {
// every null argument is replaced by its default in the seven-argument variant
listenAndAnswer(null, null, null, null, null, null, null);
}
/**
 * Executes a simple dialog sequence without keyword spotting.
 * <p>
 * Every null argument is replaced by the corresponding default. The dialog processor created here is started
 * but not added to {@code dialogProcessors}.
 *
 * @param stt the speech-to-text service to use or null to use the default service
 * @param tts the text-to-speech service to use or null to use the default service
 * @param hli the human language text interpreter to use or null to use the default service
 * @param source the audio source to use or null to use the default source
 * @param sink the audio sink to use or null to use the default sink
 * @param locale the locale to use or null to use the locale of the locale provider
 * @param listeningItem the listening item to use or null to use this manager's listening item
 * @throws IllegalStateException if a service, the source, the sink or the bundle is missing, if the locale is
 *             not supported by the STT service and the interpreter, or if a dialog processor is already
 *             registered for the audio source
 */
@Override
public void listenAndAnswer(@Nullable STTService stt, @Nullable TTSService tts,
        @Nullable HumanLanguageInterpreter hli, @Nullable AudioSource source, @Nullable AudioSink sink,
        @Nullable Locale locale, @Nullable String listeningItem) throws IllegalStateException {
    // fall back to the defaults for everything that was not provided
    final STTService sttService = stt != null ? stt : getSTT();
    final TTSService ttsService = tts != null ? tts : getTTS();
    final HumanLanguageInterpreter interpreter = hli != null ? hli : getHLI();
    final AudioSource audioSource = source != null ? source : audioManager.getSource();
    final AudioSink audioSink = sink != null ? sink : audioManager.getSink();
    final Locale loc = locale != null ? locale : localeProvider.getLocale();
    final String item = listeningItem != null ? listeningItem : this.listeningItem;
    final Bundle b = bundle;

    if (sttService == null || ttsService == null || interpreter == null || audioSource == null || audioSink == null
            || b == null) {
        throw new IllegalStateException("Cannot execute a simple dialog as services are missing.");
    }

    if (!(checkLocales(sttService.getSupportedLocales(), loc)
            && checkLocales(interpreter.getSupportedLocales(), loc))) {
        throw new IllegalStateException(
                "Cannot execute a simple dialog as provided locale is not supported by all services.");
    }

    // a dialog registered for this source blocks the simple dialog
    if (dialogProcessors.get(audioSource.getId()) != null) {
        throw new IllegalStateException(String.format(
                "Cannot execute a simple dialog as a dialog is already started for audio source '%s'.",
                audioSource.getLabel(null)));
    }

    logger.debug("Executing a simple dialog for source {} ({})", audioSource.getLabel(null),
            audioSource.getId());
    final DialogProcessor simpleDialog = new DialogProcessor(sttService, ttsService, interpreter, audioSource,
            audioSink, loc, item, this.eventPublisher, this.i18nProvider, b);
    simpleDialog.start();
}
private boolean checkLocales(Set<Locale> supportedLocales, Locale locale) {
if (supportedLocales.isEmpty()) {
return true;