[mimictts] Add LRU cache (#14564)

* [mimictts] Add LRU cache

And simplifies code with new core capabilities (no more need to create temporary files implementing FixedLengthAudioStream)

---------

Signed-off-by: Gwendal Roulleau <gwendal.roulleau@gmail.com>
This commit is contained in:
Gwendal Roulleau 2023-07-12 21:59:48 +02:00 committed by GitHub
parent 287cee32a5
commit 7587e0c504
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
7 changed files with 28 additions and 147 deletions

View File

@ -17,7 +17,6 @@ It supports a subset of SSML, and if you want to use it, be sure to start your t
Using your favorite configuration UI to edit **Settings / Other Services - Mimic Text-to-Speech** and set:
* **url** - Mimic URL. Default to `http://localhost:59125`
* **workaroundServletSink** - A boolean activating a workaround for audiosink using the openHAB servlet. It stores audio file temporarily on disk, allowing the servlet to get a cloned stream as needed. Default false.
* **speakingRate** - Controls how fast the voice speaks the text. A value of 1 is the speed of the training dataset. Less than 1 is faster, and more than 1 is slower.
* **audioVolatility** - The amount of noise added to the generated audio (0-1). Can help mask audio artifacts from the voice model. Multi-speaker models tend to sound better with a lower amount of noise than single speaker models.
* **phonemeVolatility** - The amount of noise used to generate phoneme durations (0-1). Allows for variable speaking cadence, with a value closer to 1 being more variable. Multi-speaker models tend to sound better with a lower amount of phoneme variability than single speaker models.
@ -48,3 +47,7 @@ In case you would like to setup these settings via a text file, you can edit the
org.openhab.voice:defaultTTS=mimictts
org.openhab.voice:defaultVoice=mimictts:fr_FR_siwis_low
```
## Caching
The Mimic TTS service uses the openHAB TTS cache to cache audio files produced from the most recent queries in order to reduce traffic, improve performance and reduce the number of requests.

View File

@ -1,84 +0,0 @@
/**
* Copyright (c) 2010-2023 Contributors to the openHAB project
*
* See the NOTICE file(s) distributed with this work for additional
* information.
*
* This program and the accompanying materials are made available under the
* terms of the Eclipse Public License 2.0 which is available at
* http://www.eclipse.org/legal/epl-2.0
*
* SPDX-License-Identifier: EPL-2.0
*/
package org.openhab.voice.mimic.internal;
import java.io.File;
import java.io.IOException;
import java.io.InputStream;
import java.util.ArrayList;
import java.util.List;
import org.eclipse.jdt.annotation.NonNullByDefault;
import org.openhab.core.audio.AudioException;
import org.openhab.core.audio.AudioFormat;
import org.openhab.core.audio.FileAudioStream;
/**
 * A {@link FileAudioStream} that deletes its backing file once the stream itself and every
 * stream handed out by {@link #getClonedStream()} have been closed.
 * Useful to avoid congesting the temporary directory.
 *
 * @author Gwendal Roulleau - Initial contribution
 */
@NonNullByDefault
public class AutoDeleteFileAudioStream extends FileAudioStream {

    /** The file backing this stream; deleted when the last open stream on it is closed. */
    private final File file;
    /** The format passed to every clone created by {@link #getClonedStream()}. */
    private final AudioFormat audioFormat;
    /** Every clone handed out so far, kept so their open/closed state can be inspected. */
    private final List<ClonedFileInputStream> clonedAudioStreams = new ArrayList<>(1);
    /** {@code true} until {@link #close()} has been called on this (parent) stream. */
    private boolean isOpen = true;

    /**
     * Creates a stream over the given file.
     *
     * @param file the file to read from and to delete once all streams are closed
     * @param format the audio format of the file content
     * @throws AudioException if the parent {@link FileAudioStream} constructor fails
     */
    public AutoDeleteFileAudioStream(File file, AudioFormat format) throws AudioException {
        super(file, format);
        this.file = file;
        this.audioFormat = format;
    }

    /**
     * Closes this stream, then deletes the file if no clone is still open.
     *
     * @throws IOException if closing the underlying stream fails; the stream is nevertheless
     *             marked as closed and the deletion is still attempted
     */
    @Override
    public void close() throws IOException {
        try {
            super.close();
        } finally {
            // Run the bookkeeping even when super.close() throws, otherwise the file would
            // never become eligible for deletion.
            this.isOpen = false;
            deleteIfPossible();
        }
    }

    /**
     * Deletes the backing file when this stream and all of its clones are closed.
     * Does nothing while at least one of them is still open.
     */
    protected void deleteIfPossible() {
        boolean aClonedStreamIsOpen = clonedAudioStreams.stream().anyMatch(as -> as.isOpen);
        if (!isOpen && !aClonedStreamIsOpen) {
            // Best effort: the result is ignored, a failed deletion is not reported.
            file.delete();
        }
    }

    /**
     * Opens a new, independent stream on the same file and registers it, so the file is kept
     * until that stream is closed too.
     *
     * @return a new stream reading the same file
     * @throws AudioException if the new stream cannot be created
     */
    @Override
    public InputStream getClonedStream() throws AudioException {
        ClonedFileInputStream clonedInputStream = new ClonedFileInputStream(this, file, audioFormat);
        clonedAudioStreams.add(clonedInputStream);
        return clonedInputStream;
    }

    /**
     * A clone of the parent stream that notifies its parent when it is closed.
     */
    private static class ClonedFileInputStream extends FileAudioStream {

        /** {@code true} until {@link #close()} has been called on this clone. */
        protected boolean isOpen = true;
        /** The stream this clone was created from; asked to delete the file on close. */
        private final AutoDeleteFileAudioStream parent;

        public ClonedFileInputStream(AutoDeleteFileAudioStream parent, File file, AudioFormat audioFormat)
                throws AudioException {
            super(file, audioFormat);
            this.parent = parent;
        }

        /**
         * Closes this clone, then lets the parent delete the file if nothing else is open.
         *
         * @throws IOException if closing the underlying stream fails; the clone is nevertheless
         *             marked as closed and the parent is still notified
         */
        @Override
        public void close() throws IOException {
            try {
                super.close();
            } finally {
                // Same reasoning as in the parent: never leave the clone flagged as open.
                this.isOpen = false;
                parent.deleteIfPossible();
            }
        }
    }
}

View File

@ -18,9 +18,9 @@ import java.io.OutputStream;
import org.eclipse.jdt.annotation.NonNullByDefault;
import org.eclipse.jdt.annotation.Nullable;
import org.openhab.core.audio.AudioException;
import org.openhab.core.audio.AudioFormat;
import org.openhab.core.audio.FixedLengthAudioStream;
import org.openhab.core.audio.AudioStream;
import org.openhab.core.audio.SizeableAudioStream;
/**
* An AudioStream with an {@link InputStream} inside
@ -28,7 +28,7 @@ import org.openhab.core.audio.FixedLengthAudioStream;
* @author Gwendal Roulleau - Initial contribution
*/
@NonNullByDefault
public class InputStreamAudioStream extends FixedLengthAudioStream {
public class InputStreamAudioStream extends AudioStream implements SizeableAudioStream {
public InputStream innerInputStream;
public AudioFormat audioFormat;
@ -115,9 +115,4 @@ public class InputStreamAudioStream extends FixedLengthAudioStream {
public long length() {
return length;
}
@Override
public InputStream getClonedStream() throws AudioException {
throw new AudioException("Operation not supported");
}
}

View File

@ -25,5 +25,4 @@ public class MimicConfiguration {
public Double speakingRate = 1.0;
public Double audioVolatility = 0.667;
public Double phonemeVolatility = 0.8;
public Boolean workaroundServletSink = false;
}

View File

@ -12,20 +12,19 @@
*/
package org.openhab.voice.mimic.internal;
import java.io.File;
import java.io.IOException;
import java.io.InputStream;
import java.io.UnsupportedEncodingException;
import java.math.BigInteger;
import java.net.URLEncoder;
import java.nio.charset.StandardCharsets;
import java.nio.file.Files;
import java.nio.file.StandardCopyOption;
import java.security.MessageDigest;
import java.security.NoSuchAlgorithmException;
import java.util.HashSet;
import java.util.List;
import java.util.Locale;
import java.util.Map;
import java.util.Set;
import java.util.UUID;
import java.util.concurrent.ExecutionException;
import java.util.concurrent.TimeUnit;
import java.util.concurrent.TimeoutException;
@ -38,13 +37,13 @@ import org.eclipse.jetty.client.util.InputStreamResponseListener;
import org.eclipse.jetty.client.util.StringContentProvider;
import org.eclipse.jetty.http.HttpHeader;
import org.eclipse.jetty.http.HttpStatus;
import org.openhab.core.OpenHAB;
import org.openhab.core.audio.AudioException;
import org.openhab.core.audio.AudioFormat;
import org.openhab.core.audio.AudioStream;
import org.openhab.core.config.core.ConfigurableService;
import org.openhab.core.io.net.http.HttpClientFactory;
import org.openhab.core.io.net.http.HttpRequestBuilder;
import org.openhab.core.voice.AbstractCachedTTSService;
import org.openhab.core.voice.TTSCache;
import org.openhab.core.voice.TTSException;
import org.openhab.core.voice.TTSService;
import org.openhab.core.voice.Voice;
@ -67,11 +66,11 @@ import com.google.gson.JsonSyntaxException;
* @author Gwendal Roulleau - Initial contribution
*/
@Component(configurationPid = MimicTTSService.SERVICE_PID, property = Constants.SERVICE_PID + "="
+ MimicTTSService.SERVICE_PID)
+ MimicTTSService.SERVICE_PID, service = TTSService.class)
@ConfigurableService(category = MimicTTSService.SERVICE_CATEGORY, label = MimicTTSService.SERVICE_NAME
+ " Text-to-Speech", description_uri = MimicTTSService.SERVICE_CATEGORY + ":" + MimicTTSService.SERVICE_ID)
@NonNullByDefault
public class MimicTTSService implements TTSService {
public class MimicTTSService extends AbstractCachedTTSService {
private final Logger logger = LoggerFactory.getLogger(MimicTTSService.class);
@ -84,7 +83,6 @@ public class MimicTTSService implements TTSService {
* Configuration parameters
*/
private static final String PARAM_URL = "url";
private static final String PARAM_WORKAROUNDSERVLETSINK = "workaroundServletSink";
private static final String PARAM_SPEAKINGRATE = "speakingRate";
private static final String PARAM_AUDIOVOLATITLITY = "audioVolatility";
private static final String PARAM_PHONEMEVOLATITLITY = "phonemeVolatility";
@ -108,7 +106,9 @@ public class MimicTTSService implements TTSService {
private final HttpClient httpClient;
@Activate
public MimicTTSService(final @Reference HttpClientFactory httpClientFactory, Map<String, Object> config) {
public MimicTTSService(final @Reference HttpClientFactory httpClientFactory, @Reference TTSCache ttsCache,
Map<String, Object> config) {
super(ttsCache);
updateConfig(config);
this.httpClient = httpClientFactory.getCommonHttpClient();
}
@ -130,12 +130,6 @@ public class MimicTTSService implements TTSService {
config.url = param.toString();
}
// workaround
param = newConfig.get(PARAM_WORKAROUNDSERVLETSINK);
if (param != null) {
config.workaroundServletSink = Boolean.parseBoolean(param.toString());
}
// audio volatility
try {
param = newConfig.get(PARAM_AUDIOVOLATITLITY);
@ -227,8 +221,7 @@ public class MimicTTSService implements TTSService {
* @throws TTSException in case the service is unavailable or a parameter is invalid.
*/
@Override
public AudioStream synthesize(String text, Voice voice, AudioFormat requestedFormat) throws TTSException {
public AudioStream synthesizeForCache(String text, Voice voice, AudioFormat requestedFormat) throws TTSException {
if (!availableVoices.contains(voice)) {
// give the service a chance to update:
refreshVoices();
@ -294,24 +287,7 @@ public class MimicTTSService implements TTSService {
}
InputStream inputStreamFromMimic = inputStreamResponseListener.getInputStream();
try {
if (!config.workaroundServletSink) {
return new InputStreamAudioStream(inputStreamFromMimic, AUDIO_FORMAT, length);
} else {
// Some audio sinks use the openHAB servlet to get audio. This servlet requires the
// getClonedStream()
// method.
// So we cache the file on disk, thus implementing the method thanks to FileAudioStream.
return createTemporaryFile(inputStreamFromMimic, AUDIO_FORMAT);
}
} catch (TTSException e) {
try {
inputStreamFromMimic.close();
} catch (IOException e1) {
}
throw e;
}
return new InputStreamAudioStream(inputStreamFromMimic, AUDIO_FORMAT, length);
} else {
String errorMessage = "Cannot get wav from mimic url " + urlTTS + " with HTTP response code "
+ response.getStatus() + " for reason " + response.getReason();
@ -325,16 +301,16 @@ public class MimicTTSService implements TTSService {
}
}
private AudioStream createTemporaryFile(InputStream inputStream, AudioFormat audioFormat) throws TTSException {
File mimicDirectory = new File(OpenHAB.getUserDataFolder(), "mimic");
mimicDirectory.mkdir();
@Override
public String getCacheKey(String text, Voice voice, AudioFormat requestedFormat) {
MessageDigest md;
try {
File tempFile = File.createTempFile(UUID.randomUUID().toString(), ".wav", mimicDirectory);
tempFile.deleteOnExit();
Files.copy(inputStream, tempFile.toPath(), StandardCopyOption.REPLACE_EXISTING);
return new AutoDeleteFileAudioStream(tempFile, audioFormat);
} catch (AudioException | IOException e) {
throw new TTSException("Cannot create temporary audio file", e);
md = MessageDigest.getInstance("MD5");
} catch (NoSuchAlgorithmException e) {
return "nomd5algorithm";
}
byte[] binaryKey = ((text + voice.getUID() + requestedFormat.toString() + config.speakingRate
+ config.audioVolatility + config.phonemeVolatility).getBytes());
return String.format("%032x", new BigInteger(1, md.digest(binaryKey)));
}
}

View File

@ -11,12 +11,6 @@
<description>Mimic 3 URL.</description>
<default>http://localhost:59125</default>
</parameter>
<parameter name="workaroundServletSink" type="boolean" required="false">
<label>Workaround For Servlet-Based Audiosink</label>
<description>Enable this workaround to store temporarily the file on disk. Needed if you play on audiosink based on
the openHAB audio servlet.</description>
<default>false</default>
</parameter>
<parameter name="speakingRate" min="0" max="1" type="decimal" required="false">
<label>Speaking Rate</label>
<description>Controls how fast the voice speaks the text. A value of 1 is the speed of the training dataset. Less

View File

@ -4,8 +4,6 @@ voice.config.mimictts.phonemeVolatility.label = Phoneme Volatility
voice.config.mimictts.phonemeVolatility.description = The amount of noise used to generate phoneme durations (0-1). Allows for variable speaking cadance, with a value closer to 1 being more variable. Multi-speaker models tend to sound better with a lower amount of phoneme variability than single speaker models.
voice.config.mimictts.speakingRate.label = Speaking Rate
voice.config.mimictts.speakingRate.description = Controls how fast the voice speaks the text. A value of 1 is the speed of the training dataset. Less than 1 is faster, and more than 1 is slower.
voice.config.mimictts.workaroundServletSink.label= Workaround For Servlet-Based Audiosink
voice.config.mimictts.workaroundServletSink.description= Enable this workaround to store temporarily the file on disk. Needed if you play on audiosink based on the openHAB audio servlet.
voice.config.mimictts.url.label = URL
voice.config.mimictts.url.description = Mimic 3 URL.