mirror of
https://github.com/openhab/openhab-addons.git
synced 2025-01-25 14:55:55 +01:00
[mimictts] Add LRU cache (#14564)
* [mimictts] Add LRU cache And simplifies code with new core capabilities (no more need to create temporary files implementing FixedLengthAudioStream) --------- Signed-off-by: Gwendal Roulleau <gwendal.roulleau@gmail.com>
This commit is contained in:
parent
287cee32a5
commit
7587e0c504
@ -17,7 +17,6 @@ It supports a subset of SSML, and if you want to use it, be sure to start your t
|
|||||||
Use your favorite configuration UI to edit **Settings / Other Services - Mimic Text-to-Speech** and set:
|
Use your favorite configuration UI to edit **Settings / Other Services - Mimic Text-to-Speech** and set:
|
||||||
|
|
||||||
* **url** - Mimic URL. Defaults to `http://localhost:59125`
|
* **url** - Mimic URL. Defaults to `http://localhost:59125`
|
||||||
* **workaroundServletSink** - A boolean activating a workaround for audiosink using the openHAB servlet. It stores audio file temporarily on disk, allowing the servlet to get a cloned stream as needed. Default false.
|
|
||||||
* **speakingRate** - Controls how fast the voice speaks the text. A value of 1 is the speed of the training dataset. Less than 1 is faster, and more than 1 is slower.
|
* **speakingRate** - Controls how fast the voice speaks the text. A value of 1 is the speed of the training dataset. Less than 1 is faster, and more than 1 is slower.
|
||||||
* **audioVolatility** - The amount of noise added to the generated audio (0-1). Can help mask audio artifacts from the voice model. Multi-speaker models tend to sound better with a lower amount of noise than single speaker models.
|
* **audioVolatility** - The amount of noise added to the generated audio (0-1). Can help mask audio artifacts from the voice model. Multi-speaker models tend to sound better with a lower amount of noise than single speaker models.
|
||||||
* **phonemeVolatility** - The amount of noise used to generate phoneme durations (0-1). Allows for variable speaking cadence, with a value closer to 1 being more variable. Multi-speaker models tend to sound better with a lower amount of phoneme variability than single speaker models.
|
* **phonemeVolatility** - The amount of noise used to generate phoneme durations (0-1). Allows for variable speaking cadence, with a value closer to 1 being more variable. Multi-speaker models tend to sound better with a lower amount of phoneme variability than single speaker models.
|
||||||
@ -48,3 +47,7 @@ In case you would like to setup these settings via a text file, you can edit the
|
|||||||
org.openhab.voice:defaultTTS=mimictts
|
org.openhab.voice:defaultTTS=mimictts
|
||||||
org.openhab.voice:defaultVoice=mimictts:fr_FR_siwis_low
|
org.openhab.voice:defaultVoice=mimictts:fr_FR_siwis_low
|
||||||
```
|
```
|
||||||
|
|
||||||
|
## Caching
|
||||||
|
|
||||||
|
The mimic TTS service uses the openHAB TTS cache to cache audio files produced from the most recent queries in order to reduce traffic, improve performance and reduce the number of requests.
|
||||||
|
@ -1,84 +0,0 @@
|
|||||||
/**
|
|
||||||
* Copyright (c) 2010-2023 Contributors to the openHAB project
|
|
||||||
*
|
|
||||||
* See the NOTICE file(s) distributed with this work for additional
|
|
||||||
* information.
|
|
||||||
*
|
|
||||||
* This program and the accompanying materials are made available under the
|
|
||||||
* terms of the Eclipse Public License 2.0 which is available at
|
|
||||||
* http://www.eclipse.org/legal/epl-2.0
|
|
||||||
*
|
|
||||||
* SPDX-License-Identifier: EPL-2.0
|
|
||||||
*/
|
|
||||||
package org.openhab.voice.mimic.internal;
|
|
||||||
|
|
||||||
import java.io.File;
|
|
||||||
import java.io.IOException;
|
|
||||||
import java.io.InputStream;
|
|
||||||
import java.util.ArrayList;
|
|
||||||
import java.util.List;
|
|
||||||
|
|
||||||
import org.eclipse.jdt.annotation.NonNullByDefault;
|
|
||||||
import org.openhab.core.audio.AudioException;
|
|
||||||
import org.openhab.core.audio.AudioFormat;
|
|
||||||
import org.openhab.core.audio.FileAudioStream;
|
|
||||||
|
|
||||||
/**
|
|
||||||
* A FileAudioStream that autodelete after it and its clone are closed
|
|
||||||
* Useful to not congest temporary directory
|
|
||||||
*
|
|
||||||
* @author Gwendal Roulleau - Initial contribution
|
|
||||||
*/
|
|
||||||
@NonNullByDefault
|
|
||||||
public class AutoDeleteFileAudioStream extends FileAudioStream {
|
|
||||||
|
|
||||||
private final File file;
|
|
||||||
private final AudioFormat audioFormat;
|
|
||||||
private final List<ClonedFileInputStream> clonedAudioStreams = new ArrayList<>(1);
|
|
||||||
private boolean isOpen = true;
|
|
||||||
|
|
||||||
public AutoDeleteFileAudioStream(File file, AudioFormat format) throws AudioException {
|
|
||||||
super(file, format);
|
|
||||||
this.file = file;
|
|
||||||
this.audioFormat = format;
|
|
||||||
}
|
|
||||||
|
|
||||||
@Override
|
|
||||||
public void close() throws IOException {
|
|
||||||
super.close();
|
|
||||||
this.isOpen = false;
|
|
||||||
deleteIfPossible();
|
|
||||||
}
|
|
||||||
|
|
||||||
protected void deleteIfPossible() {
|
|
||||||
boolean aClonedStreamIsOpen = clonedAudioStreams.stream().anyMatch(as -> as.isOpen);
|
|
||||||
if (!isOpen && !aClonedStreamIsOpen) {
|
|
||||||
file.delete();
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
@Override
|
|
||||||
public InputStream getClonedStream() throws AudioException {
|
|
||||||
ClonedFileInputStream clonedInputStream = new ClonedFileInputStream(this, file, audioFormat);
|
|
||||||
clonedAudioStreams.add(clonedInputStream);
|
|
||||||
return clonedInputStream;
|
|
||||||
}
|
|
||||||
|
|
||||||
private static class ClonedFileInputStream extends FileAudioStream {
|
|
||||||
protected boolean isOpen = true;
|
|
||||||
private final AutoDeleteFileAudioStream parent;
|
|
||||||
|
|
||||||
public ClonedFileInputStream(AutoDeleteFileAudioStream parent, File file, AudioFormat audioFormat)
|
|
||||||
throws AudioException {
|
|
||||||
super(file, audioFormat);
|
|
||||||
this.parent = parent;
|
|
||||||
}
|
|
||||||
|
|
||||||
@Override
|
|
||||||
public void close() throws IOException {
|
|
||||||
super.close();
|
|
||||||
this.isOpen = false;
|
|
||||||
parent.deleteIfPossible();
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
@ -18,9 +18,9 @@ import java.io.OutputStream;
|
|||||||
|
|
||||||
import org.eclipse.jdt.annotation.NonNullByDefault;
|
import org.eclipse.jdt.annotation.NonNullByDefault;
|
||||||
import org.eclipse.jdt.annotation.Nullable;
|
import org.eclipse.jdt.annotation.Nullable;
|
||||||
import org.openhab.core.audio.AudioException;
|
|
||||||
import org.openhab.core.audio.AudioFormat;
|
import org.openhab.core.audio.AudioFormat;
|
||||||
import org.openhab.core.audio.FixedLengthAudioStream;
|
import org.openhab.core.audio.AudioStream;
|
||||||
|
import org.openhab.core.audio.SizeableAudioStream;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* An AudioStream with an {@link InputStream} inside
|
* An AudioStream with an {@link InputStream} inside
|
||||||
@ -28,7 +28,7 @@ import org.openhab.core.audio.FixedLengthAudioStream;
|
|||||||
* @author Gwendal Roulleau - Initial contribution
|
* @author Gwendal Roulleau - Initial contribution
|
||||||
*/
|
*/
|
||||||
@NonNullByDefault
|
@NonNullByDefault
|
||||||
public class InputStreamAudioStream extends FixedLengthAudioStream {
|
public class InputStreamAudioStream extends AudioStream implements SizeableAudioStream {
|
||||||
|
|
||||||
public InputStream innerInputStream;
|
public InputStream innerInputStream;
|
||||||
public AudioFormat audioFormat;
|
public AudioFormat audioFormat;
|
||||||
@ -115,9 +115,4 @@ public class InputStreamAudioStream extends FixedLengthAudioStream {
|
|||||||
public long length() {
|
public long length() {
|
||||||
return length;
|
return length;
|
||||||
}
|
}
|
||||||
|
|
||||||
@Override
|
|
||||||
public InputStream getClonedStream() throws AudioException {
|
|
||||||
throw new AudioException("Operation not supported");
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
@ -25,5 +25,4 @@ public class MimicConfiguration {
|
|||||||
public Double speakingRate = 1.0;
|
public Double speakingRate = 1.0;
|
||||||
public Double audioVolatility = 0.667;
|
public Double audioVolatility = 0.667;
|
||||||
public Double phonemeVolatility = 0.8;
|
public Double phonemeVolatility = 0.8;
|
||||||
public Boolean workaroundServletSink = false;
|
|
||||||
}
|
}
|
||||||
|
@ -12,20 +12,19 @@
|
|||||||
*/
|
*/
|
||||||
package org.openhab.voice.mimic.internal;
|
package org.openhab.voice.mimic.internal;
|
||||||
|
|
||||||
import java.io.File;
|
|
||||||
import java.io.IOException;
|
import java.io.IOException;
|
||||||
import java.io.InputStream;
|
import java.io.InputStream;
|
||||||
import java.io.UnsupportedEncodingException;
|
import java.io.UnsupportedEncodingException;
|
||||||
|
import java.math.BigInteger;
|
||||||
import java.net.URLEncoder;
|
import java.net.URLEncoder;
|
||||||
import java.nio.charset.StandardCharsets;
|
import java.nio.charset.StandardCharsets;
|
||||||
import java.nio.file.Files;
|
import java.security.MessageDigest;
|
||||||
import java.nio.file.StandardCopyOption;
|
import java.security.NoSuchAlgorithmException;
|
||||||
import java.util.HashSet;
|
import java.util.HashSet;
|
||||||
import java.util.List;
|
import java.util.List;
|
||||||
import java.util.Locale;
|
import java.util.Locale;
|
||||||
import java.util.Map;
|
import java.util.Map;
|
||||||
import java.util.Set;
|
import java.util.Set;
|
||||||
import java.util.UUID;
|
|
||||||
import java.util.concurrent.ExecutionException;
|
import java.util.concurrent.ExecutionException;
|
||||||
import java.util.concurrent.TimeUnit;
|
import java.util.concurrent.TimeUnit;
|
||||||
import java.util.concurrent.TimeoutException;
|
import java.util.concurrent.TimeoutException;
|
||||||
@ -38,13 +37,13 @@ import org.eclipse.jetty.client.util.InputStreamResponseListener;
|
|||||||
import org.eclipse.jetty.client.util.StringContentProvider;
|
import org.eclipse.jetty.client.util.StringContentProvider;
|
||||||
import org.eclipse.jetty.http.HttpHeader;
|
import org.eclipse.jetty.http.HttpHeader;
|
||||||
import org.eclipse.jetty.http.HttpStatus;
|
import org.eclipse.jetty.http.HttpStatus;
|
||||||
import org.openhab.core.OpenHAB;
|
|
||||||
import org.openhab.core.audio.AudioException;
|
|
||||||
import org.openhab.core.audio.AudioFormat;
|
import org.openhab.core.audio.AudioFormat;
|
||||||
import org.openhab.core.audio.AudioStream;
|
import org.openhab.core.audio.AudioStream;
|
||||||
import org.openhab.core.config.core.ConfigurableService;
|
import org.openhab.core.config.core.ConfigurableService;
|
||||||
import org.openhab.core.io.net.http.HttpClientFactory;
|
import org.openhab.core.io.net.http.HttpClientFactory;
|
||||||
import org.openhab.core.io.net.http.HttpRequestBuilder;
|
import org.openhab.core.io.net.http.HttpRequestBuilder;
|
||||||
|
import org.openhab.core.voice.AbstractCachedTTSService;
|
||||||
|
import org.openhab.core.voice.TTSCache;
|
||||||
import org.openhab.core.voice.TTSException;
|
import org.openhab.core.voice.TTSException;
|
||||||
import org.openhab.core.voice.TTSService;
|
import org.openhab.core.voice.TTSService;
|
||||||
import org.openhab.core.voice.Voice;
|
import org.openhab.core.voice.Voice;
|
||||||
@ -67,11 +66,11 @@ import com.google.gson.JsonSyntaxException;
|
|||||||
* @author Gwendal Roulleau - Initial contribution
|
* @author Gwendal Roulleau - Initial contribution
|
||||||
*/
|
*/
|
||||||
@Component(configurationPid = MimicTTSService.SERVICE_PID, property = Constants.SERVICE_PID + "="
|
@Component(configurationPid = MimicTTSService.SERVICE_PID, property = Constants.SERVICE_PID + "="
|
||||||
+ MimicTTSService.SERVICE_PID)
|
+ MimicTTSService.SERVICE_PID, service = TTSService.class)
|
||||||
@ConfigurableService(category = MimicTTSService.SERVICE_CATEGORY, label = MimicTTSService.SERVICE_NAME
|
@ConfigurableService(category = MimicTTSService.SERVICE_CATEGORY, label = MimicTTSService.SERVICE_NAME
|
||||||
+ " Text-to-Speech", description_uri = MimicTTSService.SERVICE_CATEGORY + ":" + MimicTTSService.SERVICE_ID)
|
+ " Text-to-Speech", description_uri = MimicTTSService.SERVICE_CATEGORY + ":" + MimicTTSService.SERVICE_ID)
|
||||||
@NonNullByDefault
|
@NonNullByDefault
|
||||||
public class MimicTTSService implements TTSService {
|
public class MimicTTSService extends AbstractCachedTTSService {
|
||||||
|
|
||||||
private final Logger logger = LoggerFactory.getLogger(MimicTTSService.class);
|
private final Logger logger = LoggerFactory.getLogger(MimicTTSService.class);
|
||||||
|
|
||||||
@ -84,7 +83,6 @@ public class MimicTTSService implements TTSService {
|
|||||||
* Configuration parameters
|
* Configuration parameters
|
||||||
*/
|
*/
|
||||||
private static final String PARAM_URL = "url";
|
private static final String PARAM_URL = "url";
|
||||||
private static final String PARAM_WORKAROUNDSERVLETSINK = "workaroundServletSink";
|
|
||||||
private static final String PARAM_SPEAKINGRATE = "speakingRate";
|
private static final String PARAM_SPEAKINGRATE = "speakingRate";
|
||||||
private static final String PARAM_AUDIOVOLATITLITY = "audioVolatility";
|
private static final String PARAM_AUDIOVOLATITLITY = "audioVolatility";
|
||||||
private static final String PARAM_PHONEMEVOLATITLITY = "phonemeVolatility";
|
private static final String PARAM_PHONEMEVOLATITLITY = "phonemeVolatility";
|
||||||
@ -108,7 +106,9 @@ public class MimicTTSService implements TTSService {
|
|||||||
private final HttpClient httpClient;
|
private final HttpClient httpClient;
|
||||||
|
|
||||||
@Activate
|
@Activate
|
||||||
public MimicTTSService(final @Reference HttpClientFactory httpClientFactory, Map<String, Object> config) {
|
public MimicTTSService(final @Reference HttpClientFactory httpClientFactory, @Reference TTSCache ttsCache,
|
||||||
|
Map<String, Object> config) {
|
||||||
|
super(ttsCache);
|
||||||
updateConfig(config);
|
updateConfig(config);
|
||||||
this.httpClient = httpClientFactory.getCommonHttpClient();
|
this.httpClient = httpClientFactory.getCommonHttpClient();
|
||||||
}
|
}
|
||||||
@ -130,12 +130,6 @@ public class MimicTTSService implements TTSService {
|
|||||||
config.url = param.toString();
|
config.url = param.toString();
|
||||||
}
|
}
|
||||||
|
|
||||||
// workaround
|
|
||||||
param = newConfig.get(PARAM_WORKAROUNDSERVLETSINK);
|
|
||||||
if (param != null) {
|
|
||||||
config.workaroundServletSink = Boolean.parseBoolean(param.toString());
|
|
||||||
}
|
|
||||||
|
|
||||||
// audio volatility
|
// audio volatility
|
||||||
try {
|
try {
|
||||||
param = newConfig.get(PARAM_AUDIOVOLATITLITY);
|
param = newConfig.get(PARAM_AUDIOVOLATITLITY);
|
||||||
@ -227,8 +221,7 @@ public class MimicTTSService implements TTSService {
|
|||||||
* @throws TTSException in case the service is unavailable or a parameter is invalid.
|
* @throws TTSException in case the service is unavailable or a parameter is invalid.
|
||||||
*/
|
*/
|
||||||
@Override
|
@Override
|
||||||
public AudioStream synthesize(String text, Voice voice, AudioFormat requestedFormat) throws TTSException {
|
public AudioStream synthesizeForCache(String text, Voice voice, AudioFormat requestedFormat) throws TTSException {
|
||||||
|
|
||||||
if (!availableVoices.contains(voice)) {
|
if (!availableVoices.contains(voice)) {
|
||||||
// let a chance for the service to update :
|
// let a chance for the service to update :
|
||||||
refreshVoices();
|
refreshVoices();
|
||||||
@ -294,24 +287,7 @@ public class MimicTTSService implements TTSService {
|
|||||||
}
|
}
|
||||||
|
|
||||||
InputStream inputStreamFromMimic = inputStreamResponseListener.getInputStream();
|
InputStream inputStreamFromMimic = inputStreamResponseListener.getInputStream();
|
||||||
try {
|
|
||||||
if (!config.workaroundServletSink) {
|
|
||||||
return new InputStreamAudioStream(inputStreamFromMimic, AUDIO_FORMAT, length);
|
return new InputStreamAudioStream(inputStreamFromMimic, AUDIO_FORMAT, length);
|
||||||
} else {
|
|
||||||
// Some audio sinks use the openHAB servlet to get audio. This servlet require the
|
|
||||||
// getClonedStream()
|
|
||||||
// method
|
|
||||||
// So we cache the file on disk, thus implementing the method thanks to FileAudioStream.
|
|
||||||
return createTemporaryFile(inputStreamFromMimic, AUDIO_FORMAT);
|
|
||||||
}
|
|
||||||
} catch (TTSException e) {
|
|
||||||
try {
|
|
||||||
inputStreamFromMimic.close();
|
|
||||||
} catch (IOException e1) {
|
|
||||||
}
|
|
||||||
throw e;
|
|
||||||
}
|
|
||||||
|
|
||||||
} else {
|
} else {
|
||||||
String errorMessage = "Cannot get wav from mimic url " + urlTTS + " with HTTP response code "
|
String errorMessage = "Cannot get wav from mimic url " + urlTTS + " with HTTP response code "
|
||||||
+ response.getStatus() + " for reason " + response.getReason();
|
+ response.getStatus() + " for reason " + response.getReason();
|
||||||
@ -325,16 +301,16 @@ public class MimicTTSService implements TTSService {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
private AudioStream createTemporaryFile(InputStream inputStream, AudioFormat audioFormat) throws TTSException {
|
@Override
|
||||||
File mimicDirectory = new File(OpenHAB.getUserDataFolder(), "mimic");
|
public String getCacheKey(String text, Voice voice, AudioFormat requestedFormat) {
|
||||||
mimicDirectory.mkdir();
|
MessageDigest md;
|
||||||
try {
|
try {
|
||||||
File tempFile = File.createTempFile(UUID.randomUUID().toString(), ".wav", mimicDirectory);
|
md = MessageDigest.getInstance("MD5");
|
||||||
tempFile.deleteOnExit();
|
} catch (NoSuchAlgorithmException e) {
|
||||||
Files.copy(inputStream, tempFile.toPath(), StandardCopyOption.REPLACE_EXISTING);
|
return "nomd5algorithm";
|
||||||
return new AutoDeleteFileAudioStream(tempFile, audioFormat);
|
|
||||||
} catch (AudioException | IOException e) {
|
|
||||||
throw new TTSException("Cannot create temporary audio file", e);
|
|
||||||
}
|
}
|
||||||
|
byte[] binaryKey = ((text + voice.getUID() + requestedFormat.toString() + config.speakingRate
|
||||||
|
+ config.audioVolatility + config.phonemeVolatility).getBytes());
|
||||||
|
return String.format("%032x", new BigInteger(1, md.digest(binaryKey)));
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -11,12 +11,6 @@
|
|||||||
<description>Mimic 3 URL.</description>
|
<description>Mimic 3 URL.</description>
|
||||||
<default>http://localhost:59125</default>
|
<default>http://localhost:59125</default>
|
||||||
</parameter>
|
</parameter>
|
||||||
<parameter name="workaroundServletSink" type="boolean" required="false">
|
|
||||||
<label>Workaround For Servlet-Based Audiosink</label>
|
|
||||||
<description>Enable this workaround to store temporarily the file on disk. Needed if you play on audiosink based on
|
|
||||||
the openHAB audio servlet.</description>
|
|
||||||
<default>false</default>
|
|
||||||
</parameter>
|
|
||||||
<parameter name="speakingRate" min="0" max="1" type="decimal" required="false">
|
<parameter name="speakingRate" min="0" max="1" type="decimal" required="false">
|
||||||
<label>Speaking Rate</label>
|
<label>Speaking Rate</label>
|
||||||
<description>Controls how fast the voice speaks the text. A value of 1 is the speed of the training dataset. Less
|
<description>Controls how fast the voice speaks the text. A value of 1 is the speed of the training dataset. Less
|
||||||
|
@ -4,8 +4,6 @@ voice.config.mimictts.phonemeVolatility.label = Phoneme Volatility
|
|||||||
voice.config.mimictts.phonemeVolatility.description = The amount of noise used to generate phoneme durations (0-1). Allows for variable speaking cadance, with a value closer to 1 being more variable. Multi-speaker models tend to sound better with a lower amount of phoneme variability than single speaker models.
|
voice.config.mimictts.phonemeVolatility.description = The amount of noise used to generate phoneme durations (0-1). Allows for variable speaking cadance, with a value closer to 1 being more variable. Multi-speaker models tend to sound better with a lower amount of phoneme variability than single speaker models.
|
||||||
voice.config.mimictts.speakingRate.label = Speaking Rate
|
voice.config.mimictts.speakingRate.label = Speaking Rate
|
||||||
voice.config.mimictts.speakingRate.description = Controls how fast the voice speaks the text. A value of 1 is the speed of the training dataset. Less than 1 is faster, and more than 1 is slower.
|
voice.config.mimictts.speakingRate.description = Controls how fast the voice speaks the text. A value of 1 is the speed of the training dataset. Less than 1 is faster, and more than 1 is slower.
|
||||||
voice.config.mimictts.workaroundServletSink.label= Workaround For Servlet-Based Audiosink
|
|
||||||
voice.config.mimictts.workaroundServletSink.description= Enable this workaround to store temporarily the file on disk. Needed if you play on audiosink based on the openHAB audio servlet.
|
|
||||||
voice.config.mimictts.url.label = URL
|
voice.config.mimictts.url.label = URL
|
||||||
voice.config.mimictts.url.description = Mimic 3 URL.
|
voice.config.mimictts.url.description = Mimic 3 URL.
|
||||||
|
|
||||||
|
Loading…
Reference in New Issue
Block a user