[googletts] Replace custom TTS cache with common TTS cache (#15208)

* [googletts] Replace custom TTS cache with common TTS cache

--------

Signed-off-by: Gwendal Roulleau <gwendal.roulleau@gmail.com>
This commit is contained in:
Gwendal Roulleau 2023-07-11 00:29:13 +02:00 committed by GitHub
parent 72c0e1f29f
commit 2899421ec3
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
6 changed files with 38 additions and 170 deletions

View File

@ -2,8 +2,7 @@
Google Cloud TTS Service uses the non-free Google Cloud Text-to-Speech API to convert text or Speech Synthesis Markup Language (SSML) input into audio data of natural human speech.
It provides multiple voices, available in different languages and variants, and applies DeepMind's groundbreaking research in WaveNet and Google's powerful neural networks.
The implementation caches the converted texts to reduce the load on the API and make the conversion faster.
You can find them in the `$OPENHAB_USERDATA/cache/org.openhab.voice.googletts` folder.
The Google Cloud TTS service uses the openHAB TTS cache to cache audio files produced from the most recent queries in order to reduce traffic, improve performance and reduce the number of requests.
Be aware that using this service may incur costs on your Google Cloud account.
You can find pricing information on the [documentation page](https://cloud.google.com/text-to-speech/#pricing-summary).
@ -47,10 +46,6 @@ It is recommended to clear this configuration parameter afterwards.
* **Pitch** - The pitch of selected voice, up to 20 semitones.
* **Volume Gain** - The volume of the output between 16dB and -96dB.
* **Speaking Rate** - The speaking rate can be 4x faster or slower than the normal rate.
* **Purge Cache** - Purges the cache e.g. after testing different voice configuration parameters.
When enabled the cache is purged once.
Make sure to disable this setting again so the cache is maintained after restarts.
In case you would like to set up the service via a text file, create a new file in `$OPENHAB_ROOT/conf/services` named `googletts.cfg`.
@ -63,7 +58,6 @@ org.openhab.voice.googletts:authcode=XXXXX
org.openhab.voice.googletts:pitch=0
org.openhab.voice.googletts:volumeGain=0
org.openhab.voice.googletts:speakingRate=1
org.openhab.voice.googletts:purgeCache=false
```
### Default Text-to-Speech and Voice Configuration

View File

@ -12,17 +12,9 @@
*/
package org.openhab.voice.googletts.internal;
import java.io.File;
import java.io.FileNotFoundException;
import java.io.FileOutputStream;
import java.io.IOException;
import java.math.BigInteger;
import java.nio.charset.StandardCharsets;
import java.nio.file.Files;
import java.security.MessageDigest;
import java.security.NoSuchAlgorithmException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Base64;
import java.util.Dictionary;
import java.util.HashMap;
@ -69,10 +61,6 @@ import com.google.gson.JsonSyntaxException;
*/
class GoogleCloudAPI {
private static final char EXTENSION_SEPARATOR = '.';
private static final char UNIX_SEPARATOR = '/';
private static final char WINDOWS_SEPARATOR = '\\';
private static final String BEARER = "Bearer ";
private static final String GCP_AUTH_URI = "https://accounts.google.com/o/oauth2/auth";
@ -103,11 +91,6 @@ class GoogleCloudAPI {
*/
private final Map<Locale, Set<GoogleTTSVoice>> voices = new HashMap<>();
/**
* Cache folder
*/
private File cacheFolder;
/**
* Configuration
*/
@ -122,12 +105,10 @@ class GoogleCloudAPI {
/**
* Constructor.
*
* @param cacheFolder Service cache folder
*/
GoogleCloudAPI(ConfigurationAdmin configAdmin, OAuthFactory oAuthFactory, File cacheFolder) {
GoogleCloudAPI(ConfigurationAdmin configAdmin, OAuthFactory oAuthFactory) {
this.configAdmin = configAdmin;
this.oAuthFactory = oAuthFactory;
this.cacheFolder = cacheFolder;
}
/**
@ -161,15 +142,6 @@ class GoogleCloudAPI {
} else {
voices.clear();
}
// maintain cache
if (config.purgeCache) {
File[] files = cacheFolder.listFiles();
if (files != null && files.length > 0) {
Arrays.stream(files).forEach(File::delete);
}
logger.debug("Cache purged.");
}
}
public void dispose() {
@ -341,34 +313,21 @@ class GoogleCloudAPI {
* @param codec Requested codec
* @return String array of Google audio format and the file extension to use.
*/
private String[] getFormatForCodec(String codec) {
private String getFormatForCodec(String codec) {
switch (codec) {
case AudioFormat.CODEC_MP3:
return new String[] { AudioEncoding.MP3.toString(), "mp3" };
return AudioEncoding.MP3.toString();
case AudioFormat.CODEC_PCM_SIGNED:
return new String[] { AudioEncoding.LINEAR16.toString(), "wav" };
return AudioEncoding.LINEAR16.toString();
default:
throw new IllegalArgumentException("Audio format " + codec + " is not yet supported");
}
}
public byte[] synthesizeSpeech(String text, GoogleTTSVoice voice, String codec) {
String[] format = getFormatForCodec(codec);
String fileNameInCache = getUniqueFilenameForText(text, voice.getTechnicalName());
File audioFileInCache = new File(cacheFolder, fileNameInCache + "." + format[1]);
String format = getFormatForCodec(codec);
try {
// check if in cache
if (audioFileInCache.exists()) {
logger.debug("Audio file {} was found in cache.", audioFileInCache.getName());
return Files.readAllBytes(audioFileInCache.toPath());
}
// if not in cache, get audio data and put to cache
byte[] audio = synthesizeSpeechByGoogle(text, voice, format[0]);
if (audio != null) {
saveAudioAndTextToFile(text, audioFileInCache, audio, voice.getTechnicalName());
}
return audio;
return synthesizeSpeechByGoogle(text, voice, format);
} catch (AuthenticationException | CommunicationException e) {
logger.warn("Error initializing Google Cloud TTS service: {}", e.getMessage());
if (oAuthService != null) {
@ -376,62 +335,10 @@ class GoogleCloudAPI {
oAuthService = null;
}
voices.clear();
} catch (FileNotFoundException e) {
logger.warn("Could not write file {} to cache: {}", audioFileInCache, e.getMessage());
} catch (IOException e) {
logger.debug("An unexpected IOException occurred: {}", e.getMessage());
}
return null;
}
/**
 * Stores a synthesized audio clip in the cache folder together with a companion
 * text file describing it.
 * <p>
 * The audio bytes are written to {@code cacheFile}. A second file with the same
 * base name and the extension {@code .txt} is written to the cache folder; it
 * holds the configuration string, the voice name and the original text, so it
 * is possible to tell which content belongs to which audio file.
 *
 * @param text Converted text.
 * @param cacheFile Cache entry file receiving the audio bytes.
 * @param audio Byte array of the audio.
 * @param voiceName Used voice
 * @throws FileNotFoundException if one of the files cannot be opened for writing
 * @throws IOException in case of file handling exceptions
 */
private void saveAudioAndTextToFile(String text, File cacheFile, byte[] audio, String voiceName)
        throws IOException, FileNotFoundException {
    logger.debug("Caching audio file {}", cacheFile.getName());
    try (FileOutputStream audioOut = new FileOutputStream(cacheFile)) {
        audioOut.write(audio);
    }

    // companion description file: same base name, ".txt" extension
    String textFileName = removeExtension(cacheFile.getName()) + ".txt";
    logger.debug("Caching text file {}", textFileName);
    File textFile = new File(cacheFolder, textFileName);
    try (FileOutputStream textOut = new FileOutputStream(textFile)) {
        String lineBreak = System.lineSeparator();
        String description = "Config: " + config.toConfigString() + ",voice=" + voiceName + lineBreak
                + "Text: " + text + lineBreak;
        textOut.write(description.getBytes(StandardCharsets.UTF_8));
    }
}
/**
 * Removes the extension of a file name.
 * <p>
 * The extension is everything from the last {@code EXTENSION_SEPARATOR} onwards.
 * The name is returned unchanged when it contains no extension separator at all,
 * or when the last extension separator is located before the last path separator
 * (i.e. the dot belongs to a directory name, not to the file itself).
 *
 * @param fileName the file name to remove the extension of
 * @return the filename without the extension
 */
private String removeExtension(String fileName) {
    int extensionPos = fileName.lastIndexOf(EXTENSION_SEPARATOR);
    int lastSeparator = Math.max(fileName.lastIndexOf(UNIX_SEPARATOR), fileName.lastIndexOf(WINDOWS_SEPARATOR));
    // Without the explicit "< 0" check a name with neither a dot nor a path
    // separator (both indices are -1) would reach substring(0, -1) and throw.
    if (extensionPos < 0 || lastSeparator > extensionPos) {
        return fileName;
    }
    return fileName.substring(0, extensionPos);
}
/**
* Call Google service to synthesize the required text
*
@ -476,25 +383,6 @@ class GoogleCloudAPI {
return null;
}
/**
 * Gets a unique filename for a given text, by creating a MD5 hash of the
 * configuration string concatenated with the text. The hash is rendered as a
 * 32-digit, zero-padded hexadecimal number and is preceded by the given voice
 * name and an underscore.
 * <p>
 * Sample: "en-US_00a2653ac5f77063bc4ea2fee87318d3"
 *
 * @param text the text to create the file name for
 * @param voiceName the name used as prefix of the file name
 * @return the file name (without extension), or {@code null} if the MD5
 *         algorithm is not available
 */
private String getUniqueFilenameForText(String text, String voiceName) {
try {
MessageDigest md = MessageDigest.getInstance("MD5");
// the configuration is part of the hashed input, so a different configuration yields a different name
byte[] bytesOfMessage = (config.toConfigString() + text).getBytes(StandardCharsets.UTF_8);
String fileNameHash = String.format("%032x", new BigInteger(1, md.digest(bytesOfMessage)));
return voiceName + "_" + fileNameHash;
} catch (NoSuchAlgorithmException e) {
// should not happen
logger.error("Could not create MD5 hash for '{}'", text, e);
return null;
}
}
/**
 * Reports whether an OAuth service instance is currently held.
 *
 * @return {@code true} if {@code oAuthService} is not {@code null}
 */
boolean isInitialized() {
return oAuthService != null;
}

View File

@ -44,15 +44,10 @@ class GoogleTTSConfig {
*/
public Double speakingRate = 1d;
/**
* Purge cache after configuration changes.
*/
public Boolean purgeCache = Boolean.FALSE;
@Override
public String toString() {
return "GoogleTTSConfig{pitch=" + pitch + ", speakingRate=" + speakingRate + ", volumeGainDb=" + volumeGainDb
+ ", purgeCache=" + purgeCache + '}';
+ '}';
}
String toConfigString() {

View File

@ -15,24 +15,29 @@ package org.openhab.voice.googletts.internal;
import static org.openhab.voice.googletts.internal.GoogleTTSService.*;
import java.io.ByteArrayInputStream;
import java.io.File;
import java.io.IOException;
import java.io.InputStream;
import java.math.BigInteger;
import java.nio.charset.StandardCharsets;
import java.security.MessageDigest;
import java.security.NoSuchAlgorithmException;
import java.util.Collections;
import java.util.HashSet;
import java.util.Locale;
import java.util.Map;
import java.util.Set;
import org.eclipse.jdt.annotation.NonNull;
import org.eclipse.jdt.annotation.NonNullByDefault;
import org.eclipse.jdt.annotation.Nullable;
import org.openhab.core.OpenHAB;
import org.openhab.core.audio.AudioFormat;
import org.openhab.core.audio.AudioStream;
import org.openhab.core.audio.ByteArrayAudioStream;
import org.openhab.core.audio.utils.AudioWaveUtils;
import org.openhab.core.auth.client.oauth2.OAuthFactory;
import org.openhab.core.config.core.ConfigurableService;
import org.openhab.core.voice.AbstractCachedTTSService;
import org.openhab.core.voice.TTSCache;
import org.openhab.core.voice.TTSException;
import org.openhab.core.voice.TTSService;
import org.openhab.core.voice.Voice;
@ -52,10 +57,11 @@ import org.slf4j.LoggerFactory;
*
* @author Gabor Bicskei - Initial contribution
*/
@Component(configurationPid = SERVICE_PID, property = Constants.SERVICE_PID + "=" + SERVICE_PID)
@Component(configurationPid = SERVICE_PID, property = Constants.SERVICE_PID + "="
+ SERVICE_PID, service = TTSService.class)
@ConfigurableService(category = SERVICE_CATEGORY, label = SERVICE_NAME
+ " Text-to-Speech", description_uri = SERVICE_CATEGORY + ":" + SERVICE_ID)
public class GoogleTTSService implements TTSService {
public class GoogleTTSService extends AbstractCachedTTSService {
/**
* Service name
*/
@ -76,11 +82,6 @@ public class GoogleTTSService implements TTSService {
*/
static final String SERVICE_PID = "org.openhab." + SERVICE_CATEGORY + "." + SERVICE_ID;
/**
* Cache folder under $userdata
*/
private static final String CACHE_FOLDER_NAME = "cache";
/**
* Configuration parameters
*/
@ -90,7 +91,6 @@ public class GoogleTTSService implements TTSService {
private static final String PARAM_PITCH = "pitch";
private static final String PARAM_SPEAKING_RATE = "speakingRate";
private static final String PARAM_VOLUME_GAIN_DB = "volumeGainDb";
private static final String PARAM_PURGE_CACHE = "purgeCache";
/**
* Logger.
@ -117,8 +117,9 @@ public class GoogleTTSService implements TTSService {
private final GoogleTTSConfig config = new GoogleTTSConfig();
@Activate
public GoogleTTSService(final @Reference ConfigurationAdmin configAdmin,
final @Reference OAuthFactory oAuthFactory) {
public GoogleTTSService(final @Reference ConfigurationAdmin configAdmin, final @Reference OAuthFactory oAuthFactory,
@Reference TTSCache ttsCache, Map<String, Object> config) {
super(ttsCache);
this.configAdmin = configAdmin;
this.oAuthFactory = oAuthFactory;
}
@ -128,15 +129,7 @@ public class GoogleTTSService implements TTSService {
*/
@Activate
protected void activate(Map<String, Object> config) {
// create cache folder
File userData = new File(OpenHAB.getUserDataFolder());
File cacheFolder = new File(new File(userData, CACHE_FOLDER_NAME), SERVICE_PID);
if (!cacheFolder.exists()) {
cacheFolder.mkdirs();
}
logger.debug("Using cache folder {}", cacheFolder.getAbsolutePath());
apiImpl = new GoogleCloudAPI(configAdmin, oAuthFactory, cacheFolder);
apiImpl = new GoogleCloudAPI(configAdmin, oAuthFactory);
updateConfig(config);
}
@ -236,13 +229,6 @@ public class GoogleTTSService implements TTSService {
config.volumeGainDb = Double.parseDouble(param);
}
// purgeCache
param = newConfig.containsKey(PARAM_PURGE_CACHE) ? newConfig.get(PARAM_PURGE_CACHE).toString() : null;
if (param != null) {
config.purgeCache = Boolean.parseBoolean(param);
}
logger.trace("New configuration: {}", config.toString());
if (config.clientId != null && !config.clientId.isEmpty() && config.clientSecret != null
&& !config.clientSecret.isEmpty()) {
apiImpl.setConfig(config);
@ -313,7 +299,7 @@ public class GoogleTTSService implements TTSService {
* @throws TTSException in case the service is unavailable or a parameter is invalid.
*/
@Override
public AudioStream synthesize(String text, Voice voice, AudioFormat requestedFormat) throws TTSException {
public AudioStream synthesizeForCache(String text, Voice voice, AudioFormat requestedFormat) throws TTSException {
logger.debug("Synthesize '{}' for voice '{}' in format {}", text, voice.getUID(), requestedFormat);
// Validate known api key
if (!apiImpl.isInitialized()) {
@ -361,4 +347,19 @@ public class GoogleTTSService implements TTSService {
throw new TTSException("Cannot parse WAV format", e);
}
}
/**
 * Builds the cache key for a synthesis request.
 * <p>
 * The key is the technical name of the voice, an underscore, and the MD5 hash
 * (32 hexadecimal digits) of the configuration string, the text and the
 * requested format concatenated in that order.
 *
 * @param text the text to synthesize
 * @param voice the voice to use; it is cast to {@link GoogleTTSVoice}
 * @param requestedFormat the requested audio format
 * @return the cache key, or the fixed string "nomd5algorithm" if the MD5
 *         algorithm is not available
 */
@Override
public @NonNull String getCacheKey(@NonNull String text, @NonNull Voice voice,
        @NonNull AudioFormat requestedFormat) {
    final MessageDigest md5;
    try {
        md5 = MessageDigest.getInstance("MD5");
    } catch (NoSuchAlgorithmException e) {
        // should not happen
        logger.warn("Could not create MD5 hash for '{}'", text, e);
        return "nomd5algorithm";
    }
    String keyMaterial = config.toConfigString() + text + requestedFormat;
    byte[] digest = md5.digest(keyMaterial.getBytes(StandardCharsets.UTF_8));
    String hexDigest = String.format("%032x", new BigInteger(1, digest));
    return ((GoogleTTSVoice) voice).getTechnicalName() + "_" + hexDigest;
}
}

View File

@ -43,13 +43,5 @@
<description>Speaking rate can be 4x faster or slower than the normal rate.</description>
<default>1</default>
</parameter>
<parameter name="purgeCache" type="boolean">
<advanced>true</advanced>
<label>Purge Cache</label>
<description>Purges the cache e.g. after testing different voice configuration parameters. When enabled the cache is
purged once. Make sure to disable this setting again so the cache is maintained after restarts.</description>
<default>false</default>
</parameter>
</config-description>
</config-description:config-descriptions>

View File

@ -10,8 +10,6 @@ voice.config.googletts.group.tts.label = TTS Configuration
voice.config.googletts.group.tts.description = Parameters for Google Cloud TTS API.
voice.config.googletts.pitch.label = Pitch
voice.config.googletts.pitch.description = Customize the pitch of your selected voice, up to 20 semitones more or less than the default output.
voice.config.googletts.purgeCache.label = Purge Cache
voice.config.googletts.purgeCache.description = Purges the cache e.g. after testing different voice configuration parameters. When enabled the cache is purged once. Make sure to disable this setting again so the cache is maintained after restarts.
voice.config.googletts.speakingRate.label = Speaking Rate
voice.config.googletts.speakingRate.description = Speaking rate can be 4x faster or slower than the normal rate.
voice.config.googletts.volumeGain.label = Volume Gain