[voicerss] Add support for WAV audio format (#11916)

* [voicerss] add unit test for supported formats
* [voicerss] add support for WAV audio format

Signed-off-by: Andreas Brenk <mail@andreasbrenk.com>
This commit is contained in:
Andreas Brenk 2022-01-22 16:53:05 +01:00 committed by GitHub
parent efa8963d20
commit 03b53475ba
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
7 changed files with 262 additions and 70 deletions

View File

@ -50,6 +50,20 @@ public class VoiceRSSTTSService implements TTSService {
// API Key comes from ConfigAdmin
private static final String CONFIG_API_KEY = "apiKey";
/**
* Maps an openHAB {@link AudioFormat} codec constant to the codec name passed to the VoiceRSS API.
* All four PCM variants (signed, unsigned, A-law and u-law) map to "WAV"; MP3, Vorbis and AAC map to
* "MP3", "OGG" and "AAC" respectively.
*/
private static final Map<String, String> CODEC_MAP = Map.of(AudioFormat.CODEC_PCM_SIGNED, "WAV",
AudioFormat.CODEC_PCM_UNSIGNED, "WAV", AudioFormat.CODEC_PCM_ALAW, "WAV", AudioFormat.CODEC_PCM_ULAW, "WAV",
AudioFormat.CODEC_MP3, "MP3", AudioFormat.CODEC_VORBIS, "OGG", AudioFormat.CODEC_AAC, "AAC");
/**
* Maps an openHAB {@link AudioFormat} frequency value to the frequency token used when building VoiceRSS API
* format names, e.g. {@code 44_100L} maps to "44khz".
*/
private static final Map<Long, String> FREQUENCY_MAP = Map.of(8_000L, "8khz", 11_025L, "11khz", 12_000L, "12khz",
16_000L, "16khz", 22_050L, "22khz", 24_000L, "24khz", 32_000L, "32khz", 44_100L, "44khz", 48_000L, "48khz");
private String apiKey;
private final Logger logger = LoggerFactory.getLogger(VoiceRSSTTSService.class);
@ -121,22 +135,12 @@ public class VoiceRSSTTSService implements TTSService {
if (!voices.contains(voice)) {
throw new TTSException("The passed voice is unsupported");
}
boolean isAudioFormatSupported = false;
for (AudioFormat currentAudioFormat : audioFormats) {
if (currentAudioFormat.isCompatible(requestedFormat)) {
isAudioFormatSupported = true;
break;
}
}
if (!isAudioFormatSupported) {
throw new TTSException("The passed AudioFormat is unsupported");
}
// now create the input stream for given text, locale, format. There is
// only a default voice
// now create the input stream for given text, locale, voice, codec and format.
try {
File cacheAudioFile = voiceRssImpl.getTextToSpeechAsFile(apiKey, trimmedText,
voice.getLocale().toLanguageTag(), voice.getLabel(), getApiAudioFormat(requestedFormat));
voice.getLocale().toLanguageTag(), voice.getLabel(), getApiAudioCodec(requestedFormat),
getApiAudioFormat(requestedFormat));
if (cacheAudioFile == null) {
throw new TTSException("Could not read from VoiceRSS service");
}
@ -169,46 +173,53 @@ public class VoiceRSSTTSService implements TTSService {
* @return The audio formats of this instance
*/
private Set<AudioFormat> initAudioFormats() {
Set<AudioFormat> audioFormats = new HashSet<>();
for (String format : voiceRssImpl.getAvailableAudioFormats()) {
audioFormats.add(getAudioFormat(format));
}
return audioFormats;
return voiceRssImpl.getAvailableAudioFormats();
}
private AudioFormat getAudioFormat(String apiFormat) {
Boolean bigEndian = null;
Integer bitDepth = 16;
Integer bitRate = null;
Long frequency = 44100L;
/**
* Map {@link AudioFormat#getCodec() codec} to VoiceRSS API codec.
*
* @throws TTSException if {@code format} is not supported
*/
private String getApiAudioCodec(AudioFormat format) throws TTSException {
final String internalCodec = format.getCodec();
final String apiCodec = CODEC_MAP.get(internalCodec != null ? internalCodec : AudioFormat.CODEC_PCM_SIGNED);
if ("MP3".equals(apiFormat)) {
// we use by default: MP3, 44khz_16bit_mono with bitrate 64 kbps
bitRate = 64000;
return new AudioFormat(AudioFormat.CONTAINER_NONE, AudioFormat.CODEC_MP3, bigEndian, bitDepth, bitRate,
frequency);
} else if ("OGG".equals(apiFormat)) {
// we use by default: OGG, 44khz_16bit_mono
return new AudioFormat(AudioFormat.CONTAINER_OGG, AudioFormat.CODEC_VORBIS, bigEndian, bitDepth, bitRate,
frequency);
} else if ("AAC".equals(apiFormat)) {
// we use by default: AAC, 44khz_16bit_mono
return new AudioFormat(AudioFormat.CONTAINER_NONE, AudioFormat.CODEC_AAC, bigEndian, bitDepth, bitRate,
frequency);
} else {
throw new IllegalArgumentException("Audio format " + apiFormat + " not yet supported");
if (apiCodec == null) {
throw new TTSException("Unsupported audio format: " + format);
}
return apiCodec;
}
private String getApiAudioFormat(AudioFormat format) {
if (format.getCodec().equals(AudioFormat.CODEC_MP3)) {
return "MP3";
} else if (format.getCodec().equals(AudioFormat.CODEC_VORBIS)) {
return "OGG";
} else if (format.getCodec().equals(AudioFormat.CODEC_AAC)) {
return "AAC";
} else {
throw new IllegalArgumentException("Audio format " + format.getCodec() + " not yet supported");
/**
 * Map {@link AudioFormat#getBitDepth() bit depth} and {@link AudioFormat#getFrequency() frequency} to the
 * VoiceRSS API format name, for example "44khz_16bit_mono" or "alaw_8khz_mono".
 *
 * Values missing from {@code format} fall back to the PCM signed codec, a bit depth of 16 and a frequency of
 * 44100.
 *
 * @param format the requested audio format
 * @return the VoiceRSS API format name
 * @throws TTSException if {@code format} is not supported
 */
private String getApiAudioFormat(AudioFormat format) throws TTSException {
    final Integer requestedBitDepth = format.getBitDepth();
    final Long requestedFrequency = format.getFrequency();
    final String requestedCodec = format.getCodec();

    final int bitDepth = requestedBitDepth != null ? requestedBitDepth : 16;
    final Long frequency = requestedFrequency != null ? requestedFrequency : 44_100L;
    final String apiFrequency = FREQUENCY_MAP.get(frequency);

    if (apiFrequency == null || (bitDepth != 8 && bitDepth != 16)) {
        throw new TTSException("Unsupported audio format: " + format);
    }

    switch (requestedCodec != null ? requestedCodec : AudioFormat.CODEC_PCM_SIGNED) {
        case AudioFormat.CODEC_PCM_ALAW:
            return "alaw_" + apiFrequency + "_mono";
        case AudioFormat.CODEC_PCM_ULAW:
            return "ulaw_" + apiFrequency + "_mono";
        case AudioFormat.CODEC_PCM_SIGNED:
        case AudioFormat.CODEC_PCM_UNSIGNED:
        case AudioFormat.CODEC_MP3:
        case AudioFormat.CODEC_VORBIS:
        case AudioFormat.CODEC_AAC:
            // The format name carries a "bit" suffix after the depth ("44khz_16bit_mono"); without it the
            // name would not match the default format name the file cache compares against.
            return apiFrequency + "_" + bitDepth + "bit_mono";
        default:
            throw new TTSException("Unsupported audio format: " + format);
    }
}

View File

@ -22,6 +22,7 @@ import java.math.BigInteger;
import java.nio.charset.StandardCharsets;
import java.security.MessageDigest;
import java.security.NoSuchAlgorithmException;
import java.util.Objects;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
@ -55,17 +56,17 @@ public class CachedVoiceRSSCloudImpl extends VoiceRSSCloudImpl {
}
}
public File getTextToSpeechAsFile(String apiKey, String text, String locale, String voice, String audioFormat)
throws IOException {
String fileNameInCache = getUniqueFilenameForText(text, locale, voice);
public File getTextToSpeechAsFile(String apiKey, String text, String locale, String voice, String audioCodec,
String audioFormat) throws IOException {
String fileNameInCache = getUniqueFilenameForText(text, locale, voice, audioFormat);
// check if in cache
File audioFileInCache = new File(cacheFolder, fileNameInCache + "." + audioFormat.toLowerCase());
File audioFileInCache = new File(cacheFolder, fileNameInCache + "." + audioCodec.toLowerCase());
if (audioFileInCache.exists()) {
return audioFileInCache;
}
// if not in cache, get audio data and put to cache
try (InputStream is = super.getTextToSpeech(apiKey, text, locale, voice, audioFormat);
try (InputStream is = super.getTextToSpeech(apiKey, text, locale, voice, audioCodec, audioFormat);
FileOutputStream fos = new FileOutputStream(audioFileInCache)) {
copyStream(is, fos);
// write text to file for transparency too
@ -85,11 +86,12 @@ public class CachedVoiceRSSCloudImpl extends VoiceRSSCloudImpl {
/**
* Gets a unique filename for a given text, by creating an MD5 hash of it. It
* will be preceded by the locale.
* will be preceded by the locale and suffixed by the format if it is not the
* default of "44khz_16bit_mono".
*
* Sample: "en-US_00a2653ac5f77063bc4ea2fee87318d3"
*/
private String getUniqueFilenameForText(String text, String locale, String voice) {
private String getUniqueFilenameForText(String text, String locale, String voice, String format) {
try {
byte[] bytesOfMessage = text.getBytes(StandardCharsets.UTF_8);
MessageDigest md = MessageDigest.getInstance("MD5");
@ -106,6 +108,9 @@ public class CachedVoiceRSSCloudImpl extends VoiceRSSCloudImpl {
filename += voice + "_";
}
filename += hashtext;
if (!Objects.equals(format, "44khz_16bit_mono")) {
filename += "_" + format;
}
return filename;
} catch (NoSuchAlgorithmException ex) {
// should not happen

View File

@ -41,7 +41,7 @@ public interface VoiceRSSCloudAPI {
*
* @return A set of all audio formats supported
*/
Set<String> getAvailableAudioFormats();
Set<AudioFormat> getAvailableAudioFormats();
/**
* Get all supported voices.
@ -70,6 +70,8 @@ public interface VoiceRSSCloudAPI {
* the locale to use
* @param voice
* the voice to use, "default" for the default voice
* @param audioCodec
* the audio codec to use
* @param audioFormat
* the audio format to use
* @return an InputStream to the audio data in specified format
@ -77,6 +79,6 @@ public interface VoiceRSSCloudAPI {
* will be raised if the audio data can not be retrieved from
* cloud service
*/
InputStream getTextToSpeech(String apiKey, String text, String locale, String voice, String audioFormat)
throws IOException;
InputStream getTextToSpeech(String apiKey, String text, String locale, String voice, String audioCodec,
String audioFormat) throws IOException;
}

View File

@ -12,8 +12,6 @@
*/
package org.openhab.voice.voicerss.internal.cloudapi;
import static java.util.stream.Collectors.toSet;
import java.io.IOException;
import java.io.InputStream;
import java.net.HttpURLConnection;
@ -28,8 +26,8 @@ import java.util.Locale;
import java.util.Map;
import java.util.Map.Entry;
import java.util.Set;
import java.util.stream.Stream;
import org.openhab.core.audio.AudioFormat;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
@ -41,13 +39,14 @@ import org.slf4j.LoggerFactory;
* <ul>
* <li>All API languages supported</li>
* <li>Only default voice supported with good audio quality</li>
* <li>Only MP3, OGG and AAC audio formats supported</li>
* <li>MP3, OGG, AAC and WAV audio formats supported</li>
* <li>It uses HTTP and not HTTPS (for performance reasons)</li>
* </ul>
*
* @author Jochen Hiller - Initial contribution
* @author Laurent Garnier - add support for all API languages
* @author Laurent Garnier - add support for OGG and AAC audio formats
* @author Andreas Brenk - add support for WAV audio format
*/
public class VoiceRSSCloudImpl implements VoiceRSSCloudAPI {
@ -55,7 +54,36 @@ public class VoiceRSSCloudImpl implements VoiceRSSCloudAPI {
private final Logger logger = LoggerFactory.getLogger(VoiceRSSCloudImpl.class);
private static final Set<String> SUPPORTED_AUDIO_FORMATS = Stream.of("MP3", "OGG", "AAC").collect(toSet());
/**
* Audio formats returned by {@link #getAvailableAudioFormats()}: MP3, OGG/Vorbis and AAC at 44.1 kHz / 16 bit,
* WAV PCM (8 bit unsigned and 16 bit signed) at nine frequencies from 8 kHz to 48 kHz, and WAV A-law / u-law
* (8 bit) at 8 kHz, 11.025 kHz, 22.05 kHz and 44.1 kHz.
* <p>
* For every WAV entry the bit rate equals frequency * bit depth.
*/
private static final Set<AudioFormat> SUPPORTED_AUDIO_FORMATS = Set.of(
// compressed formats, no bit rate given
new AudioFormat(AudioFormat.CONTAINER_NONE, AudioFormat.CODEC_MP3, null, 16, null, 44_100L),
new AudioFormat(AudioFormat.CONTAINER_OGG, AudioFormat.CODEC_VORBIS, null, 16, null, 44_100L),
new AudioFormat(AudioFormat.CONTAINER_NONE, AudioFormat.CODEC_AAC, null, 16, null, 44_100L),
// WAV PCM, 8 bit unsigned / 16 bit signed per frequency
// NOTE(review): the two 8 kHz entries pass null for the big-endian flag while all other PCM entries pass
// false - confirm whether this difference is intentional.
new AudioFormat(AudioFormat.CONTAINER_WAVE, AudioFormat.CODEC_PCM_UNSIGNED, null, 8, 64_000, 8_000L),
new AudioFormat(AudioFormat.CONTAINER_WAVE, AudioFormat.CODEC_PCM_SIGNED, null, 16, 128_000, 8_000L),
new AudioFormat(AudioFormat.CONTAINER_WAVE, AudioFormat.CODEC_PCM_UNSIGNED, false, 8, 88_200, 11_025L),
new AudioFormat(AudioFormat.CONTAINER_WAVE, AudioFormat.CODEC_PCM_SIGNED, false, 16, 176_400, 11_025L),
new AudioFormat(AudioFormat.CONTAINER_WAVE, AudioFormat.CODEC_PCM_UNSIGNED, false, 8, 96_000, 12_000L),
new AudioFormat(AudioFormat.CONTAINER_WAVE, AudioFormat.CODEC_PCM_SIGNED, false, 16, 192_000, 12_000L),
new AudioFormat(AudioFormat.CONTAINER_WAVE, AudioFormat.CODEC_PCM_UNSIGNED, false, 8, 128_000, 16_000L),
new AudioFormat(AudioFormat.CONTAINER_WAVE, AudioFormat.CODEC_PCM_SIGNED, false, 16, 256_000, 16_000L),
new AudioFormat(AudioFormat.CONTAINER_WAVE, AudioFormat.CODEC_PCM_UNSIGNED, false, 8, 176_400, 22_050L),
new AudioFormat(AudioFormat.CONTAINER_WAVE, AudioFormat.CODEC_PCM_SIGNED, false, 16, 352_800, 22_050L),
new AudioFormat(AudioFormat.CONTAINER_WAVE, AudioFormat.CODEC_PCM_UNSIGNED, false, 8, 192_000, 24_000L),
new AudioFormat(AudioFormat.CONTAINER_WAVE, AudioFormat.CODEC_PCM_SIGNED, false, 16, 384_000, 24_000L),
new AudioFormat(AudioFormat.CONTAINER_WAVE, AudioFormat.CODEC_PCM_UNSIGNED, false, 8, 256_000, 32_000L),
new AudioFormat(AudioFormat.CONTAINER_WAVE, AudioFormat.CODEC_PCM_SIGNED, false, 16, 512_000, 32_000L),
new AudioFormat(AudioFormat.CONTAINER_WAVE, AudioFormat.CODEC_PCM_UNSIGNED, false, 8, 352_800, 44_100L),
new AudioFormat(AudioFormat.CONTAINER_WAVE, AudioFormat.CODEC_PCM_SIGNED, false, 16, 705_600, 44_100L),
new AudioFormat(AudioFormat.CONTAINER_WAVE, AudioFormat.CODEC_PCM_UNSIGNED, false, 8, 384_000, 48_000L),
new AudioFormat(AudioFormat.CONTAINER_WAVE, AudioFormat.CODEC_PCM_SIGNED, false, 16, 768_000, 48_000L),
// WAV A-law, 8 bit
new AudioFormat(AudioFormat.CONTAINER_WAVE, AudioFormat.CODEC_PCM_ALAW, null, 8, 64_000, 8_000L),
new AudioFormat(AudioFormat.CONTAINER_WAVE, AudioFormat.CODEC_PCM_ALAW, null, 8, 88_200, 11_025L),
new AudioFormat(AudioFormat.CONTAINER_WAVE, AudioFormat.CODEC_PCM_ALAW, null, 8, 176_400, 22_050L),
new AudioFormat(AudioFormat.CONTAINER_WAVE, AudioFormat.CODEC_PCM_ALAW, null, 8, 352_800, 44_100L),
// WAV u-law, 8 bit
new AudioFormat(AudioFormat.CONTAINER_WAVE, AudioFormat.CODEC_PCM_ULAW, null, 8, 64_000, 8_000L),
new AudioFormat(AudioFormat.CONTAINER_WAVE, AudioFormat.CODEC_PCM_ULAW, null, 8, 88_200, 11_025L),
new AudioFormat(AudioFormat.CONTAINER_WAVE, AudioFormat.CODEC_PCM_ULAW, null, 8, 176_400, 22_050L),
new AudioFormat(AudioFormat.CONTAINER_WAVE, AudioFormat.CODEC_PCM_ULAW, null, 8, 352_800, 44_100L));
private static final Set<Locale> SUPPORTED_LOCALES = new HashSet<>();
static {
@ -164,7 +192,7 @@ public class VoiceRSSCloudImpl implements VoiceRSSCloudAPI {
}
@Override
public Set<String> getAvailableAudioFormats() {
public Set<AudioFormat> getAvailableAudioFormats() {
return SUPPORTED_AUDIO_FORMATS;
}
@ -208,9 +236,9 @@ public class VoiceRSSCloudImpl implements VoiceRSSCloudAPI {
* dependencies.
*/
@Override
public InputStream getTextToSpeech(String apiKey, String text, String locale, String voice, String audioFormat)
throws IOException {
String url = createURL(apiKey, text, locale, voice, audioFormat);
public InputStream getTextToSpeech(String apiKey, String text, String locale, String voice, String audioCodec,
String audioFormat) throws IOException {
String url = createURL(apiKey, text, locale, voice, audioCodec, audioFormat);
logger.debug("Call {}", url);
URLConnection connection = new URL(url).openConnection();
@ -254,13 +282,15 @@ public class VoiceRSSCloudImpl implements VoiceRSSCloudAPI {
*
* It is in package scope to be accessed by tests.
*/
private String createURL(String apiKey, String text, String locale, String voice, String audioFormat) {
private String createURL(String apiKey, String text, String locale, String voice, String audioCodec,
String audioFormat) {
String encodedMsg = URLEncoder.encode(text, StandardCharsets.UTF_8);
String url = "http://api.voicerss.org/?key=" + apiKey + "&hl=" + locale + "&c=" + audioFormat;
String url = "http://api.voicerss.org/?key=" + apiKey + "&hl=" + locale + "&c=" + audioCodec + "&f="
+ audioFormat;
if (!DEFAULT_VOICE.equals(voice)) {
url += "&v=" + voice;
}
url += "&f=44khz_16bit_mono&src=" + encodedMsg;
url += "&src=" + encodedMsg;
return url;
}
}

View File

@ -106,7 +106,7 @@ public class CreateTTSCache {
return;
}
CachedVoiceRSSCloudImpl impl = new CachedVoiceRSSCloudImpl(cacheDir);
File cachedFile = impl.getTextToSpeechAsFile(apiKey, trimmedMsg, locale, voice, "MP3");
File cachedFile = impl.getTextToSpeechAsFile(apiKey, trimmedMsg, locale, voice, "MP3", null);
System.out.println(
"Created cached audio for locale='" + locale + "', msg='" + trimmedMsg + "' to file=" + cachedFile);
}

View File

@ -0,0 +1,52 @@
/**
* Copyright (c) 2010-2022 Contributors to the openHAB project
*
* See the NOTICE file(s) distributed with this work for additional
* information.
*
* This program and the accompanying materials are made available under the
* terms of the Eclipse Public License 2.0 which is available at
* http://www.eclipse.org/legal/epl-2.0
*
* SPDX-License-Identifier: EPL-2.0
*/
package org.openhab.voice.voicerss.internal;
import org.hamcrest.Description;
import org.hamcrest.Matcher;
import org.hamcrest.TypeSafeMatcher;
import org.openhab.core.audio.AudioFormat;
/**
 * Hamcrest {@link Matcher} that accepts every {@link AudioFormat} which the reference format given at
 * construction time reports as compatible.
 *
 * @author Andreas Brenk - Initial contribution
 */
public class CompatibleAudioFormatMatcher extends TypeSafeMatcher<AudioFormat> {

    /** The format every examined value is checked against. */
    private final AudioFormat reference;

    public CompatibleAudioFormatMatcher(AudioFormat audioFormat) {
        reference = audioFormat;
    }

    /**
     * Factory for use in assertions: the returned matcher succeeds when the examined
     * format is compatible to the given <code>audioFormat</code>.
     *
     * @param audioFormat the audio format which must be compatible
     */
    public static Matcher<AudioFormat> compatibleAudioFormat(AudioFormat audioFormat) {
        return new CompatibleAudioFormatMatcher(audioFormat);
    }

    @Override
    protected boolean matchesSafely(AudioFormat actual) {
        final boolean compatible = reference.isCompatible(actual);
        return compatible;
    }

    @Override
    public void describeTo(Description description) {
        description.appendText("an audio format compatible to ");
        description.appendValue(reference);
    }
}

View File

@ -0,0 +1,92 @@
/**
* Copyright (c) 2010-2022 Contributors to the openHAB project
*
* See the NOTICE file(s) distributed with this work for additional
* information.
*
* This program and the accompanying materials are made available under the
* terms of the Eclipse Public License 2.0 which is available at
* http://www.eclipse.org/legal/epl-2.0
*
* SPDX-License-Identifier: EPL-2.0
*/
package org.openhab.voice.voicerss.internal;
import static org.hamcrest.MatcherAssert.assertThat;
import static org.hamcrest.core.IsIterableContaining.hasItem;
import static org.hamcrest.core.IsNot.not;
import static org.openhab.core.audio.AudioFormat.*;
import static org.openhab.voice.voicerss.internal.CompatibleAudioFormatMatcher.compatibleAudioFormat;
import java.util.Set;
import org.junit.jupiter.api.BeforeEach;
import org.junit.jupiter.api.Test;
import org.openhab.core.audio.AudioFormat;
import org.openhab.core.voice.TTSService;
/**
* Tests for {@link VoiceRSSTTSService}.
*
* @author Andreas Brenk - Initial contribution
*/
public class VoiceRSSTTSServiceTest {
private static final AudioFormat MP3_44KHZ_16BIT = new AudioFormat(AudioFormat.CONTAINER_NONE,
AudioFormat.CODEC_MP3, null, 16, null, 44_100L);
private static final AudioFormat OGG_44KHZ_16BIT = new AudioFormat(AudioFormat.CONTAINER_OGG,
AudioFormat.CODEC_VORBIS, null, 16, null, 44_100L);
private static final AudioFormat AAC_44KHZ_16BIT = new AudioFormat(AudioFormat.CONTAINER_NONE,
AudioFormat.CODEC_MP3, null, 16, null, 44_100L);
private static final AudioFormat WAV_22KHZ_8BIT = new AudioFormat(AudioFormat.CONTAINER_WAVE,
AudioFormat.CODEC_PCM_UNSIGNED, null, 8, null, 22_050L);
private static final AudioFormat WAV_48KHZ_16BIT = new AudioFormat(AudioFormat.CONTAINER_WAVE,
AudioFormat.CODEC_PCM_SIGNED, false, 16, null, 48_000L);
/**
* The {@link VoiceRSSTTSService} under test.
*/
private TTSService ttsService;
@BeforeEach
public void setUp() {
final VoiceRSSTTSService ttsService = new VoiceRSSTTSService();
ttsService.activate(null);
this.ttsService = ttsService;
}
@Test
public void testSupportedFormats() {
final Set<AudioFormat> supportedFormats = ttsService.getSupportedFormats();
// check generic formats without any further constraints
assertThat(supportedFormats, hasItem(compatibleAudioFormat(MP3)));
assertThat(supportedFormats, hasItem(compatibleAudioFormat(WAV)));
assertThat(supportedFormats, hasItem(compatibleAudioFormat(OGG)));
assertThat(supportedFormats, hasItem(compatibleAudioFormat(AAC)));
// check specific formats with common constraints
assertThat(supportedFormats, hasItem(compatibleAudioFormat(MP3_44KHZ_16BIT)));
assertThat(supportedFormats, hasItem(compatibleAudioFormat(OGG_44KHZ_16BIT)));
assertThat(supportedFormats, hasItem(compatibleAudioFormat(AAC_44KHZ_16BIT)));
assertThat(supportedFormats, hasItem(compatibleAudioFormat(WAV_22KHZ_8BIT)));
assertThat(supportedFormats, hasItem(compatibleAudioFormat(WAV_48KHZ_16BIT)));
// check specific formats with additional constraints
assertThat(supportedFormats, hasItem(compatibleAudioFormat(bitRate(WAV, 705_600)))); // 44.1 kHz 16-bit
// check unsupported formats
assertThat(supportedFormats, not(hasItem(compatibleAudioFormat(bitDepth(WAV, 24)))));
}
private AudioFormat bitDepth(AudioFormat format, Integer bitDepth) {
return new AudioFormat(format.getContainer(), format.getCodec(), format.isBigEndian(), bitDepth,
format.getBitRate(), format.getFrequency());
}
private AudioFormat bitRate(AudioFormat format, Integer bitRate) {
return new AudioFormat(format.getContainer(), format.getCodec(), format.isBigEndian(), format.getBitDepth(),
bitRate, format.getFrequency());
}
}