blob: 52ac781332197d211a038119b845cba0a5c52f9d [file] [log] [blame]
/* com_svox_picottsengine.cpp
* Copyright (C) 2008-2009 SVOX AG, Baslerstr. 30, 8048 Zuerich, Switzerland
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*
* This is the Manager layer. It sits on top of the native Pico engine
* and provides the interface to the defined Google TTS engine API.
* The Google engine API is the boundary to allow a TTS engine to be swapped.
* The Manager layer also provides the SSML tag interpretation.
* The supported SSML tags are mapped to corresponding tags natively supported by Pico.
* Native Pico functions always begin with picoXXX.
*
* In the Pico engine, the language cannot be changed independently of the voice.
* If either the voice or locale/language are changed, a new resource is loaded.
*
* Only a subset of SSML 1.0 tags are supported.
* Some SSML tags involve significant complexity.
* If the language is changed through an SSML tag, there is a latency for the load.
*
*/
#include <stdio.h>
#include <unistd.h>
#include <stdlib.h>
#include <string.h>
#define LOG_TAG "SVOX Pico Engine"
#include <utils/Log.h>
#include <utils/String16.h> /* for strlen16 */
#include <android_runtime/AndroidRuntime.h>
#include <tts/TtsEngine.h>
#include <cutils/jstring.h>
#include <picoapi.h>
#include <picodefs.h>
#include "svox_ssml_parser.h"
using namespace android;
/* adaptation layer defines */
/* Size in bytes of the memory block that init() allocates and hands to pico_initialize(). */
#define PICO_MEM_SIZE 2500000
/* speaking rate: setProperty("rate") clamps to [PICO_MIN_RATE, PICO_MAX_RATE];
   doAddProperties() only emits a <speed> tag when the value differs from PICO_DEF_RATE. */
#define PICO_MIN_RATE 20
#define PICO_MAX_RATE 500
#define PICO_DEF_RATE 100
/* speaking pitch: same clamping/default scheme as the rate, emitted as a <pitch> tag. */
#define PICO_MIN_PITCH 50
#define PICO_MAX_PITCH 200
#define PICO_DEF_PITCH 100
/* speaking volume: same clamping/default scheme as the rate, emitted as a <volume> tag. */
#define PICO_MIN_VOLUME 0
#define PICO_MAX_VOLUME 500
#define PICO_DEF_VOLUME 250
/* string constants */
/* Size in bytes of the intermediate output buffer used by synthesizeText(). */
#define MAX_OUTBUF_SIZE 128
/* Directory prefix prepended to every lingware file name (must end with '/'). */
const char * PICO_LINGWARE_PATH = "/sdcard/svox/";
/* Name under which the single Pico voice definition is created and released. */
const char * PICO_VOICE_NAME = "PicoVoice";
/* Pico markup tags. The *_OPEN_TAG strings for speed/pitch/volume are printf-style
   formats taking one int (the level); doAddProperties() wraps the text with them. */
const char * PICO_SPEED_OPEN_TAG = "<speed level='%d'>";
const char * PICO_SPEED_CLOSE_TAG = "</speed>";
const char * PICO_PITCH_OPEN_TAG = "<pitch level='%d'>";
const char * PICO_PITCH_CLOSE_TAG = "</pitch>";
const char * PICO_VOLUME_OPEN_TAG = "<volume level='%d'>";
const char * PICO_VOLUME_CLOSE_TAG = "</volume>";
/* Phoneme tag halves; createPhonemeString() places one XSAMPA word between them. */
const char * PICO_PHONEME_OPEN_TAG = "<phoneme ph='";
const char * PICO_PHONEME_CLOSE_TAG = "'/>";
/* supported voices
Pico does not separately specify the voice and locale.
The arrays below are parallel: the same index identifies one voice in every array,
and each array must contain exactly picoNumSupportedVocs entries. */
/* ISO 639-3 language code, matched by setLanguage()/isLanguageAvailable(). */
const char * picoSupportedLangIso3[] = { "eng", "eng", "deu", "spa", "fra", "ita" };
/* ISO 3-letter country code, matched together with the language code. */
const char * picoSupportedCountryIso3[] = { "USA", "GBR", "DEU", "ESP", "FRA", "ITA" };
/* Locale in xx-rYY form, matched by checkForLocale() and stored in picoProp_currLang. */
const char * picoSupportedLang[] = { "en-rUS", "en-rGB", "de-rDE", "es-rES", "fr-rFR", "it-rIT" };
/* Locale in xx-YY form. */
const char * picoInternalLang[] = { "en-US", "en-GB", "de-DE", "es-ES", "fr-FR", "it-IT" };
/* Text analysis lingware file name per voice (relative to PICO_LINGWARE_PATH). */
const char * picoInternalTaLingware[] = { "en-US_ta.bin", "en-GB_ta.bin", "de-DE_ta.bin", "es-ES_ta.bin", "fr-FR_ta.bin", "it-IT_ta.bin" };
/* Signal generation lingware file name per voice (relative to PICO_LINGWARE_PATH). */
const char * picoInternalSgLingware[] = { "en-US_lh0_sg.bin", "en-GB_kh0_sg.bin", "de-DE_gl0_sg.bin", "es-ES_zl0_sg.bin", "fr-FR_nk0_sg.bin", "it-IT_cm0_sg.bin" };
/* Optional utpp lingware file name per voice; loading tolerates its absence. */
const char * picoInternalUtppLingware[] = { "en-US_utpp.bin", "en-GB_utpp.bin", "de-DE_utpp.bin", "es-ES_utpp.bin", "fr-FR_utpp.bin", "it-IT_utpp.bin" };
/* Number of entries in each of the per-voice arrays above. */
const int picoNumSupportedVocs = 6;
/* supported properties */
/* Property names understood by setProperty()/getProperty(). */
const char * picoSupportedProperties[] = { "language", "rate", "pitch", "volume" };
/* Number of entries in picoSupportedProperties. */
const int picoNumSupportedProperties = 4;
/* adaptation layer global variables */
/* Callback registered through TtsEngine::init(). */
synthDoneCB_t * picoSynthDoneCBPtr;
/* Memory block of PICO_MEM_SIZE bytes allocated in init() and freed in shutdown(). */
void * picoMemArea = NULL;
/* Handle to the Pico system created by pico_initialize(); NULL when not initialized. */
pico_System picoSystem = NULL;
/* Handles of the currently loaded lingware resources; NULL when not loaded.
   They are unloaded by cleanResources(). */
pico_Resource picoTaResource = NULL;
pico_Resource picoSgResource = NULL;
pico_Resource picoUtppResource = NULL;
/* Engine created for PICO_VOICE_NAME; NULL when no voice is loaded. */
pico_Engine picoEngine = NULL;
/* Heap-allocated lingware file paths and resource names for the current voice.
   Allocated in doLanguageSwitchFromLangIndex() and released by cleanFiles(). */
pico_Char * picoTaFileName = NULL;
pico_Char * picoSgFileName = NULL;
pico_Char * picoUtppFileName = NULL;
pico_Char * picoTaResourceName = NULL;
pico_Char * picoSgResourceName = NULL;
pico_Char * picoUtppResourceName = NULL;
/* Reset to 0 at the start of synthesizeText(). */
int picoSynthAbort = 0;
/* Heap-allocated copy of the loaded locale (xx-rYY form); released by cleanFiles(). */
char * picoProp_currLang = NULL; /* current language */
int picoProp_currRate = PICO_DEF_RATE; /* current rate */
int picoProp_currPitch = PICO_DEF_PITCH; /* current pitch */
int picoProp_currVolume = PICO_DEF_VOLUME; /* current volume */
/* Index of the loaded voice in the picoSupported* arrays, or -1 when none is loaded. */
int picoCurrentLangIndex = -1;
/* internal helper functions */
/** checkForLocale
 *  Look up a locale string in the table of supported locales.
 *  An exact match against picoSupportedLang (xx-rYY) is tried first; if none is
 *  found, the first entry sharing the same two leading characters is accepted.
 *  @locale - the locale to check, either in xx or xx-rYY format
 *  return index of the locale in picoSupportedLang, or -1 if not supported.
*/
static int checkForLocale( const char * locale )
{
    int idx;

    if (locale == NULL) {
        LOGE("checkForLanguage called with NULL language");
        return -1;
    }

    /* First pass: the full locale string must match. */
    for (idx = 0; idx < picoNumSupportedVocs; ++idx) {
        if (strcmp(locale, picoSupportedLang[idx]) == 0) {
            return idx;
        }
    }

    /* Second pass: compare only the two-character language prefix.
       Note that this can overmatch longer (e.g. ISO 639-3) codes. */
    for (idx = 0; idx < picoNumSupportedVocs; ++idx) {
        if (strncmp(locale, picoSupportedLang[idx], 2) == 0) {
            return idx;
        }
    }

    LOGE("TtsEngine::set language called with unsupported locale");
    return -1;
}
/** cleanResources
 *  Tear down whatever Pico objects are currently loaded: the engine (together
 *  with its voice definition) first, then the utpp, text analysis and signal
 *  generation resources. Also marks that no language is loaded.
*/
static void cleanResources( void )
{
    /* The engine references the resources, so it goes first. */
    if (picoEngine != NULL) {
        pico_disposeEngine( picoSystem, &picoEngine );
        pico_releaseVoiceDefinition( picoSystem, (pico_Char *) PICO_VOICE_NAME );
        picoEngine = NULL;
    }

    if (picoUtppResource != NULL) {
        pico_unloadResource( picoSystem, &picoUtppResource );
        picoUtppResource = NULL;
    }

    if (picoTaResource != NULL) {
        pico_unloadResource( picoSystem, &picoTaResource );
        picoTaResource = NULL;
    }

    if (picoSgResource != NULL) {
        pico_unloadResource( picoSystem, &picoSgResource );
        picoSgResource = NULL;
    }

    /* Nothing is loaded any more. */
    picoCurrentLangIndex = -1;
}
/** cleanFiles
 *  Release the heap strings holding the current locale, the lingware file
 *  paths and the resource names, and reset each pointer to NULL.
*/
static void cleanFiles( void )
{
    /* free(NULL) is a no-op, so no per-pointer guard is needed. */
    free( picoProp_currLang );
    picoProp_currLang = NULL;

    free( picoTaFileName );
    picoTaFileName = NULL;

    free( picoSgFileName );
    picoSgFileName = NULL;

    free( picoUtppFileName );
    picoUtppFileName = NULL;

    free( picoTaResourceName );
    picoTaResourceName = NULL;

    free( picoSgResourceName );
    picoSgResourceName = NULL;

    free( picoUtppResourceName );
    picoUtppResourceName = NULL;
}
/** hasResourcesForLanguage
* Check to see if the resources required to load the language at the specified index
* are properly installed
* @langIndex - the index of the language to check the resources for. The index is valid.
* return true if the required resources are installed, false otherwise
*/
static bool hasResourcesForLanguage(int langIndex) {
FILE * pFile;
char* fileName = (char*)malloc(PICO_MAX_DATAPATH_NAME_SIZE + PICO_MAX_FILE_NAME_SIZE);
strcpy((char*)fileName, PICO_LINGWARE_PATH);
strcat((char*)fileName, (const char*)picoInternalTaLingware[langIndex]);
pFile = fopen(fileName, "r");
if (pFile == NULL) {
free(fileName);
return false;
} else {
fclose (pFile);
}
strcpy((char*)fileName, PICO_LINGWARE_PATH);
strcat((char*)fileName, (const char*)picoInternalSgLingware[langIndex]);
pFile = fopen(fileName, "r");
if (pFile == NULL) {
free(fileName);
return false;
} else {
fclose(pFile);
free(fileName);
return true;
}
}
/** failLanguageSwitch
 *  Common failure exit for doLanguageSwitchFromLangIndex: releases everything that
 *  was set up so far and returns TTS_FAILURE.
 *  @voiceDefined - true if the voice definition has already been created and no
 *                  engine exists yet (cleanResources only releases the voice
 *                  definition when an engine exists).
 *  return TTS_FAILURE
*/
static tts_result failLanguageSwitch( bool voiceDefined )
{
    if (voiceDefined) {
        pico_releaseVoiceDefinition( picoSystem, (pico_Char *) PICO_VOICE_NAME );
    }
    cleanResources();
    cleanFiles();
    return TTS_FAILURE;
}

/** doLanguageSwitchFromLangIndex
 *  Switch to the requested locale.
 *  If the locale is already loaded, it returns immediately.
 *  If another locale is already is loaded, it will first be unloaded and the new one then loaded.
 *  If no locale is loaded, the requested locale will be loaded.
 *  On success picoProp_currLang and picoCurrentLangIndex describe the loaded locale;
 *  on failure no locale is loaded.
 *  @langIndex - the index of the locale/voice to load, which is guaranteed to be supported.
 *  return TTS_SUCCESS or TTS_FAILURE
*/
static tts_result doLanguageSwitchFromLangIndex( int langIndex )
{
    int ret;                    /* function result code */
    bool voiceDefined = false;  /* voice definition created, engine not yet */
    const size_t fileNameSize = PICO_MAX_DATAPATH_NAME_SIZE + PICO_MAX_FILE_NAME_SIZE;

    /* Defensive: callers promise a valid index, but an invalid one would index out of bounds. */
    if ((langIndex < 0) || (langIndex >= picoNumSupportedVocs)) {
        LOGE("Invalid language index %d", langIndex);
        return TTS_FAILURE;
    }
    const char * const localeName = picoSupportedLang[langIndex];

    /* If we already have a loaded locale, check whether it is the same one as requested. */
    if (picoProp_currLang && (strcmp(picoProp_currLang, localeName) == 0)) {
        LOGI("Language already loaded (%s == %s)", picoProp_currLang, localeName);
        return TTS_SUCCESS;
    }

    /* It is not the same locale; unload the current one first. */
    cleanResources();

    /* Allocate memory for file and resource names. */
    cleanFiles();
    picoProp_currLang    = (char *)      malloc( strlen(localeName) + 1 );
    picoTaFileName       = (pico_Char *) malloc( fileNameSize );
    picoSgFileName       = (pico_Char *) malloc( fileNameSize );
    picoUtppFileName     = (pico_Char *) malloc( fileNameSize );
    picoTaResourceName   = (pico_Char *) malloc( PICO_MAX_RESOURCE_NAME_SIZE );
    picoSgResourceName   = (pico_Char *) malloc( PICO_MAX_RESOURCE_NAME_SIZE );
    picoUtppResourceName = (pico_Char *) malloc( PICO_MAX_RESOURCE_NAME_SIZE );
    if (!picoProp_currLang || !picoTaFileName || !picoSgFileName || !picoUtppFileName
            || !picoTaResourceName || !picoSgResourceName || !picoUtppResourceName) {
        LOGE("Failed to allocate memory for resource names for %s", localeName);
        return failLanguageSwitch( false );
    }
    /* Keep the locale string well defined until the load has succeeded. */
    picoProp_currLang[0] = '\0';

    /* Set the path and file names for resource files. */
    strcpy((char *) picoTaFileName,   PICO_LINGWARE_PATH);
    strcat((char *) picoTaFileName,   (const char *) picoInternalTaLingware[langIndex]);
    strcpy((char *) picoSgFileName,   PICO_LINGWARE_PATH);
    strcat((char *) picoSgFileName,   (const char *) picoInternalSgLingware[langIndex]);
    strcpy((char *) picoUtppFileName, PICO_LINGWARE_PATH);
    strcat((char *) picoUtppFileName, (const char *) picoInternalUtppLingware[langIndex]);

    /* Load the text analysis Lingware resource file. */
    ret = pico_loadResource( picoSystem, picoTaFileName, &picoTaResource );
    if (PICO_OK != ret) {
        LOGE("Failed to load textana resource for %s [%d]", localeName, ret);
        return failLanguageSwitch( voiceDefined );
    }

    /* Load the signal generation Lingware resource file. */
    ret = pico_loadResource( picoSystem, picoSgFileName, &picoSgResource );
    if (PICO_OK != ret) {
        LOGE("Failed to load siggen resource for %s [%d]", localeName, ret);
        return failLanguageSwitch( voiceDefined );
    }

    /* Load the utpp Lingware resource file if exists - NOTE: this file is optional
       and is currently not used. Loading is only attempted for future compatibility.
       If this file is not present the loading will still succeed. */
    ret = pico_loadResource( picoSystem, picoUtppFileName, &picoUtppResource );
    if ((PICO_OK != ret) && (ret != PICO_EXC_CANT_OPEN_FILE)) {
        LOGE("Failed to load utpp resource for %s [%d]", localeName, ret);
        return failLanguageSwitch( voiceDefined );
    }

    /* Get the text analysis resource name. */
    ret = pico_getResourceName( picoSystem, picoTaResource, (char *) picoTaResourceName );
    if (PICO_OK != ret) {
        LOGE("Failed to get textana resource name for %s [%d]", localeName, ret);
        return failLanguageSwitch( voiceDefined );
    }

    /* Get the signal generation resource name. */
    ret = pico_getResourceName( picoSystem, picoSgResource, (char *) picoSgResourceName );
    if (PICO_OK != ret) {
        LOGE("Failed to get siggen resource name for %s [%d]", localeName, ret);
        return failLanguageSwitch( voiceDefined );
    }

    /* Get utpp resource name - optional: see note above. */
    if (picoUtppResource != NULL) {
        ret = pico_getResourceName( picoSystem, picoUtppResource, (char *) picoUtppResourceName );
        if (PICO_OK != ret) {
            LOGE("Failed to get utpp resource name for %s [%d]", localeName, ret);
            return failLanguageSwitch( voiceDefined );
        }
    }

    /* Create a voice definition. */
    ret = pico_createVoiceDefinition( picoSystem, (const pico_Char *) PICO_VOICE_NAME );
    if (PICO_OK != ret) {
        LOGE("Failed to create voice for %s [%d]", localeName, ret);
        return failLanguageSwitch( voiceDefined );
    }
    /* From here on a failure must also release the voice definition. */
    voiceDefined = true;

    /* Add the text analysis resource to the voice. */
    ret = pico_addResourceToVoiceDefinition( picoSystem, (const pico_Char *) PICO_VOICE_NAME, picoTaResourceName );
    if (PICO_OK != ret) {
        LOGE("Failed to add textana resource to voice for %s [%d]", localeName, ret);
        return failLanguageSwitch( voiceDefined );
    }

    /* Add the signal generation resource to the voice. */
    ret = pico_addResourceToVoiceDefinition( picoSystem, (const pico_Char *) PICO_VOICE_NAME, picoSgResourceName );
    if (PICO_OK != ret) {
        LOGE("Failed to add siggen resource to voice for %s [%d]", localeName, ret);
        return failLanguageSwitch( voiceDefined );
    }

    /* Add utpp resource to voice - optional: see note above. */
    if (picoUtppResource != NULL) {
        ret = pico_addResourceToVoiceDefinition( picoSystem, (const pico_Char *) PICO_VOICE_NAME, picoUtppResourceName );
        if (PICO_OK != ret) {
            LOGE("Failed to add utpp resource to voice for %s [%d]", localeName, ret);
            return failLanguageSwitch( voiceDefined );
        }
    }

    /* Create the engine for the assembled voice. */
    ret = pico_newEngine( picoSystem, (const pico_Char *) PICO_VOICE_NAME, &picoEngine );
    if (PICO_OK != ret) {
        LOGE("Failed to create engine for %s [%d]", localeName, ret);
        return failLanguageSwitch( voiceDefined );
    }

    /* Set the current locale/voice. The index must be recorded as well,
       otherwise getLanguage() keeps reporting that nothing is loaded. */
    strcpy( picoProp_currLang, localeName );
    picoCurrentLangIndex = langIndex;

    LOGI("loaded %s successfully", picoProp_currLang);
    return TTS_SUCCESS;
}
/** doLanguageSwitch
 *  Resolve a locale string to a supported voice and switch to it.
 *  The actual (un)loading is delegated to doLanguageSwitchFromLangIndex, which
 *  returns immediately when the requested locale is already loaded.
 *  @locale - the locale to check, either in xx or xx-rYY format (i.e "en" or "en-rUS")
 *  return TTS_SUCCESS or TTS_FAILURE
*/
static tts_result doLanguageSwitch( const char * locale )
{
    const int voiceIndex = checkForLocale( locale );

    if (voiceIndex >= 0) {
        LOGI("Found supported locale %s", picoSupportedLang[voiceIndex]);
        return doLanguageSwitchFromLangIndex( voiceIndex );
    }

    /* The locale is not in the supported table. */
    LOGE("Tried to swith to non-supported locale %s", locale);
    return TTS_FAILURE;
}
/** appendLevelTag
 *  Format an opening property tag at the current end of a buffer.
 *  @buf        - destination buffer
 *  @capacity   - total size of the destination buffer in bytes
 *  @used       - in/out: number of characters already in the buffer
 *  @openTagFmt - printf-style format taking a single int
 *  @level      - value for the format
 *  return true on success, false if the tag did not fit or formatting failed
*/
static bool appendLevelTag( char * buf, size_t capacity, size_t * used,
                            const char * openTagFmt, int level )
{
    const size_t room = capacity - *used;
    const int written = snprintf( buf + *used, room, openTagFmt, level );
    if ((written < 0) || ((size_t) written >= room)) {
        return false;
    }
    *used += (size_t) written;
    return true;
}

/** doAddProperties
 *  Add <speed>, <pitch> and <volume> tags to the text,
 *  if the properties have been set to non-default values, and return the new string.
 *  Tags nest as <pitch><speed><volume>text</volume></speed></pitch>.
 *  The calling function is responsible for freeing the returned string with free().
 *  @str - text to apply tags to
 *  return new string with tags applied, or NULL if str is NULL or memory is exhausted
*/
static char * doAddProperties( const char * str )
{
    /* Enough characters for any int printed with %d, including the sign. */
    const size_t kMaxLevelChars = 11;

    if (str == NULL) {
        return NULL;
    }

    const bool hasPitch = (picoProp_currPitch  != PICO_DEF_PITCH);
    const bool hasSpeed = (picoProp_currRate   != PICO_DEF_RATE);
    const bool hasVol   = (picoProp_currVolume != PICO_DEF_VOLUME);
    const size_t strLen = strlen(str);

    /* Work out the size of the tagged string, terminator included. */
    size_t capacity = strLen + 1;
    if (hasPitch) {
        capacity += strlen(PICO_PITCH_OPEN_TAG) + kMaxLevelChars + strlen(PICO_PITCH_CLOSE_TAG);
    }
    if (hasSpeed) {
        capacity += strlen(PICO_SPEED_OPEN_TAG) + kMaxLevelChars + strlen(PICO_SPEED_CLOSE_TAG);
    }
    if (hasVol) {
        capacity += strlen(PICO_VOLUME_OPEN_TAG) + kMaxLevelChars + strlen(PICO_VOLUME_CLOSE_TAG);
    }

    char * data = (char *) malloc( capacity );
    if (!data) {
        return NULL;
    }
    data[0] = '\0';
    size_t used = 0;

    /* Opening tags, written straight into the result (no temporary allocations). */
    bool ok = true;
    if (ok && hasPitch) {
        ok = appendLevelTag( data, capacity, &used, PICO_PITCH_OPEN_TAG, picoProp_currPitch );
    }
    if (ok && hasSpeed) {
        ok = appendLevelTag( data, capacity, &used, PICO_SPEED_OPEN_TAG, picoProp_currRate );
    }
    if (ok && hasVol) {
        ok = appendLevelTag( data, capacity, &used, PICO_VOLUME_OPEN_TAG, picoProp_currVolume );
    }
    if (!ok) {
        free( data );
        return NULL;
    }

    /* The text itself. */
    memcpy( data + used, str, strLen );
    used += strLen;
    data[used] = '\0';

    /* Closing tags, in reverse order of the opening tags. */
    if (hasVol) {
        strcpy( data + used, PICO_VOLUME_CLOSE_TAG );
        used += strlen(PICO_VOLUME_CLOSE_TAG);
    }
    if (hasSpeed) {
        strcpy( data + used, PICO_SPEED_CLOSE_TAG );
        used += strlen(PICO_SPEED_CLOSE_TAG);
    }
    if (hasPitch) {
        strcpy( data + used, PICO_PITCH_CLOSE_TAG );
    }
    return data;
}
/** createPhonemeString
 *  Wrap all individual words in <phoneme> tags.
 *  The Pico <phoneme> tag only supports one word in each tag,
 *  therefore they must be individually wrapped!
 *  Words are separated by single space characters; consecutive spaces yield
 *  empty <phoneme> tags.
 *  @xsampa - NUL-terminated text to convert to Pico phoneme string
 *  @length - length of the input string (unused; the length is taken from the terminator)
 *  return new string with tags applied, allocated with new[] (the caller must
 *         delete[] it), or NULL if xsampa is NULL
*/
extern char * createPhonemeString( const char * xsampa, int length )
{
    (void) length;
    if (xsampa == NULL) {
        return NULL;
    }

    const size_t openLen  = strlen(PICO_PHONEME_OPEN_TAG);
    const size_t closeLen = strlen(PICO_PHONEME_CLOSE_TAG);
    const size_t srcLen   = strlen(xsampa);

    /* Every space separates two words. */
    size_t numWords = 1;
    for (size_t i = 0; i < srcLen; i++) {
        if (xsampa[i] == ' ') {
            numWords++;
        }
    }

    /* One tag pair per word, all non-space characters, plus the terminator.
       The result is built in a single buffer, avoiding a stack array whose
       size depends on the input. */
    const size_t total = numWords * (openLen + closeLen) + (srcLen - (numWords - 1)) + 1;
    char * convstring = new char[total];
    char * out = convstring;

    memcpy(out, PICO_PHONEME_OPEN_TAG, openLen);
    out += openLen;
    for (size_t i = 0; i < srcLen; i++) {
        if (xsampa[i] == ' ') {
            /* Close the current word and open the next one. */
            memcpy(out, PICO_PHONEME_CLOSE_TAG, closeLen);
            out += closeLen;
            memcpy(out, PICO_PHONEME_OPEN_TAG, openLen);
            out += openLen;
        } else {
            *out++ = xsampa[i];
        }
    }
    memcpy(out, PICO_PHONEME_CLOSE_TAG, closeLen);
    out += closeLen;
    *out = '\0';

    return convstring;
}
/* The XSAMPA uses as many as 5 characters to represent a single IPA code. */
/* One entry of the IPA -> XSAMPA conversion table.
   strXSAMPA holds up to 5 characters plus the terminating NUL. */
typedef struct tagPhnArr
{
char16_t strIPA; /* IPA Unicode symbol */
char strXSAMPA[6]; /* SAMPA sequence */
} PArr;
/* Number of entries in PhnAry; must equal the number of initializers below
   (23 + 60 + 34 + 11 + 6). */
#define phn_cnt (134)
/* Conversion table searched linearly by CnvIPAPnt(); the first entry whose
   strIPA equals the code point wins.
   Escaping: a source sequence of four backslashes produces TWO backslash
   characters in the stored string, whereas a source sequence of two backslashes
   produces ONE. Both forms occur below.
   NOTE(review): the table is inconsistent about this doubling; confirm which
   form the consumer of the XSAMPA string expects.
   Several distinct IPA code points intentionally or accidentally map to the
   same XSAMPA sequence (e.g. 0x0325 and 0x030A both give "_0"). */
PArr PhnAry[phn_cnt] = {
/* XSAMPA conversion table */
/* Vowels (23) complete */
{0x025B, "E"},
{0x0251, "A"},
{0x0254, "O"},
{0x00F8, "2"},
{0x0153, "9"},
{0x0276, "&"},
{0x0252, "Q"},
{0x028C, "V"},
{0x0264, "7"},
{0x026F, "M"},
{0x0268, "1"},
{0x0289, "}"},
{0x026A, "I"},
{0x028F, "Y"},
{0x028A, "U"},
{0x0259, "@"},
{0x0275, "8"},
{0x0250, "6"},
{0x00E6, "{"},
{0x025C, "3"},
{0x025A, "@`"},
{0x025E, "3\\\\"},
{0x0258, "@\\\\"},
/* Consonants (60) complete */
{0x0288, "t`"},
{0x0256, "d`"},
{0x025F, "J\\\\"},
{0x0261, "g"},
{0x0262, "G\\\\"},
{0x0294, "?"},
{0x0271, "F"},
{0x0273, "n`"},
{0x0272, "J"},
{0x014B, "N"},
{0x0274, "N\\\\"},
{0x0299, "B\\\\"},
{0x0280, "R\\\\"},
{0x027E, "4"},
{0x027D, "r`"},
{0x0278, "p\\\\"},
{0x03B2, "B"},
{0x03B8, "T"},
{0x00F0, "D"},
{0x0283, "S"},
{0x0292, "Z"},
{0x0282, "s`"},
{0x0290, "z`"},
{0x00E7, "C"},
{0x029D, "j\\\\"},
{0x0263, "G"},
{0x03C7, "X"},
{0x0281, "R"},
{0x0127, "X\\\\"},
{0x0295, "?\\\\"},
{0x0266, "h\\\\"},
{0x026C, "K"},
{0x026E, "K\\\\"},
{0x028B, "P"},
{0x0279, "r\\\\"},
{0x027B, "r\\\\'"},
{0x0270, "M\\\\"},
{0x026D, "l`"},
{0x028E, "L"},
{0x029F, "L\\\\"},
{0x0253, "b_<"},
{0x0257, "d_<"},
{0x0284, "J\\_<"},
{0x0260, "g_<"},
{0x029B, "G\\_<"},
{0x028D, "W"},
{0x0265, "H"},
{0x029C, "H\\\\"},
{0x02A1, ">\\\\"},
{0x02A2, "<\\\\"},
{0x0298, "O\\\\"},
{0x01C0, "|\\\\"},
{0x01C3, "!\\\\"},
{0x01C2, "=\\"},
{0x01C1, "|\\|\\"},
{0x027A, "l\\\\"},
{0x0255, "s\\\\"},
{0x0291, "z\\\\"},
{0x0267, "x\\\\"},
{0x026B, "l_G"},
/* Diacritics (34) */
{0x02BC, "_>"},
{0x0325, "_0"},
{0x030A, "_0"},
{0x032C, "_v"},
{0x02B0, "_h"},
{0x0324, "_t"},
{0x0330, "_k"},
{0x033C, "_N"},
{0x032A, "_d"},
{0x033A, "_a"},
{0x033B, "_m"},
{0x0339, "_O"},
{0x031C, "_c"},
{0x031F, "_+"},
{0x0320, "_-"},
{0x0308, "_\""}, /* centralized */
{0x033D, "_x"},
{0x0318, "_A"},
{0x0319, "_q"},
{0x02DE, "`"},
{0x02B7, "_w"},
{0x02B2, "_j"},
{0x02E0, "_G"},
{0x02E4, "_?\\\\"},
{0x0303, "~"},
{0x207F, "_n"},
{0x02E1, "_l"},
{0x031A, "_}"},
{0x0334, "_e"},
{0x031D, "_r"},
{0x031E, "_o"},
{0x0329, "="},
{0x032F, "_^"},
{0x02D0, ":"},
/* Others (11) complete */
{0x0361, "_"},
{0x035C, "_"},
{0x02C8, "\""},
{0x02CC, "%"},
{0x02D1, ":\\\\"},
{0x0306, "_X"},
{0x2016, "||"},
{0x203F, "-\\\\"},
{0x2197, "<R>"},
{0x2198, "<F>"},
{0x025D, "3`"},
/* Affricates (6) complete */
{0x02A3, "d_z"},
{0x02A4, "d_Z"},
{0x02A5, "d_z\\\\"},
{0x02A6, "t_s"},
{0x02A7, "t_S"},
{0x02A8, "t_s\\\\"}
};
/** CnvIPAPnt
 *  Convert a single IPA code point to its XSAMPA character sequence.
 *  A single IPA code can map to a string of up to 5 characters.
 *  The conversion table is searched first. If the code point is not in the
 *  table and fits in a single byte (1..0xFF), that byte is passed through
 *  unchanged. Any other unmapped code point has no single-char representation
 *  and produces an empty string.
 *  @IPnt - IPA code point to convert
 *  @XPnt - receives the NUL-terminated XSAMPA sequence; must provide room for
 *          at least 6 bytes
*/
void CnvIPAPnt( const char16_t IPnt, char * XPnt )
{
    if (XPnt == NULL) {
        return;
    }
    XPnt[0] = '\0';                                     /* clear the result string */

    /* Search the table for the conversion. */
    for (int idx = 0; idx < phn_cnt; idx++) {
        if (IPnt == PhnAry[idx].strIPA) {               /* matches IPA code */
            strcpy( XPnt, PhnAry[idx].strXSAMPA );      /* copy the XSAMPA string */
            return;
        }
    }

    /* Not in the table: just copy it when it is a single-byte character.
       The code point must not be reinterpreted as a C string, because a
       char16_t is not NUL-terminated and reading it as one runs past its
       two bytes. */
    if ((IPnt != 0) && (IPnt <= 0xFF)) {
        XPnt[0] = (char) IPnt;
        XPnt[1] = '\0';
    }
}
/** cnvIpaToXsampa
* Convert an IPA character string to an XSAMPA character string.
* @ipaString - input IPA string to convert
* @outXsampaString - converted XSAMPA string is passed back in this parameter
* return size of the new string
*/
int cnvIpaToXsampa( const char16_t * ipaString, size_t ipaStringSize, char ** outXsampaString )
{
size_t xsize; /* size of result */
int ipidx; /* index into IPA string */
char * XPnt; /* short XSAMPA char sequence */
/* Convert an IPA string to an XSAMPA string and store the xsampa string in *outXsampaString.
It is the responsibility of the caller to free the allocated string.
Increment through the string. For each base & combination convert it to the XSAMP equivalent.
Because of the XSAMPA limitations, not all IPA characters will be covered. */
XPnt = (char *) malloc(6);
xsize = (4 * ipaStringSize) + 8; /* assume more than double size */
*outXsampaString = (char *) malloc( xsize ); /* allocate return string */
*outXsampaString[0] = 0;
xsize = 0; /* clear final */
for (ipidx = 0; ipidx < ipaStringSize; ipidx ++) { /* for each IPA code */
CnvIPAPnt( ipaString[ipidx], XPnt ); /* get converted character */
strcat((char *)*outXsampaString, XPnt ); /* concatenate XSAMPA */
}
free(XPnt);
xsize = strlen(*outXsampaString); /* get the final length */
return xsize;
}
/* Google Engine API function implementations */
/** init
 *  Allocate the Pico memory block, initialize the Pico system inside it and
 *  remember the callback that will receive generated samples.
 *  No language is loaded after this call.
 *  synthDoneCBPtr - Pointer to callback function which will receive generated samples
 *  return tts_result
*/
tts_result TtsEngine::init( synthDoneCB_t synthDoneCBPtr )
{
    /* Without a callback there is nowhere to deliver audio. */
    if (synthDoneCBPtr == NULL) {
        LOGE("Callback pointer is NULL");
        return TTS_FAILURE;
    }

    picoMemArea = malloc( PICO_MEM_SIZE );
    if (picoMemArea == NULL) {
        LOGE("Failed to allocate memory for Pico system");
        return TTS_FAILURE;
    }

    const pico_Status status = pico_initialize( picoMemArea, PICO_MEM_SIZE, &picoSystem );
    if (status != PICO_OK) {
        LOGE("Failed to initialize Pico system");
        /* Give the block back; the system never took ownership of it. */
        free( picoMemArea );
        picoMemArea = NULL;
        return TTS_FAILURE;
    }

    picoSynthDoneCBPtr   = synthDoneCBPtr;
    picoCurrentLangIndex = -1;

    return TTS_SUCCESS;
}
/** shutdown
 *  Release everything init() and the language switches set up: the loaded Pico
 *  resources, the Pico system, the Pico memory block and the name strings.
 *  return tts_result
*/
tts_result TtsEngine::shutdown( void )
{
    /* Engine and lingware resources go first, while the system still exists. */
    cleanResources();

    if (picoSystem != NULL) {
        pico_terminate( &picoSystem );
        picoSystem = NULL;
    }

    if (picoMemArea != NULL) {
        free( picoMemArea );
        picoMemArea = NULL;
    }

    cleanFiles();

    return TTS_SUCCESS;
}
/** loadLanguage
 *  Load a new language. This implementation does not load anything and always
 *  reports failure.
 *  @lang - string with ISO 3 letter language code.
 *  @country - string with ISO 3 letter country code .
 *  @variant - string with language variant for that language and country pair.
 *  return tts_result (always TTS_FAILURE)
*/
tts_result TtsEngine::loadLanguage(const char *lang, const char *country, const char *variant)
{
    /* The arguments are intentionally ignored. */
    (void) lang;
    (void) country;
    (void) variant;

    //return setProperty("language", value, size);
    return TTS_FAILURE;
}
/** setLanguage
 *  Load a new language (locale). Use the ISO 639-3 language codes.
 *  Matching rules:
 *    - no voice with this language:            failure;
 *    - language matches, country does not:     the first voice for the language is loaded;
 *    - language and country both match:        that voice is loaded.
 *  @lang - string with ISO 639-3 language code.
 *  @country - string with ISO 3 letter country code.
 *  @variant - string with language variant for that language and country pair (unused).
 *  return tts_result
*/
tts_result TtsEngine::setLanguage( const char * lang, const char * country, const char * variant )
{
    if (lang == NULL) {
        LOGE("TtsEngine::setLanguage called with NULL language");
        return TTS_FAILURE;
    }

    /* Step 1: first voice whose language code matches. */
    int firstLangMatch = -1;
    for (int idx = 0; (idx < picoNumSupportedVocs) && (firstLangMatch < 0); idx++) {
        if (strcmp(lang, picoSupportedLangIso3[idx]) == 0) {
            firstLangMatch = idx;
        }
    }
    if (firstLangMatch < 0) {
        /* The language isn't supported. */
        LOGE("TtsEngine::setLanguage called with unsupported language");
        return TTS_FAILURE;
    }

    /* Step 2: refine with the country, if one was given. */
    int selected = firstLangMatch;
    if (country != NULL) {
        int exactMatch = -1;
        for (int idx = firstLangMatch; idx < picoNumSupportedVocs; idx++) {
            if (strcmp(lang, picoSupportedLangIso3[idx]) != 0) {
                continue;
            }
            if (strcmp(country, picoSupportedCountryIso3[idx]) != 0) {
                continue;
            }
            exactMatch = idx;
            break;
        }

        if (exactMatch >= 0) {
            /* We have a match on both the language and the country. */
            selected = exactMatch;
        } else {
            /* Only the language matched; fall back to the first voice for it. */
            LOGI("TtsEngine::setLanguage found matching language(%s) but not matching country(%s).",
                    lang, country);
        }
    }

    return doLanguageSwitchFromLangIndex( selected );   /* switch the language */
}
/** isLanguageAvailable
 *  Returns the level of support for a language.
 *  The language is matched first, then (if given and non-empty) the country.
 *  Whatever voice is selected, its lingware files must be installed, otherwise
 *  TTS_LANG_MISSING_DATA is reported.
 *  @lang - string with ISO 3 letter language code.
 *  @country - string with ISO 3 letter country code .
 *  @variant - string with language variant for that language and country pair.
 *  return tts_support_result
*/
tts_support_result TtsEngine::isLanguageAvailable(const char *lang, const char *country,
        const char *variant) {
    if (lang == NULL) {
        LOGE("TtsEngine::isLanguageAvailable called with no language");
        return TTS_LANG_NOT_SUPPORTED;
    }

    // first voice whose language code matches
    int langMatch = -1;
    for (int idx = 0; (idx < picoNumSupportedVocs) && (langMatch < 0); idx++) {
        if (strcmp(lang, picoSupportedLangIso3[idx]) == 0) {
            langMatch = idx;
        }
    }
    if (langMatch < 0) {
        // language isn't supported
        LOGV("TtsEngine::isLanguageAvailable called with unsupported language");
        return TTS_LANG_NOT_SUPPORTED;
    }

    // voice matching both language and country, only searched when a country was given
    int exactMatch = -1;
    const bool countryGiven = (country != NULL) && (strlen(country) != 0);
    if (countryGiven) {
        for (int idx = langMatch; idx < picoNumSupportedVocs; idx++) {
            if (strcmp(lang, picoSupportedLangIso3[idx]) != 0) {
                continue;
            }
            if (strcmp(country, picoSupportedCountryIso3[idx]) != 0) {
                continue;
            }
            exactMatch = idx;
            break;
        }
    }

    if (exactMatch < 0) {
        // language-only match: check installation of the matched language
        if (hasResourcesForLanguage(langMatch)) {
            return TTS_LANG_AVAILABLE;
        }
        return TTS_LANG_MISSING_DATA;
    }

    // language + country match: check installation of that voice
    // no variants supported in this library, TTS_LANG_COUNTRY_VAR_AVAILABLE cannot be returned.
    if (hasResourcesForLanguage(exactMatch)) {
        return TTS_LANG_COUNTRY_AVAILABLE;
    }
    return TTS_LANG_MISSING_DATA;
}
/** getLanguage
 *  Get the currently loaded language - if any.
 *  All three output buffers are always written; the variant is always empty
 *  because this implementation has no variants.
 *  @lang - string with current ISO 3 letter language code, empty string if no loaded language.
 *  @country - string with current ISO 3 letter country code, empty string if no loaded language.
 *  @variant - string with current language variant, empty string if no loaded language.
 *  return tts_result
*/
tts_result TtsEngine::getLanguage(char *language, char *country, char *variant)
{
    const bool loaded = (picoCurrentLangIndex != -1);

    strcpy(language, loaded ? picoSupportedLangIso3[picoCurrentLangIndex] : "\0");
    strcpy(country, loaded ? picoSupportedCountryIso3[picoCurrentLangIndex] : "\0");
    // no variant in this implementation
    strcpy(variant, "\0");

    return TTS_SUCCESS;
}
/** setAudioFormat
 *  Report the audio format used for synthesis. The requested values are
 *  ignored; every parameter is overwritten with the fixed format of this
 *  engine (16-bit PCM, 16000 Hz, 1 channel).
 *  @encoding - reference to encoding format
 *  @rate - reference to sample rate
 *  @channels - reference to number of channels
 *  return tts_result
 * */
tts_result TtsEngine::setAudioFormat(AudioSystem::audio_format& encoding, uint32_t& rate,
        int& channels)
{
    // the enforced audio parameters are fixed, whatever the caller asked for
    channels = 1;
    rate     = 16000;
    encoding = AudioSystem::PCM_16_BIT;

    return TTS_SUCCESS;
}
/** setProperty
 *  Set property. The supported properties are: language, rate, pitch and volume.
 *  The property name is matched by prefix. Numeric values are parsed with
 *  atoi() and clamped into the supported range of the property.
 *  @property - name of property to set
 *  @value - value to set
 *  @size - size of value
 *  return tts_result
*/
tts_result TtsEngine::setProperty( const char * property, const char * value, const size_t size )
{
    /* Sanity check */
    if (property == NULL) {
        LOGE("setProperty called with property NULL");
        return TTS_PROPERTY_UNSUPPORTED;
    }
    if (value == NULL) {
        LOGE("setProperty called with value NULL");
        return TTS_VALUE_INVALID;
    }

    if (strncmp(property, "language", 8) == 0) {
        /* A locale is either "xx" or "xx-rYY". */
        const size_t valueLen = strlen(value);
        if ((valueLen != 2) && (valueLen != 6)) {
            LOGE("change language called with incorrect format");
            return TTS_VALUE_INVALID;
        }

        /* Try to switch to specified language. */
        if (doLanguageSwitch(value) != TTS_FAILURE) {
            return TTS_SUCCESS;
        }
        LOGE("failed to load language");
        return TTS_FAILURE;
    }

    if (strncmp(property, "rate", 4) == 0) {
        const int requested = atoi(value);
        picoProp_currRate = (requested < PICO_MIN_RATE) ? PICO_MIN_RATE
                          : (requested > PICO_MAX_RATE) ? PICO_MAX_RATE
                          : requested;
        return TTS_SUCCESS;
    }

    if (strncmp(property, "pitch", 5) == 0) {
        const int requested = atoi(value);
        picoProp_currPitch = (requested < PICO_MIN_PITCH) ? PICO_MIN_PITCH
                           : (requested > PICO_MAX_PITCH) ? PICO_MAX_PITCH
                           : requested;
        return TTS_SUCCESS;
    }

    if (strncmp(property, "volume", 6) == 0) {
        const int requested = atoi(value);
        picoProp_currVolume = (requested < PICO_MIN_VOLUME) ? PICO_MIN_VOLUME
                            : (requested > PICO_MAX_VOLUME) ? PICO_MAX_VOLUME
                            : requested;
        return TTS_SUCCESS;
    }

    /* None of the supported properties. */
    return TTS_PROPERTY_UNSUPPORTED;
}
/** picoCopyPropertyOut
 *  Copy a property value into the caller's buffer if it fits.
 *  @src    - NUL-terminated value to return
 *  @value  - caller's buffer
 *  @iosize - in: size of the caller's buffer; out: required size when too small
 *  return TTS_SUCCESS, or TTS_PROPERTY_SIZE_TOO_SMALL if the buffer is too small
*/
static tts_result picoCopyPropertyOut( const char * src, char * value, size_t * iosize )
{
    const size_t needed = strlen(src) + 1;
    if (*iosize < needed) {
        *iosize = needed;
        return TTS_PROPERTY_SIZE_TOO_SMALL;
    }
    memcpy(value, src, needed);
    return TTS_SUCCESS;
}

/** getProperty
 *  Get the property. Supported properties are: language, rate, pitch and volume.
 *  The value returned is the one previously set by setProperty, or the default.
 *  The property name is matched by prefix.
 *  @property - name of property to get
 *  @value - buffer which will receive value of property
 *  @iosize - size of value - if size is too small on return this will contain actual size needed
 *  return tts_result
*/
tts_result TtsEngine::getProperty( const char * property, char * value, size_t * iosize )
{
    /* Large enough for any int printed with %d, so the formatting can never overflow. */
    char numberText[16];

    /* sanity check */
    if (property == NULL) {
        LOGE("getProperty called with property NULL");
        return TTS_PROPERTY_UNSUPPORTED;
    }
    if (value == NULL) {
        LOGE("getProperty called with value NULL");
        return TTS_VALUE_INVALID;
    }
    if (iosize == NULL) {
        /* The size of the buffer is unknown, so nothing can be written safely. */
        LOGE("getProperty called with iosize NULL");
        return TTS_VALUE_INVALID;
    }

    if (strncmp(property, "language", 8) == 0) {
        /* An empty string is reported while no language is loaded. */
        const char * current = (picoProp_currLang != NULL) ? picoProp_currLang : "";
        return picoCopyPropertyOut( current, value, iosize );
    }

    if (strncmp(property, "rate", 4) == 0) {
        snprintf(numberText, sizeof(numberText), "%d", picoProp_currRate);
        return picoCopyPropertyOut( numberText, value, iosize );
    }

    if (strncmp(property, "pitch", 5) == 0) {
        snprintf(numberText, sizeof(numberText), "%d", picoProp_currPitch);
        return picoCopyPropertyOut( numberText, value, iosize );
    }

    if (strncmp(property, "volume", 6) == 0) {
        snprintf(numberText, sizeof(numberText), "%d", picoProp_currVolume);
        return picoCopyPropertyOut( numberText, value, iosize );
    }

    /* Unknown property */
    LOGE("Unsupported property");
    return TTS_PROPERTY_UNSUPPORTED;
}
/** synthesizeText
 *  Synthesizes a text string.
 *  The text string could be annotated with SSML tags: input starting with "<speak"
 *  or "<?xml" is run through the SSML parser and the engine is switched to the
 *  language of the document; any other input is synthesized as is.
 *  Samples are collected in buffer and handed to the synthesis callback with
 *  TTS_SYNTH_PENDING each time the buffer cannot take the next engine chunk and
 *  after each piece of text has been processed. Once the text has been fed to the
 *  engine, every exit sends one final TTS_SYNTH_DONE callback, except a failing
 *  pico_putTextUtf8, which returns without it.
 *  @text - text to synthesize
 *  @buffer - buffer which will receive generated samples
 *  @bufferSize - size of buffer in bytes; must be at least MAX_OUTBUF_SIZE
 *  @userdata - pointer to user data which will be passed back to callback function
 *  return tts_result
 *      TTS_SUCCESS - the whole text was synthesized, or a stop request was seen
 *                    before the next piece of text was fed to the engine
 *      TTS_FAILURE - invalid arguments, SSML/language/allocation problems, an engine
 *                    error, or a stop/halt request seen while samples were being
 *                    retrieved (result kept as callers have always received it)
*/
tts_result TtsEngine::synthesizeText( const char * text, int8_t * buffer, size_t bufferSize, void * userdata )
{
    /* pico_putTextUtf8 takes the text size as a pico_Int16, so longer text has to be
       fed in pieces of at most this many bytes. */
    const size_t maxFeedBytes = 32767;

    int         err;
    int         cbret;
    pico_Char * inp = NULL;
    pico_Char * local_text = NULL;
    short       outbuf[MAX_OUTBUF_SIZE/2];
    pico_Int16  bytes_sent, bytes_recv, out_data_type;
    pico_Status ret;
    SvoxSsmlParser * parser = NULL;
    size_t      text_remaining;
    size_t      bufused = 0;
    bool        aborted = false;

    picoSynthAbort = 0;

    if (text == NULL) {
        LOGE("synthesizeText called with NULL string");
        return TTS_FAILURE;
    }
    if (buffer == NULL) {
        LOGE("synthesizeText called with NULL buffer");
        return TTS_FAILURE;
    }
    if (bufferSize < (size_t) MAX_OUTBUF_SIZE) {
        /* One pico_getData call may deliver up to MAX_OUTBUF_SIZE bytes; a smaller
           buffer would be overrun when such a chunk is copied into it. */
        LOGE("synthesizeText called with a buffer smaller than one engine chunk");
        return TTS_FAILURE;
    }

    if ( (strncmp(text, "<speak", 6) == 0) || (strncmp(text, "<?xml", 5) == 0) ) {
        /* SSML input */
        parser = new SvoxSsmlParser();
        if (parser && parser->initSuccessful()) {
            err = parser->parseDocument(text, 1);
            if (err == XML_STATUS_ERROR) {
                /* Note: for some reason expat always thinks the input document has an error
                   at the end, even when the XML document is perfectly formed */
                LOGI("Warning: SSML document parsed with errors");
            }
            char * parsed_text = parser->getParsedDocument();
            if (parsed_text) {
                /* Add property tags to the string - if any. */
                local_text = (pico_Char *) doAddProperties( parsed_text );
                if (!local_text) {
                    LOGE("Failed to allocate memory for text string");
                    delete parser;
                    return TTS_FAILURE;
                }
                char * lang = parser->getParsedDocumentLanguage();
                if (doLanguageSwitch(lang) == TTS_FAILURE) {
                    LOGE("Failed to switch to language specified in SSML document.");
                    /* The tagged copy is ours and must not leak on this path. */
                    free( local_text );
                    delete parser;
                    return TTS_FAILURE;
                }
                delete parser;
            } else {
                LOGE("Failed to parse SSML document");
                delete parser;
                return TTS_FAILURE;
            }
        } else {
            LOGE("Failed to create SSML parser");
            if (parser) {
                delete parser;
            }
            return TTS_FAILURE;
        }
    } else {
        /* Add property tags to the string - if any. */
        local_text = (pico_Char *) doAddProperties( text );
        if (!local_text) {
            LOGE("Failed to allocate memory for text string");
            return TTS_FAILURE;
        }
    }

    /* The terminating NUL is fed to the engine together with the text. */
    text_remaining = strlen((const char *) local_text) + 1;
    inp = local_text;

    /* synthesis loop */
    while (text_remaining > 0) {
        if (picoSynthAbort) {
            ret = pico_resetEngine( picoEngine );
            break;
        }

        /* Feed the text into the engine; it reports how much it accepted. */
        const pico_Int16 feed =
                (pico_Int16) ((text_remaining < maxFeedBytes) ? text_remaining : maxFeedBytes);
        ret = pico_putTextUtf8( picoEngine, inp, feed, &bytes_sent );
        if (ret != PICO_OK) {
            LOGE("Error synthesizing string '%s': [%d]", text, ret);
            free( local_text );
            return TTS_FAILURE;
        }
        text_remaining -= (size_t) bytes_sent;
        inp += bytes_sent;

        do {
            if (picoSynthAbort) {
                aborted = true;
                break;
            }
            /* Retrieve the samples and add them to the buffer. */
            ret = pico_getData( picoEngine, (void *) outbuf, MAX_OUTBUF_SIZE, &bytes_recv,
                    &out_data_type );
            if (bytes_recv > 0) {
                if ((bufused + (size_t) bytes_recv) > bufferSize) {
                    /* The buffer filled; pass this on to the callback function. */
                    cbret = picoSynthDoneCBPtr(userdata, 16000, AudioSystem::PCM_16_BIT, 1, buffer,
                            bufused, TTS_SYNTH_PENDING);
                    bufused = 0;
                    if (cbret == TTS_CALLBACK_HALT) {
                        LOGI("Halt requested by caller. Halting.");
                        aborted = true;
                        break;
                    }
                }
                memcpy(buffer + bufused, (int8_t *) outbuf, (size_t) bytes_recv);
                bufused += (size_t) bytes_recv;
            }
        } while (PICO_STEP_BUSY == ret);

        picoSynthAbort = 0;

        if (aborted) {
            /* Stopped on request: drop what the engine still holds so that the next
               request starts from a clean state, and do not report it as an engine
               error. Samples not yet delivered go out with the final callback. */
            pico_resetEngine( picoEngine );
            free( local_text );
            LOGV("Synth loop: sending TTS_SYNTH_DONE after stop request");
            picoSynthDoneCBPtr( userdata, 16000, AudioSystem::PCM_16_BIT, 1, buffer, bufused,
                    TTS_SYNTH_DONE);
            return TTS_FAILURE;
        }

        /* This chunk of synthesis is finished; pass the remaining samples.
           Use 16 KHz, 16-bit samples. */
        picoSynthDoneCBPtr( userdata, 16000, AudioSystem::PCM_16_BIT, 1, buffer, bufused,
                TTS_SYNTH_PENDING);
        /* These samples are delivered now; they must not be handed out again. */
        bufused = 0;

        if (ret != PICO_STEP_IDLE) {
            LOGE("Error occurred during synthesis [%d]", ret);
            free( local_text );
            LOGV("Synth loop: sending TTS_SYNTH_DONE after error");
            picoSynthDoneCBPtr( userdata, 16000, AudioSystem::PCM_16_BIT, 1, buffer, bufused,
                    TTS_SYNTH_DONE);
            return TTS_FAILURE;
        }
    }

    /* Synthesis is done; notify the caller */
    LOGV("Synth loop: sending TTS_SYNTH_DONE after all done, or was asked to stop");
    picoSynthDoneCBPtr( userdata, 16000, AudioSystem::PCM_16_BIT, 1, buffer, bufused,
            TTS_SYNTH_DONE);

    free( local_text );
    return TTS_SUCCESS;
}
/** synthesizeIpa
 *  Deprecated entry point for synthesizing a phonetic string in IPA format.
 *  Background from the original design: the Pico engine understands only XSAMPA,
 *  so the IPA would have to be converted, parceled into words (the engine can only
 *  handle a word at a time) and each phonemic word wrapped with a tag.
 *  None of this is done here: no argument is read, nothing is synthesized and
 *  the call always fails.
 *  @ipa - phonetic string to synthesize (ignored)
 *  @buffer - buffer which would receive generated samples (ignored)
 *  @bufferSize - size of buffer (ignored)
 *  @userdata - pointer to user data for the callback function (ignored)
 *  return tts_result - always TTS_FAILURE
*/
tts_result TtsEngine::synthesizeIpa( const char * ipa, int8_t * buffer, size_t bufferSize, void * userdata )
{
    /* The arguments are intentionally unused. */
    (void) ipa;
    (void) buffer;
    (void) bufferSize;
    (void) userdata;

    /* deprecated call */
    const tts_result outcome = TTS_FAILURE;
    return outcome;
}
/** stop
 *  Requests that the running synthesis be aborted.
 *  Only the abort flag is raised here; synthesizeText checks that flag in its
 *  synthesis loop and ends the synthesis when it sees it.
 *  return tts_result - always TTS_SUCCESS
*/
tts_result TtsEngine::stop( void )
{
    const tts_result outcome = TTS_SUCCESS;

    /* Raise the abort request for the synthesis loop. */
    picoSynthAbort = 1;

    return outcome;
}
#ifdef __cplusplus
extern "C" {
#endif

/** getTtsEngine
 *  Factory with C linkage: creates a new TtsEngine with new and returns it.
 *  return pointer to the newly created engine
*/
TtsEngine * getTtsEngine( void )
{
    TtsEngine * engine = new TtsEngine();
    return engine;
}

#ifdef __cplusplus
}
#endif