PageRenderTime 132ms CodeModel.GetById 15ms app.highlight 104ms RepoModel.GetById 1ms app.codeStats 0ms

/native/external/espeak/src/translate.cpp

http://eyes-free.googlecode.com/
C++ | 2355 lines | 2055 code | 178 blank | 122 comment | 356 complexity | 61c8975e020ed42b82768670e4c3515c MD5 | raw file

Large files are truncated, but you can click here to view the full file

   1/***************************************************************************
   2 *   Copyright (C) 2005 to 2007 by Jonathan Duddington                     *
   3 *   email: jonsd@users.sourceforge.net                                    *
   4 *                                                                         *
   5 *   This program is free software; you can redistribute it and/or modify  *
   6 *   it under the terms of the GNU General Public License as published by  *
   7 *   the Free Software Foundation; either version 3 of the License, or     *
   8 *   (at your option) any later version.                                   *
   9 *                                                                         *
  10 *   This program is distributed in the hope that it will be useful,       *
  11 *   but WITHOUT ANY WARRANTY; without even the implied warranty of        *
  12 *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the         *
  13 *   GNU General Public License for more details.                          *
  14 *                                                                         *
  15 *   You should have received a copy of the GNU General Public License     *
  16 *   along with this program; if not, see:                                 *
  17 *               <http://www.gnu.org/licenses/>.                           *
  18 ***************************************************************************/
  19
  20#include "StdAfx.h"
  21
  22#include <stdio.h>
  23#include <ctype.h>
  24#include <stdlib.h>
  25#include <string.h>
  26
  27#include <wctype.h>
  28////#include <wchar.h>
  29
  30#include "speak_lib.h"
  31#include "speech.h"
  32#include "phoneme.h"
  33#include "synthesize.h"
  34#include "voice.h"
  35#include "translate.h"
  36
  37#define WORD_STRESS_CHAR   '*'
  38
  39
  40Translator *translator = NULL;    // the main translator
  41Translator *translator2 = NULL;   // secondary translator for certain words
  42static char translator2_language[20] = {0};
  43
  44FILE *f_trans = NULL;     // phoneme output text
  45int option_tone1 = 0;
  46int option_tone2 = 0;
  47int option_tone_flags = 0;   // bit 8=emphasize allcaps, bit 9=emphasize penultimate stress
  48int option_phonemes = 0;
  49int option_phoneme_events = 0;
  50int option_quiet = 0;
  51int option_endpause = 0;  // suppress pause after end of text
  52int option_capitals = 0;
  53int option_punctuation = 0;
  54int option_sayas = 0;
  55int option_sayas2 = 0;  // used in translate_clause()
  56int option_emphasis = 0;  // 0=normal, 1=normal, 2=weak, 3=moderate, 4=strong
  57int option_ssml = 0;
  58int option_phoneme_input = 1;  // allow [[phonemes]] in input
  59int option_phoneme_variants = 0;  // 0= don't display phoneme variant mnemonics
  60int option_wordgap = 0;
  61
  62int count_sayas_digits;
  63int skip_sentences;
  64int skip_words;
  65int skip_characters;
  66char skip_marker[N_MARKER_LENGTH];
  67int skipping_text;   // waiting until word count, sentence count, or named marker is reached
  68int end_character_position;
  69int count_sentences;
  70int count_words;
  71int clause_start_char;
  72int clause_start_word;
  73int new_sentence;
  74int word_emphasis = 0;    // set if emphasis level 3 or 4
  75
  76int prev_clause_pause=0;
  77int max_clause_pause = 0;
  78
  79
  80wchar_t option_punctlist[N_PUNCTLIST]={0};
  81char ctrl_embedded = '\001';    // to allow an alternative CTRL for embedded commands
  82int option_multibyte=espeakCHARS_AUTO;   // 0=auto, 1=utf8, 2=8bit, 3=wchar
  83
  84// these are overridden by defaults set in the "speak" file
  85int option_linelength = 0;
  86
  87#define N_EMBEDDED_LIST  250
  88static int embedded_ix;
  89static int embedded_read;
  90unsigned int embedded_list[N_EMBEDDED_LIST];
  91
  92// the source text of a single clause (UTF8 bytes)
  93#define N_TR_SOURCE    700
  94static char source[N_TR_SOURCE+40];     // extra space for embedded command & voice change info at end
  95
  96int n_replace_phonemes;
  97REPLACE_PHONEMES replace_phonemes[N_REPLACE_PHONEMES];
  98
  99
 100// brackets, also 0x2014 to 0x201f which don't need to be in this list
 101static const unsigned short brackets[] = {
 102'(',')','[',']','{','}','<','>','"','\'','`',
 1030xab,0xbb,  // double angle brackets
 1040x300a,0x300b,  // double angle brackets (ideograph)
 1050};
 106
 107// other characters which break a word, but don't produce a pause
 108static const unsigned short breaks[] = {'_', 0};
 109
 110// treat these characters as spaces, in addition to iswspace()
 111static const wchar_t chars_space[] = {0x2500,0};  // box drawing horiz
 112
 113
 114// Translate character codes 0xA0 to 0xFF into their unicode values
 115// ISO_8859_1 is set as default
 116static const unsigned short ISO_8859_1[0x60] = {
 117   0x00a0, 0x00a1, 0x00a2, 0x00a3, 0x00a4, 0x00a5, 0x00a6, 0x00a7, // a0
 118   0x00a8, 0x00a9, 0x00aa, 0x00ab, 0x00ac, 0x00ad, 0x00ae, 0x00af, // a8
 119   0x00b0, 0x00b1, 0x00b2, 0x00b3, 0x00b4, 0x00b5, 0x00b6, 0x00b7, // b0
 120   0x00b8, 0x00b9, 0x00ba, 0x00bb, 0x00bc, 0x00bd, 0x00be, 0x00bf, // b8
 121   0x00c0, 0x00c1, 0x00c2, 0x00c3, 0x00c4, 0x00c5, 0x00c6, 0x00c7, // c0
 122   0x00c8, 0x00c9, 0x00ca, 0x00cb, 0x00cc, 0x00cd, 0x00ce, 0x00cf, // c8
 123   0x00d0, 0x00d1, 0x00d2, 0x00d3, 0x00d4, 0x00d5, 0x00d6, 0x00d7, // d0
 124   0x00d8, 0x00d9, 0x00da, 0x00db, 0x00dc, 0x00dd, 0x00de, 0x00df, // d8
 125   0x00e0, 0x00e1, 0x00e2, 0x00e3, 0x00e4, 0x00e5, 0x00e6, 0x00e7, // e0
 126   0x00e8, 0x00e9, 0x00ea, 0x00eb, 0x00ec, 0x00ed, 0x00ee, 0x00ef, // e8
 127   0x00f0, 0x00f1, 0x00f2, 0x00f3, 0x00f4, 0x00f5, 0x00f6, 0x00f7, // f0
 128   0x00f8, 0x00f9, 0x00fa, 0x00fb, 0x00fc, 0x00fd, 0x00fe, 0x00ff, // f8
 129};
 130
 131static const unsigned short ISO_8859_2[0x60] = {
 132   0x00a0, 0x0104, 0x02d8, 0x0141, 0x00a4, 0x013d, 0x015a, 0x00a7, // a0
 133   0x00a8, 0x0160, 0x015e, 0x0164, 0x0179, 0x00ad, 0x017d, 0x017b, // a8
 134   0x00b0, 0x0105, 0x02db, 0x0142, 0x00b4, 0x013e, 0x015b, 0x02c7, // b0
 135   0x00b8, 0x0161, 0x015f, 0x0165, 0x017a, 0x02dd, 0x017e, 0x017c, // b8
 136   0x0154, 0x00c1, 0x00c2, 0x0102, 0x00c4, 0x0139, 0x0106, 0x00c7, // c0
 137   0x010c, 0x00c9, 0x0118, 0x00cb, 0x011a, 0x00cd, 0x00ce, 0x010e, // c8
 138   0x0110, 0x0143, 0x0147, 0x00d3, 0x00d4, 0x0150, 0x00d6, 0x00d7, // d0
 139   0x0158, 0x016e, 0x00da, 0x0170, 0x00dc, 0x00dd, 0x0162, 0x00df, // d8
 140   0x0155, 0x00e1, 0x00e2, 0x0103, 0x00e4, 0x013a, 0x0107, 0x00e7, // e0
 141   0x010d, 0x00e9, 0x0119, 0x00eb, 0x011b, 0x00ed, 0x00ee, 0x010f, // e8
 142   0x0111, 0x0144, 0x0148, 0x00f3, 0x00f4, 0x0151, 0x00f6, 0x00f7, // f0
 143   0x0159, 0x016f, 0x00fa, 0x0171, 0x00fc, 0x00fd, 0x0163, 0x02d9, // f8
 144};
 145
 146static const unsigned short ISO_8859_3[0x60] = {
 147   0x00a0, 0x0126, 0x02d8, 0x00a3, 0x00a4, 0x0000, 0x0124, 0x00a7, // a0
 148   0x00a8, 0x0130, 0x015e, 0x011e, 0x0134, 0x00ad, 0x0000, 0x017b, // a8
 149   0x00b0, 0x0127, 0x00b2, 0x00b3, 0x00b4, 0x00b5, 0x0125, 0x00b7, // b0
 150   0x00b8, 0x0131, 0x015f, 0x011f, 0x0135, 0x00bd, 0x0000, 0x017c, // b8
 151   0x00c0, 0x00c1, 0x00c2, 0x0000, 0x00c4, 0x010a, 0x0108, 0x00c7, // c0
 152   0x00c8, 0x00c9, 0x00ca, 0x00cb, 0x00cc, 0x00cd, 0x00ce, 0x00cf, // c8
 153   0x0000, 0x00d1, 0x00d2, 0x00d3, 0x00d4, 0x0120, 0x00d6, 0x00d7, // d0
 154   0x011c, 0x00d9, 0x00da, 0x00db, 0x00dc, 0x016c, 0x015c, 0x00df, // d8
 155   0x00e0, 0x00e1, 0x00e2, 0x0000, 0x00e4, 0x010b, 0x0109, 0x00e7, // e0
 156   0x00e8, 0x00e9, 0x00ea, 0x00eb, 0x00ec, 0x00ed, 0x00ee, 0x00ef, // e8
 157   0x0000, 0x00f1, 0x00f2, 0x00f3, 0x00f4, 0x0121, 0x00f6, 0x00f7, // f0
 158   0x011d, 0x00f9, 0x00fa, 0x00fb, 0x00fc, 0x016d, 0x015d, 0x02d9, // f8
 159};
 160
 161static const unsigned short ISO_8859_4[0x60] = {
 162   0x00a0, 0x0104, 0x0138, 0x0156, 0x00a4, 0x0128, 0x013b, 0x00a7, // a0
 163   0x00a8, 0x0160, 0x0112, 0x0122, 0x0166, 0x00ad, 0x017d, 0x00af, // a8
 164   0x00b0, 0x0105, 0x02db, 0x0157, 0x00b4, 0x0129, 0x013c, 0x02c7, // b0
 165   0x00b8, 0x0161, 0x0113, 0x0123, 0x0167, 0x014a, 0x017e, 0x014b, // b8
 166   0x0100, 0x00c1, 0x00c2, 0x00c3, 0x00c4, 0x00c5, 0x00c6, 0x012e, // c0
 167   0x010c, 0x00c9, 0x0118, 0x00cb, 0x0116, 0x00cd, 0x00ce, 0x012a, // c8
 168   0x0110, 0x0145, 0x014c, 0x0136, 0x00d4, 0x00d5, 0x00d6, 0x00d7, // d0
 169   0x00d8, 0x0172, 0x00da, 0x00db, 0x00dc, 0x0168, 0x016a, 0x00df, // d8
 170   0x0101, 0x00e1, 0x00e2, 0x00e3, 0x00e4, 0x00e5, 0x00e6, 0x012f, // e0
 171   0x010d, 0x00e9, 0x0119, 0x00eb, 0x0117, 0x00ed, 0x00ee, 0x012b, // e8
 172   0x0111, 0x0146, 0x014d, 0x0137, 0x00f4, 0x00f5, 0x00f6, 0x00f7, // f0
 173   0x00f8, 0x0173, 0x00fa, 0x00fb, 0x00fc, 0x0169, 0x016b, 0x02d9, // f8
 174};
 175
 176static const unsigned short ISO_8859_5[0x60] = {
 177   0x00a0, 0x0401, 0x0402, 0x0403, 0x0404, 0x0405, 0x0406, 0x0407, // a0  Cyrillic
 178   0x0408, 0x0409, 0x040a, 0x040b, 0x040c, 0x00ad, 0x040e, 0x040f, // a8
 179   0x0410, 0x0411, 0x0412, 0x0413, 0x0414, 0x0415, 0x0416, 0x0417, // b0
 180   0x0418, 0x0419, 0x041a, 0x041b, 0x041c, 0x041d, 0x041e, 0x041f, // b8
 181   0x0420, 0x0421, 0x0422, 0x0423, 0x0424, 0x0425, 0x0426, 0x0427, // c0
 182   0x0428, 0x0429, 0x042a, 0x042b, 0x042c, 0x042d, 0x042e, 0x042f, // c8
 183   0x0430, 0x0431, 0x0432, 0x0433, 0x0434, 0x0435, 0x0436, 0x0437, // d0
 184   0x0438, 0x0439, 0x043a, 0x043b, 0x043c, 0x043d, 0x043e, 0x043f, // d8
 185   0x0440, 0x0441, 0x0442, 0x0443, 0x0444, 0x0445, 0x0446, 0x0447, // e0
 186   0x0448, 0x0449, 0x044a, 0x044b, 0x044c, 0x044d, 0x044e, 0x044f, // e8
 187   0x2116, 0x0451, 0x0452, 0x0453, 0x0454, 0x0455, 0x0456, 0x0457, // f0
 188   0x0458, 0x0459, 0x045a, 0x045b, 0x045c, 0x00a7, 0x045e, 0x045f, // f8
 189};
 190
 191static const unsigned short ISO_8859_7[0x60] = {
 192   0x00a0, 0x2018, 0x2019, 0x00a3, 0x20ac, 0x20af, 0x00a6, 0x00a7, // a0  Greek
 193   0x00a8, 0x00a9, 0x037a, 0x00ab, 0x00ac, 0x00ad, 0x0000, 0x2015, // a8
 194   0x00b0, 0x00b1, 0x00b2, 0x00b3, 0x0384, 0x0385, 0x0386, 0x00b7, // b0
 195   0x0388, 0x0389, 0x038a, 0x00bb, 0x038c, 0x00bd, 0x038e, 0x038f, // b8
 196   0x0390, 0x0391, 0x0392, 0x0393, 0x0394, 0x0395, 0x0396, 0x0397, // c0
 197   0x0398, 0x0399, 0x039a, 0x039b, 0x039c, 0x039d, 0x039e, 0x039f, // c8
 198   0x03a0, 0x03a1, 0x0000, 0x03a3, 0x03a4, 0x03a5, 0x03a6, 0x03a7, // d0
 199   0x03a8, 0x03a9, 0x03aa, 0x03ab, 0x03ac, 0x03ad, 0x03ae, 0x03af, // d8
 200   0x03b0, 0x03b1, 0x03b2, 0x03b3, 0x03b4, 0x03b5, 0x03b6, 0x03b7, // e0
 201   0x03b8, 0x03b9, 0x03ba, 0x03bb, 0x03bc, 0x03bd, 0x03be, 0x03bf, // e8
 202   0x03c0, 0x03c1, 0x03c2, 0x03c3, 0x03c4, 0x03c5, 0x03c6, 0x03c7, // f0
 203   0x03c8, 0x03c9, 0x03ca, 0x03cb, 0x03cc, 0x03cd, 0x03ce, 0x0000, // f8
 204};
 205
 206static const unsigned short ISO_8859_9[0x60] = {
 207   0x00a0, 0x00a1, 0x00a2, 0x00a3, 0x00a4, 0x00a5, 0x00a6, 0x00a7, // a0
 208   0x00a8, 0x00a9, 0x00aa, 0x00ab, 0x00ac, 0x00ad, 0x00ae, 0x00af, // a8
 209   0x00b0, 0x00b1, 0x00b2, 0x00b3, 0x00b4, 0x00b5, 0x00b6, 0x00b7, // b0
 210   0x00b8, 0x00b9, 0x00ba, 0x00bb, 0x00bc, 0x00bd, 0x00be, 0x00bf, // b8
 211   0x00c0, 0x00c1, 0x00c2, 0x00c3, 0x00c4, 0x00c5, 0x00c6, 0x00c7, // c0
 212   0x00c8, 0x00c9, 0x00ca, 0x00cb, 0x00cc, 0x00cd, 0x00ce, 0x00cf, // c8
 213   0x011e, 0x00d1, 0x00d2, 0x00d3, 0x00d4, 0x00d5, 0x00d6, 0x00d7, // d0
 214   0x00d8, 0x00d9, 0x00da, 0x00db, 0x00dc, 0x0130, 0x015e, 0x00df, // d8
 215   0x00e0, 0x00e1, 0x00e2, 0x00e3, 0x00e4, 0x00e5, 0x00e6, 0x00e7, // e0
 216   0x00e8, 0x00e9, 0x00ea, 0x00eb, 0x00ec, 0x00ed, 0x00ee, 0x00ef, // e8
 217   0x011f, 0x00f1, 0x00f2, 0x00f3, 0x00f4, 0x00f5, 0x00f6, 0x00f7, // f0
 218   0x00f8, 0x00f9, 0x00fa, 0x00fb, 0x00fc, 0x0131, 0x015f, 0x00ff, // f8
 219};
 220
 221static const unsigned short ISO_8859_14[0x60] = {
 222   0x00a0, 0x1e02, 0x1e03, 0x00a3, 0x010a, 0x010b, 0x1e0a, 0x00a7, // a0  Welsh
 223   0x1e80, 0x00a9, 0x1e82, 0x1e0b, 0x1ef2, 0x00ad, 0x00ae, 0x0178, // a8
 224   0x1e1e, 0x1e1f, 0x0120, 0x0121, 0x1e40, 0x1e41, 0x00b6, 0x1e56, // b0
 225   0x1e81, 0x1e57, 0x1e83, 0x1e60, 0x1ef3, 0x1e84, 0x1e85, 0x1e61, // b8
 226   0x00c0, 0x00c1, 0x00c2, 0x00c3, 0x00c4, 0x00c5, 0x00c6, 0x00c7, // c0
 227   0x00c8, 0x00c9, 0x00ca, 0x00cb, 0x00cc, 0x00cd, 0x00ce, 0x00cf, // c8
 228   0x0174, 0x00d1, 0x00d2, 0x00d3, 0x00d4, 0x00d5, 0x00d6, 0x1e6a, // d0
 229   0x00d8, 0x00d9, 0x00da, 0x00db, 0x00dc, 0x00dd, 0x0176, 0x00df, // d8
 230   0x00e0, 0x00e1, 0x00e2, 0x00e3, 0x00e4, 0x00e5, 0x00e6, 0x00e7, // e0
 231   0x00e8, 0x00e9, 0x00ea, 0x00eb, 0x00ec, 0x00ed, 0x00ee, 0x00ef, // e8
 232   0x0175, 0x00f1, 0x00f2, 0x00f3, 0x00f4, 0x00f5, 0x00f6, 0x1e6b, // f0
 233   0x00f8, 0x00f9, 0x00fa, 0x00fb, 0x00fc, 0x00fd, 0x0177, 0x00ff, // f8
 234};
 235
 236static const unsigned short KOI8_R[0x60] = {
 237   0x2550, 0x2551, 0x2552, 0x0451, 0x2553, 0x2554, 0x2555, 0x2556, // a0  Russian
 238   0x2557, 0x2558, 0x2559, 0x255a, 0x255b, 0x255c, 0x255d, 0x255e, // a8
 239   0x255f, 0x2560, 0x2561, 0x0401, 0x2562, 0x2563, 0x2564, 0x2565, // b0
 240   0x2566, 0x2567, 0x2568, 0x2569, 0x256a, 0x256b, 0x256c, 0x00a9, // b8
 241   0x044e, 0x0430, 0x0431, 0x0446, 0x0434, 0x0435, 0x0444, 0x0433, // c0
 242   0x0445, 0x0438, 0x0439, 0x043a, 0x043b, 0x043c, 0x043d, 0x043e, // c8
 243   0x043f, 0x044f, 0x0440, 0x0441, 0x0442, 0x0443, 0x0436, 0x0432, // d0
 244   0x044c, 0x044b, 0x0437, 0x0448, 0x044d, 0x0449, 0x0447, 0x044a, // d8
 245   0x042e, 0x0410, 0x0411, 0x0426, 0x0414, 0x0415, 0x0424, 0x0413, // e0
 246   0x0425, 0x0418, 0x0419, 0x041a, 0x041b, 0x041c, 0x041d, 0x041e, // e8
 247   0x041f, 0x042f, 0x0420, 0x0421, 0x0422, 0x0423, 0x0416, 0x0412, // f0
 248   0x042c, 0x042b, 0x0417, 0x0428, 0x042d, 0x0429, 0x0427, 0x042a, // f8
 249};
 250
 251static const unsigned short ISCII[0x60] = {
 252   0x0020, 0x0901, 0x0902, 0x0903, 0x0905, 0x0906, 0x0907, 0x0908, // a0
 253   0x0909, 0x090a, 0x090b, 0x090e, 0x090f, 0x0910, 0x090d, 0x0912, // a8
 254   0x0913, 0x0914, 0x0911, 0x0915, 0x0916, 0x0917, 0x0918, 0x0919, // b0
 255   0x091a, 0x091b, 0x091c, 0x091d, 0x091e, 0x091f, 0x0920, 0x0921, // b8
 256   0x0922, 0x0923, 0x0924, 0x0925, 0x0926, 0x0927, 0x0928, 0x0929, // c0
 257   0x092a, 0x092b, 0x092c, 0x092d, 0x092e, 0x092f, 0x095f, 0x0930, // c8
 258   0x0931, 0x0932, 0x0933, 0x0934, 0x0935, 0x0936, 0x0937, 0x0938, // d0
 259   0x0939, 0x0020, 0x093e, 0x093f, 0x0940, 0x0941, 0x0942, 0x0943, // d8
 260   0x0946, 0x0947, 0x0948, 0x0945, 0x094a, 0x094b, 0x094c, 0x0949, // e0
 261   0x094d, 0x093c, 0x0964, 0x0020, 0x0020, 0x0020, 0x0020, 0x0020, // e8
 262   0x0020, 0x0030, 0x0031, 0x0032, 0x0033, 0x0034, 0x0035, 0x0036, // f0
 263   0x0037, 0x0038, 0x0039, 0x20,   0x20,   0x20,   0x20,   0x20,   // f8
 264};
 265
 266const unsigned short *charsets[N_CHARSETS] = {
 267	ISO_8859_1,
 268	ISO_8859_1,
 269	ISO_8859_2,
 270	ISO_8859_3,
 271	ISO_8859_4,
 272	ISO_8859_5,
 273	ISO_8859_1,
 274	ISO_8859_7,
 275	ISO_8859_1,
 276	ISO_8859_9,
 277	ISO_8859_1,
 278	ISO_8859_1,
 279	ISO_8859_1,
 280	ISO_8859_1,
 281	ISO_8859_14,
 282	ISO_8859_1,
 283	ISO_8859_1,
 284	ISO_8859_1,
 285	KOI8_R,          // 18
 286	ISCII };
 287
 288// Tables of the relative lengths of vowels, depending on the
 289// type of the two phonemes that follow
 290// indexes are the "length_mod" value for the following phonemes
 291
 292// use this table if vowel is not the last in the word
 293static unsigned char length_mods_en[100] = {
 294/*  a   ,   t   s   n   d   z   r   N   <- next */
 295	100,120,100,105,100,110,110,100, 95, 100,  /* a  <- next2 */
 296	105,120,105,110,125,130,135,115,125, 100,  /* , */
 297	105,120, 75,100, 75,105,120, 85, 75, 100,  /* t */
 298	105,120, 85,105, 95,115,120,100, 95, 100,  /* s */
 299	110,120, 95,105,100,115,120,100,100, 100,  /* n */
 300	105,120,100,105, 95,115,120,110, 95, 100,  /* d */
 301	105,120,100,105,105,122,125,110,105, 100,  /* z */
 302	105,120,100,105,105,122,125,110,105, 100,  /* r */
 303	105,120, 95,105,100,115,120,110,100, 100,  /* N */
 304	100,120,100,100,100,100,100,100,100, 100 }; // SPARE
 305
 306// as above, but for the last syllable in a word
 307static unsigned char length_mods_en0[100] = {
 308/*  a   ,   t   s   n   d   z   r    N  <- next */
 309	100,150,100,105,110,115,110,110,110, 100,  /* a  <- next2 */
 310	105,150,105,110,125,135,140,115,135, 100,  /* , */
 311	105,150, 90,105, 90,122,135,100, 90, 100,  /* t */
 312	105,150,100,105,100,122,135,100,100, 100,  /* s */
 313	105,150,100,105,105,115,135,110,105, 100,  /* n */
 314	105,150,100,105,105,122,130,120,125, 100,  /* d */
 315	105,150,100,105,110,122,125,115,110, 100,  /* z */
 316	105,150,100,105,105,122,135,120,105, 100,  /* r */
 317	105,150,100,105,105,115,135,110,105, 100,  /* N */
 318	100,100,100,100,100,100,100,100,100, 100 }; // SPARE
 319
 320
 321static unsigned char length_mods_equal[100] = {
 322/*  a   ,   t   s   n   d   z   r   N   <- next */
 323	110,110,110,110,110,110,110,110,110, 110,  /* a  <- next2 */
 324	110,110,110,110,110,110,110,110,110, 110,  /* , */
 325	110,110,110,110,110,110,110,110,110, 110,  /* t */
 326	110,110,110,110,110,110,110,110,110, 110,  /* s */
 327	110,110,110,110,110,110,110,110,110, 110,  /* n */
 328	110,110,110,110,110,110,110,110,110, 110,  /* d */
 329	110,110,110,110,110,110,110,110,110, 110,  /* z */
 330	110,110,110,110,110,110,110,110,110, 110,  /* r */
 331	110,110,110,110,110,110,110,110,110, 110,  /* N */
 332	110,110,110,110,110,110,110,110,110, 110 }; // SPARE
 333
 334
 335unsigned char *length_mod_tabs[6] = {
 336  length_mods_en,
 337  length_mods_en,     // 1
 338  length_mods_en0,    // 2
 339  length_mods_equal,  // 3
 340  length_mods_equal,  // 4
 341  length_mods_equal   // 5
 342 };
 343
 344
 345void SetLengthMods(Translator *tr, int value)
 346{//==========================================
 347	int value2;
 348
 349	tr->langopts.length_mods0 = tr->langopts.length_mods = length_mod_tabs[value % 100];
 350	if((value2 = value / 100) != 0)
 351	{
 352		tr->langopts.length_mods0 = length_mod_tabs[value2];
 353	}
 354}
 355
 356
 357int IsAlpha(unsigned int c)
 358{//========================
 359// Replacement for iswalph() which also checks for some in-word symbols
 360
 361	if(iswalpha(c))
 362		return(1);
 363
 364	if((c >= 0x901) && (c <= 0x957))
 365		return(1);    // Devanagari  vowel signs and other signs
 366
 367	if((c >= 0xb81) && (c <= 0xbe5))
 368		return(1);    // Tamil  vowel signs and other signs
 369
 370	if((c >= 0x300) && (c <= 0x36f))
 371		return(1);   // combining accents
 372
 373	if((c >= 0x1100) && (c <= 0x11ff))
 374		return(1);  //Korean jamo
 375
 376	if((c > 0x3040) && (c <= 0xa700))
 377		return(1); // Chinese/Japanese.  Should never get here, but Mac OS 10.4's iswalpha seems to be broken, so just make sure
 378
 379	return(0);
 380}
 381
 382int IsDigit09(unsigned int c)
 383{//=========================
 384	if((c >= '0') && (c <= '9'))
 385		return(1);
 386	return(0);
 387}
 388
 389int IsDigit(unsigned int c)
 390{//========================
 391	if(iswdigit(c))
 392		return(1);
 393
 394	if((c >= 0x966) && (c <= 0x96f))
 395		return(1);
 396
 397	return(0);
 398}
 399
 400int IsSpace(unsigned int c)
 401{//========================
 402	if(c == 0)
 403		return(0);
 404	if(wcschr(chars_space,c))
 405		return(1);
 406	return(iswspace(c));
 407}
 408
 409
 410Translator::Translator()
 411{//=====================
 412	int ix;
 413	static const unsigned char stress_amps2[] = {16,16, 20,20, 20,24, 24,21 };
 414	static const short stress_lengths2[8] = {182,140, 220,220, 220,240, 260,280};
 415	static const wchar_t empty_wstring[1] = {0};
 416	static const wchar_t punct_in_word[2] = {'\'', 0};  // allow hyphen within words
 417
 418	charset_a0 = charsets[1];   // ISO-8859-1, this is for when the input is not utf8
 419	dictionary_name[0] = 0;
 420	dict_condition=0;
 421	data_dictrules = NULL;     // language_1   translation rules file
 422	data_dictlist = NULL;      // language_2   dictionary lookup file
 423
 424	transpose_offset = 0;
 425
 426	// only need lower case
 427	letter_bits_offset = 0;
 428	memset(letter_bits,0,sizeof(letter_bits));
 429	memset(letter_groups,0,sizeof(letter_groups));
 430
 431	// 0-5 sets of characters matched by A B C E F G in pronunciation rules
 432	// these may be set differently for different languages
 433	SetLetterBits(this,0,"aeiou");  // A  vowels, except y
 434	SetLetterBits(this,1,"bcdfgjklmnpqstvxz");      // B  hard consonants, excluding h,r,w
 435	SetLetterBits(this,2,"bcdfghjklmnpqrstvwxz");  // C  all consonants
 436	SetLetterBits(this,3,"hlmnr");                 // H  'soft' consonants
 437	SetLetterBits(this,4,"cfhkpqstx");             // F  voiceless consonants
 438	SetLetterBits(this,5,"bdgjlmnrvwyz");   // G voiced
 439	SetLetterBits(this,6,"eiy");   // Letter group Y, front vowels
 440	SetLetterBits(this,7,"aeiouy");  // vowels, including y
 441
 442
 443	char_plus_apostrophe = empty_wstring;
 444	punct_within_word = punct_in_word;
 445
 446	for(ix=0; ix<8; ix++)
 447	{
 448		stress_amps[ix] = stress_amps2[ix];
 449		stress_amps_r[ix] = stress_amps2[ix] - 1;
 450		stress_lengths[ix] = stress_lengths2[ix];
 451	}
 452	memset(&langopts,0,sizeof(langopts));
 453
 454	langopts.stress_rule = 2;
 455	langopts.unstressed_wd1 = 1;
 456	langopts.unstressed_wd2 = 3;
 457	langopts.param[LOPT_SONORANT_MIN] = 95;
 458	langopts.param[LOPT_MAXAMP_EOC] = 19;
 459	langopts.param[LOPT_UNPRONOUNCABLE] = 's';    // don't count this character at start of word
 460	langopts.max_initial_consonants = 3;
 461	langopts.replace_chars = NULL;
 462
 463	langopts.length_mods = length_mods_en;
 464	langopts.length_mods0 = length_mods_en0;
 465	langopts.long_stop = 100;
 466
 467	langopts.max_roman = 49;
 468	langopts.thousands_sep = ',';
 469	langopts.decimal_sep = '.';
 470
 471	memcpy(punct_to_tone,punctuation_to_tone,sizeof(punct_to_tone));
 472}
 473
 474
 475Translator::~Translator(void)
 476{//==========================
 477	if(data_dictlist != NULL)
 478		Free(data_dictlist);
 479}
 480
 481
 482int lookupwchar(const unsigned short *list,int c)
 483{//==============================================
 484// Is the character c in the list ?
 485	int ix;
 486
 487	for(ix=0; list[ix] != 0; ix++)
 488	{
 489		if(list[ix] == c)
 490			return(ix+1);
 491	}
 492	return(0);
 493}
 494
 495int IsBracket(int c)
 496{//=================
 497	if((c >= 0x2014) && (c <= 0x201f))
 498		return(1);
 499	return(lookupwchar(brackets,c));
 500}
 501
 502
 503int utf8_out(unsigned int c, char *buf)
 504{//====================================
 505// write a unicode character into a buffer as utf8
 506// returns the number of bytes written
 507	int n_bytes;
 508	int j;
 509	int shift;
 510	static char unsigned code[4] = {0,0xc0,0xe0,0xf0};
 511
 512	if(c < 0x80)
 513	{
 514		buf[0] = c;
 515		return(1);
 516	}
 517	if(c >= 0x110000)
 518	{
 519		buf[0] = ' ';      // out of range character code
 520		return(1);
 521	}
 522	if(c < 0x0800)
 523		n_bytes = 1;
 524	else
 525	if(c < 0x10000)
 526		n_bytes = 2;
 527	else
 528		n_bytes = 3;
 529
 530	shift = 6*n_bytes;
 531	buf[0] = code[n_bytes] | (c >> shift);
 532	for(j=0; j<n_bytes; j++)
 533	{
 534		shift -= 6;
 535		buf[j+1] = 0x80 + ((c >> shift) & 0x3f);
 536	}
 537	return(n_bytes+1);
 538}  // end of utf8_out
 539
 540
 541int utf8_nbytes(const char *buf)
 542{//=============================
 543// Returns the number of bytes for the first UTF-8 character in buf
 544	unsigned char c = (unsigned char)buf[0];
 545	if(c < 0x80)
 546		return(1);
 547	if(c < 0xe0)
 548		return(2);
 549	if(c < 0xf0)
 550		return(3);
 551	return(4);
 552}
 553
 554
 555int utf8_in(int *c, const char *buf, int backwards)
 556{//================================================
 557// Read a unicode characater from a UTF8 string 
 558// Returns the number of UTF8 bytes used.
 559// backwards: set if we are moving backwards through the UTF8 string
 560	int c1;
 561	int n_bytes;
 562	int ix;
 563	static const unsigned char mask[4] = {0xff,0x1f,0x0f,0x07};
 564
 565	// find the start of the next/previous character
 566	while((*buf & 0xc0) == 0x80)
 567	{
 568		// skip over non-initial bytes of a multi-byte utf8 character
 569		if(backwards)
 570			buf--;
 571		else
 572			buf++;
 573	}
 574
 575	n_bytes = 0;
 576
 577	if((c1 = *buf++) & 0x80)
 578	{
 579		if((c1 & 0xe0) == 0xc0)
 580			n_bytes = 1;
 581		else
 582		if((c1 & 0xf0) == 0xe0)
 583			n_bytes = 2;
 584		else
 585		if((c1 & 0xf8) == 0xf0)
 586			n_bytes = 3;
 587
 588		c1 &= mask[n_bytes];
 589		for(ix=0; ix<n_bytes; ix++)
 590		{
 591			c1 = (c1 << 6) + (*buf++ & 0x3f);
 592		}
 593	}
 594	*c = c1;
 595	return(n_bytes+1);
 596}
 597
 598
 599
 600
 601char *strchr_w(const char *s, int c)
 602{//=================================
 603// return NULL for any non-ascii character
 604	if(c >= 0x80)
 605		return(NULL);
 606	return(strchr((char *)s,c));    // (char *) is needed for Borland compiler
 607}
 608
 609
 610int PhonemeCode(unsigned int mnem)
 611{//===============================
 612	int ix;
 613
 614	for(ix=0; ix<n_phoneme_tab; ix++)
 615	{
 616		if(phoneme_tab[ix]->mnemonic == mnem)
 617			return(phoneme_tab[ix]->code);
 618	}
 619	return(phonSCHWA);
 620}
 621
 622
 623
 624
 625int Translator::TranslateWord(char *word1, int next_pause, WORD_TAB *wtab)
 626{//=======================================================================
 627// word1 is terminated by space (0x20) character
 628
 629	int length;
 630	int word_length;
 631	int ix;
 632	int posn;
 633	int pfix;
 634	int n_chars;
 635	unsigned int dictionary_flags[2];
 636	unsigned int dictionary_flags2[2];
 637	int end_type=0;
 638	int prefix_type=0;
 639	char *wordx;
 640	char phonemes[N_WORD_PHONEMES];
 641	char *ph_limit;
 642	char *phonemes_ptr;
 643	char prefix_phonemes[N_WORD_PHONEMES];
 644	char end_phonemes[N_WORD_PHONEMES];
 645	char word_copy[N_WORD_BYTES];
 646	char prefix_chars[N_WORD_BYTES];
 647	int found=0;
 648   int end_flags;
 649	char c_temp;   // save a character byte while we temporarily replace it with space
 650	int first_char;
 651	int last_char = 0;
 652	int unpron_length;
 653	int add_plural_suffix = 0;
 654	int prefix_flags = 0;
 655	int confirm_prefix;
 656	int spell_word;
 657	int stress_bits;
 658	int emphasize_allcaps = 0;
 659	int wflags = wtab->flags;
 660	int wmark = wtab->wmark;
 661
 662	// translate these to get pronunciations of plural 's' suffix (different forms depending on
 663	// the preceding letter
 664	static char word_zz[4] = {0,'z','z',0};
 665	static char word_iz[4] = {0,'i','z',0};
 666	static char word_ss[4] = {0,'s','s',0};
 667
 668	dictionary_flags[0] = 0;
 669	dictionary_flags[1] = 0;
 670	dictionary_flags2[0] = 0;
 671	dictionary_flags2[1] = 0;
 672	dictionary_skipwords = 0;
 673
 674	prefix_phonemes[0] = 0;
 675	end_phonemes[0] = 0;
 676	ph_limit = &phonemes[N_WORD_PHONEMES];
 677
 678	// count the length of the word
 679	wordx = word1;
 680	utf8_in(&first_char,wordx,0);
 681	word_length = 0;
 682	while((*wordx != 0) && (*wordx != ' '))
 683	{
 684		wordx += utf8_in(&last_char,wordx,0);
 685		word_length++;
 686	}
 687
 688	// try an initial lookup in the dictionary list, we may find a pronunciation specified, or
 689	// we may just find some flags
 690	spell_word = 0;
 691	if(option_sayas == SAYAS_KEY)
 692	{
 693		if(word_length == 1)
 694			spell_word = 4;
 695	}
 696
 697	if(option_sayas & 0x10)
 698	{
 699		// SAYAS_CHAR, SAYAS_GYLPH, or SAYAS_SINGLE_CHAR
 700		spell_word = option_sayas & 0xf;    // 2,3,4
 701	}
 702	else
 703	{
 704		found = LookupDictList(&word1, phonemes, dictionary_flags, FLAG_ALLOW_TEXTMODE, wtab);   // the original word
 705		if(dictionary_flags[0] & FLAG_TEXTMODE)
 706		{
 707			stress_bits = dictionary_flags[0] & 0x7f;
 708			found = LookupDictList(&word1, phonemes, dictionary_flags2, 0, wtab);   // the text replacement
 709			if(dictionary_flags2[0]!=0)
 710			{
 711				dictionary_flags[0] = dictionary_flags2[0];
 712				dictionary_flags[1] = dictionary_flags2[1];
 713				if(stress_bits != 0)
 714				{
 715					// keep any stress information from the original word
 716					dictionary_flags[0] = (dictionary_flags[0] & ~0x7f) | stress_bits;
 717				}
 718			}
 719		}
 720		else
 721		if((found==0) && (dictionary_flags[0] & FLAG_SKIPWORDS))
 722		{
 723			// grouped words, but no translation.  Join the words with hyphens.
 724			wordx = word1;
 725			ix = 0;
 726			while(ix < dictionary_skipwords)
 727			{
 728				if(*wordx == ' ')
 729				{
 730					*wordx = '-';
 731					ix++;
 732				}
 733				wordx++;
 734			}
 735		}
 736
 737		// if textmode, LookupDictList() replaces word1 by the new text and returns found=0
 738
 739		if(phonemes[0] == phonSWITCH)
 740		{
 741			// change to another language in order to translate this word
 742			strcpy(word_phonemes,phonemes);
 743			return(0);
 744		}
 745
 746if((wmark > 0) && (wmark < 8))
 747{
 748	// the stressed syllable has been specified in the text  (TESTING)
 749	dictionary_flags[0] = (dictionary_flags[0] & ~0xf) | wmark;
 750}
 751
 752		if(!found && (dictionary_flags[0] & FLAG_ABBREV))
 753		{
 754			// the word has $abbrev flag, but no pronunciation specified.  Speak as individual letters
 755			spell_word = 1;
 756		}
 757 
 758		if(!found && iswdigit(first_char))
 759		{
 760			Lookup("_0lang",word_phonemes);
 761			if(word_phonemes[0] == phonSWITCH)
 762				return(0);
 763
 764			found = TranslateNumber(word1,phonemes,dictionary_flags,wflags);
 765		}
 766
 767		if(!found & ((word_flags & FLAG_UPPERS) != FLAG_FIRST_UPPER))
 768		{
 769			// either all upper or all lower case
 770
 771			if((langopts.numbers & NUM_ROMAN) || ((langopts.numbers & NUM_ROMAN_UC) && (word_flags & FLAG_ALL_UPPER)))
 772			{
 773				if((found = TranslateRoman(word1,phonemes)) != 0)
 774					dictionary_flags[0] |= FLAG_ABBREV;   // prevent emphasis if capitals
 775			}
 776		}
 777
 778		if((wflags & FLAG_ALL_UPPER) && (word_length > 1)&& iswalpha(first_char))
 779		{
 780			if((option_tone_flags & OPTION_EMPHASIZE_ALLCAPS) && !(dictionary_flags[0] & FLAG_ABBREV))
 781			{
 782				// emphasize words which are in capitals
 783				emphasize_allcaps = FLAG_EMPHASIZED;
 784			}
 785			else
 786			if(!found && !(dictionary_flags[0] &  FLAG_SKIPWORDS) && (word_length<4) && (clause_lower_count > 3) && (clause_upper_count <= clause_lower_count))
 787			{
 788				// An upper case word in a lower case clause. This could be an abbreviation.
 789				spell_word = 1;
 790			}
 791		}
 792	}
 793
 794	if(spell_word > 0)
 795	{
 796		// Speak as individual letters
 797		wordx = word1;
 798		posn = 0;
 799		phonemes[0] = 0;
 800		end_type = 0;
 801
 802		while(*wordx != ' ')
 803		{
 804			wordx += TranslateLetter(wordx, phonemes,spell_word, word_length);
 805			posn++;
 806			if(phonemes[0] == phonSWITCH)
 807			{
 808				// change to another language in order to translate this word
 809				strcpy(word_phonemes,phonemes);
 810				if(word_length > 1)
 811					return(FLAG_SPELLWORD);  // a mixture of languages, retranslate as individual letters, separated by spaces
 812				return(0);
 813			}
 814		}
 815		SetSpellingStress(phonemes,spell_word,posn);
 816	}
 817	else
 818	if(found == 0)
 819	{
 820		// word's pronunciation is not given in the dictionary list, although
 821		// dictionary_flags may have ben set there
 822
 823		posn = 0;
 824		length = 999;
 825		wordx = word1;
 826
 827		while(((length < 3) && (length > 0))|| (word_length > 1 && Unpronouncable(wordx)))
 828		{
 829			char *p;
 830			// This word looks "unpronouncable", so speak letters individually until we
 831			// find a remainder that we can pronounce.
 832			emphasize_allcaps = 0;
 833			wordx += TranslateLetter(wordx,phonemes,0, word_length);
 834			posn++;
 835			if(phonemes[0] == phonSWITCH)
 836			{
 837				// change to another language in order to translate this word
 838				strcpy(word_phonemes,phonemes);
 839				if(strcmp(&phonemes[1],"en")==0)
 840					return(FLAG_SPELLWORD);   // _^_en must have been set in TranslateLetter(), not *_rules
 841				return(0);
 842			}
 843
 844			p = &wordx[word_length-3];    // this looks wrong.  Doesn't consider multi-byte chars.
 845			if(memcmp(p,"'s ",3) == 0)
 846			{
 847				// remove a 's suffix and pronounce this separately (not as an individual letter)
 848				add_plural_suffix = 1;
 849				p[0] = ' ';
 850				p[1] = ' ';
 851				last_char = p[-1];
 852			}
 853
 854			length=0;
 855			while(wordx[length] != ' ') length++;
 856			if(length > 0)
 857				wordx[-1] = ' ';            // prevent this affecting the pronunciation of the pronuncable part
 858		}
 859		SetSpellingStress(phonemes,0,posn);
 860
 861		// anything left ?
 862		if(*wordx != ' ')
 863		{
 864			// Translate the stem
 865			unpron_length = strlen(phonemes);
 866			end_type = TranslateRules(wordx, phonemes, N_WORD_PHONEMES, end_phonemes, wflags, dictionary_flags);
 867
 868			if(phonemes[0] == phonSWITCH)
 869			{
 870				// change to another language in order to translate this word
 871				strcpy(word_phonemes,phonemes);
 872				return(0);
 873			}
 874
 875			if((phonemes[0] == 0) && (end_phonemes[0] == 0))
 876			{
 877				int wc;
 878				// characters not recognised, speak them individually
 879
 880				utf8_in(&wc, wordx, 0);
 881				if((word_length == 1) && IsAlpha(wc))
 882				{
 883					posn = 0;
 884					while((*wordx != ' ') && (*wordx != 0))
 885					{
 886						wordx += TranslateLetter(wordx, phonemes, 4, word_length);
 887						posn++;
 888						if(phonemes[0] == phonSWITCH)
 889						{
 890							// change to another language in order to translate this word
 891							strcpy(word_phonemes,phonemes);
 892							return(0);
 893						}
 894					}
 895					SetSpellingStress(phonemes,spell_word,posn);
 896				}
 897			}
 898
 899			c_temp = wordx[-1];
 900
 901			found = 0;
 902			confirm_prefix = 1;
 903			while(end_type & SUFX_P)
 904			{
 905				// Found a standard prefix, remove it and retranslate
 906
 907				if(confirm_prefix && !(end_type & SUFX_B))
 908				{
 909					int end2;
 910					char phonemes2[N_WORD_PHONEMES];
 911					char end_phonemes2[N_WORD_PHONEMES];
 912
 913					// remove any standard suffix and confirm that the prefix is still recognised
 914					phonemes2[0] = 0;
 915					end2 = TranslateRules(wordx, phonemes2, N_WORD_PHONEMES, end_phonemes2, wflags|FLAG_NO_PREFIX|FLAG_NO_TRACE, dictionary_flags);
 916					if(end2)
 917					{
 918						RemoveEnding(wordx,end2,word_copy);
 919						end_type = TranslateRules(wordx, phonemes, N_WORD_PHONEMES, end_phonemes, wflags|FLAG_NO_TRACE, dictionary_flags);
 920						memcpy(wordx,word_copy,strlen(word_copy));
 921						if((end_type & SUFX_P) == 0)
 922						{
 923							// after removing the suffix, the prefix is no longer recognised.
 924							// Keep the suffix, but don't use the prefix
 925							end_type = end2;
 926							strcpy(phonemes,phonemes2);
 927							strcpy(end_phonemes,end_phonemes2);
 928							if(option_phonemes == 2)
 929							{
 930								DecodePhonemes(end_phonemes,end_phonemes2);
 931								fprintf(f_trans,"  suffix [%s]\n\n",end_phonemes2);
 932							}
 933						}
 934						confirm_prefix = 0;
 935						continue;
 936					}
 937				}
 938
 939				prefix_type = end_type;
 940
 941				if(prefix_type & SUFX_V)
 942				{
 943					expect_verb = 1;      // use the verb form of the word
 944				}
 945
 946				wordx[-1] = c_temp;
 947
 948				if((prefix_type & SUFX_B) == 0)
 949				{
 950					for(ix=(prefix_type & 0xf); ix>0; ix--)    // num. of characters to remove
 951					{
 952						wordx++;
 953						while((*wordx & 0xc0) == 0x80) wordx++;  // for multibyte characters
 954					}
 955				}
 956				else
 957				{
 958					pfix = 1;
 959					prefix_chars[0] = 0;
 960					n_chars = prefix_type & 0x3f;
 961
 962					for(ix=0; ix < n_chars; ix++)    // num. of bytes to remove
 963					{
 964						prefix_chars[pfix++] = *wordx++;
 965	
 966						if((prefix_type & SUFX_B) && (ix == (n_chars-1)))
 967						{
 968							prefix_chars[pfix-1] = 0;  // discard the last character of the prefix, this is the separator character
 969						}
 970					}
 971					prefix_chars[pfix] = 0;
 972				}
 973				c_temp = wordx[-1];
 974				wordx[-1] = ' ';
 975				confirm_prefix = 1;
 976
 977				if(prefix_type & SUFX_B)
 978				{
 979// SUFX_B is used for Turkish, tr_rules contains "(PbÂ?
 980					// retranslate the prefix part
 981					char *wordpf;
 982					char prefix_phonemes2[12];
 983
 984					strncpy0(prefix_phonemes2,end_phonemes,sizeof(prefix_phonemes2));
 985					wordpf = &prefix_chars[1];
 986					found = LookupDictList(&wordpf, phonemes, dictionary_flags, SUFX_P, wtab);   // without prefix
 987					if(found == 0)
 988					{
 989						end_type = TranslateRules(wordpf, phonemes, N_WORD_PHONEMES, end_phonemes, 0, dictionary_flags);
 990						sprintf(prefix_phonemes,"%s%s%s",phonemes,end_phonemes,prefix_phonemes2);
 991					}
 992					prefix_flags = 1;
 993				}
 994				else
 995				{
 996					strcat(prefix_phonemes,end_phonemes);
 997				}
 998				end_phonemes[0] = 0;
 999
1000				end_type = 0;
1001				found = LookupDictList(&wordx, phonemes, dictionary_flags2, SUFX_P, wtab);   // without prefix
1002				if(dictionary_flags[0]==0)
1003				{
1004					dictionary_flags[0] = dictionary_flags2[0];
1005					dictionary_flags[1] = dictionary_flags2[1];
1006				}
1007				else
1008					prefix_flags = 1;
1009				if(found == 0)
1010				{
1011					end_type = TranslateRules(wordx, phonemes, N_WORD_PHONEMES, end_phonemes, 0, dictionary_flags);
1012
1013					if(phonemes[0] == phonSWITCH)
1014					{
1015						// change to another language in order to translate this word
1016						wordx[-1] = c_temp;
1017						strcpy(word_phonemes,phonemes);
1018						return(0);
1019					}
1020				}
1021			}
1022
1023			if((end_type != 0) && !(end_type & SUFX_P))
1024			{
1025char phonemes2[N_WORD_PHONEMES];
1026strcpy(phonemes2,phonemes);
1027
1028				// The word has a standard ending, re-translate without this ending
1029				end_flags = RemoveEnding(wordx,end_type,word_copy);
1030
1031				phonemes_ptr = &phonemes[unpron_length];
1032				phonemes_ptr[0] = 0;
1033
1034				if(prefix_phonemes[0] != 0)
1035				{
1036					// lookup the stem without the prefix removed
1037					wordx[-1] = c_temp;
1038					found = LookupDictList(&word1, phonemes_ptr, dictionary_flags2, end_flags, wtab);  // include prefix, but not suffix
1039					wordx[-1] = ' ';
1040					if(dictionary_flags[0]==0)
1041					{
1042						dictionary_flags[0] = dictionary_flags2[0];
1043						dictionary_flags[1] = dictionary_flags2[1];
1044					}
1045					if(found)
1046						prefix_phonemes[0] = 0;  // matched whole word, don't need prefix now
1047
1048					if((found==0) && (dictionary_flags2[0] != 0))
1049						prefix_flags = 1;
1050				}
1051				if(found == 0)
1052				{
1053					found = LookupDictList(&wordx, phonemes_ptr, dictionary_flags2, end_flags, wtab);  // without prefix and suffix
1054					if(phonemes_ptr[0] == phonSWITCH)
1055					{
1056						// change to another language in order to translate this word
1057						memcpy(wordx,word_copy,strlen(word_copy));
1058						strcpy(word_phonemes,phonemes_ptr);
1059						return(0);
1060					}
1061					if(dictionary_flags[0]==0)
1062					{
1063						dictionary_flags[0] = dictionary_flags2[0];
1064						dictionary_flags[1] = dictionary_flags2[1];
1065					}
1066				}
1067				if(found == 0)
1068				{
1069					if(end_type & SUFX_Q)
1070					{
1071						// don't retranslate, use the original lookup result
1072						strcpy(phonemes,phonemes2);
1073
1074						// language specific changes
1075						ApplySpecialAttribute(phonemes,dictionary_flags[0]);
1076					}
1077					else
1078					{
1079						if(end_flags & FLAG_SUFX)
1080							TranslateRules(wordx, phonemes, N_WORD_PHONEMES, NULL,wflags | FLAG_SUFFIX_REMOVED, dictionary_flags);
1081						else
1082							TranslateRules(wordx, phonemes, N_WORD_PHONEMES, NULL,wflags,dictionary_flags);
1083
1084						if(phonemes[0] == phonSWITCH)
1085						{
1086							// change to another language in order to translate this word
1087							strcpy(word_phonemes,phonemes);
1088							memcpy(wordx,word_copy,strlen(word_copy));
1089							wordx[-1] = c_temp;
1090							return(0);
1091						}
1092					}
1093				}
1094
1095				if((end_type & SUFX_T) == 0)
1096				{
1097					// the default is to add the suffix and then determine the word's stress pattern
1098					AppendPhonemes(phonemes, N_WORD_PHONEMES, end_phonemes);
1099					end_phonemes[0] = 0;
1100				}
1101			}
1102			wordx[-1] = c_temp;
1103		}
1104	}
1105
1106	if((add_plural_suffix) || (wflags & FLAG_HAS_PLURAL))
1107	{
1108		// s or 's suffix, append [s], [z] or [Iz] depending on previous letter
1109		if(last_char == 'f')
1110			TranslateRules(&word_ss[1], phonemes, N_WORD_PHONEMES, NULL, 0, NULL);
1111		else
1112		if((last_char==0) || (strchr_w("hsx",last_char)==NULL))
1113			TranslateRules(&word_zz[1], phonemes, N_WORD_PHONEMES, NULL, 0, NULL);
1114		else
1115			TranslateRules(&word_iz[1], phonemes, N_WORD_PHONEMES, NULL, 0, NULL);
1116	}
1117
1118	wflags |= emphasize_allcaps;
1119
1120
1121	/* determine stress pattern for this word */
1122	/******************************************/
1123	/* NOTE: this also adds a single PAUSE if the previous word ended
1124				in a primary stress, and this one starts with one */
1125	if(prefix_flags || (strchr(prefix_phonemes,phonSTRESS_P)!=NULL))
1126	{
1127		if((langopts.param[LOPT_PREFIXES]) || (prefix_type & SUFX_T))
1128		{
1129			char *p;
1130			// German, keep a secondary stress on the stem
1131			SetWordStress(phonemes,dictionary_flags[0],3,0);
1132
1133			// reduce all but the first primary stress
1134			ix=0;
1135			for(p=prefix_phonemes; *p != 0; p++)
1136			{
1137				if(*p == phonSTRESS_P)
1138				{
1139					if(ix==0)
1140						ix=1;
1141					else
1142						*p = phonSTRESS_3;
1143				}
1144			}
1145			strcpy(word_phonemes,prefix_phonemes);
1146			strcat(word_phonemes,phonemes);
1147			SetWordStress(word_phonemes,dictionary_flags[0],-1,0);
1148		}
1149		else
1150		{
1151			// stress position affects the whole word, including prefix
1152			strcpy(word_phonemes,prefix_phonemes);
1153			strcat(word_phonemes,phonemes);
1154			SetWordStress(word_phonemes,dictionary_flags[0],-1,prev_last_stress);
1155		}
1156	}
1157	else
1158	{
1159		if(prefix_phonemes[0] == 0)
1160			SetWordStress(phonemes,dictionary_flags[0],-1,prev_last_stress);
1161		else
1162			SetWordStress(phonemes,dictionary_flags[0],-1,0);
1163		strcpy(word_phonemes,prefix_phonemes);
1164		strcat(word_phonemes,phonemes);
1165	}
1166
1167	if(end_phonemes[0] != 0)
1168	{
1169		// a suffix had the SUFX_T option set, add the suffix after the stress pattern has been determined
1170		strcat(word_phonemes,end_phonemes);
1171	}
1172
1173	if(wflags & FLAG_LAST_WORD)
1174	{
1175		// don't use $brk pause before the last word of a sentence
1176		// (but allow it for emphasis, see below
1177		dictionary_flags[0] &= ~FLAG_PAUSE1;
1178	}
1179
1180	if(wflags & FLAG_EMPHASIZED)
1181	{
1182		// A word is indicated in the source text as stressed
1183		// Give it stress level 6 (for the intonation module)
1184		ChangeWordStress(this,word_phonemes,6);
1185
1186//		if(!(wflags & FLAG_LAST_WORD))     // ?? omit pre-pause if it's the last word in the sentence?
1187			dictionary_flags[0] |= FLAG_PAUSE1;   // precede by short pause
1188	}
1189	else
1190	if(wtab[dictionary_skipwords].flags & FLAG_LAST_WORD)
1191	{
1192		// the word has attribute to stress or unstress when at end of clause
1193		if(dictionary_flags[0] & (FLAG_STRESS_END | FLAG_STRESS_END2))
1194			ChangeWordStress(this,word_phonemes,4);
1195		else
1196		if(dictionary_flags[0] & FLAG_UNSTRESS_END)
1197			ChangeWordStress(this,word_phonemes,3);
1198	}
1199
1200	// dictionary flags for this word give a clue about which alternative pronunciations of
1201	// following words to use.
1202	if(end_type & SUFX_F)
1203	{
1204		// expect a verb form, with or without -s suffix
1205		expect_verb = 2;
1206		expect_verb_s = 2;
1207	}
1208
1209	if(dictionary_flags[1] & FLAG_PASTF)
1210	{
1211		/* expect perfect tense in next two words */
1212		expect_past = 3;
1213		expect_verb = 0;
1214		expect_noun = 0;
1215	}
1216	else
1217	if(dictionary_flags[1] & FLAG_VERBF)
1218	{
1219		/* expect a verb in the next word */
1220		expect_verb = 2;
1221		expect_verb_s = 0;   /* verb won't have -s suffix */
1222		expect_noun = 0;
1223	}
1224	else
1225	if(dictionary_flags[1] & FLAG_VERBSF)
1226	{
1227		// expect a verb, must have a -s suffix
1228		expect_verb = 0;
1229		expect_verb_s = 2;
1230		expect_past = 0;
1231		expect_noun = 0;
1232	}
1233	else
1234	if(dictionary_flags[1] & FLAG_NOUNF)
1235	{
1236		/* not expecting a verb next */
1237		expect_noun = 3;
1238		expect_verb = 0;
1239		expect_verb_s = 0;
1240		expect_past = 0;
1241	}
1242
1243	if((wordx[0] != 0) && (!(dictionary_flags[1] & FLAG_VERB_EXT)))
1244	{
1245		if(expect_verb > 0)
1246			expect_verb--;
1247
1248		if(expect_verb_s > 0)
1249			expect_verb_s--;
1250
1251		if(expect_noun >0)
1252			expect_noun--;
1253
1254		if(expect_past > 0)
1255			expect_past--;
1256	}
1257
1258	if((word_length == 1) && iswalpha(first_char) && (first_char != 'i'))
1259	{
1260// English Specific !!!!
1261		// any single letter before a dot is an abbreviation, except 'I'
1262		dictionary_flags[0] |= FLAG_DOT;
1263	}
1264
1265	return(dictionary_flags[0]);
1266}  //  end of TranslateWord
1267
1268
1269
1270static void SetPlist2(PHONEME_LIST2 *p, unsigned char phcode)
1271{//==========================================================
1272	p->phcode = phcode;
1273	p->stress = 0;
1274	p->tone_number = 0;
1275	p->synthflags = 0;
1276	p->sourceix = 0;
1277}
1278
1279static int CountSyllables(unsigned char *phonemes)
1280{//===============================================
1281	int count = 0;
1282	int phon;
1283	while((phon = *phonemes++) != 0)
1284	{
1285		if(phoneme_tab[phon]->type == phVOWEL)
1286			count++;
1287	}
1288	return(count);
1289}
1290
1291
1292int SetTranslator2(const char *new_language)
1293{//=========================================
1294// Set translator2 to a second language
1295	int new_phoneme_tab;
1296
1297	if((new_phoneme_tab = SelectPhonemeTableName(new_language)) >= 0)
1298	{
1299		if((translator2 != NULL) && (strcmp(new_language,translator2_language) != 0))
1300		{
1301			// we already have an alternative translator, but not for the required language, delete it
1302			delete translator2;
1303			translator2 = NULL;
1304		}
1305
1306		if(translator2 == NULL)
1307		{
1308			translator2 = SelectTranslator(new_language);
1309			strcpy(translator2_language,new_language);
1310
1311			if(translator2->LoadDictionary(new_language,0) != 0)
1312			{
1313				SelectPhonemeTable(voice->phoneme_tab_ix);  // revert to original phoneme table
1314				new_phoneme_tab = -1;
1315				translator2_language[0] = 0;
1316			}
1317		}
1318	}
1319	return(new_phoneme_tab);
1320}  // end of SetTranslator2
1321
1322
1323
1324int Translator::TranslateWord2(char *word, WORD_TAB *wtab, int pre_pause, int next_pause)
1325{//======================================================================================
1326	int flags=0;
1327	int stress;
1328	int next_stress;
1329	int next_tone=0;
1330	unsigned char *p;
1331	int srcix;
1332	int embedded_flag=0;
1333	int embedded_cmd;
1334	int value;
1335	int found_dict_flag;
1336	unsigned char ph_code;
1337	PHONEME_LIST2 *plist2;
1338	PHONEME_TAB *ph;
1339	int max_stress;
1340	int max_stress_ix=0;
1341	int prev_vowel = -1;
1342	int pitch_raised = 0;
1343	int switch_phonemes = -1;
1344	int first_phoneme = 1;
1345	int source_ix;
1346	int len;
1347	int ix;
1348	int sylimit;        // max. number of syllables in a word to be combined with a preceding preposition
1349	const char *new_language;
1350	unsigned char bad_phoneme[4];
1351	int word_copy_len;
1352	char word_copy[N_WORD_BYTES+1];
1353
1354	len = wtab->length;
1355	if(len > 31) len = 31;
1356	source_ix = (wtab->sourceix & 0x7ff) | (len << 11); // bits 0-10 sourceix, bits 11-15 word length
1357
1358	word_flags = wtab[0].flags;
1359	if(word_flags & FLAG_EMBEDDED)
1360	{
1361		embedded_flag = SFLAG_EMBEDDED;
1362
1363		do
1364		{
1365			embedded_cmd = embedded_list[embedded_read++];
1366			value = embedded_cmd >> 8;
1367
1368			switch(embedded_cmd & 0x1f)
1369			{
1370			case EMBED_Y:
1371				option_sayas = value;
1372				break;
1373	
1374			case EMBED_F:
1375				option_emphasis = value;
1376				break;
1377	
1378			case EMBED_B:
1379				// break command
1380				if(value == 0)
1381					pre_pause = 0;  // break=none
1382				else
1383					pre_pause += value;
1384				break;
1385			}
1386		} while((embedded_cmd & 0x80) == 0);
1387	}
1388
1389	if(word[0] == 0)
1390	{
1391		// nothing to translate
1392		word_phonemes[0] = 0;
1393		return(0);
1394	}
1395
1396	// after a $pause word attribute, ignore a $pause attribute on the next two words
1397	if(prepause_timeout > 0)
1398		prepause_timeout--;
1399
1400	if((option_sayas & 0xf0) == 0x10)
1401	{
1402		if(!(word_flags & FLAG_FIRST_WORD))
1403		{
1404			// SAYAS_CHARS, SAYAS_GLYPHS, or SAYAS_SINGLECHARS.  Pause between each word.
1405			pre_pause += 4;
1406		}
1407	}
1408
1409	if(word_flags & FLAG_FIRST_UPPER)
1410	{
1411		if((option_capitals > 2) && (embedded_ix < N_EMBEDDED_LIST-6))
1412		{
1413			// indicate capital letter by raising pitch
1414			if(embedded_flag)
1415				embedded_list[embedded_ix-1] &= ~0x80;   // already embedded command before this word, remove terminator
1416			if((pitch_raised = option_capitals) == 3)
1417				pitch_raised = 20;  // default pitch raise for capitals
1418			embedded_list[embedded_ix++] = EMBED_P+0x40+0x80 + (pitch_raised << 8);  // raise pitch
1419			embedded_flag = SFLAG_EMBEDDED;
1420		}
1421	}
1422
1423	p = (unsigned char *)translator->word_phonemes;
1424	if(word_flags & FLAG_PHONEMES)
1425	{
1426		// The input is in phoneme mnemonics, not language text
1427		int c1;
1428		char lang_name[12];
1429
1430		if(memcmp(word,"_^_",3)==0)
1431		{
1432			// switch languages
1433			word+=3;
1434			for(ix=0;;)
1435			{
1436				c1 = *word++;
1437				if((c1==' ') || (c1==0))
1438					break;
1439				lang_name[ix++] = tolower(c1);
1440			}
1441			lang_name[ix] = 0;
1442
1443			if((ix = LookupPhonemeTable(lang_name)) > 0)
1444			{
1445				SelectPhonemeTable(ix);
1446				word_phonemes[0] = phonSWITCH;
1447				word_phonemes[1] = ix;
1448				word_phonemes[2] = 0;
1449			}
1450		}
1451		else
1452		{
1453			EncodePhonemes(word,word_phonemes,bad_phoneme);
1454		}
1455		flags = FLAG_FOUND;
1456	}
1457	else
1458	{
1459		int c2;
1460		ix = 0;
1461		while(((c2 = word_copy[ix] = word[ix]) != ' ') && (c2 != 0) && (ix < N_WORD_BYTES)) ix++;
1462		word_copy_len = ix;
1463
1464		flags = translator->TranslateWord(word, next_pause, wtab);
1465
1466		if(flags & FLAG_SPELLWORD)
1467		{
1468			// re-translate the word as individual letters, separated by spaces
1469			memcpy(word, word_copy, word_copy_len);
1470			return(flags);
1471		}
1472
1473		if((flags & FLAG_ALT2_TRANS) && ((sylimit = langopts.param[LOPT_COMBINE_WORDS]) > 0))
1474		{
1475			char *p2;
1476			int ok = 1;
1477			int flags2;
1478			int c_word2;
1479			char ph_buf[N_WORD_PHONEMES];
1480
1481			// LANG=cs,sk
1482			// combine a preposition with the following word
1483			p2 = word;
1484			while(*p2 != ' ') p2++;
1485
1486			utf8_in(&c_word2, p2+1, 0);   // first character of the next word;
1487			if(!iswalpha(c_word2))
1488			{
1489				ok =0;
1490			}
1491
1492			if(ok != 0)
1493			{
1494				if(sylimit & 0x100)
1495				{
1496					// only if the second word has $alt attribute
1497					strcpy(ph_buf,word_phonemes);
1498					flags2 = translator->TranslateWord(p2+1, 0, wtab+1);
1499					if((flags2 & FLAG_ALT_TRANS) == 0)
1500					{
1501						ok = 0;
1502						strcpy(word_phonemes,ph_buf);
1503					}
1504				}
1505	
1506				if((sylimit & 0x200) && ((wtab+1)->flags & FLAG_LAST_WORD))
1507				{
1508					// not if the next word is end-of-sentence
1509					ok = 0;
1510				}
1511			}
1512
1513			if(ok)
1514			{
1515				*p2 = '-'; // replace next space by hyphen
1516				flags = translator->TranslateWord(word, next_pause, wtab);  // translate the combined word
1517				if(CountSyllables(p) > (sylimit & 0xf))
1518				{
1519					// revert to separate words
1520					*p2 = ' ';
1521					flags = translator->TranslateWord(word, next_pause, wtab);
1522				}
1523				else
1524				{
1525					flags |= FLAG_SKIPWORDS;
1526					dictionary_skipwords = 1;
1527				}
1528			}
1529		}
1530
1531		if(p[0] == phonSWITCH)
1532		{
1533			// this word uses a different language
1534			memcpy(word, word_copy, word_copy_len);
1535
1536			new_language = (char *)(&p[1]);
1537			if(new_language[0]==0)
1538				new_language = "en";
1539
1540			switch_phonemes = SetTranslator2(new_language);
1541
1542			if(switch_phonemes >= 0)
1543			{
1544				// re-translate the word using the new translator
1545				flags = translator2->TranslateWord(word, next_pause, wtab);
1546				strcpy((char *)p,translator2->word_phonemes);
1547				if(p[0] == phonSWITCH)
1548				{
1549					// the second translator doesn't want to process this word
1550					switch_phonemes = -1;
1551				}
1552			}
1553			if(switch_phonemes < 0)
1554			{
1555				// language code is not recognised or 2nd translator won't translate it
1556				p[0] = phonSCHWA;  // just say something
1557				p[1] = phonSCHWA;
1558				p[2] = 0;
1559			}
1560		}
1561
1562		if(!(word_flags & FLAG_HYPHEN))
1563		{
1564			if(flags & FLAG_PAUSE1)
1565			{
1566				if(pre_pause < 1)
1567					pre_pause = 1;
1568			}
1569			if((flags & FLAG_PREPAUSE) && (prepause_timeout == 0))
1570			{
1571				// the word is marked in the dictionary list with $pause
1572				if(pre_pause < 4) pre_pause = 4;
1573				prepause_timeout = 3;
1574			}
1575		}
1576
1577		if((option_emphasis >= 3) && (pre_pause < 1))
1578			pre_pause = 1;
1579	}
1580
1581	plist2 = &ph_list2[n_ph_list2];
1582	stress = 0;
1583	next_stress = 0;
1584	srcix = 0;
1585	max_stress = -1;
1586
1587	found_dict_flag = 0;
1588	if(flags & FLAG_FOUND)
1589		found_dict_flag = SFLAG_DICTIONARY;
1590
1591	while((pre_pause > 0) && (n_ph_list2 < N_PHONEME_LIST-4))
1592	{
1593		// add pause phonemes here. Either because of punctuation (brackets or quotes) in the
1594		// text, or because the word is marked in the dictionary lookup as a conjunction
1595		if(pre_pause > 1)
1596		{
1597			SetPlist2(&ph_list2[n_ph_list2++],phonPAUSE);
1598			pre_pause -= 2;
1599		}
1600		else
1601		{
1602			SetPlist2(&ph_list2[n_ph_list2++],phonPAUSE_NOLINK);
1603			pre_pause--;
1604		}
1605		end_stressed_vowel = 0;   // forget about the previous word
1606		prev_dict_flags = 0;
1607	}
1608
1609	if((option_capitals==1) && (word_flags & FLAG_FIRST_UPPER))
1610	{
1611		SetPlist2(&ph_list2[n_ph_list2++],phonPAUSE_SHORT);
1612		SetPlist2(&ph_list2[n_ph_list2++],phonCAPITAL);
1613		if((word_flags & FLAG_ALL_UPPER) && IsAlpha(word[1]))
1614		{
1615			// word > 1 letter and all capitals
1616			SetPlist2(&ph_list2[n_ph_list2++],phonPAUSE_SHORT);
1617			SetPlist2(&ph_list2[n_ph_list2++],phonCAPITAL);
1618		}
1619	}
1620
1621	if(switch_phonemes >= 0)
1622	{
1623		// this word uses a different phoneme table
1624		SetPlist2(&ph_list2[n_ph_list2],phonSWITCH);
1625		ph_list2[n_ph_list2++].tone_number = switch_phonemes;  // temporary phoneme table number
1626	}
1627
1628	// remove initial pause from a word if it follows a hyphen
1629	if((word_flags & FLAG_HYPHEN) && (phoneme_tab[*p]->type == phPAUSE))
1630		p++;
1631
1632	while(((ph_code = *p++) != 0) && (n_ph_list2 < N_PHONEME_LIST-4))
1633	{
1634		if(ph_code == 255)
1635			continue;      // unknown phoneme
1636
1637		// Add the phonemes to the first stage phoneme list (ph_list2)
1638		ph = phoneme_tab[ph_code];
1639
1640		if(ph_code == phonSWITCH)
1641		{
1642			ph_list2[n_ph_list2].phcode = ph_code;
1643			ph_list2[n_ph_list2].sourceix = 0;
1644			ph_list2[n_ph_list2].synthflags = embedded_flag;
1645			ph_list2[n_ph_list2++].tone_number = *p++;
1646		}
1647		else
1648		if(ph->type == phSTRESS)
1649		{
1650			// don't add stress phonemes codes to the list, but give their stress
1651			// value to the next vowel phoneme
1652			// std_length is used to hold stress number or (if >10) a tone number for a tone language
1653			if(ph->spect == 0)
1654				next_stress = ph->std_length;
1655			else
1656			{
1657				// for tone languages, the tone number for a syllable follows the vowel
1658				if(prev_vowel >= 0)
1659				{
1660					ph_list2[prev_vowel].tone_number = ph_code;
1661				}
1662				else
1663				{
1664					next_tone = ph_code;       // no previous v…

Large files are truncated, but you can click here to view the full file