PageRenderTime 146ms CodeModel.GetById 15ms app.highlight 116ms RepoModel.GetById 1ms app.codeStats 1ms

/native/external/espeak/src/dictionary.cpp

http://eyes-free.googlecode.com/
C++ | 2754 lines | 2122 code | 384 blank | 248 comment | 612 complexity | d1049eaa6bbd8c21c88e0cbfb5da8e43 MD5 | raw file

Large files are truncated, but you can click here to view the full file

   1/***************************************************************************
   2 *   Copyright (C) 2005 to 2007 by Jonathan Duddington                     *
   3 *   email: jonsd@users.sourceforge.net                                    *
   4 *                                                                         *
   5 *   This program is free software; you can redistribute it and/or modify  *
   6 *   it under the terms of the GNU General Public License as published by  *
   7 *   the Free Software Foundation; either version 3 of the License, or     *
   8 *   (at your option) any later version.                                   *
   9 *                                                                         *
  10 *   This program is distributed in the hope that it will be useful,       *
  11 *   but WITHOUT ANY WARRANTY; without even the implied warranty of        *
  12 *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the         *
  13 *   GNU General Public License for more details.                          *
  14 *                                                                         *
  15 *   You should have received a copy of the GNU General Public License     *
  16 *   along with this program; if not, write see:                           *
  17 *               <http://www.gnu.org/licenses/>.                           *
  18 ***************************************************************************/
  19
  20#include "StdAfx.h"
  21
  22#define LOG_TRANSLATE
  23 
  24#include <stdio.h>
  25#include <ctype.h>
  26#include <stdlib.h>
  27#include <string.h>
  28
  29#include <wctype.h>
  30////#include <wchar.h>
  31
  32#include "speak_lib.h"
  33#include "speech.h"
  34#include "phoneme.h"
  35#include "synthesize.h"
  36#include "translate.h"
  37
  38
// File-scope state shared with other translation units.
int dictionary_skipwords;
// Name of the most recently requested dictionary; written by Translator::LoadDictionary().
char dictionary_name[40];

extern MNEM_TAB mnem_flags[];
extern PHONEME_TAB_LIST phoneme_tab_list[N_PHONEME_TABS];

// accented characters which indicate (in some languages) the start of a separate syllable
// The values are the code points 0xe4, 0xeb, 0xef, 0xf6, 0xfc, 0xff
// (a, e, i, o, u, y with diaeresis), terminated by 0.
static const unsigned short diereses_list[7] = {0xe4,0xeb,0xef,0xf6,0xfc,0xff,0};

// convert characters to an approximate 7 bit ascii equivalent
// used for checking for vowels
// Indexed by (character code - 0xc0); Translator::IsLetter() uses it for codes 0xc0 to 0x241.
// The trailing comment on each row gives the character code of the row's first entry.
// NOTE(review): entries of 0 presumably mark characters with no letter equivalent - confirm.
static unsigned char remove_accent[] = {
'a','a','a','a','a','a','a','c','e','e','e','e','i','i','i','i',  // 0c0
'd','n','o','o','o','o','o', 0, 'o','u','u','u','u','y','t','s',  // 0d0
'a','a','a','a','a','a','a','c','e','e','e','e','i','i','i','i',  // 0e0
'd','n','o','o','o','o','o', 0 ,'o','u','u','u','u','y','t','y',  // 0f0

'a','a','a','a','a','a','c','c','c','c','c','c','c','c','d','d',  // 100
'd','d','e','e','e','e','e','e','e','e','e','e','g','g','g','g',  // 110
'g','g','g','g','h','h','h','h','i','i','i','i','i','i','i','i',  // 120
'i','i','i','i','j','j','k','k','k','l','l','l','l','l','l','l',  // 130
'l','l','l','n','n','n','n','n','n','n','n','n','o','o','o','o',  // 140
'o','o','o','o','r','r','r','r','r','r','s','s','s','s','s','s',  // 150
's','s','t','t','t','t','t','t','u','u','u','u','u','u','u','u',  // 160
'u','u','u','u','w','w','y','y','y','z','z','z','z','z','z','s',  // 170
'b','b','b','b', 0,  0, 'o','c','c','d','d','d','d','d','e','e',  // 180
'e','f','f','g','g','h','i','i','k','k','l','l','m','n','n','o',  // 190
'o','o','o','o','p','p','y', 0,  0, 's','s','t','t','t','t','u',  // 1a0
'u','u','v','y','y','z','z','z','z','z','z','z', 0,  0,  0, 'w',  // 1b0
't','t','t','k','d','d','d','l','l','l','n','n','n','a','a','i',  // 1c0
'i','o','o','u','u','u','u','u','u','u','u','u','u','e','a','a',  // 1d0
'a','a','a','a','g','g','g','g','k','k','o','o','o','o','z','z',  // 1e0
'j','d','d','d','g','g','w','w','n','n','a','a','a','a','o','o',  // 1f0

'a','a','a','a','e','e','e','e','i','i','i','i','o','o','o','o',  // 200
'r','r','r','r','u','u','u','u','s','s','t','t','y','y','h','h',  // 210
'n','d','o','o','z','z','a','a','e','e','o','o','o','o','o','o',  // 220
'o','o','y','y','l','n','t','j','d','q','a','c','c','l','t','s',  // 230
'z', 0 };
  79
  80
  81
  82
  83void strncpy0(char *to,const char *from, int size)
  84{//===============================================
  85	// strcpy with limit, ensures a zero terminator
  86	strncpy(to,from,size);
  87	to[size-1] = 0;
  88}
  89
  90
  91static int reverse_word_bytes(int word)
  92{//=============================
  93	// reverse the order of bytes from little-endian to big-endian
  94#ifdef ARCH_BIG
  95	int ix;
  96	int word2 = 0;
  97
  98	for(ix=0; ix<=24; ix+=8)
  99	{
 100		word2 = word2 << 8;
 101		word2 |= (word >> ix) & 0xff;
 102	}
 103	return(word2);
 104#else
 105	return(word);
 106#endif
 107}
 108
 109
 110int LookupMnem(MNEM_TAB *table, char *string)
 111{//==========================================
 112	while(table->mnem != NULL)
 113	{
 114		if(strcmp(string,table->mnem)==0)
 115			return(table->value);
 116		table++;
 117	}
 118	return(table->value);
 119}
 120
 121const char *LookupMnem(MNEM_TAB *table, int value)
 122{//===============================================
 123	while(table->mnem != NULL)
 124	{
 125		if(table->value == value)
 126			return(table->mnem);
 127		table++;
 128	}
 129	return("");
 130}
 131
 132
 133
 134//=============================================================================================
 135//   Read pronunciation rules and pronunciation lookup dictionary
 136//
 137//=============================================================================================
 138
 139
 140
 141int Translator::LoadDictionary(const char *name, int no_error)
 142{//===========================================================
 143	int hash;
 144	char *p;
 145	int *pw;
 146	int length;
 147	FILE *f;
 148	unsigned int size;
 149	char fname[sizeof(path_home)+20];
 150
 151	strcpy(dictionary_name,name);   // currently loaded dictionary name
 152
 153	if(no_error)   // don't load dictionary, just set the dictionary_name
 154		return(1);
 155
 156	// Load a pronunciation data file into memory
 157	// bytes 0-3:  offset to rules data
 158	// bytes 4-7:  number of hash table entries
 159	sprintf(fname,"%s%c%s_dict",path_home,PATHSEP,name);
 160	size = GetFileLength(fname);
 161
 162	f = fopen(fname,"rb");
 163	if((f == NULL) || (size <= 0))
 164	{
 165		if(no_error == 0)
 166		{
 167			fprintf(stderr,"Can't read dictionary file: '%s'\n",fname);
 168		}
 169		return(1);
 170	}
 171
 172	if(data_dictlist != NULL)
 173		Free(data_dictlist);
 174
 175	data_dictlist = Alloc(size);
 176	fread(data_dictlist,size,1,f);
 177	fclose(f);
 178
 179
 180	pw = (int *)data_dictlist;
 181	length = reverse_word_bytes(pw[1]);
 182
 183	if(size <= (N_HASH_DICT + sizeof(int)*2))
 184	{
 185		fprintf(stderr,"Empty _dict file: '%s\n",fname);
 186		return(2);
 187	}
 188
 189	if((reverse_word_bytes(pw[0]) != N_HASH_DICT) ||
 190	   (length <= 0) || (length > 0x8000000))
 191	{
 192		fprintf(stderr,"Bad data: '%s' (%x length=%x)\n",fname,reverse_word_bytes(pw[0]),length);
 193		return(2);
 194	}
 195	data_dictrules = &data_dictlist[length];
 196
 197	// set up indices into data_dictrules
 198	InitGroups();
 199	if(groups1[0] == NULL)
 200	{
 201		fprintf(stderr,"Error in %s_rules, no default rule group\n",name);
 202	}
 203
 204	// set up hash table for data_dictlist
 205	p = &data_dictlist[8];
 206
 207	for(hash=0; hash<N_HASH_DICT; hash++)
 208	{
 209		dict_hashtab[hash] = p;
 210		while((length = *p) != 0)
 211		{
 212			p += length;
 213		}
 214		p++;   // skip over the zero which terminates the list for this hash value
 215	}
 216
 217	return(0);
 218}  //  end of LoadDictionary
 219
 220
 221void Translator::InitGroups(void)
 222{//==============================
 223/* Called after dictionary 1 is loaded, to set up table of entry points for translation rule chains
 224	for single-letters and two-letter combinations
 225*/
 226
 227	int  ix;
 228	char *p;
 229	char *p_name;
 230	unsigned int *pw;
 231	unsigned char c, c2;
 232	int len;
 233
 234	n_groups2 = 0;
 235	for(ix=0; ix<256; ix++)
 236	{
 237		groups1[ix]=NULL;
 238		groups2_count[ix]=0;
 239		groups2_start[ix]=255;  // indicates "not set"
 240	}
 241	memset(letterGroups,0,sizeof(letterGroups));
 242
 243	p = data_dictrules;
 244	while(*p != 0)
 245	{
 246		if(*p != RULE_GROUP_START)
 247		{
 248			fprintf(stderr,"Bad rules data in '%s_dict' at 0x%x\n",dictionary_name,(unsigned int)(p-data_dictrules));
 249			break;
 250		}
 251		p++;
 252
 253		if(p[0] == RULE_REPLACEMENTS)
 254		{
 255			pw = (unsigned int *)(((long)p+4) & ~3);  // advance to next word boundary
 256			langopts.replace_chars = pw;
 257			while(pw[0] != 0)
 258			{
 259				pw += 2;   // find the end of the replacement list, each entry is 2 words.
 260			}
 261			p = (char *)(pw+1);
 262
 263#ifdef ARCH_BIG
 264			pw = (unsigned int *)langopts.replace_chars;
 265			while(*pw != 0)
 266			{
 267				*pw = reverse_word_bytes(*pw);
 268				pw++;
 269				*pw = reverse_word_bytes(*pw);
 270				pw++;
 271			}
 272#endif
 273			continue;
 274		}
 275
 276		if(p[0] == RULE_LETTERGP2)
 277		{
 278			ix = p[1] - 'A';
 279			p += 2;
 280			if((ix >= 0) && (ix < N_LETTER_GROUPS))
 281			{
 282				letterGroups[ix] = p;
 283			}
 284		}
 285		else
 286		{
 287			len = strlen(p);
 288			p_name = p;
 289			c = p_name[0];
 290			
 291			p += (len+1);
 292			if(len == 1)
 293			{
 294				groups1[c] = p;
 295			}
 296			else
 297			if(len == 0)
 298			{
 299				groups1[0] = p;
 300			}
 301			else
 302			{
 303				if(groups2_start[c] == 255)
 304					groups2_start[c] = n_groups2;
 305	
 306				groups2_count[c]++;
 307				groups2[n_groups2] = p;
 308				c2 = p_name[1];
 309				groups2_name[n_groups2++] = (c + (c2 << 8));
 310			}
 311		}
 312
 313		// skip over all the rules in this group
 314		while(*p != RULE_GROUP_END)
 315		{
 316			p += (strlen(p) + 1);
 317		}
 318		p++;
 319	}
 320
 321}  //  end of InitGroups
 322
 323
 324int HashDictionary(const char *string)
 325//====================================
 326/* Generate a hash code from the specified string
 327	This is used to access the dictionary_2 word-lookup dictionary
 328*/
 329{
 330   int  c;
 331	int  chars=0;
 332   int  hash=0;
 333
 334   while((c = (*string++ & 0xff)) != 0)
 335   {
 336      hash = hash * 8 + c;
 337      hash = (hash & 0x3ff) ^ (hash >> 8);    /* exclusive or */
 338		chars++;
 339   }
 340
 341   return((hash+chars) & 0x3ff);  // a 10 bit hash code
 342}   //  end of HashDictionary
 343
 344
 345
 346//=============================================================================================
 347//   Translate between internal representation of phonemes and a mnemonic form for display
 348//
 349//=============================================================================================
 350
 351
 352
 353char *EncodePhonemes(char *p, char *outptr, unsigned char *bad_phoneme)
 354/*********************************************************************/
 355/* Translate a phoneme string from ascii mnemonics to internal phoneme numbers,
 356   from 'p' up to next blank .
 357   Returns advanced 'p'
 358   outptr contains encoded phonemes, unrecognised phonemes are encoded as 255
 359   bad_phoneme must point to char array of length 2 of more
 360*/
 361{
 362	int ix;
 363	unsigned char  c;
 364	int  count;    /* num. of matching characters */
 365	int  max;      /* highest num. of matching found so far */
 366	int  max_ph;   /* corresponding phoneme with highest matching */
 367	int  consumed;
 368	unsigned int  mnemonic_word;
 369
 370	bad_phoneme[0] = 0;
 371
 372	// skip initial blanks
 373	while(isspace(*p))
 374	{
 375		p++;
 376	}
 377
 378	while(((c = *p) != 0) && !isspace(c))
 379	{
 380		consumed = 0;
 381
 382		switch(c)
 383		{
 384		case '|':
 385			// used to separate phoneme mnemonics if needed, to prevent characters being treated
 386			// as a multi-letter mnemonic
 387
 388			if((c = p[1]) == '|')
 389			{
 390				// treat double || as a word-break symbol, drop through
 391            // to the default case with c = '|'
 392			}
 393			else
 394			{
 395				p++;
 396				break;
 397			}
 398
 399		default:
 400			// lookup the phoneme mnemonic, find the phoneme with the highest number of
 401			// matching characters
 402			max= -1;
 403			max_ph= 0;
 404
 405			for(ix=1; ix<n_phoneme_tab; ix++)
 406			{
 407				if(phoneme_tab[ix] == NULL)
 408					continue;
 409				if(phoneme_tab[ix]->type == phINVALID)
 410					continue;       // this phoneme is not defined for this language
 411
 412				count = 0;
 413				mnemonic_word = phoneme_tab[ix]->mnemonic;
 414
 415				while(((c = p[count]) > ' ') && (count < 4) &&
 416										(c == ((mnemonic_word >> (count*8)) & 0xff)))
 417					count++;
 418
 419				if((count > max) &&
 420					((count == 4) || (((mnemonic_word >> (count*8)) & 0xff)==0)))
 421				{
 422					max = count;
 423					max_ph = phoneme_tab[ix]->code;
 424				}
 425			}
 426
 427			if(max_ph == 0)
 428			{
 429				max_ph = 255;   /* not recognised */
 430				bad_phoneme[0] = *p;
 431				bad_phoneme[1] = 0;
 432			}
 433
 434			if(max <= 0)
 435				max = 1;
 436			p += (consumed + max);
 437			*outptr++ = (char)(max_ph);
 438
 439			if(max_ph == phonSWITCH)
 440			{
 441				// Switch Language: this phoneme is followed by a text string
 442				char *p_lang = outptr;
 443				while(!isspace(c = *p) && (c != 0))
 444				{
 445					p++;
 446					*outptr++ = tolower(c);
 447				}
 448				*outptr = 0;
 449				if(c == 0)
 450				{
 451					if(strcmp(p_lang,"en")==0)
 452					{
 453						*p_lang = 0;   // don't need "en", it's assumed by default
 454						return(p);
 455					}
 456				}
 457				else
 458				{
 459					*outptr++ = '|';  // more phonemes follow, terminate language string with separator
 460				}
 461			}
 462			break;
 463		}
 464	}
 465	/* terminate the encoded string */
 466	*outptr = 0;
 467	return(p);
 468}   // end of EncodePhonemes
 469
 470
 471
 472void DecodePhonemes(const char *inptr, char *outptr)
 473//==================================================
 474// Translate from internal phoneme codes into phoneme mnemonics
 475{
 476	unsigned char phcode;
 477	unsigned char c;
 478	unsigned int  mnem;
 479	PHONEME_TAB *ph;
 480	static const char *stress_chars = "==,,'*  ";
 481
 482	while((phcode = *inptr++) > 0)
 483	{
 484		if(phcode == 255)
 485			continue;     /* indicates unrecognised phoneme */
 486		if((ph = phoneme_tab[phcode]) == NULL)
 487			continue;
 488	
 489		if((ph->type == phSTRESS) && (ph->std_length <= 4) && (ph->spect == 0))
 490		{
 491			if(ph->std_length > 1)
 492				*outptr++ = stress_chars[ph->std_length];
 493		}
 494		else
 495		{
 496			mnem = ph->mnemonic;
 497
 498			while((c = (mnem & 0xff)) != 0)	
 499			{
 500				*outptr++ = c;
 501				mnem = mnem >> 8;
 502			}
 503			if(phcode == phonSWITCH)
 504			{
 505				while(isalpha(*inptr))
 506				{
 507					*outptr++ = *inptr++;
 508				}
 509			}
 510		}
 511	}
 512	*outptr = 0;    /* string terminator */
 513}   //  end of DecodePhonemes
 514
 515
 516
 517void Translator::WriteMnemonic(int *ix, int mnem)
 518{//==============================================
 519	unsigned char c;
 520
 521	while((c = mnem & 0xff) != 0)
 522	{
 523		if((c == '/') && (option_phoneme_variants==0))
 524			break;      // discard phoneme variant indicator
 525		phon_out[(*ix)++]= c;
 526	//	phon_out[phon_out_ix++]= ipa1[c];
 527		mnem = mnem >> 8;
 528	}
 529}
 530
 531
 532void Translator::GetTranslatedPhonemeString(char *phon_out, int n_phon_out)
 533{//========================================================================
 534/* Can be called after a clause has been translated into phonemes, in order
 535   to display the clause in phoneme mnemonic form.
 536*/
 537
 538	int  ix;
 539	int  phon_out_ix=0;
 540	int  stress;
 541	char *p;
 542	PHONEME_LIST *plist;
 543	
 544	static const char *stress_chars = "==,,''";
 545
 546	if(phon_out != NULL)
 547	{
 548		for(ix=1; ix<(n_phoneme_list-2) && (phon_out_ix < (n_phon_out - 6)); ix++)
 549		{
 550			plist = &phoneme_list[ix];
 551			if(plist->newword)
 552				phon_out[phon_out_ix++] = ' ';
 553
 554			if(plist->synthflags & SFLAG_SYLLABLE)
 555			{
 556				if((stress = plist->tone) > 1)
 557				{
 558					if(stress > 5) stress = 5;
 559					phon_out[phon_out_ix++] = stress_chars[stress];
 560				}
 561			}
 562			WriteMnemonic(&phon_out_ix,plist->ph->mnemonic);
 563
 564			if(plist->synthflags & SFLAG_LENGTHEN)
 565			{
 566				WriteMnemonic(&phon_out_ix,phoneme_tab[phonLENGTHEN]->mnemonic);
 567			}
 568			if((plist->synthflags & SFLAG_SYLLABLE) && (plist->type != phVOWEL))
 569			{
 570				// syllablic consonant
 571				WriteMnemonic(&phon_out_ix,phoneme_tab[phonSYLLABIC]->mnemonic);
 572			}
 573			if(plist->ph->code == phonSWITCH)
 574			{
 575				// the tone_ph field contains a phoneme table number
 576				p = phoneme_tab_list[plist->tone_ph].name;
 577				while(*p != 0)
 578				{
 579					phon_out[phon_out_ix++] = *p++;
 580				}
 581				phon_out[phon_out_ix++] = ' ';
 582			}
 583			else
 584			if(plist->tone_ph > 0)
 585			{
 586				WriteMnemonic(&phon_out_ix,phoneme_tab[plist->tone_ph]->mnemonic);
 587			}
 588		}
 589	
 590		if(phon_out_ix >= n_phon_out)
 591			phon_out_ix = n_phon_out - 1;
 592		phon_out[phon_out_ix] = 0;
 593	}
 594}  // end of Translator::GetTranslatedPhonemeString
 595
 596
 597
 598//=============================================================================================
 599//   Is a word Unpronouncable - and so should be spoken as individual letters
 600//
 601//=============================================================================================
 602
 603
#ifdef deleted
// this is the initials_bitmap for english
// Not compiled unless 'deleted' is defined; nothing in the visible part of this file refers to it.
// The trailing comment on alternate rows gives the index of the row's first byte.
static unsigned char initials_bitmap[86] = {
 0x00, 0x00, 0x00, 0x00, 0x22, 0x08, 0x00, 0x88,  //  0
 0x20, 0x24, 0x20, 0x80, 0x10, 0x00, 0x00, 0x00,
 0x00, 0x28, 0x08, 0x00, 0x88, 0x22, 0x04, 0x00,  // 16
 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
 0x00, 0x88, 0x22, 0x04, 0x00, 0x02, 0x00, 0x00,  // 32
 0x20, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
 0x00, 0x28, 0x8a, 0x03, 0x00, 0x00, 0x40, 0x00,  // 48
 0x02, 0x00, 0x41, 0xca, 0x9b, 0x06, 0x20, 0x80,
 0x91, 0x00, 0x00, 0x00, 0x00, 0x20, 0x08, 0x00,  // 64
 0x08, 0x20, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00,
 0x00, 0x00, 0x22, 0x00, 0x01, 0x00, };
#endif
 619
 620
 621int Translator::Unpronouncable(char *word)
 622{//=======================================
 623/* Determines whether a word in 'unpronouncable', i.e. whether it should
 624	be spoken as individual letters.
 625
 626	This function may be language specific. This is a generic version.
 627*/
 628
 629	int  c;
 630	int  c1=0;
 631	int  vowel_posn=9;
 632	int  index;
 633	int  count;
 634	int  apostrophe=0;
 635
 636	if(langopts.param[LOPT_UNPRONOUNCABLE] == 1)
 637		return(0);
 638
 639	if((*word == ' ') || (*word == 0))
 640		return(0);
 641
 642	index = 0;
 643	count = 0;
 644	for(;;)
 645	{
 646		index += utf8_in(&c,&word[index],0);
 647		if((c==0) || (c==' '))
 648			break;
 649
 650		if(count==0)
 651			c1 = c;
 652		count++;
 653
 654		if(IsVowel(c))
 655		{
 656			vowel_posn = count;    // position of the first vowel
 657			break;
 658		}
 659
 660		if(c == '\'')
 661			apostrophe = 1;
 662		else
 663		if(!iswalpha(c))
 664			return(0);        // letter (not vowel) outside a-z range or apostrophe, abort test
 665	}
 666
 667	if((vowel_posn < 9) && (langopts.param[LOPT_UNPRONOUNCABLE] == 2))
 668		return(0);   // option means allow any word with a vowel
 669
 670	if(c1 == langopts.param[LOPT_UNPRONOUNCABLE])
 671		vowel_posn--;   // disregard this as the initial letter when counting
 672
 673	if(vowel_posn > (langopts.max_initial_consonants+1))
 674		return(1);  // no vowel, or no vowel in first four letters
 675
 676return(0);
 677
 678}   /* end of Unpronounceable */
 679
 680
 681
 682int Translator::IsLetterGroup(char *word, int group, int pre)
 683{//==========================================================
 684	// match the word against a list of utf-8 strings
 685	char *p;
 686	char *w;
 687
 688	p = letterGroups[group];
 689
 690	while(*p != RULE_GROUP_END)
 691	{
 692		w = word;
 693		while(*p == *w)
 694		{
 695			w++;
 696			p++;
 697		}
 698		if(*p == 0)
 699			return(w-word);   // matched a complete string
 700
 701		while(*p++ != 0);  // skip to end of string
 702	}
 703	return(0);
 704}
 705
 706
 707int Translator::IsLetter(int letter, int group)
 708{//============================================
 709	int letter2;
 710
 711	if(letter_groups[group] != NULL)
 712	{
 713		if(wcschr(letter_groups[group],letter))
 714			return(1);
 715		return(0);
 716	}
 717
 718	if(group > 7)
 719		return(0);
 720
 721	if(letter_bits_offset > 0)
 722	{
 723		if(((letter2 = (letter - letter_bits_offset)) > 0) && (letter2 < 0x80))
 724				letter = letter2;
 725		else
 726			return(0);
 727	}
 728	else
 729	{
 730		if((letter >= 0xc0) && (letter <= 0x241))
 731			return(letter_bits[remove_accent[letter-0xc0]] & (1L << group));
 732	}
 733
 734	if((letter >= 0) && (letter < 0x80))
 735		return(letter_bits[letter] & (1L << group));
 736
 737	return(0);
 738}
 739
 740
 741int Translator::IsVowel(int letter)
 742{//================================
 743	return(IsLetter(letter,0));
 744}
 745
 746void SetLetterVowel(Translator *tr, int c)
 747{//=======================================
 748	tr->letter_bits[c] = (tr->letter_bits[c] & 0x40) | 0x81;  // keep value for group 6 (front vowels e,i,y)
 749}
 750
 751void ResetLetterBits(Translator *tr, int groups)
 752{//=============================================
 753// Clear all the specified groups
 754	unsigned int ix;
 755	unsigned int mask;
 756
 757	mask = ~groups;
 758
 759	for(ix=0; ix<sizeof(tr->letter_bits); ix++)
 760	{
 761		tr->letter_bits[ix] &= mask;
 762	}
 763}
 764
 765
 766void SetLetterBits(Translator *tr, int group, const char *string)
 767{//==============================================================
 768	int bits;
 769	unsigned char c;
 770	
 771	bits = (1L << group);
 772	while((c = *string++) != 0)
 773		tr->letter_bits[c] |= bits;
 774}
 775
 776void SetLetterBitsRange(Translator *tr, int group, int first, int last)
 777{//====================================================================
 778	int bits;
 779	int ix;
 780
 781	bits = (1L << group);
 782	for(ix=first; ix<=last; ix++)
 783	{
 784		tr->letter_bits[ix] |= bits;
 785	}
 786}
 787
 788
 789
 790//=============================================================================================
 791//   Determine the stress pattern of a word
 792//
 793//=============================================================================================
 794
 795
 796
static int GetVowelStress(Translator *tr, unsigned char *phonemes, unsigned char *vowel_stress, int &vowel_count, int &stressed_syllable, int control)
{//====================================================================================================================================================
// Extract the stress level of each syllable of a word from its phoneme string.
//
//   phonemes           zero-terminated phoneme codes; rewritten in place with the stress
//                      marker phonemes (and codes which have no phoneme_tab entry) removed
//   vowel_stress       output: vowel_stress[n] is the stress of the n'th syllable, counting
//                      from 1; entry 0 and the entry after the last syllable are set to 0
//   vowel_count        output: number of syllables counted + 1
//   stressed_syllable  input: if > 0, the syllable which is forced to have primary stress
//                      (limited to the last syllable);
//                      output: position of the primary stress, or 0 if none was found
//   control            bit 0: set stress to 1 for forced unstressed vowels
//
// Returns the highest stress level found, after any priority stress (5) has been
// converted to primary stress (4).
// control = 1, set stress to 1 for forced unstressed vowels
	unsigned char phcode;
	PHONEME_TAB *ph;
	unsigned char *ph_out = phonemes;
	int count = 1;
	int max_stress = 0;
	int ix;
	int j;
	int stress = 0;
	int primary_posn = 0;

	vowel_stress[0] = 0;
	// the count limit stops the scan before vowel_stress[] could hold more than
	// (N_WORD_PHONEMES/2) entries
	while(((phcode = *phonemes++) != 0) && (count < (N_WORD_PHONEMES/2)-1))
	{
		if((ph = phoneme_tab[phcode]) == NULL)
			continue;

		if((ph->type == phSTRESS) && (ph->spect == 0))
		{
			/* stress marker, use this for the following vowel */

			if(phcode == phonSTRESS_PREV)
			{
				/* primary stress on preceeding vowel */
				// search backwards for the nearest vowel which may take the stress;
				// not done if the stress position was given by the caller
				j = count - 1;
				while((j > 0) && (stressed_syllable == 0) && (vowel_stress[j] < 4))
				{
					if(vowel_stress[j] != 1)
					{
						// don't promote a phoneme which must be unstressed
						vowel_stress[j] = 4;

						if(max_stress < 4)
						{
							max_stress = 4;
							primary_posn = j;
						}
	
						/* reduce any preceding primary stress markers */
						for(ix=1; ix<j; ix++)
						{
							if(vowel_stress[ix] == 4)
								vowel_stress[ix] = 3;
						}
						break;
					}
					j--;
				}
			}
			else
			{
				// the stress level of a marker is held in its std_length field;
				// levels of 4 or more are ignored if the caller gave the stress position
				if((ph->std_length < 4) || (stressed_syllable == 0))
				{
					stress = ph->std_length;

					if(stress > max_stress)
						max_stress = stress;
				}
			}
			continue;   // the stress marker is not copied to the output
		}

		if((ph->type == phVOWEL) && !(ph->phflags & phNONSYLLABIC))
		{
			// a syllabic vowel takes the stress of the marker which preceded it, if any
			vowel_stress[count] = (char)stress;
			if((stress >= 4) && (stress >= max_stress))
			{
				primary_posn = count;
				max_stress = stress;
			}

			if((stress == 0) && (control & 1) && (ph->phflags & phUNSTRESSED))
				vowel_stress[count] = 1;   /* weak vowel, must be unstressed */

			count++;
			stress = 0;
		}
		else
		if(phcode == phonSYLLABIC)
		{
			// previous consonant phoneme is syllablic
			// note: the syllable is only counted when there was no stress marker and
			// bit 0 of control is set
			vowel_stress[count] = (char)stress;
			if((stress == 0) && (control & 1))
				vowel_stress[count++] = 1;    // syllabic consonant, usually unstressed
		}

		*ph_out++ = phcode;
	}
	vowel_stress[count] = 0;
	*ph_out = 0;

	/* has the position of the primary stress been specified by $1, $2, etc? */
	if(stressed_syllable > 0)
	{
		if(stressed_syllable >= count)
			stressed_syllable = count-1;   // the final syllable

		vowel_stress[stressed_syllable] = 4;
		max_stress = 4;
		primary_posn = stressed_syllable;
	}

	if(max_stress == 5)
	{
		// priority stress, replaces any other primary stress marker
		for(ix=1; ix<count; ix++)
		{
			if(vowel_stress[ix] == 4)
			{
				// existing primary stress is removed (stress_flags bit 0x20000)
				// or reduced to secondary stress
				if(tr->langopts.stress_flags & 0x20000)
					vowel_stress[ix] = 0;
				else
					vowel_stress[ix] = 3;
			}

			if(vowel_stress[ix] == 5)
			{
				vowel_stress[ix] = 4;
				primary_posn = ix;
			}
		}
		max_stress = 4;
	}

	stressed_syllable = primary_posn;
	vowel_count = count;
	return(max_stress);
}  // end of GetVowelStress
 927
 928
 929
 930static char stress_phonemes[] = {phonSTRESS_U, phonSTRESS_D, phonSTRESS_2, phonSTRESS_3,
 931		phonSTRESS_P, phonSTRESS_P2, phonSTRESS_TONIC};
 932
 933
 934void ChangeWordStress(Translator *tr, char *word, int new_stress)
 935{//==============================================================
 936	int ix;
 937	unsigned char *p;
 938	int  max_stress;
 939	int  vowel_count;              // num of vowels + 1
 940	int  stressed_syllable=0;      // position of stressed syllable
 941	unsigned char phonetic[N_WORD_PHONEMES];
 942	unsigned char vowel_stress[N_WORD_PHONEMES/2];
 943
 944	strcpy((char *)phonetic,word);
 945	max_stress = GetVowelStress(tr, phonetic, vowel_stress, vowel_count, stressed_syllable, 0);
 946
 947	if(new_stress >= 4)
 948	{
 949		// promote to primary stress
 950		for(ix=1; ix<vowel_count; ix++)
 951		{
 952			if(vowel_stress[ix] >= max_stress)
 953			{
 954				vowel_stress[ix] = new_stress;
 955				break;
 956			}
 957		}
 958	}
 959	else
 960	{
 961		// remove primary stress
 962		for(ix=1; ix<vowel_count; ix++)
 963		{
 964			if(vowel_stress[ix] > new_stress)   // >= allows for diminished stress (=1)
 965				vowel_stress[ix] = new_stress;
 966		}
 967	}
 968
 969	// write out phonemes
 970	ix = 1;
 971	p = phonetic;
 972	while(*p != 0)
 973	{
 974		if((phoneme_tab[*p]->type == phVOWEL) && !(phoneme_tab[*p]->phflags & phNONSYLLABIC))
 975		{
 976			if(vowel_stress[ix] != 0)
 977				*word++ = stress_phonemes[vowel_stress[ix]];
 978
 979			ix++;
 980		}
 981		*word++ = *p++;
 982	}
 983	*word = 0;
 984}  // end of ChangeWordStress
 985
 986
 987
 988void Translator::SetWordStress(char *output, unsigned int dictionary_flags, int tonic, int prev_stress)
 989{//===================================================================================================
 990/* Guess stress pattern of word.  This is language specific
 991
 992   'dictionary_flags' has bits 0-3   position of stressed vowel (if > 0)
 993                                     or unstressed (if == 7) or syllables 1 and 2 (if == 6)
 994                          bits 8...  dictionary flags
 995
 996   If 'tonic' is set (>= 0), replace highest stress by this value.
 997
 998   Parameter used for input and output
 999*/
1000
1001	unsigned char phcode;
1002	unsigned char *p;
1003	PHONEME_TAB *ph;
1004	int  stress;
1005	int  max_stress;
1006	int  vowel_count;      // num of vowels + 1
1007	int  ix;
1008	int  v;
1009	int  v_stress;
1010	int  stressed_syllable;      // position of stressed syllable
1011	int  max_stress_posn;
1012	int  unstressed_word = 0;
1013	char *max_output;
1014	int final_ph;
1015	int mnem;
1016	int post_tonic;
1017	int opt_length;
1018	int done;
1019
1020	unsigned char vowel_stress[N_WORD_PHONEMES/2];
1021	char syllable_weight[N_WORD_PHONEMES/2];
1022	unsigned char phonetic[N_WORD_PHONEMES];
1023
1024	static char consonant_types[16] = {0,0,0,1,1,1,1,1,1,1,0,0,0,0,0,0};
1025
1026
1027	/* stress numbers  STRESS_BASE +
1028		0  diminished, unstressed within a word
1029		1  unstressed, weak
1030		2
1031		3  secondary stress
1032		4  main stress */
1033
1034	/* copy input string into internal buffer */
1035	for(ix=0; ix<N_WORD_PHONEMES; ix++)
1036	{
1037		phonetic[ix] = output[ix];
1038		// check for unknown phoneme codes
1039		if(phonetic[ix] >= n_phoneme_tab)
1040			phonetic[ix] = phonSCHWA;
1041		if(phonetic[ix] == 0)
1042			break;
1043	}
1044	if(ix == 0) return;
1045	final_ph = phonetic[ix-1];
1046
1047	max_output = output + (N_WORD_PHONEMES-3);   /* check for overrun */
1048
1049	// any stress position marked in the xx_list dictionary ? 
1050	stressed_syllable = dictionary_flags & 0x7;
1051	if(dictionary_flags & 0x8)
1052	{
1053		// this indicates a word without a primary stress
1054		stressed_syllable = dictionary_flags & 0x3;
1055		unstressed_word = 1;
1056	}
1057
1058	max_stress = GetVowelStress(this, phonetic, vowel_stress, vowel_count, stressed_syllable, 1);
1059
1060	// heavy or light syllables
1061	ix = 1;
1062	for(p = phonetic; *p != 0; p++)
1063	{
1064		if((phoneme_tab[p[0]]->type == phVOWEL) && !(phoneme_tab[p[0]]->phflags & phNONSYLLABIC))
1065		{
1066			int weight = 0;
1067			int lengthened = 0;
1068
1069			if(phoneme_tab[p[1]]->code == phonLENGTHEN)
1070				lengthened = 1;
1071
1072			if(lengthened || (phoneme_tab[p[0]]->phflags & phLONG))
1073			{
1074				// long vowel, increase syllable weight
1075				weight++;
1076			}
1077
1078			if(lengthened) p++;  // advance over phonLENGTHEN
1079
1080			if(consonant_types[phoneme_tab[p[1]]->type] && ((phoneme_tab[p[2]]->type != phVOWEL) || (phoneme_tab[p[1]]->phflags & phLONG)))
1081			{
1082				// followed by two consonants, a long consonant, or consonant and end-of-word
1083				weight++;
1084			}
1085			syllable_weight[ix] = weight;
1086			ix++;
1087		}
1088	}
1089	
1090	switch(langopts.stress_rule)
1091	{
1092	case 8:
1093		// stress on first syllable, unless it is a light syllable
1094		if(syllable_weight[1] > 0)
1095			break;
1096		// else drop through to case 1
1097	case 1:
1098		// stress on second syllable
1099		if((stressed_syllable == 0) && (vowel_count > 2))
1100		{
1101			stressed_syllable = 2;
1102			if(max_stress == 0)
1103			{
1104				vowel_stress[stressed_syllable] = 4;
1105			}
1106			max_stress = 4;
1107		}
1108		break;
1109
1110	case 2:
1111		// a language with stress on penultimate vowel
1112
1113		if(stressed_syllable == 0)
1114		{
1115			/* no explicit stress - stress the penultimate vowel */
1116			max_stress = 4;
1117
1118			if(vowel_count > 2)
1119			{
1120				stressed_syllable = vowel_count - 2;
1121
1122				if(langopts.stress_flags & 0x300)
1123				{
1124					// LANG=Spanish, stress on last vowel if the word ends in a consonant other than 'n' or 's'
1125					if(phoneme_tab[final_ph]->type != phVOWEL)
1126					{
1127						if(langopts.stress_flags & 0x100)
1128						{
1129							stressed_syllable = vowel_count - 1;
1130						}
1131						else
1132						{
1133							mnem = phoneme_tab[final_ph]->mnemonic;
1134							if((mnem != 'n') && (mnem != 's'))
1135							{
1136								stressed_syllable = vowel_count - 1;
1137							}
1138						}
1139					}
1140				}
1141
1142				if(vowel_stress[stressed_syllable] == 1)
1143				{
1144					// but this vowel is explicitly marked as unstressed
1145					if(stressed_syllable > 1)
1146						stressed_syllable--;
1147					else
1148						stressed_syllable++;
1149				}
1150			}
1151			else
1152			{
1153				stressed_syllable = 1;
1154				if(langopts.stress_flags & 0x1)
1155					max_stress = 3;   // don't give full stress to monosyllables
1156			}
1157
1158			// only set the stress if it's not already marked explicitly
1159			if(vowel_stress[stressed_syllable] == 0)
1160			{
1161				// don't stress if next and prev syllables are stressed
1162				if((vowel_stress[stressed_syllable-1] < 4) || (vowel_stress[stressed_syllable+1] < 4))
1163					vowel_stress[stressed_syllable] = max_stress;
1164			}
1165		}
1166		break;
1167
1168   case 3:
1169		// stress on last vowel
1170		if(stressed_syllable == 0)
1171		{
1172			/* no explicit stress - stress the final vowel */
1173			stressed_syllable = vowel_count - 1;
1174			if(max_stress == 0)
1175			{
1176				while(stressed_syllable > 0)
1177				{
1178					if(vowel_stress[stressed_syllable] == 0)
1179					{
1180						vowel_stress[stressed_syllable] = 4;
1181						break;
1182					}
1183					else
1184						stressed_syllable--;
1185				}
1186			}
1187			max_stress = 4;
1188		}
1189		break;
1190
1191	case 4:   // stress on antipenultimate vowel
1192		if(stressed_syllable == 0)
1193		{
1194			stressed_syllable = vowel_count - 3;
1195			if(stressed_syllable < 1)
1196				stressed_syllable = 1;
1197
1198			if(max_stress == 0)
1199			{
1200				vowel_stress[stressed_syllable] = 4;
1201			}
1202			max_stress = 4;
1203		}
1204		break;
1205
1206	case 5:
1207		// LANG=Russian
1208		if(stressed_syllable == 0)
1209		{
1210			/* no explicit stress - guess the stress from the number of syllables */
1211			static char guess_ru[16] =   {0,0,1,1,2,3,3,4,5,6,7,7,8,9,10,11};
1212			static char guess_ru_v[16] = {0,0,1,1,2,2,3,3,4,5,6,7,7,8,9,10};  // for final phoneme is a vowel
1213			static char guess_ru_t[16] = {0,0,1,2,3,3,3,4,5,6,7,7,7,8,9,10};  // for final phoneme is an unvoiced stop
1214
1215			stressed_syllable = vowel_count - 3;
1216			if(vowel_count < 16)
1217			{
1218				if(phoneme_tab[final_ph]->type == phVOWEL)
1219					stressed_syllable = guess_ru_v[vowel_count];
1220				else
1221				if(phoneme_tab[final_ph]->type == phSTOP)
1222					stressed_syllable = guess_ru_t[vowel_count];
1223				else
1224					stressed_syllable = guess_ru[vowel_count];
1225			}
1226			vowel_stress[stressed_syllable] = 4;
1227			max_stress = 4;
1228		}
1229		break;
1230
1231	case 6:    // LANG=hi stress on the last heaviest syllable
1232		if(stressed_syllable == 0)
1233		{
1234			int wt;
1235			int max_weight = -1;
1236			int prev_stressed;
1237
1238			// find the heaviest syllable, excluding the final syllable
1239			for(ix = 1; ix < (vowel_count-1); ix++)
1240			{
1241				if(vowel_stress[ix] == 0)
1242				{
1243					if((wt = syllable_weight[ix]) >= max_weight)
1244					{
1245						max_weight = wt;
1246						prev_stressed = stressed_syllable;
1247						stressed_syllable = ix;
1248					}
1249				}
1250			}
1251
1252			if((syllable_weight[vowel_count-1] == 2) &&  (max_weight< 2))
1253			{
1254				// the only double=heavy syllable is the final syllable, so stress this
1255				stressed_syllable = vowel_count-1;
1256			}
1257			else
1258			if(max_weight <= 0)
1259			{
1260				// all syllables, exclusing the last, are light. Stress the first syllable
1261				stressed_syllable = 1;
1262			}
1263
1264			vowel_stress[stressed_syllable] = 4;
1265			max_stress = 4;
1266		}
1267		break;
1268
1269	case 7:  // LANG=tr, the last syllable for any vowel markes explicitly as unstressed
1270		if(stressed_syllable == 0)
1271		{
1272			stressed_syllable = vowel_count - 1;
1273			for(ix=1; ix < vowel_count; ix++)
1274			{
1275				if(vowel_stress[ix] == 1)
1276				{
1277					stressed_syllable = ix-1;
1278					break;
1279				}
1280			}
1281			vowel_stress[stressed_syllable] = 4;
1282			max_stress = 4;
1283		}
1284		break;
1285
1286	case 9:  // mark all as stressed
1287		for(ix=1; ix<vowel_count; ix++)
1288		{
1289			if(vowel_stress[ix] == 0)
1290				vowel_stress[ix] = 4;
1291		}
1292		break;
1293	}
1294
1295	/* now guess the complete stress pattern */
1296	if(max_stress < 4)
1297		stress = 4;  /* no primary stress marked, use for 1st syllable */
1298	else
1299		stress = 3;
1300
1301
1302	if((langopts.stress_flags & 0x1000) && (vowel_count == 2))
1303	{
1304		// Two syllable word, if one syllable has primary stress, then give the other secondary stress
1305		if(vowel_stress[1] == 4)
1306			vowel_stress[2] = 3;
1307		if(vowel_stress[2] == 4)
1308			vowel_stress[1] = 3;
1309	}
1310#if deleted
1311	if((langopts.stress_flags & 0x2000) && (vowel_stress[1] == 0))
1312	{
1313		// If there is only one syllable before the primary stress, give it a secondary stress
1314		if((vowel_count > 2) && (vowel_stress[2] >= 4))
1315		{
1316			vowel_stress[1] = 3;
1317		}
1318	}
1319#endif
1320
1321	done = 0;
1322	for(v=1; v<vowel_count; v++)
1323	{
1324		if(vowel_stress[v] == 0)
1325		{
1326			if((langopts.stress_flags & 0x10) && (stress < 4) && (v == vowel_count-1))
1327			{
1328				// flag: don't give secondary stress to final vowel
1329			}
1330			else
1331			if((langopts.stress_flags & 0x8000) && (done == 0))
1332			{
1333				vowel_stress[v] = (char)stress;
1334				done =1;
1335				stress = 3;  /* use secondary stress for remaining syllables */
1336			}
1337			else
1338			if((vowel_stress[v-1] <= 1) && (vowel_stress[v+1] <= 1))
1339			{
1340				/* trochaic: give stress to vowel surrounded by unstressed vowels */
1341
1342				if((stress == 3) && (langopts.stress_flags & 0x20))
1343					continue;      // don't use secondary stress
1344
1345				if((v > 1) && (langopts.stress_flags & 0x40) && (syllable_weight[v]==0) && (syllable_weight[v+1]>0))
1346				{
1347					// don't put secondary stress on a light syllable which is followed by a heavy syllable
1348					continue;
1349				}
1350
1351// should start with secondary stress on the first syllable, or should it count back from
1352// the primary stress and put secondary stress on alternate syllables?
1353				vowel_stress[v] = (char)stress;
1354				done =1;
1355				stress = 3;  /* use secondary stress for remaining syllables */
1356			}
1357		}
1358	}
1359
1360	if((unstressed_word) && (tonic < 0))
1361	{
1362		if(vowel_count <= 2)
1363			tonic = langopts.unstressed_wd1;   /* monosyllable - unstressed */
1364		else
1365			tonic = langopts.unstressed_wd2;   /* more than one syllable, used secondary stress as the main stress */
1366	}
1367
1368	max_stress = 0;
1369	max_stress_posn = 0;
1370	for(v=1; v<vowel_count; v++)
1371	{
1372		if(vowel_stress[v] >= max_stress)
1373		{
1374			max_stress = vowel_stress[v];
1375			max_stress_posn = v;
1376		}
1377	}
1378
1379	if(tonic >= 0)
1380	{
1381		/* find position of highest stress, and replace it by 'tonic' */
1382
1383		/* don't disturb an explicitly set stress by 'unstress-at-end' flag */
1384		if((tonic > max_stress) || (max_stress <= 4))
1385			vowel_stress[max_stress_posn] = (char)tonic;
1386		max_stress = tonic;
1387	}
1388
1389
1390	/* produce output phoneme string */
1391	p = phonetic;
1392	v = 1;
1393
1394	if((ph = phoneme_tab[*p]) != NULL)
1395	{
1396
1397		if(ph->type == phSTRESS)
1398			ph = phoneme_tab[p[1]];
1399
1400#ifdef deleted
1401		int gap = langopts.word_gap & 0x700;
1402		if((gap) && (vowel_stress[1] >= 4) && (prev_stress >= 4))
1403		{
1404			/* two primary stresses together, insert a short pause */
1405			*output++ = pause_phonemes[gap >> 8];
1406		}
1407		else
1408#endif
1409		if((langopts.vowel_pause & 0x30) && (ph->type == phVOWEL))
1410		{
1411			// word starts with a vowel
1412
1413			if((langopts.vowel_pause & 0x20) && (vowel_stress[1] >= 4))
1414			{
1415					*output++ = phonPAUSE_NOLINK;   // not to be replaced by link
1416			}
1417			else
1418			{
1419				*output++ = phonPAUSE_VSHORT;     // break, but no pause
1420			}
1421		}
1422	}
1423
1424	p = phonetic;
1425	post_tonic = 0;
1426	while(((phcode = *p++) != 0) && (output < max_output))
1427	{
1428		if((ph = phoneme_tab[phcode]) == NULL)
1429			continue;
1430
1431//		if(ph->type == phSTRESS)
1432//			continue;
1433
1434		if(ph->type == phPAUSE)
1435		{
1436			prev_last_stress = 0;
1437		}
1438		else
1439		if(((ph->type == phVOWEL) && !(ph->phflags & phNONSYLLABIC)) || (*p == phonSYLLABIC))
1440		{
1441			// a vowel, or a consonant followed by a syllabic consonant marker
1442
1443			v_stress = vowel_stress[v];
1444			prev_last_stress = v_stress;
1445
1446			if(vowel_stress[v-1] >= max_stress)
1447				post_tonic = 1;
1448
1449			if(v_stress <= 1)
1450			{
1451				if((v > 1) && (max_stress >= 4) && (langopts.stress_flags & 4) && (v == (vowel_count-1)))
1452				{
1453					// option: mark unstressed final syllable as diminished
1454					v_stress = 1;
1455				}
1456				else
1457				if((langopts.stress_flags & 2) || (v == 1) || (v == (vowel_count-1)))
1458				{
1459					// first or last syllable, or option 'don't set diminished stress'
1460					v_stress = 0;
1461				}
1462				else
1463				if((v == (vowel_count-2)) && (vowel_stress[vowel_count-1] <= 1))
1464				{
1465					// penultimate syllable, followed by an unstressed final syllable
1466					v_stress = 0;
1467				}
1468				else
1469				{
1470					// unstressed syllable within a word
1471					if((vowel_stress[v-1] != 1) || ((langopts.stress_flags & 0x10000) == 0))
1472					{
1473						v_stress = 1;      /* change from 0 (unstressed) to 1 (diminished stress) */
1474						vowel_stress[v] = v_stress;
1475					}
1476				}
1477			}
1478
1479			if(v_stress > 0)
1480				*output++ = stress_phonemes[v_stress];  // mark stress of all vowels except 0 (unstressed)
1481
1482
1483			if(vowel_stress[v] > max_stress)
1484			{
1485				max_stress = vowel_stress[v];
1486			}
1487
1488			if((*p == phonLENGTHEN) && ((opt_length = langopts.param[LOPT_IT_LENGTHEN]) != 0))
1489			{
1490				// remove lengthen indicator from non-stressed syllables
1491				int shorten=0;
1492
1493				if(opt_length & 0x10)
1494				{
1495					// only allow lengthen indicator on the highest stress syllable in the word
1496					if(v != max_stress_posn)
1497						shorten = 1;
1498				}
1499				else
1500				if(v_stress < 4)
1501				{
1502					// only allow lengthen indicator if stress >= 4.
1503					shorten = 1;
1504				}
1505
1506				if(((opt_length & 0xf)==2) && (v != (vowel_count - 2)))
1507					shorten = 1;    // LANG=Italian, remove lengthen indicator from non-penultimate syllables
1508
1509				if(shorten)
1510					p++;
1511			}
1512
1513			v++;
1514		}
1515
1516		if(phcode != 1)
1517			*output++ = phcode;
1518	}
1519	*output++ = 0;
1520
1521}  /* end of SetWordStress */
1522
1523
1524
1525
1526//=============================================================================================
1527//   Look up a word in the pronunciation rules
1528//
1529//=============================================================================================
1530
1531
1532#ifdef LOG_TRANSLATE
1533char *Translator::DecodeRule(const char *group, char *rule)
1534{//==================================================
1535/* Convert compiled match template to ascii */
1536
1537   unsigned char rb;
1538	unsigned char c;
1539	char *p;
1540   int  ix;
1541	int  match_type;
1542	int  finished=0;
1543	int  value;
1544	int  linenum=0;
1545	int  flags;
1546	int  suffix_char;
1547	int  condition_num=0;
1548   char buf[60];
1549   char buf_pre[60];
1550	char suffix[20];
1551	static char output[60];
1552
1553	static char symbols[] = {' ',' ',' ',' ',' ',' ',' ',' ',' ',
1554			'@','&','%','+','#','S','D','Z','A','L',' ',' ',' ',' ',' ','N','K','V',' ','T','X','?','W'};
1555
1556	static char symbols_lg[] = {'A','B','C','H','F','G','Y'};
1557
1558	match_type = 0;
1559   buf_pre[0] = 0;
1560	strcpy(buf,group);
1561	p = &buf[strlen(buf)];
1562   while(!finished)
1563   {
1564		rb = *rule++;
1565
1566		if(rb <= RULE_LINENUM)
1567		{
1568			switch(rb)
1569			{
1570			case 0:
1571			case RULE_PHONEMES:
1572				finished=1;
1573				break;
1574			case RULE_PRE:
1575				match_type = RULE_PRE;
1576				*p = 0;
1577				p = buf_pre;
1578				break;
1579			case RULE_POST:
1580				match_type = RULE_POST;
1581				*p = 0;
1582				strcat(buf," (");
1583				p = &buf[strlen(buf)];
1584				break;
1585			case RULE_PH_COMMON:
1586				break;
1587			case RULE_CONDITION:
1588				/* conditional rule, next byte gives condition number */
1589				condition_num = *rule++;
1590				break;
1591			case RULE_LINENUM:
1592				value = (rule[1] & 0xff) - 1;
1593				linenum = (rule[0] & 0xff) - 1 + (value * 255);
1594				rule+=2;
1595				break;
1596			}
1597			continue;
1598		}
1599		
1600		if(rb == RULE_ENDING)
1601		{
1602			static const char *flag_chars = "ei vtfq t";
1603			flags = ((rule[0] & 0x7f)<< 8) + (rule[1] & 0x7f);
1604			suffix_char = 'S';
1605			if(flags & (SUFX_P >> 8))
1606				suffix_char = 'P';
1607			sprintf(suffix,"%c%d",suffix_char,rule[2] & 0x7f);
1608			rule += 3;
1609			for(ix=0;ix<9;ix++)
1610			{
1611				if(flags & 1)
1612					sprintf(&suffix[strlen(suffix)],"%c",flag_chars[ix]);
1613				flags = (flags >> 1);
1614			}
1615			strcpy(p,suffix);
1616			p += strlen(suffix);
1617			c = ' ';
1618		}
1619		else
1620		if(rb == RULE_LETTERGP)
1621		{
1622			c = symbols_lg[*rule++ - 'A'];
1623		}
1624		else
1625		if(rb == RULE_LETTERGP2)
1626		{
1627			value = *rule++ - 'A';
1628			p[0] = 'L';
1629			p[1] = (value / 10) + '0';
1630			c = (value % 10) + '0';
1631
1632			if(match_type == RULE_PRE)
1633			{
1634				p[0] = c;
1635				c = 'L';
1636			}
1637			p+=2;
1638		}
1639		else
1640		if(rb <= RULE_LAST_RULE)
1641			c = symbols[rb];
1642		else
1643		if(rb == RULE_SPACE)
1644			c = '_';
1645		else
1646			c = rb;
1647		*p++ = c;
1648	}
1649	*p = 0;
1650
1651	p = output;
1652	if(linenum > 0)
1653	{
1654		sprintf(p,"%5d:\t",linenum);
1655		p += 7;
1656	}
1657	if(condition_num > 0)
1658	{
1659		sprintf(p,"?%d ",condition_num);
1660		p = &p[strlen(p)];
1661	}
1662	if((ix = strlen(buf_pre)) > 0)
1663	{
1664		while(--ix >= 0)
1665			*p++ = buf_pre[ix];
1666		*p++ = ')';
1667		*p++ = ' ';
1668	}
1669	*p = 0;
1670	strcat(p,buf);
1671	ix = strlen(output);
1672	while(ix < 8)
1673		output[ix++]=' ';
1674	output[ix]=0;
1675   return(output);
1676}   /* end of decode_match */
1677#endif
1678
1679
1680
1681void Translator::AppendPhonemes(char *string, int size, const char *ph)
1682{//====================================================================
1683/* Add new phoneme string "ph" to "string"
1684	Keeps count of the number of vowel phonemes in the word, and whether these
1685   can be stressed syllables.  These values can be used in translation rules
1686*/	
1687	const char *p;
1688	unsigned char  c;
1689	int  unstress_mark;
1690	int length;
1691
1692	length = strlen(ph) + strlen(string);
1693	if(length >= size)
1694	{
1695		return;
1696	}
1697
1698	/* any stressable vowel ? */
1699	unstress_mark = 0;
1700	p = ph;
1701	while((c = *p++) != 0)
1702	{
1703		if(c >= n_phoneme_tab) continue;
1704
1705		if(phoneme_tab[c]->type == phSTRESS)
1706		{
1707			if(phoneme_tab[c]->std_length < 4)
1708				unstress_mark = 1;
1709		}
1710		else
1711		{
1712			if(phoneme_tab[c]->type == phVOWEL)
1713			{
1714				if(((phoneme_tab[c]->phflags & phUNSTRESSED) == 0) &&
1715					(unstress_mark == 0))
1716				{
1717					word_stressed_count++;
1718				}
1719				unstress_mark = 0;
1720				word_vowel_count++;
1721			}
1722		}
1723	}
1724	
1725	if(string != NULL)
1726		strcat(string,ph);
1727}   /* end of AppendPhonemes */
1728
1729
1730
1731void Translator::MatchRule(char *word[], const char *group, char *rule, MatchRecord *match_out, int word_flags, int dict_flags)
1732{//============================================================================================================================
1733/* Checks a specified word against dictionary rules.
1734	Returns with phoneme code string, or NULL if no match found.
1735
1736	word (indirect) points to current character group within the input word
1737			This is advanced by this procedure as characters are consumed
1738
1739	group:  the initial characters used to choose the rules group
1740
1741	rule:  address of dictionary rule data for this character group
1742
1743	match_out:  returns best points score
1744
1745	word_flags:  indicates whether this is a retranslation after a suffix has been removed
1746*/
1747
1748	unsigned char rb;     // current instuction from rule
1749	unsigned char letter;   // current letter from input word, single byte
1750	int letter_w;         // current letter, wide character
1751	int letter_xbytes;    // number of extra bytes of multibyte character (num bytes - 1)
1752	unsigned char last_letter;
1753
1754	char *pre_ptr;
1755	char *post_ptr;       /* pointer to first character after group */
1756
1757	char *rule_start;       /* start of current match template */
1758	char *p;
1759
1760	int  match_type;      /* left, right, or consume */
1761	int  failed;
1762	int  consumed;        /* number of letters consumed from input */
1763	int  count;           /* count through rules in the group */
1764	int  syllable_count;
1765	int  vowel;
1766	int  letter_group;
1767	int  distance_right;
1768	int  distance_left;
1769	int  lg_pts;
1770	int  n_bytes;
1771
1772	MatchRecord match;
1773	static MatchRecord best;
1774
1775	int  total_consumed;  /* letters consumed for best match */
1776	int  group_length;
1777
1778	unsigned char condition_num;
1779	char *common_phonemes;  /* common to a group of entries */
1780
1781
1782
1783	if(rule == NULL)
1784	{
1785		match_out->points = 0;
1786		(*word)++;
1787		return;
1788	}
1789
1790
1791	total_consumed = 0;
1792	count = 0;
1793	common_phonemes = NULL;
1794	match_type = 0;
1795
1796	best.points = 0;
1797	best.phonemes = "";
1798	best.end_type = 0;
1799	best.del_fwd = NULL;
1800
1801	group_length = strlen(group);
1802	
1803	/* search through dictionary rules */
1804	while(rule[0] != RULE_GROUP_END)
1805	{
1806		match_type=0;
1807		consumed = 0;
1808		letter = 0;
1809		distance_right= -6;   /* used to reduce points for matches further away the current letter */
1810		distance_left= -2;
1811		count++;
1812
1813		match.points = 1;
1814		match.end_type = 0;
1815		match.del_fwd = NULL;
1816		
1817		pre_ptr = *word;
1818		post_ptr = *word + group_length;
1819
1820		/* work through next rule until end, or until no-match proved */
1821		rule_start = rule;
1822		failed = 0;
1823		while(!failed)
1824		{
1825			rb = *rule++;
1826
1827			if(rb <= RULE_LINENUM)
1828			{
1829				switch(rb)
1830				{
1831				case 0:  // no phoneme string for this rule, use previous common rule
1832					if(common_phonemes != NULL)
1833					{
1834						match.phonemes = common_phonemes;
1835						if(*match.phonemes == RULE_CONDITION)
1836							match.phonemes += 2;   // skip over condition number
1837						while(((rb = *match.phonemes++) != 0) && (rb != RULE_PHONEMES));
1838					}
1839					else
1840					{
1841						match.phonemes = "";
1842					}
1843					rule--;      // so we are still pointing at the 0
1844					failed=2;    // matched OK
1845					break;
1846				case RULE_PRE:
1847					match_type = RULE_PRE;
1848					break;
1849				case RULE_POST:
1850					match_type = RULE_POST;
1851					break;
1852				case RULE_PHONEMES:
1853					match.phonemes = rule;
1854					failed=2;     // matched OK
1855					break;
1856				case RULE_PH_COMMON:
1857					common_phonemes = rule;
1858					break;
1859				case RULE_CONDITION:
1860					/* conditional rule, next byte gives condition number */
1861					condition_num = *rule++;
1862					
1863					if(condition_num >= 32)
1864					{
1865						// allow the rule only if the condition number is NOT set
1866						if((dict_condition & (1L << (condition_num-32))) != 0)
1867							failed = 1;
1868					}
1869					else
1870					{
1871						// allow the rule only if the condition number is set
1872						if((dict_condition & (1L << condition_num)) == 0)
1873							failed = 1;
1874					}
1875
1876					if(!failed)
1877						match.points++;  // add one point for a matched conditional rule
1878					break;
1879				case RULE_LINENUM:
1880					rule+=2;
1881					break;
1882				}
1883				continue;
1884			}
1885
1886			switch(match_type)
1887			{
1888			case 0:
1889				/* match and consume this letter */
1890				last_letter = letter;
1891				letter = *post_ptr++;
1892
1893				if((letter == rb) || ((letter==(unsigned char)REPLACED_E) && (rb=='e')))
1894				{
1895					match.points += 21;
1896					consumed++;
1897				}
1898				else
1899					failed = 1;
1900				break;
1901
1902
1903			case RULE_POST:
1904				/* continue moving fowards */
1905				distance_right += 6;
1906				if(distance_right > 18)
1907					distance_right = 19;
1908				last_letter = letter;
1909				letter_xbytes = utf8_in(&letter_w,post_ptr,0)-1;
1910				letter = *post_ptr++;
1911
1912				switch(rb)
1913				{
1914				case RULE_LETTERGP:
1915					letter_group = *rule++ - 'A';
1916					if(IsLetter(letter_w,letter_group))
1917					{
1918						lg_pts = 20;
1919						if(letter_group==2)
1920							lg_pts = 19;  // fewer points for C, general consonant
1921						match.points += (lg_pts-distance_right);
1922						post_ptr += letter_xbytes;
1923					}
1924					else
1925						failed = 1;
1926					break;
1927
1928				case RULE_LETTERGP2:   // match against a list of utf-8 strings
1929					letter_group = *rule++ - 'A';
1930					if((n_bytes = IsLetterGroup(post_ptr-1,letter_group,0)) >0)
1931					{
1932						match.points += (20-distance_right);
1933						post_ptr += (n_bytes-1);
1934					}
1935					else
1936						failed =1;
1937					break;
1938
1939				case RULE_NOTVOWEL:
1940					if(!IsLetter(letter_w,0))
1941					{
1942						match.points += (20-distance_right);
1943						post_ptr += letter_xbytes;
1944					}
1945					else
1946						failed = 1;
1947					break;
1948
1949				case RULE_DIGIT:
1950					if(IsDigit(letter_w))
1951					{
1952						match.points += (20-distance_right);
1953						post_ptr += letter_xbytes;
1954					}
1955					else
1956					if(langopts.tone_numbers)
1957					{
1958						// also match if there is no digit
1959						match.points += (20-distance_right);
1960						post_ptr--;
1961					}
1962					else
1963						failed = 1;
1964					break;
1965					
1966				case RULE_NONALPHA:
1967					if(!iswalpha(letter_w))
1968					{
1969						match.points += (21-distance_right);
1970						post_ptr += letter_xbytes;
1971					}
1972					else
1973						failed = 1;
1974					break;
1975
1976				case RULE_DOUBLE:
1977					if(letter == last_letter)
1978						match.points += (21-distance_right);
1979					else
1980						failed = 1;
1981					break;
1982
1983				case RULE_ALT1:
1984					if(dict_flags & FLAG_ALT_TRANS)
1985						match.points++;
1986					else
1987						failed = 1;
1988					break;
1989
1990				case '-':
1991					if((letter == '-') || ((letter == ' ') && (word_flags & FLAG_HYPHEN_AFTER)))
1992					{
1993						match.points += (22-distance_right);    // one point more than match against space
1994					}
1995					else
1996						failed = 1;
1997					break;
1998
1999				case RULE_SYLLABLE:
2000					{
2001						/* more than specified number of vowel letters to the right */
2002						char *p = post_ptr + letter_xbytes;
2003
2004						syllable_count = 1;
2005						while(*rule == RULE_SYLLABLE)
2006						{
2007							rule++;
2008							syllable_count+=1;   /* number of syllables to match */
2009						}
2010						vowel = 0;
2011						while(letter_w != RULE_SPACE)
2012						{
2013							if((vowel==0) && IsLetter(letter_w,LETTERGP_VOWEL2))
2014							{
2015								// this is counting vowels which are separated by non-vowels
2016								syllable_count--;
2017							}
2018							vowel = IsLetter(letter_w,LETTERGP_VOWEL2);
2019							p += utf8_in(&letter_w,p,0);
2020						}
2021						if(syllable_count <= 0)
2022							match.points+= (19-distance_right);
2023						else
2024							failed = 1;
2025					}
2026					break;
2027
2028				case RULE_NOVOWELS:
2029					{
2030						char *p = post_ptr + letter_xbytes;
2031						while(letter_w != RULE_SPACE)
2032						{
2033							if(IsLetter(letter_w,LETTERGP_VOWEL2))
2034							{
2035								failed = 1;
2036								break;
2037							}
2038							p += utf8_in(&letter_w,p,0);
2039						}
2040						if(!failed)
2041							match.points += (19-distance_right);
2042					}
2043					break;
2044
2045				case RULE_INC_SCORE:
2046					match.points += 20;      // force an increase in points
2047					break;
2048
2049				case RULE_DEL_FWD:
2050					// find the next 'e' in the word and replace by ''
2051					for(p = *word + group_length; *p != ' '; p++)
2052					{
2053						if(*p == 'e')
2054						{
2055							match.del_fwd = p;
2056							break;
2057						}
2058					}
2059					break;
2060
2061				case RULE_ENDING:
2062					// next 3 bytes are a (non-zero) ending type. 2 bytes of flags + suffix length
2063					match.end_type = (rule[0] << 16) + ((rule[1] & 0x7f) << 8) + (rule[2] & 0x7f);
2064					rule += 3;
2065					break;
2066
2067				case RULE_NO_SUFFIX:
2068					if(word_flags & FLAG_SUFFIX_REMOVED)
2069						failed = 1;             // a suffix has been removed
2070					else
2071						match.points++;
2072					break;
2073
2074				default:
2075					if(letter == rb)
2076					{
2077						if(letter == RULE_SPACE)
2078							match.points += (21-distance_right);
2079						else
2080							match.points += (21-distance_right);
2081					}
2082					else
2083						failed = 1;
2084					break;
2085				}
2086				break;
2087
2088
2089			case RULE_PRE:
2090				/*…

Large files files are truncated, but you can click here to view the full file