/native/external/espeak/src/dictionary.cpp
C++ | 2754 lines | 2122 code | 384 blank | 248 comment | 612 complexity | d1049eaa6bbd8c21c88e0cbfb5da8e43 MD5 | raw file
Large files files are truncated, but you can click here to view the full file
1/*************************************************************************** 2 * Copyright (C) 2005 to 2007 by Jonathan Duddington * 3 * email: jonsd@users.sourceforge.net * 4 * * 5 * This program is free software; you can redistribute it and/or modify * 6 * it under the terms of the GNU General Public License as published by * 7 * the Free Software Foundation; either version 3 of the License, or * 8 * (at your option) any later version. * 9 * * 10 * This program is distributed in the hope that it will be useful, * 11 * but WITHOUT ANY WARRANTY; without even the implied warranty of * 12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * 13 * GNU General Public License for more details. * 14 * * 15 * You should have received a copy of the GNU General Public License * 16 * along with this program; if not, write see: * 17 * <http://www.gnu.org/licenses/>. * 18 ***************************************************************************/ 19 20#include "StdAfx.h" 21 22#define LOG_TRANSLATE 23 24#include <stdio.h> 25#include <ctype.h> 26#include <stdlib.h> 27#include <string.h> 28 29#include <wctype.h> 30////#include <wchar.h> 31 32#include "speak_lib.h" 33#include "speech.h" 34#include "phoneme.h" 35#include "synthesize.h" 36#include "translate.h" 37 38 39int dictionary_skipwords; 40char dictionary_name[40]; 41 42extern MNEM_TAB mnem_flags[]; 43extern PHONEME_TAB_LIST phoneme_tab_list[N_PHONEME_TABS]; 44 45// accented characters which indicate (in some languages) the start of a separate syllable 46//static const unsigned short diereses_list[7] = {L'ä',L'ë',L'ď',L'ö',L'ü',L'˙',0}; 47static const unsigned short diereses_list[7] = {0xe4,0xeb,0xef,0xf6,0xfc,0xff,0}; 48 49// convert characters to an approximate 7 bit ascii equivalent 50// used for checking for vowels 51static unsigned char remove_accent[] = { 52'a','a','a','a','a','a','a','c','e','e','e','e','i','i','i','i', // 0c0 53'd','n','o','o','o','o','o', 0, 'o','u','u','u','u','y','t','s', // 0d0 54'a','a','a','a','a','a','a','c','e','e','e','e','i','i','i','i', // 0e0 55'd','n','o','o','o','o','o', 0 ,'o','u','u','u','u','y','t','y', // 0f0 56 57'a','a','a','a','a','a','c','c','c','c','c','c','c','c','d','d', // 100 58'd','d','e','e','e','e','e','e','e','e','e','e','g','g','g','g', // 110 59'g','g','g','g','h','h','h','h','i','i','i','i','i','i','i','i', // 120 60'i','i','i','i','j','j','k','k','k','l','l','l','l','l','l','l', // 130 61'l','l','l','n','n','n','n','n','n','n','n','n','o','o','o','o', // 140 62'o','o','o','o','r','r','r','r','r','r','s','s','s','s','s','s', // 150 63's','s','t','t','t','t','t','t','u','u','u','u','u','u','u','u', // 160 64'u','u','u','u','w','w','y','y','y','z','z','z','z','z','z','s', // 170 65'b','b','b','b', 0, 0, 'o','c','c','d','d','d','d','d','e','e', // 180 66'e','f','f','g','g','h','i','i','k','k','l','l','m','n','n','o', // 190 67'o','o','o','o','p','p','y', 0, 0, 's','s','t','t','t','t','u', // 1a0 68'u','u','v','y','y','z','z','z','z','z','z','z', 0, 0, 0, 'w', // 1b0 69't','t','t','k','d','d','d','l','l','l','n','n','n','a','a','i', // 1c0 70'i','o','o','u','u','u','u','u','u','u','u','u','u','e','a','a', // 1d0 71'a','a','a','a','g','g','g','g','k','k','o','o','o','o','z','z', // 1e0 72'j','d','d','d','g','g','w','w','n','n','a','a','a','a','o','o', // 1f0 73 74'a','a','a','a','e','e','e','e','i','i','i','i','o','o','o','o', // 200 75'r','r','r','r','u','u','u','u','s','s','t','t','y','y','h','h', // 210 76'n','d','o','o','z','z','a','a','e','e','o','o','o','o','o','o', // 220 77'o','o','y','y','l','n','t','j','d','q','a','c','c','l','t','s', // 230 78'z', 0 }; 79 80 81 82 83void strncpy0(char *to,const char *from, int size) 84{//=============================================== 85 // strcpy with limit, ensures a zero terminator 86 strncpy(to,from,size); 87 to[size-1] = 0; 88} 89 90 91static int reverse_word_bytes(int word) 92{//============================= 93 // reverse the order of bytes from little-endian to big-endian 94#ifdef ARCH_BIG 95 int ix; 96 int word2 = 0; 97 98 for(ix=0; ix<=24; ix+=8) 99 { 100 word2 = word2 << 8; 101 word2 |= (word >> ix) & 0xff; 102 } 103 return(word2); 104#else 105 return(word); 106#endif 107} 108 109 110int LookupMnem(MNEM_TAB *table, char *string) 111{//========================================== 112 while(table->mnem != NULL) 113 { 114 if(strcmp(string,table->mnem)==0) 115 return(table->value); 116 table++; 117 } 118 return(table->value); 119} 120 121const char *LookupMnem(MNEM_TAB *table, int value) 122{//=============================================== 123 while(table->mnem != NULL) 124 { 125 if(table->value == value) 126 return(table->mnem); 127 table++; 128 } 129 return(""); 130} 131 132 133 134//============================================================================================= 135// Read pronunciation rules and pronunciation lookup dictionary 136// 137//============================================================================================= 138 139 140 141int Translator::LoadDictionary(const char *name, int no_error) 142{//=========================================================== 143 int hash; 144 char *p; 145 int *pw; 146 int length; 147 FILE *f; 148 unsigned int size; 149 char fname[sizeof(path_home)+20]; 150 151 strcpy(dictionary_name,name); // currently loaded dictionary name 152 153 if(no_error) // don't load dictionary, just set the dictionary_name 154 return(1); 155 156 // Load a pronunciation data file into memory 157 // bytes 0-3: offset to rules data 158 // bytes 4-7: number of hash table entries 159 sprintf(fname,"%s%c%s_dict",path_home,PATHSEP,name); 160 size = GetFileLength(fname); 161 162 f = fopen(fname,"rb"); 163 if((f == NULL) || (size <= 0)) 164 { 165 if(no_error == 0) 166 { 167 fprintf(stderr,"Can't read dictionary file: '%s'\n",fname); 168 } 169 return(1); 170 } 171 172 if(data_dictlist != NULL) 173 Free(data_dictlist); 174 175 data_dictlist = Alloc(size); 176 fread(data_dictlist,size,1,f); 177 fclose(f); 178 179 180 pw = (int *)data_dictlist; 181 length = reverse_word_bytes(pw[1]); 182 183 if(size <= (N_HASH_DICT + sizeof(int)*2)) 184 { 185 fprintf(stderr,"Empty _dict file: '%s\n",fname); 186 return(2); 187 } 188 189 if((reverse_word_bytes(pw[0]) != N_HASH_DICT) || 190 (length <= 0) || (length > 0x8000000)) 191 { 192 fprintf(stderr,"Bad data: '%s' (%x length=%x)\n",fname,reverse_word_bytes(pw[0]),length); 193 return(2); 194 } 195 data_dictrules = &data_dictlist[length]; 196 197 // set up indices into data_dictrules 198 InitGroups(); 199 if(groups1[0] == NULL) 200 { 201 fprintf(stderr,"Error in %s_rules, no default rule group\n",name); 202 } 203 204 // set up hash table for data_dictlist 205 p = &data_dictlist[8]; 206 207 for(hash=0; hash<N_HASH_DICT; hash++) 208 { 209 dict_hashtab[hash] = p; 210 while((length = *p) != 0) 211 { 212 p += length; 213 } 214 p++; // skip over the zero which terminates the list for this hash value 215 } 216 217 return(0); 218} // end of LoadDictionary 219 220 221void Translator::InitGroups(void) 222{//============================== 223/* Called after dictionary 1 is loaded, to set up table of entry points for translation rule chains 224 for single-letters and two-letter combinations 225*/ 226 227 int ix; 228 char *p; 229 char *p_name; 230 unsigned int *pw; 231 unsigned char c, c2; 232 int len; 233 234 n_groups2 = 0; 235 for(ix=0; ix<256; ix++) 236 { 237 groups1[ix]=NULL; 238 groups2_count[ix]=0; 239 groups2_start[ix]=255; // indicates "not set" 240 } 241 memset(letterGroups,0,sizeof(letterGroups)); 242 243 p = data_dictrules; 244 while(*p != 0) 245 { 246 if(*p != RULE_GROUP_START) 247 { 248 fprintf(stderr,"Bad rules data in '%s_dict' at 0x%x\n",dictionary_name,(unsigned int)(p-data_dictrules)); 249 break; 250 } 251 p++; 252 253 if(p[0] == RULE_REPLACEMENTS) 254 { 255 pw = (unsigned int *)(((long)p+4) & ~3); // advance to next word boundary 256 langopts.replace_chars = pw; 257 while(pw[0] != 0) 258 { 259 pw += 2; // find the end of the replacement list, each entry is 2 words. 260 } 261 p = (char *)(pw+1); 262 263#ifdef ARCH_BIG 264 pw = (unsigned int *)langopts.replace_chars; 265 while(*pw != 0) 266 { 267 *pw = reverse_word_bytes(*pw); 268 pw++; 269 *pw = reverse_word_bytes(*pw); 270 pw++; 271 } 272#endif 273 continue; 274 } 275 276 if(p[0] == RULE_LETTERGP2) 277 { 278 ix = p[1] - 'A'; 279 p += 2; 280 if((ix >= 0) && (ix < N_LETTER_GROUPS)) 281 { 282 letterGroups[ix] = p; 283 } 284 } 285 else 286 { 287 len = strlen(p); 288 p_name = p; 289 c = p_name[0]; 290 291 p += (len+1); 292 if(len == 1) 293 { 294 groups1[c] = p; 295 } 296 else 297 if(len == 0) 298 { 299 groups1[0] = p; 300 } 301 else 302 { 303 if(groups2_start[c] == 255) 304 groups2_start[c] = n_groups2; 305 306 groups2_count[c]++; 307 groups2[n_groups2] = p; 308 c2 = p_name[1]; 309 groups2_name[n_groups2++] = (c + (c2 << 8)); 310 } 311 } 312 313 // skip over all the rules in this group 314 while(*p != RULE_GROUP_END) 315 { 316 p += (strlen(p) + 1); 317 } 318 p++; 319 } 320 321} // end of InitGroups 322 323 324int HashDictionary(const char *string) 325//==================================== 326/* Generate a hash code from the specified string 327 This is used to access the dictionary_2 word-lookup dictionary 328*/ 329{ 330 int c; 331 int chars=0; 332 int hash=0; 333 334 while((c = (*string++ & 0xff)) != 0) 335 { 336 hash = hash * 8 + c; 337 hash = (hash & 0x3ff) ^ (hash >> 8); /* exclusive or */ 338 chars++; 339 } 340 341 return((hash+chars) & 0x3ff); // a 10 bit hash code 342} // end of HashDictionary 343 344 345 346//============================================================================================= 347// Translate between internal representation of phonemes and a mnemonic form for display 348// 349//============================================================================================= 350 351 352 353char *EncodePhonemes(char *p, char *outptr, unsigned char *bad_phoneme) 354/*********************************************************************/ 355/* Translate a phoneme string from ascii mnemonics to internal phoneme numbers, 356 from 'p' up to next blank . 357 Returns advanced 'p' 358 outptr contains encoded phonemes, unrecognised phonemes are encoded as 255 359 bad_phoneme must point to char array of length 2 of more 360*/ 361{ 362 int ix; 363 unsigned char c; 364 int count; /* num. of matching characters */ 365 int max; /* highest num. of matching found so far */ 366 int max_ph; /* corresponding phoneme with highest matching */ 367 int consumed; 368 unsigned int mnemonic_word; 369 370 bad_phoneme[0] = 0; 371 372 // skip initial blanks 373 while(isspace(*p)) 374 { 375 p++; 376 } 377 378 while(((c = *p) != 0) && !isspace(c)) 379 { 380 consumed = 0; 381 382 switch(c) 383 { 384 case '|': 385 // used to separate phoneme mnemonics if needed, to prevent characters being treated 386 // as a multi-letter mnemonic 387 388 if((c = p[1]) == '|') 389 { 390 // treat double || as a word-break symbol, drop through 391 // to the default case with c = '|' 392 } 393 else 394 { 395 p++; 396 break; 397 } 398 399 default: 400 // lookup the phoneme mnemonic, find the phoneme with the highest number of 401 // matching characters 402 max= -1; 403 max_ph= 0; 404 405 for(ix=1; ix<n_phoneme_tab; ix++) 406 { 407 if(phoneme_tab[ix] == NULL) 408 continue; 409 if(phoneme_tab[ix]->type == phINVALID) 410 continue; // this phoneme is not defined for this language 411 412 count = 0; 413 mnemonic_word = phoneme_tab[ix]->mnemonic; 414 415 while(((c = p[count]) > ' ') && (count < 4) && 416 (c == ((mnemonic_word >> (count*8)) & 0xff))) 417 count++; 418 419 if((count > max) && 420 ((count == 4) || (((mnemonic_word >> (count*8)) & 0xff)==0))) 421 { 422 max = count; 423 max_ph = phoneme_tab[ix]->code; 424 } 425 } 426 427 if(max_ph == 0) 428 { 429 max_ph = 255; /* not recognised */ 430 bad_phoneme[0] = *p; 431 bad_phoneme[1] = 0; 432 } 433 434 if(max <= 0) 435 max = 1; 436 p += (consumed + max); 437 *outptr++ = (char)(max_ph); 438 439 if(max_ph == phonSWITCH) 440 { 441 // Switch Language: this phoneme is followed by a text string 442 char *p_lang = outptr; 443 while(!isspace(c = *p) && (c != 0)) 444 { 445 p++; 446 *outptr++ = tolower(c); 447 } 448 *outptr = 0; 449 if(c == 0) 450 { 451 if(strcmp(p_lang,"en")==0) 452 { 453 *p_lang = 0; // don't need "en", it's assumed by default 454 return(p); 455 } 456 } 457 else 458 { 459 *outptr++ = '|'; // more phonemes follow, terminate language string with separator 460 } 461 } 462 break; 463 } 464 } 465 /* terminate the encoded string */ 466 *outptr = 0; 467 return(p); 468} // end of EncodePhonemes 469 470 471 472void DecodePhonemes(const char *inptr, char *outptr) 473//================================================== 474// Translate from internal phoneme codes into phoneme mnemonics 475{ 476 unsigned char phcode; 477 unsigned char c; 478 unsigned int mnem; 479 PHONEME_TAB *ph; 480 static const char *stress_chars = "==,,'* "; 481 482 while((phcode = *inptr++) > 0) 483 { 484 if(phcode == 255) 485 continue; /* indicates unrecognised phoneme */ 486 if((ph = phoneme_tab[phcode]) == NULL) 487 continue; 488 489 if((ph->type == phSTRESS) && (ph->std_length <= 4) && (ph->spect == 0)) 490 { 491 if(ph->std_length > 1) 492 *outptr++ = stress_chars[ph->std_length]; 493 } 494 else 495 { 496 mnem = ph->mnemonic; 497 498 while((c = (mnem & 0xff)) != 0) 499 { 500 *outptr++ = c; 501 mnem = mnem >> 8; 502 } 503 if(phcode == phonSWITCH) 504 { 505 while(isalpha(*inptr)) 506 { 507 *outptr++ = *inptr++; 508 } 509 } 510 } 511 } 512 *outptr = 0; /* string terminator */ 513} // end of DecodePhonemes 514 515 516 517void Translator::WriteMnemonic(int *ix, int mnem) 518{//============================================== 519 unsigned char c; 520 521 while((c = mnem & 0xff) != 0) 522 { 523 if((c == '/') && (option_phoneme_variants==0)) 524 break; // discard phoneme variant indicator 525 phon_out[(*ix)++]= c; 526 // phon_out[phon_out_ix++]= ipa1[c]; 527 mnem = mnem >> 8; 528 } 529} 530 531 532void Translator::GetTranslatedPhonemeString(char *phon_out, int n_phon_out) 533{//======================================================================== 534/* Can be called after a clause has been translated into phonemes, in order 535 to display the clause in phoneme mnemonic form. 536*/ 537 538 int ix; 539 int phon_out_ix=0; 540 int stress; 541 char *p; 542 PHONEME_LIST *plist; 543 544 static const char *stress_chars = "==,,''"; 545 546 if(phon_out != NULL) 547 { 548 for(ix=1; ix<(n_phoneme_list-2) && (phon_out_ix < (n_phon_out - 6)); ix++) 549 { 550 plist = &phoneme_list[ix]; 551 if(plist->newword) 552 phon_out[phon_out_ix++] = ' '; 553 554 if(plist->synthflags & SFLAG_SYLLABLE) 555 { 556 if((stress = plist->tone) > 1) 557 { 558 if(stress > 5) stress = 5; 559 phon_out[phon_out_ix++] = stress_chars[stress]; 560 } 561 } 562 WriteMnemonic(&phon_out_ix,plist->ph->mnemonic); 563 564 if(plist->synthflags & SFLAG_LENGTHEN) 565 { 566 WriteMnemonic(&phon_out_ix,phoneme_tab[phonLENGTHEN]->mnemonic); 567 } 568 if((plist->synthflags & SFLAG_SYLLABLE) && (plist->type != phVOWEL)) 569 { 570 // syllablic consonant 571 WriteMnemonic(&phon_out_ix,phoneme_tab[phonSYLLABIC]->mnemonic); 572 } 573 if(plist->ph->code == phonSWITCH) 574 { 575 // the tone_ph field contains a phoneme table number 576 p = phoneme_tab_list[plist->tone_ph].name; 577 while(*p != 0) 578 { 579 phon_out[phon_out_ix++] = *p++; 580 } 581 phon_out[phon_out_ix++] = ' '; 582 } 583 else 584 if(plist->tone_ph > 0) 585 { 586 WriteMnemonic(&phon_out_ix,phoneme_tab[plist->tone_ph]->mnemonic); 587 } 588 } 589 590 if(phon_out_ix >= n_phon_out) 591 phon_out_ix = n_phon_out - 1; 592 phon_out[phon_out_ix] = 0; 593 } 594} // end of Translator::GetTranslatedPhonemeString 595 596 597 598//============================================================================================= 599// Is a word Unpronouncable - and so should be spoken as individual letters 600// 601//============================================================================================= 602 603 604#ifdef deleted 605// this is the initials_bitmap for english 606static unsigned char initials_bitmap[86] = { 607 0x00, 0x00, 0x00, 0x00, 0x22, 0x08, 0x00, 0x88, // 0 608 0x20, 0x24, 0x20, 0x80, 0x10, 0x00, 0x00, 0x00, 609 0x00, 0x28, 0x08, 0x00, 0x88, 0x22, 0x04, 0x00, // 16 610 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 611 0x00, 0x88, 0x22, 0x04, 0x00, 0x02, 0x00, 0x00, // 32 612 0x20, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 613 0x00, 0x28, 0x8a, 0x03, 0x00, 0x00, 0x40, 0x00, // 48 614 0x02, 0x00, 0x41, 0xca, 0x9b, 0x06, 0x20, 0x80, 615 0x91, 0x00, 0x00, 0x00, 0x00, 0x20, 0x08, 0x00, // 64 616 0x08, 0x20, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 617 0x00, 0x00, 0x22, 0x00, 0x01, 0x00, }; 618#endif 619 620 621int Translator::Unpronouncable(char *word) 622{//======================================= 623/* Determines whether a word in 'unpronouncable', i.e. whether it should 624 be spoken as individual letters. 625 626 This function may be language specific. This is a generic version. 627*/ 628 629 int c; 630 int c1=0; 631 int vowel_posn=9; 632 int index; 633 int count; 634 int apostrophe=0; 635 636 if(langopts.param[LOPT_UNPRONOUNCABLE] == 1) 637 return(0); 638 639 if((*word == ' ') || (*word == 0)) 640 return(0); 641 642 index = 0; 643 count = 0; 644 for(;;) 645 { 646 index += utf8_in(&c,&word[index],0); 647 if((c==0) || (c==' ')) 648 break; 649 650 if(count==0) 651 c1 = c; 652 count++; 653 654 if(IsVowel(c)) 655 { 656 vowel_posn = count; // position of the first vowel 657 break; 658 } 659 660 if(c == '\'') 661 apostrophe = 1; 662 else 663 if(!iswalpha(c)) 664 return(0); // letter (not vowel) outside a-z range or apostrophe, abort test 665 } 666 667 if((vowel_posn < 9) && (langopts.param[LOPT_UNPRONOUNCABLE] == 2)) 668 return(0); // option means allow any word with a vowel 669 670 if(c1 == langopts.param[LOPT_UNPRONOUNCABLE]) 671 vowel_posn--; // disregard this as the initial letter when counting 672 673 if(vowel_posn > (langopts.max_initial_consonants+1)) 674 return(1); // no vowel, or no vowel in first four letters 675 676return(0); 677 678} /* end of Unpronounceable */ 679 680 681 682int Translator::IsLetterGroup(char *word, int group, int pre) 683{//========================================================== 684 // match the word against a list of utf-8 strings 685 char *p; 686 char *w; 687 688 p = letterGroups[group]; 689 690 while(*p != RULE_GROUP_END) 691 { 692 w = word; 693 while(*p == *w) 694 { 695 w++; 696 p++; 697 } 698 if(*p == 0) 699 return(w-word); // matched a complete string 700 701 while(*p++ != 0); // skip to end of string 702 } 703 return(0); 704} 705 706 707int Translator::IsLetter(int letter, int group) 708{//============================================ 709 int letter2; 710 711 if(letter_groups[group] != NULL) 712 { 713 if(wcschr(letter_groups[group],letter)) 714 return(1); 715 return(0); 716 } 717 718 if(group > 7) 719 return(0); 720 721 if(letter_bits_offset > 0) 722 { 723 if(((letter2 = (letter - letter_bits_offset)) > 0) && (letter2 < 0x80)) 724 letter = letter2; 725 else 726 return(0); 727 } 728 else 729 { 730 if((letter >= 0xc0) && (letter <= 0x241)) 731 return(letter_bits[remove_accent[letter-0xc0]] & (1L << group)); 732 } 733 734 if((letter >= 0) && (letter < 0x80)) 735 return(letter_bits[letter] & (1L << group)); 736 737 return(0); 738} 739 740 741int Translator::IsVowel(int letter) 742{//================================ 743 return(IsLetter(letter,0)); 744} 745 746void SetLetterVowel(Translator *tr, int c) 747{//======================================= 748 tr->letter_bits[c] = (tr->letter_bits[c] & 0x40) | 0x81; // keep value for group 6 (front vowels e,i,y) 749} 750 751void ResetLetterBits(Translator *tr, int groups) 752{//============================================= 753// Clear all the specified groups 754 unsigned int ix; 755 unsigned int mask; 756 757 mask = ~groups; 758 759 for(ix=0; ix<sizeof(tr->letter_bits); ix++) 760 { 761 tr->letter_bits[ix] &= mask; 762 } 763} 764 765 766void SetLetterBits(Translator *tr, int group, const char *string) 767{//============================================================== 768 int bits; 769 unsigned char c; 770 771 bits = (1L << group); 772 while((c = *string++) != 0) 773 tr->letter_bits[c] |= bits; 774} 775 776void SetLetterBitsRange(Translator *tr, int group, int first, int last) 777{//==================================================================== 778 int bits; 779 int ix; 780 781 bits = (1L << group); 782 for(ix=first; ix<=last; ix++) 783 { 784 tr->letter_bits[ix] |= bits; 785 } 786} 787 788 789 790//============================================================================================= 791// Determine the stress pattern of a word 792// 793//============================================================================================= 794 795 796 797static int GetVowelStress(Translator *tr, unsigned char *phonemes, unsigned char *vowel_stress, int &vowel_count, int &stressed_syllable, int control) 798{//==================================================================================================================================================== 799// control = 1, set stress to 1 for forced unstressed vowels 800 unsigned char phcode; 801 PHONEME_TAB *ph; 802 unsigned char *ph_out = phonemes; 803 int count = 1; 804 int max_stress = 0; 805 int ix; 806 int j; 807 int stress = 0; 808 int primary_posn = 0; 809 810 vowel_stress[0] = 0; 811 while(((phcode = *phonemes++) != 0) && (count < (N_WORD_PHONEMES/2)-1)) 812 { 813 if((ph = phoneme_tab[phcode]) == NULL) 814 continue; 815 816 if((ph->type == phSTRESS) && (ph->spect == 0)) 817 { 818 /* stress marker, use this for the following vowel */ 819 820 if(phcode == phonSTRESS_PREV) 821 { 822 /* primary stress on preceeding vowel */ 823 j = count - 1; 824 while((j > 0) && (stressed_syllable == 0) && (vowel_stress[j] < 4)) 825 { 826 if(vowel_stress[j] != 1) 827 { 828 // don't promote a phoneme which must be unstressed 829 vowel_stress[j] = 4; 830 831 if(max_stress < 4) 832 { 833 max_stress = 4; 834 primary_posn = j; 835 } 836 837 /* reduce any preceding primary stress markers */ 838 for(ix=1; ix<j; ix++) 839 { 840 if(vowel_stress[ix] == 4) 841 vowel_stress[ix] = 3; 842 } 843 break; 844 } 845 j--; 846 } 847 } 848 else 849 { 850 if((ph->std_length < 4) || (stressed_syllable == 0)) 851 { 852 stress = ph->std_length; 853 854 if(stress > max_stress) 855 max_stress = stress; 856 } 857 } 858 continue; 859 } 860 861 if((ph->type == phVOWEL) && !(ph->phflags & phNONSYLLABIC)) 862 { 863 vowel_stress[count] = (char)stress; 864 if((stress >= 4) && (stress >= max_stress)) 865 { 866 primary_posn = count; 867 max_stress = stress; 868 } 869 870 if((stress == 0) && (control & 1) && (ph->phflags & phUNSTRESSED)) 871 vowel_stress[count] = 1; /* weak vowel, must be unstressed */ 872 873 count++; 874 stress = 0; 875 } 876 else 877 if(phcode == phonSYLLABIC) 878 { 879 // previous consonant phoneme is syllablic 880 vowel_stress[count] = (char)stress; 881 if((stress == 0) && (control & 1)) 882 vowel_stress[count++] = 1; // syllabic consonant, usually unstressed 883 } 884 885 *ph_out++ = phcode; 886 } 887 vowel_stress[count] = 0; 888 *ph_out = 0; 889 890 /* has the position of the primary stress been specified by $1, $2, etc? */ 891 if(stressed_syllable > 0) 892 { 893 if(stressed_syllable >= count) 894 stressed_syllable = count-1; // the final syllable 895 896 vowel_stress[stressed_syllable] = 4; 897 max_stress = 4; 898 primary_posn = stressed_syllable; 899 } 900 901 if(max_stress == 5) 902 { 903 // priority stress, replaces any other primary stress marker 904 for(ix=1; ix<count; ix++) 905 { 906 if(vowel_stress[ix] == 4) 907 { 908 if(tr->langopts.stress_flags & 0x20000) 909 vowel_stress[ix] = 0; 910 else 911 vowel_stress[ix] = 3; 912 } 913 914 if(vowel_stress[ix] == 5) 915 { 916 vowel_stress[ix] = 4; 917 primary_posn = ix; 918 } 919 } 920 max_stress = 4; 921 } 922 923 stressed_syllable = primary_posn; 924 vowel_count = count; 925 return(max_stress); 926} // end of GetVowelStress 927 928 929 930static char stress_phonemes[] = {phonSTRESS_U, phonSTRESS_D, phonSTRESS_2, phonSTRESS_3, 931 phonSTRESS_P, phonSTRESS_P2, phonSTRESS_TONIC}; 932 933 934void ChangeWordStress(Translator *tr, char *word, int new_stress) 935{//============================================================== 936 int ix; 937 unsigned char *p; 938 int max_stress; 939 int vowel_count; // num of vowels + 1 940 int stressed_syllable=0; // position of stressed syllable 941 unsigned char phonetic[N_WORD_PHONEMES]; 942 unsigned char vowel_stress[N_WORD_PHONEMES/2]; 943 944 strcpy((char *)phonetic,word); 945 max_stress = GetVowelStress(tr, phonetic, vowel_stress, vowel_count, stressed_syllable, 0); 946 947 if(new_stress >= 4) 948 { 949 // promote to primary stress 950 for(ix=1; ix<vowel_count; ix++) 951 { 952 if(vowel_stress[ix] >= max_stress) 953 { 954 vowel_stress[ix] = new_stress; 955 break; 956 } 957 } 958 } 959 else 960 { 961 // remove primary stress 962 for(ix=1; ix<vowel_count; ix++) 963 { 964 if(vowel_stress[ix] > new_stress) // >= allows for diminished stress (=1) 965 vowel_stress[ix] = new_stress; 966 } 967 } 968 969 // write out phonemes 970 ix = 1; 971 p = phonetic; 972 while(*p != 0) 973 { 974 if((phoneme_tab[*p]->type == phVOWEL) && !(phoneme_tab[*p]->phflags & phNONSYLLABIC)) 975 { 976 if(vowel_stress[ix] != 0) 977 *word++ = stress_phonemes[vowel_stress[ix]]; 978 979 ix++; 980 } 981 *word++ = *p++; 982 } 983 *word = 0; 984} // end of ChangeWordStress 985 986 987 988void Translator::SetWordStress(char *output, unsigned int dictionary_flags, int tonic, int prev_stress) 989{//=================================================================================================== 990/* Guess stress pattern of word. This is language specific 991 992 'dictionary_flags' has bits 0-3 position of stressed vowel (if > 0) 993 or unstressed (if == 7) or syllables 1 and 2 (if == 6) 994 bits 8... dictionary flags 995 996 If 'tonic' is set (>= 0), replace highest stress by this value. 997 998 Parameter used for input and output 999*/ 1000 1001 unsigned char phcode; 1002 unsigned char *p; 1003 PHONEME_TAB *ph; 1004 int stress; 1005 int max_stress; 1006 int vowel_count; // num of vowels + 1 1007 int ix; 1008 int v; 1009 int v_stress; 1010 int stressed_syllable; // position of stressed syllable 1011 int max_stress_posn; 1012 int unstressed_word = 0; 1013 char *max_output; 1014 int final_ph; 1015 int mnem; 1016 int post_tonic; 1017 int opt_length; 1018 int done; 1019 1020 unsigned char vowel_stress[N_WORD_PHONEMES/2]; 1021 char syllable_weight[N_WORD_PHONEMES/2]; 1022 unsigned char phonetic[N_WORD_PHONEMES]; 1023 1024 static char consonant_types[16] = {0,0,0,1,1,1,1,1,1,1,0,0,0,0,0,0}; 1025 1026 1027 /* stress numbers STRESS_BASE + 1028 0 diminished, unstressed within a word 1029 1 unstressed, weak 1030 2 1031 3 secondary stress 1032 4 main stress */ 1033 1034 /* copy input string into internal buffer */ 1035 for(ix=0; ix<N_WORD_PHONEMES; ix++) 1036 { 1037 phonetic[ix] = output[ix]; 1038 // check for unknown phoneme codes 1039 if(phonetic[ix] >= n_phoneme_tab) 1040 phonetic[ix] = phonSCHWA; 1041 if(phonetic[ix] == 0) 1042 break; 1043 } 1044 if(ix == 0) return; 1045 final_ph = phonetic[ix-1]; 1046 1047 max_output = output + (N_WORD_PHONEMES-3); /* check for overrun */ 1048 1049 // any stress position marked in the xx_list dictionary ? 1050 stressed_syllable = dictionary_flags & 0x7; 1051 if(dictionary_flags & 0x8) 1052 { 1053 // this indicates a word without a primary stress 1054 stressed_syllable = dictionary_flags & 0x3; 1055 unstressed_word = 1; 1056 } 1057 1058 max_stress = GetVowelStress(this, phonetic, vowel_stress, vowel_count, stressed_syllable, 1); 1059 1060 // heavy or light syllables 1061 ix = 1; 1062 for(p = phonetic; *p != 0; p++) 1063 { 1064 if((phoneme_tab[p[0]]->type == phVOWEL) && !(phoneme_tab[p[0]]->phflags & phNONSYLLABIC)) 1065 { 1066 int weight = 0; 1067 int lengthened = 0; 1068 1069 if(phoneme_tab[p[1]]->code == phonLENGTHEN) 1070 lengthened = 1; 1071 1072 if(lengthened || (phoneme_tab[p[0]]->phflags & phLONG)) 1073 { 1074 // long vowel, increase syllable weight 1075 weight++; 1076 } 1077 1078 if(lengthened) p++; // advance over phonLENGTHEN 1079 1080 if(consonant_types[phoneme_tab[p[1]]->type] && ((phoneme_tab[p[2]]->type != phVOWEL) || (phoneme_tab[p[1]]->phflags & phLONG))) 1081 { 1082 // followed by two consonants, a long consonant, or consonant and end-of-word 1083 weight++; 1084 } 1085 syllable_weight[ix] = weight; 1086 ix++; 1087 } 1088 } 1089 1090 switch(langopts.stress_rule) 1091 { 1092 case 8: 1093 // stress on first syllable, unless it is a light syllable 1094 if(syllable_weight[1] > 0) 1095 break; 1096 // else drop through to case 1 1097 case 1: 1098 // stress on second syllable 1099 if((stressed_syllable == 0) && (vowel_count > 2)) 1100 { 1101 stressed_syllable = 2; 1102 if(max_stress == 0) 1103 { 1104 vowel_stress[stressed_syllable] = 4; 1105 } 1106 max_stress = 4; 1107 } 1108 break; 1109 1110 case 2: 1111 // a language with stress on penultimate vowel 1112 1113 if(stressed_syllable == 0) 1114 { 1115 /* no explicit stress - stress the penultimate vowel */ 1116 max_stress = 4; 1117 1118 if(vowel_count > 2) 1119 { 1120 stressed_syllable = vowel_count - 2; 1121 1122 if(langopts.stress_flags & 0x300) 1123 { 1124 // LANG=Spanish, stress on last vowel if the word ends in a consonant other than 'n' or 's' 1125 if(phoneme_tab[final_ph]->type != phVOWEL) 1126 { 1127 if(langopts.stress_flags & 0x100) 1128 { 1129 stressed_syllable = vowel_count - 1; 1130 } 1131 else 1132 { 1133 mnem = phoneme_tab[final_ph]->mnemonic; 1134 if((mnem != 'n') && (mnem != 's')) 1135 { 1136 stressed_syllable = vowel_count - 1; 1137 } 1138 } 1139 } 1140 } 1141 1142 if(vowel_stress[stressed_syllable] == 1) 1143 { 1144 // but this vowel is explicitly marked as unstressed 1145 if(stressed_syllable > 1) 1146 stressed_syllable--; 1147 else 1148 stressed_syllable++; 1149 } 1150 } 1151 else 1152 { 1153 stressed_syllable = 1; 1154 if(langopts.stress_flags & 0x1) 1155 max_stress = 3; // don't give full stress to monosyllables 1156 } 1157 1158 // only set the stress if it's not already marked explicitly 1159 if(vowel_stress[stressed_syllable] == 0) 1160 { 1161 // don't stress if next and prev syllables are stressed 1162 if((vowel_stress[stressed_syllable-1] < 4) || (vowel_stress[stressed_syllable+1] < 4)) 1163 vowel_stress[stressed_syllable] = max_stress; 1164 } 1165 } 1166 break; 1167 1168 case 3: 1169 // stress on last vowel 1170 if(stressed_syllable == 0) 1171 { 1172 /* no explicit stress - stress the final vowel */ 1173 stressed_syllable = vowel_count - 1; 1174 if(max_stress == 0) 1175 { 1176 while(stressed_syllable > 0) 1177 { 1178 if(vowel_stress[stressed_syllable] == 0) 1179 { 1180 vowel_stress[stressed_syllable] = 4; 1181 break; 1182 } 1183 else 1184 stressed_syllable--; 1185 } 1186 } 1187 max_stress = 4; 1188 } 1189 break; 1190 1191 case 4: // stress on antipenultimate vowel 1192 if(stressed_syllable == 0) 1193 { 1194 stressed_syllable = vowel_count - 3; 1195 if(stressed_syllable < 1) 1196 stressed_syllable = 1; 1197 1198 if(max_stress == 0) 1199 { 1200 vowel_stress[stressed_syllable] = 4; 1201 } 1202 max_stress = 4; 1203 } 1204 break; 1205 1206 case 5: 1207 // LANG=Russian 1208 if(stressed_syllable == 0) 1209 { 1210 /* no explicit stress - guess the stress from the number of syllables */ 1211 static char guess_ru[16] = {0,0,1,1,2,3,3,4,5,6,7,7,8,9,10,11}; 1212 static char guess_ru_v[16] = {0,0,1,1,2,2,3,3,4,5,6,7,7,8,9,10}; // for final phoneme is a vowel 1213 static char guess_ru_t[16] = {0,0,1,2,3,3,3,4,5,6,7,7,7,8,9,10}; // for final phoneme is an unvoiced stop 1214 1215 stressed_syllable = vowel_count - 3; 1216 if(vowel_count < 16) 1217 { 1218 if(phoneme_tab[final_ph]->type == phVOWEL) 1219 stressed_syllable = guess_ru_v[vowel_count]; 1220 else 1221 if(phoneme_tab[final_ph]->type == phSTOP) 1222 stressed_syllable = guess_ru_t[vowel_count]; 1223 else 1224 stressed_syllable = guess_ru[vowel_count]; 1225 } 1226 vowel_stress[stressed_syllable] = 4; 1227 max_stress = 4; 1228 } 1229 break; 1230 1231 case 6: // LANG=hi stress on the last heaviest syllable 1232 if(stressed_syllable == 0) 1233 { 1234 int wt; 1235 int max_weight = -1; 1236 int prev_stressed; 1237 1238 // find the heaviest syllable, excluding the final syllable 1239 for(ix = 1; ix < (vowel_count-1); ix++) 1240 { 1241 if(vowel_stress[ix] == 0) 1242 { 1243 if((wt = syllable_weight[ix]) >= max_weight) 1244 { 1245 max_weight = wt; 1246 prev_stressed = stressed_syllable; 1247 stressed_syllable = ix; 1248 } 1249 } 1250 } 1251 1252 if((syllable_weight[vowel_count-1] == 2) && (max_weight< 2)) 1253 { 1254 // the only double=heavy syllable is the final syllable, so stress this 1255 stressed_syllable = vowel_count-1; 1256 } 1257 else 1258 if(max_weight <= 0) 1259 { 1260 // all syllables, exclusing the last, are light. Stress the first syllable 1261 stressed_syllable = 1; 1262 } 1263 1264 vowel_stress[stressed_syllable] = 4; 1265 max_stress = 4; 1266 } 1267 break; 1268 1269 case 7: // LANG=tr, the last syllable for any vowel markes explicitly as unstressed 1270 if(stressed_syllable == 0) 1271 { 1272 stressed_syllable = vowel_count - 1; 1273 for(ix=1; ix < vowel_count; ix++) 1274 { 1275 if(vowel_stress[ix] == 1) 1276 { 1277 stressed_syllable = ix-1; 1278 break; 1279 } 1280 } 1281 vowel_stress[stressed_syllable] = 4; 1282 max_stress = 4; 1283 } 1284 break; 1285 1286 case 9: // mark all as stressed 1287 for(ix=1; ix<vowel_count; ix++) 1288 { 1289 if(vowel_stress[ix] == 0) 1290 vowel_stress[ix] = 4; 1291 } 1292 break; 1293 } 1294 1295 /* now guess the complete stress pattern */ 1296 if(max_stress < 4) 1297 stress = 4; /* no primary stress marked, use for 1st syllable */ 1298 else 1299 stress = 3; 1300 1301 1302 if((langopts.stress_flags & 0x1000) && (vowel_count == 2)) 1303 { 1304 // Two syllable word, if one syllable has primary stress, then give the other secondary stress 1305 if(vowel_stress[1] == 4) 1306 vowel_stress[2] = 3; 1307 if(vowel_stress[2] == 4) 1308 vowel_stress[1] = 3; 1309 } 1310#if deleted 1311 if((langopts.stress_flags & 0x2000) && (vowel_stress[1] == 0)) 1312 { 1313 // If there is only one syllable before the primary stress, give it a secondary stress 1314 if((vowel_count > 2) && (vowel_stress[2] >= 4)) 1315 { 1316 vowel_stress[1] = 3; 1317 } 1318 } 1319#endif 1320 1321 done = 0; 1322 for(v=1; v<vowel_count; v++) 1323 { 1324 if(vowel_stress[v] == 0) 1325 { 1326 if((langopts.stress_flags & 0x10) && (stress < 4) && (v == vowel_count-1)) 1327 { 1328 // flag: don't give secondary stress to final vowel 1329 } 1330 else 1331 if((langopts.stress_flags & 0x8000) && (done == 0)) 1332 { 1333 vowel_stress[v] = (char)stress; 1334 done =1; 1335 stress = 3; /* use secondary stress for remaining syllables */ 1336 } 1337 else 1338 if((vowel_stress[v-1] <= 1) && (vowel_stress[v+1] <= 1)) 1339 { 1340 /* trochaic: give stress to vowel surrounded by unstressed vowels */ 1341 1342 if((stress == 3) && (langopts.stress_flags & 0x20)) 1343 continue; // don't use secondary stress 1344 1345 if((v > 1) && (langopts.stress_flags & 0x40) && (syllable_weight[v]==0) && (syllable_weight[v+1]>0)) 1346 { 1347 // don't put secondary stress on a light syllable which is followed by a heavy syllable 1348 continue; 1349 } 1350 1351// should start with secondary stress on the first syllable, or should it count back from 1352// the primary stress and put secondary stress on alternate syllables? 1353 vowel_stress[v] = (char)stress; 1354 done =1; 1355 stress = 3; /* use secondary stress for remaining syllables */ 1356 } 1357 } 1358 } 1359 1360 if((unstressed_word) && (tonic < 0)) 1361 { 1362 if(vowel_count <= 2) 1363 tonic = langopts.unstressed_wd1; /* monosyllable - unstressed */ 1364 else 1365 tonic = langopts.unstressed_wd2; /* more than one syllable, used secondary stress as the main stress */ 1366 } 1367 1368 max_stress = 0; 1369 max_stress_posn = 0; 1370 for(v=1; v<vowel_count; v++) 1371 { 1372 if(vowel_stress[v] >= max_stress) 1373 { 1374 max_stress = vowel_stress[v]; 1375 max_stress_posn = v; 1376 } 1377 } 1378 1379 if(tonic >= 0) 1380 { 1381 /* find position of highest stress, and replace it by 'tonic' */ 1382 1383 /* don't disturb an explicitly set stress by 'unstress-at-end' flag */ 1384 if((tonic > max_stress) || (max_stress <= 4)) 1385 vowel_stress[max_stress_posn] = (char)tonic; 1386 max_stress = tonic; 1387 } 1388 1389 1390 /* produce output phoneme string */ 1391 p = phonetic; 1392 v = 1; 1393 1394 if((ph = phoneme_tab[*p]) != NULL) 1395 { 1396 1397 if(ph->type == phSTRESS) 1398 ph = phoneme_tab[p[1]]; 1399 1400#ifdef deleted 1401 int gap = langopts.word_gap & 0x700; 1402 if((gap) && (vowel_stress[1] >= 4) && (prev_stress >= 4)) 1403 { 1404 /* two primary stresses together, insert a short pause */ 1405 *output++ = pause_phonemes[gap >> 8]; 1406 } 1407 else 1408#endif 1409 if((langopts.vowel_pause & 0x30) && (ph->type == phVOWEL)) 1410 { 1411 // word starts with a vowel 1412 1413 if((langopts.vowel_pause & 0x20) && (vowel_stress[1] >= 4)) 1414 { 1415 *output++ = phonPAUSE_NOLINK; // not to be replaced by link 1416 } 1417 else 1418 { 1419 *output++ = phonPAUSE_VSHORT; // break, but no pause 1420 } 1421 } 1422 } 1423 1424 p = phonetic; 1425 post_tonic = 0; 1426 while(((phcode = *p++) != 0) && (output < max_output)) 1427 { 1428 if((ph = phoneme_tab[phcode]) == NULL) 1429 continue; 1430 1431// if(ph->type == phSTRESS) 1432// continue; 1433 1434 if(ph->type == phPAUSE) 1435 { 1436 prev_last_stress = 0; 1437 } 1438 else 1439 if(((ph->type == phVOWEL) && !(ph->phflags & phNONSYLLABIC)) || (*p == phonSYLLABIC)) 1440 { 1441 // a vowel, or a consonant followed by a syllabic consonant marker 1442 1443 v_stress = vowel_stress[v]; 1444 prev_last_stress = v_stress; 1445 1446 if(vowel_stress[v-1] >= max_stress) 1447 post_tonic = 1; 1448 1449 if(v_stress <= 1) 1450 { 1451 if((v > 1) && (max_stress >= 4) && (langopts.stress_flags & 4) && (v == (vowel_count-1))) 1452 { 1453 // option: mark unstressed final syllable as diminished 1454 v_stress = 1; 1455 } 1456 else 1457 if((langopts.stress_flags & 2) || (v == 1) || (v == (vowel_count-1))) 1458 { 1459 // first or last syllable, or option 'don't set diminished stress' 1460 v_stress = 0; 1461 } 1462 else 1463 if((v == (vowel_count-2)) && (vowel_stress[vowel_count-1] <= 1)) 1464 { 1465 // penultimate syllable, followed by an unstressed final syllable 1466 v_stress = 0; 1467 } 1468 else 1469 { 1470 // unstressed syllable within a word 1471 if((vowel_stress[v-1] != 1) || ((langopts.stress_flags & 0x10000) == 0)) 1472 { 1473 v_stress = 1; /* change from 0 (unstressed) to 1 (diminished stress) */ 1474 vowel_stress[v] = v_stress; 1475 } 1476 } 1477 } 1478 1479 if(v_stress > 0) 1480 *output++ = stress_phonemes[v_stress]; // mark stress of all vowels except 0 (unstressed) 1481 1482 1483 if(vowel_stress[v] > max_stress) 1484 { 1485 max_stress = vowel_stress[v]; 1486 } 1487 1488 if((*p == phonLENGTHEN) && ((opt_length = langopts.param[LOPT_IT_LENGTHEN]) != 0)) 1489 { 1490 // remove lengthen indicator from non-stressed syllables 1491 int shorten=0; 1492 1493 if(opt_length & 0x10) 1494 { 1495 // only allow lengthen indicator on the highest stress syllable in the word 1496 if(v != max_stress_posn) 1497 shorten = 1; 1498 } 1499 else 1500 if(v_stress < 4) 1501 { 1502 // only allow lengthen indicator if stress >= 4. 1503 shorten = 1; 1504 } 1505 1506 if(((opt_length & 0xf)==2) && (v != (vowel_count - 2))) 1507 shorten = 1; // LANG=Italian, remove lengthen indicator from non-penultimate syllables 1508 1509 if(shorten) 1510 p++; 1511 } 1512 1513 v++; 1514 } 1515 1516 if(phcode != 1) 1517 *output++ = phcode; 1518 } 1519 *output++ = 0; 1520 1521} /* end of SetWordStress */ 1522 1523 1524 1525 1526//============================================================================================= 1527// Look up a word in the pronunciation rules 1528// 1529//============================================================================================= 1530 1531 1532#ifdef LOG_TRANSLATE 1533char *Translator::DecodeRule(const char *group, char *rule) 1534{//================================================== 1535/* Convert compiled match template to ascii */ 1536 1537 unsigned char rb; 1538 unsigned char c; 1539 char *p; 1540 int ix; 1541 int match_type; 1542 int finished=0; 1543 int value; 1544 int linenum=0; 1545 int flags; 1546 int suffix_char; 1547 int condition_num=0; 1548 char buf[60]; 1549 char buf_pre[60]; 1550 char suffix[20]; 1551 static char output[60]; 1552 1553 static char symbols[] = {' ',' ',' ',' ',' ',' ',' ',' ',' ', 1554 '@','&','%','+','#','S','D','Z','A','L',' ',' ',' ',' ',' ','N','K','V',' ','T','X','?','W'}; 1555 1556 static char symbols_lg[] = {'A','B','C','H','F','G','Y'}; 1557 1558 match_type = 0; 1559 buf_pre[0] = 0; 1560 strcpy(buf,group); 1561 p = &buf[strlen(buf)]; 1562 while(!finished) 1563 { 1564 rb = *rule++; 1565 1566 if(rb <= RULE_LINENUM) 1567 { 1568 switch(rb) 1569 { 1570 case 0: 1571 case RULE_PHONEMES: 1572 finished=1; 1573 break; 1574 case RULE_PRE: 1575 match_type = RULE_PRE; 1576 *p = 0; 1577 p = buf_pre; 1578 break; 1579 case RULE_POST: 1580 match_type = RULE_POST; 1581 *p = 0; 1582 strcat(buf," ("); 1583 p = &buf[strlen(buf)]; 1584 break; 1585 case RULE_PH_COMMON: 1586 break; 1587 case RULE_CONDITION: 1588 /* conditional rule, next byte gives condition number */ 1589 condition_num = *rule++; 1590 break; 1591 case RULE_LINENUM: 1592 value = (rule[1] & 0xff) - 1; 1593 linenum = (rule[0] & 0xff) - 1 + (value * 255); 1594 rule+=2; 1595 break; 1596 } 1597 continue; 1598 } 1599 1600 if(rb == RULE_ENDING) 1601 { 1602 static const char *flag_chars = "ei vtfq t"; 1603 flags = ((rule[0] & 0x7f)<< 8) + (rule[1] & 0x7f); 1604 suffix_char = 'S'; 1605 if(flags & (SUFX_P >> 8)) 1606 suffix_char = 'P'; 1607 sprintf(suffix,"%c%d",suffix_char,rule[2] & 0x7f); 1608 rule += 3; 1609 for(ix=0;ix<9;ix++) 1610 { 1611 if(flags & 1) 1612 sprintf(&suffix[strlen(suffix)],"%c",flag_chars[ix]); 1613 flags = (flags >> 1); 1614 } 1615 strcpy(p,suffix); 1616 p += strlen(suffix); 1617 c = ' '; 1618 } 1619 else 1620 if(rb == RULE_LETTERGP) 1621 { 1622 c = symbols_lg[*rule++ - 'A']; 1623 } 1624 else 1625 if(rb == RULE_LETTERGP2) 1626 { 1627 value = *rule++ - 'A'; 1628 p[0] = 'L'; 1629 p[1] = (value / 10) + '0'; 1630 c = (value % 10) + '0'; 1631 1632 if(match_type == RULE_PRE) 1633 { 1634 p[0] = c; 1635 c = 'L'; 1636 } 1637 p+=2; 1638 } 1639 else 1640 if(rb <= RULE_LAST_RULE) 1641 c = symbols[rb]; 1642 else 1643 if(rb == RULE_SPACE) 1644 c = '_'; 1645 else 1646 c = rb; 1647 *p++ = c; 1648 } 1649 *p = 0; 1650 1651 p = output; 1652 if(linenum > 0) 1653 { 1654 sprintf(p,"%5d:\t",linenum); 1655 p += 7; 1656 } 1657 if(condition_num > 0) 1658 { 1659 sprintf(p,"?%d ",condition_num); 1660 p = &p[strlen(p)]; 1661 } 1662 if((ix = strlen(buf_pre)) > 0) 1663 { 1664 while(--ix >= 0) 1665 *p++ = buf_pre[ix]; 1666 *p++ = ')'; 1667 *p++ = ' '; 1668 } 1669 *p = 0; 1670 strcat(p,buf); 1671 ix = strlen(output); 1672 while(ix < 8) 1673 output[ix++]=' '; 1674 output[ix]=0; 1675 return(output); 1676} /* end of decode_match */ 1677#endif 1678 1679 1680 1681void Translator::AppendPhonemes(char *string, int size, const char *ph) 1682{//==================================================================== 1683/* Add new phoneme string "ph" to "string" 1684 Keeps count of the number of vowel phonemes in the word, and whether these 1685 can be stressed syllables. These values can be used in translation rules 1686*/ 1687 const char *p; 1688 unsigned char c; 1689 int unstress_mark; 1690 int length; 1691 1692 length = strlen(ph) + strlen(string); 1693 if(length >= size) 1694 { 1695 return; 1696 } 1697 1698 /* any stressable vowel ? */ 1699 unstress_mark = 0; 1700 p = ph; 1701 while((c = *p++) != 0) 1702 { 1703 if(c >= n_phoneme_tab) continue; 1704 1705 if(phoneme_tab[c]->type == phSTRESS) 1706 { 1707 if(phoneme_tab[c]->std_length < 4) 1708 unstress_mark = 1; 1709 } 1710 else 1711 { 1712 if(phoneme_tab[c]->type == phVOWEL) 1713 { 1714 if(((phoneme_tab[c]->phflags & phUNSTRESSED) == 0) && 1715 (unstress_mark == 0)) 1716 { 1717 word_stressed_count++; 1718 } 1719 unstress_mark = 0; 1720 word_vowel_count++; 1721 } 1722 } 1723 } 1724 1725 if(string != NULL) 1726 strcat(string,ph); 1727} /* end of AppendPhonemes */ 1728 1729 1730 1731void Translator::MatchRule(char *word[], const char *group, char *rule, MatchRecord *match_out, int word_flags, int dict_flags) 1732{//============================================================================================================================ 1733/* Checks a specified word against dictionary rules. 1734 Returns with phoneme code string, or NULL if no match found. 1735 1736 word (indirect) points to current character group within the input word 1737 This is advanced by this procedure as characters are consumed 1738 1739 group: the initial characters used to choose the rules group 1740 1741 rule: address of dictionary rule data for this character group 1742 1743 match_out: returns best points score 1744 1745 word_flags: indicates whether this is a retranslation after a suffix has been removed 1746*/ 1747 1748 unsigned char rb; // current instuction from rule 1749 unsigned char letter; // current letter from input word, single byte 1750 int letter_w; // current letter, wide character 1751 int letter_xbytes; // number of extra bytes of multibyte character (num bytes - 1) 1752 unsigned char last_letter; 1753 1754 char *pre_ptr; 1755 char *post_ptr; /* pointer to first character after group */ 1756 1757 char *rule_start; /* start of current match template */ 1758 char *p; 1759 1760 int match_type; /* left, right, or consume */ 1761 int failed; 1762 int consumed; /* number of letters consumed from input */ 1763 int count; /* count through rules in the group */ 1764 int syllable_count; 1765 int vowel; 1766 int letter_group; 1767 int distance_right; 1768 int distance_left; 1769 int lg_pts; 1770 int n_bytes; 1771 1772 MatchRecord match; 1773 static MatchRecord best; 1774 1775 int total_consumed; /* letters consumed for best match */ 1776 int group_length; 1777 1778 unsigned char condition_num; 1779 char *common_phonemes; /* common to a group of entries */ 1780 1781 1782 1783 if(rule == NULL) 1784 { 1785 match_out->points = 0; 1786 (*word)++; 1787 return; 1788 } 1789 1790 1791 total_consumed = 0; 1792 count = 0; 1793 common_phonemes = NULL; 1794 match_type = 0; 1795 1796 best.points = 0; 1797 best.phonemes = ""; 1798 best.end_type = 0; 1799 best.del_fwd = NULL; 1800 1801 group_length = strlen(group); 1802 1803 /* search through dictionary rules */ 1804 while(rule[0] != RULE_GROUP_END) 1805 { 1806 match_type=0; 1807 consumed = 0; 1808 letter = 0; 1809 distance_right= -6; /* used to reduce points for matches further away the current letter */ 1810 distance_left= -2; 1811 count++; 1812 1813 match.points = 1; 1814 match.end_type = 0; 1815 match.del_fwd = NULL; 1816 1817 pre_ptr = *word; 1818 post_ptr = *word + group_length; 1819 1820 /* work through next rule until end, or until no-match proved */ 1821 rule_start = rule; 1822 failed = 0; 1823 while(!failed) 1824 { 1825 rb = *rule++; 1826 1827 if(rb <= RULE_LINENUM) 1828 { 1829 switch(rb) 1830 { 1831 case 0: // no phoneme string for this rule, use previous common rule 1832 if(common_phonemes != NULL) 1833 { 1834 match.phonemes = common_phonemes; 1835 if(*match.phonemes == RULE_CONDITION) 1836 match.phonemes += 2; // skip over condition number 1837 while(((rb = *match.phonemes++) != 0) && (rb != RULE_PHONEMES)); 1838 } 1839 else 1840 { 1841 match.phonemes = ""; 1842 } 1843 rule--; // so we are still pointing at the 0 1844 failed=2; // matched OK 1845 break; 1846 case RULE_PRE: 1847 match_type = RULE_PRE; 1848 break; 1849 case RULE_POST: 1850 match_type = RULE_POST; 1851 break; 1852 case RULE_PHONEMES: 1853 match.phonemes = rule; 1854 failed=2; // matched OK 1855 break; 1856 case RULE_PH_COMMON: 1857 common_phonemes = rule; 1858 break; 1859 case RULE_CONDITION: 1860 /* conditional rule, next byte gives condition number */ 1861 condition_num = *rule++; 1862 1863 if(condition_num >= 32) 1864 { 1865 // allow the rule only if the condition number is NOT set 1866 if((dict_condition & (1L << (condition_num-32))) != 0) 1867 failed = 1; 1868 } 1869 else 1870 { 1871 // allow the rule only if the condition number is set 1872 if((dict_condition & (1L << condition_num)) == 0) 1873 failed = 1; 1874 } 1875 1876 if(!failed) 1877 match.points++; // add one point for a matched conditional rule 1878 break; 1879 case RULE_LINENUM: 1880 rule+=2; 1881 break; 1882 } 1883 continue; 1884 } 1885 1886 switch(match_type) 1887 { 1888 case 0: 1889 /* match and consume this letter */ 1890 last_letter = letter; 1891 letter = *post_ptr++; 1892 1893 if((letter == rb) || ((letter==(unsigned char)REPLACED_E) && (rb=='e'))) 1894 { 1895 match.points += 21; 1896 consumed++; 1897 } 1898 else 1899 failed = 1; 1900 break; 1901 1902 1903 case RULE_POST: 1904 /* continue moving fowards */ 1905 distance_right += 6; 1906 if(distance_right > 18) 1907 distance_right = 19; 1908 last_letter = letter; 1909 letter_xbytes = utf8_in(&letter_w,post_ptr,0)-1; 1910 letter = *post_ptr++; 1911 1912 switch(rb) 1913 { 1914 case RULE_LETTERGP: 1915 letter_group = *rule++ - 'A'; 1916 if(IsLetter(letter_w,letter_group)) 1917 { 1918 lg_pts = 20; 1919 if(letter_group==2) 1920 lg_pts = 19; // fewer points for C, general consonant 1921 match.points += (lg_pts-distance_right); 1922 post_ptr += letter_xbytes; 1923 } 1924 else 1925 failed = 1; 1926 break; 1927 1928 case RULE_LETTERGP2: // match against a list of utf-8 strings 1929 letter_group = *rule++ - 'A'; 1930 if((n_bytes = IsLetterGroup(post_ptr-1,letter_group,0)) >0) 1931 { 1932 match.points += (20-distance_right); 1933 post_ptr += (n_bytes-1); 1934 } 1935 else 1936 failed =1; 1937 break; 1938 1939 case RULE_NOTVOWEL: 1940 if(!IsLetter(letter_w,0)) 1941 { 1942 match.points += (20-distance_right); 1943 post_ptr += letter_xbytes; 1944 } 1945 else 1946 failed = 1; 1947 break; 1948 1949 case RULE_DIGIT: 1950 if(IsDigit(letter_w)) 1951 { 1952 match.points += (20-distance_right); 1953 post_ptr += letter_xbytes; 1954 } 1955 else 1956 if(langopts.tone_numbers) 1957 { 1958 // also match if there is no digit 1959 match.points += (20-distance_right); 1960 post_ptr--; 1961 } 1962 else 1963 failed = 1; 1964 break; 1965 1966 case RULE_NONALPHA: 1967 if(!iswalpha(letter_w)) 1968 { 1969 match.points += (21-distance_right); 1970 post_ptr += letter_xbytes; 1971 } 1972 else 1973 failed = 1; 1974 break; 1975 1976 case RULE_DOUBLE: 1977 if(letter == last_letter) 1978 match.points += (21-distance_right); 1979 else 1980 failed = 1; 1981 break; 1982 1983 case RULE_ALT1: 1984 if(dict_flags & FLAG_ALT_TRANS) 1985 match.points++; 1986 else 1987 failed = 1; 1988 break; 1989 1990 case '-': 1991 if((letter == '-') || ((letter == ' ') && (word_flags & FLAG_HYPHEN_AFTER))) 1992 { 1993 match.points += (22-distance_right); // one point more than match against space 1994 } 1995 else 1996 failed = 1; 1997 break; 1998 1999 case RULE_SYLLABLE: 2000 { 2001 /* more than specified number of vowel letters to the right */ 2002 char *p = post_ptr + letter_xbytes; 2003 2004 syllable_count = 1; 2005 while(*rule == RULE_SYLLABLE) 2006 { 2007 rule++; 2008 syllable_count+=1; /* number of syllables to match */ 2009 } 2010 vowel = 0; 2011 while(letter_w != RULE_SPACE) 2012 { 2013 if((vowel==0) && IsLetter(letter_w,LETTERGP_VOWEL2)) 2014 { 2015 // this is counting vowels which are separated by non-vowels 2016 syllable_count--; 2017 } 2018 vowel = IsLetter(letter_w,LETTERGP_VOWEL2); 2019 p += utf8_in(&letter_w,p,0); 2020 } 2021 if(syllable_count <= 0) 2022 match.points+= (19-distance_right); 2023 else 2024 failed = 1; 2025 } 2026 break; 2027 2028 case RULE_NOVOWELS: 2029 { 2030 char *p = post_ptr + letter_xbytes; 2031 while(letter_w != RULE_SPACE) 2032 { 2033 if(IsLetter(letter_w,LETTERGP_VOWEL2)) 2034 { 2035 failed = 1; 2036 break; 2037 } 2038 p += utf8_in(&letter_w,p,0); 2039 } 2040 if(!failed) 2041 match.points += (19-distance_right); 2042 } 2043 break; 2044 2045 case RULE_INC_SCORE: 2046 match.points += 20; // force an increase in points 2047 break; 2048 2049 case RULE_DEL_FWD: 2050 // find the next 'e' in the word and replace by '' 2051 for(p = *word + group_length; *p != ' '; p++) 2052 { 2053 if(*p == 'e') 2054 { 2055 match.del_fwd = p; 2056 break; 2057 } 2058 } 2059 break; 2060 2061 case RULE_ENDING: 2062 // next 3 bytes are a (non-zero) ending type. 2 bytes of flags + suffix length 2063 match.end_type = (rule[0] << 16) + ((rule[1] & 0x7f) << 8) + (rule[2] & 0x7f); 2064 rule += 3; 2065 break; 2066 2067 case RULE_NO_SUFFIX: 2068 if(word_flags & FLAG_SUFFIX_REMOVED) 2069 failed = 1; // a suffix has been removed 2070 else 2071 match.points++; 2072 break; 2073 2074 default: 2075 if(letter == rb) 2076 { 2077 if(letter == RULE_SPACE) 2078 match.points += (21-distance_right); 2079 else 2080 match.points += (21-distance_right); 2081 } 2082 else 2083 failed = 1; 2084 break; 2085 } 2086 break; 2087 2088 2089 case RULE_PRE: 2090 /*…
Large files files are truncated, but you can click here to view the full file