PageRenderTime 65ms CodeModel.GetById 20ms RepoModel.GetById 0ms app.codeStats 1ms

/Source/TaiwaneseRomanization/TaiwaneseRomanization.cpp

http://github.com/lukhnos/formosana
C++ | 1207 lines | 866 code | 213 blank | 128 comment | 306 complexity | 494a29750bd97ff5fba49b2ceb0a84c9 MD5 | raw file
  1. //
  2. // TaiwaneseRomanization.cpp
  3. //
  4. // Copyright (c) 2006-2010 Lukhnos D. Liu (http://lukhnos.org)
  5. //
  6. // Permission is hereby granted, free of charge, to any person
  7. // obtaining a copy of this software and associated documentation
  8. // files (the "Software"), to deal in the Software without
  9. // restriction, including without limitation the rights to use,
  10. // copy, modify, merge, publish, distribute, sublicense, and/or sell
  11. // copies of the Software, and to permit persons to whom the
  12. // Software is furnished to do so, subject to the following
  13. // conditions:
  14. //
  15. // The above copyright notice and this permission notice shall be
  16. // included in all copies or substantial portions of the Software.
  17. //
  18. // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
  19. // EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
  20. // OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
  21. // NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
  22. // HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
  23. // WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
  24. // FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
  25. // OTHER DEALINGS IN THE SOFTWARE.
  26. //
  27. #include "TaiwaneseRomanization.h"
  28. using namespace Formosa::TaiwaneseRomanization;
  29. Composable::~Composable()
  30. {
  31. }
  32. ComposableStringBuffer::ComposableStringBuffer() : _cursor(0)
  33. {
  34. }
  35. unsigned int ComposableStringBuffer::cursor()
  36. {
  37. return _cursor;
  38. }
  39. unsigned int ComposableStringBuffer::setCursor(unsigned int c)
  40. {
  41. if (c <= numberOfCodepoints()) _cursor = c;
  42. return _cursor;
  43. }
  44. void ComposableStringBuffer::clear()
  45. {
  46. _cursor = 0;
  47. strvec.clear();
  48. }
  49. bool ComposableStringBuffer::empty()
  50. {
  51. return strvec.empty();
  52. }
  53. unsigned int ComposableStringBuffer::numberOfCodepoints()
  54. {
  55. return (unsigned int)strvec.size();
  56. }
  57. bool ComposableStringBuffer::insertCharacterAt(unsigned int i, char c)
  58. {
  59. if (i > numberOfCodepoints()) return false;
  60. strvec.insert(strvec.begin()+i, string(1, c));
  61. return true;
  62. }
  63. bool ComposableStringBuffer::removeCodepointAt(unsigned int i)
  64. {
  65. if (i >= numberOfCodepoints()) return false;
  66. strvec.erase(strvec.begin() + i);
  67. return true;
  68. }
  69. const string ComposableStringBuffer::composedForm()
  70. {
  71. return internalForm();
  72. }
  73. const string ComposableStringBuffer::internalForm()
  74. {
  75. string newstr;
  76. unsigned int s=numberOfCodepoints();
  77. for (unsigned int i=0; i<s; i++) newstr+=strvec[i];
  78. return newstr;
  79. }
  80. RomanizationSymbol::RomanizationSymbol() : _tone(0), _type(POJSyllable)
  81. {
  82. }
  83. RomanizationSymbol::RomanizationSymbol(const string &s, SyllableType t) : _tone(0), _type(t), _symbol(s)
  84. {
  85. }
  86. RomanizationSymbol::RomanizationSymbol(const RomanizationSymbol &s) : _tone(s._tone), _type(s._type), _symbol(s._symbol)
  87. {
  88. }
  89. void RomanizationSymbol::setType(SyllableType t)
  90. {
  91. _type = t;
  92. }
  93. const RomanizationSymbol& RomanizationSymbol::operator=(const RomanizationSymbol &s)
  94. {
  95. _symbol = s._symbol;
  96. _tone = s._tone;
  97. _type = s._type;
  98. return *this;
  99. }
  100. const string RomanizationSymbol::symbol() const
  101. {
  102. return string(_symbol);
  103. }
  104. const string RomanizationSymbol::symbolInLowerCase() const
  105. {
  106. string lower;
  107. unsigned int s = (unsigned int)_symbol.length();
  108. for (unsigned int i=0; i<s; i++) lower+=tolower(_symbol[i]);
  109. return lower;
  110. }
  111. const string RomanizationSymbol::setSymbol(const string& s)
  112. {
  113. return (_symbol = s);
  114. }
  115. const string RomanizationSymbol::composedForm(bool forcePOJStyle) const
  116. {
  117. bool usePOJStyleOUAndNN = (_type == POJSyllable) || (_type == HakkaPFSSyllable) || forcePOJStyle;
  118. bool usePOJStyleNinthToneMark = (_type == POJSyllable);
  119. bool composeII = (_type == HakkaPFSSyllable);
  120. unsigned int nanTone = _tone;
  121. if (_type == HakkaPFSSyllable) {
  122. switch (_tone) {
  123. case 1: nanTone = 5; break;
  124. case 2: nanTone = 3; break;
  125. case 3: nanTone = 2; break;
  126. case 4: nanTone = 4; break;
  127. case 5: nanTone = 8; break;
  128. case 6: nanTone = 1; break;
  129. }
  130. }
  131. string composed = VowelHelper::symbolForVowel(_symbol, nanTone, usePOJStyleOUAndNN, usePOJStyleNinthToneMark, composeII);
  132. if (!composed.length()) return _symbol;
  133. return composed;
  134. }
  135. unsigned int RomanizationSymbol::composedLength() const
  136. {
  137. string composed = composedForm();
  138. unsigned int len = 0, clen = (unsigned int)composed.length();
  139. for (unsigned int i=0; i<clen; )
  140. {
  141. if (!(composed[i] & 0x80)) {
  142. len++;
  143. i++;
  144. }
  145. else if ((composed[i] & 0xe0) == 0xc0) {
  146. len++;
  147. i+=2;
  148. }
  149. else if ((composed[i] & 0xf0) == 0xe0) {
  150. len++;
  151. i+=3;
  152. }
  153. else {
  154. len++;
  155. i+=4;
  156. }
  157. }
  158. // fprintf (stderr, "composed=%s, strlen=%d, calculated len=%d\n", composed.c_str(), clen, len);
  159. return len;
  160. }
  161. unsigned int RomanizationSymbol::tone() const
  162. {
  163. return _tone;
  164. }
  165. unsigned int RomanizationSymbol::setTone(unsigned int t)
  166. {
  167. _tone = t > 9 ? _tone : t;
  168. return _tone;
  169. }
  170. bool RomanizationSymbol::isUpperCase() const
  171. {
  172. if (!_symbol.length()) return false;
  173. return toupper(_symbol[0]) == _symbol[0];
  174. }
  175. RomanizationSyllable::RomanizationSyllable() : _inputType(POJSyllable), _inputOption(DiacriticGivenBeforeVowel),
  176. _forcePOJStyle(false),
  177. _cursor(0), _preparedTone(0)
  178. {
  179. }
  180. RomanizationSyllable::RomanizationSyllable(const RomanizationSyllable &s) : _inputType(s._inputType),
  181. _inputOption(s._inputOption),
  182. _forcePOJStyle(s._forcePOJStyle),
  183. _symvec(s._symvec),
  184. _cursor(s._cursor), _preparedTone(s._preparedTone)
  185. {
  186. }
  187. const RomanizationSyllable& RomanizationSyllable::operator=(const RomanizationSyllable &s)
  188. {
  189. _inputType = s._inputType;
  190. _inputOption = s._inputOption;
  191. _forcePOJStyle = s._forcePOJStyle;
  192. _symvec = s._symvec;
  193. _cursor = s._cursor;
  194. _preparedTone = s._preparedTone;
  195. return *this;
  196. }
  197. void RomanizationSyllable::setInputType(SyllableType t)
  198. {
  199. _inputType = t;
  200. }
  201. void RomanizationSyllable::setInputOption(DiacriticInputOption o)
  202. {
  203. if (o != _inputOption) clearPreparedTone();
  204. _inputOption = o;
  205. }
  206. void RomanizationSyllable::setForcePOJStyle(bool p)
  207. {
  208. _forcePOJStyle = p;
  209. }
  210. void RomanizationSyllable::clear()
  211. {
  212. _symvec.clear();
  213. _cursor = 0;
  214. _preparedTone = 0;
  215. }
  216. bool RomanizationSyllable::empty() const
  217. {
  218. return _symvec.empty();
  219. }
  220. unsigned int RomanizationSyllable::numberOfCodepoints() const
  221. {
  222. return (unsigned int)_symvec.size();
  223. }
  224. const string RomanizationSyllable::composedForm()
  225. {
  226. unsigned int s = (unsigned int)_symvec.size();
  227. string composed;
  228. unsigned int i;
  229. if (_preparedTone) _cursor--;
  230. for (i=0; i<_cursor; i++)
  231. {
  232. composed += _symvec[i].composedForm(_forcePOJStyle);
  233. // fprintf(stderr, "%d, symbol=%s, composed=%s, composd form=%s\n", i, _symvec[i].symbol().c_str(), _symvec[i].composedForm().c_str(), composed.c_str());
  234. }
  235. char diacriticShorthand = VowelHelper::diacriticShorthandFromTone(_preparedTone);
  236. if (diacriticShorthand) {
  237. composed += diacriticShorthand;
  238. }
  239. // fprintf(stderr, "composd form=%s\n", composed.c_str());
  240. for (; i<s; i++)
  241. {
  242. composed += _symvec[i].composedForm(_forcePOJStyle);
  243. // fprintf(stderr, "composd form=%s\n", composed.c_str());
  244. }
  245. if (_preparedTone) _cursor++;
  246. return composed;
  247. }
  248. void RomanizationSyllable::setCursor(unsigned int c)
  249. {
  250. clearPreparedTone();
  251. _cursor = c;
  252. }
  253. unsigned int RomanizationSyllable::cursor() const
  254. {
  255. unsigned int realcursor = _preparedTone ? _cursor-1 : _cursor;
  256. unsigned codepointCursor=0;
  257. for (unsigned int i=0; i<realcursor; i++) codepointCursor+=_symvec[i].composedLength();
  258. if (_preparedTone) codepointCursor++;
  259. return codepointCursor;
  260. }
  261. bool RomanizationSyllable::cursorHome()
  262. {
  263. clearPreparedTone();
  264. if (_cursor==0) return false;
  265. _cursor=0;
  266. return true;
  267. }
  268. bool RomanizationSyllable::cursorEnd()
  269. {
  270. clearPreparedTone();
  271. unsigned int len = numberOfCodepoints();
  272. if (_cursor == len) return false;
  273. _cursor = len;
  274. return true;
  275. }
  276. bool RomanizationSyllable::cursorLeft()
  277. {
  278. clearPreparedTone();
  279. if (_cursor==0) return false;
  280. _cursor--;
  281. return true;
  282. }
  283. bool RomanizationSyllable::cursorRight()
  284. {
  285. clearPreparedTone();
  286. if (_cursor == numberOfCodepoints()) return false;
  287. _cursor++;
  288. return true;
  289. }
  290. bool RomanizationSyllable::insertSymbolAtCursor(const RomanizationSymbol &s)
  291. {
  292. clearPreparedTone();
  293. RomanizationSymbol newsym(s);
  294. newsym.setType(_inputType);
  295. _symvec.insert(_symvec.begin() + _cursor, newsym);
  296. _cursor++;
  297. return true;
  298. }
  299. // if there is a prepared tone, the given tone parameter will be ignored
  300. bool RomanizationSyllable::insertCharacterAtCursor(char c, unsigned int tone)
  301. {
  302. // fprintf(stderr, "insert char %d ('%c'), cursor=%d\n", c, c, _cursor);
  303. if (VowelHelper::isDiacriticShorthand(c))
  304. {
  305. unsigned int tone = VowelHelper::toneFromDiacriticShorthand(c);
  306. // if there's already a prepared tone, we replace it with the current one
  307. if (_preparedTone) {
  308. _preparedTone = tone;
  309. return true;
  310. }
  311. if (_inputOption==DiacriticGivenBeforeVowel) {
  312. _preparedTone = tone;
  313. _cursor++;
  314. }
  315. else {
  316. // diacritic given after vowel
  317. if (hasPreviousSymbolAtCursor()) previousSymbolAtCursor().setTone(tone);
  318. }
  319. return true;
  320. }
  321. // if it's not a diacritic symbol, it's POJ^W^W^W, and it's n or u or g,
  322. // (and if there's no prepared tone!)
  323. // we need to do something special...
  324. if (!VowelHelper::isDiacriticShorthand(c) && !_preparedTone /* && _inputType==POJSyllable */)
  325. {
  326. if (hasPreviousSymbolAtCursor())
  327. {
  328. string prev = previousSymbolAtCursor().symbolInLowerCase();
  329. // N -> nn only works if the first character of the syllable is not an
  330. // all uppercase symbol
  331. if (c=='N' && ((prev != "n") && (prev != "nn")) && _inputType == POJSyllable
  332. && (_symvec.size() > 0 && !_symvec[0].isUpperCase()))
  333. {
  334. // insert two n's in a row
  335. _symvec.insert(_symvec.begin() + _cursor, RomanizationSymbol(string("nn"), _inputType));
  336. _cursor++;
  337. return true;
  338. }
  339. else if (tolower(c)=='n' && prev=="n") {
  340. previousSymbolAtCursor().setSymbol(previousSymbolAtCursor().symbol() + string(1, c));
  341. return true;
  342. }
  343. else if (_inputType == POJSyllable && tolower(c)=='u' && prev=="o") {
  344. previousSymbolAtCursor().setSymbol(previousSymbolAtCursor().symbol() + string(1, c));
  345. return true;
  346. }
  347. else if (_inputType == TLSyllable && tolower(c)=='o' && prev=="o") {
  348. previousSymbolAtCursor().setSymbol(previousSymbolAtCursor().symbol() + string(1, c));
  349. return true;
  350. }
  351. else if ((_inputType == HakkaPFSSyllable) && tolower(c)=='i' && prev=="i") {
  352. previousSymbolAtCursor().setSymbol(previousSymbolAtCursor().symbol() + string(1, c));
  353. return true;
  354. }
  355. else if (tolower(c)=='g' && prev=="nn") {
  356. // we need to break them up!
  357. string before = previousSymbolAtCursor().symbol();
  358. // and the tone of the previous symbol (when it's combined into nn) will be retained
  359. previousSymbolAtCursor().setSymbol(before.substr(0, 1));
  360. // insert one n and one g
  361. _symvec.insert(_symvec.begin() + _cursor, RomanizationSymbol(before.substr(1,1), _inputType));
  362. _cursor++;
  363. _symvec.insert(_symvec.begin() + _cursor, RomanizationSymbol(string(1, c), _inputType));
  364. _cursor++;
  365. return true;
  366. }
  367. }
  368. }
  369. RomanizationSymbol s(string(1, c), _inputType);
  370. if (_preparedTone)
  371. {
  372. _cursor--;
  373. s.setTone(_preparedTone);
  374. _preparedTone = 0;
  375. }
  376. else if (tone > 1 || _inputType == HakkaPFSSyllable)
  377. {
  378. s.setTone(tone);
  379. }
  380. _symvec.insert(_symvec.begin() + _cursor, s);
  381. _cursor++;
  382. return true;
  383. }
  384. bool RomanizationSyllable::removeCharacterAtRightOfCursor() // backspace
  385. {
  386. if (_preparedTone)
  387. {
  388. clearPreparedTone();
  389. return true;
  390. }
  391. if (atBeginning()) return false;
  392. _cursor--;
  393. _symvec.erase(_symvec.begin() + _cursor);
  394. return true;
  395. }
  396. bool RomanizationSyllable::removeCharacterAtLeftOfCursor() // delete
  397. {
  398. // we do some tightrope trick here: if we have _preparedTone ready,
  399. // we "push back" the real _cursor position, do the delete thing,
  400. // then push it back
  401. bool retval=true;
  402. if (_preparedTone) _cursor--;
  403. if (atEnd()) retval=false; else _symvec.erase(_symvec.begin() + _cursor);
  404. if (_preparedTone) _cursor++;
  405. return retval;
  406. }
  407. // returns a normalized string that represents the "internal form" for querying the database
  408. // implies normalization
  409. const string RomanizationSyllable::normalizedQueryData(unsigned int finalTone)
  410. {
  411. RomanizationSyllable s(*this);
  412. s.normalize(finalTone);
  413. string query;
  414. unsigned int size = (unsigned int)s._symvec.size();
  415. unsigned int loudest = 0;
  416. for (unsigned int i=0; i<size; i++) {
  417. query = query + s._symvec[i].symbol();
  418. if (s._symvec[i].tone() > 1) loudest = s._symvec[i].tone();
  419. // fprintf (stderr, "combining query data %s, tone %d\n", s._symvec[i].symbol().c_str(), s._symvec[i].tone());
  420. }
  421. // TODO: Accept 1 when Hakka
  422. if (loudest > 1) query = query + string(1, loudest+'0');
  423. return query;
  424. }
  425. static unsigned int FindVowel(vector<RomanizationSymbol>& symvec, unsigned int start)
  426. {
  427. unsigned i = start >= (unsigned int)symvec.size() ? (unsigned int)symvec.size() : start;
  428. while (i < symvec.size()) {
  429. string s = symvec[i].symbolInLowerCase();
  430. if (s == "a" || s == "e" || s == "i" || s == "o" || s == "u" || s == "ou" || s == "oo" || s == "ii") {
  431. return i;
  432. }
  433. i++;
  434. }
  435. return i;
  436. }
  437. // normalization is an "identpotent" function, ie. the result should
  438. // be the same no matter how many times you call it--this being a very
  439. // important linguistic characteristic of this function
  440. void RomanizationSyllable::normalize(unsigned int finalTone)
  441. {
  442. bool pureTL = (_inputType == TLSyllable && !_forcePOJStyle);
  443. // fprintf (stderr, "input finalTone=%d\n", finalTone);
  444. unsigned int end = (unsigned int)_symvec.size();
  445. // if it's empty, just return
  446. if (!end) return;
  447. unsigned int loudestVowel = end;
  448. unsigned int loudestTone = 0;
  449. unsigned int p;
  450. // find the loudest vowel
  451. #define FLV(x) ((p=findSymbol(x)) != end)
  452. #define SETLOUDEST(v) do { loudestVowel = v; if (_symvec[loudestVowel].tone()>0) { loudestTone = _symvec[loudestVowel].tone(); } } while(0)
  453. if (!pureTL) {
  454. // do ng first
  455. // see if it's ng
  456. if ((p=findSymbolPair("n", "g")) != end) {
  457. SETLOUDEST(p);
  458. }
  459. else {
  460. // do m and n
  461. if (FLV("m")) SETLOUDEST(p);
  462. if (FLV("n")) SETLOUDEST(p);
  463. }
  464. unsigned first = FindVowel(_symvec, 0);
  465. if (first != end) {
  466. SETLOUDEST(first);
  467. unsigned second = FindVowel(_symvec, first + 1);
  468. if (second != end && _symvec[first].symbolInLowerCase() != "a") {
  469. if (!(_symvec[first].symbolInLowerCase() == "e" && _symvec[second].symbolInLowerCase() == "e") && _symvec[second].symbolInLowerCase() != "i") {
  470. SETLOUDEST(second);
  471. }
  472. }
  473. }
  474. // exceptions: oa/oai, oe/oei
  475. first = findSymbolPair("o", "e");
  476. if (first != end) {
  477. string symAfter;
  478. if (first + 2 != end) {
  479. symAfter = _symvec[first + 2].symbolInLowerCase();
  480. }
  481. if (!symAfter.size() || symAfter == "nn") {
  482. SETLOUDEST(first);
  483. }
  484. }
  485. first = findSymbolPair("o", "a");
  486. if (first != end) {
  487. if (findSymbolTriple("o", "a", "i") == end) {
  488. string symAfter;
  489. if (first + 2 != end) {
  490. symAfter = _symvec[first + 2].symbolInLowerCase();
  491. }
  492. if (!symAfter.size() || symAfter == "nn") {
  493. SETLOUDEST(first);
  494. }
  495. }
  496. }
  497. }
  498. else {
  499. if (end==1 && _symvec[0].symbolInLowerCase()=="m") SETLOUDEST(0);
  500. if (FLV("n")) SETLOUDEST(p);
  501. if (FLV("m")) SETLOUDEST(p);
  502. // see if it's ng
  503. if ((p=findSymbolPair("n", "g")) != end)
  504. SETLOUDEST(p);
  505. if (FLV("u")) SETLOUDEST(p);
  506. if (FLV("ii")) SETLOUDEST(p); // TODO: Check the rule here
  507. if (FLV("i")) SETLOUDEST(p);
  508. if (FLV("o")) SETLOUDEST(p);
  509. if (FLV("e")) SETLOUDEST(p);
  510. if (FLV("ou")) SETLOUDEST(p);
  511. if (FLV("oo")) SETLOUDEST(p);
  512. if (FLV("a")) SETLOUDEST(p);
  513. }
  514. // the last "ere" override
  515. if (end >= 3) {
  516. if (_symvec[end-1].symbolInLowerCase() == "e" && _symvec[end-2].symbolInLowerCase() == "r" && _symvec[end-3].symbolInLowerCase() == "e")
  517. {
  518. SETLOUDEST(end-1);
  519. }
  520. }
  521. if (loudestVowel==end) return;
  522. // fprintf(stderr, "found loudest vowel=%d (%s), loudest tone=%d\n", loudestVowel, _symvec[loudestVowel].symbol().c_str(), loudestTone);
  523. // finalTone overrides
  524. if (finalTone > 0) loudestTone = finalTone;
  525. for (unsigned int i=0; i<end; i++) _symvec[i].setTone(0);
  526. string lastSymbolStr = _symvec[end-1].symbolInLowerCase();
  527. // if the symbol is "i", and there's a next "u", we shift the vowel to "u" (TL only)
  528. if (_symvec[loudestVowel].symbolInLowerCase()=="i" && pureTL)
  529. {
  530. if ((loudestVowel+1 < end) && (_symvec[loudestVowel+1].symbolInLowerCase() == "u" || _symvec[loudestVowel+1].symbolInLowerCase() == "ii")) {
  531. // if i follows a vowel, and the next vowel is u or á¹³, we put the accent on the succeeding vowel
  532. loudestVowel++;
  533. }
  534. else if (_inputType == POJSyllable && loudestVowel && (_symvec[loudestVowel-1].symbolInLowerCase() == "u" || _symvec[loudestVowel-1].symbolInLowerCase() == "ii")) {
  535. // if (and only if) in POJ mode/forced POJ style, and if i precedes a vowel, and the next voewl is u or á¹³, we put the accent on the preceeding vowel
  536. loudestVowel--;
  537. }
  538. }
  539. if (loudestTone==4 || /* loudestTone==6 || */ (_inputType != HakkaPFSSyllable && loudestTone <= 1)) {
  540. // ignore the 4th, 6th and 1th (or no tone), so everything is set to 0 now
  541. return;
  542. }
  543. unsigned int tpkhTone = (_inputType == HakkaPFSSyllable) ? 5 : 8;
  544. if (lastSymbolStr=="t" || lastSymbolStr=="p" || lastSymbolStr=="k" || lastSymbolStr=="h") {
  545. // only when the ending is t, p, k, h is the tone set -- and only when the tone is 8
  546. if (loudestTone==tpkhTone) _symvec[loudestVowel].setTone(loudestTone);
  547. return;
  548. }
  549. else {
  550. // if not t,p,k,h, we need to override the loudest tone--back to tone 1 !
  551. if (loudestTone==tpkhTone) {
  552. _symvec[loudestVowel].setTone(0);
  553. return;
  554. }
  555. }
  556. _symvec[loudestVowel].setTone(loudestTone);
  557. #undef FLV
  558. #undef SETTONE
  559. }
  560. char RomanizationSyllable::charAccordingToCaseOf(char c, char ref)
  561. {
  562. return (ref == toupper(ref)) ? toupper(c) : tolower(c);
  563. }
  564. RomanizationSyllable RomanizationSyllable::convertToPOJSyllable()
  565. {
  566. RomanizationSyllable syl = *this;
  567. syl.clearPreparedTone();
  568. syl.setCursor(0);
  569. // if (_inputType==POJSyllable) return syl;
  570. syl.setInputType(POJSyllable);
  571. syl.clear();
  572. // begin TL->POJ conversion
  573. unsigned int size = (unsigned int)_symvec.size();
  574. unsigned int i;
  575. for (i=0; i<size; i++)
  576. {
  577. RomanizationSymbol sym1 = _symvec[i];
  578. string str1 = sym1.symbol();
  579. // fprintf (stderr, "converting to POJ: %s\n", str1.c_str());
  580. string lowstr1 = sym1.symbolInLowerCase();
  581. // oo -> ou
  582. if (lowstr1=="oo")
  583. {
  584. syl.insertCharacterAtCursor(charAccordingToCaseOf('o', str1[0]), sym1.tone());
  585. syl.insertCharacterAtCursor(charAccordingToCaseOf('u', str1[1]));
  586. continue;
  587. }
  588. if (hasNextSymbol(i)) {
  589. RomanizationSymbol sym2 = _symvec[i+1];
  590. string str2 = sym2.symbol();
  591. string lowstr2 = sym2.symbolInLowerCase();
  592. // ou -> oo for POJ but not combined ou
  593. if (lowstr1=="o" && lowstr2=="u") {
  594. // detect case
  595. syl.insertCharacterAtCursor(charAccordingToCaseOf('o', str1[0]), sym1.tone());
  596. syl.insertCharacterAtCursor(charAccordingToCaseOf('u', str2[0]));
  597. i++;
  598. continue;
  599. }
  600. // ts -> ch with case detection
  601. if (lowstr1=="t" && lowstr2=="s") {
  602. // detect case
  603. syl.insertCharacterAtCursor(charAccordingToCaseOf('c', str1[0]));
  604. syl.insertCharacterAtCursor(charAccordingToCaseOf('h', str2[0]));
  605. i++;
  606. continue;
  607. }
  608. // ue -> oe
  609. if (lowstr1=="u" && lowstr2=="e") {
  610. // detect case
  611. syl.insertCharacterAtCursor(charAccordingToCaseOf('o', str1[0]), sym1.tone());
  612. syl.insertCharacterAtCursor(charAccordingToCaseOf('e', str2[0]), sym2.tone());
  613. i++;
  614. continue;
  615. }
  616. // ua -> oa
  617. if (lowstr1=="u" && lowstr2=="a") {
  618. // detect case
  619. syl.insertCharacterAtCursor(charAccordingToCaseOf('o', str1[0]), sym1.tone());
  620. syl.insertCharacterAtCursor(charAccordingToCaseOf('a', str2[0]), sym2.tone());
  621. i++;
  622. continue;
  623. }
  624. // ik -> ek (at ending)
  625. if (lowstr1=="i" && lowstr2=="k" && (i+2)==size) {
  626. // detect case
  627. syl.insertCharacterAtCursor(charAccordingToCaseOf('e', str1[0]), sym1.tone());
  628. syl.insertCharacterAtCursor(charAccordingToCaseOf('k', str2[0]), sym2.tone());
  629. i++;
  630. continue;
  631. }
  632. if (hasNextNextSymbol(i) && (i+3)==size) {
  633. RomanizationSymbol sym3 = _symvec[i+2];
  634. string str3 = sym3.symbol();
  635. string lowstr3 = sym3.symbolInLowerCase();
  636. // ing -> eng (must be ending)
  637. if (lowstr1=="i" && lowstr2=="n" && lowstr3=="g") {
  638. // detect case
  639. syl.insertCharacterAtCursor(charAccordingToCaseOf('e', str1[0]), sym1.tone());
  640. syl.insertCharacterAtCursor(charAccordingToCaseOf('n', str2[0]), sym2.tone());
  641. syl.insertCharacterAtCursor(charAccordingToCaseOf('g', str3[0]), sym3.tone());
  642. i+=2;
  643. continue;
  644. }
  645. // ouh -> oh (ending)
  646. if (lowstr1=="o" && lowstr2=="u" && lowstr3=="h") {
  647. // detect case
  648. syl.insertCharacterAtCursor(charAccordingToCaseOf('o', str1[0]), sym1.tone());
  649. syl.insertCharacterAtCursor(charAccordingToCaseOf('h', str2[0]), sym2.tone());
  650. i+=2;
  651. continue;
  652. }
  653. }
  654. }
  655. syl.insertSymbolAtCursor(sym1);
  656. }
  657. return syl;
  658. }
  659. RomanizationSyllable RomanizationSyllable::convertToTLSyllable()
  660. {
  661. RomanizationSyllable syl = *this;
  662. syl.clearPreparedTone();
  663. syl.setCursor(0);
  664. // if (_inputType==TLSyllable) return syl;
  665. syl.setInputType(TLSyllable);
  666. syl.clear();
  667. // begin POJ->TL conversion
  668. unsigned int size = (unsigned int)_symvec.size();
  669. unsigned int i;
  670. for (i=0; i<size; i++)
  671. {
  672. RomanizationSymbol sym1 = _symvec[i];
  673. string str1 = sym1.symbol();
  674. string lowstr1 = sym1.symbolInLowerCase();
  675. // ou -> oo
  676. if (lowstr1=="ou")
  677. {
  678. // detect case
  679. syl.insertCharacterAtCursor(charAccordingToCaseOf('o', str1[0]), sym1.tone());
  680. syl.insertCharacterAtCursor(charAccordingToCaseOf('o', str1[1]));
  681. continue;
  682. }
  683. if (hasNextSymbol(i)) {
  684. RomanizationSymbol sym2 = _symvec[i+1];
  685. string str2 = sym2.symbol();
  686. string lowstr2 = sym2.symbolInLowerCase();
  687. // ch -> ts with case detection
  688. if (lowstr1=="c" && lowstr2=="h") {
  689. // detect case
  690. syl.insertCharacterAtCursor(charAccordingToCaseOf('t', str1[0]));
  691. syl.insertCharacterAtCursor(charAccordingToCaseOf('s', str2[0]));
  692. i++;
  693. continue;
  694. }
  695. // oe -> ue
  696. if (lowstr1=="o" && lowstr2=="e") {
  697. // detect case
  698. syl.insertCharacterAtCursor(charAccordingToCaseOf('u', str1[0]), sym1.tone());
  699. syl.insertCharacterAtCursor(charAccordingToCaseOf('e', str2[0]), sym2.tone());
  700. i++;
  701. continue;
  702. }
  703. // oa -> ua
  704. if (lowstr1=="o" && lowstr2=="a") {
  705. // detect case
  706. syl.insertCharacterAtCursor(charAccordingToCaseOf('u', str1[0]), sym1.tone());
  707. syl.insertCharacterAtCursor(charAccordingToCaseOf('a', str2[0]), sym2.tone());
  708. i++;
  709. continue;
  710. }
  711. // ek -> ik (at ending)
  712. if (lowstr1=="e" && lowstr2=="k" && (i+2)==size) {
  713. // detect case
  714. syl.insertCharacterAtCursor(charAccordingToCaseOf('i', str1[0]), sym1.tone());
  715. syl.insertCharacterAtCursor(charAccordingToCaseOf('k', str2[0]), sym2.tone());
  716. i++;
  717. continue;
  718. }
  719. if (hasNextNextSymbol(i) && (i+3)==size) {
  720. RomanizationSymbol sym3 = _symvec[i+2];
  721. string str3 = sym3.symbol();
  722. string lowstr3 = sym3.symbolInLowerCase();
  723. // ing -> eng (must be ending)
  724. if (lowstr1=="e" && lowstr2=="n" && lowstr3=="g") {
  725. // detect case
  726. syl.insertCharacterAtCursor(charAccordingToCaseOf('i', str1[0]), sym1.tone());
  727. syl.insertCharacterAtCursor(charAccordingToCaseOf('n', str2[0]), sym2.tone());
  728. syl.insertCharacterAtCursor(charAccordingToCaseOf('g', str3[0]), sym3.tone());
  729. i+=2;
  730. continue;
  731. }
  732. }
  733. }
  734. syl.insertSymbolAtCursor(sym1);
  735. }
  736. return syl;
  737. }
  738. bool RomanizationSyllable::atBeginning() const
  739. {
  740. return _cursor == 0;
  741. }
  742. bool RomanizationSyllable::atEnd() const
  743. {
  744. return _cursor == numberOfCodepoints();
  745. }
  746. void RomanizationSyllable::clearPreparedTone()
  747. {
  748. if (!_preparedTone) return;
  749. _preparedTone = 0;
  750. _cursor--;
  751. }
  752. bool RomanizationSyllable::hasPreviousSymbolAtCursor() const
  753. {
  754. unsigned int realcursor = _preparedTone ? _cursor-1 : _cursor;
  755. return realcursor > 0;
  756. }
  757. bool RomanizationSyllable::hasNextSymbol(unsigned int pos) const
  758. {
  759. if (pos+1 >= _symvec.size()) return false;
  760. return true;
  761. }
  762. bool RomanizationSyllable::hasNextNextSymbol(unsigned int pos) const
  763. {
  764. if (pos+2 >= _symvec.size()) return false;
  765. return true;
  766. }
  767. // the result of this function is unpredictable if there's no
  768. // previous symbol--always check with hasPreviousSymbolAtCursor() !
  769. RomanizationSymbol& RomanizationSyllable::previousSymbolAtCursor()
  770. {
  771. unsigned int realcursor = _preparedTone ? _cursor-1 : _cursor;
  772. return _symvec[realcursor-1];
  773. }
  774. // always assumes that the given input is in all lower case
  775. unsigned int RomanizationSyllable::findSymbol(const char *s) const
  776. {
  777. string cpps(s);
  778. unsigned int size = (unsigned int)_symvec.size();
  779. unsigned int i;
  780. for (i = 0; i < size; i++) {
  781. if (_symvec[i].symbolInLowerCase() == cpps) break;
  782. }
  783. return i;
  784. }
  785. unsigned int RomanizationSyllable::findSymbolPair(const char *s1, const char *s2) const
  786. {
  787. string cpps1(s1), cpps2(s2);
  788. unsigned int size = (unsigned int)_symvec.size();
  789. if (size < 2) return size;
  790. unsigned int i;
  791. for (i = 0; i < size-1; i++) {
  792. if (_symvec[i].symbolInLowerCase()==cpps1 && _symvec[i+1].symbolInLowerCase()==cpps2) return i;
  793. }
  794. return size;
  795. }
  796. unsigned int RomanizationSyllable::findSymbolTriple(const char *s1, const char *s2, const char *s3) const
  797. {
  798. string cpps1(s1), cpps2(s2), cpps3(s3);
  799. unsigned int size = (unsigned int)_symvec.size();
  800. if (size < 3) return size;
  801. unsigned int i;
  802. for (i = 0; i < size-2; i++) {
  803. if (_symvec[i].symbolInLowerCase()==cpps1 && _symvec[i+1].symbolInLowerCase()==cpps2 && _symvec[i+2].symbolInLowerCase()==cpps3) return i;
  804. }
  805. return size;
  806. }
  807. const RomanizationSyllable FreeFormSyllable::convertToTLFromTLPA(unsigned int finalTone)
  808. {
  809. string rep=internalForm();
  810. RomanizationSyllable syl;
  811. syl.setInputType(TLSyllable);
  812. unsigned int size = (unsigned int)rep.length();
  813. for (unsigned int i=0; i<size; i++)
  814. {
  815. if (rep[i]=='c') {
  816. syl.insertCharacterAtCursor('t');
  817. syl.insertCharacterAtCursor('s');
  818. }
  819. else if (rep[i]=='C')
  820. {
  821. syl.insertCharacterAtCursor('T');
  822. syl.insertCharacterAtCursor('S');
  823. }
  824. else syl.insertCharacterAtCursor(rep[i]);
  825. }
  826. syl.normalize(finalTone);
  827. return syl;
  828. }
  829. const RomanizationSyllable FreeFormSyllable::convertToTLFromDT(unsigned int finalTone)
  830. {
  831. string rep=internalForm();
  832. RomanizationSyllable syl;
  833. syl.setInputType(TLSyllable);
  834. unsigned int size = (unsigned int)rep.length();
  835. for (unsigned int i=0; i<size; i++)
  836. {
  837. char dt1 = rep[i];
  838. char lowdt1 = tolower(dt1);
  839. // r -> j (beginning)
  840. if (i==0 && lowdt1=='r') {
  841. syl.insertCharacterAtCursor(charWithCaseAccordingTo('j', dt1));
  842. continue;
  843. }
  844. // replaces the two-character combinations
  845. if (i+1 < size) {
  846. string part=rep.substr(i, 2);
  847. string lower=toLowerString(part);
  848. // or -> o
  849. if (lower=="or") {
  850. syl.insertCharacterAtCursor(charWithCaseAccordingTo('o', part));
  851. i++;
  852. continue;
  853. }
  854. // en -> ian
  855. if (lower=="en") {
  856. syl.insertCharacterAtCursor(charWithCaseAccordingTo('i', part));
  857. syl.insertCharacterAtCursor(charWithCaseAccordingTo('a', part));
  858. syl.insertCharacterAtCursor(charWithCaseAccordingTo('n', part));
  859. i++;
  860. continue;
  861. }
  862. // et -> iat
  863. if (lower=="et") {
  864. syl.insertCharacterAtCursor(charWithCaseAccordingTo('i', part));
  865. syl.insertCharacterAtCursor(charWithCaseAccordingTo('a', part));
  866. syl.insertCharacterAtCursor(charWithCaseAccordingTo('t', part));
  867. i++;
  868. continue;
  869. }
  870. // bh -> b (beginning)
  871. if (i==0 && lower=="bh") {
  872. syl.insertCharacterAtCursor(charWithCaseAccordingTo('b', part));
  873. i++;
  874. continue;
  875. }
  876. // gh -> g (beginning)
  877. if (i==0 && lower=="gh") {
  878. syl.insertCharacterAtCursor(charWithCaseAccordingTo('g', part));
  879. i++;
  880. continue;
  881. }
  882. // wa -> ua (beginning)
  883. if (lower=="wa" && i==0) {
  884. syl.insertCharacterAtCursor(charWithCaseAccordingTo('u', part));
  885. syl.insertCharacterAtCursor(charWithCaseAccordingTo('a', part));
  886. i++;
  887. continue;
  888. }
  889. // we -> ue (beginning)
  890. if (lower=="we" && i==0) {
  891. syl.insertCharacterAtCursor(charWithCaseAccordingTo('u', part));
  892. syl.insertCharacterAtCursor(charWithCaseAccordingTo('e', part));
  893. i++;
  894. continue;
  895. }
  896. // wi -> ui (beginning)
  897. if (lower=="wi" && i==0) {
  898. syl.insertCharacterAtCursor(charWithCaseAccordingTo('u', part));
  899. syl.insertCharacterAtCursor(charWithCaseAccordingTo('i', part));
  900. i++;
  901. continue;
  902. }
  903. // yo -> io (beginning)
  904. if (lower=="yo" && i==0) {
  905. syl.insertCharacterAtCursor(charWithCaseAccordingTo('i', part));
  906. syl.insertCharacterAtCursor(charWithCaseAccordingTo('o', part));
  907. i++;
  908. continue;
  909. }
  910. // yi -> i (beginning)
  911. if (lower=="yi" && i==0) {
  912. syl.insertCharacterAtCursor(charWithCaseAccordingTo('i', part));
  913. i++;
  914. continue;
  915. }
  916. }
  917. // o -> oo
  918. if (lowdt1=='o') {
  919. syl.insertCharacterAtCursor(charWithCaseAccordingTo('o', dt1));
  920. syl.insertCharacterAtCursor(charWithCaseAccordingTo('o', dt1));
  921. continue;
  922. }
  923. // b -> p (beginning)
  924. if (i==0 && lowdt1=='b') {
  925. syl.insertCharacterAtCursor(charWithCaseAccordingTo('p', dt1));
  926. continue;
  927. }
  928. // p -> ph (beginning)
  929. if (i==0 && lowdt1=='p') {
  930. syl.insertCharacterAtCursor(charWithCaseAccordingTo('p', dt1));
  931. syl.insertCharacterAtCursor(charWithCaseAccordingTo('h', dt1));
  932. continue;
  933. }
  934. // k -> kh (beginning)
  935. if (i==0 && lowdt1=='k') {
  936. syl.insertCharacterAtCursor(charWithCaseAccordingTo('k', dt1));
  937. syl.insertCharacterAtCursor(charWithCaseAccordingTo('h', dt1));
  938. continue;
  939. }
  940. // g -> k (beginning)
  941. if (i==0 && lowdt1=='g') {
  942. syl.insertCharacterAtCursor(charWithCaseAccordingTo('k', dt1));
  943. continue;
  944. }
  945. // d -> t (beginning)
  946. if (i==0 && lowdt1=='d') {
  947. syl.insertCharacterAtCursor(charWithCaseAccordingTo('t', dt1));
  948. continue;
  949. }
  950. // t -> th (beginning)
  951. if (i==0 && lowdt1=='t') {
  952. syl.insertCharacterAtCursor(charWithCaseAccordingTo('t', dt1));
  953. syl.insertCharacterAtCursor(charWithCaseAccordingTo('h', dt1));
  954. continue;
  955. }
  956. // z -> ts (beginning)
  957. if (i==0 && lowdt1=='z') {
  958. syl.insertCharacterAtCursor(charWithCaseAccordingTo('t', dt1));
  959. syl.insertCharacterAtCursor(charWithCaseAccordingTo('s', dt1));
  960. continue;
  961. }
  962. // c -> tsh (beginning)
  963. if (i==0 && lowdt1=='c') {
  964. syl.insertCharacterAtCursor(charWithCaseAccordingTo('t', dt1));
  965. syl.insertCharacterAtCursor(charWithCaseAccordingTo('s', dt1));
  966. syl.insertCharacterAtCursor(charWithCaseAccordingTo('h', dt1));
  967. continue;
  968. }
  969. // else ...
  970. syl.insertCharacterAtCursor(dt1);
  971. }
  972. // remap the final tone
  973. unsigned int tltone=finalTone;
  974. syl.normalize(tltone);
  975. return syl;
  976. }
  977. char FreeFormSyllable::charWithCaseAccordingTo(char c, char ref) const
  978. {
  979. if (tolower(ref) == ref) return tolower(c);
  980. return toupper(c);
  981. }
  982. char FreeFormSyllable::charWithCaseAccordingTo(char c, const string &r) const
  983. {
  984. if (tolower(r[0]) == r[0]) return tolower(c);
  985. return toupper(c);
  986. }
  987. const string FreeFormSyllable::toLowerString(const string &s) const
  988. {
  989. unsigned int size = (unsigned int)s.length();
  990. string lower;
  991. unsigned int i;
  992. for (i=0;i<size;i++) lower+=string(1, tolower(s[i]));
  993. return lower;
  994. }