/main/src/com/google/refine/clustering/binning/Metaphone3.java
http://google-refine.googlecode.com/ · Java · 7460 lines · 4716 code · 652 blank · 2092 comment · 2026 complexity · 8774dc1d2a94526d600a7cc2d818d021 MD5 · raw file
Large files are truncated click here to view the full file
- /*
-
- Copyright 2010, Lawrence Philips
- All rights reserved.
-
- Redistribution and use in source and binary forms, with or without
- modification, are permitted provided that the following conditions are
- met:
-
- * Redistributions of source code must retain the above copyright
- notice, this list of conditions and the following disclaimer.
- * Redistributions in binary form must reproduce the above
- copyright notice, this list of conditions and the following disclaimer
- in the documentation and/or other materials provided with the
- distribution.
- * Neither the name of Google Inc. nor the names of its
- contributors may be used to endorse or promote products derived from
- this software without specific prior written permission.
-
- THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
- "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
- LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
- A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
- OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
- SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
- LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
- DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
- THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
- (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-
- */
-
- /*
- * A request from the author: Please comment and sign any changes you make to
- * the Metaphone 3 reference implementation.
- * <br>
- * Please do NOT reformat this module to Google Refine's coding standard,
- * but instead keep the original format so that it can be more easily compared
- * to any modified fork of the original.
- */
-
- /**
- * Metaphone 3<br>
- * VERSION 2.1.3
- *
- * by Lawrence Philips<br>
- *
- * Metaphone 3 is designed to return an *approximate* phonetic key (and an alternate
- * approximate phonetic key when appropriate) that should be the same for English
- * words, and most names familiar in the United States, that are pronounced *similarly*.
- * The key value is *not* intended to be an *exact* phonetic, or even phonemic,
- * representation of the word. This is because a certain degree of 'fuzziness' has
- * proven to be useful in compensating for variations in pronunciation, as well as
- * misheard pronunciations. For example, although americans are not usually aware of it,
- * the letter 's' is normally pronounced 'z' at the end of words such as "sounds".<br><br>
- *
- * The 'approximate' aspect of the encoding is implemented according to the following rules:<br><br>
- *
- * (1) All vowels are encoded to the same value - 'A'. If the parameter encodeVowels
- * is set to false, only *initial* vowels will be encoded at all. If encodeVowels is set
- * to true, 'A' will be encoded at all places in the word that any vowels are normally
- * pronounced. 'W' as well as 'Y' are treated as vowels. Although there are differences in
- * the pronunciation of 'W' and 'Y' in different circumstances that lead to their being
- * classified as vowels under some circumstances and as consonants in others, for the purposes
- * of the 'fuzziness' component of the Soundex and Metaphone family of algorithms they will
- * always be treated here as vowels.<br><br>
- *
- * (2) Voiced and un-voiced consonant pairs are mapped to the same encoded value. This
- * means that:<br>
- * 'D' and 'T' -> 'T'<br>
- * 'B' and 'P' -> 'P'<br>
- * 'G' and 'K' -> 'K'<br>
- * 'Z' and 'S' -> 'S'<br>
- * 'V' and 'F' -> 'F'<br><br>
- *
- * - In addition to the above voiced/unvoiced rules, 'CH' and 'SH' -> 'X', where 'X'
- * represents the "-SH-" and "-CH-" sounds in Metaphone 3 encoding.<br><br>
- *
- * - Also, the sound that is spelled as "TH" in English is encoded to '0' (zero symbol). (Although
- * Americans are not usually aware of it, "TH" is pronounced in a voiced (e.g. "that") as
- * well as an unvoiced (e.g. "theater") form, which are naturally mapped to the same encoding.)<br><br>
- *
- * The encodings in this version of Metaphone 3 are according to pronunciations common in the
- * United States. This means that they will be inaccurate for consonant pronunciations that
- * are different in the United Kingdom, for example "tube" -> "CHOOBE" -> XAP rather than american TAP.<br><br>
- *
- * Metaphone 3 was preceded by Soundex, patented in 1919, and Metaphone and Double Metaphone,
- * developed by Lawrence Philips. All of these algorithms resulted in a significant number of
- * incorrect encodings. Metaphone3 was tested against a database of about 100 thousand English words,
- * names common in the United States, and non-English words found in publications in the United States,
- * with an emphasis on words that are commonly mispronounced, prepared by the Moby Words website,
- * but with the Moby Words 'phonetic' encodings algorithmically mapped to Double Metaphone encodings.
- * Metaphone3 increases the accuracy of encoding of english words, common names, and non-English
- * words found in american publications from the 89% for Double Metaphone, to over 98%.<br><br>
- *
- * DISCLAIMER:
- * Anthropomorphic Software LLC claims only that Metaphone 3 will return correct encodings,
- * within the 'fuzzy' definition of correct as above, for a very high percentage of correctly
- * spelled English and commonly recognized non-English words. Anthropomorphic Software LLC
- * warns the user that a number of words remain incorrectly encoded, that misspellings may not
- * be encoded 'properly', and that people often have differing ideas about the pronunciation
- * of a word. Therefore, Metaphone 3 is not guaranteed to return correct results every time, and
- * so a desired target word may very well be missed. Creators of commercial products should
- * keep in mind that systems like Metaphone 3 produce a 'best guess' result, and should
- * condition the expectations of end users accordingly.<br><br>
- *
- * METAPHONE3 IS PROVIDED "AS IS" WITHOUT
- * WARRANTY OF ANY KIND. LAWRENCE PHILIPS AND ANTHROPOMORPHIC SOFTWARE LLC
- * MAKE NO WARRANTIES, EXPRESS OR IMPLIED, THAT IT IS FREE OF ERROR,
- * OR ARE CONSISTENT WITH ANY PARTICULAR STANDARD OF MERCHANTABILITY,
- * OR THAT IT WILL MEET YOUR REQUIREMENTS FOR ANY PARTICULAR APPLICATION.
- * LAWRENCE PHILIPS AND ANTHROPOMORPHIC SOFTWARE LLC DISCLAIM ALL LIABILITY
- * FOR DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES RESULTING FROM USE
- * OF THIS SOFTWARE.
- *
- * @author Lawrence Philips
- *
- * Metaphone 3 is designed to return an <i>approximate</i> phonetic key (and an alternate
- * approximate phonetic key when appropriate) that should be the same for English
- * words, and most names familiar in the United States, that are pronounced "similarly".
- * The key value is <i>not</i> intended to be an exact phonetic, or even phonemic,
- * representation of the word. This is because a certain degree of 'fuzziness' has
- * proven to be useful in compensating for variations in pronunciation, as well as
- * misheard pronunciations. For example, although americans are not usually aware of it,
- * the letter 's' is normally pronounced 'z' at the end of words such as "sounds".<br><br>
- *
- * The 'approximate' aspect of the encoding is implemented according to the following rules:<br><br>
- *
- * (1) All vowels are encoded to the same value - 'A'. If the parameter encodeVowels
- * is set to false, only *initial* vowels will be encoded at all. If encodeVowels is set
- * to true, 'A' will be encoded at all places in the word that any vowels are normally
- * pronounced. 'W' as well as 'Y' are treated as vowels. Although there are differences in
- * the pronunciation of 'W' and 'Y' in different circumstances that lead to their being
- * classified as vowels under some circumstances and as consonants in others, for the purposes
- * of the 'fuzziness' component of the Soundex and Metaphone family of algorithms they will
- * always be treated here as vowels.<br><br>
- *
- * (2) Voiced and un-voiced consonant pairs are mapped to the same encoded value. This
- * means that:<br>
- * 'D' and 'T' -> 'T'<br>
- * 'B' and 'P' -> 'P'<br>
- * 'G' and 'K' -> 'K'<br>
- * 'Z' and 'S' -> 'S'<br>
- * 'V' and 'F' -> 'F'<br><br>
- *
- * - In addition to the above voiced/unvoiced rules, 'CH' and 'SH' -> 'X', where 'X'
- * represents the "-SH-" and "-CH-" sounds in Metaphone 3 encoding.<br><br>
- *
- * - Also, the sound that is spelled as "TH" in English is encoded to '0' (zero symbol). (Although
- * americans are not usually aware of it, "TH" is pronounced in a voiced (e.g. "that") as
- * well as an unvoiced (e.g. "theater") form, which are naturally mapped to the same encoding.)<br><br>
- *
- * In the "Exact" encoding, voiced/unvoiced pairs are <i>not</i> mapped to the same encoding, except
- * for the voiced and unvoiced versions of 'TH', sounds such as 'CH' and 'SH', and for 'S' and 'Z',
- * so that the words whose metaph keys match will in fact be closer in pronunciation than with the
- * more approximate setting. Keep in mind that encoding settings for search strings should always
- * be exactly the same as the encoding settings of the stored metaph keys in your database!
- * Because of the considerably increased accuracy of Metaphone3, it is now possible to use this
- * setting and have a very good chance of getting a correct encoding.
- * <br><br>
- * In the Encode Vowels encoding, all non-initial vowels and diphthongs will be encoded to
- * 'A', and there will only be one such vowel encoding character between any two consonants.
- * It turns out that there are some surprising wrinkles to encoding non-initial vowels in
- * practice, pre-eminently in inversions between spelling and pronunciation such as e.g.
- * "wrinkle" => 'RANKAL', where the last two sounds are inverted when spelled.
- * <br><br>
- * The encodings in this version of Metaphone 3 are according to pronunciations common in the
- * United States. This means that they will be inaccurate for consonant pronunciations that
- * are different in the United Kingdom, for example "tube" -> "CHOOBE" -> XAP rather than american TAP.
- * <br><br>
- *
- */
-
- package com.google.refine.clustering.binning;
-
- public class Metaphone3 {
-
- /** Length of word sent in to be encoded, as
- * measured at beginning of encoding. */
- int m_length;
-
- /** Length of encoded key string. */
- int m_metaphLength;
-
- /** Flag whether or not to encode non-initial vowels. */
- boolean m_encodeVowels;
-
- /** Flag whether or not to encode consonants as exactly
- * as possible. */
- boolean m_encodeExact;
-
- /** Internal copy of word to be encoded, allocated separately
- * from string pointed to in incoming parameter. */
- String m_inWord;
-
- /** Running copy of primary key. */
- StringBuffer m_primary;
-
- /** Running copy of secondary key. */
- StringBuffer m_secondary;
-
- /** Index of character in m_inWord currently being
- * encoded. */
- int m_current;
-
- /** Index of last character in m_inWord. */
- int m_last;
-
- /** Flag that an AL inversion has already been done. */
- boolean flag_AL_inversion;
-
- /** Default size of key storage allocation */
- int MAX_KEY_ALLOCATION = 32;
-
- /** Default maximum length of encoded key. */
- int DEFAULT_MAX_KEY_LENGTH = 8;
-
- ////////////////////////////////////////////////////////////////////////////////
- // Metaphone3 class definition
- ////////////////////////////////////////////////////////////////////////////////
-
/**
 * Default constructor. Creates empty primary and secondary key buffers,
 * sets the key length to DEFAULT_MAX_KEY_LENGTH and switches off both
 * non-initial vowel encoding and exact consonant encoding. No word is
 * set here; supply one with SetWord(String). This constructor is most
 * convenient when one object is reused to encode several words.
 */
Metaphone3()
{
    m_encodeExact = false;
    m_encodeVowels = false;
    m_metaphLength = DEFAULT_MAX_KEY_LENGTH;

    m_secondary = new StringBuffer();
    m_primary = new StringBuffer();
}
-
/**
 * Parameterized constructor. Performs the default initialization and
 * then stores the given word via SetWord(String), so the object is
 * ready to encode it. Most convenient when only one word needs to be
 * encoded.
 *
 * @param in word to be encoded
 */
Metaphone3(String in)
{
    // default settings first, then remember the word
    this();
    SetWord(in);
}
-
- /**
- * Sets word to be encoded.
- *
- * @param in pointer to EXTERNALLY ALLOCATED char string of
- * the word to be encoded.
- *
- */
- void SetWord(String in)
- {
- m_inWord = in.toUpperCase();;
- m_length = m_inWord.length();
- }
-
- /**
- * Sets length allocated for output keys.
- * If incoming number is greater than maximum allowable
- * length returned by GetMaximumKeyLength(), set key length
- * to maximum key length and return false; otherwise, set key
- * length to parameter value and return true.
- *
- * @param inKeyLength new length of key.
- * @return true if able to set key length to requested value.
- *
- */
- boolean SetKeyLength(int inKeyLength)
- {
- if(inKeyLength < 1)
- {
- // can't have that -
- // no room for terminating null
- inKeyLength = 1;
- }
-
- if(inKeyLength > MAX_KEY_ALLOCATION)
- {
- m_metaphLength = MAX_KEY_ALLOCATION;
- return false;
- }
-
- m_metaphLength = inKeyLength;
- return true;
- }
-
- /**
- * Adds an encoding character to the encoded key value string - one parameter version.
- *
- * @param main primary encoding character to be added to encoded key string.
- */
- void MetaphAdd(String in)
- {
- if(!(in.equals("A")
- && (m_primary.length() > 0)
- && (m_primary.charAt(m_primary.length() - 1) == 'A')))
- {
- m_primary.append(in);
- }
-
- if(!(in.equals("A")
- && (m_secondary.length() > 0)
- && (m_secondary.charAt(m_secondary.length() - 1) == 'A')))
- {
- m_secondary.append(in);
- }
- }
-
- /**
- * Adds an encoding character to the encoded key value string - two parameter version
- *
- * @param main primary encoding character to be added to encoded key string
- * @param alt alternative encoding character to be added to encoded alternative key string
- *
- */
- void MetaphAdd(String main, String alt)
- {
- if(!(main.equals("A")
- && (m_primary.length() > 0)
- && (m_primary.charAt(m_primary.length() - 1) == 'A')))
- {
- m_primary.append(main);
- }
-
- if(!(alt.equals("A")
- && (m_secondary.length() > 0)
- && (m_secondary.charAt(m_secondary.length() - 1) == 'A')))
- {
- if(!alt.isEmpty())
- {
- m_secondary.append(alt);
- }
- }
- }
-
- /**
- * Adds an encoding character to the encoded key value string - Exact/Approx version
- *
- * @param mainExact primary encoding character to be added to encoded key string if
- * m_encodeExact is set
- *
- * @param altExact alternative encoding character to be added to encoded alternative
- * key string if m_encodeExact is set
- *
- * @param main primary encoding character to be added to encoded key string
- *
- * @param alt alternative encoding character to be added to encoded alternative key string
- *
- */
- void MetaphAddExactApprox(String mainExact, String altExact, String main, String alt)
- {
- if(m_encodeExact)
- {
- MetaphAdd(mainExact, altExact);
- }
- else
- {
- MetaphAdd(main, alt);
- }
- }
-
- /**
- * Adds an encoding character to the encoded key value string - Exact/Approx version
- *
- * @param mainExact primary encoding character to be added to encoded key string if
- * m_encodeExact is set
- *
- * @param main primary encoding character to be added to encoded key string
- *
- */
- void MetaphAddExactApprox(String mainExact, String main)
- {
- if(m_encodeExact)
- {
- MetaphAdd(mainExact);
- }
- else
- {
- MetaphAdd(main);
- }
- }
- /** Retrieves maximum number of characters currently allocated for encoded key.
- *
- * @return short integer representing the length allowed for the key.
- */
- int GetKeyLength(){return m_metaphLength;}
-
- /** Retrieves maximum number of characters allowed for encoded key.
- *
- * @return short integer representing the length of allocated storage for the key.
- */
- int GetMaximumKeyLength(){return (int)MAX_KEY_ALLOCATION;}
-
- /** Sets flag that causes Metaphone3 to encode non-initial vowels. However, even
- * if there are more than one vowel sound in a vowel sequence (i.e.
- * vowel diphthong, etc.), only one 'A' will be encoded before the next consonant or the
- * end of the word.
- *
- * @param inEncodeVowels Non-initial vowels encoded if true, not if false.
- */
- void SetEncodeVowels(boolean inEncodeVowels){m_encodeVowels = inEncodeVowels;}
-
- /** Retrieves setting determining whether or not non-initial vowels will be encoded.
- *
- * @return true if the Metaphone3 object has been set to encode non-initial vowels, false if not.
- */
- boolean GetEncodeVowels(){return m_encodeVowels;}
-
- /** Sets flag that causes Metaphone3 to encode consonants as exactly as possible.
- * This does not include 'S' vs. 'Z', since americans will pronounce 'S' at the
- * at the end of many words as 'Z', nor does it include "CH" vs. "SH". It does cause
- * a distinction to be made between 'B' and 'P', 'D' and 'T', 'G' and 'K', and 'V'
- * and 'F'.
- *
- * @param inEncodeExact consonants to be encoded "exactly" if true, not if false.
- */
- void SetEncodeExact(boolean inEncodeExact){m_encodeExact = inEncodeExact;}
-
- /** Retrieves setting determining whether or not consonants will be encoded "exactly".
- *
- * @return true if the Metaphone3 object has been set to encode "exactly", false if not.
- */
- boolean GetEncodeExact(){return m_encodeExact;}
-
- /** Retrieves primary encoded key.
- *
- * @return a character pointer to the primary encoded key
- */
- String GetMetaph()
- {
- String primary = new String(m_primary);
- return primary;
- }
-
- /** Retrieves alternate encoded key, if any.
- *
- * @return a character pointer to the alternate encoded key
- */
- String GetAlternateMetaph()
- {
- String secondary = new String(m_secondary);
- return secondary;
- }
-
- /**
- * Test for close front vowels
- *
- * @return true if close front vowel
- */
- boolean Front_Vowel(int at)
- {
- if(((CharAt(at) == 'E') || (CharAt(at) == 'I') || (CharAt(at) == 'Y')))
- {
- return true;
- }
-
- return false;
- }
-
- /**
- * Detect names or words that begin with spellings
- * typical of german or slavic words, for the purpose
- * of choosing alternate pronunciations correctly
- *
- */
- boolean SlavoGermanic()
- {
- if(StringAt(0, 3, "SCH", "")
- || StringAt(0, 2, "SW", "")
- || (CharAt(0) == 'J')
- || (CharAt(0) == 'W'))
- {
- return true;
- }
-
- return false;
- }
- /**
- * Tests if character is a vowel
- *
- * @param inChar character to be tested in string to be encoded
- * @return true if character is a vowel, false if not
- *
- */
- boolean IsVowel(char inChar)
- {
- if((inChar == 'A')
- || (inChar == 'E')
- || (inChar == 'I')
- || (inChar == 'O')
- || (inChar == 'U')
- || (inChar == 'Y')
- || (inChar == 'Ŕ')
- || (inChar == 'Á')
- || (inChar == 'Â')
- || (inChar == 'Ă')
- || (inChar == 'Ä')
- || (inChar == 'Ĺ')
- || (inChar == 'Ć')
- || (inChar == 'Č')
- || (inChar == 'É')
- || (inChar == 'Ę')
- || (inChar == 'Ë')
- || (inChar == 'Ě')
- || (inChar == 'Í')
- || (inChar == 'Î')
- || (inChar == 'Ď')
- || (inChar == 'Ň')
- || (inChar == 'Ó')
- || (inChar == 'Ô')
- || (inChar == 'Ő')
- || (inChar == 'Ö')
- || (inChar == '?')
- || (inChar == 'Ř')
- || (inChar == 'Ů')
- || (inChar == 'Ú')
- || (inChar == 'Ű')
- || (inChar == 'Ü')
- || (inChar == 'Ý')
- || (inChar == '?'))
- {
- return true;
- }
-
- return false;
- }
-
- /**
- * Tests if character in the input string is a vowel
- *
- * @param at position of character to be tested in string to be encoded
- * @return true if character is a vowel, false if not
- *
- */
- boolean IsVowel(int at)
- {
- if((at < 0) || (at >= m_length))
- {
- return false;
- }
-
- char it = CharAt(at);
-
- if(IsVowel(it))
- {
- return true;
- }
-
- return false;
- }
-
- /**
- * Skips over vowels in a string. Has exceptions for skipping consonants that
- * will not be encoded.
- *
- * @param at position, in string to be encoded, of character to start skipping from
- *
- * @return position of next consonant in string to be encoded
- */
- int SkipVowels(int at)
- {
- if(at < 0)
- {
- return 0;
- }
-
- if(at >= m_length)
- {
- return m_length;
- }
-
- char it = CharAt(at);
-
- while(IsVowel(it) || (it == 'W'))
- {
- if(StringAt(at, 4, "WICZ", "WITZ", "WIAK", "")
- || StringAt((at - 1), 5, "EWSKI", "EWSKY", "OWSKI", "OWSKY", "")
- || (StringAt(at, 5, "WICKI", "WACKI", "") && ((at + 4) == m_last)))
- {
- break;
- }
-
- at++;
- if(((CharAt(at - 1) == 'W') && (CharAt(at) == 'H'))
- && !(StringAt(at, 3, "HOP", "")
- || StringAt(at, 4, "HIDE", "HARD", "HEAD", "HAWK", "HERD", "HOOK", "HAND", "HOLE", "")
- || StringAt(at, 5, "HEART", "HOUSE", "HOUND", "")
- || StringAt(at, 6, "HAMMER", "")))
- {
- at++;
- }
-
- if(at > (m_length - 1))
- {
- break;
- }
- it = CharAt(at);
- }
-
- return at;
- }
-
- /**
- * Advanced counter m_current so that it indexes the next character to be encoded
- *
- * @param ifNotEncodeVowels number of characters to advance if not encoding internal vowels
- * @param ifEncodeVowels number of characters to advance if encoding internal vowels
- *
- */
- void AdvanceCounter(int ifNotEncodeVowels, int ifEncodeVowels)
- {
- if(!m_encodeVowels)
- {
- m_current += ifNotEncodeVowels;
- }
- else
- {
- m_current += ifEncodeVowels;
- }
- }
-
-
- /**
- * Subscript safe .charAt()
- *
- * @param at index of character to access
- * @return null if index out of bounds, .charAt() otherwise
- */
- char CharAt(int at)
- {
- // check substring bounds
- if((at < 0)
- || (at > (m_length - 1)))
- {
- return '\0';
- }
-
- return m_inWord.charAt(at);
- }
-
- /**
- * Tests whether the word is the root or a regular english inflection
- * of it, e.g. "ache", "achy", "aches", "ached", "aching", "achingly"
- * This is for cases where we want to match only the root and corresponding
- * inflected forms, and not completely different words which may have the
- * same substring in them.
- */
- boolean RootOrInflections(String inWord, String root)
- {
- int len = root.length();
- String test;
-
- test = root + "S";
- if((inWord.equals(root))
- || (inWord.equals(test)))
- {
- return true;
- }
-
- if(root.charAt(len - 1) != 'E')
- {
- test = root + "ES";
- }
-
- if(inWord.equals(test))
- {
- return true;
- }
-
- if(root.charAt(len - 1) != 'E')
- {
- test = root + "ED";
- }
- else
- {
- test = root + "D";
- }
-
- if(inWord.equals(test))
- {
- return true;
- }
-
- if(root.charAt(len - 1) == 'E')
- {
- root = root.substring(0, len - 1);
- }
-
- test = root + "ING";
- if(inWord.equals(test))
- {
- return true;
- }
-
- test = root + "INGLY";
- if(inWord.equals(test))
- {
- return true;
- }
-
- test = root + "Y";
- if(inWord.equals(test))
- {
- return true;
- }
-
- return false;
- }
-
- /**
- * Determines if one of the substrings sent in is the same as
- * what is at the specified position in the string being encoded.
- *
- * @param start
- * @param length
- * @param compareStrings
- * @return
- */
- boolean StringAt(int start, int length, String... compareStrings)
- {
- // check substring bounds
- if((start < 0)
- || (start > (m_length - 1))
- || ((start + length - 1) > (m_length - 1)))
- {
- return false;
- }
-
- String target = m_inWord.substring(start, (start + length));
-
- for(String strFragment : compareStrings)
- {
- if(target.equals(strFragment))
- {
- return true;
- }
- }
- return false;
- }
-
- /**
- * Encodes input string to one or two key values according to Metaphone 3 rules.
- *
- */
- void Encode()
- {
- flag_AL_inversion = false;
-
- m_current = 0;
-
- m_primary.setLength(0);
- m_secondary.setLength(0);
-
- if(m_length < 1)
- {
- return;
- }
-
- //zero based index
- m_last = m_length - 1;
-
- ///////////main loop//////////////////////////
- while(!(m_primary.length() > m_metaphLength) && !(m_secondary.length() > m_metaphLength))
- {
- if(m_current >= m_length)
- {
- break;
- }
-
- switch(CharAt(m_current))
- {
- case 'B':
-
- Encode_B();
- break;
-
- case 'ß':
- case 'Ç':
-
- MetaphAdd("S");
- m_current++;
- break;
-
- case 'C':
-
- Encode_C();
- break;
-
- case 'D':
-
- Encode_D();
- break;
-
- case 'F':
-
- Encode_F();
- break;
-
- case 'G':
-
- Encode_G();
- break;
-
- case 'H':
-
- Encode_H();
- break;
-
- case 'J':
-
- Encode_J();
- break;
-
- case 'K':
-
- Encode_K();
- break;
-
- case 'L':
-
- Encode_L();
- break;
-
- case 'M':
-
- Encode_M();
- break;
-
- case 'N':
-
- Encode_N();
- break;
-
- case 'Ń':
-
- MetaphAdd("N");
- m_current++;
- break;
-
- case 'P':
-
- Encode_P();
- break;
-
- case 'Q':
-
- Encode_Q();
- break;
-
- case 'R':
-
- Encode_R();
- break;
-
- case 'S':
-
- Encode_S();
- break;
-
- case 'T':
-
- Encode_T();
- break;
-
- case 'Đ': // eth
- case 'Ţ': // thorn
-
- MetaphAdd("0");
- m_current++;
- break;
-
- case 'V':
-
- Encode_V();
- break;
-
- case 'W':
-
- Encode_W();
- break;
-
- case 'X':
-
- Encode_X();
- break;
-
- case '?':
-
- MetaphAdd("X");
- m_current++;
- break;
-
- case '?':
-
- MetaphAdd("S");
- m_current++;
- break;
-
- case 'Z':
-
- Encode_Z();
- break;
-
- default:
-
- if(IsVowel(CharAt(m_current)))
- {
- Encode_Vowels();
- break;
- }
-
- m_current++;
-
- }
- }
-
- //only give back m_metaphLength number of chars in m_metaph
- if(m_primary.length() > m_metaphLength)
- {
- m_primary.setLength(m_metaphLength);
- }
-
- if(m_secondary.length() > m_metaphLength)
- {
- m_secondary.setLength(m_metaphLength);
- }
-
- // it is possible for the two metaphs to be the same
- // after truncation. lose the second one if so
- if((m_primary.toString()).equals(m_secondary.toString()))
- {
- m_secondary.setLength(0);
- }
- }
-
- /**
- * Encodes all initial vowels to A.
- *
- * Encodes non-initial vowels to A if m_encodeVowels is true
- *
- *
- */
- void Encode_Vowels()
- {
- if(m_current == 0)
- {
- // all init vowels map to 'A'
- // as of Double Metaphone
- MetaphAdd("A");
- }
- else if(m_encodeVowels)
- {
- if(CharAt(m_current) != 'E')
- {
- if(Skip_Silent_UE())
- {
- return;
- }
-
- if (O_Silent())
- {
- m_current++;
- return;
- }
-
- // encode all vowels and
- // diphthongs to the same value
- MetaphAdd("A");
- }
- else
- {
- Encode_E_Pronounced();
- }
- }
-
- if(!(!IsVowel(m_current - 2) && StringAt((m_current - 1), 4, "LEWA", "LEWO", "LEWI", "")))
- {
- m_current = SkipVowels(m_current);
- }
- else
- {
- m_current++;
- }
- }
-
- /**
- * Encodes cases where non-initial 'e' is pronounced, taking
- * care to detect unusual cases from the greek.
- *
- * Only executed if non initial vowel encoding is turned on
- *
- *
- */
- void Encode_E_Pronounced()
- {
- // special cases with two pronunciations
- // 'agape' 'lame' 'resume'
- if((StringAt(0, 4, "LAME", "SAKE", "PATE", "") && (m_length == 4))
- || (StringAt(0, 5, "AGAPE", "") && (m_length == 5))
- || ((m_current == 5) && StringAt(0, 6, "RESUME", "")))
- {
- MetaphAdd("", "A");
- return;
- }
-
- // special case "inge" => 'INGA', 'INJ'
- if(StringAt(0, 4, "INGE", "")
- && (m_length == 4))
- {
- MetaphAdd("A", "");
- return;
- }
-
- // special cases with two pronunciations
- // special handling due to the difference in
- // the pronunciation of the '-D'
- if((m_current == 5) && StringAt(0, 7, "BLESSED", "LEARNED", ""))
- {
- MetaphAddExactApprox("D", "AD", "T", "AT");
- m_current += 2;
- return;
- }
-
- // encode all vowels and diphthongs to the same value
- if((!E_Silent()
- && !flag_AL_inversion
- && !Silent_Internal_E())
- || E_Pronounced_Exceptions())
- {
- MetaphAdd("A");
- }
-
- // now that we've visited the vowel in question
- flag_AL_inversion = false;
- }
-
- /**
- * Tests for cases where non-initial 'o' is not pronounced
- * Only executed if non initial vowel encoding is turned on
- *
- * @return true if encoded as silent - no addition to m_metaph key
- *
- */
- boolean O_Silent()
- {
- // if "iron" at beginning or end of word and not "irony"
- if ((CharAt(m_current) == 'O')
- && StringAt((m_current - 2), 4, "IRON", ""))
- {
- if ((StringAt(0, 4, "IRON", "")
- || (StringAt((m_current - 2), 4, "IRON", "")
- && (m_last == (m_current + 1))))
- && !StringAt((m_current - 2), 6, "IRONIC", ""))
- {
- return true;
- }
- }
-
- return false;
- }
-
- /**
- * Tests and encodes cases where non-initial 'e' is never pronounced
- * Only executed if non initial vowel encoding is turned on
- *
- * @return true if encoded as silent - no addition to m_metaph key
- *
- */
- boolean E_Silent()
- {
- if(E_Pronounced_At_End())
- {
- return false;
- }
-
- // 'e' silent when last letter, altho
- if((m_current == m_last)
- // also silent if before plural 's'
- // or past tense or participle 'd', e.g.
- // 'grapes' and 'banished' => PNXT
- || ((StringAt(m_last, 1, "S", "D", "")
- && (m_current > 1)
- && ((m_current + 1) == m_last)
- // and not e.g. "nested", "rises", or "pieces" => RASAS
- && !(StringAt((m_current - 1), 3, "TED", "SES", "CES", "")
- || StringAt(0, 9, "ANTIPODES", "ANOPHELES", "")
- || StringAt(0, 8, "MOHAMMED", "MUHAMMED", "MOUHAMED", "")
- || StringAt(0, 7, "MOHAMED", "")
- || StringAt(0, 6, "NORRED", "MEDVED", "MERCED", "ALLRED", "KHALED", "RASHED", "MASJED", "")
- || StringAt(0, 5, "JARED", "AHMED", "HAMED", "JAVED", "")
- || StringAt(0, 4, "ABED", "IMED", ""))))
- // e.g. 'wholeness', 'boneless', 'barely'
- || (StringAt((m_current + 1), 4, "NESS", "LESS", "") && ((m_current + 4) == m_last))
- || (StringAt((m_current + 1), 2, "LY", "") && ((m_current + 2) == m_last)
- && !StringAt(0, 6, "CICELY", "")))
- {
- return true;
- }
-
- return false;
- }
-
- /**
- * Tests for words where an 'E' at the end of the word
- * is pronounced
- *
- * special cases, mostly from the greek, spanish, japanese,
- * italian, and french words normally having an acute accent.
- * also, pronouns and articles
- *
- * Many Thanks to ali, QuentinCompson, JeffCO, ToonScribe, Xan,
- * Trafalz, and VictorLaszlo, all of them atriots from the Eschaton,
- * for all their fine contributions!
- *
- * @return true if 'E' at end is pronounced
- *
- */
- boolean E_Pronounced_At_End()
- {
- if((m_current == m_last)
- && (StringAt((m_current - 6), 7, "STROPHE", "")
- // if a vowel is before the 'E', vowel eater will have eaten it.
- //otherwise, consonant + 'E' will need 'E' pronounced
- || (m_length == 2)
- || ((m_length == 3) && !IsVowel(0))
- // these german name endings can be relied on to have the 'e' pronounced
- || (StringAt((m_last - 2), 3, "BKE", "DKE", "FKE", "KKE", "LKE",
- "NKE", "MKE", "PKE", "TKE", "VKE", "ZKE", "")
- && !StringAt(0, 5, "FINKE", "FUNKE", "")
- && !StringAt(0, 6, "FRANKE", ""))
- || StringAt((m_last - 4), 5, "SCHKE", "")
- || (StringAt(0, 4, "ACME", "NIKE", "CAFE", "RENE", "LUPE", "JOSE", "ESME", "") && (m_length == 4))
- || (StringAt(0, 5, "LETHE", "CADRE", "TILDE", "SIGNE", "POSSE", "LATTE", "ANIME", "DOLCE", "CROCE",
- "ADOBE", "OUTRE", "JESSE", "JAIME", "JAFFE", "BENGE", "RUNGE",
- "CHILE", "DESME", "CONDE", "URIBE", "LIBRE", "ANDRE", "") && (m_length == 5))
- || (StringAt(0, 6, "HECATE", "PSYCHE", "DAPHNE", "PENSKE", "CLICHE", "RECIPE",
- "TAMALE", "SESAME", "SIMILE", "FINALE", "KARATE", "RENATE", "SHANTE",
- "OBERLE", "COYOTE", "KRESGE", "STONGE", "STANGE", "SWAYZE", "FUENTE",
- "SALOME", "URRIBE", "") && (m_length == 6))
- || (StringAt(0, 7, "ECHIDNE", "ARIADNE", "MEINEKE", "PORSCHE", "ANEMONE", "EPITOME",
- "SYNCOPE", "SOUFFLE", "ATTACHE", "MACHETE", "KARAOKE", "BUKKAKE",
- "VICENTE", "ELLERBE", "VERSACE", "") && (m_length == 7))
- || (StringAt(0, 8, "PENELOPE", "CALLIOPE", "CHIPOTLE", "ANTIGONE", "KAMIKAZE", "EURIDICE",
- "YOSEMITE", "FERRANTE", "") && (m_length == 8))
- || (StringAt(0, 9, "HYPERBOLE", "GUACAMOLE", "XANTHIPPE", "") && (m_length == 9))
- || (StringAt(0, 10, "SYNECDOCHE", "") && (m_length == 10))))
- {
- return true;
- }
-
- return false;
- }
-
- /**
- * Detect internal silent 'E's e.g. "roseman",
- * "firestone"
- *
- */
- boolean Silent_Internal_E()
- {
- // 'olesen' but not 'olen' RAKE BLAKE
- if((StringAt(0, 3, "OLE", "")
- && E_Silent_Suffix(3) && !E_Pronouncing_Suffix(3))
- || (StringAt(0, 4, "BARE", "FIRE", "FORE", "GATE", "HAGE", "HAVE",
- "HAZE", "HOLE", "CAPE", "HUSE", "LACE", "LINE",
- "LIVE", "LOVE", "MORE", "MOSE", "MORE", "NICE",
- "RAKE", "ROBE", "ROSE", "SISE", "SIZE", "WARE",
- "WAKE", "WISE", "WINE", "")
- && E_Silent_Suffix(4) && !E_Pronouncing_Suffix(4))
- || (StringAt(0, 5, "BLAKE", "BRAKE", "BRINE", "CARLE", "CLEVE", "DUNNE",
- "HEDGE", "HOUSE", "JEFFE", "LUNCE", "STOKE", "STONE",
- "THORE", "WEDGE", "WHITE", "")
- && E_Silent_Suffix(5) && !E_Pronouncing_Suffix(5))
- || (StringAt(0, 6, "BRIDGE", "CHEESE", "")
- && E_Silent_Suffix(6) && !E_Pronouncing_Suffix(6))
- || StringAt((m_current - 5), 7, "CHARLES", ""))
- {
- return true;
- }
-
- return false;
- }
-
- /**
- * Detect conditions required
- * for the 'E' not to be pronounced
- *
- */
- boolean E_Silent_Suffix(int at)
- {
- if((m_current == (at - 1))
- && (m_length > (at + 1))
- && (IsVowel((at + 1))
- || (StringAt(at, 2, "ST", "SL", "")
- && (m_length > (at + 2)))))
- {
- return true;
- }
-
- return false;
- }
-
- /**
- * Detect endings that will
- * cause the 'e' to be pronounced
- *
- */
- boolean E_Pronouncing_Suffix(int at)
- {
- // e.g. 'bridgewood' - the other vowels will get eaten
- // up so we need to put one in here
- if((m_length == (at + 4)) && StringAt(at, 4, "WOOD", ""))
- {
- return true;
- }
-
- // same as above
- if((m_length == (at + 5)) && StringAt(at, 5, "WATER", "WORTH", ""))
- {
- return true;
- }
-
- // e.g. 'bridgette'
- if((m_length == (at + 3)) && StringAt(at, 3, "TTE", "LIA", "NOW", "ROS", "RAS", ""))
- {
- return true;
- }
-
- // e.g. 'olena'
- if((m_length == (at + 2)) && StringAt(at, 2, "TA", "TT", "NA", "NO", "NE",
- "RS", "RE", "LA", "AU", "RO", "RA", ""))
- {
- return true;
- }
-
- // e.g. 'bridget'
- if((m_length == (at + 1)) && StringAt(at, 1, "T", "R", ""))
- {
- return true;
- }
-
- return false;
- }
-
- /**
- * Exceptions where 'E' is pronounced where it
- * usually wouldn't be, and also some cases
- * where 'LE' transposition rules don't apply
- * and the vowel needs to be encoded here
- *
- * @return true if 'E' pronounced
- *
- */
- boolean E_Pronounced_Exceptions()
- {
- // greek names e.g. "herakles" or hispanic names e.g. "robles", where 'e' is pronounced, other exceptions
- if((((m_current + 1) == m_last)
- && (StringAt((m_current - 3), 5, "OCLES", "ACLES", "AKLES", "")
- || StringAt(0, 4, "INES", "")
- || StringAt(0, 5, "LOPES", "ESTES", "GOMES", "NUNES", "ALVES", "ICKES",
- "INNES", "PERES", "WAGES", "NEVES", "BENES", "DONES", "")
- || StringAt(0, 6, "CORTES", "CHAVES", "VALDES", "ROBLES", "TORRES", "FLORES", "BORGES",
- "NIEVES", "MONTES", "SOARES", "VALLES", "GEDDES", "ANDRES", "VIAJES",
- "CALLES", "FONTES", "HERMES", "ACEVES", "BATRES", "MATHES", "")
- || StringAt(0, 7, "DELORES", "MORALES", "DOLORES", "ANGELES", "ROSALES", "MIRELES", "LINARES",
- "PERALES", "PAREDES", "BRIONES", "SANCHES", "CAZARES", "REVELES", "ESTEVES",
- "ALVARES", "MATTHES", "SOLARES", "CASARES", "CACERES", "STURGES", "RAMIRES",
- "FUNCHES", "BENITES", "FUENTES", "PUENTES", "TABARES", "HENTGES", "VALORES", "")
- || StringAt(0, 8, "GONZALES", "MERCEDES", "FAGUNDES", "JOHANNES", "GONSALES", "BERMUDES",
- "CESPEDES", "BETANCES", "TERRONES", "DIOGENES", "CORRALES", "CABRALES",
- "MARTINES", "GRAJALES", "")
- || StringAt(0, 9, "CERVANTES", "FERNANDES", "GONCALVES", "BENEVIDES", "CIFUENTES", "SIFUENTES",
- "SERVANTES", "HERNANDES", "BENAVIDES", "")
- || StringAt(0, 10, "ARCHIMEDES", "CARRIZALES", "MAGALLANES", "")))
- || StringAt(m_current - 2, 4, "FRED", "DGES", "DRED", "GNES", "")
- || StringAt((m_current - 5), 7, "PROBLEM", "RESPLEN", "")
- || StringAt((m_current - 4), 6, "REPLEN", "")
- || StringAt((m_current - 3), 4, "SPLE", ""))
- {
- return true;
- }
-
- return false;
- }
-
- /**
- * Encodes "-UE".
- *
- * @return true if encoding handled in this routine, false if not
- */
- boolean Skip_Silent_UE()
- {
- // always silent except for cases listed below
- if((StringAt((m_current - 1), 3, "QUE", "GUE", "")
- && !StringAt(0, 8, "BARBEQUE", "PALENQUE", "APPLIQUE", "")
- // '-que' cases usually french but missing the acute accent
- && !StringAt(0, 6, "RISQUE", "")
- && !StringAt((m_current - 3), 5, "ARGUE", "SEGUE", "")
- && !StringAt(0, 7, "PIROGUE", "ENRIQUE", "")
- && !StringAt(0, 10, "COMMUNIQUE", ""))
- && (m_current > 1)
- && (((m_current + 1) == m_last)
- || StringAt(0, 7, "JACQUES", "")))
- {
- m_current = SkipVowels(m_current);
- return true;
- }
-
- return false;
- }
-
- /**
- * Encodes 'B'
- *
- *
- */
- void Encode_B()
- {
- if(Encode_Silent_B())
- {
- return;
- }
-
- // "-mb", e.g", "dumb", already skipped over under
- // 'M', altho it should really be handled here...
- MetaphAddExactApprox("B", "P");
-
- if((CharAt(m_current + 1) == 'B')
- || ((CharAt(m_current + 1) == 'P')
- && ((m_current + 1 < m_last) && (CharAt(m_current + 2) != 'H'))))
- {
- m_current += 2;
- }
- else
- {
- m_current++;
- }
- }
-
- /**
- * Encodes silent 'B' for cases not covered under "-mb-"
- *
- *
- * @return true if encoding handled in this routine, false if not
- *
- */
- boolean Encode_Silent_B()
- {
- //'debt', 'doubt', 'subtle'
- if(StringAt((m_current - 2), 4, "DEBT", "")
- || StringAt((m_current - 2), 5, "SUBTL", "")
- || StringAt((m_current - 2), 6, "SUBTIL", "")
- || StringAt((m_current - 3), 5, "DOUBT", ""))
- {
- MetaphAdd("T");
- m_current += 2;
- return true;
- }
-
- return false;
- }
-
- /**
- * Encodes 'C'
- *
- */
- void Encode_C()
- {
-
- if(Encode_Silent_C_At_Beginning()
- || Encode_CA_To_S()
- || Encode_CO_To_S()
- || Encode_CH()
- || Encode_CCIA()
- || Encode_CC()
- || Encode_CK_CG_CQ()
- || Encode_C_Front_Vowel()
- || Encode_Silent_C()
- || Encode_CZ()
- || Encode_CS())
- {
- return;
- }
-
- //else
- if(!StringAt((m_current - 1), 1, "C", "K", "G", "Q", ""))
- {
- MetaphAdd("K");
- }
-
- //name sent in 'mac caffrey', 'mac gregor
- if(StringAt((m_current + 1), 2, " C", " Q", " G", ""))
- {
- m_current += 2;
- }
- else
- {
- if(StringAt((m_current + 1), 1, "C", "K", "Q", "")
- && !StringAt((m_current + 1), 2, "CE", "CI", ""))
- {
- m_current += 2;
- // account for combinations such as Ro-ckc-liffe
- if(StringAt((m_current), 1, "C", "K", "Q", "")
- && !StringAt((m_current + 1), 2, "CE", "CI", ""))
- {
- m_current++;
- }
- }
- else
- {
- m_current++;
- }
- }
- }
-
- /**
- * Encodes cases where 'C' is silent at beginning of word
- *
- * @return true if encoding handled in this routine, false if not
- *
- */
- boolean Encode_Silent_C_At_Beginning()
- {
- //skip these when at start of word
- if((m_current == 0)
- && StringAt(m_current, 2, "CT", "CN", ""))
- {
- m_current += 1;
- return true;
- }
-
- return false;
- }
-
-
- /**
- * Encodes exceptions where "-CA-" should encode to S
- * instead of K including cases where the cedilla has not been used
- *
- * @return true if encoding handled in this routine, false if not
- *
- */
- boolean Encode_CA_To_S()
- {
- // Special case: 'caesar'.
- // Also, where cedilla not used, as in "linguica" => LNKS
- if(((m_current == 0) && StringAt(m_current, 4, "CAES", "CAEC", "CAEM", ""))
- || StringAt(0, 8, "FRANCAIS", "FRANCAIX", "LINGUICA", "")
- || StringAt(0, 6, "FACADE", "")
- || StringAt(0, 9, "GONCALVES", "PROVENCAL", ""))
- {
- MetaphAdd("S");
- AdvanceCounter(2, 1);
- return true;
- }
-
- return false;
- }
-
- /**
- * Encodes exceptions where "-CO-" encodes to S instead of K
- * including cases where the cedilla has not been used
- *
- * @return true if encoding handled in this routine, false if not
- *
- */
- boolean Encode_CO_To_S()
- {
- // e.g. 'coelecanth' => SLKN0
- if((StringAt(m_current, 4, "COEL", "")
- && (IsVowel(m_current + 4) || ((m_current + 3) == m_last)))
- || StringAt(m_current, 5, "COENA", "COENO", "")
- || StringAt(0, 8, "FRANCOIS", "MELANCON", "")
- || StringAt(0, 6, "GARCON", ""))
- {
- MetaphAdd("S");
- AdvanceCounter(3, 1);
- return true;
- }
-
- return false;
- }
-
- /**
- * Encode "-CH-"
- *
- * @return true if encoding handled in this routine, false if not
- *
- */
- boolean Encode_CH()
- {
- if(StringAt(m_current, 2, "CH", ""))
- {
- if(Encode_CHAE()
- || Encode_CH_To_H()
- || Encode_Silent_CH()
- || Encode_ARCH()
- // Encode_CH_To_X() should be
- // called before the germanic
- // and greek encoding functions
- || Encode_CH_To_X()
- || Encode_English_CH_To_K()
- || Encode_Germanic_CH_To_K()
- || Encode_Greek_CH_Initial()
- || Encode_Greek_CH_Non_Initial())
- {
- return true;
- }
-
- if(m_current > 0)
- {
- if(StringAt(0, 2, "MC", "")
- && (m_current == 1))
- {
- //e.g., "McHugh"
- MetaphAdd("K");
- }
- else
- {
- MetaphAdd("X", "K");
- }
- }
- else
- {
- MetaphAdd("X");
- }
- m_current += 2;
- return true;
- }
-
- return false;
- }
-
- /**
- * Encodes "-CHAE-"
- *
- * @return true if encoding handled in this routine, false if not
- *
- */
- boolean Encode_CHAE()
- {
- // e.g. 'michael'
- if(((m_current > 0) && StringAt((m_current + 2), 2, "AE", "")))
- {
- if(StringAt(0, 7, "RACHAEL", ""))
- {
- MetaphAdd("X");
- }
- else if(!StringAt((m_current - 1), 1, "C", "K", "G", "Q", ""))
- {
- MetaphAdd("K");
- }
-
- AdvanceCounter(4, 2);
- return true;
- }
-
- return false;
- }
-
- /**
- * Encdoes transliterations from the hebrew where the
- * sound 'kh' is represented as "-CH-". The normal pronounciation
- * of this in english is either 'h' or 'kh', and alternate
- * spellings most often use "-H-"
- *
- * @return true if encoding handled in this routine, false if not
- *
- */
- boolean Encode_CH_To_H()
- {
- // hebrew => 'H', e.g. 'channukah', 'chabad'
- if(((m_current == 0)
- && (StringAt((m_current + 2), 3, "AIM", "ETH", "ELM", "")
- || StringAt((m_current + 2), 4, "ASID", "AZAN", "")
- || StringAt((m_current + 2), 5, "UPPAH", "UTZPA", "ALLAH", "ALUTZ", "AMETZ", "")
- || StringAt((m_current + 2), 6, "ESHVAN", "ADARIM", "ANUKAH", "")
- || StringAt((m_current + 2), 7, "ALLLOTH", "ANNUKAH", "AROSETH", "")))
- // and an irish name with the same encoding
- || StringAt((m_current - 3), 7, "CLACHAN", ""))
- {
- MetaphAdd("H");
- AdvanceCounter(3, 2);
- return true;
- }
-
- return false;
- }
-
- /**
- * Encodes cases where "-CH-" is not pronounced
- *
- * @return true if encoding handled in this routine, false if not
- *
- */
- boolean Encode_Silent_CH()
- {
- // '-ch-' not pronounced
- if(StringAt((m_current - 2), 7, "FUCHSIA", "")
- || StringAt((m_current - 2), 5, "YACHT", "")
- || StringAt(0, 8, "STRACHAN", "")
- || StringAt(0, 8, "CRICHTON", "")
- || (StringAt((m_current - 3), 6, "DRACHM", ""))
- && !StringAt((m_current - 3), 7, "DRACHMA", ""))
- {
- m_current += 2;
- return true;
- }
-
- return false;
- }
-
- /**
- * Encodes "-CH-" to X
- * English language patterns
- *
- * @return true if encoding handled in this routine, false if not
- *
- */
- boolean Encode_CH_To_X()
- {
- // e.g. 'approach', 'beach'
- if((StringAt((m_current - 2), 4, "OACH", "EACH", "EECH", "OUCH", "OOCH", "MUCH", "SUCH", "")
- && !StringAt((m_current - 3), 5, "JOACH", ""))
- // e.g. 'dacha', 'macho'
- || (((m_current + 2) == m_last ) && StringAt((m_current - 1), 4, "ACHA", "ACHO", ""))
- || (StringAt(m_current, 4, "CHOT", "CHOD", "CHAT", "") && ((m_current + 3) == m_last))
- || ((StringAt((m_current - 1), 4, "OCHE", "") && ((m_current + 2) == m_last))
- && !StringAt((m_current - 2), 5, "DOCHE", ""))
- || StringAt((m_current - 4), 6, "ATTACH", "DETACH", "KOVACH", "")
- || StringAt((m_current - 5), 7, "SPINACH", "")
- || StringAt(0, 6, "MACHAU", "")
- || StringAt((m_current - 4), 8, "PARACHUT", "")
- || StringAt((m_current - 5), 8, "MASSACHU", "")
- || (StringAt((m_current - 3), 5, "THACH", "") && !StringAt((m_current - 1), 4, "ACHE", ""))
- || StringAt((m_current - 2), 6, "VACHON", "") )
- {
- MetaphAdd("X");
- m_current += 2;
- return true;
- }
-
- return false;
- }
-
- /**
- * Encodes "-CH-" to K in contexts of
- * initial "A" or "E" follwed by "CH"
- *
- * @return true if encoding handled in this routine, false if not
- *
- */
- boolean Encode_English_CH_To_K()
- {
- //'ache', 'echo', alternate spelling of 'michael'
- if(((m_current == 1) && RootOrInflections(m_inWord, "ACHE"))
- || (((m_current > 3) && RootOrInflections(m_inWord.substring(m_current - 1), "ACHE"))
- && (StringAt(0, 3, "EAR", "")
- || StringAt(0, 4, "HEAD", "BACK", "")
- || StringAt(0, 5, "HEART", "BELLY", "TOOTH", "")))
- || StringAt((m_current - 1), 4, "ECHO", "")
- || StringAt((m_current - 2), 7, "MICHEAL", "")
- || StringAt((m_current - 4), 7, "JERICHO", "")
- || StringAt((m_current - 5), 7, "LEPRECH", ""))
- {
- MetaphAdd("K", "X");
- m_current += 2;
- return true;
- }
-
- return false;
- }
-
- /**
- * Encodes "-CH-" to K in mostly germanic context
- * of internal "-ACH-", with exceptions
- *
- * @return true if encoding handled in this routine, false if not
- *
- */
- boolean Encode_Germanic_CH_To_K()
- {
- // various germanic
- // "<consonant><vowel>CH-"implies a german word where 'ch' => K
- if(((m_current > 1)
- && !IsVowel(m_current - 2)
- && StringAt((m_current - 1), 3, "ACH", "")
- && !StringAt((m_current - 2), 7, "MACHADO", "MACHUCA", "LACHANC", "LACHAPE", "KACHATU", "")
- && !StringAt((m_current - 3), 7, "KHACHAT", "")
- && ((CharAt(m_current + 2) != 'I')
- && ((CharAt(m_current + 2) != 'E')
- || StringAt((m_current - 2), 6, "BACHER", "MACHER", "MACHEN", "LACHER", "")) )
- // e.g. 'brecht', 'fuchs'
- || (StringAt((m_current + 2), 1, "T", "S", "")
- && !(StringAt(0, 11, "WHICHSOEVER", "") || StringAt(0, 9, "LUNCHTIME", "") ))
- // e.g. 'andromache'
- || StringAt(0, 4, "SCHR", "")
- || ((m_current > 2) && StringAt((m_current - 2), 5, "MACHE", ""))
- || ((m_current == 2) && StringAt((m_current - 2), 4, "ZACH", ""))
- || StringAt((m_current - 4), 6, "SCHACH", "")
- || StringAt((m_current - 1), 5, "ACHEN", "")
- || StringAt((m_current - 3), 5, "SPICH", "ZURCH", "BUECH", "")
- || (StringAt((m_current - 3), 5, "KIRCH", "JOACH", "BLECH", "MALCH", "")
- // "kirch" and "blech" both get 'X'
- && !(StringAt((m_current - 3),…