/src/thirdParty/Porter.java
https://bitbucket.org/teamwildtreechase/hatparsing · Java · 393 lines · 318 code · 72 blank · 3 comment · 123 complexity · 368ae6cf16aae023d4e1ca651ae3ab73 MD5 · raw file
- /*
- * Copyright (c) <2013> <Gideon Maillette de Buy Wenniger>. All Rights Reserved.
- */
- package thirdParty;
/**
 * Minimal mutable wrapper around a single {@code String}, usable as an
 * "out" argument by code in this package.
 */
class NewString {

    /** The wrapped value; starts out as the empty string. */
    public String str = "";

    /** Creates a holder whose {@link #str} is the empty string. */
    NewString() {
        // Nothing to do: the field initializer already sets str to "".
    }
}
/**
 * Affix-stripping stemmer organised in the five steps of the Porter
 * suffix-stripping algorithm, preceded by a small prefix-stripping pass.
 *
 * <p>The only public entry point is {@link #stripAffixes(String)}; every other
 * member is a private, stateless helper.
 */
public class Porter {

    /** Prefixes removed before suffix stripping; only the first match is removed. */
    private static final String[] PREFIXES = {
        "kilo", "micro", "milli", "intra", "ultra", "mega", "nano", "pico", "pseudo"
    };

    /** Step 2 rules as {suffix, replacement}; tried in order, first applicable rule wins. */
    private static final String[][] STEP2_RULES = {
        { "ational", "ate" },
        { "tional", "tion" },
        { "enci", "ence" },
        { "anci", "ance" },
        { "izer", "ize" },
        { "iser", "ize" },
        { "abli", "able" },
        { "alli", "al" },
        { "entli", "ent" },
        { "eli", "e" },
        { "ousli", "ous" },
        { "ization", "ize" },
        { "isation", "ize" },
        { "ation", "ate" },
        { "ator", "ate" },
        { "alism", "al" },
        { "iveness", "ive" },
        { "fulness", "ful" },
        { "ousness", "ous" },
        { "aliti", "al" },
        { "iviti", "ive" },
        { "biliti", "ble" }
    };

    /** Step 3 rules as {suffix, replacement}; tried in order, first applicable rule wins. */
    private static final String[][] STEP3_RULES = {
        { "icate", "ic" },
        { "ative", "" },
        { "alize", "al" },
        { "alise", "al" },
        { "iciti", "ic" },
        { "ical", "ic" },
        { "ful", "" },
        { "ness", "" }
    };

    /**
     * Step 4 suffixes, removed outright when the remaining stem has measure &gt; 1.
     *
     * NOTE(review): the published Porter algorithm removes only "ion" after a
     * stem ending in s/t, whereas this table strips the whole "sion"/"tion".
     * Kept as found so existing outputs do not change -- confirm intent.
     */
    private static final String[] STEP4_SUFFIXES = {
        "al", "ance", "ence", "er", "ic", "able", "ible", "ant", "ement", "ment", "ent",
        "sion", "tion", "ou", "ism", "ate", "iti", "ous", "ive", "ize", "ise"
    };

    /** Returns {@code str} with every character that is not a letter or digit removed. */
    private static String clean(String str) {
        StringBuilder kept = new StringBuilder(str.length());
        for (int i = 0; i < str.length(); i++) {
            char c = str.charAt(i);
            if (Character.isLetterOrDigit(c)) {
                kept.append(c);
            }
        }
        return kept.toString();
    }

    /**
     * Returns the part of {@code word} that precedes {@code suffix}, or
     * {@code null} when {@code word} does not end with {@code suffix} or is not
     * strictly longer than it (so a returned stem is never empty).
     */
    private static String stemBefore(String word, String suffix) {
        if (word.length() <= suffix.length() || !word.endsWith(suffix)) {
            return null;
        }
        return word.substring(0, word.length() - suffix.length());
    }

    /** Returns true for the five plain vowels a, e, i, o, u. */
    private static boolean isPlainVowel(char ch) {
        return ch == 'a' || ch == 'e' || ch == 'i' || ch == 'o' || ch == 'u';
    }

    /**
     * Decides whether {@code ch} counts as a vowel given the character before it.
     * a/e/i/o/u always do; 'y' does unless {@code prev} is one of a/e/i/o/u;
     * everything else does not.
     */
    private static boolean vowel(char ch, char prev) {
        if (isPlainVowel(ch)) {
            return true;
        }
        return ch == 'y' && !isPlainVowel(prev);
    }

    /**
     * Vowel test for position {@code i} of {@code s}. Position 0 is judged as if
     * preceded by 'a', which makes a leading 'y' a consonant.
     */
    private static boolean isVowelAt(String s, int i) {
        char prev = (i > 0) ? s.charAt(i - 1) : 'a';
        return vowel(s.charAt(i), prev);
    }

    /**
     * Counts the vowel-run-followed-by-consonant transitions in {@code stem}
     * (the "measure" used by the step conditions).
     */
    private static int measure(String stem) {
        int length = stem.length();
        int count = 0;
        int i = 0;
        while (i < length) {
            // Advance over consonants to the next vowel (or the end).
            while (i < length && !isVowelAt(stem, i)) {
                i++;
            }
            // Step past that vowel, then over the rest of the vowel run.
            i++;
            while (i < length && isVowelAt(stem, i)) {
                i++;
            }
            // Stopping on a consonant inside the word closes one VC pair.
            if (i < length) {
                count++;
                i++;
            }
        }
        return count;
    }

    /** Returns true when any position of {@code word} holds a vowel (see {@link #isVowelAt}). */
    private static boolean containsVowel(String word) {
        for (int i = 0; i < word.length(); i++) {
            if (isVowelAt(word, i)) {
                return true;
            }
        }
        return false;
    }

    /**
     * Returns true when {@code str} ends consonant-vowel-consonant and the final
     * consonant is not w, x or y. Strings shorter than three characters never match.
     */
    private static boolean cvc(String str) {
        int length = str.length();
        if (length < 3) {
            return false;
        }
        char last = str.charAt(length - 1);
        char middle = str.charAt(length - 2);
        char first = str.charAt(length - 3);
        if (vowel(last, middle) || last == 'w' || last == 'x' || last == 'y') {
            return false;
        }
        if (!vowel(middle, first)) {
            return false;
        }
        // For a three-letter string the leading character is judged with '?' as
        // predecessor, so a leading 'y' counts as a vowel here.
        char beforeFirst = (length == 3) ? '?' : str.charAt(length - 4);
        return !vowel(first, beforeFirst);
    }

    /** Returns {@code str} without its last character. */
    private static String dropLast(String str) {
        return str.substring(0, str.length() - 1);
    }

    /**
     * Step 1: plural endings (sses/ies/s), then eed/ed/ing, then a final
     * y -> i rewrite. Expects a non-empty {@code str}.
     */
    private static String step1(String str) {
        // 1a: plurals.
        if (str.charAt(str.length() - 1) == 's') {
            if (stemBefore(str, "sses") != null || stemBefore(str, "ies") != null) {
                str = str.substring(0, str.length() - 2);
            } else {
                if (str.length() == 1) {
                    return "";
                }
                if (str.charAt(str.length() - 2) != 's') {
                    str = dropLast(str);
                }
            }
        }

        // 1b: eed / ed / ing.
        String stem = stemBefore(str, "eed");
        if (stem != null) {
            if (measure(stem) > 0) {
                str = dropLast(str);
            }
        } else {
            stem = stemBefore(str, "ed");
            if (stem == null) {
                stem = stemBefore(str, "ing");
            }
            // The vowel requirement applies to both "ed" and "ing"; previously
            // operator precedence exempted "ed", turning e.g. "shed" into "sh".
            if (stem != null && containsVowel(stem)) {
                str = stem;
                if (str.length() == 1) {
                    return str;
                }
                if (stemBefore(str, "at") != null
                        || stemBefore(str, "bl") != null
                        || stemBefore(str, "iz") != null) {
                    str += "e";
                } else {
                    int length = str.length();
                    char last = str.charAt(length - 1);
                    if (last == str.charAt(length - 2) && last != 'l' && last != 's' && last != 'z') {
                        // Undouble a final double consonant other than l, s, z.
                        str = dropLast(str);
                    } else if (measure(str) == 1 && cvc(str)) {
                        str += "e";
                    }
                }
            }
        }

        // 1c: trailing y becomes i when the stem contains a vowel.
        stem = stemBefore(str, "y");
        if (stem != null && containsVowel(stem)) {
            str = stem + "i";
        }
        return str;
    }

    /**
     * Applies the first rule in {@code rules} whose suffix ends {@code str} and
     * whose remaining stem has measure &gt; 0; returns {@code str} unchanged if
     * no rule applies. A rule that matches but fails the measure test does not
     * stop the search.
     */
    private static String applyFirstRule(String str, String[][] rules) {
        for (String[] rule : rules) {
            String stem = stemBefore(str, rule[0]);
            if (stem != null && measure(stem) > 0) {
                return stem + rule[1];
            }
        }
        return str;
    }

    /** Step 2: maps double suffixes to single ones using {@link #STEP2_RULES}. */
    private static String step2(String str) {
        return applyFirstRule(str, STEP2_RULES);
    }

    /** Step 3: rewrites or removes the suffixes in {@link #STEP3_RULES}. */
    private static String step3(String str) {
        return applyFirstRule(str, STEP3_RULES);
    }

    /**
     * Step 4: removes the first suffix from {@link #STEP4_SUFFIXES} that ends
     * {@code str} and leaves a stem of measure &gt; 1.
     */
    private static String step4(String str) {
        for (String suffix : STEP4_SUFFIXES) {
            String stem = stemBefore(str, suffix);
            if (stem != null && measure(stem) > 1) {
                return stem;
            }
        }
        return str;
    }

    /**
     * Step 5: drops a final 'e' (measure &gt; 1, or measure 1 and the stem is not
     * consonant-vowel-consonant) and reduces a final "ll" to "l" when the
     * measure is &gt; 1. Expects a non-empty {@code str}.
     */
    private static String step5(String str) {
        if (str.charAt(str.length() - 1) == 'e') {
            int m = measure(str);
            String stem = dropLast(str);
            if (m > 1 || (m == 1 && !cvc(stem))) {
                str = stem;
            }
        }

        int length = str.length();
        if (length < 2) {
            return str;
        }
        if (str.charAt(length - 1) == 'l' && str.charAt(length - 2) == 'l' && measure(str) > 1) {
            str = dropLast(str);
        }
        return str;
    }

    /** Removes the first prefix from {@link #PREFIXES} that {@code str} starts with, if any. */
    private static String stripPrefixes(String str) {
        for (String prefix : PREFIXES) {
            if (str.startsWith(prefix)) {
                return str.substring(prefix.length());
            }
        }
        return str;
    }

    /** Runs steps 1 to 5 in order, skipping the remaining steps once the word becomes empty. */
    private static String stripSuffixes(String str) {
        str = step1(str);
        if (!str.isEmpty()) {
            str = step2(str);
        }
        if (!str.isEmpty()) {
            str = step3(str);
        }
        if (!str.isEmpty()) {
            str = step4(str);
        }
        if (!str.isEmpty()) {
            str = step5(str);
        }
        return str;
    }

    /**
     * Stems a single word: lower-cases it, drops every character that is not a
     * letter or digit, and, if more than two characters remain, strips one known
     * prefix followed by the suffix steps.
     *
     * @param str the word to stem
     * @return the stemmed word; words of at most two cleaned characters are
     *         returned lower-cased and cleaned but otherwise untouched
     * @throws NullPointerException if {@code str} is {@code null}
     */
    public static String stripAffixes(String str) {
        // Locale.ROOT keeps the result independent of the JVM default locale
        // (e.g. Turkish dotless-i mapping would defeat the suffix tables).
        str = clean(str.toLowerCase(java.util.Locale.ROOT));

        if (str.length() > 2) {
            str = stripPrefixes(str);
            // Compare by content, not by reference, to detect an empty remainder.
            if (!str.isEmpty()) {
                str = stripSuffixes(str);
            }
        }
        return str;
    }
}