PageRenderTime 61ms CodeModel.GetById 31ms RepoModel.GetById 0ms app.codeStats 0ms

/src/thirdParty/Porter.java

https://bitbucket.org/teamwildtreechase/hatparsing
Java | 393 lines | 318 code | 72 blank | 3 comment | 123 complexity | 368ae6cf16aae023d4e1ca651ae3ab73 MD5 | raw file
  1. /*
  2. * Copyright (c) <2013> <Gideon Maillette de Buy Wenniger>. All Rights Reserved.
  3. */
  4. package thirdParty;
  /** Mutable holder for a single string value. */
  class NewString {
    /** The wrapped value; a freshly created holder contains the empty string. */
    public String str = "";

    /** Creates a holder whose {@link #str} is the empty string. */
    NewString() {
      // Nothing to do: the field initializer already set str to "".
    }
  }
  /**
   * Porter-style suffix stemmer for English words.
   *
   * <p>The single entry point is {@link #stripAffixes(String)}: it lower-cases the input, drops
   * every character that is not a letter or digit, removes one leading unit prefix
   * ("kilo", "micro", ...) and then applies the five suffix-stripping steps in order.
   *
   * <p>All methods are static and keep no state between calls.
   */
  public class Porter {

    /** Prefixes removed by {@link #stripPrefixes(String)}; the first one that matches wins. */
    private static final String[] PREFIXES = {
      "kilo", "micro", "milli", "intra", "ultra", "mega", "nano", "pico", "pseudo"
    };

    /** Step 2 rules as {suffix, replacement}; applied when the stem has measure > 0. */
    private static final String[][] STEP2_RULES = {
      {"ational", "ate"},
      {"tional", "tion"},
      {"enci", "ence"},
      {"anci", "ance"},
      {"izer", "ize"},
      {"iser", "ize"},
      {"abli", "able"},
      {"alli", "al"},
      {"entli", "ent"},
      {"eli", "e"},
      {"ousli", "ous"},
      {"ization", "ize"},
      {"isation", "ize"},
      {"ation", "ate"},
      {"ator", "ate"},
      {"alism", "al"},
      {"iveness", "ive"},
      {"fulness", "ful"},
      {"ousness", "ous"},
      {"aliti", "al"},
      {"iviti", "ive"},
      {"biliti", "ble"}
    };

    /** Step 3 rules as {suffix, replacement}; applied when the stem has measure > 0. */
    private static final String[][] STEP3_RULES = {
      {"icate", "ic"},
      {"ative", ""},
      {"alize", "al"},
      {"alise", "al"},
      {"iciti", "ic"},
      {"ical", "ic"},
      {"ful", ""},
      {"ness", ""}
    };

    /** Step 4 suffixes; removed when the remaining stem has measure > 1. */
    private static final String[] STEP4_SUFFIXES = {
      "al", "ance", "ence", "er", "ic", "able", "ible", "ant", "ement", "ment", "ent", "sion",
      "tion", "ou", "ism", "ate", "iti", "ous", "ive", "ize", "ise"
    };

    /** Returns {@code str} with every character that is not a letter or digit removed. */
    private static String clean(String str) {
      StringBuilder kept = new StringBuilder(str.length());
      for (int i = 0; i < str.length(); i++) {
        char ch = str.charAt(i);
        if (Character.isLetterOrDigit(ch)) {
          kept.append(ch);
        }
      }
      return kept.toString();
    }

    /**
     * Tells whether {@code word} ends with {@code suffix} and is strictly longer than it, so that
     * a non-empty stem remains after the suffix is removed.
     */
    private static boolean hasSuffix(String word, String suffix) {
      return word.length() > suffix.length() && word.endsWith(suffix);
    }

    /** Returns {@code str} without its last {@code count} characters. */
    private static String dropLast(String str, int count) {
      return str.substring(0, str.length() - count);
    }

    /** Tells whether {@code ch} is one of a, e, i, o, u. */
    private static boolean isPlainVowel(char ch) {
      return ch == 'a' || ch == 'e' || ch == 'i' || ch == 'o' || ch == 'u';
    }

    /**
     * Tells whether {@code ch} counts as a vowel given the character before it: a, e, i, o, u
     * always do, and 'y' does unless {@code prev} is one of a, e, i, o, u.
     */
    private static boolean vowel(char ch, char prev) {
      if (isPlainVowel(ch)) {
        return true;
      }
      return ch == 'y' && !isPlainVowel(prev);
    }

    /**
     * Counts the vowel-run / consonant transitions in {@code stem} (the "measure" used by the
     * step conditions).
     */
    private static int measure(String stem) {
      int i = 0;
      int count = 0;
      int length = stem.length();
      while (i < length) {
        // Advance to the next vowel. The first character is judged as if preceded by 'a',
        // which makes a leading 'y' a consonant.
        for (; i < length; i++) {
          char prev = (i > 0) ? stem.charAt(i - 1) : 'a';
          if (vowel(stem.charAt(i), prev)) {
            break;
          }
        }
        // Advance past the vowel run to the consonant that ends it.
        for (i++; i < length; i++) {
          if (!vowel(stem.charAt(i), stem.charAt(i - 1))) {
            break;
          }
        }
        // A consonant was found after the vowel run: one more transition.
        if (i < length) {
          count++;
          i++;
        }
      }
      return count;
    }

    /** Tells whether any character of {@code word} counts as a vowel (see {@link #vowel}). */
    private static boolean containsVowel(String word) {
      for (int i = 0; i < word.length(); i++) {
        char prev = (i > 0) ? word.charAt(i - 1) : 'a';
        if (vowel(word.charAt(i), prev)) {
          return true;
        }
      }
      return false;
    }

    /**
     * Tells whether {@code str} ends in consonant-vowel-consonant where the final consonant is
     * not 'w', 'x' or 'y'. Strings shorter than three characters never match.
     */
    private static boolean cvc(String str) {
      int length = str.length();
      if (length < 3) {
        return false;
      }
      char last = str.charAt(length - 1);
      char middle = str.charAt(length - 2);
      char first = str.charAt(length - 3);

      boolean lastIsPlainConsonant =
          !vowel(last, middle) && last != 'w' && last != 'x' && last != 'y';
      if (!lastIsPlainConsonant || !vowel(middle, first)) {
        return false;
      }
      // With exactly three characters there is no preceding character; '?' acts as a
      // non-vowel placeholder.
      char beforeFirst = (length == 3) ? '?' : str.charAt(length - 4);
      return !vowel(first, beforeFirst);
    }

    /**
     * Step 1: handles plural "s" forms, then "eed" / "ed" / "ing", then a final "y".
     *
     * @param str non-empty word
     * @return the word after step 1; the empty string when the input is exactly "s"
     */
    private static String step1(String str) {
      // 1a: plurals.
      if (str.charAt(str.length() - 1) == 's') {
        if (hasSuffix(str, "sses") || hasSuffix(str, "ies")) {
          str = dropLast(str, 2);
        } else {
          if (str.length() == 1) {
            return "";
          }
          if (str.charAt(str.length() - 2) != 's') {
            str = dropLast(str, 1);
          }
        }
      }

      // 1b: "eed" -> "ee" when the stem has measure > 0; otherwise "ed" / "ing" removal.
      if (hasSuffix(str, "eed")) {
        if (measure(dropLast(str, 3)) > 0) {
          str = dropLast(str, 1);
        }
      } else {
        String stem = null;
        if (hasSuffix(str, "ed")) {
          stem = dropLast(str, 2);
        } else if (hasSuffix(str, "ing")) {
          stem = dropLast(str, 3);
        }
        // The stem must contain a vowel for BOTH suffixes. The previous condition
        // (ed || ing && vowel) skipped this test for "ed", turning "bed" into "b".
        if (stem != null && containsVowel(stem)) {
          str = stem;
          if (str.length() == 1) {
            return str;
          }
          if (hasSuffix(str, "at") || hasSuffix(str, "bl") || hasSuffix(str, "iz")) {
            str += "e";
          } else {
            int length = str.length();
            char last = str.charAt(length - 1);
            // Collapses a doubled final character other than l, s, z. The comparison is on
            // characters only, so doubled vowels are collapsed as well.
            if (last == str.charAt(length - 2) && last != 'l' && last != 's' && last != 'z') {
              str = dropLast(str, 1);
            } else if (measure(str) == 1 && cvc(str)) {
              str += "e";
            }
          }
        }
      }

      // 1c: final "y" -> "i" when the stem contains a vowel.
      if (hasSuffix(str, "y") && containsVowel(dropLast(str, 1))) {
        str = dropLast(str, 1) + "i";
      }
      return str;
    }

    /**
     * Applies the first rule in {@code rules} whose suffix matches {@code str} and whose stem has
     * measure > 0, replacing the suffix with the rule's replacement.
     *
     * @param rules table of {suffix, replacement} pairs, tried in order
     * @return the rewritten word, or {@code str} unchanged when no rule applies
     */
    private static String applyRules(String str, String[][] rules) {
      for (String[] rule : rules) {
        if (hasSuffix(str, rule[0])) {
          String stem = dropLast(str, rule[0].length());
          if (measure(stem) > 0) {
            return stem + rule[1];
          }
        }
      }
      return str;
    }

    /** Step 2: rewrites suffixes according to {@link #STEP2_RULES}. */
    private static String step2(String str) {
      return applyRules(str, STEP2_RULES);
    }

    /** Step 3: rewrites suffixes according to {@link #STEP3_RULES}. */
    private static String step3(String str) {
      return applyRules(str, STEP3_RULES);
    }

    /**
     * Step 4: removes the first suffix from {@link #STEP4_SUFFIXES} that matches and leaves a
     * stem with measure > 1.
     */
    private static String step4(String str) {
      for (String suffix : STEP4_SUFFIXES) {
        if (hasSuffix(str, suffix)) {
          String stem = dropLast(str, suffix.length());
          if (measure(stem) > 1) {
            return stem;
          }
        }
      }
      return str;
    }

    /**
     * Step 5: removes a final "e" and reduces a final "ll" to "l" when the measure allows it.
     *
     * @param str non-empty word
     */
    private static String step5(String str) {
      if (str.charAt(str.length() - 1) == 'e') {
        // measure(str) equals the measure of the stem here because the word ends in a vowel.
        int m = measure(str);
        if (m > 1) {
          str = dropLast(str, 1);
        } else if (m == 1) {
          String stem = dropLast(str, 1);
          if (!cvc(stem)) {
            str = stem;
          }
        }
      }
      if (str.length() == 1) {
        return str;
      }
      int length = str.length();
      if (str.charAt(length - 1) == 'l' && str.charAt(length - 2) == 'l' && measure(str) > 1) {
        str = dropLast(str, 1);
      }
      return str;
    }

    /**
     * Removes the first prefix from {@link #PREFIXES} that {@code str} starts with; the result
     * may be empty when the word is exactly the prefix.
     */
    private static String stripPrefixes(String str) {
      for (String prefix : PREFIXES) {
        if (str.startsWith(prefix)) {
          return str.substring(prefix.length());
        }
      }
      return str;
    }

    /** Runs steps 1 to 5 in order, stopping early once the word has become empty. */
    private static String stripSuffixes(String str) {
      str = step1(str);
      if (str.length() >= 1) {
        str = step2(str);
      }
      if (str.length() >= 1) {
        str = step3(str);
      }
      if (str.length() >= 1) {
        str = step4(str);
      }
      if (str.length() >= 1) {
        str = step5(str);
      }
      return str;
    }

    /**
     * Stems a single word.
     *
     * <p>The input is lower-cased (locale-independently) and stripped of every character that is
     * not a letter or digit. Words of one or two characters are returned in that cleaned form;
     * longer words additionally lose a known prefix and have the suffix steps applied.
     *
     * @param str the word to stem; must not be {@code null}
     * @return the stemmed word, possibly the empty string
     * @throws NullPointerException if {@code str} is {@code null}
     */
    public static String stripAffixes(String str) {
      // Locale.ROOT keeps the result independent of the JVM default locale.
      str = clean(str.toLowerCase(java.util.Locale.ROOT));
      if (str.length() > 2) {
        str = stripPrefixes(str);
        if (!str.isEmpty()) {
          str = stripSuffixes(str);
        }
      }
      return str;
    }
  }