PageRenderTime 57ms CodeModel.GetById 19ms RepoModel.GetById 0ms app.codeStats 1ms

/code/FeatureVectorObj.java

http://silkin.googlecode.com/
Java | 724 lines | 568 code | 50 blank | 106 comment | 106 complexity | 3310a636eb7f77849867c0ff7c6506de MD5 | raw file
  1. //
  2. // FeatureVectorObj.java
  3. // SILKin
  4. //
  5. // Created by Gary Morris on Sun Jul 25 2004.
  6. //
  7. import java.util.*;
  8. import java.io.*;
  9. import java.text.*;
  10. /**
  11. A FeatureVectorObj implements a mixed-type vector of integers, reals, and lists. Collectively,
  12. these vector elements define the high-level characteristics of a {@link DomainTheory}. This vector will be used to
  13. compare any two <code>DomainTheories</code> and compute their similarity.
  14. @author Gary Morris, Northern Virginia Community College garymorris2245@verizon.net
  15. */
  16. public class FeatureVectorObj implements Serializable {
  17. // These 4 arrays tabulate the distributions of the 2 integer-valued features --
  18. // and float- and symbol-valued features -- over all DomainTheories.
  19. // These are filled by DomainTheory.computeFeatureVector().
  20. // The float arrays are initialized in Library.generateSimMatrix().
  21. public static SymbolDistributionObj el_3_distn, el_4_distn, el_8_distn, el_9_distn;
  22. public static FloatDistributionObj el_1_distn, el_2_distn, el_5_distn;
  23. public static final double[] zTable = {0.500d, 0.5040d, 0.5080d, 0.5120d, 0.5160d, 0.5199d, 0.5239d, 0.5279d, 0.5319d, 0.5359d,
  24. 0.5398d, 0.5438d, 0.5478d, 0.5517d, 0.5557d, 0.5596d, 0.5636d, 0.5675d, 0.5714d, 0.5753d,
  25. 0.5793d, 0.5832d, 0.5871d, 0.5910d, 0.5948d, 0.5987d, 0.6026d, 0.6064d, 0.6103d, 0.6141d,
  26. 0.6179d, 0.6217d, 0.6255d, 0.6293d, 0.6331d, 0.6368d, 0.6406d, 0.6443d, 0.6480d, 0.6517d,
  27. 0.6554d, 0.6591d, 0.6628d, 0.6664d, 0.6700d, 0.6736d, 0.6772d, 0.6808d, 0.6844d, 0.6879d,
  28. 0.6915d, 0.6950d, 0.6985d, 0.7019d, 0.7054d, 0.7088d, 0.7123d, 0.7157d, 0.7190d, 0.7224d,
  29. 0.7257d, 0.7291d, 0.7324d, 0.7357d, 0.7389d, 0.7422d, 0.7454d, 0.7486d, 0.7517d, 0.7549d,
  30. 0.7580d, 0.7611d, 0.7642d, 0.7673d, 0.7704d, 0.7734d, 0.7764d, 0.7794d, 0.7823d, 0.7852d,
  31. 0.7881d, 0.7910d, 0.7939d, 0.7967d, 0.7995d, 0.8023d, 0.8051d, 0.8078d, 0.8106d, 0.8133d,
  32. 0.8159d, 0.8186d, 0.8212d, 0.8238d, 0.8264d, 0.8289d, 0.8315d, 0.8340d, 0.8365d, 0.8389d,
  33. 0.8413d, 0.8438d, 0.8461d, 0.8485d, 0.8508d, 0.8531d, 0.8554d, 0.8577d, 0.8599d, 0.8621d,
  34. 0.8643d, 0.8665d, 0.8686d, 0.8708d, 0.8729d, 0.8749d, 0.8770d, 0.8790d, 0.8810d, 0.8830d,
  35. 0.8840d, 0.8869d, 0.8888d, 0.8907d, 0.8925d, 0.8944d, 0.8962d, 0.8980d, 0.8997d, 0.9015d,
  36. 0.9032d, 0.9049d, 0.9066d, 0.9082d, 0.9099d, 0.9115d, 0.9131d, 0.9147d, 0.9162d, 0.9177d,
  37. 0.9192d, 0.9207d, 0.9222d, 0.9236d, 0.9251d, 0.9265d, 0.9278d, 0.9292d, 0.9306d, 0.9319d,
  38. 0.9332d, 0.9345d, 0.9357d, 0.9370d, 0.9382d, 0.9394d, 0.9406d, 0.9418d, 0.9429d, 0.9441d,
  39. 0.9452d, 0.9463d, 0.9474d, 0.9484d, 0.9485d, 0.9505d, 0.9515d, 0.9525d, 0.9535d, 0.9545d,
  40. 0.9554d, 0.9564d, 0.9573d, 0.9582d, 0.9591d, 0.9599d, 0.9608d, 0.9616d, 0.9625d, 0.9633d,
  41. 0.9641d, 0.9649d, 0.9656d, 0.9664d, 0.9671d, 0.9678d, 0.9686d, 0.9693d, 0.9699d, 0.9706d,
  42. 0.9713d, 0.9719d, 0.9726d, 0.9732d, 0.9738d, 0.9744d, 0.9750d, 0.9756d, 0.9761d, 0.9767d,
  43. 0.9772d, 0.9778d, 0.9783d, 0.9788d, 0.9793d, 0.9798d, 0.9803d, 0.9808d, 0.9812d, 0.9817d,
  44. 0.9821d, 0.9826d, 0.9830d, 0.9834d, 0.9838d, 0.9842d, 0.9846d, 0.9850d, 0.9854d, 0.9857d,
  45. 0.9861d, 0.9864d, 0.9868d, 0.9871d, 0.9875d, 0.9878d, 0.9881d, 0.9884d, 0.9887d, 0.9890d,
  46. 0.9893d, 0.9896d, 0.9898d, 0.9901d, 0.9904d, 0.9906d, 0.9909d, 0.9911d, 0.9913d, 0.9916d,
  47. 0.9918d, 0.9920d, 0.9922d, 0.9925d, 0.9927d, 0.9929d, 0.9931d, 0.9932d, 0.9934d, 0.9936d,
  48. 0.9938d, 0.9940d, 0.9941d, 0.9943d, 0.9945d, 0.9946d, 0.9948d, 0.9949d, 0.9951d, 0.9952d,
  49. 0.9953d, 0.9955d, 0.9956d, 0.9957d, 0.9959d, 0.9960d, 0.9961d, 0.9962d, 0.9963d, 0.9964d,
  50. 0.9965d, 0.9966d, 0.9967d, 0.9968d, 0.9969d, 0.9970d, 0.9971d, 0.9972d, 0.9973d, 0.9974d,
  51. 0.9974d, 0.9975d, 0.9976d, 0.9977d, 0.9977d, 0.9978d, 0.9979d, 0.9979d, 0.9980d, 0.9981d,
  52. 0.9981d, 0.9982d, 0.9982d, 0.9983d, 0.9984d, 0.9984d, 0.9985d, 0.9985d, 0.9986d, 0.9986d,
  53. 0.9987d, 0.9987d, 0.9987d, 0.9988d, 0.9988d, 0.9989d, 0.9989d, 0.9989d, 0.9990d, 0.9990d,
  54. 0.9990d, 0.9991d, 0.9991d, 0.9991d, 0.9992d, 0.9992d, 0.9992d, 0.9992d, 0.9993d, 0.9993d,
  55. 0.9993d, 0.9993d, 0.9994d, 0.9994d, 0.9994d, 0.9994d, 0.9994d, 0.9995d, 0.9995d, 0.9995d,
  56. 0.9995d, 0.9995d, 0.9995d, 0.9996d, 0.9996d, 0.9996d, 0.9996d, 0.9996d, 0.9996d, 0.9997d,
  57. 0.9997d, 0.9997d, 0.9997d, 0.9997d, 0.9997d, 0.9997d, 0.9997d, 0.9997d, 0.9997d, 0.9998d};
  58. public String languageName; // of the DomainTheory this FeatureVector represents
  59. public boolean addressTerms; // is this a FV for Terms of Address?
  60. public int fvID;
  61. public boolean genSkewing = false; // element 0
  62. public float percentRecip = 0; // element 1
  63. public float percentMultiGen = 0; // element 2
  64. public ArrayList<Object> ivcList = new ArrayList<Object>(); // element 3
  65. public ArrayList<Object> hcvcList = new ArrayList<Object>(); // element 4
  66. public float avgLateralCount; // element 5
  67. public boolean stepTerms; // element 6
  68. public boolean udps; // element 7
  69. public TreeSet exactSigSet; // element 8
  70. public TreeSet structSigSet; // element 9
  71. static abstract class DistributionObj {
  72. ArrayList<Object> values = new ArrayList<Object>();
  73. public abstract float mean();
  74. public abstract void writeToFile(PrintWriter outFile);
  75. } // end of abstract class DisributionObj
  76. static class FloatDistributionObj extends DistributionObj implements Serializable {
  77. float sumOfVals = 0f, sumOfSquares = 0f;
  78. private double variance;
  79. int n = 0;
  80. public FloatDistributionObj() {
  81. } // 0-arg constructor
  82. public FloatDistributionObj(BufferedReader file) throws JavaSystemException {
  83. String line;
  84. try {
  85. line = file.readLine();
  86. } catch (IOException ioe) {
  87. throw new JavaSystemException("Error while reading distribution files:\n" + ioe);
  88. }
  89. int start = line.indexOf("[") + 1, middle,
  90. listEnd = line.indexOf("]\t"),
  91. item = line.indexOf(",");
  92. if (item == -1) {
  93. item = listEnd;
  94. }
  95. if (start == 0 || listEnd == -1) {
  96. throw new JavaSystemException("Corrupted format for FloatDistribution file: " + file);
  97. }
  98. while (start < listEnd) {
  99. values.add(new Float(line.substring(start, item)));
  100. start = item + 1;
  101. item = line.indexOf(",", start);
  102. if (item == -1 || item > listEnd) {
  103. item = listEnd;
  104. }
  105. } // end of loop reading values
  106. start = listEnd + 2;
  107. listEnd = line.indexOf("\t", start);
  108. sumOfVals = Float.parseFloat(line.substring(start, listEnd));
  109. start = listEnd + 1;
  110. listEnd = line.indexOf("\t", start);
  111. sumOfSquares = Float.parseFloat(line.substring(start, listEnd));
  112. start = listEnd + 1;
  113. listEnd = line.indexOf("\t", start);
  114. variance = Double.parseDouble(line.substring(start, listEnd));
  115. start = listEnd + 1;
  116. n = Integer.parseInt(line.substring(start).trim());
  117. } // end of constructor from file
  118. public void add(float num) {
  119. values.add(new Float(num));
  120. n++;
  121. sumOfVals += num;
  122. sumOfSquares += (num * num);
  123. } // end of method add()
  124. public float mean() {
  125. return ((float) sumOfVals / values.size());
  126. }
  127. public void writeToFile(PrintWriter file) {
  128. file.print("[");
  129. file.print(values.get(0));
  130. for (int i = 1; i < values.size(); i++) {
  131. file.print("," + values.get(i));
  132. }
  133. file.print("]\t");
  134. file.print(sumOfVals + "\t");
  135. file.print(sumOfSquares + "\t");
  136. file.print(variance() + "\t");
  137. file.print(n + "\t");
  138. file.flush();
  139. file.close();
  140. } // end of method writeToFile
  141. public double variance() {
  142. variance = (double) (sumOfVals * sumOfVals) / n;
  143. variance = sumOfSquares - variance;
  144. return variance;
  145. } // end of method variance
  146. public double stdDev() {
  147. variance = (double) (sumOfVals * sumOfVals) / n;
  148. variance = sumOfSquares - variance;
  149. return Math.sqrt(variance);
  150. } // end of method stdDev
  151. } // end of inner class FloatDistributionObj
  152. static class IntDistributionObj extends DistributionObj implements Serializable {
  153. TreeMap intCounts = new TreeMap(); // Integer => Counter
  154. int sumOfVals = 0, n = 0;
  155. public IntDistributionObj() {
  156. } // 0-arg constructor
  157. public IntDistributionObj(BufferedReader file) throws JavaSystemException {
  158. String line;
  159. try {
  160. line = file.readLine();
  161. } catch (IOException ioe) {
  162. throw new JavaSystemException("Error while reading distribution files:\n" + ioe);
  163. }
  164. int start = line.indexOf("[") + 1, middle,
  165. listEnd = line.indexOf("]\t"),
  166. item = line.indexOf(",");
  167. if (item == -1) {
  168. item = listEnd;
  169. }
  170. if (start == 0 || listEnd == -1) {
  171. throw new JavaSystemException("Corrupted format for IntDistribution file: " + file);
  172. }
  173. while (start < listEnd) {
  174. values.add(new Integer(line.substring(start, item)));
  175. start = item + 1;
  176. item = line.indexOf(",", start);
  177. if (item == -1 || item > listEnd) {
  178. item = listEnd;
  179. }
  180. } // end of loop reading values
  181. start = listEnd + 2;
  182. listEnd = line.indexOf("\t", start);
  183. sumOfVals = Integer.parseInt(line.substring(start, listEnd));
  184. start = line.indexOf("((") + 2;
  185. middle = line.indexOf(" = ", start);
  186. item = line.indexOf(",", middle);
  187. listEnd = line.indexOf("))");
  188. if (start == 1 || listEnd == -1) {
  189. throw new JavaSystemException("Corrupted format for IntDistribution file: " + file);
  190. }
  191. while (start < listEnd) {
  192. Integer num = new Integer(line.substring(start, middle));
  193. Counter ctr = new Counter();
  194. ctr.incr(Integer.parseInt(line.substring(middle + 3, item)));
  195. intCounts.put(num, ctr);
  196. start = item + 1;
  197. middle = line.indexOf(" = ", start);
  198. item = line.indexOf(",", middle);
  199. } // end of loop reading symbolCounts
  200. start = listEnd + 3;
  201. n = Integer.parseInt(line.substring(start).trim());
  202. } // end of constructor from file
  203. public void add(int num) {
  204. Integer number = new Integer(num);
  205. values.add(number); // for de-bugging purposes, to see sequence of values added
  206. sumOfVals += num;
  207. n++;
  208. if (intCounts.get(number) == null) {
  209. intCounts.put(number, new Counter());
  210. }
  211. ((Counter) intCounts.get(number)).incr();
  212. } // end of method add()
  213. public float mean() {
  214. return ((float) sumOfVals / n);
  215. }
  216. public void writeToFile(PrintWriter file) {
  217. file.print("[");
  218. file.print(values.get(0));
  219. for (int i = 1; i < values.size(); i++) {
  220. file.print("," + values.get(i));
  221. }
  222. file.print("]\t");
  223. file.print(sumOfVals + "\t");
  224. file.print("((");
  225. Iterator iter = intCounts.entrySet().iterator();
  226. while (iter.hasNext()) {
  227. Map.Entry entry = (Map.Entry) iter.next();
  228. file.print(entry.getKey() + " = " + entry.getValue() + ",");
  229. }
  230. file.print("))\t");
  231. file.print(n + "\t");
  232. file.flush();
  233. file.close();
  234. } // end of method writeToFile
  235. public int freq(int num) { // how often this int appears in the distribution
  236. if (intCounts.get(new Integer(num)) == null) {
  237. return 0;
  238. } else {
  239. return ((Counter) intCounts.get(new Integer(num))).total();
  240. }
  241. } // end of method freq
  242. public float probability(int num) { // what percentage of all observations is this num
  243. return (float) freq(num) / n;
  244. } // end of method probability/1
  245. public float probability(int num1, int num2) { // what percentage of observations are in this range
  246. int total = 0, lo, hi;
  247. if (num1 > num2) {
  248. lo = num2;
  249. hi = num1;
  250. } else {
  251. lo = num1;
  252. hi = num2;
  253. }
  254. for (int i = lo; i <= hi; i++) {
  255. total += freq(i);
  256. }
  257. return (float) total / n;
  258. } // end of method probability/2
  259. } // end of inner class IntDistributionObj
  260. static class SymbolDistributionObj extends DistributionObj implements Serializable {
  261. TreeMap symbolCounts = new TreeMap();
  262. int n = 0;
  263. public SymbolDistributionObj() {
  264. } // 0-arg constructor
  265. public SymbolDistributionObj(BufferedReader file) throws JavaSystemException {
  266. String line;
  267. try {
  268. line = file.readLine();
  269. } catch (IOException ioe) {
  270. throw new JavaSystemException("Error while reading distribution files:\n" + ioe);
  271. }
  272. int start = line.indexOf("[") + 1, middle,
  273. listEnd = line.indexOf("]\t"),
  274. item = line.indexOf("\t");
  275. if (item == -1 || item > listEnd) {
  276. item = listEnd;
  277. }
  278. if (start == 0 || listEnd == -1) {
  279. throw new JavaSystemException("Corrupted format for SymbolDistribution file: " + file);
  280. }
  281. while (start < listEnd) {
  282. values.add(line.substring(start, item));
  283. start = item + 1;
  284. item = line.indexOf("\t", start);
  285. if (item == -1 || item > listEnd) {
  286. item = listEnd;
  287. }
  288. } // end of loop reading values
  289. start = line.indexOf("((") + 2;
  290. middle = line.indexOf(" = ", start);
  291. item = line.indexOf(",", middle);
  292. listEnd = line.indexOf("))");
  293. if (start == 1 || listEnd == -1) {
  294. throw new JavaSystemException("Corrupted format for SymbolDistribution file: " + file);
  295. }
  296. while (start < listEnd) {
  297. String symbol = line.substring(start, middle);
  298. Counter ctr = new Counter();
  299. ctr.incr(Integer.parseInt(line.substring(middle + 3, item)));
  300. symbolCounts.put(symbol, ctr);
  301. start = item + 1;
  302. middle = line.indexOf(" = ", start);
  303. item = line.indexOf(",", middle);
  304. } // end of loop reading symbolCounts
  305. start = listEnd + 3;
  306. n = Integer.parseInt(line.substring(start).trim());
  307. } // end of constructor from file
  308. public void addAll(ArrayList<Object> symbols) {
  309. for (int i = 0; i < symbols.size(); i++) {
  310. add((String) symbols.get(i));
  311. }
  312. } // end of method addAll
  313. public void addAll(TreeSet items) { // Assumes set members are Strings.
  314. Iterator iter = items.iterator();
  315. while (iter.hasNext()) {
  316. add((String) iter.next());
  317. }
  318. } // end of method addAll
  319. public void add(String symb) {
  320. values.add(symb); // for de-bugging purposes, to see sequence of values added
  321. n++;
  322. if (symbolCounts.get(symb) == null) {
  323. symbolCounts.put(symb, new Counter());
  324. }
  325. ((Counter) symbolCounts.get(symb)).incr();
  326. } // end of method add()
  327. public float mean() {
  328. return 0f;
  329. } // a silly answer to a silly question!!
  330. public void writeToFile(PrintWriter file) {
  331. file.print("[");
  332. file.print(values.get(0));
  333. for (int i = 1; i < values.size(); i++) {
  334. file.print("\t" + values.get(i));
  335. }
  336. file.print("]\t");
  337. file.print("((");
  338. Iterator iter = symbolCounts.entrySet().iterator();
  339. while (iter.hasNext()) {
  340. Map.Entry entry = (Map.Entry) iter.next();
  341. file.print(entry.getKey() + " = " + entry.getValue() + ",");
  342. }
  343. file.print("))\t");
  344. file.print(n + "\t");
  345. file.flush();
  346. file.close();
  347. } // end of method writeToFile
  348. public int freq(String symb) { // how often this symbol appears in the distribution
  349. if (symbolCounts.get(symb) == null) {
  350. return 0;
  351. } else {
  352. return ((Counter) symbolCounts.get(symb)).total();
  353. }
  354. } // end of method freq
  355. public float probability(String symb) { // what percentage of all observations is this symbol
  356. return (float) freq(symb) / n;
  357. } // end of method probability
  358. } // end of inner class SymbolDistributionObj
  /* This zero-arg constructor is for general use and by Serialization. */
  FeatureVectorObj() {
  }

  /**
   * Builds a feature vector from the thirteen-line text form written by toDisk(): language
   * name, addressTerms, fvID, then elements 0 through 9, one per line.
   *
   * @param file reader positioned at the first line of the feature vector.
   * @throws JavaSystemException if the file cannot be read, ends early, or contains a
   *     malformed number.
   */
  FeatureVectorObj(BufferedReader file) throws JavaSystemException {
    try {
      languageName = nextLine(file).trim();
      addressTerms = Boolean.valueOf(nextLine(file)).booleanValue();
      fvID = Integer.parseInt(nextLine(file));
      genSkewing = Boolean.valueOf(nextLine(file)).booleanValue();
      percentRecip = Float.parseFloat(nextLine(file));
      percentMultiGen = Float.parseFloat(nextLine(file));
      ivcList = parseList(nextLine(file), true);
      hcvcList = parseList(nextLine(file), true);
      avgLateralCount = Float.parseFloat(nextLine(file));
      stepTerms = Boolean.valueOf(nextLine(file)).booleanValue();
      udps = Boolean.valueOf(nextLine(file)).booleanValue();
      exactSigSet = new TreeSet(parseList(nextLine(file), false));
      structSigSet = new TreeSet(parseList(nextLine(file), false));
    } catch (NumberFormatException e) {
      throw new JavaSystemException("Can't construct Feature Vector: corrupted File.\n" + e);
    } catch (IOException iox) {
      throw new JavaSystemException("Can't construct Feature Vector: corrupted File.\n" + iox);
    }
  } // end of constructor from file

  /**
   * Reads the next line, treating end-of-file as a corrupted file so a truncated input is
   * reported as a JavaSystemException rather than surfacing as a NullPointerException.
   */
  private static String nextLine(BufferedReader file) throws IOException, JavaSystemException {
    String line = file.readLine();
    if (line == null) {
      throw new JavaSystemException(
          "Can't construct Feature Vector: corrupted File.\n" + "Unexpected end of file.");
    }
    return line;
  }
  397. public ArrayList<Object> parseList(String line, boolean parens) {
  398. // line will look like '[item1<target><space>item2<target><space>...item99]
  399. ArrayList<Object> answer = new ArrayList<Object>();
  400. int comma, start = 1, pad = (parens ? 1 : 0);
  401. String target = (parens ? ")," : ","), item;
  402. comma = line.indexOf(target) + pad;
  403. while (comma > start) {
  404. item = line.substring(start, comma);
  405. answer.add(item);
  406. start = comma + 2;
  407. comma = line.substring(start).indexOf(target) + start + pad;
  408. } // end of while comma > start
  409. // Now capture last item before ']'
  410. item = line.substring(start, line.length() - 1);
  411. if (item.length() > 0) {
  412. answer.add(item);
  413. }
  414. return answer;
  415. } // end of method parseList
  416. public String toString() {
  417. String str = "Feature Vector for " + languageName;
  418. str += "\n0: Generational Skewing = ";
  419. if (genSkewing) {
  420. str += "Yes";
  421. } else {
  422. str += "No";
  423. }
  424. str += "\n1: Percent Reciprocal Terms = " + percentRecip + "%";
  425. str += "\n2: Percent Multi-Generational Terms = " + percentMultiGen + "%";
  426. str += "\n3: Constraints on Linking Kinsmen = ";
  427. if (0 < ivcList.size()) {
  428. str += ivcList;
  429. } else {
  430. str += " None ";
  431. }
  432. str += "\n4: Constraints on Ego & Alter = ";
  433. if (0 < hcvcList.size()) {
  434. str += hcvcList;
  435. } else {
  436. str += " None ";
  437. }
  438. str += "\n5: Average Lateral Count = " + avgLateralCount;
  439. str += "\n6: Explicit Terms for Half/Step-Kin = ";
  440. if (stepTerms) {
  441. str += "Yes";
  442. } else {
  443. str += " None ";
  444. }
  445. str += "\n7: User-Defined Properties = ";
  446. if (udps) {
  447. str += "Present";
  448. } else {
  449. str += " None ";
  450. }
  451. str += "\n8: Exact EQCs used: " + exactSigSet.size();
  452. str += "\n9: Structural EQCs used:";
  453. Iterator sigIter = structSigSet.iterator();
  454. while (sigIter.hasNext()) {
  455. str += "\n\t" + sigIter.next();
  456. }
  457. return str + "\n";
  458. } // end of over-riding method toString
  459. public void toDisk(PrintWriter outFile) throws IOException {
  460. outFile.println(languageName);
  461. outFile.println(addressTerms);
  462. outFile.println(fvID);
  463. outFile.println(genSkewing);
  464. outFile.println(percentRecip);
  465. outFile.println(percentMultiGen);
  466. outFile.println(ivcList);
  467. outFile.println(hcvcList);
  468. outFile.println(avgLateralCount);
  469. outFile.println(stepTerms);
  470. outFile.println(udps);
  471. outFile.println(exactSigSet);
  472. outFile.println(structSigSet);
  473. outFile.flush();
  474. outFile.close();
  475. } // end of method toDisk
  476. /**
  477. Compare this feature vector with another one feature-by-feature. Call a method to compare
  478. each feature based on the type of that feature.
  479. @param fv2 the other feature vector.
  480. @return a float array containing all the similarity scores.
  481. */
  482. public float[] computeSimilarity(FeatureVectorObj fv2) {
  483. float[] answer = new float[Library.clSt.numberOfFeatures];
  484. ArrayList<Object> list1 = new ArrayList<Object>(exactSigSet),
  485. list2 = new ArrayList<Object>(fv2.exactSigSet);
  486. answer[0] = (((genSkewing && fv2.genSkewing) || (!genSkewing && !fv2.genSkewing)) ? 1 : 0);
  487. answer[1] = floatSim(percentRecip, fv2.percentRecip, el_1_distn);
  488. answer[2] = floatSim(percentMultiGen, fv2.percentMultiGen, el_2_distn);
  489. answer[3] = litListSim(ivcList, fv2.ivcList, el_3_distn);
  490. answer[4] = litListSim(hcvcList, fv2.hcvcList, el_4_distn);
  491. answer[5] = floatSim(avgLateralCount, fv2.avgLateralCount, el_5_distn);
  492. answer[6] = (((stepTerms && fv2.stepTerms) || (!stepTerms && !fv2.stepTerms)) ? 1 : 0);
  493. answer[7] = (((udps && fv2.udps) || (!udps && !fv2.udps)) ? 1 : 0);
  494. answer[8] = litListSim(list1, list2, el_8_distn);
  495. list1 = new ArrayList<Object>(structSigSet);
  496. list2 = new ArrayList<Object>(fv2.structSigSet);
  497. answer[9] = litListSim(list1, list2, el_9_distn);
  498. /* // DeBug code
  499. String str = "\n" + languageName;
  500. str += " vs. " + fv2.languageName;
  501. System.out.println(str);
  502. str = "[ " + answer[0];
  503. for (int i=1; i < 10; i++) str += ", " + answer[i];
  504. System.out.println(str + "]");
  505. */
  506. return answer;
  507. } // end of method computeSimilarity
  508. /**
  509. Compare this HeadClauseVariableConstraint list (of {@link Literal}s in standard form) with another,
  510. using Lin's similarity metric.
  511. @param hcvc1 this hcvc list.
  512. @param hcvc2 the other hcvc list.
  513. @return a float = the similarity score.
  514. */
  515. float litListSim(ArrayList<Object> listA, ArrayList<Object> listB, SymbolDistributionObj distn) {
  516. // Similarity(list1, list2) = (2 * I(set-intersection of list1, list2))
  517. // -------------------------------------------
  518. // I(list1) + I(list2)
  519. //
  520. // I(a list) = - Sum_for_all_lits of log( This_LitFrequency / TotalLitCount )
  521. //
  522. // note: should be log-base-2, but since the ratio is taken, I can use natural logs.
  523. if ((listA.isEmpty()) && (listB.isEmpty())) {
  524. return 1f;
  525. }
  526. if ((listA.isEmpty()) || (listB.isEmpty())) {
  527. return 0f;
  528. }
  529. ArrayList<Object> list1, list2;
  530. if (listA.size() > listB.size()) {
  531. list1 = listB;
  532. list2 = listA;
  533. } // end of B-is-smaller
  534. else {
  535. list1 = listA;
  536. list2 = listB;
  537. } // end of A-is-smaller
  538. float answer, intersectVal = 0f, list1Val = 0f, list2Val = 0f;
  539. ArrayList<Object> intersect = new ArrayList<Object>(), copy2 = new ArrayList<Object>(list2);
  540. int where;
  541. String litImage;
  542. for (int i = 0; i < list1.size(); i++) {
  543. where = copy2.indexOf(list1.get(i));
  544. if (where > -1) { // matching canonical String in copy2
  545. intersect.add(copy2.get(where));
  546. copy2.remove(where);
  547. } // end of if-a-match-was-found
  548. } // end of loop thru literals in list1
  549. // Compute I(set_intersection)
  550. for (int i = 0; i < intersect.size(); i++) {
  551. litImage = (String) intersect.get(i);
  552. intersectVal -= Math.log(distn.probability(litImage));
  553. } // end of summation for all lits in the intersection
  554. // Compute I(list1)
  555. // If all of list1 is in the intersection (i.e. both lists are same size)
  556. // then the total must be the same.
  557. if (intersect.size() == list1.size()) {
  558. list1Val = intersectVal;
  559. } else {
  560. for (int i = 0; i < list1.size(); i++) {
  561. litImage = (String) list1.get(i);
  562. list1Val -= Math.log(distn.probability(litImage));
  563. } // end of summation for all lits in list1
  564. } // Compute I(list2)
  565. if (intersect.size() == list2.size()) {
  566. list2Val = intersectVal;
  567. } else {
  568. for (int i = 0; i < list2.size(); i++) {
  569. litImage = (String) list2.get(i);
  570. list2Val -= Math.log(distn.probability(litImage));
  571. } // end of summation for all lits in list2
  572. }
  573. answer = (2 * intersectVal) / (list1Val + list2Val);
  574. return answer;
  575. } // end of method litListSim
  576. /**
  577. Compare the 2 integer values using Lin's similarity metric.
  578. @param num1 this int.
  579. @param num2 the other int.
  580. @param distribution an array of ints = the number of occurences of each value. dist'n[4] = number of 4's seen
  581. @return a float = the similarity score.
  582. */
  583. float intSim(int num1, int num2, IntDistributionObj distn) {
  584. //
  585. // Similarity(int1, int2) = (2 * I(range-from-int1-to-int2))
  586. // -------------------------------
  587. // I(int1) + I(int2)
  588. //
  589. // I(an int range) = - log[ Sum_for_all_ints_in_range of( This_IntFrequency / TotalCount ) ]
  590. //
  591. // Note1: should be log-base-2, but since the ratio is taken, I use natural logs.
  592. //
  593. // Note2: These may be positive or negative numbers
  594. //
  595. double iRange = Math.log(distn.probability(num1, num2));
  596. double iNum1 = Math.log(distn.probability(num1));
  597. double iNum2 = Math.log(distn.probability(num2));
  598. double answer = (2 * iRange) / (iNum1 + iNum2);
  599. return (float) answer;
  600. } // end of method intSim
  601. /**
  602. Compare the 2 float values using Lin's similarity metric.
  603. @param flt1 this float.
  604. @param flt2 the other float.
  605. @param distn the distribution of floats for this FeatureVector element, from which these 2 numbers came
  606. @return a float = the similarity score.
  607. */
  608. float floatSim(float flt1, float flt2, FloatDistributionObj distn) {
  609. double num1, num2, epsilon = (distn.stdDev() / 8), iCommon, iNum1, iNum2, score;
  610. //
  611. // Similarity(num1, num2) = (2 * I(range:num1-minus-epsilon-to-num2-plus-epsilon))
  612. // ------------------------------------------------------------------------
  613. // I(num1-minus-epsilon-to-num1-plus-epsilon) + I(num2-minus-epsilon-to-num2-plus-epsilon)
  614. //
  615. // where: I(range: R1 to R2) = - log[ phiScore(R2) - phiScore(R1) ]
  616. //
  617. // Note1: phiScore(x) = % of the (presumed normal) distribution curve to left of x.
  618. //
  619. // Note2: Should be using log-base-2, but since it's a ratio, it doesn't matter.
  620. //
  621. // Note3: Epsilon is arbitrarily set at 1/8th of the standard deviation.
  622. //
  623. // Note4: It is possible to get a distribution containing all the same number (e.g. all zeroes).
  624. // Using the normal formula breaks down in that case, but we know that when all the numbers
  625. // are the same, the similarity must = 1.0. So shortstop that situation.
  626. if (flt2 == flt1) {
  627. return 1.0f;
  628. }
  629. if (flt2 < flt1) {
  630. num1 = (double) flt2;
  631. num2 = (double) flt1;
  632. } // end of switch-em
  633. else {
  634. num1 = (double) flt1;
  635. num2 = (double) flt2;
  636. } // end of don't-switch-em
  637. iCommon = Math.log(phiScore((num2 + epsilon), distn) - phiScore((num1 - epsilon), distn));
  638. iNum1 = Math.log(phiScore((num1 + epsilon), distn) - phiScore((num1 - epsilon), distn));
  639. iNum2 = Math.log(phiScore((num2 + epsilon), distn) - phiScore((num2 - epsilon), distn));
  640. score = (2 * iCommon) / (iNum1 + iNum2);
  641. return (float) score;
  642. } // end of method floatSim
  643. /**
  644. Implements a standard look-up table of values for the percent of the distribution area to the left
  645. of a value.
  646. @param num a real number drawn from the distribution.
  647. @param distn the distribution of floats for this FeatureVector element, including num
  648. @return the statistical Phi-score. Taken from the table in a standard textbook
  649. (Jay L. Devore, Probability & Statistics 5, Pacific Grove CA, Duxbury)
  650. */
  651. public double phiScore(double num, FloatDistributionObj distn) {
  652. double answer, norm = (num - distn.mean()) / distn.stdDev();
  653. int cell;
  654. if (norm >= 0) {
  655. norm += 0.005; // for rounding
  656. cell = (int) Math.floor(norm * 100);
  657. cell = Math.min(cell, 349);
  658. answer = zTable[cell];
  659. } // end of larger-than-zero
  660. else {
  661. norm -= 0.005;
  662. cell = (int) Math.floor(norm * -100);
  663. cell = Math.min(cell, 349);
  664. answer = 1.0d - zTable[cell];
  665. } // end of less-than-zero
  666. return answer;
  667. } // end of method phiScore
  668. } // end of class FeatureVectorObj