
/core/infinit.e.harvest.library/src/com/ikanow/infinit/e/harvest/test/TestCode.java

https://github.com/IKANOW/Infinit.e
License: GNU Affero General Public License v3 (AGPL-3.0), per the file header below

/*******************************************************************************
 * Copyright 2012, The Infinit.e Open Source Project.
 *
 * This program is free software: you can redistribute it and/or modify
 * it under the terms of the GNU Affero General Public License, version 3,
 * as published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU Affero General Public License for more details.
 *
 * You should have received a copy of the GNU Affero General Public License
 * along with this program. If not, see <http://www.gnu.org/licenses/>.
 ******************************************************************************/
package com.ikanow.infinit.e.harvest.test;

import java.io.IOException;
import java.net.URL;
import java.util.Arrays;
import java.util.LinkedList;
import java.util.List;
import java.util.TreeSet;

import org.bson.types.ObjectId;

import com.google.gson.GsonBuilder;
import com.ikanow.infinit.e.data_model.store.DbManager;
import com.ikanow.infinit.e.data_model.store.config.source.SimpleTextCleanserPojo;
import com.ikanow.infinit.e.data_model.store.config.source.SourcePojo;
import com.ikanow.infinit.e.data_model.store.config.source.UnstructuredAnalysisConfigPojo;
import com.ikanow.infinit.e.data_model.store.config.source.UnstructuredAnalysisConfigPojo.Context;
import com.ikanow.infinit.e.data_model.store.document.DocumentPojo;
import com.ikanow.infinit.e.data_model.Globals;
import com.ikanow.infinit.e.data_model.Globals.Identity;
import com.ikanow.infinit.e.harvest.HarvestController;
import com.ikanow.infinit.e.harvest.utils.ProxyManager;
import com.mongodb.BasicDBObject;

@SuppressWarnings("unused")
public class TestCode {

    /**
     * @param args args[0] is the Infinit.e configuration location to use
     * @throws IOException
     */
    public static void main(String[] args) throws IOException {

        // Configuration:
        System.out.println(Arrays.toString(args));
        Globals.setIdentity(Identity.IDENTITY_SERVICE);
        Globals.overrideConfigLocation(args[0]);

        // Check proxy:
        ProxyManager.getProxy(new URL("http://www.ikanow.com"), null);

        // TESTING
        HarvestController harvester = new HarvestController();
        //harvester.setStandaloneMode(0);
        harvester.setStandaloneMode(5);

        List<DocumentPojo> toAdd = new LinkedList<DocumentPojo>();
        List<DocumentPojo> toUpdate = new LinkedList<DocumentPojo>();
        List<DocumentPojo> toRemove = new LinkedList<DocumentPojo>();
        BasicDBObject query = null;
        SourcePojo feedSource = null;
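
        // Each numbered test section below repeats the same cycle: clear the three
        // result lists, look up a SourcePojo via a BasicDBObject query, reset its
        // harvest status, call harvester.harvestSource(...), and pretty-print the
        // results. A hypothetical convenience method sketching that cycle
        // (harvestAndDump) is included after main() below for reference.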
        // 1. Get documents from a "feed" source

        // 1.1 OPENCALAIS
        // toAdd.clear();
        // toUpdate.clear();
        // toRemove.clear();
        // query = new BasicDBObject("extractType", "Feed");
        // feedSource = SourcePojo.fromDb(DbManager.getConfig().getSource().findOne(query), SourcePojo.class);
        // feedSource.getHarvestConfig().setHarvested(null);
        // System.out.println("RSS1_SOURCE=" + feedSource.getUrl());
        // harvester.harvestSource(feedSource, toAdd, toUpdate, toRemove);
        // System.out.println("RSS1_STATUS=" + new GsonBuilder().setPrettyPrinting().create().toJson(feedSource.getHarvestConfig()));
        // System.out.println("RSS1_TOADD (" + toAdd.size() + "):");
        // if (toAdd.size() > 0) {
        //     System.out.println("RSS1_EGDOC=" + new GsonBuilder().setPrettyPrinting().create().toJson(toAdd.get(0)));
        // }
        // System.out.println("RSS1_TOUPDATE (" + toUpdate.size() + ").");
        // System.out.println("RSS1_TOREMOVE (" + toRemove.size() + ").");

        // 1.2 ALCHEMYAPI
        // toAdd.clear();
        // toUpdate.clear();
        // toRemove.clear();
        // query = new BasicDBObject("extractType", "Feed");
        // query.put("useExtractor", "alchemyapi");
        // feedSource = SourcePojo.fromDb(DbManager.getConfig().getSource().findOne(query), SourcePojo.class);
        // feedSource.getHarvestConfig().setHarvested(null);
        // System.out.println("RSS2_SOURCE=" + feedSource.getUrl());
        // harvester.harvestSource(feedSource, toAdd, toUpdate, toRemove);
        // System.out.println("RSS2_STATUS=" + new GsonBuilder().setPrettyPrinting().create().toJson(feedSource.getHarvestConfig()));
        // System.out.println("RSS2_TOADD (" + toAdd.size() + "):");
        // if (toAdd.size() > 0) {
        //     System.out.println("RSS2_EGDOC=" + new GsonBuilder().setPrettyPrinting().create().toJson(toAdd.get(0)));
        // }
        // System.out.println("RSS2_TOUPDATE (" + toUpdate.size() + ").");
        // System.out.println("RSS2_TOREMOVE (" + toRemove.size() + ").");
        // 2. Get documents from a "database" source
        // toAdd.clear();
        // toUpdate.clear();
        // toRemove.clear();
        // query = new BasicDBObject("extractType", "Database");
        // feedSource = SourcePojo.fromDb(DbManager.getConfig().getSource().findOne(query), SourcePojo.class);
        // feedSource.getHarvestConfig().setHarvested(null);
        // feedSource.getDatabaseConfig().setDeltaQuery("SELECT * FROM IncidentReport LIMIT 10");
        // feedSource.getDatabaseConfig().setDeleteQuery("SELECT * FROM IncidentReport LIMIT 2");
        // System.out.println("DB1_SOURCE=" + feedSource.getUrl());
        // harvester.harvestSource(feedSource, toAdd, toUpdate, toRemove);
        // System.out.println("DB1_STATUS=" + new GsonBuilder().setPrettyPrinting().create().toJson(feedSource.getHarvestConfig()));
        // System.out.println("DB1_TOADD (" + toAdd.size() + "):");
        // if (toAdd.size() > 0) {
        //     System.out.println("DB1_EGDOC=" + new GsonBuilder().setPrettyPrinting().create().toJson(toAdd.get(0)));
        // }
        // System.out.println("DB1_TOUPDATE (" + toUpdate.size() + ").");
        // System.out.println("DB1_TOREMOVE (" + toRemove.size() + ").");
        // if (toRemove.size() > 0) {
        //     System.out.println("DB1_TOREMOVE=" + new GsonBuilder().setPrettyPrinting().create().toJson(toRemove.get(0)));
        // }
        // 3. Get documents from a "file" source (non-XML)

        // 3.1. Modus test dataset (also checks UAH code still called)
        // toAdd.clear();
        // toUpdate.clear();
        // toRemove.clear();
        // query = new BasicDBObject("useExtractor", "ModusOperandi");
        // feedSource = SourcePojo.fromDb(DbManager.getConfig().getSource().findOne(query), SourcePojo.class);
        // feedSource.getHarvestConfig().setHarvested(null);
        // System.out.println("FILE1_SOURCE=" + feedSource.getUrl());
        // harvester.harvestSource(feedSource, toAdd, toUpdate, toRemove);
        // System.out.println("FILE1_STATUS=" + new GsonBuilder().setPrettyPrinting().create().toJson(feedSource.getHarvestConfig()));
        // System.out.println("FILE1_TOADD (" + toAdd.size() + "):");
        // if (toAdd.size() > 0) {
        //     System.out.println("FILE1_EGDOC=" + new GsonBuilder().setPrettyPrinting().create().toJson(toAdd.get(0)));
        // }
        // System.out.println("FILE1_TOUPDATE (" + toUpdate.size() + ").");
        // if (toUpdate.size() > 0) {
        //     System.out.println("FILE1_TOUPDATE=" + new GsonBuilder().setPrettyPrinting().create().toJson(toUpdate.get(0)));
        // }
        // System.out.println("FILE1_TOREMOVE (" + toRemove.size() + ").");
        // if (toRemove.size() > 0) {
        //     System.out.println("FILE1_TOREMOVE=" + new GsonBuilder().setPrettyPrinting().create().toJson(toRemove.get(0)));
        // }
        // 4. Get documents from a "file" source (XML)

        // 4.1. WITS dataset, also checks SAH code still called
        // toAdd.clear();
        // toUpdate.clear();
        // toRemove.clear();
        // query = new BasicDBObject("url", "smb://modus:139/wits/allfiles/");
        // feedSource = SourcePojo.fromDb(DbManager.getConfig().getSource().findOne(query), SourcePojo.class);
        // feedSource.getHarvestConfig().setHarvested(null);
        // System.out.println("FILE2_SOURCE=" + feedSource.getUrl());
        // harvester.harvestSource(feedSource, toAdd, toUpdate, toRemove);
        // System.out.println("FILE2_STATUS=" + new GsonBuilder().setPrettyPrinting().create().toJson(feedSource.getHarvestConfig()));
        // System.out.println("FILE2_TOADD (" + toAdd.size() + "):");
        // if (toAdd.size() > 0) {
        //     System.out.println("FILE2_EGDOC=" + new GsonBuilder().setPrettyPrinting().create().toJson(toAdd.get(0)));
        // }
        // System.out.println("FILE2_TOUPDATE (" + toUpdate.size() + ").");
        // if (toUpdate.size() > 0) {
        //     System.out.println("FILE2_TOUPDATE=" + new GsonBuilder().setPrettyPrinting().create().toJson(toUpdate.get(0)));
        // }
        // System.out.println("FILE2_TOREMOVE (" + toRemove.size() + ").");
        // if (toRemove.size() > 0) {
        //     System.out.println("FILE2_TOREMOVE=" + new GsonBuilder().setPrettyPrinting().create().toJson(toRemove.get(0)));
        // }
        // 5. Test communities with multiple sources
        // toAdd.clear();
        // toUpdate.clear();
        // toRemove.clear();
        // query = new BasicDBObject("extractType", "Feed");
        // // A useful source known to work during V0S1 testing:
        // query = new BasicDBObject("key", "http.www.stjude.org.stjude.rss.medical_science_news_rss.xml");
        // feedSource = SourcePojo.fromDb(DbManager.getIngest().getSource().findOne(query), SourcePojo.class);
        // feedSource.addToCommunityIds(new ObjectId(0, 0, 0));
        // feedSource.addToCommunityIds(new ObjectId(0, 0, 1));
        // System.out.println("DUP1 feedSource=" + feedSource.getKey() + " communities=" + new com.google.gson.Gson().toJson(feedSource.getCommunityIds()));
        // harvester.harvestSource(feedSource, toAdd, toUpdate, toRemove);
        //
        // // Check for duplicate sources...
        // System.out.println("DUP1");
        // //System.out.println(new GsonBuilder().setPrettyPrinting().create().toJson(toAdd));
        // for (DocumentPojo showContent: toAdd) {
        //     //System.out.println("DUP1 text for " + showContent.getUrl() + ":" + showContent.getFullText().substring(0, 64));
        //     System.out.println("DUP1 text for " + showContent.getUrl() + ":" + showContent.getCommunityId().toString() + "/" + showContent.getSourceKey() + "/" + showContent.getFullText().length());
        // }
        // 6. Test duplication across sources
        // Needs a "non-standalone" harvester so it will actually test the duplication.
        // The idea here is to run the normal harvester once on a source and then rerun it here.
        // toAdd.clear();
        // toUpdate.clear();
        // toRemove.clear();
        // query = new BasicDBObject("key", "http.www.stjude.org.stjude.rss.medical_science_news_rss.xml"); // i.e. run the harvester against this source before testing
        // feedSource = SourcePojo.fromDb(DbManager.getConfig().getSource().findOne(query), SourcePojo.class);
        // feedSource.setCommunityIDs(new TreeSet<String>());
        // feedSource.addToCommunityIDs("test_dup2a");
        // feedSource.addToCommunityIDs("test_dup2b");
        // feedSource.setKey("DUP2_TEST_" + feedSource.getKey());
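        // // Note: a fresh HarvestController() (with no setStandaloneMode call) is used below so
        // // that deduplication against already-harvested documents is actually exercised.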
        // new HarvestController().harvestSource(feedSource, toAdd, toUpdate, toRemove);
        // System.out.println("DUP2");
        // System.out.println(new GsonBuilder().setPrettyPrinting().create().toJson(toAdd));
        // for (DocumentPojo showContent: toAdd) {
        //     System.out.println("DUP2 text for " + showContent.getUrl() + ":" + showContent.getFullText().substring(0, 64));
        // }
        // 7. The UAH now allows arbitrary scripts to be run against the content to generate metadata;
        // the SAH can then run arbitrary scripts against that metadata to generate entities and associations (phew!)
        // query = new BasicDBObject("extractType", "Feed");
        // // A useful source known to work during V0S1 testing:
        // //query = new BasicDBObject("key", "http.www.stjude.org.stjude.rss.medical_science_news_rss.xml");
        // feedSource = SourcePojo.fromDb(DbManager.getIngest().getSource().findOne(query), SourcePojo.class);
        // // Add markup to feed source:
        // UnstructuredAnalysisConfigPojo uah = new UnstructuredAnalysisConfigPojo();
        // uah.setSimpleTextCleanser(new LinkedList<SimpleTextCleanserPojo>());
        // SimpleTextCleanserPojo textCleanse1 = new SimpleTextCleanserPojo();
        // textCleanse1.setField("description");
        // textCleanse1.setScript("[aeiou]");
        // textCleanse1.setReplacement("XXX");
        // uah.getSimpleTextCleanser().add(textCleanse1);
        // SimpleTextCleanserPojo textCleanse2 = new SimpleTextCleanserPojo();
        // textCleanse2.setField("title");
        // textCleanse2.setScript("[aeiou]");
        // textCleanse2.setReplacement("YYY");
        // uah.getSimpleTextCleanser().add(textCleanse2);
        // SimpleTextCleanserPojo textCleanse3 = new SimpleTextCleanserPojo();
        // textCleanse3.setField("fulltext");
        // textCleanse3.setScript("[aeiou]");
        // textCleanse3.setReplacement("ATCPSQZ");
        // uah.getSimpleTextCleanser().add(textCleanse3);
        // uah.AddMetaField("TEST1", Context.All, "var a = ['alex']; a;", "javascript");
        // uah.AddMetaField("TEST2", Context.All, "var a = { 'test': 'alex' }; a;", "javascript");
        // uah.AddMetaField("TEST3", Context.All, "var a = [ { 'test': 'alex' }, 'chris' ]; a;", "javascript");
        // uah.AddMetaField("TEST4", Context.All, "var a = [ { 'test': { 's1': 'alex', 's2':['chris','craig'] } }, [ 'chris', 'alex' ] ]; a;", "javascript");
        // uah.AddMetaField("TEST5", Context.All, "var a = [ { 'test': { 's1': 'alex', 's2':['chris','craig'] } }, [ 'chris', 'alex' ] ]; null;", "javascript");
        // uah.AddMetaField("TEST6", Context.All, "if (-1 == text.indexOf('ATCPSQZ')) true; else false; ", "javascript");
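        // // Expected results, mirroring the checks further down: TEST1 -> ["alex"],
        // // TEST2 -> [{"test":"alex"}], TEST3 -> [{"test":"alex"},"chris"],
        // // TEST4 -> the nested object/array structure, TEST5 -> absent (the script returns null),
        // // TEST6 -> a Boolean derived from whether 'ATCPSQZ' appears in the script's text variable.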
        // feedSource.setUnstructuredAnalysisConfig(uah);
        // // Run harvester:
        // toAdd.clear();
        // toUpdate.clear();
        // toRemove.clear();
        // harvester.harvestSource(feedSource, toAdd, toUpdate, toRemove);
        // // Check results:
        // if (toAdd.size() > 0) {
        //     DocumentPojo doc = toAdd.get(0);
        //     // Check text cleansing:
        //     if (!doc.getDescription().contains("XXX")) {
        //         System.out.println("UAH: ******** FAIL: description not subbed: " + doc.getDescription());
        //     }
        //     if (!doc.getTitle().contains("YYY")) {
        //         System.out.println("UAH: ******** FAIL: title not subbed: " + doc.getTitle());
        //     }
        //     Object[] fullTextSubTest = doc.getMetadata().get("TEST6");
        //     if ((null != fullTextSubTest) && (1 == fullTextSubTest.length)) {
        //         Boolean bFullTextSubTest = (Boolean) fullTextSubTest[0];
        //         if ((null == bFullTextSubTest) || (!bFullTextSubTest)) {
        //             System.out.println("UAH: ******** FAIL: full text not subbed (or scripts not working) 1");
        //         }
        //     }
        //     else {
        //         System.out.println("UAH: ******** FAIL: full text not subbed (or scripts not working) 2");
        //     }
        //     // Check fields
        //     String test1 = new com.google.gson.Gson().toJson(doc.getMetadata().get("TEST1"));
        //     System.out.println("UAH TEST1: " + test1);
        //     if (!test1.equals("[\"alex\"]")) System.out.println("UAH: ******** FAIL: TEST1");
        //     String test2 = new com.google.gson.Gson().toJson(doc.getMetadata().get("TEST2"));
        //     System.out.println("UAH TEST2: " + new com.google.gson.Gson().toJson(doc.getMetadata().get("TEST2")));
        //     if (!test2.equals("[{\"test\":\"alex\"}]")) System.out.println("UAH: ******** FAIL: TEST2");
        //     String test3 = new com.google.gson.Gson().toJson(doc.getMetadata().get("TEST3"));
        //     System.out.println("UAH TEST3: " + new com.google.gson.Gson().toJson(doc.getMetadata().get("TEST3")));
        //     if (!test3.equals("[{\"test\":\"alex\"},\"chris\"]")) System.out.println("UAH: ******** FAIL: TEST3");
        //     String test4 = new com.google.gson.Gson().toJson(doc.getMetadata().get("TEST4"));
        //     System.out.println("UAH TEST4: " + new com.google.gson.Gson().toJson(doc.getMetadata().get("TEST4")));
        //     if (!test4.equals("[{\"test\":{\"s2\":[\"chris\",\"craig\"],\"s1\":\"alex\"}},[\"chris\",\"alex\"]]")) System.out.println("UAH: ******** FAIL: TEST4");
        //     if (null != doc.getMetadata().get("TEST5")) {
        //         System.out.println("UAH: ******** FAIL: TEST5 should not be present");
        //     }
        //     //(test6 tested above)
        // }
        // else {
        //     System.out.println("UAH: ******** FAIL: no documents to check");
        // }
        // System.out.println("UAH: (all tests completed)");
    }
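
    /**
     * Illustrative sketch only: harvestAndDump is a hypothetical helper (not part of
     * the harvester API) that factors out the harvest-and-dump cycle repeated in the
     * commented-out sections of main(). It relies solely on calls already used above
     * (harvestSource, getHarvestConfig, getUrl, Gson pretty-printing).
     */
    private static void harvestAndDump(HarvestController harvester, SourcePojo source, String tag,
            List<DocumentPojo> toAdd, List<DocumentPojo> toUpdate, List<DocumentPojo> toRemove) throws IOException
    {
        // Start from empty result lists and force a re-harvest of the source
        toAdd.clear();
        toUpdate.clear();
        toRemove.clear();
        source.getHarvestConfig().setHarvested(null);

        System.out.println(tag + "_SOURCE=" + source.getUrl());
        harvester.harvestSource(source, toAdd, toUpdate, toRemove);

        // Dump the harvest status and a sample of the results, mirroring the sections above
        System.out.println(tag + "_STATUS=" + new GsonBuilder().setPrettyPrinting().create().toJson(source.getHarvestConfig()));
        System.out.println(tag + "_TOADD (" + toAdd.size() + "):");
        if (toAdd.size() > 0) {
            System.out.println(tag + "_EGDOC=" + new GsonBuilder().setPrettyPrinting().create().toJson(toAdd.get(0)));
        }
        System.out.println(tag + "_TOUPDATE (" + toUpdate.size() + ").");
        System.out.println(tag + "_TOREMOVE (" + toRemove.size() + ").");
    }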
}