
/core/infinit.e.harvest.library/src/com/ikanow/infinit/e/harvest/test/TestCode.java

https://github.com/IKANOW/Infinit.e
Possible License(s): BSD-3-Clause
/*******************************************************************************
 * Copyright 2012, The Infinit.e Open Source Project.
 * 
 * This program is free software: you can redistribute it and/or modify
 * it under the terms of the GNU Affero General Public License, version 3,
 * as published by the Free Software Foundation.
 * 
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU Affero General Public License for more details.
 * 
 * You should have received a copy of the GNU Affero General Public License
 * along with this program. If not, see <http://www.gnu.org/licenses/>.
 ******************************************************************************/
package com.ikanow.infinit.e.harvest.test;

import java.io.IOException;
import java.net.URL;
import java.util.Arrays;
import java.util.LinkedList;
import java.util.List;
import java.util.TreeSet;

import org.bson.types.ObjectId;

import com.google.gson.GsonBuilder;
import com.ikanow.infinit.e.data_model.store.DbManager;
import com.ikanow.infinit.e.data_model.store.config.source.SimpleTextCleanserPojo;
import com.ikanow.infinit.e.data_model.store.config.source.SourcePojo;
import com.ikanow.infinit.e.data_model.store.config.source.UnstructuredAnalysisConfigPojo;
import com.ikanow.infinit.e.data_model.store.config.source.UnstructuredAnalysisConfigPojo.Context;
import com.ikanow.infinit.e.data_model.store.document.DocumentPojo;
import com.ikanow.infinit.e.data_model.Globals;
import com.ikanow.infinit.e.data_model.Globals.Identity;
import com.ikanow.infinit.e.harvest.HarvestController;
import com.ikanow.infinit.e.harvest.utils.ProxyManager;
import com.mongodb.BasicDBObject;

@SuppressWarnings("unused")
public class TestCode {

	/**
	 * @param args
	 * @throws IOException
	 */
	public static void main(String[] args) throws IOException {

		// Configuration:

		System.out.println(Arrays.toString(args));
		Globals.setIdentity(com.ikanow.infinit.e.data_model.Globals.Identity.IDENTITY_SERVICE);
		Globals.overrideConfigLocation(args[0]);
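		// args[0] is handed straight to overrideConfigLocation, so it is assumed to be the path to the
		// Infinit.e configuration directory for this test run (no default location is hard-coded here).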

		// Check proxy:
		ProxyManager.getProxy(new URL("http://www.ikanow.com"), null);

		// TESTING

		HarvestController harvester = new HarvestController();
		//harvester.setStandaloneMode(0);
		harvester.setStandaloneMode(5);
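		// Standalone mode is used for local testing (test 6 below notes that a "non-standalone"
		// harvester is needed for the duplicate checks to fire); the numeric argument (5 here, vs. the
		// commented-out 0) presumably caps the work done per source - this is an assumption, see
		// HarvestController.setStandaloneMode() for the exact semantics.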

		List<DocumentPojo> toAdd = new LinkedList<DocumentPojo>();
		List<DocumentPojo> toUpdate = new LinkedList<DocumentPojo>();
		List<DocumentPojo> toRemove = new LinkedList<DocumentPojo>();

		BasicDBObject query = null;
		SourcePojo feedSource = null;

		// 1. Get documents from a "feed" source

		// 1.1 OPENCALAIS
//		toAdd.clear();
//		toUpdate.clear();
//		toRemove.clear();
//		query = new BasicDBObject("extractType", "Feed");
//		feedSource = SourcePojo.fromDb(DbManager.getConfig().getSource().findOne(query), SourcePojo.class);
//		feedSource.getHarvestConfig().setHarvested(null);
//		System.out.println("RSS1_SOURCE=" + feedSource.getUrl());
//		harvester.harvestSource(feedSource, toAdd, toUpdate, toRemove);
//		System.out.println("RSS1_STATUS=" + new GsonBuilder().setPrettyPrinting().create().toJson(feedSource.getHarvestConfig()));
//		System.out.println("RSS1_TOADD (" + toAdd.size() + "):");
//		if (toAdd.size() > 0) {
//			System.out.println("RSS1_EGDOC=" + new GsonBuilder().setPrettyPrinting().create().toJson(toAdd.get(0)));
//		}
//		System.out.println("RSS1_TOUPDATE (" + toUpdate.size() + ").");
//		System.out.println("RSS1_TOREMOVE (" + toRemove.size() + ").");

		// 1.2 ALCHEMYAPI
//		toAdd.clear();
//		toUpdate.clear();
//		toRemove.clear();
//		query = new BasicDBObject("extractType", "Feed");
//		query.put("useExtractor", "alchemyapi");
//		feedSource = SourcePojo.fromDb(DbManager.getConfig().getSource().findOne(query), SourcePojo.class);
//		feedSource.getHarvestConfig().setHarvested(null);
//		System.out.println("RSS2_SOURCE=" + feedSource.getUrl());
//		harvester.harvestSource(feedSource, toAdd, toUpdate, toRemove);
//		System.out.println("RSS2_STATUS=" + new GsonBuilder().setPrettyPrinting().create().toJson(feedSource.getHarvestConfig()));
//		System.out.println("RSS2_TOADD (" + toAdd.size() + "):");
//		if (toAdd.size() > 0) {
//			System.out.println("RSS2_EGDOC=" + new GsonBuilder().setPrettyPrinting().create().toJson(toAdd.get(0)));
//		}
//		System.out.println("RSS2_TOUPDATE (" + toUpdate.size() + ").");
//		System.out.println("RSS2_TOREMOVE (" + toRemove.size() + ").");

		// 2. Get documents from a "database" source
//		toAdd.clear();
//		toUpdate.clear();
//		toRemove.clear();
//		query = new BasicDBObject("extractType", "Database");
//		feedSource = SourcePojo.fromDb(DbManager.getConfig().getSource().findOne(query), SourcePojo.class);
//		feedSource.getHarvestConfig().setHarvested(null);
//		feedSource.getDatabaseConfig().setDeltaQuery("SELECT * FROM IncidentReport LIMIT 10");
//		feedSource.getDatabaseConfig().setDeleteQuery("SELECT * FROM IncidentReport LIMIT 2");
//		System.out.println("DB1_SOURCE=" + feedSource.getUrl());
//		harvester.harvestSource(feedSource, toAdd, toUpdate, toRemove);
//		System.out.println("DB1_STATUS=" + new GsonBuilder().setPrettyPrinting().create().toJson(feedSource.getHarvestConfig()));
//		System.out.println("DB1_TOADD (" + toAdd.size() + "):");
//		if (toAdd.size() > 0) {
//			System.out.println("DB1_EGDOC=" + new GsonBuilder().setPrettyPrinting().create().toJson(toAdd.get(0)));
//		}
//		System.out.println("DB1_TOUPDATE (" + toUpdate.size() + ").");
//		System.out.println("DB1_TOREMOVE (" + toRemove.size() + ").");
//		if (toRemove.size() > 0) {
//			System.out.println("DB1_TOREMOVE=" + new GsonBuilder().setPrettyPrinting().create().toJson(toRemove.get(0)));
//		}

		// 3. Get documents from a "file" source (non-XML)

		// 3.1. Modus test dataset (also checks UAH code still called)
//		toAdd.clear();
//		toUpdate.clear();
//		toRemove.clear();
//		query = new BasicDBObject("useExtractor", "ModusOperandi");
//		feedSource = SourcePojo.fromDb(DbManager.getConfig().getSource().findOne(query), SourcePojo.class);
//		feedSource.getHarvestConfig().setHarvested(null);
//		System.out.println("FILE1_SOURCE=" + feedSource.getUrl());
//		harvester.harvestSource(feedSource, toAdd, toUpdate, toRemove);
//		System.out.println("FILE1_STATUS=" + new GsonBuilder().setPrettyPrinting().create().toJson(feedSource.getHarvestConfig()));
//		System.out.println("FILE1_TOADD (" + toAdd.size() + "):");
//		if (toAdd.size() > 0) {
//			System.out.println("FILE1_EGDOC=" + new GsonBuilder().setPrettyPrinting().create().toJson(toAdd.get(0)));
//		}
//		System.out.println("FILE1_TOUPDATE (" + toUpdate.size() + ").");
//		if (toUpdate.size() > 0) {
//			System.out.println("FILE1_TOUPDATE=" + new GsonBuilder().setPrettyPrinting().create().toJson(toUpdate.get(0)));
//		}
//		System.out.println("FILE1_TOREMOVE (" + toRemove.size() + ").");
//		if (toRemove.size() > 0) {
//			System.out.println("FILE1_TOREMOVE=" + new GsonBuilder().setPrettyPrinting().create().toJson(toRemove.get(0)));
//		}

		// 4. Get documents from a "file" source (XML)

		// 4.1. WITS dataset, also checks SAH code still called
//		toAdd.clear();
//		toUpdate.clear();
//		toRemove.clear();
//		query = new BasicDBObject("url", "smb://modus:139/wits/allfiles/");
//		feedSource = SourcePojo.fromDb(DbManager.getConfig().getSource().findOne(query), SourcePojo.class);
//		feedSource.getHarvestConfig().setHarvested(null);
//		System.out.println("FILE2_SOURCE=" + feedSource.getUrl());
//		harvester.harvestSource(feedSource, toAdd, toUpdate, toRemove);
//		System.out.println("FILE2_STATUS=" + new GsonBuilder().setPrettyPrinting().create().toJson(feedSource.getHarvestConfig()));
//		System.out.println("FILE2_TOADD (" + toAdd.size() + "):");
//		if (toAdd.size() > 0) {
//			System.out.println("FILE2_EGDOC=" + new GsonBuilder().setPrettyPrinting().create().toJson(toAdd.get(0)));
//		}
//		System.out.println("FILE2_TOUPDATE (" + toUpdate.size() + ").");
//		if (toUpdate.size() > 0) {
//			System.out.println("FILE2_TOUPDATE=" + new GsonBuilder().setPrettyPrinting().create().toJson(toUpdate.get(0)));
//		}
//		System.out.println("FILE2_TOREMOVE (" + toRemove.size() + ").");
//		if (toRemove.size() > 0) {
//			System.out.println("FILE2_TOREMOVE=" + new GsonBuilder().setPrettyPrinting().create().toJson(toRemove.get(0)));
//		}

		// 5. Test communities with multiple sources
//		toAdd.clear();
//		toUpdate.clear();
//		toRemove.clear();
//		query = new BasicDBObject("extractType", "Feed");
//		// A useful source known to work during V0S1 testing:
//		query = new BasicDBObject("key", "http.www.stjude.org.stjude.rss.medical_science_news_rss.xml");
//		feedSource = SourcePojo.fromDb(DbManager.getIngest().getSource().findOne(query), SourcePojo.class);
//		feedSource.addToCommunityIds(new ObjectId(0, 0, 0));
//		feedSource.addToCommunityIds(new ObjectId(0, 0, 1));
//		System.out.println("DUP1 feedSource=" + feedSource.getKey() + " communities=" + new com.google.gson.Gson().toJson(feedSource.getCommunityIds()));
//		harvester.harvestSource(feedSource, toAdd, toUpdate, toRemove);
//
//		// Check for duplicate sources...
//		System.out.println("DUP1");
//		//System.out.println(new GsonBuilder().setPrettyPrinting().create().toJson(toAdd));
//		for (DocumentPojo showContent: toAdd) {
//			//System.out.println("DUP1 text for " + showContent.getUrl() + ":" + showContent.getFullText().substring(0, 64));
//			System.out.println("DUP1 text for " + showContent.getUrl() + ":" + showContent.getCommunityId().toString() + "/" + showContent.getSourceKey() + "/" + showContent.getFullText().length());
//		}

		// 6. Test duplication across sources
		// Needs a "non-standalone" harvester so that the duplicate handling is actually exercised.
		// The idea is to run the normal harvester once against a source and then re-run it here.
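		// (Note: the block below creates a fresh HarvestController and never calls setStandaloneMode(),
		// which is presumably what makes it the "non-standalone" harvester referred to above.)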
//		toAdd.clear();
//		toUpdate.clear();
//		toRemove.clear();
//		query = new BasicDBObject("key", "http.www.stjude.org.stjude.rss.medical_science_news_rss.xml"); // ie run the harvester against this source before testing
//		feedSource = SourcePojo.fromDb(DbManager.getConfig().getSource().findOne(query), SourcePojo.class);
//		feedSource.setCommunityIDs(new TreeSet<String>());
//		feedSource.addToCommunityIDs("test_dup2a");
//		feedSource.addToCommunityIDs("test_dup2b");
//		feedSource.setKey("DUP2_TEST_" + feedSource.getKey());
//		new HarvestController().harvestSource(feedSource, toAdd, toUpdate, toRemove);
//		System.out.println("DUP2");
//		System.out.println(new GsonBuilder().setPrettyPrinting().create().toJson(toAdd));
//		for (DocumentPojo showContent: toAdd) {
//			System.out.println("DUP2 text for " + showContent.getUrl() + ":" + showContent.getFullText().substring(0, 64));
//		}


		// 7. The UAH now allows arbitrary scripts to be run against the content to generate metadata;
		//    the SAH can then run arbitrary scripts against that metadata to generate entities and associations (phew!)
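		// (UAH/SAH = the Unstructured/Structured Analysis Harvester components - expansion assumed from
		// the analysis config classes used in this package.)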
//		query = new BasicDBObject("extractType", "Feed");
//		// A useful source known to work during V0S1 testing:
//		//query = new BasicDBObject("key", "http.www.stjude.org.stjude.rss.medical_science_news_rss.xml");
//		feedSource = SourcePojo.fromDb(DbManager.getIngest().getSource().findOne(query), SourcePojo.class);
//		// Add markup to feed source:
//		UnstructuredAnalysisConfigPojo uah = new UnstructuredAnalysisConfigPojo();
//		uah.setSimpleTextCleanser(new LinkedList<SimpleTextCleanserPojo>());
//		SimpleTextCleanserPojo textCleanse1 = new SimpleTextCleanserPojo();
//		textCleanse1.setField("description");
//		textCleanse1.setScript("[aeiou]");
//		textCleanse1.setReplacement("XXX");
//		uah.getSimpleTextCleanser().add(textCleanse1);
//		SimpleTextCleanserPojo textCleanse2 = new SimpleTextCleanserPojo();
//		textCleanse2.setField("title");
//		textCleanse2.setScript("[aeiou]");
//		textCleanse2.setReplacement("YYY");
//		uah.getSimpleTextCleanser().add(textCleanse2);
//		SimpleTextCleanserPojo textCleanse3 = new SimpleTextCleanserPojo();
//		textCleanse3.setField("fulltext");
//		textCleanse3.setScript("[aeiou]");
//		textCleanse3.setReplacement("ATCPSQZ");
//		uah.getSimpleTextCleanser().add(textCleanse3);
//		uah.AddMetaField("TEST1", Context.All, "var a = ['alex']; a;", "javascript");
//		uah.AddMetaField("TEST2", Context.All, "var a = { 'test': 'alex' }; a;", "javascript");
//		uah.AddMetaField("TEST3", Context.All, "var a = [ { 'test': 'alex' }, 'chris' ]; a;", "javascript");
//		uah.AddMetaField("TEST4", Context.All, "var a = [ { 'test': { 's1': 'alex', 's2':['chris','craig'] } }, [ 'chris', 'alex' ] ]; a;", "javascript");
//		uah.AddMetaField("TEST5", Context.All, "var a = [ { 'test': { 's1': 'alex', 's2':['chris','craig'] } }, [ 'chris', 'alex' ] ]; null;", "javascript");
//		uah.AddMetaField("TEST6", Context.All, "if (-1 == text.indexOf('ATCPSQZ')) true; else false; ", "javascript");
//		feedSource.setUnstructuredAnalysisConfig(uah);
//		// Run harvester:
//		toAdd.clear();
//		toUpdate.clear();
//		toRemove.clear();
//		harvester.harvestSource(feedSource, toAdd, toUpdate, toRemove);
//		// Check results:
//		if (toAdd.size() > 0) {
//			DocumentPojo doc = toAdd.get(0);
//			// Check text cleansing:
//			if (!doc.getDescription().contains("XXX")) {
//				System.out.println("UAH: ******** FAIL: description not subbed: " + doc.getDescription());
//			}
//			if (!doc.getTitle().contains("YYY")) {
//				System.out.println("UAH: ******** FAIL: title not subbed: " + doc.getTitle());
//			}
//			Object[] fullTextSubTest = doc.getMetadata().get("TEST6");
//			if ((null != fullTextSubTest) && (1 == fullTextSubTest.length)) {
//				Boolean bFullTextSubTest = (Boolean)fullTextSubTest[0];
//				if ((null == bFullTextSubTest) || (!bFullTextSubTest)) {
//					System.out.println("UAH: ******** FAIL: full text not subbed (or scripts not working) 1");
//				}
//			}
//			else {
//				System.out.println("UAH: ******** FAIL: full text not subbed (or scripts not working) 2");
//			}
//			// Check fields
//			String test1 = new com.google.gson.Gson().toJson(doc.getMetadata().get("TEST1"));
//			System.out.println("UAH TEST1: " + test1);
//			if (!test1.equals("[\"alex\"]")) System.out.println("UAH: ******** FAIL: TEST1");
//			String test2 = new com.google.gson.Gson().toJson(doc.getMetadata().get("TEST2"));
//			System.out.println("UAH TEST2: " + new com.google.gson.Gson().toJson(doc.getMetadata().get("TEST2")));
//			if (!test2.equals("[{\"test\":\"alex\"}]")) System.out.println("UAH: ******** FAIL: TEST2");
//			String test3 = new com.google.gson.Gson().toJson(doc.getMetadata().get("TEST3"));
//			System.out.println("UAH TEST3: " + new com.google.gson.Gson().toJson(doc.getMetadata().get("TEST3")));
//			if (!test3.equals("[{\"test\":\"alex\"},\"chris\"]")) System.out.println("UAH: ******** FAIL: TEST3");
//			String test4 = new com.google.gson.Gson().toJson(doc.getMetadata().get("TEST4"));
//			System.out.println("UAH TEST4: " + new com.google.gson.Gson().toJson(doc.getMetadata().get("TEST4")));
//			if (!test4.equals("[{\"test\":{\"s2\":[\"chris\",\"craig\"],\"s1\":\"alex\"}},[\"chris\",\"alex\"]]")) System.out.println("UAH: ******** FAIL: TEST4");
//			if (null != doc.getMetadata().get("TEST5")) {
//				System.out.println("UAH: ******** FAIL: TEST5 should not be present");
//			}
//			//(test6 tested above)
//		}
//		else {
//			System.out.println("UAH: ******** FAIL: no documents to check");
//		}
//		System.out.println("UAH: (all tests completed)");
	}
}