PageRenderTime 2486ms CodeModel.GetById 21ms RepoModel.GetById 0ms app.codeStats 0ms

/core/infinit.e.data_model/src/com/ikanow/infinit/e/data_model/custom/InfiniteFileInputJsonParser.java

https://github.com/IKANOW/Infinit.e
Java | 464 lines | 343 code | 70 blank | 51 comment | 119 complexity | d44a3389bcb14f06476327c35f2bdc1d MD5 | raw file
Possible License(s): BSD-3-Clause
  1. /*******************************************************************************
  2. * Copyright 2012 The Infinit.e Open Source Project
  3. *
  4. * Licensed under the Apache License, Version 2.0 (the "License");
  5. * you may not use this file except in compliance with the License.
  6. * You may obtain a copy of the License at
  7. *
  8. * http://www.apache.org/licenses/LICENSE-2.0
  9. *
  10. * Unless required by applicable law or agreed to in writing, software
  11. * distributed under the License is distributed on an "AS IS" BASIS,
  12. * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  13. * See the License for the specific language governing permissions and
  14. * limitations under the License.
  15. ******************************************************************************/
  16. package com.ikanow.infinit.e.data_model.custom;
  17. import java.io.IOException;
  18. import java.io.InputStream;
  19. import java.io.InputStreamReader;
  20. import java.util.Arrays;
  21. import java.util.HashSet;
  22. import java.util.Map;
  23. import org.apache.commons.lang.StringEscapeUtils;
  24. import org.bson.BSONObject;
  25. import com.google.gson.JsonArray;
  26. import com.google.gson.JsonElement;
  27. import com.google.gson.JsonObject;
  28. import com.google.gson.JsonParser;
  29. import com.google.gson.stream.JsonReader;
  30. import com.google.gson.stream.JsonToken;
  31. import com.ikanow.infinit.e.data_model.store.config.source.SourceFileConfigPojo;
  32. import com.ikanow.infinit.e.data_model.store.document.DocumentPojo;
  33. import com.mongodb.BasicDBObject;
  34. //(taken from com.ikanow.infinit.e.harvest.extraction.document.file.JsonToMetadataParser)
  35. public class InfiniteFileInputJsonParser implements InfiniteFileInputParser {
  36. private HashSet<String> objectIdentifiers = new HashSet<String>();
  37. private HashSet<String> recursiveObjectIdentifiers = new HashSet<String>();
  38. private HashSet<String> fieldsThatNeedToExist = new HashSet<String>();
  39. private String primaryKey = null;
  40. private String sourceName = null;
  41. private boolean bRecurse = false;
  42. private JsonParser parser = null;
  43. private JsonReader reader = null;
  44. private JsonArray _secondaryArray = null;
  45. private int _posInSecondaryArray = 0;
  46. ///////////////////////////////////////////////////////////////
  47. // INTERFACE CODE
  48. @Override
  49. public InfiniteFileInputParser initialize(InputStream inStream,
  50. SourceFileConfigPojo fileConfig) throws IOException {
  51. this.primaryKey = fileConfig.XmlPrimaryKey;
  52. this.sourceName = fileConfig.XmlSourceName;
  53. if (null != fileConfig.XmlRootLevelValues) {
  54. for (String objectId: fileConfig.XmlRootLevelValues) {
  55. if (objectId.startsWith("*")) {
  56. this.bRecurse = true;
  57. this.recursiveObjectIdentifiers.add(objectId.substring(1).toLowerCase());
  58. throw new RuntimeException("JSON metadata parser: Don't currently support recursive parsing.");
  59. //TODO (INF-2469): Not currently supported, it gets a bit tricky?
  60. }//TESTED
  61. this.objectIdentifiers.add(objectId.toLowerCase());
  62. }
  63. }
  64. if (null != fileConfig.XmlIgnoreValues) {
  65. this.fieldsThatNeedToExist.addAll(fileConfig.XmlIgnoreValues);
  66. }
  67. reader = new JsonReader(new InputStreamReader(inStream, "UTF-8"));
  68. reader.setLenient(true);
  69. parser = new JsonParser();
  70. return this;
  71. }
  72. @Override
  73. public BSONObject getNextRecord() throws IOException {
  74. if (null != _secondaryArray) {
  75. for (; _posInSecondaryArray < _secondaryArray.size(); ) {
  76. JsonElement meta2 = _secondaryArray.get(_posInSecondaryArray);
  77. _posInSecondaryArray++;
  78. BasicDBObject currObj = convertJsonToDocument(meta2);
  79. if (null != currObj) {
  80. return currObj;
  81. }
  82. }
  83. _secondaryArray = null;
  84. }//TESTED
  85. return parseDocument();
  86. }
  87. @Override
  88. public void close() throws IOException {
  89. if (null != reader) reader.close();
  90. }
  91. @Override
  92. public String getCanonicalExtension() {
  93. return ".json";
  94. }
  95. ///////////////////////////////////////////////////////////////
  96. // PROCESSING CODE
  97. private boolean _inTopLevelArray = false;
  98. private JsonToken tok = JsonToken.BEGIN_OBJECT;
  99. public BasicDBObject parseDocument() throws IOException {
  100. // Different cases:
  101. // {}
  102. // ^^ many of these
  103. // [ {}, {}, {} ]
  104. // For each of these 2/3 cases, you might either want to grab the entire object, or a field
  105. // within the object
  106. try {
  107. while (true) { // (use exceptions to get outta here)
  108. try {
  109. tok = reader.peek();
  110. }
  111. catch (Exception e) {
  112. // EOF or end of object, keep going and find out...
  113. tok = reader.peek();
  114. }
  115. //TESTED
  116. if (JsonToken.BEGIN_ARRAY == tok) {
  117. if (!_inTopLevelArray) {
  118. reader.beginArray();
  119. _inTopLevelArray = true;
  120. }
  121. if (objectIdentifiers.isEmpty()) {
  122. while (reader.hasNext()) {
  123. JsonElement meta = parser.parse(reader);
  124. BasicDBObject currObj = convertJsonToDocument(meta);
  125. if (null != currObj) {
  126. return currObj;
  127. }//(else carry on...)
  128. }
  129. }//TESTED
  130. else {
  131. while (reader.hasNext()) {
  132. BasicDBObject currObj = getDocumentFromJson(false);
  133. if (null != currObj) {
  134. return currObj;
  135. }//(else carry on...)
  136. }
  137. }//TESTED
  138. }
  139. else if (JsonToken.BEGIN_OBJECT == tok) {
  140. if (objectIdentifiers.isEmpty()) {
  141. JsonElement meta = parser.parse(reader);
  142. BasicDBObject currObj = convertJsonToDocument(meta);
  143. if (null != currObj) {
  144. return currObj;
  145. }//(else carry on...)
  146. }//TESTED (single and multiple doc case)
  147. else {
  148. BasicDBObject currObj = getDocumentFromJson(false);
  149. if (null != currObj) {
  150. return currObj;
  151. }//(else carry on...)
  152. }//TESTED (single and multiple doc case)
  153. }
  154. else if ((JsonToken.END_DOCUMENT == tok) || (JsonToken.END_ARRAY == tok) || (JsonToken.END_OBJECT == tok)) {
  155. return null;
  156. }
  157. else { // Must be recursing through the next level(s)
  158. BasicDBObject currObj = getDocumentFromJson(false);
  159. if (null != currObj) {
  160. return currObj;
  161. }//(else carry on...)
  162. }
  163. } // (end loop forever - exception out)
  164. }
  165. catch (Exception e) {} // This is our EOF
  166. return null;
  167. }
  168. ////////////////////////////
  169. // Look into the JSON object and find the object with the specified name
  170. // (for now the "path" is ignored - maybe later we allow "x.y" terminology)
  171. private boolean _inSecondaryObject = false;
  172. private BasicDBObject getDocumentFromJson(boolean bRecursing) throws IOException {
  173. if (!_inSecondaryObject) {
  174. reader.beginObject();
  175. _inSecondaryObject = true;
  176. }
  177. while (reader.hasNext()) {
  178. String name = reader.nextName();
  179. boolean bMatch = false;
  180. if (bRecursing) {
  181. bMatch = recursiveObjectIdentifiers.contains(name.toLowerCase());
  182. }
  183. else {
  184. bMatch = objectIdentifiers.contains(name.toLowerCase());
  185. }//TESTED
  186. if (bMatch) {
  187. JsonElement meta = parser.parse(reader);
  188. if (meta.isJsonObject()) {
  189. BasicDBObject currObj = convertJsonToDocument(meta);
  190. if (null != currObj) {
  191. return currObj;
  192. }
  193. }//TESTED
  194. else if (meta.isJsonArray()) {
  195. _secondaryArray = meta.getAsJsonArray();
  196. _posInSecondaryArray = 0;
  197. for (JsonElement meta2: _secondaryArray) {
  198. _posInSecondaryArray++;
  199. BasicDBObject currObj = convertJsonToDocument(meta2);
  200. if (null != currObj) {
  201. return currObj;
  202. }
  203. }
  204. _secondaryArray = null;
  205. }//TESTED
  206. }//TESTED
  207. else {
  208. if (bRecurse) { //TODO (INF-2469): Not currently supported, it gets a bit tricky? (need to convert to a stack)
  209. JsonToken tok = reader.peek();
  210. if (JsonToken.BEGIN_OBJECT == tok) {
  211. BasicDBObject currObj = getDocumentFromJson(true);
  212. if (null != currObj) {
  213. return currObj;
  214. }
  215. }//TESTED
  216. else if (JsonToken.BEGIN_ARRAY == tok) {
  217. reader.beginArray();
  218. while (reader.hasNext()) {
  219. JsonToken tok2 = reader.peek();
  220. if (JsonToken.BEGIN_OBJECT == tok2) {
  221. BasicDBObject currObj = getDocumentFromJson(true);
  222. if (null != currObj) {
  223. return currObj;
  224. }
  225. }
  226. else {
  227. reader.skipValue();
  228. }//TESTED
  229. }//TESTED
  230. reader.endArray();
  231. }
  232. else {
  233. reader.skipValue();
  234. }//TESTED
  235. }
  236. else {
  237. reader.skipValue();
  238. }//TESTED
  239. }
  240. }//(end loop over reader)
  241. reader.endObject();
  242. _inSecondaryObject = false;
  243. return null;
  244. } //TESTED
  245. ////////////////////////////////////////////////////////////////////////////////
  246. // Utility - Check the object is well formed
  247. private boolean checkIfMandatoryFieldsExist(JsonElement meta) {
  248. if ((null != this.fieldsThatNeedToExist) && !this.fieldsThatNeedToExist.isEmpty())
  249. {
  250. boolean fieldsExist = false;
  251. for (String field: this.fieldsThatNeedToExist) {
  252. String exists = getKey(meta, field, false);
  253. if (null != exists) {
  254. fieldsExist = true;
  255. break;
  256. }
  257. }
  258. return fieldsExist;
  259. }
  260. return true;
  261. }//TESTED
  262. /////////////////////////////////
  263. // Utility - get the primary key (does handle recursion)
  264. private String getPrimaryKey(JsonElement meta) {
  265. return getKey(meta, primaryKey, true);
  266. }
  267. /////////////////////////////////
  268. // Utility - create document set
  269. private BasicDBObject convertJsonToDocument(JsonElement meta) {
  270. // Check if all required fields exist:
  271. if (!checkIfMandatoryFieldsExist(meta)) {
  272. return null;
  273. }
  274. //TESTED
  275. // Primary key and create doc
  276. BasicDBObject currObj = new BasicDBObject();
  277. if ((null != primaryKey) && (null != sourceName)) {
  278. String primaryKey = getPrimaryKey(meta);
  279. if (null != primaryKey) {
  280. currObj.put(DocumentPojo.url_, sourceName + primaryKey);
  281. }
  282. }
  283. if (meta.isJsonObject()) {
  284. currObj.put(DocumentPojo.metadata_, new BasicDBObject("json", Arrays.asList(convertJsonObjectToBson(meta.getAsJsonObject()))));
  285. }
  286. return currObj;
  287. } //TESTED
  288. // Utility - get an arbitrary key (does handle recursion)
  289. private String getKey(JsonElement meta, String key, boolean bPrimitiveOnly) {
  290. try {
  291. String[] components = key.split("\\.");
  292. JsonObject metaObj = meta.getAsJsonObject();
  293. for (String comp: components) {
  294. meta = metaObj.get(comp);
  295. if (null == meta) {
  296. return null;
  297. }//TESTED
  298. else if (meta.isJsonObject()) {
  299. metaObj = meta.getAsJsonObject();
  300. }//TESTED
  301. else if (meta.isJsonPrimitive()) {
  302. return meta.getAsString();
  303. }//TESTED
  304. else if (bPrimitiveOnly) { // (meta isn't allowed to be an array, then you'd have too many primary keys!)
  305. return null;
  306. }//TOTEST (? - see JsonToMetadataParser)
  307. else { // Check with first instance
  308. JsonArray array = meta.getAsJsonArray();
  309. meta = array.get(0);
  310. if (meta.isJsonObject()) {
  311. metaObj = meta.getAsJsonObject();
  312. }
  313. }//TESTED
  314. }
  315. if (!bPrimitiveOnly) { // allow objects, we just care if the field exists...
  316. if (null != metaObj) {
  317. return "[Object]";
  318. }
  319. }//TESTED
  320. }
  321. catch (Exception e) {} // no primary key
  322. return null;
  323. }
  324. //(TEST status unknown - see JsonToMetadataParser)
  325. /////////////////////////////////
  326. // Utility - conversion
  327. /**
  328. * Converts a JsonObject to a LinkedHashMap.
  329. * @param json JSONObject to convert
  330. */
  331. static private int capacity(int expectedSize) {
  332. if (expectedSize < 3) {
  333. return expectedSize + 1;
  334. }
  335. return expectedSize + expectedSize / 3;
  336. }
  337. static public BasicDBObject convertJsonObjectToBson(JsonObject json)
  338. {
  339. return convertJsonObjectToBson(json, false);
  340. }
  341. static public BasicDBObject convertJsonObjectToBson(JsonObject json, boolean bHtmlUnescape)
  342. {
  343. int length = json.entrySet().size();
  344. BasicDBObject list = new BasicDBObject(capacity(length));
  345. for (Map.Entry<String, JsonElement> jsonKeyEl: json.entrySet())
  346. {
  347. JsonElement jsonEl = jsonKeyEl.getValue();
  348. if (jsonEl.isJsonArray()) {
  349. list.put(jsonKeyEl.getKey(), handleJsonArray(jsonEl.getAsJsonArray(), bHtmlUnescape));
  350. }
  351. else if (jsonEl.isJsonObject()) {
  352. list.put(jsonKeyEl.getKey(), convertJsonObjectToBson(jsonEl.getAsJsonObject(), bHtmlUnescape));
  353. }
  354. else if (jsonEl.isJsonPrimitive()) {
  355. if (bHtmlUnescape) {
  356. list.put(jsonKeyEl.getKey(), StringEscapeUtils.unescapeHtml(jsonEl.getAsString()));
  357. }
  358. else {
  359. list.put(jsonKeyEl.getKey(), jsonEl.getAsString());
  360. }
  361. }
  362. }
  363. if (list.size() > 0)
  364. {
  365. return list;
  366. }
  367. return null;
  368. }
  369. //TESTED
  370. static private Object[] handleJsonArray(JsonArray jarray, boolean bHtmlUnescape)
  371. {
  372. Object o[] = new Object[jarray.size()];
  373. for (int i = 0; i < jarray.size(); i++)
  374. {
  375. JsonElement jsonEl = jarray.get(i);
  376. if (jsonEl.isJsonObject()) {
  377. o[i] = convertJsonObjectToBson(jsonEl.getAsJsonObject(), bHtmlUnescape);
  378. }
  379. if (jsonEl.isJsonArray()) {
  380. o[i] = handleJsonArray(jsonEl.getAsJsonArray(), bHtmlUnescape);
  381. }
  382. else if (jsonEl.isJsonPrimitive()) {
  383. if (bHtmlUnescape) {
  384. o[i] = StringEscapeUtils.unescapeHtml(jsonEl.getAsString());
  385. }
  386. else {
  387. o[i] = jsonEl.getAsString();
  388. }
  389. }
  390. }
  391. return o;
  392. }
  393. //TESTED
  394. }