/core/infinit.e.data_model/src/com/ikanow/infinit/e/data_model/custom/InfiniteFileInputJsonParser.java
Java | 464 lines | 343 code | 70 blank | 51 comment | 119 complexity | d44a3389bcb14f06476327c35f2bdc1d MD5 | raw file
Possible License(s): BSD-3-Clause
- /*******************************************************************************
- * Copyright 2012 The Infinit.e Open Source Project
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- ******************************************************************************/
- package com.ikanow.infinit.e.data_model.custom;
- import java.io.IOException;
- import java.io.InputStream;
- import java.io.InputStreamReader;
- import java.util.Arrays;
- import java.util.HashSet;
- import java.util.Map;
- import org.apache.commons.lang.StringEscapeUtils;
- import org.bson.BSONObject;
- import com.google.gson.JsonArray;
- import com.google.gson.JsonElement;
- import com.google.gson.JsonObject;
- import com.google.gson.JsonParser;
- import com.google.gson.stream.JsonReader;
- import com.google.gson.stream.JsonToken;
- import com.ikanow.infinit.e.data_model.store.config.source.SourceFileConfigPojo;
- import com.ikanow.infinit.e.data_model.store.document.DocumentPojo;
- import com.mongodb.BasicDBObject;
- //(taken from com.ikanow.infinit.e.harvest.extraction.document.file.JsonToMetadataParser)
- public class InfiniteFileInputJsonParser implements InfiniteFileInputParser {
- private HashSet<String> objectIdentifiers = new HashSet<String>();
- private HashSet<String> recursiveObjectIdentifiers = new HashSet<String>();
- private HashSet<String> fieldsThatNeedToExist = new HashSet<String>();
- private String primaryKey = null;
- private String sourceName = null;
- private boolean bRecurse = false;
-
- private JsonParser parser = null;
- private JsonReader reader = null;
-
- private JsonArray _secondaryArray = null;
- private int _posInSecondaryArray = 0;
-
- ///////////////////////////////////////////////////////////////
-
- // INTERFACE CODE
-
- @Override
- public InfiniteFileInputParser initialize(InputStream inStream,
- SourceFileConfigPojo fileConfig) throws IOException {
- this.primaryKey = fileConfig.XmlPrimaryKey;
- this.sourceName = fileConfig.XmlSourceName;
- if (null != fileConfig.XmlRootLevelValues) {
- for (String objectId: fileConfig.XmlRootLevelValues) {
- if (objectId.startsWith("*")) {
- this.bRecurse = true;
- this.recursiveObjectIdentifiers.add(objectId.substring(1).toLowerCase());
- throw new RuntimeException("JSON metadata parser: Don't currently support recursive parsing.");
- //TODO (INF-2469): Not currently supported, it gets a bit tricky?
- }//TESTED
- this.objectIdentifiers.add(objectId.toLowerCase());
- }
- }
- if (null != fileConfig.XmlIgnoreValues) {
- this.fieldsThatNeedToExist.addAll(fileConfig.XmlIgnoreValues);
- }
- reader = new JsonReader(new InputStreamReader(inStream, "UTF-8"));
- reader.setLenient(true);
- parser = new JsonParser();
-
- return this;
- }
- @Override
- public BSONObject getNextRecord() throws IOException {
- if (null != _secondaryArray) {
- for (; _posInSecondaryArray < _secondaryArray.size(); ) {
- JsonElement meta2 = _secondaryArray.get(_posInSecondaryArray);
- _posInSecondaryArray++;
-
- BasicDBObject currObj = convertJsonToDocument(meta2);
- if (null != currObj) {
- return currObj;
- }
- }
- _secondaryArray = null;
- }//TESTED
- return parseDocument();
- }
- @Override
- public void close() throws IOException {
- if (null != reader) reader.close();
- }
- @Override
- public String getCanonicalExtension() {
- return ".json";
- }
- ///////////////////////////////////////////////////////////////
-
- // PROCESSING CODE
-
- private boolean _inTopLevelArray = false;
- private JsonToken tok = JsonToken.BEGIN_OBJECT;
-
- public BasicDBObject parseDocument() throws IOException {
-
- // Different cases:
- // {}
- // ^^ many of these
- // [ {}, {}, {} ]
- // For each of these 2/3 cases, you might either want to grab the entire object, or a field
- // within the object
-
- try {
- while (true) { // (use exceptions to get outta here)
- try {
- tok = reader.peek();
- }
- catch (Exception e) {
- // EOF or end of object, keep going and find out...
- tok = reader.peek();
- }
- //TESTED
-
- if (JsonToken.BEGIN_ARRAY == tok) {
- if (!_inTopLevelArray) {
- reader.beginArray();
- _inTopLevelArray = true;
- }
- if (objectIdentifiers.isEmpty()) {
- while (reader.hasNext()) {
- JsonElement meta = parser.parse(reader);
- BasicDBObject currObj = convertJsonToDocument(meta);
-
- if (null != currObj) {
- return currObj;
- }//(else carry on...)
- }
- }//TESTED
- else {
- while (reader.hasNext()) {
- BasicDBObject currObj = getDocumentFromJson(false);
- if (null != currObj) {
- return currObj;
- }//(else carry on...)
- }
- }//TESTED
- }
- else if (JsonToken.BEGIN_OBJECT == tok) {
- if (objectIdentifiers.isEmpty()) {
- JsonElement meta = parser.parse(reader);
- BasicDBObject currObj = convertJsonToDocument(meta);
-
- if (null != currObj) {
- return currObj;
- }//(else carry on...)
-
- }//TESTED (single and multiple doc case)
- else {
- BasicDBObject currObj = getDocumentFromJson(false);
- if (null != currObj) {
- return currObj;
- }//(else carry on...)
-
- }//TESTED (single and multiple doc case)
- }
- else if ((JsonToken.END_DOCUMENT == tok) || (JsonToken.END_ARRAY == tok) || (JsonToken.END_OBJECT == tok)) {
- return null;
- }
- else { // Must be recursing through the next level(s)
- BasicDBObject currObj = getDocumentFromJson(false);
- if (null != currObj) {
- return currObj;
- }//(else carry on...)
- }
- } // (end loop forever - exception out)
- }
- catch (Exception e) {} // This is our EOF
-
- return null;
- }
-
- ////////////////////////////
- // Look into the JSON object and find the object with the specified name
- // (for now the "path" is ignored - maybe later we allow "x.y" terminology)
-
- private boolean _inSecondaryObject = false;
-
- private BasicDBObject getDocumentFromJson(boolean bRecursing) throws IOException {
- if (!_inSecondaryObject) {
- reader.beginObject();
- _inSecondaryObject = true;
- }
-
- while (reader.hasNext()) {
- String name = reader.nextName();
-
- boolean bMatch = false;
- if (bRecursing) {
- bMatch = recursiveObjectIdentifiers.contains(name.toLowerCase());
- }
- else {
- bMatch = objectIdentifiers.contains(name.toLowerCase());
- }//TESTED
-
- if (bMatch) {
- JsonElement meta = parser.parse(reader);
-
- if (meta.isJsonObject()) {
-
- BasicDBObject currObj = convertJsonToDocument(meta);
- if (null != currObj) {
- return currObj;
- }
- }//TESTED
- else if (meta.isJsonArray()) {
- _secondaryArray = meta.getAsJsonArray();
- _posInSecondaryArray = 0;
- for (JsonElement meta2: _secondaryArray) {
- _posInSecondaryArray++;
- BasicDBObject currObj = convertJsonToDocument(meta2);
- if (null != currObj) {
- return currObj;
- }
- }
- _secondaryArray = null;
- }//TESTED
-
- }//TESTED
- else {
- if (bRecurse) { //TODO (INF-2469): Not currently supported, it gets a bit tricky? (need to convert to a stack)
-
- JsonToken tok = reader.peek();
- if (JsonToken.BEGIN_OBJECT == tok) {
- BasicDBObject currObj = getDocumentFromJson(true);
- if (null != currObj) {
- return currObj;
- }
- }//TESTED
- else if (JsonToken.BEGIN_ARRAY == tok) {
- reader.beginArray();
- while (reader.hasNext()) {
- JsonToken tok2 = reader.peek();
-
- if (JsonToken.BEGIN_OBJECT == tok2) {
- BasicDBObject currObj = getDocumentFromJson(true);
- if (null != currObj) {
- return currObj;
- }
- }
- else {
- reader.skipValue();
- }//TESTED
-
- }//TESTED
- reader.endArray();
- }
- else {
- reader.skipValue();
- }//TESTED
- }
- else {
- reader.skipValue();
- }//TESTED
- }
-
- }//(end loop over reader)
-
- reader.endObject();
- _inSecondaryObject = false;
-
- return null;
-
- } //TESTED
- ////////////////////////////////////////////////////////////////////////////////
- // Utility - Check the object is well formed
-
- private boolean checkIfMandatoryFieldsExist(JsonElement meta) {
- if ((null != this.fieldsThatNeedToExist) && !this.fieldsThatNeedToExist.isEmpty())
- {
- boolean fieldsExist = false;
-
- for (String field: this.fieldsThatNeedToExist) {
- String exists = getKey(meta, field, false);
- if (null != exists) {
- fieldsExist = true;
- break;
- }
- }
- return fieldsExist;
- }
- return true;
- }//TESTED
-
- /////////////////////////////////
-
- // Utility - get the primary key (does handle recursion)
-
- private String getPrimaryKey(JsonElement meta) {
- return getKey(meta, primaryKey, true);
- }
-
- /////////////////////////////////
-
- // Utility - create document set
-
- private BasicDBObject convertJsonToDocument(JsonElement meta) {
-
- // Check if all required fields exist:
- if (!checkIfMandatoryFieldsExist(meta)) {
- return null;
- }
- //TESTED
-
- // Primary key and create doc
- BasicDBObject currObj = new BasicDBObject();
- if ((null != primaryKey) && (null != sourceName)) {
- String primaryKey = getPrimaryKey(meta);
- if (null != primaryKey) {
- currObj.put(DocumentPojo.url_, sourceName + primaryKey);
- }
- }
-
- if (meta.isJsonObject()) {
- currObj.put(DocumentPojo.metadata_, new BasicDBObject("json", Arrays.asList(convertJsonObjectToBson(meta.getAsJsonObject()))));
- }
- return currObj;
-
- } //TESTED
-
- // Utility - get an arbitrary key (does handle recursion)
-
- private String getKey(JsonElement meta, String key, boolean bPrimitiveOnly) {
- try {
- String[] components = key.split("\\.");
- JsonObject metaObj = meta.getAsJsonObject();
- for (String comp: components) {
- meta = metaObj.get(comp);
-
- if (null == meta) {
- return null;
- }//TESTED
- else if (meta.isJsonObject()) {
- metaObj = meta.getAsJsonObject();
- }//TESTED
- else if (meta.isJsonPrimitive()) {
- return meta.getAsString();
- }//TESTED
- else if (bPrimitiveOnly) { // (meta isn't allowed to be an array, then you'd have too many primary keys!)
- return null;
- }//TOTEST (? - see JsonToMetadataParser)
- else { // Check with first instance
- JsonArray array = meta.getAsJsonArray();
- meta = array.get(0);
- if (meta.isJsonObject()) {
- metaObj = meta.getAsJsonObject();
- }
- }//TESTED
- }
- if (!bPrimitiveOnly) { // allow objects, we just care if the field exists...
- if (null != metaObj) {
- return "[Object]";
- }
- }//TESTED
- }
- catch (Exception e) {} // no primary key
-
- return null;
- }
- //(TEST status unknown - see JsonToMetadataParser)
- /////////////////////////////////
- // Utility - conversion
- /**
- * Converts a JsonObject to a LinkedHashMap.
- * @param json JSONObject to convert
- */
- static private int capacity(int expectedSize) {
- if (expectedSize < 3) {
- return expectedSize + 1;
- }
- return expectedSize + expectedSize / 3;
- }
- static public BasicDBObject convertJsonObjectToBson(JsonObject json)
- {
- return convertJsonObjectToBson(json, false);
- }
- static public BasicDBObject convertJsonObjectToBson(JsonObject json, boolean bHtmlUnescape)
- {
- int length = json.entrySet().size();
- BasicDBObject list = new BasicDBObject(capacity(length));
- for (Map.Entry<String, JsonElement> jsonKeyEl: json.entrySet())
- {
- JsonElement jsonEl = jsonKeyEl.getValue();
- if (jsonEl.isJsonArray()) {
- list.put(jsonKeyEl.getKey(), handleJsonArray(jsonEl.getAsJsonArray(), bHtmlUnescape));
- }
- else if (jsonEl.isJsonObject()) {
- list.put(jsonKeyEl.getKey(), convertJsonObjectToBson(jsonEl.getAsJsonObject(), bHtmlUnescape));
- }
- else if (jsonEl.isJsonPrimitive()) {
- if (bHtmlUnescape) {
- list.put(jsonKeyEl.getKey(), StringEscapeUtils.unescapeHtml(jsonEl.getAsString()));
- }
- else {
- list.put(jsonKeyEl.getKey(), jsonEl.getAsString());
- }
- }
- }
- if (list.size() > 0)
- {
- return list;
- }
- return null;
- }
- //TESTED
-
- static private Object[] handleJsonArray(JsonArray jarray, boolean bHtmlUnescape)
- {
- Object o[] = new Object[jarray.size()];
- for (int i = 0; i < jarray.size(); i++)
- {
- JsonElement jsonEl = jarray.get(i);
- if (jsonEl.isJsonObject()) {
- o[i] = convertJsonObjectToBson(jsonEl.getAsJsonObject(), bHtmlUnescape);
- }
- if (jsonEl.isJsonArray()) {
- o[i] = handleJsonArray(jsonEl.getAsJsonArray(), bHtmlUnescape);
- }
- else if (jsonEl.isJsonPrimitive()) {
- if (bHtmlUnescape) {
- o[i] = StringEscapeUtils.unescapeHtml(jsonEl.getAsString());
- }
- else {
- o[i] = jsonEl.getAsString();
- }
- }
- }
- return o;
- }
- //TESTED
- }