PageRenderTime 69ms CodeModel.GetById 29ms RepoModel.GetById 0ms app.codeStats 0ms

/core/infinit.e.data_model/src/com/ikanow/infinit/e/data_model/store/document/DocumentPojo.java

https://github.com/IKANOW/Infinit.e
Java | 724 lines | 502 code | 102 blank | 120 comment | 64 complexity | c014265c2180366d54e96bb5ddb562a9 MD5 | raw file
Possible License(s): BSD-3-Clause
  1. /*******************************************************************************
  2. * Copyright 2012 The Infinit.e Open Source Project
  3. *
  4. * Licensed under the Apache License, Version 2.0 (the "License");
  5. * you may not use this file except in compliance with the License.
  6. * You may obtain a copy of the License at
  7. *
  8. * http://www.apache.org/licenses/LICENSE-2.0
  9. *
  10. * Unless required by applicable law or agreed to in writing, software
  11. * distributed under the License is distributed on an "AS IS" BASIS,
  12. * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  13. * See the License for the specific language governing permissions and
  14. * limitations under the License.
  15. ******************************************************************************/
  16. /**
  17. *
  18. */
  19. package com.ikanow.infinit.e.data_model.store.document;
  20. import java.lang.reflect.Type;
  21. import java.util.Date;
  22. import java.util.HashMap;
  23. import java.util.HashSet;
  24. import java.util.Iterator;
  25. import java.util.LinkedHashMap;
  26. import java.util.List;
  27. import java.util.Map;
  28. import java.util.Map.Entry;
  29. import java.util.Set;
  30. import org.apache.commons.lang.ArrayUtils;
  31. import org.bson.types.ObjectId;
  32. import com.google.gson.GsonBuilder;
  33. import com.google.gson.JsonArray;
  34. import com.google.gson.JsonDeserializationContext;
  35. import com.google.gson.JsonDeserializer;
  36. import com.google.gson.JsonElement;
  37. import com.google.gson.JsonObject;
  38. import com.google.gson.JsonParseException;
  39. import com.google.gson.JsonSerializationContext;
  40. import com.google.gson.JsonSerializer;
  41. import com.google.gson.reflect.TypeToken;
  42. import com.ikanow.infinit.e.data_model.store.BaseDbPojo;
  43. import com.ikanow.infinit.e.data_model.store.MongoDbUtil;
  44. import com.ikanow.infinit.e.data_model.store.config.source.SourcePipelinePojo;
  45. import com.ikanow.infinit.e.data_model.store.config.source.SourcePojo;
  46. import com.mongodb.BasicDBList;
  47. /**
  48. * @author apiggott
  49. * The generic document data model
  50. */
  51. public class DocumentPojo extends BaseDbPojo {
  52. // Standard static function for readability
  53. @SuppressWarnings("unchecked")
  54. static public TypeToken<List<DocumentPojo>> listType() { return new TypeToken<List<DocumentPojo>>(){}; }
  55. //*** IMPORTANT: don't add to this list without considering the ES mapping in DocumentIndexPojoMap
  56. // Storage (Mongo) data model
  57. // API data model is the same except where otherwise specified (DocumentApiPojoMap converts)
  58. // For index data model see DocumentIndexPojoMap
  59. ////////////////////////////////////////////////////////////////////////////////
  60. // Stored Fields:
  61. // Basic metadata
  62. private ObjectId _id = null;
  63. final public static String _id_ = "_id";
  64. // (API-side, this is an immutable id for the doc, DB-side this the DB _id and changes with every update)
  65. private ObjectId updateId = null;
  66. final public static String updateId_ = "updateId";
  67. // (API-side, this is the current DB id, DB-side this is the original _id, or null if this doc has never been updated)
  68. private String title = null;
  69. final public static String title_ = "title";
  70. private String url = null;
  71. final public static String url_ = "url";
  72. private Date created = null;
  73. final public static String created_ = "created";
  74. private Date modified = null;
  75. final public static String modified_ = "modified";
  76. private Date publishedDate = null;
  77. final public static String publishedDate_ = "publishedDate";
  78. // Data source
  79. private String source = null; // (API side is Set<String>)
  80. final public static String source_ = "source";
  81. private String sourceKey = null; // (API side is Set<String>)
  82. final public static String sourceKey_ = "sourceKey";
  83. private String mediaType = null; // (API side is Set<String>)
  84. final public static String mediaType_ = "mediaType";
  85. transient String sourceType = null; //feed, db, or filesys
  86. final public static String sourceType_ = "sourceType";
  87. // Content
  88. private String description = null;
  89. final public static String description_ = "description";
  90. // Enriched content
  91. private List<EntityPojo> entities = null;
  92. final public static String entities_ = "entities";
  93. // (moved metadata to beta because of wholesale changes)
  94. // Data source/Content
  95. private Set<String> tags = null;
  96. final public static String tags_ = "tags";
  97. private String displayUrl = null;
  98. final public static String displayUrl_ = "displayUrl";
  99. // Data source
  100. private ObjectId communityId = null;
  101. final public static String communityId_ = "communityId";
  102. // (note as far as the API is concerned this a Set<String>)
  103. //currently only used for xml files
  104. private String sourceUrl = null;
  105. final public static String sourceUrl_ = "sourceUrl";
  106. // Enriched content
  107. private List<AssociationPojo> associations = null;
  108. final public static String associations_ = "associations";
  109. private LinkedHashMap<String, Object[]> metadata = null; // has to be [] to allow for 1+
  110. final public static String metadata_ = "metadata";
  111. private GeoPojo docGeo = null; // holds the location of the document, if it has one separate to its entities and events
  112. final public static String docGeo_ = "docGeo";
  113. // Mongo/Elasticsearch-specific field
  114. private String index = null; // The name of the index to which the feed's been added
  115. final public static String index_ = "index";
  116. // Only used for query responses
  117. private Object explain = null;
  118. final public static String explain_ = "explain";
  119. /////////////////////////////////////////////////////////////////////////////////////////////////
  120. // The following won't be stored in the DB (either created by index map or transient)
  121. // Alpha unstored (eg index or API fields)
  122. // Content
  123. private String fullText = null;
  124. final public static String fullText_ = "fullText";
  125. // Per query (transient, created on the way to the API for query, not currently stored anywhere)
  126. private Double aggregateSignif; // The document significance normalized against Lucene relevance
  127. final public static String aggregateSignif_ = "aggregateSignif";
  128. private Double queryRelevance; // The Lucene relevance normalized against Infinit.e significance
  129. final public static String queryRelevance_ = "queryRelevance";
  130. private Double score; // The combined scores (vs the query weighting)
  131. final public static String score_ = "score";
  132. // Alpha transient:
  133. private transient String tmpFullText = null; // (temporary storage until obj written to MongoDB)
  134. private transient String rawFullText = null; // (stores a pointer to the first full text set, ie normally directly from URL/file)
  135. // Beta unstored (eg index or API fields)
  136. // Index-specific fields (ElasticSearch):
  137. private Set<String> locs = null;
  138. final public static String locs_ = "locs";
  139. @SuppressWarnings("unused")
  140. private List<GeoPojo> timeRanges = null; // (won't be used for beta - allow encapsulation of time ranges as 2d points)
  141. final public static String timeRanges_ = "timeRanges";
  142. private Set<Integer> months = null; // (dates represented as YYYYMM - used to generate histograms, nothing else)
  143. final public static String months_ = "months";
  144. // Beta transient:
  145. private transient SourcePojo _source = null; // (handy accessor for the "parent" source info)
  146. //header & Footer Data (doesn't persist in the DB - used for extraction and enrichment)
  147. private transient int headerEndIndex = 0; // (obv starts at 0)
  148. private transient int footerStartIndex = Integer.MAX_VALUE; // (obv ends at the end of the document)
  149. private transient Set<String> headerFields = null;
  150. private transient Set<String> footerFields = null;
  151. private transient String headerText = null; // (\n-separated list of headerFields)
  152. private transient String footerText = null; // (\n-separated list of headerFields)
  153. // V0 transient
  154. // multi-community/source handling
  155. private transient String duplicateFrom = null; // Indicates this document should be cloned from the DB entry with matching URL, "duplicateFrom" source
  156. private transient DocumentPojo cloneFrom = null; // Indicate this document should be cloned from the "cloneFrom" in memory copy after enrichment
  157. private transient SourcePipelinePojo spawnedFrom = null; // Indicates this document was spawned from a "document splitter" (so should ignore previous pipeline elements)
  158. private transient boolean hasDefaultUrl = false; // (for files only) if true then can skip an extra dedup step
  159. ////////////////////////////////////////////////////////////////////////////////
  160. // Alpha gets and sets
  161. public DocumentPojo()
  162. {
  163. }
  164. public ObjectId getId() {
  165. return _id;
  166. }
  167. public void setId(ObjectId _id) {
  168. this._id = _id;
  169. }
  170. public String getTitle() {
  171. return title;
  172. }
  173. public void setTitle(String title) {
  174. this.title = title;
  175. }
  176. public String getDescription() {
  177. return description;
  178. }
  179. public void setDescription(String description) {
  180. this.description = description;
  181. }
  182. public String getUrl() {
  183. return url;
  184. }
  185. public void setUrl(String url) {
  186. this.url = url;
  187. }
  188. /**
  189. * @param created the created to set
  190. */
  191. public Date getCreated() {
  192. return this.created;
  193. }
  194. public void setCreated(Date created) {
  195. this.created = created;
  196. }
  197. public Date getModified() {
  198. return this.modified;
  199. }
  200. public void setModified(Date modified) {
  201. this.modified = modified;
  202. }
  203. public Date getPublishedDate() {
  204. return this.publishedDate;
  205. }
  206. public void setPublishedDate(Date publishedDate) {
  207. this.publishedDate = publishedDate;
  208. }
  209. public String getSource() {
  210. return source;
  211. }
  212. public void setSource(String source) {
  213. this.source = source;
  214. }
  215. public String getSourceKey() {
  216. return sourceKey;
  217. }
  218. public void setSourceKey(String sourceKey) {
  219. this.sourceKey = sourceKey;
  220. }
  221. public void setEntities(List<EntityPojo> entities) {
  222. this.entities = entities;
  223. }
  224. public List<EntityPojo> getEntities() {
  225. return entities;
  226. }
  227. public String getMediaType() {
  228. return mediaType;
  229. }
  230. public void setMediaType(String mediaType) {
  231. this.mediaType = mediaType;
  232. }
  233. public String getFullText() {
  234. return (null == fullText)?tmpFullText:fullText;
  235. }
  236. public void setFullText(String fullText) {
  237. if (null == this.rawFullText) { // very first time, set the raw full text
  238. rawFullText = fullText;
  239. }
  240. this.tmpFullText = fullText;
  241. }
  242. public void makeFullTextNonTransient() {
  243. this.fullText = this.tmpFullText;
  244. }
  245. // This is used for convenience, also used as a hacky flag to spot update documents
  246. // that have been discarded from the update list.
  247. public SourcePojo getTempSource() { return _source; }
  248. public void setTempSource(SourcePojo tempSource) { _source = tempSource; }
  249. ////////////////////////////////////////////////////////////////////////////////
  250. // Alpha utility
  251. ////////////////////////////////////////////////////////////////////////////////
  252. // Beta gets and sets
  253. public void setAssociations(List<AssociationPojo> events)
  254. {
  255. this.associations = events;
  256. }
  257. public List<AssociationPojo> getAssociations()
  258. {
  259. return this.associations;
  260. }
  261. public void addToMetadata(String fieldName, Object fieldVal) {
  262. if (null == metadata) {
  263. metadata = new LinkedHashMap<String, Object[]>();
  264. }
  265. Object obj[] = new Object[1]; obj[0] = fieldVal;
  266. Object[] current = metadata.get(fieldName);
  267. if (null != current) {
  268. metadata.put(fieldName, ArrayUtils.add(current, obj));
  269. }
  270. else {
  271. metadata.put(fieldName, obj);
  272. }
  273. }
  274. public void addToMetadata(String fieldName, Object[] fieldVals) {
  275. if (null == metadata) {
  276. metadata = new LinkedHashMap<String, Object[]>();
  277. }
  278. Object[] current = metadata.get(fieldName);
  279. if (null != current) {
  280. metadata.put(fieldName, ArrayUtils.addAll(current, fieldVals));
  281. }
  282. else {
  283. metadata.put(fieldName, fieldVals);
  284. }
  285. }
  286. public void setMetadata(LinkedHashMap<String, Object[]> metadata)
  287. {
  288. this.metadata = metadata;
  289. }
  290. public LinkedHashMap<String, Object[]> getMetadata()
  291. {
  292. return this.metadata;
  293. }
  294. public LinkedHashMap<String, Object[]> getMetaData() {
  295. return metadata;
  296. }
  297. public Set<String> getTags() {
  298. return tags;
  299. }
  300. public void setTags(Set<String> tags_) {
  301. tags = tags_;
  302. }
  303. public void addTags(Set<String> tags_) {
  304. tags.addAll(tags_);
  305. }
  306. public void setCommunityId(ObjectId communityId) {
  307. this.communityId = communityId;
  308. }
  309. public ObjectId getCommunityId() {
  310. return this.communityId;
  311. }
  312. public GeoPojo getDocGeo() {
  313. return docGeo;
  314. }
  315. public void setDocGeo(GeoPojo docGeo) {
  316. this.docGeo = GeoPojo.cleanseBadGeotag(docGeo);
  317. }
  318. /**
  319. * @param locs the locs to set
  320. */
  321. public void setLocs(Set<String> locs) {
  322. this.locs = locs;
  323. }
  324. /**
  325. * @return the locs
  326. */
  327. public Set<String> getLocs() {
  328. return locs;
  329. }
  330. /**
  331. * @param months the months to set
  332. */
  333. public void setMonths(Set<Integer> months) {
  334. this.months = months;
  335. }
  336. /**
  337. * @return the months
  338. */
  339. public Set<Integer> getMonths() {
  340. return months;
  341. }
  342. /**
  343. * @param sourceUrl the sourceUrl to set
  344. */
  345. public void setSourceUrl(String sourceUrl) {
  346. this.sourceUrl = sourceUrl;
  347. }
  348. /**
  349. * @return the sourceUrl
  350. */
  351. public String getSourceUrl() {
  352. return sourceUrl;
  353. }
  354. /**
  355. * @return the index
  356. */
  357. public String getIndex() {
  358. return index;
  359. }
  360. /**
  361. * @param index the index to set
  362. */
  363. public void setIndex(String index) {
  364. this.index = index;
  365. }
  366. ////////////////////////////////////////////////////////////////////////////////
  367. // Beta utility
  368. // Add the metadata as separate lines to perform extraction on them
  369. public String metaDataToText() {
  370. StringBuffer sb = new StringBuffer();
  371. for ( Object md : metadata.values())
  372. {
  373. sb.append(md).append('\n');
  374. }
  375. return sb.toString();
  376. }//TOTEST - to be done during DB integration
  377. ////////////////////////////////////////////////////////////////////////////////
  378. //(Still beta) Header Footer Stuff ... can be used by entity extractors
  379. /**
  380. * @return the headerStartIndex
  381. */
  382. public int getHeaderEndIndex() {
  383. return headerEndIndex;
  384. }
  385. /**
  386. * @param headerStartIndex the headerStartIndex to set
  387. */
  388. public void setHeaderEndIndex(int headerEndIndex) {
  389. this.headerEndIndex = headerEndIndex;
  390. }
  391. /**
  392. * @return the footerStartIndex
  393. */
  394. public int getFooterStartIndex() {
  395. return footerStartIndex;
  396. }
  397. /**
  398. * @param footerEndIndex the footerEndIndex to set
  399. */
  400. public void setFooterStartIndex(int footerStartIndex) {
  401. this.footerStartIndex = footerStartIndex;
  402. }
  403. public void addToHeader(String sHeaderField) {
  404. if (headerFields == null)
  405. headerFields = new HashSet<String>();
  406. headerFields.add(sHeaderField.toLowerCase());
  407. }
  408. public void addToFooter(String sFooterField) {
  409. if (footerFields == null)
  410. footerFields = new HashSet<String>();
  411. footerFields.add(sFooterField.toLowerCase());
  412. }
  413. public Set<String> getHeaderFields() {
  414. return headerFields;
  415. }
  416. public Set<String> getFooterFields() {
  417. return footerFields;
  418. }
  419. public String getHeader() {
  420. if (null == headerFields) {
  421. return "";
  422. }
  423. return headerText;
  424. }
  425. public String getFooter() {
  426. if (null == footerFields) {
  427. return "";
  428. }
  429. return footerText;
  430. }
  431. public String getBody() {
  432. if (null == getFullText())
  433. {
  434. return null;
  435. }
  436. else{
  437. if (footerStartIndex == Integer.MAX_VALUE && headerEndIndex == 0 )
  438. {
  439. return getFullText();
  440. }
  441. else if (footerStartIndex > getFullText().length()) {
  442. return getFullText().substring(headerEndIndex);
  443. }
  444. else {
  445. return getFullText().substring(headerEndIndex, footerStartIndex);
  446. }
  447. }
  448. }
  449. ////////////////////////////////////////////////////////////////////////////////
  450. // V0 gets and sets
  451. public void setDuplicateFrom(String sourceKey) {
  452. duplicateFrom = sourceKey;
  453. }
  454. public String getDuplicateFrom() {
  455. return duplicateFrom;
  456. }
  457. public void setCloneFrom(DocumentPojo masterClone) {
  458. cloneFrom = masterClone;
  459. }
  460. public DocumentPojo getCloneFrom() {
  461. return cloneFrom;
  462. }
  463. ////////////////////////////////////////////////////////////////////////////////
  464. ////////////////////////////////////////////////////////////////////////////////
  465. // Base overrides:
  466. public GsonBuilder extendBuilder(GsonBuilder gp) {
  467. return gp.registerTypeAdapter(DocumentPojo.class, new DocumentPojoDeserializer()).
  468. registerTypeAdapter(DocumentPojo.class, new DocumentPojoSerializer());
  469. }
  470. protected static class DocumentPojoSerializer implements JsonSerializer<DocumentPojo>
  471. {
  472. @Override
  473. public JsonElement serialize(DocumentPojo doc, Type typeOfT, JsonSerializationContext context)
  474. {
  475. // GSON transformation:
  476. JsonElement je = DocumentPojo.getDefaultBuilder().create().toJsonTree(doc, typeOfT);
  477. // Convert object names in metadata
  478. if ((null != doc.getMetadata()) && !doc.getMetadata().isEmpty()) {
  479. if (je.isJsonObject()) {
  480. JsonElement metadata = je.getAsJsonObject().get("metadata");
  481. if (null != metadata) {
  482. enforceTypeNamingPolicy(metadata, 0);
  483. }
  484. }
  485. }
  486. return je;
  487. }
  488. //////////////////////////////////////////////////////////////////////////////////////////
  489. // Utility function for encoding "."s and "%"s (also duplicate in index)
  490. private static boolean enforceTypeNamingPolicy(JsonElement je, int nDepth) {
  491. if (je.isJsonPrimitive()) {
  492. return false; // Done
  493. }
  494. else if (je.isJsonArray()) {
  495. JsonArray ja = je.getAsJsonArray();
  496. if (0 == ja.size()) {
  497. return false; // No idea, carry on
  498. }
  499. JsonElement jaje = ja.get(0);
  500. return enforceTypeNamingPolicy(jaje, nDepth + 1); // keep going until you find primitive/object
  501. }
  502. else if (je.isJsonObject()) {
  503. JsonObject jo = je.getAsJsonObject();
  504. // Nested variables:
  505. Iterator<Entry<String, JsonElement>> it = jo.entrySet().iterator();
  506. Map<String, JsonElement> toFixList = null;
  507. while (it.hasNext()) {
  508. boolean bFix = false;
  509. Entry<String, JsonElement> el = it.next();
  510. String currKey = el.getKey();
  511. if ((currKey.indexOf('.') >= 0) || (currKey.indexOf('%') >= 0)) {
  512. it.remove();
  513. currKey = currKey.replace("%", "%25").replace(".", "%2e");
  514. bFix = true;
  515. }
  516. if (null == el.getValue()) {
  517. if (!bFix) it.remove(); // nice easy case, just get rid of it (if bFix, it's already removed)
  518. bFix = false;
  519. }
  520. else {
  521. enforceTypeNamingPolicy(el.getValue(), nDepth + 1);
  522. }
  523. if (bFix) {
  524. if (null == toFixList) {
  525. toFixList = new HashMap<String, JsonElement>();
  526. }
  527. toFixList.put(currKey, el.getValue());
  528. }
  529. } // (end loop over params)
  530. if (null != toFixList) {
  531. for (Entry<String, JsonElement> el: toFixList.entrySet()) {
  532. jo.add(el.getKey(), el.getValue());
  533. }
  534. }
  535. return true; // (in any case, I get renamed by calling parent)
  536. }
  537. return false;
  538. }
  539. //TESTED (see DOC_META in test/TestCode)
  540. }
  541. protected static class DocumentPojoDeserializer implements JsonDeserializer<DocumentPojo>
  542. {
  543. @Override
  544. public DocumentPojo deserialize(JsonElement json, Type typeOfT, JsonDeserializationContext context) throws JsonParseException
  545. {
  546. JsonObject metadata = json.getAsJsonObject().getAsJsonObject("metadata");
  547. if (null != metadata) {
  548. json.getAsJsonObject().remove("metadata");
  549. }
  550. DocumentPojo doc = BaseDbPojo.getDefaultBuilder().create().fromJson(json, DocumentPojo.class);
  551. if (null != metadata) {
  552. for (Entry<String, JsonElement> entry: metadata.entrySet()) {
  553. if (entry.getValue().isJsonArray()) {
  554. doc.addToMetadata(entry.getKey(), MongoDbUtil.encodeArray(entry.getValue().getAsJsonArray()).toArray());
  555. }
  556. else {
  557. BasicDBList dbl = new BasicDBList();
  558. dbl.add(MongoDbUtil.encodeUnknown(entry.getValue()));
  559. doc.addToMetadata(entry.getKey(), dbl);
  560. }
  561. }//TESTED
  562. }
  563. return doc;
  564. }
  565. }
  566. ////////////////////////////////////////////////////////////////////////////////
  567. // Per query (transient, created on the way to the API for query, not currently stored anywhere)
  568. public Double getAggregateSignif() {
  569. return aggregateSignif;
  570. }
  571. public void setAggregateSignif(Double aggregateSignif) {
  572. this.aggregateSignif = aggregateSignif;
  573. }
  574. public Double getQueryRelevance() {
  575. return queryRelevance;
  576. }
  577. public void setQueryRelevance(Double queryRelevance) {
  578. this.queryRelevance = queryRelevance;
  579. }
  580. public Double getScore() {
  581. return score;
  582. }
  583. public void setScore(Double score) {
  584. this.score = score;
  585. }
  586. public void setUpdateId(ObjectId updateId) {
  587. this.updateId = updateId;
  588. }
  589. public ObjectId getUpdateId() {
  590. return updateId;
  591. }
  592. public void setDisplayUrl(String displayUrl) {
  593. this.displayUrl = displayUrl;
  594. }
  595. public String getDisplayUrl() {
  596. return displayUrl;
  597. }
  598. public void setExplain(Object explain) {
  599. this.explain = explain;
  600. }
  601. public Object getExplain() {
  602. return explain;
  603. }
  604. public void resetRawFullText() {
  605. this.rawFullText = null;
  606. }
  607. public String getRawFullText() {
  608. return rawFullText;
  609. }
  610. public SourcePipelinePojo getSpawnedFrom() {
  611. return spawnedFrom;
  612. }
  613. public void setSpawnedFrom(SourcePipelinePojo spawnedFrom) {
  614. this.spawnedFrom = spawnedFrom;
  615. }
  616. public boolean getHasDefaultUrl() {
  617. return hasDefaultUrl;
  618. }
  619. public void setHasDefaultUrl(boolean hasDefaultUrl) {
  620. this.hasDefaultUrl = hasDefaultUrl;
  621. }
  622. }