PageRenderTime 52ms CodeModel.GetById 23ms RepoModel.GetById 0ms app.codeStats 0ms

/lib/connectors/EnvironmentsDataConnector.php

https://github.com/EOL/eol_php_code
PHP | 236 lines | 211 code | 15 blank | 10 comment | 30 complexity | 3467a037552dd34e3986bdf9b055fcba MD5 | raw file
  1. <?php
  2. namespace php_active_record;
  3. require_library('PreferredEntriesCalculator');
  /**
   * Converts a tab-separated dump of text-mined environment (ENVO) term
   * matches into a content archive of taxa, occurrences and habitat
   * measurements.
   *
   * Each dump line is read as four tab-separated columns:
   * "EOL:<taxon concept id>", subject, matched source text, ENVO term id.
   * Term ids are validated against the classes parsed from the ENVO OWL file,
   * and for every taxon any term that is an ancestor of another matched term
   * is dropped before the archive is written.
   */
  class EnvironmentsDataConnector
  {
    const DUMP_URL = "/Users/pleary/Downloads/eol_section_matches_extracted.txt";
    const ENVO_OWL_URL = "/Users/pleary/Downloads/envo.owl.txt";

    // These were previously created on the fly as dynamic properties, which
    // is deprecated as of PHP 8.2. They stay public because dynamic
    // properties are public and outside code may read them.
    public $resource_id;
    public $path_to_archive_directory;
    public $archive_builder;
    // 'EOL:<id>' => Taxon object already written to the archive
    public $taxon_ids = array();
    // occurrence id => Occurrence object already written to the archive
    public $occurrence_ids = array();
    // taxon concept id => array(term URI => matched source text)
    public $taxon_ids_terms = array();
    // taxon concept id => subject of the last accepted match for that taxon
    public $taxon_subjects = array();
    // reset by build_archive() but not otherwise used in this class
    public $taxon_names = array();
    // taxon concept id => array('scientificName' => ..., 'kingdom' => ..., ...)
    public $taxon_concept_names = array();
    // ENVO class URI => rdfs:label
    public $envo_term_uris = array();
    // ENVO class URI => list of ancestor URIs (subClassOf and part_of, transitive)
    public $envo_term_parents = array();
    // memo used by add_parents_recursively()
    public $processed_uri_parents = array();

    /**
     * @param mixed $resource_id used as the archive sub-directory name under
     *                           CONTENT_RESOURCE_LOCAL_PATH
     */
    public function __construct($resource_id)
    {
      $this->resource_id = $resource_id;
      $this->taxon_ids = array();
    }

    /**
     * Reads the dump file, filters the matched terms and writes the archive.
     *
     * Progress and diagnostics (line counters, invalid URIs, the per-subject
     * tally) are echoed to standard output.
     */
    public function build_archive()
    {
      $this->path_to_archive_directory = CONTENT_RESOURCE_LOCAL_PATH . "/$this->resource_id/";
      $this->archive_builder = new \eol_schema\ContentArchiveBuilder(array('directory_path' => $this->path_to_archive_directory));
      $this->taxon_ids = array();
      $this->occurrence_ids = array();
      $this->taxon_ids_terms = array();
      $this->taxon_subjects = array();
      $this->taxon_names = array();
      $this->prepare_envo_schema();

      $subjects = array();
      $terms_to_skip = array('habitat', 'environmental condition');
      echo "Reading file...\n";
      foreach(new FileIterator(self::DUMP_URL) as $line_number => $line)
      {
        if($line_number % 50000 == 0) echo "line $line_number\n";
        // Pad to four columns so blank or truncated lines produce empty
        // fields instead of undefined-offset notices.
        $line_data = array_pad(explode("\t", $line), 4, '');
        $taxon_concept_id = str_replace('EOL:', '', $line_data[0]);
        $subject = $this->normalize_subject($line_data[1]);
        // The tally counts every line, including ones rejected below.
        if(!isset($subjects[$subject])) $subjects[$subject] = 0;
        $subjects[$subject]++;
        $text = trim($line_data[2]);
        $uri = trim($line_data[3]);
        if(!$uri) continue;
        $uri = "http://purl.obolibrary.org/obo/" . str_replace(":", "_", $uri);
        if(!isset($this->envo_term_uris[$uri]))
        {
          echo "$uri is not valid\n";
          continue;
        }
        $label = $this->envo_term_uris[$uri];
        if(preg_match("/ (feature|region|entity|material|physical)/", $label)) continue;
        if(in_array($label, $terms_to_skip)) continue;
        $this->taxon_ids_terms[$taxon_concept_id][$uri] = $text;
        $this->taxon_subjects[$taxon_concept_id] = $subject;
      }
      arsort($subjects);
      print_r($subjects);

      $this->filter_out_parent_classes();
      $this->lookup_taxon_names();

      $taxa_seen = 0;
      foreach($this->taxon_ids_terms as $taxon_concept_id => $uris)
      {
        if($taxa_seen % 1000 == 0) echo "Inserting taxon $taxa_seen\n";
        $taxa_seen++;
        $taxon = $this->add_taxon($taxon_concept_id);
        // add_taxon() returns false when no scientific name is known. Such a
        // taxon is skipped so that no occurrence or measurement is written
        // without a taxon row to attach to.
        if(!$taxon) continue;
        $occurrence = $this->add_occurrence($taxon);
        $this->add_measurements($taxon_concept_id, $occurrence, $uris);
      }
      $this->archive_builder->finalize(true);
    }

    /**
     * Lower-cases a raw subject, replaces spaces with underscores and maps
     * the known aliases onto the subject names used in the source links.
     *
     * @param string $raw_subject subject column of a dump line
     * @return string
     */
    private function normalize_subject($raw_subject)
    {
      static $aliases = array(
        'taxon_biology' => 'brief_summary',
        'biology' => 'comprehensive_description',
        'general_description' => 'comprehensive_description',
        'description' => 'comprehensive_description');
      $subject = str_replace(' ', '_', strtolower(trim($raw_subject)));
      return isset($aliases[$subject]) ? $aliases[$subject] : $subject;
    }

    /**
     * Writes the taxon for a concept id to the archive, once per id.
     *
     * @param mixed $taxon_concept_id concept id without the 'EOL:' prefix
     * @return \eol_schema\Taxon|false false when no scientific name was found
     *                                 by lookup_taxon_names()
     */
    private function add_taxon($taxon_concept_id)
    {
      $taxon_id = 'EOL:' . $taxon_concept_id;
      if(isset($this->taxon_ids[$taxon_id])) return $this->taxon_ids[$taxon_id];
      if(empty($this->taxon_concept_names[$taxon_concept_id])) return false;
      $names = $this->taxon_concept_names[$taxon_concept_id];
      if(empty($names['scientificName'])) return false;

      $t = new \eol_schema\Taxon();
      $t->taxonID = $taxon_id;
      $t->scientificName = $names['scientificName'];
      // class and order are not looked up or written; only these three ranks are.
      $t->kingdom = isset($names['kingdom']) ? $names['kingdom'] : null;
      $t->phylum = isset($names['phylum']) ? $names['phylum'] : null;
      $t->family = isset($names['family']) ? $names['family'] : null;
      $this->archive_builder->write_object_to_file($t);
      $this->taxon_ids[$taxon_id] = $t;
      return $t;
    }

    /**
     * Looks up names for every taxon concept id collected from the dump,
     * in batches of 10000 ids per query.
     */
    private function lookup_taxon_names()
    {
      foreach(array_chunk(array_keys($this->taxon_ids_terms), 10000) as $batch)
      {
        $this->lookup_taxon_name_batch($batch);
      }
    }

    /**
     * Fills $this->taxon_concept_names for one batch of concept ids: the
     * name of the preferred hierarchy entry becomes scientificName, and the
     * kingdom, phylum and family names are taken from that entry's ancestors.
     *
     * @param array $taxon_concept_ids concept ids as read from the dump file
     */
    private function lookup_taxon_name_batch($taxon_concept_ids)
    {
      // The ids come straight from the dump file and are concatenated into
      // SQL below, so anything that is not purely digits is dropped.
      $safe_ids = array();
      foreach($taxon_concept_ids as $id)
      {
        if(preg_match('/^[0-9]+$/D', (string) $id)) $safe_ids[] = $id;
      }
      // An empty list would produce the invalid SQL "IN ()".
      if(!$safe_ids) return;

      $entry_taxon_concept_ids = array();
      foreach($GLOBALS['db_connection']->iterate("
        SELECT pref.taxon_concept_id, he.id, n.string
        FROM taxon_concept_preferred_entries pref
        JOIN hierarchy_entries he ON (pref.hierarchy_entry_id=he.id)
        LEFT JOIN names n ON (he.name_id=n.id)
        WHERE pref.taxon_concept_id IN (". implode(",", $safe_ids) .")") as $row)
      {
        $entry_taxon_concept_ids[$row['id']] = $row['taxon_concept_id'];
        $this->taxon_concept_names[$row['taxon_concept_id']]['scientificName'] = $row['string'];
      }

      $kingdom_ids = Rank::kingdom_rank_ids();
      $phylum_ids = Rank::phylum_rank_ids();
      $family_ids = Rank::family_rank_ids();
      $all_rank_ids = array_merge($kingdom_ids, $phylum_ids, $family_ids);
      // Same "IN ()" problem: nothing to query when no preferred entries were
      // found for this batch or no rank ids are available.
      if(!$entry_taxon_concept_ids || !$all_rank_ids) return;

      foreach($GLOBALS['db_connection']->iterate("
        SELECT hef.hierarchy_entry_id, he.id, he.rank_id, n.string
        FROM hierarchy_entries_flattened hef
        JOIN hierarchy_entries he ON (hef.ancestor_id=he.id)
        LEFT JOIN names n ON (he.name_id=n.id)
        WHERE hef.hierarchy_entry_id IN (". implode(",", array_keys($entry_taxon_concept_ids)) .")
        AND he.rank_id IN (". implode(",", $all_rank_ids) .")") as $row)
      {
        $taxon_concept_id = $entry_taxon_concept_ids[$row['hierarchy_entry_id']];
        $name_string = $row['string'];
        if(Name::is_surrogate($name_string)) continue;
        // Loose comparison on purpose: rank_id as returned by the database
        // may be a string while the rank id lists may hold integers.
        if(in_array($row['rank_id'], $kingdom_ids)) $this->taxon_concept_names[$taxon_concept_id]['kingdom'] = $name_string;
        elseif(in_array($row['rank_id'], $phylum_ids)) $this->taxon_concept_names[$taxon_concept_id]['phylum'] = $name_string;
        elseif(in_array($row['rank_id'], $family_ids)) $this->taxon_concept_names[$taxon_concept_id]['family'] = $name_string;
      }
    }

    /**
     * Writes the occurrence for a taxon to the archive, once per taxon.
     * The occurrence id is the md5 of the taxon id plus the word 'occurrence'.
     *
     * @param \eol_schema\Taxon $taxon a taxon returned by add_taxon()
     * @return \eol_schema\Occurrence
     */
    private function add_occurrence($taxon)
    {
      $occurrence_id = md5($taxon->taxonID . 'occurrence');
      if(isset($this->occurrence_ids[$occurrence_id])) return $this->occurrence_ids[$occurrence_id];
      $o = new \eol_schema\Occurrence();
      $o->occurrenceID = $occurrence_id;
      $o->taxonID = $taxon->taxonID;
      $this->archive_builder->write_object_to_file($o);
      $this->occurrence_ids[$occurrence_id] = $o;
      return $o;
    }

    /**
     * Writes one Habitat measurement per matched term URI.
     *
     * @param mixed $taxon_id concept id without the 'EOL:' prefix; used for
     *                        the source link and the subject lookup
     * @param \eol_schema\Occurrence $occurrence occurrence the measurements belong to
     * @param array $uris term URI => matched source text
     */
    private function add_measurements($taxon_id, $occurrence, $uris)
    {
      foreach($uris as $uri => $source_text)
      {
        $m = new \eol_schema\MeasurementOrFact();
        $m->occurrenceID = $occurrence->occurrenceID;
        $m->measurementOfTaxon = 'true';
        $m->measurementType = 'http://eol.org/schema/terms/Habitat';
        $m->measurementValue = $uri;
        $m->measurementMethod = 'text mining';
        $m->contributor = '<a href="http://environments-eol.blogspot.com/2013/03/welcome-to-environments-eol-few-words.html">Environments-EOL</a>';
        $m->source = "http://eol.org/pages/$taxon_id/details#". $this->taxon_subjects[$taxon_id];
        $m->measurementRemarks = "source text: \"$source_text\"";
        $this->archive_builder->write_object_to_file($m);
      }
    }

    /**
     * For every taxon with more than one matched term, removes each term
     * that is listed as an ancestor of another matched term of that taxon.
     */
    private function filter_out_parent_classes()
    {
      foreach($this->taxon_ids_terms as $taxon_id => $uris)
      {
        if(count($uris) <= 1) continue;
        $filtered_uris = $uris;
        foreach($uris as $uri => $junk)
        {
          // Terms without any recorded parent have no entry at all.
          if(empty($this->envo_term_parents[$uri])) continue;
          foreach($this->envo_term_parents[$uri] as $parent_uri)
          {
            unset($filtered_uris[$parent_uri]);
          }
        }
        $this->taxon_ids_terms[$taxon_id] = $filtered_uris;
      }
    }

    /**
     * Parses the ENVO OWL file with regular expressions and fills
     * $this->envo_term_uris (class URI => label) and
     * $this->envo_term_parents (class URI => ancestors), then expands the
     * direct parents into the full list of ancestors.
     */
    private function prepare_envo_schema()
    {
      $envo_schema = file_get_contents(self::ENVO_OWL_URL);
      if($envo_schema === false)
      {
        // Continue with an empty schema, as before, but say why every term
        // is about to be reported as invalid.
        echo "Could not read " . self::ENVO_OWL_URL . "\n";
        $envo_schema = '';
      }
      $this->envo_term_uris = array();
      $this->envo_term_parents = array();
      if(preg_match_all("/\n <owl:Class rdf:about=\"(.*?)\"(.*?)\n <\/owl:Class>/ims", $envo_schema, $matches, PREG_SET_ORDER))
      {
        foreach($matches as $match)
        {
          $class_uri = $match[1];
          $class_xml = $match[2];
          if(preg_match("/<rdfs:label.*?>(.*?)<\/rdfs:label>/", $class_xml, $arr)) $this->envo_term_uris[$class_uri] = $arr[1];
          // subclass
          if(preg_match_all("/<rdfs:subClassOf rdf:resource=\"(.*?)\"\/>/ims", $class_xml, $arrs, PREG_SET_ORDER))
          {
            foreach($arrs as $arr) $this->envo_term_parents[$class_uri][] = $arr[1];
          }
          // part_of (restriction on property BFO_0000050)
          if(preg_match_all("/<owl:onProperty rdf:resource=\"http:\/\/purl.obolibrary.org\/obo\/BFO_0000050\"\/>\s+<owl:someValuesFrom rdf:resource=\"(.*?)\"\/>/ims", $class_xml, $arrs, PREG_SET_ORDER))
          {
            foreach($arrs as $arr) $this->envo_term_parents[$class_uri][] = $arr[1];
          }
        }
      }
      $this->processed_uri_parents = array();
      foreach(array_keys($this->envo_term_parents) as $uri) $this->add_parents_recursively($uri);
    }

    /**
     * Expands $this->envo_term_parents[$uri] to include the ancestors of
     * every parent, memoising the result in $this->processed_uri_parents.
     *
     * @param string $uri class URI
     * @return array ancestor URIs of $uri (empty when it has no recorded parents)
     */
    private function add_parents_recursively($uri)
    {
      if(isset($this->processed_uri_parents[$uri])) return $this->processed_uri_parents[$uri];
      // Mark $uri as in progress before descending. In an acyclic hierarchy
      // this placeholder is never read. If the relations contain a cycle it
      // stops the recursion instead of exhausting the stack, at the cost of
      // a possibly incomplete ancestor list for the terms on that cycle.
      $this->processed_uri_parents[$uri] = array();
      if(isset($this->envo_term_parents[$uri]))
      {
        foreach($this->envo_term_parents[$uri] as $parent_uri)
        {
          $parent_uris = $this->add_parents_recursively($parent_uri);
          $this->envo_term_parents[$uri] = array_unique(array_merge($this->envo_term_parents[$uri], $parent_uris));
        }
        $this->processed_uri_parents[$uri] = $this->envo_term_parents[$uri];
      }
      return $this->processed_uri_parents[$uri];
    }
  }
  221. ?>