PageRenderTime 26ms CodeModel.GetById 16ms RepoModel.GetById 0ms app.codeStats 0ms

/lib/connectors/FemoraleAPI.php

https://github.com/EOL/eol_php_code
PHP | 222 lines | 187 code | 11 blank | 24 comment | 20 complexity | 0ca6959587c5326df8a3fcf97669af06 MD5 | raw file
  1. <?php
  2. namespace php_active_record;
  3. /* connector [793]: at this point this is a one-time export.
  4. The partner gave us four spreadsheets. There is structured data (body length) and there are images; we scrape the mediaURLs from the site.
  5. */
  6. class FemoraleAPI
  7. {
  8. function __construct($folder = null)
  9. {
  10. $this->path_to_archive_directory = CONTENT_RESOURCE_LOCAL_PATH . '/' . $folder . '_working/';
  11. $this->archive_builder = new \eol_schema\ContentArchiveBuilder(array('directory_path' => $this->path_to_archive_directory));
  12. $this->taxon_ids = array();
  13. $this->occurrence_ids = array();
  14. $this->measurement_ids = array();
  15. $this->object_ids = array();
  16. $this->download_options = array('download_wait_time' => 500000, 'timeout' => 10800, 'download_attempts' => 1);
  17. // $this->download_options['expire_seconds'] = false;
  18. $this->url_path = "http://localhost/~eolit/cp/Femorale/";
  19. $this->url_path = "https://dl.dropboxusercontent.com/u/7597512/Femorale/";
  20. $this->images_path = "http://www.femorale.com/shellphotos/detmore.asp?&localidade=&url=";
  21. $this->spreadsheets = array();
  22. $this->spreadsheets[] = "Encyclopedia_Of_Life_Other.xls"; // 365 KB
  23. $this->spreadsheets[] = "Encyclopedia_Of_Life_Bivalvia.xls"; // 1.0 MB
  24. $this->spreadsheets[] = "Encyclopedia_Of_Life_Land.xls"; // 1.9 MB
  25. $this->spreadsheets[] = "Encyclopedia_Of_Life_Gastropoda.xls"; // 4.6 MB
  26. // $this->spreadsheets[] = "Encyclopedia_Of_Life_Bivalvia_small.xls";
  27. }
  28. function get_all_taxa()
  29. {
  30. require_library('XLSParser');
  31. $docs = count($this->spreadsheets);
  32. $doc_count = 0;
  33. foreach($this->spreadsheets as $doc)
  34. {
  35. $doc_count++;
  36. echo "\n processing [$doc]...\n";
  37. if($path = Functions::save_remote_file_to_local($this->url_path . $doc, array("cache" => 1, "timeout" => 3600, "file_extension" => "xls", 'download_attempts' => 2, 'delay_in_minutes' => 2)))
  38. {
  39. $parser = new XLSParser();
  40. $arr = $parser->convert_sheet_to_array($path);
  41. $fields = array_keys($arr);
  42. $i = -1;
  43. $rows = count($arr["Species"]);
  44. echo "\n total $path: $rows \n";
  45. foreach($arr["Species"] as $Species)
  46. {
  47. $i++;
  48. $rec = array();
  49. foreach($fields as $field) $rec[$field] = $arr[$field][$i];
  50. $rec = array_map('trim', $rec);
  51. /* breakdown when caching
  52. $cont = false;
  53. // if($i >= 1 && $i < 6000) $cont = true;
  54. // if($i >= 3000 && $i < 6000) $cont = true;
  55. // if($i >= 6000 && $i < 9000) $cont = true;
  56. // if($i >= 9000 && $i < 12000) $cont = true;
  57. // if($i >= 11800 && $i < 15000) $cont = true;
  58. if(!$cont) continue;
  59. */
  60. print "\n [$doc_count of $docs][" . ($i+1) . " of $rows] " . $rec["Species"] . "\n";
  61. $rec = self::clean_taxon_name($rec);
  62. $taxon_id = trim(preg_replace('/\s*\([^)]*\)/', '', $rec["sciname"])); // remove parenthesis
  63. $taxon_id = str_replace(" ", "_", $taxon_id);
  64. $rec["taxon_id"] = md5($taxon_id);
  65. self::create_instances_from_taxon_object($rec);
  66. self::prepare_images($rec);
  67. self::prepare_data($rec);
  68. }
  69. unlink($path);
  70. }
  71. else echo "\n [$doc] unavailable! \n";
  72. }
  73. $this->archive_builder->finalize(TRUE);
  74. }
  75. private function prepare_data($rec)
  76. {
  77. $rec["object_id"] = "size";
  78. $val = trim(str_replace(array("mm", " up"), "", $rec["Size"]));
  79. if($val) self::add_string_types($rec, "size", $val, "http://purl.obolibrary.org/obo/CMO_0000013", "true");
  80. // commented for now
  81. // $rec["object_id"] = "locality";
  82. // self::add_string_types($rec, "locality", $rec["Locality"], "http://rs.tdwg.org/dwc/terms/locality", "false");
  83. }
  84. private function prepare_images($rec)
  85. {
  86. if($mediaURLs = self::get_image_urls($rec))
  87. {
  88. print "\n images: " . count($mediaURLs) . "\n";
  89. foreach($mediaURLs as $mediaURL)
  90. {
  91. /* not used for now
  92. $desc = "";
  93. if($val = $rec["Locality"]) $desc .= "Locality: " . $val . "<br>";
  94. if($val = $rec["Size"]) $desc .= "Size: " . $val . "<br>";
  95. if($val = $rec["Book"]) $desc .= "Book: " . $val . "<br>";
  96. if($val = $rec["Synonym"]) $desc .= "Synonym: " . $val . "<br>";
  97. */
  98. $mr = new \eol_schema\MediaResource();
  99. $mr->taxonID = $rec["taxon_id"];
  100. $mr->identifier = md5($mediaURL);
  101. $mr->type = "http://purl.org/dc/dcmitype/StillImage";
  102. $mr->format = Functions::get_mimetype($mediaURL);
  103. $mr->Owner = "Femorale";
  104. $mr->UsageTerms = "http://creativecommons.org/licenses/by-nc/3.0/";
  105. $mr->accessURI = $mediaURL;
  106. $mr->furtherInformationURL = str_replace(" ", "%20", $rec["Expr1"]);
  107. if(!isset($this->object_ids[$mr->identifier]))
  108. {
  109. $this->archive_builder->write_object_to_file($mr);
  110. $this->object_ids[$mr->identifier] = '';
  111. }
  112. }
  113. }
  114. }
  115. private function get_image_urls($rec)
  116. {
  117. $mediaURLs = array();
  118. $url = $this->images_path . "&species=" . $rec["Species"] . "&navi=";
  119. if($html = Functions::lookup_with_cache($url . "1", $this->download_options))
  120. {
  121. $navi = 1;
  122. if(preg_match("/>1 of (.*?)<\/font/ims", $html, $arr)) $navi = trim($arr[1]);
  123. for($i=1; $i<=$navi; $i++)
  124. {
  125. if($i == 1)
  126. {
  127. if(preg_match_all("/src=\"(.*?)\"/ims", $html, $arr)) $mediaURLs = array_merge($mediaURLs, $arr[1]);
  128. }
  129. else
  130. {
  131. if($html = Functions::lookup_with_cache($url . $i, $this->download_options))
  132. {
  133. if(preg_match_all("/src=\"(.*?)\"/ims", $html, $arr)) $mediaURLs = array_merge($mediaURLs, $arr[1]);
  134. }
  135. }
  136. }
  137. }
  138. return $mediaURLs;
  139. }
  140. private function create_instances_from_taxon_object($rec)
  141. {
  142. $taxon = new \eol_schema\Taxon();
  143. $taxon->taxonID = $rec["taxon_id"];
  144. $taxon->scientificName = $rec["sciname"];
  145. $taxon->taxonRank = $rec["rank"];
  146. $taxon->family = ucfirst(strtolower($rec["Family"]));
  147. if(!isset($this->taxon_ids[$taxon->taxonID]))
  148. {
  149. $this->archive_builder->write_object_to_file($taxon);
  150. $this->taxon_ids[$taxon->taxonID] = '';
  151. }
  152. }
  153. private function clean_taxon_name($rec)
  154. {
  155. $strings = array(" sp ", " sp.");
  156. $found = false;
  157. foreach($strings as $string)
  158. {
  159. if(is_numeric(stripos($rec["Species"], $string))) $found = true;
  160. }
  161. if($found)
  162. {
  163. $rec["sciname"] = Functions::canonical_form($rec["Species"]);
  164. $rec["rank"] = "genus";
  165. }
  166. else
  167. {
  168. $rec["sciname"] = $rec["Species"];
  169. $rec["rank"] = "species";
  170. }
  171. return $rec;
  172. }
  173. private function add_string_types($rec, $label, $value, $measurementType, $measurementOfTaxon)
  174. {
  175. $taxon_id = $rec["taxon_id"];
  176. $object_id = $rec["object_id"];
  177. $m = new \eol_schema\MeasurementOrFact();
  178. $occurrence = $this->add_occurrence($taxon_id, $object_id);
  179. $m->occurrenceID = $occurrence->occurrenceID;
  180. $m->measurementOfTaxon = $measurementOfTaxon;
  181. if($label == "size")
  182. {
  183. $m->source = str_replace(" ", "%20", $rec["Expr1"]);
  184. $m->source = str_replace(",", "%2C", $m->source);
  185. $m->source = str_replace("(", "%28", $m->source);
  186. $m->source = str_replace(")", "%29", $m->source);
  187. $m->measurementUnit = "http://purl.obolibrary.org/obo/UO_0000016"; //mm - millimeter
  188. $m->measurementRemarks = "maximum shell dimension";
  189. }
  190. $m->measurementType = $measurementType;
  191. $m->measurementValue = $value;
  192. $m->statisticalMethod = "http://www.ebi.ac.uk/efo/EFO_0001444";
  193. if(!isset($this->measurement_ids[$m->occurrenceID]))
  194. {
  195. $this->archive_builder->write_object_to_file($m);
  196. $this->measurement_ids[$m->occurrenceID] = '';
  197. }
  198. }
  199. private function add_occurrence($taxon_id, $object_id)
  200. {
  201. $occurrence_id = $taxon_id . '_' . $object_id;
  202. if(isset($this->occurrence_ids[$occurrence_id])) return $this->occurrence_ids[$occurrence_id];
  203. $o = new \eol_schema\Occurrence();
  204. $o->occurrenceID = $occurrence_id;
  205. $o->taxonID = $taxon_id;
  206. $this->archive_builder->write_object_to_file($o);
  207. $this->occurrence_ids[$occurrence_id] = $o;
  208. return $o;
  209. }
  210. }
  211. ?>