PageRenderTime 51ms CodeModel.GetById 24ms RepoModel.GetById 0ms app.codeStats 0ms

/lib/connectors/DipteraCentralAmericaAPI.php

https://github.com/EOL/eol_php_code
PHP | 310 lines | 256 code | 14 blank | 40 comment | 41 complexity | 19f817e2c35ff2f0a94fd0c6113993ca MD5 | raw file
  1. <?php
  2. namespace php_active_record;
  3. // connector: [683] formerly 661
  4. class DipteraCentralAmericaAPI
  5. {
  6. function __construct($folder)
  7. {
  8. $this->domain = "http://www.phorid.net/diptera/";
  9. $this->taxa_list_url = $this->domain . "diptera_index.html";
  10. $this->taxa = array();
  11. $this->path_to_archive_directory = CONTENT_RESOURCE_LOCAL_PATH . '/' . $folder . '_working/';
  12. $this->archive_builder = new \eol_schema\ContentArchiveBuilder(array('directory_path' => $this->path_to_archive_directory));
  13. $this->resource_reference_ids = array();
  14. $this->do_ids = array();
  15. $this->download_options = array('download_wait_time' => 2000000, 'timeout' => 1200, 'download_attempts' => 2, 'delay_in_minutes' => 2);
  16. }
  17. function get_all_taxa()
  18. {
  19. if($records = self::parse_html())
  20. {
  21. $i = 0;
  22. $total = count($records);
  23. echo "\n total records: $total";
  24. foreach($records as $taxon => $rec)
  25. {
  26. $i++;
  27. echo "\n $i of $total: " . $taxon;
  28. if(isset($rec[0]["image"])) self::prepare_images($taxon, $rec);
  29. else $this->create_instances_from_taxon_object($taxon, $rec, array());
  30. }
  31. $this->create_archive();
  32. }
  33. }
  34. private function prepare_object_refs($connections)
  35. {
  36. $reference_ids = array();
  37. $string = "";
  38. foreach($connections as $conn)
  39. {
  40. if($conn["title"] == "Selected References") $string = $conn["desc"];
  41. }
  42. if(preg_match_all("/<li>(.*?)<\/li>/ims", $string, $arr))
  43. {
  44. $refs = $arr[1];
  45. foreach($refs as $ref)
  46. {
  47. $ref = (string) trim($ref);
  48. if(!$ref) continue;
  49. $r = new \eol_schema\Reference();
  50. $r->full_reference = $ref;
  51. $r->identifier = md5($ref);
  52. $reference_ids[] = $r->identifier;
  53. if(!in_array($r->identifier, $this->resource_reference_ids))
  54. {
  55. $this->resource_reference_ids[] = $r->identifier;
  56. $this->archive_builder->write_object_to_file($r);
  57. }
  58. }
  59. }
  60. return $reference_ids;
  61. }
  62. private function prepare_images($taxon, $images)
  63. {
  64. $reference_ids = array();
  65. $ref_ids = array();
  66. $agent_ids = array();
  67. foreach($images as $rec)
  68. {
  69. echo "\n - " . $taxon . " - " . $rec['url'];
  70. $media_url = $rec["image"];
  71. echo "\n media url: " . $media_url . "\n\n";
  72. $path_parts = pathinfo($rec["image"]);
  73. $identifier = (string) $rec["taxon_id"] . "_" . str_replace(" ", "_", $path_parts["basename"]);
  74. if(in_array($identifier, $this->do_ids)) continue;
  75. else $this->do_ids[] = $identifier;
  76. $mr = new \eol_schema\MediaResource();
  77. if($reference_ids) $mr->referenceID = implode("; ", $reference_ids);
  78. if($agent_ids) $mr->agentID = implode("; ", $agent_ids);
  79. $mr->taxonID = (string) $rec["taxon_id"];
  80. $mr->identifier = $identifier;
  81. $mr->type = "http://purl.org/dc/dcmitype/StillImage";
  82. $mr->language = 'en';
  83. $mr->format = (string) Functions::get_mimetype($media_url);
  84. $mr->furtherInformationURL = (string) $rec['url'];
  85. $mr->accessURI = (string) $media_url;
  86. $mr->Owner = "";
  87. $mr->UsageTerms = "http://creativecommons.org/licenses/by-nc-sa/3.0/";
  88. $mr->description = (string) $rec["caption"];
  89. $this->archive_builder->write_object_to_file($mr);
  90. $this->create_instances_from_taxon_object($taxon, $rec, $reference_ids);
  91. }
  92. }
  93. private function parse_html()
  94. {
  95. $records = array();
  96. if($html = Functions::lookup_with_cache($this->taxa_list_url, $this->download_options))
  97. {
  98. $html = str_ireplace(array(' width="150"', ' align="left"', ' width="300"'), "", $html);
  99. if(preg_match_all("/<p class=\"FamilyNames\">(.*?)<\/div>/ims", $html, $arr))
  100. {
  101. $i = 0;
  102. foreach($arr[1] as $block)
  103. {
  104. $i++;
  105. // if($i != 3) continue; //debug -- to select which block to process, e.g. choosing "Lower Cyclorrhapha families:"
  106. if(preg_match("/(.*?)\:/ims", $block, $match)) $group_name = trim($match[1]);
  107. if(preg_match_all("/<td>(.*?)<\/td>/ims", $block, $match))
  108. {
  109. foreach($match[1] as $line)
  110. {
  111. $taxon_name = "";
  112. $url = "";
  113. if(is_numeric(stripos($line, "href=")))
  114. {
  115. if(preg_match("/>(.*?)</ims", $line, $match)) $taxon_name = trim($match[1]);
  116. if(preg_match("/\"(.*?)\"/ims", $line, $match)) $url = trim($match[1]);
  117. }
  118. else $taxon_name = $line;
  119. if($taxon_name != "&nbsp;")
  120. {
  121. if($url) $records[$taxon_name]["url"] = $this->domain . $url;
  122. $records[$taxon_name]["rank"] = "family";
  123. $records[$taxon_name]["taxon_id"] = self::get_taxon_id($taxon_name);
  124. }
  125. }
  126. }
  127. }
  128. }
  129. }
  130. else
  131. {
  132. echo ("\n Problem with the remote file: $this->taxa_list_url");
  133. return false;
  134. }
  135. $records = self::get_genera($records);
  136. return $records;
  137. }
  138. private function get_taxon_id($name)
  139. {
  140. if(is_numeric(stripos($name, " sp"))) return str_ireplace(" ", "_", $name);
  141. else return str_ireplace(" ", "_", Functions::canonical_form($name));
  142. }
  143. private function get_genera($records)
  144. {
  145. $i = 0; $total = count($records);
  146. echo "\n cumulative total records: $total";
  147. $image_records = array();
  148. foreach($records as $taxon => $info)
  149. {
  150. $i++;
  151. echo "\n $i of $total: " . $taxon . "\n";
  152. // if($i != 4) continue; //debug --- to select which family to process, e.g. choosing "Phoridae" under "Lower Cyclorrhapha families:"
  153. if($url = @$info["url"])
  154. {
  155. if($html = Functions::lookup_with_cache($url, $this->download_options))
  156. {
  157. //manual adjustment
  158. $html = str_ireplace("Microdon Megacephalus", "Microdon megacephalus", $html);
  159. $image_records = array_merge($image_records, self::get_images_from_genera_list_page($html, $url, $taxon));
  160. /*
  161. <div class="DipteraGenera">
  162. <p><em>Amphicnephes</em> Loew </p>
  163. <p><em>Rivellia</em> Robineau-Desvoidy </p>
  164. <p><em>Senopterina</em> Macquart</p>
  165. </div>
  166. */
  167. if(preg_match("/<div class=\"DipteraGenera\">(.*?)<\/div>/ims", $html, $match))
  168. {
  169. if(preg_match_all("/<p>(.*?)<\/p>/ims", $match[1], $matches))
  170. {
  171. $k = 0;
  172. foreach($matches[1] as $genera)
  173. {
  174. // start getting images per genera
  175. $k++;
  176. // if($k != 1) continue; //debug -- to select what row, which genera to get image from
  177. if(preg_match("/openBrWindow\(\'(.*?)\'/ims", $genera, $arr))
  178. {
  179. $image_page_url = $arr[1];
  180. $path_parts = pathinfo($url);
  181. $image_page_url = $path_parts["dirname"] . "/" . $image_page_url;
  182. echo("\n image_page_url: [$image_page_url] \n ");
  183. if($popup_page = Functions::lookup_with_cache($image_page_url, $this->download_options))
  184. {
  185. $records = self::scrape_image_info($popup_page, $records, $image_page_url, $taxon);
  186. }
  187. }
  188. // start getting each genera name
  189. $genera = trim(strip_tags($genera));
  190. if(!preg_match("/(Undescribed|undet)/i", $genera))
  191. {
  192. $records[$genera]["url"] = $url;
  193. $records[$genera]["rank"] = "genus";
  194. $records[$genera]["family"] = $taxon;
  195. $records[$genera]["taxon_id"] = self::get_taxon_id($genera);
  196. }
  197. }
  198. }
  199. else echo "\n\n alert: investigate 01 - no genera list detected: $url \n\n";
  200. }
  201. }
  202. }
  203. // if($i >= 1) break; //debug -- limit the no. of families
  204. }
  205. $records = array_merge($records, $image_records);
  206. return $records;
  207. }
  208. private function get_images_from_genera_list_page($html, $url, $family)
  209. {
  210. /*
  211. <div class="DipteraImage">
  212. <img src="tabanidae_image1.jpg" width="400" height="282" alt="Tabanus albocirculas" />
  213. <p class="PhotoLabels"><em>Tabanus albocirculas</em> Hine 1907, Costa Rica: La Selva Biological Station</p>
  214. <p class="PhotoLabels">&nbsp;</p>
  215. <img src="tabanidae_image2.jpg" width="400" height="304" alt="Chlorotabanus mexicanus" />
  216. <p class="PhotoLabels"><em>Chlorotabanus mexicanus</em> (Linnaeus 1758), Costa Rica: 29 km W Tortuguero</p>
  217. </div>
  218. <div class="DipteraImage"><img src="ptychopteridae_image.jpg" width="400" height="293" alt="Ptychoptera townesi" />
  219. <span class="PhotoLabels"><em>Ptychoptera townesi</em> Alexander 1943, USA: California: 4mi SW Stirling City</span>
  220. </div>
  221. */
  222. /*
  223. <div class="DipteraImage"><img src="pseudopomyzidae_image.jpg" width="400" height="278" alt="undet. Pseudopomyzidae" />
  224. <p class="PhotoLabels">undet. Pseudopomyzidae, Costa Rica: Albergue de Heliconia</p>
  225. </div>
  226. */
  227. /*
  228. <div class="DipteraImage">
  229. <img src="syrphidae_image1.jpg" width="400" height="366" alt="Microdon megacephalus" /><span class="PhotoLabels"><em>Microdon Megacephalus</em>
  230. Shannon 1929, Costa Rica: Santa Rosa NP</span>
  231. <p>&nbsp;</p>
  232. <img src="syrphidae_image2.jpg" width="400" height="314" alt="Ornidia obesa" /><span class="PhotoLabels"><em>Ornidia obesa</em> (Fabricius 1775),
  233. Mexico: hills west of Fortin de las Flores </span></div>
  234. */
  235. $records = array();
  236. if(preg_match("/<div class=\"DipteraImage\">(.*?)<\/div>/ims", $html, $match)) $records = self::scrape_image_info($match[1], $records, $url, $family);
  237. return $records;
  238. }
  239. private function scrape_image_info($match, $records, $url, $family)
  240. {
  241. $match = str_ireplace("<p>&nbsp;</p>", "", $match);
  242. if(preg_match_all("/<img src=(.*?)<\/p>/ims", $match, $matches) || preg_match_all("/<img src=(.*?)<\/span>/ims", $match, $matches))
  243. {
  244. foreach($matches[1] as $line)
  245. {
  246. $image = "";
  247. $taxon = "";
  248. $caption = "";
  249. $rank = "";
  250. if(preg_match("/\"(.*?)\"/ims", $line, $match))
  251. {
  252. $image = $match[1];
  253. $path_parts = pathinfo($url);
  254. $image = $path_parts["dirname"] . "/" . $image;
  255. }
  256. $line .= "xxx";
  257. if(preg_match("/class=\"PhotoLabels\">(.*?)xxx/ims", $line, $match))
  258. {
  259. $caption = trim(strip_tags($match[1], "<em><i>"));
  260. $caption = str_ireplace(array("\n", "\r", "&nbsp;"), " ", $caption);
  261. $taxon = explode(",", $caption);
  262. $taxon = strip_tags($taxon[0]);
  263. $taxon = trim(str_ireplace(array("undet."), "", $taxon));
  264. }
  265. if($taxon == $family)
  266. {
  267. $family = "";
  268. $rank = "family";
  269. }
  270. $records[$taxon][] = array("url" => $url, "rank" => $rank, "family" => $family, "image" => $image, "caption" => $caption, "taxon_id" => self::get_taxon_id($taxon));
  271. }
  272. }
  273. return $records;
  274. }
  275. function create_instances_from_taxon_object($sciname, $rec, $reference_ids)
  276. {
  277. $taxon = new \eol_schema\Taxon();
  278. if($reference_ids) $taxon->referenceID = implode("; ", $reference_ids);
  279. $taxon->taxonID = $rec["taxon_id"];
  280. $taxonRemarks = "";
  281. $taxon->scientificName = (string) $sciname;
  282. $taxon->family = (string) @$rec['family'];
  283. $taxon->taxonRank = (string) $rec['rank'];
  284. $taxon->furtherInformationURL = (string) @$rec['url']; // e.g. some families are not hyperlinked
  285. $this->taxa[$rec["taxon_id"]] = $taxon;
  286. }
  287. function create_archive()
  288. {
  289. foreach($this->taxa as $t)
  290. {
  291. $this->archive_builder->write_object_to_file($t);
  292. }
  293. $this->archive_builder->finalize(true);
  294. }
  295. }
  296. ?>