PageRenderTime 47ms CodeModel.GetById 19ms RepoModel.GetById 0ms app.codeStats 0ms

/lib/connectors/CoralsOfTheWorldAPI.php

https://github.com/EOL/eol_php_code
PHP | 247 lines | 224 code | 13 blank | 10 comment | 32 complexity | 5f71e5b2f82deb7a7052c4ab7f627ed0 MD5 | raw file
  1. <?php
  2. namespace php_active_record;
  3. // connector: [corals]
  4. class CoralsOfTheWorldAPI
  5. {
  6. function __construct($folder)
  7. {
  8. $this->path_to_archive_directory = CONTENT_RESOURCE_LOCAL_PATH . '/' . $folder . '_working/';
  9. $this->archive_builder = new \eol_schema\ContentArchiveBuilder(array('directory_path' => $this->path_to_archive_directory));
  10. $this->taxon_ids = array();
  11. $this->object_ids = array();
  12. $this->domain = "http://coral.aims.gov.au";
  13. $this->species_list = $this->domain . "/info/factsheets.jsp";
  14. $this->download_options = array("download_wait_time" => 1000000, "timeout" => 1800, "download_attempts" => 1, "expire_seconds" => 5184000, "delay_in_minutes" => 1);
  15. $this->debug = array();
  16. }
  17. function get_all_taxa()
  18. {
  19. $taxa = self::get_taxa_list();
  20. $total = count($taxa);
  21. $i = 0;
  22. foreach($taxa as $taxon)
  23. {
  24. $i++;
  25. if(($i % 50) == 0) echo "\n $i of $total - " . $taxon["sciname"] . "\n";
  26. if($html = Functions::lookup_with_cache($this->domain . $taxon["source"], $this->download_options))
  27. {
  28. $rec = self::parse_html($html, $taxon);
  29. $taxon["authorship"] = $rec["authorship"];
  30. self::create_instances_from_taxon_object($taxon, $rec["texts"]);
  31. self::get_objects($taxon, $rec["images"], "image");
  32. $texts = self::arrange_texts($rec["texts"], $taxon["taxon_id"]);
  33. self::get_objects($taxon, $texts, "text");
  34. }
  35. // break; //debug
  36. }
  37. $this->archive_builder->finalize(TRUE);
  38. // print_r($this->debug);
  39. }
  40. private function arrange_texts($texts, $taxon_id)
  41. {
  42. $final = array();
  43. $texts["Taxonomic note"] = "";
  44. foreach($texts as $topic => $desc)
  45. {
  46. if($desc) $final[] = array("identifier" => $taxon_id . "_$topic", "description" => $desc, "subject" => self::get_subject($topic));
  47. }
  48. return $final;
  49. }
  50. private function get_subject($topic)
  51. {
  52. switch($topic)
  53. {
  54. case "Colour": return "http://rs.tdwg.org/ontology/voc/SPMInfoItems#Morphology";
  55. case "Habitat": return "http://rs.tdwg.org/ontology/voc/SPMInfoItems#Habitat";
  56. case "Abundance": return "http://rs.tdwg.org/ontology/voc/SPMInfoItems#PopulationBiology";
  57. case "Similar species": return "http://rs.tdwg.org/ontology/voc/SPMInfoItems#LookAlikes";
  58. case "GenDesc": return "http://rs.tdwg.org/ontology/voc/SPMInfoItems#TaxonBiology";
  59. }
  60. }
  61. private function parse_html($html, $taxon)
  62. {
  63. $rec = array();
  64. // for authorship
  65. if(preg_match("/<p class=\"surname\">(.*?)<\/p>/ims", $html, $arr)) $rec["authorship"] = trim($arr[1]);
  66. // for the different topics
  67. $texts = array();
  68. $topics = array("Colour", "Habitat", "Abundance", "Similar species", "Taxonomic note"); // e.g. Taxonomic note:
  69. foreach($topics as $topic)
  70. {
  71. if(preg_match("/<b>" . $topic . ":<\/b>(.*?)<\/p>/ims", $html, $arr)) $texts[$topic] = $arr[1];
  72. if($topic == "Similar species")
  73. {
  74. $desc = str_ireplace(' class="fullname" ', " ", $texts[$topic]);
  75. $desc = str_ireplace('href="/factsheet', 'href="' . $this->domain . '/factsheet', $desc);
  76. $desc = str_replace(array("\n", "\t", "\r", chr(9), chr(10), chr(13), " "), "", $desc);
  77. $texts[$topic] = trim($desc);
  78. }
  79. }
  80. // for the general description - enclosed by <p></p>
  81. if(preg_match("/<p class=\"surname\">(.*?)<b>Colour:<\/b>/ims", $html, $arr))
  82. {
  83. if(preg_match("/<p>(.*?)<\/p>/ims", $arr[1], $arr)) $texts["GenDesc"] = $arr[1];
  84. }
  85. $texts = array_map('trim', $texts);
  86. $rec["texts"] = $texts;
  87. // for images
  88. $images = array();
  89. if(preg_match_all("/<a class=\"fancybox\" rel=\"group\" (.*?)<\/a>/ims", $html, $arr))
  90. {
  91. foreach($arr[1] as $line)
  92. {
  93. $media_url = false;
  94. if(preg_match("/href=\"(.*?)\"/ims", $line, $arr2)) $media_url = $arr2[1];
  95. if(preg_match("/alt='(.*?)'/ims", $line, $arr2))
  96. {
  97. $caption = $arr2[1];
  98. $photographer = self::get_photographer($caption);
  99. $this->debug[$photographer] = '';
  100. }
  101. if($media_url)
  102. {
  103. $parts = pathinfo($media_url);
  104. $images[] = array("identifier" => $parts["filename"], "media_url" => $media_url, "description" => $caption, "photographer" => $photographer);
  105. }
  106. }
  107. }
  108. $rec["images"] = $images;
  109. return $rec;
  110. }
  111. private function get_photographer($string)
  112. {
  113. $photographer = "";
  114. $parts = explode(". ", $string);
  115. $parts = array_map('trim', $parts);
  116. foreach($parts as $part)
  117. {
  118. $words = explode(" ", $part);
  119. $cont = false;
  120. foreach($words as $word)
  121. {
  122. if(ctype_upper(substr($word,0,1))) $cont = true;
  123. else
  124. {
  125. $cont = false;
  126. break;
  127. }
  128. }
  129. if($cont) $photographer = trim($part);
  130. }
  131. // remove "." if last char in photographer
  132. if(substr($photographer, -1) == ".") $photographer = substr($photographer, 0, strlen($photographer)-1);
  133. // manual checking, this is needed bec. there is no clear distinction for photographer name
  134. $remove_if_this_exists_in_photographer = array("Australia", "Indonesia", "Japan", "Guam", "Philippines", "New ", "Oman", "Vietnam", "Ocean", " Sea", "Africa", "Micronesia", "Caribbean", "Kuwait", "Sri Lanka", " Islands", " Gulf", "Showing", "Surface", " USA", "Polynesia", "Vanuatu", "Madagascar", "Tanzania", "Palau", "Tahiti", "Fiji", "Mediterranean", "Hawaii", "Thailand", "Brazil", "Taiwan", "Mesenterina");
  135. foreach($remove_if_this_exists_in_photographer as $word)
  136. {
  137. if(is_numeric(stripos($photographer, $word)))
  138. {
  139. $photographer = "";
  140. break;
  141. }
  142. }
  143. return $photographer;
  144. }
  145. private function get_taxa_list()
  146. {
  147. $taxa = array();
  148. if($html = Functions::lookup_with_cache($this->species_list, $this->download_options))
  149. {
  150. if(preg_match_all("/<a class=\"fullname\"(.*?)<\/a>/ims", $html, $arr))
  151. {
  152. $rows = array_map('trim', $arr[1]);
  153. foreach($rows as $row)
  154. {
  155. if(preg_match("/speciesCode=(.*?)\"/ims", $row, $arr)) $id = $arr[1];
  156. if(preg_match("/\">(.*?)xxx/ims", $row."xxx", $arr)) $name = trim($arr[1]);
  157. if(preg_match("/href=\"(.*?)\"/ims", $row, $arr)) $source = $arr[1];
  158. $taxa[] = array("taxon_id" => $id, "sciname" => $name, "source" => $source);
  159. }
  160. }
  161. }
  162. return $taxa;
  163. }
  164. private function get_objects($taxon, $records, $type)
  165. {
  166. foreach($records as $rec)
  167. {
  168. $mr = new \eol_schema\MediaResource();
  169. if($type == "text")
  170. {
  171. $mr->type = 'http://purl.org/dc/dcmitype/Text';
  172. $mr->format = 'text/html';
  173. $mr->CVterm = $rec["subject"];
  174. $mr->bibliographicCitation = "Australian Institute of Marine Science, (" . date("Y") . "). AIMS Coral Fact Sheets - " . $taxon["sciname"] .
  175. ". Viewed " . date("d M Y") . ". http://coral.aims.gov.au/factsheet.jsp?speciesCode=" . $taxon["taxon_id"];
  176. }
  177. elseif($type == "image")
  178. {
  179. $mr->type = 'http://purl.org/dc/dcmitype/StillImage';
  180. $mr->format = Functions::get_mimetype($rec["media_url"]);
  181. $mr->accessURI = $rec["media_url"];
  182. $mr->title = "";
  183. }
  184. $mr->taxonID = $taxon["taxon_id"];
  185. $mr->identifier = $rec["identifier"];
  186. $mr->language = 'en';
  187. $mr->furtherInformationURL = $this->domain . $taxon["source"];
  188. $mr->description = $rec["description"];
  189. $mr->UsageTerms = 'http://creativecommons.org/licenses/by-nc/3.0/';
  190. $mr->Owner = @$rec["photographer"] ? $rec["photographer"] : "Australian Institute of Marine Science";
  191. if($val = @$rec["photographer"])
  192. {
  193. $agent_ids = self::create_agent($val);
  194. if($agent_ids) $mr->agentID = implode("; ", $agent_ids);
  195. }
  196. if(!isset($this->object_ids[$mr->identifier]))
  197. {
  198. $this->object_ids[$mr->identifier] = 1;
  199. $this->archive_builder->write_object_to_file($mr);
  200. }
  201. }
  202. }
  203. private function create_agent($agent)
  204. {
  205. $agent_ids = array();
  206. $r = new \eol_schema\Agent();
  207. $r->term_name = $agent;
  208. $r->agentRole = 'photographer';
  209. $r->identifier = md5("$agent|" . $r->agentRole);
  210. // $r->term_homepage = '';
  211. $agent_ids[] = $r->identifier;
  212. if(!isset($this->resource_agent_ids[$r->identifier]))
  213. {
  214. $this->resource_agent_ids[$r->identifier] = '';
  215. $this->archive_builder->write_object_to_file($r);
  216. }
  217. return $agent_ids;
  218. }
  219. private function create_instances_from_taxon_object($rec, $texts)
  220. {
  221. $taxon = new \eol_schema\Taxon();
  222. $taxon->taxonID = $rec["taxon_id"];
  223. $taxon->scientificName = $rec["sciname"];
  224. $taxon->scientificNameAuthorship = @$rec["authorship"];
  225. $taxon->furtherInformationURL = $this->domain . $rec["source"];
  226. if($val = @$texts["Taxonomic note"]) $taxon->taxonRemarks = $val;
  227. if(!isset($this->taxon_ids[$taxon->taxonID]))
  228. {
  229. $this->taxon_ids[$taxon->taxonID] = 1;
  230. $this->archive_builder->write_object_to_file($taxon);
  231. }
  232. }
  233. }
  234. ?>