PageRenderTime 46ms CodeModel.GetById 14ms RepoModel.GetById 1ms app.codeStats 0ms

/lib/connectors/FEISDataAPI.php

https://github.com/EOL/eol_php_code
PHP | 331 lines | 264 code | 35 blank | 32 comment | 38 complexity | 4d11fc9c151a4302e217c481ec4a8b95 MD5 | raw file
  1. <?php
  2. namespace php_active_record;
  3. /* connector: [feis]
  4. This connector screen-scrapes data from the individual taxon pages on the FEIS website.
  5. */
  6. class FEISDataAPI
  7. {
  8. function __construct($folder)
  9. {
  10. $this->path_to_archive_directory = CONTENT_RESOURCE_LOCAL_PATH . '/' . $folder . '_working/';
  11. $this->archive_builder = new \eol_schema\ContentArchiveBuilder(array('directory_path' => $this->path_to_archive_directory));
  12. $this->download_options = array('resource_id' => 'FEIS', 'expire_seconds' => false, 'download_wait_time' => 1000000, 'timeout' => 10800, 'download_attempts' => 1);
  13. $this->pages['mappings'] = 'dropbox path';
  14. $this->pages['mappings'] = 'http://localhost/cp/FEIS/Traitbank_resource/fireeffects habitat terms.xlsx';
  15. $this->debug = array();
  16. }
  17. function get_all_taxa()
  18. {
  19. require_library('connectors/USDAfsfeisAPI');
  20. $resource_id = false;
  21. $group["Plantae"] = "plants";
  22. $func = new USDAfsfeisAPI($resource_id, $group);
  23. $records = $func->prepare_taxa_urls();
  24. /*
  25. [taxonID] => WISFLO
  26. [url] => http://www.fs.fed.us/database/feis/plants/vine/wisspp/all.html
  27. [sciname] => Wisteria floribunda
  28. [vernacular] => Japanese wisteria
  29. [kingdom] => Plantae
  30. */
  31. $info = self::get_spreadsheet($this->pages['mappings']);
  32. $subsections = $info['subsections'];
  33. $habitats = $info['habitats'];
  34. foreach($records as $record)
  35. {
  36. // $record['url'] = 'http://www.fs.fed.us/database/feis/plants/tree/robpse/all.html'; //debug
  37. // $record['url'] = 'http://www.fs.fed.us/database/feis/plants/cactus/echfen/all.html'; //debug
  38. // $record['url'] = 'http://www.fs.fed.us/database/feis/plants/tree/alnrho/all.html'; //debug
  39. // $record['url'] = 'http://www.fs.fed.us/database/feis/plants/fern/botspp/all.html'; //debug
  40. // $record['url'] = 'http://www.fs.fed.us/database/feis/plants/forb/corvar/all.html'; //debug
  41. $rec = self::process_page($record['url']);
  42. $rec['taxon_id'] = $record['taxonID'];
  43. $rec['kingdom'] = $record['kingdom'];
  44. $rec['sciname'] = $record['sciname'];
  45. if(@$rec['life_form'] || @$rec['habitat'])
  46. {
  47. self::create_archive($rec, $habitats, $subsections);
  48. // print_r($rec);
  49. }
  50. // break; //debug
  51. }
  52. $this->archive_builder->finalize(TRUE);
  53. if($val = $this->debug) print_r($val);
  54. }
  55. private function process_page($url)
  56. {
  57. $rec = self::parse_html($url);
  58. return $rec;
  59. }
  60. private function parse_html($url)
  61. {
  62. $final = array();
  63. if($html = Functions::lookup_with_cache($url, $this->download_options))
  64. {
  65. $html = str_ireplace("APPENDIX: FIRE REGEIME TABLE", "APPENDIX: FIRE REGIME TABLE", $html);
  66. $final['source'] = $url;
  67. $final['life_form'] = self::get_Raunkiaer_life_form($html, $url);
  68. $final['authorship_citation'] = self::get_authorship_citation($html);
  69. if(preg_match("/<a name=\"AppendixFireRegimeTable\"(.*?)<a name=\"AppendixB\">/ims", $html, $arr) ||
  70. preg_match("/<a name='AppendixFireRegimeTable'(.*?)<a name='AppendixB'>/ims", $html, $arr) ||
  71. preg_match("/<a name='APPENDIX: FIRE REGIME TABLE'(.*?)<a name='REFERENCES'>/ims", $html, $arr) ||
  72. preg_match("/<a name=\"APPENDIX: FIRE REGIME TABLE\"(.*?)<a name=\"REFERENCES\">/ims", $html, $arr) ||
  73. preg_match("/<a name=\"APPENDIX: FIRE REGIME TABLE\"(.*?)<a name='REFERENCES'>/ims", $html, $arr) ||
  74. preg_match("/<a name=\"AppendixFireRegimeTable\"(.*?)<a name='REFERENCES'>/ims", $html, $arr) ||
  75. preg_match("/<a name='AppendixFireRegimeTable'(.*?)<a name='REFERENCES'>/ims", $html, $arr)
  76. )
  77. {
  78. if(preg_match_all("/<tr>(.*?)<\/tr>/ims", $arr[1], $arr2))
  79. {
  80. $TRs = $arr2[1];
  81. $i = 0;
  82. foreach($TRs as $tr)
  83. {
  84. $i++;
  85. if($i == 1) continue; //exclude first <tr>
  86. if(preg_match_all("/<td(.*?)<\/td>/ims", $tr, $arr3))
  87. {
  88. $temp = $arr3[1];
  89. $exclude = array(">Vegetation Community", ">Percent of fires", ">Surface or low", ">Mixed<", "vegetation communities");
  90. if(self::needle_occurs_in_this_haystack($temp[0]."<", $exclude)) continue;
  91. if(count($temp) == 1) $index = self::clean_html(strip_tags("<td" . $temp[0]));
  92. else
  93. {
  94. if(isset($index))
  95. {
  96. if($to_be_added = self::get_term_to_be_added($temp[0]))
  97. {
  98. /* // a good way to catch/debug
  99. if($to_be_added == "Pacific Northwest")
  100. {
  101. print_r($temp);
  102. echo "\nindex[$index]\n";
  103. }
  104. */
  105. if(isset($final['habitat'][$index]))
  106. {
  107. if(!in_array($to_be_added, @$final['habitat'][$index])) @$final['habitat'][$index][] = $to_be_added;
  108. }
  109. else @$final['habitat'][$index][] = $to_be_added;
  110. }
  111. }
  112. }
  113. }
  114. }
  115. }
  116. // else echo "\n No <tr>s\n";
  117. }
  118. // else echo "\nAPPENDIX: FIRE REGIME TABLE not found\n";
  119. }
  120. return $final;
  121. }
  122. private function get_term_to_be_added($str)
  123. {
  124. if(stripos($str, "<table") === false)
  125. {
  126. return self::clean_html(strip_tags("<td" . $str));
  127. }
  128. else return false;
  129. }
  130. private function get_Raunkiaer_life_form($html, $url)
  131. {
  132. $final = array();
  133. if(preg_match("/<a name=\"Raunkiaer life form\">(.*?)<span/ims", $html, $arr))
  134. {
  135. $html = strip_tags($arr[1], "<a>");
  136. if(preg_match_all("/<a href(.*?)<\/a>/ims", $html, $arr))
  137. {
  138. foreach($arr[1] as $t)
  139. {
  140. if(preg_match("/>(.*?)xxx/ims", $t."xxx", $arr2))
  141. {
  142. if(!is_numeric($arr2[1]))
  143. {
  144. $final[] = $arr2[1];
  145. }
  146. }
  147. }
  148. }
  149. }
  150. return $final;
  151. }
  152. private function get_authorship_citation($html)
  153. {
  154. if(preg_match("/AUTHORSHIP AND CITATION:(.*?)\[<script/ims", $html, $arr))
  155. {
  156. $temp = self::clean_html(strip_tags($arr[1]));
  157. $temp .= " [" . date("Y, F d") . "].";
  158. return $temp;
  159. }
  160. return false;
  161. }
  162. private function needle_occurs_in_this_haystack($needle, $haystack)
  163. {
  164. foreach($haystack as $phrase)
  165. {
  166. if(is_numeric(stripos($needle, $phrase))) return true;
  167. }
  168. return false;
  169. }
  170. private function create_archive($rec, $habitats, $subsections)
  171. {
  172. $taxon = new \eol_schema\Taxon();
  173. $taxon->taxonID = $rec['taxon_id'];
  174. $taxon->scientificName = self::format_utf8($rec['sciname']);
  175. $taxon->kingdom = $rec['kingdom'];
  176. $taxon->furtherInformationURL = $rec['source'];
  177. if(!isset($this->taxon_ids[$taxon->taxonID]))
  178. {
  179. $this->taxon_ids[$taxon->taxonID] = '';
  180. $this->archive_builder->write_object_to_file($taxon);
  181. }
  182. //start structured data - habitat
  183. $rek = array();
  184. $rek['source'] = $taxon->furtherInformationURL;
  185. $rek['taxon_id'] = $taxon->taxonID;
  186. $rek['citation'] = $rec['authorship_citation'];
  187. if($val = @$rec['habitat'])
  188. {
  189. foreach($val as $subsection => $terms)
  190. {
  191. foreach($terms as $term)
  192. {
  193. $rek['catnum'] = $taxon->taxonID . "_[" . $subsection . "]_" . $term;
  194. $rek['catnum'] = md5($rek['catnum']);
  195. if($val = @$habitats[$term]) self::add_string_types($rek, $val, "http://eol.org/schema/terms/Habitat");
  196. else
  197. {
  198. $section = @$subsections[$subsection]['section'];
  199. $this->debug[$rec['source']][$section][$subsection][$term] = '';
  200. }
  201. }
  202. }
  203. }
  204. //start structured data - life form
  205. if($val = @$rec['life_form'])
  206. {
  207. foreach($val as $life_form)
  208. {
  209. $rek['catnum'] = $taxon->taxonID . "_" . $life_form;
  210. self::add_string_types($rek, "http://eol.org/schema/terms/".self::format_life_form($life_form), "http://eol.org/schema/terms/PlantHabit");
  211. }
  212. }
  213. }
  214. private function format_life_form($life_form)
  215. {
  216. //manual adjustment
  217. $life_form = str_ireplace("\ntherophyte", "therophyte", $life_form);
  218. $life_form = str_ireplace("phytes", "phyte", $life_form);
  219. $arr = explode(" ", $life_form);
  220. if(@$arr[1])
  221. {
  222. $arr[1] = ucfirst($arr[1]);
  223. $arr[0] = strtolower($arr[0]);
  224. return implode("", $arr);
  225. }
  226. else return strtolower($life_form);
  227. }
  228. private function add_string_types($rec, $value, $mtype)
  229. {
  230. $taxon_id = $rec['taxon_id'];
  231. $catnum = $rec['catnum'];
  232. $m = new \eol_schema\MeasurementOrFact();
  233. $occurrence_id = $this->add_occurrence($taxon_id, $catnum, $rec);
  234. $m->occurrenceID = $occurrence_id;
  235. $m->measurementOfTaxon = 'true';
  236. $m->measurementType = $mtype;
  237. $m->measurementValue = $value;
  238. $m->source = $rec['source'];
  239. if($val = @$rec['citation']) $m->bibliographicCitation = $val;
  240. // $m->measurementMethod = '';
  241. // $m->measurementRemarks = '';
  242. // $m->contributor = '';
  243. $this->archive_builder->write_object_to_file($m);
  244. }
  245. private function add_occurrence($taxon_id, $catnum, $rec)
  246. {
  247. $occurrence_id = $catnum; //can be just this, no need to add taxon_id
  248. if(isset($this->occurrence_ids[$occurrence_id])) return $occurrence_id;
  249. $o = new \eol_schema\Occurrence();
  250. $o->occurrenceID = $occurrence_id;
  251. $o->taxonID = $taxon_id;
  252. $this->archive_builder->write_object_to_file($o);
  253. $this->occurrence_ids[$occurrence_id] = '';
  254. return $occurrence_id;
  255. }
  256. private function get_spreadsheet($spreadsheet)
  257. {
  258. require_library('connectors/LifeDeskToScratchpadAPI');
  259. $func = new LifeDeskToScratchpadAPI();
  260. $final = array();
  261. $habitats = array();
  262. $spreadsheet_options = array("cache" => 0, "timeout" => 3600, "file_extension" => "xlsx", 'download_attempts' => 2, 'delay_in_minutes' => 1); //we don't want to cache spreadsheet
  263. if($filename = Functions::save_remote_file_to_local($spreadsheet, $spreadsheet_options))
  264. {
  265. if($arr = $func->convert_spreadsheet($filename, 0, $spreadsheet_options))
  266. {
  267. $i = 0;
  268. foreach($arr['subsection'] as $subsection)
  269. {
  270. if($subsection)
  271. {
  272. $final[$subsection]['section'] = $arr['section'][$i];
  273. $final[$subsection]['habitats'][] = $arr['source text'][$i];
  274. }
  275. $habitats[$arr['source text'][$i]] = $arr['term'][$i];
  276. $i++;
  277. }
  278. }
  279. unlink($filename);
  280. }
  281. $final = array_filter($final); //remove null arrays
  282. $habitats = array_filter($habitats); //remove null arrays
  283. return array('subsections' => $final, 'habitats' => $habitats);
  284. }
  285. private function clean_html($html)
  286. {
  287. $html = str_ireplace(array("\n", "\r", "\t", "\o", "\xOB", "\11", "\011"), "", trim($html));
  288. return Functions::remove_whitespace($html);
  289. }
  290. private function format_utf8($str)
  291. {
  292. if(Functions::is_utf8($str)) return $str;
  293. else return utf8_encode($str);
  294. }
  295. }
  296. ?>