PageRenderTime 59ms CodeModel.GetById 22ms RepoModel.GetById 0ms app.codeStats 0ms

/system/application/controllers/eu_scraper.php

https://github.com/cflabs/Product-Recalls
PHP | 665 lines | 408 code | 153 blank | 104 comment | 69 complexity | feb8a77f0f8b94e93b0e3a46eb7812cb MD5 | raw file
  1. <?php
  2. class EU_Scraper extends Controller {
  3. private $log = array();
  4. private $imported_counter = 0;
  5. private $recalls_counter = 0;
  6. private $recent_updates = FALSE;
  7. private $updated_recalls = 0;
  8. private $updates_counter = 0;
  /**
   * Entry point for a full EU scrape run.
   *
   * Only runs when the executing script is run_scraper.php; any other
   * entry script gets a 404. A run scrapes the feed, checks whether
   * anything recent exists, mails the log, clears the recent-recalls
   * cache (when caching is active) and renders the 'import' view.
   */
  function index() {
    // reject requests that did not come in through run_scraper.php
    $launched_by_runner = substr_count($_SERVER['SCRIPT_FILENAME'],'run_scraper.php') >= 1;
    if ( ! $launched_by_runner) {
      $this->load->helper('url_helper');
      show_404(current_url());
      exit;
    }

    // dependencies used by the scrape itself
    foreach (array('scraping_helper','security_helper') as $helper_name) {
      $this->load->helper($helper_name);
    }
    $this->load->model('Recall_model','',TRUE);
    $this->load->library('MP_Cache');

    // scrape, then check for recent imports, then mail the collected log
    $this->begin_scrape();
    $this->checker();
    $this->log_mail();

    // drop the cached recent-recalls entry so new imports show up
    if (WWW_CACHE_ACTIVE) {
      $this->mp_cache->delete('recent-recalls');
    }

    $this->load->view('import');
  }
  /**
   * Imports a single RAPEX page by its rx_id, stamping the rows with $dt.
   *
   * @param mixed  $id RAPEX rx_id; anything that is not a positive integer is ignored
   * @param string $dt value stored as date_scraped for every imported row
   */
  function bypass_import($id=0,$dt="") {
    //load helper and model
    $this->load->helper('scraping_helper');
    $this->load->model('Recall_model','',TRUE);
    // $id is supplied by the caller (possibly straight from a URI segment),
    // so force it to an integer before it is placed in the remote URL;
    // a value such as "12&x=y" would otherwise pass the loose "> 0" test
    $rapex_id = (int) $id;
    if ($rapex_id > 0) {
      $this->back_scrape_week("http://ec.europa.eu/consumers/dyna/rapex/create_rapex.cfm?rx_id=".$rapex_id,$dt);
    }
  }
  /**
   * Records one log line in three places: the output stream, the in-memory
   * log (later mailed by log_mail) and the framework log.
   *
   * @param string $message  text to record
   * @param string $log_type level handed to log_message()
   */
  private function log($message,$log_type='debug') {
    // echo to the screen
    echo $message."\n<br/>";
    // keep it for the log email
    $this->log[] = $message;
    // and hand it to the framework logger, tagged with the scraper prefix
    log_message($log_type,SCRAPER_EMAIL_LOG_PREFIX." ".$message);
  }
  /**
   * Emails the collected log lines to SCRAPER_EMAIL_LOG.
   *
   * @param bool $complete TRUE for a normal run summary (subject carries the
   *                       import/update counters), FALSE for an error mail
   */
  private function log_mail($complete = TRUE) {
    $this->load->library('email');

    // TRUE while checker() has not confirmed any recent update
    $stale = ! $this->recent_updates;

    //sender and recipient
    $this->email->from(EMAILER_ADDRESS,SITE_NAME);
    $this->email->to(SCRAPER_EMAIL_LOG);

    //subject line
    if ( ! $complete) {
      $subject = SCRAPER_EMAIL_LOG_PREFIX." ERROR";
    } else {
      $stale_tag = $stale ? " NO RECENT UPDATES!" : "";
      $subject = SCRAPER_EMAIL_LOG_PREFIX.$stale_tag." Scraped ".$this->imported_counter."/".$this->recalls_counter;
      if ($this->updates_counter > 0) {
        $subject .= " Updated ".$this->updated_recalls."/".$this->updates_counter;
      }
    }
    $this->email->subject($subject);

    //body: optional warning banner followed by the log lines
    $banner = "";
    if ($stale) {
      $rule = "*********************************************\n";
      $banner = $rule."WARNING: There have been no updates for 7 days. Please check, something may be broken.\n".$rule."\n";
    }
    $this->email->message($banner.implode("\n",$this->log));

    $this->email->send();
  }
  /**
   * Fetches the RAPEX archive RSS feed and scrapes the first ten weekly
   * pages it lists, then logs the import/update totals.
   *
   * A failure while fetching the feed is logged, mailed and ends the script.
   */
  private function begin_scrape() {
    $this->log("starting EU scraper ".time());
    try {
      //fetch the rss feed
      $rss_content = scrape_content("http://ec.europa.eu/consumers/dyna/rapex/rapex_archives_rss_en.cfm");
    } catch (Exception $e) {
      $this->log("fatal error: ". $e->getMessage(),'error');
      $this->log_mail(FALSE);
      exit;
    }

    if ($rss_content == "") {
      $this->log("fatal error: no content - feed empty",'error');
    } else {
      //each <guid> in the feed is the url of one weekly page
      preg_match_all("/<guid>(.*?)<\/guid>/s",$rss_content,$link_matches,PREG_PATTERN_ORDER);
      $week_links = $link_matches[1];
      $this->log("found ".count($week_links)." weeks listed in feed");

      if (count($week_links)) {
        //only the first ten entries of the feed are scraped
        foreach (array_slice($week_links,0,10) as $week_url) {
          $this->scrape_week($week_url);
        }
      } else {
        $this->log("fatal error: no links to scrape",'error');
        $this->log_mail(FALSE);
      }
    }

    $this->log("imported ".$this->imported_counter." of ".$this->recalls_counter);
    $this->log("updated ".$this->updated_recalls." of ".$this->updates_counter);
    $this->log("end ".time());
  }
  /**
   * Scrapes one weekly RAPEX page: imports every recall row that is not yet
   * stored (source 2) and then checks the same page for removal notices.
   *
   * Exceptions are logged rather than propagated; the method always sleeps
   * for SCRAPER_SLEEP seconds before returning.
   *
   * @param string $url address of the weekly page
   */
  private function scrape_week($url) {
    $this->log("-scraping ".$url);
    try {
      $page_html = scrape_content($url);
      if ($page_html != "") {
        //reg exs
        $table_regex = "/<table border=\"1\" cellpadding=\"6\" cellspacing=\"0\" style=\"border-collapse: collapse\" bordercolor=\"#111111\" width=\"100%\" id=\"AutoNumber1\" height=\"100%\">(.*?)<\/table>/s";
        $row_regex = "/<tr>(.*?)<\/tr>/s";
        //get tables
        preg_match_all($table_regex,$page_html,$table_matches,PREG_PATTERN_ORDER);
        //get rows from the first table; without this guard a page whose
        //layout does not match would read an undefined offset
        $rows = array();
        if ( ! empty($table_matches[1])) {
          preg_match_all($row_regex,$table_matches[1][0],$row_matches,PREG_PATTERN_ORDER);
          //the first row only contains the column headings
          $rows = array_slice($row_matches[1],1);
        } else {
          $this->log("-error: recalls table not found",'error');
        }
        foreach ($rows as $row) {
          $result = $this->process_row($row);
          //rows that could not be parsed come back with an empty title
          if ($result['title'] != "") {
            //does the product already exist?
            $exists = $this->Recall_model->id_exist($result['id'],2);
            if (!$exists) {
              $this->log("-- importing product (".$result['id'].")");
              //produce the unique url, prefixed with the brand when there is one
              $slug_title = $result['title'];
              if ($result['brand'] != "") { $slug_title = $result['brand_cleaned']." ".$slug_title; }
              $internal_url = $this->produce_slug($slug_title,$result['id']);
              //get category id
              $category_id = $this->get_category_id($result['category']);
              //store in database
              $data = array(
                'product_name' => $result['title'],
                'description' => $result['content_cleaned'],
                'danger' => $result['danger'],
                'measures_taken' => $result['measures'],
                'country' => $result['country'],
                'category' => $result['category'],
                'category_id' => $category_id,
                'brand' => $result['brand'],
                'external_img_url' => $result['image'],
                'external_lrgimg_url' => $result['image_large'],
                'external_url' => $url,
                'internal_url' => $internal_url,
                'source_id' => $result['id'],
                'source' => 2
              );
              $this->db->insert('recalls',$data);
              $this->imported_counter++;
            } else {
              $this->log("-- product already recorded (".$result['id'].")");
            }
            $this->recalls_counter++;
          }
        }
        //removal notices live outside the table, so check them regardless
        $this->check_for_updates($page_html);
      } else {
        $this->log("-error: no content - empty page");
      }
    } catch (Exception $e) {
      $this->log("-error: ". $e->getMessage(),'error');
    }
    sleep(SCRAPER_SLEEP);
  }
  /**
   * Resolves a scraped category name to a category id, creating the
   * category when no row with the same slug exists yet.
   *
   * @param string $category category name as scraped
   * @return mixed id reported by Category_model (existing or newly added)
   */
  private function get_category_id($category) {
    $this->load->model('Category_model','',TRUE);
    $this->load->helper('slug_helper');

    //product information sometimes leaks into the category text;
    //when it does, keep only what precedes " Product"
    if (strpos($category, 'Product:') !== false) {
      if (preg_match("/(.*?) Product.*?/s",$category,$leak_match)) {
        $category = html_entity_decode(trim(strip_tags($leak_match[1])));
      }
    }

    $slug = produce_slug($category,"-");

    //fold near-identical categories into one
    if ($slug == "personal-protective-equipment") {
      $slug = "protective-equipment";
    } elseif ($slug == "lighting-chains") {
      $slug = "lighting-equipment";
    }

    //an existing category is reported as a positive value
    $existing_id = $this->Category_model->category_exists($slug);
    if ($existing_id > 0) {
      echo ('<div style="background:#3981AA;padding:5px;margin:5px;color:#fff;font-weight:bold;">');
      echo ('Exists: '.$existing_id);
      echo ('</div>');
      return $existing_id;
    }

    //not known yet: add it
    $new_id = $this->Category_model->add_category($category,$slug);
    echo ('<div style="background:#398100;padding:5px;margin:5px;color:#fff;font-weight:bold;">');
    echo ('Create: '.$slug.' ('.$new_id.')');
    echo ('</div>');
    return $new_id;
  }
  /**
   * Turns the inner HTML of one table row into an associative array.
   *
   * A row is only parsed when it has exactly six cells: cell 0 becomes the
   * id, cell 1 the country, cell 2 the description (plus image and product
   * info), cell 3 the danger and cell 4 the measures. Any other row yields
   * array('title' => "") so callers can skip it.
   *
   * @param string $row inner HTML of a <tr>
   * @return array parsed fields, or only an empty 'title' for unusable rows
   */
  private function process_row($row) {
    $td_regex = "/<td.*?>(.*?)<\/td>/s";
    preg_match_all($td_regex,$row,$td_matches,PREG_PATTERN_ORDER);
    $cells = $td_matches[1];

    //anything that is not a six-column row is not a recall
    if (count($cells) != 6) {
      return array('title' => "");
    }

    $image = $this->process_image($cells[2]);
    $info = $this->process_product_info($cells[2]);

    $return = array();
    //"/" and "<br>" in the reference are turned into "-"
    $return['id'] = trim(html_entity_decode(strip_tags(str_replace("/","-",str_replace("<br>","-",$cells[0])))));
    $return['country'] = trim(html_entity_decode(strip_tags($cells[1])));
    $return['content'] = trim(html_entity_decode(strip_tags(str_replace("<br/>","\n",$cells[2]))));
    $return['danger'] = trim(html_entity_decode(strip_tags(str_replace("<br/><br/>","\n",$cells[3]))));
    $return['measures'] = trim(html_entity_decode(strip_tags($cells[4])));
    $return['image'] = $image['image'];
    $return['image_large'] = $image['image_large'];
    $return['category'] = $info['category'];
    $return['title'] = $info['product'];
    $return['brand'] = $info['brand'];
    //keep letters, digits, space, comma and hyphen only; the hyphen must be
    //last in the class, otherwise " -," is read as a range from space to
    //comma, which strips hyphens and lets punctuation such as !"#$%&'()*+ through
    $return['brand_cleaned'] = preg_replace('/[^a-zA-Z0-9 ,-]/','',$info['brand']);
    $return['content_cleaned'] = $info['content_cleaned'];
    return $return;
  }
  /**
   * Extracts the thumbnail and zoom image urls from a description cell.
   *
   * @param string $content HTML of the description cell
   * @return array 'image' (thumbnail path prefixed with http://ec.europa.eu)
   *               and 'image_large' (the ZoomPicture argument as found);
   *               each is "" when no matching .jpg is present
   */
  private function process_image($content) {
    //default values
    $return = array();
    $return['image'] = "";
    $return['image_large'] = "";
    //image reg ex; the dot before "jpg" is escaped so only a real ".jpg"
    //extension matches (an unescaped dot accepts any character, and the
    //code below would then append ".jpg" to a wrong path)
    $img_regex = "/<img src=\"(.*?)\.jpg\".*?\/>/s";
    $img_l_regex = "/<img .*? onClick=\"ZoomPicture\('(.*?)\.jpg'\)\".*?\/>/s";
    //if there is an image, store it
    if (preg_match($img_regex,$content,$img_match)) {
      $return['image'] = "http://ec.europa.eu".$img_match[1].".jpg";
    }
    //if there is a large image, store it
    if (preg_match($img_l_regex,$content,$img_l_match)) {
      $return['image_large'] = $img_l_match[1].".jpg";
    }
    //return the object
    return $return;
  }
  /**
   * Picks category, product name, brand and the remaining description text
   * out of a description cell.
   *
   * @param string $content HTML of the description cell
   * @return array 'category' (default "Unknown"), 'product' (default ""),
   *               'brand' (default "Unknown"), 'content_cleaned' (default "")
   */
  private function process_product_info($content) {
    //default values
    $return = array();
    $return['category'] = "Unknown";
    $return['product'] = "";
    $return['brand'] = "Unknown";
    $return['content_cleaned'] = "";
    //regex for picking out category, product and brand from description
    $category_regex = "/Category: (.*?)<br\/>/s";
    $product_regex = "/Product: (.*?)<br\/>/s";
    //the brand ends at "Type" or at the line break, whichever comes first;
    //a single capture group is used so the brand is always in group 1
    //(with two alternated groups a brand ended by <br/> was left in group 2
    //and read back as an empty string)
    $brand_regex = "/Brand: (.*?)(?:Type|<br\/>)/s";
    $content_regex = "/Category: .*?<br\/>.*?Product: .*?<br\/>(.*?)<\/span>/s";
    //if there is a category, store it
    if (preg_match($category_regex,$content,$category_match)) {
      $return['category'] = html_entity_decode(trim(strip_tags($category_match[1])));
    }
    //if there is a product, store it
    if (preg_match($product_regex,$content,$product_match)) {
      $return['product'] = html_entity_decode(trim(strip_tags($product_match[1])));
    }
    //if there is a brand, store it
    if (preg_match($brand_regex,$content,$brand_match)) {
      $return['brand'] = html_entity_decode(trim(strip_tags($brand_match[1])));
    }
    //if there is content after the product line, store it with line breaks kept
    if (preg_match($content_regex,$content,$content_match)) {
      $return['content_cleaned'] = trim(html_entity_decode(strip_tags(str_replace("<br/>","\n",$content_match[1]))));
    }
    return $return;
  }
  /**
   * Builds an internal url for a recall that is not already in use.
   *
   * Tries the plain slug first, then "-1" to "-10" suffixes; if all of
   * those are taken the recall id is appended instead (without a further
   * existence check).
   *
   * @param string $title text the slug is derived from
   * @param string $id    recall id used as the last-resort suffix
   * @return string
   */
  private function produce_slug($title,$id) {
    $this->load->helper('slug_helper');
    $base = produce_slug($title,"-");
    $candidate = $base;
    $suffix = 0;
    //keep going while the candidate is already taken
    while ($this->Recall_model->internal_url_exist($candidate)) {
      if ($suffix >= 10) {
        //numbered variants exhausted, fall back to the id
        $candidate = $base."-".$id;
        break;
      }
      $suffix++;
      $candidate = $base."-".$suffix;
    }
    return $candidate;
  }
  /**
   * Records in $this->recent_updates whether Recall_model reports any
   * recent updates, and logs a warning when it reports none.
   */
  private function checker() {
    $this->load->model('Recall_model','',TRUE);
    $this->load->library('email');
    $this->load->helper('date');

    $nothing_recent = $this->Recall_model->are_there_recent_updates() < 1;
    $this->recent_updates = ! $nothing_recent;
    if ($nothing_recent) {
      //flagged so the log email carries the warning
      $this->log("WARNING: There have been no recent updates. Please check as there might be a problem",'warning');
    }
  }
  /**
   * Scrapes one RAPEX page like scrape_week(), but stamps every imported
   * row with the supplied date and does not look for removal notices.
   *
   * Exceptions are logged rather than propagated; the method always sleeps
   * for SCRAPER_SLEEP seconds before returning.
   *
   * @param string $url address of the page
   * @param string $dt  value stored in date_scraped
   */
  private function back_scrape_week($url,$dt) {
    $this->log("-scraping ".$url);
    try {
      $page_html = scrape_content($url);
      if ($page_html != "") {
        //reg exs
        $table_regex = "/<table border=\"1\" cellpadding=\"6\" cellspacing=\"0\" style=\"border-collapse: collapse\" bordercolor=\"#111111\" width=\"100%\" id=\"AutoNumber1\" height=\"100%\">(.*?)<\/table>/s";
        $row_regex = "/<tr>(.*?)<\/tr>/s";
        //get tables
        preg_match_all($table_regex,$page_html,$table_matches,PREG_PATTERN_ORDER);
        //get rows from the first table; without this guard a page whose
        //layout does not match would read an undefined offset
        $rows = array();
        if ( ! empty($table_matches[1])) {
          preg_match_all($row_regex,$table_matches[1][0],$row_matches,PREG_PATTERN_ORDER);
          //the first row only contains the column headings
          $rows = array_slice($row_matches[1],1);
        } else {
          $this->log("-error: recalls table not found",'error');
        }
        foreach ($rows as $row) {
          $result = $this->process_row($row);
          //rows that could not be parsed come back with an empty title
          if ($result['title'] != "") {
            //does the product already exist?
            $exists = $this->Recall_model->id_exist($result['id'],2);
            if (!$exists) {
              $this->log("-- importing product (".$result['id'].")");
              //produce the unique url, prefixed with the brand when there is one
              $slug_title = $result['title'];
              if ($result['brand'] != "") { $slug_title = $result['brand_cleaned']." ".$slug_title; }
              $internal_url = $this->produce_slug($slug_title,$result['id']);
              //resolve the id from the category NAME; process_row() never
              //sets a 'category_id' key, so reading it was always undefined
              $category_id = $this->get_category_id($result['category']);
              //store in database
              $data = array(
                'product_name' => $result['title'],
                'description' => $result['content_cleaned'],
                'danger' => $result['danger'],
                'measures_taken' => $result['measures'],
                'country' => $result['country'],
                'category' => $result['category'],
                'category_id' => $category_id,
                'brand' => $result['brand'],
                'external_img_url' => $result['image'],
                'external_lrgimg_url' => $result['image_large'],
                'external_url' => $url,
                'internal_url' => $internal_url,
                'source_id' => $result['id'],
                'source' => 2,
                'date_scraped' => $dt
              );
              $this->db->insert('recalls',$data);
              $this->imported_counter++;
            } else {
              $this->log("-- product already recorded (".$result['id'].")");
            }
            $this->recalls_counter++;
          }
        }
      } else {
        $this->log("-error: no content - empty page");
      }
    } catch (Exception $e) {
      $this->log("-error: ". $e->getMessage(),'error');
    }
    sleep(SCRAPER_SLEEP);
  }
  /**
   * Back-fills category_id on every stored recall that does not have one,
   * using the recall's category name ("Other" when the name is empty).
   */
  public function categorise() {
    $this->output->enable_profiler();
    $this->load->model('Recall_model','',TRUE);
    $this->load->model('Category_model','',TRUE);
    $this->load->helper('Security_helper');

    foreach ($this->Recall_model->get_all_entries() as $recall) {
      //recalls that already carry a category id are left alone
      if ( ! ($recall['category_id'] < 1)) {
        continue;
      }
      //unnamed categories are filed under "Other"
      $category_name = ($recall['category'] == "") ? "Other" : $recall['category'];
      //resolve the id before the where clause is queued
      $resolved_id = $this->get_category_id($category_name);
      $this->db->where('id',$recall['id']);
      $this->db->update('recalls',array('category_id'=>$resolved_id));
    }
  }
  /**
   * Looks for "Notification NNNN/NN ... removed ..." notes on a weekly page
   * and marks the matching stored recalls (source 2) as removed.
   *
   * Each recognised note increments updates_counter; each recall actually
   * updated increments updated_recalls. The method always sleeps for
   * SCRAPER_SLEEP seconds before returning.
   *
   * @param string $page_html HTML of the weekly page
   */
  private function check_for_updates($page_html) {
    $this->output->enable_profiler();
    $this->log("-- checking for updates");
    try {
      if ($page_html == "") {
        $this->log("-error: no content - empty page");
      } else {
        //reg exs
        $note_regex = "/<p class=\"texte\"><font color=\"CC0000\"><font color=\"#ff0000\"><b>[A-Za-z\:]*<\/b><\/font>.*?<font color=\"#000000\">(.*?)<\/font><\/font><\/p>/s";
        $content_regex = "/Notification (\d{4}\/\d{2}).*?removed(.*?)\./";
        //collect the notes
        preg_match_all($note_regex,$page_html,$note_matches,PREG_PATTERN_ORDER);
        $notes = $note_matches[1];
        if (count($notes) < 1) {
          $this->log("--- no updates found");
        } else {
          $this->log("--- ".count($notes)." updates found");
        }
        foreach ($notes as $note) {
          //notes that do not announce a removal are skipped
          if ( ! preg_match($content_regex,$note,$removal)) {
            continue;
          }
          //the reference uses "-" where the page uses "/"
          $product_id = trim(html_entity_decode(strip_tags(str_replace("/","-",$removal[1]))));
          $removal_reason = trim(html_entity_decode(strip_tags($removal[2])));

          if ( ! $this->Recall_model->id_like($product_id,2)) {
            $this->log("--- cannot find (".$product_id.")");
          } elseif ($this->Recall_model->already_updated($product_id,2,'removed')) {
            $this->log("--- already removed (".$product_id.")");
          } else {
            //flag every recall whose source id ends with this reference
            $this->db->where('source_id like','%'.$product_id);
            $this->db->update('recalls',array(
              'status' => 'removed',
              'status_text' => $removal_reason,
              'status_updated' => date('Y-m-d 00:00:00',time())
            ));
            $this->updated_recalls ++;
            $this->log("--- removing (".$product_id.")");
          }
          $this->updates_counter ++;
        }
      }
    } catch (Exception $e) {
      $this->log("-error: ". $e->getMessage(),'error');
    }
    sleep(SCRAPER_SLEEP);
  }
  511. }
  512. ?>