/library.ucursos.scrapper.php

https://github.com/rduenasf/ucursos-scrapper · PHP · 106 lines · 83 code · 16 blank · 7 comment · 12 complexity · 75beea3f30b3e2c834c93a52c5dee83e MD5 · raw file

  1. <?php
  2. require('phpQuery/phpQuery.php');
  /**
   * Base scraper for U-Cursos pages.
   *
   * Wraps a cURL handle configured with a per-user cookie file, downloads a
   * page with fetch(), exposes a few status helpers for the last response and
   * lazily parses the downloaded HTML with phpQuery.
   */
  class UcursosScrapper {
      /** Last URL passed to fetch(). */
      public $url;
      /** HTTP status code of the last response, as reported by curl_getinfo(). */
      public $status_code;
      /** cURL handle used for the transfers. */
      public $curl_handler;
      /** Raw body of the last response (false when curl_exec() failed). */
      public $content;
      /** phpQuery document built from $content by process(). */
      public $phpQueryDocument;
      /** True once $content has been parsed by process(). */
      public $processed;
      /** Cached text of the "h2.ucursos" element, see getSeccion(). */
      public $nombre_seccion;
      /** Identifier used to name the cookie file of this session. */
      public $user_id;
      /** Charset announced by the last response, or null when none was given. */
      public $charset;

      /**
       * @param mixed $user_id Identifier used to build the cookie file name.
       */
      public function __construct($user_id) {
          $this->user_id = $user_id;
          $this->init_handler();
      }

      /**
       * Creates the cURL handle and applies the options shared by every request.
       */
      private function init_handler() {
          // NOTE(review): $user_id is concatenated into a filesystem path as-is;
          // confirm callers never pass untrusted values containing "/" or "..".
          $cookie_file = dirname(__FILE__)."/cookies/".$this->user_id.".txt";

          $this->curl_handler = curl_init();
          curl_setopt($this->curl_handler, CURLOPT_HEADER, 0);
          curl_setopt($this->curl_handler, CURLOPT_FOLLOWLOCATION, 1);
          // NOTE(review): certificate verification is disabled, which allows
          // man-in-the-middle attacks. Kept as-is to avoid breaking existing
          // deployments; enable it once a CA bundle is known to be available.
          curl_setopt($this->curl_handler, CURLOPT_SSL_VERIFYPEER, false);
          curl_setopt($this->curl_handler, CURLOPT_COOKIEJAR, $cookie_file);
          curl_setopt($this->curl_handler, CURLOPT_COOKIEFILE, $cookie_file);
          curl_setopt($this->curl_handler, CURLOPT_RETURNTRANSFER, 1);
          curl_setopt($this->curl_handler, CURLOPT_USERAGENT, "Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.8.1.3) Gecko/20070309 Firefox/2.0.0.3");
      }

      /**
       * Tells whether the last response was served to an authenticated user.
       *
       * @return bool false when the login page was returned or the server
       *              answered 403, true otherwise.
       */
      public function is_authorized() {
          // The response is the login page: it embeds the upasaporte script.
          // The cast keeps strpos() quiet when nothing has been fetched yet.
          if (strpos((string) $this->content, '<script src="https://www.u-cursos.cl/upasaporte/javascript?servicio=ucursos&UCURSOS_SERVER') !== false) {
              return false;
          }
          // The server answered "forbidden". Loose comparison is deliberate:
          // the code is compared regardless of being stored as int or string.
          if ($this->status_code == '403') {
              return false;
          }
          return true;
      }

      /**
       * @return bool false when the last recorded status code is 0.
       */
      public function is_connected() {
          return $this->status_code != '0';
      }

      /**
       * @return bool true when the last response had status 200.
       */
      public function exists() {
          return $this->status_code == '200';
      }

      /**
       * Downloads $url and records body, status code and charset.
       *
       * The method can be called repeatedly on the same instance: the cURL
       * handle is recreated when a previous call closed it, and the parse
       * cache is reset so process()/getSeccion() work on the new content.
       *
       * @param string $url           Address to download.
       * @param string $output_format Currently unused; kept for backward
       *                              compatibility (charset conversion of the
       *                              body is not performed).
       * @return bool true when the response is authorized and has status 200.
       */
      public function fetch($url, $output_format = 'UTF-8') {
          // A handle closed by an earlier fetch() is no longer a resource
          // (PHP < 8); on PHP >= 8 it is an object and stays usable.
          if (!is_resource($this->curl_handler) && !is_object($this->curl_handler)) {
              $this->init_handler();
          }

          $this->url = $url;
          // Drop anything derived from the previously fetched page.
          $this->processed = false;
          $this->nombre_seccion = null;

          curl_setopt($this->curl_handler, CURLOPT_URL, $this->url);
          $this->content = curl_exec($this->curl_handler);
          $info = curl_getinfo($this->curl_handler);
          // Closing the handle is kept from the original flow; per the libcurl
          // documentation the cookie jar is written when the handle is closed.
          curl_close($this->curl_handler);

          $this->status_code = $info['http_code'];
          $this->charset = self::parseCharset(isset($info['content_type']) ? $info['content_type'] : null);

          return $this->is_authorized() && $this->exists();
      }

      /**
       * Extracts the charset parameter from a Content-Type header value.
       *
       * @param string|null $content_type e.g. "text/html; charset=ISO-8859-1".
       * @return string|null The charset name without quotes or trailing
       *                     parameters, or null when none is present.
       */
      public static function parseCharset($content_type) {
          if (!is_string($content_type) || $content_type === '') {
              return null;
          }
          if (preg_match('/charset\s*=\s*["\']?([^\s;"\']+)/i', $content_type, $matches)) {
              return $matches[1];
          }
          return null;
      }

      /**
       * Parses the fetched content with phpQuery once and selects the
       * resulting document as phpQuery's current document.
       */
      public function process() {
          if ($this->processed) {
              return;
          }
          $this->processed = true;
          $this->phpQueryDocument = phpQuery::newDocumentHTML($this->content);
          phpQuery::selectDocument($this->phpQueryDocument);
      }

      /**
       * Returns the text of the "h2.ucursos" element of the fetched page.
       * The value is cached after the first lookup.
       *
       * @return string
       */
      public function getSeccion() {
          if ($this->nombre_seccion != null) {
              return $this->nombre_seccion;
          }
          $this->process();
          $this->nombre_seccion = pq('h2.ucursos')->text();
          return $this->nombre_seccion;
      }

      /**
       * Normalizes a label into one of the known user type identifiers.
       *
       * @param string $str     Label to normalize.
       * @param string $default Returned when the normalized label is empty.
       * @param string $nan     Returned when the label is not a known type.
       * @return string
       */
      public static function toUserType($str, $default = 'administrador_de_comunidad', $nan = 'miembro_de_comunidad') {
          $types = array('profesor_de_catedra', 'auxiliar', 'ayudante', 'alumno', 'miembro_de_comunidad', 'administrador_de_comunidad');
          $str = self::toAscii($str);
          if (!$str) {
              return $default;
          }
          return in_array($str, $types, true) ? $str : $nan;
      }

      /**
       * Builds a lowercase identifier from $str: characters other than ASCII
       * letters, digits and "/ _ | + - space" are removed, then every run of
       * separators is collapsed into $delimiter.
       *
       * NOTE(review): no transliteration is performed, so accented letters are
       * dropped rather than mapped to their base letter ("Cátedra" becomes
       * "ctedra"); confirm whether callers rely on that.
       *
       * @param string $str
       * @param string $delimiter Replacement for separator runs.
       * @return string
       */
      public static function toAscii($str, $delimiter='_') {
          $clean = preg_replace('/[^a-zA-Z0-9\/_|+ -]/', '', $str);
          $clean = strtolower(trim($clean, '-'));
          return preg_replace('/[\/_|+ -]+/', $delimiter, $clean);
      }
  }
  85. require('scrapper.resources/ucursos.cursos.scrapper.php');
  86. require('scrapper.resources/ucursos.foro.scrapper.php');
  87. require('scrapper.resources/ucursos.home.scrapper.php');
  88. require('scrapper.resources/ucursos.horario.scrapper.php');
  89. require('scrapper.resources/ucursos.notas.scrapper.php');
  90. require('scrapper.resources/ucursos.novedades.scrapper.php');