PageRenderTime 25ms CodeModel.GetById 22ms RepoModel.GetById 0ms app.codeStats 0ms

/util.php

https://github.com/aramk/IMDb-Scraper
PHP | 179 lines | 142 code | 22 blank | 15 comment | 38 complexity | 2e0e27f33cd747198ad56954f1e4192b MD5 | raw file
  1. <?php
  // Fetches $url with cURL and returns the response body as a string.
  // Follows up to 5 redirects and waits at most 20 seconds to connect. The
  // User-Agent of the current request is forwarded when there is one.
  // Throws an Exception when the final HTTP status code is outside the
  // 200-399 range or when the transfer itself fails.
  function curl_get_html($url) {
      $ch = curl_init($url);
      $options = array(
          CURLOPT_RETURNTRANSFER => TRUE,
          CURLOPT_FOLLOWLOCATION => TRUE,
          CURLOPT_MAXREDIRS => 5,
          CURLOPT_CONNECTTIMEOUT => 20,
      );
      // $_SERVER['HTTP_USER_AGENT'] is not set on the CLI or when the client
      // sent no User-Agent header, so only forward it when it exists.
      if (isset($_SERVER['HTTP_USER_AGENT'])) {
          $options[CURLOPT_USERAGENT] = $_SERVER['HTTP_USER_AGENT'];
      }
      curl_setopt_array($ch, $options);
      $html = curl_exec($ch);
      $http_code = curl_getinfo($ch, CURLINFO_HTTP_CODE);
      $error = curl_error($ch);
      // Release the handle before any throw so it is not leaked on failure.
      curl_close($ch);
      if ($http_code < 200 || $http_code >= 400) {
          throw new Exception("cURL received HTTP code ".$http_code);
      }
      // curl_exec() returns FALSE on a failed transfer; do not hand that back
      // to the caller as if it were HTML.
      if ($html === FALSE) {
          throw new Exception("cURL request failed: ".$error);
      }
      return $html;
  }
  // Counts the total number of times any of the $needles occur in $haystack.
  // $needles is an array of substrings; the per-needle counts are summed.
  // Matching is case-insensitive unless $insensitive is FALSE. Empty needles
  // contribute nothing and are skipped, because substr_count() rejects them.
  function substr_count_array( $haystack, $needles, $insensitive = TRUE ) {
      // Lower-case the haystack once instead of once per needle.
      if ($insensitive) {
          $haystack = strtolower($haystack);
      }
      $count = 0;
      foreach ($needles as $substring) {
          if ((string) $substring === '') {
              continue;
          }
          if ($insensitive) {
              $substring = strtolower($substring);
          }
          $count += substr_count( $haystack, $substring);
      }
      return $count;
  }
  // Case-insensitive variant of substr_count(): both arguments are
  // lower-cased with strtolower() before the occurrences are counted.
  function substr_count_i( $haystack, $substring ) {
      return substr_count( strtolower($haystack), strtolower($substring) );
  }
  // Returns an array holding, for each entry of $haystacks, the number of
  // occurrences of the $needles as counted by substr_count_array() with its
  // default arguments. Each count is stored under the key of the haystack it
  // belongs to, so associative and sparse arrays work as well as plain lists.
  function substr_count_arrays( $haystacks, $needles ) {
      $counts = array();
      // Iterate by key rather than by 0..count()-1 so no entry is skipped and
      // no undefined offset is read.
      foreach ($haystacks as $key => $haystack) {
          $counts[$key] = substr_count_array($haystack, $needles);
      }
      return $counts;
  }
  44. //$needles = array('fuck', 'cunt', 'bitch');
  45. //$haystacks = array('fuck bitch', 'fuck fuck fuck', 'fuck cunt');
  46. //var_dump(substr_count_arrays($haystacks, $needles));
  // Recursively trims whitespace from a string or from every string held in a
  // (possibly nested) array. A top-level value that is neither a string nor an
  // array yields ''; non-string values inside an array are left untouched.
  function trim_r($array) {
      if (is_string($array)) {
          return trim($array);
      }
      if (!is_array($array)) {
          return '';
      }
      foreach ($array as $key => $value) {
          if (is_array($value)) {
              $array[$key] = trim_r($value);
          } elseif (is_string($value)) {
              $array[$key] = trim($value);
          }
      }
      return $array;
  }
  // Runs $regex against $str and returns the capture group at $index
  // (0 is the whole match, 1 the first group, and so on).
  // $clean selects post-processing of the result: the string 'num' passes it
  // through clean_num(), any other truthy value through clean_str(), and a
  // falsy value returns it raw.
  // Returns '' when the pattern does not match or the group does not exist.
  function regex_get($regex, $str, $index = 1, $clean = TRUE) {
      $matches = array();
      $index = intval($index);
      // isset() covers every "nothing to return" case at once: an index past
      // the last group as well as a negative index.
      if (!preg_match($regex, $str, $matches) || !isset($matches[$index])) {
          return '';
      }
      $match = $matches[$index];
      if ($clean === 'num') {
          $match = clean_num($match);
      } else if ($clean == TRUE) {
          $match = clean_str($match);
      }
      return $match;
  }
  // Extracts the title id (the path segment following "title/") from $url,
  // e.g. ".../title/tt0111161/" gives "tt0111161". The id must be followed by
  // a slash.
  // Returns '' when $url is not a string and FALSE when no id is found.
  function imdb_url_id($url) {
      if (!is_string($url)) {
          return '';
      }
      // [^/]+ stops at the first slash after the id; a greedy (.*) would also
      // swallow further path segments such as "tt0111161/fullcredits".
      $id = regex_get('#title/([^/]+)/#', $url, 1);
      return empty($id) ? FALSE : $id;
  }
  // Appends a slash to $url when its last character is not already a slash.
  // An empty string has no last character and is returned unchanged.
  function url_add_slash($url) {
      $pattern = '#([^\\/])$#';
      $replacement = '\1/';
      return preg_replace($pattern, $replacement, $url);
  }
  // Strips HTML tags from $str, decodes HTML entities as UTF-8 (quote
  // entities stay encoded because of ENT_NOQUOTES) and trims the result.
  // When $quotes is truthy, literal double and single quote characters are
  // removed; when $only_chars is truthy, everything except word characters
  // and whitespace is removed. Non-string input is returned unchanged.
  function clean_str($str, $quotes = FALSE, $only_chars = FALSE) {
      if (!is_string($str)) {
          return $str;
      }
      $without_tags = strip_tags($str);
      $decoded = html_entity_decode($without_tags, ENT_NOQUOTES, 'UTF-8');
      $result = trim($decoded);
      if ($quotes) {
          $result = preg_replace('#"|\'#', '', $result);
      }
      if ($only_chars) {
          $result = preg_replace('#[^\\w\\s]#', '', $result);
      }
      return $result;
  }
  // Reduces $str to its numeric characters: the value is first passed through
  // clean_str(), then every character other than a digit, '.' or ',' is removed.
  function clean_num($str) {
      $cleaned = clean_str($str);
      return preg_replace('#[^\d\.,]#', '', $cleaned);
  }
  // Converts $str to a number, choosing the narrower type: the intval()
  // result is returned when it equals the floatval() result, otherwise the
  // float is returned.
  function numval($str) {
      $as_int = intval($str);
      $as_float = floatval($str);
      return ($as_int == $as_float) ? $as_int : $as_float;
  }
  // Recursively applies clean_str() to every string held in $array, descending
  // into nested arrays. Keys are preserved, and values that are neither
  // strings nor arrays are left untouched. A non-array argument is returned
  // unchanged.
  function clean_array($array) {
      if (!is_array($array)) {
          return $array;
      }
      // Iterate by key rather than by 0..count()-1 so that string-keyed and
      // sparse arrays are cleaned too and no undefined offset is read.
      foreach ($array as $key => $value) {
          if (is_array($value)) {
              $array[$key] = clean_array($value);
          } else if (is_string($value)) {
              $array[$key] = clean_str($value);
          }
      }
      return $array;
  }
  // Produces a comparison key for $str: the text is cleaned with clean_str(),
  // every non-word character (anything other than a letter, digit or '_') is
  // dropped and the remainder is lower-cased. Non-string input yields ''.
  function normalise($str) {
      if (!is_string($str)) {
          return '';
      }
      $cleaned = clean_str($str);
      $word_chars = preg_replace('#[^\\w]#', '', $cleaned);
      return strtolower($word_chars);
  }
  // Returns TRUE when $needle occurs in $haystack after both have been passed
  // through normalise(), FALSE otherwise. Also returns FALSE when either
  // argument is not a string or when the needle normalises to an empty string.
  function normpos($haystack, $needle) {
      if (!is_string($haystack) || !is_string($needle)) {
          return FALSE;
      }
      $needle = normalise($needle);
      // A needle without word characters normalises to ''. stripos() reports
      // an empty needle as found at offset 0 on PHP 8 and warns on older
      // versions, so treat it as "not found" explicitly.
      if ($needle === '') {
          return FALSE;
      }
      return stripos(normalise($haystack), $needle) !== FALSE;
  }
  // Reports whether $str1 and $str2 are identical once both have been passed
  // through normalise(). Returns FALSE when either argument is not a string.
  function normeq($str1, $str2) {
      $both_strings = is_string($str1) && is_string($str2);
      return $both_strings ? (normalise($str1) === normalise($str2)) : FALSE;
  }
  157. ?>