PageRenderTime 53ms CodeModel.GetById 26ms RepoModel.GetById 0ms app.codeStats 0ms

/core/php/crawler.utils.php

https://github.com/thiagof/omCrawler
PHP | 391 lines | 273 code | 48 blank | 70 comment | 16 complexity | b1609a8d35bf56deee99e756ced7062f MD5 | raw file
Possible License(s): GPL-2.0, MIT
  1. <?php
  /**
   * Join an associative array into a single string of "key<glue>value" pairs.
   *
   * A value that is itself an array is first flattened with a comma.
   * A non-array $array argument is handed back untouched.
   *
   * @param string $glue      Text placed between each key and its value
   * @param string $separator Text placed between consecutive pairs
   * @param array  $array     The array to implode
   * @return string|mixed The imploded string, or $array itself when it is not an array
   */
  function array_implode( $glue, $separator, $array ) {
      if ( is_array( $array ) === false ) {
          return $array;
      }

      $pairs = array();
      foreach ( $array as $name => $item ) {
          // Nested arrays are collapsed into a comma separated list.
          $flat = is_array( $item ) ? implode( ',', $item ) : $item;
          $pairs[] = $name . $glue . $flat;
      }

      return implode( $separator, $pairs );
  }
  //http://stackoverflow.com/questions/1019076/how-to-search-by-key-value-in-a-multidimensional-array-in-php
  /**
   * Recursively collect every (sub)array that holds $key with a value
   * loosely equal (==) to $value.
   *
   * The array itself is tested first, then each of its elements is searched
   * in turn, so matches are returned in depth-first order. Keys whose value is
   * null are never matched because the lookup relies on isset().
   *
   * @param mixed      $array Structure to search; anything that is not an array yields no match
   * @param int|string $key   Key to look for
   * @param mixed      $value Value the key must hold (loose comparison)
   * @return array List of matching arrays, empty when nothing matches
   */
  function array_msearch($array, $key, $value)
  {
      $results = array();

      if (!is_array($array)) {
          return $results;
      }

      if (isset($array[$key]) && $array[$key] == $value) {
          $results[] = $array;
      }

      foreach ($array as $subarray) {
          // Recurse into this very function: the original called an undefined
          // search() helper, which raised a fatal error on the first array.
          $results = array_merge($results, array_msearch($subarray, $key, $value));
      }

      return $results;
  }
  /**
   * Project every row of a set onto a chosen list of fields.
   *
   * Rows may be plain arrays or RedBean beans; a bean is converted with its
   * export() method before the projection. The result is re-indexed from 0.
   *
   * @param array $array     Rows (arrays or RedBean_OODBBean instances)
   * @param array $selection Names of the fields to keep in each row
   * @return array Rows reduced to the selected fields
   */
  function array_select($array, $selection) {
      $picked = array();

      foreach ( $array as $entry ) {
          $fields = ( $entry instanceof RedBean_OODBBean ) ? $entry->export() : $entry;
          // Flip the selection so its values become keys usable by array_intersect_key().
          $wanted = array_flip($selection);
          $picked[] = array_intersect_key($fields, $wanted);
      }

      return $picked;
  }
  /**
   * Pick one browser User-Agent string at random from a fixed pool.
   *
   * Source of the identifiers: http://www.zytrax.com/tech/web/browser_ids.htm
   *
   * @return string A User-Agent header value
   */
  function agent_random() {
      $pool = array(
          // Firefox
          'Mozilla/5.0 (Windows NT 6.1; rv:11.0) Gecko/20100101 Firefox/11.0',
          'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:10.0) Gecko/20100101 Firefox/10.0',
          'Mozilla/5.0 (Ubuntu; X11; Linux x86_64; rv:9.0.1) Gecko/20100101 Firefox/9.0.1',
          'Mozilla/5.0 (Windows NT 5.1; rv:7.0.1) Gecko/20100101 Firefox/7.0.1',
          'Mozilla/5.0 (Windows NT 5.1; rv:8.0) Gecko/20100101 Firefox/8.0',
          // Chrome
          'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_3) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.79 Safari/535.11',
          'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.0 Safari/535.11',
          'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.7 (KHTML, like Gecko) Chrome/16.0.912.77 Safari/535.7',
          'Mozilla/5.0 (Windows NT 5.1) AppleWebKit/535.2 (KHTML, like Gecko) Chrome/15.0.874.121 Safari/535.2',
          'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_6_3) AppleWebKit/535.1 (KHTML, like Gecko) Chrome/13.0.782.220 Safari/535.1',
          'Mozilla/5.0 (Windows NT 5.1) AppleWebKit/534.24 (KHTML, like Gecko) Chrome/11.0.696.57 Safari/534.24',
          // Internet Explorer
          'Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 6.1; WOW64; Trident/6.0)',
          'Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; WOW64; Trident/5.0)',
          'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; GTB6.4; .NET CLR 1.1.4322; FDM; .NET CLR 2.0.50727; .NET CLR 3.0.04506.30; .NET CLR 3.0.4506.2152; .NET CLR 3.5.30729)',
          // Opera
          'Opera/9.80 (Windows NT 6.1; U; en) Presto/2.10.229 Version/11.61',
          'Opera/9.80 (Macintosh; Intel Mac OS X; U; en) Presto/2.6.30 Version/10.61',
          'Opera/9.80 (Windows NT 5.1; U; en) Presto/2.5.22 Version/10.50',
          'Opera/9.80 (X11; Linux i686; U; nl) Presto/2.2.15 Version/10.00',
          // Safari
          'Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_6_6; en-us) AppleWebKit/533.19.4 (KHTML, like Gecko) Version/5.0.3 Safari/533.19.4',
          'Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_6_4; en-us) AppleWebKit/533.17.8 (KHTML, like Gecko) Version/5.0.1 Safari/533.17.8',
          'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US) AppleWebKit/531.21.8 (KHTML, like Gecko) Version/4.0.4 Safari/531.21.10',
          'Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_6_2; en-us) AppleWebKit/531.21.8 (KHTML, like Gecko) Version/4.0.4 Safari/531.21.10',
          'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US) AppleWebKit/525.19 (KHTML, like Gecko) Version/3.1.2 Safari/525.21'
      );

      $index = array_rand($pool);

      return $pool[$index];
  }
  // Known issues:
  // - the handling of special characters inside strings (",;[]{}") needs improvement
  // - comment stripping is not 100% reliable when the comment marker sits inside a string
  // * still to be tested against the scrape output of nomes.json -> feminino
  /**
   * Best-effort conversion of loosely written, JavaScript-style object notation
   * into a string that is closer to strict JSON.
   *
   * The text is converted to UTF-8, a leading BOM is removed, literal
   * backslash-n sequences become real line breaks, and every line is then run
   * through a fixed series of regular-expression rewrites (drop a line that is
   * only a // comment, trim, remove blanks after ":", escape double quotes found
   * inside single-quoted text, quote bare property names, turn '' into "").
   * The rewritten lines are concatenated WITHOUT line breaks, after which
   * trailing commas are removed and single-quoted values are re-quoted with
   * double quotes.
   *
   * @param string $jsonstr  Raw JSON-like text
   * @param bool   $function Accepted for compatibility; never read by this function
   * @return string The cleaned text, collapsed onto a single line
   */
  function json_clean( $jsonstr, $function=false ) {
      // Normalise to UTF-8; the source encoding is detected among the listed candidates.
      $jsonstr = mb_convert_encoding($jsonstr, 'UTF-8', 'ASCII,UTF-8,ISO-8859-1');

      // json_decode() does not accept a leading UTF-8 byte-order mark.
      $bom = pack("CCC", 0xEF, 0xBB, 0xBF);
      if (substr($jsonstr, 0, 3) == $bom) {
          $jsonstr = substr($jsonstr, 3);
      }

      // Turn the two-character sequence backslash + n into a real line break.
      $jsonstr = str_replace('\n', "\n", $jsonstr);

      // Rewrites applied, in this order, before the quote-escaping callback.
      $leadingPatterns = array(
          '/^\s*\/\/.*/',           // line consisting only of a // comment
          '/(^\s*|\s*$)/',          // surrounding whitespace
          '/:\s+([\'\"\[\{\w])/',   // blanks between ":" and the value
      );
      $leadingReplacements = array('', '', ':$1');

      // Rewrites applied, in this order, after the quote-escaping callback.
      $trailingPatterns = array(
          '/(^|,|\{)+\s*(\w+):/',   // bare property name -> "name":
          "/''/",                   // empty single-quoted value
      );
      $trailingReplacements = array('$1"$2":', '""');

      // todo: check whether the /m regex flag would give the same result as this loop
      $pieces = array();
      foreach ( explode("\n", $jsonstr) as $line ) {
          $line = preg_replace($leadingPatterns, $leadingReplacements, $line);

          // Escape double quotes that appear inside a single-quoted run.
          $line = preg_replace_callback('/(\'.*?)"(.*?\')/', function($item) {
              return preg_replace('/"/', '\"', $item[0]);
          }, $line);

          $line = preg_replace($trailingPatterns, $trailingReplacements, $line);

          $pieces[] = $line;
      }
      $cleanstr = implode('', $pieces);

      // Trailing commas/blanks right before a closing brace or bracket.
      $cleanstr = preg_replace('/,\s*([\}\]])/', '$1', $cleanstr);

      // Wrap single-quoted values in double quotes - do not use escapes inside the strings!
      $cleanstr = preg_replace("/([:,\[\{])\s*'([^']+)\'/", '$1"$2"$3', $cleanstr);

      return $cleanstr;
  }
  /**
   * Wrap JavaScript function literals found in JSON-like text in double quotes,
   * so that each function becomes a string value.
   *
   * The text is processed line by line. When an unquoted "function" keyword is
   * met, an opening quote is inserted in front of it and brace counting starts.
   * While inside a function, each line has a //-only comment removed, is trimmed
   * and has its double quotes backslash-escaped. As soon as the number of "{"
   * seen equals the number of "}" seen, a closing quote is inserted after the
   * last "}" of that line.
   *
   * Note: the line on which a function starts is emitted as matched (untrimmed)
   * and without a trailing line break unless the function also ends on it.
   *
   * @param string $json  JSON-like text, possibly containing function literals
   * @param bool   $clean When truthy the result is additionally passed through json_clean()
   * @return string The text with functions quoted
   */
  function json_clean_functions( $json, $clean=false ) {
      $output   = '';     // text produced so far
      $insideFn = false;  // true while the lines belong to a function body
      $opened   = 0;      // "{" seen since the function started
      $closed   = 0;      // "}" seen since the function started

      // Turn the two-character sequence backslash + n into a real line break.
      $json = str_replace('\n', "\n", $json);

      foreach ( explode("\n", $json) as $line ) {
          $emit = '';  // when not empty, this is written instead of "$line\n"

          // Start of an unquoted function: put the opening quote before the keyword.
          if ( !$insideFn && preg_match('/(.*[^"\'])(function.*)/', $line, $parts) > 0 ) {
              $emit     = $parts[1] . '"' . $parts[2];
              $line     = $emit;
              $insideFn = true;
          }

          if ( $insideFn ) {
              // Drop a //-only comment line, then trim surrounding whitespace.
              $line = preg_replace(array('/^\s*\/\/.*/', '/(^\s*|\s*$)/'), '', $line);

              $opened += substr_count($line, '{');
              $closed += substr_count($line, '}');

              // Balanced braces: close the quote right after the last "}".
              if ( $opened > 0 && $opened == $closed && preg_match('/(^.*\})(.*)/', $line, $tail) > 0 ) {
                  $emit     = $tail[1] . '"' . $tail[2] . "\n";
                  $insideFn = false;
                  $opened   = 0;
                  $closed   = 0;
              }

              // Plain body line: escape the double quotes it contains.
              if ( $emit === '' ) {
                  $line = addcslashes($line, '"');
              }
          }

          $output .= ( $emit === '' ) ? $line . "\n" : $emit;
      }

      return $clean ? json_clean($output) : $output;
  }
  /**
   * Static collection of conversion helpers: stdClass <-> array, text to slug,
   * accent transliteration and JavaScript string escaping.
   */
  final class Convert {
      /**
       * Convert a stdClass (and every stdClass nested in it) to an array.
       * http://www.php.net/manual/es/language.types.object.php#102735
       *
       * @param stdClass $Class Object to convert
       * @return array Array holding the former properties
       */
      static public function object_to_array(stdClass $Class){
          # Typecast to (array) converts stdClass -> array.
          $Class = (array)$Class;
          # Recursively convert any property that is exactly a stdClass.
          foreach($Class as $key => $value){
              if(is_object($value)&&get_class($value)==='stdClass'){
                  $Class[$key] = self::object_to_array($value);
              }
          }
          return $Class;
      }

      /**
       * Convert an array (and every array nested in it) to a stdClass.
       * http://www.php.net/manual/es/language.types.object.php#102735
       *
       * @param array $array Array to convert
       * @return stdClass Object whose properties are the former keys
       */
      static public function array_to_object(array $array){
          # Convert nested arrays first.
          foreach($array as $key => $value){
              if(is_array($value)){
                  $array[$key] = self::array_to_object($value);
              }
          }
          # Typecast to (object) converts array -> stdClass.
          return (object)$array;
      }

      /**
       * Build a lowercase, dash separated slug from free text.
       *
       * @param string $text Text to convert
       * @return string The slug, or 'n-a' when nothing usable is left
       */
      static public function text_to_slug( $text ) {
          // Transliterate accents; the original dropped this return value,
          // so accented letters were later stripped instead of converted.
          $text = self::sane_text( $text );
          // Replace every run of non letters/digits by a single dash.
          // preg_replace() yields null on invalid UTF-8, hence the cast.
          $text = (string) preg_replace('~[^\\pL\d]+~u', '-', $text);
          // lowercase
          $text = strtolower($text);
          // remove unwanted characters
          $text = (string) preg_replace('~[^-\w]+~', '', $text);
          // Strip the separator (not whitespace) from both ends, after the
          // removal above so dashes exposed by it are dropped as well.
          $text = trim($text, '-');
          // Strict test: empty() would also reject the valid slug "0".
          if ($text === '') {
              return 'n-a';
          }
          return $text;
      }

      /**
       * Replace accented Latin characters by unaccented ASCII equivalents,
       * using a fixed translation table. Other characters are left untouched.
       *
       * @param string $text Text to transliterate
       * @return string Transliterated text
       */
      static public function sane_text( $text ) {
          $table = array(
              'Š'=>'S', 'š'=>'s', 'Đ'=>'Dj', 'đ'=>'dj', 'Ž'=>'Z', 'ž'=>'z', 'Č'=>'C', 'č'=>'c', 'Ć'=>'C', 'ć'=>'c',
              'À'=>'A', 'Á'=>'A', 'Â'=>'A', 'Ã'=>'A', 'Ä'=>'A', 'Å'=>'A', 'Æ'=>'A', 'Ç'=>'C', 'È'=>'E', 'É'=>'E',
              'Ê'=>'E', 'Ë'=>'E', 'Ì'=>'I', 'Í'=>'I', 'Î'=>'I', 'Ï'=>'I', 'Ñ'=>'N', 'Ò'=>'O', 'Ó'=>'O', 'Ô'=>'O',
              'Õ'=>'O', 'Ö'=>'O', 'Ø'=>'O', 'Ù'=>'U', 'Ú'=>'U', 'Û'=>'U', 'Ü'=>'U', 'Ý'=>'Y', 'Þ'=>'B', 'ß'=>'Ss',
              'à'=>'a', 'á'=>'a', 'â'=>'a', 'ã'=>'a', 'ä'=>'a', 'å'=>'a', 'æ'=>'a', 'ç'=>'c', 'è'=>'e', 'é'=>'e',
              'ê'=>'e', 'ë'=>'e', 'ì'=>'i', 'í'=>'i', 'î'=>'i', 'ï'=>'i', 'ð'=>'o', 'ñ'=>'n', 'ò'=>'o', 'ó'=>'o',
              // 'ü' replaces a duplicated 'ý' entry; without it the lowercase u-umlaut was never mapped.
              'ô'=>'o', 'õ'=>'o', 'ö'=>'o', 'ø'=>'o', 'ù'=>'u', 'ú'=>'u', 'û'=>'u', 'ü'=>'u', 'ý'=>'y', 'þ'=>'b',
              'ÿ'=>'y', 'Ŕ'=>'R', 'ŕ'=>'r',
          );
          return strtr($text, $table);
      }

      /**
       * Backslash-escape characters of a string with addcslashes(), for use
       * inside JavaScript source.
       * todo: verify
       * http://www.php.net/manual/en/function.addcslashes.php#92495
       *
       * Declared static like the other helpers; it can still be called on an instance.
       *
       * @param string $str Text to escape
       * @return string Escaped text
       */
      static public function javascript_escaped($str) {
          return addcslashes($str,"\\\'\"&\n\r<>");
      }
  }
  /**
   * Title/slug sanitising helpers adapted from WordPress.
   *
   * NOTE(review): sanitize() shares the class name, so PHP versions before 8.0
   * treat it as a PHP4-style constructor. It is therefore left non-static;
   * the two helpers are static and reached through self::.
   */
  Class Sanitize {
      /**
       * Percent-encode the multi-byte UTF-8 sequences of a string, leaving
       * single-byte (ASCII) characters as they are. Taken from WordPress.
       *
       * @param string $utf8_string Text to encode
       * @param int    $length      Maximum length of the result; 0 means no limit
       * @return string Encoded text, hex digits in lowercase
       */
      public static function utf8_uri_encode( $utf8_string, $length = 0 ) {
          $unicode = '';
          $values = array();
          $num_octets = 1;
          $unicode_length = 0;
          $string_length = strlen( $utf8_string );

          for ( $i = 0; $i < $string_length; $i++ ) {
              $value = ord( $utf8_string[ $i ] );

              if ( $value < 128 ) {
                  if ( $length && ( $unicode_length >= $length ) ) {
                      break;
                  }
                  $unicode .= chr( $value );
                  $unicode_length++;
                  continue;
              }

              // Lead byte: work out how many octets the sequence has.
              // 4-byte sequences (lead byte >= 240) were previously cut to 3.
              if ( count( $values ) == 0 ) {
                  if ( $value < 224 ) {
                      $num_octets = 2;
                  } elseif ( $value < 240 ) {
                      $num_octets = 3;
                  } else {
                      $num_octets = 4;
                  }
              }

              $values[] = $value;

              // Stop before emitting a sequence that would exceed the limit.
              if ( $length && ( $unicode_length + ( $num_octets * 3 ) ) > $length ) {
                  break;
              }

              if ( count( $values ) == $num_octets ) {
                  foreach ( $values as $octet ) {
                      $unicode .= '%' . dechex( $octet );
                  }
                  $unicode_length += $num_octets * 3;
                  $values = array();
                  $num_octets = 1;
              }
          }

          return $unicode;
      }

      /**
       * Tell whether a byte string is laid out like UTF-8: every lead byte
       * is followed by the expected number of 10bbbbbb continuation bytes.
       * Taken from WordPress.
       *
       * @param string $str Bytes to inspect
       * @return bool True when the whole string matches the UTF-8 layout
       */
      public static function seems_utf8($str) {
          $length = strlen($str);
          for ($i=0; $i < $length; $i++) {
              $c = ord($str[$i]);
              if ($c < 0x80) $n = 0; # 0bbbbbbb
              elseif (($c & 0xE0) == 0xC0) $n=1; # 110bbbbb
              elseif (($c & 0xF0) == 0xE0) $n=2; # 1110bbbb
              elseif (($c & 0xF8) == 0xF0) $n=3; # 11110bbb
              elseif (($c & 0xFC) == 0xF8) $n=4; # 111110bb
              elseif (($c & 0xFE) == 0xFC) $n=5; # 1111110b
              else return false; # Does not match any model
              for ($j=0; $j<$n; $j++) { # n bytes matching 10bbbbbb follow ?
                  if ((++$i == $length) || ((ord($str[$i]) & 0xC0) != 0x80))
                      return false;
              }
          }
          return true;
      }

      /**
       * Turn a title into a lowercase, dash separated, URL-safe slug.
       * Adapted from WordPress' sanitize_title_with_dashes().
       *
       * @param string $title Title to sanitise
       * @return string The slug
       */
      public function sanitize($title) {
          $title = strip_tags($title);
          // Preserve escaped octets.
          $title = preg_replace('|%([a-fA-F0-9][a-fA-F0-9])|', '---$1---', $title);
          // Remove percent signs that are not part of an octet.
          $title = str_replace('%', '', $title);
          // Restore octets.
          $title = preg_replace('|---([a-fA-F0-9][a-fA-F0-9])---|', '%$1', $title);

          // The helpers are methods of this class; the original called them as
          // global functions, which fails unless WordPress happens to be loaded.
          if (self::seems_utf8($title)) {
              if (function_exists('mb_strtolower')) {
                  $title = mb_strtolower($title, 'UTF-8');
              }
              $title = self::utf8_uri_encode($title, 200);
          }

          $title = strtolower($title);
          $title = preg_replace('/&.+?;/', '', $title); // kill entities
          $title = str_replace('.', '-', $title);
          $title = preg_replace('/[^%a-z0-9 _-]/', '', $title);
          $title = preg_replace('/\s+/', '-', $title);
          $title = preg_replace('|-+|', '-', $title);
          $title = trim($title, '-');

          return $title;
      }
  }