util.php - Count how many times the $needles occur in the $…

/util.php

https://github.com/aramk/IMDb-Scraper · PHP · 179 lines · 142 code · 22 blank · 15 comment · 38 complexity · 2e0e27f33cd747198ad56954f1e4192b MD5 · raw file


<?php

/**
 * Fetch the body of $url with cURL.
 *
 * Follows up to 5 redirects, uses a 20 second connect timeout and forwards
 * the current request's User-Agent header when one is available.
 *
 * @param string $url URL to fetch.
 * @return string The response body.
 * @throws Exception If the cURL handle cannot be created, the transfer fails,
 *                   or the final HTTP status code is outside 200-399.
 */
function curl_get_html($url) {
	$ch = curl_init($url);
	if ($ch === FALSE) {
		throw new Exception("cURL could not be initialised for ".$url);
	}
	
	$options = array(CURLOPT_RETURNTRANSFER => TRUE,
					 CURLOPT_FOLLOWLOCATION => TRUE,
					 CURLOPT_MAXREDIRS => 5,
					 CURLOPT_CONNECTTIMEOUT => 20);
	// HTTP_USER_AGENT is not set when running from the CLI or when the client
	// omits the header, so only forward it when it exists.
	if (isset($_SERVER['HTTP_USER_AGENT'])) {
		$options[CURLOPT_USERAGENT] = $_SERVER['HTTP_USER_AGENT'];
	}
	curl_setopt_array($ch, $options);
	
	$html = curl_exec($ch);
	$http_code = curl_getinfo($ch, CURLINFO_HTTP_CODE);
	$error = ($html === FALSE) ? curl_error($ch) : '';
	
	// Release the handle before any throw so failures do not leak it.
	curl_close($ch);
	
	if ($html === FALSE) {
		throw new Exception("cURL request failed: ".$error);
	}
	if ($http_code < 200 || $http_code >= 400) {
		throw new Exception("cURL received HTTP code ".$http_code);
	}
	
	return $html;
}

/**
 * Count how many times the $needles occur in the $haystack.
 *
 * The result is the sum of the individual counts of every needle.
 *
 * @param string $haystack    String to search in.
 * @param array  $needles     Substrings to count.
 * @param bool   $insensitive When truthy (the default) counting goes through
 *                            substr_count_i(); otherwise substr_count() is used.
 * @return int Total number of occurrences over all needles.
 */
function substr_count_array( $haystack, $needles, $insensitive = TRUE ) {
	$total = 0;
	foreach ($needles as $needle) {
		$total += $insensitive
			? substr_count_i($haystack, $needle)
			: substr_count($haystack, $needle);
	}
	return $total;
}

/**
 * Case insensitive substr_count.
 *
 * Both arguments are lower-cased with strtolower() before counting.
 *
 * @param string $haystack  String to search in.
 * @param string $substring Substring to count.
 * @return int Number of occurrences of $substring in $haystack.
 */
function substr_count_i( $haystack, $substring ) {
	return substr_count( strtolower($haystack), strtolower($substring) );
}

/**
 * Returns an array of the occurrences found inside the $haystacks array using the $needles array.
 *
 * Each haystack is counted with substr_count_array() using its default
 * (case insensitive) mode. The keys of $haystacks are preserved in the result,
 * so associative or sparsely indexed input is handled as well as plain lists.
 *
 * @param array $haystacks Strings to search in.
 * @param array $needles   Substrings to count in every haystack.
 * @return array Occurrence totals, keyed like $haystacks.
 */
function substr_count_arrays( $haystacks, $needles ) {
	$counts = array();
	// Iterate the real keys instead of assuming 0..n-1 indexes.
	foreach ($haystacks as $key => $haystack) {
		$counts[$key] = substr_count_array($haystack, $needles);
	}
	return $counts;
}

//$needles = array('fuck', 'cunt', 'bitch');
//$haystacks = array('fuck bitch', 'fuck fuck fuck', 'fuck cunt');
//var_dump(substr_count_arrays($haystacks, $needles));

/**
 * Trim a string or an array of strings recursively.
 *
 * @param mixed $array A string, or an array whose string values (at any depth)
 *                     should be trimmed.
 * @return mixed The trimmed string; the array with every string value trimmed
 *               and other non-array values left as they are; or '' when the
 *               argument is neither a string nor an array.
 */
function trim_r($array) {
    if (is_string($array)) {
        return trim($array);
    }
    if (!is_array($array)) {
        return '';
    }
    foreach ($array as $key => $value) {
        if (is_array($value)) {
            $array[$key] = trim_r($value);
        } elseif (is_string($value)) {
            $array[$key] = trim($value);
        }
    }
    return $array;
}

/**
 * Match regex and return given index.
 *
 * @param string     $regex PCRE pattern, including delimiters.
 * @param string     $str   Subject string.
 * @param int        $index Index into the preg_match() result (0 is the whole
 *                          match, 1 the first group, ...). Converted with intval().
 * @param bool|string $clean 'num' passes the match through clean_num(); any other
 *                          truthy value passes it through clean_str(); a falsy
 *                          value returns the raw match.
 * @return string The (optionally cleaned) match, or '' when the pattern does not
 *                match or $index is outside the range of captured groups.
 */
function regex_get($regex, $str, $index = 1, $clean = TRUE) {
    $matches = array();
    // preg_match() yields 0 for "no match" and FALSE on error; neither leaves anything to return.
    if (!preg_match($regex, $str, $matches)) {
        return '';
    }
    $index = intval($index);
    // Guard both ends of the range: a negative index would otherwise read an undefined offset.
    if ($index < 0 || $index >= count($matches)) {
        return '';
    }
    $match = $matches[$index];
    if ($clean === 'num') {
        $match = clean_num($match);
    } else if ($clean) {
        $match = clean_str($match);
    }
    return $match;
}

/**
 * Extract the title id out of a URL.
 *
 * The id is the single path segment that follows "title/" and is terminated
 * by a slash, e.g. ".../title/tt0111161/fullcredits/" gives "tt0111161".
 * The value is obtained through regex_get() with its default cleaning.
 *
 * @param string $url URL to inspect.
 * @return string|bool The id; FALSE when no id is found; '' when $url is not a string.
 */
function imdb_url_id($url) {
    if (!is_string($url)) {
        return '';
    }
    // [^/]+ keeps the capture to one path segment; a greedy .* would swallow
    // every following segment up to the last slash in the URL.
    $id = regex_get('#title\\/([^\\/]+)\\/#', $url, 1);
    return empty($id) ? FALSE : $id;
}

/**
 * Append a trailing slash to $url when its last character is not already a slash.
 *
 * @param string $url URL to normalise.
 * @return string The result of the preg_replace() call.
 */
function url_add_slash($url) {
	$pattern = '#([^\\/])$#';
	$replacement = '\1/';
	return preg_replace($pattern, $replacement, $url);
}

/**
 * Removes HTML encoding.
 *
 * For a string: strips tags, decodes HTML entities (ENT_NOQUOTES, UTF-8) and
 * trims surrounding whitespace. Values that are not strings are returned untouched.
 *
 * @param mixed $str        Value to clean.
 * @param bool  $quotes     When truthy, also removes every " and ' character.
 * @param bool  $only_chars When truthy, also removes every character that is
 *                          neither a word character nor whitespace.
 * @return mixed The cleaned string, or $str itself when it is not a string.
 */
function clean_str($str, $quotes = FALSE, $only_chars = FALSE) {
	if (!is_string($str)) {
		return $str;
	}
	$without_tags = strip_tags($str);
	$decoded = html_entity_decode($without_tags, ENT_NOQUOTES, 'UTF-8');
	$result = trim($decoded);
	if ($quotes) {
		$result = preg_replace('#"|\'#', '', $result);
	}
	if ($only_chars) {
		$result = preg_replace('#[^\\w\\s]#', '', $result);
	}
	return $result;
}

/**
 * Reduce a value to its numeric characters.
 *
 * The value is first passed through clean_str(); afterwards every character
 * other than a digit, '.' or ',' is removed.
 *
 * @param mixed $str Value to clean.
 * @return string The result of the preg_replace() call.
 */
function clean_num($str) {
	$cleaned = clean_str($str);
	return preg_replace('#[^\d\.,]#', '', $cleaned);
}

/**
 * Converts a string to a number. Decides whether to use int or float.
 *
 * @param mixed $str Value to convert.
 * @return int|float intval($str) when it compares equal to floatval($str),
 *                   otherwise floatval($str).
 */
function numval($str) {
	$as_int = intval($str);
	$as_float = floatval($str);
	return ($as_int == $as_float) ? $as_int : $as_float;
}

/**
 * Recursively cleans an array of strings.
 *
 * Every string value, at any depth, is passed through clean_str(); nested
 * arrays are cleaned recursively; other values are left as they are. Keys are
 * preserved, so associative and sparsely indexed arrays are handled too.
 *
 * @param mixed $array Array to clean.
 * @return mixed The cleaned array, or $array itself when it is not an array.
 */
function clean_array($array) {
	if (!is_array($array)) {
		return $array;
	}
	// Walk the real keys rather than assuming 0..n-1 indexes.
	foreach ($array as $key => $value) {
		if ( is_array($value) ) {
			$array[$key] = clean_array($value);
		} else if ( is_string($value) ) {
			$array[$key] = clean_str($value);
		}
	}
	return $array;
}

/**
 * Removes all non-word characters, makes lowercase and cleans.
 *
 * The string is passed through clean_str(), stripped of every character that
 * does not match \w, and lower-cased with strtolower().
 *
 * @param mixed $str Value to normalise.
 * @return string The normalised string, or '' when $str is not a string.
 */
function normalise($str) {
	if (!is_string($str)) {
		return '';
	}
	$cleaned = clean_str($str);
	$word_chars_only = preg_replace('#[^\\w]#', '', $cleaned);
	return strtolower($word_chars_only);
}

/**
 * Returns TRUE if $needle is found in $haystack; both are normalised first.
 *
 * @param mixed $haystack String to search in.
 * @param mixed $needle   String to look for.
 * @return bool TRUE when the normalised needle occurs in the normalised
 *              haystack; FALSE otherwise, including when either argument is
 *              not a string.
 */
function normpos($haystack, $needle) {
	if (!is_string($haystack) || !is_string($needle)) {
		return FALSE;
	}
	$position = stripos(normalise($haystack), normalise($needle));
	return $position !== FALSE;
}

/**
 * Tests if two normalised strings are equal.
 *
 * @param mixed $str1 First string.
 * @param mixed $str2 Second string.
 * @return bool TRUE when normalise() gives identical results for both
 *              strings; FALSE otherwise, including when either argument is
 *              not a string.
 */
function normeq($str1, $str2) {
	if (!is_string($str1) || !is_string($str2)) {
		return FALSE;
	}
	$left = normalise($str1);
	$right = normalise($str2);
	return $left === $right;
}

?>

Alerts (5)

'exec(' System command execution detected; use safer alternatives (e.g., escapeshellarg) or avoid if possible
13
Complexity hotspot; line 88 (total complexity: 3)
88
Complexity hotspot; lines 141 to 143 (total complexity: 3)
141 142 143