/releases/ripcurl-0.6.0/ripcurl.class.php
PHP | 1398 lines | 944 code | 210 blank | 244 comment | 134 complexity | 178370c55e16f7d90b4aaf25a0d9b08d MD5 | raw file
- <?php
- //RipCURL class 0.6 - SD Linux Services
- //This class was written by Brandon Ching of SD Linux Services
- //It is licensed under the GPL 2 and requires PHP5 and CURL 7
- class ripCurl{
- //Class definitions
- //Define the default directory for ripWrite. Must include trailing "/". Please turn safe_mode off or make the proper UID/GID changes
- //to the php.ini file or in the ripWrite function below. The directory should be located under the web root directory for links to work
- //or use a directory alias in httpd.conf.
- const WRITEDIRECTORY = "/tmp/";
-
- //Set class variables
- //curl handler
- private $ch;
- //Returned value of curl_exec. This value should never be directly written to. It is meant to be used by class methods only.
- //It always holds the current HTML of the last page to be fetched/processed.
- private $rawHtml = '';
- //Value of the last POST fields
- private $postData = '';
- //Error variable
- private $error = '';
- //Total HTTP errors for class instance
- private $httpErrors = 0;
- //Count for ripWrite
- private $writeCount = 0;
- //Count for ripGetlinks
- private $linksCount = 0;
- //Total redirects for class instance
- private $totalRedirects = 0;
- //Total time for class existence
- private $totalTime = 0;
- //Total size of information transferred
- private $totalSize = 0;
- //Average speed of all transfers
- private $averageSpeed = 0;
- //Average time per transfer
- private $averageTime = 0;
- //Total number of connections for class existence
- private $totalConnections = 0;
- //Last ripRun status code
- private $lastStatusCode = null;
- //Initialize cookiejar directory. Filename gets set in the constructor
- private $cookiejar = "/tmp/";
- //Form action of last getFormElements()
- private $formAction = "";
- //Form method of last getFormElements()
- private $formMethod = "post";
- //Maintains internal track of all links in getLinks()
- private $currentLinks = "";
- //Last URL in ripRun();
- private $lastUrl = "";
-
- //Accessor methods
//Returns the HTML currently held in $rawHtml (the last page fetched/processed).
public function getRawHtml()
{
    return $this->rawHtml;
}
-
//Returns the value of $lastUrl.
public function getLastUrl()
{
    return $this->lastUrl;
}
-
//Returns the value of $postData (the last POST fields that were set).
public function getPostData()
{
    return $this->postData;
}
-
//Returns the HTTP error counter kept for this instance.
public function getHttpErrors()
{
    return $this->httpErrors;
}
-
//Returns the write counter kept for this instance.
public function getWriteCount()
{
    return $this->writeCount;
}
-
//Returns the running total of links counted by getLinks().
public function getLinksCount()
{
    return $this->linksCount;
}
-
//Resets the running link counter to zero.
//
//Returns TRUE. (The previous version tested the result of the assignment
//"$this->linksCount = 0", which is always falsy, so it reset the counter but
//then reported failure every time.)
public function zeroLinksCount()
{
    $this->linksCount = 0;
    return true;
}
-
//Returns the redirect counter kept for this instance.
public function getTotalRedirects()
{
    return $this->totalRedirects;
}
-
//Returns the accumulated transfer time for this instance.
public function getTotalTime()
{
    return $this->totalTime;
}
-
//Returns the accumulated transfer size for this instance.
public function getTotalSize()
{
    return $this->totalSize;
}
-
//Returns the value of $averageSpeed.
public function getAverageSpeed()
{
    return $this->averageSpeed;
}
-
//Returns the value of $averageTime.
public function getAverageTime()
{
    return $this->averageTime;
}
-
//Returns the connection counter kept for this instance.
public function getTotalConnections()
{
    return $this->totalConnections;
}
-
//Returns the path of the cookie jar file used by this instance.
public function getCookieJar()
{
    return $this->cookiejar;
}
//Returns the most recent error message recorded by a class method ('' if none).
public function getError()
{
    return $this->error;
}
-
//Returns the status code stored in $lastStatusCode (null until one is recorded).
//
//The previous version read $this->lastStatus, a property the class never declares,
//so it always returned null (with an "undefined property" notice).
//NOTE(review): assumes ripRun() is what stores $lastStatusCode - confirm.
public function getLastStatus()
{
    return $this->lastStatusCode;
}
-
//Points the instance (CURLOPT_COOKIEJAR and CURLOPT_COOKIEFILE) at a new cookie file.
//
//$o is the path of the cookie file. It is accepted when the file is writable, or
//when it does not exist yet but its directory is writable (cURL creates the jar
//itself when it saves cookies).
//
//Returns TRUE when the jar was set, FALSE (with $error set) otherwise.
public function setCookieJar($o)
{
    $usable = is_writable($o) || (!file_exists($o) && is_writable(dirname($o)));
    if (!$usable) {
        $this->error = "setCookieJar: Cookiejar could not be set";
        return false;
    }

    //Plain assignment: success no longer depends on the truthiness of the path string.
    $this->cookiejar = $o;
    $this->cookieJar($o);
    $this->cookieFile($o);
    return true;
}
-
//Returns the form action captured by the last form lookup.
public function getFormAction()
{
    return $this->formAction;
}
-
//Returns the form method currently stored in $formMethod.
public function getFormMethod()
{
    return $this->formMethod;
}
-
//Returns the links stored by the most recent link-gathering call.
public function getCurrentLinks()
{
    return $this->currentLinks;
}
//Empties the stored link list.
//
//Returns TRUE. (The previous version tested the result of assigning array(),
//which is always falsy, so it cleared the list but reported failure every time.)
public function clearCurrentLinks()
{
    $this->currentLinks = array();
    return true;
}
-
//Stores $o as the form method.
public function setFormMethod($o)
{
    $this->formMethod = $o;
}
- //Constructor method. Designed to set standard defaults for pulling sites into a variable
//Constructor. Creates the cURL handle and applies the defaults used for pulling
//pages into a variable: return the transfer, follow redirects with automatic
//referer, 30 second connect timeout, and a per-instance cookie jar file.
//
//$multi is accepted for backward compatibility and is not used.
//
//NOTE(review): SSL peer and host verification are switched OFF here, as in the
//original. That makes HTTPS transfers open to interception; callers that care
//should call sslVerifyPeer()/sslVerifyHost() after construction.
public function __construct($multi = null)
{
    //The jar name must be unique per instance. The old expression added rand(5, 50)
    //to a non-numeric date string, which collapsed to md5() of one of 46 integers on
    //PHP 5/7 and raises a TypeError on PHP 8.
    $this->cookiejar = $this->cookiejar . md5(uniqid(mt_rand(), true)) . ".cookiejar.txt";

    $this->ch = curl_init();
    curl_setopt($this->ch, CURLOPT_RETURNTRANSFER, 1);
    curl_setopt($this->ch, CURLOPT_AUTOREFERER, 1);
    curl_setopt($this->ch, CURLOPT_FOLLOWLOCATION, 1);
    curl_setopt($this->ch, CURLOPT_SSL_VERIFYHOST, 0);
    curl_setopt($this->ch, CURLOPT_SSL_VERIFYPEER, 0);
    curl_setopt($this->ch, CURLOPT_CONNECTTIMEOUT, 30);
    curl_setopt($this->ch, CURLOPT_COOKIEJAR, $this->cookiejar);
    curl_setopt($this->ch, CURLOPT_COOKIEFILE, $this->cookiejar);
}

//PHP 4-style constructor name, kept so existing explicit calls keep working.
//Class-named methods are no longer treated as constructors on PHP 8, which is
//why the real work now lives in __construct().
public function ripCurl($multi = null)
{
    $this->__construct($multi);
}
-
- //Class methods for setting curlopt variables
//Sets CURLOPT_AUTOREFERER on the handle ($o defaults to 1).
public function autoReferer($o = 1)
{
    curl_setopt($this->ch, CURLOPT_AUTOREFERER, $o);
}
-
//Sets CURLOPT_COOKIESESSION on the handle ($o defaults to 1).
public function cookieSession($o = 1)
{
    curl_setopt($this->ch, CURLOPT_COOKIESESSION, $o);
}
-
//Sets CURLOPT_FAILONERROR on the handle ($o defaults to 1).
public function failOnError($o = 1)
{
    curl_setopt($this->ch, CURLOPT_FAILONERROR, $o);
}
-
//Sets CURLOPT_FOLLOWLOCATION on the handle ($o defaults to 1).
public function followLocation($o = 1)
{
    curl_setopt($this->ch, CURLOPT_FOLLOWLOCATION, $o);
}
-
//Sets CURLOPT_FORBID_REUSE on the handle ($o defaults to 1).
public function forbidReuse($o = 1)
{
    curl_setopt($this->ch, CURLOPT_FORBID_REUSE, $o);
}
-
//Sets CURLOPT_FRESH_CONNECT on the handle ($o defaults to 1).
public function freshConnect($o = 1)
{
    curl_setopt($this->ch, CURLOPT_FRESH_CONNECT, $o);
}
-
//Sets CURLOPT_HEADER on the handle ($o defaults to 1).
public function showHeader($o = 1)
{
    curl_setopt($this->ch, CURLOPT_HEADER, $o);
}
-
//Sets CURLOPT_HTTPGET on the handle ($o defaults to 1).
public function httpGet($o = 1)
{
    curl_setopt($this->ch, CURLOPT_HTTPGET, $o);
}
-
//Sets CURLOPT_MUTE on the handle ($o defaults to 1) when the running PHP/cURL
//build still defines that option; otherwise this is a no-op.
//
//The option was dropped from cURL long ago, and referencing the bare constant on
//a build that lacks it is an error, so it is looked up by name at run time.
public function mute($o = 1)
{
    if (defined('CURLOPT_MUTE')) {
        curl_setopt($this->ch, constant('CURLOPT_MUTE'), $o);
    }
}
-
//Sets CURLOPT_NOSIGNAL on the handle ($o defaults to 1).
public function noSignal($o = 1)
{
    curl_setopt($this->ch, CURLOPT_NOSIGNAL, $o);
}
-
//Sets CURLOPT_POST on the handle ($o defaults to 1).
public function post($o = 1)
{
    curl_setopt($this->ch, CURLOPT_POST, $o);
}
-
//Sets CURLOPT_PUT on the handle ($o defaults to 1).
public function put($o = 1)
{
    curl_setopt($this->ch, CURLOPT_PUT, $o);
}
-
//Sets CURLOPT_RETURNTRANSFER on the handle ($o defaults to 1).
public function returnTransfer($o = 1)
{
    curl_setopt($this->ch, CURLOPT_RETURNTRANSFER, $o);
}
-
//Sets CURLOPT_SSL_VERIFYPEER on the handle ($o defaults to 1).
public function sslVerifyPeer($o = 1)
{
    curl_setopt($this->ch, CURLOPT_SSL_VERIFYPEER, $o);
}
-
//Sets CURLOPT_SSL_VERIFYHOST on the handle.
//
//$o: 0/false disables host name verification, anything equal to 1 (including the
//default and TRUE) enables it. cURL no longer accepts the value 1 for this option
//(2 is the "verify" setting), so a request to enable is translated to 2.
public function sslVerifyHost($o = 1)
{
    if ($o == 1) {
        $o = 2;
    }
    curl_setopt($this->ch, CURLOPT_SSL_VERIFYHOST, $o);
}
-
//Sets CURLOPT_VERBOSE on the handle ($o defaults to 1).
public function verbose($o = 1)
{
    curl_setopt($this->ch, CURLOPT_VERBOSE, $o);
}
-
//Sets CURLOPT_BUFFERSIZE on the handle to $o.
public function bufferSize($o)
{
    curl_setopt($this->ch, CURLOPT_BUFFERSIZE, $o);
}
-
//Sets CURLOPT_CONNECTTIMEOUT on the handle to $o.
public function connectTimeout($o)
{
    curl_setopt($this->ch, CURLOPT_CONNECTTIMEOUT, $o);
}
-
//Sets CURLOPT_DNS_CACHE_TIMEOUT on the handle to $o.
public function dnsCacheTimeout($o)
{
    curl_setopt($this->ch, CURLOPT_DNS_CACHE_TIMEOUT, $o);
}
-
//Sets CURLOPT_HTTP_VERSION on the handle to $o.
public function httpVersion($o)
{
    curl_setopt($this->ch, CURLOPT_HTTP_VERSION, $o);
}
-
//Sets CURLOPT_HTTPAUTH on the handle to $o.
public function httpAuth($o)
{
    curl_setopt($this->ch, CURLOPT_HTTPAUTH, $o);
}
-
//Sets CURLOPT_MAXCONNECTS on the handle to $o.
public function maxConnects($o)
{
    curl_setopt($this->ch, CURLOPT_MAXCONNECTS, $o);
}
-
//Sets CURLOPT_PORT on the handle to $o.
public function port($o)
{
    curl_setopt($this->ch, CURLOPT_PORT, $o);
}
-
//Sets CURLOPT_MAXREDIRS on the handle to $o.
public function maxRedirs($o)
{
    curl_setopt($this->ch, CURLOPT_MAXREDIRS, $o);
}
-
//Sets CURLOPT_SSLVERSION on the handle to $o.
public function sslVersion($o)
{
    curl_setopt($this->ch, CURLOPT_SSLVERSION, $o);
}
-
//Sets CURLOPT_TIMEOUT on the handle to $o.
public function timeOut($o)
{
    curl_setopt($this->ch, CURLOPT_TIMEOUT, $o);
}
-
//Sets CURLOPT_COOKIE on the handle to $o.
public function cookie($o)
{
    curl_setopt($this->ch, CURLOPT_COOKIE, $o);
}
-
//Sets CURLOPT_COOKIEJAR on the handle to $o. Does not touch $cookiejar.
public function cookieJar($o)
{
    curl_setopt($this->ch, CURLOPT_COOKIEJAR, $o);
}
-
//Sets CURLOPT_COOKIEFILE on the handle to $o.
public function cookieFile($o)
{
    curl_setopt($this->ch, CURLOPT_COOKIEFILE, $o);
}
-
//Sets CURLOPT_ENCODING on the handle to $o.
public function encoding($o)
{
    curl_setopt($this->ch, CURLOPT_ENCODING, $o);
}
-
//Sets CURLOPT_POSTFIELDS on the handle to $o.
public function postFields($o)
{
    curl_setopt($this->ch, CURLOPT_POSTFIELDS, $o);
}
-
//Sets CURLOPT_REFERER on the handle to $o.
public function referer($o)
{
    curl_setopt($this->ch, CURLOPT_REFERER, $o);
}
-
//Sets CURLOPT_URL on the handle to $o.
public function url($o)
{
    curl_setopt($this->ch, CURLOPT_URL, $o);
}
-
//Sets CURLOPT_USERAGENT on the handle.
//
//$o is either a full user-agent string or one of the short aliases
//'ie6', 'ie7', 'ff', 'google', 'msn', 'yahoo', which expand to the canned
//strings below. Anything else is passed to cURL unchanged.
//
//Aliases are matched as exact strings. The previous switch compared loosely, so
//non-string input such as 0 or TRUE could silently select the 'ie6' string.
public function userAgent($o)
{
    $aliases = array(
        'ie6'    => 'Mozilla/5.0 (compatible; MSIE 6.0; Windows NT 5.1)',
        'ie7'    => 'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.0)',
        'ff'     => 'Mozilla/5.0 (Windows; U; Windows NT 5.1; es-ES; rv:1.8.0.6) Gecko/20060728 Firefox/1.5.0.6',
        'google' => 'Googlebot/2.1 (+http://www.google.com/bot.html)',
        'msn'    => 'msnbot/1.0 (+http://search.msn.com/msnbot.htm)',
        'yahoo'  => 'Mozilla/5.0 (compatible; Yahoo! Slurp;http://help.yahoo.com/help/us/ysearch/slurp)',
    );

    if (is_string($o) && isset($aliases[$o])) {
        $o = $aliases[$o];
    }
    curl_setopt($this->ch, CURLOPT_USERAGENT, $o);
}
-
- //Username and password in uname:pass format
//Sets CURLOPT_USERPWD on the handle; $o is expected in "uname:pass" form.
public function userPwd($o)
{
    curl_setopt($this->ch, CURLOPT_USERPWD, $o);
}
-
//Sets CURLOPT_HTTPHEADER on the handle to $o.
public function httpHeader($o)
{
    curl_setopt($this->ch, CURLOPT_HTTPHEADER, $o);
}
-
-
- /////////////////////////////////////////////////////////////////////////////////////
- // Begin major class methods
- //
- /////////////////////////////////////////////////////////////////////////////////////
- //Used to display a summary of the entire ripCurl session.
- //
- //If a parameter is specified, it will return only that key value.
- //Otherwise, return an array
- //
//Returns curl_getinfo() data for the handle.
//
//With no argument the whole info array is returned. With $o set, only the value
//stored under that key is returned; an unknown key yields NULL (the previous
//version produced the same NULL but raised an "undefined index" notice).
public function getInfo($o = null)
{
    $info = curl_getinfo($this->ch);
    if (is_null($o)) {
        return $info;
    }
    return isset($info[$o]) ? $info[$o] : null;
}
-
-
- /////////////////////////////////////////////////////////////////////////////////////
- //Get contents of cookiefile
- //
- //Returns an associative array of cookie parts
- //
//Reads a cookie file and splits every tab-separated line into its fields.
//
//$cookiefile defaults to this instance's cookie jar.
//
//Returns a list of arrays keyed 'host', 'secure', 'path', 'httpOnly', 'expire',
//'name', 'value', filled from the 1st..7th tab-separated field of each line, or
//FALSE (with $error set) when the file cannot be read. Fields missing from a
//short line come back as NULL.
//NOTE(review): the key names are kept for compatibility; in the Netscape cookie
//file layout the 2nd field is the subdomain flag and the 4th the secure flag -
//confirm before relying on the 'secure'/'httpOnly' labels.
public function getCookieJarContents($cookiefile = null)
{
    if (is_null($cookiefile)) {
        $cookiefile = $this->cookiejar;
    }
    if (!is_readable($cookiefile)) {
        $this->error = "ripGetCookieContents: Cookie file is not readable";
        return false;
    }

    $contents = file_get_contents($cookiefile);
    if ($contents === false) {
        $this->error = "ripGetCookieContents: Cookie file is not readable";
        return false;
    }

    //Only lines containing at least one tab are cookie records.
    preg_match_all('|.*\t.*|', $contents, $dough);

    $cookies = array();
    foreach ($dough[0] as $i => $line) {
        //Pad so short lines do not trigger undefined-offset notices.
        $c = array_pad(explode("\t", $line), 7, null);
        $cookies[$i] = array(
            'host'     => $c[0],
            'secure'   => $c[1],
            'path'     => $c[2],
            'httpOnly' => $c[3],
            'expire'   => $c[4],
            'name'     => $c[5],
            'value'    => $c[6],
        );
    }
    return $cookies;
}
-
-
- /////////////////////////////////////////////////////////////////////////////////////
- //Sets a cookie according to the Netscape cookie file format.
- //
- //Returns TRUE on successful cookie write
- //
//Appends one cookie record to this instance's cookie jar file.
//
//$cookie must be an array of exactly 7 elements, in file order: host, secure,
//path, httpOnly, expire, name, value. The elements are joined with tabs in the
//order they appear in the array.
//
//Returns TRUE when the record was written, FALSE (with $error set) otherwise.
public function writeCookie($cookie)
{
    if (!is_array($cookie) || count($cookie) != 7) {
        $this->error = "setCookie: Argument must be an array with host, secure, path, httpOnly, expire, name, and value";
        return false;
    }

    $cookiefile = $this->cookiejar;
    if (!is_writable($cookiefile)) {
        $this->error = "Cookie file is not writeable";
        return false;
    }

    $fileh = @fopen($cookiefile, 'a');
    if ($fileh === false) {
        $this->error = "setCookie: Could not write cookie to file";
        return false;
    }

    //One record per line: without the terminator consecutive cookies ran together
    //on a single line and could not be read back.
    $cookieString = rtrim(implode("\t", $cookie), "\r\n") . "\n";
    $written = fwrite($fileh, $cookieString);
    fclose($fileh); //closed on both paths; the failure path used to leak the handle

    if ($written === false) {
        $this->error = "setCookie: Could not write cookie to file";
        return false;
    }
    return true;
}
-
-
- /////////////////////////////////////////////////////////////////////////////////////
- //Pulls all HTML between the $start and $end and returns it.
- //
- //Returns array of captured content
- //
//Captures everything found between $start and $end.
//
//$start and $end are inserted into a regular expression as-is (delimiter "|",
//modifiers "is"). The capture is non-greedy unless $greedy is non-null.
//$html defaults to the current raw HTML.
//
//Returns the array of captured strings.
public function ripInBetween($start, $end, $greedy = null, $html = null)
{
    $subject = is_null($html) ? $this->getRawHtml() : $html;
    $capture = is_null($greedy) ? "(.*?)" : "(.*)";

    $needle = "|$start" . $capture . "$end|is";
    preg_match_all($needle, $subject, $found, PREG_PATTERN_ORDER);

    return $found[1];
}
-
-
- /////////////////////////////////////////////////////////////////////////////////////
- //Search and replace function.
- //
- //Returns value of $html after replacement. Also sets replaced value to $rawHtml
- //
//Regex search and replace.
//
//$search is inserted into a pattern as-is (delimiter "|", modifiers "is");
//$replace is the replacement; $html defaults to the current raw HTML.
//
//Returns the resulting text, which is also stored in $rawHtml (even when $html
//was passed in explicitly).
public function sandr($search, $replace, $html = null)
{
    $subject = is_null($html) ? $this->getRawHtml() : $html;

    $this->rawHtml = preg_replace("|$search|is", $replace, $subject);
    return $this->rawHtml;
}
-
-
- /////////////////////////////////////////////////////////////////////////////////////
- //Removes all javascript from the passed html
- //
- //Returns clean HTML. Also sets value to $rawHtml
- //
//Strips every "<script ... /script>" span from the HTML.
//
//$html defaults to the current raw HTML.
//
//Returns the stripped text, which is also stored in $rawHtml.
public function ripJS($html = null)
{
    $subject = is_null($html) ? $this->getRawHtml() : $html;

    $this->rawHtml = preg_replace('|<script.*?/script>|is', '', $subject);
    return $this->rawHtml;
}
-
-
- /////////////////////////////////////////////////////////////////////////////////////
- //Removes all style sheets and <style> contents.
- //
- //Returns clean HTML. Also sets value to $rawHtml
- //
//Strips every "<style ... /style>" span from the HTML.
//
//$html defaults to the current raw HTML.
//
//Returns the stripped text, which is also stored in $rawHtml.
public function ripStyles($html = null)
{
    $subject = is_null($html) ? $this->getRawHtml() : $html;

    $this->rawHtml = preg_replace('|<style.*?/style>|is', '', $subject);
    return $this->rawHtml;
}
-
-
- /////////////////////////////////////////////////////////////////////////////////////
- //Method to get all links in a page. If the $id is specified, returned links must
- //contain $id. Link counts are automatically tracked in the $this->linksCount class
- //variable
- //
- //Returns array of all links found with $id specified
- //
//Collects the href of every <a href=...> anchor whose href matches $id.
//
//$id is inserted into a pattern as-is (delimiter "|"); when omitted every href
//matches. $html defaults to the current raw HTML. Only anchors whose first
//attribute is href are recognised by the pattern.
//
//Returns the de-duplicated hrefs (array keys are not renumbered). The result is
//also stored in $currentLinks and its size is added to $linksCount.
public function getLinks($id = null, $html = null)
{
    if (is_null($html)) {
        $html = $this->getRawHtml();
    }
    if (is_null($id)) {
        $id = ".*";
    }

    $links = array();
    $pattern = '/<A\s*HREF=[\"\']?([^\"\'>]*)[\"\']?[^>]*>(.*?)<\/A>/is';
    $idpattern = '|' . $id . '|';

    //$id is never null here, so the old "return raw matches" branch could not run
    //and has been removed.
    if (preg_match_all($pattern, $html, $matches, PREG_SET_ORDER)) {
        foreach ($matches as $match) {
            if (preg_match($idpattern, $match[1])) {
                $links[] = $match[1];
            }
        }
    }

    $links = array_unique($links);
    $this->linksCount += count($links);
    $this->currentLinks = $links;

    return $links;
}
-
-
- /////////////////////////////////////////////////////////////////////////////////////
- //Like getLinks method but for links with javascript in the href portion
- //
- //Returns array of all links found with $id
- //
//DOM-based variant of getLinks(): reads the href attribute of every <a> element
//and keeps those matching $id.
//
//$id is inserted into a pattern as-is (delimiter "|", modifier "s"); when omitted
//every href matches. $html defaults to the current raw HTML.
//
//Returns the de-duplicated hrefs; the result is also stored in $currentLinks.
public function getWeirdLinks($id = null, $html = null)
{
    $source = is_null($html) ? $this->getRawHtml() : $html;
    $filter = is_null($id) ? ".*" : $id;

    $document = new DOMDocument();
    @$document->loadHTML($source);

    $found = array();
    foreach ($document->getElementsByTagName('a') as $anchor) {
        $target = $anchor->getAttribute('href');
        if (preg_match("|$filter|s", $target)) {
            $found[] = $target;
        }
    }

    $found = array_unique($found);
    $this->currentLinks = $found;

    return $found;
}
-
-
- /////////////////////////////////////////////////////////////////////////////////////
- //Prints each link in $links (or, if omitted, the links stored by the last getLinks()/getWeirdLinks() call) as an HTML anchor.
- //
- //Returns true if the links were printed, false if there was nothing to print
- //
//Echoes one "<a href=...>Link</a><br />" line per link.
//
//$links defaults to the links stored in $currentLinks.
//
//Returns TRUE after printing, or FALSE (with $error set) when $links is not a
//non-empty array.
public function printLinks($links = null)
{
    if (is_null($links)) {
        $links = $this->currentLinks;
    }

    if (!is_array($links) || count($links) < 1) {
        $this->error = 'printLinks: $links is not an array or has no elements.';
        return false;
    }

    foreach ($links as $link) {
        //Links come from fetched pages, so they are untrusted: quote the attribute
        //and escape it. double_encode is off because links gathered by regex still
        //carry their original entities; a single-byte charset is named so the call
        //never rejects non-UTF-8 input.
        $href = htmlspecialchars($link, ENT_QUOTES, 'ISO-8859-1', false);
        echo "<a href=\"$href\">Link</a><br />\n";
    }
    return true;
}
-
-
- /////////////////////////////////////////////////////////////////////////////////////
- //Search for string within $html. Set $caseSensitive to true for case sensitive search
- //
- //Returns true if $needle found within
- //
//Tests whether $needle occurs in the HTML.
//
//$needle is inserted into a pattern as-is (delimiter "|"). The search ignores
//case unless $caseSensitive is non-null. $html defaults to the current raw HTML.
//
//Returns TRUE when the pattern matches, FALSE otherwise.
public function inData($needle, $caseSensitive = null, $html = null)
{
    $haystack = is_null($html) ? $this->getRawHtml() : $html;
    $search = is_null($caseSensitive) ? "|$needle|is" : "|$needle|s";

    return (bool) preg_match($search, $haystack);
}
-
-
- /////////////////////////////////////////////////////////////////////////////////////
- //Deletes specified directory and its contents.
- //
- //Returns TRUE if directory and contents were successfully deleted
- //
//Recursively deletes a directory below WRITEDIRECTORY, including its contents.
//
//$dir is the directory name relative to WRITEDIRECTORY (leading slashes are
//ignored).
//
//Returns TRUE when the directory itself was removed, FALSE (with $error set)
//otherwise.
public function clearDir($dir)
{
    $relative = ltrim($dir, '/');

    //An empty name would target WRITEDIRECTORY itself and ".." segments would
    //escape it; neither is something this method should ever delete.
    if ($relative === '' || preg_match('#(^|/)\.\.(/|$)#', $relative)) {
        $this->error = "clearDir: Could not open " . self::WRITEDIRECTORY . $relative . " for delete";
        return false;
    }

    $target = self::WRITEDIRECTORY . $relative;

    $handle = @opendir($target);
    if ($handle === false) {
        $this->error = "clearDir: Could not open $target for delete";
        return false;
    }

    while (false !== ($item = readdir($handle))) {
        if ($item == "." || $item == "..") {
            continue;
        }
        $path = "$target/$item";
        if (is_dir($path) && !is_link($path)) {
            //Real sub-directory: empty and remove it first.
            $this->clearDir("$relative/$item");
        } else {
            @unlink($path);
        }
    }
    closedir($handle);

    if (rmdir($target)) {
        return true;
    }
    $this->error = "clearDir: There was a problem clearing $target.";
    return false;
}
-
-
- /////////////////////////////////////////////////////////////////////////////////////
- //Writes the value of $html to $dir directory. If $html is not specified, it uses
- //$this->rawHtml. If safe_mode is on in php.ini, directories must be same UID/GID
- //(if relaxed to GID) as the sctipt calling this function. If a filename is specified,
- //it will write to that file, otherwise, it will create a unique filename in the dir
- //specified. Filename must have .extension
- //
- //Returns TRUE if write was successfull
- //
//Writes $html to a file below WRITEDIRECTORY.
//
//$dir is a path relative to WRITEDIRECTORY (one leading "/" is ignored). When its
//last component contains a "." and is not an existing directory it is treated as
//the file to write; otherwise it is treated as a directory and the file is named
//md5($html) . ".html" inside it. Missing directories are created.
//$html defaults to the current raw HTML.
//
//Returns TRUE when data was written (and increments the write counter), FALSE
//(with $error set) otherwise. Writing zero bytes counts as a failure, as before.
//
//Side effect kept from the original: the process working directory is changed to
//the directory that receives the file.
public function write($dir, $html = null)
{
    if (is_null($html)) {
        $html = $this->rawHtml;
    }

    if (substr($dir, 0, 1) == '/') {
        $dir = (string) substr($dir, 1);
    }
    $dir = self::WRITEDIRECTORY . $dir;
    $path_info = pathinfo($dir);

    $namesFile = (strpos($path_info['basename'], '.') !== false) && !is_dir($dir);
    if ($namesFile) {
        $folder = $path_info['dirname'];
        $target = $dir;
    } else {
        if (substr($dir, -1, 1) != '/') {
            $dir = $dir . "/";
        }
        $folder = $dir;
        //Content hash as file name: identical content is never stored twice.
        $target = $dir . md5($html) . ".html";
    }

    //mkdir() replaces exec('mkdir -p ...'): no shell is involved (the path may come
    //from scraped data) and failure is actually detectable. The trailing is_dir()
    //covers another process creating the directory in the meantime.
    if (!is_dir($folder) && !@mkdir($folder, 0777, true) && !is_dir($folder)) {
        $this->error = "write: Creation of $dir was unsuccessfull";
        return false;
    }
    chdir($folder);

    $file = @fopen($target, 'w');
    if ($file === false) {
        $this->error = "write: There was an error writing to $target.";
        return false;
    }

    $written = fwrite($file, $html);
    fclose($file); //closed on both paths; the failure path used to leak the handle

    if (!$written) {
        $this->error = "write: There was an error writing to $target.";
        return false;
    }

    $this->writeCount++;
    return true;
}
-
-
- /////////////////////////////////////////////////////////////////////////////////////
- //Writes $image to $image_name. $image_name will be appended to WRITEDIRECTORY constant.
- //$image can be actual binary image or image URL
- //
- //Returns true if image wrote to disk successfully
- //
//Writes an image to a file below WRITEDIRECTORY.
//
//$image is either the binary image data or an http://, https:// or ftp:// URL,
//in which case it is fetched with ripRun() first.
//$img_name is the target path relative to WRITEDIRECTORY (one leading "/" is
//ignored); missing directories are created.
//
//Returns TRUE when the image was written, FALSE (with $error set) otherwise.
public function writeImage($image, $img_name)
{
    if (empty($image)) {
        $this->error = "writeImage: There is no image to write.";
        return false;
    }

    if (substr($img_name, 0, 1) == '/') {
        $img_name = (string) substr($img_name, 1);
    }
    $img_name = self::WRITEDIRECTORY . $img_name;
    $path_info = pathinfo($img_name);

    //mkdir() replaces exec('mkdir -p ...'): no shell involved, and failure is
    //detectable.
    $folder = $path_info['dirname'];
    if (!is_dir($folder) && !@mkdir($folder, 0777, true) && !is_dir($folder)) {
        $this->error = "write: Creation of $img_name was unsuccessfull";
        return false;
    }

    //Only a real URL prefix triggers a download. The old test was a character
    //class, so any data starting with h, t, p, s, f, ":" or "|" was "fetched".
    if (preg_match('/^(?:https?|ftp):\/\//i', $image)) {
        $image = $this->ripRun($image);
    }
    if ($image === false || $image === null || $image === '') {
        $this->error = "writeImage: Could not write image";
        return false;
    }

    $file = @fopen($img_name, "w");
    if ($file === false) {
        $this->error = "writeImage: Could not write image";
        return false;
    }

    $written = fwrite($file, $image);
    fclose($file); //the handle was never closed before

    if (!$written) {
        $this->error = "writeImage: Could not write image";
        return false;
    }
    return true;
}
-
-
- /////////////////////////////////////////////////////////////////////////////////////
- //Removes any alert or confirm javascript popups.
- //
- //Returns clean HTML. Also sets $rawHtml to clean value.
//Removes "alert(...);" and "confirm(...);" calls from the HTML.
//
//$html defaults to the current raw HTML.
//
//Returns the cleaned text, which is also stored in $rawHtml.
public function ripJSPopups($html = null)
{
    $source = is_null($html) ? $this->rawHtml : $html;

    $withoutAlerts = preg_replace('/alert\(.*?\)[;]/', '', $source);
    $this->rawHtml = preg_replace('/confirm\(.*?\)[;]/', '', $withoutAlerts);

    return $this->rawHtml;
}
-
-
- /////////////////////////////////////////////////////////////////////////////////////
- //Makes all relative links absolute with the provided $baseUrl. This function is usually
- //only used by ripRun() when specified in argument 2.
- //
- //Returns clean HTML. Also sets $rawHtml to cleaned value
- //
//Rewrites relative URLs in the HTML into absolute ones.
//
//$baseUrl is the URL relative links are resolved against; a <base href> found in
//the document takes its place. $html defaults to the current raw HTML.
//
//URL-carrying attributes of the tags listed below are rewritten through
//DOMDocument, then @import targets in style sheets are rewritten textually.
//
//Returns the rewritten HTML (as serialised by DOMDocument::saveHTML()), which is
//also stored in $rawHtml.
//
//Fixed relative to the previous version:
// - elements without the attribute no longer get one added (every <td>, <a name>,
//   inline <script>, ... used to receive background/href/src = base URL);
// - "/.." URLs were prefixed twice because of a missing else;
// - any scheme (mailto:, data:, ...), fragments and protocol-relative URLs are
//   left alone / handled instead of being glued onto the base;
// - the port of the base URL is kept for root-relative links;
// - @import handling no longer rewrites unrelated quoted strings or runs an
//   empty pattern over the whole page.
public function fixLinks($baseUrl, $html = null)
{
    if (is_null($html)) {
        $html = $this->rawHtml;
    }

    $tagAttributes = array(
        'table'  => 'background',
        'td'     => 'background',
        'tr'     => 'background',
        'th'     => 'background',
        'body'   => 'background',
        'a'      => 'href',
        'link'   => 'href',
        'area'   => 'href',
        'form'   => 'action',
        'script' => 'src',
        'img'    => 'src',
        'input'  => 'src',
        'iframe' => 'src',
        'frame'  => 'src',
        'embed'  => 'src');

    //scheme://host[:port] used for root-relative ("/path") URLs
    $host = $this->fixLinksOrigin($baseUrl);

    //A <base href> in the document overrides the caller's base.
    if (preg_match('/<base(?:.*?)href=["\']?([^\'"\s]*)[\'"\s]?/is', $html, $base) && $base[1] !== '') {
        $baseOrigin = $this->fixLinksOrigin($base[1]);
        if ($baseOrigin !== '') {
            $baseUrl = $base[1];
            $host = $baseOrigin;
        } elseif (substr($base[1], 0, 1) == '/') {
            $baseUrl = $host . $base[1];
        } else {
            $baseUrl = rtrim($baseUrl, '/') . '/' . $base[1];
        }
    }

    // Append a trailing slash to the url if it doesn't exist
    if (substr($baseUrl, -1, 1) != '/') {
        $baseUrl .= '/';
    }

    //DOM processing mangles HTML entities, so they are swapped for placeholders
    //before loading and restored after saving.
    //NOTE(review): in the copy under review the keys were literal characters
    //(' ', '<', '&', ...), which looks like the entities were decoded when the file
    //was rendered; literal keys would replace every space and tag bracket. The
    //entity names below are the reconstruction - confirm against upstream.
    $encodingValues = array(
        '&nbsp;'  => 's%s%',
        '&copy;'  => 'c%c%',
        '&sect;'  => 'se%se%',
        '&amp;'   => 'a%a%',
        '&cent;'  => 'ce%ce%',
        '&pound;' => 'p%p%',
        '&reg;'   => 'r%r%',
        '&yen;'   => 'y%y%',
        '&deg;'   => 'd%d%',
        '&laquo;' => 'l%l%',
        '&quot;'  => 'q%q%',
        '&lt;'    => 'lt%lt%',
        '&gt;'    => 'g%g%',
        '&trade;' => 't%t%',
        '&raquo;' => 'ra%ra%'
    );
    $html = str_ireplace(array_keys($encodingValues), array_values($encodingValues), $html);

    $doc = new DOMDocument();
    @$doc->loadHTML($html);
    foreach ($tagAttributes as $tag => $attribute) {
        foreach ($doc->getElementsByTagName($tag) as $link) {
            if (!$link->hasAttribute($attribute)) {
                continue;
            }
            $url = $link->getAttribute($attribute);
            $absolute = $this->fixLinksResolve($url, $baseUrl, $host);
            if ($absolute !== $url) {
                $link->setAttribute($attribute, $absolute);
            }
        }
    }
    $html = $doc->saveHTML();

    $html = str_ireplace(array_values($encodingValues), array_keys($encodingValues), $html);

    //@import url(x) / @import "x": rewrite the target inside the statement only.
    if (preg_match_all('/@import\s*(?:url\(\s*)?["\']?([^"\')\s;]+)/i', $html, $imports, PREG_SET_ORDER)) {
        $done = array();
        foreach ($imports as $import) {
            $statement = $import[0];
            $target = $import[1];
            if (isset($done[$statement])) {
                continue;
            }
            $done[$statement] = true;

            $absolute = $this->fixLinksResolve($target, $baseUrl, $host);
            if ($absolute !== $target) {
                $lead = substr($statement, 0, strlen($statement) - strlen($target));
                $html = str_replace($statement, $lead . $absolute, $html);
            }
        }
    }

    $this->rawHtml = $html;
    return $html;
}

//Returns "scheme://host[:port]" for an absolute URL, or '' when $url has no
//scheme and host.
private function fixLinksOrigin($url)
{
    $parts = parse_url($url);
    if (!is_array($parts) || !isset($parts['scheme']) || !isset($parts['host'])) {
        return '';
    }
    $origin = $parts['scheme'] . "://" . $parts['host'];
    if (isset($parts['port'])) {
        $origin .= ':' . $parts['port'];
    }
    return $origin;
}

//Resolves one URL: $baseUrl (ending in "/") is used for relative paths, $host
//(scheme://host[:port]) for root-relative ones. URLs that already carry a scheme
//and fragment-only URLs are returned unchanged.
private function fixLinksResolve($url, $baseUrl, $host)
{
    if (preg_match('/^(?:[a-z][a-z0-9+.\-]+:|#)/i', $url)) {
        return $url;
    }
    if (substr($url, 0, 2) == '//') {
        //Protocol-relative: borrow the scheme of the page.
        $scheme = parse_url($host, PHP_URL_SCHEME);
        return ($scheme ? $scheme . ':' : '') . $url;
    }
    if (substr($url, 0, 1) == "/") {
        if (substr($url, 1, 2) == "..") {
            //"/../x" has to stay relative to the current path
            return rtrim($baseUrl, '/') . $url;
        }
        return $host . $url;
    }
    return $baseUrl . $url;
}
-
- /////////////////////////////////////////////////////////////////////////////////////
- //Find all links to other files...inside body, img, iframe, etc...
- //
- //Returns array of all possible external links found
- //
//Collects the URL-carrying attribute of every table/td/tr/th/body (background),
//a/link/area (href), form (action) and script/img/iframe/input/frame/embed (src)
//tag found in the HTML.
//
//$html defaults to the current raw HTML.
//
//Returns the attribute values in tag-list order; duplicates are not removed.
public function getAllLinks($html = null)
{
    if (is_null($html)) {
        $html = $this->rawHtml;
    }
    $links = array();

    $tagAttributes = array(
        'table'  => 'background',
        'td'     => 'background',
        'tr'     => 'background',
        'th'     => 'background',
        'body'   => 'background',
        'a'      => 'href',
        'link'   => 'href',
        'area'   => 'href',
        'form'   => 'action',
        'script' => 'src',
        'img'    => 'src',
        'iframe' => 'src',
        'input'  => 'src',
        'frame'  => 'src',
        'embed'  => 'src');

    // Single, double, and no quotes are all supported
    foreach ($tagAttributes as $tag => $attribute) {
        //\b keeps "<a" from also matching "<area"/"<abbr" (and "<th" from matching
        //"<thead"); excluding ">" and whitespace from the value keeps an unquoted
        //value from swallowing the rest of the markup.
        $pattern = "/<{$tag}\\b([^>]*)\\s{$attribute}\\s*=\\s*[\"']?([^\"'\\s>]*)[\"']?/is";
        if (preg_match_all($pattern, $html, $matches)) {
            $links = array_merge($links, $matches[2]);
        }
    }
    return $links;
}
-
- /////////////////////////////////////////////////////////////////////////////////////
- //Finds all hidden form elements within specified $formName. If empty, it searches
- //the entire document.
- //
- //Returns asociative array (element name=>element value) of found elements and their
- //current values
- //
//Collects the hidden <input> elements of one form.
//
//$formName selects the form by its name attribute; when omitted, or when several
//forms share the name, the last matching form in the document is used.
//$html defaults to the current raw HTML.
//
//Returns an associative array (element name => value; names occurring more than
//once map to an array of values), or FALSE (with $error set) when no form
//matches. The no-form case used to end in a fatal error.
public function getHiddenFormElements($formName = null, $html = null)
{
    if (is_null($html)) {
        $html = $this->rawHtml;
    }

    $page = new DOMDocument();
    $page->preserveWhiteSpace = false;
    @$page->loadHTML($html);

    $currentForm = null;
    foreach ($page->getElementsByTagName('form') as $form) {
        if (is_null($formName) || ($form->getAttribute('name') == $formName)) {
            $currentForm = $form;
        }
    }
    if (!is_object($currentForm)) {
        $this->error = "getHiddenFormElements: HTML contains no forms.";
        return false;
    }

    $elementsArray = array();
    foreach ($currentForm->getElementsByTagName('input') as $element) {
        if (strtolower($element->getAttribute('type')) != 'hidden') {
            continue;
        }
        $name = $element->getAttribute('name');
        $value = $element->getAttribute('value');

        if (!array_key_exists($name, $elementsArray)) {
            $elementsArray[$name] = $value;
        } elseif (is_array($elementsArray[$name])) {
            $elementsArray[$name][] = $value;
        } else {
            $elementsArray[$name] = array($elementsArray[$name], $value);
        }
    }

    return $elementsArray;
}
- /////////////////////////////////////////////////////////////////////////////////////
- //Finds all form elements within specified $formName. If empty, it searches the entire
- //document.
- //
- //Returns asociative array (element name=>element value) of found elements and their
- //current values
- //
//Collects the values a form would submit: inputs, selected <option>s and
//textareas.
//
//$formName selects the form by its name attribute; when omitted, or when several
//forms share the name, the last matching form in the document is used.
//$html defaults to the current raw HTML.
//
//Skipped: unchecked radio buttons and checkboxes, and inputs of type submit,
//button, reset and image. A checked radio/checkbox without a value attribute
//yields "on". Only options carrying the selected attribute are reported.
//
//Also stores the form's action and method in $formAction/$formMethod; a form
//without a method attribute is recorded as "get", the HTML default.
//
//Returns an associative array (element name => value; names occurring more than
//once map to an array of values), or FALSE (with $error set) when no form matches.
public function getFormElements($formName = null, $html = null)
{
    if (is_null($html)) {
        $html = $this->rawHtml;
    }

    $page = new DOMDocument();
    $page->preserveWhiteSpace = false;
    @$page->loadHTML($html);

    $currentForm = null;
    foreach ($page->getElementsByTagName('form') as $form) {
        if (is_null($formName) || ($form->getAttribute('name') == $formName)) {
            $currentForm = $form;
        }
    }
    if (!is_object($currentForm)) {
        $this->error = "getFormElements: HTML contains no forms.";
        return false;
    }

    $this->formAction = $currentForm->getAttribute('action');
    $method = $currentForm->getAttribute('method');
    $this->formMethod = ($method === '') ? 'get' : $method;

    $elementsArray = array();

    //Process for input elements
    $neverSent = array('submit', 'button', 'reset', 'image');
    foreach ($currentForm->getElementsByTagName('input') as $element) {
        $type = strtolower($element->getAttribute('type'));
        if (in_array($type, $neverSent, true)) {
            continue;
        }

        $value = $element->getAttribute('value');
        if ($type == 'radio' || $type == 'checkbox') {
            if (!$element->hasAttribute('checked')) {
                continue;
            }
            if (!$element->hasAttribute('value')) {
                $value = 'on';
            }
        }
        $elementsArray = $this->addFormElementValue($elementsArray, $element->getAttribute('name'), $value);
    }

    //Process for select options
    foreach ($currentForm->getElementsByTagName('select') as $element) {
        $name = $element->getAttribute('name');
        foreach ($element->getElementsByTagName('option') as $option) {
            if ($option->hasAttribute('selected')) {
                $elementsArray = $this->addFormElementValue($elementsArray, $name, $option->getAttribute('value'));
            }
        }
    }

    //Process for textarea
    foreach ($currentForm->getElementsByTagName('textarea') as $element) {
        $elementsArray = $this->addFormElementValue($elementsArray, $element->getAttribute('name'), $element->textContent);
    }

    return $elementsArray;
}

//Adds $value under $name and returns the updated array; a second value for the
//same name turns the entry into a list of values.
private function addFormElementValue($elements, $name, $value)
{
    if (!array_key_exists($name, $elements)) {
        $elements[$name] = $value;
    } elseif (is_array($elements[$name])) {
        $elements[$name][] = $value;
    } else {
        $elements[$name] = array($elements[$name], $value);
    }
    return $elements;
}
-
- /////////////////////////////////////////////////////////////////////////////////////
- //Gets form information
- //
- //Returns array of form information
- //
//Reads the action and method of one form.
//
//$formName selects the form by its name attribute; when omitted, or when several
//forms share the name, the last matching form in the document is used.
//$html defaults to the current raw HTML.
//
//Returns array('action' => ..., 'method' => ...) and stores the same values in
//$formAction/$formMethod, or returns FALSE (with $error set) when no form
//matches (that case used to end in a fatal error). A form without a method
//attribute is reported as "get", the HTML default.
public function getFormInfo($formName = null, $html = null)
{
    if (is_null($html)) {
        $html = $this->rawHtml;
    }

    $page = new DOMDocument();
    $page->preserveWhiteSpace = false;
    @$page->loadHTML($html);

    $currentForm = null;
    foreach ($page->getElementsByTagName('form') as $form) {
        if (is_null($formName) || ($form->getAttribute('name') == $formName)) {
            $currentForm = $form;
        }
    }
    if (!is_object($currentForm)) {
        $this->error = "getFormInfo: HTML contains no forms.";
        return false;
    }

    $method = $currentForm->getAttribute('method');

    $this->formAction = $currentForm->getAttribute('action');
    $this->formMethod = ($method === '') ? 'get' : $method;

    return array(
        'action' => $this->formAction,
        'method' => $this->formMethod,
    );
}
-
- /////////////////////////////////////////////////////////////////////////////////////
- //Some general statistics for the current ripCurl session. This is a cumulative total
- //of all ripRun()'s.
- //
- //Returns HTML of totals.
- //
//Builds an HTML summary of the cumulative counters kept by this instance.
//
//Returns the summary as a string of "<br />"-terminated lines.
public function getStats()
{
    $rows = array(
        "Total Connections: $this->totalConnections",
        "Total Time: $this->totalTime",
        "Total Download Size: $this->totalSize",
        "Average Connection Speed: $this->averageSpeed",
        "Total Redirects: $this->totalRedirects",
        "Average Time per Connection: $this->averageTime",
    );

    return "<br /><br />\n" . implode("<br />\n", $rows) . "<br />";
}
-
-
- /////////////////////////////////////////////////////////////////////////////////////
- //Assigns POST data and enables POST settings in the CURL object. Primarily used
- //from 3rd argument of ripRun().
- //
- //Returns true if settings set and data ready for ripRun().
- //
//Switches the handle to POST and sets the POST body.
//
//$data is either a ready-made body string, passed through unchanged, or an
//array of name => value pairs (a value may itself be an array, producing one
//pair per element) that is url-encoded into "a=1&b=2" form.
//
//Returns TRUE when the body was set, FALSE (with $error set) when $data is empty.
//The body that was sent to cURL is also stored in $postData. For string input
//the previous version stored an empty string there and returned nothing.
public function postRequest($data)
{
    if (empty($data)) {
        $this->error = "postRequest: No post data specified";
        return false;
    }

    $this->post();

    if (is_array($data)) {
        $pairs = array();
        foreach ($data as $k => $v) {
            $values = is_array($v) ? $v : array($v);
            foreach ($values as $subval) {
                $pairs[] = urlencode($k) . "=" . urlencode($subval);
            }
        }
        $body = implode("&", $pairs);
    } else {
        $body = $data;
    }

    $this->postFields($body);
    $this->postData = $body;
    return true;
}
-
- /////////////////////////////////////////////////////////////////////////////////////
- //Sets up data for get requests
- //
- //Returns query string
//Builds a query string for a GET request.
//
//A non-array $data is returned unchanged. An array of name => value pairs (a
//value may itself be an array, producing one pair per element) is url-encoded.
//
//Returns the query string, starting with "?" when built from an array.
public function getRequest($data)
{
    if (!is_array($data)) {
        return $data;
    }

    $pairs = array();
    foreach ($data as $key => $value) {
        $items = is_array($value) ? $value : array($value);
        foreach ($items as $item) {
            $pairs[] = urlencode($key) . "=" . urlencode($item);
        }
    }

    return "?" . implode("&", $pairs);
}
-
-
- /////////////////////////////////////////////////////////////////////////////////////
- //Remove all HTML comments
- //
- //Returns HTML stripped of comments.
- //
//Removes every "<!-- ... -->" comment from the HTML by way of sandr().
//
//$html defaults to the current raw HTML.
//
//Returns the stripped text, which is also stored in $rawHtml.
public function stripComments($html = null)
{
    $source = is_null($html) ? $this->rawHtml : $html;

    $this->rawHtml = $this->sandr('<!--.*?-->', '', $source);
    return $this->rawHtml;
}
-
-
- /////////////////////////////////////////////////////////////////////////////////////
- //Retrieve 'href' value for a given link value
- //
- //Returns array of matching values
- //
//getNamedLinks($id, $html = null)
//
//$id   - pattern the link text has to match. It is inserted as-is between
//        "|" delimiters with the "s" modifier, so it is treated as a regular
//        expression and must not contain an unescaped "|".
//$html - markup to search; defaults to $this->rawHtml.
//
//Returns the href attribute of every <a> element whose text matches $id.
//Duplicates are removed with array_unique(), which keeps the key of the first
//occurrence, so the keys of the result are not necessarily consecutive.
//An empty array is returned when there is no markup to search.
public function getNamedLinks($id, $html = null){
    if(is_null($html)){
        $html = $this->rawHtml;
    }

    //DOMDocument::loadHTML() rejects an empty source (warning on older PHP,
    //ValueError on PHP 8 which "@" cannot silence). rawHtml is '' before the
    //first fetch and may be false after a failed one, so bail out early.
    if(!is_string($html) || $html === ''){
        return array();
    }

    $return = array();
    $doc = new DOMDocument();
    //Real world HTML is rarely valid; the parser warnings are deliberately muted
    @$doc->loadHTML($html);
    foreach($doc->getElementsByTagName('a') as $tag){
        //nodeValue is the text content of the anchor
        if(preg_match("|$id|s", $tag->nodeValue)){
            $return[] = $tag->getAttribute('href');
        }
    }
    unset($doc);

    return array_unique($return);
}
-
-
- /////////////////////////////////////////////////////////////////////////////////////
- //Main ripCurl execution method. $url is the URL to pull from. if not set, it is taken
- //from $this->ripUrl. $fixlinks is boolean option to automatically fix relative links.
- //$postdata can be array or urlencoded string. Result page is both returned and set
- //in $this->rawHtml.
- //
- //Returns HTML content retrieved from URL query.
- //
//ripRun($url, $fixLinks = null, $postdata = null)
//
//$url      - address to fetch.
//$fixLinks - when truthy, fixLinks() is called on the fetched page with a
//            base URL derived from $url (scheme, host and path; a dotted
//            last path segment is removed by the regex below).
//$postdata - optional request data (array or encoded string). If the method
//            remembered in $this->formMethod is "get" it is appended to the
//            URL through getRequest(); otherwise it is sent via postRequest().
//
//The result of curl_exec() is stored in $this->rawHtml, the per-instance
//statistics are updated and the value of getRawHtml() is returned.
public function ripRun($url, $fixLinks = null, $postdata = null){

    $this->url($url);

    if(!is_null($postdata)){
        if(strtolower($this->formMethod) == 'get'){
            $url = $url . $this->getRequest($postdata);
            $this->url($url);
        }else{
            $this->postRequest($postdata);
        }
    }

    $html = curl_exec($this->ch);
    if($html === false){
        //Do not lose the reason for a failed transfer
        $this->error = "ripRun: " . curl_error($this->ch);
    }

    $this->rawHtml = $html;
    $this->lastUrl = $url;

    if($fixLinks){
        //parse_url() returns false for malformed URLs and leaves out
        //components that are absent (e.g. no 'path' for "http://host"),
        //so every part is read defensively.
        $parse = parse_url($url);
        if(!is_array($parse)){
            $parse = array();
        }
        $scheme = isset($parse['scheme']) ? $parse['scheme'] : '';
        $hostName = isset($parse['host']) ? $parse['host'] : '';
        $path = isset($parse['path']) ? $parse['path'] : '';

        //Leaves the path unchanged when the pattern does not match
        $path = preg_replace('|(.*?/)([^/]*\..*?[^/]*)|is', '$1', $path);

        $host = $scheme . "://" . $hostName . $path;
        $this->fixLinks($host, $this->rawHtml);
    }

    //Stats tracking
    $this->totalConnections = $this->totalConnections + 1;
    $this->totalTime = $this->totalTime + $this->getInfo('total_time');
    $this->totalSize = $this->totalSize + $this->getInfo('size_download');

    //Incremental mean: new = old + (sample - old) / n. The previous formula
    //divided (old average + sample) by n, which is not an average.
    $speed = $this->getInfo('speed_download');
    $this->averageSpeed = $this->averageSpeed + ($speed - $this->averageSpeed) / $this->totalConnections;

    $redir = $this->getInfo('redirect_time');
    if(!empty($redir)){
        $this->totalRedirects = $this->totalRedirects + 1;
    }
    $this->averageTime = $this->totalTime / $this->totalConnections;

    //lastStatus is what this method has always written; lastStatusCode is the
    //property declared for the last status code at the top of the class.
    //Both are kept in step.
    $status = curl_getinfo($this->ch, CURLINFO_HTTP_CODE);
    $this->lastStatus = $status;
    $this->lastStatusCode = $status;

    return $this->getRawHtml();
}
-
-
- /////////////////////////////////////////////////////////////////////////////////////
- //XML methods
- /////////////////////////////////////////////////////////////////////////////////////
-
-
- /////////////////////////////////////////////////////////////////////////////////////
- //Parses a local XML file. The path must pass is_file(), so http addresses are rejected.
- //
- //Returns array
- //
//getXMLArray($filename)
//
//$filename - path of the XML file; it has to pass is_file().
//
//Reads the file, prepends an XML declaration when the content does not start
//with one, loads it into a DOMDocument (preserveWhiteSpace off) and converts
//the document with xml2array().
//
//Returns the converted structure, or false (with $this->error set) when the
//file does not exist or cannot be read. If the content is not well-formed
//$this->error is set and whatever xml2array() makes of the empty document is
//returned.
public function getXMLArray($filename){
    if(!is_file($filename)){
        $this->error = "getXMLArray: Bad filename $filename";
        return false;
    }

    $contents = file_get_contents($filename);
    if($contents === false){
        $this->error = "getXMLArray: Unable to read $filename";
        return false;
    }

    //Only add a declaration when there is none. The former pattern used
    //"[.*?]" (a character class, not "any characters"), so an ordinary
    //declaration was never recognised and a second one got prepended.
    if(!preg_match('/^\s*<\?xml\s/', $contents)){
        $contents = '<?xml version="1.0"?>' . $contents;
    }

    $xml = new DOMDocument();
    $xml->preserveWhiteSpace = false;
    if(!$xml->loadXML($contents)){
        $this->error = "getXMLArray: Unable to parse $filename as XML";
    }

    return $this->xml2array($xml);
}
-
-
- /////////////////////////////////////////////////////////////////////////////////////
- //Takes an XML file, or an array already built from XML, and pulls out only the
- //values stored under the named element.
- //
- //Returns array of named fields
- //
//getXMLElements($filename, $elementName)
//
//$filename    - path of a readable XML file (converted with getXMLArray()) or
//               an array that is searched directly.
//$elementName - key to look for.
//
//Walks the array recursively and collects, in document order,
//  - every non-array value stored under the key $elementName, and
//  - every non-array, integer-keyed member of an array stored under that key
//    (a list of values sharing one name).
//
//Returns the collected values (an empty array when nothing matches), or false
//when the argument is neither an array nor a readable file or the file could
//not be converted.
public function getXMLElements($filename, $elementName){
    //The array test comes first: is_file() only accepts strings and raises a
    //TypeError for an array on PHP 8.
    if(is_array($filename)){
        $data = $filename;
    }elseif(is_string($filename) && is_file($filename) && is_readable($filename)){
        $data = $this->getXMLArray($filename);
        if($data === false){
            return false;
        }
        if(!is_array($data)){
            //A scalar result holds no named elements
            return array();
        }
    }else{
        $this->error = "getXMLElements: Bad filename or array given";
        return false;
    }

    $return = array();
    foreach($data as $k=>$v){
        //Compare as strings: with "==" an integer key equals any
        //non-numeric name on PHP versions before 8.
        $matches = ((string)$k === (string)$elementName);

        if(is_array($v)){
            if($matches){
                foreach($v as $subKey=>$subVal){
                    if(is_int($subKey) && !is_array($subVal)){
                        $return[] = $subVal;
                    }
                }
            }
            //Keep what the nested levels found; it used to be thrown away
            foreach($this->getXMLElements($v, $elementName) as $found){
                $return[] = $found;
            }
        }elseif($matches){
            $return[] = $v;
        }
    }
    return $return;
}
-
-
- /////////////////////////////////////////////////////////////////////////////////////
- //Helper function for XML parsing
- //
- //Returns XML structure as array
- //
//xml2array($n)
//
//$n - DOM node (the document or an element) whose children are converted.
//
//Result:
//  - child elements become entries keyed by their node name; an element with
//    children is converted recursively, an element without children
//    contributes its nodeValue;
//  - when $n has more than one child and its first and last child carry the
//    same node name, the entries are appended as lists ($result[name][]),
//    otherwise a later element of the same name replaces an earlier one;
//  - when $n has text/CDATA children and no element children, the result is
//    the node value of the last of them instead of an array;
//  - other node types (comments, processing instructions, doctype) are
//    ignored.
//
//Earlier every childless node - including an empty element such as <b/> or a
//comment - replaced the whole result with its nodeValue, discarding siblings
//collected before it.
private function xml2array($n){
    $return = array();
    $children = $n->childNodes;

    $asList = $children->length > 1
        && $n->firstChild->nodeName == $n->lastChild->nodeName;

    foreach($children as $nc){
        if($nc->nodeType == XML_ELEMENT_NODE){
            $value = $nc->hasChildNodes() ? $this->xml2array($nc) : $nc->nodeValue;

            //Text seen before the first element (mixed content) gives way to
            //the structured result
            if(!is_array($return)){
                $return = array();
            }
            if($asList){
                $return[$nc->nodeName][] = $value;
            }else{
                $return[$nc->nodeName] = $value;
            }
        }elseif($nc->nodeType == XML_TEXT_NODE || $nc->nodeType == XML_CDATA_SECTION_NODE){
            //Text only counts while no element has been collected
            if(!is_array($return) || count($return) == 0){
                $return = $nc->nodeValue;
            }
        }
    }
    return $return;
}
-
-
- /////////////////////////////////////////////////////////////////////////////////////
- //This section is to layout future development ideas and projects
- //
- //Expansion ideas:
- //Expand image features with GD lib
- //Build XML parsing methods
- //Include validation checks for XML, HTML, CSS, etc.
- //Include limited JavaScript parsing capabilities
- //Build pagination methods with callback functionality
- //
- /////////////////////////////////////////////////////////////////////////////////////
-
- }
- ?>