PageRenderTime 56ms CodeModel.GetById 16ms RepoModel.GetById 1ms app.codeStats 0ms

/releases/ripcurl-0.6.0/ripcurl.class.php

https://bitbucket.org/linuxdream/ripcurl
PHP | 1398 lines | 944 code | 210 blank | 244 comment | 134 complexity | 178370c55e16f7d90b4aaf25a0d9b08d MD5 | raw file
  1. <?php
  2. //RipCURL class 0.6 - SD Linux Services
  3. //This class was written by Brandon Ching of SD Linux Services
  4. //It is licensed under the GPL 2 and requires PHP5 and CURL 7
  5. class ripCurl{
  6. //Class definitions
  7. //Define the default directory for ripWrite. Must include trailing "/". Please turn safe_mode off or make the proper UID/GID changes
  8. //to the php.ini file or in the ripWrite function below. The directory should be located under the web root directory for links to work
  9. //or use a directory alias in httpd.conf.
  //Root directory that write(), writeImage() and clearDir() prepend to the paths
  //they are given. The trailing "/" is required because the paths are joined by
  //plain string concatenation. It should sit below the web root (or behind an
  //httpd.conf alias) if the written files are meant to be linked to.
  const WRITEDIRECTORY = "/tmp/";

  //cURL handle that every option setter of this class operates on
  private $ch;

  //HTML of the page most recently fetched or processed. Only class methods
  //write to it; outside code reads it through getRawHtml().
  private $rawHtml = '';

  //POST fields of the most recent request
  private $postData = '';

  //Message describing the most recent failure, exposed by getError()
  private $error = '';

  //Number of HTTP errors seen by this instance
  private $httpErrors = 0;

  //Number of successful write() calls
  private $writeCount = 0;

  //Running total of links returned by getLinks()
  private $linksCount = 0;

  //Number of redirects followed by this instance
  private $totalRedirects = 0;

  //Accumulated transfer time of this instance
  private $totalTime = 0;

  //Accumulated size of everything transferred
  private $totalSize = 0;

  //Mean speed across all transfers
  private $averageSpeed = 0;

  //Mean duration of a transfer
  private $averageTime = 0;

  //Number of connections made by this instance
  private $totalConnections = 0;

  //Status code of the last ripRun
  private $lastStatusCode = null;

  //Directory of the cookie jar; the constructor appends the file name
  private $cookiejar = "/tmp/";

  //"action" attribute of the form last inspected by getFormElements()
  private $formAction = "";

  //"method" attribute of the form last inspected by getFormElements()
  private $formMethod = "post";

  //Links found by the most recent getLinks()/getWeirdLinks() call
  private $currentLinks = "";

  //URL used by the last ripRun()
  private $lastUrl = "";
  51. //Accessor methods
  //Returns the HTML currently held by the instance.
  public function getRawHtml()
  {
      return $this->rawHtml;
  }

  //Returns the stored last-URL value.
  public function getLastUrl()
  {
      return $this->lastUrl;
  }

  //Returns the stored POST data.
  public function getPostData()
  {
      return $this->postData;
  }

  //Returns the HTTP error counter.
  public function getHttpErrors()
  {
      return $this->httpErrors;
  }

  //Returns the number of successful write() calls.
  public function getWriteCount()
  {
      return $this->writeCount;
  }

  //Returns the running total of links collected by getLinks().
  public function getLinksCount()
  {
      return $this->linksCount;
  }
  //Resets the running getLinks() tally to zero.
  //
  //Returns TRUE once the counter has been reset.
  public function zeroLinksCount()
  {
      //The previous version tested the result of the assignment itself. That
      //result is 0 (false), so every call reported a failure even though the
      //reset had taken place.
      $this->linksCount = 0;
      return true;
  }
  //Returns the redirect counter.
  public function getTotalRedirects()
  {
      return $this->totalRedirects;
  }

  //Returns the accumulated transfer time.
  public function getTotalTime()
  {
      return $this->totalTime;
  }

  //Returns the accumulated transfer size.
  public function getTotalSize()
  {
      return $this->totalSize;
  }

  //Returns the stored average transfer speed.
  public function getAverageSpeed()
  {
      return $this->averageSpeed;
  }

  //Returns the stored average transfer time.
  public function getAverageTime()
  {
      return $this->averageTime;
  }

  //Returns the connection counter.
  public function getTotalConnections()
  {
      return $this->totalConnections;
  }

  //Returns the path of the cookie jar file in use.
  public function getCookieJar()
  {
      return $this->cookiejar;
  }

  //Returns the message recorded by the most recent failing method.
  public function getError()
  {
      return $this->error;
  }
  //Returns the status code of the last request, or NULL when none is recorded.
  public function getLastStatus()
  {
      //The class declares $lastStatusCode, but this accessor used to read the
      //undeclared $lastStatus. Prefer $lastStatus when something has set it
      //dynamically; otherwise fall back to the declared property.
      //NOTE(review): confirm which of the two names ripRun() actually writes.
      if (isset($this->lastStatus)) {
          return $this->lastStatus;
      }
      return $this->lastStatusCode;
  }
  //Points both the cookie jar (written by cURL) and the cookie file (read by
  //cURL) at $o.
  //
  //$o  Path of the cookie file. It must be writable, or - when it does not exist
  //    yet - live in a writable directory, because cURL creates the jar itself.
  //
  //Returns TRUE when the path was accepted, FALSE (with the error set) otherwise.
  public function setCookieJar($o)
  {
      $usable = false;
      if (is_string($o) && $o !== '') {
          $usable = file_exists($o) ? is_writable($o) : is_writable(dirname($o));
      }

      if (!$usable) {
          $this->error = "setCookieJar: Cookiejar could not be set";
          return false;
      }

      $this->cookiejar = $o;
      $this->cookieJar($o);
      $this->cookieFile($o);
      return true;
  }
  //Returns the "action" recorded by the last getFormElements() call.
  public function getFormAction()
  {
      return $this->formAction;
  }

  //Returns the form method currently stored on the instance.
  public function getFormMethod()
  {
      return $this->formMethod;
  }

  //Returns the links stored by the most recent link-collecting call.
  public function getCurrentLinks()
  {
      return $this->currentLinks;
  }
  //Empties the list of links kept from the last getLinks()/getWeirdLinks() call.
  //
  //Returns TRUE once the list has been cleared.
  public function clearCurrentLinks()
  {
      //An empty array is falsy, so testing the assignment (as the previous
      //version did) always took the error branch.
      $this->currentLinks = array();
      return true;
  }
  //Overrides the stored form method with $o.
  public function setFormMethod($o)
  {
      $this->formMethod = $o;
  }
  135. //Constructor method. Designed to set standard defaults for pulling sites into a variable
  //Creates the cURL handle and applies the defaults used for pulling a page into
  //a variable: return the transfer, follow redirects with an automatic referer,
  //a 30 second connect timeout and a per-instance cookie jar.
  //
  //$multi  Accepted for backward compatibility; not used here.
  //
  //NOTE: SSL peer and host verification are switched OFF by these defaults. Call
  //sslVerifyPeer()/sslVerifyHost() to turn them back on for anything sensitive.
  public function __construct($multi = null)
  {
      //The jar name used to be md5(date(...) + rand(5, 50)). The date string is
      //not numeric, so on PHP 5/7 the sum collapsed to the random number alone
      //(46 possible file names) and on PHP 8 the addition throws a TypeError.
      $this->cookiejar = $this->cookiejar . md5(uniqid((string) mt_rand(), true)) . ".cookiejar.txt";

      $this->ch = curl_init();
      curl_setopt($this->ch, CURLOPT_RETURNTRANSFER, 1);
      curl_setopt($this->ch, CURLOPT_AUTOREFERER, 1);
      curl_setopt($this->ch, CURLOPT_FOLLOWLOCATION, 1);
      curl_setopt($this->ch, CURLOPT_SSL_VERIFYHOST, 0);
      curl_setopt($this->ch, CURLOPT_SSL_VERIFYPEER, 0);
      curl_setopt($this->ch, CURLOPT_CONNECTTIMEOUT, 30);
      curl_setopt($this->ch, CURLOPT_COOKIEJAR, $this->cookiejar);
      curl_setopt($this->ch, CURLOPT_COOKIEFILE, $this->cookiejar);
  }

  //PHP 4 style constructor name. PHP 8 no longer treats it as a constructor, so
  //the real work lives in __construct(); this stays for code that calls it
  //explicitly.
  public function ripCurl($multi = null)
  {
      $this->__construct($multi);
  }
  148. //Class methods for setting curlopt variables
  //Sets CURLOPT_AUTOREFERER on the handle.
  public function autoReferer($o = 1)
  {
      curl_setopt($this->ch, CURLOPT_AUTOREFERER, $o);
  }

  //Sets CURLOPT_COOKIESESSION on the handle.
  public function cookieSession($o = 1)
  {
      curl_setopt($this->ch, CURLOPT_COOKIESESSION, $o);
  }

  //Sets CURLOPT_FAILONERROR on the handle.
  public function failOnError($o = 1)
  {
      curl_setopt($this->ch, CURLOPT_FAILONERROR, $o);
  }

  //Sets CURLOPT_FOLLOWLOCATION on the handle.
  public function followLocation($o = 1)
  {
      curl_setopt($this->ch, CURLOPT_FOLLOWLOCATION, $o);
  }

  //Sets CURLOPT_FORBID_REUSE on the handle.
  public function forbidReuse($o = 1)
  {
      curl_setopt($this->ch, CURLOPT_FORBID_REUSE, $o);
  }

  //Sets CURLOPT_FRESH_CONNECT on the handle.
  public function freshConnect($o = 1)
  {
      curl_setopt($this->ch, CURLOPT_FRESH_CONNECT, $o);
  }

  //Sets CURLOPT_HEADER on the handle.
  public function showHeader($o = 1)
  {
      curl_setopt($this->ch, CURLOPT_HEADER, $o);
  }

  //Sets CURLOPT_HTTPGET on the handle.
  public function httpGet($o = 1)
  {
      curl_setopt($this->ch, CURLOPT_HTTPGET, $o);
  }
  //Sets CURLOPT_MUTE where the cURL build still offers it.
  //
  //The option was dropped from cURL long ago and the constant is missing from
  //current PHP builds, where referencing it directly is a fatal error on PHP 8.
  //When it is unavailable this method does nothing; returnTransfer() is the
  //usual way to keep the response from being printed.
  public function mute($o = 1)
  {
      if (defined('CURLOPT_MUTE')) {
          curl_setopt($this->ch, constant('CURLOPT_MUTE'), $o);
      }
  }
  //Sets CURLOPT_NOSIGNAL on the handle.
  public function noSignal($o = 1)
  {
      curl_setopt($this->ch, CURLOPT_NOSIGNAL, $o);
  }

  //Sets CURLOPT_POST on the handle.
  public function post($o = 1)
  {
      curl_setopt($this->ch, CURLOPT_POST, $o);
  }

  //Sets CURLOPT_PUT on the handle.
  public function put($o = 1)
  {
      curl_setopt($this->ch, CURLOPT_PUT, $o);
  }

  //Sets CURLOPT_RETURNTRANSFER on the handle.
  public function returnTransfer($o = 1)
  {
      curl_setopt($this->ch, CURLOPT_RETURNTRANSFER, $o);
  }

  //Sets CURLOPT_SSL_VERIFYPEER on the handle.
  public function sslVerifyPeer($o = 1)
  {
      curl_setopt($this->ch, CURLOPT_SSL_VERIFYPEER, $o);
  }
  //Sets CURLOPT_SSL_VERIFYHOST on the handle.
  //
  //$o  0 disables the host name check; any "on" value (1, true, 2) enables it.
  //
  //libcurl's "verify" value is 2. The historical value 1 is rejected by libcurl
  //7.28.1 - 7.65.3, which leaves the option untouched, so the default of 1 could
  //silently fail to re-enable the check. An "on" request is sent as 2.
  public function sslVerifyHost($o = 1)
  {
      if ($o === 1 || $o === true || $o === '1') {
          $o = 2;
      }
      curl_setopt($this->ch, CURLOPT_SSL_VERIFYHOST, $o);
  }
  //Sets CURLOPT_VERBOSE on the handle.
  public function verbose($o = 1)
  {
      curl_setopt($this->ch, CURLOPT_VERBOSE, $o);
  }

  //Sets CURLOPT_BUFFERSIZE on the handle.
  public function bufferSize($o)
  {
      curl_setopt($this->ch, CURLOPT_BUFFERSIZE, $o);
  }

  //Sets CURLOPT_CONNECTTIMEOUT on the handle.
  public function connectTimeout($o)
  {
      curl_setopt($this->ch, CURLOPT_CONNECTTIMEOUT, $o);
  }

  //Sets CURLOPT_DNS_CACHE_TIMEOUT on the handle.
  public function dnsCacheTimeout($o)
  {
      curl_setopt($this->ch, CURLOPT_DNS_CACHE_TIMEOUT, $o);
  }

  //Sets CURLOPT_HTTP_VERSION on the handle.
  public function httpVersion($o)
  {
      curl_setopt($this->ch, CURLOPT_HTTP_VERSION, $o);
  }

  //Sets CURLOPT_HTTPAUTH on the handle.
  public function httpAuth($o)
  {
      curl_setopt($this->ch, CURLOPT_HTTPAUTH, $o);
  }

  //Sets CURLOPT_MAXCONNECTS on the handle.
  public function maxConnects($o)
  {
      curl_setopt($this->ch, CURLOPT_MAXCONNECTS, $o);
  }

  //Sets CURLOPT_PORT on the handle.
  public function port($o)
  {
      curl_setopt($this->ch, CURLOPT_PORT, $o);
  }

  //Sets CURLOPT_MAXREDIRS on the handle.
  public function maxRedirs($o)
  {
      curl_setopt($this->ch, CURLOPT_MAXREDIRS, $o);
  }

  //Sets CURLOPT_SSLVERSION on the handle.
  public function sslVersion($o)
  {
      curl_setopt($this->ch, CURLOPT_SSLVERSION, $o);
  }

  //Sets CURLOPT_TIMEOUT on the handle.
  public function timeOut($o)
  {
      curl_setopt($this->ch, CURLOPT_TIMEOUT, $o);
  }

  //Sets CURLOPT_COOKIE on the handle.
  public function cookie($o)
  {
      curl_setopt($this->ch, CURLOPT_COOKIE, $o);
  }

  //Sets CURLOPT_COOKIEJAR on the handle (does not touch $this->cookiejar).
  public function cookieJar($o)
  {
      curl_setopt($this->ch, CURLOPT_COOKIEJAR, $o);
  }

  //Sets CURLOPT_COOKIEFILE on the handle.
  public function cookieFile($o)
  {
      curl_setopt($this->ch, CURLOPT_COOKIEFILE, $o);
  }

  //Sets CURLOPT_ENCODING on the handle.
  public function encoding($o)
  {
      curl_setopt($this->ch, CURLOPT_ENCODING, $o);
  }

  //Sets CURLOPT_POSTFIELDS on the handle.
  public function postFields($o)
  {
      curl_setopt($this->ch, CURLOPT_POSTFIELDS, $o);
  }

  //Sets CURLOPT_REFERER on the handle.
  public function referer($o)
  {
      curl_setopt($this->ch, CURLOPT_REFERER, $o);
  }

  //Sets CURLOPT_URL on the handle.
  public function url($o)
  {
      curl_setopt($this->ch, CURLOPT_URL, $o);
  }
  //Sets CURLOPT_USERAGENT.
  //
  //$o  Either one of the short aliases below, which is expanded to a canned
  //    user-agent string, or any other value, which is sent as given.
  public function userAgent($o)
  {
      $presets = array(
          'ie6'    => 'Mozilla/5.0 (compatible; MSIE 6.0; Windows NT 5.1)',
          'ie7'    => 'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.0)',
          'ff'     => 'Mozilla/5.0 (Windows; U; Windows NT 5.1; es-ES; rv:1.8.0.6) Gecko/20060728 Firefox/1.5.0.6',
          'google' => 'Googlebot/2.1 (+http://www.google.com/bot.html)',
          'msn'    => 'msnbot/1.0 (+http://search.msn.com/msnbot.htm)',
          'yahoo'  => 'Mozilla/5.0 (compatible; Yahoo! Slurp;http://help.yahoo.com/help/us/ysearch/slurp)',
      );

      //First alias that compares equal wins, using the same loose comparison
      //and the same order a switch statement would.
      foreach ($presets as $alias => $agent) {
          if ($o == $alias) {
              $o = $agent;
              break;
          }
      }

      curl_setopt($this->ch, CURLOPT_USERAGENT, $o);
  }
  271. //Username and password in uname:pass format
  //Sets CURLOPT_USERPWD on the handle.
  public function userPwd($o)
  {
      curl_setopt($this->ch, CURLOPT_USERPWD, $o);
  }

  //Sets CURLOPT_HTTPHEADER on the handle.
  public function httpHeader($o)
  {
      curl_setopt($this->ch, CURLOPT_HTTPHEADER, $o);
  }
  278. /////////////////////////////////////////////////////////////////////////////////////
  279. // Begin major class methods
  280. //
  281. /////////////////////////////////////////////////////////////////////////////////////
  282. //Used to display a summary of the entire ripCurl session.
  283. //
  284. //If a parameter is specified, it will return only that key value.
  285. //Otherwise, return an array
  286. //
  //Returns curl_getinfo() data for the handle.
  //
  //$o  Optional key of a single entry.
  //
  //Returns the whole info array when $o is NULL, otherwise the value stored
  //under $o, or NULL when curl_getinfo() has no such key.
  public function getInfo($o = null)
  {
      $info = curl_getinfo($this->ch);
      if (is_null($o)) {
          return $info;
      }
      //An unknown key used to raise an undefined-index notice before yielding NULL.
      if (is_array($info) && array_key_exists($o, $info)) {
          return $info[$o];
      }
      return null;
  }
  297. /////////////////////////////////////////////////////////////////////////////////////
  298. //Get contents of cookiefile
  299. //
  300. //Returns an associative array of cookie parts
  301. //
  //Reads a tab separated (Netscape style) cookie file.
  //
  //$cookiefile  File to read; defaults to this instance's cookie jar.
  //
  //Returns a list of arrays keyed host, secure, path, httpOnly, expire, name and
  //value, filled from columns 1-7 of each line in that order, or FALSE (with the
  //error set) when the file cannot be read.
  //NOTE(review): in the Netscape format column 2 is the include-subdomains flag
  //and column 4 the secure flag, so the 'secure' and 'httpOnly' keys look
  //mislabelled. The keys are kept as they are for existing callers.
  public function getCookieJarContents($cookiefile = null)
  {
      if (is_null($cookiefile)) {
          $cookiefile = $this->cookiejar;
      }
      if (!is_readable($cookiefile)) {
          $this->error = "ripGetCookieContents: Cookie file is not readable";
          return false;
      }

      $contents = file_get_contents($cookiefile);
      if ($contents === false) {
          $this->error = "ripGetCookieContents: Cookie file is not readable";
          return false;
      }

      $labels = array('host', 'secure', 'path', 'httpOnly', 'expire', 'name', 'value');
      $cookies = array();
      foreach (preg_split('/\r\n|\r|\n/', $contents) as $line) {
          $fields = explode("\t", $line);
          //Comment and blank lines, and anything else with fewer than seven
          //columns, used to produce undefined-offset notices and half-empty rows.
          if (count($fields) < count($labels)) {
              continue;
          }
          $row = array();
          foreach ($labels as $position => $label) {
              $row[$label] = $fields[$position];
          }
          $cookies[] = $row;
      }
      return $cookies;
  }
  325. /////////////////////////////////////////////////////////////////////////////////////
  326. //Appends a cookie to the cookie jar file, using the Netscape cookie file format.
  327. //
  328. //Returns TRUE on successful cookie write
  329. //
  //Appends one cookie line to this instance's cookie jar file.
  //
  //$cookie  Array of exactly seven values, in file column order: host, secure,
  //         path, httpOnly, expire, name, value.
  //
  //Returns TRUE when the line was written, FALSE (with the error set) otherwise.
  public function writeCookie($cookie)
  {
      if (!is_array($cookie) || count($cookie) != 7) {
          $this->error = "setCookie: Argument must be an array with host, secure, path, httpOnly, expire, name, and value";
          return false;
      }

      $cookiefile = $this->cookiejar;
      //cURL creates the jar lazily, so also accept a file that does not exist
      //yet as long as its directory can be written to.
      $writable = file_exists($cookiefile) ? is_writable($cookiefile) : is_writable(dirname($cookiefile));
      if (!$writable) {
          $this->error = "Cookie file is not writeable";
          return false;
      }

      //Each cookie needs its own line. Without the terminator, consecutive
      //writes ran together into one unparsable record.
      $cookieString = implode("\t", $cookie) . "\n";

      //file_put_contents opens, locks, writes and closes in one step, so no
      //handle is left open on the failure path.
      if (@file_put_contents($cookiefile, $cookieString, FILE_APPEND | LOCK_EX) === false) {
          $this->error = "setCookie: Could not write cookie to file";
          return false;
      }
      return true;
  }
  349. /////////////////////////////////////////////////////////////////////////////////////
  350. //Pulls all HTML between the $start and $end and returns it.
  351. //
  352. //Returns array of captured content
  353. //
  //Collects everything found between $start and $end.
  //
  //$start, $end  Regular expression fragments (they are inserted into the
  //              pattern unescaped; "|" is the delimiter).
  //$greedy       NULL for the shortest match between the markers, anything else
  //              for the longest.
  //$html         Text to search; defaults to the instance HTML.
  //
  //Returns the array of captured strings, one per match.
  public function ripInBetween($start, $end, $greedy = null, $html = null)
  {
      $haystack = is_null($html) ? $this->getRawHtml() : $html;
      $middle = is_null($greedy) ? "(.*?)" : "(.*)";
      $needle = "|$start" . $middle . "$end|is";

      preg_match_all($needle, $haystack, $result, PREG_PATTERN_ORDER);
      return $result[1];
  }
  366. /////////////////////////////////////////////////////////////////////////////////////
  367. //Search and replace function.
  368. //
  369. //Returns value of $html after replacement. Also sets replaced value to $rawHtml
  370. //
  //Regex search and replace.
  //
  //$search   Pattern body (unescaped, "|" delimited, case-insensitive, dot
  //          matches newlines).
  //$replace  Replacement passed to preg_replace().
  //$html     Text to work on; defaults to the instance HTML.
  //
  //Returns the text after replacement. The result is also stored as the
  //instance HTML, whether or not $html was supplied.
  public function sandr($search, $replace, $html = null)
  {
      $subject = is_null($html) ? $this->getRawHtml() : $html;
      $this->rawHtml = preg_replace("|$search|is", $replace, $subject);
      return $this->rawHtml;
  }
  379. /////////////////////////////////////////////////////////////////////////////////////
  380. //Removes all javascript from the passed html
  381. //
  382. //Returns clean HTML. Also sets value to $rawHtml
  383. //
  //Strips every <script ...>...</script> section.
  //
  //$html  Text to clean; defaults to the instance HTML.
  //
  //Returns the cleaned text, which is also stored as the instance HTML.
  public function ripJS($html = null)
  {
      $subject = is_null($html) ? $this->getRawHtml() : $html;
      $this->rawHtml = preg_replace('|<script.*?/script>|is', '', $subject);
      return $this->rawHtml;
  }
  392. /////////////////////////////////////////////////////////////////////////////////////
  393. //Removes all style sheets and <style> contents.
  394. //
  395. //Returns clean HTML. Also sets value to $rawHtml
  396. //
  //Strips every <style ...>...</style> section.
  //
  //$html  Text to clean; defaults to the instance HTML.
  //
  //Returns the cleaned text, which is also stored as the instance HTML.
  public function ripStyles($html = null)
  {
      $subject = is_null($html) ? $this->getRawHtml() : $html;
      $this->rawHtml = preg_replace('|<style.*?/style>|is', '', $subject);
      return $this->rawHtml;
  }
  405. /////////////////////////////////////////////////////////////////////////////////////
  406. //Method to get all links in a page. If the $id is specified, returned links must
  407. //contain $id. Link counts are automatically tracked in the $this->linksCount class
  408. //variable
  409. //
  410. //Returns array of all links found with $id specified
  411. //
  //Collects the href of every <a href=...>...</a> pair whose href is the first
  //attribute of the tag.
  //
  //$id    Optional regular expression fragment ("|" delimited) the href must
  //       match; NULL keeps every href.
  //$html  Text to scan; defaults to the instance HTML.
  //
  //Returns a zero-based list of distinct hrefs. The list is also stored as the
  //current links and its size is added to the running link count.
  public function getLinks($id = null, $html = null)
  {
      if (is_null($html)) {
          $html = $this->getRawHtml();
      }
      //$id used to be overwritten with ".*" and then tested for NULL again,
      //which left an unreachable branch. A NULL filter simply means "keep all".
      $idpattern = is_null($id) ? null : '|' . $id . '|';

      $links = array();
      $pattern = '/<A\s*HREF=[\"\']?([^\"\'>]*)[\"\']?[^>]*>(.*?)<\/A>/is';
      if (preg_match_all($pattern, $html, $matches, PREG_SET_ORDER)) {
          foreach ($matches as $match) {
              if ($idpattern === null || preg_match($idpattern, $match[1])) {
                  $links[] = $match[1];
              }
          }
      }

      //array_unique() keeps the original keys. Reindex so the result is a
      //gap-free list that can also be walked with a counting loop.
      $links = array_values(array_unique($links));
      $this->linksCount = $this->linksCount + count($links);
      $this->currentLinks = $links;
      return $links;
  }
  438. /////////////////////////////////////////////////////////////////////////////////////
  439. //Like getLinks method but for links with javascript in the href portion
  440. //
  441. //Returns array of all links found with $id
  442. //
  //DOM based variant of getLinks(): returns the raw href attribute of every <a>
  //element, whatever it contains (javascript: handlers included).
  //
  //$id    Optional regular expression fragment ("|" delimited) the href must
  //       match; NULL keeps every href.
  //$html  Markup to scan; defaults to the instance HTML.
  //
  //Returns a zero-based list of distinct hrefs, also stored as the current links.
  public function getWeirdLinks($id = null, $html = null)
  {
      if (is_null($html)) {
          $html = $this->getRawHtml();
      }
      $filter = is_null($id) ? null : "|$id|s";

      $return = array();
      //DOMDocument::loadHTML() throws a ValueError on an empty string under
      //PHP 8, and "@" cannot suppress an exception. Empty input has no links.
      if (is_string($html) && $html !== '') {
          $doc = new DOMDocument();
          @$doc->loadHTML($html);
          foreach ($doc->getElementsByTagName('a') as $tag) {
              $href = $tag->getAttribute('href');
              if ($filter === null || preg_match($filter, $href)) {
                  $return[] = $href;
              }
          }
      }

      //Reindex after de-duplication so the list has no key gaps.
      $return = array_values(array_unique($return));
      $this->currentLinks = $return;
      return $return;
  }
  464. /////////////////////////////////////////////////////////////////////////////////////
  465. //Prints each link in $links as an HTML anchor. Uses the current links when $links is omitted.
  466. //
  467. //Returns true if at least one link was printed
  468. //
  //Echoes one "<a href=...>Link</a><br />" line per link.
  //
  //$links  Array of URLs; defaults to the links kept from the last
  //        getLinks()/getWeirdLinks() call.
  //
  //Returns TRUE after printing, FALSE (with the error set) when there is nothing
  //to print.
  public function printLinks($links = null)
  {
      if (is_null($links)) {
          $links = $this->currentLinks;
      }
      if (!is_array($links) || count($links) < 1) {
          $this->error = 'printLinks: $links is not an array or has no elements.';
          return false;
      }

      //The links come from scraped pages, so they are untrusted. Escape them
      //and quote the attribute; an unquoted, unescaped href breaks on a space
      //and lets the scraped page inject markup into the output.
      $flags = defined('ENT_SUBSTITUTE') ? (ENT_QUOTES | constant('ENT_SUBSTITUTE')) : ENT_QUOTES;
      foreach ($links as $link) {
          echo '<a href="' . htmlspecialchars((string) $link, $flags) . '">Link</a><br />' . "\n";
      }
      return true;
  }
  482. /////////////////////////////////////////////////////////////////////////////////////
  483. //Search for string within $html. Set $caseSensitive to true for case sensitive search
  484. //
  485. //Returns true if $needle found within
  486. //
  //Tests whether $needle occurs in the text.
  //
  //$needle         Regular expression fragment ("|" delimited, unescaped).
  //$caseSensitive  NULL for a case-insensitive search; any other value makes
  //                the search case sensitive.
  //$html           Text to search; defaults to the instance HTML.
  //
  //Returns TRUE when the pattern matches, FALSE otherwise.
  public function inData($needle, $caseSensitive = null, $html = null)
  {
      $haystack = is_null($html) ? $this->getRawHtml() : $html;
      $modifiers = is_null($caseSensitive) ? "is" : "s";
      return (bool) preg_match("|$needle|" . $modifiers, $haystack);
  }
  502. /////////////////////////////////////////////////////////////////////////////////////
  503. //Deletes specified directory and its contents.
  504. //
  505. //Returns TRUE if directory and contents were successfully deleted
  506. //
  //Deletes a directory below WRITEDIRECTORY together with everything in it.
  //
  //$dir  Path relative to WRITEDIRECTORY; a leading "/" is ignored.
  //
  //Returns TRUE when the directory was removed, FALSE (with the error set) when
  //it could not be opened or removed, or when the path is refused.
  public function clearDir($dir)
  {
      $relative = ltrim((string) $dir, '/');

      //Refuse paths that resolve to WRITEDIRECTORY itself ("", "/", ".") or
      //climb out of it (".."): the previous version would have emptied /tmp or
      //an arbitrary directory for such input.
      $named = array();
      foreach (explode('/', $relative) as $segment) {
          if ($segment === '..') {
              $this->error = "clearDir: Refusing to delete outside the write directory: $dir";
              return false;
          }
          if ($segment !== '' && $segment !== '.') {
              $named[] = $segment;
          }
      }
      if (count($named) < 1) {
          $this->error = "clearDir: Refusing to delete the write directory itself";
          return false;
      }

      $target = self::WRITEDIRECTORY . $relative;
      $handle = @opendir("$target");
      if (!$handle) {
          $this->error = "clearDir: Could not open $target for delete";
          return false;
      }

      while (false !== ($item = readdir($handle))) {
          if ($item == "." || $item == "..") {
              continue;
          }
          $path = "$target/$item";
          if (is_dir($path) && !is_link($path)) {
              //Real sub-directory: empty it recursively. Symlinks are
              //unlinked below, never followed.
              $this->clearDir("$relative/$item");
          } else {
              @unlink($path);
          }
      }
      closedir($handle);

      if (rmdir($target)) {
          return true;
      }
      $this->error = "clearDir: There was a problem clearing $target.";
      return false;
  }
  537. /////////////////////////////////////////////////////////////////////////////////////
  538. //Writes the value of $html to $dir directory. If $html is not specified, it uses
  539. //$this->rawHtml. If safe_mode is on in php.ini, directories must be same UID/GID
  540. //(if relaxed to GID) as the sctipt calling this function. If a filename is specified,
  541. //it will write to that file, otherwise, it will create a unique filename in the dir
  542. //specified. Filename must have .extension
  543. //
  544. //Returns TRUE if write was successfull
  545. //
  //Writes $html to a file below WRITEDIRECTORY.
  //
  //$dir   Path relative to WRITEDIRECTORY (a leading "/" is ignored). When its
  //       last component contains a "." and is not an existing directory, it is
  //       the file to write. Otherwise it is a directory and the file is named
  //       md5(content) . ".html", so identical content is never duplicated.
  //$html  Content to write; defaults to the instance HTML.
  //
  //Missing directories are created. As before, the working directory of the
  //process is changed to the directory written to.
  //
  //Returns TRUE on success, FALSE (with the error set) otherwise.
  public function write($dir, $html = null)
  {
      if (is_null($html)) {
          $html = $this->rawHtml;
      }

      //Drop one leading "/" so the path stays relative to WRITEDIRECTORY.
      if (substr($dir, 0, 1) == '/') {
          $dir = (string) substr($dir, 1);
      }
      $dir = self::WRITEDIRECTORY . $dir;
      $path_info = pathinfo($dir);

      $namesFile = (strpos($path_info['basename'], '.') !== false) && !is_dir($dir);
      if ($namesFile) {
          $folder = $path_info['dirname'];
          $abspath = $dir;
      } else {
          //Directory target. This includes an existing directory whose name
          //contains a dot, which used to lose the "/" before the file name.
          if (substr($dir, -1, 1) != '/') {
              $dir = $dir . "/";
          }
          $folder = $dir;
          $abspath = $dir . md5($html) . ".html";
      }

      //Create the directory natively: the shell "mkdir -p" call was open to
      //command injection through $dir and its failure check could never fire
      //(mkdir prints nothing on stdout).
      if (!is_dir($folder) && !@mkdir($folder, 0777, true) && !is_dir($folder)) {
          $this->error = "write: Creation of $dir was unsuccessfull";
          return false;
      }

      //Kept for backward compatibility with callers relying on the cwd change.
      chdir($folder);

      $file = @fopen($abspath, "w");
      if ($file === false) {
          $this->error = "write: There was an error writing to $abspath.";
          return false;
      }

      //fwrite() returns 0 for empty content, which is not a failure.
      $written = fwrite($file, $html);
      fclose($file);
      if ($written === false) {
          $this->error = "write: There was an error writing to $abspath.";
          return false;
      }

      $this->writeCount++;
      return true;
  }
  605. /////////////////////////////////////////////////////////////////////////////////////
  606. //Writes $image to $image_name. $image_name will be appended to WRITEDIRECTORY constant.
  607. //$image can be actual binary image or image URL
  608. //
  609. //Returns true if image wrote to disk successfully
  610. //
  //Writes an image to WRITEDIRECTORY . $img_name.
  //
  //$image     Either the binary image data or an http://, https:// or ftp://
  //           URL, in which case the data is first fetched with ripRun().
  //$img_name  Target path relative to WRITEDIRECTORY; a leading "/" is ignored.
  //
  //Returns TRUE when the file was written, FALSE (with the error set) otherwise.
  public function writeImage($image, $img_name)
  {
      if (empty($image)) {
          $this->error = "writeImage: There is no image to write.";
          return false;
      }

      //Drop one leading "/" so the path stays relative to WRITEDIRECTORY.
      if (substr($img_name, 0, 1) == '/') {
          $img_name = (string) substr($img_name, 1);
      }
      $img_name = self::WRITEDIRECTORY . $img_name;
      $path_info = pathinfo($img_name);

      //Native mkdir instead of shelling out: no command injection through
      //$img_name, and the failure is actually detectable.
      $folder = $path_info['dirname'];
      if (!is_dir($folder) && !@mkdir($folder, 0777, true) && !is_dir($folder)) {
          $this->error = "write: Creation of $img_name was unsuccessfull";
          return false;
      }

      //The old test '/^[http:|https:|ftp:]/' was a character class, so any
      //data starting with h, t, p, s, f, ":" or "|" was treated as a URL.
      if (is_string($image) && preg_match('/^(?:https?|ftp):\/\//i', $image)) {
          $image = $this->ripRun($image);
          if ($image === false || $image === null || $image === '') {
              $this->error = "writeImage: There is no image to write.";
              return false;
          }
      }

      $file = @fopen($img_name, "w");
      if ($file === false) {
          $this->error = "writeImage: Could not write image";
          return false;
      }
      $written = fwrite($file, $image);
      fclose($file);

      if ($written) {
          return true;
      }
      $this->error = "writeImage: Could not write image";
      return false;
  }
  642. /////////////////////////////////////////////////////////////////////////////////////
  643. //Removes any alert or confirm javascript popups.
  644. //
  645. //Returns clean HTML. Also sets $rawHtml to clean value.
  //Strips "alert(...);" and "confirm(...);" calls.
  //
  //$html  Text to clean; defaults to the instance HTML.
  //
  //Returns the cleaned text, which is also stored as the instance HTML.
  public function ripJSPopups($html = null)
  {
      $subject = is_null($html) ? $this->rawHtml : $html;

      $withoutAlerts = preg_replace('/alert\(.*?\)[;]/', '', $subject);
      $this->rawHtml = preg_replace('/confirm\(.*?\)[;]/', '', $withoutAlerts);

      return $this->rawHtml;
  }
  655. /////////////////////////////////////////////////////////////////////////////////////
  656. //Makes all relative links absolute using the provided $baseUrl. This function is usually
  657. //only used by ripRun() when specified in argument 2.
  658. //
  659. //Returns clean HTML. Also sets $rawHtml to cleaned value
  660. //
  //Rewrites relative URLs in the markup to absolute ones.
  //
  //$baseUrl  Directory URL that relative links resolve against. A "/" is
  //          appended when missing, and a <base href="..."> in the markup
  //          overrides it.
  //$html     Markup to fix; defaults to the instance HTML.
  //
  //Handles the link-carrying attributes listed in $tagAttributes plus @import
  //rules inside <style> blocks. URLs that already carry a scheme (http:, ftp:,
  //javascript:, mailto:, ...) and "#fragment" links are left alone.
  //
  //Returns the fixed markup, which is also stored as the instance HTML.
  public function fixLinks($baseUrl, $html = null)
  {
      if (is_null($html)) {
          $html = $this->rawHtml;
      }
      //DOMDocument::loadHTML() throws on an empty string under PHP 8, and an
      //empty document has nothing to fix anyway.
      if (!is_string($html) || trim($html) === '') {
          $this->rawHtml = $html;
          return $html;
      }

      $tagAttributes = array(
          'table' => 'background',
          'td' => 'background',
          'tr' => 'background',
          'th' => 'background',
          'body' => 'background',
          'a' => 'href',
          'link' => 'href',
          'area' => 'href',
          'form' => 'action',
          'script' => 'src',
          'img' => 'src',
          'input' => 'src',
          'iframe' => 'src',
          'frame' => 'src',
          'embed' => 'src');

      //Scheme and origin (scheme://host[:port]) for root-relative URLs.
      $scheme = 'http';
      $host = '';
      $parts = parse_url($baseUrl);
      if (is_array($parts) && isset($parts['host'])) {
          if (isset($parts['scheme'])) {
              $scheme = $parts['scheme'];
          }
          $host = $scheme . "://" . $parts['host'] . (isset($parts['port']) ? ':' . $parts['port'] : '');
      }

      //A <base href> wins. The match stays inside the <base ...> tag so that a
      //<base target=...> without href cannot pick up a later tag's href.
      if (preg_match('/<base\b[^>]*?\bhref=["\']?([^\'"\s>]*)[\'"\s]?/is', $html, $base) && $base[1] !== '') {
          $baseUrl = $base[1];
          $host = $baseUrl;
          $parts = parse_url($baseUrl);
          if (is_array($parts) && isset($parts['scheme']) && isset($parts['host'])) {
              $scheme = $parts['scheme'];
              $host = $scheme . "://" . $parts['host'] . (isset($parts['port']) ? ':' . $parts['port'] : '');
          }
      }

      // Append a trailing slash to the url if it doesn't exist
      if (substr($baseUrl, -1, 1) != '/') {
          $baseUrl .= '/';
      }

      //The DOM round trip mangles these entities, so they are swapped for
      //placeholders before parsing and restored afterwards.
      $encodingValues = array(
          '&nbsp;' => 's%s%',
          '&copy;' => 'c%c%',
          '&sect;' => 'se%se%',
          '&amp;' => 'a%a%',
          '&cent;' => 'ce%ce%',
          '&pound;' => 'p%p%',
          '&reg;' => 'r%r%',
          '&yen;' => 'y%y%',
          '&deg;' => 'd%d%',
          '&laquo;' => 'l%l%',
          '&quot;' => 'q%q%',
          '&lt;' => 'lt%lt%',
          '&gt;' => 'g%g%',
          '&trade;' => 't%t%',
          '&raquo;' => 'ra%ra%'
      );
      foreach ($encodingValues as $code => $replace) {
          $html = preg_replace("|$code|i", $replace, $html);
      }

      $doc = new DOMDocument();
      @$doc->loadHTML($html);
      foreach ($tagAttributes as $tag => $attribute) {
          foreach ($doc->getElementsByTagName($tag) as $link) {
              //Elements without the attribute used to GET one, set to the base
              //URL (e.g. every <td> gained a background).
              if (!$link->hasAttribute($attribute)) {
                  continue;
              }
              $url = $link->getAttribute($attribute);
              $absolute = $this->fixLinksResolveUrl($url, $baseUrl, $host, $scheme);
              if ($absolute !== $url) {
                  $link->setAttribute($attribute, $absolute);
              }
          }
      }
      $html = $doc->saveHTML();

      foreach ($encodingValues as $code => $replace) {
          $html = preg_replace("|$replace|i", $code, $html);
      }

      //Need to catch the @import styles
      if (preg_match('|<style.*?@import.*?</style>|is', $html)) {
          $data = $this->ripInBetween('<style.*?>', '</style>', null, $html);
          foreach ($data as $css) {
              //Only the targets of @import rules are touched. The previous
              //pass took the first quoted or bracketed text of every style
              //block and replaced it throughout the whole document.
              if (!preg_match_all('/@import\s+(?:url\(\s*)?["\']?([^"\'()\s;]+)/i', $css, $imports, PREG_OFFSET_CAPTURE)) {
                  continue;
              }
              $fixed = $css;
              //Walk backwards so earlier offsets stay valid while splicing.
              for ($i = count($imports[1]) - 1; $i >= 0; $i--) {
                  $target = $imports[1][$i][0];
                  $offset = $imports[1][$i][1];
                  $absolute = $this->fixLinksResolveUrl($target, $baseUrl, $host, $scheme);
                  if ($absolute !== $target) {
                      $fixed = substr_replace($fixed, $absolute, $offset, strlen($target));
                  }
              }
              if ($fixed !== $css) {
                  $html = str_replace($css, $fixed, $html);
              }
          }
      }

      $this->rawHtml = $html;
      return $html;
  }

  //Resolves a single URL for fixLinks(): returns it unchanged when it is empty,
  //a "#fragment" or already carries a scheme, otherwise prefixes the scheme
  //("//host/x"), the origin ("/x") or the base directory ("x", "/../x").
  private function fixLinksResolveUrl($url, $baseUrl, $host, $scheme)
  {
      if ($url === '' || preg_match('/^(?:[a-z][a-z0-9+.\-]*:|#)/i', $url)) {
          return $url;
      }
      if (substr($url, 0, 2) == '//') {
          //Protocol-relative: only the scheme is missing.
          return $scheme . ':' . $url;
      }
      if (substr($url, 0, 1) == "/") {
          if (substr($url, 1, 2) == "..") {
              //Treated as relative, since /../ needs to keep the current
              //path. The missing "else" used to add the host on top of it.
              return $baseUrl . $url;
          }
          return $host . $url;
      }
      return $baseUrl . $url;
  }
  755. /////////////////////////////////////////////////////////////////////////////////////
  756. //Find all links to other files...inside body, img, iframe, etc...
  757. //
  758. //Returns array of all possible external links found
  759. //
  //Collects the URL-carrying attribute values (href, src, action, background)
  //of the tags listed in $tagAttributes, using regular expressions on the raw
  //markup.
  //
  //$html  Markup to scan; defaults to the instance HTML.
  //
  //Returns a flat array of the values found, grouped by tag in table order.
  //Duplicates are kept. Values are cut at the first space, as before.
  public function getAllLinks($html = null)
  {
      if (is_null($html)) {
          $html = $this->rawHtml;
      }
      $links = array();
      $tagAttributes = array(
          'table' => 'background',
          'td' => 'background',
          'tr' => 'background',
          'th' => 'background',
          'body' => 'background',
          'a' => 'href',
          'link' => 'href',
          'area' => 'href',
          'form' => 'action',
          'script' => 'src',
          'img' => 'src',
          'iframe' => 'src',
          'input' => 'src',
          'frame' => 'src',
          'embed' => 'src');

      // Single, double, and no quotes are all supported
      foreach ($tagAttributes as $tag => $attribute) {
          //(?=\s) pins the tag name, so "a" no longer also matches <area> or
          //<abbr>. Excluding ">" from the value stops an unquoted value from
          //running on into the text after the tag.
          $pattern = "/<$tag(?=\s)([^>]*?)\s$attribute=[\"']?([^\"' >]*)[\"']?/is";
          if (preg_match_all($pattern, $html, $matches)) {
              $links = array_merge($links, $matches[2]);
          }
      }
      return $links;
  }
  789. /////////////////////////////////////////////////////////////////////////////////////
  790. //Finds all hidden form elements within specified $formName. If empty, it searches
  791. //the entire document.
  792. //
  793. //Returns asociative array (element name=>element value) of found elements and their
  794. //current values
  795. //
  //Collects the hidden <input> elements of one form.
  //
  //$formName  "name" attribute of the form to read. With NULL, or when several
  //           forms share the name, the LAST matching form in the document is
  //           used.
  //$html      Markup to scan; defaults to the instance HTML.
  //
  //Returns an associative array (element name => value). A name that occurs
  //more than once maps to an array of its values. When no form matches, the
  //error is set and an empty array is returned.
  public function getHiddenFormElements($formName = null, $html = null)
  {
      if (is_null($html)) {
          $html = $this->rawHtml;
      }
      $elementsArray = array();
      $currentForm = null;

      //DOMDocument::loadHTML() throws on an empty string under PHP 8.
      if (is_string($html) && trim($html) !== '') {
          $page = new DOMDocument();
          $page->preserveWhiteSpace = false;
          @$page->loadHTML($html);
          //Look for the form that matches the given name
          foreach ($page->getElementsByTagName('form') as $form) {
              if (is_null($formName) || ($form->getAttribute('name') == $formName)) {
                  $currentForm = $form;
              }
          }
      }

      //Without this guard a page with no matching form ended in a fatal
      //"call to a member function on null".
      if (!is_object($currentForm)) {
          $this->error = "getHiddenFormElements: HTML contains no forms.";
          return $elementsArray;
      }

      foreach ($currentForm->getElementsByTagName('input') as $element) {
          if (strtolower($element->getAttribute('type')) != 'hidden') {
              continue;
          }
          $name = $element->getAttribute('name');
          $value = $element->getAttribute('value');
          if (!array_key_exists($name, $elementsArray)) {
              $elementsArray[$name] = $value;
          } elseif (is_array($elementsArray[$name])) {
              $elementsArray[$name][] = $value;
          } else {
              //Second occurrence of the name: turn the scalar into a list.
              $elementsArray[$name] = array($elementsArray[$name], $value);
          }
      }
      return $elementsArray;
  }
  830. /////////////////////////////////////////////////////////////////////////////////////
  831. //Finds all form elements within specified $formName. If empty, it searches the entire
  832. //document.
  833. //
  834. //Returns asociative array (element name=>element value) of found elements and their
  835. //current values
  836. //
  //Collects the submittable values of one form and records its action/method.
  //
  //$formName  "name" attribute of the form to read. With NULL, or when several
  //           forms share the name, the LAST matching form in the document is
  //           used.
  //$html      Markup to scan; defaults to the instance HTML.
  //
  //Included: <input> elements except submit/button and unchecked
  //radio/checkbox, the selected <option>s of every <select>, and the text of
  //every <textarea>. The form's action and method attributes are stored for
  //getFormAction()/getFormMethod().
  //
  //Returns an associative array (element name => value; repeated names map to
  //an array of values), or FALSE (with the error set) when no form matches.
  public function getFormElements($formName = null, $html = null)
  {
      if (is_null($html)) {
          $html = $this->rawHtml;
      }
      $elementsArray = array();
      $currentForm = null;

      //DOMDocument::loadHTML() throws on an empty string under PHP 8.
      if (is_string($html) && trim($html) !== '') {
          $page = new DOMDocument();
          $page->preserveWhiteSpace = false;
          @$page->loadHTML($html);
          //Look for the form that matches the given name
          foreach ($page->getElementsByTagName('form') as $form) {
              if (is_null($formName) || ($form->getAttribute('name') == $formName)) {
                  $currentForm = $form;
              }
          }
      }
      if (!is_object($currentForm)) {
          $this->error = "getFormElements: HTML contains no forms.";
          return false;
      }

      //Set form info class variables
      $this->formAction = $currentForm->getAttribute('action');
      $this->formMethod = $currentForm->getAttribute('method');

      //Process for input elements
      foreach ($currentForm->getElementsByTagName('input') as $element) {
          $type = strtolower($element->getAttribute('type'));
          if ($type == 'submit' || $type == 'button') {
              continue;
          }
          if (($type == 'radio' || $type == 'checkbox') && !$element->hasAttribute('checked')) {
              //Unchecked boxes and radios are not submitted by a browser.
              continue;
          }
          $this->collectFormValue($elementsArray, $element->getAttribute('name'), $element->getAttribute('value'));
      }

      //Process for select options
      foreach ($currentForm->getElementsByTagName('select') as $element) {
          $name = $element->getAttribute('name');
          foreach ($element->getElementsByTagName('option') as $option) {
              if ($option->hasAttribute('selected')) {
                  $this->collectFormValue($elementsArray, $name, $option->getAttribute('value'));
              }
          }
      }

      //Process for textarea
      foreach ($currentForm->getElementsByTagName('textarea') as $element) {
          $this->collectFormValue($elementsArray, $element->getAttribute('name'), $element->textContent);
      }

      return $elementsArray;
  }

  //Stores $value under $name, turning the entry into a list of values as soon
  //as the same name is seen a second time.
  private function collectFormValue(array &$elements, $name, $value)
  {
      if (!array_key_exists($name, $elements)) {
          $elements[$name] = $value;
      } elseif (is_array($elements[$name])) {
          $elements[$name][] = $value;
      } else {
          $elements[$name] = array($elements[$name], $value);
      }
  }
  932. /////////////////////////////////////////////////////////////////////////////////////
  933. //Gets form information
  934. //
  935. //Returns array of form information
  936. //
  public function getFormInfo($formName = null, $html = null){
      //Reads the "action" and "method" attributes of a form.
      //$formName - value of the form's "name" attribute, or null to accept any form.
      //$html     - markup to inspect; defaults to the page held in $this->rawHtml.
      //Returns array('action' => ..., 'method' => ...), or false (with $this->error
      //set) when the markup is empty or no form matches.
      //Side effect: the two values are also stored in $this->formAction/$this->formMethod.
      if(is_null($html)){
          $html = $this->rawHtml;
      }
      //loadHTML() rejects empty input (an exception under PHP 8 that "@" cannot hide).
      if(!is_string($html) || $html === ''){
          $this->error = "getFormInfo: HTML contains no forms.";
          return false;
      }
      $page = new DOMDocument();
      $page->preserveWhiteSpace = false;
      @$page->loadHTML($html);
      //Look for the form that matches the given name; the last match wins.
      $currentForm = null;
      foreach($page->getElementsByTagName('form') as $form){
          if(is_null($formName) || ($form->getAttribute('name') == $formName)){
              $currentForm = $form;
          }
      }
      //Without this check a page with no matching form would end in a fatal
      //"call to a member function on null" below.
      if(!is_object($currentForm)){
          $this->error = "getFormInfo: HTML contains no forms.";
          return false;
      }
      //Set form info class variables
      $this->formAction = $currentForm->getAttribute('action');
      $this->formMethod = $currentForm->getAttribute('method');
      return array(
          'action' => $this->formAction,
          'method' => $this->formMethod,
      );
  }
  958. /////////////////////////////////////////////////////////////////////////////////////
  959. //Some general statistics for the current ripCurl session. This is a cumulative total
  960. //of all ripRun()'s.
  961. //
  962. //Returns HTML of totals.
  963. //
  public function getStats(){
      //Builds the HTML summary of the running totals kept by this instance.
      //Each entry below becomes one line of the returned fragment.
      $lines = array(
          "<br /><br />",
          "Total Connections: $this->totalConnections<br />",
          "Total Time: $this->totalTime<br />",
          "Total Download Size: $this->totalSize<br />",
          "Average Connection Speed: $this->averageSpeed<br />",
          "Total Redirects: $this->totalRedirects<br />",
          "Average Time per Connection: $this->averageTime<br />",
      );
      return implode("\n", $lines);
  }
  973. /////////////////////////////////////////////////////////////////////////////////////
  974. //Assigns POST data and enables POST settings in the CURL object. Primarily used
  975. //from 3rd argument of ripRun().
  976. //
  977. //Returns true if settings set and data ready for ripRun().
  978. //
  public function postRequest($data){
      //Switches the handle to POST (via post()) and hands the body to postFields().
      //$data - either a ready-made body string, or an array of field => value where
      //        a value may itself be an array (emitted as repeated field=value pairs).
      //Returns true once the body is set and remembered in $this->postData, or
      //false (with $this->error set) when $data is empty.
      if(empty($data)){
          $this->error = "postRequest: No post data specified";
          return false;
      }
      $this->post();
      if(!is_array($data)){
          //A string body is passed through untouched.
          $string = $data;
      } else {
          $pairs = array();
          foreach($data as $k=>$v){
              if(is_array($v)){
                  foreach($v as $subval){
                      $pairs[] = urlencode($k) . "=" . urlencode($subval);
                  }
              }else{
                  $pairs[] = urlencode($k) . "=" . urlencode($v);
              }
          }
          $string = implode('&', $pairs);
      }
      $this->postFields($string);
      //Remember exactly what was sent, for both string and array input.
      $this->postData = $string;
      return true;
  }
  1005. /////////////////////////////////////////////////////////////////////////////////////
  1006. //Sets up data for get requests
  1007. //
  1008. //Returns query string
  public function getRequest($data){
      //Turns an array of field => value into a "?a=1&b=2" query string. A value that
      //is itself an array is emitted as repeated field=value pairs.
      //Anything that is not an array is handed back unchanged.
      if(!is_array($data)){
          return $data;
      }
      $pairs = array();
      foreach($data as $field=>$value){
          $encodedField = urlencode($field);
          if(!is_array($value)){
              $pairs[] = $encodedField . "=" . urlencode($value);
              continue;
          }
          foreach($value as $item){
              $pairs[] = $encodedField . "=" . urlencode($item);
          }
      }
      return "?" . implode("&", $pairs);
  }
  1027. /////////////////////////////////////////////////////////////////////////////////////
  1028. //Remove all HTML comments
  1029. //
  1030. //Returns HTML stripped of comments.
  1031. //
  public function stripComments($html = null){
      //Work on the supplied markup, or on the page held by the instance when none is given.
      $source = is_null($html) ? $this->rawHtml : $html;
      //sandr() receives a non-greedy comment pattern and an empty replacement.
      //The result always becomes the instance's current page, even when $html
      //was passed in explicitly.
      $this->rawHtml = $this->sandr('<!--.*?-->', '', $source);
      return $this->rawHtml;
  }
  1040. /////////////////////////////////////////////////////////////////////////////////////
  1041. //Retrieve 'href' value for a given link value
  1042. //
  1043. //Returns array of matching values
  1044. //
  public function getNamedLinks($id, $html = null){
      //Collects the href of every <a> whose text matches $id.
      //$id   - inserted as-is into a regular expression delimited by "|", so it is
      //        treated as a pattern and must not contain an unescaped "|".
      //$html - markup to inspect; defaults to the page held in $this->rawHtml.
      //Returns the matching hrefs with duplicates removed. array_unique() keeps the
      //original keys, so the keys may have gaps.
      if(is_null($html)){
          $html = $this->rawHtml;
      }
      $return = array();
      //loadHTML() rejects empty input (an exception under PHP 8 that "@" cannot
      //hide); an empty page simply has no links.
      if(!is_string($html) || $html === ''){
          return $return;
      }
      $doc = new DOMDocument();
      @$doc->loadHTML($html);
      foreach($doc->getElementsByTagName('a') as $tag){
          if(preg_match("|$id|s", $tag->nodeValue)){
              $return[] = $tag->getAttribute('href');
          }
      }
      return array_unique($return);
  }
  1064. /////////////////////////////////////////////////////////////////////////////////////
  1065. //Main ripCurl execution method. $url is the URL to pull from. if not set, it is taken
  1066. //from $this->ripUrl. $fixlinks is boolean option to automatically fix relative links.
  1067. //$postdata can be array or urlencoded string. Result page is both returned and set
  1068. //in $this->rawHtml.
  1069. //
  1070. //Returns HTML content retrieved from URL query.
  1071. //
  public function ripRun($url, $fixLinks = null, $postdata = null){
      //Fetches $url with the instance's curl handle.
      //$url      - address to request.
      //$fixLinks - when truthy, fixLinks() is run on the result with the URL's
      //            directory as base.
      //$postdata - optional form data (array or string). It is appended as a query
      //            string when the last parsed form method is "get", otherwise it
      //            is sent through postRequest().
      //Returns whatever getRawHtml() reports after the transfer. The raw curl_exec()
      //result is stored in $this->rawHtml and the cumulative statistics are updated.
      $this->url($url);
      if(!is_null($postdata)){
          //formMethod is only set once a form has been parsed; treat "unset" as POST.
          $useGet = isset($this->formMethod) && strtolower($this->formMethod) == 'get';
          if($useGet){
              $url = $url . $this->getRequest($postdata);
              $this->url($url);
          }else{
              $this->url($url);
              $this->postRequest($postdata);
          }
      }
      $html = curl_exec($this->ch);
      if($html === false){
          //Keep the transport failure visible instead of silently storing false.
          $this->error = "ripRun: " . curl_error($this->ch);
      }
      $this->rawHtml = $html;
      $this->lastUrl = $url;
      //$this->lastUrl = curl_getinfo($this->ch, CURLINFO_EFFECTIVE_URL);
      if($fixLinks){
          $parse = parse_url($url);
          //A URL such as "http://host" has no path component at all.
          $path = isset($parse['path']) ? $parse['path'] : '';
          //If the path ends in something that looks like a file name (contains a
          //dot), cut it back to its directory.
          if(preg_match('|.*?/([^/]*\..*?[^/]*)|is', $path)){
              $path = preg_replace('|(.*?/)([^/]*\..*?[^/]*)|is', '$1', $path);
          }
          $host = $parse['scheme'] . "://" . $parse['host'];
          //Keep a non-default port, otherwise rebuilt links point at the wrong server.
          if(isset($parse['port'])){
              $host .= ":" . $parse['port'];
          }
          $host .= $path;
          $this->fixLinks($host, $this->rawHtml);
      }
      //Stats tracking
      $this->totalConnections = $this->totalConnections + 1;
      $this->totalTime = $this->totalTime + $this->getInfo('total_time');
      $this->totalSize = $this->totalSize + $this->getInfo('size_download');
      //Running mean: re-expand the previous mean to a sum before adding the new sample.
      $previousSpeedSum = $this->averageSpeed * ($this->totalConnections - 1);
      $this->averageSpeed = ($previousSpeedSum + $this->getInfo('speed_download')) / $this->totalConnections;
      $redir = $this->getInfo('redirect_time');
      if(!empty($redir)){
          $this->totalRedirects = $this->totalRedirects + 1;
      }
      $this->averageTime = $this->totalTime / $this->totalConnections;
      $status = curl_getinfo($this->ch, CURLINFO_HTTP_CODE);
      //The class declares $lastStatusCode as "Last ripRun status code" but this
      //method historically wrote $lastStatus. Both are set so readers of either
      //keep working. NOTE(review): confirm which one the accessors use and drop the other.
      $this->lastStatus = $status;
      $this->lastStatusCode = $status;
      return $this->getRawHtml();
  }
  1109. /////////////////////////////////////////////////////////////////////////////////////
  1110. //XML methods
  1111. /////////////////////////////////////////////////////////////////////////////////////
  1112. /////////////////////////////////////////////////////////////////////////////////////
  1113. //Parses XML file or http address where the output is valid XML
  1114. //
  1115. //Returns array
  1116. //
  public function getXMLArray($filename){
      //Loads an XML file and converts it to a nested array with xml2array().
      //$filename - path that must pass is_file().
      //Returns the array, or false (with $this->error set) when the file is missing,
      //unreadable or not well-formed XML.
      if(!is_file($filename)){
          $this->error = "getXMLArray: Bad filename $filename";
          return false;
      }
      $contents = file_get_contents($filename);
      if($contents === false){
          $this->error = "getXMLArray: Unable to read $filename";
          return false;
      }
      //Drop a UTF-8 byte order mark and leading whitespace so the declaration
      //check below looks at the first real character.
      if(substr($contents, 0, 3) === "\xEF\xBB\xBF"){
          $contents = substr($contents, 3);
      }
      $contents = ltrim($contents);
      //Only add a declaration when the document does not already start with one;
      //a second declaration makes the document unparsable.
      if(!preg_match('/^<\?xml\s/i', $contents)){
          $contents = '<?xml version="1.0"?>' . $contents;
      }
      $xml = new DOMDocument();
      $xml->preserveWhiteSpace = false;
      if(!@$xml->loadXML($contents)){
          $this->error = "getXMLArray: Unable to parse XML in $filename";
          return false;
      }
      return $this->xml2array($xml);
  }
  1132. /////////////////////////////////////////////////////////////////////////////////////
  1133. //Creates array of XML file or existing XML array and pull out only the desired named
  1134. //fields.
  1135. //
  1136. //Returns array of named fields
  1137. //
  public function getXMLElements($filename, $elementName){
      //Collects every scalar value stored under $elementName, at any depth.
      //$filename    - path of an XML file (parsed with getXMLArray()) or an array
      //               that is already in the nested form.
      //$elementName - element name to look for.
      //Returns a flat list of the values found (empty when there are none), or
      //false (with $this->error set) when the input is neither a readable file
      //nor an array, or the file cannot be parsed into an array.
      //The array case is tested first: is_file() must not be given an array.
      if(is_array($filename)){
          $data = $filename;
      }elseif(is_string($filename) && is_file($filename) && is_readable($filename)){
          $data = $this->getXMLArray($filename);
          if(!is_array($data)){
              $this->error = "getXMLElements: Unable to parse $filename";
              return false;
          }
      }else{
          $this->error = "getXMLElements: Bad filename or array given";
          return false;
      }
      return $this->collectXMLElements($data, $elementName, null);
  }
  //Helper for getXMLElements: walks the nested array depth-first. Integer keys mark
  //the entries of a repeated element, so they are matched under the name of the
  //key that holds the list ($listName) instead of their index.
  //
  //Returns flat list of matching scalar values.
  //
  private function collectXMLElements($data, $elementName, $listName){
      $found = array();
      foreach($data as $key=>$value){
          $name = is_int($key) ? $listName : $key;
          if(is_array($value)){
              $found = array_merge($found, $this->collectXMLElements($value, $elementName, $name));
          }elseif(!is_null($name) && (string)$name === (string)$elementName){
              $found[] = $value;
          }
      }
      return $found;
  }
  1156. /////////////////////////////////////////////////////////////////////////////////////
  1157. //Helper function for XML parsing
  1158. //
  1159. //Returns XML structure as array
  1160. //
  private function xml2array($n){
      //Converts the children of DOM node $n into a nested array.
      // - An element name that occurs once maps to its converted value.
      // - A name that occurs several times maps to a list of the converted values,
      //   in document order.
      // - An element without children becomes ''.
      // - A node whose children are only text/CDATA is returned as that text.
      //Comments and other non-element nodes are skipped, as is text that sits next
      //to element children. A node with no usable children yields an empty array.
      $return = array();
      $seen = array();
      $text = '';
      $hasText = false;
      foreach($n->childNodes as $nc){
          if($nc->nodeType == XML_TEXT_NODE || $nc->nodeType == XML_CDATA_SECTION_NODE){
              $text .= $nc->nodeValue;
              $hasText = true;
              continue;
          }
          if($nc->nodeType != XML_ELEMENT_NODE){
              continue;
          }
          $name = $nc->nodeName;
          $value = $nc->hasChildNodes() ? $this->xml2array($nc) : '';
          if(!isset($seen[$name])){
              //First occurrence: store the value directly.
              $return[$name] = $value;
              $seen[$name] = 1;
          }elseif($seen[$name] == 1){
              //Second occurrence: promote the entry to a list. The counter is
              //needed because the first value may itself be an array.
              $return[$name] = array($return[$name], $value);
              $seen[$name] = 2;
          }else{
              $return[$name][] = $value;
          }
      }
      if(empty($seen) && $hasText){
          return $text;
      }
      return $return;
  }
  1176. /////////////////////////////////////////////////////////////////////////////////////
  1177. //This section is to layout future development ideas and projects
  1178. //
  1179. //Expansion ideas:
  1180. //Expand image features with GD lib
  1181. //Build XML parsing methods
  1182. //Include validation checks for XML, HTML, CSS, etc.
  1183. //Include limited JavaScript parsing capabilities
  1184. //Build pagination methods with callback functionality
  1185. //
  1186. /////////////////////////////////////////////////////////////////////////////////////
  1187. }
  1188. ?>