PageRenderTime 52ms CodeModel.GetById 21ms RepoModel.GetById 0ms app.codeStats 0ms

/core/classes/remotefile.class.php

http://github.com/newscloud/open-social-media-toolkit
PHP | 486 lines | 377 code | 46 blank | 63 comment | 74 complexity | 6094e42b729d15b0110b8bcf692cf76e MD5 | raw file
  1. <?php
  class remotePageProperty extends remoteFileProperty {
      /**
       * Fetches a remote HTML page and extracts simple properties from its
       * markup: the title, long text paragraphs, an RSS feed link and the
       * JPEG images it references.
       *
       * Header information comes from the parent class (a HEAD request over a
       * raw socket); the page body is fetched separately with fopen().
       */
      var $parsed_response = array();   // status line split into version/response/response_class/status
      var $page_url = "";
      var $page_content = "";           // raw body of the page, filled by _getPageContent()
      var $page_parsed_xml = array();
      var $minSentenceLength=110;       // paragraphs whose text is not longer than this are skipped
      var $contentRetrievalLimit;       // stored for callers; NOT enforced by _getPageContent() (see there)
      var $socket_timeout = 10;         // seconds, forwarded to the parent's HEAD request

      /**
       * PHP 5+ constructor. PHP 8 no longer treats a method named after the
       * class as a constructor, so this delegates to the legacy one.
       */
      function __construct($url, $open_socket_on_instantiation = true, $timeout = 10, $limit = 250000)
      {
          $this->remotePageProperty($url, $open_socket_on_instantiation, $timeout, $limit);
      }

      /**
       * Legacy (PHP 4 style) constructor, kept so existing callers and
       * subclasses that invoke it by name keep working.
       *
       * @param string $url                           page to inspect
       * @param bool   $open_socket_on_instantiation  fetch headers and body right away
       * @param int    $timeout                       seconds, used for the HEAD request
       * @param int    $limit                         stored in $contentRetrievalLimit
       */
      function remotePageProperty($url, $open_socket_on_instantiation = true, $timeout = 10,$limit=250000)
      {
          $this->page_url = $url;
          $this->socket_timeout = $timeout;
          $this->contentRetrievalLimit = $limit;
          // Initialise the parent's URL fields without touching the network so
          // that getPageImages() can resolve relative URLs even when the
          // caller supplies $page_content itself.
          $this->remoteFileProperty($url, false, $timeout);
          if ($open_socket_on_instantiation) {
              $this->openPage();
          }
      }

      /**
       * Requests the headers (via the parent class), parses the status line
       * and then downloads the page body.
       */
      function openPage() {
          $this->remoteFileProperty($this->page_url, true, $this->socket_timeout);
          $this->parsed_response = $this->_parseResponseHeader($this->headers_raw);
          $this->_getPageContent($this->socket_timeout);
      }

      /**
       * @return string|false text of the first <title> element, or false when none is found
       */
      function getPageTitle()
      {
          if (preg_match("/<title[^>]*>([^<]+)<\/title>/i", $this->page_content, $match)) {
              return $match[1];
          }
          return false;
      }

      /**
       * @return string text of all <p> elements that contain no nested tags and
       *                are longer than $minSentenceLength, separated by spaces
       */
      function getPageParagraphs() {
          return $this->_extractLongText('/<p[^>]*>([^<]*)<\/p>/i');
      }

      /**
       * Same as getPageParagraphs() but looks at <div> elements; intended for
       * pages that do not mark their text up with <p>.
       *
       * @return string
       */
      function getAltPageParagraphs() {
          return $this->_extractLongText('/<div[^>]*>([^<]*)<\/div>/i');
      }

      /**
       * Shared worker for the paragraph extractors: runs $regex over the page,
       * converts capture group 1 of every match to plain text with html2text
       * and concatenates the results that are long enough.
       *
       * Relies on the PATH_CORE constant and the html2text class, both defined
       * outside this file.
       *
       * @param string $regex pattern whose first capture group is the element content
       * @return string
       */
      function _extractLongText($regex)
      {
          require_once(PATH_CORE.'/utilities/class.html2text.inc');
          $text = '';
          if (!preg_match_all($regex, $this->page_content, $matches, PREG_PATTERN_ORDER)) {
              return $text;
          }
          foreach ($matches[1] as $fragment) {
              // plain assignment: "=& new" is a parse error since PHP 7
              $h2t = new html2text($fragment);
              $plain = $h2t->get_text();
              if (strlen($plain) > $this->minSentenceLength) { // only add longer sentences
                  $text .= $plain . ' ';
              }
          }
          return $text;
      }

      /**
       * @return string|false href of the first RSS <link rel="alternate"> element, or false
       */
      function getPageFeed()
      {
          $pattern = "/<link rel\=\"alternate\" type\=\"application\/rss[\+|\ ]xml\" (title=\"[^\"]+\"\ ){0,1}href=\"([^\"]+)\"/i";
          if (preg_match($pattern, $this->page_content, $match) && $match[2] != '') {
              return $match[2];
          }
          return false;
      }

      /**
       * Lists the JPEG images referenced by the page as absolute URLs.
       *
       * Only URLs ending in ".jpg" survive the final filter, so an image whose
       * query string is kept is dropped unless the query itself ends in ".jpg".
       *
       * @param bool $dropQueryString strip "?query" from site-relative image URLs
       *                              (fully qualified URLs are returned untouched)
       * @return array unique image URLs; keys are not re-indexed
       */
      function getPageImages($dropQueryString=false)
      {
          $ret = array();
          foreach ($this->_parsePageImages() as $attrib) {
              $ret[] = $this->_absoluteImageURL(trim($attrib['src']), $dropQueryString);
          }
          // remove duplicates
          $ret = array_unique($ret);
          // get only JPGs
          $ret = preg_grep("/\.jpg$/i", $ret);
          return $ret;
      }

      /**
       * Resolves one image "src" value against the page URL.
       *
       * Handles fully qualified URLs, protocol-relative URLs ("//host/x.jpg"),
       * root-relative paths ("/x.jpg") and document-relative paths
       * ("img/x.jpg", "../x.jpg"). Anything else is returned unchanged.
       *
       * @param string $src
       * @param bool   $dropQueryString
       * @return string
       */
      function _absoluteImageURL($src, $dropQueryString)
      {
          $p = parse_url($src);
          if (!is_array($p)) {
              return $src;
          }
          if (isset($p['scheme']) && isset($p['host'])) {
              return $src; // already absolute
          }
          $base = is_array($this->parsed_url) ? $this->parsed_url : array();
          $scheme = isset($base['scheme']) ? $base['scheme'] : 'http';
          if (isset($p['host'])) {
              // protocol-relative: keep its own host, borrow the page's scheme
              return $scheme . ':' . $src;
          }
          if (!isset($p['path']) || empty($base['host'])) {
              return $src; // nothing to resolve, or nothing to resolve against
          }
          $path = $p['path'];
          if (substr($path, 0, 1) != '/') {
              // document-relative: resolve against the directory of the page
              $base_path = isset($base['path']) ? $base['path'] : '/';
              $slash = strrpos($base_path, '/');
              $directory = ($slash === false) ? '/' : substr($base_path, 0, $slash + 1);
              $segments = array();
              foreach (explode('/', $directory . $path) as $segment) {
                  if ($segment == '..') {
                      if (count($segments) > 1) { // never climb above the root
                          array_pop($segments);
                      }
                  } else if ($segment != '.') {
                      $segments[] = $segment;
                  }
              }
              $path = implode('/', $segments);
          }
          $authority = $base['host'] . (isset($base['port']) ? ':' . $base['port'] : '');
          $url = $scheme . "://" . $authority . $path;
          if (isset($p['query']) && !$dropQueryString) {
              $url .= "?" . $p['query'];
          }
          return $url;
      }

      /**
       * Returns one attribute array per <img> tag that has a "src" attribute.
       * Attribute names are lower-cased; values have surrounding quotes
       * removed. URLs are returned as written in the page and still need
       * resolving by the caller, hence: _private.
       *
       * @return array list of array(attribute => value)
       */
      function _parsePageImages()
      {
          $images = array();
          // the /i flag makes the tag match case-insensitive
          preg_match_all("/<img\b([^>]+)/i", $this->page_content, $tags);
          foreach ($tags[1] as $tag_body) {
              // start from a clean slate so attributes of a previous image
              // cannot leak into this one
              $attributes = array();
              // attribute=value, where value is "double quoted", 'single quoted' or bare
              $found = preg_match_all('/([\w:-]+)\s*=\s*("[^"]*"|\'[^\']*\'|[^\s"\'>]+)/', $tag_body, $pairs);
              if ($found) {
                  foreach ($pairs[1] as $idx => $attribute) {
                      $attributes[strtolower($attribute)] = $this->cleanQuotes($pairs[2][$idx]);
                  }
              }
              // only count it if the 'src' attribute is set
              if (isset($attributes['src'])) {
                  $images[] = $attributes;
              }
          }
          return $images;
      }

      /**
       * Reads the dimensions of a JPEG from its header segments without
       * downloading the whole image. Does not work for GIF or PNG.
       *
       * Walks the marker segments that follow the SOI marker until a
       * start-of-frame (SOF) segment is found, so both JFIF and EXIF files
       * are handled. Failure to open the file returns FALSE instead of
       * terminating the script.
       *
       * @param string $img_loc path or URL readable by fopen()
       * @return array|false array(width, height), or FALSE when unavailable
       */
      function getjpegsize($img_loc) {
          if ($img_loc == '') {
              return FALSE;
          }
          $handle = fopen($img_loc, "rb");
          if (!$handle) {
              return FALSE;
          }
          $size = $this->_scanJpegSegments($handle);
          fclose($handle);
          return $size;
      }

      /**
       * Scans an open JPEG stream for the first SOF segment.
       *
       * @param resource $handle stream positioned at the start of the file
       * @return array|false array(width, height) or FALSE
       */
      function _scanJpegSegments($handle)
      {
          // SOF0-SOF15 except 0xC4 (DHT), 0xC8 (reserved) and 0xCC (DAC)
          $sof_markers = array("\xC0", "\xC1", "\xC2", "\xC3", "\xC5", "\xC6", "\xC7",
                               "\xC9", "\xCA", "\xCB", "\xCD", "\xCE", "\xCF");
          if ($this->_readExactly($handle, 2) !== "\xFF\xD8") {
              return FALSE; // no SOI marker: not a JPEG
          }
          while (true) {
              // every segment starts with 0xFF, a marker byte and a 2-byte
              // big-endian length that includes the length field itself
              $header = $this->_readExactly($handle, 4);
              if ($header === false || $header[0] !== "\xFF") {
                  return FALSE;
              }
              $length = (ord($header[2]) << 8) | ord($header[3]);
              if ($length < 2) {
                  return FALSE; // corrupt length; would otherwise never advance
              }
              if (in_array($header[1], $sof_markers, true)) {
                  // SOF payload: precision (1 byte), height (2), width (2)
                  $frame = $this->_readExactly($handle, 5);
                  if ($frame === false) {
                      return FALSE;
                  }
                  $height = (ord($frame[1]) << 8) | ord($frame[2]);
                  $width = (ord($frame[3]) << 8) | ord($frame[4]);
                  return array($width, $height);
              }
              // not a frame header: skip the rest of this segment
              if ($this->_readExactly($handle, $length - 2) === false) {
                  return FALSE;
              }
          }
      }

      /**
       * Reads exactly $length bytes, looping because fread() on network
       * streams may return fewer bytes than requested.
       *
       * @param resource $handle
       * @param int      $length
       * @return string|false the bytes, or false if the stream ended first
       */
      function _readExactly($handle, $length)
      {
          $data = '';
          while (strlen($data) < $length) {
              if (feof($handle)) {
                  return false;
              }
              $chunk = fread($handle, $length - strlen($data));
              if ($chunk === false || $chunk === '') {
                  return false;
              }
              $data .= $chunk;
          }
          return $data;
      }

      /**
       * Asks the server for the size of a remote file with a header-only
       * cURL request.
       *
       * @param string $url
       * @param string $user optional HTTP basic-auth user
       * @param string $pw   optional HTTP basic-auth password
       * @return string the Content-Length value (digits), or "unknown"
       */
      function remote_filesize($url, $user = "", $pw = "")
      {
          $ch = curl_init($url);
          if (!$ch) {
              return "unknown";
          }
          curl_setopt($ch, CURLOPT_HEADER, 1);
          curl_setopt($ch, CURLOPT_NOBODY, 1);
          // hand the response back as a string instead of capturing it
          // through output buffering
          curl_setopt($ch, CURLOPT_RETURNTRANSFER, 1);
          if(!empty($user) && !empty($pw))
          {
              $headers = array('Authorization: Basic ' . base64_encode("$user:$pw"));
              curl_setopt($ch, CURLOPT_HTTPHEADER, $headers);
          }
          $head = curl_exec($ch);
          if (PHP_VERSION_ID < 80000) {
              curl_close($ch); // has no effect from PHP 8.0 on
          }
          if (!is_string($head)) {
              return "unknown";
          }
          // header names are case-insensitive; capture digits only so that a
          // one-digit length does not drag the trailing "\r" along
          if (preg_match('/^Content-Length:[ \t]*([0-9]+)/mi', $head, $matches)) {
              return $matches[1];
          }
          return "unknown";
      }

      /**
       * Validates an image URL. No repair is implemented: a URL that does not
       * look like a well-formed http(s) URL yields null.
       *
       * @param string $url
       * @return string|null the unmodified URL when it looks valid, otherwise null
       */
      function _repairImageURL($url)
      {
          if(preg_match('/^(http|https):\/\/[a-z0-9]+([\-\.]{1}[a-z0-9]+)*\.[a-z]{2,5}((:[0-9]{1,5})?\/.*)?$/i', $url)) {
              // url looks ok, return unmodified
              return $url;
          }
          return null;
      }

      /**
       * Strips surrounding whitespace, then single quotes, then double quotes.
       *
       * @param string $string
       * @return string
       */
      function cleanQuotes($string)
      {
          return trim(trim(trim($string), "'"), '"');
      }

      /**
       * Downloads the page body into $page_content, replacing any previous
       * content. When the URL cannot be opened the parsed status is set to
       * 'DEAD' so that isAlive() reports false.
       *
       * $contentRetrievalLimit is not enforced here: the page is always read
       * to the end.
       *
       * @param int $timeout currently unused; kept for interface compatibility
       */
      function _getPageContent($timeout = 10)
      {
          $this->page_content = '';
          $handle = ($this->page_url != '') ? fopen($this->page_url, "r") : false;
          if (!$handle) {
              $this->parsed_response['status'] = 'DEAD';
              return;
          }
          while (!feof($handle)) {
              $chunk = fgets($handle, 4096);
              if ($chunk === false) {
                  break; // read error or timeout: feof() would never turn true
              }
              $this->page_content .= $chunk;
          }
          fclose($handle);
      }

      /**
       * Checks for basic aliveness only -- doesn't account for redirects.
       *
       * @return bool true when the reason phrase of the status line is "OK"
       */
      function isAlive()
      {
          return isset($this->parsed_response['status'])
              && $this->parsed_response['status'] == 'OK';
      }

      /**
       * @return bool true when the response code is in the 3xx class
       */
      function hasMoved()
      {
          return isset($this->parsed_response['response_class'])
              && $this->parsed_response['response_class'] == '3';
      }

      /**
       * @return string|false the Location header of a 3xx response, or false
       */
      function getRedirect()
      {
          if (!$this->hasMoved() || !is_array($this->headers)) {
              return false;
          }
          // header names are case-insensitive
          foreach ($this->headers as $name => $value) {
              if (strcasecmp((string) $name, 'Location') == 0) {
                  return $value;
              }
          }
          return false;
      }

      /**
       * Splits the HTTP status line at the start of $headerstring.
       *
       * @param string $headerstring raw response headers
       * @return array keys: version, response (numeric code as string),
       *               response_class (first digit of the code) and status
       *               (first word of the reason phrase); values are null
       *               ('' for response_class) when no status line is present
       */
      function _parseResponseHeader($headerstring)
      {
          $ret = array('version' => null, 'response' => null, 'response_class' => '', 'status' => null);
          // the reason phrase is optional; [ \t] keeps the match on the status line
          if (preg_match("/^HTTP\/([^\s]+)[ \t]+([0-9]+)(?:[ \t]+([^\s]+))?/", (string) $headerstring, $m)) {
              $ret['version'] = $m[1];
              $ret['response'] = $m[2];
              $ret['response_class'] = substr($m[2], 0, 1);
              $ret['status'] = isset($m[3]) ? $m[3] : null;
          }
          return $ret;
      }
  }
  class remoteFileLinkStatus extends remoteFileProperty {
      /**
       * Reports the link status of a remote file: whether it answered "OK",
       * whether it has moved (3xx) and where it redirects to.
       */
      var $parsed_response = array();   // status line split into version/response/response_class/status

      /**
       * PHP 5+ constructor. PHP 8 no longer treats a method named after the
       * class as a constructor, so this delegates to the legacy one.
       */
      function __construct($url, $live = true, $timeout = 10)
      {
          $this->remoteFileLinkStatus($url, $live, $timeout);
      }

      /**
       * Legacy (PHP 4 style) constructor, kept for callers that invoke it by name.
       *
       * @param string $url     file to check
       * @param bool   $live    request the headers immediately
       * @param int    $timeout seconds
       */
      function remoteFileLinkStatus($url, $live = true, $timeout = 10)
      {
          // construct parent class
          $this->remoteFileProperty($url, $live, $timeout);
          // parse the response headers and store as an attribute
          $this->parsed_response = $this->_parseResponseHeader($this->headers_raw);
      }

      /**
       * Checks for basic aliveness only -- doesn't account for redirects.
       *
       * @return bool true when the reason phrase of the status line is "OK"
       */
      function isAlive()
      {
          return isset($this->parsed_response['status'])
              && $this->parsed_response['status'] == 'OK';
      }

      /**
       * @return bool true when the response code is in the 3xx class
       */
      function hasMoved()
      {
          return isset($this->parsed_response['response_class'])
              && $this->parsed_response['response_class'] == '3';
      }

      /**
       * @return string|false the Location header of a 3xx response, or false
       */
      function getRedirect()
      {
          if (!$this->hasMoved() || !is_array($this->headers)) {
              return false;
          }
          // header names are case-insensitive
          foreach ($this->headers as $name => $value) {
              if (strcasecmp((string) $name, 'Location') == 0) {
                  return $value;
              }
          }
          return false;
      }

      /**
       * Splits the HTTP status line at the start of $headerstring.
       *
       * @param string $headerstring raw response headers
       * @return array keys: version, response (numeric code as string),
       *               response_class (first digit of the code) and status
       *               (first word of the reason phrase); values are null
       *               ('' for response_class) when no status line is present
       */
      function _parseResponseHeader($headerstring)
      {
          $ret = array('version' => null, 'response' => null, 'response_class' => '', 'status' => null);
          // the reason phrase is optional; [ \t] keeps the match on the status line
          if (preg_match("/^HTTP\/([^\s]+)[ \t]+([0-9]+)(?:[ \t]+([^\s]+))?/", (string) $headerstring, $m)) {
              $ret['version'] = $m[1];
              $ret['response'] = $m[2];
              $ret['response_class'] = substr($m[2], 0, 1);
              $ret['status'] = isset($m[3]) ? $m[3] : null;
          }
          return $ret;
      }
  }
  class remoteFileProperty {
      /**
       * Builds information about a remote file from the response headers of
       * an HTTP HEAD request.
       * Warning: opens a socket when instantiated unless told otherwise
       * (speed is dependent on network conditions).
       */
      var $file_url = "";
      var $parsed_url = array();   // result of parse_url(); empty array when the URL cannot be parsed
      var $headers = array();      // header name => value as sent by the server; false when the request failed
      var $headers_raw = "";       // unparsed response, status line included
      var $error = false;          // false, or a short message describing the last failure

      /**
       * PHP 5+ constructor. PHP 8 no longer treats a method named after the
       * class as a constructor, so this delegates to the legacy one.
       */
      function __construct($url, $open_socket_on_instantiation = true, $timeout = 10)
      {
          $this->remoteFileProperty($url, $open_socket_on_instantiation, $timeout);
      }

      /**
       * Legacy (PHP 4 style) constructor. Subclasses call it by name, so it
       * stays available as a regular method.
       *
       * @param string $url                           URL of the remote file
       * @param bool   $open_socket_on_instantiation  request the headers immediately
       * @param int    $timeout                       seconds for connecting and reading
       */
      function remoteFileProperty($url, $open_socket_on_instantiation = true, $timeout = 10)
      {
          $this->file_url = $url;
          $parsed = parse_url($url); // split url into components
          $this->parsed_url = is_array($parsed) ? $parsed : array();
          if ($open_socket_on_instantiation) {
              $this->headers = $this->_getHTTPHeaders($timeout);
          }
      }

      /**
       * Sends a HEAD request for the URL and returns the response headers.
       *
       * Supports http and https (the latter needs PHP's OpenSSL support),
       * honours an explicit port and includes the query string in the
       * request. The raw response is stored in $headers_raw. On failure
       * $error is set and false is returned.
       *
       * @param int $timeout seconds, optional
       * @return array|false array('headername' => 'headervalue'), or false
       */
      function _getHTTPHeaders($timeout = 10)
      {
          $parsed = is_array($this->parsed_url) ? $this->parsed_url : array();
          $scheme = isset($parsed['scheme']) ? strtolower($parsed['scheme']) : '';
          if (($scheme != "http" && $scheme != "https") || empty($parsed['host'])) {
              $this->error = "Problem parsing URL";
              return false; // url is invalid or not http(s)
          }

          $secure = ($scheme == "https");
          $default_port = $secure ? 443 : 80;
          $port = isset($parsed['port']) ? (int) $parsed['port'] : $default_port;
          $fp = fsockopen(($secure ? "ssl://" : "") . $parsed['host'], $port, $errno, $errstr, $timeout);
          if (!$fp) {
              $this->error = "Problem with connection";
              return false; // problem with the socket
          }

          // request target: path (defaulting to "/") plus the query string
          $target = (isset($parsed['path']) && $parsed['path'] != "") ? $parsed['path'] : "/";
          if (isset($parsed['query']) && $parsed['query'] != "") {
              $target .= "?" . $parsed['query'];
          }
          $host_header = $parsed['host'];
          if ($port != $default_port) {
              $host_header .= ":" . $port;
          }

          stream_set_timeout($fp, $timeout);
          // HEAD requests headers only, no content
          fputs($fp, "HEAD " . $target . " HTTP/1.1\r\n");
          fputs($fp, "Host: " . $host_header . "\r\n");
          fputs($fp, "Connection: close\r\n\r\n");

          $ret = array();
          $this->headers_raw = "";
          while (!feof($fp)) {
              // read whole lines: a fixed small length would cut long values
              // such as Location in two
              $line = fgets($fp);
              if ($line === false) {
                  break; // timeout or read error; feof() would stay false
              }
              // keep a copy of raw response
              $this->headers_raw .= $line;
              if (rtrim($line, "\r\n") === "") {
                  break; // blank line marks the end of the headers
              }
              // split "Name: value" at the first colon
              if (preg_match("/^([^:]*):(.*)$/s", $line, $matches)) {
                  $ret[trim($matches[1])] = trim($matches[2]);
              }
          }
          fclose($fp);
          return $ret; // all is well, return the array
      }

      /**
       * Looks a response header up by name, ignoring case.
       *
       * @param string $name
       * @return string|null the header value, or null when absent
       */
      function _findHeader($name)
      {
          if (!is_array($this->headers)) {
              return null;
          }
          if (isset($this->headers[$name])) {
              return $this->headers[$name];
          }
          foreach ($this->headers as $key => $value) {
              if (strcasecmp((string) $key, $name) == 0) {
                  return $value;
              }
          }
          return null;
      }

      /**
       * Returns the file size in bytes.
       *
       * @return string|int the Content-Length header value as sent by the
       *                    server, or the integer from _guessSize() when the
       *                    header is missing
       */
      function getSize()
      {
          $length = $this->_findHeader('Content-Length');
          if ($length !== null) {
              return $length;
          }
          return $this->_guessSize();
      }

      /**
       * @return int fixed placeholder size used when the server reports none
       */
      function _guessSize()
      {
          $dummy = 1000000;
          return $dummy;
      }

      /**
       * Returns the MIME type string.
       *
       * @return string|null the Content-Type header, or a guess based on the
       *                     file extension; null when neither is available
       */
      function getMIMEType()
      {
          $type = $this->_findHeader('Content-Type');
          if ($type !== null) {
              return $type;
          }
          // if header is missing type, try guessing
          return $this->_guessMIMEType();
      }

      /**
       * Maps the file extension of the URL (case-insensitively) to a MIME type.
       *
       * @return string|null null when there is no or an unknown extension
       */
      function _guessMIMEType()
      {
          $types = array(
              "wma" => "audio/x-ms-wma",
              "mp3" => "audio/x-mpeg",
              "mov" => "video/quicktime",
              "ram" => "audio/x-pn-realaudio",
              "wmv" => "video/x-ms-wmv",
              "aac" => "audio/aac",
              "aiff" => "audio/x-aiff",
              "wav" => "audio/x-wav",
              "asx" => "video/x-ms-asf",
              "avi" => "video/msvideo",
              "doc" => "application/msword",
              "dcr" => "application/x-director",
              "dmg" => "application/octet-stream",
              "gif" => "image/gif",
              "gz" => "application/x-gzip",
              "jpg" => "image/jpeg",
              "jpeg" => "image/jpeg",
              "png" => "image/png",
              "mp4" => "video/mp4",
              "mpg" => "video/mpeg",
              "ogg" => "application/ogg",
              "pdf" => "application/pdf",
              "ppt" => "application/vnd.ms-powerpoint",
              "qt" => "video/quicktime",
              "ra" => "audio/x-pn-realaudio",
              "swf" => "application/x-shockwave-flash",
              "tar" => "application/x-tar",
              "txt" => "text/plain",
              "xls" => "application/vnd.ms-excel",
              "zip" => "application/zip");
          $extension = $this->getFileExtension();
          if ($extension === null) {
              return null;
          }
          // match it with types array
          $extension = strtolower($extension);
          return isset($types[$extension]) ? $types[$extension] : null;
      }

      /**
       * Returns the extension of the last path segment of the URL, exactly
       * as written (no case folding).
       *
       * @return string|null e.g. "jpg" for ".../photo.jpg"; null when the
       *                     file name has no extension
       */
      function getFileExtension()
      {
          $path = (is_array($this->parsed_url) && isset($this->parsed_url['path']))
              ? $this->parsed_url['path'] : '';
          // only the file name counts: a dot in a directory name is not an extension
          $slash = strrpos($path, '/');
          $basename = ($slash === false) ? $path : (string) substr($path, $slash + 1);
          // regex to get ext in 'filename.ext'
          if (preg_match("/\.([^\.]+)$/", $basename, $ext_match)) {
              return $ext_match[1];
          }
          return null;
      }
  }
  436. ?>