PageRenderTime 24ms CodeModel.GetById 12ms RepoModel.GetById 0ms app.codeStats 1ms

/src/link-preview/php/classes/LinkPreview.php

https://gitlab.com/alidzapp/Link-Preview
PHP | 238 lines | 187 code | 43 blank | 8 comment | 65 complexity | d1d6ea66f7abf953d640de844eb7c929 MD5 | raw file
  1. <?php
  2. /**
  3. * Copyright (c) 2015 Leonardo Cardoso (http://leocardz.com)
  4. * Dual licensed under the MIT (http://www.opensource.org/licenses/mit-license.php)
  5. * and GPL (http://www.opensource.org/licenses/gpl-license.php) licenses.
  6. *
  7. * Version: 1.0.0
  8. */
  9. /** Important php5-curl must be installed and enabled */
  10. include_once "Media.php";
  11. include_once "Regex.php";
  12. include_once "SetUp.php";
  13. include_once "Url.php";
  14. include_once "Content.php";
  15. include_once "Json.php";
  /**
   * Builds a link preview (title, description, images, optional video embed)
   * for the first URL found in a piece of text.
   *
   * Relies on the sibling helper classes Regex, Url, Content, Media and Json,
   * which are loaded by the include_once statements at the top of this file.
   */
  class LinkPreview
  {
      /**
       * URL fragment => Media method that handles it.
       *
       * Entries are tested in this order and the first fragment found anywhere
       * in the page URL wins (plain substring match, not a host comparison).
       */
      private static $mediaProviders = array(
          "youtube.com" => "mediaYoutube",
          "ted.com" => "mediaTED",
          "vimeo.com" => "mediaVimeo",
          "vine.co" => "mediaVine",
          "metacafe.com" => "mediaMetacafe",
          "dailymotion.com" => "mediaDailymotion",
          "collegehumor.com" => "mediaCollegehumor",
          "blip.tv" => "mediaBlip",
          "funnyordie.com" => "mediaFunnyordie",
      );

      public function __construct()
      {
      }

      /**
       * Appends the image sources captured by a regex match to $content.
       *
       * Each source is the concatenation of capture group $number and capture
       * group $number + 1 at the same index. Sources that do not match
       * Regex::$HTTP_REGEX are resolved through Url::canonicalLink() and
       * Url::getImageUrl() (presumably turning a relative path into an
       * absolute one -- confirm against Url.php).
       *
       * @param array  $matching Capture groups, indexed by group then by match.
       * @param int    $number   Index of the first of the two groups to join.
       * @param string $url      Page URL; a candidate equal to it is skipped.
       * @param array  $content  Sources collected so far.
       * @return array $content with the new sources appended.
       */
      public function joinAll($matching, $number, $url, $content)
      {
          // Nothing was captured for this group: leave the list untouched
          // instead of calling count() on a missing entry.
          if (!isset($matching[$number]) || !is_array($matching[$number])) {
              return $content;
          }

          $heads = $matching[$number];
          $tails = array();
          if (isset($matching[$number + 1]) && is_array($matching[$number + 1])) {
              $tails = $matching[$number + 1];
          }

          $total = count($heads);
          for ($i = 0; $i < $total; $i++) {
              $head = isset($heads[$i]) ? $heads[$i] : "";
              $tail = isset($tails[$i]) ? $tails[$i] : "";
              $imgSrc = $head . $tail;

              $src = "";
              if (!preg_match(Regex::$HTTP_REGEX, $imgSrc)) {
                  $pathCounter = substr_count($imgSrc, "../");
                  $src = Url::getImageUrl($pathCounter, Url::canonicalLink($imgSrc, $url));
              }

              if ($src . $imgSrc != $url) {
                  // Prefer the resolved URL; fall back to the raw source when
                  // no resolution happened.
                  $content[] = ($src == "") ? $src . $imgSrc : $src;
              }
          }

          return $content;
      }

      /**
       * Extracts the first URL from $text and builds its preview.
       *
       * @param string $text          Text that may contain a URL.
       * @param int    $imageQuantity Passed through to Content::getImages().
       * @param mixed  $header        Passed to Json::jsonSafe(); replaced by the
       *                              fetched page's content type whenever a page
       *                              is actually downloaded.
       * @return mixed Whatever Json::jsonSafe() returns (it is fed to
       *               json_decode() here, so presumably a JSON string), or null
       *               when $text contains no URL.
       */
      public function crawl($text, $imageQuantity, $header)
      {
          if (!preg_match(Regex::$URL_REGEX, $text, $match)) {
              return null;
          }

          $title = "";
          $description = "";
          $videoIframe = "";
          $video = false;

          // A match that starts with a space gets the space replaced by a scheme.
          if (strpos($match[0], " ") === 0) {
              $match[0] = "http://" . substr($match[0], 1);
          }

          $finalUrl = $match[0];
          $pageUrl = $finalUrl;

          if (Content::isImage($pageUrl)) {
              // Direct image link: the URL itself is the only preview image.
              $images = [$pageUrl];
          } else {
              $urlData = $this->getPage($pageUrl);

              // Empty response from a host without "www.": retry with it.
              if (!$urlData["content"] && strpos($pageUrl, "//www.") === false) {
                  // Only the leading scheme is rewritten, so an "http://" that
                  // appears later (e.g. in a query string) is left alone.
                  $wwwUrl = preg_replace('#^(https?://)#', '${1}www.', $pageUrl, 1);
                  // Skip the retry when nothing changed; it would just repeat
                  // the request that already failed.
                  if (is_string($wwwUrl) && $wwwUrl !== $pageUrl) {
                      $pageUrl = $wwwUrl;
                      $urlData = $this->getPage($pageUrl);
                  }
              }

              // Use the effective URL reported by cURL (after redirects).
              $pageUrl = $finalUrl = $urlData["url"];
              $raw = $urlData["content"];
              $header = $urlData["header"];

              $metaTags = Content::getMetaTags($raw);

              $tempTitle = Content::extendedTrim($metaTags["title"]);
              if ($tempTitle != "") {
                  $title = $tempTitle;
              }
              if ($title == "" && preg_match(Regex::$TITLE_REGEX, str_replace("\n", " ", $raw), $matching)) {
                  $title = $matching[2];
              }

              $tempDescription = Content::extendedTrim($metaTags["description"]);
              $description = ($tempDescription != "") ? $tempDescription : Content::crawlCode($raw);

              // The original condition had a second alternative that required
              // the description to be empty and non-empty at the same time, so
              // it could never be true; only this comparison ever took effect.
              if ($title == $description) {
                  $title = $description;
                  $description = Content::crawlCode($raw);
              }

              if (Content::isJson($title)) {
                  $title = "";
              }
              if (Content::isJson($description)) {
                  $description = "";
              }

              // getMedia() returns an empty array for every non-video URL, so
              // both slots must be read defensively.
              $media = $this->getMedia($pageUrl);
              $mediaImage = (is_array($media) && isset($media[0])) ? $media[0] : "";
              $mediaIframe = (is_array($media) && isset($media[1])) ? $media[1] : "";

              if ($mediaImage == "") {
                  $images = array(Content::extendedTrim($metaTags["image"]));
              } else {
                  $images = array($mediaImage);
              }
              $videoIframe = $mediaIframe;

              // No provider thumbnail and no meta image: scan the page itself.
              if ($images[0] === "") {
                  $images = Content::getImages($raw, $pageUrl, $imageQuantity);
              }

              if ($mediaIframe != "") {
                  $video = true;
              }

              $title = Content::extendedTrim($title);
              $pageUrl = Content::extendedTrim($pageUrl);
              $description = Content::extendedTrim($description);
              $description = preg_replace(Regex::$SCRIPT_REGEX, "", $description);
          }

          // "url" is the final URL cut at the first "&".
          $urlParts = explode("&", $finalUrl);
          $finalLink = $urlParts[0];

          $description = strip_tags($description);
          $videoIframe = $videoIframe == null ? "" : $videoIframe;

          $answer = $this->buildAnswer($title, $finalLink, $finalUrl, $pageUrl, $description, $images, $video, $videoIframe);
          $result_json = Json::jsonSafe($answer, $header);
          $result_json_decoded = json_decode($result_json);

          // A missing field after the round trip is treated as an encoding
          // problem: re-encode that field to UTF-8 and serialise again.
          $flagged = false;
          if (!isset($result_json_decoded->title)) {
              $title = utf8_encode($title);
              $flagged = true;
          }
          if (!isset($result_json_decoded->description)) {
              $description = utf8_encode($description);
              $flagged = true;
          }

          if (!$flagged) {
              return $result_json;
          }

          $answer = $this->buildAnswer($title, $finalLink, $finalUrl, $pageUrl, $description, $images, $video, $videoIframe);
          return Json::jsonSafe($answer, $header);
      }

      /**
       * Assembles the preview payload handed to Json::jsonSafe().
       *
       * "image" is the first entry of $images, or null when there is none.
       */
      private function buildAnswer($title, $finalLink, $finalUrl, $pageUrl, $description, $images, $video, $videoIframe)
      {
          $firstImage = (is_array($images) && isset($images[0])) ? $images[0] : null;

          return array(
              "title" => $title,
              "url" => $finalLink,
              "pageUrl" => $finalUrl,
              "canonicalUrl" => Url::canonicalPage($pageUrl),
              "description" => $description,
              "image" => $firstImage,
              "images" => $images,
              "video" => $video,
              "videoIframe" => $videoIframe);
      }

      /**
       * Downloads $url with cURL, following up to 10 redirects.
       *
       * Side effect: when the response carries a content type and headers can
       * still be sent, that content type is mirrored onto the current HTTP
       * response with header().
       *
       * @param string $url Address to fetch.
       * @return array 'content' (body, or false on failure), 'url' (effective
       *               URL after redirects) and 'header' (content type).
       */
      public function getPage($url)
      {
          $options = array(
              CURLOPT_RETURNTRANSFER => true, // return web page
              CURLOPT_HEADER => false, // do not return headers
              CURLOPT_FOLLOWLOCATION => true, // follow redirects
              CURLOPT_USERAGENT => "leocardz", // who am i
              CURLOPT_AUTOREFERER => true, // set referer on redirect
              CURLOPT_CONNECTTIMEOUT => 120, // timeout on connect
              CURLOPT_TIMEOUT => 120, // timeout on response
              CURLOPT_MAXREDIRS => 10, // stop after 10 redirects
              // The URL comes from user text: never let it (or a redirect)
              // reach file://, gopher:// and similar schemes.
              CURLOPT_PROTOCOLS => CURLPROTO_HTTP | CURLPROTO_HTTPS,
              CURLOPT_REDIR_PROTOCOLS => CURLPROTO_HTTP | CURLPROTO_HTTPS,
          );

          $ch = curl_init($url);
          if ($ch === false) {
              // Same shape as a failed transfer so callers need no special case.
              return array('content' => false, 'url' => $url, 'header' => null);
          }

          curl_setopt_array($ch, $options);
          $content = curl_exec($ch);
          $info = curl_getinfo($ch);
          curl_close($ch);

          $contentType = isset($info["content_type"]) ? $info["content_type"] : null;
          // Do not emit an empty Content-Type, and do not touch headers once
          // output has started.
          if ($contentType != "" && !headers_sent()) {
              header("Content-Type: " . $contentType, true);
          }

          return array(
              'content' => $content,
              'url' => isset($info['url']) ? $info['url'] : $url,
              'header' => $contentType,
          );
      }

      /**
       * Asks the matching Media provider for data about $pageUrl.
       *
       * crawl() reads index 0 of the result as a thumbnail and index 1 as an
       * embed iframe.
       *
       * @param string $pageUrl Page address.
       * @return array Provider result, or an empty array when no known
       *               provider fragment occurs in the URL.
       */
      public function getMedia($pageUrl)
      {
          foreach (self::$mediaProviders as $fragment => $method) {
              if (strpos($pageUrl, $fragment) !== false) {
                  return Media::$method($pageUrl);
              }
          }

          return array();
      }
  }