PageRenderTime 26ms CodeModel.GetById 16ms RepoModel.GetById 1ms app.codeStats 0ms

/184.168.182.1/wp-content/plugins/jetpack/class.media-extractor.php

https://gitlab.com/endomorphosis/falkenstein
PHP | 405 lines | 234 code | 69 blank | 102 comment | 48 complexity | f466d410ab8ab178e91640a6f4b95886 MD5 | raw file
  1. <?php
  2. /**
  3. * Class with methods to extract metadata from a post/page about videos, images, links, mentions embedded
  4. * in or attached to the post/page.
  5. *
  6. * @todo Additionally, have some filters on number of items in each field
  7. */
  8. class Jetpack_Media_Meta_Extractor {
  /*
   * Bit flags describing what to extract. Combine them with bitwise OR and
   * pass the result as $what_to_extract.
   */
  const LINKS = 1;
  const MENTIONS = 2;
  const IMAGES = 4;
  const SHORTCODES = 8; // Only the keeper shortcodes below
  const EMBEDS = 16;
  const HASHTAGS = 32;
  const ALL = 255; // Every flag above (and any future flag up to bit 7)

  /*
   * For these shortcodes we try to pull some data out of the shortcode itself,
   * rather than only recording its presence (which is done for every shortcode).
   * Each one is expected to have a function jetpack_shortcode_get_{shortcode}_id( $atts )
   * or a static method {Shortcode}Shortcode::get_{shortcode}_id( $atts ).
   */
  private static $KEEPER_SHORTCODES = array( 'youtube', 'vimeo', 'hulu', 'ted', 'wpvideo' );
  /**
   * Gets the specified media and meta info from the given post.
   * NOTE: If you have the post's HTML content already and don't need image data, use extract_from_content() instead.
   *
   * The post title and content are joined and capped at 100000 bytes before
   * extraction. Images are extracted here from the full post object; every
   * other kind of data is delegated to extract_from_content().
   *
   * @param $blog_id The ID of the blog
   * @param $post_id The ID of the post
   * @param $what_to_extract (int) A mask of things to extract, e.g. Jetpack_Media_Meta_Extractor::IMAGES | Jetpack_Media_Meta_Extractor::MENTIONS
   * @returns array a structure containing metadata about the embedded things, or empty array if nothing found
   *          (including when the post cannot be loaded)
   */
  static public function extract( $blog_id, $post_id, $what_to_extract = self::ALL ) {
      // multisite?
      $can_switch = function_exists( 'switch_to_blog' );
      if ( $can_switch ) {
          switch_to_blog( $blog_id );
      }

      $post = get_post( $post_id );
      if ( ! is_object( $post ) ) {
          // Unknown post: undo the blog switch and report "nothing found"
          // instead of reading properties off a non-object.
          if ( $can_switch ) {
              restore_current_blog();
          }
          return array();
      }

      $content = $post->post_title . "\n\n" . $post->post_content;

      // Prevent running extraction on really huge amounts of content (about 20k English words).
      if ( strlen( $content ) > 100000 ) {
          // Cut on a character boundary when possible: a multi-byte character split
          // in half would make the UTF-8 (/u) regexes used downstream fail.
          if ( function_exists( 'mb_strcut' ) ) {
              $content = mb_strcut( $content, 0, 100000, 'UTF-8' );
          } else {
              $content = substr( $content, 0, 100000 );
          }
      }

      $extracted = array();

      // Get images first, we need the full post for that
      if ( self::IMAGES & $what_to_extract ) {
          $extracted = self::get_image_fields( $post );
          // Turn off images so we can safely call extract_from_content() below
          $what_to_extract = $what_to_extract & ~self::IMAGES;
      }

      if ( $can_switch ) {
          restore_current_blog();
      }

      // All of the other things besides images can be extracted from just the content
      return self::extract_from_content( $content, $what_to_extract, $extracted );
  }
  /**
   * Gets the specified meta info from the given post content.
   * NOTE: If you want IMAGES, call extract( $blog_id, $post_id, ...) which will give you more/better image extraction
   * This method will give you an error if you ask for IMAGES.
   *
   * Result keys written by this method (each only when applicable):
   *  - 'mention'         => array( 'name' => list of lower-cased, unique @names )
   *  - 'shortcode'       => array( name => array( 'count' => int[, 'id' => list of ids] ) )
   *  - 'shortcode_types' => list of shortcode names seen
   *  - 'link'            => list of array( 'url', 'host_reversed', 'host' ) (always set when LINKS is requested)
   *  - 'embed'           => array( 'url' => list of URLs matching an oEmbed provider )
   *  - 'has'             => array( kind => count ) for each kind found
   *
   * @param $content The HTML post_content of a post
   * @param $what_to_extract (int) A mask of things to extract, e.g. Jetpack_Media_Meta_Extractor::LINKS | Jetpack_Media_Meta_Extractor::MENTIONS
   * @param $already_extracted (array) Previously extracted things, e.g. images from extract(), which can be used for x-referencing here
   * @returns a structure containing metadata about the embedded things, or empty array if nothing found, or WP_Error on error
   */
  static public function extract_from_content( $content, $what_to_extract = self::ALL, $already_extracted = array() ) {
      // Embedded media objects will have already been converted to shortcodes by pre_kses hooks on save.
      if ( self::IMAGES & $what_to_extract ) {
          // Should've called extract( $blog_id, $post_id ) if you want images
          return new WP_Error( 'media-extraction-error', "IMAGES extraction not supported in extract_from_content()" );
      }

      $stripped_content = self::get_stripped_content( $content );

      // Maybe start with some previously extracted things (e.g. images from extract())
      $extracted = $already_extracted;

      // ----------------------------------- MENTIONS ------------------------------

      if ( self::MENTIONS & $what_to_extract ) {
          if ( preg_match_all( '/(^|\s)@(\w+)/u', $stripped_content, $matches ) ) {
              // Lower-case BEFORE de-duplicating so "@Bob" and "@bob" count once.
              // array_unique() retains the keys, hence array_values().
              $mentions = array_values( array_unique( array_map( 'strtolower', $matches[2] ) ) );
              $extracted['mention'] = array( 'name' => $mentions );
              if ( ! isset( $extracted['has'] ) ) {
                  $extracted['has'] = array();
              }
              $extracted['has']['mention'] = count( $mentions );
          }
      }

      // ----------------------------------- HASHTAGS ------------------------------
      /* Some hosts may not compile with --enable-unicode-properties and kick a warning
      Warning: preg_match_all() [function.preg-match-all]: Compilation failed: support for \P, \p, and \X has not been compiled
      if ( self::HASHTAGS & $what_to_extract ) {
      //This regex does not exactly match Twitter's
      // if there are problems/complaints we should implement this:
      // https://github.com/twitter/twitter-text-java/blob/master/src/com/twitter/Regex.java
      if ( preg_match_all( '/(?:^|\s)#(\w*\p{L}+\w*)/u', $stripped_content, $matches ) ) {
      $hashtags = array_values( array_unique( $matches[1] ) ); //array_unique() retains the keys!
      $hashtags = array_map( 'strtolower', $hashtags );
      $extracted['hashtag'] = array( 'name' => $hashtags );
      if ( !isset( $extracted['has'] ) )
      $extracted['has'] = array();
      $extracted['has']['hashtag'] = count( $hashtags );
      }
      }
      */

      // ----------------------------------- SHORTCODES ------------------------------

      // Always look for shortcodes.
      // If we don't want them, we'll just remove them, so we don't grab them as links below
      $shortcode_pattern = '/' . get_shortcode_regex() . '/s';
      if ( preg_match_all( $shortcode_pattern, $content, $matches ) ) {
          $shortcode_total_count = 0;
          $shortcode_type_counts = array();
          $shortcode_types = array();
          $shortcode_details = array();

          if ( self::SHORTCODES & $what_to_extract ) {
              foreach ( $matches[2] as $key => $shortcode ) {
                  // Elasticsearch (and probably other things) doesn't deal well with some chars as key names
                  $shortcode_name = preg_replace( '/[.,*"\'\/\\\\#+ ]/', '_', $shortcode );
                  $attr = shortcode_parse_atts( $matches[3][ $key ] );

                  $shortcode_total_count++;
                  if ( ! isset( $shortcode_type_counts[ $shortcode_name ] ) ) {
                      $shortcode_type_counts[ $shortcode_name ] = 0;
                  }
                  $shortcode_type_counts[ $shortcode_name ]++;

                  // Store (uniquely) presence of all shortcode regardless of whether it's a keeper (for those, get ID below)
                  // @todo Store number of occurrences?
                  if ( ! in_array( $shortcode_name, $shortcode_types, true ) ) {
                      $shortcode_types[] = $shortcode_name;
                  }

                  // For keeper shortcodes, also store the id/url of the object (e.g. youtube video, TED talk, etc.)
                  if ( ! in_array( $shortcode, self::$KEEPER_SHORTCODES, true ) ) {
                      continue;
                  }

                  $id = null; // Clear shortcode ID data left from the last shortcode

                  // We'll try to get the salient ID from the function jetpack_shortcode_get_xyz_id()
                  // If the shortcode is a class, we'll call XyzShortcode::get_xyz_id()
                  $shortcode_get_id_func = "jetpack_shortcode_get_{$shortcode}_id";
                  $shortcode_class_name = ucfirst( $shortcode ) . 'Shortcode';
                  $shortcode_get_id_method = "get_{$shortcode}_id";
                  if ( function_exists( $shortcode_get_id_func ) ) {
                      $id = call_user_func( $shortcode_get_id_func, $attr );
                  } else if ( method_exists( $shortcode_class_name, $shortcode_get_id_method ) ) {
                      $id = call_user_func( array( $shortcode_class_name, $shortcode_get_id_method ), $attr );
                  }

                  if ( empty( $id ) ) {
                      continue;
                  }
                  if ( ! isset( $shortcode_details[ $shortcode_name ] ) ) {
                      $shortcode_details[ $shortcode_name ] = array();
                  }
                  // Record each distinct ID once per shortcode type.
                  if ( ! in_array( $id, $shortcode_details[ $shortcode_name ], true ) ) {
                      $shortcode_details[ $shortcode_name ][] = $id;
                  }
              }

              if ( $shortcode_total_count > 0 ) {
                  // Add the shortcode info to the $extracted array
                  if ( ! isset( $extracted['has'] ) ) {
                      $extracted['has'] = array();
                  }
                  $extracted['has']['shortcode'] = $shortcode_total_count;
                  $extracted['shortcode'] = array();
                  foreach ( $shortcode_type_counts as $type => $count ) {
                      $extracted['shortcode'][ $type ] = array( 'count' => $count );
                  }
                  if ( ! empty( $shortcode_types ) ) {
                      $extracted['shortcode_types'] = $shortcode_types;
                  }
                  foreach ( $shortcode_details as $type => $ids ) {
                      $extracted['shortcode'][ $type ]['id'] = $ids;
                  }
              }
          }

          // Remove the shortcodes from our copy of $content, so we don't count links in them as links below.
          // preg_replace() returns null on a PCRE error; keep the original content in that case.
          $content_without_shortcodes = preg_replace( $shortcode_pattern, ' ', $content );
          if ( null !== $content_without_shortcodes ) {
              $content = $content_without_shortcodes;
          }
      }

      // ----------------------------------- LINKS ------------------------------

      if ( self::LINKS & $what_to_extract ) {
          // To hold the extracted stuff we find
          $links = array();

          // URLs of already-extracted images, so they are not reported again as links.
          // Accepts both array( 'url' => ... ) and a list of array( 'url' => ... ) entries
          // (the latter is what get_image_fields() produces).
          $image_urls = array();
          if ( isset( $extracted['image'] ) && is_array( $extracted['image'] ) ) {
              if ( isset( $extracted['image']['url'] ) ) {
                  $image_urls = (array) $extracted['image']['url'];
              } else {
                  foreach ( $extracted['image'] as $image ) {
                      if ( is_array( $image ) && isset( $image['url'] ) ) {
                          $image_urls[] = $image['url'];
                      }
                  }
              }
          }

          // @todo Get the text inside the links?

          // Grab any links, whether in <a href="..." or not, but subtract those from shortcodes and images
          // (we treat embed links as just another link)
          if ( preg_match_all( '#(?:^|\s|"|\')(https?://([^\s()<>]+(?:\([\w\d]+\)|([^[:punct:]\s]|/))))#', $content, $matches ) ) {
              foreach ( $matches[1] as $link_raw ) {
                  $url = parse_url( $link_raw );

                  // parse_url() returns false on seriously malformed URLs; skip anything without a usable host.
                  if ( ! is_array( $url ) || empty( $url['scheme'] ) || empty( $url['host'] ) ) {
                      continue;
                  }

                  // Build a simple form of the URL so we can compare it to ones we found in IMAGES or SHORTCODES and exclude those
                  $simple_url = $url['scheme'] . '://' . $url['host'] . ( ! empty( $url['path'] ) ? $url['path'] : '' );
                  if ( in_array( $simple_url, $image_urls ) ) {
                      continue;
                  }

                  // Limit the split to 2 parts: the URL may itself contain "://" (e.g. in a query arg).
                  list( , $link_all_but_proto ) = explode( '://', $link_raw, 2 );

                  // Build a reversed hostname (www.example.com => com.example.www)
                  $host_reversed = '';
                  foreach ( array_reverse( explode( '.', $url['host'] ) ) as $part ) {
                      $host_reversed .= ( '' !== $host_reversed ? '.' : '' ) . $part;
                  }

                  // @todo Check unique before adding
                  $links[] = array(
                      'url' => $link_all_but_proto,
                      'host_reversed' => $host_reversed,
                      'host' => $url['host'],
                  );
              }
          }

          $link_count = count( $links );
          $extracted['link'] = $links;
          if ( $link_count ) {
              if ( ! isset( $extracted['has'] ) ) {
                  $extracted['has'] = array();
              }
              $extracted['has']['link'] = $link_count;
          }
      }

      // ----------------------------------- EMBEDS ------------------------------

      // Embeds are just individual links on their own line
      if ( self::EMBEDS & $what_to_extract ) {
          if ( ! function_exists( '_wp_oembed_get_object' ) ) {
              include( ABSPATH . WPINC . '/class-oembed.php' );
          }

          // get an oembed object
          $oembed = _wp_oembed_get_object();

          // Grab any links on their own lines that may be embeds
          if ( preg_match_all( '|^\s*(https?://[^\s"]+)\s*$|im', $content, $matches ) ) {
              // To hold the extracted stuff we find
              $embeds = array();

              foreach ( $matches[1] as $link_raw ) {
                  // Limit the split to 2 parts so a second "://" inside the URL is kept.
                  list( , $link_all_but_proto ) = explode( '://', $link_raw, 2 );

                  // Check whether this "link" is really an embed.
                  foreach ( $oembed->providers as $matchmask => $data ) {
                      // Second element flags whether $matchmask is already a regex.
                      list( , $regex ) = $data;

                      // Turn the asterisk-type provider URLs into regex
                      if ( ! $regex ) {
                          $matchmask = '#' . str_replace( '___wildcard___', '(.+)', preg_quote( str_replace( '*', '___wildcard___', $matchmask ), '#' ) ) . '#i';
                          $matchmask = preg_replace( '|^#http\\\://|', '#https?\://', $matchmask );
                      }

                      if ( preg_match( $matchmask, $link_raw ) ) {
                          $embeds[] = $link_all_but_proto; // @todo Check unique before adding
                          // @todo Try to get ID's for the ones we care about (shortcode_keepers)
                          break;
                      }
                  }
              }

              if ( ! empty( $embeds ) ) {
                  if ( ! isset( $extracted['has'] ) ) {
                      $extracted['has'] = array();
                  }
                  $extracted['has']['embed'] = count( $embeds );
                  $extracted['embed'] = array( 'url' => $embeds );
              }
          }
      }

      return $extracted;
  }
  /**
   * Collects image URLs for a post from its slideshows, galleries and inline HTML.
   *
   * Uses Jetpack Post Images
   *
   * @param $post A post object (its ID and post_content are read)
   * @param $args (array) Optional args: 'width' and 'height' are the required minimum
   *              dimensions passed to the slideshow lookup (both default to 200)
   * @returns array Empty array when no image was found, otherwise
   *          array(
   *              'image' => list of array( 'url' => string ), one entry per distinct URL,
   *              'has'   => array( 'gallery' => 0|1, 'image' => number of entries in 'image' ),
   *          )
   */
  private static function get_image_fields( $post, $args = array() ) {
      $defaults = array(
          'width' => 200, // Required minimum width (if possible to determine)
          'height' => 200, // Required minimum height (if possible to determine)
      );
      $args = wp_parse_args( $args, $defaults );

      $image_list = array();
      $image_booleans = array( 'gallery' => 0 );

      $from_slideshow = Jetpack_PostImages::from_slideshow( $post->ID, $args['width'], $args['height'] );
      if ( ! empty( $from_slideshow ) ) {
          $image_list = array_merge( $image_list, wp_list_pluck( $from_slideshow, 'src' ) );
      }

      $from_gallery = Jetpack_PostImages::from_gallery( $post->ID );
      if ( ! empty( $from_gallery ) ) {
          $image_list = array_merge( $image_list, wp_list_pluck( $from_gallery, 'src' ) );
          $image_booleans['gallery']++; // @todo This count isn't correct, will only ever count 1
      }

      // @todo Can we check width/height of these efficiently? Could maybe use query args at least, before we strip them out
      $image_list = self::get_images_from_html( $post->post_content, $image_list );

      if ( empty( $image_list ) ) {
          return array();
      }

      // Slideshow and gallery sources may overlap, so report each URL only once.
      $retval = array( 'image' => array() );
      foreach ( array_unique( $image_list ) as $img ) {
          $retval['image'][] = array( 'url' => $img );
      }

      $image_booleans['image'] = count( $retval['image'] );
      $retval['has'] = $image_booleans;

      return $retval;
  }
  /**
   * Appends the images found in a chunk of markup to an existing list of image URLs.
   *
   * @param string $html Some markup, possibly containing image tags
   * @param array $images_already_extracted (just an array of image URLs without query strings, no special structure), used for de-duplication
   * @return array The given URLs followed by any new image URLs from the HTML, stripped of query params and de-duped
   */
  public static function get_images_from_html( $html, $images_already_extracted ) {
      $found = Jetpack_PostImages::from_html( $html );
      if ( empty( $found ) ) {
          return $images_already_extracted;
      }

      $collected = $images_already_extracted;

      foreach ( wp_list_pluck( $found, 'src' ) as $candidate ) {
          $parts = parse_url( $candidate );
          $query_at = strpos( $candidate, '?' );

          if ( $parts && isset( $parts['scheme'], $parts['host'], $parts['path'] ) ) {
              // Preferred route: reassemble scheme, host and path, leaving the query string behind
              $clean = $parts['scheme'] . '://' . $parts['host'] . $parts['path'];
          } elseif ( $query_at ) {
              // parse_url() was no help, so chop at the first "?" by hand
              $clean = substr( $candidate, 0, $query_at );
          } else {
              // No query string to remove
              $clean = $candidate;
          }

          if ( in_array( $clean, $collected ) ) {
              continue;
          }
          $collected[] = $clean;
      }

      return $collected;
  }
  /**
   * Reduces post content to plain text for the text-based extractors.
   *
   * Order matters: tags are removed first, then HTML entities are decoded,
   * and finally shortcodes are stripped.
   *
   * @param string $content Raw post content
   * @return string The plain-text version of $content
   */
  private static function get_stripped_content( $content ) {
      $without_tags = strip_tags( $content );
      $decoded = html_entity_decode( $without_tags );

      // completely strip shortcodes and any content they enclose
      return strip_shortcodes( $decoded );
  }
  336. }