PageRenderTime 48ms CodeModel.GetById 21ms RepoModel.GetById 0ms app.codeStats 0ms

/imdb_scraper.php

https://github.com/aramk/IMDb-Scraper
PHP | 185 lines | 122 code | 30 blank | 33 comment | 24 complexity | 8cef3464e70256165183017068a87e3f MD5 | raw file
  1. <?php
  2. /*
  3. IMDb Scraper v. 1.0 - 14th of September, 2011
  4. Scrapes information about movie and tv show titles from IMDb (imdb.com).
  5. By Aram Kocharyan
  6. http://ak.net84.net/php/imdb-scraper/
  7. akarmenia@gmail.com
  8. twitter.com/akarmenia
  9. */
  10. // Utility functions
  11. require_once('util.php');
  12. // Prevent timeout: a limit of 0 means "no execution time limit"; it is applied both through set_time_limit() and by setting the max_execution_time ini directive directly
  13. set_time_limit(0);
  14. ini_set('max_execution_time', 0);
  /**
   * Scrapes information about movie and TV show titles from IMDb (imdb.com).
   *
   * All methods are static. Network access and string clean-up are delegated to
   * helpers that are defined outside this class (curl_get_html, regex_get,
   * clean_num, clean_str, imdb_url_id, substr_count_arrays); their exact
   * behaviour is not visible here, so notes about them below are assumptions
   * drawn from how they are called.
   */
  class IMDbScraper {

      /**
       * Performs an IMDb search and returns the info for the best match.
       *
       * @param string          $title Title to search for.
       * @param int|string|null $year  Optional release year used to narrow the search.
       * @return array|false Info array as produced by info(), or FALSE when the
       *                     search yields nothing or the title page cannot be fetched.
       * @throws Exception When the title is not a non-empty string.
       */
      public static function get($title, $year = NULL) {
          // Forward the caller's year as-is. (The previous code passed
          // "$year = NULL", which silently discarded it.)
          $result = self::find($title, $year);
          if ($result === FALSE) {
              return FALSE;
          }
          return self::info($result['id']);
      }

      /**
       * Returns the scraped info for a given IMDb id string, e.g. 'tt0206512'.
       *
       * @param string $id IMDb title id; every character other than "t" and
       *                   digits is stripped before the URL is built.
       * @return array|false Result of scrape_info() plus 'id' and 'url' keys, or
       *                     FALSE when curl_get_html() reports failure with FALSE.
       * @throws Exception When $id is not a string.
       */
      public static function info($id) {
          if (!is_string($id)) {
              throw new Exception("The id must be a string");
          }
          $id = preg_replace('#[^t\d]#', '', $id);

          $url = 'http://www.imdb.com/title/' . $id . '/';
          $html = curl_get_html($url);
          if ($html === FALSE) {
              return FALSE;
          }

          $info = self::scrape_info($html);
          $info['id'] = $id;
          $info['url'] = $url;
          return $info;
      }

      /**
       * Returns the list of IMDb search results for the given title query.
       *
       * @param string $title Search query.
       * @return array List of results, each array(title id, title, year); empty
       *               when the search page could not be fetched.
       * @throws Exception When $title is not a string.
       */
      public static function search($title) {
          if (!is_string($title)) {
              // Arrays/objects cannot be concatenated safely, so show their type instead.
              $shown = is_scalar($title) ? $title : gettype($title);
              throw new Exception("The title '" . $shown . "' is not valid");
          }

          $url = 'http://www.imdb.com/find?s=tt&q=' . urlencode($title);
          $html = curl_get_html($url);
          if (!is_string($html)) {
              // Fetch failed: there is nothing to parse.
              return array();
          }
          return self::scrape_search($html);
      }

      /**
       * Performs an IMDb search and finds the best match to the given title and year.
       *
       * The year (when given) is appended to the search query and used to filter
       * the results; if no result has that year, all results are considered. The
       * winner is the candidate whose title scores highest against the words of
       * $title; on a tie the earliest candidate wins.
       *
       * @param string          $title Title to look for.
       * @param int|string|null $year  Optional release year. A string that does not
       *                               parse to a positive integer is treated as absent.
       * @return array|false array('id' => ..., 'title' => ..., 'year' => ...) or
       *                     FALSE when the search returns no results.
       * @throws Exception When $title is not a non-empty string.
       */
      public static function find($title, $year = NULL) {
          if (!is_string($title) || $title === '') {
              throw new Exception("The title is not valid");
          }

          if (is_string($year)) {
              $parsed = intval($year);
              $year = ($parsed > 0) ? $parsed : NULL;
          }

          $query = $title;
          if (is_int($year)) {
              $query .= ' ' . $year;
          }

          // Get results for the search query
          $results = self::search($query);
          if (empty($results)) {
              return FALSE;
          }

          // Keep only the results whose year matches, when a year was provided
          $subset = array();
          if ($year !== NULL) {
              foreach ($results as $r) {
                  if (intval($r[2]) == $year) {
                      $subset[] = $r;
                  }
              }
          }
          // No year, or nothing matched it: fall back to the full result list
          if (empty($subset)) {
              $subset = $results;
          }

          // Break the title query into words, ignoring repeated whitespace
          $query_bits = preg_split('/\s+/', $title, -1, PREG_SPLIT_NO_EMPTY);

          // Titles of the candidates, index-aligned with $subset so the winning
          // index can be applied to $subset directly
          $titles = array();
          foreach ($subset as $r) {
              $titles[] = $r[1];
          }

          // Assumed to return one match count per title, in the same order —
          // TODO confirm against substr_count_arrays() in util.php
          $counts = substr_count_arrays($titles, $query_bits);
          if (!is_array($counts)) {
              $counts = array();
          }

          // Pick the highest count; ties (or missing counts) keep the earliest candidate
          $highest_index = 0;
          $highest_count = isset($counts[0]) ? $counts[0] : 0;
          $total = min(count($counts), count($subset));
          for ($i = 1; $i < $total; $i++) {
              if (isset($counts[$i]) && $counts[$i] > $highest_count) {
                  $highest_index = $i;
                  $highest_count = $counts[$i];
              }
          }

          $best = $subset[$highest_index];
          return array(
              'id'    => $best[0],
              'title' => $best[1],
              'year'  => $best[2],
          );
      }

      /**
       * Returns an associative array of IMDb information scraped from an HTML string.
       *
       * Keys: name, desc, date, duration, director, writer (movies), creator
       * (TV shows), cast, genres, plot, rating, max-rating, voter-count,
       * user-review-count, critic-review-count. 'cast' and 'genres' are lists
       * (empty when nothing matches); the other values come from regex_get(),
       * which presumably returns the requested capture group, cleaned as a
       * number when 'num' is passed — verify against util.php.
       *
       * @param string $html HTML of an IMDb title page.
       * @return array
       */
      public static function scrape_info($html) {
          $result = array();
          $result['name'] = regex_get('#<h1.*?>(.*?)<span#msi', $html, 1);
          $result['desc'] = regex_get('#"description">(.*?)</p>#msi', $html, 1);

          // Prefer the machine-readable datetime attribute; otherwise use the
          // parenthesised part of the page <title>
          $date = regex_get('#datetime="(\d+)#msi', $html, 1, 'num');
          if (empty($date)) {
              $date = clean_num(regex_get('#<title>[^\(]*\(([^\)]+)\)#msi', $html, 1, 'num'));
          }
          $result['date'] = $date;

          $result['duration'] = regex_get('#class="absmiddle"[^<]*?(\d+\s*min)#msi', $html, 1);

          // Only for Movies
          $result['director'] = regex_get('#director.*?([\s\w]*)</a#msi', $html, 1);
          $result['writer'] = regex_get('#writer.*?([\s\w]*)</a#msi', $html, 1);
          // Only for TV shows
          $result['creator'] = regex_get('#creator.*?([\s\w]*)</a#msi', $html, 1);

          $result['cast'] = array();
          if (preg_match_all('#class="name".*?>([^<]*)</a>#msi', $html, $cast)) {
              $result['cast'] = $cast[1];
          }

          // The back-reference requires the link text to repeat the genre slug from the URL
          $result['genres'] = array();
          if (preg_match_all('#/genre/([^"]*)"\s*>\1#msi', $html, $genre)) {
              $result['genres'] = $genre[1];
          }

          $result['plot'] = regex_get('#storyline</h2>\s*<p>(.*?)<#msi', $html, 1);
          $result['rating'] = regex_get('#"ratingValue">(.*?)<#msi', $html, 1, 'num');
          $result['max-rating'] = regex_get('#"bestRating">(.*?)<#msi', $html, 1, 'num');
          $result['voter-count'] = regex_get('#"ratingCount">(.*?)<#msi', $html, 1, 'num');
          $result['user-review-count'] = regex_get('#"reviewCount">(.*?)<#msi', $html, 1, 'num');
          $result['critic-review-count'] = regex_get('#(\d+) external critic#msi', $html, 1, 'num');
          return $result;
      }

      /**
       * Returns an array of search results for the given HTML string of an IMDb search page.
       *
       * A result is any link immediately followed by a parenthesised run of digits,
       * e.g. <a href="/title/tt0206512/">Title</a> (2001).
       *
       * @param string $html HTML of an IMDb search page.
       * @return array List of results, each array(title id, title, year).
       */
      public static function scrape_search($html) {
          $results = array();
          if (!is_string($html)) {
              return $results;
          }

          // The href capture stops at the closing quote, so a link that is not
          // followed by "(year)" cannot be stretched across later tags.
          $pattern = '#<a\s*href\s*=\s*"([^"]*?)"[^>]*?>([^<]*)</a>\s*\((\d*)\)#msi';
          if (preg_match_all($pattern, $html, $matches, PREG_SET_ORDER)) {
              foreach ($matches as $m) {
                  $results[] = array(
                      imdb_url_id($m[1]),
                      clean_str($m[2]),
                      clean_str($m[3]),
                  );
              }
          }
          return $results;
      }
  }
  151. ?>