PageRenderTime 44ms CodeModel.GetById 22ms RepoModel.GetById 0ms app.codeStats 0ms

/tmp/allocinescraper.php

https://github.com/bdunogier/mkvmanager
PHP | 219 lines | 182 code | 18 blank | 19 comment | 30 complexity | ed1896dca32260bb3194d453b943c268 MD5 | raw file
  1. <?php
  // Entry guard: the script needs either a search query ("q") or a movie id
  // ("movie"); with neither present there is nothing to do.
  if ( !( isset( $_GET['q'] ) || isset( $_GET['movie'] ) ) )
  {
      exit( 'q|movie GET parameter missing' );
  }
  // search: fetch the site's search page for the "q" parameter and list the films
  // it returns, each linking back to this script's details view. Likely matches
  // are printed in bold.
  if ( isset( $_GET['q'] ) )
  {
      // A non-string value (e.g. ?q[]=x) cannot be searched for; treat it as empty.
      $query = is_string( $_GET['q'] ) ? $_GET['q'] : '';

      // Defined up front so the debug echo and the matching below never read an
      // undefined variable when the query carries no "(YYYY)" suffix.
      $queryYear = null;
      if ( preg_match( '#([^\(]+) \(([0-9]{4})\)#', $query, $queryMatches ) )
      {
          $query = $queryMatches[1];
          $queryYear = $queryMatches[2];
      }

      // The query is user input: escape it before echoing it into the page.
      echo 'query: ' . htmlspecialchars( $query, ENT_QUOTES ) . '<br />';
      echo "queryYear: $queryYear<br />";

      $searchURL = 'http://www.allocine.fr/recherche/?q=' . urlencode( $query );
      $searchPage = file_get_contents( $searchURL );
      if ( $searchPage === false )
      {
          exit( 'Unable to fetch the search results page' );
      }

      // Keep only the part of the page that starts at the "Films" heading and ends
      // right before the next '<h3><b>' marker.
      $filmsStart = strpos( $searchPage, '<h3><b>Films <h4>' );
      if ( $filmsStart === false )
      {
          // No films section at all: nothing to list.
          $searchPage = '';
      }
      else
      {
          $searchPage = substr( $searchPage, $filmsStart );
          // When no further marker follows, strpos() gives false, the length becomes 0
          // and the section is treated as empty.
          $searchPage = substr( $searchPage, 0, (int)strpos( $searchPage, '<h3><b>', 1 ) );
      }

      // @todo match more details (year, director, actors, image)
      // The three result lists are correlated by position only.
      // NOTE(review): a film without an original title or year would shift the
      // following entries - confirm against the live markup.
      preg_match_all( '#<a href="/film/fichefilm_gen_cfilm=([0-9]+)\.html" class="link1">(.*?)</a>#', $searchPage, $matches, PREG_SET_ORDER );
      preg_match_all( '#&nbsp;\(([^)]+)\)</a>#', $searchPage, $origTitleMatches, PREG_SET_ORDER );
      preg_match_all( '#<h4 style="color: \#808080">([0-9]{4})</h4>#', $searchPage, $yearMatches, PREG_SET_ORDER );

      // PHP_SELF can carry user-supplied path info, so escape it for the href attribute.
      $selfURL = htmlspecialchars( $_SERVER['PHP_SELF'], ENT_QUOTES );

      foreach ( $matches as $key => $match )
      {
          $movieID = $match[1];
          $movieTitle = strip_tags( $match[2] );
          // The side lists may be shorter than the link list; fall back to empty text.
          $originalTitle = isset( $origTitleMatches[$key] ) ? strip_tags( $origTitleMatches[$key][1] ) : '';
          $year = isset( $yearMatches[$key] ) ? strip_tags( $yearMatches[$key][1] ) : '';

          $loweredQuery = strtolower( $query );
          $titleMatches = ( strtolower( $movieTitle ) == $loweredQuery )
              || ( strtolower( $originalTitle ) == $loweredQuery );
          $yearMatches_ = ( $queryYear !== null ) && ( $queryYear === $year );

          // A result is highlighted when the year matches the requested one, or when
          // the title matches and no year was requested.
          // NOTE(review): a matching year alone is enough even if the title differs;
          // this mirrors the existing behaviour - confirm it is intended.
          $perfectMatch = $yearMatches_ || ( $titleMatches && $queryYear === null );

          $url = $selfURL . '?movie=' . urlencode( $movieID );

          if ( $perfectMatch ) echo "<b>";
          echo "<a href=\"$url\">$movieTitle ($year)</a> (<i>$originalTitle</i>)";
          if ( $perfectMatch ) echo "</b>";
          echo "<br />";
      }
  }
  // details page: scrape the film page (plus its gallery, trailer and casting pages)
  // for the id given in the "movie" parameter and dump the collected data.
  elseif ( isset( $_GET['movie'] ) )
  {
      // The id is spliced into several remote URLs below, so only plain digits are
      // accepted (the search view only ever links to numeric ids).
      $movieID = $_GET['movie'];
      if ( !is_string( $movieID ) || !preg_match( '#^[0-9]+$#D', $movieID ) )
      {
          exit( 'movie GET parameter must be a numeric id' );
      }

      $movieData = array();

      $rawDetailsPage = file_get_contents( 'http://www.allocine.fr/film/fichefilm_gen_cfilm=' . $movieID . '.html' );
      if ( $rawDetailsPage === false )
      {
          exit( 'Unable to fetch the movie details page' );
      }
      $detailsPage = utf8_encode( $rawDetailsPage );

      // title
      if ( preg_match( '|<h1 class="TitleFilm">([^<]+)</h1>|', $detailsPage, $titleMatch ) )
      {
          $movieData['title'] = $titleMatch[1];
      }

      // original title, defaulting to the title (null when that is unknown too)
      if ( preg_match( '|<h3 class="SpProse">Titre original : <i>([^<]+)</i></h3>|', $detailsPage, $originalTitleMatch ) )
      {
          $movieData['original-title'] = $originalTitleMatch[1];
      }
      else
      {
          $movieData['original-title'] = isset( $movieData['title'] ) ? $movieData['title'] : null;
      }

      // release date
      if ( preg_match( '|<h4>Date de sortie : <b>(.*?)</b>|', $detailsPage, $releaseMatch ) )
      {
          $movieData['release-date'] = $releaseMatch[1];
      }

      // The sections below are cut out of the page between a text marker and the next
      // closing tag. A missing start marker skips the section; a missing end tag gives
      // an empty section because (int)false is 0. Offsets use strlen() of the marker so
      // multibyte characters in it are counted in bytes, like strpos() does.

      // genre
      $marker = 'Genre : ';
      $markerPos = strpos( $detailsPage, $marker );
      if ( $markerPos !== false )
      {
          $substring = substr( $detailsPage, $markerPos + strlen( $marker ) );
          $substring = substr( $substring, 0, (int)strpos( $substring, '</h3>' ) );
          if ( preg_match_all( '#<a href="(/film/alaffiche_genre_gen_genre[^"]+)" class="link1">([^<]+)</a>#', $substring, $genreMatches, PREG_SET_ORDER ) )
          {
              foreach ( $genreMatches as $genreMatch )
              {
                  $movieData['genre'][] = $genreMatch[2];
              }
          }
      }

      // director(s)
      $marker = 'Réalisé par ';
      $markerPos = strpos( $detailsPage, $marker );
      if ( $markerPos !== false )
      {
          $substring = substr( $detailsPage, $markerPos + strlen( $marker ) );
          $substring = substr( $substring, 0, (int)strpos( $substring, '</h3>' ) );
          if ( preg_match_all( '#<a[^>]*href="([^"]+)"[^>]*>(.*?)</a>#', $substring, $directorMatches, PREG_SET_ORDER ) )
          {
              foreach ( $directorMatches as $directorMatch )
              {
                  $movieData['directors'][] = array( 'name' => $directorMatch[2], 'url' => $directorMatch[1] );
              }
          }
      }

      // distributed by...
      $marker = 'Distribué par ';
      $markerPos = strpos( $detailsPage, $marker );
      if ( $markerPos !== false )
      {
          $substring = substr( $detailsPage, $markerPos + strlen( $marker ) );
          $substring = substr( $substring, 0, (int)strpos( $substring, '</h3>' ) );
          if ( preg_match_all( '#<a[^>]*href="([^"]+)"[^>]*>(.*?)</a>#', $substring, $distributorMatches, PREG_SET_ORDER ) )
          {
              foreach ( $distributorMatches as $distributorMatch )
              {
                  $movieData['distributed-by'][] = array( 'name' => $distributorMatch[2], 'url' => $distributorMatch[1] );
              }
          }
      }

      // runtime, stored in minutes
      if ( preg_match( '#Durée : ([^\.]+).#', $detailsPage, $runtimeMatches ) )
      {
          // sscanf() leaves a field it could not read as null; the casts turn that into 0.
          // NOTE(review): a value without an hours part (e.g. "45min") would be read as
          // hours - confirm the site never emits that form.
          sscanf( $runtimeMatches[1], '%dh %dmin', $hours, $minutes );
          $movieData['runtime'] = (int)$hours * 60 + (int)$minutes;
      }

      // synopsis: text of the first <h4> following the "Synopsis" heading
      $movieData['synopsis'] = '';
      $marker = '<h2 class="SpBlocTitle" >Synopsis</h2>';
      $markerPos = strpos( $detailsPage, $marker );
      if ( $markerPos !== false )
      {
          $substring = substr( $detailsPage, $markerPos + strlen( $marker ) );
          $textStart = strpos( $substring, '<h4>' );
          if ( $textStart !== false )
          {
              $substring = substr( $substring, $textStart + strlen( '<h4>' ) );
              $substring = substr( $substring, 0, (int)strpos( $substring, '</h4>' ) );
              // NOTE(review): the synopsis is decoded back from UTF-8 while the other
              // fields are not - kept as is, confirm it is intended.
              $movieData['synopsis'] = utf8_decode( $substring );
          }
      }
      unset( $substring );

      // images
      $imagesPage = utf8_encode( (string)file_get_contents( 'http://www.allocine.fr/film/galerievignette_gen_cfilm=' . $movieID . '.html' ) );
      if ( preg_match( '#<img id=\'imgNormal\' class=\'photo\' src=\'([^\']+)\'#', $imagesPage, $coverMatch ) )
      {
          $movieData['cover'] = $coverMatch[1];
      }

      // trailers: only looked up when the details page links to a video player
      if ( preg_match( '#<a href="/video/player_gen_cmedia=([0-9]+)&cfilm=[0-9]+\.html" class="link5">#', $detailsPage ) )
      {
          $videoListURL = "http://www.allocine.fr/webtv/film.html?cfilm={$movieID}";
          $trailersListPage = utf8_encode( (string)file_get_contents( $videoListURL ) );
          if ( preg_match_all( '#<a href="(acvision\.asp\?cvid=[0-9]+)" [^>]+>([^<]+)</a>#', $trailersListPage, $trailersMatches, PREG_SET_ORDER ) )
          {
              foreach ( $trailersMatches as $trailerMatch )
              {
                  // Only entries whose label contains "annonce" are kept.
                  $trailerLabel = strtolower( utf8_decode( $trailerMatch[2] ) );
                  if ( strpos( $trailerLabel, 'annonce' ) === false )
                  {
                      continue;
                  }

                  $trailerURL = 'http://www.allocine.fr/webtv/' . $trailerMatch[1] . '&nopub=1&player=ASF';

                  // Try the "HD" variant first and fall back to "H"
                  // (presumably stream quality levels - TODO confirm).
                  $trailerFileURL = false;
                  foreach ( array( 'HD', 'H' ) as $debit )
                  {
                      $trailerHTML = utf8_encode( (string)file_get_contents( $trailerURL . '&debit=' . $debit ) );
                      $trailerFileURL = GetTrailerFileURL( $trailerHTML );
                      if ( $trailerFileURL )
                      {
                          break;
                      }
                  }
                  if ( !$trailerFileURL )
                  {
                      continue;
                  }

                  $movieData['trailers'][] = array( 'name' => $trailerLabel, 'url' => $trailerFileURL );
              }
          }
      }

      // casting
      $movieData['actors'] = array();
      $castingURL = "http://www.allocine.fr/film/casting_gen_cfilm={$movieID}.html";
      $castingHTML = utf8_encode( (string)file_get_contents( $castingURL ) );
      $marker = '<h2 class="SpProse" style="color: #D20000; font-weight:bold;">Acteurs</h2>';
      $markerPos = strpos( $castingHTML, $marker );
      if ( $markerPos !== false )
      {
          $substring = substr( $castingHTML, $markerPos + strlen( $marker ) );
          $substring = substr( $substring, 0, (int)strpos( $substring, '</table' ) );
          if ( preg_match_all( '#<h5><a href="(/personne/fichepersonne_gen_cpersonne=[0-9]+\.html)" class="link1">([^<]+)</a></h5>#', $substring, $actorsMatches, PREG_SET_ORDER ) )
          {
              // Roles are the <h5> cells holding plain text; they are paired with the
              // actors by position.
              preg_match_all( '#<h5>([^<]+)</h5>#', $substring, $actorsRolesMatches );
              foreach ( $actorsMatches as $key => $actorMatch )
              {
                  $movieData['actors'][] = array(
                      'link' => $actorMatch[1],
                      'name' => $actorMatch[2],
                      // Fewer roles than actors may have been found.
                      'role' => isset( $actorsRolesMatches[1][$key] ) ? $actorsRolesMatches[1][$key] : null,
                  );
              }
          }
      }

      echo "<b>MOVIE DATA</b><br /><pre>";
      print_r( $movieData );
      echo "</pre>";
  }
  /**
   * Resolves the trailer file URL from the HTML of a trailer player page.
   *
   * Reads the player's <PARAM name="URL"> value, downloads the document it points
   * to, takes the mms:// reference found there and rewrites it into an
   * "http://h.fr.mediaplayer.allocine.fr..." URL with a .flv extension.
   *
   * @param string $trailerHTML HTML of the trailer player page
   * @return string|false the rewritten trailer URL, or false when the player
   *                      parameter, the generated document, the mms:// reference
   *                      or the expected host cannot be found
   */
  function GetTrailerFileURL( $trailerHTML )
  {
      if ( !preg_match( '|<PARAM name="URL" value="([^"]+)">|', $trailerHTML, $trailerGeneratorMatch ) )
      {
          return false;
      }

      // get generation page to get the MMS URL
      $generationHTML = file_get_contents( $trailerGeneratorMatch[1] );
      if ( $generationHTML === false )
      {
          return false;
      }
      $generationHTML = utf8_encode( $generationHTML );

      if ( !preg_match( '|<REF HREF = "(mms://[^"]+)" />|', $generationHTML, $generationMatch ) )
      {
          return false;
      }

      // Everything before the media host is dropped. Without that host the rewrite
      // below would produce a meaningless URL, so report failure instead.
      $trailerURL = $generationMatch[1];
      $hostStart = strpos( $trailerURL, 'mediaplayer.allocine.fr' );
      if ( $hostStart === false )
      {
          return false;
      }

      $trailerURL = substr( $trailerURL, $hostStart );
      $trailerURL = str_replace( '.wmv', '.flv', $trailerURL );

      return 'http://h.fr.' . $trailerURL;
  }
  201. ?>