/php/webarchive.php

https://github.com/yoya/misc · PHP · 212 lines · 203 code · 8 blank · 1 comment · 9 complexity · b0bf00683b47417f7e9605d78d4cca91 MD5 · raw file

  1. <?php
  /**
   * Print the command-line synopsis followed by one example invocation.
   *
   * Both lines go to standard output, each terminated by PHP_EOL.
   */
  function usage() {
      $help_lines = [
          "Usage: php webarchive.php <get|check> <target_url> # http:// https:// only ",
          "ex) php webarchive.php get http://app.awm.jp",
      ];
      foreach ($help_lines as $help_line) {
          echo $help_line, PHP_EOL;
      }
  }
  // ---- Command-line validation -------------------------------------------
  // Exactly two arguments are required: the method and the target URL.
  // Report the actual problem: the count can be too high as well as too low.
  if ($argc != 3) {
      echo (($argc < 3) ? "too few arguments" : "too many arguments").PHP_EOL;
      usage();
      exit (1);
  }

  // $method selects the mode: "get" downloads snapshots, "check" only
  // compares what is already on disk with the expected counts.
  $method = $argv[1];
  if (! in_array($method, ["check", "get"], true)) {
      echo "method:$method not supported, use check or get".PHP_EOL;
      usage();
      exit (1);
  }

  // Only http:// and https:// targets are accepted. The matched scheme
  // prefix is kept in $permit_protocol because later code strips it from
  // the URL to build the local directory name.
  $targetURL = $argv[2];
  $permit_protocols = ["http://", "https://"];
  $permit_protocol = null;
  foreach ($permit_protocols as $proto) {
      if (strncmp($targetURL, $proto, strlen($proto)) === 0) {
          $permit_protocol = $proto;
          break;
      }
  }
  if (is_null($permit_protocol)) {
      echo "protocol not supported, use http:// or https://".PHP_EOL;
      usage();
      exit (1);
  }
  // ---- Local archive directory and sparkline (calendar) data -------------
  // The sparkline endpoint returns JSON with the capture counts per month.
  $calenderURL = "https://web.archive.org/__wb/sparkline?output=json&url=" . urlencode($targetURL);

  // The directory name is the target URL without its scheme, URL-encoded so
  // that slashes and other special characters cannot escape the directory.
  $pagedir = substr($targetURL, strlen($permit_protocol));
  $pagedir = urlencode($pagedir);
  $calenderFile = "$pagedir/sparkline.json";

  // "check" never creates anything; only "get" may create the directory.
  if (! is_dir($pagedir)) {
      if ($method !== "get") {
          echo "Error: pagedir:$pagedir not found.".PHP_EOL;
          exit (1);
      }
      if (! mkdir($pagedir, 0755)) {
          echo "Can't make directory $pagedir".PHP_EOL;
          exit (1);
      }
  }

  // Download the sparkline JSON once and reuse the local copy afterwards.
  if (! is_file($calenderFile)) {
      if ($method !== "get") {
          echo "Error: calenderFile:$calenderFile not found.".PHP_EOL;
          exit (1);
      }
      echo $calenderURL.PHP_EOL;
      $calenderJSON = file_get_contents($calenderURL);
      // Never cache a failed download: an empty sparkline.json would be
      // picked up by every later run instead of being fetched again.
      if ($calenderJSON === false) {
          echo "Error: can't download $calenderURL".PHP_EOL;
          exit (1);
      }
      if (file_put_contents($calenderFile, $calenderJSON) === false) {
          echo "Error: can't write calenderFile:$calenderFile".PHP_EOL;
          exit (1);
      }
  }
  // ---- Walk every month that has captures ---------------------------------
  echo $calenderFile.PHP_EOL;
  $calenderJSON = file_get_contents($calenderFile);
  $calenderInfo = ($calenderJSON === false) ? null : json_decode($calenderJSON);
  // json_decode() yields null for unreadable or malformed input; stop here
  // with a clear message instead of dereferencing a non-object below.
  if (! is_object($calenderInfo) || ! isset($calenderInfo->years)) {
      echo "Error: calenderFile:$calenderFile is not valid sparkline JSON.".PHP_EOL;
      exit (1);
  }
  $first_ts = isset($calenderInfo->first_ts) ? $calenderInfo->first_ts : "";
  $last_ts = isset($calenderInfo->last_ts) ? $calenderInfo->last_ts : "";
  echo "Range: {$first_ts} => {$last_ts}".PHP_EOL;

  // "years" maps each year to a list of monthly capture counts.
  $years = (array) $calenderInfo->years;
  ksort($years);
  foreach ($years as $year => $months) {
      foreach ($months as $monthIndex => $count) {
          if ($count <= 0) { continue; } // nothing captured in this month
          $month = $monthIndex + 1; // the list is zero-based, months are one-based
          if ($method === "check") {
              $handler = "check_month_archive";
              $ret = check_month_archive($year, $month, $count);
          } else { // "get"
              $handler = "fetch_month_archive";
              $ret = fetch_month_archive($year, $month, $count);
          }
          if (! $ret) {
              // Name the function that actually ran so the message is
              // correct in both modes.
              echo "failed: $handler".PHP_EOL;
              exit (1);
          }
      }
  }
  /**
   * Compare the number of snapshot files stored for one month with the
   * expected count and print the result on a single line.
   *
   * Files are looked up as "<pagedir>/<YYYYMM>????????" using the global
   * $pagedir. Files whose time part is "000000" are not counted; those are
   * the names fetch_redirect() uses for its cached redirect URLs.
   *
   * @param int|string $year   Year of the month to check.
   * @param int|string $month  Month number (one-based).
   * @param int        $count  Expected number of snapshots.
   * @return bool false when a matched file has an unexpected name,
   *              true otherwise (even when the counts differ).
   */
  function check_month_archive($year, $month, $count) {
      global $pagedir;
      echo "Year:{$year} Month:{$month}: count:{$count}";
      $ym = sprintf("%04d%02d", $year, $month);
      $files = glob("$pagedir/$ym"."????????");
      if ($files === false) {
          // glob() reports errors with false; treat that as "no files".
          $files = [];
      }
      $actual_count = 0;
      foreach ($files as $file) {
          // Inspect the file name only and anchor the pattern: the directory
          // part comes from the target URL and may itself contain a run of
          // 14 digits, which an unanchored match on the full path would hit
          // first.
          if (preg_match('/^\d{6}(\d{2})(\d{6})$/', basename($file), $matches) !== 1) {
              echo "unexpected file:$file got.".PHP_EOL;
              return false;
          }
          $hms = $matches[2];
          if ($hms !== "000000") {
              $actual_count++;
          }
      }
      echo "=>$actual_count";
      if ((int) $count !== $actual_count) {
          echo " diff(".($actual_count - $count).")";
      }
      echo PHP_EOL;
      return true;
  }
  /**
   * Download the snapshots of one month.
   *
   * Prints a header line for the month and then hands the whole day range
   * 1-31 to fetch_month_archive_rec(), returning whatever that call returns.
   *
   * @param int|string $year   Year to fetch.
   * @param int|string $month  Month number (one-based).
   * @param int        $count  Number of snapshots expected in the month.
   * @return mixed Result of fetch_month_archive_rec().
   */
  function fetch_month_archive($year, $month, $count) {
      $first_day = 1;
      $last_day = 31;
      echo "Year:" . $year . " Month:" . $month . ": count:" . $count . PHP_EOL;
      $result = fetch_month_archive_rec($year, $month, $first_day, $last_day, $count);
      return $result;
  }
  /**
   * Recursively download snapshots for the days $day_start..$day_end of a
   * month, in binary-search fashion.
   *
   * The middle day of the range is passed to fetch_redirect(); the
   * snapshot URL that comes back is downloaded with fetch_page(), and the
   * search continues on the days to the left and to the right of both the
   * probed day and the found day.
   *
   * @param int|string $year       Year being fetched.
   * @param int|string $month      Month number (one-based).
   * @param int        $day_start  First day of the range (inclusive).
   * @param int        $day_end    Last day of the range (inclusive).
   * @param int        $count      Snapshots still expected in the month.
   * @return int|bool Number of snapshots fetched in this range;
   *                  true when $count reached zero;
   *                  false when no snapshot URL could be determined.
   */
  function fetch_month_archive_rec($year, $month, $day_start, $day_end, $count) {
      // Only an empty range ends the recursion. A one-day range
      // (start == end) still has to be probed, otherwise that day is never
      // looked at.
      if ($day_start > $day_end) {
          echo "day_start:$day_start > day_end:$day_end".PHP_EOL;
          return 0;
      }
      echo "DayRange:$day_start-$day_end count:{$count}".PHP_EOL;
      $day = (int) round(($day_start + $day_end) / 2);
      $foundURL = fetch_redirect($year, $month, $day);
      // The snapshot URL carries a 14-digit timestamp: YYYYMMDDhhmmss.
      if (! is_string($foundURL)
          || preg_match('/web\/((\d{4})(\d{2})(\d{2})\d{6})/', $foundURL, $matches) !== 1) {
          return false;
      }
      list(, $datekey, $foundYear, $foundMonth, $foundDay) = $matches;
      // Compare as integers: the regex captures are strings such as "03",
      // while the caller passes numbers, so a strict comparison of the raw
      // values would always report a mismatch.
      if (((int) $year !== (int) $foundYear) || ((int) $month !== (int) $foundMonth)) {
          echo "outrange found: $year=>$foundYear, $month=>$foundMonth".PHP_EOL;
      }
      $foundDay = (int) $foundDay;
      if (($foundDay < $day_start) || ($day_end < $foundDay)) {
          echo "outrange found: $foundDay < $day_start || $day_end < $foundDay".PHP_EOL;
          return 0;
      }
      fetch_page($foundURL, $datekey);
      $count--;
      if ($count == 0) {
          return true;
      }
      // day_start .. leftDay-1 | leftDay .. rightDay | rightDay+1 .. day_end
      $leftDay = min($day, $foundDay);
      $rightDay = max($day, $foundDay);
      $left_found = fetch_month_archive_rec($year, $month, $day_start, $leftDay - 1, $count);
      if (is_bool($left_found)) {
          return $left_found;
      }
      $count -= $left_found;
      $right_found = fetch_month_archive_rec($year, $month, $rightDay + 1, $day_end, $count);
      if (is_bool($right_found)) {
          return $right_found;
      }
      return 1 + $left_found + $right_found;
  }
  /**
   * Ask the Wayback Machine which snapshot it redirects to for midnight of
   * the given day, and return that redirect URL.
   *
   * The answer is cached in "<pagedir>/<YYYYMMDD>000000"; when that file
   * exists its content is returned without any network access. Uses the
   * globals $targetURL and $pagedir.
   *
   * @param int|string $year   Year to probe.
   * @param int|string $month  Month number (one-based).
   * @param int|float  $day    Day of the month.
   * @return string|false Redirect URL, or false when the request failed or
   *                      the response carried no redirect.
   */
  function fetch_redirect($year, $month, $day) {
      global $targetURL, $pagedir;
      $ymd = sprintf("%02d%02d%02d", $year, $month, $day);
      $datekey = $ymd."000000";
      $pagefile = "$pagedir/$datekey";
      if (is_file($pagefile)) {
          echo "File: $pagefile".PHP_EOL;
          return file_get_contents($pagefile);
      }
      $url = "https://web.archive.org/web/$datekey/$targetURL";
      echo "URL: $url".PHP_EOL;
      $ch = curl_init();
      curl_setopt($ch, CURLOPT_URL, $url);
      curl_setopt($ch, CURLOPT_RETURNTRANSFER, true); // mute curl print
      curl_setopt($ch, CURLOPT_HEADER, true);
      $ret = curl_exec($ch);
      // Collect everything needed from the handle first, so it is closed on
      // the failure path as well as on success.
      $info = curl_getinfo($ch);
      $error = ($ret === false) ? curl_error($ch) : "";
      curl_close($ch);
      if (! $ret) {
          echo "Error: $url".($error !== "" ? " ($error)" : "").PHP_EOL;
          return false;
      }
      $redirect_url = (is_array($info) && isset($info["redirect_url"])) ? $info["redirect_url"] : "";
      if (! is_string($redirect_url) || $redirect_url === "") {
          // Do not cache an empty answer: the cache file would make every
          // later run fail at this day without ever asking again.
          echo "Error: no redirect for $url".PHP_EOL;
          return false;
      }
      file_put_contents($pagefile, $redirect_url);
      return $redirect_url;
  }
  /**
   * Download one snapshot page and store it as "<pagedir>/<datekey>".
   *
   * When the file already exists its content is returned without any
   * network access. Uses the global $pagedir.
   *
   * @param string $url      Snapshot URL to download.
   * @param string $datekey  Timestamp used as the local file name.
   * @return string|false Page content, or false when the download returned
   *                      nothing or the file could not be written.
   */
  function fetch_page($url, $datekey) {
      global $pagedir;
      echo "Url: $url".PHP_EOL;
      $pagefile = "$pagedir/$datekey";
      if (is_file($pagefile)) {
          echo "File: $pagefile".PHP_EOL;
          return file_get_contents($pagefile);
      }
      echo $url.PHP_EOL;
      $ch = curl_init();
      curl_setopt($ch, CURLOPT_URL, $url);
      curl_setopt($ch, CURLOPT_RETURNTRANSFER, true); // mute curl print
      curl_setopt($ch, CURLOPT_HEADER, false);
      $html = curl_exec($ch);
      // Read the error text before closing, and close the handle on the
      // failure path as well as on success.
      $error = ($html === false) ? curl_error($ch) : "";
      curl_close($ch);
      if (! $html) {
          echo "Error: $url".($error !== "" ? " ($error)" : "").PHP_EOL;
          return false;
      }
      if (file_put_contents($pagefile, $html) === false) {
          echo "Error: can't write $pagefile".PHP_EOL;
          return false;
      }
      return $html;
  }