/downloaders/Sankakucomplex_Downloader.php

https://github.com/captainkuro/crawler · PHP

<?php
/**
 * Interactive downloader for Sankaku Complex (chan/idol) listings:
 * prompts for a tag listing URL, a save directory, and a page range,
 * then saves the full-resolution image of every post it finds.
 */
class Sankakucomplex_Downloader implements ADownloader {
    private $default_dir;
    private $page_from = 1;
    private $page_to = 10;

    public function display() {
        return 'Sankakucomplex Chan/Idol';
    }
    public function download() {
        $this->default_dir = DConfig::p_folder();
        echo "List URL: ";
        $list_url = trim(fgets(STDIN));
        echo "Save Dir [{$this->default_dir}]: ";
        $dir = trim(fgets(STDIN));
        echo "Page setting [1,10]: ";
        $page_conf = trim(fgets(STDIN));
        $page_conf = explode(',', $page_conf);
        if (!empty($page_conf[0])) {
            $this->page_from = (int)$page_conf[0];
        }
        if (!empty($page_conf[1])) {
            $this->page_to = (int)$page_conf[1];
        }
        $dir = $this->prepare_dir($dir, $list_url);
        $this->collect_images($list_url, $dir);
    }
    private function prepare_dir($dir, $list_url) {
        $dir = $dir ? $dir : $this->default_dir;
        // Name the target folder after the search tags in the listing URL,
        // reduced to filesystem-safe characters.
        $query = parse_url($list_url, PHP_URL_QUERY);
        parse_str($query, $params);
        $tags = isset($params['tags']) ? urldecode($params['tags']) : '';
        $tags = preg_replace('#[^\w \-)(]#', '', $tags);
        $new_dir = rtrim($dir, '/') . '/' . $tags . '/';
        if (!is_dir($new_dir)) {
            mkdir($new_dir);
        }
        return $new_dir;
    }
    private function collect_images($url, $dir) {
        // Post links are relative, so pick the matching host for absolute URLs.
        if (strpos($url, '/idol.') !== false) {
            $base = 'https://idol.sankakucomplex.com';
        } else {
            $base = 'https://chan.sankakucomplex.com';
        }
        $page = $this->page_from;
        $id = ($page - 1) * 24 + 1; // listing pages are assumed to hold 24 posts
        do {
            if ($page > $this->page_to) break;
            $purl = $url . '&page=' . $page;
            echo "$purl\n";
            // Retry the listing page until the rate limiter stops answering 429.
            do {
                $P = new Page($purl, array('become_firefox' => true));
                $T = new Text($P->content());
                sleep(3); // 429 too many requests
            } while ($T->contain('429 Too many requests'));
            // Keep only the links that point to individual posts.
            $a = $T->extract_to_array('href="', '"');
            foreach ($a as $i => $e) {
                $E = new Text($e);
                if (!$E->contain('/post/show')) {
                    unset($a[$i]);
                }
            }
            if (!count($a)) break; // no more posts: stop paging
            foreach ($a as $e) {
                $kurl = $base . $e;
                echo "$kurl\n";
                flush();
                // Retry the post page on 429 as well.
                do {
                    $P = new Page($kurl, array('become_firefox' => true));
                    $T = new Text($P->content());
                    sleep(3); // 429 too many requests
                } while ($T->contain('429 Too many requests'));
                // The full-resolution image is the href on the id=highres anchor.
                $P->go_line('id=highres');
                $img = $P->curr_line()->cut_between('href="', '"');
                /*if ($img->contain('.webm')) {
                    echo "This is WEBM\n";
                } else*/ if ($img->to_s()) {
                    $this->download_if_not_exist($img, $dir, $id);
                    $id++;
                } else {
                    echo "No id=highres\n";
                }
            }
            $page++;
        } while (true);
    }
    private function download_if_not_exist($img, $dir, $id) {
        // Protocol-relative links ("//...") need the scheme prepended.
        if ($img->pos('//') === 0) {
            $src = 'https:' . $img->html_entity_decode()->to_s();
        } else {
            $src = $img->to_s();
        }
        // Output name: zero-padded sequence number plus the original file name.
        $filename = $img->cut_rafter('/')->cut_before('?');
        $outpath = $dir . Text::create($id)->pad(3)->to_s() . '-' . $filename->to_s();
        // Skip files whose hash (file name without extension) is already on disk.
        $hash = $filename->cut_rbefore('.')->to_s();
        if (!in_array($hash, $this->existing_hashes($dir)) && !is_file($outpath)) {
            sleep(3);
            download_it($src, $outpath, "--header=\"User-Agent: Mozilla/5.0 (Windows NT 5.1; rv:6.0.2) Gecko/20100101 Firefox/6.0.2\"");
        }
    }
    private function existing_hashes($dir) {
        $result = array();
        foreach (glob($dir . '*.*') as $f) {
            $filename = Text::create($f);
            $match = $filename->regex_match('/\d+-(\w+)\./');
            if ($match) {
                $result[] = $match[1];
            }
        }
        return $result;
    }
}
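
The class is only the downloader plugin; how it is invoked depends on the crawler's own entry point, which is not part of this file. As a rough sketch, assuming the host script has already loaded the framework pieces the class relies on (ADownloader, DConfig, Page, Text, download_it) and simply runs the chosen downloader:

<?php
// Hypothetical driver, not part of the repository: the crawler presumably
// lists each ADownloader implementation and runs the one the user picks.
require_once 'downloaders/Sankakucomplex_Downloader.php';

$d = new Sankakucomplex_Downloader();
echo $d->display() . "\n"; // menu label: "Sankakucomplex Chan/Idol"
$d->download();            // prompts on STDIN for list URL, save dir, and "from,to" page range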