PageRenderTime 41ms CodeModel.GetById 19ms RepoModel.GetById 0ms app.codeStats 0ms

/modules/wiki/shell/scanwikihow.sh.php

http://awarenet.googlecode.com/
PHP | 101 lines | 60 code | 22 blank | 19 comment | 9 complexity | 21da814955477667056ed0bbc1cfa906 MD5 | raw file
Possible License(s): GPL-3.0
  1. <?
  include "../../../setup.php";

  //--------------------------------------------------------------------------------------------------
  //* shell script to scan the wikiHow page index through the mothsorchid.com relay
  //--------------------------------------------------------------------------------------------------
  //+ Repeatedly requests list=allpages from the MediaWiki API (via the relay), prints each batch
  //+ to the console, stores the raw XML response in $scanDir and follows the 'apfrom'
  //+ continuation value until the listing is exhausted or $max requests have been made.

  $scanDir = "../../../data/mwimport/scan/";        // raw API responses are written here
  $step = 50;                                       // number of titles per request (aplimit)
  $startTitle = '';
  $startTitle = 'Differentiate Math Functions';     //temp
  $max = 500000;                                    // upper bound on the number of requests
  $continue = true;
  $totalPages = 0;

  while (true == $continue) {

      //------------------------------------------------------------------------------------------
      //  get and parse next page of results
      //------------------------------------------------------------------------------------------
      //$reference = "Create-a-Text-Input-Using-HTML";     //test data
      $reference = "api.php?action=query&list=allpages&format=xml"
          . "&apfrom=" . urlencode($startTitle)
          . "&aplimit=" . $step;

      // the relay expects the API request base64 encoded in its 'q' argument
      $relayUrl = "http://mothsorchid.com/whrelay.php?p=whp6x&q=" . base64_encode($reference);

      // file_get_contents returns false on failure; stop instead of parsing and saving garbage
      $raw = file_get_contents($relayUrl);
      if (false === $raw) {
          echo "could not fetch: " . $relayUrl . "\n";
          break;
      }

      $data = wiki_shell_expandAllPages($raw);

      //------------------------------------------------------------------------------------------
      //  print to console
      //------------------------------------------------------------------------------------------
      echo ">> " . strtoupper($reference) . "\n";
      foreach ($data['allpages'] as $page) {
          echo "page id: " . $page['id']
              . " namespace: " . $page['ns']
              . " title: " . $page['title'] . "\n";
      }
      echo "next apfrom: " . $data['apfrom'] . "\n";

      // an empty continuation value means the listing is complete
      if ('' == $data['apfrom']) { $continue = false; }

      // a continuation equal to the title just requested would fetch the same batch forever
      if (('' != $data['apfrom']) && ($data['apfrom'] === $startTitle)) {
          echo "continuation did not advance, stopping.\n";
          $continue = false;
      }

      $startTitle = $data['apfrom'];

      //echo str_repeat('-', 79) . "\n";
      //echo "raw data:" . $raw . "\n";
      echo str_repeat('-', 79) . "\n";

      //------------------------------------------------------------------------------------------
      //  save raw listing
      //------------------------------------------------------------------------------------------
      $fileName = $scanDir . "mwi50_" . $kapenta->time() . ".raw";
      if (false === file_put_contents($fileName, $raw)) {
          // scanning without being able to store the result is pointless
          echo "could not write: " . $fileName . "\n";
          break;
      }

      $max--;
      if (0 == $max) { $continue = false; }

      $totalPages += count($data['allpages']);

      //------------------------------------------------------------------------------------------
      //  wait between requests, only when another request will actually be made
      //------------------------------------------------------------------------------------------
      if (true == $continue) {
          $cd = 15;
          echo $totalPages . " scanned (wait $cd) ";
          while ($cd > 0) { sleep(1); $cd--; echo "."; }
      } else {
          echo $totalPages . " scanned (done)";
      }
      echo "\n\n";
  }
  54. //--------------------------------------------------------------------------------------------------
  55. //| utility functions
  56. //--------------------------------------------------------------------------------------------------
  //  Extract the continuation title and the page records from the XML returned by a MediaWiki
  //  'list=allpages' query.
  //
  //  arg: $xml - raw XML response body; null / false / '' yield an empty result
  //  returns: array('apfrom' => next title to continue from, or '' when there is none,
  //                 'allpages' => list of array('id' => ..., 'ns' => ..., 'title' => ...))
  //
  //  Attributes are looked up by name rather than by position, so their order in the tag does
  //  not matter, and their values are XML-unescaped so that titles containing &, quotes or
  //  apostrophes come back as the real title (the caller url-encodes 'apfrom' for the next
  //  request, which would otherwise carry the escaped form).

  function wiki_shell_expandAllPages($xml) {
      $result = array('apfrom' => '', 'allpages' => array());

      $xml = (string)$xml;
      if ('' == $xml) { return $result; }

      // every <allpages ...> and <p ...> tag that carries at least one attribute
      $tags = array();
      preg_match_all('/<(allpages|p)\s+([^>]*)>/', $xml, $tags, PREG_SET_ORDER);

      foreach ($tags as $tag) {
          // split the attribute string into a name => decoded value map
          $attribs = array();
          $pairs = array();
          preg_match_all('/([A-Za-z_][A-Za-z0-9_:.-]*)\s*=\s*"([^"]*)"/', $tag[2], $pairs, PREG_SET_ORDER);
          foreach ($pairs as $pair) {
              $attribs[$pair[1]] = htmlspecialchars_decode($pair[2], ENT_QUOTES);
          }

          if ('allpages' == $tag[1]) {
              // continuation marker; if several are present the last one wins
              if (true == isset($attribs['apfrom'])) { $result['apfrom'] = $attribs['apfrom']; }
              continue;
          }

          // only <p> elements with a pageid are page records (ignores any stray HTML <p> tags)
          if (false == isset($attribs['pageid'])) { continue; }

          $result['allpages'][] = array(
              'id' => $attribs['pageid'],
              'ns' => (true == isset($attribs['ns'])) ? $attribs['ns'] : '',
              'title' => (true == isset($attribs['title'])) ? $attribs['title'] : ''
          );
      }

      return $result;
  }
  78. ?>