PageRenderTime 52ms CodeModel.GetById 20ms RepoModel.GetById 0ms app.codeStats 1ms

/haikubook.php

https://github.com/exmosis/haikuchaos
PHP | 533 lines | 338 code | 117 blank | 78 comment | 58 complexity | 3cf8ee0a1d1b0ba2ed19227ef4c51d00 MD5 | raw file
  1. <?php
  2. /*
  3. * Script to scrape poeet webpages for haiku and turn into a Leanpub book.
  4. *
  5. * Graham Lally
  6. * butterfliesandsand@exmosis.net
  7. * Check the book out at http://leanpub.com/butterflies_and_sand
  8. * Twitter: http://twitter.com/6loss
  9. */
  /*
   * Configuration and working state for the book build.
   *
   * Everything below is read by the top-level script further down the file.
   */

  // Number of titled sections written to the main matter
  define('NO_OF_SECTIONS', 5);
  // Number of haiku pages written for each section
  define('PAGES_PER_SECTION', 3);
  // Upper limit on the number of haiku written to a single page
  define('HAIKU_PER_PAGE', 3);
  // Switch for the addRandomImage() calls made while writing the book
  define('ENABLE_IMAGES', true);
  // When true, generateExaiksis() is called and the file it returns (if any) is added to the book
  define('ENABLE_EXAIKSIS', true);
  // Minimum length, after stripping punctuation, for a word to become a section-title candidate
  define('MIN_SECTION_TITLE_LENGTH', 5);

  // Change this to the manuscript directory for the Leanpub book in Dropbox
  define('OUTPUT_DIR', '/Users/graham/Dropbox/butterflies_and_sand/manuscript/');

  // Directory to take random images from
  define('SRC_IMG_DIR', '/Users/graham/Pictures/Lightoom Exports/Butterflies and Sand/');

  // Directory to store images in for Leanpub
  define('LEANPUB_IMG_DIR', '/Users/graham/Dropbox/butterflies_and_sand/manuscript/images/');

  require_once('_inc/fns__poeet.php');
  require_once('_inc/fns__twitter_archive.php');
  require_once('_inc/fns__exaiksis.php');

  // list of chapters to insert before and after main content
  $pre_chapters = array(
      'about_this_book.txt',
      'reading_this_book.txt'
  );

  $post_chapters = array(
      'on_syllables.txt',
      'thanks.txt'
  );

  // list of haiku to regexp match and skip if found.
  // Each entry is used as the body of a '/.../' pattern, so forward slashes
  // (and any other regex metacharacters meant literally) must be escaped.
  $haiku_to_skip = array(
      'Dusty tag chi shoes \/ a re-birth, the Great Tao makes \/ vacuum cleaner noise.',
      'A baby laughing \/ contains more reality \/ than Radio 4.',
      '{ dew like shadow lies \/ on the haven\'s waterfall \/ summer\'s strange disguise }',
      'Catching my pale breath \/ A red leaf swings on the air \/ Caught in a cobweb\.',
      'Last chance to catch the January haiku.*'
  );

  // Haiku to use as "last of the last" release - all haiku found after this will be listed in
  // the "Latest Haiku" section
  // $last_haiku = 'Big garden, big house / A violent argument / with broken voices.';
  // $last_haiku = 'Sat in the window / a cooling cup of coffee / fills me with stories.';
  // $last_haiku = 'Watching the leaves shake / The wind outside the window / far from a sickbed.';
  // $last_haiku = 'Carcasses and skin / tumbling from the stock pot / without flavour.';
  // $last_haiku = 'Looking for comets / stepping through fallen leaves / among old rain clouds.';
  // $last_haiku = 'The hairs on my arm / among the rose garden plants / wanders a greenfly.';
  $last_haiku = "After the showers / the hitchhikers' cardboard sign / in the recycling.";

  // Pages to scrape content from.
  // NOTE(review): not referenced by the rest of the visible script, which
  // iterates $haiku_sources instead - confirm whether this can be removed.
  $haiku_pages = array(
      'http://poeet.com/e/x/exmosis.html',
      'http://poeet.com/6/l/6loss.html'
  );

  // source types can be:
  // - poeet: scrape a poeet URL page
  // - twitter_archive_csv: read the tweets CSV from a downloaded Twitter archive
  $haiku_sources = array (
      /*array(
          'type' => 'poeet',
          'location' => 'http://poeet.com/e/x/exmosis.html'
      ),
      array(
          'type' => 'poeet',
          'location' => 'http://poeet.com/6/l/6loss.html'
      ),
      */
      array(
          'type' => 'twitter_archive_csv',
          'location' => '/Users/graham/Archive/Backups/tweets_2013-12-22/tweets.csv'
      ),
  );

  // set up variables we're using
  $haiku = array();            // every haiku kept, across all sources
  $recent_haiku = array();     // haiku seen after $last_haiku
  $possible_titles = array();  // candidate section titles taken from haiku words
  $section_titles = array();

  // All generated files are opened with relative names, so the build must run
  // inside the manuscript directory. Stop rather than scatter files elsewhere.
  if (! chdir(OUTPUT_DIR)) {
      echo "Could not change to output directory " . OUTPUT_DIR . " - aborting.\n";
      exit(1);
  }

  $page_i = 0;
  $in_recent = false;          // flips to true once $last_haiku has been seen
  // Go through the list of sources and collect their haiku.
  //
  // For every usable source this fills:
  //   $haiku        - all non-blank haiku that match none of the $haiku_to_skip patterns
  //   $recent_haiku - the kept haiku that come after $last_haiku
  //
  // $src_i is the 1-based number of the source, used only in messages. It is
  // incremented at the top of the loop so that it stays correct when a source
  // is skipped part-way through.
  $src_i = 0;
  foreach ($haiku_sources as $source) {
      $src_i++;

      // Check type is set
      if (! isset($source['type'])) {
          echo "No source type found for source " . $src_i . ":\n";
          print_r($source);
          echo "Skipping.\n\n";
          continue;
      }

      // Final array to store this source's haiku in, ordered in forwards date order (ie. start at earliest)
      $source_haiku_by_date = array();

      switch ($source['type']) {
          case 'poeet':
              // get page from poeet
              if (! isset($source['location'])) {
                  echo "No location URL found for poeet, for source " . $src_i . " - skipping.\n\n";
                  // "continue 2": a bare "continue" inside switch only acts like "break"
                  continue 2;
              }
              $source_haiku_by_date = getPoeetHaiku($source['location']);
              break;

          case 'twitter_archive_csv':
              // Read CSV file in downloaded Twitter archive
              if (! isset($source['location'])) {
                  echo "No file location found for Twitter archive CSV, for source " . $src_i . " - skipping.\n\n";
                  continue 2;
              }
              $source_haiku_by_date = getTwitterArchiveCsvHaiku($source['location']);
              break;
      }

      // Skip if we got nothing back (this also covers unrecognised source types)
      if (! $source_haiku_by_date) {
          echo "No haiku found for source " . $src_i . ":\n";
          print_r($source);
          continue;
      }

      foreach ($source_haiku_by_date as $h) {
          // Only non-blank haiku go into our list
          if (! trim($h)) {
              continue;
          }

          // Check which to skip: each entry of $haiku_to_skip is a regex body
          $remove = false;
          foreach ($haiku_to_skip as $hs) {
              if (preg_match('/' . $hs . '/', $h)) {
                  $remove = true;
                  break;
              }
          }

          if ($remove) {
              echo " Removed: " . $h . "\n";
              continue;
          }

          // Add to our complete list
          $haiku[] = $h;

          // Are we hitting "recent haiku" yet? The marker haiku itself is not
          // included, because the flag is only raised after this check.
          if ($in_recent) {
              $recent_haiku[] = $h;
          }

          // Check for "latest" haiku now
          if ($h == $last_haiku) {
              $in_recent = true;
          }
      }
  }
  echo "COMPLETE HAIKU LIST:\n";
  echo "====================\n\n";
  print_r($haiku);

  /** Finished with scraping content now - start randomising everything **/

  // $content_files becomes Book.txt and $sample_files becomes Sample.txt.
  // ImageSet::addRandomImage() appends to both through "global".
  global $content_files, $sample_files;
  $content_files = array();
  $sample_files = array();

  if ($haiku) {

      // set up random images
      $images = new ImageSet(SRC_IMG_DIR, LEANPUB_IMG_DIR);

      // get possible section titles: every word of every haiku that is long
      // enough once everything except letters and apostrophes is stripped
      foreach ($haiku as $h) {
          if (trim($h)) {
              $h = preg_replace('/\//', ' ', $h);
              $h_words = explode(' ', $h);
              foreach ($h_words as $hw) {
                  $hw = preg_replace('/[^a-zA-Z\']/', '', $hw);
                  if (strlen(trim($hw)) >= MIN_SECTION_TITLE_LENGTH) {
                      $possible_titles[] = ucwords(strtolower(trim($hw)));
                  }
              }
          }
      }
      shuffle($possible_titles);

      // shuffle haiku
      shuffle($haiku);

      /** Start outputting files **/

      $content_files[] = 'frontmatter:';

      // Insert pre-content files
      foreach ($pre_chapters as $pc) {
          $content_files[] = $pc;
          $sample_files[] = $pc;
      }

      $content_files[] = 'mainmatter:';

      // Put text files together - sections, pages, titles
      for ($section_i = 0; $section_i < NO_OF_SECTIONS; $section_i++) {

          // Only write a title page while we still have unused titles
          if (count($possible_titles) > $section_i) {
              $f = fopen($section_i . '_0_title.txt', 'w');
              fwrite($f, '#' . $possible_titles[$section_i] . "\n");
              if (ENABLE_IMAGES) {
                  fwrite($f, $images->addRandomImage() . "\n");
              }
              fclose($f);

              $content_files[] = 'section' . $section_i . ':';
              $content_files[] = $section_i . '_0_title.txt';
              // Only the first section goes into the sample book
              if ($section_i == 0) {
                  $sample_files[] = $section_i . '_0_title.txt';
              }
          }

          // Updated for version 5 December 2012: Switch to starting with 1 haiku per page,
          // increasing haikus per page up to HAIKU_PER_PAGE
          $no_of_haiku = 1;

          // get pages
          for ($page_i = 0; $page_i < PAGES_PER_SECTION; $page_i++) {
              $file = $section_i . '_' . ($page_i + 1) . '_page.txt';
              $f = fopen($file, 'w');
              fwrite($f, "\n{::pagebreak /}\n\n");

              for ($haiku_i = 0; $haiku_i < $no_of_haiku && $haiku_i < HAIKU_PER_PAGE; $haiku_i++) {
                  // Take the next non-blank haiku. The "$haiku &&" guard stops
                  // the loop when the list runs out; without it a list holding
                  // only blanks would spin forever.
                  $h = '';
                  while ($haiku && ! trim($h)) {
                      $h = array_shift($haiku);
                  }
                  if (trim($h)) {
                      // One haiku line per output line ("/" separates lines in the source)
                      $h = preg_replace('/\s*\/\s*/', " \r\n", $h);
                      fwrite($f, $h . "\n\n");
                  }
              }

              fclose($f);

              $content_files[] = $file;
              if ($section_i == 0) {
                  $sample_files[] = $file;
              }

              $no_of_haiku++;
          }
      }

      // Output recent haiku, three to a file
      if (count($recent_haiku) > 0) {
          $rh_page = 1;
          $rh_count = 0;
          $f = fopen('recent_haiku_' . $rh_page . '.txt', 'w');
          $content_files[] = 'recent_haiku_' . $rh_page . '.txt';
          fwrite($f, '#Recent memories' . "\n\n");

          foreach ($recent_haiku as $rh) {
              $h = preg_replace('/\s*\/\s*/', " \r\n", $rh);
              $rh_count++;
              if ($rh_count == 4) {
                  // Close the full page with a page break and start the next file
                  fwrite($f, "\n{::pagebreak /}\n\n");
                  $rh_count = 1;
                  $rh_page++;
                  fclose($f);
                  $f = fopen('recent_haiku_' . $rh_page . '.txt', 'w');
                  $content_files[] = 'recent_haiku_' . $rh_page . '.txt';
              }
              fwrite($f, $h . "\n\n");
          }
          fclose($f);
      }

      // NOTE(review): the returned markdown is discarded and no separate page is
      // requested, so this copies an image without referencing it in the book.
      // Confirm whether addRandomImage(true) was intended.
      if (ENABLE_IMAGES) {
          $images->addRandomImage();
      }

      // Markov text: post whatever haiku are left over to the remote generator
      echo "Markov:\n\n";
      $markov_url = 'http://projects.haykranen.nl/markov/demo/'; // 'http://www.beetleinabox.com/cgi-bin/mkv_short1.cgi';
      $postdata = http_build_query(
          array(
              'input' => implode("\n", $haiku),
              'length' => 500,
              'order' => 5,
              'submit' => 'GO'
          )
      );
      $opts = array('http' => array(
          'method' => 'POST',
          'header' => 'Content-type: application/x-www-form-urlencoded',
          'content' => $postdata
      ));
      $context = stream_context_create($opts);
      $result = file_get_contents($markov_url, false, $context);

      if ($result === false) {
          // Request failed: leave the chapter out rather than publish an empty one
          echo "Could not fetch Markov text from " . $markov_url . " - skipping Markov chapter.\n\n";
      } else {
          // Flatten to one line so the page chrome around the textarea can be
          // cut away with simple patterns, then restore the line breaks
          $result = preg_replace("/\n/", "!!!", $result);
          $result = preg_replace('/^.*<h2>Output text<\/h2>/', '', $result);
          $result = preg_replace('/^!!![^a-zA-Z0-9]*<textarea [^>]*>/', '', trim($result));
          $result = preg_replace('/<\/textarea>.*$/', '', $result);
          $result = preg_replace('/!!!/', " \r\n", trim($result));

          // remove start and end "words"
          // NOTE(review): this branch only runs when the text does NOT start with
          // an alphanumeric character, in which case the pattern matches an empty
          // leading word and the replacement changes nothing. Kept as-is; confirm
          // the intended condition.
          if (! preg_match('/^[a-zA-Z0-9]/', $result)) {
              $result = preg_replace('/^[a-zA-Z0-9]*([^a-zA-Z0-9]+)/', '$1', $result);
          }
          // Drop the trailing fragment from the last full stop onwards when the
          // text does not already end on one
          if (! preg_match('/\.$/', $result)) {
              $result = preg_replace('/\.[^\.]*$/', '', $result);
          }
          $result = trim($result);

          echo $result . "\n\n";

          $f = fopen('markov.txt', 'w');
          fwrite($f, '#Markov Mashup' . "\n\n");
          fwrite($f, $result);
          fclose($f);
          $content_files[] = 'markov.txt';
      }

      if (ENABLE_EXAIKSIS) {
          $exaiksis_file = generateExaiksis();
          if ($exaiksis_file) {
              $content_files[] = $exaiksis_file;
          }
      }

      // Insert post-content files
      foreach ($post_chapters as $pc) {
          $content_files[] = $pc;
      }

      // Final image
      // NOTE(review): as above, the result is discarded - confirm intent.
      if (ENABLE_IMAGES) {
          $images->addRandomImage();
      }

      // Now write out content list
      $f = fopen('Book.txt', 'w');
      fwrite($f, implode("\n", $content_files));
      fclose($f);

      // Output sample book (1st section)
      $f = fopen('Sample.txt', 'w');
      fwrite($f, implode("\n", $sample_files));
      fclose($f);
  }
  /**
   * Hands out the images of a source directory in random order, copying each
   * one into a target directory as content_image_<n>.<ext>.
   */
  class ImageSet {
      // File names (not paths) still available, already shuffled
      public $all_images = array();
      // Not written to by this class
      public $used_images = array();
      // Number used in the name of the next copied image
      public $current_img_i = 0;
      public $src_dir = null;
      public $target_dir = null;

      /**
       * Collects the JPEG and PNG files found directly in $src_img_dir.
       *
       * Declared as __construct(): a method named after the class is no
       * longer treated as the constructor by PHP 8.
       *
       * @param string $src_img_dir    directory to read images from; file names
       *                               are appended to it directly, so it needs a
       *                               trailing slash
       * @param string $target_img_dir directory images are copied into, also
       *                               with a trailing slash
       */
      function __construct($src_img_dir, $target_img_dir) {
          $this->src_dir = $src_img_dir;
          $this->target_dir = $target_img_dir;

          if ($handle = opendir($src_img_dir)) {
              echo "Getting images from $src_img_dir\n";
              while (false !== ($entry = readdir($handle))) {
                  // Only real image files: anything else in the directory
                  // would otherwise be copied and referenced as a ".jpg"
                  if (is_file($this->src_dir . $entry) && preg_match('/\.(jpe?g|png)$/i', $entry)) {
                      $this->all_images[] = $entry;
                  }
              }
              closedir($handle);
          }

          shuffle($this->all_images);
      }

      /**
       * Copies the next image into the target directory.
       *
       * @param bool $separate_page when true, writes content_image_<n>.txt (image
       *                            markdown plus a page break) into the current
       *                            directory and appends it to $content_files
       * @param bool $add_to_sample with $separate_page, also appends that file
       *                            to $sample_files
       * @return string the image markdown when no separate page was requested;
       *                an empty string when a separate page was written, when no
       *                images are left, or when the copy failed
       */
      function addRandomImage($separate_page = false, $add_to_sample = false) {
          global $content_files, $sample_files;

          // Defined up front so every path returns a string
          $return = '';

          if (! $this->all_images) {
              return $return;
          }

          // get next image (it is consumed even if the copy below fails)
          $next_image = array_shift($this->all_images);

          // work out format: PNG keeps its extension, everything else is JPEG
          $ext = preg_match('/png$/', strtolower($next_image)) ? 'png' : 'jpg';

          $target_name = 'content_image_' . $this->current_img_i . '.' . $ext;
          if (! copy($this->src_dir . $next_image, $this->target_dir . $target_name)) {
              return $return;
          }

          $img_markdown = '![](' . 'images/' . $target_name . ')';

          if ($separate_page) {
              $page_file = 'content_image_' . $this->current_img_i . '.txt';
              $f = fopen($page_file, 'w');
              fwrite($f, $img_markdown);
              fwrite($f, "\n{::pagebreak /}\n\n");
              fclose($f);

              $content_files[] = $page_file;
              if ($add_to_sample) {
                  $sample_files[] = $page_file;
              }
          } else {
              $return = $img_markdown;
          }

          $this->current_img_i++;

          return $return;
      }
  }
  /**
   * Minimal first-order Markov chain over words: split text into words, record
   * which words follow which, then walk those links to produce a phrase.
   */
  class clsMarkov {
      // Words of the last text given to makeList(), in order
      public $wordList = array();
      // word => list of distinct words seen directly after it
      public $termTree = array();

      /**
       * Lower-cases $string, strips everything except letters, digits, "/",
       * ".", "-" and whitespace, and splits the rest into words.
       *
       * A word is a run of letters/digits plus at most one directly attached
       * non-space character (so "leaf." stays one token).
       *
       * @param string $string
       * @return array the word list, also stored in $this->wordList
       */
      function makeList($string) {
          $string = strtolower($string);
          // Explicit a-z: the range "A-z" also covers [ \ ] ^ _ and backtick
          $string = preg_replace("/[^a-z0-9\/\.\-\s]/", "", $string);
          // "\S?" rather than "\S" so single-character words are kept
          preg_match_all("/[a-z0-9]+\S?/", $string, $op);
          $this->wordList = $op[0];
          return $this->wordList;
      }

      /**
       * Records, for every word in $this->wordList, the distinct words that
       * directly follow it. Entries already in $this->termTree are kept and
       * extended. The final word gets an entry too, possibly an empty one.
       */
      function buildTree() {
          $words = array_values($this->wordList);
          $total = count($words);

          for ($i = 0; $i < $total; $i++) {
              $word = $words[$i];
              if (! isset($this->termTree[$word])) {
                  $this->termTree[$word] = array();
              }
              // The last word has no successor to record
              if ($i + 1 < $total && ! in_array($words[$i + 1], $this->termTree[$word], true)) {
                  $this->termTree[$word][] = $words[$i + 1];
              }
          }
      }

      /**
       * Builds a phrase starting at $seed by repeatedly picking a random
       * recorded successor of the current word.
       *
       * @param string $seed  starting word (compared in lower case)
       * @param int    $words maximum number of words to append after the seed
       * @return string the phrase, or 'No seed match' when the seed is unknown
       */
      function phraseWriter($seed, $words) {
          $results = $seed = strtolower($seed);

          if (! isset($this->termTree[$seed])) {
              return 'No seed match';
          }

          for ($n = 0; $n < $words; $n++) {
              // Dead end: nothing was ever seen after this word
              if (empty($this->termTree[$seed])) {
                  break;
              }
              $followers = $this->termTree[$seed];
              $seed = $followers[array_rand($followers)];
              $results .= ' ' . $seed;
          }

          return $results;
      }
  }
  415. ?>