PageRenderTime 26ms CodeModel.GetById 20ms RepoModel.GetById 0ms app.codeStats 0ms

/t2tx.php

https://github.com/wolli/t2tx
PHP | 452 lines | 338 code | 30 blank | 84 comment | 38 complexity | c3678ba58b6df211784617bf011e6f47 MD5 | raw file
  1. #!/usr/bin/php
  2. <?php
  3. /**
  4. * t2tx - split XHTML pages created with txt2tags into multi-page "books"
  5. *
  6. * @author Peeter P. Mõtsküla <peeterpaul@motskula.net>
  7. * @copyright (c) 2010 Peeter P. Mõtsküla
  8. * @version 1.1
  9. * @license New BSD license (http://opensource.org/licenses/bsd-license.php)
  10. *
  11. * Usage: see main block below or just call t2tx without parameters
  12. *
  13. * Notes / to do:
  14. * * internal hyperlinks are not recalculated.
  15. *
  16. * The script contains two classes (Book and Chapter), a simple error-handling
  17. * routine (error) and a main block that takes the arguments from the command
  18. * line, creates a new Book from the input file, and saves the Book.
  19. */
  // Report every diagnostic. -1 enables all levels on every PHP version and
  // avoids referencing E_STRICT, which is deprecated in current PHP releases.
  error_reporting(-1);
  /*
   * global constants
   */
  // Program identification string.
  define('t2tx_PROGID',
      't2tx 1.1 by Peeter P. Mõtsküla <peeterpaul@motskula.net>');
  // Messages carried by the exceptions the script throws.
  define('t2tx_FNFORNOHTML', 'not html or file not found');
  define('t2tx_NOT2TXHTML', 'not txt2tags-generated xhtml');
  define('t2tx_NOBODY', 'empty document body');
  define('t2tx_BADSPLITLEVEL', 'invalid splitLevel');
  define('t2tx_BADTOCLEVEL', 'invalid tocLevel');
  define('t2tx_BADCHAPTER', 'invalid chapter content');
  define('t2tx_NOCHAPTERS', 'no chapters found');
  define('t2tx_BADSECTION', 'invalid section specified');
  /**
   * Chapter of a book.
   *
   * Wraps one HTML snippet and lazily derives, then caches, three facts about
   * it: the level of its first heading, the content of that heading, and the
   * name of an anchor tag sitting at the very start of the snippet.
   */
  class Chapter {
      /** @var string HTML snippet given to the constructor */
      protected $_body;
      /** @var int|null cached heading level, NULL until level() has run */
      protected $_level;
      /** @var string|null cached heading content, NULL until title() has run */
      protected $_title;
      /** @var string|null cached anchor name, NULL until anchor() has run */
      protected $_anchor;

      /**
       * @param string $body HTML snippet containing the chapter body
       */
      public function __construct($body) {
          $this->_body = $body;
      }

      /**
       * @return string HTML snippet containing the chapter body
       */
      public function body() {
          return $this->_body;
      }

      /**
       * @return int chapter level -- the level (1..6) of the first heading,
       *             or 0 when the snippet contains no heading
       */
      public function level() {
          if ($this->_level === null) {
              // cast so callers really get the documented int, not "1".."6"
              $this->_level = preg_match('#<h([1-6])#', $this->_body, $matches)
                  ? (int) $matches[1]
                  : 0;
          }
          return $this->_level;
      }

      /**
       * @return string chapter title -- the content of the first heading,
       *                or '' when the snippet contains no heading
       */
      public function title() {
          if ($this->_title === null) {
              // [^>]* keeps the attribute part inside the opening tag;
              // the s flag lets the heading content span several lines
              $this->_title = preg_match('#<h[1-6][^>]*>(.*?)</h[1-6]>#s',
                  $this->_body, $matches)
                  ? $matches[1]
                  : '';
          }
          return $this->_title;
      }

      /**
       * @return string name of the empty anchor tag the snippet starts with
       *                (the one immediately preceding the first heading),
       *                or '' when the snippet does not start with one
       */
      public function anchor() {
          // NULL check (not truthiness) so that a chapter without an anchor
          // is cached as '' instead of being re-scanned on every call
          if ($this->_anchor === null) {
              // [^>] / [^"] confine the match to the first tag; an unbounded
              // ".*?" could run on and pick up the name of a later anchor
              $this->_anchor = preg_match(
                  '#^<a\b[^>]*?\sname="([^"]*)"[^>]*></a>#',
                  $this->_body, $matches)
                  ? $matches[1]
                  : '';
          }
          return $this->_anchor;
      }
  }
  /**
   * Book containing the chapters.
   *
   * Built from a txt2tags-generated XHTML document (a file name or the markup
   * itself). The document body is split into Chapter objects at headings up to
   * the split level; save() writes chapter 0 (preamble plus table of contents)
   * and every further chapter into separate "<bookPath>-<n>.html" files.
   */
  class Book {
      /** @var string complete input document */
      protected $_html;
      /** @var string|null input file name, NULL when built from a markup string */
      protected $_filename;
      /** @var int deepest heading level (1..6) that starts a new chapter */
      protected $_splitLevel;
      /** @var int number of heading levels below the split level shown in the TOC */
      protected $_tocLevel;
      protected $_bookPath;
      protected $_bookName;
      protected $_htmlHead;
      protected $_title;
      protected $_header;
      protected $_body;
      protected $_toc;
      /** @var Chapter[]|null chapter list, NULL until chapters() has run */
      protected $_chapters;

      /**
       * @param string $content input filename or contents thereof
       * @param int $splitLevel
       *     deepest heading to break document into chapters (default 1)
       * @param int $tocLevel
       *     how many sublevels to include into table of contents
       *     (default 1; 0 means all)
       * @throws Exception with message t2tx_BADSPLITLEVEL, t2tx_BADTOCLEVEL,
       *     t2tx_FNFORNOHTML, t2tx_NOT2TXHTML or t2tx_NOBODY
       */
      public function __construct($content = NULL, $splitLevel = 1,
              $tocLevel = 1) {
          // check $splitLevel; strict integer validation matters because the
          // value ends up inside a regular-expression character class
          $split = filter_var($splitLevel, FILTER_VALIDATE_INT,
              array('options' => array('min_range' => 1, 'max_range' => 6)));
          if ($split === false) {
              throw new Exception(t2tx_BADSPLITLEVEL);
          }
          $this->_splitLevel = $split;

          // check $tocLevel; 0 means "all", i.e. as deep as headings can go
          $toc = filter_var($tocLevel, FILTER_VALIDATE_INT,
              array('options' => array('min_range' => 0)));
          if ($toc === false) {
              throw new Exception(t2tx_BADTOCLEVEL);
          }
          $this->_tocLevel = ($toc > 0 ? $toc : 6);

          // do we have a file? (is_file also rules out directories)
          if (is_string($content) && is_file($content)) {
              $html = file_get_contents($content);
              if ($html === false) {
                  throw new Exception(t2tx_FNFORNOHTML);
              }
              $this->_filename = $content;
          } else {
              $html = $content;
          }

          // do we have HTML content?
          if (! is_string($html) ||
                  ! preg_match('#<html.*?>.*</html>#s', $html)) {
              throw new Exception(t2tx_FNFORNOHTML);
          }
          $this->_html = $html;

          // do we have something in document body?
          if (trim($this->body()) === '') {
              throw new Exception(t2tx_NOBODY);
          }

          // extract chapters
          $this->chapters();
      }

      /**
       * @return string full path to input file without .html extension
       *     ('' when the book was built from a markup string)
       */
      public function bookPath() {
          if (! $this->_bookPath) {
              // the cast keeps a missing file name from reaching preg_replace as NULL
              $this->_bookPath = preg_replace('#\.html$#', '',
                  (string) $this->_filename);
          }
          return $this->_bookPath;
      }

      /**
       * @return string name of input file without .html extension
       */
      public function bookName() {
          if (! $this->_bookName) {
              $this->_bookName = preg_replace('#.*/#', '', $this->bookPath());
          }
          return $this->_bookName;
      }

      /**
       * @return string start of input file until the end of <head> tag
       *     with an extra <meta name="generator"... added
       * @throws Exception t2tx_NOT2TXHTML when the input has no </head>
       */
      public function htmlHead() {
          if (! $this->_htmlHead) {
              if (! preg_match('#^.*?</head>#s', $this->_html, $matches)) {
                  throw new Exception(t2tx_NOT2TXHTML);
              }
              // the program id contains "<...>"; it must be escaped because a
              // raw "<" inside an attribute value is not well-formed XHTML
              $meta = '<meta name="generator" content="' .
                  htmlspecialchars(t2tx_PROGID, ENT_QUOTES, 'UTF-8') .
                  "\" />\n";
              // literal replacement: the meta tag goes right after the title line
              $this->_htmlHead = str_replace("</title>\n",
                  "</title>\n" . $meta, $matches[0]);
          }
          return $this->_htmlHead;
      }

      /**
       * @return string book title taken from the <title> of input file
       * @throws Exception t2tx_NOT2TXHTML when the head has no <title>
       */
      public function title() {
          if (! $this->_title) {
              if (! preg_match('#<title>(.*?)</title>#s', $this->htmlHead(),
                      $matches)) {
                  throw new Exception(t2tx_NOT2TXHTML);
              }
              $this->_title = $matches[1];
          }
          return $this->_title;
      }

      /**
       * @return string content of input file's div#header
       * @throws Exception t2tx_NOT2TXHTML when the div is missing
       */
      public function header() {
          if (! $this->_header) {
              if (! preg_match(
                      '#<div class="header" id="header">(.*?)</div>#s',
                      $this->_html, $matches)) {
                  throw new Exception(t2tx_NOT2TXHTML);
              }
              $this->_header = $matches[1];
          }
          return $this->_header;
      }

      /**
       * @return string content of input file's div#body
       * @throws Exception t2tx_NOT2TXHTML when the div, followed by the
       *     txt2tags generator comment, is missing
       */
      public function body() {
          if (! $this->_body) {
              if (! preg_match('#<div\ class="body"\ id="body">' . "\n" .
                      '(.*?)</div>' . "\n\n" .
                      '<!-- xhtml code generated by txt2tags#s',
                      $this->_html, $matches)) {
                  throw new Exception(t2tx_NOT2TXHTML);
              }
              $this->_body = $matches[1];
          }
          return $this->_body;
      }

      /**
       * @return array list of chapters in the book
       */
      public function chapters() {
          if (! is_array($this->_chapters)) {
              $this->_getSections($this->body(), $this->_chapters,
                  $this->_splitLevel);
          }
          return $this->_chapters;
      }

      /**
       * Break input file into chapters
       *
       * Element 0 of the result is whatever precedes the first qualifying
       * heading (possibly an empty snippet); every further element starts at a
       * heading, together with the empty anchor tag directly in front of it.
       *
       * @param string $body content of input file's div#body
       * @param array $chapters chapter list to be populated
       * @param int $splitLevel smallest heading to split input into chapters at
       *     (values above 6 are treated as 6)
       * @return void
       * @throws Exception t2tx_BADCHAPTER when the body cannot be matched
       */
      protected function _getSections($body, &$chapters, $splitLevel = NULL) {
          // set up nextHead;
          if (! $splitLevel) {
              $splitLevel = $this->_splitLevel;
          }
          // headings stop at <h6>; a two-digit number would turn "[1-N]" into
          // a different character class and silently match the wrong tags
          $splitLevel = max(1, min(6, (int) $splitLevel));
          $nextHead = '(<a[^>]+></a>\n)?<h' .
              ($splitLevel == 1 ? '1' : "[1-$splitLevel]");

          // extract chapter 0
          $chapters = array();
          if (! preg_match("#^(.*?)(?=$nextHead|$)#s", $body, $matches)) {
              throw new Exception(t2tx_BADCHAPTER);
          }
          $chapters[] = new Chapter($matches[0]);
          // the match is anchored at the start, so cut exactly that prefix
          // (str_replace would also delete identical text further down)
          $body = trim((string) substr($body, strlen($matches[0])));

          // extract chapters 1..n
          while ($body !== '') {
              if (preg_match("#^($nextHead.*?)(?=$nextHead)#s",
                      $body, $matches)) {
                  $chapters[] = new Chapter($matches[0]);
                  $body = trim((string) substr($body, strlen($matches[0])));
              } else {
                  // no further heading: the rest is the last chapter
                  $chapters[] = new Chapter($body);
                  $body = '';
              }
          }
      }

      /**
       * Create table of contents
       *
       * @return string HTML-formatted table of contents
       * @throws Exception t2tx_NOCHAPTERS when the chapter list is empty
       */
      public function toc() {
          if (! $this->_toc) {
              $chapters = $this->chapters();
              // bail out if no chapters found
              if (! count($chapters)) {
                  throw new Exception(t2tx_NOCHAPTERS);
              }
              // build table of contents
              $level = 1;
              $toc = '<div class="toc" id="toc">' . "\n<ul>\n";
              // the array index is the number save() uses in the file name,
              // so links stay correct even when chapter 0 contains headings
              foreach ($chapters as $chapNum => $chapter) {
                  if ($chapter->level() == 0) {
                      continue;
                  }
                  $this->_getSections($chapter->body(), $sections,
                      $this->_splitLevel + $this->_tocLevel);
                  foreach ($sections as $section) {
                      if ($section->level() == 0) {
                          continue;
                      }
                      // open nested lists until we are as deep as the heading
                      while ($section->level() > $level) {
                          $toc .= str_repeat(" ", $level++) . "<li><ul>\n";
                      }
                      // close nested lists until we are as shallow as the heading
                      while ($section->level() < $level) {
                          $toc .= str_repeat(" ", --$level) . "</ul></li>\n";
                      }
                      $toc .= str_repeat(" ", $level) .
                          '<li><a href="' . $this->bookName() .
                          "-$chapNum.html" .
                          ($section->anchor() ? "#{$section->anchor()}" : '') .
                          "\">{$section->title()}</a></li>\n";
                  }
                  // close last open ul-s
                  while ($level > 1) {
                      $toc .= str_repeat(" ", --$level) . "</ul></li>\n";
                  }
              }
              $toc .= "</ul>\n</div>\n";
              // remove unnecessary li-s around ul-s
              $toc = preg_replace('#</li>(\n +)<li><ul>#s', '$1<ul>', $toc);
              $toc = preg_replace('#(\n +)</ul></li>#s', '$1</ul>$1</li>',
                  $toc);
              $this->_toc = $toc;
          }
          return $this->_toc;
      }

      /**
       * Create chapter-specific navigation bars
       *
       * The bar links to the previous chapter (if any), to page 0 under the
       * book title, and to the next chapter (if any).
       *
       * @param int $section section number (1..n - chapters)
       * @return string HTML-formatted navigation bar for given chapter
       * @throws Exception t2tx_BADSECTION for a number outside 1..n
       */
      public function navbar($section) {
          $lastSection = count($this->chapters()) - 1;
          // do we have a valid section?
          if ($section < 1 || $section > $lastSection ||
                  $section != intval($section)) {
              throw new Exception(t2tx_BADSECTION);
          }
          // build navbar
          $navbar = '<div class="navbar" id="navbar">' . "\n" .
              '<table width="100%"><tr>' . "\n" .
              ' <td align="left" width="5%">';
          if ($section > 1) { // link to previous if exists
              $navbar .= '<a href="' .
                  $this->bookName() . '-' . ($section - 1) . '.html">' .
                  '&lt;&lt;</a>';
          }
          $navbar .= "</td>\n" .
              ' <td align="center" width="90%"><a href="' .
              $this->bookName() . '-0.html">' . $this->title() .
              "</a></td>\n" .
              ' <td align="right" width="5%">';
          if ($section < $lastSection) { // link to next if exists
              $navbar .= '<a href="' .
                  $this->bookName() . '-' . ($section + 1) . '.html">' .
                  '&gt;&gt;</a>';
          }
          $navbar .= "</td>\n</tr></table>\n</div>\n";
          return $navbar;
      }

      /**
       * Save the current book into set of files
       *
       * Page 0 carries the document header, the table of contents and the
       * preamble; every other page carries one chapter between two copies of
       * its navigation bar.
       *
       * @return void
       * @throws Exception when an output file cannot be written
       */
      public function save() {
          foreach ($this->chapters() as $chapNum => $chapter) {
              if ($chapNum == 0) {
                  $content = $this->htmlHead() . "<body>\n" .
                      $this->header() . $this->toc() .
                      '<div class="body" id="body">' . "\n" .
                      $chapter->body() .
                      "</div>\n</body>\n</html>\n";
              } else {
                  $navbar = $this->navbar($chapNum);
                  $content = $this->htmlHead() . "<body>\n" .
                      $navbar .
                      '<div class="body" id="body">' . "\n" .
                      $chapter->body() .
                      "</div>\n" .
                      $navbar .
                      "</body>\n</html>\n";
              }
              $target = $this->bookPath() . "-$chapNum.html";
              // a failed write must not pass for a successfully saved book
              if (file_put_contents($target, $content) === false) {
                  throw new Exception("cannot write output file: $target");
              }
          }
      }
  }
  /**
   * main block
   *
   * Prints the usage text when called with a wrong number of arguments;
   * otherwise builds a Book from the command-line arguments and saves it.
   * Any exception raised on the way is reported on STDERR as a one-line
   * message and the script ends with exit status 1.
   */
  if ($argc < 2 || $argc > 4) {
      // no newline after the last line, same as the text always printed
      echo implode("\n", array(
          't2tx - split XHTML pages created with txt2tags into multi-page "books"',
          'Usage:',
          't2tx docName.html [splitLevel [tocLevel]]',
          'docName.html - to be converted into docName-0.html .. docName-n.html',
          'splitLevel - deepest heading level to split into chapters',
          'tocLevel - how many levels of subchapters to include in TOC',
          'See README.txt for more details.',
      ));
      exit;
  }
  $input = $argv[1];
  $splitLevel = ($argc > 2 ? $argv[2] : 1);
  $tocLevel = ($argc > 3 ? $argv[3] : 1);
  try {
      $book = new Book($input, $splitLevel, $tocLevel);
      $book->save();
  } catch (Exception $e) {
      // show the short message instead of an uncaught-exception stack trace
      fwrite(STDERR, 't2tx: ' . $e->getMessage() . "\n");
      exit(1);
  }
  // chapter 0 is the TOC/preamble page, hence the "- 1"
  echo "Done, ", count($book->chapters()) - 1, " chapters.\n";