/doc/html2wiki.sh

http://cmockery.googlecode.com/ · Shell · 154 lines · 103 code · 1 blank · 50 comment · 0 complexity · c03225fa0b5093f4f8c211c9c6d5b65e MD5 · raw file

  1. #!/bin/bash
  2. #
  3. # Translate really simple html to googlecode.com wiki.
  4. #
  5. # Usage: cat input.html | html2wiki.sh > outputwiki.txt
  6. #
  7. # Most of this script is simple sed substitutions with an awk script to handle
  8. # hierarchical lists.
  9. # Awk program to escape all instances of * outside of <listing></listing>
  10. awk '
  11. BEGIN { in_listing = 0; }
  12. /<[Ll][Ii][Ss][Tt][Ii][Nn][Gg]>/ { in_listing = 1; }
  13. /<\/[Ll][Ii][Ss][Tt][Ii][Nn][Gg]>/ { in_listing = 0; }
  14. /.*/ {
  15. if (in_listing) {
  16. print $0;
  17. } else {
  18. print gensub("*", "`*`", "g", $0)
  19. }
  20. }' | \
  21. # Awk program to convert hierachical unordered and ordered lists into
  22. # googlecode wiki list markup. This is limited to converting very simple
  23. # html lists in the form:
  24. #
  25. # <ul>
  26. # <li>item 1</li>
  27. # ...
  28. # <li>item N</li>
  29. # </ul>
  30. #
  31. # This script also removes leading spaces from all lines outside of <listing>
  32. # sections.
  33. awk '
  34. BEGIN {
  35. list_type_none = 0;
  36. list_type_ordered = 1;
  37. list_type_unordered = 2;
  38. # Number of nested lists.
  39. list_depth = 0;
  40. # Number of items in the list.
  41. list_items[list_depth] = 0;
  42. # Type of list.
  43. list_type[list_depth] = list_type_none;
  44. # Do nott strip whitespace from listing sections.
  45. in_listing = 0;
  46. }
  47. # Generate a string of indent spaces.
  48. function list_indent(indent) {
  49. format = sprintf("%%%ds", indent);
  50. return sprintf(format, "");
  51. }
  52. /<[Ll][Ii][Ss][Tt][Ii][Nn][Gg]>/ { in_listing = 1; }
  53. /<\/[Ll][Ii][Ss][Tt][Ii][Nn][Gg]>/ { in_listing = 0; }
  54. # Process all lines non-blank lines.
  55. /^.*$/ {
  56. # Remove leading white space.
  57. if (!in_listing) {
  58. output_string = gensub(/^ */, "", 1, $0);
  59. } else {
  60. output_string = $0;
  61. }
  62. search_string = output_string
  63. # Replace list tags with googlecode wiki markup.
  64. while (match(search_string, /<[^>]*>/, matches)) {
  65. tag = matches[0];
  66. search_string = substr(search_string,
  67. matches[0, "start"] + matches[0, "length"]);
  68. if (match(tag, /^<[Uu][Ll]>$/)) {
  69. list_depth++;
  70. list_type[list_depth] = list_type_unordered;
  71. list_items[list_depth] = 0;
  72. output_string = gensub(tag, "", 1, output_string);
  73. } else if (match(tag, /^[Oo][Ll]>$/)) {
  74. list_depth++;
  75. list_type[list_depth] = list_type_ordered;
  76. list_items[list_depth] = 0;
  77. output_string = gensub(tag, "", 1, output_string);
  78. } else if (match(tag, /^<\/[Ll][Ii]>$/)) {
  79. output_string = gensub(tag, "", 1, output_string);
  80. } else if (list_depth) {
  81. if (match(tag, /^<[Ll][Ii]>$/)) {
  82. if (list_type[list_depth] == list_type_unordered) {
  83. output_string = gensub(tag, list_indent(list_depth) "* ", 1,
  84. output_string);
  85. } else if (list_type[list_depth] == list_type_ordered) {
  86. output_string = gensub(tag, list_indent(list_depth) "# ", 1,
  87. output_string);
  88. }
  89. } else if (match(tag, /^<\/[Uu][Ll]>$/) ||
  90. match(tag, /^<\/[Ou][Ll]>$/)) {
  91. output_string = gensub(tag, "", 1, output_string);
  92. list_depth --;
  93. }
  94. }
  95. }
  96. # If a list is being parsed then filter blank lines.
  97. if (list_depth == 0 || length(output_string)) {
  98. print output_string
  99. }
  100. }
  101. ' | \
  102. # This sed program translates really simple html into wiki suitable for
  103. # googlecode.com.
  104. #
  105. # Supported tags:
  106. # <p>
  107. # <br>
  108. # <h1>
  109. # <h2>
  110. # <h3>
  111. # <h4>
  112. # <h5>
  113. # <b>
  114. # <i>
  115. # <a href="#.*">.*</a>
  116. # <a href=".*">.*</a>
  117. # <a name=".*'>.*</a>
  118. #
  119. # Supported entities:
  120. # &gt;
  121. # &lt;
  122. #
  123. # Limitations:
  124. # * Anchors must be on a single line and must contain one of either the name or
  125. # href attributes.
  126. # * Href of local anchors (href="#.*") should be set to the name of a heading
  127. # within the document. If the heading contains spaces the href should
  128. # contain underscores.
  129. # * All external links are relative to
  130. # http://cmockery.googlecode.com/svn/trunk/doc/
  131. sed -r '
  132. s@<[Pp]>@\n@g;
  133. s@<[[Bb][Rr]]>@\n@g;
  134. s@</?[Hh]1>@=@g;
  135. s@</?[Hh]2>@==@g;
  136. s@</?[Hh]3>@===@g;
  137. s@</?[Hh]4>@====@g;
  138. s@</?[Hh]5>@====@g;
  139. s@</?[Bb]>@*@g;
  140. s@</?[Ii]>@_@g;
  141. s@<[Ll][Ii][Ss][Tt][Ii][Nn][Gg]>@{{{@g;
  142. s@</[Ll][Ii][Ss][Tt][Ii][Nn][Gg]>@}}}@g;
  143. s@<[Aa].*?href="#(.*)?">(.*)?</[Aa]>@[#\1 \2]@g;
  144. s@<[Aa].*?href="(.*)?">(.*)?</[Aa]>@[http://cmockery.googlecode.com/svn/trunk/doc/\1 \2]@g;
  145. s@<[Aa].*?name="(.*)?">@@g;
  146. s@</[Aa]>@@g;
  147. s@<.*?>@@g;
  148. s@&lt;@<@g;
  149. s@&gt;@>@g;'