PageRenderTime 40ms CodeModel.GetById 12ms RepoModel.GetById 0ms app.codeStats 0ms

/djbdns-opendns/dehtml.awk

https://bitbucket.org/kaendfinger/aur-mirror
AWK | 107 lines | 58 code | 0 blank | 49 comment | 0 complexity | 97ec893c2b7198313440290cf36badf8 MD5 | raw file
Possible License(s): LGPL-2.0, Unlicense, AGPL-1.0, BitTorrent-1.0, EPL-1.0, GPL-3.0, BSD-3-Clause, GPL-2.0, MIT, CC-BY-SA-3.0, BSD-2-Clause, MPL-2.0, BSD-3-Clause-No-Nuclear-License-2014, JSON, AGPL-3.0, MPL-2.0-no-copyleft-exception, IPL-1.0, LGPL-2.1, ISC, CC-BY-3.0, WTFPL, 0BSD, CC0-1.0, LGPL-3.0, Cube, Apache-2.0
  1. #dehtml.awk: Removes all HTML tags from file, preliminary to spell check; common
  2. # ampersand "&entities;" are also resolved into single characters.
  3. #
  4. # Typical use:
  5. #
  6. # awk -f dehtml.awk infile.html > outfile.txt
  7. #
  8. # This program is written in the ``awk'' programming language (on Sun systems
  9. # and some others, non-archaic ``awk'' is called ``nawk'', so that ``nawk''
  10. # should be used instead of ``awk''). Also, a freely-redistributable ``awk''
  11. # interpreter called ``gawk'', which is free of the bugs that some of the
  12. # vendor-supplied ``awk''/``nawk'' programs suffer from, is available for most
  13. # platforms, and as source from the FSF GNU project.
  14. #
  15. # This program processes all files on the command line to STDOUT; to process a
  16. # number of files individually, use the iteration mechanism of your shell; for
  17. # example:
  18. #
  19. # for a in *.html ; do awk -f dehtml.awk $a > otherdir/$a ; done
  20. #
  21. # in Unix sh, or:
  22. #
  23. # for %a in (*.htm) do call dehtml %a otherdir\%a
  24. #
  25. # in MS-DOS, where dehtml.bat is the following one-line batch file:
  26. #
  27. # gawk -f dehtml.awk %1 > %2
  28. #
  29. # Copyright H. Churchyard 1994, 1995 -- freely redistributable.
  30. #
  31. # Version 1.0 11/27/94 -- Included in htmlchek 3.0 release.
  32. # Version 1.1 12/6/94 -- Fixed minor bug which could unpredictably cause a
  33. # string such as "&amp;eacute;" to be reduced into a single character;
  34. # added "&shy;". Included in htmlchek 3.01 release.
  35. # Version 1.2 1/12/95 -- No error on `>' outside tag; minor bugfix. Included
  36. # in htmlchek 4.0 release.
  37. #
  38. #This will test the 8-bit-cleanliness of your awk:
  # Build the entity table ``amp''.
  #
  # Each key is later handed to gsub() as a dynamic regular expression by the
  # main rule, and each value is the single character that replaces it.  Values
  # are written as octal escapes for 8-bit characters (the codes match ISO
  # 8859-1), which is why this table exercises the 8-bit-cleanliness of awk.
  #
  # ``[\043]'' is a bracket expression holding `#' (octal 043); presumably it is
  # spelled this way so that a bare `#' never appears in the program text.
  #
  # "&amp;" / "&#38;" are deliberately absent: the main rule resolves them
  # itself, after every entry of this table has been applied.
  BEGIN {
    # Numeric and named forms of the ASCII specials.
    amp["&[\043]32;"] = "\040";  amp["&nbsp;"] = "\040"
    amp["&[\043]34;"] = "\042";  amp["&quot;"] = "\042"
    amp["&[\043]60;"] = "\074";  amp["&lt;"]   = "\074"
    amp["&[\043]62;"] = "\076";  amp["&gt;"]   = "\076"

    # Upper-case accented letters.
    amp["&Agrave;"] = "\300";  amp["&Aacute;"] = "\301";  amp["&Acirc;"]  = "\302"
    amp["&Atilde;"] = "\303";  amp["&Auml;"]   = "\304";  amp["&Aring;"]  = "\305"
    amp["&AElig;"]  = "\306";  amp["&Ccedil;"] = "\307";  amp["&Egrave;"] = "\310"
    amp["&Eacute;"] = "\311";  amp["&Ecirc;"]  = "\312";  amp["&Euml;"]   = "\313"
    amp["&Igrave;"] = "\314";  amp["&Iacute;"] = "\315";  amp["&Icirc;"]  = "\316"
    amp["&Iuml;"]   = "\317";  amp["&ETH;"]    = "\320";  amp["&Ntilde;"] = "\321"
    amp["&Ograve;"] = "\322";  amp["&Oacute;"] = "\323";  amp["&Ocirc;"]  = "\324"
    amp["&Otilde;"] = "\325";  amp["&Ouml;"]   = "\326";  amp["&Oslash;"] = "\330"
    amp["&Ugrave;"] = "\331";  amp["&Uacute;"] = "\332";  amp["&Ucirc;"]  = "\333"
    amp["&Uuml;"]   = "\334";  amp["&Yacute;"] = "\335";  amp["&THORN;"]  = "\336"

    # Lower-case accented letters.
    amp["&szlig;"]  = "\337";  amp["&agrave;"] = "\340";  amp["&aacute;"] = "\341"
    amp["&acirc;"]  = "\342";  amp["&atilde;"] = "\343";  amp["&auml;"]   = "\344"
    amp["&aring;"]  = "\345";  amp["&aelig;"]  = "\346";  amp["&ccedil;"] = "\347"
    amp["&egrave;"] = "\350";  amp["&eacute;"] = "\351";  amp["&ecirc;"]  = "\352"
    amp["&euml;"]   = "\353";  amp["&igrave;"] = "\354";  amp["&iacute;"] = "\355"
    amp["&icirc;"]  = "\356";  amp["&iuml;"]   = "\357";  amp["&eth;"]    = "\360"
    amp["&ntilde;"] = "\361";  amp["&ograve;"] = "\362";  amp["&oacute;"] = "\363"
    amp["&ocirc;"]  = "\364";  amp["&otilde;"] = "\365";  amp["&ouml;"]   = "\366"
    amp["&oslash;"] = "\370";  amp["&ugrave;"] = "\371";  amp["&uacute;"] = "\372"
    amp["&ucirc;"]  = "\373";  amp["&uuml;"]   = "\374";  amp["&yacute;"] = "\375"
    amp["&thorn;"]  = "\376";  amp["&yuml;"]   = "\377"

    # Symbols; the soft hyphen is rendered as a plain `-'.
    amp["&reg;"] = "\256";  amp["&copy;"] = "\251";  amp["&[\043]163;"] = "\243"
    amp["&shy;"] = "-"
  }
  67. #
  68. # Main
  69. #
  70. # Variable ``state'' is one if unresolved `<', zero otherwise.
  71. #
  # Per-record rule: strip tags from $0, resolve entities, print what is left.
  #
  # Globals used:
  #   state    - 1 while a `<' has been seen without its closing `>'; it is
  #              not reset here, so a tag may span several input records.
  #   line     - text collected from this record with the tags removed.
  #   errstr   - diagnostics for this record, printed after the text.
  #   erra/errb- set once the corresponding diagnostic has been queued, so each
  #              is reported at most once per record.
  #   currsrch - index in $0 at which the next search for `<' or `>' starts.
  #   txtbeg   - index in $0 of the first character not yet copied to ``line''.
  #   amp      - entity table filled in by the BEGIN rule.
  {
    line = ""
    errstr = ""
    erra = 0
    errb = 0
    currsrch = 1
    txtbeg = 1

    while (match(substr($0, currsrch), /[<>]/)) {
      # RSTART is relative to the substring, so this lands just past the match.
      currsrch += RSTART
      delim = substr($0, currsrch - 1, 1)

      if (delim == ">") {
        # A `>' with no tag open is ordinary text and is left alone.
        if (state) {
          txtbeg = currsrch
          state = 0
        }
        continue
      }

      if (delim != "<") {
        print "Internal error, ignore"
        continue
      }

      # From here on the delimiter is `<'.
      if (state) {
        if (!erra) {
          errstr = (errstr "&&^Multiple `<' without `>' ERROR!, Ignoring^&&\n")
          erra = 1
        }
        continue
      }

      # A `<' at end of record or followed by blank/tab does not open a tag.
      if ((currsrch > length($0)) || (substr($0, currsrch, 1) ~ /^[ \t]$/)) {
        if (!errb) {
          errstr = (errstr "&&^Whitespace after `<': Bad SGML syntax ERROR!, Ignoring^&&\n")
          errb = 1
        }
        continue
      }

      # Genuine tag start: keep any text that precedes the `<'.
      if (currsrch > (txtbeg + 1)) {
        line = (line substr($0, txtbeg, (currsrch - (txtbeg + 1))))
      }
      state = 1
    }

    # End of record: outside a tag, the remainder of the record is text.
    if ((!state) && (txtbeg <= length($0))) {
      line = (line substr($0, txtbeg))
    }

    # Resolve entities only when something entity-shaped is present.
    if (line ~ /&[\043]?[-0-9a-zA-Z.]*;/) {
      for (x in amp) {
        gsub(x, amp[x], line)
        if (line !~ /&/) {
          break
        }
      }
      # Ampersand entities are handled last, after the table has been applied.
      gsub(/&([\043]38|amp);/, "\\&", line)
    }

    # Emit text, or an empty line for an empty record seen outside a tag.
    if ((line) || ((!state) && ($0 ~ /^$/))) {
      if ((!state) || (errstr) || (line ~ /[ \t]$/)) {
        print line
      } else {
        # Record ended inside a tag: no newline is written after the text.
        printf "%s", line
      }
    }

    if (errstr) {
      printf "%s", errstr
    }
  }
  102. #
  103. #Minor bug: &g<X>t; will translate to a `>' character!
  104. #
  105. #
  # After the last record: ``state'' still set means the input ended while a
  # `<' was waiting for its `>'.
  END {
    if (state) {
      print "&&^Was awaiting a `>' ERROR! at END^&&"
    }
  }
  107. ##EOF