PageRenderTime 54ms CodeModel.GetById 19ms RepoModel.GetById 0ms app.codeStats 1ms

/lib/redcloth/base.rb

https://code.google.com/p/rextile/
Ruby | 674 lines | 517 code | 60 blank | 97 comment | 32 complexity | 5db5ff171d2c0566243414ae8bbf572a MD5 | raw file
Possible License(s): BSD-3-Clause
  1. class RedCloth < String
  # Version string of this library build.
  2. VERSION = '3.0.99.0.svn.20060519'
  # Rule list to_html falls back to when called without arguments. Its
  # entries are also used by to_html to look up optional
  # "<entry>_pre_process" / "<entry>_post_process" hook methods.
  3. DEFAULT_RULES = [] # let each class add to this array
  # Rule method names that to_html substitutes for the :textile shortcut.
  4. TEXTILE_RULES = [:refs_textile, :block_textile_table, :block_textile_lists, :block_textile_defs,
  5. :block_textile_prefix, :inline_textile_image, :inline_textile_link,
  6. :inline_textile_code, :inline_textile_span, :glyphs_textile,
  7. :inline_textile_autolink_urls, :inline_textile_autolink_emails]
  # Rule method names that to_html substitutes for the :markdown shortcut.
  8. MARKDOWN_RULES = [:refs_markdown, :block_markdown_setext, :block_markdown_atx, :block_markdown_rule,
  9. :block_markdown_bq, :block_markdown_lists,
  10. :inline_markdown_reflink, :inline_markdown_link]
  # DocBook rule method names. NOTE(review): to_html in this file has no
  # :docbook shortcut, so this list is presumably consumed elsewhere - confirm.
  11. DOCBOOK_RULES = [:refs_docbook, :block_docbook_table, :block_docbook_lists, :block_docbook_simple_lists,
  12. :block_docbook_defs, :block_docbook_prefix, :inline_docbook_image, :inline_docbook_link,
  13. :inline_docbook_code, :inline_docbook_glyphs, :inline_docbook_span,
  14. :inline_docbook_wiki_words, :inline_docbook_wiki_links, :inline_docbook_autolink_urls,
  15. :inline_docbook_autolink_emails]
  # Keyword interpolated around the quick-tag marks when the "escaped"
  # QTAGS regexes are built; ||= keeps a value that was assigned earlier.
  16. @@escape_keyword ||= "redcloth"
  17. #
  18. # Four accessors for setting security restrictions.
  19. #
  20. # This is a nice thing if you're using RedCloth for
  21. # formatting in public places (e.g. Wikis) where you
  22. # don't want users to abuse HTML for bad things.
  23. #
  24. # If +:filter_html+ is set, HTML which wasn't
  25. # created by the Textile processor will be escaped.
  26. #
  27. # If +:filter_styles+ is set, it will also disable
  28. # the style markup specifier. ('{color: red}')
  29. #
  30. # If +:filter_classes+ is set, it will also disable
  31. # class attributes. ('!(classname)image!')
  32. #
  33. # If +:filter_ids+ is set, it will also disable
  34. # id attributes. ('!(classname#id)image!')
  35. #
  36. attr_accessor :filter_html, :filter_styles, :filter_classes, :filter_ids
  37. #
  38. # Accessor for toggling hard breaks.
  39. #
  40. # If +:hard_breaks+ is set, single newlines will
  41. # be converted to HTML break tags. This is the
  42. # default behavior for traditional RedCloth.
  43. #
  44. attr_accessor :hard_breaks
  45. # Accessor for toggling lite mode.
  46. #
  47. # In lite mode, block-level rules are ignored. This means
  48. # that tables, paragraphs, lists, and such aren't available.
  49. # Only the inline markup for bold, italics, entities and so on.
  50. #
  51. # r = RedCloth.new( "And then? She *fell*!", [:lite_mode] )
  52. # r.to_html
  53. # #=> "And then? She <strong>fell</strong>!"
  54. #
  55. attr_accessor :lite_mode
  56. #
  57. # Accessor for toggling span caps.
  58. #
  59. # Textile places `span' tags around capitalized
  60. # words by default, but this wreaks havoc on Wikis.
  61. # If +:no_span_caps+ is set, this will be
  62. # suppressed.
  63. #
  64. attr_accessor :no_span_caps
  65. #
  66. # Establishes the markup precedence.
  67. #
  # Note: to_html assigns @rules on every call (its expanded rule list),
  # so a value stored through this accessor is replaced by the next to_html.
  68. attr_accessor :rules
  69. # Returns a new RedCloth object, based on _string_ and
  70. # enforcing all the included _restrictions_.
  71. #
  72. # r = RedCloth.new( "h1. A <b>bold</b> man", [:filter_html] )
  73. # r.to_html
  74. # #=>"<h1>A &lt;b&gt;bold&lt;/b&gt; man</h1>"
  75. #
  # Switches on every requested restriction by calling its "name=" writer
  # with +true+, then passes +string+ on to String#initialize.
  #
  # string       - the markup source this object holds.
  # restrictions - accessor names such as :filter_html or :lite_mode; a name
  #                with no matching writer makes Object#method raise NameError.
  def initialize( string, restrictions = [] )
    restrictions.each do |restriction|
      setter = method( "#{ restriction }=" )
      setter.call( true )
    end
    super( string )
  end
  80. #
  81. # Generates HTML from the Textile contents.
  82. #
  83. # r = RedCloth.new( "And then? She *fell*!" )
  84. # r.to_html( true )
  85. # #=>"And then? She <strong>fell</strong>!"
  86. #
  # Runs the whole conversion pipeline over a copy of this string and
  # returns the result with leading/trailing whitespace stripped.
  #
  # rules - optional rule names. With no arguments DEFAULT_RULES is used.
  #         :markdown and :textile expand to MARKDOWN_RULES / TEXTILE_RULES;
  #         every other entry is kept as given. The flattened list is
  #         stored in @rules for the block/inline dispatchers.
  #
  # Returns "" immediately when the source string is empty. The receiver
  # itself is never modified; @urlrefs, @shelf, @rules and @pre_list are
  # reset on every call.
  def to_html( *rules )
    rules = DEFAULT_RULES if rules.empty?
    # make our working copy
    text = self.dup
    return "" if text == ""

    @urlrefs = {}
    @shelf = []
    @rules = rules.collect do |rule|
      case rule
      when :markdown
        MARKDOWN_RULES
      when :textile
        TEXTILE_RULES
      else
        rule
      end
    end.flatten

    # standard clean up
    @pre_list = []
    pre_process text
    DEFAULT_RULES.each do |ruleset|
      hook = "#{ ruleset }_pre_process"
      # respond_to?( name, true ) also sees private methods and accepts a
      # String. The former private_methods.include?( "..." ) test compared a
      # String against Symbols on Ruby >= 1.9, so the hook was never called.
      send( hook, text ) if respond_to?( hook, true )
    end
    incoming_entities text
    clean_white_space text

    # start processor
    no_textile text
    rip_offtags text
    hard_break text
    unless @lite_mode
      refs text
      blocks text
    end
    inline text
    smooth_offtags text
    retrieve text
    post_process text
    DEFAULT_RULES.each do |ruleset|
      hook = "#{ ruleset }_post_process"
      send( hook, text ) if respond_to?( hook, true )
    end
    clean_html text if filter_html

    text.strip
  end
  126. #######
  # Everything defined below this marker is private.
  127. private
  128. #######
  129. #
  130. # Regular expressions to convert to HTML.
  131. #
  # Placeholder strings; post_process turns LB into "\n" and removes NB.
  132. LB = "0docbook0line0break0"
  133. NB = "0docbook0no0break0\n\n"
  # Horizontal (<>, <, >, =, runs of parentheses) and vertical (-, ^, ~)
  # alignment markers. pba feeds matches to h_align / v_align.
  134. A_HLGN = /(?:(?:<>|<|>|\=|[()]+)+)/
  135. A_VLGN = /[\-^~]/
  # Attribute fragments, kept as regex source strings so they can be
  # interpolated: "(...)" , "[...]" and "{...}". pba reads them as
  # class/id, lang and style respectively.
  136. C_CLAS = '(?:\([^)]+\))'
  137. C_LNGE = '(?:\[[^\]]+\])'
  138. C_STYL = '(?:\{[^}]+\})'
  # Span fragments: backslash + digits and slash + digits. For 'td'
  # elements pba turns these into colspan / rowspan.
  139. S_CSPN = '(?:\\\\\d+)'
  140. S_RSPN = '(?:/\d+)'
  # Composite fragments: A = alignment (either order), S = spans,
  # C = class/style/lang in three accepted orders.
  141. A = "(?:#{A_HLGN}?#{A_VLGN}?|#{A_VLGN}?#{A_HLGN}?)"
  142. S = "(?:#{S_CSPN}?#{S_RSPN}|#{S_RSPN}?#{S_CSPN}?)"
  143. C = "(?:#{C_CLAS}?#{C_STYL}?#{C_LNGE}?|#{C_STYL}?#{C_LNGE}?#{C_CLAS}?|#{C_LNGE}?#{C_STYL}?#{C_CLAS}?)"
  # Regexp-quoted punctuation sets for use inside character classes
  # (PUNCT in LINK_RE, PUNCT_Q / PUNCT_NOQ in GLYPHS).
  144. PUNCT = Regexp::quote( '!"#$%&\'*+,-./:;=?@\\^_`|~' )
  145. PUNCT_NOQ = Regexp::quote( '!"#$&\',./:;=?@\\`|' )
  146. PUNCT_Q = Regexp::quote( '*-_+^~%' )
  # URL plus trailing punctuation; interpolated into IMAGE_RE as the
  # optional href part.
  147. HYPERLINK = '(\S+?)([^\w\s/;=\?]*?)(?=\s|<|$)'
  # Block-level patterns for tables, lists and definition lists. They are
  # not referenced by the methods visible in this part of the file.
  # NOTE(review): presumably used by the block_* rule methods - confirm.
  148. TABLE_RE = /^(?:caption ?\{(.*?)\}\. ?\n)?^(?:id ?\{(.*?)\}\. ?\n)?^(?:table(_?#{S}#{A}#{C})\. ?\n)?^(#{A}#{C}\.? ?\|.*?\|)(\n\n|\Z)/m
  149. LISTS_RE = /^([#*_0-9]+?#{C} .*?)$(?![^#*])/m
  150. LISTS_CONTENT_RE = /^([#*]+)([_0-9]*)(#{A}#{C}) (.*)$/m
  151. DEFS_RE = /^(-#{C}\s.*?\:\=.*?)$(?![^-])/m
  152. DEFS_CONTENT_RE = /^(-)(#{A}#{C})\s+(.*?):=(.*)$/m
  # Code spans delimited by triple backticks (multi-line) with an optional
  # |lang| marker; groups are: text before, lang, code, text after.
  153. BACKTICK_CODE_RE = /(.*?)
  154. ```
  155. (?:\|(\w+?)\|)?
  156. (.*?[^\\])
  157. ```
  158. (.*?)/mx
  # Same shape for code spans delimited by @ or @@.
  159. CODE_RE = /(.*?)
  160. @@?
  161. (?:\|(\w+?)\|)?
  162. (.*?[^\\])
  163. @@?
  164. (.*?)/x
  # Separator used by block_groups: two or more newlines that are not
  # followed by a space (so indented continuations stay in the same group).
  165. BLOCKS_GROUP_RE = /\n{2,}(?! )/m
  # "tagN(attrs).:cite content" block prefix.
  166. BLOCK_RE = /^(([a-z]+)(\d*))(#{A}#{C})\.(?::(\S+))? (.*)$/
  # Markdown setext (underlined) and atx (# prefixed) headers.
  167. SETEXT_RE = /\A(.+?)\n([=-])[=-]* *$/m
  168. ATX_RE = /\A(\#{1,6}) # $1 = string of #'s
  169. [ ]*
  170. (.+?) # $2 = Header text
  171. [ ]*
  172. \#* # optional closing #'s (not counted)
  173. $/x
  # "text(title)":url links; the inline comments name each capture.
  174. LINK_RE = /
  175. ([\s\[{(]|[#{PUNCT}])? # $pre
  176. " # start
  177. (#{C}) # $atts
  178. ([^"]+?) # $text
  179. \s?
  180. (?:\(([^)]+?)\)(?="))? # $title
  181. ":
  182. ([^\s<]+?) # $url
  183. (\/)? # $slash
  184. ([^\w\/;]*?) # $post
  185. (?=<|\s|$)
  186. /x
  # !src(title)! images with optional alignment, attributes and :href.
  187. IMAGE_RE = /
  188. (<p>|.|^) # start of line?
  189. \! # opening
  190. (\<|\=|\>)? # optional alignment atts
  191. (#{C}) # optional style,class atts
  192. (?:\. )? # optional dot-space
  193. ([^\s(!]+?) # presume this is the src
  194. \s? # optional space
  195. (?:\(((?:[^\(\)]|\([^\)]+\))+?)\))? # optional title
  196. \! # closing
  197. (?::#{ HYPERLINK })? # optional href
  198. /x
  199. # Text markup tags, don't conflict with block tags
  # blk consults this list: a block that starts with an HTML tag is only
  # processed further when the tag name appears here. 'br' is listed twice,
  # which does not affect the include? lookup.
  200. SIMPLE_HTML_TAGS = [
  201. 'tt', 'b', 'i', 'big', 'small', 'em', 'strong', 'dfn', 'code',
  202. 'samp', 'kbd', 'var', 'cite', 'abbr', 'acronym', 'a', 'img', 'br',
  203. 'br', 'map', 'q', 'sub', 'sup', 'span', 'bdo'
  204. ]
  # Quick tags: [ markup mark, HTML tag name, optional :limit flag ].
  205. QTAGS = [
  206. ['**', 'b'],
  207. ['*', 'strong'],
  208. ['??', 'cite', :limit],
  209. ['-', 'del', :limit],
  210. ['__', 'i'],
  211. ['_', 'em', :limit],
  212. ['%', 'span', :limit],
  213. ['+', 'ins', :limit],
  214. ['^', 'sup'],
  215. ['~', 'sub']
  216. ]
  # Replace every entry in place by
  #   [ mark, tag, regex, rtype, escaped_regex ].
  # :limit entries get a regex that needs a non-word character before the
  # opening mark and (as a lookahead) after the closing mark; the others
  # match anywhere and may span lines (/m). The escaped variant is the same
  # pattern with @@escape_keyword placed in front of the opening mark and
  # behind the closing mark.
  217. QTAGS.collect! do |rc, ht, rtype|
  218. rcq = Regexp::quote rc
  219. re =
  220. case rtype
  221. when :limit
  222. /(\W)
  223. (#{rcq})
  224. (#{C})
  225. (?::(\S+?))?
  226. (\S|\S.*?\S)
  227. #{rcq}
  228. (?=\W)/x
  229. else
  230. /(#{rcq})
  231. (#{C})
  232. (?::(\S+))?
  233. (\S|\S.*?\S)
  234. #{rcq}/xm
  235. end
  236. escaped_re =
  237. case rtype
  238. when :limit
  239. /(\W)
  240. (#{@@escape_keyword}#{rcq})
  241. (#{C})
  242. (?::(\S+?))?
  243. (\S|\S.*?\S)
  244. #{rcq}#{@@escape_keyword}
  245. (?=\W)/x
  246. else
  247. /(#{@@escape_keyword}#{rcq})
  248. (#{C})
  249. (?::(\S+))?
  250. (\S|\S.*?\S)
  251. #{rcq}#{@@escape_keyword}/xm
  252. end
  253. [rc, ht, re, rtype, escaped_re]
  254. end
  255. # Elements to handle
  # Each entry is [ regex, replacement, optional toggle ]. pgl applies the
  # entries in this order and skips an entry whose toggle method (for
  # example :no_span_caps) returns a true value, so the order matters:
  # closing quotes are resolved before the catch-all opening quote.
  256. GLYPHS = [
  257. # [ /([^\s\[{(>])?\'([dmst]\b|ll\b|ve\b|\s|:|$)/, '\1&#8217;\2' ], # single closing
  258. [ /([^\s\[{(>#{PUNCT_Q}][#{PUNCT_Q}]*)\'/, '\1&#8217;' ], # single closing
  259. [ /\'(?=[#{PUNCT_Q}]*(s\b|[\s#{PUNCT_NOQ}]))/, '&#8217;' ], # single closing
  260. [ /\'/, '&#8216;' ], # single opening
  261. # [ /([^\s\[{(])?"(\s|:|$)/, '\1&#8221;\2' ], # double closing
  262. [ /([^\s\[{(>#{PUNCT_Q}][#{PUNCT_Q}]*)"/, '\1&#8221;' ], # double closing
  263. [ /"(?=[#{PUNCT_Q}]*[\s#{PUNCT_NOQ}])/, '&#8221;' ], # double closing
  264. [ /"/, '&#8220;' ], # double opening
  265. [ /\b( )?\.{3}/, '\1&#8230;' ], # ellipsis
  266. [ /\b([A-Z][A-Z0-9]{2,})\b(?:[(]([^)]*)[)])/, '<acronym title="\2">\1</acronym>' ], # 3+ uppercase acronym
  267. [ /(^|[^"][>\s])([A-Z][A-Z0-9 ]+[A-Z0-9])([^<A-Za-z0-9]|$)/, '\1<span class="caps">\2</span>\3', :no_span_caps ], # 3+ uppercase caps
  268. [ /(\.\s)?\s?--\s?/, '\1&#8212;' ], # em dash
  269. [ /(^|\s)->(\s|$)/, ' &rarr; ' ], # right arrow
  270. [ /(^|\s)-(\s|$)/, ' &#8211; ' ], # en dash
  271. [ /(\d+) x (\d+)/, '\1&#215;\2' ], # dimension sign
  272. [ /\b ?[(\[]TM[\])]/i, '&#8482;' ], # trademark
  273. [ /\b ?[(\[]R[\])]/i, '&#174;' ], # registered
  274. [ /\b ?[(\[]C[\])]/i, '&#169;' ] # copyright
  275. ]
  # Alignment marker => CSS text-align value (looked up by h_align).
  276. H_ALGN_VALS = {
  277. '<' => 'left',
  278. '=' => 'center',
  279. '>' => 'right',
  280. '<>' => 'justify'
  281. }
  # Alignment marker => CSS vertical-align value (looked up by v_align).
  282. V_ALGN_VALS = {
  283. '^' => 'top',
  284. '-' => 'middle',
  285. '~' => 'bottom'
  286. }
  # Tags whose content must be left alone by the markup rules.
  287. OFFTAGS = /(code|pre|kbd|notextile)/i
  # Used by rip_offtags. Captures: 1 = closing tag, 2 = its name,
  # 3 = opening tag, 4 = its name, 5 = the text that follows, up to the
  # next off-tag or the end of the input.
  288. OFFTAG_MATCH = /(?:(<\/#{ OFFTAGS }>)|(<#{ OFFTAGS }[^>]*>))(.*?)(?=<\/?#{ OFFTAGS }|\Z)/mi
  289. OFFTAG_OPEN = /<#{ OFFTAGS }/
  290. OFFTAG_CLOSE = /<\/?#{ OFFTAGS }/
  # Generic tag matchers; not referenced by the methods visible in this
  # part of the file.
  291. HASTAG_MATCH = /(<\/?\w[^\n]*?>)/m
  292. ALLTAG_MATCH = /(<\/?\w[^\n]*?>)|.*?(?=<\/?\w[^\n]*?>|$)/m
  # Hides every literal ==```== sequence behind a placeholder so that later
  # passes do not treat it as a code fence; post_process puts the backticks
  # back. Mutates +text+ and returns it, or nil when nothing was replaced.
  def pre_process( text )
    escaped_fence = /={2}\`\`\`={2}/
    text.gsub!( escaped_fence, "XXXpreformatted_backticksXXX" )
  end
  # Final clean-up pass over the generated markup (mutates +text+):
  # restores escaped backticks, resolves the LB / NB placeholders, drops
  # <notextile> wrappers and turns the ampersand placeholder written by
  # incoming_entities into &#38;. A closing </div> is appended when
  # @div_atts is set.
  #
  # Returns +text+ when the </div> was appended, nil otherwise.
  def post_process( text )
    [
      [ /XXXpreformatted_backticksXXX/, '```' ],
      [ LB, "\n" ],
      [ NB, "" ],
      [ /<\/?notextile>/, '' ],
      [ /x%x%/, '&#38;' ]
    ].each { |pattern, replacement| text.gsub!( pattern, replacement ) }

    text << "</div>" if @div_atts
  end
  304. # Search and replace for glyphs (quotes, dashes, other symbols)
  # Applies every GLYPHS substitution to +text+ in place, in table order.
  # An entry with a toggle is skipped when the method named by the toggle
  # returns a true value. Returns GLYPHS.
  def pgl( text )
    GLYPHS.each do |pattern, replacement, toggle|
      suppressed = toggle && method( toggle ).call
      text.gsub!( pattern, replacement ) unless suppressed
    end
  end
  311. # Parses attribute lists and builds an HTML attribute string
  # Turns a Textile attribute string into HTML attribute text.
  #
  # text_in - attribute markup such as "(class#id){color:red}[en]<>"; it is
  #           copied first and never modified. nil yields ''.
  # element - when 'td', backslash+digits and slash+digits are read as
  #           colspan / rowspan and a vertical alignment marker is honoured.
  #
  # filter_styles suppresses the {style} part, filter_classes the class and
  # filter_ids the id. Returns a String in which every attribute is
  # preceded by a space (possibly empty).
  def pba( text_in, element = "" )
    return '' unless text_in
    style = []
    text = text_in.dup
    if element == 'td'
      colspan = $1 if text =~ /\\(\d+)/
      rowspan = $1 if text =~ /\/(\d+)/
      style << "vertical-align:#{ v_align( $& ) };" if text =~ A_VLGN
    end

    style << "#{ $1 };" if !filter_styles &&
      text.sub!( /\{([^}]*)\}/, '' )

    # The language runs up to the closing bracket, exactly as C_LNGE
    # defines it. The earlier pattern excluded ')' instead of ']', so a
    # value such as "[x)y]" was accepted by C_LNGE but never extracted here
    # (and its ')' was then misread as right padding).
    lang = $1 if
      text.sub!( /\[([^\]]+?)\]/, '' )

    cls = $1 if
      text.sub!( /\(([^()]+?)\)/, '' )

    # Unpaired parentheses left over after the class was removed act as
    # padding markers: one em per parenthesis.
    style << "padding-left:#{ $1.length }em;" if
      text.sub!( /([(]+)/, '' )

    style << "padding-right:#{ $1.length }em;" if text.sub!( /([)]+)/, '' )

    style << "text-align:#{ h_align( $& ) };" if text =~ A_HLGN

    # "(class#id)" carries both values; split at the first '#'.
    cls, id = $1, $2 if cls =~ /^(.*?)#(.*)$/

    atts = ''
    atts << " style=\"#{ style.join }\"" unless style.empty?
    atts << " class=\"#{ cls }\"" unless cls.to_s.empty? || filter_classes
    atts << " lang=\"#{ lang }\"" if lang
    atts << " id=\"#{ id }\"" if id && !filter_ids

    atts << " colspan=\"#{ colspan }\"" if colspan
    atts << " rowspan=\"#{ rowspan }\"" if rowspan

    atts
  end
  341. #
  342. # Flexible HTML escaping
  343. #
  # Escapes HTML special characters in +str+ in place.
  #
  # mode - :NoQuotes leaves both quote characters alone; :Quotes escapes
  #        double and single quotes; any other value escapes only double
  #        quotes. '&', '<' and '>' are always escaped, '&' first so the
  #        entities written afterwards are not escaped again.
  #
  # Returns the result of the final '>' substitution: +str+ when a '>' was
  # replaced, nil otherwise.
  def htmlesc( str, mode )
    replacements = [ [ '&', '&amp;' ] ]
    replacements << [ '"', '&quot;' ] unless mode == :NoQuotes
    replacements << [ "'", '&#039;' ] if mode == :Quotes
    replacements << [ '<', '&lt;' ]
    replacements.each { |raw, entity| str.gsub!( raw, entity ) }
    str.gsub!( '>', '&gt;' )
  end
  # When hard_breaks is set, replaces a single newline that follows a
  # character with "<br />" - unless the newline is followed by another
  # newline, the end of the text, or a line starting (after spaces) with a
  # list/header-style marker or with '{' / '|'. Mutates +text+.
  #
  # Returns nil when hard_breaks is off or nothing was replaced.
  def hard_break( text )
    return unless hard_breaks
    text.gsub!( /(.)\n(?!\n|\Z| *([#*=]+(\s|$)|[{|]))/, "\\1<br />" )
  end
  # Returns 'o' when a line of +text+ ends in '#', and 'u' otherwise.
  def lT( text )
    if text =~ /\#$/
      'o'
    else
      'u'
    end
  end
  # Marker that blocks() writes in front of every div(...) section; it then
  # splits the text on the stripped form of this marker so each section is
  # processed separately.
  357. BLOCK_GROUP_SPLITTER = "XXX_BLOCK_GROUP_XXX\n\n"
  # Cuts +text+ into "div(class). ..." sections, renders every section with
  # block_groups and replaces +text+ in place with the joined result.
  #
  # text      - the text to process; mutated and returned.
  # deep_code - passed through unchanged to block_groups.
  #
  # @current_class remembers the class of the previous section (also across
  # calls), so consecutive sections with the same class are not wrapped in
  # a new div.
  def blocks( text, deep_code = false )
    @current_class ||= nil

    # Replacement for one "div(klass). body" match: prefix the section with
    # BLOCK_GROUP_SPLITTER and wrap the body in a div only when the class
    # changed and the body is not blank.
    mark_section = lambda do |klass, body|
      block_class = ( @current_class == klass ) ? nil : %{ class=#{klass.inspect}}
      @current_class = klass
      section =
        if body.strip.empty? || block_class.nil?
          body
        else
          textile_p( 'div', block_class, nil, "\n\n#{body.strip}\n\n" )
        end
      BLOCK_GROUP_SPLITTER + section
    end

    # every div(class). section that is followed by another one ...
    text.gsub!( /^div\((.*?)\)\.\s*(.*?)(?=div\([^\)]+\)\.\s*)/m ) { mark_section.call( $1, $2 ) }
    # ... and the very last one
    text.sub!( /div\((.*?)\)\.\s*(.*)/m ) { mark_section.call( $1, $2 ) }

    # with the markers in place, handle each section on its own
    sections = text.strip.split( BLOCK_GROUP_SPLITTER.strip )
    rendered = sections.map { |chunk| block_groups( chunk, deep_code ) }
    text.replace( rendered.join )
  end
  # Splits +text+ at BLOCKS_GROUP_RE, renders each piece with blk and
  # replaces +text+ in place with the pieces joined by a newline.
  # +deep_code+ is handed to blk unchanged. Returns +text+.
  def block_groups( text, deep_code = false )
    rendered = text.split( BLOCKS_GROUP_RE ).map { |group| blk( group, deep_code ) }
    text.replace( rendered.join( "\n" ) )
  end
  380. # Surrounds blocks with paragraphs and shelves them when necessary
  # Renders one block group.
  #
  # text      - the block; it is modified in place in several branches.
  # deep_code - when no block rule applies, wrap the block in
  #             <pre><code> instead of <p>.
  #
  # Returns the rendered block.
  381. def blk( text, deep_code = false )
  # a block that already contains a "<digits>" token (the form of the
  # placeholders written by shelve) is passed through untouched
  382. return text if text =~ /<[0-9]+>/
  # "plain" = does not start with '#', '*', '>' or a space
  383. plain = text !~ /\A[#*> ]/
  384. # skip blocks that are complex HTML
  385. if text =~ /^<\/?(\w+).*>/ and not SIMPLE_HTML_TAGS.include? $1
  386. text
  387. else
  388. # search for indentation levels
  389. text.strip!
  390. if text.empty?
  391. text
  392. else
  393. code_blk = nil
  # Pull out runs of indented lines that are separated from the rest by a
  # blank line. Each run is flushed left and rendered recursively through
  # blocks, with deep_code = plain.
  394. text.gsub!( /((?:\n(?:\n^ +[^\n]*)+)+)/m ) do |iblk|
  395. flush_left iblk
  396. blocks iblk, plain
  # the result of this non-destructive gsub is discarded, so the line has
  # no effect on iblk
  397. iblk.gsub( /^(\S)/, "\\1" )
  # in a plain block the rendered run is removed from the text and kept in
  # code_blk (only the last run survives); otherwise it is put back in place
  398. if plain
  399. code_blk = iblk; ""
  400. else
  401. iblk
  402. end
  403. end
  # Try every block_* rule; a rule counts as applied when it returns a
  # true value.
  404. block_applied = 0
  405. @rules.each do |rule_name|
  406. block_applied += 1 if ( rule_name.to_s.match /^block_/ and method( rule_name ).call( text ) )
  407. end
  # No rule claimed the block: fall back to a code block or a paragraph.
  408. if block_applied.zero?
  409. if deep_code
  410. text = "\t<pre><code>#{ text }</code></pre>\n"
  411. else
  412. text = "\t<p>#{ text }</p>\n"
  413. end
  414. end
  415. # hard_break text
  # always appends a newline, followed by the extracted code block if any
  416. text << "\n#{ code_blk }"
  417. end
  418. return text
  419. end
  420. end
  # Calls every rule in @rules whose name starts with "refs_", in order,
  # passing +text+ to each. Returns @rules.
  def refs( text )
    @rules.each do |rule_name|
      next unless rule_name.to_s.match( /^refs_/ )
      method( rule_name ).call( text )
    end
  end
  # Looks +text+ up (downcased) in @urlrefs and returns the stored entry.
  # When +text+ is nil or there is no entry, returns [text, nil] instead.
  def check_refs( text )
    known = text ? @urlrefs[text.downcase] : nil
    known || [text, nil]
  end
  430. # Puts text in storage and returns its placeholder
  431. # e.g. shelve("some text") => <1>
  # Appends +val+ to @shelf. The returned placeholder is a space followed
  # by the 1-based shelf position in angle brackets.
  def shelve( val )
    @shelf.push( val )
    " <#{ @shelf.size }>"
  end
  436. # Retrieves text from storage using its placeholder
  437. # e.g. retrieve("<1>") => "some text"
  # Walks @shelf in order and replaces every occurrence of each entry's
  # placeholder in +text+ (in place) with the stored value. The block form
  # of gsub! is used so backslash sequences in a stored value are inserted
  # literally. Returns @shelf.
  def retrieve( text )
    @shelf.each_with_index do |stored, idx|
      marker = " <#{ idx + 1 }>"
      text.gsub!( marker ) { stored }
    end
  end
  # Swaps each bare ampersand in +text+ (in place) for the dummy token
  # "x%x%". An ampersand that starts something shaped like an entity -
  # letters, digits or '#' followed by a semicolon - is left alone.
  # Returns +text+, or nil when nothing was replaced.
  def incoming_entities( text )
    bare_ampersand = /&(?![#a-z0-9]+;)/i
    text.gsub!( bare_ampersand, "x%x%" )
  end
  # Normalises whitespace in +text+ (in place), then flushes the whole
  # document left. The substitutions run in the listed order.
  #
  # Returns whatever flush_left returns.
  def clean_white_space( text )
    [
      [ /\r\n/, "\n" ],      # normalize line breaks
      [ /\r/, "\n" ],
      [ /\t/, ' ' ],         # tabs become a space
      [ /^ +$/, '' ],        # blank out lines holding only spaces
      [ /\n{3,}/, "\n\n" ],  # collapse runs of empty lines
      [ /"$/, "\" " ]        # pad a double quote that ends a line
    ].each { |pattern, replacement| text.gsub!( pattern, replacement ) }

    # if entire document is indented, flush
    # to the left side
    flush_left text
  end
  # Removes the common leading indentation from +text+ in place.
  #
  # The common indentation is the smallest number of leading spaces found
  # on any line that contains a non-whitespace character right after its
  # leading spaces. Nothing happens when no line starts with a space, when
  # that minimum is zero, or when no such line exists at all. (The former
  # counting loop never terminated in that last case, e.g. for " ".)
  #
  # Returns +text+ when indentation was removed, nil otherwise.
  def flush_left( text )
    return unless text =~ /^ /

    indent = text.scan( /^( *)\S/ ).map { |groups| groups[0].length }.min
    return if indent.nil? || indent.zero?

    text.gsub!( /^ {#{ indent }}/, '' )
  end
  # Rewrites footnote markers - "[digits]" directly after a word character -
  # in +text+ (in place) as superscript links to "#fn<digits>". One
  # whitespace character after the marker, if present, is preserved.
  # Returns +text+, or nil when no marker was found.
  def footnote_ref( text )
    marker = /\b\[([0-9]+?)\](\s)?/
    link = '<sup><a href="#fn\1">\1</a></sup>\2'
    text.gsub!( marker, link )
  end
  # Moves the content of "off" tags (see OFFTAGS) out of +text+ and into
  # @pre_list, leaving "<redpre#N>" placeholders behind, so the markup
  # rules cannot touch it. smooth_offtags puts the content back.
  #
  # text - mutated in place and returned.
  476. def rip_offtags( text )
  # only bother when the text contains something tag-like
  477. if text =~ /<.*>/
  478. ## strip and encode <pre> content
  # codepre counts currently open off-tags; used_offtags records the
  # distinct tag names opened since the nesting count was last zero
  479. codepre, used_offtags = 0, {}
  480. text.gsub!( OFFTAG_MATCH ) do |line|
  # $3 is set when the match starts with an opening off-tag
  # ($4 = tag name, $5 = the text following the tag)
  481. if $3
  482. offtag, aftertag = $4, $5
  483. codepre += 1
  484. used_offtags[offtag] = true
  # more open tags than distinct names: append this piece (escaped unless
  # inside notextile) to the current @pre_list entry and emit nothing
  485. if codepre - used_offtags.length > 0
  486. htmlesc( line, :NoQuotes ) unless used_offtags['notextile']
  487. @pre_list.last << line
  488. line = ""
  489. else
  # otherwise start a new @pre_list entry and leave a placeholder that
  # carries the entry's index
  490. htmlesc( aftertag, :NoQuotes ) if aftertag and not used_offtags['notextile']
  491. line = "<redpre##{ @pre_list.length }>"
  492. @pre_list << "#{ $3 }#{ aftertag }"
  493. end
  # $1 is set when the match starts with a closing off-tag
  494. elsif $1 and codepre > 0
  495. if codepre - used_offtags.length > 0
  496. htmlesc( line, :NoQuotes ) unless used_offtags['notextile']
  497. @pre_list.last << line
  498. line = ""
  499. end
  500. codepre -= 1 unless codepre.zero?
  501. used_offtags = {} if codepre.zero?
  502. end
  # the block value replaces the matched text
  503. line
  504. end
  505. end
  506. text
  507. end
  # Puts the content stored in @pre_list back into +text+ (in place): every
  # "<redpre#N>" placeholder is replaced by entry N of the list.
  # Returns nil when @pre_list is empty or no placeholder was found.
  def smooth_offtags( text )
    return if @pre_list.empty?

    ## replace <pre> content
    text.gsub!( /<redpre#(\d+)>/ ) { @pre_list[$1.to_i] }
  end
  # Runs the inline rules over +text+ in two passes: first every rule in
  # @rules named "inline_*", then every rule named "glyphs_*", each pass in
  # @rules order.
  def inline( text )
    [/^inline_/, /^glyphs_/].each do |prefix_re|
      @rules.each do |rule_name|
        next unless rule_name.to_s.match( prefix_re )
        method( rule_name ).call( text )
      end
    end
  end
  # Maps a horizontal alignment marker to its H_ALGN_VALS entry; nil for a
  # marker that is not in the table.
  def h_align( text )
    H_ALGN_VALS.fetch( text, nil )
  end
  # Maps a vertical alignment marker to its V_ALGN_VALS entry; nil for a
  # marker that is not in the table.
  def v_align( text )
    V_ALGN_VALS.fetch( text, nil )
  end
  527. # HTML cleansing stuff
  # Default whitelist for clean_html: tag name => attribute names that may
  # be kept. A nil or empty list means the tag is allowed but every
  # attribute is dropped; tags that are not listed are removed entirely.
  528. BASIC_TAGS = {
  529. 'a' => ['href', 'title'],
  530. 'img' => ['src', 'alt', 'title'],
  531. 'br' => [],
  532. 'i' => nil,
  533. 'u' => nil,
  534. 'b' => nil,
  535. 'pre' => nil,
  536. 'kbd' => nil,
  537. 'code' => ['lang'],
  538. 'cite' => nil,
  539. 'strong' => nil,
  540. 'em' => nil,
  541. 'ins' => nil,
  542. 'sup' => nil,
  543. 'sub' => nil,
  544. 'del' => nil,
  545. 'table' => nil,
  546. 'tr' => nil,
  547. 'td' => ['colspan', 'rowspan'],
  548. 'th' => nil,
  549. 'ol' => ['start'],
  550. 'ul' => nil,
  551. 'li' => nil,
  552. 'p' => nil,
  553. 'h1' => nil,
  554. 'h2' => nil,
  555. 'h3' => nil,
  556. 'h4' => nil,
  557. 'h5' => nil,
  558. 'h6' => nil,
  559. 'blockquote' => ['cite']
  560. }
  # Whitelist filter for HTML tags in +text+ (modified in place).
  #
  # text - markup to clean.
  # tags - Hash of lower-case tag name => Array of attribute names to keep
  #        (nil or [] keeps the tag without attributes). Defaults to
  #        BASIC_TAGS.
  #
  # Tags missing from +tags+ are replaced by a single space. Allowed tags
  # are re-emitted with only their whitelisted attributes, always
  # double-quoted. 'src' and 'href' values are kept only when they begin
  # with http:, https: or ftp:. "<![CDATA[" openers are removed.
  #
  # Returns +text+, or nil when the text contained no tags.
  def clean_html( text, tags = BASIC_TAGS )
    text.gsub!( /<!\[CDATA\[/, '' )
    text.gsub!( /<(\/*)(\w+)([^>]*)>/ ) do
      raw = $~
      tag = raw[2].downcase
      if tags.has_key? tag
        pcs = [tag]
        ( tags[tag] || [] ).each do |prop|
          # try the value as double-quoted, single-quoted, then unquoted
          ['"', "'", ''].each do |q|
            q2 = ( q != '' ? q : '\s' )
            if raw[3] =~ /#{prop}\s*=\s*#{q}([^#{q2}]+)#{q}/i
              attrv = $1
              # \A anchors the scheme test to the start of the value. With
              # ^ a value such as "javascript:...\nhttp:" passed, because ^
              # also matches right after a newline.
              next if ( prop == 'src' || prop == 'href' ) && attrv !~ %r{\A(http|https|ftp):}
              # The value is re-emitted inside double quotes, so an
              # embedded '"' must become an entity; a backslash does not
              # escape quotes in HTML and let the value end the attribute.
              pcs << "#{prop}=\"#{attrv.gsub( '"', '&quot;' )}\""
              break
            end
          end
        end
        "<#{raw[1]}#{pcs.join " "}>"
      else
        " "
      end
    end
  end
  # Pattern for bare URLs. Captures: 1 = leading text (an HTML tag, a single
  # character other than = ! : ' " /, or start of line), 2 = "http://",
  # "https://" or "www.", 3 = the rest of the URL (4 = its last repeated
  # segment), 5 = the trailing character.
  # NOTE(review): not referenced by the code visible in this part of the
  # file; presumably used by the autolink rule methods - confirm.
  585. AUTO_LINK_RE = /
  586. ( # leading text
  587. <\w+.*?>| # leading HTML tag, or
  588. [^=!:'"\/]| # leading punctuation, or
  589. ^ # beginning of line
  590. )
  591. (
  592. (?:http[s]?:\/\/)| # protocol spec, or
  593. (?:www\.) # www.*
  594. )
  595. (
  596. ([\w]+[=?&:%\/\.\~\-]*)* # url segment
  597. \w+[\/]? # url tail
  598. (?:\#\w*)? # trailing anchor
  599. )
  600. ([[:punct:]]|\s|<|$) # trailing text
  601. /x
  602. end