PageRenderTime 46ms CodeModel.GetById 21ms RepoModel.GetById 0ms app.codeStats 0ms

/lab/wgettsv

https://bitbucket.org/mattsta/tokyocabinet-reverse-lexicographical-sorting
Ruby | 235 lines | 224 code | 6 blank | 5 comment | 63 complexity | d46acd43f7584d81cb17caafdc630618 MD5 | raw file
Possible License(s): LGPL-2.1
  1. #! /usr/bin/ruby -w
  2. #================================================================
  3. # wgettsv
  4. # Collect WWW resources and generate TSV data
  5. #================================================================
  6. require 'open-uri'
  7. require 'iconv'
  8. require 'kconv'
  9. require 'date'
  10. require 'time'
  11. require 'cgi'
  # Entry point: parse the command line, build the URL filter list and start
  # the crawl.
  #
  # Options are recognized only before the first URL argument:
  #   -allow regex   append an "allow" filter
  #   -deny regex    append a "deny" filter
  #   -max num       value passed to proc as the page-count limit (default 1 << 30)
  #   -lim num       value passed to proc as the page-size limit (default 1 << 20)
  #   -wait num      value passed to proc as the delay between requests (default 0)
  #   -ndf           do not append the built-in deny filters
  # Every remaining argument must be an http:// URL and becomes a seed.
  # Any malformed command line is reported through usage.
  #
  # Returns 0 when proc returns a truthy value and 1 otherwise.
  def main
    seeds = []
    hist = {}
    filters = []
    max = 1 << 30
    lim = 1 << 20
    wait = 0
    ndf = false
    args = ARGV.dup
    until args.empty?
      arg = args.shift
      if seeds.empty? && arg =~ /\A-/
        case arg
        when '-allow', '-deny'
          usage if args.empty?
          begin
            regex = Regexp.new(args.shift)
          rescue RegexpError => e
            # A bad pattern is a command-line mistake, not a crash.
            $stderr.printf("%s: invalid regular expression: %s\n", $progname, e.message)
            usage
          end
          filters.push([arg == '-allow', regex])
        when '-max'
          usage if args.empty?
          max = args.shift.to_i
        when '-lim'
          usage if args.empty?
          lim = args.shift.to_i
        when '-wait'
          usage if args.empty?
          wait = args.shift.to_f
        when '-ndf'
          ndf = true
        else
          usage
        end
      elsif arg =~ /\Ahttp:\/\//i
        seeds.push(arg)
        hist[arg] = true
      else
        usage
      end
    end
    usage if seeds.empty?
    unless ndf
      # Built-in deny list, appended after the user's own filters.
      # Every extension group is mandatory so that only real suffixes match.
      [
        /\.(txt|text|asc|c|cc|cxx|cpp|h|hxx|hpp|in)$/i,
        /\.(css|js|csv|tsv|log|md5|crc|conf|ini|inf|lnk|sys|tmp|bak)$/i,
        /\.(xml|xsl|xslt|rdf|rss|dtd|sgml|sgm)$/i,
        /\.(pgp|sig|cer|csr|pem|key|b64|uu|uue|[0-9])$/i,
        /\.(rtf|pdf|ps|eps|ai|doc|xls|ppt|sxw|sxc|sxi|xdw|jtd|oas|swf)$/i,
        /\.(zip|tar|tgz|gz|bz2|tbz2|z|lha|lzh)$/i,
        /\.(7z|lzo|lzma|cpio|shar|cab|rar|sit|ace|hqx)$/i,
        /\.(bin|o|a|so|exe|dll|lib|obj|ocx|class|jar|war)$/i,
        /\.(rpm|deb|qdb|dbx|dbf|dat|msi|bat|com|iso)$/i,
        /\.(png|gif|jpg|jpeg|tif|tiff|bmp|ico|pbm|pgm|ppm|xbm|xpm|dvi)$/i,
        /\.(au|snd|mid|midi|kar|smf|mp2|mp3|m3u|wav|wma|wmp|asx|at3|aif)$/i,
        /\.(mpg|mpeg|qt|mov|avi|wmv|wvx|asf|ram|rm)$/i,
        /\.(tch|tdb|tdf|tct)$/i,
        /\.idx\./i,
        # "LISENCE" is kept for compatibility with the historical spelling.
        /(core|casket|Makefile|README|NEWS|COPYING|LICENSE|LISENCE)($|\/)/i
      ].each { |regex| filters.push([false, regex]) }
    end
    return proc(seeds, hist, filters, max, lim, wait) ? 0 : 1
  end
  # Print the command-line synopsis to standard error and terminate the
  # process with exit status 1. The program name is read from $progname.
  def usage
    $stderr.printf("%s: collect WWW resources and generate TSV data\n", $progname)
    $stderr.printf("\n")
    $stderr.printf("usage:\n")
    # -ndf is accepted by main, so it is listed here as well.
    $stderr.printf(" %s [-allow regex] [-deny regex] [-max num] [-lim num] [-wait num]" +
                   " [-ndf] url ...\n", $progname)
    $stderr.printf("\n")
    exit(1)
  end
  # Crawl the queued URLs in first-in, first-out order and print one TSV
  # record per fetched HTML page to standard output.
  #
  # seeds   - Array of URLs still to fetch; newly found links are appended.
  # hist    - Hash whose keys are the URLs already queued or visited.
  # filters - Array of [allow, regexp] pairs handed to checkurl.
  # max     - stop once this many pages have been recorded.
  # lim     - pages whose reported size exceeds this are rejected.
  # wait    - seconds to sleep after each attempt (skipped when not positive).
  #
  # Each record is "<n>\turl\t<url>" followed by optional size, mtime, title
  # and body columns. Progress and failures are reported on standard error.
  # A page that cannot be fetched or is not HTML is skipped without consuming
  # a record number.
  #
  # NOTE(review): charset conversion still relies on Iconv and Kconv, which the
  # file requires at the top; confirm both are available on the target Ruby.
  #
  # Always returns 0.
  def proc(seeds, hist, filters, max, lim, wait)
    cnt = 0
    while (url = seeds.shift) && cnt < max
      $stderr.printf("%d: getting: %s\n", cnt + 1, url)
      begin
        opts = {}
        OpenURI.open_uri(url, 0, 0, opts) do |sio|
          # Follow redirects: remember the final location as visited too.
          baseuri = sio.base_uri
          if baseuri && baseuri.to_s != url
            url = baseuri.to_s
            hist[url] = true
          end
          size = sio.size
          raise "invalid size" if size > lim || size < 3
          type = sio.content_type || "text/plain"
          str = sio.read
          head = str[0, 2048]
          charset = nil
          # Compare byte values so the check works regardless of how the
          # Ruby version represents string elements.
          bom = head.unpack("C2")
          if bom == [0xfe, 0xff] || bom == [0xff, 0xfe]
            str = Kconv::kconv(str, Kconv::UTF8, Kconv::UTF16)
            charset = "UTF-8"
          elsif str.include?("\0")
            raise "binary data"
          end
          raise "not HTML" if type != "text/html" && head !~ /<html/i && head !~ /<body/i
          if !charset && head =~ /<\?xml.*encoding=["']?([-_a-zA-Z0-9]+)["']/im
            charset = Regexp.last_match(1)
          end
          if !charset && head =~ /<meta.*content-type.*charset=([-_a-zA-Z0-9]+)/im
            charset = Regexp.last_match(1)
          end
          charset = sio.charset if !charset || charset.length < 1
          if charset && charset.length > 0
            if charset !~ /^UTF-?8$/i
              begin
                nstr = Iconv.conv("UTF-8", charset, str)
                str = nstr if nstr && nstr.length > 0
              rescue
                # Unknown or wrong charset label: fall back to guessing.
                str = str.toutf8
              end
            end
          else
            str = str.toutf8
          end
          body = str.sub(/.*<body[^>]*>/im, "")
          body = body.sub(/<\/body>.*/im, "")
          body = htmltotext(body)
          title = ""
          if str =~ /<title[^>]*>([^<]*)<\/title>/im
            title = htmltotext(Regexp.last_match(1))
          end
          title = title[0, 128] if title.length > 128
          mtime = sio.last_modified
          mtime = mtime ? mtime.to_i : 0
          # Emit the whole record with a single write.
          record = [(cnt + 1).to_s, "url", url]
          record.push("size", size.to_s) if size > 0
          record.push("mtime", mtime.to_s) if mtime > 0
          record.push("title", title) if title.length > 0
          record.push("body", body) if body.length > 0
          $stdout.write(record.join("\t") + "\n")
          queuelinks(str, url, seeds, hist, filters)
        end
        cnt += 1
      rescue => e
        $stderr.printf("%d: failed: %s: %s\n", cnt + 1, url, e)
      end
      sleep(wait) if wait > 0
    end
    return 0
  end

  # Queue every not-yet-seen link in str that passes checkurl, resolving each
  # href against url and dropping its fragment. A link that cannot be resolved
  # is skipped on its own, so it cannot abort the page that was just recorded.
  def queuelinks(str, url, seeds, hist, filters)
    str.scan(/<a[^>]*>/im) do |tag|
      next unless tag =~ /href=["']?([^"'>]+)["']?/
      begin
        href = URI::join(url, Regexp.last_match(1)).to_s
      rescue StandardError
        next
      end
      href = href.gsub(/#.*/, "")
      next if hist[href] || !checkurl(href, filters)
      seeds.push(href)
      hist[href] = true
    end
  end
  # Reduce an HTML fragment to a single line of plain text.
  #
  # Steps, in order: drop <style> and <script> elements, turn block-level tags
  # into spaces, remove all remaining tags, decode character references, then
  # collapse every run of whitespace (including the ideographic space) to one
  # space and trim both ends.
  #
  # str - the HTML text. Returns a new String; str is not modified.
  def htmltotext(str)
    str = str.gsub(/<style[^>]*>.*?<\/style>/im, " ")
    str = str.gsub(/<script[^>]*>.*?<\/script>/im, " ")
    str = str.gsub(/<\/?(p|br|div|h1|h2|h3|h4|h5|h6|ul|ol|dl|li|dd|dt|td|th|pre)[^>]*>/im, " ")
    str = str.gsub(/<[^>]*>/, "")
    str = str.gsub(/&(nbsp|#160|#0160|#xa0|#x00a0);/i, " ")
    # Encode a code point as UTF-8; surrogates and values beyond U+10FFFF are
    # not characters and decode to nothing.
    tochar = lambda do |code|
      if code > 0x10FFFF || (code >= 0xD800 && code <= 0xDFFF)
        ""
      else
        [code].pack("U")
      end
    end
    str = str.gsub(/&#?[A-Za-z0-9]+;/) do |ent|
      case ent
      when "&lt;" then "<"
      when "&gt;" then ">"
      when "&amp;" then "&"
      when "&quot;" then '"'
      when "&apos;" then "'"
      when /\A&#x([0-9a-f]+);\z/i then tochar.call(Regexp.last_match(1).hex)
      when /\A&#([0-9]+);\z/ then tochar.call(Regexp.last_match(1).to_i)
      else " " # any other named or malformed reference becomes a blank
      end
    end
    str = str.gsub(/[\x00-\x20]/, " ")
    str = str.gsub(/\xe3\x80\x80/, " ")
    str = str.gsub(/ +/, " ")
    # Only single spaces remain at this point, so trimming them is enough.
    str = str.sub(/\A +/, "")
    str = str.sub(/ +\z/, "")
    return str
  end
  # Decide whether url may be queued for crawling.
  #
  # url     - absolute URL string.
  # filters - Array of [allow, regexp] pairs, applied in order.
  #
  # Only http:// URLs are accepted. With no filters every such URL passes.
  # Otherwise the starting verdict is the opposite of the first filter's kind
  # (a leading "allow" filter means deny by default, and vice versa), and each
  # filter whose regexp matches overrides the verdict, so the last match wins.
  #
  # Returns true or false.
  def checkurl(url, filters)
    # \A anchors at the start of the string; ^ would also match after a
    # newline embedded in the value.
    return false if url !~ /\Ahttp:\/\//i
    return true if filters.empty?
    ok = !filters[0][0]
    filters.each do |allow, regex|
      ok = allow if url =~ regex
    end
    return ok
  end
  # Script entry: write output unbuffered, derive the program name shown in
  # messages from the invocation path, then run main and exit with its result.
  STDOUT.sync = true
  $progname = $0.gsub(/.*\//, "")
  srand
  exit(main)