
/extras/robots-wordlists/process-results.py

http://raft.googlecode.com/
Python | 415 lines | 372 code | 14 blank | 29 comment | 20 complexity | 8fc27ea6cce87cd0ba9926a97aa49fb4 MD5
Possible License(s): GPL-3.0
#
# Author: Gregory Fleischer (gfleischer@gmail.com)
#
# Copyright (c) 2011 RAFT Team
#
# This file is part of RAFT.
#
# RAFT is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# RAFT is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with RAFT. If not, see <http://www.gnu.org/licenses/>.
#
import sys, re
from raftparse import raft_parse_xml
import urllib2
from urllib2 import urlparse
import hashlib
import time

unique_hashes = {}
status_counts = {}
user_agents = {}
sitemaps = {}
robot_mappings = {}

re_comment = re.compile(r'^#\s*(.*)')
re_allow = re.compile(r'^(?:#+\s*)?Allow\s*:\s*(.*)', re.I)
re_disallow = re.compile(r'^(?:#+\s*)?Disallow\s*:\s*(.*)', re.I)
re_disallow_broken = re.compile(r'^(?:Dissalow|Disallow)\s*:?\s*(.*)', re.I)
re_sitemap = re.compile(r'^sitemap\s*:\s*(.*)', re.I)
re_useragent = re.compile(r'^(?:ACAP-crawler|User-agent):\s*(.*)', re.I)
re_experimental = re.compile(r'^(?:#+\s*)?(?:noindex|nofollow|noarchive|nopreview):\s*(.*)', re.I)
re_acap_disallow = re.compile(r'^ACAP-disallow-crawl\s*:\s*(.*)', re.I)
re_acap_allow = re.compile(r'^ACAP-allow-crawl\s*:\s*(.*)', re.I)
re_questionable = re.compile(r'^((?:https?:/)?/[-/a-z0-9_\*]+)\s*$', re.I)
# TODO: could parse out host name values
re_host = re.compile(r'^Host\s*:\s*((?:\w+\.)*\w+)', re.I)
re_ignore = re.compile(r'^(?:Visit-time|Request-rate|Crawl-delay)\s*:\s*', re.I)
re_words_splitter = re.compile(r'[^-a-z0-9_.]+', re.I)
re_domain = re.compile(r'(?:^|/)(?:\w+\.)+(?:AC|AD|AE|AERO|AF|AG|AI|AL|AM|AN|AO|AQ|AR|ARPA|AS|ASIA|AT|AU|AW|AX|AZ|BA|BB|BD|BE|BF|BG|BH|BI|BIZ|BJ|BL|BM|BN|BO|BQ|BR|BS|BT|BV|BW|BY|BZ|CA|CAT|CC|CD|CF|CG|CH|CI|CK|CL|CM|CN|CO|COM|COOP|CR|CU|CV|CW|CX|CY|CZ|DE|DJ|DK|DM|DO|DZ|EC|EDU|EE|EG|EH|ER|ES|ET|EU|FI|FJ|FK|FM|FO|FR|GA|GB|GD|GE|GF|GG|GH|GI|GL|GM|GN|GOV|GP|GQ|GR|GS|GT|GU|GW|GY|HK|HM|HN|HR|HT|HU|ID|IE|IL|IM|IN|INFO|INT|IO|IQ|IR|IS|IT|JE|JM|JO|JOBS|JP|KE|KG|KH|KI|KM|KN|KP|KR|KW|KY|KZ|LA|LB|LC|LI|LK|LR|LS|LT|LU|LV|LY|MA|MC|MD|ME|MF|MG|MH|MIL|MK|ML|MM|MN|MO|MOBI|MP|MQ|MR|MS|MT|MU|MUSEUM|MV|MW|MX|MY|MZ|NA|NAME|NC|NE|NET|NF|NG|NI|NL|NO|NP|NR|NU|NZ|OM|ORG|PA|PE|PF|PG|PH|PK|PL|PM|PN|PR|PRO|PS|PT|PW|PY|QA|RE|RO|RS|RU|RW|SA|SB|SC|SD|SE|SG|SH|SI|SJ|SK|SL|SM|SN|SO|SR|ST|SU|SV|SX|SY|SZ|TC|TD|TEL|TF|TG|TH|TJ|TK|TL|TM|TN|TO|TP|TR|TRAVEL|TT|TV|TW|TZ|UA|UG|UK|UM|US|UY|UZ|VA|VC|VE|VG|VI|VN|VU|WF|WS|XXX|YE|YT|ZA|ZM|ZW|INT|ARPA)(?:/|$)', re.I)
re_strip_chars = re.compile(r'[\'\"\\()|@+]')
re_chomp_chars = re.compile(r'[?#;&$%].*$')
re_remove_junk = re.compile(r'/\w+,\w+/|/com\.\w+(\.\w+)*/')
re_remove_comments = re.compile(r'#[^\n]*\n')
re_remove_sitemap = re.compile(r'Sitemap:[^\n]+\n', re.I)
re_reject_spammer = re.compile(r'Disallow:[^\n]*\b(?:adderall|percocet|cialis|OptionARMCalc|ChicagoExperts|ChicagoSellers|ChicagoBuyers|Win\$\d|Loan-Analysis|RealEstateTips|DreamHome)\b', re.I)
matchers = [
    # (re_allow, robot_mappings),
    # (re_acap_allow, robot_mappings),
    # (re_disallow, None),
    # (re_acap_disallow, None),
    (re_disallow, robot_mappings),
    (re_acap_disallow, robot_mappings),
    (re_allow, None),
    (re_acap_allow, None),
    (re_useragent, None),
    (re_host, None),
    (re_ignore, None),
    (re_experimental, robot_mappings),
    (re_questionable, None),
    (re_disallow_broken, robot_mappings),
    ]

other_content = [
    (re_sitemap, None),
    ]

reject_content = [
    re.compile(r'/[-0-9_]{8,}|[-0-9_]{8,}\.\w+|Allow:|Disallow:|related-content\.g|related_content_helper\.html|[:<>=]')
    ]
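# Each matcher above pairs a robots.txt directive pattern with either
# robot_mappings (captured paths are fed to add_entry() and later merged into
# robot_mappings) or None (the line is recognized but its value is not
# counted). For example, "Disallow: /admin/" is captured by re_disallow and
# the path "/admin/" is tallied; "User-agent: *" only marks the line as
# matched.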
def initialize_mapcounts(mapcount):
    for name in ['all', 'words', 'files', 'directories', 'extensions']:
        mapcount[name] = {}

def update_entry_count(mapcount, path):
    if not mapcount.has_key(path):
        mapcount[path] = 1
    else:
        mapcount[path] += 1

def normalize_entry(entry):
    if entry.startswith('http://') or entry.startswith('https://'):
        # TODO: could parse out host name values
        splitted = urlparse.urlsplit(entry)
        entry = splitted.path
    entry = urllib2.unquote(entry)
    entry = re_strip_chars.sub('', entry)
    entry = re_chomp_chars.sub('', entry)
    # TODO: could parse out host name values
    entry = re_domain.sub('/', entry)
    entry = re_remove_junk.sub('/', entry)
    # assume garbage
    for rej in reject_content:
        if rej.search(entry):
            return None
    return entry
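# Example: normalize_entry('/search?q=foo') chomps everything from the '?'
# and returns '/search'; normalize_entry('http://example.com/a%20dir/')
# reduces the URL to its unquoted path '/a dir/'.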
def add_entry(entry, mapping):
    path = normalize_entry(entry)
    if not path:
        return # useless

    fields = path.split('/')
    dirpath = '/'
    update_entry_count(mapping['directories'], dirpath)
    if path.startswith('/'):
        fields = fields[1:]

    ###print(path, fields)
    add_dir = True
    for i in range(0, len(fields)):
        field = fields[i]
        add_this_one = True
        if '..' in field:
            add_this_one = add_dir = False
        elif '.' in field:
            # assume file (with path_info maybe)
            add_dir = False
            if field.endswith('*'):
                field = field[0:-1]
            if field.startswith('.'):
                filename = field
                ext = ''
                ext2 = ''
            else:
                ndx = field.rindex('.')
                filename = field[0:ndx]
                ext = field[ndx:]
                # look for other
                ndx2 = field.index('.')
                ext2 = field[ndx2:]
            if '*' not in field and len(field) < 20 and not filename.endswith('-') and not filename.startswith('-') and not filename.endswith('_'):
                update_entry_count(mapping['files'], field)
            if len(ext) > 1 and '*' not in ext:
                update_entry_count(mapping['extensions'], ext)
            if ext2 != ext:
                if len(ext2) > 1 and '*' not in ext2:
                    update_entry_count(mapping['extensions'], ext2)
        else:
            this_field = field
            if i < (len(fields) - 1):
                dirpath += field + '/'
                this_field += '/'
            else:
                dirpath += field
            if not field or ('*' in field) or ('..' in field) or len(field) > 16 or field.endswith('-') or field.startswith('-') or (',' in field):
                add_this_one = add_dir = False
            if add_dir:
                update_entry_count(mapping['directories'], dirpath)
            elif add_this_one:
                update_entry_count(mapping['directories'], this_field)

    # TODO: support ALL
    # if add_this_one:
    #     update_entry_count(mapping['all'], field)
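# Example: add_entry('/images/icons/logo.png', site_mapping) bumps the
# 'directories' counts for '/', '/images/' and '/images/icons/', the 'files'
# count for 'logo.png', and the 'extensions' count for '.png'.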
def process(files):
    start_time = time.time()
    count = 0
    duplicates = 0
    skipcount = 0
    for filename in files:
        try:
            tcount, tdup, tskip = process_file(filename)
            count += tcount
            duplicates += tdup
            skipcount += tskip
        except Exception, e:
            import traceback
            sys.stdout.write('ERROR: processing %s\n%s' % (filename, traceback.format_exc(e)))

    sys.stderr.write('\n***processed %d records in %d seconds and ignored %d duplicates and %d skips\n' % (count, int(time.time()-start_time), duplicates, skipcount))
def process_response(host, body, content_type):
    site_mapping = {}
    initialize_mapcounts(site_mapping)

    charset = ''
    if content_type and 'charset=' in content_type:
        charset = content_type[content_type.index('charset=')+8:]
    elif ord(body[0]) > 127:
        charset = 'UTF-8'
    elif ord(body[0]) == 0:
        charset = 'UTF-16'
    if charset:
        try:
            body = body.decode(charset)
            body = body.encode('ascii', 'ignore')
        except Exception, e:
            # sys.stderr.write('ignoring: %s' % (e))
            pass

    comments = ''
    for line in body.splitlines():
        line = line.replace('\xa0', ' ')
        line = line.strip()
        if not line:
            continue
        matched = False
        ndx = line.find('#')
        if 0 == ndx:
            m = re_comment.search(line)
            if m:
                matched = True
                # comments += line + '\n'
        elif comments:
            print(comments)
            comments = ''
        elif ndx > 0:
            line = line[0:ndx].strip()

        for matcher in matchers:
            m = matcher[0].search(line)
            if m:
                matched = True
                mapping = matcher[1]
                if not mapping is None:
                    entry = m.group(1)
                    # TODO: some entries have spaces to list multiple paths ... is this valid?
                    for subentry in entry.split(' '):
                        add_entry(subentry, site_mapping)
                break

        for matcher in other_content:
            m = matcher[0].search(line)
            if m:
                matched = True
                mapping = matcher[1]
                if not mapping is None:
                    entry = m.group(1)
                break

        if not matched and content_type and 'text/plain' in content_type:
            try:
                # sys.stderr.write('unmatched: %s\n' % (line))
                pass
            except UnicodeEncodeError:
                pass

    merge_mappings(site_mapping)
def merge_mappings(site_mapping):
    found_words = {}
    for name in ['all', 'files', 'directories', 'extensions']:
        entries = site_mapping[name].keys()
        entries.sort()
        entries.reverse()
        for entry in entries:
            include = True
            if '/' == entry:
                include = False
            if site_mapping[name][entry] <= 2:
                if entry.endswith('/'):
                    pos = entry.rfind('/', 0, -1)
                else:
                    pos = entry.rfind('/')
                if pos > -1:
                    this_entry = entry[pos+1:]
                    parent_entry = entry[0:pos+1]
                    if len(this_entry) > 0 and this_entry != parent_entry and site_mapping['directories'].has_key(parent_entry):
                        parent_count = site_mapping['directories'][parent_entry]
                        # print('*** [%s]: %d (%s)' % (parent_entry, parent_count, entry))
                        if parent_count > 256: # TODO: adjust ?
                            include = False
            if include:
                words = re_words_splitter.split(entry)
                for w in words:
                    if w:
                        if '.' in w and not w.startswith('.'):
                            p1 = 0
                            p0 = w.find('.')
                            while p0 > -1:
                                w2 = w[p1:p0]
                                found_words[w2] = True
                                p1 = p0
                                p0 += 1
                                p0 = w.find('.', p0)
                            w2 = w[p1:]
                            found_words[w2] = True
                        else:
                            found_words[w] = True
                if not robot_mappings[name].has_key(entry):
                    robot_mappings[name][entry] = 1
                else:
                    robot_mappings[name][entry] += 1

    # process words separately
    for word in site_mapping['words'].keys():
        found_words[word] = True

    name = 'words'
    for word in found_words.keys():
        if not robot_mappings[name].has_key(word):
            robot_mappings[name][word] = 1
        else:
            robot_mappings[name][word] += 1
def normalized_robots(body):
    from StringIO import StringIO
    io = StringIO()
    for line in body.splitlines():
        line = line.rstrip()
        n = line.find('#')
        if n > -1:
            line = line[0:n]
        if re_reject_spammer.search(line):
            return None
        io.write(line.lower())
    response = io.getvalue()
    io.close()
    return response
def process_file(filename):
    count = 0
    duplicates = 0
    skipcount = 0
    for result in raft_parse_xml(filename):
        origin, host, hostip, url, status, datetime, request, response, method, content_type, extras = result
        if status in (200, 206,) and response and response[1]:
            if 'google.' in host or 'blogspot.com' in host:
                pass
            else:
                body = response[1]
                if re_reject_spammer.search(body):
                    # print(body)
                    skipcount += 1
                    continue
                # normalize and calculate hashval
                normalized = body.lower().replace(host, '')
                normalized = re_remove_comments.sub('\n', normalized)
                # normalized = re_remove_sitemap.sub('\n', normalized)
                sha1 = hashlib.sha1()
                sha1.update(normalized)
                hashval = sha1.hexdigest()
                if not unique_hashes.has_key(hashval):
                    unique_hashes[hashval] = True
                    process_response(host, body, content_type)
                    count += 1
                else:
                    duplicates += 1
        if not status_counts.has_key(status):
            status_counts[status] = 1
        else:
            status_counts[status] += 1

    return count, duplicates, skipcount
def print_mapcount(fhandle, cutoff, mapcount):
    count_mapping = {}
    for entry in mapcount.keys():
        count = mapcount[entry]
        if count >= cutoff:
            if not count_mapping.has_key(count):
                count_mapping[count] = [entry]
            else:
                count_mapping[count].append(entry)

    keys = count_mapping.keys()
    keys.sort(key=int)
    keys.reverse()
    for value in keys:
        values = count_mapping[value]
        values.sort()
        fhandle.write('%d=>\n\t%s\n' % (value, '\n\t'.join(values)))
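# print_mapcount emits one block per count value, highest counts first, with
# the matching entries tab-indented beneath it, e.g. (illustrative):
#   42=>
#       /admin/
#       /cgi-bin/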
def print_ordered(mapping_name, mapping):
    groupings = [('all', 1), ('large', 2), ('medium', 3), ('small', 4)] # TODO: base on total sizes
    # print('--%s--' % mapping_name)
    for grouping in groupings:
        label, cutoff = grouping
        for name in ['words', 'files', 'directories', 'extensions']:
            # print(' -%s-%s-' % (label, name))
            fhandle = open('raft-%s-%s.dat' % (label, name), 'w')
            print_mapcount(fhandle, cutoff, mapping[name])
for matcher in matchers:
    mapping = matcher[1]
    if mapping is not None:
        initialize_mapcounts(mapping)

files = []
for arg in sys.argv[1:]:
    if arg.startswith('-'):
        pass
    else:
        files.append(arg)

process(files)

for status in status_counts.keys():
    print('status %s: %d' % (status, status_counts[status]))

print_ordered('mappings', robot_mappings)
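
Usage note: the script expects one or more RAFT capture files (as read by raft_parse_xml) on the command line; arguments beginning with '-' are ignored. A typical run, with illustrative file names, looks like:

    python process-results.py capture-1.xml capture-2.xml

Per-status response counts are printed to stdout, a processing summary is written to stderr, and the aggregated wordlists go to raft-<label>-<name>.dat files in the current directory, where <label> is one of all, large, medium or small and <name> is one of words, files, directories or extensions.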