
/src/libraries/sehistory/__init__.py

https://github.com/theduke/sehistory
Python | 238 lines | 210 code | 25 blank | 3 comment | 8 complexity | aa869ea7dbee8d72358a305ab94d7afe MD5
Possible License(s): Apache-2.0
from frontend.models import *
from libraries.BeautifulSoup import BeautifulSoup
import logging
import re
import urllib2
import hashlib
import datetime
import sys


class Crawler(object):

    def crawl(self, domain):
        # record this crawl attempt with a timestamp
        crawl = Crawl(
            domain=domain,
            date=datetime.datetime.now()
        )
        # send a browser-like User-Agent so the search engines serve
        # their regular homepage rather than a bot-specific variant
        request = urllib2.Request(url=domain.url, headers={
            'User-Agent': "Mozilla/5.0 (Windows; U; Windows NT 6.0; de; rv:1.9.1.3) Gecko/20090824 Firefox/3.5.3 (.NET CLR 3.5.30729)",
        })
        try:
            url = urllib2.urlopen(request)
        except:
            crawl.success = False
            return crawl
        html = url.read()
        info = url.info()
        # decode the body with the charset announced in the Content-Type header
        ignore, charset = info['Content-Type'].split('charset=')
        html = html.decode(charset)
        crawl.body = html
        crawl.info = info
        crawl.charset = charset
        # mark success explicitly so callers can check crawl.success
        crawl.success = True
        crawl.put()
        return crawl
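    # Example of the Content-Type split above (illustrative values):
    #   'text/html; charset=ISO-8859-1'.split('charset=')
    #   -> ['text/html; ', 'ISO-8859-1']
    # Note this unpacking raises ValueError when the header carries no
    # charset parameter.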
    def crawlFile(self, url, type, description, domain):
        try:
            response = urllib2.urlopen(url)
        except:
            return False
        data = response.read()
        # deduplicate downloads by their SHA-256 content hash
        hash = hashlib.sha256(data).hexdigest()
        file = File.getOneByHash(hash)
        if not file:
            # file is not yet in database, so insert it
            filename = re.search(r'[a-zA-Z0-9\.\-_]+$', url).group(0)
            extension = re.search(r'(?<=\.)[\S]{3}$', filename).group(0)
            # count existing files with the same name to number this version
            q = File.all()
            q.filter('filename =', filename).filter('extension =', extension)
            version = q.count() + 1
            file = File(hash=hash,
                        data=data,
                        filename=filename,
                        extension=extension,
                        description=description,
                        type=type)
            file.domains.append(domain.key())
            file.computeUniqueFilename()
            file.put()
        else:
            # file does already exist, check if domain is already in domains property
            if not domain.key() in file.domains:
                # domain is not yet in list, add it
                file.domains.append(domain.key())
                file.put()
        return file
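
# Illustrative usage of Crawler (assumed wiring; the actual cron handlers
# live elsewhere in the repo, and the file URL is a made-up sample):
#
#   crawler = Crawler()
#   crawl = crawler.crawl(domain)   # fetch domain.url, persist a Crawl entity
#   if crawl.success:
#       crawler.crawlFile('http://example.com/logo.gif', 'logo', '', domain)
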
class Bing(object):

    def parseForBackground(self, crawl):
        # the background image URL is embedded in an inline script as
        # g_img={url:'...',id:...}; extract the quoted path between the markers
        match = re.search(r'(?<=\;g\_img\=\{url\:\').*(?=\'\,id)', crawl.body)
        if not match:
            return False
        # strip the JavaScript escaping and make the path absolute
        url = match.group(0).replace('\\', '')
        url = crawl.domain.url + url
        return url
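    # Illustrative input for parseForBackground() (a sample, not captured
    # from bing.com): given a body containing
    #   ;g_img={url:'\/az\/hprichbg\/rb\/Sample.jpg',id:'bgDiv'
    # the lookbehind/lookahead pair isolates \/az\/hprichbg\/rb\/Sample.jpg,
    # and stripping the backslashes yields /az/hprichbg/rb/Sample.jpg.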
    def crawlAllDomains(self):
        se = self.getSearchEngineObject()
        if not se:
            raise Exception('Could not find Bing SearchEngine')
        domains = db.Query(Domain).filter('searchEngine =', se).fetch(99999999)
        for domain in domains:
            c = Crawler()
            # crawl the domain, retrieve the crawl object with html response
            crawl = c.crawl(domain)
            # if the crawl failed, skip to next domain
            if not crawl.success:
                continue
            # extract the background image url from the html
            picUrl = self.parseForBackground(crawl)
            # if no background image can be found, continue
            if not picUrl:
                continue
            file = c.crawlFile(picUrl, 'bing-background', '', domain)
    def getSearchEngineObject(self):
        result = db.Query(SearchEngine).filter('name =', 'Bing').fetch(1)
        # if the SearchEngine object does not exist yet in the db, create it
        if not len(result):
            bing = SearchEngine(name='Bing')
            bing.put()
            domain = Domain(url='http://www.bing.com', searchEngine=bing)
            domain.put()
        else:
            bing = result[0]
        return bing
class Google(object):

    def parseForLogo(self, crawl):
        info = self.extractLogo(crawl.body)
        # return False if no logo could be extracted
        if not info or not info[0]:
            return False
        pic = info[0]
        url = crawl.domain.url + pic
        return {'pic': pic, 'description': info[1], 'url': url}
    def extractLogo(self, html):
        soup = BeautifulSoup(html)
        img = soup.find('img', id='logo')
        div = soup.find('div', id='logo')
        description = ''
        if img:
            pic = img['src']
            # the title attribute carries the doodle description, if present
            if img.get('title'):
                description = img['title']
        elif div:
            # logo delivered as a CSS background image: pull the url(...) value
            pic = re.search(r'(?<=url\()\S+(?=\))', str(div)).group(0)
        else:
            msg = 'Cronjob: Logos: could not find IMG or DIV tag with logo id!'
            logging.error(msg)
            print msg
            return False
        #pic = pic.decode(charset)
        return [pic, description]
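    # Illustrative markup shapes extractLogo() handles (samples, not captured
    # from google.com):
    #
    #   <img id="logo" src="/logos/doodle.gif" title="Doodle title">
    #       -> pic = '/logos/doodle.gif', description = 'Doodle title'
    #
    #   <div id="logo" style="background:url(/images/logo.png) no-repeat">
    #       -> the url(...) regex yields pic = '/images/logo.png'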
    def crawlAllDomains(self):
        se = self.getSearchEngineObject()
        if not se:
            raise Exception('Could not find Google SearchEngine')
        domains = db.Query(Domain).filter('searchEngine =', se).fetch(99999999)
        for domain in domains:
            try:
                c = Crawler()
                # crawl the domain, retrieve the crawl object with html response
                crawl = c.crawl(domain)
                # if the crawl failed, skip to next domain
                if not crawl.success:
                    continue
                # extract url and description for logo from html
                info = self.parseForLogo(crawl)
                # if no logo can be found, continue
                if not info:
                    continue
                file = c.crawlFile(info['url'], 'google-logo', info['description'], domain)
            except Exception as e:
                logging.error("Error while crawling Google domain " + domain.url + ": " + str(e))
    def crawlForDomains(self):
        # get google searchengine
        se = self.getSearchEngineObject()
        # fetch html from google domain list
        url = urllib2.urlopen('http://www.google.com/language_tools')
        html = url.read()
        info = url.info()
        ignore, charset = info['Content-Type'].split('charset=')
        # decode with proper charset
        html = html.decode(charset)
        # parse html for all domain entries of the form "www.google.XX<br>Country</a>"
        domains = re.findall(r'www\.google\.[a-z]+<br>\S+(?=</a>)', html)
        for d in domains:
            url, country = d.split('<br>')
            url = 'http://' + url
            # only store the domain if it is not in the datastore yet
            q = Domain.all()
            q.filter('url =', url).order('url')
            r = q.fetch(1)
            if len(r) == 0:
                m = Domain(url=url, country=unicode(country), searchEngine=se)
                m.put()
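    # Illustrative excerpt of the language_tools markup crawlForDomains()
    # parses (a sample, not captured from google.com):
    #   <a href="http://www.google.de/">www.google.de<br>Germany</a>
    # re.findall() yields 'www.google.de<br>Germany'; split('<br>') then gives
    # url = 'http://www.google.de' and country = 'Germany'.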
    def getSearchEngineObject(self):
        result = db.Query(SearchEngine).filter('name =', 'Google').fetch(1)
        # if the SearchEngine object does not exist yet in the db, create it
        if not len(result):
            goog = SearchEngine(name='Google')
            goog.put()
        else:
            goog = result[0]
        return goog
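
# Minimal sketch of how a cron job might drive this module (assumed wiring;
# the real handlers live elsewhere in the repo):
#
#   google = Google()
#   google.crawlForDomains()    # discover/refresh the list of Google domains
#   google.crawlAllDomains()    # snapshot each domain and store its logo
#   Bing().crawlAllDomains()    # same for the Bing background image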