PageRenderTime 54ms CodeModel.GetById 24ms RepoModel.GetById 1ms app.codeStats 0ms

/modules/head.py

https://github.com/jaymzcd/phenny
Python | 189 lines | 160 code | 18 blank | 11 comment | 41 complexity | 3be259a0d78bbf3828d93ce94ef56906 MD5 | raw file
  1. #!/usr/bin/env python
  2. """
  3. head.py - Phenny HTTP Metadata Utilities
  4. Copyright 2008, Sean B. Palmer, inamidst.com
  5. Licensed under the Eiffel Forum License 2.
  6. http://inamidst.com/phenny/
  7. """
  8. import re, urllib, urllib2, httplib, urlparse, time
  9. from htmlentitydefs import name2codepoint
  10. import web
  11. from tools import deprecated
  12. def head(phenny, input):
  13. """Provide HTTP HEAD information."""
  14. uri = input.group(2)
  15. uri = (uri or '').encode('utf-8')
  16. if ' ' in uri:
  17. uri, header = uri.rsplit(' ', 1)
  18. else: uri, header = uri, None
  19. if not uri and hasattr(phenny, 'last_seen_uri'):
  20. try: uri = phenny.last_seen_uri[input.sender]
  21. except KeyError: return phenny.say('?')
  22. if not uri.startswith('htt'):
  23. uri = 'http://' + uri
  24. # uri = uri.replace('#!', '?_escaped_fragment_=')
  25. try: info = web.head(uri)
  26. except IOError: return phenny.say("Can't connect to %s" % uri)
  27. except httplib.InvalidURL: return phenny.say("Not a valid URI, sorry.")
  28. if not isinstance(info, list):
  29. try: info = dict(info)
  30. except TypeError:
  31. return phenny.reply('Try .head http://example.org/ [optional header]')
  32. info['Status'] = '200'
  33. else:
  34. newInfo = dict(info[0])
  35. newInfo['Status'] = str(info[1])
  36. info = newInfo
  37. if header is None:
  38. data = []
  39. if info.has_key('Status'):
  40. data.append(info['Status'])
  41. if info.has_key('content-type'):
  42. data.append(info['content-type'].replace('; charset=', ', '))
  43. if info.has_key('last-modified'):
  44. modified = info['last-modified']
  45. modified = time.strptime(modified, '%a, %d %b %Y %H:%M:%S %Z')
  46. data.append(time.strftime('%Y-%m-%d %H:%M:%S UTC', modified))
  47. if info.has_key('content-length'):
  48. data.append(info['content-length'] + ' bytes')
  49. phenny.reply(', '.join(data))
  50. else:
  51. headerlower = header.lower()
  52. if info.has_key(headerlower):
  53. phenny.say(header + ': ' + info.get(headerlower))
  54. else:
  55. msg = 'There was no %s header in the response.' % header
  56. phenny.say(msg)
  57. head.commands = ['head']
  58. head.example = '.head http://www.w3.org/'
  59. r_title = re.compile(r'(?ims)<title[^>]*>(.*?)</title\s*>')
  60. r_entity = re.compile(r'&[A-Za-z0-9#]+;')
  61. @deprecated
  62. def f_title(self, origin, match, args):
  63. """.title <URI> - Return the title of URI."""
  64. uri = match.group(2)
  65. uri = (uri or '').encode('utf-8')
  66. if not uri and hasattr(self, 'last_seen_uri'):
  67. uri = self.last_seen_uri.get(origin.sender)
  68. if not uri:
  69. return self.msg(origin.sender, 'I need a URI to give the title of...')
  70. if not ':' in uri:
  71. uri = 'http://' + uri
  72. uri = uri.replace('#!', '?_escaped_fragment_=')
  73. localhost = [
  74. 'http://localhost/', 'http://localhost:80/',
  75. 'http://localhost:8080/', 'http://127.0.0.1/',
  76. 'http://127.0.0.1:80/', 'http://127.0.0.1:8080/',
  77. 'https://localhost/', 'https://localhost:80/',
  78. 'https://localhost:8080/', 'https://127.0.0.1/',
  79. 'https://127.0.0.1:80/', 'https://127.0.0.1:8080/',
  80. ]
  81. for s in localhost:
  82. if uri.startswith(s):
  83. return phenny.reply('Sorry, access forbidden.')
  84. try:
  85. redirects = 0
  86. while True:
  87. headers = {
  88. 'Accept': 'text/html',
  89. 'User-Agent': 'Mozilla/5.0 (Phenny)'
  90. }
  91. req = urllib2.Request(uri, headers=headers)
  92. u = urllib2.urlopen(req)
  93. info = u.info()
  94. u.close()
  95. # info = web.head(uri)
  96. if not isinstance(info, list):
  97. status = '200'
  98. else:
  99. status = str(info[1])
  100. info = info[0]
  101. if status.startswith('3'):
  102. uri = urlparse.urljoin(uri, info['Location'])
  103. else: break
  104. redirects += 1
  105. if redirects >= 25:
  106. self.msg(origin.sender, origin.nick + ": Too many redirects")
  107. return
  108. try: mtype = info['content-type']
  109. except:
  110. err = ": Couldn't get the Content-Type, sorry"
  111. return self.msg(origin.sender, origin.nick + err)
  112. if not (('/html' in mtype) or ('/xhtml' in mtype)):
  113. self.msg(origin.sender, origin.nick + ": Document isn't HTML")
  114. return
  115. u = urllib2.urlopen(req)
  116. bytes = u.read(262144)
  117. u.close()
  118. except IOError:
  119. self.msg(origin.sender, "Can't connect to %s" % uri)
  120. return
  121. m = r_title.search(bytes)
  122. if m:
  123. title = m.group(1)
  124. title = title.strip()
  125. title = title.replace('\t', ' ')
  126. title = title.replace('\r', ' ')
  127. title = title.replace('\n', ' ')
  128. while ' ' in title:
  129. title = title.replace(' ', ' ')
  130. if len(title) > 200:
  131. title = title[:200] + '[...]'
  132. def e(m):
  133. entity = m.group(0)
  134. if entity.startswith('&#x'):
  135. cp = int(entity[3:-1], 16)
  136. return unichr(cp).encode('utf-8')
  137. elif entity.startswith('&#'):
  138. cp = int(entity[2:-1])
  139. return unichr(cp).encode('utf-8')
  140. else:
  141. char = name2codepoint[entity[1:-1]]
  142. return unichr(char).encode('utf-8')
  143. title = r_entity.sub(e, title)
  144. if title:
  145. try: title.decode('utf-8')
  146. except:
  147. try: title = title.decode('iso-8859-1').encode('utf-8')
  148. except: title = title.decode('cp1252').encode('utf-8')
  149. else: pass
  150. else: title = '[The title is empty.]'
  151. title = title.replace('\n', '')
  152. title = title.replace('\r', '')
  153. self.msg(origin.sender, origin.nick + ': ' + title)
  154. else: self.msg(origin.sender, origin.nick + ': No title found')
  155. f_title.commands = ['title']
  156. def noteuri(phenny, input):
  157. uri = input.group(1).encode('utf-8')
  158. if not hasattr(phenny.bot, 'last_seen_uri'):
  159. phenny.bot.last_seen_uri = {}
  160. phenny.bot.last_seen_uri[input.sender] = uri
  161. noteuri.rule = r'.*(http[s]?://[^<> "\x01]+)[,.]?'
  162. noteuri.priority = 'low'
  163. if __name__ == '__main__':
  164. print __doc__.strip()