PageRenderTime 46ms CodeModel.GetById 18ms RepoModel.GetById 0ms app.codeStats 0ms

/TitleGetter2.py

https://github.com/kunwon1/Subtitle
Python | 203 lines | 172 code | 15 blank | 16 comment | 13 complexity | e193904baa0e39b78ae1d09b7bfe0cad MD5 | raw file
  1. # Copyright (c) 2010 David Moore.
  2. # See LICENSE for details.
  3. from twisted.internet import reactor
  4. from twisted.web.client import HTTPClientFactory, _parse, HTTPPageGetter
  5. from twisted.python.util import println
  6. from twisted.python.failure import Failure
  7. from BeautifulSoup import BeautifulSoup, SoupStrainer
  8. import sys, re, string, htmlentitydefs
  9. ttags = SoupStrainer('title')
  10. entityPattern = re.compile("&(\w+?);")
  11. decPattern = re.compile("&#(\d+?);")
  12. whitespacePattern = re.compile("\s+")
  13. charsetPattern = re.compile(r'charset=([^\s]+)', re.I)
  14. class CustomPageGetter(HTTPPageGetter):
  15. def dataReceived(self, data):
  16. try:
  17. self.detectedDelimiter
  18. except AttributeError:
  19. if data.find("\r\n") >= 0:
  20. self.detectedDelimiter = 1
  21. else:
  22. self.detectedDelimiter = 1
  23. self.delimiter = "\n"
  24. return HTTPPageGetter.dataReceived(self, data)
  25. class Getter(HTTPClientFactory):
  26. """ A title fetcher
  27. A new class is instantiated for each title fetch.
  28. Subclasses HTTPClientFactory.
  29. Takes one mandatory argument and one optional argument.
  30. url = the url to fetch the title of (mandatory)
  31. contextFactory = SSL context factory (optional)
  32. output is handled by the callback chain, standard practice is to override
  33. the Output method, which will be called with the title. Output will never
  34. get None as an arg."""
  35. protocol = CustomPageGetter
  36. def __init__(self, url, contextFactory=None, retries=0):
  37. url = stripNoPrint(url)
  38. if retries > 0:
  39. print "Retrying: ", url
  40. else:
  41. print "Get: ", url
  42. self.retries = retries
  43. self.url = url
  44. self.charset = None
  45. scheme, host, port, path = _parse(url)
  46. HTTPClientFactory.__init__(self, url,
  47. method='GET', postdata=None, headers=None,
  48. agent='Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US;' +
  49. ' rv:1.9.2.10) Gecko/20100914 Firefox/3.6.10')
  50. if scheme == 'https':
  51. from twisted.internet import ssl
  52. if contextFactory is None:
  53. contextFactory = ssl.ClientContextFactory()
  54. reactor.connectSSL(host, port, self, contextFactory)
  55. else:
  56. reactor.connectTCP(host, port, self)
  57. self.deferred.addCallbacks(self.getCharset, self.Err)
  58. self.deferred.addCallbacks(self.getTitle, self.Err)
  59. def getCharset(self, body):
  60. """This is inserted in the callback chain before getTitle to get any
  61. data we'll need for the actual title extraction. Currently just gets
  62. and stores charset.
  63. returns the body it's passed, unmodified"""
  64. h = self.response_headers
  65. if h.has_key('content-type'):
  66. for item in h['content-type']:
  67. m = charsetPattern.search(item)
  68. if not m is None:
  69. self.charset = m.group(1)
  70. return body
  71. def getTitle(self, body):
  72. """Shouldn't be called directly. Called in the callback chain from
  73. __init__.
  74. Gets page body as arg, searches for, and normalizes title.
  75. Converts to unicode from whichever encoding is specified by the
  76. content-type response header, removes undesirable whitespace, de-escapes
  77. entities, and stuffs it all back into a bytestring
  78. returns bytestring """
  79. if body is None:
  80. return
  81. if not self.charset is None:
  82. soup = BeautifulSoup(
  83. body, fromEncoding=self.charset, parseOnlyThese=ttags)
  84. else:
  85. soup = BeautifulSoup(body, parseOnlyThese=ttags)
  86. try:
  87. soup.title.string
  88. except AttributeError:
  89. print 'Got no title from soup: ', self.url
  90. return
  91. title = soup.title.string
  92. title.extract()
  93. if not title is None:
  94. title = string.strip(title)
  95. title = descape_ents(title)
  96. title = descape_decs(title)
  97. title = normalizeWhitespace(title)
  98. title = title.encode("utf-8", "ignore")
  99. if not title is None:
  100. self.Output(title)
  101. else:
  102. print 'Got no title for url after string processing: ', self.url
  103. else:
  104. print 'Found no title string for url: ', self.url
  105. def Output(self, title):
  106. """ default Output method.
  107. Should be overridden, ancestor can be called for debugging
  108. Should be at the end of the callback chain """
  109. print title
  110. def Err(self, fail):
  111. """ error handler """
  112. print 'Error in the titlegetter for url ' + self.url + ' - ' + str(fail)
  113. def descape_dec(m):
  114. """ de-escape one html decimal entity
  115. ex: "
  116. returns string """
  117. return unichr(int(m.group(1)))
  118. def descape_ent(m, defs=htmlentitydefs.name2codepoint):
  119. """ de-escape one html named entity
  120. ex: "
  121. returns string """
  122. try:
  123. return unichr(defs[m.group(1)])
  124. except KeyError:
  125. return m.group(0) # use as is
  126. def descape_decs(string):
  127. """ de-escape all decimal entities in a string
  128. returns string """
  129. return decPattern.sub(descape_dec, string)
  130. def descape_ents(string):
  131. """ de-escape all named entities in a string
  132. returns string """
  133. return entityPattern.sub(descape_ent, string)
  134. def stripNoPrint(str):
  135. """ strips non-printable characters from a string
  136. ***this function is incomplete, it works for our
  137. current needs - obviously there are other non-printable
  138. characters besides those below ascii 32***
  139. returns string """
  140. results = ""
  141. for char in str:
  142. if not int(ord(char)) <= 31:
  143. results += char
  144. return results
  145. def normalizeWhitespace(str):
  146. """ replaces sequential whitespaces with a single space
  147. returns string """
  148. str = re.sub(whitespacePattern, ' ', str)
  149. return str
  150. if __name__ == '__main__':
  151. for n in sys.argv[1:]:
  152. Getter(n)
  153. reactor.run()