
/modules/twitter/pages.py

https://github.com/laurentb/weboob
Python | 207 lines | 154 code | 36 blank | 17 comment
# -*- coding: utf-8 -*-

# Copyright(C) 2014 Bezleputh
#
# This file is part of a weboob module.
#
# This weboob module is free software: you can redistribute it and/or modify
# it under the terms of the GNU Affero General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This weboob module is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU Affero General Public License for more details.
#
# You should have received a copy of the GNU Affero General Public License
# along with this weboob module. If not, see <http://www.gnu.org/licenses/>.

from datetime import datetime
from io import StringIO

import lxml.html as html

from weboob.tools.date import DATE_TRANSLATE_FR
from weboob.tools.json import json
from weboob.tools.compat import urlencode
from weboob.browser.pages import HTMLPage, JsonPage, FormNotFound, pagination, LoggedPage
from weboob.browser.elements import ListElement, ItemElement, method
from weboob.browser.filters.standard import CleanText, Format, Regexp, Env, DateTime, Filter
from weboob.browser.filters.html import Link, Attr
from weboob.capabilities.messages import Thread, Message
from weboob.capabilities.base import BaseObject
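

# Filter converting the Unix-timestamp strings found in Twitter's markup
# (the "data-time" attribute on tweets) into datetime objects; it backs the
# obj_date attributes used further down.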
class DatetimeFromTimestamp(Filter):
    def filter(self, el):
        return datetime.fromtimestamp(float(el))
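

# Base page for Twitter's JSON endpoints that wrap server-rendered HTML.
# Two payload shapes are handled by the constructor below (field names come
# from the code; the values here are only a schematic illustration):
#
#     {"module_html": "<ol>...</ol>"}
#     {"has_more_items": true, "min_position": "...", "items_html": "<li>...</li>"}
#
# Either way, self.doc ends up as an lxml tree (or an empty placeholder
# element when no HTML is returned), so the HTML filters used by the other
# pages also work on these JSON responses.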
class TwitterJsonHTMLPage(JsonPage):
    ENCODING = None
    has_next = None
    scroll_cursor = None

    def __init__(self, browser, response, *args, **kwargs):
        super(TwitterJsonHTMLPage, self).__init__(browser, response, *args, **kwargs)
        self.encoding = self.ENCODING or response.encoding
        parser = html.HTMLParser(encoding=self.encoding)
        if 'module_html' in self.doc:
            self.doc = html.parse(StringIO(self.doc['module_html']), parser)
        else:
            self.has_next = self.doc['has_more_items']
            self.min_position = None
            if 'min_position' in self.doc:
                self.min_position = self.doc['min_position']
            if self.doc['items_html']:
                el = html.parse(StringIO(self.doc['items_html']), parser)
                self.doc = el if el.getroot() is not None else html.Element('brinbrin')
            else:
                self.doc = html.Element('brinbrin')
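

# Login page: login() fills the https://twitter.com/sessions form and returns
# the CSRF authenticity_token (falling back to the hidden #authenticity_token
# input when the form is missing); "logged" reports True once that form can no
# longer be found, and get_me() extracts the connected user's screen name from
# the profile link.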
class LoginPage(HTMLPage):
    def login(self, login, passwd):
        try:
            form = self.get_form(xpath='//form[@action="https://twitter.com/sessions"]')
            form['session[username_or_email]'] = login
            form['session[password]'] = passwd
            form.submit()
            return form['authenticity_token']
        except FormNotFound:
            return CleanText('(//input[@id="authenticity_token"])[1]/@value')(self.doc)

    @property
    def logged(self):
        try:
            self.get_form(xpath='//form[@action="https://twitter.com/sessions"]')
            return False
        except FormNotFound:
            return True

    def get_me(self):
        return Regexp(Link('//a[@data-nav="view_profile"]'), '/(.+)')(self.doc)
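

# Permalink page of a single tweet: get_thread builds a Thread whose id is
# "<user>#<tweet id>" taken from the URL parameters (Env('user') / Env('_id')),
# and iter_comments walks the reply stream, dropping entries whose status link
# yields no id (validate). The replace= lists strip the stray spaces that text
# extraction leaves around '@', '#' and 'http://'.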
class ThreadPage(HTMLPage):
    @method
    class get_thread(ItemElement):
        klass = Thread

        obj_id = Format('%s#%s', Env('user'), Env('_id'))
        obj_title = Format('%s \n\t %s',
                           CleanText('//div[has-class("permalink-inner permalink-tweet-container")]/div/div/div/a',
                                     replace=[('@ ', '@'), ('# ', '#'), ('http:// ', 'http://')]),
                           CleanText('//div[has-class("permalink-inner permalink-tweet-container")]/div/div/p',
                                     replace=[('@ ', '@'), ('# ', '#'), ('http:// ', 'http://')]))
        obj_date = DateTime(Regexp(CleanText('//div[has-class("permalink-inner permalink-tweet-container")]/div/div/div[@class="client-and-actions"]/span/span'),
                                   '(\d+:\d+).+- (.+\d{4})',
                                   '\\2 \\1'),
                            translations=DATE_TRANSLATE_FR)

    @method
    class iter_comments(ListElement):
        item_xpath = '//ol[@id="stream-items-id"]/li/ol/div/li/div'

        class item(ItemElement):
            klass = Message

            obj_id = Regexp(Link('./div/div/small/a', default=''), '/.+/status/(.+)', default=None)
            obj_title = Regexp(CleanText('./div[@class="content"]/div/p[has-class("tweet-text")]',
                                         replace=[('@ ', '@'), ('# ', '#'), ('http:// ', 'http://')]),
                               '(.{50}|.+).+')
            obj_content = CleanText('./div[@class="content"]/div/p[has-class("tweet-text")]',
                                    replace=[('@ ', '@'), ('# ', '#'), ('http:// ', 'http://')])
            obj_sender = Regexp(Link('./div/div/small/a', default=''), '/(.+)/status/.+', default=None)
            obj_date = DatetimeFromTimestamp(Attr('./div/div[@class="stream-item-header"]/small/a/span | ./div/div[@class="ProfileTweet-authorDetails"]/span/a/span', 'data-time'))

            def validate(self, obj):
                return obj.id is not None
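

# Search landing page: get_trends_token() reads the JSON embedded in the
# #init-data input and returns its trendsCacheKey (presumably passed along by
# the browser when requesting the trends endpoint), and get_min_position()
# returns the stream's pagination cursor. TrendsPage below lists the trending
# topics as BaseObject items keyed by their data-trend-name attribute.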
class SearchPage(HTMLPage):
    def get_trends_token(self):
        json_data = CleanText('//input[@id="init-data"]/@value')(self.doc)
        return json.loads(json_data)['trendsCacheKey']

    def get_min_position(self):
        return CleanText('//div[@class="stream-container "]/@data-min-position')(self.doc)


class TrendsPage(TwitterJsonHTMLPage):
    @method
    class get_trendy_subjects(ListElement):
        item_xpath = '//li[@class="trend-item js-trend-item "]'

        class item(ItemElement):
            klass = BaseObject

            obj_id = Attr('.', 'data-trend-name')
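

# Shared list element for the timeline pages below: each tweet <div> becomes a
# Thread, and get_last_id() returns the data-tweet-id of the last tweet on the
# page so that next_page() can request the following slice (max_position= for
# public timelines, max_id= for the home timeline, and the search query
# parameters for SearchTimelinePage).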
class TimelineListElement(ListElement):
    item_xpath = '//*[@data-item-type="tweet"]/div[@data-tweet-id]'
    ignore_duplicate = True

    def get_last_id(self):
        _el = self.page.doc.xpath('//*[@data-item-type="tweet"]/div')[-1]
        return CleanText('./@data-tweet-id')(_el)

    class item(ItemElement):
        klass = Thread

        obj_id = Format('%s#%s', CleanText('./@data-screen-name'), CleanText('./@data-tweet-id'))
        obj_title = Format('%s \n\t %s',
                           CleanText('./div/div[@class="stream-item-header"]/a|./div/div[@class="ProfileTweet-authorDetails"]/a',
                                     replace=[('@ ', '@'), ('# ', '#'), ('http:// ', 'http://')]),
                           CleanText('./div/div/p',
                                     replace=[('@ ', '@'), ('# ', '#'), ('http:// ', 'http://')]))
        obj_date = DatetimeFromTimestamp(Attr('./div/div[@class="stream-item-header"]/small/a/span | ./div/div[@class="ProfileTweet-authorDetails"]/span/a/span', 'data-time'))


class TimelinePage(TwitterJsonHTMLPage):
    @pagination
    @method
    class iter_threads(TimelineListElement):
        def next_page(self):
            if self.page.has_next:
                return u'%s?max_position=%s' % (self.page.url.split('?')[0], self.get_last_id())


class HomeTimelinePage(TwitterJsonHTMLPage, LoggedPage):
    @pagination
    @method
    class iter_threads(TimelineListElement):
        def next_page(self):
            if self.page.has_next:
                return u'%s?max_id=%s' % (self.page.url.split('?')[0], self.get_last_id())


class SearchTimelinePage(TwitterJsonHTMLPage):
    @pagination
    @method
    class iter_threads(TimelineListElement):
        def next_page(self):
            params = self.env['params']
            params['max_position'] = self.page.min_position
            if 'min_position' in self.env and not params['max_position']:
                params['max_position'] = self.env['min_position']

            if self.page.has_next:
                return u'%s?%s' % (self.page.url.split('?')[0], urlencode(params))


class LoginErrorPage(HTMLPage):
    pass


class Tweet(JsonPage):
    pass
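

# ---------------------------------------------------------------------------
# Illustrative sketch, not part of the original module: a minimal weboob
# PagesBrowser showing how a page such as ThreadPage is meant to be wired to a
# URL pattern and used. The class name, BASEURL, URL regexp and method name
# below are assumptions chosen for the example; the module's actual browser
# lives in modules/twitter/browser.py.
from weboob.browser import PagesBrowser, URL


class ExampleTwitterBrowser(PagesBrowser):
    BASEURL = 'https://twitter.com'

    # Hypothetical URL pattern; the named groups are intended to feed
    # Env('user') and Env('_id') in ThreadPage.get_thread through the
    # matched page parameters.
    thread = URL(r'/(?P<user>[^/]+)/status/(?P<_id>\d+)', ThreadPage)

    def iter_thread_comments(self, user, _id):
        # Fetch the permalink page of one tweet, then parse its replies
        # with ThreadPage.iter_comments defined above.
        self.thread.go(user=user, _id=_id)
        return self.page.iter_comments()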