/modules/twitter/pages.py
# -*- coding: utf-8 -*-

# Copyright(C) 2014 Bezleputh
#
# This file is part of a weboob module.
#
# This weboob module is free software: you can redistribute it and/or modify
# it under the terms of the GNU Affero General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This weboob module is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU Affero General Public License for more details.
#
# You should have received a copy of the GNU Affero General Public License
# along with this weboob module. If not, see <http://www.gnu.org/licenses/>.

from datetime import datetime
from io import StringIO

import lxml.html as html

from weboob.browser.elements import ListElement, ItemElement, method
from weboob.browser.filters.html import Link, Attr
from weboob.browser.filters.standard import CleanText, Format, Regexp, Env, DateTime, Filter
from weboob.browser.pages import HTMLPage, JsonPage, FormNotFound, pagination, LoggedPage
from weboob.capabilities.base import BaseObject
from weboob.capabilities.messages import Thread, Message
from weboob.tools.compat import urlencode
from weboob.tools.date import DATE_TRANSLATE_FR
from weboob.tools.json import json
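

# Converts a Unix timestamp attribute (Twitter's data-time values) into a
# datetime object.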
class DatetimeFromTimestamp(Filter):
    def filter(self, el):
        return datetime.fromtimestamp(float(el))
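

# Twitter's timeline endpoints answer with JSON that embeds rendered HTML
# ('module_html' or 'items_html') along with pagination hints
# ('has_more_items', 'min_position'). The constructor parses that HTML and
# swaps it into self.doc so the element classes below can run XPath queries;
# an empty placeholder element is used when no HTML was returned.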
class TwitterJsonHTMLPage(JsonPage):

    ENCODING = None
    has_next = None
    scroll_cursor = None

    def __init__(self, browser, response, *args, **kwargs):
        super(TwitterJsonHTMLPage, self).__init__(browser, response, *args, **kwargs)
        self.encoding = self.ENCODING or response.encoding
        parser = html.HTMLParser(encoding=self.encoding)

        if 'module_html' in self.doc:
            self.doc = html.parse(StringIO(self.doc['module_html']), parser)
        else:
            self.has_next = self.doc['has_more_items']
            self.min_position = None
            if 'min_position' in self.doc:
                self.min_position = self.doc['min_position']

            if self.doc['items_html']:
                el = html.parse(StringIO(self.doc['items_html']), parser)
                self.doc = el if el.getroot() is not None else html.Element('brinbrin')
            else:
                self.doc = html.Element('brinbrin')
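

# Login form page: submits the credentials to the twitter.com/sessions form
# and exposes the CSRF token and the logged-in screen name.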
class LoginPage(HTMLPage):
    def login(self, login, passwd):
        try:
            form = self.get_form(xpath='//form[@action="https://twitter.com/sessions"]')
            form['session[username_or_email]'] = login
            form['session[password]'] = passwd
            form.submit()
            return form['authenticity_token']
        except FormNotFound:
            return CleanText('(//input[@id="authenticity_token"])[1]/@value')(self.doc)

    @property
    def logged(self):
        try:
            self.get_form(xpath='//form[@action="https://twitter.com/sessions"]')
            return False
        except FormNotFound:
            return True

    def get_me(self):
        return Regexp(Link('//a[@data-nav="view_profile"]'), '/(.+)')(self.doc)
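

# Permalink page of a single tweet: get_thread builds the Thread object from
# the main tweet, iter_comments yields the replies below it as Message objects.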
class ThreadPage(HTMLPage):

    @method
    class get_thread(ItemElement):
        klass = Thread

        obj_id = Format('%s#%s', Env('user'), Env('_id'))
        obj_title = Format('%s \n\t %s',
                           CleanText('//div[has-class("permalink-inner permalink-tweet-container")]/div/div/div/a',
                                     replace=[('@ ', '@'), ('# ', '#'), ('http:// ', 'http://')]),
                           CleanText('//div[has-class("permalink-inner permalink-tweet-container")]/div/div/p',
                                     replace=[('@ ', '@'), ('# ', '#'), ('http:// ', 'http://')]))
        obj_date = DateTime(Regexp(CleanText('//div[has-class("permalink-inner permalink-tweet-container")]/div/div/div[@class="client-and-actions"]/span/span'),
                                   r'(\d+:\d+).+- (.+\d{4})',
                                   r'\2 \1'), translations=DATE_TRANSLATE_FR)

    @method
    class iter_comments(ListElement):
        item_xpath = '//ol[@id="stream-items-id"]/li/ol/div/li/div'

        class item(ItemElement):
            klass = Message

            obj_id = Regexp(Link('./div/div/small/a', default=''), '/.+/status/(.+)', default=None)
            obj_title = Regexp(CleanText('./div[@class="content"]/div/p[has-class("tweet-text")]',
                                         replace=[('@ ', '@'), ('# ', '#'), ('http:// ', 'http://')]),
                               '(.{50}|.+).+')
            obj_content = CleanText('./div[@class="content"]/div/p[has-class("tweet-text")]',
                                    replace=[('@ ', '@'), ('# ', '#'), ('http:// ', 'http://')])
            obj_sender = Regexp(Link('./div/div/small/a', default=''), '/(.+)/status/.+', default=None)
            obj_date = DatetimeFromTimestamp(Attr('./div/div[@class="stream-item-header"]/small/a/span | ./div/div[@class="ProfileTweet-authorDetails"]/span/a/span', 'data-time'))

            def validate(self, obj):
                return obj.id is not None
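

# Search landing page: exposes the trends cache token from the embedded
# init-data JSON and the initial min_position cursor used for pagination.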
class SearchPage(HTMLPage):
    def get_trends_token(self):
        json_data = CleanText('//input[@id="init-data"]/@value')(self.doc)
        return json.loads(json_data)['trendsCacheKey']

    def get_min_position(self):
        return CleanText('//div[@class="stream-container "]/@data-min-position')(self.doc)
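

# Trending-topics module: each trend item becomes a BaseObject whose id is
# the trend name.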
class TrendsPage(TwitterJsonHTMLPage):

    @method
    class get_trendy_subjects(ListElement):
        item_xpath = '//li[@class="trend-item js-trend-item "]'

        class item(ItemElement):
            klass = BaseObject

            obj_id = Attr('.', 'data-trend-name')
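

# ListElement shared by every timeline page: each tweet <div> becomes a Thread
# whose id is "<screen name>#<tweet id>"; get_last_id returns the id of the
# last tweet on the page, used by the subclasses as a pagination cursor.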
class TimelineListElement(ListElement):
    item_xpath = '//*[@data-item-type="tweet"]/div[@data-tweet-id]'
    ignore_duplicate = True

    def get_last_id(self):
        _el = self.page.doc.xpath('//*[@data-item-type="tweet"]/div')[-1]
        return CleanText('./@data-tweet-id')(_el)

    class item(ItemElement):
        klass = Thread

        obj_id = Format('%s#%s', CleanText('./@data-screen-name'), CleanText('./@data-tweet-id'))
        obj_title = Format('%s \n\t %s',
                           CleanText('./div/div[@class="stream-item-header"]/a|./div/div[@class="ProfileTweet-authorDetails"]/a',
                                     replace=[('@ ', '@'), ('# ', '#'), ('http:// ', 'http://')]),
                           CleanText('./div/div/p',
                                     replace=[('@ ', '@'), ('# ', '#'), ('http:// ', 'http://')]))
        obj_date = DatetimeFromTimestamp(Attr('./div/div[@class="stream-item-header"]/small/a/span | ./div/div[@class="ProfileTweet-authorDetails"]/span/a/span', 'data-time'))
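

# Public timeline of a given user, paginated with the max_position parameter.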
class TimelinePage(TwitterJsonHTMLPage):

    @pagination
    @method
    class iter_threads(TimelineListElement):

        def next_page(self):
            if self.page.has_next:
                return u'%s?max_position=%s' % (self.page.url.split('?')[0], self.get_last_id())
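

# Home timeline of the logged-in user, paginated with the max_id parameter.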
class HomeTimelinePage(TwitterJsonHTMLPage, LoggedPage):

    @pagination
    @method
    class iter_threads(TimelineListElement):

        def next_page(self):
            if self.page.has_next:
                return u'%s?max_id=%s' % (self.page.url.split('?')[0], self.get_last_id())
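

# Search results timeline: the next page reuses the query parameters stored in
# the pagination environment, with max_position set from the min_position
# cursor returned by the previous response.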
class SearchTimelinePage(TwitterJsonHTMLPage):

    @pagination
    @method
    class iter_threads(TimelineListElement):

        def next_page(self):
            params = self.env['params']
            params['max_position'] = self.page.min_position
            if 'min_position' in self.env and not params['max_position']:
                params['max_position'] = self.env['min_position']

            if self.page.has_next:
                return u'%s?%s' % (self.page.url.split('?')[0], urlencode(params))


class LoginErrorPage(HTMLPage):
    pass


class Tweet(JsonPage):
    pass