PageRenderTime 93ms CodeModel.GetById 45ms app.highlight 25ms RepoModel.GetById 14ms app.codeStats 0ms

/modules/twitter/pages.py

https://gitlab.com/phyks/weboob
Python | 207 lines | 154 code | 36 blank | 17 comment | 16 complexity | fda83b4258a320b4818f71900eb60d95 MD5 | raw file
  1# -*- coding: utf-8 -*-
  2
  3# Copyright(C) 2014      Bezleputh
  4#
  5# This file is part of weboob.
  6#
  7# weboob is free software: you can redistribute it and/or modify
  8# it under the terms of the GNU Affero General Public License as published by
  9# the Free Software Foundation, either version 3 of the License, or
 10# (at your option) any later version.
 11#
 12# weboob is distributed in the hope that it will be useful,
 13# but WITHOUT ANY WARRANTY; without even the implied warranty of
 14# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 15# GNU Affero General Public License for more details.
 16#
 17# You should have received a copy of the GNU Affero General Public License
 18# along with weboob. If not, see <http://www.gnu.org/licenses/>.
 19
 20from datetime import datetime
 21from weboob.tools.date import DATE_TRANSLATE_FR
 22from io import StringIO
 23import lxml.html as html
 24import urllib
 25
 26from weboob.tools.json import json
 27from weboob.browser.pages import HTMLPage, JsonPage, FormNotFound, pagination, LoggedPage
 28from weboob.browser.elements import ListElement, ItemElement, method
 29from weboob.browser.filters.standard import CleanText, Format, Regexp, Env, DateTime, Filter
 30from weboob.browser.filters.html import Link, Attr
 31from weboob.capabilities.messages import Thread, Message
 32from weboob.capabilities.base import BaseObject
 33
 34
 35class DatetimeFromTimestamp(Filter):
 36    def filter(self, el):
 37        return datetime.fromtimestamp(float(el))
 38
 39
 40class TwitterJsonHTMLPage(JsonPage):
 41
 42    ENCODING = None
 43    has_next = None
 44    scroll_cursor = None
 45
 46    def __init__(self, browser, response, *args, **kwargs):
 47        super(TwitterJsonHTMLPage, self).__init__(browser, response, *args, **kwargs)
 48        self.encoding = self.ENCODING or response.encoding
 49        parser = html.HTMLParser(encoding=self.encoding)
 50        if 'module_html' in self.doc:
 51            self.doc = html.parse(StringIO(self.doc['module_html']), parser)
 52        else:
 53            self.has_next = self.doc['has_more_items']
 54
 55            self.min_position = None
 56            if 'min_position' in self.doc:
 57                self.min_position = self.doc['min_position']
 58
 59            if self.doc['items_html']:
 60                el = html.parse(StringIO(self.doc['items_html']), parser)
 61                self.doc = el if el.getroot() is not None else html.Element('brinbrin')
 62            else:
 63                self.doc = html.Element('brinbrin')
 64
 65
 66class LoginPage(HTMLPage):
 67    def login(self, login, passwd):
 68        try:
 69            form = self.get_form(xpath='//form[@action="https://twitter.com/sessions"]')
 70            form['session[username_or_email]'] = login
 71            form['session[password]'] = passwd
 72            form.submit()
 73            return form['authenticity_token']
 74        except FormNotFound:
 75            return CleanText('(//input[@id="authenticity_token"])[1]/@value')(self.doc)
 76
 77    @property
 78    def logged(self):
 79        try:
 80            self.get_form(xpath='//form[@action="https://twitter.com/sessions"]')
 81            return False
 82        except FormNotFound:
 83            return True
 84
 85    def get_me(self):
 86        return Regexp(Link('//a[@data-nav="view_profile"]'), '/(.+)')(self.doc)
 87
 88
 89class ThreadPage(HTMLPage):
 90
 91    @method
 92    class get_thread(ItemElement):
 93        klass = Thread
 94
 95        obj_id = Format('%s#%s', Env('user'), Env('_id'))
 96        obj_title = Format('%s \n\t %s',
 97                           CleanText('//div[has-class("permalink-inner permalink-tweet-container")]/div/div/div/a',
 98                                     replace=[('@ ', '@'), ('# ', '#'), ('http:// ', 'http://')]),
 99                           CleanText('//div[has-class("permalink-inner permalink-tweet-container")]/div/div/p',
100                                     replace=[('@ ', '@'), ('# ', '#'), ('http:// ', 'http://')]))
101        obj_date = DateTime(Regexp(CleanText('//div[has-class("permalink-inner permalink-tweet-container")]/div/div/div[@class="client-and-actions"]/span/span'),
102                                   '(\d+:\d+).+- (.+\d{4})',
103                                   '\\2 \\1'), translations=DATE_TRANSLATE_FR)
104
105    @method
106    class iter_comments(ListElement):
107        item_xpath = '//ol[@id="stream-items-id"]/li/ol/div/li/div'
108
109        class item(ItemElement):
110            klass = Message
111
112            obj_id = Regexp(Link('./div/div/small/a', default=''), '/.+/status/(.+)', default=None)
113
114            obj_title = Regexp(CleanText('./div[@class="content"]/div/p[has-class("tweet-text")]',
115                                         replace=[('@ ', '@'), ('# ', '#'), ('http:// ', 'http://')]),
116                               '(.{50}|.+).+')
117            obj_content = CleanText('./div[@class="content"]/div/p[has-class("tweet-text")]',
118                                    replace=[('@ ', '@'), ('# ', '#'), ('http:// ', 'http://')])
119            obj_sender = Regexp(Link('./div/div/small/a', default=''), '/(.+)/status/.+', default=None)
120            obj_date = DatetimeFromTimestamp(Attr('./div/div[@class="stream-item-header"]/small/a/span | ./div/div[@class="ProfileTweet-authorDetails"]/span/a/span', 'data-time'))
121
122            def validate(self, obj):
123                return obj.id is not None
124
125
class SearchPage(HTMLPage):
    """Search page exposing values embedded in its markup."""

    def get_trends_token(self):
        """Return ``trendsCacheKey`` from the JSON held by the ``init-data`` input."""
        raw_init_data = CleanText('//input[@id="init-data"]/@value')(self.doc)
        init_data = json.loads(raw_init_data)
        return init_data['trendsCacheKey']

    def get_min_position(self):
        """Return the ``data-min-position`` attribute of the stream container."""
        position_filter = CleanText('//div[@class="stream-container "]/@data-min-position')
        return position_filter(self.doc)
133
134
class TrendsPage(TwitterJsonHTMLPage):
    """Page listing trend items found in the embedded HTML fragment."""

    @method
    class get_trendy_subjects(ListElement):
        """Yield one ``BaseObject`` per trend item, its id being the trend name."""
        # The class attribute is matched verbatim, trailing spaces included.
        item_xpath = '//li[@class="trend-item js-trend-item  "]'

        class item(ItemElement):
            klass = BaseObject

            # Identifier read from the ``data-trend-name`` attribute of the <li>.
            obj_id = Attr('.', 'data-trend-name')
145
146
class TimelineListElement(ListElement):
    """List element shared by the timeline pages: one ``Thread`` per tweet."""
    ignore_duplicate = True
    item_xpath = '//*[@data-item-type="tweet"]/div[@data-tweet-id]'

    def get_last_id(self):
        """Return the ``data-tweet-id`` of the last tweet container of the page.

        Raises ``IndexError`` when the document holds no tweet container.
        """
        tweet_containers = self.page.doc.xpath('//*[@data-item-type="tweet"]/div')
        last_container = tweet_containers[-1]
        return CleanText('./@data-tweet-id')(last_container)

    class item(ItemElement):
        """Thread built from one tweet container of the timeline."""
        klass = Thread

        # Identifier has the form "<screen name>#<tweet id>".
        obj_id = Format('%s#%s', CleanText('./@data-screen-name'), CleanText('./@data-tweet-id'))
        obj_title = Format('%s \n\t %s',
                           CleanText('./div/div[@class="stream-item-header"]/a|./div/div[@class="ProfileTweet-authorDetails"]/a',
                                     replace=[('@ ', '@'), ('# ', '#'), ('http:// ', 'http://')]),
                           CleanText('./div/div/p',
                                     replace=[('@ ', '@'), ('# ', '#'), ('http:// ', 'http://')]))
        # The timestamp is read from ``data-time`` in either header layout.
        obj_date = DatetimeFromTimestamp(Attr('./div/div[@class="stream-item-header"]/small/a/span | ./div/div[@class="ProfileTweet-authorDetails"]/span/a/span', 'data-time'))
165
166
class TimelinePage(TwitterJsonHTMLPage):
    """Timeline page paginated with the ``max_position`` query parameter."""

    @pagination
    @method
    class iter_threads(TimelineListElement):

        def next_page(self):
            """Return the URL of the next page, or ``None`` when there is none."""
            if not self.page.has_next:
                return None
            base_url = self.page.url.split('?')[0]
            return u'%s?max_position=%s' % (base_url, self.get_last_id())
175
176
class HomeTimelinePage(TwitterJsonHTMLPage, LoggedPage):
    """Home timeline page paginated with the ``max_id`` query parameter."""

    @pagination
    @method
    class iter_threads(TimelineListElement):

        def next_page(self):
            """Return the URL of the next page, or ``None`` when there is none."""
            if not self.page.has_next:
                return None
            base_url = self.page.url.split('?')[0]
            return u'%s?max_id=%s' % (base_url, self.get_last_id())
185
186
class SearchTimelinePage(TwitterJsonHTMLPage):
    """Search timeline page paginated by re-encoding the query parameters."""

    @pagination
    @method
    class iter_threads(TimelineListElement):

        def next_page(self):
            """Return the URL of the next search page, or ``None`` at the end.

            The ``params`` mapping of the environment is updated in place:
            ``max_position`` takes the page's ``min_position`` or, when that
            value is empty, the ``min_position`` of the environment if any.
            """
            # ``urlencode`` lives in ``urllib.parse`` on Python 3 and directly
            # in ``urllib`` on Python 2; support both.
            try:
                from urllib.parse import urlencode
            except ImportError:
                from urllib import urlencode

            params = self.env['params']
            params['max_position'] = self.page.min_position
            if 'min_position' in self.env and not params['max_position']:
                params['max_position'] = self.env['min_position']

            if self.page.has_next:
                return u'%s?%s' % (self.page.url.split('?')[0], urlencode(params))
200
201
class LoginErrorPage(HTMLPage):
    """HTML page with no parsing logic of its own.

    NOTE(review): presumably matched when a login attempt fails -- confirm
    against the browser's URL routing.
    """
204
205
class Tweet(JsonPage):
    """JSON page with no parsing logic of its own.

    NOTE(review): presumably the response to posting a tweet -- confirm
    against the browser's URL routing.
    """