PageRenderTime 81ms CodeModel.GetById 72ms app.highlight 7ms RepoModel.GetById 1ms app.codeStats 0ms

/modules/twitter/pages.py

https://github.com/laurentb/weboob
Python | 207 lines | 154 code | 36 blank | 17 comment | 16 complexity | 634835bd8b81a9cd38ac57c349b52e7f MD5 | raw file
  1# -*- coding: utf-8 -*-
  2
  3# Copyright(C) 2014      Bezleputh
  4#
  5# This file is part of a weboob module.
  6#
  7# This weboob module is free software: you can redistribute it and/or modify
  8# it under the terms of the GNU Affero General Public License as published by
  9# the Free Software Foundation, either version 3 of the License, or
 10# (at your option) any later version.
 11#
 12# This weboob module is distributed in the hope that it will be useful,
 13# but WITHOUT ANY WARRANTY; without even the implied warranty of
 14# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 15# GNU Affero General Public License for more details.
 16#
 17# You should have received a copy of the GNU Affero General Public License
 18# along with this weboob module. If not, see <http://www.gnu.org/licenses/>.
 19
 20from datetime import datetime
 21from weboob.tools.date import DATE_TRANSLATE_FR
 22from io import StringIO
 23import lxml.html as html
 24
 25from weboob.tools.json import json
 26from weboob.browser.pages import HTMLPage, JsonPage, FormNotFound, pagination, LoggedPage
 27from weboob.browser.elements import ListElement, ItemElement, method
 28from weboob.browser.filters.standard import CleanText, Format, Regexp, Env, DateTime, Filter
 29from weboob.browser.filters.html import Link, Attr
 30from weboob.capabilities.messages import Thread, Message
 31from weboob.capabilities.base import BaseObject
 32from weboob.tools.compat import urlencode
 33
 34
 35class DatetimeFromTimestamp(Filter):
 36    def filter(self, el):
 37        return datetime.fromtimestamp(float(el))
 38
 39
 40class TwitterJsonHTMLPage(JsonPage):
 41
 42    ENCODING = None
 43    has_next = None
 44    scroll_cursor = None
 45
 46    def __init__(self, browser, response, *args, **kwargs):
 47        super(TwitterJsonHTMLPage, self).__init__(browser, response, *args, **kwargs)
 48        self.encoding = self.ENCODING or response.encoding
 49        parser = html.HTMLParser(encoding=self.encoding)
 50        if 'module_html' in self.doc:
 51            self.doc = html.parse(StringIO(self.doc['module_html']), parser)
 52        else:
 53            self.has_next = self.doc['has_more_items']
 54
 55            self.min_position = None
 56            if 'min_position' in self.doc:
 57                self.min_position = self.doc['min_position']
 58
 59            if self.doc['items_html']:
 60                el = html.parse(StringIO(self.doc['items_html']), parser)
 61                self.doc = el if el.getroot() is not None else html.Element('brinbrin')
 62            else:
 63                self.doc = html.Element('brinbrin')
 64
 65
 66class LoginPage(HTMLPage):
 67    def login(self, login, passwd):
 68        try:
 69            form = self.get_form(xpath='//form[@action="https://twitter.com/sessions"]')
 70            form['session[username_or_email]'] = login
 71            form['session[password]'] = passwd
 72            form.submit()
 73            return form['authenticity_token']
 74        except FormNotFound:
 75            return CleanText('(//input[@id="authenticity_token"])[1]/@value')(self.doc)
 76
 77    @property
 78    def logged(self):
 79        try:
 80            self.get_form(xpath='//form[@action="https://twitter.com/sessions"]')
 81            return False
 82        except FormNotFound:
 83            return True
 84
 85    def get_me(self):
 86        return Regexp(Link('//a[@data-nav="view_profile"]'), '/(.+)')(self.doc)
 87
 88
 89class ThreadPage(HTMLPage):
 90
 91    @method
 92    class get_thread(ItemElement):
 93        klass = Thread
 94
 95        obj_id = Format('%s#%s', Env('user'), Env('_id'))
 96        obj_title = Format('%s \n\t %s',
 97                           CleanText('//div[has-class("permalink-inner permalink-tweet-container")]/div/div/div/a',
 98                                     replace=[('@ ', '@'), ('# ', '#'), ('http:// ', 'http://')]),
 99                           CleanText('//div[has-class("permalink-inner permalink-tweet-container")]/div/div/p',
100                                     replace=[('@ ', '@'), ('# ', '#'), ('http:// ', 'http://')]))
101        obj_date = DateTime(Regexp(CleanText('//div[has-class("permalink-inner permalink-tweet-container")]/div/div/div[@class="client-and-actions"]/span/span'),
102                                   '(\d+:\d+).+- (.+\d{4})',
103                                   '\\2 \\1'), translations=DATE_TRANSLATE_FR)
104
105    @method
106    class iter_comments(ListElement):
107        item_xpath = '//ol[@id="stream-items-id"]/li/ol/div/li/div'
108
109        class item(ItemElement):
110            klass = Message
111
112            obj_id = Regexp(Link('./div/div/small/a', default=''), '/.+/status/(.+)', default=None)
113
114            obj_title = Regexp(CleanText('./div[@class="content"]/div/p[has-class("tweet-text")]',
115                                         replace=[('@ ', '@'), ('# ', '#'), ('http:// ', 'http://')]),
116                               '(.{50}|.+).+')
117            obj_content = CleanText('./div[@class="content"]/div/p[has-class("tweet-text")]',
118                                    replace=[('@ ', '@'), ('# ', '#'), ('http:// ', 'http://')])
119            obj_sender = Regexp(Link('./div/div/small/a', default=''), '/(.+)/status/.+', default=None)
120            obj_date = DatetimeFromTimestamp(Attr('./div/div[@class="stream-item-header"]/small/a/span | ./div/div[@class="ProfileTweet-authorDetails"]/span/a/span', 'data-time'))
121
122            def validate(self, obj):
123                return obj.id is not None
124
125
class SearchPage(HTMLPage):
    """Search page exposing the trends cache key and the stream position."""

    def get_trends_token(self):
        """Return ``trendsCacheKey`` from the JSON held by ``input#init-data``."""
        raw_init_data = CleanText('//input[@id="init-data"]/@value')(self.doc)
        init_data = json.loads(raw_init_data)
        return init_data['trendsCacheKey']

    def get_min_position(self):
        """Return the ``data-min-position`` attribute of the stream container."""
        position_filter = CleanText('//div[@class="stream-container "]/@data-min-position')
        return position_filter(self.doc)
133
134
class TrendsPage(TwitterJsonHTMLPage):
    """JSON/HTML page listing trend items."""

    @method
    class get_trendy_subjects(ListElement):
        """Iterate over the ``li`` trend items of the parsed document."""

        # The class attribute is matched literally, trailing spaces included.
        item_xpath = '//li[@class="trend-item js-trend-item  "]'

        class item(ItemElement):
            klass = BaseObject

            # The id is the "data-trend-name" attribute of the item itself.
            obj_id = Attr('.', 'data-trend-name')
145
146
class TimelineListElement(ListElement):
    """Shared list element yielding one ``Thread`` per tweet ``div``."""

    item_xpath = '//*[@data-item-type="tweet"]/div[@data-tweet-id]'
    ignore_duplicate = True

    def get_last_id(self):
        """Return the ``data-tweet-id`` of the last tweet ``div`` in the page.

        Raises ``IndexError`` when the page holds no such ``div``.
        """
        tweet_divs = self.page.doc.xpath('//*[@data-item-type="tweet"]/div')
        last_div = tweet_divs[-1]
        id_filter = CleanText('./@data-tweet-id')
        return id_filter(last_div)

    class item(ItemElement):
        klass = Thread

        # Id: "<data-screen-name>#<data-tweet-id>".
        obj_id = Format(
            '%s#%s',
            CleanText('./@data-screen-name'),
            CleanText('./@data-tweet-id'),
        )
        # Title: header link text, then the tweet paragraph text.
        obj_title = Format(
            '%s \n\t %s',
            CleanText(
                './div/div[@class="stream-item-header"]/a|./div/div[@class="ProfileTweet-authorDetails"]/a',
                replace=[('@ ', '@'), ('# ', '#'), ('http:// ', 'http://')],
            ),
            CleanText(
                './div/div/p',
                replace=[('@ ', '@'), ('# ', '#'), ('http:// ', 'http://')],
            ),
        )
        # Date: built from the "data-time" attribute of the header span.
        obj_date = DatetimeFromTimestamp(
            Attr(
                './div/div[@class="stream-item-header"]/small/a/span | ./div/div[@class="ProfileTweet-authorDetails"]/span/a/span',
                'data-time',
            )
        )
165
166
class TimelinePage(TwitterJsonHTMLPage):
    """Timeline page paginated through the ``max_position`` parameter."""

    @pagination
    @method
    class iter_threads(TimelineListElement):

        def next_page(self):
            """Return the next page URL, or ``None`` when ``has_next`` is falsy."""
            if not self.page.has_next:
                return None
            base_url = self.page.url.split('?')[0]
            return u'%s?max_position=%s' % (base_url, self.get_last_id())
175
176
class HomeTimelinePage(TwitterJsonHTMLPage, LoggedPage):
    """Home timeline page paginated through the ``max_id`` parameter."""

    @pagination
    @method
    class iter_threads(TimelineListElement):

        def next_page(self):
            """Return the next page URL, or ``None`` when ``has_next`` is falsy."""
            if not self.page.has_next:
                return None
            base_url = self.page.url.split('?')[0]
            return u'%s?max_id=%s' % (base_url, self.get_last_id())
185
186
class SearchTimelinePage(TwitterJsonHTMLPage):
    """Search timeline page paginated through the ``params`` env mapping."""

    @pagination
    @method
    class iter_threads(TimelineListElement):

        def next_page(self):
            """Return the next page URL, or ``None`` when ``has_next`` is falsy.

            ``env['params']`` is updated in place with the ``max_position``
            cursor, whether or not a next page exists.
            """
            params = self.env['params']

            # Prefer the cursor carried by the page; fall back to the one
            # supplied in the environment when the page has none.
            cursor = self.page.min_position
            if not cursor and 'min_position' in self.env:
                cursor = self.env['min_position']
            params['max_position'] = cursor

            if not self.page.has_next:
                return None
            base_url = self.page.url.split('?')[0]
            return u'%s?%s' % (base_url, urlencode(params))
200
201
class LoginErrorPage(HTMLPage):
    """HTML page with no behaviour of its own beyond ``HTMLPage``."""
204
205
class Tweet(JsonPage):
    """JSON page with no behaviour of its own beyond ``JsonPage``."""