PageRenderTime 43ms CodeModel.GetById 2ms app.highlight 35ms RepoModel.GetById 1ms app.codeStats 0ms

/webkit.py

https://code.google.com/p/webscraping/
Python | 486 lines | 450 code | 8 blank | 28 comment | 35 complexity | d08294a562c3a7bc40ee4a2addc830ae MD5 | raw file
  1__doc__ = 'Framework for crawling and scraping webpages with JQuery'
  2
  3import sys
  4import os
  5import re
  6import urllib2
  7import random
  8from time import time, sleep
  9from datetime import datetime
 10from PyQt4.QtGui import QApplication, QDesktopServices, QImage, QPainter
 11from PyQt4.QtCore import QByteArray, QString, QUrl, QTimer, QEventLoop, QIODevice, QObject, QVariant
 12from PyQt4.QtWebKit import QWebFrame, QWebView, QWebPage, QWebSettings
 13from PyQt4.QtNetwork import QNetworkAccessManager, QNetworkProxy, QNetworkRequest, QNetworkReply, QNetworkDiskCache
 14import common
 15import settings
 16import xpath
 17
 18"""
 19TODO
 20right click find xpath:
 21    http://doc.qt.nokia.com/4.6/webkit-domtraversal.html
 22    http://doc.qt.nokia.com/4.6/webkit-simpleselector.html
 23textbox for jquery input
 24    http://www.rkblog.rk.edu.pl/w/p/webkit-pyqt-rendering-web-pages/
 25threaded multiple URLs
 26
 27exit on close window signal
 28
 29add progress bar for loading page
 30implement watir API?
 31"""
 32
 33def qstring_to_unicode(qstr):
 34    """Convert QString to unicode
 35    """
 36    if isinstance(qstr, unicode):
 37        return qstr
 38    else:
 39        return common.to_unicode(qstr.toUtf8().data(), 'utf-8')
 40
 41
 42class NetworkAccessManager(QNetworkAccessManager):
 43    """Subclass QNetworkAccessManager for finer control network operations
 44    """
 45
 46    def __init__(self, proxy, allowed_media, allowed_regex, cache_size=100, cache_dir='.webkit_cache'):
 47        """
 48        See JQueryBrowser for details of arguments
 49        cache_size is the maximum size of the webkit cache (MB)
 50        """
 51        QNetworkAccessManager.__init__(self)
 52        # initialize the manager cache
 53        #QDesktopServices.storageLocation(QDesktopServices.CacheLocation)
 54        cache = QNetworkDiskCache()
 55        cache.setCacheDirectory(cache_dir)
 56        cache.setMaximumCacheSize(cache_size * 1024 * 1024) # need to convert cache value to bytes
 57        self.setCache(cache)
 58        self.allowed_regex = allowed_regex
 59        # allowed content extensions
 60        self.banned_extensions = common.MEDIA_EXTENSIONS
 61        for ext in allowed_media:
 62            if ext in self.banned_extensions:
 63                self.banned_extensions.remove(ext)
 64        # and proxy
 65        self.setProxy(proxy)
 66
 67
 68    def setProxy(self, proxy):
 69        """Allow setting string as proxy
 70        """
 71        if isinstance(proxy, basestring):
 72            match = re.match('((?P<username>\w+):(?P<password>\w+)@)?(?P<host>\d{1,3}.\d{1,3}.\d{1,3}.\d{1,3})(:(?P<port>\d+))?', proxy)
 73            if match:
 74                groups = match.groupdict()
 75                username = groups.get('username') or ''
 76                password = groups.get('password') or ''
 77                host = groups.get('host')
 78                port = groups.get('port')
 79                #print host, port, username, password
 80                proxy = QNetworkProxy(QNetworkProxy.HttpProxy, host, int(port), username, password)
 81            else:
 82                common.logger.info('Invalid proxy:' + proxy)
 83                proxy = None
 84        if proxy:
 85            QNetworkAccessManager.setProxy(self, proxy)
 86
 87
 88    def createRequest(self, operation, request, data):
 89        if operation == self.GetOperation:
 90            if self.is_forbidden(request):
 91                # deny GET request for banned media type by setting dummy URL
 92                # XXX abort properly
 93                request.setUrl(QUrl(QString('forbidden://localhost/')))
 94            else:
 95                common.logger.debug(common.to_unicode(request.url().toString().toUtf8().data()).encode('utf-8'))
 96        
 97        #print request.url().toString(), operation
 98        request.setAttribute(QNetworkRequest.CacheLoadControlAttribute, QNetworkRequest.PreferCache)
 99        reply = QNetworkAccessManager.createRequest(self, operation, request, data)
100        reply.error.connect(self.catch_error)
101        
102        #add Base-Url header, then we can get it from QWebView
103        if isinstance(request.originatingObject(), QWebFrame):
104            try:
105                reply.setRawHeader(QByteArray('Base-Url'), QByteArray('').append(request.originatingObject().page().mainFrame().baseUrl().toString()))
106            except Exception, e:
107                common.logger.debug(e)
108        return reply
109
110
111    def is_forbidden(self, request):
112        """Returns whether this request is permitted by checking URL extension and regex
113        XXX head request for mime?
114        """
115        forbidden = False
116        url = common.to_unicode(request.url().toString().toUtf8().data()).encode('utf-8')
117        if common.get_extension(url) in self.banned_extensions:
118            forbidden = True
119        elif re.match(self.allowed_regex, url) is None:
120            forbidden = True
121        return forbidden
122
123
124    def catch_error(self, eid):
125        if eid not in (5, 301):
126            errors = {
127                0 : 'no error condition. Note: When the HTTP protocol returns a redirect no error will be reported. You can check if there is a redirect with the QNetworkRequest::RedirectionTargetAttribute attribute.',
128                1 : 'the remote server refused the connection (the server is not accepting requests)',
129                2 : 'the remote server closed the connection prematurely, before the entire reply was received and processed',
130                3 : 'the remote host name was not found (invalid hostname)',
131                4 : 'the connection to the remote server timed out',
132                5 : 'the operation was canceled via calls to abort() or close() before it was finished.',
133                6 : 'the SSL/TLS handshake failed and the encrypted channel could not be established. The sslErrors() signal should have been emitted.',
134                7 : 'the connection was broken due to disconnection from the network, however the system has initiated roaming to another access point. The request should be resubmitted and will be processed as soon as the connection is re-established.',
135                101 : 'the connection to the proxy server was refused (the proxy server is not accepting requests)',
136                102 : 'the proxy server closed the connection prematurely, before the entire reply was received and processed',
137                103 : 'the proxy host name was not found (invalid proxy hostname)',
138                104 : 'the connection to the proxy timed out or the proxy did not reply in time to the request sent',
139                105 : 'the proxy requires authentication in order to honour the request but did not accept any credentials offered (if any)',
140                201 : 'the access to the remote content was denied (similar to HTTP error 401)',
141                202 : 'the operation requested on the remote content is not permitted',
142                203 : 'the remote content was not found at the server (similar to HTTP error 404)',
143                204 : 'the remote server requires authentication to serve the content but the credentials provided were not accepted (if any)',
144                205 : 'the request needed to be sent again, but this failed for example because the upload data could not be read a second time.',
145                301 : 'the Network Access API cannot honor the request because the protocol is not known',
146                302 : 'the requested operation is invalid for this protocol',
147                99 : 'an unknown network-related error was detected',
148                199 : 'an unknown proxy-related error was detected',
149                299 : 'an unknown error related to the remote content was detected',
150                399 : 'a breakdown in protocol was detected (parsing error, invalid or unexpected responses, etc.)',
151            }
152            common.logger.debug('Error %d: %s (%s)' % (eid, errors.get(eid, 'unknown error'), self.sender().url().toString()))
153
154
class NetworkReply(QNetworkReply):
    """Reply that wraps another QNetworkReply

    Metadata and payload of the wrapped reply are mirrored onto this object.
    The payload is kept twice: data accumulates everything received, while
    buffer only holds what has not yet been consumed through readData().
    """

    def __init__(self, parent, reply):
        """
        parent is passed on to the QNetworkReply constructor
        reply is the reply being wrapped
        """
        QNetworkReply.__init__(self, parent)
        self.reply = reply # the wrapped reply
        self.data = '' # everything downloaded so far
        self.buffer = '' # downloaded data still waiting to be read
        self.setOpenMode(QNetworkReply.ReadOnly | QNetworkReply.Unbuffered)

        # relay the wrapped reply's signals to the matching members of this object
        relayed = (
            (reply.metaDataChanged, self.applyMetaData),
            (reply.readyRead, self.readInternal),
            (reply.finished, self.finished),
            (reply.uploadProgress, self.uploadProgress),
            (reply.downloadProgress, self.downloadProgress),
        )
        for signal, handler in relayed:
            signal.connect(handler)


    def __getattribute__(self, attr):
        """Look up a fixed set of attributes on the wrapped reply, the rest on self
        """
        if attr not in ('operation', 'request', 'url', 'abort', 'close'):
            return QNetworkReply.__getattribute__(self, attr)
        # these attributes are answered by the wrapped reply
        return self.reply.__getattribute__(attr)

    def abort(self):
        # qt requires that this be defined
        pass

    def isSequential(self):
        return True

    def applyMetaData(self):
        """Copy raw headers, known headers and attributes from the wrapped reply

        Emits metaDataChanged once everything has been copied.
        """
        source = self.reply
        for name in source.rawHeaderList():
            self.setRawHeader(name, source.rawHeader(name))

        known_headers = (
            QNetworkRequest.ContentTypeHeader,
            QNetworkRequest.ContentLengthHeader,
            QNetworkRequest.LocationHeader,
            QNetworkRequest.LastModifiedHeader,
            QNetworkRequest.SetCookieHeader,
        )
        for known in known_headers:
            self.setHeader(known, source.header(known))

        # DoNotBufferUploadDataAttribute is left out: the attribute is undefined
        attributes = (
            QNetworkRequest.HttpStatusCodeAttribute,
            QNetworkRequest.HttpReasonPhraseAttribute,
            QNetworkRequest.RedirectionTargetAttribute,
            QNetworkRequest.ConnectionEncryptedAttribute,
            QNetworkRequest.CacheLoadControlAttribute,
            QNetworkRequest.CacheSaveControlAttribute,
            QNetworkRequest.SourceIsFromCacheAttribute,
        )
        for attribute in attributes:
            self.setAttribute(attribute, source.attribute(attribute))
        self.metaDataChanged.emit()

    def bytesAvailable(self):
        """Number of bytes that can be read: the local buffer plus the base class count
        """
        pending = len(self.buffer)
        return pending + QNetworkReply.bytesAvailable(self)

    def readInternal(self):
        """Pull newly arrived data from the wrapped reply and announce it
        """
        received = self.reply.readAll()
        self.data += received
        self.buffer += received
        self.readyRead.emit()

    def readData(self, size):
        """Remove and return up to size bytes from the front of the buffer
        """
        count = min(size, len(self.buffer))
        chunk = self.buffer[:count]
        self.buffer = self.buffer[count:]
        return str(chunk)
229
230
class WebPage(QWebPage):
    """QWebPage with a fixed User-Agent whose JavaScript dialogs are logged instead of shown
    """

    def __init__(self, user_agent, confirm=True):
        """
        user_agent is the value returned for every URL by userAgentForUrl
        confirm is the answer given to every JavaScript confirm() dialog
        """
        QWebPage.__init__(self)
        self.confirm = confirm
        self.user_agent = user_agent

    def userAgentForUrl(self, url):
        """Return the configured user agent regardless of url
        """
        return self.user_agent

    def javaScriptAlert(self, frame, message):
        """Log the text of a JavaScript alert rather than opening a popup
        """
        text = 'Alert:' + message
        common.logger.debug(text)

    def javaScriptConfirm(self, frame, message):
        """Log the text of a JavaScript confirm and answer with self.confirm
        """
        text = 'Confirm:' + message
        common.logger.debug(text)
        return self.confirm

    def javaScriptPrompt(self, frame, message, default):
        """Log the message and default value of a JavaScript prompt
        """
        details = (message, default)
        common.logger.debug('Prompt:%s%s' % details)

    def javaScriptConsoleMessage(self, message, line_number, source_id):
        """Log a JavaScript console message with its line number and source id
        """
        details = (message, line_number, source_id)
        common.logger.debug('Console:%s%s%s' % details)

    def shouldInterruptJavaScript(self):
        """Always answer True so the javascript interruption dialog box is not shown
        """
        return True
268
269
270
class WebkitBrowser(QWebView):
    """Render webpages using webkit

    Wraps a QWebView together with its QApplication, a WebPage and a
    NetworkAccessManager, and offers blocking helpers to load pages, run
    JavaScript and manipulate elements.
    """

    def __init__(self, base_url=None, gui=False, user_agent=None, proxy=None, allowed_media=None, allowed_regex='.*?', timeout=20, delay=5, enable_plugins=True):
        """
        base_url is the domain that will be crawled
        gui is whether to show webkit window or run headless
        user_agent is used to set the user-agent when downloading content
        proxy is a QNetworkProxy (or proxy string) to download through
        allowed_media are the media extensions to allow (default css and js)
        allowed_regex is a regular expressions of URLS to allow
        timeout is the maximum amount of seconds to wait for a request
        delay is the minimum amount of seconds to wait between requests
        enable_plugins is whether to enable webkit plugins (flash etc.)
        """
        # The application must exist before any widget is created. Reuse a
        # running one because Qt allows only a single QApplication per process.
        self.app = QApplication.instance() or QApplication(sys.argv)
        QWebView.__init__(self)
        webpage = WebPage(user_agent or settings.user_agent)
        allowed_media = allowed_media or ['css', 'js']
        manager = NetworkAccessManager(proxy, allowed_media, allowed_regex)
        manager.finished.connect(self.finished)
        webpage.setNetworkAccessManager(manager)
        self.setPage(webpage)
        self.setHtml('<html><head></head><body>No content loaded</body></html>', QUrl('http://localhost'))
        self.timeout = timeout
        self.delay = delay
        self.base_url = base_url
        self.jquery_lib = None # jquery source, downloaded on first use
        # enable flash plugin etc.
        self.settings().setAttribute(QWebSettings.PluginsEnabled, enable_plugins)
        if gui:
            self.show()

    def set_proxy(self, proxy):
        """Change the proxy used by this browser's network access manager
        """
        self.page().networkAccessManager().setProxy(proxy)

    def current_url(self):
        """Return current URL
        """
        return str(self.url().toString())

    def current_html(self):
        """Return current rendered HTML
        """
        return unicode(self.page().mainFrame().toHtml())


    def get(self, url=None, script=None, num_retries=1, jquery=False):
        """Load given url in webkit and return html when loaded

        url is loaded when given; otherwise script is executed
        script is some javascript to execute that will change the loaded page (eg form submission)
        num_retries is how many times to try downloading this URL or executing this script
        jquery is whether to inject JQuery into the document
            NOTE(review): the flag is only forwarded to retries; nothing in
            this method injects jQuery - call inject_jquery() explicitly

        Returns the rendered HTML, or '' when every attempt timed out.
        """
        started = time()
        self.base_url = self.base_url or url # set base URL if not set

        # block in a local event loop until the page loads or the timer fires
        loop = QEventLoop()
        timer = QTimer()
        timer.setSingleShot(True)
        timer.timeout.connect(loop.quit)
        self.loadFinished.connect(loop.quit)
        if url:
            self.load(QUrl(url))
        elif script:
            self.js(script)
        timer.start(self.timeout * 1000)
        loop.exec_() # delay here until download finished or timeout

        if timer.isActive():
            # the timer is still running, so the loop was ended by loadFinished
            timer.stop()
            parsed_html = self.current_html()
            # make the whole request take at least self.delay seconds
            self.wait(self.delay - (time() - started))
        elif num_retries > 0:
            # didn't download in time
            common.logger.debug('Timeout - retrying')
            parsed_html = self.get(url, script=script, num_retries=num_retries-1, jquery=jquery)
        else:
            common.logger.debug('Timed out')
            parsed_html = ''
        return parsed_html


    def wait(self, secs=1):
        """Keep processing Qt events for secs seconds

        Returns immediately when secs is zero or negative.
        """
        deadline = time() + secs
        while time() < deadline:
            sleep(0)
            self.app.processEvents()


    def jsget(self, script, num_retries=1, jquery=True):
        """Execute JavaScript that will cause page submission, and wait for page to load
        """
        return self.get(script=script, num_retries=num_retries, jquery=jquery)

    def js(self, script):
        """Shortcut to execute javascript on current document and return result
        """
        self.app.processEvents()
        return qstring_to_unicode(self.page().mainFrame().evaluateJavaScript(script).toString())

    def inject_jquery(self):
        """Inject jquery library into this webpage for easier manipulation

        The library is downloaded on the first call and kept in
        self.jquery_lib for later calls.
        """
        if self.jquery_lib is None:
            url = 'http://ajax.googleapis.com/ajax/libs/jquery/1/jquery.min.js'
            response = urllib2.urlopen(url)
            try:
                self.jquery_lib = response.read()
            finally:
                # release the connection even if reading fails
                response.close()
        self.js(self.jquery_lib)


    def click(self, pattern='input'):
        """Click all elements that match the pattern

        uses standard CSS pattern matching: http://www.w3.org/TR/CSS2/selector.html
        """
        for e in self.page().mainFrame().findAllElements(pattern):
            e.evaluateJavaScript("var evObj = document.createEvent('MouseEvents'); evObj.initEvent('click', true, true); this.dispatchEvent(evObj);")

    def attr(self, pattern, name, value=None):
        """Set attribute if value is defined, else get

        Getting reads the attribute from the first matching element; setting
        applies to all matching elements and returns None.
        """
        if value is None:
            # want to get attribute
            return str(self.page().mainFrame().findFirstElement(pattern).attribute(name))
        for e in self.page().mainFrame().findAllElements(pattern):
            e.setAttribute(name, value)

    def fill(self, pattern, value):
        """Set text of these elements to value

        input elements get their value property set through JavaScript, all
        other elements get their plain text replaced.
        """
        import json # only needed here, to quote value as a JavaScript literal

        script = None # built lazily, only if an input element is matched
        for e in self.page().mainFrame().findAllElements(pattern):
            tag = str(e.tagName()).lower()
            if tag == 'input':
                if script is None:
                    js_value = value
                    if isinstance(js_value, QString):
                        # json can not serialise QString
                        js_value = qstring_to_unicode(js_value)
                    # json.dumps escapes quotes, backslashes and newlines, so
                    # the value can not terminate the literal or inject script
                    script = 'this.value = %s' % json.dumps(js_value)
                e.evaluateJavaScript(script)
            else:
                e.setPlainText(value)

    def find(self, pattern):
        """Return the collection of elements matching this CSS selector pattern
        """
        return self.page().mainFrame().findAllElements(pattern)


    def data(self, url):
        """Get data for this downloaded resource, if exists

        Reads the resource from the network access manager's cache and
        returns None when there is no cache record for url.
        """
        record = self.page().networkAccessManager().cache().data(QUrl(url))
        if record:
            data = record.readAll()
            record.reset()
        else:
            data = None
        return data


    def run(self):
        """Run the Qt event loop so can interact with the browser
        """
        self.app.exec_() # start GUI thread

    def finished(self, reply):
        """Override this method in subclasses to process downloaded urls

        Connected to the network access manager's finished signal.
        """
        pass


    def screenshot(self, output_file):
        """Take screenshot of current webpage and save results

        Renders the main frame into an image of the current viewport size and
        saves it to output_file.
        """
        frame = self.page().mainFrame()
        image = QImage(self.page().viewportSize(), QImage.Format_ARGB32)
        painter = QPainter(image)
        frame.render(painter)
        painter.end()
        # format eagerly: passing output_file as an extra argument without a
        # placeholder in the message is a logging format error
        common.logger.debug('saving %s' % output_file)
        image.save(output_file)

    def closeEvent(self, event):
        """Catch the close window event and stop the script
        """
        sys.exit(self.app.quit())
471
472
if __name__ == '__main__':
    # Demo run with a visible window; pass gui=False once a script works
    # to run headless.
    browser = WebkitBrowser(gui=True)
    # open the search engine front page
    browser.get('http://duckduckgo.com')
    # type the query into the search box
    browser.fill('input[id=search_form_input_homepage]', 'sitescraper')
    # capture the page as it looks before submitting
    browser.screenshot('duckduckgo.jpg')
    # submit the search
    browser.click('input[id=search_button_homepage]')
    # keep the window alive for 10 seconds so the result can be seen
    browser.wait(10)