PageRenderTime 66ms CodeModel.GetById 24ms RepoModel.GetById 0ms app.codeStats 0ms

/webkit.py

https://code.google.com/p/webscraping/
Python | 486 lines | 450 code | 8 blank | 28 comment | 21 complexity | d08294a562c3a7bc40ee4a2addc830ae MD5 | raw file
  1. __doc__ = 'Framework for crawling and scraping webpages with JQuery'
  2. import sys
  3. import os
  4. import re
  5. import urllib2
  6. import random
  7. from time import time, sleep
  8. from datetime import datetime
  9. from PyQt4.QtGui import QApplication, QDesktopServices, QImage, QPainter
  10. from PyQt4.QtCore import QByteArray, QString, QUrl, QTimer, QEventLoop, QIODevice, QObject, QVariant
  11. from PyQt4.QtWebKit import QWebFrame, QWebView, QWebPage, QWebSettings
  12. from PyQt4.QtNetwork import QNetworkAccessManager, QNetworkProxy, QNetworkRequest, QNetworkReply, QNetworkDiskCache
  13. import common
  14. import settings
  15. import xpath
  16. """
  17. TODO
  18. right click find xpath:
  19. http://doc.qt.nokia.com/4.6/webkit-domtraversal.html
  20. http://doc.qt.nokia.com/4.6/webkit-simpleselector.html
  21. textbox for jquery input
  22. http://www.rkblog.rk.edu.pl/w/p/webkit-pyqt-rendering-web-pages/
  23. threaded multiple URLs
  24. exit on close window signal
  25. add progress bar for loading page
  26. implement watir API?
  27. """
  28. def qstring_to_unicode(qstr):
  29. """Convert QString to unicode
  30. """
  31. if isinstance(qstr, unicode):
  32. return qstr
  33. else:
  34. return common.to_unicode(qstr.toUtf8().data(), 'utf-8')
  35. class NetworkAccessManager(QNetworkAccessManager):
  36. """Subclass QNetworkAccessManager for finer control network operations
  37. """
  38. def __init__(self, proxy, allowed_media, allowed_regex, cache_size=100, cache_dir='.webkit_cache'):
  39. """
  40. See JQueryBrowser for details of arguments
  41. cache_size is the maximum size of the webkit cache (MB)
  42. """
  43. QNetworkAccessManager.__init__(self)
  44. # initialize the manager cache
  45. #QDesktopServices.storageLocation(QDesktopServices.CacheLocation)
  46. cache = QNetworkDiskCache()
  47. cache.setCacheDirectory(cache_dir)
  48. cache.setMaximumCacheSize(cache_size * 1024 * 1024) # need to convert cache value to bytes
  49. self.setCache(cache)
  50. self.allowed_regex = allowed_regex
  51. # allowed content extensions
  52. self.banned_extensions = common.MEDIA_EXTENSIONS
  53. for ext in allowed_media:
  54. if ext in self.banned_extensions:
  55. self.banned_extensions.remove(ext)
  56. # and proxy
  57. self.setProxy(proxy)
  58. def setProxy(self, proxy):
  59. """Allow setting string as proxy
  60. """
  61. if isinstance(proxy, basestring):
  62. match = re.match('((?P<username>\w+):(?P<password>\w+)@)?(?P<host>\d{1,3}.\d{1,3}.\d{1,3}.\d{1,3})(:(?P<port>\d+))?', proxy)
  63. if match:
  64. groups = match.groupdict()
  65. username = groups.get('username') or ''
  66. password = groups.get('password') or ''
  67. host = groups.get('host')
  68. port = groups.get('port')
  69. #print host, port, username, password
  70. proxy = QNetworkProxy(QNetworkProxy.HttpProxy, host, int(port), username, password)
  71. else:
  72. common.logger.info('Invalid proxy:' + proxy)
  73. proxy = None
  74. if proxy:
  75. QNetworkAccessManager.setProxy(self, proxy)
  76. def createRequest(self, operation, request, data):
  77. if operation == self.GetOperation:
  78. if self.is_forbidden(request):
  79. # deny GET request for banned media type by setting dummy URL
  80. # XXX abort properly
  81. request.setUrl(QUrl(QString('forbidden://localhost/')))
  82. else:
  83. common.logger.debug(common.to_unicode(request.url().toString().toUtf8().data()).encode('utf-8'))
  84. #print request.url().toString(), operation
  85. request.setAttribute(QNetworkRequest.CacheLoadControlAttribute, QNetworkRequest.PreferCache)
  86. reply = QNetworkAccessManager.createRequest(self, operation, request, data)
  87. reply.error.connect(self.catch_error)
  88. #add Base-Url header, then we can get it from QWebView
  89. if isinstance(request.originatingObject(), QWebFrame):
  90. try:
  91. reply.setRawHeader(QByteArray('Base-Url'), QByteArray('').append(request.originatingObject().page().mainFrame().baseUrl().toString()))
  92. except Exception, e:
  93. common.logger.debug(e)
  94. return reply
  95. def is_forbidden(self, request):
  96. """Returns whether this request is permitted by checking URL extension and regex
  97. XXX head request for mime?
  98. """
  99. forbidden = False
  100. url = common.to_unicode(request.url().toString().toUtf8().data()).encode('utf-8')
  101. if common.get_extension(url) in self.banned_extensions:
  102. forbidden = True
  103. elif re.match(self.allowed_regex, url) is None:
  104. forbidden = True
  105. return forbidden
  106. def catch_error(self, eid):
  107. if eid not in (5, 301):
  108. errors = {
  109. 0 : 'no error condition. Note: When the HTTP protocol returns a redirect no error will be reported. You can check if there is a redirect with the QNetworkRequest::RedirectionTargetAttribute attribute.',
  110. 1 : 'the remote server refused the connection (the server is not accepting requests)',
  111. 2 : 'the remote server closed the connection prematurely, before the entire reply was received and processed',
  112. 3 : 'the remote host name was not found (invalid hostname)',
  113. 4 : 'the connection to the remote server timed out',
  114. 5 : 'the operation was canceled via calls to abort() or close() before it was finished.',
  115. 6 : 'the SSL/TLS handshake failed and the encrypted channel could not be established. The sslErrors() signal should have been emitted.',
  116. 7 : 'the connection was broken due to disconnection from the network, however the system has initiated roaming to another access point. The request should be resubmitted and will be processed as soon as the connection is re-established.',
  117. 101 : 'the connection to the proxy server was refused (the proxy server is not accepting requests)',
  118. 102 : 'the proxy server closed the connection prematurely, before the entire reply was received and processed',
  119. 103 : 'the proxy host name was not found (invalid proxy hostname)',
  120. 104 : 'the connection to the proxy timed out or the proxy did not reply in time to the request sent',
  121. 105 : 'the proxy requires authentication in order to honour the request but did not accept any credentials offered (if any)',
  122. 201 : 'the access to the remote content was denied (similar to HTTP error 401)',
  123. 202 : 'the operation requested on the remote content is not permitted',
  124. 203 : 'the remote content was not found at the server (similar to HTTP error 404)',
  125. 204 : 'the remote server requires authentication to serve the content but the credentials provided were not accepted (if any)',
  126. 205 : 'the request needed to be sent again, but this failed for example because the upload data could not be read a second time.',
  127. 301 : 'the Network Access API cannot honor the request because the protocol is not known',
  128. 302 : 'the requested operation is invalid for this protocol',
  129. 99 : 'an unknown network-related error was detected',
  130. 199 : 'an unknown proxy-related error was detected',
  131. 299 : 'an unknown error related to the remote content was detected',
  132. 399 : 'a breakdown in protocol was detected (parsing error, invalid or unexpected responses, etc.)',
  133. }
  134. common.logger.debug('Error %d: %s (%s)' % (eid, errors.get(eid, 'unknown error'), self.sender().url().toString()))
  135. class NetworkReply(QNetworkReply):
  136. def __init__(self, parent, reply):
  137. QNetworkReply.__init__(self, parent)
  138. self.reply = reply # reply to proxy
  139. self.data = '' # contains downloaded data
  140. self.buffer = '' # contains buffer of data to read
  141. self.setOpenMode(QNetworkReply.ReadOnly | QNetworkReply.Unbuffered)
  142. #print dir(reply)
  143. # connect signal from proxy reply
  144. reply.metaDataChanged.connect(self.applyMetaData)
  145. reply.readyRead.connect(self.readInternal)
  146. reply.finished.connect(self.finished)
  147. reply.uploadProgress.connect(self.uploadProgress)
  148. reply.downloadProgress.connect(self.downloadProgress)
  149. def __getattribute__(self, attr):
  150. """Send undefined methods straight through to proxied reply
  151. """
  152. # send these attributes through to proxy reply
  153. if attr in ('operation', 'request', 'url', 'abort', 'close'):#, 'isSequential'):
  154. value = self.reply.__getattribute__(attr)
  155. else:
  156. value = QNetworkReply.__getattribute__(self, attr)
  157. #print attr, value
  158. return value
  159. def abort(self):
  160. pass # qt requires that this be defined
  161. def isSequential(self):
  162. return True
  163. def applyMetaData(self):
  164. for header in self.reply.rawHeaderList():
  165. self.setRawHeader(header, self.reply.rawHeader(header))
  166. self.setHeader(QNetworkRequest.ContentTypeHeader, self.reply.header(QNetworkRequest.ContentTypeHeader))
  167. self.setHeader(QNetworkRequest.ContentLengthHeader, self.reply.header(QNetworkRequest.ContentLengthHeader))
  168. self.setHeader(QNetworkRequest.LocationHeader, self.reply.header(QNetworkRequest.LocationHeader))
  169. self.setHeader(QNetworkRequest.LastModifiedHeader, self.reply.header(QNetworkRequest.LastModifiedHeader))
  170. self.setHeader(QNetworkRequest.SetCookieHeader, self.reply.header(QNetworkRequest.SetCookieHeader))
  171. self.setAttribute(QNetworkRequest.HttpStatusCodeAttribute, self.reply.attribute(QNetworkRequest.HttpStatusCodeAttribute))
  172. self.setAttribute(QNetworkRequest.HttpReasonPhraseAttribute, self.reply.attribute(QNetworkRequest.HttpReasonPhraseAttribute))
  173. self.setAttribute(QNetworkRequest.RedirectionTargetAttribute, self.reply.attribute(QNetworkRequest.RedirectionTargetAttribute))
  174. self.setAttribute(QNetworkRequest.ConnectionEncryptedAttribute, self.reply.attribute(QNetworkRequest.ConnectionEncryptedAttribute))
  175. self.setAttribute(QNetworkRequest.CacheLoadControlAttribute, self.reply.attribute(QNetworkRequest.CacheLoadControlAttribute))
  176. self.setAttribute(QNetworkRequest.CacheSaveControlAttribute, self.reply.attribute(QNetworkRequest.CacheSaveControlAttribute))
  177. self.setAttribute(QNetworkRequest.SourceIsFromCacheAttribute, self.reply.attribute(QNetworkRequest.SourceIsFromCacheAttribute))
  178. # attribute is undefined
  179. #self.setAttribute(QNetworkRequest.DoNotBufferUploadDataAttribute, self.reply.attribute(QNetworkRequest.DoNotBufferUploadDataAttribute))
  180. self.metaDataChanged.emit()
  181. def bytesAvailable(self):
  182. """How many bytes in the buffer are available to be read
  183. """
  184. return len(self.buffer) + QNetworkReply.bytesAvailable(self)
  185. def readInternal(self):
  186. """New data available to read
  187. """
  188. s = self.reply.readAll()
  189. self.data += s
  190. self.buffer += s
  191. self.readyRead.emit()
  192. def readData(self, size):
  193. """Return up to size bytes from buffer
  194. """
  195. size = min(size, len(self.buffer))
  196. data, self.buffer = self.buffer[:size], self.buffer[size:]
  197. return str(data)
  198. class WebPage(QWebPage):
  199. """Override QWebPage to set User-Agent and JavaScript messages
  200. """
  201. def __init__(self, user_agent, confirm=True):
  202. QWebPage.__init__(self)
  203. self.user_agent = user_agent
  204. self.confirm = confirm
  205. def userAgentForUrl(self, url):
  206. return self.user_agent
  207. def javaScriptAlert(self, frame, message):
  208. """Override default JavaScript alert popup and print results
  209. """
  210. common.logger.debug('Alert:' + message)
  211. def javaScriptConfirm(self, frame, message):
  212. """Override default JavaScript confirm popup and print results
  213. """
  214. common.logger.debug('Confirm:' + message)
  215. return self.confirm
  216. def javaScriptPrompt(self, frame, message, default):
  217. """Override default JavaScript prompt popup and print results
  218. """
  219. common.logger.debug('Prompt:%s%s' % (message, default))
  220. def javaScriptConsoleMessage(self, message, line_number, source_id):
  221. """Print JavaScript console messages
  222. """
  223. common.logger.debug('Console:%s%s%s' % (message, line_number, source_id))
  224. def shouldInterruptJavaScript(self):
  225. """Disable javascript interruption dialog box
  226. """
  227. return True
  228. class WebkitBrowser(QWebView):
  229. """Render webpages using webkit
  230. """
  231. def __init__(self, base_url=None, gui=False, user_agent=None, proxy=None, allowed_media=None, allowed_regex='.*?', timeout=20, delay=5, enable_plugins=True):#, cache_file=None):
  232. """
  233. base_url is the domain that will be crawled
  234. gui is whether to show webkit window or run headless
  235. user_agent is used to set the user-agent when downloading content
  236. proxy is a QNetworkProxy to download through
  237. allowed_media are the media extensions to allow
  238. allowed_regex is a regular expressions of URLS to allow
  239. timeout is the maximum amount of seconds to wait for a request
  240. delay is the minimum amount of seconds to wait between requests
  241. """
  242. self.app = QApplication(sys.argv) # must instantiate first
  243. QWebView.__init__(self)
  244. webpage = WebPage(user_agent or settings.user_agent)
  245. allowed_media = allowed_media or ['css', 'js']
  246. manager = NetworkAccessManager(proxy, allowed_media, allowed_regex)
  247. manager.finished.connect(self.finished)
  248. webpage.setNetworkAccessManager(manager)
  249. self.setPage(webpage)
  250. self.setHtml('<html><head></head><body>No content loaded</body></html>', QUrl('http://localhost'))
  251. self.timeout = timeout
  252. self.delay = delay
  253. #self.cache = pdict.PersistentDict(cache_file or settings.cache_file) # cache to store webpages
  254. self.base_url = base_url
  255. self.jquery_lib = None
  256. #enable flash plugin etc.
  257. self.settings().setAttribute(QWebSettings.PluginsEnabled, enable_plugins)
  258. #XXXQTimer.singleShot(0, self.run) # start crawling when all events processed
  259. if gui: self.show()
  260. def set_proxy(self, proxy):
  261. self.page().networkAccessManager().setProxy(proxy)
  262. def current_url(self):
  263. """Return current URL
  264. """
  265. return str(self.url().toString())
  266. def current_html(self):
  267. """Return current rendered HTML
  268. """
  269. return unicode(self.page().mainFrame().toHtml())
  270. def get(self, url=None, script=None, num_retries=1, jquery=False):
  271. """Load given url in webkit and return html when loaded
  272. script is some javasript to exexute that will change the loaded page (eg form submission)
  273. num_retries is how many times to try downloading this URL or executing this script
  274. jquery is whether to inject JQuery into the document
  275. """
  276. t1 = time()
  277. self.base_url = self.base_url or url # set base URL if not set
  278. #html = self.cache.get(key, {}).get('value')
  279. #if html:
  280. # self.debug('Load cache ' + key)
  281. # self.setHtml(html, QUrl(self.base_url))
  282. #else:
  283. if 1:
  284. loop = QEventLoop()
  285. timer = QTimer()
  286. timer.setSingleShot(True)
  287. timer.timeout.connect(loop.quit)
  288. self.loadFinished.connect(loop.quit)
  289. if url:
  290. self.load(QUrl(url))
  291. elif script:
  292. self.js(script)
  293. timer.start(self.timeout * 1000)
  294. loop.exec_() # delay here until download finished or timeout
  295. if timer.isActive():
  296. # downloaded successfully
  297. timer.stop()
  298. parsed_html = self.current_html()
  299. #if key:
  300. # self.cache[key] = html
  301. self.wait(self.delay - (time() - t1))
  302. else:
  303. # didn't download in time
  304. if num_retries > 0:
  305. common.logger.debug('Timeout - retrying')
  306. parsed_html = self.get(url, script=script, num_retries=num_retries-1, jquery=jquery)
  307. else:
  308. common.logger.debug('Timed out')
  309. parsed_html = ''
  310. return parsed_html
  311. def wait(self, secs=1):
  312. """Wait for delay time
  313. """
  314. deadline = time() + secs
  315. while time() < deadline:
  316. sleep(0)
  317. self.app.processEvents()
  318. #print 'wait', wait_secs
  319. # randomize the delay so less suspicious
  320. #wait_secs += 0.5 * self.delay * (random.random() - 0.5)
  321. #time.sleep(max(0, wait_secs))
  322. def jsget(self, script, num_retries=1, jquery=True):
  323. """Execute JavaScript that will cause page submission, and wait for page to load
  324. """
  325. return self.get(script=script, num_retries=num_retries, jquery=jquery)
  326. def js(self, script):
  327. """Shortcut to execute javascript on current document and return result
  328. """
  329. self.app.processEvents()
  330. return qstring_to_unicode(self.page().mainFrame().evaluateJavaScript(script).toString())
  331. def inject_jquery(self):
  332. """Inject jquery library into this webpage for easier manipulation
  333. """
  334. if self.jquery_lib is None:
  335. url = 'http://ajax.googleapis.com/ajax/libs/jquery/1/jquery.min.js'
  336. self.jquery_lib = urllib2.urlopen(url).read()
  337. self.js(self.jquery_lib)
  338. def click(self, pattern='input'):
  339. """Click all elements that match the pattern
  340. uses standard CSS pattern matching: http://www.w3.org/TR/CSS2/selector.html
  341. """
  342. for e in self.page().mainFrame().findAllElements(pattern):
  343. e.evaluateJavaScript("var evObj = document.createEvent('MouseEvents'); evObj.initEvent('click', true, true); this.dispatchEvent(evObj);")
  344. def attr(self, pattern, name, value=None):
  345. """Set attribute if value is defined, else get
  346. """
  347. if value is None:
  348. # want to get attribute
  349. return str(self.page().mainFrame().findFirstElement(pattern).attribute(name))
  350. else:
  351. for e in self.page().mainFrame().findAllElements(pattern):
  352. e.setAttribute(name, value)
  353. def fill(self, pattern, value):
  354. """Set text of these elements to value
  355. """
  356. for e in self.page().mainFrame().findAllElements(pattern):
  357. tag = str(e.tagName()).lower()
  358. if tag == 'input':
  359. #e.setAttribute('value', value)
  360. e.evaluateJavaScript('this.value = "%s"' % value)
  361. else:
  362. e.setPlainText(value)
  363. def find(self, pattern):
  364. """Returns whether element matching xpath pattern exists
  365. """
  366. return self.page().mainFrame().findAllElements(pattern)
  367. def data(self, url):
  368. """Get data for this downloaded resource, if exists
  369. """
  370. record = self.page().networkAccessManager().cache().data(QUrl(url))
  371. if record:
  372. data = record.readAll()
  373. record.reset()
  374. else:
  375. data = None
  376. return data
  377. def run(self):
  378. """Run the Qt event loop so can interact with the browser
  379. """
  380. self.app.exec_() # start GUI thread
  381. def finished(self, reply):
  382. """Override this method in subclasses to process downloaded urls
  383. """
  384. pass
  385. #print reply.url().toString(), ':', len(reply.data)
  386. def screenshot(self, output_file):
  387. """Take screenshot of current webpage and save results
  388. """
  389. frame = self.page().mainFrame()
  390. image = QImage(self.page().viewportSize(), QImage.Format_ARGB32)
  391. painter = QPainter(image)
  392. frame.render(painter)
  393. painter.end()
  394. common.logger.debug('saving', output_file)
  395. image.save(output_file)
  396. def closeEvent(self, event):
  397. """Catch the close window event and stop the script
  398. """
  399. sys.exit(self.app.quit())
  400. if __name__ == '__main__':
  401. # initiate webkit and show gui
  402. # once script is working you can disable the gui
  403. w = WebkitBrowser(gui=True)
  404. # load webpage
  405. w.get('http://duckduckgo.com')
  406. # fill search textbox
  407. w.fill('input[id=search_form_input_homepage]', 'sitescraper')
  408. # take screenshot of webpage
  409. w.screenshot('duckduckgo.jpg')
  410. # click search button
  411. w.click('input[id=search_button_homepage]')
  412. # show webpage for 10 seconds
  413. w.wait(10)