PageRenderTime 26ms CodeModel.GetById 25ms RepoModel.GetById 1ms app.codeStats 0ms

/lib/phpQuery/plugins/WebBrowser.php

https://github.com/wzs/Scrappr
PHP | 405 lines | 279 code | 2 blank | 124 comment | 71 complexity | a1460314cc669aef13ca1b3a4653a2b4 MD5 | raw file
  1. <?php
  2. /**
  3. * WebBrowser plugin.
  4. *
  5. */
  6. class phpQueryObjectPlugin_WebBrowser {
  7. /**
  8. * Limit binded methods to specified ones.
  9. *
  10. * @var array
  11. */
  12. public static $phpQueryMethods = null;
  13. /**
  14. * Enter description here...
  15. *
  16. * @param phpQueryObject $self
  17. * @todo support 'reset' event
  18. */
  19. public static function WebBrowser($self, $callback = null, $location = null) {
  20. $self = $self->_clone()->toRoot();
  21. $location = $location
  22. ? $location
  23. // TODO use document.location
  24. : $self->document->xhr->getUri(true);
  25. // FIXME tmp
  26. $self->document->WebBrowserCallback = $callback;
  27. if (! $location)
  28. throw new Exception('Location needed to activate WebBrowser plugin !');
  29. else {
  30. $self->bind('click', array($location, $callback), array('phpQueryPlugin_WebBrowser', 'hadleClick'));
  31. $self->bind('submit', array($location, $callback), array('phpQueryPlugin_WebBrowser', 'handleSubmit'));
  32. }
  33. }
  34. public static function browser($self, $callback = null, $location = null) {
  35. return $self->WebBrowser($callback, $location);
  36. }
  37. public static function downloadTo($self, $dir = null, $filename = null) {
  38. $url = null;
  39. if ($self->is('a[href]'))
  40. $url = $self->attr('href');
  41. else if ($self->find('a')->length)
  42. $url = $self->find('a')->attr('href');
  43. if ($url) {
  44. $url = resolve_url($self->document->location, $url);
  45. if (! $dir)
  46. $dir = getcwd();
  47. // TODO resolv name from response headers
  48. if (! $filename) {
  49. $matches = null;
  50. preg_match('@/([^/]+)$@', $url, $matches);
  51. $filename = $matches[1];
  52. }
  53. //print $url;
  54. $path = rtrim($dir, '/').'/'.$filename;
  55. phpQuery::debug("Requesting download of $url\n");
  56. // TODO use AJAX instead of file_get_contents
  57. file_put_contents($path, file_get_contents($url));
  58. }
  59. return $self;
  60. }
  61. /**
  62. * Method changing browser location.
  63. * Fires callback registered with WebBrowser(), if any.
  64. * @param $self
  65. * @param $url
  66. * @return unknown_type
  67. */
  68. public static function location($self, $url = null) {
  69. // TODO if ! $url return actual location ???
  70. $xhr = isset($self->document->xhr)
  71. ? $self->document->xhr
  72. : null;
  73. $xhr = phpQuery::ajax(array(
  74. 'url' => $url,
  75. ), $xhr);
  76. $return = false;
  77. if ($xhr->getLastResponse()->isSuccessful()) {
  78. $return = phpQueryPlugin_WebBrowser::browserReceive($xhr);
  79. if (isset($self->document->WebBrowserCallback))
  80. phpQuery::callbackRun(
  81. $self->document->WebBrowserCallback,
  82. array($return)
  83. );
  84. }
  85. return $return;
  86. }
  87. }
  88. class phpQueryPlugin_WebBrowser {
  89. /**
  90. *
  91. * @param $url
  92. * @param $callback
  93. * @param $param1
  94. * @param $param2
  95. * @param $param3
  96. * @return Zend_Http_Client
  97. */
  98. public static function browserGet($url, $callback,
  99. $param1 = null, $param2 = null, $param3 = null) {
  100. phpQuery::debug("[WebBrowser] GET: $url");
  101. self::authorizeHost($url);
  102. $xhr = phpQuery::ajax(array(
  103. 'type' => 'GET',
  104. 'url' => $url,
  105. 'dataType' => 'html',
  106. ));
  107. $paramStructure = null;
  108. if (func_num_args() > 2) {
  109. $paramStructure = func_get_args();
  110. $paramStructure = array_slice($paramStructure, 2);
  111. }
  112. if ($xhr->getLastResponse()->isSuccessful()) {
  113. phpQuery::callbackRun($callback,
  114. array(self::browserReceive($xhr)->WebBrowser()),
  115. $paramStructure
  116. );
  117. // phpQuery::callbackRun($callback, array(
  118. // self::browserReceive($xhr)//->WebBrowser($callback)
  119. // ));
  120. return $xhr;
  121. } else {
  122. throw new Exception("[WebBrowser] GET request failed; url: $url");
  123. return false;
  124. }
  125. }
  126. /**
  127. *
  128. * @param $url
  129. * @param $data
  130. * @param $callback
  131. * @param $param1
  132. * @param $param2
  133. * @param $param3
  134. * @return Zend_Http_Client
  135. */
  136. public static function browserPost($url, $data, $callback,
  137. $param1 = null, $param2 = null, $param3 = null) {
  138. self::authorizeHost($url);
  139. $xhr = phpQuery::ajax(array(
  140. 'type' => 'POST',
  141. 'url' => $url,
  142. 'dataType' => 'html',
  143. 'data' => $data,
  144. ));
  145. $paramStructure = null;
  146. if (func_num_args() > 3) {
  147. $paramStructure = func_get_args();
  148. $paramStructure = array_slice($paramStructure, 3);
  149. }
  150. if ($xhr->getLastResponse()->isSuccessful()) {
  151. phpQuery::callbackRun($callback,
  152. array(self::browserReceive($xhr)->WebBrowser()),
  153. $paramStructure
  154. );
  155. // phpQuery::callbackRun($callback, array(
  156. // self::browserReceive($xhr)//->WebBrowser($callback)
  157. // ));
  158. return $xhr;
  159. } else
  160. return false;
  161. }
  162. /**
  163. *
  164. * @param $ajaxSettings
  165. * @param $callback
  166. * @param $param1
  167. * @param $param2
  168. * @param $param3
  169. * @return Zend_Http_Client
  170. */
  171. public static function browser($ajaxSettings, $callback,
  172. $param1 = null, $param2 = null, $param3 = null) {
  173. self::authorizeHost($ajaxSettings['url']);
  174. $xhr = phpQuery::ajax(
  175. self::ajaxSettingsPrepare($ajaxSettings)
  176. );
  177. $paramStructure = null;
  178. if (func_num_args() > 2) {
  179. $paramStructure = func_get_args();
  180. $paramStructure = array_slice($paramStructure, 2);
  181. }
  182. if ($xhr->getLastResponse()->isSuccessful()) {
  183. phpQuery::callbackRun($callback,
  184. array(self::browserReceive($xhr)->WebBrowser()),
  185. $paramStructure
  186. );
  187. // phpQuery::callbackRun($callback, array(
  188. // self::browserReceive($xhr)//->WebBrowser($callback)
  189. // ));
  190. return $xhr;
  191. } else
  192. return false;
  193. }
  194. protected static function authorizeHost($url) {
  195. $host = parse_url($url, PHP_URL_HOST);
  196. if ($host)
  197. phpQuery::ajaxAllowHost($host);
  198. }
  199. protected static function ajaxSettingsPrepare($settings) {
  200. unset($settings['success']);
  201. unset($settings['error']);
  202. return $settings;
  203. }
  204. /**
  205. * @param Zend_Http_Client $xhr
  206. */
  207. public static function browserReceive($xhr) {
  208. phpQuery::debug("[WebBrowser] Received from ".$xhr->getUri(true));
  209. // TODO handle meta redirects
  210. $body = $xhr->getLastResponse()->getBody();
  211. // XXX error ???
  212. if (strpos($body, '<!doctype html>') !== false) {
  213. $body = '<html>'
  214. .str_replace('<!doctype html>', '', $body)
  215. .'</html>';
  216. }
  217. $pq = phpQuery::newDocument($body);
  218. $pq->document->xhr = $xhr;
  219. $pq->document->location = $xhr->getUri(true);
  220. $refresh = $pq->find('meta[http-equiv=refresh]')
  221. ->add('meta[http-equiv=Refresh]');
  222. if ($refresh->size()) {
  223. // print htmlspecialchars(var_export($xhr->getCookieJar()->getAllCookies(), true));
  224. // print htmlspecialchars(var_export($xhr->getLastResponse()->getHeader('Set-Cookie'), true));
  225. phpQuery::debug("Meta redirect... '{$refresh->attr('content')}'\n");
  226. // there is a refresh, so get the new url
  227. $content = $refresh->attr('content');
  228. $urlRefresh = substr($content, strpos($content, '=')+1);
  229. $urlRefresh = trim($urlRefresh, '\'"');
  230. // XXX not secure ?!
  231. phpQuery::ajaxAllowURL($urlRefresh);
  232. // $urlRefresh = urldecode($urlRefresh);
  233. // make ajax call, passing last $xhr object to preserve important stuff
  234. $xhr = phpQuery::ajax(array(
  235. 'type' => 'GET',
  236. 'url' => $urlRefresh,
  237. 'dataType' => 'html',
  238. ), $xhr);
  239. if ($xhr->getLastResponse()->isSuccessful()) {
  240. // if all is ok, repeat this method...
  241. return call_user_func_array(
  242. array('phpQueryPlugin_WebBrowser', 'browserReceive'), array($xhr)
  243. );
  244. }
  245. } else
  246. return $pq;
  247. }
  248. /**
  249. *
  250. * @param $e
  251. * @param $callback
  252. * @return unknown_type
  253. */
  254. public static function hadleClick($e, $callback = null) {
  255. $node = phpQuery::pq($e->target);
  256. $type = null;
  257. if ($node->is('a[href]')) {
  258. // TODO document.location
  259. $xhr = isset($node->document->xhr)
  260. ? $node->document->xhr
  261. : null;
  262. $xhr = phpQuery::ajax(array(
  263. 'url' => resolve_url($e->data[0], $node->attr('href')),
  264. 'referer' => $node->document->location,
  265. ), $xhr);
  266. if ((! $callback || !($callback instanceof Callback)) && $e->data[1])
  267. $callback = $e->data[1];
  268. if ($xhr->getLastResponse()->isSuccessful() && $callback)
  269. phpQuery::callbackRun($callback, array(
  270. self::browserReceive($xhr)
  271. ));
  272. } else if ($node->is(':submit') && $node->parents('form')->size())
  273. $node->parents('form')->trigger('submit', array($e));
  274. }
  275. /**
  276. * Enter description here...
  277. *
  278. * @param unknown_type $e
  279. * @TODO trigger submit for form after form's submit button has a click event
  280. */
  281. public static function handleSubmit($e, $callback = null) {
  282. $node = phpQuery::pq($e->target);
  283. if (!$node->is('form') || !$node->is('[action]'))
  284. return;
  285. // TODO document.location
  286. $xhr = isset($node->document->xhr)
  287. ? $node->document->xhr
  288. : null;
  289. $submit = pq($e->relatedTarget)->is(':submit')
  290. ? $e->relatedTarget
  291. // will this work ?
  292. // : $node->find(':submit:first')->get(0);
  293. : $node->find('*:submit:first')->get(0);
  294. $data = array();
  295. foreach($node->serializeArray($submit) as $r)
  296. // XXXt.c maybe $node->not(':submit')->add($sumit) would be better ?
  297. // foreach($node->serializeArray($submit) as $r)
  298. $data[ $r['name'] ] = $r['value'];
  299. $options = array(
  300. 'type' => $node->attr('method')
  301. ? $node->attr('method')
  302. : 'GET',
  303. 'url' => resolve_url($e->data[0], $node->attr('action')),
  304. 'data' => $data,
  305. 'referer' => $node->document->location,
  306. // 'success' => $e->data[1],
  307. );
  308. if ($node->attr('enctype'))
  309. $options['contentType'] = $node->attr('enctype');
  310. $xhr = phpQuery::ajax($options, $xhr);
  311. if ((! $callback || !($callback instanceof Callback)) && $e->data[1])
  312. $callback = $e->data[1];
  313. if ($xhr->getLastResponse()->isSuccessful() && $callback)
  314. phpQuery::callbackRun($callback, array(
  315. self::browserReceive($xhr)
  316. ));
  317. }
  318. }
  319. /**
  320. *
  321. * @param unknown_type $parsed
  322. * @return unknown
  323. * @link http://www.php.net/manual/en/function.parse-url.php
  324. * @author stevenlewis at hotmail dot com
  325. */
  326. function glue_url($parsed)
  327. {
  328. if (! is_array($parsed)) return false;
  329. $uri = isset($parsed['scheme']) ? $parsed['scheme'].':'.((strtolower($parsed['scheme']) == 'mailto') ? '':'//'): '';
  330. $uri .= isset($parsed['user']) ? $parsed['user'].($parsed['pass']? ':'.$parsed['pass']:'').'@':'';
  331. $uri .= isset($parsed['host']) ? $parsed['host'] : '';
  332. $uri .= isset($parsed['port']) ? ':'.$parsed['port'] : '';
  333. if(isset($parsed['path']))
  334. {
  335. $uri .= (substr($parsed['path'],0,1) == '/')?$parsed['path']:'/'.$parsed['path'];
  336. }
  337. $uri .= isset($parsed['query']) ? '?'.$parsed['query'] : '';
  338. $uri .= isset($parsed['fragment']) ? '#'.$parsed['fragment'] : '';
  339. return $uri;
  340. }
  341. /**
  342. * Enter description here...
  343. *
  344. * @param unknown_type $base
  345. * @param unknown_type $url
  346. * @return unknown
  347. * @author adrian-php at sixfingeredman dot net
  348. */
  349. function resolve_url($base, $url) {
  350. if (!strlen($base)) return $url;
  351. // Step 2
  352. if (!strlen($url)) return $base;
  353. // Step 3
  354. if (preg_match('!^[a-z]+:!i', $url)) return $url;
  355. $base = parse_url($base);
  356. if ($url{0} == "#") {
  357. // Step 2 (fragment)
  358. $base['fragment'] = substr($url, 1);
  359. return unparse_url($base);
  360. }
  361. unset($base['fragment']);
  362. unset($base['query']);
  363. if (substr($url, 0, 2) == "//") {
  364. // Step 4
  365. return unparse_url(array(
  366. 'scheme'=>$base['scheme'],
  367. 'path'=>substr($url,2),
  368. ));
  369. } else if ($url{0} == "/") {
  370. // Step 5
  371. $base['path'] = $url;
  372. } else {
  373. // Step 6
  374. $path = explode('/', $base['path']);
  375. $url_path = explode('/', $url);
  376. // Step 6a: drop file from base
  377. array_pop($path);
  378. // Step 6b, 6c, 6e: append url while removing "." and ".." from
  379. // the directory portion
  380. $end = array_pop($url_path);
  381. foreach ($url_path as $segment) {
  382. if ($segment == '.') {
  383. // skip
  384. } else if ($segment == '..' && $path && $path[sizeof($path)-1] != '..') {
  385. array_pop($path);
  386. } else {
  387. $path[] = $segment;
  388. }
  389. }
  390. // Step 6d, 6f: remove "." and ".." from file portion
  391. if ($end == '.') {
  392. $path[] = '';
  393. } else if ($end == '..' && $path && $path[sizeof($path)-1] != '..') {
  394. $path[sizeof($path)-1] = '';
  395. } else {
  396. $path[] = $end;
  397. }
  398. // Step 6h
  399. $base['path'] = join('/', $path);
  400. }
  401. // Step 7
  402. return glue_url($base);
  403. }