/application/controllers/ScrapeController.php
PHP | 2654 lines | 2298 code | 304 blank | 52 comment | 301 complexity | f30d8d6021e73b335616cb04c791d2d4 MD5 | raw file
Large files are truncated, but you can click here to view the full file
- <?php
- include_once APPLICATION_PATH . '/controllers/SnaapiController.php';
- class ScrapeController extends SnaapiController {
- private $_pages_scraped;
- const MAX_PAGES_TO_SCRAPE = 1;
/**
 * Runs the parent controller's init() and, when the 'env' invoke argument
 * is 'development', tells the view renderer to render the 'index' script.
 */
public function init() {
    parent::init();
    $is_development = ( 'development' == $this->getInvokeArg('env') );
    if( $is_development ) {
        $this->_helper->viewRenderer->setRender('index');
    }
}
/**
 * Development-only action that runs the PHP scrapers.
 *
 * When the 'env' invoke argument is not 'development' the request is
 * handed to _forward('error', 'error') and nothing is scraped. Otherwise
 * the result buffer and page counter are reset before
 * scrapePHPHierarchies() and scrapePHPFunctions() are called in that order.
 */
public function phpAction() {
    if( 'development' != $this->getInvokeArg('env') ) {
        $this->_forward('error', 'error');
        return;
    }
    $this->view->results = '';
    $this->_pages_scraped = 0;
    // Disabled one-off snippet, kept for reference: rewrites " ," to ","
    // in the `data` column of the rows selected below.
    /*$model = $this->getFunctionsModel();
    $db = $model->getTable()->getAdapter();
    $sql = "SELECT * FROM `functions` WHERE `data` LIKE '% ,%'";
    foreach( $db->query($sql)->fetchAll() as $result ) {
        $result['data'] = str_replace(" ,", ',', $result['data']);
        $this->getFunctionsModel()->setData(array(
            'category' => $result['category'],
            'id' => $result['id'],
            'data' => $result['data']
        ));
    }*/
    $this->scrapePHPHierarchies();
    $this->scrapePHPFunctions();
}
/**
 * Development-only action that runs scrapePythonModules(true).
 *
 * Outside the 'development' environment the request is handed to
 * _forward('error', 'error') instead.
 */
public function pythonAction() {
    if( 'development' != $this->getInvokeArg('env') ) {
        $this->_forward('error', 'error');
        return;
    }
    // Start from an empty result buffer and a zeroed page counter.
    $this->view->results = '';
    $this->_pages_scraped = 0;
    $this->scrapePythonModules(true);
}
/**
 * Development-only action that runs scrapeCSSFunctions().
 *
 * Outside the 'development' environment the request is handed to
 * _forward('error', 'error') instead.
 */
public function cssAction() {
    if( 'development' != $this->getInvokeArg('env') ) {
        $this->_forward('error', 'error');
        return;
    }
    // Start from an empty result buffer and a zeroed page counter.
    $this->view->results = '';
    $this->_pages_scraped = 0;
    $this->scrapeCSSFunctions();
}
/**
 * Development-only action that runs scrapeZend().
 *
 * Outside the 'development' environment the request is handed to
 * _forward('error', 'error') instead.
 */
public function zendAction() {
    if( 'development' != $this->getInvokeArg('env') ) {
        $this->_forward('error', 'error');
        return;
    }
    // Start from an empty result buffer and a zeroed page counter.
    $this->view->results = '';
    $this->_pages_scraped = 0;
    $this->scrapeZend();
}
/**
 * Development-only action that runs scrapeFacebookFbmlPhase2().
 *
 * Outside the 'development' environment the request is handed to
 * _forward('error', 'error') instead.
 */
public function fbAction() {
    if( 'development' != $this->getInvokeArg('env') ) {
        $this->_forward('error', 'error');
        return;
    }
    $this->view->results = '';
    $this->_pages_scraped = 0;
    // Other Facebook scrapers, currently switched off:
    //$this->scrapeFacebook();
    //$this->scrapeFacebookFbml();
    $this->scrapeFacebookFbmlPhase2();
}
/**
 * Development-only action that runs scrapeDjango2().
 *
 * Outside the 'development' environment the request is handed to
 * _forward('error', 'error') instead.
 */
public function djangoAction() {
    if( 'development' != $this->getInvokeArg('env') ) {
        $this->_forward('error', 'error');
        return;
    }
    $this->view->results = '';
    $this->_pages_scraped = 0;
    // Switched off:
    //$this->scrapeDjango1();
    $this->scrapeDjango2();
}
/**
 * Development-only action that runs scrapeiPhone().
 *
 * Outside the 'development' environment the request is handed to
 * _forward('error', 'error') instead.
 */
public function iphoneAction() {
    if( 'development' != $this->getInvokeArg('env') ) {
        $this->_forward('error', 'error');
        return;
    }
    $this->view->results = '';
    $this->_pages_scraped = 0;
    $this->scrapeiPhone();
    // Switched off:
    //$this->scrapeiPhoneDir();
}
/**
 * Development-only action that runs scrapeJavascript().
 *
 * Outside the 'development' environment the request is handed to
 * _forward('error', 'error') instead.
 */
public function jsAction() {
    if( 'development' != $this->getInvokeArg('env') ) {
        $this->_forward('error', 'error');
        return;
    }
    // Start from an empty result buffer and a zeroed page counter.
    $this->view->results = '';
    $this->_pages_scraped = 0;
    $this->scrapeJavascript();
}
/**
 * Development-only action that runs scrapejQuery2().
 *
 * Outside the 'development' environment the request is handed to
 * _forward('error', 'error') instead.
 */
public function jqueryAction() {
    if( 'development' != $this->getInvokeArg('env') ) {
        $this->_forward('error', 'error');
        return;
    }
    $this->view->results = '';
    $this->_pages_scraped = 0;
    // Switched off:
    //$this->scrapejQuery();
    $this->scrapejQuery2();
}
/**
 * Development-only action that runs scrapeAndroidFunctions().
 *
 * Outside the 'development' environment the request is handed to
 * _forward('error', 'error') instead.
 */
public function androidAction() {
    if( 'development' != $this->getInvokeArg('env') ) {
        $this->_forward('error', 'error');
        return;
    }
    $this->view->results = '';
    $this->_pages_scraped = 0;

    // Other Android scraping steps, currently switched off:
    //$this->scrapeAndroidPackageList();
    //$this->scrapeAndroidPackages(2);
    $this->scrapeAndroidFunctions();
}
/**
 * Development-only action that runs scrapeMootoolsFunctions().
 *
 * Outside the 'development' environment the request is handed to
 * _forward('error', 'error') instead.
 */
public function mootoolsAction() {
    if( 'development' != $this->getInvokeArg('env') ) {
        $this->_forward('error', 'error');
        return;
    }
    // Start from an empty result buffer and a zeroed page counter.
    $this->view->results = '';
    $this->_pages_scraped = 0;
    $this->scrapeMootoolsFunctions();
}
/**
 * Development-only action that runs scrapeClojureFunctions().
 *
 * Outside the 'development' environment the request is handed to
 * _forward('error', 'error') instead.
 */
public function clojureAction() {
    if( 'development' != $this->getInvokeArg('env') ) {
        $this->_forward('error', 'error');
        return;
    }
    $this->view->results = '';
    $this->_pages_scraped = 0;
    // Switched off:
    //$this->scrapeClojureHierarchies();
    $this->scrapeClojureFunctions();
}
/**
 * Reads the locally stored Clojure API page and records its functions.
 *
 * The page is split on '<h2 id="' into hierarchy sections and each section
 * on '<hr>' into function chunks. Every '<h3 id="...">' heading inside a
 * chunk becomes one entry sharing the chunk's description; each entry is
 * appended to the view's result buffer and passed to the functions model's
 * insertOrUpdateFunction().
 *
 * Sections without a heading, sections whose hierarchy cannot be looked
 * up, and chunks without a heading or description are reported in the
 * result buffer and skipped.
 */
private function scrapeClojureFunctions() {
    $category = 'Clojure';
    $category_id = $this->getCategoriesModel()->fetchCategoryByName($category);
    if( !$category_id ) {
        $this->invalid_category($category);
        return;
    }
    $source_file = APPLICATION_PATH . '/scraper/clojure/api.html';
    $contents = file_get_contents($source_file);
    if( false === $contents ) {
        // Report an unreadable file instead of silently scraping nothing.
        $this->view->results .= 'Couldn\'t read ' . $source_file . ', skipping...' . "\n";
        return;
    }
    // The text before the first '<h2 id="' is page preamble, hence the slice.
    $hierarchies = array_slice(explode('<h2 id="', $contents), 1);
    foreach( $hierarchies as $hierarchy ) {
        if( !preg_match('/(.+?)">(.+?)<\/h2>/', $hierarchy, $heading) ) {
            $this->view->results .= 'No name found, skipping...' . "\n";
            continue;
        }
        $hierarchy_name = trim($heading[2]);
        $sub_id = $this->getHierarchiesModel()->fetchByName($category_id, 1, $hierarchy_name);
        if( !$sub_id ) {
            // Same falsy check process_section() applies to fetchByName():
            // without a hierarchy id the functions would be stored unattached.
            $this->view->results .= $hierarchy_name . "\n";
            $this->view->results .= 'Couldn\'t find any parent hierarchy, skipping...' . "\n";
            continue;
        }
        // The text before the first '<hr>' is the section header, not a function.
        $functions = array_slice(explode('<hr>', $hierarchy), 1);
        foreach( $functions as $function ) {
            if( !preg_match_all('/<h3 id="(.+?)">(.+?)<\/h3>/', $function, $matches) ) {
                $this->view->results .= 'No function info found, skipping...' . "\n";
                $this->view->results .= $function . "\n\n";
                continue;
            }
            // Newlines are flattened so the description can span several lines.
            if( !preg_match('/(?:.+<\/h3>) (.+?)<br>/', str_replace("\n", ' ', $function), $desc_matches) ) {
                $this->view->results .= 'No desc found, skipping...' . "\n";
                $this->view->results .= $function . "\n\n";
                continue;
            }

            $desc = trim(strip_tags($desc_matches[1]));

            foreach( $matches[1] as $index => $anchor ) {
                $url = 'http://clojure.org/api#'.$anchor;
                // Decode the ampersand entity left in names after strip_tags().
                $name = trim(str_replace('&amp;', '&', strip_tags($matches[2][$index])));
                $this->view->results .= $sub_id ."\n";
                $this->view->results .= $name ."\n";
                $this->view->results .= $url ."\n";
                $this->view->results .= $desc ."\n\n";
                $this->getFunctionsModel()->insertOrUpdateFunction(array(
                    'category' => $category_id,
                    'hierarchy' => $sub_id,
                    'name' => $name,
                    'url' => $url,
                    'short_description' => $desc,
                    'scrapeable' => 0
                ));
            }
        }
    }
}
/**
 * Reads the locally stored Clojure API page and inserts one hierarchy per
 * '<h2 id="...">' heading through the hierarchies model's insert().
 *
 * Each insert() result is appended to the view's result buffer; sections
 * whose heading cannot be parsed are reported and skipped.
 */
private function scrapeClojureHierarchies() {
    $category = 'Clojure';
    $category_id = $this->getCategoriesModel()->fetchCategoryByName($category);
    if( !$category_id ) {
        $this->invalid_category($category);
        return;
    }
    $page = file_get_contents(APPLICATION_PATH . '/scraper/clojure/api.html');
    $sections = explode('<h2 id="', $page);
    // Drop the text that precedes the first heading.
    array_shift($sections);
    foreach( $sections as $section ) {
        if( preg_match('/(.+?)">(.+?)<\/h2>/', $section, $heading) ) {
            $anchor = $heading[1];
            $title = trim($heading[2]);
            $link = 'http://clojure.org/api#'.$anchor;
            $inserted = $this->getHierarchiesModel()->insert($category_id, 1, $title, $link, 0);
            $this->view->results .= $inserted."\n";
        } else {
            $this->view->results .= 'No name found, skipping...' . "\n";
        }
    }
}
/**
 * Downloads the source page of every scrapeable 'mootools' hierarchy and
 * records what it finds through the functions model.
 *
 * A page is first searched for '<h2 id="...">' member headings
 * (Function/Method/Property/Selector/Event). If there are none, it is
 * searched for '<li>name - (<em>type</em>) description</li>' feature items
 * instead. After a page has yielded entries, touch() is called on the
 * hierarchies model for that hierarchy (presumably marking it as scraped;
 * confirm against the model). Progress and skip reasons are appended to
 * the view's result buffer.
 */
private function scrapeMootoolsFunctions() {
    $category = 'mootools';
    $category_id = $this->getCategoriesModel()->fetchCategoryByName($category);
    if( !$category_id ) {
        $this->invalid_category($category);
        return;
    }
    $scrapeable = $this->getHierarchiesModel()->fetchAllScrapeable($category_id);
    if( empty($scrapeable) ) {
        $this->nothing_to_scrape($category);
        return;
    }
    foreach( $scrapeable as $hierarchy ) {
        $this->view->results .= $hierarchy['name'] . "\n";
        $source_url = $hierarchy['source_url'];
        if( !$source_url ) {
            $this->view->results .= 'No source URL specified, skipping...' . "\n";
            continue;
        }
        $this->view->results .= '<a href="'.$source_url.'">'.$source_url."</a>\n";
        $page = file_get_contents($source_url);
        // Everything from the first member heading onwards.
        $data = substr($page, strpos($page, '<h2 id="'));
        $has_members = preg_match_all('/<h2 id=".+?"(?: class="description")?><a href="(.+?)">(?:(?:.+? )?(?:Function|Method|Property|Selector|Event)): (.+?)<\/a><\/h2>/', $data, $headings );
        if( $has_members ) {
            // One chunk per heading; the first piece precedes any heading.
            $chunks = array_slice(explode('<h2 id="', $data), 1);
            foreach( $chunks as $chunk ) {
                $desc = '';
                if( preg_match('/<p class="description">(.+?)<\/p>/', str_replace("\n", ' ', $chunk), $desc_match) ) {
                    $desc = $desc_match[1];
                }
                if( !preg_match('/.+?"(?: class="description")?><a href="(.+?)">(?:(?:.+? )?(?:Function|Method|Property|Selector|Event)): (.+?)<\/a><\/h2>/', $chunk, $head ) ) {
                    $this->view->results .= 'Couldn\'t find the function name, skipping...' . "\n";
                    continue;
                }
                $url = $source_url . $head[1];
                $name = trim($head[2]);
                // Members outside 'Core' get the hierarchy name as a prefix,
                // unless the name starts with '$'.
                if( $hierarchy['name'] != 'Core' && $name[0] != '$' ) {
                    $name = $hierarchy['name'].'.'.$name;
                }
                $desc = trim(strip_tags($desc));
                $this->view->results .= $name."\n";
                $this->view->results .= $url."\n";
                $this->view->results .= $desc."\n\n";
                $this->getFunctionsModel()->insertOrUpdateFunction(array(
                    'category' => $category_id,
                    'hierarchy' => $hierarchy['id'],
                    'name' => $name,
                    'url' => $url,
                    'short_description' => $desc,
                    'scrapeable' => 0
                ));
            }
        } else {
            $this->view->results .= 'No functions found, checking for features...' . "\n";
            if( !preg_match_all('/<li>(.+?) - \(<em>(.+?)<\/em>\) (.+?)<\/li>/', $page, $features) ) {
                $this->view->results .= 'No features found, skipping...' . "\n";
                continue;
            }
            foreach( $features[1] as $index => $feature_name ) {
                // Feature items have no anchor of their own, so they link to the page.
                $url = $source_url;
                $name = trim($feature_name);
                $desc = trim(strip_tags($features[3][$index]));
                $this->view->results .= $name."\n";
                $this->view->results .= $url."\n";
                $this->view->results .= $desc."\n\n";
                $this->getFunctionsModel()->insertOrUpdateFunction(array(
                    'category' => $category_id,
                    'hierarchy' => $hierarchy['id'],
                    'name' => $name,
                    'url' => $url,
                    'short_description' => $desc,
                    'scrapeable' => 0
                ));
            }
        }
        $this->getHierarchiesModel()->touch($category_id, $hierarchy['id']);
    }
}
/**
 * Downloads the reference page of every scrapeable 'android' hierarchy and
 * stores the class it describes as one function-model entry.
 *
 * The class name is taken from the 'jd-inheritance-class-cell' table cell
 * and the description from the text between '<h2>Class Overview</h2>' and
 * the next '</p>'. A page without a class name is reported and skipped; a
 * page without a description is still stored, with an empty description.
 * After storing, touch() is called on the hierarchies model for that
 * hierarchy. Progress is appended to the view's result buffer.
 */
private function scrapeAndroidFunctions() {
    $category = 'android';
    $category_id = $this->getCategoriesModel()->fetchCategoryByName($category);
    if( !$category_id ) {
        $this->invalid_category($category);
        return;
    }
    $scrapeable = $this->getHierarchiesModel()->fetchAllScrapeable($category_id);
    if( empty($scrapeable) ) {
        $this->nothing_to_scrape($category);
        return;
    }
    foreach( $scrapeable as $hierarchy ) {
        $this->view->results .= $hierarchy['name'] . "\n";
        if( !$hierarchy['source_url'] ) {
            $this->view->results .= 'No source URL specified, skipping...' . "\n";
            continue;
        }
        $source_url = $hierarchy['source_url'];
        $this->view->results .= '<a href="'.$source_url.'">'.$source_url."</a>\n";
        $contents = file_get_contents($source_url);
        if( !preg_match('/<td colspan="1" class="jd-inheritance-class-cell">(.+?)<\/td>/', $contents, $matches) ) {
            $this->view->results .= 'No name found, skipping...' . "\n";
            // Skip only this page; one unparsable page must not abort the
            // remaining hierarchies.
            continue;
        }
        $name = $matches[1];
        $this->view->results .= $name ."\n";
        $desc = '';
        $OVERVIEW_TXT = '<h2>Class Overview</h2>';
        $desc_start = strpos($contents, $OVERVIEW_TXT);
        if( false !== $desc_start ) {
            // The description is the text between the overview heading and
            // the end of the first paragraph after it.
            $desc_start += strlen($OVERVIEW_TXT);
            $desc_end = strpos($contents, '</p>', $desc_start);
            if( false !== $desc_end ) {
                $desc = trim(strip_tags(str_replace("\n", ' ', substr($contents, $desc_start, $desc_end - $desc_start))));
            }
        }
        if( $desc == '' ) {
            $this->view->results .= 'No description found...'."\n";
        } else {
            $this->view->results .= $desc ."\n";
        }
        $this->getFunctionsModel()->insertOrUpdateFunction(array(
            'category' => $category_id,
            'hierarchy' => $hierarchy['id'],
            'name' => $name,
            'url' => $source_url,
            'short_description' => $desc,
            'scrapeable' => 1
        ));
        $this->getHierarchiesModel()->touch($category_id, $hierarchy['id']);
    }
}
/**
 * Handles one named section ('<li><h2>NAME</h2>' ...) of an Android
 * package page.
 *
 * Mode 1 inserts the section itself as a hierarchy below $hierarchy,
 * pointing at $source_url. Mode 2 looks up that previously inserted
 * section hierarchy and inserts every linked member of the section below
 * it. Any other mode does nothing.
 *
 * @param string $links       Page markup to search.
 * @param string $name        Section title, e.g. 'Classes'.
 * @param int    $mode        1 or 2, see above.
 * @param mixed  $category_id Category id passed through to the models.
 * @param mixed  $hierarchy   Id of the package hierarchy owning the section.
 * @param string $source_url  URL of the package page.
 * @return bool false when a mode-2 section is present but cannot be
 *              processed (no end marker, no parent hierarchy, no member
 *              links); true otherwise, including when the section is absent.
 */
private function process_section($links, $name, $mode, $category_id, $hierarchy, $source_url) {
    $interface_start = strpos($links, '<li><h2>'.$name.'</h2>');
    if( $interface_start === false ) {
        // A package without this kind of member is not an error.
        return true;
    }
    if( $mode == 1 ) {
        $this->view->results .= $this->getHierarchiesModel()->insert($category_id, $hierarchy, $name, $source_url, 0)."\n";
        return true;
    }
    if( $mode != 2 ) {
        return true;
    }
    // NOTE(review): the end marker is ' </li>' with one leading space; the
    // original page indentation may have been wider. Confirm against a live page.
    $interface_end = strpos($links, ' </li>', $interface_start);
    if( $interface_end === false ) {
        // Without an end marker the length passed to substr() would be
        // negative and the slice would silently be wrong.
        $this->view->results .= 'Couldn\'t find the end of the '.$name.' section, skipping...' . "\n";
        return false;
    }
    $data = substr($links, $interface_start, $interface_end - $interface_start);
    $sub_id = $this->getHierarchiesModel()->fetchByName($category_id, $hierarchy, $name);
    if( !$sub_id ) {
        $this->view->results .= $hierarchy."\n";
        $this->view->results .= $name."\n";
        $this->view->results .= 'Couldn\'t find any parent hierarchy, skipping...' . "\n";
        return false;
    }
    // NOTE(review): '(?:<T>)?' may originally have matched the entity-encoded
    // form of a generic suffix; verify against the page markup.
    if( !preg_match_all('/<li><a href="(.+?)">(.+?)<\/a>(?:<T>)?<\/li>/', $data, $matches) ) {
        $this->view->results .= 'Couldn\'t find any members name, skipping...' . "\n";
        return false;
    }
    foreach( $matches[1] as $index => $href ) {
        $member_name = $matches[2][$index];
        $url = 'http://developer.android.com'.$href;
        $this->view->results .= $this->getHierarchiesModel()->insert($category_id, $sub_id, $member_name, $url, 1)."\n";
    }
    return true;
}
/**
 * Downloads the page of every scrapeable 'android' hierarchy and runs
 * process_section() for its Interfaces, Classes, Exceptions and Enums
 * sections, in that order, stopping at the first section that fails.
 *
 * In mode 2, a hierarchy whose sections all succeeded is passed to
 * touch() on the hierarchies model.
 *
 * @param int $mode Passed through to process_section().
 */
private function scrapeAndroidPackages($mode) {
    $category = 'android';
    $category_id = $this->getCategoriesModel()->fetchCategoryByName($category);
    if( !$category_id ) {
        $this->invalid_category($category);
        return;
    }
    $scrapeable = $this->getHierarchiesModel()->fetchAllScrapeable($category_id);
    if( empty($scrapeable) ) {
        $this->nothing_to_scrape($category);
        return;
    }
    $section_names = array('Interfaces', 'Classes', 'Exceptions', 'Enums');
    foreach( $scrapeable as $hierarchy ) {
        $source_url = $hierarchy['source_url'];
        if( !$source_url ) {
            $this->view->results .= 'No source URL specified, skipping...' . "\n";
            continue;
        }
        $page = file_get_contents($source_url);

        // Only the markup after the package navigation is searched.
        $links = substr($page, strpos($page, '</div> <!-- end resize-packages -->'));
        $succeeded = true;
        foreach( $section_names as $section_name ) {
            if( !$this->process_section($links, $section_name, $mode, $category_id, $hierarchy['id'], $source_url) ) {
                // Later sections are not attempted once one has failed.
                $succeeded = false;
                break;
            }
        }
        if( $succeeded && $mode == 2 ) {
            $this->getHierarchiesModel()->touch($category_id, $hierarchy['id']);
        }
    }
}
/**
 * Downloads the Android package index and inserts one hierarchy per link
 * found from the '<div id="packages-nav">' block onwards.
 *
 * Each insert() result is appended to the view's result buffer; a missing
 * navigation block or an empty link list is reported there instead.
 */
private function scrapeAndroidPackageList() {
    $category = 'android';
    $category_id = $this->getCategoriesModel()->fetchCategoryByName($category);
    if( !$category_id ) {
        $this->invalid_category($category);
        return;
    }
    $page = file_get_contents('http://developer.android.com/reference/packages.html');
    $nav_offset = strpos($page, '<div id="packages-nav">');
    if( $nav_offset === false ) {
        $this->view->results .= 'Couldn\'t find the packages navigation, skipping...' . "\n";
        return;
    }
    $found = preg_match_all('/<a href="(.+?)">(.+?)<\/a><\/li>/', substr($page, $nav_offset), $anchors);
    if( !$found ) {
        $this->view->results .= 'Couldn\'t find any links, skipping...' . "\n";
        return;
    }
    foreach( $anchors[1] as $position => $href ) {
        $package_name = $anchors[2][$position];
        $package_url = 'http://developer.android.com'.$href;
        $this->view->results .= $this->getHierarchiesModel()->insert($category_id, 1, $package_name, $package_url, 1)."\n";
    }
}
/**
 * Downloads the page of every scrapeable 'jQuery' hierarchy and records
 * its options, events and methods, in that order.
 *
 * Each kind is handled by scrapejQuery2Section(). As soon as one kind
 * cannot be located on a page, the remaining kinds of that page are
 * skipped and the next hierarchy is processed. Progress and skip reasons
 * are appended to the view's result buffer.
 */
private function scrapejQuery2() {
    $category = 'jQuery';
    $category_id = $this->getCategoriesModel()->fetchCategoryByName($category);
    if( !$category_id ) {
        $this->invalid_category($category);
        return;
    }
    $scrapeable = $this->getHierarchiesModel()->fetchAllScrapeable($category_id);
    if( empty($scrapeable) ) {
        $this->nothing_to_scrape($category);
        return;
    }
    // Set to false for a dry run that only fills the result buffer.
    $is_saving = true;
    // Kind => text placed before / after the member name when building the
    // stored name: "widget option", "widget event", "widget('method')".
    $kinds = array(
        'option' => array(' ', ''),
        'event' => array(' ', ''),
        'method' => array('(\'', '\')')
    );
    foreach( $scrapeable as $hierarchy ) {
        $this->view->results .= $hierarchy['name'] . "\n";
        if( !$hierarchy['source_url'] ) {
            $this->view->results .= 'No source URL specified, skipping...' . "\n";
            continue;
        }
        $source_url = $hierarchy['source_url'];
        $this->view->results .= '<a href="'.$source_url.'">'.$source_url."</a>\n";
        $contents = file_get_contents($source_url);
        if( false === $contents ) {
            $this->view->results .= 'Couldn\'t download the page, skipping...' . "\n";
            continue;
        }
        foreach( $kinds as $kind => $wrap ) {
            $found = $this->scrapejQuery2Section($contents, $kind, $wrap[0], $wrap[1], $category_id, $hierarchy, $source_url, $is_saving);
            if( !$found ) {
                // A missing section ends the work on this page.
                break;
            }
        }
    }
}

/**
 * Scrapes one '<div id="KINDs">' block of a jQuery page.
 *
 * The block runs from its opening div to the next '<div id="'. It is split
 * on '<li class="KIND"', and every item with a '<h3 class="KIND-name">'
 * link followed by a paragraph is reported and, when $is_saving is true,
 * stored through the functions model, followed by touch() on the
 * hierarchies model.
 *
 * @param string $contents    Page markup.
 * @param string $kind        'option', 'event' or 'method'.
 * @param string $name_prefix Text between the widget name and the member name.
 * @param string $name_suffix Text appended after the member name.
 * @param mixed  $category_id Category id passed through to the models.
 * @param array  $hierarchy   Hierarchy row; 'id' and 'name' are read.
 * @param string $source_url  Page URL, prefixed to every member link.
 * @param bool   $is_saving   Whether entries are written to the models.
 * @return bool false when the block or its end cannot be found.
 */
private function scrapejQuery2Section($contents, $kind, $name_prefix, $name_suffix, $category_id, $hierarchy, $source_url, $is_saving) {
    $plural = $kind . 's';
    $start_index = strpos($contents, '<div id="' . $plural . '">');
    if( $start_index === false ) {
        $this->view->results .= 'Couldn\'t find the ' . $plural . ', skipping...' . "\n";
        return false;
    }
    $end_index = strpos($contents, '<div id="', $start_index+1);
    if( $end_index === false ) {
        $this->view->results .= 'Couldn\'t find the end of the ' . $plural . ', skipping...' . "\n";
        return false;
    }
    $source_name = strtolower($hierarchy['name']);
    // Newlines are removed so the item pattern can match across lines.
    $data = str_replace("\n", '', substr($contents, $start_index, $end_index - $start_index));
    $pattern = '/<h3 class="' . $kind . '-name"><a href="(.+?)">(.+?)<\/a><\/h3>.+?<p>(.+?)<\/p>/';
    foreach( explode('<li class="' . $kind . '"', $data) as $element ) {
        if( !preg_match($pattern, $element, $matches) ) {
            continue;
        }
        $link = $source_url.$matches[1];
        // NOTE(review): the inner replace strips every space, which makes the
        // outer ' )' replacement dead; it may originally have targeted a
        // non-breaking-space entity. Left unchanged pending confirmation.
        $member = trim(str_replace(' )', ')', str_replace(' ', '', strip_tags($matches[2]))));
        $name = $source_name . $name_prefix . $member . $name_suffix;
        $desc = trim(strip_tags($matches[3], '<b>'));
        $this->view->results .= $link.' - '.$name."\n";
        $this->view->results .= $desc."\n\n";
        if( $is_saving ) {
            $this->getFunctionsModel()->insertOrUpdateFunction(array(
                'category' => $category_id,
                'hierarchy' => $hierarchy['id'],
                'name' => $name,
                'url' => $link,
                'short_description' => $desc
            ));
            $this->getHierarchiesModel()->touch($category_id, $hierarchy['id']);
        }
    }
    return true;
}
/**
 * Downloads the page of every scrapeable 'jQuery' hierarchy and records
 * the rows of its '<div class="options list">' table.
 *
 * The table region ends at '<div class="printfooter">'. It is split on
 * 'tr class="option"', and each piece that carries a titled link and a
 * 'desc' cell is stored through the functions model, followed by touch()
 * on the hierarchies model. Progress and skip reasons are appended to the
 * view's result buffer.
 */
private function scrapejQuery() {
    $category = 'jQuery';
    $category_id = $this->getCategoriesModel()->fetchCategoryByName($category);
    if( !$category_id ) {
        $this->invalid_category($category);
        return;
    }
    $scrapeable = $this->getHierarchiesModel()->fetchAllScrapeable($category_id);
    if( empty($scrapeable) ) {
        $this->nothing_to_scrape($category);
        return;
    }
    foreach( $scrapeable as $hierarchy ) {
        $this->view->results .= $hierarchy['name'] . "\n";
        $source_url = $hierarchy['source_url'];
        if( !$source_url ) {
            $this->view->results .= 'No source URL specified, skipping...' . "\n";
            continue;
        }
        $this->view->results .= '<a href="'.$source_url.'">'.$source_url."</a>\n";
        $page = file_get_contents($source_url);
        $list_start = strpos($page, '<div class="options list">');
        if( $list_start === false ) {
            $this->view->results .= 'Couldn\'t find the options list, skipping...' . "\n";
            continue;
        }
        $list_end = strpos($page, '<div class="printfooter">', $list_start);
        if( $list_end === false ) {
            $this->view->results .= 'Couldn\'t find the end of the options list, skipping...' . "\n";
            continue;
        }
        $rows = explode('tr class="option"', substr($page, $list_start, $list_end - $list_start));
        foreach( $rows as $row ) {
            if( !preg_match('/<a href="(.+?)" title=".+?">(.+?)<\/a><\/b>.+?<td colspan="2" class="desc">(.+?)<\/td>/', $row, $cells) ) {
                // Pieces that are not option rows are ignored silently.
                //$this->view->results .= htmlentities($row)."\n\n";
                continue;
            }
            $link = 'http://docs.jquery.com'.$cells[1];
            $label = strip_tags($cells[2]);
            $label = str_replace(' ', '', $label);
            $label = str_replace(' )', ')', $label);
            $name = trim($label);
            $desc = trim(strip_tags($cells[3], '<b>'));

            $this->view->results .= $link.' - '.$name."\n";
            $this->view->results .= $desc."\n\n";
            $this->getFunctionsModel()->insertOrUpdateFunction(array(
                'category' => $category_id,
                'hierarchy' => $hierarchy['id'],
                'name' => $name,
                'url' => $link,
                'short_description' => $desc
            ));
            $this->getHierarchiesModel()->touch($category_id, $hierarchy['id']);
        }
    }
}
- private function scrapeJavascript() {
- $category = 'Javascript';
- $category_id = $this->getCategoriesModel()->fetchCategoryByName($category);
- if( !$category_id ) {
- $this->invalid_category($category);
- return;
- }
- $scrapeable = $this->getHierarchiesModel()->fetchAllScrapeable($category_id);
- if( empty($scrapeable) ) {
- $this->nothing_to_scrape($category);
- return;
- }
- foreach( $scrapeable as $hierarchy ) {
- $this->view->results .= $hierarchy['name'] . "\n";
- if( !$hierarchy['source_url'] ) {
- $this->view->results .= 'No source URL specified, skipping...' . "\n";
- continue;
- }
- $source_url = $hierarchy['source_url'];
- $this->view->results .= '<a href="'.$source_url.'">'.$source_url."</a>\n";
- $contents = file_get_contents(APPLICATION_PATH . '/scraper/js/'.$hierarchy['id'].'.html');
- if( preg_match("/<h2>The (.+?) Object<\/h2>\n<p>(.+)?<\/p>/", $contents, $matches) ) {
- $object_name = $matches[1];
- $description = $matches[2];
- $this->view->results .= $object_name."\n";
- $this->view->results .= $description."\n";
- $this->getFunctionsModel()->insertOrUpdateFunction(array(
- 'category' => $category_id,
- 'hierarchy' => $hierarchy['id'],
- 'name' => $object_name,
- 'url' => $source_url,
- 'short_description' => $description
- ));
- } else {
- $this->view->results .= 'We couldn\'t find the description...' . "\n";
- }
-
- $is_dom = strpos($contents, 'HTML DOM <span class="color_h1">') !== false;
- if( $is_dom ) {
- $object_name = strtolower(str_replace(' ', '', $hierarchy['name']));
- }
- $properties_index = strpos($contents, 'Object Collections</h');
- $end_index = strpos($contents, '</table>', $properties_index);
- if( $properties_index !== FALSE && $end_index !== FALSE ) {
- $properties = array_slice(
- explode(
- '<tr>',
- substr($contents, $properties_index, $end_index - $properties_index)
- ),
- 2
- );
- foreach( $properties as $property ) {
- $elements = explode('<td', $property);
- foreach( $elements as &$element ) {
- $element = trim(
- str_replace(
- ' ',
- '',
- preg_replace(
- '/^.+?>/',
- '',
- str_replace(
- "\n",
- '',
- strip_tags(
- $element,
- '<a>'
- )
- )
- )
- )
- );
- }
- if( count($elements) <= 1 ) {
- $this->view->results .= 'Invalid element list.'."\n";
- $this->view->results .= print_r($property, true);
- break;
- }
- $link = $elements[1];
- $desc = $elements[2];
- $ff = $elements[3];
- if( count($elements) >= 6 ) {
- $ns = $elements[4];
- $ie = $elements[5];
- } else {
- $ie = $elements[4];
- }
- $name = '';
- if( $link ) {
- if( preg_match('/<a href="(.+?)">(.+)?<\/a>/', $link, $matches) ) {
- $link = $matches[1];
- $name = $matches[2];
- } else {
- $name = $link;
- $link = '';
- }
- }
- $name = str_replace('[]', '', $name);
- $this->view->results .= $object_name.'.'.$name ." - ";
- $this->view->results .= $link ." - ".$is_dom.' - ';
- $this->view->results .= $desc ."\n";
- $this->getFunctionsModel()->insertOrUpdateFunction(array(
- 'category' => $category_id,
- 'hierarchy' => $hierarchy['id'],
- 'name' => $object_name.'.'.$name,
- 'url' => $link,
- 'short_description' => $desc
- ));
- }
- }
- $properties_index = strpos($contents, 'Object Properties</h');
- $end_index = strpos($contents, '</table>', $properties_index);
- if( $properties_index !== FALSE && $end_index !== FALSE ) {
- $properties = array_slice(
- explode(
- '<tr>',
- substr($contents, $properties_index, $end_index - $properties_index)
- ),
- 2
- );
- foreach( $properties as $property ) {
- $elements = explode('<td', $property);
- foreach( $elements as &$element ) {
- $element = trim(
- str_replace(
- ' ',
- '',
- preg_replace(
- '/^.+?>/',
- '',
- str_replace(
- "\n",
- '',
- strip_tags(
- $element,
- '<a>'
- )
- )
- )
- )
- );
- }
- if( count($elements) <= 1 ) {
- $this->view->results .= 'Invalid element list.'."\n";
- $this->view->results .= print_r($property, true);
- break;
- }
- $link = $elements[1];
- $desc = $elements[2];
- $ff = $elements[3];
- if( count($elements) >= 6 ) {
- $ns = $elements[4];
- $ie = $elements[5];
- } else {
- $ie = $elements[4];
- }
- $name = '';
- if( $link ) {
- if( preg_match('/<a href="(.+?)">(.+)?<\/a>/', $link, $matches) ) {
- $link = $matches[1];
- $name = $matches[2];
- } else {
- $name = $link;
- $link = '';
- }
- }
- $this->view->results .= $object_name.'.'.$name ." - ";
- $this->view->results .= $link ." - ".$is_dom.' - ';
- $this->view->results .= $desc ."\n";
- $this->getFunctionsModel()->insertOrUpdateFunction(array(
- 'category' => $category_id,
- 'hierarchy' => $hierarchy['id'],
- 'name' => $object_name.'.'.$name,
- 'url' => $link,
- 'short_description' => $desc
- ));
- }
- }
-
- $methods_index = strpos($contents, 'Object Methods</h');
- $end_index = strpos($contents, '</table>', $methods_index);
-
- if( $methods_index !== FALSE && $end_index !== FALSE ) {
- $methods = array_slice(
- explode(
- '<tr>',
- substr($contents, $methods_index, $end_index - $methods_index)
- ),
- 2
- );
- foreach( $methods as $method ) {
- $elements = explode('<td valign="top">', $method);
- foreach( $elements as &$element ) {
- $element = trim(str_replace(' ', '', str_replace("\n", '', strip_tags($element, '<a>'))));
- }
- $link = $elements[1];
- $desc = $elements[2];
- $ff = $elements[3];
- if( count($elements) >= 6 ) {
- $ns = $elements[4];
- $ie = $elements[5];
- } else {
- $ie = $elements[4];
- }
- $name = '';
- if( $link ) {
- if( preg_match('/<a(?: target="_top")? href="(.+?)">(.+)?<\/a>/', $link, $matches) ) {
- $link = $matches[1];
- $name = $matches[2];
- } else {
- $name = $link;
- $link = '';
- }
- }
- $name = preg_replace('/(\(.*?\))/', '', $name);
- $this->view->results .= $object_name.'.'.$name ." - ";
- $this->view->results .= $link ." - ";
- $this->view->results .= $desc ."\n";
- $this->getFunctionsModel()->insertOrUpdateFunction(array(
- 'category' => $category_id,
- 'hierarchy' => $hierarchy['id'],
- 'name' => $object_name.'.'.$name,
- 'url' => $link,
- 'short_description' => $desc,
- 'scrapeable' => 1
- ));
- }
- continue;
- }
- $start_index = strpos($contents, 'Top-level Functions</h2>');
- $end_index = strpos($contents, '</table>', $start_index);
- $start_prop_index = strpos($contents, 'Top-level Properties</h2>');
- $end_prop_index = strpos($contents, '</table>', $start_prop_index);
- if( $start_index !== false && $end_index !== false &&
- $start_prop_index !== false && $end_prop_index !== false ) {
- $functions = array_slice(
- explode(
- '<tr>',
- substr($contents, $start_index, $end_index - $start_index)
- ),
- 2
- );
- foreach( $functions as $function ) {
- $elements = explode('<td valign="top">', $function);
- foreach( $elements as &$element ) {
- $element = trim(str_replace(' ', '', str_replace("\n", '', strip_tags($element, '<a>'))));
- }
- $link = $elements[1];
- $desc = $elements[2];
- $ff = $elements[3];
- if( count($elements) >= 6 ) {
- $ns = $elements[4];
- $ie = $elements[5];
- } else {
- $ie = $elements[4];
- }
- $name = '';
- if( $link ) {
- if( preg_match('/<a(?: target="_top")? href="(.+?)">(.+)?<\/a>/', $link, $matches) ) {
- $link = $matches[1];
- $name = $matches[2];
- } else {
- $name = $link;
- $link = '';
- }
- }
- $name = preg_replace('/(\(.*?\))/', '', $name);
- $this->view->results .= $name ." - ";
- $this->view->results .= $link ." - ";
- $this->view->results .= $desc ."\n";
- $this->getFunctionsModel()->insertOrUpdateFunction(array(
- 'category' => $category_id,
- 'hierarchy' => $hierarchy['id'],
- 'name' => $name,
- 'url' => $link,
- 'short_description' => $desc,
- 'scrapeable' => 1
- ));
- }
- $properties = array_slice(
- explode(
- '<tr>',
- substr($contents, $start_prop_index, $end_prop_index - $start_prop_index)
- ),
- 2
- );
- foreach( $properties as $property ) {
- $elements = explode('<td valign="top">', $property);
- foreach( $elements as &$element ) {
- $element = trim(str_replace(' ', '', str_replace("\n", '', strip_tags($element, '<a>'))));
- }
- $link = $elements[1];
- $desc = $elements[2];
- $ff = $elements[3];
- if( count($elements) >= 6 ) {
- $ns = $elements[4];
- $ie = $elements[5];
- } else {
- $ie = $elements[4];
- }
- $name = '';
- if( $link ) {
- if( preg_match('/<a(?: target="_top")? href="(.+?)">(.+)?<\/a>/', $link, $matches) ) {
- $link = $matches[1];
- $name = $matches[2];
- } else {
- $name = $link;
- $link = '';
- }
- }
- $name = preg_replace('/(\(.*?\))/', '', $name);
- $this->view->results .= $name ." - ";
- $this->view->results .= $link ." - ";
- $this->view->results .= $desc ."\n";
- $this->getFunctionsModel()->insertOrUpdateFunction(array(
- 'category' => $category_id,
- 'hierarchy' => $hierarchy['id'],
- 'name' => $name,
- 'url' => $link,
- 'short_description' => $desc,
- 'scrapeable' => 1
- ));
- }
- continue;
- }
- $start_index = strpos($contents, '<h2>Event Handlers</h2>');
- $end_index = strpos($contents, '</table>', $start_index);
- if( $start_index !== false && $end_index !== false ) {
- $events = array_slice(
- explode(
- '<tr>',
- substr($contents, $start_index, $end_index - $start_index)
- ),
- 2
- );
- foreach( $events as $event ) {
- $elements = explode('<td valign="top">', $event);
- foreach( $elements as &$element ) {
- $element = trim(str_replace(' ', '', str_replace("\n", '', strip_tags($element, '<a>'))));
- }
- $link = $elements[1];
- $desc = $elements[2];
- $ff = $elements[3];
- if( count($elements) >= 6 ) {
- $ns = $elements[4];
- $ie = $elements[5];
- } else {
- $ie = $elements[4];
- }
- $name = '';
- if( $link ) {
- if( preg_match('/<a(?: target="_top")? href="(.+?)">(.+)?<\/a>/', $link, $matches) ) {
- $link = $matches[1];
- $name = $matches[2];
- } else {
- $name = $link;
- $link = '';
- }
- }
- $name = preg_replace('/(\(.*?\))/', '', $name);
- if( $is_dom ) {
- $name = 'event.'.$name;
- }
- $this->view->results .= $name ." - ";
- $this->view->results .= $link ." - " . $is_dom.' - ';
- $this->view->results .= $desc ."\n";
- $this->getFunctionsModel()->insertOrUpdateFunction(array(
- 'category' => $category_id,
- 'hierarchy' => $hierarchy['id'],
- 'name' => $name,
- 'url' => $link,
- 'short_description' => $desc,
- 'scrapeable' => 1
- ));
- }
- if( !$is_dom ) {
- continue;
- }
- }
- $start_index = strpos($contents, 'Keyboard Attributes</h');
- $end_index = strpos($contents, '</table>', $start_index);
- if( $start_index !== false && $end_index !== false ) {
- $events = array_slice(
- explode(
- '<tr>',
- substr($contents, $start_index, $end_index - $start_index)
- ),
- 2
- );
- foreach( $events as $event ) {
- $elements = explode('<td valign="top">', $event);
- foreach( $elements as &$element ) {
- $element = trim(str_replace(' ', '', str_replace("\n", '', strip_tags($element, '<a>'))));
- }
- $link = $elements[1];
- $desc = $elements[2];
- $ff = $elements[3];
- if( count($elements) >= 6 ) {
- $ns = $elements[4];
- $ie = $elements[5];
- } else {
- $ie = $elements[4];
- }
- $name = '';
- if( $link ) {
- if( preg_match('/<a(?: target="_top")? href="(.+?)">(.+)?<\/a>/', $link, $matches) ) {
- $link = $matches[1];
- $name = $matches[2];
- } else {
- $name = $link;
- $link = '';
- }
- }
- $name = preg_replace('/(\(.*?\))/', '', $name);
- if( $is_dom ) {
- $name = 'event.'.$name;
- }
- $this->view->results .= $name ." - ";
- $this->view->results .= $link ." - " . $is_dom.' - ';
- $this->view->results .= $desc ."\n";
- $this->getFunctionsModel()->insertOrUpdateFunction(array(
- 'category' => $category_id,
- 'hierarchy' => $hierarchy['id'],
- 'name' => $name,
- 'url' => $link,
- 'short_description' => $desc,
- 'scrapeable' => 1
- ));
- }
- if( !$is_dom ) {
- continue;
- }
- }
- $start_index = strpos($contents, 'Event Attributes</h');
- $end_index = strpos($contents, '</table>', $start_index);
- if( $start_index !== false && $end_index !== false ) {
- $events = array_slice(
- explode(
- '<tr>',
- substr($contents, $start_index, $end_index - $start_index)
- ),
- 2
- );
- foreach( $events as $event ) {
- $elements = explode('<td valign="top">', $event);
- foreach( $elements as &$element ) {
- $element = trim(str_replace(' ', '', str_replace("\n", '', strip_tags($element, '<a>'))));
- }
- $link = $elements[1];
- $desc = $elements[2];
- $ff = $elements[3];
- if( count($elements) >= 6 ) {
- $ns = $elements[4];
- $ie = $elements[5];
- } else {
- $ie = $elements[4];
- }
- $name = '';
- if( $link ) {
- if( preg_match('/<a(?: target="_top")? href="(.+?)">(.+)?<\/a>/', $link, $matches) ) {
- $link = $matches[1];
- $name = $matches[2];
- } else {
- $name = $link;
- $link = '';
- }
- }
- $name = preg_replace('/(\(.*?\))/', '', $name);
- if( $is_dom ) {
- $name = 'event.'.$name;
- }
- $this->view->results .= $name ." - ";
- $this->view->results .= $link ." - " . $is_dom.' - ';
- $this->view->results .= $desc ."\n";
- $this->getFunctionsModel()->insertOrUpdateFunction(array(
- 'category' => $category_id,
- 'hierarchy' => $hierarchy['id'],
- 'name' => $name,
- 'url' => $link,
- 'short_description' => $desc,
- 'scrapeable' => 1
- ));
- }
- if( !$is_dom ) {
- continue;
- }
- }
- $start_index = strpos($contents, '<h3>Properties</h3>');
- $end_index = strpos($contents, '</table>', $start_index);
- if( $start_index !== false && $end_index !== false ) {
- $events = array_slice(
- explode(
- '<tr>',
- substr($contents, $start_index, $end_index - $start_index)
- ),
- 2
- );
- foreach( $events as $event ) {
- $elements = explode('<td valign="top">', $event);
- foreach( $elements as &$element ) {
- $element = trim(str_replace(' ', '', str_replace("\n", '', strip_tags($element, '<a>'))));
- }
- if( count($elements) < 2 ) {
- $this->view->results .= 'Missing '.print_r($event);
- continue;
- }
- $link = $elements[1];
- $desc = $elements[2];
- $ff = $elements[3];
- if( count($elements) >= 6 ) {
- $ns = $elements[4];
- $ie = $elements[5];
- } else {
- $ie = $elements[4];
- }
- $name = '';
- if( $link ) {
- if( preg_match('/<a(?: target="_top")? href="(.+?)">(.+)?<\/a>/', $link, $matches) ) {
- $link = $matches[1];
- $name = $matches[2];
- } else {
- $name = $link;
- $link = '';
- }
- }
- $name = preg_replace('/(\(.*?\))/', '', $name);
- $this->view->results .= $object_name.'.'.$name ." - ";
- $this->view->results .= $link ." - " . $is_dom.' - ';
- $this->view->results .= $desc ."\n";
- $this->getFunctionsModel()->insertOrUpdateFunction(array(
- 'category' => $category_id,
- 'hierarchy' => $hierarchy['id'],
- 'name' => $object_name.'.'.$name,
- 'url' => $link,
- 'short_description' => $desc,
- 'scrapeable' => 1
- ));
- }
- if( !$is_dom ) {
- continue;
- }
- }
- $start_index = 0;
- do {
- $start_index = strpos($contents, 'properties</a></h3>', $start_index);
- $end_index = strpos($contents, '</table>', $start_index);
- if( $start_index !== false && $end_index !== false ) {
- $events = array_slice(
- explode(
- '<tr>',
- substr($contents, $start_index, $end_index - $start_index)
- ),
- 2
- );
- foreach( $events as $event ) {
- $elements = explode('<td valign="top">', $event);
- foreach( $elements as &$element ) {
- $element = trim(str_replace(' ', '', str_replace("\n", '', strip_tags($element, '<a>'))));
- }
- $link = $elements[1];
- $desc = $elements[2];
- $ff = $elements[3];
- if( count($elements) >= 6 ) {
- $ns = $elements[4];
- $ie = $elements[5];
- } else {
- $ie = $elements[4];
- }
- $name = '';
- if( $link ) {
- if( preg_match('/<a(?: target="_top")? href="(.+?)">(.+)?<\/a>/', $link, $matches) ) {
- $link = $matches[1];
- $name = $matches[2];
- } else {
- $name = $link;
- $link = '';
- }
- }
- $name = preg_replace('/(\(.*?\))/', '', $name);
- $this->view->results .= $object_name.'.'.$name ." - ";
- $this->view->results .= $link ." - " . $is_dom.' - ';
- $this->view->results .= $desc ."\n";
-
- $this->getFunctionsModel()->insertOrUpdateFunction(array(
- 'category' => $category_id,
- 'hierarchy' => $hierarchy['id'],
- 'name' => $object_name.'.'.$name,
- 'url' => $link,
- 'short_description' => $desc,
- 'scrapeable' => 1
- ));
- }
- if( !$is_dom ) {
- continue;
- }
- }
- $start_index++;
- } while( $start_index !== false );
- $start_index = strpos($contents, 'Standard Properties</h3>');
- $end_index = strpos($contents, '</table>', $start_index);
- if( $start_index !== false && $end_index !== false ) {
- $events = array_slice(
- explode(
- '<tr>',
- substr($contents, $start_index, $end_index - $start_index)
- ),
- 2
- );
- foreach( $events as $event ) {
- $elements = explode('<td valign="top">', $event);
- foreach( $elements as &$element ) {
- $element = trim(str_replace(' ', '', str_replace("\n", '', strip_tags($element, '<a>'))));
- }
- $link = $elements[1];
- $desc = $elements[2];
- $ff = $elements[3];
- if( count($elements) >= 6 ) {
- $ns = $elements[4];
- $ie = $elements[5];
- } else {
- $ie = $elements[4];
- }
- $name = '';
- if( $link ) {
- if( preg_match('/<a(?: target="_top")? href="(.+?)">(.+)?<\/a>/', $link, $matches) ) {
- $link = $matches[1];
- $name = $matches[2];
- } else {
- $name = $link;
- $link = '';
- }
- }
- $name = preg_replace('/(\(.*?\))/', '', $name);
- $this->view->results .= $object_name.'.'.$name ." - ";
- $this->view->results .= $link ." - " . $is_dom.' - ';
- $this->view->results .= $desc ."\n";
- $this->getFunctionsModel()->insertOrUpdateFunction(array(
- 'category' => $category_id,
- 'hierarchy' => $hierarchy['id'],
- 'name' => $object_name.'.'.$name,
- 'url' => $link,
- 'short_description' => $desc,
- 'scrapeable' => 1
- ));
- }
- if( !$is_dom ) {
- continue;
- }
- }
- $this->view->results .= 'We couldn\'t find the properties or methods...' . "\n";
- }
- }
- private function scrapeiPhoneDir() {
- $category = 'iPhone';
- $category_id = $this->getCategoriesModel()->fetchCategoryByName($category);
- if( !$category_id ) {
- $this->invalid_category($category);
- return;
- }
- $hierarchies = array(
- /*'6' => 'http://developer.apple.com/iphone/library/navigation/Frameworks/CocoaTouch/AddressBookUI',
- '7' => 'http://developer.apple.com/iphone/library/navigation/Frameworks/CocoaTouch/UIKit',
- '81' => 'http://developer.apple.com/iphone/library/navigation/Frameworks/Media/AudioToolbox',
- '82' => 'http://developer.apple.com/iphone/library/navigation/Frameworks/Media/AudioUnit',
- '83' => 'http://developer.apple.com/iphone/library/navigation/Frameworks/Media/AVFoundation',
- '84' => 'http://developer.apple.com/iphone/library/navigation/Frameworks/Media/CoreAudio',
- '85' => 'http://developer.apple.com/iphone/library/navigation/Frameworks/Media/CoreGraphics',
- '86' => 'http://developer.apple.com/iphone/library/navigation/Frameworks/Media/MediaPlayer',
- '87' => 'http://developer.apple.com/iphone/library/navigation/Frameworks/Media/OpenGLES',
- '88' => 'http://developer.apple.com/iphone/library/navigation/Frameworks/Media/QuartzCore',*/
- '110' => 'http://developer.apple.com/iphone/library/navigation/Frameworks/CoreServices/AddressBook',
- '111' => 'http://developer.apple.com/iphone/library/navigation/Frameworks/CoreServices/CoreFoundation',
- '112' => 'http://developer.apple.com/iphone/library/navigation/Frameworks/CoreServices/CoreLocation',
- '113' => 'http://developer.apple.com/iphone/library/navigation/Frameworks/CoreServices/Foundation',
- '114' => 'http://developer.apple.com/iphone/library/navigation/Frameworks/CoreServices/SystemConfiguration',
- '115' => 'http://developer.apple.com/iph…
Large files are truncated, but you can click here to view the full file