PageRenderTime 56ms CodeModel.GetById 23ms RepoModel.GetById 1ms app.codeStats 0ms

/v1/lib/DataInterface/KBOPublicSearch/KBOPublicSearchCrawler.php

https://github.com/TomLous/datatools
PHP | 636 lines | 429 code | 86 blank | 121 comment | 60 complexity | 8570632c3fdd41b829c87ebcb6334775 MD5 | raw file
Possible License(s): Apache-2.0, GPL-2.0, LGPL-2.1
  1. <?php
  2. /**
  3. * Invokes form search and crawls requests from this site: http://kbopub.economie.fgov.be/kbopub
  4. * @todo: Open Data will become available soon. check that?
  5. *
  6. * @author Tom Lous <tomlous@gmail.com>
  7. * @copyright 2014 Tom Lous
  8. * @package KBOPublicSearch
  9. * Datetime: 02/05/14 15:05
  10. *
  11. */
  12. namespace DataInterface\KBOPublicSearch;
  13. use DataInterface\DataInterface;
  14. use DataInterface\Exception\IncompatibleInterfaceException;
  15. use DataInterface\Exception\IncompatibleInputException;
  16. use DataInterface\Exception\InterfaceQuotaExceededException;
  17. use models\Address;
  18. use models\GeoLocation;
  19. class KBOPublicSearchCrawler extends DataInterface
  20. {
  /**
   * French page labels, keyed by the internal property name they stand for.
   *
   * The `next` and `foundEnterprises` entries are interpolated into the XPath query and the
   * regular expression that detect paging in doRequestAndInterpretHTML(). translateText() also
   * searches this map, but currently discards the match, so the remaining entries do not yet
   * influence parsing.
   *
   * Values must be written the way translateText() normalises its input: no trailing colon or
   * dot, because those are stripped before the lookup.
   *
   * NOTE(review): the key `naturalPerson` maps to the label for a legal person in both language
   * maps; the key name is kept because other code may refer to it.
   *
   * @var array
   */
  private $mappingFrench = array(
      'companyNumber' => "Numéro d'entreprise",
      'addressNl' => 'Adresse du siège social',
      'numberOfEstablishments' => "Nombre d'unités d'établissement",
      // was "Nombre d''unités d'établissement (UE):" - doubled apostrophe and a trailing colon
      // that can never survive translateText()'s normalisation
      'numberOfEstablishments2' => "Nombre d'unités d'établissement (UE)",
      'numberAndStartDate' => "Numéro d'entreprise et date de début",
      'noData' => 'Pas de données reprises dans la BCE',
      'startDate' => 'Date de début',
      'name' => "Dénomination de l'entreprise",
      'additionalInfo' => "Info supplémentaire",
      'phoneNumber' => 'Numéro de téléphone',
      'faxNumber' => 'Numéro de fax',
      'emailAddress' => 'Adresse e-mail',
      'website' => 'Adresse web',
      'type' => "Type d'entreprise",
      'status' => 'Statut',
      'legalForm' => 'Forme juridique',
      'abbreviation' => 'Abréviation',
      'commercialName' => 'Dénomination commerciale',
      // NOTE(review): this value is Dutch; the French label is presumably "Radiation d'office" - confirm against the site
      'revocation' => 'Ambtshalve doorhaling',
      'next' => 'Suivant',
      'active' => 'AC',
      'naturalPerson' => 'Personne morale',
      'corporation' => 'Association des copropriétaires',
      'nonProfit' => 'Association sans but lucratif',
      'closedPartnership' => 'Société privée à responsabilité limitée',
      // French counterpart of the Dutch "Naamloze vennootschap"
      'namelessPartnership' => 'Société anonyme',
      'foundEnterprises' => 'entreprises trouvées',
      'corporationLimitedLiability' => 'Société coopérative à responsabilité limitée',
      'corporationOld' => 'Société coopérative (ancien statut)',
      // single-member variant; previously duplicated the closedPartnership label
      'oneManBusiness' => 'Société privée à responsabilité limitée unipersonnelle',
      'noLegal' => 'Société ou association sans personnalité juridique',
      'foreign' => 'Société étrangère',
      'other' => 'Autres formes juridiques',
      'email' => 'E-mail',
      'legalStatus' => 'Situation juridique',
      'normal' => 'Situation normale',
  );
  /**
   * Dutch page labels, keyed by the internal property name they stand for.
   *
   * translateText() compares normalised page text against these values (case-insensitively) and
   * replaces a match with the array key. The `next` and `foundEnterprises` entries are also used
   * by doRequestAndInterpretHTML() to detect the next-page link and the result counter.
   *
   * @var array
   */
  private $mappingDutch = array(
      // labels of the company record
      'companyNumber'               => 'Ondernemingsnummer',
      'addressNl'                   => 'Adres van de maatschappelijke zetel',
      'numberOfEstablishments'      => 'Aantal vestigingen',
      'numberOfEstablishments2'     => 'Aantal vestigingseenheden (VE)',
      'numberAndStartDate'          => 'Ondernemings- nummer en begindatum',
      'noData'                      => 'Geen gegevens opgenomen in KBO',
      'startDate'                   => 'Begindatum',
      'additionalInfo'              => 'Bijkomende info',
      'name'                        => 'Maatschappelijke Naam',
      'phoneNumber'                 => 'Telefoonnummer',
      'faxNumber'                   => 'Faxnummer',
      'emailAddress'                => 'E-mailadres',
      'website'                     => 'Webadres',
      'type'                        => 'Type onderneming',
      'status'                      => 'Status',
      'legalForm'                   => 'Rechtsvorm',
      'abbreviation'                => 'Afkorting',
      'commercialName'              => 'Commerciële Naam',
      'revocation'                  => 'Ambtshalve doorhaling',
      // paging link text and status code
      'next'                        => 'Volgende',
      'active'                      => 'AC',
      // entity types and legal forms
      'naturalPerson'               => 'Rechtspersoon',
      'corporation'                 => 'Vereniging van mede-eigenaars',
      'nonProfit'                   => 'Vereniging zonder winstoogmerk',
      'closedPartnership'           => 'Besloten vennootschap met beperkte aansprakelijkheid',
      'namelessPartnership'         => 'Naamloze vennootschap',
      'foundEnterprises'            => 'ondernemingen gevonden',
      'corporationLimitedLiability' => 'Coöperatieve vennootschap met beperkte aansprakelijkheid',
      'corporationOld'              => 'Coöperatieve vennootschap (oud statuut)',
      'oneManBusiness'              => 'Eenmans besloten vennootschap met beperkte aansprakelijkheid',
      'noLegal'                     => 'Vennootschap of vereniging zonder rechtspersoonlijkheid',
      'foreign'                     => 'Buitenlandse onderneming',
      'other'                       => 'Overige rechtsvormen',
      // remaining labels
      'email'                       => 'E-mail',
      'legalStatus'                 => 'Rechtstoestand',
      'normal'                      => 'Normale toestand',
  );
  /**
   * Base URL of the KBO public search site. buildUrl() appends the endpoint page name and the
   * query string to it; cleanUrl() strips it from and re-adds it to scraped links.
   */
  const apiUrl = 'http://kbopub.economie.fgov.be/kbopub/';
  /**
   * Submits the KBO search form and returns the parsed result page(s).
   *
   * @see http://kbopub.economie.fgov.be/kbopub/zoekwoordenform.html
   *
   * Required params (one of):
   *  - ondernemingsnummer (search by enterprise number)
   *  - searchWord (search by keyword)
   * Optional params:
   *  - any other field of the form on that page (see the $queryParams whitelist below)
   * Additional params:
   *  - page (page number, defaults to 1)
   *  - lang (language, `nl` or `fr`, defaults to `nl`)
   *  - follow_nextpage (boolean, `true`/1 to load the following result pages as well)
   *
   * Keys that are not in the whitelist and non-scalar values are silently ignored.
   *
   * @param array $params
   * @return array array with the keys `Meta` and `data`
   * @throws \DataInterface\Exception\IncompatibleInputException when $params is not an array or
   *         neither `ondernemingsnummer` nor `searchWord` is given
   * @throws \DataInterface\Exception\InterfaceQuotaExceededException when the site reports a too high request rate
   */
  public function searchOndernemingen($params = array())
  {
      $type = 'zoekwoordenform';

      // All available fields that can be set, with the defaults that are sent when not overridden.
      // (`_oudeBenaming` used to be listed twice; a repeated key only overwrites itself.)
      $queryParams = array(
          'ondernemingsnummer' => '',
          'natuurlijkPersoon' => 'true',
          '_natuurlijkPersoon' => 'on',
          'rechtsPersoon' => 'true',
          '_rechtsPersoon' => 'on',
          'searchWord' => '', // check
          '_oudeBenaming' => 'on',
          'pstcdeNPRP' => '',
          'postgemeente1' => '',
          'familynameFonetic' => '',
          'pstcdeNPFonetic' => '',
          'postgemeente2' => '',
          'searchwordRP' => '',
          'pstcdeRPFonetic' => '',
          'postgemeente3' => '',
          'rechtsvormFonetic' => 'ALL',
          'familynameExact' => '',
          'firstName' => '',
          'pstcdeNPExact' => '',
          'postgemeente4' => '',
          'firmName' => '',
          'pstcdeRPExact' => '',
          'postgemeente5' => '',
          'rechtsvormExact' => 'ALL',
          'page' => '1', // set hardcoded to 1, so it can be overwritten
          'lang' => 'nl', // set hardcoded to nl, so it can be overwritten
      );

      // sanitize input params
      if (!is_array($params)) {
          throw new IncompatibleInputException('Missing properties');
      }

      // only whitelisted keys with scalar values make it into the query
      foreach ($params as $param => $value) {
          if (isset($queryParams[$param]) && is_scalar($value)) {
              $queryParams[$param] = $value;
          }
      }

      // `follow_nextpage` is a crawler option, not a form field, so it is read separately
      $follow_nextpage = false;
      if (isset($params['follow_nextpage']) && is_scalar($params['follow_nextpage'])) {
          $flag = $params['follow_nextpage'];
          $follow_nextpage = strtolower((string)$flag) === 'true' || $flag == 1;
      }

      // Check if either of the two obligated values is set; each one needs its own submit-button field
      if (!empty($queryParams['ondernemingsnummer'])) {
          $queryParams['actionEntnr'] = 'Zoek onderneming';
      } elseif (!empty($queryParams['searchWord'])) {
          $queryParams['actionNPRP'] = 'Zoek onderneming';
      } else {
          throw new IncompatibleInputException('Missing property `ondernemingsnummer` or `searchWord`');
      }

      // @todo create possible switch for other params / combis

      // create a new URL for this request e.g. http://kbopub.economie.fgov.be/kbopub/zoekwoordenform.html?query
      $requestUrl = $this->buildUrl($type, $queryParams);

      // request the KBO page and turn its HTML tables into result data
      return $this->doRequestAndInterpretHTML($requestUrl, $type, $follow_nextpage);
  }
  /**
   * Fetches one KBO page, converts every HTML table on it into a nested array and maps the
   * recognised tables onto KBOCompany objects.
   *
   * For the `zoekwoordenform` type a table with 7 columns is treated as a search result list
   * (one KBOCompany per row), a table with 3 columns as a company detail page (one KBOCompany
   * for the whole table); any other table is returned as the raw array structure.
   *
   * When $follow_nextpage is true and the page contains a "next" link, the next page is fetched
   * recursively and its `data` and `Meta` are merged into the result. The link itself is always
   * reported in `Meta.nextPageUrl`, so a caller can continue manually.
   *
   * @param string $url full URL to request
   * @param string $type endpoint name, used to rebuild relative links and to select the table mapping
   * @param bool $follow_nextpage whether following result pages are loaded as well
   * @param int $maxLoopCount number of further pages that may still be followed; a negative value
   *                          means "derive it from the result counter on the page"
   * @return array array with the keys `Meta` (url, type, nextPageUrl, totalNumberOfResults) and `data`
   * @throws \DataInterface\Exception\InterfaceQuotaExceededException when the site answers `Request rate too high`
   */
  private function doRequestAndInterpretHTML($url, $type, $follow_nextpage = false, $maxLoopCount = -1)
  {
      $returnData = array(
          'Meta' => array(
              'url' => $url,
              'type' => $type,
          ),
          'data' => array(),
      );

      // Retrieve the HTML for this url
      $html = $this->doGetRequest($url);

      // When request rate is too high, the site answers with this message
      if (preg_match('/Request rate too high/is', $html)) {
          throw new InterfaceQuotaExceededException('Access Denied to service reason: `Request rate too high` for request to ' . $url);
      }

      // convert HTML into DOMDocument; markup errors are collected internally instead of raised
      libxml_use_internal_errors(true);
      $document = new \DOMDocument();
      $document->strictErrorChecking = false;
      $document->preserveWhiteSpace = false;
      if (is_string($html) && $html !== '') {
          // loadHTML() rejects an empty string, an empty body simply yields no tables
          $document->loadHTML($html);
      }
      // the collected parse errors are never inspected; drop them so they do not pile up per page
      libxml_clear_errors();
      $xpath = new \DOMXPath($document);

      $tableArray = array();
      $resultsPerPage = 0;

      // All data in KBO is returned as table, so parse the tables in the returned data
      foreach ($document->getElementsByTagName('table') as $tableNode) {
          $tableData = array(
              'numRows' => 0,
              'numColums' => 0,
          );
          $isHeaderRow = false;

          foreach ($tableNode->getElementsByTagName('tr') as $rowNode) {
              $tableData['numRows']++;
              $numColums = 0;
              $rowData = array();

              // a row without td's but with th's becomes the header row
              $cellNodeList = $rowNode->getElementsByTagName('td');
              if ($cellNodeList->length == 0) {
                  $cellNodeList = $rowNode->getElementsByTagName('th');
                  if ($cellNodeList->length > 0) {
                      $isHeaderRow = true;
                  }
              }

              foreach ($cellNodeList as $cellNode) {
                  $numColums++;
                  $cellData = array();

                  // every direct child of the cell (text, span, a, ...) becomes one "cell part"
                  foreach ($cellNode->childNodes as $cellPartNode) {
                      $cellPartData = array();
                      $cellPartData['content'] = $this->translateText($cellPartNode->nodeValue);

                      // keep the interesting attributes (only href) of non-header cell parts
                      if ($cellPartNode->attributes && $cellPartNode->attributes->length > 0 && !$isHeaderRow) {
                          $cellAttributes = array();
                          foreach ($cellPartNode->attributes as $attribute) {
                              if (in_array($attribute->name, array('href'))) { // allowed attributes for content
                                  // trim & remove session id from url
                                  $cellAttributes[$attribute->name] = $this->cleanUrl($attribute->value, $type);
                              }
                          }
                          if (count($cellAttributes) > 0) {
                              $cellPartData['attributes'] = $this->cleanArray($cellAttributes);
                          }
                      }

                      $cellData[] = $this->cleanArray($cellPartData);
                  }
                  $cellData = $this->cleanArray($cellData, true);

                  // key is the next number in line ...
                  $key = count($rowData);
                  if ($isHeaderRow) {
                      // ... a header cell is reduced to its first cell part
                      $cellData = current($cellData);
                  } elseif (isset($tableData['header'][$key]['content'])) {
                      // ... unless a header exists for this position, then its text is the key
                      $key = (string)$tableData['header'][$key]['content'];
                  }
                  $rowData[$key] = $cellData;
              }

              // assume row with th elements is the header row
              if ($isHeaderRow) {
                  $tableData['header'] = array_values($rowData);
                  $isHeaderRow = false;
              } else {
                  // filter the empty cells (keys are preserved)
                  $tableData['rows'][] = $this->cleanArray($rowData);
              }
              $tableData['numColums'] = max($tableData['numColums'], $numColums);
          }

          // filter the empty rows
          if (isset($tableData['rows'])) {
              $tableData['rows'] = $this->cleanArray($tableData['rows']);
          }

          // the largest table (header row included) is taken as the page size
          if ($tableData['numRows'] > $resultsPerPage) {
              $resultsPerPage = $tableData['numRows'];
          }

          // a table without rows and columns must end up empty so it gets filtered out below
          if ($tableData['numColums'] == 0 && $tableData['numRows'] == 0) {
              unset($tableData['numRows']);
              unset($tableData['numColums']);
          }

          // filter the empty table properties (eg no header)
          $tableArray[] = $this->cleanArray($tableData);
      }

      // filter the empty tables
      $tableArray = $this->cleanArray($tableArray, true);

      // find a possible next page link in the DOM
      $nextPageUrl = null;
      $domNodes = $xpath->query("//a[contains(.,'{$this->mappingFrench['next']}') or contains(.,'{$this->mappingDutch['next']}')]");
      if ($domNodes && $domNodes->length > 0) {
          $firstNode = $domNodes->item(0);
          if ($firstNode) {
              $hrefAttribute = $firstNode->getAttribute('href');
              if ($hrefAttribute) {
                  $nextPageUrl = $this->cleanUrl($hrefAttribute, $type);
              }
          }
      }
      if ($nextPageUrl) {
          $returnData['Meta']['nextPageUrl'] = $nextPageUrl;
      }

      // parse the number of results from HTML; a page without counter counts as one result
      $numResults = 1;
      if (preg_match("/(\d+) ({$this->mappingFrench['foundEnterprises']}|{$this->mappingDutch['foundEnterprises']})/is", $html, $matches)) {
          if (isset($matches[1])) {
              $numResults = $matches[1];
          }
      }
      $returnData['Meta']['totalNumberOfResults'] = $numResults;

      // map the table structures onto result data
      foreach ($tableArray as $table) {
          if ($type == 'zoekwoordenform') { // ondernemingen
              $numColums = isset($table['numColums']) ? $table['numColums'] : 0;
              // a table with only a header row (or only empty rows) has no `rows` entry
              $rows = isset($table['rows']) ? $table['rows'] : array();

              if ($numColums == 7) { // if it's a recordset from company search
                  foreach ($rows as $record) {
                      $returnData['data'][] = $this->parseRecordSetToKBOCompany($record);
                  }
              } elseif ($numColums == 3) { // if it's a detail page from company search
                  $returnData['data'][] = $this->parseListToKBOCompany($rows);
              } else {
                  $returnData['data'][] = $table;
              }
          }
      }

      // Set the max loopcount if unset; without any table rows there is nothing to page through
      if ($maxLoopCount < 0) {
          $maxLoopCount = $resultsPerPage > 0 ? ceil($numResults / $resultsPerPage) : 0;
      }

      // follow the next page only when the caller asked for it
      if ($follow_nextpage && $nextPageUrl && $maxLoopCount > 0) {
          $newData = $this->doRequestAndInterpretHTML($nextPageUrl, $type, $follow_nextpage, $maxLoopCount - 1);
          $returnData['data'] = array_merge($returnData['data'], $newData['data']);
          $returnData['Meta'] = array_merge($returnData['Meta'], $newData['Meta']);
      }

      return $returnData;
  }
  /**
   * Builds a KBOCompany from the rows of a detail table.
   *
   * Each usable row holds the property label in its first cell and the value cell parts in its
   * second cell. Rows are filtered with cleanArray() before they arrive here, which preserves
   * keys, so a row may lack index 0 or 1 even when it has two or more cells; such rows are skipped.
   *
   * @param array $rows rows of one parsed table
   * @return KBOCompany
   */
  private function parseListToKBOCompany($rows)
  {
      $kboCompany = new KBOCompany();

      foreach ($rows as $row) {
          if (!is_array($row) || count($row) < 2) {
              // label-only or empty row, nothing to set
              continue;
          }
          if (!isset($row[0][0]['content'], $row[1])) {
              // first cell has no text or the value cell was filtered out as empty
              continue;
          }

          $this->setKBOCompanyProperty($kboCompany, $row[0][0]['content'], $row[1]);
      }

      return $kboCompany;
  }
  /**
   * Turns one row of a search result table into a KBOCompany.
   *
   * Every cell is handed to setKBOCompanyProperty() under its column key; the cell stored under
   * the integer key 0 is passed on as `num`.
   *
   * NOTE(review): the result number is set to 1 after the loop, which overrides whatever the
   * `num` cell provided - confirm that this is intended.
   *
   * @param array $record cells of one table row, keyed by header text or position
   * @return KBOCompany
   */
  private function parseRecordSetToKBOCompany($record)
  {
      $company = new KBOCompany();

      foreach ($record as $column => $cell) {
          $property = ($column === 0) ? 'num' : $column;
          $this->setKBOCompanyProperty($company, $property, $cell);
      }

      $company->setResultNum(1);

      return $company;
  }
  /**
   * Copies the value of one parsed table cell into the matching KBOCompany property.
   *
   * $data is the list of cell parts of the value cell; every part is an array that may contain
   * `content` (text) and `attributes` (currently only `href`). Parts or entries that are absent
   * are treated as null instead of being read blindly. Unknown keys are ignored.
   *
   * @param KBOCompany $kboCompany company to fill
   * @param string|int $key property name as produced by translateText(), or `num`
   * @param array $data cell parts of the value cell
   * @return void
   */
  private function setKBOCompanyProperty(&$kboCompany, $key, $data)
  {
      // text of the first three cell parts, null when a part is absent
      $first = isset($data[0]['content']) ? $data[0]['content'] : null;
      $second = isset($data[1]['content']) ? $data[1]['content'] : null;
      $third = isset($data[2]['content']) ? $data[2]['content'] : null;

      $value = $this->translateText($first);

      switch ($key) {
          case 'num':
              $kboCompany->setResultNum($value);
              break;
          case 'numberAndStartDate':
              // first part: linked company number, second part: start date
              $kboCompany->setCompanyVat($value);
              $kboCompany->setDetailUrl(isset($data[0]['attributes']['href']) ? $data[0]['attributes']['href'] : null);
              $kboCompany->setStartDate($second);
              break;
          case 'companyNumber':
              $kboCompany->setCompanyVat($value);
              break;
          case 'startDate':
              $kboCompany->setStartDate($value);
              break;
          case 'name':
              $kboCompany->setCompanyName($value);
              break;
          case 'addressNl':
              // the address spans the first and third part (the part in between is skipped)
              $kboCompany->setAddressParts(KBOCompany::LANGUAGE_NL, $first, $third);
              break;
          case 'addressFr':
              $kboCompany->setAddressParts(KBOCompany::LANGUAGE_FR, $first, $third);
              break;
          case 'additionalInfo':
              // the additional info cell can carry the address, announced by its first part
              if ($value == 'addressNl') {
                  $kboCompany->setAddressParts(KBOCompany::LANGUAGE_NL, $second, $third);
              }
              break;
          case 'type':
          case 'legalForm':
              $kboCompany->resetLegalTypes();
              if (is_array($data)) {
                  foreach ($data as $content) {
                      // only parts without a space are added, i.e. texts translateText() turned into a key
                      if (isset($content['content']) && strpos($content['content'], " ") === false) {
                          $kboCompany->addLegalType($content['content']);
                      }
                  }
              }
              break;
          case 'status':
              $kboCompany->setStatus($value);
              break;
          case 'numberOfEstablishments':
          case 'numberOfEstablishments2':
              $kboCompany->setNumberOfEstablishments($value);
              break;
          case 'phoneNumber':
              $kboCompany->setPhoneNumber($value);
              break;
          case 'faxNumber':
              $kboCompany->setFaxNumber($value);
              break;
          case 'emailAddress': // label variant present in the mappings that had no case of its own
          case 'email':
              $kboCompany->setEmailAddress($value);
              break;
          case 'website':
              $kboCompany->setWebsite($value);
              break;
          case 'legalStatus':
              $kboCompany->setLegalStatus($value);
              break;
          default:
              // no property for this key
              break;
      }
  }
  /**
   * Normalises a piece of page text and replaces known Dutch labels by their property name.
   *
   * Scalars are trimmed, lose one trailing colon, the double quotes around a quoted part and one
   * trailing dot (in that order). The result is then compared case-insensitively with the values
   * of $mappingDutch; on a match the mapping key is returned instead. The `noData` key is
   * reported as null. Non-scalar input is returned untouched.
   *
   * NOTE(review): the French mapping was only searched, its match was never applied; that
   * result-less lookup is left out here. French labels therefore stay untranslated, as before.
   *
   * @param mixed $string
   * @return mixed|string|null
   */
  private function translateText($string)
  {
      if (!is_scalar($string)) {
          return $string;
      }

      // patterns are applied one after another on the trimmed text
      $string = preg_replace(
          array('/:$/', '/"(.*)"/', '/\.$/'),
          array('', '$1', ''),
          trim($string)
      );

      $dutchLabels = array_map('strtolower', $this->mappingDutch);
      $property = array_search(strtolower($string), $dutchLabels);
      if ($property) {
          $string = $property;
      }

      return $string == 'noData' ? null : $string;
  }
  /**
   * Drops every element that filterData() rejects.
   *
   * Keys are kept as they are unless $valuesOnly is set, in which case the remaining elements
   * are renumbered from 0.
   *
   * @param array $array
   * @param bool $valuesOnly
   * @return array
   */
  private function cleanArray($array, $valuesOnly = false)
  {
      $kept = array_filter($array, array($this, 'filterData'));

      return $valuesOnly ? array_values($kept) : $kept;
  }
  /**
   * Normalises a link scraped from a KBO page into an absolute URL without session id.
   *
   * The `;jsessionid=...` part and the API base URL are removed first. What is left is handled
   * as follows:
   *  - still starting with `http`: returned as is (link to another host)
   *  - starting with `?` while an endpoint is known: the query is re-attached to that endpoint
   *  - anything else: appended to the API base URL
   *
   * The value is expected to come from the DOM, which already delivers UTF-8, so it is not
   * re-encoded (the former utf8_encode() call double-encoded non-ASCII characters).
   *
   * @param string $url
   * @param string|null $endpoint endpoint name (without `.html`) for query-only links
   * @return string
   */
  private function cleanUrl($url, $endpoint = null)
  {
      $url = preg_replace('/;jsessionid=[^&?]+/', '', trim($url));
      $url = str_replace(self::apiUrl, '', $url);

      if (preg_match('/^http/is', $url)) {
          return $url;
      }

      // a query-only link can only be rebuilt when the endpoint it belongs to is known
      if ($endpoint !== null && $endpoint !== '' && strpos($url, '?') === 0) {
          parse_str(substr($url, 1), $params);

          return $this->buildUrl($endpoint, $params);
      }

      return self::apiUrl . $url;
  }
  /**
   * array_filter() callback that tells whether a value carries data.
   *
   * null, false, the empty string and empty arrays are rejected; everything else - including 0
   * and '0' - is kept.
   *
   * @param mixed $value
   * @return bool
   */
  private function filterData($value)
  {
      if ($value === null || $value === false || $value === '') {
          return false;
      }

      if (is_array($value)) {
          return count($value) > 0;
      }

      return true;
  }
  /**
   * Composes the GET URL for an endpoint: API base URL, endpoint name, `.html?` and the
   * URL-encoded query parameters.
   *
   * @param string $endpoint page name without extension
   * @param array $queryParameters
   * @return string
   */
  private function buildUrl($endpoint, $queryParameters = array())
  {
      $query = http_build_query($queryParameters);

      return sprintf('%s%s.html?%s', self::apiUrl, $endpoint, $query);
  }
  /**
   * Executes a GET request and returns the response body as string.
   *
   * Every `&nbsp;` entity in the body is replaced by a plain space so that label matching works
   * on ordinary spaces; raw non-breaking space bytes are left untouched.
   *
   * @param string $url
   * @return string
   * @throws \DataInterface\Exception\IncompatibleInterfaceException when the request fails
   */
  private function doGetRequest($url)
  {
      // retrieve data from url
      $result = file_get_contents($url);

      // file_get_contents() signals failure with false (never null)
      if ($result === false) {
          throw new IncompatibleInterfaceException('Invalid result from request to ' . $url . ' result: ');
      }

      // replace nbsp's by spaces for matching purposes
      return str_replace("&nbsp;", ' ', $result);
  }
  546. }
  547. }