PageRenderTime 275ms CodeModel.GetById 17ms RepoModel.GetById 0ms app.codeStats 0ms

/web-app/cronjobs/AddressImportGeocom.php

https://github.com/ParveenArora/AddressHunter
PHP | 204 lines | 137 code | 13 blank | 54 comment | 32 complexity | 2664b04bca3129d3c22f2fd9065ea451 MD5 | raw file
  1. <?php
  2. /**
  3. * AddressHunter
  4. *
  5. * LICENSE
  6. *
  7. * This source file is subject to the new BSD license that is bundled
  8. * with this package in the file LICENSE.txt.
  9. * It is also available through the world-wide-web at this URL:
  10. * http://www.opensource.org/licenses/BSD-3-Clause
  11. * If you did not receive a copy of the license and are unable to
  12. * obtain it through the world-wide-web, please send an email
  13. * to contact@addresshunter.net so we can send you a copy immediately.
  14. *
  15. * @package AddressHunter
  16. * @copyright Copyright (c) 2011 skobbler GmbH (http://www.skobbler.com)
  17. * @license http://www.opensource.org/licenses/BSD-3-Clause New BSD License
  18. * @version $Id$
  19. */
  20. /**
  21. * Imports addresses from skobbler GeoCom logs
  22. * (to be run as cronjob)
  23. *
  24. * This script performs the first step of a two-step address import (data import and validation)
  25. * and is an example how address data can be imported from various sources. To import data from
  26. * another source, implement a similar tool to this one.
  27. *
  28. * This script parses the logs, extracts the (theoretically) usable addresses and imports them
  29. * to the database ("address_import" table) in a standardized format.
  30. * An address is considered usable if it has at least a street name and preferably also a
  31. * numeric housenumber. If the housenumber is missing, "1" is used instead.
  32. * The addresses are imported as lowercased addressline (string). Duplicates are discarded, but
  33. * their frequency is counted.
  34. */
  35. // debug and benchmark constants
  36. define('DEBUG', false);
  37. define('BENCHMARK', false);
  38. // including the init (ZF bootstrap)
  39. require_once 'init.php';
  40. // getting the config
  41. $config = Zend_Registry::get('Zend_Config');
  42. $geocomLogsPath = $config->addressimport->geocom->path;
  43. // searching for the next logfile to parse
  44. $logfile = false;
  45. if ($dh = opendir($geocomLogsPath)) {
  46. while (false !== ($file = readdir($dh))) {
  47. if ($file != "." && $file != ".." && $file != ".svn" && is_file($geocomLogsPath . $file)) {
  48. $logfile = $file;
  49. break;
  50. }
  51. }
  52. closedir($dh);
  53. } else {
  54. // TODO: use Zend logger everywhere
  55. error_log('AddressImportGeocom ERROR: Error reading from geocom logs directory');
  56. die();
  57. }
  58. if (!$logfile) {
  59. // no log file to parse
  60. die();
  61. }
  62. // array for statistics
  63. $stats = array(
  64. 'regular_has_nr_num' => 0, // regular search with numeric house number
  65. 'regular_has_nr_nan' => 0, // regular search with NaN house number
  66. 'regular_has_str_only' => 0, // regular search with street name only (no house number)
  67. 'regular_has_str_nr' => 0, // regular search with street name which starts or ends with a number (probably a house number, wrongly placed by the user)
  68. 'regular_no_str' => 0, // regular search with no street name (probably only a city, zipcode or country)
  69. 'oneline_has_nr' => 0, // oneline search with at least a numeric character in it
  70. 'oneline_no_nr' => 0, // oneline search with no numeric chars
  71. 'reverse' => 0, // reverse geocodings
  72. );
  73. // TODO: use Doctrine
  74. $dbSettings = $config->doctrine->connection->toArray();
  75. $dbh = mysql_connect($dbSettings['host'], $dbSettings['user'], $dbSettings['password']);
  76. if (!$dbh) {
  77. error_log('AddressImportGeocom ERROR: Could not connect to the database: ' . mysql_error());
  78. die();
  79. }
  80. if (!mysql_select_db($dbSettings['dbname'], $dbh)) {
  81. error_log('AddressImportGeocom ERROR: Could not select database: ' . mysql_error());
  82. die();
  83. }
  84. mysql_query("SET NAMES UTF8", $dbh);
  85. $fh = fopen($geocomLogsPath . $logfile, "rb");
  86. if ($fh) {
  87. while (($buffer = fgets($fh, 4096)) !== false) {
  88. $finalAddressline = false;
  89. $finalHousenumber = false;
  90. // ####### regular searches (separate address details)
  91. if (strpos($buffer, ' message: GEOCOM request received from client: /geocode/regular?') === 0) {
  92. $query = substr($buffer, 65);
  93. $arr = array();
  94. parse_str($query, $arr);
  95. $arr = array_map('trim', $arr);
  96. $arr = array_map('mb_strtolower', $arr);
  97. if (isset($arr['number'])) { // house number specified
  98. if (preg_match('/[0-9]+/', $arr['number'])) { // numeric house number
  99. $stats['regular_has_nr_num']++;
  100. } else { // house number present, but not numeric (we will ignore it)
  101. $stats['regular_has_nr_nan']++;
  102. $arr['number'] = '1';
  103. }
  104. $finalAddressline = $arr['number'] . ' ' . @$arr['street'] . ', ' . @$arr['city'] . ' ' . @$arr['postal_code'] . ', ' . @$arr['state'] . ', ' . @$arr['country_code'];
  105. $finalHousenumber = $arr['number'];
  106. } elseif (isset($arr['street'])) { // no house number specified, but we have street name
  107. $x = explode(' ', $arr['street']);
  108. if (preg_match('/[0-9]+/', $x[0])) { // street name starts with a number (we will consider it as house number)
  109. $stats['regular_has_str_nr']++;
  110. $arr['number'] = $x[0];
  111. array_shift($x);
  112. $arr['street'] = implode(' ', $x);
  113. } elseif (preg_match('/[0-9]+/', $x[count($x)-1])) { // street name ends with a number (we will consider it as house number)
  114. $stats['regular_has_str_nr']++;
  115. $arr['number'] = $x[count($x)-1];
  116. array_pop($x);
  117. $arr['street'] = implode(' ', $x);
  118. } else { // street name with no number: we will search for house no. 1
  119. $stats['regular_has_str_only']++;
  120. $arr['number'] = '1';
  121. }
  122. $finalAddressline = $arr['number'] . ' ' . @$arr['street'] . ', ' . @$arr['city'] . ' ' . @$arr['postal_code'] . ', ' . @$arr['state'] . ', ' . @$arr['country_code'];
  123. $finalHousenumber = $arr['number'];
  124. } else {
  125. $stats['regular_no_str']++;
  126. }
  127. // ####### one-line searches
  128. } elseif (strpos($buffer, ' message: GEOCOM request received from client: /geocode/oneline?') === 0) {
  129. $query = substr($buffer, 65);
  130. $arr = array();
  131. parse_str($query, $arr);
  132. if (isset($arr['address'])) {
  133. $arr['address'] = trim($arr['address']);
  134. $arr['address'] = mb_strtolower($arr['address']);
  135. $matches = null;
  136. // we need to identify the housenumber (if there is one)
  137. if (preg_match('/[0-9]+/', $arr['address'], $matches)) { // addressline has at least a numerical character in it, otherwise it's discarded
  138. $stats['oneline_has_nr']++;
  139. // removing temporarily because we cannot identify the housenumber (reliably) for further comparison
  140. //$finalAddressline = $arr['address'];
  141. //$finalHousenumber = '';
  142. } else {
  143. $stats['oneline_no_nr']++;
  144. }
  145. }
  146. // ####### reverse geocodings
  147. } elseif (strpos($buffer, ' message: GEOCOM request received from client: /geocode/reverse?') === 0) {
  148. $stats['reverse']++;
  149. } else {
  150. // other lines from the log file are ignored
  151. }
  152. // inserting to the DB
  153. if ($finalAddressline) {
  154. // cleaning up the addressline
  155. $finalAddressline = str_replace(array(', , ', ' ', ', , ', ' ', ' ,', "\n", "\r"), array(', ', ' ', ', ', ' ', ',', ' ', ' '), $finalAddressline);
  156. $finalAddressline = preg_replace('!\s+!', ' ', $finalAddressline);
  157. $finalAddressline = trim($finalAddressline, ", \t\n\r\0\x0B");
  158. if (DEBUG) {
  159. //print "\n" . $finalAddressline;
  160. }
  161. $insert = sprintf("INSERT INTO address_import (addressline, housenumber, country_code) VALUES ('%s', '%s', '%s') ON DUPLICATE KEY UPDATE frequency = frequency + 1;",
  162. mysql_real_escape_string($finalAddressline),
  163. mysql_real_escape_string($finalHousenumber),
  164. mysql_real_escape_string(substr($finalAddressline, -2))
  165. );
  166. $dbresult = mysql_query($insert, $dbh);
  167. if (!$dbresult) {
  168. error_log('AddressImportGeocom ERROR: Error inserting to DB: ' . $insert);
  169. }
  170. }
  171. }
  172. if (!feof($fh)) {
  173. error_log('AddressImportGeocom ERROR: unexpected fgets() fail');
  174. }
  175. fclose($fh);
  176. } else {
  177. error_log('AddressImportGeocom ERROR: Error opening logfile' . $logfile);
  178. }
  179. // logging the statistics
  180. $statsLog = $config->addressimport->logs->path . 'geocom_import.log.csv';
  181. file_put_contents(
  182. $statsLog,
  183. "\n" . date('Y-m-d H:i:s') . ';' . $logfile . ';' . implode(';', $stats),
  184. FILE_APPEND
  185. );
  186. if (DEBUG) {
  187. print "\n";
  188. print_r($stats);
  189. }
  190. // deleting the successfully processed log file
  191. unlink($geocomLogsPath . $logfile);