PageRenderTime 43ms CodeModel.GetById 15ms RepoModel.GetById 0ms app.codeStats 0ms

/src/com/winterwell/jgeoplanet/LocalGeocoder.java

http://github.com/winterstein/JTwitter
Java | 431 lines | 313 code | 26 blank | 92 comment | 54 complexity | c32a1c9a7e666a03ad72361b5a6ffe00 MD5 | raw file
  1. package com.winterwell.jgeoplanet;
  2. import java.io.BufferedReader;
  3. import java.io.IOException;
  4. import java.io.InputStream;
  5. import java.io.InputStreamReader;
  6. import java.io.Reader;
  7. import java.util.ArrayList;
  8. import java.util.Arrays;
  9. import java.util.Collections;
  10. import java.util.HashMap;
  11. import java.util.List;
  12. import java.util.Map;
  13. import java.util.regex.Matcher;
  14. import java.util.regex.Pattern;
  15. import winterwell.jtwitter.InternalUtils;
  16. /**
  17. * Use a local dataset to do some geocoding.
  18. * The data rows are:
  19. *
  20. * country-format, name1;name2;name3
  21. *
  22. * List of cities: see http://www.opengeocode.org/download.php#cities
  23. * We do not use this -- because it contains (rightly) duplicates.
  24. * c.f. http://opendata.stackexchange.com/questions/3835/a-list-of-cities-of-each-country
  25. *
  26. * @author daniel
  27. * @testedby {@link LocalGeocoderTest}
  28. */
  29. public class LocalGeocoder implements IGeoCode {
  /**
   * Lookup from canonical place name (see {@link #canonical(String)}) to place.
   */
  private final Map<String, IPlace> canonDesc2place = new HashMap<>();

  /**
   * Build a geocoder from the bundled city and country data files.
   * This loads the data from file, so best-practice is to re-use the object.
   *
   * @throws RuntimeException wrapping any exception raised while loading the data
   */
  public LocalGeocoder() throws RuntimeException {
    // Use the bundled csv of country data
    try {
      Map<String, List<String>> code2names = loadISO3166NameData();
      // cities first, so the countries can take precedence if they share the same name
      loadWikipedia("LocalGeocoder_wikipedia_cities.txt");
      loadCSV("LocalGeocoder_cities.csv", null);
      loadCSV("LocalGeocoder_countries.csv", code2names);
      // // HACKs
      // SimplePlace mog = (SimplePlace) getPlace("Mogadishu");
      // mog.centroid = new Location(2.0469, 45.3182);
      // loadCSV("worldcities.csv", null);
      InternalUtils.log("geo","LocalGeocoder loaded "+places.size());
    } catch (Exception e) {
      throw new RuntimeException(e);
    }
  }
  52. /**
  53. * Data from: https://en.wikipedia.org/wiki/List_of_cities_by_latitude
  54. * @param resource
  55. * @throws IOException
  56. */
  57. private void loadWikipedia(String resource) throws IOException {
  58. BufferedReader r = new BufferedReader(
  59. new InputStreamReader(
  60. LocalGeocoder.class.getResourceAsStream(resource)
  61. ));
  62. HashMap c2c = new HashMap();
  63. while(true) {
  64. String line = r.readLine();
  65. if (line==null) break;
  66. if (line.contains("Gaza")) {
  67. System.out.println(line);
  68. }
  69. // expects: eg | data-sort-value="10.183"| 10°11′N ||data-sort-value="-68"| 68°00′W || [[Valencia, Venezuela|Valencia]] || [[Carabobo]] || {{VEN}}
  70. String[] bits = line.split("\\|\\|");
  71. if (bits.length < 5) continue;
  72. Pattern pcoord = Pattern.compile("(\\d+)°(\\d+)′(\\d*\"?)([NESW])");
  73. Matcher mlat = pcoord.matcher(bits[0]);
  74. Matcher mlng = pcoord.matcher(bits[1]);
  75. if ( ! mlat.find() || ! mlng.find()) {
  76. continue;
  77. }
  78. double degslat = Double.valueOf(mlat.group(1));
  79. double minlat = Double.valueOf(mlat.group(2));
  80. double degslng = Double.valueOf(mlng.group(1));
  81. double minlng = Double.valueOf(mlng.group(2));
  82. double lat = degslat + (minlat/60);
  83. double lng = degslng + (minlng/60);
  84. String dirlng = mlng.group(4);
  85. String dirlat = mlat.group(4);
  86. if (dirlat.equals("S")) lat = -lat;
  87. if (dirlng.equals("W")) lng = -lng;
  88. String name = bits[2];
  89. String[] splitname = name.split("\\|");
  90. String shortname = splitname[splitname.length-1].trim();
  91. shortname = InternalUtils.trimPunctuation(shortname);
  92. // pick the last bit -- the display name
  93. String country = bits[4];
  94. country = country.replaceFirst("<!--.+-->", ""); // no comments
  95. country = InternalUtils.removePunctuation(country).trim();
  96. if (country.length()<2) {
  97. continue;
  98. }
  99. ISO3166 iso = new ISO3166();
  100. // Convert the (slightly ad-hoc) Wikipedia codes into proper country codes.
  101. String truncateAndHope = country.substring(0, 2);
  102. switch(country) {
  103. case "DZA": truncateAndHope = "Algeria"; break;
  104. case "ARE": truncateAndHope = "United Arab Emirates"; break;
  105. case "BUR": truncateAndHope = "Burma"; break;
  106. case "DEN": case "DNK": truncateAndHope = "Denmark"; break;
  107. case "DJI": truncateAndHope = "Djibouti"; break;
  108. case "EGY": truncateAndHope = "Egypt"; break;
  109. case "GER": truncateAndHope = "Germany"; break;
  110. case "SWE": truncateAndHope = "Sweden"; break;
  111. case "KAZ": truncateAndHope = "Kazakhstan"; break;
  112. case "MAS": truncateAndHope = "Malaysia"; break;
  113. case "MRT": truncateAndHope = "Mauritania"; break;
  114. case "POL": truncateAndHope = "Poland"; break;
  115. case "SIN": truncateAndHope = "Singapore"; break;
  116. case "SAM": truncateAndHope = "Samoa"; break;
  117. case "SOM": truncateAndHope = "Somalia"; break;
  118. case "IRQ": truncateAndHope = "Iraq"; break;
  119. case "ISR": case "ISR Disputed": truncateAndHope = "Israel"; break;
  120. case "PSE": case "PLE": truncateAndHope = "Palestine"; break;
  121. case "COM": truncateAndHope = "Comoros"; break;
  122. case "CRO": truncateAndHope = "Croatia"; break;
  123. case "PRC": truncateAndHope = "China"; break;
  124. case "TUN": truncateAndHope = "Tunisia"; break;
  125. case "TUR": truncateAndHope = "Turkey"; break;
  126. case "POR": truncateAndHope = "Portugal"; break;
  127. case "KOR": truncateAndHope = "Korea"; break;
  128. case "HAI": truncateAndHope = "Haiti"; break;
  129. case "JAM": truncateAndHope = "Jamaica"; break;
  130. case "ZIM": truncateAndHope = "Zimbabwe"; break;
  131. }
  132. String ccode = iso.getCountryCode(truncateAndHope);
  133. if (ccode==null) {
  134. continue;
  135. }
  136. String country2 = iso.getEverydayName(ccode);
  137. c2c.put(country, country2+" from "+name);
  138. Location posn = new Location(lat, lng);
  139. SimplePlace sp = new SimplePlace(shortname, null, ccode);
  140. sp.setGeocoder(getClass());
  141. sp.type = IPlace.TYPE_CITY;
  142. sp.centroid = posn;
  143. places.add(sp);
  144. // name lookup map
  145. String cn = canonical(shortname);
  146. if (cn.isEmpty()) continue; // shouldn't happen but best to be safe
  147. if (canonDesc2place.containsKey(cn)) {
  148. // System.out.println(canonDesc2place.get(cn));
  149. } else {
  150. canonDesc2place.put(cn, sp);
  151. }
  152. }
  153. // System.out.println(Printer.toString(c2c, "\n", ": "));
  154. }
  155. private void loadCSV(String resource, Map<String, List<String>> code2names) throws IOException {
  156. BufferedReader r = new BufferedReader(
  157. new InputStreamReader(
  158. LocalGeocoder.class.getResourceAsStream(resource)
  159. ));
  160. loadCSV(r, code2names);
  161. }
  162. /**
  163. * @param location
  164. * @return
  165. * @throws PlaceNotFoundException
  166. * This happens quite a bit due to not-unique from the crude
  167. * bounding-box algorithm.
  168. */
  169. public IPlace getPlace(Location location) throws PlaceNotFoundException {
  170. GeoCodeQuery query = new GeoCodeQuery().setLocation(location);
  171. return getBestPlace(query);
  172. }
  173. @Override
  174. public IPlace getPlace(String locationDescription) {
  175. Map<IPlace, Double> places = getPlace(new GeoCodeQuery(locationDescription));
  176. if (places.isEmpty()) return null;
  177. return InternalUtils.getBest(places);
  178. }
  179. @Override
  180. public Map<IPlace, Double> getPlace(GeoCodeQuery query) {
  181. String locnDesc = query.desc;
  182. Location locn = query.locn;
  183. if (locn==null) locn = Location.parse(locnDesc);
  184. if (locn!=null) {
  185. Map<IPlace, Double> pmap = getPlace2(locn);
  186. return pmap;
  187. }
  188. // by name?
  189. String ld = canonical(locnDesc);
  190. IPlace _place = canonDesc2place.get(ld);
  191. if (_place!=null) {
  192. return Collections.singletonMap(_place, 0.8);
  193. }
  194. throw new PlaceNotFoundException(locnDesc);
  195. }
  196. private Map<IPlace, Double> getPlace2(Location locn) {
  197. // 0? Special case 'cos it almost certainly means unset
  198. if (locn.latitude==0 && locn.longitude==0) {
  199. return Collections.emptyMap(); // TODO Should we have a North Pole constant for this?
  200. }
  201. // check the bounding boxes
  202. List<IPlace> possible = new ArrayList();
  203. // TODO a quad-tree??
  204. // 30km
  205. Dx cityRadius = new Dx(40000);
  206. for(IPlace place : places) {
  207. BoundingBox bbox = place.getBoundingBox();
  208. if (bbox==null) {
  209. if (place.getCentroid()==null) continue;
  210. // city check
  211. Dx dist = locn.distance(place.getCentroid());
  212. if (dist.isShorterThan(cityRadius)) {
  213. return Collections.singletonMap(place, 0.98);
  214. }
  215. continue;
  216. }
  217. if (bbox.contains(locn)) {
  218. possible.add(place);
  219. }
  220. }
  221. if (possible.size()==0) {
  222. throw new PlaceNotFoundException(locn.toString());
  223. }
  224. Map<IPlace, Double> map = new HashMap();
  225. for(IPlace p : possible) {
  226. map.put(p, 0.95);
  227. }
  228. return map;
  229. }
  230. /**
  231. * @param locnDesc
  232. * @return place or null
  233. */
  234. public IPlace getPlaceLenient(String locnDesc) {
  235. // Is it a longitude/latitude pair?
  236. // NB: this is here to distinguish the lat/long are not-unique exception
  237. Location locn = Location.parse(locnDesc);
  238. if (locn!=null) {
  239. return getPlace(locn);
  240. }
  241. // OK - try the normal service
  242. try {
  243. IPlace place = getPlace(locnDesc);
  244. if (place!=null) return place;
  245. } catch (PlaceNotFoundException e) {
  246. // we're not beat yet
  247. }
  248. // The lenient bit: by a part of name?, e.g. London, UK
  249. IPlace place = getPlaceLenient2(locnDesc);
  250. return place;
  251. }
  252. @Override
  253. public Boolean matches(GeoCodeQuery query, IPlace place) {
  254. // TODO use GIS to do better! !st develop test cases.
  255. return InternalUtils.geoMatch(query, place);
  256. }
  257. /**
  258. * The lenient part of {@link #getPlaceLenient(String)}
  259. * @param locnDesc
  260. * @return place or null
  261. */
  262. public SimplePlace getPlaceLenient2(String locnDesc) {
  263. String ld = canonical(locnDesc);
  264. // check each place as a word in locnDesc
  265. for(SimplePlace p : places) {
  266. // cn should always be a meaningful string
  267. for(String name : p.getNames()) {
  268. String cn = canonical(name);
  269. if (cn.isEmpty()) continue;
  270. Pattern regex;
  271. // We need to be careful with codes, which can accidentally
  272. // be a part of a longer word. Oman/Uman is also problematic
  273. if (cn.length() < 5) {
  274. regex = Pattern.compile("\\b"+cn+"\\b", Pattern.CASE_INSENSITIVE);
  275. } else {
  276. // Allow within-word matches, e.g. american
  277. regex = Pattern.compile("\\b"+cn, Pattern.CASE_INSENSITIVE);
  278. }
  279. if (regex.matcher(ld).find()) {
  280. return p;
  281. }
  282. }
  283. }
  284. return null;
  285. }
  286. String canonical(String locnDesc) {
  287. if (locnDesc==null) return null;
  288. // mangle it for easier matching
  289. // NB: stripping out spaces proved too much -- it led to false matches
  290. // e.g. with Oman/Uman appearing inside strings
  291. return InternalUtils.toCanonical(locnDesc);
  292. }
  /**
   * Build a geocoder from the supplied csv data only: none of the bundled
   * data is loaded, and no alternate country names are applied.
   *
   * @param csv rows in the format read by {@link #loadCSV(BufferedReader, Map)}
   * @throws IOException if the data cannot be read
   */
  public LocalGeocoder(BufferedReader csv) throws IOException {
    this.loadCSV(csv, null);
  }
  296. /**
  297. *
  298. * @param csv
  299. * @param code2names Alternate names -- can be null
  300. * @throws IOException
  301. */
  302. void loadCSV(BufferedReader csv, Map<String, List<String>> code2names) throws IOException {
  303. // expects: country-code, place-name(s) ; separated, n, e, s, w, lat?, lng?
  304. while(true) {
  305. String line = csv.readLine();
  306. if (line==null) break;
  307. String[] bits = line.split("\\|");
  308. if (bits.length < 2) continue;
  309. String[] names = bits[1].trim().split(";");
  310. String country = bits[0].trim();
  311. SimplePlace sp;
  312. if (bits.length < 5) {
  313. // no geometry :(
  314. sp = new SimplePlace(names[0], null, country);
  315. sp.setGeocoder(getClass());
  316. } else {
  317. Location northEast = new Location(Double.valueOf(bits[2]),Double.valueOf(bits[3]));
  318. Location southWest = new Location(Double.valueOf(bits[4]),Double.valueOf(bits[5]));
  319. BoundingBox bbox = new BoundingBox(northEast, southWest);
  320. sp = new SimplePlace(names[0], bbox, country);
  321. sp.setGeocoder(getClass());
  322. }
  323. if (bits.length > 6) {
  324. Location centroid = new Location(Double.valueOf(bits[6]),Double.valueOf(bits[7]));
  325. sp.centroid = centroid;
  326. }
  327. loadCSV2_altNames(names, sp, code2names);
  328. places.add(sp);
  329. // name lookup map
  330. for(String name : sp.getNames()) {
  331. String cn = canonical(name);
  332. if (cn.isEmpty()) continue; // shouldn't happen but best to be safe
  333. canonDesc2place.put(cn, sp);
  334. }
  335. }
  336. csv.close();
  337. }
  338. private void loadCSV2_altNames(String[] names, SimplePlace sp, Map<String,List<String>> countryNames) {
  339. // alternative names
  340. if (names.length > 1) {
  341. sp.setAlternativeNames(Arrays.asList(names));
  342. return;
  343. }
  344. ISO3166 iso3166 = new ISO3166();
  345. if (countryNames!=null && iso3166.getCountryCode(sp.getName())!=null)
  346. { // it's a country -- give it the known alternative names
  347. String ccode = iso3166.getCountryCode(sp.getName());
  348. List<String> cNames = countryNames.get(ccode);
  349. if (cNames !=null) sp.setAlternativeNames(cNames);
  350. }
  351. }
  352. Map<String,List<String>> loadISO3166NameData() throws IOException {
  353. // load from file
  354. InputStream strm = ISO3166.class
  355. .getResourceAsStream("iso-3166-country-codes.csv");
  356. Reader _reader = new InputStreamReader(strm, "UTF8");
  357. BufferedReader reader = new BufferedReader(_reader);
  358. String line = reader.readLine(); // discard the header line
  359. Map<String, List<String>> code2names = new HashMap();
  360. while(true) {
  361. line = reader.readLine();
  362. if (line==null) break;
  363. String[] bits = line.split("\t");
  364. String code = bits[1].toUpperCase();
  365. List<String> names = code2names.get(code);
  366. if (names==null) {
  367. names = new ArrayList();
  368. code2names.put(code, names);
  369. }
  370. names.add(bits[0]);
  371. // alternative names?
  372. if (bits.length > 5 && ! bits[5].isEmpty()) {
  373. String[] alternateNames = bits[5].split(";");
  374. for (String alt : alternateNames) {
  375. names.add(alt);
  376. }
  377. }
  378. }
  379. return code2names;
  380. }
  381. /**
  382. * Using a list & scanning with regexes.
  383. * Not terribly efficient -- we should build a big word-based index
  384. * instead.
  385. */
  386. List<SimplePlace> places = new ArrayList();
  387. /**
  388. * @param gq
  389. * @return Best guess for the query.
  390. * @throws PlaceNotFoundException
  391. */
  392. public IPlace getBestPlace(GeoCodeQuery gq) {
  393. Map<IPlace, Double> qplaces = getPlace(gq);
  394. IPlace best = null;
  395. double bestScore = Double.NEGATIVE_INFINITY;
  396. for (IPlace p : qplaces.keySet()) {
  397. Double s = qplaces.get(p);
  398. if (s != null && s > bestScore) {
  399. best = p;
  400. bestScore = s;
  401. }
  402. }
  403. return best;
  404. }
  405. }