PageRenderTime 33ms CodeModel.GetById 24ms RepoModel.GetById 1ms app.codeStats 0ms

/geopy/geocoders/wiki_semantic.py

https://bitbucket.org/shishirk/geopy
Python | 108 lines | 85 code | 20 blank | 3 comment | 20 complexity | 27eced587d69a0e2cdea8f322e69f233 MD5 | raw file
  1. import xml.dom.minidom
  2. from geopy.geocoders.base import Geocoder
  3. from geopy.point import Point
  4. from geopy.location import Location
  5. from geopy import util
  6. try:
  7. from BeautifulSoup import BeautifulSoup
  8. except ImportError:
  9. util.logger.warn("BeautifulSoup was not found. " \
  10. "The SemanticMediaWiki geocoder will not work.")
  11. try:
  12. set
  13. except NameError:
  14. from sets import Set as set
  15. class SemanticMediaWiki(Geocoder):
  16. def __init__(self, format_url, attributes=None, relations=None,
  17. prefer_semantic=False, transform_string=None):
  18. self.format_url = format_url
  19. self.attributes = attributes
  20. self.relations = relations
  21. self.prefer_semantic = prefer_semantic
  22. self.transform_string = transform_string
  23. def get_url(self, string):
  24. return self.format_url % self.transform_string(string)
  def parse_rdf_link(self, page, mime_type='application/rdf+xml'):
      """Parse the URL of the RDF link from the <head> of ``page``.

      ``page`` is handed to ``BeautifulSoup`` for parsing. The first
      ``<link rel="alternate">`` in the head whose ``type`` equals
      ``mime_type`` is used.

      Returns the link's ``href``, or ``None`` when the page has no
      head, no matching link, or the link has no (or an empty) ``href``.
      """
      soup = BeautifulSoup(page)
      head = soup.head
      # A document without <head> would otherwise fail with
      # AttributeError on ``None.find``.
      if head is None:
          return None
      link = head.find('link', rel='alternate', type=mime_type)
      if not link:
          return None
      # ``get`` avoids a KeyError for a link that carries no href.
      return link.get('href') or None
  def parse_rdf_things(self, data):
      """Parse RDF/XML ``data`` and index its ``smw:Thing`` elements.

      ``data`` is the document as a string (it is passed to
      ``xml.dom.minidom.parseString``).

      Returns a ``(thing_map, thing)`` tuple. ``thing_map`` maps each
      thing's ``rdf:about`` value to the ``rdf:resource`` of its first
      ``smw:hasArticle`` element. ``thing`` is the first ``smw:Thing``
      element in document order, or ``None`` when the document has none.
      """
      dom = xml.dom.minidom.parseString(data)
      things = dom.getElementsByTagName('smw:Thing')

      primary = None
      if things:
          primary = things[0]

      thing_map = {}
      # Walk the things back to front so that, when two things share an
      # ``rdf:about`` value, the one earliest in the document wins.
      for thing in things[::-1]:
          if not thing.hasAttribute('rdf:about'):
              continue
          articles = thing.getElementsByTagName('smw:hasArticle')
          # Things without a usable article reference cannot be mapped.
          if not articles or not articles[0].hasAttribute('rdf:resource'):
              continue
          name = thing.getAttribute('rdf:about')
          thing_map[name] = articles[0].getAttribute('rdf:resource')

      return (thing_map, primary)
  def transform_semantic(self, string):
      """Normalize a semantic attribute or relation name.

      Every space becomes an underscore, then ``str.capitalize`` is
      applied to the result (first character upper-cased, the rest
      lower-cased).
      """
      underscored = '_'.join(string.split(' '))
      return underscored.capitalize()
  44. def get_relations(self, thing, relations=None):
  45. if relations is None:
  46. relations = self.relations
  47. for relation in relations:
  48. relation = self.transform_semantic(relation)
  49. for node in thing.getElementsByTagName('relation:' + relation):
  50. resource = node.attributes['rdf:resource'].value
  51. yield (relation, resource)
  52. def get_attributes(self, thing, attributes=None):
  53. if attributes is None:
  54. attributes = self.attributes
  55. for attribute in attributes:
  56. attribute = self.transform_semantic(attribute)
  57. for node in thing.getElementsByTagName('attribute:' + attribute):
  58. value = node.firstChild.nodeValue.strip()
  59. yield (attribute, value)
  def get_thing_label(self, thing):
      """Return what ``util.get_first_text`` finds for ``rdfs:label``
      on ``thing``."""
      label_tag = 'rdfs:label'
      return util.get_first_text(thing, label_tag)
  def geocode_url(self, url, attempted=None):
      """Geocode the wiki page at ``url`` through its RDF export.

      The page is fetched, the RDF link in its head is followed, and the
      configured attributes of the first ``smw:Thing`` are run through
      ``util.parse_geo`` until one gives a latitude and a longitude. If
      none does, the thing's configured relations are followed
      recursively.

      ``attempted`` is the set of URLs already visited; it is created on
      the first call and shared with recursive calls so that cyclic
      relations terminate.

      Returns ``(name, (latitude, longitude))``. Any of the three may be
      None when the page could not be geocoded.
      """
      # ``urlopen`` is not imported at module level; import it here,
      # preferring the Python 2 location the rest of the file targets.
      try:
          from urllib2 import urlopen
      except ImportError:
          from urllib.request import urlopen

      if attempted is None:
          attempted = set()
      # Record this page too, so a relation pointing back here is skipped.
      attempted.add(url)

      util.logger.debug("Fetching %s..." % url)
      page = urlopen(url)
      try:
          # ``parse_rdf_link`` builds its own soup from the page.
          rdf_url = self.parse_rdf_link(page)
      finally:
          page.close()
      if rdf_url is None:
          return (None, (None, None))

      util.logger.debug("Fetching %s..." % rdf_url)
      page = urlopen(rdf_url)
      try:
          # ``parse_rdf_things`` parses a string, not a response object.
          things, thing = self.parse_rdf_things(page.read())
      finally:
          page.close()
      if thing is None:
          return (None, (None, None))

      name = self.get_thing_label(thing)

      # Start from "not found" so the checks below never hit unbound
      # names when the thing has none of the configured attributes.
      latitude = longitude = None
      for attribute, value in self.get_attributes(thing):
          # NOTE(review): assumes util.parse_geo returns a
          # (latitude, longitude) pair, as the original unpacking did.
          latitude, longitude = util.parse_geo(value)
          if None not in (latitude, longitude):
              break

      if None in (latitude, longitude):
          for relation, resource in self.get_relations(thing):
              related_url = things.get(resource, resource)
              if related_url in attempted:  # Avoid cyclic relationships.
                  continue
              attempted.add(related_url)
              found_name, (found_lat, found_lon) = self.geocode_url(
                  related_url, attempted)
              # Only adopt a related page's result when it is complete,
              # so a failed attempt cannot overwrite this page's name.
              if None not in (found_name, found_lat, found_lon):
                  name = found_name
                  latitude, longitude = found_lat, found_lon
                  break

      return (name, (latitude, longitude))