PageRenderTime 98ms CodeModel.GetById 15ms RepoModel.GetById 0ms app.codeStats 0ms

/Elecciones/ONPEcrawler.py

https://github.com/PuercoPop/EleccionesPeru
Python | 169 lines | 163 code | 3 blank | 3 comment | 0 complexity | c5b6fc9ed3973fca07d03ac456650cf3 MD5 | raw file
  1. # -*- coding: utf-8 -*-
  2. from urllib import urlencode
  3. from urllib2 import Request, urlopen
  4. from BeautifulSoup import BeautifulSoup
  5. import Elecciones.models as m
  6. from django.core.management import setup_environ
  7. import settings
  8. setup_environ(settings)
  9. class ONPEcrawler():
  10. def __init__(self, url):
  11. #http://www.web.onpe.gob.pe/modElecciones/elecciones/elecciones2011/1ravuelta/onpe/congreso/rep_acta_cong.php
  12. self.url = url
  13. self.departamentos = { 'Lima':'140000' }
  14. """
  15. self.departamentos = {'Amazonas':'010000',
  16. 'Ancash':'020000',
  17. 'Apurimac':'030000',
  18. 'Arequipa':'040000',
  19. 'Ayacucho':'050000',
  20. 'Cajamarca':'060000',
  21. 'Callao':'240000',
  22. 'Cusco':'070000',
  23. 'Huancavelica':'080000',
  24. 'Huanuco':'090000',
  25. 'Ica':'100000',
  26. 'Junin':'110000',
  27. 'La Libertad':'120000',
  28. 'Lambayeque':'130000',
  29. 'Lima':'140000',
  30. 'Loreto':'150000',
  31. 'Madre de Dios':'160000',
  32. 'Moquegua':'170000',
  33. 'Pasco':'180000',
  34. 'Piura':'190000',
  35. 'Puno':'200000',
  36. 'San Martin':'210000',
  37. 'Tacna':'220000',
  38. 'Tumbes':'230000',
  39. 'Ucayali':'250000'}
  40. """
  41. def seed_tree(self):
  42. for departamento in self.departamentos:
  43. #Save into models
  44. depar = m.UbiGeo.objects.get_or_create(nombre=departamento, tipo='departamento', post_code=self.departamentos[departamento])
  45. def make_tree(self):
  46. for departamento in m.UbiGeo.objects.filter(tipo='departamento'):
  47. req = Request( self.url + 'extras/provincias.php',
  48. urlencode( {'elegido': departamento.post_code } ))
  49. f = urlopen( req )
  50. soup = BeautifulSoup( f.read(),
  51. convertEntities=BeautifulSoup.HTML_ENTITIES)
  52. f.close()
  53. for item in soup.findAll('option'):
  54. if item.string is not None:
  55. #item.string = name ie. u'Amazonas'
  56. #item['value'] = post_code ie. 19291
  57. (prov,created) = m.UbiGeo.objects.get_or_create(nombre=item.string, tipo='provincia', parent=departamento, post_code=item['value'])
  58. #Ahora hacer query a los Ubigeo que tengan el departamento de parent
  59. for provincia in m.UbiGeo.objects.filter( parent=departamento ):
  60. req = Request( self.url + 'extras/distritos.php',
  61. urlencode( {'elegido': provincia.post_code }))
  62. f = urlopen( req )
  63. soup = BeautifulSoup( f.read(),
  64. convertEntities=BeautifulSoup.HTML_ENTITIES)
  65. f.close()
  66. for item in soup.findAll('option'):
  67. if item.string is not None:
  68. (dist,created) = m.UbiGeo.objects.get_or_create(nombre=item.string, tipo='distrito', parent=provincia, post_code=item['value'] )
  69. for distrito in m.UbiGeo.objects.filter( parent=provincia ):
  70. req = Request( self.url + 'extras/locales.php',
  71. urlencode( {'elegido': distrito.post_code}))
  72. f=urlopen(req)
  73. soup = BeautifulSoup( f.read(),
  74. convertEntities=BeautifulSoup.HTML_ENTITIES)
  75. f.close()
  76. for item in soup.findAll('option'):
  77. if item.string is not None:
  78. (local,created) = m.UbiGeo.objects.get_or_create(nombre=item.string, tipo='local', parent=distrito, post_code=item['value'])
  79. for local in m.UbiGeo.objects.filter( parent=distrito ):
  80. post_data = dict()
  81. post_data['tipo_consulta1'] = 'UBIGEO'
  82. post_data['cnume_acta'] = ''
  83. post_data['ambito1'] = 'P'
  84. post_data['dpto'] = local.parent.parent.parent.post_code
  85. post_data['prov'] = local.parent.parent.post_code
  86. post_data['dist'] = local.parent.post_code
  87. post_data['local'] = local.post_code
  88. post_data['estado'] = 'T'
  89. post_data['continente'] = ''
  90. post_data['pais'] = ''
  91. post_data['ciudad'] = ''
  92. post_data['embajada'] = ''
  93. post_data['estado2'] = 'T'
  94. req = Request( self.url + 'extras/buscar_ubigeo_actas.php',
  95. urlencode( post_data ))
  96. f = urlopen( req )
  97. soup = soup.BeautifulSoup( f.read(),
  98. convertEntities=BeautifulSoup.HTML_ENTITIES)
  99. f.close()
  100. ### SE GUARDA EL ACTA
  101. for tr in soup.findAll('tr'):
  102. if tr.findAll('td'):
  103. if len( tr.findAll('td')) == 1:
  104. print "No se encontrarcon actas con los siguientes para", post_data
  105. pass
  106. else:
  107. acta = m.Info_Electoral( num_mesa=tr.findAll('td')[1].string,
  108. estado=tr.findAll('td')[3].string,
  109. local=local,
  110. distrito=distrito,
  111. provincia=provincia,
  112. departamento=departamento)
  113. acta.save()
  114. print "Bajando Acta Num: ", tr.findAll('td')[4].find('a')['href']
  115. def search_tree(departamento=None, provincia=None, distrito=None, local=None, num_mesa=None ):
  116. pass
  117. def test(self):
  118. local = m.UbiGeo.objects.filter( tipo='local' )[0]
  119. post_data = dict()
  120. post_data['tipo_consulta1'] = 'UBIGEO'
  121. post_data['cnume_acta'] = ''
  122. post_data['ambito1'] = 'P'
  123. post_data['dpto'] = str(local.parent.parent.parent.post_code)
  124. post_data['prov'] = str(local.parent.parent.post_code)
  125. post_data['dist'] = str(local.parent.post_code)
  126. post_data['local'] = str(local.post_code)
  127. post_data['estado'] = 'T'
  128. post_data['continente'] = ''
  129. post_data['pais'] = ''
  130. post_data['ciudad'] = ''
  131. post_data['embajada'] = ''
  132. post_data['estado2'] = 'T'
  133. print post_data
  134. print local
  135. print self.url + 'extras/buscar_ubigeo_actas.php'
  136. req = Request( self.url + 'extras/buscar_ubigeo_actas.php',
  137. urlencode( post_data ))
  138. f = urlopen( req )
  139. soup = BeautifulSoup( f.read(),
  140. convertEntities=BeautifulSoup.HTML_ENTITIES)
  141. return (f, soup)
  142. if __name__ == '__main__':
  143. #Congreso :"http://www.web.onpe.gob.pe/modElecciones/elecciones/elecciones2011/1ravuelta/onpe/congreso/"
  144. #Ejemplo Acta URL: http://www.web.onpe.gob.pe/modElecciones/elecciones/elecciones2011/1ravuelta/onpe/congreso/rep_mesas_det_cong.php?cnume_acta=240245
  145. crawler = ONPEcrawler(url = "http://www.web.onpe.gob.pe/modElecciones/elecciones/elecciones2011/1ravuelta/onpe/congreso/" )
  146. crawler.seed_tree()
  147. #crawler.make_tree()