/embersUtils/gsr_processor.py

https://bitbucket.org/sathappanspm/embers · Python · 96 lines · 76 code · 14 blank · 6 comment · 15 complexity · 8a292653b4b62635c9e60a6257348c4d MD5 · raw file

  1. #!/usr/bin/env python
  2. #-*- coding:utf-8 -*-
  3. # vim: ts=4 sts=4 sw=4 tw=79 sta et
  4. """
  5. gsr_processor.py: Read GSR event rows from an Excel workbook and publish each one as a warning message to a queue.
  6. """
  7. __author__ = "Sathappan Muthiah"
  8. __email__ = "sathap1@vt.edu"
  9. __version__ = "0.0.1"
  10. from etool import args, queue
  11. from collections import namedtuple
  12. import xlrd
  13. from datetime import datetime
  14. import re
  15. GSR_TITLES = namedtuple('GSR_Warning', 'eventId, eventSubId, EntryRevisionDate, recordStatus, country, state, city, eventCode, population, date, earliestReportedDate, source, headline, eventDescription, firstRepLink, otherLinks_gss, otherLinks1, otherLinks2, encodingComment')
  16. def format_loc(loc_item):
  17. if len(loc_item) == 0 or loc_item.lower() == 'na' or loc_item.lower() == 'n/a':
  18. return '-'
  19. else:
  20. return loc_item.strip()
  21. def format_str(s):
  22. if isinstance(s, str):
  23. return s.strip().decode('utf-8')
  24. if isinstance(s, unicode):
  25. return s.strip()
  26. return unicode(s).strip()
  27. def format_date(xlDate, datemode):
  28. if isinstance(xlDate, unicode):
  29. return datetime.strptime(xlDate, '%m/%d/%Y').isoformat('T')
  30. year, month, date, hour, minute, second = xlrd.xldate_as_tuple(xlDate, datemode)
  31. date_str = datetime(year, month, date, hour, minute, second).isoformat('T')
  32. return date_str
  33. def create_named_tuple(titles):
  34. title_deDup = []
  35. for k in titles:
  36. if k in title_deDup:
  37. title_deDup.append(k + '2')
  38. else:
  39. title_deDup.append(k)
  40. title_str = ','.join([re.sub('[^a-z0-9]', '', k.lower()) for k in title_deDup])
  41. return namedtuple('GSR_Warning', title_str)
  42. def main(args):
  43. wb = xlrd.open_workbook(args.gsr)
  44. sh = wb.sheet_by_name('V1')
  45. GSR_TITLES = create_named_tuple(sh.row_values(0))
  46. publisher = queue.open(args.pub, 'w', capture=args.noCapture)
  47. for rownum in range(1, sh.nrows):
  48. warning = {}
  49. rowValue = GSR_TITLES._make(sh.row_values(rownum))
  50. warning['embersId'] = str(int(rowValue.eventid))
  51. warning['eventDate'] = format_date(rowValue.date, wb.datemode)
  52. warning['location'] = [format_loc(rowValue.country),
  53. format_loc(rowValue.state), format_loc(rowValue.city)]
  54. warning['model'] = 'GSR'
  55. warning['confidence'] = 1.00
  56. warning['confidenceIsProbability'] = False
  57. warning['eventType'] = format_str(rowValue.eventcode)
  58. if args.type:
  59. if not re.match('%s.*' % args.type, warning['eventType']):
  60. continue
  61. warning['population'] = format_str(rowValue.population)
  62. warning['date'] = format_date(rowValue.earliestreporteddate, wb.datemode)
  63. warning['derivedFrom'] = {
  64. 'derivedIds': [],
  65. 'embersSubId': rowValue.eventsubid,
  66. 'status': rowValue.newssource,
  67. 'headline': rowValue.headline,
  68. 'description': rowValue.eventdescription,
  69. 'firstReportedLink': rowValue.firstreportedlink,
  70. 'gssLink': rowValue.otherlinksgsslink,
  71. 'otherLinks1': rowValue.otherlinks,
  72. 'otherLinks2': rowValue.otherlinks2,
  73. 'geoCorrected': False
  74. }
  75. publisher.write(warning)
  76. if __name__ == "__main__":
  77. ap = args.get_parser()
  78. ap.add_argument('-g', '--gsr', type=str, help='Location of gsr excel sheet')
  79. ap.add_argument('-n', '--noCapture', action='store_false', default=True, help='Location of gsr excel sheet')
  80. ap.add_argument('-t', '--type', help="filter by type, enter the first two digits of the code")
  81. args = ap.parse_args()
  82. main(args)