/embersUtils/gsr_processor.py
https://bitbucket.org/sathappanspm/embers · Python · 96 lines · 76 code · 14 blank · 6 comment · 15 complexity · 8a292653b4b62635c9e60a6257348c4d MD5 · raw file
- #!/usr/bin/env python
- #-*- coding:utf-8 -*-
- # vim: ts=4 sts=4 sw=4 tw=79 sta et
- """
- gsr_processor.py: Read the 'V1' sheet of a GSR Excel workbook, convert each row into a warning message and publish it to a queue.
- """
- __author__ = "Sathappan Muthiah"
- __email__ = "sathap1@vt.edu"
- __version__ = "0.0.1"
- from etool import args, queue
- from collections import namedtuple
- import xlrd
- from datetime import datetime
- import re
# Fixed column layout of a GSR row. Note that main() derives its row type
# from the header row of the sheet rather than from this constant.
GSR_TITLES = namedtuple(
    'GSR_Warning',
    'eventId, eventSubId, EntryRevisionDate, recordStatus, country, state, '
    'city, eventCode, population, date, earliestReportedDate, source, '
    'headline, eventDescription, firstRepLink, otherLinks_gss, otherLinks1, '
    'otherLinks2, encodingComment',
)
def format_loc(loc_item):
    """Normalise one location component (country, state or city).

    Args:
        loc_item: Text value of the location cell, or None.

    Returns:
        The value with surrounding whitespace removed, or '-' when the value
        is missing: None, empty, whitespace only, or the placeholders 'na' /
        'n/a' in any letter case.
    """
    if loc_item is None:
        return '-'
    # Strip before testing so that padded placeholders such as ' N/A ' and
    # whitespace-only cells are recognised as missing values.
    cleaned = loc_item.strip()
    if not cleaned or cleaned.lower() in ('na', 'n/a'):
        return '-'
    return cleaned
def format_str(s):
    """Return *s* as text with surrounding whitespace removed.

    Works on both Python 2 and Python 3.

    Args:
        s: A byte string (stripped, then decoded as UTF-8), a text string
            (stripped), or any other value (converted to text, then
            stripped).

    Returns:
        The stripped text.

    Raises:
        UnicodeDecodeError: If a byte string is not valid UTF-8.
    """
    try:
        text_type = unicode  # Python 2
    except NameError:
        text_type = str  # Python 3: str is already the text type
    if isinstance(s, text_type):
        return s.strip()
    # On Python 2 `bytes` is an alias of `str`, so this is the original
    # byte-string branch; on Python 3 it covers real bytes objects.
    if isinstance(s, bytes):
        return s.strip().decode('utf-8')
    return text_type(s).strip()
def format_date(xlDate, datemode):
    """Convert a spreadsheet date cell to an ISO-8601 timestamp string.

    Works on both Python 2 and Python 3.

    Args:
        xlDate: Either a text date in ``MM/DD/YYYY`` form, or a value that
            ``xlrd.xldate_as_tuple`` can decode.
        datemode: Date mode passed through to ``xlrd.xldate_as_tuple``;
            unused for text dates.

    Returns:
        The date formatted with ``datetime.isoformat('T')``.

    Raises:
        ValueError: If a text date does not match ``MM/DD/YYYY``.
    """
    try:
        string_types = basestring  # Python 2: covers str and unicode
    except NameError:
        string_types = str  # Python 3
    if isinstance(xlDate, string_types):
        # Strip first: strptime rejects surrounding whitespace in the cell.
        return datetime.strptime(xlDate.strip(), '%m/%d/%Y').isoformat('T')
    # xldate_as_tuple yields (year, month, day, hour, minute, second).
    return datetime(*xlrd.xldate_as_tuple(xlDate, datemode)).isoformat('T')
def create_named_tuple(titles):
    """Build a namedtuple type whose fields mirror a sheet's header row.

    Each title is lower-cased and reduced to the characters ``a-z0-9``.
    Titles that sanitise to the same name are disambiguated with a numeric
    suffix: the second occurrence gets ``2``, the third ``3``, and so on.
    Names that are still not valid identifiers (for example a blank header
    cell) are replaced by namedtuple's positional ``_<index>`` names, so the
    type always has exactly one field per title.

    Args:
        titles: Sequence of header cell values.

    Returns:
        A namedtuple class named ``GSR_Warning`` with ``len(titles)`` fields.
    """
    used = set()
    fields = []
    for title in titles:
        # '%s' formatting turns non-text header cells (e.g. numbers) into
        # text while leaving text values untouched.
        base = re.sub('[^a-z0-9]', '', ('%s' % (title,)).lower())
        # De-duplicate on the sanitised name: titles that differ only in
        # case or punctuation would otherwise collide.
        name = base
        suffix = 2
        while name in used:
            name = '%s%d' % (base, suffix)
            suffix += 1
        used.add(name)
        fields.append(name)
    # Pass a list, not a comma-joined string: an empty name would vanish
    # from a joined string and shift every later column by one.
    return namedtuple('GSR_Warning', fields, rename=True)
def main(args):
    """Publish the rows of a GSR workbook's 'V1' sheet as warning messages.

    The first row of the sheet is treated as the header and defines the
    field names; every following row becomes one warning dict written to
    the publish queue.

    Args:
        args: Parsed command-line options. Reads ``gsr`` (workbook path),
            ``pub`` (queue to publish to; not declared in this file,
            presumably supplied by the etool parser), ``noCapture``
            (forwarded as the queue's ``capture`` argument) and ``type``
            (optional pattern; only rows whose event code matches it from
            the start are published).
    """
    wb = xlrd.open_workbook(args.gsr)
    sh = wb.sheet_by_name('V1')
    row_type = create_named_tuple(sh.row_values(0))
    publisher = queue.open(args.pub, 'w', capture=args.noCapture)

    # The filter pattern does not depend on the row: compile it once.
    type_filter = re.compile('%s.*' % args.type) if args.type else None

    for rownum in range(1, sh.nrows):
        row = row_type._make(sh.row_values(rownum))

        # Apply the filter first so excluded rows are skipped before any of
        # their other cells are parsed.
        event_type = format_str(row.eventcode)
        if type_filter is not None and not type_filter.match(event_type):
            continue

        warning = {
            'embersId': str(int(row.eventid)),
            'eventDate': format_date(row.date, wb.datemode),
            'location': [
                format_loc(row.country),
                format_loc(row.state),
                format_loc(row.city),
            ],
            'model': 'GSR',
            'confidence': 1.00,
            'confidenceIsProbability': False,
            'eventType': event_type,
            'population': format_str(row.population),
            'date': format_date(row.earliestreporteddate, wb.datemode),
            'derivedFrom': {
                'derivedIds': [],
                'embersSubId': row.eventsubid,
                # NOTE(review): 'status' is filled from the news-source
                # column -- confirm this mapping is intended.
                'status': row.newssource,
                'headline': row.headline,
                'description': row.eventdescription,
                'firstReportedLink': row.firstreportedlink,
                'gssLink': row.otherlinksgsslink,
                'otherLinks1': row.otherlinks,
                'otherLinks2': row.otherlinks2,
                'geoCorrected': False,
            },
        }
        publisher.write(warning)
if __name__ == "__main__":
    # Script entry point: extend the etool parser with the options main()
    # reads, then run the conversion.
    ap = args.get_parser()
    ap.add_argument('-g', '--gsr', type=str, help='Location of gsr excel sheet')
    # store_false: the flag switches the value passed as capture= to False.
    ap.add_argument('-n', '--noCapture', action='store_false', default=True,
                    help='Open the publish queue with capture=False (default: capture=True)')
    ap.add_argument('-t', '--type', help="filter by type, enter the first two digits of the code")
    # Use a separate name so the imported `args` module is not rebound.
    cli_args = ap.parse_args()
    main(cli_args)