
/wyominglib.py

https://gitlab.com/rvalenzuela/wyominglib
  1. """
  2. Download and parse wyoming soundings into
  3. pandas DataFrames, then store to HDF5 file
  4. Original code: Philip Austin
  5. Source: https://github.com/phaustin/A405
  6. Raul Valenzuela
  7. raul.valenzuela@colorado.edu
  8. Examples:
  9. 1)
  10. import wyominglib as wl
  11. years = np.arange(2000,2003)
  12. for yr in years:
  13. wl.download_wyoming(region='samer',
  14. station=dict(name='ptomnt',number='85799'),
  15. out_directory='/home/raul',
  16. year=yr)
  17. 2)
  18. import wyominglib as wl
  19. wl.download_wyoming(region='samer',
  20. station=dict(name='puerto_montt',number='85799'),
  21. out_directory='/home/raul',
  22. dates=['2001-01-01 00:00','2001-01-10 12:00'])
  23. 3)
  24. import wyominglib as wl
  25. wl.download_wyoming(region='samer',
  26. station=dict(name='puerto_montt',number='85799'),
  27. out_directory='/home/raul',
  28. date='2001-01-01 00:00')
  29. """
import re
import struct
import sys
import urllib3
import h5py
import numpy as np
import pandas as pd
from bs4 import BeautifulSoup
from constants import constants as con
from thermlib import find_esat
# We need to parse a set of lines that look like this:
# Station number: 82965
# Observation time: 110201/0000
# Station latitude: -9.86
# Station longitude: -56.10
# Station elevation: 288.0
# Showalter index: -0.37
# Lifted index: -0.68
# Here's a regular expression that does that:
# Note: the raw string keeps \n and \s as regex escapes; a plain string
# would turn \n into literal newlines, which VERBOSE mode silently ignores
re_text = r"""
.*Station\snumber\:\s(.+?)\n
\s+Observation\stime\:\s(.+?)\n
\s+Station\slatitude\:\s(.+?)\n
\s+Station\slongitude\:\s(.+?)\n
\s+Station\selevation\:\s(.+?)\n
.*
"""
#
# DOTALL says: make . include \n
# VERBOSE says: ignore whitespace and comments within the regular expression
# to help make the expression more readable
#
header_re = re.compile(re_text, re.DOTALL | re.VERBOSE)
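
# A minimal sketch of how header_re behaves on a header like the one quoted
# above (the sample text is illustrative, with the indentation the Wyoming
# pages use):
#   >>> sample = ("Station number: 82965\n"
#   ...           "  Observation time: 110201/0000\n"
#   ...           "  Station latitude: -9.86\n"
#   ...           "  Station longitude: -56.10\n"
#   ...           "  Station elevation: 288.0\n"
#   ...           "  Showalter index: -0.37\n")
#   >>> header_re.match(sample).groups()
#   ('82965', '110201/0000', '-9.86', '-56.10', '288.0')
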
def parse_header(header_text):
    """
    Read a header returned by make_frames and
    return station information

    Parameters
    ----------
    header_text : str
        string containing wyoming header read with make_frames

    Returns
    -------
    the_id : str
        5 digit sounding id
    lat : float
        latitude (degrees North)
    lon : float
        longitude (degrees east)
    elev : float
        station elevation (meters)
    """
    header_text = header_text.strip()
    the_match = header_re.match(header_text)
    try:
        the_id, string_time, lat, lon, elev = the_match.groups()
    except AttributeError:
        print('parse failure with: \n', header_text)
        raise
    # some soundings follow elevation with Showalter index, not Lifted index
    elev = elev.split('\n')[0]
    lat = float(lat)
    lon = float(lon)
    elev = float(elev)
    day, hour = string_time.strip().split('/')
    # the date fields are parsed here but only station data is returned
    # year = int(day[:2]) + 2000
    # month = int(day[2:4])
    day = int(day[4:6])
    # minute = int(hour[2:])
    hour = int(hour[:2])
    return the_id, lat, lon, elev
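
# Continuing the sketch above (values illustrative):
#   >>> parse_header(sample)
#   ('82965', -9.86, -56.1, 288.0)
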
def parse_data(data_text):
    """
    Read a single sounding into a dataframe

    Parameters
    ----------
    data_text : str
        sounding text

    Returns
    -------
    df_out : dataframe
        11 column data frame with sounding values
    unit_names : list
        list of strings with name of units of each column
    """
    """
    read lines with 11 numbers and convert to dataframe
    data_text looks like:
    -----------------------------------------------------------------------------
       PRES   HGHT   TEMP   DWPT   RELH   MIXR   DRCT   SKNT   THTA   THTE   THTV
        hPa     m      C      C      %    g/kg    deg   knot     K      K      K
    -----------------------------------------------------------------------------
     1000.0    100
      979.0    288   24.0   23.0     94  18.45      0      0  299.0  353.0  302.2
      974.0    333   25.2   21.1     78  16.46    348      0  300.6  349.1  303.6
      932.0    719   24.0   16.0     61  12.42    243      3  303.2  340.3  305.4
      925.0    785   23.4   15.4     61  12.03    225      3  303.2  339.2  305.4
    """
    all_lines = data_text.strip().split('\n')
    count = 0
    theLine = all_lines[count]
    try:
        # scan down to the column-header line, matching on the column
        # names regardless of the exact spacing between them
        while ' '.join(theLine.split()).find('PRES HGHT TEMP DWPT') < 0:
            count += 1
            theLine = all_lines[count]
        header_names = all_lines[count].lower().split()
    except IndexError:
        print("no column header line found in sounding")
        sys.exit(1)
    count += 1  # go to unit names
    unit_names = all_lines[count].split()
    count += 2  # skip a row of ------
    data_list = []
    # Note: a missing mixing ratio (mixr) could be filled from the dewpoint:
    #   es = find_esat(dwpt + 273.15) * 0.01      # vapor pressure, hPa
    #   mixr = con.eps * es / (pres - es) * 1.e3  # g/kg
    # each data line is a fixed-width record of eleven 7-character fields
    fmtstring = '7s 7s 7s 7s 7s 7s 7s 7s 7s 7s 7s'
    fieldstruct = struct.Struct(fmtstring)
    parse = fieldstruct.unpack_from
    for r in range(count, len(all_lines) - 1):
        the_line = all_lines[r].strip('\r\n')
        # struct unpacks bytes in Python 3; pad short lines out to the full
        # record width so unpack_from has enough bytes to read
        raw = the_line.encode('ascii').ljust(fieldstruct.size)
        dataFields = [field.decode('ascii') for field in parse(raw)]
        # blank (all-space) fields become NaN, everything else a float
        dataFields = [np.nan if not field.strip() else float(field)
                      for field in dataFields]
        data_list.append(dataFields)
    df_out = pd.DataFrame.from_records(data_list,
                                       columns=header_names)
    return df_out, unit_names
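
# Sketch of parse_data on the text of a data <pre> block (column names and
# units as they appear in the sample above):
#   df, units = parse_data(data_text)
#   list(df.columns) -> ['pres', 'hght', 'temp', 'dwpt', 'relh', 'mixr',
#                        'drct', 'sknt', 'thta', 'thte', 'thtv']
#   units            -> ['hPa', 'm', 'C', 'C', '%', 'g/kg', 'deg', 'knot',
#                        'K', 'K', 'K']
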
def make_frames(html_doc):
    """
    turn an html page retrieved from the wyoming site into a dataframe

    Parameters
    ----------
    html_doc : str or bytes
        web page from wyoming upperair sounding site
        http://weather.uwyo.edu/cgi-bin/sounding
        retrieved by the urllib3 module

    Returns
    -------
    attr_dict : dict
        dictionary with keys
        ['header', 'site_id', 'longitude', 'latitude',
         'elevation', 'units']
    df : dataframe
        sounding as a dataframe (NaN placeholder if no sounding found)
    resp : str
        'OK' if a sounding was found, 'NO SOUNDING' otherwise
    """
    soup = BeautifulSoup(html_doc, 'html.parser')
    keep = list()
    pre = soup.find_all('pre')
    if len(pre) > 0:
        for item in pre:
            keep.append(item.text)
        # soundings come in pairs of <pre> blocks: data first, then header
        evens = np.arange(0, len(keep), 2)
        for count in evens:
            df, units = parse_data(keep[count])
            the_id, lat, lon, elev = parse_header(keep[count + 1])
        header = soup.find_all('h2')[0].text
        attr_dict = dict(units=';'.join(units), site_id=the_id,
                         latitude=lat, longitude=lon, elevation=elev,
                         header=header)
        resp = 'OK'
    else:
        attr_dict = dict()
        df = pd.DataFrame(np.nan,
                          index=[0],
                          columns=['data'])
        resp = 'NO SOUNDING'
    return attr_dict, df, resp
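
# Sketch of make_frames on a fetched page (the query string mirrors the one
# download_wyoming builds below; station and date values are placeholders):
#   http = urllib3.PoolManager()
#   response = http.request('GET',
#                           'http://weather.uwyo.edu/cgi-bin/sounding?'
#                           'region=samer&TYPE=TEXT%3ALIST&YEAR=2001'
#                           '&MONTH=01&FROM=0100&TO=0100&STNM=85799')
#   attrs, df, status = make_frames(response.data)
#   if status == 'OK':
#       print(attrs['site_id'], df.shape)
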
def download_wyoming(region=None, station=None, year=None,
                     date=None, dates=None, out_directory=None):
    """
    Download soundings from http://weather.uwyo.edu/cgi-bin/sounding
    and create an hdf file with soundings and attributes;
    see the notebook newsoundings.ipynb for usage
    """
    st_num = station['number']
    st_name = station['name']
    url_template = ("http://weather.uwyo.edu/cgi-bin/sounding?"
                    "region={region:s}"
                    "&TYPE=TEXT%3ALIST"
                    "&YEAR={year:s}"
                    "&MONTH={month:s}"
                    "&FROM={start:s}"
                    "&TO={stop:s}"
                    "&STNM={station:s}")
    name_template = out_directory + '/wyoming_{0}_{1}_{2}.h5'
    # Parse dates to download and build the output h5 name
    if date and year is None and dates is None:
        # a single sounding
        dates = pd.date_range(start=date,
                              periods=1,
                              freq='12H')
        dstr = dates[0].strftime('%Y%m%d%H')
        out_name = name_template.format(region, st_name, dstr)
    elif dates and date is None and year is None:
        # soundings between two dates
        date0 = dates[0]
        date1 = dates[1]
        dates = pd.date_range(start=date0,
                              end=date1,
                              freq='12H')
        dstr = (dates[0].strftime('%Y%m%d%H-') +
                dates[-1].strftime('%Y%m%d%H'))
        out_name = name_template.format(region, st_name, dstr)
    else:
        # a full year of soundings
        yr = str(year)
        dates = pd.date_range(start=yr + '-01-01 00:00',
                              end=yr + '-12-31 12:00',
                              freq='12H')
        out_name = name_template.format(region, st_name, yr)
    # start downloading for each date
    http = urllib3.PoolManager()
    with pd.HDFStore(out_name, 'w') as store:
        for date in dates:
            values = dict(region=region,
                          year=date.strftime('%Y'),
                          month=date.strftime('%m'),
                          start=date.strftime('%d%H'),
                          stop=date.strftime('%d%H'),
                          station=st_num)
            url = url_template.format(**values)
            print(url)
            response = http.request('GET', url)
            html_doc = response.data
            at_dict, sounding_df, resp = make_frames(html_doc)
            if resp == 'OK':
                attr_dict = at_dict
            else:
                attr_dict = dict()
            print_str = 'Read/Write sounding date {}: {}'
            print_date = date.strftime('%Y-%m-%d_%HZ')
            print(print_str.format(print_date, resp))
            thetime = date.strftime("Y%Y%m%dZ%H")
            store.put(thetime, sounding_df, format='table')
    attr_dict['history'] = "written by wyominglib.py"
    key_list = ['header', 'site_id', 'longitude', 'latitude',
                'elevation', 'units', 'history']
    with h5py.File(out_name, 'a') as f:
        print('Writing HDF5 file attributes')
        for key in key_list:
            try:
                print('writing key, value: ', key, attr_dict[key])
                f.attrs[key] = attr_dict[key]
            except KeyError:
                pass
    print('hdf file {} written'.format(out_name))
    print('reading attributes: ')
    with h5py.File(out_name, 'r') as f:
        keys = f.attrs.keys()
        for key in keys:
            try:
                print(key, f.attrs[key])
            except OSError:
                pass
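
if __name__ == '__main__':
    # Minimal usage sketch mirroring example 3 in the module docstring;
    # the output directory is a placeholder, and running this fetches one
    # sounding from the Wyoming server.
    download_wyoming(region='samer',
                     station=dict(name='puerto_montt', number='85799'),
                     out_directory='/tmp',
                     date='2001-01-01 00:00')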