PageRenderTime 48ms CodeModel.GetById 19ms RepoModel.GetById 0ms app.codeStats 1ms

/tests/risk/answer_key.py

https://gitlab.com/lbennett/zipline
Python | 341 lines | 315 code | 10 blank | 16 comment | 1 complexity | ae41e59d62fac20f74d329132ac1ae58 MD5 | raw file
  1. #
  2. # Copyright 2014 Quantopian, Inc.
  3. #
  4. # Licensed under the Apache License, Version 2.0 (the "License");
  5. # you may not use this file except in compliance with the License.
  6. # You may obtain a copy of the License at
  7. #
  8. # http://www.apache.org/licenses/LICENSE-2.0
  9. #
  10. # Unless required by applicable law or agreed to in writing, software
  11. # distributed under the License is distributed on an "AS IS" BASIS,
  12. # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  13. # See the License for the specific language governing permissions and
  14. # limitations under the License.
  15. import datetime
  16. import hashlib
  17. import os
  18. import numpy as np
  19. import pandas as pd
  20. import pytz
  21. import xlrd
  22. import requests
  23. from six.moves import map
  24. def col_letter_to_index(col_letter):
  25. # Only supports single letter,
  26. # but answer key doesn't need multi-letter, yet.
  27. index = 0
  28. for i, char in enumerate(reversed(col_letter)):
  29. index += ((ord(char) - 65) + 1) * pow(26, i)
  30. return index
  31. DIR = os.path.dirname(os.path.realpath(__file__))
  32. ANSWER_KEY_CHECKSUMS_PATH = os.path.join(DIR, 'risk-answer-key-checksums')
  33. ANSWER_KEY_CHECKSUMS = open(ANSWER_KEY_CHECKSUMS_PATH, 'r').read().splitlines()
  34. ANSWER_KEY_FILENAME = 'risk-answer-key.xlsx'
  35. ANSWER_KEY_PATH = os.path.join(DIR, ANSWER_KEY_FILENAME)
  36. ANSWER_KEY_BUCKET_NAME = 'zipline-test_data'
  37. ANSWER_KEY_DL_TEMPLATE = """
  38. https://s3.amazonaws.com/zipline-test-data/risk/{md5}/risk-answer-key.xlsx
  39. """.strip()
  40. LATEST_ANSWER_KEY_URL = ANSWER_KEY_DL_TEMPLATE.format(
  41. md5=ANSWER_KEY_CHECKSUMS[-1])
  42. def answer_key_signature():
  43. with open(ANSWER_KEY_PATH, 'rb') as f:
  44. md5 = hashlib.md5()
  45. buf = f.read(1024)
  46. md5.update(buf)
  47. while buf != b"":
  48. buf = f.read(1024)
  49. md5.update(buf)
  50. return md5.hexdigest()
  51. def ensure_latest_answer_key():
  52. """
  53. Get the latest answer key from a publically available location.
  54. Logic for determining what and when to download is as such:
  55. - If there is no local spreadsheet file, then get the lastest answer key,
  56. as defined by the last row in the checksum file.
  57. - If there is a local spreadsheet file:
  58. -- If the spreadsheet's checksum is in the checksum file:
  59. --- If the spreadsheet's checksum does not match the latest, then grab the
  60. the latest checksum and replace the local checksum file.
  61. --- If the spreadsheet's checksum matches the latest, then skip download,
  62. and use the local spreadsheet as a cached copy.
  63. -- If the spreadsheet's checksum is not in the checksum file, then leave
  64. the local file alone, assuming that the local xls's md5 is not in the list
  65. due to local modifications during development.
  66. It is possible that md5's could collide, if that is ever case, we should
  67. then find an alternative naming scheme.
  68. The spreadsheet answer sheet is not kept in SCM, as every edit would
  69. increase the repo size by the file size, since it is treated as a binary.
  70. """
  71. answer_key_dl_checksum = None
  72. local_answer_key_exists = os.path.exists(ANSWER_KEY_PATH)
  73. if local_answer_key_exists:
  74. local_hash = answer_key_signature()
  75. if local_hash in ANSWER_KEY_CHECKSUMS:
  76. # Assume previously downloaded version.
  77. # Check for latest.
  78. if local_hash != ANSWER_KEY_CHECKSUMS[-1]:
  79. # More recent checksum, download
  80. answer_key_dl_checksum = ANSWER_KEY_CHECKSUMS[-1]
  81. else:
  82. # Assume local copy that is being developed on
  83. answer_key_dl_checksum = None
  84. else:
  85. answer_key_dl_checksum = ANSWER_KEY_CHECKSUMS[-1]
  86. if answer_key_dl_checksum:
  87. res = requests.get(
  88. ANSWER_KEY_DL_TEMPLATE.format(md5=answer_key_dl_checksum))
  89. with open(ANSWER_KEY_PATH, 'wb') as f:
  90. f.write(res.content)
# Get latest answer key on load: a module-import side effect, so the
# AnswerKey construction below always finds a spreadsheet on disk.
ensure_latest_answer_key()
  93. class DataIndex(object):
  94. """
  95. Coordinates for the spreadsheet, using the values as seen in the notebook.
  96. The python-excel libraries use 0 index, while the spreadsheet in a GUI
  97. uses a 1 index.
  98. """
  99. def __init__(self, sheet_name, col, row_start, row_end,
  100. value_type='float'):
  101. self.sheet_name = sheet_name
  102. self.col = col
  103. self.row_start = row_start
  104. self.row_end = row_end
  105. self.value_type = value_type
  106. @property
  107. def col_index(self):
  108. return col_letter_to_index(self.col) - 1
  109. @property
  110. def row_start_index(self):
  111. return self.row_start - 1
  112. @property
  113. def row_end_index(self):
  114. return self.row_end - 1
  115. def __str__(self):
  116. return "'{sheet_name}'!{col}{row_start}:{col}{row_end}".format(
  117. sheet_name=self.sheet_name,
  118. col=self.col,
  119. row_start=self.row_start,
  120. row_end=self.row_end
  121. )
  122. class AnswerKey(object):
  123. INDEXES = {
  124. 'RETURNS': DataIndex('Sim Period', 'D', 4, 255),
  125. 'BENCHMARK': {
  126. 'Dates': DataIndex('s_p', 'A', 4, 254, value_type='date'),
  127. 'Returns': DataIndex('s_p', 'H', 4, 254)
  128. },
  129. # Below matches the inconsistent capitalization in spreadsheet
  130. 'BENCHMARK_PERIOD_RETURNS': {
  131. 'Monthly': DataIndex('s_p', 'R', 8, 19),
  132. '3-Month': DataIndex('s_p', 'S', 10, 19),
  133. '6-month': DataIndex('s_p', 'T', 13, 19),
  134. 'year': DataIndex('s_p', 'U', 19, 19),
  135. },
  136. 'BENCHMARK_PERIOD_VOLATILITY': {
  137. 'Monthly': DataIndex('s_p', 'V', 8, 19),
  138. '3-Month': DataIndex('s_p', 'W', 10, 19),
  139. '6-month': DataIndex('s_p', 'X', 13, 19),
  140. 'year': DataIndex('s_p', 'Y', 19, 19),
  141. },
  142. 'ALGORITHM_PERIOD_RETURNS': {
  143. 'Monthly': DataIndex('Sim Period', 'Z', 23, 34),
  144. '3-Month': DataIndex('Sim Period', 'AA', 25, 34),
  145. '6-month': DataIndex('Sim Period', 'AB', 28, 34),
  146. 'year': DataIndex('Sim Period', 'AC', 34, 34),
  147. },
  148. 'ALGORITHM_PERIOD_VOLATILITY': {
  149. 'Monthly': DataIndex('Sim Period', 'AH', 23, 34),
  150. '3-Month': DataIndex('Sim Period', 'AI', 25, 34),
  151. '6-month': DataIndex('Sim Period', 'AJ', 28, 34),
  152. 'year': DataIndex('Sim Period', 'AK', 34, 34),
  153. },
  154. 'ALGORITHM_PERIOD_SHARPE': {
  155. 'Monthly': DataIndex('Sim Period', 'AL', 23, 34),
  156. '3-Month': DataIndex('Sim Period', 'AM', 25, 34),
  157. '6-month': DataIndex('Sim Period', 'AN', 28, 34),
  158. 'year': DataIndex('Sim Period', 'AO', 34, 34),
  159. },
  160. 'ALGORITHM_PERIOD_BETA': {
  161. 'Monthly': DataIndex('Sim Period', 'AP', 23, 34),
  162. '3-Month': DataIndex('Sim Period', 'AQ', 25, 34),
  163. '6-month': DataIndex('Sim Period', 'AR', 28, 34),
  164. 'year': DataIndex('Sim Period', 'AS', 34, 34),
  165. },
  166. 'ALGORITHM_PERIOD_ALPHA': {
  167. 'Monthly': DataIndex('Sim Period', 'AT', 23, 34),
  168. '3-Month': DataIndex('Sim Period', 'AU', 25, 34),
  169. '6-month': DataIndex('Sim Period', 'AV', 28, 34),
  170. 'year': DataIndex('Sim Period', 'AW', 34, 34),
  171. },
  172. 'ALGORITHM_PERIOD_BENCHMARK_VARIANCE': {
  173. 'Monthly': DataIndex('Sim Period', 'BJ', 23, 34),
  174. '3-Month': DataIndex('Sim Period', 'BK', 25, 34),
  175. '6-month': DataIndex('Sim Period', 'BL', 28, 34),
  176. 'year': DataIndex('Sim Period', 'BM', 34, 34),
  177. },
  178. 'ALGORITHM_PERIOD_COVARIANCE': {
  179. 'Monthly': DataIndex('Sim Period', 'BF', 23, 34),
  180. '3-Month': DataIndex('Sim Period', 'BG', 25, 34),
  181. '6-month': DataIndex('Sim Period', 'BH', 28, 34),
  182. 'year': DataIndex('Sim Period', 'BI', 34, 34),
  183. },
  184. 'ALGORITHM_PERIOD_DOWNSIDE_RISK': {
  185. 'Monthly': DataIndex('Sim Period', 'BN', 23, 34),
  186. '3-Month': DataIndex('Sim Period', 'BO', 25, 34),
  187. '6-month': DataIndex('Sim Period', 'BP', 28, 34),
  188. 'year': DataIndex('Sim Period', 'BQ', 34, 34),
  189. },
  190. 'ALGORITHM_PERIOD_SORTINO': {
  191. 'Monthly': DataIndex('Sim Period', 'BR', 23, 34),
  192. '3-Month': DataIndex('Sim Period', 'BS', 25, 34),
  193. '6-month': DataIndex('Sim Period', 'BT', 28, 34),
  194. 'year': DataIndex('Sim Period', 'BU', 34, 34),
  195. },
  196. 'ALGORITHM_RETURN_VALUES': DataIndex(
  197. 'Sim Cumulative', 'D', 4, 254),
  198. 'ALGORITHM_CUMULATIVE_VOLATILITY': DataIndex(
  199. 'Sim Cumulative', 'P', 4, 254),
  200. 'ALGORITHM_CUMULATIVE_SHARPE': DataIndex(
  201. 'Sim Cumulative', 'R', 4, 254),
  202. 'CUMULATIVE_DOWNSIDE_RISK': DataIndex(
  203. 'Sim Cumulative', 'U', 4, 254),
  204. 'CUMULATIVE_SORTINO': DataIndex(
  205. 'Sim Cumulative', 'V', 4, 254),
  206. 'CUMULATIVE_INFORMATION': DataIndex(
  207. 'Sim Cumulative', 'AA', 4, 254),
  208. 'CUMULATIVE_BETA': DataIndex(
  209. 'Sim Cumulative', 'AD', 4, 254),
  210. 'CUMULATIVE_ALPHA': DataIndex(
  211. 'Sim Cumulative', 'AE', 4, 254),
  212. 'CUMULATIVE_MAX_DRAWDOWN': DataIndex(
  213. 'Sim Cumulative', 'AH', 4, 254),
  214. }
  215. def __init__(self):
  216. self.workbook = xlrd.open_workbook(ANSWER_KEY_PATH)
  217. self.sheets = {}
  218. self.sheets['Sim Period'] = self.workbook.sheet_by_name('Sim Period')
  219. self.sheets['Sim Cumulative'] = self.workbook.sheet_by_name(
  220. 'Sim Cumulative')
  221. self.sheets['s_p'] = self.workbook.sheet_by_name('s_p')
  222. for name, index in self.INDEXES.items():
  223. if isinstance(index, dict):
  224. subvalues = {}
  225. for subkey, subindex in index.items():
  226. subvalues[subkey] = self.get_values(subindex)
  227. setattr(self, name, subvalues)
  228. else:
  229. setattr(self, name, self.get_values(index))
  230. def parse_date_value(self, value):
  231. return xlrd.xldate_as_tuple(value, 0)
  232. def parse_float_value(self, value):
  233. return value if value != '' else np.nan
  234. def get_raw_values(self, data_index):
  235. return self.sheets[data_index.sheet_name].col_values(
  236. data_index.col_index,
  237. data_index.row_start_index,
  238. data_index.row_end_index + 1)
  239. @property
  240. def value_type_to_value_func(self):
  241. return {
  242. 'float': self.parse_float_value,
  243. 'date': self.parse_date_value,
  244. }
  245. def get_values(self, data_index):
  246. value_parser = self.value_type_to_value_func[data_index.value_type]
  247. return [value for value in
  248. map(value_parser, self.get_raw_values(data_index))]
  249. ANSWER_KEY = AnswerKey()
  250. BENCHMARK_DATES = ANSWER_KEY.BENCHMARK['Dates']
  251. BENCHMARK_RETURNS = ANSWER_KEY.BENCHMARK['Returns']
  252. DATES = [datetime.datetime(*x, tzinfo=pytz.UTC) for x in BENCHMARK_DATES]
  253. BENCHMARK = pd.Series(dict(zip(DATES, BENCHMARK_RETURNS)))
  254. ALGORITHM_RETURNS = pd.Series(
  255. dict(zip(DATES, ANSWER_KEY.ALGORITHM_RETURN_VALUES)))
  256. RETURNS_DATA = pd.DataFrame({'Benchmark Returns': BENCHMARK,
  257. 'Algorithm Returns': ALGORITHM_RETURNS})
  258. RISK_CUMULATIVE = pd.DataFrame({
  259. 'volatility': pd.Series(dict(zip(
  260. DATES, ANSWER_KEY.ALGORITHM_CUMULATIVE_VOLATILITY))),
  261. 'sharpe': pd.Series(dict(zip(
  262. DATES, ANSWER_KEY.ALGORITHM_CUMULATIVE_SHARPE))),
  263. 'downside_risk': pd.Series(dict(zip(
  264. DATES, ANSWER_KEY.CUMULATIVE_DOWNSIDE_RISK))),
  265. 'sortino': pd.Series(dict(zip(
  266. DATES, ANSWER_KEY.CUMULATIVE_SORTINO))),
  267. 'information': pd.Series(dict(zip(
  268. DATES, ANSWER_KEY.CUMULATIVE_INFORMATION))),
  269. 'alpha': pd.Series(dict(zip(
  270. DATES, ANSWER_KEY.CUMULATIVE_ALPHA))),
  271. 'beta': pd.Series(dict(zip(
  272. DATES, ANSWER_KEY.CUMULATIVE_BETA))),
  273. 'max_drawdown': pd.Series(dict(zip(
  274. DATES, ANSWER_KEY.CUMULATIVE_MAX_DRAWDOWN))),
  275. })