PageRenderTime 1597ms CodeModel.GetById 18ms RepoModel.GetById 1ms app.codeStats 0ms

/code/download.py

https://gitlab.com/qijungu/stock
Python | 271 lines | 211 code | 41 blank | 19 comment | 36 complexity | b51092f19605b113a8d578f8eebdbfaa MD5 | raw file
  1. #!/usr/bin/python
  2. # -*- coding: utf-8 -*-
  3. from bs4 import BeautifulSoup
  4. from datetime import datetime
  5. import sys, requests, time
  6. YEARSTART = 2001
  7. MONTHSTART = 1
  8. DAYSTART = 1
  9. def toepoc(dt):
  10. ep = datetime(1970,1,1,0,0)
  11. delta = dt-ep
  12. return int(delta.total_seconds() + 25200)
  13. def download(stock):
  14. # get the file name of the stock
  15. stockdata = 'data/'+stock+'.dat'
  16. timenow = datetime.now()
  17. # Date,Open,High,Low,Close,Volume
  18. # 2010-12-21,21.27,21.27,21.08,21.10,34060700
  19. # set now-time
  20. yearnow = timenow.year
  21. monthnow = timenow.month
  22. daynow = timenow.day
  23. # open the stock file and set last time
  24. try:
  25. fpstock = open(stockdata)
  26. fpstock.readline() # read the first line, which is format
  27. fpstock.readline() # read the second line, which is the first line of stock data
  28. timelast = fpstock.readline() # read the third line so that updating is possible during day time
  29. timelast = timelast.split(',')[0].split('-')
  30. yearlast = int(timelast[0])
  31. monthlast = int(timelast[1])
  32. daylast = int(timelast[2])
  33. except:
  34. yearlast = YEARSTART
  35. monthlast = MONTHSTART
  36. daylast = DAYSTART
  37. # retrieve new data
  38. # https://finance.yahoo.com/quote/SPY/history?p=SPY
  39. # https://query1.finance.yahoo.com/v7/finance/download/SPY?period1=1492395823&period2=1494987823&interval=1d&events=history&crumb=hEaGemmlAUH
  40. symbol = stock
  41. symboltable = {
  42. 'SPX' : '^GSPC',
  43. 'VIX' : '^VIX',
  44. 'GVZ' : '^GVZ',
  45. 'VDAX': '^VDAX',
  46. 'DAX' : '^GDAXI',
  47. 'RUT' : '^RUT',
  48. }
  49. if stock in symboltable:
  50. symbol = symboltable[stock]
  51. retry = 0
  52. while retry < 5 :
  53. try :
  54. time.sleep(0.5)
  55. hurl = 'https://finance.yahoo.com/quote/'+symbol+'/history?p='+symbol
  56. session = requests.session()
  57. session.headers.update({'User-Agent': 'Mozilla/5.0 (X11; Fedora; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/57.0.2987.133 Safari/537.36'})
  58. session.headers.update({'Cache-Control': 'no-cache'})
  59. session.headers.update({'Pragma': 'no-cache'})
  60. r = session.get(hurl)
  61. page = str(BeautifulSoup(r.content, 'lxml'))
  62. cstartidx = page.find('CrumbStore')
  63. cstartidx = page.find('{', cstartidx)
  64. cendidx = page.find('}', cstartidx)
  65. crumbstr = page[cstartidx:cendidx+1]
  66. crumb = crumbstr[10:-2]
  67. time.sleep(0.5)
  68. session.headers.update({'Referer': hurl})
  69. timetoday = datetime(yearnow,monthnow,daynow,0,0)
  70. timelast = datetime(yearlast,monthlast,daylast,0,0)
  71. durl = 'https://query1.finance.yahoo.com/v7/finance/download/'+symbol+'?period1='+str(toepoc(timelast))+'&period2='+str(toepoc(timetoday))+'&interval=1d&events=history&crumb='+crumb
  72. r = session.get(durl)
  73. page = BeautifulSoup(r.content, 'lxml').select_one('p')
  74. data = page.text.strip()
  75. if len(data) == 0:
  76. #print('%s error: zero data')
  77. retry += 1
  78. continue
  79. newdata = data.split('\n')
  80. if len(newdata) <= 1:
  81. #print('%s error: empty data %s'%(stock, newdata[0]))
  82. retry += 1
  83. continue
  84. if 'Date' not in newdata[0]:
  85. #print('%s error: no date %s'%(stock, newdata[0]))
  86. retry += 1
  87. continue
  88. break
  89. except:
  90. retry += 1
  91. if retry == 5:
  92. print('Cannot download with 5 retries.')
  93. return
  94. timelatest = newdata[-1].split(',')[0]
  95. timelatest = timelatest.split('-')
  96. yearlatest = int(timelatest[0])
  97. monthlatest = int(timelatest[1])
  98. daylatest = int(timelatest[2])
  99. try:
  100. olddata = open(stockdata).readlines()
  101. del olddata[0]
  102. except:
  103. olddata = []
  104. nd = str(newdata[0]).split(',')
  105. del nd[5]
  106. nd = ','.join(nd)
  107. data = [nd]
  108. errordata = [0,0,0,0,0,0]
  109. for i in range(len(newdata)-1,0,-1):
  110. nd = str(newdata[i]).split(',')
  111. del nd[5]
  112. for j in range(1,len(nd)):
  113. if 'null' in nd[j]:
  114. nd[j] = errordata[j]
  115. errordata = nd
  116. nd = ','.join(nd)
  117. data += [nd]
  118. timelatest = datetime(yearlatest, monthlatest, daylatest,0,0)
  119. for od in olddata:
  120. od = od.strip()
  121. if len(od) == 0:
  122. continue
  123. timeold = od.split(',')[0]
  124. timeold = timeold.split('-')
  125. timeold = datetime(int(timeold[0]), int(timeold[1]), int(timeold[2]),0,0)
  126. if timeold < timelast:
  127. data += [od]
  128. fpstock = open(stockdata, 'w')
  129. for d in data:
  130. fpstock.write(d+'\n')
  131. fpstock.close()
  132. stockmatlab = 'data/matlab/'+stock+'.dat'
  133. fpmatlab = open(stockmatlab, 'w')
  134. fpmatlab.write(stock+'\n')
  135. fpmatlab.write('DATE\tOPEN\tHIGH\tLOW\tCLOSE\tVOLUME\n')
  136. del data[0]
  137. for d in data:
  138. d = d.replace('-','/').replace(',','\t')
  139. fpmatlab.write(d+'\n')
  140. fpmatlab.close()
  141. if len(data)>1:
  142. change = '%.2f%%'%(round((float(data[0].split(',')[4])/float(data[1].split(',')[4])-1)*100,2))
  143. else :
  144. change = 'na'
  145. print("%5s, latest %d-%d-%d (%s, %s), now %d-%d-%d, last %d-%d-%d"%(stock, yearlatest, monthlatest, daylatest, data[0].split(',')[4], change, yearnow, monthnow, daynow, yearlast, monthlast, daylast));
  146. return
  147. def downloadcboe(stock):
  148. # get the file name of the stock
  149. stockdata = 'data/'+stock+'.dat'
  150. timenow = datetime.now()
  151. # Date,Open,High,Low,Close,Volume
  152. # 2010-12-21,21.27,21.27,21.08,21.10,34060700
  153. # set now-time
  154. yearnow = timenow.year
  155. monthnow = timenow.month
  156. daynow = timenow.day
  157. symbol = stock
  158. retry = 0
  159. while retry < 5 :
  160. try :
  161. time.sleep(0.5)
  162. hurl = 'http://www.cboe.com/publish/ScheduledTask/mktdata/datahouse/'+symbol+'DailyPrices.csv'
  163. session = requests.session()
  164. session.headers.update({'User-Agent': 'Mozilla/5.0 (X11; Fedora; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/57.0.2987.133 Safari/537.36'})
  165. session.headers.update({'Cache-Control': 'no-cache'})
  166. session.headers.update({'Pragma': 'no-cache'})
  167. r = session.get(hurl)
  168. page = BeautifulSoup(r.content, 'lxml').select_one('p')
  169. data = page.text.strip()
  170. if len(data) == 0:
  171. #print('%s error: zero data')
  172. retry += 1
  173. continue
  174. newdata = data.split('\n')
  175. if len(newdata) <= 2:
  176. #print('%s error: empty data %s'%(stock, newdata[0]))
  177. retry += 1
  178. continue
  179. break
  180. except:
  181. retry += 1
  182. if retry == 5:
  183. print('Cannot download with 5 retries.')
  184. return
  185. dflag = False
  186. while not dflag :
  187. if 'Date,Open' not in newdata[0]:
  188. del newdata[0]
  189. else :
  190. dflag = True
  191. timelatest = newdata[-1].split(',')[0]
  192. timelatest = timelatest.split('/')
  193. yearlatest = int(timelatest[2])
  194. monthlatest = int(timelatest[0])
  195. daylatest = int(timelatest[1])
  196. stockmatlab = 'data/matlab/'+stock+'.dat'
  197. fpmatlab = open(stockmatlab, 'w')
  198. fpmatlab.write(stock+'\n')
  199. fpmatlab.write('DATE\tOPEN\tHIGH\tLOW\tCLOSE\tVOLUME\n')
  200. for i in range(len(newdata)-1,0,-1):
  201. d = str(newdata[i].strip())
  202. d = d.split(',')
  203. dtime = d[0].split('/')
  204. d[0] = '/'.join([dtime[2],dtime[0],dtime[1]])
  205. d = '\t'.join(d) +'\t0'
  206. fpmatlab.write(d+'\n')
  207. fpmatlab.close()
  208. if len(newdata)>1:
  209. change = '%.2f%%'%(round((float(newdata[-1].split(',')[4])/float(newdata[-2].split(',')[4])-1)*100,2))
  210. else :
  211. change = 'na'
  212. print("%5s, latest %d-%d-%d (%s, %s), now %d-%d-%d"%(stock, yearlatest, monthlatest, daylatest, newdata[-1].strip().split(',')[4], change, yearnow, monthnow, daynow));
  213. return
  214. def main():
  215. if len(sys.argv) < 2 :
  216. print 'Example: ./download stock.list'
  217. return
  218. del sys.argv[0]
  219. for arg in sys.argv:
  220. stocks = open(arg).readlines()
  221. scboe = ['VXFXI', 'VXGDX', 'VXSLV', 'VXXLE', 'VXTYN', 'RVX']
  222. for s in stocks:
  223. s = s.strip().upper()
  224. if len(s) == 0:
  225. continue
  226. if s in scboe :
  227. downloadcboe(s)
  228. else:
  229. download(s)
  230. main()