/code/download.py
Python | 271 lines | 211 code | 41 blank | 19 comment | 36 complexity | b51092f19605b113a8d578f8eebdbfaa MD5 | raw file
#!/usr/bin/python
# -*- coding: utf-8 -*-
import sys
import time
from datetime import datetime

import requests
from bs4 import BeautifulSoup
# Start date used by download() when a stock has no usable local history
# file: the download then requests data from this date onwards.
YEARSTART = 2001
MONTHSTART = 1
DAYSTART = 1
def toepoc(dt, offset=25200):
    """Convert a naive datetime to integer epoch seconds plus a fixed offset.

    dt     -- naive ``datetime`` to convert.
    offset -- seconds added to the result.  Defaults to 25200 (7 hours), the
              value that used to be hard-coded here.
              NOTE(review): presumably a fixed local-to-UTC correction;
              confirm.  It does not follow daylight saving time.

    Returns the number of seconds between 1970-01-01 00:00 and ``dt``, plus
    ``offset``, truncated to ``int``.
    """
    epoch = datetime(1970, 1, 1, 0, 0)
    return int((dt - epoch).total_seconds() + offset)
-
def _last_saved_date(path):
    """Return the date found on the third line of the history file *path*.

    Line 1 is the column header and line 2 the newest stored row.  The third
    line is used on purpose so that the newest stored row is downloaded again
    (the original note: "so that updating is possible during day time").

    Falls back to YEARSTART/MONTHSTART/DAYSTART when the file is missing,
    has fewer than three lines, or does not hold a valid YYYY-MM-DD date.
    """
    try:
        with open(path) as fp:
            fp.readline()  # column header
            fp.readline()  # newest stored row
            parts = fp.readline().split(',')[0].split('-')
        return datetime(int(parts[0]), int(parts[1]), int(parts[2]), 0, 0)
    except (IOError, OSError, ValueError, IndexError):
        return datetime(YEARSTART, MONTHSTART, DAYSTART, 0, 0)


def _fetch_yahoo_history(symbol, timelast, timetoday):
    """Download daily history for *symbol* as a list of CSV lines.

    Each attempt opens a fresh session, loads the quote history page to pick
    up the "crumb" token, then requests the CSV export for the period
    ``timelast`` .. ``timetoday``.  Up to 5 attempts are made.

    Returns the CSV lines (header first) of the first response that has a
    header containing 'Date' and at least one more line, or ``None`` when
    every attempt fails.
    """
    hurl = 'https://finance.yahoo.com/quote/'+symbol+'/history?p='+symbol
    for _ in range(5):
        session = None
        try:
            time.sleep(0.5)
            session = requests.session()
            session.headers.update({'User-Agent': 'Mozilla/5.0 (X11; Fedora; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/57.0.2987.133 Safari/537.36'})
            session.headers.update({'Cache-Control': 'no-cache'})
            session.headers.update({'Pragma': 'no-cache'})
            r = session.get(hurl)
            page = str(BeautifulSoup(r.content, 'lxml'))

            # The crumb sits in the first {...} after 'CrumbStore', in the
            # form {"crumb":"VALUE"}; the slice removes the wrapper.
            cstartidx = page.find('CrumbStore')
            cstartidx = page.find('{', cstartidx)
            cendidx = page.find('}', cstartidx)
            crumb = page[cstartidx:cendidx+1][10:-2]

            time.sleep(0.5)
            session.headers.update({'Referer': hurl})
            durl = 'https://query1.finance.yahoo.com/v7/finance/download/'+symbol+'?period1='+str(toepoc(timelast))+'&period2='+str(toepoc(timetoday))+'&interval=1d&events=history&crumb='+crumb
            r = session.get(durl)
            # The lxml parser wraps the plain-text CSV body in a <p> element.
            text = BeautifulSoup(r.content, 'lxml').select_one('p').text.strip()

            lines = text.split('\n')
            if len(lines) > 1 and 'Date' in lines[0]:
                return lines
        except Exception:
            # Best effort: any network or parsing failure just uses up one
            # attempt.  KeyboardInterrupt is deliberately not caught.
            pass
        finally:
            if session is not None:
                session.close()
    return None


def _read_old_rows(path):
    """Return the stored data lines of *path* without its header line.

    Returns an empty list when the file cannot be read.
    """
    try:
        with open(path) as fp:
            return fp.readlines()[1:]
    except (IOError, OSError):
        return []


def download(stock):
    """Update the local daily price history of *stock* from Yahoo Finance.

    Data from the last stored date up to today is downloaded and merged with
    the older rows already on disk.  Two files are rewritten:

    * ``data/<stock>.dat``         -- CSV, header line first, newest row first
    * ``data/matlab/<stock>.dat``  -- tab separated, dates as YYYY/MM/DD

    The sixth CSV column of the download is dropped, and 'null' fields are
    replaced by the value from the previously processed (newer) row, or by
    '0' when there is none.  A one-line summary is printed on success.

    Prints 'Cannot download with 5 retries.' and returns without touching
    the files when no usable data could be fetched.  Returns ``None``.
    """
    stockdata = 'data/'+stock+'.dat'
    timenow = datetime.now()
    timetoday = datetime(timenow.year, timenow.month, timenow.day, 0, 0)
    timelast = _last_saved_date(stockdata)

    # Index names used locally differ from the Yahoo ticker symbols.
    symboltable = {
        'SPX' : '^GSPC',
        'VIX' : '^VIX',
        'GVZ' : '^GVZ',
        'VDAX': '^VDAX',
        'DAX' : '^GDAXI',
        'RUT' : '^RUT',
    }
    symbol = symboltable.get(stock, stock)

    newdata = _fetch_yahoo_history(symbol, timelast, timetoday)
    if newdata is None:
        print('Cannot download with 5 retries.')
        return

    latest = newdata[-1].split(',')[0].split('-')
    yearlatest = int(latest[0])
    monthlatest = int(latest[1])
    daylatest = int(latest[2])

    olddata = _read_old_rows(stockdata)

    # Header first, then the downloaded rows from last to first.
    header = str(newdata[0]).split(',')
    del header[5]
    data = [','.join(header)]
    # Placeholders must be strings: they may end up in ','.join() below.
    previous = ['0'] * 6
    for line in reversed(newdata[1:]):
        fields = str(line).split(',')
        del fields[5]
        for j in range(1, len(fields)):
            if 'null' in fields[j]:
                fields[j] = previous[j] if j < len(previous) else '0'
        previous = fields
        data.append(','.join(fields))

    # Keep only stored rows that are older than the re-downloaded period.
    for od in olddata:
        od = od.strip()
        if not od:
            continue
        parts = od.split(',')[0].split('-')
        timeold = datetime(int(parts[0]), int(parts[1]), int(parts[2]), 0, 0)
        if timeold < timelast:
            data.append(od)

    with open(stockdata, 'w') as fpstock:
        for d in data:
            fpstock.write(d+'\n')

    rows = data[1:]  # everything except the header line
    stockmatlab = 'data/matlab/'+stock+'.dat'
    with open(stockmatlab, 'w') as fpmatlab:
        fpmatlab.write(stock+'\n')
        fpmatlab.write('DATE\tOPEN\tHIGH\tLOW\tCLOSE\tVOLUME\n')
        for d in rows:
            fpmatlab.write(d.replace('-','/').replace(',','\t')+'\n')

    close = rows[0].split(',')[4]
    change = 'na'
    if len(rows) > 1:
        try:
            change = '%.2f%%'%(round((float(close)/float(rows[1].split(',')[4])-1)*100,2))
        except (ValueError, ZeroDivisionError):
            # Unparsable or zero previous close: no meaningful change value.
            change = 'na'
    print("%5s, latest %d-%d-%d (%s, %s), now %d-%d-%d, last %d-%d-%d"%(stock, yearlatest, monthlatest, daylatest, close, change, timenow.year, timenow.month, timenow.day, timelast.year, timelast.month, timelast.day))

    return
-
def _fetch_cboe_lines(symbol):
    """Download the CBOE daily price CSV for *symbol* as a list of lines.

    Up to 5 attempts are made, each with a fresh session.  Returns the lines
    of the first response that has more than two lines, or ``None`` when
    every attempt fails.
    """
    hurl = 'http://www.cboe.com/publish/ScheduledTask/mktdata/datahouse/'+symbol+'DailyPrices.csv'
    for _ in range(5):
        session = None
        try:
            time.sleep(0.5)
            session = requests.session()
            session.headers.update({'User-Agent': 'Mozilla/5.0 (X11; Fedora; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/57.0.2987.133 Safari/537.36'})
            session.headers.update({'Cache-Control': 'no-cache'})
            session.headers.update({'Pragma': 'no-cache'})
            r = session.get(hurl)
            # The lxml parser wraps the plain-text CSV body in a <p> element.
            text = BeautifulSoup(r.content, 'lxml').select_one('p').text.strip()
            lines = text.split('\n')
            if len(lines) > 2:
                return lines
        except Exception:
            # Best effort: any network or parsing failure just uses up one
            # attempt.  KeyboardInterrupt is deliberately not caught.
            pass
        finally:
            if session is not None:
                session.close()
    return None


def downloadcboe(stock):
    """Download the CBOE daily prices of *stock* and write the MATLAB file.

    The CSV is fetched from cboe.com.  Lines before the 'Date,Open' header
    are discarded, dates are converted from M/D/Y to Y/M/D, and the rows are
    written last-to-first to ``data/matlab/<stock>.dat`` as tab separated
    values with a constant '0' appended as the VOLUME column.  A one-line
    summary is printed on success.

    Prints a message and returns without writing anything when the download
    fails or the response holds no data rows.  Returns ``None``.
    """
    timenow = datetime.now()

    newdata = _fetch_cboe_lines(stock)
    if newdata is None:
        print('Cannot download with 5 retries.')
        return

    # Find the header line; anything before it is preamble text.
    header_at = None
    for idx, line in enumerate(newdata):
        if 'Date,Open' in line:
            header_at = idx
            break
    rows = []
    if header_at is not None:
        rows = [str(line.strip()) for line in newdata[header_at+1:] if line.strip()]
    if not rows:
        print('No price data found for %s.'%stock)
        return

    # The last row of the file is reported as the latest one.
    latest = rows[-1].split(',')[0].split('/')
    yearlatest = int(latest[2])
    monthlatest = int(latest[0])
    daylatest = int(latest[1])

    # Build every output line before opening the file, so a malformed row
    # cannot leave a truncated file behind.
    out = []
    for row in reversed(rows):
        d = row.split(',')
        dtime = d[0].split('/')
        d[0] = '/'.join([dtime[2],dtime[0],dtime[1]])
        out.append('\t'.join(d)+'\t0\n')

    stockmatlab = 'data/matlab/'+stock+'.dat'
    with open(stockmatlab, 'w') as fpmatlab:
        fpmatlab.write(stock+'\n')
        fpmatlab.write('DATE\tOPEN\tHIGH\tLOW\tCLOSE\tVOLUME\n')
        fpmatlab.writelines(out)

    close = rows[-1].split(',')[4]
    change = 'na'
    # A change needs two data rows; the header line must not be counted.
    if len(rows) > 1:
        try:
            change = '%.2f%%'%(round((float(close)/float(rows[-2].split(',')[4])-1)*100,2))
        except (ValueError, ZeroDivisionError):
            # Unparsable or zero previous close: no meaningful change value.
            change = 'na'
    print("%5s, latest %d-%d-%d (%s, %s), now %d-%d-%d"%(stock, yearlatest, monthlatest, daylatest, close, change, timenow.year, timenow.month, timenow.day))
    return
def main():
    """Download every stock named in the list files given on the command line.

    Each command-line argument is the path of a text file with one stock
    name per line.  Names are stripped and upper-cased; blank lines are
    skipped.  Names in the CBOE list are fetched with downloadcboe(), all
    others with download().

    Prints a usage example and returns when no list file is given.
    sys.argv is read but not modified.
    """
    if len(sys.argv) < 2:
        print('Example: ./download stock.list')
        return

    # Symbols whose data comes from the CBOE site.
    scboe = ('VXFXI', 'VXGDX', 'VXSLV', 'VXXLE', 'VXTYN', 'RVX')

    for arg in sys.argv[1:]:
        with open(arg) as fp:
            stocks = fp.readlines()
        for s in stocks:
            s = s.strip().upper()
            if not s:
                continue
            if s in scboe:
                downloadcboe(s)
            else:
                download(s)
# Run the downloads only when executed as a script, not when imported.
if __name__ == '__main__':
    main()