PageRenderTime 172ms CodeModel.GetById 24ms RepoModel.GetById 0ms app.codeStats 0ms

/fetchvc.py

https://code.google.com/p/simplecd/
Python | 304 lines | 272 code | 14 blank | 18 comment | 17 complexity | bf86f089664cb37d9ffd5b520bb516ba MD5 | raw file
#!/usr/bin/env python
# -*- coding: utf-8 -*-
#
# fetchvc.py fetch resources from verycd
#
# author: observer
# email: jingchaohu@gmail.com
# blog: http://obmem.com
# last edit @ 2009.12.16
  10. import urllib
  11. import re
  12. import sqlite3
  13. import time
  14. import os,sys
  15. from threading import Thread
  16. from Queue import Queue
  17. from download import httpfetch
  18. path = os.path.dirname(os.path.realpath(sys.argv[0]))
  19. conn = sqlite3.connect(path+'/verycd.sqlite3.db')
  20. conn.text_factory = str
  21. q = Queue()
  22. MAXC = 8
  23. def thread_fetch():
  24. conn = sqlite3.connect(path+'/verycd.sqlite3.db')
  25. conn.text_factory = str
  26. while True:
  27. topic = q.get()
  28. fetch(topic,conn)
  29. q.task_done()
  30. def search(keyword,full=True):
  31. '''search verycd, fetch search results'''
  32. url = 'http://www.verycd.com/search/folders/'+keyword
  33. print 'fetching search results ...'
  34. res = httpfetch(url)
  35. topics = re.compile(r'/topics/(\d+)',re.DOTALL).findall(res)
  36. topics = set(topics)
  37. links = []
  38. if full:
  39. links = re.compile(r'/search/folders/(.*?\?start=\d+)',re.DOTALL).findall(res)
  40. print links
  41. print topics
  42. if topics:
  43. for topic in topics:
  44. q.put(topic)
  45. if full and links:
  46. for key in links:
  47. search(key,full=False)
  48. def hot():
  49. ''' read verycd hot res and keep update very day '''
  50. url = 'http://www.verycd.com/'
  51. print 'fetching homepage ...'
  52. home = httpfetch(url)
  53. hotzone = re.compile(r'????.*?</dl>',re.DOTALL).search(home).group()
  54. hot = re.compile(r'<a href="/topics/(\d+)/"[^>]*>(?.*??)[^<]*</a>',re.DOTALL).findall(hotzone)
  55. html = '<h2 style="color:red">??????</h2>\n'
  56. for topic in hot:
  57. print 'fetching hot topic',topic[0],'...'
  58. q.put(topic[0])
  59. html += '&nbsp;<a target="_parent" href="/?id=%s">%s</a>&nbsp;\n' % topic
  60. open(path+'/static/hot.html','w').write(html)
  61. def feed():
  62. ''' read verycd feed and keep update very 30 min '''
  63. url = 'http://www.verycd.com/sto/feed'
  64. print 'fetching feed ...'
  65. feeds = httpfetch(url)
  66. ids = re.compile(r'/topics/(\d+)',re.DOTALL).findall(feeds)
  67. ids = set(ids)
  68. print ids
  69. now = time.mktime(time.gmtime())
  70. for id in ids:
  71. q.put(id)
  72. #updtime = fetch(id)
  73. #updtime = time.mktime(time.strptime(updtime,'%Y/%m/%d %H:%M:%S'))-8*3600 #gmt+8->gmt
  74. #diff = now - updtime
  75. #print '%10s secs since update' % (diff)
  76. #if diff > 1900: # only need recent 30min updates
  77. # break
  78. def update(num=10):
  79. urlbase = 'http://www.verycd.com/sto/~all/page'
  80. for i in range(1,num+1):
  81. print 'fetching list',i,'...'
  82. url = urlbase+str(i)
  83. res = httpfetch(url)
  84. res2 = re.compile(r'"topic-list"(.*?)"pnav"',re.DOTALL).findall(res)
  85. if res2:
  86. res2 = res2[0]
  87. else:
  88. continue
  89. topics = re.compile(r'/topics/(\d+)',re.DOTALL).findall(res2)
  90. topics = set(topics)
  91. print topics
  92. for topic in topics:
  93. q.put(topic)
  94. def fetchall(ran='1-max',debug=False):
  95. urlbase = 'http://www.verycd.com/archives/'
  96. if ran == '1-max':
  97. m1 = 1
  98. res = urllib.urlopen(urlbase).read()
  99. m2 = int(re.compile(r'archives/(\d+)').search(res).group(1))
  100. else:
  101. m = ran.split('-')
  102. m1 = int(m[0])
  103. m2 = int(m[1])
  104. print 'fetching list from',m1,'to',m2,'...'
  105. for i in range(m1,m2+1):
  106. url = urlbase + '%05d'%i + '.html'
  107. print 'fetching from',url,'...'
  108. res = httpfetch(url)
  109. ids = re.compile(r'topics/(\d+)/',re.DOTALL).findall(res)
  110. print ids
  111. for id in ids:
  112. q.put(id)
  113. def fetch(id,conn=conn,debug=False):
  114. print 'fetching topic',id,'...'
  115. urlbase = 'http://www.verycd.com/topics/'
  116. url = urlbase + str(id)
  117. res = ''
  118. for _ in range(3):
  119. try:
  120. res = httpfetch(url,report=True)
  121. break
  122. except:
  123. continue
  124. abstract = re.compile(r'<h1>.*?visit',re.DOTALL).findall(res)
  125. if not abstract:
  126. print res
  127. if res == '' or '???' in res:
  128. print 'resource does not exist'
  129. return
  130. else:
  131. print 'fetching',id,'again...'
  132. return fetch(id,conn)
  133. abstract = abstract[0]
  134. title = re.compile(r'<h1>(.*?)</h1>',re.DOTALL).findall(abstract)[0]
  135. status = re.compile(r'"requestWords">(.*?)<',re.DOTALL).search(abstract).group(1)
  136. brief = re.compile(r'"font-weight:normal"><span>(.*?)</td>',re.DOTALL).search(abstract).group(1)
  137. brief = re.compile(r'<.*?>',re.DOTALL).sub('',brief).strip()
  138. pubtime = re.compile(r'"date-time">(.*?)</span>.*?"date-time">(.*?)</span>',re.DOTALL).findall(abstract)[0]
  139. category1 = re.compile(r'??.*?<td>(.*?)&nbsp;&nbsp;(.*?)&nbsp;&nbsp;',re.DOTALL).findall(abstract)[0]
  140. category = ['','']
  141. category[0] = re.compile(r'<.*?>',re.DOTALL).sub('',category1[0]).strip()
  142. category[1] = re.compile(r'<.*?>',re.DOTALL).sub('',category1[1]).strip()
  143. res2 = re.compile(r'iptcomED2K"><!--eMule.*?<!--eMule end-->',re.DOTALL).findall(res)[0]
  144. ed2k = re.compile(r'ed2k="([^"]*)" subtitle_[^=]*="([^"]*)">([^<]*)</a>',re.DOTALL).findall(res2)
  145. ed2k.extend( re.compile(r'ed2k="([^"]*)">([^<]*)</a>',re.DOTALL).findall(res2) )
  146. content = re.compile(r'<!--eMule end-->(.*?)<!--Wrap-tail end-->',re.DOTALL).findall(res)
  147. if content:
  148. content = content[0]
  149. content = re.compile(r'<br />',re.DOTALL).sub('\n',content)
  150. content = re.compile(r'<.*?>',re.DOTALL).sub('',content)
  151. content = re.compile(r'&.*?;',re.DOTALL).sub(' ',content)
  152. content = re.compile(r'\n\s+',re.DOTALL).sub('\n',content)
  153. content = content.strip()
  154. else:
  155. content=''
  156. if debug:
  157. print title
  158. print status
  159. print brief
  160. print pubtime[0],pubtime[1]
  161. print category[0],category[1]
  162. for x in ed2k:
  163. print x
  164. print content
  165. ed2kstr = ''
  166. for x in ed2k:
  167. ed2kstr += '`'.join(x)+'`'
  168. if not dbfind(id,conn):
  169. dbinsert(id,title,status,brief,pubtime,category,ed2kstr,content,conn)
  170. else:
  171. dbupdate(id,title,status,brief,pubtime,category,ed2kstr,content,conn)
  172. return pubtime[1]
  173. def dbcreate():
  174. c = conn.cursor()
  175. c.execute('''create table verycd(
  176. verycdid integer primary key,
  177. title text,
  178. status text,
  179. brief text,
  180. pubtime text,
  181. updtime text,
  182. category1 text,
  183. category2 text,
  184. ed2k text,
  185. content text
  186. )''')
  187. conn.commit()
  188. c.close()
  189. def dbinsert(id,title,status,brief,pubtime,category,ed2k,content,conn):
  190. c = conn.cursor()
  191. c.execute('insert into verycd values(?,?,?,?,?,?,?,?,?,?)',\
  192. (id,title,status,brief,pubtime[0],pubtime[1],category[0],category[1],\
  193. ed2k,content))
  194. conn.commit()
  195. c.close()
  196. def dbupdate(id,title,status,brief,pubtime,category,ed2k,content,conn):
  197. c = conn.cursor()
  198. c.execute('update verycd set verycdid=?,title=?,status=?,brief=?,pubtime=?,\
  199. updtime=?,category1=?,category2=?,ed2k=?,content=? where verycdid=?',\
  200. (id,title,status,brief,pubtime[0],pubtime[1],category[0],category[1],\
  201. ed2k,content,id))
  202. conn.commit()
  203. c.close()
  204. def dbfind(id,conn):
  205. c = conn.cursor()
  206. c.execute('select 1 from verycd where verycdid=?',(id,))
  207. c.close()
  208. for x in c:
  209. if 1 in x:
  210. return True
  211. else:
  212. return False
  213. def dblist():
  214. c = conn.cursor()
  215. c.execute('select * from verycd')
  216. for x in c:
  217. for y in x:
  218. print y
  219. def usage():
  220. print '''Usage:
  221. python fetchvc.py createdb
  222. python fetchvc.py fetchall
  223. python fetchvc.py fetch 1-1611 #fetch archive list
  224. python fetchvc.py fetch 5633~5684 #fetch topics
  225. python fetchvc.py fetch 5633 #fetch a topic
  226. python fetchvc.py fetch q=keyword
  227. python fetchvc.py list #list the database
  228. python fetchvc.py feed #run every 30 min to keep up-to-date
  229. python fetchvc.py hot
  230. python fetchvc.py update #update first 20 pages, run on a daily basis'''
  231. if __name__=='__main__':
  232. #initialize thread pool
  233. for i in range(MAXC):
  234. t = Thread(target=thread_fetch)
  235. t.setDaemon(True)
  236. t.start()
  237. if len(sys.argv) == 1:
  238. usage()
  239. elif len(sys.argv) == 2:
  240. if sys.argv[1] == 'createdb':
  241. dbcreate()
  242. elif sys.argv[1] == 'fetchall':
  243. fetchall()
  244. elif sys.argv[1] == 'update':
  245. update(20)
  246. elif sys.argv[1] == 'update1':
  247. update(1)
  248. elif sys.argv[1] == 'feed':
  249. feed()
  250. elif sys.argv[1] == 'hot':
  251. hot()
  252. elif sys.argv[1] == 'list':
  253. dblist()
  254. elif len(sys.argv) == 3:
  255. if sys.argv[1] != 'fetch':
  256. usage()
  257. elif '-' in sys.argv[2]:
  258. fetchall(sys.argv[2])
  259. elif '~' in sys.argv[2]:
  260. m = sys.argv[2].split('~')
  261. for i in range(int(m[0]),int(m[1])+1):
  262. q.put(i)
  263. elif sys.argv[2].startswith("q="):
  264. search(sys.argv[2][2:])
  265. else:
  266. fetch(int(sys.argv[2]),debug=True)
  267. # wait all threads done
  268. q.join()