fetchvc.py | searchcode

/fetchvc.py

https://code.google.com/p/simplecd/ · Python · 304 lines · 248 code · 36 blank · 20 comment · 53 complexity · bf86f089664cb37d9ffd5b520bb516ba MD5 · raw file

#!/usr/bin/env python
# -*- coding: utf-8 -*-
#
# fetchvc.py fetch resources from verycd
#
# author: observer
# email: jingchaohu@gmail.com
# blog: http://obmem.com
# last edit @ 2009.12.16
import urllib
import re
import sqlite3
import time
import os,sys

from threading import Thread
from Queue import Queue

from download import httpfetch

# Directory that contains the running script; the database file and the
# static/ output directory are addressed relative to it.
path = os.path.dirname(os.path.realpath(sys.argv[0]))
# Module-level connection: the default for fetch() and the one used by
# dbcreate() and dblist(). Worker threads open their own connection.
conn = sqlite3.connect(path+'/verycd.sqlite3.db')
# Hand TEXT values back as str objects.
conn.text_factory = str
# Work queue of topic ids, filled by the command functions and drained by
# the thread_fetch() workers.
q = Queue()
# Number of worker threads started by the __main__ block.
MAXC = 8

def thread_fetch():
	'''Worker loop: take topic ids from the queue and fetch them, forever.

	Each worker opens its own connection to the database file instead of
	using the module-level one, then processes queue items until the
	process exits (the workers are started as daemon threads).
	'''
	conn = sqlite3.connect(path+'/verycd.sqlite3.db')
	conn.text_factory = str
	while True:
		topic = q.get()
		try:
			fetch(topic,conn)
		except Exception:
			# A page that fails to download or parse must not kill the
			# worker: report it and carry on with the next topic.
			sys.stderr.write('fetching topic %s failed: %r\n' % (topic,sys.exc_info()[1]))
		finally:
			# Always acknowledge the item; otherwise q.join() in the main
			# thread would block forever after a failed fetch.
			q.task_done()

def search(keyword,full=True):
	'''search verycd, fetch search results'''
	url = 'http://www.verycd.com/search/folders/'+keyword
	print 'fetching search results ...'
	res = httpfetch(url)
	topics = re.compile(r'/topics/(\d+)',re.DOTALL).findall(res)
	topics = set(topics)
	links = []
	if full:
		links = re.compile(r'/search/folders/(.*?\?start=\d+)',re.DOTALL).findall(res)
		print links
	print topics
	if topics:
		for topic in topics:
			q.put(topic)
	if full and links:
		for key in links:
			search(key,full=False)
		

def hot():
	''' read verycd hot res and keep update very day '''
	url = 'http://www.verycd.com/'
	print 'fetching homepage ...'
	home = httpfetch(url)
	hotzone = re.compile(r'????.*?</dl>',re.DOTALL).search(home).group()
	hot = re.compile(r'<a href="/topics/(\d+)/"[^>]*>(?.*??)[^<]*</a>',re.DOTALL).findall(hotzone)
	html = '<h2 style="color:red">??????</h2>\n'
	for topic in hot:
		print 'fetching hot topic',topic[0],'...'
		q.put(topic[0])
		html += '&nbsp;<a target="_parent" href="/?id=%s">%s</a>&nbsp;\n' % topic
	open(path+'/static/hot.html','w').write(html)

def feed():
	''' read verycd feed and keep update very 30 min '''
	url = 'http://www.verycd.com/sto/feed'
	print 'fetching feed ...'
	feeds = httpfetch(url)
	ids = re.compile(r'/topics/(\d+)',re.DOTALL).findall(feeds)
	ids = set(ids)
	print ids
	now = time.mktime(time.gmtime())
	for id in ids:
		q.put(id)
		#updtime = fetch(id)
		#updtime = time.mktime(time.strptime(updtime,'%Y/%m/%d %H:%M:%S'))-8*3600 #gmt+8->gmt
		#diff = now - updtime
		#print '%10s secs since update' % (diff)
		#if diff > 1900: # only need recent 30min updates
		#	break

def update(num=10):
	urlbase = 'http://www.verycd.com/sto/~all/page'
	for i in range(1,num+1):
		print 'fetching list',i,'...'		
		url = urlbase+str(i)
		res = httpfetch(url)
		res2 = re.compile(r'"topic-list"(.*?)"pnav"',re.DOTALL).findall(res)
		if res2:
			res2 = res2[0]
		else:
			continue
		topics = re.compile(r'/topics/(\d+)',re.DOTALL).findall(res2)
		topics = set(topics)
		print topics	
		for topic in topics:
			q.put(topic)
		

def fetchall(ran='1-max',debug=False):
	urlbase = 'http://www.verycd.com/archives/'
	if ran == '1-max':
		m1 = 1
		res = urllib.urlopen(urlbase).read()
		m2 = int(re.compile(r'archives/(\d+)').search(res).group(1))
	else:
		m = ran.split('-')
		m1 = int(m[0])
		m2 = int(m[1])
	print 'fetching list from',m1,'to',m2,'...'
	for i in range(m1,m2+1):
		url = urlbase + '%05d'%i + '.html'
		print 'fetching from',url,'...'
		res = httpfetch(url)
		ids = re.compile(r'topics/(\d+)/',re.DOTALL).findall(res)
		print ids
		for id in ids:
			q.put(id)
	

def fetch(id,conn=conn,debug=False):
	print 'fetching topic',id,'...'
	urlbase = 'http://www.verycd.com/topics/'
	url = urlbase + str(id)

	res = ''
	for _ in range(3):
		try:
			res = httpfetch(url,report=True)
			break
		except:
			continue

	abstract = re.compile(r'<h1>.*?visit',re.DOTALL).findall(res)
	if not abstract:
		print res
		if res == '' or '???' in res:
			print 'resource does not exist'
			return
		else:
			print 'fetching',id,'again...'
			return fetch(id,conn)
	abstract = abstract[0]
    

	title = re.compile(r'<h1>(.*?)</h1>',re.DOTALL).findall(abstract)[0]
	status = re.compile(r'"requestWords">(.*?)<',re.DOTALL).search(abstract).group(1)
	brief = re.compile(r'"font-weight:normal"><span>(.*?)</td>',re.DOTALL).search(abstract).group(1)
	brief = re.compile(r'<.*?>',re.DOTALL).sub('',brief).strip()
	pubtime = re.compile(r'"date-time">(.*?)</span>.*?"date-time">(.*?)</span>',re.DOTALL).findall(abstract)[0]
	category1 = re.compile(r'??.*?<td>(.*?)&nbsp;&nbsp;(.*?)&nbsp;&nbsp;',re.DOTALL).findall(abstract)[0]
	category = ['','']
	category[0] = re.compile(r'<.*?>',re.DOTALL).sub('',category1[0]).strip()
	category[1] = re.compile(r'<.*?>',re.DOTALL).sub('',category1[1]).strip()

	res2 = re.compile(r'iptcomED2K"><!--eMule.*?<!--eMule end-->',re.DOTALL).findall(res)[0]

	ed2k = re.compile(r'ed2k="([^"]*)" subtitle_[^=]*="([^"]*)">([^<]*)</a>',re.DOTALL).findall(res2)
	ed2k.extend( re.compile(r'ed2k="([^"]*)">([^<]*)</a>',re.DOTALL).findall(res2) )

	content = re.compile(r'<!--eMule end-->(.*?)<!--Wrap-tail end-->',re.DOTALL).findall(res)

	if content:
		content = content[0]
		content = re.compile(r'<br />',re.DOTALL).sub('\n',content)
		content = re.compile(r'<.*?>',re.DOTALL).sub('',content)
		content = re.compile(r'&.*?;',re.DOTALL).sub(' ',content)
		content = re.compile(r'\n\s+',re.DOTALL).sub('\n',content)
		content = content.strip()
	else:
		content=''

	if debug:
		print title
		print status
		print brief
		print pubtime[0],pubtime[1]
		print category[0],category[1]
		for x in ed2k:
			print x
		print content

	ed2kstr = ''
	for x in ed2k:
		ed2kstr += '`'.join(x)+'`'

	if not dbfind(id,conn):
		dbinsert(id,title,status,brief,pubtime,category,ed2kstr,content,conn)
	else:
		dbupdate(id,title,status,brief,pubtime,category,ed2kstr,content,conn)

	return pubtime[1]

def dbcreate():
	'''Create the verycd table on the module-level connection and commit.'''
	schema = '''create table verycd(
		verycdid integer primary key,
		title text,
		status text,
		brief text,
		pubtime text,
		updtime text,
		category1 text,
		category2 text,
		ed2k text,
		content text
	)'''
	cur = conn.cursor()
	cur.execute(schema)
	conn.commit()
	cur.close()

def dbinsert(id,title,status,brief,pubtime,category,ed2k,content,conn):
	'''Insert one topic as a new row of the verycd table and commit.

	pubtime  -- sequence whose items 0 and 1 fill the pubtime and updtime columns.
	category -- sequence whose items 0 and 1 fill category1 and category2.
	conn     -- sqlite3 connection to write to.
	'''
	# values in the column order of the table
	row = (
		id,title,status,brief,
		pubtime[0],pubtime[1],
		category[0],category[1],
		ed2k,content,
	)
	cur = conn.cursor()
	cur.execute('insert into verycd values(?,?,?,?,?,?,?,?,?,?)',row)
	conn.commit()
	cur.close()

def dbupdate(id,title,status,brief,pubtime,category,ed2k,content,conn):
	'''Overwrite all columns of the row whose verycdid equals id and commit.

	pubtime  -- sequence whose items 0 and 1 fill the pubtime and updtime columns.
	category -- sequence whose items 0 and 1 fill category1 and category2.
	conn     -- sqlite3 connection to write to.
	'''
	sql = ('update verycd set verycdid=?,title=?,status=?,brief=?,pubtime=?,'
		'\t\tupdtime=?,category1=?,category2=?,ed2k=?,content=? where verycdid=?')
	# id appears twice: once for the set clause, once for the where clause
	values = (
		id,title,status,brief,
		pubtime[0],pubtime[1],
		category[0],category[1],
		ed2k,content,
		id,
	)
	cur = conn.cursor()
	cur.execute(sql,values)
	conn.commit()
	cur.close()

def dbfind(id,conn):
	'''Return True when the verycd table has a row whose verycdid equals id.

	id   -- topic id to look up.
	conn -- sqlite3 connection to query.

	Returns False when there is no such row.
	'''
	c = conn.cursor()
	try:
		c.execute('select 1 from verycd where verycdid=?',(id,))
		# Read the result BEFORE closing: iterating a cursor that has
		# already been closed raises sqlite3.ProgrammingError.
		row = c.fetchone()
	finally:
		c.close()
	return row is not None

def dblist():
	c = conn.cursor()
	c.execute('select * from verycd')
	for x in c:
		for y in x:
			print y

def usage():
	print '''Usage:
  python fetchvc.py createdb
  python fetchvc.py fetchall
  python fetchvc.py fetch 1-1611 #fetch archive list
  python fetchvc.py fetch 5633~5684 #fetch topics
  python fetchvc.py fetch 5633 #fetch a topic
  python fetchvc.py fetch q=keyword
  python fetchvc.py list #list the database
  python fetchvc.py feed #run every 30 min to keep up-to-date
  python fetchvc.py hot
  python fetchvc.py update #update first 20 pages, run on a daily basis'''

if __name__=='__main__':
	# Start the pool of worker threads. They are daemon threads, so they do
	# not keep the process alive once the main thread is done.
	for i in range(MAXC):
		t = Thread(target=thread_fetch)
		t.setDaemon(True)
		t.start()

	args = sys.argv[1:]
	# commands that take no further argument
	commands = {
		'createdb': dbcreate,
		'fetchall': fetchall,
		'update': lambda: update(20),
		'update1': lambda: update(1),
		'feed': feed,
		'hot': hot,
		'list': dblist,
	}

	if len(args) == 1 and args[0] in commands:
		commands[args[0]]()
	elif len(args) == 2 and args[0] == 'fetch':
		target = args[1]
		if target.startswith("q="):
			# Tested first: a keyword may itself contain '-' or '~' and
			# must not be mistaken for a range.
			search(target[2:])
		elif '-' in target:
			# first-last: range of archive list pages
			fetchall(target)
		elif '~' in target:
			# first~last: range of topic ids, both inclusive
			m = target.split('~')
			for i in range(int(m[0]),int(m[1])+1):
				q.put(i)
		else:
			# single topic, fetched in the main thread with debug output
			fetch(int(target),debug=True)
	else:
		# no arguments, unknown command or wrong number of arguments
		usage()

	# wait until the workers have processed everything that was queued
	q.join()
Tech Fingerprint

Alerts (16)

'sqlite3.connect(' Use 'with sqlite3.connect()' for automatic connection closure
22 28
'def' Ensure functions have docstrings for documentation
27 87 105 126 199 216 224 233 243
Complexity hotspot; lines 50 to 51 (total complexity: 3)
50 51
'open(' Use 'with open()' to ensure Files are properly closed
67 109
'except:' Avoid catching all exceptions; specify exception types to catch only expected errors
136