/threading_url.py
Python | 114 lines | 111 code | 1 blank | 2 comment | 0 complexity | 7681a1d8a64e30ca8b0ec470d1c94e09 MD5 | raw file
- #encoding: utf-8
-
- import time,math,os,re,urllib,urllib2,cookielib
- from BeautifulSoup import BeautifulSoup
- import time
class BaiduImage:
    """Crawl Baidu WAP image-search result pages and save the images locally.

    Starting from a result page, the crawler collects the image URLs linked
    from that page, downloads each one into ``image_dir``, then follows the
    page's "next page" link until no such link is found.

    Class attributes (all may be overridden per instance):
        image_dir: Directory the downloaded images are written to.
        base_url: Prefix joined with the relative "next page" href.
        download_delay: Seconds to sleep after each saved image.
        page_delay: Seconds to sleep between result pages.
        image_count: Number of images written successfully so far.
    """

    image_links = []
    image_dir = 'image'
    current_page = ''
    ext_page = ''   # unused by this class; kept for backward compatibility
    next_page = ''
    image_count = 0
    base_url = 'http://wap.baidu.com'
    download_delay = 2
    page_delay = 1

    def __init__(self):
        """Build a cookie-aware opener that sends browser-like headers."""
        # Bind a fresh list so instances never share the class-level one.
        self.image_links = []
        self.cj = cookielib.LWPCookieJar()
        try:
            self.cj.revert('baiduimage.cookie')
        except IOError:
            # Best effort: a missing or unreadable cookie file just means we
            # start with an empty jar (cookielib.LoadError is an IOError).
            pass
        self.opener = urllib2.build_opener(urllib2.HTTPCookieProcessor(self.cj))
        urllib2.install_opener(self.opener)
        self.opener.addheaders = [
            ("User-agent", "Mozilla/5.0 (X11; U; FreeBSD i386; en-US; rv:1.9.1) Gecko/20090704 Firefox/3.5"),
            ("Accept", "*/*")]

    def _fetch(self, url):
        """Return the body served at *url*, or None after logging a failure."""
        try:
            response = self.opener.open(url)
            try:
                return response.read()
            finally:
                response.close()
        except Exception as e:
            self.write_log(e)
            return None

    @staticmethod
    def _extract_image_url(href):
        """Return the ``src=http://...`` target embedded in *href*, or None."""
        match = re.search(r'src=(http://.*)', href)
        return match.group(1) if match else None

    def _next_image_path(self):
        """Return a timestamp-based .jpg path in image_dir that is not in use."""
        stamp = str(int(time.time()))
        path = os.path.join(self.image_dir, stamp + '.jpg')
        suffix = 1
        # The timestamp has one-second resolution; never overwrite a file
        # saved within the same second.
        while os.path.exists(path):
            path = os.path.join(self.image_dir, '%s_%d.jpg' % (stamp, suffix))
            suffix += 1
        return path

    def get_image_links(self):
        """Collect the image URLs linked from the current page.

        ``self.image_links`` is always reset first, so a failed fetch leaves
        it empty instead of leaving the previous page's links in place.
        """
        self.image_links = []
        html = self._fetch(self.current_page)
        if html is None:
            return

        soup = BeautifulSoup(html)
        for anchor in soup.findAll('a', {'href': re.compile('^./img')}):
            url = self._extract_image_url(anchor['href'])
            if url is not None:
                self.image_links.append(url)

    def get_next_page(self):
        """Advance ``self.current_page`` to the next result page.

        Returns:
            True when a "next page" link was found and ``current_page`` was
            updated; False when the page could not be fetched or has no such
            link (in which case ``self.next_page`` is reset to '').
        """
        html = self._fetch(self.current_page)
        if html is None:
            return False

        soup = BeautifulSoup(html)
        for span in soup.findAll('span'):
            if '下一页' not in str(span):
                continue
            anchor = span.find('a')
            if anchor is None or not anchor.get('href'):
                continue
            self.current_page = self.base_url + str(anchor['href'])
            self.write_log('Going next page...')
            return True

        self.write_log('This is the latest page')
        self.next_page = ''
        return False

    def download(self):
        """Download every URL in ``self.image_links`` into ``image_dir``.

        A link that cannot be fetched or written is logged and skipped; the
        remaining links are still processed. ``image_count`` is incremented
        only for images that were written successfully.

        Returns:
            False when there is nothing to download, otherwise None.
        """
        if not self.image_links:
            return False

        self.write_log('Current page - %s' % self.current_page)

        if not os.path.isdir(self.image_dir):
            os.makedirs(self.image_dir)

        for link in self.image_links:
            # Use the configured opener so image requests carry the same
            # cookies and headers as the page requests.
            try:
                response = self.opener.open(link)
                try:
                    data = response.read()
                finally:
                    response.close()
            except Exception as e:
                self.write_log('Connect error:%s' % e)
                continue

            self.write_log('Downloading... - %s' % link)
            file_path = self._next_image_path()
            try:
                with open(file_path, 'wb') as image:
                    image.write(data)
            except (IOError, OSError) as e:
                self.write_log('Download faild:%s' % e)
            else:
                self.write_log('Download Success!-%s' % link)
                self.image_count += 1
            time.sleep(self.download_delay)

    def write_log(self, text):
        """Print *text* and append it as one line to ``log.txt``.

        *text* may be any object (for example an exception); it is formatted
        with ``%s`` before being written.
        """
        text = '%s' % (text,)
        if os.name == 'nt':
            # 'cls' only exists on Windows; elsewhere it just prints an error.
            os.system('cls')
        print(text)
        with open('log.txt', 'a') as log:
            log.write(text + '\n')

    def run(self, start_page):
        """Crawl from *start_page* until no further "next page" link is found.

        Args:
            start_page: URL of the first WAP result page to process.
        """
        self.current_page = start_page

        while True:
            # Collect the links on the current page, then download them.
            self.get_image_links()
            self.download()
            has_next = self.get_next_page()
            self.write_log('Image total:%d' % self.image_count)
            if not has_next:
                break
            time.sleep(self.page_delay)
-
-
if __name__ == '__main__':
    # Start crawling only when executed as a script, so that importing this
    # module does not kick off network requests.
    app = BaiduImage()
    app.run(start_page='http://wap.baidu.com/ssid=0/from=0/bd_page_type=1/uid=wiaui_1315707661_2623/pu=sz%40224_220%2Cusm%401/img?tn=bdwis&word=%E8%8B%8D%E4%BA%95%E7%A9%BA&pn=12&dw=w240&bs=176_208&pinf=6_6_0_@bdwis_@%E8%8B%8D%E4%BA%95%E7%A9%BA_@176_208_@w240&sp=&mid=w240')