PageRenderTime 68ms CodeModel.GetById 30ms RepoModel.GetById 1ms app.codeStats 0ms

/threading_url.py

https://bitbucket.org/cheng123/mytools
Python | 114 lines | 111 code | 1 blank | 2 comment | 0 complexity | 7681a1d8a64e30ca8b0ec470d1c94e09 MD5 | raw file
  1. #encoding: utf-8
  2. import time,math,os,re,urllib,urllib2,cookielib
  3. from BeautifulSoup import BeautifulSoup
  4. import time
  5. """ 自动抓取百度图片 """
  class BaiduImage:
      """Crawl Baidu WAP image-search result pages and download the images.

      Starting from a result page, the crawler collects the image links on the
      page, downloads each one into ``image_dir``, follows the "next page"
      link and repeats until no next page is found. Every step is echoed to
      the console and appended to ``log_file``.
      """

      # Links collected from the current page; rebound (never mutated in
      # place on the class) by __init__ and get_image_links().
      image_links = []
      # Directory the downloaded images are written to.
      image_dir = 'image'
      # URL of the result page currently being processed.
      current_page = ''
      # Unused by this class; kept so existing attribute access keeps working.
      # NOTE(review): looks like a typo of ``next_page`` -- confirm.
      ext_page = ''
      # Reset to '' by get_next_page() when no further page is available.
      next_page = ''
      # Number of images written to disk successfully.
      image_count = 0

      # Tunables (defaults reproduce the previously hard-coded values).
      cookie_file = 'baiduimage.cookie'
      log_file = 'log.txt'
      base_url = 'http://wap.baidu.com'
      download_delay = 2  # seconds slept after each image
      page_delay = 1      # seconds slept before moving to the next page

      # Compiled once instead of on every page.
      # NOTE(review): the '.' in '^./img' is unescaped and matches any
      # character; kept as-is because the real href format is not visible here.
      _IMG_HREF = re.compile('^./img')
      _IMG_SRC = re.compile(r'src=(http://.*)')

      def __init__(self):
          """Load saved cookies if present and install a cookie-aware opener."""
          self.image_links = []
          self.cj = cookielib.LWPCookieJar()
          try:
              self.cj.revert(self.cookie_file)
          except (IOError, cookielib.LoadError):
              # No saved (or readable) cookie file: start with an empty jar.
              pass
          self.opener = urllib2.build_opener(urllib2.HTTPCookieProcessor(self.cj))
          urllib2.install_opener(self.opener)
          self.opener.addheaders = [
              ("User-agent", "Mozilla/5.0 (X11; U; FreeBSD i386; en-US; rv:1.9.1) Gecko/20090704 Firefox/3.5"),
              ("Accept", "*/*")]

      def _fetch_current_page(self):
          """Return the HTML of ``current_page``, or None (after logging) on error."""
          try:
              return self.opener.open(self.current_page).read()
          except Exception as e:
              self.write_log(e)
              return None

      def get_image_links(self):
          """Collect the image URLs found on the current page into ``image_links``.

          ``image_links`` is cleared first, so a failed fetch leaves it empty
          instead of leaving the previous page's links to be downloaded again.
          """
          self.image_links = []
          html = self._fetch_current_page()
          if html is None:
              return

          soup = BeautifulSoup(html)
          for link in soup.findAll('a', {'href': self._IMG_HREF}):
              # Search the href itself so a link without an embedded source
              # URL is skipped rather than raising IndexError.
              found = self._IMG_SRC.search(link['href'])
              if found:
                  self.image_links.append(found.group(1))

      def get_next_page(self):
          """Point ``current_page`` at the next result page.

          Returns:
              True when a next-page link was found and ``current_page`` was
              updated; False when the page could not be fetched or has no
              next-page link (``next_page`` is then reset to '').
          """
          html = self._fetch_current_page()
          if html is not None:
              soup = BeautifulSoup(html)
              for span in soup.findAll('span'):
                  if '下一页' not in str(span):
                      continue
                  anchor = span.find('a')
                  if anchor is None or not anchor.get('href'):
                      # The label is present but carries no usable link.
                      continue
                  self.current_page = self.base_url + str(anchor['href'])
                  self.write_log('Going next page...')
                  return True

          self.write_log('This is the latest page')
          self.next_page = ''
          return False

      def _next_file_path(self):
          """Return an unused '<unix-seconds>.jpg' path inside ``image_dir``."""
          stamp = int(time.time())
          file_path = os.path.join(self.image_dir, '%d.jpg' % stamp)
          suffix = 0
          # Only reached when two images land in the same second (for example
          # with a download_delay below one second); avoids overwriting.
          while os.path.exists(file_path):
              suffix += 1
              file_path = os.path.join(self.image_dir, '%d_%d.jpg' % (stamp, suffix))
          return file_path

      def download(self):
          """Download every URL in ``image_links`` into ``image_dir``.

          Returns:
              False when there is nothing to download, otherwise None.

          A link that cannot be fetched or written is logged and skipped; the
          remaining links on the page are still processed. ``image_count`` is
          incremented only for images that were written successfully.
          """
          if not self.image_links:
              return False

          self.write_log('Current page - %s' % self.current_page)

          if not os.path.isdir(self.image_dir):
              os.makedirs(self.image_dir)

          for link in self.image_links:
              try:
                  data = urllib.urlopen(link).read()
              except Exception as e:
                  self.write_log('Connect error:%s' % e)
                  continue
              self.write_log('Downloading... - %s' % link)

              try:
                  with open(self._next_file_path(), 'wb') as image:
                      image.write(data)
              except Exception as e:
                  self.write_log('Download faild:%s' % e)
              else:
                  self.image_count += 1
                  self.write_log('Download Success!-%s' % link)

              # Pause between images, as the original crawler did.
              time.sleep(self.download_delay)

      def write_log(self, text):
          """Print ``text`` and append it, followed by a newline, to ``log_file``.

          Non-string values (for example exception objects) are converted with
          ``str`` first so they can be written to the file.
          """
          if not isinstance(text, str):
              text = str(text)
          if os.name == 'nt':
              # 'cls' only exists on Windows; elsewhere it just prints an error.
              os.system('cls')
          print(text)
          with open(self.log_file, 'a') as log:
              log.write(text)
              log.write('\n')

      def run(self, start_page):
          """Crawl from ``start_page`` until no next page is available.

          Args:
              start_page: URL of the first WAP result page to process.
          """
          self.current_page = start_page

          while True:
              # Collect and download this page's images, then advance.
              self.get_image_links()
              self.download()
              has_next = self.get_next_page()
              self.write_log('Image total:%d' % self.image_count)
              if not has_next:
                  break
              time.sleep(self.page_delay)
  110.              
  111.  
  # Script entry: build the crawler and start from a hard-coded WAP
  # image-search result page.
  # NOTE(review): this runs at import time; there is no __main__ guard.
  start_url = 'http://wap.baidu.com/ssid=0/from=0/bd_page_type=1/uid=wiaui_1315707661_2623/pu=sz%40224_220%2Cusm%401/img?tn=bdwis&word=%E8%8B%8D%E4%BA%95%E7%A9%BA&pn=12&dw=w240&bs=176_208&pinf=6_6_0_@bdwis_@%E8%8B%8D%E4%BA%95%E7%A9%BA_@176_208_@w240&sp=&mid=w240'
  app = BaiduImage()
  app.run(start_page=start_url)