/spiderTest/spider/thingSpider.py

https://github.com/enjrolas/Makerbot-Vending-Machine · Python · 138 lines · 104 code · 26 blank · 8 comment · 19 complexity · 94f69c9d416597de6a7f0feaeae044fa MD5 · raw file

  1. import urllib
  2. import urllib2
  3. import re
  4. from BeautifulSoup import BeautifulSoup, SoupStrainer, NavigableString
  5. import os, errno
  6. from urlparse import urlparse
  7. def ensure_dir(dir):
  8. if not os.path.exists(dir):
  9. os.makedirs(dir)
  10. def download(URL, filename):
  11. u = urllib2.urlopen(URL)
  12. f = open(filename, 'wb')
  13. meta = u.info()
  14. file_size = int(meta.getheaders("Content-Length")[0])
  15. print "Downloading: %s Bytes: %s" % (filename, file_size)
  16. file_size_dl = 0
  17. block_sz = 8192
  18. while True:
  19. buffer = u.read(block_sz)
  20. if not buffer:
  21. break
  22. file_size_dl += len(buffer)
  23. f.write(buffer)
  24. status = r"%10d [%3.2f%%]" % (file_size_dl, file_size_dl * 100. / file_size)
  25. status = status + chr(8)*(len(status)+1)
  26. print status,
  27. f.close()
def grabThing(thingId):
    # Scrape one Thingiverse "thing" page, download its .stl files and gallery
    # images into ../Things/<thingId>/, and write a small <thing> XML summary
    # there. Python 2 / BeautifulSoup 3 code; all parsing below depends on the
    # circa-2010 Thingiverse page layout.
    opener=urllib.FancyURLopener({})
    url="http://www.thingiverse.com/thing:"+str(thingId)
    f=opener.open(url)
    # f=open("11816.html",'r')
    page=f.read()
    soup=BeautifulSoup(page)
    hasStlFiles=False
    #title and author
    titleTag=soup.find('title')
    if titleTag!=None:
        # First word of the page <title>; assumes Thingiverse error pages
        # start their title with "Error" -- verify against current markup.
        title=titleTag.contents[0].string.split(" ")[0]
        title=title.encode('ascii','ignore').strip()
        if title != "Error": #the thing exists
            directoryPath="../Things/"+str(thingId)
            author=""
            print title
            parts=soup.findAll('h1')
            if len(parts)>1:
                # Rebuild the full multi-word title from the <title> text,
                # stopping at the "by" token that precedes the author name.
                titleString=titleTag.contents[0].string
                titleParts=titleString.split(" ")
                title=""
                for titlePart in titleParts:
                    if titlePart=="by":
                        break
                    else:
                        title+=titlePart+" "
                print title
                title=title.strip()
                # Presumably the second <h1> contains the author link --
                # layout-dependent, confirm against a live page.
                link=parts[1].a;
                if link!=None:
                    #stl files
                    parts=soup.findAll('h3', 'file_info')
                    print parts
                    for part in parts:
                        filename=part.contents[0].strip()
                        if filename.find(".stl")!=-1:
                            ensure_dir(directoryPath)
                            print directoryPath+"/"+filename
                            hasStlFiles=True
                            # Download URL sits on the first <a> of the
                            # grandparent element; it is site-relative, so
                            # prefix the host.
                            downloadURL=part.parent.parent.a['href']
                            downloadURL="http://www.thingiverse.com"+downloadURL
                            print filename+" "+downloadURL
                            urllib.urlretrieve(downloadURL, directoryPath+"/"+filename)
                            # download(downloadURL, directoryPath+"/"+filename)
                    author=link.contents[0].string
            #description
            parts=soup.findAll('h2')
            if len(parts)>1:
                # Description block is two siblings past the second <h2>;
                # concatenate the text of its children, dropping tabs.
                descriptionTags=parts[1].nextSibling.nextSibling.contents
                description=""
                for tag in descriptionTags:
                    text=tag.string
                    if text!=None:
                        text=text.replace('\t','')
                        text=text.strip()
                        description+=text
                descriptionTag="<description>"+description+"</description>"
                descriptionTag=descriptionTag.encode('ascii','ignore')
                print descriptionTag
            if hasStlFiles:
                #create directory if it does not exist, and open xml file
                outputFile=open(directoryPath+"/"+str(thingId)+".xml",'w')
                #download images
                gallery=soup.findAll(rel="gallery[thing]")
                if len(gallery)>0:
                    pictureIndex=0
                    for picture in gallery:
                        imageURL=picture["href"]
                        # Keep the source image's file extension.
                        type=imageURL.split(".")[-1]
                        filename=str(pictureIndex)+"."+type
                        urllib.urlretrieve(imageURL, directoryPath+"/"+filename)
                        pictureIndex+=1
                # NOTE(review): descriptionTag is only bound when the page has
                # a second <h2>; this write can raise NameError otherwise --
                # confirm against real pages. outputFile is never closed.
                outputFile.write("<thing>")
                outputFile.write("<id>"+str(thingId)+"</id>")
                print "<title>"+title+"</title>"
                outputFile.write("<title>"+title+"</title>")
                print "<author>"+author+"</author>"
                outputFile.write("<author>"+author+"</author>")
                outputFile.write(descriptionTag)
                outputFile.write("</thing>")
  109. for i in range(366,1000):
  110. print "Trying..."+str(i)
  111. grabThing(i)
  112. #grabThing(157)