/spiderTest/spider/thingSpider.py

https://github.com/enjrolas/Makerbot-Vending-Machine · Python · 138 lines · 104 code · 26 blank · 8 comment · 19 complexity · 94f69c9d416597de6a7f0feaeae044fa MD5 · raw file

  1. import urllib
  2. import urllib2
  3. import re
  4. from BeautifulSoup import BeautifulSoup, SoupStrainer, NavigableString
  5. import os, errno
  6. from urlparse import urlparse
  7. def ensure_dir(dir):
  8. if not os.path.exists(dir):
  9. os.makedirs(dir)
  10. def download(URL, filename):
  11. u = urllib2.urlopen(URL)
  12. f = open(filename, 'wb')
  13. meta = u.info()
  14. file_size = int(meta.getheaders("Content-Length")[0])
  15. print "Downloading: %s Bytes: %s" % (filename, file_size)
  16. file_size_dl = 0
  17. block_sz = 8192
  18. while True:
  19. buffer = u.read(block_sz)
  20. if not buffer:
  21. break
  22. file_size_dl += len(buffer)
  23. f.write(buffer)
  24. status = r"%10d [%3.2f%%]" % (file_size_dl, file_size_dl * 100. / file_size)
  25. status = status + chr(8)*(len(status)+1)
  26. print status,
  27. f.close()
def grabThing(thingId):
    # Scrape one Thingiverse "thing" page, download its .stl files and gallery
    # images into ../Things/<thingId>/, and write a small <thing> XML summary
    # there. Python 2 / BeautifulSoup 3 code; all parsing below depends on the
    # circa-2010 Thingiverse page layout.
    opener=urllib.FancyURLopener({})
    url="http://www.thingiverse.com/thing:"+str(thingId)
    f=opener.open(url)
    # f=open("11816.html",'r')
    page=f.read()
    soup=BeautifulSoup(page)
    hasStlFiles=False
    #title and author
    titleTag=soup.find('title')
    if titleTag!=None:
        # First word of the page <title>; assumes Thingiverse error pages
        # start their title with "Error" -- verify against current markup.
        title=titleTag.contents[0].string.split(" ")[0]
        title=title.encode('ascii','ignore').strip()
        if title != "Error": #the thing exists
            directoryPath="../Things/"+str(thingId)
            author=""
            print title
            parts=soup.findAll('h1')
            if len(parts)>1:
                # Rebuild the full multi-word title from the <title> text,
                # stopping at the "by" token that precedes the author name.
                titleString=titleTag.contents[0].string
                titleParts=titleString.split(" ")
                title=""
                for titlePart in titleParts:
                    if titlePart=="by":
                        break
                    else:
                        title+=titlePart+" "
                print title
                title=title.strip()
                # Presumably the second <h1> contains the author link --
                # layout-dependent, confirm against a live page.
                link=parts[1].a;
                if link!=None:
                    #stl files
                    parts=soup.findAll('h3', 'file_info')
                    print parts
                    for part in parts:
                        filename=part.contents[0].strip()
                        if filename.find(".stl")!=-1:
                            ensure_dir(directoryPath)
                            print directoryPath+"/"+filename
                            hasStlFiles=True
                            # Download URL sits on the first <a> of the
                            # grandparent element; it is site-relative, so
                            # prefix the host.
                            downloadURL=part.parent.parent.a['href']
                            downloadURL="http://www.thingiverse.com"+downloadURL
                            print filename+" "+downloadURL
                            urllib.urlretrieve(downloadURL, directoryPath+"/"+filename)
                            # download(downloadURL, directoryPath+"/"+filename)
                    author=link.contents[0].string
            #description
            parts=soup.findAll('h2')
            if len(parts)>1:
                # Description block is two siblings past the second <h2>;
                # concatenate the text of its children, dropping tabs.
                descriptionTags=parts[1].nextSibling.nextSibling.contents
                description=""
                for tag in descriptionTags:
                    text=tag.string
                    if text!=None:
                        text=text.replace('\t','')
                        text=text.strip()
                        description+=text
                descriptionTag="<description>"+description+"</description>"
                descriptionTag=descriptionTag.encode('ascii','ignore')
                print descriptionTag
            if hasStlFiles:
                #create directory if it does not exist, and open xml file
                outputFile=open(directoryPath+"/"+str(thingId)+".xml",'w')
                #download images
                gallery=soup.findAll(rel="gallery[thing]")
                if len(gallery)>0:
                    pictureIndex=0
                    for picture in gallery:
                        imageURL=picture["href"]
                        # Keep the source image's file extension.
                        type=imageURL.split(".")[-1]
                        filename=str(pictureIndex)+"."+type
                        urllib.urlretrieve(imageURL, directoryPath+"/"+filename)
                        pictureIndex+=1
                # NOTE(review): descriptionTag is only bound when the page has
                # a second <h2>; this write can raise NameError otherwise --
                # confirm against real pages. outputFile is never closed.
                outputFile.write("<thing>")
                outputFile.write("<id>"+str(thingId)+"</id>")
                print "<title>"+title+"</title>"
                outputFile.write("<title>"+title+"</title>")
                print "<author>"+author+"</author>"
                outputFile.write("<author>"+author+"</author>")
                outputFile.write(descriptionTag)
                outputFile.write("</thing>")
  109. for i in range(366,1000):
  110. print "Trying..."+str(i)
  111. grabThing(i)
  112. #grabThing(157)