/extra/robots/robots.txt

http://github.com/abeaumont/factor

# robots.txt
Sitemap: http://www.chiplist.com/sitemap.txt
User-Agent: *
Disallow: /cgi-bin/
Disallow: /scripts/
Disallow: /ChipList2/scripts/
#Disallow: /styles/
Disallow: /ChipList2/styles/
Disallow: /ads/
Disallow: /ChipList2/ads/
Disallow: /advertisements/
Disallow: /ChipList2/advertisements/
Disallow: /graphics/
Disallow: /ChipList2/graphics/
#Disallow: /ChipList1/
# robots.txt for http://www.wikipedia.org/ and friends
#
# Please note: There are a lot of pages on this site, and there are
# some misbehaved spiders out there that go _way_ too fast. If you're
# irresponsible, your access to the site may be blocked.
# Inktomi's "Slurp" can read a minimum delay between hits; if your
# bot supports such a thing using the 'Crawl-delay' or another
# instruction, please let us know.
# *at least* 1 second please. preferably more :D
#User-agent: *
Crawl-delay: 1
Request-rate: 1/1
Visit-time: 0200-0500
# Crawlers that are kind enough to obey, but which we'd rather not have
# unless they're feeding search engines.
User-agent: UbiCrawler
Disallow: /
User-agent: DOC
Disallow: /
User-agent: Zao
Disallow: /
# Some bots are known to be trouble, particularly those designed to copy
# entire sites. Please obey robots.txt.
User-agent: sitecheck.internetseer.com
Disallow: /
User-agent: Zealbot
Disallow: /
User-agent: MSIECrawler
Disallow: /
User-agent: SiteSnagger
Disallow: /
User-agent: WebStripper
Disallow: /
User-agent: WebCopier
Disallow: /
User-agent: Fetch
Disallow: /
User-agent: Offline Explorer
Disallow: /
User-agent: Teleport
Disallow: /
User-agent: TeleportPro
Disallow: /
User-agent: WebZIP
Disallow: /
User-agent: linko
Disallow: /
User-agent: HTTrack
Disallow: /
User-agent: Microsoft.URL.Control
Disallow: /
User-agent: Xenu
Disallow: /
User-agent: larbin
Disallow: /
User-agent: libwww
Disallow: /
User-agent: ZyBORG
Disallow: /
User-agent: Download Ninja
Disallow: /
#
# Sorry, wget in its recursive mode is a frequent problem.
# Please read the man page and use it properly; there is a
# --wait option you can use to set the delay between hits,
# for instance.
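# (Illustrative example, not part of the original file: a politer recursive
# fetch might look like "wget --recursive --wait=2 http://www.example.org/",
# which pauses two seconds between requests; the host name is a placeholder.)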
#
User-agent: wget
Disallow: /
#
# The 'grub' distributed client has been *very* poorly behaved.
#
User-agent: grub-client
Disallow: /
#
# Doesn't follow robots.txt anyway, but...
#
User-agent: k2spider
Disallow: /
#
# Hits many times per second, not acceptable
# http://www.nameprotect.com/botinfo.html
User-agent: NPBot
Disallow: /
# A capture bot, downloads gazillions of pages with no public benefit
# http://www.webreaper.net/
User-agent: WebReaper
Disallow: /
# Provided courtesy of http://browsers.garykeith.com.
# Created on February 13, 2008 at 7:39:00 PM GMT.
#
# Place this file in the root public folder of your website.
# It will stop the following bots from indexing your website.
#
User-agent: abot
User-agent: ALeadSoftbot
User-agent: BeijingCrawler
User-agent: BilgiBot
User-agent: bot
User-agent: botlist
User-agent: BOTW Spider
User-agent: bumblebee
User-agent: Bumblebee
User-agent: BuzzRankingBot
User-agent: Charlotte
User-agent: Clushbot
User-agent: Crawler
User-agent: CydralSpider
User-agent: DataFountains
User-agent: DiamondBot
User-agent: Dulance bot
User-agent: DYNAMIC
User-agent: EARTHCOM.info
User-agent: EDI
User-agent: envolk
User-agent: Exabot
User-agent: Exabot-Images
User-agent: Exabot-Test
User-agent: exactseek-pagereaper
User-agent: Exalead NG
User-agent: FANGCrawl
User-agent: Feed::Find
User-agent: flatlandbot
User-agent: Gigabot
User-agent: GigabotSiteSearch
User-agent: GurujiBot
User-agent: Hatena Antenna
User-agent: Hatena Bookmark
User-agent: Hatena RSS
User-agent: HatenaScreenshot
User-agent: Helix
User-agent: HiddenMarket
User-agent: HyperEstraier
User-agent: iaskspider
User-agent: IIITBOT
User-agent: InfociousBot
User-agent: iVia
User-agent: iVia Page Fetcher
User-agent: Jetbot
User-agent: Kolinka Forum Search
User-agent: KRetrieve
User-agent: LetsCrawl.com
User-agent: Lincoln State Web Browser
User-agent: Links4US-Crawler
User-agent: LOOQ
User-agent: Lsearch/sondeur
User-agent: MapoftheInternet.com
User-agent: NationalDirectory
User-agent: NetCarta_WebMapper
User-agent: NewsGator
User-agent: NextGenSearchBot
User-agent: ng
User-agent: nicebot
User-agent: NP
User-agent: NPBot
User-agent: Nudelsalat
User-agent: Nutch
User-agent: OmniExplorer_Bot
User-agent: OpenIntelligenceData
User-agent: Oracle Enterprise Search
User-agent: Pajaczek
User-agent: panscient.com
User-agent: PeerFactor 404 crawler
User-agent: PeerFactor Crawler
User-agent: PlantyNet
User-agent: PlantyNet_WebRobot
User-agent: plinki
User-agent: PMAFind
User-agent: Pogodak!
User-agent: QuickFinder Crawler
User-agent: Radiation Retriever
User-agent: Reaper
User-agent: RedCarpet
User-agent: ScorpionBot
User-agent: Scrubby
User-agent: Scumbot
User-agent: searchbot
User-agent: Seeker.lookseek.com
User-agent: SeznamBot
User-agent: ShowXML
User-agent: snap.com
User-agent: snap.com beta crawler
User-agent: Snapbot
User-agent: SnapPreviewBot
User-agent: sohu
User-agent: SpankBot
User-agent: Speedy Spider
User-agent: Speedy_Spider
User-agent: SpeedySpider
User-agent: spider
User-agent: SquigglebotBot
User-agent: SurveyBot
User-agent: SynapticSearch
User-agent: T-H-U-N-D-E-R-S-T-O-N-E
User-agent: Talkro Web-Shot
User-agent: Tarantula
User-agent: TerrawizBot
User-agent: TheInformant
User-agent: TMCrawler
User-agent: TridentSpider
User-agent: Tutorial Crawler
User-agent: Twiceler
User-agent: unwrapbot
User-agent: URI::Fetch
User-agent: VengaBot
User-agent: Vonna.com b o t
User-agent: Vortex
User-agent: Votay bot
User-agent: WebAlta Crawler
User-agent: Webbot
User-agent: Webclipping.com
User-agent: WebCorp
User-agent: Webinator
User-agent: WIRE
User-agent: WISEbot
User-agent: Xerka WebBot
User-agent: XSpider
User-agent: YodaoBot
User-agent: Yoono
User-agent: yoono
Disallow: /