
/testing/tools/grabber/getpages.sh

https://bitbucket.org/careytilden/mozilla-release
Shell | 50 lines | 26 code | 7 blank | 17 comment | 2 complexity | 16fdf0d96cea1844c3ab124fca39032c MD5
Possible License(s): Apache-2.0, LGPL-2.1, 0BSD, AGPL-1.0, BSD-3-Clause, GPL-2.0, LGPL-3.0, MIT, MPL-2.0-no-copyleft-exception, JSON
#! /bin/bash
#original author: Alice Nodelman
#contributor: Darin Fisher
#
#takes two inputs, $1 = file containing list of web pages of form http://pagename
#                  $2 = output file where the list of index files is dumped - useful for placing the list of links into scripts
#
# web pages are dropped in directories named for their urls
if [ $# != 2 ]; then
  echo 'missing command line arguments'
  echo
  echo 'usage: getpages.sh inputfile outputfile'
  echo ' inputfile: file containing one url per line of the form http://url'
  echo ' outputfile: file to be created during execution, contains a list of index files one per url'
  exit 1
fi
# generates the list of files to be cleansed (exclude image files)
# disables any existing call-outs to the live web
# provided by Darin Fisher
cleanse_files() {
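  # the perl pass voids out *.write/*.writeln/*.open calls, rewrites quoted
  # http/https prefixes, and renames <object>, <embed>, and load references so
  # the saved copies cannot call back out to the live network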
  21. find "$1" -type f -a -print0 ! -iname \*.jpg -a ! -iname \*.gif -a ! -iname \*.png -a ! -name \*.bak | xargs -0 perl -pi -e 's/[a-zA-Z0-9_]*.writeln/void/g;' -e 's/[a-zA-Z0-9_]*.write/void/g;' -e 's/[a-zA-Z0-9_]*.open/void/g;' -e 's/"https/"httpsdisabled/gi;' -e 's/"http/"httpdisabled/gi;' -e 's/<object/<objectdisabled/gi;' -e 's/<embed/<embeddisabled/gi;' -e 's/load/loaddisabled/g;'
  22. }
mkdir testpages
cd testpages
for URL in $(cat ../$1); do
  #strip the leading http:// from the url
  CLEANURL=$(echo $URL | sed -e 's/http:\/\/\(.*\)/\1/')
  #create a directory with the cleaned url as the name
  echo "grabbing "$CLEANURL
  mkdir $CLEANURL
  cd $CLEANURL
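  # wget flags: -p grabs page requisites, -k rewrites links for local viewing,
  # -H spans hosts, -E appends .html where needed, -erobots=off ignores
  # robots.txt, and -o writes the transfer log used below to find the index file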
  ../../wget-1.10-css-parser/src/wget -p -k -H -E -erobots=off --no-check-certificate -U "Mozilla/5.0 (firefox)" --restrict-file-names=windows $URL -o outputlog.txt
  #figure out where/what the index file for this page is from the wget output log
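  # wget 1.10 logs each completed download as `path' saved [size]; the first
  # such line names this page's index file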
  FILENAME=$(grep "saved" outputlog.txt | head -1 | sed -e "s/.*\`\(.*\)\'.*/\1/")
  rm outputlog.txt
  cd ..
  #do the final cleanup of any dangling urls
  #with thanks to Darin Fisher for the code
  cleanse_files $CLEANURL
  #add the index file link to the list of index files being generated
  echo $CLEANURL/$FILENAME >> $2
done
cd ..
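
For reference, a minimal invocation sketch; urls.txt and pagelist.txt are placeholder names, and it assumes the patched wget-1.10-css-parser checkout has been built in the directory the script is run from (the script resolves the binary as ../../wget-1.10-css-parser/src/wget from inside each page directory):

# hypothetical input file: one url per line
cat > urls.txt <<EOF
http://www.example.com
http://www.example.org
EOF

# pages are mirrored under ./testpages/<url>/; note that, as written, the
# script emits the index-file list from inside testpages/, so pagelist.txt
# appears there unless an absolute path is given
bash getpages.sh urls.txt pagelist.txt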