testing/tools/grabber/getpages.sh


author
Michael Schloh von Bennewitz <michael@schloh.com>
date
Thu, 22 Jan 2015 13:21:57 +0100
branch
TOR_BUG_9701
changeset
15:b8a032363ba2
permissions
-rwxr-xr-x

Incorporate requested changes from Mozilla in review:
https://bugzilla.mozilla.org/show_bug.cgi?id=1123480#c6

#! /bin/bash
# This Source Code Form is subject to the terms of the Mozilla Public
# License, v. 2.0. If a copy of the MPL was not distributed with this
# file, You can obtain one at http://mozilla.org/MPL/2.0/.

#original author: Alice Nodelman
#contributor: Darin Fisher
#
#takes two inputs, $1 = file containing list of web pages of form http://pagename
#                  $2 = output file where the list of index files is dumped - useful for placing lists of links into scripts
#
# web pages are dropped in directories named for their urls

if [ $# -ne 2 ]; then
	echo 'missing command line arguments'
	echo
	echo 'usage: getpages.sh inputfile outputfile'
	echo '	inputfile: file containing one url per line of the form http://url'
	echo '	outputfile: file to be created during execution, contains a list of index files one per url'
	exit 1
fi
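
# example invocation (file names here are hypothetical):
#   ./getpages.sh pagelist.txt indexlist.txt
# where pagelist.txt holds one url per line, such as http://example.com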

# generates the list of files to be cleansed (exclude image files)
# disables any existing call-outs to the live web
# provided by Darin Fisher
cleanse_files() {
	  find "$1" -type f ! -iname \*.jpg -a ! -iname \*.gif -a ! -iname \*.png -a ! -name \*.bak -print0 | xargs -0 perl -pi -e 's/[a-zA-Z0-9_]*\.writeln/void/g;' -e 's/[a-zA-Z0-9_]*\.write/void/g;' -e 's/[a-zA-Z0-9_]*\.open/void/g;' -e 's/"https/"httpsdisabled/gi;' -e 's/"http/"httpdisabled/gi;' -e 's/<object/<objectdisabled/gi;' -e 's/<embed/<embeddisabled/gi;' -e 's/load/loaddisabled/g;'
}
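
# for illustration, on a hypothetical saved page the substitutions turn
#   document.writeln("<a href="http://...">")
# into
#   void("<a href="httpdisabled://...">")
# so replayed copies cannot call out to the live web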

mkdir -p testpages
cd testpages || exit 1
for URL in $(cat "../$1"); do
	#strip the leading http:// from the url
	CLEANURL=$(echo "$URL" | sed -e 's/http:\/\/\(.*\)/\1/')
	#create a directory with the cleaned url as the name
	echo "grabbing $CLEANURL"
	mkdir -p "$CLEANURL"
	cd "$CLEANURL" || continue
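	# wget flags used here: -p fetches page requisites, -k converts links
	# for local viewing, -H spans hosts, -E appends .html extensions,
	# -erobots=off ignores robots.txt, and -U sets the user agent; the
	# css-parser wget build is assumed to live two directories up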
	../../wget-1.10-css-parser/src/wget -p -k -H -E -erobots=off --no-check-certificate -U "Mozilla/5.0 (firefox)" --restrict-file-names=windows "$URL" -o outputlog.txt
	#figure out where/what the index file for this page is from the wget output log
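	# the sed pattern assumes the wget 1.10 log format, which wraps the
	# saved name in `...' quotes, e.g. (timestamp and sizes illustrative):
	#   12:00:00 (100 KB/s) - `index.html' saved [1024/1024]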
	FILENAME=$(grep "saved" outputlog.txt | head -1 | sed -e "s/.*\`\(.*\)\'.*/\1/")
	rm outputlog.txt
	cd ..

	#do the final cleanup of any dangling urls
	#with thanks to Darin Fisher for the code
	cleanse_files "$CLEANURL"

	#add the index file link to the list of index files being generated
	echo "$CLEANURL/$FILENAME" >> "$2"
done
cd ..
