#! /bin/bash
# This Source Code Form is subject to the terms of the Mozilla Public
# License, v. 2.0. If a copy of the MPL was not distributed with this
# file, You can obtain one at http://mozilla.org/MPL/2.0/.


# original author: Alice Nodelman
# contributor: Darin Fisher
#
# takes two inputs, $1 = file containing list of web pages of the form http://pagename
#                   $2 = output file where the list of index files is dumped - useful for placing the list of links into scripts
#
# web pages are dropped into directories named for their urls

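# A hypothetical invocation (file names are examples only):
#     ./getpages.sh pages.txt index_list.txt
# where pages.txt holds one url per line, e.g. http://www.example.com/
# pages are saved under testpages/ and one <url-directory>/<index-file>
# line per page is appended to the output file
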
if [ $# != 2 ]; then
    echo 'missing command line arguments'
    echo
    echo 'usage: getpages.sh inputfile outputfile'
    echo '   inputfile: file containing one url per line of the form http://url'
    echo '   outputfile: file to be created during execution, contains a list of index files one per url'
    exit 1
fi

# generates the list of files to be cleansed (excluding image files)
# disables any existing call-outs to the live web
# provided by Darin Fisher
cleanse_files() {
    # -print0 must come after the tests; placed before them, it fires for
    # every regular file and the image exclusions never take effect
    find "$1" -type f ! -iname \*.jpg ! -iname \*.gif ! -iname \*.png ! -name \*.bak -print0 |
        xargs -0 perl -pi \
            -e 's/[a-zA-Z0-9_]*\.writeln/void/g;' \
            -e 's/[a-zA-Z0-9_]*\.write/void/g;' \
            -e 's/[a-zA-Z0-9_]*\.open/void/g;' \
            -e 's/"https/"httpsdisabled/gi;' \
            -e 's/"http/"httpdisabled/gi;' \
            -e 's/<object/<objectdisabled/gi;' \
            -e 's/<embed/<embeddisabled/gi;' \
            -e 's/load/loaddisabled/g;'
}
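
# For illustration, the substitutions above rewrite a fragment such as
#     document.write('<a href="http://live.example.com/">')
# into
#     void('<a href="httpdisabled://live.example.com/">')
# keeping the recorded copy from calling out to the live web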

mkdir testpages
cd testpages
for URL in $(cat ../"$1"); do
    # strip the leading http:// from the url
    CLEANURL=$(echo "$URL" | sed -e 's/http:\/\/\(.*\)/\1/')
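    # e.g. http://www.example.com/page becomes www.example.com/page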
    # create a directory named for the cleaned url (-p tolerates urls that contain paths)
    echo "grabbing $CLEANURL"
    mkdir -p "$CLEANURL"
    cd "$CLEANURL"
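    # fetch the page plus its requisites (-p), rewriting links to the local
    # copies (-k), spanning hosts (-H), normalizing extensions to .html (-E),
    # ignoring robots.txt and certificate errors, and logging to outputlog.txt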
    ../../wget-1.10-css-parser/src/wget -p -k -H -E -erobots=off --no-check-certificate -U "Mozilla/5.0 (firefox)" --restrict-file-names=windows "$URL" -o outputlog.txt
    # figure out where/what the index file for this page is from the wget output log
    FILENAME=$(grep "saved" outputlog.txt | head -1 | sed -e "s/.*\`\(.*\)\'.*/\1/")
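    # e.g. a wget 1.10 log line of the form
    #     12:00:00 (100 KB/s) - `www.example.com/index.html' saved [1234/1234]
    # yields FILENAME=www.example.com/index.html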
    rm outputlog.txt
    cd ..

    # do the final cleanup of any dangling urls
    # with thanks to Darin Fisher for the code
    cleanse_files "$CLEANURL"

    # add the index file link to the list of index files being generated
    # (note: the current directory is still testpages/, so a relative $2 is created there)
    echo "$CLEANURL/$FILENAME" >> "$2"
done
cd ..