#!/bin/bash
# This Source Code Form is subject to the terms of the Mozilla Public
# License, v. 2.0. If a copy of the MPL was not distributed with this
# file, You can obtain one at http://mozilla.org/MPL/2.0/.

# original author: Alice Nodelman
# contributor: Darin Fisher
#
# Takes two inputs:
#   $1 = file containing a list of web pages of the form http://pagename
#   $2 = output file where the list of index files is dumped - useful for
#        placing the list of links into scripts
#
# Web pages are dropped in directories named for their urls.

# Both arguments are required. Report misuse on stderr and with a non-zero
# status so that calling scripts can detect it.
if [ "$#" -ne 2 ]; then
  {
    echo 'missing command line arguments'
    echo
    echo 'usage: getpages.sh inputfile outputfile'
    echo ' inputfile: file containing one url per line of the form http://url'
    echo ' outputfile: file to be created during execution, contains a list of index files one per url'
  } >&2
  exit 1
fi

#######################################
# Rewrites, in place, every regular file under a directory so that existing
# call-outs to the live web are disabled. Image files (*.jpg, *.gif, *.png,
# matched case-insensitively) and *.bak files are left untouched.
# Provided by Darin Fisher.
#
# Arguments: $1 - directory to process
#
# The exclusion tests must come BEFORE the action: find evaluates its
# expression left to right, so an action placed ahead of the tests is applied
# to every regular file and the exclusions have no effect.
#
# The "writeln" rule is applied before the "write" rule; in the other order
# "x.writeln" would be turned into "voidln".
#
# NOTE(review): the '.' in the first three patterns is an unescaped regex
# wildcard, and the '"http' rule also matches the output of the '"https'
# rule. Both are reproduced unchanged from the original - confirm whether
# that is intended.
#
# NOTE(review): the original list of substitutions continued with at least
# one more "-e 's/..." expression that is cut off in this copy of the file.
# The list below is therefore incomplete; restore the rest from version
# control.
#######################################
cleanse_files() {
  find "$1" -type f \
    ! -iname '*.jpg' ! -iname '*.gif' ! -iname '*.png' ! -name '*.bak' \
    -exec perl -pi \
      -e 's/[a-zA-Z0-9_]*.writeln/void/g;' \
      -e 's/[a-zA-Z0-9_]*.write/void/g;' \
      -e 's/[a-zA-Z0-9_]*.open/void/g;' \
      -e 's/"https/"httpsdisabled/gi;' \
      -e 's/"http/"httpdisabled/gi;' \
      {} +
}

# NOTE(review): the main part of the script - everything between the
# substitution list above and the end of the download loop - is missing from
# this copy of the file. Only the tail of the loop survives:
#
#       ... > $2
#   done
#   cd ..
#
# It has deliberately not been reconstructed by guesswork. Restore the section
# from version control and remove the two lines below.
echo 'getpages.sh: the download loop is missing from this copy of the script; restore it from version control' >&2
exit 1