#!/usr/bin/env bash

# wget.sh - a front-end to GNU Wget; cache content from a given set of URLs
#
# usage:   ./bin/wget.sh <url>
# example: cat ./etc/websites.txt | parallel --jobs 8 ./bin/wget.sh
#
# The given URL is mirrored into $CORPUS (created relative to the directory
# the script is started from) and wget's stderr is appended to $LOG.
# The script exits with wget's exit status.

# Eric Lease Morgan
# (c) University of Notre Dame; distributed under a GNU Public License

# December 15, 2023 - first cut; while at the cabin


# configure
# The wget options live in an array so each one reaches wget as exactly one
# argument. When a command is stored in a string and expanded unquoted, the
# quotes embedded in the string are passed to the program literally and the
# words are subject to glob expansion.
# NOTE(review): wget may honor only the last --reject-regex given, and a
# pattern starting with "*" is glob-like rather than a regular expression;
# confirm against the wget manual before relying on both filters.
readonly -a WGET_OPTIONS=(
  --no-clobber
  --convert-links
  --random-wait
  --reject-regex '*html.gz*'
  --reject-regex '*/_next/*'
  -np -r -p -E
  -e robots=off
  --continue
  --reject jpg,jpeg,png
  -U mozilla
)
readonly CORPUS='./websites'
readonly LOG='/shared/projects/nabrzyski-blockchain-2024/logs/wget.log'

# print a message to stderr and stop with a non-zero status
die() {
  printf '%s: %s\n' "$0" "$*" >&2
  exit 1
}

# sanity check; a missing URL is an error, so report failure to the caller
if [[ -z "${1:-}" ]]; then
  printf 'Usage: %s <url>\n' "$0" >&2
  exit 1
fi

# get input
url=$1

# make sane; stop early rather than mirror into the wrong directory or run
# without a usable log
touch -- "$LOG" || die "cannot write log file $LOG"
mkdir -p -- "$CORPUS" || die "cannot create directory $CORPUS"
cd -- "$CORPUS" || die "cannot change to directory $CORPUS"

# do the work; the URL is passed verbatim as a single argument, and "--"
# keeps a URL beginning with "-" from being read as an option
wget "${WGET_OPTIONS[@]}" -- "$url" 2>>"$LOG"

# done; a bare exit propagates the status of wget
exit