#!/usr/bin/env bash

# build-corpus-from-vitalik.sh - loop through selected directories, and copy html files
#
# Usage: ./build-corpus-from-vitalik.sh
#
# For each entry in DIRECTORIES, every *.html file found (recursively) under
# $WEBSITES/<entry>/general is copied into $CORPUS as <name>-<basename>, where
# <name> is the entry up to its first dot with punctuation removed.
#
# NOTE: only the basename of each file is kept, so two files with the same
# name in different subdirectories map to the same destination and the copy
# made last wins.

# Eric Lease Morgan
# (c) University of Notre Dame; distributed under a GNU Public License

# February 6, 2024 - first cut; not scalable

# Errors are handled explicitly below (warn, keep going, report at the end),
# so -e is intentionally not enabled.
set -uo pipefail

# configure
DIRECTORIES=( vitalik.eth.limo )
WEBSITES='./websites'
CORPUS='./corpus'

#######################################
# Transform a directory name into a simple name: drop everything from the
# first dot onward, then delete any remaining punctuation characters.
# Arguments: $1 - directory name (e.g. vitalik.eth.limo)
# Outputs:   the simple name on stdout (e.g. vitalik)
#######################################
simple_name() {
  local name=${1%%.*}
  printf '%s\n' "${name//[[:punct:]]/}"
}

#######################################
# Copy every *.html file found under a directory into the corpus.
# Arguments: $1 - directory to search (recursively)
#            $2 - corpus (destination) directory; must already exist
#            $3 - prefix for each destination file name
# Returns:   0 if every copy succeeded, 1 if any copy failed
#######################################
copy_html_files() {
  local source_dir=$1 corpus=$2 name=$3
  local file status=0

  # NUL-delimited so that paths containing whitespace survive intact; the
  # pattern is quoted so the shell never expands it against the current
  # directory. Directories that merely happen to be named *.html are skipped,
  # since cp could not copy them anyway.
  while IFS= read -r -d '' file; do
    cp -- "$file" "$corpus/$name-${file##*/}" || status=1
  done < <(find "$source_dir" -name '*.html' ! -type d -print0)

  return "$status"
}

#######################################
# Build the corpus from every configured directory.
# Globals:   DIRECTORIES, WEBSITES, CORPUS (read)
# Outputs:   warnings about missing source directories on stderr
# Returns:   0 on success, 1 if the corpus could not be created or any
#            copy failed
#######################################
main() {
  local directory source_dir status=0

  # make sane
  mkdir -p -- "$CORPUS" || return 1

  # process each directory
  for directory in "${DIRECTORIES[@]}"; do

    # find and process each file in the "general" directory
    source_dir="$WEBSITES/$directory/general"
    if [[ ! -d "$source_dir" ]]; then
      printf 'warning: %s not found; skipping\n' "$source_dir" >&2
      continue
    fi

    # do the work
    copy_html_files "$source_dir" "$CORPUS" "$(simple_name "$directory")" || status=1

  done

  return "$status"
}

main "$@"