#!/usr/bin/env bash

# build-corpus.sh - loop through selected directories, and copy html files
#
# For every configured website directory, find each *.html file beneath its
# "t" subdirectory and copy it into the corpus directory, prefixing the file
# name with a simplified form of the website's name.
#
# Usage: ./build-corpus.sh   (run from the directory containing ./websites)

# Eric Lease Morgan
# (c) University of Notre Dame; distributed under a GNU Public License

# December 18, 2023 - first cut; use bash for file manipulation

# catch typos in variable names; failures of individual commands are handled
# explicitly below so that one bad file does not abort the whole run
set -u

# configure
DIRECTORIES=( ethereum-magicians.org ethresear.ch swarmresear.ch )
WEBSITES='./websites'
CORPUS='./corpus'
T='t'

#######################################
# Transform a website directory name into a simple name: drop everything
# from the first dot onward, then delete all punctuation characters.
# Arguments: $1 - directory name, e.g. "ethereum-magicians.org"
# Outputs:   the simple name on stdout, e.g. "ethereummagicians"
#######################################
simple_name() {
  local name=${1%%.*}
  printf '%s\n' "${name//[[:punct:]]/}"
}

#######################################
# Copy every *.html file found under $WEBSITES/<directory>/$T into $CORPUS
# as "<simple name>-<file name>".
# Globals:   DIRECTORIES, WEBSITES, CORPUS, T (all read)
# Outputs:   warnings for missing source directories on stderr
# Returns:   0 on success; non-zero if $CORPUS could not be created or any
#            copy failed
#######################################
build_corpus() {
  local directory name source file
  local status=0

  # make sane
  mkdir -p -- "$CORPUS" || return

  # process each directory
  for directory in "${DIRECTORIES[@]}"; do

    # re-initialize; transform directory into simple name
    name=$(simple_name "$directory")
    source="$WEBSITES/$directory/$T"

    # a website that has not been mirrored is skipped, not fatal
    if [[ ! -d "$source" ]]; then
      printf 'warning: %s is not a directory; skipping\n' "$source" >&2
      continue
    fi

    # find and process each file in the magic "t" directory; the pattern is
    # quoted so the shell cannot expand it against the current directory, and
    # the results are NUL-delimited so names with whitespace survive intact
    while IFS= read -r -d '' file; do

      # do the work; the destination keeps only the file's base name
      # NOTE: files with the same base name in different subdirectories
      # map to the same destination, and the later copy wins
      cp -- "$file" "$CORPUS/$name-${file##*/}" || status=1

    done < <(find "$source" -name '*.html' -print0)

  done

  return "$status"
}

# done; the script's exit status is that of build_corpus
build_corpus