#!/usr/bin/env python

# paragraphs2metadata.py - given a few configurations, output CSV amenable to the reader
#
# Reads the classified-paragraphs CSV, keeps only the rows whose 'category'
# equals CATEGORY, writes each kept paragraph to its own text file inside
# "<DIRECTORY>-<CATEGORY>", and finally writes a metadata CSV describing
# those files into the same directory.

# Eric Lease Morgan
# (c) University of Notre Dame; distributed under a GNU Public License

# December 10, 2024 - first investigations


# configure
CATEGORY = 'international'                       # value of the 'category' column to keep
PARAGRAPHS = './etc/paragraphs-classified.csv'   # input CSV
CATEGORIES = ['international', 'custom']         # not referenced by this script
TYPES = ['cooperative', 'competitive', 'complementary', 'combative']  # not referenced by this script
DROP = ['length']                                # columns omitted from the metadata
EXTENSION = '.txt'                               # suffix of each paragraph file
DIRECTORY = './paragraphs'                       # output directory prefix
METADATA = 'metadata.csv'                        # name of the metadata file
ENCODING = 'utf-8'                               # encoding used for every file written

# require
import sys
from pathlib import Path

from pandas import read_csv

# initialize
paragraphs = read_csv(PARAGRAPHS)

# make sane; the output directory is named after the selected category
directory = Path(DIRECTORY + '-' + CATEGORY)
directory.mkdir(parents=True, exist_ok=True)

# keep only the rows of the configured category; take an explicit copy so
# the column assignments below operate on an independent frame
paragraphs = paragraphs.loc[paragraphs['category'] == CATEGORY].copy()

# drop the columns not wanted in the metadata
paragraphs = paragraphs.drop(columns=DROP)

# create author and file columns; the author is the part of the title
# preceding the first hyphen, and the file is the title plus the extension
paragraphs['author'] = paragraphs['title'].str.split('-').str[0]
paragraphs['file'] = paragraphs['title'] + EXTENSION

# save each paragraph under the very file name recorded in the metadata;
# the encoding is explicit so the result does not depend on the locale
for name, text in zip(paragraphs['file'], paragraphs['paragraph']):
    (directory / name).write_text(text, encoding=ENCODING)

# output metadata and done; pandas writes the file itself so it controls
# both the encoding and the line terminators
paragraphs.to_csv(directory / METADATA, index=False, encoding=ENCODING)
sys.exit(0)