#!/usr/bin/env python

# paragraphs2metadata.py - given a few configurations, output CSV amenable to the reader

# Eric Lease Morgan <emorgan@nd.edu>
# (c) University of Notre Dame; distributed under a GNU Public License

# December 10, 2024 - first investigations


# configure; edit the values below to change what is read and where output lands

# only rows whose category column equals this value are kept
CATEGORY = 'international'

# input CSV; the script reads its category, title, and paragraph columns
PARAGRAPHS = './etc/paragraphs-classified.csv'

# not referenced elsewhere in this script; presumably the permitted values
# of CATEGORY and of the classification labels, respectively -- TODO confirm
CATEGORIES = [
	'international',
	'custom',
]
TYPES = [
	'cooperative',
	'competitive',
	'complementary',
	'combative',
]

# columns removed from the data before the metadata is written
DROP = [ 'length' ]

# suffix appended to each title to name its paragraph file
EXTENSION = '.txt'

# output directory prefix; a hyphen and CATEGORY are appended to it
DIRECTORY = './paragraphs'

# name of the metadata CSV saved inside the output directory
METADATA = 'metadata.csv'

# require
from pandas  import read_csv
from pathlib import Path

# initialize; the input must supply category, title, and paragraph columns
# as well as every column listed in DROP
paragraphs = read_csv( PARAGRAPHS )

# make sane; create the category-specific output directory, along with any
# missing parents so a nested DIRECTORY setting works too
directory = Path( DIRECTORY + '-' + CATEGORY )
directory.mkdir( parents=True, exist_ok=True )

# keep only the rows whose category matches the configured one
paragraphs = paragraphs.loc[ paragraphs[ 'category' ] == CATEGORY ]

# drop the unwanted column(s)
paragraphs = paragraphs.drop( columns=DROP )

# create author and file columns; author is the part of the title before the
# first hyphen, and file is the name the paragraph is saved under
paragraphs[ 'author' ] = paragraphs[ 'title' ].str.split( '-' ).str[ 0 ]
paragraphs[ 'file' ]   = paragraphs[ 'title' ] + EXTENSION

# process each row; reuse the file column so the names on disk always agree
# with the names listed in the metadata
for name, text in zip( paragraphs[ 'file' ], paragraphs[ 'paragraph' ] ) :

	# save the given paragraph; read_csv decodes UTF-8 by default, so write
	# UTF-8 explicitly instead of relying on the platform's locale encoding
	( directory/name ).write_text( text, encoding='utf-8' )

# output metadata; hand the path to to_csv so pandas manages newlines itself,
# because writing the returned string through a text-mode handle doubles the
# carriage returns on platforms where os.linesep is '\r\n'
paragraphs.to_csv( directory/METADATA, index=False, encoding='utf-8' )

# done; exit() is an interactive helper installed by the site module, so
# raise SystemExit directly to end with the same (zero) status
raise SystemExit
