#!/usr/bin/env python

# files2sheet.py - given a configured directory of files, split them into paragraphs for Google Sheets

# Eric Lease Morgan <emorgan@nd.edu>
# (c) University of Notre Dame; distributed under a GNU Public License

# August 20, 2024 - first investigation
# August 22, 2024 - a bit behind schedule


# configure; edit these values to point the script at a different set of files

# directory searched for the files to process
CONSTITUTIONS = './both-new'

# glob pattern selecting which files in that directory are read
PATTERN = '*.txt'

# column names of the resulting CSV, in the same order as each row's values
COLUMNS = [
	'label',
	'length',
	'classification',
	'type',
	'paragraph',
]

# format applied to a paragraph's number when building its label; zero-padded to four digits
FORMAT = '{:04d}'

# text separating one paragraph from the next, namely a blank line
DELIMITER = '\n\n'

# value of the type column unless a paragraph is too short (see MINIMUM)
TYPE = 'unknown'

# paragraphs of this many whitespace-delimited words or fewer are typed as NEITHER
MINIMUM = 2

# value of the type column for paragraphs at or below MINIMUM words
NEITHER = 'neither'

# require
from pathlib import Path
from pandas  import DataFrame

# initialize
constitutions  = Path( CONSTITUTIONS )
classification = 'unknown'   # placeholder written to the classification column of every row
encoding       = 'utf-8'     # assumes the files are UTF-8 encoded; adjust if the corpus differs

# loop through each of the given files; create a list of labeled paragraphs;
# glob yields files in arbitrary order, so sort to make the output reproducible
rows = []
for constitution in sorted( constitutions.glob( PATTERN ) ) :

	# the file's name, sans extension, prefixes each of its paragraphs' labels
	title = constitution.stem

	# read the whole constitution, and split it into paragraphs
	paragraphs = constitution.read_text( encoding=encoding ).split( DELIMITER )

	# loop through each paragraph, numbering them from 1
	for index, paragraph in enumerate( paragraphs, start=1 ) :

		# format data; the label is the title plus the paragraph's number, and length is a count of words
		label  = '-'.join( [ title, FORMAT.format( index ) ] )
		length = len( paragraph.split() )

		# paragraphs too short to be meaningful (including empty ones) are typed accordingly
		kind = NEITHER if length <= MINIMUM else TYPE

		# update; the order of values matches COLUMNS
		rows.append( [ label, length, classification, kind, paragraph ] )

# create CSV and output; to_csv already ends its result with a newline, so do not let print add another
sheet = DataFrame( rows, columns=COLUMNS )
print( sheet.to_csv( index=False ), end='' )

# done; exit() is intended for the interactive interpreter, so signal a normal exit directly
raise SystemExit