#!/usr/bin/env python

# files2sheet.py - given a configured directory of files, split them into paragraphs for Google Sheets

# Eric Lease Morgan
# (c) University of Notre Dame; distributed under a GNU Public License

# August 20, 2024 - first investigation
# August 22, 2024 - a bit behind schedule


# require
from pathlib import Path

from pandas import DataFrame

# configure
CONSTITUTIONS = './both-new'
PATTERN = '*.txt'
COLUMNS = ['label', 'length', 'classification', 'type', 'paragraph']
FORMAT = '{:04d}'
DELIMITER = '\n\n'
CLASSIFICATION = 'unknown'
TYPE = 'unknown'
MINIMUM = 2
NEITHER = 'neither'
ENCODING = 'utf-8'


def text2rows(title, text):
    """Split *text* into paragraphs and return one spreadsheet row per paragraph.

    Each row is a list ordered like COLUMNS:
    [label, length, classification, type, paragraph].

    * label is "<title>-<NNNN>", where NNNN is the 1-based paragraph number
      rendered with FORMAT.
    * length is the number of whitespace-separated tokens in the paragraph.
    * classification is always CLASSIFICATION.
    * type is NEITHER when length is MINIMUM or less, otherwise TYPE.

    Paragraphs are whatever lies between occurrences of DELIMITER; empty
    pieces are kept (and end up typed NEITHER) so the numbering mirrors the
    source text.
    """

    rows = []
    for number, paragraph in enumerate(text.split(DELIMITER), start=1):

        # format data
        label = '-'.join([title, FORMAT.format(number)])
        length = len(paragraph.split())

        # very short paragraphs are pre-labeled so they need no manual review
        kind = NEITHER if length <= MINIMUM else TYPE

        # update
        rows.append([label, length, CLASSIFICATION, kind, paragraph])

    return rows


def files2rows(directory=CONSTITUTIONS, pattern=PATTERN):
    """Return the rows for every file in *directory* whose name matches *pattern*.

    Files are processed in sorted path order because glob() makes no promise
    about ordering, and the resulting sheet should be reproducible. Each
    file's stem (its name without the extension) is used as the title part
    of the row labels.
    """

    rows = []
    for constitution in sorted(Path(directory).glob(pattern)):

        # read with an explicit encoding so results do not depend on the locale
        with open(constitution, encoding=ENCODING) as handle:
            text = handle.read()

        rows.extend(text2rows(constitution.stem, text))

    return rows


def main():
    """Build the sheet from the configured directory and print it as CSV."""

    sheet = DataFrame(files2rows(), columns=COLUMNS)

    # print() adds one more newline after the CSV text; kept so the output
    # stays identical to what earlier runs of this script produced
    print(sheet.to_csv(index=False))


# create CSV, output, and done
if __name__ == '__main__':
    main()