#!/usr/bin/env python

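# corpus2metadata.py - list the author, title, and file name of each HTML file in a corpus and save the list as a CSV file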

# configuration

# directory that is searched for documents and that receives the CSV file
CORPUS = "./corpus"

# glob pattern selecting the files in CORPUS to describe
PATTERN = "*.html"

# header row of the CSV; one value per column is collected for each file
COLUMNS = ["author", "title", "file"]

# name of the CSV file written inside CORPUS
METADATA = "metadata.csv"

from pathlib import Path
import pandas

corpus = Path( CORPUS )

# Build one [ author, title, file ] row for every file in the corpus that
# matches PATTERN. glob() yields entries in an arbitrary, filesystem-dependent
# order, so the paths are sorted to make the resulting CSV reproducible.
metadata = []
for path in sorted( corpus.glob( PATTERN ) ) :

    # the title is the file name without its final extension
    title = path.stem

    # the author is the text before the first hyphen; it is taken from the
    # stem so a name without a hyphen does not drag the extension along
    author = title.split( '-' )[ 0 ]

    # the bare file name, extension included
    name = path.name

    # echo the values so progress is visible on standard output
    print( author )
    print( title )
    print( name )
    print()

    metadata.append( [ author, title, name ] )

# Convert to a data frame and let pandas write the file itself. Rendering the
# CSV to a string and pushing it through a text-mode handle ties the encoding
# to the locale and can double the carriage returns on Windows.
metadata = pandas.DataFrame( metadata, columns=COLUMNS )
metadata.to_csv( corpus/METADATA, index=False, encoding='utf-8' )

# Done. exit() is a convenience installed by the site module for interactive
# sessions; raising SystemExit ends the script without relying on it.
raise SystemExit