#!/usr/bin/env python

# corpus2metadata.py
"""Build a metadata CSV (author, title, file) from the file names of a corpus.

Every file in CORPUS matching PATTERN contributes one row:

* author - the part of the file's stem before the first '-'
* title  - the file's stem (name without its final extension)
* file   - the file's name

The rows are echoed to standard output and saved as CORPUS/METADATA.
"""

from pathlib import Path

import pandas

# configure
CORPUS = './corpus'
PATTERN = '*.html'
COLUMNS = ['author', 'title', 'file']
METADATA = 'metadata.csv'


def describe(path: Path) -> list:
    """Return the ``[author, title, file]`` row for a single corpus file.

    The author is taken from the stem rather than the full name, so a file
    whose name contains no '-' does not end up with its extension in the
    author column.
    """
    title = path.stem
    author = title.split('-')[0]
    return [author, title, path.name]


def collect_metadata(corpus, pattern: str = PATTERN) -> list:
    """Return one metadata row per file in *corpus* matching *pattern*.

    *corpus* may be a string or a ``Path``. Matches are sorted because
    ``Path.glob`` yields them in an arbitrary, filesystem-dependent order,
    and the resulting CSV should be reproducible between runs.
    """
    return [describe(path) for path in sorted(Path(corpus).glob(pattern))]


def main() -> None:
    """Echo the metadata of every corpus file and save it as a CSV file."""
    corpus = Path(CORPUS)

    # process each file in the given corpus; create a list of metadata values
    rows = collect_metadata(corpus, PATTERN)
    for author, title, name in rows:
        print(author)
        print(title)
        print(name)
        print()

    # convert to CSV, output, and done; pandas writes to the path itself so
    # that it controls newline handling and encoding, instead of pushing a
    # pre-rendered string through a text-mode handle
    frame = pandas.DataFrame(rows, columns=COLUMNS)
    frame.to_csv(corpus / METADATA, index=False)


if __name__ == '__main__':
    main()