#!/usr/bin/env python

# augment-articles.py - given the documents file created from topic modeling, update articles.csv with topic labels

# Eric Lease Morgan <emorgan@nd.edu>
# (c) University of Notre Dame; distributed under a GNU Public License

# April 3, 2025 - first investigations
# April 4, 2025 - more investigations


# configure
PROVISIONS = './etc/articles.csv'
DOCUMENTS  = './etc/topic-model/documents.txt'
COLUMNS    = [ 'key', 'file', 'proportion' ]

# topic keys, as found in the first field of each documents line, and their human-readable labels
MAP = {
	'0' : 'international norms',
	'1' : 'treaty status',
	'2' : 'multilevel relationships',
	'3' : 'legal conformity',
	'4' : 'human rights',
	'5' : 'human development',
}

# require
import sys
from pandas import read_csv, DataFrame


def read_documents( path ) :

	"""Return the lines of the documents file at path, minus its header line.

	An empty file yields an empty list instead of raising an error.
	"""

	with open( path ) as handle : return handle.read().splitlines()[ 1: ]


def parse_documents( lines ) :

	"""Parse documents lines into a data frame sorted by descending proportion.

	Each line is split on single spaces: field 0 is the topic key, field 2
	is a path whose sixth '/'-delimited component is taken as the file name,
	and field 3 is the proportion. Field 1 is not used. Blank lines are
	skipped.

	NOTE(review): the field and path positions are hard-coded; confirm they
	match the layout of the documents file actually being read.

	The proportions are ordered numerically but kept as their original text,
	so they are written out exactly as they were read. Sorting the raw text
	would rank a value such as '9.5E-5' above '0.95'.

	Raises IndexError when a non-blank line has too few fields or too shallow
	a path, and ValueError when a proportion is not a number.
	"""

	# parse
	records = []
	for line in lines :

		# there is nothing to learn from a blank line
		if not line.strip() : continue

		fields = line.split( ' ' )
		records.append( [ fields[ 0 ], fields[ 2 ].split( '/' )[ 5 ], fields[ 3 ] ] )

	# sort numerically; the sort is stable, so ties keep their file order
	records.sort( key=lambda record : float( record[ 2 ] ), reverse=True )

	# dataframe-ify and done
	return DataFrame( records, columns=COLUMNS )


def label_provisions( provisions, documents, labels=MAP ) :

	"""Add 'topic' and 'proportion' columns to provisions and return it.

	provisions is a data frame with a 'file' column; documents is a data
	frame with the columns named in COLUMNS, already sorted so the most
	significant topic of each file comes first; labels maps topic keys to
	topic names.

	Only the first document seen for each file is used, and only the first
	provisions row naming that file is updated. Rows never matched keep None
	in both new columns. A document whose file is absent from provisions is
	reported on standard error and skipped, which keeps standard output
	clean for the resulting CSV.

	Raises KeyError when a topic key is not found in labels.
	"""

	# initialize; any existing values in the two columns are reset
	provisions[ 'topic' ]      = None
	provisions[ 'proportion' ] = None

	# remember the first row of each file so every lookup is a dictionary hit
	rows = {}
	for row, name in provisions[ 'file' ].items() : rows.setdefault( name, row )

	# process each document
	seen = set()
	for document in documents.itertuples( index=False ) :

		# don't process any file multiple times
		if document.file in seen : continue
		seen.add( document.file )

		# the topic model may know about files the provisions do not
		if document.file not in rows :
			sys.stderr.write( 'Warning: ' + document.file + ' not found in provisions; skipping\n' )
			continue

		# update; do the work
		row = rows[ document.file ]
		provisions.at[ row, 'topic' ]      = labels[ document.key ]
		provisions.at[ row, 'proportion' ] = document.proportion

	# done
	return provisions


def main() :

	"""Read the configured files and print the augmented provisions as CSV."""

	# initialize
	provisions = read_csv( PROVISIONS )
	documents  = parse_documents( read_documents( DOCUMENTS ) )

	# do the work, output, and done
	print( label_provisions( provisions, documents ).to_csv( index=False ) )


# do the work only when run as a script
if __name__ == '__main__' : main()
