#!/usr/bin/env python

# categories2edgess.py - given a CSV file, output an edges file for network analysis

# Eric Lease Morgan <emorgan@nd.edu>
# (c) University of Notre Dame; distributed under a GNU Public License

# December 17, 2024 - first investigations
# April     3, 2025 - modified for constitutions


# configure
# CSV is the path of the input file; the script reads its 'author',
# 'category', and 'type' columns
CSV     = './etc/articles.csv'
# COLUMNS are the headings given to the resulting edges table
COLUMNS = [ 'source', 'target', 'weight' ]

# require
from pandas     import read_csv, DataFrame
from re         import sub
from sys        import stdout, argv, exit

# get input; the single required argument is the category value used to select rows
if len( argv ) != 2 : exit( "Usage: " + argv[ 0 ] + " <custom|international>" )
category_filter = argv[ 1 ]

# initialize; the CSV is expected to have 'author', 'category', and 'type' columns
paragraphs = read_csv( CSV )

# process each paragraph in the given CSV file; create a list of edges
edges = []
for _, paragraph in paragraphs.iterrows() :

	# parse
	country   = paragraph[ 'author' ]
	category  = paragraph[ 'category' ]
	edge_type = paragraph[ 'type' ]

	# filter; keep only the rows whose category matches the one requested
	if category != category_filter : continue

	# normalize; remove digits, convert underscores to spaces, and drop a
	# single trailing space; raw strings keep regex escapes such as \d from
	# being treated as (invalid) string escapes
	country = sub( r'\d', '', country )
	country = country.replace( '_', ' ' )
	country = sub( r' $', '', country )

	# update; every edge links a country to a type with a weight of 1
	edges.append( [ country, edge_type, 1 ] )

# output and done; to_csv already ends its result with a line terminator, so
# write it as-is instead of print-ing it, which appended a blank line
edges = DataFrame( edges, columns=COLUMNS )
stdout.write( edges.to_csv( sep='\t', index=False ) )
exit()