#!/usr/bin/env python

# ner-find-and-disambiguate-values.py - output value/sentence combinations

# Eric Lease Morgan <emorgan@nd.edu>
# (c) University of Notre Dame; distributed under a GNU Public License

# September 27, 2024 - first documentation; created a while ago but alas
# October    1, 2024 - moving towards disambiguation
# October    7, 2024 - specified different sets of values and size


# configuration; paths are relative to the current working directory
CONSTITUTIONS = "./etc/values-constitutions.json"  # JSON file of values and their synsets
ETHERIUM = "./etc/values-etherium.json"  # second JSON file, merged over the first
SIZE = 14  # number of leading entries taken from each JSON file
SENTENCES = "./etc/sentences.txt"  # plain text file; one sentence per line
POS = "n"  # part-of-speech passed to lesk; presumably WordNet's noun tag -- confirm

# require
from json          import loads
from nltk.tokenize import word_tokenize
from nltk.wsd      import lesk
from sys           import stderr

# initialize; read constitution values, keeping only the first SIZE of them...
with open( CONSTITUTIONS ) as handle : constitutions = loads( handle.read() )
values = dict( list( constitutions.items() )[ 0:SIZE ] )

# ...and update them with the first SIZE etherium values; a value found in
# both files ends up with its etherium synsets
with open( ETHERIUM ) as handle : etherium = loads( handle.read() )
values.update( dict( list( etherium.items() )[ 0:SIZE ] ) )

# get just the synset names, and kinda awkward; this is done once, up front,
# because the result is the same for every sentence; values without any
# synsets can never match, so they are dropped here
names = {}
for value, synsets in values.items() :
	synsets = [ list( synset )[ 0 ] for synset in synsets ]
	if synsets : names[ value ] = synsets

# get and process each sentence
with open( SENTENCES ) as handle : sentences = handle.read().splitlines()
total = len( sentences )
for i, sentence in enumerate( sentences ) :

	# debug; report progress once per sentence, overwriting the same line
	stderr.write( 'processing sentence #' + str( i + 1 ) + ' of ' + str( total ) + '\r' )

	# re-initialize
	tokens = word_tokenize( sentence )

	# process each value
	for value, synsets in names.items() :

		# look for value in tokens; the match is exact and case-sensitive
		if value not in tokens : continue

		# get the associated synset; lesk returns None when it finds no
		# synsets for the value with the given part-of-speech, so skip
		# the value instead of crashing on None.name()
		synset = lesk( tokens, value, pos=POS )
		if synset is None : continue

		# conditionally output; only when the disambiguated sense is one
		# of the senses configured for the value
		if synset.name() in synsets : print( '\t'.join( [ value, sentence ] ) )

# done
exit()