#!/usr/bin/env python

# ner-find-and-disambiguate-values.py - output value/sentence combinations

# Eric Lease Morgan
# (c) University of Notre Dame; distributed under a GNU Public License

# September 27, 2024 - first documentation; created a while ago but alas
# October    1, 2024 - moving towards disambiguation
# October    7, 2024 - specified different sets of values and size


# configure
CONSTITUTIONS = './etc/values-constitutions.json'
ETHERIUM      = './etc/values-etherium.json'
SIZE          = 14
SENTENCES     = './etc/sentences.txt'
POS           = 'n'

# require
from json import loads
from sys  import stderr


def read_values( path, size ) :
	"""Return the first `size` items of the JSON object stored in the file at `path`.

	The items are returned as a dictionary, in the order they appear in the file.
	"""

	with open( path ) as handle : items = loads( handle.read() )
	return dict( list( items.items() )[ 0:size ] )


def synset_names( values ) :
	"""Return a dictionary mapping each value to the list of its synset names.

	`values` maps a value (a word) to a list of synset entries, and the name
	of an entry is taken to be its first element -- the first item of a list
	or the first key of a dictionary. NOTE(review): the exact shape of the
	entries is defined by the JSON files, not here; confirm against them.

	Values with no synset entries are left out, because there is nothing to
	compare a disambiguated sense against.
	"""

	names = {}
	for value, synsets in values.items() :

		# short-circuit
		if len( synsets ) == 0 : continue

		# get just the synset names, and kinda awkward
		names[ value ] = [ list( synset )[ 0 ] for synset in synsets ]

	return names


def select( tokens, candidates, disambiguate, pos=POS ) :
	"""Return the values in `tokens` that are used in one of their candidate senses.

	tokens       -- the list of tokens making up one sentence
	candidates   -- dictionary of value => list of acceptable synset names
	disambiguate -- function called as disambiguate( tokens, value, pos=pos );
	                it returns an object with a name() method, or None when
	                no sense can be determined; nltk.wsd.lesk is the intended
	                function
	pos          -- part-of-speech passed along to `disambiguate`

	The result is a list of values, in the order they appear in `candidates`.
	"""

	# initialize; a set makes each membership test cheap
	vocabulary = set( tokens )
	found      = []

	# process each value
	for value, names in candidates.items() :

		# look for value in tokens
		if value not in vocabulary : continue

		# get the associated synset; there may not be one, so skip instead of crash
		synset = disambiguate( tokens, value, pos=pos )
		if synset is None : continue

		# conditionally update
		if synset.name() in names : found.append( value )

	return found


def main() :
	"""Output a tab-delimited value/sentence line for each sentence using a value in a configured sense."""

	# require; loaded here, not above, so the functions above can be used without NLTK
	from nltk.tokenize import word_tokenize
	from nltk.wsd      import lesk

	# initialize; read constitution values, and update them with the etherium values
	values = read_values( CONSTITUTIONS, SIZE )
	values.update( read_values( ETHERIUM, SIZE ) )

	# get the synset names once; they do not change from sentence to sentence
	candidates = synset_names( values )

	# get and process each sentence
	with open( SENTENCES ) as handle : sentences = handle.read().splitlines()
	total = len( sentences )
	for i, sentence in enumerate( sentences ) :

		# debug
		stderr.write( 'processing sentence #' + str( i + 1 ) + ' of ' + str( total ) + '\r' )

		# re-initialize
		tokens = word_tokenize( sentence )

		# output
		for value in select( tokens, candidates, lesk ) : print( '\t'.join( [ value, sentence ] ) )


# do the work, but only when run as a script; done
if __name__ == '__main__' : main()