#!/usr/bin/env python

# svo_characters2edges.py - given a few configurations, output a graph of the characters found in subject-verb-object (SVO) sentences

# Eric Lease Morgan <eric_morgan@infomotions.com>
# (c) Infomotions, LLC; distributed under a GNU Public License

# August 24, 2025 - just after coming back from church
# August 29, 2025 - in the coffee shop


# configure; denote the subjects of interest, the file to read, and the
# grammar used to find subject-verb-object (svo) patterns

# only svos whose first noun phrase is exactly one of these (lower-cased)
# strings are added to the graph
# NOTE(review): NOUNPHRASE below only matches NN-family tags; confirm the
# tagger labels the pronouns listed here that way, otherwise they never match
CHARACTERS = [
    'i', 'you', 'they', 'we', 'she', 'he',
    'emma', 'elton', 'jane', 'harriet', 'frank', 'perry',
    'isabella', 'john', 'martin', 'robert', 'james', 'henry',
    'selina', 'elizabeth', 'george', 'william', 'hannah', 'hetty'
]

# the plain text file to process
FILE = './etc/carrel.txt'

# an NLTK chunking grammar, applied stage by stage:
#   NOUNPHRASE - an optional determiner, any number of adjectives, and one or more nouns
#   PREDICATE  - at most one verb, of any form
#   GRAMMAR    - a noun phrase, followed by a predicate, followed by a noun phrase
SVO = '''
  NOUNPHRASE: {<DT>?<JJ.*>*<NN.?>+}
   PREDICATE: {<VB.*>?}
     GRAMMAR: {<NOUNPHRASE><PREDICATE><NOUNPHRASE>}
'''

# require
from nltk     import RegexpParser, word_tokenize, pos_tag, sent_tokenize
from re       import sub
from networkx import DiGraph, write_gml
from sys      import stdout

# slurp up the configured file
with open( FILE ) as handle :
    text = handle.read()

# split the text into sentences, and somewhat normalize each one; the order
# of the steps matters because collapsing spaces must follow unwrapping
sentences = []
for sentence in sent_tokenize( text ) :

    # unwrap hard line breaks
    sentence = sentence.replace( '\n', ' ' )

    # collapse runs of spaces into a single space
    sentence = sub( ' +', ' ', sentence )

    # drop a single leading space, if there is one
    sentence = sub( '^ ', '', sentence )

    # update
    sentences.append( sentence )

# process all sentences; initialize a parser and a graph, and then add the
# svo of every matching branch to the graph
parser     = RegexpParser( SVO )
graph      = DiGraph()
characters = frozenset( CHARACTERS )
for sentence in sentences :

    # normalize and tokenize the sentence; skip it if there is nothing to parse
    tokens = word_tokenize( sentence.lower() )
    if not tokens : continue

    # get parts-of-speech and create an NLTK tree
    pos  = pos_tag( tokens )
    tree = parser.parse( pos )

    # process each branch; look for our particular grammar
    for branch in tree.subtrees( lambda t : t.label() == 'GRAMMAR' ) :

        # re-initialize for each and every branch; previously this was done
        # once per sentence, so the second and later branches of a sentence
        # were appended after the first one and never examined
        svo = []

        # process each limb; by definition of the grammar, a branch is a
        # noun phrase, a predicate, and a noun phrase
        for limb in [ branch[ 0 ], branch[ 1 ], branch[ 2 ] ] :

            # each leaf is a (word, tag) pair; keep only the words, and update
            words = [ leaf[ 0 ] for leaf in limb.leaves() ]
            svo.append( ' '.join( words ) )

        # check for a given person; ignore svos about anything else
        subject, verb, target = svo
        if subject not in characters : continue

        # update; link the character to the verb, and the verb to the object
        # NOTE(review): add_node updates the attributes of an existing node,
        # so a string seen in more than one role keeps only the type assigned
        # last - confirm this is acceptable
        graph.add_node( subject, type='character' )
        graph.add_node( verb, type='verb' )
        graph.add_node( target, type='object' )
        graph.add_edge( subject, verb )
        graph.add_edge( verb, target )

# output and done
write_gml( graph, stdout.buffer )
exit()

