#!/usr/bin/env python

# json2spacy.py - given hard-coded values, convert a specifically shaped json file to spaCy docbin

# Eric Lease Morgan <emorgan@nd.edu>
# (c) University of Notre Dame; distributed under a GNU Public License

# February 21, 2024 - first cut; see https://github.com/explosion/projects/tree/v3/pipelines/ner_demo


# configure
LANGUAGE = 'en'

# require
from pathlib      import Path
from sys          import stderr
from typing       import Union
from warnings     import warn

from spacy        import blank
from spacy.tokens import DocBin
from spacy.util   import filter_spans
from srsly        import read_json

# convert a specifically-shaped json file to serialized spaCy docbin file
def convert( lang: str, input_path: Union[ str, Path ], output_path: Union[ str, Path ] ) -> None :
    """Convert a JSON file of annotated texts into a serialized spaCy DocBin.

    The input is read as a sequence of ``[text, annotation]`` pairs, where
    ``annotation['entities']`` is a sequence of ``[start, end, label]``
    triples; ``start`` and ``end`` are character offsets into ``text``.

    An entity is skipped, with a warning written to standard error, when its
    character span does not align with token boundaries or when it overlaps
    another entity of the same text (the longest span of an overlapping set
    is the one kept).

    Args:
        lang: language code used to create the blank spaCy pipeline
        input_path: location of the JSON file to read
        output_path: location of the DocBin file to write
    """

    # initialize
    nlp = blank( lang )
    db  = DocBin()

    # process each ( text, annotation ) pair in the given json file
    for text, annotation in read_json( input_path ) :

        # re-initialize
        doc  = nlp.make_doc( text )
        ents = []

        # process each annotation; create a list of entities
        for start, end, label in annotation[ 'entities' ] :

            # map the character offsets to a token span; None denotes misalignment
            span = doc.char_span( start, end, label=label )

            # sanity check
            if span is None :
                stderr.write( f"Warning: Skipping entity [{start}, {end}, {label}] because the span '{doc.text[start:end]}' does not align with token boundaries: {repr(text)}\n\n" )
                continue

            # update the list of entities
            ents.append( span )

        # assigning overlapping spans to doc.ents raises a ValueError, which would
        # abort the whole conversion; drop the overlaps instead, just as misaligned
        # spans are dropped above
        kept     = filter_spans( ents )
        kept_ids = { id( span ) for span in kept }
        for span in ents :
            if id( span ) not in kept_ids :
                stderr.write( f"Warning: Skipping entity [{span.start_char}, {span.end_char}, {span.label_}] because the span '{span.text}' overlaps another entity: {repr(text)}\n\n" )

        # update
        doc.ents = kept
        db.add( doc )

    # output
    db.to_disk( output_path )

# using brute force, do the work; each call is ( language, json input, docbin output )
convert( LANGUAGE, './etc/testing/testing-data.json',   './etc/testing/testing-data.spacy' )
convert( LANGUAGE, './etc/training/training-data.json', './etc/training/training-data.spacy' )

# done; the exit() helper is injected by the site module for interactive use and
# is not guaranteed to exist (e.g. python -S), so raise SystemExit directly
raise SystemExit( 0 )