#!/usr/bin/env python

# json2spacy.py - given hard-coded values, convert a specifically shaped json file to spaCy docbin

# Eric Lease Morgan
# (c) University of Notre Dame; distributed under a GNU Public License

# February 21, 2024 - first cut; see https://github.com/explosion/projects/tree/v3/pipelines/ner_demo


# configure
LANGUAGE = 'en'

# require
from pathlib import Path
from spacy import blank
from spacy.tokens import DocBin
from srsly import read_json
from warnings import warn
from sys import stderr


# convert a specifically-shaped json file to serialized spaCy docbin file
def convert(lang: str, input_path: Path, output_path: Path) -> None:
    """Convert an annotated JSON file into a serialized spaCy DocBin file.

    The JSON file is read as a sequence of ``(text, annotation)`` pairs,
    where ``annotation['entities']`` is a sequence of
    ``(start, end, label)`` triples; ``start`` and ``end`` are passed to
    ``Doc.char_span`` as character offsets into ``text``.

    Args:
        lang: Language code handed to ``spacy.blank`` to build the
            tokenizer-only pipeline (e.g. ``'en'``).
        input_path: Location of the JSON file to read.
        output_path: Location where the DocBin is written.

    Entities for which ``Doc.char_span`` returns ``None`` are skipped, and
    a warning describing each skipped entity is written to standard error.
    """

    # initialize
    nlp = blank(lang)
    db = DocBin()

    # process each record in the given json file
    for text, annotation in read_json(input_path):

        # re-initialize
        doc = nlp.make_doc(text)
        ents = []

        # process each annotation; create a list of entities
        for start, end, label in annotation['entities']:

            span = doc.char_span(start, end, label=label)

            # sanity check; a missing span cannot be stored as an entity,
            # so report it and move on
            if span is None:
                stderr.write(
                    f"Warning: Skipping entity [{start}, {end}, {label}] "
                    f"because the span '{doc.text[start:end]}' does not "
                    f"align with token boundaries: {repr(text)}\n\n"
                )
                continue

            # update the list of entities
            ents.append(span)

        # update
        # NOTE(review): spaCy is expected to reject overlapping entity spans
        # at this assignment -- confirm the input never contains them
        doc.ents = ents
        db.add(doc)

    # output
    db.to_disk(output_path)


# using brute force, do the work; guarded so that importing this module
# neither triggers the conversions nor terminates the importing process
if __name__ == '__main__':

    convert(
        LANGUAGE,
        Path('./etc/testing/testing-data.json'),
        Path('./etc/testing/testing-data.spacy'),
    )
    convert(
        LANGUAGE,
        Path('./etc/training/training-data.json'),
        Path('./etc/training/training-data.spacy'),
    )

    # done; the script ends here with a zero exit status, so no explicit
    # exit() call is needed