#!/usr/bin/env perl

# xml2txt.pl

# Eric Lease Morgan <emorgan@nd.edu>
# (c) University of Notre Dame; distributed under a GNU Public License

# October 16, 2023 - first cut


# configure
use constant TXT       => './corpus/txt';
use constant EXTENSION => '.txt';

# require
use File::Basename;
use HTML::Entities;
use strict;
use XML::XPath;

# all but hard-coded initialization; every *.xml file in the corpus is a candidate
my @files = <./corpus/xml/*.xml>;

# loop through each file in the given corpus; each XML file yields one
# plain-text file named after it in the TXT directory
foreach my $file ( @files ) {
	
	# re-initialize and debug; $name is the file's base name sans the .xml suffix
	my ( $name ) = fileparse( $file, '.xml' );
	warn( "$name\n" );
	my @content = ();
	my $parser  = XML::XPath->new( filename => $file );
	
	# get and process all contents; only content elements marked as English are
	# kept, and any entities remaining in their text are decoded to characters
	my $contents = $parser->find( '//content[@xml:lang="en"]' );
	foreach my $content ( $contents->get_nodelist ) { push( @content, decode_entities( $content->string_value ) ) }
	
	# output; a lexical handle and three-argument open keep the file name from
	# being interpreted as a mode, and the encoding layer ensures decoded
	# (possibly non-ASCII) characters are consistently written as UTF-8
	my $txt = TXT . "/$name" . EXTENSION;
	open( my $handle, '>:encoding(UTF-8)', $txt ) or die "Can't open $txt ($!)\n";
	print {$handle} join( "\n\n", @content ) or die "Can't write to $txt ($!)\n";
	
	# buffered write errors (full disk, etc.) only surface at close, so check it
	close( $handle ) or die "Can't close $txt ($!)\n";
	
}

# done
exit;
