#!/usr/bin/env perl

# xml2txt.pl - extract the English-language <content> elements from each XML
#              file in ./corpus/xml and save them as plain text in ./corpus/txt

# Eric Lease Morgan
# (c) University of Notre Dame; distributed under a GNU Public License

# October 16, 2023 - first cut


# configure
use constant TXT       => './corpus/txt';
use constant EXTENSION => '.txt';

# require
use strict;
use warnings;
use File::Basename;
use HTML::Entities;
use XML::XPath;

# all but hard-coded initialization
my @files = <./corpus/xml/*.xml>;

# loop through each file in the given corpus
foreach my $file ( @files ) {

	# re-initialize and debug; only the base name (sans .xml) is needed
	my ( $name ) = fileparse( $file, '.xml' );
	warn( "$name\n" );
	my $parser = XML::XPath->new( filename => $file );

	# get and process all contents; entities left in the element text are
	# decoded so the output is plain, human-readable text
	my $contents = $parser->find( '//content[@xml:lang="en"]' );
	my @content  = map { decode_entities( $_->string_value ) } $contents->get_nodelist;

	# output; one file per input, content elements delimited by a blank line.
	# three-argument open keeps the file name from being parsed as a mode, and
	# the explicit UTF-8 layer is needed because decoded entities may yield
	# characters outside Latin-1 ("Wide character in print" otherwise)
	my $txt = TXT . "/$name" . EXTENSION;
	open( my $handle, '>:encoding(UTF-8)', $txt ) or die "Can't open $txt ($!)\n";
	print( {$handle} join( "\n\n", @content ) );

	# buffered write errors (full disk, etc.) only surface at close
	close( $handle ) or die "Can't close $txt ($!)\n";

}

# done
exit;