#!/usr/bin/env python

# meta2csv.py - given a set of meta.xml files, output CSV for the Reader;
# along the way, copy each article's HTML rendition into a local cache.

# Eric Lease Morgan
# (c) Infomotions, LLC; distributed under a GNU Public License

# October 8, 2025 - first cut; while in Paris
# October 9, 2025 - close to finished


# configure
CACHE    = 'cache'
FILES    = './etc/dlib.txt'
METADATA = './www.dlib.org/cache/metadata.csv'
COLUMNS  = [ 'author', 'title', 'date', 'doi', 'url', 'file' ]

# require
from os.path import basename
from pandas import DataFrame
from pathlib import Path
from re import sub
from shutil import copyfile
from sys import exit, stderr
from xml.etree import ElementTree as parser
import string

# read and process each of the given file names; create a set of metadata
metadata = []
with open( FILES ) as handle : files = handle.read().splitlines()
for index, filename in enumerate( files ) :

	# debug
	stderr.write( ' item: ' + str( index ) + '\n' )
	stderr.write( ' filename: ' + filename + '\n' )

	# parse author and date from the path, but not all articles are created
	# equal; a path that does not split into exactly five parts is skipped
	try : [ root, archive, date, author, meta ] = filename.split( '/' )
	except ValueError : continue

	# truncate the date; paths carry values like "2025_01" and we want the year
	year = sub( r'_\d\d', '', date )

	# get the title and identifiers; the XML files are not all well-formed,
	# and some lack a title or a second identifier, so skip those too
	try :

		tree  = parser.parse( filename )
		xml   = tree.getroot()

		# normalize whitespace in the title
		title = xml.findall( "./title" )[ 0 ].text
		title = title.replace( '\n', ' ' )
		title = sub( r' +', ' ', title )

		# by convention, the first identifier is the DOI and the second the URL
		identifiers = xml.findall( './identifier' )
		doi         = identifiers[ 0 ].text
		url         = identifiers[ 1 ].text

	except ( parser.ParseError, IndexError, AttributeError ) : continue

	# create short title from the first two words of the title, sans
	# punctuation; not very clean, and one-word titles are skipped
	try :

		firstWord  = title.split()[ 0 ].lower()
		firstWord  = firstWord.translate( firstWord.maketrans( '', '', string.punctuation ) )
		secondWord = title.split()[ 1 ].lower()
		secondWord = secondWord.translate( secondWord.maketrans( '', '', string.punctuation ) )
		shortTitle = firstWord + '_' + secondWord

	except IndexError : continue

	# create paths; the source is the article's HTML sibling of meta.xml, and
	# the destination is a flat, descriptively-named file in the cache
	root        = Path( root )
	source      = meta.replace( 'meta.xml', 'html' )
	source      = root/archive/date/author/source
	destination = root/CACHE/( author + '-' + shortTitle + '-' + year + '.html' )
	cachedFile  = basename( destination )

	# cache; a missing or unreadable source means the record is skipped
	try : copyfile( './' + str( source ), './' + str( destination ) )
	except OSError : continue

	# debug
	stderr.write( ' author: ' + str( author ) + '\n' )
	stderr.write( ' title: ' + title + '\n' )
	stderr.write( ' date: ' + str( year ) + '\n' )
	stderr.write( ' doi: ' + str( doi ) + '\n' )
	stderr.write( ' url: ' + str( url ) + '\n' )
	stderr.write( ' source: ' + str( source ) + '\n' )
	stderr.write( ' destination: ' + str( destination ) + '\n' )
	stderr.write( ' file: ' + str( cachedFile ) + '\n' )
	stderr.write( '\n' )

	# update
	metadata.append( [ author, title, year, doi, url, cachedFile ] )

# create data frame, output to csv, and done
metadata = DataFrame( metadata, columns=COLUMNS )
with open( METADATA, 'w' ) as handle : handle.write( metadata.to_csv( index=False ) )
exit()