#!/usr/bin/env python

# feeds2csv.py - given a list of rss feeds, output content for the Reader
# Eric Lease Morgan
# (c) Infomotions, LLC; distributed under a GNU Public License
# October 11, 2025 - first investigation; still in Paris
# October 16, 2025 - added dates; while still in Paris and not very clean metadata

# configure
FEEDS = './etc/code4lib-feeds.txt'
ARTICLES = './journal.code4lib.org/articles/'
CACHE = './journal.code4lib.org/cache'
COMMENTSON = ' Comments on: '
METADATA = './journal.code4lib.org/cache/metadata.csv'
COLUMNS = [ 'title', 'date', 'url', 'file' ]

# require
from sys import argv, exit, stderr
from xml.etree import ElementTree as parser
from shutil import copyfile
from pathlib import Path
from pandas import DataFrame


def normalize_title( raw ) :
	'''Clean a raw channel title: drop newlines and tabs and remove the
	WordPress comment-feed prefix (' Comments on: '). Replacement order
	matters: newlines are removed before the prefix match is attempted.'''

	title = raw.replace( '\n', '' )
	title = title.replace( COMMENTSON, '' )
	title = title.replace( '\t', '' )
	return title


def extract_year( last_build_date ) :
	'''Return the year token from an RFC-822 date string; for example,
	'Tue, 14 Oct 2025 12:00:00 +0000' yields '2025'. Raises IndexError
	if the date has fewer than four whitespace-separated tokens.'''

	return last_build_date.split()[ 3 ]


def parse_feed( feed ) :
	'''Parse a single RSS feed file (a path on disk) and return a tuple of
	( title, date, url, identifier ) where identifier is the last path
	component of the channel link.'''

	xml = parser.parse( feed ).getroot()

	title = normalize_title( xml.find( 'channel/title' ).text )
	date = extract_year( xml.find( 'channel/lastBuildDate' ).text )

	# the trailing slash makes the identifier the second-to-last token
	url = xml.find( 'channel/link' ).text + '/'
	identifier = url.split( '/' )[ -2 ]

	return ( title, date, url, identifier )


def main() :
	'''Read the list of feeds, harvest metadata from each, cache the
	corresponding article file, and output a CSV of metadata.'''

	# open and process each of the given files; create a set of metadata
	metadata = []
	with open( FEEDS ) as handle : feeds = handle.read().splitlines()
	for index, feed in enumerate( feeds ) :

		# debug
		stderr.write( ' item: ' + str( index ) + '\n' )
		stderr.write( ' feed: ' + feed + '\n' )

		# get the title, url, identifier, and source
		title, date, url, identifier = parse_feed( feed )
		source = ARTICLES + identifier + '.html'
		destination = source.replace( 'articles', 'cache' )
		name = Path( destination ).name

		# debug
		stderr.write( ' title: ' + title + '\n' )
		stderr.write( ' date: ' + date + '\n' )
		stderr.write( ' url: ' + url + '\n' )
		stderr.write( ' identifier: ' + identifier + '\n' )
		stderr.write( ' source: ' + source + '\n' )
		stderr.write( ' destination: ' + destination + '\n' )
		stderr.write( ' file: ' + name + '\n' )
		stderr.write( '\n' )

		# cache
		copyfile( source, destination )

		# update
		metadata.append( [ title, date, url, name ] )

	# create dataframe, output CSV, and done
	metadata = DataFrame( metadata, columns=COLUMNS )
	with open( METADATA, 'w' ) as handle : handle.write( metadata.to_csv( index=False ) )
	exit()


# do the work; the guard keeps imports of this module side-effect free
if __name__ == '__main__' : main()