#!/usr/bin/python
#
# parse_feed.py
#
# Script that parses out individual posts from feeds and saves everything as
# simple XML files (one post per file).
# We used to have a Python script that parsed out the posts, blog details etc.
# and put things in the database straight away.
# We changed things because:
#
# * this way we don't have to store the content of every post in the database,
#   but we can still search or retrieve them when necessary
# * it keeps configuration simple (we only need to tell Perl and PHP the
#   database login details, for example)
# * it reduces the amount of Python code - the rest of the pipeline is all in
#   Perl.
# * shorter scripts are easier to debug and maintain
#
# Some info about writing XML with Python:
# http://www.xml.com/pub/a/2002/11/13/py-xml.html
#
# This script can take a list of feeds (on the command line) as arguments: in
# that case, it only parses the feeds you supply.
# Useful for debugging and incrementally updating the list of posts...
#
# The code sticks to syntax shared by Python 2.6+ and Python 3.

import sys
import os
import feedparser
import glob
import hashlib
import string
from xml.sax import saxutils

try:
    # Legacy module (gone in Python 3); hashing below uses hashlib instead.
    # The import is only kept in case something else still refers to it.
    import md5
except ImportError:
    md5 = None

ENCODING = 'ascii'
ENCODING_ERRORS = 'ignore'

FEED_DIR = "feeds/"
POSTS_DIR = "posts/"

DEBUG = 1
DO_POSTS = 1  # do posts or just refresh feed details?

# Placeholder values handed back by the lookup helpers for absent fields.
_NO_SUCH_KEY = "no such key"
_NO_SUCH_ATTR = "no such attr"


def _to_ascii(value):
    """Return *value* as a native string with non-ASCII characters dropped.

    Uses ENCODING / ENCODING_ERRORS. On Python 2 ``encode`` already yields a
    native (byte) string; on Python 3 the bytes are decoded back to ``str`` so
    the result can be concatenated and written to a text-mode file.
    """
    encoded = value.encode(ENCODING, ENCODING_ERRORS)
    if isinstance(encoded, str):
        return encoded
    return encoded.decode(ENCODING)


def test_and_print(entry, key):
    """Get an attribute from a post.

    Returns ``entry[key]`` when the key is present, otherwise the string
    "no such key". If reading the value raises, a message is printed and the
    script exits.
    """
    if key not in entry:
        return _NO_SUCH_KEY
    try:
        return entry[key]
    except UnicodeEncodeError:
        print("a unicode error happened")
        sys.exit()
    except Exception:
        print("Some horrible error happened.")
        sys.exit()


def get_feed_attr(feed, attr):
    """Get an attribute from the feed itself.

    Returns ``feed.get(attr)`` when the attribute is present, otherwise the
    string "no such attr". If the lookup raises, a message is printed and the
    script exits.
    """
    try:
        if attr in feed:
            return feed.get(attr)
        return _NO_SUCH_ATTR
    except UnicodeEncodeError:
        print("a unicode error happened")
        sys.exit()
    except Exception:
        print("Some horrible error happened.")
        sys.exit()


def md5_hash(input):
    """Return the hexadecimal MD5 digest of *input*.

    Byte strings are hashed as-is; text is encoded as UTF-8 first (identical
    to the raw bytes for the ASCII-only values this script passes in).
    """
    if not isinstance(input, bytes):
        input = input.encode('utf-8')
    return hashlib.md5(input).hexdigest()


def _longest_content(content):
    """Return the ASCII text of the longest item in a post's content list."""
    return max((_to_ascii(item.value) for item in content), key=len)


def _write_feed_info(feed_filename, feed_id, title, link):
    """Write the feed-level details (title, feed id, link) to *feed_filename*."""
    # NOTE(review): the literals below contain only whitespace around the
    # escaped values -- no XML declaration or element tags. That looks like
    # markup was lost from this copy of the script; confirm against the
    # version the rest of the pipeline expects before changing them.
    lines = [
        '\n',
        "\n",
        "\t" + saxutils.escape(title) + "\n",
        "\t" + saxutils.escape(_to_ascii(feed_id)) + "\n",
        "\t" + saxutils.escape(link) + "\n",
        "\t\n",
        "",
    ]
    with open(feed_filename, 'w') as feed_info:
        feed_info.writelines(lines)


def _save_post(entry, feed_id, post_dir):
    """Write one post to ``post_dir + "post_" + md5_hash(link)``.

    Any exception is left for the caller to handle. The output lines are
    built before the file is opened, so a failure while reading the entry
    does not leave a truncated file behind.
    """
    title = _to_ascii(test_and_print(entry, 'title'))
    link = _to_ascii(test_and_print(entry, 'link'))
    description = _to_ascii(test_and_print(entry, 'description'))

    # Fall back to 'published' when the entry has no 'date'.
    date = _to_ascii(test_and_print(entry, 'date'))
    if date == _NO_SUCH_KEY:
        date = _to_ascii(test_and_print(entry, 'published'))

    tags = entry.categories if "categories" in entry else ()

    content = test_and_print(entry, 'content')
    if content != _NO_SUCH_KEY and content:
        # This, uh, may or may not work. We're basically assuming that the
        # longest piece of content associated with a post is the actual post
        # body.
        body = _longest_content(content)
        if len(body) > len(description) + 1:
            description = body
    # NOTE(review): `description` is computed here but nothing below writes
    # it out -- presumably the line that did so lost its markup (see the note
    # on the string literals); confirm against the deployed script.

    if DEBUG:
        print(title)

    lines = [
        '\n',
        "\n",
        "\t" + saxutils.escape(_to_ascii(feed_id)) + "\n",
        "\t" + saxutils.escape(title) + "\n",
        "\t" + saxutils.escape(link) + "\n",
        "\t" + saxutils.escape(date) + "\n",
    ]
    for tag in tags:
        # Each tag is indexed as a sequence; element 1 is what gets written.
        lines.append("\t" + saxutils.escape(_to_ascii(tag[1])) + "\n")
    lines.append("\t\n")
    lines.append("\n")

    # The filename for each post is a hash of the post URL.
    post_filename = post_dir + "post_" + md5_hash(link)
    with open(post_filename, 'w') as post_info:
        post_info.writelines(lines)


def parse_file(file):
    """Parse one feed file and write its details and posts under POSTS_DIR.

    Writes ``POSTS_DIR + "feed_info_" + feed_id`` with the feed details and,
    unless DO_POSTS is 0, one file per post inside ``POSTS_DIR + feed_id + "/"``.
    A post that raises while being processed is reported and skipped.
    """
    if DEBUG:
        print("\n")
        print(file)

    # The feed_id is the last 32 chars of the file (feed_ids are a constant
    # length: they're an md5 hash).
    feed_id = file[-32:]

    try:
        feed = feedparser.parse(file)
    except UnicodeDecodeError:
        print("A unicode error happened in feedparser.py")
        return

    # First get feed information.
    feed_details = feed['feed']
    title = _to_ascii(get_feed_attr(feed_details, 'title'))
    link = _to_ascii(get_feed_attr(feed_details, 'link'))
    description = _to_ascii(get_feed_attr(feed_details, 'description'))
    summary = _to_ascii(get_feed_attr(feed_details, 'summary'))
    if not description:
        description = summary

    if DEBUG:
        print(title + " " + link)
        print(description)

    # Write this information to disk. The filename is built from the feed_id.
    feed_filename = POSTS_DIR + "feed_info_" + feed_id
    this_feed_dir = feed_id + "/"
    post_dir = POSTS_DIR + this_feed_dir

    # Create the feed directory (and POSTS_DIR) if it doesn't already exist.
    if not os.path.isdir(post_dir):
        os.makedirs(post_dir)

    _write_feed_info(feed_filename, feed_id, title, link)

    if not DO_POSTS:
        return

    # Now get all the posts.
    for entry in feed.entries:
        try:
            _save_post(entry, feed_id, post_dir)
        except Exception as details:
            print("An error occurred when processing a post.")
            print(details)


# *** EXECUTION STARTS HERE ***

if len(sys.argv) > 1:
    files = sys.argv[1:]
else:
    files = glob.glob(FEED_DIR + "*")

for feed_file in files:
    # Check isfile first so a missing path is skipped instead of raising.
    if os.path.isfile(feed_file) and os.path.getsize(feed_file) > 0:
        parse_file(feed_file)
    elif DEBUG:
        print("Skipping " + feed_file
              + ", which is not a regular file or has size 0\n")