# Creates per year files with word frequency lists for a WordPress # blog, appropriate for input to http://wordle.net/advanced # See http://gondwanaland.com/mlog/2009/01/03/5-wordle-years/ # This code is placed in the public domain. import MySQLdb import re import sys host = '' user = '' db = '' password = '' poststable = '' def strip_tags(value): "Return the given HTML with all tags stripped." return re.sub(r'<[^>]*?>', '', value) def strip_punctuation(value): return re.sub(r"[^\w']", ' ', value) def countwords(txt): words = txt.split() counts = {} for word in words: if counts.has_key(word): counts[word] = counts[word] + 1 else: counts[word] = 1 return counts def wordle_advanced(counts): buf = "" for word in counts: # Wordle applet seems to crash on large input, so remove # single use words that won't appear anyway if counts[word] > 1: buf += word + ":" + str(counts[word]) + "\n" return buf try: conn = MySQLdb.connect (host, user, password, db) except MySQLdb.Error, e: print "Error %d: %s" % (e.args[0], e.args[1]) sys.exit (1) cursor = conn.cursor () cursor.execute ("SELECT distinct year(post_date) from " + poststable + " WHERE post_type = 'post' AND post_status = 'publish'") years = [] while (1): row = cursor.fetchone() if row == None: break years.append(row[0]) for year in years: yeardump = '' cursor.execute ("SELECT post_title, post_content, post_date FROM " + poststable + " WHERE post_type = 'post' AND post_status = 'publish' AND year(post_date) = %s ORDER BY post_date ASC", year) while (1): row = cursor.fetchone() if row == None: break yeardump += "\n" + row[0] + "\n" + strip_tags(row[1]) print str(year) + " " + str(row[2]) yeardump = strip_punctuation(yeardump) wordcounts = countwords(yeardump) out = wordle_advanced(wordcounts) out_file = open("wp-year-dump-"+str(year)+".txt", "w") out_file.write(out) out_file.close()