# Creates per year files with word frequency lists for a WordPress
# blog, appropriate for input to http://wordle.net/advanced

# See http://gondwanaland.com/mlog/2009/01/03/5-wordle-years/

# This code is placed in the public domain.

import MySQLdb
import re
import sys

host = ''
user = ''
db = ''
password = ''
poststable = ''


def strip_tags(value):
    "Return the given HTML with all tags stripped."
    return re.sub(r'<[^>]*?>', '', value) 

def strip_punctuation(value):
    return re.sub(r"[^\w']", ' ', value) 

def countwords(txt):
    words = txt.split()
    counts = {}
    for word in words:
        if counts.has_key(word):
            counts[word] = counts[word] + 1
        else:
            counts[word] = 1
    return counts

def wordle_advanced(counts):
    buf = ""
    for word in counts:
        # Wordle applet seems to crash on large input, so remove
        # single use words that won't appear anyway
        if counts[word] > 1:
            buf += word + ":" + str(counts[word]) + "\n"
    return buf
         

try:
    conn = MySQLdb.connect (host,
                            user,
                            password,
                            db)
except MySQLdb.Error, e:
    print "Error %d: %s" % (e.args[0], e.args[1])
    sys.exit (1)

cursor = conn.cursor ()
cursor.execute ("SELECT distinct year(post_date) from " + poststable + " WHERE post_type = 'post' AND post_status = 'publish'")
years = []
while (1):
    row = cursor.fetchone()
    if row == None: break
    years.append(row[0])

for year in years:
    yeardump = ''
    cursor.execute ("SELECT post_title, post_content, post_date FROM " + poststable + " WHERE post_type = 'post' AND post_status = 'publish' AND year(post_date) = %s ORDER BY post_date ASC", year)
    while (1):
        row = cursor.fetchone()
        if row == None: break
        yeardump += "\n" + row[0] + "\n" + strip_tags(row[1])
        print str(year) + " " + str(row[2])
        yeardump = strip_punctuation(yeardump)
        wordcounts = countwords(yeardump)
        out = wordle_advanced(wordcounts)
    out_file = open("wp-year-dump-"+str(year)+".txt", "w")
    out_file.write(out)
    out_file.close()