Tuesday, February 10, 2009
Downloading your Blogger archives

A friend was looking for a way to grab an archive of his Blogger posts into a CSV file he could do text mining on (and, presumably, use as a low-fi backup). I wrote this Python script for him — enjoy.

#!/usr/bin/env python
#
# Copyright (C) 2009 by Jon Moore
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program.  If not, see <http://www.gnu.org/licenses/>.

import csv
import urllib2
import unicodedata
import xml.etree.ElementTree as etree

# Atom feed of the blog to archive; swap in your own blog's feed URL.
blog_feed = 'http://codeartisan.blogspot.com/feeds/posts/default'
# Name of the CSV file the posts are written to (current directory).
output = 'posts.csv'

# XML namespace prefix used for every element lookup in the Atom feed.
ATOM_NS = 'http://www.w3.org/2005/Atom'

def norm(s):
    """Coerce feed text to a plain ASCII byte string for the CSV.

    Accented characters are first NFKD-decomposed so their base letters
    survive the ASCII conversion (e.g. u'caf\\xe9' -> 'cafe') instead of
    being dropped outright; any remaining non-ASCII code points are then
    discarded by the 'ignore' error handler.

    Returns None for None or empty input, so empty feed elements stay
    empty in the CSV row.
    """
    if not s:
        return None
    try:
        # NFKD splits e.g. 'é' into 'e' + combining accent; the bare
        # accent is what 'ignore' then strips, leaving the base letter.
        s = unicodedata.normalize('NFKD', s)
    except TypeError:
        pass  # already a plain (ASCII) byte string; nothing to decompose
    return s.encode('ascii', 'ignore')

def main():
    f = open(output, 'wb')
    csv_wr = csv.writer(f)
    url = blog_feed + '?max-results=100'
    csv_wr.writerow(['id','published','updated','permalink','title','content'])
    while url:
        print "fetching", url
        feed = etree.fromstring(urllib2.urlopen(url).read())
        for entry in feed.findall("{%s}entry" % ATOM_NS):
            id = entry.find("{%s}id" % ATOM_NS).text
            published = entry.find("{%s}published" % ATOM_NS).text
            updated = entry.find("{%s}updated" % ATOM_NS).text
            title = norm(entry.find("{%s}title" % ATOM_NS).text)
            content = norm(entry.find("{%s}content" % ATOM_NS).text)
            perm_url = ''
            for link in entry.findall("{%s}link" % ATOM_NS):
                if (link.get('rel') == 'alternate'
                    and link.get('type') == 'text/html'):
                    perm_url = link.get('href')
                    break
            csv_wr.writerow([id,published,updated,perm_url,title,content])
            print "wrote",id
        url = None
        for link in feed.findall("{%s}link" % ATOM_NS):
            if link.get('rel') == 'next':
                url = link.get('href')
                break
    f.close()

# Run the download only when executed as a script, not on import.
if __name__ == "__main__":
    main()