# User:Gdr/history.py
#
# From Wikipedia, the free encyclopedia
#!/usr/bin/python
#
#
#                 HISTORY.PY -- WIKIPEDIA PAGE HISTORY
#                           Gdr, 2005-05-12
#
#
# INTRODUCTION
#
# This Python library analyzes the history of articles on the English
# Wikipedia.
#
# You must have the Python Wikipedia Robot Framework
# (http://sourceforge.net/projects/pywikipediabot/).
#
#
# LICENCE
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or (at
# your option) any later version.

import calendar
import re
import time
import wikipedia

# Patterns for picking apart one item of a page-history listing.
#
# Both patterns capture the text of a link that contains at least one
# digit (historyParse treats it as the date of the edit), followed by
# the title of a "User:" link; that last group is None when the link
# title is "Special:Contributions" instead. edit1_re additionally
# captures, as its first group, the digits of the "oldid" form field
# that must precede the two links.
_date_user_pattern = (r'.* title="[^\"]*">([^<]*\d[^<]*)</a>'
                      r'.* title="(?:(User:[^\"]+)|Special:Contributions)">')
edit1_re = re.compile(r'name="oldid" value="(\d+)"' + _date_user_pattern)
edit2_re = re.compile(_date_user_pattern)

# Lookup table from English month names -- both the three-letter
# abbreviation and the full name -- to the month number (1 to 12).
# 'May' is its own abbreviation, so it appears only once.
months = dict(
    Jan=1, Feb=2, Mar=3, Apr=4, May=5, Jun=6,
    Jul=7, Aug=8, Sep=9, Oct=10, Nov=11, Dec=12,
    January=1, February=2, March=3, April=4, June=6,
    July=7, August=8, September=9, October=10, November=11, December=12,
)

def dateParse(date):
    """dateParse(date)
    Convert the textual date 'date' into a number of seconds since the
    epoch, interpreting its fields as UTC. The string is cut into
    fields at every run of characters that are neither word characters
    nor colons, and each field is classified by its shape: four digits
    give the year, one or two digits the day of the month, 'dd:dd' the
    hour and minute, and a key of 'months' the month. Fields of any
    other shape are ignored. Whatever the string does not supply is
    taken from the current UTC time, except the seconds, which are
    always zero."""
    # Defaults: the current UTC date and time, to the minute.
    year, month, day, hour, minute = time.gmtime()[:5]

    for token in re.split(r'(?u)[^\w:]+', date):
        if re.match(r'^\d\d\d\d$', token):
            year = int(token)
            continue
        if re.match(r'^\d\d?$', token):
            day = int(token)
            continue
        if re.match(r'^\d\d:\d\d$', token):
            hour, minute = int(token[0:2]), int(token[3:5])
            continue
        if token in months:
            month = months[token]

    return calendar.timegm([year, month, day, hour, minute, 0])

def historyParse(edit):
    """historyParse(edit)
    Turn 'edit', the text of one item of a page-history listing, into a
    dictionary. When the item matches edit1_re the dictionary has keys
    'oldid', 'date' and 'user'; when it only matches edit2_re the
    'oldid' key is absent. 'date' is the value dateParse returns for
    the captured date text, and 'user' is the captured "User:..." link
    title, or None when the link was to Special:Contributions. Raises
    wikipedia.Error if neither pattern matches."""
    # First choice: an item that carries a revision id.
    match = edit1_re.search(edit)
    if match:
        oldid, date, user = match.groups()
        return {'oldid': oldid, 'date': dateParse(date), 'user': user}

    # Fallback: an item with only a date and a user.
    match = edit2_re.search(edit)
    if match:
        date, user = match.groups()
        return {'date': dateParse(date), 'user': user}

    raise wikipedia.Error("Can't parse edit:\n" + edit)

def historyPage(page, limit = None, offset = None):
    """historyPage(page, limit = None, offset = None)
    Fetch and parse the history of the article given by 'page'.
    'limit', if given, is the maximum number of edits to ask for, and
    'offset', if given, says where in the history to start. The result
    is a list with one dictionary per edit, as built by historyParse:
    'oldid' is the id of the revision following the edit, if known (in
    MediaWiki 1.4 the current revision has no id), 'date' is the time
    of the edit in seconds since the epoch, and 'user' is the user who
    made the edit. Raises wikipedia.Error if the list of edits cannot
    be found in the fetched page or one of its items cannot be
    parsed."""
    # Let the framework's throttle run before making another request.
    wikipedia.get_throttle()

    # Work out the host and the address of the history page on it.
    host = page.site().hostname()
    pieces = ['/w/index.php?title=%s&action=%s'%(page.urlname(),'history')]
    if limit:
        pieces.append('&limit=%d' % limit)
    if offset:
        pieces.append('&offset=%d' % offset)
    address = ''.join(pieces)

    # Fetch it.
    wikipedia.output(u"Getting history for %s" % page.linkname())
    text, charset = wikipedia.getUrl(host, address)

    # The edits are the items of the "pagehistory" list.
    listing = re.search(r'<ul id="pagehistory"><li>(.*)</li></ul>', text, re.M)
    if listing is None:
        raise wikipedia.Error("Can't find the list of edits:" + text)
    return [historyParse(item) for item in listing.group(1).split('</li><li>')]

def getOldRevision(page, oldid):
    """getOldRevision(page, oldid)
    Returns revision 'oldid' of article given by 'page': the contents
    of the <textarea> on the edit page for that revision, unescaped,
    with trailing whitespace removed, and decoded to unicode using the
    charset reported for the fetched page (undecodable bytes are
    replaced). Raises wikipedia.Error if the fetched page contains no
    <textarea>."""
    # Let the framework's throttle run before making another request.
    wikipedia.get_throttle()
    host = page.site().hostname()
    address = page.site().edit_address(page.urlname()) + '&oldid=%s' % oldid
    # Report progress through the framework, as historyPage does,
    # rather than printing to stdout.
    wikipedia.output(u"Getting revision %s of %s" % (oldid, page.linkname()))
    text, charset = wikipedia.getUrl(host, address, page.site())

    # re.S lets '.' span the line breaks inside the revision text.
    m = re.search('<textarea[^>]*>(.*)</textarea>', text, re.S)
    if not m:
        # Fail with the library's own error type instead of an
        # AttributeError from calling .group() on None.
        raise wikipedia.Error("Can't find the text of revision %s:" % oldid
                              + text)
    return unicode(wikipedia.unescape(m.group(1)).rstrip(),
                   charset, errors = 'replace')