#!/usr/bin/python
"""Get release announcement URLs from mailman archives.

Fetches the archive index at ``base_url``, follows the "[ Date ]" link of the
first month listed, and prints one line per post whose subject contains
"[ANNOUNCE]":

    <package> <version> <post url> <date>

This script targets Python 2 (it imports ``urlparse`` and BeautifulSoup 3).
"""

import os
import re
import sys
import urlparse
from argparse import ArgumentParser

import mechanize
from BeautifulSoup import BeautifulSoup

base_url = "http://lists.freedesktop.org/archives/wayland-devel"


def get_latest_month_archive(br, base_url):
    """Return the URL of the by-date index for the first month listed.

    Args:
        br: Browser-like object; only ``br.open(url).read()`` is used.
        base_url: URL of the archive's top-level index page.

    Returns:
        ``"<base_url>/<href>"`` for the "[ Date ]" link found in the second
        cell of the second table row, or None when the page does not have
        that structure or contains no such link.
    """
    html = br.open(base_url).read()
    soup = BeautifulSoup(html)

    # TODO: Determine the appropriate month to load.  For now, just pick
    # the first month listed.
    rows = soup.findAll('tr')
    if len(rows) < 2:
        # Row index 1 is used (presumably row 0 is the table header --
        # TODO confirm); without it there is no month to follow.
        return None

    cells = rows[1].findAll('td')
    if len(cells) < 2:
        return None

    # The second cell holds the per-month view links.
    for link in cells[1].findAll('a'):
        if link.text == "[ Date ]":
            return "%s/%s" % (base_url, link['href'])
    return None


def _make_browser():
    """Build the mechanize browser used for every request in this script."""
    br = mechanize.Browser()
    br.set_handle_redirect(True)
    br.set_handle_referer(True)
    br.set_handle_robots(False)
    br.set_handle_refresh(mechanize._http.HTTPRefreshProcessor(), max_time=1)
    br.addheaders = [
        ('User-agent', ('Mozilla/5.0 (X11; U; Linux i686; en-US;'
                        ' rv:1.9.0.1) Gecko/2008071615 Fedora/3.0.1-1.fc9 Firefox'
                        '/3.0.1')),
    ]
    return br


def main():
    """Print package, version, URL and date for each "[ANNOUNCE]" post.

    Exits with status 1 when the month archive URL cannot be determined.
    """
    ### Handle program arguments
    parser = ArgumentParser(
        description='Get release announcement URLs from mailman archives')
    # No options are defined yet; parsing still provides --help and rejects
    # unexpected arguments.
    parser.parse_args()

    br = _make_browser()

    month_archive_url = get_latest_month_archive(br, base_url)
    if month_archive_url is None:
        print("Error: Couldn't find month archive url")
        sys.exit(1)

    soup = BeautifulSoup(br.open(month_archive_url).read())
    for item in soup.findAll('li'):
        # Parse the subject
        links = item.findAll('a')
        if not links:
            continue
        subject = links[0].text
        if "[ANNOUNCE]" not in subject:
            continue

        # The subject is read as "<tag> <package> <version> ...".
        words = subject.split(' ')
        if len(words) < 3:
            # %r keeps the message ASCII-safe for non-ASCII subjects.
            sys.stderr.write(
                "Warning: skipping malformed subject %r\n" % (subject,))
            continue
        package, version = words[1], words[2]

        ## Parse the author
        #name = item.findAll('i')
        #if len(name) < 1:
        #    continue
        #author = name[0].text

        # Post hrefs are relative to the month index page; urljoin replaces
        # the index's last path segment with the href.
        post_url = urlparse.urljoin(month_archive_url, links[0]['href'])

        post_soup = BeautifulSoup(br.open(post_url).read())
        # The first <i> element is taken as the date (presumably where the
        # archive page puts it -- TODO confirm); fall back to "tbd" if absent.
        italics = post_soup.findAll('i')
        date_str = italics[0].text if italics else "tbd"

        print("%s %s %s %s" % (package, version, post_url, date_str))


if __name__ == '__main__':
    main()