#!/usr/bin/env python
"""
Rough script to clean up the book "Creating Applications with Mozilla"
and compact it into a single HTML file (written to ``test.html``).

Prerequisites:

- the single HTML book pages; download them into the same directory where
  this script resides (they are read from the current working directory),
  e.g. with
      wget -r --no-parent http://books.mozdev.org/html/index.html
- the elementtree and elementtidy packages, available from
  http://www.effbot.org

Enjoy!

September 19, 2003
ludo@asiatica.org
"""

import glob
import os
import re
from time import localtime, strftime

try:
    from elementtree.ElementTree import ElementTree, Element, SubElement
    from elementtidy import TidyHTMLTreeBuilder
except ImportError:
    # Keep the module importable so the pure helpers below stay usable;
    # main() reports the missing prerequisites when the script is run.
    ElementTree = Element = SubElement = TidyHTMLTreeBuilder = None

XHTML = "{http://www.w3.org/1999/xhtml}"

# Pages that always open the merged book, in this order.
FRONT_PAGES = ('index.html', 'f3.html')

# File name prefixes of the remaining pages, in book order.
PAGE_PREFIXES = ('mozilla-pref', 'mozilla-chp', 'mozilla-app', 'appc')

# Values of the ``class`` attribute marking navigation blocks to drop.
NAV_CLASSES = ('NAVHEADER', 'NAVFOOTER')

# The original "%B %b %d, %Y %T" printed the month twice (full and
# abbreviated) and relied on %T, which is a platform extension rather than
# one of the strftime directives documented by Python.
TIME_FORMAT = "%B %d, %Y %H:%M:%S"

PAGE_SUFFIX = '.html'

# Matches a leading URL scheme such as "http:", "ftp:" or "mailto:".
_SCHEME = re.compile(r'^[A-Za-z][A-Za-z0-9+.\-]*:')


def natural_key(filename):
    """Return a sort key ordering the numeric parts of *filename* numerically.

    The ``.html`` suffix is dropped and the rest is split on ``-``.  Every
    token becomes a ``(rank, value)`` pair: ``(0, int)`` for tokens that
    parse as integers, ``(1, str)`` otherwise.  Tagging the tokens keeps
    the "numbers before words" order Python 2 gave to mixed int/str
    comparisons, and avoids the TypeError Python 3 raises for them.
    """
    key = []
    for token in filename[:-len(PAGE_SUFFIX)].split('-'):
        try:
            key.append((0, int(token)))
        except ValueError:
            key.append((1, token))
    return key


def natural_sort(filenames):
    """Return *filenames* as a new list sorted by :func:`natural_key`.

    Names with equal keys are ordered by the plain file name.
    """
    decorated = [(natural_key(name), name) for name in filenames]
    decorated.sort()
    return [name for _, name in decorated]


def collect_pages():
    """Return the book pages to merge, in reading order.

    The fixed front pages come first, followed by the pages matching each
    prefix in ``PAGE_PREFIXES`` (looked up with ``glob`` in the current
    directory), naturally sorted within each prefix.
    """
    pages = list(FRONT_PAGES)
    for prefix in PAGE_PREFIXES:
        pages.extend(natural_sort(glob.glob(prefix + "*" + PAGE_SUFFIX)))
    return pages


def is_book_page(page):
    """Tell whether *page* (an href without fragment) is a relative link
    to an ``.html`` page, i.e. something that was merged into the book."""
    if not page.endswith(PAGE_SUFFIX):
        return False
    if page.startswith('/') or _SCHEME.match(page):
        return False
    return True


def rewrite_href(href):
    """Map a link of the multi-page book to an anchor of the merged file.

    - ``page.html#frag`` becomes ``#frag``;
    - ``page.html`` becomes ``#page``, the anchor main() inserts in front
      of each merged page;
    - everything else is returned unchanged: fragment-only links, links
      carrying a URL scheme (``mailto:``, ``http:``, ...), absolute paths
      and links to non-``.html`` resources.  The original script rewrote
      those too, which e.g. turned ``http://www.mozilla.org/`` into
      ``#http://www.mozilla.``.
    """
    hash_pos = href.find('#')
    if hash_pos < 0:
        page, fragment = href, None
    else:
        page, fragment = href[:hash_pos], href[hash_pos:]
    if not is_book_page(page):
        return href
    if fragment is not None:
        return fragment
    return '#' + page[:-len(PAGE_SUFFIX)]


def strip_namespace(tree):
    """Remove the XHTML namespace prefix from every tag in *tree*."""
    for el in tree.getiterator():
        if el.tag.startswith(XHTML):
            el.tag = el.tag[len(XHTML):]


def main():
    """Merge the downloaded book pages into ``test.html``.

    Raises SystemExit when the third-party prerequisites or ``index.html``
    are missing.
    """
    if TidyHTMLTreeBuilder is None:
        raise SystemExit('the elementtree and elementtidy packages are '
                         'required, see http://www.effbot.org')

    # last modification date of the download, shown in the title
    try:
        mtime = os.stat('index.html').st_mtime
    except OSError:
        raise SystemExit('no index.html file found')

    # create new tree
    main_root = Element('html')
    main_tree = ElementTree(main_root)
    main_head = SubElement(main_root, 'head')
    main_title = SubElement(main_head, 'title')
    main_title.text = (
        'Creating Applications with Mozilla - dl %s - parsed %s' % (
            strftime(TIME_FORMAT, localtime(mtime)),
            strftime(TIME_FORMAT, localtime())))
    main_body = SubElement(main_root, 'body')

    # get body for each file
    for page in collect_pages():
        # anchor named after the page, target of the rewritten links
        main_body.append(
            main_body.makeelement('a', {'name': page[:-len(PAGE_SUFFIX)]}))
        print("parsing file %s" % page)
        tree = TidyHTMLTreeBuilder.parse(page)
        # look the body up first: the tags still carry the namespace here
        body = tree.find(XHTML + 'body')
        # once per file is enough (this used to be redone for every div)
        strip_namespace(tree)
        for div in body:
            css_class = div.attrib.get('class')
            if css_class is not None and css_class not in NAV_CLASSES:
                main_body.append(div)

    # clean up links
    for link in main_tree.getiterator('a'):
        href = link.attrib.get('href')
        if href is not None:
            link.attrib['href'] = rewrite_href(href)

    # print tree
    main_tree.write('test.html')


if __name__ == "__main__":
    main()