#!/usr/bin/env python
"""
Rough script that cleans up the book "Creating Applications with Mozilla"
and compacts its pages into a single HTML file.

prerequisites:

-   the single html book pages, download them in the same directory where this
    script resides, eg with

    wget -r --no-parent http://books.mozdev.org/html/index.html
    
-   the elementtree and elementtidy packages, available from http://www.effbot.org

Enjoy!
September 19, 2003
ludo@asiatica.org
"""
import glob, os
from time import localtime, strftime
from elementtree.ElementTree import ElementTree, Element, SubElement
from elementtidy import TidyHTMLTreeBuilder

# Namespace prefix, in "{uri}" form, carried by the tags of the parsed pages.
XHTML = "{http://www.w3.org/1999/xhtml}"
# Layout of the two timestamps shown in the title, e.g.
# "September 19, 2003 14:05:00".  Only portable strftime directives are used:
# the previous "%B %b ... %T" printed the month twice and relied on %T, which
# not every platform's strftime implements.
TIMESTAMP_FORMAT = "%B %d, %Y %H:%M:%S"

# Skeleton of the single output document: <html><head><title/></head><body/>.
main_root = Element('html')
main_tree = ElementTree(main_root)
main_head = SubElement(main_root, 'head')
main_title = SubElement(main_head, 'title')
# The modification time of index.html stands in for the download date; a
# missing index.html means the book pages are not in the current directory.
try:
    mtime = os.path.getmtime('index.html')
except OSError:
    raise SystemExit('no index.html file found')
main_title.text = 'Creating Applications with Mozilla - dl %s - parsed %s' % (
    strftime(TIMESTAMP_FORMAT, localtime(mtime)),
    strftime(TIMESTAMP_FORMAT, localtime()))
main_body = SubElement(main_root, 'body')

# The two opening pages come first; every other page is appended below.
files = ['index.html', 'f3.html']


def _page_sort_key(filename):
    """Return the '-'-separated pieces of a page name, numeric ones as ints.

    The trailing five characters (the ".html" suffix) are dropped first.
    Converting numeric pieces makes "...-10" sort after "...-9" instead of
    after "...-1".
    """
    pieces = []
    for piece in filename[:-5].split('-'):
        try:
            pieces.append(int(piece))
        except ValueError:
            pieces.append(piece)
    return pieces


# Collect the pages group by group (preface, chapters, appendixes, ...),
# each group ordered by its numeric-aware key.
for prefix in ('mozilla-pref', 'mozilla-chp', 'mozilla-app', 'appc'):
    keyed = [(_page_sort_key(name), name)
             for name in glob.glob(prefix + "*.html")]
    keyed.sort()
    files.extend([name for key, name in keyed])
# Parse every page and move the useful children of its <body> into the
# output document.
for f in files:
    # Named anchor (page name without ".html") marking where this page starts
    # in the merged document.
    main_body.append(main_body.makeelement('a', {'name': f[:-5]}))
    print("parsing file %s" % f)
    tree = TidyHTMLTreeBuilder.parse(f)
    # Look the body up while tags still carry the namespace prefix.
    body = tree.find(XHTML + 'body')
    if body is None:
        raise SystemExit('no body element found in %s' % f)
    # Strip the namespace prefix from every tag, once per file, so the output
    # uses plain tag names.
    for el in tree.getiterator():
        if el.tag.startswith(XHTML):
            el.tag = el.tag[len(XHTML):]
    # Keep only children that have a class attribute and are not the
    # navigation header/footer blocks.
    for child in body:
        if 'class' not in child.attrib:
            continue
        if child.attrib['class'] in ('NAVHEADER', 'NAVFOOTER'):
            continue
        main_body.append(child)
def _has_url_scheme(href):
    """Return True when href starts with a scheme such as http: or mailto:.

    Only the part before the first '/', '#' or '?' is inspected, so a colon
    inside a path, query or fragment does not count as a scheme.
    """
    head = href.split('#', 1)[0].split('?', 1)[0].split('/', 1)[0]
    return ':' in head


# Point links between book pages at anchors inside the merged document.
for el in main_tree.getiterator('a'):
    href = el.attrib.get('href')
    # Anchors without an href and links that leave the book (http:, ftp:,
    # mailto:, ...) are left untouched.
    if href is None or _has_url_scheme(href):
        continue
    target = href.find('#')
    if target > 0:
        # "page.html#id" -> "#id": the fragment now lives in this document.
        el.attrib['href'] = href[target:]
    elif target < 0 and href.endswith('.html'):
        # "page.html" -> "#page": jump to a named anchor for that page.
        el.attrib['href'] = "#" + href[:-len('.html')]
    # target == 0 is already a local "#id" link; non-.html targets are kept
    # as they are rather than being truncated.
# Serialize the merged tree to the file test.html (relative path, so it is
# created in the current working directory).
main_tree.write('test.html')