cpython/Doc/tools/mkhowto

#! /usr/bin/env python
#  -*- Python -*-
"""usage: %(program)s [options...] file ...

Options specifying formats to build:
    --html		HyperText Markup Language (default)
    --pdf		Portable Document Format
    --ps		PostScript
    --dvi		'DeVice Independent' format from TeX
    --text		ASCII text (requires lynx)

    More than one output format may be specified, or --all.

HTML options:
    --address, -a	Specify an address for page footers.
    --dir		Specify the directory for HTML output.
    --link		Specify the number of levels to include on each page.
    --split, -s		Specify a section level for page splitting, default: %(max_split_depth)s.
    --iconserver, -i	Specify location of icons (default: ./).
    --image-type	Specify the image type to use in HTML output;
                        values: gif (default), png.
    --numeric           Don't rename the HTML files; just keep node#.html for
                        the filenames.
    --style             Specify the CSS file to use for the output (filename,
                        not a URL).
    --up-link           URL to a parent document.
    --up-title          Title of a parent document.
    --favicon           Icon to display in the browser's location bar.

Other options:
    --a4		Format for A4 paper.
    --letter		Format for US letter paper (the default).
    --help, -H		Show this text.
    --logging, -l	Log stdout and stderr to a file (*.how).
    --debugging, -D	Echo commands as they are executed.
    --keep, -k		Keep temporary files around.
    --quiet, -q		Do not print command output to stdout.
			(stderr is also lost,  sorry; see *.how for errors)
"""

import getopt
import glob
import os
import re
import shutil
import string
import sys


try:
    os.path.abspath
except AttributeError:
    # Python 1.5.1 or earlier: os.path lacks abspath(), so install a
    # minimal replacement that the rest of this script can rely on.
    def abspath(path):
        """Return a normalized, absolute version of *path*."""
        if os.path.isabs(path):
            return os.path.normpath(path)
        # Relative paths are resolved against the current directory.
        return os.path.normpath(os.path.join(os.getcwd(), path))

    os.path.abspath = abspath


# Absolute form of sys.path[0]; when this file is run as a script that is
# the directory containing the script itself.
MYDIR = os.path.abspath(sys.path[0])
# Parent of MYDIR; the "texinputs", "perl", "html" and "paper-*"
# directories used below are looked up relative to it.
TOPDIR = os.path.dirname(MYDIR)

# makeindex style file, passed to makeindex with -s.
ISTFILE = os.path.join(TOPDIR, "texinputs", "python.ist")
# Perl script run over the generated *.html files unless --numeric is given.
NODE2LABEL_SCRIPT = os.path.join(MYDIR, "node2label.pl")
# Base LaTeX2HTML init file; its contents are copied into each job's
# generated .l2h file.
L2H_INIT_FILE = os.path.join(TOPDIR, "perl", "l2hinit.perl")

# Names of the external programs placed at the start of the command lines
# handed to os.system().  PYTHON_BINARY is not referenced in this file.
BIBTEX_BINARY = "bibtex"
DVIPS_BINARY = "dvips"
LATEX_BINARY = "latex"
LATEX2HTML_BINARY = "latex2html"
LYNX_BINARY = "lynx"
MAKEINDEX_BINARY = "makeindex"
PDFLATEX_BINARY = "pdflatex"
PERL_BINARY = "perl"
PYTHON_BINARY = "python"


def usage(options):
    """Write the usage text to sys.stdout.

    The module docstring is used as a %-format template and *options*
    supplies the values (it is indexed by name, e.g. "program").
    """
    text = __doc__ % options
    sys.stdout.write(text)
    sys.stdout.write("\n")

def error(options, message, err=2):
    """Report a fatal command-line problem and terminate the process.

    *message* is written to stderr, followed by a blank line and the
    usage text, and the process then exits with status *err*.

    options -- object passed through to usage() to format the help text.
    message -- string (or exception instance) describing the problem.
    err     -- process exit status; defaults to 2.
    """
    # Redirect stdout so that usage(), which writes to sys.stdout, lands
    # on stderr together with the message.
    sys.stdout = sys.stderr
    sys.stderr.write("%s\n\n" % (message,))
    usage(options)
    # Honor the caller-supplied status (it used to be ignored in favor of
    # a hard-coded 2).
    sys.exit(err)


class Options:
    """Settings collected from the command line and shared by all jobs.

    Class attributes hold the defaults; parse() overrides them on the
    instance from an argument list.  Code that fills in attributes by
    hand instead of calling parse() must call initialize() afterwards.

    Instances can be indexed by attribute name (options["program"]) so
    they can be used directly as the mapping for the %-formatted usage
    message.
    """

    program = os.path.basename(sys.argv[0])
    #
    address = ''
    builddir = None
    debugging = 0
    discard_temps = 1
    have_temps = 0
    icon_server = "."
    image_type = "gif"
    logging = 0
    max_link_depth = 3
    max_split_depth = 6
    paper = "letter"
    quiet = 0
    runs = 0
    numeric = 0
    global_module_index = None
    style_file = os.path.join(TOPDIR, "html", "style.css")
    about_file = os.path.join(TOPDIR, "html", "about.dat")
    up_link = None
    up_title = None
    favicon = None
    #
    # 'dvips_safe' is a weird option.  It is used mostly to make
    # LaTeX2HTML not try to be too smart about protecting the user
    # from a bad version of dvips -- some versions would core dump if
    # the path to the source DVI contained a dot, and it's apparently
    # difficult to determine if the version available has that bug.
    # This option gets set when PostScript output is requested
    # (because we're going to run dvips regardless, and we'll either
    # know it succeeds before LaTeX2HTML is run, or we'll have
    # detected the failure and bailed), or the user asserts that it's
    # safe from the command line.
    #
    # So, why does LaTeX2HTML think it appropriate to protect the user
    # from a dvips that's only potentially going to core dump?  Only
    # because they want to avoid doing a lot of work just to have to
    # bail later with no useful intermediates.  Unfortunately, they
    # bail *before* they know whether dvips will be needed at all.
    # I've gone around the bush a few times with the LaTeX2HTML
    # developers over whether this is appropriate behavior, and they
    # don't seem interested in changing their position.
    #
    dvips_safe = 0
    #
    DEFAULT_FORMATS = ("html",)
    ALL_FORMATS = ("dvi", "html", "pdf", "ps", "text")

    def __init__(self):
        # Per-instance lists so that instances never share mutable state.
        self.formats = []
        self.l2h_init_files = []

    def __getitem__(self, key):
        """Return the attribute named *key*; raise KeyError if missing."""
        # This is used when formatting the usage message.
        try:
            return getattr(self, key)
        except AttributeError:
            raise KeyError(key)

    def _int_arg(self, opt, arg):
        """Convert the argument of option *opt* to an integer.

        A malformed value is reported as getopt.error so that it is
        handled like any other command-line mistake instead of
        surfacing as a ValueError traceback.
        """
        try:
            return int(arg)
        except ValueError:
            raise getopt.error(
                "option %s requires an integer argument, not %s"
                % (opt, repr(arg)))

    def parse(self, args):
        """Apply the command-line arguments in *args* to this object.

        Returns the list of non-option arguments (the files to process).
        Raises getopt.error for unknown options, missing arguments and
        non-integer values where an integer is required.  The help
        options print the usage text and exit the process.
        """
        opts, args = getopt.getopt(args, "Hi:a:s:lDkqr:",
                                   ["all", "postscript", "help", "iconserver=",
                                    "address=", "a4", "letter", "l2h-init=",
                                    "link=", "split=", "logging", "debugging",
                                    "keep", "quiet", "runs=", "image-type=",
                                    "about=", "numeric", "style=", "paper=",
                                    "up-link=", "up-title=", "dir=",
                                    "global-module-index=", "dvips-safe",
                                    "favicon="]
                                   + list(self.ALL_FORMATS))
        for opt, arg in opts:
            if opt == "--all":
                self.formats = list(self.ALL_FORMATS)
                self.dvips_safe = "ps" in self.formats
            elif opt in ("-H", "--help"):
                usage(self)
                sys.exit()
            elif opt in ("-i", "--iconserver"):
                # "-i" is accepted by getopt and documented in the usage
                # text, so it has to be handled here as well.
                self.icon_server = arg
            elif opt in ("-a", "--address"):
                self.address = arg
            elif opt == "--a4":
                self.paper = "a4"
            elif opt == "--letter":
                self.paper = "letter"
            elif opt == "--link":
                self.max_link_depth = self._int_arg(opt, arg)
            elif opt in ("-s", "--split"):
                self.max_split_depth = self._int_arg(opt, arg)
            elif opt in ("-l", "--logging"):
                self.logging = self.logging + 1
            elif opt in ("-D", "--debugging"):
                self.debugging = self.debugging + 1
            elif opt in ("-k", "--keep"):
                self.discard_temps = 0
            elif opt in ("-q", "--quiet"):
                self.quiet = 1
            elif opt in ("-r", "--runs"):
                self.runs = self._int_arg(opt, arg)
            elif opt == "--image-type":
                self.image_type = arg
            elif opt == "--about":
                # always make this absolute:
                self.about_file = os.path.normpath(
                    os.path.abspath(arg))
            elif opt == "--numeric":
                self.numeric = 1
            elif opt == "--style":
                self.style_file = os.path.abspath(arg)
            elif opt == "--l2h-init":
                self.l2h_init_files.append(os.path.abspath(arg))
            elif opt == "--favicon":
                self.favicon = arg
            elif opt == "--up-link":
                self.up_link = arg
            elif opt == "--up-title":
                self.up_title = arg
            elif opt == "--global-module-index":
                self.global_module_index = arg
            elif opt == "--dir":
                if os.sep == "\\":
                    # Plain string replacement: a lone backslash is not
                    # a valid re.sub() replacement template.
                    arg = arg.replace("/", "\\")
                self.builddir = os.path.expanduser(arg)
            elif opt == "--paper":
                self.paper = arg
            elif opt == "--dvips-safe":
                self.dvips_safe = 1
            #
            # Format specifiers:
            #
            elif opt[2:] in self.ALL_FORMATS:
                self.add_format(opt[2:])
            elif opt == "--postscript":
                # synonym for --ps
                self.add_format("ps")
        self.initialize()
        #
        # return the args to allow the caller access:
        #
        return args

    def add_format(self, format):
        """Add a format to the formats list if not present."""
        if format not in self.formats:
            if format == "ps":
                # assume this is safe since we're going to run it anyway
                self.dvips_safe = 1
            self.formats.append(format)

    def initialize(self):
        """Complete initialization.  This is needed if parse() isn't used."""
        # add the default format if no formats were specified; copy it so
        # that self.formats stays a list and the class-level tuple is
        # never shared or mutated:
        if not self.formats:
            self.formats = list(self.DEFAULT_FORMATS)
        # determine the base set of texinputs directories.  Splitting an
        # empty or unset TEXINPUTS yields [''], so the list always ends
        # with at least one (possibly empty) entry:
        texinputs = os.environ.get("TEXINPUTS", "").split(os.pathsep)
        self.base_texinputs = [
            os.path.join(TOPDIR, "paper-" + self.paper),
            os.path.join(TOPDIR, "texinputs"),
            ] + texinputs
        if self.builddir:
            self.builddir = os.path.abspath(self.builddir)


class Job:
    """Build the requested output formats for one LaTeX document.

    A Job is created from the shared options object and the path of a
    .tex file.  build() runs the external tools for each requested
    format.  Tool output is appended to a transcript file
    (self.log_filename), and a failing command terminates the process.
    """

    # Number of LaTeX passes performed so far for this document.
    latex_runs = 0
    # Set by build_aux() once the .aux file has been inspected.
    use_bibtex = 0

    def __init__(self, options, path):
        """Prepare the build directory, transcript and LaTeX2HTML init file.

        options -- parsed options object (formats, builddir, ...).
        path    -- path of the LaTeX source file for the document.
        """
        self.options = options
        self.doctype = get_doctype(path)
        self.filedir, self.doc = split_pathname(path)
        self.builddir = os.path.abspath(options.builddir or self.doc)
        if ("html" in options.formats or "text" in options.formats):
            if not os.path.exists(self.builddir):
                os.mkdir(self.builddir)
            self.log_filename = os.path.join(self.builddir, self.doc + ".how")
        else:
            self.log_filename = os.path.abspath(self.doc + ".how")
        # start every job with a fresh transcript:
        if os.path.exists(self.log_filename):
            os.unlink(self.log_filename)
        # keep one backup of an existing init file before regenerating it:
        l2hconf = self.doc + ".l2h"
        if os.path.exists(l2hconf):
            if os.path.exists(l2hconf + "~"):
                os.unlink(l2hconf + "~")
            os.rename(l2hconf, l2hconf + "~")
        self.l2h_aux_init_file = self.doc + ".l2h"
        self.write_l2h_aux_init_file()

    def build(self):
        """Produce every format listed in self.options.formats."""
        self.setup_texinputs()
        formats = self.options.formats
        if "dvi" in formats or "ps" in formats:
            self.build_dvi()
        if "pdf" in formats:
            self.build_pdf()
        if "ps" in formats:
            self.build_ps()
        if "html" in formats:
            self.require_temps()
            self.build_html(self.builddir)
            if self.options.icon_server == ".":
                # icons are referenced relative to the output, so copy
                # them next to the generated pages:
                pattern = os.path.join(TOPDIR, "html", "icons",
                                       "*." + self.options.image_type)
                imgs = glob.glob(pattern)
                if not imgs:
                    self.warning(
                        "Could not locate support images of type %s."
                        % repr(self.options.image_type))
                for fn in imgs:
                    new_fn = os.path.join(self.builddir, os.path.basename(fn))
                    shutil.copyfile(fn, new_fn)
        if "text" in formats:
            self.require_temps()
            need_html = "html" not in formats
            if need_html:
                tempdir = self.doc
            else:
                # The HTML pass above wrote its output to self.builddir,
                # which differs from self.doc when --dir is used.
                tempdir = self.builddir
            if self.options.max_split_depth != 1:
                # the text version is dumped from a single-page HTML
                # build, so override the split depth and build again:
                fp = open(self.l2h_aux_init_file, "a")
                try:
                    fp.write("# re-hack this file for --text:\n")
                    l2hoption(fp, "MAX_SPLIT_DEPTH", "1")
                    fp.write("1;\n")
                finally:
                    fp.close()
                tempdir = self.doc + "-temp-html"
                need_html = 1
            if need_html:
                self.build_html(tempdir, max_split_depth=1)
            self.build_text(tempdir)
        if self.options.discard_temps:
            self.cleanup()

    def setup_texinputs(self):
        """Export TEXINPUTS with the document directory searched first."""
        texinputs = [self.filedir] + list(self.options.base_texinputs)
        os.environ["TEXINPUTS"] = os.pathsep.join(texinputs)
        self.message("TEXINPUTS=" + os.environ["TEXINPUTS"])

    def build_aux(self, binary=None):
        """Run the first LaTeX pass, which generates the auxiliary files.

        binary -- LaTeX program to run; defaults to LATEX_BINARY.
        """
        if binary is None:
            binary = LATEX_BINARY
        # start from empty index files for this first pass:
        new_index(   "%s.ind" % self.doc, "genindex")
        new_index("mod%s.ind" % self.doc, "modindex")
        self.run("%s %s" % (binary, self.doc))
        self.use_bibtex = check_for_bibtex(self.doc + ".aux")
        self.latex_runs = 1

    def build_dvi(self):
        """Build the DVI output with LATEX_BINARY."""
        self.use_latex(LATEX_BINARY)

    def build_pdf(self):
        """Build the PDF output with PDFLATEX_BINARY."""
        self.use_latex(PDFLATEX_BINARY)

    def use_latex(self, binary):
        """Run *binary* together with makeindex/bibtex to format the document."""
        self.require_temps(binary=binary)
        if self.latex_runs < 2:
            if os.path.isfile("mod%s.idx" % self.doc):
                self.run("%s mod%s.idx" % (MAKEINDEX_BINARY, self.doc))
            use_indfix = 0
            if os.path.isfile(self.doc + ".idx"):
                use_indfix = 1
                # call to Doc/tools/fix_hack omitted; doesn't appear necessary
                self.run("%s %s.idx" % (MAKEINDEX_BINARY, self.doc))
                import indfix
                indfix.process(self.doc + ".ind")
            if self.use_bibtex:
                self.run("%s %s" % (BIBTEX_BINARY, self.doc))
            self.process_synopsis_files()
            self.run("%s %s" % (binary, self.doc))
            self.latex_runs = self.latex_runs + 1
            if os.path.isfile("mod%s.idx" % self.doc):
                self.run("%s -s %s mod%s.idx"
                         % (MAKEINDEX_BINARY, ISTFILE, self.doc))
            if use_indfix:
                self.run("%s -s %s %s.idx"
                         % (MAKEINDEX_BINARY, ISTFILE, self.doc))
                indfix.process(self.doc + ".ind")
            self.process_synopsis_files()
        #
        # and now finish it off:
        #
        if os.path.isfile(self.doc + ".toc") and binary == PDFLATEX_BINARY:
            import toc2bkm
            if self.doctype == "manual":
                bigpart = "chapter"
            else:
                bigpart = "section"
            toc2bkm.process(self.doc + ".toc", self.doc + ".bkm", bigpart)
        if self.use_bibtex:
            self.run("%s %s" % (BIBTEX_BINARY, self.doc))
        self.run("%s %s" % (binary, self.doc))
        self.latex_runs = self.latex_runs + 1

    def process_synopsis_files(self):
        """Drop a duplicated final line from each of the document's .syn files."""
        synopsis_files = glob.glob(self.doc + "*.syn")
        for path in synopsis_files:
            uniqify_module_table(path)

    def build_ps(self):
        """Convert the DVI file to PostScript with dvips."""
        self.run("%s -N0 -o %s.ps %s" % (DVIPS_BINARY, self.doc, self.doc))

    def build_html(self, builddir, max_split_depth=None):
        """Run LaTeX2HTML into *builddir* and post-process its output.

        builddir        -- output directory; created if missing, otherwise
                           its existing *.html files are removed first.
        max_split_depth -- split depth assumed for post-processing;
                           defaults to the value from the options.

        Exits the process with status 1 if the .tex source cannot be
        found along TEXINPUTS.
        """
        if max_split_depth is None:
            max_split_depth = self.options.max_split_depth
        texfile = None
        for p in os.environ["TEXINPUTS"].split(os.pathsep):
            fn = os.path.join(p, self.doc + ".tex")
            if os.path.isfile(fn):
                texfile = fn
                break
        if not texfile:
            self.warning("Could not locate %s.tex; aborting." % self.doc)
            sys.exit(1)
        # remove leading ./ (or equiv.); might avoid problems w/ dvips
        if texfile[:2] == os.curdir + os.sep:
            texfile = texfile[2:]
        # build the command line and run LaTeX2HTML:
        if not os.path.isdir(builddir):
            os.mkdir(builddir)
        else:
            for fname in glob.glob(os.path.join(builddir, "*.html")):
                os.unlink(fname)
        args = [LATEX2HTML_BINARY,
                "-init_file", self.l2h_aux_init_file,
                "-dir", builddir,
                texfile
                ]
        self.run(" ".join(args))     # XXX need quoting!
        # ... postprocess
        shutil.copyfile(self.options.style_file,
                        os.path.join(builddir, self.doc + ".css"))
        shutil.copyfile(os.path.join(builddir, self.doc + ".html"),
                        os.path.join(builddir, "index.html"))
        if max_split_depth != 1:
            self._copy_about_node(builddir)
            if not self.options.numeric:
                pwd = os.getcwd()
                try:
                    os.chdir(builddir)
                    self.run("%s %s *.html" % (PERL_BINARY, NODE2LABEL_SCRIPT))
                finally:
                    os.chdir(pwd)
        # These files need to be cleaned up here since builddir there
        # can be more than one, so we clean each of them.
        if self.options.discard_temps:
            for fn in ("images.tex", "images.log", "images.aux"):
                safe_unlink(os.path.join(builddir, fn))

    def _copy_about_node(self, builddir):
        """Copy the page for the "about" label in labels.pl to about.html."""
        label_file = os.path.join(builddir, "labels.pl")
        target = " = q/about/;\n"
        x = len(target)
        fp = open(label_file)
        try:
            while 1:
                line = fp.readline()
                if not line:
                    break
                if line[-x:] == target:
                    # the node file name is on the line after the label
                    line = fp.readline()
                    m = re.search(r"\|(node\d+\.[a-z]+)\|", line)
                    if m is None:
                        self.warning(
                            "Could not determine the 'about' node from %s."
                            % label_file)
                    else:
                        about_node = m.group(1)
                        shutil.copyfile(os.path.join(builddir, about_node),
                                        os.path.join(builddir, "about.html"))
                    break
        finally:
            fp.close()

    def build_text(self, tempdir=None):
        """Dump index.html from *tempdir* to <doc>.txt using lynx."""
        if tempdir is None:
            tempdir = self.doc
        indexfile = os.path.join(tempdir, "index.html")
        self.run("%s -nolist -dump %s >%s.txt"
                 % (LYNX_BINARY, indexfile, self.doc))

    def require_temps(self, binary=None):
        """Make sure the first LaTeX pass (build_aux) has been performed."""
        if not self.latex_runs:
            self.build_aux(binary=binary)

    def _read_file(self, filename):
        """Return the contents of *filename*, always closing the file."""
        fp = open(filename)
        try:
            return fp.read()
        finally:
            fp.close()

    def write_l2h_aux_init_file(self):
        """Generate the LaTeX2HTML init file for this document.

        The file consists of a short preamble, the contents of
        L2H_INIT_FILE, any --l2h-init files, and one Perl assignment for
        each option that has a true value.
        """
        options = self.options
        fp = open(self.l2h_aux_init_file, "w")
        try:
            d = string_to_perl(os.path.dirname(L2H_INIT_FILE))
            fp.write("package main;\n"
                     "push (@INC, '%s');\n"
                     "$mydir = '%s';\n"
                     % (d, d))
            fp.write(self._read_file(L2H_INIT_FILE))
            for filename in options.l2h_init_files:
                fp.write("\n# initialization code incorporated from:\n# ")
                fp.write(filename)
                fp.write("\n")
                fp.write(self._read_file(filename))
            fp.write("\n"
                     "# auxillary init file for latex2html\n"
                     "# generated by mkhowto\n"
                     "$NO_AUTO_LINK = 1;\n"
                     )
            l2hoption(fp, "ABOUT_FILE", options.about_file)
            l2hoption(fp, "ICONSERVER", options.icon_server)
            l2hoption(fp, "IMAGE_TYPE", options.image_type)
            l2hoption(fp, "ADDRESS", options.address)
            l2hoption(fp, "MAX_LINK_DEPTH", options.max_link_depth)
            l2hoption(fp, "MAX_SPLIT_DEPTH", options.max_split_depth)
            l2hoption(fp, "EXTERNAL_UP_LINK", options.up_link)
            l2hoption(fp, "EXTERNAL_UP_TITLE", options.up_title)
            l2hoption(fp, "FAVORITES_ICON", options.favicon)
            l2hoption(fp, "GLOBAL_MODULE_INDEX", options.global_module_index)
            l2hoption(fp, "DVIPS_SAFE", options.dvips_safe)
            fp.write("1;\n")
        finally:
            fp.close()

    def cleanup(self):
        """Remove the temporary files left behind by the build tools.

        The transcript is kept only when logging was requested, and the
        generated init file only when debugging was requested.
        """
        for pattern in ("%s.aux", "%s.log", "%s.out", "%s.toc", "%s.bkm",
                        "%s.idx", "%s.ilg", "%s.ind", "%s.pla",
                        "%s.bbl", "%s.blg",
                        "mod%s.idx", "mod%s.ind", "mod%s.ilg",
                        ):
            safe_unlink(pattern % self.doc)
        for path in glob.glob(self.doc + "*.syn"):
            safe_unlink(path)
        for spec in ("IMG*", "*.pl", "WARNINGS", "index.dat", "modindex.dat"):
            pattern = os.path.join(self.doc, spec)
            for path in glob.glob(pattern):
                safe_unlink(path)
        if "dvi" not in self.options.formats:
            safe_unlink(self.doc + ".dvi")
        if os.path.isdir(self.doc + "-temp-html"):
            shutil.rmtree(self.doc + "-temp-html", ignore_errors=1)
        if not self.options.logging:
            os.unlink(self.log_filename)
        if not self.options.debugging:
            os.unlink(self.l2h_aux_init_file)

    def run(self, command):
        """Run *command* through the shell; exit the process if it fails.

        On non-Windows platforms stdin is taken from /dev/null and both
        output streams are appended to the transcript.  On failure the
        transcript lines for this command are echoed to stderr and the
        process exits with the command's exit status, or 1 when no exit
        status is available.
        """
        self.message(command)
        if sys.platform.startswith("win"):
            rc = os.system(command)
        else:
            rc = os.system("(%s) </dev/null >>%s 2>&1"
                           % (command, self.log_filename))
        if rc:
            self.warning(
                "Session transcript and error messages are in %s."
                % self.log_filename)
            result = 1
            if hasattr(os, "WIFEXITED"):
                if os.WIFEXITED(rc):
                    result = os.WEXITSTATUS(rc)
                    self.warning("Exited with status %s." % result)
                else:
                    # WTERMSIG (not WSTOPSIG) yields the signal that
                    # terminated the child.
                    self.warning("Killed by signal %s." % os.WTERMSIG(rc))
            else:
                self.warning("Return code: %s" % rc)
            sys.stderr.write("The relevant lines from the transcript are:\n")
            sys.stderr.write("-" * 72 + "\n")
            sys.stderr.writelines(get_run_transcript(self.log_filename))
            sys.exit(result)

    def message(self, msg):
        """Log a progress line ("+++ " prefix); echo it unless quiet."""
        msg = "+++ " + msg
        if not self.options.quiet:
            sys.stdout.write(msg + "\n")
        self.log(msg + "\n")

    def warning(self, msg):
        """Write a warning ("*** " prefix) to stderr and to the transcript."""
        msg = "*** %s\n" % msg
        sys.stderr.write(msg)
        self.log(msg)

    def log(self, msg):
        """Append *msg* verbatim to the transcript file."""
        fp = open(self.log_filename, "a")
        try:
            fp.write(msg)
        finally:
            fp.close()


def get_run_transcript(filename):
    """Return the transcript lines belonging to the most recent command.

    The result starts at the last line that begins with "+++ " and runs
    to the end of the file; if no such line exists, all lines are
    returned.
    """
    fp = open(filename)
    transcript = fp.readlines()
    fp.close()
    # Walk backwards until the most recent "+++ " marker is found.
    start = len(transcript)
    while start > 0:
        start = start - 1
        if transcript[start][:4] == "+++ ":
            break
    return transcript[start:]


def safe_unlink(path):
    """Remove the file *path*; a failure (e.g. no such file) is ignored."""
    try:
        os.unlink(path)
    except OSError:
        # Best-effort removal: nothing to report.
        return


def split_pathname(path):
    """Return (directory, document name) for the source file *path*.

    The path is made absolute first, and a trailing ".tex" is removed
    from the final component.
    """
    full = os.path.abspath(path)
    directory = os.path.dirname(full)
    name = os.path.basename(full)
    if name.endswith(".tex"):
        name = name[:-len(".tex")]
    return directory, name


# Matches a \documentclass line (with optional [...] options) and
# captures the class name.
_doctype_rx = re.compile(r"\\documentclass(?:\[[^]]*\])?{([a-zA-Z]*)}")
def get_doctype(path):
    """Return the document class named in the file *path*, or None.

    Only the first line that starts with a \\documentclass declaration
    is considered.
    """
    fp = open(path)
    found = None
    line = fp.readline()
    while line:
        m = _doctype_rx.match(line)
        if m is not None:
            found = m.group(1)
            break
        line = fp.readline()
    fp.close()
    return found


def main():
    """Parse the command line and build every named document.

    When no file is given, the single *.tex file in the current
    directory is used; zero or several candidates are reported as a
    usage error.
    """
    options = Options()
    try:
        args = options.parse(sys.argv[1:])
    except getopt.error:
        # Fetch the exception instance without version-specific syntax.
        error(options, sys.exc_info()[1])
    if not args:
        # attempt to locate single .tex file in current directory:
        args = glob.glob("*.tex")
        candidates = len(args)
        if candidates == 0:
            error(options, "No file to process.")
        elif candidates > 1:
            error(options, "Could not deduce which files should be processed.")
    #
    # parameters are processed, let's go!
    #
    for path in args:
        job = Job(options, path)
        job.build()


def l2hoption(fp, option, value):
    """Write the Perl assignment `$option = "value";` to *fp*.

    Nothing is written when *value* is false (None, 0, empty string).
    The value is converted with str() and escaped for a double-quoted
    Perl string.
    """
    if not value:
        return
    perl_value = string_to_perl(str(value))
    fp.write('$%s = "%s";\n' % (option, perl_value))


# Characters that need a backslash inside a Perl string; every other
# character is passed through unchanged.
_to_perl = {
    "@": "\\@",
    "$": "\\$",
    '"': '\\"',
    }

def string_to_perl(s):
    """Return *s* with the characters @, $ and " backslash-escaped.

    All other characters are copied as-is.  That includes NUL and
    characters beyond chr(255), which have no entry in the table and
    used to make the join fail.
    """
    return "".join([_to_perl.get(c, c) for c in s])


def check_for_bibtex(filename):
    """Return true if the file *filename* contains a \\bibdata{ entry.

    build_aux() calls this on the document's .aux file to decide
    whether bibtex has to be run.
    """
    fp = open(filename)
    try:
        # try/finally so the handle is released even if reading fails
        data = fp.read()
    finally:
        fp.close()
    return data.find(r"\bibdata{") >= 0

def uniqify_module_table(filename):
    """Remove the last line of *filename* if it repeats the line before it.

    Only the final pair of lines is compared.  The file is rewritten
    only when a line was actually removed.
    """
    fp = open(filename)
    try:
        lines = fp.readlines()
    finally:
        fp.close()
    if len(lines) > 1 and lines[-1] == lines[-2]:
        del lines[-1]
        fp = open(filename, "w")
        try:
            fp.writelines(lines)
        finally:
            fp.close()


def new_index(filename, label="genindex"):
    """Write an empty `theindex` environment labelled *label* to *filename*.

    An existing file is overwritten.
    """
    # NOTE(review): the first line written is a lone backslash, exactly
    # as the original raw-string template produced it; confirm whether
    # that is intended before changing it.
    template = ("\\\n"
                "\\begin{theindex}\n"
                "\\label{%s}\n"
                "\\end{theindex}\n")
    fp = open(filename, "w")
    fp.write(template % label)
    fp.close()


# Run the command-line driver only when executed as a script, not when
# this file is imported as a module.
if __name__ == "__main__":
    main()