#!/usr/bin/env python
# Copyright (c) 2011, the Dart project authors. Please see the AUTHORS file
# for details. All rights reserved. Use of this source code is governed by a
# BSD-style license that can be found in the LICENSE file.

'''
This script finds all HTML pages in a folder and downloads all images, replacing
the urls with local ones.
'''
import os
import sys
import optparse
import subprocess
import multiprocessing
from os.path import abspath, basename, dirname, join

SWARM_PATH = dirname(abspath(__file__))
CLIENT_PATH = dirname(dirname(SWARM_PATH))
CLIENT_TOOLS_PATH = join(CLIENT_PATH, 'tools')

# Add the client tools directory so we can find htmlconverter.py.
sys.path.append(CLIENT_TOOLS_PATH)
import htmlconverter
converter = CLIENT_TOOLS_PATH + '/htmlconverter.py'


def _initWorker(opts):
    """Pool initializer: publish the parsed flags as this process's `options`.

    convertImgs() reads the module-level `options`, which main() only assigns
    in the parent process. Worker processes that do not inherit the parent's
    memory (the "spawn" start method) would otherwise never see it.
    """
    global options
    options = opts


# This has to be a top level function to use with multiprocessing
def convertImgs(infile):
    """Convert a single HTML file in place and report the outcome on stdout.

    Calls htmlconverter.convertForOffline with `infile` as both of its
    positional arguments (presumably input and output path -- confirm against
    htmlconverter), forwarding the `verbose` and `inline_images` flags from the
    module-level `options`.

    Args:
        infile: path of the HTML file to convert.

    Returns:
        None. Any exception is caught and printed rather than raised, so one
        failing file does not abort the rest of the pool's work.
    """
    try:
        htmlconverter.convertForOffline(
            infile, infile,
            verbose=options.verbose,
            encode_images=options.inline_images)
        print('Converted ' + infile)
    except BaseException as e:
        # Deliberately broad (best effort per file): report and carry on.
        print('Caught error: %s' % e)


def Flags():
    """ Constructs a parser for extracting flags from the command line. """
    parser = optparse.OptionParser()
    parser.add_option(
        "--inline_images",
        help=("Encode img payloads as data:// URLs rather than local files."),
        default=False,
        action='store_true')
    parser.add_option(
        "--verbose",
        help="Print verbose output",
        default=False,
        action="store_true")
    return parser


def main():
    """Convert every `.html` file under the directory given on the command line.

    Parses the flags, walks the directory tree rooted at the first positional
    argument, and converts each file whose name ends in `.html` using a pool of
    worker processes.

    Returns:
        1 if no directory was given or the first argument contains 'help';
        otherwise None (which sys.exit treats as success).
    """
    global options
    parser = Flags()
    options, args = parser.parse_args()
    print("args: %s" % args)
    if len(args) < 1 or 'help' in args[0]:
        print('Usage: %s DIRECTORY' % basename(sys.argv[0]))
        return 1

    # Named `search_dir` so the os.path.dirname import is not shadowed.
    search_dir = args[0]
    print('Searching directory ' + search_dir)

    files = []
    for root, dirs, fnames in os.walk(search_dir):
        for fname in fnames:
            if fname.endswith('.html'):
                files.append(join(root, fname))

    count = 4 * multiprocessing.cpu_count()
    # Hand the flags to every worker explicitly instead of relying on the
    # workers inheriting the parent's `options` global.
    pool = multiprocessing.Pool(
        processes=count, initializer=_initWorker, initargs=(options,))
    try:
        # Note: need a timeout to get keyboard interrupt due to a Python bug
        pool.map_async(convertImgs, files).get(3600)  # one hour
    except BaseException:
        # Timeout, Ctrl-C or any other failure: stop the workers immediately.
        pool.terminate()
        raise
    else:
        pool.close()
    finally:
        pool.join()


if __name__ == '__main__':
    # Propagate main()'s return value as the process exit status.
    sys.exit(main())