#!python

"""$URL: svn://svn/repos/trunk/sitereaper/bin/sitereaper $
$Id: sitereaper 20991 2003-03-06 18:17:09Z dbinger $

This is the basic script for running the sitereaper.
Run it with --help for usage information.
"""

import sys, os
from timeoutsocket import setDefaultSocketTimeout
from optik import OptionParser, make_option
import sitereaper.reaper


def set_target (option, opt, value, parser):
    """Option callback for -t/--target: expand HOST into the options it implies.

    'value' is the host name given on the command line.  The callback
    acts as if the user had typed
    "-lhttp://HOST/ -oHOST -dHOST http://HOST/":

      * "http://HOST/" is appended to the parser's leftover arguments,
        so it becomes a start URL;
      * the same URL is appended to the list of local URL prefixes;
      * HOST becomes the report output file name;
      * HOST becomes the supplemental dictionary name.

    'option' and 'opt' are required by the callback signature but are
    not used here.
    """
    options = parser.values
    url = "http://%s/" % value
    parser.largs.append(url)
    options.local_urls.append(url)
    options.output_file = value
    # The -d/--dictionary option has no explicit dest, so its value is
    # stored (and later read by main()) as 'dictionary', not 'dict'.
    options.dictionary = value


def main ():
    """Parse the command line, configure a Reaper and traverse the site.

    Start URLs come from the positional arguments; -t/--target also
    contributes one through its callback.  If there are none, a usage
    error is reported through parser.error().

    The report is written to the -o/--output file when one is given,
    otherwise to stdout.  A report file opened here is always closed
    before returning, even if the traversal raises.
    """
    option_list = [
        make_option("-v", "--verbose", type="int", dest='verbosity',
                    help="Specify verbosity level"),
        make_option("-l", "--local",
                    action="append",
                    dest='local_urls', default=[],
                    metavar="URL_PREFIX",
                    help=("Scrutinize all URLs starting with URL_PREFIX.  "
                          "Repeat to name multiple local URLs.")),
        make_option("-x", "--exclude",
                    action="append",
                    dest='excluded_urls', default=[],
                    metavar="URL_PREFIX",
                    help=("Exclude URLs starting with URL_PREFIX from "
                          "scrutiny.  Repeat to exclude multiple URLs.")),
        make_option("-b", "--deprecated",
                    action="append",
                    dest='deprecated_urls', default=[],
                    help=("A list of links that should be reported, "
                          "but not excluded.")),
        make_option("-g", "--ignore-urls", action="append",
                    dest='ignore_url_schemes', default=[],
                    metavar='SCHEME',
                    help="ignore URLs with scheme SCHEME (eg. 'ftp', 'news')"),
        make_option("--check-external", action="store_true",
                    dest='check_external', default=1,
                    help="check external URLs with a HEAD request (default)"),
        make_option("--no-check-external", action="store_false",
                    dest='check_external',
                    help="don't check external URLs"),
        make_option("-T", "--timeout", type="int", default=60,
                    help="set socket timeout in seconds (default: 60)"),
        make_option("-e", "--max-errors", type="int",
                    metavar="N",
                    help=("stop after N server errors (5xx response code) "
                          "(default: no limit)")),

        # The three spellcheck options share one dest: a list of URL
        # prefixes to skip.  NOTE(review): [''] presumably disables
        # checking because an empty prefix matches every URL -- confirm
        # against Reaper.set_spellcheck_skiplist.
        make_option("--spellcheck", action="store_const",
                    dest="skip_spellcheck", const=[],
                    help=("Spellcheck the non-tag text of every local "
                          "HTML document (default)")),
        make_option("--no-spellcheck", action="store_const",
                    dest="skip_spellcheck", const=[''],
                    help="Disable spellchecking completely"),
        make_option("--skip-spellcheck", action="append",
                    dest="skip_spellcheck", metavar="URL_PREFIX",
                    help="Skip spellchecking documents under URL_PREFIX"),

        # Same scheme as the spellcheck options, for HTML validation.
        make_option("--validate", action="store_const",
                    dest="skip_validate", const=[],
                    help="Validate the HTML of every local document"),
        make_option("--no-validate", action="store_const",
                    dest="skip_validate", const=[''],
                    help="Disable HTML validation"),
        make_option("--skip-validate", action="append",
                    dest="skip_validate", metavar="URL_PREFIX",
                    help="Skip validating documents under URL_PREFIX"),

        make_option("-o", "--output", dest="output_file",
                    metavar="FILE",
                    help=("Write report to FILE (default: stdout)")),
        make_option("-L", "--link-report",
                    metavar="FILE",
                    help="Write link report to FILE"),
        make_option("-t", "--target",
                    action="callback", callback=set_target,
                    type="string", metavar="HOST",
                    help=("Scrutinize web site on HOST.  Equivalent to "
                          "-lhttp://HOST/ -oHOST -dHOST http://HOST/")),
        make_option("-d", "--dictionary", default="default",
                    metavar="DICT",
                    help=("Use DICT (relative to sitereaper.dictionary "
                          "directory) as a supplemental dictionary "
                          "with ispell")),
        ]

    usage = """\
usage: %prog [options] start_url ...
       %prog [-t TARGET] [options]"""
    parser = OptionParser(usage, option_list)
    (options, args) = parser.parse_args()

    # User must either supply starting URLs in args, or target host(s)
    # with -t/--target; -t pushes a start_URL onto args.
    if not args:
        parser.error("must supply either a start_url or -t (--target) option")
    start_urls = args

    # A relative dictionary name is looked up in the "dictionaries"
    # directory next to the installed sitereaper package.
    base_dir = os.path.dirname(sitereaper.__file__)
    dict_dir = os.path.join(base_dir, "dictionaries")
    dict_path = options.dictionary
    if not os.path.isabs(dict_path):
        dict_path = os.path.join(dict_dir, dict_path)

    setDefaultSocketTimeout(options.timeout)

    if options.output_file:
        out = open(options.output_file, "w")
    else:
        out = sys.stdout

    try:
        reaper = sitereaper.reaper.Reaper(options.local_urls,
                                          options.excluded_urls,
                                          options.deprecated_urls,
                                          options.ignore_url_schemes,
                                          out, dict_path,
                                          options.verbosity)
        reaper.set_max_errors(options.max_errors)

        if not options.check_external:
            reaper.disable_external_checks()
        reaper.set_spellcheck_skiplist(options.skip_spellcheck)
        reaper.set_validation_skiplist(options.skip_validate)

        reaper.traverse(start_urls)

        if options.link_report:
            reaper.report_links(options.link_report)
    finally:
        # Close (and thereby flush) a report file we opened ourselves;
        # never close the process's stdout.
        if out is not sys.stdout:
            out.close()

# Script entry point: run the reaper only when this file is executed
# directly, not when it is imported.
if __name__ == '__main__':
    try:
        main()
    except KeyboardInterrupt:
        # Turn Ctrl-C into a clean exit: the string argument is printed
        # to stderr and the process exits with a failure status, without
        # a traceback.
        raise SystemExit("interrupted")
