#!/usr/bin/perl ## ############################################################# ## !!CAUTION!! ## !!USUALLY RUNS A LONG TIME !! ## ############################################################# ## ## Check links in html pages, i.e. ## ## * every occurence of HREF="..." ## * every occurence of SRC="..." ## ## If the subsequent string starts with ## "http:" --> URL checked ## "file:" --> URL checked ## "./..." --> existence checked ## "mailto:" --> [ignore] ## "ftp:" --> [ignore] ## ## ## The URLs that caused failures are collected in a string $errors, ## that is printed out at the end. ## In addition, a HTML error file FILENAME-errors.html is created. ## ## If a optinal argument "-n" is given, there is no error file written. ## ## ---------------------------------------------------------------------- ## Known problems: ## * also checks links that exists only inside quotes ## * stops, if password-protected pages are tried to be accessed ## [but you can enter URL/password via command line and this works fine] ## * forwards pages to viewers, like ghostview, xv, etc. ## though this is not 'dangerous', and you can proceed by just terminating ## the viewer, this slows down the check and I haven't found a way jet ## to switch it of ## * Sometimes asks whether or not to retrieve something (and you have ## to type in "yes" or "no" RETURN) ## * strips off the #MARKER from "URL#MARKER" links; ## ---------------------------------------------------------------------- ## Relies on Emacs with w3 and a function ## do-w3-die (URL) ## which: ## 1. tries to reach the given URL ## 2. quits emacs (7), if it was reached alright ## 3. quits emacs with error (255), if there were problems ## ## ## Emacs function: ## ====================================================================== ## * The following emacs function you have to include in your .emacs file ## (or another place where it is loaded automatically when emacs starts) ## * Exit codes of do-w3-die call: ## 7 : URL was correctly loaded ## 0 : user-termination (time-out) ## other: error in loading URL ## ====================================================================== ## ##(defun do-w3-die (url) ## (interactive) ## (w3-fetch url) ## (setq case-fold-search t) ## (goto-char (point-min)) ## ## (if (search-forward-regexp "\\(^404[ \n\t\r]+not[ \n\t\r]+found\\|\*error[ \n\t\r]+accessing[ \n\t\r]+\\|----[ \n\t\r]+error[ \n\t\r]+was:[ \n\t\r]+----[\n ]*unknown[ \n\t\r]+host[ \n\t\r]+\\)" (point-max) t) ## (progn (message "*die with error*") ## (error "FAILURE")) ## (progn (message "** OK Quit") ## (w3-document-information) ## (kill-emacs 7)))) ## ====================================================================== ## ## Synopsis: ## Validating HTML documents ## ############################################################# ## @TABLE OF CONTENTS: [TOCD: 19:32 18 Jan 2004] ## ## [0.1] String Manipulation ## [0.2] Disclamier ## [0.3] USAGE ## [1] MAIN ROUTINE ## ########################################################## ## @FILE: linkcheck.perl ## @PLACE: Gaia Homestation ## @FORMAT: perl 5.0 script ## @AUTHOR: M. Oliver M'o'ller ## @BEGUN: Fri Dec 1 11:03:53 2000 ## @VERSION: Sun Jan 18 20:10:12 2004 ## ############################################################# $create_error_file=true; ## ############################################################# $Revision = "Rev" ; $Date = "Date" ; $empty = "" ; $VERSION_NUMBER="V 0.9delta [$Revision: 1.13 $empty]"; $VERSION_DATE="[$Date: 2004/02/01 19:36:59 $empty]" ; $VERSION="$VERSION_NUMBER $VERSION_DATE" ; $filearg=0; ## -- COMMAND LINE OPTIONS ------------------------------------------------- if ( ( $ARGV[0] =~ /^-n$/ ) || ( $ARGV[0] =~ /^--no-error-file$/ ) ){ $filearg=1; $create_error_file=0; } if( ( $ARGV[$filearg] =~ /^$/ ) || ( $ARGV[$filearg] =~ /^-h$/ ) || ( $ARGV[$filearg] =~ /^-help$/ ) || ( $ARGV[$filearg] =~ /^--help$/ )){ disclaimer(); usage(); exit 0; } if( ( $ARGV[$filearg] =~ /^-e$/ ) || ( $ARGV[$filearg] =~ /^-help-emacs$/ ) || ( $ARGV[$filearg] =~ /^--help-emacs$/ ) ){ emacs_code(); exit 0; } if( ( $ARGV[$filearg] =~ /^-v$/ ) || ( $ARGV[$filearg] =~ /^-version$/ ) || ( $ARGV[$filearg] =~ /^--version$/ ) ){ (print "$VERSION\n") ? 'ok' : die "can't print $!"; exit 0; } ## ------------------------------------------------------------------------- $old = "$ARGV[$filearg]"; $new = "$ARGV[$filearg]-errors.html"; $bak = "$ARGV[$filearg].bak"; my $current_date = `date`; ## -- AUX FUNCTIONS -------------------------------------------------------- # success-code: 7 sub check_file { (print "---------> checking: $file\n") ? 'ok' : die "can't print $!"; $exists = -e $file; # true if something named myfile.dat exists ##$exists = -f $file; # true if myfile.dat is a plain file if ( $exists ){ return 7; } else { $url = "[file] $file"; return 255; } }## --------------------------------------------------- sub check_url { open PIPEHANDLE, "emacs -q -batch --no-init-file --eval '(progn (setq load-path (cons \"~/emacs/\" (cons \"~/emacs/w3-4.0pre.44//share/emacs/site-lisp\" load-path)))(load-library \"~/emacs/resolve-version-conflict\")(load-library \"~/emacs/my-personal-settings\")(load-library \"~/emacs/my-lisp-macros\")(load-library \"~/emacs/my-web-access\") (do-w3-die \"$url\"))' |" or die "Cannot open pipe to emacs: $!"; my $output; while () { $output .= $_; } close PIPEHANDLE or die "Cannot close pipe to emacs: $!"; $exit_value = $? >> 8; return $exit_value; } sub check_url_with_perl { ## *DEFUNCT* ## use LWP::Simple; ## $page = get "http://www.perlfaq.com"; } sub check_url_old { (print "---------> checking: $url\n") ? 'ok' : die "can't print $!"; @args = ("emacs", "-batch", "-q"," --no-init-file", "--eval", "(progn (setq load-path (append (list ;;\"~/emacs/w3-4.0pre.44//share/emacs/site-lisp\" \"~/emacs\" ) load-path)) (load-library \"~/emacs/resolve-version-conflict\") (load-library \"~/emacs/my-personal-settings\") (load-library \"~/emacs/my-lisp-macros\") (load-library \"~/emacs/my-web-access\") (do-w3-die \"$url\") )", ## ------------------------------------- "2>/dev/null", "1>/dev/null" ); system(@args ); ## ---------------------------------------------------- $exit_value = $? >> 8; $signal_num = $? & 127; $dumped_core = $? & 128; return $exit_value; } ## -------------------------------------------------- # success-code: 7 sub check_file_mark { $result = 0; (print "---------> checking: $file for mark $mark\n") ? 'ok' : die "can't print $!"; $exists = -e $file; # true if something named myfile.dat exists ##$exists = -f $file; # true if myfile.dat is a plain file if ( $exists ){ open PIPEHANDLE, "cat $file |sed s/\\\"/#/g |grep '$mark' |" or die "Cannot open pipe to grep: $!"; my $output; $result = 255; while () { if( $_ =~ /[Nn][Aa][Mm][Ee][ ]*=[ ]*$mark\#/ ){ (print " FOUND def: $_") ? 'ok' : die "can't print $!"; $result = 7; } $output .= $_; # (print "---------> read $output") ? 'ok' : die "can't print $!"; } close PIPEHANDLE or die "Cannot close pipe to grep: $!"; $exit_value = $? >> 8; if($result == 255){ $errors .= "$file$mark [mark not found] (line $linenr)\n"; } } else { $url = "[file] $file [$mark]"; return 255; } return $result; }## --------------------------------------------------- sub proccess_exit_value { if ( $exit_value == 7 ){ (print "OK\n") ? 'ok' : die "can't print $!"; } else { (print "FAILURE: $exit_value\n") ? 'ok' : die "can't print $!"; $errors .= "$url (line $linenr)\n"; if ( $create_error_file ){ (print ERROR "
  • $url
  • \n") ? 'ok' : die "can't write to $new: $!"; } } ## ---------------------------------------------------- (print "---------\n") ? 'ok' : die "can't print $!"; } ## ############################################### ## [0.1] String Manipulation ## ############################################### ## ############################################### ## [0.2] Disclamier ## ############################################### sub disclaimer { (print "********************************************************\n") ? 'ok' : die "can't print $!"; (print "*** This is linkcheck.perl \n") ? 'ok' : die "can't print $!"; (print "*** $VERSION\n") ? 'ok' : die "can't print $!"; (print "*** M. Oliver Möller \n") ? 'ok' : die "can't print $!"; (print "********************************************************\n") ? 'ok' : die "can't print $!"; } ## ############################################### ## [0.3] USAGE ## ############################################### sub usage { (print "usage: [ echo \"yes\" | ] linkcheck.perl [OPTIONS] HTMLFILE options: --no-error-file -n do not write an HTMLFILE-errors.html --help -h print this help and exit --help-emacs -e print sample code to include in ~/.emacs file --version -v print version and exit \n") ? 'ok' : die "can't print $!"; (print "Check links in html pages, i.e. * every occurence of HREF=\"...\" * every occurence of SRC=\"...\" If the subsequent string starts with \"http:\" --> URL checked \"file:\" --> existence checked \"./\" --> existence checked \"mailto:\" --> [ignore] \"ftp:\" --> [ignore] Unless option -n (--no-error-file) is set, an output html page FILE-errors.html is generated that contains all links that were not checked successfully. You should browse this generated file to see which links actually cause problems. Known problems: * also checks links that exists only inside quotes * stops, if password-protected pages are tried to be accessed [but you can enter URL/password via command line and this works fine] * forwards pages to viewers, like ghostview, xv, etc. though this is not 'dangerous', and you can proceed by just terminating the viewer, this slows down the check and I haven't found a way jet to switch it of * Sometimes asks whether or not to retrieve something (and you have to type in \"yes\" or \"no\" RETURN) * strips off the #MARKER from \"URL#MARKER\" links; Relies on Emacs with w3 and a function do-w3-die (URL) which: 1. tries to reach the given URL 2. quits emacs (7), if it was reached alright 3. quits emacs with error (255), if there were problems see: --help-emacs ") ? 'ok' : die "can't print $!"; } sub emacs_code { (print ";; -- INCLUDE e.g. IN YOUR ~/.emacs FILE ------------------------------\n") ? 'ok' : die "can't print $!"; (print ";; The function do-w3-die is needed by the program linkcheck.perl\n") ? 'ok' : die "can't print $!"; (print ";; It might have to be adjusted to work correctly with your w3 version.\n") ? 'ok' : die "can't print $!"; (print "(require 'w3)\n") ? 'ok' : die "can't print $!"; (print "(defun do-w3-die (url)\n") ? 'ok' : die "can't print $!"; (print " \"Try to fetch a site. \n") ? 'ok' : die "can't print $!"; (print "If the site does not exists, emacs dies with exit code 7.\"\n") ? 'ok' : die "can't print $!"; (print " (interactive)\n") ? 'ok' : die "can't print $!"; (print " (let ((displayed-lines-as-message 3) ;; set to nil, if undesired\n") ? 'ok' : die "can't print $!"; (print " (error-pattern (format \"\\\\(%s\\\\|%s\\\\|%s\\\\|%s\\\\)\"\n") ? 'ok' : die "can't print $!"; (print " \"^404[ \\n\\t\\r]+Not[ \\n\\t\\r]+Found\"\n") ? 'ok' : die "can't print $!"; (print " \"Error[ \\n\\t\\r]+accessing[ \\n\\t\\r]+\"\n") ? 'ok' : die "can't print $!"; (print " \"^\*Not[ \\t\\r\\n]+Found\\*\"\n") ? 'ok' : die "can't print $!"; (print " \"----[ \\n\\t\\r]+error[ \\n\\t\\r]+was:[ \\n\\t\\r]+----[\\n ]*unknown[ \\n\\t\\r]+host[ \\n\\t\\r]+\")))\n") ? 'ok' : die "can't print $!"; (print " (w3-fetch url)\n") ? 'ok' : die "can't print $!"; (print " (sleep-for 1)\n") ? 'ok' : die "can't print $!"; (print " (setq case-fold-search t)\n") ? 'ok' : die "can't print $!"; (print " (goto-char (point-min))\n") ? 'ok' : die "can't print $!"; (print " \n") ? 'ok' : die "can't print $!"; (print " (if (search-forward-regexp error-pattern (point-max) t)\n") ? 'ok' : die "can't print $!"; (print " (progn (message \"*die with error*\")\n") ? 'ok' : die "can't print $!"; (print " (message \"FAILURE\")\n") ? 'ok' : die "can't print $!"; (print " (kill-emacs 1)) ;; emacs -batch dies on errors\n") ? 'ok' : die "can't print $!"; (print " (progn (message \"** OK Quit\")\n") ? 'ok' : die "can't print $!"; (print " (kill-emacs 7)))))\n") ? 'ok' : die "can't print $!"; (print ";;; -- END -----------------------------------------------------------\n") ? 'ok' : die "can't print $!"; } ## ################################################################### ## [1] MAIN ROUTINE ## ################################################################### disclaimer(); open(OLD, "< $old") ? 'ok' : die "can't open $old: $!"; $MAX=13; @parts={1,2,3,4,5,6,7,8,9,10,11,12,13} ; $comment=false; $anchor=false; $hit=0; $i; $linenr=1; $errors=""; $unknowns=""; (print "** Checking links in $old...\n") ? 'ok' : die "can't print $!"; if( $create_error_file ){ (print "** writing error file: $new\n") ? 'ok' : die "can't print $!"; open(ERROR, "> $new") ? 'ok' : die "can't open $new: $!"; (print ERROR " \nErrornous Links from \"$old\" \n\n

    Errornous Links from \"$old\"

    created on $current_date

      ") ? 'ok' : die "can't write to $new: $!"; } while ($line = ) { if( $line =~ /^.*([hH][rR][eE][fF]=|[sS][rR][cC]=).*/ ){ @parts = split /([hH][rR][eE][fF]=|[sS][rR][cC]=|\")/, $line ; $hit=$MAX; for($i=1; $i<$MAX; $i++){ if( @parts[$i] =~ /([hH][rR][eE][fF]=|[sS][rR][cC]=)/ ){ $hit=$i; } if( $i == $hit+3 ){ ## -- found something to check ------------- if( @parts[$i] =~ /^[mM][aA][iI][lL][tT][oO]:/ ){ (print "* ignore: @parts[$i]\n") ? 'ok' : die "can't print $!"; } elsif( @parts[$i] =~ /^[fF][iI][lL][eE]:/ ){ $url="@parts[$i]"; $exit_value = &check_url_old($url); &proccess_exit_value($exit_value); } elsif( @parts[$i] =~ /^[fF][tT][pP]:/ ){ (print "* ignore: @parts[$i]\n") ? 'ok' : die "can't print $!"; } elsif( @parts[$i] =~ /^[hH][tT][tT][pP]:/ ){ $url="@parts[$i]"; $exit_value = &check_url_old($url); &proccess_exit_value($exit_value); } elsif( @parts[$i] =~ /^\.\// ){ $file="@parts[$i]"; $exit_value = &check_file($file); &proccess_exit_value($exit_value); } elsif( @parts[$i] =~ /^\.\.\// ){ $file="./@parts[$i]"; $exit_value = &check_file($file); &proccess_exit_value($exit_value); } elsif( @parts[$i] =~ /^#/ ){ $file=$old; $mark="@parts[$i]"; $exit_value = &check_file_mark($file,$mark); } else { (print "* UNKNOWN: @parts[$i]\n") ? 'ok' : die "can't print $!"; $unknowns .= "@parts[$i]\t(line $linenr)\n"; } } } } $linenr=$linenr+1; } if( $create_error_file ){ (print ERROR "

    linkcheck.perl Version $VERSION

    ") ? 'ok' : die "can't write to $new: $!"; } ## -- show unknowns ------------------------------------- (print "======================================================================\n") ? 'ok' : die "can't print $!"; (print "Unknown URLs:\n") ? 'ok' : die "can't print $!"; (print "======================================================================\n") ? 'ok' : die "can't print $!"; (print "$unknowns\n") ? 'ok' : die "can't print $!"; ## -- show errors ------------------------------------- (print "======================================================================\n") ? 'ok' : die "can't print $!"; (print "Errornous URLs:\n") ? 'ok' : die "can't print $!"; (print "======================================================================\n") ? 'ok' : die "can't print $!"; (print "$errors\n") ? 'ok' : die "can't print $!"; if( $create_error_file ){ (print "\n** erronous links written to file: $new\n") ? 'ok' : die "can't write: $!"; close(ERROR); } #exit 0; ## --- OK ------------------------------ ### Local Variables: *** ### mode: perl *** ### eval: (perl-mode) *** ### comment-column:0 *** ### comment-start: "### " *** ### comment-end:"***" *** ### End: ***