#!/bin/csh -f
# lvrfy:  An HTML Link Verification utility
# version 1.6d
# 6 December 1995
# 
# By Preston Crow
# http://www.cs.dartmouth.edu/~crow/
# http://www.cs.dartmouth.edu/~crow/lvrfy.html
# Copyright (c) 1995
# Restriction on use:
#	Significant modifications must be made available, free of charge
#	or restriction, to Preston Crow.
#	May not be redistributed in a modified form without notifying
#	Preston Crow.
#
# Syntax:
# lvrfy startURL fromURL OKfile BADfile OFFSITEfile
# I use:
#	(date;lvrfy / X lvrfy.OK lvrfy.BAD lvrfy.OFF;date) |& tee lvrfy.ERR |& Mail -s 'lvrfy completed on '`hostname` `whoami` &
#
# Customizations (edit these for the local site):
# Host name of the server being checked; links written as
# "http://<SERVER>/..." have that prefix stripped and are treated as local.
set SERVER = "coos.dartmouth.edu"
# Filesystem directory prepended to ordinary (non-"/~user") URLs.
set SLASH = "/usr/local/etc/httpd/htdocs"
# Directory name inserted after "/~user" when mapping user URLs to files.
set PUBLIC = "public_html"
# List all valid index filenames, in order of preference
set INDEX = ( index.html )
# Temporary directory to use
set TMP = "/tmp"
# Maximum nesting level, to avoid filling up the process table
set MAXNEST = 6
# Aliases must be hard-coded in this version
#
# Known bugs:
#	* Doesn't handle tags in comments correctly.  It may fault on:
#	  <!-- <LI>foo -->, or otherwise get confused.
#	* Doesn't handle unclosed tags.
#	* May seg fault on non-text or other pathological input cases.
#	* May leave files in TMP when it doesn't complete successfully.
#	* Doesn't recognize aliased directories, so links to aliased
#	  files will be reported as bad.
#		--you can manually add aliasing below, if you can
#		  follow the `sed` syntax.
#	* Certain pathological file or directory names may confuse
#	  it, but these should be quite rare.
#		--I think I fixed most of these now.
#
# Warning:  This script isn't secure, and shouldn't be run as root.
# I'm not sure if it is possible for a carefully constructed pathological
# case to misdirect the script, causing unexpected or dangerous side effects.
#
# Argument check: a fifth argument is required and a seventh is not allowed.
# The optional sixth argument is the nesting level that lvrfy passes to
# itself in the command lines it generates.
if ( $7 != '' || $5 == '' ) then
	echo $0 $argv
	echo Usage:  'lvrfy startURL fromURL OKfile BADfile OFFSITEfile'
	sleep 60
	exit 1
endif

#
# Nesting level: use the sixth argument when given, otherwise start at 0.
# A process already at MAXNEST hands its children level 1, because their
# command lines are queued and replayed later by the level-0 process.
#
if ( $6 == '' ) then
	set NEST=0
else
	set NEST=$6
endif
if ( $NEST == $MAXNEST ) then
	set NEXTNEST=1
else
	@ NEXTNEST = $NEST + 1
endif

#
# Set variables
#
# INURL keeps the URL as given (for messages and for building child
# commands); PAGE starts as the same URL and is turned into a filename below.
set INURL="$1"
set PAGE="$INURL"
set OKFILE=$3
set BADFILE=$4
set OFFFILE=$5
# Make sure the OK file exists before it is searched.
touch $OKFILE

#
# Avoid pathological URLs
#
# A URL containing any of the listed shell/pattern metacharacters is logged
# to the BAD file and not processed any further.
# set | grep ^argv | sed -e 'sX^argv[ 	]*(XX;sX .*XX' 
echo "$PAGE" | grep '[][{}()$&*?\!;"'"'"'`]' >/dev/null
# grep exits 0 when at least one such character was found.
if ( $status == 0 ) then
	echo Link to unprocessable URL "$INURL" from "$2" >> $BADFILE
	exit 0
endif

#
# Convert URL to filename
#
# URLs starting with "/~" are user pages; everything else is looked up
# under SLASH.
if ( "tilde" == "`echo $PAGE | grep '^/~' >& /dev/null && echo tilde`" ) then
	# Insert /$PUBLIC after the "/~user" component and drop the leading
	# "/", leaving "~user/$PUBLIC/rest".
	set PAGE="`echo $PAGE | sed 'sX^/~[^/]*X&/'$PUBLIC'X' | cut -c2- `"
	# Echo that path unquoted and keep only output beginning with "/".
	# NOTE(review): this appears to rely on csh expanding "~user" to the
	# user's home directory, and on the error text for an unknown user
	# (captured via "|&") not starting with "/", so PAGE ends up empty
	# for non-existent users -- confirm against the csh in use.
	set PAGE=`echo $PAGE |& grep ^/`
	if ( "$PAGE" == "" ) then
		echo Link to non-existent user: "$INURL" from "$2" >> $BADFILE
		exit 0
	endif
else
	# Ordinary URL: prefix it with the SLASH directory.
	set PAGE=$SLASH''$PAGE
endif

#
# Deal with aliases
#
# Hard-coded filename rewrites, applied in this order:
#   $SLASH/cgi-bin/imagemap/...  ->  /dev/null
#   $SLASH/icons/                ->  /../icons/
#   $SLASH/cgi-bin/              ->  /../cgi-bin/
# SLASH is spliced into the sed expressions unescaped, so it must not
# contain the delimiter "X" or regex metacharacters with unintended meaning.
set PAGE=`echo $PAGE | sed -e 'sX^'$SLASH'/cgi-bin/imagemap/.*X/dev/nullX;sX^'$SLASH'/icons/X/../icons/X;sX^'$SLASH'/cgi-bin/X/../cgi-bin/X'`

#
# Deal with directory indices.
#
# For a directory, pick the first INDEX name that exists inside it.  If
# none exists, PAGE is left naming the last candidate tried, and USEINDEX
# records that an index page was being looked for.
if ( -d $PAGE ) then
	set USEINDEX
	# Compensate for URL's missing the trailing /
	set INURL=`echo $INURL/|sed sX//X/Xg`
	foreach IDXNAME ( $INDEX )
		set PAGE2=$PAGE/$IDXNAME
		if ( -e "$PAGE2" ) break
	end
	set PAGE=`echo $PAGE2|sed sX//X/Xg`
endif
# CDIR is the URL with everything after its last "/" removed; relative
# links found in the page are resolved against it.
set CDIR=`echo $INURL|sed 'sX/[^/]*$X/Xg'`

#
# Compensate for symbolic links in the file
#
# Split PAGE into directory (:h) and file name (:t), ask pwd what the
# directory is called after cd'ing into it, and rebuild PAGE from that.
# If the command substitution yields nothing, PAGE is left unchanged.
set PAGEDIR=$PAGE:h
set PAGE2=`cd $PAGEDIR>&/dev/null;pwd>&/dev/null&&pwd||echo $PAGEDIR`
if ( $PAGE2 != '' ) then
	set PAGE=$PAGE2/$PAGE:t
endif

#
# What is the status of this file? (processed, non-existent?)
#
# Each OK-file line has the form "<file> <referring URL>".  The file name in
# the first field is compared for exact, literal equality.  A regular-
# expression prefix match (grep ^"$PAGE") would wrongly treat /a/b.html as
# already processed once /a/b.html.old -- or any other name that merely
# begins the same way -- had been logged, and would let "." in the name
# match any character.
cut -d' ' -f1 $OKFILE | grep -F -x -e "$PAGE" >&/dev/null && exit 0
if ( ! -e "$PAGE") then
	# USEINDEX is set only when the URL named a directory, so a missing
	# file here means none of the INDEX names exists in that directory.
	if ($?USEINDEX) then
		echo Link to server-generated index page "$INURL" from "$2" >> $BADFILE
		exit 0
	endif
	echo Link to non-existent page "$INURL" from "$2" >> $BADFILE
	exit 0
else
	if ( -r "$PAGE" ) then
		# Record the file together with the URL that linked to it.
		echo "$PAGE" "$2" >> $OKFILE
	else
		echo Link to unreadable page "$INURL" from "$2" >> $BADFILE
		exit 0
	endif
endif

#
# If filename doesn't end in "html," skip it.
#
# The file has already been logged as OK above; it just is not scanned for
# further links.
echo $PAGE | grep 'html$' >&/dev/null || exit 0


#
# OK, we have a new file to process here.  Find the links and recurse
#
# Pipeline overview:
#   1. The first sed normalizes whitespace, joins tags that span several
#      lines, and puts each tag on a line of its own in the form "< ... >".
#      Tags whose first character is "!" are blanked.
#   2. The second sed reduces tags carrying HREF= or SRC= to the bare link,
#      drops "#..." and "?..." suffixes, and strips a leading
#      "http://$SERVER".  Links containing ":" are written to
#      $TMP/lvrfy.$OFFFILE.2.  Relative links are prefixed with CDIR, and
#      "/x/../", "/./" and "//" are collapsed.  Only links without a ":"
#      are printed.
#   3. The last sed escapes apostrophes and exclamation marks, and awk
#      turns each remaining link into an lvrfy command line (link, this
#      page's URL, the three log files, the next nesting level), saved in
#      $TMP/lvrfy.$NEST.
# The CDIR substitution uses ";" as its delimiter.  URLs containing ";" are
# rejected earlier in the script, whereas "+" is not: with "+" as the
# delimiter a URL such as /c++/ ended the s command early, sed rejected the
# whole script, and none of the page's links were checked.
#
sed -e '\
:ok\
sX		*X Xg\
sX\nX Xg\
sX  *X Xg\
sX[ ]*=[ ]*X=Xg\
sX [^<>HhSs ][^<> ]*X Xg\
/<[^>]*$/N\
/\n/b ok\
sX\nX Xg\
sX[^<>]*<X\\
< Xg\
sX>[^<>]*X >\\
Xg\
sX  *X Xg\
sX< \!.*XXg\
:end' $PAGE | sed -n -e '\
sX<.*[Hh][Rr][Ee][Ff]=XHREF=Xg\
sX<.*[Ss][Rr][Cc]=XHREF=Xg\
sX .*XXg\
sX"XXg\
sX#.*$XX\
sX?.*$XX\
sX[Hh][Tt][Tt][Pp]:Xhttp:Xg\
sXhttp://'$SERVER'XXg\
/http:[/][^/]/sXhttp:XXg\
/^HREF=./b next\
b end\
:next\
sX^HREF=XXg\
sX:X:Xgw '$TMP/lvrfy.$OFFFILE.2'\
/^[^/]/s;^;'$CDIR';g\
:ok\
sX/[^/]*/\.\./X/Xg\
sX/\./X/Xg\
sX//X/Xg\
t ok\
/^[^:]*$/p\
:end' |sed -e sX\'X\'\"\'\"\'Xg';sX\!X\\\!Xg' | awk '{printf("%s '"'"'%s'"'"' '"'"'%s'"'"' %s %s %s %s\n","'$0'",$1,"'$INURL'","'$OKFILE'","'$BADFILE'","'$OFFFILE'","'$NEXTNEST'")}' >$TMP/lvrfy.$NEST
# Last sed to escape embedded apostrophes and exclamation marks to avoid conflict.

# Append each collected ":"-containing link, paired with the file it was
# found in, to the off-site list, then remove the temporary file.
awk '{print $1, "'$PAGE'"}' $TMP/lvrfy.$OFFFILE.2 >>$OFFFILE
rm  $TMP/lvrfy.$OFFFILE.2
#
# Now recurse, or save the work for later, if we're at the maximum depth.
#
if ( $NEST == $MAXNEST ) then
	# Too deep: queue the generated command lines instead of running them.
	cat $TMP/lvrfy.$NEST >> $TMP/lvrfy.work
	rm $TMP/lvrfy.$NEST
else if ( $NEST > 0 ) then
	# Intermediate level: have the command file remove itself as its last
	# step, then replace this process with a csh reading that file.
	echo exec rm $TMP/lvrfy.$NEST >> $TMP/lvrfy.$NEST
	exec csh -f <$TMP/lvrfy.$NEST
else
	# Top level: run the command file in a child csh so that control
	# returns here afterwards.
	echo exec rm $TMP/lvrfy.$NEST >> $TMP/lvrfy.$NEST
	csh -f <$TMP/lvrfy.0
#
#	Now do any work that we couldn't do at a deeper depth.
#	Each pass may queue more work, so repeat until no work file is left.
#
	while ( -f $TMP/lvrfy.work )
		mv $TMP/lvrfy.work $TMP/lvrfy.0
		echo rm $TMP/lvrfy.0 >> $TMP/lvrfy.0
		csh -f <$TMP/lvrfy.0
	end
endif
