#!/v/guest/sw/bin/perl
require 5.002;
###############################################################################
# #
# Program to traverse a tree of HTML pages to collect their URLs and build #
# a textual representation in form of a tree (a HTML page with hyperlinks) #
# #
###############################################################################
# #
# Version 1.0 - Written 31.08.96 by Steffen Beyer #
# Version 1.1 - Written 01.09.96 by Steffen Beyer #
# Version 1.2 - Written 04.09.96 by Steffen Beyer #
# Version 2.0 - Written 10.09.96 by Steffen Beyer #
# Version 2.1 - Written 16.10.96 by Steffen Beyer #
# Version 2.2 - Written 15.05.97 by Steffen Beyer #
# #
###############################################################################
# #
# Copyright (c) 1996 by Steffen Beyer. #
# All rights reserved. #
# #
###############################################################################
# #
# This program is free software; you can redistribute it and/or #
# modify it under the same terms as Perl itself. #
# #
###############################################################################
# ============================ Internal constants: ============================
umask(0022);
$self = $0;
$self =~ s!^.*/!!;
$VERSION = '2.2';
$version = "version $VERSION";
$separator = ('-' x 78) . "\n";
# Definition of standard (default) HTML file names and extensions:
$default = 'index';
%html_ext = ('scgi' => 4, 'cgi' => 3, 'shtml' => 2, 'html' => 1, 'htm' => 1);
$map_ext = 'map';
# ======================= User configurable constants: ========================
# User under which CGI programs are run on your host (usually "nobody"):
$nobody = 'nobody';
# The pattern that uniquely identifies this host in URLs:
$host_pattern = '\bwww(?:\.|1\.|2\.|\.de\.|\.ch\.)engelschall\.com\b|\ben3\b|\ben5\b'; # en3/en5
#$host_pattern = '\bsww1\b|\bsunbi1\b|\bsun\b'; # sunbi1
# The physical path of the directory the HTTP server uses as its root
# directory for HTML pages (for example, "/usr/local/etc/httpd/htdocs/"):
$html_root = '/';
# The physical path of the directory the HTTP server uses as its root
# directory for CGI scripts (i.e., your "cgi-bin" directory):
$cgi_root = '/';
# (I.e., if "/cgi-bin/..." corresponds to "/usr/local/etc/httpd/cgi-bin/...",
# set "$cgi_root" equal to "/usr/local/etc/httpd/")
# Define here where the recursive descent should start (physical path):
# (whatever subtree you want)
$tree_root = '/u/sb/.www/';
# Where to put this script's log file (physical path):
$logfile = "/u/sb/.www/sitemap/$self.log";
# Where to put this script's output file (physical path):
$treefile = '/u/sb/.www/sitemap/index.body';
# Do not include the following subtrees or pages (URLs):
# 0 = hide this page (and hence its associated subtree) completely
# 1 = show this page, skip all the links it contains (i.e., its subtree)
# 2 = show this page and the pages it points to, but no more
# 3 = show this page and two more levels down, then prune tree
# 4 = show this page and three more levels down, then prune tree
# ... and so on
$skip{'/u/sb/sitemap/'} = 0;
$skip{'/u/sb/whoami/'} = 2;
$skip{'/u/sb/whoami/details/curriculum/'} = 0;
$skip{'/u/sb/maptest/'} = 1;
$skip{'/u/sb/scripts/'} = 1;
$skip{'/u/sb/missing/'} = 0;
$skip{'/u/sb/whatsnew/'} = 2;
$skip{'/u/sb/gallery/projector/'} = 0;
$skip{'/u/sb/download/statistics/'} = 1;
$skip{'/u/sb/perl/yapc/fotos/'} = 1;
$skip{'/u/sb/perl/yapc/BitVector/Presentation/'} = 1;
$skip{'/u/sb/perl/yapc/DataLocations/Presentation/'} = 1;
# Define here the maximum number of levels to show (= maximum number
# of hyperlinks to follow down the tree) starting with "$tree_root":
# (Use "-1" for "plus infinity", i.e., no limit at all)
$depth = -1;
# Add here file types to be displayed but whose contents is to be ignored:
$html_ext{'README'} = 5;
$html_ext{'jpeg'} = 5;
$html_ext{'mpeg'} = 5;
$html_ext{'tiff'} = 5;
$html_ext{'jpg'} = 5;
$html_ext{'mp3'} = 5;
$html_ext{'mpg'} = 5;
$html_ext{'gif'} = 5;
$html_ext{'xpm'} = 5;
$html_ext{'tif'} = 5;
$html_ext{'txt'} = 5;
$html_ext{'doc'} = 5;
$html_ext{'xls'} = 5;
$html_ext{'ppt'} = 5;
$html_ext{'pps'} = 5;
$html_ext{'zip'} = 5;
$html_ext{'bas'} = 5;
$html_ext{'pas'} = 5;
$html_ext{'for'} = 5;
$html_ext{'tar'} = 5;
$html_ext{'bin'} = 5;
$html_ext{'dvi'} = 5;
$html_ext{'pl'} = 5;
$html_ext{'pm'} = 5;
$html_ext{'xs'} = 5;
$html_ext{'ps'} = 5;
$html_ext{'gz'} = 5;
$html_ext{'Z'} = 5;
$html_ext{'c'} = 5;
$html_ext{'h'} = 5;
# Enable this script to be able to recognize anchors (...)
# and titles (
...) extending over
# 1) one line 2) one paragraph 3) several paragraphs
# (no line breaks) (no empty lines) (line breaks and empty lines allowed)
# by 1) not changing 2) setting to '' 3) undef'ing the internal variable
# "$/":
undef $/;
# =================================== Main ====================================
# Set configuration dependent constants:
$ext_pattern = join('|', keys(%html_ext));
$html_root =~ s!/$!!;
$cgi_root =~ s!/$!!;
$root_pattern = "$html_root|$cgi_root";
# Check if this script is running under root:
#if (($< == 0) || ($> == 0)) { &restore_root_id; }
#else
#{
# die "$self: this script needs to be run under root!\n";
#}
# Get nobody's UID and GID:
unless (($nbdy_uid,$nbdy_gid) = (getpwnam($nobody))[2,3])
{
die "$self: can't determine UID and GID of user '$nobody'!\n";
}
# Rename logfile if it already exists:
if (-e $logfile)
{
$counter = '000';
$backfile = "$logfile.$counter";
while (-e $backfile)
{
$counter++;
$backfile = "$logfile.$counter";
}
unless (rename($logfile,$backfile))
{
die "$self: can't rename '$logfile' to '$backfile': $!\n";
}
unless (($log_uid,$log_gid) = (stat($backfile))[4,5])
{
die "$self: can't stat '$backfile': $!\n";
}
}
else
{
$logdir = $logfile;
$logdir =~ s!/[^/]*$!!;
unless (($log_uid,$log_gid) = (stat($logdir))[4,5])
{
die "$self: can't stat '$logdir': $!\n";
}
}
# Open logfile:
unless (open(LOGFILE, ">$logfile"))
{
die "$self: can't write '$logfile': $!\n";
}
# Write header information:
$date = `date`;
print LOGFILE $separator;
print LOGFILE "$self $version $date";
print LOGFILE $separator;
$error = 0;
# Rename treefile if it already exists:
unless ($error)
{
if (-e $treefile)
{
$counter = '000';
$backfile = "$treefile.$counter";
while (-e $backfile)
{
$counter++;
$backfile = "$treefile.$counter";
}
unless (rename($treefile,$backfile))
{
$error = 1;
print LOGFILE "can't rename '$treefile' to '$backfile': $!\n";
}
unless ($error)
{
unless (($tree_uid,$tree_gid) = (stat($backfile))[4,5])
{
$error = 1;
print LOGFILE "can't stat '$backfile': $!\n";
}
}
}
else
{
$treedir = $treefile;
$treedir =~ s!/[^/]*$!!;
unless (($tree_uid,$tree_gid) = (stat($treedir))[4,5])
{
$error = 1;
print LOGFILE "can't stat '$treedir': $!\n";
}
}
}
# Open treefile:
unless ($error)
{
unless (open(TREEFILE, ">$treefile"))
{
$error = 1;
print LOGFILE "can't write '$treefile': $!\n";
}
}
unless ($error)
{
# The start page must be marked as visited here because
# "traverse_tree" only marks links it finds INSIDE an HTML page:
$skip{$tree_root} = 0;
# The page which is generated here is also excluded; first because
# it is still empty at the time of the scan, second because even
# if it weren't, it wouldn't make sense:
$skip{$treefile} = 0; # (file paths are not changed by "url_to_file"!)
# Mark certain links as visited to automatically skip them later:
foreach $file (keys(%skip))
{
$level = $skip{$file};
&url_to_file(\$file);
if (($thisdir,$thispage) = find_page($file))
{
if (($dev,$ino) = (lstat($thispage))[0,1])
{
$visited{"$dev:$ino"} = $level;
}
else
{
print LOGFILE "can't stat file to skip '$thispage': $!\n";
}
}
}
# Slurp passwd file for user home directories:
setpwent;
while (($user,$dir) = (getpwent)[0,7])
{
if ($homedir{$user} eq '')
{
$dir =~ s!/$!!;
$homedir{$user} = $dir;
}
}
endpwent;
# Traverse tree:
$tree = [ ];
if (($root,$title) = &traverse_tree($tree_root,'',$depth))
{
&file_to_url(\$tree_root);
push(@{$tree},[$root,$tree_root,$title]);
}
# Output tree:
&html_header;
&html_body($tree,0) if (@{$tree});
&html_footer;
# Close treefile:
close(TREEFILE);
# Restore treefile ownership:
if (chown($tree_uid,$tree_gid,$treefile) != 1)
{
print LOGFILE "can't chown '$treefile'!\n";
}
}
# Write trailer information:
$date = `date`;
print LOGFILE "$self $version $date";
print LOGFILE $separator;
# Close logfile:
close(LOGFILE);
# Restore logfile ownership:
if (chown($log_uid,$log_gid,$logfile) != 1)
{
die "$self: can't chown '$logfile'!\n";
}
# Done:
exit 0;
# ================================ Subroutines ================================
# Subroutine to find given page or default HTML file (index.html etc.)
sub find_page
{
my($thispage) = @_;
my($ext,$nextpage,$symlink);
my($type,$thisdir);
# uses globals %html_ext, $ext_pattern
$type = 0;
if (($thispage =~ m!/$!) || (-d $thispage))
{
$thisdir = $thispage;
$thisdir =~ s!/$!!;
$thispage = '';
EXT:
foreach $ext (keys(%html_ext))
{
$nextpage = "$thisdir/$default.$ext";
if (-f $nextpage)
{
$thispage = $nextpage; # $type = $html_ext{$ext};
last EXT;
}
}
unless ($thispage)
{
print LOGFILE "can't find any default file in directory '$thisdir'!\n";
return();
}
}
unless (-f $thispage)
{
print LOGFILE "can't find any file named '$thispage'!\n";
return();
}
if ($thispage =~ m!\.($ext_pattern)$!io)
{
$type = $html_ext{$1};
if (($type == 1) && ((stat($thispage))[2] & 0001)) # equiv. to *.shtml
{
$type = 2; # html-file executable for: 0001 = "other", 0111 = any
}
}
$thisdir = $thispage;
$thisdir =~ s!/[^/]*$!!;
return($thisdir,$thispage,$type);
}
# Subroutine to substitute all server-side-includes (files only):
sub server_side_includes
{
my($line,$thisdir) = @_; # first argument is a reference!
my($temp,$key,$file,$dir);
# uses global $html_root
$temp = '';
INC:
while (${$line} =~ m,,i)
{
$temp .= $`;
${$line} = $';
$key = $1;
$file = $2;
if ($key eq 'virtual') { $dir = $html_root; }
else { $dir = $thisdir; }
$dir =~ s!/$!!;
$file =~ s!^/!!;
$file = "$dir/$file";
&url_to_file(\$file);
unless (open(SSI_FILE, "<$file"))
{
print LOGFILE "can't open SSI file '$file': $!\n";
next INC;
}
while ()
{
$temp .= $_;
}
close(SSI_FILE);
}
$temp .= ${$line};
${$line} = $temp;
}
# Routine to return the contents of a map file as a series of links:
sub include_map_file
{
my($nextpage) = @_;
my($result,$line,$link,$text,$thisdir,$thispage);
$result = '';
$thisdir = $nextpage;
$thisdir =~ s!/[^/]*$!!;
unless (open(MAP_FILE, "<$nextpage"))
{
print LOGFILE "can't open MAP file '$nextpage': $!\n";
return $result;
}
while($line = )
{
while ($line =~ m!\b(?:rect|circle|poly|default)\s+([^<>'"\s]+)\s!i)
{
$line = $';
# since links inside a map file are relative to that
# map file, convert the links to absolute ones:
$thispage = qq!!;
if ((($link,$text,$nextpage) = parse_link(\$thispage,$thisdir)) &&
($link ne ''))
{
$result .= qq!!;
}
}
}
close(MAP_FILE);
return $result;
}
# Subroutine to check for and extract a valid link:
sub parse_link
{
my($line,$thisdir) = @_; # first argument is a reference!
my($follow,$host,$user,$path,$nextdir);
my($link,$text,$nextpage);
# uses globals $host_pattern, %homedir
if (${$line} =~ m!(.*?)!i)
{
${$line} = $';
$link = $1;
$text = $2;
if (($link =~ m!^([^:/<>'"?=\s#]+)\s*:!) && (uc($1) ne 'HTTP'))
{
# ignore "mailto:...", "ftp:..." etc.:
$follow = 0;
}
elsif ($link =~ m!^
(?:HTTP\s*:\s*)?
//
([^:/<>'"?=\s#]+)
(?::\d+)?
((?:/[^<>'"?=\s#]*)?)
# strips off any cgi parameters or text anchor links
!ix)
{
$host = $1;
$nextpage = $2;
$follow = ($host =~ m!$host_pattern!io);
$nextdir = '/';
}
elsif ($link =~ m!^
(?:HTTP\s*:\s*)?
([^<>'"?=\s#]+)
# strips off any cgi parameters or text anchor links
!ix)
{
$nextpage = $1;
$follow = 1;
if (($nextpage eq '') || ($nextpage =~ m!^/!))
{ $nextdir = '/'; }
else
{ $nextdir = $thisdir; }
}
else
{
# text anchor link ("#subsection") or link of unknown format:
$follow = 0;
unless ($link =~ m!^#!)
{
print LOGFILE "can't parse link '$link' ($text)!\n";
}
}
if ($follow)
{
# map "~user" to user's homedir:
if (($nextpage =~ m!^~([^/]+)!) ||
($nextpage =~ m!^%7E([^/]+)!i))
{
$user = $1;
$path = $';
$path =~ s!^/!!;
if ($homedir{$user} =~ m!^\s*$!)
{
$nextpage = "/u/$user/$path"; # if homedir is unknown
}
else
{
$nextpage = $homedir{$user};
$nextpage =~ s!/$!!;
$nextpage .= "/$path";
}
}
else
{
$nextdir =~ s!/$!!;
$nextpage =~ s!^/!!;
$nextpage = "$nextdir/$nextpage";
}
&url_to_file(\$nextpage);
$link = $nextpage; # this converts a relative
&file_to_url(\$link); # to an absolute link!
}
else # link found, but not followable:
{
$link = '';
$text = '';
$nextpage = '';
}
return($link,$text,$nextpage);
}
return(); # no more links found
}
# Subroutine to set up environment for (S)CGI script to run in:
sub setup_for_cgi
{
my($thispage,$type) = @_;
my($scgi_uid,$scgi_gid);
# uses globals $nbdy_uid, $nbdy_gid
if ($type == 3)
{
# $( = $nbdy_gid; # set GID's first!
# $) = $nbdy_gid;
# $< = $nbdy_uid; # set real UID first!
# $> = $nbdy_uid;
}
elsif ($type == 4)
{
unless (($scgi_uid,$scgi_gid) = (stat($thispage))[4,5])
{
print LOGFILE "can't stat SCGI script '$thispage': $!\n";
return 0;
}
# $( = $scgi_gid; # set GID's first!
# $) = $scgi_gid;
# $< = $scgi_uid; # set real UID first!
# $> = $scgi_uid;
}
else { return 0; }
&file_to_url(\$thispage);
$ENV{'HTTP_USER_AGENT'} = 'Mozilla/3.0';
$ENV{'SCRIPT_NAME'} = $thispage;
# can be made more sophisticated if needed...
return 1;
}
# Subroutine to restore root UID and GID:
#sub restore_root_id
#{
# $< = 0;
# $> = 0;
# $( = 0;
# $) = 0;
#}
# Subroutine that does it all:
sub traverse_tree
{
my($thispage,$prevpage,$level) = @_;
my($thisdir,$nextdir,$nextpage);
my($type,$line,$link,$text,$dev,$ino,$ref);
my(@nextpages);
my($result,$title);
# uses globals $map_ext, %visited
$title = '';
$result = [ ];
unless (($thisdir,$thispage,$type) = find_page($thispage))
{
print LOGFILE "(coming from '$prevpage' -\ntree pruned abnormally)\n";
print LOGFILE $separator;
return();
}
if ($level == 0)
{
print LOGFILE "skipping '$thispage'...\n(coming from '$prevpage')\n";
print LOGFILE $separator;
return();
}
else
{
print LOGFILE "visiting '$thispage'...\n(coming from '$prevpage')\n";
}
$level--;
if (($type == 1) || ($type == 2))
{
unless (open(THISPAGE, "<$thispage"))
{
print LOGFILE "can't open HTML file '$thispage': $!\n";
print LOGFILE $separator;
return();
}
}
elsif (($type == 3) || ($type == 4))
{
if (&setup_for_cgi($thispage,$type))
{
unless (open(THISPAGE, "$thispage |"))
{
print LOGFILE "can't open pipe from CGI script '$thispage': $!\n";
print LOGFILE $separator;
return();
}
}
else
{
print LOGFILE "can't create environment for CGI script '$thispage'!\n";
print LOGFILE $separator;
return();
}
}
elsif ($type == 5)
{
$title = $thispage;
$title =~ s!^.*/!!;
print LOGFILE $separator;
return($result,$title);
}
else
{
print LOGFILE "unknown HTML file type ($type) for file '$thispage'!\n";
print LOGFILE $separator;
return();
}
LINE:
while ($line = )
{
$line =~ s![\n\r\t]+! !g;
&server_side_includes(\$line,$thisdir) if ($type == 2);
$line =~ s![\n\r\t]+! !g;
if (($title eq '') && ($line =~ m!(.*?)!i))
{
$title = $1;
$title =~ s!^\s*!!;
$title =~ s!\s*$!!;
$title =~ s!\s+! !g;
}
next LINE if ($level == 0);
LINK:
while (($link,$text,$nextpage) = parse_link(\$line,$thisdir))
{
next LINK if ($link eq '');
if (($link =~ m!\.$map_ext!io) &&
($text =~ m!^\s*
"!
($text =~ m!>\s*$!))
{
$line = &include_map_file($nextpage) . $line;
}
elsif (($nextdir,$nextpage) = find_page($nextpage))
{
if (($dev,$ino) = (lstat($nextpage))[0,1])
{
if (!defined($visited{"$dev:$ino"}))
{
push( @nextpages, [ $link, $nextpage, $level ] )
unless ($nextpage =~
m!/(?:redirect\.cgi|(?:img|pkg|license)/[^/]+)$!);
$visited{"$dev:$ino"} = 0;
}
else
{
if ($visited{"$dev:$ino"} > 0)
{
push( @nextpages,
[ $link, $nextpage, $visited{"$dev:$ino"} ] );
$visited{"$dev:$ino"} = 0;
}
}
}
else { print LOGFILE "can't stat HTML file '$nextpage': $!\n"; }
}
else { }
}
}
close(THISPAGE);
# free memory:
$line = '';
# ("$line" may contain a whole page - simultaneously in every
# call of "traverse_tree" in the recursive descent!)
# &restore_root_id if (($type == 3) || ($type == 4));
print LOGFILE $separator;
while (@nextpages)
{
$ref = shift(@nextpages);
$link = ${$ref}[0];
$nextpage = ${$ref}[1];
$level = ${$ref}[2];
if (($ref,$text) = &traverse_tree($nextpage,$thispage,$level))
{
push(@{$result},[$ref,$link,$text]);
}
}
return($result,$title);
}
# Subroutine to give out the upper part of the HTML page:
sub html_header
{
# uses global TREEFILE
print TREEFILE <<"VERBATIM";
|
VERBATIM
}
# Subroutine to give out the main part of the HTML page:
sub html_body
{
my($tree,$level) = @_;
my($this,$next,$link,$title);
# uses global TREEFILE
if ($level < 2) { print TREEFILE ' ' x $level, " \n"; }
print TREEFILE ' ' x $level, " \n";
while (@{$tree})
{
$this = shift(@{$tree});
$next = ${$this}[0];
$link = ${$this}[1];
$title = ${$this}[2];
unless ($title) { $title = $link; }
$title =~ s!^Steffen Beyer\s+-\s+!!i; # remove "Steffen Beyer - "
print TREEFILE ' ' x ($level+1);
print TREEFILE qq!- $title
\n!;
if (@{$next})
{
&html_body($next,$level+1);
# if (@{$tree}) { print TREEFILE ' ' x ($level+1), "\n"; }
}
if (($level < 2) && @{$tree}) { print TREEFILE ' ' x ($level+1), " \n"; }
}
print TREEFILE ' ' x $level, " \n";
if ($level < 2) { print TREEFILE ' ' x $level, "\n"; }
}
# Subroutine to give out the lower part of the HTML page:
sub html_footer
{
# uses global TREEFILE
print TREEFILE <<"VERBATIM";
|
|
VERBATIM
}
# Subroutine to convert a URL into a physical path in the file system:
# (Doesn't change the input if it's already a physical path!)
sub url_to_file
{
my($thispage) = @_; # argument is a reference!
my($rootdir,$type,$doit);
# uses globals $html_root, $cgi_root, $root_pattern, $ext_pattern, %html_ext
# prepend path to root HTML or "cgi-bin" directory:
${$thispage} =~ s!^/~([A-Za-z][A-Za-z0-9]*)/!/u/$1/!;
unless (${$thispage} =~ m!^(?:$root_pattern)!io)
{
$rootdir = $html_root;
if (${$thispage} =~ m!\.($ext_pattern)$!io)
{
$type = $html_ext{$1};
if (($type == 3) || ($type == 4)) { $rootdir = $cgi_root; }
}
${$thispage} =~ s!^/!!;
${$thispage} = "$rootdir/" . ${$thispage};
}
# transformation for hidden HTML subdirectories in user home directories:
$doit = 0;
if (${$thispage} =~ m!^/[egu]/[^/\s]+/([^/\s]+)!)
{
if (lc($1) ne '.www') { $doit = 1; }
}
elsif (${$thispage} =~ m!^/[egu]/[^/\s]!)
{
$doit = 1;
}
else { }
if ($doit)
{
${$thispage} =~ s!^/([egu])/([^/\s]+)!/$1/$2/.www!;
}
# substitute "/./" --> "/":
while (${$thispage} =~ m!/\./!)
{
${$thispage} =~ s!/\./!/!g;
}
# substitute "/directory/../" --> "/":
while (${$thispage} =~ m!/[^\./]+/\.\./!)
{
${$thispage} =~ s!/[^\./]+/\.\./!/!g;
}
}
# Subroutine to convert a physical path in the file system into a URL:
# (Doesn't change the input if it's already a URL!)
sub file_to_url
{
my($thispage) = @_; # argument is a reference!
# uses global $root_pattern
# remove leading path to root HTML or "cgi-bin" directory:
${$thispage} =~ s!^(?:$root_pattern)!!io;
# transformation for hidden HTML subdirectories in user home directories:
if ((${$thispage} =~ m!^/[egu]/[^/\s]+/.www/!) ||
(${$thispage} =~ m!^/[egu]/[^/\s]+/.www$!))
{
${$thispage} =~ s!^/([egu])/([^/\s]+)/.www!/$1/$2!;
}
# substitute "/./" --> "/":
while (${$thispage} =~ m!/\./!)
{
${$thispage} =~ s!/\./!/!g;
}
# substitute "/directory/../" --> "/":
while (${$thispage} =~ m!/[^\./]+/\.\./!)
{
${$thispage} =~ s!/[^\./]+/\.\./!/!g;
}
}
# The End:
__END__