#!/v/guest/sw/bin/perl require 5.002; ############################################################################### # # # Program to traverse a tree of HTML pages to collect their URLs and build # # a textual representation in form of a tree (a HTML page with hyperlinks) # # # ############################################################################### # # # Version 1.0 - Written 31.08.96 by Steffen Beyer # # Version 1.1 - Written 01.09.96 by Steffen Beyer # # Version 1.2 - Written 04.09.96 by Steffen Beyer # # Version 2.0 - Written 10.09.96 by Steffen Beyer # # Version 2.1 - Written 16.10.96 by Steffen Beyer # # Version 2.2 - Written 15.05.97 by Steffen Beyer # # # ############################################################################### # # # Copyright (c) 1996 by Steffen Beyer. # # All rights reserved. # # # ############################################################################### # # # This program is free software; you can redistribute it and/or # # modify it under the same terms as Perl itself. # # # ############################################################################### # ============================ Internal constants: ============================ umask(0022); $self = $0; $self =~ s!^.*/!!; $VERSION = '2.2'; $version = "version $VERSION"; $separator = ('-' x 78) . "\n"; # Definition of standard (default) HTML file names and extensions: $default = 'index'; %html_ext = ('scgi' => 4, 'cgi' => 3, 'shtml' => 2, 'html' => 1, 'htm' => 1); $map_ext = 'map'; # ======================= User configurable constants: ======================== # User under which CGI programs are run on your host (usually "nobody"): $nobody = 'nobody'; # The pattern that uniquely identifies this host in URLs: $host_pattern = '\bwww(?:\.|1\.|2\.|\.de\.|\.ch\.)engelschall\.com\b|\ben3\b|\ben5\b'; # en3/en5 #$host_pattern = '\bsww1\b|\bsunbi1\b|\bsun\b'; # sunbi1 # The physical path of the directory the HTTP server uses as its root # directory for HTML pages (for example, "/usr/local/etc/httpd/htdocs/"): $html_root = '/'; # The physical path of the directory the HTTP server uses as its root # directory for CGI scripts (i.e., your "cgi-bin" directory): $cgi_root = '/'; # (I.e., if "/cgi-bin/..." corresponds to "/usr/local/etc/httpd/cgi-bin/...", # set "$cgi_root" equal to "/usr/local/etc/httpd/") # Define here where the recursive descent should start (physical path): # (whatever subtree you want) $tree_root = '/u/sb/.www/'; # Where to put this script's log file (physical path): $logfile = "/u/sb/.www/sitemap/$self.log"; # Where to put this script's output file (physical path): $treefile = '/u/sb/.www/sitemap/index.body'; # Do not include the following subtrees or pages (URLs): # 0 = hide this page (and hence its associated subtree) completely # 1 = show this page, skip all the links it contains (i.e., its subtree) # 2 = show this page and the pages it points to, but no more # 3 = show this page and two more levels down, then prune tree # 4 = show this page and three more levels down, then prune tree # ... and so on $skip{'/u/sb/sitemap/'} = 0; $skip{'/u/sb/whoami/'} = 2; $skip{'/u/sb/whoami/details/curriculum/'} = 0; $skip{'/u/sb/maptest/'} = 1; $skip{'/u/sb/scripts/'} = 1; $skip{'/u/sb/missing/'} = 0; $skip{'/u/sb/whatsnew/'} = 2; $skip{'/u/sb/gallery/projector/'} = 0; $skip{'/u/sb/download/statistics/'} = 1; $skip{'/u/sb/perl/yapc/fotos/'} = 1; $skip{'/u/sb/perl/yapc/BitVector/Presentation/'} = 1; $skip{'/u/sb/perl/yapc/DataLocations/Presentation/'} = 1; # Define here the maximum number of levels to show (= maximum number # of hyperlinks to follow down the tree) starting with "$tree_root": # (Use "-1" for "plus infinity", i.e., no limit at all) $depth = -1; # Add here file types to be displayed but whose contents is to be ignored: $html_ext{'README'} = 5; $html_ext{'jpeg'} = 5; $html_ext{'mpeg'} = 5; $html_ext{'tiff'} = 5; $html_ext{'jpg'} = 5; $html_ext{'mpg'} = 5; $html_ext{'gif'} = 5; $html_ext{'xpm'} = 5; $html_ext{'tif'} = 5; $html_ext{'txt'} = 5; $html_ext{'doc'} = 5; $html_ext{'xls'} = 5; $html_ext{'ppt'} = 5; $html_ext{'pps'} = 5; $html_ext{'zip'} = 5; $html_ext{'bas'} = 5; $html_ext{'pas'} = 5; $html_ext{'for'} = 5; $html_ext{'tar'} = 5; $html_ext{'bin'} = 5; $html_ext{'dvi'} = 5; $html_ext{'pl'} = 5; $html_ext{'pm'} = 5; $html_ext{'xs'} = 5; $html_ext{'ps'} = 5; $html_ext{'gz'} = 5; $html_ext{'Z'} = 5; $html_ext{'c'} = 5; $html_ext{'h'} = 5; # Enable this script to be able to recognize anchors (...) # and titles (...) extending over # 1) one line 2) one paragraph 3) several paragraphs # (no line breaks) (no empty lines) (line breaks and empty lines allowed) # by 1) not changing 2) setting to '' 3) undef'ing the internal variable # "$/": undef $/; # =================================== Main ==================================== # Set configuration dependent constants: $ext_pattern = join('|', keys(%html_ext)); $html_root =~ s!/$!!; $cgi_root =~ s!/$!!; $root_pattern = "$html_root|$cgi_root"; # Check if this script is running under root: #if (($< == 0) || ($> == 0)) { &restore_root_id; } #else #{ # die "$self: this script needs to be run under root!\n"; #} # Get nobody's UID and GID: unless (($nbdy_uid,$nbdy_gid) = (getpwnam($nobody))[2,3]) { die "$self: can't determine UID and GID of user '$nobody'!\n"; } # Rename logfile if it already exists: if (-e $logfile) { $counter = '000'; $backfile = "$logfile.$counter"; while (-e $backfile) { $counter++; $backfile = "$logfile.$counter"; } unless (rename($logfile,$backfile)) { die "$self: can't rename '$logfile' to '$backfile': $!\n"; } unless (($log_uid,$log_gid) = (stat($backfile))[4,5]) { die "$self: can't stat '$backfile': $!\n"; } } else { $logdir = $logfile; $logdir =~ s!/[^/]*$!!; unless (($log_uid,$log_gid) = (stat($logdir))[4,5]) { die "$self: can't stat '$logdir': $!\n"; } } # Open logfile: unless (open(LOGFILE, ">$logfile")) { die "$self: can't write '$logfile': $!\n"; } # Write header information: $date = `date`; print LOGFILE $separator; print LOGFILE "$self $version $date"; print LOGFILE $separator; $error = 0; # Rename treefile if it already exists: unless ($error) { if (-e $treefile) { $counter = '000'; $backfile = "$treefile.$counter"; while (-e $backfile) { $counter++; $backfile = "$treefile.$counter"; } unless (rename($treefile,$backfile)) { $error = 1; print LOGFILE "can't rename '$treefile' to '$backfile': $!\n"; } unless ($error) { unless (($tree_uid,$tree_gid) = (stat($backfile))[4,5]) { $error = 1; print LOGFILE "can't stat '$backfile': $!\n"; } } } else { $treedir = $treefile; $treedir =~ s!/[^/]*$!!; unless (($tree_uid,$tree_gid) = (stat($treedir))[4,5]) { $error = 1; print LOGFILE "can't stat '$treedir': $!\n"; } } } # Open treefile: unless ($error) { unless (open(TREEFILE, ">$treefile")) { $error = 1; print LOGFILE "can't write '$treefile': $!\n"; } } unless ($error) { # The start page must be marked as visited here because # "traverse_tree" only marks links it finds INSIDE an HTML page: $skip{$tree_root} = 0; # The page which is generated here is also excluded; first because # it is still empty at the time of the scan, second because even # if it weren't, it wouldn't make sense: $skip{$treefile} = 0; # (file paths are not changed by "url_to_file"!) # Mark certain links as visited to automatically skip them later: foreach $file (keys(%skip)) { $level = $skip{$file}; &url_to_file(\$file); if (($thisdir,$thispage) = find_page($file)) { if (($dev,$ino) = (lstat($thispage))[0,1]) { $visited{"$dev:$ino"} = $level; } else { print LOGFILE "can't stat file to skip '$thispage': $!\n"; } } } # Slurp passwd file for user home directories: setpwent; while (($user,$dir) = (getpwent)[0,7]) { if ($homedir{$user} eq '') { $dir =~ s!/$!!; $homedir{$user} = $dir; } } endpwent; # Traverse tree: $tree = [ ]; if (($root,$title) = &traverse_tree($tree_root,'',$depth)) { &file_to_url(\$tree_root); push(@{$tree},[$root,$tree_root,$title]); } # Output tree: &html_header; &html_body($tree,0) if (@{$tree}); &html_footer; # Close treefile: close(TREEFILE); # Restore treefile ownership: if (chown($tree_uid,$tree_gid,$treefile) != 1) { print LOGFILE "can't chown '$treefile'!\n"; } } # Write trailer information: $date = `date`; print LOGFILE "$self $version $date"; print LOGFILE $separator; # Close logfile: close(LOGFILE); # Restore logfile ownership: if (chown($log_uid,$log_gid,$logfile) != 1) { die "$self: can't chown '$logfile'!\n"; } # Done: exit 0; # ================================ Subroutines ================================ # Subroutine to find given page or default HTML file (index.html etc.) sub find_page { my($thispage) = @_; my($ext,$nextpage,$symlink); my($type,$thisdir); # uses globals %html_ext, $ext_pattern $type = 0; if (($thispage =~ m!/$!) || (-d $thispage)) { $thisdir = $thispage; $thisdir =~ s!/$!!; $thispage = ''; EXT: foreach $ext (keys(%html_ext)) { $nextpage = "$thisdir/$default.$ext"; if (-f $nextpage) { $thispage = $nextpage; # $type = $html_ext{$ext}; last EXT; } } unless ($thispage) { print LOGFILE "can't find any default file in directory '$thisdir'!\n"; return(); } } unless (-f $thispage) { print LOGFILE "can't find any file named '$thispage'!\n"; return(); } if ($thispage =~ m!\.($ext_pattern)$!io) { $type = $html_ext{$1}; if (($type == 1) && ((stat($thispage))[2] & 0001)) # equiv. to *.shtml { $type = 2; # html-file executable for: 0001 = "other", 0111 = any } } $thisdir = $thispage; $thisdir =~ s!/[^/]*$!!; return($thisdir,$thispage,$type); } # Subroutine to substitute all server-side-includes (files only): sub server_side_includes { my($line,$thisdir) = @_; # first argument is a reference! my($temp,$key,$file,$dir); # uses global $html_root $temp = ''; INC: while (${$line} =~ m,,i) { $temp .= $`; ${$line} = $'; $key = $1; $file = $2; if ($key eq 'virtual') { $dir = $html_root; } else { $dir = $thisdir; } $dir =~ s!/$!!; $file =~ s!^/!!; $file = "$dir/$file"; &url_to_file(\$file); unless (open(SSI_FILE, "<$file")) { print LOGFILE "can't open SSI file '$file': $!\n"; next INC; } while () { $temp .= $_; } close(SSI_FILE); } $temp .= ${$line}; ${$line} = $temp; } # Routine to return the contents of a map file as a series of links: sub include_map_file { my($nextpage) = @_; my($result,$line,$link,$text,$thisdir,$thispage); $result = ''; $thisdir = $nextpage; $thisdir =~ s!/[^/]*$!!; unless (open(MAP_FILE, "<$nextpage")) { print LOGFILE "can't open MAP file '$nextpage': $!\n"; return $result; } while($line = ) { while ($line =~ m!\b(?:rect|circle|poly|default)\s+([^<>'"\s]+)\s!i) { $line = $'; # since links inside a map file are relative to that # map file, convert the links to absolute ones: $thispage = qq!!; if ((($link,$text,$nextpage) = parse_link(\$thispage,$thisdir)) && ($link ne '')) { $result .= qq!!; } } } close(MAP_FILE); return $result; } # Subroutine to check for and extract a valid link: sub parse_link { my($line,$thisdir) = @_; # first argument is a reference! my($follow,$host,$user,$path,$nextdir); my($link,$text,$nextpage); # uses globals $host_pattern, %homedir if (${$line} =~ m!(.*?)!i) { ${$line} = $'; $link = $1; $text = $2; if (($link =~ m!^([^:/<>'"?=\s#]+)\s*:!) && (uc($1) ne 'HTTP')) { # ignore "mailto:...", "ftp:..." etc.: $follow = 0; } elsif ($link =~ m!^ (?:HTTP\s*:\s*)? // ([^:/<>'"?=\s#]+) (?::\d+)? ((?:/[^<>'"?=\s#]*)?) # strips off any cgi parameters or text anchor links !ix) { $host = $1; $nextpage = $2; $follow = ($host =~ m!$host_pattern!io); $nextdir = '/'; } elsif ($link =~ m!^ (?:HTTP\s*:\s*)? ([^<>'"?=\s#]+) # strips off any cgi parameters or text anchor links !ix) { $nextpage = $1; $follow = 1; if (($nextpage eq '') || ($nextpage =~ m!^/!)) { $nextdir = '/'; } else { $nextdir = $thisdir; } } else { # text anchor link ("#subsection") or link of unknown format: $follow = 0; unless ($link =~ m!^#!) { print LOGFILE "can't parse link '$link' ($text)!\n"; } } if ($follow) { # map "~user" to user's homedir: if (($nextpage =~ m!^~([^/]+)!) || ($nextpage =~ m!^%7E([^/]+)!i)) { $user = $1; $path = $'; $path =~ s!^/!!; if ($homedir{$user} =~ m!^\s*$!) { $nextpage = "/u/$user/$path"; # if homedir is unknown } else { $nextpage = $homedir{$user}; $nextpage =~ s!/$!!; $nextpage .= "/$path"; } } else { $nextdir =~ s!/$!!; $nextpage =~ s!^/!!; $nextpage = "$nextdir/$nextpage"; } &url_to_file(\$nextpage); $link = $nextpage; # this converts a relative &file_to_url(\$link); # to an absolute link! } else # link found, but not followable: { $link = ''; $text = ''; $nextpage = ''; } return($link,$text,$nextpage); } return(); # no more links found } # Subroutine to set up environment for (S)CGI script to run in: sub setup_for_cgi { my($thispage,$type) = @_; my($scgi_uid,$scgi_gid); # uses globals $nbdy_uid, $nbdy_gid if ($type == 3) { # $( = $nbdy_gid; # set GID's first! # $) = $nbdy_gid; # $< = $nbdy_uid; # set real UID first! # $> = $nbdy_uid; } elsif ($type == 4) { unless (($scgi_uid,$scgi_gid) = (stat($thispage))[4,5]) { print LOGFILE "can't stat SCGI script '$thispage': $!\n"; return 0; } # $( = $scgi_gid; # set GID's first! # $) = $scgi_gid; # $< = $scgi_uid; # set real UID first! # $> = $scgi_uid; } else { return 0; } &file_to_url(\$thispage); $ENV{'HTTP_USER_AGENT'} = 'Mozilla/3.0'; $ENV{'SCRIPT_NAME'} = $thispage; # can be made more sophisticated if needed... return 1; } # Subroutine to restore root UID and GID: #sub restore_root_id #{ # $< = 0; # $> = 0; # $( = 0; # $) = 0; #} # Subroutine that does it all: sub traverse_tree { my($thispage,$prevpage,$level) = @_; my($thisdir,$nextdir,$nextpage); my($type,$line,$link,$text,$dev,$ino,$ref); my(@nextpages); my($result,$title); # uses globals $map_ext, %visited $title = ''; $result = [ ]; unless (($thisdir,$thispage,$type) = find_page($thispage)) { print LOGFILE "(coming from '$prevpage' -\ntree pruned abnormally)\n"; print LOGFILE $separator; return(); } if ($level == 0) { print LOGFILE "skipping '$thispage'...\n(coming from '$prevpage')\n"; print LOGFILE $separator; return(); } else { print LOGFILE "visiting '$thispage'...\n(coming from '$prevpage')\n"; } $level--; if (($type == 1) || ($type == 2)) { unless (open(THISPAGE, "<$thispage")) { print LOGFILE "can't open HTML file '$thispage': $!\n"; print LOGFILE $separator; return(); } } elsif (($type == 3) || ($type == 4)) { if (&setup_for_cgi($thispage,$type)) { unless (open(THISPAGE, "$thispage |")) { print LOGFILE "can't open pipe from CGI script '$thispage': $!\n"; print LOGFILE $separator; return(); } } else { print LOGFILE "can't create environment for CGI script '$thispage'!\n"; print LOGFILE $separator; return(); } } elsif ($type == 5) { $title = $thispage; $title =~ s!^.*/!!; print LOGFILE $separator; return($result,$title); } else { print LOGFILE "unknown HTML file type ($type) for file '$thispage'!\n"; print LOGFILE $separator; return(); } LINE: while ($line = ) { $line =~ s![\n\r\t]+! !g; &server_side_includes(\$line,$thisdir) if ($type == 2); $line =~ s![\n\r\t]+! !g; if (($title eq '') && ($line =~ m!(.*?)!i)) { $title = $1; $title =~ s!^\s*!!; $title =~ s!\s*$!!; $title =~ s!\s+! !g; } next LINE if ($level == 0); LINK: while (($link,$text,$nextpage) = parse_link(\$line,$thisdir)) { next LINK if ($link eq ''); if (($link =~ m!\.$map_ext!io) && ($text =~ m!^\s*"! ($text =~ m!>\s*$!)) { $line = &include_map_file($nextpage) . $line; } elsif (($nextdir,$nextpage) = find_page($nextpage)) { if (($dev,$ino) = (lstat($nextpage))[0,1]) { if (!defined($visited{"$dev:$ino"})) { push( @nextpages, [ $link, $nextpage, $level ] ) unless ($nextpage =~ m!/(?:redirect\.cgi|(?:img|pkg|license)/[^/]+)$!); $visited{"$dev:$ino"} = 0; } else { if ($visited{"$dev:$ino"} > 0) { push( @nextpages, [ $link, $nextpage, $visited{"$dev:$ino"} ] ); $visited{"$dev:$ino"} = 0; } } } else { print LOGFILE "can't stat HTML file '$nextpage': $!\n"; } } else { } } } close(THISPAGE); # free memory: $line = ''; # ("$line" may contain a whole page - simultaneously in every # call of "traverse_tree" in the recursive descent!) # &restore_root_id if (($type == 3) || ($type == 4)); print LOGFILE $separator; while (@nextpages) { $ref = shift(@nextpages); $link = ${$ref}[0]; $nextpage = ${$ref}[1]; $level = ${$ref}[2]; if (($ref,$text) = &traverse_tree($nextpage,$thispage,$level)) { push(@{$result},[$ref,$link,$text]); } } return($result,$title); } # Subroutine to give out the upper part of the HTML page: sub html_header { # uses global TREEFILE print TREEFILE <<"VERBATIM";
VERBATIM } # Subroutine to give out the main part of the HTML page: sub html_body { my($tree,$level) = @_; my($this,$next,$link,$title); # uses global TREEFILE if ($level < 2) { print TREEFILE ' ' x $level, "

\n"; } print TREEFILE ' ' x $level, "

\n"; while (@{$tree}) { $this = shift(@{$tree}); $next = ${$this}[0]; $link = ${$this}[1]; $title = ${$this}[2]; unless ($title) { $title = $link; } $title =~ s!^Steffen Beyer\s+-\s+!!i; # remove "Steffen Beyer - " print TREEFILE ' ' x ($level+1); print TREEFILE qq!
$title
\n!; if (@{$next}) { &html_body($next,$level+1); # if (@{$tree}) { print TREEFILE ' ' x ($level+1), "

\n"; } } if (($level < 2) && @{$tree}) { print TREEFILE ' ' x ($level+1), "

\n"; } } print TREEFILE ' ' x $level, "

\n"; if ($level < 2) { print TREEFILE ' ' x $level, "

\n"; } } # Subroutine to give out the lower part of the HTML page: sub html_footer { # uses global TREEFILE print TREEFILE <<"VERBATIM";

VERBATIM } # Subroutine to convert a URL into a physical path in the file system: # (Doesn't change the input if it's already a physical path!) sub url_to_file { my($thispage) = @_; # argument is a reference! my($rootdir,$type,$doit); # uses globals $html_root, $cgi_root, $root_pattern, $ext_pattern, %html_ext # prepend path to root HTML or "cgi-bin" directory: ${$thispage} =~ s!^/~([A-Za-z][A-Za-z0-9]*)/!/u/$1/!; unless (${$thispage} =~ m!^(?:$root_pattern)!io) { $rootdir = $html_root; if (${$thispage} =~ m!\.($ext_pattern)$!io) { $type = $html_ext{$1}; if (($type == 3) || ($type == 4)) { $rootdir = $cgi_root; } } ${$thispage} =~ s!^/!!; ${$thispage} = "$rootdir/" . ${$thispage}; } # transformation for hidden HTML subdirectories in user home directories: $doit = 0; if (${$thispage} =~ m!^/[egu]/[^/\s]+/([^/\s]+)!) { if (lc($1) ne '.www') { $doit = 1; } } elsif (${$thispage} =~ m!^/[egu]/[^/\s]!) { $doit = 1; } else { } if ($doit) { ${$thispage} =~ s!^/([egu])/([^/\s]+)!/$1/$2/.www!; } # substitute "/./" --> "/": while (${$thispage} =~ m!/\./!) { ${$thispage} =~ s!/\./!/!g; } # substitute "/directory/../" --> "/": while (${$thispage} =~ m!/[^\./]+/\.\./!) { ${$thispage} =~ s!/[^\./]+/\.\./!/!g; } } # Subroutine to convert a physical path in the file system into a URL: # (Doesn't change the input if it's already a URL!) sub file_to_url { my($thispage) = @_; # argument is a reference! # uses global $root_pattern # remove leading path to root HTML or "cgi-bin" directory: ${$thispage} =~ s!^(?:$root_pattern)!!io; # transformation for hidden HTML subdirectories in user home directories: if ((${$thispage} =~ m!^/[egu]/[^/\s]+/.www/!) || (${$thispage} =~ m!^/[egu]/[^/\s]+/.www$!)) { ${$thispage} =~ s!^/([egu])/([^/\s]+)/.www!/$1/$2!; } # substitute "/./" --> "/": while (${$thispage} =~ m!/\./!) { ${$thispage} =~ s!/\./!/!g; } # substitute "/directory/../" --> "/": while (${$thispage} =~ m!/[^\./]+/\.\./!) { ${$thispage} =~ s!/[^\./]+/\.\./!/!g; } } # The End: __END__