#!/opt/bin/perl5 # # Needs Perl 5.001 or higher! (For non-greedy pattern matching) # ############################################################################### # # # Program to traverse a tree of HTML pages to perform certain actions on them # # # ############################################################################### # # # Version 1.2 - Written 03.02.96 by Steffen Beyer # # # ############################################################################### # # # Copyright (C) 1995 by software design & management GmbH & Co. KG # # # ############################################################################### # # User configurable constants: # ############################################################################### # $hostpattern = '\bsun\b|\bsunbi1\b|\bwww\.bi\.sdm\b|\bsww1\b'; # Pattern! # $hostaddress = 'sww1.sdm.de'; # $rootdir = '/u/www/.www'; $rootpage = 'index.html'; # # Select (single/double/multiple) line anchors (...) # by (not changing/setting to ""/undef'ing) '$/': # undef $/; # ############################################################################### # # Some internal variables: # $version = 'version 1.2'; # $self = $0; $self =~ s!^.*/!!; # $separator = ('-' x 78) . "\n"; # # Check for correct Perl version: # if ($] < 5.000) { warn $separator; warn "$self $version needs Perl version 5.000 or higher!\n"; die $separator; } if ($] < 5.001) { warn $separator; warn "$self $version was developed with Perl version 5.001m.\n"; warn "Your Perl version is: $]. Use at your own risk!\n"; warn $separator; } # # Slurp passwd file for user home directories: # setpwent; # while (($user,$dir) = (getpwent)[0,7]) { if ($homedir{$user} eq "") { $dir =~ s!/$!!; $homedir{$user} = $dir; } } # endpwent; # # Normalize root directory and page: # $rootdir =~ s!/$!!; $rootpage =~ s!/$!!; $rootpage =~ s!^.*/!!; # $logfile = "$rootdir/$self.log"; # # Rename logfile if it already exists: # if (-e $logfile) { $counter = '000'; $backfile = "$logfile.$counter"; while (-e $backfile) { $counter++; $backfile = "$logfile.$counter"; } unless (rename($logfile,$backfile)) { die "$self: can't rename '$logfile' to '$backfile': $!\n"; } unless (($log_uid,$log_gid) = (stat($backfile))[4,5]) { die "$self: can't stat '$backfile': $!\n"; } } else { unless (($log_uid,$log_gid) = (stat($rootdir))[4,5]) { die "$self: can't stat '$rootdir': $!\n"; } } # # Open logfile: # unless (open(LOGFILE, ">$logfile")) { die "$self: can't write '$logfile': $!\n"; } # # Write header information: # $date = `date`; print LOGFILE $separator; print LOGFILE "$self $version $date"; print LOGFILE $separator; # # Traverse tree: # &traverse("$rootdir/$rootpage",""); # # Write trailer information: # $date = `date`; print LOGFILE "$self $version $date"; print LOGFILE $separator; # # Close logfile: # close(LOGFILE); # # Restore logfile ownership: # if (chown($log_uid,$log_gid,$logfile) != 1) { die "$self: can't chown '$logfile'!\n"; } # # Done: # exit; # # Subroutine that does it all: # sub traverse { local($thispage,$prevpage) = @_; local(@nextpages); $thispage =~ s!/$!!; $thisdir = $thispage; $thisdir =~ s!/[^/]*$!!; print LOGFILE "visiting '$thispage'...\n"; print LOGFILE "(coming from '$prevpage')\n"; unless (open(THISPAGE, "<$thispage")) { print LOGFILE "can't read '$thispage': $!\n"; print LOGFILE $separator; return; } $tempfile = "$thispage.new.$$"; unless (open(TEMPFILE, ">$tempfile")) { print LOGFILE "can't write '$tempfile': $!\n"; print LOGFILE $separator; return; } $filechanged = 0; while ($line = ) { $left = ""; $rest = $line; $linechanged = 0; while ($rest =~ m!(.*?)!i) { $left .= $`; $rest = $'; $link = $1; $text = $2; $http = ''; $host = ''; $port = ''; $file = ''; $args = ''; $follow = 0; $linkchanged = 0; #print LOGFILE "found hyperlink '$link' ($text).\n"; if (($link !~ m!^\s*\S+\s*:!) || (($link =~ m!^\s*HTTP\s*:!i) && ($http = 'http:'))) { if ($link =~ m!^\s* (?:HTTP\s*:\s*)? //([^/:]*) ((?::\d+)?) /?(\S*?) ((?:"?\s.*?)?) $!ix) { $host = $1; $port = $2; $file = $3; $args = $4; $follow = ($host =~ m!$hostpattern!io); if (($follow) && ($host ne $hostaddress)) { $linkchanged = 1; $link = "${http}//${hostaddress}${port}/"; } else { $link = "${http}//${host}${port}/"; } $nextdir = $rootdir; } elsif ($link =~ m!^\s* (?:HTTP\s*:\s*)? (\S+) ((?:\s.*?)?) $!ix) { $file = $1; $args = $2; $follow = 1; if ($file =~ m!^/!) { $link = "$http/"; $file =~ s!^/!!; $nextdir = $rootdir; } else { $link = $http; $nextdir = $thisdir; } } else { } } if ($follow) { #print LOGFILE "following hyperlink '${link}$file'.\n"; $nextdir =~ s!/$!!; $nextpage = $file; if ($nextpage =~ m!^([^#]*)#!) { $nextpage = $1; } if (($nextpage =~ m!^~([^/]+)!) || ($nextpage =~ m!^%7E([^/]+)!i)) { $user = $1; $path = $'; $path =~ s!^/!!; if ($homedir{$user} ne "") { $nextpage = "$homedir{$user}/.www/$path"; } else { $nextpage = "/u/$user/.www/$path"; } } else { $nextpage = "$nextdir/$nextpage"; } $nextpage =~ s!/$!!; #print LOGFILE "hyperlink points to '$nextpage'.\n"; $found = 0; if (-f $nextpage) { $found = 1; } else { unless ((-d $nextpage) || ($nextpage =~ m!\.html?$!i)) { $testpage = "$nextpage.htm"; if (-f $testpage) { $found = 1; $linkchanged = 1; $nextpage = $testpage; $file .= ".htm"; } else { $testpage = "$nextpage.html"; if (-f $testpage) { $found = 1; $linkchanged = 1; $nextpage = $testpage; $file .= ".html"; } } } } if (($found) && (-T $nextpage) && !(-B $nextpage)) { while (-l $nextpage) { $symlink = readlink($nextpage); if ($symlink =~ m!^/!) { $nextpage = $symlink; } else { $nextpage =~ s![^/]+$!$symlink!; } } if (($ino) = (stat($nextpage))[1]) { if (!($visited{$ino})) { $visited{$ino} = 1; push(@nextpages, $nextpage); #print LOGFILE "pushing '$nextpage'.\n"; } } else { print LOGFILE "can't stat '$nextpage': $!\n"; } } unless (($found) || (-d $nextpage)) { print LOGFILE "can't find '$nextpage' (${link}$file)!\n"; } } if ($linkchanged) { $linechanged = 1; print LOGFILE qq|${text}\n|; } $left .= qq|${text}|; } $left .= $rest; if ($linechanged) { $filechanged = 1; print TEMPFILE $left; } else { print TEMPFILE $line; } } close(TEMPFILE); close(THISPAGE); if ($filechanged) { unless (($uid,$gid) = (stat($thispage))[4,5]) { print LOGFILE "can't stat old '$thispage': $!\n"; if (unlink($tempfile) != 1) { print LOGFILE "can't delete '$tempfile'!\n"; } print LOGFILE $separator; return; } $counter = '000'; $backfile = "$thispage.$counter"; while (-e $backfile) { $counter++; $backfile = "$thispage.$counter"; } unless (rename($thispage,$backfile)) { print LOGFILE "can't rename '$thispage' to '$backfile': $!\n"; if (unlink($tempfile) != 1) { print LOGFILE "can't delete '$tempfile'!\n"; } print LOGFILE $separator; return; } print LOGFILE "renamed '$thispage' to '$backfile'.\n"; unless (rename($tempfile,$thispage)) { print LOGFILE "can't rename '$tempfile' to '$thispage': $!\n"; print LOGFILE $separator; return; } if (chown($uid,$gid,$thispage) != 1) { print LOGFILE "can't chown '$thispage'!\n"; print LOGFILE $separator; return; } #print LOGFILE "chown $uid.$gid $thispage.\n"; if (chmod(0644,$thispage) != 1) { print LOGFILE "can't chmod '$thispage'!\n"; print LOGFILE $separator; return; } if (($ino) = (stat($thispage))[1]) { $visited{$ino} = 1; } else { print LOGFILE "can't stat new '$thispage': $!\n"; print LOGFILE $separator; return; } } else { if (unlink($tempfile) != 1) { print LOGFILE "can't delete '$tempfile'!\n"; print LOGFILE $separator; return; } } print LOGFILE $separator; while (@nextpages) { &traverse(shift(@nextpages),$thispage); # print STDOUT "&traverse("; # print STDOUT shift(@nextpages); # print STDOUT ","; # print STDOUT $thispage; # print STDOUT ");\n"; } } # # END: # __END__