#!/opt/bin/perl5
#
# Needs Perl 5.001 or higher! (For non-greedy pattern matching)
#
###############################################################################
# #
# Program to traverse a tree of HTML pages to perform certain actions on them #
# #
###############################################################################
# #
# Version 1.2 - Written 03.02.96 by Steffen Beyer #
# #
###############################################################################
# #
# Copyright (C) 1995 by software design & management GmbH & Co. KG #
# #
###############################################################################
#
# User configurable constants:
#
###############################################################################
#
$hostpattern = '\bsun\b|\bsunbi1\b|\bwww\.bi\.sdm\b|\bsww1\b'; # Pattern!
#
$hostaddress = 'sww1.sdm.de';
#
$rootdir = '/u/www/.www';
$rootpage = 'index.html';
#
# Select (single/double/multiple) line anchors (...)
# by (not changing/setting to ""/undef'ing) '$/':
#
undef $/;
#
###############################################################################
#
# Some internal variables:
#
$version = 'version 1.2';
#
$self = $0;
$self =~ s!^.*/!!;
#
$separator = ('-' x 78) . "\n";
#
# Check for correct Perl version:
#
if ($] < 5.000)
{
warn $separator;
warn "$self $version needs Perl version 5.000 or higher!\n";
die $separator;
}
if ($] < 5.001)
{
warn $separator;
warn "$self $version was developed with Perl version 5.001m.\n";
warn "Your Perl version is: $]. Use at your own risk!\n";
warn $separator;
}
#
# Slurp passwd file for user home directories:
#
setpwent;
#
while (($user,$dir) = (getpwent)[0,7])
{
if ($homedir{$user} eq "")
{
$dir =~ s!/$!!;
$homedir{$user} = $dir;
}
}
#
endpwent;
#
# Normalize root directory and page:
#
$rootdir =~ s!/$!!;
$rootpage =~ s!/$!!;
$rootpage =~ s!^.*/!!;
#
$logfile = "$rootdir/$self.log";
#
# Rename logfile if it already exists:
#
if (-e $logfile)
{
$counter = '000';
$backfile = "$logfile.$counter";
while (-e $backfile)
{
$counter++;
$backfile = "$logfile.$counter";
}
unless (rename($logfile,$backfile))
{
die "$self: can't rename '$logfile' to '$backfile': $!\n";
}
unless (($log_uid,$log_gid) = (stat($backfile))[4,5])
{
die "$self: can't stat '$backfile': $!\n";
}
}
else
{
unless (($log_uid,$log_gid) = (stat($rootdir))[4,5])
{
die "$self: can't stat '$rootdir': $!\n";
}
}
#
# Open logfile:
#
unless (open(LOGFILE, ">$logfile"))
{
die "$self: can't write '$logfile': $!\n";
}
#
# Write header information:
#
$date = `date`;
print LOGFILE $separator;
print LOGFILE "$self $version $date";
print LOGFILE $separator;
#
# Traverse tree:
#
&traverse("$rootdir/$rootpage","");
#
# Write trailer information:
#
$date = `date`;
print LOGFILE "$self $version $date";
print LOGFILE $separator;
#
# Close logfile:
#
close(LOGFILE);
#
# Restore logfile ownership:
#
if (chown($log_uid,$log_gid,$logfile) != 1)
{
die "$self: can't chown '$logfile'!\n";
}
#
# Done:
#
exit;
#
# Subroutine that does it all:
#
sub traverse
{
local($thispage,$prevpage) = @_;
local(@nextpages);
$thispage =~ s!/$!!;
$thisdir = $thispage;
$thisdir =~ s!/[^/]*$!!;
print LOGFILE "visiting '$thispage'...\n";
print LOGFILE "(coming from '$prevpage')\n";
unless (open(THISPAGE, "<$thispage"))
{
print LOGFILE "can't read '$thispage': $!\n";
print LOGFILE $separator;
return;
}
$tempfile = "$thispage.new.$$";
unless (open(TEMPFILE, ">$tempfile"))
{
print LOGFILE "can't write '$tempfile': $!\n";
print LOGFILE $separator;
return;
}
$filechanged = 0;
while ($line = )
{
$left = "";
$rest = $line;
$linechanged = 0;
while ($rest =~ m!(.*?)!i)
{
$left .= $`;
$rest = $';
$link = $1;
$text = $2;
$http = '';
$host = '';
$port = '';
$file = '';
$args = '';
$follow = 0;
$linkchanged = 0;
#print LOGFILE "found hyperlink '$link' ($text).\n";
if (($link !~ m!^\s*\S+\s*:!) ||
(($link =~ m!^\s*HTTP\s*:!i) && ($http = 'http:')))
{
if ($link =~ m!^\s*
(?:HTTP\s*:\s*)?
//([^/:]*)
((?::\d+)?)
/?(\S*?)
((?:"?\s.*?)?)
$!ix)
{
$host = $1;
$port = $2;
$file = $3;
$args = $4;
$follow = ($host =~ m!$hostpattern!io);
if (($follow) && ($host ne $hostaddress))
{
$linkchanged = 1;
$link = "${http}//${hostaddress}${port}/";
}
else
{
$link = "${http}//${host}${port}/";
}
$nextdir = $rootdir;
}
elsif ($link =~ m!^\s*
(?:HTTP\s*:\s*)?
(\S+)
((?:\s.*?)?)
$!ix)
{
$file = $1;
$args = $2;
$follow = 1;
if ($file =~ m!^/!)
{
$link = "$http/";
$file =~ s!^/!!;
$nextdir = $rootdir;
}
else
{
$link = $http;
$nextdir = $thisdir;
}
}
else { }
}
if ($follow)
{
#print LOGFILE "following hyperlink '${link}$file'.\n";
$nextdir =~ s!/$!!;
$nextpage = $file;
if ($nextpage =~ m!^([^#]*)#!)
{
$nextpage = $1;
}
if (($nextpage =~ m!^~([^/]+)!) ||
($nextpage =~ m!^%7E([^/]+)!i))
{
$user = $1;
$path = $';
$path =~ s!^/!!;
if ($homedir{$user} ne "")
{
$nextpage = "$homedir{$user}/.www/$path";
}
else
{
$nextpage = "/u/$user/.www/$path";
}
}
else
{
$nextpage = "$nextdir/$nextpage";
}
$nextpage =~ s!/$!!;
#print LOGFILE "hyperlink points to '$nextpage'.\n";
$found = 0;
if (-f $nextpage)
{
$found = 1;
}
else
{
unless ((-d $nextpage) || ($nextpage =~ m!\.html?$!i))
{
$testpage = "$nextpage.htm";
if (-f $testpage)
{
$found = 1;
$linkchanged = 1;
$nextpage = $testpage;
$file .= ".htm";
}
else
{
$testpage = "$nextpage.html";
if (-f $testpage)
{
$found = 1;
$linkchanged = 1;
$nextpage = $testpage;
$file .= ".html";
}
}
}
}
if (($found) && (-T $nextpage) && !(-B $nextpage))
{
while (-l $nextpage)
{
$symlink = readlink($nextpage);
if ($symlink =~ m!^/!)
{
$nextpage = $symlink;
}
else
{
$nextpage =~ s![^/]+$!$symlink!;
}
}
if (($ino) = (stat($nextpage))[1])
{
if (!($visited{$ino}))
{
$visited{$ino} = 1;
push(@nextpages, $nextpage);
#print LOGFILE "pushing '$nextpage'.\n";
}
}
else
{
print LOGFILE "can't stat '$nextpage': $!\n";
}
}
unless (($found) || (-d $nextpage))
{
print LOGFILE "can't find '$nextpage' (${link}$file)!\n";
}
}
if ($linkchanged)
{
$linechanged = 1;
print LOGFILE qq|${text}\n|;
}
$left .= qq|${text}|;
}
$left .= $rest;
if ($linechanged)
{
$filechanged = 1;
print TEMPFILE $left;
}
else
{
print TEMPFILE $line;
}
}
close(TEMPFILE);
close(THISPAGE);
if ($filechanged)
{
unless (($uid,$gid) = (stat($thispage))[4,5])
{
print LOGFILE "can't stat old '$thispage': $!\n";
if (unlink($tempfile) != 1)
{
print LOGFILE "can't delete '$tempfile'!\n";
}
print LOGFILE $separator;
return;
}
$counter = '000';
$backfile = "$thispage.$counter";
while (-e $backfile)
{
$counter++;
$backfile = "$thispage.$counter";
}
unless (rename($thispage,$backfile))
{
print LOGFILE "can't rename '$thispage' to '$backfile': $!\n";
if (unlink($tempfile) != 1)
{
print LOGFILE "can't delete '$tempfile'!\n";
}
print LOGFILE $separator;
return;
}
print LOGFILE "renamed '$thispage' to '$backfile'.\n";
unless (rename($tempfile,$thispage))
{
print LOGFILE "can't rename '$tempfile' to '$thispage': $!\n";
print LOGFILE $separator;
return;
}
if (chown($uid,$gid,$thispage) != 1)
{
print LOGFILE "can't chown '$thispage'!\n";
print LOGFILE $separator;
return;
}
#print LOGFILE "chown $uid.$gid $thispage.\n";
if (chmod(0644,$thispage) != 1)
{
print LOGFILE "can't chmod '$thispage'!\n";
print LOGFILE $separator;
return;
}
if (($ino) = (stat($thispage))[1])
{
$visited{$ino} = 1;
}
else
{
print LOGFILE "can't stat new '$thispage': $!\n";
print LOGFILE $separator;
return;
}
}
else
{
if (unlink($tempfile) != 1)
{
print LOGFILE "can't delete '$tempfile'!\n";
print LOGFILE $separator;
return;
}
}
print LOGFILE $separator;
while (@nextpages)
{
&traverse(shift(@nextpages),$thispage);
# print STDOUT "&traverse(";
# print STDOUT shift(@nextpages);
# print STDOUT ",";
# print STDOUT $thispage;
# print STDOUT ");\n";
}
}
#
# END:
#
__END__