# $Id: wwwhtml.pl,v 0.14 1994/07/22 09:57:52 fielding Exp $
# ---------------------------------------------------------------------------
# wwwhtml: A package for parsing pages of HyperText Markup Language (HTML)
# for a World-Wide Web spider.
#
# This package has been developed by Roy Fielding
# as part of the Arcadia project at the University of California, Irvine.
# Each routine in this package has been derived from the work of multiple
# authors -- those that are known are listed above the respective routines.
# The routines have been changed substantially, so don't blame them for bugs.
# It is distributed under the Artistic License (included with your Perl
# distribution files).
#
# 13 Jun 1994 (RTF): Initial version
# 07 Jul 1994 (RTF): Removed buggy attempt to extract comments.
# Updated META parsing to reflect HTML 2.0 proposal.
# 20 Jul 1994 (RTF): Fix segmentation fault if we are fooled into trying
# to extract links from a non-html document.
# 22 Jul 1994 (RTF): Fixed parsing of href's that had a new-line after
# the quote mark, causing an extra space to precede the
# extracted URL, which in turn created a black hole.
# Also added code to extract and change the base URL
# if there exists a <BASE> element.
#
# If you have any suggestions, bug reports, fixes, or enhancements,
# send them to Roy Fielding at <fielding@ics.uci.edu>.
# ---------------------------------------------------------------------------
# Some of these routines are reduced versions of those distributed by
# Oscar Nierstrasz from CUI, University of Geneva.
# See Oscar Nierstrasz's software pages at CUI for more info.
# ===========================================================================
require "wwwurl.pl";
package wwwhtml;
# This is just a minimal start. Eventually it would be nice to have
# a complete HTML parser with the ability to output formatted text.
# PostScript output would be even better (yeah, I know, keep dreaming).
# ===========================================================================
# extract_links(): Extract the document metainformation and links from a
# page of HTML content and return them for use by a traversal program.
#
# The parameters are:
# $base = the URL of this document (for fixing relative links);
# *headers = the %table of document metainformation (should already
# contain some information from the HTTP response headers);
# *content = the $page of HTML (this will be DESTROYED in processing);
# *links = the return @queue of absolute child URLs (w/o query or tag);
# *labs = the return @queue of absolute child URLs (with query or tag);
# *lorig = the return @queue of original child HREFs;
# *ltype = the return @queue of link types, where
# 'L' = normal link,
# 'I' = IMG source,
# 'Q' = link or source containing query information,
# 'R' = redirected link (used elsewhere).
#
# Uses ideas from Oscar's hrefs() dated 13/4/94, distributed with his
# CUI perl scripts for the WWW.
#
sub extract_links
{
    local($base, *headers, *content, *links, *labs, *lorig, *ltype) = @_;
    local($scheme, $host, $port, $path, $query, $frag);
    local($link, $orig, $elem, $type);

    $content =~ s/\s+/ /g;  # Collapse all whitespace and newlines to spaces

    # If the document contains a <BASE> element, its HREF replaces the
    # passed-in base URL for resolving relative links.
    # NOTE: we must test the success of the substitution itself -- testing
    # $1 after a failed s/// would see a stale capture left over from some
    # earlier successful match, silently installing a bogus base URL.
    if ($content =~ s#<base[^>]*href\s*=\s*"?\s*([^">\s]+)[^>]*>##i)
    {
        if ($1) { $base = $1; }
    }

    # Extract the <TITLE> text into the metainformation table (same
    # stale-$1 caveat as above).
    if ($content =~ s#<title[^>]*>([^<]+)</title[^>]*>##i)
    {
        if ($1) { $headers{'title'} = $1; }
    }

    $content =~ s/^[^<]+//;         # Remove everything before first element
    $content =~ s/>[^<]*/>/g;       # Remove everything between elements (text)
    $content =~ s/<[^>]*[^>]+$/>/;  # Remove everything after last element
    return unless ($content);       # Return if we removed everything

    # Isolate all META elements as text lines of the form "M name value".
    # Both the HTTP-EQUIV and NAME forms are accepted (HTML 2.0 proposal).
    $content =~ s/<meta[^>]*http-equiv\s*=\s*"?\s*([^">\s]+)[^>]*content\s*=\s*"?([^">]+)[^>]*>/M $1 $2\n/gi;
    $content =~ s/<meta[^>]*name\s*=\s*"?\s*([^">\s]+)[^>]*content\s*=\s*"?([^">]+)[^>]*>/M $1 $2\n/gi;

    # Isolate all A element HREFs as text lines of the form "A url"
    $content =~ s/<a[^>]*href\s*=\s*"?\s*([^">\s]+)[^>]*>/A $1\n/gi;

    # Isolate all IMG element SRCs as text lines of the form "I url"
    $content =~ s/<img[^>]*src\s*=\s*"?\s*([^">\s]+)[^>]*>/I $1\n/gi;

    $content =~ s/<[^>]*>//g;       # Remove all remaining elements
    $content =~ s/\n+/\n/g;         # Remove all blank lines

    #
    # Finally, construct the link queues from the remaining list.
    # Each surviving line is "A url", "I url", or "M name value".
    #
    foreach $elem (split(/\n/, $content))
    {
        next if ($elem eq '');      # A leading newline yields an empty
                                    # first element -- not a parse error.

        if ($elem =~ /^([AI])\s+(\S*)$/)
        {
            # Anchor HREFs and IMG SRCs are handled identically except
            # for the recorded link type.  Copy the captures before any
            # subroutine call can clobber $1/$2.
            $type = ($1 eq 'A') ? 'L' : 'I';
            $orig = $2;
            push(@lorig, $orig);
            $link = &wwwurl'absolute($base, $orig);
            push(@labs, $link);
            ($scheme,$host,$port,$path,$query,$frag) = &wwwurl'parse($link);
            push(@ltype, ($query ? 'Q' : $type));
            # The plain-link queue gets the URL stripped of query and tag
            push(@links, &wwwurl'compose($scheme,$host,$port,$path,'',''));
        }
        elsif ($elem =~ /^M\s+(\S+)\s+(.*)$/)
        {
            local($name, $value) = ($1, $2);
            $name =~ tr/A-Z/a-z/;   # Header names are case-insensitive
            $headers{$name} = $value;
        }
        else { warn "A mistake was made in link extraction from $base"; }
    }
}
1;