#! /bin/perl
#
# html.pl	--- extract, normalize and hypertextify URLs in HTML files
#
# NB: If this package interests you, you should probably
# have a look at Roy Fielding's libwww-perl packages:
# http://www.ics.uci.edu/WebSoft/libwww-perl/
#
# This package and friends can be found at:
# http://cui_www.unige.ch/ftp/PUBLIC/oscar/scripts/README.html
# or ftp: cui.unige.ch:/PUBLIC/oscar/scripts/
#
# This package contains:
#
# html'href:	identify URLs and turn them into hypertext links
# html'abs:	convert relative URLs to absolute ones
# html'parse:	parse a URL and return ($type,$host,$port,$path,$request)
# html'hrefs:	return all hrefs in a page
# html'esc:	escape characters in plain text
#
# Oscar Nierstrasz 26/8/93 oscar@cui.unige.ch
#
# 15/01/94 -- fixed html'abs to handle HREFs without surrounding quotes
# 09/02/94 -- fixed html'abs to handle images as well
# 24/3/94 -- added hrefs (from `explore')
# 25/3/94 -- fixed hrefs to handle malformed HREFs (missing or extra quotes)
# 25/3/94 -- fixed abs to leave internal refs alone!
# 25/3/94 -- moved to separate package
# 13/4/94 -- repaired abs() to handle HREFs with missing quotes
# 25/5/94 -- modified parse() to handle empty protocol type
#
# BUGS: Craig Allen <ccount@mit.edu> points out that binary files
# that contain "<a" will be damaged by html'abs ...
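#
# A typical use (an illustrative sketch, not part of the package;
# assumes the caller has read an HTML page into $page and knows its
# URL in $url):
#
#	require 'html.pl';
#	@links = &html'hrefs($page);		# all HREF/SRC URLs, as written
#	$page = &html'abs($url,$page);		# make relative URLs absolute
#	while (<>) { &html'href; print; }	# hypertextify URLs in plain text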

package html;

# Try to recognize URLs and ftp file identifiers and convert them into HREFs:
# This routine is evolving.  The patterns are not perfect.
# This is really a parsing problem, and not a job for perl ...
# It is also generally impossible to distinguish ftp site names
# from newsgroup names if the ":<directory>" is missing.
# An arbitrary file name ("runtime.pl") can also be confused.
sub href {
	# study; # doesn't speed things up ...

	# to avoid special cases for beginning & end of line
	s|^|#|; s|$|#|;

	# URLs: <service>:<rest-of-url>
	s|(news:[\w.]+)|<A HREF="$&">$&</A>|g;
	s|(http:[\w/.:+\-~#?]+)|<A HREF="$&">$&</A>|g;
	s|(file:[\w/.:+\-]+)|<A HREF="$&">$&</A>|g;
	s|(ftp:[\w/.:+\-]+)|<A HREF="$&">$&</A>|g;
	s|(wais:[\w/.:+\-]+)|<A HREF="$&">$&</A>|g;
	s|(gopher:[\w/.:+\-]+)|<A HREF="$&">$&</A>|g;
	s|(telnet:[\w/.:+\-]+)|<A HREF="$&">$&</A>|g;
	# s|(\w+://[\w/.:+\-]+)|<A HREF="$&">$&</A>|g;

	# catch some newsgroups to avoid confusion with sites:
	s|([^\w\-/.:@>])(alt\.[\w.+\-]+[\w+\-]+)|$1<A HREF="news:$2">$2</A>|g;
	s|([^\w\-/.:@>])(bionet\.[\w.+\-]+[\w+\-]+)|$1<A HREF="news:$2">$2</A>|g;
	s|([^\w\-/.:@>])(bit\.[\w.+\-]+[\w+\-]+)|$1<A HREF="news:$2">$2</A>|g;
	s|([^\w\-/.:@>])(comp\.[\w.+\-]+[\w+\-]+)|$1<A HREF="news:$2">$2</A>|g;
	s|([^\w\-/.:@>])(gnu\.[\w.+\-]+[\w+\-]+)|$1<A HREF="news:$2">$2</A>|g;
	s|([^\w\-/.:@>])(misc\.[\w.+\-]+[\w+\-]+)|$1<A HREF="news:$2">$2</A>|g;
	s|([^\w\-/.:@>])(news\.[\w.+\-]+[\w+\-]+)|$1<A HREF="news:$2">$2</A>|g;
	s|([^\w\-/.:@>])(rec\.[\w.+\-]+[\w+\-]+)|$1<A HREF="news:$2">$2</A>|g;
	s|([^\w\-/.:@>])(ch\.[\w.+\-]+[\w+\-]+)|$1<A HREF="news:$2">$2</A>|g;

	# FTP locations (with directory):
	# anonymous@<site>:<path>
	s|(anonymous@)([a-zA-Z][\w.+\-]+\.[a-zA-Z]{2,}):(\s*)([\w\d+\-/.]+)|$1<A HREF="file://$2/$4">$2:$4</A>$3|g;
	# ftp@<site>:<path>
	s|(ftp@)([a-zA-Z][\w.+\-]+\.[a-zA-Z]{2,}):(\s*)([\w\d+\-/.]+)|$1<A HREF="file://$2/$4">$2:$4</A>$3|g;
	# <site>:<path>
	s|([^\w\-/.:@>])([a-zA-Z][\w.+\-]+\.[a-zA-Z]{2,}):(\s*)([\w\d+\-/.]+)|$1<A HREF="file://$2/$4">$2:$4</A>$3|g;
	# NB: don't confuse an http server with a port number for
	# an FTP location!
	# internet number version: <internet-num>:<path>
	s|([^\w\-/.:@])(\d{2,}\.\d{2,}\.\d+\.\d+):([\w\d+\-/.]+)|$1<A HREF="file://$2/$3">$2:$3</A>|g;

	# just the site name (assume two dots): <site>
	s|([^\w\-/.:@>])([a-zA-Z][\w+\-]+\.[\w.+\-]+\.[a-zA-Z]{2,})([^\w\d\-/.:!])|$1<A HREF="file://$2">$2</A>$3|g;
	# NB: can be confused with newsgroup names!
	# <site>.com has only one dot:
	s|([^\w\-/.:@>])([a-zA-Z][\w.+\-]+\.com)([^\w\-/.:])|$1<A HREF="file://$2">$2</A>$3|g;

	# just internet numbers:
	s|([^\w\-/.:@])(\d+\.\d+\.\d+\.\d+)([^\w\-/.:])|$1<A HREF="file://$2">$2</A>$3|g;
	# unfortunately inet numbers can easily be confused with
	# European telephone numbers ...

	s|^#||; s|#$||;
}
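
# For example (a sketch; href works in place on $_):
#
#	$_ = "Get the scripts from cui.unige.ch:PUBLIC/oscar/scripts\n";
#	&href;
#	# $_ now contains:
#	# Get the scripts from <A HREF="file://cui.unige.ch/PUBLIC/oscar/scripts">cui.unige.ch:PUBLIC/oscar/scripts</A>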

# convert relative http URLs to absolute ones:
# BUG: minor problem with binary files containing "<a" ...
sub abs {
	local($url,$page) = @_;
	# localize the working variables so we don't clobber package globals:
	local($type,$host,$port,$path,$request) =
		&parse(undef,undef,undef,undef,$url);
	local($root) = "http://$host:$port";
	local(@hrefs) = split(/<[Aa]/,$page);
	local($n) = $[;
	while (++$n <= $#hrefs) {
		# absolute URLs ok:
		($hrefs[$n] =~ m|href\s*=\s*"?http://|i) && next;
		($hrefs[$n] =~ m|href\s*=\s*"?\w+:|i) && next;
		# internal refs ok:
		($hrefs[$n] =~ m|href\s*=\s*"?#|i) && next;
		# relative URL from root:
		($hrefs[$n] =~ s|href\s*=\s*"?/([^">]*)"?|HREF="$root/$1"|i) && next;
		# relative from $path:
		$hrefs[$n] =~ s|href\s*=\s*"?([^/"][^">]*)"?|HREF="$root$path$1"|i;
		# collapse relative paths:
		$hrefs[$n] =~ s|/\./|/|g;
		while ($hrefs[$n] =~ m|/\.\./|) {
			$hrefs[$n] =~ s|[^/]*/\.\./||;
		}
	}
	# Actually, this causes problems for binary files
	# that just happen to include the sequence "<a"!!!
	$page = join("<A",@hrefs);
	# duplicate code could be merged into a subroutine ...
	@hrefs = split(/<IMG/i,$page);
	$n = $[;
	while (++$n <= $#hrefs) {
		# absolute URLs ok:
		($hrefs[$n] =~ m|src\s*=\s*"?http://|i) && next;
		($hrefs[$n] =~ m|src\s*=\s*"?\w+:|i) && next;
		# relative URL from root:
		($hrefs[$n] =~ s|src\s*=\s*"?/([^">]*)"?|SRC="$root/$1"|i) && next;
		# relative from $path:
		$hrefs[$n] =~ s|src\s*=\s*"?([^/"][^">]*)"?|SRC="$root$path$1"|i;
		# collapse relative paths:
		$hrefs[$n] =~ s|/\./|/|g;
		while ($hrefs[$n] =~ m|/\.\./|) {
			$hrefs[$n] =~ s|[^/]*/\.\./||;
		}
	}
	join("<IMG",@hrefs);
}
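
# For example (a sketch; $page holds the HTML of the named URL):
#
#	$page = &abs("http://cui.unige.ch/w3/docs/index.html",$page);
#	# a relative link such as <A HREF="intro.html"> in $page becomes
#	# <A HREF="http://cui.unige.ch:80/w3/docs/intro.html">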

# convert a URL to ($type,$host,$port,$path,$request)
# given the previous type, host, port and path, will handle relative URLs
# NB: May need special processing for different service types (e.g., news)
sub parse {
	local($type,$host,$port,$path,$url) = @_;
	local($request);	# otherwise leaks out as a package global

	# both type and ":" may be missing; could have multiple ":" ...
	if ($url =~ m|^(\w*):*//(.*)|) {
		$type = $1;
		if ($type eq "") { $type = "http"; }
		$host = $2;
		$port = &defport($type);
		$request = "/";	# default
		($host =~ s|^([^/]+)(/.*)$|$1|) && ($request = $2);
		($host =~ s/:(\d+)$//) && ($port = $1);
		($path = $request) =~ s|[^/]*$||;
	}
	else {
		# relative URL of form "<type>:<request>"
		if ($url =~ /^(\w+):+(.*)/) {
			$type = $1;
			$request = $2;
		}
		# relative URL of form "<request>"
		else { $request = $url; }
		$request =~ s|^$|/|;
		$request =~ s|^([^/])|$path$1|; # relative path
		$request =~ s|/\./|/|g;
		while ($request =~ m|/\.\./|) {
			$request =~ s|[^/]*/\.\./||;
		}
		# assume previous host & port:
		unless ($host) {
			# $! = "html'parse: no host for $url\n";
			print STDERR "html'parse: no host for $url\n";
			return (undef,undef,undef,undef,undef);
		}
	}
	($type,$host,$port,$path,$request);
}
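
# For example (a sketch):
#
#	($type,$host,$port,$path,$request) =
#		&parse(undef,undef,undef,undef,"http://cui.unige.ch/w3/oscar.html");
#	# yields ("http","cui.unige.ch",80,"/w3/","/w3/oscar.html")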

# default ports
sub defport {
	local($type) = @_;
	if ($type eq "http") { 80; }
	elsif ($type eq "gopher") { 70; }
	else { undef; }
}
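
# For example (a sketch):
#
#	$port = &defport("http");	# 80; unknown types yield undef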

# return a list of all the HREFs (and IMG SRCs) in a page
sub hrefs {
	local($page) = @_;
	$page =~ s/^[^<]+</</;
	$page =~ s/>[^<]*</></g;
	$page =~ s/>[^<]+$/>/;
	$page =~ s/<a[^>]*href\s*=\s*"?([^">]+)[^>]*>/$1\n/gi;
	$page =~ s/<img[^>]*src\s*=\s*"?([^">]+)[^>]*>/$1\n/gi;
	$page =~ s/<[^>]*>//g;
	$page =~ s/#.*//g;
	$page =~ s/\n+/\n/g;
	split(/\n/,$page);
}
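
# For example (a sketch):
#
#	@links = &hrefs('<A HREF="intro.html">Intro</A> <IMG SRC="logo.gif">');
#	# @links is ("intro.html","logo.gif")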

# escape HTML special characters in plain text (works on $_):
sub esc {
	s/&/&amp;/g;
	s/</&lt;/g;
	s/>/&gt;/g;
}
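
# For example (a sketch; esc works in place on $_):
#
#	$_ = "if (a < b && b > c) ...";
#	&esc;
#	# $_ is now: if (a &lt; b &amp;&amp; b &gt; c) ...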

1;

