#!/usr/bin/perl -w
# rfcindex,v 1.7 2000/02/20 16:11:54 njh Exp

=head1 NAME

B<rfcindex> - add HTML markup to an rfc-index.txt file

=head1 SYNOPSIS

B<rfcindex> [B<--base>=base-URL] [options] [F<rfc-index.txt> F<...>]

=head1 README

Online RFC repositories typically contain a text file produced by the
RFC Editor, F<rfc-index.txt>, which lists the RFCs currently in
existence. rfcindex is a Perl script that reads the plain text index
file and outputs an HTML index file. The RFC number of each citation
becomes a hyperlink to the text of that RFC (if an online version of
the RFC exists), and the cross references between citations (obsoletes,
obsoleted by, updates, updated by) become hyperlinks within the HTML
index.

=head1 OPTIONS

=over 5

=item B<--base=base-URL>

Hyperlinks to RFC texts will be relative to the base URL specified.

=item B<--notable>

Select alternative markup which avoids table tags and produces a file
which can be rendered in an incremental fashion. By default the HTML
markup applied to the index uses a table, which can result in HTML
which is quite slow for some browsers to lay out (particularly if the
file is being accessed over a network).

=item B<--nodate>

Suppress the date line which is normally added to the HTML output.

=item B<--nocredit>

Suppress the hyperlink to the home page of this program, which is
normally added to the HTML output.

=item B<--version> or B<--help>

Prints version information, copyright, and a pointer towards the
documentation, then exits.

=back

=head1 EXAMPLES

To generate an HTML index for a locally held mirror of the RFC archive:

    rfcindex rfc-index.txt >index.html

To generate a locally held index to a remote RFC repository:

    lynx -source http://www.example.net/rfc/rfc-index.txt \
      | rfcindex --base http://www.example.net/rfc/ >rfc-index.html

(of course, this example assumes that you have the B<lynx> browser
available - if not then download a copy of the rfc-index.txt file from
the remote repository some other way and work on that).

=head1 BUGS

As part of the markup process, the title string in each citation is
emphasised. The regular expressions used to determine where the title
ends and the author list begins within the citation appear to produce
correct results for all RFCs which were listed at the time of writing,
but are not necessarily robust against all possible future title /
author combinations.

Comments, suggestions for improvement and bug reports are always
welcome (see email address below).

=head1 LICENSE

Copyright (C) 1999, 2000 Neil Hoggarth

This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or (at
your option) any later version.

This program is distributed in the hope that it will be useful, but
WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
General Public License for more details.

The GNU General Public License is available from:
http://www.gnu.org/copyleft/gpl.html

New versions of this script, and my other free software, will be made
available from: http://www.kernighan.demon.co.uk/software/

=head1 PREREQUISITES

This script requires the C<strict> and C<Getopt::Long> modules.

=head1 SCRIPT CATEGORIES

Web

=cut

use strict;
use Getopt::Long;

my $VERSION = 1.2;

# Markup fragments wrapped around the index as a whole and around each
# citation; which set is used depends on --table / --notable.
my ($mk_index_start, $mk_index_end, $mk_citation_start, $mk_title_start);

my $base       = "";
my $table      = 1;
my $showdate   = 1;
my $showcredit = 1;

# Getopt::Long has already reported the offending option on STDERR when
# GetOptions returns false, so just add a usage summary and stop rather
# than generating an index the caller did not ask for.
GetOptions("table!"  => \$table,
           "date!"   => \$showdate,
           "credit!" => \$showcredit,
           "base=s"  => \$base,
           "version" => \my $getversion,
           "help"    => \my $gethelp)
    or die "Usage: rfcindex [--base=base-URL] [--notable] [--nodate] "
         . "[--nocredit] [--version] [--help] [rfc-index.txt ...]\n";

# If a base URL has been provided and it isn't slash-terminated then
# make it so.
if ($base && ($base =~ /[^\/]$/)) {
    $base .= "/";
}

if ($table) {
    # One table row per citation: RFC number in the first cell, the rest
    # of the citation in the second. End tags for <td>/<tr> are optional
    # in HTML and are left out.
    $mk_index_start    = "<table>";
    $mk_index_end      = "</table>";
    $mk_citation_start = "<tr valign=\"top\"><td>";
    $mk_title_start    = "</td><td>";
}
else {
    # List markup, which browsers can render incrementally.
    $mk_index_start    = "<ul>";
    $mk_index_end      = "</ul>";
    $mk_citation_start = "<li>";
    $mk_title_start    = ": ";
}

# markup_citation($citation, $base, $citation_start, $title_start)
#
# Takes the plain text of one rfc-index citation (a paragraph starting
# with the RFC number) and returns it with HTML markup applied:
# metacharacters escaped, RFCnnnn cross-references linked, the title
# emphasised and the leading RFC number turned into an anchor.
#   $citation       - text of the citation
#   $base           - URL prefix for links to RFC texts ("" for relative)
#   $citation_start - markup emitted before the RFC number
#   $title_start    - markup emitted between the RFC number and the title
sub markup_citation {
    my ($citation, $base, $citation_start, $title_start) = @_;

    # Turn HTML metacharacters into the corresponding entities. The
    # ampersand has to go first so that the entities generated for the
    # angle brackets are not escaped a second time.
    $citation =~ s/&/&amp;/g;
    $citation =~ s/</&lt;/g;
    $citation =~ s/>/&gt;/g;

    # Turn all references of the form RFCxxxx into hyperlinks to the
    # anchor of the cited entry. This makes "active" the RFC references
    # in the various Obsoletes / Obsoleted / Updates / Updated notes.
    # Four *or more* digits, so that a five-digit RFC number is linked
    # as a whole rather than by its first four digits.
    $citation =~ s{RFC([0-9]{4,})}{<a href="#$1">RFC$1</a>}g;

    # Add emphasis to RFC titles, for improved readability.
    # On initial inspection of the citation format it would appear
    # that we can simply match all the characters following the
    # RFC number at the start of the line, up to and including the
    # first period character and call the enclosed text the
    # title. Unfortunately a few of the titles contain period
    # characters! We therefore have to attempt to identify the
    # sequence "period-whitespace-author". The authors are almost
    # always named with a leading initial, which is easy to regexp.
    # This still fails for a handful of cases: a few RFCs have no
    # author (eg. RFC212), and in some cases the "author" cited is
    # a working group, with no initial (eg. RFCs 1001, 1002) so if
    # the search for an author fails to find a match then we fall
    # back to the "search for the first period" method. Finally,
    # sandwiched in the middle, we special-case one non-standard
    # author string which happens to appear attached to an RFC
    # which features a period in the title (RFC2339).
    $citation =~ s{^([0-9]*\s)(.*?\.)([\s\n]+[A-Z]\.)}
                  {$1<strong>$2</strong>$3}s
        or $citation =~ s{^([0-9]*\s)(.*?\.)([\s\n]+The[\s\n]+Internet[\s\n]+Society)}
                         {$1<strong>$2</strong>$3}s
        or $citation =~ s{^([0-9]*\s)(.*?\.\s+)}
                         {$1<strong>$2</strong>}s;

    # The RFC number at the start of each entry gets made into an
    # anchor. If the entry is flagged "(Not online)" then it is just a
    # name anchor, to act as a target for cross-references. Otherwise
    # it also acts as a hyperlink to the actual text of the RFC.
    # The anchor name keeps the zero padding (it must match the RFCxxxx
    # cross-references); the file name drops it, on the assumption that
    # the repository names RFC texts rfcN.txt without padding.
    # Once the first substitution has fired the text no longer starts
    # with a digit, so the second one cannot match as well.
    $citation =~ s{^(0*)([1-9][0-9]*)\s(.*\(Not[\s\n]+online\))}
                  {$citation_start<a name="$1$2">$1$2</a>$title_start$3}s;
    $citation =~ s{^(0*)([1-9][0-9]*)\s}
                  {$citation_start<a name="$1$2" href="${base}rfc$2.txt">$1$2</a>$title_start}s;

    return $citation;
}

if ($getversion || $gethelp) {
    print "\nrfcindex, version $VERSION " . '(2000/02/20 16:11:54)' . "\n\n"
        . "Copyright (C) 1999, 2000 Neil Hoggarth. "
        . "Run\n'perldoc rfcindex' for usage and licensing information.\n\n";
}
else {
    # A previous release of this script put a <base> tag in
    # the <head> part of the output, in order to make the external RFC
    # document references relative to any URL requested using the
    # --base option. Unfortunately Netscape 4 appears to apply the
    # base URL to the internal, in-document cross-references; I think
    # that this is a Netscape bug (other browsers that I have tried do
    # "the right thing") but Netscape is sufficiently prevalent that
    # we ought to work around it. In this version of the script I have
    # dropped the <base> tag and the base URL is manually hacked into
    # the external references in the body, as and when they are made.
    print "<html>\n";
    print "<head>\n";
    print "<title>RFC Index</title>\n";
    print "</head>\n";
    print "<body>\n";
    print "<h1>RFC Index</h1>\n";

    # Label the output with the current date, ISO 8601 style. I'm not
    # going to worry over much about hours, minutes, seconds,
    # timezones, etc or the date stamp on the source file. The
    # objective is just to indicate to the reader whether they are
    # looking at something reasonably current or not.
    my ($mday, $mon, $year) = (gmtime)[3, 4, 5];
    $year += 1900;
    $mon++;

    if ($showdate || $showcredit) {
        print "<p>Converted";
        printf(": %04d-%02d-%02d", $year, $mon, $mday) if $showdate;
        print " using <a href=\"http://www.kernighan.demon.co.uk/software/\">"
            . "rfcindex</a>." if $showcredit;
        print "</p>\n";
    }

    print "$mk_index_start\n";

    {
        # Work "by paragraph", rather than "by line". The change to the
        # input record separator is confined to this block.
        local $/ = "";

        while (my $citation = <>) {
            # Skip this chunk, unless it happens to start with a four
            # digit number (this bypasses all the explanatory material
            # at the start of the file, and ensures that we only
            # process the citations themselves).
            next unless $citation =~ /^[0-9]{4}/;

            # Okay - lets see it!
            print markup_citation($citation, $base,
                                  $mk_citation_start, $mk_title_start);
        }
    }

    print "$mk_index_end\n";
    print "</body>\n";
    print "</html>\n";
}