#! /usr/unsupported/bin/perl
##---------------------------------------------------------------------------##
##  File:
##      man2html
##  Author:
##      Earl Hood       ehood@convex.com
##  Description:
##      man2html is a Perl program to convert formatted nroff output
##	to HTML.
##	
##	Recommended command-line options based on platform:
##
##	Platform		Options
##	---------------------------------------------------------------------
##	c2mp			<None, the defaults should be okay>
##	hp9000s700/800		-leftm 1 -topm 8
##	sun4			-sun
##	---------------------------------------------------------------------
##
##---------------------------------------------------------------------------##
##  Copyright (C) 1994  Earl Hood, ehood@convex.com
##
##  This program is free software; you can redistribute it and/or modify
##  it under the terms of the GNU General Public License as published by
##  the Free Software Foundation; either version 2 of the License, or
##  (at your option) any later version.
##  
##  This program is distributed in the hope that it will be useful,
##  but WITHOUT ANY WARRANTY; without even the implied warranty of
##  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
##  GNU General Public License for more details.
##  
##  You should have received a copy of the GNU General Public License
##  along with this program; if not, write to the Free Software
##  Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
##---------------------------------------------------------------------------##

require 'newgetopt.pl' || die "Unable to require newgetopt.pl\n";

## Program name: $0 with everything up to the last '/' removed.  It is
## interpolated into the usage message.
($PROG = $0) =~ s/.*\///;
## Version string reported by the usage message.
$VERSION = "2.0.0";

## Backspace character:  Used in overstriking detection
$bs = "\b";
## Regular expression to detect manpage references: a name (optionally
## wrapped in <EM>...</EM>) immediately followed by a parenthesized section
## made of digits plus at most one trailing word character, e.g. "ls(1)"
## or "<EM>open</EM>(2V)".  The outermost group captures the whole
## reference.
$mancrossref = '((<EM>)?\w[+_\w\.-]*(<\/EM>)?\((<EM>)?\d+\w?(<\/EM>)?\))';

##	Associative array of section titles and their HTML tag wrapper.
##	This list allows customization of what HTML tag is used for
##	a given section head.
##
##	The section title can be a regular expression.  Therefore, one must
##	be careful about quoting special characters.
##
##	Each key is matched against the start of a line, after the left
##	margin.  The value is the opening tag; the matching closing tag is
##	derived from it when the heading is written.
##
%SectionHead = (

    '\S.*OPTIONS.*', '<H2>',
    'AUTHORS?', '<H2>',
    'BUGS', '<H2>',
    'COMPATIBILITY', '<H2>',
    'DEPENDENCIES', '<H2>',
    'DESCRIPTION', '<H2>',
    'DIAGNOSTICS', '<H2>',
    'ENVIRONMENT', '<H2>',
    'ERRORS', '<H2>',
    'EXAMPLES', '<H2>',
    'EXTERNAL INFLUENCES', '<H2>',
    'FILES', '<H2>',
    'LIMITATIONS', '<H2>',
    'NAME', '<H2>',
    'NOTES?', '<H2>',
    'OPTIONS', '<H2>',
    'REFERENCES', '<H2>',
    'RETURN VALUE', '<H2>',
    'SECTION.*:', '<H2>',
    'SEE ALSO', '<H2>',
    'STANDARDS CONFORMANCE', '<H2>',
    'STYLE CONVENTION', '<H2>',
    'SYNOPSIS', '<H2>',
    'SYNTAX', '<H2>',
    'WARNINGS', '<H2>',
    '\s+Section.*:', '<H3>',

);
$HeadFallback = '<H2>';  # Fallback tag if above is not found.

##---------------------------------------------------------------------------
##-----------##
## MAIN Body ##
##-----------##
{
## Convert formatted (paginated) nroff output read from STDIN into HTML
## written to STDOUT.  Each input page is treated as $hdsz header lines,
## $txsz text lines and $ftsz footer lines; only the text lines are
## converted and printed.
&get_cli_opts();

## Check if processing a keyword search
if ($K && $CGIURL) { &man_k(); exit 0; }

local($line,$tmp,$i,$head,$preindent,$see_also,$do,$secth);
## Cross-references are linked until a section head switches this off
## (which only happens when both $CGIURL and $SEEALSO are set).
$see_also = 1;

print STDOUT "<HTML>\n";
print STDOUT "<HEAD>\n",
	     "<TITLE>$TITLE</TITLE>\n",
	     "</HEAD>\n"  if $TITLE;
print STDOUT "<BODY>\n";
print STDOUT "<H1>$TITLE</H1>\n",
	     "<HR>\n"  if $TITLE;
print STDOUT "<PRE>\n";
while(!eof(STDIN)) {
    ## Discard the running header of this page
    for ($i=0; $i < $hdsz; $i++) { <STDIN>; }
    for ($i=0; $i < $txsz; $i++) {
	## Input ended in the middle of a page: nothing left to convert
	last unless defined($_ = <STDIN>);

	## Try to check if line space is needed at page boundaries ##
	## $do is set when the previous text line was a non-blank page
	## boundary line (first or last text line of a page).  If this
	## boundary line is indented less than that one, a blank line is
	## emitted to stand in for the spacing lost with the margins.
	if (!$NODEPAGE && ($i==0 || $i==($txsz-1)) && !/^\s*$/) {
	    /^(\s*)/;  $tmp = length($1);
	    if ($do) {
		if ($tmp < $preindent) { print STDOUT "\n"; }
	    } else {
		$do = 1;
	    }
	    $preindent = $tmp;
	} else {
	    $do = 0;  $preindent = 0;
	}

	## Interpret line
	$line = $_;		# Raw copy, used for section head detection
	&entitize(*_);		# Convert [$<>] to entity references
	## Emphasize underlined words
	s/((_$bs[^_])+[\.\(\)_]?(_$bs[^_])+\)?)/&emphasize($1)/oge;
	$secth = 0;
	## Check for strong text and headings
	if ($SUN || /.$bs./o) {
	    if (!$NOHEADS) {
		$line =~ s/(.$bs)+//go;		# Strip overstrikes
		$tmp = $HeadFallback;
		foreach $head (keys %SectionHead) {
		    if ($line =~ /^$leftm$head/) {
			$tmp = $SectionHead{$head};
			$secth = 1;
			last;
		    }
		}
		## A known section title, or any text that starts right at
		## the left margin, is written as a heading.
		if ($secth || $line =~ /^$leftm\S/o) {
		    if ($CGIURL && $SEEALSO) {
			if ($line =~ /SEE ALSO/o) { $see_also = 1; }
			else { $see_also = 0; }
		    }
		    ## Remove only a trailing newline; chop would eat a real
		    ## character if the last input line is unterminated.
		    $line =~ s/\n$//;
		    ## The heading is built from the raw copy, so it has not
		    ## been through entitize(); escape [&<>] here.
		    &htmlize(*line);
		    $_ = $tmp . $line . $tmp;
		    ## Turn the trailing copy of the tag into a closing tag
		    s%<([^>]*)>$%</$1>%;
		    $_ = "\n</PRE>\n" . $_ . "<PRE>\n";
		} else {
		    s/(((.$bs)+.)+)/&strongize($1)/oge;
		}
	    } else {
		s/(((.$bs)+.)+)/&strongize($1)/oge;
	    }
	}
	## Create anchor links for manpage references
	s/$mancrossref/&make_xref($1)/oge if $CGIURL && $see_also;
	print STDOUT;
    }
    ## Discard the running footer of this page
    for ($i=0; $i < $ftsz; $i++) { <STDIN>; }
}
print STDOUT "</PRE>\n",
	     "</BODY>\n",
	     "</HTML>\n";
exit 0;
}  ## End Main
##---------------------------------------------------------------------------
##	get_cli_opts() parses the command line with NGetOpt() and sets the
##	global configuration variables read by the rest of the program:
##	$pgsz, $hdsz, $ftsz, $txsz, $leftmsz, $leftm, $TITLE, $NOHEADS,
##	$NODEPAGE, $SUN, $CGIURL, $SEEALSO and $K.  It calls usage() (which
##	exits) on a bad option or on -help, and dies if the page geometry
##	leaves no text lines.  If -headmap is given, that file is loaded
##	with require; a failure to load it produces a warning only.
##
sub get_cli_opts {
    &usage() unless
    &NGetOpt(
	"botm=i",	# Number of lines for bottom margin (def: 7)
	"headmap=s",	# Filename of user section head map file
	"leftm=i",	# Character width of left margin (def: 0)
	"nodepage",	# Do not remove pagination lines
	"noheads",	# Do not detect for section heads
	"pgsize=i",	# Number of lines in a page (def: 66)
	"title=s",	# Title of manpage (def: Not defined)
	"topm=i",	# Number of lines for top margin (def: 7)
	"sun",		# Section heads are not overstriked in input
	"cgiurl=s",	# CGI URL for linking to other manpages
	"seealso",	# Link to other manpages only in the SEE ALSO section
	"k",		# Process input from 'man -k' output.
	"help"		# Short usage message
    );
    &usage() if defined($opt_help);

    $pgsz = ($opt_pgsize ? $opt_pgsize : 66);
    ## The main loop tests $NODEPAGE to decide whether to do its
    ## page-boundary spacing, so it has to be set here.
    $NODEPAGE = (defined($opt_nodepage) ? 1 : 0);
    if ($NODEPAGE) {
	$hdsz = 0;
	$ftsz = 0;
    } else {
	$hdsz = (defined($opt_topm) ? $opt_topm : 7);
	$ftsz = (defined($opt_botm) ? $opt_botm : 7);
    }
    $txsz = $pgsz - ($hdsz + $ftsz);
    ## With no text lines per page the main loop would print nothing, and
    ## with no margins either it would never read any input at all.
    die "$PROG: page size ($pgsz) leaves no room for text between the margins\n"
	if $txsz < 1;
    $leftmsz = (defined($opt_leftm) ? $opt_leftm : 0);
    $leftm = ' ' x $leftmsz;
    $TITLE = ($opt_title ? $opt_title : "");
    $NOHEADS = (defined($opt_noheads) ? 1 : 0);
    $SUN = (defined($opt_sun) ? 1 : 0);
    $CGIURL = ($opt_cgiurl ? $opt_cgiurl : "");
    $SEEALSO = ($opt_seealso ? 1 : 0);
    $K = ($opt_k ? 1 : 0);

    if (defined($opt_headmap)) {
	## "require FILE || warn ..." parses as require(FILE || warn ...),
	## so the warning could never fire and a bad file was fatal.  Trap
	## the failure and carry on with the built-in section head map.
	eval 'require $opt_headmap;';
	warn "Unable to read $opt_headmap\n", $@  if $@;
    }
}
##---------------------------------------------------------------------------
##	emphasize() removes the overstrike sequences (any character followed
##	by a backspace) from its argument and returns what is left wrapped
##	in <EM>...</EM>.
##
sub emphasize {
    local($plain) = @_;
    $plain =~ s/(.$bs)+//go;
    "<EM>" . $plain . "</EM>";
}
##---------------------------------------------------------------------------
##	strongize() removes the overstrike sequences (any character followed
##	by a backspace) from its argument and returns what is left wrapped
##	in <STRONG>...</STRONG>.
##
sub strongize {
    local($plain) = @_;
    $plain =~ s/(.$bs)+//go;
    "<STRONG>" . $plain . "</STRONG>";
}
##---------------------------------------------------------------------------
##	entitize() converts the HTML special characters [&<>] of a line into
##	entity references, in place.  The line is passed as a typeglob
##	(e.g. &entitize(*_)).
##
##	Special characters that are part of an overstrike sequence are
##	replaced, through strike(), by an overstruck entity so that the
##	later emphasis/strong detection still sees them as overstruck.
##	Every other special character becomes a plain entity.
##
sub entitize {
    local(*txt) = shift;
    local($len, $idx, $ch, $out);

    ## Check for special characters in overstrike text ##
    $txt =~ s/_$bs\&/&strike('_', '&')/geo;
    $txt =~ s/_$bs</&strike('_', '<')/geo;
    $txt =~ s/_$bs>/&strike('_', '>')/geo;

    $txt =~ s/(\&$bs)+\&/&strike('&', '&')/geo;
    $txt =~ s/(<$bs)+</&strike('<', '<')/geo;
    $txt =~ s/(>$bs)+>/&strike('>', '>')/geo;

    ## Check for special characters in regular text ##
    ## The line is scanned one character at a time so that the neighbours
    ## of a special character are only looked at, never consumed.  A
    ## pattern of the form ([^bs])X([^bs]) swallows both neighbours and
    ## therefore misses a special character at the start of the line,
    ## right next to another one ("&&", ">>"), or one character after
    ## another one ("a<b<c").  Characters touching a backspace belong to
    ## an overstrike sequence and are left alone.
    $len = length($txt);
    $out = '';
    for ($idx = 0; $idx < $len; $idx++) {
	$ch = substr($txt, $idx, 1);
	if (($ch eq '&' || $ch eq '<' || $ch eq '>')
	    && ($idx == 0 || substr($txt, $idx - 1, 1) ne $bs)
	    && ($idx == $len - 1 || substr($txt, $idx + 1, 1) ne $bs)) {
	    $ch = ($ch eq '&') ? '&amp;' : (($ch eq '<') ? '&lt;' : '&gt;');
	}
	$out .= $ch;
    }
    $txt = $out;
}
##---------------------------------------------------------------------------
##	strike converts HTML special characters in overstriked text
##	into entity references.  The entities are overstriked so
##	strongize() and emphasize() will recognize the entity to be
##	wrapped in <STRONG>/<EM> tags.
##
sub strike {
    ## $w    : '_' for underlined text; anything else means the entity
    ##         characters are overstruck with themselves.
    ## $char : the special character to convert, one of [&<>].
    ## Returns the entity reference for $char with every character of it
    ## overstruck.  For any other $char a warning is issued and the
    ## return value is undefined.
    local($w, $char) = @_;
    local($ret, $entity, $c);

    $entity = '&amp;'  if $char eq '&';
    $entity = '&lt;'   if $char eq '<';
    $entity = '&gt;'   if $char eq '>';

    if (!defined($entity)) {
	warn qq|Unrecognized character, "$char", passed to strike()\n|;
    } else {
	## Each entity character is preceded by "<striker><backspace>",
	## where the striker is '_' or the character itself.
	foreach $c (split(//, $entity)) {
	    $ret .= ($w eq '_' ? '_' : $c) . $bs . $c;
	}
    }
    $ret;
}
##---------------------------------------------------------------------------
##	make_xref() was originally added to man2html by Maurice Cinquini
##	<mauricec@tplrd.tpl.oz.au> for use in the SEE ALSO section.  The
##	code has been modified to handle more general cases, and the routine
##	is called for all manpage cross-references throughout.
##
##	Specifically, I modified it to support the user's URL template for
##	linking to other manpages, support for [+_,-] in the title name,
##	and to handle <EM> tagging.
##
sub make_xref {
    ## Argument: the text of one manpage reference, e.g. "ls(1)", possibly
    ## carrying <EM> tags.  Returns that text wrapped in an anchor.
    local($str) = @_;
    local($em,$title,$cem,$em2,$section,$subsection,$cem2);

    ## Pull the reference apart.  $title, $section and $subsection must
    ## keep these names: they are what the $CGIURL template refers to.
    if ($str =~ /(<EM>)?(\w[+_\.\w-]*)(<\/EM>)?\((<EM>)?(\d+)(\w?)(<\/EM>)?\)/) {
	($em,$title,$cem,$em2,$section,$subsection,$cem2) =
	    ($1,$2,$3,$4,$5,$6,$7);
    }

    ## NOTE(review): $CGIURL is evaluated as Perl source (a double-quoted
    ## string) so that the variables in it are expanded.  It must only
    ## ever come from a trusted source.
    local($href) = (eval "\"$CGIURL\"");
    '<A HREF="' . $href . '">' . $str . '</A>';
}
##---------------------------------------------------------------------------
##	man_k() processes a keyword search.
##
sub man_k {
    ## Reads 'man -k' style lines ("name[, name...] (section) - text")
    ## from STDIN and writes an HTML page to STDOUT that lists the
    ## entries grouped by section, each group printed by
    ## print_mank_sec().  Lines without a "(<digit>[<char>]) -" part
    ## are ignored.
    ##
    ## The per-section tables are local associative arrays keyed by the
    ## normalized reference list; they are handed to print_mank_sec() as
    ## typeglobs.
    local($line,$refs,$section,$subsection,$desc,
	  %Sec1, %Sec1sub, %Sec2, %Sec2sub, %Sec3, %Sec3sub,
	  %Sec4, %Sec4sub, %Sec5, %Sec5sub, %Sec6, %Sec6sub,
	  %Sec7, %Sec7sub, %Sec8, %Sec8sub, %Sec9, %Sec9sub,
	  %SecN, %SecNsub, %SecNsec);

    print STDOUT "<HTML>\n";
    print STDOUT "<HEAD>\n",
		 "<TITLE>$TITLE</TITLE>\n",
		 "</HEAD>\n"  if $TITLE;
    print STDOUT "<BODY>\n";
    print STDOUT "<H1>$TITLE</H1>\n",
		 "<HR>\n"  if $TITLE;
    while ($line = <STDIN>) {
	next if $line !~ /\(\d\w?\)\s*-/;
	($refs,$section,$subsection,$desc) =
	    $line =~ /^\s*(.*)\((\d)(\w?)\)\s*-\s*(.*)$/;
	$refs =~ s/\s(and|or)\s/,/gi;	# Convert and/or to commas
	$refs =~ s/\s//g;		# Remove all whitespace
	$refs =~ s/,/, /g;		# Put space after comma
	&htmlize(*desc);		# Check for special chars in desc
	$desc =~ s/^(.)/\U$1/;		# Uppercase first letter in desc

	if ($section eq '1') {
	    $Sec1{$refs} = $desc; $Sec1sub{$refs} = $subsection;
	} elsif ($section eq '2') {
	    $Sec2{$refs} = $desc; $Sec2sub{$refs} = $subsection;
	} elsif ($section eq '3') {
	    $Sec3{$refs} = $desc; $Sec3sub{$refs} = $subsection;
	} elsif ($section eq '4') {
	    $Sec4{$refs} = $desc; $Sec4sub{$refs} = $subsection;
	} elsif ($section eq '5') {
	    $Sec5{$refs} = $desc; $Sec5sub{$refs} = $subsection;
	} elsif ($section eq '6') {
	    $Sec6{$refs} = $desc; $Sec6sub{$refs} = $subsection;
	} elsif ($section eq '7') {
	    $Sec7{$refs} = $desc; $Sec7sub{$refs} = $subsection;
	} elsif ($section eq '8') {
	    $Sec8{$refs} = $desc; $Sec8sub{$refs} = $subsection;
	} elsif ($section eq '9') {
	    $Sec9{$refs} = $desc; $Sec9sub{$refs} = $subsection;
	} else {			# Catch all
	    $SecN{$refs} = $desc; $SecNsec{$refs} = $section;
	    $SecNsub{$refs} = $subsection;
	}
    }
    &print_mank_sec(*Sec1, 1, *Sec1sub);
    &print_mank_sec(*Sec2, 2, *Sec2sub);
    &print_mank_sec(*Sec3, 3, *Sec3sub);
    &print_mank_sec(*Sec4, 4, *Sec4sub);
    &print_mank_sec(*Sec5, 5, *Sec5sub);
    &print_mank_sec(*Sec6, 6, *Sec6sub);
    &print_mank_sec(*Sec7, 7, *Sec7sub);
    &print_mank_sec(*Sec8, 8, *Sec8sub);
    &print_mank_sec(*Sec9, 9, *Sec9sub);
    &print_mank_sec(*SecN, 'N', *SecNsub, *SecNsec);

    ## print_mank_sec() closes every <DL> it opens, so no list is open
    ## at this point; only the document itself is closed.
    print STDOUT "</BODY>\n",
		 "</HTML>\n";
}
##---------------------------------------------------------------------------
##	print_mank_sec() prints out manpage cross-refs of a specific section.
sub print_mank_sec {
    ## Arguments (associative arrays are passed as typeglobs):
    ##   *sec    : reference list => description
    ##   $sect   : section label for the heading; 'N' means the real
    ##             section of each entry is looked up in *secsec
    ##   *secsub : reference list => subsection suffix
    ##   *secsec : reference list => section (only read when $sect is 'N')
    ## Prints nothing when the section has no entries.
    local(*sec, $sect, *secsub, *secsec) = @_;
    local(@array, @refs, $href, $item, $title, $subsection, $i, $section);
    local(@links, $ref);

    $section = $sect;
    @array = sort keys %sec;
    return if $#array < 0;

    print STDOUT "<H2>Section $section</H2>\n", "<DL>\n";
    foreach $item (@array) {
	if ($sect eq 'N') { $section = $secsec{$item}; }
	@refs = split(/,/,$item);
	## The first name of the entry is the one handed to the URL
	## template; drop any "()" left in it.
	($title = $refs[0]) =~ s/\(\)//g;
	$subsection = $secsub{$item};
	## $title, $section and $subsection are expanded inside $CGIURL
	$href = eval "\"$CGIURL\"";

	## Every name of the entry links to the same URL
	@links = ();
	foreach $ref (@refs) {
	    push(@links, qq|<A HREF="$href">$ref</A>|);
	}
	print STDOUT "<DT>\n",
		     join(", ", @links),
		     " ($section$subsection)\n",
		     "<DD>\n",
		     $sec{$item}, "\n";
    }
    print STDOUT "</DL>\n";
}
##---------------------------------------------------------------------------
##	htmlize() replaces every &, < and > of a string by its entity
##	reference.  The string is passed as a typeglob (e.g.
##	&htmlize(*desc)) and is changed in place; the converted string is
##	also returned.
##
sub htmlize {
    local(*str) = @_;
    ## One pass over the specials gives the same result as converting
    ## '&' first and then '<' and '>'.
    $str =~ s/([&<>])/$1 eq '&' ? '&amp;' : ($1 eq '<' ? '&lt;' : '&gt;')/ge;
    $str;
}
##---------------------------------------------------------------------------
##	usage() prints the usage message, including the program name and
##	version, to STDOUT and exits with status 0.  It does not return.
##
sub usage {
    print STDOUT <<EndOfUsage;
Usage: $PROG [ options ] < infile > outfile
Options:
  -botm <#>		: Number of lines for bottom margin (def: 7)
  -cgiurl <url>		: URL for linking to other manpages
  -headmap <file>	: Filename of user section head map file
  -help			: This message
  -k			: Process a keyword search result
  -leftm <#>		: Character width of left margin (def: 0)
  -nodepage		: Do not remove pagination lines
  -noheads		: Do not detect for section heads
  -pgsize <#>		: Number of lines in a page (def: 66)
  -seealso		: Link to other manpages only in the SEE ALSO section
  -sun			: Section heads are not overstriked in input
  -title <string>	: Title of manpage (def: Not defined)
  -topm <#>		: Number of lines for top margin (def: 7)
Description:
  $PROG takes formatted manpages from STDIN and converts it to HTML sent to
  STDOUT.  The -topm and -botm arguments are the number of lines to the main
  body text and NOT to the running headers/footers.
Version:
  $VERSION

EndOfUsage
    exit 0;
}
