# -*- perl -*-
#
# $Id: Lexer.pm,v 1.14 1996/06/19 20:28:45 qjb Exp $
# $Source: /home/qjb/source/perl/modules/RCS/Lexer.pm,v $
# $Author: qjb $
#

require 5.002;
use strict;

package Lexer;
my $package = "Lexer";

use qutils qw(:assert_args);
use FileHandle;
require Token;
require LexTable;

# Field names
my $f_table = "table";
my $f_cur_state = "cur_state";
my $f_states = "states";
my $f_eof = "eof";
my $f_read = "read";
my $f_unread = "unread";
my $f_lineno = "lineno";
my $f_lines = "lines";
my $f_cur_line = "cur_line";
my $f_filehandle = "filehandle";

# Static Variables
my $max_unread = 10;

# Routines

sub init
{
    &assert_args(\@_, [$package, "LexTable"], []) if $assert_args;
    my $rep = shift;
    my $table = shift;

    $rep->{$package}{$f_table} = $table;
    $rep->{$package}{$f_cur_state} = $table->initial_state();
    $rep->{$package}{$f_states} = $table->states(); # cache for efficiency
    $rep->{$package}{$f_eof} = 0;
    $rep->{$package}{$f_read} = [];
    $rep->{$package}{$f_unread} = [];
    $rep->{$package}{$f_lineno} = 0;
    $rep->{$package}{$f_lines} = [];
    $rep->{$package}{$f_cur_line} = "";
    $rep->{$package}{$f_filehandle} = undef;
}

sub new_from_string
{
    &assert_args(\@_, ["", "", "", "LexTable"], []) if $assert_args;
    my $class = shift;
    my $rep = +{$package => {} };
    my ($str, $lineno, $table) = @_;

    bless $rep;
    $rep->init($table);

    my @lines = ();

    while ($str =~ s/^([^\r\n]*\r?\n)//)
    {
	push(@lines, $1);
    }
    push(@lines, $str);

    $rep->{$package}{$f_lineno} = $lineno - 1;
    $rep->{$package}{$f_lines} = [@lines];

    $rep;
}

sub new_from_filehandle
{
    &assert_args(\@_, ["", "FileHandle", "LexTable"], []) if $assert_args;
    my $class = shift;
    my $rep = +{$package => {} };
    my ($filehandle, $table) = @_;

    bless $rep;
    $rep->init($table);

    $rep->{$package}{$f_filehandle} = $filehandle;

    $rep;
}

sub p_table
{
    my $rep = shift;
    $rep->{$package}{$f_table};
}

sub p_cur_line
{
    my $rep = shift;
    $rep->{$package}{$f_cur_line} = $_[0] if @_;
    $rep->{$package}{$f_cur_line};
}

sub p_eof
{
    my $rep = shift;
    $rep->{$package}{$f_eof} = $_[0] if @_;
    $rep->{$package}{$f_eof};
}

sub p_unread
{
    my $rep = shift;
    $rep->{$package}{$f_unread};
}

sub p_read
{
    my $rep = shift;
    $rep->{$package}{$f_read};
}

sub p_filehandle
{
    my $rep = shift;
    $rep->{$package}{$f_filehandle};
}

sub p_lines
{
    my $rep = shift;
    $rep->{$package}{$f_lines};
}

sub p_lineno
{
    my $rep = shift;
    $rep->{$package}{$f_lineno} = $_[0] if @_;
    $rep->{$package}{$f_lineno};
}

sub read_token
{
    my $rep = shift;
    my $read = $rep->{$package}{$f_read};
    my $unread = $rep->{$package}{$f_unread};
    my $token = undef;
    my $bol = 0;		# beginning of line

    if (@$unread)
    {
	$token = pop(@$unread);
    }
    else
    {
	my $table = $rep->{$package}{$f_table};
	if ($rep->{$package}{$f_cur_line} eq "")
	{
	    if (! $rep->{$package}{$f_eof})
	    {
		# This block used to be p_next_line, but it was inlined
		# for efficiency.

		my $lines = $rep->{$package}{$f_lines};
		my $filehandle = $rep->{$package}{$f_filehandle};

		if (defined($filehandle) && (scalar(@$lines) == 0))
		{
		    my $line = $filehandle->getline();
		    push(@$lines, $line) if defined($line);
		}

		if (scalar(@$lines))
		{
		    $rep->{$package}{$f_cur_line} = shift(@$lines);
		    $rep->{$package}{$f_lineno}++;
		    $bol = 1;
		}
		else
		{
		    # This only gets called once... no need to break
		    # abstractions for that.
		    $rep->p_eof(1);
		}
	    } # END OF p_next_line block
	}

	{
	    # This block used to be p_next_token, but there is about
	    # a 3% efficiency gain in inlining it, and it can be worth
	    # it for large and complex files.

	    my $curline = $rep->{$package}{$f_cur_line};
	    my $lineno = $rep->{$package}{$f_lineno};
	    my $tokstr = undef;

	    my $table = $rep->{$package}{$f_table};
	    my $cur_state = $rep->{$package}{$f_cur_state};
	    my $states = $rep->{$package}{$f_states};
	    
	    if ($curline eq "")
	    {
		$token = new Token($table->eof_token(), "", $lineno);
	    }
	    else
	    {
		for (@{$states->{$cur_state}})
		{
		    my ($expr, $type, $new_state) = @$_;

		    unless (($expr =~ s/^\^//) && (! $bol))
		    {
			if ($curline =~ s/^($expr)//)
			{
			    if ($1 eq "")
			    {
				die "$package: INTERNAL ERROR: regexp $expr " .
				    "can match the emtpy string!\n";
			    }
			    $rep->{$package}{$f_cur_line} = $curline;
			    $tokstr = $1;

			    # This horrible manual construction of a Token
			    # object results in a 10% run-time payoff
			    # over saying
			    #   $token = new Token($type, $tokstr, $lineno);
			    # Appropriate warnings are given in Token.pm.

			    $token = bless
				+{"Token" => {"type" => $type,
					      "value" => $tokstr,
					      "lineno" => $lineno,
					  }
			      }, "Token";

			    if (defined($new_state))
			    {
				$rep->{$package}{$f_cur_state} = $new_state;
			    }		    
			    last;
			}
		    }
		}
	    }
	    
	    (defined $token) ||
		die "$package: no expression matches $curline\n";
	} # END OF p_next_token block

    }

    push(@$read, $token);
    if (scalar(@$read) >= $max_unread)
    {
	shift(@$read);
    }

    $token;
}

sub unread_token
{
    &assert_args(\@_, [$package], []) if $assert_args;

    my $rep = shift;
    my $read = $rep->p_read();
    my $unread = $rep->p_unread();

    if (@$read)
    {
	push(@$unread, pop(@$read));
    }
    else
    {
	die $package."::unread_token: no tokens to unread\n";
    }
}

sub test
{
    my $table = new LexTable
	("EOF", 'initial',
	 +{
	     'initial' => [["three\$", "three", undef],
			   ['^seven', "seven", undef],
			   ['\r?\n', "newline", undef],
			   ['ni', "word", 'new'],
			   ['[a-zA-Z]+', "word", undef],
			   ['.', "other", undef]],

	     'new' => [['\r?\n', "newline", 'initial'],
		       ['.', "other", undef]],
	 }
	 );

    my $expr_string = "
one two three
three seven fourteen
seven 8 nine
ten
";

    my @results = (new Token("newline", "\n", 1),
		   new Token("word", "one", 2),
		   new Token("other", " ", 2),
		   new Token("word", "two", 2),
		   new Token("other", " ", 2),
		   new Token("three", "three", 2),
		   new Token("newline", "\n", 2),
		   new Token("word", "three", 3),
		   new Token("other", " ", 3),
		   new Token("word", "seven", 3),
		   new Token("other", " ", 3),
		   new Token("word", "fourteen", 3),
		   new Token("newline", "\n", 3),
		   new Token("seven", "seven", 4),
		   new Token("other", " ", 4),
		   new Token("other", "8", 4),
		   new Token("other", " ", 4),
		   new Token("word", "ni", 4),
		   new Token("other", "n", 4),
		   new Token("other", "e", 4),
		   new Token("newline", "\n", 4),
		   new Token("word", "ten", 5),
		   new Token("newline", "\n", 5),
		   new Token("EOF", "", 6)),

    my $lineno = 1;
    
    my $lexer = new_from_string Lexer($expr_string, $lineno, $table);
    
  read_tokens:
    while (1)
    {
	my $token = $lexer->read_token();
	my $actual = $token->unparse();
	my $expected = shift(@results)->unparse();
	print $actual, "\n";
	die "test failed\n" if ($actual ne $expected);
	my $type = $token->type();
	last read_tokens if ($type eq "EOF");
    }
    print "test passed\n";
}

1;

#
# END OF Lexer
#