#!/usr/bin/perl
use strict;
use Getopt::Std;
use locale;

my %opt; 
getopts('l:he:s:ap:om:f', \%opt);

if ( $opt{h} || ! ($opt{e}||$opt{s}) || !$opt{l} ) {
	print<<EOT;
Generator of variant of the Lovin's stemmer which 
uses a longest match algorithm.

Author Teodor Sigaev <teodor\@stack.net>
Usage:
$0 -l LOCALENAME [ -e FILENAME ] [ -s FILENAME ] [ -p PREFIX ] [ -o FILENAME ] [ -a ] [ -m NUMBER ]
-e FILENAME	- file with endings of word
-s FILENAME	- file with list of stop-word
-o FILENAME	- out file, default STDOUT
-a 		- stop-word are strimmed
-p PREFIX	- prefix of function and etc, default strimmed locale
-m NUMBER	- minimal length of rest after semming, default 3
-l LOCALENAME	- name of locale
-f		- do not call tolower for each char
At least one of -e or -s must be defined
EOT
exit; 
}

if ( ! defined $opt{p} ) {
	$opt{p} = $opt{l};
	$opt{p}=~s/[^a-zA-Z0-9_]+//g;
}
$opt{m}=3 if ! defined $opt{m};

my ($enddata,$stopdata) = ('','');
my $maxchild = 0;

if ( $opt{e} ) {
	my @tree;
	buildtree(\@tree,  $opt{e}, 1);
	printstruct( \@tree,  0, \$enddata);
	undef @tree;
} 
	
if ( $opt{s} ) {
	my @tree;
	buildtree(\@tree,  $opt{s}, 0);
	printstruct( \@tree,  0, \$stopdata);
	undef @tree;
} 
	

die "No data\n" if ( ! (length $enddata || length $stopdata) );

$enddata = "\t{0,0,0,0}" if ( ! length $enddata );
$stopdata = "\t{0,0,0,0}" if ( ! length $stopdata );

my $fh=\*STDOUT;
if ( $opt{o} ) {
	open(OUT,">$opt{o}") || die "Can;t open file '$opt{o}' for writing\n";
	$fh = \*OUT;
}

my $linktype = 'uint32';
if ( $maxchild <= 0xff ) {
        $linktype='uint8';
} elsif (  $maxchild <= 0xffff ) {
        $linktype='uint16';
}

my $wherecheck = ( $opt{a} ) ?
		"NULL,\n\t$opt{p}_is_stopword"
	: 
		"$opt{p}_is_stopword,\n\tNULL";

my ($tolower, $resttolower) = ('','');
if ( ! $opt{f} ) {
	$tolower = '*cur = tolower( *cur );';
	$resttolower=<<EOT;
	while( cur - buf >= 0 ) {
		*cur = tolower(*cur);
		cur--;
	}
EOT
}

print {$fh} <<EOT;
/*
 * Autogenerated file
 *
 * Variant of the Lovin's stemmer which uses a longest match algorithm.
 * Endings are stored in a suffix tree.
 */

#ifdef DICT_BODY
typedef struct {
	uint8	val;
	uint8	flag;
	uint8	right;
	$linktype	child;
} $opt{p}_NODE;

/* is exists left tree ? */
#define L       0x01
/* finish word flag */
#define F       0x02
#define ISLEFT(x)       ((($opt{p}_NODE*)x)->flag & L)
#define ISFINISH(x)     ((($opt{p}_NODE*)x)->flag & F)

#define MINLENREST	$opt{m}

static $opt{p}_NODE $opt{p}_endstree[]={
$enddata
};

static $opt{p}_NODE $opt{p}_stoptree[]={
$stopdata
};

static char*
$opt{p}_stem( void* obj, char *in, int *len ) {
	$opt{p}_NODE    *ptr = $opt{p}_endstree;
	int     result = 0;
	uint8 *buf = (uint8 *)in;
	uint8 *cur = buf + (*len) - 1;

	while( cur - buf >= MINLENREST ) {
		$tolower
		if ( ptr->val == *cur ) {
			if ( ISFINISH(ptr) ) result = buf + (*len) - cur;
			cur--;
			if ( ! ptr->child ) break;
			ptr += ptr->child;
		} else if ( ptr->val > *cur ) {
			if ( ISLEFT(ptr) )
				ptr++;
			else
				break;
		} else {
			if ( ptr->right )
				ptr += ptr->right;
			else
				break;
		}
	}
	$resttolower
	*len -= result;
	return in;
}

static int
$opt{p}_is_stopword( void *obj, char *in, int len ) {
        $opt{p}_NODE    *ptr = $opt{p}_stoptree;
        int     result = 0;
        uint8 *buf = (uint8 *)in;
        uint8 *cur = buf;

        while( cur - buf < len ) {
		$tolower
                if ( ptr->val == *cur ) {
                        cur++;
                        if ( ISFINISH(ptr) ) result = cur - buf;
                        if ( ! ptr->child ) break;
                        ptr += ptr->child;
                } else if ( ptr->val > *cur ) {
                        if ( ISLEFT(ptr) )
                                ptr++;
                        else
                                break;
                } else {
                        if ( ptr->right )
                                ptr += ptr->right;
                        else
                                break;
                }
        }
        return (result==len) ? 1 : 0;
}

#undef L
#undef F
#undef ISLEFT
#undef ISFINISH
#undef MINLENREST 
#endif /* DICT_BODY */

#ifdef DICT_TABLE
TABLE_DICT_START
	\"$opt{l}\",
	NULL,
	NULL,
	$opt{p}_stem,
	$wherecheck
TABLE_DICT_END
#endif

EOT

close($fh) if ( $fh != \*STDOUT );


sub buildtree {
	my ($reftree,$file, $needreverse) = @_;
	open(DATA,$file) || die "Can't open file '$file'\n";
	while(<DATA>) {
		chomp;
		next if ! length $_;
		$_ = lc($_) if ! $opt{f};
		addtostruct( $reftree, ( $needreverse ) ? scalar(reverse($_)) : $_ );
	}
	close DATA;
}

sub mkbintree {
	my ( $start, $stop, $rprop, $rres) = @_;
        
	my $middle = $start + int( ($stop-$start)/2 );

	push( @$rres, $rprop->[$middle] );
	my $idx = $#$rres;
	$rres->[$idx]{right}=0;
	$rres->[$idx]{left}=0;
	return 1 if ( $start == $stop );

	my $leftsize = 0;
	if ( $middle!=$start ) {
		$rres->[$idx]{left}=1;  
		$leftsize = mkbintree( $start, $middle-1, $rprop, $rres );
		$rres->[$idx]{right}=$leftsize+1;
	} else {
		$rres->[$idx]{right} = 1;
	}
	return 1 + $leftsize + mkbintree( $middle+1, $stop, $rprop, $rres );
}

sub addtostruct {
	my $node = shift;
	my ($char, $subval) = split('', shift, 2);
	$char = ord( $char );
	if ( ! defined $node->[$char] ) {
		$node->[$char] = {};
		$node->[$char]{finish} = length $subval;
		$node->[$char]{child} = [];
	} elsif ( ! length $subval ) {
		$node->[$char]{finish} = 0;
	}

	addtostruct( $node->[$char]{child}, $subval ) if ( length $subval );
}

sub printstruct {
	my ($node, $pre, $refout) = @_;
	my $add = 0;
	my @prop;
	my $outchild;
	my $current = 0;
	my $poschild=0;
	my @tmp;

	foreach my $i ( 0..255 ) {
		next if ( !defined $node->[ $i ] );
		push @prop , { val=>$i,
			nchild=>printstruct( $node->[ $i ]{child}, 1, \$outchild ),
			poschild=>$poschild };
		$poschild += $prop[$#prop]{nchild};
	}

	return 0 if $#prop < 0;
	if ($pre) {
		$$refout .= ",\n\n";
	}
	mkbintree(0,$#prop,\@prop,\@tmp);
	@prop = @tmp;
	
	$current=$#prop+1;
	foreach my $i ( 0..$#prop ) {
		my $flag = ($prop[$i]{left}) ? 'L' : undef;
		if ( $node->[ $prop[$i]{val} ]{finish}==0 ) {
			$flag .= '|' if defined $flag;
			$flag .= 'F';
		} elsif ( ! defined $flag ) {
			$flag='0';
		}
		$$refout .= "\t{'".chr( $prop[$i]{val} )."',".
				$flag.','.
				$prop[$i]{right}.','.
				(($prop[$i]{nchild}==0)?0:($prop[$i]{poschild}+$current)).'}'.
				(($i==$#prop)? '' : ",\n");

		$maxchild = $prop[$i]{poschild}+$current if
			( $prop[$i]{nchild} && $prop[$i]{poschild}+$current > $maxchild );
		$current--;
		$add += $prop[$i]{nchild};
	}
	$$refout .= $outchild;
	return $#prop+1 + $add;
}



