#!/usr/bin/perl
#
# Script to extract files from Debian packages
# Copyright 2004-2007 Frank Lichtenheld <frank@lichtenheld.de>
#
# based on a shell script which was
# Copyright 2003 Noel Köthe
# Copyright 2004 Martin Schulze <joey@debian.org>
#
#    This program is free software; you can redistribute it and/or modify
#    it under the terms of the GNU General Public License as published by
#    the Free Software Foundation; either version 2 of the License, or
#    (at your option) any later version.
#
#    This program is distributed in the hope that it will be useful,
#    but WITHOUT ANY WARRANTY; without even the implied warranty of
#    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
#    GNU General Public License for more details.
#
#    You should have received a copy of the GNU General Public License
#    along with this program; if not, write to the Free Software
#    Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.

use strict;
use warnings;

use FindBin;
use lib "$FindBin::Bin/../lib";
use lib "$FindBin::Bin";

use Getopt::Long;
use File::Temp qw( tempdir );
use File::Path;
use File::Copy;
use File::Basename;
#use Digest::SHA1;
use Deb::Versions;
use Parse::DebControl;
use Parse::DebianChangelog;
use Cwd;
use Fcntl qw(:DEFAULT :flock);
use Data::Dumper;
use DB_File;

use constant PKGPOOL => 1;
use constant DIGESTPOOL => 2;

my $PROGNAME = 'extract_files';
my $MAXWARN = 100;
my $TEMPDIR;

my $directory = cwd()."/pool";
my $dumpfile = '';
my $cachefile = '';
my $target = cwd()."/extracted_files";
my $workdir = '';
my $configdir = cwd()."/etc";
my ( $verbose, $version, $help, $debug, $force, $use_dump );

my %opthash = (
	       'verbose|v' => \$verbose,
	       'force|f' => \$force,
	       'directory|d=s' => \$directory,
	       'config|c=s' => \$configdir,
	       'target|t=s' => \$target,
	       'workdir|w=s' => \$workdir,
	       'cachefile=s' => \$cachefile,
	       'dumpfile=s' => \$dumpfile,
	       'use_dump' => \$use_dump,
	       'version' => \$version,
	       'debug' => \$debug,
	       'help' => \$help,
	       );

my (%src_packages, %bin_packages, %cache);

my %stats = (
    src_pkgs => 0,
    src_cache => 0,
    already_extracted => 0,
    bin_pkgs => 0,
    bin_cache => 0,
    );

Getopt::Long::config('no_getopt_compat', 'no_auto_abbrev');

GetOptions(%opthash) or do_error( "couldn't parse commandline parameters" );

$verbose ||= $debug;
$directory =~ s,/+$,,o;
if ($workdir) {
    $TEMPDIR = tempdir( 'pdo_extract_file.XXXXXX',
			DIR => $workdir, CLEANUP => 1 );
} else {
    $TEMPDIR = tempdir( 'pdo_extract_file.XXXXXX',
		        CLEANUP => 1 );
}

##################################################
# OUTPUT/LOGGING

sub do_error {
    die "$PROGNAME: FATAL: @_\n";
}

my $no_warnings = 0;
sub do_warning {
    warn "$PROGNAME: WARNING: @_\n";
    if (++$no_warnings > $MAXWARN) {
	do_error( "too many warnings ($MAXWARN)" );
    }
}

sub do_info {
    if ($verbose) {
	print "$PROGNAME: INFO: @_\n";
    }
}

sub do_debug {
    if ($debug) {
	print "$PROGNAME: DEBUG: @_\n";
    }
}

sub add_log {
    my $log  = shift;

    do_debug( @_ );
    $$log .= localtime().": @_\n";
}

sub touch {
    my $filename = shift;
    sysopen(H, $filename, O_WRONLY|O_NONBLOCK|O_CREAT) or return undef;
    close(H);
    return 1;
}

##################################################
# PACKAGE HANDLING (UNPACKING/CLEANUP)

sub unpack_srcpkg {
    my ( $pkgname, $dscname, $log ) = @_;

    chdir( $TEMPDIR ) or do_error( "couldn't change working directory to $TEMPDIR" );

    add_log( $log, "dpkg-source -sn -x $dscname $pkgname+source"  );

    system("dpkg-source", "-sn", "-x", $dscname, "$pkgname+source" ) == 0
	or do {
	    do_warning( "couldn't unpack $dscname: $!" );
	    add_log( $log, "couldn't unpack $dscname: $!" );
	    return;
	};

    return "$pkgname+source";
}

sub unpack_binpkg {
    my ( $pkgname, $debname, $log ) = @_;

    add_log( $log, "unpacking binary package $pkgname" );

    mkdir( "$TEMPDIR/$pkgname" ) or do {
	do_warning( "couldn't create directory $TEMPDIR/$pkgname" );
	add_log( $log, "couldn't create directory $TEMPDIR/$pkgname" );
	return;
    };

    add_log( $log, "dpkg-deb --extract $debname $TEMPDIR/$pkgname" );

    system("dpkg-deb", "--extract", $debname, "$TEMPDIR/$pkgname" ) == 0
	or do {
	    do_warning( "couldn't unpack $debname" );
	    add_log( $log, "couldn't unpack $debname" );
	    return;
	};

    return 1;
}

sub unpack_allbinpkg {
    my ($pkg_data, $log) = @_;

    my %already_seen;

    foreach my $pkg (@{$pkg_data->{bins}}) {
	next if $already_seen{$pkg->{bin_name}}; # some assumptions about sane version numbers included

	unpack_binpkg($pkg->{bin_name}, $pkg->{deb}, $log );

	$already_seen{$pkg->{bin_name}}++;
    }
}

sub cleanup_binpkg {
    my ($pkg_data) = @_;

    foreach my $pkg (keys %{$pkg_data->{bin_list}}) {
	# rmtree should do that itself, but there seems to be a bug somewhere
	system( "chmod", "-R", "u+rwx", "$TEMPDIR/$pkg" );
	rmtree( "$TEMPDIR/$pkg" );
    }
}

##################################################
# POOL HANDLING

sub pkg_pool_directory {
    my ($pkg_data) = @_;

    my $name = $pkg_data->{src_name};
    my $version = $pkg_data->{src_version};
    my $dscname = $pkg_data->{dsc};

    my $dir = "";

# I would prefer $name_$version but lets be backward compatible
# in case someone depends on the naming
    if ($name =~ /^(lib.)/o) {
#	$dir .= "$1/$name/${name}_$version";
	$dir .= "$1/$name/".basename($dscname, '.dsc');
    } else {
#	$dir .= substr($name,0,1)."/$name/${name}_$version";
	$dir .= substr($name,0,1)."/$name/".basename($dscname, '.dsc');
    }

    return $dir;
}

sub to_update {
    my ($pkg_data, $config_data, $log) = @_;

    if ($config_data->{structure} == PKGPOOL) {
	my $dir = "$target/".pkg_pool_directory( $pkg_data );
	if (!$force && -f "$dir/log") {
	    (utime(undef,undef,"$dir/log") == 1)
		or do_warning( "touch of $dir/log failed" );
	    return 0;
	} else {
	    rmtree( $dir );
	    return 1;
	}
    } elsif ($config_data->{structure} == DIGESTPOOL) {
	die "UNIMPLEMENTED!";
    } else {
	do_error( "unknown pool structure $config_data->{structure}" );
    }
}

sub write_log ($$) {
    my ($dir, $log) = @_;

    open my $logfh, ">$dir/log" or do_error( "couldn't open log file $dir/log.\n$log" );
    flock $logfh, LOCK_EX or do_error( "couldn't lock log file $dir/log" );;

    print $logfh $log;

    close $logfh or do_warning( "couldn't close log file $dir/log" );
}

##################################################
# EXTRACTION

sub extract_copyright_to_pkgpool {
    my ($pkg_data, $config_data, $log, $source_dir, $target_dir) = @_;

    add_log( $log, "copy copyright file from source package" );

    my $src_tgt = "$target_dir/copyright";
    copy( "$source_dir/debian/copyright", $src_tgt )
	or add_log( $log, "seems to have failed: $!" );

    foreach my $bin_pkg (keys %{$pkg_data->{bin_list}}) {

	my $usd = "$TEMPDIR/$bin_pkg/usr/share/doc/$bin_pkg";
	my $cpy = "$usd/copyright";
	my $tgt = "$target_dir/$bin_pkg.copyright";

	if (-f $cpy) {
	    add_log( $log, "copy copyright file from binary package $bin_pkg" );
	    copy( $cpy, $tgt )
		or add_log( $log, "seems to have failed: $!" );
	} elsif (-l $cpy ) {
	    add_log( $log, "copyright file $cpy is symlink, I can't handle that" );
	} elsif (-l $usd) {
	    add_log( $log, "doc directory $usd is symlink" );
	    my $link = readlink($usd) or add_log( $log, "readlink $usd failed" );
	    if ($link && $link =~ m,^(?:\./)?(\S+)/?$,o) { # do a little sanity check
		my $pkg2 = $1;
		if ($pkg_data->{bin_list}{$pkg2}) {
		    add_log( $log, "symlink points to $pkg2, make symlink to copyright file" );
		    (symlink( "$pkg2.copyright", $tgt ) == 1 )
			or add_log( $log, "symlink creation failed" );
		} else {
		    add_log( $log, "symlink points to $pkg2, don't know what to do with that" );
		}
	    } else {
		add_log( $log, "link seems fishy, not using" );
	    }
	}

	unless (-e $tgt || -l $tgt) { # if it is a link, we can't be sure that the target package was already processed
	    add_log( $log, "copyright file $tgt still doesn't exist" );
	    if (-e $src_tgt) {
		add_log( $log, "copyright file of the source package exists, make symlink" );
		(symlink( "copyright", $tgt ) == 1 )
		    or add_log( $log, "symlink generation failed" );
	    } else {
		add_log( $log, "give up on $bin_pkg" );
		touch("$tgt.ERROR")
		    or add_log( $log, "even the touch of $tgt.ERROR failed :(" );
	    }
	}

    } #foreach $bin_pkg

    unless (-e $src_tgt) {
	add_log( $log, "copyright file $src_tgt still doesn't exist" );
	# take one of the binary packages, prefering one that has
	# the same name as the source package
	foreach my $bin_pkg (($pkg_data->{src_name},
			      keys %{$pkg_data->{bin_list}})) {
	    if (-e "$target_dir/$bin_pkg.copyright") {
		add_log( $log, "copyright file $target_dir/$bin_pkg.copyright seems like a good guess to me, make a symlink" );
		(symlink( "$bin_pkg.copyright", $src_tgt ) == 1 )
		    or do {
			add_log( $log, "symlink generation failed" );
			next;
		    };
		last;
	    }
	}
	unless (-e $src_tgt) {
	    add_log( $log, "give up" );
	    touch("$src_tgt.ERROR") or
		add_log( $log, "even the touch of $src_tgt.ERROR failed :(" );
	}
    }
}

sub extract_changelog_to_pkgpool {
    my ($pkg_data, $config_data, $log, $source_dir, $target_dir) = @_;

    add_log( $log, "copy changelog file from source package" );

    my $src_changelog = copy( "$source_dir/debian/changelog",
			      "$target_dir/changelog.txt" );

    if ($src_changelog) {
	add_log( $log, "changelog file sucessfully copied" );
    } else {
	add_log( $log, "seems to have failed: $!" );
    }

    add_log( $log, "create enhanced HTML version" );
    my $chg = Parse::DebianChangelog->init;
    my $parsed = $chg->parse( { infile => "$source_dir/debian/changelog" } );
    if ($parsed) {
	$chg->html( { outfile => "$target_dir/changelog.html",
		      template => "$configdir/tmpl/default.tmpl" } );
	add_log( $log, scalar $chg->get_parse_errors )
	    if $chg->get_parse_errors;
    } else {
	do_warning( $chg->get_error );
	add_log( $log, $chg->get_error );
    }
}

sub manage_current_link {
    my ($pkg_data, $config_data, $log, $source_dir, $target_dir) = @_;

    my $parent_dir = dirname($target_dir);
    my $dirname = basename($target_dir);
    my $current_link = "$parent_dir/current";
    add_log( $log, "parent_dir=$parent_dir; dirname=$dirname" );
    unless (-l $current_link) {
	add_log( $log, "create new current link" );
	(chdir( $parent_dir ) and
	 (symlink( $dirname, 'current' ) == 1 )) or
	 add_log( $log, "creating new current link failed: $!" );
    } else {
	my $old_target = readlink( $current_link );
	(my $old_version = $old_target) =~ s/^[^_]*_//o;
	if (version_cmp( $pkg_data->{src_version},
			 $old_version) > 0) {
	    add_log( $log,
		     "old_version=$old_version; overwriting current link" );
	    (chdir( $parent_dir ) and
	     unlink( 'current' ) and
	     (symlink( $dirname, 'current' ) == 1 )) or
	     add_log( $log, "overwriting current link failed: $!" );
	} else {
	    add_log( $log,
		     "old_version=$old_version; not touching current link" );
	}
    }
}

sub extract_files {
    my ($pkg_data, $config_data) = @_;
    my $log = "";

    add_log( \$log, "process source package $pkg_data->{src_name} ($pkg_data->{src_version})" );

    unless (to_update( $pkg_data, $config_data, \$log )) {
	$stats{already_extracted}++;
	do_debug( "source package $pkg_data->{src_name} ($pkg_data->{src_version}) doesn't need extracting" );
	return;
    }

    if (my $source_dir = unpack_srcpkg( $pkg_data->{src_name}, $pkg_data->{dsc}, \$log )) {

	$source_dir = "$TEMPDIR/$source_dir";

	unpack_allbinpkg($pkg_data, \$log);

	my $target_dir = "$target/".pkg_pool_directory($pkg_data);
	add_log( \$log, "source_dir=$source_dir; target_dir=$target_dir" );

	mkpath( $target_dir );

	if ($config_data->{structure} == PKGPOOL) {
	    extract_copyright_to_pkgpool( $pkg_data, $config_data, \$log,
					  $source_dir, $target_dir );
	    extract_changelog_to_pkgpool( $pkg_data, $config_data, \$log,
					  $source_dir, $target_dir );
	    manage_current_link( $pkg_data, $config_data, \$log,
				 $source_dir, $target_dir );
	} elsif ($config_data->{structure} == DIGESTPOOL) {
	    die "UNIMPLEMENTED!";
	} else {
	    do_error( "unknown pool structure $config_data->{structure}" );
	}

	# rmtree should do that itself, but there seems to be a bug somewhere
	system( "chmod", "-R", "u+rwx", "$source_dir" );
	rmtree( $source_dir );
	cleanup_binpkg($pkg_data);
	write_log( $target_dir, $log );
    }
}

sub extract_from_all {
    my ( $src_packages ) = @_;

    unless (-d $target) {
	mkpath( $target ) or do_error( "couldn't create target directory" );
    }

    # TODO: make configurable
    my %config = (
		  structure => PKGPOOL,
		  );

    do_info( scalar(keys(%$src_packages))." source packages to process" );
    foreach my $p (keys %$src_packages) {
	foreach my $v (keys %{$src_packages->{$p}}) {
	    extract_files( $src_packages->{$p}{$v}, \%config );
	}
    }
}

##################################################
# COLLECTING INFORMATION

sub merge_src_bin_packages {
    my ( $src_packages, $bin_packages ) = @_;

    foreach my $p (keys %$bin_packages) { # packages
	foreach my $v (keys %{$bin_packages->{$p}}) { # versions
	    foreach my $a (keys %{$bin_packages->{$p}{$v}}) { # architectures
		my %bin_data = %{$bin_packages->{$p}{$v}{$a}};

		if (exists $src_packages->{$bin_data{bin_src}}{$bin_data{bin_src_version}}) {
		    $src_packages->{$bin_data{bin_src}}{$bin_data{bin_src_version}}{bins} ||= [];
		    push @{$src_packages->{$bin_data{bin_src}}{$bin_data{bin_src_version}}{bins}}, \%bin_data;
		    $src_packages->{$bin_data{bin_src}}{$bin_data{bin_src_version}}{bin_list}{$p}++;
		}
	    }
	}
    }

    return $src_packages;
}

sub read_dsc {
    my ( $dscname ) = @_;

    my $parser = Parse::DebControl->new();
    my ( $raw_data, $pkg_data );

    my $dsccontent = $cache{$dscname};
    unless ($dsccontent) {
	open my $dscfh, "<", $dscname or do {
	    do_warning( "reading file $dscname failed" );
	    return;
	};

	$dsccontent = "";
	while (<$dscfh>) {
	    next if /^\#/o;
	    if (/^-----BEGIN PGP SIGNED MESSAGE/o) {
		while (<$dscfh> =~ /\S/) {}; # skip Hash: line and similar
		next;
	    }
	    if (/^-----BEGIN PGP SIGNATURE/o) {
		last; # stop reading
	    }
	    $dsccontent .= $_;
	}

	$cache{$dscname} = $dsccontent;
    } else {
	$stats{src_cache}++;
	if ($debug) {
	    (my $begin = substr($dsccontent,0,20)) =~ s/\n/\\n/go;
	    do_debug( "CACHE HIT: $dscname ($begin)" );
	}
    }

    unless ( $raw_data = $parser->parse_mem( $dsccontent,
					     { discardCase => 1 } ) ) {
	do_warning( "parsing file $dscname failed.\n$dsccontent" );
	return;
    }

    my $no_chunks = @$raw_data;
    if ($no_chunks != 1) {
	do_warning( "expected exactly one chunk in .dsc file, got $no_chunks" );
	return;
    }

    $pkg_data = {
	src_name => $raw_data->[0]{source},
	src_version => $raw_data->[0]{version},
	dsc => $dscname,
    };

    unless( $pkg_data->{src_name} && defined($pkg_data->{src_version})
	&& $pkg_data->{dsc} ) {
	use Data::Dumper;
	do_error( "something fishy happened.\n", Dumper( $pkg_data ) );
    }

    do_debug( "found source package $pkg_data->{src_name}, version $pkg_data->{src_version}" );
    $stats{src_pkgs}++;

    return $pkg_data;
}

sub read_deb {
    my ( $debname ) = @_;

    my $parser = Parse::DebControl->new();
    my ( $raw_data, $pkg_data );

    if ($cache{$debname}) {
	$stats{bin_cache}++;
	if ($debug) {
	    (my $begin = substr($cache{$debname},0,20)) =~ s/\n/\\n/go;
	    do_debug( "CACHE HIT: $debname ($begin)" );
	}
    }
    $cache{$debname} ||= qx/dpkg-deb --info "$debname" control/;
    my $control = $cache{$debname};

    unless ( $raw_data = $parser->parse_mem( $control,
					     { discardCase => 1 } ) ) {
	do_warning( "parsing control information <<$control>> of file $debname failed" );
	return;
    }

    my $no_chunks = @$raw_data;
    if ($no_chunks != 1) {
	do_warning( "expected exactly one chunk in .deb control information, got $no_chunks" );
	return;
    }

    $pkg_data = {
	bin_name => $raw_data->[0]{package},
	bin_version => $raw_data->[0]{version},
	bin_arch => $raw_data->[0]{architecture},
	bin_src => $raw_data->[0]{source} || $raw_data->[0]{package},,
	bin_src_version => $raw_data->[0]{version},
	deb => $debname,
    };

    if ($pkg_data->{bin_src} =~ /^([\w.+-]+)\s*\(\s*=\s*([^\s\)])\s*\)\s*$/) {
	$pkg_data->{bin_src} = $1;
	$pkg_data->{bin_src_version} = $2;
    }

    do_debug( "found binary package $pkg_data->{bin_name}, version $pkg_data->{bin_version}, architecture $pkg_data->{bin_arch}" );
    $stats{bin_pkgs}++;

    return $pkg_data;
}

sub collect_deb {
    my ( $debname ) = @_;

    do_debug( "processing deb file $debname" );

    my $pkg_data = read_deb( $debname );
    return unless $pkg_data;

    if (exists $bin_packages{$pkg_data->{bin_name}}{$pkg_data->{bin_version}}{$pkg_data->{bin_arch}}) {
	do_warning( "duplicated package $pkg_data->{bin_name}, version {$pkg_data->{bin_version}{$pkg_data->{bin_arch}}" );
	return;
    } else {
	$bin_packages{$pkg_data->{bin_name}}{$pkg_data->{bin_version}}{$pkg_data->{bin_arch}} = $pkg_data;
    }
}

sub collect_dsc {
    my ( $dscname ) = @_;

    do_debug( "processing dsc file $dscname" );

    my $pkg_data = read_dsc( $dscname );
    return unless $pkg_data;

    if (exists $src_packages{$pkg_data->{src_name}}{$pkg_data->{src_version}}) {
	do_warning( "duplicated package $pkg_data->{src_name}, version {$pkg_data->{src_version}" );
	return;
    } else {
	$src_packages{$pkg_data->{src_name}}{$pkg_data->{src_version}} = $pkg_data;
    }
}

sub read_sub {
    my ( $dir ) = @_;

    do_debug( "processing directory $dir" );

    opendir my $dh, $dir or do_error( "couldn't open directory $dir" );
    while( my $entry = readdir $dh ) {
	chomp $entry;
	next if $entry =~ /^\.\.?$/o;

	my $fullname = "$dir/$entry";

	read_sub( $fullname ) if -d $fullname;
	collect_dsc( $fullname ) if -f _ && ( $fullname =~ /\.dsc$/o );
	collect_deb( $fullname ) if -f _ && ( $fullname =~ /\..?deb$/o );
    }
    closedir $dh or do_warning( "couldn't close directory $dir" );
}

##################################################
# MAIN PROGRAM

do_info( "Using working directory $TEMPDIR" );
if ($use_dump) {
    do_info( "load information from dump file" );
    open DUMP, '<', $dumpfile
	or do_error( "couldn't open dump file $dumpfile: $!" );
    my $info = join "", <DUMP>;
    eval $info;
    close DUMP or do_warning( "couldn't close dump file: $!" );
} else {
    do_info( "collect information (in $directory)" );
    if ($cachefile) {
	tie %cache, 'DB_File', $cachefile, O_CREAT|O_RDWR, 0640 
	    or die "E: tie with file $cachefile failed: $!";
    }
    read_sub( $directory );
#FIXME: "untie attempted while 1 inner references still exist"
#    untie %cache if tied %cache;
    do_info( "postprocess collected information" );
    merge_src_bin_packages( \%src_packages, \%bin_packages );
    if ($dumpfile) {
	do_info( "dump backup of collected information" );
	open DUMP, '>', $dumpfile
	    or do_error( "couldn't open dump file $dumpfile: $!" );
	print DUMP Data::Dumper->Dump( [ \%src_packages ],
				       [ '*src_packages' ] );
	close DUMP or do_warning( "couldn't close dump file: $!" );
    }
}
do_info( "begin extracting files" );
extract_from_all( \%src_packages );
do_info( <<STATS );
Statistics:
 Source Packages:   $stats{src_pkgs}
 Cached Info:       $stats{src_cache}
 Already Extracted: $stats{already_extracted}
 Binary Packages:   $stats{bin_pkgs}
 Cached Info:       $stats{bin_cache}
STATS
