#!/usr/athena/bin/perl
#
# marc2txt
#
# Translate MARC format data into plain text data.
#
# This script can be used to extract data from a MARC file.
# It can even translate whole files into editable text equivalents.
# In conjunction with txt2marc it can be used to make changes to 
# MARC files. If no -f is given, marc2txt will read standard input.
#
# To just see a tiny bit of data about every record:
#        cat marcfile | marc2txt -r
#
# To see all tags for the first 10 records in the file:
#        cat marcfile | marc2txt -ax
#
# To record info, 245 $a, and 910 tags:
#        marc2txt -f marcfile -r -t"245a 910"
# 
# In theory doing "cat marc1 | marc2txt -a | txt2marc > marc2" would
# result in identical marc1 and marc2 files. In practice there are
# often minor differences in marc1 and marc2, so there must be bugs 
# lurking in the code. This is likely to be due to spaces surrounding
# subfield tags, though I can't be sure.
#
#
#    Written in Summer 1995 by Eric Celeste for the MIT Libraries. 
#    <efc@mit.edu>
#
# 960528  (efc)   fixed so record count (recnum) starts with 1, not zero
# 970303  (efc)   tried to make it more graceful about poor marc data
# 970317  (efc)   notice non-oclc character set & multi-terminated tags
# 970318  (efc)   deals with \0 padded recs & accept comma delimited -x lists
# 970319  (efc)   distinguish non-OCLC char types
# 970324  (efc)   oh, gosh, C++
# 970807  (efc)   added -s to split files into # of record chunks
# 970808  (efc)   undid -s, see marcsplit for this functionality
# 970819  (efc)   added non-numeric tag warning (seen in some BNA bibs)
# 970922  (efc)   fixe \xa1 bug in non-OCLC char set (polish L)
# 980708  (efc)   allow LDR in the -t option to have an effect
# 980724  (orbitee) added -F and -R options, to filter out certain records

require "getopts.pl";

&Getopts('hx:vrat:cf:mewnds:o:bFR');

if ($opt_h) {
	print " h   Print this help information\n";
	print " x <number> Experimental, only show few records\n";
	print " v   Verbose, show all kinds of messages\n";
	print " r   Record, show some info about every record (including transaction)\n";
	print " a   All, show all tags from each record\n";
	print " t <list> Tags, contains list of tags to show, like \"949a 910\"\n";
	print " c   Content only, show only requested data, \n";
	print "     no other context, no tags, no subfields\n";
	print " f <name> File, use this file for input\n";
	print " m   MARC record raw\n";
	print " e   Supress errors\n";
	print " w   Washed clean MARC record\n";
	print " d   Departmental theses, oldstyle to newstyle mapping\n";
	print " n   New line between records\n";
	print " b   Screen out brief records (rectype=b biblevel=r)\n";
	print " F   Filter - only records containing tags listed in -t\n";
	print " R   Reverse filter - skip records containing tags\n";
	exit;
}

#### CONSTANTS ####

$marcDel = "\x1f"; # the MARC delimiter (part of efc970303 update)
$marcFT = "\x1e";  # the MARC field terminator
$marcRT = "\x1d";  # the MARC record terminator

# create a set of all non-oclc characters for error checking later
$nonoclcL = "\xa0"; # glis script L?
$nonoclc0 = "\xc0"; # glis superscript zero (degrees, angstrom)?
$nonoclcS = "\xc1\xc2\xc3\xc4\xc5\xc6\xc7\xc8\xc9"  # glis superscripts
  . "\xd0\xd1\xd2\xd3\xd4\xd5\xd6\xd7\xd8\xd9";     # glis subscripts
$nonoclcU = ""     # unknown non-oclc characters
  . "\x00\x01\x02\x03\x04\x05\x06\x07\x08\x09\x0a\x0b\x0c\x0d\x0e\x0f"
  . "\x10\x11\x12\x13\x14\x15\x16\x17\x18\x19\x1a\x1c"
  . "\x5e\x5f"
  . "\x60"
  . "\x7f"
  . "\x80\x81\x82\x83\x84\x85\x86\x87\x88\x89\x8a\x8b\x8c\x8d\x8e\x8f"
  . "\x90\x91\x92\x93\x94\x95\x96\x97\x98\x99\x9a\x9b\x9c\x9d\x9e\x9f"
  . "\xaf"   # until 970922 had \xa1 by accident
  . "\xbb\xbf"
  . "\xca\xcb\xcc\xcd\xce\xcf"
  . "\xda\xdb\xdc\xdd\xde\xdf"
  . "\xfd\xfe\xff";
$nonoclc = $nonoclcL . $nonoclc0 . $nonoclcS . $nonoclcU;

$leadersize = 24;
$tagsize = 3;

# the whole notion of transactions is OCLC specific
%transaction = (
	'01', 'PRODUCE . . . . . . .',
	'02', 'UPDATE. . . . . . . .',
	'03', 'CANCEL. . . . . . . .',
	'11', 'REPLACE . . . . . . .',
	'50', 'ALL PRODUCE . . . . .',
	'90', 'OFFLINE RETRIEVE. . .',
	'92', 'OFFLINE UPDATE. . . .',
	'93', 'OFFLINE CANCEL UPDATE',
	'94', 'MICROCON UPDATE . . .',
	'78', 'Exported CD or ME . .',
	'65', 'Exported via PRISM. .',
);

#### PARAMETERS ####

# are we looking to only print certain tags?
if ($opt_t) {
	@tag = split(/ /, $opt_t);
	foreach $t (@tag) {
		if (length($t) > 3) {
			($t, $m) = split(/=/, $t, 2);
			if (length($t) > 3) {
				$tag{substr($t,0,3)} = substr($t,3);
			} else {
				$tag{$t} = " ";
			}
			# if there's an = sign we are trying to match data
			if ($m) {
				$match{substr($t,0,3)} = $m;
				print "$t should match $m\n" if $opt_v;
			}
		} else {
			$tag{$t} = " ";
		}
	}
}
if ($opt_x) {
	if ($opt_x =~ /[a-zA-Z]/) {
		open(X,"< $opt_x") || die "Could not open \"$opt_x\".\n";
		$xArray[0] = 1; # so we know we've been here
		while (<X>) {
			chop;
			$xHash{$_} = 1;
		}
		close(X);
	} elsif ($opt_x =~ /-/) {
		($xfirst, $xlast) = split(/-/,$opt_x);
		$xfirst = 0 if ($xfirst eq "");
		$xlast = 999999 if ($xlast eq "");
	} elsif ($opt_x =~ /,/) {
		@xArray = split(/,/,$opt_x);
		foreach $x (@xArray) {
			$xHash{$x} = 1;
		}
	} else {
		$xfirst = $opt_x;
		$xlast = $opt_x;
	}

}

#### MAIN PROGRAM ####

# standard input or file?
if ($opt_f) {
	open(IN, "< $opt_f") || die "Could not open \"$opt_f\".\n";
} else {
	open(IN, "-");
}

$defaultMatched = defined %match ? 0 : 1;
	# in other words, if we've defined some matches, 
	# then don't assume this record meets the criteria.
	# on the other hand, if there are no matches,
	# then every record is a match.

$/ = $marcRT; # read data in one record at a time

# loop through all the records in the file
MARCRECORD: while(<IN>) {
	$recnum++;

	$matched = $defaultMatched;
	
	if ($opt_x) {
		if ($xArray[0]) {
			$go = $xHash{$recnum};
		} else {
			$go = (($xfirst <= $recnum) && ($recnum <= $xlast));
		}
	} else {
		$go = 1;
	}
	
	if ($go) {
		
		$errors = "";
		
		$marcRaw = $_; # this is the raw record
		if (/^ /) {
			$errors .= "OOPS leading spaces found on record (#$recnum)\n";
			s/^ +//;
		}
		if (/^\000/) {
			$errors .= "OOPS leading nulls found on record (#$recnum)\n";
			s/^\000+//;
		}
		$marcRecord = $_; # this is the record minus leading spaces
		
		# @marcField = split(/$marcFT/o);
		($marcHeader, $marcData) = split(/$marcFT/o, $marcRecord, 2);
		@marcField = split(/$marcFT/o, $marcData);
		$marcWashD = $marcData; # a place to hold washed data

		$leader = substr($marcHeader, 0, $leadersize);
		$washD = $directory = substr($marcHeader, $leadersize);
		
		# Pull apart the leader
		($reclen, $recstat, $rectype, $biblvl, $blanks, $indcount, 
			$subcount, $baseaddr, $enclvl, $desc, $linked, $lolof, 
			$loscp, $tcode, $undefined
		) = unpack("a5 a1 a1 a1 a2 a1 a1 a5 a1 a1 a1 a1 a1 H2 a1", $leader);
				
		# Decipher tcode, really only an OCLC construct
		$recinfo = "$tcode $transaction{$tcode} $oclc ($enclvl$desc) #$recnum\n";
		
		# Fill in the basics of full & brief output
		$brief = "";
		$full = "---|-=-=-=-=-|-=-=-=-=-|-=-=-=-=-|-=-=-=-=-|-=-=-=-=-| "; 
		$full .= "$recnum\n";
		$full .= "LDR $leader\n";
		
		# look in tag array for LDR and if present add to brief record
		if ($tag{"LDR"} eq " ") {
			$brief .= "LDR $leader\n";
		}
		
		# length of a directory entry is based on data from the leader
		$entrylen = $tagsize + $lolof + $loscp;
		$numofentries = length($directory) / $entrylen;
		
		# Look for errors
		print "$numofentries entries in directory, $reclen bytes long\n" if $opt_v;
		if (length($marcRecord) != $reclen) {
			$errors .= "OOPS record length: $reclen claimed <> ";
			$errors .= length($marcRecord);
			$errors .= " found #$recnum\n";
		}
		if ($numofentries != $#marcField) {
			$errors .= "OOPS number of fields: $numofentries in dir <> ";
			$errors .= $#marcField;
			$errors .= " found #$recnum\n";
		}
		if ($marcData =~ /[$nonoclc]/o) {
			$errors .= "OOPS non-OCLC (";
			$errors .= ($marcData =~ /$nonoclcL/o) ? "L" : " ";
			$errors .= ($marcData =~ /$nonoclc0/o) ? "0" : " ";
			$errors .= ($marcData =~ /[$nonoclcS]/) ? "S" : " ";
			$errors .= ($marcData =~ /[$nonoclcU]/) ? "U" : " ";
			$errors .= ") characters found #$recnum\n";
			$marcWashD =~ s/$nonoclcL/\xbe/go; # script L?
			$marcWashD =~ s/$nonoclc0/\xea/go; # degree/angstrom?
			$marcWashD =~ s/\xd0/0/g; # sub & superscript numbers?
			$marcWashD =~ s/[\xc1\xd1]/1/g;
			$marcWashD =~ s/[\xc2\xd2]/2/g;
			$marcWashD =~ s/[\xc3\xd3]/3/g;
			$marcWashD =~ s/[\xc4\xd4]/4/g;
			$marcWashD =~ s/[\xc5\xd5]/5/g;
			$marcWashD =~ s/[\xc6\xd6]/6/g;
			$marcWashD =~ s/[\xc7\xd7]/7/g;
			$marcWashD =~ s/[\xc8\xd8]/8/g;
			$marcWashD =~ s/[\xc9\xd9]/9/g;
			$marcWashD =~ s/C\xda\xda/C\+\+/g; # C++?
			$marcWashD =~ s/C\xca\xca/C\+\+/g; # C++?
			$marcWashD =~ s/[$nonoclc]/|/go;
			
		}
		
		# Stuff data & tags into an array
		for ($entry = 0; $entry < $numofentries; $entry++) { # lcs990324 fix
			$entrydata = substr($directory, $entry * $entrylen, $entrylen);
			($tag, $length, $offset) 
				= unpack("a$tagsize a$lolof a$loscp", $entrydata);
			# $content = $marcField[$entry];
			$washC = $content = substr($marcData, $offset, $length - 1); # don't need eof indicator
			
			if ($tag =~ /\D/) {
				$errors .= "OOPS non-numeric tag '$tag' #$recnum\n";
			}

			if (substr($content,-1) eq $marcFT) {
				$errors .= "OOPS field $tag terminated too often #$recnum\n";
				# eliminate the extra field terminators
				$washC =~ s/$marcFT/|/go;
				substr($marcWashD, $offset, $length - 1) = $washC;
			}
	
			# first build a subfield array for later analysis
			@subfield = split(/$marcDel/o,"-".$content);
			# then make the subfields presentable to humans
			$content =~ s/\|/\|\|/g;
			$content =~ s/$marcDel/\|/go;
	
			# now we work on building a brief record of only the
			# specific tags we asked for with the -t switch
			if ($tag{$tag} eq " ") {
				# shortcut for including all subfields
				$brief .= "$tag $content\n";
			} elsif (defined $tag{$tag}) {
				# method for including only specific subfields
				$brief .= "$tag $subfield[0]" if (! $opt_c);
				foreach $s (@subfield) {
					$delimiter = substr($s,0,1);
					if (index($tag{$tag},$delimiter) >= $[) {
						$c = substr($s,1);
						if ($opt_c) {
							$brief .= "$c\n";
						} else {
							$brief .= "|$delimiter$c";
						}
					}
				}
				$brief .= "\n" if (! $opt_c);
			}
			
			# and, of course, we include everyting in the full record
			$full .= "$tag $content\n";
			undef @subfield;
			
			if (defined $match{$tag}) {
				if (($match{$tag} eq "*") || ($content =~ /$match{$tag}/)) {
					$matched{$tag} = 1;
				}
			}

			# do some special mapping for old style dept theses
			if ($opt_d) {
				if ($tag eq "099") {
					$thesis = ($washC =~ /\x1faThesis/) ? 1 : $thesis;
				}
				if ( $thesis # move 410 thesis to 502
					&& ($tag eq "410")
					&& (substr($washC,0,2) eq "20")
					&& ($washC =~ /\x1ftThesis/)
				) {
					substr($entrydata,0,3) = "502";
					substr($washD, $entry * $entrylen, $entrylen) = $entrydata;
					substr($washC,0,2) = "  ";
					$washC =~ s/^(..\x1fa)(.*)\x1ft(Thesis.*)/\1\3--\2/;
					$washC =~ s/\x1f[^a]/  /g;
					substr($marcWashD, $offset, $length - 1) = $washC;
				}
				if ( $thesis # move 710 thesis to 917
					&& ($tag eq "710")
					&& (substr($washC,0,2) eq "20")
					&& ($washC =~ /\x1ftThesis/)
				) {
					substr($entrydata,0,3) = "917";
					substr($washD, $entry * $entrylen, $entrylen) = $entrydata;
					substr($washC,0,2) = "  ";
					$washC =~ s/^(..\x1fa)(.*)\x1ft(Thesis.*)/\1\3--\2/;
					$washC =~ s/\x1f[^a]/  /g;
					substr($marcWashD, $offset, $length - 1) = $washC;
				}
				if ( $thesis # move supervised by note to 599
					&& ($tag eq "500")
					&& ($washC =~ /\x1faSupervised by/)
				) {
					substr($entrydata,0,3) = "599";
					substr($washD, $entry * $entrylen, $entrylen) = $entrydata;
					substr($washC,0,2) = "00";
					substr($marcWashD, $offset, $length - 1) = $washC;					
				}
			}
		} # for entry
		
		$matched = 1;
		# to "and" the matches together, assume we've matched
		CHECK: foreach $key (keys %match) {
			# but reverse our assumption if any of the matches are false
			if ($matched{$key} != 1) {
				$matched = 0;
				last CHECK;
			}
		}
		undef %matched;
		
		$thesis = 0;

		if ($opt_b) { # this skips all brief records
		              # probably could find a better spot (efc)
			next MARCRECORD if (($rectype eq "b") && ($biblvl eq "r"));
		}

		if ($opt_F) {
			next MARCRECORD if ($matched == 0);
		}

		if ($opt_R) {
			next MARCRECORD if $matched;
			$matched = 1;
		}

		print $recinfo if ($opt_r && $matched);
		
		print $errors if ((! $opt_e) && $matched);
		
		if ($opt_a) {
			print $full if $matched;
		} else {
			print $brief if ($matched && !$opt_F);
		}
		
		if ($opt_m) { # print the raw marc record, if it was requested
			print $marcRaw if $matched;
		}

		if ($opt_w) { # print the raw marc record, if it was requested
			print $leader.$washD.$marcFT.$marcWashD if $matched;
		}
		
		print "\n" if ($opt_n && $matched);

	} # go
	
} # while IN

if ($opt_s) {
	close(OUT);
}

close(IN);


