#!/usr/athena/bin/perl # # marc2txt # # Translate MARC format data into plain text data. # # This script can be used to extract data from a MARC file. # It can even translate whole files into editable text equivalents. # In conjunction with txt2marc it can be used to make changes to # MARC files. If no -f is given, marc2txt will read standard input. # # To just see a tiny bit of data about every record: # cat marcfile | marc2txt -r # # To see all tags for the first 10 records in the file: # cat marcfile | marc2txt -ax # # To record info, 245 $a, and 910 tags: # marc2txt -f marcfile -r -t"245a 910" # # In theory doing "cat marc1 | marc2txt -a | txt2marc > marc2" would # result in identical marc1 and marc2 files. In practice there are # often minor differences in marc1 and marc2, so there must be bugs # lurking in the code. This is likely to be due to spaces surrounding # subfield tags, though I can't be sure. # # # Written in Summer 1995 by Eric Celeste for the MIT Libraries. # # # 960528 (efc) fixed so record count (recnum) starts with 1, not zero # 970303 (efc) tried to make it more graceful about poor marc data # 970317 (efc) notice non-oclc character set & multi-terminated tags # 970318 (efc) deals with \0 padded recs & accept comma delimited -x lists # 970319 (efc) distinguish non-OCLC char types # 970324 (efc) oh, gosh, C++ # 970807 (efc) added -s to split files into # of record chunks # 970808 (efc) undid -s, see marcsplit for this functionality # 970819 (efc) added non-numeric tag warning (seen in some BNA bibs) # 970922 (efc) fixe \xa1 bug in non-OCLC char set (polish L) # 980708 (efc) allow LDR in the -t option to have an effect # 980724 (orbitee) added -F and -R options, to filter out certain records require "getopts.pl"; &Getopts('hx:vrat:cf:mewnds:o:bFR'); if ($opt_h) { print " h Print this help information\n"; print " x Experimental, only show few records\n"; print " v Verbose, show all kinds of messages\n"; print " r Record, show some info about every record (including transaction)\n"; print " a All, show all tags from each record\n"; print " t Tags, contains list of tags to show, like \"949a 910\"\n"; print " c Content only, show only requested data, \n"; print " no other context, no tags, no subfields\n"; print " f File, use this file for input\n"; print " m MARC record raw\n"; print " e Supress errors\n"; print " w Washed clean MARC record\n"; print " d Departmental theses, oldstyle to newstyle mapping\n"; print " n New line between records\n"; print " b Screen out brief records (rectype=b biblevel=r)\n"; print " F Filter - only records containing tags listed in -t\n"; print " R Reverse filter - skip records containing tags\n"; exit; } #### CONSTANTS #### $marcDel = "\x1f"; # the MARC delimiter (part of efc970303 update) $marcFT = "\x1e"; # the MARC field terminator $marcRT = "\x1d"; # the MARC record terminator # create a set of all non-oclc characters for error checking later $nonoclcL = "\xa0"; # glis script L? $nonoclc0 = "\xc0"; # glis superscript zero (degrees, angstrom)? $nonoclcS = "\xc1\xc2\xc3\xc4\xc5\xc6\xc7\xc8\xc9" # glis superscripts . "\xd0\xd1\xd2\xd3\xd4\xd5\xd6\xd7\xd8\xd9"; # glis subscripts $nonoclcU = "" # unknown non-oclc characters . "\x00\x01\x02\x03\x04\x05\x06\x07\x08\x09\x0a\x0b\x0c\x0d\x0e\x0f" . "\x10\x11\x12\x13\x14\x15\x16\x17\x18\x19\x1a\x1c" . "\x5e\x5f" . "\x60" . "\x7f" . "\x80\x81\x82\x83\x84\x85\x86\x87\x88\x89\x8a\x8b\x8c\x8d\x8e\x8f" . "\x90\x91\x92\x93\x94\x95\x96\x97\x98\x99\x9a\x9b\x9c\x9d\x9e\x9f" . "\xaf" # until 970922 had \xa1 by accident . "\xbb\xbf" . "\xca\xcb\xcc\xcd\xce\xcf" . "\xda\xdb\xdc\xdd\xde\xdf" . "\xfd\xfe\xff"; $nonoclc = $nonoclcL . $nonoclc0 . $nonoclcS . $nonoclcU; $leadersize = 24; $tagsize = 3; # the whole notion of transactions is OCLC specific %transaction = ( '01', 'PRODUCE . . . . . . .', '02', 'UPDATE. . . . . . . .', '03', 'CANCEL. . . . . . . .', '11', 'REPLACE . . . . . . .', '50', 'ALL PRODUCE . . . . .', '90', 'OFFLINE RETRIEVE. . .', '92', 'OFFLINE UPDATE. . . .', '93', 'OFFLINE CANCEL UPDATE', '94', 'MICROCON UPDATE . . .', '78', 'Exported CD or ME . .', '65', 'Exported via PRISM. .', ); #### PARAMETERS #### # are we looking to only print certain tags? if ($opt_t) { @tag = split(/ /, $opt_t); foreach $t (@tag) { if (length($t) > 3) { ($t, $m) = split(/=/, $t, 2); if (length($t) > 3) { $tag{substr($t,0,3)} = substr($t,3); } else { $tag{$t} = " "; } # if there's an = sign we are trying to match data if ($m) { $match{substr($t,0,3)} = $m; print "$t should match $m\n" if $opt_v; } } else { $tag{$t} = " "; } } } if ($opt_x) { if ($opt_x =~ /[a-zA-Z]/) { open(X,"< $opt_x") || die "Could not open \"$opt_x\".\n"; $xArray[0] = 1; # so we know we've been here while () { chop; $xHash{$_} = 1; } close(X); } elsif ($opt_x =~ /-/) { ($xfirst, $xlast) = split(/-/,$opt_x); $xfirst = 0 if ($xfirst eq ""); $xlast = 999999 if ($xlast eq ""); } elsif ($opt_x =~ /,/) { @xArray = split(/,/,$opt_x); foreach $x (@xArray) { $xHash{$x} = 1; } } else { $xfirst = $opt_x; $xlast = $opt_x; } } #### MAIN PROGRAM #### # standard input or file? if ($opt_f) { open(IN, "< $opt_f") || die "Could not open \"$opt_f\".\n"; } else { open(IN, "-"); } $defaultMatched = defined %match ? 0 : 1; # in other words, if we've defined some matches, # then don't assume this record meets the criteria. # on the other hand, if there are no matches, # then every record is a match. $/ = $marcRT; # read data in one record at a time # loop through all the records in the file MARCRECORD: while() { $recnum++; $matched = $defaultMatched; if ($opt_x) { if ($xArray[0]) { $go = $xHash{$recnum}; } else { $go = (($xfirst <= $recnum) && ($recnum <= $xlast)); } } else { $go = 1; } if ($go) { $errors = ""; $marcRaw = $_; # this is the raw record if (/^ /) { $errors .= "OOPS leading spaces found on record (#$recnum)\n"; s/^ +//; } if (/^\000/) { $errors .= "OOPS leading nulls found on record (#$recnum)\n"; s/^\000+//; } $marcRecord = $_; # this is the record minus leading spaces # @marcField = split(/$marcFT/o); ($marcHeader, $marcData) = split(/$marcFT/o, $marcRecord, 2); @marcField = split(/$marcFT/o, $marcData); $marcWashD = $marcData; # a place to hold washed data $leader = substr($marcHeader, 0, $leadersize); $washD = $directory = substr($marcHeader, $leadersize); # Pull apart the leader ($reclen, $recstat, $rectype, $biblvl, $blanks, $indcount, $subcount, $baseaddr, $enclvl, $desc, $linked, $lolof, $loscp, $tcode, $undefined ) = unpack("a5 a1 a1 a1 a2 a1 a1 a5 a1 a1 a1 a1 a1 H2 a1", $leader); # Decipher tcode, really only an OCLC construct $recinfo = "$tcode $transaction{$tcode} $oclc ($enclvl$desc) #$recnum\n"; # Fill in the basics of full & brief output $brief = ""; $full = "---|-=-=-=-=-|-=-=-=-=-|-=-=-=-=-|-=-=-=-=-|-=-=-=-=-| "; $full .= "$recnum\n"; $full .= "LDR $leader\n"; # look in tag array for LDR and if present add to brief record if ($tag{"LDR"} eq " ") { $brief .= "LDR $leader\n"; } # length of a directory entry is based on data from the leader $entrylen = $tagsize + $lolof + $loscp; $numofentries = length($directory) / $entrylen; # Look for errors print "$numofentries entries in directory, $reclen bytes long\n" if $opt_v; if (length($marcRecord) != $reclen) { $errors .= "OOPS record length: $reclen claimed <> "; $errors .= length($marcRecord); $errors .= " found #$recnum\n"; } if ($numofentries != $#marcField) { $errors .= "OOPS number of fields: $numofentries in dir <> "; $errors .= $#marcField; $errors .= " found #$recnum\n"; } if ($marcData =~ /[$nonoclc]/o) { $errors .= "OOPS non-OCLC ("; $errors .= ($marcData =~ /$nonoclcL/o) ? "L" : " "; $errors .= ($marcData =~ /$nonoclc0/o) ? "0" : " "; $errors .= ($marcData =~ /[$nonoclcS]/) ? "S" : " "; $errors .= ($marcData =~ /[$nonoclcU]/) ? "U" : " "; $errors .= ") characters found #$recnum\n"; $marcWashD =~ s/$nonoclcL/\xbe/go; # script L? $marcWashD =~ s/$nonoclc0/\xea/go; # degree/angstrom? $marcWashD =~ s/\xd0/0/g; # sub & superscript numbers? $marcWashD =~ s/[\xc1\xd1]/1/g; $marcWashD =~ s/[\xc2\xd2]/2/g; $marcWashD =~ s/[\xc3\xd3]/3/g; $marcWashD =~ s/[\xc4\xd4]/4/g; $marcWashD =~ s/[\xc5\xd5]/5/g; $marcWashD =~ s/[\xc6\xd6]/6/g; $marcWashD =~ s/[\xc7\xd7]/7/g; $marcWashD =~ s/[\xc8\xd8]/8/g; $marcWashD =~ s/[\xc9\xd9]/9/g; $marcWashD =~ s/C\xda\xda/C\+\+/g; # C++? $marcWashD =~ s/C\xca\xca/C\+\+/g; # C++? $marcWashD =~ s/[$nonoclc]/|/go; } # Stuff data & tags into an array for ($entry = 0; $entry < $numofentries; $entry++) { # lcs990324 fix $entrydata = substr($directory, $entry * $entrylen, $entrylen); ($tag, $length, $offset) = unpack("a$tagsize a$lolof a$loscp", $entrydata); # $content = $marcField[$entry]; $washC = $content = substr($marcData, $offset, $length - 1); # don't need eof indicator if ($tag =~ /\D/) { $errors .= "OOPS non-numeric tag '$tag' #$recnum\n"; } if (substr($content,-1) eq $marcFT) { $errors .= "OOPS field $tag terminated too often #$recnum\n"; # eliminate the extra field terminators $washC =~ s/$marcFT/|/go; substr($marcWashD, $offset, $length - 1) = $washC; } # first build a subfield array for later analysis @subfield = split(/$marcDel/o,"-".$content); # then make the subfields presentable to humans $content =~ s/\|/\|\|/g; $content =~ s/$marcDel/\|/go; # now we work on building a brief record of only the # specific tags we asked for with the -t switch if ($tag{$tag} eq " ") { # shortcut for including all subfields $brief .= "$tag $content\n"; } elsif (defined $tag{$tag}) { # method for including only specific subfields $brief .= "$tag $subfield[0]" if (! $opt_c); foreach $s (@subfield) { $delimiter = substr($s,0,1); if (index($tag{$tag},$delimiter) >= $[) { $c = substr($s,1); if ($opt_c) { $brief .= "$c\n"; } else { $brief .= "|$delimiter$c"; } } } $brief .= "\n" if (! $opt_c); } # and, of course, we include everyting in the full record $full .= "$tag $content\n"; undef @subfield; if (defined $match{$tag}) { if (($match{$tag} eq "*") || ($content =~ /$match{$tag}/)) { $matched{$tag} = 1; } } # do some special mapping for old style dept theses if ($opt_d) { if ($tag eq "099") { $thesis = ($washC =~ /\x1faThesis/) ? 1 : $thesis; } if ( $thesis # move 410 thesis to 502 && ($tag eq "410") && (substr($washC,0,2) eq "20") && ($washC =~ /\x1ftThesis/) ) { substr($entrydata,0,3) = "502"; substr($washD, $entry * $entrylen, $entrylen) = $entrydata; substr($washC,0,2) = " "; $washC =~ s/^(..\x1fa)(.*)\x1ft(Thesis.*)/\1\3--\2/; $washC =~ s/\x1f[^a]/ /g; substr($marcWashD, $offset, $length - 1) = $washC; } if ( $thesis # move 710 thesis to 917 && ($tag eq "710") && (substr($washC,0,2) eq "20") && ($washC =~ /\x1ftThesis/) ) { substr($entrydata,0,3) = "917"; substr($washD, $entry * $entrylen, $entrylen) = $entrydata; substr($washC,0,2) = " "; $washC =~ s/^(..\x1fa)(.*)\x1ft(Thesis.*)/\1\3--\2/; $washC =~ s/\x1f[^a]/ /g; substr($marcWashD, $offset, $length - 1) = $washC; } if ( $thesis # move supervised by note to 599 && ($tag eq "500") && ($washC =~ /\x1faSupervised by/) ) { substr($entrydata,0,3) = "599"; substr($washD, $entry * $entrylen, $entrylen) = $entrydata; substr($washC,0,2) = "00"; substr($marcWashD, $offset, $length - 1) = $washC; } } } # for entry $matched = 1; # to "and" the matches together, assume we've matched CHECK: foreach $key (keys %match) { # but reverse our assumption if any of the matches are false if ($matched{$key} != 1) { $matched = 0; last CHECK; } } undef %matched; $thesis = 0; if ($opt_b) { # this skips all brief records # probably could find a better spot (efc) next MARCRECORD if (($rectype eq "b") && ($biblvl eq "r")); } if ($opt_F) { next MARCRECORD if ($matched == 0); } if ($opt_R) { next MARCRECORD if $matched; $matched = 1; } print $recinfo if ($opt_r && $matched); print $errors if ((! $opt_e) && $matched); if ($opt_a) { print $full if $matched; } else { print $brief if ($matched && !$opt_F); } if ($opt_m) { # print the raw marc record, if it was requested print $marcRaw if $matched; } if ($opt_w) { # print the raw marc record, if it was requested print $leader.$washD.$marcFT.$marcWashD if $matched; } print "\n" if ($opt_n && $matched); } # go } # while IN if ($opt_s) { close(OUT); } close(IN);