#!/usr/bin/perl
use strict;
use warnings;
use Fcntl;
use DBI;
use Time::Local;    # no longer called (getDate uses time()); import kept on purpose

# teisgm
# this version for translating TEI-Lite tdh files in SGML into OAI
# version 2.0 unqualified Dublin Core
# adds fields to MySQL database tables in Paul Cummin's OAI repository at
# http://diglib.lib.utk.edu/oai/
# datestamp used is seconds from the epoch in GMT time
# as well as to add metadata section to hardcopy file (if this is all you want,
# comment out "#" all lines referring to the database calls
# (let me know if you want a script for creating the date for headers)
# type tei (return) and wait
# Jody DeRidder, UTK Libraries, 12/16/02: deridder@aztec.lib.utk.edu
# type teisgm (return) and wait
# NOTE: THIS IS SET FOR SET 10, TDH!!!
# this software provided free without guarantees
#
# Flow: ask the operator to confirm the highest id in table "dc", then for
# every matching SGML file in $thisdir: extract Dublin Core values with
# regexes, show them, ask for confirmation, insert them into the database
# and write them to a file named after the document id in $rightdir.

# ---------------------------------------------------------------------------
# Configuration
# ---------------------------------------------------------------------------
my $netid    = "netid";    # a field in our database to identify worker
my $schema   = "http://www.w3.org/2001/XMLSchema";    # then "-instance" if needed
my $xmlns    = "http://www.openarchives.org/OAI/2.0/OAI_";    # follow with verb
my $purl     = "http://purl.org/dc/elements/1.1/";
my $xsd      = "http://www.openarchives.org/OAI/2.0/";    # follow with format, then xsd
my $username = "username";    # for access to database
my $password = "password";
my $us       = "oai:oai.sunsite.utk.edu:";    # preface to all ids

# NOTE(review): in the copy of this script that was reviewed, every literal
# below contains only whitespace and newlines.  They look like XML wrapper
# fragments whose tags were stripped by whatever produced that copy
# ($schema/$xmlns/$purl/$xsd above are otherwise unused).  They are kept
# as found; restore the markup from a pristine copy -- TODO confirm.
my $begin_rec   = " \n \n";
my $rec_footer  = " \n \n \n";
my $meta_header = " \n";
my $udc_blurb   = " \n";
my $meta_footer = " \n \n";
my $abt_header  = " \n";
my $abt_footer  = " \n \n";

# The 15 unqualified Dublin Core elements.  Each name is used three ways:
# as the label in the on-screen summary, as the <dc:...> element name in
# the output file, and as the name of the database table holding it.
my @DC_FIELDS = qw(
    title creator subject description publisher contributor date type
    format identifier source language relation coverage rights
);

# CHANGE THIS!!!  PUT FILES IN YOUR CHOSEN DIRECTORY
my $thisdir  = "/home/firestar/tdhnew/sgm/";    # where the .sgm/.sgml files live
my $rightdir = "/home/firestar/tdhnew/oai";     # where the output files go

# ---------------------------------------------------------------------------
# Main
# ---------------------------------------------------------------------------

# This database is set up with auto-incrementing identifiers as primary key;
# we still ask a human to confirm the starting point before adding records.
my $lastone = fetch_last_id();

print "verify that this was the last OAI ID used: $lastone\n";
print "Please enter y or n:\n";
my $another = read_choice("Please type y or n:\n", "y", "n");
if (!defined $another || $another eq "n") {
    print "Fix it !!! Bye!!\n";
    exit;
}

opendir(my $dir_handle, $thisdir)
    or die "Can't read directory $thisdir: $!\n";

FILE:
while (defined(my $name = readdir($dir_handle))) {
    print "looking at ".$name." in ".$thisdir."\n";

    # Only files named exactly: 2 lower-case letters, 4 digits, then
    # '.sgm' or '.sgml'.  Anchored so that backups such as
    # "xx0001.sgm.bak" are not imported as a second record.
    next FILE unless $name =~ /\A([a-z]{2}[0-9]{4})\.sgml?\z/;
    my $spc_id = $1;    # document id = file name without the extension
    print " name without .sgm is: $spc_id\n";
    print "opening $name and $spc_id\n";

    # Output files are named by the prefix of the sgm file.  Opened without
    # O_TRUNC so that aborting at the prompt below leaves an existing file
    # intact; it is truncated just before writing.
    my $out_path = $rightdir."/".$spc_id;
    my $out_fh;
    unless (sysopen($out_fh, $out_path, O_WRONLY | O_CREAT, 0666)) {
        warn "Skipping $name: can't open $out_path for writing: $!\n";
        next FILE;
    }

    # Read from the scanned directory, not from whatever the current
    # working directory happens to be.
    my $in_path = "$thisdir/$name";
    sysopen(my $in_fh, $in_path, O_RDONLY) or die "Opening $in_path: $!";
    print "opening $thisdir/$spc_id\n";

    # Both files are available: only now consume the next OAI id.
    $lastone++;
    my $thisone = $lastone;

    # Slurp the whole file.  $/ is localised so the <STDIN> prompts further
    # down still read a single line.
    # NOTE(review): the reviewed copy read "my $line = ;" here; <THAT> (the
    # handle opened just above) is the only handle it can have been.
    my $text = do { local $/; <$in_fh> };
    close($in_fh);
    $text = "" unless defined $text;

    $text =~ s/\n//g;
    $text =~ s/\s{3,}/ /g;       # get rid of excess spaces
    # NOTE(review): the reviewed copy had s/\&\;/and/ here, which looks like
    # an entity-decoded "&amp;" -- restored on that assumption, confirm.
    $text =~ s/\&amp\;/and/g;    # ampersands become 'and'
    $text =~ s/\&/and/g;

    my $dc = extract_tei_fields($text);

    # this is where you can add a description if you want; we
    # added our own identifier, and the collection of the file
    unshift @{ $dc->{description} },
        "Document ID: $spc_id",
        "Tennessee Documentary History Collection";

    # change this; we needed to tweak our output on publisher and contributor
    if (!@{ $dc->{publisher} }) {
        push @{ $dc->{publisher} },
            "University of Tennessee Special Collections Library";
    }
    push @{ $dc->{contributor} },
        "University of Tennessee Special Collections Library, Knoxville";

    # Holding institutions are recognised from the availability statement.
    my $rights = @{ $dc->{rights} } ? $dc->{rights}[0] : "";
    if ($rights =~ / McClung Museum/) {
        push @{ $dc->{contributor} },
            "Frank H. McClung Museum (Knoxville, Tenn.)";
    }
    if ($rights =~ /McClung Historical Collection/) {
        push @{ $dc->{contributor} },
            "Knox County Public Library (Knoxville, Tenn.)";
    }
    if ($rights =~ /Memphis Public Library/) {
        push @{ $dc->{contributor} },
            "Memphis Public Library (Memphis, Tenn.)";
    }
    if ($rights =~ /Tennessee State Library/) {
        push @{ $dc->{contributor} },
            "Tennessee State Library and Archives (Nashville, Tenn.)";
    }

    # this is where you put in your identifier link, where you have the item online
    push @{ $dc->{identifier} },
        "http://anasazi.lib.utk.edu/cgi/t/text/text-idx?c=tdh;view=text;rgn=main;idno=$spc_id";

    # our default language is English, and these files were text/html files
    # with tiff images
    push @{ $dc->{language} }, "en";
    $dc->{type}   = [ "Image",      "Text" ];
    $dc->{format} = [ "Image/tiff", "Text/html" ];

    # Show the operator everything that is about to be stored.
    my $empty_fields = 0;
    print "\nHere are all the fields for the record that you entered:\n\n";
    for my $field (@DC_FIELDS) {
        my @values = @{ $dc->{$field} };
        if (!@values) {
            $empty_fields++;
            next;
        }
        # the first label has no leading blank line, all others do
        my $label = $field eq "title" ? "$field:" : "\n$field:";
        print $label;
        for my $value (@values) {
            print " ".$value."\n";
        }
    }
    if ($empty_fields == @DC_FIELDS) {
        print "Sorry, we cannot enter a record with no DC values.\n";
        exit;
    }

    print "\nIf all this is correct, type y\n";
    print "TO ABORT THIS RECORD AND EXIT, type x\n";
    my $ok = read_choice("Please type y or or x:\n", "y", "x");
    if (!defined $ok || $ok eq "x") {
        print "\nBye-Bye!!\n";
        exit;
    }
    print "got here\n";

    my $mydate = getDate();

    # Reconnect for each record: the connection is not held open while the
    # operator is reading the prompts.
    my $dbh = connect_db();
    print "inserting $name as $thisone\n";

    # Values are bound through placeholders, so no manual quoting is needed.
    run_insert($dbh, "dc: unable to insert $name, $thisone",
        "insert into dc (id,netid,oai_date) values (?,?,?)",
        $thisone, $netid, $mydate);

    # NOTE: THIS IS SET FOR SET 10, TDH!!!
    run_insert($dbh, "set2dc: unable to insert $name, $thisone",
        "insert into set2dc (id,setid,dcid) values (0,?,?)",
        "10", $thisone);

    # Discard any previous content so a shorter record cannot leave stale
    # bytes at the end of the file.
    truncate($out_fh, 0) or warn "Can't truncate $out_path: $!\n";

    print {$out_fh} $meta_header.$udc_blurb;
    for my $field (@DC_FIELDS) {
        for my $value (@{ $dc->{$field} }) {
            print {$out_fh} " <dc:$field>$value</dc:$field>\n";
            run_insert($dbh, "$field: unable to insert $name, $thisone",
                "insert into $field (id, data, dcid) values (0, ?, ?)",
                $value, $thisone);
        }
    }
    print {$out_fh} $meta_footer;

    # Buffered write errors only surface at close.
    close($out_fh) or warn "Problem closing $out_path: $!\n";
    $dbh->disconnect or warn "Disconnection failed: $DBI::errstr\n";
}    # end of looking through this directory
closedir($dir_handle);
exit;

# ---------------------------------------------------------------------------
# Subroutines
# ---------------------------------------------------------------------------

# connect_db()
# Opens a connection to the "dcedit" MySQL database with the configured
# credentials and switches on PrintError and RaiseError for the handle.
# Dies if the connection cannot be made.  Returns the database handle.
sub connect_db {
    my $dbh = DBI->connect("dbi:mysql:dcedit", $username, $password)
        or die "Can't connect to Mysql database: ", $DBI::errstr, "\n";
    $dbh->{PrintError} = 1;
    $dbh->{RaiseError} = 1;
    return $dbh;
}

# fetch_last_id()
# Returns the highest id currently stored in table "dc", or 0 when the
# query yields no value.  Uses its own short-lived connection.
sub fetch_last_id {
    my $dbh = connect_db();
    my $sth = $dbh->prepare(" select max(id) as id from dc");
    $sth->execute();
    my ($last_id) = $sth->fetchrow_array();
    warn "Problem in fetchrow_array(): ", $sth->errstr(), "\n" if $sth->err();
    $sth->finish();
    $dbh->disconnect() or warn "Disconnection failed: $DBI::errstr\n";
    return defined $last_id ? $last_id : 0;
}

# run_insert($dbh, $context, $sql, @bind)
# Prepares and executes one statement with @bind supplied for its
# placeholders.  Dies with "$context : <DBI error>" on any failure so the
# operator can see which file and record were being processed.
sub run_insert {
    my ($dbh, $context, $sql, @bind) = @_;
    my $succeeded = eval {
        my $sth = $dbh->prepare($sql);
        $sth->execute(@bind);
        $sth->finish();
        1;
    };
    die "$context : $@" unless $succeeded;
    return;
}

# read_choice($retry_prompt, @allowed)
# Reads lines from STDIN until one of them equals an entry of @allowed and
# returns that answer; $retry_prompt is printed after every other answer.
# Returns nothing (undef in scalar context) at end of input, so callers
# can treat EOF as "abort" instead of looping forever.
# NOTE(review): the first two prompt reads in the reviewed copy had lost
# their handle ("chop($another = );"); <STDIN> matches the later prompt.
sub read_choice {
    my ($retry_prompt, @allowed) = @_;
    while (defined(my $answer = <STDIN>)) {
        chomp $answer;
        return $answer if grep { $answer eq $_ } @allowed;
        print $retry_prompt;
    }
    return;
}

# trim($string)
# Returns a copy of $string without leading or trailing whitespace.
sub trim {
    my ($string) = @_;
    $string =~ s/\A\s+//;
    $string =~ s/\s+\z//;
    return $string;
}

# split_titles($title)
# Splits a run of "...</title> <title>..." into its parts.  Later titles
# come first in the returned list; the leading part comes last.
sub split_titles {
    my ($title) = @_;
    my @titles;
    while ($title =~ /(.*)<\/title>( )*?<title>(.*)/i) {
        push @titles, $3;
        $title = $1;
    }
    push @titles, $title;
    return @titles;
}

# extract_tei_fields($text)
# Pulls title, creator, publisher, rights, date, summary description and
# LCSH subjects out of $text, which must already have its newlines removed.
# Returns a hash reference with one array reference per name in @DC_FIELDS;
# elements that were not found are empty arrays.
#
# Extraction is staged and stops early, returning what was collected so
# far, when: no title statement with a usable <title> is found; there is
# no publication statement; or a <publisher> tag is opened but never
# closed.
sub extract_tei_fields {
    my ($text) = @_;
    my %dc = map { $_ => [] } @DC_FIELDS;

    # --- title and creator --------------------------------------------------
    # NOTE(review): the reviewed copy had lost the opening <titleStmt> tag
    # here and the first <title> tag in the next pattern; both are restored
    # by analogy with their closing tags and with the no-author branch
    # below -- confirm against a pristine copy.
    return \%dc unless $text =~ /<titleStmt>(.*)<\/titleStmt>/i;
    my $title_stmt = $1;

    if ($title_stmt =~ /<title>(.*)<\/title>( )*?<author>(.*)<\/author>.*/i) {
        my ($title, $creator) = ($1, $3);
        # remove next section
        if ($title =~ /(.*)<\/title>( )*?<author>(.*)<\/author>.*/i) {
            ($title, $creator) = ($1, $3);
        }
        $creator = trim($creator);    # remove leading/ending spaces

        # multiple authors: peel them off from the end
        while ($creator =~ /(.*)<\/author>( )*?<author>(.*)/i) {
            push @{ $dc{creator} }, $3;
            $creator = $1;
        }
        push @{ $dc{creator} }, $creator;
        push @{ $dc{title} }, split_titles($title);
    }
    # if no authors, this one will still work
    elsif ($title_stmt =~ /<title>(.*)<\/title>( )*?<resp.*/i) {
        my $title = $1;
        push @{ $dc{title} }, split_titles($title);
    }
    else {
        return \%dc;
    }

    # --- publisher ----------------------------------------------------------
    return \%dc
        unless $text =~ /<PUBLICATIONSTMT>(\s)*?(.*)<\/PUBLICATIONSTMT>/i;
    my $pub_stmt = $2;
    if ($pub_stmt =~ /<publisher>(.*)(\s)*/i) {
        my $pub = $1;
        # an unclosed <publisher> ends extraction for this file
        return \%dc unless $pub =~ /(\s)*(.*)<\/publisher>/i;    # take out end tag
        push @{ $dc{publisher} }, trim($2);
    }

    # --- rights -------------------------------------------------------------
    if ($text =~ /<p>(.*)<\/p>( )*?<\/availability>/i) {
        push @{ $dc{rights} }, $1;
    }

    # --- date ---------------------------------------------------------------
    # The date follows the last author, or the title when there are no
    # authors.
    if (   $text =~ /author>( )*?<date value="(.*)">(.*)<\/date>/i
        || $text =~ /title>( )*?<date value="(.*)">.*<\/date>/i)
    {
        my $date = $2;
        $date =~ s/">.*//s;    # keep only what precedes the first '">'
        push @{ $dc{date} }, $date if length $date;
    }

    # --- description (summary note) -----------------------------------------
    if ($text =~ /.*<note type="summary">(.*)(\s)*<\/note>( *)?<note.*/i) {
        my $summary = $1;
        while ($summary =~ /(.*)(\s)*<\/note>( *)?<note.*/i) {
            $summary = $1;
        }
        while ($summary =~ /(.*)<xref>(.*)<\/xref>(.*)/i) {
            $summary = $1." ".$2.$3;
        }
        $summary =~ s/ {3,}/ /g;    # correct for overspacing in document
        $summary =~ s/<HI REND="i">/"/ig;
        $summary =~ s/<\/HI>/"/ig;
        push @{ $dc{description} }, $summary if $summary;
    }

    # --- subjects (LCSH keyword list) ---------------------------------------
    my $keywords = "";
    if ($text =~ /<profileDesc>(\s)*?<textClass>(\s)*?<keywords scheme="LCSH">(\s)*?<list>(.*)(\s)*?<\/keywords>(\s)*?<\/textClass>(\s)*?<\/profileDesc>/i) {
        $keywords = $4;
    }
    if ($keywords =~ /<item>(.*)<\/item>/i) {
        my $items = $1;
        while ($items =~ /(.*)<\/item>( )*?<item>(.+)/i) {
            push @{ $dc{subject} }, $3;
            $items = $1;
        }
        push @{ $dc{subject} }, $items;
    }

    return \%dc;
}

# getDate()
# Returns the current time as seconds since the epoch (GMT) and prints it
# as "time: N".  time() yields the same value that parsing `date -u` and
# passing the parts to timegm() would, without depending on the locale or
# format of the external date command.
sub getDate {
    my $mydate = time;
    print "time: $mydate\n";
    return $mydate;
}