#!/usr/bin/perl
#
# teiMakexml -- translate XMetaL TEI-Lite "tdh" XML files into OAI_DC 2.
#
# For every matching file in $thisdir the script
#   * extracts title, author(s), publisher, rights, date, summary and
#     subject items from the TEI header,
#   * writes a metadata section to a file named after the new OAI id in
#     $rightdir (the hardcopy file used by the search engine), and
#   * adds the record to the oai2 tables udc, xml_udc, mdf and ids.
#
# Usage: type teiMakexml (return) and wait.
#
# Jody DeRidder 9/08/02
#
# After the first 2 lines of an input file, the data appears to be all on
# one line; the extraction below relies on that.

use strict;
use warnings;

use Fcntl;
use DBI;

# ---------------------------------------------------------------------
# Configuration
# ---------------------------------------------------------------------

# Kept from the original script; none of these five is referenced below.
my $schema = "http://www.w3.org/2001/XMLSchema";            # then "-instance" if needed
my $xmlns  = "http://www.openarchives.org/OAI/2.0/OAI_";    # follow with verb
my $purl   = "http://purl.org/dc/elements/1.1/";
my $xsd    = "http://www.openarchives.org/OAI/2.0/";        # follow with format, then xsd
my $email  = "deridder\@cs.utk.edu";

my $username = "username";
my $password = "password";
my $us       = "oai:oai.sunsite.utk.edu:";                  # preface to all ids

# NOTE(review): in the copy of this script that was reviewed, the XML
# markup inside the following template strings appears to have been
# stripped -- only whitespace and "\n" remain. They are reproduced as
# found. Restore the original tags from a pristine copy before relying
# on the generated records.
my $begin_rec   = " \n \n";
my $rec_footer  = " \n \n \n";
my $meta_header = " \n";
my $udc_blurb   = " \n";
my $meta_footer = " \n \n";
my $abt_header  = " \n";        # not referenced below
my $abt_footer  = " \n \n";     # not referenced below

my $udc_table = "udc";          # contains untagged fields separately
my $xml_udc   = "xml_udc";      # contains full file as blob
my $mdf_table = "mdf";          # tracks metadata formats available for each file
my $id_table  = "ids";          # tracks ids of files; a cross-reference

# CHANGE THIS!!! PUT FILES IN YOUR CHOSEN DIRECTORY
my $thisdir  = "/usr/sunsite/htdocs/oai/newfiles/tdh_xml/";
my $rightdir = "/usr/sunsite/htdocs/oai/2.0/";

# Dublin Core elements in output order. The udc table uses the same
# names, in the same order, for its columns.
my @DC_ELEMENTS = qw(
    title creator subject description publisher contributor date type
    format identifier source language relation coverage rights
);

# Elements whose copy in the xml_udc blob is broken into shorter lines.
my %IS_WRAPPED = map { $_ => 1 }
    qw(title description publisher contributor rights);

# The input is expected to be only a few lines long; reaching this many
# lines aborts the whole run (see extract_fields).
my $MAX_LINES = 50;

# ---------------------------------------------------------------------
# Main
# ---------------------------------------------------------------------

# The database is set up with auto-incrementing identifiers as primary
# key; ask the operator to confirm the highest one before continuing.
my $lastone = fetch_last_oai_id();
$lastone = 0 unless defined $lastone;    # empty table: max() is NULL

print "verify that this was the last OAI ID used: $lastone\n";
print "Please enter y or n:\n";
if (read_yes_no() eq "n") {
    print "Fix it !!! Bye!!\n";
    exit;
}

opendir(my $dir, $thisdir) or die "Can't open directory $thisdir: $!\n";

while (defined(my $name = readdir($dir))) {
    print "looking at " . $name . " in " . $thisdir . "\n";

    # 2 letters + 3 digits + ".xml" -- check this!
    # Anchored at the end so editor backups such as "xx123.xml~" are
    # not ingested as a second record.
    next unless $name =~ /([a-z]{2}[0-9]{3})\.xml\z/;
    my $spc_id = $1;
    print " name without .xml is: $spc_id\n";

    # Read the input from the directory being listed, not from whatever
    # the current working directory happens to be.
    my $in_path = $thisdir . $name;
    sysopen(my $in, $in_path, O_RDONLY) or die "Opening $in_path: $!";
    my @lines = <$in>;
    close($in);

    my $oai_id     = sprintf("%010d", $lastone + 1);    # 10 integers long
    my $identifier = $us . $oai_id;
    my $out_path   = $rightdir . $oai_id;
    print "opening $name and $oai_id\n";
    print "identifier will be $identifier";

    my ($dc, $rights) = extract_fields($spc_id, @lines);

    # Fixed values and defaults for this collection.
    $dc->{$_} ||= [] for @DC_ELEMENTS;
    if (!@{ $dc->{publisher} }) {
        push @{ $dc->{publisher} },
            "University of Tennessee Special Collections Library";
    }
    $dc->{contributor} = [ contributors_for($rights) ];
    $dc->{identifier}  = ["http://oai.sunsite.utk.edu/sgm/$spc_id.html"];
    $dc->{language}    = ["en"];
    $dc->{relation}    = ["Mode of Access: World Wide Web"];
    $dc->{type}        = [ "Image", "Text" ];

    # O_TRUNC so that a stale, longer file of the same name cannot leave
    # trailing bytes behind. The id is only consumed once the file is
    # actually open.
    my $out;
    if (!sysopen($out, $out_path, O_WRONLY | O_CREAT | O_TRUNC, 0666)) {
        warn "Can't write $out_path: $! -- skipping $name\n";
        next;
    }
    $lastone++;
    print "opening $out_path\n";

    my $mydate = responseDate();

    # @all collects the entire xml record for the text blob in the
    # database; the hardcopy file only gets the metadata section.
    my @all = (
        $begin_rec,
        " <identifier>", $identifier, "</identifier>\n",
        " <datestamp>" . $mydate . "</datestamp>\n </header>\n",
        $meta_header . $udc_blurb,
    );
    print {$out} $meta_header . $udc_blurb;

    for my $elem (@DC_ELEMENTS) {
        for my $value (@{ $dc->{$elem} }) {
            my $line = " <dc:$elem>$value</dc:$elem>\n";
            print {$out} $line;
            push @all, $IS_WRAPPED{$elem} ? wrap_element($elem, $value) : $line;
        }
    }

    push @all, $rec_footer;
    print {$out} $meta_footer;
    close($out) or warn "Problem closing $out_path: $!\n";

    my $all = join('', @all);
    print " here's my blob:\n" . $all . "\n";

    store_record($oai_id, $mydate, $out_path, $all, $dc, $spc_id);
}    # end of looking through this directory

closedir($dir);
exit;

# ---------------------------------------------------------------------
# Helpers
# ---------------------------------------------------------------------

# Open a connection to the oai2 database or die.
# PrintError on / RaiseError off are the settings the original script
# effectively ran with (it assigned both flags to an unrelated $h, so
# they never reached the handle). RaiseError stays off so that the
# explicit "or warn" paths below remain best-effort.
sub connect_db {
    my $dbh = DBI->connect("dbi:mysql:oai2", $username, $password,
        { PrintError => 1, RaiseError => 0 })
        or die "Can't connect to Mysql database: ", $DBI::errstr, "\n";
    return $dbh;
}

# Return the highest oai_id currently in the udc table (undef if the
# table is empty).
sub fetch_last_oai_id {
    my $dbh = connect_db();

    my $sth = $dbh->prepare(" select max(oai_id) as oai_id from $udc_table")
        or die "Can't get a count! Kill me and call for help!! : ",
               $dbh->errstr(), "\n";
    $sth->execute()
        or die "Can't execute SQL statement: ", $sth->errstr(), "\n";

    my ($last) = $sth->fetchrow_array();
    warn "Problem in fetchrow_array(): ", $sth->errstr(), "\n" if $sth->err();

    $sth->finish();
    $dbh->disconnect() or warn "Disconnection failed: $DBI::errstr\n";
    return $last;
}

# Read lines from STDIN until the operator answers "y" or "n"; returns
# that answer. Dies on end-of-input instead of re-prompting forever.
sub read_yes_no {
    while (1) {
        my $answer = <STDIN>;
        die "No answer available on STDIN; aborting.\n"
            unless defined $answer;
        chomp $answer;
        return $answer if $answer eq "y" || $answer eq "n";
        print "Please type y or n:\n";
    }
}

# Strip leading and trailing whitespace from a string and return it.
sub trim {
    my ($text) = @_;
    $text =~ s/\A\s+//;
    $text =~ s/\s+\z//;
    return $text;
}

# Extract the Dublin Core fields that come from the TEI file itself.
#
#   $spc_id  document id taken from the file name
#   @lines   the lines of the TEI file
#
# Returns ($dc, $rights): $dc is a hash ref mapping element names
# (title, creator, subject, description, publisher, date, rights) to
# array refs of values; $rights is the text of the availability
# paragraph ("" if none was found).
#
# Everything is taken from the FIRST line that matches one of the title
# patterns; earlier lines are skipped and later ones are ignored. If
# that line opens <publisher> without closing it, extraction stops
# there and rights, date, summary and subjects are left empty -- a
# publisher spanning several lines is not supported.
sub extract_fields {
    my ($spc_id, @lines) = @_;

    my %dc = (
        title       => [],
        creator     => [],
        subject     => [],
        description => [
            "Document ID: $spc_id",
            "Tennessee Documentary History Collection",
        ],
        publisher   => [],
        date        => [],
        rights      => [],
    );
    my $rights = "";
    my $seen   = 0;

    foreach my $line (@lines) {
        # NOTE(review): this replaces the literal two characters "&;".
        # Presumably the pattern originally named an entity (e.g. the
        # ampersand entity) that was decoded in this copy -- verify.
        $line =~ s/\&\;/and/g;

        # Guard kept from the original: a file this long is taken as a
        # sign that something is wrong, and the whole run is aborted.
        if (++$seen == $MAX_LINES) {
            print "endless loop\n";
            exit(1);
        }

        next if @{ $dc{title} };    # header line already processed

        # --- title and author(s) --------------------------------------
        # NOTE(review): the no-author pattern below anchors on <title>,
        # while this one captures everything before </title>; presumably
        # an opening <title> was lost from this copy -- verify.
        if ($line =~ /(.*)<\/title><author>(.*)<\/author>.*/) {
            my ($title, $creator) = ($1, $2);
            if ($title =~ /(.*)<\/title><author>(.*)<\/author>.*/) {
                ($title, $creator) = ($1, $2);    # remove next section
            }
            $creator = trim($creator);

            if ($creator =~ /<\/author><author>/) {    # multiple authors
                print "CREATOR: $creator";
                # Keep every author, in document order. (The earlier
                # hand-rolled loop dropped the first author and pushed
                # another one twice.)
                push @{ $dc{creator} },
                    split(/<\/author><author>/, $creator, -1);
            }
            else {
                push @{ $dc{creator} }, $creator;
            }
            push @{ $dc{title} }, $title;
        }
        # if no authors, this one will still work
        elsif ($line =~ /<title>(.*)<\/title><resp.*/) {
            push @{ $dc{title} }, $1;
        }
        else {
            next;
        }

        # --- publisher ------------------------------------------------
        if ($line =~ /<publisher>(.*)(\s)*/) {
            my $pub = $1;
            if ($pub =~ /(\s)*(.*)<\/publisher>/) {    # take out end tag
                my $text = $2;
                push @{ $dc{publisher} }, trim($text);
            }
            else {
                next;    # unterminated publisher: nothing more is read
            }
        }

        # --- rights ---------------------------------------------------
        if ($line =~ /<p>(.*)<\/p><\/availability>/) {
            $rights = $1;
            push @{ $dc{rights} }, $rights;
        }

        # --- date: value attribute of the <date> following the last
        # author, or following the title when there are no authors.
        # An (empty) date is recorded even when none is found.
        my $dat = "";
        if ($line =~ /author><date value="(.*?)">.*<\/date>/) {
            $dat = $1;
        }
        elsif ($line =~ /title><date value="(.*?)">.*<\/date>/) {
            $dat = $1;
        }
        push @{ $dc{date} }, $dat;

        # --- summary note -> description ------------------------------
        if ($line =~ /.*<note type="summary">(.*)(\s)*<\/note><note type="collection"/) {
            my $des = $1;
            while ($des =~ /(.*)<xref>(.*)<\/xref>(.*)/) {
                $des = $1 . " " . $2 . $3;    # drop xref tags, keep text
            }
            $des =~ s/ {3,}/ /g;    # correct for overspacing in document
            push @{ $dc{description} }, $des;
        }

        # --- subject items (collected last-to-first) ------------------
        if ($line =~ /<item>(.*)<\/item>/) {
            my $work = $1;
            while ($work =~ /(.*)<\/item><item>(.+)/) {
                push @{ $dc{subject} }, $2;
                $work = $1;
            }
            push @{ $dc{subject} }, $work;
        }
    }

    return (\%dc, $rights);
}

# Return the list of contributors: always the Special Collections
# Library, plus any partner institution named in the rights statement.
sub contributors_for {
    my ($rights) = @_;

    my @contrib =
        ("University of Tennessee Special Collections Library, Knoxville");
    push @contrib, "Frank H. McClung Museum (Knoxville, Tenn.)"
        if $rights =~ / McClung Museum/;
    push @contrib, "Knox County Public Library (Knoxville, Tenn.)"
        if $rights =~ /McClung Historical Collection/;
    push @contrib, "Memphis Public Library (Memphis, Tenn.)"
        if $rights =~ /Memphis Public Library/;
    push @contrib, "Tennessee State Library and Archives (Nashville, Tenn.)"
        if $rights =~ /Tennessee State Library/;
    return @contrib;
}

# Return the pieces of one <dc:ELEM> element with its text broken after
# a space into lines of roughly 60-75 characters (trying to shorten the
# lines). The pattern is anchored at the start of the remaining text: if
# there is no space in that window the rest is emitted unbroken rather
# than matching further right and losing the text that was skipped.
sub wrap_element {
    my ($elem, $value) = @_;

    my @pieces;
    my $rest = " <dc:$elem>" . $value;
    while ($rest =~ /\A(.{60,75} )(.+)/) {
        push @pieces, $1 . "\n";
        $rest = " " . $2;
    }
    push @pieces, $rest . "</dc:$elem>\n";
    return @pieces;
}

# Insert one record into xml_udc, udc, mdf and ids.
#
#   $oai_id  10-digit id           $mydate  datestamp
#   $path    hardcopy file path    $all     full xml record (blob)
#   $dc      hash ref of element name => array ref of values
#   $spc_id  document id from the file name
#
# Dies if any of the first three inserts fails; a failed ids insert
# only warns. Multiple values of one element are joined with "|".
sub store_record {
    my ($oai_id, $mydate, $path, $all, $dc, $spc_id) = @_;

    my $dbh = connect_db();

    $dbh->do(
        "insert $xml_udc (oai_id, datestamp, path, file) values (?, ?, ?, ?)",
        undef, $oai_id, $mydate, $path, $all
    ) or die "Can't insert xml statement: $DBI::errstr\n";

    my $columns = join(", ", "oai_id", "datestamp", @DC_ELEMENTS, "path");
    my $marks   = join(", ", ("?") x (scalar(@DC_ELEMENTS) + 3));
    my @values  = map { join("|", @{ $dc->{$_} }) } @DC_ELEMENTS;
    $dbh->do(
        "insert $udc_table ($columns) values ($marks)",
        undef, $oai_id, $mydate, @values, $path
    ) or die "Can't prepare sql statement: $DBI::errstr\n";

    $dbh->do("insert into $mdf_table values(?, 'y')", undef, $oai_id)
        or die "Can't prepare sql statement: $DBI::errstr\n";

    if ($spc_id) {
        $dbh->do(
            "insert $id_table (oai_id, spc_id) values(?, ?)",
            undef, $oai_id, $spc_id
        ) or warn "Can't insert spc statement: $DBI::errstr\n";
    }

    $dbh->disconnect or warn "Disconnection failed: $DBI::errstr\n";
    return;
}

# Format the date response: the current UTC time as
# YYYY-MM-DDTHH:MM:SSZ. Uses gmtime rather than parsing the output of
# an external `date -u`, whose field layout and month names are not
# guaranteed.
sub responseDate {
    my ($sec, $min, $hour, $mday, $mon, $year) = gmtime();
    return sprintf("%04d-%02d-%02dT%02d:%02d:%02dZ",
        $year + 1900, $mon + 1, $mday, $hour, $min, $sec);
}