#!/usr/bin/perl -w #!/usr/bin/perl5.004/Shell.pm package Shell; use Shell qw(:find); use Cwd; use File::Find qw(finddepth); # xml_html_2 # to use to change any TEIx lite XML files in this directory into html format # that have not already been format: will do all in file: # type xml_html_2 (return) and wait! # code from Susan Gants, University of Georgia, GALILEO project, # tweaked into oai.wk by Jody DeRidder 1/5/01 for TEI lite sgml files, # which titles them Southeastern American Indians; # and changed from that to this (Tennessee Documentary History) # on 8/30/02, both these versions reference links to jpegs, djvu at GAGAL #opendir(DIR,"/usr/sunsite/htdocs/oai/translate/html/"); opendir(DIR,"/usr/local/sunsite/htdocs/oai/newfiles/tdh_xml/whatnow"); foreach $_ (readdir(DIR)){ if (($_ =~ /([0-9]*)\.xml/) && (! -e $1."html")){ print " Formatting into html file ".$_ ."\n"; $filename = $_; ## Open input file open (TEXT, $filename) || die "cannot open $filename - $!"; ($prefix = $filename) =~ s,(.*?)\.xml,$1,; print "prefix is $prefix\n"; #### Open output file (my $outfile = $filename) =~ s,(.*?)\.xml,$1.html,; open (OUT, ">$outfile") || die "cannot open file - $!"; ## just a couple of style specs my $style_parms = ""; ## link colors my $body_attr = qq{bgcolor="#ffffff" text="#000000" link="#0000cc" vlink="#0000cc" alink="#cc0000"}; ## basic page layout my $hdr = qq{\n

\n\n\n
A University of Tennessee Digital Library Database
\n
\n\n\n
Tennessee Documentary History, 1796-1850
\n\n
\n}; ## values for HI REND tags # no attrib --> my %HIstarts = ( 'b' => qq{}, 'i' => qq{}, 'o' => '', 'o, b' => qq{}, 'sc' => '', # 'typewritten' => qq{[printed text: }, # 'printed text' => qq{[printed text: }, # 'printed text, i' => qq{[printed text: }, # 'printed text, sc' => qq{[printed text: }, # 'written text' => qq{[written text: }, ); my %HIends = ( 'b' => qq{}, 'i' => qq{}, 'o' => '', 'o, b' => qq{}, 'sc' => '', # 'typewritten' => qq{ ]}, # 'printed text' => qq{ ]}, # 'printed text, i' => qq{ ]}, # 'printed text, sc' => qq{ ]}, # 'written text' => qq{ ]}, ); ## values for display my %langs = ( 'lat' => 'Latin', 'fra' => 'French', 'ger' => 'German', ); #### Prompt for input file ################### #### NOTE ##### assumes pgm is run in directory containing sgm files ################### #print "\nEnter filename: \n"; # chop (my $filename = ); ## set input record delimiter to undefined, slurp in whole file undef $/; my $line = ; ## omit line ends - file is now one long string $line =~ s,\r,,g; # carriage returns from DOS $line =~ s,>\n,>,g; # "real" record ends $line =~ s,([^>])\n,$1 ,g; # mid-line $line =~ s,^\n$,,; # blank lines $line =~ s,\ \;, ,g; $line =~ s,<\?[^>]*>,,g; # xap PI's outright ## Print header print OUT ("Tennessee Documentary History, 1796-1850\n$style_parms\n\n\n", $hdr, "\n\n,g; $line =~ s,,,g; $line =~ s,,,g; $line =~ s,,,g; $line =~ s,,,g; $line =~ s,,,g; $line =~ s,,,g; $line =~ s,,,g; $line =~ s,,,g; # FIGURE tags ----------------------------------------- $line =~ s,
(.+?)(.+?)
,
\[ Note: $2; $1 \],g; $line =~ s,
(.+?)
,
\[ Note: $1 \],g; $line =~ s,
(.+?)
,
\[ Note: $1 Click on page image to view \],g; # various tags ----------------------------------------- $line =~ s,,\n,g; $line =~ s,,\n,g; $line =~ s,,,g; $line =~ s,,,g; $line =~ s,,,g; $line =~ s,,,g; $line =~ s,,\n,g; $line =~ s,,,g; $line =~ s,,\n,g; $line =~ s,,,g; $line =~ s,,\n,g; $line =~ s,,\n,g; $line =~ s,,\n,g; $line =~ s,,,g; # All other NOTEs within TEXT ----------------------------------------- $line =~ s,(.*?), \[ Note: $1 \] ,g; # Paragraphs, Quotes, Linegroups, Lines, Line Breaks ---------------------- $line =~ s,

,\n

",g; $line =~ s,

,"

\n,g; $line =~ s,,\n,g; $line =~ s,,\n,g; $line =~ s,,\n
,g; $line =~ s,
,\n,g; $line =~ s,

,\n

,g; $line =~ s,,\n
,g; # Page Breaks -------------------------------------- # this section does not always work, so only using one line of it if ($line =~ /()

,\n,g; $line =~ s,,\n

Page: $3   \[jpg image\]

,g; # Letter related tags -------------------------------------- $line =~ s,

,,g; $line =~ s,
,,g; $line =~ s,,
,g; $line =~ s,
,,g; $line =~ s,,
,g; $line =~ s,
,,g; $line =~ s,,\n
,g; $line =~ s,
,,g; $line =~ s,,\n
,g; $line =~ s,
,,g; $line =~ s,,\n

,g; $line =~ s,,,g; $line =~ s,X, X ,g; $line =~ s,,\n
\[Signed\] ,g; $line =~ s,,\n
\[Signed\] ,g; $line =~ s,
,
,g; # text change tags -------------------------------------- $line =~ s,,\[illegible\],g; $line =~ s,(.*?),\[unclear: $1\],g; $line =~ s,,\[$1\],g; $line =~ s,,\[$2: $1 \],g; $line =~ s,(.*?),$1 \[$2\],g; $line =~ s,(.+?),$2 \[$1\],g; $line =~ s,(.+?),$2 \[$1\],g; $line =~ s,(.*?),$2 \[$1\],g; $line =~ s,(.*?),$2 \[$1\],g; $line =~ s,(.*?),\[added ($1): $2\],g; $line =~ s,(.*?),\[added ($1): $2\],g; $line =~ s,(.*?),\[added: $1\],g; $line =~ s,(.*?),\[deleted ($1): $2\],g; $line =~ s,(.*?),\[deleted: $1\],g; $line =~ s,(.*?),\[$langs{lc($1)}: $2\],g; $line =~ s,,,g; $line =~ s,,,g; #$line =~ s,(.+?),$HIstarts{lc($1)}$2 HIends{lc($1)},g; $line =~ s,,,g; $line =~ s,,,g; # LIST and ITEM tags -------------------------------------- while ($line =~ /') { $line =~ s,,\n,g; $line =~ s,,\n,g; $line =~ s,,\n
,g; $line =~ s,,
,g; $line =~ s,
,,g; } elsif ($line =~ ',\n,g; $line =~ s,,\n,g; $line =~ s,
, \n,g; $line =~ s,,,g; $line =~ s,,,g; } elsif ($line =~ ',\n

    ,g; $line =~ s,,\n
,g; $line =~ s,,\n
  • ,g; $line =~ s,,
  • ,g; $line =~ s,,
  • ,g; } # end if } # end while # XREF tag -------------------------------------- ### This creates an internal link to a second document. ### $bin would be a call to the search program, ### with 'type=doc' indicating a retrieval of an entire document, and ### tei2id=$1 being the ID of the document to be retrieved. ### I'm not sure how you'll want to handle this my $bin = ''; $line =~ s,(.+?),$1,g; ### print out 1-line file and closing HTML tags print OUT "$line\n"; print OUT ('

    \n"); ## Process input file # drop DOCTYPE and ENTITY tags ----------------------------------------- $line =~ s,,,; # for files with ENTITY tags $line =~ s,,,; # for file without # clean up character entities ----------------------------------------- $line =~ s,\&hyphen\;,-,g; $line =~ s,\&ndash\;, - ,g; $line =~ s,\&mdash\;, -- ,g; $line =~ s,\&[lr]dquo\;,\",g; $line =~ s,\&[lr]squo\;,\',g; $line =~ s,\&cross\;,+,g; $line =~ s,\&dagger\;,+,g; $line =~ s,\&plus\;,+,g; $line =~ s,\&lcub\;,{,g; $line =~ s,\&rcub\;,},g; $line =~ s,\&frac38\;,3/8,g; $line =~ s,\&check\;,[checkmark],g; $line =~ s,\&equals\;,\=,g; $line =~ s,\&dollar\;,\$,g; $line =~ s,\―,
    ,g; # TEI.2 tags ----------------------------------------- $line =~ s,,Document: $1,; if ($1) {$prefix = $1;} $line =~ s,,,; # TEIHEADER section ----------------------------------------- $line =~ s,,,; $line =~ s,.*?,,; $line =~ s,.*?,,; $line =~ s,.*?,,; # SOURCEDESC tags ----------------------------------------- if ( $line =~ m,(.*?), ) { my $temp = $1; $temp =~ s,,,; $temp =~ s,,\n<h3>,; $temp =~ s,,\n,; $temp =~ s/<\/author>/; /g; $temp =~ s,(.*?),\n
    author: $1,; $temp =~ s,(.*?),\n
    publication place: $1,; $temp =~ s,(.*?),\n
    publisher: $1,; $temp =~ s, -- (.*?),\n
    date: $1,; $temp =~ s,(.*?),\n
    extent: $1,; $temp =~ s,(.*?),\n
    $1: $2,g; $temp =~ s,
    ,,; $line =~ s,.*?,$temp,; } # end if $line =~ s,
    ,,; $line =~ s,.*?,,; $line =~ s,.*?,,; $line =~ s,
    ,

    ,g; # TEXT tags ----------------------------------------- $line =~ s,,,; $line =~ s,,,; # FRONT section ----------------------------------------- $line =~ s,,\n


    ,; $line =~ s,,
    \n,; $line =~ s,,

    \n,; $line =~ s,,
    \n,; $line =~ s,,
    \n,; $line =~ s,,
    \n,; $line =~ s,,,g; $line =~ s,,,g; $line =~ s,,,g; $line =~ s,,,g; $line =~ s,,
    \n,; $line =~ s,,
    \n,; $line =~ s,,,g; $line =~ s,,,g; $line =~ s,(.*?),$1,; $line =~ s,(.*?),$1,; $line =~ s,(.*?),$1,; $line =~ s,,
    \n,; # BODY/BACK tags ----------------------------------------- $line =~ s,,
    \n

    ,; $line =~ s,,\n,; $line =~ s,,


    ,; $line =~ s,,
    ,; # TABLE tags ----------------------------------------- $line =~ s,,
    \n
    ,g; $line =~ s,,
    ,g; $line =~ s,
    ,
    \n

    ,g; $line =~ s,,

     
    $1
    '); #print OUT ('

    '); #print OUT ('Thanks to the GALILEO Digital Library of Georgia.'); #print OUT ('
    The entire Southeastern Native American Documents project '); #print OUT ('can be seen after January 30, 2001, through:
    '); #print OUT (''); #print OUT ('GALILEO
    '); #print OUT ('From the GALILEO home page, choose the Digital Library of Georgia,'); #print OUT ('
    and then scroll down to choose the database.
    '); } closedir(DIR); }

##############################
# num($parm)
#
# Builds the attribute text of a page-break tag from a page designator.
#
#   $parm   - page designator string.  If it contains a bracketed number
#             ("[3]") that number is used; otherwise the first run of
#             digits anywhere in the string is used.
#
# The number selects a letter suffix: 0 gives no suffix, 1..26 give
# 'a'..'z', anything else (or no number at all) gives no suffix.
#
# Reads the file-level global $prefix (set by the main loop) and prints a
# one-line trace to the currently selected output handle.
#
# Returns the string:  pb id="<prefix><suffix>" n="<parm>"
sub num {
    my $parm = shift;
    $parm = '' unless defined $parm;

    # Index 0 is deliberately the empty suffix; 1..26 map to 'a'..'z'.
    my @fig = ('', 'a' .. 'z');

    print "*** $prefix N= $parm \n";

    # Prefer the bracketed form, fall back to the first digits found.
    my ($page_no) = $parm =~ /\[(\d+)\]/;
    ($page_no) = $parm =~ /(\d+)/ unless defined $page_no;

    # The suffix is a lexical that starts empty on every call.  The previous
    # version kept it in the package global $a, so an argument without digits
    # silently reused the suffix computed by the preceding call (and
    # clobbered sort's $a/$b).
    my $suffix = '';
    if (defined $page_no) {
        my $letter = $fig[$page_no];
        $suffix = $letter if defined $letter;   # out of range => no suffix
    }

    return qq{pb id="$prefix$suffix" n="$parm"};
}
### end program