#! /usr/local/bin/perl -s # Fixup script for gutenberg files files # Add -h option to remove hyphenated words # Add -d with -h to use the dictionary as well as words in the file. # (otherwise, all hyphens remain) # Add -para to fix paragraphs # Add -q if the quotes are in a real mess! # Add -s to add end-of-sentence (and clause) spaces # Add -head to strip page headings (titles) # # Rules for hyphen removal. Suppose we have found an eol hyphen "to-day" # where "today" is either in the text or in the dictionary: # # (1) If the word "today" appears in the text, then remove the hyphen; # (2) Otherwise, if the word "to-day" appears in the text (non-eol hyphen), # then keep the hyphen; # (3) Otherwise, if the file had lots of eol hyphens (more than 100), # then remove the hyphen; # (4) Otherwise, leave it in. # # Set this to the filename of your dictionary # (a simple text file containing one word per line): $dictionary = "$ENV{HOME}/dict/text710.words"; # Default options: $h = 1; $d = 1; # $para = 0; $q = 0; $s = 0; $head = 0; ($myname = $0) =~ s|(.*/)*||; # strip path component from name $hyphens = 0; # number of eol hyphens in the file $big = 100; # cutoff point for aggressive hyphen removal (rule 3) $header = 0; # Have we seen a Gutenberg header? $gutpat = '^\*END\*THE SMALL PRINT\!'; $headpat1 = '^\d+\s+[A-Z][^a-z0-9]+$'; $headpat2 = '^[A-Z][^a-z0-9]+\s+\d+$'; $nums = '(\d+|[MDCLXVImdclxvi_]+)'; # pattern for page nos @cache = (); $Usage = "Usage: $myname infile [outfile] \n"; # Check one or two arguments: die $Usage if (($#ARGV < 0) || ($#ARGV > 1)); $file = $ARGV[0]; open (IN, $file) || die "Can't open $file: $!\n"; if ($#ARGV == 1) { open (OUT, ">$ARGV[1]") || die "Can't open $ARGV[1]: $!\n"; } else { open (OUT, ">&STDOUT"); } if ($h) { if ($d) { # Read a dictionary into %word open (WORDS, $dictionary) or die "Can't read dictionary file `$dictionary': $!\n"; while () { chop; $word{$_}=1; } } # Read the file for more words: while () { tr/\015\032//d; $header++ if (/$gutpat/); $hyphens++ if (/\w-$/); tr/A-Z/a-z/; tr/a-z/ /cs; foreach $w (split(/ /)) { $textword{$w} = 1; } } # reopen input: close (IN); open (IN, $file) || die "Can't open $file: $!\n"; if ($header) { while () { print OUT; last if (/$gutpat/); } } } while ($_ = getline()) { # tidy up dashes: s/(^|\s+)-(\s+|$)/--/g; # new para: s/^ {3}(\S)/\n$1/ if ($para); # check for hyphenated word: while ($h && m/([a-zA-Z]+)-$/) { # read next line, skipping page number: $next = getline(); $_ .= $next; s/([a-zA-Z]+)-\n([a-zA-Z]+)/\377/; $orig = $&; ($new = $orig) =~ s/-\n//; ($old = $orig) =~ s/\n//; # Mark the hyphen in $orig as "done": $orig =~ tr/-/\376/; ($x = $new) =~ tr/A-Z/a-z/; ($y = $old) =~ tr/A-Z/a-z/; # (1) If the word "today" appears in the text, then remove the hyphen; # (2) Otherwise, if the word "to-day" appears in the text (non-eol hyphen), # then keep the hyphen; # (3) Otherwise, if the file had lots of eol hyphens (more than 100), # then remove the hyphen; # (4) Otherwise, leave it in. if ($word{$x} || $textword{$x}) { if ($textword{$x}) { s/\377/$new/; # remove the hyphen } elsif ($textword{$y}) { s/\377/$orig/; # leave in the hyphen } elsif ($hyphens > $big) { s/\377/$new/; # remove the hyphen } else { s/\377/$orig/; # leave in the hyphen } } else { s/\377/$orig/; # leave in the hyphen } } # restore "done" hyphens: tr/\376/-/; if ($s) { # Add end of sentence (and clause) spaces: s/([\:\.\!\?]) ($|[A-Z])/$1 $2/g; # Double space after : s/: (\S)/: $1/g; # Ditto after s/([\:\.\!\?])(["'`]) ($|[A-Z])/$1$2 $3/g; # Double space after : s/:(["'`]) (\S)/:$1 $2/g; # Not all "."s are end of sentences: s/(Mr|Mrs|Miss|St)\. /$1. /g; } if ($q) { # rationalise "s: s/(^|\s)['`](\S|$)/$1"$2/g; s/(^|\S)['`](\s|$)/$1"$2/g; s/(\w)["'`](\w)/$1'$2/g; # double quotes: s/(^|\s)['`](\S|$)/$1"$2/g; s/(^|\S)['`](\s|$)/$1"$2/g; } # long dashes: if ($s) { s/ -([ "'`]|$)/--$1/g; s/\s*--\s*/--/g; } print OUT; } # Read a line from IN and return it: sub getline() { local ($_); if (@cache) { return(shift(@cache)); } else { get_one_line(); # There is no closing "}" on this illus: for (;;) { # get_one_line() while (/^\s*\{illust\. caption =/); if (!defined($_)) { if (@cache) { return(shift(@cache)); } else { return(""); } } # Cache blank lines: if ((/^\s*$/) || ($head && /^\d+$/)) { push(@cache, $_); get_one_line(); redo; } # search for the end of {illus....}: while ((/^\s*\{illus[^\{\}]*$/) && !eof(IN)) { chomp($_); $_ .= " " . ; } # If we hit a page number or illustration, # clear the cache and skip to the next non-blank line (if any): if (($head && (/$headpat1/ || /^[A-Z][^a-z0-9]+\s+\d+$/)) || (/^\s*\\s*$/) || (/^\s*\\s*$/) || (/^\s*\{illus[^\{\}]*\}\s*$/) || (/^\s*\{\s*\}\s*$/)) { @cache = (); if (/\{/ || ($head && (/$headpat1/ || /$headpat2/))) { # skip to next non-blank $_ = ""; get_one_line() while (($_ =~ /^\s*$/) && !eof(IN)); } else { # read next line: get_one_line(); } redo; } else { # Process the line: tr/\015\032//d; s/\s*\\s*//g; s/\s*\\s*//g; s/\s*\{illus[^\{\}]*\}\s*//g; # Fix "Larsen" encodes: s//L/g; # pound sign s/<([a-zA-Z][a-zA-Z])>/$1/g; s/<([a-zA-Z\?])([\'\`\^\:\;\!\,])>/$1/g; s/\s*<(\d\/\d)>/ $1/g; # fractions s/<(\d+)s>/$1/g; # subscripts s//n/g; # ~ over n (?) s/<'0>/'0/g; # degrees symbol # Greek words: s//$1/g; # Emphasis italics: s/_\*([^_]*)_/\U$1\E/g; # Other italics are left as _italics_ # Fix up HTML codes: s/\&(..)lig;/$1/g; s/\&(.)(circ|grave|acute|uml);/$1/g; s/\&frac(.)(.);/$1\/$2/g; s/\£/pounds /g; s/\s*\°/ degrees/g; s/<\/?i>/_/gi; s/_([^ _]{2,})_/\U$1\E/g; s/<\/?b>//gi; # tidy up dashes: s/(^|\s+)-(\s+|$)/--/g; # tidy up double quotes: s/\`\`/"/g; s/\'\'/"/g; # If the result is blank, cache it and continue: if (/^\s*$/) { push(@cache, $_); get_one_line(); redo; } else { push(@cache, $_); return(shift(@cache)); } } } } } sub get_one_line () { $_ = ; s/\cM//g if (defined($_)); }