#!/usr/local/bin/perl -s # check-punct Version 4.5 # # Checks punctuation in a Gutenberg file # # If no files are given, reads from stdin and writes to stdout. # If filenames are given, then writes to a .not (notes) file # -- If -b option also given, then also writes to stdout. # -- Checks for the header automatically if files are on the command line # # -h option to skip Gutenberg header (up to "*END*THE SMALL PRINT!") # -b option only prints bad lines (with line numbers) # default is to print everything (with no line numbers). # (If you print everything, you can restore the original file by deleting # all the lines which start with ##). # -q option to use alternative method for checking quotes (recommended) # -e option, allow ". . ." for ellipses! # -d option runs "showdups" with output in .dup # -s option runs "gutspell" with output in .spl # -m option marks the words which appear in .spl # (if -s and -m are used together, then the .spl is created first # and then used to mark the spelling errors) # # These assignments set "-b -q -e" as default options: $b = 1; $q = 1; $e = 1; # # Work a paragraph at a time # Check for, and complain about: # Mismatched quotes. Should be like this: "...`..."..."...'..." # NB: whole para quoted at beginning (no quote at end) # NB: apostrophes not quotes: ain't 'em 'Arry 'Talian yo' jes' # Mismatched parentheses # bad characters # Space before punctuation (?!;:,.) # Punctuation (?!;:,.) before letter # (except initials: M.P. Ward, and...ellipses) # No space or " or ( before ` or ( # No letter or " or ( after ` or ( # Space before ) # Letter after ) # Hyphens should be either letter-letter or notspace--notspace # # Author: Martin Ward, Martin.Ward@durham.ac.uk # Comments welcomed! # # Copyright 1994 Martin Ward # This program is free software; you can redistribute it and/or modify # it under the terms of the GNU General Public License as published by # the Free Software Foundation; either version 2 of the License, or # (at your option) any later version. # # This program is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the # GNU General Public License for more details. # # You should have received a copy of the GNU General Public License # along with this program; if not, write to the Free Software # Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. # # Recent history: # Version 4.1 Modified to not require the Unix "tee" program # Version 4.2 Fixed a bug in 4.1 closing the output file to early # Version 4.3 Fixed a bug where output for later files goes to all # previously opened files! # Version 4.4 Allow nor'-letter (don't complain about the hyphen) # Version 4.5 Allow four dots (with optional spaces) anywhere # $SIG{PIPE} = sig_pipe; $| = 1 if ($b); # make output unbuffered if only printing bad lines $prefix = "##"; $quote_mode = 0; $quote_mode = 1 if ($q); $punct = '[\?\!\;\:\,\.]'; $bad = '[^a-zA-Z0-9\$\&\s\(\)\'\`\"\-\?\!\;\:\,\.\200-\377\_\*\[\]\%\/\+\@]'; $letter = '[a-zA-Z0-9\$\341-\372\301-\332\260-\271\244\@]'; # Note that the following are negated character classes # - it is an error if these match. # error if this matches after ( or open quote: $before_open = '[^ \200\202\"\(\[\n\-\`]'; $after_open = '[^\200-\377a-zA-Z0-9\$\"\(\[\.\`\-\'\_\*]'; # error if this matches before ) or close quote: $before_close = '[^\200-\377a-zA-Z0-9\$\"\)\]\-\?\!\;\:\,\.\']'; $after_close = '[^ \200\202\"\(\[\n\212\.\-\?\!\;\:\,\.\']'; # Check for .zip files: if ($#ARGV < 0) { # no files, read stdin, write to stdout: *IN = *STDIN; open(OUT, ">&STDOUT"); &process_file; } else { foreach $file (@ARGV) { die "File not found: $file\n" unless (-f $file); $killme = 0; # delete file after processing? if ($file =~ /\.zip$/) { $killme = 1; # Quietly unzip file, overwriting existing file: system 'unzip', '-qo', $file; $file =~ s/\.zip$/\.txt/; die "File not created from zip file: $file\n" unless (-f $file); } # check for DOS format open(IN, $file) || die "Can't open $file: $!"; ($base = $file) =~ s/\.txt$//; read(IN, $head, 8192); close(IN); if ($head =~ /\015/) { # Convert DOS format file: $old = "$base.old"; rename($file, $old); open(IN, $old); open(OUT, ">$file") || die "Can't write to $file: $!"; do { $bytes = read(IN, $_, 8192); s/[\015\032]//g; print OUT $_; } until ($bytes < 8192); close(IN); close(OUT); unlink($old); } # check for header: $h = 0; $h = 1 if ($head =~ /\*START\*/); # start of small print open(IN, $file) || die "Can't open $file: $!"; $out = "$base.not"; $| = 1; # make unbuffered open(NOTES, ">$out") or die "Can't write to $out: $!\n"; $| = 1; # make unbuffered message("Checking $file...\n\n") if ($b); #system "showdups $file > $base.dup" if ($d); #system "gutspell $file > $base.spl" if ($s); &process_file; close(IN); unlink ($file) if ($killme); # Close notes: close(NOTES); } # next file } exit(0); sub process_file { # Read list of spelling errors to mark %err = (); if ($m && (-f "$base.spl")) { open (ERR, "$base.spl") || die "Can't open spellings file $base.spl: $!"; @words = ; chop(@words); $wordpat = '\b(' . join("|", @words) . ')\b'; $m = 0 if ($#words < 0); } if ($m && (!-f "$base.spl")) { message("Warning: -m option given but $base.spl file not found\n"); $m = 0; } $lineno = 1; $/ = "\n"; # read input a line at a time. # skip header: if ($h) { while () { $lineno++; last if (/^\*END\*/ || /\*END\*$/); } die "End of header not found!\n" if (eof); } $/ = "\n\n"; # read input a paragraph at a time. $quoting = 0; # are we expecting the next paragraph to be quoted? while ($para = ) { &clear_bads; # add the paragraph break: # split para into lines each of which ends in \n: @lines = split(/\n/, $para, 999999); pop(@lines); grep ($_ .= "\n", @lines); # First the simple checks, do a line at a time: $l = 0; foreach (@lines) { # Marked up accents and ligatures and page numbers etc.: s/<[^<>]+>/&set_meta("$&")/ge; s/-\^-/&set_meta("$&")/ge; while (s/$bad/&set_meta("$&")/e) { # Only flag each bad character once in the file: substr($bad, -1, 0) = "\\$&"; &bad("Bad character", $l, length($`)); } # Mark words in wordlist: if ($m) { while (s/$wordpat/&set_meta("$&")/e) { &bad("Possible misspelling", $l, length($`)); # Remove the word from the pattern: $word = $&; @words = grep (!/^$word$/, @words); $wordpat = '\b(' . join("|", @words) . ')\b'; if ($#words < 0) { $m = 0; last; } } } # Avoid false matches on numbers n,nnn and n.nnn and times n:nn s/\d[\.\,\:]\d/&set_meta("$&")/ge; s/(^|\s)\.(\d)/"$1" . &set_meta(".") . "$2"/ge; # # Allow four or more dots anywhere: s/(\s*\.){4,}/&set_meta("$&")/eg; # M.P. Ward, Ph.D, c.v. and...ellipses and.... ellipses are OK: s/\b($letter$letter?\.)+$letter/&set_meta("$&")/eg; s/((^|$letter|[\`\"\!\?])\.\.\.)+($letter|[\'\"\!\?]|\n)/&set_meta("$&")/eg; if ($e) { # Allow ". . ." also as ellipses: s/(^|$letter|[\`\"\!\?])\s*\.\s*\.\s*(\.\s*)+_?($letter|[\'\"\!\?]|\n)/&set_meta("$&")/eg; } # (!) and (?) are OK: s/\([\!\?]\)/&set_meta("$&")/ge; # while (s/(\s)($punct)/"$1".&set_meta("$2")/e) { &bad("Space before punctuation", $l, length($`)); } while (s/($punct$letter)/&set_meta("$&")/e) { &bad("Letter after punctuation", $l, length($`)); } # must have space or " or ( before ` or ( while (s/($before_open)([\`\(])/"$1".&set_meta("$2")/e) { &bad("No space before ` or (", $l, length($`)); } # must have letter or " or ( after ` or ( while (s/([\`\(])($after_open)/&set_meta("$1")."$2"/e) { &bad("No letter or punctuation after ` or (", $l, length($`)); } while (s/(^| )(\))/"$1".&set_meta("$2")/e) { &bad("Space before )", $l, length($`)); } while (s/(\))($letter)/&set_meta("$1")."$2"/e) { &bad("Letter after )", $l, length($`)); } # Hyphens should be either letter-letter or -----... (2 or more) # NB all of space--space, notspace--notspace, # space--notspace and notspace--space are used, also ----. # Also allow letter-o'-letter and NOR'-letter and letter-'and s/($letter)-o'-($letter)/&set_meta("$&")/eg; s/nor'-($letter)/&set_meta("$&")/eg; #' s/($letter)-'and/&set_meta("$&")/eg; #' s/($letter-)+($letter)/&set_meta("$&")/eg; s/-{2,}/&set_meta("$&")/eg; s/tête-à-tête/&set_meta("$&")/eg; # Also allow :- and -/- anywhere: s/:-/&set_meta("$&")/eg; s/-\/-/&set_meta("$&")/eg; # Also allow s'-letter s/($letter)s'-($letter)/&set_meta("$&")/eg; #' # remaining (single) hyphens are bad: while (s/-+/&set_meta("$&")/e) { &bad("Bad hyphen(s)", $l, length($`)); } tr/\200-\377/\0-\177/; $l++; } # now check parentheses and quotes. $_ = $para; # Parentheses must match within the paragraph. # First zap balanced parentheses: 1 while (s/\(([^\(\)]*)\)/\377$1\377/g); # remaining parentheses are unbalanced: while (s/[\(\)]/\377/) { &bad_par("Unmatched parenthesis", $`); } # Next, check for matching quotes $_ = $para; s/

/ /gi; # Clear HTML paragraph markers. # if $quoting, then first non-space must be " # if ($quoting) { if (m/^\s*"/) { #" # quote found. Check if this para closes the quote, # if so, then clear $quoting # if not, then add a "dummy" close quote at the end $quotes = tr/"/"/; if ($quotes & 1) { # odd no. of quotes, so add a dummy close quote: s/(\n)*$/ DUMMY\"$1/; } else { # even no. of quotes, leave quoting mode: $quoting = 0; } } else { # opening quote is missing &bad("Unmatched \"s in previous paragraph", 0, 0); $quoting = 0; } } else { # if an odd no of quotes, then enter quote mode and add a dummy # close quote at the end: $quotes = tr/"/"/; if ($quotes & 1) { $quoting = 1; s/(\n)*$/ DUMMY\"$1/; } } # Now all quotes should balance. # Decide whether "'s are opening or closing. # use \200 for opening quote and \201 for closing quote # zap balanced `stuff': 1 while (s/`[^`'"]*'/&set_meta("$&")/ge); # Check for `..."stuff"...' 1 while (s/(`[^`"]*)"([^`"]*)"/$1\200$2\201/g); #` # zap balanced `stuff': 1 while (s/`[^`"]*'/&set_meta("$&")/ge); #" # Check for `..."stuff"...' 1 while (s/(`[^`"]*)"([^`"]*)"/$1\200$2\201/g); #` # Decide on remaining "s: s/"([^`"]*)"/\200$1\201/g; #" # Any remaining "s or `s must be unbalanced: $tmp = $_; while (s/[`"]/\377/) { #` &bad_par("Unmatched quote", $`); } $_ = $tmp; tr/\202-\377/\002-\177/; if ($quote_mode == 0) { # Old method # Check spacing around \200 and \201 # Must have space or \200 or " or ( before \200 # Change \200 to \202 when done: while (s/($before_open)\200/$1\202/) { &bad_par("No space before open quote", $`); } tr/\202/\200/; # must have letter or \200 " or ( or ... after \200 while (s/\200($after_open)/\202$1/) { &bad_par("No letter or \" or ( after open quote", $`); } tr/\202/\200/; while (s/\201($after_close)/\203$1/) { &bad_par("No space after close quote", $`); } tr/\203/\201/; # must have letter or \201 " or ) or ... after \201 while (s/($before_close)\201/$1\203/) { &bad_par("No letter or punctuation before close quote", $`); } tr/\203/\201/; } else { # Alternative method of checking quotes. tr/\200\201\202\203/""""/; # Use context to decide whether each " is an open or close quote. # Then check for balanced quotes. # Trivially, the first char of a para must be an opening quote! s/^"/\200/; # Simple cases: space"letter (opening) and letter"space (closing) # and `"letter, space`" (opening) and '"space, letter'" (closing) s/(\s)"($letter)/$1\200$2/g; s/($letter)"(\s)/$1\201$2/g; s/(\s)`(\s*)"/$1`$2\200/g; s/(\s)"(\s*)`/$1\200$2`/g; s/`(\s*)"($letter)/`$1\200$2/g; s/"(\s*)`($letter)/\200$1`$2/g; s/'(\s*)"(\s)/'$1\201$2/g; s/"(\s*)'(\s)/\201$1'$2/g; s/($letter)'(\s*)"/$1'$2\201/g; s/($letter)"(\s*)'/$1\201$2'/g; # )" and ") are closing, while (" and "( are opening: s|\)"|\)\201|g; s|"\)|\201\)|g; s|\("|\(\200|g; s|"\(|\200\(|g; if ($e) { # Special cases with ". . ." ellipses: . . . "punct is a closing quote: s/(\.\s*\.\s*\.\s*)"($punct|\s)/$1\201$2/g; s/(\s)"(\s*\.\s*\.\s*\.\s*)/$1\200$2/g; } # more difficult cases (punctuation on one side): # punct"space, punct"punct (closing), space"'letter (opening) # letter"punct (closing) s/($punct|-)"(\s)/$1\201$2/g; s/($punct)"($punct)/$1\201$2/g; s/(\s)"'($letter)/$1\200$2/g; s/($letter)"($punct)/$1\201$2/g; # hyphens: -"letter is opening, -"space is closing # space"- is opening, (letter or punct)"- is closing s/-"($letter)/-\200$1/g; s/-"(\s)/-\201$1/g; s/(\s)"-/$1\200-/g; s/($letter|$punct)"-/$1\201-/g; # space"... (opening), ..."(punct or space) (closing): s/(\s)"\.\.\./$1\200\.\.\./g; s/\.\.\."($punct|\s)/\.\.\.\201$1/g; # space"space-- is opening and --space"space is closing: s/(\s)"(\s--)/$1\200$2/g; s/(--\s)"(\s)/$1\201$2/g; # "[\d is closing and \d]" is opening (foot note markers) s/"(\[\d)/\201$1/g; s/(\d\])"/$1\200/g; # "<\d is closing and \d>" is opening (foot note markers) s/"(<\d)/\201$1/g; s/(\d>)"/$1\200/g; # space"_ and _"space are opening and closing: s/(^|\s)"_/$1\200_/g; s/_"($|\s)/_\201$1/g; # --"(punct or space) is closing: s/--"($punct|\s)/--\201$1/g; # (" is opening and ") is closing: s/([\[\(\<])"/$1\200/g; s/"([\]\)\>])/\201$1/g; # Any remaining quotes are errors. \200...\201 should balance while (s/"/\377/) { &bad_par("Bad quote", $`); } # zap balanced quotes: 1 while (s/\200([^\200\201]*)\201/\377$1\377/g); # remaining \200 and \201s are unbalanced quotes: while (s/[\200\201]/\377/) { &bad_par("Unmatched quote", $`); } } &print_bads; $lineno += $l; } } # end of process_file subroutine. # record a problem for this paragraph: sub bad { local($whinge, $l, $pos) = @_; $whinges{$l, $pos} = "$prefix $whinge\n"; $posns[$l] .= "," if ($posns[$l] ne ""); $posns[$l] .= $pos; } # record a problem for this paragraph -- calculate line and pos sub bad_par { local($whinge, $p) = @_; local ($l); $l = ($p =~ tr/\n\212/\n\212/); $p .= " "; # Make sure there is at least one char to match: $p =~ m/([^\n\212]+)$/; $p = length($1) - 1; &bad($whinge, $l, $p); } # print either just bad lines, or all lines, plus whinges # ignore posns beyond length of line sub print_bads { local ($l, $ll, @sortposns); $l = 0; foreach (@lines) { $ll = length; if (($posns[$l] eq "") || ($posns[$l] > $ll)) { message($_) unless ($b); } else { message(sprintf("%5d:", $l + $lineno)) if ($b); message($_); # print whinges for this line $marks = " " x 80; @sortposns = sort(split(/,/, $posns[$l])); foreach $pos (@sortposns) { last if (($pos > $ll) || ($pos >= 80)); substr($marks, $pos, 1) = "^"; } $marks =~ s/\s+$//; if ($b) { $tmp = $prefix x 3; message($tmp, $marks, "\n"); } else { substr($marks, 0, 2) = $prefix; message("$marks\n"); } foreach $pos (@sortposns) { last if ($pos > $ll); message($whinges{$l, $pos}); } message("\n") if ($b); } $l++; } } sub clear_bads { %whinges = (); @posns = (); } # return string with meta bits set: sub set_meta { local($_) = (@_); tr/\0-\177/\200-\377/; $_; } sub clear_meta { local($_) = (@_); tr/\200-\377/\0-\177/; $_; } sub sig_pipe { print STDERR "Caught a SIGPIPE!\n"; } sub message { print @_ if ($b); print NOTES @_; }