Style and Spell Checker in Perl
This is an example from Perl and OpenOffice.

#!/usr/bin/perl
#
# A Style and Spell Checker that reads OpenOffice files
# 
# Tom Anderson
# 6/24/2005
#
use strict;
use warnings;

main($ARGV[0]);

sub main {
  my ($fn)= @_;
  my $oo= OOText->new($fn);
  my $text_body= $oo->get_text_content();
  my $code_examples= $oo->get_code_content();

  PerlSyntax::check_syntax($code_examples);
  Fathom::fathom($text_body);

  my $sentences= Sentences->new($text_body);
  $sentences->get_lengths();
  print $sentences->report_lengths();
  print $sentences->report_long_sentences();

  my $spell = SpellCheck->new('/cygdrive/c/program files/Aspell/dict');
  $spell->check_spelling($text_body);
  print $spell->report_dict();
  print $spell->report_not_in_dict();
}

package PerlSyntax;
use IPC::Open3;
use Symbol qw(gensym);
use IO::File;
use File::Slurp;

sub check_syntax
{
  my ($code_examples)= @_;
  my $perl_code_fn= "tstcode.pl";
  write_file($perl_code_fn, $code_examples);
  local *CATCHOUT = IO::File->new_tmpfile;
  local *CATCHERR = IO::File->new_tmpfile;

  my $cmd= "perl -c $perl_code_fn";
  my $pid = open3(gensym, ">&CATCHOUT", ">&CATCHERR", $cmd);
  waitpid($pid, 0);
  seek $_, 0, 0 for \*CATCHOUT, \*CATCHERR;
  while(  ) {}
  my $syntax_check= '';
  while(  ) { $syntax_check .= $_;}

  if ($syntax_check =~ /syntax OK/) {
    return '';
  } else {
    return "Perl code syntax error:\n$syntax_check\n";
  }
}
1;

package Sentences;
use Lingua::EN::Sentence qw( get_sentences );
use CGI qw( :html3 :standard );
use Encode;

sub new
{
  my $class= shift;
  my ($text_body)= @_;
  my $self= {};
  $text_body=~ s/-_/ /g;
  $text_body=~ s/:/./g;
  $self->{sentences}= get_sentences($text_body);
  bless $self, $class;
  return $self;
}

sub get_lengths
{
  my $self= shift;
  my @lengths;
  $self->{long_sentences}= [];
  foreach my $sentence (@{$self->{sentences}})
  {
    my @words= split /\s+/, $sentence;
    my $num_words= scalar(@words)+1;
    # my $delim= join '|', @words;
    # print $num_words, ': ', $delim, "\n";
    $lengths[$num_words]++;
    if ($num_words > 24)
    {
      push @{$self->{long_sentences}}, encode('UTF-8',$sentence);
    }
  }
  foreach my $i (0..scalar(@lengths)-1)
  {
    $lengths[$i]=0 if not defined $lengths[$i];
  }
  $self->{lengths}= \@lengths;
  return $self;
}

sub report_long_sentences
{
  my $self= shift;
  my $out='';
  foreach my $sentence (@{$self->{long_sentences}})
  {
    $out .= "Long sentence:\n  $sentence\n\n";
  }
  return $out;
}

sub report_lengths
{
  my $self= shift;
  my $lengths= $self->{lengths};
  my $max_len= scalar(@$lengths)-1;
  my $out= h3('Sentence lengths')."\n";

  foreach my $i (0..$max_len)
  {
    $out .= $i."\t".$lengths->[$i]."\n";
  }
  return $out;
}

sub print
{
  my $self= shift;
  foreach my $sentence (@{$self->{sentences}})
  {
    print $sentence,"\n";
  }
  return $self;
}

1;

package SpellCheck;
use File::Slurp;
use Storable;
use Carp;

sub new {
  my $class= shift;
  my ($dict)= @_;
  my $self= {};
  bless $self, $class;
  croak 'Need to specify a dictionary directory in SpellCheck->new()' 
    if (not defined $dict);
  croak 'Dictionary directory $dict does not exist' if (not -e $dict);
  croak 'Dictionary directory $dict is not a directory' if (not -d $dict);
  $self->read_dict($dict);
  return $self;
}

sub report_not_in_dict {
  my $self= shift;
  my $out="\nSpelling errors:\n";
  foreach my $non_word (keys %{$self->{not_in_dict}}) {
    $out .= "  $non_word\n";
  }
  if ($self->{questionable} ne '')
  {
    $out.= "\nQuestionable\n".$self->{questionable}."\n";
  }
  return $out;
}

sub report_dict {
  my $self= shift;
  my $out="Dict Report\n";
  foreach my $dict (sort { get_dict_level($a) <=> get_dict_level($b)
                         } (keys %{$self->{which_dict}})) {
    $out .= $dict. ' '. $self->{which_dict}->{$dict}. "\n";
  }
  return $out;
}

sub get_dict_level
{ 
  my ($fn) = @_;
  $fn =~ s/(.*?)\.(.*)//;
  my $level= $2;
  return $level;
}

sub load_wordlist {
  my $self= shift;
  $self->{wordlist}= {};
  my ($text_body)= @_;
  my %wordlist;
  foreach my $word (split /(\s+|[-_]+)/, $text_body) {
    $word=~ s/[^a-zA-Z']+//g;
    foreach my $quote (qw( ' " )) {
      $word =~ s/^$quote//;
      $word =~ s/$quote$//;
    }
    $word= lc $word;
    $self->{wordlist}->{$word}='';
  }
  return $self;
}

sub check_spelling {
  my $self= shift;
  my ($text_body)= @_;
  $self->load_wordlist($text_body);
  $self->{correct_spelling}='';
  $self->{questionable}= '';
  $self->{not_in_dict}= {};
  $self->{which_dict}= {};
  foreach my $word (sort (keys %{$self->{wordlist}})) {
    if (exists $self->{dict}->{$word}) {
      $self->{correct_spelling} .= $self->count_dict($word);
      if (get_dict_level($self->{dict}->{$word}) > 50) {
        $self->{questionable} .= '  ' . $self->{dict}->{$word} . ': ' . $word . "\n";
      }
    } else {
      $self->{not_in_dict}->{$word}='';
    }
  }
  return $self;
}

sub count_dict
{
  my $self= shift;
  my ($word)= @_;
  my $dict= $self->{dict};
  my $this_word_dict= $dict->{$word};
  my $report = $word.': '.$this_word_dict."\n";
  if (not exists $self->{which_dict}->{$this_word_dict}) {
    $self->{which_dict}->{$this_word_dict}= 1;
  } else {
    $self->{which_dict}->{$this_word_dict}++;
  }
  return $report;
}

sub read_dict {
  my $self= shift;
  my ($dict_path)= @_;
  if (-e "dict.cache")
  {
    $self->{dict}= retrieve("dict.cache");
    return $self;
  }
  my %dict;
  foreach my $fn (read_dir($dict_path))
  {
    foreach my $word (split /\n/, read_file($dict_path.'/'.$fn))
    {
      $dict{$word}= $fn;
    }
  }
  store \%dict, "dict.cache";
  $self->{dict}= \%dict;
  return $self;
}

1;

package OOText;

use Encode;
use XML::Twig;
use Archive::Zip qw(AZ_OK);
use Carp;

sub new {
  my $class= shift;
  my ($oo_fn)= @_;
  my $t= XML::Twig->new(
         twig_roots => {'text:p' => \&text_out },
                       );
  $t->{spellcheck_text} = '';
  my $content_xml= get_string_from_zip("content.xml", $oo_fn);
  $t->parsestring($content_xml);
  bless $t, $class;
  return $t;
}

sub text_out {
  my ($t, $elt)= @_;
  if ($elt->att('text:style-name') eq 'Text body') {
    $t->{_spellcheck_text} .= $elt->text."\n";
  }
  elsif ($elt->att('text:style-name') eq 'code') {
    $t->{_code_text} .= $elt->text."\n";
  }
}

sub get_text_content {
  my $self= shift;
  return $self->{_spellcheck_text};
}

sub get_code_content {
  my $self= shift;
  return $self->{_code_text};
}

sub get_string_from_zip {
  my ($zip_member_fn, $zip_archive_fn)= @_;
  my $zip = Archive::Zip->new();
  if ($zip->read($zip_archive_fn) == AZ_OK) {
    return $zip->contents($zip_member_fn);
  } else {
    croak "Can't find member $zip_member_fn in archive $zip_archive_fn";
  }
}

package Fathom;

use Lingua::EN::Fathom;
use Text::FormatTable;
use Math::Round qw(nearest);

sub fathom
{
  my ($content)= @_;
  my $fog_description = 
      { 'unreadable' => [18,1e12], 
        'difficult'  => [14,18]  ,
        'ideal'      => [11,14]  ,  
        'acceptable' => [8,11]   ,
        'childish'   => [-1e12,8] };

  my $text = new Lingua::EN::Fathom;
  # $text->analyse_file($file);
 
  my $accumulate = 1;
  $text->analyse_block($content, $accumulate);

  my $fog     = nearest(0.1, $text->fog);
  my $flesch  = nearest(0.1, $text->flesch);
  my $kincaid = nearest(0.1, $text->kincaid);

  my $table = Text::FormatTable->new('r  l  l  l');

  my $fog_descr;
  foreach my $fog (keys %$fog_description) {
    if ($text->fog >= $fog_description->{$fog}[0] and 
        $text->fog <  $fog_description->{$fog}[1]) {
      $fog_descr= $fog;
    }
  }

  my $percent_complex_words = nearest(0.1,$text->percent_complex_words);
  $table->row("Fog", nearest(0.1, $text->fog), $fog_descr, "");
  $table->row("Grade Level", $kincaid, "(Flesch-Kincaid)", "");
  $table->row("Flesch", $flesch, "", "");
  $table->row("Complex words", "$percent_complex_words %", "", "");
  $table->row("Chars", $text->num_chars, "Words", $text->num_words);
  $table->row("Lines", $text->num_text_lines, "Blank Lines", $text->num_blank_lines);
  $table->row("Sentences", $text->num_sentences, "Paragraphs", $text->num_paragraphs);

  print $table->render();
}

1;

08/01/2005

By toma