This is an example from
Perl and OpenOffice.
#!/usr/bin/perl
#
# A Style and Spell Checker that reads OpenOffice files
#
# Tom Anderson
# 6/24/2005
#
use strict;
use warnings;
main($ARGV[0]);
sub main {
my ($fn)= @_;
my $oo= OOText->new($fn);
my $text_body= $oo->get_text_content();
my $code_examples= $oo->get_code_content();
PerlSyntax::check_syntax($code_examples);
Fathom::fathom($text_body);
my $sentences= Sentences->new($text_body);
$sentences->get_lengths();
print $sentences->report_lengths();
print $sentences->report_long_sentences();
my $spell = SpellCheck->new('/cygdrive/c/program files/Aspell/dict');
$spell->check_spelling($text_body);
print $spell->report_dict();
print $spell->report_not_in_dict();
}
package PerlSyntax;
use IPC::Open3;
use Symbol qw(gensym);
use IO::File;
use File::Slurp;
sub check_syntax
{
my ($code_examples)= @_;
my $perl_code_fn= "tstcode.pl";
write_file($perl_code_fn, $code_examples);
local *CATCHOUT = IO::File->new_tmpfile;
local *CATCHERR = IO::File->new_tmpfile;
my $cmd= "perl -c $perl_code_fn";
my $pid = open3(gensym, ">&CATCHOUT", ">&CATCHERR", $cmd);
waitpid($pid, 0);
seek $_, 0, 0 for \*CATCHOUT, \*CATCHERR;
while( ) {}
my $syntax_check= '';
while( ) { $syntax_check .= $_;}
if ($syntax_check =~ /syntax OK/) {
return '';
} else {
return "Perl code syntax error:\n$syntax_check\n";
}
}
1;
package Sentences;
use Lingua::EN::Sentence qw( get_sentences );
use CGI qw( :html3 :standard );
use Encode;
sub new
{
my $class= shift;
my ($text_body)= @_;
my $self= {};
$text_body=~ s/-_/ /g;
$text_body=~ s/:/./g;
$self->{sentences}= get_sentences($text_body);
bless $self, $class;
return $self;
}
sub get_lengths
{
my $self= shift;
my @lengths;
$self->{long_sentences}= [];
foreach my $sentence (@{$self->{sentences}})
{
my @words= split /\s+/, $sentence;
my $num_words= scalar(@words)+1;
# my $delim= join '|', @words;
# print $num_words, ': ', $delim, "\n";
$lengths[$num_words]++;
if ($num_words > 24)
{
push @{$self->{long_sentences}}, encode('UTF-8',$sentence);
}
}
foreach my $i (0..scalar(@lengths)-1)
{
$lengths[$i]=0 if not defined $lengths[$i];
}
$self->{lengths}= \@lengths;
return $self;
}
sub report_long_sentences
{
my $self= shift;
my $out='';
foreach my $sentence (@{$self->{long_sentences}})
{
$out .= "Long sentence:\n $sentence\n\n";
}
return $out;
}
sub report_lengths
{
my $self= shift;
my $lengths= $self->{lengths};
my $max_len= scalar(@$lengths)-1;
my $out= h3('Sentence lengths')."\n";
foreach my $i (0..$max_len)
{
$out .= $i."\t".$lengths->[$i]."\n";
}
return $out;
}
sub print
{
my $self= shift;
foreach my $sentence (@{$self->{sentences}})
{
print $sentence,"\n";
}
return $self;
}
1;
package SpellCheck;
use File::Slurp;
use Storable;
use Carp;
sub new {
my $class= shift;
my ($dict)= @_;
my $self= {};
bless $self, $class;
croak 'Need to specify a dictionary directory in SpellCheck->new()'
if (not defined $dict);
croak 'Dictionary directory $dict does not exist' if (not -e $dict);
croak 'Dictionary directory $dict is not a directory' if (not -d $dict);
$self->read_dict($dict);
return $self;
}
sub report_not_in_dict {
my $self= shift;
my $out="\nSpelling errors:\n";
foreach my $non_word (keys %{$self->{not_in_dict}}) {
$out .= " $non_word\n";
}
if ($self->{questionable} ne '')
{
$out.= "\nQuestionable\n".$self->{questionable}."\n";
}
return $out;
}
sub report_dict {
my $self= shift;
my $out="Dict Report\n";
foreach my $dict (sort { get_dict_level($a) <=> get_dict_level($b)
} (keys %{$self->{which_dict}})) {
$out .= $dict. ' '. $self->{which_dict}->{$dict}. "\n";
}
return $out;
}
sub get_dict_level
{
my ($fn) = @_;
$fn =~ s/(.*?)\.(.*)//;
my $level= $2;
return $level;
}
sub load_wordlist {
my $self= shift;
$self->{wordlist}= {};
my ($text_body)= @_;
my %wordlist;
foreach my $word (split /(\s+|[-_]+)/, $text_body) {
$word=~ s/[^a-zA-Z']+//g;
foreach my $quote (qw( ' " )) {
$word =~ s/^$quote//;
$word =~ s/$quote$//;
}
$word= lc $word;
$self->{wordlist}->{$word}='';
}
return $self;
}
sub check_spelling {
my $self= shift;
my ($text_body)= @_;
$self->load_wordlist($text_body);
$self->{correct_spelling}='';
$self->{questionable}= '';
$self->{not_in_dict}= {};
$self->{which_dict}= {};
foreach my $word (sort (keys %{$self->{wordlist}})) {
if (exists $self->{dict}->{$word}) {
$self->{correct_spelling} .= $self->count_dict($word);
if (get_dict_level($self->{dict}->{$word}) > 50) {
$self->{questionable} .= ' ' . $self->{dict}->{$word} . ': ' . $word . "\n";
}
} else {
$self->{not_in_dict}->{$word}='';
}
}
return $self;
}
sub count_dict
{
my $self= shift;
my ($word)= @_;
my $dict= $self->{dict};
my $this_word_dict= $dict->{$word};
my $report = $word.': '.$this_word_dict."\n";
if (not exists $self->{which_dict}->{$this_word_dict}) {
$self->{which_dict}->{$this_word_dict}= 1;
} else {
$self->{which_dict}->{$this_word_dict}++;
}
return $report;
}
sub read_dict {
my $self= shift;
my ($dict_path)= @_;
if (-e "dict.cache")
{
$self->{dict}= retrieve("dict.cache");
return $self;
}
my %dict;
foreach my $fn (read_dir($dict_path))
{
foreach my $word (split /\n/, read_file($dict_path.'/'.$fn))
{
$dict{$word}= $fn;
}
}
store \%dict, "dict.cache";
$self->{dict}= \%dict;
return $self;
}
1;
package OOText;
use Encode;
use XML::Twig;
use Archive::Zip qw(AZ_OK);
use Carp;
sub new {
my $class= shift;
my ($oo_fn)= @_;
my $t= XML::Twig->new(
twig_roots => {'text:p' => \&text_out },
);
$t->{spellcheck_text} = '';
my $content_xml= get_string_from_zip("content.xml", $oo_fn);
$t->parsestring($content_xml);
bless $t, $class;
return $t;
}
sub text_out {
my ($t, $elt)= @_;
if ($elt->att('text:style-name') eq 'Text body') {
$t->{_spellcheck_text} .= $elt->text."\n";
}
elsif ($elt->att('text:style-name') eq 'code') {
$t->{_code_text} .= $elt->text."\n";
}
}
sub get_text_content {
my $self= shift;
return $self->{_spellcheck_text};
}
sub get_code_content {
my $self= shift;
return $self->{_code_text};
}
sub get_string_from_zip {
my ($zip_member_fn, $zip_archive_fn)= @_;
my $zip = Archive::Zip->new();
if ($zip->read($zip_archive_fn) == AZ_OK) {
return $zip->contents($zip_member_fn);
} else {
croak "Can't find member $zip_member_fn in archive $zip_archive_fn";
}
}
package Fathom;
use Lingua::EN::Fathom;
use Text::FormatTable;
use Math::Round qw(nearest);
sub fathom
{
my ($content)= @_;
my $fog_description =
{ 'unreadable' => [18,1e12],
'difficult' => [14,18] ,
'ideal' => [11,14] ,
'acceptable' => [8,11] ,
'childish' => [-1e12,8] };
my $text = new Lingua::EN::Fathom;
# $text->analyse_file($file);
my $accumulate = 1;
$text->analyse_block($content, $accumulate);
my $fog = nearest(0.1, $text->fog);
my $flesch = nearest(0.1, $text->flesch);
my $kincaid = nearest(0.1, $text->kincaid);
my $table = Text::FormatTable->new('r l l l');
my $fog_descr;
foreach my $fog (keys %$fog_description) {
if ($text->fog >= $fog_description->{$fog}[0] and
$text->fog < $fog_description->{$fog}[1]) {
$fog_descr= $fog;
}
}
my $percent_complex_words = nearest(0.1,$text->percent_complex_words);
$table->row("Fog", nearest(0.1, $text->fog), $fog_descr, "");
$table->row("Grade Level", $kincaid, "(Flesch-Kincaid)", "");
$table->row("Flesch", $flesch, "", "");
$table->row("Complex words", "$percent_complex_words %", "", "");
$table->row("Chars", $text->num_chars, "Words", $text->num_words);
$table->row("Lines", $text->num_text_lines, "Blank Lines", $text->num_blank_lines);
$table->row("Sentences", $text->num_sentences, "Paragraphs", $text->num_paragraphs);
print $table->render();
}
1;

08/01/2005
By toma