Versions Compared

Key

  • This line was added.
  • This line was removed.
  • Formatting was changed.

...

Requirements

You will need convert (from ImageMagick) and gocr installed.

Installation

...

– Author: Maarten de Boer, mdeboer at iua dot upf dot edu

Changelog

Version 2:

  • Use convert instead of giftopnm, because I received some mails with .gif attachments that were actually JPEG files; convert handles those correctly.
  • Added some words to the spam word list.

Code

Ocr.cf

No Format
# Load the Ocr plugin class from the file Ocr.pm.
loadplugin Ocr Ocr.pm
# Body rule OCR: its result comes from the plugin's check_ocr() eval function.
body OCR eval:check_ocr()
describe OCR Check if text in attached images contains spam words
# Score assigned to the OCR rule.
score OCR 3.0

Ocr.pm

No Format

# Ocr plugin, version 2
package Ocr;

use strict;
use Mail::SpamAssassin;
use Mail::SpamAssassin::Util;
use Mail::SpamAssassin::Plugin;

our @ISA = qw (Mail::SpamAssassin::Plugin);

# constructor: register the eval rule
# Constructor.
#
# Builds the plugin object through the parent class constructor, re-blesses
# it into this class and registers "check_ocr" as an eval rule so that it
# can be referenced from rule definitions.
#
# Parameters:
#   $class  - class name, or an existing object whose class is reused
#   $mailsa - value handed unchanged to the parent constructor
#
# Returns the new plugin object.
sub new {
   my $class  = shift;
   my $mailsa = shift;

   # Accept both Ocr->new(...) and $obj->new(...).
   $class = ref($class) || $class;

   my $self = bless( $class->SUPER::new($mailsa), $class );
   $self->register_eval_rule("check_ocr");

   return $self;
}

# Eval rule: run OCR over every image/gif part of the message and report
# whether the recognised text contains spam words.
#
# Each part whose declared content type is image/gif is decoded and piped
# through "convert" (to PNM) and "gocr"; the recognised text is written to a
# temporary file and scanned line by line.  Every (line, word) pair that
# matches case-insensitively adds one to a hit counter that is shared by all
# images of the message.
#
# Parameters:
#   $self - the plugin object (not used here)
#   $pms  - per-message object; $pms->{msg}->find_parts("image") supplies
#           the parts to inspect
#
# Returns true when more than $min_hits matches were counted, otherwise
# false.  Parts for which the temporary file or the OCR pipeline cannot be
# set up are skipped rather than aborting the scan.
sub check_ocr {
   my ( $self, $pms ) = @_;

   # Number of hits that must be exceeded.  The published listing read "21",
   # which appears to be the old "2" and the new "1" run together (the same
   # way "giftopnm" and "convert" were run together in the command line).
   my $min_hits = 1;

   my @words =
     ( 'company', 'money', 'stock', 'million', 'thousand', 'buy', 'price', 'don\'t' );

   # Compile each word once as a literal, case-insensitive pattern.
   my @patterns = map { qr/\Q$_\E/i } @words;

   # A converter that exits early must not kill this process via SIGPIPE.
   local $SIG{PIPE} = 'IGNORE';

   my $cnt = 0;
   foreach my $part ( $pms->{msg}->find_parts("image") ) {
      my ($ctype) =
        Mail::SpamAssassin::Util::parse_content_type(
         $part->get_header('content-type') );
      next unless defined $ctype && $ctype eq "image/gif";

      # Unpredictable, exclusively created temp file instead of a fixed
      # /tmp name; it is removed when $tmp goes out of scope.
      require File::Temp;
      my $tmp = eval {
         File::Temp->new(
            TEMPLATE => 'spamassassin.ocr.XXXXXXXX',
            TMPDIR   => 1,
         );
      } or next;
      my $outfile = quotemeta $tmp->filename;    # safe to splice into sh

      # convert sniffs the real image format, so mislabelled JPEGs work too.
      open( my $ocr, '|-',
         "/usr/bin/convert - pnm:- | /usr/bin/gocr -i - > $outfile" )
        or next;
      binmode $ocr;
      print {$ocr} $part->decode();
      close $ocr;    # waits for the pipeline; a failure leaves no text to scan

      open( my $text, '<', $tmp->filename ) or next;
      while ( my $line = <$text> ) {
         foreach my $re (@patterns) {
            $cnt++ if $line =~ $re;
         }
      }
      close $text;
   }

   return ( $cnt > $min_hits );
}

1;