optional case-insensitivity for searching
[bse.git] / site / cgi-bin / modules / BSE / Index / BSE.pm
index 31bea3a..7958644 100644 (file)
@@ -2,7 +2,10 @@ package BSE::Index::BSE;
 use strict;
 use base 'BSE::Index::Base';
 use BSE::DB;
-use Constants qw($DATADIR $MAXPHRASE);
+use Constants qw($MAXPHRASE);
+use BSE::CfgInfo qw(cfg_data_dir);
+
+our $VERSION = "1.004";
 
 sub new {
   my ($class, %opts) = @_;
@@ -14,7 +17,33 @@ sub new {
     or die "No dropIndex member in BSE::DB";
   $self->{insertIndex} = $self->{dh}->stmt('insertIndex')
     or die "No insertIndex member in BSE::DB";
-  $self->{index} = {};
+
+  $self->{case} eq 'controlled'
+    and die "BSE built-in searcher doesn't support controlled search (yet)";
+
+  my $priority = $self->{cfg}->entry("search", "index_priority", "speed");
+  if ($priority eq "speed") {
+    $self->{index} = {};
+  }
+  elsif ($priority eq "memory") {
+    eval { require DBM::Deep; 1 }
+      or die "DBM::Deep must be installed to use [search].index_priority=memory\n";
+    require File::Temp;
+    my $fh = File::Temp->new;
+    $self->{index} = DBM::Deep->new
+      (
+       fh => $fh,
+       locking => 0,
+       autoflush => 0,
+       data_sector_size => 256,
+      );
+    $self->{fh} = $fh;
+    $self->{filename} = $fh->filename;
+  }
+  else {
+    die "Unknown [search].index_priority of '$priority'\n";
+  }
+  $self->{priority} = $priority;
 
   $self->{decay_multiplier} = 0.4;
 
@@ -26,7 +55,8 @@ sub new {
 sub start_index {
   my $self = shift;
 
-  my $stopwords = "$DATADIR/stopwords.txt";
+  my $data_dir = cfg_data_dir();
+  my $stopwords = "$data_dir/stopwords.txt";
 
   # load the stop words
   open STOP, "< $stopwords"
@@ -87,27 +117,34 @@ sub process {
     $end = $#words if $end > $#words;
     
     for my $phrase (map { "@words[$start..$_]" } $start..$end) {
-      if (lc $phrase ne $phrase && !$seen->{lc $phrase}++) {
-       if (exists $self->{index}{lc $phrase}{$id}) {
-         $weights->{lc $phrase} *= $self->{decay_multiplier};
-         $self->{index}{lc $phrase}{$id}[1] += 
-           $score * $weights->{lc $phrase};
-       }
-       else {
-         $weights->{lc $phrase} = 1.0;
-         $self->{index}{lc $phrase}{$id} = [ $sectionid, $score ];
+      if ($self->{case} eq 'context') {
+       if (lc $phrase ne $phrase && !$seen->{lc $phrase}++) {
+         my $temp = $self->{index}{lc $phrase};
+         if (exists $temp->{$id}) {
+           $weights->{lc $phrase} *= $self->{decay_multiplier};
+           $temp->{$id}[1] += $score * $weights->{lc $phrase};
+         }
+         else {
+           $weights->{lc $phrase} = 1.0;
+           $temp->{$id} = [ $sectionid, $score ];
+         }
+         $self->{index}{lc $phrase} = $temp;
        }
       }
+      else {
+       $phrase = lc $phrase;
+      }
       if (!$seen->{$phrase}++) {
-       if (exists $self->{index}{$phrase}{$id}) {
+       my $temp = $self->{index}{$phrase};
+       if (exists $temp->{$id}) {
          $weights->{$phrase} *= $self->{decay_multiplier};
-         $self->{index}{$phrase}{$id}[1] += 
-           $score * $weights->{$phrase};
+         $temp->{$id}[1] += $score * $weights->{$phrase};
        }
        else {
          $weights->{$phrase} = 1.0;
-         $self->{index}{$phrase}{$id} = [ $sectionid, $score ];
+         $temp->{$id} = [ $sectionid, $score ];
        }
+       $self->{index}{$phrase} = $temp;
       }
     }
   }
@@ -131,6 +168,12 @@ sub end_index {
     $insertIndex->execute($key, "@ids", "@sections", "@scores")
       or die "Cannot insert into index: ", $insertIndex->errstr;
   }
+
+  if ($self->{priority} eq "memory") {
+    delete $self->{dbm};
+    delete $self->{fh};
+    unlink $self->{filename};
+  }
 }
 
 1;