optional case-insensitivity for searching
[bse.git] / site / cgi-bin / modules / BSE / Index / BSE.pm
CommitLineData
7f7330d7
TC
1package BSE::Index::BSE;
2use strict;
3use base 'BSE::Index::Base';
4use BSE::DB;
771ab646
TC
5use Constants qw($MAXPHRASE);
6use BSE::CfgInfo qw(cfg_data_dir);
7f7330d7 7
5d2a441a 8our $VERSION = "1.004";
cb7fd78d 9
7f7330d7
TC
10sub new {
11 my ($class, %opts) = @_;
12
13 my $self = bless \%opts, $class;
14
15 $self->{dh} = BSE::DB->single;
16 $self->{dropIndex} = $self->{dh}->stmt('dropIndex')
17 or die "No dropIndex member in BSE::DB";
18 $self->{insertIndex} = $self->{dh}->stmt('insertIndex')
19 or die "No insertIndex member in BSE::DB";
65ad6c28 20
5d2a441a
TC
21 $self->{case} eq 'controlled'
22 and die "BSE built-in searcher doesn't support controlled search (yet)";
23
65ad6c28
TC
24 my $priority = $self->{cfg}->entry("search", "index_priority", "speed");
25 if ($priority eq "speed") {
26 $self->{index} = {};
27 }
28 elsif ($priority eq "memory") {
8af4f235
TC
29 eval { require DBM::Deep; 1 }
30 or die "DBM::Deep must be installed to use [search].index_priority=memory\n";
65ad6c28
TC
31 require File::Temp;
32 my $fh = File::Temp->new;
33 $self->{index} = DBM::Deep->new
34 (
35 fh => $fh,
36 locking => 0,
37 autoflush => 0,
38 data_sector_size => 256,
39 );
40 $self->{fh} = $fh;
41 $self->{filename} = $fh->filename;
42 }
43 else {
44 die "Unknown [search].index_priority of '$priority'\n";
45 }
46 $self->{priority} = $priority;
7f7330d7
TC
47
48 $self->{decay_multiplier} = 0.4;
49
d401b996
TC
50 $self->{wordre} = $self->{cfg}->entry("search", "wordre", "\\w+");
51
7f7330d7
TC
52 return $self;
53}
54
55sub start_index {
56 my $self = shift;
57
771ab646
TC
58 my $data_dir = cfg_data_dir();
59 my $stopwords = "$data_dir/stopwords.txt";
7f7330d7
TC
60
61 # load the stop words
62 open STOP, "< $stopwords"
63 or die "Cannot open $stopwords: $!";
64 chomp(my @stopwords = <STOP>);
65 tr/\r//d for @stopwords; # just in case
66 my %stopwords;
67 @stopwords{@stopwords} = (1) x @stopwords;
68 close STOP;
45062f53 69 $self->{weights} = {};
7f7330d7
TC
70
71 return 1;
72}
73
74sub process_article {
75 my ($self, $article, $section, $indexas, $fields) = @_;
76
45062f53 77 $self->{weights}{$indexas} ||= {};
7f7330d7
TC
78 for my $field (sort { $self->{scores}{$b} <=> $self->{scores}{$a} }
79 keys %$fields) {
d401b996 80 my $word_re = $self->{cfg}->entry("search", "wordre_$field", $self->{wordre});
7f7330d7
TC
81 my $text = $fields->{$field};
82 my $score = $self->{scores}{$field};
83 my %seen; # $seen{phrase} non-zero if seen for this field
84
85 # for each paragraph
86 for my $para (split /\n/, $text) {
d401b996
TC
87 my @words;
88 while ($para =~ /($word_re)/g) {
89 push @words, $1;
90 }
7f7330d7
TC
91 my @buffer;
92
93 for my $word (@words) {
94 if ($self->{stopwords}{lc $word}) {
45062f53 95 $self->process($indexas, $section->{id}, $score, $self->{weights}{$indexas}, \%seen,
7f7330d7
TC
96 @buffer) if @buffer;
97 @buffer = ();
98 }
99 else {
100 push(@buffer, $word);
101 }
102 }
45062f53 103 $self->process($indexas, $section->{id}, $score, $self->{weights}{$indexas}, \%seen,
7f7330d7
TC
104 @buffer) if @buffer;
105 }
bf184a23
TC
106 if ($field eq 'product_code' && $text) {
107 $self->process($indexas, $section->{id}, $score, $self->{weights}{$indexas}, \%seen, $text);
108 }
7f7330d7
TC
109 }
110}
111
112sub process {
113 my ($self, $id, $sectionid, $score, $weights, $seen, @words) = @_;
114
115 for (my $start = 0; $start < @words; ++$start) {
116 my $end = $start + $MAXPHRASE-1;
117 $end = $#words if $end > $#words;
118
119 for my $phrase (map { "@words[$start..$_]" } $start..$end) {
5d2a441a
TC
120 if ($self->{case} eq 'context') {
121 if (lc $phrase ne $phrase && !$seen->{lc $phrase}++) {
122 my $temp = $self->{index}{lc $phrase};
123 if (exists $temp->{$id}) {
124 $weights->{lc $phrase} *= $self->{decay_multiplier};
125 $temp->{$id}[1] += $score * $weights->{lc $phrase};
126 }
127 else {
128 $weights->{lc $phrase} = 1.0;
129 $temp->{$id} = [ $sectionid, $score ];
130 }
131 $self->{index}{lc $phrase} = $temp;
7f7330d7 132 }
5d2a441a
TC
133 }
134 else {
135 $phrase = lc $phrase;
7f7330d7
TC
136 }
137 if (!$seen->{$phrase}++) {
65ad6c28
TC
138 my $temp = $self->{index}{$phrase};
139 if (exists $temp->{$id}) {
7f7330d7 140 $weights->{$phrase} *= $self->{decay_multiplier};
65ad6c28 141 $temp->{$id}[1] += $score * $weights->{$phrase};
7f7330d7
TC
142 }
143 else {
144 $weights->{$phrase} = 1.0;
65ad6c28 145 $temp->{$id} = [ $sectionid, $score ];
7f7330d7 146 }
65ad6c28 147 $self->{index}{$phrase} = $temp;
7f7330d7
TC
148 }
149 }
150 }
151}
152
153sub end_index {
154 my $self = shift;
155
156 $self->{dropIndex}->execute()
157 or die "dropIndex failed: ", $self->{dropindex}->errstr, "\n";
158
159 my $insertIndex = $self->{insertIndex};
160 for my $key (sort keys %{$self->{index}}) {
161 my $word = $self->{index}{$key};
162 # sort by reverse score so that if we overflow the field we
163 # get the highest scoring matches
164 my @ids = sort { $word->{$b}[1] <=> $word->{$a}[1] } keys %$word;
165 my @sections = map { $_->[0] } @$word{@ids};
166 my @scores = map { $_->[1] } @$word{@ids};
167
168 $insertIndex->execute($key, "@ids", "@sections", "@scores")
169 or die "Cannot insert into index: ", $insertIndex->errstr;
170 }
65ad6c28
TC
171
172 if ($self->{priority} eq "memory") {
173 delete $self->{dbm};
174 delete $self->{fh};
175 unlink $self->{filename};
176 }
7f7330d7
TC
177}
178
1791;