add a host config parameter for S3 storages
[bse.git] / site / cgi-bin / modules / BSE / Index / BSE.pm
CommitLineData
7f7330d7
TC
1package BSE::Index::BSE;
2use strict;
3use base 'BSE::Index::Base';
4use BSE::DB;
771ab646
TC
5use Constants qw($MAXPHRASE);
6use BSE::CfgInfo qw(cfg_data_dir);
7f7330d7 7
8af4f235 8our $VERSION = "1.003";
cb7fd78d 9
7f7330d7
TC
10sub new {
11 my ($class, %opts) = @_;
12
13 my $self = bless \%opts, $class;
14
15 $self->{dh} = BSE::DB->single;
16 $self->{dropIndex} = $self->{dh}->stmt('dropIndex')
17 or die "No dropIndex member in BSE::DB";
18 $self->{insertIndex} = $self->{dh}->stmt('insertIndex')
19 or die "No insertIndex member in BSE::DB";
65ad6c28
TC
20
21 my $priority = $self->{cfg}->entry("search", "index_priority", "speed");
22 if ($priority eq "speed") {
23 $self->{index} = {};
24 }
25 elsif ($priority eq "memory") {
8af4f235
TC
26 eval { require DBM::Deep; 1 }
27 or die "DBM::Deep must be installed to use [search].index_priority=memory\n";
65ad6c28
TC
28 require File::Temp;
29 my $fh = File::Temp->new;
30 $self->{index} = DBM::Deep->new
31 (
32 fh => $fh,
33 locking => 0,
34 autoflush => 0,
35 data_sector_size => 256,
36 );
37 $self->{fh} = $fh;
38 $self->{filename} = $fh->filename;
39 }
40 else {
41 die "Unknown [search].index_priority of '$priority'\n";
42 }
43 $self->{priority} = $priority;
7f7330d7
TC
44
45 $self->{decay_multiplier} = 0.4;
46
d401b996
TC
47 $self->{wordre} = $self->{cfg}->entry("search", "wordre", "\\w+");
48
7f7330d7
TC
49 return $self;
50}
51
52sub start_index {
53 my $self = shift;
54
771ab646
TC
55 my $data_dir = cfg_data_dir();
56 my $stopwords = "$data_dir/stopwords.txt";
7f7330d7
TC
57
58 # load the stop words
59 open STOP, "< $stopwords"
60 or die "Cannot open $stopwords: $!";
61 chomp(my @stopwords = <STOP>);
62 tr/\r//d for @stopwords; # just in case
63 my %stopwords;
64 @stopwords{@stopwords} = (1) x @stopwords;
65 close STOP;
45062f53 66 $self->{weights} = {};
7f7330d7
TC
67
68 return 1;
69}
70
71sub process_article {
72 my ($self, $article, $section, $indexas, $fields) = @_;
73
45062f53 74 $self->{weights}{$indexas} ||= {};
7f7330d7
TC
75 for my $field (sort { $self->{scores}{$b} <=> $self->{scores}{$a} }
76 keys %$fields) {
d401b996 77 my $word_re = $self->{cfg}->entry("search", "wordre_$field", $self->{wordre});
7f7330d7
TC
78 my $text = $fields->{$field};
79 my $score = $self->{scores}{$field};
80 my %seen; # $seen{phrase} non-zero if seen for this field
81
82 # for each paragraph
83 for my $para (split /\n/, $text) {
d401b996
TC
84 my @words;
85 while ($para =~ /($word_re)/g) {
86 push @words, $1;
87 }
7f7330d7
TC
88 my @buffer;
89
90 for my $word (@words) {
91 if ($self->{stopwords}{lc $word}) {
45062f53 92 $self->process($indexas, $section->{id}, $score, $self->{weights}{$indexas}, \%seen,
7f7330d7
TC
93 @buffer) if @buffer;
94 @buffer = ();
95 }
96 else {
97 push(@buffer, $word);
98 }
99 }
45062f53 100 $self->process($indexas, $section->{id}, $score, $self->{weights}{$indexas}, \%seen,
7f7330d7
TC
101 @buffer) if @buffer;
102 }
bf184a23
TC
103 if ($field eq 'product_code' && $text) {
104 $self->process($indexas, $section->{id}, $score, $self->{weights}{$indexas}, \%seen, $text);
105 }
7f7330d7
TC
106 }
107}
108
109sub process {
110 my ($self, $id, $sectionid, $score, $weights, $seen, @words) = @_;
111
112 for (my $start = 0; $start < @words; ++$start) {
113 my $end = $start + $MAXPHRASE-1;
114 $end = $#words if $end > $#words;
115
116 for my $phrase (map { "@words[$start..$_]" } $start..$end) {
117 if (lc $phrase ne $phrase && !$seen->{lc $phrase}++) {
65ad6c28
TC
118 my $temp = $self->{index}{lc $phrase};
119 if (exists $temp->{$id}) {
7f7330d7 120 $weights->{lc $phrase} *= $self->{decay_multiplier};
65ad6c28 121 $temp->{$id}[1] += $score * $weights->{lc $phrase};
7f7330d7
TC
122 }
123 else {
124 $weights->{lc $phrase} = 1.0;
65ad6c28 125 $temp->{$id} = [ $sectionid, $score ];
7f7330d7 126 }
65ad6c28 127 $self->{index}{lc $phrase} = $temp;
7f7330d7
TC
128 }
129 if (!$seen->{$phrase}++) {
65ad6c28
TC
130 my $temp = $self->{index}{$phrase};
131 if (exists $temp->{$id}) {
7f7330d7 132 $weights->{$phrase} *= $self->{decay_multiplier};
65ad6c28 133 $temp->{$id}[1] += $score * $weights->{$phrase};
7f7330d7
TC
134 }
135 else {
136 $weights->{$phrase} = 1.0;
65ad6c28 137 $temp->{$id} = [ $sectionid, $score ];
7f7330d7 138 }
65ad6c28 139 $self->{index}{$phrase} = $temp;
7f7330d7
TC
140 }
141 }
142 }
143}
144
145sub end_index {
146 my $self = shift;
147
148 $self->{dropIndex}->execute()
149 or die "dropIndex failed: ", $self->{dropindex}->errstr, "\n";
150
151 my $insertIndex = $self->{insertIndex};
152 for my $key (sort keys %{$self->{index}}) {
153 my $word = $self->{index}{$key};
154 # sort by reverse score so that if we overflow the field we
155 # get the highest scoring matches
156 my @ids = sort { $word->{$b}[1] <=> $word->{$a}[1] } keys %$word;
157 my @sections = map { $_->[0] } @$word{@ids};
158 my @scores = map { $_->[1] } @$word{@ids};
159
160 $insertIndex->execute($key, "@ids", "@sections", "@scores")
161 or die "Cannot insert into index: ", $insertIndex->errstr;
162 }
65ad6c28
TC
163
164 if ($self->{priority} eq "memory") {
165 delete $self->{dbm};
166 delete $self->{fh};
167 unlink $self->{filename};
168 }
7f7330d7
TC
169}
170
1711;