make the arithmetic tag handle unknown subtags correctly
[bse.git] / site / cgi-bin / modules / BSE / Index / BSE.pm
CommitLineData
7f7330d7
TC
1package BSE::Index::BSE;
2use strict;
3use base 'BSE::Index::Base';
4use BSE::DB;
5use Constants qw($DATADIR $MAXPHRASE);
6
cb7fd78d
TC
7our $VERSION = "1.000";
8
7f7330d7
TC
9sub new {
10 my ($class, %opts) = @_;
11
12 my $self = bless \%opts, $class;
13
14 $self->{dh} = BSE::DB->single;
15 $self->{dropIndex} = $self->{dh}->stmt('dropIndex')
16 or die "No dropIndex member in BSE::DB";
17 $self->{insertIndex} = $self->{dh}->stmt('insertIndex')
18 or die "No insertIndex member in BSE::DB";
19 $self->{index} = {};
20
21 $self->{decay_multiplier} = 0.4;
22
d401b996
TC
23 $self->{wordre} = $self->{cfg}->entry("search", "wordre", "\\w+");
24
7f7330d7
TC
25 return $self;
26}
27
28sub start_index {
29 my $self = shift;
30
31 my $stopwords = "$DATADIR/stopwords.txt";
32
33 # load the stop words
34 open STOP, "< $stopwords"
35 or die "Cannot open $stopwords: $!";
36 chomp(my @stopwords = <STOP>);
37 tr/\r//d for @stopwords; # just in case
38 my %stopwords;
39 @stopwords{@stopwords} = (1) x @stopwords;
40 close STOP;
45062f53 41 $self->{weights} = {};
7f7330d7
TC
42
43 return 1;
44}
45
46sub process_article {
47 my ($self, $article, $section, $indexas, $fields) = @_;
48
45062f53 49 $self->{weights}{$indexas} ||= {};
7f7330d7
TC
50 for my $field (sort { $self->{scores}{$b} <=> $self->{scores}{$a} }
51 keys %$fields) {
d401b996 52 my $word_re = $self->{cfg}->entry("search", "wordre_$field", $self->{wordre});
7f7330d7
TC
53 my $text = $fields->{$field};
54 my $score = $self->{scores}{$field};
55 my %seen; # $seen{phrase} non-zero if seen for this field
56
57 # for each paragraph
58 for my $para (split /\n/, $text) {
d401b996
TC
59 my @words;
60 while ($para =~ /($word_re)/g) {
61 push @words, $1;
62 }
7f7330d7
TC
63 my @buffer;
64
65 for my $word (@words) {
66 if ($self->{stopwords}{lc $word}) {
45062f53 67 $self->process($indexas, $section->{id}, $score, $self->{weights}{$indexas}, \%seen,
7f7330d7
TC
68 @buffer) if @buffer;
69 @buffer = ();
70 }
71 else {
72 push(@buffer, $word);
73 }
74 }
45062f53 75 $self->process($indexas, $section->{id}, $score, $self->{weights}{$indexas}, \%seen,
7f7330d7
TC
76 @buffer) if @buffer;
77 }
bf184a23
TC
78 if ($field eq 'product_code' && $text) {
79 $self->process($indexas, $section->{id}, $score, $self->{weights}{$indexas}, \%seen, $text);
80 }
7f7330d7
TC
81 }
82}
83
84sub process {
85 my ($self, $id, $sectionid, $score, $weights, $seen, @words) = @_;
86
87 for (my $start = 0; $start < @words; ++$start) {
88 my $end = $start + $MAXPHRASE-1;
89 $end = $#words if $end > $#words;
90
91 for my $phrase (map { "@words[$start..$_]" } $start..$end) {
92 if (lc $phrase ne $phrase && !$seen->{lc $phrase}++) {
93 if (exists $self->{index}{lc $phrase}{$id}) {
94 $weights->{lc $phrase} *= $self->{decay_multiplier};
95 $self->{index}{lc $phrase}{$id}[1] +=
96 $score * $weights->{lc $phrase};
97 }
98 else {
99 $weights->{lc $phrase} = 1.0;
100 $self->{index}{lc $phrase}{$id} = [ $sectionid, $score ];
101 }
102 }
103 if (!$seen->{$phrase}++) {
104 if (exists $self->{index}{$phrase}{$id}) {
105 $weights->{$phrase} *= $self->{decay_multiplier};
106 $self->{index}{$phrase}{$id}[1] +=
107 $score * $weights->{$phrase};
108 }
109 else {
110 $weights->{$phrase} = 1.0;
111 $self->{index}{$phrase}{$id} = [ $sectionid, $score ];
112 }
113 }
114 }
115 }
116}
117
118sub end_index {
119 my $self = shift;
120
121 $self->{dropIndex}->execute()
122 or die "dropIndex failed: ", $self->{dropindex}->errstr, "\n";
123
124 my $insertIndex = $self->{insertIndex};
125 for my $key (sort keys %{$self->{index}}) {
126 my $word = $self->{index}{$key};
127 # sort by reverse score so that if we overflow the field we
128 # get the highest scoring matches
129 my @ids = sort { $word->{$b}[1] <=> $word->{$a}[1] } keys %$word;
130 my @sections = map { $_->[0] } @$word{@ids};
131 my @scores = map { $_->[1] } @$word{@ids};
132
133 $insertIndex->execute($key, "@ids", "@sections", "@scores")
134 or die "Cannot insert into index: ", $insertIndex->errstr;
135 }
136}
137
1381;