treat the whole product code as a word for indexing
[bse.git] / site / cgi-bin / modules / BSE / Index / BSE.pm
CommitLineData
7f7330d7
TC
1package BSE::Index::BSE;
2use strict;
3use base 'BSE::Index::Base';
4use BSE::DB;
5use Constants qw($DATADIR $MAXPHRASE);
6
# Constructor.
#
# Any name/value pairs supplied by the caller become fields of the new
# object.  The fields dh, dropIndex, insertIndex, index and
# decay_multiplier are then set here, replacing any caller-supplied
# values of the same name.
#
# Dies if BSE::DB does not supply either the 'dropIndex' or the
# 'insertIndex' statement.
sub new {
  my $class = shift;
  my %fields = @_;

  my $self = bless \%fields, $class;

  # the shared database object and the two statements used by end_index()
  my $db = BSE::DB->single;
  $self->{dh} = $db;

  my $drop = $db->stmt('dropIndex');
  $self->{dropIndex} = $drop;
  die "No dropIndex member in BSE::DB" unless $drop;

  my $insert = $db->stmt('insertIndex');
  $self->{insertIndex} = $insert;
  die "No insertIndex member in BSE::DB" unless $insert;

  # phrase => { id => [ sectionid, score ] }, filled in by process()
  $self->{index} = {};

  # factor applied to a phrase's weight each time it recurs for an id
  $self->{decay_multiplier} = 0.4;

  return $self;
}
23
# Prepare for an indexing run.
#
# Reads the stop word list from $DATADIR/stopwords.txt (one word per
# line) and stores it in $self->{stopwords} as a hash keyed on the
# lower-cased word, which is the form process_article() looks up.
# Any stop words already present in $self->{stopwords} are kept.
# Also resets $self->{weights}.
#
# Returns 1.  Dies if the stop word file cannot be opened.
sub start_index {
  my $self = shift;

  my $stopwords = "$DATADIR/stopwords.txt";

  # load the stop words
  open my $stop, '<', $stopwords
    or die "Cannot open $stopwords: $!";
  chomp(my @stopwords = <$stop>);
  close $stop;
  tr/\r//d for @stopwords; # just in case

  # process_article() tests $self->{stopwords}{lc $word}, so the keys
  # must be lower case or a capitalised entry could never match
  my %stopwords;
  $stopwords{lc $_} = 1 for @stopwords;

  # previously the list was built and then thrown away, so no word was
  # ever treated as a stop word
  $self->{stopwords} = { %{ $self->{stopwords} || {} }, %stopwords };

  $self->{weights} = {};

  return 1;
}
41
# Add the text of one article to the in-memory index.
#
#   $article - the article being indexed (not used here)
#   $section - hash ref; its {id} is recorded with each index entry
#   $indexas - the id the text is indexed under
#   $fields  - hash ref of field name => text to index
#
# Fields are handled in descending order of their configured score
# ($self->{scores}).  Each line of a field is split into words; runs of
# words between stop words are handed to process(), which indexes the
# phrases they contain.  For the product_code field the complete value
# is additionally indexed as a single word.
sub process_article {
  my ($self, $article, $section, $indexas, $fields) = @_;

  $self->{weights}{$indexas} ||= {};
  my $weights = $self->{weights}{$indexas};

  for my $field (sort { $self->{scores}{$b} <=> $self->{scores}{$a} }
		 keys %$fields) {
    my $text = $fields->{$field};
    my $score = $self->{scores}{$field};
    my %seen; # $seen{phrase} non-zero if seen for this field

    # index a run of words, if there is one
    my $flush = sub {
      $self->process($indexas, $section->{id}, $score, $weights, \%seen, @_)
	if @_;
    };

    # for each paragraph
    for my $para (split /\n/, $text) {
      # split() returns a leading empty field when the paragraph starts
      # with a non-word character; drop it so that the empty string (and
      # phrases with a leading space) are never indexed as terms
      my @words = grep { $_ ne '' } split /\W+/, $para;
      my @buffer;

      for my $word (@words) {
	if ($self->{stopwords}{lc $word}) {
	  # a stop word ends the current run of words
	  $flush->(@buffer);
	  @buffer = ();
	}
	else {
	  push(@buffer, $word);
	}
      }
      $flush->(@buffer);
    }

    # treat the whole product code as a word, punctuation included
    if ($field eq 'product_code' && $text) {
      $flush->($text);
    }
  }
}
75
# Index every phrase of up to $MAXPHRASE consecutive words in @words.
#
#   $id        - the id the phrases are indexed under
#   $sectionid - stored alongside the score in each new index entry
#   $score     - score contributed by the field being indexed
#   $weights   - hash ref of phrase => current weight for this id
#   $seen      - hash ref of phrases already counted for this field
#   @words     - a run of consecutive words
#
# Each phrase is indexed as written and, when that differs, in lower
# case.  A phrase is counted at most once per $seen hash.  The first
# time a phrase is recorded for $id it gets the full score; each later
# time its weight is multiplied by $self->{decay_multiplier} and
# score * weight is added to the stored score.
sub process {
  my ($self, $id, $sectionid, $score, $weights, $seen, @words) = @_;

  # record one occurrence of $term for $id
  my $record = sub {
    my ($term) = @_;

    unless (exists $self->{index}{$term}{$id}) {
      $weights->{$term} = 1.0;
      $self->{index}{$term}{$id} = [ $sectionid, $score ];
      return;
    }

    $weights->{$term} *= $self->{decay_multiplier};
    $self->{index}{$term}{$id}[1] += $score * $weights->{$term};
  };

  for my $first (0 .. $#words) {
    # a phrase may not run past the end of the words
    my $limit = $first + $MAXPHRASE - 1;
    my $last = $limit > $#words ? $#words : $limit;

    for my $stop ($first .. $last) {
      my $phrase = "@words[$first .. $stop]";
      my $folded = lc $phrase;

      # the lower case form first, but only where it is a distinct term
      if ($folded ne $phrase) {
	$record->($folded) unless $seen->{$folded}++;
      }
      $record->($phrase) unless $seen->{$phrase}++;
    }
  }
}
109
# Write the in-memory index to the database.
#
# Executes the dropIndex statement, then executes insertIndex once per
# indexed phrase (in sorted phrase order) with four values: the phrase,
# and the space separated ids, section ids and scores of its entries.
#
# Dies if either statement fails.
sub end_index {
  my $self = shift;

  # the key is 'dropIndex'; it was previously misspelled 'dropindex'
  # here, so a failure died on an undefined value and the real
  # database error was lost
  $self->{dropIndex}->execute()
    or die "dropIndex failed: ", $self->{dropIndex}->errstr, "\n";

  my $insertIndex = $self->{insertIndex};
  for my $key (sort keys %{$self->{index}}) {
    my $word = $self->{index}{$key};
    # sort by reverse score so that if we overflow the field we
    # get the highest scoring matches
    my @ids = sort { $word->{$b}[1] <=> $word->{$a}[1] } keys %$word;
    my @sections = map { $_->[0] } @$word{@ids};
    my @scores = map { $_->[1] } @$word{@ids};

    $insertIndex->execute($key, "@ids", "@sections", "@scores")
      or die "Cannot insert into index: ", $insertIndex->errstr;
  }
}
129
1301;