t10edit.t now cleans up the article it creates
[bse.git] / site / cgi-bin / modules / BSE / Index / BSE.pm
CommitLineData
7f7330d7
TC
1package BSE::Index::BSE;
2use strict;
3use base 'BSE::Index::Base';
4use BSE::DB;
5use Constants qw($DATADIR $MAXPHRASE);
6
65ad6c28 7our $VERSION = "1.001";
cb7fd78d 8
7f7330d7
TC
9sub new {
10 my ($class, %opts) = @_;
11
12 my $self = bless \%opts, $class;
13
14 $self->{dh} = BSE::DB->single;
15 $self->{dropIndex} = $self->{dh}->stmt('dropIndex')
16 or die "No dropIndex member in BSE::DB";
17 $self->{insertIndex} = $self->{dh}->stmt('insertIndex')
18 or die "No insertIndex member in BSE::DB";
65ad6c28
TC
19
20 my $priority = $self->{cfg}->entry("search", "index_priority", "speed");
21 if ($priority eq "speed") {
22 $self->{index} = {};
23 }
24 elsif ($priority eq "memory") {
25 require DBM::Deep;
26 require File::Temp;
27 my $fh = File::Temp->new;
28 $self->{index} = DBM::Deep->new
29 (
30 fh => $fh,
31 locking => 0,
32 autoflush => 0,
33 data_sector_size => 256,
34 );
35 $self->{fh} = $fh;
36 $self->{filename} = $fh->filename;
37 }
38 else {
39 die "Unknown [search].index_priority of '$priority'\n";
40 }
41 $self->{priority} = $priority;
7f7330d7
TC
42
43 $self->{decay_multiplier} = 0.4;
44
d401b996
TC
45 $self->{wordre} = $self->{cfg}->entry("search", "wordre", "\\w+");
46
7f7330d7
TC
47 return $self;
48}
49
50sub start_index {
51 my $self = shift;
52
53 my $stopwords = "$DATADIR/stopwords.txt";
54
55 # load the stop words
56 open STOP, "< $stopwords"
57 or die "Cannot open $stopwords: $!";
58 chomp(my @stopwords = <STOP>);
59 tr/\r//d for @stopwords; # just in case
60 my %stopwords;
61 @stopwords{@stopwords} = (1) x @stopwords;
62 close STOP;
45062f53 63 $self->{weights} = {};
7f7330d7
TC
64
65 return 1;
66}
67
68sub process_article {
69 my ($self, $article, $section, $indexas, $fields) = @_;
70
45062f53 71 $self->{weights}{$indexas} ||= {};
7f7330d7
TC
72 for my $field (sort { $self->{scores}{$b} <=> $self->{scores}{$a} }
73 keys %$fields) {
d401b996 74 my $word_re = $self->{cfg}->entry("search", "wordre_$field", $self->{wordre});
7f7330d7
TC
75 my $text = $fields->{$field};
76 my $score = $self->{scores}{$field};
77 my %seen; # $seen{phrase} non-zero if seen for this field
78
79 # for each paragraph
80 for my $para (split /\n/, $text) {
d401b996
TC
81 my @words;
82 while ($para =~ /($word_re)/g) {
83 push @words, $1;
84 }
7f7330d7
TC
85 my @buffer;
86
87 for my $word (@words) {
88 if ($self->{stopwords}{lc $word}) {
45062f53 89 $self->process($indexas, $section->{id}, $score, $self->{weights}{$indexas}, \%seen,
7f7330d7
TC
90 @buffer) if @buffer;
91 @buffer = ();
92 }
93 else {
94 push(@buffer, $word);
95 }
96 }
45062f53 97 $self->process($indexas, $section->{id}, $score, $self->{weights}{$indexas}, \%seen,
7f7330d7
TC
98 @buffer) if @buffer;
99 }
bf184a23
TC
100 if ($field eq 'product_code' && $text) {
101 $self->process($indexas, $section->{id}, $score, $self->{weights}{$indexas}, \%seen, $text);
102 }
7f7330d7
TC
103 }
104}
105
106sub process {
107 my ($self, $id, $sectionid, $score, $weights, $seen, @words) = @_;
108
109 for (my $start = 0; $start < @words; ++$start) {
110 my $end = $start + $MAXPHRASE-1;
111 $end = $#words if $end > $#words;
112
113 for my $phrase (map { "@words[$start..$_]" } $start..$end) {
114 if (lc $phrase ne $phrase && !$seen->{lc $phrase}++) {
65ad6c28
TC
115 my $temp = $self->{index}{lc $phrase};
116 if (exists $temp->{$id}) {
7f7330d7 117 $weights->{lc $phrase} *= $self->{decay_multiplier};
65ad6c28 118 $temp->{$id}[1] += $score * $weights->{lc $phrase};
7f7330d7
TC
119 }
120 else {
121 $weights->{lc $phrase} = 1.0;
65ad6c28 122 $temp->{$id} = [ $sectionid, $score ];
7f7330d7 123 }
65ad6c28 124 $self->{index}{lc $phrase} = $temp;
7f7330d7
TC
125 }
126 if (!$seen->{$phrase}++) {
65ad6c28
TC
127 my $temp = $self->{index}{$phrase};
128 if (exists $temp->{$id}) {
7f7330d7 129 $weights->{$phrase} *= $self->{decay_multiplier};
65ad6c28 130 $temp->{$id}[1] += $score * $weights->{$phrase};
7f7330d7
TC
131 }
132 else {
133 $weights->{$phrase} = 1.0;
65ad6c28 134 $temp->{$id} = [ $sectionid, $score ];
7f7330d7 135 }
65ad6c28 136 $self->{index}{$phrase} = $temp;
7f7330d7
TC
137 }
138 }
139 }
140}
141
142sub end_index {
143 my $self = shift;
144
145 $self->{dropIndex}->execute()
146 or die "dropIndex failed: ", $self->{dropindex}->errstr, "\n";
147
148 my $insertIndex = $self->{insertIndex};
149 for my $key (sort keys %{$self->{index}}) {
150 my $word = $self->{index}{$key};
151 # sort by reverse score so that if we overflow the field we
152 # get the highest scoring matches
153 my @ids = sort { $word->{$b}[1] <=> $word->{$a}[1] } keys %$word;
154 my @sections = map { $_->[0] } @$word{@ids};
155 my @scores = map { $_->[1] } @$word{@ids};
156
157 $insertIndex->execute($key, "@ids", "@sections", "@scores")
158 or die "Cannot insert into index: ", $insertIndex->errstr;
159 }
65ad6c28
TC
160
161 if ($self->{priority} eq "memory") {
162 delete $self->{dbm};
163 delete $self->{fh};
164 unlink $self->{filename};
165 }
7f7330d7
TC
166}
167
1681;