]>
Commit | Line | Data |
---|---|---|
aec300cd TC |
1 | package BSE::Search::BSE; |
2 | use strict; | |
3 | use Constants qw(:search); | |
4 | ||
5 | use base 'BSE::Search::Base'; | |
6 | ||
7 | sub new { | |
8 | my ($class, %opts) = @_; | |
9 | ||
10 | return bless \%opts, $class; | |
11 | } | |
12 | ||
13 | sub get_term_matches { | |
14 | my ($term, $allow_wc, $section) = @_; | |
15 | ||
16 | my $dh = BSE::DB->single; | |
17 | ||
18 | my $sth; | |
19 | if ($SEARCH_AUTO_WILDCARD && $allow_wc) { | |
20 | $sth = $dh->stmt('searchIndexWC'); | |
21 | $sth->execute($term."%") | |
22 | or die "Could not execute search: ",$sth->errstr; | |
23 | } | |
24 | else { | |
25 | $sth = $dh->stmt('searchIndex'); | |
26 | $sth->execute($term) | |
27 | or die "Could not execute search: ",$sth->errstr; | |
28 | } | |
29 | ||
30 | my %matches; | |
31 | while (my $row = $sth->fetchrow_arrayref) { | |
32 | # skip any results that contain spaces if our term doesn't | |
33 | # contain spaces. This loses wildcard matches which hit | |
34 | # phrase entries | |
35 | next if $term !~ /\s/ && $row->[0] =~ /\s/; | |
36 | my @ids = split ' ', $row->[1]; | |
37 | my @scores = split ' ', $row->[3]; | |
38 | if ($section) { | |
39 | # only for the section requested | |
40 | my @sections = split ' ', $row->[2]; | |
41 | my @keep = grep { $sections[$_] == $section && $ids[$_] } 0..$#sections; | |
42 | @ids = @ids[@keep]; | |
43 | @scores = @scores[@keep]; | |
44 | } | |
45 | for my $index (0 .. $#ids) { | |
46 | $matches{$ids[$index]} += $scores[$index]; | |
47 | } | |
48 | } | |
49 | ||
50 | return map [ $_, $matches{$_} ], keys %matches; | |
51 | } | |
52 | ||
53 | sub search { | |
d1ca9873 | 54 | my ($self, $words, $section, $date, $terms, $match_all, $req) = @_; |
aec300cd TC |
55 | |
56 | # canonical form | |
57 | $words =~ s/^\s+|\s+$//g; | |
58 | ||
59 | # array of [ term, unquoted, required, weight ] | |
60 | my @terms; | |
61 | my @exclude; | |
62 | while (1) { | |
63 | if ($words =~ /\G\s*-"([^"]+)"/gc | |
64 | || $words =~ /\G\s*-'([^\']+)'/gc) { | |
65 | push @exclude, [ $1, 0 ]; | |
66 | } | |
67 | elsif ($words =~ /\G\s*\+"([^"]+)"/gc | |
68 | || $words =~ /\G\s*\+'([^\']+)'/gc) { | |
69 | push @terms, [ $1, 0, 1, 1 ]; | |
70 | } | |
71 | elsif ($words =~ /\G\s*"([^"]+)"/gc | |
72 | || $words =~ /\G\s*'([^']+)'/gc) { | |
73 | push(@terms, [ $1, 0, 0, 1 ]); | |
74 | } | |
75 | elsif ($words =~ /\G\s*-(\S+)/gc) { | |
76 | push @exclude, [ $1, 1 ]; | |
77 | } | |
78 | elsif ($words =~ /\G\s*\+(\S+)/gc) { | |
79 | push(@terms, [ $1, 1, 1, 1 ]); | |
80 | } | |
81 | elsif ($words =~ /\G\s*(\S+)/gc) { | |
82 | push(@terms, [ $1, 1, 0, 1 ]); | |
83 | } | |
84 | else { | |
85 | last; | |
86 | } | |
87 | } | |
88 | ||
89 | @terms or return; | |
90 | ||
d1ca9873 TC |
91 | if ($match_all) { |
92 | for my $term (@terms) { | |
93 | $term->[2] = 1; | |
94 | } | |
95 | } | |
96 | ||
aec300cd TC |
97 | # if the user entered a plain multi-word phrase |
98 | if ($words !~ /["'+-]/ && $words =~ /\s/) { | |
99 | # treat it as if they entered it in quotes as well | |
100 | # giving articles with that phrase an extra score | |
101 | push(@terms, [ $words, 0, 0, 0.1 ]); | |
102 | } | |
103 | ||
104 | # disable wildcarding for short terms | |
105 | for my $term (@terms) { | |
106 | if ($term->[1] && length($term->[0]) < $SEARCH_WILDCARD_MIN) { | |
107 | $term->[1] = 0; | |
108 | } | |
109 | } | |
110 | ||
111 | my %scores; | |
112 | my %terms; | |
113 | for my $term (grep !$_->[2], @terms) { | |
114 | my @matches = get_term_matches($term->[0], $term->[1], $section); | |
115 | for my $match (@matches) { | |
116 | $scores{$match->[0]} += $match->[1] * $term->[3]; | |
117 | } | |
118 | } | |
119 | my @required = grep $_->[2], @terms; | |
120 | my %delete; # has of id to 1 | |
121 | if (@required) { | |
122 | my %match_required; | |
123 | for my $term (@required) { | |
124 | my @matches = get_term_matches($term->[0], $term->[1], $section); | |
125 | for my $match (@matches) { | |
126 | $scores{$match->[0]} += $match->[1]; | |
127 | ++$match_required{$match->[0]}; | |
128 | } | |
129 | } | |
130 | for my $id (keys %scores) { | |
131 | if (!$match_required{$id} || $match_required{$id} != @required) { | |
132 | ++$delete{$id}; | |
133 | } | |
134 | } | |
135 | } | |
136 | for my $term (@exclude) { | |
137 | my @matches = get_term_matches($term->[0], $term->[1], $section); | |
138 | ++$delete{$_->[0]} for @matches; | |
139 | } | |
140 | ||
141 | delete @scores{keys %delete}; | |
142 | ||
143 | return () if !keys %scores; | |
144 | ||
145 | # make sure we match the other requirements | |
146 | my $sql = "select id from article where "; | |
147 | $sql .= "(".join(" or ", map "id = $_", keys %scores).")"; | |
148 | my $now = _sql_date(time); | |
149 | my $oneday = 24 * 3600; | |
150 | SWITCH: for ($date) { | |
151 | $_ eq 'ar' # been released | |
152 | && do { | |
153 | $sql .= " and $now between release and expire"; | |
154 | last SWITCH; | |
155 | }; | |
156 | /^r(\d+)$/ # released in last N days | |
157 | && do { | |
158 | $sql .= " and release > "._sql_date(time - $oneday * $1); | |
159 | last SWITCH; | |
160 | }; | |
161 | /^e(\d+)$/ # expired in last N days | |
162 | && do { | |
163 | $sql .= " and expire > " . _sql_date(time - $oneday * $1) | |
164 | ." and expire <= $now"; | |
165 | last SWITCH; | |
166 | }; | |
167 | /^m(\d+)$/ # modified in last N days | |
168 | && do { | |
169 | $sql .= " and lastModified > " . _sql_date(time - $oneday * $1); | |
170 | last SWITCH; | |
171 | }; | |
172 | $_ eq 'ae' | |
173 | && do { | |
174 | $sql .= " and expire < $now"; | |
175 | last SWITCH; | |
176 | }; | |
177 | } | |
178 | ||
d1ca9873 TC |
179 | $sql .= " order by title"; |
180 | ||
aec300cd TC |
181 | my $dh = BSE::DB->single; |
182 | ||
183 | my $sth = $dh->{dbh}->prepare($sql) | |
184 | or die "Error preparing $sql: ",$dh->{dbh}->errstr; | |
185 | ||
186 | $sth->execute() | |
187 | or die "Cannot execute $sql: ",$sth->errstr; | |
188 | ||
189 | my @ids; | |
190 | my $row; | |
191 | push(@ids, $row->[0]) while $row = $sth->fetchrow_arrayref; | |
192 | ||
193 | @ids = sort { $scores{$b} <=> $scores{$a} } @ids; | |
194 | ||
195 | @$terms = map $_->[0], @terms; | |
196 | ||
197 | return map [ $_, $scores{$_} ], @ids; | |
198 | } | |
199 | ||
200 | sub _sql_date { | |
201 | my ($time) = @_; | |
202 | use POSIX qw(strftime); | |
203 | ||
204 | strftime("'%Y-%m-%d %H:%M'", localtime $time); | |
205 | } | |
206 | ||
207 | 1; |