site/cgi-bin/admin/makeIndex.pl

   1 #!/usr/bin/perl -w
   2 use strict;
   3 use FindBin;
   4 use lib "$FindBin::Bin/../modules";
   5 use Articles;
   6 use Constants qw($BASEDIR $MAXPHRASE $DATADIR @SEARCH_EXCLUDE @SEARCH_INCLUDE $SEARCH_LEVEL);
   7 use BSE::DB;
   8 use Generate;
   9 use BSE::Cfg;
  10 use BSE::WebUtil 'refresh_to_admin';
  11 my $in_cgi = exists $ENV{REQUEST_METHOD};
  12 if ($in_cgi) {
  13   #eval "use CGI::Carp qw(fatalsToBrowser)";
  14 }
  15
  16 my $cfg = BSE::Cfg->new;
  17 my $urlbase = $cfg->entryVar('site', 'url');
  18
  19 my $stopwords = "$DATADIR/stopwords.txt";
  20
  21 # load the stop words
  22 open STOP, "< $stopwords"
  23   or die "Cannot open $stopwords: $!";
  24 chomp(my @stopwords = <STOP>);
  25 tr/\r//d for @stopwords; # just in case
  26 my %stopwords;
  27 @stopwords{@stopwords} = (1) x @stopwords;
  28 close STOP;
  29
  30 my $articles = Articles->new;
  31
  32 # scores depending on where the term appears
  33 # these may need some tuning
  34 # preferably, keep these to single digits
  35 my %scores =
  36   (
  37    title=>5,
  38    body=>3,
  39    keyword=>4,
  40    pageTitle=>5,
  41    author=>4,
  42    file_displayName => 2,
  43    file_description=>2,
  44    file_notes => 1,
  45   );
  46
  47 for my $name (keys %scores) {
  48   my $score = $cfg->entry('search index scores', $name);
  49   if (defined($score) && $score =~ /^\d+$/) {
  50     $scores{$name} = $score;
  51   }
  52 }
  53
  54 # if the level of the article is higher than this, store it's parentid
  55 # instead
  56 my $max_level = $SEARCH_LEVEL;
  57
  58 # key is phrase, value is hashref with $id -> $sectionid
  59 my %index;
  60 makeIndex($articles);
  61
  62 #use Data::Dumper;
  63 #print Dumper(\%index);
  64
  65 my $dh = BSE::DB->single;
  66 my $dropIndex = $dh->stmt('dropIndex')
  67   or die "No dropIndex member in BSE::DB";
  68 my $insertIndex = $dh->stmt('insertIndex')
  69   or die "No insertIndex member in BSE::DB";
  70
  71 $dropIndex->execute()
  72   or die "Could not drop search index ",$dropIndex->errstr;
  73
  74 for my $key (sort keys %index) {
  75   my $word = $index{$key};
  76   # sort by reverse score so that if we overflow the field we
  77   # get the highest scoring matches
  78   my @ids = sort { $word->{$b}[1] <=> $word->{$a}[1] } keys %$word;
  79   my @sections = map { $_->[0] } @$word{@ids};
  80   my @scores = map { $_->[1] } @$word{@ids};
  81   #my @sections = map { $_->[0] } values %{$index{$key}};
  82   #my @scores = map { $_->[1] } values %{$index{$key}};
  83
  84   $insertIndex->execute($key, "@ids", "@sections", "@scores")
  85     or die "Cannot insert into index: ", $insertIndex->errstr;
  86 }
  87
  88 if ($in_cgi) {
  89   refresh_to_admin($cfg, "/cgi-bin/admin/menu.pl");
  90 }
  91
  92 sub makeIndex {
  93   my $articles = shift;
  94   my %dont_search;
  95   my %do_search;
  96   @dont_search{@SEARCH_EXCLUDE} = @SEARCH_EXCLUDE;
  97   @do_search{@SEARCH_INCLUDE} = @SEARCH_INCLUDE;
  98   INDEX: until ($articles->EOF) {
  99     my @files;
 100     my $got_files;
 101     # find the section
 102     my $article = $articles->getNext;
 103     next unless ($article->{listed} || $article->{flags} =~ /I/);
 104     next if $article->{flags} =~ /[CN]/;
 105     my $section = $article;
 106     while ($section->{parentid} >= 1) {
 107       $section = $articles->getByPkey($section->{parentid});
 108       next INDEX if $section->{flags} =~ /C/;
 109     }
 110     my $id = $article->{id};
 111     my $indexas = $article->{level} > $max_level ? $article->{parentid} : $id;
 112     my $sectionid = $section->{id};
 113     eval "use $article->{generator}";
 114     $@ and die $@;
 115     my $gen = $article->{generator}->new(top=>$article, cfg=>$cfg);
 116     next unless $gen->visible($article) or $do_search{$sectionid};
 117
 118     next if $dont_search{$sectionid};
 119
 120     for my $field (sort { $scores{$b} <=> $scores{$a} } keys %scores) {
 121
 122       next unless $scores{$field};
 123       # strip out markup
 124       my $text;
 125       if (exists $article->{$field}) {
 126         $text = $article->{$field};
 127       }
 128       else {
 129         if ($field =~ /^file_(.*)/) {
 130           my $file_field = $1;
 131           @files = $article->files unless $got_files++;
 132           $text = join "\n", map $_->{$file_field}, @files;
 133         }
 134       }
 135       #next if $text =~ m!^\<html\>!i; # I don't know how to do this (yet)
 136       if ($field eq 'body') {
 137         $gen->remove_block($articles, [], \$text);
 138         $text =~ s/[abi]\[([^\]]+)\]/$1/g;
 139       }
 140
 141       next unless defined $text;
 142
 143       # for each paragraph
 144       for my $para (split /\n/, $text) {
 145         my @words = split /\W+/, $para;
 146         my @buffer;
 147
 148         for my $word (@words) {
 149           if ($stopwords{lc $word}) {
 150             process($indexas, $sectionid, $scores{$field}, @buffer) if @buffer;
 151             @buffer = ();
 152           }
 153           else {
 154             push(@buffer, $word);
 155           }
 156         }
 157         process($indexas, $sectionid, $scores{$field}, @buffer) if @buffer;
 158       }
 159     }
 160   }
 161 }
 162
 163 sub process {
 164   my ($id, $sectionid, $score, @words) = @_;
 165
 166   for (my $start = 0; $start < @words; ++$start) {
 167     my $end = $start + $MAXPHRASE-1;
 168     $end = $#words if $end > $#words;
 169
 170     for my $phrase (map { "@words[$start..$_]" } $start..$end) {
 171       if (!exists $index{lc $phrase}{$id}
 172           || $score > $index{lc $phrase}{$id}[1]) {
 173         $index{lc $phrase}{$id} = [ $sectionid, $score ];
 174       }
 175       if (!exists $index{$phrase}{$id}
 176           || $score > $index{$phrase}{$id}[1]) {
 177         $index{$phrase}{$id} = [ $sectionid, $score ];
 178       }
 179     }
 180   }
 181 }
 182