1 package BSE::Util::Fetcher;
7 use BSE::Util::HTML "escape_uri";
10 our $VERSION = "1.002";
12 my $json_types = qq!\\A(?:application/json|text/x-json(?: encoding=(?:"utf-8"|utf-8)?))\\z!;
15 my ($class, %opts) = @_;
17 if ($opts{articles}) {
18 $opts{harticles} = +{ map { $_ => 1 } @{$opts{articles}} };
20 $opts{report} ||= sub { print "@_\n" };
30 my $cfg = $self->{cfg};
31 my $section = $self->{section};
32 my $verbose = $self->{verbose};
33 my $report = $self->{report};
35 unless ($cfg->entry("basic", "access_control", 0)) {
36 $self->crit(undef, undef, undef,
37 "Access control must be enabled for fetch processing");
41 my %entries = $cfg->entries($section);
42 my @data_keys = grep /^data/i, keys %entries;
45 for my $key (@data_keys) {
46 (my $suffix = $key) =~ s/^data//i;
48 my $data_name = $cfg->entryErr($section, $key);
50 unless ($data_name =~ /^([a-zA-Z0-9_-]+)$/) {
51 $self->crit(undef, undef, undef,
52 "Invalid metadata name '$data_name' for [$section].$key");
55 my $url_name = $cfg->entry($section, "url$suffix", "${data_name}_url");
56 unless ($url_name =~ /^([a-zA-Z0-9_-]+)$/) {
57 $self->crit(undef, undef, undef,
58 "Invalid metadata url '$url_name' for [$section].url$suffix");
61 my $url_pattern = $cfg->entry($section, "url_pattern$suffix", '$s');
62 unless ($url_pattern =~ /\$s/) {
63 $self->crit(undef, undef, undef,
64 "Invalid url pattern '$url_pattern' for [$section].url_pattern$suffix");
67 my $url_escape = $cfg->entry($section, "url_escape$suffix", 0);
68 my $types = $cfg->entry($section, "types$suffix", $json_types);
70 unless (eval { $types_re = qr/$types/; 1 }) {
71 $self->crit(undef, undef, undef,
72 "Cannot compile regexp /$types/ for [$section].types$suffix: $@");
75 my $validate = $cfg->entry($section, "validate$suffix", "json");
76 unless ($validate =~ /\A(?:json|none)\z/i) {
77 $self->crit(undef, undef, undef,
78 "Invalid validate '$validate' value for [$section].validate$suffix");
81 my $max_length = $cfg->entry($section, "max_length$suffix", 1_000_000);
82 unless ($max_length =~ /\A[1-9][0-9]+\z/) {
83 $self->crit(undef, undef, undef,
84 "Invalid max_length '$max_length' value for [$section].max_length$suffix");
87 my $on_fail = $cfg->entry($section, "on_fail$suffix", "delete");
88 unless ($on_fail =~ /\A(delete|keep)\z/i) {
89 $self->crit(undef, undef, undef,
90 "Invalid on_fail '$on_fail' value for [$section].on_fail$suffix");
93 my $on_success = $cfg->entry($section, "on_success$suffix", "");
94 unless ($on_success =~ /\A(?:|(?&KEY)(?:,(?&KEY))*)\z
98 $self->crit(undef, undef, undef,
99 "Invalid on_success '$on_success' value for [$section].on_success$suffix");
102 $bad_cfg and next KEY;
106 data_name => $data_name,
107 url_name => $url_name,
108 url_pattern => $url_pattern,
109 url_escape => $url_escape,
111 validate => $validate,
112 max_length => $max_length,
114 on_success => $on_success,
117 my $ua = LWP::UserAgent->new;
119 # look for articles with the url metadata defined
120 my @meta = BSE::TB::Article->all_meta_by_name($url_name);
122 for my $meta (@meta) {
125 if ($self->{harticles} && !$self->{harticles}{$meta->file_id}) {
128 my ($article) = BSE::TB::Articles->getByPkey($meta->file_id)
134 article => $article->id,
137 unless ($meta->is_text_type) {
138 $self->fail($article, $data_name, $on_fail,
139 "Metadata $url_name for article " . $meta->file_id . " isn't text");
143 my $url_part = $meta->value_text;
144 $url_part =~ /\S/ or next META;
145 $url_escape and $url_part = escape_uri($url_part);
146 (my $url = $url_pattern) =~ s/\$s/$url_part/;
148 unless ($url =~ /\A(?:https?|ftp):/) {
149 $self->fail($article, $data_name, $on_fail, "$url isn't http, https or ftp",
154 $report->("$data_name: fetching $url") if $verbose;
155 $base_dump{url} = $url;
156 my $resp = $ua->get($url);
157 unless ($resp->is_success) {
158 print " fetch failed: ", $resp->status_line, "\n" if $verbose;
159 $self->fail($article, $data_name, $on_fail,
160 "Error fetching $url: " . $resp->status_line,
163 status => scalar $resp->status_line,
168 # we don't want character set decoding, just raw content after
170 my $content = $resp->content;
171 unless (length($content) <= $max_length) {
172 $report->(" response too long") if $verbose;
173 $self->fail($article, $data_name, $on_fail,
174 "Content is ".length($content)." which is larger than $max_length",
177 length => length($content),
181 unless ($resp->content_type =~ $types_re) {
182 $report->(" Invalid content type", $resp->content_type) if $verbose;
183 $self->fail($article, $data_name, $on_fail,
184 "Content type '".$resp->content_type()."' doesn't match the types regexp",
187 content_type => $resp->content_type,
191 if ($validate eq 'json') {
192 my $json = JSON->new;
193 unless (eval { $json->decode($content); 1 }) {
194 $report->(" Failed JSON validation") if $verbose;
195 $self->fail($article, $data_name, $on_fail,
196 "Content failed JSON validation", \%base_dump);
202 my $data = $article->meta_by_name($data_name);
204 $data->set_content_type($resp->content_type);
205 $data->set_value($content);
209 $data = $article->add_meta
212 content_type => scalar $resp->content_type,
217 $report->(" Saved") if $verbose;
218 if ($on_success =~ /\blog\b/i) {
219 BSE::TB::AuditLog->log
221 component => "fetcher::run",
224 msg => "Successfully saved '$data_name' for article '".$article->id."'",
233 return !@{$self->{errors}};
243 my ($article, $data_name, $on_fail) = @_;
244 $self->_log("error", @_);
246 if ($article && $on_fail eq "delete" && $self->{save}) {
247 $article->delete_meta_by_name($data_name);
253 $self->_log("crit", @_);
257 my ($self, $level, $article, $data_name, $on_fail, $message, $dump) = @_;
259 push @{$self->{errors}}, [ $level, $message ];
261 BSE::TB::AuditLog->log
263 component => "fetcher::run",