site/cgi-bin/modules/BSE/Util/ContentType.pm
site/cgi-bin/modules/BSE/Util/DynamicTags.pm
site/cgi-bin/modules/BSE/Util/DynSort.pm
+site/cgi-bin/modules/BSE/Util/Fetcher.pm
site/cgi-bin/modules/BSE/Util/Format.pm
site/cgi-bin/modules/BSE/Util/HTML.pm
site/cgi-bin/modules/BSE/Util/Iterate.pm
site/templates/user/userpage_base.tmpl
site/templates/xbase.tmpl
site/util/bse_back.pl
+site/util/bse_fetch.pl
site/util/bse_imageclean.pl
site/util/bse_makeindex.pl
site/util/bse_mkgitversion.pl
t/130-importer/010-csv.t
t/130-importer/020-article.t
t/130-importer/030-product.t
+t/140-fetch/010-fetcher.t
t/900-kwalitee/010-strict-warn.t
t/900-kwalitee/020-checktemplates.t
t/900-kwalitee/030-messages.t
use strict;
use Carp 'confess';
-our $VERSION = "1.003";
+our $VERSION = "1.005";
=head1 NAME
return $result;
}
+=item meta_json_by_name
+
+Retrieve metadata with a specific name and decode it as JSON,
+returning a data structure.
+
+Returns nothing if there is no metadata of that name, or if the
+content type isn't a JSON content type or if the metadata cannot be
+decoded as JSON.
+
+=cut
+
+sub meta_json_by_name {
+ my ($self, $name) = @_;
+
+ my $meta = $self->meta_by_name($name)
+ or return;
+
+ return $meta->retrieve_json;
+}
+
=item metanames
Returns the names of each metadatum defined for the file.
}
+=item all_meta_by_name
+
+Retrieves all metadata for this owner type with the given name.
+
+=cut
+
+sub all_meta_by_name {
+ my ($class, $name) = @_;
+
+ require BSE::TB::Metadata;
+ return BSE::TB::Metadata->getBy
+ (
+ owner_type => $class->meta_owner_type,
+ name => $name,
+ );
+}
+
=back
=head1 RESTRICTED METHODS
sub delete_meta_by_name {
my ($self, $name) = @_;
-print STDERR "Delete ", $self->id, ",", $name, ",", $self->meta_owner_type, ")\n";
BSE::DB->run(bseDeleteArticleFileMetaByName => $self->id, $name, $self->meta_owner_type);
}
use strict;
use base 'Squirrel::Row';
-our $VERSION = "1.002";
+our $VERSION = "1.004";
sub table {
"bse_article_file_meta";
1;
}
+sub retrieve_json {
+ my ($self) = @_;
+
+ return unless $self->content_type
+ =~ m!\A(?:application/json|text/x-json(?: encoding=(?:"utf-8"|utf-8))?)\z!;
+
+ require JSON;
+ my $json = JSON->new->utf8;
+ my $data;
+ return unless eval { $data = $json->decode($self->value); 1 };
+ return $data;
+}
+
1;
--- /dev/null
+package BSE::Util::Fetcher;
+use strict;
+use warnings;
+use BSE::TB::Articles;
+use BSE::TB::AuditLog;
+use LWP::UserAgent;
+use BSE::Util::HTML "escape_uri";
+use JSON ();
+
+our $VERSION = "1.002";
+
+my $json_types = qq!\\A(?:application/json|text/x-json(?: encoding=(?:"utf-8"|utf-8)?))\\z!;
+
+sub new {
+ my ($class, %opts) = @_;
+
+ if ($opts{articles}) {
+ $opts{harticles} = +{ map { $_ => 1 } @{$opts{articles}} };
+ }
+ $opts{report} ||= sub { print "@_\n" };
+
+ bless \%opts, $class;
+}
+
+sub run {
+ my ($self) = @_;
+
+ $self->{errors} = [];
+
+ my $cfg = $self->{cfg};
+ my $section = $self->{section};
+ my $verbose = $self->{verbose};
+ my $report = $self->{report};
+
+ unless ($cfg->entry("basic", "access_control", 0)) {
+ $self->crit(undef, undef, undef,
+ "Access control must be enabled for fetch processing");
+ return;
+ }
+
+ my %entries = $cfg->entries($section);
+ my @data_keys = grep /^data/i, keys %entries;
+
+ KEY:
+ for my $key (@data_keys) {
+ (my $suffix = $key) =~ s/^data//i;
+
+ my $data_name = $cfg->entryErr($section, $key);
+ my $bad_cfg = 0;
+ unless ($data_name =~ /^([a-zA-Z0-9_-]+)$/) {
+ $self->crit(undef, undef, undef,
+ "Invalid metadata name '$data_name' for [$section].$key");
+ ++$bad_cfg;
+ }
+ my $url_name = $cfg->entry($section, "url$suffix", "${data_name}_url");
+ unless ($url_name =~ /^([a-zA-Z0-9_-]+)$/) {
+ $self->crit(undef, undef, undef,
+ "Invalid metadata url '$url_name' for [$section].url$suffix");
+ ++$bad_cfg;
+ }
+ my $url_pattern = $cfg->entry($section, "url_pattern$suffix", '$s');
+ unless ($url_pattern =~ /\$s/) {
+ $self->crit(undef, undef, undef,
+ "Invalid url pattern '$url_pattern' for [$section].url_pattern$suffix");
+ ++$bad_cfg;
+ }
+ my $url_escape = $cfg->entry($section, "url_escape$suffix", 0);
+ my $types = $cfg->entry($section, "types$suffix", $json_types);
+ my $types_re;
+ unless (eval { $types_re = qr/$types/; 1 }) {
+ $self->crit(undef, undef, undef,
+ "Cannot compile regexp /$types/ for [$section].types$suffix: $@");
+ ++$bad_cfg;
+ }
+ my $validate = $cfg->entry($section, "validate$suffix", "json");
+ unless ($validate =~ /\A(?:json|none)\z/i) {
+ $self->crit(undef, undef, undef,
+ "Invalid validate '$validate' value for [$section].validate$suffix");
+ ++$bad_cfg;
+ }
+ my $max_length = $cfg->entry($section, "max_length$suffix", 1_000_000);
+ unless ($max_length =~ /\A[1-9][0-9]+\z/) {
+ $self->crit(undef, undef, undef,
+ "Invalid max_length '$max_length' value for [$section].max_length$suffix");
+ ++$bad_cfg;
+ }
+ my $on_fail = $cfg->entry($section, "on_fail$suffix", "delete");
+ unless ($on_fail =~ /\A(delete|keep)\z/i) {
+ $self->crit(undef, undef, undef,
+ "Invalid on_fail '$on_fail' value for [$section].on_fail$suffix");
+ ++$bad_cfg;
+ }
+ my $on_success = $cfg->entry($section, "on_success$suffix", "");
+ unless ($on_success =~ /\A(?:|(?&KEY)(?:,(?&KEY))*)\z
+ (?(DEFINE)
+ (?<KEY>log)
+ )/xi) {
+ $self->crit(undef, undef, undef,
+ "Invalid on_success '$on_success' value for [$section].on_success$suffix");
+ ++$bad_cfg;
+ }
+ $bad_cfg and next KEY;
+
+ my %cfg_dump =
+ (
+ data_name => $data_name,
+ url_name => $url_name,
+ url_pattern => $url_pattern,
+ url_escape => $url_escape,
+ types => $types,
+ validate => $validate,
+ max_length => $max_length,
+ on_fail => $on_fail,
+ on_success => $on_success,
+ );
+
+ my $ua = LWP::UserAgent->new;
+
+ # look for articles with the url metadata defined
+ my @meta = BSE::TB::Article->all_meta_by_name($url_name);
+ META:
+ for my $meta (@meta) {
+ length $meta->value
+ or next;
+ if ($self->{harticles} && !$self->{harticles}{$meta->file_id}) {
+ next META;
+ }
+ my ($article) = BSE::TB::Articles->getByPkey($meta->file_id)
+ or next META;
+
+ my %base_dump =
+ (
+ %cfg_dump,
+ article => $article->id,
+ );
+
+ unless ($meta->is_text_type) {
+ $self->fail($article, $data_name, $on_fail,
+ "Metadata $url_name for article " . $meta->file_id . " isn't text");
+ next META;
+ }
+
+ my $url_part = $meta->value_text;
+ $url_part =~ /\S/ or next META;
+ $url_escape and $url_part = escape_uri($url_part);
+ (my $url = $url_pattern) =~ s/\$s/$url_part/;
+
+ unless ($url =~ /\A(?:https?|ftp):/) {
+ $self->fail($article, $data_name, $on_fail, "$url isn't http, https or ftp",
+ \%base_dump);
+ next META;
+ }
+
+ $report->("$data_name: fetching $url") if $verbose;
+ $base_dump{url} = $url;
+ my $resp = $ua->get($url);
+ unless ($resp->is_success) {
+ print " fetch failed: ", $resp->status_line, "\n" if $verbose;
+ $self->fail($article, $data_name, $on_fail,
+ "Error fetching $url: " . $resp->status_line,
+ +{
+ %base_dump,
+ status => scalar $resp->status_line,
+ });
+ next META;
+ }
+ $resp->decode;
+ # we don't want character set decoding, just raw content after
+ # decompression
+ my $content = $resp->content;
+ unless (length($content) <= $max_length) {
+ $report->(" response too long") if $verbose;
+ $self->fail($article, $data_name, $on_fail,
+ "Content is ".length($content)." which is larger than $max_length",
+ +{
+ %base_dump,
+ length => length($content),
+ });
+ next META;
+ }
+ unless ($resp->content_type =~ $types_re) {
+ $report->(" Invalid content type", $resp->content_type) if $verbose;
+ $self->fail($article, $data_name, $on_fail,
+ "Content type '".$resp->content_type()."' doesn't match the types regexp",
+ +{
+ %base_dump,
+ content_type => $resp->content_type,
+ });
+ next META;
+ }
+ if ($validate eq 'json') {
+ my $json = JSON->new;
+ unless (eval { $json->decode($content); 1 }) {
+ $report->(" Failed JSON validation") if $verbose;
+ $self->fail($article, $data_name, $on_fail,
+ "Content failed JSON validation", \%base_dump);
+ next META;
+ }
+ }
+
+ if ($self->{save}) {
+ my $data = $article->meta_by_name($data_name);
+ if ($data) {
+ $data->set_content_type($resp->content_type);
+ $data->set_value($content);
+ $data->save;
+ }
+ else {
+ $data = $article->add_meta
+ (
+ name => $data_name,
+ content_type => scalar $resp->content_type,
+ value => $content,
+ appdata => 1,
+ );
+ }
+ $report->(" Saved") if $verbose;
+ if ($on_success =~ /\blog\b/i) {
+ BSE::TB::AuditLog->log
+ (
+ component => "fetcher::run",
+ level => "info",
+ actor => "S",
+ msg => "Successfully saved '$data_name' for article '".$article->id."'",
+ object => $article,
+ dump => \%base_dump,
+ );
+ }
+ }
+ }
+ }
+
+ return !@{$self->{errors}};
+}
+
+sub errors {
+ my $self = shift;
+ $self->{errors};
+}
+
+sub fail {
+ my $self = shift;
+ my ($article, $data_name, $on_fail) = @_;
+ $self->_log("error", @_);
+
+ if ($article && $on_fail eq "delete" && $self->{save}) {
+ $article->delete_meta_by_name($data_name);
+ }
+}
+
+sub crit {
+ my $self = shift;
+ $self->_log("crit", @_);
+}
+
+sub _log {
+ my ($self, $level, $article, $data_name, $on_fail, $message, $dump) = @_;
+
+ push @{$self->{errors}}, [ $level, $message ];
+ if ($self->{log}) {
+ BSE::TB::AuditLog->log
+ (
+ component => "fetcher::run",
+ level => $level,
+ actor => "S",
+ msg => $message,
+ object => $article,
+ dump => $dump,
+ );
+ }
+}
use BSE::TB::Site;
use BSE::Util::HTML;
-our $VERSION = "1.021";
+our $VERSION = "1.022";
sub _base_variables {
my ($self, %opts) = @_;
require JSON;
return JSON->new->allow_nonref->encode($_[0]);
},
+ decode_json => sub {
+ require JSON;
+ my $json = JSON->new->utf8;
+ return eval { $json->decode($_[0]) };
+ },
report_data => \&_report_data,
);
}
Return C<data> as JSON. This will fail for perl objects.
+=item bse.decode_json(data)
+
+Decode JSON into a data structure. This requires binary data.
+
=item dumper(value)
Dump the value in perl syntax using L<Data::Dumper>.
long_desc: <<TEXT
Rebuild the search index.
TEXT
+
+id: bse_fetch
+description: Fetch metadata defined content for articles
+binname: perl util/bse_fetch.pl
+bin_opts: -v
+long_desc: <<TEXT
+<p>Fetch content defined by the <code>[automatic data]</code> section
+of the configuration file.</p>
+
+<p>If you don't have anything defined there then this task isn't
+useful.</p>
+TEXT
\ No newline at end of file
=back
+=head2 [automatic data]
+
+This section defines article metadata to be used in fetching content
+from external URLs.
+
+The metadata definition for the URL field must be separately defined
+in C<[global article metadata]>.
+
+See L<bse_fetch.pl> for more details.
+
+=over
+
+=item *
+
+C<< dataI<suffix> >> - defines the metadata entry to store the
+retrived content in. This is the only required configuration,
+
+=item *
+
+C<< urlI<suffix> >> - defines the metadata entry to retrieve the URL
+or URL part from. Defaults to the value of C<< dataI<suffix> >>
+followed by C<_url>.
+
+=item *
+
+C<< url_patternI<suffix> >> - defines a simple URL template. Any
+C<$s> in this string is replaced with the value retrieved from the
+metadata field defined by C<< urlI<suffix> >>. Default: C<$s>.
+
+=item *
+
+C<< url_patternI<suffix> >> - set to true to URL escape the value
+retrieved from the metadata field defined by C<< urlI<suffix> >>.
+Default: 0 (false)
+
+=item *
+
+C<< url_patternI<suffix> >> - set to a perl regular expression to
+validate the content type of the data fetched. Default: a regular
+expression matching JSON content.
+
+=item *
+
+C<< validateI<suffix> >> - how to validate the fetched content.
+Currently the only possible values are C<none>, which does no
+validation, and C<json> which validates the content as JSON. Default:
+C<json>.
+
+=item *
+
+C<< max_lengthI<suffix> >> - the maximum length in bytes of the
+retrieved content. Default: 1000000.
+
+=item *
+
+C<< on_failI<suffix> >> - how to treat the currently stored content if
+the fetch fails. Possible values are C<delete> which deletes the
+content metadata, or C<keep> which doesn't. Default: C<delete>.
+
+=item *
+
+C<< on_successI<suffix> >> - actions to take on successful fetch,
+which currently only has two possible values, either C<log> to log a
+success message to the audit log, or an empty string to not do so.
+Default: an empty string.
+
+=back
+
=head1 AUTHOR
Tony Cook <tony@develop-help.com>
--- /dev/null
+#!perl -w
+use strict;
+use warnings;
+use FindBin;
+use lib "$FindBin::Bin/../cgi-bin/modules";
+use Getopt::Long;
+use BSE::API qw(bse_init bse_cfg);
+use BSE::Util::Fetcher;
+
+our $VERSION = "1.000";
+
+bse_init("../cgi-bin");
+
+my $verbose;
+my $nosave;
+my $nolog;
+my $section = "automatic data";
+GetOptions(
+ "v:i" => \$verbose,
+ "nosave|n" => \$nosave,
+ "nolog" => \$nolog,
+ "section|s=s" => \$section,
+ );
+if (defined $verbose && $verbose eq '') {
+ $verbose = 1;
+}
+
+my $cfg = bse_cfg();
+
+my @extra;
+if (@ARGV) {
+ @extra = ( articles => [ @ARGV ] );
+}
+
+my $o = BSE::Util::Fetcher->new
+ (
+ cfg => $cfg,
+ verbose => $verbose,
+ save => !$nosave,
+ log => !$nolog,
+ section => $section,
+ @extra,
+ );
+
+$o->run();
+
+my $errors = $o->errors;
+print STDERR "$_->[0]: $_->[1]\n" for @$errors;
+
+exit 1 if @$errors;
+
+=head1 NAME
+
+bse_fetch.pl - fetch data based on article metadata and store as article metadata
+
+=head1 SYNOPSIS
+
+ perl bse_fetch.pl
+
+=head1 DESCRIPTION
+
+The C<bse_fetch.pl> tool, based on configuration stored in the
+C<[automatic data]> section of the configuration file and on the
+article metadata that describes, retrieves data from remote sources
+and stores it in article metadata.
+
+Since this mechanism accesses external sites it will only function if
+access control is enabled.
+
+At the simplest configuring this requires setting one key in
+C<[automatic data]>:
+
+ [automatic data]
+ data_example=example
+
+and defining a field for the URL in C<[global article metadata]>:
+
+ [global article metadata]
+ example_url=
+
+ [article metadata example_url]
+ title=Example URL
+ type=string
+ width=60
+
+If an article has this metadata set, typically via the article editor,
+a run of C<bse_fetch.pl> will attempt to fetch the URL defined by that
+metadata.
+
+The value of the URL metadata must contain at least one non-blank
+character or it will be silently skipped.
+
+You can set a C<< url_patternI<suffix> >> to allow the supplied value
+to be subtituted into a full url, so for example:
+
+ [automatic data]
+ data_example=example
+ url_pattern_example=http://example.com/location/$s/events
+ url_escape_example=1
+
+ [global article metadata]
+ example_url=
+
+ [article metadata example_url]
+ title=Events location ID
+ type=string
+ width=10
+
+The C<< url_escapeI<suffix> >> key allows the value from the URL field
+to be URL escaped. If this field is a full URL or URL fragment you
+typically don't want this, but if it's some sort of text to be
+subtituted into a URL it's recommended.
+
+The final URLs must have one of C<http:>, C<https:> or C<ftp:> scheme.
+C<file:> URLs are not permitted.
+
+By default the content retrived must have a JSON content type and must
+validate as JSON, you can control this with the C<< validateI<suffix>
+>> and C<< typesI<suffix> >> keys. The first specifies a regular
+expression used to validate the returned content type, while the
+second can be set to C<none> to disable validation of the content
+itself.
+
+ [automatic downloads]
+ data_example=example
+ ; accept anything
+ types_example=.
+ validate_example=none
+
+To prevent storage of excessively large content, by default C<<
+max_lengthI<suffix> >> to 1000000, which you can set lower or higher
+as needed. There is no mechanism to support unlimited sizes.
+
+By default, if a fetch of the data for a particular article fails, any
+existing stored metadata for that definition is deleted from the
+article. You can prevent that by setting C<< on_failI<suffix> >> to
+C<keep>.
+
+If a fetch fails, an error is reported in the audit log.
+
+Success however is silent by default, you can configure success
+producing an C<info> audit log message by setting C<<
+on_successI<suffix> >> to C<log>:
+
+ [automatic downloads]
+ data_example=example
+ on_success_example=log
+
+=head AUTHOR
+
+Tony Cook <tony@develop-help.com>
+
+=cut
--- /dev/null
+#!perl -w
+use strict;
+use BSE::Test ();
+use BSE::Cfg;
+use BSE::API qw(bse_init bse_cfg);
+use BSE::Util::Fetcher;
+
+use Test::More;
+
+my $base_cgi = File::Spec->catdir(BSE::Test::base_dir(), "cgi-bin");
+BSE::API::bse_init($base_cgi);
+my $cfg = bse_cfg();
+
+my $art = BSE::API::bse_make_article
+ (
+ cfg => $cfg,
+ title => "010-fetcher.t",
+ );
+
+my $html = $cfg->entryVar("paths", "public_html");
+
+my $base_cfg = <<EOS;
+[basic]
+access_control=1
+
+[paths]
+public_html=$html
+
+EOS
+
+note "article ". $art->id;
+
+SKIP:
+{
+ my $wcfg = BSE::Cfg->new_from_text(text => <<EOS);
+$base_cfg
+
+[automatic data]
+data_test=test
+EOS
+
+ my $url_meta = $art->add_meta
+ (
+ value => "http://test.develop-help.com/test.json",
+ name => "test_url",
+ content_type => "text/plain",
+ );
+ ok($url_meta, "add url metadata");
+
+ my $f = BSE::Util::Fetcher->new
+ (
+ cfg => $wcfg,
+ save => 1,
+ log => 0,
+ section => "automatic data",
+ articles => [ $art->id ],
+ );
+ ok($f->run(), "do the fetch")
+ or do { diag "@$_" for @{$f->errors}; skip "No data", 1 };
+
+ my $meta = $art->meta_by_name("test");
+ ok($meta, "data stored in meta")
+ or skip "no data stored", 1;
+ like($meta->value, qr/\A\[\s+5\s+\]\s+\z/, "check content")
+ or skip "wrong data stored", 1;
+ is($meta->content_type, "application/json",
+ "check content type");
+ my $data = $meta->retrieve_json;
+ ok($data, "decoded json")
+ or skip "No decoded data to look at", 1;
+ is($data->[0], 5, "check stored data");
+
+ $url_meta->remove;
+ $meta->remove;
+
+ $url_meta = $art->add_meta
+ (
+ value => "http://test.develop-help.com/test-not.json",
+ name => "test_url",
+ content_type => "text/plain",
+ );
+ ok($url_meta, "add invalid json metadata url");
+ ok(!$f->run(), "do the fetch");
+ $meta = $art->meta_by_name("test");
+ ok(!$meta, "should be no data");
+ my @msgs = map $_->[1], @{$f->{errors}};
+ ok(grep(/^Content failed JSON validation/, @msgs),
+ "check json validation failed");
+}
+
+SKIP:
+{
+ my $badcfg = BSE::Cfg->new_from_text(text => <<EOS);
+$base_cfg
+
+[automatic data]
+data_test=test*
+url_test=other*
+url_pattern_test=foo
+types_test=(
+validate_test=unknown
+max_length_test=x
+on_fail_test=foo
+on_success_test=x
+EOS
+
+ my $f = BSE::Util::Fetcher->new
+ (
+ cfg => $badcfg,
+ save => 1,
+ log => 0,
+ section => "automatic data",
+ articles => [ $art->id ],
+ );
+ ok(!$f->run(), "do the fetch (and fail)")
+ or skip "It didn't fail", 1;
+ my @errors = @{$f->errors};
+ my @msgs = map $_->[1], @errors;
+ ok(grep(/Invalid metadata name 'test\*'/, @msgs),
+ "Invalid name errored");
+ ok(grep(/Invalid metadata url 'other\*'/, @msgs),
+ "Invalid url errored");
+ ok(grep(/Invalid url pattern 'foo'/, @msgs),
+ "Invalid url_pattern errored");
+ ok(grep(/Cannot compile regexp \/\(\/ for/, @msgs),
+ "Invalid test re errored");
+ ok(grep(/Invalid validate 'unknown'/, @msgs),
+ "Invalid validate errored");
+ ok(grep(/Invalid max_length 'x'/, @msgs),
+ "Invalid max length errored");
+ ok(grep(/Invalid on_fail 'foo'/, @msgs),
+ "Invalid on_fail errored");
+ ok(grep(/Invalid on_success 'x'/, @msgs),
+ "Invalid on_success errored");
+}
+
+END {
+ $art->remove($cfg);
+}
+
+done_testing();