Skip to content

Commit

Permalink
Extending Text importer
Browse files Browse the repository at this point in the history
Added option `split` and modified import format to use field `match`.
  • Loading branch information
nichtich committed Jun 19, 2015
1 parent e70a5a3 commit ffdedcd
Show file tree
Hide file tree
Showing 3 changed files with 67 additions and 36 deletions.
1 change: 1 addition & 0 deletions Changes
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
Revision history for Catmandu

{{$NEXT}}
- new Text importer to import data similar to sed and awk
- support multiple namespaces and module description in Catmandu::Importer::Modules
- help command now shows importer/exporter options
- new utility functions pod_section and array_split
Expand Down
71 changes: 45 additions & 26 deletions lib/Catmandu/Importer/Text.pm
Original file line number Diff line number Diff line change
Expand Up @@ -8,36 +8,46 @@ with 'Catmandu::Importer';

has pattern => (
is => 'ro',
coerce => sub {
my ($p) = @_;
return $p if ref $p;
$p =~ /\n/m ? qr{$p}x : qr{$p};
coerce => sub {
$_[0] =~ /\n/m ? qr{$_[0]}x : qr{$_[0]}
},
default => sub { qr/^(?<text>.*$)/ },
);

has split => (
is => 'ro',
coerce => sub {
length $_[0] == 1 ? $_[0] : qr{$_[0]}
}
);

sub generator {
my ($self) = @_;
sub {
state $pattern = $self->pattern;
state $cnt = 0;
state $split = $self->split;
state $count = 0;
state $line;

while ( defined($line = $self->readline) ) {
chomp $line;
next if $line !~ $pattern;
next if $pattern and $line !~ $pattern;

my $data = { _id => ++$count };

if (scalar %+) { # named capturing groups
return { _id => ++$cnt , %+ };
} else { # numbered capturing groups
if (@+ < 2) { # no capturing groups
$data->{text} = $line;
} elsif (%+) { # named capturing groups
$data->{match} = { %+ };
} else { # numbered capturing groups
no strict 'refs';
my $data = {
_id => ++$cnt,
map { '_'.$_ => $$_ } grep { defined $$_ } 1..@+-1
};
$data->{text} = $line if keys %$data == 1;
return $data;
$data->{match} = [ map { $$_ } 1..@+-1 ];
}

if ($split) {
$data->{text} = [ split $split, $line ];
}

return $data;
}

return;
Expand Down Expand Up @@ -92,21 +102,27 @@ Binmode of the input stream C<fh>. Set to C<:utf8> by default.
An ARRAY of one or more fixes or file scripts to be applied to imported items.
=item split
Character or regular expression, given as string, to split each line. Imported
field C<text> will contain an array.
=item pattern
An regular expression to only import matching lines. If the pattern contains
capturing groups, only these groups are imported as field C<_1>, C<_2>, ...
(numbered capturing groups) or with named capturing groups. If at least one
named capturing group matches, all unnamed capturing groups are ignored. If
the pattern consists of multiple lines, whitespace is ignored for better
readability. For instance dates in C<YYYY-MM-DD> format can be imported with
one of the following patterns:
An regular expression, given as string, to only import matching lines.
Whitespaces in patterns are ignored or must be escaped if patterns consists of
multiple lines. If the pattern contains capturing groups, captured values are
imported in field C<match> instead of C<text>.
For instance dates in C<YYYY-MM-DD> format can be imported as named fields with
(?<year>\d\d\d\d)-(?<month>\d\d)-(?<day>\d\d)
(\d\d\d\d)- # year: _1
(\d\d)- # month: _2
(\d\d) # day: _3
or as array with
(\d\d\d\d)- # year
(\d\d)- # month
(\d\d) # day
=back
Expand All @@ -120,4 +136,7 @@ can only be read once.
L<Catmandu::Exporter::Text>
L<awk|https://en.wikipedia.org/wiki/AWK> and
L<sed|https://en.wikipedia.org/wiki/Sed>
=cut
31 changes: 21 additions & 10 deletions t/Catmandu-Importer-Text.t
Original file line number Diff line number Diff line change
Expand Up @@ -30,23 +30,34 @@ is_deeply text( pattern => 'are' ), [
], 'simple pattern match';

is_deeply text( pattern => '(\w+)(.).*\.$' ), [
{_id => 1 , _1 => "And", _2 => ' '},
], 'numbered capturing group';
{_id => 1 , match => ["And"," "]},
], 'numbered capturing groups';

is_deeply text( pattern => '^(?<first>\w+) (?<second>are).*\,$' ), [
{_id => 1 , first => "Roses", second => "are"},
{_id => 2 , first => "Violets", second => "are"},
], 'named capturing group';
my $items = [ {_id => 1 , match => {first => "Roses", second => "are"}},
{_id => 2 , match => {first => "Violets", second => "are"}} ];

is_deeply text( pattern => '^(?<first>\w+) (?<second>are).*\,$' ),
$items, 'named capturing groups';

my $pattern = <<'PAT';
^(?<first> \w+) # first word
\ # space
(?<second> are ) # second word = 'are'
PAT

is_deeply text( pattern => $pattern ), [
{_id => 1 , first => "Roses", second => "are"},
{_id => 2 , first => "Violets", second => "are"},
], 'more legible pattern';
is_deeply text( pattern => $pattern ),
$items, 'multiline pattern';

is_deeply [ map { $_->{text} } @{ text( split => ' ' ) } ],
[ map { [ split ' ', $_ ] } split "\n", $text ],
'split by character';

is_deeply [ map { $_->{text} } @{ text( split => 'is|are' ) } ],
[ map { [ split /is|are/, $_ ] } split "\n", $text ],
'split by regexp';

is_deeply text( split => ' is | are ', pattern => '^And so (.*)' ),
[ { _id => 1, text => ['And so','you.'], match => ['are you.'] } ],
'split and pattern';

done_testing;

0 comments on commit ffdedcd

Please sign in to comment.