#!perl
use 5.010;
use strict;
use warnings;
use Text::CSV_XS;
use Text::Table::Tiny qw/ generate_table /;
use Getopt::Long;
binmode(STDOUT, ":encoding(UTF-8)");
my $SHOW_COUNT_THRESHOLD = 7;
my $usage_string = "usage: $0 [-h] [-d
] [-i] \n";
my $case_insensitive = 0;
my $match_column = "";
my $column_spec;
my ($pattern, $filename) = process_command_line();
show_matching_rows($pattern, $filename);
exit 0;
sub show_matching_rows
{
my ($pattern, $filename) = @_;
my $sep_char = ',';
my $total_rows = 0;
my @col_indices;
my @rows;
# TSV files have tab as separator instead of comma
$sep_char = "\t" if $filename =~ /\.tsv$/;
open(my $fh, '<:encoding(utf8)', $filename)
|| die "can't read $filename: $!\n";
my $parser = Text::CSV_XS->new({ sep_char => $sep_char });
my @headers = @{ $parser->getline($fh) };
if ($column_spec) {
@col_indices = split(/,/, $column_spec);
@headers = @headers[@col_indices];
}
push(@rows, \@headers);
while (<$fh>) {
$total_rows++;
next unless $match_column =~ (/^\d+$/)
|| ($case_insensitive && /$pattern/io)
|| (!$case_insensitive && /$pattern/o);
my $status = $parser->parse($_);
my @columns = $parser->fields;
next if $match_column =~ (/^\d+$/)
&& ( ($case_insensitive && $columns[$match_column] !~ /$pattern/io)
|| (!$case_insensitive && $columns[$match_column] !~ /$pattern/o)
);
if ($column_spec) {
@columns = @columns[@col_indices];
}
push(@rows, [@columns]);
}
die "No match found in $total_rows records\n" unless @rows > 1;
print generate_table(rows => \@rows, header_row => 1), "\n";
my $displayed_rows = int(@rows) - 1;
if ($displayed_rows > $SHOW_COUNT_THRESHOLD) {
print "matched $displayed_rows of $total_rows lines\n";
}
}
sub process_command_line
{
my $dirpath;
my $help = 0;
GetOptions(
'ignore-case|i' => \$case_insensitive,
'help|h' => \$help,
'columns|c=s' => \$column_spec,
'directory|d=s' => \$dirpath,
'match-column|mc=i' => \$match_column,
) || die $usage_string;
die $usage_string if ($help);
die $usage_string unless @ARGV == 2
|| $dirpath;
my $pattern = shift @ARGV;
my $filename = $dirpath
? find_newest_file($dirpath)
: shift @ARGV;
return ($pattern, $filename);
}
sub find_newest_file
{
my $dirpath = shift;
opendir(my $DIR, $dirpath)
|| die "can't read directory $dirpath: $!\n";
my @files = map { $_->[0] }
sort { $b->[1] <=> $a->[1] }
map { [ $_, (stat("$dirpath/$_"))[9] ] } # mtime
grep { /\.[ct]sv$/ } readdir($DIR);
closedir($DIR);
die "no .csv files found in $dirpath\n" unless @files > 0;
return "$dirpath/$files[0]";
}
__END__
=head1 NAME
csvgrep - search for patterns in a CSV and display results in a table
=head1 SYNOPSIS
csvgrep
csvgrep -d
=head1 DESCRIPTION
B is a script that lets you look for a pattern in a CSV file,
and then displays the results in a text table.
We assume that the first line in the CSV is a header row.
The simplest usage is to look for a word in a CSV:
% csvgrep Murakami books.csv
+-------------------+-----------------+-------+------+
| Book | Author | Pages | Date |
+-------------------+-----------------+-------+------+
| Norwegian Wood | Haruki Murakami | 400 | 1987 |
| Men without Women | Haruki Murakami | 228 | 2017 |
+-------------------+-----------------+-------+------+
As with regular grep, you can use the B<-i> switch to make it
case-insensitive:
% csvgrep -i wood books.csv
+-----------------------+-----------------+-------+------+
| Book | Author | Pages | Date |
+-----------------------+-----------------+-------+------+
| Norwegian Wood | Haruki Murakami | 400 | 1987 |
| A Walk in the Woods | Bill Bryson | 276 | 1997 |
| Death Walks the Woods | Cyril Hare | 222 | 1954 |
+-----------------------+-----------------+-------+------+
You can specify a subset of the columns to display with the B<-c> option,
which takes a comma-separated list of column numbers:
% csvgrep -c 0,1,3 -i mary books.csv
+--------------+--------------+------+
| Book | Author | Date |
+--------------+--------------+------+
| Mary Poppins | PL Travers | 1934 |
| Frankenstein | Mary Shelley | 1818 |
+--------------+--------------+------+
By default the pattern will be matched against the whole line,
but you can use B<--match-column> or B<-mc> to specify that the pattern
should only be matched against a specific column:
% csvgrep -mc 0 -c 0,1,3 -i mary books.csv
+--------------+--------------+------+
| Book | Author | Date |
+--------------+--------------+------+
| Mary Poppins | PL Travers | 1934 |
+--------------+--------------+------+
The number of the match column refers to the numbering of the full set
of columns, regardless of whether you've used the B<-c> option.
This means you can match against a column that you're not displaying.
The pattern can be a Perl regexp,
but you'll probably need to quote it from your shell:
% csvgrep -i 'walk.*wood' books.csv
+-----------------------+-------------+-------+------+
| Book | Author | Pages | Date |
+-----------------------+-------------+-------+------+
| A Walk in the Woods | Bill Bryson | 276 | 1997 |
| Death Walks the Woods | Cyril Hare | 222 | 1954 |
+-----------------------+-------------+-------+------+
At work we have a number of situations where we have a directory that contains multiple versions
of a particular CSV file, for example with a feed from a customer.
With the B<-d> option, csvgrep will look at the most recent file in the specified directory,
only considering files with a .csv extension:
% csvgrep -d /usr/local/feeds/users -i smith
I have various aliases defined, like this:
alias tg="csvgrep -d .../file.csv -c 0,1,2 -i"
So then I can just run:
tg smith
This is a script I've used internally,
with features being added as I wanted them.
Let me know if you've ideas for additional features, or send me a pull request.
=head2 Tab-Separated Values
TSV files are pretty common; they use a tab character instead of a comma.
If the filename ends with C<.tsv> rather than C<.csv>,
we'll set the field separator to be a tab character:
% csvgrep -i norwegian ~/books.tsv
+----------------+-----------------+-------+------+
| Book | Author | Pages | Date |
+----------------+-----------------+-------+------+
| Norwegian Wood | Haruki Murakami | 400 | 1987 |
+----------------+-----------------+-------+------+
This also applies to the B<-d> option.
=head1 REPOSITORY
L
=head1 AUTHOR
Neil Bowers Eneilb@cpan.orgE
=head1 COPYRIGHT AND LICENSE
This software is copyright (c) 2017 by Neil Bowers .
This is free software; you can redistribute it and/or modify it under
the same terms as the Perl 5 programming language system itself.