#!perl
use 5.010;
use strict;
use warnings;

use Text::CSV_XS;
use Text::Table::Tiny qw/ generate_table /;
use Getopt::Long;

binmode(STDOUT, ":encoding(UTF-8)");

my $SHOW_COUNT_THRESHOLD = 7;
my $usage_string         = "usage: $0 [-h] [-d <dir>] [-i] <pattern> <file>\n";
my $case_insensitive     = 0;
my $match_column         = "";
my $column_spec;


my ($pattern, $filename) = process_command_line();
show_matching_rows($pattern, $filename);
exit 0;


sub show_matching_rows
{
    my ($pattern, $filename) = @_;
    my $sep_char             = ',';
    my $total_rows           = 0;
    my @col_indices;
    my @rows;

    # TSV files have tab as separator instead of comma
    $sep_char = "\t" if $filename =~ /\.tsv$/;

    open(my $fh, '<:encoding(utf8)', $filename)
        || die "can't read $filename: $!\n";

    my $parser  = Text::CSV_XS->new({ sep_char => $sep_char });
    my @headers = @{ $parser->getline($fh) };

    if ($column_spec) {
        @col_indices = split(/,/, $column_spec);
        @headers    = @headers[@col_indices];
    }

    push(@rows, \@headers);

    while (<$fh>) {
        $total_rows++;
        next unless $match_column =~ (/^\d+$/)
                 || ($case_insensitive && /$pattern/io)
                 || (!$case_insensitive && /$pattern/o);
        my $status = $parser->parse($_);
        my @columns = $parser->fields;

        next if $match_column =~ (/^\d+$/)
             && (   ($case_insensitive  && $columns[$match_column] !~ /$pattern/io)
                 || (!$case_insensitive && $columns[$match_column] !~ /$pattern/o)
                );

        if ($column_spec) {
            @columns = @columns[@col_indices];
        }
        push(@rows, [@columns]);
    }

    die "No match found in $total_rows records\n" unless @rows > 1;

    print generate_table(rows => \@rows, header_row => 1), "\n";

    my $displayed_rows = int(@rows) - 1;
    if ($displayed_rows > $SHOW_COUNT_THRESHOLD) {
        print "matched $displayed_rows of $total_rows lines\n";
    }
}


sub process_command_line
{
    my $dirpath;
    my $help = 0;

    GetOptions(
        'ignore-case|i'     => \$case_insensitive,
        'help|h'            => \$help,
        'columns|c=s'       => \$column_spec,
        'directory|d=s'     => \$dirpath,
        'match-column|mc=i' => \$match_column,
    ) || die $usage_string;

    die $usage_string if ($help);
    die $usage_string unless @ARGV == 2
                          || $dirpath;

    my $pattern  = shift @ARGV;

    my $filename = $dirpath
                 ? find_newest_file($dirpath)
                 : shift @ARGV;

    return ($pattern, $filename);
}

sub find_newest_file
{
    my $dirpath = shift;

    opendir(my $DIR, $dirpath)
        || die "can't read directory $dirpath: $!\n";

    my @files = map { $_->[0] }
                sort { $b->[1] <=> $a->[1] }
                map { [ $_, (stat("$dirpath/$_"))[9] ] } # mtime
                grep { /\.[ct]sv$/ } readdir($DIR);

    closedir($DIR);

    die "no .csv files found in $dirpath\n" unless @files > 0;

    return "$dirpath/$files[0]";
}

__END__

=head1 NAME

csvgrep - search for patterns in a CSV and display results in a table

=head1 SYNOPSIS

 csvgrep <pattern> <file>
 csvgrep -d <directory> <pattern>

=head1 DESCRIPTION

B<csvgrep> is a script that lets you look for a pattern in a CSV file,
and then displays the results in a text table.
We assume that the first line in the CSV is a header row.

The simplest usage is to look for a word in a CSV:

 % csvgrep Murakami books.csv
 +-------------------+-----------------+-------+------+
 | Book              | Author          | Pages | Date |
 +-------------------+-----------------+-------+------+
 | Norwegian Wood    | Haruki Murakami | 400   | 1987 |
 | Men without Women | Haruki Murakami | 228   | 2017 |
 +-------------------+-----------------+-------+------+

As with regular grep, you can use the B<-i> switch to make it
case-insensitive:

 % csvgrep -i wood books.csv
 +-----------------------+-----------------+-------+------+
 | Book                  | Author          | Pages | Date |
 +-----------------------+-----------------+-------+------+
 | Norwegian Wood        | Haruki Murakami | 400   | 1987 |
 | A Walk in the Woods   | Bill Bryson     | 276   | 1997 |
 | Death Walks the Woods | Cyril Hare      | 222   | 1954 |
 +-----------------------+-----------------+-------+------+

You can specify a subset of the columns to display with the B<-c> option,
which takes a comma-separated list of column numbers:

 % csvgrep -c 0,1,3 -i mary books.csv
 +--------------+--------------+------+
 | Book         | Author       | Date |
 +--------------+--------------+------+
 | Mary Poppins | PL Travers   | 1934 |
 | Frankenstein | Mary Shelley | 1818 |
 +--------------+--------------+------+

By default the pattern will be matched against the whole line,
but you can use B<--match-column> or B<-mc> to specify that the pattern
should only be matched against a specific column:

 % csvgrep -mc 0 -c 0,1,3 -i mary books.csv
 +--------------+--------------+------+
 | Book         | Author       | Date |
 +--------------+--------------+------+
 | Mary Poppins | PL Travers   | 1934 |
 +--------------+--------------+------+

The number of the match column refers to the numbering of the full set
of columns, regardless of whether you've used the B<-c> option.
This means you can match against a column that you're not displaying.

The pattern can be a Perl regexp,
but you'll probably need to quote it from your shell:

 % csvgrep -i 'walk.*wood' books.csv
 +-----------------------+-------------+-------+------+
 | Book                  | Author      | Pages | Date |
 +-----------------------+-------------+-------+------+
 | A Walk in the Woods   | Bill Bryson | 276   | 1997 |
 | Death Walks the Woods | Cyril Hare  | 222   | 1954 |
 +-----------------------+-------------+-------+------+

At work we have a number of situations where we have a directory that contains multiple versions
of a particular CSV file, for example with a feed from a customer.
With the B<-d> option, csvgrep will look at the most recent file in the specified directory,
only considering files with a .csv extension:

 % csvgrep -d /usr/local/feeds/users -i smith

I have various aliases defined, like this:

 alias tg="csvgrep -d .../file.csv -c 0,1,2 -i"

So then I can just run:

 tg smith

This is a script I've used internally,
with features being added as I wanted them.
Let me know if you've ideas for additional features, or send me a pull request.

=head2 Tab-Separated Values

TSV files are pretty common; they use a tab character instead of a comma.
If the filename ends with C<.tsv> rather than C<.csv>,
we'll set the field separator to be a tab character:

 % csvgrep -i norwegian ~/books.tsv
 +----------------+-----------------+-------+------+
 | Book           | Author          | Pages | Date |
 +----------------+-----------------+-------+------+
 | Norwegian Wood | Haruki Murakami | 400   | 1987 |
 +----------------+-----------------+-------+------+

This also applies to the B<-d> option.

=head1 REPOSITORY

L<https://github.com/neilb/csvgrep>

=head1 AUTHOR

Neil Bowers E<lt>neilb@cpan.orgE<gt>

=head1 COPYRIGHT AND LICENSE

This software is copyright (c) 2017 by Neil Bowers <neilb@cpan.org>.

This is free software; you can redistribute it and/or modify it under
the same terms as the Perl 5 programming language system itself.