#!perl use 5.010; use strict; use warnings; use Text::CSV_XS; use Text::Table::Tiny qw/ generate_table /; use Getopt::Long; binmode(STDOUT, ":encoding(UTF-8)"); my $SHOW_COUNT_THRESHOLD = 7; my $usage_string = "usage: $0 [-h] [-d ] [-i] \n"; my $case_insensitive = 0; my $match_column = ""; my $column_spec; my ($pattern, $filename) = process_command_line(); show_matching_rows($pattern, $filename); exit 0; sub show_matching_rows { my ($pattern, $filename) = @_; my $sep_char = ','; my $total_rows = 0; my @col_indices; my @rows; # TSV files have tab as separator instead of comma $sep_char = "\t" if $filename =~ /\.tsv$/; open(my $fh, '<:encoding(utf8)', $filename) || die "can't read $filename: $!\n"; my $parser = Text::CSV_XS->new({ sep_char => $sep_char }); my @headers = @{ $parser->getline($fh) }; if ($column_spec) { @col_indices = split(/,/, $column_spec); @headers = @headers[@col_indices]; } push(@rows, \@headers); while (<$fh>) { $total_rows++; next unless $match_column =~ (/^\d+$/) || ($case_insensitive && /$pattern/io) || (!$case_insensitive && /$pattern/o); my $status = $parser->parse($_); my @columns = $parser->fields; next if $match_column =~ (/^\d+$/) && ( ($case_insensitive && $columns[$match_column] !~ /$pattern/io) || (!$case_insensitive && $columns[$match_column] !~ /$pattern/o) ); if ($column_spec) { @columns = @columns[@col_indices]; } push(@rows, [@columns]); } die "No match found in $total_rows records\n" unless @rows > 1; print generate_table(rows => \@rows, header_row => 1), "\n"; my $displayed_rows = int(@rows) - 1; if ($displayed_rows > $SHOW_COUNT_THRESHOLD) { print "matched $displayed_rows of $total_rows lines\n"; } } sub process_command_line { my $dirpath; my $help = 0; GetOptions( 'ignore-case|i' => \$case_insensitive, 'help|h' => \$help, 'columns|c=s' => \$column_spec, 'directory|d=s' => \$dirpath, 'match-column|mc=i' => \$match_column, ) || die $usage_string; die $usage_string if ($help); die $usage_string unless @ARGV == 2 || $dirpath; my $pattern = shift @ARGV; my $filename = $dirpath ? find_newest_file($dirpath) : shift @ARGV; return ($pattern, $filename); } sub find_newest_file { my $dirpath = shift; opendir(my $DIR, $dirpath) || die "can't read directory $dirpath: $!\n"; my @files = map { $_->[0] } sort { $b->[1] <=> $a->[1] } map { [ $_, (stat("$dirpath/$_"))[9] ] } # mtime grep { /\.[ct]sv$/ } readdir($DIR); closedir($DIR); die "no .csv files found in $dirpath\n" unless @files > 0; return "$dirpath/$files[0]"; } __END__ =head1 NAME csvgrep - search for patterns in a CSV and display results in a table =head1 SYNOPSIS csvgrep csvgrep -d =head1 DESCRIPTION B is a script that lets you look for a pattern in a CSV file, and then displays the results in a text table. We assume that the first line in the CSV is a header row. The simplest usage is to look for a word in a CSV: % csvgrep Murakami books.csv +-------------------+-----------------+-------+------+ | Book | Author | Pages | Date | +-------------------+-----------------+-------+------+ | Norwegian Wood | Haruki Murakami | 400 | 1987 | | Men without Women | Haruki Murakami | 228 | 2017 | +-------------------+-----------------+-------+------+ As with regular grep, you can use the B<-i> switch to make it case-insensitive: % csvgrep -i wood books.csv +-----------------------+-----------------+-------+------+ | Book | Author | Pages | Date | +-----------------------+-----------------+-------+------+ | Norwegian Wood | Haruki Murakami | 400 | 1987 | | A Walk in the Woods | Bill Bryson | 276 | 1997 | | Death Walks the Woods | Cyril Hare | 222 | 1954 | +-----------------------+-----------------+-------+------+ You can specify a subset of the columns to display with the B<-c> option, which takes a comma-separated list of column numbers: % csvgrep -c 0,1,3 -i mary books.csv +--------------+--------------+------+ | Book | Author | Date | +--------------+--------------+------+ | Mary Poppins | PL Travers | 1934 | | Frankenstein | Mary Shelley | 1818 | +--------------+--------------+------+ By default the pattern will be matched against the whole line, but you can use B<--match-column> or B<-mc> to specify that the pattern should only be matched against a specific column: % csvgrep -mc 0 -c 0,1,3 -i mary books.csv +--------------+--------------+------+ | Book | Author | Date | +--------------+--------------+------+ | Mary Poppins | PL Travers | 1934 | +--------------+--------------+------+ The number of the match column refers to the numbering of the full set of columns, regardless of whether you've used the B<-c> option. This means you can match against a column that you're not displaying. The pattern can be a Perl regexp, but you'll probably need to quote it from your shell: % csvgrep -i 'walk.*wood' books.csv +-----------------------+-------------+-------+------+ | Book | Author | Pages | Date | +-----------------------+-------------+-------+------+ | A Walk in the Woods | Bill Bryson | 276 | 1997 | | Death Walks the Woods | Cyril Hare | 222 | 1954 | +-----------------------+-------------+-------+------+ At work we have a number of situations where we have a directory that contains multiple versions of a particular CSV file, for example with a feed from a customer. With the B<-d> option, csvgrep will look at the most recent file in the specified directory, only considering files with a .csv extension: % csvgrep -d /usr/local/feeds/users -i smith I have various aliases defined, like this: alias tg="csvgrep -d .../file.csv -c 0,1,2 -i" So then I can just run: tg smith This is a script I've used internally, with features being added as I wanted them. Let me know if you've ideas for additional features, or send me a pull request. =head2 Tab-Separated Values TSV files are pretty common; they use a tab character instead of a comma. If the filename ends with C<.tsv> rather than C<.csv>, we'll set the field separator to be a tab character: % csvgrep -i norwegian ~/books.tsv +----------------+-----------------+-------+------+ | Book | Author | Pages | Date | +----------------+-----------------+-------+------+ | Norwegian Wood | Haruki Murakami | 400 | 1987 | +----------------+-----------------+-------+------+ This also applies to the B<-d> option. =head1 REPOSITORY L =head1 AUTHOR Neil Bowers Eneilb@cpan.orgE =head1 COPYRIGHT AND LICENSE This software is copyright (c) 2017 by Neil Bowers . This is free software; you can redistribute it and/or modify it under the same terms as the Perl 5 programming language system itself.