public release
[git-grep-footer.git] / git-grep-footer
index bc71345250a9190b0e9297b4b1e0572dfc6df5a5..97fbca7748b374a295e1758f43887ce6f14de8ef 100755 (executable)
-#!/usr/bin/perl
+#!/usr/bin/env perl
 use 5.010;
 use strict;
 use warnings;
 use open ':std', OUT => ':utf8';
 use Encode 'decode';
 use Data::Dump 'pp';
-use Getopt::Long;
+use Getopt::Long qw(:config bundling);
+
+our $VERSION = '1.00';
 
 GetOptions(\my %opt,
        'debug!',
+       '',  # stdin
+       'count|c!',
        'simplify|s:s',
-       'unique|u!',
        'ignore-case|i!',
-) or die;
+       'fuzzy!',
+       'grep|S=s',
+       'min|min-count|unique|u:i',
+       'max|max-count|show|n:i',
+       'hash|H!',
+       'version|V'  => sub { Getopt::Long::VersionMessage() },
+       'usage|h'    => sub { Getopt::Long::HelpMessage() },
+       'help|man|?' => sub { Getopt::Long::HelpMessage(-verbose => 2) },
+) or exit 129;
+
+my $inputstream = $opt{''} ? \*ARGV : eval {
+       require Git;
+       Git::command_output_pipe('log', '-z', '--pretty=format:%h%n%b', @ARGV);
+} || die "Automatic git log failed: $@";
 
 local $| = 1;
 local $/ = "\0";
 
-my $HEADERMATCH = qr/ [a-z]+ (?: (?:-\w+)+ | \ by ) /ix;
+my $HEADERMATCH = qr/ [a-z]+ (?: (?:-\w+)+ | \ by ) | cc | reference /imsx;
+
+my (%headercount, @headercache);
 
-while (readline) {
-       s/(.+)\n//m;
-       my $hash = $1;
+while (readline $inputstream) {
+       s/^ ([0-9a-f]{4,40}) \n//msx;
+       my $hash = $opt{hash} ? $1 : undef;
 
        # strip commit seperator
        chomp;
        # skip expensive checks without potential identifier
        m/:/ or next;
        # try to parse as UTF-8
-       eval { $_ = decode(utf8   => $_, Encode::FB_CROAK()) };
+       eval { $_ = decode(utf8   => $_, Encode::FB_CROAK()); return 1 }
        # if invalid, assume it's latin1
-              $_ = decode(cp1252 => $_) if $@;
-
-       my $prefix = 0;
-       my %attr;
+           or $_ = decode(cp1252 => $_);
 
        BLOCK:
        for (reverse split /\n\n/) {
                my @headers;
+               my $prefix = 0;
 
                LINE:
                for (split /\n/) {
-                       next if not /\S/;
+                       next if not m/\S/;
                        my @header = m{
                                ^
                                (?<key> $HEADERMATCH)
                                : \s*
-                               (?<val> \S .+)
+                               (?<val> \S [^\n]+)
                                $
-                       }imx or do {
+                       }imsx or do {
                                $prefix++;
                                next LINE;
                        };
 
-                       given ($opt{simplify} // 'no') {
-                               when ('strict') {
+                       push @header, $_ if defined $opt{max};
+
+                       if ($opt{fuzzy}) {
+                               for ($header[0]) {
+                                       tr/ _/-/;
+
+                                       state $BY = qr{ (?: -? b[yu] )? \Z }imsx;
+                                       s{\A si (?:ge?n|n?g) (?:e?[dt])? -? (?:of+)? $BY}{Signed-off-by}imsx;
+                                       s{\A ack (?:ed|de)?  $BY}{Acked-by}imsx;
+                                       s{\A review (?:e?d)? $BY}{Reviewed-by}imsx;
+                                       s{\A teste[dt]       $BY}{Tested-by}imsx;
+                               }
+                       }
+
+                       if (defined $opt{grep}) {
+                               $_ ~~ qr/$opt{grep}/im or next LINE;
+                       }
+
+                       given ($opt{simplify} // 'none') {
+                               when (['email', 'authors']) {
                                        $header[1] =~ s{
                                                \A
-                                               (?: [^:]+ )?
+                                               (?: [^:;]+ )?
                                                < [^@>]+ (?: @ | \h?\W? at \W?\h? ) [a-z0-9.-]+ >
                                                \Z
                                        }{<...>}imsx;
                                }
-                               when (['text', '']) {
-                                       when ($header[0] =~ /[ _-] (?: by | to ) $/imsx) {
-                                               pop @header;
+                               when (['var', 'vars', '']) {
+                                       when ($header[0] =~ m/[ _-] (?: by | to ) $ | ^cc$/imsx) {
+                                               $header[1] = undef;
                                        }
                                        for ($header[1]) {
                                                s{\b (https?)://\S+ }{[$1]}gmsx;  # url
                                                s{(?: < | \A ) [^@>\s]+ @ [^>]+ (?: > | \Z )}{<...>}igmsx;  # address
                                                s{\b [0-9]+ \b}{[num]}gmsx;  # number
-                                               s{\b I? [0-9a-f]{40} \b}{[sha1]}gmsx;  # hash
+                                               s{\b [Ig]? [0-9a-f]{  40} \b}{[sha1]}gmsx;  # hash
+                                               s{\b [Ig]? [0-9a-f]{6,40} \b}{[hash]}gmsx;  # abbrev
                                        }
                                }
-                               when (['all', 'any']) {
-                                       pop @header;
+                               when (['all', 'contents']) {
+                                       $header[1] = undef;
                                }
-                               when ('no') {
+                               when (['none', 'no', '0']) {
                                }
                                default {
                                        die "Unknown simplify option: '$_'\n";
@@ -84,26 +119,162 @@ while (readline) {
                        }
 
                        if ($opt{'ignore-case'}) {
-                               $_ = lc for @header;
+                               $_ = lc for $header[0], $header[1] // ();
                        }
 
+                       pop @header if not defined $header[-1];
+
                        push @headers, \@header;
                }
 
                next BLOCK if not @headers;
 
                if ($opt{debug} and $prefix) {
-                       say "infix junk in commit $hash";
+                       say sprintf ': invalid lines in %s (%s)', $hash // 'block', $prefix;
                }
 
                for (@headers) {
-                       if ($opt{unique}) {
-                               state $seen;
-                               next if $seen->{ $_->[0] }->{ $_->[1] // '' }++;
+                       my $line = $_->[2] // join(': ', @$_);
+                       $line =~ s/\A/$hash /msx if defined $hash;
+
+                       if (defined $opt{min} or $opt{max} or $opt{count}) {
+                               my $counter = \$headercount{ $_->[0] }->{ $_->[1] // '' };
+                               my $excess = ${$counter}++ - ($opt{min} // 0);
+                               next if $excess >= ($opt{max} || 1);
+                               next if $excess <  0;
+                               if ($opt{count}) {
+                                       push @headercache, [ $line, $excess ? \undef : $counter ];
+                                       next;
+                               }
                        }
-                       say join ': ', @$_;
+                       say $line;
                }
 
                last BLOCK;
        }
 }
+
+for (@headercache) {
+       say ${$_->[1]} // '', "\t", $_->[0];
+}
+
+__END__
+
+=head1 NAME
+
+git-grep-footer - Find custom header lines in commit messages
+
+=head1 SYNOPSIS
+
+F<git-grep-footer> [OPTIONS] [-- <git log options>]
+
+F<git> log -z --pretty=format:%b | F<git-grep-footer> [OPTIONS] -
+
+=head1 DESCRIPTION
+
+Extracts header sections near the end of a commit body,
+a common convention to list custom metadata such as
+C<Signed-off-by> and C<Acked-by>.
+
+A section is identified by at least one line starting with a keyword
+that contains a dash (or is a recognised exception such as C<Cc>),
+followed by a colon.
+
+=head1 OPTIONS
+
+=over
+
+=item -i, --ignore-case
+
+Lowercases everything.
+
+=item -s, --simplify[=<rule>]
+
+Modifies values to hide specific details.
+Several different rules are supported:
+
+=over
+
+=item I<var> (default)
+
+Replaces highly variable contents such as numbers, hashes, and addresses,
+leaving only exceptional annotations as distinct text.
+Attributes ending in I<-to> or I<-by> are assumed variable author names
+and omitted entirely,
+unless they contain a colon indicating possible attribute exceptions.
+
+=item I<email>
+
+Filters out author lines following the git signoff convention,
+i.e. an <email address> optionally preceded by a name.
+
+=item I<all>
+
+Values will be hidden entirely, so only attribute names remain.
+
+=back
+
+=item --grep=<pattern>
+
+Only include lines matching the specified regular expression.
+Case insensitivity can be disabled by prepending C<(?-i)>.
+
+=item -u, --unique[=<threshold>]
+
+Each match is only shown once,
+optionally after it has already occurred a given number of times.
+
+=item -n, --show[=<limit>]
+
+The original line is given for each match,
+but simplifications still apply for duplicate determination.
+Additional samples are optionally given up to the given maximum.
+
+=item -c, --count
+
+Prefixes (unique) lines with the number of occurrences.
+Causes output to be buffered until all input has been read (obviously).
+
+=item -H, --hash
+
+Prefixes the SHA1 hash of the (or a) matching commit.
+
+=back
+
+=head1 EXAMPLES
+
+=over
+
+=item git-grep-footer --grep=^ack v2.6.32..v2.6.33
+
+Search for I<Acked-by> lines for version I<v2.6.33>.
+Append C<-uin> to skip reoccurrences.
+
+=item git-grep-footer -u --grep=junio
+
+Show distinct lines mentioning a specific author.
+
+=item git-grep-footer -c --simplify --grep=^si
+
+Compare various capitalisations and (mis)spellings of signoffs.
+
+=item git-grep-footer -c --simplify=all -i | sort -n -r | head -n10
+
+List the ten most frequently used attribute names.
+
+=item git-grep-footer -n2 -i -s --hash -- --reverse
+
+The earliest two usages of each distinct identifier.
+
+=back
+
+=head1 AUTHOR
+
+Mischa POSLAWSKY <perl@shiar.org>
+
+=head1 LICENSE
+
+This software is free software;
+you can redistribute and/or modify it under the terms of the GNU GPL
+version 2 or later.
+