document count option (and imply unique)
[git-grep-footer.git] / git-grep-footer
index ff279a606bc2ad4cc8545b1e2e14f039b23550ce..b676ab8aef983e241ffb59921849d94075dd889d 100755 (executable)
@@ -9,8 +9,10 @@ use Getopt::Long qw(:config bundling);
 
 GetOptions(\my %opt,
        'debug!',
+       'count|c!',
        'simplify|s:s',
        'ignore-case|i!',
+       'fuzzy!',
        'min|min-count|unique|u:i',
        'max|max-count|show|n:i',
        'version|V'  => sub { Getopt::Long::VersionMessage() },
@@ -21,10 +23,12 @@ GetOptions(\my %opt,
 local $| = 1;
 local $/ = "\0";
 
-my $HEADERMATCH = qr/ [a-z]+ (?: (?:-\w+)+ | \ by ) /ix;
+my $HEADERMATCH = qr/ [a-z]+ (?: (?:-\w+)+ | \ by ) | cc | reference /ix;
+
+my (%headercount, @headercache);
 
 while (readline) {
-       s/(.+)\n//m;
+       s/^([0-9a-f]{4,40})\n//m and
        my $hash = $1;
 
        # strip commit separator
@@ -36,12 +40,12 @@ while (readline) {
        # if invalid, assume it's latin1
               $_ = decode(cp1252 => $_) if $@;
 
-       my $prefix = 0;
        my %attr;
 
        BLOCK:
        for (reverse split /\n\n/) {
                my @headers;
+               my $prefix = 0;
 
                LINE:
                for (split /\n/) {
@@ -59,6 +63,18 @@ while (readline) {
 
                        push @header, $_ if defined $opt{max};
 
+                       if ($opt{fuzzy}) {
+                               for ($header[0]) {
+                                       tr/ _/-/;
+
+                                       state $BY = qr{ (?: -? b[yu] )? \Z }ix;
+                                       s{^ si (?:ge?n|n?g) (?:e?[dt])? -? (?:of+)? $BY}{Signed-off-by}ix;
+                                       s{^ ack (?:ed|de)?  $BY}{Acked-by}ix;
+                                       s{^ review (?:e?d)? $BY}{Reviewed-by}ix;
+                                       s{^ teste[dt]       $BY}{Tested-by}ix;
+                               }
+                       }
+
                        given ($opt{simplify} // 'none') {
                                when (['email', 'authors']) {
                                        $header[1] =~ s{
@@ -69,14 +85,15 @@ while (readline) {
                                        }{<...>}imsx;
                                }
                                when (['var', 'vars', '']) {
-                                       when ($header[0] =~ /[ _-] (?: by | to ) $/imsx) {
+                                       when ($header[0] =~ /[ _-] (?: by | to ) $ | ^cc$/imsx) {
                                                $header[1] = undef;
                                        }
                                        for ($header[1]) {
                                                s{\b (https?)://\S+ }{[$1]}gmsx;  # url
                                                s{(?: < | \A ) [^@>\s]+ @ [^>]+ (?: > | \Z )}{<...>}igmsx;  # address
                                                s{\b [0-9]+ \b}{[num]}gmsx;  # number
-                                               s{\b I? [0-9a-f]{40} \b}{[sha1]}gmsx;  # hash
+                                               s{\b [Ig]? [0-9a-f]{  40} \b}{[sha1]}gmsx;  # hash
+                                               s{\b [Ig]? [0-9a-f]{6,40} \b}{[hash]}gmsx;  # abbrev
                                        }
                                }
                                when (['all', 'contents']) {
@@ -101,23 +118,34 @@ while (readline) {
                next BLOCK if not @headers;
 
                if ($opt{debug} and $prefix) {
-                       say "infix junk in commit $hash";
+                       say sprintf ': invalid lines in %s (%s)', $hash // 'block', $prefix;
                }
 
                for (@headers) {
-                       if (defined $opt{min} or $opt{max}) {
-                               state $seen;
-                               my $count = $seen->{ $_->[0] }->{ $_->[1] // '' }++;
-                               next if $count >= ($opt{min} // 0) + ($opt{max} || 1);
-                               next if $count < ($opt{min} // 0);
+                       my $line = $_->[2] // join(': ', @$_);
+                       $line =~ s/^/$hash / if defined $hash;
+
+                       if (defined $opt{min} or $opt{max} or $opt{count}) {
+                               my $counter = \$headercount{ $_->[0] }->{ $_->[1] // '' };
+                               my $excess = $$counter++ - ($opt{min} // 0);
+                               next if $excess >= ($opt{max} || 1);
+                               next if $excess <  0;
+                               if ($opt{count}) {
+                                       push @headercache, [ $line, $excess ? \undef : $counter ];
+                                       next;
+                               }
                        }
-                       say $_->[2] // join(': ', @$_);
+                       say $line;
                }
 
                last BLOCK;
        }
 }
 
+for (@headercache) {
+       say ${$_->[1]} // '', "\t", $_->[0];
+}
+
 __END__
 
 =head1 NAME
@@ -135,6 +163,7 @@ a common convention to list custom metadata such as
 C<Signed-off-by> and C<Acked-by>.
 
 Sections are identified by at least one leading keyword containing a dash
+(or one of a few specially recognised keywords, such as C<Cc> or C<Reference>)
 followed by a colon.
 
 =head1 OPTIONS
@@ -182,6 +211,11 @@ The original line is given for each match,
 but simplifications still apply for duplicate determination.
 Additional samples are optionally given up to the specified maximum.
 
+=item -c, --count
+
+Prefixes each (unique) line with its number of occurrences.
+Since the totals are only known at the end, output is buffered until all input has been read.
+
 =back
 
 =head1 AUTHOR