-#!/bin/sh
-git log --pretty=%b%x00 "$@" |
-perl -n0 -wMstrict -E '
+#!/usr/bin/perl
+use 5.010;
+use strict;
+use warnings;
+use open ':std', OUT => ':utf8';
+use Encode 'decode';
+use Data::Dump 'pp';
+use Getopt::Long qw(:config bundling);
+
+GetOptions(\my %opt, # parsed switches are stored in %opt under their first name
+ 'debug!', # report blocks that contain non-header lines next to headers
+ 'count|c!', # prefix lines by their number of occurrences (printed after all input)
+ 'simplify|s:s', # optional rule name; an empty value selects the var rule
+ 'ignore-case|i!', # lowercase keys and values
+ 'fuzzy!', # rewrite common misspellings of well-known keys
+ 'grep|S=s', # regular expression a line must match (case-insensitive)
+ 'min|min-count|unique|u:i', # number of occurrences to skip before showing a match
+ 'max|max-count|show|n:i', # keep the original line; maximum samples per distinct match
+ 'version|V' => sub { Getopt::Long::VersionMessage() },
+ 'usage|h' => sub { Getopt::Long::HelpMessage() },
+ 'help|man|?' => sub { Getopt::Long::HelpMessage(-verbose => 2) },
+) or exit 129; # option parsing failed
+
+local $| = 1; # flush output after every write
+local $/ = "\0"; # input records are NUL-terminated (see --pretty=%b%x00 in the synopsis)
+
+my $HEADERMATCH = qr/ [a-z]+ (?: (?:-\w+)+ | \ by ) | cc | reference /ix; # accepted keys: dashed words, "<word> by", "cc", "reference"
+
+my (%headercount, @headercache); # occurrences per key and value; lines held back for --count
+
+# Main loop: every NUL-terminated input record is one commit body.
+# The paragraphs of a body are scanned from the end; the first paragraph
+# (seen from the bottom) that contains at least one recognised header line
+# is reported, after which the next record is read.
+while (readline) {
+	# An optional line consisting solely of a (possibly abbreviated)
+	# hexadecimal id is removed from the body and prefixed to output lines.
+	# Declared unconditionally: a 'my' that is only executed when a
+	# condition holds has undefined behaviour according to perlsyn.
+	my $hash;
+	$hash = $1 if s/^([0-9a-f]{4,40})\n//m;
+
+	# strip commit separator
+	chomp;
+	# skip expensive checks without potential identifier
+	m/:/ or next;
+	# try to parse as UTF-8
+	eval { $_ = decode(utf8 => $_, Encode::FB_CROAK()) };
+	# if invalid, assume it's latin1
+	$_ = decode(cp1252 => $_) if $@;
+
+	BLOCK:
for (reverse split /\n\n/) {
- my @headers = grep m{
- ^ (?: [a-z]+ (?: (?:-\w+)+ | \ by ) ) : \s* \S
- }imx, split /\n/ or next;
- say for @headers;
- last;
+		my @headers;      # [key, value?, original line?] per accepted line
+		my $prefix = 0;   # number of non-empty lines that are not headers
+
+		LINE:
+		for (split /\n/) {
+			next if not /\S/;
+			# A header is "key: value"; the value needs at least one
+			# non-space character (single-character values included).
+			my @header = m{
+				^
+				(?<key> $HEADERMATCH)
+				: \s*
+				(?<val> \S .*)
+				$
+			}imx or do {
+				$prefix++;
+				next LINE;
+			};
+
+			# with --show, keep the unmodified line as third element
+			push @header, $_ if defined $opt{max};
+
+			if ($opt{fuzzy}) {
+				for ($header[0]) {
+					tr/ _/-/;
+
+					state $BY = qr{ (?: -? b[yu] )? \Z }ix;
+					s{^ si (?:ge?n|n?g) (?:e?[dt])? -? (?:of+)? $BY}{Signed-off-by}ix;
+					s{^ ack (?:ed|de)? $BY}{Acked-by}ix;
+					s{^ review (?:e?d)? $BY}{Reviewed-by}ix;
+					s{^ teste[dt] $BY}{Tested-by}ix;
+				}
+			}
+
+			if (defined $opt{grep}) {
+				# plain regex match on the original line instead of the
+				# experimental smartmatch operator
+				$_ =~ qr/$opt{grep}/i or next LINE;
+			}
+
+			# Apply the requested simplification rule to the value.
+			# Written as an if/elsif chain of string comparisons rather
+			# than given/when, which warns on many perl versions.
+			my $simplify = $opt{simplify} // 'none';
+			if ($simplify eq 'email' or $simplify eq 'authors') {
+				$header[1] =~ s{
+					\A
+					(?: [^:;]+ )?
+					< [^@>]+ (?: @ | \h?\W? at \W?\h? ) [a-z0-9.-]+ >
+					\Z
+				}{<...>}imsx;
+			}
+			elsif ($simplify eq 'var' or $simplify eq 'vars' or $simplify eq '') {
+				if ($header[0] =~ /[ _-] (?: by | to ) $ | ^cc$/imsx) {
+					# keys ending in by/to and cc: drop the value entirely
+					$header[1] = undef;
+				}
+				else {
+					for ($header[1]) {
+						s{\b (https?)://\S+ }{[$1]}gmsx; # url
+						s{(?: < | \A ) [^@>\s]+ @ [^>]+ (?: > | \Z )}{<...>}igmsx; # address
+						s{\b [0-9]+ \b}{[num]}gmsx; # number
+						# no blank inside the quantifier braces: perl before
+						# 5.34 does not treat '{ 40}' as a quantifier
+						s{\b [Ig]? [0-9a-f]{40} \b}{[sha1]}gmsx; # hash
+						s{\b [Ig]? [0-9a-f]{6,40} \b}{[hash]}gmsx; # abbrev
+					}
+				}
+			}
+			elsif ($simplify eq 'all' or $simplify eq 'contents') {
+				$header[1] = undef;
+			}
+			elsif ($simplify eq 'none' or $simplify eq 'no' or $simplify eq '0') {
+				# values are kept as they are
+			}
+			else {
+				die "Unknown simplify option: '$simplify'\n";
+			}
+
+			if ($opt{'ignore-case'}) {
+				$_ = lc for $header[0], $header[1] // ();
+			}
+
+			# a removed value leaves only the key, unless the original
+			# line was appended after it
+			pop @header if not defined $header[-1];
+
+			push @headers, \@header;
+		}
+
+		next BLOCK if not @headers;
+
+		if ($opt{debug} and $prefix) {
+			say sprintf ': invalid lines in %s (%s)', $hash // 'block', $prefix;
+		}
+
+		for (@headers) {
+			my $line = $_->[2] // join(': ', @$_);
+			$line =~ s/^/$hash / if defined $hash;
+
+			if (defined $opt{min} or $opt{max} or $opt{count}) {
+				# $excess is how many times this key/value pair has been
+				# seen beyond the --min threshold; a line is shown while
+				# 0 <= $excess < max (max defaults to 1).
+				my $counter = \$headercount{ $_->[0] }->{ $_->[1] // '' };
+				my $excess = $$counter++ - ($opt{min} // 0);
+				next if $excess >= ($opt{max} || 1);
+				next if $excess < 0;
+				if ($opt{count}) {
+					# defer output; only the first shown sample refers to
+					# the running counter, later samples get no number
+					push @headercache, [ $line, $excess ? \undef : $counter ];
+					next;
+				}
+			}
+			say $line;
+		}
+
+		last BLOCK;
}
-'
+}
+
+# Print the lines held back for --count: the tally the entry refers to
+# (empty when it refers to an undefined value), a tab, then the line.
+for my $entry (@headercache) {
+	my ($line, $tally) = @$entry;
+	my $shown = $$tally // '';
+	say $shown, "\t", $line;
+}
+
+__END__
+
+=head1 NAME
+
+git-grep-footer - Find custom header lines in commit messages
+
+=head1 SYNOPSIS
+
+F<git> log --pretty=%b%x00 | F<git-grep-footer> [OPTIONS]
+
+=head1 DESCRIPTION
+
+Filters out header sections near the end of a commit body,
+a common convention to list custom metadata such as
+C<Signed-off-by> and C<Acked-by>.
+
+Sections are identified by at least one line starting with a keyword
+containing a dash (or one of a few recognised exceptions such as C<Cc>),
+followed by a colon and a value.
+
+=head1 OPTIONS
+
+=over
+
+=item -i, --ignore-case
+
+Lowercases everything.
+
+=item -s, --simplify[=<rule>]
+
+Modifies values to hide specific details.
+Several different rules are supported:
+
+=over
+
+=item I<var> (default)
+
+Replaces highly variable contents such as numbers, hashes, and addresses,
+leaving only exceptional annotations as distinct text.
+Attributes ending in I<-to> or I<-by> are assumed variable author names
+and omitted entirely,
+unless they contain a colon indicating possible attribute exceptions.
+
+=item I<email>
+
+Filters out author lines following the git signoff convention,
+i.e. an <email address> optionally preceded by a name.
+
+=item I<all>
+
+Values will be hidden entirely, so only attribute names remain.
+
+=back
+
+=item --grep=<pattern>
+
+Only include lines matching the specified regular expression.
+Case insensitivity can be disabled by prepending C<(?-i)>.
+
+=item -u, --unique[=<threshold>]
+
+Each match is only shown once,
+optionally after it has already occurred a given number of times.
+
+=item -n, --show[=<limit>]
+
+The original line is given for each match,
+but simplifications still apply for duplicate determination.
+Additional samples are optionally given up to the given maximum.
+
+=item -c, --count
+
+Prefixes (unique) lines by the number of occurrences.
+Causes output to be buffered until all input has been read (obviously).
+
+=back
+
+=head1 EXAMPLES
+
+=over
+
+=item git-grep-footer --grep=^ack
+
+Search for Acked-by lines.
+Append C<-uin> to skip reoccurrences.
+
+=item git-grep-footer -u --grep=junio
+
+Show distinct lines mentioning a specific author.
+
+=item git-grep-footer -c --simplify --grep=^si
+
+Compare various capitalisations and (mis)spellings of signoffs.
+
+=item git-grep-footer -c --simplify=all -i | sort -n -r | head -n10
+
+List the ten most frequently used attribute names.
+
+=item git-grep-footer -n2 -i -s
+
+Up to two examples for each distinct identifier.
+
+=back
+
+=head1 AUTHOR
+
+Mischa POSLAWSKY <perl@shiar.org>
+
+=head1 LICENSE
+
+This software is free software;
+you can redistribute and/or modify it under the terms of the GNU GPL
+version 2 or later.
+