#!/usr/bin/env perl
use 5.010;
use strict;
use warnings;
use open ':std', OUT => ':utf8';
use Encode 'decode';
use Data::Dump 'pp';
use Getopt::Long qw(:config bundling);

our $VERSION = '1.00';

# Parsed command line switches, keyed by the first name of each spec below.
my %opt;

# Getopt::Long specifications; entries followed by a code reference are
# handled by that callback instead of being stored in %opt.
my @optspec = (
	'debug!',
	'',  # a lone dash, stored under the empty key (selects stdin input)
	'count|c!',
	'simplify|s:s',
	'ignore-case|i!',
	'fuzzy!',
	'grep|S=s',
	'min|min-count|unique|u:i',
	'max|max-count|show|n:i',
	'hash|H!',
	'version|V'  => sub { Getopt::Long::VersionMessage() },
	'usage|h'    => sub { Getopt::Long::HelpMessage() },
	'help|man|?' => sub { Getopt::Long::HelpMessage(-verbose => 2) },
);

# Exit with status 129 if the command line could not be parsed.
GetOptions(\%opt, @optspec) or exit 129;

# Source of commit records: the ARGV handle when a lone dash was given,
# otherwise a pipe from git log run with the remaining arguments.
my $inputstream;
if ($opt{''}) {
	$inputstream = \*ARGV;
}
else {
	$inputstream = eval {
		require Git;
		Git::command_output_pipe('log', '-z', '--pretty=format:%h%n%b', @ARGV);
	} || die "Automatic git log failed: $@";
}

# Records are NUL-terminated (git log -z); output is flushed after each write.
local $/ = "\0";
local $| = 1;

# Attribute names accepted as footer keys (matched case-insensitively).
my $HEADERMATCH = qr{
	  [a-z]+ (?: (?:-\w+)+ | \ by )  # word plus dashed parts, or plus " by"
	| cc
	| reference
}imsx;

# Occurrences seen so far, indexed by attribute name and then by value.
my %headercount;
# Lines held back for --count, each paired with a reference to its counter.
my @headercache;

# Compile the --grep pattern once instead of for every candidate line.
my $grep_re = defined $opt{grep} ? qr/$opt{grep}/im : undef;

# Map every accepted --simplify spelling onto its canonical rule up front,
# so an unknown value is reported at once rather than at the first footer.
my $simplify = do {
	my $rule = $opt{simplify} // 'none';
	my %rule_for = (
		(map { ($_ => 'email') } 'email', 'authors'),
		(map { ($_ => 'var')   } 'var', 'vars', ''),  # bare -s gives ''
		(map { ($_ => 'all')   } 'all', 'contents'),
		(map { ($_ => 'none')  } 'none', 'no', '0'),
	);
	$rule_for{$rule} // die "Unknown simplify option: '$rule'\n";
};

# Rewrite common misspellings of well-known attribute names (--fuzzy).
# Takes the attribute name and returns the normalised name.
sub fuzzy_key {
	my ($key) = @_;

	# optional "by" suffix, also accepting "bu"
	state $BY = qr{ (?: -? b[yu] )? \Z }imsx;

	$key =~ tr/ _/-/;
	$key =~ s{\A si (?:ge?n|n?g) (?:e?[dt])? -? (?:of+)? $BY}{Signed-off-by}imsx;
	$key =~ s{\A ack (?:ed|de)?  $BY}{Acked-by}imsx;
	$key =~ s{\A review (?:e?d)? $BY}{Reviewed-by}imsx;
	$key =~ s{\A teste[dt]       $BY}{Tested-by}imsx;

	return $key;
}

# Apply a canonical --simplify rule (email, var, all or none) to a value.
# Returns the reduced value, or undef if the value is to be dropped.
sub simplify_value {
	my ($rule, $key, $value) = @_;

	if ($rule eq 'email') {
		# collapse a bracketed address, optionally preceded by a name
		$value =~ s{
			\A
			(?: [^:;]+ )?
			< [^@>]+ (?: @ | \h?\W? at \W?\h? ) [a-z0-9.-]+ >
			\Z
		}{<...>}imsx;
		return $value;
	}

	if ($rule eq 'var') {
		# values of by/to/cc attributes are dropped entirely
		return if $key =~ m/[ _-] (?: by | to ) $ | ^cc$/imsx;

		for ($value) {
			s{\b (https?)://\S+ }{[$1]}gmsx;  # url
			s{(?: < | \A ) [^@>\s]+ @ [^>]+ (?: > | \Z )}{<...>}igmsx;  # address
			s{\b [0-9]+ \b}{[num]}gmsx;  # number
			# no blanks inside the braces: perls before 5.34 would not
			# treat a padded {  40} as a quantifier
			s{\b [Ig]? [0-9a-f]{40} \b}{[sha1]}gmsx;  # hash
			s{\b [Ig]? [0-9a-f]{6,40} \b}{[hash]}gmsx;  # abbrev
		}
		return $value;
	}

	return if $rule eq 'all';

	return $value;  # none
}

# Read one commit per record, locate the last paragraph containing footer
# lines, and print (or queue for --count) the lines passing all filters.
RECORD:
while (defined(my $record = readline $inputstream)) {
	# A leading line of hex digits is the commit hash.  Anchor at the very
	# start so hex-only body lines survive, and only trust $1 if it matched.
	my $hash;
	if ($record =~ s/\A ([0-9a-f]{4,40}) \n//msx and $opt{hash}) {
		$hash = $1;
	}

	# strip commit separator
	chomp $record;
	# skip expensive checks without potential identifier
	next RECORD if index($record, ':') < 0;

	# try to parse as UTF-8; if invalid, assume a single-byte encoding
	my $body = eval {
		decode(utf8 => $record, Encode::FB_CROAK() | Encode::LEAVE_SRC());
	} // decode(cp1252 => $record);

	# paragraphs are examined from the end of the message backwards
	BLOCK:
	for my $block (reverse split /\n\n/, $body) {
		my @headers;     # [ key, value or undef, original line or undef ]
		my $prefix = 0;  # number of lines not recognised as attributes

		LINE:
		for my $line (split /\n/, $block) {
			next LINE if $line !~ m/\S/;

			# values may be a single character long
			my ($key, $value) = $line =~ m{
				^
				(?<key> $HEADERMATCH)
				: \s*
				(?<val> \S [^\n]*)
				$
			}imsx or do {
				$prefix++;
				next LINE;
			};

			# --show prints the unmodified line instead of key and value
			my $original = defined $opt{max} ? $line : undef;

			$key = fuzzy_key($key) if $opt{fuzzy};

			# --grep applies to the line as found in the commit
			if (defined $grep_re) {
				$line =~ $grep_re or next LINE;
			}

			$value = simplify_value($simplify, $key, $value);

			if ($opt{'ignore-case'}) {
				$key   = lc $key;
				$value = lc $value if defined $value;
			}

			push @headers, [ $key, $value, $original ];
		}

		next BLOCK if not @headers;

		if ($opt{debug} and $prefix) {
			say sprintf ': invalid lines in %s (%s)', $hash // 'block', $prefix;
		}

		for my $header (@headers) {
			my ($key, $value, $original) = @{$header};

			my $line = $original // join(': ', $key, $value // ());
			$line = "$hash $line" if defined $hash;

			if (defined $opt{min} or $opt{max} or $opt{count}) {
				my $counter = \$headercount{$key}{ $value // '' };
				# occurrences beyond the --unique threshold, starting at 0
				my $excess = ${$counter}++ - ($opt{min} // 0);
				next if $excess >= ($opt{max} || 1);
				next if $excess <  0;
				if ($opt{count}) {
					# only the first sample of each line carries the total
					push @headercache, [ $line, $excess ? \undef : $counter ];
					next;
				}
			}
			say $line;
		}

		# only the last paragraph with attributes counts as the footer
		last BLOCK;
	}
}

# Emit the lines held back for --count, each prefixed by the value its
# counter reference points to (empty when that value is undefined).
for my $entry (@headercache) {
	my ($line, $count_ref) = @{$entry};
	say ${$count_ref} // '', "\t", $line;
}

__END__

=head1 NAME

git-grep-footer - Find custom header lines in commit messages

=head1 SYNOPSIS

F<git-grep-footer> [OPTIONS] [-- <git log options>]

F<git> log -z --pretty=format:%b | F<git-grep-footer> [OPTIONS] -

=head1 DESCRIPTION

Filters out header sections near the end of a commit body,
a common convention to list custom metadata such as
C<Signed-off-by> and C<Acked-by>.

Sections are identified by at least one leading keyword containing a dash
(or exceptionally recognised)
followed by a colon.

=head1 OPTIONS

=over

=item -i, --ignore-case

Lowercases everything.

=item -s, --simplify[=<rule>]

Modifies values to hide specific details.
Several different rules are supported:

=over

=item I<var> (default)

Replaces highly variable contents such as numbers, hashes, and addresses,
leaving only exceptional annotations as distinct text.
Attributes ending in I<-to> or I<-by> are assumed variable author names
and omitted entirely,
unless they contain a colon indicating possible attribute exceptions.

=item I<email>

Filters out author lines following the git signoff convention,
i.e. an <email address> optionally preceded by a name.

=item I<all>

Values will be hidden entirely, so only attribute names remain.

=back

=item --grep=<pattern>

Only include lines matching the specified regular expression.
Case insensitivity can be disabled by prepending C<(?-i)>.

=item -u, --unique[=<threshold>]

Each match is only shown once,
optionally after it has already occurred a given number of times.

=item -n, --show[=<limit>]

The original line is given for each match,
but simplifications still apply for duplicate determination.
Additional samples are optionally given up to the given maximum.

=item -c, --count

Prefixes (unique) lines by the number of occurrences.
Causes output to be buffered until all input has been read (obviously).

=item -H, --hash

Prefixes the SHA1 hash of the (or a) matching commit.

=back

=head1 EXAMPLES

=over

=item git-grep-footer --grep=^ack v2.6.32..v2.6.33

Search for I<Acked-by> lines for version I<v2.6.33>.
Append C<-uin> to skip reoccurrences.

=item git-grep-footer -u --grep=junio

Show distinct lines mentioning a specific author.

=item git-grep-footer -c --simplify --grep=^si

Compare various capitalisations and (mis)spellings of signoffs.

=item git-grep-footer -c --simplify=all -i | sort -n -r | head -n10

List the ten most frequently used attribute names.

=item git-grep-footer -n2 -i -s --hash -- --reverse

The earliest two usages of each distinct identifier.

=back

=head1 AUTHOR

Mischa POSLAWSKY <perl@shiar.org>

=head1 LICENSE

This software is free software;
you can redistribute and/or modify it under the terms of the GNU GPL
version 2 or later.