#!/usr/bin/perl
# git-grep-footer - find custom header ("footer") lines in commit messages.
#
# Reads NUL-separated commit bodies from the files named on the command line
# (or standard input), locates the last paragraph that contains lines of the
# form "Some-Attribute: value", and prints those lines, optionally
# normalised, de-duplicated and counted.  See the POD below for the options.
use 5.010;
use strict;
use warnings;
use open ':std', OUT => ':utf8';

use Encode 'decode';
use Data::Dump 'pp';  # debugging aid; pp() is not called in the code below
use Getopt::Long qw(:config bundling);

GetOptions(\my %opt,
    'debug!',
    'count|c!',
    'simplify|s:s',
    'ignore-case|i!',
    'fuzzy!',
    'min|min-count|unique|u:i',
    'max|max-count|show|n:i',
    'version|V'  => sub { Getopt::Long::VersionMessage() },
    'usage|h'    => sub { Getopt::Long::HelpMessage() },
    'help|man|?' => sub { Getopt::Long::HelpMessage(-verbose => 2) },
) or exit 129;

local $| = 1;     # unbuffered output
local $/ = "\0";  # one input record per commit (git log --pretty=...%x00)

# Attribute names accepted as headers: a word followed by one or more
# "-word" parts or by " by", or the exceptional names "cc" and "reference".
my $HEADERMATCH = qr/ [a-z]+ (?: (?:-\w+)+ | \ by ) | cc | reference /ix;

# %headercount: attribute name => value => number of occurrences so far.
# @headercache: [output line, reference to counter] pairs held back until
#               all input has been read (only filled with --count).
my (%headercount, @headercache);

while (defined(my $message = <>)) {
    # An optional commit hash on a line of its own is removed and remembered.
    # Declared unconditionally: a "my" that is only executed when a condition
    # holds has undefined behaviour according to perlsyn.
    my $hash = $message =~ s/^([0-9a-f]{4,40})\n//m ? $1 : undef;

    # strip commit separator
    chomp $message;

    # skip expensive checks without potential identifier
    next if $message !~ /:/;

    # Try to parse as UTF-8; if invalid, assume it's latin1 (cp1252).
    # LEAVE_SRC keeps the input intact for the fallback, since decode() may
    # otherwise modify its argument in place when a CHECK value is given.
    my $decoded = eval {
        decode(utf8 => $message, Encode::FB_CROAK() | Encode::LEAVE_SRC());
    };
    $message = $decoded // decode(cp1252 => $message);

    # Walk paragraphs from the end; only the last one that contains at least
    # one header line is reported.
    BLOCK:
    for my $block (reverse split /\n\n/, $message) {
        my @headers;      # parsed headers: [name, value?, original line?]
        my $invalid = 0;  # non-blank lines in this block that are no header

        LINE:
        for my $line (split /\n/, $block) {
            next LINE if $line !~ /\S/;

            my @header = $line =~ m{
                ^
                (?<name>  $HEADERMATCH )
                : \s*
                (?<value> \S .+ )
                $
            }imx;
            if (not @header) {
                $invalid++;
                next LINE;
            }

            # With --show the untouched line is kept for output, while the
            # normalised name/value are still used to detect duplicates.
            push @header, $line if defined $opt{max};

            $header[0] = _fuzzy_name($header[0]) if $opt{fuzzy};
            $header[1] = _simplify_value($opt{simplify} // 'none', @header[0, 1]);

            if ($opt{'ignore-case'}) {
                $header[0] = lc $header[0];
                $header[1] = lc $header[1] if defined $header[1];
            }

            # Drop a hidden value so the header prints as the bare name.
            pop @header if not defined $header[-1];
            push @headers, \@header;
        }

        next BLOCK if not @headers;

        if ($opt{debug} and $invalid) {
            say sprintf ': invalid lines in %s (%s)', $hash // 'block', $invalid;
        }

        for my $header (@headers) {
            my $output = $header->[2] // join(': ', @$header);
            $output = "$hash $output" if defined $hash;

            if (defined $opt{min} or $opt{max} or $opt{count}) {
                my $counter = \$headercount{ $header->[0] }{ $header->[1] // '' };
                # Occurrences seen before this one, beyond the --unique minimum.
                my $excess = $$counter++ - ($opt{min} // 0);
                next if $excess >= ($opt{max} || 1);
                next if $excess < 0;
                if ($opt{count}) {
                    # Only the first shown sample carries the (final) count;
                    # further samples get an empty count column.
                    push @headercache, [ $output, $excess ? \undef : $counter ];
                    next;
                }
            }
            say $output;
        }
        last BLOCK;
    }
}

for my $entry (@headercache) {
    say ${ $entry->[1] } // '', "\t", $entry->[0];
}

# Map common misspellings of well-known attribute names to their canonical
# form (--fuzzy).  Spaces and underscores are first turned into dashes.
sub _fuzzy_name {
    my ($name) = @_;
    state $BY = qr{ (?: -? b[yu] )? \Z }ix;

    $name =~ tr/ _/-/;
    $name =~ s{^ si (?:ge?n|n?g) (?:e?[dt])? -? (?:of+)? $BY}{Signed-off-by}ix;
    $name =~ s{^ ack (?:ed|de)? $BY}{Acked-by}ix;
    $name =~ s{^ review (?:e?d)? $BY}{Reviewed-by}ix;
    $name =~ s{^ teste[dt] $BY}{Tested-by}ix;
    return $name;
}

# Apply the --simplify rule to a header value.
#   $rule  - rule name as given on the command line ('' when -s has no value)
#   $name  - attribute name (already fuzzy-corrected if requested)
#   $value - attribute value
# Returns the simplified value, or undef if the value is to be hidden.
# Dies with "Unknown simplify option" for an unrecognised rule.
sub _simplify_value {
    my ($rule, $name, $value) = @_;

    if ($rule eq 'email' or $rule eq 'authors') {
        # "Name <user@host>" or "Name <user at host>" becomes "<...>".
        $value =~ s{
            \A (?: [^:;]+ )?
            < [^@>]+ (?: @ | \h?\W? at \W?\h? ) [a-z0-9.-]+ >
            \Z
        }{<...>}imsx;
    }
    elsif ($rule eq 'var' or $rule eq 'vars' or $rule eq '') {
        # Author-like attributes are hidden entirely.
        return if $name =~ /[ _-] (?: by | to ) $ | ^cc$/imsx;

        for ($value) {
            s{\b (https?)://\S+ }{[$1]}gmsx;  # url
            s{(?: < | \A ) [^@>\s]+ @ [^>]+ (?: > | \Z )}{<...>}igmsx;  # address
            s{\b [0-9]+ \b}{[num]}gmsx;  # number
            # No blank inside the quantifier: before Perl 5.34 "{ 40}" is
            # taken as literal text under /x, so this rule would never match.
            s{\b [Ig]? [0-9a-f]{40}   \b}{[sha1]}gmsx;  # hash
            s{\b [Ig]? [0-9a-f]{6,40} \b}{[hash]}gmsx;  # abbrev
        }
    }
    elsif ($rule eq 'all' or $rule eq 'contents') {
        return;
    }
    elsif ($rule eq 'none' or $rule eq 'no' or $rule eq '0') {
        # keep the value as is
    }
    else {
        die "Unknown simplify option: '$rule'\n";
    }
    return $value;
}

__END__

=head1 NAME

git-grep-footer - Find custom header lines in commit messages

=head1 SYNOPSIS

F<git> log --pretty=%b%x00 | F<git-grep-footer> [OPTIONS]

=head1 DESCRIPTION

Filters out header sections near the end of a commit body,
a common convention to list custom metadata such as
C<Signed-off-by> and C<Acked-by>.

Sections are identified by at least one leading keyword containing a dash
(or exceptionally recognised) followed by a colon.

=head1 OPTIONS

=over

=item -i, --ignore-case

Lowercases everything.

=item -s, --simplify[=<rule>]

Modifies values to hide specific details.
Several different rules are supported:

=over

=item I<var> (default)

Replaces highly variable contents such as numbers, hashes, and addresses,
leaving only exceptional annotations as distinct text.

Attributes ending in I<-to> or I<-by> are assumed variable author names
and omitted entirely, unless they contain a colon indicating possible
attribute exceptions.

=item I<email>

Filters out author lines following the git signoff convention,
i.e. an C<< <email address> >> optionally preceded by a name.

=item I<all>

Values will be hidden entirely, so only attribute names remain.

=back

=item -u, --unique[=<min>]

Each match is only shown once, optionally after it has already occurred
a given number of times.

=item -n, --show[=<max>]

The original line is given for each match, but simplifications still apply
for duplicate determination. Additional samples are optionally given
up to the given maximum.

=item -c, --count

Prefixes (unique) lines by the number of occurrences.
Causes output to be buffered until all input has been read (obviously).

=back

=head1 AUTHOR

Mischa POSLAWSKY

=head1 LICENSE

Copyright. All rights reserved.