additional hash simplifications for -s'any'
[git-grep-footer.git] / git-grep-footer
1 #!/usr/bin/perl
2 use 5.010;
3 use strict;
4 use warnings;
5 use open ':std', OUT => ':utf8';
6 use Encode 'decode';
7 use Data::Dump 'pp';
8 use Getopt::Long qw(:config bundling);
9
10 GetOptions(\my %opt,
11         'debug!',
12         'count|c!',
13         'simplify|s:s',
14         'ignore-case|i!',
15         'min|min-count|unique|u:i',
16         'max|max-count|show|n:i',
17         'version|V'  => sub { Getopt::Long::VersionMessage() },
18         'usage|h'    => sub { Getopt::Long::HelpMessage() },
19         'help|man|?' => sub { Getopt::Long::HelpMessage(-verbose => 2) },
20 ) or exit 129;
21
22 local $| = 1;
23 local $/ = "\0";
24
25 my $HEADERMATCH = qr/ [a-z]+ (?: (?:-\w+)+ | \ by ) /ix;
26
27 my (%headercount, @headercache);
28
29 while (readline) {
30         s/(.+)\n//m;
31         my $hash = $1;
32
33         # strip commit seperator
34         chomp;
35         # skip expensive checks without potential identifier
36         m/:/ or next;
37         # try to parse as UTF-8
38         eval { $_ = decode(utf8   => $_, Encode::FB_CROAK()) };
39         # if invalid, assume it's latin1
40                $_ = decode(cp1252 => $_) if $@;
41
42         my $prefix = 0;
43         my %attr;
44
45         BLOCK:
46         for (reverse split /\n\n/) {
47                 my @headers;
48
49                 LINE:
50                 for (split /\n/) {
51                         next if not /\S/;
52                         my @header = m{
53                                 ^
54                                 (?<key> $HEADERMATCH)
55                                 : \s*
56                                 (?<val> \S .+)
57                                 $
58                         }imx or do {
59                                 $prefix++;
60                                 next LINE;
61                         };
62
63                         push @header, $_ if defined $opt{max};
64
65                         given ($opt{simplify} // 'none') {
66                                 when (['email', 'authors']) {
67                                         $header[1] =~ s{
68                                                 \A
69                                                 (?: [^:;]+ )?
70                                                 < [^@>]+ (?: @ | \h?\W? at \W?\h? ) [a-z0-9.-]+ >
71                                                 \Z
72                                         }{<...>}imsx;
73                                 }
74                                 when (['var', 'vars', '']) {
75                                         when ($header[0] =~ /[ _-] (?: by | to ) $/imsx) {
76                                                 $header[1] = undef;
77                                         }
78                                         for ($header[1]) {
79                                                 s{\b (https?)://\S+ }{[$1]}gmsx;  # url
80                                                 s{(?: < | \A ) [^@>\s]+ @ [^>]+ (?: > | \Z )}{<...>}igmsx;  # address
81                                                 s{\b [0-9]+ \b}{[num]}gmsx;  # number
82                                                 s{\b [Ig]? [0-9a-f]{  40} \b}{[sha1]}gmsx;  # hash
83                                                 s{\b [Ig]? [0-9a-f]{6,40} \b}{[hash]}gmsx;  # abbrev
84                                         }
85                                 }
86                                 when (['all', 'contents']) {
87                                         $header[1] = undef;
88                                 }
89                                 when (['none', 'no', '0']) {
90                                 }
91                                 default {
92                                         die "Unknown simplify option: '$_'\n";
93                                 }
94                         }
95
96                         if ($opt{'ignore-case'}) {
97                                 $_ = lc for $header[0], $header[1] // ();
98                         }
99
100                         pop @header if not defined $header[-1];
101
102                         push @headers, \@header;
103                 }
104
105                 next BLOCK if not @headers;
106
107                 if ($opt{debug} and $prefix) {
108                         say "infix junk in commit $hash";
109                 }
110
111                 for (@headers) {
112                         my $line = $_->[2] // join(': ', @$_);
113                         if (defined $opt{min} or $opt{max}) {
114                                 my $counter = \$headercount{ $_->[0] }->{ $_->[1] // '' };
115                                 my $excess = $$counter++ - ($opt{min} // 0);
116                                 next if $excess >= ($opt{max} || 1);
117                                 next if $excess <  0;
118                                 if ($opt{count}) {
119                                         push @headercache, [ $line, $excess ? \undef : $counter ];
120                                         next;
121                                 }
122                         }
123                         say $line;
124                 }
125
126                 last BLOCK;
127         }
128 }
129
130 for (@headercache) {
131         say ${$_->[1]} // '', "\t", $_->[0];
132 }
133
134 __END__
135
136 =head1 NAME
137
138 git-grep-footer - Find custom header lines in commit messages
139
140 =head1 SYNOPSIS
141
142 F<git> log --pretty=%b%x00 | F<git-grep-footer> [OPTIONS]
143
144 =head1 DESCRIPTION
145
146 Filters out header sections near the end of a commit body,
147 a common convention to list custom metadata such as
148 C<Signed-off-by> and C<Acked-by>.
149
150 Sections are identified by at least one leading keyword containing a dash
151 followed by a colon.
152
153 =head1 OPTIONS
154
155 =over
156
157 =item -i, --ignore-case
158
159 Lowercases everything.
160
161 =item -s, --simplify[=<rule>]
162
163 Modifies values to hide specific details.
164 Several different rules are supported:
165
166 =over
167
168 =item I<var> (default)
169
170 Replaces highly variable contents such as numbers, hashes, and addresses,
171 leaving only exceptional annotations as distinct text.
172 Attributes ending in I<-to> or I<-by> are assumed variable author names
173 and omitted entirely,
174 unless they contain a colon indicating possible attribute exceptions.
175
176 =item I<email>
177
178 Filters out author lines following the git signoff convention,
179 i.e. an <email address> optionally preceded by a name.
180
181 =item I<all>
182
183 Values will be hidden entirely, so only attribute names remain.
184
185 =back
186
187 =item -u, --unique[=<threshold>]
188
189 Each match is only shown once,
190 optionally after it has already occurred a given amount of times.
191
192 =item -n, --show[=<limit>]
193
194 The original line is given for each match,
195 but simplifications still apply for duplicate determination.
196 Additional samples are optionally given upto the given maximum.
197
198 =back
199
200 =head1 AUTHOR
201
202 Mischa POSLAWSKY <perl@shiar.org>
203
204 =head1 LICENSE
205
206 Copyright. All rights reserved.
207