-#!/usr/bin/perl -0 -CS
+#!/usr/bin/perl -0 -CO
use 5.010;
use strict;
use warnings;
+use Encode 'decode';
use Data::Dump 'pp';
+use Getopt::Long;
+
+GetOptions(\my %opt,
+ 'debug!',
+ 'simplify|s:s',
+ 'unique|u!',
+ 'ignore-case|i!',
+) or die;
+
+local $| = 1;
my $HEADERMATCH = qr/ [a-z]+ (?: (?:-\w+)+ | \ by ) /ix;
while (readline) {
+ s/(.+)\n//m;
+ my $hash = $1;
+
+ # strip commit seperator
+ chomp;
+ # skip expensive checks without potential identifier
+ m/:/ or next;
+ # try to parse as UTF-8
+ eval { $_ = decode(utf8 => $_, Encode::FB_CROAK()) };
+ # if invalid, assume it's latin1
+ $_ = decode(cp1252 => $_) if $@;
+
+ my $prefix = 0;
+ my %attr;
+
BLOCK:
for (reverse split /\n\n/) {
my @headers;
: \s*
(?<val> \S .+)
$
- }imx or next LINE;
+ }imx or do {
+ $prefix++;
+ next LINE;
+ };
+
+ given ($opt{simplify} // 'no') {
+ when ('strict') {
+ $header[1] =~ s{
+ \A
+ (?: [^:]+ )?
+ < [^@>]+ (?: @ | \h?\W? at \W?\h? ) [a-z0-9.-]+ >
+ \Z
+ }{<...>}imsx;
+ }
+ when (['text', '']) {
+ when ($header[0] =~ /[ _-] (?: by | to ) $/imsx) {
+ pop @header;
+ }
+ for ($header[1]) {
+ s{\b (https?)://\S+ }{[$1]}gmsx; # url
+ s{(?: < | \A ) [^@>\s]+ @ [^>]+ (?: > | \Z )}{<...>}igmsx; # address
+ s{\b [0-9]+ \b}{[num]}gmsx; # number
+ s{\b I? [0-9a-f]{40} \b}{[sha1]}gmsx; # hash
+ }
+ }
+ when (['all', 'any']) {
+ pop @header;
+ }
+ when ('no') {
+ }
+ default {
+ die "Unknown simplify option: '$_'\n";
+ }
+ }
+
+ if ($opt{'ignore-case'}) {
+ $_ = lc for @header;
+ }
push @headers, \@header;
}
next BLOCK if not @headers;
+ if ($opt{debug} and $prefix) {
+ say "infix junk in commit $hash";
+ }
+
for (@headers) {
+ if ($opt{unique}) {
+ state $seen;
+ next if $seen->{ $_->[0] }->{ $_->[1] // '' }++;
+ }
say join ': ', @$_;
}