#!/usr/bin/perl -0 -CO use 5.010; use strict; use warnings; use Encode 'decode'; use Data::Dump 'pp'; use Getopt::Long; GetOptions(\my %opt, 'debug!', 'simplify|s:s', 'unique|u!', 'ignore-case|i!', ) or die; local $| = 1; my $HEADERMATCH = qr/ [a-z]+ (?: (?:-\w+)+ | \ by ) /ix; while (readline) { s/(.+)\n//m; my $hash = $1; # strip commit seperator chomp; # skip expensive checks without potential identifier m/:/ or next; # try to parse as UTF-8 eval { $_ = decode(utf8 => $_, Encode::FB_CROAK()) }; # if invalid, assume it's latin1 $_ = decode(cp1252 => $_) if $@; my $prefix = 0; my %attr; BLOCK: for (reverse split /\n\n/) { my @headers; LINE: for (split /\n/) { next if not /\S/; my @header = m{ ^ (? $HEADERMATCH) : \s* (? \S .+) $ }imx or do { $prefix++; next LINE; }; given ($opt{simplify} // 'no') { when ('strict') { $header[1] =~ s{ \A (?: [^:]+ )? < [^@>]+ (?: @ | \h?\W? at \W?\h? ) [a-z0-9.-]+ > \Z }{<...>}imsx; } when (['text', '']) { when ($header[0] =~ /[ _-] (?: by | to ) $/imsx) { pop @header; } for ($header[1]) { s{\b (https?)://\S+ }{[$1]}gmsx; # url s{(?: < | \A ) [^@>\s]+ @ [^>]+ (?: > | \Z )}{<...>}igmsx; # address s{\b [0-9]+ \b}{[num]}gmsx; # number s{\b I? [0-9a-f]{40} \b}{[sha1]}gmsx; # hash } } when (['all', 'any']) { pop @header; } when ('no') { } default { die "Unknown simplify option: '$_'\n"; } } if ($opt{'ignore-case'}) { $_ = lc for @header; } push @headers, \@header; } next BLOCK if not @headers; if ($opt{debug} and $prefix) { say "infix junk in commit $hash"; } for (@headers) { if ($opt{unique}) { state $seen; next if $seen->{ $_->[0] }->{ $_->[1] // '' }++; } say join ': ', @$_; } last BLOCK; } }