#!/usr/bin/env perl

# SPDX-License-Identifier: MIT
#
# Copyright (c) 2023, Rob Norris <robn@despairlabs.com>
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to
# deal in the Software without restriction, including without limitation the
# rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
# sell copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in
# all copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
# IN THE SOFTWARE.


# This program will update the AUTHORS file to include commit authors that are
# in the git history but are not yet credited.
#
# The CONTRIBUTORS section of the AUTHORS file attempts to be a list of
# individual contributors to OpenZFS, with one name, address and line per
# person. This is good for readability, but does not really leave room for the
# fact that names and emails on commits from the same individual can be
# different, for all kinds of reasons, not limited to:
#
# - a person might change organisations, and so their email address changes
#
# - a person might be paid to work on OpenZFS for their employer, and then hack
#   on personal projects in the evening, so commits legitimately come from
#   different addresses
#
# - names change for all kinds of reasons
#
# To try and account for this, this program will try to find all the possible
# names and emails for a single contributor, and then select the "best" one to
# add to the AUTHORS file.
#
# The CONTRIBUTORS section of the AUTHORS file is considered the source of
# truth. Once an individual committer is listed in there, that line will not be
# removed regardless of what is discovered in the commit history. However, it
# can't just be _anything_. The name or email still has to match something seen
# in the commit history, so that we're able to understand that it's the same
# contributor.
#
# The bulk of the work is in running `git log` to fetch commit author names and
# emails. For each value, we generate a "slug" to use as an internal id for
# that value, which is mostly just the lowercase of the value with whitespace
# and punctuation removed. Two values with subtle differences can produce the
# same slug, so at this point we also try to keep the "best" pre-slug value as
# the display version. We use this slug to update two maps, one of email->name,
# the other of name->email.
#
# Once collected, we then walk all the emails we've seen and get all the names
# associated with every instance. Then for each of those names, we get all the
# emails associated, and so on until we've seen all the connected names and
# emails. This collection is every possible name and email for an individual
# contributor.
#
# Finally, we consider these groups, and select the "best" name and email for
# the contributor, and add them to the author tables if they aren't there
# already. Once we've done everyone, we write out a new AUTHORS file, and
# that's the whole job.
#
# This is imperfect! It's necessary for the user to examine the diff and make
# sure it's sensible. If it hasn't hooked up right, it may be necessary to
# adjust the input data (via .mailmap) or improve the heuristics in this
# program.
# It took a long time to get into good shape when first written (355 new names
# added to AUTHORS!) but hopefully in the future we'll be running this
# regularly so it doesn't fall so far behind.


use 5.010;
use warnings;
use strict;

# Storage for the "best looking" version of name or email, keyed on slug.
my %display_name;
my %display_email;

# First, we load the existing AUTHORS file. We save everything up to and
# including the CONTRIBUTORS: line as-is so we can write it back out to the
# new file. Then we extract name,email pairs from the remainder and store them
# in a pair of hashtables, keyed on slug:
#
#   %authors_name   email slug -> name slug
#   %authors_email  name slug  -> email slug
my %authors_name;
my %authors_email;

my @authors_header;

# Open the file explicitly and fail hard if that is not possible. Carrying on
# without the existing contents would end with us writing out an AUTHORS file
# that has lost its header and all existing contributors.
open my $authors_in, '<', 'AUTHORS'
    or die "E: couldn't open AUTHORS for read: $!\n";

my $in_header = 1;
while (my $line = <$authors_in>) {
    chomp $line;

    if ($in_header) {
        # Header lines are kept verbatim; the CONTRIBUTORS: line is the last
        # line of the header.
        push @authors_header, $line;
        $in_header = 0 if $line =~ m/^CONTRIBUTORS:/;
        next;
    }

    # Contributor lines are indented and look like "Name <email>". Anything
    # else in this section (blank lines, etc) is skipped.
    my ($name, $email) = $line =~ m/^\s+(.+)(?= <) <([^>]+)/;
    next unless $name;

    my $semail = email_slug($email);
    my $sname = name_slug($name);

    $authors_name{$semail} = $sname;
    $authors_email{$sname} = $semail;

    # The name/email in AUTHORS is already the "best looking"
    # version, by definition.
    $display_name{$sname} = $name;
    $display_email{$semail} = $email;
}

close $authors_in;

# Next, we load all the commit authors and form name<->email mappings, keyed
# on slug. Note that this format is getting the .mailmap-converted form. This
# lets us control the input to some extent by making changes there.
# Two-level maps of everything seen in the commit history, keyed on slug:
#
#   %git_names   email slug -> { name slug  => 1, ... }
#   %git_emails  name slug  -> { email slug => 1, ... }
my %git_names;
my %git_emails;

# Run git once and make sure it actually worked before we trust its output.
# If it failed (not a git checkout, git not installed, ...) we stop here,
# before anything has been written, rather than regenerating AUTHORS from an
# empty history.
my @git_log = qx(git log --pretty=tformat:'%aN:::%aE');
if ($? == -1) {
    die "E: couldn't run git log: $!\n";
}
elsif ($? != 0) {
    die "E: git log failed (wait status $?)\n";
}

# The log is walked in reverse of the order git printed it.
for my $line (reverse @git_log) {
    chomp $line;
    my ($name, $email) = $line =~ m/^(.*):::(.*)/;
    next unless $name && $email;

    my $semail = email_slug($email);
    my $sname = name_slug($name);

    $git_names{$semail}{$sname} = 1;
    $git_emails{$sname}{$semail} = 1;

    # Update the "best looking" display value, but only if we don't already
    # have something from the AUTHORS file. If we do, we must not change it.
    update_display_email($email) unless $authors_name{$semail};
    update_display_name($name) unless $authors_email{$sname};
}

# Now collect unique committers by all names+emails we've ever seen for them.
# We start with emails and resolve all possible names, then we resolve the
# emails for those names, and round and round until there's nothing left.
# Each element is a pair [ [email slugs...], [name slugs...] ] describing one
# connected group of names and emails.
my @committers;
for my $start_email (sort keys %git_names) {
    # it might have been deleted already through a cross-reference
    next unless $git_names{$start_email};

    my %emails;
    my %names;

    # Work queues. Entries are consumed from the global maps (via delete) as
    # they are visited, so nothing is visited twice across groups.
    my @check_emails = ($start_email);
    my @check_names;
    while (@check_emails || @check_names) {
        # Test with defined() rather than truthiness, so that an empty-string
        # slug is processed like any other instead of being dropped.
        while (defined(my $email = shift @check_emails)) {
            next if $emails{$email}++;
            push @check_names,
                sort keys %{delete $git_names{$email}};
        }
        while (defined(my $name = shift @check_names)) {
            next if $names{$name}++;
            push @check_emails,
                sort keys %{delete $git_emails{$name}};
        }
    }

    # A "committer" is the collection of connected names and emails.
    push @committers, [[sort keys %emails], [sort keys %names]];
}

# Now we have our committers, we can work out what to add to AUTHORS.
for my $committer (@committers) {
    my ($emails, $names) = @$committer;

    # If this committer is already in AUTHORS, we must not touch.
    next if grep { $authors_name{$_} } @$emails;
    next if grep { $authors_email{$_} } @$names;

    # Decide on the "best" name and email to use
    my $email = best_email(@$emails);
    my $name = best_name(@$names);

    $authors_email{$name} = $email;
    $authors_name{$email} = $name;
}

# Now output the new AUTHORS file
open my $fh, '>', 'AUTHORS' or die "E: couldn't open AUTHORS for write: $!\n";
#my $fh = \*STDOUT;

# The header is written back as it was read; the trailing "" element adds an
# empty line between the CONTRIBUTORS: line and the first entry.
say $fh join("\n", @authors_header, "");

# One line per contributor, ordered by name slug, using the display forms.
# NOTE(review): the leading whitespace in the entry line below must match the
# indentation of the existing CONTRIBUTORS entries in AUTHORS -- confirm.
for my $name (sort keys %authors_email) {
    my $cname = $display_name{$name};
    my $cemail = $display_email{email_slug($authors_email{$name})};
    say $fh " $cname <$cemail>";
}

# Output is buffered, so a failed write (disk full, etc) may only be reported
# when the handle is closed. Check it so we never exit 0 with a truncated
# file.
close $fh or die "E: couldn't write AUTHORS: $!\n";

exit 0;

# "Slugs" are used as the hashtable key for names and emails. They are used to
# make two variants of a value be the "same" for matching. Mostly this is
# to make upper and lower-case versions of a name or email compare the same,
# but we do a little bit of munging to handle some common cases.
#
# Note that these are only used for matching internally; for display, the
# slug will be used to look up the display form.
# name_slug($name)
#
# Returns the matching key ("slug") for a contributor name: the name with all
# whitespace and dots removed, lowercased.
sub name_slug {
    my ($name) = @_;

    # Remove spaces and dots, to handle differences in initials.
    $name =~ s/[\s\.]//g;

    return lc $name;
}

# email_slug($email)
#
# Returns the matching key ("slug") for an email address, lowercased.
sub email_slug {
    my ($email) = @_;

    # Strip anything separated from the address by whitespace. The first
    # alternative is greedy, so for a value that contains whitespace it is
    # the text after the last run of whitespace that is kept.
    $email =~ s/^(.*\s+)|(\s+.*)$//g;

    # Remove the leading userid+ on Github noreply addresses. They're
    # optional and we want to treat them as the same thing.
    $email =~ s/^[^\+]*\+//g if $email =~ m/\.noreply\.github\.com$/;

    return lc $email;
}

# update_display_name($name)
#
# Records $name as the display form for its slug in %display_name if there is
# no display form yet, or if $name is "more specific" than the current one.
# Returns nothing useful.
sub update_display_name {
    my ($name) = @_;
    my $sname = name_slug($name);

    # For names, "more specific" means "has more non-lower-case characters"
    # (in ASCII), guessing that if a person has gone to some effort to
    # specialise their name in a later commit, they presumably care more
    # about it. If this is wrong, it's probably better to add a .mailmap
    # entry.
    #
    # tr/a-z // with an empty replacement list only counts the lowercase
    # letters and spaces; it does not modify the string. Fewer of them wins,
    # and a tie keeps the existing value.
    my $cname = $display_name{$sname};
    if (!$cname ||
        ($name =~ tr/a-z //) < ($cname =~ tr/a-z //)) {
        $display_name{$sname} = $name;
    }
}

# update_display_email($email)
#
# Records $email as the display form for its slug in %display_email, using
# the same rule as update_display_name(). Returns nothing useful.
sub update_display_email {
    my ($email) = @_;
    my $semail = email_slug($email);

    # Like names, we prefer uppercase when possible. We also remove any
    # leading "plus address" for Github noreply addresses.
    $email =~ s/^[^\+]*\+//g if $email =~ m/\.noreply\.github\.com$/;

    my $cemail = $display_email{$semail};
    if (!$cemail ||
        ($email =~ tr/a-z //) < ($cemail =~ tr/a-z //)) {
        $display_email{$semail} = $email;
    }
}

# best_name(@name_slugs)
#
# Returns the name slug whose display form (from %display_name) sorts first
# in a plain string comparison, or undef if called with no names.
sub best_name {
    # The "best" name is very subjective, and a simple sort
    # produced good-enough results, so I didn't try harder. Use of
    # accented characters, punctuation and caps are probably an
    # indicator of "better", but possibly we should also take into
    # account the most recent name we saw, in case the committer
    # has changed their name or nickname or similar.
    #
    # Really, .mailmap is the place to control this.
    my @names = sort { $display_name{$a} cmp $display_name{$b} } @_;

    return shift @names;
}

# best_email(@email_slugs)
#
# Returns the most preferred of the given email slugs, or undef if called
# with none. Preference is decided by the first rule below that tells two
# addresses apart.
sub best_email {
    state $internal_re = qr/\.(?:internal|local|\(none\))$/;
    state $noreply_re = qr/\.noreply\.github\.com$/;
    state $freemail_re = qr/\@(?:gmail|hotmail)\.com$/;

    # Note: "return" inside a sort block returns the comparison result for
    # that pair of elements, not from best_email() itself.
    my @emails = sort {
        my $cmp;

        # prefer address with a single @ over those without
        $cmp = (($b =~ tr/@//) == 1) <=> (($a =~ tr/@//) == 1);
        return $cmp unless $cmp == 0;

        # prefer any address over internal/local addresses
        $cmp = (($a =~ $internal_re) <=> ($b =~ $internal_re));
        return $cmp unless $cmp == 0;

        # prefer any address over github noreply aliases
        $cmp = (($a =~ $noreply_re) <=> ($b =~ $noreply_re));
        return $cmp unless $cmp == 0;

        # prefer any address over freemail providers
        $cmp = (($a =~ $freemail_re) <=> ($b =~ $freemail_re));
        return $cmp unless $cmp == 0;

        # Two addresses that both lack an @ can get this far, in which case
        # there is no domain (and for an empty value, no local part
        # either). Treat the missing parts as empty strings so they
        # compare cleanly instead of warning about undefined values.
        my ($alocal, $adom) = split /\@/, $a;
        my ($blocal, $bdom) = split /\@/, $b;

        # alphabetical by domain
        $cmp = (($adom // '') cmp ($bdom // ''));
        return $cmp unless $cmp == 0;

        # alphabetical by local part
        return (($alocal // '') cmp ($blocal // ''));
    } @_;

    return shift @emails;
}