xref: /freebsd-src/sys/contrib/openzfs/scripts/update_authors.pl (revision 23cf27db2cc4b0208a35b009f873971a7bb4a6bb)
1*23cf27dbSMartin Matuska#!/usr/bin/env perl
2*23cf27dbSMartin Matuska
3*23cf27dbSMartin Matuska# SPDX-License-Identifier: MIT
4*23cf27dbSMartin Matuska#
5*23cf27dbSMartin Matuska# Copyright (c) 2023, Rob Norris <robn@despairlabs.com>
6*23cf27dbSMartin Matuska#
7*23cf27dbSMartin Matuska# Permission is hereby granted, free of charge, to any person obtaining a copy
8*23cf27dbSMartin Matuska# of this software and associated documentation files (the "Software"), to
9*23cf27dbSMartin Matuska# deal in the Software without restriction, including without limitation the
10*23cf27dbSMartin Matuska# rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
11*23cf27dbSMartin Matuska# sell copies of the Software, and to permit persons to whom the Software is
12*23cf27dbSMartin Matuska# furnished to do so, subject to the following conditions:
13*23cf27dbSMartin Matuska#
14*23cf27dbSMartin Matuska# The above copyright notice and this permission notice shall be included in
15*23cf27dbSMartin Matuska# all copies or substantial portions of the Software.
16*23cf27dbSMartin Matuska#
17*23cf27dbSMartin Matuska# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
18*23cf27dbSMartin Matuska# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
19*23cf27dbSMartin Matuska# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
20*23cf27dbSMartin Matuska# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
21*23cf27dbSMartin Matuska# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
22*23cf27dbSMartin Matuska# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
23*23cf27dbSMartin Matuska# IN THE SOFTWARE.
24*23cf27dbSMartin Matuska
25*23cf27dbSMartin Matuska
26*23cf27dbSMartin Matuska# This program will update the AUTHORS file to include commit authors that are
27*23cf27dbSMartin Matuska# in the git history but are not yet credited.
28*23cf27dbSMartin Matuska#
29*23cf27dbSMartin Matuska# The CONTRIBUTORS section of the AUTHORS file attempts to be a list of
30*23cf27dbSMartin Matuska# individual contributors to OpenZFS, with one name, address and line per
# person. This is good for readability, but does not really leave room for the
# fact that names and emails on commits from the same individual can be
# different, for all kinds of reasons, not limited to:
34*23cf27dbSMartin Matuska#
35*23cf27dbSMartin Matuska# - a person might change organisations, and so their email address changes
36*23cf27dbSMartin Matuska#
37*23cf27dbSMartin Matuska# - a person might be paid to work on OpenZFS for their employer, and then hack
38*23cf27dbSMartin Matuska#   on personal projects in the evening, so commits legitimately come from
39*23cf27dbSMartin Matuska#   different addresses
40*23cf27dbSMartin Matuska#
41*23cf27dbSMartin Matuska# - names change for all kinds of reasons
42*23cf27dbSMartin Matuska#
43*23cf27dbSMartin Matuska# To try and account for this, this program will try to find all the possible
44*23cf27dbSMartin Matuska# names and emails for a single contributor, and then select the "best" one to
45*23cf27dbSMartin Matuska# add to the AUTHORS file.
46*23cf27dbSMartin Matuska#
47*23cf27dbSMartin Matuska# The CONTRIBUTORS section of the AUTHORS file is considered the source of
48*23cf27dbSMartin Matuska# truth. Once an individual committer is listed in there, that line will not be
49*23cf27dbSMartin Matuska# removed regardless of what is discovered in the commit history. However, it
50*23cf27dbSMartin Matuska# can't just be _anything_. The name or email still has to match something seen
# in the commit history, so that we're able to understand that it's the same
52*23cf27dbSMartin Matuska# contributor.
53*23cf27dbSMartin Matuska#
54*23cf27dbSMartin Matuska# The bulk of the work is in running `git log` to fetch commit author names and
55*23cf27dbSMartin Matuska# emails. For each value, we generate a "slug" to use as an internal id for
56*23cf27dbSMartin Matuska# that value, which is mostly just the lowercase of the value with whitespace
57*23cf27dbSMartin Matuska# and punctuation removed. Two values with subtle differences can produce the
58*23cf27dbSMartin Matuska# same slug, so at this point we also try to keep the "best" pre-slug value as
59*23cf27dbSMartin Matuska# the display version. We use this slug to update two maps, one of email->name,
60*23cf27dbSMartin Matuska# the other of name->email.
61*23cf27dbSMartin Matuska#
62*23cf27dbSMartin Matuska# Once collected, we then walk all the emails we've seen and get all the names
63*23cf27dbSMartin Matuska# associated with every instance. Then for each of those names, we get all the
64*23cf27dbSMartin Matuska# emails associated, and so on until we've seen all the connected names and
65*23cf27dbSMartin Matuska# emails. This collection is every possible name and email for an individual
66*23cf27dbSMartin Matuska# contributor.
67*23cf27dbSMartin Matuska#
# Finally, we consider these groups, and select the "best" name and email for
69*23cf27dbSMartin Matuska# the contributor, and add them to the author tables if they aren't there
70*23cf27dbSMartin Matuska# already. Once we've done everyone, we write out a new AUTHORS file, and
71*23cf27dbSMartin Matuska# that's the whole job.
72*23cf27dbSMartin Matuska#
# This is imperfect! It's necessary for the user to examine the diff and make
# sure it's sensible. If it hasn't hooked up right, it may be necessary to
# adjust the input data (via .mailmap) or improve the heuristics in this
# program. It took a long time to get into good shape when first written (355
# new names added to AUTHORS!) but hopefully in the future we'll be running
# this regularly so it doesn't fall so far behind.
79*23cf27dbSMartin Matuska
80*23cf27dbSMartin Matuska
81*23cf27dbSMartin Matuskause 5.010;
82*23cf27dbSMartin Matuskause warnings;
83*23cf27dbSMartin Matuskause strict;
84*23cf27dbSMartin Matuska
# Storage for the "best looking" version of name or email, keyed on slug.
my %display_name;
my %display_email;

# First, we load the existing AUTHORS file. We save everything up to and
# including the CONTRIBUTORS: line as-is so we can write it back out to the
# new file. Then we extract name,email pairs from the remainder and store them
# in a pair of hashtables, keyed on slug.
my %authors_name;
my %authors_email;

my @authors_header;

# Open AUTHORS explicitly so that a missing or unreadable file stops the run
# here. Carrying on with nothing loaded would drop the header and every
# existing entry when the file is rewritten at the end of the program.
open my $authors_fh, '<', 'AUTHORS'
    or die "E: couldn't open AUTHORS for read: $!\n";

my $in_header = 1;
while (my $line = <$authors_fh>) {
	chomp $line;

	if ($in_header) {
		# The CONTRIBUTORS: line itself is the last header line.
		push @authors_header, $line;
		$in_header = 0 if $line =~ m/^CONTRIBUTORS:/;
		next;
	}

	# Contributor entries are indented "Name <email>" lines. Lines that
	# don't match (blank lines, for instance) are skipped.
	my ($name, $email) = $line =~ m/^\s+(.+)(?= <) <([^>]+)/;
	next unless $name;

	my $semail = email_slug($email);
	my $sname = name_slug($name);

	$authors_name{$semail} = $sname;
	$authors_email{$sname} = $semail;

	# The name/email in AUTHORS is already the "best looking"
	# version, by definition.
	$display_name{$sname} = $name;
	$display_email{$semail} = $email;
}
close $authors_fh;
120*23cf27dbSMartin Matuska
# Next, we load all the commit authors and form name<->email mappings, keyed
# on slug. Note that this format is getting the .mailmap-converted form. This
# lets us control the input to some extent by making changes there.
my %git_names;
my %git_emails;

# Capture the whole log first so that git's exit status can be checked before
# any of the output is trusted. A failed git run must not be mistaken for an
# empty history.
my @git_log = qx(git log --pretty=tformat:'%aN:::%aE');
die "E: git log failed (wait status $?)\n" if $? != 0;

# The log lines are processed in reverse order of how git printed them.
for my $line (reverse @git_log) {
	chomp $line;
	my ($name, $email) = $line =~ m/^(.*):::(.*)/;
	next unless $name && $email;

	my $semail = email_slug($email);
	my $sname = name_slug($name);

	$git_names{$semail}{$sname} = 1;
	$git_emails{$sname}{$semail} = 1;

	# Update the "best looking" display value, but only if we don't already
	# have something from the AUTHORS file. If we do, we must not change it.
	update_display_email($email) unless $authors_name{$semail};
	update_display_name($name) unless $authors_email{$sname};
}
148*23cf27dbSMartin Matuska
# Group everything seen in the git history into individual committers.
# Starting from one email slug, pull in every name used with it, then every
# email used with those names, alternating until both work queues run dry.
# Entries are deleted from %git_names / %git_emails as they are expanded, so
# each one ends up in exactly one group.
my @committers;
for my $seed_email (sort keys %git_names) {
	# Already swallowed by an earlier group via a cross-reference.
	next unless $git_names{$seed_email};

	my (%emails, %names);
	my @email_queue = ($seed_email);
	my @name_queue;

	until (!@email_queue && !@name_queue) {
		while (my $email = shift @email_queue) {
			unless ($emails{$email}++) {
				push @name_queue,
				    sort keys %{delete $git_names{$email}};
			}
		}
		while (my $name = shift @name_queue) {
			unless ($names{$name}++) {
				push @email_queue,
				    sort keys %{delete $git_emails{$name}};
			}
		}
	}

	# One committer record: [ [email slugs...], [name slugs...] ].
	my @group_emails = sort keys %emails;
	my @group_names = sort keys %names;
	push @committers, [\@group_emails, \@group_names];
}
178*23cf27dbSMartin Matuska
# With the committers grouped, work out which ones need adding to AUTHORS.
for my $committer (@committers) {
	my ($emails, $names) = @$committer;

	# A committer with any email or name already present in the author
	# tables is left exactly as it is.
	my $known_by_email = grep { $authors_name{$_} } @$emails;
	my $known_by_name = grep { $authors_email{$_} } @$names;
	next if $known_by_email || $known_by_name;

	# Otherwise pick the "best" slug of each kind and record the pair.
	my $chosen_email = best_email(@$emails);
	my $chosen_name = best_name(@$names);

	$authors_email{$chosen_name} = $chosen_email;
	$authors_name{$chosen_email} = $chosen_name;
}
194*23cf27dbSMartin Matuska
# Now output the new AUTHORS file: the saved header first, then one line per
# contributor, ordered by name slug and printed in display form.
open my $fh, '>', 'AUTHORS' or die "E: couldn't open AUTHORS for write: $!\n";
#my $fh = \*STDOUT;
say $fh join("\n", @authors_header, "");
for my $name (sort keys %authors_email) {
	my $cname = $display_name{$name};
	my $cemail = $display_email{email_slug($authors_email{$name})};
	say $fh "    $cname <$cemail>";
}

# Output is buffered, so a failed write (full disk, I/O error) may only be
# reported when the handle is closed. Check it rather than exiting 0 with a
# truncated file.
close $fh or die "E: couldn't finish writing AUTHORS: $!\n";

exit 0;
206*23cf27dbSMartin Matuska
# "Slugs" are used as the hashtable key for names and emails. They are used to
# make two variants of a value be the "same" for matching. Mostly this is
209*23cf27dbSMartin Matuska# to make upper and lower-case versions of a name or email compare the same,
210*23cf27dbSMartin Matuska# but we do a little bit of munging to handle some common cases.
211*23cf27dbSMartin Matuska#
212*23cf27dbSMartin Matuska# Note that these are only used for matching internally; for display, the
213*23cf27dbSMartin Matuska# slug will be used to look up the display form.
# Return the matching key ("slug") for a name: the name with all whitespace
# and dots removed, lowercased. Dropping dots and spaces makes differently
# written initials compare equal.
sub name_slug {
	my ($name) = @_;

	(my $squashed = $name) =~ s/[\s\.]//g;

	return lc $squashed;
}
# Return the matching key ("slug") for an email address, lowercased.
#
# Surrounding text separated by whitespace is stripped first. Because the
# leading match is greedy, for single-line input this leaves whatever follows
# the last whitespace character. For Github noreply addresses the optional
# leading "userid+" is then dropped, so both forms produce the same slug.
sub email_slug {
	my ($addr) = @_;

	$addr =~ s/^(.*\s+)|(\s+.*)$//g;

	if ($addr =~ m/\.noreply\.github\.com$/) {
		$addr =~ s/^[^\+]*\+//g;
	}

	return lc $addr;
}
235*23cf27dbSMartin Matuska
# Offer $name as the display form for its slug in %display_name.
#
# The candidate is stored when the slug has no display form yet, or when it
# contains fewer lowercase ASCII letters and spaces than the stored one. The
# guess is that someone who went to the effort of specialising their name
# cares more about that form. If that guess is wrong, a .mailmap entry is
# probably the better fix.
sub update_display_name {
	my ($candidate) = @_;
	my $key = name_slug($candidate);

	my $current = $display_name{$key};
	my $replace = !$current
	    || ($candidate =~ tr/a-z //) < ($current =~ tr/a-z //);

	$display_name{$key} = $candidate if $replace;
}
# Offer $email as the display form for its slug in %display_email.
#
# Github noreply addresses lose their leading "userid+" part before being
# considered. As with names, the candidate is stored when nothing is stored
# yet, or when it has fewer lowercase ASCII letters and spaces than the
# stored form.
sub update_display_email {
	my ($candidate) = @_;

	# The slug is taken from the address as given, before any trimming.
	my $key = email_slug($candidate);

	if ($candidate =~ m/\.noreply\.github\.com$/) {
		$candidate =~ s/^[^\+]*\+//g;
	}

	my $current = $display_email{$key};
	my $replace = !$current
	    || ($candidate =~ tr/a-z //) < ($current =~ tr/a-z //);

	$display_email{$key} = $candidate if $replace;
}
266*23cf27dbSMartin Matuska
# Pick the "best" name slug from the list given as arguments.
#
# "Best" is very subjective; a plain string sort of the display forms gave
# good-enough results, so nothing cleverer is attempted. Accents, punctuation
# and capitals are probably signs of a "better" name, and the most recently
# seen name might deserve weight too, but really .mailmap is the place to
# control this.
#
# Returns the slug whose display name sorts first, or undef for an empty list.
sub best_name {
	my @ranked = sort { $display_name{$a} cmp $display_name{$b} } @_;

	return $ranked[0];
}
# Pick the "best" email slug from the list given as arguments.
#
# Candidates are ranked by these rules, each one only breaking ties left by
# the one before:
#   1. addresses with exactly one @ come before all others
#   2. internal/local addresses come last
#   3. Github noreply aliases come last
#   4. gmail/hotmail addresses come last
#   5. alphabetical by domain
#   6. alphabetical by local part
#
# Returns the top-ranked slug, or undef when called with no arguments.
sub best_email {
	state $internal_re = qr/\.(?:internal|local|\(none\))$/;
	state $noreply_re  = qr/\.noreply\.github\.com$/;
	state $freemail_re = qr/\@(?:gmail|hotmail)\.com$/;

	my @emails = sort {
		my $cmp;

		# prefer address with a single @ over those without
		$cmp = (($b =~ tr/@//) == 1) <=> (($a =~ tr/@//) == 1);
		return $cmp unless $cmp == 0;

		# prefer any address over internal/local addresses
		$cmp = (($a =~ $internal_re) <=> ($b =~ $internal_re));
		return $cmp unless $cmp == 0;

		# prefer any address over github noreply aliases
		$cmp = (($a =~ $noreply_re) <=> ($b =~ $noreply_re));
		return $cmp unless $cmp == 0;

		# prefer any address over freemail providers
		$cmp = (($a =~ $freemail_re) <=> ($b =~ $freemail_re));
		return $cmp unless $cmp == 0;

		# alphabetical by domain. An address without an @ has no
		# domain part (and an empty address has no local part), so
		# treat the missing piece as an empty string instead of
		# comparing undef, which would warn.
		my ($alocal, $adom) = split /\@/, $a;
		my ($blocal, $bdom) = split /\@/, $b;
		$cmp = (($adom // '') cmp ($bdom // ''));
		return $cmp unless $cmp == 0;

		# alphabetical by local part
		return (($alocal // '') cmp ($blocal // ''));
	} @_;

	return shift @emails;
}
323