xref: /netbsd-src/external/gpl3/gcc.old/dist/contrib/check-internal-format-escaping.py (revision 4c3eb207d36f67d31994830c0a694161fc1ca39b)
#!/usr/bin/env python3
#
# Check gcc.pot file for stylistic issues as described in
# https://gcc.gnu.org/onlinedocs/gccint/Guidelines-for-Diagnostics.html,
# especially in gcc-internal-format messages.
#
# This file is part of GCC.
#
# GCC is free software; you can redistribute it and/or modify it under
# the terms of the GNU General Public License as published by the Free
# Software Foundation; either version 3, or (at your option) any later
# version.
#
# GCC is distributed in the hope that it will be useful, but WITHOUT ANY
# WARRANTY; without even the implied warranty of MERCHANTABILITY or
# FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
# for more details.
#
# You should have received a copy of the GNU General Public License
# along with GCC; see the file COPYING3.  If not see
# <http://www.gnu.org/licenses/>.
22627f7eb2Smrg
23627f7eb2Smrgimport argparse
24627f7eb2Smrgimport re
25*4c3eb207Smrgfrom collections import Counter
26*4c3eb207Smrgfrom typing import Dict, Match
27627f7eb2Smrg
28*4c3eb207Smrgimport polib
29*4c3eb207Smrg
# Number of times each diagnostic text has been reported by warn();
# main() prints these totals as the summary.
seen_warnings = Counter()
31*4c3eb207Smrg
32*4c3eb207Smrg
def location(msg: polib.POEntry) -> str:
    """
    Returns where a message comes from, formatted as 'file:line'.

    Only the first entry of msg.occurrences is used. A message without
    any recorded occurrence yields the fixed text '<unknown location>'.
    """

    if not msg.occurrences:
        return '<unknown location>'

    first = msg.occurrences[0]
    return f'{first[0]}:{first[1]}'
38*4c3eb207Smrg
39*4c3eb207Smrg
def warn(msg: polib.POEntry,
         diagnostic_id: str, diagnostic: str, include_msgid=True):
    """
    Prints a diagnostic for a message and counts it in seen_warnings.

    To suppress a warning for a particular message,
    add a line "#, gcclint:ignore:{diagnostic_id}" to the message.

    msg: the message the diagnostic is about; its first occurrence is
        printed as the location.
    diagnostic_id: short identifier, only used to look up the
        suppression flag.
    diagnostic: the text to print; also the key under which the
        warning is counted.
    include_msgid: whether to append the repr of msg.msgid to the
        printed line.
    """

    if f'gcclint:ignore:{diagnostic_id}' in msg.flags:
        return

    seen_warnings[diagnostic] += 1

    line = f'{location(msg)}: {diagnostic}'
    if include_msgid:
        line += f' in {repr(msg.msgid)}'
    print(line)
56*4c3eb207Smrg
57*4c3eb207Smrg
def lint_gcc_internal_format(msg: polib.POEntry):
    """
    Checks a single message that has the gcc-internal-format. These
    messages use a variety of placeholders like %qs, %<quotes%> and
    %q#E.

    Every finding is reported through warn(); nothing is returned.
    """

    msgid: str = msg.msgid

    def outside_quotes(m: Match[str]):
        """
        Tells whether the match starts outside of %<quotes%>, judged by
        comparing the number of %< and %> in the text before the match.
        """

        before = msgid[:m.start(0)]
        return before.count("%<") == before.count("%>")

    def lint_matching_placeholders():
        """
        Warns when literal values in placeholders are not exactly equal
        in the translation. This can happen when doing copy-and-paste
        translations of similar messages.

        To avoid these mismatches in the first place,
        structurally equal messages are found by
        lint_diagnostics_differing_only_in_placeholders.

        This check only applies when checking a finished translation
        such as de.po, not gcc.pot.
        """

        if not msg.translated():
            return

        in_msgid = re.findall('%<[^%]+%>', msgid)
        in_msgstr = re.findall('%<[^%]+%>', msg.msgstr)

        # Compared as sets: order and repetition of the quoted
        # literals may differ between msgid and msgstr.
        if set(in_msgid) != set(in_msgstr):
            warn(msg,
                 'placeholder-mismatch',
                 f'placeholder mismatch: msgid has {in_msgid}, '
                 f'msgstr has {in_msgstr}',
                 include_msgid=False)

    def lint_option_outside_quotes():
        """
        Warns about whitespace-separated words outside of %<quotes%>
        that look like a command line option (a dash followed by a
        letter) or that start with __builtin_.
        """

        for match in re.finditer(r'\S+', msgid):
            part = match.group()
            if not outside_quotes(match):
                continue

            if part.startswith('-'):
                if len(part) >= 2 and part[1].isalpha():
                    # -INF is the only dash-word that is exempted.
                    if part == '-INF':
                        continue

                    warn(msg,
                         'option-outside-quotes',
                         'command line option outside %<quotes%>')

            if part.startswith('__builtin_'):
                warn(msg,
                     'builtin-outside-quotes',
                     'builtin function outside %<quotes%>')

    def lint_plain_apostrophe():
        """
        Warns about apostrophes outside of %<quotes%> that are not
        preceded by a percent sign.
        """

        # The lookbehind keeps the match positioned on the apostrophe
        # itself. Matching the preceding character as well would hide a
        # directly preceding %< or %> from outside_quotes and would
        # never find an apostrophe at the very start of the message.
        for match in re.finditer(r"(?<!%)'", msgid):
            if outside_quotes(match):
                warn(msg, 'apostrophe', 'apostrophe without leading %')

    def lint_space_before_quote():
        """
        A missing space before %< is often the result of string
        literals that are joined by the C compiler and neither literal
        has a space to separate the words.
        """

        for match in re.finditer("(.?[a-zA-Z0-9])%<", msgid):
            # A preceding %s placeholder is not a word, so it is fine.
            if match.group(1) != '%s':
                warn(msg,
                     'no-space-before-quote',
                     '%< directly following a letter or digit')

    def lint_underscore_outside_quotes():
        """
        An underscore outside of quotes is used in several contexts,
        and many of them violate the GCC Guidelines for Diagnostics:

        * names of GCC-internal compiler functions
        * names of GCC-internal data structures
        * static_cast and the like (which are legitimate)
        """

        for match in re.finditer("_", msgid):
            if outside_quotes(match):
                warn(msg,
                     'underscore-outside-quotes',
                     'underscore outside of %<quotes%>')
                # One warning per message is enough.
                return

    def lint_may_not():
        """
        The term "may not" may either mean "it could be the case"
        or "should not". These two different meanings are sometimes
        hard to tell apart.
        """

        if re.search(r'\bmay not\b', msgid):
            warn(msg,
                 'ambiguous-may-not',
                 'the term "may not" is ambiguous')

    def lint_unbalanced_quotes():
        """
        Warns when the number of %< differs from the number of %>,
        in the msgid and, for translated messages, in the msgstr.
        """

        if msgid.count("%<") != msgid.count("%>"):
            warn(msg,
                 'unbalanced-quotes',
                 'unbalanced %< and %> quotes')

        if msg.translated():
            if msg.msgstr.count("%<") != msg.msgstr.count("%>"):
                warn(msg,
                     'unbalanced-quotes',
                     'unbalanced %< and %> quotes')

    def lint_single_space_after_sentence():
        """
        After a sentence there should be two spaces.
        """

        if re.search(r'[.] [A-Z]', msgid):
            warn(msg,
                 'single-space-after-sentence',
                 'single space after sentence')

    def lint_non_canonical_quotes():
        """
        Catches %<%s%>, which can be written in the shorter form %qs.
        The same goes for %s in plain ASCII quotes.
        """

        match = re.search("%<%s%>|'%s'|\"%s\"|`%s'", msgid)
        if match:
            warn(msg,
                 'non-canonical-quotes',
                 f'placeholder {match.group()} should be written as %qs')

    lint_option_outside_quotes()
    lint_plain_apostrophe()
    lint_space_before_quote()
    lint_underscore_outside_quotes()
    lint_may_not()
    lint_unbalanced_quotes()
    lint_matching_placeholders()
    lint_single_space_after_sentence()
    lint_non_canonical_quotes()
206*4c3eb207Smrg
207*4c3eb207Smrg
def lint_diagnostics_differing_only_in_placeholders(po: polib.POFile):
    """
    Detects messages that are structurally the same, except that they
    use different plain strings inside %<quotes%>. These messages can
    be merged in order to prevent copy-and-paste mistakes by the
    translators.

    Obsolete entries are skipped, they cannot be merged with anything.

    See bug 90119.
    """

    # Maps both the normalized and the verbatim msgid to the first
    # message in which that pattern was seen.
    seen: Dict[str, polib.POEntry] = {}

    for msg in po:
        msg: polib.POEntry

        if msg.obsolete:
            continue

        msgid = msg.msgid

        # Every quoted literal is replaced by %qs, so that messages
        # differing only in these literals end up with the same key.
        normalized = re.sub('%<[^%]+%>', '%qs', msgid)
        if normalized not in seen:
            seen[normalized] = msg
            # Registering the verbatim msgid as well lets a later
            # message be matched against one that already uses %qs.
            seen[msgid] = msg
            continue

        prev = seen[normalized]
        warn(msg,
             'same-pattern',
             f'same pattern for {repr(msgid)} and '
             f'{repr(prev.msgid)} in {location(prev)}',
             include_msgid=False)
236*4c3eb207Smrg
237*4c3eb207Smrg
def lint_file(po: polib.POFile):
    """
    Runs all checks on the messages of a PO or POT file.

    The per-message checks only look at messages flagged as
    gcc-internal-format that are neither obsolete nor fuzzy. The
    check for structurally equal messages is given the whole file.
    """

    for msg in po:
        msg: polib.POEntry

        if msg.obsolete or msg.fuzzy:
            continue

        if 'gcc-internal-format' in msg.flags:
            lint_gcc_internal_format(msg)

    lint_diagnostics_differing_only_in_placeholders(po)
247*4c3eb207Smrg
248*4c3eb207Smrg
def main():
    """
    Lints the file named on the command line and prints a summary.

    The summary lists how often each diagnostic text was reported,
    most frequent first. Texts that were reported only once are left
    out of the summary.
    """

    parser = argparse.ArgumentParser(description='')
    parser.add_argument('file', help='pot file')

    args = parser.parse_args()

    po = polib.pofile(args.file)
    lint_file(po)

    print()
    print('summary:')
    for diagnostic, count in seen_warnings.most_common():
        if count > 1:
            print(f'{count}\t{diagnostic}')
263627f7eb2Smrg
264*4c3eb207Smrg
# Only run the linter when executed as a script, not when imported.
if __name__ == '__main__':
    main()
267