xref: /netbsd-src/external/gpl3/gcc/dist/contrib/check-internal-format-escaping.py (revision b1e838363e3c6fc78a55519254d99869742dd33c)
1181254a7Smrg#!/usr/bin/env python3
2181254a7Smrg#
3fb8a8121Smrg# Check gcc.pot file for stylistic issues as described in
4fb8a8121Smrg# https://gcc.gnu.org/onlinedocs/gccint/Guidelines-for-Diagnostics.html,
5fb8a8121Smrg# especially in gcc-internal-format messages.
6181254a7Smrg#
7181254a7Smrg# This file is part of GCC.
8181254a7Smrg#
9181254a7Smrg# GCC is free software; you can redistribute it and/or modify it under
10181254a7Smrg# the terms of the GNU General Public License as published by the Free
11181254a7Smrg# Software Foundation; either version 3, or (at your option) any later
12181254a7Smrg# version.
13181254a7Smrg#
14181254a7Smrg# GCC is distributed in the hope that it will be useful, but WITHOUT ANY
15181254a7Smrg# WARRANTY; without even the implied warranty of MERCHANTABILITY or
16181254a7Smrg# FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
17181254a7Smrg# for more details.
18181254a7Smrg#
19181254a7Smrg# You should have received a copy of the GNU General Public License
20181254a7Smrg# along with GCC; see the file COPYING3.  If not see
21fb8a8121Smrg# <http://www.gnu.org/licenses/>.
22181254a7Smrg
23181254a7Smrgimport argparse
24181254a7Smrgimport re
25fb8a8121Smrgfrom collections import Counter
26fb8a8121Smrgfrom typing import Dict, Match
27181254a7Smrg
28fb8a8121Smrgimport polib
29fb8a8121Smrg
# Running tally of reported diagnostics, keyed by the diagnostic text.
# warn() increments it for every warning that is not suppressed, and
# main() prints the entries that occurred more than once as a summary.
seen_warnings = Counter()
31fb8a8121Smrg
32fb8a8121Smrg
def location(msg: polib.POEntry):
    """
    Returns the first source occurrence of the message, formatted as
    'file:line', or '<unknown location>' if the message has none.
    """

    if not msg.occurrences:
        return '<unknown location>'

    # Only the first occurrence is reported, even if there are several.
    first = msg.occurrences[0]
    source_file = first[0]
    line_number = first[1]
    return f'{source_file}:{line_number}'
38fb8a8121Smrg
39fb8a8121Smrg
def warn(msg: polib.POEntry,
         diagnostic_id: str, diagnostic: str, include_msgid=True):
    """
    Prints one diagnostic for the message and counts it in
    seen_warnings, unless the message suppresses this diagnostic.

    To suppress a warning for a particular message,
    add a line "#, gcclint:ignore:{diagnostic_id}" to the message.

    If include_msgid is true, the repr of the msgid is appended to the
    printed line.
    """

    suppression_flag = f'gcclint:ignore:{diagnostic_id}'
    if suppression_flag in msg.flags:
        return

    # The counter is keyed by the diagnostic text, not by its id.
    seen_warnings[diagnostic] += 1

    where = location(msg)
    line = (f'{where}: {diagnostic} in {repr(msg.msgid)}'
            if include_msgid
            else f'{where}: {diagnostic}')
    print(line)
56fb8a8121Smrg
57fb8a8121Smrg
def lint_gcc_internal_format(msg: polib.POEntry):
    """
    Applies all style checks to a single gcc-internal-format message.

    These messages use GCC-specific directives such as %qs,
    %<quotes%> and %q#E. Each nested check inspects the msgid (some
    also inspect the msgstr) and reports its findings through warn().
    """

    msgid: str = msg.msgid

    def outside_quotes(m: Match[str]):
        """
        Tells whether the match starts outside of %<...%>, judged by
        the number of %< and %> that precede it in the msgid.
        """
        preceding = msgid[:m.start(0)]
        opened = preceding.count('%<')
        closed = preceding.count('%>')
        return opened == closed

    def lint_matching_placeholders():
        """
        Warns when the literal %<...%> parts of the msgid and of the
        msgstr differ as sets. Such differences can creep in when a
        translation is copied from a similar message.

        Structurally equal messages, which invite this mistake, are
        reported by lint_diagnostics_differing_only_in_placeholders.

        Untranslated messages are skipped, so this check only has an
        effect on a finished translation such as de.po, not gcc.pot.
        """

        if msg.translated():
            quoted_literal = '%<[^%]+%>'
            source_quotes = re.findall(quoted_literal, msgid)
            translated_quotes = re.findall(quoted_literal, msg.msgstr)

            # Order and repetition are ignored, only the sets count.
            if set(source_quotes) != set(translated_quotes):
                warn(msg,
                     'placeholder-mismatch',
                     f'placeholder mismatch: msgid has {source_quotes}, '
                     f'msgstr has {translated_quotes}',
                     include_msgid=False)

    def lint_option_outside_quotes():
        """
        Warns about whitespace-separated words outside of quotes that
        look like a command line option (a dash followed by a letter,
        except for -INF) or that start with __builtin_.
        """

        for word in re.finditer(r'\S+', msgid):
            if not outside_quotes(word):
                continue

            text = word.group()
            if text.startswith('-'):
                dash_then_letter = len(text) >= 2 and text[1].isalpha()
                # -INF is a value, not an option.
                if dash_then_letter and text != '-INF':
                    warn(msg,
                         'option-outside-quotes',
                         'command line option outside %<quotes%>')
            elif text.startswith('__builtin_'):
                warn(msg,
                     'builtin-outside-quotes',
                     'builtin function outside %<quotes%>')

    def lint_plain_apostrophe():
        """
        Warns once for each apostrophe outside of quotes that is
        preceded by a character other than %.
        """

        unescaped = [m for m in re.finditer("[^%]'", msgid)
                     if outside_quotes(m)]
        for _ in unescaped:
            warn(msg, 'apostrophe', 'apostrophe without leading %')

    def lint_space_before_quote():
        """
        A missing space before %< is often the result of string
        literals that are joined by the C compiler where neither
        literal has a space to separate the words.

        A %< that directly follows %s is not reported.
        """

        for hit in re.finditer('(.?[a-zA-Z0-9])%<', msgid):
            if hit.group(1) == '%s':
                continue
            warn(msg,
                 'no-space-before-quote',
                 '%< directly following a letter or digit')

    def lint_underscore_outside_quotes():
        """
        An underscore outside of quotes is used in several contexts,
        and many of them violate the GCC Guidelines for Diagnostics:

        * names of GCC-internal compiler functions
        * names of GCC-internal data structures
        * static_cast and the like (which are legitimate)

        At most one warning is issued per message.
        """

        if any(outside_quotes(m) for m in re.finditer('_', msgid)):
            warn(msg,
                 'underscore-outside-quotes',
                 'underscore outside of %<quotes%>')

    def lint_may_not():
        """
        The term "may not" may either mean "it could be the case"
        or "should not". These two different meanings are sometimes
        hard to tell apart, so every use of the term is reported.
        """

        if re.search(r'\bmay not\b', msgid) is not None:
            warn(msg,
                 'ambiguous-may-not',
                 'the term "may not" is ambiguous')

    def lint_unbalanced_quotes():
        """
        Warns when the number of %< differs from the number of %>,
        first in the msgid and then, for translated messages, in the
        msgstr.
        """

        def unbalanced(text: str):
            return text.count('%<') != text.count('%>')

        if unbalanced(msgid):
            warn(msg,
                 'unbalanced-quotes',
                 'unbalanced %< and %> quotes')

        if msg.translated() and unbalanced(msg.msgstr):
            warn(msg,
                 'unbalanced-quotes',
                 'unbalanced %< and %> quotes')

    def lint_single_space_after_sentence():
        """
        After a sentence there should be two spaces. A period that is
        followed by a single space and an uppercase letter is reported.
        """

        if re.search(r'[.] [A-Z]', msgid) is not None:
            warn(msg,
                 'single-space-after-sentence',
                 'single space after sentence')

    def lint_non_canonical_quotes():
        """
        Catches %<%s%> and %s in plain quote characters, which can be
        written in the shorter form %qs. Only the first occurrence in
        the message is reported.
        """

        found = re.search("%<%s%>|'%s'|\"%s\"|`%s'", msgid)
        if found is None:
            return

        warn(msg,
             'non-canonical-quotes',
             f'placeholder {found.group()} should be written as %qs')

    # The order of the checks determines the order of the output.
    checks = (
        lint_option_outside_quotes,
        lint_plain_apostrophe,
        lint_space_before_quote,
        lint_underscore_outside_quotes,
        lint_may_not,
        lint_unbalanced_quotes,
        lint_matching_placeholders,
        lint_single_space_after_sentence,
        lint_non_canonical_quotes,
    )
    for check in checks:
        check()
206fb8a8121Smrg
207fb8a8121Smrg
def lint_diagnostics_differing_only_in_placeholders(po: polib.POFile):
    """
    Reports messages that have the same structure as an earlier
    message once every plain string inside %<quotes%> is replaced
    with %qs. Merging such messages prevents copy-and-paste mistakes
    by the translators.

    See bug 90119.
    """

    first_seen: Dict[str, polib.POEntry] = {}

    for entry in po:
        text = entry.msgid
        pattern = re.sub('%<[^%]+%>', '%qs', text)

        if pattern in first_seen:
            earlier = first_seen[pattern]
            warn(entry,
                 'same-pattern',
                 f'same pattern for {repr(text)} and '
                 f'{repr(earlier.msgid)} in {location(earlier)}',
                 include_msgid=False)
        else:
            # Remember the message under its pattern and under its
            # verbatim msgid, so that a later message whose pattern
            # equals this msgid is reported as well.
            first_seen[pattern] = entry
            first_seen[text] = entry
236fb8a8121Smrg
237fb8a8121Smrg
def lint_file(po: polib.POFile):
    """
    Runs all checks on the given file: first the per-message checks
    on every message that is neither obsolete nor fuzzy and carries
    the gcc-internal-format flag, then the check for messages that
    differ only in their placeholders.
    """

    for entry in po:
        if entry.obsolete or entry.fuzzy:
            continue
        if 'gcc-internal-format' not in entry.flags:
            continue
        lint_gcc_internal_format(entry)

    lint_diagnostics_differing_only_in_placeholders(po)
246fb8a8121Smrg    lint_diagnostics_differing_only_in_placeholders(po)
247fb8a8121Smrg
248fb8a8121Smrg
def main():
    """
    Lints the file named on the command line and then prints a
    summary of the diagnostics that were issued more than once.
    """

    cli = argparse.ArgumentParser(description='')
    cli.add_argument('file', help='pot file')
    options = cli.parse_args()

    lint_file(polib.pofile(options.file))

    print()
    print('summary:')
    # Most frequent diagnostics first; those seen only once are omitted.
    for diagnostic, count in seen_warnings.most_common():
        if count > 1:
            print(f'{count}\t{diagnostic}')
263181254a7Smrg
264fb8a8121Smrg
# Run the linter only when this file is executed as a script, so that
# importing it as a module has no side effects beyond the definitions.
if __name__ == '__main__':
    main()
267