#!/usr/bin/env python3
#
# Check gcc.pot file for stylistic issues as described in
# https://gcc.gnu.org/onlinedocs/gccint/Guidelines-for-Diagnostics.html,
# especially in gcc-internal-format messages.
#
# This file is part of GCC.
#
# GCC is free software; you can redistribute it and/or modify it under
# the terms of the GNU General Public License as published by the Free
# Software Foundation; either version 3, or (at your option) any later
# version.
#
# GCC is distributed in the hope that it will be useful, but WITHOUT ANY
# WARRANTY; without even the implied warranty of MERCHANTABILITY or
# FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
# for more details.
#
# You should have received a copy of the GNU General Public License
# along with GCC; see the file COPYING3. If not see
# <http://www.gnu.org/licenses/>.

import argparse
import re
from collections import Counter
from typing import Dict, Match

import polib

# Number of times each diagnostic text has been reported; printed as a
# summary at the end of main().
seen_warnings = Counter()

# A literal string enclosed in %<...%> that contains no further '%'.
_QUOTED_LITERAL_RE = re.compile(r'%<[^%]+%>')


def location(msg: polib.POEntry) -> str:
    """
    Returns "file:line" for the first occurrence recorded for the
    message, or '<unknown location>' if it has no occurrences.
    """

    if msg.occurrences:
        occ = msg.occurrences[0]
        return f'{occ[0]}:{occ[1]}'
    return '<unknown location>'


def warn(msg: polib.POEntry,
         diagnostic_id: str, diagnostic: str, include_msgid=True) -> None:
    """
    Prints a diagnostic for the message and counts it in seen_warnings.

    If include_msgid is true, the repr of the msgid is appended to the
    printed line; callers that already embed it in the diagnostic text
    pass False.

    To suppress a warning for a particular message,
    add a line "#, gcclint:ignore:{diagnostic_id}" to the message.
    """

    if f'gcclint:ignore:{diagnostic_id}' in msg.flags:
        return

    # Counted by the diagnostic text, not by diagnostic_id.
    seen_warnings[diagnostic] += 1

    if include_msgid:
        print(f'{location(msg)}: {diagnostic} in {repr(msg.msgid)}')
    else:
        print(f'{location(msg)}: {diagnostic}')


def lint_gcc_internal_format(msg: polib.POEntry) -> None:
    """
    Checks a single message that has the gcc-internal-format.  These
    messages use a variety of placeholders like %qs, %<quotes%> and
    %q#E.
    """

    msgid: str = msg.msgid

    def outside_quotes(m: Match[str]) -> bool:
        """
        Returns whether the text before the start of the match has as
        many %< as %>, that is, whether the match starts outside of
        any open %<quotes%>.
        """

        before = msgid[:m.start(0)]
        return before.count("%<") == before.count("%>")

    def lint_matching_placeholders():
        """
        Warns when literal values in placeholders are not exactly equal
        in the translation.  This can happen when doing copy-and-paste
        translations of similar messages.

        To avoid these mismatches in the first place,
        structurally equal messages are found by
        lint_diagnostics_differing_only_in_placeholders.

        This check only applies when checking a finished translation
        such as de.po, not gcc.pot.
        """

        if not msg.translated():
            return

        in_msgid = _QUOTED_LITERAL_RE.findall(msgid)
        in_msgstr = _QUOTED_LITERAL_RE.findall(msg.msgstr)

        # Compared as sets: order and repetition are not significant.
        if set(in_msgid) != set(in_msgstr):
            warn(msg,
                 'placeholder-mismatch',
                 f'placeholder mismatch: msgid has {in_msgid}, '
                 f'msgstr has {in_msgstr}',
                 include_msgid=False)

    def lint_option_outside_quotes():
        """
        Warns about whitespace-separated words outside of %<quotes%>
        that look like a command line option (a '-' followed by a
        letter, except for the literal word -INF) or that start with
        __builtin_.
        """

        for match in re.finditer(r'\S+', msgid):
            part = match.group()
            if not outside_quotes(match):
                continue

            if part.startswith('-'):
                if len(part) >= 2 and part[1].isalpha():
                    if part == '-INF':
                        continue

                    warn(msg,
                         'option-outside-quotes',
                         'command line option outside %<quotes%>')

            if part.startswith('__builtin_'):
                warn(msg,
                     'builtin-outside-quotes',
                     'builtin function outside %<quotes%>')

    def lint_plain_apostrophe():
        """
        Warns about each apostrophe outside of %<quotes%> that is not
        written as %'.
        """

        # The lookbehind keeps the match on the apostrophe itself, so
        # that outside_quotes() judges the apostrophe's own position
        # (an apostrophe right after %< is inside the quotes, one right
        # after %> is outside) and an apostrophe at the very start of
        # the message is found as well.
        for match in re.finditer(r"(?<!%)'", msgid):
            if outside_quotes(match):
                warn(msg, 'apostrophe', 'apostrophe without leading %')

    def lint_space_before_quote():
        """
        A missing space before %< is often the result of string
        literals that are joined by the C compiler and neither literal
        has a space to separate the words.
        """

        for match in re.finditer(r"(.?[a-zA-Z0-9])%<", msgid):
            # A %s placeholder directly before %< is not reported.
            if match.group(1) != '%s':
                warn(msg,
                     'no-space-before-quote',
                     '%< directly following a letter or digit')

    def lint_underscore_outside_quotes():
        """
        An underscore outside of quotes is used in several contexts,
        and many of them violate the GCC Guidelines for Diagnostics:

        * names of GCC-internal compiler functions
        * names of GCC-internal data structures
        * static_cast and the like (which are legitimate)
        """

        for match in re.finditer("_", msgid):
            if outside_quotes(match):
                warn(msg,
                     'underscore-outside-quotes',
                     'underscore outside of %<quotes%>')
                # One warning per message is enough.
                return

    def lint_may_not():
        """
        The term "may not" may either mean "it could be the case"
        or "should not".  These two different meanings are sometimes
        hard to tell apart.
        """

        if re.search(r'\bmay not\b', msgid):
            warn(msg,
                 'ambiguous-may-not',
                 'the term "may not" is ambiguous')

    def lint_unbalanced_quotes():
        """
        Warns when the number of %< differs from the number of %>,
        in the msgid and, for translated messages, in the msgstr.
        """

        if msgid.count("%<") != msgid.count("%>"):
            warn(msg,
                 'unbalanced-quotes',
                 'unbalanced %< and %> quotes')

        if msg.translated():
            if msg.msgstr.count("%<") != msg.msgstr.count("%>"):
                warn(msg,
                     'unbalanced-quotes',
                     'unbalanced %< and %> quotes')

    def lint_single_space_after_sentence():
        """
        After a sentence there should be two spaces.
        """

        if re.search(r'[.] [A-Z]', msgid):
            warn(msg,
                 'single-space-after-sentence',
                 'single space after sentence')

    def lint_non_canonical_quotes():
        """
        Catches %<%s%>, which can be written in the shorter form %qs.
        The same applies to %s in plain ASCII quotes.
        """

        match = re.search("%<%s%>|'%s'|\"%s\"|`%s'", msgid)
        if match:
            warn(msg,
                 'non-canonical-quotes',
                 f'placeholder {match.group()} should be written as %qs')

    lint_option_outside_quotes()
    lint_plain_apostrophe()
    lint_space_before_quote()
    lint_underscore_outside_quotes()
    lint_may_not()
    lint_unbalanced_quotes()
    lint_matching_placeholders()
    lint_single_space_after_sentence()
    lint_non_canonical_quotes()


def lint_diagnostics_differing_only_in_placeholders(po: polib.POFile) -> None:
    """
    Detects messages that are structurally the same, except that they
    use different plain strings inside %<quotes%>.  These messages can
    be merged in order to prevent copy-and-paste mistakes by the
    translators.

    See bug 90119.
    """

    seen: Dict[str, polib.POEntry] = {}

    for msg in po:
        msg: polib.POEntry
        msgid = msg.msgid

        normalized = _QUOTED_LITERAL_RE.sub('%qs', msgid)
        if normalized not in seen:
            # Remember the message under both forms, so that a later
            # message matches if its normalized form equals either the
            # normalized or the literal text of this one.
            seen[normalized] = msg
            seen[msgid] = msg
            continue

        prev = seen[normalized]
        warn(msg,
             'same-pattern',
             f'same pattern for {repr(msgid)} and '
             f'{repr(prev.msgid)} in {location(prev)}',
             include_msgid=False)


def lint_file(po: polib.POFile) -> None:
    """
    Runs the per-message checks on every message that is neither
    obsolete nor fuzzy and carries the gcc-internal-format flag, then
    runs the whole-file check for messages with the same pattern.
    """

    for msg in po:
        msg: polib.POEntry

        if not msg.obsolete and not msg.fuzzy:
            if 'gcc-internal-format' in msg.flags:
                lint_gcc_internal_format(msg)

    lint_diagnostics_differing_only_in_placeholders(po)


def main() -> None:
    """
    Lints the file named on the command line and prints a summary of
    the diagnostics that were reported more than once, most frequent
    first.
    """

    parser = argparse.ArgumentParser(description='')
    parser.add_argument('file', help='pot file')

    args = parser.parse_args()

    po = polib.pofile(args.file)
    lint_file(po)

    print()
    print('summary:')
    for entry in seen_warnings.most_common():
        # Diagnostics that occurred only once are left out of the summary.
        if entry[1] > 1:
            print(f'{entry[1]}\t{entry[0]}')


if __name__ == '__main__':
    main()