#!/usr/bin/env python3
#
# Check gcc.pot file for stylistic issues as described in
# https://gcc.gnu.org/onlinedocs/gccint/Guidelines-for-Diagnostics.html,
# especially in gcc-internal-format messages.
#
# This file is part of GCC.
#
# GCC is free software; you can redistribute it and/or modify it under
# the terms of the GNU General Public License as published by the Free
# Software Foundation; either version 3, or (at your option) any later
# version.
#
# GCC is distributed in the hope that it will be useful, but WITHOUT ANY
# WARRANTY; without even the implied warranty of MERCHANTABILITY or
# FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
# for more details.
#
# You should have received a copy of the GNU General Public License
# along with GCC; see the file COPYING3. If not see
# <http://www.gnu.org/licenses/>.

import argparse
import re
from collections import Counter
from typing import Dict, Match

import polib

# How often each diagnostic text has been reported; printed by main()
# as the final summary.
seen_warnings = Counter()

# A literal text wrapped in %<quotes%>.  Shared by the placeholder
# comparison and by the "same pattern" detection so that both agree on
# what counts as a quoted literal.
_QUOTED_LITERAL_RE = re.compile('%<[^%]+%>')


def location(msg: polib.POEntry):
    """
    Returns the first recorded occurrence of the message in the form
    'file:line', or '<unknown location>' if it has no occurrences.
    """

    if msg.occurrences:
        occ = msg.occurrences[0]
        return f'{occ[0]}:{occ[1]}'
    return '<unknown location>'


def warn(msg: polib.POEntry,
         diagnostic_id: str, diagnostic: str, include_msgid=True):
    """
    Prints a diagnostic for the message and counts it in seen_warnings.

    The diagnostic is keyed by its text (not by diagnostic_id) in the
    counter.  If include_msgid is true, the repr of the msgid is
    appended to the printed line.

    To suppress a warning for a particular message,
    add a line "#, gcclint:ignore:{diagnostic_id}" to the message.
    """

    if f'gcclint:ignore:{diagnostic_id}' in msg.flags:
        return

    seen_warnings[diagnostic] += 1

    if include_msgid:
        print(f'{location(msg)}: {diagnostic} in {msg.msgid!r}')
    else:
        print(f'{location(msg)}: {diagnostic}')


def lint_gcc_internal_format(msg: polib.POEntry):
    """
    Checks a single message that has the gcc-internal-format. These
    messages use a variety of placeholders like %qs, %<quotes%> and
    %q#E.

    Every check reports its findings through warn(); nothing is
    returned.
    """

    msgid: str = msg.msgid

    def outside_quotes(m: Match[str]):
        """
        Tells whether the match starts outside of %<quotes%>, which is
        assumed when the text before the match contains as many %< as
        %>.
        """
        before = msgid[:m.start(0)]
        return before.count('%<') == before.count('%>')

    def lint_matching_placeholders():
        """
        Warns when literal values in placeholders are not exactly equal
        in the translation. This can happen when doing copy-and-paste
        translations of similar messages.

        To avoid these mismatches in the first place,
        structurally equal messages are found by
        lint_diagnostics_differing_only_in_placeholders.

        This check only applies when checking a finished translation
        such as de.po, not gcc.pot.
        """

        if not msg.translated():
            return

        in_msgid = _QUOTED_LITERAL_RE.findall(msgid)
        in_msgstr = _QUOTED_LITERAL_RE.findall(msg.msgstr)

        # Compared as sets: order and repetition of the quoted literals
        # may legitimately differ between the languages.
        if set(in_msgid) != set(in_msgstr):
            warn(msg,
                 'placeholder-mismatch',
                 f'placeholder mismatch: msgid has {in_msgid}, '
                 f'msgstr has {in_msgstr}',
                 include_msgid=False)

    def lint_option_outside_quotes():
        """
        Warns about words outside of %<quotes%> that look like a
        command line option (a dash followed by a letter, except for
        -INF) or that start with __builtin_.
        """

        for match in re.finditer(r'\S+', msgid):
            part = match.group()
            if not outside_quotes(match):
                continue

            looks_like_option = (len(part) >= 2
                                 and part.startswith('-')
                                 and part[1].isalpha()
                                 and part != '-INF')
            if looks_like_option:
                warn(msg,
                     'option-outside-quotes',
                     'command line option outside %<quotes%>')

            if part.startswith('__builtin_'):
                warn(msg,
                     'builtin-outside-quotes',
                     'builtin function outside %<quotes%>')

    def lint_plain_apostrophe():
        """
        Warns about each apostrophe outside of %<quotes%> that is not
        written as %'.
        """

        # A lookbehind is used instead of consuming the preceding
        # character, so that an apostrophe at the very beginning of the
        # message and the second of two adjacent apostrophes are found
        # as well, and so that outside_quotes sees everything up to the
        # apostrophe itself (including a directly preceding %< or %>).
        for match in re.finditer(r"(?<!%)'", msgid):
            if outside_quotes(match):
                warn(msg, 'apostrophe', 'apostrophe without leading %')

    def lint_space_before_quote():
        """
        A missing space before %< is often the result of string
        literals that are joined by the C compiler and neither literal
        has a space to separate the words.

        A %< that directly follows the placeholder %s is not reported.
        """

        for match in re.finditer('(.?[a-zA-Z0-9])%<', msgid):
            if match.group(1) != '%s':
                warn(msg,
                     'no-space-before-quote',
                     '%< directly following a letter or digit')

    def lint_underscore_outside_quotes():
        """
        An underscore outside of quotes is used in several contexts,
        and many of them violate the GCC Guidelines for Diagnostics:

        * names of GCC-internal compiler functions
        * names of GCC-internal data structures
        * static_cast and the like (which are legitimate)

        At most one warning is issued per message.
        """

        if any(outside_quotes(match)
               for match in re.finditer('_', msgid)):
            warn(msg,
                 'underscore-outside-quotes',
                 'underscore outside of %<quotes%>')

    def lint_may_not():
        """
        The term "may not" may either mean "it could be the case"
        or "should not". These two different meanings are sometimes
        hard to tell apart.
        """

        if re.search(r'\bmay not\b', msgid):
            warn(msg,
                 'ambiguous-may-not',
                 'the term "may not" is ambiguous')

    def lint_unbalanced_quotes():
        """
        Warns when the number of %< differs from the number of %>,
        in the msgid and, for translated messages, in the msgstr.
        """

        def balanced(text: str):
            return text.count('%<') == text.count('%>')

        if not balanced(msgid):
            warn(msg,
                 'unbalanced-quotes',
                 'unbalanced %< and %> quotes')

        if msg.translated() and not balanced(msg.msgstr):
            warn(msg,
                 'unbalanced-quotes',
                 'unbalanced %< and %> quotes')

    def lint_single_space_after_sentence():
        """
        After a sentence there should be two spaces.
        """

        if re.search(r'[.] [A-Z]', msgid):
            warn(msg,
                 'single-space-after-sentence',
                 'single space after sentence')

    def lint_non_canonical_quotes():
        """
        Catches %<%s%>, which can be written in the shorter form %qs.
        The same applies to %s in plain '', "" or `' quotes.  Only the
        first such placeholder of a message is reported.
        """
        match = re.search("%<%s%>|'%s'|\"%s\"|`%s'", msgid)
        if match:
            warn(msg,
                 'non-canonical-quotes',
                 f'placeholder {match.group()} should be written as %qs')

    lint_option_outside_quotes()
    lint_plain_apostrophe()
    lint_space_before_quote()
    lint_underscore_outside_quotes()
    lint_may_not()
    lint_unbalanced_quotes()
    lint_matching_placeholders()
    lint_single_space_after_sentence()
    lint_non_canonical_quotes()


def lint_diagnostics_differing_only_in_placeholders(po: polib.POFile):
    """
    Detects messages that are structurally the same, except that they
    use different plain strings inside %<quotes%>. These messages can
    be merged in order to prevent copy-and-paste mistakes by the
    translators.

    See bug 90119.
    """

    # Maps both the normalized and the verbatim msgid to the first
    # message that produced it.
    seen: Dict[str, polib.POEntry] = {}

    for msg in po:
        msg: polib.POEntry
        msgid = msg.msgid

        # Replace every quoted literal by %qs, so that messages that
        # differ only in these literals end up with the same key.
        normalized = _QUOTED_LITERAL_RE.sub('%qs', msgid)
        if normalized not in seen:
            seen[normalized] = msg
            seen[msgid] = msg
            continue

        prev = seen[normalized]
        warn(msg,
             'same-pattern',
             f'same pattern for {msgid!r} and '
             f'{prev.msgid!r} in {location(prev)}',
             include_msgid=False)


def lint_file(po: polib.POFile):
    """
    Runs the per-message checks on every message that is neither
    obsolete nor fuzzy and carries the gcc-internal-format flag, and
    then runs the cross-message check on the whole file.
    """

    for msg in po:
        msg: polib.POEntry

        if msg.obsolete or msg.fuzzy:
            continue
        if 'gcc-internal-format' in msg.flags:
            lint_gcc_internal_format(msg)

    lint_diagnostics_differing_only_in_placeholders(po)


def main():
    """
    Lints the file given on the command line and prints a summary of
    all diagnostics that were reported more than once.
    """

    parser = argparse.ArgumentParser(description='')
    parser.add_argument('file', help='pot file')

    args = parser.parse_args()

    po = polib.pofile(args.file)
    lint_file(po)

    print()
    print('summary:')
    for diagnostic, count in seen_warnings.most_common():
        # Diagnostics that occurred only once are left out of the
        # summary.
        if count > 1:
            print(f'{count}\t{diagnostic}')


if __name__ == '__main__':
    main()