1 //===-- Int type specifier converters for scanf -----------------*- C++ -*-===// 2 // 3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 4 // See https://llvm.org/LICENSE.txt for license information. 5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception 6 // 7 //===----------------------------------------------------------------------===// 8 9 #include "src/stdio/scanf_core/int_converter.h" 10 11 #include "src/__support/CPP/limits.h" 12 #include "src/__support/ctype_utils.h" 13 #include "src/__support/macros/config.h" 14 #include "src/stdio/scanf_core/converter_utils.h" 15 #include "src/stdio/scanf_core/core_structs.h" 16 #include "src/stdio/scanf_core/reader.h" 17 18 #include <stddef.h> 19 20 namespace LIBC_NAMESPACE_DECL { 21 namespace scanf_core { 22 23 // This code is very similar to the code in __support/str_to_integer.h but is 24 // not quite the same. Here is the list of differences and why they exist: 25 // 1) This takes a reader and a format section instead of a char* and the base. 26 // This should be fairly self explanatory. While the char* could be adapted 27 // to a reader and the base could be calculated ahead of time, the 28 // semantics are slightly different, specifically a char* can be indexed 29 // freely (I can read str[2] and then str[0]) whereas a File (which the 30 // reader may contain) cannot. 31 // 2) Because this uses a Reader, this function can only unget once. 32 // This is relevant because scanf specifies it reads the "longest sequence 33 // of input characters which does not exceed any specified field width and 34 // which is, or is a prefix of, a matching input sequence." Whereas the 35 // strtol function accepts "the longest initial subsequence of the input 36 // string (...) that is of the expected form." This is demonstrated by the 37 // differences in how they deal with the string "0xZZZ" when parsing as 38 // hexadecimal. Scanf will read the "0x" as a valid prefix and return 0, 39 // since it reads the first 'Z', sees that it's not a valid hex digit, and 40 // reverses one character. The strtol function on the other hand only 41 // accepts the "0" since that's the longest valid hexadecimal sequence. It 42 // sees the 'Z' after the "0x" and determines that this is not the prefix 43 // to a valid hex string. 44 // 3) This conversion may have a maximum width. 45 // If a maximum width is specified, this conversion is only allowed to 46 // accept a certain number of characters. Strtol doesn't have any such 47 // limitation. 48 int convert_int(Reader *reader, const FormatSection &to_conv) { 49 // %d "Matches an optionally signed decimal integer [...] with the value 10 50 // for the base argument. The corresponding argument shall be a pointer to 51 // signed integer." 52 53 // %i "Matches an optionally signed integer [...] with the value 0 for the 54 // base argument. The corresponding argument shall be a pointer to signed 55 // integer." 56 57 // %u "Matches an optionally signed decimal integer [...] with the value 10 58 // for the base argument. The corresponding argument shall be a pointer to 59 // unsigned integer" 60 61 // %o "Matches an optionally signed octal integer [...] with the value 8 for 62 // the base argument. The corresponding argument shall be a pointer to 63 // unsigned integer" 64 65 // %x/X "Matches an optionally signed hexadecimal integer [...] with the value 66 // 16 for the base argument. The corresponding argument shall be a pointer to 67 // unsigned integer" 68 69 size_t max_width = cpp::numeric_limits<size_t>::max(); 70 if (to_conv.max_width > 0) { 71 max_width = to_conv.max_width; 72 } 73 74 uintmax_t result = 0; 75 bool is_number = false; 76 bool is_signed = false; 77 int base = 0; 78 if (to_conv.conv_name == 'i') { 79 base = 0; 80 is_signed = true; 81 } else if (to_conv.conv_name == 'o') { 82 base = 8; 83 } else if (internal::tolower(to_conv.conv_name) == 'x' || 84 to_conv.conv_name == 'p') { 85 base = 16; 86 } else if (to_conv.conv_name == 'd') { 87 base = 10; 88 is_signed = true; 89 } else { // conv_name must be 'u' 90 base = 10; 91 } 92 93 char cur_char = reader->getc(); 94 95 char result_sign = '+'; 96 if (cur_char == '+' || cur_char == '-') { 97 result_sign = cur_char; 98 if (max_width > 1) { 99 --max_width; 100 cur_char = reader->getc(); 101 } else { 102 // If the max width has been hit already, then the return value must be 0 103 // since no actual digits of the number have been parsed yet. 104 write_int_with_length(0, to_conv); 105 return MATCHING_FAILURE; 106 } 107 } 108 const bool is_negative = result_sign == '-'; 109 110 // Base of 0 means automatically determine the base. Base of 16 may have a 111 // prefix of "0x" 112 if (base == 0 || base == 16) { 113 // If the first character is 0, then it could be octal or hex. 114 if (cur_char == '0') { 115 is_number = true; 116 117 // Read the next character to check. 118 if (max_width > 1) { 119 --max_width; 120 cur_char = reader->getc(); 121 } else { 122 write_int_with_length(0, to_conv); 123 return READ_OK; 124 } 125 126 if (internal::tolower(cur_char) == 'x') { 127 // This is a valid hex prefix. 128 129 is_number = false; 130 // A valid hex prefix is not necessarily a valid number. For the 131 // conversion to be valid it needs to use all of the characters it 132 // consumes. From the standard: 133 // 7.23.6.2 paragraph 9: "An input item is defined as the longest 134 // sequence of input characters which does not exceed any specified 135 // field width and which is, or is a prefix of, a matching input 136 // sequence." 137 // 7.23.6.2 paragraph 10: "If the input item is not a matching sequence, 138 // the execution of the directive fails: this condition is a matching 139 // failure" 140 base = 16; 141 if (max_width > 1) { 142 --max_width; 143 cur_char = reader->getc(); 144 } else { 145 return MATCHING_FAILURE; 146 } 147 148 } else { 149 if (base == 0) { 150 base = 8; 151 } 152 } 153 } else if (base == 0) { 154 if (internal::isdigit(cur_char)) { 155 // If the first character is a different number, then it's 10. 156 base = 10; 157 } else { 158 // If the first character isn't a valid digit, then there are no valid 159 // digits at all. The number is 0. 160 reader->ungetc(cur_char); 161 write_int_with_length(0, to_conv); 162 return MATCHING_FAILURE; 163 } 164 } 165 } 166 167 constexpr uintmax_t UNSIGNED_MAX = cpp::numeric_limits<uintmax_t>::max(); 168 constexpr uintmax_t SIGNED_MAX = 169 static_cast<uintmax_t>(cpp::numeric_limits<intmax_t>::max()); 170 constexpr uintmax_t NEGATIVE_SIGNED_MAX = 171 static_cast<uintmax_t>(cpp::numeric_limits<intmax_t>::max()) + 1; 172 173 const uintmax_t MAX = 174 (is_signed ? (is_negative ? NEGATIVE_SIGNED_MAX : SIGNED_MAX) 175 : UNSIGNED_MAX); 176 177 const uintmax_t max_div_by_base = MAX / base; 178 179 if (internal::isalnum(cur_char) && 180 internal::b36_char_to_int(cur_char) < base) { 181 is_number = true; 182 } 183 184 bool has_overflow = false; 185 size_t i = 0; 186 for (; i < max_width && internal::isalnum(cur_char) && 187 internal::b36_char_to_int(cur_char) < base; 188 ++i, cur_char = reader->getc()) { 189 190 uintmax_t cur_digit = internal::b36_char_to_int(cur_char); 191 192 if (result == MAX) { 193 has_overflow = true; 194 continue; 195 } else if (result > max_div_by_base) { 196 result = MAX; 197 has_overflow = true; 198 } else { 199 result = result * base; 200 } 201 202 if (result > MAX - cur_digit) { 203 result = MAX; 204 has_overflow = true; 205 } else { 206 result = result + cur_digit; 207 } 208 } 209 210 // We always read one more character than will be used, so we have to put the 211 // last one back. 212 reader->ungetc(cur_char); 213 214 if (!is_number) 215 return MATCHING_FAILURE; 216 217 if (has_overflow) { 218 write_int_with_length(MAX, to_conv); 219 } else { 220 if (is_negative) 221 result = -result; 222 223 write_int_with_length(result, to_conv); 224 } 225 226 return READ_OK; 227 } 228 229 } // namespace scanf_core 230 } // namespace LIBC_NAMESPACE_DECL 231