17a129f07SMichael Jones //===-- Format string parser for scanf -------------------------*- C++ -*-===// 27a129f07SMichael Jones // 37a129f07SMichael Jones // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 47a129f07SMichael Jones // See https://llvm.org/LICENSE.txt for license information. 57a129f07SMichael Jones // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception 67a129f07SMichael Jones // 77a129f07SMichael Jones //===----------------------------------------------------------------------===// 87a129f07SMichael Jones 97a129f07SMichael Jones #ifndef LLVM_LIBC_SRC_STDIO_SCANF_CORE_PARSER_H 107a129f07SMichael Jones #define LLVM_LIBC_SRC_STDIO_SCANF_CORE_PARSER_H 117a129f07SMichael Jones 127a129f07SMichael Jones #include "src/__support/arg_list.h" 13e0be78beSJoseph Huber #include "src/__support/ctype_utils.h" 14*5ff3ff33SPetr Hosek #include "src/__support/macros/config.h" 15e0be78beSJoseph Huber #include "src/__support/str_to_integer.h" 167a129f07SMichael Jones #include "src/stdio/scanf_core/core_structs.h" 177a129f07SMichael Jones #include "src/stdio/scanf_core/scanf_config.h" 187a129f07SMichael Jones 197a129f07SMichael Jones #include <stddef.h> 207a129f07SMichael Jones 21*5ff3ff33SPetr Hosek namespace LIBC_NAMESPACE_DECL { 227a129f07SMichael Jones namespace scanf_core { 237a129f07SMichael Jones 24e0be78beSJoseph Huber #ifndef LIBC_COPT_SCANF_DISABLE_INDEX_MODE 25e0be78beSJoseph Huber #define GET_ARG_VAL_SIMPLEST(arg_type, index) get_arg_value<arg_type>(index) 26e0be78beSJoseph Huber #else 27e0be78beSJoseph Huber #define GET_ARG_VAL_SIMPLEST(arg_type, _) get_next_arg_value<arg_type>() 28e0be78beSJoseph Huber #endif // LIBC_COPT_SCANF_DISABLE_INDEX_MODE 29e0be78beSJoseph Huber 30e0be78beSJoseph Huber template <typename ArgProvider> class Parser { 317a129f07SMichael Jones const char *__restrict str; 327a129f07SMichael Jones 337a129f07SMichael Jones size_t cur_pos = 0; 34e0be78beSJoseph Huber ArgProvider args_cur; 357a129f07SMichael Jones 36c3228714SGuillaume Chatelet #ifndef LIBC_COPT_SCANF_DISABLE_INDEX_MODE 377a129f07SMichael Jones // args_start stores the start of the va_args, which is used when a previous 387a129f07SMichael Jones // argument is needed. In that case, we have to read the arguments from the 397a129f07SMichael Jones // beginning since they don't support reading backwards. 40e0be78beSJoseph Huber ArgProvider args_start; 417a129f07SMichael Jones size_t args_index = 1; 42c3228714SGuillaume Chatelet #endif // LIBC_COPT_SCANF_DISABLE_INDEX_MODE 437a129f07SMichael Jones 447a129f07SMichael Jones public: 45c3228714SGuillaume Chatelet #ifndef LIBC_COPT_SCANF_DISABLE_INDEX_MODE 46494734b0SSiva Chandra Reddy LIBC_INLINE Parser(const char *__restrict new_str, internal::ArgList &args) 477a129f07SMichael Jones : str(new_str), args_cur(args), args_start(args) {} 487a129f07SMichael Jones #else 49494734b0SSiva Chandra Reddy LIBC_INLINE Parser(const char *__restrict new_str, internal::ArgList &args) 507a129f07SMichael Jones : str(new_str), args_cur(args) {} 51c3228714SGuillaume Chatelet #endif // LIBC_COPT_SCANF_DISABLE_INDEX_MODE 527a129f07SMichael Jones 537a129f07SMichael Jones // get_next_section will parse the format string until it has a fully 547a129f07SMichael Jones // specified format section. This can either be a raw format section with no 557a129f07SMichael Jones // conversion, or a format section with a conversion that has all of its 567a129f07SMichael Jones // variables stored in the format section. 57e0be78beSJoseph Huber LIBC_INLINE FormatSection get_next_section() { 58e0be78beSJoseph Huber FormatSection section; 59e0be78beSJoseph Huber size_t starting_pos = cur_pos; 60e0be78beSJoseph Huber if (str[cur_pos] == '%') { 61e0be78beSJoseph Huber // format section 62e0be78beSJoseph Huber section.has_conv = true; 63e0be78beSJoseph Huber 64e0be78beSJoseph Huber ++cur_pos; 65e0be78beSJoseph Huber [[maybe_unused]] size_t conv_index = 0; 66e0be78beSJoseph Huber 67e0be78beSJoseph Huber #ifndef LIBC_COPT_SCANF_DISABLE_INDEX_MODE 68e0be78beSJoseph Huber conv_index = parse_index(&cur_pos); 69e0be78beSJoseph Huber #endif // LIBC_COPT_SCANF_DISABLE_INDEX_MODE 70e0be78beSJoseph Huber 71e0be78beSJoseph Huber if (str[cur_pos] == '*') { 72e0be78beSJoseph Huber ++cur_pos; 73e0be78beSJoseph Huber section.flags = FormatFlags::NO_WRITE; 74e0be78beSJoseph Huber } 75e0be78beSJoseph Huber 76e0be78beSJoseph Huber // handle width 77e0be78beSJoseph Huber section.max_width = -1; 78e0be78beSJoseph Huber if (internal::isdigit(str[cur_pos])) { 79e0be78beSJoseph Huber auto result = internal::strtointeger<int>(str + cur_pos, 10); 80e0be78beSJoseph Huber section.max_width = result.value; 81e0be78beSJoseph Huber cur_pos = cur_pos + result.parsed_len; 82e0be78beSJoseph Huber } 83e0be78beSJoseph Huber 84e0be78beSJoseph Huber // TODO(michaelrj): add posix allocate flag support. 85e0be78beSJoseph Huber // if (str[cur_pos] == 'm') { 86e0be78beSJoseph Huber // ++cur_pos; 87e0be78beSJoseph Huber // section.flags = FormatFlags::ALLOCATE; 88e0be78beSJoseph Huber // } 89e0be78beSJoseph Huber 90e0be78beSJoseph Huber LengthModifier lm = parse_length_modifier(&cur_pos); 91e0be78beSJoseph Huber section.length_modifier = lm; 92e0be78beSJoseph Huber 93e0be78beSJoseph Huber section.conv_name = str[cur_pos]; 94e0be78beSJoseph Huber 95e0be78beSJoseph Huber // If NO_WRITE is not set, then read the next arg as the output pointer. 96e0be78beSJoseph Huber if ((section.flags & FormatFlags::NO_WRITE) == 0) { 97e0be78beSJoseph Huber // Since all outputs are pointers, there's no need to distinguish when 98e0be78beSJoseph Huber // reading from va_args. They're all the same size and stored the same. 99e0be78beSJoseph Huber section.output_ptr = GET_ARG_VAL_SIMPLEST(void *, conv_index); 100e0be78beSJoseph Huber } 101e0be78beSJoseph Huber 102e0be78beSJoseph Huber // If the end of the format section is on the '\0'. This means we need to 103e0be78beSJoseph Huber // not advance the cur_pos and we should not count this has having a 104e0be78beSJoseph Huber // conversion. 105e0be78beSJoseph Huber if (str[cur_pos] != '\0') { 106e0be78beSJoseph Huber ++cur_pos; 107e0be78beSJoseph Huber } else { 108e0be78beSJoseph Huber section.has_conv = false; 109e0be78beSJoseph Huber } 110e0be78beSJoseph Huber 111e0be78beSJoseph Huber // If the format is a bracketed one, then we need to parse out the insides 112e0be78beSJoseph Huber // of the brackets. 113e0be78beSJoseph Huber if (section.conv_name == '[') { 114e0be78beSJoseph Huber constexpr char CLOSING_BRACKET = ']'; 115e0be78beSJoseph Huber constexpr char INVERT_FLAG = '^'; 116e0be78beSJoseph Huber constexpr char RANGE_OPERATOR = '-'; 117e0be78beSJoseph Huber 118e0be78beSJoseph Huber cpp::bitset<256> scan_set; 119e0be78beSJoseph Huber bool invert = false; 120e0be78beSJoseph Huber 121e0be78beSJoseph Huber // The circumflex in the first position represents the inversion flag, 122e0be78beSJoseph Huber // but it's easier to apply that at the end so we just store it for now. 123e0be78beSJoseph Huber if (str[cur_pos] == INVERT_FLAG) { 124e0be78beSJoseph Huber invert = true; 125e0be78beSJoseph Huber ++cur_pos; 126e0be78beSJoseph Huber } 127e0be78beSJoseph Huber 128e0be78beSJoseph Huber // This is used to determine if a hyphen is being used as a literal or 129e0be78beSJoseph Huber // as a range operator. 130e0be78beSJoseph Huber size_t set_start_pos = cur_pos; 131e0be78beSJoseph Huber 132e0be78beSJoseph Huber // Normally the right bracket closes the set, but if it's the first 133e0be78beSJoseph Huber // character (possibly after the inversion flag) then it's instead 134e0be78beSJoseph Huber // included as a character in the set and the second right bracket 135e0be78beSJoseph Huber // closes the set. 136e0be78beSJoseph Huber if (str[cur_pos] == CLOSING_BRACKET) { 137e0be78beSJoseph Huber scan_set.set(CLOSING_BRACKET); 138e0be78beSJoseph Huber ++cur_pos; 139e0be78beSJoseph Huber } 140e0be78beSJoseph Huber 141e0be78beSJoseph Huber while (str[cur_pos] != '\0' && str[cur_pos] != CLOSING_BRACKET) { 142e0be78beSJoseph Huber // If a hyphen is being used as a range operator, since it's neither 143e0be78beSJoseph Huber // at the beginning nor end of the set. 144e0be78beSJoseph Huber if (str[cur_pos] == RANGE_OPERATOR && cur_pos != set_start_pos && 145e0be78beSJoseph Huber str[cur_pos + 1] != CLOSING_BRACKET && str[cur_pos + 1] != '\0') { 146e0be78beSJoseph Huber // Technically there is no requirement to correct the ordering of 147e0be78beSJoseph Huber // the range, but since the range operator is entirely 148e0be78beSJoseph Huber // implementation defined it seems like a good convenience. 149e0be78beSJoseph Huber char a = str[cur_pos - 1]; 150e0be78beSJoseph Huber char b = str[cur_pos + 1]; 151e0be78beSJoseph Huber char start = (a < b ? a : b); 152e0be78beSJoseph Huber char end = (a < b ? b : a); 153e0be78beSJoseph Huber scan_set.set_range(start, end); 154e0be78beSJoseph Huber cur_pos += 2; 155e0be78beSJoseph Huber } else { 156e0be78beSJoseph Huber scan_set.set(str[cur_pos]); 157e0be78beSJoseph Huber ++cur_pos; 158e0be78beSJoseph Huber } 159e0be78beSJoseph Huber } 160e0be78beSJoseph Huber if (invert) 161e0be78beSJoseph Huber scan_set.flip(); 162e0be78beSJoseph Huber 163e0be78beSJoseph Huber if (str[cur_pos] == CLOSING_BRACKET) { 164e0be78beSJoseph Huber ++cur_pos; 165e0be78beSJoseph Huber section.scan_set = scan_set; 166e0be78beSJoseph Huber } else { 167e0be78beSJoseph Huber // if the end of the string was encountered, this is not a valid set. 168e0be78beSJoseph Huber section.has_conv = false; 169e0be78beSJoseph Huber } 170e0be78beSJoseph Huber } 171e0be78beSJoseph Huber } else { 172e0be78beSJoseph Huber // raw section 173e0be78beSJoseph Huber section.has_conv = false; 174e0be78beSJoseph Huber while (str[cur_pos] != '%' && str[cur_pos] != '\0') 175e0be78beSJoseph Huber ++cur_pos; 176e0be78beSJoseph Huber } 177e0be78beSJoseph Huber section.raw_string = {str + starting_pos, cur_pos - starting_pos}; 178e0be78beSJoseph Huber return section; 179e0be78beSJoseph Huber } 1807a129f07SMichael Jones 1817a129f07SMichael Jones private: 1827a129f07SMichael Jones // parse_length_modifier parses the length modifier inside a format string. It 1837a129f07SMichael Jones // assumes that str[*local_pos] is inside a format specifier. It returns a 1847a129f07SMichael Jones // LengthModifier with the length modifier it found. It will advance local_pos 1857a129f07SMichael Jones // after the format specifier if one is found. 186e0be78beSJoseph Huber LIBC_INLINE LengthModifier parse_length_modifier(size_t *local_pos) { 187e0be78beSJoseph Huber switch (str[*local_pos]) { 188e0be78beSJoseph Huber case ('l'): 189e0be78beSJoseph Huber if (str[*local_pos + 1] == 'l') { 190e0be78beSJoseph Huber *local_pos += 2; 191e0be78beSJoseph Huber return LengthModifier::ll; 192e0be78beSJoseph Huber } else { 193e0be78beSJoseph Huber ++*local_pos; 194e0be78beSJoseph Huber return LengthModifier::l; 195e0be78beSJoseph Huber } 196e0be78beSJoseph Huber case ('h'): 197e0be78beSJoseph Huber if (str[*local_pos + 1] == 'h') { 198e0be78beSJoseph Huber *local_pos += 2; 199e0be78beSJoseph Huber return LengthModifier::hh; 200e0be78beSJoseph Huber } else { 201e0be78beSJoseph Huber ++*local_pos; 202e0be78beSJoseph Huber return LengthModifier::h; 203e0be78beSJoseph Huber } 204e0be78beSJoseph Huber case ('L'): 205e0be78beSJoseph Huber ++*local_pos; 206e0be78beSJoseph Huber return LengthModifier::L; 207e0be78beSJoseph Huber case ('j'): 208e0be78beSJoseph Huber ++*local_pos; 209e0be78beSJoseph Huber return LengthModifier::j; 210e0be78beSJoseph Huber case ('z'): 211e0be78beSJoseph Huber ++*local_pos; 212e0be78beSJoseph Huber return LengthModifier::z; 213e0be78beSJoseph Huber case ('t'): 214e0be78beSJoseph Huber ++*local_pos; 215e0be78beSJoseph Huber return LengthModifier::t; 216e0be78beSJoseph Huber default: 217e0be78beSJoseph Huber return LengthModifier::NONE; 218e0be78beSJoseph Huber } 219e0be78beSJoseph Huber } 2207a129f07SMichael Jones 2217a129f07SMichael Jones // get_next_arg_value gets the next value from the arg list as type T. 222494734b0SSiva Chandra Reddy template <class T> LIBC_INLINE T get_next_arg_value() { 223e0be78beSJoseph Huber return args_cur.template next_var<T>(); 2247a129f07SMichael Jones } 2257a129f07SMichael Jones 2267a129f07SMichael Jones //---------------------------------------------------- 2277a129f07SMichael Jones // INDEX MODE ONLY FUNCTIONS AFTER HERE: 2287a129f07SMichael Jones //---------------------------------------------------- 2297a129f07SMichael Jones 230c3228714SGuillaume Chatelet #ifndef LIBC_COPT_SCANF_DISABLE_INDEX_MODE 2317a129f07SMichael Jones 2327a129f07SMichael Jones // parse_index parses the index of a value inside a format string. It 2337a129f07SMichael Jones // assumes that str[*local_pos] points to character after a '%' or '*', and 2347a129f07SMichael Jones // returns 0 if there is no closing $, or if it finds no number. If it finds a 2357a129f07SMichael Jones // number, it will move local_pos past the end of the $, else it will not move 2367a129f07SMichael Jones // local_pos. 237e0be78beSJoseph Huber LIBC_INLINE size_t parse_index(size_t *local_pos) { 238e0be78beSJoseph Huber if (internal::isdigit(str[*local_pos])) { 239e0be78beSJoseph Huber auto result = internal::strtointeger<int>(str + *local_pos, 10); 240e0be78beSJoseph Huber size_t index = result.value; 241e0be78beSJoseph Huber if (str[*local_pos + result.parsed_len] != '$') 242e0be78beSJoseph Huber return 0; 243e0be78beSJoseph Huber *local_pos = 1 + result.parsed_len + *local_pos; 244e0be78beSJoseph Huber return index; 245e0be78beSJoseph Huber } 246e0be78beSJoseph Huber return 0; 247e0be78beSJoseph Huber } 2487a129f07SMichael Jones 2497a129f07SMichael Jones // get_arg_value gets the value from the arg list at index (starting at 1). 2507a129f07SMichael Jones // This may require parsing the format string. An index of 0 is interpreted as 2517a129f07SMichael Jones // the next value. 252494734b0SSiva Chandra Reddy template <class T> LIBC_INLINE T get_arg_value(size_t index) { 2537a129f07SMichael Jones if (!(index == 0 || index == args_index)) 2547a129f07SMichael Jones args_to_index(index); 2557a129f07SMichael Jones 2567a129f07SMichael Jones ++args_index; 2577a129f07SMichael Jones return get_next_arg_value<T>(); 2587a129f07SMichael Jones } 2597a129f07SMichael Jones 2607a129f07SMichael Jones // the ArgList can only return the next item in the list. This function is 2617a129f07SMichael Jones // used in index mode when the item that needs to be read is not the next one. 262678e3ee1SFangrui Song // It moves cur_args to the index requested so the appropriate value may 2637a129f07SMichael Jones // be read. This may involve parsing the format string, and is in the worst 2647a129f07SMichael Jones // case an O(n^2) operation. 265e0be78beSJoseph Huber LIBC_INLINE void args_to_index(size_t index) { 266e0be78beSJoseph Huber if (args_index > index) { 267e0be78beSJoseph Huber args_index = 1; 268e0be78beSJoseph Huber args_cur = args_start; 269e0be78beSJoseph Huber } 270e0be78beSJoseph Huber 271e0be78beSJoseph Huber while (args_index < index) { 272e0be78beSJoseph Huber // Since all arguments must be pointers, we can just read all of them as 273e0be78beSJoseph Huber // void * and not worry about type issues. 274e0be78beSJoseph Huber args_cur.template next_var<void *>(); 275e0be78beSJoseph Huber ++args_index; 276e0be78beSJoseph Huber } 277e0be78beSJoseph Huber } 2787a129f07SMichael Jones 279c3228714SGuillaume Chatelet #endif // LIBC_COPT_SCANF_DISABLE_INDEX_MODE 2807a129f07SMichael Jones }; 2817a129f07SMichael Jones 2827a129f07SMichael Jones } // namespace scanf_core 283*5ff3ff33SPetr Hosek } // namespace LIBC_NAMESPACE_DECL 2847a129f07SMichael Jones 2857a129f07SMichael Jones #endif // LLVM_LIBC_SRC_STDIO_SCANF_CORE_PARSER_H 286