//===-- Format string parser for scanf -------------------------*- C++ -*-===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// #ifndef LLVM_LIBC_SRC_STDIO_SCANF_CORE_PARSER_H #define LLVM_LIBC_SRC_STDIO_SCANF_CORE_PARSER_H #include "src/__support/arg_list.h" #include "src/__support/ctype_utils.h" #include "src/__support/macros/config.h" #include "src/__support/str_to_integer.h" #include "src/stdio/scanf_core/core_structs.h" #include "src/stdio/scanf_core/scanf_config.h" #include namespace LIBC_NAMESPACE_DECL { namespace scanf_core { #ifndef LIBC_COPT_SCANF_DISABLE_INDEX_MODE #define GET_ARG_VAL_SIMPLEST(arg_type, index) get_arg_value(index) #else #define GET_ARG_VAL_SIMPLEST(arg_type, _) get_next_arg_value() #endif // LIBC_COPT_SCANF_DISABLE_INDEX_MODE template class Parser { const char *__restrict str; size_t cur_pos = 0; ArgProvider args_cur; #ifndef LIBC_COPT_SCANF_DISABLE_INDEX_MODE // args_start stores the start of the va_args, which is used when a previous // argument is needed. In that case, we have to read the arguments from the // beginning since they don't support reading backwards. ArgProvider args_start; size_t args_index = 1; #endif // LIBC_COPT_SCANF_DISABLE_INDEX_MODE public: #ifndef LIBC_COPT_SCANF_DISABLE_INDEX_MODE LIBC_INLINE Parser(const char *__restrict new_str, internal::ArgList &args) : str(new_str), args_cur(args), args_start(args) {} #else LIBC_INLINE Parser(const char *__restrict new_str, internal::ArgList &args) : str(new_str), args_cur(args) {} #endif // LIBC_COPT_SCANF_DISABLE_INDEX_MODE // get_next_section will parse the format string until it has a fully // specified format section. This can either be a raw format section with no // conversion, or a format section with a conversion that has all of its // variables stored in the format section. LIBC_INLINE FormatSection get_next_section() { FormatSection section; size_t starting_pos = cur_pos; if (str[cur_pos] == '%') { // format section section.has_conv = true; ++cur_pos; [[maybe_unused]] size_t conv_index = 0; #ifndef LIBC_COPT_SCANF_DISABLE_INDEX_MODE conv_index = parse_index(&cur_pos); #endif // LIBC_COPT_SCANF_DISABLE_INDEX_MODE if (str[cur_pos] == '*') { ++cur_pos; section.flags = FormatFlags::NO_WRITE; } // handle width section.max_width = -1; if (internal::isdigit(str[cur_pos])) { auto result = internal::strtointeger(str + cur_pos, 10); section.max_width = result.value; cur_pos = cur_pos + result.parsed_len; } // TODO(michaelrj): add posix allocate flag support. // if (str[cur_pos] == 'm') { // ++cur_pos; // section.flags = FormatFlags::ALLOCATE; // } LengthModifier lm = parse_length_modifier(&cur_pos); section.length_modifier = lm; section.conv_name = str[cur_pos]; // If NO_WRITE is not set, then read the next arg as the output pointer. if ((section.flags & FormatFlags::NO_WRITE) == 0) { // Since all outputs are pointers, there's no need to distinguish when // reading from va_args. They're all the same size and stored the same. section.output_ptr = GET_ARG_VAL_SIMPLEST(void *, conv_index); } // If the end of the format section is on the '\0'. This means we need to // not advance the cur_pos and we should not count this has having a // conversion. if (str[cur_pos] != '\0') { ++cur_pos; } else { section.has_conv = false; } // If the format is a bracketed one, then we need to parse out the insides // of the brackets. if (section.conv_name == '[') { constexpr char CLOSING_BRACKET = ']'; constexpr char INVERT_FLAG = '^'; constexpr char RANGE_OPERATOR = '-'; cpp::bitset<256> scan_set; bool invert = false; // The circumflex in the first position represents the inversion flag, // but it's easier to apply that at the end so we just store it for now. if (str[cur_pos] == INVERT_FLAG) { invert = true; ++cur_pos; } // This is used to determine if a hyphen is being used as a literal or // as a range operator. size_t set_start_pos = cur_pos; // Normally the right bracket closes the set, but if it's the first // character (possibly after the inversion flag) then it's instead // included as a character in the set and the second right bracket // closes the set. if (str[cur_pos] == CLOSING_BRACKET) { scan_set.set(CLOSING_BRACKET); ++cur_pos; } while (str[cur_pos] != '\0' && str[cur_pos] != CLOSING_BRACKET) { // If a hyphen is being used as a range operator, since it's neither // at the beginning nor end of the set. if (str[cur_pos] == RANGE_OPERATOR && cur_pos != set_start_pos && str[cur_pos + 1] != CLOSING_BRACKET && str[cur_pos + 1] != '\0') { // Technically there is no requirement to correct the ordering of // the range, but since the range operator is entirely // implementation defined it seems like a good convenience. char a = str[cur_pos - 1]; char b = str[cur_pos + 1]; char start = (a < b ? a : b); char end = (a < b ? b : a); scan_set.set_range(start, end); cur_pos += 2; } else { scan_set.set(str[cur_pos]); ++cur_pos; } } if (invert) scan_set.flip(); if (str[cur_pos] == CLOSING_BRACKET) { ++cur_pos; section.scan_set = scan_set; } else { // if the end of the string was encountered, this is not a valid set. section.has_conv = false; } } } else { // raw section section.has_conv = false; while (str[cur_pos] != '%' && str[cur_pos] != '\0') ++cur_pos; } section.raw_string = {str + starting_pos, cur_pos - starting_pos}; return section; } private: // parse_length_modifier parses the length modifier inside a format string. It // assumes that str[*local_pos] is inside a format specifier. It returns a // LengthModifier with the length modifier it found. It will advance local_pos // after the format specifier if one is found. LIBC_INLINE LengthModifier parse_length_modifier(size_t *local_pos) { switch (str[*local_pos]) { case ('l'): if (str[*local_pos + 1] == 'l') { *local_pos += 2; return LengthModifier::ll; } else { ++*local_pos; return LengthModifier::l; } case ('h'): if (str[*local_pos + 1] == 'h') { *local_pos += 2; return LengthModifier::hh; } else { ++*local_pos; return LengthModifier::h; } case ('L'): ++*local_pos; return LengthModifier::L; case ('j'): ++*local_pos; return LengthModifier::j; case ('z'): ++*local_pos; return LengthModifier::z; case ('t'): ++*local_pos; return LengthModifier::t; default: return LengthModifier::NONE; } } // get_next_arg_value gets the next value from the arg list as type T. template LIBC_INLINE T get_next_arg_value() { return args_cur.template next_var(); } //---------------------------------------------------- // INDEX MODE ONLY FUNCTIONS AFTER HERE: //---------------------------------------------------- #ifndef LIBC_COPT_SCANF_DISABLE_INDEX_MODE // parse_index parses the index of a value inside a format string. It // assumes that str[*local_pos] points to character after a '%' or '*', and // returns 0 if there is no closing $, or if it finds no number. If it finds a // number, it will move local_pos past the end of the $, else it will not move // local_pos. LIBC_INLINE size_t parse_index(size_t *local_pos) { if (internal::isdigit(str[*local_pos])) { auto result = internal::strtointeger(str + *local_pos, 10); size_t index = result.value; if (str[*local_pos + result.parsed_len] != '$') return 0; *local_pos = 1 + result.parsed_len + *local_pos; return index; } return 0; } // get_arg_value gets the value from the arg list at index (starting at 1). // This may require parsing the format string. An index of 0 is interpreted as // the next value. template LIBC_INLINE T get_arg_value(size_t index) { if (!(index == 0 || index == args_index)) args_to_index(index); ++args_index; return get_next_arg_value(); } // the ArgList can only return the next item in the list. This function is // used in index mode when the item that needs to be read is not the next one. // It moves cur_args to the index requested so the appropriate value may // be read. This may involve parsing the format string, and is in the worst // case an O(n^2) operation. LIBC_INLINE void args_to_index(size_t index) { if (args_index > index) { args_index = 1; args_cur = args_start; } while (args_index < index) { // Since all arguments must be pointers, we can just read all of them as // void * and not worry about type issues. args_cur.template next_var(); ++args_index; } } #endif // LIBC_COPT_SCANF_DISABLE_INDEX_MODE }; } // namespace scanf_core } // namespace LIBC_NAMESPACE_DECL #endif // LLVM_LIBC_SRC_STDIO_SCANF_CORE_PARSER_H