xref: /llvm-project/libc/src/stdio/scanf_core/parser.h (revision 5ff3ff33ff930e4ec49da7910612d8a41eb068cb)
17a129f07SMichael Jones //===-- Format string parser for scanf -------------------------*- C++ -*-===//
27a129f07SMichael Jones //
37a129f07SMichael Jones // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
47a129f07SMichael Jones // See https://llvm.org/LICENSE.txt for license information.
57a129f07SMichael Jones // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
67a129f07SMichael Jones //
77a129f07SMichael Jones //===----------------------------------------------------------------------===//
87a129f07SMichael Jones 
97a129f07SMichael Jones #ifndef LLVM_LIBC_SRC_STDIO_SCANF_CORE_PARSER_H
107a129f07SMichael Jones #define LLVM_LIBC_SRC_STDIO_SCANF_CORE_PARSER_H
117a129f07SMichael Jones 
127a129f07SMichael Jones #include "src/__support/arg_list.h"
13e0be78beSJoseph Huber #include "src/__support/ctype_utils.h"
14*5ff3ff33SPetr Hosek #include "src/__support/macros/config.h"
15e0be78beSJoseph Huber #include "src/__support/str_to_integer.h"
167a129f07SMichael Jones #include "src/stdio/scanf_core/core_structs.h"
177a129f07SMichael Jones #include "src/stdio/scanf_core/scanf_config.h"
187a129f07SMichael Jones 
197a129f07SMichael Jones #include <stddef.h>
207a129f07SMichael Jones 
21*5ff3ff33SPetr Hosek namespace LIBC_NAMESPACE_DECL {
227a129f07SMichael Jones namespace scanf_core {
237a129f07SMichael Jones 
24e0be78beSJoseph Huber #ifndef LIBC_COPT_SCANF_DISABLE_INDEX_MODE
25e0be78beSJoseph Huber #define GET_ARG_VAL_SIMPLEST(arg_type, index) get_arg_value<arg_type>(index)
26e0be78beSJoseph Huber #else
27e0be78beSJoseph Huber #define GET_ARG_VAL_SIMPLEST(arg_type, _) get_next_arg_value<arg_type>()
28e0be78beSJoseph Huber #endif // LIBC_COPT_SCANF_DISABLE_INDEX_MODE
29e0be78beSJoseph Huber 
30e0be78beSJoseph Huber template <typename ArgProvider> class Parser {
317a129f07SMichael Jones   const char *__restrict str;
327a129f07SMichael Jones 
337a129f07SMichael Jones   size_t cur_pos = 0;
34e0be78beSJoseph Huber   ArgProvider args_cur;
357a129f07SMichael Jones 
36c3228714SGuillaume Chatelet #ifndef LIBC_COPT_SCANF_DISABLE_INDEX_MODE
377a129f07SMichael Jones   // args_start stores the start of the va_args, which is used when a previous
387a129f07SMichael Jones   // argument is needed. In that case, we have to read the arguments from the
397a129f07SMichael Jones   // beginning since they don't support reading backwards.
40e0be78beSJoseph Huber   ArgProvider args_start;
417a129f07SMichael Jones   size_t args_index = 1;
42c3228714SGuillaume Chatelet #endif // LIBC_COPT_SCANF_DISABLE_INDEX_MODE
437a129f07SMichael Jones 
447a129f07SMichael Jones public:
45c3228714SGuillaume Chatelet #ifndef LIBC_COPT_SCANF_DISABLE_INDEX_MODE
46494734b0SSiva Chandra Reddy   LIBC_INLINE Parser(const char *__restrict new_str, internal::ArgList &args)
477a129f07SMichael Jones       : str(new_str), args_cur(args), args_start(args) {}
487a129f07SMichael Jones #else
49494734b0SSiva Chandra Reddy   LIBC_INLINE Parser(const char *__restrict new_str, internal::ArgList &args)
507a129f07SMichael Jones       : str(new_str), args_cur(args) {}
51c3228714SGuillaume Chatelet #endif // LIBC_COPT_SCANF_DISABLE_INDEX_MODE
527a129f07SMichael Jones 
537a129f07SMichael Jones   // get_next_section will parse the format string until it has a fully
547a129f07SMichael Jones   // specified format section. This can either be a raw format section with no
557a129f07SMichael Jones   // conversion, or a format section with a conversion that has all of its
567a129f07SMichael Jones   // variables stored in the format section.
57e0be78beSJoseph Huber   LIBC_INLINE FormatSection get_next_section() {
58e0be78beSJoseph Huber     FormatSection section;
59e0be78beSJoseph Huber     size_t starting_pos = cur_pos;
60e0be78beSJoseph Huber     if (str[cur_pos] == '%') {
61e0be78beSJoseph Huber       // format section
62e0be78beSJoseph Huber       section.has_conv = true;
63e0be78beSJoseph Huber 
64e0be78beSJoseph Huber       ++cur_pos;
65e0be78beSJoseph Huber       [[maybe_unused]] size_t conv_index = 0;
66e0be78beSJoseph Huber 
67e0be78beSJoseph Huber #ifndef LIBC_COPT_SCANF_DISABLE_INDEX_MODE
68e0be78beSJoseph Huber       conv_index = parse_index(&cur_pos);
69e0be78beSJoseph Huber #endif // LIBC_COPT_SCANF_DISABLE_INDEX_MODE
70e0be78beSJoseph Huber 
71e0be78beSJoseph Huber       if (str[cur_pos] == '*') {
72e0be78beSJoseph Huber         ++cur_pos;
73e0be78beSJoseph Huber         section.flags = FormatFlags::NO_WRITE;
74e0be78beSJoseph Huber       }
75e0be78beSJoseph Huber 
76e0be78beSJoseph Huber       // handle width
77e0be78beSJoseph Huber       section.max_width = -1;
78e0be78beSJoseph Huber       if (internal::isdigit(str[cur_pos])) {
79e0be78beSJoseph Huber         auto result = internal::strtointeger<int>(str + cur_pos, 10);
80e0be78beSJoseph Huber         section.max_width = result.value;
81e0be78beSJoseph Huber         cur_pos = cur_pos + result.parsed_len;
82e0be78beSJoseph Huber       }
83e0be78beSJoseph Huber 
84e0be78beSJoseph Huber       // TODO(michaelrj): add posix allocate flag support.
85e0be78beSJoseph Huber       // if (str[cur_pos] == 'm') {
86e0be78beSJoseph Huber       //   ++cur_pos;
87e0be78beSJoseph Huber       //   section.flags = FormatFlags::ALLOCATE;
88e0be78beSJoseph Huber       // }
89e0be78beSJoseph Huber 
90e0be78beSJoseph Huber       LengthModifier lm = parse_length_modifier(&cur_pos);
91e0be78beSJoseph Huber       section.length_modifier = lm;
92e0be78beSJoseph Huber 
93e0be78beSJoseph Huber       section.conv_name = str[cur_pos];
94e0be78beSJoseph Huber 
95e0be78beSJoseph Huber       // If NO_WRITE is not set, then read the next arg as the output pointer.
96e0be78beSJoseph Huber       if ((section.flags & FormatFlags::NO_WRITE) == 0) {
97e0be78beSJoseph Huber         // Since all outputs are pointers, there's no need to distinguish when
98e0be78beSJoseph Huber         // reading from va_args. They're all the same size and stored the same.
99e0be78beSJoseph Huber         section.output_ptr = GET_ARG_VAL_SIMPLEST(void *, conv_index);
100e0be78beSJoseph Huber       }
101e0be78beSJoseph Huber 
102e0be78beSJoseph Huber       // If the end of the format section is on the '\0'. This means we need to
103e0be78beSJoseph Huber       // not advance the cur_pos and we should not count this has having a
104e0be78beSJoseph Huber       // conversion.
105e0be78beSJoseph Huber       if (str[cur_pos] != '\0') {
106e0be78beSJoseph Huber         ++cur_pos;
107e0be78beSJoseph Huber       } else {
108e0be78beSJoseph Huber         section.has_conv = false;
109e0be78beSJoseph Huber       }
110e0be78beSJoseph Huber 
111e0be78beSJoseph Huber       // If the format is a bracketed one, then we need to parse out the insides
112e0be78beSJoseph Huber       // of the brackets.
113e0be78beSJoseph Huber       if (section.conv_name == '[') {
114e0be78beSJoseph Huber         constexpr char CLOSING_BRACKET = ']';
115e0be78beSJoseph Huber         constexpr char INVERT_FLAG = '^';
116e0be78beSJoseph Huber         constexpr char RANGE_OPERATOR = '-';
117e0be78beSJoseph Huber 
118e0be78beSJoseph Huber         cpp::bitset<256> scan_set;
119e0be78beSJoseph Huber         bool invert = false;
120e0be78beSJoseph Huber 
121e0be78beSJoseph Huber         // The circumflex in the first position represents the inversion flag,
122e0be78beSJoseph Huber         // but it's easier to apply that at the end so we just store it for now.
123e0be78beSJoseph Huber         if (str[cur_pos] == INVERT_FLAG) {
124e0be78beSJoseph Huber           invert = true;
125e0be78beSJoseph Huber           ++cur_pos;
126e0be78beSJoseph Huber         }
127e0be78beSJoseph Huber 
128e0be78beSJoseph Huber         // This is used to determine if a hyphen is being used as a literal or
129e0be78beSJoseph Huber         // as a range operator.
130e0be78beSJoseph Huber         size_t set_start_pos = cur_pos;
131e0be78beSJoseph Huber 
132e0be78beSJoseph Huber         // Normally the right bracket closes the set, but if it's the first
133e0be78beSJoseph Huber         // character (possibly after the inversion flag) then it's instead
134e0be78beSJoseph Huber         // included as a character in the set and the second right bracket
135e0be78beSJoseph Huber         // closes the set.
136e0be78beSJoseph Huber         if (str[cur_pos] == CLOSING_BRACKET) {
137e0be78beSJoseph Huber           scan_set.set(CLOSING_BRACKET);
138e0be78beSJoseph Huber           ++cur_pos;
139e0be78beSJoseph Huber         }
140e0be78beSJoseph Huber 
141e0be78beSJoseph Huber         while (str[cur_pos] != '\0' && str[cur_pos] != CLOSING_BRACKET) {
142e0be78beSJoseph Huber           // If a hyphen is being used as a range operator, since it's neither
143e0be78beSJoseph Huber           // at the beginning nor end of the set.
144e0be78beSJoseph Huber           if (str[cur_pos] == RANGE_OPERATOR && cur_pos != set_start_pos &&
145e0be78beSJoseph Huber               str[cur_pos + 1] != CLOSING_BRACKET && str[cur_pos + 1] != '\0') {
146e0be78beSJoseph Huber             // Technically there is no requirement to correct the ordering of
147e0be78beSJoseph Huber             // the range, but since the range operator is entirely
148e0be78beSJoseph Huber             // implementation defined it seems like a good convenience.
149e0be78beSJoseph Huber             char a = str[cur_pos - 1];
150e0be78beSJoseph Huber             char b = str[cur_pos + 1];
151e0be78beSJoseph Huber             char start = (a < b ? a : b);
152e0be78beSJoseph Huber             char end = (a < b ? b : a);
153e0be78beSJoseph Huber             scan_set.set_range(start, end);
154e0be78beSJoseph Huber             cur_pos += 2;
155e0be78beSJoseph Huber           } else {
156e0be78beSJoseph Huber             scan_set.set(str[cur_pos]);
157e0be78beSJoseph Huber             ++cur_pos;
158e0be78beSJoseph Huber           }
159e0be78beSJoseph Huber         }
160e0be78beSJoseph Huber         if (invert)
161e0be78beSJoseph Huber           scan_set.flip();
162e0be78beSJoseph Huber 
163e0be78beSJoseph Huber         if (str[cur_pos] == CLOSING_BRACKET) {
164e0be78beSJoseph Huber           ++cur_pos;
165e0be78beSJoseph Huber           section.scan_set = scan_set;
166e0be78beSJoseph Huber         } else {
167e0be78beSJoseph Huber           // if the end of the string was encountered, this is not a valid set.
168e0be78beSJoseph Huber           section.has_conv = false;
169e0be78beSJoseph Huber         }
170e0be78beSJoseph Huber       }
171e0be78beSJoseph Huber     } else {
172e0be78beSJoseph Huber       // raw section
173e0be78beSJoseph Huber       section.has_conv = false;
174e0be78beSJoseph Huber       while (str[cur_pos] != '%' && str[cur_pos] != '\0')
175e0be78beSJoseph Huber         ++cur_pos;
176e0be78beSJoseph Huber     }
177e0be78beSJoseph Huber     section.raw_string = {str + starting_pos, cur_pos - starting_pos};
178e0be78beSJoseph Huber     return section;
179e0be78beSJoseph Huber   }
1807a129f07SMichael Jones 
1817a129f07SMichael Jones private:
1827a129f07SMichael Jones   // parse_length_modifier parses the length modifier inside a format string. It
1837a129f07SMichael Jones   // assumes that str[*local_pos] is inside a format specifier. It returns a
1847a129f07SMichael Jones   // LengthModifier with the length modifier it found. It will advance local_pos
1857a129f07SMichael Jones   // after the format specifier if one is found.
186e0be78beSJoseph Huber   LIBC_INLINE LengthModifier parse_length_modifier(size_t *local_pos) {
187e0be78beSJoseph Huber     switch (str[*local_pos]) {
188e0be78beSJoseph Huber     case ('l'):
189e0be78beSJoseph Huber       if (str[*local_pos + 1] == 'l') {
190e0be78beSJoseph Huber         *local_pos += 2;
191e0be78beSJoseph Huber         return LengthModifier::ll;
192e0be78beSJoseph Huber       } else {
193e0be78beSJoseph Huber         ++*local_pos;
194e0be78beSJoseph Huber         return LengthModifier::l;
195e0be78beSJoseph Huber       }
196e0be78beSJoseph Huber     case ('h'):
197e0be78beSJoseph Huber       if (str[*local_pos + 1] == 'h') {
198e0be78beSJoseph Huber         *local_pos += 2;
199e0be78beSJoseph Huber         return LengthModifier::hh;
200e0be78beSJoseph Huber       } else {
201e0be78beSJoseph Huber         ++*local_pos;
202e0be78beSJoseph Huber         return LengthModifier::h;
203e0be78beSJoseph Huber       }
204e0be78beSJoseph Huber     case ('L'):
205e0be78beSJoseph Huber       ++*local_pos;
206e0be78beSJoseph Huber       return LengthModifier::L;
207e0be78beSJoseph Huber     case ('j'):
208e0be78beSJoseph Huber       ++*local_pos;
209e0be78beSJoseph Huber       return LengthModifier::j;
210e0be78beSJoseph Huber     case ('z'):
211e0be78beSJoseph Huber       ++*local_pos;
212e0be78beSJoseph Huber       return LengthModifier::z;
213e0be78beSJoseph Huber     case ('t'):
214e0be78beSJoseph Huber       ++*local_pos;
215e0be78beSJoseph Huber       return LengthModifier::t;
216e0be78beSJoseph Huber     default:
217e0be78beSJoseph Huber       return LengthModifier::NONE;
218e0be78beSJoseph Huber     }
219e0be78beSJoseph Huber   }
2207a129f07SMichael Jones 
2217a129f07SMichael Jones   // get_next_arg_value gets the next value from the arg list as type T.
222494734b0SSiva Chandra Reddy   template <class T> LIBC_INLINE T get_next_arg_value() {
223e0be78beSJoseph Huber     return args_cur.template next_var<T>();
2247a129f07SMichael Jones   }
2257a129f07SMichael Jones 
2267a129f07SMichael Jones   //----------------------------------------------------
2277a129f07SMichael Jones   // INDEX MODE ONLY FUNCTIONS AFTER HERE:
2287a129f07SMichael Jones   //----------------------------------------------------
2297a129f07SMichael Jones 
230c3228714SGuillaume Chatelet #ifndef LIBC_COPT_SCANF_DISABLE_INDEX_MODE
2317a129f07SMichael Jones 
2327a129f07SMichael Jones   // parse_index parses the index of a value inside a format string. It
2337a129f07SMichael Jones   // assumes that str[*local_pos] points to character after a '%' or '*', and
2347a129f07SMichael Jones   // returns 0 if there is no closing $, or if it finds no number. If it finds a
2357a129f07SMichael Jones   // number, it will move local_pos past the end of the $, else it will not move
2367a129f07SMichael Jones   // local_pos.
237e0be78beSJoseph Huber   LIBC_INLINE size_t parse_index(size_t *local_pos) {
238e0be78beSJoseph Huber     if (internal::isdigit(str[*local_pos])) {
239e0be78beSJoseph Huber       auto result = internal::strtointeger<int>(str + *local_pos, 10);
240e0be78beSJoseph Huber       size_t index = result.value;
241e0be78beSJoseph Huber       if (str[*local_pos + result.parsed_len] != '$')
242e0be78beSJoseph Huber         return 0;
243e0be78beSJoseph Huber       *local_pos = 1 + result.parsed_len + *local_pos;
244e0be78beSJoseph Huber       return index;
245e0be78beSJoseph Huber     }
246e0be78beSJoseph Huber     return 0;
247e0be78beSJoseph Huber   }
2487a129f07SMichael Jones 
2497a129f07SMichael Jones   // get_arg_value gets the value from the arg list at index (starting at 1).
2507a129f07SMichael Jones   // This may require parsing the format string. An index of 0 is interpreted as
2517a129f07SMichael Jones   // the next value.
252494734b0SSiva Chandra Reddy   template <class T> LIBC_INLINE T get_arg_value(size_t index) {
2537a129f07SMichael Jones     if (!(index == 0 || index == args_index))
2547a129f07SMichael Jones       args_to_index(index);
2557a129f07SMichael Jones 
2567a129f07SMichael Jones     ++args_index;
2577a129f07SMichael Jones     return get_next_arg_value<T>();
2587a129f07SMichael Jones   }
2597a129f07SMichael Jones 
2607a129f07SMichael Jones   // the ArgList can only return the next item in the list. This function is
2617a129f07SMichael Jones   // used in index mode when the item that needs to be read is not the next one.
262678e3ee1SFangrui Song   // It moves cur_args to the index requested so the appropriate value may
2637a129f07SMichael Jones   // be read. This may involve parsing the format string, and is in the worst
2647a129f07SMichael Jones   // case an O(n^2) operation.
265e0be78beSJoseph Huber   LIBC_INLINE void args_to_index(size_t index) {
266e0be78beSJoseph Huber     if (args_index > index) {
267e0be78beSJoseph Huber       args_index = 1;
268e0be78beSJoseph Huber       args_cur = args_start;
269e0be78beSJoseph Huber     }
270e0be78beSJoseph Huber 
271e0be78beSJoseph Huber     while (args_index < index) {
272e0be78beSJoseph Huber       // Since all arguments must be pointers, we can just read all of them as
273e0be78beSJoseph Huber       // void * and not worry about type issues.
274e0be78beSJoseph Huber       args_cur.template next_var<void *>();
275e0be78beSJoseph Huber       ++args_index;
276e0be78beSJoseph Huber     }
277e0be78beSJoseph Huber   }
2787a129f07SMichael Jones 
279c3228714SGuillaume Chatelet #endif // LIBC_COPT_SCANF_DISABLE_INDEX_MODE
2807a129f07SMichael Jones };
2817a129f07SMichael Jones 
2827a129f07SMichael Jones } // namespace scanf_core
283*5ff3ff33SPetr Hosek } // namespace LIBC_NAMESPACE_DECL
2847a129f07SMichael Jones 
2857a129f07SMichael Jones #endif // LLVM_LIBC_SRC_STDIO_SCANF_CORE_PARSER_H
286