xref: /netbsd-src/external/bsd/ntp/dist/sntp/libopts/tokenize.c (revision 3117ece4fc4a4ca4489ba793710b60b0d26bab6c)
1 /*	$NetBSD: tokenize.c,v 1.10 2024/08/18 20:47:25 christos Exp $	*/
2 
3 /** \file tokenize.c
4  *
5  *  Tokenize a string, accommodating quoted strings.
6  *
7  * @addtogroup autoopts
8  * @{
9  */
10 /*
11  *  This file defines the string_tokenize interface
12  *  This file is part of AutoOpts, a companion to AutoGen.
13  *  AutoOpts is free software.
14  *  AutoOpts is Copyright (C) 1992-2018 by Bruce Korb - all rights reserved
15  *
16  *  AutoOpts is available under any one of two licenses.  The license
17  *  in use must be one of these two and the choice is under the control
18  *  of the user of the license.
19  *
20  *   The GNU Lesser General Public License, version 3 or later
21  *      See the files "COPYING.lgplv3" and "COPYING.gplv3"
22  *
23  *   The Modified Berkeley Software Distribution License
24  *      See the file "COPYING.mbsd"
25  *
26  *  These files have the following sha256 sums:
27  *
28  *  8584710e9b04216a394078dc156b781d0b47e1729104d666658aecef8ee32e95  COPYING.gplv3
29  *  4379e7444a0e2ce2b12dd6f5a52a27a4d02d39d247901d3285c88cf0d37f477b  COPYING.lgplv3
30  *  13aa749a5b0a454917a944ed8fffc530b784f5ead522b1aacaf4ec8aa55a6239  COPYING.mbsd
31  */
32 
33 static void
34 copy_cooked(ch_t ** ppDest, char const ** ppSrc)
35 {
36     ch_t * pDest = (ch_t *)*ppDest;
37     const ch_t * pSrc  = (const ch_t *)(*ppSrc + 1);
38 
39     for (;;) {
40         ch_t ch = *(pSrc++);
41         switch (ch) {
42         case NUL:   *ppSrc = NULL; return;
43         case '"':   goto done;
44         case '\\':
45             pSrc += ao_string_cook_escape_char(__UNCONST(pSrc), (char *)&ch, 0x7F);
46             if (ch == 0x7F)
47                 break;
48             /* FALLTHROUGH */
49 
50         default:
51             *(pDest++) = ch;
52         }
53     }
54 
55  done:
56     *ppDest = (ch_t *)pDest; /* next spot for storing character */
57     *ppSrc  = (char const *)pSrc;  /* char following closing quote    */
58 }
59 
60 
61 static void
62 copy_raw(ch_t ** ppDest, char const ** ppSrc)
63 {
64     ch_t * pDest = *ppDest;
65     cc_t * pSrc  = (cc_t *) (*ppSrc + 1);
66 
67     for (;;) {
68         ch_t ch = *(pSrc++);
69         switch (ch) {
70         case NUL:   *ppSrc = NULL; return;
71         case '\'':  goto done;
72         case '\\':
73             /*
74              *  *Four* escapes are handled:  newline removal, escape char
75              *  quoting and apostrophe quoting
76              */
77             switch (*pSrc) {
78             case NUL:   *ppSrc = NULL; return;
79             case '\r':
80                 if (*(++pSrc) == NL)
81                     ++pSrc;
82                 continue;
83 
84             case NL:
85                 ++pSrc;
86                 continue;
87 
88             case '\'':
89                 ch = '\'';
90                 /* FALLTHROUGH */
91 
92             case '\\':
93                 ++pSrc;
94                 break;
95             }
96             /* FALLTHROUGH */
97 
98         default:
99             *(pDest++) = ch;
100         }
101     }
102 
103  done:
104     *ppDest = pDest; /* next spot for storing character */
105     *ppSrc  = (char const *) pSrc;  /* char following closing quote    */
106 }
107 
108 static token_list_t *
109 alloc_token_list(char const * str)
110 {
111     token_list_t * res;
112 
113     int max_token_ct = 2; /* allow for trailing NULL pointer & NUL on string */
114 
115     if (str == NULL) goto enoent_res;
116 
117     /*
118      *  Trim leading white space.  Use "ENOENT" and a NULL return to indicate
119      *  an empty string was passed.
120      */
121     str = SPN_WHITESPACE_CHARS(str);
122     if (*str == NUL)  goto enoent_res;
123 
124     /*
125      *  Take an approximate count of tokens.  If no quoted strings are used,
126      *  it will be accurate.  If quoted strings are used, it will be a little
127      *  high and we'll squander the space for a few extra pointers.
128      */
129     {
130         char const * pz = str;
131 
132         do {
133             max_token_ct++;
134             pz = BRK_WHITESPACE_CHARS(pz+1);
135             pz = SPN_WHITESPACE_CHARS(pz);
136         } while (*pz != NUL);
137 
138         res = malloc(sizeof(*res) + (size_t)(pz - str)
139                      + ((size_t)max_token_ct * sizeof(ch_t *)));
140     }
141 
142     if (res == NULL)
143         errno = ENOMEM;
144     else res->tkn_list[0] = (ch_t *)(res->tkn_list + (max_token_ct - 1));
145 
146     return res;
147 
148     enoent_res:
149 
150     errno = ENOENT;
151     return NULL;
152 }
153 
154 /*=export_func ao_string_tokenize
155  *
156  * what: tokenize an input string
157  *
158  * arg:  + char const * + string + string to be tokenized +
159  *
160  * ret_type:  token_list_t *
161  * ret_desc:  pointer to a structure that lists each token
162  *
163  * doc:
164  *
165  * This function will convert one input string into a list of strings.
166  * The list of strings is derived by separating the input based on
167  * white space separation.  However, if the input contains either single
168  * or double quote characters, then the text after that character up to
169  * a matching quote will become the string in the list.
170  *
 *  The returned pointer should be deallocated with @code{free(3C)} when
 *  you are done using the data.  The data are placed in a single block of
173  *  allocated memory.  Do not deallocate individual token/strings.
174  *
175  *  The structure pointed to will contain at least these two fields:
176  *  @table @samp
177  *  @item tkn_ct
178  *  The number of tokens found in the input string.
 *  @item tkn_list
180  *  An array of @code{tkn_ct + 1} pointers to substring tokens, with
181  *  the last pointer set to NULL.
182  *  @end table
183  *
184  * There are two types of quoted strings: single quoted (@code{'}) and
185  * double quoted (@code{"}).  Singly quoted strings are fairly raw in that
186  * escape characters (@code{\\}) are simply another character, except when
187  * preceding the following characters:
188  * @example
189  * @code{\\}  double backslashes reduce to one
190  * @code{'}   incorporates the single quote into the string
191  * @code{\n}  suppresses both the backslash and newline character
192  * @end example
193  *
194  * Double quote strings are formed according to the rules of string
195  * constants in ANSI-C programs.
196  *
197  * example:
198  * @example
199  *    #include <stdlib.h>
200  *    int ix;
 *    token_list_t * ptl = ao_string_tokenize(some_string);
202  *    for (ix = 0; ix < ptl->tkn_ct; ix++)
203  *       do_something_with_tkn(ptl->tkn_list[ix]);
204  *    free(ptl);
205  * @end example
206  * Note that everything is freed with the one call to @code{free(3C)}.
207  *
208  * err:
209  *  NULL is returned and @code{errno} will be set to indicate the problem:
210  *  @itemize @bullet
211  *  @item
212  *  @code{EINVAL} - There was an unterminated quoted string.
213  *  @item
214  *  @code{ENOENT} - The input string was empty.
215  *  @item
216  *  @code{ENOMEM} - There is not enough memory.
217  *  @end itemize
218 =*/
219 token_list_t *
220 ao_string_tokenize(char const * str)
221 {
222     token_list_t * res = alloc_token_list(str);
223     ch_t * pzDest;
224 
225     /*
226      *  Now copy each token into the output buffer.
227      */
228     if (res == NULL)
229         return res;
230 
231     pzDest = (ch_t *)(res->tkn_list[0]);
232     res->tkn_ct  = 0;
233 
234     do  {
235         res->tkn_list[ res->tkn_ct++ ] = pzDest;
236         for (;;) {
237             int ch = (ch_t)*str;
238             if (IS_WHITESPACE_CHAR(ch)) {
239             found_white_space:
240                 str = SPN_WHITESPACE_CHARS(str+1);
241                 break;
242             }
243 
244             switch (ch) {
245             case '"':
246                 copy_cooked(&pzDest, &str);
247                 if (str == NULL) {
248                     free(res);
249                     errno = EINVAL;
250                     return NULL;
251                 }
252                 if (IS_WHITESPACE_CHAR(*str))
253                     goto found_white_space;
254                 break;
255 
256             case '\'':
257                 copy_raw(&pzDest, &str);
258                 if (str == NULL) {
259                     free(res);
260                     errno = EINVAL;
261                     return NULL;
262                 }
263                 if (IS_WHITESPACE_CHAR(*str))
264                     goto found_white_space;
265                 break;
266 
267             case NUL:
268                 goto copy_done;
269 
270             default:
271                 str++;
272                 *(pzDest++) = (unsigned char)ch;
273             }
274         } copy_done:;
275 
276         /*
277          * NUL terminate the last token and see if we have any more tokens.
278          */
279         *(pzDest++) = NUL;
280     } while (*str != NUL);
281 
282     res->tkn_list[ res->tkn_ct ] = NULL;
283 
284     return res;
285 }
286 
287 #ifdef TEST
288 #include <stdio.h>
289 #include <string.h>
290 
291 int
292 main(int argc, char ** argv)
293 {
294     if (argc == 1) {
295         printf("USAGE:  %s arg [ ... ]\n", *argv);
296         return 1;
297     }
298     while (--argc > 0) {
299         char * arg = *(++argv);
300         token_list_t * p = ao_string_tokenize(arg);
301         if (p == NULL) {
302             printf("Parsing string ``%s'' failed:\n\terrno %d (%s)\n",
303                    arg, errno, strerror(errno));
304         } else {
305             int ix = 0;
306             printf("Parsed string ``%s''\ninto %d tokens:\n", arg, p->tkn_ct);
307             do {
308                 printf(" %3d:  ``%s''\n", ix+1, p->tkn_list[ix]);
309             } while (++ix < p->tkn_ct);
310             free(p);
311         }
312     }
313     return 0;
314 }
315 #endif
316 
317 /** @}
318  *
319  * Local Variables:
320  * mode: C
321  * c-file-style: "stroustrup"
322  * indent-tabs-mode: nil
323  * End:
324  * end of autoopts/tokenize.c */
325