xref: /netbsd-src/external/bsd/ntp/dist/sntp/libopts/tokenize.c (revision c42dbd0ed2e61fe6eda8590caa852ccf34719964)
1 /*	$NetBSD: tokenize.c,v 1.9 2020/05/25 20:47:35 christos Exp $	*/
2 
3 /** \file tokenize.c
4  *
5  *  Tokenize a string, accommodating quoted strings.
6  *
7  * @addtogroup autoopts
8  * @{
9  */
10 /*
11  *  This file defines the string_tokenize interface
12  *  This file is part of AutoOpts, a companion to AutoGen.
13  *  AutoOpts is free software.
14  *  AutoOpts is Copyright (C) 1992-2015 by Bruce Korb - all rights reserved
15  *
16  *  AutoOpts is available under any one of two licenses.  The license
17  *  in use must be one of these two and the choice is under the control
18  *  of the user of the license.
19  *
20  *   The GNU Lesser General Public License, version 3 or later
21  *      See the files "COPYING.lgplv3" and "COPYING.gplv3"
22  *
23  *   The Modified Berkeley Software Distribution License
24  *      See the file "COPYING.mbsd"
25  *
26  *  These files have the following sha256 sums:
27  *
28  *  8584710e9b04216a394078dc156b781d0b47e1729104d666658aecef8ee32e95  COPYING.gplv3
29  *  4379e7444a0e2ce2b12dd6f5a52a27a4d02d39d247901d3285c88cf0d37f477b  COPYING.lgplv3
30  *  13aa749a5b0a454917a944ed8fffc530b784f5ead522b1aacaf4ec8aa55a6239  COPYING.mbsd
31  */
32 
33 #include <errno.h>
34 #include <stdlib.h>
35 
36 #define cc_t   const unsigned char
37 #define ch_t   unsigned char
38 
39 /* = = = START-STATIC-FORWARD = = = */
40 static void
41 copy_cooked(ch_t ** ppDest, char const ** ppSrc);
42 
43 static void
44 copy_raw(ch_t ** ppDest, char const ** ppSrc);
45 
46 static token_list_t *
47 alloc_token_list(char const * str);
48 /* = = = END-STATIC-FORWARD = = = */
49 
50 static void
51 copy_cooked(ch_t ** ppDest, char const ** ppSrc)
52 {
53     ch_t * pDest = (ch_t *)*ppDest;
54     const ch_t * pSrc  = (const ch_t *)(*ppSrc + 1);
55 
56     for (;;) {
57         ch_t ch = *(pSrc++);
58         switch (ch) {
59         case NUL:   *ppSrc = NULL; return;
60         case '"':   goto done;
61         case '\\':
62             pSrc += ao_string_cook_escape_char((const char *)pSrc, (char *)&ch, 0x7F);
63             if (ch == 0x7F)
64                 break;
65             /* FALLTHROUGH */
66 
67         default:
68             *(pDest++) = ch;
69         }
70     }
71 
72  done:
73     *ppDest = (ch_t *)pDest; /* next spot for storing character */
74     *ppSrc  = (char const *)pSrc;  /* char following closing quote    */
75 }
76 
77 
78 static void
79 copy_raw(ch_t ** ppDest, char const ** ppSrc)
80 {
81     ch_t * pDest = *ppDest;
82     cc_t * pSrc  = (cc_t *) (*ppSrc + 1);
83 
84     for (;;) {
85         ch_t ch = *(pSrc++);
86         switch (ch) {
87         case NUL:   *ppSrc = NULL; return;
88         case '\'':  goto done;
89         case '\\':
90             /*
91              *  *Four* escapes are handled:  newline removal, escape char
92              *  quoting and apostrophe quoting
93              */
94             switch (*pSrc) {
95             case NUL:   *ppSrc = NULL; return;
96             case '\r':
97                 if (*(++pSrc) == NL)
98                     ++pSrc;
99                 continue;
100 
101             case NL:
102                 ++pSrc;
103                 continue;
104 
105             case '\'':
106                 ch = '\'';
107                 /* FALLTHROUGH */
108 
109             case '\\':
110                 ++pSrc;
111                 break;
112             }
113             /* FALLTHROUGH */
114 
115         default:
116             *(pDest++) = ch;
117         }
118     }
119 
120  done:
121     *ppDest = pDest; /* next spot for storing character */
122     *ppSrc  = (char const *) pSrc;  /* char following closing quote    */
123 }
124 
125 static token_list_t *
126 alloc_token_list(char const * str)
127 {
128     token_list_t * res;
129 
130     int max_token_ct = 2; /* allow for trailing NULL pointer & NUL on string */
131 
132     if (str == NULL) goto enoent_res;
133 
134     /*
135      *  Trim leading white space.  Use "ENOENT" and a NULL return to indicate
136      *  an empty string was passed.
137      */
138     str = SPN_WHITESPACE_CHARS(str);
139     if (*str == NUL)  goto enoent_res;
140 
141     /*
142      *  Take an approximate count of tokens.  If no quoted strings are used,
143      *  it will be accurate.  If quoted strings are used, it will be a little
144      *  high and we'll squander the space for a few extra pointers.
145      */
146     {
147         char const * pz = str;
148 
149         do {
150             max_token_ct++;
151             pz = BRK_WHITESPACE_CHARS(pz+1);
152             pz = SPN_WHITESPACE_CHARS(pz);
153         } while (*pz != NUL);
154 
155         res = malloc(sizeof(*res) + (size_t)(pz - str)
156                      + ((size_t)max_token_ct * sizeof(ch_t *)));
157     }
158 
159     if (res == NULL)
160         errno = ENOMEM;
161     else res->tkn_list[0] = (ch_t *)(res->tkn_list + (max_token_ct - 1));
162 
163     return res;
164 
165     enoent_res:
166 
167     errno = ENOENT;
168     return NULL;
169 }
170 
171 /*=export_func ao_string_tokenize
172  *
173  * what: tokenize an input string
174  *
175  * arg:  + char const * + string + string to be tokenized +
176  *
177  * ret_type:  token_list_t *
178  * ret_desc:  pointer to a structure that lists each token
179  *
180  * doc:
181  *
182  * This function will convert one input string into a list of strings.
183  * The list of strings is derived by separating the input based on
184  * white space separation.  However, if the input contains either single
185  * or double quote characters, then the text after that character up to
186  * a matching quote will become the string in the list.
187  *
 *  The returned pointer should be deallocated with @code{free(3C)} when
 *  you are done using the data.  The data are placed in a single block of
190  *  allocated memory.  Do not deallocate individual token/strings.
191  *
192  *  The structure pointed to will contain at least these two fields:
193  *  @table @samp
194  *  @item tkn_ct
195  *  The number of tokens found in the input string.
 *  @item tkn_list
197  *  An array of @code{tkn_ct + 1} pointers to substring tokens, with
198  *  the last pointer set to NULL.
199  *  @end table
200  *
201  * There are two types of quoted strings: single quoted (@code{'}) and
202  * double quoted (@code{"}).  Singly quoted strings are fairly raw in that
203  * escape characters (@code{\\}) are simply another character, except when
204  * preceding the following characters:
205  * @example
206  * @code{\\}  double backslashes reduce to one
207  * @code{'}   incorporates the single quote into the string
208  * @code{\n}  suppresses both the backslash and newline character
209  * @end example
210  *
211  * Double quote strings are formed according to the rules of string
212  * constants in ANSI-C programs.
213  *
214  * example:
215  * @example
216  *    #include <stdlib.h>
217  *    int ix;
 *    token_list_t * ptl = ao_string_tokenize(some_string);
219  *    for (ix = 0; ix < ptl->tkn_ct; ix++)
220  *       do_something_with_tkn(ptl->tkn_list[ix]);
221  *    free(ptl);
222  * @end example
223  * Note that everything is freed with the one call to @code{free(3C)}.
224  *
225  * err:
226  *  NULL is returned and @code{errno} will be set to indicate the problem:
227  *  @itemize @bullet
228  *  @item
229  *  @code{EINVAL} - There was an unterminated quoted string.
230  *  @item
231  *  @code{ENOENT} - The input string was empty.
232  *  @item
233  *  @code{ENOMEM} - There is not enough memory.
234  *  @end itemize
235 =*/
236 token_list_t *
237 ao_string_tokenize(char const * str)
238 {
239     token_list_t * res = alloc_token_list(str);
240     ch_t * pzDest;
241 
242     /*
243      *  Now copy each token into the output buffer.
244      */
245     if (res == NULL)
246         return res;
247 
248     pzDest = (ch_t *)(res->tkn_list[0]);
249     res->tkn_ct  = 0;
250 
251     do  {
252         res->tkn_list[ res->tkn_ct++ ] = pzDest;
253         for (;;) {
254             int ch = (ch_t)*str;
255             if (IS_WHITESPACE_CHAR(ch)) {
256             found_white_space:
257                 str = SPN_WHITESPACE_CHARS(str+1);
258                 break;
259             }
260 
261             switch (ch) {
262             case '"':
263                 copy_cooked(&pzDest, &str);
264                 if (str == NULL) {
265                     free(res);
266                     errno = EINVAL;
267                     return NULL;
268                 }
269                 if (IS_WHITESPACE_CHAR(*str))
270                     goto found_white_space;
271                 break;
272 
273             case '\'':
274                 copy_raw(&pzDest, &str);
275                 if (str == NULL) {
276                     free(res);
277                     errno = EINVAL;
278                     return NULL;
279                 }
280                 if (IS_WHITESPACE_CHAR(*str))
281                     goto found_white_space;
282                 break;
283 
284             case NUL:
285                 goto copy_done;
286 
287             default:
288                 str++;
289                 *(pzDest++) = (unsigned char)ch;
290             }
291         } copy_done:;
292 
293         /*
294          * NUL terminate the last token and see if we have any more tokens.
295          */
296         *(pzDest++) = NUL;
297     } while (*str != NUL);
298 
299     res->tkn_list[ res->tkn_ct ] = NULL;
300 
301     return res;
302 }
303 
#ifdef TEST
#include <stdio.h>
#include <string.h>

/*
 *  Test driver: tokenize each command line argument in turn and print
 *  either the resulting tokens or the errno left by the failure.
 *  Returns 1 when invoked without arguments, otherwise 0.
 */
int
main(int argc, char ** argv)
{
    int argi;

    if (argc == 1) {
        printf("USAGE:  %s arg [ ... ]\n", argv[0]);
        return 1;
    }

    for (argi = 1; argi < argc; argi++) {
        char *         text = argv[argi];
        token_list_t * toks = ao_string_tokenize(text);
        int            tix  = 0;

        if (toks == NULL) {
            printf("Parsing string ``%s'' failed:\n\terrno %d (%s)\n",
                   text, errno, strerror(errno));
            continue;
        }

        printf("Parsed string ``%s''\ninto %d tokens:\n", text, toks->tkn_ct);

        /* the first entry is printed before the count is consulted */
        do {
            printf(" %3d:  ``%s''\n", tix+1, toks->tkn_list[tix]);
        } while (++tix < toks->tkn_ct);

        free(toks);
    }

    return 0;
}
#endif
333 
334 /** @}
335  *
336  * Local Variables:
337  * mode: C
338  * c-file-style: "stroustrup"
339  * indent-tabs-mode: nil
340  * End:
341  * end of autoopts/tokenize.c */
342