xref: /netbsd-src/external/bsd/ntp/dist/sntp/libopts/tokenize.c (revision b757af438b42b93f8c6571f026d8b8ef3eaf5fc9)
1 /*	$NetBSD: tokenize.c,v 1.2 2012/02/03 21:36:40 christos Exp $	*/
2 
3 /*
4  *  This file defines the string_tokenize interface
5  * Time-stamp:      "2010-07-17 10:40:26 bkorb"
6  *
7  *  This file is part of AutoOpts, a companion to AutoGen.
8  *  AutoOpts is free software.
9  *  AutoOpts is Copyright (c) 1992-2011 by Bruce Korb - all rights reserved
10  *
11  *  AutoOpts is available under any one of two licenses.  The license
12  *  in use must be one of these two and the choice is under the control
13  *  of the user of the license.
14  *
15  *   The GNU Lesser General Public License, version 3 or later
16  *      See the files "COPYING.lgplv3" and "COPYING.gplv3"
17  *
18  *   The Modified Berkeley Software Distribution License
19  *      See the file "COPYING.mbsd"
20  *
21  *  These files have the following md5sums:
22  *
23  *  43b91e8ca915626ed3818ffb1b71248b pkg/libopts/COPYING.gplv3
24  *  06a1a2e4760c90ea5e1dad8dfaac4d39 pkg/libopts/COPYING.lgplv3
25  *  66a5cedaf62c4b2637025f049f9b826f pkg/libopts/COPYING.mbsd
26  */
27 
28 #include <errno.h>
29 #include <stdlib.h>
30 
/*
 *  Shorthand for the byte types used while scanning and copying text.
 *  These are text-substitution macros, not typedefs, so "cc_t* p" expands
 *  to "const unsigned char* p".
 */
#define cc_t   const unsigned char
#define ch_t   unsigned char

/*
 *  Prototypes for the file-local helpers defined below.
 */
/* = = = START-STATIC-FORWARD = = = */
static void
copy_cooked(ch_t** ppDest, char const ** ppSrc);

static void
copy_raw(ch_t** ppDest, char const ** ppSrc);

static token_list_t *
alloc_token_list(char const * str);
/* = = = END-STATIC-FORWARD = = = */
44 
45 static void
46 copy_cooked(ch_t** ppDest, char const ** ppSrc)
47 {
48     ch_t* pDest = (ch_t*)*ppDest;
49     const ch_t* pSrc  = (const ch_t*)(*ppSrc + 1);
50 
51     for (;;) {
52         ch_t ch = *(pSrc++);
53         switch (ch) {
54         case NUL:   *ppSrc = NULL; return;
55         case '"':   goto done;
56         case '\\':
57             pSrc += ao_string_cook_escape_char((char*)(intptr_t)pSrc, (char*)(intptr_t)&ch, 0x7F);
58             if (ch == 0x7F)
59                 break;
60             /* FALLTHROUGH */
61 
62         default:
63             *(pDest++) = ch;
64         }
65     }
66 
67  done:
68     *ppDest = (ch_t*)pDest; /* next spot for storing character */
69     *ppSrc  = (char const *)pSrc;  /* char following closing quote    */
70 }
71 
72 
73 static void
74 copy_raw(ch_t** ppDest, char const ** ppSrc)
75 {
76     ch_t* pDest = *ppDest;
77     cc_t* pSrc  = (cc_t*) (*ppSrc + 1);
78 
79     for (;;) {
80         ch_t ch = *(pSrc++);
81         switch (ch) {
82         case NUL:   *ppSrc = NULL; return;
83         case '\'':  goto done;
84         case '\\':
85             /*
86              *  *Four* escapes are handled:  newline removal, escape char
87              *  quoting and apostrophe quoting
88              */
89             switch (*pSrc) {
90             case NUL:   *ppSrc = NULL; return;
91             case '\r':
92                 if (*(++pSrc) == '\n')
93                     ++pSrc;
94                 continue;
95 
96             case '\n':
97                 ++pSrc;
98                 continue;
99 
100             case '\'':
101                 ch = '\'';
102                 /* FALLTHROUGH */
103 
104             case '\\':
105                 ++pSrc;
106                 break;
107             }
108             /* FALLTHROUGH */
109 
110         default:
111             *(pDest++) = ch;
112         }
113     }
114 
115  done:
116     *ppDest = pDest; /* next spot for storing character */
117     *ppSrc  = (char const *) pSrc;  /* char following closing quote    */
118 }
119 
120 static token_list_t *
121 alloc_token_list(char const * str)
122 {
123     token_list_t * res;
124 
125     int max_token_ct = 2; /* allow for trailing NULL pointer & NUL on string */
126 
127     if (str == NULL) goto enoent_res;
128 
129     /*
130      *  Trim leading white space.  Use "ENOENT" and a NULL return to indicate
131      *  an empty string was passed.
132      */
133     while (IS_WHITESPACE_CHAR(*str))  str++;
134     if (*str == NUL)  goto enoent_res;
135 
136     /*
137      *  Take an approximate count of tokens.  If no quoted strings are used,
138      *  it will be accurate.  If quoted strings are used, it will be a little
139      *  high and we'll squander the space for a few extra pointers.
140      */
141     {
142         cc_t* pz = (cc_t*)str;
143 
144         do {
145             max_token_ct++;
146             while (! IS_WHITESPACE_CHAR(*++pz))
147                 if (*pz == NUL) goto found_nul;
148             while (IS_WHITESPACE_CHAR(*pz))  pz++;
149         } while (*pz != NUL);
150 
151     found_nul:
152         res = malloc(sizeof(*res) + (pz - (cc_t*)str)
153                      + (max_token_ct * sizeof(ch_t*)));
154     }
155 
156     if (res == NULL)
157         errno = ENOMEM;
158     else res->tkn_list[0] = (ch_t*)(res->tkn_list + (max_token_ct - 1));
159 
160     return res;
161 
162     enoent_res:
163 
164     errno = ENOENT;
165     return NULL;
166 }
167 
168 /*=export_func ao_string_tokenize
169  *
170  * what: tokenize an input string
171  *
172  * arg:  + char const* + string + string to be tokenized +
173  *
174  * ret_type:  token_list_t*
175  * ret_desc:  pointer to a structure that lists each token
176  *
177  * doc:
178  *
179  * This function will convert one input string into a list of strings.
180  * The list of strings is derived by separating the input based on
181  * white space separation.  However, if the input contains either single
182  * or double quote characters, then the text after that character up to
183  * a matching quote will become the string in the list.
184  *
185  *  The returned pointer should be deallocated with @code{free(3C)} when
186  *  are done using the data.  The data are placed in a single block of
187  *  allocated memory.  Do not deallocate individual token/strings.
188  *
189  *  The structure pointed to will contain at least these two fields:
190  *  @table @samp
191  *  @item tkn_ct
192  *  The number of tokens found in the input string.
193  *  @item tok_list
194  *  An array of @code{tkn_ct + 1} pointers to substring tokens, with
195  *  the last pointer set to NULL.
196  *  @end table
197  *
198  * There are two types of quoted strings: single quoted (@code{'}) and
199  * double quoted (@code{"}).  Singly quoted strings are fairly raw in that
200  * escape characters (@code{\\}) are simply another character, except when
201  * preceding the following characters:
202  * @example
203  * @code{\\}  double backslashes reduce to one
204  * @code{'}   incorporates the single quote into the string
205  * @code{\n}  suppresses both the backslash and newline character
206  * @end example
207  *
208  * Double quote strings are formed according to the rules of string
209  * constants in ANSI-C programs.
210  *
211  * example:
212  * @example
213  *    #include <stdlib.h>
214  *    int ix;
215  *    token_list_t* ptl = ao_string_tokenize(some_string)
216  *    for (ix = 0; ix < ptl->tkn_ct; ix++)
217  *       do_something_with_tkn(ptl->tkn_list[ix]);
218  *    free(ptl);
219  * @end example
220  * Note that everything is freed with the one call to @code{free(3C)}.
221  *
222  * err:
223  *  NULL is returned and @code{errno} will be set to indicate the problem:
224  *  @itemize @bullet
225  *  @item
226  *  @code{EINVAL} - There was an unterminated quoted string.
227  *  @item
228  *  @code{ENOENT} - The input string was empty.
229  *  @item
230  *  @code{ENOMEM} - There is not enough memory.
231  *  @end itemize
232 =*/
233 token_list_t*
234 ao_string_tokenize(char const* str)
235 {
236     token_list_t* res = alloc_token_list(str);
237     ch_t* pzDest;
238 
239     /*
240      *  Now copy each token into the output buffer.
241      */
242     if (res == NULL)
243         return res;
244 
245     pzDest = (ch_t*)(res->tkn_list[0]);
246     res->tkn_ct  = 0;
247 
248     do  {
249         res->tkn_list[ res->tkn_ct++ ] = pzDest;
250         for (;;) {
251             int ch = (ch_t)*str;
252             if (IS_WHITESPACE_CHAR(ch)) {
253             found_white_space:
254                 while (IS_WHITESPACE_CHAR(*++str))  ;
255                 break;
256             }
257 
258             switch (ch) {
259             case '"':
260                 copy_cooked(&pzDest, &str);
261                 if (str == NULL) {
262                     free(res);
263                     errno = EINVAL;
264                     return NULL;
265                 }
266                 if (IS_WHITESPACE_CHAR(*str))
267                     goto found_white_space;
268                 break;
269 
270             case '\'':
271                 copy_raw(&pzDest, &str);
272                 if (str == NULL) {
273                     free(res);
274                     errno = EINVAL;
275                     return NULL;
276                 }
277                 if (IS_WHITESPACE_CHAR(*str))
278                     goto found_white_space;
279                 break;
280 
281             case NUL:
282                 goto copy_done;
283 
284             default:
285                 str++;
286                 *(pzDest++) = ch;
287             }
288         } copy_done:;
289 
290         /*
291          * NUL terminate the last token and see if we have any more tokens.
292          */
293         *(pzDest++) = NUL;
294     } while (*str != NUL);
295 
296     res->tkn_list[ res->tkn_ct ] = NULL;
297 
298     return res;
299 }
300 
301 #ifdef TEST
302 #include <stdio.h>
303 #include <string.h>
304 
305 int
306 main(int argc, char** argv)
307 {
308     if (argc == 1) {
309         printf("USAGE:  %s arg [ ... ]\n", *argv);
310         return 1;
311     }
312     while (--argc > 0) {
313         char* arg = *(++argv);
314         token_list_t* p = ao_string_tokenize(arg);
315         if (p == NULL) {
316             printf("Parsing string ``%s'' failed:\n\terrno %d (%s)\n",
317                    arg, errno, strerror(errno));
318         } else {
319             int ix = 0;
320             printf("Parsed string ``%s''\ninto %d tokens:\n", arg, p->tkn_ct);
321             do {
322                 printf(" %3d:  ``%s''\n", ix+1, p->tkn_list[ix]);
323             } while (++ix < p->tkn_ct);
324             free(p);
325         }
326     }
327     return 0;
328 }
329 #endif
330 
331 /*
332  * Local Variables:
333  * mode: C
334  * c-file-style: "stroustrup"
335  * indent-tabs-mode: nil
336  * End:
337  * end of autoopts/tokenize.c */
338