xref: /netbsd-src/external/bsd/ntp/dist/sntp/libopts/tokenize.c (revision b1c86f5f087524e68db12794ee9c3e3da1ab17a0)
1 /*	$NetBSD: tokenize.c,v 1.1.1.1 2009/12/13 16:57:23 kardel Exp $	*/
2 
3 /*
4  *  This file defines the string_tokenize interface
5  * Time-stamp:      "2007-11-12 20:40:36 bkorb"
6  *
7  *  This file is part of AutoOpts, a companion to AutoGen.
8  *  AutoOpts is free software.
9  *  AutoOpts is copyright (c) 1992-2009 by Bruce Korb - all rights reserved
10  *
11  *  AutoOpts is available under any one of two licenses.  The license
12  *  in use must be one of these two and the choice is under the control
13  *  of the user of the license.
14  *
15  *   The GNU Lesser General Public License, version 3 or later
16  *      See the files "COPYING.lgplv3" and "COPYING.gplv3"
17  *
18  *   The Modified Berkeley Software Distribution License
19  *      See the file "COPYING.mbsd"
20  *
21  *  These files have the following md5sums:
22  *
23  *  43b91e8ca915626ed3818ffb1b71248b pkg/libopts/COPYING.gplv3
24  *  06a1a2e4760c90ea5e1dad8dfaac4d39 pkg/libopts/COPYING.lgplv3
25  *  66a5cedaf62c4b2637025f049f9b826f pkg/libopts/COPYING.mbsd
26  */
27 
28 #include <errno.h>
29 #include <stdlib.h>
30 
31 #define cc_t   const unsigned char
32 #define ch_t   unsigned char
33 
34 /* = = = START-STATIC-FORWARD = = = */
35 /* static forward declarations maintained by mk-fwd */
36 static void
37 copy_cooked( ch_t** ppDest, char const ** ppSrc );
38 
39 static void
40 copy_raw( ch_t** ppDest, char const ** ppSrc );
41 /* = = = END-STATIC-FORWARD = = = */
42 
43 static void
44 copy_cooked( ch_t** ppDest, char const ** ppSrc )
45 {
46     ch_t* pDest = (ch_t*)*ppDest;
47     const ch_t* pSrc  = (const ch_t*)(*ppSrc + 1);
48 
49     for (;;) {
50         ch_t ch = *(pSrc++);
51         switch (ch) {
52         case NUL:   *ppSrc = NULL; return;
53         case '"':   goto done;
54         case '\\':
55             pSrc += ao_string_cook_escape_char( (char*)pSrc, (char*)&ch, 0x7F );
56             if (ch == 0x7F)
57                 break;
58             /* FALLTHROUGH */
59 
60         default:
61             *(pDest++) = ch;
62         }
63     }
64 
65  done:
66     *ppDest = (ch_t*)pDest; /* next spot for storing character */
67     *ppSrc  = (char const *)pSrc;  /* char following closing quote    */
68 }
69 
70 
71 static void
72 copy_raw( ch_t** ppDest, char const ** ppSrc )
73 {
74     ch_t* pDest = *ppDest;
75     cc_t* pSrc  = (cc_t*) (*ppSrc + 1);
76 
77     for (;;) {
78         ch_t ch = *(pSrc++);
79         switch (ch) {
80         case NUL:   *ppSrc = NULL; return;
81         case '\'':  goto done;
82         case '\\':
83             /*
84              *  *Four* escapes are handled:  newline removal, escape char
85              *  quoting and apostrophe quoting
86              */
87             switch (*pSrc) {
88             case NUL:   *ppSrc = NULL; return;
89             case '\r':
90                 if (*(++pSrc) == '\n')
91                     ++pSrc;
92                 continue;
93 
94             case '\n':
95                 ++pSrc;
96                 continue;
97 
98             case '\'':
99                 ch = '\'';
100                 /* FALLTHROUGH */
101 
102             case '\\':
103                 ++pSrc;
104                 break;
105             }
106             /* FALLTHROUGH */
107 
108         default:
109             *(pDest++) = ch;
110         }
111     }
112 
113  done:
114     *ppDest = pDest; /* next spot for storing character */
115     *ppSrc  = (char const *) pSrc;  /* char following closing quote    */
116 }
117 
118 
119 /*=export_func ao_string_tokenize
120  *
121  * what: tokenize an input string
122  *
123  * arg:  + char const* + string + string to be tokenized +
124  *
125  * ret_type:  token_list_t*
126  * ret_desc:  pointer to a structure that lists each token
127  *
128  * doc:
129  *
130  * This function will convert one input string into a list of strings.
131  * The list of strings is derived by separating the input based on
132  * white space separation.  However, if the input contains either single
133  * or double quote characters, then the text after that character up to
134  * a matching quote will become the string in the list.
135  *
 *  The returned pointer should be deallocated with @code{free(3C)} when
 *  you are done using the data.  The data are placed in a single block of
138  *  allocated memory.  Do not deallocate individual token/strings.
139  *
140  *  The structure pointed to will contain at least these two fields:
141  *  @table @samp
142  *  @item tkn_ct
143  *  The number of tokens found in the input string.
 *  @item tkn_list
145  *  An array of @code{tkn_ct + 1} pointers to substring tokens, with
146  *  the last pointer set to NULL.
147  *  @end table
148  *
149  * There are two types of quoted strings: single quoted (@code{'}) and
150  * double quoted (@code{"}).  Singly quoted strings are fairly raw in that
151  * escape characters (@code{\\}) are simply another character, except when
152  * preceding the following characters:
153  * @example
154  * @code{\\}  double backslashes reduce to one
155  * @code{'}   incorporates the single quote into the string
156  * @code{\n}  suppresses both the backslash and newline character
157  * @end example
158  *
159  * Double quote strings are formed according to the rules of string
160  * constants in ANSI-C programs.
161  *
162  * example:
163  * @example
164  *    #include <stdlib.h>
165  *    int ix;
 *    token_list_t* ptl = ao_string_tokenize( some_string );
167  *    for (ix = 0; ix < ptl->tkn_ct; ix++)
168  *       do_something_with_tkn( ptl->tkn_list[ix] );
169  *    free( ptl );
170  * @end example
171  * Note that everything is freed with the one call to @code{free(3C)}.
172  *
173  * err:
174  *  NULL is returned and @code{errno} will be set to indicate the problem:
175  *  @itemize @bullet
176  *  @item
177  *  @code{EINVAL} - There was an unterminated quoted string.
178  *  @item
179  *  @code{ENOENT} - The input string was empty.
180  *  @item
181  *  @code{ENOMEM} - There is not enough memory.
182  *  @end itemize
183 =*/
184 token_list_t*
185 ao_string_tokenize( char const* str )
186 {
187     int max_token_ct = 1; /* allow for trailing NUL on string */
188     token_list_t* res;
189 
190     if (str == NULL)  goto bogus_str;
191 
192     /*
193      *  Trim leading white space.  Use "ENOENT" and a NULL return to indicate
194      *  an empty string was passed.
195      */
196     while (IS_WHITESPACE_CHAR(*str))  str++;
197     if (*str == NUL) {
198     bogus_str:
199         errno = ENOENT;
200         return NULL;
201     }
202 
203     /*
204      *  Take an approximate count of tokens.  If no quoted strings are used,
205      *  it will be accurate.  If quoted strings are used, it will be a little
206      *  high and we'll squander the space for a few extra pointers.
207      */
208     {
209         cc_t* pz = (cc_t*)str;
210 
211         do {
212             max_token_ct++;
213             while (! IS_WHITESPACE_CHAR(*++pz))
214                 if (*pz == NUL) goto found_nul;
215             while (IS_WHITESPACE_CHAR(*pz))  pz++;
216         } while (*pz != NUL);
217 
218     found_nul:
219         ;
220     }
221 
222     res = malloc( sizeof(*res) + strlen(str) + (max_token_ct * sizeof(ch_t*)) );
223     if (res == NULL) {
224         errno = ENOMEM;
225         return res;
226     }
227 
228     /*
229      *  Now copy each token into the output buffer.
230      */
231     {
232         ch_t* pzDest = (ch_t*)(res->tkn_list + (max_token_ct + 1));
233         res->tkn_ct  = 0;
234 
235         do  {
236             res->tkn_list[ res->tkn_ct++ ] = pzDest;
237             for (;;) {
238                 int ch = (ch_t)*str;
239                 if (IS_WHITESPACE_CHAR(ch)) {
240                 found_white_space:
241                     while (IS_WHITESPACE_CHAR(*++str))  ;
242                     break;
243                 }
244 
245                 switch (ch) {
246                 case '"':
247                     copy_cooked( &pzDest, &str );
248                     if (str == NULL) {
249                         free(res);
250                         errno = EINVAL;
251                         return NULL;
252                     }
253                     if (IS_WHITESPACE_CHAR(*str))
254                         goto found_white_space;
255                     break;
256 
257                 case '\'':
258                     copy_raw( &pzDest, &str );
259                     if (str == NULL) {
260                         free(res);
261                         errno = EINVAL;
262                         return NULL;
263                     }
264                     if (IS_WHITESPACE_CHAR(*str))
265                         goto found_white_space;
266                     break;
267 
268                 case NUL:
269                     goto copy_done;
270 
271                 default:
272                     str++;
273                     *(pzDest++) = ch;
274                 }
275             } copy_done:;
276 
277             /*
278              * NUL terminate the last token and see if we have any more tokens.
279              */
280             *(pzDest++) = NUL;
281         } while (*str != NUL);
282 
283         res->tkn_list[ res->tkn_ct ] = NULL;
284     }
285 
286     return res;
287 }
288 
289 #ifdef TEST
290 #include <stdio.h>
291 #include <string.h>
292 
293 int
294 main( int argc, char** argv )
295 {
296     if (argc == 1) {
297         printf("USAGE:  %s arg [ ... ]\n", *argv);
298         return 1;
299     }
300     while (--argc > 0) {
301         char* arg = *(++argv);
302         token_list_t* p = ao_string_tokenize( arg );
303         if (p == NULL) {
304             printf( "Parsing string ``%s'' failed:\n\terrno %d (%s)\n",
305                     arg, errno, strerror( errno ));
306         } else {
307             int ix = 0;
308             printf( "Parsed string ``%s''\ninto %d tokens:\n", arg, p->tkn_ct );
309             do {
310                 printf( " %3d:  ``%s''\n", ix+1, p->tkn_list[ix] );
311             } while (++ix < p->tkn_ct);
312             free(p);
313         }
314     }
315     return 0;
316 }
317 #endif
318 
319 /*
320  * Local Variables:
321  * mode: C
322  * c-file-style: "stroustrup"
323  * indent-tabs-mode: nil
324  * End:
325  * end of autoopts/tokenize.c */
326