1 /*
2 * Copyright (c) 2016, 2018, 2023 Kristaps Dzonsons <kristaps@bsd.lv>
3 *
4 * Permission to use, copy, modify, and distribute this software for any
5 * purpose with or without fee is hereby granted, provided that the above
6 * copyright notice and this permission notice appear in all copies.
7 *
8 * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHORS DISCLAIM ALL WARRANTIES
9 * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
10 * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR
11 * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
12 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
13 * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
14 * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
15 */
16 #include "config.h"
17
18 #if HAVE_SYS_QUEUE
19 # include <sys/queue.h>
20 #endif
21
22 #include <assert.h>
23 #include <ctype.h>
24 #if HAVE_ERR
25 # include <err.h>
26 #endif
27 #include <getopt.h>
28 #if HAVE_SANDBOX_INIT
29 # include <sandbox.h>
30 #endif
31 #include <search.h>
32 #include <stdint.h> /* uintptr_t */
33 #include <stdio.h>
34 #include <stdlib.h>
35 #include <string.h>
36 #include <unistd.h>
37
38 /*
39 * Phase of parsing input file.
40 */
enum phase {
	PHASE_INIT = 0, /* waiting to encounter definition */
	PHASE_KEYS, /* have definition, now keywords */
	PHASE_DESC, /* have keywords, now description */
	PHASE_SEEALSO, /* inside a "see also:" clause of the description */
	PHASE_DECL /* have description, now declarations */
};
48
49 /*
50 * What kind of declaration (preliminary analysis).
51 */
enum decltype {
	DECLTYPE_CPP, /* pre-processor */
	DECLTYPE_C, /* semicolon-closed non-preprocessor */
	/*
	 * Non-preprocessor, no semicolon: assigned when a multi-line
	 * declaration is still open at a blank line or at a #define.
	 */
	DECLTYPE_NEITHER /* non-preprocessor, no semicolon */
};
57
58 /*
59 * In variables and function declarations, we toss these.
60 */
/* Each value indexes the matching string in preprocs[]. */
enum preproc {
	PREPROC_SQLITE_API,
	PREPROC_SQLITE_DEPRECATED,
	PREPROC_SQLITE_EXPERIMENTAL,
	PREPROC_SQLITE_EXTERN,
	PREPROC_SQLITE_STDCALL,
	PREPROC__MAX /* number of entries, not a real macro */
};
69
70 /*
71 * HTML tags that we recognise.
72 */
/*
 * One value per row of tags[].
 * The *_OPEN_ATTRS variants name rows whose HTML key ends in a space,
 * i.e., an opening tag that is followed by attributes.
 */
enum tag {
	TAG_A_CLOSE,
	TAG_A_OPEN_ATTRS,
	TAG_B_CLOSE,
	TAG_B_OPEN,
	TAG_BLOCK_CLOSE,
	TAG_BLOCK_OPEN,
	TAG_BR_OPEN,
	TAG_DD_CLOSE,
	TAG_DD_OPEN,
	TAG_DL_CLOSE,
	TAG_DL_OPEN,
	TAG_DT_CLOSE,
	TAG_DT_OPEN,
	TAG_EM_CLOSE,
	TAG_EM_OPEN,
	TAG_H3_CLOSE,
	TAG_H3_OPEN,
	TAG_I_CLOSE,
	TAG_I_OPEN,
	TAG_LI_CLOSE,
	TAG_LI_OPEN,
	TAG_LI_OPEN_ATTRS,
	TAG_OL_CLOSE,
	TAG_OL_OPEN,
	TAG_P_OPEN,
	TAG_PRE_CLOSE,
	TAG_PRE_OPEN,
	TAG_SPAN_CLOSE,
	TAG_SPAN_OPEN_ATTRS,
	TAG_TABLE_CLOSE,
	TAG_TABLE_OPEN,
	TAG_TABLE_OPEN_ATTRS,
	TAG_TD_CLOSE,
	TAG_TD_OPEN,
	TAG_TD_OPEN_ATTRS,
	TAG_TH_CLOSE,
	TAG_TH_OPEN,
	TAG_TH_OPEN_ATTRS,
	TAG_TR_CLOSE,
	TAG_TR_OPEN,
	TAG_U_CLOSE,
	TAG_U_OPEN,
	TAG_UL_CLOSE,
	TAG_UL_OPEN,
	TAG__MAX /* number of tags, also the size of tags[] */
};
120
/* Queue head types: "defnq" holds struct defn, "declq" holds struct decl. */
TAILQ_HEAD(defnq, defn);
TAILQ_HEAD(declq, decl);
123
124 /*
125 * A declaration of type DECLTYPE_CPP or DECLTYPE_C.
126 * These need not be unique (if ifdef'd).
127 */
struct decl {
	enum decltype type; /* type of declaration */
	char *text; /* text */
	size_t textsz; /* strlen(text) */
	TAILQ_ENTRY(decl) entries; /* link in the owning defn's dcqhead */
};
134
135 /*
136 * A definition is basically the manpage contents.
137 */
struct defn {
	char *name; /* really Nd */
	TAILQ_ENTRY(defn) entries; /* link in the parser's dqhead */
	char *desc; /* long description */
	size_t descsz; /* strlen(desc) */
	char *fulldesc; /* raw declaration lines, each followed by a newline */
	size_t fulldescsz; /* size bookkeeping for fulldesc, kept by decl() */
	struct declq dcqhead; /* declarations */
	int multiline; /* used when parsing: declaration still open */
	int instruct; /* used when parsing: open-brace depth */
	const char *fn; /* parsed from file */
	size_t ln; /* parsed at line */
	int postprocessed; /* good for emission? */
	char *dt; /* manpage title */
	char **nms; /* manpage names */
	size_t nmsz; /* number of names */
	char *fname; /* manpage filename */
	char *keybuf; /* raw keywords */
	size_t keybufsz; /* length of "keybuf" */
	char *seealso; /* see also tags */
	size_t seealsosz; /* length of seealso */
	char **xrs; /* parsed "see also" references */
	size_t xrsz; /* number of references */
	char **keys; /* parsed keywords */
	size_t keysz; /* number of keywords */
};
164
165 /*
166 * Entire parse routine.
167 */
struct parse {
	enum phase phase; /* phase of parse */
	size_t ln; /* line number (used in diagnostics) */
	const char *fn; /* open file (used in diagnostics) */
	struct defnq dqhead; /* definitions, in order of appearance */
};
174
175 /*
176 * How to handle HTML tags we find in the text.
177 */
struct taginfo {
	const char *html; /* HTML to key on */
	const char *mdoc; /* generate mdoc(7) */
	unsigned int flags; /* bit-wise OR of the TAGINFO_* values below */
#define TAGINFO_NOBR 0x01 /* follow w/space, not newline */
#define TAGINFO_NOOP 0x02 /* just strip out */
#define TAGINFO_NOSP 0x04 /* follow w/o space or newline */
#define TAGINFO_INLINE 0x08 /* inline block (notused) */
#define TAGINFO_ATTRS 0x10 /* ignore attributes */
};
188
189 static const struct taginfo tags[TAG__MAX] = {
190 { "</a>", "", TAGINFO_INLINE }, /* TAG_A_CLOSE */
191 { "<a ", "", TAGINFO_INLINE | TAGINFO_ATTRS }, /* TAG_A_OPEN_ATTRS */
192 { "</b>", "\\fP", TAGINFO_INLINE }, /* TAG_B_CLOSE */
193 { "<b>", "\\fB", TAGINFO_INLINE }, /* TAG_B_OPEN */
194 { "<br>", " ", TAGINFO_INLINE }, /* TAG_BR_OPEN */
195 { "</blockquote>", ".Ed\n.Pp", 0 }, /* TAG_BLOCK_CLOSE */
196 { "<blockquote>", ".Bd -ragged", 0 }, /* TAG_BLOCK_OPEN */
197 { "</dd>", "", TAGINFO_NOOP }, /* TAG_DD_CLOSE */
198 { "<dd>", "", TAGINFO_NOBR | TAGINFO_NOSP }, /* TAG_DD_OPEN */
199 { "</dl>", ".El\n.Pp", 0 }, /* TAG_DL_CLOSE */
200 { "<dl>", ".Bl -tag -width Ds", 0 }, /* TAG_DL_OPEN */
201 { "</dt>", "", TAGINFO_NOBR | TAGINFO_NOSP}, /* TAG_DT_CLOSE */
202 { "<dt>", ".It", TAGINFO_NOBR }, /* TAG_DT_OPEN */
203 { "</em>", "\\fP", TAGINFO_INLINE }, /* TAG_EM_CLOSE */
204 { "<em>", "\\fB", TAGINFO_INLINE }, /* TAG_EM_OPEN */
205 { "</h3>", "", TAGINFO_NOBR | TAGINFO_NOSP}, /* TAG_H3_CLOSE */
206 { "<h3>", ".Ss", TAGINFO_NOBR }, /* TAG_H3_OPEN */
207 { "</i>", "\\fP", TAGINFO_INLINE }, /* TAG_I_CLOSE */
208 { "<i>", "\\fI", TAGINFO_INLINE }, /* TAG_I_OPEN */
209 { "</li>", "", TAGINFO_NOOP }, /* TAG_LI_CLOSE */
210 { "<li>", ".It", 0 }, /* TAG_LI_OPEN */
211 { "<li ", ".It", TAGINFO_ATTRS }, /* TAG_LI_OPEN_ATTRS */
212 { "</ol>", ".El\n.Pp", 0 }, /* TAG_OL_CLOSE */
213 { "<ol>", ".Bl -enum", 0 }, /* TAG_OL_OPEN */
214 { "<p>", ".Pp", 0 }, /* TAG_P_OPEN */
215 { "</pre>", ".Ed\n.Pp", 0 }, /* TAG_PRE_CLOSE */
216 { "<pre>", ".Bd -literal", 0 }, /* TAG_PRE_OPEN */
217 { "</span>", "", TAGINFO_INLINE }, /* TAG_SPAN_CLOSE */
218 { "<span ", "", TAGINFO_INLINE | TAGINFO_ATTRS }, /* TAG_SPAN_OPEN_ATTRS */
219 { "</table>", ".Pp", 0 }, /* TAG_TABLE_CLOSE */
220 { "<table>", ".Pp", 0 }, /* TAG_TABLE_OPEN */
221 { "<table ", ".Pp", TAGINFO_ATTRS }, /* TAG_TABLE_OPEN_ATTRS */
222 { "</td>", "", TAGINFO_NOOP }, /* TAG_TD_CLOSE */
223 { "<td>", " ", TAGINFO_INLINE }, /* TAG_TD_OPEN */
224 { "<td ", " ", TAGINFO_INLINE | TAGINFO_ATTRS}, /* TAG_TD_OPEN_ATTRS */
225 { "</th>", "", TAGINFO_NOOP }, /* TAG_TH_CLOSE */
226 { "<th>", " ", TAGINFO_INLINE }, /* TAG_TH_OPEN */
227 { "<th ", " ", TAGINFO_INLINE | TAGINFO_ATTRS}, /* TAG_TH_OPEN_ATTRS */
228 { "</tr>", "", TAGINFO_NOOP}, /* TAG_TR_CLOSE */
229 { "<tr>", "", TAGINFO_NOBR }, /* TAG_TR_OPEN */
230 { "</u>", "\\fP", TAGINFO_INLINE }, /* TAG_U_CLOSE */
231 { "<u>", "\\fI", TAGINFO_INLINE }, /* TAG_U_OPEN */
232 { "</ul>", ".El\n.Pp", 0 }, /* TAG_UL_CLOSE */
233 { "<ul>", ".Bl -bullet", 0 }, /* TAG_UL_OPEN */
234 };
235
236 static const char *const preprocs[TAG__MAX] = {
237 "SQLITE_API", /* PREPROC_SQLITE_API */
238 "SQLITE_DEPRECATED", /* PREPROC_SQLITE_DEPRECATED */
239 "SQLITE_EXPERIMENTAL", /* PREPROC_SQLITE_EXPERIMENTAL */
240 "SQLITE_EXTERN", /* PREPROC_SQLITE_EXTERN */
241 "SQLITE_STDCALL", /* PREPROC_SQLITE_STDCALL */
242 };
243
/* Verbose reporting: when non-zero, extra parse warnings are printed. */
static int verbose;

/* Don't output any files: use stdout. */
static int nofile;

/*
 * Print out only filename.
 * When set, the manpage filename is built without the directory prefix.
 */
static int filename;
252
/*
 * Append "len" bytes of "cp" to the NUL-terminated buffer *etext, whose
 * current length is *etextsz.
 * A single space is inserted between the old and the new text unless
 * the old text is empty or already ends with a space.
 * Both *etext and *etextsz are updated; the result is NUL-terminated.
 * Exits with err(3) if memory cannot be allocated.
 */
static void
decl_function_add(struct parse *p __unused, char **etext,
	size_t *etextsz, const char *cp, size_t len)
{
	char	*buf;
	size_t	 pad;

	/* Never look at the byte before an empty buffer. */

	pad = (*etextsz > 0 && (*etext)[*etextsz - 1] != ' ') ? 1 : 0;

	/* One allocation for the separator, the new text and the NUL. */

	buf = realloc(*etext, *etextsz + pad + len + 1);
	if (buf == NULL)
		err(1, NULL);

	if (pad)
		buf[(*etextsz)++] = ' ';
	memcpy(buf + *etextsz, cp, len);
	*etextsz += len;
	buf[*etextsz] = '\0';
	*etext = buf;
}
272
/*
 * Start a fresh text buffer: store in *etext a newly allocated,
 * NUL-terminated copy of the first "len" bytes of "cp" and set
 * *etextsz to "len".
 * Exits with err(3) if memory cannot be allocated.
 */
static void
decl_function_copy(struct parse *p __unused, char **etext,
	size_t *etextsz, const char *cp, size_t len)
{
	char	*buf;

	if ((buf = malloc(len + 1)) == NULL)
		err(1, NULL);

	memcpy(buf, cp, len);
	buf[len] = '\0';

	*etext = buf;
	*etextsz = len;
}
285
286 /*
287 * A C function (or variable, or whatever).
288 * This is more specifically any non-preprocessor text.
289 */
290 static int
decl_function(struct parse * p,const char * cp,size_t len)291 decl_function(struct parse *p, const char *cp, size_t len)
292 {
293 char *ep, *lcp, *rcp;
294 const char *ncp;
295 size_t nlen;
296 struct defn *d;
297 struct decl *e;
298
299 /* Fetch current interface definition. */
300 d = TAILQ_LAST(&p->dqhead, defnq);
301 assert(NULL != d);
302
303 /*
304 * Since C tokens are semicolon-separated, we may be invoked any
305 * number of times per a single line.
306 */
307 again:
308 while (isspace((unsigned char)*cp)) {
309 cp++;
310 len--;
311 }
312 if (*cp == '\0')
313 return(1);
314
315 /* Whether we're a continuation clause. */
316 if (d->multiline) {
317 /* This might be NULL if we're not a continuation. */
318 e = TAILQ_LAST(&d->dcqhead, declq);
319 assert(DECLTYPE_C == e->type);
320 assert(NULL != e);
321 assert(NULL != e->text);
322 assert(e->textsz);
323 } else {
324 assert(d->instruct == 0);
325 e = calloc(1, sizeof(struct decl));
326 if (e == NULL)
327 err(1, NULL);
328 e->type = DECLTYPE_C;
329 TAILQ_INSERT_TAIL(&d->dcqhead, e, entries);
330 }
331
332 /*
333 * We begin by seeing if there's a semicolon on this line.
334 * If there is, we'll need to do some special handling.
335 */
336 ep = strchr(cp, ';');
337 lcp = strchr(cp, '{');
338 rcp = strchr(cp, '}');
339
340 /* We're only a partial statement (i.e., no closure). */
341 if (ep == NULL && d->multiline) {
342 assert(e->text != NULL);
343 assert(e->textsz > 0);
344 /* Is a struct starting or ending here? */
345 if (d->instruct && NULL != rcp)
346 d->instruct--;
347 else if (NULL != lcp)
348 d->instruct++;
349 decl_function_add(p, &e->text, &e->textsz, cp, len);
350 return(1);
351 } else if (ep == NULL && !d->multiline) {
352 d->multiline = 1;
353 /* Is a structure starting in this line? */
354 if (NULL != lcp &&
355 (rcp == NULL || rcp < lcp))
356 d->instruct++;
357 decl_function_copy(p, &e->text, &e->textsz, cp, len);
358 return(1);
359 }
360
361 /* Position ourselves after the semicolon. */
362 assert(NULL != ep);
363 ncp = cp;
364 nlen = (ep - cp) + 1;
365 cp = ep + 1;
366 len -= nlen;
367
368 if (d->multiline) {
369 assert(NULL != e->text);
370 /* Don't stop the multi-line if we're in a struct. */
371 if (d->instruct == 0) {
372 if (lcp == NULL || lcp > cp)
373 d->multiline = 0;
374 } else if (NULL != rcp && rcp < cp)
375 if (--d->instruct == 0)
376 d->multiline = 0;
377 decl_function_add(p, &e->text, &e->textsz, ncp, nlen);
378 } else {
379 assert(e->text == NULL);
380 if (NULL != lcp && lcp < cp) {
381 d->multiline = 1;
382 d->instruct++;
383 }
384 decl_function_copy(p, &e->text, &e->textsz, ncp, nlen);
385 }
386
387 goto again;
388 }
389
390 /*
391 * A definition is just #define followed by space followed by the name,
392 * then the value of that name.
393 * We ignore the latter.
394 * FIXME: this does not understand multi-line CPP, but I don't think
395 * there are any instances of that in sqlite3.h.
396 */
397 static int
decl_define(struct parse * p,const char * cp,size_t len)398 decl_define(struct parse *p, const char *cp, size_t len)
399 {
400 struct defn *d;
401 struct decl *e;
402 size_t sz;
403
404 while (isspace((unsigned char)*cp)) {
405 cp++;
406 len--;
407 }
408 if (len == 0) {
409 warnx("%s:%zu: empty pre-processor "
410 "constant", p->fn, p->ln);
411 return(1);
412 }
413
414 d = TAILQ_LAST(&p->dqhead, defnq);
415 assert(NULL != d);
416
417 /*
418 * We're parsing a preprocessor definition, but we're still
419 * waiting on a semicolon from a function definition.
420 * It might be a comment or an error.
421 */
422 if (d->multiline) {
423 if (verbose)
424 warnx("%s:%zu: multiline declaration "
425 "still open", p->fn, p->ln);
426 e = TAILQ_LAST(&d->dcqhead, declq);
427 assert(NULL != e);
428 e->type = DECLTYPE_NEITHER;
429 d->multiline = d->instruct = 0;
430 }
431
432 sz = 0;
433 while (cp[sz] != '\0' && !isspace((unsigned char)cp[sz]))
434 sz++;
435
436 e = calloc(1, sizeof(struct decl));
437 if (e == NULL)
438 err(1, NULL);
439 e->type = DECLTYPE_CPP;
440 e->text = calloc(1, sz + 1);
441 if (e->text == NULL)
442 err(1, NULL);
443 strlcpy(e->text, cp, sz + 1);
444 e->textsz = sz;
445 TAILQ_INSERT_TAIL(&d->dcqhead, e, entries);
446 return(1);
447 }
448
449 /*
450 * A declaration is a function, variable, preprocessor definition, or
451 * really anything else until we reach a blank line.
452 */
453 static void
decl(struct parse * p,const char * cp,size_t len)454 decl(struct parse *p, const char *cp, size_t len)
455 {
456 struct defn *d;
457 struct decl *e;
458 const char *oldcp;
459 size_t oldlen;
460
461 oldcp = cp;
462 oldlen = len;
463
464 while (isspace((unsigned char)*cp)) {
465 cp++;
466 len--;
467 }
468
469 d = TAILQ_LAST(&p->dqhead, defnq);
470 assert(NULL != d);
471
472 /* Check closure. */
473 if (*cp == '\0') {
474 p->phase = PHASE_INIT;
475 /* Check multiline status. */
476 if (d->multiline) {
477 if (verbose)
478 warnx("%s:%zu: multiline declaration "
479 "still open", p->fn, p->ln);
480 e = TAILQ_LAST(&d->dcqhead, declq);
481 assert(NULL != e);
482 e->type = DECLTYPE_NEITHER;
483 d->multiline = d->instruct = 0;
484 }
485 return;
486 }
487
488 d->fulldesc = realloc(d->fulldesc,
489 d->fulldescsz + oldlen + 2);
490 if (d->fulldesc == NULL)
491 err(1, NULL);
492 if (d->fulldescsz == 0)
493 d->fulldesc[0] = '\0';
494 d->fulldescsz += oldlen + 2;
495 strlcat(d->fulldesc, oldcp, d->fulldescsz);
496 strlcat(d->fulldesc, "\n", d->fulldescsz);
497
498 /*
499 * Catch preprocessor defines, but discard all other types of
500 * preprocessor statements.
501 * We might already be in the middle of a declaration (a
502 * function declaration), but that's ok.
503 */
504
505 if (*cp == '#') {
506 len--;
507 cp++;
508 while (isspace((unsigned char)*cp)) {
509 len--;
510 cp++;
511 }
512 if (strncmp(cp, "define", 6) == 0)
513 decl_define(p, cp + 6, len - 6);
514 return;
515 }
516
517 /* Skip one-liner comments. */
518
519 if (len > 4 &&
520 cp[0] == '/' && cp[1] == '*' &&
521 cp[len - 2] == '*' && cp[len - 1] == '/')
522 return;
523
524 decl_function(p, cp, len);
525 }
526
527 /*
528 * Whether to end an interface description phase with an asterisk-slash.
529 * This is run within a phase already opened with slash-asterisk. It
530 * adjusts the parse state on ending a phase or syntax errors. It has
531 * various hacks around lacks syntax (e.g., starting single-asterisk
532 * instead of double-asterisk) found in the wild.
533 *
534 * Returns zero if not ending the phase, non-zero if ending.
535 */
536 static int
endphase(struct parse * p,const char * cp)537 endphase(struct parse *p, const char *cp)
538 {
539
540 if (*cp == '\0') {
541 /*
542 * Error: empty line.
543 */
544 warnx("%s:%zu: warn: unexpected empty line in "
545 "interface description", p->fn, p->ln);
546 p->phase = PHASE_INIT;
547 return 1;
548 } else if (strcmp(cp, "*/") == 0) {
549 /*
550 * End of the interface description.
551 */
552 p->phase = PHASE_DECL;
553 return 1;
554 } else if (!(cp[0] == '*' && cp[1] == '*')) {
555 /*
556 * Error: bad syntax, not end or continuation.
557 */
558 if (cp[0] == '*' && cp[1] == '\0') {
559 if (verbose)
560 warnx("%s:%zu: warn: ignoring "
561 "standalone asterisk "
562 "in interface description",
563 p->fn, p->ln);
564 return 0;
565 } else if (cp[0] == '*' && cp[1] == ' ') {
566 if (verbose)
567 warnx("%s:%zu: warn: ignoring "
568 "leading single asterisk "
569 "in interface description",
570 p->fn, p->ln);
571 return 0;
572 }
573 warnx("%s:%zu: warn: ambiguous leading characters in "
574 "interface description", p->fn, p->ln);
575 p->phase = PHASE_INIT;
576 return 1;
577 }
578
579 /* If here, at a continuation ('**'). */
580
581 return 0;
582 }
583
584 /*
585 * Parse a "SEE ALSO" phase, which can come at any point in the
586 * interface description (unlike what they claim).
587 */
588 static void
seealso(struct parse * p,const char * cp,size_t len)589 seealso(struct parse *p, const char *cp, size_t len)
590 {
591 struct defn *d;
592
593 if (endphase(p, cp) || len < 2)
594 return;
595
596 cp += 2;
597 len -= 2;
598
599 while (isspace((unsigned char)*cp)) {
600 cp++;
601 len--;
602 }
603
604 /* Blank line: back to description part. */
605 if (len == 0) {
606 p->phase = PHASE_DESC;
607 return;
608 }
609
610 /* Fetch current interface definition. */
611 d = TAILQ_LAST(&p->dqhead, defnq);
612 assert(NULL != d);
613
614 d->seealso = realloc(d->seealso,
615 d->seealsosz + len + 1);
616 memcpy(d->seealso + d->seealsosz, cp, len);
617 d->seealsosz += len;
618 d->seealso[d->seealsosz] = '\0';
619 }
620
621 /*
622 * A definition description is a block of text that we'll later format
623 * in mdoc(7).
624 * It extends from the name of the definition down to the declarations
625 * themselves.
626 */
627 static void
desc(struct parse * p,const char * cp,size_t len)628 desc(struct parse *p, const char *cp, size_t len)
629 {
630 struct defn *d;
631 size_t nsz;
632
633 if (endphase(p, cp) || len < 2)
634 return;
635
636 cp += 2;
637 len -= 2;
638
639 while (isspace((unsigned char)*cp)) {
640 cp++;
641 len--;
642 }
643
644 /* Fetch current interface definition. */
645
646 d = TAILQ_LAST(&p->dqhead, defnq);
647 assert(NULL != d);
648
649 /* Ignore leading blank lines. */
650
651 if (len == 0 && d->desc == NULL)
652 return;
653
654 /* Collect SEE ALSO clauses. */
655
656 if (strncasecmp(cp, "see also:", 9) == 0) {
657 cp += 9;
658 len -= 9;
659 while (isspace((unsigned char)*cp)) {
660 cp++;
661 len--;
662 }
663 p->phase = PHASE_SEEALSO;
664 d->seealso = realloc(d->seealso,
665 d->seealsosz + len + 1);
666 memcpy(d->seealso + d->seealsosz, cp, len);
667 d->seealsosz += len;
668 d->seealso[d->seealsosz] = '\0';
669 return;
670 }
671
672 /* White-space padding between lines. */
673
674 if (d->desc != NULL &&
675 d->descsz > 0 &&
676 d->desc[d->descsz - 1] != ' ' &&
677 d->desc[d->descsz - 1] != '\n') {
678 d->desc = realloc(d->desc, d->descsz + 2);
679 if (d->desc == NULL)
680 err(1, NULL);
681 d->descsz++;
682 strlcat(d->desc, " ", d->descsz + 1);
683 }
684
685 /* Either append the line of a newline, if blank. */
686
687 nsz = len == 0 ? 1 : len;
688 if (d->desc == NULL) {
689 assert(d->descsz == 0);
690 d->desc = calloc(1, nsz + 1);
691 if (d->desc == NULL)
692 err(1, NULL);
693 } else {
694 d->desc = realloc(d->desc, d->descsz + nsz + 1);
695 if (d->desc == NULL)
696 err(1, NULL);
697 }
698
699 d->descsz += nsz;
700 strlcat(d->desc, len == 0 ? "\n" : cp, d->descsz + 1);
701 }
702
703 /*
704 * Copy all KEYWORDS into a buffer.
705 */
706 static void
keys(struct parse * p,const char * cp,size_t len)707 keys(struct parse *p, const char *cp, size_t len)
708 {
709 struct defn *d;
710
711 if (endphase(p, cp) || len < 2)
712 return;
713
714 cp += 2;
715 len -= 2;
716 while (isspace((unsigned char)*cp)) {
717 cp++;
718 len--;
719 }
720
721 if (len == 0) {
722 p->phase = PHASE_DESC;
723 return;
724 } else if (strncmp(cp, "KEYWORDS:", 9))
725 return;
726
727 cp += 9;
728 len -= 9;
729
730 d = TAILQ_LAST(&p->dqhead, defnq);
731 assert(NULL != d);
732 d->keybuf = realloc(d->keybuf, d->keybufsz + len + 1);
733 if (d->keybuf == NULL)
734 err(1, NULL);
735 memcpy(d->keybuf + d->keybufsz, cp, len);
736 d->keybufsz += len;
737 d->keybuf[d->keybufsz] = '\0';
738 }
739
740 /*
741 * Initial state is where we're scanning forward to find commented
742 * instances of CAPI3REF.
743 */
744 static void
init(struct parse * p,const char * cp)745 init(struct parse *p, const char *cp)
746 {
747 struct defn *d;
748 size_t i, sz;
749
750 /* Look for comment hook. */
751
752 if (cp[0] != '*' || cp[1] != '*')
753 return;
754 cp += 2;
755 while (isspace((unsigned char)*cp))
756 cp++;
757
758 /* Look for beginning of definition. */
759
760 if (strncmp(cp, "CAPI3REF:", 9))
761 return;
762 cp += 9;
763 while (isspace((unsigned char)*cp))
764 cp++;
765 if (*cp == '\0') {
766 warnx("%s:%zu: warn: unexpected end of "
767 "interface definition", p->fn, p->ln);
768 return;
769 }
770
771 /* Add definition to list of existing ones. */
772
773 if ((d = calloc(1, sizeof(struct defn))) == NULL)
774 err(1, NULL);
775 if ((d->name = strdup(cp)) == NULL)
776 err(1, NULL);
777
778 /* Strip trailing spaces and periods. */
779
780 for (sz = strlen(d->name); sz > 0; sz--)
781 if (d->name[sz - 1] == '.' ||
782 d->name[sz - 1] == ' ')
783 d->name[sz - 1] = '\0';
784 else
785 break;
786
787 /*
788 * Un-title case. Use a simple heuristic where all words
789 * starting with an upper case letter followed by a not
790 * uppercase letter are lowercased.
791 */
792
793 for (i = 0; sz > 0 && i < sz - 1; i++)
794 if ((i == 0 || d->name[i - 1] == ' ') &&
795 isupper((unsigned char)d->name[i]) &&
796 !isupper((unsigned char)d->name[i + 1]) &&
797 !ispunct((unsigned char)d->name[i + 1]))
798 d->name[i] = tolower((unsigned char)d->name[i]);
799
800 d->fn = p->fn;
801 d->ln = p->ln;
802 p->phase = PHASE_KEYS;
803 TAILQ_INIT(&d->dcqhead);
804 TAILQ_INSERT_TAIL(&p->dqhead, d, entries);
805 }
806
/*
 * Non-zero if the character at "_cp" ends a name: one of ';', '[',
 * ')', '{', or a '(' that is not followed by '*' (which would start a
 * function pointer).
 * The argument is evaluated more than once.
 */
#define	BPOINT(_cp) \
	((_cp)[0] == ';' || \
	 (_cp)[0] == '[' || \
	 ((_cp)[0] == '(' && (_cp)[1] != '*') || \
	 (_cp)[0] == ')' || \
	 (_cp)[0] == '{')
813
814 /*
815 * Given a declaration (be it preprocessor or C), try to parse out a
816 * reasonable "name" for the affair.
817 * For a struct, for example, it'd be the struct name.
818 * For a typedef, it'd be the type name.
819 * For a function, it'd be the function name.
820 */
821 static void
grok_name(const struct decl * e,const char ** start,size_t * sz)822 grok_name(const struct decl *e,
823 const char **start, size_t *sz)
824 {
825 const char *cp;
826
827 *start = NULL;
828 *sz = 0;
829
830 if (DECLTYPE_CPP != e->type) {
831 if (e->text[e->textsz - 1] != ';')
832 return;
833 cp = e->text;
834 do {
835 while (isspace((unsigned char)*cp))
836 cp++;
837 if (BPOINT(cp))
838 break;
839 /* Function pointers... */
840 if (*cp == '(')
841 cp++;
842 /* Pass over pointers. */
843 while (*cp == '*')
844 cp++;
845 *start = cp;
846 *sz = 0;
847 while (!isspace((unsigned char)*cp)) {
848 if (BPOINT(cp))
849 break;
850 cp++;
851 (*sz)++;
852 }
853 } while (!BPOINT(cp));
854 } else {
855 *sz = e->textsz;
856 *start = e->text;
857 }
858 }
859
860 /*
861 * Extract information from the interface definition.
862 * Mark it as "postprocessed" on success.
863 */
864 static void
postprocess(const char * prefix,struct defn * d)865 postprocess(const char *prefix, struct defn *d)
866 {
867 struct decl *first;
868 const char *start;
869 size_t offs, sz, i;
870 ENTRY ent;
871
872 if (TAILQ_EMPTY(&d->dcqhead))
873 return;
874
875 /* Find the first #define or declaration. */
876
877 TAILQ_FOREACH(first, &d->dcqhead, entries)
878 if (DECLTYPE_CPP == first->type ||
879 DECLTYPE_C == first->type)
880 break;
881
882 if (first == NULL) {
883 warnx("%s:%zu: no entry to document", d->fn, d->ln);
884 return;
885 }
886
887 /*
888 * Now compute the document name (`Dt').
889 * We'll also use this for the filename.
890 */
891
892 grok_name(first, &start, &sz);
893 if (start == NULL) {
894 warnx("%s:%zu: couldn't deduce "
895 "entry name", d->fn, d->ln);
896 return;
897 }
898
899 /* Document name needs all-caps. */
900
901 if ((d->dt = strndup(start, sz)) == NULL)
902 err(1, NULL);
903 sz = strlen(d->dt);
904 for (i = 0; i < sz; i++)
905 d->dt[i] = toupper((unsigned char)d->dt[i]);
906
907 /* Filename needs no special chars. */
908
909 if (filename) {
910 asprintf(&d->fname, "%.*s.3", (int)sz, start);
911 offs = 0;
912 } else {
913 asprintf(&d->fname, "%s/%.*s.3",
914 prefix, (int)sz, start);
915 offs = strlen(prefix) + 1;
916 }
917
918 if (d->fname == NULL)
919 err(1, NULL);
920
921 for (i = 0; i < sz; i++) {
922 if (isalnum((unsigned char)d->fname[offs + i]) ||
923 d->fname[offs + i] == '_' ||
924 d->fname[offs + i] == '-')
925 continue;
926 d->fname[offs + i] = '_';
927 }
928
929 /*
930 * First, extract all keywords.
931 */
932 for (i = 0; i < d->keybufsz; ) {
933 while (isspace((unsigned char)d->keybuf[i]))
934 i++;
935 if (i == d->keybufsz)
936 break;
937 sz = 0;
938 start = &d->keybuf[i];
939 if (d->keybuf[i] == '{') {
940 start = &d->keybuf[++i];
941 for ( ; i < d->keybufsz; i++, sz++)
942 if (d->keybuf[i] == '}')
943 break;
944 if (d->keybuf[i] == '}')
945 i++;
946 } else
947 for ( ; i < d->keybufsz; i++, sz++)
948 if (isspace((unsigned char)d->keybuf[i]))
949 break;
950 if (sz == 0)
951 continue;
952 d->keys = reallocarray(d->keys,
953 d->keysz + 1, sizeof(char *));
954 if (d->keys == NULL)
955 err(1, NULL);
956 d->keys[d->keysz] = malloc(sz + 1);
957 if (d->keys[d->keysz] == NULL)
958 err(1, NULL);
959 memcpy(d->keys[d->keysz], start, sz);
960 d->keys[d->keysz][sz] = '\0';
961 d->keysz++;
962
963 /* Hash the keyword. */
964 ent.key = d->keys[d->keysz - 1];
965 ent.data = d;
966 (void)hsearch(ent, ENTER);
967 }
968
969 /*
970 * Now extract all `Nm' values for this document.
971 * We only use CPP and C references, and hope for the best when
972 * doing so.
973 * Enter each one of these as a searchable keyword.
974 */
975 TAILQ_FOREACH(first, &d->dcqhead, entries) {
976 if (DECLTYPE_CPP != first->type &&
977 DECLTYPE_C != first->type)
978 continue;
979 grok_name(first, &start, &sz);
980 if (start == NULL)
981 continue;
982 d->nms = reallocarray(d->nms,
983 d->nmsz + 1, sizeof(char *));
984 if (d->nms == NULL)
985 err(1, NULL);
986 d->nms[d->nmsz] = malloc(sz + 1);
987 if (d->nms[d->nmsz] == NULL)
988 err(1, NULL);
989 memcpy(d->nms[d->nmsz], start, sz);
990 d->nms[d->nmsz][sz] = '\0';
991 d->nmsz++;
992
993 /* Hash the name. */
994 ent.key = d->nms[d->nmsz - 1];
995 ent.data = d;
996 (void)hsearch(ent, ENTER);
997 }
998
999 if (d->nmsz == 0) {
1000 warnx("%s:%zu: couldn't deduce "
1001 "any names", d->fn, d->ln);
1002 return;
1003 }
1004
1005 /*
1006 * Next, scan for all `Xr' values.
1007 * We'll add more to this list later.
1008 */
1009 for (i = 0; i < d->seealsosz; i++) {
1010 /*
1011 * Find next value starting with `['.
1012 * There's other stuff in there (whitespace or
1013 * free text leading up to these) that we're ok
1014 * to ignore.
1015 */
1016 while (i < d->seealsosz && d->seealso[i] != '[')
1017 i++;
1018 if (i == d->seealsosz)
1019 break;
1020
1021 /*
1022 * Now scan for the matching `]'.
1023 * We can also have a vertical bar if we're separating a
1024 * keyword and its shown name.
1025 */
1026 start = &d->seealso[++i];
1027 sz = 0;
1028 while (i < d->seealsosz &&
1029 d->seealso[i] != ']' &&
1030 d->seealso[i] != '|') {
1031 i++;
1032 sz++;
1033 }
1034 if (i == d->seealsosz)
1035 break;
1036 if (sz == 0)
1037 continue;
1038
1039 /*
1040 * Continue on to the end-of-reference, if we weren't
1041 * there to begin with.
1042 */
1043 if (d->seealso[i] != ']')
1044 while (i < d->seealsosz &&
1045 d->seealso[i] != ']')
1046 i++;
1047
1048 /* Strip trailing whitespace. */
1049 while (sz > 1 && start[sz - 1] == ' ')
1050 sz--;
1051
1052 /* Strip trailing parenthesis. */
1053 if (sz > 2 &&
1054 start[sz - 2] == '(' &&
1055 start[sz - 1] == ')')
1056 sz -= 2;
1057
1058 d->xrs = reallocarray(d->xrs,
1059 d->xrsz + 1, sizeof(char *));
1060 if (d->xrs == NULL)
1061 err(1, NULL);
1062 d->xrs[d->xrsz] = malloc(sz + 1);
1063 if (d->xrs[d->xrsz] == NULL)
1064 err(1, NULL);
1065 memcpy(d->xrs[d->xrsz], start, sz);
1066 d->xrs[d->xrsz][sz] = '\0';
1067 d->xrsz++;
1068 }
1069
1070 /*
1071 * Next, extract all references.
1072 * We'll accumulate these into a list of SEE ALSO tags, after.
1073 * See how these are parsed above for a description: this is
1074 * basically the same thing.
1075 */
1076 for (i = 0; i < d->descsz; i++) {
1077 if (d->desc[i] != '[')
1078 continue;
1079 i++;
1080 if (d->desc[i] == '[')
1081 continue;
1082
1083 start = &d->desc[i];
1084 for (sz = 0; i < d->descsz; i++, sz++)
1085 if (d->desc[i] == ']' ||
1086 d->desc[i] == '|')
1087 break;
1088
1089 if (i == d->descsz)
1090 break;
1091 else if (sz == 0)
1092 continue;
1093
1094 if (d->desc[i] != ']')
1095 while (i < d->descsz && d->desc[i] != ']')
1096 i++;
1097
1098 while (sz > 1 && start[sz - 1] == ' ')
1099 sz--;
1100
1101 if (sz > 2 &&
1102 start[sz - 2] == '(' &&
1103 start[sz - 1] == ')')
1104 sz -= 2;
1105
1106 d->xrs = reallocarray(d->xrs,
1107 d->xrsz + 1, sizeof(char *));
1108 if (d->xrs == NULL)
1109 err(1, NULL);
1110 d->xrs[d->xrsz] = malloc(sz + 1);
1111 if (d->xrs[d->xrsz] == NULL)
1112 err(1, NULL);
1113 memcpy(d->xrs[d->xrsz], start, sz);
1114 d->xrs[d->xrsz][sz] = '\0';
1115 d->xrsz++;
1116 }
1117
1118 d->postprocessed = 1;
1119 }
1120
1121 /*
1122 * Convenience function to look up which manpage "hosts" a certain
1123 * keyword. For example, SQLITE_OK(3) also handles SQLITE_TOOBIG and so
1124 * on, so a reference to SQLITE_TOOBIG should actually point to
1125 * SQLITE_OK.
1126 * Returns the keyword's file if found or NULL.
1127 */
1128 static const char *
lookup(const char * key)1129 lookup(const char *key)
1130 {
1131 ENTRY ent;
1132 ENTRY *res;
1133 const struct defn *d;
1134
1135 ent.key = (char *)(uintptr_t)key;
1136 ent.data = NULL;
1137
1138 if ((res = hsearch(ent, FIND)) == NULL)
1139 return NULL;
1140
1141 d = (const struct defn *)res->data;
1142 if (d->nmsz == 0)
1143 return NULL;
1144
1145 assert(d->nms[0] != NULL);
1146 return d->nms[0];
1147 }
1148
1149 static int
xrcmp(const void * p1,const void * p2)1150 xrcmp(const void *p1, const void *p2)
1151 {
1152 /* Silence bogus warnings about un-consting. */
1153
1154 const char *s1 = lookup(*(const char **)(uintptr_t)p1),
1155 *s2 = lookup(*(const char **)(uintptr_t)p2);
1156
1157 if (s1 == NULL)
1158 s1 = "";
1159 if (s2 == NULL)
1160 s2 = "";
1161
1162 return strcasecmp(s1, s2);
1163 }
1164
/*
 * Return non-zero if "new sentence, new line" is in effect, zero
 * otherwise.
 * Accepts the start and finish offset of a buffer: the span is not a
 * new sentence if it is at least four bytes long and the four bytes
 * before "finish" are "i.e." or "e.g." in any case.
 */
static int
newsentence(size_t start, size_t finish, const char *buf)
{
	const char	*tail;

	assert(finish >= start);

	/* Too short to hold either abbreviation. */

	if (finish - start < 4)
		return 1;

	/* Ignore "i.e." and "e.g.". */

	tail = &buf[finish - 4];
	if (strncasecmp(tail, "i.e.", 4) == 0)
		return 0;
	if (strncasecmp(tail, "e.g.", 4) == 0)
		return 0;

	return 1;
}
1187
1188 /*
1189 * Emit a valid mdoc(7) document within the given prefix.
1190 */
1191 static void
emit(struct defn * d)1192 emit(struct defn *d)
1193 {
1194 struct decl *first;
1195 size_t sz, i, j, col, last, ns, fnsz, stripspace;
1196 FILE *f;
1197 char *cp;
1198 const char *res, *lastres, *args, *str, *end, *fn;
1199 enum tag tag;
1200 enum preproc pre;
1201
1202 if (!d->postprocessed) {
1203 warnx("%s:%zu: interface has errors, not "
1204 "producing manpage", d->fn, d->ln);
1205 return;
1206 }
1207
1208 if (nofile == 0) {
1209 if ((f = fopen(d->fname, "w")) == NULL) {
1210 warn("%s: fopen", d->fname);
1211 return;
1212 }
1213 } else if (filename) {
1214 printf("%s\n", d->fname);
1215 return;
1216 } else
1217 f = stdout;
1218
1219 /* Begin by outputting the mdoc(7) header. */
1220
1221 fputs(".Dd $" "Mdocdate$\n", f);
1222 fprintf(f, ".Dt %s 3\n", d->dt);
1223 fputs(".Os\n", f);
1224 fputs(".Sh NAME\n", f);
1225
1226 /* Now print the name bits of each declaration. */
1227
1228 for (i = 0; i < d->nmsz; i++)
1229 fprintf(f, ".Nm %s%s\n", d->nms[i],
1230 i < d->nmsz - 1 ? " ," : "");
1231
1232 fprintf(f, ".Nd %s\n", d->name);
1233 fputs(".Sh SYNOPSIS\n", f);
1234 fputs(".In sqlite3.h\n", f);
1235
1236 TAILQ_FOREACH(first, &d->dcqhead, entries) {
1237 if (first->type != DECLTYPE_CPP &&
1238 first->type != DECLTYPE_C)
1239 continue;
1240
1241 /* Easy: just print the CPP name. */
1242
1243 if (first->type == DECLTYPE_CPP) {
1244 fprintf(f, ".Fd #define %s\n",
1245 first->text);
1246 continue;
1247 }
1248
1249 /* First, strip out the sqlite CPPs. */
1250
1251 for (i = 0; i < first->textsz; ) {
1252 for (pre = 0; pre < PREPROC__MAX; pre++) {
1253 sz = strlen(preprocs[pre]);
1254 if (strncmp(preprocs[pre],
1255 &first->text[i], sz))
1256 continue;
1257 i += sz;
1258 while (isspace((unsigned char)first->text[i]))
1259 i++;
1260 break;
1261 }
1262 if (pre == PREPROC__MAX)
1263 break;
1264 }
1265
1266 /* If we're a typedef, immediately print Vt. */
1267
1268 if (strncmp(&first->text[i], "typedef", 7) == 0) {
1269 fprintf(f, ".Vt %s\n", &first->text[i]);
1270 continue;
1271 }
1272
1273 /* Are we a struct? */
1274
1275 if (first->textsz > 2 &&
1276 first->text[first->textsz - 2] == '}' &&
1277 (cp = strchr(&first->text[i], '{')) != NULL) {
1278 *cp = '\0';
1279 fprintf(f, ".Vt %s;\n", &first->text[i]);
1280 /* Restore brace for later usage. */
1281 *cp = '{';
1282 continue;
1283 }
1284
1285 /* Catch remaining non-functions. */
1286
1287 if (first->textsz > 2 &&
1288 first->text[first->textsz - 2] != ')') {
1289 fprintf(f, ".Vt %s\n", &first->text[i]);
1290 continue;
1291 }
1292
1293 str = &first->text[i];
1294 if ((args = strchr(str, '(')) == NULL || args == str) {
1295 /* What is this? */
1296 fputs(".Bd -literal\n", f);
1297 fputs(&first->text[i], f);
1298 fputs("\n.Ed\n", f);
1299 continue;
1300 }
1301
1302 /*
1303 * Current state:
1304 * type_t *function (args...)
1305 * ^str ^args
1306 * Scroll back to end of function name.
1307 */
1308
1309 end = args - 1;
1310 while (end > str && isspace((unsigned char)*end))
1311 end--;
1312
1313 /*
1314 * Current state:
1315 * type_t *function (args...)
1316 * ^str ^end ^args
1317 * Scroll back to what comes before.
1318 */
1319
1320 for (fnsz = 0; end > str; end--, fnsz++)
1321 if (isspace((unsigned char)*end) || *end == '*')
1322 break;
1323
1324 if (fnsz == 0)
1325 warnx("%s:%zu: zero-length "
1326 "function name", d->fn, d->ln);
1327 fn = end + 1;
1328
1329 /*
1330 * Current state:
1331 * type_t *function (args...)
1332 * ^str ^end ^args
1333 * type_t function (args...)
1334 * ^str ^end ^args
1335 * Strip away whitespace.
1336 */
1337
1338 while (end > str && isspace((unsigned char)*end))
1339 end--;
1340
1341 /*
1342 * type_t *function (args...)
1343 * ^str ^end ^args
1344 * type_t function (args...)
1345 * ^str ^end ^args
1346 */
1347
1348 /*
1349 * If we can't find what came before, then the function
1350 * has no type, which is odd... let's just call it void.
1351 */
1352
1353 if (end > str) {
1354 fprintf(f, ".Ft %.*s\n",
1355 (int)(end - str + 1), str);
1356 fprintf(f, ".Fo %.*s\n", (int)fnsz, fn);
1357 } else {
1358 fputs(".Ft void\n", f);
1359 fprintf(f, ".Fo %.*s\n", (int)fnsz, fn);
1360 }
1361
1362 /*
1363 * Convert function arguments into `Fa' clauses.
1364 * This also handles nested function pointers, which
1365 * would otherwise throw off the delimeters.
1366 */
1367
1368 for (;;) {
1369 str = ++args;
1370 while (isspace((unsigned char)*str))
1371 str++;
1372 fputs(".Fa \"", f);
1373 ns = 0;
1374 while (*str != '\0' &&
1375 (ns || *str != ',') &&
1376 (ns || *str != ')')) {
1377 /*
1378 * Handle comments in the declarations.
1379 */
1380 if (str[0] == '/' && str[1] == '*') {
1381 str += 2;
1382 for ( ; str[0] != '\0'; str++)
1383 if (str[0] == '*' && str[1] == '/')
1384 break;
1385 if (*str == '\0')
1386 break;
1387 str += 2;
1388 while (isspace((unsigned char)*str))
1389 str++;
1390 if (*str == '\0' ||
1391 (ns == 0 && *str == ',') ||
1392 (ns == 0 && *str == ')'))
1393 break;
1394 }
1395 if (*str == '(')
1396 ns++;
1397 else if (*str == ')')
1398 ns--;
1399
1400 /*
1401 * Handle some instances of whitespace
1402 * by compressing it down.
1403 * However, if the whitespace ends at
1404 * the end-of-definition, then don't
1405 * print it at all.
1406 */
1407
1408 if (isspace((unsigned char)*str)) {
1409 while (isspace((unsigned char)*str))
1410 str++;
1411 /* Are we at a comment? */
1412 if (str[0] == '/' && str[1] == '*')
1413 continue;
1414 if (*str == '\0' ||
1415 (ns == 0 && *str == ',') ||
1416 (ns == 0 && *str == ')'))
1417 break;
1418 fputc(' ', f);
1419 } else {
1420 fputc(*str, f);
1421 str++;
1422 }
1423 }
1424 fputs("\"\n", f);
1425 if (*str == '\0' || *str == ')')
1426 break;
1427 args = str;
1428 }
1429
1430 fputs(".Fc\n", f);
1431 }
1432
1433 fputs(".Sh DESCRIPTION\n", f);
1434
1435 /*
1436 * Strip the crap out of the description.
1437 * "Crap" consists of things I don't understand that mess up
1438 * parsing of the HTML, for instance,
1439 * <dl>[[foo bar]]<dt>foo bar</dt>...</dl>
1440 * These are not well-formed HTML.
1441 * Note that d->desc[d->descz] is the NUL terminator, so we
1442 * don't need to check d->descsz - 1.
1443 */
1444
1445 for (i = 0; i < d->descsz; ) {
1446 if (d->desc[i] == '^' &&
1447 d->desc[i + 1] == '(') {
1448 memmove(&d->desc[i],
1449 &d->desc[i + 2],
1450 d->descsz - i - 1);
1451 d->descsz -= 2;
1452 continue;
1453 } else if (d->desc[i] == ')' &&
1454 d->desc[i + 1] == '^') {
1455 memmove(&d->desc[i],
1456 &d->desc[i + 2],
1457 d->descsz - i - 1);
1458 d->descsz -= 2;
1459 continue;
1460 } else if (d->desc[i] == '^') {
1461 memmove(&d->desc[i],
1462 &d->desc[i + 1],
1463 d->descsz - i);
1464 d->descsz -= 1;
1465 continue;
1466 } else if (d->desc[i] != '[' ||
1467 d->desc[i + 1] != '[') {
1468 i++;
1469 continue;
1470 }
1471
1472 for (j = i; j < d->descsz; j++)
1473 if (d->desc[j] == ']' &&
1474 d->desc[j + 1] == ']')
1475 break;
1476
1477 /* Ignore if we don't have a terminator. */
1478
1479 assert(j > i);
1480 j += 2;
1481 if (j > d->descsz) {
1482 i++;
1483 continue;
1484 }
1485
1486 memmove(&d->desc[i], &d->desc[j], d->descsz - j + 1);
1487 d->descsz -= (j - i);
1488 }
1489
1490 /*
1491 * Here we go!
1492 * Print out the description as best we can.
1493 * Do on-the-fly processing of any HTML we encounter into
1494 * mdoc(7) and try to break lines up.
1495 */
1496
1497 col = stripspace = 0;
1498
1499 for (i = 0; i < d->descsz; ) {
1500 /*
1501 * The "stripspace" variable is set to >=2 if we've
1502 * stripped white-space off before an anticipated macro.
1503 * Without it, if the macro ends up *not* being a macro,
1504 * we wouldn't flush the line and thus end up losing a
1505 * space. This lets the code that flushes the line know
1506 * that we've stripped spaces and adds them back in.
1507 */
1508
1509 if (stripspace > 0)
1510 stripspace--;
1511
1512 /* Ignore NUL byte, just in case. */
1513
1514 if (d->desc[i] == '\0') {
1515 i++;
1516 continue;
1517 }
1518
1519 /*
1520 * Newlines are paragraph breaks.
1521 * If we have multiple newlines, then keep to a single
1522 * `Pp' to keep it clean.
1523 * Only do this if we're not before a block-level HTML,
1524 * as this would mean, for instance, a `Pp'-`Bd' pair.
1525 */
1526
1527 if (d->desc[i] == '\n') {
1528 while (isspace((unsigned char)d->desc[i]))
1529 i++;
1530 for (tag = 0; tag < TAG__MAX; tag++) {
1531 sz = strlen(tags[tag].html);
1532 if (strncasecmp(&d->desc[i],
1533 tags[tag].html, sz) == 0)
1534 break;
1535 }
1536 if (tag == TAG__MAX ||
1537 (tags[tag].flags & TAGINFO_INLINE)) {
1538 if (col > 0)
1539 fputs("\n", f);
1540 fputs(".Pp\n", f);
1541 /* We're on a new line. */
1542 col = 0;
1543 }
1544 continue;
1545 }
1546
1547 /*
1548 * New sentence, new line.
1549 * We guess whether this is the case by using the
1550 * dumbest possible heuristic.
1551 */
1552
1553 if (d->desc[i] == ' ' &&
1554 i > 0 && d->desc[i - 1] == '.') {
1555 for (j = i - 1; j > 0; j--)
1556 if (isspace((unsigned char)d->desc[j])) {
1557 j++;
1558 break;
1559 }
1560 if (newsentence(j, i, d->desc)) {
1561 while (d->desc[i] == ' ')
1562 i++;
1563 fputc('\n', f);
1564 col = 0;
1565 continue;
1566 }
1567 }
1568
1569 /*
1570 * After 65 characters, force a break when we encounter
1571 * white-space to keep our lines more or less tidy.
1572 */
1573
1574 if (col > 65 && d->desc[i] == ' ') {
1575 while (d->desc[i] == ' ' )
1576 i++;
1577 fputc('\n', f);
1578 col = 0;
1579 continue;
1580 }
1581
1582 /* Parse HTML tags and links. */
1583
1584 if (d->desc[i] == '<') {
1585 for (tag = 0; tag < TAG__MAX; tag++) {
1586 sz = strlen(tags[tag].html);
1587 assert(sz > 0);
1588 if (strncmp(&d->desc[i],
1589 tags[tag].html, sz))
1590 continue;
1591
1592 i += sz;
1593
1594 /* Blindly ignore attributes. */
1595
1596 if (tags[tag].flags & TAGINFO_ATTRS) {
1597 while (d->desc[i] != '\0' &&
1598 d->desc[i] != '>')
1599 i++;
1600 if (d->desc[i] == '\0')
1601 break;
1602 i++;
1603 }
1604
1605 /*
1606 * NOOP tags don't do anything, such as
1607 * the case of `</dd>', which only
1608 * serves to end an `It' block that will
1609 * be closed out by a subsequent `It' or
1610 * end of clause `El' anyway.
1611 * Skip the trailing space.
1612 */
1613
1614 if (tags[tag].flags & TAGINFO_NOOP) {
1615 while (isspace((unsigned char)d->desc[i]))
1616 i++;
1617 break;
1618 } else if (tags[tag].flags & TAGINFO_INLINE) {
1619 while (stripspace > 0) {
1620 fputc(' ', f);
1621 col++;
1622 stripspace--;
1623 }
1624 fputs(tags[tag].mdoc, f);
1625 /*col += strlen(tags[tag].mdoc);*/
1626 break;
1627 }
1628
1629 /*
1630 * A breaking mdoc(7) statement.
1631 * Break the current line, output the
1632 * macro, and conditionally break
1633 * following that (or we might do
1634 * nothing at all).
1635 */
1636
1637 if (col > 0) {
1638 fputs("\n", f);
1639 col = 0;
1640 }
1641
1642 fputs(tags[tag].mdoc, f);
1643 if (!(tags[tag].flags & TAGINFO_NOBR)) {
1644 fputs("\n", f);
1645 col = 0;
1646 } else if (!(tags[tag].flags & TAGINFO_NOSP)) {
1647 fputs(" ", f);
1648 col++;
1649 }
1650 while (isspace((unsigned char)d->desc[i]))
1651 i++;
1652 break;
1653 }
1654 if (tag < TAG__MAX) {
1655 stripspace = 0;
1656 continue;
1657 }
1658 while (stripspace > 0) {
1659 fputc(' ', f);
1660 col++;
1661 stripspace--;
1662 }
1663 } else if (d->desc[i] == '[' && d->desc[i + 1] != ']') {
1664 /* Do we start at the bracket or bar? */
1665
1666 for (sz = i + 1; sz < d->descsz; sz++)
1667 if (d->desc[sz] == '|' ||
1668 d->desc[sz] == ']')
1669 break;
1670
1671 /* This is a degenerate case. */
1672
1673 if (sz == d->descsz) {
1674 i++;
1675 stripspace = 0;
1676 continue;
1677 }
1678
1679 /*
1680 * Look for a trailing "()", using "j" as a
1681 * sentinel in case it was found. This lets us
1682 * print out a "Fn xxxx" instead of having the
1683 * function be ugly. If we don't have a Fn and
1684 * we'd stripped space before this, remember to
1685 * add the space back in.
1686 */
1687
1688 j = 0;
1689 if (d->desc[sz] != '|') {
1690 i = i + 1;
1691 if (sz > 2 &&
1692 d->desc[sz - 1] == ')' &&
1693 d->desc[sz - 2] == '(') {
1694 if (col > 0)
1695 fputc('\n', f);
1696 fputs(".Fn ", f);
1697 j = sz - 2;
1698 assert(j > 0);
1699 } else if (stripspace) {
1700 fputc(' ', f);
1701 col++;
1702 }
1703 } else {
1704 if (stripspace) {
1705 fputc(' ', f);
1706 col++;
1707 }
1708 i = sz + 1;
1709 }
1710
1711 while (isspace((unsigned char)d->desc[i]))
1712 i++;
1713
1714 /*
1715 * Now handle in-page references. If we're a
1716 * function reference (e.g., function()), then
1717 * omit the trailing parentheses and put in a Fn
1718 * block. Otherwise print them out as-is: we've
1719 * already accumulated them into our "SEE ALSO"
1720 * values, which we'll use below.
1721 */
1722
1723 for ( ; i < d->descsz; i++, col++) {
1724 if (j > 0 && i == j) {
1725 i += 3;
1726 for ( ; i < d->descsz; i++)
1727 if (d->desc[i] == '.')
1728 fputs(" .", f);
1729 else if (d->desc[i] == ',')
1730 fputs(" ,", f);
1731 else if (d->desc[i] == ')')
1732 fputs(" )", f);
1733 else
1734 break;
1735
1736 /* Trim trailing space. */
1737
1738 while (i < d->descsz &&
1739 isspace((unsigned char)d->desc[i]))
1740 i++;
1741
1742 fputc('\n', f);
1743 col = 0;
1744 break;
1745 } else if (d->desc[i] == ']') {
1746 i++;
1747 break;
1748 }
1749 fputc(d->desc[i], f);
1750 col++;
1751 }
1752
1753 stripspace = 0;
1754 continue;
1755 }
1756
1757 /* Strip leading spaces from output. */
1758
1759 if (d->desc[i] == ' ' && col == 0) {
1760 while (d->desc[i] == ' ')
1761 i++;
1762 continue;
1763 }
1764
1765 /*
1766 * Strip trailing spaces from output.
1767 * Set "stripspace" to be the number of white-space
1768 * characters that we've skipped, plus one.
1769 * This means that the next loop iteration while get the
1770 * actual amount we've skipped (for '<' or '[') and we
1771 * can act upon it there.
1772 */
1773
1774 if (d->desc[i] == ' ') {
1775 j = i;
1776 while (j < d->descsz && d->desc[j] == ' ')
1777 j++;
1778 if (j < d->descsz &&
1779 (d->desc[j] == '\n' ||
1780 d->desc[j] == '<' ||
1781 d->desc[j] == '[')) {
1782 stripspace = d->desc[j] != '\n' ?
1783 (j - i + 1) : 0;
1784 i = j;
1785 continue;
1786 }
1787 }
1788
1789 assert(d->desc[i] != '\n');
1790
1791 /*
1792 * Handle some oddities.
1793 * The following HTML escapes exist in the output that I
1794 * could find.
1795 * There might be others...
1796 */
1797
1798 if (strncmp(&d->desc[i], "→", 6) == 0) {
1799 i += 6;
1800 fputs("\\(->", f);
1801 } else if (strncmp(&d->desc[i], "←", 6) == 0) {
1802 i += 6;
1803 fputs("\\(<-", f);
1804 } else if (strncmp(&d->desc[i], " ", 6) == 0) {
1805 i += 6;
1806 fputc(' ', f);
1807 } else if (strncmp(&d->desc[i], "<", 4) == 0) {
1808 i += 4;
1809 fputc('<', f);
1810 } else if (strncmp(&d->desc[i], ">", 4) == 0) {
1811 i += 4;
1812 fputc('>', f);
1813 } else if (strncmp(&d->desc[i], "[", 5) == 0) {
1814 i += 5;
1815 fputc('[', f);
1816 } else {
1817 /* Make sure we don't trigger a macro. */
1818 if (col == 0 &&
1819 (d->desc[i] == '.' || d->desc[i] == '\''))
1820 fputs("\\&", f);
1821 fputc(d->desc[i], f);
1822 i++;
1823 }
1824
1825 col++;
1826 }
1827
1828 if (col > 0)
1829 fputs("\n", f);
1830
1831 fputs(".Sh IMPLEMENTATION NOTES\n", f);
1832 fprintf(f, "These declarations were extracted from the\n"
1833 "interface documentation at line %zu.\n", d->ln);
1834 fputs(".Bd -literal\n", f);
1835 fputs(d->fulldesc, f);
1836 fputs(".Ed\n", f);
1837
1838 /*
1839 * Look up all of our keywords (which are in the xrs field) in
1840 * the table of all known keywords.
1841 * Don't print duplicates.
1842 */
1843
1844 if (d->xrsz > 0) {
1845 qsort(d->xrs, d->xrsz, sizeof(char *), xrcmp);
1846 lastres = NULL;
1847 for (last = 0, i = 0; i < d->xrsz; i++) {
1848 res = lookup(d->xrs[i]);
1849
1850 /* Ignore self-reference. */
1851
1852 if (res == d->nms[0] && verbose)
1853 warnx("%s:%zu: self-reference: %s",
1854 d->fn, d->ln, d->xrs[i]);
1855 if (res == d->nms[0])
1856 continue;
1857 if (res == NULL && verbose)
1858 warnx("%s:%zu: ref not found: %s",
1859 d->fn, d->ln, d->xrs[i]);
1860 if (res == NULL)
1861 continue;
1862
1863 /* Ignore duplicates. */
1864
1865 if (lastres == res)
1866 continue;
1867
1868 if (last)
1869 fputs(" ,\n", f);
1870 else
1871 fputs(".Sh SEE ALSO\n", f);
1872
1873 fprintf(f, ".Xr %s 3", res);
1874 last = 1;
1875 lastres = res;
1876 }
1877 if (last)
1878 fputs("\n", f);
1879 }
1880
1881 if (nofile == 0)
1882 fclose(f);
1883 }
1884
#if HAVE_PLEDGE
/*
 * Restrict ourselves with pledge(2).
 * When not creating files ("nofile"), we only need stdio; otherwise
 * we also need to be able to create and write files.
 * Exits on failure.
 */
static void
sandbox_pledge(void)
{
	const char	*promises;

	promises = nofile ? "stdio" : "stdio wpath cpath";

	if (pledge(promises, NULL) == -1)
		err(1, NULL);
}
#endif
1903
#if HAVE_SANDBOX_INIT
/*
 * Darwin's "seatbelt".
 * Use the pure-computation profile when not creating files
 * ("nofile"), and the no-network profile otherwise.
 * On failure, prints the sandbox error and exits.
 */
static void
sandbox_apple(void)
{
	const char	*profile;
	char		*errmsg;

	profile = nofile ?
		kSBXProfilePureComputation : kSBXProfileNoNetwork;

	if (sandbox_init(profile, SANDBOX_NAMED, &errmsg) != 0) {
		perror(errmsg);
		sandbox_free_error(errmsg);
		exit(1);
	}
}
#endif
1926
1927 /*
1928 * Check to see whether there are any filename duplicates.
1929 * This is just a warning, but will really screw things up, since the
1930 * last filename will overwrite the first.
1931 */
1932 static void
check_dupes(struct parse * p)1933 check_dupes(struct parse *p)
1934 {
1935 const struct defn *d, *dd;
1936
1937 TAILQ_FOREACH(d, &p->dqhead, entries)
1938 TAILQ_FOREACH_REVERSE(dd, &p->dqhead, defnq, entries) {
1939 if (dd == d)
1940 break;
1941 if (d->fname == NULL ||
1942 dd->fname == NULL ||
1943 strcmp(d->fname, dd->fname))
1944 continue;
1945 warnx("%s:%zu: duplicate filename: "
1946 "%s (from %s, line %zu)", d->fn,
1947 d->ln, d->fname, dd->nms[0], dd->ln);
1948 }
1949 }
1950
1951 int
main(int argc,char * argv[])1952 main(int argc, char *argv[])
1953 {
1954 size_t i, bufsz;
1955 ssize_t len;
1956 FILE *f = stdin;
1957 char *cp = NULL;
1958 const char *prefix = ".";
1959 struct parse p;
1960 int rc = 0, ch;
1961 struct defn *d;
1962 struct decl *e;
1963
1964 memset(&p, 0, sizeof(struct parse));
1965
1966 p.fn = "<stdin>";
1967 p.ln = 0;
1968 p.phase = PHASE_INIT;
1969
1970 TAILQ_INIT(&p.dqhead);
1971
1972 while ((ch = getopt(argc, argv, "nNp:v")) != -1)
1973 switch (ch) {
1974 case 'n':
1975 nofile = 1;
1976 break;
1977 case 'N':
1978 nofile = 1;
1979 filename = 1;
1980 break;
1981 case 'p':
1982 prefix = optarg;
1983 break;
1984 case 'v':
1985 verbose = 1;
1986 break;
1987 default:
1988 goto usage;
1989 }
1990
1991 argc -= optind;
1992 argv += optind;
1993
1994 if (argc > 1)
1995 goto usage;
1996
1997 if (argc > 0) {
1998 if ((f = fopen(argv[0], "r")) == NULL)
1999 err(1, "%s", argv[0]);
2000 p.fn = argv[0];
2001 }
2002
2003 #if HAVE_SANDBOX_INIT
2004 sandbox_apple();
2005 #elif HAVE_PLEDGE
2006 sandbox_pledge();
2007 #endif
2008 /*
2009 * Read in line-by-line and process in the phase dictated by our
2010 * finite state automaton.
2011 */
2012
2013 while ((len = getline(&cp, &bufsz, f)) != -1) {
2014 assert(len > 0);
2015 p.ln++;
2016 if (cp[len - 1] != '\n') {
2017 warnx("%s:%zu: unterminated line", p.fn, p.ln);
2018 break;
2019 }
2020
2021 /*
2022 * Lines are now always NUL-terminated, and don't allow
2023 * NUL characters in the line.
2024 */
2025
2026 cp[--len] = '\0';
2027 len = strlen(cp);
2028
2029 switch (p.phase) {
2030 case PHASE_INIT:
2031 init(&p, cp);
2032 break;
2033 case PHASE_KEYS:
2034 keys(&p, cp, (size_t)len);
2035 break;
2036 case PHASE_DESC:
2037 desc(&p, cp, (size_t)len);
2038 break;
2039 case PHASE_SEEALSO:
2040 seealso(&p, cp, (size_t)len);
2041 break;
2042 case PHASE_DECL:
2043 decl(&p, cp, (size_t)len);
2044 break;
2045 }
2046 }
2047
2048 /*
2049 * If we hit the last line, then try to process.
2050 * Otherwise, we failed along the way.
2051 */
2052
2053 if (feof(f)) {
2054 /*
2055 * Allow us to be at the declarations or scanning for
2056 * the next clause.
2057 */
2058 if (p.phase == PHASE_INIT ||
2059 p.phase == PHASE_DECL) {
2060 if (hcreate(5000) == 0)
2061 err(1, NULL);
2062 TAILQ_FOREACH(d, &p.dqhead, entries)
2063 postprocess(prefix, d);
2064 check_dupes(&p);
2065 TAILQ_FOREACH(d, &p.dqhead, entries)
2066 emit(d);
2067 rc = 1;
2068 } else if (p.phase != PHASE_DECL)
2069 warnx("%s:%zu: exit when not in "
2070 "initial state", p.fn, p.ln);
2071 }
2072
2073 while ((d = TAILQ_FIRST(&p.dqhead)) != NULL) {
2074 TAILQ_REMOVE(&p.dqhead, d, entries);
2075 while ((e = TAILQ_FIRST(&d->dcqhead)) != NULL) {
2076 TAILQ_REMOVE(&d->dcqhead, e, entries);
2077 free(e->text);
2078 free(e);
2079 }
2080 free(d->name);
2081 free(d->desc);
2082 free(d->fulldesc);
2083 free(d->dt);
2084 for (i = 0; i < d->nmsz; i++)
2085 free(d->nms[i]);
2086 for (i = 0; i < d->xrsz; i++)
2087 free(d->xrs[i]);
2088 for (i = 0; i < d->keysz; i++)
2089 free(d->keys[i]);
2090 free(d->keys);
2091 free(d->nms);
2092 free(d->xrs);
2093 free(d->fname);
2094 free(d->seealso);
2095 free(d->keybuf);
2096 free(d);
2097 }
2098
2099 return !rc;
2100 usage:
2101 fprintf(stderr, "usage: %s [-Nnv] [-p prefix] [file]\n",
2102 getprogname());
2103 return 1;
2104 }
2105