.\" $Vendor-Id: mandoc.3,v 1.17 2012/01/13 15:27:14 joerg Exp $
.\"
.\" Copyright (c) 2009, 2010, 2011 Kristaps Dzonsons <kristaps@bsd.lv>
.\" Copyright (c) 2010 Ingo Schwarze <schwarze@openbsd.org>
.\"
.\" Permission to use, copy, modify, and distribute this software for any
.\" purpose with or without fee is hereby granted, provided that the above
.\" copyright notice and this permission notice appear in all copies.
.\"
.\" THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
.\" WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
.\" MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
.\" ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
.\" WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
.\" ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
.\" OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
.\"
.Dd January 13, 2012
.Dt MANDOC 3
.Os
.Sh NAME
.Nm mandoc ,
.Nm mandoc_escape ,
.Nm man_meta ,
.Nm man_mparse ,
.Nm man_node ,
.Nm mchars_alloc ,
.Nm mchars_free ,
.Nm mchars_num2char ,
.Nm mchars_num2uc ,
.Nm mchars_spec2cp ,
.Nm mchars_spec2str ,
.Nm mdoc_meta ,
.Nm mdoc_node ,
.Nm mparse_alloc ,
.Nm mparse_free ,
.Nm mparse_getkeep ,
.Nm mparse_keep ,
.Nm mparse_readfd ,
.Nm mparse_reset ,
.Nm mparse_result ,
.Nm mparse_strerror ,
.Nm mparse_strlevel
.Nd mandoc macro compiler library
.Sh LIBRARY
.Lb mandoc
.Sh SYNOPSIS
.In man.h
.In mdoc.h
.In mandoc.h
.Ft "enum mandoc_esc"
.Fo mandoc_escape
.Fa "const char **end"
.Fa "const char **start"
.Fa "int *sz"
.Fc
.Ft "const struct man_meta *"
.Fo man_meta
.Fa "const struct man *man"
.Fc
.Ft "const struct mparse *"
.Fo man_mparse
.Fa "const struct man *man"
.Fc
.Ft "const struct man_node *"
.Fo man_node
.Fa "const struct man *man"
.Fc
.Ft "struct mchars *"
.Fn \
mchars_alloc
.Ft void
.Fn mchars_free "struct mchars *p"
.Ft char
.Fn mchars_num2char "const char *cp" "size_t sz"
.Ft int
.Fn mchars_num2uc "const char *cp" "size_t sz"
.Ft "const char *"
.Fo mchars_spec2str
.Fa "const struct mchars *p"
.Fa "const char *cp"
.Fa "size_t sz"
.Fa "size_t *rsz"
.Fc
.Ft int
.Fo mchars_spec2cp
.Fa "const struct mchars *p"
.Fa "const char *cp"
.Fa "size_t sz"
.Fc
.Ft "const struct mdoc_meta *"
.Fo mdoc_meta
.Fa "const struct mdoc *mdoc"
.Fc
.Ft "const struct mdoc_node *"
.Fo mdoc_node
.Fa "const struct mdoc *mdoc"
.Fc
.Ft "struct mparse *"
.Fo mparse_alloc
.Fa "enum mparset type"
.Fa "enum mandoclevel wlevel"
.Fa "mandocmsg msg"
.Fa "void *msgarg"
.Fc
.Ft void
.Fo mparse_free
.Fa "struct mparse *parse"
.Fc
.Ft "const char *"
.Fo mparse_getkeep
.Fa "const struct mparse *parse"
.Fc
.Ft void
.Fo mparse_keep
.Fa "struct mparse *parse"
.Fc
.Ft "enum mandoclevel"
.Fo mparse_readfd
.Fa "struct mparse *parse"
.Fa "int fd"
.Fa "const char *fname"
.Fc
.Ft void
.Fo mparse_reset
.Fa "struct mparse *parse"
.Fc
.Ft void
.Fo mparse_result
.Fa "struct mparse *parse"
.Fa "struct mdoc **mdoc"
.Fa "struct man **man"
.Fc
.Ft "const char *"
.Fo mparse_strerror
.Fa "enum mandocerr"
.Fc
.Ft "const char *"
.Fo mparse_strlevel
.Fa "enum mandoclevel"
.Fc
.Vt extern const char * const * man_macronames;
.Vt extern const char * const * mdoc_argnames;
.Vt extern const char * const * mdoc_macronames;
.Fd "#define ASCII_NBRSP"
.Fd "#define ASCII_HYPH"
.Sh DESCRIPTION
The
.Nm mandoc
library parses a
.Ux
manual into an abstract syntax tree (AST).
.Ux
manuals are composed of
.Xr mdoc 7
or
.Xr man 7 ,
and may be mixed with
.Xr roff 7 ,
.Xr tbl 7 ,
and
.Xr eqn 7
invocations.
.Pp
The following describes a general parse sequence:
.Bl -enum
.It
initiate a parsing sequence with
.Fn mparse_alloc ;
.It
parse files or file descriptors with
.Fn mparse_readfd ;
.It
retrieve a parsed syntax tree, if the parse was successful, with
.Fn mparse_result ;
.It
iterate over parse nodes with
.Fn mdoc_node
or
.Fn man_node ;
.It
free all allocated memory with
.Fn mparse_free ,
or invoke
.Fn mparse_reset
and parse new files.
.El
.Pp
The
.Nm
library also contains routines for translating character strings into glyphs
.Pq see Fn mchars_alloc
and parsing escape sequences from strings
.Pq see Fn mandoc_escape .
.Sh REFERENCE
This section documents the functions, types, and variables available
via
.In mandoc.h .
.Ss Types
.Bl -ohang
.It Vt "enum mandoc_esc"
An escape sequence classification.
.It Vt "enum mandocerr"
A fatal error, error, or warning message during parsing.
.It Vt "enum mandoclevel"
A classification of an
.Vt "enum mandocerr"
as regards system operation.
.It Vt "struct mchars"
An opaque pointer to an object allowing for translation between
character strings and glyphs.
See
.Fn mchars_alloc .
.It Vt "enum mparset"
The type of parser when reading input.
This should usually be
.Dv MPARSE_AUTO
for auto-detection.
.It Vt "struct mparse"
An opaque pointer to a running parse sequence.
Created with
.Fn mparse_alloc
and freed with
.Fn mparse_free .
This may be used across parsed input if
.Fn mparse_reset
is called between parses.
.It Vt "mandocmsg"
A prototype for a function to handle fatal error, error, and warning
messages emitted by the parser.
.El
.Ss Functions
.Bl -ohang
.It Fn mandoc_escape
Scan an escape sequence, i.e., a character string beginning with
.Sq \e .
Pass a pointer to the character after the
.Sq \e
as
.Va end ;
it will be set to the supremum of the parsed escape sequence unless
returning
.Dv ESCAPE_ERROR ,
in which case the string is bogus and should be
thrown away.
If not
.Dv ESCAPE_ERROR
or
.Dv ESCAPE_IGNORE ,
.Va start
is set to the first relevant character of the substring (font, glyph,
whatever) of length
.Va sz .
Both
.Va start
and
.Va sz
may be
.Dv NULL .
.It Fn man_meta
Obtain the meta-data of a successful parse.
This may only be used on a pointer returned by
.Fn mparse_result .
.It Fn man_mparse
Get the parser used for the current output.
.It Fn man_node
Obtain the root node of a successful parse.
This may only be used on a pointer returned by
.Fn mparse_result .
.It Fn mchars_alloc
Allocate a
.Vt "struct mchars *"
object for translating special characters into glyphs.
See
.Xr mandoc_char 7
for an overview of special characters.
The object must be freed with
.Fn mchars_free .
.It Fn mchars_free
Free an object created with
.Fn mchars_alloc .
.It Fn mchars_num2char
Convert a character index (e.g., the \eN\(aq\(aq escape) into a
printable ASCII character.
Returns \e0 (the nil character) if the input sequence is malformed.
.It Fn mchars_num2uc
Convert a hexadecimal character index (e.g., the \e[uNNNN] escape) into
a Unicode codepoint.
Returns \e0 (the nil character) if the input sequence is malformed.
.It Fn mchars_spec2cp
Convert a special character into a valid Unicode codepoint.
Returns \-1 on failure or a non-zero Unicode codepoint on success.
.It Fn mchars_spec2str
Convert a special character into an ASCII string.
Returns
.Dv NULL
on failure.
.It Fn mdoc_meta
Obtain the meta-data of a successful parse.
This may only be used on a pointer returned by
.Fn mparse_result .
.It Fn mdoc_node
Obtain the root node of a successful parse.
This may only be used on a pointer returned by
.Fn mparse_result .
.It Fn mparse_alloc
Allocate a parser.
The same parser may be used for multiple files so long as
.Fn mparse_reset
is called between parses.
.Fn mparse_free
must be called to free the memory allocated by this function.
.It Fn mparse_free
Free all memory allocated by
.Fn mparse_alloc .
.It Fn mparse_getkeep
Acquire the keep buffer.
Must follow a call of
.Fn mparse_keep .
.It Fn mparse_keep
Instruct the parser to retain a copy of its parsed input.
This can be acquired with subsequent
.Fn mparse_getkeep
calls.
.It Fn mparse_readfd
Parse a file or file descriptor.
If
.Va fd
is \-1,
.Va fname
is opened for reading.
Otherwise,
.Va fname
is assumed to be the name associated with
.Va fd .
This may be called multiple times with different parameters; however,
.Fn mparse_reset
should be invoked between parses.
.It Fn mparse_reset
Reset a parser so that
.Fn mparse_readfd
may be used again.
.It Fn mparse_result
Obtain the result of a parse.
Only successful parses
.Po
i.e., those where
.Fn mparse_readfd
returned less than
.Dv MANDOCLEVEL_FATAL
.Pc
should invoke this function, in which case one of the two pointers will
be filled in.
.It Fn mparse_strerror
Return a statically-allocated string representation of an error code.
.It Fn mparse_strlevel
Return a statically-allocated string representation of a level code.
.El
.Ss Variables
.Bl -ohang
.It Va man_macronames
The string representation of a man macro as indexed by
.Vt "enum mant" .
.It Va mdoc_argnames
The string representation of an mdoc macro argument as indexed by
.Vt "enum mdocargt" .
.It Va mdoc_macronames
The string representation of an mdoc macro as indexed by
.Vt "enum mdoct" .
.El
.Sh IMPLEMENTATION NOTES
This section consists of structural documentation for
.Xr mdoc 7
and
.Xr man 7
syntax trees and strings.
.Ss Man and Mdoc Strings
Strings may be extracted from mdoc and man meta-data, or from text
nodes (MDOC_TEXT and MAN_TEXT, respectively).
These strings have special non-printing formatting cues embedded in the
text itself, as well as
.Xr roff 7
escapes preserved from input.
Implementing systems will need to handle both situations to produce
human-readable text.
In general, strings may be assumed to consist of 7-bit ASCII characters.
.Pp
The following non-printing characters may be embedded in text strings:
.Bl -tag -width Ds
.It Dv ASCII_NBRSP
A non-breaking space character.
.It Dv ASCII_HYPH
A soft hyphen.
.El
.Pp
Escape sequences are also passed verbatim into text strings.
An escape sequence is a sequence of characters beginning with the
backslash
.Pq Sq \e .
To construct human-readable text, these should be intercepted with
.Fn mandoc_escape
and converted with one of
.Fn mchars_num2char ,
.Fn mchars_spec2str ,
and so on.
.Ss Man Abstract Syntax Tree
This AST is governed by the ontological rules dictated in
.Xr man 7
and derives its terminology accordingly.
.Pp
The AST is composed of
.Vt struct man_node
nodes with element, root and text types as declared by the
.Va type
field.
Each node also provides its parse point (the
.Va line ,
.Va sec ,
and
.Va pos
fields), its position in the tree (the
.Va parent ,
.Va child ,
.Va next
and
.Va prev
fields) and some type-specific data.
.Pp
The tree itself is arranged according to the following normal form,
where capitalised non-terminals represent nodes.
.Pp
.Bl -tag -width "ELEMENTXX" -compact
.It ROOT
\(<- mnode+
.It mnode
\(<- ELEMENT | TEXT | BLOCK
.It BLOCK
\(<- HEAD BODY
.It HEAD
\(<- mnode*
.It BODY
\(<- mnode*
.It ELEMENT
\(<- ELEMENT | TEXT*
.It TEXT
\(<- [[:ascii:]]*
.El
.Pp
The only elements capable of nesting other elements are those with
next-line scope as documented in
.Xr man 7 .
.Ss Mdoc Abstract Syntax Tree
This AST is governed by the ontological
rules dictated in
.Xr mdoc 7
and derives its terminology accordingly.
.Qq In-line
elements described in
.Xr mdoc 7
are described simply as
.Qq elements .
.Pp
The AST is composed of
.Vt struct mdoc_node
nodes with block, head, body, element, root and text types as declared
by the
.Va type
field.
Each node also provides its parse point (the
.Va line ,
.Va sec ,
and
.Va pos
fields), its position in the tree (the
.Va parent ,
.Va child ,
.Va nchild ,
.Va next
and
.Va prev
fields) and some type-specific data, in particular, for nodes generated
from macros, the generating macro in the
.Va tok
field.
.Pp
The tree itself is arranged according to the following normal form,
where capitalised non-terminals represent nodes.
.Pp
.Bl -tag -width "ELEMENTXX" -compact
.It ROOT
\(<- mnode+
.It mnode
\(<- BLOCK | ELEMENT | TEXT
.It BLOCK
\(<- HEAD [TEXT] (BODY [TEXT])+ [TAIL [TEXT]]
.It ELEMENT
\(<- TEXT*
.It HEAD
\(<- mnode*
.It BODY
\(<- mnode* [ENDBODY mnode*]
.It TAIL
\(<- mnode*
.It TEXT
\(<- [[:ascii:]]*
.El
.Pp
Of note are the TEXT nodes following the HEAD, BODY and TAIL nodes of
the BLOCK production: these refer to punctuation marks.
Furthermore, although a TEXT node will generally have a non-zero-length
string, in the specific case of
.Sq \&.Bd \-literal ,
an empty line will produce a zero-length string.
Multiple body parts are only found in invocations of
.Sq \&.Bl \-column ,
where a new body introduces a new phrase.
.Pp
The
.Xr mdoc 7
syntax tree accommodates broken block structures as well.
The ENDBODY node is available to end the formatting associated
with a given block before the physical end of that block.
It has a non-null
.Va end
field, is of the BODY
.Va type ,
has the same
.Va tok
as the BLOCK it is ending, and has a
.Va pending
field pointing to that BLOCK's BODY node.
It is an indirect child of that BODY node
and has no children of its own.
.Pp
An ENDBODY node is generated when a block ends while one of its child
blocks is still open, as in the following example:
.Bd -literal -offset indent
\&.Ao ao
\&.Bo bo ac
\&.Ac bc
\&.Bc end
.Ed
.Pp
This example results in the following block structure:
.Bd -literal -offset indent
BLOCK Ao
    HEAD Ao
    BODY Ao
        TEXT ao
        BLOCK Bo, pending -> Ao
            HEAD Bo
            BODY Bo
                TEXT bo
                TEXT ac
                ENDBODY Ao, pending -> Ao
                TEXT bc
TEXT end
.Ed
.Pp
Here, the formatting of the
.Sq \&Ao
block extends from TEXT ao to TEXT ac,
while the formatting of the
.Sq \&Bo
block extends from TEXT bo to TEXT bc.
It renders as follows in
.Fl T Ns Cm ascii
mode:
.Pp
.Dl <ao [bo ac> bc] end
.Pp
Support for badly-nested blocks is only provided for backward
compatibility with some older
.Xr mdoc 7
implementations.
Using badly-nested blocks is
.Em strongly discouraged ;
for example, the
.Fl T Ns Cm html
and
.Fl T Ns Cm xhtml
front-ends to
.Xr mandoc 1
are unable to render them in any meaningful way.
Furthermore, behaviour when encountering badly-nested blocks is not
consistent across troff implementations, especially when using multiple
levels of badly-nested blocks.
.Sh SEE ALSO
.Xr mandoc 1 ,
.Xr eqn 7 ,
.Xr man 7 ,
.Xr mandoc_char 7 ,
.Xr mdoc 7 ,
.Xr roff 7 ,
.Xr tbl 7
.Sh AUTHORS
The
.Nm
library was written by
.An Kristaps Dzonsons ,
.Mt kristaps@bsd.lv .