xref: /netbsd-src/external/bsd/openldap/dist/libraries/liblunicode/ucdata/api.txt (revision e670fd5c413e99c2f6a37901bb21c537fcd322d2)
1#
2# Id: api.txt,v 1.3 2001/01/02 18:46:20 mleisher Exp
3#
4
5                             The MUTT UCData API
6                             -------------------
7
8
9####
10NOTE: This library has been customized for use with OpenLDAP. The character
11data tables are hardcoded into the library and the load/unload/reload
12functions are no-ops. Also, the MUTT API claimed to be compatible with
13John Cowan's library but its ucnumber behavior was broken. This has been
14fixed in the OpenLDAP release.
15
16By default, the implementation specific properties in MUTTUCData.txt are
17not incorporated into the OpenLDAP build. You can supply them to ucgendat
18and recreate uctable.h if you need them.
19  -- hyc@openldap.org
20####
21
22
23-----------------------------------------------------------------------------
24
25Macros that combine to select data tables for ucdata_load(), ucdata_unload(),
26and ucdata_reload().
27
28#define UCDATA_CASE   0x01
29#define UCDATA_CTYPE  0x02
30#define UCDATA_DECOMP 0x04
31#define UCDATA_CMBCL  0x08
32#define UCDATA_NUM    0x10
33#define UCDATA_COMP   0x20
34#define UCDATA_ALL (UCDATA_CASE|UCDATA_CTYPE|UCDATA_DECOMP|\
35                    UCDATA_CMBCL|UCDATA_NUM|UCDATA_COMP)
36-----------------------------------------------------------------------------
37
38void ucdata_load(char *paths, int masks)
39
40  This function initializes the UCData library by locating the data files in
41  one of the colon-separated directories in the `paths' parameter.  The data
42  files to be loaded are specified in the `masks' parameter as a bitwise
43  combination of the macros listed above.
44
45  This should be called before using any of the other functions.
46
47  NOTE: the ucdata_setup(char *paths) function is now a macro that expands
48        into this function at compile time.
49
50-----------------------------------------------------------------------------
51
52void ucdata_unload(int masks)
53
54  This function unloads the data tables specified in the `masks' parameter.
55
56  This function should be called when the application is done using the UCData
57  package.
58
59  NOTE: the ucdata_cleanup() function is now a macro that expands into this
60        function at compile time.
61
62-----------------------------------------------------------------------------
63
64void ucdata_reload(char *paths, int masks)
65
66  This function reloads the data files from one of the colon-separated
67  directories in the `paths' parameter.  The data files to be reloaded are
68  specified in the `masks' parameter as a bitwise combination of the macros
69  listed above.
70
71  If the data files have already been loaded, they are unloaded before the
72  data files are loaded again.
73
74-----------------------------------------------------------------------------
75
76int ucdecomp(unsigned long code, unsigned long *num, unsigned long **decomp)
77
78  This function determines if a character has a decomposition and returns the
79  decomposition information if it exists.
80
81  If a zero is returned, there is no decomposition.  If a non-zero is
82  returned, then the `num' and `decomp' variables are filled in with the
83  appropriate values.
84
85  Example call:
86
87    unsigned long i, num, *decomp;
88
89    if (ucdecomp(0x1d5, &num, &decomp) != 0) {
90       for (i = 0; i < num; i++)
91         printf("0x%08lX,", decomp[i]);
92       putchar('\n');
93    }
94
95int uccanondecomp(const unsigned long *in, int inlen, unsigned long **out,
96                  int *outlen)
97
98  This function decomposes an input string and does canonical reordering of
99  the characters at the same time.
100
101  If a -1 is returned, memory allocation was not successful.  If a zero is
102  returned, no decomposition occurred.  Any other value means the output string
103  contains the fully decomposed string in canonical order.
104
105  If the "outlen" parameter comes back with a value > 0, then the string
106  returned in the "out" parameter needs to be deallocated by the caller.
107
108-----------------------------------------------------------------------------
109
110int ucdecomp_hangul(unsigned long code, unsigned long *num,
111                    unsigned long decomp[])
112
113  This function determines if a Hangul syllable has a decomposition and
114  returns the decomposition information.
115
116  An array of at least size 3 should be passed to the function for the
117  decomposition of the syllable.
118
119  If a zero is returned, the character is not a Hangul syllable.  If a
120  non-zero is returned, the `num' field will be 2 or 3 and the syllable will
121  be decomposed into the `decomp' array arithmetically.
122
123  Example call:
124
125    unsigned long i, num, decomp[3];
126
127    if (ucdecomp_hangul(0xb1ba, &num, decomp) != 0) {
128       for (i = 0; i < num; i++)
129         printf("0x%08lX,", decomp[i]);
130       putchar('\n');
131    }
132
133-----------------------------------------------------------------------------
134
135int uccomp(unsigned long ch1, unsigned long ch2, unsigned long *comp)
136
137  This function takes a pair of characters and determines if they combine to
138  form another character.
139
140  If a zero is returned, no composition is formed by the character pair.  Any
141  other value indicates the "comp" parameter has a value.
142
143int uccomp_hangul(unsigned long *str, int len)
144
145  This function composes the Hangul Jamo in the string.  The composition is
146  done in-place.
147
148  The return value provides the new length of the string.  This will be
149  smaller than "len" if compositions occurred.
150
151int uccanoncomp(unsigned long *str, int len)
152
153  This function does a canonical composition of characters in the string.
154
155  The return value is the new length of the string.
156
157-----------------------------------------------------------------------------
158
159struct ucnumber {
160  int numerator;
161  int denominator;
162};
163
164int ucnumber_lookup(unsigned long code, struct ucnumber *num)
165
166  This function determines if the code is a number and fills in the `num'
167  field with the numerator and denominator.  If the code happens to be a
168  single digit, the denominator field will be 1.
169
170####
171The original code would set numerator = denominator for regular digits.
172However, the Readme also claimed to be compatible with John Cowan's uctype
173library, but this behavior is both nonsensical and incompatible with the
174Cowan library. As such, it has been fixed here as described above.
175  -- hyc@openldap.org
176####
177
178  If the function returns 0, the code is not a number.  Any other return
179  value means the code is a number.
180
181int ucdigit_lookup(unsigned long code, int *digit)
182
183  This function determines if the code is a digit and fills in the `digit'
184  field with the digit value.
185
186  If the function returns 0, the code is not a digit.  Any other return
187  value means the code is a digit.
188
189struct ucnumber ucgetnumber(unsigned long code)
190
191  This is a compatibility function with John Cowan's "uctype" package.  It
192  uses ucnumber_lookup().
193
194int ucgetdigit(unsigned long code)
195
196  This is a compatibility function with John Cowan's "uctype" package.  It
197  uses ucdigit_lookup().
198
199-----------------------------------------------------------------------------
200
201unsigned long uctoupper(unsigned long code)
202
203  This function returns the code unchanged if it is already upper case or has
204  no upper case equivalent.  Otherwise the upper case equivalent is returned.
205
206-----------------------------------------------------------------------------
207
208unsigned long uctolower(unsigned long code)
209
210  This function returns the code unchanged if it is already lower case or has
211  no lower case equivalent.  Otherwise the lower case equivalent is returned.
212
213-----------------------------------------------------------------------------
214
215unsigned long uctotitle(unsigned long code)
216
217  This function returns the code unchanged if it is already title case or has
218  no title case equivalent.  Otherwise the title case equivalent is returned.
219
220-----------------------------------------------------------------------------
221
222int ucisalpha(unsigned long code)
223int ucisalnum(unsigned long code)
224int ucisdigit(unsigned long code)
225int uciscntrl(unsigned long code)
226int ucisspace(unsigned long code)
227int ucisblank(unsigned long code)
228int ucispunct(unsigned long code)
229int ucisgraph(unsigned long code)
230int ucisprint(unsigned long code)
231int ucisxdigit(unsigned long code)
232
233int ucisupper(unsigned long code)
234int ucislower(unsigned long code)
235int ucistitle(unsigned long code)
236
237  These functions (actually macros) determine if a character has these
238  properties.  These behave in a fashion very similar to the venerable ctype
239  package.
240
241-----------------------------------------------------------------------------
242
243int ucisisocntrl(unsigned long code)
244
245  Is the character a C0 control character (< 32) ?
246
247int ucisfmtcntrl(unsigned long code)
248
249  Is the character a format control character?
250
251int ucissymbol(unsigned long code)
252
253  Is the character a symbol?
254
255int ucisnumber(unsigned long code)
256
257  Is the character a number or digit?
258
259int ucisnonspacing(unsigned long code)
260
261  Is the character non-spacing?
262
263int ucisopenpunct(unsigned long code)
264
265  Is the character an open/left punctuation mark (e.g. '[')?
266
267int ucisclosepunct(unsigned long code)
268
269  Is the character a close/right punctuation mark (e.g. ']')?
270
271int ucisinitialpunct(unsigned long code)
272
273  Is the character an initial punctuation (i.e. U+2018 LEFT SINGLE QUOTATION
274  MARK)
275
276int ucisfinalpunct(unsigned long code)
277
278  Is the character a final punctuation (i.e. U+2019 RIGHT SINGLE QUOTATION
279  MARK)
280
281int uciscomposite(unsigned long code)
282
283  Can the character be decomposed into a set of other characters?
284
285int ucisquote(unsigned long code)
286
287  Is the character one of the many quotation marks?
288
289int ucissymmetric(unsigned long code)
290
291  Is the character one that has an opposite form (i.e. <>)
292
293int ucismirroring(unsigned long code)
294
295  Is the character mirroring (superset of symmetric)?
296
297int ucisnonbreaking(unsigned long code)
298
299  Is the character non-breaking (i.e. non-breaking space)?
300
301int ucisrtl(unsigned long code)
302
303  Does the character have strong right-to-left directionality (i.e. Arabic
304  letters)?
305
306int ucisltr(unsigned long code)
307
308  Does the character have strong left-to-right directionality (i.e. Latin
309  letters)?
310
311int ucisstrong(unsigned long code)
312
313  Does the character have strong directionality?
314
315int ucisweak(unsigned long code)
316
317  Does the character have weak directionality (i.e. numbers)?
318
319int ucisneutral(unsigned long code)
320
321  Does the character have neutral directionality (i.e. whitespace)?
322
323int ucisseparator(unsigned long code)
324
325  Is the character a block or segment separator?
326
327int ucislsep(unsigned long code)
328
329  Is the character a line separator?
330
331int ucispsep(unsigned long code)
332
333  Is the character a paragraph separator?
334
335int ucismark(unsigned long code)
336
337  Is the character a mark of some kind?
338
339int ucisnsmark(unsigned long code)
340
341  Is the character a non-spacing mark?
342
343int ucisspmark(unsigned long code)
344
345  Is the character a spacing mark?
346
347int ucismodif(unsigned long code)
348
349  Is the character a modifier letter?
350
351int ucismodifsymbol(unsigned long code)
352
353  Is the character a modifier symbol?
354
355int ucisletnum(unsigned long code)
356
357  Is the character a number represented by a letter?
358
359int ucisconnect(unsigned long code)
360
361  Is the character connecting punctuation?
362
363int ucisdash(unsigned long code)
364
365  Is the character dash punctuation?
366
367int ucismath(unsigned long code)
368
369  Is the character a math character?
370
371int uciscurrency(unsigned long code)
372
373  Is the character a currency character?
374
375int ucisenclosing(unsigned long code)
376
377  Is the character enclosing (i.e. enclosing box)?
378
379int ucisprivate(unsigned long code)
380
381  Is the character from the Private Use Area?
382
383int ucissurrogate(unsigned long code)
384
385  Is the character one of the surrogate codes?
386
387int ucisdefined(unsigned long code)
388
389  Is the character defined (appeared in one of the data files)?
390
391int ucisundefined(unsigned long code)
392
393  Is the character not defined (non-Unicode)?
394
395int ucishan(unsigned long code)
396
397  Is the character a Han ideograph?
398
399int ucishangul(unsigned long code)
400
401  Is the character a pre-composed Hangul syllable?
402