xref: /plan9/sys/src/cmd/gs/src/gsfcmap.c (revision ff8c3af2f44d95267f67219afa20ba82ff6cf7e4)
1 /* Copyright (C) 1997, 2000 Aladdin Enterprises.  All rights reserved.
2 
3   This file is part of AFPL Ghostscript.
4 
5   AFPL Ghostscript is distributed with NO WARRANTY OF ANY KIND.  No author or
6   distributor accepts any responsibility for the consequences of using it, or
7   for whether it serves any particular purpose or works at all, unless he or
8   she says so in writing.  Refer to the Aladdin Free Public License (the
9   "License") for full details.
10 
11   Every copy of AFPL Ghostscript must include a copy of the License, normally
12   in a plain ASCII text file named PUBLIC.  The License grants you the right
13   to copy, modify and redistribute AFPL Ghostscript, but only under certain
14   conditions described in the License.  Among other things, the License
15   requires that the copyright notice and this notice be preserved on all
16   copies.
17 */
18 
19 /*$Id: gsfcmap.c,v 1.14 2001/06/16 19:02:32 igorm Exp $ */
20 /* CMap character decoding */
21 #include "memory_.h"
22 #include "gx.h"
23 #include "gserrors.h"
24 #include "gsstruct.h"
25 #include "gsutil.h"		/* for gs_next_ids */
26 #include "gxfcmap.h"
27 
28 /* GC descriptors */
29 public_st_cmap();
30 /* Because lookup ranges can be elements of arrays, */
31 /* their enum_ptrs procedure must never return 0 prematurely. */
32 private
33 ENUM_PTRS_WITH(code_lookup_range_enum_ptrs,
34                gx_code_lookup_range_t *pclr) return 0;
35 case 0:
36     if (pclr->value_type == CODE_VALUE_GLYPH) {
37         const byte *pv = pclr->values.data;
38         int k;
39 
40         for (k = 0; k < pclr->num_keys; ++k) {
41             gs_glyph glyph = 0;
42             int i;
43 
44             for (i = 0; i < pclr->value_size; ++i)
45                 glyph = (glyph << 8) + *pv++;
46             pclr->cmap->mark_glyph(glyph, pclr->cmap->mark_glyph_data);
47         }
48     }
49     return ENUM_OBJ(pclr->cmap);
50 case 1: return ENUM_STRING(&pclr->keys);
51 case 2: return ENUM_STRING(&pclr->values);
52 ENUM_PTRS_END
53 private
54 RELOC_PTRS_WITH(code_lookup_range_reloc_ptrs, gx_code_lookup_range_t *pclr)
55     RELOC_VAR(pclr->cmap);
56     RELOC_STRING_VAR(pclr->keys);
57     RELOC_STRING_VAR(pclr->values);
58 RELOC_PTRS_END
59 public_st_code_lookup_range();
60 public_st_code_lookup_range_element();
61 
62 /* ---------------- Procedures ---------------- */
63 
64 /*
65  * Initialize a just-allocated CMap, to ensure that all pointers are clean
66  * for the GC.
67  */
68 void
69 gs_cmap_init(gs_cmap_t *pcmap)
70 {
71     memset(pcmap, 0, sizeof(*pcmap));
72     pcmap->id = gs_next_ids(1);
73     uid_set_invalid(&pcmap->uid);
74 }
75 
76 /*
77  * Create an Identity CMap.
78  */
79 int
80 gs_cmap_create_identity(gs_cmap_t **ppcmap, int num_bytes, int wmode,
81 			gs_memory_t *mem)
82 {
83     gs_cmap_t *pcmap =
84 	gs_alloc_struct(mem, gs_cmap_t, &st_cmap,
85 			"gs_cmap_create_identity(CMap)");
86     gx_code_space_range_t *range = (gx_code_space_range_t *)
87 	gs_alloc_bytes(mem, sizeof(gx_code_space_range_t),
88 		       "gs_cmap_create_identity(code space range)");
89     gx_code_lookup_range_t *lookup =
90 	gs_alloc_struct_array(mem, 1, gx_code_lookup_range_t,
91 			      &st_code_lookup_range,
92 			      "gs_cmap_create_identity(lookup range)");
93     /* We allocate CIDSystemInfo dynamically only for the sake of the GC. */
94     gs_cid_system_info_t *pcidsi =
95 	gs_alloc_struct(mem, gs_cid_system_info_t, &st_cid_system_info,
96 			"gs_cmap_create_identity(CIDSystemInfo)");
97     static const byte key_data[8] = {
98 	0, 0, 0, 0, 0xff, 0xff, 0xff, 0xff
99     };
100     static const gs_cid_system_info_t identity_cidsi = {
101 	{ (const byte *)"Adobe", 5 },
102 	{ (const byte *)"Identity", 8 },
103 	0
104     };
105 
106     if (pcmap == 0 || range == 0 || lookup == 0 || pcidsi == 0)
107 	return_error(gs_error_VMerror);
108     if (num_bytes != 2)		/* for now */
109 	return_error(gs_error_rangecheck);
110     gs_cmap_init(pcmap);
111     pcmap->CMapType = 1;
112     pcmap->CMapName.data = (const byte *)
113 	(wmode ? "Identity-V" : "Identity-H");
114     pcmap->CMapName.size = 10;
115     *pcidsi = identity_cidsi;
116     pcmap->CIDSystemInfo = pcidsi;
117     pcmap->num_fonts = 1;
118     pcmap->CMapVersion = 1.0;
119     /* no uid, UIDOffset */
120     pcmap->WMode = wmode;
121     memset(range->first, 0, num_bytes);
122     memset(range->last, 0xff, num_bytes);
123     range->size = num_bytes;
124     pcmap->code_space.ranges = range;
125     pcmap->code_space.num_ranges = 1;
126     memset(lookup, 0, sizeof(*lookup));
127     lookup->cmap = pcmap;
128     lookup->key_size = num_bytes;
129     lookup->num_keys = 1;
130     lookup->key_is_range = true;
131     /*
132      * It's OK to break const here, because the strings are never
133      * freed, and the GC can handle strings outside the heap.
134      */
135     lookup->keys.data = (byte*) (key_data + 4 - num_bytes);
136     lookup->keys.size = num_bytes * 2;
137     lookup->value_type = CODE_VALUE_CID;
138     lookup->value_size = num_bytes;
139     /* ditto */
140     lookup->values.data = (byte*) key_data;
141     lookup->values.size = num_bytes;
142     pcmap->def.lookup = lookup;
143     pcmap->def.num_lookup = 1;
144     /* no notdef */
145     /* no mark_glyph, mark_glyph_data, glyph_name, glyph_name_data */
146     *ppcmap = pcmap;
147     return 0;
148 }
149 
150 /*
151  * multi-dimensional range comparator
152  */
153 
154 private void
155 print_msg_str_in_range(const byte *str,
156                        const byte *key_lo, const byte *key_hi,
157                        int key_size)
158 {
159     debug_print_string_hex(str, key_size);
160     dlprintf(" in ");
161     debug_print_string_hex(key_lo, key_size);
162     dlprintf(" - ");
163     debug_print_string_hex(key_hi, key_size);
164     dlprintf("\n");
165 }
166 
167 private int
168 gs_cmap_get_shortest_chr(const gx_code_map_t * pcmap, uint *pfidx)
169 {
170     int i;
171     int len_shortest = MAX_CMAP_CODE_SIZE;
172     uint fidx_shortest = 0; /* font index for this fallback */
173 
174     for (i = pcmap->num_lookup - 1; i >= 0; --i) {
175         const gx_code_lookup_range_t *pclr = &pcmap->lookup[i];
176         if ((pclr->key_prefix_size + pclr->key_size) <= len_shortest) {
177            len_shortest = (pclr->key_prefix_size + pclr->key_size);
178            fidx_shortest = pclr->font_index;
179         }
180     }
181 
182     *pfidx = fidx_shortest;
183     return len_shortest;
184 }
185 
186 /*
187  * multi-dimensional relative position calculator
188  *
189  * Returns offset of the given CID, considering CID range
190  * as array of CIDs (the last index changes fastest).
191  */
192 private int
193 gs_multidim_CID_offset(const byte *key_str,
194                         const byte *key_lo, const byte *key_hi,
195 			int key_size)
196 {
197 
198     int i;	/* index for current dimension */
199     int CID_offset = 0;
200 
201     if (gs_debug_c('J')) {
202         dlprintf("[J]gmCo()         calc CID_offset for 0x");
203         print_msg_str_in_range(key_str, key_lo, key_hi, key_size);
204     }
205 
206     for (i = 0; i < key_size; i++)
207         CID_offset = CID_offset * (key_hi[i] - key_lo[i] + 1) +
208             key_str[i] - key_lo[i];
209 
210     if_debug1('J', "[J]gmCo()         CID_offset = %d\n", CID_offset);
211     return CID_offset;
212 }
213 
214 /* Get a big-endian integer. */
215 private uint
216 bytes2int(const byte *p, int n)
217 {
218     uint v = 0;
219     int i;
220 
221     for (i = 0; i < n; ++i)
222         v = (v << 8) + p[i];
223     return v;
224 }
225 
226 /*
227  * Decode a character from a string using a code map, updating the index.
228  * Return 0 for a CID or name, N > 0 for a character code where N is the
229  * number of bytes in the code, or an error.  Shift the decoded bytes into
230  * *pchr.  For undefined characters, set *pglyph = gs_no_glyph and return 0.
231  */
232 private int
233 code_map_decode_next(const gx_code_map_t * pcmap, const gs_const_string * pstr,
234                      uint * pindex, uint * pfidx,
235                      gs_char * pchr, gs_glyph * pglyph)
236 {
237     const byte *str = pstr->data + *pindex;
238     uint ssize = pstr->size - *pindex;
239     /*
240      * The keys are not sorted due to 'usecmap'.  Possible optimization :
241      * merge and sort keys in 'zbuildcmap', then use binary search here.
242      * This would be valuable for UniJIS-UTF8-H, which contains about 7000
243      * keys.
244      */
245     int i;
246 
247     for (i = pcmap->num_lookup - 1; i >= 0; --i) { /* reverse scan order due to 'usecmap' */
248         const gx_code_lookup_range_t *pclr = &pcmap->lookup[i];
249         int pre_size = pclr->key_prefix_size, key_size = pclr->key_size,
250             chr_size = pre_size + key_size;
251 
252         if (ssize < chr_size)
253             continue;
254         if (memcmp(str, pclr->key_prefix, pre_size))
255             continue;
256         /* Search the lookup range. We could use binary search. */
257         {
258             const byte *key = pclr->keys.data;
259             int step = key_size;
260             int k;
261             const byte *pvalue;
262 
263             if (pclr->key_is_range) {
264                 step <<= 1;
265                 for (k = 0; k < pclr->num_keys; ++k, key += step)
266                     if (memcmp(str + pre_size, key, key_size) >= 0 &&
267                         memcmp(str + pre_size, key + key_size, key_size) <= 0)
268                         break;
269             } else {
270                 for (k = 0; k < pclr->num_keys; ++k, key += step)
271                     if (!memcmp(str + pre_size, key, key_size))
272                         break;
273             }
274             if (k == pclr->num_keys)
275                 continue;
276             /* We have a match.  Return the result. */
277             *pchr = (*pchr << (chr_size * 8)) + bytes2int(str, chr_size);
278             *pindex += chr_size;
279             *pfidx = pclr->font_index;
280             pvalue = pclr->values.data + k * pclr->value_size;
281             switch (pclr->value_type) {
282             case CODE_VALUE_CID:
283                 *pglyph = gs_min_cid_glyph +
284                     bytes2int(pvalue, pclr->value_size) +
285                     bytes2int(str + pre_size, key_size) -
286                     bytes2int(key, key_size);
287                 return 0;
288             case CODE_VALUE_GLYPH:
289                 *pglyph = bytes2int(pvalue, pclr->value_size);
290                 return 0;
291             case CODE_VALUE_CHARS:
292                 *pglyph =
293                     bytes2int(pvalue, pclr->value_size) +
294                     bytes2int(str + pre_size, key_size) -
295                     bytes2int(key, key_size);
296                 return pclr->value_size;
297             default:            /* shouldn't happen */
298                 return_error(gs_error_rangecheck);
299             }
300         }
301     }
302     /* No mapping. */
303     *pglyph = gs_no_glyph;
304     return 0;
305 }
306 
307 private int
308 code_map_decode_next_multidim_regime(const gx_code_map_t * pcmap,
309                      const gs_const_string * pstr,
310                      uint * pindex, uint * pfidx,
311                      gs_char * pchr, gs_glyph * pglyph)
312 {
313     const byte *str = pstr->data + *pindex;
314     uint ssize = pstr->size - *pindex;
315     /*
316      * The keys are not sorted due to 'usecmap'.  Possible optimization :
317      * merge and sort keys in 'zbuildcmap', then use binary search here.
318      * This would be valuable for UniJIS-UTF8-H, which contains about 7000
319      * keys.
320      */
321     int i;
322 
323     /*
324      * In the fallback of CMap decoding procedure, there is "partial matching".
325      * For detail, refer PostScript Ref. Manual v3 at the end of Fonts chapter.
326      */
327 
328     /* "pm" stands for partial match (not pointer), temporal use. */
329     int pm_maxlen = 0;		/* partial match: max length */
330     int pm_index = *pindex;	/* partial match: ptr index (in str) */
331     uint pm_fidx = *pfidx;	/* partial match: ptr font index */
332     gs_char pm_chr = *pchr;	/* partial match: ptr character */
333 
334     *pchr = '\0';
335 
336     if (gs_debug_c('J')) {
337         dlprintf("[J]CMDNmr() is called: str=(");
338         debug_print_string_hex(str, ssize);
339         dlprintf3(") @ 0x%lx ssize=%d, %d ranges to check\n",
340                        str, ssize, pcmap->num_lookup);
341     }
342 
343     for (i = pcmap->num_lookup - 1; i >= 0; --i) {
344 	/* main loop - scan the map passed via pcmap */
345 	/* reverse scan order due to 'usecmap' */
346 
347         const gx_code_lookup_range_t *pclr = &pcmap->lookup[i];
348         int pre_size = pclr->key_prefix_size, key_size = pclr->key_size,
349             chr_size = pre_size + key_size;
350 
351         int j = 0;
352 	/* length of the given byte stream is shorter than
353          * chr-length of current range, no need for further check,
354          * skip to the next range.
355          */
356         if (ssize < chr_size)
357             continue;
358 
359         if (0 < pre_size) {
360             const byte * prefix = pclr->key_prefix;
361             /* check partial match in prefix */
362             for (j = 0; j < pre_size; j++)
363                if (prefix[j] != str[j])
364                    break;
365 
366             if (0 == j)			/* no match, skip to next i */
367                 continue;
368             else if (j < pre_size) {	/* not exact, partial match */
369                 if (gs_debug_c('J')) {
370                     dlprintf("[J]CMDNmr() partial match with prefix:");
371                     print_msg_str_in_range(str, prefix,
372                                                 prefix, pre_size);
373                 }
374 
375                 if (pm_maxlen < j) {
376                     pm_maxlen = chr_size;
377                     pm_chr = bytes2int(str, chr_size);
378                     pm_index = (*pindex) + chr_size;
379                     pm_fidx = pclr->font_index;
380                 }
381                 continue ; /* no need to check key, skip to next i */
382             }
383 
384             if (gs_debug_c('J')) {
385                 dlprintf("[J]CMDNmr()   full match with prefix:");
386                 print_msg_str_in_range(str, prefix, prefix, pre_size);
387             }
388 
389         } /* if (0 < pre_size) */
390 
391         /* full match in prefix. check key */
392         {
393             const byte *key = pclr->keys.data;
394             int step = key_size;
395             int k, l, m;
396             const byte *pvalue = NULL;
397 
398 	    /* when range is "range", 2 keys for lo-end and hi-end
399 	     * are stacked. So twice the step. */
400             if (pclr->key_is_range)
401 		step <<=1; 	/* step = step * 2; */
402 
403             for (k = 0; k < pclr->num_keys; ++k, key += step) {
404 
405                 if_debug0('J', "[J]CMDNmr()     check key:");
406                 if (gs_debug_c('J'))
407                     print_msg_str_in_range(str + pre_size,
408                                          key, key + key_size, key_size);
409 
410                 for (l = 0; l < key_size; l++) {
411                     byte c = str[l + pre_size];
412                     if (c < key[l] || c > key[key_size +l])
413                         break;
414                 }
415 
416 		if (pm_maxlen < pre_size + l) {
417                     pm_maxlen = chr_size;
418                     pm_chr = bytes2int(str, chr_size);
419                     pm_index = (*pindex) + chr_size;
420                     pm_fidx = pclr->font_index;
421                 }
422                 if (l == key_size)
423                         break;
424 	    }
425 
426             /* all keys are tried, but found no match. */
427             /* go to next prefix. */
428             if (k == pclr->num_keys)
429                 continue;
430 
431             /* We have a match.  Return the result. */
432             *pchr = bytes2int(str, chr_size);
433             *pindex += chr_size;
434             *pfidx = pclr->font_index;
435             pvalue = pclr->values.data + k * pclr->value_size;
436 
437             if (gs_debug_c('J')) {
438                 dlprintf("[J]CMDNmr()     full matched pvalue=(");
439                 debug_print_string_hex(pvalue, pclr->value_size);
440                 dlprintf(")\n");
441             }
442 
443             switch (pclr->value_type) {
444             case CODE_VALUE_CID:
445                 *pglyph = gs_min_cid_glyph +
446                     bytes2int(pvalue, pclr->value_size) +
447                     gs_multidim_CID_offset(str + pre_size,
448                         key, key + key_size, key_size);
449                 return 0;
450             case CODE_VALUE_NOTDEF:
451                 *pglyph = gs_min_cid_glyph +
452                     bytes2int(pvalue, pclr->value_size);
453                 return 0;
454             case CODE_VALUE_GLYPH:
455                 *pglyph = bytes2int(pvalue, pclr->value_size);
456                 return 0;
457             case CODE_VALUE_CHARS:
458                 *pglyph =
459                     bytes2int(pvalue, pclr->value_size) +
460                     bytes2int(str + pre_size, key_size) -
461                     bytes2int(key, key_size);
462                 return pclr->value_size;
463             default:            /* shouldn't happen */
464                 return_error(gs_error_rangecheck);
465             }
466         }
467     }
468     /* No mapping. */
469     *pchr = pm_chr;
470     *pindex = pm_index;
471     *pfidx = pm_fidx;
472     *pglyph = gs_no_glyph;
473     if (gs_debug_c('J')) {
474         dlprintf("[J]CMDNmr()     no full match, use partial match for (");
475         debug_print_string_hex(str, pm_maxlen);
476         dlprintf(")\n");
477     }
478     return 0;
479 }
480 
481 /*
482  * Decode a character from a string using a CMap.
483  * Return like code_map_decode_next.
484  * At present, the range specification by (begin|end)codespacerange
485  * is not used in this function. Therefore, this function accepts
486  * some invalid CMap which def & undef maps exceed the codespacerange.
487  * It should be checked in this function, or some procedure in gs_cmap.ps.
488  */
489 int
490 gs_cmap_decode_next(const gs_cmap_t * pcmap, const gs_const_string * pstr,
491                     uint * pindex, uint * pfidx,
492                     gs_char * pchr, gs_glyph * pglyph)
493 {
494     uint save_index = *pindex;
495     int code;
496 
497     uint pm_index;
498     uint pm_fidx;
499     gs_char pm_chr;
500 
501     /* For first, check defined map */
502     if_debug0('J', "[J]GCDN() check def CMap\n");
503     code =
504         code_map_decode_next_multidim_regime(&pcmap->def, pstr, pindex, pfidx, pchr, pglyph);
505 
506     /* This is defined character */
507     if (code != 0 || *pglyph != gs_no_glyph)
508         return code;
509 
510     /* In here, this is NOT defined character */
511     /* save partially matched results */
512     pm_index = *pindex;
513     pm_fidx = *pfidx;
514     pm_chr = *pchr;
515 
516     /* check notdef map. */
517     if_debug0('J', "[J]GCDN() check notdef CMap\n");
518     *pindex = save_index;
519     code =
520 	code_map_decode_next_multidim_regime(&pcmap->notdef, pstr, pindex, pfidx, pchr, pglyph);
521 
522     /* This is defined "notdef" character. */
523     if (code != 0 || *pglyph != gs_no_glyph)
524         return code;
525 
526     /*
527      * This is undefined in def & undef maps,
528      * use partially matched result with default notdef (CID = 0).
529      */
530     if (save_index < pm_index) {
531 
532 	/* there was some partially matched */
533 
534         *pglyph = gs_min_cid_glyph;	/* CID = 0 */
535         *pindex = pm_index;
536         *pfidx = pm_fidx;
537         *pchr = '\0';
538          return 0; /* should return some error for partial matched .notdef? */
539     }
540     else {
541 	/* no match */
542 
543 	/* Even partial match is failed.
544          * Getting the shortest length from defined characters,
545          * and take the leading bytes (with same length of the shortest
546          * defined chr) as an unidentified character: CID = 0.
547 	 * Also this procedure is specified in PS Ref. Manual v3,
548          * at the end of Fonts chapter.
549          */
550 
551 	const byte *str = pstr->data + save_index;
552 	uint ssize = pstr->size - save_index;
553 	int chr_size_shortest =
554 		gs_cmap_get_shortest_chr(&pcmap->def, pfidx);
555 
556 	if (chr_size_shortest <= ssize) {
557             *pglyph = gs_min_cid_glyph;	/* CID = 0, this is CMap fallback */
558             *pindex = save_index + chr_size_shortest;
559 	    *pchr = '\0';
560             if (gs_debug_c('J')) {
561                 dlprintf1("[J]GCDN() no partial match, skip %d byte (",
562                                                chr_size_shortest);
563                 debug_print_string_hex(str, chr_size_shortest);
564                 dlprintf(")\n");
565             }
566             return 0; /* should return some error for fallback .notdef? */
567 	}
568 	else {
569             /* Undecodable string is shorter than the shortest character,
570              * there's no way except to return error.
571              */
572 	    *pglyph = gs_no_glyph;
573 	    return -1;
574 	}
575     }
576 }
577