1 /* Copyright (C) 1997, 2000 Aladdin Enterprises. All rights reserved. 2 3 This file is part of AFPL Ghostscript. 4 5 AFPL Ghostscript is distributed with NO WARRANTY OF ANY KIND. No author or 6 distributor accepts any responsibility for the consequences of using it, or 7 for whether it serves any particular purpose or works at all, unless he or 8 she says so in writing. Refer to the Aladdin Free Public License (the 9 "License") for full details. 10 11 Every copy of AFPL Ghostscript must include a copy of the License, normally 12 in a plain ASCII text file named PUBLIC. The License grants you the right 13 to copy, modify and redistribute AFPL Ghostscript, but only under certain 14 conditions described in the License. Among other things, the License 15 requires that the copyright notice and this notice be preserved on all 16 copies. 17 */ 18 19 /*$Id: gsfcmap.c,v 1.14 2001/06/16 19:02:32 igorm Exp $ */ 20 /* CMap character decoding */ 21 #include "memory_.h" 22 #include "gx.h" 23 #include "gserrors.h" 24 #include "gsstruct.h" 25 #include "gsutil.h" /* for gs_next_ids */ 26 #include "gxfcmap.h" 27 28 /* GC descriptors */ 29 public_st_cmap(); 30 /* Because lookup ranges can be elements of arrays, */ 31 /* their enum_ptrs procedure must never return 0 prematurely. */ 32 private 33 ENUM_PTRS_WITH(code_lookup_range_enum_ptrs, 34 gx_code_lookup_range_t *pclr) return 0; 35 case 0: 36 if (pclr->value_type == CODE_VALUE_GLYPH) { 37 const byte *pv = pclr->values.data; 38 int k; 39 40 for (k = 0; k < pclr->num_keys; ++k) { 41 gs_glyph glyph = 0; 42 int i; 43 44 for (i = 0; i < pclr->value_size; ++i) 45 glyph = (glyph << 8) + *pv++; 46 pclr->cmap->mark_glyph(glyph, pclr->cmap->mark_glyph_data); 47 } 48 } 49 return ENUM_OBJ(pclr->cmap); 50 case 1: return ENUM_STRING(&pclr->keys); 51 case 2: return ENUM_STRING(&pclr->values); 52 ENUM_PTRS_END 53 private 54 RELOC_PTRS_WITH(code_lookup_range_reloc_ptrs, gx_code_lookup_range_t *pclr) 55 RELOC_VAR(pclr->cmap); 56 RELOC_STRING_VAR(pclr->keys); 57 RELOC_STRING_VAR(pclr->values); 58 RELOC_PTRS_END 59 public_st_code_lookup_range(); 60 public_st_code_lookup_range_element(); 61 62 /* ---------------- Procedures ---------------- */ 63 64 /* 65 * Initialize a just-allocated CMap, to ensure that all pointers are clean 66 * for the GC. 67 */ 68 void 69 gs_cmap_init(gs_cmap_t *pcmap) 70 { 71 memset(pcmap, 0, sizeof(*pcmap)); 72 pcmap->id = gs_next_ids(1); 73 uid_set_invalid(&pcmap->uid); 74 } 75 76 /* 77 * Create an Identity CMap. 78 */ 79 int 80 gs_cmap_create_identity(gs_cmap_t **ppcmap, int num_bytes, int wmode, 81 gs_memory_t *mem) 82 { 83 gs_cmap_t *pcmap = 84 gs_alloc_struct(mem, gs_cmap_t, &st_cmap, 85 "gs_cmap_create_identity(CMap)"); 86 gx_code_space_range_t *range = (gx_code_space_range_t *) 87 gs_alloc_bytes(mem, sizeof(gx_code_space_range_t), 88 "gs_cmap_create_identity(code space range)"); 89 gx_code_lookup_range_t *lookup = 90 gs_alloc_struct_array(mem, 1, gx_code_lookup_range_t, 91 &st_code_lookup_range, 92 "gs_cmap_create_identity(lookup range)"); 93 /* We allocate CIDSystemInfo dynamically only for the sake of the GC. */ 94 gs_cid_system_info_t *pcidsi = 95 gs_alloc_struct(mem, gs_cid_system_info_t, &st_cid_system_info, 96 "gs_cmap_create_identity(CIDSystemInfo)"); 97 static const byte key_data[8] = { 98 0, 0, 0, 0, 0xff, 0xff, 0xff, 0xff 99 }; 100 static const gs_cid_system_info_t identity_cidsi = { 101 { (const byte *)"Adobe", 5 }, 102 { (const byte *)"Identity", 8 }, 103 0 104 }; 105 106 if (pcmap == 0 || range == 0 || lookup == 0 || pcidsi == 0) 107 return_error(gs_error_VMerror); 108 if (num_bytes != 2) /* for now */ 109 return_error(gs_error_rangecheck); 110 gs_cmap_init(pcmap); 111 pcmap->CMapType = 1; 112 pcmap->CMapName.data = (const byte *) 113 (wmode ? "Identity-V" : "Identity-H"); 114 pcmap->CMapName.size = 10; 115 *pcidsi = identity_cidsi; 116 pcmap->CIDSystemInfo = pcidsi; 117 pcmap->num_fonts = 1; 118 pcmap->CMapVersion = 1.0; 119 /* no uid, UIDOffset */ 120 pcmap->WMode = wmode; 121 memset(range->first, 0, num_bytes); 122 memset(range->last, 0xff, num_bytes); 123 range->size = num_bytes; 124 pcmap->code_space.ranges = range; 125 pcmap->code_space.num_ranges = 1; 126 memset(lookup, 0, sizeof(*lookup)); 127 lookup->cmap = pcmap; 128 lookup->key_size = num_bytes; 129 lookup->num_keys = 1; 130 lookup->key_is_range = true; 131 /* 132 * It's OK to break const here, because the strings are never 133 * freed, and the GC can handle strings outside the heap. 134 */ 135 lookup->keys.data = (byte*) (key_data + 4 - num_bytes); 136 lookup->keys.size = num_bytes * 2; 137 lookup->value_type = CODE_VALUE_CID; 138 lookup->value_size = num_bytes; 139 /* ditto */ 140 lookup->values.data = (byte*) key_data; 141 lookup->values.size = num_bytes; 142 pcmap->def.lookup = lookup; 143 pcmap->def.num_lookup = 1; 144 /* no notdef */ 145 /* no mark_glyph, mark_glyph_data, glyph_name, glyph_name_data */ 146 *ppcmap = pcmap; 147 return 0; 148 } 149 150 /* 151 * multi-dimensional range comparator 152 */ 153 154 private void 155 print_msg_str_in_range(const byte *str, 156 const byte *key_lo, const byte *key_hi, 157 int key_size) 158 { 159 debug_print_string_hex(str, key_size); 160 dlprintf(" in "); 161 debug_print_string_hex(key_lo, key_size); 162 dlprintf(" - "); 163 debug_print_string_hex(key_hi, key_size); 164 dlprintf("\n"); 165 } 166 167 private int 168 gs_cmap_get_shortest_chr(const gx_code_map_t * pcmap, uint *pfidx) 169 { 170 int i; 171 int len_shortest = MAX_CMAP_CODE_SIZE; 172 uint fidx_shortest = 0; /* font index for this fallback */ 173 174 for (i = pcmap->num_lookup - 1; i >= 0; --i) { 175 const gx_code_lookup_range_t *pclr = &pcmap->lookup[i]; 176 if ((pclr->key_prefix_size + pclr->key_size) <= len_shortest) { 177 len_shortest = (pclr->key_prefix_size + pclr->key_size); 178 fidx_shortest = pclr->font_index; 179 } 180 } 181 182 *pfidx = fidx_shortest; 183 return len_shortest; 184 } 185 186 /* 187 * multi-dimensional relative position calculator 188 * 189 * Returns offset of the given CID, considering CID range 190 * as array of CIDs (the last index changes fastest). 191 */ 192 private int 193 gs_multidim_CID_offset(const byte *key_str, 194 const byte *key_lo, const byte *key_hi, 195 int key_size) 196 { 197 198 int i; /* index for current dimension */ 199 int CID_offset = 0; 200 201 if (gs_debug_c('J')) { 202 dlprintf("[J]gmCo() calc CID_offset for 0x"); 203 print_msg_str_in_range(key_str, key_lo, key_hi, key_size); 204 } 205 206 for (i = 0; i < key_size; i++) 207 CID_offset = CID_offset * (key_hi[i] - key_lo[i] + 1) + 208 key_str[i] - key_lo[i]; 209 210 if_debug1('J', "[J]gmCo() CID_offset = %d\n", CID_offset); 211 return CID_offset; 212 } 213 214 /* Get a big-endian integer. */ 215 private uint 216 bytes2int(const byte *p, int n) 217 { 218 uint v = 0; 219 int i; 220 221 for (i = 0; i < n; ++i) 222 v = (v << 8) + p[i]; 223 return v; 224 } 225 226 /* 227 * Decode a character from a string using a code map, updating the index. 228 * Return 0 for a CID or name, N > 0 for a character code where N is the 229 * number of bytes in the code, or an error. Shift the decoded bytes into 230 * *pchr. For undefined characters, set *pglyph = gs_no_glyph and return 0. 231 */ 232 private int 233 code_map_decode_next(const gx_code_map_t * pcmap, const gs_const_string * pstr, 234 uint * pindex, uint * pfidx, 235 gs_char * pchr, gs_glyph * pglyph) 236 { 237 const byte *str = pstr->data + *pindex; 238 uint ssize = pstr->size - *pindex; 239 /* 240 * The keys are not sorted due to 'usecmap'. Possible optimization : 241 * merge and sort keys in 'zbuildcmap', then use binary search here. 242 * This would be valuable for UniJIS-UTF8-H, which contains about 7000 243 * keys. 244 */ 245 int i; 246 247 for (i = pcmap->num_lookup - 1; i >= 0; --i) { /* reverse scan order due to 'usecmap' */ 248 const gx_code_lookup_range_t *pclr = &pcmap->lookup[i]; 249 int pre_size = pclr->key_prefix_size, key_size = pclr->key_size, 250 chr_size = pre_size + key_size; 251 252 if (ssize < chr_size) 253 continue; 254 if (memcmp(str, pclr->key_prefix, pre_size)) 255 continue; 256 /* Search the lookup range. We could use binary search. */ 257 { 258 const byte *key = pclr->keys.data; 259 int step = key_size; 260 int k; 261 const byte *pvalue; 262 263 if (pclr->key_is_range) { 264 step <<= 1; 265 for (k = 0; k < pclr->num_keys; ++k, key += step) 266 if (memcmp(str + pre_size, key, key_size) >= 0 && 267 memcmp(str + pre_size, key + key_size, key_size) <= 0) 268 break; 269 } else { 270 for (k = 0; k < pclr->num_keys; ++k, key += step) 271 if (!memcmp(str + pre_size, key, key_size)) 272 break; 273 } 274 if (k == pclr->num_keys) 275 continue; 276 /* We have a match. Return the result. */ 277 *pchr = (*pchr << (chr_size * 8)) + bytes2int(str, chr_size); 278 *pindex += chr_size; 279 *pfidx = pclr->font_index; 280 pvalue = pclr->values.data + k * pclr->value_size; 281 switch (pclr->value_type) { 282 case CODE_VALUE_CID: 283 *pglyph = gs_min_cid_glyph + 284 bytes2int(pvalue, pclr->value_size) + 285 bytes2int(str + pre_size, key_size) - 286 bytes2int(key, key_size); 287 return 0; 288 case CODE_VALUE_GLYPH: 289 *pglyph = bytes2int(pvalue, pclr->value_size); 290 return 0; 291 case CODE_VALUE_CHARS: 292 *pglyph = 293 bytes2int(pvalue, pclr->value_size) + 294 bytes2int(str + pre_size, key_size) - 295 bytes2int(key, key_size); 296 return pclr->value_size; 297 default: /* shouldn't happen */ 298 return_error(gs_error_rangecheck); 299 } 300 } 301 } 302 /* No mapping. */ 303 *pglyph = gs_no_glyph; 304 return 0; 305 } 306 307 private int 308 code_map_decode_next_multidim_regime(const gx_code_map_t * pcmap, 309 const gs_const_string * pstr, 310 uint * pindex, uint * pfidx, 311 gs_char * pchr, gs_glyph * pglyph) 312 { 313 const byte *str = pstr->data + *pindex; 314 uint ssize = pstr->size - *pindex; 315 /* 316 * The keys are not sorted due to 'usecmap'. Possible optimization : 317 * merge and sort keys in 'zbuildcmap', then use binary search here. 318 * This would be valuable for UniJIS-UTF8-H, which contains about 7000 319 * keys. 320 */ 321 int i; 322 323 /* 324 * In the fallback of CMap decoding procedure, there is "partial matching". 325 * For detail, refer PostScript Ref. Manual v3 at the end of Fonts chapter. 326 */ 327 328 /* "pm" stands for partial match (not pointer), temporal use. */ 329 int pm_maxlen = 0; /* partial match: max length */ 330 int pm_index = *pindex; /* partial match: ptr index (in str) */ 331 uint pm_fidx = *pfidx; /* partial match: ptr font index */ 332 gs_char pm_chr = *pchr; /* partial match: ptr character */ 333 334 *pchr = '\0'; 335 336 if (gs_debug_c('J')) { 337 dlprintf("[J]CMDNmr() is called: str=("); 338 debug_print_string_hex(str, ssize); 339 dlprintf3(") @ 0x%lx ssize=%d, %d ranges to check\n", 340 str, ssize, pcmap->num_lookup); 341 } 342 343 for (i = pcmap->num_lookup - 1; i >= 0; --i) { 344 /* main loop - scan the map passed via pcmap */ 345 /* reverse scan order due to 'usecmap' */ 346 347 const gx_code_lookup_range_t *pclr = &pcmap->lookup[i]; 348 int pre_size = pclr->key_prefix_size, key_size = pclr->key_size, 349 chr_size = pre_size + key_size; 350 351 int j = 0; 352 /* length of the given byte stream is shorter than 353 * chr-length of current range, no need for further check, 354 * skip to the next range. 355 */ 356 if (ssize < chr_size) 357 continue; 358 359 if (0 < pre_size) { 360 const byte * prefix = pclr->key_prefix; 361 /* check partial match in prefix */ 362 for (j = 0; j < pre_size; j++) 363 if (prefix[j] != str[j]) 364 break; 365 366 if (0 == j) /* no match, skip to next i */ 367 continue; 368 else if (j < pre_size) { /* not exact, partial match */ 369 if (gs_debug_c('J')) { 370 dlprintf("[J]CMDNmr() partial match with prefix:"); 371 print_msg_str_in_range(str, prefix, 372 prefix, pre_size); 373 } 374 375 if (pm_maxlen < j) { 376 pm_maxlen = chr_size; 377 pm_chr = bytes2int(str, chr_size); 378 pm_index = (*pindex) + chr_size; 379 pm_fidx = pclr->font_index; 380 } 381 continue ; /* no need to check key, skip to next i */ 382 } 383 384 if (gs_debug_c('J')) { 385 dlprintf("[J]CMDNmr() full match with prefix:"); 386 print_msg_str_in_range(str, prefix, prefix, pre_size); 387 } 388 389 } /* if (0 < pre_size) */ 390 391 /* full match in prefix. check key */ 392 { 393 const byte *key = pclr->keys.data; 394 int step = key_size; 395 int k, l, m; 396 const byte *pvalue = NULL; 397 398 /* when range is "range", 2 keys for lo-end and hi-end 399 * are stacked. So twice the step. */ 400 if (pclr->key_is_range) 401 step <<=1; /* step = step * 2; */ 402 403 for (k = 0; k < pclr->num_keys; ++k, key += step) { 404 405 if_debug0('J', "[J]CMDNmr() check key:"); 406 if (gs_debug_c('J')) 407 print_msg_str_in_range(str + pre_size, 408 key, key + key_size, key_size); 409 410 for (l = 0; l < key_size; l++) { 411 byte c = str[l + pre_size]; 412 if (c < key[l] || c > key[key_size +l]) 413 break; 414 } 415 416 if (pm_maxlen < pre_size + l) { 417 pm_maxlen = chr_size; 418 pm_chr = bytes2int(str, chr_size); 419 pm_index = (*pindex) + chr_size; 420 pm_fidx = pclr->font_index; 421 } 422 if (l == key_size) 423 break; 424 } 425 426 /* all keys are tried, but found no match. */ 427 /* go to next prefix. */ 428 if (k == pclr->num_keys) 429 continue; 430 431 /* We have a match. Return the result. */ 432 *pchr = bytes2int(str, chr_size); 433 *pindex += chr_size; 434 *pfidx = pclr->font_index; 435 pvalue = pclr->values.data + k * pclr->value_size; 436 437 if (gs_debug_c('J')) { 438 dlprintf("[J]CMDNmr() full matched pvalue=("); 439 debug_print_string_hex(pvalue, pclr->value_size); 440 dlprintf(")\n"); 441 } 442 443 switch (pclr->value_type) { 444 case CODE_VALUE_CID: 445 *pglyph = gs_min_cid_glyph + 446 bytes2int(pvalue, pclr->value_size) + 447 gs_multidim_CID_offset(str + pre_size, 448 key, key + key_size, key_size); 449 return 0; 450 case CODE_VALUE_NOTDEF: 451 *pglyph = gs_min_cid_glyph + 452 bytes2int(pvalue, pclr->value_size); 453 return 0; 454 case CODE_VALUE_GLYPH: 455 *pglyph = bytes2int(pvalue, pclr->value_size); 456 return 0; 457 case CODE_VALUE_CHARS: 458 *pglyph = 459 bytes2int(pvalue, pclr->value_size) + 460 bytes2int(str + pre_size, key_size) - 461 bytes2int(key, key_size); 462 return pclr->value_size; 463 default: /* shouldn't happen */ 464 return_error(gs_error_rangecheck); 465 } 466 } 467 } 468 /* No mapping. */ 469 *pchr = pm_chr; 470 *pindex = pm_index; 471 *pfidx = pm_fidx; 472 *pglyph = gs_no_glyph; 473 if (gs_debug_c('J')) { 474 dlprintf("[J]CMDNmr() no full match, use partial match for ("); 475 debug_print_string_hex(str, pm_maxlen); 476 dlprintf(")\n"); 477 } 478 return 0; 479 } 480 481 /* 482 * Decode a character from a string using a CMap. 483 * Return like code_map_decode_next. 484 * At present, the range specification by (begin|end)codespacerange 485 * is not used in this function. Therefore, this function accepts 486 * some invalid CMap which def & undef maps exceed the codespacerange. 487 * It should be checked in this function, or some procedure in gs_cmap.ps. 488 */ 489 int 490 gs_cmap_decode_next(const gs_cmap_t * pcmap, const gs_const_string * pstr, 491 uint * pindex, uint * pfidx, 492 gs_char * pchr, gs_glyph * pglyph) 493 { 494 uint save_index = *pindex; 495 int code; 496 497 uint pm_index; 498 uint pm_fidx; 499 gs_char pm_chr; 500 501 /* For first, check defined map */ 502 if_debug0('J', "[J]GCDN() check def CMap\n"); 503 code = 504 code_map_decode_next_multidim_regime(&pcmap->def, pstr, pindex, pfidx, pchr, pglyph); 505 506 /* This is defined character */ 507 if (code != 0 || *pglyph != gs_no_glyph) 508 return code; 509 510 /* In here, this is NOT defined character */ 511 /* save partially matched results */ 512 pm_index = *pindex; 513 pm_fidx = *pfidx; 514 pm_chr = *pchr; 515 516 /* check notdef map. */ 517 if_debug0('J', "[J]GCDN() check notdef CMap\n"); 518 *pindex = save_index; 519 code = 520 code_map_decode_next_multidim_regime(&pcmap->notdef, pstr, pindex, pfidx, pchr, pglyph); 521 522 /* This is defined "notdef" character. */ 523 if (code != 0 || *pglyph != gs_no_glyph) 524 return code; 525 526 /* 527 * This is undefined in def & undef maps, 528 * use partially matched result with default notdef (CID = 0). 529 */ 530 if (save_index < pm_index) { 531 532 /* there was some partially matched */ 533 534 *pglyph = gs_min_cid_glyph; /* CID = 0 */ 535 *pindex = pm_index; 536 *pfidx = pm_fidx; 537 *pchr = '\0'; 538 return 0; /* should return some error for partial matched .notdef? */ 539 } 540 else { 541 /* no match */ 542 543 /* Even partial match is failed. 544 * Getting the shortest length from defined characters, 545 * and take the leading bytes (with same length of the shortest 546 * defined chr) as an unidentified character: CID = 0. 547 * Also this procedure is specified in PS Ref. Manual v3, 548 * at the end of Fonts chapter. 549 */ 550 551 const byte *str = pstr->data + save_index; 552 uint ssize = pstr->size - save_index; 553 int chr_size_shortest = 554 gs_cmap_get_shortest_chr(&pcmap->def, pfidx); 555 556 if (chr_size_shortest <= ssize) { 557 *pglyph = gs_min_cid_glyph; /* CID = 0, this is CMap fallback */ 558 *pindex = save_index + chr_size_shortest; 559 *pchr = '\0'; 560 if (gs_debug_c('J')) { 561 dlprintf1("[J]GCDN() no partial match, skip %d byte (", 562 chr_size_shortest); 563 debug_print_string_hex(str, chr_size_shortest); 564 dlprintf(")\n"); 565 } 566 return 0; /* should return some error for fallback .notdef? */ 567 } 568 else { 569 /* Undecodable string is shorter than the shortest character, 570 * there's no way except to return error. 571 */ 572 *pglyph = gs_no_glyph; 573 return -1; 574 } 575 } 576 } 577