1 /* $NetBSD: tls.c,v 1.23 2024/11/30 01:04:05 christos Exp $ */ 2 /*- 3 * Copyright (c) 2011 The NetBSD Foundation, Inc. 4 * All rights reserved. 5 * 6 * This code is derived from software contributed to The NetBSD Foundation 7 * by Joerg Sonnenberger. 8 * 9 * Redistribution and use in source and binary forms, with or without 10 * modification, are permitted provided that the following conditions 11 * are met: 12 * 1. Redistributions of source code must retain the above copyright 13 * notice, this list of conditions and the following disclaimer. 14 * 2. Redistributions in binary form must reproduce the above copyright 15 * notice, this list of conditions and the following disclaimer in the 16 * documentation and/or other materials provided with the distribution. 17 * 18 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS 19 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED 20 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR 21 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS 22 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 23 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF 24 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS 25 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN 26 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 27 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 28 * POSSIBILITY OF SUCH DAMAGE. 29 */ 30 31 #include <sys/cdefs.h> 32 __RCSID("$NetBSD: tls.c,v 1.23 2024/11/30 01:04:05 christos Exp $"); 33 34 /* 35 * Thread-local storage 36 * 37 * Reference: 38 * 39 * [ELFTLS] Ulrich Drepper, `ELF Handling For Thread-Local 40 * Storage', Version 0.21, 2023-08-22. 
41 * https://akkadia.org/drepper/tls.pdf 42 * https://web.archive.org/web/20240718081934/https://akkadia.org/drepper/tls.pdf 43 */ 44 45 #include <sys/param.h> 46 #include <sys/ucontext.h> 47 #include <lwp.h> 48 #include <stdalign.h> 49 #include <stddef.h> 50 #include <string.h> 51 #include "debug.h" 52 #include "rtld.h" 53 54 #include <machine/lwp_private.h> 55 56 #if defined(__HAVE_TLS_VARIANT_I) || defined(__HAVE_TLS_VARIANT_II) 57 58 static struct tls_tcb *_rtld_tls_allocate_locked(void); 59 static void *_rtld_tls_module_allocate(struct tls_tcb *, size_t); 60 61 /* 62 * DTV offset 63 * 64 * On some architectures (m68k, mips, or1k, powerpc, and riscv), 65 * the DTV offsets passed to __tls_get_addr have a bias relative 66 * to the start of the DTV, in order to maximize the range of TLS 67 * offsets that can be used by instruction encodings with signed 68 * displacements. 69 */ 70 #ifndef TLS_DTV_OFFSET 71 #define TLS_DTV_OFFSET 0 72 #endif 73 74 static size_t _rtld_tls_static_space; /* Static TLS space allocated */ 75 static size_t _rtld_tls_static_offset; /* Next offset for static TLS to use */ 76 size_t _rtld_tls_dtv_generation = 1; /* Bumped on each load of obj w/ TLS */ 77 size_t _rtld_tls_max_index = 1; /* Max index into up-to-date DTV */ 78 79 /* 80 * DTV -- Dynamic Thread Vector 81 * 82 * The DTV is a per-thread array that maps each module with 83 * thread-local storage to a pointer into part of the thread's TCB 84 * (thread control block), or dynamically loaded TLS blocks, 85 * reserved for that module's storage. 86 * 87 * The TCB itself, struct tls_tcb, has a pointer to the DTV at 88 * tcb->tcb_dtv. 
89 * 90 * The layout is: 91 * 92 * +---------------+ 93 * | max index | -1 max index i for which dtv[i] is alloced 94 * +---------------+ 95 * | generation | 0 void **dtv points here 96 * +---------------+ 97 * | obj 1 tls ptr | 1 TLS pointer for obj w/ obj->tlsindex 1 98 * +---------------+ 99 * | obj 2 tls ptr | 2 TLS pointer for obj w/ obj->tlsindex 2 100 * +---------------+ 101 * . 102 * . 103 * . 104 * 105 * The values of obj->tlsindex start at 1; this way, 106 * dtv[obj->tlsindex] works, when dtv[0] is the generation. The 107 * TLS pointers go either into the static thread-local storage, 108 * for the initial objects (i.e., those loaded at startup), or 109 * into TLS blocks dynamically allocated for objects that 110 * dynamically loaded by dlopen. 111 * 112 * The generation field is a cache of the global generation number 113 * _rtld_tls_dtv_generation, which is bumped every time an object 114 * with TLS is loaded in _rtld_map_object, and cached by 115 * __tls_get_addr (via _rtld_tls_get_addr) when a newly loaded 116 * module lies outside the bounds of the current DTV. 117 * 118 * XXX Why do we keep max index and generation separately? They 119 * appear to be initialized the same, always incremented together, 120 * and always stored together. 121 * 122 * XXX Why is this not a struct? 123 * 124 * struct dtv { 125 * size_t dtv_gen; 126 * void *dtv_module[]; 127 * }; 128 */ 129 #define DTV_GENERATION(dtv) ((size_t)((dtv)[0])) 130 #define DTV_MAX_INDEX(dtv) ((size_t)((dtv)[-1])) 131 #define SET_DTV_GENERATION(dtv, val) (dtv)[0] = (void *)(size_t)(val) 132 #define SET_DTV_MAX_INDEX(dtv, val) (dtv)[-1] = (void *)(size_t)(val) 133 134 /* 135 * _rtld_tls_get_addr(tcb, idx, offset) 136 * 137 * Slow path for __tls_get_addr (see below), called to allocate 138 * TLS space if needed for the object obj with obj->tlsindex idx, 139 * at offset, which must be below obj->tlssize. 
 *
 *	This may allocate a DTV if the current one is too old, and it
 *	may allocate a dynamically loaded TLS block if there isn't one
 *	already allocated for it.
 *
 *	XXX Why is the first argument passed as `void *tls' instead of
 *	just `struct tls_tcb *tcb'?
 */
void *
_rtld_tls_get_addr(void *tls, size_t idx, size_t offset)
{
	struct tls_tcb *tcb = tls;
	void **dtv, **new_dtv;
	sigset_t mask;

	/* Serialize with other rtld operations on the DTV. */
	_rtld_exclusive_enter(&mask);

	dtv = tcb->tcb_dtv;

	/*
	 * If the generation number has changed, we have to allocate a
	 * new DTV.
	 *
	 * XXX Do we really?  Isn't it enough to check whether idx <=
	 * DTV_MAX_INDEX(dtv)?
	 */
	if (__predict_false(DTV_GENERATION(dtv) != _rtld_tls_dtv_generation)) {
		size_t to_copy = DTV_MAX_INDEX(dtv);

		/*
		 * "2 +" because the first element is the generation and
		 * the second one is the maximum index.
		 */
		new_dtv = xcalloc((2 + _rtld_tls_max_index) * sizeof(*dtv));
		++new_dtv;	/* advance past DTV_MAX_INDEX */
		if (to_copy > _rtld_tls_max_index)	/* XXX How? */
			to_copy = _rtld_tls_max_index;
		/* Carry over the TLS pointers the old DTV already had. */
		memcpy(new_dtv + 1, dtv + 1, to_copy * sizeof(*dtv));
		xfree(dtv - 1);	/* retreat back to DTV_MAX_INDEX */
		/* Publish the new DTV in the TCB. */
		dtv = tcb->tcb_dtv = new_dtv;
		SET_DTV_MAX_INDEX(dtv, _rtld_tls_max_index);
		SET_DTV_GENERATION(dtv, _rtld_tls_dtv_generation);
	}

	/* Lazily allocate this module's TLS block on first access. */
	if (__predict_false(dtv[idx] == NULL))
		dtv[idx] = _rtld_tls_module_allocate(tcb, idx);

	_rtld_exclusive_exit(&mask);

	return (uint8_t *)dtv[idx] + offset;
}

/*
 * _rtld_tls_initial_allocation()
 *
 *	Allocate the TCB (thread control block) for the initial thread,
 *	once the static TLS space usage has been determined (plus some
 *	slop to allow certain special cases like Mesa to be dlopened).
 *
 *	This must be done _after_ all initial objects (i.e., those
 *	loaded at startup, as opposed to objects dynamically loaded by
 *	dlopen) have had TLS offsets allocated if need be by
 *	_rtld_tls_offset_allocate, and have had relocations processed.
 */
void
_rtld_tls_initial_allocation(void)
{
	struct tls_tcb *tcb;

	_rtld_tls_static_space = _rtld_tls_static_offset +
	    RTLD_STATIC_TLS_RESERVATION;

#ifndef __HAVE_TLS_VARIANT_I
	/* Variant II: round the static area to the strictest alignment. */
	_rtld_tls_static_space = roundup2(_rtld_tls_static_space,
	    alignof(max_align_t));
#endif
	dbg(("_rtld_tls_static_space %zu", _rtld_tls_static_space));

	tcb = _rtld_tls_allocate_locked();
	/* Install the TCB as the initial thread's private pointer. */
#ifdef __HAVE___LWP_SETTCB
	__lwp_settcb(tcb);
#else
	_lwp_setprivate(tcb);
#endif
}

/*
 * _rtld_tls_allocate_locked()
 *
 *	Internal subroutine to allocate a TCB (thread control block)
 *	for the current thread.
 *
 *	This allocates a DTV and a TCB that points to it, including
 *	static space in the TCB for the TLS of the initial objects.
 *	TLS blocks for dynamically loaded objects are allocated lazily.
 *
 *	Caller must either be single-threaded (at startup via
 *	_rtld_tls_initial_allocation) or hold the rtld exclusive lock
 *	(via _rtld_tls_allocate).
 */
static struct tls_tcb *
_rtld_tls_allocate_locked(void)
{
	Obj_Entry *obj;
	struct tls_tcb *tcb;
	uint8_t *p, *q;

	p = xcalloc(_rtld_tls_static_space + sizeof(struct tls_tcb));
#ifdef __HAVE_TLS_VARIANT_I
	/* Variant I: TCB at the start, static TLS just after it. */
	tcb = (struct tls_tcb *)p;
	p += sizeof(struct tls_tcb);
#else
	/* Variant II: static TLS below the TCB; tcb_self points at itself. */
	p += _rtld_tls_static_space;
	tcb = (struct tls_tcb *)p;
	tcb->tcb_self = tcb;
#endif
	dbg(("lwp %d tls tcb %p", _lwp_self(), tcb));
	/*
	 * "2 +" because the first element is the generation and the second
	 * one is the maximum index.
	 */
	tcb->tcb_dtv = xcalloc(sizeof(*tcb->tcb_dtv) * (2 + _rtld_tls_max_index));
	++tcb->tcb_dtv;	/* advance past DTV_MAX_INDEX */
	SET_DTV_MAX_INDEX(tcb->tcb_dtv, _rtld_tls_max_index);
	SET_DTV_GENERATION(tcb->tcb_dtv, _rtld_tls_dtv_generation);

	/*
	 * Point each initial object's DTV slot into the static TLS area
	 * and copy in its TLS initialization image, if any.
	 */
	for (obj = _rtld_objlist; obj != NULL; obj = obj->next) {
		if (obj->tls_static) {
#ifdef __HAVE_TLS_VARIANT_I
			q = p + obj->tlsoffset;
#else
			q = p - obj->tlsoffset;
#endif
			dbg(("%s: [lwp %d] tls dtv %p index %zu offset %zu",
			    obj->path, _lwp_self(),
			    q, obj->tlsindex, obj->tlsoffset));
			if (obj->tlsinitsize)
				memcpy(q, obj->tlsinit, obj->tlsinitsize);
			tcb->tcb_dtv[obj->tlsindex] = q;
		}
	}

	return tcb;
}

/*
 * _rtld_tls_allocate()
 *
 *	Allocate a TCB (thread control block) for the current thread.
 *
 *	Called by pthread_create for non-initial threads.  (The initial
 *	thread's TCB is allocated by _rtld_tls_initial_allocation.)
 */
struct tls_tcb *
_rtld_tls_allocate(void)
{
	struct tls_tcb *tcb;
	sigset_t mask;

	_rtld_exclusive_enter(&mask);
	tcb = _rtld_tls_allocate_locked();
	_rtld_exclusive_exit(&mask);

	return tcb;
}

/*
 * _rtld_tls_free(tcb)
 *
 *	Free a TCB allocated with _rtld_tls_allocate.
 *
 *	Frees any TLS blocks for dynamically loaded objects that tcb's
 *	DTV points to, and frees tcb's DTV, and frees tcb.
 */
void
_rtld_tls_free(struct tls_tcb *tcb)
{
	size_t i, max_index;
	uint8_t *p, *p_end;
	sigset_t mask;

	_rtld_exclusive_enter(&mask);

	/* Compute the bounds [p, p_end) of the static TLS area. */
#ifdef __HAVE_TLS_VARIANT_I
	p = (uint8_t *)tcb;
#else
	p = (uint8_t *)tcb - _rtld_tls_static_space;
#endif
	p_end = p + _rtld_tls_static_space;

	/*
	 * Free only dynamically allocated TLS blocks -- i.e., DTV
	 * entries pointing outside the static TLS area.
	 */
	max_index = DTV_MAX_INDEX(tcb->tcb_dtv);
	for (i = 1; i <= max_index; ++i) {
		if ((uint8_t *)tcb->tcb_dtv[i] < p ||
		    (uint8_t *)tcb->tcb_dtv[i] >= p_end)
			xfree(tcb->tcb_dtv[i]);
	}
	xfree(tcb->tcb_dtv - 1);	/* retreat back to DTV_MAX_INDEX */
	xfree(p);

	_rtld_exclusive_exit(&mask);
}

/*
 * _rtld_tls_module_allocate(tcb, idx)
 *
 *	Allocate thread-local storage in the thread with the given TCB
 *	(thread control block) for the object obj whose obj->tlsindex
 *	is idx.
 *
 *	If obj has had space in static TLS reserved (obj->tls_static),
 *	return a pointer into that.  Otherwise, allocate a TLS block,
 *	mark obj as having a TLS block allocated (obj->tls_dynamic),
 *	and return it.
 *
 *	Called by _rtld_tls_get_addr to get the thread-local storage
 *	for an object the first time around.
 */
static void *
_rtld_tls_module_allocate(struct tls_tcb *tcb, size_t idx)
{
	Obj_Entry *obj;
	uint8_t *p;

	/* Find the object that was assigned TLS index idx at load time. */
	for (obj = _rtld_objlist; obj != NULL; obj = obj->next) {
		if (obj->tlsindex == idx)
			break;
	}
	if (obj == NULL) {
		_rtld_error("Module for TLS index %zu missing", idx);
		_rtld_die();
	}
	if (obj->tls_static) {
		/* Static reservation: point into the TCB's static TLS area. */
#ifdef __HAVE_TLS_VARIANT_I
		p = (uint8_t *)tcb + obj->tlsoffset + sizeof(struct tls_tcb);
#else
		p = (uint8_t *)tcb - obj->tlsoffset;
#endif
		return p;
	}

	/*
	 * Dynamic block: copy the TLS initialization image and
	 * zero-fill the rest (the tbss part).
	 */
	p = xmalloc(obj->tlssize);
	memcpy(p, obj->tlsinit, obj->tlsinitsize);
	memset(p + obj->tlsinitsize, 0, obj->tlssize - obj->tlsinitsize);

	obj->tls_dynamic = 1;

	return p;
}

/*
 * _rtld_tls_offset_allocate(obj)
 *
 *	Allocate a static thread-local storage offset for obj.
 *
 *	Called by _rtld at startup for all initial objects.  Called
 *	also by MD relocation logic, which is allowed (for Mesa) to
 *	allocate an additional 64 bytes (RTLD_STATIC_TLS_RESERVATION)
 *	of static thread-local storage in dlopened objects.
 */
int
_rtld_tls_offset_allocate(Obj_Entry *obj)
{
	size_t offset, next_offset;

	/* Already using a dynamic TLS block -- cannot also go static. */
	if (obj->tls_dynamic)
		return -1;

	if (obj->tls_static)
		return 0;
	if (obj->tlssize == 0) {
		obj->tlsoffset = 0;
		obj->tls_static = 1;
		return 0;
	}

#ifdef __HAVE_TLS_VARIANT_I
	/* Variant I: offsets grow upward from the TCB. */
	offset = roundup2(_rtld_tls_static_offset, obj->tlsalign);
	next_offset = offset + obj->tlssize;
#else
	/* Variant II: offsets are measured back down from the TCB. */
	offset = roundup2(_rtld_tls_static_offset + obj->tlssize,
	    obj->tlsalign);
	next_offset = offset;
#endif

	/*
	 * Check if the static allocation was already done.
	 * This happens if dynamically loaded modules want to use
	 * static TLS space.
	 *
	 * XXX Keep an actual free list and callbacks for initialisation.
	 */
	if (_rtld_tls_static_space) {
		if (obj->tlsinitsize) {
			_rtld_error("%s: Use of initialized "
			    "Thread Local Storage with model initial-exec "
			    "and dlopen is not supported",
			    obj->path);
			return -1;
		}
		if (next_offset > _rtld_tls_static_space) {
			_rtld_error("%s: No space available "
			    "for static Thread Local Storage",
			    obj->path);
			return -1;
		}
	}
	obj->tlsoffset = offset;
	dbg(("%s: static tls offset 0x%zx size %zu\n",
	    obj->path, obj->tlsoffset, obj->tlssize));
	_rtld_tls_static_offset = next_offset;
	obj->tls_static = 1;

	return 0;
}

/*
 * _rtld_tls_offset_free(obj)
 *
 *	Free a static thread-local storage offset for obj.
 *
 *	Called by dlclose (via _rtld_unload_object -> _rtld_obj_free).
 *
 *	Since static thread-local storage is normally not used by
 *	dlopened objects (with the exception of Mesa), this doesn't do
 *	anything to recycle the space right now.
 */
void
_rtld_tls_offset_free(Obj_Entry *obj)
{

	/*
	 * XXX See above.
	 */
	obj->tls_static = 0;
	return;
}

#if defined(__HAVE_COMMON___TLS_GET_ADDR) && defined(RTLD_LOADER)
/*
 * __tls_get_addr(tlsindex)
 *
 *	Symbol directly called by code generated by the compiler for
 *	references to thread-local storage in the general-dynamic or
 *	local-dynamic TLS models (but not initial-exec or local-exec).
 *
 *	The argument is a pointer to
 *
 *		struct {
 *			unsigned long int ti_module;
 *			unsigned long int ti_offset;
 *		};
 *
 *	as in, e.g., [ELFTLS] Sec. 3.4.3.  This coincides with the
 *	type size_t[2] on all architectures that use this common
 *	__tls_get_addr definition (XXX but why do we write it as
 *	size_t[2]?).
 *
 *	ti_module, i.e., arg[0], is the obj->tlsindex assigned at
 *	load-time by _rtld_map_object, and ti_offset, i.e., arg[1], is
 *	assigned at link-time by ld(1), possibly adjusted by
 *	TLS_DTV_OFFSET.
 *
 *	Some architectures -- specifically IA-64 -- use a different
 *	calling convention.  Some architectures -- specifically i386
 *	-- also use another entry point ___tls_get_addr (that's three
 *	leading underscores) with a different calling convention.
 */
void *
__tls_get_addr(void *arg_)
{
	size_t *arg = (size_t *)arg_;
	void **dtv;
#ifdef __HAVE___LWP_GETTCB_FAST
	struct tls_tcb * const tcb = __lwp_gettcb_fast();
#else
	struct tls_tcb * const tcb = __lwp_getprivate_fast();
#endif
	size_t idx = arg[0], offset = arg[1] + TLS_DTV_OFFSET;

	dtv = tcb->tcb_dtv;

	/*
	 * Fast path: access to an already allocated DTV entry.  This
	 * checks the current limit and the entry without needing any
	 * locking.  Entries are only freed on dlclose() and it is an
	 * application bug if code of the module is still running at
	 * that point.
	 */
	if (__predict_true(idx <= DTV_MAX_INDEX(dtv) && dtv[idx] != NULL))
		return (uint8_t *)dtv[idx] + offset;

	/* Slow path: take the lock and allocate DTV and/or TLS block. */
	return _rtld_tls_get_addr(tcb, idx, offset);
}
#endif

#endif /* __HAVE_TLS_VARIANT_I || __HAVE_TLS_VARIANT_II */