/* $NetBSD: tls.c,v 1.12.2.3 2024/08/07 11:01:57 martin Exp $ */ /*- * Copyright (c) 2011 The NetBSD Foundation, Inc. * All rights reserved. * * This code is derived from software contributed to The NetBSD Foundation * by Joerg Sonnenberger. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE * POSSIBILITY OF SUCH DAMAGE. */ #include __RCSID("$NetBSD: tls.c,v 1.12.2.3 2024/08/07 11:01:57 martin Exp $"); /* * Thread-local storage * * Reference: * * [ELFTLS] Ulrich Drepper, `ELF Handling For Thread-Local * Storage', Version 0.21, 2023-08-22. * https://akkadia.org/drepper/tls.pdf * https://web.archive.org/web/20240718081934/https://akkadia.org/drepper/tls.pdf */ #include #include #include #include #include #include #include "debug.h" #include "rtld.h" #if defined(__HAVE_TLS_VARIANT_I) || defined(__HAVE_TLS_VARIANT_II) static struct tls_tcb *_rtld_tls_allocate_locked(void); static void *_rtld_tls_module_allocate(struct tls_tcb *, size_t); /* * DTV offset * * On some architectures (m68k, mips, or1k, powerpc, and riscv), * the DTV offsets passed to __tls_get_addr have a bias relative * to the start of the DTV, in order to maximize the range of TLS * offsets that can be used by instruction encodings with signed * displacements. */ #ifndef TLS_DTV_OFFSET #define TLS_DTV_OFFSET 0 #endif static size_t _rtld_tls_static_space; /* Static TLS space allocated */ static size_t _rtld_tls_static_offset; /* Next offset for static TLS to use */ size_t _rtld_tls_dtv_generation = 1; /* Bumped on each load of obj w/ TLS */ size_t _rtld_tls_max_index = 1; /* Max index into up-to-date DTV */ /* * DTV -- Dynamic Thread Vector * * The DTV is a per-thread array that maps each module with * thread-local storage to a pointer into part of the thread's TCB * (thread control block), or dynamically loaded TLS blocks, * reserved for that module's storage. * * The TCB itself, struct tls_tcb, has a pointer to the DTV at * tcb->tcb_dtv. * * The layout is: * * +---------------+ * | max index | -1 max index i for which dtv[i] is alloced * +---------------+ * | generation | 0 void **dtv points here * +---------------+ * | obj 1 tls ptr | 1 TLS pointer for obj w/ obj->tlsindex 1 * +---------------+ * | obj 2 tls ptr | 2 TLS pointer for obj w/ obj->tlsindex 2 * +---------------+ * . * . * . * * The values of obj->tlsindex start at 1; this way, * dtv[obj->tlsindex] works, when dtv[0] is the generation. The * TLS pointers go either into the static thread-local storage, * for the initial objects (i.e., those loaded at startup), or * into TLS blocks dynamically allocated for objects that * dynamically loaded by dlopen. * * The generation field is a cache of the global generation number * _rtld_tls_dtv_generation, which is bumped every time an object * with TLS is loaded in _rtld_map_object, and cached by * __tls_get_addr (via _rtld_tls_get_addr) when a newly loaded * module lies outside the bounds of the current DTV. * * XXX Why do we keep max index and generation separately? They * appear to be initialized the same, always incremented together, * and always stored together. * * XXX Why is this not a struct? * * struct dtv { * size_t dtv_gen; * void *dtv_module[]; * }; */ #define DTV_GENERATION(dtv) ((size_t)((dtv)[0])) #define DTV_MAX_INDEX(dtv) ((size_t)((dtv)[-1])) #define SET_DTV_GENERATION(dtv, val) (dtv)[0] = (void *)(size_t)(val) #define SET_DTV_MAX_INDEX(dtv, val) (dtv)[-1] = (void *)(size_t)(val) /* * _rtld_tls_get_addr(tcb, idx, offset) * * Slow path for __tls_get_addr (see below), called to allocate * TLS space if needed for the object obj with obj->tlsindex idx, * at offset, which must be below obj->tlssize. * * This may allocate a DTV if the current one is too old, and it * may allocate a dynamically loaded TLS block if there isn't one * already allocated for it. * * XXX Why is the first argument passed as `void *tls' instead of * just `struct tls_tcb *tcb'? */ void * _rtld_tls_get_addr(void *tls, size_t idx, size_t offset) { struct tls_tcb *tcb = tls; void **dtv, **new_dtv; sigset_t mask; _rtld_exclusive_enter(&mask); dtv = tcb->tcb_dtv; /* * If the generation number has changed, we have to allocate a * new DTV. * * XXX Do we really? Isn't it enough to check whether idx <= * DTV_MAX_INDEX(dtv)? */ if (__predict_false(DTV_GENERATION(dtv) != _rtld_tls_dtv_generation)) { size_t to_copy = DTV_MAX_INDEX(dtv); /* * "2 +" because the first element is the generation and * the second one is the maximum index. */ new_dtv = xcalloc((2 + _rtld_tls_max_index) * sizeof(*dtv)); ++new_dtv; /* advance past DTV_MAX_INDEX */ if (to_copy > _rtld_tls_max_index) /* XXX How? */ to_copy = _rtld_tls_max_index; memcpy(new_dtv + 1, dtv + 1, to_copy * sizeof(*dtv)); xfree(dtv - 1); /* retreat back to DTV_MAX_INDEX */ dtv = tcb->tcb_dtv = new_dtv; SET_DTV_MAX_INDEX(dtv, _rtld_tls_max_index); SET_DTV_GENERATION(dtv, _rtld_tls_dtv_generation); } if (__predict_false(dtv[idx] == NULL)) dtv[idx] = _rtld_tls_module_allocate(tcb, idx); _rtld_exclusive_exit(&mask); return (uint8_t *)dtv[idx] + offset; } /* * _rtld_tls_initial_allocation() * * Allocate the TCB (thread control block) for the initial thread, * once the static TLS space usage has been determined (plus some * slop to allow certain special cases like Mesa to be dlopened). * * This must be done _after_ all initial objects (i.e., those * loaded at startup, as opposed to objects dynamically loaded by * dlopen) have had TLS offsets allocated if need be by * _rtld_tls_offset_allocate, and have had relocations processed. */ void _rtld_tls_initial_allocation(void) { struct tls_tcb *tcb; _rtld_tls_static_space = _rtld_tls_static_offset + RTLD_STATIC_TLS_RESERVATION; #ifndef __HAVE_TLS_VARIANT_I _rtld_tls_static_space = roundup2(_rtld_tls_static_space, alignof(max_align_t)); #endif dbg(("_rtld_tls_static_space %zu", _rtld_tls_static_space)); tcb = _rtld_tls_allocate_locked(); #ifdef __HAVE___LWP_SETTCB __lwp_settcb(tcb); #else _lwp_setprivate(tcb); #endif } /* * _rtld_tls_allocate_locked() * * Internal subroutine to allocate a TCB (thread control block) * for the current thread. * * This allocates a DTV and a TCB that points to it, including * static space in the TCB for the TLS of the initial objects. * TLS blocks for dynamically loaded objects are allocated lazily. * * Caller must either be single-threaded (at startup via * _rtld_tls_initial_allocation) or hold the rtld exclusive lock * (via _rtld_tls_allocate). */ static struct tls_tcb * _rtld_tls_allocate_locked(void) { Obj_Entry *obj; struct tls_tcb *tcb; uint8_t *p, *q; p = xcalloc(_rtld_tls_static_space + sizeof(struct tls_tcb)); #ifdef __HAVE_TLS_VARIANT_I tcb = (struct tls_tcb *)p; p += sizeof(struct tls_tcb); #else p += _rtld_tls_static_space; tcb = (struct tls_tcb *)p; tcb->tcb_self = tcb; #endif dbg(("lwp %d tls tcb %p", _lwp_self(), tcb)); /* * "2 +" because the first element is the generation and the second * one is the maximum index. */ tcb->tcb_dtv = xcalloc(sizeof(*tcb->tcb_dtv) * (2 + _rtld_tls_max_index)); ++tcb->tcb_dtv; /* advance past DTV_MAX_INDEX */ SET_DTV_MAX_INDEX(tcb->tcb_dtv, _rtld_tls_max_index); SET_DTV_GENERATION(tcb->tcb_dtv, _rtld_tls_dtv_generation); for (obj = _rtld_objlist; obj != NULL; obj = obj->next) { if (obj->tls_static) { #ifdef __HAVE_TLS_VARIANT_I q = p + obj->tlsoffset; #else q = p - obj->tlsoffset; #endif dbg(("%s: [lwp %d] tls dtv %p index %zu offset %zu", obj->path, _lwp_self(), q, obj->tlsindex, obj->tlsoffset)); if (obj->tlsinitsize) memcpy(q, obj->tlsinit, obj->tlsinitsize); tcb->tcb_dtv[obj->tlsindex] = q; } } return tcb; } /* * _rtld_tls_allocate() * * Allocate a TCB (thread control block) for the current thread. * * Called by pthread_create for non-initial threads. (The initial * thread's TCB is allocated by _rtld_tls_initial_allocation.) */ struct tls_tcb * _rtld_tls_allocate(void) { struct tls_tcb *tcb; sigset_t mask; _rtld_exclusive_enter(&mask); tcb = _rtld_tls_allocate_locked(); _rtld_exclusive_exit(&mask); return tcb; } /* * _rtld_tls_free(tcb) * * Free a TCB allocated with _rtld_tls_allocate. * * Frees any TLS blocks for dynamically loaded objects that tcb's * DTV points to, and frees tcb's DTV, and frees tcb. */ void _rtld_tls_free(struct tls_tcb *tcb) { size_t i, max_index; uint8_t *p, *p_end; sigset_t mask; _rtld_exclusive_enter(&mask); #ifdef __HAVE_TLS_VARIANT_I p = (uint8_t *)tcb; #else p = (uint8_t *)tcb - _rtld_tls_static_space; #endif p_end = p + _rtld_tls_static_space; max_index = DTV_MAX_INDEX(tcb->tcb_dtv); for (i = 1; i <= max_index; ++i) { if ((uint8_t *)tcb->tcb_dtv[i] < p || (uint8_t *)tcb->tcb_dtv[i] >= p_end) xfree(tcb->tcb_dtv[i]); } xfree(tcb->tcb_dtv - 1); /* retreat back to DTV_MAX_INDEX */ xfree(p); _rtld_exclusive_exit(&mask); } /* * _rtld_tls_module_allocate(tcb, idx) * * Allocate thread-local storage in the thread with the given TCB * (thread control block) for the object obj whose obj->tlsindex * is idx. * * If obj has had space in static TLS reserved (obj->tls_static), * return a pointer into that. Otherwise, allocate a TLS block, * mark obj as having a TLS block allocated (obj->tls_dynamic), * and return it. * * Called by _rtld_tls_get_addr to get the thread-local storage * for an object the first time around. */ static void * _rtld_tls_module_allocate(struct tls_tcb *tcb, size_t idx) { Obj_Entry *obj; uint8_t *p; for (obj = _rtld_objlist; obj != NULL; obj = obj->next) { if (obj->tlsindex == idx) break; } if (obj == NULL) { _rtld_error("Module for TLS index %zu missing", idx); _rtld_die(); } if (obj->tls_static) { #ifdef __HAVE_TLS_VARIANT_I p = (uint8_t *)tcb + obj->tlsoffset + sizeof(struct tls_tcb); #else p = (uint8_t *)tcb - obj->tlsoffset; #endif return p; } p = xmalloc(obj->tlssize); memcpy(p, obj->tlsinit, obj->tlsinitsize); memset(p + obj->tlsinitsize, 0, obj->tlssize - obj->tlsinitsize); obj->tls_dynamic = 1; return p; } /* * _rtld_tls_offset_allocate(obj) * * Allocate a static thread-local storage offset for obj. * * Called by _rtld at startup for all initial objects. Called * also by MD relocation logic, which is allowed (for Mesa) to * allocate an additional 64 bytes (RTLD_STATIC_TLS_RESERVATION) * of static thread-local storage in dlopened objects. */ int _rtld_tls_offset_allocate(Obj_Entry *obj) { size_t offset, next_offset; if (obj->tls_dynamic) return -1; if (obj->tls_static) return 0; if (obj->tlssize == 0) { obj->tlsoffset = 0; obj->tls_static = 1; return 0; } #ifdef __HAVE_TLS_VARIANT_I offset = roundup2(_rtld_tls_static_offset, obj->tlsalign); next_offset = offset + obj->tlssize; #else offset = roundup2(_rtld_tls_static_offset + obj->tlssize, obj->tlsalign); next_offset = offset; #endif /* * Check if the static allocation was already done. * This happens if dynamically loaded modules want to use * static TLS space. * * XXX Keep an actual free list and callbacks for initialisation. */ if (_rtld_tls_static_space) { if (obj->tlsinitsize) { _rtld_error("%s: Use of initialized " "Thread Local Storage with model initial-exec " "and dlopen is not supported", obj->path); return -1; } if (next_offset > _rtld_tls_static_space) { _rtld_error("%s: No space available " "for static Thread Local Storage", obj->path); return -1; } } obj->tlsoffset = offset; dbg(("%s: static tls offset 0x%zx size %zu\n", obj->path, obj->tlsoffset, obj->tlssize)); _rtld_tls_static_offset = next_offset; obj->tls_static = 1; return 0; } /* * _rtld_tls_offset_free(obj) * * Free a static thread-local storage offset for obj. * * Called by dlclose (via _rtld_unload_object -> _rtld_obj_free). * * Since static thread-local storage is normally not used by * dlopened objects (with the exception of Mesa), this doesn't do * anything to recycle the space right now. */ void _rtld_tls_offset_free(Obj_Entry *obj) { /* * XXX See above. */ obj->tls_static = 0; return; } #if defined(__HAVE_COMMON___TLS_GET_ADDR) && defined(RTLD_LOADER) /* * __tls_get_addr(tlsindex) * * Symbol directly called by code generated by the compiler for * references thread-local storage in the general-dynamic or * local-dynamic TLS models (but not initial-exec or local-exec). * * The argument is a pointer to * * struct { * unsigned long int ti_module; * unsigned long int ti_offset; * }; * * as in, e.g., [ELFTLS] Sec. 3.4.3. This coincides with the * type size_t[2] on all architectures that use this common * __tls_get_addr definition (XXX but why do we write it as * size_t[2]?). * * ti_module, i.e., arg[0], is the obj->tlsindex assigned at * load-time by _rtld_map_object, and ti_offset, i.e., arg[1], is * assigned at link-time by ld(1), possibly adjusted by * TLS_DTV_OFFSET. * * Some architectures -- specifically IA-64 -- use a different * calling convention. Some architectures -- specifically i386 * -- also use another entry point ___tls_get_addr (that's three * leading underscores) with a different calling convention. */ void * __tls_get_addr(void *arg_) { size_t *arg = (size_t *)arg_; void **dtv; #ifdef __HAVE___LWP_GETTCB_FAST struct tls_tcb * const tcb = __lwp_gettcb_fast(); #else struct tls_tcb * const tcb = __lwp_getprivate_fast(); #endif size_t idx = arg[0], offset = arg[1] + TLS_DTV_OFFSET; dtv = tcb->tcb_dtv; /* * Fast path: access to an already allocated DTV entry. This * checks the current limit and the entry without needing any * locking. Entries are only freed on dlclose() and it is an * application bug if code of the module is still running at * that point. */ if (__predict_true(idx < DTV_MAX_INDEX(dtv) && dtv[idx] != NULL)) return (uint8_t *)dtv[idx] + offset; return _rtld_tls_get_addr(tcb, idx, offset); } #endif #endif /* __HAVE_TLS_VARIANT_I || __HAVE_TLS_VARIANT_II */