1/* $NetBSD: lwp.h,v 1.172 2016/07/03 14:24:59 christos Exp $ */
2
3/*-
4 * Copyright (c) 2001, 2006, 2007, 2008, 2009, 2010
5 * The NetBSD Foundation, Inc.
6 * All rights reserved.
7 *
8 * This code is derived from software contributed to The NetBSD Foundation
9 * by Nathan J. Williams and Andrew Doran.
10 *
11 * Redistribution and use in source and binary forms, with or without
12 * modification, are permitted provided that the following conditions
13 * are met:
14 * 1. Redistributions of source code must retain the above copyright
15 * notice, this list of conditions and the following disclaimer.
16 * 2. Redistributions in binary form must reproduce the above copyright
17 * notice, this list of conditions and the following disclaimer in the
18 * documentation and/or other materials provided with the distribution.
19 *
20 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
21 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
22 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
23 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
24 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
25 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
26 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
27 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
28 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
29 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
30 * POSSIBILITY OF SUCH DAMAGE.
31 */
32
33#ifndef _SYS_LWP_H_
34#define _SYS_LWP_H_
35
36#if defined(_KERNEL) || defined(_KMEMUSER)
37
38#include <sys/param.h>
39#include <sys/time.h>
40#include <sys/queue.h>
41#include <sys/callout.h>
42#include <sys/kcpuset.h>
43#include <sys/mutex.h>
44#include <sys/condvar.h>
45#include <sys/signalvar.h>
46#include <sys/sched.h>
47#include <sys/specificdata.h>
48#include <sys/syncobj.h>
49#include <sys/resource.h>
50
51#if defined(_KERNEL)
52struct lwp;
53/* forward declare this for <machine/cpu.h> so it can get l_cpu. */
54static inline struct cpu_info *lwp_getcpu(struct lwp *);
55#include <machine/cpu.h> /* curcpu() and cpu_info */
56#endif
57
58#include <machine/proc.h> /* Machine-dependent proc substruct. */
59
60/*
61 * Lightweight process. Field markings and the corresponding locks:
62 *
63 * a: proc_lock
64 * c: condition variable interlock, passed to cv_wait()
65 * l: *l_mutex
66 * p: l_proc->p_lock
67 * s: spc_mutex, which may or may not be referenced by l_mutex
68 * S: l_selcluster->sc_lock
69 * (: unlocked, stable
70 * !: unlocked, may only be reliably accessed by the LWP itself
71 *
 * Fields are clustered together by usage (to increase the likelihood
73 * of cache hits) and by size (to reduce dead space in the structure).
74 */
75
76#include <sys/pcu.h>
77
78struct lockdebug;
79struct sysent;
80
81struct lwp {
82 /* Scheduling and overall state. */
83 TAILQ_ENTRY(lwp) l_runq; /* s: run queue */
84 union {
85 void * info; /* s: scheduler-specific structure */
86 u_int timeslice; /* l: time-quantum for SCHED_M2 */
87 } l_sched;
88 struct cpu_info *volatile l_cpu;/* s: CPU we're on if LSONPROC */
89 kmutex_t * volatile l_mutex; /* l: ptr to mutex on sched state */
90 int l_ctxswtch; /* l: performing a context switch */
91 void *l_addr; /* l: PCB address; use lwp_getpcb() */
92 struct mdlwp l_md; /* l: machine-dependent fields. */
93 int l_flag; /* l: misc flag values */
94 int l_stat; /* l: overall LWP status */
95 struct bintime l_rtime; /* l: real time */
96 struct bintime l_stime; /* l: start time (while ONPROC) */
97 u_int l_swtime; /* l: time swapped in or out */
98 u_int l_rticks; /* l: Saved start time of run */
99 u_int l_rticksum; /* l: Sum of ticks spent running */
100 u_int l_slpticks; /* l: Saved start time of sleep */
101 u_int l_slpticksum; /* l: Sum of ticks spent sleeping */
102 int l_biglocks; /* l: biglock count before sleep */
103 int l_class; /* l: scheduling class */
104 int l_kpriority; /* !: has kernel priority boost */
105 pri_t l_kpribase; /* !: kernel priority base level */
106 pri_t l_priority; /* l: scheduler priority */
107 pri_t l_inheritedprio;/* l: inherited priority */
108 pri_t l_protectprio; /* l: for PTHREAD_PRIO_PROTECT */
109 pri_t l_auxprio; /* l: max(inherit,protect) priority */
110 int l_protectdepth; /* l: for PTHREAD_PRIO_PROTECT */
111 SLIST_HEAD(, turnstile) l_pi_lenders; /* l: ts lending us priority */
112 uint64_t l_ncsw; /* l: total context switches */
113 uint64_t l_nivcsw; /* l: involuntary context switches */
114 u_int l_cpticks; /* (: Ticks of CPU time */
115 fixpt_t l_pctcpu; /* p: %cpu during l_swtime */
116 fixpt_t l_estcpu; /* l: cpu time for SCHED_4BSD */
117 psetid_t l_psid; /* l: assigned processor-set ID */
118 struct cpu_info *l_target_cpu; /* l: target CPU to migrate */
119 struct lwpctl *l_lwpctl; /* p: lwpctl block kernel address */
120 struct lcpage *l_lcpage; /* p: lwpctl containing page */
121 kcpuset_t *l_affinity; /* l: CPU set for affinity */
122
123 /* Synchronisation. */
124 struct turnstile *l_ts; /* l: current turnstile */
125 struct syncobj *l_syncobj; /* l: sync object operations set */
126 TAILQ_ENTRY(lwp) l_sleepchain; /* l: sleep queue */
127 wchan_t l_wchan; /* l: sleep address */
128 const char *l_wmesg; /* l: reason for sleep */
129 struct sleepq *l_sleepq; /* l: current sleep queue */
130 int l_sleeperr; /* !: error before unblock */
131 u_int l_slptime; /* l: time since last blocked */
132 callout_t l_timeout_ch; /* !: callout for tsleep */
133 u_int l_emap_gen; /* !: emap generation number */
134 kcondvar_t l_waitcv; /* a: vfork() wait */
135
136#if PCU_UNIT_COUNT > 0
137 struct cpu_info * volatile l_pcu_cpu[PCU_UNIT_COUNT];
138 uint32_t l_pcu_valid;
139#endif
140
141 /* Process level and global state, misc. */
142 LIST_ENTRY(lwp) l_list; /* a: entry on list of all LWPs */
143 void *l_ctxlink; /* p: uc_link {get,set}context */
144 struct proc *l_proc; /* p: parent process */
145 LIST_ENTRY(lwp) l_sibling; /* p: entry on proc's list of LWPs */
146 lwpid_t l_waiter; /* p: first LWP waiting on us */
147 lwpid_t l_waitingfor; /* p: specific LWP we are waiting on */
148 int l_prflag; /* p: process level flags */
149 u_int l_refcnt; /* p: reference count on this LWP */
150 lwpid_t l_lid; /* (: LWP identifier; local to proc */
151 char *l_name; /* (: name, optional */
152
153 /* State of select() or poll(). */
154 int l_selflag; /* S: polling state flags */
155 SLIST_HEAD(,selinfo) l_selwait; /* S: descriptors waited on */
156 int l_selret; /* S: return value of select/poll */
157 uintptr_t l_selrec; /* !: argument for selrecord() */
158 struct selcluster *l_selcluster;/* !: associated cluster data */
159 void * l_selbits; /* (: select() bit-field */
160 size_t l_selni; /* (: size of a single bit-field */
161
162 /* Signals. */
163 int l_sigrestore; /* p: need to restore old sig mask */
164 sigset_t l_sigwaitset; /* p: signals being waited for */
165 kcondvar_t l_sigcv; /* p: for sigsuspend() */
166 struct ksiginfo *l_sigwaited; /* p: delivered signals from set */
167 sigpend_t *l_sigpendset; /* p: XXX issignal()/postsig() baton */
168 LIST_ENTRY(lwp) l_sigwaiter; /* p: chain on list of waiting LWPs */
169 stack_t l_sigstk; /* p: sp & on stack state variable */
170 sigset_t l_sigmask; /* p: signal mask */
171 sigpend_t l_sigpend; /* p: signals to this LWP */
172 sigset_t l_sigoldmask; /* p: mask for sigpause */
173
174 /* Private data. */
175 specificdata_reference
176 l_specdataref; /* !: subsystem lwp-specific data */
177 struct timespec l_ktrcsw; /* !: for ktrace CSW trace XXX */
178 void *l_private; /* !: svr4-style lwp-private data */
179 struct lwp *l_switchto; /* !: mi_switch: switch to this LWP */
180 struct kauth_cred *l_cred; /* !: cached credentials */
181 struct filedesc *l_fd; /* !: cached copy of proc::p_fd */
182 void *l_emuldata; /* !: kernel lwp-private data */
183 u_int l_cv_signalled; /* c: restarted by cv_signal() */
184 u_short l_shlocks; /* !: lockdebug: shared locks held */
185 u_short l_exlocks; /* !: lockdebug: excl. locks held */
186 u_short l_unused; /* !: unused */
187 u_short l_blcnt; /* !: count of kernel_lock held */
188 int l_nopreempt; /* !: don't preempt me! */
189 u_int l_dopreempt; /* s: kernel preemption pending */
190 int l_pflag; /* !: LWP private flags */
191 int l_dupfd; /* !: side return from cloning devs XXX */
192 const struct sysent * volatile l_sysent;/* !: currently active syscall */
193 struct rusage l_ru; /* !: accounting information */
194 uint64_t l_pfailtime; /* !: for kernel preemption */
195 uintptr_t l_pfailaddr; /* !: for kernel preemption */
196 uintptr_t l_pfaillock; /* !: for kernel preemption */
197 _TAILQ_HEAD(,struct lockdebug,volatile) l_ld_locks;/* !: locks held by LWP */
198 int l_tcgen; /* !: for timecounter removal */
199
200 /* These are only used by 'options SYSCALL_TIMES'. */
201 uint32_t l_syscall_time; /* !: time epoch for current syscall */
202 uint64_t *l_syscall_counter; /* !: counter for current process */
203
204 struct kdtrace_thread *l_dtrace; /* (: DTrace-specific data. */
205};
206
/*
 * UAREA_PCB_OFFSET: offset of the PCB structure within the uarea.  MD code
 * may define it in <machine/proc.h> to indicate a different uarea layout;
 * when it has not been defined by this point it defaults to 0.
 */
#if !defined(UAREA_PCB_OFFSET)
#define	UAREA_PCB_OFFSET	0
#endif
214
/* Head type for a list of LWPs. */
LIST_HEAD(lwplist, lwp);

#if defined(_KERNEL)
extern struct lwplist	alllwp;		/* List of all LWPs. */
extern lwp_t		lwp0;		/* LWP for proc0. */
extern int		maxlwp __read_mostly;	/* max number of lwps */

/* Default for MAXLWP; only takes effect if it was not defined earlier. */
#if !defined(MAXLWP)
#define	MAXLWP		2048
#endif

/*
 * Fallback cpu_maxlwp(), used unless __HAVE_CPU_MAXLWP is defined
 * (presumably by MD code that supplies its own version -- verify there).
 */
#if !defined(__HAVE_CPU_MAXLWP)
#define	cpu_maxlwp()	MAXLWP
#endif
#endif /* _KERNEL */
228
229#endif /* _KERNEL || _KMEMUSER */
230
/*
 * Flag bits stored in the l_flag member of struct lwp.
 */
#define	LW_IDLE		0x00000001	/* Idle lwp. */
#define	LW_LWPCTL	0x00000002	/* Adjust lwpctl in userret */
#define	LW_SINTR	0x00000080	/* Sleep is interruptible. */
#define	LW_SYSTEM	0x00000200	/* Kernel thread */
#define	LW_WSUSPEND	0x00020000	/* Suspend before return to user */
#define	LW_BATCH	0x00040000	/* LWP tends to hog CPU */
#define	LW_WCORE	0x00080000	/* Stop for core dump on return to user */
#define	LW_WEXIT	0x00100000	/* Exit before return to user */
#define	LW_PENDSIG	0x01000000	/* Pending signal for us */
#define	LW_CANCELLED	0x02000000	/* tsleep should not sleep */
#define	LW_WREBOOT	0x08000000	/* System is rebooting, please suspend */
#define	LW_UNPARKED	0x10000000	/* Unpark op pending */
#define	LW_RUMP_CLEAR	0x40000000	/* Clear curlwp in RUMP scheduler */
#define	LW_RUMP_QEXIT	0x80000000	/* LWP should exit ASAP */
246
/*
 * Second set of flag bits; these are stored in the l_pflag member.
 */
#define	LP_KTRACTIVE	0x00000001	/* Executing ktrace operation */
#define	LP_KTRCSW	0x00000002	/* ktrace context switch marker */
#define	LP_KTRCSWUSER	0x00000004	/* ktrace context switch marker */
#define	LP_PIDLID	0x00000008	/* free LID from PID space on exit */
#define	LP_OWEUPC	0x00000010	/* Owe user profiling tick */
#define	LP_MPSAFE	0x00000020	/* Starts life without kernel_lock */
#define	LP_INTR		0x00000040	/* Soft interrupt handler */
#define	LP_SYSCTLWRITE	0x00000080	/* sysctl write lock held */
#define	LP_MUSTJOIN	0x00000100	/* Must join kthread on exit */
#define	LP_VFORKWAIT	0x00000200	/* Waiting at vfork() for a child */
#define	LP_TIMEINTR	0x00010000	/* Time this soft interrupt */
#define	LP_RUNNING	0x20000000	/* Active on a CPU */
#define	LP_BOUND	0x80000000	/* Bound to a CPU */
261
/*
 * Third set of flag bits; these are stored in the l_prflag member.
 */
#define	LPR_DETACHED	0x00800000	/* Won't be waited for. */
#define	LPR_CRMOD	0x00000100	/* Credentials modified */
265
/*
 * LW_USERRET: mask of the l_flag bits indicating that there is
 * "exceptional" work to be done on return to user.
 */
#define	LW_USERRET							\
	(LW_WEXIT |							\
	 LW_PENDSIG |							\
	 LW_WREBOOT |							\
	 LW_WSUSPEND |							\
	 LW_WCORE |							\
	 LW_LWPCTL)
272
/*
 * LWP status values.
 *
 * LSRUN versus LSONPROC: LSRUN means the process is runnable but *not*
 * yet running, i.e. it is on a run queue.  LSONPROC means the process is
 * actually executing on a CPU, i.e. it is no longer on a run queue.
 */
#define	LSIDL		1	/* Process being created by fork. */
#define	LSRUN		2	/* Currently runnable. */
#define	LSSLEEP		3	/* Sleeping on an address. */
#define	LSSTOP		4	/* Process debugging or suspension. */
#define	LSZOMB		5	/* Awaiting collection by parent. */
/* unused, for source compatibility with NetBSD 4.0 and earlier. */
#define	LSDEAD		6	/* Process is almost a zombie. */
#define	LSONPROC	7	/* Process is currently on a CPU. */
#define	LSSUSPENDED	8	/* Not running, not signalable. */
290
291#if defined(_KERNEL) || defined(_KMEMUSER)
292static inline void *
293lwp_getpcb(struct lwp *l)
294{
295
296 return l->l_addr;
297}
298#endif /* _KERNEL || _KMEMUSER */
299
300#ifdef _KERNEL
/*
 * LWP_CACHE_CREDS: call lwp_update_creds(l) when LPR_CRMOD is set in
 * (l)->l_prflag.  The branch is marked unlikely.
 *
 * 'p' is evaluated once and its value discarded.  It is parenthesized so
 * that any expression (e.g. a conditional) can be passed safely.
 */
#define	LWP_CACHE_CREDS(l, p)						\
do {									\
	(void)(p);							\
	if (__predict_false((l)->l_prflag & LPR_CRMOD))			\
		lwp_update_creds(l);					\
} while (/* CONSTCOND */ 0)
307
308void lwpinit(void);
309void lwp0_init(void);
310void lwp_sys_init(void);
311
312void lwp_startup(lwp_t *, lwp_t *);
313void startlwp(void *);
314
315int lwp_locked(lwp_t *, kmutex_t *);
316void lwp_setlock(lwp_t *, kmutex_t *);
317void lwp_unlock_to(lwp_t *, kmutex_t *);
318int lwp_trylock(lwp_t *);
319void lwp_addref(lwp_t *);
320void lwp_delref(lwp_t *);
321void lwp_delref2(lwp_t *);
322void lwp_drainrefs(lwp_t *);
323bool lwp_alive(lwp_t *);
324lwp_t *lwp_find_first(proc_t *);
325
326int lwp_wait(lwp_t *, lwpid_t, lwpid_t *, bool);
327void lwp_continue(lwp_t *);
328void lwp_unsleep(lwp_t *, bool);
329void lwp_unstop(lwp_t *);
330void lwp_exit(lwp_t *);
331void lwp_exit_switchaway(lwp_t *) __dead;
332int lwp_suspend(lwp_t *, lwp_t *);
333int lwp_create1(lwp_t *, const void *, size_t, u_long, lwpid_t *);
334void lwp_update_creds(lwp_t *);
335void lwp_migrate(lwp_t *, struct cpu_info *);
336lwp_t * lwp_find2(pid_t, lwpid_t);
337lwp_t * lwp_find(proc_t *, int);
338void lwp_userret(lwp_t *);
339void lwp_need_userret(lwp_t *);
340void lwp_free(lwp_t *, bool, bool);
341uint64_t lwp_pctr(void);
342int lwp_setprivate(lwp_t *, void *);
343int do_lwp_create(lwp_t *, void *, u_long, lwpid_t *);
344
345void lwpinit_specificdata(void);
346int lwp_specific_key_create(specificdata_key_t *, specificdata_dtor_t);
347void lwp_specific_key_delete(specificdata_key_t);
348void lwp_initspecific(lwp_t *);
349void lwp_finispecific(lwp_t *);
350void *lwp_getspecific(specificdata_key_t);
351#if defined(_LWP_API_PRIVATE)
352void *_lwp_getspecific_by_lwp(lwp_t *, specificdata_key_t);
353#endif
354void lwp_setspecific(specificdata_key_t, void *);
355
356/* Syscalls. */
357int lwp_park(clockid_t, int, struct timespec *, const void *);
358int lwp_unpark(lwpid_t, const void *);
359
360/* DDB. */
361void lwp_whatis(uintptr_t, void (*)(const char *, ...) __printflike(1, 2));
362
363/*
364 * Lock an LWP. XXX _MODULE
365 */
366static inline void
367lwp_lock(lwp_t *l)
368{
369 kmutex_t *old = l->l_mutex;
370
371 /*
372 * Note: mutex_spin_enter() will have posted a read barrier.
373 * Re-test l->l_mutex. If it has changed, we need to try again.
374 */
375 mutex_spin_enter(old);
376 while (__predict_false(l->l_mutex != old)) {
377 mutex_spin_exit(old);
378 old = l->l_mutex;
379 mutex_spin_enter(old);
380 }
381}
382
383/*
384 * Unlock an LWP. XXX _MODULE
385 */
386static inline void
387lwp_unlock(lwp_t *l)
388{
389 mutex_spin_exit(l->l_mutex);
390}
391
392static inline void
393lwp_changepri(lwp_t *l, pri_t pri)
394{
395 KASSERT(mutex_owned(l->l_mutex));
396
397 if (l->l_priority == pri)
398 return;
399
400 (*l->l_syncobj->sobj_changepri)(l, pri);
401 KASSERT(l->l_priority == pri);
402}
403
404static inline void
405lwp_lendpri(lwp_t *l, pri_t pri)
406{
407 KASSERT(mutex_owned(l->l_mutex));
408
409 (*l->l_syncobj->sobj_lendpri)(l, pri);
410 KASSERT(l->l_inheritedprio == pri);
411}
412
413static inline pri_t
414lwp_eprio(lwp_t *l)
415{
416 pri_t pri;
417
418 pri = l->l_priority;
419 if ((l->l_flag & LW_SYSTEM) == 0 && l->l_kpriority && pri < PRI_KERNEL)
420 pri = (pri >> 1) + l->l_kpribase;
421 return MAX(l->l_auxprio, pri);
422}
423
424int lwp_create(lwp_t *, struct proc *, vaddr_t, int,
425 void *, size_t, void (*)(void *), void *, lwp_t **, int);
426
427/*
428 * XXX _MODULE
429 * We should provide real stubs for the below that modules can use.
430 */
431
432static inline void
433spc_lock(struct cpu_info *ci)
434{
435 mutex_spin_enter(ci->ci_schedstate.spc_mutex);
436}
437
438static inline void
439spc_unlock(struct cpu_info *ci)
440{
441 mutex_spin_exit(ci->ci_schedstate.spc_mutex);
442}
443
444static inline void
445spc_dlock(struct cpu_info *ci1, struct cpu_info *ci2)
446{
447 struct schedstate_percpu *spc1 = &ci1->ci_schedstate;
448 struct schedstate_percpu *spc2 = &ci2->ci_schedstate;
449
450 KASSERT(ci1 != ci2);
451 if (ci1 < ci2) {
452 mutex_spin_enter(spc1->spc_mutex);
453 mutex_spin_enter(spc2->spc_mutex);
454 } else {
455 mutex_spin_enter(spc2->spc_mutex);
456 mutex_spin_enter(spc1->spc_mutex);
457 }
458}
459
/*
 * Machine-dependent code may define curlwp itself in <machine/cpu.h> for
 * its own convenience.  When it has not, curlwp is provided here: as the
 * ci_curlwp member of curcpu() on MULTIPROCESSOR kernels, and as an
 * external variable otherwise.  curproc is always derived from curlwp.
 */
#ifndef curlwp
#ifdef MULTIPROCESSOR
#define	curlwp		curcpu()->ci_curlwp	/* Current running LWP */
#else
extern struct lwp	*curlwp;		/* Current running LWP */
#endif /* MULTIPROCESSOR */
#endif /* ! curlwp */
#define	curproc		(curlwp->l_proc)
472
473/*
474 * This provide a way for <machine/cpu.h> to get l_cpu for curlwp before
475 * struct lwp is defined.
476 */
477static inline struct cpu_info *
478lwp_getcpu(struct lwp *l)
479{
480 return l->l_cpu;
481}
482
483static inline bool
484CURCPU_IDLE_P(void)
485{
486 struct cpu_info *ci = curcpu();
487 return ci->ci_data.cpu_onproc == ci->ci_data.cpu_idlelwp;
488}
489
490/*
491 * Disable and re-enable preemption. Only for low-level kernel
492 * use. Device drivers and anything that could potentially be
493 * compiled as a module should use kpreempt_disable() and
494 * kpreempt_enable().
495 */
496static inline void
497KPREEMPT_DISABLE(lwp_t *l)
498{
499
500 KASSERT(l == curlwp);
501 l->l_nopreempt++;
502 __insn_barrier();
503}
504
505static inline void
506KPREEMPT_ENABLE(lwp_t *l)
507{
508
509 KASSERT(l == curlwp);
510 KASSERT(l->l_nopreempt > 0);
511 __insn_barrier();
512 if (--l->l_nopreempt != 0)
513 return;
514 __insn_barrier();
515 if (__predict_false(l->l_dopreempt))
516 kpreempt(0);
517 __insn_barrier();
518}
519
/* Bit values for lwp::l_dopreempt. */
#define	DOPREEMPT_ACTIVE	0x01
#define	DOPREEMPT_COUNTED	0x02
523
524/*
525 * Prevent curlwp from migrating between CPUs beteen curlwp_bind and
526 * curlwp_bindx. One use case is psref(9) that has a contract that
527 * forbids migrations.
528 */
529static inline int
530curlwp_bind(void)
531{
532 int bound;
533
534 bound = curlwp->l_pflag & LP_BOUND;
535 curlwp->l_pflag |= LP_BOUND;
536
537 return bound;
538}
539
540static inline void
541curlwp_bindx(int bound)
542{
543 curlwp->l_pflag ^= bound ^ LP_BOUND;
544}
545
546#endif /* _KERNEL */
547
/* Flags for _lwp_create(), as per Solaris. */
#define	LWP_DETACHED	0x00000040
#define	LWP_SUSPENDED	0x00000080

/* Kernel-internal flags for LWP creation. */
#define	LWP_PIDLID	0x40000000
#define	LWP_VFORK	0x80000000
555
556#endif /* !_SYS_LWP_H_ */
557