1 | /* $NetBSD: kern_turnstile.c,v 1.32 2012/06/15 13:51:40 yamt Exp $ */ |
2 | |
3 | /*- |
4 | * Copyright (c) 2002, 2006, 2007, 2009 The NetBSD Foundation, Inc. |
5 | * All rights reserved. |
6 | * |
7 | * This code is derived from software contributed to The NetBSD Foundation |
8 | * by Jason R. Thorpe and Andrew Doran. |
9 | * |
10 | * Redistribution and use in source and binary forms, with or without |
11 | * modification, are permitted provided that the following conditions |
12 | * are met: |
13 | * 1. Redistributions of source code must retain the above copyright |
14 | * notice, this list of conditions and the following disclaimer. |
15 | * 2. Redistributions in binary form must reproduce the above copyright |
16 | * notice, this list of conditions and the following disclaimer in the |
17 | * documentation and/or other materials provided with the distribution. |
18 | * |
19 | * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS |
20 | * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED |
21 | * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR |
22 | * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS |
23 | * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR |
24 | * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF |
25 | * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS |
26 | * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN |
27 | * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) |
28 | * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE |
29 | * POSSIBILITY OF SUCH DAMAGE. |
30 | */ |
31 | |
32 | /* |
33 | * Turnstiles are described in detail in: |
34 | * |
35 | * Solaris Internals: Core Kernel Architecture, Jim Mauro and |
36 | * Richard McDougall. |
37 | * |
38 | * Turnstiles are kept in a hash table. There are likely to be many more |
39 | * synchronisation objects than there are threads. Since a thread can block |
40 | * on only one lock at a time, we only need one turnstile per thread, and |
41 | * so they are allocated at thread creation time. |
42 | * |
43 | * When a thread decides it needs to block on a lock, it looks up the |
44 | * active turnstile for that lock. If no active turnstile exists, then |
45 | * the process lends its turnstile to the lock. If there is already an |
46 | * active turnstile for the lock, the thread places its turnstile on a |
47 | * list of free turnstiles, and references the active one instead. |
48 | * |
49 | * The act of looking up the turnstile acquires an interlock on the sleep |
50 | * queue. If a thread decides it doesn't need to block after all, then this |
51 | * interlock must be released by explicitly aborting the turnstile |
52 | * operation. |
53 | * |
54 | * When a thread is awakened, it needs to get its turnstile back. If there |
55 | * are still other threads waiting in the active turnstile, the thread |
56 | * grabs a free turnstile off the free list. Otherwise, it can take back |
57 | * the active turnstile from the lock (thus deactivating the turnstile). |
58 | * |
59 | * Turnstiles are the place to do priority inheritence. |
60 | */ |
61 | |
62 | #include <sys/cdefs.h> |
63 | __KERNEL_RCSID(0, "$NetBSD: kern_turnstile.c,v 1.32 2012/06/15 13:51:40 yamt Exp $" ); |
64 | |
65 | #include <sys/param.h> |
66 | #include <sys/lockdebug.h> |
67 | #include <sys/pool.h> |
68 | #include <sys/proc.h> |
69 | #include <sys/sleepq.h> |
70 | #include <sys/systm.h> |
71 | |
72 | #define TS_HASH_SIZE 64 |
73 | #define TS_HASH_MASK (TS_HASH_SIZE - 1) |
74 | #define TS_HASH(obj) (((uintptr_t)(obj) >> 3) & TS_HASH_MASK) |
75 | |
76 | static tschain_t turnstile_tab[TS_HASH_SIZE] __cacheline_aligned; |
77 | pool_cache_t turnstile_cache __read_mostly; |
78 | |
79 | static int turnstile_ctor(void *, void *, int); |
80 | |
81 | extern turnstile_t turnstile0; |
82 | |
83 | /* |
84 | * turnstile_init: |
85 | * |
86 | * Initialize the turnstile mechanism. |
87 | */ |
88 | void |
89 | turnstile_init(void) |
90 | { |
91 | tschain_t *tc; |
92 | int i; |
93 | |
94 | for (i = 0; i < TS_HASH_SIZE; i++) { |
95 | tc = &turnstile_tab[i]; |
96 | LIST_INIT(&tc->tc_chain); |
97 | tc->tc_mutex = mutex_obj_alloc(MUTEX_DEFAULT, IPL_SCHED); |
98 | } |
99 | |
100 | turnstile_cache = pool_cache_init(sizeof(turnstile_t), 0, 0, 0, |
101 | "tstilepl" , NULL, IPL_NONE, turnstile_ctor, NULL, NULL); |
102 | KASSERT(turnstile_cache != NULL); |
103 | |
104 | (void)turnstile_ctor(NULL, &turnstile0, 0); |
105 | } |
106 | |
107 | /* |
108 | * turnstile_ctor: |
109 | * |
110 | * Constructor for turnstiles. |
111 | */ |
112 | static int |
113 | turnstile_ctor(void *arg, void *obj, int flags) |
114 | { |
115 | turnstile_t *ts = obj; |
116 | |
117 | memset(ts, 0, sizeof(*ts)); |
118 | sleepq_init(&ts->ts_sleepq[TS_READER_Q]); |
119 | sleepq_init(&ts->ts_sleepq[TS_WRITER_Q]); |
120 | return (0); |
121 | } |
122 | |
123 | /* |
124 | * turnstile_remove: |
125 | * |
126 | * Remove an LWP from a turnstile sleep queue and wake it. |
127 | */ |
128 | static inline void |
129 | turnstile_remove(turnstile_t *ts, lwp_t *l, int q) |
130 | { |
131 | turnstile_t *nts; |
132 | |
133 | KASSERT(l->l_ts == ts); |
134 | |
135 | /* |
136 | * This process is no longer using the active turnstile. |
137 | * Find an inactive one on the free list to give to it. |
138 | */ |
139 | if ((nts = ts->ts_free) != NULL) { |
140 | KASSERT(TS_ALL_WAITERS(ts) > 1); |
141 | l->l_ts = nts; |
142 | ts->ts_free = nts->ts_free; |
143 | nts->ts_free = NULL; |
144 | } else { |
145 | /* |
146 | * If the free list is empty, this is the last |
147 | * waiter. |
148 | */ |
149 | KASSERT(TS_ALL_WAITERS(ts) == 1); |
150 | LIST_REMOVE(ts, ts_chain); |
151 | } |
152 | |
153 | ts->ts_waiters[q]--; |
154 | sleepq_remove(&ts->ts_sleepq[q], l); |
155 | } |
156 | |
157 | /* |
158 | * turnstile_lookup: |
159 | * |
160 | * Look up the turnstile for the specified lock. This acquires and |
161 | * holds the turnstile chain lock (sleep queue interlock). |
162 | */ |
163 | turnstile_t * |
164 | turnstile_lookup(wchan_t obj) |
165 | { |
166 | turnstile_t *ts; |
167 | tschain_t *tc; |
168 | |
169 | tc = &turnstile_tab[TS_HASH(obj)]; |
170 | mutex_spin_enter(tc->tc_mutex); |
171 | |
172 | LIST_FOREACH(ts, &tc->tc_chain, ts_chain) |
173 | if (ts->ts_obj == obj) |
174 | return (ts); |
175 | |
176 | /* |
177 | * No turnstile yet for this lock. No problem, turnstile_block() |
178 | * handles this by fetching the turnstile from the blocking thread. |
179 | */ |
180 | return (NULL); |
181 | } |
182 | |
183 | /* |
184 | * turnstile_exit: |
185 | * |
186 | * Abort a turnstile operation. |
187 | */ |
188 | void |
189 | turnstile_exit(wchan_t obj) |
190 | { |
191 | tschain_t *tc; |
192 | |
193 | tc = &turnstile_tab[TS_HASH(obj)]; |
194 | mutex_spin_exit(tc->tc_mutex); |
195 | } |
196 | |
197 | /* |
198 | * turnstile_lendpri: |
199 | * |
200 | * Lend our priority to lwps on the blocking chain. |
201 | * |
202 | * If the current owner of the lock (l->l_wchan, set by sleepq_enqueue) |
203 | * has a priority lower than ours (lwp_eprio(l)), lend our priority to |
204 | * him to avoid priority inversions. |
205 | */ |
206 | |
207 | static void |
208 | turnstile_lendpri(lwp_t *cur) |
209 | { |
210 | lwp_t * l = cur; |
211 | pri_t prio; |
212 | |
213 | /* |
214 | * NOTE: if you get a panic in this code block, it is likely that |
215 | * a lock has been destroyed or corrupted while still in use. Try |
216 | * compiling a kernel with LOCKDEBUG to pinpoint the problem. |
217 | */ |
218 | |
219 | LOCKDEBUG_BARRIER(l->l_mutex, 1); |
220 | KASSERT(l == curlwp); |
221 | prio = lwp_eprio(l); |
222 | for (;;) { |
223 | lwp_t *owner; |
224 | turnstile_t *ts; |
225 | bool dolock; |
226 | |
227 | if (l->l_wchan == NULL) |
228 | break; |
229 | |
230 | /* |
231 | * Ask syncobj the owner of the lock. |
232 | */ |
233 | owner = (*l->l_syncobj->sobj_owner)(l->l_wchan); |
234 | if (owner == NULL) |
235 | break; |
236 | |
237 | /* |
238 | * The owner may have changed as we have dropped the tc lock. |
239 | */ |
240 | if (cur == owner) { |
241 | /* |
242 | * We own the lock: stop here, sleepq_block() |
243 | * should wake up immediatly. |
244 | */ |
245 | break; |
246 | } |
247 | /* |
248 | * Acquire owner->l_mutex if we don't have it yet. |
249 | * Because we already have another LWP lock (l->l_mutex) held, |
250 | * we need to play a try lock dance to avoid deadlock. |
251 | */ |
252 | dolock = l->l_mutex != owner->l_mutex; |
253 | if (l == owner || (dolock && !lwp_trylock(owner))) { |
254 | /* |
255 | * The owner was changed behind us or trylock failed. |
256 | * Restart from curlwp. |
257 | * |
258 | * Note that there may be a livelock here: |
259 | * the owner may try grabing cur's lock (which is the |
260 | * tc lock) while we're trying to grab the owner's lock. |
261 | */ |
262 | lwp_unlock(l); |
263 | l = cur; |
264 | lwp_lock(l); |
265 | prio = lwp_eprio(l); |
266 | continue; |
267 | } |
268 | /* |
269 | * If the owner's priority is already higher than ours, |
270 | * there's nothing to do anymore. |
271 | */ |
272 | if (prio <= lwp_eprio(owner)) { |
273 | if (dolock) |
274 | lwp_unlock(owner); |
275 | break; |
276 | } |
277 | /* |
278 | * Lend our priority to the 'owner' LWP. |
279 | * |
280 | * Update lenders info for turnstile_unlendpri. |
281 | */ |
282 | ts = l->l_ts; |
283 | KASSERT(ts->ts_inheritor == owner || ts->ts_inheritor == NULL); |
284 | if (ts->ts_inheritor == NULL) { |
285 | ts->ts_inheritor = owner; |
286 | ts->ts_eprio = prio; |
287 | SLIST_INSERT_HEAD(&owner->l_pi_lenders, ts, ts_pichain); |
288 | lwp_lendpri(owner, prio); |
289 | } else if (prio > ts->ts_eprio) { |
290 | ts->ts_eprio = prio; |
291 | lwp_lendpri(owner, prio); |
292 | } |
293 | if (dolock) |
294 | lwp_unlock(l); |
295 | LOCKDEBUG_BARRIER(owner->l_mutex, 1); |
296 | l = owner; |
297 | } |
298 | LOCKDEBUG_BARRIER(l->l_mutex, 1); |
299 | if (cur->l_mutex != l->l_mutex) { |
300 | lwp_unlock(l); |
301 | lwp_lock(cur); |
302 | } |
303 | LOCKDEBUG_BARRIER(cur->l_mutex, 1); |
304 | } |
305 | |
306 | /* |
307 | * turnstile_unlendpri: undo turnstile_lendpri |
308 | */ |
309 | |
310 | static void |
311 | turnstile_unlendpri(turnstile_t *ts) |
312 | { |
313 | lwp_t * const l = curlwp; |
314 | turnstile_t *iter; |
315 | turnstile_t *next; |
316 | turnstile_t *prev = NULL; |
317 | pri_t prio; |
318 | bool dolock; |
319 | |
320 | KASSERT(ts->ts_inheritor != NULL); |
321 | ts->ts_inheritor = NULL; |
322 | dolock = l->l_mutex == l->l_cpu->ci_schedstate.spc_lwplock; |
323 | if (dolock) { |
324 | lwp_lock(l); |
325 | } |
326 | |
327 | /* |
328 | * the following loop does two things. |
329 | * |
330 | * - remove ts from the list. |
331 | * |
332 | * - from the rest of the list, find the highest priority. |
333 | */ |
334 | |
335 | prio = -1; |
336 | KASSERT(!SLIST_EMPTY(&l->l_pi_lenders)); |
337 | for (iter = SLIST_FIRST(&l->l_pi_lenders); |
338 | iter != NULL; iter = next) { |
339 | KASSERT(lwp_eprio(l) >= ts->ts_eprio); |
340 | next = SLIST_NEXT(iter, ts_pichain); |
341 | if (iter == ts) { |
342 | if (prev == NULL) { |
343 | SLIST_REMOVE_HEAD(&l->l_pi_lenders, |
344 | ts_pichain); |
345 | } else { |
346 | SLIST_REMOVE_AFTER(prev, ts_pichain); |
347 | } |
348 | } else if (prio < iter->ts_eprio) { |
349 | prio = iter->ts_eprio; |
350 | } |
351 | prev = iter; |
352 | } |
353 | |
354 | lwp_lendpri(l, prio); |
355 | |
356 | if (dolock) { |
357 | lwp_unlock(l); |
358 | } |
359 | } |
360 | |
361 | /* |
362 | * turnstile_block: |
363 | * |
364 | * Enter an object into the turnstile chain and prepare the current |
365 | * LWP for sleep. |
366 | */ |
367 | void |
368 | turnstile_block(turnstile_t *ts, int q, wchan_t obj, syncobj_t *sobj) |
369 | { |
370 | lwp_t * const l = curlwp; /* cached curlwp */ |
371 | turnstile_t *ots; |
372 | tschain_t *tc; |
373 | sleepq_t *sq; |
374 | pri_t obase; |
375 | |
376 | tc = &turnstile_tab[TS_HASH(obj)]; |
377 | |
378 | KASSERT(q == TS_READER_Q || q == TS_WRITER_Q); |
379 | KASSERT(mutex_owned(tc->tc_mutex)); |
380 | KASSERT(l != NULL && l->l_ts != NULL); |
381 | |
382 | if (ts == NULL) { |
383 | /* |
384 | * We are the first thread to wait for this object; |
385 | * lend our turnstile to it. |
386 | */ |
387 | ts = l->l_ts; |
388 | KASSERT(TS_ALL_WAITERS(ts) == 0); |
389 | KASSERT(TAILQ_EMPTY(&ts->ts_sleepq[TS_READER_Q]) && |
390 | TAILQ_EMPTY(&ts->ts_sleepq[TS_WRITER_Q])); |
391 | ts->ts_obj = obj; |
392 | ts->ts_inheritor = NULL; |
393 | LIST_INSERT_HEAD(&tc->tc_chain, ts, ts_chain); |
394 | } else { |
395 | /* |
396 | * Object already has a turnstile. Put our turnstile |
397 | * onto the free list, and reference the existing |
398 | * turnstile instead. |
399 | */ |
400 | ots = l->l_ts; |
401 | KASSERT(ots->ts_free == NULL); |
402 | ots->ts_free = ts->ts_free; |
403 | ts->ts_free = ots; |
404 | l->l_ts = ts; |
405 | |
406 | KASSERT(ts->ts_obj == obj); |
407 | KASSERT(TS_ALL_WAITERS(ts) != 0); |
408 | KASSERT(!TAILQ_EMPTY(&ts->ts_sleepq[TS_READER_Q]) || |
409 | !TAILQ_EMPTY(&ts->ts_sleepq[TS_WRITER_Q])); |
410 | } |
411 | |
412 | sq = &ts->ts_sleepq[q]; |
413 | ts->ts_waiters[q]++; |
414 | sleepq_enter(sq, l, tc->tc_mutex); |
415 | LOCKDEBUG_BARRIER(tc->tc_mutex, 1); |
416 | l->l_kpriority = true; |
417 | obase = l->l_kpribase; |
418 | if (obase < PRI_KTHREAD) |
419 | l->l_kpribase = PRI_KTHREAD; |
420 | sleepq_enqueue(sq, obj, "tstile" , sobj); |
421 | |
422 | /* |
423 | * Disable preemption across this entire block, as we may drop |
424 | * scheduler locks (allowing preemption), and would prefer not |
425 | * to be interrupted while in a state of flux. |
426 | */ |
427 | KPREEMPT_DISABLE(l); |
428 | KASSERT(tc->tc_mutex == l->l_mutex); |
429 | turnstile_lendpri(l); |
430 | sleepq_block(0, false); |
431 | l->l_kpribase = obase; |
432 | KPREEMPT_ENABLE(l); |
433 | } |
434 | |
435 | /* |
436 | * turnstile_wakeup: |
437 | * |
438 | * Wake up the specified number of threads that are blocked |
439 | * in a turnstile. |
440 | */ |
441 | void |
442 | turnstile_wakeup(turnstile_t *ts, int q, int count, lwp_t *nl) |
443 | { |
444 | sleepq_t *sq; |
445 | tschain_t *tc; |
446 | lwp_t *l; |
447 | |
448 | tc = &turnstile_tab[TS_HASH(ts->ts_obj)]; |
449 | sq = &ts->ts_sleepq[q]; |
450 | |
451 | KASSERT(q == TS_READER_Q || q == TS_WRITER_Q); |
452 | KASSERT(count > 0 && count <= TS_WAITERS(ts, q)); |
453 | KASSERT(mutex_owned(tc->tc_mutex)); |
454 | KASSERT(ts->ts_inheritor == curlwp || ts->ts_inheritor == NULL); |
455 | |
456 | /* |
457 | * restore inherited priority if necessary. |
458 | */ |
459 | |
460 | if (ts->ts_inheritor != NULL) { |
461 | turnstile_unlendpri(ts); |
462 | } |
463 | |
464 | if (nl != NULL) { |
465 | #if defined(DEBUG) || defined(LOCKDEBUG) |
466 | TAILQ_FOREACH(l, sq, l_sleepchain) { |
467 | if (l == nl) |
468 | break; |
469 | } |
470 | if (l == NULL) |
471 | panic("turnstile_wakeup: nl not on sleepq" ); |
472 | #endif |
473 | turnstile_remove(ts, nl, q); |
474 | } else { |
475 | while (count-- > 0) { |
476 | l = TAILQ_FIRST(sq); |
477 | KASSERT(l != NULL); |
478 | turnstile_remove(ts, l, q); |
479 | } |
480 | } |
481 | mutex_spin_exit(tc->tc_mutex); |
482 | } |
483 | |
484 | /* |
485 | * turnstile_unsleep: |
486 | * |
487 | * Remove an LWP from the turnstile. This is called when the LWP has |
488 | * not been awoken normally but instead interrupted: for example, if it |
489 | * has received a signal. It's not a valid action for turnstiles, |
490 | * since LWPs blocking on a turnstile are not interruptable. |
491 | */ |
492 | void |
493 | turnstile_unsleep(lwp_t *l, bool cleanup) |
494 | { |
495 | |
496 | lwp_unlock(l); |
497 | panic("turnstile_unsleep" ); |
498 | } |
499 | |
500 | /* |
501 | * turnstile_changepri: |
502 | * |
503 | * Adjust the priority of an LWP residing on a turnstile. |
504 | */ |
505 | void |
506 | turnstile_changepri(lwp_t *l, pri_t pri) |
507 | { |
508 | |
509 | /* XXX priority inheritance */ |
510 | sleepq_changepri(l, pri); |
511 | } |
512 | |
513 | #if defined(LOCKDEBUG) |
514 | /* |
515 | * turnstile_print: |
516 | * |
517 | * Given the address of a lock object, print the contents of a |
518 | * turnstile. |
519 | */ |
520 | void |
521 | turnstile_print(volatile void *obj, void (*pr)(const char *, ...)) |
522 | { |
523 | turnstile_t *ts; |
524 | tschain_t *tc; |
525 | sleepq_t *rsq, *wsq; |
526 | lwp_t *l; |
527 | |
528 | tc = &turnstile_tab[TS_HASH(obj)]; |
529 | |
530 | LIST_FOREACH(ts, &tc->tc_chain, ts_chain) |
531 | if (ts->ts_obj == obj) |
532 | break; |
533 | |
534 | (*pr)("Turnstile chain at %p.\n" , tc); |
535 | if (ts == NULL) { |
536 | (*pr)("=> No active turnstile for this lock.\n" ); |
537 | return; |
538 | } |
539 | |
540 | rsq = &ts->ts_sleepq[TS_READER_Q]; |
541 | wsq = &ts->ts_sleepq[TS_WRITER_Q]; |
542 | |
543 | (*pr)("=> Turnstile at %p (wrq=%p, rdq=%p).\n" , ts, rsq, wsq); |
544 | |
545 | (*pr)("=> %d waiting readers:" , TS_WAITERS(ts, TS_READER_Q)); |
546 | TAILQ_FOREACH(l, rsq, l_sleepchain) { |
547 | (*pr)(" %p" , l); |
548 | } |
549 | (*pr)("\n" ); |
550 | |
551 | (*pr)("=> %d waiting writers:" , TS_WAITERS(ts, TS_WRITER_Q)); |
552 | TAILQ_FOREACH(l, wsq, l_sleepchain) { |
553 | (*pr)(" %p" , l); |
554 | } |
555 | (*pr)("\n" ); |
556 | } |
557 | #endif /* LOCKDEBUG */ |
558 | |