/*	$NetBSD: kern_runq.c,v 1.45 2016/07/07 06:55:43 msaitoh Exp $	*/

/*
 * Copyright (c) 2007, 2008 Mindaugas Rasiukevicius <rmind at NetBSD org>
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: kern_runq.c,v 1.45 2016/07/07 06:55:43 msaitoh Exp $");

#include "opt_dtrace.h"

#include <sys/param.h>
#include <sys/kernel.h>
#include <sys/bitops.h>
#include <sys/cpu.h>
#include <sys/idle.h>
#include <sys/intr.h>
#include <sys/kmem.h>
#include <sys/lwp.h>
#include <sys/mutex.h>
#include <sys/proc.h>
#include <sys/sched.h>
#include <sys/syscallargs.h>
#include <sys/sysctl.h>
#include <sys/systm.h>
#include <sys/types.h>
#include <sys/evcnt.h>

/*
 * Priority related definitions.
 */
#define PRI_TS_COUNT	(NPRI_USER)
#define PRI_RT_COUNT	(PRI_COUNT - PRI_TS_COUNT)
#define PRI_HTS_RANGE	(PRI_TS_COUNT / 10)

#define PRI_HIGHEST_TS	(MAXPRI_USER)

/*
 * Bits per map.
 */
#define BITMAP_BITS	(32)
#define BITMAP_SHIFT	(5)
#define BITMAP_MSB	(0x80000000U)
#define BITMAP_MASK	(BITMAP_BITS - 1)
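/*
 * One bit per priority level, most significant bit first: priority p
 * lives in word (p >> BITMAP_SHIFT), at bit (BITMAP_MSB >> (p & BITMAP_MASK)).
 */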

/*
 * Structures, runqueue.
 */

const int schedppq = 1;

typedef struct {
	TAILQ_HEAD(, lwp) q_head;
} queue_t;

typedef struct {
	/* Bitmap */
	uint32_t r_bitmap[PRI_COUNT >> BITMAP_SHIFT];
	/* Counters */
	u_int r_count;		/* Count of the threads */
	u_int r_avgcount;	/* Average count of threads */
	u_int r_mcount;		/* Count of migratable threads */
	/* Runqueues */
	queue_t r_rt_queue[PRI_RT_COUNT];
	queue_t r_ts_queue[PRI_TS_COUNT];
	/* Event counters */
	struct evcnt r_ev_pull;
	struct evcnt r_ev_push;
	struct evcnt r_ev_stay;
	struct evcnt r_ev_localize;
} runqueue_t;

static void *	sched_getrq(runqueue_t *, const pri_t);
#ifdef MULTIPROCESSOR
static lwp_t *	sched_catchlwp(struct cpu_info *);
static void	sched_balance(void *);
#endif

/*
 * Preemption control.
 */
int sched_upreempt_pri = 0;
#ifdef __HAVE_PREEMPTION
# ifdef DEBUG
int sched_kpreempt_pri = 0;
# else
int sched_kpreempt_pri = PRI_USER_RT;
# endif
#else
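/* No kernel preemption support: use a value above any valid priority. */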
int sched_kpreempt_pri = 1000;
#endif

/*
 * Migration and balancing.
 */
static u_int cacheht_time;		/* Cache hotness time */
static u_int min_catch;			/* Minimal LWP count for catching */
static u_int balance_period;		/* Balance period */
static struct cpu_info *worker_ci;	/* Victim CPU */
#ifdef MULTIPROCESSOR
static struct callout balance_ch;	/* Callout of balancer */
#endif

#ifdef KDTRACE_HOOKS
struct lwp *curthread;
#endif

void
runq_init(void)
{

	/* Balancing */
	worker_ci = curcpu();
	cacheht_time = mstohz(3);	/* ~3 ms */
	balance_period = mstohz(300);	/* ~300 ms */

	/* Minimal count of LWPs for catching */
	min_catch = 1;

	/* Initialize balancing callout and run it */
#ifdef MULTIPROCESSOR
	callout_init(&balance_ch, CALLOUT_MPSAFE);
	callout_setfunc(&balance_ch, sched_balance, NULL);
	callout_schedule(&balance_ch, balance_period);
#endif
}

void
sched_cpuattach(struct cpu_info *ci)
{
	runqueue_t *ci_rq;
	void *rq_ptr;
	u_int i, size;

	if (ci->ci_schedstate.spc_lwplock == NULL) {
		ci->ci_schedstate.spc_lwplock =
		    mutex_obj_alloc(MUTEX_DEFAULT, IPL_SCHED);
	}
	if (ci == lwp0.l_cpu) {
		/* Initialize the scheduler structure of the primary LWP */
		lwp0.l_mutex = ci->ci_schedstate.spc_lwplock;
	}
	if (ci->ci_schedstate.spc_mutex != NULL) {
		/* Already initialized. */
		return;
	}

	/* Allocate the run queue */
	size = roundup2(sizeof(runqueue_t), coherency_unit) + coherency_unit;
	rq_ptr = kmem_zalloc(size, KM_SLEEP);
	if (rq_ptr == NULL) {
		panic("sched_cpuattach: could not allocate the runqueue");
	}
	ci_rq = (void *)(roundup2((uintptr_t)(rq_ptr), coherency_unit));

	/* Initialize run queues */
	ci->ci_schedstate.spc_mutex =
	    mutex_obj_alloc(MUTEX_DEFAULT, IPL_SCHED);
	for (i = 0; i < PRI_RT_COUNT; i++)
		TAILQ_INIT(&ci_rq->r_rt_queue[i].q_head);
	for (i = 0; i < PRI_TS_COUNT; i++)
		TAILQ_INIT(&ci_rq->r_ts_queue[i].q_head);

	ci->ci_schedstate.spc_sched_info = ci_rq;

	evcnt_attach_dynamic(&ci_rq->r_ev_pull, EVCNT_TYPE_MISC, NULL,
	    cpu_name(ci), "runqueue pull");
	evcnt_attach_dynamic(&ci_rq->r_ev_push, EVCNT_TYPE_MISC, NULL,
	    cpu_name(ci), "runqueue push");
	evcnt_attach_dynamic(&ci_rq->r_ev_stay, EVCNT_TYPE_MISC, NULL,
	    cpu_name(ci), "runqueue stay");
	evcnt_attach_dynamic(&ci_rq->r_ev_localize, EVCNT_TYPE_MISC, NULL,
	    cpu_name(ci), "runqueue localize");
}

/*
 * Control of the runqueue.
 */

static inline void *
sched_getrq(runqueue_t *ci_rq, const pri_t prio)
{

	KASSERT(prio < PRI_COUNT);
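	/*
	 * Time-sharing priorities (up to PRI_HIGHEST_TS) index r_ts_queue
	 * directly; anything higher is real-time and indexes r_rt_queue,
	 * rebased to start at zero.
	 */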
	return (prio <= PRI_HIGHEST_TS) ?
	    &ci_rq->r_ts_queue[prio].q_head :
	    &ci_rq->r_rt_queue[prio - PRI_HIGHEST_TS - 1].q_head;
}

void
sched_enqueue(struct lwp *l, bool swtch)
{
	runqueue_t *ci_rq;
	struct schedstate_percpu *spc;
	TAILQ_HEAD(, lwp) *q_head;
	const pri_t eprio = lwp_eprio(l);
	struct cpu_info *ci;
	int type;

	ci = l->l_cpu;
	spc = &ci->ci_schedstate;
	ci_rq = spc->spc_sched_info;
	KASSERT(lwp_locked(l, l->l_cpu->ci_schedstate.spc_mutex));

	/* Update the last run time on switch */
	if (__predict_true(swtch == true))
		l->l_rticksum += (hardclock_ticks - l->l_rticks);
	else if (l->l_rticks == 0)
		l->l_rticks = hardclock_ticks;

	/* Enqueue the thread */
	q_head = sched_getrq(ci_rq, eprio);
	if (TAILQ_EMPTY(q_head)) {
		u_int i;
		uint32_t q;

		/* Mark bit */
		i = eprio >> BITMAP_SHIFT;
		q = BITMAP_MSB >> (eprio & BITMAP_MASK);
		KASSERT((ci_rq->r_bitmap[i] & q) == 0);
		ci_rq->r_bitmap[i] |= q;
	}
	TAILQ_INSERT_TAIL(q_head, l, l_runq);
	ci_rq->r_count++;
	if ((l->l_pflag & LP_BOUND) == 0)
		ci_rq->r_mcount++;

	/*
	 * Update the highest priority recorded for the runqueue if this
	 * thread's priority is higher.
	 */
	if (eprio > spc->spc_maxpriority)
		spc->spc_maxpriority = eprio;

	sched_newts(l);

	/*
	 * Wake the chosen CPU or cause a preemption if the newly
	 * enqueued thread has higher priority.  Don't cause a
	 * preemption if the thread is yielding (swtch).
	 */
	if (!swtch && eprio > spc->spc_curpriority) {
		if (eprio >= sched_kpreempt_pri)
			type = RESCHED_KPREEMPT;
		else if (eprio >= sched_upreempt_pri)
			type = RESCHED_IMMED;
		else
			type = RESCHED_LAZY;
		cpu_need_resched(ci, type);
	}
}

void
sched_dequeue(struct lwp *l)
{
	runqueue_t *ci_rq;
	TAILQ_HEAD(, lwp) *q_head;
	struct schedstate_percpu *spc;
	const pri_t eprio = lwp_eprio(l);

	spc = &l->l_cpu->ci_schedstate;
	ci_rq = spc->spc_sched_info;
	KASSERT(lwp_locked(l, spc->spc_mutex));

	KASSERT(eprio <= spc->spc_maxpriority);
	KASSERT(ci_rq->r_bitmap[eprio >> BITMAP_SHIFT] != 0);
	KASSERT(ci_rq->r_count > 0);

	if (spc->spc_migrating == l)
		spc->spc_migrating = NULL;

	ci_rq->r_count--;
	if ((l->l_pflag & LP_BOUND) == 0)
		ci_rq->r_mcount--;

	q_head = sched_getrq(ci_rq, eprio);
	TAILQ_REMOVE(q_head, l, l_runq);
	if (TAILQ_EMPTY(q_head)) {
		u_int i;
		uint32_t q;

		/* Unmark bit */
		i = eprio >> BITMAP_SHIFT;
		q = BITMAP_MSB >> (eprio & BITMAP_MASK);
		KASSERT((ci_rq->r_bitmap[i] & q) != 0);
		ci_rq->r_bitmap[i] &= ~q;

		/*
		 * Update the runqueue's highest priority, in case this was
		 * the last thread in the highest-priority queue.
		 */
		if (eprio != spc->spc_maxpriority)
			return;

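		/*
		 * Scan the bitmap downwards from the current word.  ffs()
		 * is one-based and counts from the least significant bit,
		 * so BITMAP_BITS - q converts the result back into a
		 * priority within the word.
		 */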
		do {
			if (ci_rq->r_bitmap[i] != 0) {
				q = ffs(ci_rq->r_bitmap[i]);
				spc->spc_maxpriority =
				    (i << BITMAP_SHIFT) + (BITMAP_BITS - q);
				return;
			}
		} while (i--);

		/* If no bit is set, fall back to the lowest value */
		spc->spc_maxpriority = 0;
	}
}

/*
 * Migration and balancing.
 */

#ifdef MULTIPROCESSOR

/* Estimate if LWP is cache-hot */
static inline bool
lwp_cache_hot(const struct lwp *l)
{

	if (__predict_false(l->l_slptime || l->l_rticks == 0))
		return false;

	return (hardclock_ticks - l->l_rticks <= cacheht_time);
}

/* Check if LWP can migrate to the chosen CPU */
static inline bool
sched_migratable(const struct lwp *l, struct cpu_info *ci)
{
	const struct schedstate_percpu *spc = &ci->ci_schedstate;
	KASSERT(lwp_locked(__UNCONST(l), NULL));

	/* Is CPU offline? */
	if (__predict_false(spc->spc_flags & SPCF_OFFLINE))
		return false;

	/* Is affinity set? */
	if (__predict_false(l->l_affinity))
		return kcpuset_isset(l->l_affinity, cpu_index(ci));

	/* Is there a processor-set? */
	return (spc->spc_psid == l->l_psid);
}

/*
 * Estimate whether the LWP should migrate to another CPU.
 * Choose and return the CPU to run it on.
 */
struct cpu_info *
sched_takecpu(struct lwp *l)
{
	struct cpu_info *ci, *tci, *pivot, *next;
	struct schedstate_percpu *spc;
	runqueue_t *ci_rq, *ici_rq;
	pri_t eprio, lpri, pri;

	KASSERT(lwp_locked(l, NULL));

	/* If thread is strictly bound, do not estimate other CPUs */
	ci = l->l_cpu;
	if (l->l_pflag & LP_BOUND)
		return ci;

	spc = &ci->ci_schedstate;
	ci_rq = spc->spc_sched_info;

	/* Make sure that thread is in appropriate processor-set */
	if (__predict_true(spc->spc_psid == l->l_psid)) {
		/* If the thread's own CPU has an empty runqueue, stay there */
		if (ci_rq->r_count == 0) {
			ci_rq->r_ev_stay.ev_count++;
			return ci;
		}
		/* Stay if thread is cache-hot */
		eprio = lwp_eprio(l);
		if (__predict_true(l->l_stat != LSIDL) &&
		    lwp_cache_hot(l) && eprio >= spc->spc_curpriority) {
			ci_rq->r_ev_stay.ev_count++;
			return ci;
		}
	} else {
		eprio = lwp_eprio(l);
	}

	/* Run on current CPU if priority of thread is higher */
	ci = curcpu();
	spc = &ci->ci_schedstate;
	if (eprio > spc->spc_curpriority && sched_migratable(l, ci)) {
		ci_rq = spc->spc_sched_info;
		ci_rq->r_ev_localize.ev_count++;
		return ci;
	}

	/*
	 * Look for the CPU with the lowest priority thread.  In case of
	 * equal priority, prefer the CPU with the fewest threads.
	 */
	pivot = l->l_cpu;
	ci = pivot;
	tci = pivot;
	lpri = PRI_COUNT;
	do {
		if ((next = cpu_lookup(cpu_index(ci) + 1)) == NULL) {
			/* Reached the end, start from the beginning. */
			next = cpu_lookup(0);
		}
		spc = &ci->ci_schedstate;
		ici_rq = spc->spc_sched_info;
		pri = MAX(spc->spc_curpriority, spc->spc_maxpriority);
		if (pri > lpri)
			continue;

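		/*
		 * On a tie, keep the current candidate: ci_rq still points
		 * at the runqueue of the best CPU found so far (tci).
		 */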
		if (pri == lpri && ci_rq->r_count < ici_rq->r_count)
			continue;

		if (!sched_migratable(l, ci))
			continue;

		lpri = pri;
		tci = ci;
		ci_rq = ici_rq;
	} while (ci = next, ci != pivot);

	ci_rq = tci->ci_schedstate.spc_sched_info;
	ci_rq->r_ev_push.ev_count++;

	return tci;
}

/*
 * Try to catch an LWP from the runqueue of another CPU.
 */
static struct lwp *
sched_catchlwp(struct cpu_info *ci)
{
	struct cpu_info *curci = curcpu();
	struct schedstate_percpu *spc, *curspc;
	TAILQ_HEAD(, lwp) *q_head;
	runqueue_t *ci_rq;
	struct lwp *l;

	curspc = &curci->ci_schedstate;
	spc = &ci->ci_schedstate;
	KASSERT(curspc->spc_psid == spc->spc_psid);

	ci_rq = spc->spc_sched_info;
	if (ci_rq->r_mcount < min_catch) {
		spc_unlock(ci);
		return NULL;
	}

	/* Take the highest priority thread */
	q_head = sched_getrq(ci_rq, spc->spc_maxpriority);
	l = TAILQ_FIRST(q_head);

	for (;;) {
		/* Check the first and next result from the queue */
		if (l == NULL) {
			break;
		}
		KASSERTMSG(l->l_stat == LSRUN, "%s l %p (%s) l_stat %d",
		    ci->ci_data.cpu_name,
		    l, (l->l_name ? l->l_name : l->l_proc->p_comm), l->l_stat);

		/* Look for threads that are allowed to migrate */
		if ((l->l_pflag & LP_BOUND) || lwp_cache_hot(l) ||
		    !sched_migratable(l, curci)) {
			l = TAILQ_NEXT(l, l_runq);
			continue;
		}

		/* Grab the thread, and move to the local run queue */
		sched_dequeue(l);

		/*
		 * If LWP is still context switching, we may need to
		 * spin-wait before changing its CPU.
		 */
		if (__predict_false(l->l_ctxswtch != 0)) {
			u_int count;
			count = SPINLOCK_BACKOFF_MIN;
			while (l->l_ctxswtch)
				SPINLOCK_BACKOFF(count);
		}
		l->l_cpu = curci;
		ci_rq->r_ev_pull.ev_count++;
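		/*
		 * Hand the LWP over to the local runqueue lock (already
		 * held by the caller); this drops the remote runqueue lock.
		 */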
		lwp_unlock_to(l, curspc->spc_mutex);
		sched_enqueue(l, false);
		return l;
	}
	spc_unlock(ci);

	return l;
}

/*
 * Periodic calculations for balancing.
 */
static void
sched_balance(void *nocallout)
{
	struct cpu_info *ci, *hci;
	runqueue_t *ci_rq;
	CPU_INFO_ITERATOR cii;
	u_int highest;

	hci = curcpu();
	highest = 0;

	/* Update the averages without locking the runqueues */
	for (CPU_INFO_FOREACH(cii, ci)) {
		ci_rq = ci->ci_schedstate.spc_sched_info;

		/*
		 * Decaying average of migratable threads:
		 * avg = (avg + mcount) / 2 on each pass.
		 */
		ci_rq->r_avgcount = (ci_rq->r_avgcount + ci_rq->r_mcount) >> 1;

		/* Look for CPU with the highest average */
		if (ci_rq->r_avgcount > highest) {
			hci = ci;
			highest = ci_rq->r_avgcount;
		}
	}

	/* Update the worker */
	worker_ci = hci;

	if (nocallout == NULL)
		callout_schedule(&balance_ch, balance_period);
}

/*
 * Called from each CPU's idle loop.
 */
void
sched_idle(void)
{
	struct cpu_info *ci = curcpu(), *tci = NULL;
	struct schedstate_percpu *spc, *tspc;
	runqueue_t *ci_rq;
	bool dlock = false;

	/* Check if there is a migrating LWP */
	spc = &ci->ci_schedstate;
	if (spc->spc_migrating == NULL)
		goto no_migration;

	spc_lock(ci);
	for (;;) {
		struct lwp *l;

		l = spc->spc_migrating;
		if (l == NULL)
			break;

		/*
		 * If second attempt, and target CPU has changed,
		 * drop the old lock.
		 */
		if (dlock == true && tci != l->l_target_cpu) {
			KASSERT(tci != NULL);
			spc_unlock(tci);
			dlock = false;
		}

		/*
		 * Nothing to do if the destination has changed to the
		 * local CPU, or if the migration was done by another CPU.
		 */
		tci = l->l_target_cpu;
		if (tci == NULL || tci == ci) {
			spc->spc_migrating = NULL;
			l->l_target_cpu = NULL;
			break;
		}
		tspc = &tci->ci_schedstate;

		/*
		 * Double-lock the runqueues, once per pass, keeping a
		 * consistent order (lower-addressed CPU first): if we
		 * already hold the lower lock, take the other directly;
		 * otherwise try-lock it, and on failure drop our lock,
		 * take both in order and re-check the situation.
		 */
		if (dlock == false) {
			dlock = true;
			if (ci < tci) {
				spc_lock(tci);
			} else if (!mutex_tryenter(tspc->spc_mutex)) {
				spc_unlock(ci);
				spc_lock(tci);
				spc_lock(ci);
				/* Check the situation again.. */
				continue;
			}
		}

		/* Migrate the thread */
		KASSERT(l->l_stat == LSRUN);
		spc->spc_migrating = NULL;
		l->l_target_cpu = NULL;
		sched_dequeue(l);
		l->l_cpu = tci;
		lwp_setlock(l, tspc->spc_mutex);
		sched_enqueue(l, false);
		break;
	}
	if (dlock == true) {
		KASSERT(tci != NULL);
		spc_unlock(tci);
	}
	spc_unlock(ci);

no_migration:
	ci_rq = spc->spc_sched_info;
	if ((spc->spc_flags & SPCF_OFFLINE) != 0 || ci_rq->r_count != 0) {
		return;
	}

	/* Reset the counter, and call the balancer */
	ci_rq->r_avgcount = 0;
	sched_balance(ci);
	tci = worker_ci;
	tspc = &tci->ci_schedstate;
	if (ci == tci || spc->spc_psid != tspc->spc_psid)
		return;
	spc_dlock(ci, tci);
	(void)sched_catchlwp(tci);
	spc_unlock(ci);
}

#else

/*
 * stubs for !MULTIPROCESSOR
 */

struct cpu_info *
sched_takecpu(struct lwp *l)
{

	return l->l_cpu;
}

void
sched_idle(void)
{

}
#endif /* MULTIPROCESSOR */

/*
 * Scheduling statistics and balancing.
 */
void
sched_lwp_stats(struct lwp *l)
{
	int batch;

	KASSERT(lwp_locked(l, NULL));

	/* Update sleep time */
	if (l->l_stat == LSSLEEP || l->l_stat == LSSTOP ||
	    l->l_stat == LSSUSPENDED)
		l->l_slptime++;

	/*
	 * Mark the thread as CPU-bound (batch) if its accumulated run time
	 * exceeds its accumulated sleep time.  Treat it as batch only if it
	 * was already marked, i.e. not the first time it looks CPU-bound.
	 */
	batch = (l->l_rticksum > l->l_slpticksum);
	if (batch != 0) {
		if ((l->l_flag & LW_BATCH) == 0)
			batch = 0;
		l->l_flag |= LW_BATCH;
	} else
		l->l_flag &= ~LW_BATCH;

	/*
	 * If the thread is CPU-bound and never sleeps, it would occupy the
	 * CPU.  In that case reset the last-sleep counter and check it again
	 * later: if it is still zero, pick a new CPU for the thread and
	 * clear the batch flag.
	 */
	if (batch && (l->l_slptime + l->l_slpticksum) == 0) {
		if (l->l_slpticks == 0) {
			if (l->l_target_cpu == NULL &&
			    (l->l_stat == LSRUN || l->l_stat == LSONPROC)) {
				struct cpu_info *ci = sched_takecpu(l);
				l->l_target_cpu = (ci != l->l_cpu) ? ci : NULL;
			}
			l->l_flag &= ~LW_BATCH;
		} else {
			l->l_slpticks = 0;
		}
	}

	/* Reset the time sums */
	l->l_slpticksum = 0;
	l->l_rticksum = 0;

	/* Scheduler-specific hook */
	sched_pstats_hook(l, batch);
#ifdef KDTRACE_HOOKS
	curthread = l;
#endif
}

/*
 * Scheduler mill.
 */
struct lwp *
sched_nextlwp(void)
{
	struct cpu_info *ci = curcpu();
	struct schedstate_percpu *spc;
	TAILQ_HEAD(, lwp) *q_head;
	runqueue_t *ci_rq;
	struct lwp *l;

	/* Return to idle LWP if there is a migrating thread */
	spc = &ci->ci_schedstate;
	if (__predict_false(spc->spc_migrating != NULL))
		return NULL;
	ci_rq = spc->spc_sched_info;

#ifdef MULTIPROCESSOR
	/* If the runqueue is empty, try to catch a thread from another CPU */
	if (__predict_false(ci_rq->r_count == 0)) {
		struct schedstate_percpu *cspc;
		struct cpu_info *cci;

		/* Offline CPUs should not perform this, however */
		if (__predict_false(spc->spc_flags & SPCF_OFFLINE))
			return NULL;

		/* Reset the counter, and call the balancer */
		ci_rq->r_avgcount = 0;
		sched_balance(ci);
		cci = worker_ci;
		cspc = &cci->ci_schedstate;
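		/*
		 * Try-lock only: if the victim's runqueue lock is busy,
		 * give up and let this CPU go idle instead of spinning here.
		 */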
		if (ci == cci || spc->spc_psid != cspc->spc_psid ||
		    !mutex_tryenter(cci->ci_schedstate.spc_mutex))
			return NULL;
		return sched_catchlwp(cci);
	}
#else
	if (__predict_false(ci_rq->r_count == 0))
		return NULL;
#endif

	/* Take the highest priority thread */
	KASSERT(ci_rq->r_bitmap[spc->spc_maxpriority >> BITMAP_SHIFT]);
	q_head = sched_getrq(ci_rq, spc->spc_maxpriority);
	l = TAILQ_FIRST(q_head);
	KASSERT(l != NULL);

	sched_oncpu(l);
	l->l_rticks = hardclock_ticks;

	return l;
}

/*
 * sched_curcpu_runnable_p: return if curcpu() should exit the idle loop.
 */

bool
sched_curcpu_runnable_p(void)
{
	const struct cpu_info *ci;
	const struct schedstate_percpu *spc;
	const runqueue_t *ci_rq;
	bool rv;

	kpreempt_disable();
	ci = curcpu();
	spc = &ci->ci_schedstate;
	ci_rq = spc->spc_sched_info;

#ifndef __HAVE_FAST_SOFTINTS
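	/*
	 * Without fast soft interrupts, pending soft interrupts need a
	 * context switch to run, so treat the CPU as runnable.
	 */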
	if (ci->ci_data.cpu_softints) {
		kpreempt_enable();
		return true;
	}
#endif

	rv = (ci_rq->r_count != 0);
	kpreempt_enable();

	return rv;
}

/*
 * Sysctl nodes and initialization.
 */

SYSCTL_SETUP(sysctl_sched_setup, "sysctl sched setup")
{
	const struct sysctlnode *node = NULL;

	sysctl_createv(clog, 0, NULL, &node,
		CTLFLAG_PERMANENT,
		CTLTYPE_NODE, "sched",
		SYSCTL_DESCR("Scheduler options"),
		NULL, 0, NULL, 0,
		CTL_KERN, CTL_CREATE, CTL_EOL);

	if (node == NULL)
		return;

	sysctl_createv(clog, 0, &node, NULL,
		CTLFLAG_PERMANENT | CTLFLAG_READWRITE,
		CTLTYPE_INT, "cacheht_time",
		SYSCTL_DESCR("Cache hotness time (in ticks)"),
		NULL, 0, &cacheht_time, 0,
		CTL_CREATE, CTL_EOL);
	sysctl_createv(clog, 0, &node, NULL,
		CTLFLAG_PERMANENT | CTLFLAG_READWRITE,
		CTLTYPE_INT, "balance_period",
		SYSCTL_DESCR("Balance period (in ticks)"),
		NULL, 0, &balance_period, 0,
		CTL_CREATE, CTL_EOL);
	sysctl_createv(clog, 0, &node, NULL,
		CTLFLAG_PERMANENT | CTLFLAG_READWRITE,
		CTLTYPE_INT, "min_catch",
		SYSCTL_DESCR("Minimal count of threads for catching"),
		NULL, 0, &min_catch, 0,
		CTL_CREATE, CTL_EOL);
	sysctl_createv(clog, 0, &node, NULL,
		CTLFLAG_PERMANENT | CTLFLAG_READWRITE,
		CTLTYPE_INT, "timesoftints",
		SYSCTL_DESCR("Track CPU time for soft interrupts"),
		NULL, 0, &softint_timing, 0,
		CTL_CREATE, CTL_EOL);
	sysctl_createv(clog, 0, &node, NULL,
		CTLFLAG_PERMANENT | CTLFLAG_READWRITE,
		CTLTYPE_INT, "kpreempt_pri",
		SYSCTL_DESCR("Minimum priority to trigger kernel preemption"),
		NULL, 0, &sched_kpreempt_pri, 0,
		CTL_CREATE, CTL_EOL);
	sysctl_createv(clog, 0, &node, NULL,
		CTLFLAG_PERMANENT | CTLFLAG_READWRITE,
		CTLTYPE_INT, "upreempt_pri",
		SYSCTL_DESCR("Minimum priority to trigger user preemption"),
		NULL, 0, &sched_upreempt_pri, 0,
		CTL_CREATE, CTL_EOL);
}

/*
 * Debugging.
 */

#ifdef DDB

void
sched_print_runqueue(void (*pr)(const char *, ...))
{
	runqueue_t *ci_rq;
	struct cpu_info *ci, *tci;
	struct schedstate_percpu *spc;
	struct lwp *l;
	struct proc *p;
	CPU_INFO_ITERATOR cii;

	for (CPU_INFO_FOREACH(cii, ci)) {
		int i;

		spc = &ci->ci_schedstate;
		ci_rq = spc->spc_sched_info;

		(*pr)("Run-queue (CPU = %u):\n", ci->ci_index);
		(*pr)(" pid.lid = %d.%d, r_count = %u, r_avgcount = %u, "
		    "maxpri = %d, mlwp = %p\n",
#ifdef MULTIPROCESSOR
		    ci->ci_curlwp->l_proc->p_pid, ci->ci_curlwp->l_lid,
#else
		    curlwp->l_proc->p_pid, curlwp->l_lid,
#endif
		    ci_rq->r_count, ci_rq->r_avgcount, spc->spc_maxpriority,
		    spc->spc_migrating);
		i = (PRI_COUNT >> BITMAP_SHIFT) - 1;
		do {
			uint32_t q;
			q = ci_rq->r_bitmap[i];
			(*pr)(" bitmap[%d] => [ %d (0x%x) ]\n", i, ffs(q), q);
		} while (i--);
	}

	(*pr)(" %5s %4s %4s %10s %3s %18s %4s %4s %s\n",
	    "LID", "PRI", "EPRI", "FL", "ST", "LWP", "CPU", "TCI", "LRTICKS");

	PROCLIST_FOREACH(p, &allproc) {
		(*pr)(" /- %d (%s)\n", (int)p->p_pid, p->p_comm);
		LIST_FOREACH(l, &p->p_lwps, l_sibling) {
			ci = l->l_cpu;
			tci = l->l_target_cpu;
			(*pr)(" | %5d %4u %4u 0x%8.8x %3s %18p %4u %4d %u\n",
			    (int)l->l_lid, l->l_priority, lwp_eprio(l),
			    l->l_flag, l->l_stat == LSRUN ? "RQ" :
			    (l->l_stat == LSSLEEP ? "SQ" : "-"),
			    l, ci->ci_index, (tci ? tci->ci_index : -1),
			    (u_int)(hardclock_ticks - l->l_rticks));
		}
	}
}

#endif
