/*	$NetBSD: kern_runq.c,v 1.45 2016/07/07 06:55:43 msaitoh Exp $	*/

/*
 * Copyright (c) 2007, 2008 Mindaugas Rasiukevicius <rmind at NetBSD org>
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: kern_runq.c,v 1.45 2016/07/07 06:55:43 msaitoh Exp $");

#include "opt_dtrace.h"

#include <sys/param.h>
#include <sys/kernel.h>
#include <sys/bitops.h>
#include <sys/cpu.h>
#include <sys/idle.h>
#include <sys/intr.h>
#include <sys/kmem.h>
#include <sys/lwp.h>
#include <sys/mutex.h>
#include <sys/proc.h>
#include <sys/sched.h>
#include <sys/syscallargs.h>
#include <sys/sysctl.h>
#include <sys/systm.h>
#include <sys/types.h>
#include <sys/evcnt.h>
/*
 * Priority related definitions.
 */
#define	PRI_TS_COUNT	(NPRI_USER)
#define	PRI_RT_COUNT	(PRI_COUNT - PRI_TS_COUNT)
#define	PRI_HTS_RANGE	(PRI_TS_COUNT / 10)

#define	PRI_HIGHEST_TS	(MAXPRI_USER)

/*
 * Bits per map.
 */
#define	BITMAP_BITS	(32)
#define	BITMAP_SHIFT	(5)
#define	BITMAP_MSB	(0x80000000U)
#define	BITMAP_MASK	(BITMAP_BITS - 1)
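
/*
 * Each priority level has one bit in the per-CPU bitmap: priority p maps
 * to word (p >> BITMAP_SHIFT) and to bit (BITMAP_MSB >> (p & BITMAP_MASK)),
 * e.g. priority 35 maps to bit 0x10000000 of r_bitmap[1].  Higher
 * priorities therefore occupy less significant bits within a word, so
 * ffs() on the highest non-zero word yields the highest runnable priority
 * (see sched_enqueue() and sched_dequeue()).
 */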

/*
 * Structures, runqueue.
 */

const int	schedppq = 1;

typedef struct {
	TAILQ_HEAD(, lwp) q_head;
} queue_t;

typedef struct {
	/* Bitmap */
	uint32_t	r_bitmap[PRI_COUNT >> BITMAP_SHIFT];
	/* Counters */
	u_int		r_count;	/* Count of threads */
	u_int		r_avgcount;	/* Average count of threads */
	u_int		r_mcount;	/* Count of migratable threads */
	/* Runqueues */
	queue_t		r_rt_queue[PRI_RT_COUNT];
	queue_t		r_ts_queue[PRI_TS_COUNT];
	/* Event counters */
	struct evcnt	r_ev_pull;
	struct evcnt	r_ev_push;
	struct evcnt	r_ev_stay;
	struct evcnt	r_ev_localize;
} runqueue_t;

static void *	sched_getrq(runqueue_t *, const pri_t);
#ifdef MULTIPROCESSOR
static lwp_t *	sched_catchlwp(struct cpu_info *);
static void	sched_balance(void *);
#endif

/*
 * Preemption control.
 */
int		sched_upreempt_pri = 0;
#ifdef __HAVE_PREEMPTION
# ifdef DEBUG
int		sched_kpreempt_pri = 0;
# else
int		sched_kpreempt_pri = PRI_USER_RT;
# endif
#else
int		sched_kpreempt_pri = 1000;
#endif
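/*
 * The thresholds above are used by sched_enqueue(): an enqueued LWP whose
 * effective priority is at or above sched_kpreempt_pri requests kernel
 * preemption (RESCHED_KPREEMPT), one at or above sched_upreempt_pri
 * requests an immediate user preemption (RESCHED_IMMED), and anything
 * lower only gets a lazy reschedule (RESCHED_LAZY).
 */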

/*
 * Migration and balancing.
 */
static u_int	cacheht_time;		/* Cache hotness time */
static u_int	min_catch;		/* Minimum LWP count for catching */
static u_int	balance_period;		/* Balance period */
static struct cpu_info *worker_ci;	/* Victim CPU */
#ifdef MULTIPROCESSOR
static struct callout balance_ch;	/* Balancer callout */
#endif

#ifdef KDTRACE_HOOKS
struct lwp *curthread;
#endif

void
runq_init(void)
{

	/* Balancing */
	worker_ci = curcpu();
	cacheht_time = mstohz(3);		/* ~3 ms */
	balance_period = mstohz(300);		/* ~300 ms */

	/* Minimum count of LWPs for catching */
	min_catch = 1;

	/* Initialize the balancing callout and schedule it */
#ifdef MULTIPROCESSOR
	callout_init(&balance_ch, CALLOUT_MPSAFE);
	callout_setfunc(&balance_ch, sched_balance, NULL);
	callout_schedule(&balance_ch, balance_period);
#endif
}

void
sched_cpuattach(struct cpu_info *ci)
{
	runqueue_t *ci_rq;
	void *rq_ptr;
	u_int i, size;

	if (ci->ci_schedstate.spc_lwplock == NULL) {
		ci->ci_schedstate.spc_lwplock =
		    mutex_obj_alloc(MUTEX_DEFAULT, IPL_SCHED);
	}
	if (ci == lwp0.l_cpu) {
		/* Initialize the scheduler structure of the primary LWP */
		lwp0.l_mutex = ci->ci_schedstate.spc_lwplock;
	}
	if (ci->ci_schedstate.spc_mutex != NULL) {
		/* Already initialized. */
		return;
	}

	/* Allocate the run queue */
	size = roundup2(sizeof(runqueue_t), coherency_unit) + coherency_unit;
	rq_ptr = kmem_zalloc(size, KM_SLEEP);
	if (rq_ptr == NULL) {
		panic("sched_cpuattach: could not allocate the runqueue");
	}
	ci_rq = (void *)(roundup2((uintptr_t)(rq_ptr), coherency_unit));

	/* Initialize run queues */
	ci->ci_schedstate.spc_mutex =
	    mutex_obj_alloc(MUTEX_DEFAULT, IPL_SCHED);
	for (i = 0; i < PRI_RT_COUNT; i++)
		TAILQ_INIT(&ci_rq->r_rt_queue[i].q_head);
	for (i = 0; i < PRI_TS_COUNT; i++)
		TAILQ_INIT(&ci_rq->r_ts_queue[i].q_head);

	ci->ci_schedstate.spc_sched_info = ci_rq;

	evcnt_attach_dynamic(&ci_rq->r_ev_pull, EVCNT_TYPE_MISC, NULL,
	    cpu_name(ci), "runqueue pull");
	evcnt_attach_dynamic(&ci_rq->r_ev_push, EVCNT_TYPE_MISC, NULL,
	    cpu_name(ci), "runqueue push");
	evcnt_attach_dynamic(&ci_rq->r_ev_stay, EVCNT_TYPE_MISC, NULL,
	    cpu_name(ci), "runqueue stay");
	evcnt_attach_dynamic(&ci_rq->r_ev_localize, EVCNT_TYPE_MISC, NULL,
	    cpu_name(ci), "runqueue localize");
}

/*
 * Control of the runqueue.
 */

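/*
 * sched_getrq: return the queue head for the given priority level.
 * Time-sharing priorities (0 .. PRI_HIGHEST_TS) index r_ts_queue
 * directly; real-time priorities above that range index r_rt_queue.
 */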
static inline void *
sched_getrq(runqueue_t *ci_rq, const pri_t prio)
{

	KASSERT(prio < PRI_COUNT);
	return (prio <= PRI_HIGHEST_TS) ?
	    &ci_rq->r_ts_queue[prio].q_head :
	    &ci_rq->r_rt_queue[prio - PRI_HIGHEST_TS - 1].q_head;
}

void
sched_enqueue(struct lwp *l, bool swtch)
{
	runqueue_t *ci_rq;
	struct schedstate_percpu *spc;
	TAILQ_HEAD(, lwp) *q_head;
	const pri_t eprio = lwp_eprio(l);
	struct cpu_info *ci;
	int type;

	ci = l->l_cpu;
	spc = &ci->ci_schedstate;
	ci_rq = spc->spc_sched_info;
	KASSERT(lwp_locked(l, l->l_cpu->ci_schedstate.spc_mutex));

	/* Update the last run time on switch */
	if (__predict_true(swtch == true))
		l->l_rticksum += (hardclock_ticks - l->l_rticks);
	else if (l->l_rticks == 0)
		l->l_rticks = hardclock_ticks;

	/* Enqueue the thread */
	q_head = sched_getrq(ci_rq, eprio);
	if (TAILQ_EMPTY(q_head)) {
		u_int i;
		uint32_t q;

		/* Mark bit */
		i = eprio >> BITMAP_SHIFT;
		q = BITMAP_MSB >> (eprio & BITMAP_MASK);
		KASSERT((ci_rq->r_bitmap[i] & q) == 0);
		ci_rq->r_bitmap[i] |= q;
	}
	TAILQ_INSERT_TAIL(q_head, l, l_runq);
	ci_rq->r_count++;
	if ((l->l_pflag & LP_BOUND) == 0)
		ci_rq->r_mcount++;

	/*
	 * Update the runqueue's highest priority if this thread's
	 * priority is higher.
	 */
	if (eprio > spc->spc_maxpriority)
		spc->spc_maxpriority = eprio;

	sched_newts(l);

	/*
	 * Wake the chosen CPU or cause a preemption if the newly
	 * enqueued thread has higher priority.  Don't cause a
	 * preemption if the thread is yielding (swtch).
	 */
	if (!swtch && eprio > spc->spc_curpriority) {
		if (eprio >= sched_kpreempt_pri)
			type = RESCHED_KPREEMPT;
		else if (eprio >= sched_upreempt_pri)
			type = RESCHED_IMMED;
		else
			type = RESCHED_LAZY;
		cpu_need_resched(ci, type);
	}
}

void
sched_dequeue(struct lwp *l)
{
	runqueue_t *ci_rq;
	TAILQ_HEAD(, lwp) *q_head;
	struct schedstate_percpu *spc;
	const pri_t eprio = lwp_eprio(l);

	spc = &l->l_cpu->ci_schedstate;
	ci_rq = spc->spc_sched_info;
	KASSERT(lwp_locked(l, spc->spc_mutex));

	KASSERT(eprio <= spc->spc_maxpriority);
	KASSERT(ci_rq->r_bitmap[eprio >> BITMAP_SHIFT] != 0);
	KASSERT(ci_rq->r_count > 0);

	if (spc->spc_migrating == l)
		spc->spc_migrating = NULL;

	ci_rq->r_count--;
	if ((l->l_pflag & LP_BOUND) == 0)
		ci_rq->r_mcount--;

	q_head = sched_getrq(ci_rq, eprio);
	TAILQ_REMOVE(q_head, l, l_runq);
	if (TAILQ_EMPTY(q_head)) {
		u_int i;
		uint32_t q;

		/* Unmark bit */
		i = eprio >> BITMAP_SHIFT;
		q = BITMAP_MSB >> (eprio & BITMAP_MASK);
		KASSERT((ci_rq->r_bitmap[i] & q) != 0);
		ci_rq->r_bitmap[i] &= ~q;

		/*
		 * Update the runqueue's highest priority if this was
		 * the last thread in the highest-priority queue.
		 */
		if (eprio != spc->spc_maxpriority)
			return;

		do {
			if (ci_rq->r_bitmap[i] != 0) {
				q = ffs(ci_rq->r_bitmap[i]);
				spc->spc_maxpriority =
				    (i << BITMAP_SHIFT) + (BITMAP_BITS - q);
				return;
			}
		} while (i--);

		/* If nothing was found, set the lowest value */
		spc->spc_maxpriority = 0;
	}
}

/*
 * Migration and balancing.
 */
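
/*
 * Three mechanisms move LWPs between CPUs: sched_takecpu() chooses the
 * CPU on which an LWP should be enqueued, sched_catchlwp() lets an idle
 * CPU pull a runnable LWP from another CPU's runqueue, and the
 * sched_balance() callout periodically records the busiest CPU
 * (worker_ci) as the victim for such pulls.
 */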

#ifdef MULTIPROCESSOR

/* Estimate whether the LWP is cache-hot */
static inline bool
lwp_cache_hot(const struct lwp *l)
{

	if (__predict_false(l->l_slptime || l->l_rticks == 0))
		return false;

	return (hardclock_ticks - l->l_rticks <= cacheht_time);
}

/* Check whether the LWP can migrate to the chosen CPU */
static inline bool
sched_migratable(const struct lwp *l, struct cpu_info *ci)
{
	const struct schedstate_percpu *spc = &ci->ci_schedstate;
	KASSERT(lwp_locked(__UNCONST(l), NULL));

	/* Is the CPU offline? */
	if (__predict_false(spc->spc_flags & SPCF_OFFLINE))
		return false;

	/* Is an affinity set? */
	if (__predict_false(l->l_affinity))
		return kcpuset_isset(l->l_affinity, cpu_index(ci));

	/* Is there a processor-set? */
	return (spc->spc_psid == l->l_psid);
}

/*
 * sched_takecpu: choose the CPU on which to run the LWP.
 * Return a different CPU if migrating it is worthwhile.
 */
struct cpu_info *
sched_takecpu(struct lwp *l)
{
	struct cpu_info *ci, *tci, *pivot, *next;
	struct schedstate_percpu *spc;
	runqueue_t *ci_rq, *ici_rq;
	pri_t eprio, lpri, pri;

	KASSERT(lwp_locked(l, NULL));

	/* If the thread is strictly bound, do not consider other CPUs */
	ci = l->l_cpu;
	if (l->l_pflag & LP_BOUND)
		return ci;

	spc = &ci->ci_schedstate;
	ci_rq = spc->spc_sched_info;

	/* Consider staying only if the thread is in the appropriate processor-set */
	if (__predict_true(spc->spc_psid == l->l_psid)) {
		/* If this thread's CPU is idle, run there */
		if (ci_rq->r_count == 0) {
			ci_rq->r_ev_stay.ev_count++;
			return ci;
		}
		/* Stay if the thread is cache-hot */
		eprio = lwp_eprio(l);
		if (__predict_true(l->l_stat != LSIDL) &&
		    lwp_cache_hot(l) && eprio >= spc->spc_curpriority) {
			ci_rq->r_ev_stay.ev_count++;
			return ci;
		}
	} else {
		eprio = lwp_eprio(l);
	}

	/* Run on the current CPU if the thread's priority is higher */
	ci = curcpu();
	spc = &ci->ci_schedstate;
	if (eprio > spc->spc_curpriority && sched_migratable(l, ci)) {
		ci_rq = spc->spc_sched_info;
		ci_rq->r_ev_localize.ev_count++;
		return ci;
	}

	/*
	 * Look for the CPU with the lowest priority thread.  In case of
	 * equal priority, choose the CPU with the fewest threads.
	 */
	pivot = l->l_cpu;
	ci = pivot;
	tci = pivot;
	lpri = PRI_COUNT;
	do {
		if ((next = cpu_lookup(cpu_index(ci) + 1)) == NULL) {
			/* Reached the end, start from the beginning. */
			next = cpu_lookup(0);
		}
		spc = &ci->ci_schedstate;
		ici_rq = spc->spc_sched_info;
		pri = MAX(spc->spc_curpriority, spc->spc_maxpriority);
		if (pri > lpri)
			continue;

		if (pri == lpri && ci_rq->r_count < ici_rq->r_count)
			continue;

		if (!sched_migratable(l, ci))
			continue;

		lpri = pri;
		tci = ci;
		ci_rq = ici_rq;
	} while (ci = next, ci != pivot);

	ci_rq = tci->ci_schedstate.spc_sched_info;
	ci_rq->r_ev_push.ev_count++;

	return tci;
}

/*
 * sched_catchlwp: try to catch an LWP from the runqueue of another CPU.
 */
static struct lwp *
sched_catchlwp(struct cpu_info *ci)
{
	struct cpu_info *curci = curcpu();
	struct schedstate_percpu *spc, *curspc;
	TAILQ_HEAD(, lwp) *q_head;
	runqueue_t *ci_rq;
	struct lwp *l;

	curspc = &curci->ci_schedstate;
	spc = &ci->ci_schedstate;
	KASSERT(curspc->spc_psid == spc->spc_psid);

	ci_rq = spc->spc_sched_info;
	if (ci_rq->r_mcount < min_catch) {
		spc_unlock(ci);
		return NULL;
	}

	/* Take the highest priority thread */
	q_head = sched_getrq(ci_rq, spc->spc_maxpriority);
	l = TAILQ_FIRST(q_head);

	for (;;) {
		/* Check the first and next result from the queue */
		if (l == NULL) {
			break;
		}
		KASSERTMSG(l->l_stat == LSRUN, "%s l %p (%s) l_stat %d",
		    ci->ci_data.cpu_name,
		    l, (l->l_name ? l->l_name : l->l_proc->p_comm), l->l_stat);

		/* Look for a thread that is allowed to migrate */
		if ((l->l_pflag & LP_BOUND) || lwp_cache_hot(l) ||
		    !sched_migratable(l, curci)) {
			l = TAILQ_NEXT(l, l_runq);
			continue;
		}

		/* Grab the thread and move it to the local run queue */
		sched_dequeue(l);

		/*
		 * If the LWP is still context switching, we may need to
		 * spin-wait before changing its CPU.
		 */
		if (__predict_false(l->l_ctxswtch != 0)) {
			u_int count;
			count = SPINLOCK_BACKOFF_MIN;
			while (l->l_ctxswtch)
				SPINLOCK_BACKOFF(count);
		}
		l->l_cpu = curci;
		ci_rq->r_ev_pull.ev_count++;
		lwp_unlock_to(l, curspc->spc_mutex);
		sched_enqueue(l, false);
		return l;
	}
	spc_unlock(ci);

	return l;
}

/*
 * sched_balance: periodic calculations for balancing.  Maintain a decaying
 * average of migratable threads for each CPU and remember the busiest CPU
 * as worker_ci, the victim for idle CPUs to pull from.
 */
static void
sched_balance(void *nocallout)
{
	struct cpu_info *ci, *hci;
	runqueue_t *ci_rq;
	CPU_INFO_ITERATOR cii;
	u_int highest;

	hci = curcpu();
	highest = 0;

	/* Take the counts without locking */
	for (CPU_INFO_FOREACH(cii, ci)) {
		ci_rq = ci->ci_schedstate.spc_sched_info;

		/* Decaying average of migratable threads */
		ci_rq->r_avgcount = (ci_rq->r_avgcount + ci_rq->r_mcount) >> 1;

		/* Look for the CPU with the highest average */
		if (ci_rq->r_avgcount > highest) {
			hci = ci;
			highest = ci_rq->r_avgcount;
		}
	}

	/* Update the worker */
	worker_ci = hci;

	if (nocallout == NULL)
		callout_schedule(&balance_ch, balance_period);
}

/*
 * sched_idle: called from each CPU's idle loop.  Finish any pending LWP
 * migration first; then, if the local runqueue is empty, run the balancer
 * and try to catch an LWP from the busiest CPU.
 */
void
sched_idle(void)
{
	struct cpu_info *ci = curcpu(), *tci = NULL;
	struct schedstate_percpu *spc, *tspc;
	runqueue_t *ci_rq;
	bool dlock = false;

	/* Check if there is a migrating LWP */
	spc = &ci->ci_schedstate;
	if (spc->spc_migrating == NULL)
		goto no_migration;

	spc_lock(ci);
	for (;;) {
		struct lwp *l;

		l = spc->spc_migrating;
		if (l == NULL)
			break;

		/*
		 * If this is a second attempt and the target CPU has
		 * changed, drop the old lock.
		 */
		if (dlock == true && tci != l->l_target_cpu) {
			KASSERT(tci != NULL);
			spc_unlock(tci);
			dlock = false;
		}

		/*
		 * Nothing to do if the destination has changed to the
		 * local CPU, or the migration was done by another CPU.
		 */
		tci = l->l_target_cpu;
		if (tci == NULL || tci == ci) {
			spc->spc_migrating = NULL;
			l->l_target_cpu = NULL;
			break;
		}
		tspc = &tci->ci_schedstate;

		/*
		 * Double-lock the runqueues; we do this only once.  Locks
		 * are taken in ascending cpu_info order to avoid deadlock:
		 * if that order would be violated, try-lock the target and,
		 * on failure, drop our lock and reacquire both in order.
		 */
		if (dlock == false) {
			dlock = true;
			if (ci < tci) {
				spc_lock(tci);
			} else if (!mutex_tryenter(tspc->spc_mutex)) {
				spc_unlock(ci);
				spc_lock(tci);
				spc_lock(ci);
				/* Check the situation again. */
				continue;
			}
		}

		/* Migrate the thread */
		KASSERT(l->l_stat == LSRUN);
		spc->spc_migrating = NULL;
		l->l_target_cpu = NULL;
		sched_dequeue(l);
		l->l_cpu = tci;
		lwp_setlock(l, tspc->spc_mutex);
		sched_enqueue(l, false);
		break;
	}
	if (dlock == true) {
		KASSERT(tci != NULL);
		spc_unlock(tci);
	}
	spc_unlock(ci);

no_migration:
	ci_rq = spc->spc_sched_info;
	if ((spc->spc_flags & SPCF_OFFLINE) != 0 || ci_rq->r_count != 0) {
		return;
	}

	/* Reset the counter, and call the balancer */
	ci_rq->r_avgcount = 0;
	sched_balance(ci);
	tci = worker_ci;
	tspc = &tci->ci_schedstate;
	if (ci == tci || spc->spc_psid != tspc->spc_psid)
		return;
	spc_dlock(ci, tci);
	(void)sched_catchlwp(tci);
	spc_unlock(ci);
}

#else

/*
 * stubs for !MULTIPROCESSOR
 */

struct cpu_info *
sched_takecpu(struct lwp *l)
{

	return l->l_cpu;
}

void
sched_idle(void)
{

}
#endif	/* MULTIPROCESSOR */

/*
 * Scheduling statistics and balancing.
 */
void
sched_lwp_stats(struct lwp *l)
{
	int batch;

	KASSERT(lwp_locked(l, NULL));

	/* Update sleep time */
	if (l->l_stat == LSSLEEP || l->l_stat == LSSTOP ||
	    l->l_stat == LSSUSPENDED)
		l->l_slptime++;

	/*
	 * Mark the thread as CPU-bound (LW_BATCH) if its run time exceeds
	 * its sleep time.  Treat it as a batch thread only if it was
	 * already marked on the previous pass.
	 */
	batch = (l->l_rticksum > l->l_slpticksum);
	if (batch != 0) {
		if ((l->l_flag & LW_BATCH) == 0)
			batch = 0;
		l->l_flag |= LW_BATCH;
	} else
		l->l_flag &= ~LW_BATCH;

	/*
	 * A CPU-bound thread that never sleeps would monopolize the CPU.
	 * In that case reset the last sleep tick count and check it on the
	 * next pass: if it is still zero, pick a migration target and
	 * clear the batch flag.
	 */
	if (batch && (l->l_slptime + l->l_slpticksum) == 0) {
		if (l->l_slpticks == 0) {
			if (l->l_target_cpu == NULL &&
			    (l->l_stat == LSRUN || l->l_stat == LSONPROC)) {
				struct cpu_info *ci = sched_takecpu(l);
				l->l_target_cpu = (ci != l->l_cpu) ? ci : NULL;
			}
			l->l_flag &= ~LW_BATCH;
		} else {
			l->l_slpticks = 0;
		}
	}

	/* Reset the time sums */
	l->l_slpticksum = 0;
	l->l_rticksum = 0;

	/* Scheduler-specific hook */
	sched_pstats_hook(l, batch);
#ifdef KDTRACE_HOOKS
	curthread = l;
#endif
}

/*
 * Scheduler mill: select the next LWP to run on this CPU.
 */
struct lwp *
sched_nextlwp(void)
{
	struct cpu_info *ci = curcpu();
	struct schedstate_percpu *spc;
	TAILQ_HEAD(, lwp) *q_head;
	runqueue_t *ci_rq;
	struct lwp *l;

	/* Return to the idle LWP if there is a migrating thread */
	spc = &ci->ci_schedstate;
	if (__predict_false(spc->spc_migrating != NULL))
		return NULL;
	ci_rq = spc->spc_sched_info;

#ifdef MULTIPROCESSOR
	/* If the runqueue is empty, try to catch a thread from another CPU */
	if (__predict_false(ci_rq->r_count == 0)) {
		struct schedstate_percpu *cspc;
		struct cpu_info *cci;

		/* However, offline CPUs should not do this */
		if (__predict_false(spc->spc_flags & SPCF_OFFLINE))
			return NULL;

		/* Reset the counter, and call the balancer */
		ci_rq->r_avgcount = 0;
		sched_balance(ci);
		cci = worker_ci;
		cspc = &cci->ci_schedstate;
		if (ci == cci || spc->spc_psid != cspc->spc_psid ||
		    !mutex_tryenter(cci->ci_schedstate.spc_mutex))
			return NULL;
		return sched_catchlwp(cci);
	}
#else
	if (__predict_false(ci_rq->r_count == 0))
		return NULL;
#endif

	/* Take the highest priority thread */
	KASSERT(ci_rq->r_bitmap[spc->spc_maxpriority >> BITMAP_SHIFT]);
	q_head = sched_getrq(ci_rq, spc->spc_maxpriority);
	l = TAILQ_FIRST(q_head);
	KASSERT(l != NULL);

	sched_oncpu(l);
	l->l_rticks = hardclock_ticks;

	return l;
}

/*
 * sched_curcpu_runnable_p: return true if curcpu() should exit the idle loop.
 */

bool
sched_curcpu_runnable_p(void)
{
	const struct cpu_info *ci;
	const struct schedstate_percpu *spc;
	const runqueue_t *ci_rq;
	bool rv;

	kpreempt_disable();
	ci = curcpu();
	spc = &ci->ci_schedstate;
	ci_rq = spc->spc_sched_info;

#ifndef __HAVE_FAST_SOFTINTS
	if (ci->ci_data.cpu_softints) {
		kpreempt_enable();
		return true;
	}
#endif

	rv = (ci_rq->r_count != 0) ? true : false;
	kpreempt_enable();

	return rv;
}

/*
 * Sysctl nodes and initialization.
 */

SYSCTL_SETUP(sysctl_sched_setup, "sysctl sched setup")
{
	const struct sysctlnode *node = NULL;

	sysctl_createv(clog, 0, NULL, &node,
		CTLFLAG_PERMANENT,
		CTLTYPE_NODE, "sched",
		SYSCTL_DESCR("Scheduler options"),
		NULL, 0, NULL, 0,
		CTL_KERN, CTL_CREATE, CTL_EOL);

	if (node == NULL)
		return;

	sysctl_createv(clog, 0, &node, NULL,
		CTLFLAG_PERMANENT | CTLFLAG_READWRITE,
		CTLTYPE_INT, "cacheht_time",
		SYSCTL_DESCR("Cache hotness time (in ticks)"),
		NULL, 0, &cacheht_time, 0,
		CTL_CREATE, CTL_EOL);
	sysctl_createv(clog, 0, &node, NULL,
		CTLFLAG_PERMANENT | CTLFLAG_READWRITE,
		CTLTYPE_INT, "balance_period",
		SYSCTL_DESCR("Balance period (in ticks)"),
		NULL, 0, &balance_period, 0,
		CTL_CREATE, CTL_EOL);
	sysctl_createv(clog, 0, &node, NULL,
		CTLFLAG_PERMANENT | CTLFLAG_READWRITE,
		CTLTYPE_INT, "min_catch",
		SYSCTL_DESCR("Minimal count of threads for catching"),
		NULL, 0, &min_catch, 0,
		CTL_CREATE, CTL_EOL);
	sysctl_createv(clog, 0, &node, NULL,
		CTLFLAG_PERMANENT | CTLFLAG_READWRITE,
		CTLTYPE_INT, "timesoftints",
		SYSCTL_DESCR("Track CPU time for soft interrupts"),
		NULL, 0, &softint_timing, 0,
		CTL_CREATE, CTL_EOL);
	sysctl_createv(clog, 0, &node, NULL,
		CTLFLAG_PERMANENT | CTLFLAG_READWRITE,
		CTLTYPE_INT, "kpreempt_pri",
		SYSCTL_DESCR("Minimum priority to trigger kernel preemption"),
		NULL, 0, &sched_kpreempt_pri, 0,
		CTL_CREATE, CTL_EOL);
	sysctl_createv(clog, 0, &node, NULL,
		CTLFLAG_PERMANENT | CTLFLAG_READWRITE,
		CTLTYPE_INT, "upreempt_pri",
		SYSCTL_DESCR("Minimum priority to trigger user preemption"),
		NULL, 0, &sched_upreempt_pri, 0,
		CTL_CREATE, CTL_EOL);
}

/*
 * Debugging.
 */

#ifdef DDB

void
sched_print_runqueue(void (*pr)(const char *, ...))
{
	runqueue_t *ci_rq;
	struct cpu_info *ci, *tci;
	struct schedstate_percpu *spc;
	struct lwp *l;
	struct proc *p;
	CPU_INFO_ITERATOR cii;

	for (CPU_INFO_FOREACH(cii, ci)) {
		int i;

		spc = &ci->ci_schedstate;
		ci_rq = spc->spc_sched_info;

		(*pr)("Run-queue (CPU = %u):\n", ci->ci_index);
		(*pr)(" pid.lid = %d.%d, r_count = %u, r_avgcount = %u, "
		    "maxpri = %d, mlwp = %p\n",
#ifdef MULTIPROCESSOR
		    ci->ci_curlwp->l_proc->p_pid, ci->ci_curlwp->l_lid,
#else
		    curlwp->l_proc->p_pid, curlwp->l_lid,
#endif
		    ci_rq->r_count, ci_rq->r_avgcount, spc->spc_maxpriority,
		    spc->spc_migrating);
		i = (PRI_COUNT >> BITMAP_SHIFT) - 1;
		do {
			uint32_t q;
			q = ci_rq->r_bitmap[i];
			(*pr)(" bitmap[%d] => [ %d (0x%x) ]\n", i, ffs(q), q);
		} while (i--);
	}

	(*pr)(" %5s %4s %4s %10s %3s %18s %4s %4s %s\n",
	    "LID", "PRI", "EPRI", "FL", "ST", "LWP", "CPU", "TCI", "LRTICKS");

	PROCLIST_FOREACH(p, &allproc) {
		(*pr)(" /- %d (%s)\n", (int)p->p_pid, p->p_comm);
		LIST_FOREACH(l, &p->p_lwps, l_sibling) {
			ci = l->l_cpu;
			tci = l->l_target_cpu;
			(*pr)(" | %5d %4u %4u 0x%8.8x %3s %18p %4u %4d %u\n",
			    (int)l->l_lid, l->l_priority, lwp_eprio(l),
			    l->l_flag, l->l_stat == LSRUN ? "RQ" :
			    (l->l_stat == LSSLEEP ? "SQ" : "-"),
			    l, ci->ci_index, (tci ? tci->ci_index : -1),
			    (u_int)(hardclock_ticks - l->l_rticks));
		}
	}
}

#endif