/*	$NetBSD: sys_sched.c,v 1.46 2016/07/30 15:38:17 christos Exp $	*/

/*
 * Copyright (c) 2008, 2011 Mindaugas Rasiukevicius <rmind at NetBSD org>
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

/*
 * System calls relating to the scheduler.
 *
 * Lock order:
 *
 *	cpu_lock ->
 *	    proc_lock ->
 *		proc_t::p_lock ->
 *		    lwp_t::lwp_lock
 *
 * TODO:
 *  - Handle pthread_setschedprio() as defined by POSIX;
 *  - Handle sched_yield() case for SCHED_FIFO as defined by POSIX;
 */

#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: sys_sched.c,v 1.46 2016/07/30 15:38:17 christos Exp $");

#include <sys/param.h>

#include <sys/cpu.h>
#include <sys/kauth.h>
#include <sys/kmem.h>
#include <sys/lwp.h>
#include <sys/mutex.h>
#include <sys/proc.h>
#include <sys/pset.h>
#include <sys/sched.h>
#include <sys/syscallargs.h>
#include <sys/sysctl.h>
#include <sys/systm.h>
#include <sys/types.h>
#include <sys/unistd.h>

static struct sysctllog *sched_sysctl_log;
static kauth_listener_t sched_listener;

/*
 * Convert the user priority to an in-kernel priority, or convert the
 * current priority to the appropriate range when the policy changes.
 */
static pri_t
convert_pri(lwp_t *l, int policy, pri_t pri)
{

	/* Convert user priority to the in-kernel one */
	if (pri != PRI_NONE) {
		/* Only for real-time threads */
		KASSERT(pri >= SCHED_PRI_MIN && pri <= SCHED_PRI_MAX);
		KASSERT(policy != SCHED_OTHER);
		return PRI_USER_RT + pri;
	}

	/* Neither policy nor priority change */
	if (l->l_class == policy)
		return l->l_priority;

	/* Time-sharing -> real-time */
	if (l->l_class == SCHED_OTHER) {
		KASSERT(policy == SCHED_FIFO || policy == SCHED_RR);
		return PRI_USER_RT;
	}

	/* Real-time -> time-sharing */
	if (policy == SCHED_OTHER) {
		KASSERT(l->l_class == SCHED_FIFO || l->l_class == SCHED_RR);
		/*
		 * This is somewhat arbitrary because the priority is
		 * dynamic for SCHED_OTHER threads and the scheduler
		 * will likely change it again soon anyway.
		 */
		return l->l_priority - PRI_USER_RT;
	}

	/* Real-time -> real-time */
	return l->l_priority;
}

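/*
 * For example, in terms of the constants above: switching a
 * SCHED_OTHER thread to SCHED_FIFO with no explicit priority yields
 * PRI_USER_RT, while an explicit user priority p in the range
 * [SCHED_PRI_MIN, SCHED_PRI_MAX] maps to PRI_USER_RT + p.
 */

/*
 * do_sched_setparam:
 *
 *	Set the scheduling class and/or priority of the LWP(s) selected
 *	by pid/lid.  A pid of 0 means the calling process; a lid of 0
 *	means every LWP in the process.  SCHED_NONE and PRI_NONE mean
 *	"keep the current value".  Returns ESRCH if no LWP matched.
 */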
int
do_sched_setparam(pid_t pid, lwpid_t lid, int policy,
    const struct sched_param *params)
{
	struct proc *p;
	struct lwp *t;
	pri_t pri;
	u_int lcnt;
	int error;

	error = 0;

	pri = params->sched_priority;

	/* If no parameters were specified, just return (should not happen) */
	if (pri == PRI_NONE && policy == SCHED_NONE)
		return 0;

	/* Validate scheduling class */
	if (policy != SCHED_NONE && (policy < SCHED_OTHER || policy > SCHED_RR))
		return EINVAL;

	/* Validate priority */
	if (pri != PRI_NONE && (pri < SCHED_PRI_MIN || pri > SCHED_PRI_MAX))
		return EINVAL;

	if (pid != 0) {
		/* Find the process */
		mutex_enter(proc_lock);
		p = proc_find(pid);
		if (p == NULL) {
			mutex_exit(proc_lock);
			return ESRCH;
		}
		mutex_enter(p->p_lock);
		mutex_exit(proc_lock);
		/* Disallow modification of system processes */
		if ((p->p_flag & PK_SYSTEM) != 0) {
			mutex_exit(p->p_lock);
			return EPERM;
		}
	} else {
		/* Use the calling process */
		p = curlwp->l_proc;
		mutex_enter(p->p_lock);
	}

	/* Find the LWP(s) */
	lcnt = 0;
	LIST_FOREACH(t, &p->p_lwps, l_sibling) {
		pri_t kpri;
		int lpolicy;

		if (lid && lid != t->l_lid)
			continue;

		lcnt++;
		lwp_lock(t);
		lpolicy = (policy == SCHED_NONE) ? t->l_class : policy;

		/* Disallow setting the priority of SCHED_OTHER threads */
		if (lpolicy == SCHED_OTHER && pri != PRI_NONE) {
			lwp_unlock(t);
			error = EINVAL;
			break;
		}

		/* Convert the priority, if needed */
		kpri = convert_pri(t, lpolicy, pri);

		/* Check the permission */
		error = kauth_authorize_process(kauth_cred_get(),
		    KAUTH_PROCESS_SCHEDULER_SETPARAM, p, t, KAUTH_ARG(lpolicy),
		    KAUTH_ARG(kpri));
		if (error) {
			lwp_unlock(t);
			break;
		}

		/* Set the scheduling class and change the priority */
		t->l_class = lpolicy;
		lwp_changepri(t, kpri);
		lwp_unlock(t);
	}
	mutex_exit(p->p_lock);
	return (lcnt == 0) ? ESRCH : error;
}

/*
 * Set scheduling parameters.
 */
int
sys__sched_setparam(struct lwp *l, const struct sys__sched_setparam_args *uap,
    register_t *retval)
{
	/* {
		syscallarg(pid_t) pid;
		syscallarg(lwpid_t) lid;
		syscallarg(int) policy;
		syscallarg(const struct sched_param *) params;
	} */
	struct sched_param params;
	int error;

	/* Get the parameters from userspace */
	error = copyin(SCARG(uap, params), &params, sizeof(params));
	if (error)
		goto out;

	error = do_sched_setparam(SCARG(uap, pid), SCARG(uap, lid),
	    SCARG(uap, policy), &params);
out:
	return error;
}

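/*
 * Illustrative userland usage (not part of this file): requests
 * normally arrive here via the POSIX wrappers, along the lines of:
 *
 *	struct sched_param sp;
 *	sp.sched_priority = sched_get_priority_min(SCHED_RR);
 *	error = pthread_setschedparam(pthread_self(), SCHED_RR, &sp);
 */
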
/*
 * do_sched_getparam:
 *
 *	If lid == 0, return the parameters of the first LWP in the process.
 */
int
do_sched_getparam(pid_t pid, lwpid_t lid, int *policy,
    struct sched_param *params)
{
	struct sched_param lparams;
	struct lwp *t;
	int error, lpolicy;

	t = lwp_find2(pid, lid);	/* acquires p_lock */
	if (t == NULL)
		return ESRCH;

	/* Check the permission */
	error = kauth_authorize_process(kauth_cred_get(),
	    KAUTH_PROCESS_SCHEDULER_GETPARAM, t->l_proc, NULL, NULL, NULL);
	if (error != 0) {
		mutex_exit(t->l_proc->p_lock);
		return error;
	}

	lwp_lock(t);
	lparams.sched_priority = t->l_priority;
	lpolicy = t->l_class;
	lwp_unlock(t);
	mutex_exit(t->l_proc->p_lock);

	/*
	 * Convert to the user-visible priority value.
	 * It is the inverse of convert_pri().
	 *
	 * The SCHED_OTHER case is somewhat arbitrary given that
	 * - we do not allow setting the priority;
	 * - the priority is dynamic.
	 */
	switch (lpolicy) {
	case SCHED_OTHER:
		lparams.sched_priority -= PRI_USER;
		break;
	case SCHED_RR:
	case SCHED_FIFO:
		lparams.sched_priority -= PRI_USER_RT;
		break;
	}

	if (policy != NULL)
		*policy = lpolicy;

	if (params != NULL)
		*params = lparams;

	return error;
}

/*
 * Get scheduling parameters.
 */
int
sys__sched_getparam(struct lwp *l, const struct sys__sched_getparam_args *uap,
    register_t *retval)
{
	/* {
		syscallarg(pid_t) pid;
		syscallarg(lwpid_t) lid;
		syscallarg(int *) policy;
		syscallarg(struct sched_param *) params;
	} */
	struct sched_param params;
	int error, policy;

	error = do_sched_getparam(SCARG(uap, pid), SCARG(uap, lid), &policy,
	    &params);
	if (error)
		goto out;

	error = copyout(&params, SCARG(uap, params), sizeof(params));
	if (error == 0 && SCARG(uap, policy) != NULL)
		error = copyout(&policy, SCARG(uap, policy), sizeof(int));
out:
	return error;
}

/*
 * Allocate a CPU set and fill it in from userspace.  On success, the
 * caller receives a kcpuset with one reference, which must eventually
 * be dropped with kcpuset_unuse().
 */
static int
genkcpuset(kcpuset_t **dset, const cpuset_t *sset, size_t size)
{
	kcpuset_t *kset;
	int error;

	kcpuset_create(&kset, true);
	error = kcpuset_copyin(sset, kset, size);
	if (error) {
		kcpuset_unuse(kset, NULL);
	} else {
		*dset = kset;
	}
	return error;
}

/*
 * Set affinity.
 */
int
sys__sched_setaffinity(struct lwp *l,
    const struct sys__sched_setaffinity_args *uap, register_t *retval)
{
	/* {
		syscallarg(pid_t) pid;
		syscallarg(lwpid_t) lid;
		syscallarg(size_t) size;
		syscallarg(const cpuset_t *) cpuset;
	} */
	kcpuset_t *kcset, *kcpulst = NULL;
	struct cpu_info *ici, *ci;
	struct proc *p;
	struct lwp *t;
	CPU_INFO_ITERATOR cii;
	bool alloff;
	lwpid_t lid;
	u_int lcnt;
	int error;

	error = genkcpuset(&kcset, SCARG(uap, cpuset), SCARG(uap, size));
	if (error)
		return error;

	/*
	 * Traverse _each_ CPU to:
	 *  - Check that CPUs in the mask have no assigned processor set.
	 *  - Check that at least one CPU from the mask is online.
	 *  - Find the first target CPU to migrate to.
	 *
	 * To avoid races with CPU online/offline calls and processor sets,
	 * cpu_lock will be held for the entire operation.
	 */
	ci = NULL;
	alloff = false;
	mutex_enter(&cpu_lock);
	for (CPU_INFO_FOREACH(cii, ici)) {
		struct schedstate_percpu *ispc;

		if (!kcpuset_isset(kcset, cpu_index(ici))) {
			continue;
		}

		ispc = &ici->ci_schedstate;
		/* Check that the CPU is not in a processor-set */
		if (ispc->spc_psid != PS_NONE) {
			error = EPERM;
			goto out;
		}
		/* Skip offline CPUs */
		if (ispc->spc_flags & SPCF_OFFLINE) {
			alloff = true;
			continue;
		}
		/* Target CPU to migrate to */
		if (ci == NULL) {
			ci = ici;
		}
	}
	if (ci == NULL) {
		if (alloff) {
			/* All CPUs in the set are offline */
			error = EPERM;
			goto out;
		}
		/* Empty set */
		kcpuset_unuse(kcset, &kcpulst);
		kcset = NULL;
	}

	if (SCARG(uap, pid) != 0) {
		/* Find the process */
		mutex_enter(proc_lock);
		p = proc_find(SCARG(uap, pid));
		if (p == NULL) {
			mutex_exit(proc_lock);
			error = ESRCH;
			goto out;
		}
		mutex_enter(p->p_lock);
		mutex_exit(proc_lock);
		/* Disallow modification of system processes. */
		if ((p->p_flag & PK_SYSTEM) != 0) {
			mutex_exit(p->p_lock);
			error = EPERM;
			goto out;
		}
	} else {
		/* Use the calling process */
		p = l->l_proc;
		mutex_enter(p->p_lock);
	}

	/*
	 * Check the permission.
	 */
	error = kauth_authorize_process(l->l_cred,
	    KAUTH_PROCESS_SCHEDULER_SETAFFINITY, p, NULL, NULL, NULL);
	if (error != 0) {
		mutex_exit(p->p_lock);
		goto out;
	}

	/* Iterate through LWP(s). */
	lcnt = 0;
	lid = SCARG(uap, lid);
	LIST_FOREACH(t, &p->p_lwps, l_sibling) {
		if (lid && lid != t->l_lid) {
			continue;
		}
		lwp_lock(t);
		/* No affinity for zombie LWPs. */
		if (t->l_stat == LSZOMB) {
			lwp_unlock(t);
			continue;
		}
		/* First, release the existing affinity, if any. */
		if (t->l_affinity) {
			kcpuset_unuse(t->l_affinity, &kcpulst);
		}
		if (kcset) {
			/*
			 * Hold a reference on the affinity mask, assign
			 * the mask to the LWP and migrate it to another
			 * CPU (this unlocks the LWP).
			 */
			kcpuset_use(kcset);
			t->l_affinity = kcset;
			lwp_migrate(t, ci);
		} else {
			/* Old affinity mask is released, just clear. */
			t->l_affinity = NULL;
			lwp_unlock(t);
		}
		lcnt++;
	}
	mutex_exit(p->p_lock);
	if (lcnt == 0) {
		error = ESRCH;
	}
out:
	mutex_exit(&cpu_lock);

	/*
	 * Drop the initial reference (the LWPs, if any, now own it), and
	 * destroy whatever is in the G/C list, if filled.
	 */
	if (kcset) {
		kcpuset_unuse(kcset, &kcpulst);
	}
	if (kcpulst) {
		kcpuset_destroy(kcpulst);
	}
	return error;
}

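/*
 * Illustrative userland sketch (hypothetical values): binding the
 * calling process to CPU 0 through the native cpuset(3)/affinity(3)
 * API, which is how requests typically reach the syscall above:
 *
 *	cpuset_t *cs = cpuset_create();
 *	cpuset_zero(cs);
 *	cpuset_set(0, cs);
 *	error = sched_setaffinity_np(getpid(), cpuset_size(cs), cs);
 *	cpuset_destroy(cs);
 */
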
/*
 * Get affinity.
 */
int
sys__sched_getaffinity(struct lwp *l,
    const struct sys__sched_getaffinity_args *uap, register_t *retval)
{
	/* {
		syscallarg(pid_t) pid;
		syscallarg(lwpid_t) lid;
		syscallarg(size_t) size;
		syscallarg(cpuset_t *) cpuset;
	} */
	struct lwp *t;
	kcpuset_t *kcset;
	int error;

	error = genkcpuset(&kcset, SCARG(uap, cpuset), SCARG(uap, size));
	if (error)
		return error;

	/* Locks the LWP's process (p_lock) */
	t = lwp_find2(SCARG(uap, pid), SCARG(uap, lid));
	if (t == NULL) {
		error = ESRCH;
		goto out;
	}
	/* Check the permission */
	if (kauth_authorize_process(l->l_cred,
	    KAUTH_PROCESS_SCHEDULER_GETAFFINITY, t->l_proc, NULL, NULL, NULL)) {
		mutex_exit(t->l_proc->p_lock);
		error = EPERM;
		goto out;
	}
	lwp_lock(t);
	if (t->l_affinity) {
		kcpuset_copy(kcset, t->l_affinity);
	} else {
		kcpuset_zero(kcset);
	}
	lwp_unlock(t);
	mutex_exit(t->l_proc->p_lock);

	error = kcpuset_copyout(kcset, SCARG(uap, cpuset), SCARG(uap, size));
out:
	kcpuset_unuse(kcset, NULL);
	return error;
}

/*
 * Priority protection for PTHREAD_PRIO_PROTECT.  This is a weak
 * analogue of priority inheritance: it temporarily raises the
 * priority of the caller while a protected resource is being
 * accessed.  Calls nest via l_protectdepth; a priority of -1 pops
 * one protection level, and any other negative value queries the
 * current protection priority.
 */
int
sys__sched_protect(struct lwp *l,
    const struct sys__sched_protect_args *uap, register_t *retval)
{
	/* {
		syscallarg(int) priority;
		syscallarg(int *) opriority;
	} */
	int error;
	pri_t pri;

	KASSERT(l->l_inheritedprio == -1);
	KASSERT(l->l_auxprio == -1 || l->l_auxprio == l->l_protectprio);

	pri = SCARG(uap, priority);
	error = 0;
	lwp_lock(l);
	if (pri == -1) {
		/* Back out priority changes */
		switch (l->l_protectdepth) {
		case 0:
			error = EINVAL;
			break;
		case 1:
			l->l_protectdepth = 0;
			l->l_protectprio = -1;
			l->l_auxprio = -1;
			break;
		default:
			l->l_protectdepth--;
			break;
		}
	} else if (pri < 0) {
		/* Just retrieve the current value, for debugging */
		if (l->l_protectprio == -1)
			error = ENOENT;
		else
			*retval = l->l_protectprio - PRI_USER_RT;
	} else if (__predict_false(pri < SCHED_PRI_MIN ||
	    pri > SCHED_PRI_MAX || l->l_priority > pri + PRI_USER_RT)) {
		/* Must fail if the existing priority is higher */
		error = EPERM;
	} else {
		/* Play along, but make no changes, if not a real-time LWP. */
		l->l_protectdepth++;
		pri += PRI_USER_RT;
		if (__predict_true(l->l_class != SCHED_OTHER &&
		    pri > l->l_protectprio)) {
			l->l_protectprio = pri;
			l->l_auxprio = pri;
		}
	}
	lwp_unlock(l);

	return error;
}

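/*
 * Illustrative usage (hypothetical userland wrapper): an
 * implementation of PTHREAD_PRIO_PROTECT would raise the priority to
 * the mutex's ceiling on lock, and pass -1 on unlock to pop one
 * protection level:
 *
 *	_sched_protect(ceiling, NULL);		raise to the ceiling
 *	... access the protected resource ...
 *	_sched_protect(-1, NULL);		back out one level
 */
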
/*
 * Yield.
 */
int
sys_sched_yield(struct lwp *l, const void *v, register_t *retval)
{

	yield();
	return 0;
}

/*
 * Sysctl nodes and initialization.
 */
static void
sysctl_sched_setup(struct sysctllog **clog)
{
	const struct sysctlnode *node = NULL;

	sysctl_createv(clog, 0, NULL, NULL,
	    CTLFLAG_PERMANENT|CTLFLAG_IMMEDIATE,
	    CTLTYPE_INT, "posix_sched",
	    SYSCTL_DESCR("Version of IEEE Std 1003.1 and its "
		"Process Scheduling option to which the "
		"system attempts to conform"),
	    NULL, _POSIX_PRIORITY_SCHEDULING, NULL, 0,
	    CTL_KERN, CTL_CREATE, CTL_EOL);
	sysctl_createv(clog, 0, NULL, &node,
	    CTLFLAG_PERMANENT,
	    CTLTYPE_NODE, "sched",
	    SYSCTL_DESCR("Scheduler options"),
	    NULL, 0, NULL, 0,
	    CTL_KERN, CTL_CREATE, CTL_EOL);

	if (node == NULL)
		return;

	sysctl_createv(clog, 0, &node, NULL,
	    CTLFLAG_PERMANENT | CTLFLAG_IMMEDIATE,
	    CTLTYPE_INT, "pri_min",
	    SYSCTL_DESCR("Minimum POSIX real-time priority"),
	    NULL, SCHED_PRI_MIN, NULL, 0,
	    CTL_CREATE, CTL_EOL);
	sysctl_createv(clog, 0, &node, NULL,
	    CTLFLAG_PERMANENT | CTLFLAG_IMMEDIATE,
	    CTLTYPE_INT, "pri_max",
	    SYSCTL_DESCR("Maximum POSIX real-time priority"),
	    NULL, SCHED_PRI_MAX, NULL, 0,
	    CTL_CREATE, CTL_EOL);
}

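/*
 * sched_listener_cb:
 *
 *	kauth(9) listener for the process scope.  Allow reading the
 *	scheduling parameters of processes owned by the caller; allow
 *	setting them only when the request neither raises the priority
 *	nor switches the LWP to a new real-time class; allow affinity
 *	queries for everyone; defer affinity changes to the secmodel.
 */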
static int
sched_listener_cb(kauth_cred_t cred, kauth_action_t action, void *cookie,
    void *arg0, void *arg1, void *arg2, void *arg3)
{
	struct proc *p;
	int result;

	result = KAUTH_RESULT_DEFER;
	p = arg0;

	switch (action) {
	case KAUTH_PROCESS_SCHEDULER_GETPARAM:
		if (kauth_cred_uidmatch(cred, p->p_cred))
			result = KAUTH_RESULT_ALLOW;
		break;

	case KAUTH_PROCESS_SCHEDULER_SETPARAM:
		if (kauth_cred_uidmatch(cred, p->p_cred)) {
			struct lwp *l;
			int policy;
			pri_t priority;

			l = arg1;
			policy = (int)(unsigned long)arg2;
			priority = (pri_t)(unsigned long)arg3;

			if ((policy == l->l_class ||
			    (policy != SCHED_FIFO && policy != SCHED_RR)) &&
			    priority <= l->l_priority)
				result = KAUTH_RESULT_ALLOW;
		}

		break;

	case KAUTH_PROCESS_SCHEDULER_GETAFFINITY:
		result = KAUTH_RESULT_ALLOW;
		break;

	case KAUTH_PROCESS_SCHEDULER_SETAFFINITY:
		/* Privileged; we let the secmodel handle this. */
		break;

	default:
		break;
	}

	return result;
}

void
sched_init(void)
{

	sysctl_sched_setup(&sched_sysctl_log);

	sched_listener = kauth_listen_scope(KAUTH_SCOPE_PROCESS,
	    sched_listener_cb, NULL);
}