1 | /* $NetBSD: sys_pset.c,v 1.19 2015/07/30 08:11:44 maxv Exp $ */ |
2 | |
3 | /* |
4 | * Copyright (c) 2008, Mindaugas Rasiukevicius <rmind at NetBSD org> |
5 | * All rights reserved. |
6 | * |
7 | * Redistribution and use in source and binary forms, with or without |
8 | * modification, are permitted provided that the following conditions |
9 | * are met: |
10 | * 1. Redistributions of source code must retain the above copyright |
11 | * notice, this list of conditions and the following disclaimer. |
12 | * 2. Redistributions in binary form must reproduce the above copyright |
13 | * notice, this list of conditions and the following disclaimer in the |
14 | * documentation and/or other materials provided with the distribution. |
15 | * |
16 | * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND |
17 | * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE |
18 | * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE |
19 | * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE |
20 | * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL |
21 | * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS |
22 | * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) |
23 | * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT |
24 | * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY |
25 | * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF |
26 | * SUCH DAMAGE. |
27 | */ |
28 | |
29 | /* |
30 | * Implementation of the Processor Sets. |
31 | * |
32 | * Locking |
 *	The array of the processor-set structures and its members are protected
 *	by the global cpu_lock.  Note that in the scheduler, the l_psid value
 *	itself might be read without the lock held.
36 | */ |
37 | |
38 | #include <sys/cdefs.h> |
39 | __KERNEL_RCSID(0, "$NetBSD: sys_pset.c,v 1.19 2015/07/30 08:11:44 maxv Exp $" ); |
40 | |
41 | #include <sys/param.h> |
42 | |
43 | #include <sys/cpu.h> |
44 | #include <sys/kauth.h> |
45 | #include <sys/kmem.h> |
46 | #include <sys/lwp.h> |
47 | #include <sys/mutex.h> |
48 | #include <sys/proc.h> |
49 | #include <sys/pset.h> |
50 | #include <sys/sched.h> |
51 | #include <sys/syscallargs.h> |
52 | #include <sys/sysctl.h> |
53 | #include <sys/systm.h> |
54 | #include <sys/types.h> |
55 | |
56 | static pset_info_t ** psets; |
57 | static u_int psets_max; |
58 | static u_int psets_count; |
59 | static kauth_listener_t psets_listener; |
60 | |
61 | static int psets_realloc(int); |
62 | static int psid_validate(psetid_t, bool); |
63 | static int kern_pset_create(psetid_t *); |
64 | static int kern_pset_destroy(psetid_t); |
65 | |
66 | static int |
67 | psets_listener_cb(kauth_cred_t cred, kauth_action_t action, void *cookie, |
68 | void *arg0, void *arg1, void *arg2, void *arg3) |
69 | { |
70 | psetid_t id; |
71 | enum kauth_system_req req; |
72 | int result; |
73 | |
74 | result = KAUTH_RESULT_DEFER; |
75 | req = (enum kauth_system_req)arg0; |
76 | id = (psetid_t)(unsigned long)arg1; |
77 | |
78 | if (action != KAUTH_SYSTEM_PSET) |
79 | return result; |
80 | |
81 | if ((req == KAUTH_REQ_SYSTEM_PSET_ASSIGN) || |
82 | (req == KAUTH_REQ_SYSTEM_PSET_BIND)) { |
83 | if (id == PS_QUERY) |
84 | result = KAUTH_RESULT_ALLOW; |
85 | } |
86 | |
87 | return result; |
88 | } |
89 | |
90 | /* |
91 | * Initialization of the processor-sets. |
92 | */ |
93 | void |
94 | psets_init(void) |
95 | { |
96 | |
97 | psets_max = max(maxcpus, 32); |
98 | psets = kmem_zalloc(psets_max * sizeof(void *), KM_SLEEP); |
99 | psets_count = 0; |
100 | |
101 | psets_listener = kauth_listen_scope(KAUTH_SCOPE_SYSTEM, |
102 | psets_listener_cb, NULL); |
103 | } |
104 | |
105 | /* |
106 | * Reallocate the array of the processor-set structures. |
107 | */ |
108 | static int |
109 | psets_realloc(int new_psets_max) |
110 | { |
111 | pset_info_t **new_psets, **old_psets; |
112 | const u_int newsize = new_psets_max * sizeof(void *); |
113 | u_int i, oldsize; |
114 | |
115 | if (new_psets_max < 1) |
116 | return EINVAL; |
117 | |
118 | new_psets = kmem_zalloc(newsize, KM_SLEEP); |
119 | mutex_enter(&cpu_lock); |
120 | old_psets = psets; |
121 | oldsize = psets_max * sizeof(void *); |
122 | |
123 | /* Check if we can lower the size of the array */ |
124 | if (new_psets_max < psets_max) { |
125 | for (i = new_psets_max; i < psets_max; i++) { |
126 | if (psets[i] == NULL) |
127 | continue; |
128 | mutex_exit(&cpu_lock); |
129 | kmem_free(new_psets, newsize); |
130 | return EBUSY; |
131 | } |
132 | } |
133 | |
134 | /* Copy all pointers to the new array */ |
135 | memcpy(new_psets, psets, newsize); |
136 | psets_max = new_psets_max; |
137 | psets = new_psets; |
138 | mutex_exit(&cpu_lock); |
139 | |
140 | kmem_free(old_psets, oldsize); |
141 | return 0; |
142 | } |
143 | |
144 | /* |
145 | * Validate processor-set ID. |
146 | */ |
147 | static int |
148 | psid_validate(psetid_t psid, bool chkps) |
149 | { |
150 | |
151 | KASSERT(mutex_owned(&cpu_lock)); |
152 | |
153 | if (chkps && (psid == PS_NONE || psid == PS_QUERY || psid == PS_MYID)) |
154 | return 0; |
155 | if (psid <= 0 || psid > psets_max) |
156 | return EINVAL; |
157 | if (psets[psid - 1] == NULL) |
158 | return EINVAL; |
159 | if (psets[psid - 1]->ps_flags & PSET_BUSY) |
160 | return EBUSY; |
161 | |
162 | return 0; |
163 | } |
164 | |
165 | /* |
166 | * Create a processor-set. |
167 | */ |
168 | static int |
169 | kern_pset_create(psetid_t *psid) |
170 | { |
171 | pset_info_t *pi; |
172 | u_int i; |
173 | |
174 | if (psets_count == psets_max) |
175 | return ENOMEM; |
176 | |
177 | pi = kmem_zalloc(sizeof(pset_info_t), KM_SLEEP); |
178 | |
179 | mutex_enter(&cpu_lock); |
180 | if (psets_count == psets_max) { |
181 | mutex_exit(&cpu_lock); |
182 | kmem_free(pi, sizeof(pset_info_t)); |
183 | return ENOMEM; |
184 | } |
185 | |
186 | /* Find a free entry in the array */ |
187 | for (i = 0; i < psets_max; i++) |
188 | if (psets[i] == NULL) |
189 | break; |
190 | KASSERT(i != psets_max); |
191 | |
192 | psets[i] = pi; |
193 | psets_count++; |
194 | mutex_exit(&cpu_lock); |
195 | |
196 | *psid = i + 1; |
197 | return 0; |
198 | } |
199 | |
200 | /* |
201 | * Destroy a processor-set. |
202 | */ |
203 | static int |
204 | kern_pset_destroy(psetid_t psid) |
205 | { |
206 | struct cpu_info *ci; |
207 | pset_info_t *pi; |
208 | struct lwp *l; |
209 | CPU_INFO_ITERATOR cii; |
210 | int error; |
211 | |
212 | mutex_enter(&cpu_lock); |
213 | if (psid == PS_MYID) { |
214 | /* Use caller's processor-set ID */ |
215 | psid = curlwp->l_psid; |
216 | } |
217 | error = psid_validate(psid, false); |
218 | if (error) { |
219 | mutex_exit(&cpu_lock); |
220 | return error; |
221 | } |
222 | |
223 | /* Release the processor-set from all CPUs */ |
224 | for (CPU_INFO_FOREACH(cii, ci)) { |
225 | struct schedstate_percpu *spc; |
226 | |
227 | spc = &ci->ci_schedstate; |
228 | if (spc->spc_psid != psid) |
229 | continue; |
230 | spc->spc_psid = PS_NONE; |
231 | } |
232 | /* Mark that processor-set is going to be destroyed */ |
233 | pi = psets[psid - 1]; |
234 | pi->ps_flags |= PSET_BUSY; |
235 | mutex_exit(&cpu_lock); |
236 | |
237 | /* Unmark the processor-set ID from each thread */ |
238 | mutex_enter(proc_lock); |
239 | LIST_FOREACH(l, &alllwp, l_list) { |
240 | /* Safe to check and set without lock held */ |
241 | if (l->l_psid != psid) |
242 | continue; |
243 | l->l_psid = PS_NONE; |
244 | } |
245 | mutex_exit(proc_lock); |
246 | |
247 | /* Destroy the processor-set */ |
248 | mutex_enter(&cpu_lock); |
249 | psets[psid - 1] = NULL; |
250 | psets_count--; |
251 | mutex_exit(&cpu_lock); |
252 | |
253 | kmem_free(pi, sizeof(pset_info_t)); |
254 | return 0; |
255 | } |
256 | |
257 | /* |
258 | * General system calls for the processor-sets. |
259 | */ |
260 | |
261 | int |
262 | sys_pset_create(struct lwp *l, const struct sys_pset_create_args *uap, |
263 | register_t *retval) |
264 | { |
265 | /* { |
266 | syscallarg(psetid_t) *psid; |
267 | } */ |
268 | psetid_t psid; |
269 | int error; |
270 | |
271 | /* Available only for super-user */ |
272 | if (kauth_authorize_system(l->l_cred, KAUTH_SYSTEM_PSET, |
273 | KAUTH_REQ_SYSTEM_PSET_CREATE, NULL, NULL, NULL)) |
274 | return EPERM; |
275 | |
276 | error = kern_pset_create(&psid); |
277 | if (error) |
278 | return error; |
279 | |
280 | error = copyout(&psid, SCARG(uap, psid), sizeof(psetid_t)); |
281 | if (error) |
282 | (void)kern_pset_destroy(psid); |
283 | |
284 | return error; |
285 | } |
286 | |
287 | int |
288 | sys_pset_destroy(struct lwp *l, const struct sys_pset_destroy_args *uap, |
289 | register_t *retval) |
290 | { |
291 | /* { |
292 | syscallarg(psetid_t) psid; |
293 | } */ |
294 | |
295 | /* Available only for super-user */ |
296 | if (kauth_authorize_system(l->l_cred, KAUTH_SYSTEM_PSET, |
297 | KAUTH_REQ_SYSTEM_PSET_DESTROY, |
298 | KAUTH_ARG(SCARG(uap, psid)), NULL, NULL)) |
299 | return EPERM; |
300 | |
301 | return kern_pset_destroy(SCARG(uap, psid)); |
302 | } |
303 | |
304 | int |
305 | sys_pset_assign(struct lwp *l, const struct sys_pset_assign_args *uap, |
306 | register_t *retval) |
307 | { |
308 | /* { |
309 | syscallarg(psetid_t) psid; |
310 | syscallarg(cpuid_t) cpuid; |
311 | syscallarg(psetid_t) *opsid; |
312 | } */ |
313 | struct cpu_info *ici, *ci = NULL; |
314 | struct schedstate_percpu *spc = NULL; |
315 | struct lwp *t; |
316 | psetid_t psid = SCARG(uap, psid), opsid = 0; |
317 | CPU_INFO_ITERATOR cii; |
318 | int error = 0, nnone = 0; |
319 | |
320 | /* Available only for super-user, except the case of PS_QUERY */ |
321 | if (kauth_authorize_system(l->l_cred, KAUTH_SYSTEM_PSET, |
322 | KAUTH_REQ_SYSTEM_PSET_ASSIGN, KAUTH_ARG(SCARG(uap, psid)), NULL, |
323 | NULL)) |
324 | return EPERM; |
325 | |
326 | /* Find the target CPU */ |
327 | mutex_enter(&cpu_lock); |
328 | for (CPU_INFO_FOREACH(cii, ici)) { |
329 | struct schedstate_percpu *ispc; |
330 | ispc = &ici->ci_schedstate; |
331 | if (cpu_index(ici) == SCARG(uap, cpuid)) { |
332 | ci = ici; |
333 | spc = ispc; |
334 | } |
335 | nnone += (ispc->spc_psid == PS_NONE); |
336 | } |
337 | if (ci == NULL) { |
338 | mutex_exit(&cpu_lock); |
339 | return EINVAL; |
340 | } |
341 | error = psid_validate(psid, true); |
342 | if (error) { |
343 | mutex_exit(&cpu_lock); |
344 | return error; |
345 | } |
346 | opsid = spc->spc_psid; |
347 | switch (psid) { |
348 | case PS_QUERY: |
349 | break; |
350 | case PS_MYID: |
351 | psid = curlwp->l_psid; |
352 | /* FALLTHROUGH */ |
353 | default: |
354 | /* |
355 | * Ensure at least one CPU stays in the default set, |
356 | * and that specified CPU is not offline. |
357 | */ |
358 | if (psid != PS_NONE && ((spc->spc_flags & SPCF_OFFLINE) || |
359 | (nnone == 1 && spc->spc_psid == PS_NONE))) { |
360 | mutex_exit(&cpu_lock); |
361 | return EBUSY; |
362 | } |
363 | mutex_enter(proc_lock); |
364 | /* |
365 | * Ensure that none of the threads are using affinity mask |
366 | * with this target CPU in it. |
367 | */ |
368 | LIST_FOREACH(t, &alllwp, l_list) { |
369 | if (t->l_affinity == NULL) { |
370 | continue; |
371 | } |
372 | lwp_lock(t); |
373 | if (t->l_affinity == NULL) { |
374 | lwp_unlock(t); |
375 | continue; |
376 | } |
377 | if (kcpuset_isset(t->l_affinity, cpu_index(ci))) { |
378 | lwp_unlock(t); |
379 | mutex_exit(proc_lock); |
380 | mutex_exit(&cpu_lock); |
381 | return EPERM; |
382 | } |
383 | lwp_unlock(t); |
384 | } |
385 | /* |
386 | * Set the processor-set ID. |
387 | * Migrate out any threads running on this CPU. |
388 | */ |
389 | spc->spc_psid = psid; |
390 | |
391 | LIST_FOREACH(t, &alllwp, l_list) { |
392 | struct cpu_info *tci; |
393 | if (t->l_cpu != ci) |
394 | continue; |
395 | if (t->l_pflag & (LP_BOUND | LP_INTR)) |
396 | continue; |
397 | lwp_lock(t); |
398 | tci = sched_takecpu(t); |
399 | KASSERT(tci != ci); |
400 | lwp_migrate(t, tci); |
401 | } |
402 | mutex_exit(proc_lock); |
403 | break; |
404 | } |
405 | mutex_exit(&cpu_lock); |
406 | |
407 | if (SCARG(uap, opsid) != NULL) |
408 | error = copyout(&opsid, SCARG(uap, opsid), sizeof(psetid_t)); |
409 | |
410 | return error; |
411 | } |
412 | |
413 | int |
414 | sys__pset_bind(struct lwp *l, const struct sys__pset_bind_args *uap, |
415 | register_t *retval) |
416 | { |
417 | /* { |
418 | syscallarg(idtype_t) idtype; |
419 | syscallarg(id_t) first_id; |
420 | syscallarg(id_t) second_id; |
421 | syscallarg(psetid_t) psid; |
422 | syscallarg(psetid_t) *opsid; |
423 | } */ |
424 | struct cpu_info *ci; |
425 | struct proc *p; |
426 | struct lwp *t; |
427 | id_t id1, id2; |
428 | pid_t pid = 0; |
429 | lwpid_t lid = 0; |
430 | psetid_t psid, opsid; |
431 | int error = 0, lcnt; |
432 | |
433 | psid = SCARG(uap, psid); |
434 | |
435 | /* Available only for super-user, except the case of PS_QUERY */ |
436 | if (kauth_authorize_system(l->l_cred, KAUTH_SYSTEM_PSET, |
437 | KAUTH_REQ_SYSTEM_PSET_BIND, KAUTH_ARG(SCARG(uap, psid)), NULL, |
438 | NULL)) |
439 | return EPERM; |
440 | |
441 | mutex_enter(&cpu_lock); |
442 | error = psid_validate(psid, true); |
443 | if (error) { |
444 | mutex_exit(&cpu_lock); |
445 | return error; |
446 | } |
447 | if (psid == PS_MYID) |
448 | psid = curlwp->l_psid; |
449 | if (psid != PS_QUERY && psid != PS_NONE) |
450 | psets[psid - 1]->ps_flags |= PSET_BUSY; |
451 | mutex_exit(&cpu_lock); |
452 | |
453 | /* |
454 | * Get PID and LID from the ID. |
455 | */ |
456 | p = l->l_proc; |
457 | id1 = SCARG(uap, first_id); |
458 | id2 = SCARG(uap, second_id); |
459 | |
460 | switch (SCARG(uap, idtype)) { |
461 | case P_PID: |
462 | /* |
463 | * Process: |
464 | * First ID - PID; |
465 | * Second ID - ignored; |
466 | */ |
467 | pid = (id1 == P_MYID) ? p->p_pid : id1; |
468 | lid = 0; |
469 | break; |
470 | case P_LWPID: |
471 | /* |
472 | * Thread (LWP): |
473 | * First ID - LID; |
474 | * Second ID - PID; |
475 | */ |
476 | if (id1 == P_MYID) { |
477 | pid = p->p_pid; |
478 | lid = l->l_lid; |
479 | break; |
480 | } |
481 | lid = id1; |
482 | pid = (id2 == P_MYID) ? p->p_pid : id2; |
483 | break; |
484 | default: |
485 | error = EINVAL; |
486 | goto error; |
487 | } |
488 | |
489 | /* Find the process */ |
490 | mutex_enter(proc_lock); |
491 | p = proc_find(pid); |
492 | if (p == NULL) { |
493 | mutex_exit(proc_lock); |
494 | error = ESRCH; |
495 | goto error; |
496 | } |
497 | mutex_enter(p->p_lock); |
498 | mutex_exit(proc_lock); |
499 | |
500 | /* Disallow modification of the system processes */ |
501 | if (p->p_flag & PK_SYSTEM) { |
502 | mutex_exit(p->p_lock); |
503 | error = EPERM; |
504 | goto error; |
505 | } |
506 | |
507 | /* Find the LWP(s) */ |
508 | lcnt = 0; |
509 | ci = NULL; |
510 | LIST_FOREACH(t, &p->p_lwps, l_sibling) { |
511 | if (lid && lid != t->l_lid) |
512 | continue; |
513 | /* |
514 | * Bind the thread to the processor-set, |
515 | * take some CPU and migrate. |
516 | */ |
517 | lwp_lock(t); |
518 | opsid = t->l_psid; |
519 | t->l_psid = psid; |
520 | ci = sched_takecpu(t); |
521 | /* Unlocks LWP */ |
522 | lwp_migrate(t, ci); |
523 | lcnt++; |
524 | } |
525 | mutex_exit(p->p_lock); |
526 | if (lcnt == 0) { |
527 | error = ESRCH; |
528 | goto error; |
529 | } |
530 | if (SCARG(uap, opsid)) |
531 | error = copyout(&opsid, SCARG(uap, opsid), sizeof(psetid_t)); |
532 | error: |
533 | if (psid != PS_QUERY && psid != PS_NONE) { |
534 | mutex_enter(&cpu_lock); |
535 | psets[psid - 1]->ps_flags &= ~PSET_BUSY; |
536 | mutex_exit(&cpu_lock); |
537 | } |
538 | return error; |
539 | } |
540 | |
541 | /* |
542 | * Sysctl nodes and initialization. |
543 | */ |
544 | |
545 | static int |
546 | sysctl_psets_max(SYSCTLFN_ARGS) |
547 | { |
548 | struct sysctlnode node; |
549 | int error, newsize; |
550 | |
551 | node = *rnode; |
552 | node.sysctl_data = &newsize; |
553 | |
554 | newsize = psets_max; |
555 | error = sysctl_lookup(SYSCTLFN_CALL(&node)); |
556 | if (error || newp == NULL) |
557 | return error; |
558 | |
559 | if (newsize <= 0) |
560 | return EINVAL; |
561 | |
562 | sysctl_unlock(); |
563 | error = psets_realloc(newsize); |
564 | sysctl_relock(); |
565 | return error; |
566 | } |
567 | |
568 | static int |
569 | sysctl_psets_list(SYSCTLFN_ARGS) |
570 | { |
571 | const size_t bufsz = 1024; |
572 | char *buf, tbuf[16]; |
573 | int i, error; |
574 | size_t len; |
575 | |
576 | sysctl_unlock(); |
577 | buf = kmem_alloc(bufsz, KM_SLEEP); |
578 | snprintf(buf, bufsz, "%d:1" , PS_NONE); /* XXX */ |
579 | |
580 | mutex_enter(&cpu_lock); |
581 | for (i = 0; i < psets_max; i++) { |
582 | if (psets[i] == NULL) |
583 | continue; |
584 | snprintf(tbuf, sizeof(tbuf), ",%d:2" , i + 1); /* XXX */ |
585 | strlcat(buf, tbuf, bufsz); |
586 | } |
587 | mutex_exit(&cpu_lock); |
588 | len = strlen(buf) + 1; |
589 | error = 0; |
590 | if (oldp != NULL) |
591 | error = copyout(buf, oldp, min(len, *oldlenp)); |
592 | *oldlenp = len; |
593 | kmem_free(buf, bufsz); |
594 | sysctl_relock(); |
595 | return error; |
596 | } |
597 | |
598 | SYSCTL_SETUP(sysctl_pset_setup, "sysctl kern.pset subtree setup" ) |
599 | { |
600 | const struct sysctlnode *node = NULL; |
601 | |
602 | sysctl_createv(clog, 0, NULL, &node, |
603 | CTLFLAG_PERMANENT, |
604 | CTLTYPE_NODE, "pset" , |
605 | SYSCTL_DESCR("Processor-set options" ), |
606 | NULL, 0, NULL, 0, |
607 | CTL_KERN, CTL_CREATE, CTL_EOL); |
608 | |
609 | if (node == NULL) |
610 | return; |
611 | |
612 | sysctl_createv(clog, 0, &node, NULL, |
613 | CTLFLAG_PERMANENT | CTLFLAG_READWRITE, |
614 | CTLTYPE_INT, "psets_max" , |
615 | SYSCTL_DESCR("Maximal count of the processor-sets" ), |
616 | sysctl_psets_max, 0, &psets_max, 0, |
617 | CTL_CREATE, CTL_EOL); |
618 | sysctl_createv(clog, 0, &node, NULL, |
619 | CTLFLAG_PERMANENT, |
620 | CTLTYPE_STRING, "list" , |
621 | SYSCTL_DESCR("List of active sets" ), |
622 | sysctl_psets_list, 0, NULL, 0, |
623 | CTL_CREATE, CTL_EOL); |
624 | } |
625 | |