/*	$NetBSD: sysv_shm.c,v 1.131 2015/11/26 13:15:34 martin Exp $	*/

/*-
 * Copyright (c) 1999, 2007 The NetBSD Foundation, Inc.
 * All rights reserved.
 *
 * This code is derived from software contributed to The NetBSD Foundation
 * by Jason R. Thorpe of the Numerical Aerospace Simulation Facility,
 * NASA Ames Research Center, and by Mindaugas Rasiukevicius.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */

/*
 * Copyright (c) 1994 Adam Glass and Charles M. Hannum. All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. All advertising materials mentioning features or use of this software
 *    must display the following acknowledgement:
 *	This product includes software developed by Adam Glass and Charles M.
 *	Hannum.
 * 4. The names of the authors may not be used to endorse or promote products
 *    derived from this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHORS ``AS IS'' AND ANY EXPRESS OR
 * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
 * IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
 * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
 * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: sysv_shm.c,v 1.131 2015/11/26 13:15:34 martin Exp $");

#ifdef _KERNEL_OPT
#include "opt_sysv.h"
#endif

#include <sys/param.h>
#include <sys/kernel.h>
#include <sys/kmem.h>
#include <sys/shm.h>
#include <sys/mutex.h>
#include <sys/mman.h>
#include <sys/stat.h>
#include <sys/sysctl.h>
#include <sys/mount.h>	/* XXX for <sys/syscallargs.h> */
#include <sys/syscallargs.h>
#include <sys/queue.h>
#include <sys/kauth.h>

#include <uvm/uvm_extern.h>
#include <uvm/uvm_object.h>

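/*
 * Per-process record of a single attached segment: the virtual address
 * at which it is mapped and the segment identifier.
 */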
struct shmmap_entry {
	SLIST_ENTRY(shmmap_entry) next;
	vaddr_t va;
	int shmid;
};

int shm_nused __cacheline_aligned;
struct shmid_ds * shmsegs __read_mostly;

static kmutex_t shm_lock __cacheline_aligned;
static kcondvar_t * shm_cv __cacheline_aligned;
static int shm_last_free __cacheline_aligned;
static size_t shm_committed __cacheline_aligned;
static int shm_use_phys __read_mostly;

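/*
 * State used to synchronise resizing of the segment array (shmrealloc):
 * attach/detach operations in flight disable reallocation via
 * shm_realloc_disable, and other callers block on shm_realloc_cv while
 * shm_realloc_state is set.
 */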
static kcondvar_t shm_realloc_cv;
static bool shm_realloc_state;
static u_int shm_realloc_disable;

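/*
 * Per-vmspace shm map: the list of attached segments, the number of
 * entries and a reference count (the map may be shared across fork
 * until it is modified).
 */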
struct shmmap_state {
	unsigned int nitems;
	unsigned int nrefs;
	SLIST_HEAD(, shmmap_entry) entries;
};

extern int kern_has_sysvshm;

SYSCTL_SETUP_PROTO(sysctl_ipc_shm_setup);

#ifdef SHMDEBUG
#define SHMPRINTF(a) printf a
#else
#define SHMPRINTF(a)
#endif

static int shmrealloc(int);

/*
 * Find the shared memory segment by the identifier.
 *  => must be called with shm_lock held;
 */
static struct shmid_ds *
shm_find_segment_by_shmid(int shmid)
{
	int segnum;
	struct shmid_ds *shmseg;

	KASSERT(mutex_owned(&shm_lock));

	segnum = IPCID_TO_IX(shmid);
	if (segnum < 0 || segnum >= shminfo.shmmni)
		return NULL;
	shmseg = &shmsegs[segnum];
	if ((shmseg->shm_perm.mode & SHMSEG_ALLOCATED) == 0)
		return NULL;
	if ((shmseg->shm_perm.mode &
	    (SHMSEG_REMOVED|SHMSEG_RMLINGER)) == SHMSEG_REMOVED)
		return NULL;
	if (shmseg->shm_perm._seq != IPCID_TO_SEQ(shmid))
		return NULL;

	return shmseg;
}

/*
 * Free memory segment.
 *  => must be called with shm_lock held;
 */
static void
shm_free_segment(int segnum)
{
	struct shmid_ds *shmseg;
	size_t size;
	bool wanted;

	KASSERT(mutex_owned(&shm_lock));

	shmseg = &shmsegs[segnum];
	SHMPRINTF(("shm freeing key 0x%lx seq 0x%x\n",
	    shmseg->shm_perm._key, shmseg->shm_perm._seq));

	size = (shmseg->shm_segsz + PGOFSET) & ~PGOFSET;
	wanted = (shmseg->shm_perm.mode & SHMSEG_WANTED);

	shmseg->_shm_internal = NULL;
	shm_committed -= btoc(size);
	shm_nused--;
	shmseg->shm_perm.mode = SHMSEG_FREE;
	shm_last_free = segnum;
	if (wanted == true)
		cv_broadcast(&shm_cv[segnum]);
}

/*
 * Delete entry from the shm map.
 *  => must be called with shm_lock held;
 */
static struct uvm_object *
shm_delete_mapping(struct shmmap_state *shmmap_s,
    struct shmmap_entry *shmmap_se)
{
	struct uvm_object *uobj = NULL;
	struct shmid_ds *shmseg;
	int segnum;

	KASSERT(mutex_owned(&shm_lock));

	segnum = IPCID_TO_IX(shmmap_se->shmid);
	shmseg = &shmsegs[segnum];
	SLIST_REMOVE(&shmmap_s->entries, shmmap_se, shmmap_entry, next);
	shmmap_s->nitems--;
	shmseg->shm_dtime = time_second;
	if ((--shmseg->shm_nattch <= 0) &&
	    (shmseg->shm_perm.mode & SHMSEG_REMOVED)) {
		uobj = shmseg->_shm_internal;
		shm_free_segment(segnum);
	}

	return uobj;
}

/*
 * Get a non-shared shm map for that vmspace.  Note that the memory
 * allocation may be performed with the lock held.
 */
static struct shmmap_state *
shmmap_getprivate(struct proc *p)
{
	struct shmmap_state *oshmmap_s, *shmmap_s;
	struct shmmap_entry *oshmmap_se, *shmmap_se;

	KASSERT(mutex_owned(&shm_lock));

	/* 1. A shm map with refcnt = 1 is used only by us - return it */
	oshmmap_s = (struct shmmap_state *)p->p_vmspace->vm_shm;
	if (oshmmap_s && oshmmap_s->nrefs == 1)
		return oshmmap_s;

	/* 2. No shm map present - create a fresh one */
	shmmap_s = kmem_zalloc(sizeof(struct shmmap_state), KM_SLEEP);
	shmmap_s->nrefs = 1;
	SLIST_INIT(&shmmap_s->entries);
	p->p_vmspace->vm_shm = (void *)shmmap_s;

	if (oshmmap_s == NULL)
		return shmmap_s;

	SHMPRINTF(("shmmap_getprivate: vm %p split (%d entries), was used by %d\n",
	    p->p_vmspace, oshmmap_s->nitems, oshmmap_s->nrefs));

	/* 3. A shared shm map, copy to a fresh one and adjust refcounts */
	SLIST_FOREACH(oshmmap_se, &oshmmap_s->entries, next) {
		shmmap_se = kmem_alloc(sizeof(struct shmmap_entry), KM_SLEEP);
		shmmap_se->va = oshmmap_se->va;
		shmmap_se->shmid = oshmmap_se->shmid;
		SLIST_INSERT_HEAD(&shmmap_s->entries, shmmap_se, next);
	}
	shmmap_s->nitems = oshmmap_s->nitems;
	oshmmap_s->nrefs--;

	return shmmap_s;
}

/*
 * Lock/unlock the memory.
 *  => must be called with shm_lock held;
 *  => called from one place, thus, inline;
 */
static inline int
shm_memlock(struct lwp *l, struct shmid_ds *shmseg, int shmid, int cmd)
{
	struct proc *p = l->l_proc;
	struct shmmap_entry *shmmap_se;
	struct shmmap_state *shmmap_s;
	size_t size;
	int error;

	KASSERT(mutex_owned(&shm_lock));
	shmmap_s = shmmap_getprivate(p);

	/* Find our shared memory address by shmid */
	SLIST_FOREACH(shmmap_se, &shmmap_s->entries, next) {
		if (shmmap_se->shmid != shmid)
			continue;

		size = (shmseg->shm_segsz + PGOFSET) & ~PGOFSET;

		if (cmd == SHM_LOCK &&
		    (shmseg->shm_perm.mode & SHMSEG_WIRED) == 0) {
			/* Wire the object and map, then tag it */
			error = uvm_obj_wirepages(shmseg->_shm_internal,
			    0, size, NULL);
			if (error)
				return EIO;
			error = uvm_map_pageable(&p->p_vmspace->vm_map,
			    shmmap_se->va, shmmap_se->va + size, false, 0);
			if (error) {
				uvm_obj_unwirepages(shmseg->_shm_internal,
				    0, size);
				if (error == EFAULT)
					error = ENOMEM;
				return error;
			}
			shmseg->shm_perm.mode |= SHMSEG_WIRED;

		} else if (cmd == SHM_UNLOCK &&
		    (shmseg->shm_perm.mode & SHMSEG_WIRED) != 0) {
			/* Unwire the object and map, then untag it */
			uvm_obj_unwirepages(shmseg->_shm_internal, 0, size);
			error = uvm_map_pageable(&p->p_vmspace->vm_map,
			    shmmap_se->va, shmmap_se->va + size, true, 0);
			if (error)
				return EIO;
			shmseg->shm_perm.mode &= ~SHMSEG_WIRED;
		}
	}

	return 0;
}

/*
 * Unmap shared memory.
 */
int
sys_shmdt(struct lwp *l, const struct sys_shmdt_args *uap, register_t *retval)
{
	/* {
		syscallarg(const void *) shmaddr;
	} */
	struct proc *p = l->l_proc;
	struct shmmap_state *shmmap_s1, *shmmap_s;
	struct shmmap_entry *shmmap_se;
	struct uvm_object *uobj;
	struct shmid_ds *shmseg;
	size_t size;

	mutex_enter(&shm_lock);
	/* In case of reallocation, we will wait for completion */
	while (__predict_false(shm_realloc_state))
		cv_wait(&shm_realloc_cv, &shm_lock);

	shmmap_s1 = (struct shmmap_state *)p->p_vmspace->vm_shm;
	if (shmmap_s1 == NULL) {
		mutex_exit(&shm_lock);
		return EINVAL;
	}

	/* Find the map entry */
	SLIST_FOREACH(shmmap_se, &shmmap_s1->entries, next)
		if (shmmap_se->va == (vaddr_t)SCARG(uap, shmaddr))
			break;
	if (shmmap_se == NULL) {
		mutex_exit(&shm_lock);
		return EINVAL;
	}

	shmmap_s = shmmap_getprivate(p);
	if (shmmap_s != shmmap_s1) {
		/* Map has been copied, lookup entry in new map */
		SLIST_FOREACH(shmmap_se, &shmmap_s->entries, next)
			if (shmmap_se->va == (vaddr_t)SCARG(uap, shmaddr))
				break;
		if (shmmap_se == NULL) {
			mutex_exit(&shm_lock);
			return EINVAL;
		}
	}

	SHMPRINTF(("shmdt: vm %p: remove %d @%lx\n",
	    p->p_vmspace, shmmap_se->shmid, shmmap_se->va));

	/* Delete the entry from shm map */
	uobj = shm_delete_mapping(shmmap_s, shmmap_se);
	shmseg = &shmsegs[IPCID_TO_IX(shmmap_se->shmid)];
	size = (shmseg->shm_segsz + PGOFSET) & ~PGOFSET;
	mutex_exit(&shm_lock);

	uvm_deallocate(&p->p_vmspace->vm_map, shmmap_se->va, size);
	if (uobj != NULL) {
		uao_detach(uobj);
	}
	kmem_free(shmmap_se, sizeof(struct shmmap_entry));

	return 0;
}

/*
 * Map shared memory.
 */
int
sys_shmat(struct lwp *l, const struct sys_shmat_args *uap, register_t *retval)
{
	/* {
		syscallarg(int) shmid;
		syscallarg(const void *) shmaddr;
		syscallarg(int) shmflg;
	} */
	int error, flags = 0;
	struct proc *p = l->l_proc;
	kauth_cred_t cred = l->l_cred;
	struct shmid_ds *shmseg;
	struct shmmap_state *shmmap_s;
	struct shmmap_entry *shmmap_se;
	struct uvm_object *uobj;
	struct vmspace *vm;
	vaddr_t attach_va;
	vm_prot_t prot;
	vsize_t size;

	/* Allocate a new map entry and set it */
	shmmap_se = kmem_alloc(sizeof(struct shmmap_entry), KM_SLEEP);
	shmmap_se->shmid = SCARG(uap, shmid);

	mutex_enter(&shm_lock);
	/* In case of reallocation, we will wait for completion */
	while (__predict_false(shm_realloc_state))
		cv_wait(&shm_realloc_cv, &shm_lock);

	shmseg = shm_find_segment_by_shmid(SCARG(uap, shmid));
	if (shmseg == NULL) {
		error = EINVAL;
		goto err;
	}
	error = ipcperm(cred, &shmseg->shm_perm,
	    (SCARG(uap, shmflg) & SHM_RDONLY) ? IPC_R : IPC_R|IPC_W);
	if (error)
		goto err;

	vm = p->p_vmspace;
	shmmap_s = (struct shmmap_state *)vm->vm_shm;
	if (shmmap_s && shmmap_s->nitems >= shminfo.shmseg) {
		error = EMFILE;
		goto err;
	}

	size = (shmseg->shm_segsz + PGOFSET) & ~PGOFSET;
	prot = VM_PROT_READ;
	if ((SCARG(uap, shmflg) & SHM_RDONLY) == 0)
		prot |= VM_PROT_WRITE;
	if (SCARG(uap, shmaddr)) {
		flags |= UVM_FLAG_FIXED;
		if (SCARG(uap, shmflg) & SHM_RND)
			attach_va =
			    (vaddr_t)SCARG(uap, shmaddr) & ~(SHMLBA-1);
		else if (((vaddr_t)SCARG(uap, shmaddr) & (SHMLBA-1)) == 0)
			attach_va = (vaddr_t)SCARG(uap, shmaddr);
		else {
			error = EINVAL;
			goto err;
		}
	} else {
		/* This is just a hint to uvm_map() about where to put it. */
		attach_va = p->p_emul->e_vm_default_addr(p,
		    (vaddr_t)vm->vm_daddr, size,
		    p->p_vmspace->vm_map.flags & VM_MAP_TOPDOWN);
	}

	/*
	 * Create a map entry, add it to the list and increase the counters.
	 * The lock will be dropped before the mapping, so disable
	 * reallocation meanwhile.
	 */
	shmmap_s = shmmap_getprivate(p);
	SLIST_INSERT_HEAD(&shmmap_s->entries, shmmap_se, next);
	shmmap_s->nitems++;
	shmseg->shm_lpid = p->p_pid;
	shmseg->shm_nattch++;
	shm_realloc_disable++;
	mutex_exit(&shm_lock);

	/*
	 * Add a reference to the memory object, map it to the
	 * address space, and lock the memory, if needed.
	 */
	uobj = shmseg->_shm_internal;
	uao_reference(uobj);
	error = uvm_map(&vm->vm_map, &attach_va, size, uobj, 0, 0,
	    UVM_MAPFLAG(prot, prot, UVM_INH_SHARE, UVM_ADV_RANDOM, flags));
	if (error)
		goto err_detach;
	if (shm_use_phys || (shmseg->shm_perm.mode & SHMSEG_WIRED)) {
		error = uvm_map_pageable(&vm->vm_map, attach_va,
		    attach_va + size, false, 0);
		if (error) {
			if (error == EFAULT)
				error = ENOMEM;
			uvm_deallocate(&vm->vm_map, attach_va, size);
			goto err_detach;
		}
	}

	/* Set the new address, and update the time */
	mutex_enter(&shm_lock);
	shmmap_se->va = attach_va;
	shmseg->shm_atime = time_second;
	shm_realloc_disable--;
	retval[0] = attach_va;
	SHMPRINTF(("shmat: vm %p: add %d @%lx\n",
	    p->p_vmspace, shmmap_se->shmid, attach_va));
err:
	cv_broadcast(&shm_realloc_cv);
	mutex_exit(&shm_lock);
	if (error && shmmap_se) {
		kmem_free(shmmap_se, sizeof(struct shmmap_entry));
	}
	return error;

err_detach:
	uao_detach(uobj);
	mutex_enter(&shm_lock);
	uobj = shm_delete_mapping(shmmap_s, shmmap_se);
	shm_realloc_disable--;
	cv_broadcast(&shm_realloc_cv);
	mutex_exit(&shm_lock);
	if (uobj != NULL) {
		uao_detach(uobj);
	}
	kmem_free(shmmap_se, sizeof(struct shmmap_entry));
	return error;
}

/*
 * Shared memory control operations.
 */
int
sys___shmctl50(struct lwp *l, const struct sys___shmctl50_args *uap,
    register_t *retval)
{
	/* {
		syscallarg(int) shmid;
		syscallarg(int) cmd;
		syscallarg(struct shmid_ds *) buf;
	} */
	struct shmid_ds shmbuf;
	int cmd, error;

	cmd = SCARG(uap, cmd);
	if (cmd == IPC_SET) {
		error = copyin(SCARG(uap, buf), &shmbuf, sizeof(shmbuf));
		if (error)
			return error;
	}

	error = shmctl1(l, SCARG(uap, shmid), cmd,
	    (cmd == IPC_SET || cmd == IPC_STAT) ? &shmbuf : NULL);

	if (error == 0 && cmd == IPC_STAT)
		error = copyout(&shmbuf, SCARG(uap, buf), sizeof(shmbuf));

	return error;
}

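/*
 * shmctl1: common implementation of the shmctl(2) operations; shmbuf is
 * the input buffer for IPC_SET and the output buffer for IPC_STAT.
 */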
int
shmctl1(struct lwp *l, int shmid, int cmd, struct shmid_ds *shmbuf)
{
	struct uvm_object *uobj = NULL;
	kauth_cred_t cred = l->l_cred;
	struct shmid_ds *shmseg;
	int error = 0;

	mutex_enter(&shm_lock);
	/* In case of reallocation, we will wait for completion */
	while (__predict_false(shm_realloc_state))
		cv_wait(&shm_realloc_cv, &shm_lock);

	shmseg = shm_find_segment_by_shmid(shmid);
	if (shmseg == NULL) {
		mutex_exit(&shm_lock);
		return EINVAL;
	}

	switch (cmd) {
	case IPC_STAT:
		if ((error = ipcperm(cred, &shmseg->shm_perm, IPC_R)) != 0)
			break;
		memcpy(shmbuf, shmseg, sizeof(struct shmid_ds));
		break;
	case IPC_SET:
		if ((error = ipcperm(cred, &shmseg->shm_perm, IPC_M)) != 0)
			break;
		shmseg->shm_perm.uid = shmbuf->shm_perm.uid;
		shmseg->shm_perm.gid = shmbuf->shm_perm.gid;
		shmseg->shm_perm.mode =
		    (shmseg->shm_perm.mode & ~ACCESSPERMS) |
		    (shmbuf->shm_perm.mode & ACCESSPERMS);
		shmseg->shm_ctime = time_second;
		break;
	case IPC_RMID:
		if ((error = ipcperm(cred, &shmseg->shm_perm, IPC_M)) != 0)
			break;
		shmseg->shm_perm._key = IPC_PRIVATE;
		shmseg->shm_perm.mode |= SHMSEG_REMOVED;
		if (shmseg->shm_nattch <= 0) {
			uobj = shmseg->_shm_internal;
			shm_free_segment(IPCID_TO_IX(shmid));
		}
		break;
	case SHM_LOCK:
	case SHM_UNLOCK:
		if ((error = kauth_authorize_system(cred,
		    KAUTH_SYSTEM_SYSVIPC,
		    (cmd == SHM_LOCK) ? KAUTH_REQ_SYSTEM_SYSVIPC_SHM_LOCK :
		    KAUTH_REQ_SYSTEM_SYSVIPC_SHM_UNLOCK, NULL, NULL, NULL)) != 0)
			break;
		error = shm_memlock(l, shmseg, shmid, cmd);
		break;
	default:
		error = EINVAL;
	}

	mutex_exit(&shm_lock);
	if (uobj != NULL)
		uao_detach(uobj);
	return error;
}

/*
 * Try to take an already existing segment.
 *  => must be called with shm_lock held;
 *  => called from one place, thus, inline;
 */
static inline int
shmget_existing(struct lwp *l, const struct sys_shmget_args *uap, int mode,
    register_t *retval)
{
	struct shmid_ds *shmseg;
	kauth_cred_t cred = l->l_cred;
	int segnum, error;
again:
	KASSERT(mutex_owned(&shm_lock));

	/* Find segment by key */
	for (segnum = 0; segnum < shminfo.shmmni; segnum++)
		if ((shmsegs[segnum].shm_perm.mode & SHMSEG_ALLOCATED) &&
		    shmsegs[segnum].shm_perm._key == SCARG(uap, key))
			break;
	if (segnum == shminfo.shmmni) {
		/* Not found */
		return -1;
	}

	shmseg = &shmsegs[segnum];
	if (shmseg->shm_perm.mode & SHMSEG_REMOVED) {
		/*
		 * This segment is in the process of being allocated.  Wait
		 * until it's done, and look the key up again (in case the
		 * allocation failed or it was freed).
		 */
		shmseg->shm_perm.mode |= SHMSEG_WANTED;
		error = cv_wait_sig(&shm_cv[segnum], &shm_lock);
		if (error)
			return error;
		goto again;
	}

	/*
	 * First check the flags, to generate a useful error when a
	 * segment already exists.
	 */
	if ((SCARG(uap, shmflg) & (IPC_CREAT | IPC_EXCL)) ==
	    (IPC_CREAT | IPC_EXCL))
		return EEXIST;

	/* Check the permission and segment size. */
	error = ipcperm(cred, &shmseg->shm_perm, mode);
	if (error)
		return error;
	if (SCARG(uap, size) && SCARG(uap, size) > shmseg->shm_segsz)
		return EINVAL;

	*retval = IXSEQ_TO_IPCID(segnum, shmseg->shm_perm);
	return 0;
}

int
sys_shmget(struct lwp *l, const struct sys_shmget_args *uap, register_t *retval)
{
	/* {
		syscallarg(key_t) key;
		syscallarg(size_t) size;
		syscallarg(int) shmflg;
	} */
	struct shmid_ds *shmseg;
	kauth_cred_t cred = l->l_cred;
	key_t key = SCARG(uap, key);
	size_t size;
	int error, mode, segnum;
	bool lockmem;

	mode = SCARG(uap, shmflg) & ACCESSPERMS;
	if (SCARG(uap, shmflg) & _SHM_RMLINGER)
		mode |= SHMSEG_RMLINGER;

	SHMPRINTF(("shmget: key 0x%lx size 0x%zx shmflg 0x%x mode 0x%x\n",
	    SCARG(uap, key), SCARG(uap, size), SCARG(uap, shmflg), mode));

	mutex_enter(&shm_lock);
	/* In case of reallocation, we will wait for completion */
	while (__predict_false(shm_realloc_state))
		cv_wait(&shm_realloc_cv, &shm_lock);

	if (key != IPC_PRIVATE) {
		error = shmget_existing(l, uap, mode, retval);
		if (error != -1) {
			mutex_exit(&shm_lock);
			return error;
		}
		if ((SCARG(uap, shmflg) & IPC_CREAT) == 0) {
			mutex_exit(&shm_lock);
			return ENOENT;
		}
	}
	error = 0;

	/*
	 * Check the limits.
	 */
	size = SCARG(uap, size);
	if (size < shminfo.shmmin || size > shminfo.shmmax) {
		mutex_exit(&shm_lock);
		return EINVAL;
	}
	if (shm_nused >= shminfo.shmmni) {
		mutex_exit(&shm_lock);
		return ENOSPC;
	}
	size = (size + PGOFSET) & ~PGOFSET;
	if (shm_committed + btoc(size) > shminfo.shmall) {
		mutex_exit(&shm_lock);
		return ENOMEM;
	}

	/* Find the first available segment */
	if (shm_last_free < 0) {
		for (segnum = 0; segnum < shminfo.shmmni; segnum++)
			if (shmsegs[segnum].shm_perm.mode & SHMSEG_FREE)
				break;
		KASSERT(segnum < shminfo.shmmni);
	} else {
		segnum = shm_last_free;
		shm_last_free = -1;
	}

	/*
	 * Initialize the segment.
	 * We will drop the lock while allocating the memory, so mark the
	 * segment as allocated-but-removed so that no other thread can
	 * take it.  Also, disable reallocation while the lock is dropped.
	 */
	shmseg = &shmsegs[segnum];
	shmseg->shm_perm.mode = SHMSEG_ALLOCATED | SHMSEG_REMOVED;
	shm_committed += btoc(size);
	shm_nused++;
	lockmem = shm_use_phys;
	shm_realloc_disable++;
	mutex_exit(&shm_lock);

	/* Allocate the memory object and lock it if needed */
	shmseg->_shm_internal = uao_create(size, 0);
	if (lockmem) {
		/* Wire the pages and tag it */
		error = uvm_obj_wirepages(shmseg->_shm_internal, 0, size, NULL);
		if (error) {
			uao_detach(shmseg->_shm_internal);
			mutex_enter(&shm_lock);
			shm_free_segment(segnum);
			shm_realloc_disable--;
			mutex_exit(&shm_lock);
			return error;
		}
	}

	/*
	 * Note: while the segment is marked as above, there is no need to
	 * hold the lock while initializing it (except for shm_perm.mode).
	 */
	shmseg->shm_perm._key = SCARG(uap, key);
	shmseg->shm_perm._seq = (shmseg->shm_perm._seq + 1) & 0x7fff;
	*retval = IXSEQ_TO_IPCID(segnum, shmseg->shm_perm);

	shmseg->shm_perm.cuid = shmseg->shm_perm.uid = kauth_cred_geteuid(cred);
	shmseg->shm_perm.cgid = shmseg->shm_perm.gid = kauth_cred_getegid(cred);
	shmseg->shm_segsz = SCARG(uap, size);
	shmseg->shm_cpid = l->l_proc->p_pid;
	shmseg->shm_lpid = shmseg->shm_nattch = 0;
	shmseg->shm_atime = shmseg->shm_dtime = 0;
	shmseg->shm_ctime = time_second;

	/*
	 * Segment is initialized.
	 * Enter the lock, mark it as allocated, and notify waiters (if any).
	 * Also, re-enable reallocation.
	 */
	mutex_enter(&shm_lock);
	shmseg->shm_perm.mode = (shmseg->shm_perm.mode & SHMSEG_WANTED) |
	    (mode & (ACCESSPERMS | SHMSEG_RMLINGER)) |
	    SHMSEG_ALLOCATED | (lockmem ? SHMSEG_WIRED : 0);
	if (shmseg->shm_perm.mode & SHMSEG_WANTED) {
		shmseg->shm_perm.mode &= ~SHMSEG_WANTED;
		cv_broadcast(&shm_cv[segnum]);
	}
	shm_realloc_disable--;
	cv_broadcast(&shm_realloc_cv);
	mutex_exit(&shm_lock);

	return error;
}

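/*
 * shmfork: duplicate the shm state on fork.  The parent's shm map is
 * shared with the child (reference counted) and the attach count of
 * each mapped segment is bumped.
 */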
void
shmfork(struct vmspace *vm1, struct vmspace *vm2)
{
	struct shmmap_state *shmmap_s;
	struct shmmap_entry *shmmap_se;

	SHMPRINTF(("shmfork %p->%p\n", vm1, vm2));
	mutex_enter(&shm_lock);
	vm2->vm_shm = vm1->vm_shm;
	if (vm1->vm_shm) {
		shmmap_s = (struct shmmap_state *)vm1->vm_shm;
		SLIST_FOREACH(shmmap_se, &shmmap_s->entries, next)
			shmsegs[IPCID_TO_IX(shmmap_se->shmid)].shm_nattch++;
		shmmap_s->nrefs++;
	}
	mutex_exit(&shm_lock);
}

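/*
 * shmexit: detach all shared memory segments from an exiting vmspace.
 * Drops a reference on a shared shm map, or unmaps and releases all
 * entries when this was the last reference.
 */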
void
shmexit(struct vmspace *vm)
{
	struct shmmap_state *shmmap_s;
	struct shmmap_entry *shmmap_se;

	mutex_enter(&shm_lock);
	shmmap_s = (struct shmmap_state *)vm->vm_shm;
	if (shmmap_s == NULL) {
		mutex_exit(&shm_lock);
		return;
	}
	vm->vm_shm = NULL;

	if (--shmmap_s->nrefs > 0) {
		SHMPRINTF(("shmexit: vm %p drop ref (%d entries), refs = %d\n",
		    vm, shmmap_s->nitems, shmmap_s->nrefs));
		SLIST_FOREACH(shmmap_se, &shmmap_s->entries, next) {
			shmsegs[IPCID_TO_IX(shmmap_se->shmid)].shm_nattch--;
		}
		mutex_exit(&shm_lock);
		return;
	}

	SHMPRINTF(("shmexit: vm %p cleanup (%d entries)\n", vm, shmmap_s->nitems));
	if (shmmap_s->nitems == 0) {
		mutex_exit(&shm_lock);
		kmem_free(shmmap_s, sizeof(struct shmmap_state));
		return;
	}

	/*
	 * Delete the entry from shm map.
	 */
	for (;;) {
		struct shmid_ds *shmseg;
		struct uvm_object *uobj;
		size_t sz;

		shmmap_se = SLIST_FIRST(&shmmap_s->entries);
		KASSERT(shmmap_se != NULL);

		shmseg = &shmsegs[IPCID_TO_IX(shmmap_se->shmid)];
		sz = (shmseg->shm_segsz + PGOFSET) & ~PGOFSET;
		/* shm_delete_mapping() removes from the list. */
		uobj = shm_delete_mapping(shmmap_s, shmmap_se);
		mutex_exit(&shm_lock);

		uvm_deallocate(&vm->vm_map, shmmap_se->va, sz);
		if (uobj != NULL) {
			uao_detach(uobj);
		}
		kmem_free(shmmap_se, sizeof(struct shmmap_entry));

		if (SLIST_EMPTY(&shmmap_s->entries)) {
			break;
		}
		mutex_enter(&shm_lock);
		KASSERT(!SLIST_EMPTY(&shmmap_s->entries));
	}
	kmem_free(shmmap_s, sizeof(struct shmmap_state));
}

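/*
 * shmrealloc: resize the array of segment descriptors (and their
 * condition variables) to hold newshmni entries.  Fails with EBUSY if
 * a segment currently in use would not fit into the new array.
 */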
static int
shmrealloc(int newshmni)
{
	vaddr_t v;
	struct shmid_ds *oldshmsegs, *newshmsegs;
	kcondvar_t *newshm_cv, *oldshm_cv;
	size_t sz;
	int i, lsegid, oldshmni;

	if (newshmni < 1)
		return EINVAL;

	/* Allocate new memory area */
	sz = ALIGN(newshmni * sizeof(struct shmid_ds)) +
	    ALIGN(newshmni * sizeof(kcondvar_t));
	sz = round_page(sz);
	v = uvm_km_alloc(kernel_map, sz, 0, UVM_KMF_WIRED|UVM_KMF_ZERO);
	if (v == 0)
		return ENOMEM;

	mutex_enter(&shm_lock);
	while (shm_realloc_state || shm_realloc_disable)
		cv_wait(&shm_realloc_cv, &shm_lock);

	/*
	 * Find the index of the last used segment.  Fail if we would be
	 * reallocating to fewer segments than are currently in use.
	 */
	lsegid = 0;
	for (i = 0; i < shminfo.shmmni; i++)
		if ((shmsegs[i].shm_perm.mode & SHMSEG_FREE) == 0)
			lsegid = i;
	if (lsegid >= newshmni) {
		mutex_exit(&shm_lock);
		uvm_km_free(kernel_map, v, sz, UVM_KMF_WIRED);
		return EBUSY;
	}
	shm_realloc_state = true;

	newshmsegs = (void *)v;
	newshm_cv = (void *)((uintptr_t)newshmsegs +
	    ALIGN(newshmni * sizeof(struct shmid_ds)));

	/* Copy all memory to the new area */
	for (i = 0; i < shm_nused; i++) {
		cv_init(&newshm_cv[i], "shmwait");
		(void)memcpy(&newshmsegs[i], &shmsegs[i],
		    sizeof(newshmsegs[0]));
	}

	/* Mark all new segments as free, if there are any */
	for (; i < newshmni; i++) {
		cv_init(&newshm_cv[i], "shmwait");
		newshmsegs[i].shm_perm.mode = SHMSEG_FREE;
		newshmsegs[i].shm_perm._seq = 0;
	}

	oldshmsegs = shmsegs;
	oldshmni = shminfo.shmmni;
	shminfo.shmmni = newshmni;
	shmsegs = newshmsegs;
	shm_cv = newshm_cv;

	/* Reallocation completed - notify all waiters, if any */
	shm_realloc_state = false;
	cv_broadcast(&shm_realloc_cv);
	mutex_exit(&shm_lock);

	/* Release now unused resources. */
	oldshm_cv = (void *)((uintptr_t)oldshmsegs +
	    ALIGN(oldshmni * sizeof(struct shmid_ds)));
	for (i = 0; i < oldshmni; i++)
		cv_destroy(&oldshm_cv[i]);

	sz = ALIGN(oldshmni * sizeof(struct shmid_ds)) +
	    ALIGN(oldshmni * sizeof(kcondvar_t));
	sz = round_page(sz);
	uvm_km_free(kernel_map, (vaddr_t)oldshmsegs, sz, UVM_KMF_WIRED);

	return 0;
}

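/*
 * shminit: initialize the SysV shared memory subsystem - allocate the
 * segment array and the per-segment condition variables, and set up
 * the default limits.
 */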
void
shminit(struct sysctllog **clog)
{
	vaddr_t v;
	size_t sz;
	int i;

	mutex_init(&shm_lock, MUTEX_DEFAULT, IPL_NONE);
	cv_init(&shm_realloc_cv, "shmrealc");

	/* Allocate the wired memory for our structures */
	sz = ALIGN(shminfo.shmmni * sizeof(struct shmid_ds)) +
	    ALIGN(shminfo.shmmni * sizeof(kcondvar_t));
	sz = round_page(sz);
	v = uvm_km_alloc(kernel_map, sz, 0, UVM_KMF_WIRED|UVM_KMF_ZERO);
	if (v == 0)
		panic("sysv_shm: cannot allocate memory");
	shmsegs = (void *)v;
	shm_cv = (void *)((uintptr_t)shmsegs +
	    ALIGN(shminfo.shmmni * sizeof(struct shmid_ds)));

	if (shminfo.shmmax == 0)
		shminfo.shmmax = max(physmem / 4, 1024) * PAGE_SIZE;
	else
		shminfo.shmmax *= PAGE_SIZE;
	shminfo.shmall = shminfo.shmmax / PAGE_SIZE;

	for (i = 0; i < shminfo.shmmni; i++) {
		cv_init(&shm_cv[i], "shmwait");
		shmsegs[i].shm_perm.mode = SHMSEG_FREE;
		shmsegs[i].shm_perm._seq = 0;
	}
	shm_last_free = 0;
	shm_nused = 0;
	shm_committed = 0;
	shm_realloc_disable = 0;
	shm_realloc_state = false;

	kern_has_sysvshm = 1;

	/* Load the callback function pointers for the uvm subsystem */
	uvm_shmexit = shmexit;
	uvm_shmfork = shmfork;

#ifdef _MODULE
	if (clog)
		sysctl_ipc_shm_setup(clog);
#endif
}

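/*
 * shmfini: tear down the subsystem (module unload).  Returns non-zero
 * if any segment is still in use.
 */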
int
shmfini(void)
{
	size_t sz;
	int i;
	vaddr_t v = (vaddr_t)shmsegs;

	mutex_enter(&shm_lock);
	if (shm_nused) {
		mutex_exit(&shm_lock);
		return 1;
	}

	/* Clear the callback function pointers for the uvm subsystem */
	uvm_shmexit = NULL;
	uvm_shmfork = NULL;

	/* Destroy all condvars */
	for (i = 0; i < shminfo.shmmni; i++)
		cv_destroy(&shm_cv[i]);
	cv_destroy(&shm_realloc_cv);

	/* Free the allocated/wired memory */
	sz = ALIGN(shminfo.shmmni * sizeof(struct shmid_ds)) +
	    ALIGN(shminfo.shmmni * sizeof(kcondvar_t));
	sz = round_page(sz);
	uvm_km_free(kernel_map, v, sz, UVM_KMF_WIRED);

	/* Release and destroy our mutex */
	mutex_exit(&shm_lock);
	mutex_destroy(&shm_lock);

	kern_has_sysvshm = 0;

	return 0;
}

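/*
 * Sysctl helpers: kern.ipc.shmmni resizes the segment array via
 * shmrealloc(); kern.ipc.shmmaxpgs and kern.ipc.shmmax adjust the
 * global size limits, keeping shmall (pages) and shmmax (bytes)
 * consistent with each other.
 */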
static int
sysctl_ipc_shmmni(SYSCTLFN_ARGS)
{
	int newsize, error;
	struct sysctlnode node;
	node = *rnode;
	node.sysctl_data = &newsize;

	newsize = shminfo.shmmni;
	error = sysctl_lookup(SYSCTLFN_CALL(&node));
	if (error || newp == NULL)
		return error;

	sysctl_unlock();
	error = shmrealloc(newsize);
	sysctl_relock();
	return error;
}

static int
sysctl_ipc_shmmaxpgs(SYSCTLFN_ARGS)
{
	uint32_t newsize;
	int error;
	struct sysctlnode node;
	node = *rnode;
	node.sysctl_data = &newsize;

	newsize = shminfo.shmall;
	error = sysctl_lookup(SYSCTLFN_CALL(&node));
	if (error || newp == NULL)
		return error;

	if (newsize < 1)
		return EINVAL;

	shminfo.shmall = newsize;
	shminfo.shmmax = (uint64_t)shminfo.shmall * PAGE_SIZE;

	return 0;
}

static int
sysctl_ipc_shmmax(SYSCTLFN_ARGS)
{
	uint64_t newsize;
	int error;
	struct sysctlnode node;
	node = *rnode;
	node.sysctl_data = &newsize;

	newsize = shminfo.shmmax;
	error = sysctl_lookup(SYSCTLFN_CALL(&node));
	if (error || newp == NULL)
		return error;

	if (newsize < PAGE_SIZE)
		return EINVAL;

	shminfo.shmmax = round_page(newsize);
	shminfo.shmall = shminfo.shmmax >> PAGE_SHIFT;

	return 0;
}

SYSCTL_SETUP(sysctl_ipc_shm_setup, "sysctl kern.ipc subtree setup")
{

	sysctl_createv(clog, 0, NULL, NULL,
	    CTLFLAG_PERMANENT,
	    CTLTYPE_NODE, "ipc",
	    SYSCTL_DESCR("SysV IPC options"),
	    NULL, 0, NULL, 0,
	    CTL_KERN, KERN_SYSVIPC, CTL_EOL);
	sysctl_createv(clog, 0, NULL, NULL,
	    CTLFLAG_PERMANENT | CTLFLAG_READWRITE,
	    CTLTYPE_QUAD, "shmmax",
	    SYSCTL_DESCR("Max shared memory segment size in bytes"),
	    sysctl_ipc_shmmax, 0, &shminfo.shmmax, 0,
	    CTL_KERN, KERN_SYSVIPC, KERN_SYSVIPC_SHMMAX, CTL_EOL);
	sysctl_createv(clog, 0, NULL, NULL,
	    CTLFLAG_PERMANENT | CTLFLAG_READWRITE,
	    CTLTYPE_INT, "shmmni",
	    SYSCTL_DESCR("Max number of shared memory identifiers"),
	    sysctl_ipc_shmmni, 0, &shminfo.shmmni, 0,
	    CTL_KERN, KERN_SYSVIPC, KERN_SYSVIPC_SHMMNI, CTL_EOL);
	sysctl_createv(clog, 0, NULL, NULL,
	    CTLFLAG_PERMANENT | CTLFLAG_READWRITE,
	    CTLTYPE_INT, "shmseg",
	    SYSCTL_DESCR("Max shared memory segments per process"),
	    NULL, 0, &shminfo.shmseg, 0,
	    CTL_KERN, KERN_SYSVIPC, KERN_SYSVIPC_SHMSEG, CTL_EOL);
	sysctl_createv(clog, 0, NULL, NULL,
	    CTLFLAG_PERMANENT | CTLFLAG_READWRITE,
	    CTLTYPE_INT, "shmmaxpgs",
	    SYSCTL_DESCR("Max amount of shared memory in pages"),
	    sysctl_ipc_shmmaxpgs, 0, &shminfo.shmall, 0,
	    CTL_KERN, KERN_SYSVIPC, KERN_SYSVIPC_SHMMAXPGS, CTL_EOL);
	sysctl_createv(clog, 0, NULL, NULL,
	    CTLFLAG_PERMANENT | CTLFLAG_READWRITE,
	    CTLTYPE_INT, "shm_use_phys",
	    SYSCTL_DESCR("Enable/disable locking of shared memory in "
	    "physical memory"), NULL, 0, &shm_use_phys, 0,
	    CTL_KERN, KERN_SYSVIPC, KERN_SYSVIPC_SHMUSEPHYS, CTL_EOL);
}