kern_fork.c source code [src/src/sys/kern/kern_fork.c]

1	/*	$NetBSD: kern_fork.c,v 1.196 2016/11/04 18:14:04 christos Exp $	*/
2
3	/*-
4	* Copyright (c) 1999, 2001, 2004, 2006, 2007, 2008 The NetBSD Foundation, Inc.
5	* All rights reserved.
6	*
7	* This code is derived from software contributed to The NetBSD Foundation
8	* by Jason R. Thorpe of the Numerical Aerospace Simulation Facility,
9	* NASA Ames Research Center, by Charles M. Hannum, and by Andrew Doran.
10	*
11	* Redistribution and use in source and binary forms, with or without
12	* modification, are permitted provided that the following conditions
13	* are met:
14	* 1. Redistributions of source code must retain the above copyright
15	* notice, this list of conditions and the following disclaimer.
16	* 2. Redistributions in binary form must reproduce the above copyright
17	* notice, this list of conditions and the following disclaimer in the
18	* documentation and/or other materials provided with the distribution.
19	*
20	* THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
21	* ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
22	* TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
23	* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
24	* BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
25	* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
26	* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
27	* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
28	* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
29	* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
30	* POSSIBILITY OF SUCH DAMAGE.
31	*/
32
33	/*
34	* Copyright (c) 1982, 1986, 1989, 1991, 1993
35	* The Regents of the University of California. All rights reserved.
36	* (c) UNIX System Laboratories, Inc.
37	* All or some portions of this file are derived from material licensed
38	* to the University of California by American Telephone and Telegraph
39	* Co. or Unix System Laboratories, Inc. and are reproduced herein with
40	* the permission of UNIX System Laboratories, Inc.
41	*
42	* Redistribution and use in source and binary forms, with or without
43	* modification, are permitted provided that the following conditions
44	* are met:
45	* 1. Redistributions of source code must retain the above copyright
46	* notice, this list of conditions and the following disclaimer.
47	* 2. Redistributions in binary form must reproduce the above copyright
48	* notice, this list of conditions and the following disclaimer in the
49	* documentation and/or other materials provided with the distribution.
50	* 3. Neither the name of the University nor the names of its contributors
51	* may be used to endorse or promote products derived from this software
52	* without specific prior written permission.
53	*
54	* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
55	* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
56	* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
57	* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
58	* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
59	* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
60	* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
61	* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
62	* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
63	* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
64	* SUCH DAMAGE.
65	*
66	* @(#)kern_fork.c 8.8 (Berkeley) 2/14/95
67	*/
68
69	#include <sys/cdefs.h>
70	__KERNEL_RCSID(`0`, "$NetBSD: kern_fork.c,v 1.196 2016/11/04 18:14:04 christos Exp $");
71
72	#include "opt_ktrace.h"
73	#include "opt_dtrace.h"
74
75	#include <sys/param.h>
76	#include <sys/systm.h>
77	#include <sys/filedesc.h>
78	#include <sys/kernel.h>
79	#include <sys/pool.h>
80	#include <sys/mount.h>
81	#include <sys/proc.h>
82	#include <sys/ras.h>
83	#include <sys/resourcevar.h>
84	#include <sys/vnode.h>
85	#include <sys/file.h>
86	#include <sys/acct.h>
87	#include <sys/ktrace.h>
88	#include <sys/sched.h>
89	#include <sys/signalvar.h>
90	#include <sys/kauth.h>
91	#include <sys/atomic.h>
92	#include <sys/syscallargs.h>
93	#include <sys/uidinfo.h>
94	#include <sys/sdt.h>
95	#include <sys/ptrace.h>
96
97	#include <uvm/uvm_extern.h>
98
99	/*
100	* DTrace SDT provider definitions
101	*/
102	SDT_PROVIDER_DECLARE(proc);
103	SDT_PROBE_DEFINE3(proc, kernel, , create,
104	"struct proc ", /* new process /
105	"struct proc ", /* parent process /
106	"int" / flags /);
107
108	u_int nprocs __cacheline_aligned = `1`; / process 0 /
109
/*
 * Number of ticks to sleep if fork() would fail due to process hitting
 * limits.  Exported in milliseconds to userland via sysctl.
 * A value of 0 disables the sleep (fork1() only pauses when non-zero).
 */
int	forkfsleep = 0;
115
116	int
117	sys_fork(struct lwp l, const* void v, register_t retval)
118	{
119
120	return fork1(l, `0`, SIGCHLD, NULL, `0`, NULL, NULL, retval, NULL);
121	}
122
123	/*
124	* vfork(2) system call compatible with 4.4BSD (i.e. BSD with Mach VM).
125	* Address space is not shared, but parent is blocked until child exit.
126	*/
127	int
128	sys_vfork(struct lwp l, const* void v, register_t retval)
129	{
130
131	return fork1(l, FORK_PPWAIT, SIGCHLD, NULL, `0`, NULL, NULL,
132	retval, NULL);
133	}
134
135	/*
136	* New vfork(2) system call for NetBSD, which implements original 3BSD vfork(2)
137	* semantics. Address space is shared, and parent is blocked until child exit.
138	*/
139	int
140	sys___vfork14(struct lwp l, const* void v, register_t retval)
141	{
142
143	return fork1(l, FORK_PPWAIT\|FORK_SHAREVM, SIGCHLD, NULL, `0`,
144	NULL, NULL, retval, NULL);
145	}
146
147	/*
148	* Linux-compatible __clone(2) system call.
149	*/
150	int
151	sys___clone(struct lwp l, const* struct sys___clone_args *uap,
152	register_t *retval)
153	{
154	/ {*
155	syscallarg(int) flags;
156	syscallarg(void ) stack;*
157	} /*
158	int flags, sig;
159
160	/*
161	* We don't support the CLONE_PID or CLONE_PTRACE flags.
162	*/
163	if (SCARG(uap, flags) & (CLONE_PID\|CLONE_PTRACE))
164	return EINVAL;
165
166	/*
167	* Linux enforces CLONE_VM with CLONE_SIGHAND, do same.
168	*/
169	if (SCARG(uap, flags) & CLONE_SIGHAND
170	&& (SCARG(uap, flags) & CLONE_VM) == `0`)
171	return EINVAL;
172
173	flags = `0`;
174
175	if (SCARG(uap, flags) & CLONE_VM)
176	flags \|= FORK_SHAREVM;
177	if (SCARG(uap, flags) & CLONE_FS)
178	flags \|= FORK_SHARECWD;
179	if (SCARG(uap, flags) & CLONE_FILES)
180	flags \|= FORK_SHAREFILES;
181	if (SCARG(uap, flags) & CLONE_SIGHAND)
182	flags \|= FORK_SHARESIGS;
183	if (SCARG(uap, flags) & CLONE_VFORK)
184	flags \|= FORK_PPWAIT;
185
186	sig = SCARG(uap, flags) & CLONE_CSIGNAL;
187	if (sig < `0` \|\| sig >= _NSIG)
188	return EINVAL;
189
190	/*
191	* Note that the Linux API does not provide a portable way of
192	* specifying the stack area; the caller must know if the stack
193	* grows up or down. So, we pass a stack size of 0, so that the
194	* code that makes this adjustment is a noop.
195	*/
196	return fork1(l, flags, sig, SCARG(uap, stack), `0`,
197	NULL, NULL, retval, NULL);
198	}
199
200	/*
201	* Print the 'table full' message once per 10 seconds.
202	*/
203	static struct timeval fork_tfmrate = { `10`, `0` };
204
205	/*
206	* General fork call. Note that another LWP in the process may call exec()
207	* or exit() while we are forking. It's safe to continue here, because
208	* neither operation will complete until all LWPs have exited the process.
209	*/
210	int
211	fork1(struct lwp l1, int* flags, int exitsig, void *stack, size_t stacksize,
212	void (func)(void* ), void* arg, register_t retval,
213	struct proc **rnewprocp)
214	{
215	struct proc p1, p2, *parent;
216	struct plimit *p1_lim;
217	uid_t uid;
218	struct lwp *l2;
219	int count;
220	vaddr_t uaddr;
221	int tnprocs;
222	int tracefork;
223	int error = `0`;
224
225	p1 = l1->l_proc;
226	uid = kauth_cred_getuid(l1->l_cred);
227	tnprocs = atomic_inc_uint_nv(&nprocs);
228
229	/*
230	* Although process entries are dynamically created, we still keep
231	* a global limit on the maximum number we will create.
232	*/
233	if (__predict_false(tnprocs >= maxproc))
234	error = -`1`;
235	else
236	error = kauth_authorize_process(l1->l_cred,
237	KAUTH_PROCESS_FORK, p1, KAUTH_ARG(tnprocs), NULL, NULL);
238
239	if (error) {
240	static struct timeval lasttfm;
241	atomic_dec_uint(&nprocs);
242	if (ratecheck(&lasttfm, &fork_tfmrate))
243	tablefull("proc", "increase kern.maxproc or NPROC");
244	if (forkfsleep)
245	kpause("forkmx", false, forkfsleep, NULL);
246	return EAGAIN;
247	}
248
249	/*
250	* Enforce limits.
251	*/
252	count = chgproccnt(uid, `1`);
253	if (__predict_false(count > p1->p_rlimit[RLIMIT_NPROC].rlim_cur)) {
254	if (kauth_authorize_process(l1->l_cred, KAUTH_PROCESS_RLIMIT,
255	p1, KAUTH_ARG(KAUTH_REQ_PROCESS_RLIMIT_BYPASS),
256	&p1->p_rlimit[RLIMIT_NPROC], KAUTH_ARG(RLIMIT_NPROC)) != `0`) {
257	(void)chgproccnt(uid, -`1`);
258	atomic_dec_uint(&nprocs);
259	if (forkfsleep)
260	kpause("forkulim", false, forkfsleep, NULL);
261	return EAGAIN;
262	}
263	}
264
265	/*
266	* Allocate virtual address space for the U-area now, while it
267	* is still easy to abort the fork operation if we're out of
268	* kernel virtual address space.
269	*/
270	uaddr = uvm_uarea_alloc();
271	if (__predict_false(uaddr == `0`)) {
272	(void)chgproccnt(uid, -`1`);
273	atomic_dec_uint(&nprocs);
274	return ENOMEM;
275	}
276
277	/*
278	* We are now committed to the fork. From here on, we may
279	* block on resources, but resource allocation may NOT fail.
280	*/
281
282	/ Allocate new proc. /
283	p2 = proc_alloc();
284
285	/*
286	* Make a proc table entry for the new process.
287	* Start by zeroing the section of proc that is zero-initialized,
288	* then copy the section that is copied directly from the parent.
289	*/
290	memset(&p2->p_startzero, `0`,
291	(unsigned) ((char )&p2->p_endzero - (char* *)&p2->p_startzero));
292	memcpy(&p2->p_startcopy, &p1->p_startcopy,
293	(unsigned) ((char )&p2->p_endcopy - (char* *)&p2->p_startcopy));
294
295	TAILQ_INIT(&p2->p_sigpend.sp_info);
296
297	LIST_INIT(&p2->p_lwps);
298	LIST_INIT(&p2->p_sigwaiters);
299
300	/*
301	* Duplicate sub-structures as needed.
302	* Increase reference counts on shared objects.
303	* Inherit flags we want to keep. The flags related to SIGCHLD
304	* handling are important in order to keep a consistent behaviour
305	* for the child after the fork. If we are a 32-bit process, the
306	* child will be too.
307	*/
308	p2->p_flag =
309	p1->p_flag & (PK_SUGID \| PK_NOCLDWAIT \| PK_CLDSIGIGN \| PK_32);
310	p2->p_emul = p1->p_emul;
311	p2->p_execsw = p1->p_execsw;
312
313	if (flags & FORK_SYSTEM) {
314	/*
315	* Mark it as a system process. Set P_NOCLDWAIT so that
316	* children are reparented to init(8) when they exit.
317	* init(8) can easily wait them out for us.
318	*/
319	p2->p_flag \|= (PK_SYSTEM \| PK_NOCLDWAIT);
320	}
321
322	mutex_init(&p2->p_stmutex, MUTEX_DEFAULT, IPL_HIGH);
323	mutex_init(&p2->p_auxlock, MUTEX_DEFAULT, IPL_NONE);
324	rw_init(&p2->p_reflock);
325	cv_init(&p2->p_waitcv, "wait");
326	cv_init(&p2->p_lwpcv, "lwpwait");
327
328	/*
329	* Share a lock between the processes if they are to share signal
330	* state: we must synchronize access to it.
331	*/
332	if (flags & FORK_SHARESIGS) {
333	p2->p_lock = p1->p_lock;
334	mutex_obj_hold(p1->p_lock);
335	} else
336	p2->p_lock = mutex_obj_alloc(MUTEX_DEFAULT, IPL_NONE);
337
338	kauth_proc_fork(p1, p2);
339
340	p2->p_raslist = NULL;
341	#if defined(__HAVE_RAS)
342	ras_fork(p1, p2);
343	#endif
344
345	/ bump references to the text vnode (for procfs) /
346	p2->p_textvp = p1->p_textvp;
347	if (p2->p_textvp)
348	vref(p2->p_textvp);
349
350	if (flags & FORK_SHAREFILES)
351	fd_share(p2);
352	else if (flags & FORK_CLEANFILES)
353	p2->p_fd = fd_init(NULL);
354	else
355	p2->p_fd = fd_copy();
356
357	/ XXX racy /
358	p2->p_mqueue_cnt = p1->p_mqueue_cnt;
359
360	if (flags & FORK_SHARECWD)
361	cwdshare(p2);
362	else
363	p2->p_cwdi = cwdinit();
364
365	/*
366	* Note: p_limit (rlimit stuff) is copy-on-write, so normally
367	* we just need increase pl_refcnt.
368	*/
369	p1_lim = p1->p_limit;
370	if (!p1_lim->pl_writeable) {
371	lim_addref(p1_lim);
372	p2->p_limit = p1_lim;
373	} else {
374	p2->p_limit = lim_copy(p1_lim);
375	}
376
377	if (flags & FORK_PPWAIT) {
378	/ Mark ourselves as waiting for a child. /
379	l1->l_pflag \|= LP_VFORKWAIT;
380	p2->p_lflag = PL_PPWAIT;
381	p2->p_vforklwp = l1;
382	} else {
383	p2->p_lflag = `0`;
384	}
385	p2->p_sflag = `0`;
386	p2->p_slflag = `0`;
387	parent = (flags & FORK_NOWAIT) ? initproc : p1;
388	p2->p_pptr = parent;
389	p2->p_ppid = parent->p_pid;
390	LIST_INIT(&p2->p_children);
391
392	p2->p_aio = NULL;
393
394	#ifdef KTRACE
395	/*
396	* Copy traceflag and tracefile if enabled.
397	* If not inherited, these were zeroed above.
398	*/
399	if (p1->p_traceflag & KTRFAC_INHERIT) {
400	mutex_enter(&ktrace_lock);
401	p2->p_traceflag = p1->p_traceflag;
402	if ((p2->p_tracep = p1->p_tracep) != NULL)
403	ktradref(p2);
404	mutex_exit(&ktrace_lock);
405	}
406	#endif
407
408	/*
409	* Create signal actions for the child process.
410	*/
411	p2->p_sigacts = sigactsinit(p1, flags & FORK_SHARESIGS);
412	mutex_enter(p1->p_lock);
413	p2->p_sflag \|=
414	(p1->p_sflag & (PS_STOPFORK \| PS_STOPEXEC \| PS_NOCLDSTOP));
415	sched_proc_fork(p1, p2);
416	mutex_exit(p1->p_lock);
417
418	p2->p_stflag = p1->p_stflag;
419
420	/*
421	* p_stats.
422	* Copy parts of p_stats, and zero out the rest.
423	*/
424	p2->p_stats = pstatscopy(p1->p_stats);
425
426	/*
427	* Set up the new process address space.
428	*/
429	uvm_proc_fork(p1, p2, (flags & FORK_SHAREVM) ? true : false);
430
431	/*
432	* Finish creating the child process.
433	* It will return through a different path later.
434	*/
435	lwp_create(l1, p2, uaddr, (flags & FORK_PPWAIT) ? LWP_VFORK : `0`,
436	stack, stacksize, (func != NULL) ? func : child_return, arg, &l2,
437	l1->l_class);
438
439	/*
440	* Inherit l_private from the parent.
441	* Note that we cannot use lwp_setprivate() here since that
442	* also sets the CPU TLS register, which is incorrect if the
443	* process has changed that without letting the kernel know.
444	*/
445	l2->l_private = l1->l_private;
446
447	/*
448	* If emulation has a process fork hook, call it now.
449	*/
450	if (p2->p_emul->e_proc_fork)
451	(*p2->p_emul->e_proc_fork)(p2, l1, flags);
452
453	/*
454	* ...and finally, any other random fork hooks that subsystems
455	* might have registered.
456	*/
457	doforkhooks(p2, p1);
458
459	SDT_PROBE(proc, kernel, , create, p2, p1, flags, `0`, `0`);
460
461	/*
462	* It's now safe for the scheduler and other processes to see the
463	* child process.
464	*/
465	mutex_enter(proc_lock);
466
467	if (p1->p_session->s_ttyvp != NULL && p1->p_lflag & PL_CONTROLT)
468	p2->p_lflag \|= PL_CONTROLT;
469
470	LIST_INSERT_HEAD(&parent->p_children, p2, p_sibling);
471	p2->p_exitsig = exitsig; / signal for parent on exit /
472
473	/*
474	* We don't want to tracefork vfork()ed processes because they
475	* will not receive the SIGTRAP until it is too late.
476	*/
477	tracefork = (p1->p_slflag & (PSL_TRACEFORK\|PSL_TRACED)) ==
478	(PSL_TRACEFORK\|PSL_TRACED) && (flags && FORK_PPWAIT) == `0`;
479	if (tracefork) {
480	proc_changeparent(p2, p1->p_pptr);
481	/*
482	* Set ptrace status.
483	*/
484	p1->p_fpid = p2->p_pid;
485	p2->p_fpid = p1->p_pid;
486	}
487
488	LIST_INSERT_AFTER(p1, p2, p_pglist);
489	LIST_INSERT_HEAD(&allproc, p2, p_list);
490
491	p2->p_trace_enabled = trace_is_enabled(p2);
492	#ifdef __HAVE_SYSCALL_INTERN
493	(*p2->p_emul->e_syscall_intern)(p2);
494	#endif
495
496	/*
497	* Update stats now that we know the fork was successful.
498	*/
499	uvmexp.forks++;
500	if (flags & FORK_PPWAIT)
501	uvmexp.forks_ppwait++;
502	if (flags & FORK_SHAREVM)
503	uvmexp.forks_sharevm++;
504
505	/*
506	* Pass a pointer to the new process to the caller.
507	*/
508	if (rnewprocp != NULL)
509	*rnewprocp = p2;
510
511	if (ktrpoint(KTR_EMUL))
512	p2->p_traceflag \|= KTRFAC_TRC_EMUL;
513
514	/*
515	* Notify any interested parties about the new process.
516	*/
517	if (!SLIST_EMPTY(&p1->p_klist)) {
518	mutex_exit(proc_lock);
519	KNOTE(&p1->p_klist, NOTE_FORK \| p2->p_pid);
520	mutex_enter(proc_lock);
521	}
522
523	/*
524	* Make child runnable, set start time, and add to run queue except
525	* if the parent requested the child to start in SSTOP state.
526	*/
527	mutex_enter(p2->p_lock);
528
529	/*
530	* Start profiling.
531	*/
532	if ((p2->p_stflag & PST_PROFIL) != `0`) {
533	mutex_spin_enter(&p2->p_stmutex);
534	startprofclock(p2);
535	mutex_spin_exit(&p2->p_stmutex);
536	}
537
538	getmicrotime(&p2->p_stats->p_start);
539	p2->p_acflag = AFORK;
540	lwp_lock(l2);
541	KASSERT(p2->p_nrlwps == `1`);
542	if (p2->p_sflag & PS_STOPFORK) {
543	struct schedstate_percpu *spc = &l2->l_cpu->ci_schedstate;
544	p2->p_nrlwps = `0`;
545	p2->p_stat = SSTOP;
546	p2->p_waited = `0`;
547	p1->p_nstopchild++;
548	l2->l_stat = LSSTOP;
549	KASSERT(l2->l_wchan == NULL);
550	lwp_unlock_to(l2, spc->spc_lwplock);
551	} else {
552	p2->p_nrlwps = `1`;
553	p2->p_stat = SACTIVE;
554	l2->l_stat = LSRUN;
555	sched_enqueue(l2, false);
556	lwp_unlock(l2);
557	}
558
559	/*
560	* Return child pid to parent process,
561	* marking us as parent via retval[1].
562	*/
563	if (retval != NULL) {
564	retval[`0`] = p2->p_pid;
565	retval[`1`] = `0`;
566	}
567	mutex_exit(p2->p_lock);
568
569	/*
570	* Preserve synchronization semantics of vfork. If waiting for
571	* child to exec or exit, sleep until it clears LP_VFORKWAIT.
572	*/
573	#if 0
574	while (l1->l_pflag & LP_VFORKWAIT) {
575	cv_wait(&l1->l_waitcv, proc_lock);
576	}
577	#else
578	while (p2->p_lflag & PL_PPWAIT)
579	cv_wait(&p1->p_waitcv, proc_lock);
580	#endif
581
582	/*
583	* Let the parent know that we are tracing its child.
584	*/
585	if (tracefork) {
586	ksiginfo_t ksi;
587
588	KSI_INIT_EMPTY(&ksi);
589	ksi.ksi_signo = SIGTRAP;
590	ksi.ksi_lid = l1->l_lid;
591	kpsignal(p1, &ksi, NULL);
592	}
593	mutex_exit(proc_lock);
594
595	return `0`;
596	}
597

Browse the source code of src/src/sys/kern/kern_fork.c