vfs_syscalls.c source code [src/src/sys/kern/vfs_syscalls.c]

/*	$NetBSD: vfs_syscalls.c,v 1.505 2016/07/31 20:34:04 dholland Exp $	*/
2
/*-
4	* Copyright (c) 2008, 2009 The NetBSD Foundation, Inc.
5	* All rights reserved.
6	*
7	* This code is derived from software contributed to The NetBSD Foundation
8	* by Andrew Doran.
9	*
10	* Redistribution and use in source and binary forms, with or without
11	* modification, are permitted provided that the following conditions
12	* are met:
13	* 1. Redistributions of source code must retain the above copyright
14	* notice, this list of conditions and the following disclaimer.
15	* 2. Redistributions in binary form must reproduce the above copyright
16	* notice, this list of conditions and the following disclaimer in the
17	* documentation and/or other materials provided with the distribution.
18	*
19	* THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
20	* ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
21	* TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
22	* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
23	* BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
24	* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
25	* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
26	* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
27	* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
28	* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
29	* POSSIBILITY OF SUCH DAMAGE.
30	*/
31
32	/*
33	* Copyright (c) 1989, 1993
34	* The Regents of the University of California. All rights reserved.
35	* (c) UNIX System Laboratories, Inc.
36	* All or some portions of this file are derived from material licensed
37	* to the University of California by American Telephone and Telegraph
38	* Co. or Unix System Laboratories, Inc. and are reproduced herein with
39	* the permission of UNIX System Laboratories, Inc.
40	*
41	* Redistribution and use in source and binary forms, with or without
42	* modification, are permitted provided that the following conditions
43	* are met:
44	* 1. Redistributions of source code must retain the above copyright
45	* notice, this list of conditions and the following disclaimer.
46	* 2. Redistributions in binary form must reproduce the above copyright
47	* notice, this list of conditions and the following disclaimer in the
48	* documentation and/or other materials provided with the distribution.
49	* 3. Neither the name of the University nor the names of its contributors
50	* may be used to endorse or promote products derived from this software
51	* without specific prior written permission.
52	*
53	* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
54	* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
55	* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
56	* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
57	* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
58	* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
59	* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
60	* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
61	* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
62	* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
63	* SUCH DAMAGE.
64	*
65	* @(#)vfs_syscalls.c 8.42 (Berkeley) 7/31/95
66	*/
67
68	/*
69	* Virtual File System System Calls
70	*/
71
72	#include <sys/cdefs.h>
73	__KERNEL_RCSID(`0`, "$NetBSD: vfs_syscalls.c,v 1.505 2016/07/31 20:34:04 dholland Exp $");
74
75	#ifdef _KERNEL_OPT
76	#include "opt_fileassoc.h"
77	#include "veriexec.h"
78	#endif
79
80	#include <sys/param.h>
81	#include <sys/systm.h>
82	#include <sys/namei.h>
83	#include <sys/filedesc.h>
84	#include <sys/kernel.h>
85	#include <sys/file.h>
86	#include <sys/fcntl.h>
87	#include <sys/stat.h>
88	#include <sys/vnode.h>
89	#include <sys/mount.h>
90	#include <sys/proc.h>
91	#include <sys/uio.h>
92	#include <sys/kmem.h>
93	#include <sys/dirent.h>
94	#include <sys/sysctl.h>
95	#include <sys/syscallargs.h>
96	#include <sys/vfs_syscalls.h>
97	#include <sys/quota.h>
98	#include <sys/quotactl.h>
99	#include <sys/ktrace.h>
100	#ifdef FILEASSOC
101	#include <sys/fileassoc.h>
102	#endif /* FILEASSOC */
103	#include <sys/extattr.h>
104	#include <sys/verified_exec.h>
105	#include <sys/kauth.h>
106	#include <sys/atomic.h>
107	#include <sys/module.h>
108	#include <sys/buf.h>
109
110	#include <miscfs/genfs/genfs.h>
111	#include <miscfs/specfs/specdev.h>
112
113	#include <nfs/rpcv2.h>
114	#include <nfs/nfsproto.h>
115	#include <nfs/nfs.h>
116	#include <nfs/nfs_var.h>
117
/* XXX this shouldn't be here */
#ifndef OFF_T_MAX
#define OFF_T_MAX __type_max(off_t)
#endif
122
123	static int change_flags(struct vnode , u_long, struct* lwp *);
124	static int change_mode(struct vnode , int, struct* lwp *);
125	static int change_owner(struct vnode , uid_t, gid_t, struct* lwp , int*);
126	static int do_sys_openat(lwp_t , int, const* char , int, int, int* *);
127	static int do_sys_mkdirat(struct lwp l, int, const* char *, mode_t,
128	enum uio_seg);
129	static int do_sys_mkfifoat(struct lwp , int, const* char *, mode_t);
130	static int do_sys_symlinkat(struct lwp , const* char , int, const* char *,
131	enum uio_seg);
132	static int do_sys_renameat(struct lwp l, int, const* char , int, const* char *,
133	enum uio_seg, int);
134	static int do_sys_readlinkat(struct lwp , int, const* char , char* *,
135	size_t, register_t *);
136	static int do_sys_unlinkat(struct lwp , int, const* char , int, enum* uio_seg);
137
138	static int fd_nameiat(struct lwp , int, struct* nameidata *);
139	static int fd_nameiat_simple_user(struct lwp , int, const* char *,
140	namei_simple_flags_t, struct vnode **);
141
142
143	/*
144	* This table is used to maintain compatibility with 4.3BSD
145	* and NetBSD 0.9 mount syscalls - and possibly other systems.
146	* Note, the order is important!
147	*
148	* Do not modify this table. It should only contain filesystems
149	* supported by NetBSD 0.9 and 4.3BSD.
150	*/
151	const char * const mountcompatnames[] = {
152	NULL, / 0 = MOUNT_NONE /
153	MOUNT_FFS, / 1 = MOUNT_UFS /
154	MOUNT_NFS, / 2 /
155	MOUNT_MFS, / 3 /
156	MOUNT_MSDOS, / 4 /
157	MOUNT_CD9660, / 5 = MOUNT_ISOFS /
158	MOUNT_FDESC, / 6 /
159	MOUNT_KERNFS, / 7 /
160	NULL, / 8 = MOUNT_DEVFS /
161	MOUNT_AFS, / 9 /
162	};
163
164	const int nmountcompatnames = __arraycount(mountcompatnames);
165
166	static int
167	fd_nameiat(struct lwp l, int* fdat, struct nameidata *ndp)
168	{
169	file_t *dfp;
170	int error;
171
172	if (fdat != AT_FDCWD) {
173	if ((error = fd_getvnode(fdat, &dfp)) != `0`)
174	goto out;
175
176	NDAT(ndp, dfp->f_vnode);
177	}
178
179	error = namei(ndp);
180
181	if (fdat != AT_FDCWD)
182	fd_putfile(fdat);
183	out:
184	return error;
185	}
186
187	static int
188	fd_nameiat_simple_user(struct lwp l, int* fdat, const char *path,
189	namei_simple_flags_t sflags, struct vnode **vp_ret)
190	{
191	file_t *dfp;
192	struct vnode *dvp;
193	int error;
194
195	if (fdat != AT_FDCWD) {
196	if ((error = fd_getvnode(fdat, &dfp)) != `0`)
197	goto out;
198
199	dvp = dfp->f_vnode;
200	} else {
201	dvp = NULL;
202	}
203
204	error = nameiat_simple_user(dvp, path, sflags, vp_ret);
205
206	if (fdat != AT_FDCWD)
207	fd_putfile(fdat);
208	out:
209	return error;
210	}
211
212	static int
213	open_setfp(struct lwp l, file_t fp, struct vnode vp, int* indx, int flags)
214	{
215	int error;
216
217	fp->f_flag = flags & FMASK;
218	fp->f_type = DTYPE_VNODE;
219	fp->f_ops = &vnops;
220	fp->f_vnode = vp;
221
222	if (flags & (O_EXLOCK \| O_SHLOCK)) {
223	struct flock lf;
224	int type;
225
226	lf.l_whence = SEEK_SET;
227	lf.l_start = `0`;
228	lf.l_len = `0`;
229	if (flags & O_EXLOCK)
230	lf.l_type = F_WRLCK;
231	else
232	lf.l_type = F_RDLCK;
233	type = F_FLOCK;
234	if ((flags & FNONBLOCK) == `0`)
235	type \|= F_WAIT;
236	VOP_UNLOCK(vp);
237	error = VOP_ADVLOCK(vp, fp, F_SETLK, &lf, type);
238	if (error) {
239	(void) vn_close(vp, fp->f_flag, fp->f_cred);
240	fd_abort(l->l_proc, fp, indx);
241	return error;
242	}
243	vn_lock(vp, LK_EXCLUSIVE \| LK_RETRY);
244	atomic_or_uint(&fp->f_flag, FHASLOCK);
245	}
246	if (flags & O_CLOEXEC)
247	fd_set_exclose(l, indx, true);
248	return `0`;
249	}
250
251	static int
252	mount_update(struct lwp l, struct* vnode vp, const* char path, int* flags,
253	void data, size_t data_len)
254	{
255	struct mount *mp;
256	int error = `0`, saved_flags;
257
258	mp = vp->v_mount;
259	saved_flags = mp->mnt_flag;
260
261	/ We can operate only on VV_ROOT nodes. /
262	if ((vp->v_vflag & VV_ROOT) == `0`) {
263	error = EINVAL;
264	goto out;
265	}
266
267	/*
268	* We only allow the filesystem to be reloaded if it
269	* is currently mounted read-only. Additionally, we
270	* prevent read-write to read-only downgrades.
271	*/
272	if ((flags & (MNT_RELOAD \| MNT_RDONLY)) != `0` &&
273	(mp->mnt_flag & MNT_RDONLY) == `0` &&
274	(mp->mnt_iflag & IMNT_CAN_RWTORO) == `0`) {
275	error = EOPNOTSUPP; / Needs translation /
276	goto out;
277	}
278
279	error = kauth_authorize_system(l->l_cred, KAUTH_SYSTEM_MOUNT,
280	KAUTH_REQ_SYSTEM_MOUNT_UPDATE, mp, KAUTH_ARG(flags), data);
281	if (error)
282	goto out;
283
284	if (vfs_busy(mp, NULL)) {
285	error = EPERM;
286	goto out;
287	}
288
289	mutex_enter(&mp->mnt_updating);
290
291	mp->mnt_flag &= ~MNT_OP_FLAGS;
292	mp->mnt_flag \|= flags & MNT_OP_FLAGS;
293
294	/*
295	* Set the mount level flags.
296	*/
297	if (flags & MNT_RDONLY)
298	mp->mnt_flag \|= MNT_RDONLY;
299	else if (mp->mnt_flag & MNT_RDONLY)
300	mp->mnt_iflag \|= IMNT_WANTRDWR;
301	mp->mnt_flag &= ~MNT_BASIC_FLAGS;
302	mp->mnt_flag \|= flags & MNT_BASIC_FLAGS;
303	error = VFS_MOUNT(mp, path, data, data_len);
304
305	if (error && data != NULL) {
306	int error2;
307
308	/*
309	* Update failed; let's try and see if it was an
310	* export request. For compat with 3.0 and earlier.
311	*/
312	error2 = vfs_hooks_reexport(mp, path, data);
313
314	/*
315	* Only update error code if the export request was
316	* understood but some problem occurred while
317	* processing it.
318	*/
319	if (error2 != EJUSTRETURN)
320	error = error2;
321	}
322
323	if (mp->mnt_iflag & IMNT_WANTRDWR)
324	mp->mnt_flag &= ~MNT_RDONLY;
325	if (error)
326	mp->mnt_flag = saved_flags;
327	mp->mnt_flag &= ~MNT_OP_FLAGS;
328	mp->mnt_iflag &= ~IMNT_WANTRDWR;
329	if ((mp->mnt_flag & (MNT_RDONLY \| MNT_ASYNC)) == `0`) {
330	if ((mp->mnt_iflag & IMNT_ONWORKLIST) == `0`)
331	vfs_syncer_add_to_worklist(mp);
332	} else {
333	if ((mp->mnt_iflag & IMNT_ONWORKLIST) != `0`)
334	vfs_syncer_remove_from_worklist(mp);
335	}
336	mutex_exit(&mp->mnt_updating);
337	vfs_unbusy(mp, false, NULL);
338
339	if ((error == `0`) && !(saved_flags & MNT_EXTATTR) &&
340	(flags & MNT_EXTATTR)) {
341	if (VFS_EXTATTRCTL(mp, EXTATTR_CMD_START,
342	NULL, `0`, NULL) != `0`) {
343	printf("%s: failed to start extattr, error = %d",
344	mp->mnt_stat.f_mntonname, error);
345	mp->mnt_flag &= ~MNT_EXTATTR;
346	}
347	}
348
349	if ((error == `0`) && (saved_flags & MNT_EXTATTR) &&
350	!(flags & MNT_EXTATTR)) {
351	if (VFS_EXTATTRCTL(mp, EXTATTR_CMD_STOP,
352	NULL, `0`, NULL) != `0`) {
353	printf("%s: failed to stop extattr, error = %d",
354	mp->mnt_stat.f_mntonname, error);
355	mp->mnt_flag \|= MNT_RDONLY;
356	}
357	}
358	out:
359	return (error);
360	}
361
362	static int
363	mount_get_vfsops(const char fstype, enum* uio_seg type_seg,
364	struct vfsops **vfsops)
365	{
366	char fstypename[sizeof(((struct statvfs *)NULL)->f_fstypename)];
367	int error;
368
369	if (type_seg == UIO_USERSPACE) {
370	/ Copy file-system type from userspace. /
371	error = copyinstr(fstype, fstypename, sizeof(fstypename), NULL);
372	} else {
373	error = copystr(fstype, fstypename, sizeof(fstypename), NULL);
374	KASSERT(error == `0`);
375	}
376
377	if (error) {
378	/*
379	* Historically, filesystem types were identified by numbers.
380	* If we get an integer for the filesystem type instead of a
381	* string, we check to see if it matches one of the historic
382	* filesystem types.
383	*/
384	u_long fsindex = (u_long)fstype;
385	if (fsindex >= nmountcompatnames \|\|
386	mountcompatnames[fsindex] == NULL)
387	return ENODEV;
388	strlcpy(fstypename, mountcompatnames[fsindex],
389	sizeof(fstypename));
390	}
391
392	/ Accept `ufs' as an alias for `ffs', for compatibility. /
393	if (strcmp(fstypename, "ufs") == `0`)
394	fstypename[`0`] = `'f'`;
395
396	if ((*vfsops = vfs_getopsbyname(fstypename)) != NULL)
397	return `0`;
398
399	/ If we can autoload a vfs module, try again /
400	(void)module_autoload(fstypename, MODULE_CLASS_VFS);
401
402	if ((*vfsops = vfs_getopsbyname(fstypename)) != NULL)
403	return `0`;
404
405	return ENODEV;
406	}
407
408	static int
409	mount_getargs(struct lwp l, struct* vnode vp, const* char path, int* flags,
410	void data, size_t data_len)
411	{
412	struct mount *mp;
413	int error;
414
415	/ If MNT_GETARGS is specified, it should be the only flag. /
416	if (flags & ~MNT_GETARGS)
417	return EINVAL;
418
419	mp = vp->v_mount;
420
421	/ XXX: probably some notion of "can see" here if we want isolation. /
422	error = kauth_authorize_system(l->l_cred, KAUTH_SYSTEM_MOUNT,
423	KAUTH_REQ_SYSTEM_MOUNT_GET, mp, data, NULL);
424	if (error)
425	return error;
426
427	if ((vp->v_vflag & VV_ROOT) == `0`)
428	return EINVAL;
429
430	if (vfs_busy(mp, NULL))
431	return EPERM;
432
433	mutex_enter(&mp->mnt_updating);
434	mp->mnt_flag &= ~MNT_OP_FLAGS;
435	mp->mnt_flag \|= MNT_GETARGS;
436	error = VFS_MOUNT(mp, path, data, data_len);
437	mp->mnt_flag &= ~MNT_OP_FLAGS;
438	mutex_exit(&mp->mnt_updating);
439
440	vfs_unbusy(mp, false, NULL);
441	return (error);
442	}
443
444	int
445	sys___mount50(struct lwp l, const* struct sys___mount50_args uap, register_t retval)
446	{
447	/ {*
448	syscallarg(const char ) type;*
449	syscallarg(const char ) path;*
450	syscallarg(int) flags;
451	syscallarg(void ) data;*
452	syscallarg(size_t) data_len;
453	} /*
454
455	return do_sys_mount(l, SCARG(uap, type), UIO_USERSPACE, SCARG(uap, path),
456	SCARG(uap, flags), SCARG(uap, data), UIO_USERSPACE,
457	SCARG(uap, data_len), retval);
458	}
459
460	int
461	do_sys_mount(struct lwp l, const* char type, enum* uio_seg type_seg,
462	const char path, int* flags, void data, enum* uio_seg data_seg,
463	size_t data_len, register_t *retval)
464	{
465	struct vfsops vfsops = NULL; /* XXX gcc4.8 /
466	struct vnode *vp;
467	void *data_buf = data;
468	bool vfsopsrele = false;
469	size_t alloc_sz = `0`;
470	int error;
471
472	/*
473	* Get vnode to be covered
474	*/
475	error = namei_simple_user(path, NSM_FOLLOW_TRYEMULROOT, &vp);
476	if (error != `0`) {
477	vp = NULL;
478	goto done;
479	}
480
481	if (flags & (MNT_GETARGS \| MNT_UPDATE)) {
482	vfsops = vp->v_mount->mnt_op;
483	} else {
484	/ 'type' is userspace /
485	error = mount_get_vfsops(type, type_seg, &vfsops);
486	if (error != `0`)
487	goto done;
488	vfsopsrele = true;
489	}
490
491	/*
492	* We allow data to be NULL, even for userspace. Some fs's don't need
493	* it. The others will handle NULL.
494	*/
495	if (data != NULL && data_seg == UIO_USERSPACE) {
496	if (data_len == `0`) {
497	/ No length supplied, use default for filesystem /
498	data_len = vfsops->vfs_min_mount_data;
499
500	/*
501	* Hopefully a longer buffer won't make copyin() fail.
502	* For compatibility with 3.0 and earlier.
503	*/
504	if (flags & MNT_UPDATE
505	&& data_len < sizeof (struct mnt_export_args30))
506	data_len = sizeof (struct mnt_export_args30);
507	}
508	if ((data_len == `0`) \|\| (data_len > VFS_MAX_MOUNT_DATA)) {
509	error = EINVAL;
510	goto done;
511	}
512	alloc_sz = data_len;
513	data_buf = kmem_alloc(alloc_sz, KM_SLEEP);
514
515	/ NFS needs the buffer even for mnt_getargs .... /
516	error = copyin(data, data_buf, data_len);
517	if (error != `0`)
518	goto done;
519	}
520
521	if (flags & MNT_GETARGS) {
522	if (data_len == `0`) {
523	error = EINVAL;
524	goto done;
525	}
526	error = mount_getargs(l, vp, path, flags, data_buf, &data_len);
527	if (error != `0`)
528	goto done;
529	if (data_seg == UIO_USERSPACE)
530	error = copyout(data_buf, data, data_len);
531	*retval = data_len;
532	} else if (flags & MNT_UPDATE) {
533	error = mount_update(l, vp, path, flags, data_buf, &data_len);
534	} else {
535	/ Locking is handled internally in mount_domount(). /
536	KASSERT(vfsopsrele == true);
537	error = mount_domount(l, &vp, vfsops, path, flags, data_buf,
538	&data_len);
539	vfsopsrele = false;
540	}
541
542	done:
543	if (vfsopsrele)
544	vfs_delref(vfsops);
545	if (vp != NULL) {
546	vrele(vp);
547	}
548	if (data_buf != data)
549	kmem_free(data_buf, alloc_sz);
550	return (error);
551	}
552
553	/*
554	* Unmount a file system.
555	*
556	* Note: unmount takes a path to the vnode mounted on as argument,
557	* not special file (as before).
558	*/
559	/ ARGSUSED /
560	int
561	sys_unmount(struct lwp l, const* struct sys_unmount_args uap, register_t retval)
562	{
563	/ {*
564	syscallarg(const char ) path;*
565	syscallarg(int) flags;
566	} /*
567	struct vnode *vp;
568	struct mount *mp;
569	int error;
570	struct pathbuf *pb;
571	struct nameidata nd;
572
573	error = pathbuf_copyin(SCARG(uap, path), &pb);
574	if (error) {
575	return error;
576	}
577
578	NDINIT(&nd, LOOKUP, NOFOLLOW \| LOCKLEAF \| TRYEMULROOT, pb);
579	if ((error = namei(&nd)) != `0`) {
580	pathbuf_destroy(pb);
581	return error;
582	}
583	vp = nd.ni_vp;
584	pathbuf_destroy(pb);
585
586	mp = vp->v_mount;
587	atomic_inc_uint(&mp->mnt_refcnt);
588	VOP_UNLOCK(vp);
589
590	error = kauth_authorize_system(l->l_cred, KAUTH_SYSTEM_MOUNT,
591	KAUTH_REQ_SYSTEM_MOUNT_UNMOUNT, mp, NULL, NULL);
592	if (error) {
593	vrele(vp);
594	vfs_destroy(mp);
595	return (error);
596	}
597
598	/*
599	* Don't allow unmounting the root file system.
600	*/
601	if (mp->mnt_flag & MNT_ROOTFS) {
602	vrele(vp);
603	vfs_destroy(mp);
604	return (EINVAL);
605	}
606
607	/*
608	* Must be the root of the filesystem
609	*/
610	if ((vp->v_vflag & VV_ROOT) == `0`) {
611	vrele(vp);
612	vfs_destroy(mp);
613	return (EINVAL);
614	}
615
616	vrele(vp);
617	error = dounmount(mp, SCARG(uap, flags), l);
618	vfs_destroy(mp);
619	return error;
620	}
621
622	/*
623	* Sync each mounted filesystem.
624	*/
625	#ifdef DEBUG
626	int syncprt = `0`;
627	struct ctldebug debug0 = { "syncprt", &syncprt };
628	#endif
629
630	void
631	do_sys_sync(struct lwp *l)
632	{
633	struct mount mp, nmp;
634	int asyncflag;
635
636	mutex_enter(&mountlist_lock);
637	for (mp = TAILQ_FIRST(&mountlist); mp != NULL; mp = nmp) {
638	if (vfs_busy(mp, &nmp)) {
639	continue;
640	}
641	mutex_enter(&mp->mnt_updating);
642	if ((mp->mnt_flag & MNT_RDONLY) == `0`) {
643	asyncflag = mp->mnt_flag & MNT_ASYNC;
644	mp->mnt_flag &= ~MNT_ASYNC;
645	VFS_SYNC(mp, MNT_NOWAIT, l->l_cred);
646	if (asyncflag)
647	mp->mnt_flag \|= MNT_ASYNC;
648	}
649	mutex_exit(&mp->mnt_updating);
650	vfs_unbusy(mp, false, &nmp);
651	}
652	mutex_exit(&mountlist_lock);
653	#ifdef DEBUG
654	if (syncprt)
655	vfs_bufstats();
656	#endif /* DEBUG */
657	}
658
659	/ ARGSUSED /
660	int
661	sys_sync(struct lwp l, const* void v, register_t retval)
662	{
663	do_sys_sync(l);
664	return (`0`);
665	}
666
667
668	/*
669	* Access or change filesystem quotas.
670	*
671	* (this is really 14 different calls bundled into one)
672	*/
673
674	static int
675	do_sys_quotactl_stat(struct mount mp, struct* quotastat *info_u)
676	{
677	struct quotastat info_k;
678	int error;
679
680	/ ensure any padding bytes are cleared /
681	memset(&info_k, `0`, sizeof(info_k));
682
683	error = vfs_quotactl_stat(mp, &info_k);
684	if (error) {
685	return error;
686	}
687
688	return copyout(&info_k, info_u, sizeof(info_k));
689	}
690
691	static int
692	do_sys_quotactl_idtypestat(struct mount mp, int* idtype,
693	struct quotaidtypestat *info_u)
694	{
695	struct quotaidtypestat info_k;
696	int error;
697
698	/ ensure any padding bytes are cleared /
699	memset(&info_k, `0`, sizeof(info_k));
700
701	error = vfs_quotactl_idtypestat(mp, idtype, &info_k);
702	if (error) {
703	return error;
704	}
705
706	return copyout(&info_k, info_u, sizeof(info_k));
707	}
708
709	static int
710	do_sys_quotactl_objtypestat(struct mount mp, int* objtype,
711	struct quotaobjtypestat *info_u)
712	{
713	struct quotaobjtypestat info_k;
714	int error;
715
716	/ ensure any padding bytes are cleared /
717	memset(&info_k, `0`, sizeof(info_k));
718
719	error = vfs_quotactl_objtypestat(mp, objtype, &info_k);
720	if (error) {
721	return error;
722	}
723
724	return copyout(&info_k, info_u, sizeof(info_k));
725	}
726
727	static int
728	do_sys_quotactl_get(struct mount mp, const* struct quotakey *key_u,
729	struct quotaval *val_u)
730	{
731	struct quotakey key_k;
732	struct quotaval val_k;
733	int error;
734
735	/ ensure any padding bytes are cleared /
736	memset(&val_k, `0`, sizeof(val_k));
737
738	error = copyin(key_u, &key_k, sizeof(key_k));
739	if (error) {
740	return error;
741	}
742
743	error = vfs_quotactl_get(mp, &key_k, &val_k);
744	if (error) {
745	return error;
746	}
747
748	return copyout(&val_k, val_u, sizeof(val_k));
749	}
750
751	static int
752	do_sys_quotactl_put(struct mount mp, const* struct quotakey *key_u,
753	const struct quotaval *val_u)
754	{
755	struct quotakey key_k;
756	struct quotaval val_k;
757	int error;
758
759	error = copyin(key_u, &key_k, sizeof(key_k));
760	if (error) {
761	return error;
762	}
763
764	error = copyin(val_u, &val_k, sizeof(val_k));
765	if (error) {
766	return error;
767	}
768
769	return vfs_quotactl_put(mp, &key_k, &val_k);
770	}
771
772	static int
773	do_sys_quotactl_del(struct mount mp, const* struct quotakey *key_u)
774	{
775	struct quotakey key_k;
776	int error;
777
778	error = copyin(key_u, &key_k, sizeof(key_k));
779	if (error) {
780	return error;
781	}
782
783	return vfs_quotactl_del(mp, &key_k);
784	}
785
786	static int
787	do_sys_quotactl_cursoropen(struct mount mp, struct* quotakcursor *cursor_u)
788	{
789	struct quotakcursor cursor_k;
790	int error;
791
792	/ ensure any padding bytes are cleared /
793	memset(&cursor_k, `0`, sizeof(cursor_k));
794
795	error = vfs_quotactl_cursoropen(mp, &cursor_k);
796	if (error) {
797	return error;
798	}
799
800	return copyout(&cursor_k, cursor_u, sizeof(cursor_k));
801	}
802
803	static int
804	do_sys_quotactl_cursorclose(struct mount mp, struct* quotakcursor *cursor_u)
805	{
806	struct quotakcursor cursor_k;
807	int error;
808
809	error = copyin(cursor_u, &cursor_k, sizeof(cursor_k));
810	if (error) {
811	return error;
812	}
813
814	return vfs_quotactl_cursorclose(mp, &cursor_k);
815	}
816
817	static int
818	do_sys_quotactl_cursorskipidtype(struct mount *mp,
819	struct quotakcursor cursor_u, int* idtype)
820	{
821	struct quotakcursor cursor_k;
822	int error;
823
824	error = copyin(cursor_u, &cursor_k, sizeof(cursor_k));
825	if (error) {
826	return error;
827	}
828
829	error = vfs_quotactl_cursorskipidtype(mp, &cursor_k, idtype);
830	if (error) {
831	return error;
832	}
833
834	return copyout(&cursor_k, cursor_u, sizeof(cursor_k));
835	}
836
837	static int
838	do_sys_quotactl_cursorget(struct mount mp, struct* quotakcursor *cursor_u,
839	struct quotakey keys_u, struct* quotaval vals_u, unsigned* maxnum,
840	unsigned *ret_u)
841	{
842	#define CGET_STACK_MAX 8
843	struct quotakcursor cursor_k;
844	struct quotakey stackkeys[CGET_STACK_MAX];
845	struct quotaval stackvals[CGET_STACK_MAX];
846	struct quotakey *keys_k;
847	struct quotaval *vals_k;
848	unsigned ret_k;
849	int error;
850
851	if (maxnum > `128`) {
852	maxnum = `128`;
853	}
854
855	error = copyin(cursor_u, &cursor_k, sizeof(cursor_k));
856	if (error) {
857	return error;
858	}
859
860	if (maxnum <= CGET_STACK_MAX) {
861	keys_k = stackkeys;
862	vals_k = stackvals;
863	/ ensure any padding bytes are cleared /
864	memset(keys_k, `0`, maxnum * sizeof(keys_k[`0`]));
865	memset(vals_k, `0`, maxnum * sizeof(vals_k[`0`]));
866	} else {
867	keys_k = kmem_zalloc(maxnum * sizeof(keys_k[`0`]), KM_SLEEP);
868	vals_k = kmem_zalloc(maxnum * sizeof(vals_k[`0`]), KM_SLEEP);
869	}
870
871	error = vfs_quotactl_cursorget(mp, &cursor_k, keys_k, vals_k, maxnum,
872	&ret_k);
873	if (error) {
874	goto fail;
875	}
876
877	error = copyout(keys_k, keys_u, ret_k * sizeof(keys_k[`0`]));
878	if (error) {
879	goto fail;
880	}
881
882	error = copyout(vals_k, vals_u, ret_k * sizeof(vals_k[`0`]));
883	if (error) {
884	goto fail;
885	}
886
887	error = copyout(&ret_k, ret_u, sizeof(ret_k));
888	if (error) {
889	goto fail;
890	}
891
892	/ do last to maximize the chance of being able to recover a failure /
893	error = copyout(&cursor_k, cursor_u, sizeof(cursor_k));
894
895	fail:
896	if (keys_k != stackkeys) {
897	kmem_free(keys_k, maxnum * sizeof(keys_k[`0`]));
898	}
899	if (vals_k != stackvals) {
900	kmem_free(vals_k, maxnum * sizeof(vals_k[`0`]));
901	}
902	return error;
903	}
904
905	static int
906	do_sys_quotactl_cursoratend(struct mount mp, struct* quotakcursor *cursor_u,
907	int *ret_u)
908	{
909	struct quotakcursor cursor_k;
910	int ret_k;
911	int error;
912
913	error = copyin(cursor_u, &cursor_k, sizeof(cursor_k));
914	if (error) {
915	return error;
916	}
917
918	error = vfs_quotactl_cursoratend(mp, &cursor_k, &ret_k);
919	if (error) {
920	return error;
921	}
922
923	error = copyout(&ret_k, ret_u, sizeof(ret_k));
924	if (error) {
925	return error;
926	}
927
928	return copyout(&cursor_k, cursor_u, sizeof(cursor_k));
929	}
930
931	static int
932	do_sys_quotactl_cursorrewind(struct mount mp, struct* quotakcursor *cursor_u)
933	{
934	struct quotakcursor cursor_k;
935	int error;
936
937	error = copyin(cursor_u, &cursor_k, sizeof(cursor_k));
938	if (error) {
939	return error;
940	}
941
942	error = vfs_quotactl_cursorrewind(mp, &cursor_k);
943	if (error) {
944	return error;
945	}
946
947	return copyout(&cursor_k, cursor_u, sizeof(cursor_k));
948	}
949
950	static int
951	do_sys_quotactl_quotaon(struct mount mp, int* idtype, const char *path_u)
952	{
953	char *path_k;
954	int error;
955
956	/ XXX this should probably be a struct pathbuf /
957	path_k = PNBUF_GET();
958	error = copyin(path_u, path_k, PATH_MAX);
959	if (error) {
960	PNBUF_PUT(path_k);
961	return error;
962	}
963
964	error = vfs_quotactl_quotaon(mp, idtype, path_k);
965
966	PNBUF_PUT(path_k);
967	return error;
968	}
969
/*
 * QUOTACTL_QUOTAOFF: thin wrapper around vfs_quotactl_quotaoff() for
 * id type `idtype' on mp.
 */
static int
do_sys_quotactl_quotaoff(struct mount *mp, int idtype)
{
	return vfs_quotactl_quotaoff(mp, idtype);
}
975
976	int
977	do_sys_quotactl(const char path_u, const* struct quotactl_args *args)
978	{
979	struct mount *mp;
980	struct vnode *vp;
981	int error;
982
983	error = namei_simple_user(path_u, NSM_FOLLOW_TRYEMULROOT, &vp);
984	if (error != `0`)
985	return (error);
986	mp = vp->v_mount;
987
988	switch (args->qc_op) {
989	case QUOTACTL_STAT:
990	error = do_sys_quotactl_stat(mp, args->u.stat.qc_info);
991	break;
992	case QUOTACTL_IDTYPESTAT:
993	error = do_sys_quotactl_idtypestat(mp,
994	args->u.idtypestat.qc_idtype,
995	args->u.idtypestat.qc_info);
996	break;
997	case QUOTACTL_OBJTYPESTAT:
998	error = do_sys_quotactl_objtypestat(mp,
999	args->u.objtypestat.qc_objtype,
1000	args->u.objtypestat.qc_info);
1001	break;
1002	case QUOTACTL_GET:
1003	error = do_sys_quotactl_get(mp,
1004	args->u.get.qc_key,
1005	args->u.get.qc_val);
1006	break;
1007	case QUOTACTL_PUT:
1008	error = do_sys_quotactl_put(mp,
1009	args->u.put.qc_key,
1010	args->u.put.qc_val);
1011	break;
1012	case QUOTACTL_DEL:
1013	error = do_sys_quotactl_del(mp, args->u.del.qc_key);
1014	break;
1015	case QUOTACTL_CURSOROPEN:
1016	error = do_sys_quotactl_cursoropen(mp,
1017	args->u.cursoropen.qc_cursor);
1018	break;
1019	case QUOTACTL_CURSORCLOSE:
1020	error = do_sys_quotactl_cursorclose(mp,
1021	args->u.cursorclose.qc_cursor);
1022	break;
1023	case QUOTACTL_CURSORSKIPIDTYPE:
1024	error = do_sys_quotactl_cursorskipidtype(mp,
1025	args->u.cursorskipidtype.qc_cursor,
1026	args->u.cursorskipidtype.qc_idtype);
1027	break;
1028	case QUOTACTL_CURSORGET:
1029	error = do_sys_quotactl_cursorget(mp,
1030	args->u.cursorget.qc_cursor,
1031	args->u.cursorget.qc_keys,
1032	args->u.cursorget.qc_vals,
1033	args->u.cursorget.qc_maxnum,
1034	args->u.cursorget.qc_ret);
1035	break;
1036	case QUOTACTL_CURSORATEND:
1037	error = do_sys_quotactl_cursoratend(mp,
1038	args->u.cursoratend.qc_cursor,
1039	args->u.cursoratend.qc_ret);
1040	break;
1041	case QUOTACTL_CURSORREWIND:
1042	error = do_sys_quotactl_cursorrewind(mp,
1043	args->u.cursorrewind.qc_cursor);
1044	break;
1045	case QUOTACTL_QUOTAON:
1046	error = do_sys_quotactl_quotaon(mp,
1047	args->u.quotaon.qc_idtype,
1048	args->u.quotaon.qc_quotafile);
1049	break;
1050	case QUOTACTL_QUOTAOFF:
1051	error = do_sys_quotactl_quotaoff(mp,
1052	args->u.quotaoff.qc_idtype);
1053	break;
1054	default:
1055	error = EINVAL;
1056	break;
1057	}
1058
1059	vrele(vp);
1060	return error;
1061	}
1062
1063	/ ARGSUSED /
1064	int
1065	sys___quotactl(struct lwp l, const* struct sys___quotactl_args *uap,
1066	register_t *retval)
1067	{
1068	/ {*
1069	syscallarg(const char ) path;*
1070	syscallarg(struct quotactl_args ) args;*
1071	} /*
1072	struct quotactl_args args;
1073	int error;
1074
1075	error = copyin(SCARG(uap, args), &args, sizeof(args));
1076	if (error) {
1077	return error;
1078	}
1079
1080	return do_sys_quotactl(SCARG(uap, path), &args);
1081	}
1082
1083	int
1084	dostatvfs(struct mount mp, struct* statvfs sp, struct* lwp l, int* flags,
1085	int root)
1086	{
1087	struct cwdinfo *cwdi = l->l_proc->p_cwdi;
1088	int error = `0`;
1089
1090	/*
1091	* If MNT_NOWAIT or MNT_LAZY is specified, do not
1092	* refresh the fsstat cache. MNT_WAIT or MNT_LAZY
1093	* overrides MNT_NOWAIT.
1094	*/
1095	if (flags == MNT_NOWAIT \|\| flags == MNT_LAZY \|\|
1096	(flags != MNT_WAIT && flags != `0`)) {
1097	memcpy(sp, &mp->mnt_stat, sizeof(*sp));
1098	goto done;
1099	}
1100
1101	/ Get the filesystem stats now /
1102	memset(sp, `0`, sizeof(*sp));
1103	if ((error = VFS_STATVFS(mp, sp)) != `0`) {
1104	return error;
1105	}
1106
1107	if (cwdi->cwdi_rdir == NULL)
1108	(void)memcpy(&mp->mnt_stat, sp, sizeof(mp->mnt_stat));
1109	done:
1110	if (cwdi->cwdi_rdir != NULL) {
1111	size_t len;
1112	char *bp;
1113	char c;
1114	char *path = PNBUF_GET();
1115
1116	bp = path + MAXPATHLEN;
1117	*--bp = `'\0'`;
1118	rw_enter(&cwdi->cwdi_lock, RW_READER);
1119	error = getcwd_common(cwdi->cwdi_rdir, rootvnode, &bp, path,
1120	MAXPATHLEN / `2`, `0`, l);
1121	rw_exit(&cwdi->cwdi_lock);
1122	if (error) {
1123	PNBUF_PUT(path);
1124	return error;
1125	}
1126	len = strlen(bp);
1127	if (len != `1`) {
1128	/*
1129	* for mount points that are below our root, we can see
1130	* them, so we fix up the pathname and return them. The
1131	* rest we cannot see, so we don't allow viewing the
1132	* data.
1133	*/
1134	if (strncmp(bp, sp->f_mntonname, len) == `0` &&
1135	((c = sp->f_mntonname[len]) == `'/'` \|\| c == `'\0'`)) {
1136	(void)strlcpy(sp->f_mntonname,
1137	c == `'\0'` ? "/" : &sp->f_mntonname[len],
1138	sizeof(sp->f_mntonname));
1139	} else {
1140	if (root)
1141	(void)strlcpy(sp->f_mntonname, "/",
1142	sizeof(sp->f_mntonname));
1143	else
1144	error = EPERM;
1145	}
1146	}
1147	PNBUF_PUT(path);
1148	}
1149	sp->f_flag = mp->mnt_flag & MNT_VISFLAGMASK;
1150	return error;
1151	}
1152
1153	/*
1154	* Get filesystem statistics by path.
1155	*/
1156	int
1157	do_sys_pstatvfs(struct lwp l, const* char path, int* flags, struct statvfs *sb)
1158	{
1159	struct mount *mp;
1160	int error;
1161	struct vnode *vp;
1162
1163	error = namei_simple_user(path, NSM_FOLLOW_TRYEMULROOT, &vp);
1164	if (error != `0`)
1165	return error;
1166	mp = vp->v_mount;
1167	error = dostatvfs(mp, sb, l, flags, `1`);
1168	vrele(vp);
1169	return error;
1170	}
1171
1172	/ ARGSUSED /
1173	int
1174	sys_statvfs1(struct lwp l, const* struct sys_statvfs1_args uap, register_t retval)
1175	{
1176	/ {*
1177	syscallarg(const char ) path;*
1178	syscallarg(struct statvfs ) buf;*
1179	syscallarg(int) flags;
1180	} /*
1181	struct statvfs *sb;
1182	int error;
1183
1184	sb = STATVFSBUF_GET();
1185	error = do_sys_pstatvfs(l, SCARG(uap, path), SCARG(uap, flags), sb);
1186	if (error == `0`)
1187	error = copyout(sb, SCARG(uap, buf), sizeof(*sb));
1188	STATVFSBUF_PUT(sb);
1189	return error;
1190	}
1191
1192	/*
1193	* Get filesystem statistics by fd.
1194	*/
1195	int
1196	do_sys_fstatvfs(struct lwp l, int* fd, int flags, struct statvfs *sb)
1197	{
1198	file_t *fp;
1199	struct mount *mp;
1200	int error;
1201
1202	/ fd_getvnode() will use the descriptor for us /
1203	if ((error = fd_getvnode(fd, &fp)) != `0`)
1204	return (error);
1205	mp = fp->f_vnode->v_mount;
1206	error = dostatvfs(mp, sb, curlwp, flags, `1`);
1207	fd_putfile(fd);
1208	return error;
1209	}
1210
1211	/ ARGSUSED /
1212	int
1213	sys_fstatvfs1(struct lwp l, const* struct sys_fstatvfs1_args uap, register_t retval)
1214	{
1215	/ {*
1216	syscallarg(int) fd;
1217	syscallarg(struct statvfs ) buf;*
1218	syscallarg(int) flags;
1219	} /*
1220	struct statvfs *sb;
1221	int error;
1222
1223	sb = STATVFSBUF_GET();
1224	error = do_sys_fstatvfs(l, SCARG(uap, fd), SCARG(uap, flags), sb);
1225	if (error == `0`)
1226	error = copyout(sb, SCARG(uap, buf), sizeof(*sb));
1227	STATVFSBUF_PUT(sb);
1228	return error;
1229	}
1230
1231
1232	/*
1233	* Get statistics on all filesystems.
1234	*/
1235	int
1236	do_sys_getvfsstat(struct lwp l, void* sfsp, size_t bufsize, int* flags,
1237	int (copyfn)(const* void , void* *, size_t), size_t entry_sz,
1238	register_t *retval)
1239	{
1240	int root = `0`;
1241	struct proc *p = l->l_proc;
1242	struct mount mp, nmp;
1243	struct statvfs *sb;
1244	size_t count, maxcount;
1245	int error = `0`;
1246
1247	sb = STATVFSBUF_GET();
1248	maxcount = bufsize / entry_sz;
1249	mutex_enter(&mountlist_lock);
1250	count = `0`;
1251	for (mp = TAILQ_FIRST(&mountlist); mp != NULL; mp = nmp) {
1252	if (vfs_busy(mp, &nmp)) {
1253	continue;
1254	}
1255	if (sfsp && count < maxcount) {
1256	error = dostatvfs(mp, sb, l, flags, `0`);
1257	if (error) {
1258	vfs_unbusy(mp, false, &nmp);
1259	error = `0`;
1260	continue;
1261	}
1262	error = copyfn(sb, sfsp, entry_sz);
1263	if (error) {
1264	vfs_unbusy(mp, false, NULL);
1265	goto out;
1266	}
1267	sfsp = (char *)sfsp + entry_sz;
1268	root \|= strcmp(sb->f_mntonname, "/") == `0`;
1269	}
1270	count++;
1271	vfs_unbusy(mp, false, &nmp);
1272	}
1273	mutex_exit(&mountlist_lock);
1274
1275	if (root == `0` && p->p_cwdi->cwdi_rdir) {
1276	/*
1277	* fake a root entry
1278	*/
1279	error = dostatvfs(p->p_cwdi->cwdi_rdir->v_mount,
1280	sb, l, flags, `1`);
1281	if (error != `0`)
1282	goto out;
1283	if (sfsp) {
1284	error = copyfn(sb, sfsp, entry_sz);
1285	if (error != `0`)
1286	goto out;
1287	}
1288	count++;
1289	}
1290	if (sfsp && count > maxcount)
1291	*retval = maxcount;
1292	else
1293	*retval = count;
1294	out:
1295	STATVFSBUF_PUT(sb);
1296	return error;
1297	}
1298
1299	int
1300	sys_getvfsstat(struct lwp l, const* struct sys_getvfsstat_args uap, register_t retval)
1301	{
1302	/ {*
1303	syscallarg(struct statvfs ) buf;*
1304	syscallarg(size_t) bufsize;
1305	syscallarg(int) flags;
1306	} /*
1307
1308	return do_sys_getvfsstat(l, SCARG(uap, buf), SCARG(uap, bufsize),
1309	SCARG(uap, flags), copyout, sizeof (struct statvfs), retval);
1310	}
1311
1312	/*
1313	* Change current working directory to a given file descriptor.
1314	*/
1315	/ ARGSUSED /
1316	int
1317	sys_fchdir(struct lwp l, const* struct sys_fchdir_args uap, register_t retval)
1318	{
1319	/ {*
1320	syscallarg(int) fd;
1321	} /*
1322	struct proc *p = l->l_proc;
1323	struct cwdinfo *cwdi;
1324	struct vnode vp, tdp;
1325	struct mount *mp;
1326	file_t *fp;
1327	int error, fd;
1328
1329	/ fd_getvnode() will use the descriptor for us /
1330	fd = SCARG(uap, fd);
1331	if ((error = fd_getvnode(fd, &fp)) != `0`)
1332	return (error);
1333	vp = fp->f_vnode;
1334
1335	vref(vp);
1336	vn_lock(vp, LK_EXCLUSIVE \| LK_RETRY);
1337	if (vp->v_type != VDIR)
1338	error = ENOTDIR;
1339	else
1340	error = VOP_ACCESS(vp, VEXEC, l->l_cred);
1341	if (error) {
1342	vput(vp);
1343	goto out;
1344	}
1345	while ((mp = vp->v_mountedhere) != NULL) {
1346	error = vfs_busy(mp, NULL);
1347	vput(vp);
1348	if (error != `0`)
1349	goto out;
1350	error = VFS_ROOT(mp, &tdp);
1351	vfs_unbusy(mp, false, NULL);
1352	if (error)
1353	goto out;
1354	vp = tdp;
1355	}
1356	VOP_UNLOCK(vp);
1357
1358	/*
1359	* Disallow changing to a directory not under the process's
1360	* current root directory (if there is one).
1361	*/
1362	cwdi = p->p_cwdi;
1363	rw_enter(&cwdi->cwdi_lock, RW_WRITER);
1364	if (cwdi->cwdi_rdir && !vn_isunder(vp, NULL, l)) {
1365	vrele(vp);
1366	error = EPERM; / operation not permitted /
1367	} else {
1368	vrele(cwdi->cwdi_cdir);
1369	cwdi->cwdi_cdir = vp;
1370	}
1371	rw_exit(&cwdi->cwdi_lock);
1372
1373	out:
1374	fd_putfile(fd);
1375	return (error);
1376	}
1377
1378	/*
1379	* Change this process's notion of the root directory to a given file
1380	* descriptor.
1381	*/
1382	int
1383	sys_fchroot(struct lwp l, const* struct sys_fchroot_args uap, register_t retval)
1384	{
1385	struct proc *p = l->l_proc;
1386	struct vnode *vp;
1387	file_t *fp;
1388	int error, fd = SCARG(uap, fd);
1389
1390	if ((error = kauth_authorize_system(l->l_cred, KAUTH_SYSTEM_CHROOT,
1391	KAUTH_REQ_SYSTEM_CHROOT_FCHROOT, NULL, NULL, NULL)) != `0`)
1392	return error;
1393	/ fd_getvnode() will use the descriptor for us /
1394	if ((error = fd_getvnode(fd, &fp)) != `0`)
1395	return error;
1396	vp = fp->f_vnode;
1397	vn_lock(vp, LK_EXCLUSIVE \| LK_RETRY);
1398	if (vp->v_type != VDIR)
1399	error = ENOTDIR;
1400	else
1401	error = VOP_ACCESS(vp, VEXEC, l->l_cred);
1402	VOP_UNLOCK(vp);
1403	if (error)
1404	goto out;
1405	vref(vp);
1406
1407	change_root(p->p_cwdi, vp, l);
1408
1409	out:
1410	fd_putfile(fd);
1411	return (error);
1412	}
1413
1414	/*
1415	* Change current working directory (``.'').
1416	*/
1417	/ ARGSUSED /
1418	int
1419	sys_chdir(struct lwp l, const* struct sys_chdir_args uap, register_t retval)
1420	{
1421	/ {*
1422	syscallarg(const char ) path;*
1423	} /*
1424	struct proc *p = l->l_proc;
1425	struct cwdinfo *cwdi;
1426	int error;
1427	struct vnode *vp;
1428
1429	if ((error = chdir_lookup(SCARG(uap, path), UIO_USERSPACE,
1430	&vp, l)) != `0`)
1431	return (error);
1432	cwdi = p->p_cwdi;
1433	rw_enter(&cwdi->cwdi_lock, RW_WRITER);
1434	vrele(cwdi->cwdi_cdir);
1435	cwdi->cwdi_cdir = vp;
1436	rw_exit(&cwdi->cwdi_lock);
1437	return (`0`);
1438	}
1439
1440	/*
1441	* Change notion of root (``/'') directory.
1442	*/
1443	/ ARGSUSED /
1444	int
1445	sys_chroot(struct lwp l, const* struct sys_chroot_args uap, register_t retval)
1446	{
1447	/ {*
1448	syscallarg(const char ) path;*
1449	} /*
1450	struct proc *p = l->l_proc;
1451	int error;
1452	struct vnode *vp;
1453
1454	if ((error = kauth_authorize_system(l->l_cred, KAUTH_SYSTEM_CHROOT,
1455	KAUTH_REQ_SYSTEM_CHROOT_CHROOT, NULL, NULL, NULL)) != `0`)
1456	return (error);
1457	if ((error = chdir_lookup(SCARG(uap, path), UIO_USERSPACE,
1458	&vp, l)) != `0`)
1459	return (error);
1460
1461	change_root(p->p_cwdi, vp, l);
1462
1463	return (`0`);
1464	}
1465
1466	/*
1467	* Common routine for chroot and fchroot.
1468	* NB: callers need to properly authorize the change root operation.
1469	*/
1470	void
1471	change_root(struct cwdinfo cwdi, struct* vnode vp, struct* lwp *l)
1472	{
1473	struct proc *p = l->l_proc;
1474	kauth_cred_t ncred;
1475
1476	ncred = kauth_cred_alloc();
1477
1478	rw_enter(&cwdi->cwdi_lock, RW_WRITER);
1479	if (cwdi->cwdi_rdir != NULL)
1480	vrele(cwdi->cwdi_rdir);
1481	cwdi->cwdi_rdir = vp;
1482
1483	/*
1484	* Prevent escaping from chroot by putting the root under
1485	* the working directory. Silently chdir to / if we aren't
1486	* already there.
1487	*/
1488	if (!vn_isunder(cwdi->cwdi_cdir, vp, l)) {
1489	/*
1490	* XXX would be more failsafe to change directory to a
1491	* deadfs node here instead
1492	*/
1493	vrele(cwdi->cwdi_cdir);
1494	vref(vp);
1495	cwdi->cwdi_cdir = vp;
1496	}
1497	rw_exit(&cwdi->cwdi_lock);
1498
1499	/ Get a write lock on the process credential. /
1500	proc_crmod_enter();
1501
1502	kauth_cred_clone(p->p_cred, ncred);
1503	kauth_proc_chroot(ncred, p->p_cwdi);
1504
1505	/ Broadcast our credentials to the process and other LWPs. /
1506	proc_crmod_leave(ncred, p->p_cred, true);
1507	}
1508
1509	/*
1510	* Common routine for chroot and chdir.
1511	* XXX "where" should be enum uio_seg
1512	*/
1513	int
1514	chdir_lookup(const char path, int* where, struct vnode vpp, struct** lwp *l)
1515	{
1516	struct pathbuf *pb;
1517	struct nameidata nd;
1518	int error;
1519
1520	error = pathbuf_maybe_copyin(path, where, &pb);
1521	if (error) {
1522	return error;
1523	}
1524	NDINIT(&nd, LOOKUP, FOLLOW \| LOCKLEAF \| TRYEMULROOT, pb);
1525	if ((error = namei(&nd)) != `0`) {
1526	pathbuf_destroy(pb);
1527	return error;
1528	}
1529	*vpp = nd.ni_vp;
1530	pathbuf_destroy(pb);
1531
1532	if ((*vpp)->v_type != VDIR)
1533	error = ENOTDIR;
1534	else
1535	error = VOP_ACCESS(*vpp, VEXEC, l->l_cred);
1536
1537	if (error)
1538	vput(*vpp);
1539	else
1540	VOP_UNLOCK(*vpp);
1541	return (error);
1542	}
1543
1544	/*
1545	* Internals of sys_open - path has already been converted into a pathbuf
1546	* (so we can easily reuse this function from other parts of the kernel,
1547	* like posix_spawn post-processing).
1548	*/
1549	int
1550	do_open(lwp_t l, struct* vnode dvp, struct* pathbuf pb, int* open_flags,
1551	int open_mode, int *fd)
1552	{
1553	struct proc *p = l->l_proc;
1554	struct cwdinfo *cwdi = p->p_cwdi;
1555	file_t *fp;
1556	struct vnode *vp;
1557	int flags, cmode;
1558	int indx, error;
1559	struct nameidata nd;
1560
1561	if (open_flags & O_SEARCH) {
1562	open_flags &= ~(int)O_SEARCH;
1563	}
1564
1565	flags = FFLAGS(open_flags);
1566	if ((flags & (FREAD \| FWRITE)) == `0`)
1567	return EINVAL;
1568
1569	if ((error = fd_allocfile(&fp, &indx)) != `0`) {
1570	return error;
1571	}
1572
1573	/ We're going to read cwdi->cwdi_cmask unlocked here. /
1574	cmode = ((open_mode &~ cwdi->cwdi_cmask) & ALLPERMS) &~ S_ISTXT;
1575	NDINIT(&nd, LOOKUP, FOLLOW \| TRYEMULROOT, pb);
1576	if (dvp != NULL)
1577	NDAT(&nd, dvp);
1578
1579	l->l_dupfd = -indx - `1`; / XXX check for fdopen /
1580	if ((error = vn_open(&nd, flags, cmode)) != `0`) {
1581	fd_abort(p, fp, indx);
1582	if ((error == EDUPFD \|\| error == EMOVEFD) &&
1583	l->l_dupfd >= `0` && / XXX from fdopen /
1584	(error =
1585	fd_dupopen(l->l_dupfd, &indx, flags, error)) == `0`) {
1586	*fd = indx;
1587	return `0`;
1588	}
1589	if (error == ERESTART)
1590	error = EINTR;
1591	return error;
1592	}
1593
1594	l->l_dupfd = `0`;
1595	vp = nd.ni_vp;
1596
1597	if ((error = open_setfp(l, fp, vp, indx, flags)))
1598	return error;
1599
1600	VOP_UNLOCK(vp);
1601	*fd = indx;
1602	fd_affix(p, fp, indx);
1603	return `0`;
1604	}
1605
1606	int
1607	fd_open(const char path, int* open_flags, int open_mode, int *fd)
1608	{
1609	struct pathbuf *pb;
1610	int error, oflags;
1611
1612	oflags = FFLAGS(open_flags);
1613	if ((oflags & (FREAD \| FWRITE)) == `0`)
1614	return EINVAL;
1615
1616	pb = pathbuf_create(path);
1617	if (pb == NULL)
1618	return ENOMEM;
1619
1620	error = do_open(curlwp, NULL, pb, open_flags, open_mode, fd);
1621	pathbuf_destroy(pb);
1622
1623	return error;
1624	}
1625
1626	/*
1627	* Check permissions, allocate an open file structure,
1628	* and call the device open routine if any.
1629	*/
1630	static int
1631	do_sys_openat(lwp_t l, int* fdat, const char path, int* flags,
1632	int mode, int *fd)
1633	{
1634	file_t *dfp = NULL;
1635	struct vnode *dvp = NULL;
1636	struct pathbuf *pb;
1637	int error;
1638
1639	#ifdef COMPAT_10 /* XXX: and perhaps later */
1640	if (path == NULL) {
1641	pb = pathbuf_create(".");
1642	if (pb == NULL)
1643	return ENOMEM;
1644	} else
1645	#endif
1646	{
1647	error = pathbuf_copyin(path, &pb);
1648	if (error)
1649	return error;
1650	}
1651
1652	if (fdat != AT_FDCWD) {
1653	/ fd_getvnode() will use the descriptor for us /
1654	if ((error = fd_getvnode(fdat, &dfp)) != `0`)
1655	goto out;
1656
1657	dvp = dfp->f_vnode;
1658	}
1659
1660	error = do_open(l, dvp, pb, flags, mode, fd);
1661
1662	if (dfp != NULL)
1663	fd_putfile(fdat);
1664	out:
1665	pathbuf_destroy(pb);
1666	return error;
1667	}
1668
1669	int
1670	sys_open(struct lwp l, const* struct sys_open_args uap, register_t retval)
1671	{
1672	/ {*
1673	syscallarg(const char ) path;*
1674	syscallarg(int) flags;
1675	syscallarg(int) mode;
1676	} /*
1677	int error;
1678	int fd;
1679
1680	error = do_sys_openat(l, AT_FDCWD, SCARG(uap, path),
1681	SCARG(uap, flags), SCARG(uap, mode), &fd);
1682
1683	if (error == `0`)
1684	*retval = fd;
1685
1686	return error;
1687	}
1688
1689	int
1690	sys_openat(struct lwp l, const* struct sys_openat_args uap, register_t retval)
1691	{
1692	/ {*
1693	syscallarg(int) fd;
1694	syscallarg(const char ) path;*
1695	syscallarg(int) oflags;
1696	syscallarg(int) mode;
1697	} /*
1698	int error;
1699	int fd;
1700
1701	error = do_sys_openat(l, SCARG(uap, fd), SCARG(uap, path),
1702	SCARG(uap, oflags), SCARG(uap, mode), &fd);
1703
1704	if (error == `0`)
1705	*retval = fd;
1706
1707	return error;
1708	}
1709
1710	static void
1711	vfs__fhfree(fhandle_t *fhp)
1712	{
1713	size_t fhsize;
1714
1715	fhsize = FHANDLE_SIZE(fhp);
1716	kmem_free(fhp, fhsize);
1717	}
1718
1719	/*
1720	* vfs_composefh: compose a filehandle.
1721	*/
1722
1723	int
1724	vfs_composefh(struct vnode vp, fhandle_t fhp, size_t *fh_size)
1725	{
1726	struct mount *mp;
1727	struct fid *fidp;
1728	int error;
1729	size_t needfhsize;
1730	size_t fidsize;
1731
1732	mp = vp->v_mount;
1733	fidp = NULL;
1734	if (*fh_size < FHANDLE_SIZE_MIN) {
1735	fidsize = `0`;
1736	} else {
1737	fidsize = *fh_size - offsetof(fhandle_t, fh_fid);
1738	if (fhp != NULL) {
1739	memset(fhp, `0`, *fh_size);
1740	fhp->fh_fsid = mp->mnt_stat.f_fsidx;
1741	fidp = &fhp->fh_fid;
1742	}
1743	}
1744	error = VFS_VPTOFH(vp, fidp, &fidsize);
1745	needfhsize = FHANDLE_SIZE_FROM_FILEID_SIZE(fidsize);
1746	if (error == `0` && *fh_size < needfhsize) {
1747	error = E2BIG;
1748	}
1749	*fh_size = needfhsize;
1750	return error;
1751	}
1752
1753	int
1754	vfs_composefh_alloc(struct vnode vp, fhandle_t *fhpp)
1755	{
1756	struct mount *mp;
1757	fhandle_t *fhp;
1758	size_t fhsize;
1759	size_t fidsize;
1760	int error;
1761
1762	mp = vp->v_mount;
1763	fidsize = `0`;
1764	error = VFS_VPTOFH(vp, NULL, &fidsize);
1765	KASSERT(error != `0`);
1766	if (error != E2BIG) {
1767	goto out;
1768	}
1769	fhsize = FHANDLE_SIZE_FROM_FILEID_SIZE(fidsize);
1770	fhp = kmem_zalloc(fhsize, KM_SLEEP);
1771	if (fhp == NULL) {
1772	error = ENOMEM;
1773	goto out;
1774	}
1775	fhp->fh_fsid = mp->mnt_stat.f_fsidx;
1776	error = VFS_VPTOFH(vp, &fhp->fh_fid, &fidsize);
1777	if (error == `0`) {
1778	KASSERT((FHANDLE_SIZE(fhp) == fhsize &&
1779	FHANDLE_FILEID(fhp)->fid_len == fidsize));
1780	*fhpp = fhp;
1781	} else {
1782	kmem_free(fhp, fhsize);
1783	}
1784	out:
1785	return error;
1786	}
1787
1788	void
1789	vfs_composefh_free(fhandle_t *fhp)
1790	{
1791
1792	vfs__fhfree(fhp);
1793	}
1794
1795	/*
1796	* vfs_fhtovp: lookup a vnode by a filehandle.
1797	*/
1798
1799	int
1800	vfs_fhtovp(fhandle_t fhp, struct* vnode **vpp)
1801	{
1802	struct mount *mp;
1803	int error;
1804
1805	*vpp = NULL;
1806	mp = vfs_getvfs(FHANDLE_FSID(fhp));
1807	if (mp == NULL) {
1808	error = ESTALE;
1809	goto out;
1810	}
1811	if (mp->mnt_op->vfs_fhtovp == NULL) {
1812	error = EOPNOTSUPP;
1813	goto out;
1814	}
1815	error = VFS_FHTOVP(mp, FHANDLE_FILEID(fhp), vpp);
1816	out:
1817	return error;
1818	}
1819
1820	/*
1821	* vfs_copyinfh_alloc: allocate and copyin a filehandle, given
1822	* the needed size.
1823	*/
1824
1825	int
1826	vfs_copyinfh_alloc(const void ufhp, size_t fhsize, fhandle_t *fhpp)
1827	{
1828	fhandle_t *fhp;
1829	int error;
1830
1831	if (fhsize > FHANDLE_SIZE_MAX) {
1832	return EINVAL;
1833	}
1834	if (fhsize < FHANDLE_SIZE_MIN) {
1835	return EINVAL;
1836	}
1837	again:
1838	fhp = kmem_alloc(fhsize, KM_SLEEP);
1839	if (fhp == NULL) {
1840	return ENOMEM;
1841	}
1842	error = copyin(ufhp, fhp, fhsize);
1843	if (error == `0`) {
1844	/ XXX this check shouldn't be here /
1845	if (FHANDLE_SIZE(fhp) == fhsize) {
1846	*fhpp = fhp;
1847	return `0`;
1848	} else if (fhsize == NFSX_V2FH && FHANDLE_SIZE(fhp) < fhsize) {
1849	/*
1850	* a kludge for nfsv2 padded handles.
1851	*/
1852	size_t sz;
1853
1854	sz = FHANDLE_SIZE(fhp);
1855	kmem_free(fhp, fhsize);
1856	fhsize = sz;
1857	goto again;
1858	} else {
1859	/*
1860	* userland told us wrong size.
1861	*/
1862	error = EINVAL;
1863	}
1864	}
1865	kmem_free(fhp, fhsize);
1866	return error;
1867	}
1868
1869	void
1870	vfs_copyinfh_free(fhandle_t *fhp)
1871	{
1872
1873	vfs__fhfree(fhp);
1874	}
1875
1876	/*
1877	* Get file handle system call
1878	*/
1879	int
1880	sys___getfh30(struct lwp l, const* struct sys___getfh30_args uap, register_t retval)
1881	{
1882	/ {*
1883	syscallarg(char ) fname;*
1884	syscallarg(fhandle_t ) fhp;*
1885	syscallarg(size_t ) fh_size;*
1886	} /*
1887	struct vnode *vp;
1888	fhandle_t *fh;
1889	int error;
1890	struct pathbuf *pb;
1891	struct nameidata nd;
1892	size_t sz;
1893	size_t usz;
1894
1895	/*
1896	* Must be super user
1897	*/
1898	error = kauth_authorize_system(l->l_cred, KAUTH_SYSTEM_FILEHANDLE,
1899	`0`, NULL, NULL, NULL);
1900	if (error)
1901	return (error);
1902
1903	error = pathbuf_copyin(SCARG(uap, fname), &pb);
1904	if (error) {
1905	return error;
1906	}
1907	NDINIT(&nd, LOOKUP, FOLLOW \| LOCKLEAF \| TRYEMULROOT, pb);
1908	error = namei(&nd);
1909	if (error) {
1910	pathbuf_destroy(pb);
1911	return error;
1912	}
1913	vp = nd.ni_vp;
1914	pathbuf_destroy(pb);
1915
1916	error = vfs_composefh_alloc(vp, &fh);
1917	vput(vp);
1918	if (error != `0`) {
1919	return error;
1920	}
1921	error = copyin(SCARG(uap, fh_size), &usz, sizeof(size_t));
1922	if (error != `0`) {
1923	goto out;
1924	}
1925	sz = FHANDLE_SIZE(fh);
1926	error = copyout(&sz, SCARG(uap, fh_size), sizeof(size_t));
1927	if (error != `0`) {
1928	goto out;
1929	}
1930	if (usz >= sz) {
1931	error = copyout(fh, SCARG(uap, fhp), sz);
1932	} else {
1933	error = E2BIG;
1934	}
1935	out:
1936	vfs_composefh_free(fh);
1937	return (error);
1938	}
1939
1940	/*
1941	* Open a file given a file handle.
1942	*
1943	* Check permissions, allocate an open file structure,
1944	* and call the device open routine if any.
1945	*/
1946
1947	int
1948	dofhopen(struct lwp l, const* void ufhp, size_t fhsize, int* oflags,
1949	register_t *retval)
1950	{
1951	file_t *fp;
1952	struct vnode *vp = NULL;
1953	kauth_cred_t cred = l->l_cred;
1954	file_t *nfp;
1955	int indx, error;
1956	struct vattr va;
1957	fhandle_t *fh;
1958	int flags;
1959	proc_t *p;
1960
1961	p = curproc;
1962
1963	/*
1964	* Must be super user
1965	*/
1966	if ((error = kauth_authorize_system(l->l_cred, KAUTH_SYSTEM_FILEHANDLE,
1967	`0`, NULL, NULL, NULL)))
1968	return (error);
1969
1970	if (oflags & O_SEARCH) {
1971	oflags &= ~(int)O_SEARCH;
1972	}
1973
1974	flags = FFLAGS(oflags);
1975	if ((flags & (FREAD \| FWRITE)) == `0`)
1976	return (EINVAL);
1977	if ((flags & O_CREAT))
1978	return (EINVAL);
1979	if ((error = fd_allocfile(&nfp, &indx)) != `0`)
1980	return (error);
1981	fp = nfp;
1982	error = vfs_copyinfh_alloc(ufhp, fhsize, &fh);
1983	if (error != `0`) {
1984	goto bad;
1985	}
1986	error = vfs_fhtovp(fh, &vp);
1987	vfs_copyinfh_free(fh);
1988	if (error != `0`) {
1989	goto bad;
1990	}
1991
1992	/ Now do an effective vn_open /
1993
1994	if (vp->v_type == VSOCK) {
1995	error = EOPNOTSUPP;
1996	goto bad;
1997	}
1998	error = vn_openchk(vp, cred, flags);
1999	if (error != `0`)
2000	goto bad;
2001	if (flags & O_TRUNC) {
2002	VOP_UNLOCK(vp); / XXX /
2003	vn_lock(vp, LK_EXCLUSIVE \| LK_RETRY); / XXX /
2004	vattr_null(&va);
2005	va.va_size = `0`;
2006	error = VOP_SETATTR(vp, &va, cred);
2007	if (error)
2008	goto bad;
2009	}
2010	if ((error = VOP_OPEN(vp, flags, cred)) != `0`)
2011	goto bad;
2012	if (flags & FWRITE) {
2013	mutex_enter(vp->v_interlock);
2014	vp->v_writecount++;
2015	mutex_exit(vp->v_interlock);
2016	}
2017
2018	/ done with modified vn_open, now finish what sys_open does. /
2019	if ((error = open_setfp(l, fp, vp, indx, flags)))
2020	return error;
2021
2022	VOP_UNLOCK(vp);
2023	*retval = indx;
2024	fd_affix(p, fp, indx);
2025	return (`0`);
2026
2027	bad:
2028	fd_abort(p, fp, indx);
2029	if (vp != NULL)
2030	vput(vp);
2031	return (error);
2032	}
2033
2034	int
2035	sys___fhopen40(struct lwp l, const* struct sys___fhopen40_args uap, register_t retval)
2036	{
2037	/ {*
2038	syscallarg(const void ) fhp;*
2039	syscallarg(size_t) fh_size;
2040	syscallarg(int) flags;
2041	} /*
2042
2043	return dofhopen(l, SCARG(uap, fhp), SCARG(uap, fh_size),
2044	SCARG(uap, flags), retval);
2045	}
2046
2047	int
2048	do_fhstat(struct lwp l, const* void ufhp, size_t fhsize, struct* stat *sb)
2049	{
2050	int error;
2051	fhandle_t *fh;
2052	struct vnode *vp;
2053
2054	/*
2055	* Must be super user
2056	*/
2057	if ((error = kauth_authorize_system(l->l_cred, KAUTH_SYSTEM_FILEHANDLE,
2058	`0`, NULL, NULL, NULL)))
2059	return (error);
2060
2061	error = vfs_copyinfh_alloc(ufhp, fhsize, &fh);
2062	if (error != `0`)
2063	return error;
2064
2065	error = vfs_fhtovp(fh, &vp);
2066	vfs_copyinfh_free(fh);
2067	if (error != `0`)
2068	return error;
2069
2070	error = vn_stat(vp, sb);
2071	vput(vp);
2072	return error;
2073	}
2074
2075
2076	/ ARGSUSED /
2077	int
2078	sys___fhstat50(struct lwp l, const* struct sys___fhstat50_args uap, register_t retval)
2079	{
2080	/ {*
2081	syscallarg(const void ) fhp;*
2082	syscallarg(size_t) fh_size;
2083	syscallarg(struct stat ) sb;*
2084	} /*
2085	struct stat sb;
2086	int error;
2087
2088	error = do_fhstat(l, SCARG(uap, fhp), SCARG(uap, fh_size), &sb);
2089	if (error)
2090	return error;
2091	return copyout(&sb, SCARG(uap, sb), sizeof(sb));
2092	}
2093
2094	int
2095	do_fhstatvfs(struct lwp l, const* void ufhp, size_t fhsize, struct* statvfs *sb,
2096	int flags)
2097	{
2098	fhandle_t *fh;
2099	struct mount *mp;
2100	struct vnode *vp;
2101	int error;
2102
2103	/*
2104	* Must be super user
2105	*/
2106	if ((error = kauth_authorize_system(l->l_cred, KAUTH_SYSTEM_FILEHANDLE,
2107	`0`, NULL, NULL, NULL)))
2108	return error;
2109
2110	error = vfs_copyinfh_alloc(ufhp, fhsize, &fh);
2111	if (error != `0`)
2112	return error;
2113
2114	error = vfs_fhtovp(fh, &vp);
2115	vfs_copyinfh_free(fh);
2116	if (error != `0`)
2117	return error;
2118
2119	mp = vp->v_mount;
2120	error = dostatvfs(mp, sb, l, flags, `1`);
2121	vput(vp);
2122	return error;
2123	}
2124
2125	/ ARGSUSED /
2126	int
2127	sys___fhstatvfs140(struct lwp l, const* struct sys___fhstatvfs140_args uap, register_t retval)
2128	{
2129	/ {*
2130	syscallarg(const void ) fhp;*
2131	syscallarg(size_t) fh_size;
2132	syscallarg(struct statvfs ) buf;*
2133	syscallarg(int) flags;
2134	} /*
2135	struct statvfs *sb = STATVFSBUF_GET();
2136	int error;
2137
2138	error = do_fhstatvfs(l, SCARG(uap, fhp), SCARG(uap, fh_size), sb,
2139	SCARG(uap, flags));
2140	if (error == `0`)
2141	error = copyout(sb, SCARG(uap, buf), sizeof(*sb));
2142	STATVFSBUF_PUT(sb);
2143	return error;
2144	}
2145
2146	/*
2147	* Create a special file.
2148	*/
2149	/ ARGSUSED /
2150	int
2151	sys___mknod50(struct lwp l, const* struct sys___mknod50_args *uap,
2152	register_t *retval)
2153	{
2154	/ {*
2155	syscallarg(const char ) path;*
2156	syscallarg(mode_t) mode;
2157	syscallarg(dev_t) dev;
2158	} /*
2159	return do_sys_mknodat(l, AT_FDCWD, SCARG(uap, path), SCARG(uap, mode),
2160	SCARG(uap, dev), retval, UIO_USERSPACE);
2161	}
2162
2163	int
2164	sys_mknodat(struct lwp l, const* struct sys_mknodat_args *uap,
2165	register_t *retval)
2166	{
2167	/ {*
2168	syscallarg(int) fd;
2169	syscallarg(const char ) path;*
2170	syscallarg(mode_t) mode;
2171	syscallarg(int) pad;
2172	syscallarg(dev_t) dev;
2173	} /*
2174
2175	return do_sys_mknodat(l, SCARG(uap, fd), SCARG(uap, path),
2176	SCARG(uap, mode), SCARG(uap, dev), retval, UIO_USERSPACE);
2177	}
2178
2179	int
2180	do_sys_mknod(struct lwp l, const* char *pathname, mode_t mode, dev_t dev,
2181	register_t retval, enum* uio_seg seg)
2182	{
2183	return do_sys_mknodat(l, AT_FDCWD, pathname, mode, dev, retval, seg);
2184	}
2185
2186	int
2187	do_sys_mknodat(struct lwp l, int* fdat, const char *pathname, mode_t mode,
2188	dev_t dev, register_t retval, enum* uio_seg seg)
2189	{
2190	struct proc *p = l->l_proc;
2191	struct vnode *vp;
2192	struct vattr vattr;
2193	int error, optype;
2194	struct pathbuf *pb;
2195	struct nameidata nd;
2196	const char *pathstring;
2197
2198	if ((error = kauth_authorize_system(l->l_cred, KAUTH_SYSTEM_MKNOD,
2199	`0`, NULL, NULL, NULL)) != `0`)
2200	return (error);
2201
2202	optype = VOP_MKNOD_DESCOFFSET;
2203
2204	error = pathbuf_maybe_copyin(pathname, seg, &pb);
2205	if (error) {
2206	return error;
2207	}
2208	pathstring = pathbuf_stringcopy_get(pb);
2209	if (pathstring == NULL) {
2210	pathbuf_destroy(pb);
2211	return ENOMEM;
2212	}
2213
2214	NDINIT(&nd, CREATE, LOCKPARENT \| TRYEMULROOT, pb);
2215
2216	if ((error = fd_nameiat(l, fdat, &nd)) != `0`)
2217	goto out;
2218	vp = nd.ni_vp;
2219
2220	if (vp != NULL)
2221	error = EEXIST;
2222	else {
2223	vattr_null(&vattr);
2224	/ We will read cwdi->cwdi_cmask unlocked. /
2225	vattr.va_mode = (mode & ALLPERMS) &~ p->p_cwdi->cwdi_cmask;
2226	vattr.va_rdev = dev;
2227
2228	switch (mode & S_IFMT) {
2229	case S_IFMT: / used by badsect to flag bad sectors /
2230	vattr.va_type = VBAD;
2231	break;
2232	case S_IFCHR:
2233	vattr.va_type = VCHR;
2234	break;
2235	case S_IFBLK:
2236	vattr.va_type = VBLK;
2237	break;
2238	case S_IFWHT:
2239	optype = VOP_WHITEOUT_DESCOFFSET;
2240	break;
2241	case S_IFREG:
2242	#if NVERIEXEC > 0
2243	error = veriexec_openchk(l, nd.ni_vp, pathstring,
2244	O_CREAT);
2245	#endif /* NVERIEXEC > 0 */
2246	vattr.va_type = VREG;
2247	vattr.va_rdev = VNOVAL;
2248	optype = VOP_CREATE_DESCOFFSET;
2249	break;
2250	default:
2251	error = EINVAL;
2252	break;
2253	}
2254	}
2255	if (error == `0` && optype == VOP_MKNOD_DESCOFFSET
2256	&& vattr.va_rdev == VNOVAL)
2257	error = EINVAL;
2258	if (!error) {
2259	switch (optype) {
2260	case VOP_WHITEOUT_DESCOFFSET:
2261	error = VOP_WHITEOUT(nd.ni_dvp, &nd.ni_cnd, CREATE);
2262	if (error)
2263	VOP_ABORTOP(nd.ni_dvp, &nd.ni_cnd);
2264	vput(nd.ni_dvp);
2265	break;
2266
2267	case VOP_MKNOD_DESCOFFSET:
2268	error = VOP_MKNOD(nd.ni_dvp, &nd.ni_vp,
2269	&nd.ni_cnd, &vattr);
2270	if (error == `0`)
2271	vrele(nd.ni_vp);
2272	vput(nd.ni_dvp);
2273	break;
2274
2275	case VOP_CREATE_DESCOFFSET:
2276	error = VOP_CREATE(nd.ni_dvp, &nd.ni_vp,
2277	&nd.ni_cnd, &vattr);
2278	if (error == `0`)
2279	vrele(nd.ni_vp);
2280	vput(nd.ni_dvp);
2281	break;
2282	}
2283	} else {
2284	VOP_ABORTOP(nd.ni_dvp, &nd.ni_cnd);
2285	if (nd.ni_dvp == vp)
2286	vrele(nd.ni_dvp);
2287	else
2288	vput(nd.ni_dvp);
2289	if (vp)
2290	vrele(vp);
2291	}
2292	out:
2293	pathbuf_stringcopy_put(pb, pathstring);
2294	pathbuf_destroy(pb);
2295	return (error);
2296	}
2297
2298	/*
2299	* Create a named pipe.
2300	*/
2301	/ ARGSUSED /
2302	int
2303	sys_mkfifo(struct lwp l, const* struct sys_mkfifo_args uap, register_t retval)
2304	{
2305	/ {*
2306	syscallarg(const char ) path;*
2307	syscallarg(int) mode;
2308	} /*
2309	return do_sys_mkfifoat(l, AT_FDCWD, SCARG(uap, path), SCARG(uap, mode));
2310	}
2311
2312	int
2313	sys_mkfifoat(struct lwp l, const* struct sys_mkfifoat_args *uap,
2314	register_t *retval)
2315	{
2316	/ {*
2317	syscallarg(int) fd;
2318	syscallarg(const char ) path;*
2319	syscallarg(int) mode;
2320	} /*
2321
2322	return do_sys_mkfifoat(l, SCARG(uap, fd), SCARG(uap, path),
2323	SCARG(uap, mode));
2324	}
2325
2326	static int
2327	do_sys_mkfifoat(struct lwp l, int* fdat, const char *path, mode_t mode)
2328	{
2329	struct proc *p = l->l_proc;
2330	struct vattr vattr;
2331	int error;
2332	struct pathbuf *pb;
2333	struct nameidata nd;
2334
2335	error = pathbuf_copyin(path, &pb);
2336	if (error) {
2337	return error;
2338	}
2339	NDINIT(&nd, CREATE, LOCKPARENT \| TRYEMULROOT, pb);
2340
2341	if ((error = fd_nameiat(l, fdat, &nd)) != `0`) {
2342	pathbuf_destroy(pb);
2343	return error;
2344	}
2345	if (nd.ni_vp != NULL) {
2346	VOP_ABORTOP(nd.ni_dvp, &nd.ni_cnd);
2347	if (nd.ni_dvp == nd.ni_vp)
2348	vrele(nd.ni_dvp);
2349	else
2350	vput(nd.ni_dvp);
2351	vrele(nd.ni_vp);
2352	pathbuf_destroy(pb);
2353	return (EEXIST);
2354	}
2355	vattr_null(&vattr);
2356	vattr.va_type = VFIFO;
2357	/ We will read cwdi->cwdi_cmask unlocked. /
2358	vattr.va_mode = (mode & ALLPERMS) &~ p->p_cwdi->cwdi_cmask;
2359	error = VOP_MKNOD(nd.ni_dvp, &nd.ni_vp, &nd.ni_cnd, &vattr);
2360	if (error == `0`)
2361	vrele(nd.ni_vp);
2362	vput(nd.ni_dvp);
2363	pathbuf_destroy(pb);
2364	return (error);
2365	}
2366
2367	/*
2368	* Make a hard file link.
2369	*/
2370	/ ARGSUSED /
2371	int
2372	do_sys_linkat(struct lwp l, int* fdpath, const char path, int* fdlink,
2373	const char link, int* follow, register_t *retval)
2374	{
2375	struct vnode *vp;
2376	struct pathbuf *linkpb;
2377	struct nameidata nd;
2378	namei_simple_flags_t ns_flags;
2379	int error;
2380
2381	if (follow & AT_SYMLINK_FOLLOW)
2382	ns_flags = NSM_FOLLOW_TRYEMULROOT;
2383	else
2384	ns_flags = NSM_NOFOLLOW_TRYEMULROOT;
2385
2386	error = fd_nameiat_simple_user(l, fdpath, path, ns_flags, &vp);
2387	if (error != `0`)
2388	return (error);
2389	error = pathbuf_copyin(link, &linkpb);
2390	if (error) {
2391	goto out1;
2392	}
2393	NDINIT(&nd, CREATE, LOCKPARENT \| TRYEMULROOT, linkpb);
2394	if ((error = fd_nameiat(l, fdlink, &nd)) != `0`)
2395	goto out2;
2396	if (nd.ni_vp) {
2397	error = EEXIST;
2398	goto abortop;
2399	}
2400	/ Prevent hard links on directories. /
2401	if (vp->v_type == VDIR) {
2402	error = EPERM;
2403	goto abortop;
2404	}
2405	/ Prevent cross-mount operation. /
2406	if (nd.ni_dvp->v_mount != vp->v_mount) {
2407	error = EXDEV;
2408	goto abortop;
2409	}
2410	error = VOP_LINK(nd.ni_dvp, vp, &nd.ni_cnd);
2411	VOP_UNLOCK(nd.ni_dvp);
2412	vrele(nd.ni_dvp);
2413	out2:
2414	pathbuf_destroy(linkpb);
2415	out1:
2416	vrele(vp);
2417	return (error);
2418	abortop:
2419	VOP_ABORTOP(nd.ni_dvp, &nd.ni_cnd);
2420	if (nd.ni_dvp == nd.ni_vp)
2421	vrele(nd.ni_dvp);
2422	else
2423	vput(nd.ni_dvp);
2424	if (nd.ni_vp != NULL)
2425	vrele(nd.ni_vp);
2426	goto out2;
2427	}
2428
2429	int
2430	sys_link(struct lwp l, const* struct sys_link_args uap, register_t retval)
2431	{
2432	/ {*
2433	syscallarg(const char ) path;*
2434	syscallarg(const char ) link;*
2435	} /*
2436	const char *path = SCARG(uap, path);
2437	const char *link = SCARG(uap, link);
2438
2439	return do_sys_linkat(l, AT_FDCWD, path, AT_FDCWD, link,
2440	AT_SYMLINK_FOLLOW, retval);
2441	}
2442
2443	int
2444	sys_linkat(struct lwp l, const* struct sys_linkat_args *uap,
2445	register_t *retval)
2446	{
2447	/ {*
2448	syscallarg(int) fd1;
2449	syscallarg(const char ) name1;*
2450	syscallarg(int) fd2;
2451	syscallarg(const char ) name2;*
2452	syscallarg(int) flags;
2453	} /*
2454	int fd1 = SCARG(uap, fd1);
2455	const char *name1 = SCARG(uap, name1);
2456	int fd2 = SCARG(uap, fd2);
2457	const char *name2 = SCARG(uap, name2);
2458	int follow;
2459
2460	follow = SCARG(uap, flags) & AT_SYMLINK_FOLLOW;
2461
2462	return do_sys_linkat(l, fd1, name1, fd2, name2, follow, retval);
2463	}
2464
2465
2466	int
2467	do_sys_symlink(const char patharg, const* char link, enum* uio_seg seg)
2468	{
2469	return do_sys_symlinkat(NULL, patharg, AT_FDCWD, link, seg);
2470	}
2471
2472	static int
2473	do_sys_symlinkat(struct lwp l, const* char patharg, int* fdat,
2474	const char link, enum* uio_seg seg)
2475	{
2476	struct proc *p = curproc;
2477	struct vattr vattr;
2478	char *path;
2479	int error;
2480	struct pathbuf *linkpb;
2481	struct nameidata nd;
2482
2483	KASSERT(l != NULL \|\| fdat == AT_FDCWD);
2484
2485	path = PNBUF_GET();
2486	if (seg == UIO_USERSPACE) {
2487	if ((error = copyinstr(patharg, path, MAXPATHLEN, NULL)) != `0`)
2488	goto out1;
2489	if ((error = pathbuf_copyin(link, &linkpb)) != `0`)
2490	goto out1;
2491	} else {
2492	KASSERT(strlen(patharg) < MAXPATHLEN);
2493	strcpy(path, patharg);
2494	linkpb = pathbuf_create(link);
2495	if (linkpb == NULL) {
2496	error = ENOMEM;
2497	goto out1;
2498	}
2499	}
2500	ktrkuser("symlink-target", path, strlen(path));
2501
2502	NDINIT(&nd, CREATE, LOCKPARENT \| TRYEMULROOT, linkpb);
2503	if ((error = fd_nameiat(l, fdat, &nd)) != `0`)
2504	goto out2;
2505	if (nd.ni_vp) {
2506	VOP_ABORTOP(nd.ni_dvp, &nd.ni_cnd);
2507	if (nd.ni_dvp == nd.ni_vp)
2508	vrele(nd.ni_dvp);
2509	else
2510	vput(nd.ni_dvp);
2511	vrele(nd.ni_vp);
2512	error = EEXIST;
2513	goto out2;
2514	}
2515	vattr_null(&vattr);
2516	vattr.va_type = VLNK;
2517	/ We will read cwdi->cwdi_cmask unlocked. /
2518	vattr.va_mode = ACCESSPERMS &~ p->p_cwdi->cwdi_cmask;
2519	error = VOP_SYMLINK(nd.ni_dvp, &nd.ni_vp, &nd.ni_cnd, &vattr, path);
2520	if (error == `0`)
2521	vrele(nd.ni_vp);
2522	vput(nd.ni_dvp);
2523	out2:
2524	pathbuf_destroy(linkpb);
2525	out1:
2526	PNBUF_PUT(path);
2527	return (error);
2528	}
2529
2530	/*
2531	* Make a symbolic link.
2532	*/
2533	/ ARGSUSED /
2534	int
2535	sys_symlink(struct lwp l, const* struct sys_symlink_args uap, register_t retval)
2536	{
2537	/ {*
2538	syscallarg(const char ) path;*
2539	syscallarg(const char ) link;*
2540	} /*
2541
2542	return do_sys_symlinkat(l, SCARG(uap, path), AT_FDCWD, SCARG(uap, link),
2543	UIO_USERSPACE);
2544	}
2545
2546	int
2547	sys_symlinkat(struct lwp l, const* struct sys_symlinkat_args *uap,
2548	register_t *retval)
2549	{
2550	/ {*
2551	syscallarg(const char ) path1;*
2552	syscallarg(int) fd;
2553	syscallarg(const char ) path2;*
2554	} /*
2555
2556	return do_sys_symlinkat(l, SCARG(uap, path1), SCARG(uap, fd),
2557	SCARG(uap, path2), UIO_USERSPACE);
2558	}
2559
2560	/*
2561	* Delete a whiteout from the filesystem.
2562	*/
2563	/ ARGSUSED /
2564	int
2565	sys_undelete(struct lwp l, const* struct sys_undelete_args uap, register_t retval)
2566	{
2567	/ {*
2568	syscallarg(const char ) path;*
2569	} /*
2570	int error;
2571	struct pathbuf *pb;
2572	struct nameidata nd;
2573
2574	error = pathbuf_copyin(SCARG(uap, path), &pb);
2575	if (error) {
2576	return error;
2577	}
2578
2579	NDINIT(&nd, DELETE, LOCKPARENT \| DOWHITEOUT \| TRYEMULROOT, pb);
2580	error = namei(&nd);
2581	if (error) {
2582	pathbuf_destroy(pb);
2583	return (error);
2584	}
2585
2586	if (nd.ni_vp != NULLVP \|\| !(nd.ni_cnd.cn_flags & ISWHITEOUT)) {
2587	VOP_ABORTOP(nd.ni_dvp, &nd.ni_cnd);
2588	if (nd.ni_dvp == nd.ni_vp)
2589	vrele(nd.ni_dvp);
2590	else
2591	vput(nd.ni_dvp);
2592	if (nd.ni_vp)
2593	vrele(nd.ni_vp);
2594	pathbuf_destroy(pb);
2595	return (EEXIST);
2596	}
2597	if ((error = VOP_WHITEOUT(nd.ni_dvp, &nd.ni_cnd, DELETE)) != `0`)
2598	VOP_ABORTOP(nd.ni_dvp, &nd.ni_cnd);
2599	vput(nd.ni_dvp);
2600	pathbuf_destroy(pb);
2601	return (error);
2602	}
2603
2604	/*
2605	* Delete a name from the filesystem.
2606	*/
2607	/ ARGSUSED /
2608	int
2609	sys_unlink(struct lwp l, const* struct sys_unlink_args uap, register_t retval)
2610	{
2611	/ {*
2612	syscallarg(const char ) path;*
2613	} /*
2614
2615	return do_sys_unlinkat(l, AT_FDCWD, SCARG(uap, path), `0`, UIO_USERSPACE);
2616	}
2617
2618	int
2619	sys_unlinkat(struct lwp l, const* struct sys_unlinkat_args *uap,
2620	register_t *retval)
2621	{
2622	/ {*
2623	syscallarg(int) fd;
2624	syscallarg(const char ) path;*
2625	syscallarg(int) flag;
2626	} /*
2627
2628	return do_sys_unlinkat(l, SCARG(uap, fd), SCARG(uap, path),
2629	SCARG(uap, flag), UIO_USERSPACE);
2630	}
2631
2632	int
2633	do_sys_unlink(const char arg, enum* uio_seg seg)
2634	{
2635	return do_sys_unlinkat(NULL, AT_FDCWD, arg, `0`, seg);
2636	}
2637
2638	static int
2639	do_sys_unlinkat(struct lwp l, int* fdat, const char arg, int* flags,
2640	enum uio_seg seg)
2641	{
2642	struct vnode *vp;
2643	int error;
2644	struct pathbuf *pb;
2645	struct nameidata nd;
2646	const char *pathstring;
2647
2648	KASSERT(l != NULL \|\| fdat == AT_FDCWD);
2649
2650	error = pathbuf_maybe_copyin(arg, seg, &pb);
2651	if (error) {
2652	return error;
2653	}
2654	pathstring = pathbuf_stringcopy_get(pb);
2655	if (pathstring == NULL) {
2656	pathbuf_destroy(pb);
2657	return ENOMEM;
2658	}
2659
2660	NDINIT(&nd, DELETE, LOCKPARENT \| LOCKLEAF \| TRYEMULROOT, pb);
2661	if ((error = fd_nameiat(l, fdat, &nd)) != `0`)
2662	goto out;
2663	vp = nd.ni_vp;
2664
2665	/*
2666	* The root of a mounted filesystem cannot be deleted.
2667	*/
2668	if ((vp->v_vflag & VV_ROOT) != `0`) {
2669	error = EBUSY;
2670	goto abort;
2671	}
2672
2673	if ((vp->v_type == VDIR) && (vp->v_mountedhere != NULL)) {
2674	error = EBUSY;
2675	goto abort;
2676	}
2677
2678	/*
2679	* No rmdir "." please.
2680	*/
2681	if (nd.ni_dvp == vp) {
2682	error = EINVAL;
2683	goto abort;
2684	}
2685
2686	/*
2687	* AT_REMOVEDIR is required to remove a directory
2688	*/
2689	if (vp->v_type == VDIR) {
2690	if (!(flags & AT_REMOVEDIR)) {
2691	error = EPERM;
2692	goto abort;
2693	} else {
2694	error = VOP_RMDIR(nd.ni_dvp, nd.ni_vp, &nd.ni_cnd);
2695	goto out;
2696	}
2697	}
2698
2699	/*
2700	* Starting here we only deal with non directories.
2701	*/
2702	if (flags & AT_REMOVEDIR) {
2703	error = ENOTDIR;
2704	goto abort;
2705	}
2706
2707	#if NVERIEXEC > 0
2708	/ Handle remove requests for veriexec entries. /
2709	if ((error = veriexec_removechk(curlwp, nd.ni_vp, pathstring)) != `0`) {
2710	goto abort;
2711	}
2712	#endif /* NVERIEXEC > 0 */
2713
2714	#ifdef FILEASSOC
2715	(void)fileassoc_file_delete(vp);
2716	#endif /* FILEASSOC */
2717	error = VOP_REMOVE(nd.ni_dvp, nd.ni_vp, &nd.ni_cnd);
2718	goto out;
2719
2720	abort:
2721	VOP_ABORTOP(nd.ni_dvp, &nd.ni_cnd);
2722	if (nd.ni_dvp == vp)
2723	vrele(nd.ni_dvp);
2724	else
2725	vput(nd.ni_dvp);
2726	vput(vp);
2727
2728	out:
2729	pathbuf_stringcopy_put(pb, pathstring);
2730	pathbuf_destroy(pb);
2731	return (error);
2732	}
2733
2734	/*
2735	* Reposition read/write file offset.
2736	*/
2737	int
2738	sys_lseek(struct lwp l, const* struct sys_lseek_args uap, register_t retval)
2739	{
2740	/ {*
2741	syscallarg(int) fd;
2742	syscallarg(int) pad;
2743	syscallarg(off_t) offset;
2744	syscallarg(int) whence;
2745	} /*
2746	kauth_cred_t cred = l->l_cred;
2747	file_t *fp;
2748	struct vnode *vp;
2749	struct vattr vattr;
2750	off_t newoff;
2751	int error, fd;
2752
2753	fd = SCARG(uap, fd);
2754
2755	if ((fp = fd_getfile(fd)) == NULL)
2756	return (EBADF);
2757
2758	vp = fp->f_vnode;
2759	if (fp->f_type != DTYPE_VNODE \|\| vp->v_type == VFIFO) {
2760	error = ESPIPE;
2761	goto out;
2762	}
2763
2764	vn_lock(vp, LK_SHARED \| LK_RETRY);
2765
2766	switch (SCARG(uap, whence)) {
2767	case SEEK_CUR:
2768	newoff = fp->f_offset + SCARG(uap, offset);
2769	break;
2770	case SEEK_END:
2771	error = VOP_GETATTR(vp, &vattr, cred);
2772	if (error) {
2773	VOP_UNLOCK(vp);
2774	goto out;
2775	}
2776	newoff = SCARG(uap, offset) + vattr.va_size;
2777	break;
2778	case SEEK_SET:
2779	newoff = SCARG(uap, offset);
2780	break;
2781	default:
2782	error = EINVAL;
2783	VOP_UNLOCK(vp);
2784	goto out;
2785	}
2786	VOP_UNLOCK(vp);
2787	if ((error = VOP_SEEK(vp, fp->f_offset, newoff, cred)) == `0`) {
2788	(off_t )retval = fp->f_offset = newoff;
2789	}
2790	out:
2791	fd_putfile(fd);
2792	return (error);
2793	}
2794
2795	/*
2796	* Positional read system call.
2797	*/
2798	int
2799	sys_pread(struct lwp l, const* struct sys_pread_args uap, register_t retval)
2800	{
2801	/ {*
2802	syscallarg(int) fd;
2803	syscallarg(void ) buf;*
2804	syscallarg(size_t) nbyte;
2805	syscallarg(off_t) offset;
2806	} /*
2807	file_t *fp;
2808	struct vnode *vp;
2809	off_t offset;
2810	int error, fd = SCARG(uap, fd);
2811
2812	if ((fp = fd_getfile(fd)) == NULL)
2813	return (EBADF);
2814
2815	if ((fp->f_flag & FREAD) == `0`) {
2816	fd_putfile(fd);
2817	return (EBADF);
2818	}
2819
2820	vp = fp->f_vnode;
2821	if (fp->f_type != DTYPE_VNODE \|\| vp->v_type == VFIFO) {
2822	error = ESPIPE;
2823	goto out;
2824	}
2825
2826	offset = SCARG(uap, offset);
2827
2828	/*
2829	* XXX This works because no file systems actually
2830	* XXX take any action on the seek operation.
2831	*/
2832	if ((error = VOP_SEEK(vp, fp->f_offset, offset, fp->f_cred)) != `0`)
2833	goto out;
2834
2835	/ dofileread() will unuse the descriptor for us /
2836	return (dofileread(fd, fp, SCARG(uap, buf), SCARG(uap, nbyte),
2837	&offset, `0`, retval));
2838
2839	out:
2840	fd_putfile(fd);
2841	return (error);
2842	}
2843
2844	/*
2845	* Positional scatter read system call.
2846	*/
2847	int
2848	sys_preadv(struct lwp l, const* struct sys_preadv_args uap, register_t retval)
2849	{
2850	/ {*
2851	syscallarg(int) fd;
2852	syscallarg(const struct iovec ) iovp;*
2853	syscallarg(int) iovcnt;
2854	syscallarg(off_t) offset;
2855	} /*
2856	off_t offset = SCARG(uap, offset);
2857
2858	return do_filereadv(SCARG(uap, fd), SCARG(uap, iovp),
2859	SCARG(uap, iovcnt), &offset, `0`, retval);
2860	}
2861
2862	/*
2863	* Positional write system call.
2864	*/
2865	int
2866	sys_pwrite(struct lwp l, const* struct sys_pwrite_args uap, register_t retval)
2867	{
2868	/ {*
2869	syscallarg(int) fd;
2870	syscallarg(const void ) buf;*
2871	syscallarg(size_t) nbyte;
2872	syscallarg(off_t) offset;
2873	} /*
2874	file_t *fp;
2875	struct vnode *vp;
2876	off_t offset;
2877	int error, fd = SCARG(uap, fd);
2878
2879	if ((fp = fd_getfile(fd)) == NULL)
2880	return (EBADF);
2881
2882	if ((fp->f_flag & FWRITE) == `0`) {
2883	fd_putfile(fd);
2884	return (EBADF);
2885	}
2886
2887	vp = fp->f_vnode;
2888	if (fp->f_type != DTYPE_VNODE \|\| vp->v_type == VFIFO) {
2889	error = ESPIPE;
2890	goto out;
2891	}
2892
2893	offset = SCARG(uap, offset);
2894
2895	/*
2896	* XXX This works because no file systems actually
2897	* XXX take any action on the seek operation.
2898	*/
2899	if ((error = VOP_SEEK(vp, fp->f_offset, offset, fp->f_cred)) != `0`)
2900	goto out;
2901
2902	/ dofilewrite() will unuse the descriptor for us /
2903	return (dofilewrite(fd, fp, SCARG(uap, buf), SCARG(uap, nbyte),
2904	&offset, `0`, retval));
2905
2906	out:
2907	fd_putfile(fd);
2908	return (error);
2909	}
2910
2911	/*
2912	* Positional gather write system call.
2913	*/
2914	int
2915	sys_pwritev(struct lwp l, const* struct sys_pwritev_args uap, register_t retval)
2916	{
2917	/ {*
2918	syscallarg(int) fd;
2919	syscallarg(const struct iovec ) iovp;*
2920	syscallarg(int) iovcnt;
2921	syscallarg(off_t) offset;
2922	} /*
2923	off_t offset = SCARG(uap, offset);
2924
2925	return do_filewritev(SCARG(uap, fd), SCARG(uap, iovp),
2926	SCARG(uap, iovcnt), &offset, `0`, retval);
2927	}
2928
2929	/*
2930	* Check access permissions.
2931	*/
2932	int
2933	sys_access(struct lwp l, const* struct sys_access_args uap, register_t retval)
2934	{
2935	/ {*
2936	syscallarg(const char ) path;*
2937	syscallarg(int) flags;
2938	} /*
2939
2940	return do_sys_accessat(l, AT_FDCWD, SCARG(uap, path),
2941	SCARG(uap, flags), `0`);
2942	}
2943
2944	int
2945	do_sys_accessat(struct lwp l, int* fdat, const char *path,
2946	int mode, int flags)
2947	{
2948	kauth_cred_t cred;
2949	struct vnode *vp;
2950	int error, nd_flag, vmode;
2951	struct pathbuf *pb;
2952	struct nameidata nd;
2953
2954	CTASSERT(F_OK == `0`);
2955	if ((mode & ~(R_OK \| W_OK \| X_OK)) != `0`) {
2956	/ nonsense mode /
2957	return EINVAL;
2958	}
2959
2960	nd_flag = FOLLOW \| LOCKLEAF \| TRYEMULROOT;
2961	if (flags & AT_SYMLINK_NOFOLLOW)
2962	nd_flag &= ~FOLLOW;
2963
2964	error = pathbuf_copyin(path, &pb);
2965	if (error)
2966	return error;
2967
2968	NDINIT(&nd, LOOKUP, nd_flag, pb);
2969
2970	/ Override default credentials /
2971	cred = kauth_cred_dup(l->l_cred);
2972	if (!(flags & AT_EACCESS)) {
2973	kauth_cred_seteuid(cred, kauth_cred_getuid(l->l_cred));
2974	kauth_cred_setegid(cred, kauth_cred_getgid(l->l_cred));
2975	}
2976	nd.ni_cnd.cn_cred = cred;
2977
2978	if ((error = fd_nameiat(l, fdat, &nd)) != `0`) {
2979	pathbuf_destroy(pb);
2980	goto out;
2981	}
2982	vp = nd.ni_vp;
2983	pathbuf_destroy(pb);
2984
2985	/ Flags == 0 means only check for existence. /
2986	if (mode) {
2987	vmode = `0`;
2988	if (mode & R_OK)
2989	vmode \|= VREAD;
2990	if (mode & W_OK)
2991	vmode \|= VWRITE;
2992	if (mode & X_OK)
2993	vmode \|= VEXEC;
2994
2995	error = VOP_ACCESS(vp, vmode, cred);
2996	if (!error && (vmode & VWRITE))
2997	error = vn_writechk(vp);
2998	}
2999	vput(vp);
3000	out:
3001	kauth_cred_free(cred);
3002	return (error);
3003	}
3004
3005	int
3006	sys_faccessat(struct lwp l, const* struct sys_faccessat_args *uap,
3007	register_t *retval)
3008	{
3009	/ {*
3010	syscallarg(int) fd;
3011	syscallarg(const char ) path;*
3012	syscallarg(int) amode;
3013	syscallarg(int) flag;
3014	} /*
3015
3016	return do_sys_accessat(l, SCARG(uap, fd), SCARG(uap, path),
3017	SCARG(uap, amode), SCARG(uap, flag));
3018	}
3019
3020	/*
3021	* Common code for all sys_stat functions, including compat versions.
3022	*/
3023	int
3024	do_sys_stat(const char userpath, unsigned* int nd_flag,
3025	struct stat *sb)
3026	{
3027	return do_sys_statat(NULL, AT_FDCWD, userpath, nd_flag, sb);
3028	}
3029
3030	int
3031	do_sys_statat(struct lwp l, int* fdat, const char *userpath,
3032	unsigned int nd_flag, struct stat *sb)
3033	{
3034	int error;
3035	struct pathbuf *pb;
3036	struct nameidata nd;
3037
3038	KASSERT(l != NULL \|\| fdat == AT_FDCWD);
3039
3040	error = pathbuf_copyin(userpath, &pb);
3041	if (error) {
3042	return error;
3043	}
3044
3045	NDINIT(&nd, LOOKUP, nd_flag \| LOCKLEAF \| TRYEMULROOT, pb);
3046
3047	error = fd_nameiat(l, fdat, &nd);
3048	if (error != `0`) {
3049	pathbuf_destroy(pb);
3050	return error;
3051	}
3052	error = vn_stat(nd.ni_vp, sb);
3053	vput(nd.ni_vp);
3054	pathbuf_destroy(pb);
3055	return error;
3056	}
3057
3058	/*
3059	* Get file status; this version follows links.
3060	*/
3061	/ ARGSUSED /
3062	int
3063	sys___stat50(struct lwp l, const* struct sys___stat50_args uap, register_t retval)
3064	{
3065	/ {*
3066	syscallarg(const char ) path;*
3067	syscallarg(struct stat ) ub;*
3068	} /*
3069	struct stat sb;
3070	int error;
3071
3072	error = do_sys_statat(l, AT_FDCWD, SCARG(uap, path), FOLLOW, &sb);
3073	if (error)
3074	return error;
3075	return copyout(&sb, SCARG(uap, ub), sizeof(sb));
3076	}
3077
3078	/*
3079	* Get file status; this version does not follow links.
3080	*/
3081	/ ARGSUSED /
3082	int
3083	sys___lstat50(struct lwp l, const* struct sys___lstat50_args uap, register_t retval)
3084	{
3085	/ {*
3086	syscallarg(const char ) path;*
3087	syscallarg(struct stat ) ub;*
3088	} /*
3089	struct stat sb;
3090	int error;
3091
3092	error = do_sys_statat(l, AT_FDCWD, SCARG(uap, path), NOFOLLOW, &sb);
3093	if (error)
3094	return error;
3095	return copyout(&sb, SCARG(uap, ub), sizeof(sb));
3096	}
3097
3098	int
3099	sys_fstatat(struct lwp l, const* struct sys_fstatat_args *uap,
3100	register_t *retval)
3101	{
3102	/ {*
3103	syscallarg(int) fd;
3104	syscallarg(const char ) path;*
3105	syscallarg(struct stat ) buf;*
3106	syscallarg(int) flag;
3107	} /*
3108	unsigned int nd_flag;
3109	struct stat sb;
3110	int error;
3111
3112	if (SCARG(uap, flag) & AT_SYMLINK_NOFOLLOW)
3113	nd_flag = NOFOLLOW;
3114	else
3115	nd_flag = FOLLOW;
3116
3117	error = do_sys_statat(l, SCARG(uap, fd), SCARG(uap, path), nd_flag,
3118	&sb);
3119	if (error)
3120	return error;
3121	return copyout(&sb, SCARG(uap, buf), sizeof(sb));
3122	}
3123
3124	/*
3125	* Get configurable pathname variables.
3126	*/
3127	/ ARGSUSED /
3128	int
3129	sys_pathconf(struct lwp l, const* struct sys_pathconf_args uap, register_t retval)
3130	{
3131	/ {*
3132	syscallarg(const char ) path;*
3133	syscallarg(int) name;
3134	} /*
3135	int error;
3136	struct pathbuf *pb;
3137	struct nameidata nd;
3138
3139	error = pathbuf_copyin(SCARG(uap, path), &pb);
3140	if (error) {
3141	return error;
3142	}
3143	NDINIT(&nd, LOOKUP, FOLLOW \| LOCKLEAF \| TRYEMULROOT, pb);
3144	if ((error = namei(&nd)) != `0`) {
3145	pathbuf_destroy(pb);
3146	return (error);
3147	}
3148	error = VOP_PATHCONF(nd.ni_vp, SCARG(uap, name), retval);
3149	vput(nd.ni_vp);
3150	pathbuf_destroy(pb);
3151	return (error);
3152	}
3153
3154	/*
3155	* Return target name of a symbolic link.
3156	*/
3157	/ ARGSUSED /
3158	int
3159	sys_readlink(struct lwp l, const* struct sys_readlink_args *uap,
3160	register_t *retval)
3161	{
3162	/ {*
3163	syscallarg(const char ) path;*
3164	syscallarg(char ) buf;*
3165	syscallarg(size_t) count;
3166	} /*
3167	return do_sys_readlinkat(l, AT_FDCWD, SCARG(uap, path),
3168	SCARG(uap, buf), SCARG(uap, count), retval);
3169	}
3170
3171	static int
3172	do_sys_readlinkat(struct lwp l, int* fdat, const char path, char* *buf,
3173	size_t count, register_t *retval)
3174	{
3175	struct vnode *vp;
3176	struct iovec aiov;
3177	struct uio auio;
3178	int error;
3179	struct pathbuf *pb;
3180	struct nameidata nd;
3181
3182	error = pathbuf_copyin(path, &pb);
3183	if (error) {
3184	return error;
3185	}
3186	NDINIT(&nd, LOOKUP, NOFOLLOW \| LOCKLEAF \| TRYEMULROOT, pb);
3187	if ((error = fd_nameiat(l, fdat, &nd)) != `0`) {
3188	pathbuf_destroy(pb);
3189	return error;
3190	}
3191	vp = nd.ni_vp;
3192	pathbuf_destroy(pb);
3193	if (vp->v_type != VLNK)
3194	error = EINVAL;
3195	else if (!(vp->v_mount->mnt_flag & MNT_SYMPERM) \|\|
3196	(error = VOP_ACCESS(vp, VREAD, l->l_cred)) == `0`) {
3197	aiov.iov_base = buf;
3198	aiov.iov_len = count;
3199	auio.uio_iov = &aiov;
3200	auio.uio_iovcnt = `1`;
3201	auio.uio_offset = `0`;
3202	auio.uio_rw = UIO_READ;
3203	KASSERT(l == curlwp);
3204	auio.uio_vmspace = l->l_proc->p_vmspace;
3205	auio.uio_resid = count;
3206	if ((error = VOP_READLINK(vp, &auio, l->l_cred)) == `0`)
3207	*retval = count - auio.uio_resid;
3208	}
3209	vput(vp);
3210	return (error);
3211	}
3212
3213	int
3214	sys_readlinkat(struct lwp l, const* struct sys_readlinkat_args *uap,
3215	register_t *retval)
3216	{
3217	/ {*
3218	syscallarg(int) fd;
3219	syscallarg(const char ) path;*
3220	syscallarg(char ) buf;*
3221	syscallarg(size_t) bufsize;
3222	} /*
3223
3224	return do_sys_readlinkat(l, SCARG(uap, fd), SCARG(uap, path),
3225	SCARG(uap, buf), SCARG(uap, bufsize), retval);
3226	}
3227
3228	/*
3229	* Change flags of a file given a path name.
3230	*/
3231	/ ARGSUSED /
3232	int
3233	sys_chflags(struct lwp l, const* struct sys_chflags_args uap, register_t retval)
3234	{
3235	/ {*
3236	syscallarg(const char ) path;*
3237	syscallarg(u_long) flags;
3238	} /*
3239	struct vnode *vp;
3240	int error;
3241
3242	error = namei_simple_user(SCARG(uap, path),
3243	NSM_FOLLOW_TRYEMULROOT, &vp);
3244	if (error != `0`)
3245	return (error);
3246	error = change_flags(vp, SCARG(uap, flags), l);
3247	vput(vp);
3248	return (error);
3249	}
3250
3251	/*
3252	* Change flags of a file given a file descriptor.
3253	*/
3254	/ ARGSUSED /
3255	int
3256	sys_fchflags(struct lwp l, const* struct sys_fchflags_args uap, register_t retval)
3257	{
3258	/ {*
3259	syscallarg(int) fd;
3260	syscallarg(u_long) flags;
3261	} /*
3262	struct vnode *vp;
3263	file_t *fp;
3264	int error;
3265
3266	/ fd_getvnode() will use the descriptor for us /
3267	if ((error = fd_getvnode(SCARG(uap, fd), &fp)) != `0`)
3268	return (error);
3269	vp = fp->f_vnode;
3270	error = change_flags(vp, SCARG(uap, flags), l);
3271	VOP_UNLOCK(vp);
3272	fd_putfile(SCARG(uap, fd));
3273	return (error);
3274	}
3275
3276	/*
3277	* Change flags of a file given a path name; this version does
3278	* not follow links.
3279	*/
3280	int
3281	sys_lchflags(struct lwp l, const* struct sys_lchflags_args uap, register_t retval)
3282	{
3283	/ {*
3284	syscallarg(const char ) path;*
3285	syscallarg(u_long) flags;
3286	} /*
3287	struct vnode *vp;
3288	int error;
3289
3290	error = namei_simple_user(SCARG(uap, path),
3291	NSM_NOFOLLOW_TRYEMULROOT, &vp);
3292	if (error != `0`)
3293	return (error);
3294	error = change_flags(vp, SCARG(uap, flags), l);
3295	vput(vp);
3296	return (error);
3297	}
3298
3299	/*
3300	* Common routine to change flags of a file.
3301	*/
3302	int
3303	change_flags(struct vnode vp, u_long flags, struct* lwp *l)
3304	{
3305	struct vattr vattr;
3306	int error;
3307
3308	vn_lock(vp, LK_EXCLUSIVE \| LK_RETRY);
3309
3310	vattr_null(&vattr);
3311	vattr.va_flags = flags;
3312	error = VOP_SETATTR(vp, &vattr, l->l_cred);
3313
3314	return (error);
3315	}
3316
3317	/*
3318	* Change mode of a file given path name; this version follows links.
3319	*/
3320	/ ARGSUSED /
3321	int
3322	sys_chmod(struct lwp l, const* struct sys_chmod_args uap, register_t retval)
3323	{
3324	/ {*
3325	syscallarg(const char ) path;*
3326	syscallarg(int) mode;
3327	} /*
3328	return do_sys_chmodat(l, AT_FDCWD, SCARG(uap, path),
3329	SCARG(uap, mode), `0`);
3330	}
3331
3332	int
3333	do_sys_chmodat(struct lwp l, int* fdat, const char path, int* mode, int flags)
3334	{
3335	int error;
3336	struct vnode *vp;
3337	namei_simple_flags_t ns_flag;
3338
3339	if (flags & AT_SYMLINK_NOFOLLOW)
3340	ns_flag = NSM_NOFOLLOW_TRYEMULROOT;
3341	else
3342	ns_flag = NSM_FOLLOW_TRYEMULROOT;
3343
3344	error = fd_nameiat_simple_user(l, fdat, path, ns_flag, &vp);
3345	if (error != `0`)
3346	return error;
3347
3348	error = change_mode(vp, mode, l);
3349
3350	vrele(vp);
3351
3352	return (error);
3353	}
3354
3355	/*
3356	* Change mode of a file given a file descriptor.
3357	*/
3358	/ ARGSUSED /
3359	int
3360	sys_fchmod(struct lwp l, const* struct sys_fchmod_args uap, register_t retval)
3361	{
3362	/ {*
3363	syscallarg(int) fd;
3364	syscallarg(int) mode;
3365	} /*
3366	file_t *fp;
3367	int error;
3368
3369	/ fd_getvnode() will use the descriptor for us /
3370	if ((error = fd_getvnode(SCARG(uap, fd), &fp)) != `0`)
3371	return (error);
3372	error = change_mode(fp->f_vnode, SCARG(uap, mode), l);
3373	fd_putfile(SCARG(uap, fd));
3374	return (error);
3375	}
3376
3377	int
3378	sys_fchmodat(struct lwp l, const* struct sys_fchmodat_args *uap,
3379	register_t *retval)
3380	{
3381	/ {*
3382	syscallarg(int) fd;
3383	syscallarg(const char ) path;*
3384	syscallarg(int) mode;
3385	syscallarg(int) flag;
3386	} /*
3387
3388	return do_sys_chmodat(l, SCARG(uap, fd), SCARG(uap, path),
3389	SCARG(uap, mode), SCARG(uap, flag));
3390	}
3391
3392	/*
3393	* Change mode of a file given path name; this version does not follow links.
3394	*/
3395	/ ARGSUSED /
3396	int
3397	sys_lchmod(struct lwp l, const* struct sys_lchmod_args uap, register_t retval)
3398	{
3399	/ {*
3400	syscallarg(const char ) path;*
3401	syscallarg(int) mode;
3402	} /*
3403	int error;
3404	struct vnode *vp;
3405
3406	error = namei_simple_user(SCARG(uap, path),
3407	NSM_NOFOLLOW_TRYEMULROOT, &vp);
3408	if (error != `0`)
3409	return (error);
3410
3411	error = change_mode(vp, SCARG(uap, mode), l);
3412
3413	vrele(vp);
3414	return (error);
3415	}
3416
3417	/*
3418	* Common routine to set mode given a vnode.
3419	*/
3420	static int
3421	change_mode(struct vnode vp, int* mode, struct lwp *l)
3422	{
3423	struct vattr vattr;
3424	int error;
3425
3426	vn_lock(vp, LK_EXCLUSIVE \| LK_RETRY);
3427	vattr_null(&vattr);
3428	vattr.va_mode = mode & ALLPERMS;
3429	error = VOP_SETATTR(vp, &vattr, l->l_cred);
3430	VOP_UNLOCK(vp);
3431	return (error);
3432	}
3433
3434	/*
3435	* Set ownership given a path name; this version follows links.
3436	*/
3437	/ ARGSUSED /
3438	int
3439	sys_chown(struct lwp l, const* struct sys_chown_args uap, register_t retval)
3440	{
3441	/ {*
3442	syscallarg(const char ) path;*
3443	syscallarg(uid_t) uid;
3444	syscallarg(gid_t) gid;
3445	} /*
3446	return do_sys_chownat(l, AT_FDCWD, SCARG(uap, path), SCARG(uap,uid),
3447	SCARG(uap, gid), `0`);
3448	}
3449
3450	int
3451	do_sys_chownat(struct lwp l, int* fdat, const char *path, uid_t uid,
3452	gid_t gid, int flags)
3453	{
3454	int error;
3455	struct vnode *vp;
3456	namei_simple_flags_t ns_flag;
3457
3458	if (flags & AT_SYMLINK_NOFOLLOW)
3459	ns_flag = NSM_NOFOLLOW_TRYEMULROOT;
3460	else
3461	ns_flag = NSM_FOLLOW_TRYEMULROOT;
3462
3463	error = fd_nameiat_simple_user(l, fdat, path, ns_flag, &vp);
3464	if (error != `0`)
3465	return error;
3466
3467	error = change_owner(vp, uid, gid, l, `0`);
3468
3469	vrele(vp);
3470
3471	return (error);
3472	}
3473
3474	/*
3475	* Set ownership given a path name; this version follows links.
3476	* Provides POSIX semantics.
3477	*/
3478	/ ARGSUSED /
3479	int
3480	sys___posix_chown(struct lwp l, const* struct sys___posix_chown_args uap, register_t retval)
3481	{
3482	/ {*
3483	syscallarg(const char ) path;*
3484	syscallarg(uid_t) uid;
3485	syscallarg(gid_t) gid;
3486	} /*
3487	int error;
3488	struct vnode *vp;
3489
3490	error = namei_simple_user(SCARG(uap, path),
3491	NSM_FOLLOW_TRYEMULROOT, &vp);
3492	if (error != `0`)
3493	return (error);
3494
3495	error = change_owner(vp, SCARG(uap, uid), SCARG(uap, gid), l, `1`);
3496
3497	vrele(vp);
3498	return (error);
3499	}
3500
3501	/*
3502	* Set ownership given a file descriptor.
3503	*/
3504	/ ARGSUSED /
3505	int
3506	sys_fchown(struct lwp l, const* struct sys_fchown_args uap, register_t retval)
3507	{
3508	/ {*
3509	syscallarg(int) fd;
3510	syscallarg(uid_t) uid;
3511	syscallarg(gid_t) gid;
3512	} /*
3513	int error;
3514	file_t *fp;
3515
3516	/ fd_getvnode() will use the descriptor for us /
3517	if ((error = fd_getvnode(SCARG(uap, fd), &fp)) != `0`)
3518	return (error);
3519	error = change_owner(fp->f_vnode, SCARG(uap, uid), SCARG(uap, gid),
3520	l, `0`);
3521	fd_putfile(SCARG(uap, fd));
3522	return (error);
3523	}
3524
3525	int
3526	sys_fchownat(struct lwp l, const* struct sys_fchownat_args *uap,
3527	register_t *retval)
3528	{
3529	/ {*
3530	syscallarg(int) fd;
3531	syscallarg(const char ) path;*
3532	syscallarg(uid_t) owner;
3533	syscallarg(gid_t) group;
3534	syscallarg(int) flag;
3535	} /*
3536
3537	return do_sys_chownat(l, SCARG(uap, fd), SCARG(uap, path),
3538	SCARG(uap, owner), SCARG(uap, group),
3539	SCARG(uap, flag));
3540	}
3541
3542	/*
3543	* Set ownership given a file descriptor, providing POSIX/XPG semantics.
3544	*/
3545	/ ARGSUSED /
3546	int
3547	sys___posix_fchown(struct lwp l, const* struct sys___posix_fchown_args uap, register_t retval)
3548	{
3549	/ {*
3550	syscallarg(int) fd;
3551	syscallarg(uid_t) uid;
3552	syscallarg(gid_t) gid;
3553	} /*
3554	int error;
3555	file_t *fp;
3556
3557	/ fd_getvnode() will use the descriptor for us /
3558	if ((error = fd_getvnode(SCARG(uap, fd), &fp)) != `0`)
3559	return (error);
3560	error = change_owner(fp->f_vnode, SCARG(uap, uid), SCARG(uap, gid),
3561	l, `1`);
3562	fd_putfile(SCARG(uap, fd));
3563	return (error);
3564	}
3565
3566	/*
3567	* Set ownership given a path name; this version does not follow links.
3568	*/
3569	/ ARGSUSED /
3570	int
3571	sys_lchown(struct lwp l, const* struct sys_lchown_args uap, register_t retval)
3572	{
3573	/ {*
3574	syscallarg(const char ) path;*
3575	syscallarg(uid_t) uid;
3576	syscallarg(gid_t) gid;
3577	} /*
3578	int error;
3579	struct vnode *vp;
3580
3581	error = namei_simple_user(SCARG(uap, path),
3582	NSM_NOFOLLOW_TRYEMULROOT, &vp);
3583	if (error != `0`)
3584	return (error);
3585
3586	error = change_owner(vp, SCARG(uap, uid), SCARG(uap, gid), l, `0`);
3587
3588	vrele(vp);
3589	return (error);
3590	}
3591
3592	/*
3593	* Set ownership given a path name; this version does not follow links.
3594	* Provides POSIX/XPG semantics.
3595	*/
3596	/ ARGSUSED /
3597	int
3598	sys___posix_lchown(struct lwp l, const* struct sys___posix_lchown_args uap, register_t retval)
3599	{
3600	/ {*
3601	syscallarg(const char ) path;*
3602	syscallarg(uid_t) uid;
3603	syscallarg(gid_t) gid;
3604	} /*
3605	int error;
3606	struct vnode *vp;
3607
3608	error = namei_simple_user(SCARG(uap, path),
3609	NSM_NOFOLLOW_TRYEMULROOT, &vp);
3610	if (error != `0`)
3611	return (error);
3612
3613	error = change_owner(vp, SCARG(uap, uid), SCARG(uap, gid), l, `1`);
3614
3615	vrele(vp);
3616	return (error);
3617	}
3618
3619	/*
3620	* Common routine to set ownership given a vnode.
3621	*/
3622	static int
3623	change_owner(struct vnode vp, uid_t uid, gid_t gid, struct* lwp *l,
3624	int posix_semantics)
3625	{
3626	struct vattr vattr;
3627	mode_t newmode;
3628	int error;
3629
3630	vn_lock(vp, LK_EXCLUSIVE \| LK_RETRY);
3631	if ((error = VOP_GETATTR(vp, &vattr, l->l_cred)) != `0`)
3632	goto out;
3633
3634	#define CHANGED(x) ((int)(x) != -1)
3635	newmode = vattr.va_mode;
3636	if (posix_semantics) {
3637	/*
3638	* POSIX/XPG semantics: if the caller is not the super-user,
3639	* clear set-user-id and set-group-id bits. Both POSIX and
3640	* the XPG consider the behaviour for calls by the super-user
3641	* implementation-defined; we leave the set-user-id and set-
3642	* group-id settings intact in that case.
3643	*/
3644	if (vattr.va_mode & S_ISUID) {
3645	if (kauth_authorize_vnode(l->l_cred,
3646	KAUTH_VNODE_RETAIN_SUID, vp, NULL, EPERM) != `0`)
3647	newmode &= ~S_ISUID;
3648	}
3649	if (vattr.va_mode & S_ISGID) {
3650	if (kauth_authorize_vnode(l->l_cred,
3651	KAUTH_VNODE_RETAIN_SGID, vp, NULL, EPERM) != `0`)
3652	newmode &= ~S_ISGID;
3653	}
3654	} else {
3655	/*
3656	* NetBSD semantics: when changing owner and/or group,
3657	* clear the respective bit(s).
3658	*/
3659	if (CHANGED(uid))
3660	newmode &= ~S_ISUID;
3661	if (CHANGED(gid))
3662	newmode &= ~S_ISGID;
3663	}
3664	/ Update va_mode iff altered. /
3665	if (vattr.va_mode == newmode)
3666	newmode = VNOVAL;
3667
3668	vattr_null(&vattr);
3669	vattr.va_uid = CHANGED(uid) ? uid : (uid_t)VNOVAL;
3670	vattr.va_gid = CHANGED(gid) ? gid : (gid_t)VNOVAL;
3671	vattr.va_mode = newmode;
3672	error = VOP_SETATTR(vp, &vattr, l->l_cred);
3673	#undef CHANGED
3674
3675	out:
3676	VOP_UNLOCK(vp);
3677	return (error);
3678	}
3679
3680	/*
3681	* Set the access and modification times given a path name; this
3682	* version follows links.
3683	*/
3684	/ ARGSUSED /
3685	int
3686	sys___utimes50(struct lwp l, const* struct sys___utimes50_args *uap,
3687	register_t *retval)
3688	{
3689	/ {*
3690	syscallarg(const char ) path;*
3691	syscallarg(const struct timeval ) tptr;*
3692	} /*
3693
3694	return do_sys_utimes(l, NULL, SCARG(uap, path), FOLLOW,
3695	SCARG(uap, tptr), UIO_USERSPACE);
3696	}
3697
3698	/*
3699	* Set the access and modification times given a file descriptor.
3700	*/
3701	/ ARGSUSED /
3702	int
3703	sys___futimes50(struct lwp l, const* struct sys___futimes50_args *uap,
3704	register_t *retval)
3705	{
3706	/ {*
3707	syscallarg(int) fd;
3708	syscallarg(const struct timeval ) tptr;*
3709	} /*
3710	int error;
3711	file_t *fp;
3712
3713	/ fd_getvnode() will use the descriptor for us /
3714	if ((error = fd_getvnode(SCARG(uap, fd), &fp)) != `0`)
3715	return (error);
3716	error = do_sys_utimes(l, fp->f_vnode, NULL, `0`, SCARG(uap, tptr),
3717	UIO_USERSPACE);
3718	fd_putfile(SCARG(uap, fd));
3719	return (error);
3720	}
3721
3722	int
3723	sys_futimens(struct lwp l, const* struct sys_futimens_args *uap,
3724	register_t *retval)
3725	{
3726	/ {*
3727	syscallarg(int) fd;
3728	syscallarg(const struct timespec ) tptr;*
3729	} /*
3730	int error;
3731	file_t *fp;
3732
3733	/ fd_getvnode() will use the descriptor for us /
3734	if ((error = fd_getvnode(SCARG(uap, fd), &fp)) != `0`)
3735	return (error);
3736	error = do_sys_utimensat(l, AT_FDCWD, fp->f_vnode, NULL, `0`,
3737	SCARG(uap, tptr), UIO_USERSPACE);
3738	fd_putfile(SCARG(uap, fd));
3739	return (error);
3740	}
3741
3742	/*
3743	* Set the access and modification times given a path name; this
3744	* version does not follow links.
3745	*/
3746	int
3747	sys___lutimes50(struct lwp l, const* struct sys___lutimes50_args *uap,
3748	register_t *retval)
3749	{
3750	/ {*
3751	syscallarg(const char ) path;*
3752	syscallarg(const struct timeval ) tptr;*
3753	} /*
3754
3755	return do_sys_utimes(l, NULL, SCARG(uap, path), NOFOLLOW,
3756	SCARG(uap, tptr), UIO_USERSPACE);
3757	}
3758
3759	int
3760	sys_utimensat(struct lwp l, const* struct sys_utimensat_args *uap,
3761	register_t *retval)
3762	{
3763	/ {*
3764	syscallarg(int) fd;
3765	syscallarg(const char ) path;*
3766	syscallarg(const struct timespec ) tptr;*
3767	syscallarg(int) flag;
3768	} /*
3769	int follow;
3770	const struct timespec *tptr;
3771	int error;
3772
3773	tptr = SCARG(uap, tptr);
3774	follow = (SCARG(uap, flag) & AT_SYMLINK_NOFOLLOW) ? NOFOLLOW : FOLLOW;
3775
3776	error = do_sys_utimensat(l, SCARG(uap, fd), NULL,
3777	SCARG(uap, path), follow, tptr, UIO_USERSPACE);
3778
3779	return error;
3780	}
3781
3782	/*
3783	* Common routine to set access and modification times given a vnode.
3784	*/
3785	int
3786	do_sys_utimens(struct lwp l, struct* vnode vp, const* char path, int* flag,
3787	const struct timespec tptr, enum* uio_seg seg)
3788	{
3789	return do_sys_utimensat(l, AT_FDCWD, vp, path, flag, tptr, seg);
3790	}
3791
3792	int
3793	do_sys_utimensat(struct lwp l, int* fdat, struct vnode *vp,
3794	const char path, int* flag, const struct timespec tptr, enum* uio_seg seg)
3795	{
3796	struct vattr vattr;
3797	int error, dorele = `0`;
3798	namei_simple_flags_t sflags;
3799	bool vanull, setbirthtime;
3800	struct timespec ts[`2`];
3801
3802	KASSERT(l != NULL \|\| fdat == AT_FDCWD);
3803
3804	/*
3805	* I have checked all callers and they pass either FOLLOW,
3806	* NOFOLLOW, or 0 (when they don't pass a path), and NOFOLLOW
3807	* is 0. More to the point, they don't pass anything else.
3808	* Let's keep it that way at least until the namei interfaces
3809	* are fully sanitized.
3810	*/
3811	KASSERT(flag == NOFOLLOW \|\| flag == FOLLOW);
3812	sflags = (flag == FOLLOW) ?
3813	NSM_FOLLOW_TRYEMULROOT : NSM_NOFOLLOW_TRYEMULROOT;
3814
3815	if (tptr == NULL) {
3816	vanull = true;
3817	nanotime(&ts[`0`]);
3818	ts[`1`] = ts[`0`];
3819	} else {
3820	vanull = false;
3821	if (seg != UIO_SYSSPACE) {
3822	error = copyin(tptr, ts, sizeof (ts));
3823	if (error != `0`)
3824	return error;
3825	} else {
3826	ts[`0`] = tptr[`0`];
3827	ts[`1`] = tptr[`1`];
3828	}
3829	}
3830
3831	if (ts[`0`].tv_nsec == UTIME_NOW) {
3832	nanotime(&ts[`0`]);
3833	if (ts[`1`].tv_nsec == UTIME_NOW) {
3834	vanull = true;
3835	ts[`1`] = ts[`0`];
3836	}
3837	} else if (ts[`1`].tv_nsec == UTIME_NOW)
3838	nanotime(&ts[`1`]);
3839
3840	if (vp == NULL) {
3841	/ note: SEG describes TPTR, not PATH; PATH is always user /
3842	error = fd_nameiat_simple_user(l, fdat, path, sflags, &vp);
3843	if (error != `0`)
3844	return error;
3845	dorele = `1`;
3846	}
3847
3848	vn_lock(vp, LK_EXCLUSIVE \| LK_RETRY);
3849	setbirthtime = (VOP_GETATTR(vp, &vattr, l->l_cred) == `0` &&
3850	timespeccmp(&ts[`1`], &vattr.va_birthtime, <));
3851	vattr_null(&vattr);
3852
3853	if (ts[`0`].tv_nsec != UTIME_OMIT)
3854	vattr.va_atime = ts[`0`];
3855
3856	if (ts[`1`].tv_nsec != UTIME_OMIT) {
3857	vattr.va_mtime = ts[`1`];
3858	if (setbirthtime)
3859	vattr.va_birthtime = ts[`1`];
3860	}
3861
3862	if (vanull)
3863	vattr.va_vaflags \|= VA_UTIMES_NULL;
3864	error = VOP_SETATTR(vp, &vattr, l->l_cred);
3865	VOP_UNLOCK(vp);
3866
3867	if (dorele != `0`)
3868	vrele(vp);
3869
3870	return error;
3871	}
3872
3873	int
3874	do_sys_utimes(struct lwp l, struct* vnode vp, const* char path, int* flag,
3875	const struct timeval tptr, enum* uio_seg seg)
3876	{
3877	struct timespec ts[`2`];
3878	struct timespec *tsptr = NULL;
3879	int error;
3880
3881	if (tptr != NULL) {
3882	struct timeval tv[`2`];
3883
3884	if (seg != UIO_SYSSPACE) {
3885	error = copyin(tptr, tv, sizeof (tv));
3886	if (error != `0`)
3887	return error;
3888	tptr = tv;
3889	}
3890
3891	if ((tv[`0`].tv_usec == UTIME_NOW) \|\|
3892	(tv[`0`].tv_usec == UTIME_OMIT))
3893	ts[`0`].tv_nsec = tv[`0`].tv_usec;
3894	else
3895	TIMEVAL_TO_TIMESPEC(&tptr[`0`], &ts[`0`]);
3896
3897	if ((tv[`1`].tv_usec == UTIME_NOW) \|\|
3898	(tv[`1`].tv_usec == UTIME_OMIT))
3899	ts[`1`].tv_nsec = tv[`1`].tv_usec;
3900	else
3901	TIMEVAL_TO_TIMESPEC(&tptr[`1`], &ts[`1`]);
3902
3903	tsptr = &ts[`0`];
3904	}
3905
3906	return do_sys_utimens(l, vp, path, flag, tsptr, UIO_SYSSPACE);
3907	}
3908
3909	/*
3910	* Truncate a file given its path name.
3911	*/
3912	/ ARGSUSED /
3913	int
3914	sys_truncate(struct lwp l, const* struct sys_truncate_args uap, register_t retval)
3915	{
3916	/ {*
3917	syscallarg(const char ) path;*
3918	syscallarg(int) pad;
3919	syscallarg(off_t) length;
3920	} /*
3921	struct vnode *vp;
3922	struct vattr vattr;
3923	int error;
3924
3925	if (SCARG(uap, length) < `0`)
3926	return EINVAL;
3927
3928	error = namei_simple_user(SCARG(uap, path),
3929	NSM_FOLLOW_TRYEMULROOT, &vp);
3930	if (error != `0`)
3931	return (error);
3932	vn_lock(vp, LK_EXCLUSIVE \| LK_RETRY);
3933	if (vp->v_type == VDIR)
3934	error = EISDIR;
3935	else if ((error = vn_writechk(vp)) == `0` &&
3936	(error = VOP_ACCESS(vp, VWRITE, l->l_cred)) == `0`) {
3937	vattr_null(&vattr);
3938	vattr.va_size = SCARG(uap, length);
3939	error = VOP_SETATTR(vp, &vattr, l->l_cred);
3940	}
3941	vput(vp);
3942	return (error);
3943	}
3944
3945	/*
3946	* Truncate a file given a file descriptor.
3947	*/
3948	/ ARGSUSED /
3949	int
3950	sys_ftruncate(struct lwp l, const* struct sys_ftruncate_args uap, register_t retval)
3951	{
3952	/ {*
3953	syscallarg(int) fd;
3954	syscallarg(int) pad;
3955	syscallarg(off_t) length;
3956	} /*
3957	struct vattr vattr;
3958	struct vnode *vp;
3959	file_t *fp;
3960	int error;
3961
3962	if (SCARG(uap, length) < `0`)
3963	return EINVAL;
3964
3965	/ fd_getvnode() will use the descriptor for us /
3966	if ((error = fd_getvnode(SCARG(uap, fd), &fp)) != `0`)
3967	return (error);
3968	if ((fp->f_flag & FWRITE) == `0`) {
3969	error = EINVAL;
3970	goto out;
3971	}
3972	vp = fp->f_vnode;
3973	vn_lock(vp, LK_EXCLUSIVE \| LK_RETRY);
3974	if (vp->v_type == VDIR)
3975	error = EISDIR;
3976	else if ((error = vn_writechk(vp)) == `0`) {
3977	vattr_null(&vattr);
3978	vattr.va_size = SCARG(uap, length);
3979	error = VOP_SETATTR(vp, &vattr, fp->f_cred);
3980	}
3981	VOP_UNLOCK(vp);
3982	out:
3983	fd_putfile(SCARG(uap, fd));
3984	return (error);
3985	}
3986
3987	/*
3988	* Sync an open file.
3989	*/
3990	/ ARGSUSED /
3991	int
3992	sys_fsync(struct lwp l, const* struct sys_fsync_args uap, register_t retval)
3993	{
3994	/ {*
3995	syscallarg(int) fd;
3996	} /*
3997	struct vnode *vp;
3998	file_t *fp;
3999	int error;
4000
4001	/ fd_getvnode() will use the descriptor for us /
4002	if ((error = fd_getvnode(SCARG(uap, fd), &fp)) != `0`)
4003	return (error);
4004	vp = fp->f_vnode;
4005	vn_lock(vp, LK_EXCLUSIVE \| LK_RETRY);
4006	error = VOP_FSYNC(vp, fp->f_cred, FSYNC_WAIT, `0`, `0`);
4007	VOP_UNLOCK(vp);
4008	fd_putfile(SCARG(uap, fd));
4009	return (error);
4010	}
4011
4012	/*
4013	* Sync a range of file data. API modeled after that found in AIX.
4014	*
4015	* FDATASYNC indicates that we need only save enough metadata to be able
4016	* to re-read the written data. Note we duplicate AIX's requirement that
4017	* the file be open for writing.
4018	*/
4019	/ ARGSUSED /
4020	int
4021	sys_fsync_range(struct lwp l, const* struct sys_fsync_range_args uap, register_t retval)
4022	{
4023	/ {*
4024	syscallarg(int) fd;
4025	syscallarg(int) flags;
4026	syscallarg(off_t) start;
4027	syscallarg(off_t) length;
4028	} /*
4029	struct vnode *vp;
4030	file_t *fp;
4031	int flags, nflags;
4032	off_t s, e, len;
4033	int error;
4034
4035	/ fd_getvnode() will use the descriptor for us /
4036	if ((error = fd_getvnode(SCARG(uap, fd), &fp)) != `0`)
4037	return (error);
4038
4039	if ((fp->f_flag & FWRITE) == `0`) {
4040	error = EBADF;
4041	goto out;
4042	}
4043
4044	flags = SCARG(uap, flags);
4045	if (((flags & (FDATASYNC \| FFILESYNC)) == `0`) \|\|
4046	((~flags & (FDATASYNC \| FFILESYNC)) == `0`)) {
4047	error = EINVAL;
4048	goto out;
4049	}
4050	/ Now set up the flags for value(s) to pass to VOP_FSYNC() /
4051	if (flags & FDATASYNC)
4052	nflags = FSYNC_DATAONLY \| FSYNC_WAIT;
4053	else
4054	nflags = FSYNC_WAIT;
4055	if (flags & FDISKSYNC)
4056	nflags \|= FSYNC_CACHE;
4057
4058	len = SCARG(uap, length);
4059	/ If length == 0, we do the whole file, and s = e = 0 will do that /
4060	if (len) {
4061	s = SCARG(uap, start);
4062	e = s + len;
4063	if (e < s) {
4064	error = EINVAL;
4065	goto out;
4066	}
4067	} else {
4068	e = `0`;
4069	s = `0`;
4070	}
4071
4072	vp = fp->f_vnode;
4073	vn_lock(vp, LK_EXCLUSIVE \| LK_RETRY);
4074	error = VOP_FSYNC(vp, fp->f_cred, nflags, s, e);
4075	VOP_UNLOCK(vp);
4076	out:
4077	fd_putfile(SCARG(uap, fd));
4078	return (error);
4079	}
4080
4081	/*
4082	* Sync the data of an open file.
4083	*/
4084	/ ARGSUSED /
4085	int
4086	sys_fdatasync(struct lwp l, const* struct sys_fdatasync_args uap, register_t retval)
4087	{
4088	/ {*
4089	syscallarg(int) fd;
4090	} /*
4091	struct vnode *vp;
4092	file_t *fp;
4093	int error;
4094
4095	/ fd_getvnode() will use the descriptor for us /
4096	if ((error = fd_getvnode(SCARG(uap, fd), &fp)) != `0`)
4097	return (error);
4098	if ((fp->f_flag & FWRITE) == `0`) {
4099	fd_putfile(SCARG(uap, fd));
4100	return (EBADF);
4101	}
4102	vp = fp->f_vnode;
4103	vn_lock(vp, LK_EXCLUSIVE \| LK_RETRY);
4104	error = VOP_FSYNC(vp, fp->f_cred, FSYNC_WAIT\|FSYNC_DATAONLY, `0`, `0`);
4105	VOP_UNLOCK(vp);
4106	fd_putfile(SCARG(uap, fd));
4107	return (error);
4108	}
4109
4110	/*
4111	* Rename files, (standard) BSD semantics frontend.
4112	*/
4113	/ ARGSUSED /
4114	int
4115	sys_rename(struct lwp l, const* struct sys_rename_args uap, register_t retval)
4116	{
4117	/ {*
4118	syscallarg(const char ) from;*
4119	syscallarg(const char ) to;*
4120	} /*
4121
4122	return (do_sys_renameat(l, AT_FDCWD, SCARG(uap, from), AT_FDCWD,
4123	SCARG(uap, to), UIO_USERSPACE, `0`));
4124	}
4125
4126	int
4127	sys_renameat(struct lwp l, const* struct sys_renameat_args *uap,
4128	register_t *retval)
4129	{
4130	/ {*
4131	syscallarg(int) fromfd;
4132	syscallarg(const char ) from;*
4133	syscallarg(int) tofd;
4134	syscallarg(const char ) to;*
4135	} /*
4136
4137	return (do_sys_renameat(l, SCARG(uap, fromfd), SCARG(uap, from),
4138	SCARG(uap, tofd), SCARG(uap, to), UIO_USERSPACE, `0`));
4139	}
4140
4141	/*
4142	* Rename files, POSIX semantics frontend.
4143	*/
4144	/ ARGSUSED /
4145	int
4146	sys___posix_rename(struct lwp l, const* struct sys___posix_rename_args uap, register_t retval)
4147	{
4148	/ {*
4149	syscallarg(const char ) from;*
4150	syscallarg(const char ) to;*
4151	} /*
4152
4153	return (do_sys_renameat(l, AT_FDCWD, SCARG(uap, from), AT_FDCWD,
4154	SCARG(uap, to), UIO_USERSPACE, `1`));
4155	}
4156
4157	/*
4158	* Rename files. Source and destination must either both be directories,
4159	* or both not be directories. If target is a directory, it must be empty.
4160	* If `from' and `to' refer to the same object, the value of the `retain'
4161	* argument is used to determine whether `from' will be
4162	*
4163	* (retain == 0) deleted unless `from' and `to' refer to the same
4164	* object in the file system's name space (BSD).
4165	* (retain == 1) always retained (POSIX).
4166	*
4167	* XXX Synchronize with nfsrv_rename in nfs_serv.c.
4168	*/
4169	int
4170	do_sys_rename(const char from, const* char to, enum* uio_seg seg, int retain)
4171	{
4172	return do_sys_renameat(NULL, AT_FDCWD, from, AT_FDCWD, to, seg, retain);
4173	}
4174
/*
 * Common implementation of the rename system calls.
 *
 * l		calling lwp; may be NULL only when both fromfd and tofd
 *		are AT_FDCWD (asserted below)
 * fromfd/from	directory descriptor and path of the source
 * tofd/to	directory descriptor and path of the destination
 * seg		address space holding both path strings
 * retain	when `from' and `to' name the same object: nonzero means
 *		succeed without doing anything (POSIX); zero means do so
 *		only if they are also the same name in the same directory
 *
 * Returns 0 or an errno value.
 *
 * Cleanup is a fall-through ladder: abort3 unlocks tvp, abort2 unlocks
 * tdvp and leaves the vfs rename lock, abort1 aborts the `to' lookup and
 * releases tdvp/tvp, abort0 aborts the `from' lookup and releases
 * fdvp/fvp, and out2/out1 destroy the two pathbufs.
 */
static int
do_sys_renameat(struct lwp *l, int fromfd, const char *from, int tofd,
    const char *to, enum uio_seg seg, int retain)
{
	struct pathbuf *fpb, *tpb;
	struct nameidata fnd, tnd;
	struct vnode *fdvp, *fvp;
	struct vnode *tdvp, *tvp;
	struct mount *mp, *tmp;
	int error;

	KASSERT(l != NULL || (fromfd == AT_FDCWD && tofd == AT_FDCWD));

	error = pathbuf_maybe_copyin(from, seg, &fpb);
	if (error)
		goto out0;
	KASSERT(fpb != NULL);

	error = pathbuf_maybe_copyin(to, seg, &tpb);
	if (error)
		goto out1;
	KASSERT(tpb != NULL);

	/*
	 * Lookup from.
	 *
	 * XXX LOCKPARENT is wrong because we don't actually want it
	 * locked yet, but (a) namei is insane, and (b) VOP_RENAME is
	 * insane, so for the time being we need to leave it like this.
	 */
	NDINIT(&fnd, DELETE, (LOCKPARENT | TRYEMULROOT), fpb);
	if ((error = fd_nameiat(l, fromfd, &fnd)) != 0)
		goto out2;

	/*
	 * Pull out the important results of the lookup, fdvp and fvp.
	 * Of course, fvp is bogus because we're about to unlock fdvp.
	 */
	fdvp = fnd.ni_dvp;
	fvp = fnd.ni_vp;
	KASSERT(fdvp != NULL);
	KASSERT(fvp != NULL);
	KASSERT((fdvp == fvp) || (VOP_ISLOCKED(fdvp) == LK_EXCLUSIVE));

	/*
	 * Make sure neither fdvp nor fvp is locked.
	 */
	if (fdvp != fvp)
		VOP_UNLOCK(fdvp);
	/* XXX KASSERT(VOP_ISLOCKED(fdvp) != LK_EXCLUSIVE); */
	/* XXX KASSERT(VOP_ISLOCKED(fvp) != LK_EXCLUSIVE); */

	/*
	 * Reject renaming `.' and `..'.  Can't do this until after
	 * namei because we need namei's parsing to find the final
	 * component name.  (namei should just leave us with the final
	 * component name and not look it up itself, but anyway...)
	 *
	 * This was here before because we used to relookup from
	 * instead of to and relookup requires the caller to check
	 * this, but now file systems may depend on this check, so we
	 * must retain it until the file systems are all rototilled.
	 */
	if (((fnd.ni_cnd.cn_namelen == 1) &&
		(fnd.ni_cnd.cn_nameptr[0] == '.')) ||
	    ((fnd.ni_cnd.cn_namelen == 2) &&
		(fnd.ni_cnd.cn_nameptr[0] == '.') &&
		(fnd.ni_cnd.cn_nameptr[1] == '.'))) {
		error = EINVAL;	/* XXX EISDIR? */
		goto abort0;
	}

	/*
	 * Lookup to.
	 *
	 * XXX LOCKPARENT is wrong, but...insanity, &c.  Also, using
	 * fvp here to decide whether to add CREATEDIR is a load of
	 * bollocks because fvp might be the wrong node by now, since
	 * fdvp is unlocked.
	 *
	 * XXX Why not pass CREATEDIR always?
	 */
	NDINIT(&tnd, RENAME,
	    (LOCKPARENT | NOCACHE | TRYEMULROOT |
		((fvp->v_type == VDIR)? CREATEDIR : 0)),
	    tpb);
	if ((error = fd_nameiat(l, tofd, &tnd)) != 0)
		goto abort0;

	/*
	 * Pull out the important results of the lookup, tdvp and tvp.
	 * Of course, tvp is bogus because we're about to unlock tdvp.
	 */
	tdvp = tnd.ni_dvp;
	tvp = tnd.ni_vp;
	KASSERT(tdvp != NULL);
	KASSERT((tdvp == tvp) || (VOP_ISLOCKED(tdvp) == LK_EXCLUSIVE));

	/*
	 * Make sure neither tdvp nor tvp is locked.
	 */
	if (tdvp != tvp)
		VOP_UNLOCK(tdvp);
	/* XXX KASSERT(VOP_ISLOCKED(tdvp) != LK_EXCLUSIVE); */
	/* XXX KASSERT((tvp == NULL) || (VOP_ISLOCKED(tvp) != LK_EXCLUSIVE)); */

	/*
	 * Reject renaming onto `.' or `..'.  relookup is unhappy with
	 * these, which is why we must do this here.  Once upon a time
	 * we relooked up from instead of to, and consequently didn't
	 * need this check, but now that we relookup to instead of
	 * from, we need this; and we shall need it forever forward
	 * until the VOP_RENAME protocol changes, because file systems
	 * will no doubt begin to depend on this check.
	 */
	if ((tnd.ni_cnd.cn_namelen == 1) && (tnd.ni_cnd.cn_nameptr[0] == '.')) {
		error = EISDIR;
		goto abort1;
	}
	if ((tnd.ni_cnd.cn_namelen == 2) &&
	    (tnd.ni_cnd.cn_nameptr[0] == '.') &&
	    (tnd.ni_cnd.cn_nameptr[1] == '.')) {
		error = EINVAL;
		goto abort1;
	}

	/*
	 * Get the mount point.  If the file system has been unmounted,
	 * which it may be because we're not holding any vnode locks,
	 * then v_mount will be NULL.  We're not really supposed to
	 * read v_mount without holding the vnode lock, but since we
	 * have fdvp referenced, if fdvp->v_mount changes then at worst
	 * it will be set to NULL, not changed to another mount point.
	 * And, of course, since it is up to the file system to
	 * determine the real lock order, we can't lock both fdvp and
	 * tdvp at the same time.
	 */
	mp = fdvp->v_mount;
	if (mp == NULL) {
		error = ENOENT;
		goto abort1;
	}

	/*
	 * Make sure the mount points match.  Again, although we don't
	 * hold any vnode locks, the v_mount fields may change -- but
	 * at worst they will change to NULL, so this will never become
	 * a cross-device rename, because we hold vnode references.
	 *
	 * XXX Because nothing is locked and the compiler may reorder
	 * things here, unmounting the file system at an inopportune
	 * moment may cause rename to fail with EXDEV when it really
	 * should fail with ENOENT.
	 */
	tmp = tdvp->v_mount;
	if (tmp == NULL) {
		error = ENOENT;
		goto abort1;
	}

	if (mp != tmp) {
		error = EXDEV;
		goto abort1;
	}

	/*
	 * Take the vfs rename lock to avoid cross-directory screw cases.
	 * Nothing is locked currently, so taking this lock is safe.
	 */
	error = VFS_RENAMELOCK_ENTER(mp);
	if (error)
		goto abort1;

	/*
	 * Now fdvp, fvp, tdvp, and (if nonnull) tvp are referenced,
	 * and nothing is locked except for the vfs rename lock.
	 *
	 * The next step is a little rain dance to conform to the
	 * insane lock protocol, even though it does nothing to ward
	 * off race conditions.
	 *
	 * We need tdvp and tvp to be locked.  However, because we have
	 * unlocked tdvp in order to hold no locks while we take the
	 * vfs rename lock, tvp may be wrong here, and we can't safely
	 * lock it even if the sensible file systems will just unlock
	 * it straight away.  Consequently, we must lock tdvp and then
	 * relookup tvp to get it locked.
	 *
	 * Finally, because the VOP_RENAME protocol is brain-damaged
	 * and various file systems insanely depend on the semantics of
	 * this brain damage, the lookup of to must be the last lookup
	 * before VOP_RENAME.
	 */
	vn_lock(tdvp, LK_EXCLUSIVE | LK_RETRY);
	error = relookup(tdvp, &tnd.ni_vp, &tnd.ni_cnd, 0);
	if (error)
		goto abort2;

	/*
	 * Drop the old tvp and pick up the new one -- which might be
	 * the same, but that doesn't matter to us.  After this, tdvp
	 * and tvp should both be locked.
	 */
	if (tvp != NULL)
		vrele(tvp);
	tvp = tnd.ni_vp;
	KASSERT(VOP_ISLOCKED(tdvp) == LK_EXCLUSIVE);
	KASSERT((tvp == NULL) || (VOP_ISLOCKED(tvp) == LK_EXCLUSIVE));

	/*
	 * The old do_sys_rename had various consistency checks here
	 * involving fvp and tvp.  fvp is bogus already here, and tvp
	 * will become bogus soon in any sensible file system, so the
	 * only purpose in putting these checks here is to give lip
	 * service to these screw cases and to acknowledge that they
	 * exist, not actually to handle them, but here you go
	 * anyway...
	 */

	/*
	 * Acknowledge that directories and non-directories aren't
	 * supposed to mix.
	 */
	if (tvp != NULL) {
		if ((fvp->v_type == VDIR) && (tvp->v_type != VDIR)) {
			error = ENOTDIR;
			goto abort3;
		} else if ((fvp->v_type != VDIR) && (tvp->v_type == VDIR)) {
			error = EISDIR;
			goto abort3;
		}
	}

	/*
	 * Acknowledge some random screw case, among the dozens that
	 * might arise: the source is the destination's directory.
	 */
	if (fvp == tdvp) {
		error = EINVAL;
		goto abort3;
	}

	/*
	 * Acknowledge that POSIX has a wacky screw case: source and
	 * target are the same object.  This returns success through the
	 * abort path, i.e. without calling VOP_RENAME, either always
	 * (retain) or only when directory and name also match.
	 *
	 * XXX Eventually the retain flag needs to be passed on to
	 * VOP_RENAME.
	 */
	if (fvp == tvp) {
		if (retain) {
			error = 0;
			goto abort3;
		} else if ((fdvp == tdvp) &&
		    (fnd.ni_cnd.cn_namelen == tnd.ni_cnd.cn_namelen) &&
		    (0 == memcmp(fnd.ni_cnd.cn_nameptr, tnd.ni_cnd.cn_nameptr,
			fnd.ni_cnd.cn_namelen))) {
			error = 0;
			goto abort3;
		}
	}

	/*
	 * Make sure veriexec can screw us up.  (But a race can screw
	 * up veriexec, of course -- remember, fvp and (soon) tvp are
	 * bogus.)
	 */
#if NVERIEXEC > 0
	{
		char *f1, *f2;
		size_t f1_len;
		size_t f2_len;

		/* NUL-terminated copies of the two final component names. */
		f1_len = fnd.ni_cnd.cn_namelen + 1;
		f1 = kmem_alloc(f1_len, KM_SLEEP);
		strlcpy(f1, fnd.ni_cnd.cn_nameptr, f1_len);

		f2_len = tnd.ni_cnd.cn_namelen + 1;
		f2 = kmem_alloc(f2_len, KM_SLEEP);
		strlcpy(f2, tnd.ni_cnd.cn_nameptr, f2_len);

		/* curlwp rather than l: l is allowed to be NULL here. */
		error = veriexec_renamechk(curlwp, fvp, f1, tvp, f2);

		kmem_free(f1, f1_len);
		kmem_free(f2, f2_len);

		if (error)
			goto abort3;
	}
#endif /* NVERIEXEC > 0 */

	/*
	 * All ready.  Incant the rename vop.
	 */
	/* XXX KASSERT(VOP_ISLOCKED(fdvp) != LK_EXCLUSIVE); */
	/* XXX KASSERT(VOP_ISLOCKED(fvp) != LK_EXCLUSIVE); */
	KASSERT(VOP_ISLOCKED(tdvp) == LK_EXCLUSIVE);
	KASSERT((tvp == NULL) || (VOP_ISLOCKED(tvp) == LK_EXCLUSIVE));
	error = VOP_RENAME(fdvp, fvp, &fnd.ni_cnd, tdvp, tvp, &tnd.ni_cnd);

	/*
	 * VOP_RENAME releases fdvp, fvp, tdvp, and tvp, and unlocks
	 * tdvp and tvp.  But we can't assert any of that.
	 */
	/* XXX KASSERT(VOP_ISLOCKED(fdvp) != LK_EXCLUSIVE); */
	/* XXX KASSERT(VOP_ISLOCKED(fvp) != LK_EXCLUSIVE); */
	/* XXX KASSERT(VOP_ISLOCKED(tdvp) != LK_EXCLUSIVE); */
	/* XXX KASSERT((tvp == NULL) || (VOP_ISLOCKED(tvp) != LK_EXCLUSIVE)); */

	/*
	 * So all we have left to do is to drop the rename lock and
	 * destroy the pathbufs.
	 */
	VFS_RENAMELOCK_EXIT(mp);
	goto out2;

abort3:	if ((tvp != NULL) && (tvp != tdvp))
		VOP_UNLOCK(tvp);
abort2:	VOP_UNLOCK(tdvp);
	VFS_RENAMELOCK_EXIT(mp);
abort1:	VOP_ABORTOP(tdvp, &tnd.ni_cnd);
	vrele(tdvp);
	if (tvp != NULL)
		vrele(tvp);
abort0:	VOP_ABORTOP(fdvp, &fnd.ni_cnd);
	vrele(fdvp);
	vrele(fvp);
out2:	pathbuf_destroy(tpb);
out1:	pathbuf_destroy(fpb);
out0:	return error;
}
4505
4506	/*
4507	* Make a directory file.
4508	*/
4509	/ ARGSUSED /
4510	int
4511	sys_mkdir(struct lwp l, const* struct sys_mkdir_args uap, register_t retval)
4512	{
4513	/ {*
4514	syscallarg(const char ) path;*
4515	syscallarg(int) mode;
4516	} /*
4517
4518	return do_sys_mkdirat(l, AT_FDCWD, SCARG(uap, path),
4519	SCARG(uap, mode), UIO_USERSPACE);
4520	}
4521
4522	int
4523	sys_mkdirat(struct lwp l, const* struct sys_mkdirat_args *uap,
4524	register_t *retval)
4525	{
4526	/ {*
4527	syscallarg(int) fd;
4528	syscallarg(const char ) path;*
4529	syscallarg(int) mode;
4530	} /*
4531
4532	return do_sys_mkdirat(l, SCARG(uap, fd), SCARG(uap, path),
4533	SCARG(uap, mode), UIO_USERSPACE);
4534	}
4535
4536
4537	int
4538	do_sys_mkdir(const char path, mode_t mode, enum* uio_seg seg)
4539	{
4540	return do_sys_mkdirat(NULL, AT_FDCWD, path, mode, UIO_USERSPACE);
4541	}
4542
4543	static int
4544	do_sys_mkdirat(struct lwp l, int* fdat, const char *path, mode_t mode,
4545	enum uio_seg seg)
4546	{
4547	struct proc *p = curlwp->l_proc;
4548	struct vnode *vp;
4549	struct vattr vattr;
4550	int error;
4551	struct pathbuf *pb;
4552	struct nameidata nd;
4553
4554	KASSERT(l != NULL \|\| fdat == AT_FDCWD);
4555
4556	/ XXX bollocks, should pass in a pathbuf /
4557	error = pathbuf_maybe_copyin(path, seg, &pb);
4558	if (error) {
4559	return error;
4560	}
4561
4562	NDINIT(&nd, CREATE, LOCKPARENT \| CREATEDIR \| TRYEMULROOT, pb);
4563
4564	if ((error = fd_nameiat(l, fdat, &nd)) != `0`) {
4565	pathbuf_destroy(pb);
4566	return (error);
4567	}
4568	vp = nd.ni_vp;
4569	if (vp != NULL) {
4570	VOP_ABORTOP(nd.ni_dvp, &nd.ni_cnd);
4571	if (nd.ni_dvp == vp)
4572	vrele(nd.ni_dvp);
4573	else
4574	vput(nd.ni_dvp);
4575	vrele(vp);
4576	pathbuf_destroy(pb);
4577	return (EEXIST);
4578	}
4579	vattr_null(&vattr);
4580	vattr.va_type = VDIR;
4581	/ We will read cwdi->cwdi_cmask unlocked. /
4582	vattr.va_mode = (mode & ACCESSPERMS) &~ p->p_cwdi->cwdi_cmask;
4583	error = VOP_MKDIR(nd.ni_dvp, &nd.ni_vp, &nd.ni_cnd, &vattr);
4584	if (!error)
4585	vrele(nd.ni_vp);
4586	vput(nd.ni_dvp);
4587	pathbuf_destroy(pb);
4588	return (error);
4589	}
4590
4591	/*
4592	* Remove a directory file.
4593	*/
4594	/ ARGSUSED /
4595	int
4596	sys_rmdir(struct lwp l, const* struct sys_rmdir_args uap, register_t retval)
4597	{
4598	return do_sys_unlinkat(l, AT_FDCWD, SCARG(uap, path),
4599	AT_REMOVEDIR, UIO_USERSPACE);
4600	}
4601
4602	/*
4603	* Read a block of directory entries in a file system independent format.
4604	*/
4605	int
4606	sys___getdents30(struct lwp l, const* struct sys___getdents30_args uap, register_t retval)
4607	{
4608	/ {*
4609	syscallarg(int) fd;
4610	syscallarg(char ) buf;*
4611	syscallarg(size_t) count;
4612	} /*
4613	file_t *fp;
4614	int error, done;
4615
4616	/ fd_getvnode() will use the descriptor for us /
4617	if ((error = fd_getvnode(SCARG(uap, fd), &fp)) != `0`)
4618	return (error);
4619	if ((fp->f_flag & FREAD) == `0`) {
4620	error = EBADF;
4621	goto out;
4622	}
4623	error = vn_readdir(fp, SCARG(uap, buf), UIO_USERSPACE,
4624	SCARG(uap, count), &done, l, `0`, `0`);
4625	ktrgenio(SCARG(uap, fd), UIO_READ, SCARG(uap, buf), done, error);
4626	*retval = done;
4627	out:
4628	fd_putfile(SCARG(uap, fd));
4629	return (error);
4630	}
4631
4632	/*
4633	* Set the mode mask for creation of filesystem nodes.
4634	*/
4635	int
4636	sys_umask(struct lwp l, const* struct sys_umask_args uap, register_t retval)
4637	{
4638	/ {*
4639	syscallarg(mode_t) newmask;
4640	} /*
4641	struct proc *p = l->l_proc;
4642	struct cwdinfo *cwdi;
4643
4644	/*
4645	* cwdi->cwdi_cmask will be read unlocked elsewhere. What's
4646	* important is that we serialize changes to the mask. The
4647	* rw_exit() will issue a write memory barrier on our behalf,
4648	* and force the changes out to other CPUs (as it must use an
4649	* atomic operation, draining the local CPU's store buffers).
4650	*/
4651	cwdi = p->p_cwdi;
4652	rw_enter(&cwdi->cwdi_lock, RW_WRITER);
4653	*retval = cwdi->cwdi_cmask;
4654	cwdi->cwdi_cmask = SCARG(uap, newmask) & ALLPERMS;
4655	rw_exit(&cwdi->cwdi_lock);
4656
4657	return (`0`);
4658	}
4659
4660	int
4661	dorevoke(struct vnode *vp, kauth_cred_t cred)
4662	{
4663	struct vattr vattr;
4664	int error, fs_decision;
4665
4666	vn_lock(vp, LK_SHARED \| LK_RETRY);
4667	error = VOP_GETATTR(vp, &vattr, cred);
4668	VOP_UNLOCK(vp);
4669	if (error != `0`)
4670	return error;
4671	fs_decision = (kauth_cred_geteuid(cred) == vattr.va_uid) ? `0` : EPERM;
4672	error = kauth_authorize_vnode(cred, KAUTH_VNODE_REVOKE, vp, NULL,
4673	fs_decision);
4674	if (!error)
4675	VOP_REVOKE(vp, REVOKEALL);
4676	return (error);
4677	}
4678
4679	/*
4680	* Void all references to file by ripping underlying filesystem
4681	* away from vnode.
4682	*/
4683	/ ARGSUSED /
4684	int
4685	sys_revoke(struct lwp l, const* struct sys_revoke_args uap, register_t retval)
4686	{
4687	/ {*
4688	syscallarg(const char ) path;*
4689	} /*
4690	struct vnode *vp;
4691	int error;
4692
4693	error = namei_simple_user(SCARG(uap, path),
4694	NSM_FOLLOW_TRYEMULROOT, &vp);
4695	if (error != `0`)
4696	return (error);
4697	error = dorevoke(vp, l->l_cred);
4698	vrele(vp);
4699	return (error);
4700	}
4701
4702	/*
4703	* Allocate backing store for a file, filling a hole without having to
4704	* explicitly write anything out.
4705	*/
4706	/ ARGSUSED /
4707	int
4708	sys_posix_fallocate(struct lwp l, const* struct sys_posix_fallocate_args *uap,
4709	register_t *retval)
4710	{
4711	/ {*
4712	syscallarg(int) fd;
4713	syscallarg(off_t) pos;
4714	syscallarg(off_t) len;
4715	} /*
4716	int fd;
4717	off_t pos, len;
4718	struct file *fp;
4719	struct vnode *vp;
4720	int error;
4721
4722	fd = SCARG(uap, fd);
4723	pos = SCARG(uap, pos);
4724	len = SCARG(uap, len);
4725
4726	if (pos < `0` \|\| len < `0` \|\| len > OFF_T_MAX - pos) {
4727	*retval = EINVAL;
4728	return `0`;
4729	}
4730
4731	error = fd_getvnode(fd, &fp);
4732	if (error) {
4733	*retval = error;
4734	return `0`;
4735	}
4736	if ((fp->f_flag & FWRITE) == `0`) {
4737	error = EBADF;
4738	goto fail;
4739	}
4740	vp = fp->f_vnode;
4741
4742	vn_lock(vp, LK_EXCLUSIVE \| LK_RETRY);
4743	if (vp->v_type == VDIR) {
4744	error = EISDIR;
4745	} else {
4746	error = VOP_FALLOCATE(vp, pos, len);
4747	}
4748	VOP_UNLOCK(vp);
4749
4750	fail:
4751	fd_putfile(fd);
4752	*retval = error;
4753	return `0`;
4754	}
4755
4756	/*
4757	* Deallocate backing store for a file, creating a hole. Also used for
4758	* invoking TRIM on disks.
4759	*/
4760	/ ARGSUSED /
4761	int
4762	sys_fdiscard(struct lwp l, const* struct sys_fdiscard_args *uap,
4763	register_t *retval)
4764	{
4765	/ {*
4766	syscallarg(int) fd;
4767	syscallarg(off_t) pos;
4768	syscallarg(off_t) len;
4769	} /*
4770	int fd;
4771	off_t pos, len;
4772	struct file *fp;
4773	struct vnode *vp;
4774	int error;
4775
4776	fd = SCARG(uap, fd);
4777	pos = SCARG(uap, pos);
4778	len = SCARG(uap, len);
4779
4780	if (pos < `0` \|\| len < `0` \|\| len > OFF_T_MAX - pos) {
4781	return EINVAL;
4782	}
4783
4784	error = fd_getvnode(fd, &fp);
4785	if (error) {
4786	return error;
4787	}
4788	if ((fp->f_flag & FWRITE) == `0`) {
4789	error = EBADF;
4790	goto fail;
4791	}
4792	vp = fp->f_vnode;
4793
4794	vn_lock(vp, LK_EXCLUSIVE \| LK_RETRY);
4795	if (vp->v_type == VDIR) {
4796	error = EISDIR;
4797	} else {
4798	error = VOP_FDISCARD(vp, pos, len);
4799	}
4800	VOP_UNLOCK(vp);
4801
4802	fail:
4803	fd_putfile(fd);
4804	return error;
4805	}
4806

Browse the source code of src/src/sys/kern/vfs_syscalls.c