1/* $NetBSD: vfs_mount.c,v 1.41 2016/11/03 11:03:31 hannken Exp $ */
2
3/*-
4 * Copyright (c) 1997-2011 The NetBSD Foundation, Inc.
5 * All rights reserved.
6 *
7 * This code is derived from software contributed to The NetBSD Foundation
8 * by Jason R. Thorpe of the Numerical Aerospace Simulation Facility,
9 * NASA Ames Research Center, by Charles M. Hannum, and by Andrew Doran.
10 *
11 * Redistribution and use in source and binary forms, with or without
12 * modification, are permitted provided that the following conditions
13 * are met:
14 * 1. Redistributions of source code must retain the above copyright
15 * notice, this list of conditions and the following disclaimer.
16 * 2. Redistributions in binary form must reproduce the above copyright
17 * notice, this list of conditions and the following disclaimer in the
18 * documentation and/or other materials provided with the distribution.
19 *
20 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
21 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
22 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
23 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
24 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
25 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
26 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
27 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
28 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
29 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
30 * POSSIBILITY OF SUCH DAMAGE.
31 */
32
33/*
34 * Copyright (c) 1989, 1993
35 * The Regents of the University of California. All rights reserved.
36 * (c) UNIX System Laboratories, Inc.
37 * All or some portions of this file are derived from material licensed
38 * to the University of California by American Telephone and Telegraph
39 * Co. or Unix System Laboratories, Inc. and are reproduced herein with
40 * the permission of UNIX System Laboratories, Inc.
41 *
42 * Redistribution and use in source and binary forms, with or without
43 * modification, are permitted provided that the following conditions
44 * are met:
45 * 1. Redistributions of source code must retain the above copyright
46 * notice, this list of conditions and the following disclaimer.
47 * 2. Redistributions in binary form must reproduce the above copyright
48 * notice, this list of conditions and the following disclaimer in the
49 * documentation and/or other materials provided with the distribution.
50 * 3. Neither the name of the University nor the names of its contributors
51 * may be used to endorse or promote products derived from this software
52 * without specific prior written permission.
53 *
54 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
55 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
56 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
57 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
58 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
59 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
60 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
61 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
62 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
63 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
64 * SUCH DAMAGE.
65 *
66 * @(#)vfs_subr.c 8.13 (Berkeley) 4/18/94
67 */
68
69#include <sys/cdefs.h>
70__KERNEL_RCSID(0, "$NetBSD: vfs_mount.c,v 1.41 2016/11/03 11:03:31 hannken Exp $");
71
72#include <sys/param.h>
73#include <sys/kernel.h>
74
75#include <sys/atomic.h>
76#include <sys/buf.h>
77#include <sys/conf.h>
78#include <sys/fcntl.h>
79#include <sys/filedesc.h>
80#include <sys/device.h>
81#include <sys/kauth.h>
82#include <sys/kmem.h>
83#include <sys/module.h>
84#include <sys/mount.h>
85#include <sys/namei.h>
86#include <sys/extattr.h>
87#include <sys/syscallargs.h>
88#include <sys/sysctl.h>
89#include <sys/systm.h>
90#include <sys/vfs_syscalls.h>
91#include <sys/vnode_impl.h>
92
93#include <miscfs/genfs/genfs.h>
94#include <miscfs/specfs/specdev.h>
95
96/* Root filesystem. */
97vnode_t * rootvnode;
98
99/* Mounted filesystem list. */
100struct mntlist mountlist;
101kmutex_t mountlist_lock;
102
103kmutex_t mntvnode_lock;
104kmutex_t vfs_list_lock;
105
106static specificdata_domain_t mount_specificdata_domain;
107static kmutex_t mntid_lock;
108
109static kmutex_t mountgen_lock;
110static uint64_t mountgen;
111
112void
113vfs_mount_sysinit(void)
114{
115
116 TAILQ_INIT(&mountlist);
117 mutex_init(&mountlist_lock, MUTEX_DEFAULT, IPL_NONE);
118 mutex_init(&mntvnode_lock, MUTEX_DEFAULT, IPL_NONE);
119 mutex_init(&vfs_list_lock, MUTEX_DEFAULT, IPL_NONE);
120
121 mount_specificdata_domain = specificdata_domain_create();
122 mutex_init(&mntid_lock, MUTEX_DEFAULT, IPL_NONE);
123 mutex_init(&mountgen_lock, MUTEX_DEFAULT, IPL_NONE);
124 mountgen = 0;
125}
126
127struct mount *
128vfs_mountalloc(struct vfsops *vfsops, vnode_t *vp)
129{
130 struct mount *mp;
131 int error __diagused;
132
133 mp = kmem_zalloc(sizeof(*mp), KM_SLEEP);
134 if (mp == NULL)
135 return NULL;
136
137 mp->mnt_op = vfsops;
138 mp->mnt_refcnt = 1;
139 TAILQ_INIT(&mp->mnt_vnodelist);
140 mutex_init(&mp->mnt_unmounting, MUTEX_DEFAULT, IPL_NONE);
141 mutex_init(&mp->mnt_renamelock, MUTEX_DEFAULT, IPL_NONE);
142 mutex_init(&mp->mnt_updating, MUTEX_DEFAULT, IPL_NONE);
143 error = vfs_busy(mp, NULL);
144 KASSERT(error == 0);
145 mp->mnt_vnodecovered = vp;
146 mount_initspecific(mp);
147
148 mutex_enter(&mountgen_lock);
149 mp->mnt_gen = mountgen++;
150 mutex_exit(&mountgen_lock);
151
152 return mp;
153}
154
155/*
156 * vfs_rootmountalloc: lookup a filesystem type, and if found allocate and
157 * initialize a mount structure for it.
158 *
159 * Devname is usually updated by mount(8) after booting.
160 */
161int
162vfs_rootmountalloc(const char *fstypename, const char *devname,
163 struct mount **mpp)
164{
165 struct vfsops *vfsp = NULL;
166 struct mount *mp;
167
168 mutex_enter(&vfs_list_lock);
169 LIST_FOREACH(vfsp, &vfs_list, vfs_list)
170 if (!strncmp(vfsp->vfs_name, fstypename,
171 sizeof(mp->mnt_stat.f_fstypename)))
172 break;
173 if (vfsp == NULL) {
174 mutex_exit(&vfs_list_lock);
175 return (ENODEV);
176 }
177 vfsp->vfs_refcount++;
178 mutex_exit(&vfs_list_lock);
179
180 if ((mp = vfs_mountalloc(vfsp, NULL)) == NULL)
181 return ENOMEM;
182 mp->mnt_flag = MNT_RDONLY;
183 (void)strlcpy(mp->mnt_stat.f_fstypename, vfsp->vfs_name,
184 sizeof(mp->mnt_stat.f_fstypename));
185 mp->mnt_stat.f_mntonname[0] = '/';
186 mp->mnt_stat.f_mntonname[1] = '\0';
187 mp->mnt_stat.f_mntfromname[sizeof(mp->mnt_stat.f_mntfromname) - 1] =
188 '\0';
189 (void)copystr(devname, mp->mnt_stat.f_mntfromname,
190 sizeof(mp->mnt_stat.f_mntfromname) - 1, 0);
191 *mpp = mp;
192 return 0;
193}
194
195/*
196 * vfs_getnewfsid: get a new unique fsid.
197 */
198void
199vfs_getnewfsid(struct mount *mp)
200{
201 static u_short xxxfs_mntid;
202 fsid_t tfsid;
203 int mtype;
204
205 mutex_enter(&mntid_lock);
206 mtype = makefstype(mp->mnt_op->vfs_name);
207 mp->mnt_stat.f_fsidx.__fsid_val[0] = makedev(mtype, 0);
208 mp->mnt_stat.f_fsidx.__fsid_val[1] = mtype;
209 mp->mnt_stat.f_fsid = mp->mnt_stat.f_fsidx.__fsid_val[0];
210 if (xxxfs_mntid == 0)
211 ++xxxfs_mntid;
212 tfsid.__fsid_val[0] = makedev(mtype & 0xff, xxxfs_mntid);
213 tfsid.__fsid_val[1] = mtype;
214 if (!TAILQ_EMPTY(&mountlist)) {
215 while (vfs_getvfs(&tfsid)) {
216 tfsid.__fsid_val[0]++;
217 xxxfs_mntid++;
218 }
219 }
220 mp->mnt_stat.f_fsidx.__fsid_val[0] = tfsid.__fsid_val[0];
221 mp->mnt_stat.f_fsid = mp->mnt_stat.f_fsidx.__fsid_val[0];
222 mutex_exit(&mntid_lock);
223}
224
225/*
226 * Lookup a mount point by filesystem identifier.
227 *
228 * XXX Needs to add a reference to the mount point.
229 */
230struct mount *
231vfs_getvfs(fsid_t *fsid)
232{
233 struct mount *mp;
234
235 mutex_enter(&mountlist_lock);
236 TAILQ_FOREACH(mp, &mountlist, mnt_list) {
237 if (mp->mnt_stat.f_fsidx.__fsid_val[0] == fsid->__fsid_val[0] &&
238 mp->mnt_stat.f_fsidx.__fsid_val[1] == fsid->__fsid_val[1]) {
239 mutex_exit(&mountlist_lock);
240 return (mp);
241 }
242 }
243 mutex_exit(&mountlist_lock);
244 return NULL;
245}
246
247/*
248 * Drop a reference to a mount structure, freeing if the last reference.
249 */
250void
251vfs_destroy(struct mount *mp)
252{
253
254 if (__predict_true((int)atomic_dec_uint_nv(&mp->mnt_refcnt) > 0)) {
255 return;
256 }
257
258 /*
259 * Nothing else has visibility of the mount: we can now
260 * free the data structures.
261 */
262 KASSERT(mp->mnt_refcnt == 0);
263 specificdata_fini(mount_specificdata_domain, &mp->mnt_specdataref);
264 mutex_destroy(&mp->mnt_unmounting);
265 mutex_destroy(&mp->mnt_updating);
266 mutex_destroy(&mp->mnt_renamelock);
267 if (mp->mnt_op != NULL) {
268 vfs_delref(mp->mnt_op);
269 }
270 kmem_free(mp, sizeof(*mp));
271}
272
273/*
274 * Mark a mount point as busy, and gain a new reference to it. Used to
275 * prevent the file system from being unmounted during critical sections.
276 *
277 * vfs_busy can be called multiple times and by multiple threads
278 * and must be accompanied by the same number of vfs_unbusy calls.
279 *
280 * => The caller must hold a pre-existing reference to the mount.
281 * => Will fail if the file system is being unmounted, or is unmounted.
282 */
283int
284vfs_busy(struct mount *mp, struct mount **nextp)
285{
286
287 KASSERT(mp->mnt_refcnt > 0);
288
289 mutex_enter(&mp->mnt_unmounting);
290 if (__predict_false((mp->mnt_iflag & IMNT_GONE) != 0)) {
291 mutex_exit(&mp->mnt_unmounting);
292 if (nextp != NULL) {
293 KASSERT(mutex_owned(&mountlist_lock));
294 *nextp = TAILQ_NEXT(mp, mnt_list);
295 }
296 return ENOENT;
297 }
298 ++mp->mnt_busynest;
299 KASSERT(mp->mnt_busynest != 0);
300 mutex_exit(&mp->mnt_unmounting);
301 if (nextp != NULL) {
302 mutex_exit(&mountlist_lock);
303 }
304 atomic_inc_uint(&mp->mnt_refcnt);
305 return 0;
306}
307
308/*
309 * Unbusy a busy filesystem.
310 *
311 * Every successful vfs_busy() call must be undone by a vfs_unbusy() call.
312 *
313 * => If keepref is true, preserve reference added by vfs_busy().
314 * => If nextp != NULL, acquire mountlist_lock.
315 */
316void
317vfs_unbusy(struct mount *mp, bool keepref, struct mount **nextp)
318{
319
320 KASSERT(mp->mnt_refcnt > 0);
321
322 if (nextp != NULL) {
323 mutex_enter(&mountlist_lock);
324 }
325 mutex_enter(&mp->mnt_unmounting);
326 KASSERT(mp->mnt_busynest != 0);
327 mp->mnt_busynest--;
328 mutex_exit(&mp->mnt_unmounting);
329 if (!keepref) {
330 vfs_destroy(mp);
331 }
332 if (nextp != NULL) {
333 KASSERT(mutex_owned(&mountlist_lock));
334 *nextp = TAILQ_NEXT(mp, mnt_list);
335 }
336}
337
338struct vnode_iterator {
339 struct vnode vi_vnode;
340};
341
342void
343vfs_vnode_iterator_init(struct mount *mp, struct vnode_iterator **vip)
344{
345 struct vnode *vp;
346
347 vp = vnalloc_marker(mp);
348
349 mutex_enter(&mntvnode_lock);
350 TAILQ_INSERT_HEAD(&mp->mnt_vnodelist, vp, v_mntvnodes);
351 vp->v_usecount = 1;
352 mutex_exit(&mntvnode_lock);
353
354 *vip = (struct vnode_iterator *)vp;
355}
356
357void
358vfs_vnode_iterator_destroy(struct vnode_iterator *vi)
359{
360 struct vnode *mvp = &vi->vi_vnode;
361
362 mutex_enter(&mntvnode_lock);
363 KASSERT(vnis_marker(mvp));
364 if (mvp->v_usecount != 0) {
365 TAILQ_REMOVE(&mvp->v_mount->mnt_vnodelist, mvp, v_mntvnodes);
366 mvp->v_usecount = 0;
367 }
368 mutex_exit(&mntvnode_lock);
369 vnfree_marker(mvp);
370}
371
372struct vnode *
373vfs_vnode_iterator_next(struct vnode_iterator *vi,
374 bool (*f)(void *, struct vnode *), void *cl)
375{
376 struct vnode *mvp = &vi->vi_vnode;
377 struct mount *mp = mvp->v_mount;
378 struct vnode *vp;
379 int error;
380
381 KASSERT(vnis_marker(mvp));
382
383 do {
384 mutex_enter(&mntvnode_lock);
385 vp = TAILQ_NEXT(mvp, v_mntvnodes);
386 TAILQ_REMOVE(&mp->mnt_vnodelist, mvp, v_mntvnodes);
387 mvp->v_usecount = 0;
388again:
389 if (vp == NULL) {
390 mutex_exit(&mntvnode_lock);
391 return NULL;
392 }
393 mutex_enter(vp->v_interlock);
394 if (vnis_marker(vp) ||
395 vdead_check(vp, VDEAD_NOWAIT) ||
396 (f && !(*f)(cl, vp))) {
397 mutex_exit(vp->v_interlock);
398 vp = TAILQ_NEXT(vp, v_mntvnodes);
399 goto again;
400 }
401
402 TAILQ_INSERT_AFTER(&mp->mnt_vnodelist, vp, mvp, v_mntvnodes);
403 mvp->v_usecount = 1;
404 mutex_exit(&mntvnode_lock);
405 error = vget(vp, 0, true /* wait */);
406 KASSERT(error == 0 || error == ENOENT);
407 } while (error != 0);
408
409 return vp;
410}
411
412/*
413 * Move a vnode from one mount queue to another.
414 */
415void
416vfs_insmntque(vnode_t *vp, struct mount *mp)
417{
418 struct mount *omp;
419
420 KASSERT(mp == NULL || (mp->mnt_iflag & IMNT_UNMOUNT) == 0 ||
421 vp->v_tag == VT_VFS);
422
423 mutex_enter(&mntvnode_lock);
424 /*
425 * Delete from old mount point vnode list, if on one.
426 */
427 if ((omp = vp->v_mount) != NULL)
428 TAILQ_REMOVE(&vp->v_mount->mnt_vnodelist, vp, v_mntvnodes);
429 /*
430 * Insert into list of vnodes for the new mount point, if
431 * available. The caller must take a reference on the mount
432 * structure and donate to the vnode.
433 */
434 if ((vp->v_mount = mp) != NULL)
435 TAILQ_INSERT_TAIL(&mp->mnt_vnodelist, vp, v_mntvnodes);
436 mutex_exit(&mntvnode_lock);
437
438 if (omp != NULL) {
439 /* Release reference to old mount. */
440 vfs_destroy(omp);
441 }
442}
443
444/*
445 * Remove any vnodes in the vnode table belonging to mount point mp.
446 *
447 * If FORCECLOSE is not specified, there should not be any active ones,
448 * return error if any are found (nb: this is a user error, not a
449 * system error). If FORCECLOSE is specified, detach any active vnodes
450 * that are found.
451 *
452 * If WRITECLOSE is set, only flush out regular file vnodes open for
453 * writing.
454 *
455 * SKIPSYSTEM causes any vnodes marked VV_SYSTEM to be skipped.
456 */
#ifdef DEBUG
/* Debug knob: when non-zero, vflush() prints every busy vnode it finds. */
int busyprt = 0;
struct ctldebug debug1 = { "busyprt", &busyprt };
#endif
461
/* Selection criteria handed from vflush() to vflush_selector(). */
struct vflush_ctx {
	const struct vnode *skipvp;	/* vnode that is never selected */
	int flags;			/* vflush() flags */
};
466
467static bool
468vflush_selector(void *cl, struct vnode *vp)
469{
470 struct vflush_ctx *c = cl;
471 /*
472 * Skip over a selected vnode.
473 */
474 if (vp == c->skipvp)
475 return false;
476 /*
477 * Skip over a vnodes marked VSYSTEM.
478 */
479 if ((c->flags & SKIPSYSTEM) && (vp->v_vflag & VV_SYSTEM))
480 return false;
481
482 /*
483 * If WRITECLOSE is set, only flush out regular file
484 * vnodes open for writing.
485 */
486 if ((c->flags & WRITECLOSE) && vp->v_type == VREG) {
487 if (vp->v_writecount == 0)
488 return false;
489 }
490 return true;
491}
492
493static vnode_t *
494vflushnext(struct vnode_iterator *marker, void *ctx, int *when)
495{
496 if (hardclock_ticks > *when) {
497 yield();
498 *when = hardclock_ticks + hz / 10;
499 }
500 return vfs_vnode_iterator_next(marker, vflush_selector, ctx);
501}
502
503
504int
505vflush(struct mount *mp, vnode_t *skipvp, int flags)
506{
507 vnode_t *vp;
508 struct vnode_iterator *marker;
509 int error, busy = 0, when = 0;
510 struct vflush_ctx ctx;
511
512 /* First, flush out any vnode references from vrele_list. */
513 vrele_flush();
514
515 vfs_vnode_iterator_init(mp, &marker);
516
517 ctx.skipvp = skipvp;
518 ctx.flags = flags;
519 while ((vp = vflushnext(marker, &ctx, &when)) != NULL) {
520 /*
521 * First try to recycle the vnode.
522 */
523 if (vrecycle(vp))
524 continue;
525 /*
526 * If FORCECLOSE is set, forcibly close the vnode.
527 */
528 if (flags & FORCECLOSE) {
529 vgone(vp);
530 continue;
531 }
532#ifdef DEBUG
533 if (busyprt)
534 vprint("vflush: busy vnode", vp);
535#endif
536 vrele(vp);
537 busy++;
538 }
539 vfs_vnode_iterator_destroy(marker);
540 if (busy)
541 return (EBUSY);
542
543 /* Wait for all vnodes to be reclaimed. */
544 for (;;) {
545 mutex_enter(&mntvnode_lock);
546 TAILQ_FOREACH(vp, &mp->mnt_vnodelist, v_mntvnodes) {
547 if (vp == skipvp)
548 continue;
549 if ((flags & SKIPSYSTEM) && (vp->v_vflag & VV_SYSTEM))
550 continue;
551 break;
552 }
553 if (vp != NULL) {
554 mutex_enter(vp->v_interlock);
555 mutex_exit(&mntvnode_lock);
556 error = vget(vp, 0, true /* wait */);
557 if (error == ENOENT)
558 continue;
559 else if (error == 0)
560 vrele(vp);
561 return EBUSY;
562 } else {
563 mutex_exit(&mntvnode_lock);
564 return 0;
565 }
566 }
567}
568
569/*
570 * Mount a file system.
571 */
572
573/*
574 * Scan all active processes to see if any of them have a current or root
575 * directory onto which the new filesystem has just been mounted. If so,
576 * replace them with the new mount point.
577 */
578static void
579mount_checkdirs(vnode_t *olddp)
580{
581 vnode_t *newdp, *rele1, *rele2;
582 struct cwdinfo *cwdi;
583 struct proc *p;
584 bool retry;
585
586 if (olddp->v_usecount == 1) {
587 return;
588 }
589 if (VFS_ROOT(olddp->v_mountedhere, &newdp))
590 panic("mount: lost mount");
591
592 do {
593 retry = false;
594 mutex_enter(proc_lock);
595 PROCLIST_FOREACH(p, &allproc) {
596 if ((cwdi = p->p_cwdi) == NULL)
597 continue;
598 /*
599 * Cannot change to the old directory any more,
600 * so even if we see a stale value it is not a
601 * problem.
602 */
603 if (cwdi->cwdi_cdir != olddp &&
604 cwdi->cwdi_rdir != olddp)
605 continue;
606 retry = true;
607 rele1 = NULL;
608 rele2 = NULL;
609 atomic_inc_uint(&cwdi->cwdi_refcnt);
610 mutex_exit(proc_lock);
611 rw_enter(&cwdi->cwdi_lock, RW_WRITER);
612 if (cwdi->cwdi_cdir == olddp) {
613 rele1 = cwdi->cwdi_cdir;
614 vref(newdp);
615 cwdi->cwdi_cdir = newdp;
616 }
617 if (cwdi->cwdi_rdir == olddp) {
618 rele2 = cwdi->cwdi_rdir;
619 vref(newdp);
620 cwdi->cwdi_rdir = newdp;
621 }
622 rw_exit(&cwdi->cwdi_lock);
623 cwdfree(cwdi);
624 if (rele1 != NULL)
625 vrele(rele1);
626 if (rele2 != NULL)
627 vrele(rele2);
628 mutex_enter(proc_lock);
629 break;
630 }
631 mutex_exit(proc_lock);
632 } while (retry);
633
634 if (rootvnode == olddp) {
635 vrele(rootvnode);
636 vref(newdp);
637 rootvnode = newdp;
638 }
639 vput(newdp);
640}
641
642/*
643 * Start extended attributes
644 */
645static int
646start_extattr(struct mount *mp)
647{
648 int error;
649
650 error = VFS_EXTATTRCTL(mp, EXTATTR_CMD_START, NULL, 0, NULL);
651 if (error)
652 printf("%s: failed to start extattr: error = %d\n",
653 mp->mnt_stat.f_mntonname, error);
654
655 return error;
656}
657
658int
659mount_domount(struct lwp *l, vnode_t **vpp, struct vfsops *vfsops,
660 const char *path, int flags, void *data, size_t *data_len)
661{
662 vnode_t *vp = *vpp;
663 struct mount *mp;
664 struct pathbuf *pb;
665 struct nameidata nd;
666 int error;
667
668 error = kauth_authorize_system(l->l_cred, KAUTH_SYSTEM_MOUNT,
669 KAUTH_REQ_SYSTEM_MOUNT_NEW, vp, KAUTH_ARG(flags), data);
670 if (error) {
671 vfs_delref(vfsops);
672 return error;
673 }
674
675 /* Cannot make a non-dir a mount-point (from here anyway). */
676 if (vp->v_type != VDIR) {
677 vfs_delref(vfsops);
678 return ENOTDIR;
679 }
680
681 if (flags & MNT_EXPORTED) {
682 vfs_delref(vfsops);
683 return EINVAL;
684 }
685
686 if ((mp = vfs_mountalloc(vfsops, vp)) == NULL) {
687 vfs_delref(vfsops);
688 return ENOMEM;
689 }
690
691 mp->mnt_stat.f_owner = kauth_cred_geteuid(l->l_cred);
692
693 /*
694 * The underlying file system may refuse the mount for
695 * various reasons. Allow the user to force it to happen.
696 *
697 * Set the mount level flags.
698 */
699 mp->mnt_flag = flags & (MNT_BASIC_FLAGS | MNT_FORCE | MNT_IGNORE);
700
701 mutex_enter(&mp->mnt_updating);
702 error = VFS_MOUNT(mp, path, data, data_len);
703 mp->mnt_flag &= ~MNT_OP_FLAGS;
704
705 if (error != 0)
706 goto err_unmounted;
707
708 /*
709 * Validate and prepare the mount point.
710 */
711 error = pathbuf_copyin(path, &pb);
712 if (error != 0) {
713 goto err_mounted;
714 }
715 NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF | TRYEMULROOT, pb);
716 error = namei(&nd);
717 pathbuf_destroy(pb);
718 if (error != 0) {
719 goto err_mounted;
720 }
721 if (nd.ni_vp != vp) {
722 vput(nd.ni_vp);
723 error = EINVAL;
724 goto err_mounted;
725 }
726 if (vp->v_mountedhere != NULL) {
727 vput(nd.ni_vp);
728 error = EBUSY;
729 goto err_mounted;
730 }
731 error = vinvalbuf(vp, V_SAVE, l->l_cred, l, 0, 0);
732 if (error != 0) {
733 vput(nd.ni_vp);
734 goto err_mounted;
735 }
736
737 /*
738 * Put the new filesystem on the mount list after root.
739 */
740 cache_purge(vp);
741 mp->mnt_iflag &= ~IMNT_WANTRDWR;
742
743 mutex_enter(&mountlist_lock);
744 TAILQ_INSERT_TAIL(&mountlist, mp, mnt_list);
745 mutex_exit(&mountlist_lock);
746 if ((mp->mnt_flag & (MNT_RDONLY | MNT_ASYNC)) == 0)
747 vfs_syncer_add_to_worklist(mp);
748 vp->v_mountedhere = mp;
749 vput(nd.ni_vp);
750
751 mount_checkdirs(vp);
752 mutex_exit(&mp->mnt_updating);
753
754 /* Hold an additional reference to the mount across VFS_START(). */
755 vfs_unbusy(mp, true, NULL);
756 (void) VFS_STATVFS(mp, &mp->mnt_stat);
757 error = VFS_START(mp, 0);
758 if (error) {
759 vrele(vp);
760 } else if (flags & MNT_EXTATTR) {
761 (void)start_extattr(mp);
762 }
763 /* Drop reference held for VFS_START(). */
764 vfs_destroy(mp);
765 *vpp = NULL;
766 return error;
767
768err_mounted:
769 if (VFS_UNMOUNT(mp, MNT_FORCE) != 0)
770 panic("Unmounting fresh file system failed");
771
772err_unmounted:
773 vp->v_mountedhere = NULL;
774 mutex_exit(&mp->mnt_updating);
775 vfs_unbusy(mp, false, NULL);
776 vfs_destroy(mp);
777
778 return error;
779}
780
781/*
782 * Do the actual file system unmount. File system is assumed to have
783 * been locked by the caller.
784 *
785 * => Caller hold reference to the mount, explicitly for dounmount().
786 */
787int
788dounmount(struct mount *mp, int flags, struct lwp *l)
789{
790 vnode_t *coveredvp;
791 int error, async, used_syncer, used_extattr;
792
793#if NVERIEXEC > 0
794 error = veriexec_unmountchk(mp);
795 if (error)
796 return (error);
797#endif /* NVERIEXEC > 0 */
798
799 /*
800 * XXX Freeze syncer. Must do this before locking the
801 * mount point. See dounmount() for details.
802 */
803 mutex_enter(&syncer_mutex);
804
805 /*
806 * Abort unmount attempt when the filesystem is in use
807 */
808 mutex_enter(&mp->mnt_unmounting);
809 if (mp->mnt_busynest != 0) {
810 mutex_exit(&mp->mnt_unmounting);
811 mutex_exit(&syncer_mutex);
812 return EBUSY;
813 }
814
815 /*
816 * Abort unmount attempt when the filesystem is not mounted
817 */
818 if ((mp->mnt_iflag & IMNT_GONE) != 0) {
819 mutex_exit(&mp->mnt_unmounting);
820 mutex_exit(&syncer_mutex);
821 return ENOENT;
822 }
823
824 used_syncer = (mp->mnt_iflag & IMNT_ONWORKLIST) != 0;
825 used_extattr = mp->mnt_flag & MNT_EXTATTR;
826
827 /*
828 * XXX Syncer must be frozen when we get here. This should really
829 * be done on a per-mountpoint basis, but the syncer doesn't work
830 * like that.
831 *
832 * The caller of dounmount() must acquire syncer_mutex because
833 * the syncer itself acquires locks in syncer_mutex -> vfs_busy
834 * order, and we must preserve that order to avoid deadlock.
835 *
836 * So, if the file system did not use the syncer, now is
837 * the time to release the syncer_mutex.
838 */
839 if (used_syncer == 0) {
840 mutex_exit(&syncer_mutex);
841 }
842 mp->mnt_iflag |= IMNT_UNMOUNT;
843 mutex_enter(&mp->mnt_updating);
844 async = mp->mnt_flag & MNT_ASYNC;
845 mp->mnt_flag &= ~MNT_ASYNC;
846 cache_purgevfs(mp); /* remove cache entries for this file sys */
847 if (used_syncer)
848 vfs_syncer_remove_from_worklist(mp);
849 error = 0;
850 if (((mp->mnt_flag & MNT_RDONLY) == 0) && ((flags & MNT_FORCE) == 0)) {
851 error = VFS_SYNC(mp, MNT_WAIT, l->l_cred);
852 }
853 if (error == 0 || (flags & MNT_FORCE)) {
854 error = VFS_UNMOUNT(mp, flags);
855 }
856 if (error) {
857 mp->mnt_iflag &= ~IMNT_UNMOUNT;
858 mutex_exit(&mp->mnt_unmounting);
859 if ((mp->mnt_flag & (MNT_RDONLY | MNT_ASYNC)) == 0)
860 vfs_syncer_add_to_worklist(mp);
861 mp->mnt_flag |= async;
862 mutex_exit(&mp->mnt_updating);
863 if (used_syncer)
864 mutex_exit(&syncer_mutex);
865 if (used_extattr) {
866 if (start_extattr(mp) != 0)
867 mp->mnt_flag &= ~MNT_EXTATTR;
868 else
869 mp->mnt_flag |= MNT_EXTATTR;
870 }
871 return (error);
872 }
873 mutex_exit(&mp->mnt_updating);
874
875 /*
876 * release mnt_umounting lock here, because other code calls
877 * vfs_busy() while holding the mountlist_lock.
878 *
879 * mark filesystem as gone to prevent further umounts
880 * after mnt_umounting lock is gone, this also prevents
881 * vfs_busy() from succeeding.
882 */
883 mp->mnt_iflag |= IMNT_GONE;
884 mutex_exit(&mp->mnt_unmounting);
885
886 if ((coveredvp = mp->mnt_vnodecovered) != NULLVP) {
887 vn_lock(coveredvp, LK_EXCLUSIVE | LK_RETRY);
888 coveredvp->v_mountedhere = NULL;
889 VOP_UNLOCK(coveredvp);
890 }
891 mutex_enter(&mountlist_lock);
892 TAILQ_REMOVE(&mountlist, mp, mnt_list);
893 mutex_exit(&mountlist_lock);
894 if (TAILQ_FIRST(&mp->mnt_vnodelist) != NULL)
895 panic("unmount: dangling vnode");
896 if (used_syncer)
897 mutex_exit(&syncer_mutex);
898 vfs_hooks_unmount(mp);
899
900 vfs_destroy(mp); /* reference from mount() */
901 if (coveredvp != NULLVP) {
902 vrele(coveredvp);
903 }
904 return (0);
905}
906
907/*
908 * Unmount all file systems.
909 * We traverse the list in reverse order under the assumption that doing so
910 * will avoid needing to worry about dependencies.
911 */
912bool
913vfs_unmountall(struct lwp *l)
914{
915
916 printf("unmounting file systems...\n");
917 return vfs_unmountall1(l, true, true);
918}
919
920static void
921vfs_unmount_print(struct mount *mp, const char *pfx)
922{
923
924 aprint_verbose("%sunmounted %s on %s type %s\n", pfx,
925 mp->mnt_stat.f_mntfromname, mp->mnt_stat.f_mntonname,
926 mp->mnt_stat.f_fstypename);
927}
928
929bool
930vfs_unmount_forceone(struct lwp *l)
931{
932 struct mount *mp, *nmp;
933 int error;
934
935 nmp = NULL;
936
937 TAILQ_FOREACH_REVERSE(mp, &mountlist, mntlist, mnt_list) {
938 if (nmp == NULL || mp->mnt_gen > nmp->mnt_gen) {
939 nmp = mp;
940 }
941 }
942 if (nmp == NULL) {
943 return false;
944 }
945
946#ifdef DEBUG
947 printf("forcefully unmounting %s (%s)...\n",
948 nmp->mnt_stat.f_mntonname, nmp->mnt_stat.f_mntfromname);
949#endif
950 atomic_inc_uint(&nmp->mnt_refcnt);
951 if ((error = dounmount(nmp, MNT_FORCE, l)) == 0) {
952 vfs_unmount_print(nmp, "forcefully ");
953 return true;
954 } else {
955 vfs_destroy(nmp);
956 }
957
958#ifdef DEBUG
959 printf("forceful unmount of %s failed with error %d\n",
960 nmp->mnt_stat.f_mntonname, error);
961#endif
962
963 return false;
964}
965
966bool
967vfs_unmountall1(struct lwp *l, bool force, bool verbose)
968{
969 struct mount *mp, *nmp;
970 bool any_error = false, progress = false;
971 int error;
972
973 TAILQ_FOREACH_REVERSE_SAFE(mp, &mountlist, mntlist, mnt_list, nmp) {
974#ifdef DEBUG
975 printf("unmounting %p %s (%s)...\n",
976 (void *)mp, mp->mnt_stat.f_mntonname,
977 mp->mnt_stat.f_mntfromname);
978#endif
979 atomic_inc_uint(&mp->mnt_refcnt);
980 if ((error = dounmount(mp, force ? MNT_FORCE : 0, l)) == 0) {
981 vfs_unmount_print(mp, "");
982 progress = true;
983 } else {
984 vfs_destroy(mp);
985 if (verbose) {
986 printf("unmount of %s failed with error %d\n",
987 mp->mnt_stat.f_mntonname, error);
988 }
989 any_error = true;
990 }
991 }
992 if (verbose) {
993 printf("unmounting done\n");
994 }
995 if (any_error && verbose) {
996 printf("WARNING: some file systems would not unmount\n");
997 }
998 return progress;
999}
1000
1001void
1002vfs_sync_all(struct lwp *l)
1003{
1004 printf("syncing disks... ");
1005
1006 /* remove user processes from run queue */
1007 suspendsched();
1008 (void)spl0();
1009
1010 /* avoid coming back this way again if we panic. */
1011 doing_shutdown = 1;
1012
1013 do_sys_sync(l);
1014
1015 /* Wait for sync to finish. */
1016 if (buf_syncwait() != 0) {
1017#if defined(DDB) && defined(DEBUG_HALT_BUSY)
1018 Debugger();
1019#endif
1020 printf("giving up\n");
1021 return;
1022 } else
1023 printf("done\n");
1024}
1025
1026/*
1027 * Sync and unmount file systems before shutting down.
1028 */
1029void
1030vfs_shutdown(void)
1031{
1032 lwp_t *l = curlwp;
1033
1034 vfs_sync_all(l);
1035
1036 /*
1037 * If we have paniced - do not make the situation potentially
1038 * worse by unmounting the file systems.
1039 */
1040 if (panicstr != NULL) {
1041 return;
1042 }
1043
1044 /* Unmount file systems. */
1045 vfs_unmountall(l);
1046}
1047
1048/*
1049 * Print a list of supported file system types (used by vfs_mountroot)
1050 */
1051static void
1052vfs_print_fstypes(void)
1053{
1054 struct vfsops *v;
1055 int cnt = 0;
1056
1057 mutex_enter(&vfs_list_lock);
1058 LIST_FOREACH(v, &vfs_list, vfs_list)
1059 ++cnt;
1060 mutex_exit(&vfs_list_lock);
1061
1062 if (cnt == 0) {
1063 printf("WARNING: No file system modules have been loaded.\n");
1064 return;
1065 }
1066
1067 printf("Supported file systems:");
1068 mutex_enter(&vfs_list_lock);
1069 LIST_FOREACH(v, &vfs_list, vfs_list) {
1070 printf(" %s", v->vfs_name);
1071 }
1072 mutex_exit(&vfs_list_lock);
1073 printf("\n");
1074}
1075
1076/*
1077 * Mount the root file system. If the operator didn't specify a
1078 * file system to use, try all possible file systems until one
1079 * succeeds.
1080 */
1081int
1082vfs_mountroot(void)
1083{
1084 struct vfsops *v;
1085 int error = ENODEV;
1086
1087 if (root_device == NULL)
1088 panic("vfs_mountroot: root device unknown");
1089
1090 switch (device_class(root_device)) {
1091 case DV_IFNET:
1092 if (rootdev != NODEV)
1093 panic("vfs_mountroot: rootdev set for DV_IFNET "
1094 "(0x%llx -> %llu,%llu)",
1095 (unsigned long long)rootdev,
1096 (unsigned long long)major(rootdev),
1097 (unsigned long long)minor(rootdev));
1098 break;
1099
1100 case DV_DISK:
1101 if (rootdev == NODEV)
1102 panic("vfs_mountroot: rootdev not set for DV_DISK");
1103 if (bdevvp(rootdev, &rootvp))
1104 panic("vfs_mountroot: can't get vnode for rootdev");
1105 error = VOP_OPEN(rootvp, FREAD, FSCRED);
1106 if (error) {
1107 printf("vfs_mountroot: can't open root device\n");
1108 return (error);
1109 }
1110 break;
1111
1112 case DV_VIRTUAL:
1113 break;
1114
1115 default:
1116 printf("%s: inappropriate for root file system\n",
1117 device_xname(root_device));
1118 return (ENODEV);
1119 }
1120
1121 /*
1122 * If user specified a root fs type, use it. Make sure the
1123 * specified type exists and has a mount_root()
1124 */
1125 if (strcmp(rootfstype, ROOT_FSTYPE_ANY) != 0) {
1126 v = vfs_getopsbyname(rootfstype);
1127 error = EFTYPE;
1128 if (v != NULL) {
1129 if (v->vfs_mountroot != NULL) {
1130 error = (v->vfs_mountroot)();
1131 }
1132 v->vfs_refcount--;
1133 }
1134 goto done;
1135 }
1136
1137 /*
1138 * Try each file system currently configured into the kernel.
1139 */
1140 mutex_enter(&vfs_list_lock);
1141 LIST_FOREACH(v, &vfs_list, vfs_list) {
1142 if (v->vfs_mountroot == NULL)
1143 continue;
1144#ifdef DEBUG
1145 aprint_normal("mountroot: trying %s...\n", v->vfs_name);
1146#endif
1147 v->vfs_refcount++;
1148 mutex_exit(&vfs_list_lock);
1149 error = (*v->vfs_mountroot)();
1150 mutex_enter(&vfs_list_lock);
1151 v->vfs_refcount--;
1152 if (!error) {
1153 aprint_normal("root file system type: %s\n",
1154 v->vfs_name);
1155 break;
1156 }
1157 }
1158 mutex_exit(&vfs_list_lock);
1159
1160 if (v == NULL) {
1161 vfs_print_fstypes();
1162 printf("no file system for %s", device_xname(root_device));
1163 if (device_class(root_device) == DV_DISK)
1164 printf(" (dev 0x%llx)", (unsigned long long)rootdev);
1165 printf("\n");
1166 error = EFTYPE;
1167 }
1168
1169done:
1170 if (error && device_class(root_device) == DV_DISK) {
1171 VOP_CLOSE(rootvp, FREAD, FSCRED);
1172 vrele(rootvp);
1173 }
1174 if (error == 0) {
1175 struct mount *mp;
1176 extern struct cwdinfo cwdi0;
1177
1178 mp = TAILQ_FIRST(&mountlist);
1179 mp->mnt_flag |= MNT_ROOTFS;
1180 mp->mnt_op->vfs_refcount++;
1181
1182 /*
1183 * Get the vnode for '/'. Set cwdi0.cwdi_cdir to
1184 * reference it.
1185 */
1186 error = VFS_ROOT(mp, &rootvnode);
1187 if (error)
1188 panic("cannot find root vnode, error=%d", error);
1189 cwdi0.cwdi_cdir = rootvnode;
1190 vref(cwdi0.cwdi_cdir);
1191 VOP_UNLOCK(rootvnode);
1192 cwdi0.cwdi_rdir = NULL;
1193
1194 /*
1195 * Now that root is mounted, we can fixup initproc's CWD
1196 * info. All other processes are kthreads, which merely
1197 * share proc0's CWD info.
1198 */
1199 initproc->p_cwdi->cwdi_cdir = rootvnode;
1200 vref(initproc->p_cwdi->cwdi_cdir);
1201 initproc->p_cwdi->cwdi_rdir = NULL;
1202 /*
1203 * Enable loading of modules from the filesystem
1204 */
1205 module_load_vfs_init();
1206
1207 }
1208 return (error);
1209}
1210
1211/*
1212 * mount_specific_key_create --
1213 * Create a key for subsystem mount-specific data.
1214 */
1215int
1216mount_specific_key_create(specificdata_key_t *keyp, specificdata_dtor_t dtor)
1217{
1218
1219 return specificdata_key_create(mount_specificdata_domain, keyp, dtor);
1220}
1221
/*
 * mount_specific_key_delete --
 *	Delete a key for subsystem mount-specific data.
 *
 *	Thin wrapper: forwards `key' to specificdata_key_delete() for
 *	mount_specificdata_domain.  Nothing is returned.
 */
void
mount_specific_key_delete(specificdata_key_t key)
{

	specificdata_key_delete(mount_specificdata_domain, key);
}
1232
1233/*
1234 * mount_initspecific --
1235 * Initialize a mount's specificdata container.
1236 */
1237void
1238mount_initspecific(struct mount *mp)
1239{
1240 int error __diagused;
1241
1242 error = specificdata_init(mount_specificdata_domain,
1243 &mp->mnt_specdataref);
1244 KASSERT(error == 0);
1245}
1246
/*
 * mount_finispecific --
 *	Finalize a mount's specificdata container.
 *
 *	Thin wrapper: hands mp->mnt_specdataref to specificdata_fini()
 *	for mount_specificdata_domain.  Nothing is returned.
 */
void
mount_finispecific(struct mount *mp)
{

	specificdata_fini(mount_specificdata_domain, &mp->mnt_specdataref);
}
1257
1258/*
1259 * mount_getspecific --
1260 * Return mount-specific data corresponding to the specified key.
1261 */
1262void *
1263mount_getspecific(struct mount *mp, specificdata_key_t key)
1264{
1265
1266 return specificdata_getspecific(mount_specificdata_domain,
1267 &mp->mnt_specdataref, key);
1268}
1269
/*
 * mount_setspecific --
 *	Set mount-specific data corresponding to the specified key.
 *
 *	Thin wrapper: passes mp->mnt_specdataref, `key' and `data' to
 *	specificdata_setspecific() for mount_specificdata_domain.
 *	Nothing is returned.
 */
void
mount_setspecific(struct mount *mp, specificdata_key_t key, void *data)
{

	specificdata_setspecific(mount_specificdata_domain,
	    &mp->mnt_specdataref, key, data);
}
1281
1282/*
1283 * Check to see if a filesystem is mounted on a block device.
1284 */
1285int
1286vfs_mountedon(vnode_t *vp)
1287{
1288 vnode_t *vq;
1289 int error = 0;
1290
1291 if (vp->v_type != VBLK)
1292 return ENOTBLK;
1293 if (spec_node_getmountedfs(vp) != NULL)
1294 return EBUSY;
1295 if (spec_node_lookup_by_dev(vp->v_type, vp->v_rdev, &vq) == 0) {
1296 if (spec_node_getmountedfs(vq) != NULL)
1297 error = EBUSY;
1298 vrele(vq);
1299 }
1300
1301 return error;
1302}
1303
1304/*
1305 * Check if a device pointed to by vp is mounted.
1306 *
1307 * Returns:
1308 * EINVAL if it's not a disk
1309 * EBUSY if it's a disk and mounted
1310 * 0 if it's a disk and not mounted
1311 */
1312int
1313rawdev_mounted(vnode_t *vp, vnode_t **bvpp)
1314{
1315 vnode_t *bvp;
1316 dev_t dev;
1317 int d_type;
1318
1319 bvp = NULL;
1320 d_type = D_OTHER;
1321
1322 if (iskmemvp(vp))
1323 return EINVAL;
1324
1325 switch (vp->v_type) {
1326 case VCHR: {
1327 const struct cdevsw *cdev;
1328
1329 dev = vp->v_rdev;
1330 cdev = cdevsw_lookup(dev);
1331 if (cdev != NULL) {
1332 dev_t blkdev;
1333
1334 blkdev = devsw_chr2blk(dev);
1335 if (blkdev != NODEV) {
1336 if (vfinddev(blkdev, VBLK, &bvp) != 0) {
1337 d_type = (cdev->d_flag & D_TYPEMASK);
1338 /* XXX: what if bvp disappears? */
1339 vrele(bvp);
1340 }
1341 }
1342 }
1343
1344 break;
1345 }
1346
1347 case VBLK: {
1348 const struct bdevsw *bdev;
1349
1350 dev = vp->v_rdev;
1351 bdev = bdevsw_lookup(dev);
1352 if (bdev != NULL)
1353 d_type = (bdev->d_flag & D_TYPEMASK);
1354
1355 bvp = vp;
1356
1357 break;
1358 }
1359
1360 default:
1361 break;
1362 }
1363
1364 if (d_type != D_DISK)
1365 return EINVAL;
1366
1367 if (bvpp != NULL)
1368 *bvpp = bvp;
1369
1370 /*
1371 * XXX: This is bogus. We should be failing the request
1372 * XXX: not only if this specific slice is mounted, but
1373 * XXX: if it's on a disk with any other mounted slice.
1374 */
1375 if (vfs_mountedon(bvp))
1376 return EBUSY;
1377
1378 return 0;
1379}
1380
/*
 * makefstype --
 *	Make a 'unique' number from a mount type name.
 *
 *	`type' must be a NUL-terminated string.  Starting from zero, the
 *	accumulator is shifted left by two bits and XORed with each
 *	character in turn; the empty string therefore yields 0.  Bits
 *	shifted past the width of a long are discarded, so different
 *	names can map to the same number (hence the quotes around
 *	'unique').
 *
 *	The folding is done in unsigned long: left-shifting a signed long
 *	once set bits reach the sign bit is undefined behaviour, which a
 *	sufficiently long name would trigger.  For every input on which
 *	the signed computation was well defined the result is unchanged.
 */
long
makefstype(const char *type)
{
	unsigned long acc = 0;

	for (; *type != '\0'; type++) {
		acc <<= 2;
		/*
		 * Widen through long first so the character extends the
		 * same way it would in a plain `long ^= char'.
		 */
		acc ^= (unsigned long)(long)*type;
	}

	/* Same bit pattern, handed back in the historical return type. */
	return (long)acc;
}
1395
/*
 * mountlist_append --
 *	Insert mp at the tail of the global mountlist (linked through
 *	its mnt_list entry).  mountlist_lock is taken around the
 *	insertion and released before returning.
 */
void
mountlist_append(struct mount *mp)
{
	mutex_enter(&mountlist_lock);
	TAILQ_INSERT_TAIL(&mountlist, mp, mnt_list);
	mutex_exit(&mountlist_lock);
}
1403