1/* $NetBSD: vfs_vnode.c,v 1.59 2016/11/03 11:04:21 hannken Exp $ */
2
3/*-
4 * Copyright (c) 1997-2011 The NetBSD Foundation, Inc.
5 * All rights reserved.
6 *
7 * This code is derived from software contributed to The NetBSD Foundation
8 * by Jason R. Thorpe of the Numerical Aerospace Simulation Facility,
9 * NASA Ames Research Center, by Charles M. Hannum, and by Andrew Doran.
10 *
11 * Redistribution and use in source and binary forms, with or without
12 * modification, are permitted provided that the following conditions
13 * are met:
14 * 1. Redistributions of source code must retain the above copyright
15 * notice, this list of conditions and the following disclaimer.
16 * 2. Redistributions in binary form must reproduce the above copyright
17 * notice, this list of conditions and the following disclaimer in the
18 * documentation and/or other materials provided with the distribution.
19 *
20 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
21 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
22 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
23 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
24 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
25 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
26 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
27 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
28 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
29 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
30 * POSSIBILITY OF SUCH DAMAGE.
31 */
32
33/*
34 * Copyright (c) 1989, 1993
35 * The Regents of the University of California. All rights reserved.
36 * (c) UNIX System Laboratories, Inc.
37 * All or some portions of this file are derived from material licensed
38 * to the University of California by American Telephone and Telegraph
39 * Co. or Unix System Laboratories, Inc. and are reproduced herein with
40 * the permission of UNIX System Laboratories, Inc.
41 *
42 * Redistribution and use in source and binary forms, with or without
43 * modification, are permitted provided that the following conditions
44 * are met:
45 * 1. Redistributions of source code must retain the above copyright
46 * notice, this list of conditions and the following disclaimer.
47 * 2. Redistributions in binary form must reproduce the above copyright
48 * notice, this list of conditions and the following disclaimer in the
49 * documentation and/or other materials provided with the distribution.
50 * 3. Neither the name of the University nor the names of its contributors
51 * may be used to endorse or promote products derived from this software
52 * without specific prior written permission.
53 *
54 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
55 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
56 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
57 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
58 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
59 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
60 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
61 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
62 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
63 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
64 * SUCH DAMAGE.
65 *
66 * @(#)vfs_subr.c 8.13 (Berkeley) 4/18/94
67 */
68
69/*
70 * The vnode cache subsystem.
71 *
72 * Life-cycle
73 *
74 * Normally, there are two points where new vnodes are created:
75 * VOP_CREATE(9) and VOP_LOOKUP(9). The life-cycle of a vnode
76 * starts in one of the following ways:
77 *
78 * - Allocation, via vcache_get(9) or vcache_new(9).
79 * - Reclamation of inactive vnode, via vget(9).
80 *
81 * Recycle from a free list, via getnewvnode(9) -> getcleanvnode(9)
82 * was another, traditional way. Currently, only the draining thread
83 * recycles the vnodes. This behaviour might be revisited.
84 *
 * The life-cycle ends when the last reference is dropped, usually
 * in VOP_REMOVE(9).  In such a case, VOP_INACTIVE(9) is called to inform
 * the file system that the vnode is inactive.  Via this call, the file
 * system indicates whether the vnode can be recycled (usually, it checks
 * its own references, e.g. count of links, whether the file was removed).
90 *
91 * Depending on indication, vnode can be put into a free list (cache),
92 * or cleaned via vcache_reclaim, which calls VOP_RECLAIM(9) to
93 * disassociate underlying file system from the vnode, and finally
94 * destroyed.
95 *
96 * Vnode state
97 *
98 * Vnode is always in one of six states:
99 * - MARKER This is a marker vnode to help list traversal. It
100 * will never change its state.
101 * - LOADING Vnode is associating underlying file system and not
102 * yet ready to use.
103 * - ACTIVE Vnode has associated underlying file system and is
104 * ready to use.
105 * - BLOCKED Vnode is active but cannot get new references.
106 * - RECLAIMING Vnode is disassociating from the underlying file
107 * system.
108 * - RECLAIMED Vnode has disassociated from underlying file system
109 * and is dead.
110 *
111 * Valid state changes are:
112 * LOADING -> ACTIVE
113 * Vnode has been initialised in vcache_get() or
114 * vcache_new() and is ready to use.
115 * ACTIVE -> RECLAIMING
116 * Vnode starts disassociation from underlying file
117 * system in vcache_reclaim().
118 * RECLAIMING -> RECLAIMED
119 * Vnode finished disassociation from underlying file
120 * system in vcache_reclaim().
121 * ACTIVE -> BLOCKED
122 * Either vcache_rekey*() is changing the vnode key or
123 * vrelel() is about to call VOP_INACTIVE().
124 * BLOCKED -> ACTIVE
125 * The block condition is over.
126 * LOADING -> RECLAIMED
127 * Either vcache_get() or vcache_new() failed to
128 * associate the underlying file system or vcache_rekey*()
129 * drops a vnode used as placeholder.
130 *
131 * Of these states LOADING, BLOCKED and RECLAIMING are intermediate
132 * and it is possible to wait for state change.
133 *
134 * State is protected with v_interlock with one exception:
135 * to change from LOADING both v_interlock and vcache.lock must be held
136 * so it is possible to check "state == LOADING" without holding
137 * v_interlock. See vcache_get() for details.
138 *
139 * Reference counting
140 *
141 * Vnode is considered active, if reference count (vnode_t::v_usecount)
142 * is non-zero. It is maintained using: vref(9) and vrele(9), as well
143 * as vput(9), routines. Common points holding references are e.g.
144 * file openings, current working directory, mount points, etc.
145 *
146 * Note on v_usecount and its locking
147 *
 * At nearly all points where it is known that v_usecount could be zero,
 * the vnode_t::v_interlock will be held.  To change v_usecount away
 * from zero, the interlock must be held.  To change from a non-zero
 * value to zero, again the interlock must be held.
152 *
153 * Changing the usecount from a non-zero value to a non-zero value can
154 * safely be done using atomic operations, without the interlock held.
155 *
156 */
157
158#include <sys/cdefs.h>
159__KERNEL_RCSID(0, "$NetBSD: vfs_vnode.c,v 1.59 2016/11/03 11:04:21 hannken Exp $");
160
161#include <sys/param.h>
162#include <sys/kernel.h>
163
164#include <sys/atomic.h>
165#include <sys/buf.h>
166#include <sys/conf.h>
167#include <sys/device.h>
168#include <sys/hash.h>
169#include <sys/kauth.h>
170#include <sys/kmem.h>
171#include <sys/kthread.h>
172#include <sys/module.h>
173#include <sys/mount.h>
174#include <sys/namei.h>
175#include <sys/syscallargs.h>
176#include <sys/sysctl.h>
177#include <sys/systm.h>
178#include <sys/vnode_impl.h>
179#include <sys/wapbl.h>
180#include <sys/fstrans.h>
181
182#include <uvm/uvm.h>
183#include <uvm/uvm_readahead.h>
184
/* Flags to vrelel. */
#define	VRELEL_ASYNC_RELE	0x0001	/* Always defer to vrele thread. */

/*
 * Number of vnodes allocated by vcache_alloc() and not yet freed by
 * vcache_free(); both update it with vnode_free_list_lock held.
 */
u_int numvnodes		__cacheline_aligned;

/*
 * There are two free lists: one is for vnodes which have no buffer/page
 * references and one for those which do (i.e. v_holdcnt is non-zero).
 * Vnode recycling mechanism first attempts to look into the former list.
 *
 * vdrain_cv is what vdrain_thread() sleeps on; vcache_alloc() signals it
 * once numvnodes exceeds desiredvnodes by more than ten percent.
 */
static kmutex_t		vnode_free_list_lock	__cacheline_aligned;
static vnodelst_t	vnode_free_list		__cacheline_aligned;
static vnodelst_t	vnode_hold_list		__cacheline_aligned;
static kcondvar_t	vdrain_cv		__cacheline_aligned;

/*
 * Deferred release: vrelel() queues vnodes on vrele_list and
 * vrele_thread() (whose lwp is recorded in vrele_lwp) works them off.
 * vrele_pending counts queued vnodes, vrele_gen is bumped by the thread
 * each time it finds the list empty.  The list and both counters are
 * accessed with vrele_lock held.
 */
static vnodelst_t	vrele_list		__cacheline_aligned;
static kmutex_t		vrele_lock		__cacheline_aligned;
static kcondvar_t	vrele_cv		__cacheline_aligned;
static lwp_t *		vrele_lwp		__cacheline_aligned;
static int		vrele_pending		__cacheline_aligned;
static int		vrele_gen		__cacheline_aligned;

/*
 * The vnode cache: a hash table of vnode_impl nodes indexed by
 * vcache_hash() & hashmask, the pool the nodes are allocated from,
 * and the lock/condvar pair used around hash access and state changes
 * away from VS_LOADING.
 */
SLIST_HEAD(hashhead, vnode_impl);
static struct {
	kmutex_t	lock;
	kcondvar_t	cv;
	u_long		hashmask;
	struct hashhead	*hashtab;
	pool_cache_t	pool;
}			vcache			__cacheline_aligned;

/* Forward declarations of file-local routines. */
static int		cleanvnode(void);
static vnode_impl_t	*vcache_alloc(void);
static void		vcache_free(vnode_impl_t *);
static void		vcache_init(void);
static void		vcache_reinit(void);
static void		vcache_reclaim(vnode_t *);
static void		vrelel(vnode_t *, int);
static void		vdrain_thread(void *);
static void		vrele_thread(void *);
static void		vnpanic(vnode_t *, const char *, ...)
    __printflike(2, 3);

/* Routines having to do with the management of the vnode table. */
extern struct mount	*dead_rootmount;
extern int		(**dead_vnodeop_p)(void *);
extern struct vfsops	dead_vfsops;
232
233/* Vnode state operations and diagnostics. */
234
235#if defined(DIAGNOSTIC)
236
/*
 * DIAGNOSTIC variants: every state access goes through a checking
 * function that is also handed the calling function and line so that
 * assertion and panic messages can name the call site.
 */
#define VSTATE_GET(vp) \
	vstate_assert_get((vp), __func__, __LINE__)
#define VSTATE_CHANGE(vp, from, to) \
	vstate_assert_change((vp), (from), (to), __func__, __LINE__)
#define VSTATE_WAIT_STABLE(vp) \
	vstate_assert_wait_stable((vp), __func__, __LINE__)
#define VSTATE_ASSERT(vp, state) \
	vstate_assert((vp), (state), __func__, __LINE__)
245
246static void
247vstate_assert(vnode_t *vp, enum vnode_state state, const char *func, int line)
248{
249 vnode_impl_t *node = VNODE_TO_VIMPL(vp);
250
251 KASSERTMSG(mutex_owned(vp->v_interlock), "at %s:%d", func, line);
252
253 if (__predict_true(node->vi_state == state))
254 return;
255 vnpanic(vp, "state is %s, expected %s at %s:%d",
256 vstate_name(node->vi_state), vstate_name(state), func, line);
257}
258
259static enum vnode_state
260vstate_assert_get(vnode_t *vp, const char *func, int line)
261{
262 vnode_impl_t *node = VNODE_TO_VIMPL(vp);
263
264 KASSERTMSG(mutex_owned(vp->v_interlock), "at %s:%d", func, line);
265 if (node->vi_state == VS_MARKER)
266 vnpanic(vp, "state is %s at %s:%d",
267 vstate_name(node->vi_state), func, line);
268
269 return node->vi_state;
270}
271
272static void
273vstate_assert_wait_stable(vnode_t *vp, const char *func, int line)
274{
275 vnode_impl_t *node = VNODE_TO_VIMPL(vp);
276
277 KASSERTMSG(mutex_owned(vp->v_interlock), "at %s:%d", func, line);
278 if (node->vi_state == VS_MARKER)
279 vnpanic(vp, "state is %s at %s:%d",
280 vstate_name(node->vi_state), func, line);
281
282 while (node->vi_state != VS_ACTIVE && node->vi_state != VS_RECLAIMED)
283 cv_wait(&vp->v_cv, vp->v_interlock);
284
285 if (node->vi_state == VS_MARKER)
286 vnpanic(vp, "state is %s at %s:%d",
287 vstate_name(node->vi_state), func, line);
288}
289
/*
 * Change the vnode state from "from" to "to".  Panics if either state
 * is VS_MARKER or if the vnode is not currently in state "from".
 *
 * The caller must hold v_interlock, and vcache.lock as well when the
 * vnode leaves VS_LOADING.  func/line identify the call site.
 */
static void
vstate_assert_change(vnode_t *vp, enum vnode_state from, enum vnode_state to,
    const char *func, int line)
{
	vnode_impl_t *node = VNODE_TO_VIMPL(vp);

	KASSERTMSG(mutex_owned(vp->v_interlock), "at %s:%d", func, line);
	if (from == VS_LOADING)
		KASSERTMSG(mutex_owned(&vcache.lock), "at %s:%d", func, line);

	if (from == VS_MARKER)
		vnpanic(vp, "from is %s at %s:%d",
		    vstate_name(from), func, line);
	if (to == VS_MARKER)
		vnpanic(vp, "to is %s at %s:%d",
		    vstate_name(to), func, line);
	if (node->vi_state != from)
		vnpanic(vp, "from is %s, expected %s at %s:%d\n",
		    vstate_name(node->vi_state), vstate_name(from), func, line);

	node->vi_state = to;
	/* Leaving LOADING is announced on the cache-wide condvar. */
	if (from == VS_LOADING)
		cv_broadcast(&vcache.cv);
	/* Reaching a stable state wakes sleepers in VSTATE_WAIT_STABLE(). */
	if (to == VS_ACTIVE || to == VS_RECLAIMED)
		cv_broadcast(&vp->v_cv);
}
316
317#else /* defined(DIAGNOSTIC) */
318
/*
 * Non-DIAGNOSTIC variants: the state is read directly, changes and
 * waits go through unchecked helpers and VSTATE_ASSERT() expands to
 * nothing.
 */
#define VSTATE_GET(vp) \
	(VNODE_TO_VIMPL((vp))->vi_state)
#define VSTATE_CHANGE(vp, from, to) \
	vstate_change((vp), (from), (to))
#define VSTATE_WAIT_STABLE(vp) \
	vstate_wait_stable((vp))
#define VSTATE_ASSERT(vp, state)
326
327static void
328vstate_wait_stable(vnode_t *vp)
329{
330 vnode_impl_t *node = VNODE_TO_VIMPL(vp);
331
332 while (node->vi_state != VS_ACTIVE && node->vi_state != VS_RECLAIMED)
333 cv_wait(&vp->v_cv, vp->v_interlock);
334}
335
336static void
337vstate_change(vnode_t *vp, enum vnode_state from, enum vnode_state to)
338{
339 vnode_impl_t *node = VNODE_TO_VIMPL(vp);
340
341 node->vi_state = to;
342 if (from == VS_LOADING)
343 cv_broadcast(&vcache.cv);
344 if (to == VS_ACTIVE || to == VS_RECLAIMED)
345 cv_broadcast(&vp->v_cv);
346}
347
348#endif /* defined(DIAGNOSTIC) */
349
/*
 * Initialise the vnode subsystem: allocate the mount used for dead
 * vnodes, set up the free/hold/vrele lists with their locks and
 * condvars, initialise the vnode cache and start the "vdrain" and
 * "vrele" kernel threads.
 */
void
vfs_vnode_sysinit(void)
{
	int error __diagused;

	dead_rootmount = vfs_mountalloc(&dead_vfsops, NULL);
	KASSERT(dead_rootmount != NULL);
	dead_rootmount->mnt_iflag = IMNT_MPSAFE;

	mutex_init(&vnode_free_list_lock, MUTEX_DEFAULT, IPL_NONE);
	TAILQ_INIT(&vnode_free_list);
	TAILQ_INIT(&vnode_hold_list);
	TAILQ_INIT(&vrele_list);

	vcache_init();

	mutex_init(&vrele_lock, MUTEX_DEFAULT, IPL_NONE);
	cv_init(&vdrain_cv, "vdrain");
	cv_init(&vrele_cv, "vrele");
	error = kthread_create(PRI_VM, KTHREAD_MPSAFE, NULL, vdrain_thread,
	    NULL, NULL, "vdrain");
	KASSERTMSG((error == 0), "kthread_create(vdrain) failed: %d", error);
	/* The vrele thread's lwp is recorded; vrelel() compares against it. */
	error = kthread_create(PRI_VM, KTHREAD_MPSAFE, NULL, vrele_thread,
	    NULL, &vrele_lwp, "vrele");
	KASSERTMSG((error == 0), "kthread_create(vrele) failed: %d", error);
}
376
377/*
378 * Allocate a new marker vnode.
379 */
380vnode_t *
381vnalloc_marker(struct mount *mp)
382{
383 vnode_impl_t *node;
384 vnode_t *vp;
385
386 node = pool_cache_get(vcache.pool, PR_WAITOK);
387 memset(node, 0, sizeof(*node));
388 vp = VIMPL_TO_VNODE(node);
389 uvm_obj_init(&vp->v_uobj, &uvm_vnodeops, true, 0);
390 vp->v_mount = mp;
391 vp->v_type = VBAD;
392 node->vi_state = VS_MARKER;
393
394 return vp;
395}
396
397/*
398 * Free a marker vnode.
399 */
400void
401vnfree_marker(vnode_t *vp)
402{
403 vnode_impl_t *node;
404
405 node = VNODE_TO_VIMPL(vp);
406 KASSERT(node->vi_state == VS_MARKER);
407 uvm_obj_destroy(&vp->v_uobj, true);
408 pool_cache_put(vcache.pool, node);
409}
410
411/*
412 * Test a vnode for being a marker vnode.
413 */
414bool
415vnis_marker(vnode_t *vp)
416{
417
418 return (VNODE_TO_VIMPL(vp)->vi_state == VS_MARKER);
419}
420
/*
 * cleanvnode: grab a vnode from freelist, clean and free it.
 *
 * => Must be called with vnode_free_list_lock held.
 * => Releases vnode_free_list_lock.
 * => Returns 0 after one vnode has been reclaimed and released, or
 *    EBUSY if no vnode on either list could be locked.
 */
static int
cleanvnode(void)
{
	vnode_t *vp;
	vnodelst_t *listhd;
	struct mount *mp;

	KASSERT(mutex_owned(&vnode_free_list_lock));

	/* Look at vnodes without holds first, then at the hold list. */
	listhd = &vnode_free_list;
try_nextlist:
	TAILQ_FOREACH(vp, listhd, v_freelist) {
		/*
		 * v_usecount and v_freelisthd are tested without
		 * holding the interlock here; vnodes with a non-zero
		 * use count should never appear on the lists.
		 */
		KASSERT(vp->v_usecount == 0);
		KASSERT(vp->v_freelisthd == listhd);

		/*
		 * Only non-blocking attempts are made while the list lock
		 * is held; on any failure undo what was taken and move on.
		 */
		if (vn_lock(vp, LK_EXCLUSIVE | LK_NOWAIT) != 0)
			continue;
		if (!mutex_tryenter(vp->v_interlock)) {
			VOP_UNLOCK(vp);
			continue;
		}
		mp = vp->v_mount;
		if (fstrans_start_nowait(mp, FSTRANS_SHARED) != 0) {
			mutex_exit(vp->v_interlock);
			VOP_UNLOCK(vp);
			continue;
		}
		break;
	}

	/* TAILQ_FOREACH leaves vp NULL when the list was exhausted. */
	if (vp == NULL) {
		if (listhd == &vnode_free_list) {
			listhd = &vnode_hold_list;
			goto try_nextlist;
		}
		mutex_exit(&vnode_free_list_lock);
		return EBUSY;
	}

	/* Remove it from the freelist. */
	TAILQ_REMOVE(listhd, vp, v_freelist);
	vp->v_freelisthd = NULL;
	mutex_exit(&vnode_free_list_lock);

	KASSERT(vp->v_usecount == 0);

	/*
	 * The vnode is still associated with a file system, so we must
	 * clean it out before freeing it.  We need to add a reference
	 * before doing this.
	 */
	vp->v_usecount = 1;
	vcache_reclaim(vp);
	/* vrelel() drops the reference taken above and the interlock. */
	vrelel(vp, 0);
	fstrans_done(mp);

	return 0;
}
490
/*
 * Helper thread to keep the number of vnodes below desiredvnodes.
 *
 * The thread never returns.  It wakes up on vdrain_cv or after a
 * timeout of hz ticks and calls cleanvnode() for as long as numvnodes
 * exceeds desiredvnodes and cleanvnode() succeeds.
 */
static void
vdrain_thread(void *cookie)
{
	int error;

	mutex_enter(&vnode_free_list_lock);

	for (;;) {
		cv_timedwait(&vdrain_cv, &vnode_free_list_lock, hz);
		while (numvnodes > desiredvnodes) {
			/* cleanvnode() returns with the list lock released. */
			error = cleanvnode();
			if (error)
				kpause("vndsbusy", false, hz, NULL);
			mutex_enter(&vnode_free_list_lock);
			/* Nothing could be cleaned; wait for the next wakeup. */
			if (error)
				break;
		}
	}
}
513
514/*
515 * Remove a vnode from its freelist.
516 */
517void
518vremfree(vnode_t *vp)
519{
520
521 KASSERT(mutex_owned(vp->v_interlock));
522 KASSERT(vp->v_usecount == 0);
523
524 /*
525 * Note that the reference count must not change until
526 * the vnode is removed.
527 */
528 mutex_enter(&vnode_free_list_lock);
529 if (vp->v_holdcnt > 0) {
530 KASSERT(vp->v_freelisthd == &vnode_hold_list);
531 } else {
532 KASSERT(vp->v_freelisthd == &vnode_free_list);
533 }
534 TAILQ_REMOVE(vp->v_freelisthd, vp, v_freelist);
535 vp->v_freelisthd = NULL;
536 mutex_exit(&vnode_free_list_lock);
537}
538
/*
 * vget: get a particular vnode from the free list, increment its reference
 * count and return it.
 *
 * => Must be called with v_interlock held.
 * => v_interlock is released on return, both on success and on error.
 * => flags may only contain LK_NOWAIT, and waitok must be the inverse
 *    of LK_NOWAIT being set.
 * => Returns 0 on success, ENOENT if the vnode is reclaimed and EBUSY
 *    if LK_NOWAIT was given and the vnode is not in state VS_ACTIVE.
 *
 * If state is VS_RECLAIMING, the vnode may be eliminated in vcache_reclaim().
 * In that case, we cannot grab the vnode, so the process is awakened when
 * the transition is completed, and an error returned to indicate that the
 * vnode is no longer usable.
 *
 * If state is VS_LOADING or VS_BLOCKED, wait until the vnode enters a
 * stable state (VS_ACTIVE or VS_RECLAIMED).
 */
int
vget(vnode_t *vp, int flags, bool waitok)
{

	KASSERT(mutex_owned(vp->v_interlock));
	KASSERT((flags & ~LK_NOWAIT) == 0);
	KASSERT(waitok == ((flags & LK_NOWAIT) == 0));

	/*
	 * Before adding a reference, we must remove the vnode
	 * from its freelist.
	 */
	if (vp->v_usecount == 0) {
		vremfree(vp);
		vp->v_usecount = 1;
	} else {
		atomic_inc_uint(&vp->v_usecount);
	}

	/*
	 * If the vnode is in the process of changing state we wait
	 * for the change to complete and take care not to return
	 * a clean vnode.
	 */
	if (! ISSET(flags, LK_NOWAIT))
		VSTATE_WAIT_STABLE(vp);
	if (VSTATE_GET(vp) == VS_RECLAIMED) {
		/* vrelel() gives back the reference and the interlock. */
		vrelel(vp, 0);
		return ENOENT;
	} else if (VSTATE_GET(vp) != VS_ACTIVE) {
		/* Only reachable when we did not wait for a stable state. */
		KASSERT(ISSET(flags, LK_NOWAIT));
		vrelel(vp, 0);
		return EBUSY;
	}

	/*
	 * Ok, we got it in good shape.
	 */
	VSTATE_ASSERT(vp, VS_ACTIVE);
	mutex_exit(vp->v_interlock);

	return 0;
}
596
/*
 * vput: unlock and release the reference.
 *
 * Equivalent to VOP_UNLOCK() followed by vrele(); the vnode lock is
 * dropped before the reference is.
 */
void
vput(vnode_t *vp)
{

	VOP_UNLOCK(vp);
	vrele(vp);
}
607
608/*
609 * Try to drop reference on a vnode. Abort if we are releasing the
610 * last reference. Note: this _must_ succeed if not the last reference.
611 */
612static inline bool
613vtryrele(vnode_t *vp)
614{
615 u_int use, next;
616
617 for (use = vp->v_usecount;; use = next) {
618 if (use == 1) {
619 return false;
620 }
621 KASSERT(use > 1);
622 next = atomic_cas_uint(&vp->v_usecount, use, use - 1);
623 if (__predict_true(next == use)) {
624 return true;
625 }
626 }
627}
628
/*
 * Vnode release.  If reference count drops to zero, call inactive
 * routine and either return to freelist or free to the pool.
 *
 * => Must be called with v_interlock held and the vnode off the
 *    free lists; every return path below releases v_interlock.
 * => flags: VRELEL_ASYNC_RELE forces the final release to be handed
 *    to the vrele thread.
 */
static void
vrelel(vnode_t *vp, int flags)
{
	bool recycle, defer;
	int error;

	KASSERT(mutex_owned(vp->v_interlock));
	KASSERT(vp->v_freelisthd == NULL);

	if (__predict_false(vp->v_op == dead_vnodeop_p &&
	    VSTATE_GET(vp) != VS_RECLAIMED)) {
		vnpanic(vp, "dead but not clean");
	}

	/*
	 * If not the last reference, just drop the reference count
	 * and unlock.
	 */
	if (vtryrele(vp)) {
		mutex_exit(vp->v_interlock);
		return;
	}
	if (vp->v_usecount <= 0 || vp->v_writecount != 0) {
		vnpanic(vp, "%s: bad ref count", __func__);
	}

#ifdef DIAGNOSTIC
	if ((vp->v_type == VBLK || vp->v_type == VCHR) &&
	    vp->v_specnode != NULL && vp->v_specnode->sn_opencnt != 0) {
		vprint("vrelel: missing VOP_CLOSE()", vp);
	}
#endif

	/*
	 * If not clean, deactivate the vnode, but preserve
	 * our reference across the call to VOP_INACTIVE().
	 */
	if (VSTATE_GET(vp) != VS_RECLAIMED) {
		recycle = false;

		/*
		 * XXX This ugly block can be largely eliminated if
		 * locking is pushed down into the file systems.
		 *
		 * Defer vnode release to vrele_thread if caller
		 * requests it explicitly or is the pagedaemon.
		 */
		if ((curlwp == uvm.pagedaemon_lwp) ||
		    (flags & VRELEL_ASYNC_RELE) != 0) {
			defer = true;
		} else if (curlwp == vrele_lwp) {
			/*
			 * We have to try harder.
			 */
			mutex_exit(vp->v_interlock);
			error = vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
			KASSERTMSG((error == 0), "vn_lock failed: %d", error);
			mutex_enter(vp->v_interlock);
			defer = false;
		} else {
			/* If we can't acquire the lock, then defer. */
			mutex_exit(vp->v_interlock);
			error = vn_lock(vp,
			    LK_EXCLUSIVE | LK_RETRY | LK_NOWAIT);
			defer = (error != 0);
			mutex_enter(vp->v_interlock);
		}

		KASSERT(mutex_owned(vp->v_interlock));
		KASSERT(! (curlwp == vrele_lwp && defer));

		if (defer) {
			/*
			 * Defer reclaim to the kthread; it's not safe to
			 * clean it here.  We donate it our last reference.
			 */
			mutex_enter(&vrele_lock);
			TAILQ_INSERT_TAIL(&vrele_list, vp, v_freelist);
			/* Only wake the thread once enough work has piled up. */
			if (++vrele_pending > (desiredvnodes >> 8))
				cv_signal(&vrele_cv);
			mutex_exit(&vrele_lock);
			mutex_exit(vp->v_interlock);
			return;
		}

		/*
		 * If the node got another reference while we
		 * released the interlock, don't try to inactivate it yet.
		 */
		if (__predict_false(vtryrele(vp))) {
			VOP_UNLOCK(vp);
			mutex_exit(vp->v_interlock);
			return;
		}
		VSTATE_CHANGE(vp, VS_ACTIVE, VS_BLOCKED);
		mutex_exit(vp->v_interlock);

		/*
		 * The vnode must not gain another reference while being
		 * deactivated.  If VOP_INACTIVE() indicates that
		 * the described file has been deleted, then recycle
		 * the vnode.
		 *
		 * Note that VOP_INACTIVE() will drop the vnode lock.
		 */
		VOP_INACTIVE(vp, &recycle);
		if (recycle) {
			/* vcache_reclaim() below will drop the lock. */
			if (vn_lock(vp, LK_EXCLUSIVE) != 0)
				recycle = false;
		}
		mutex_enter(vp->v_interlock);
		VSTATE_CHANGE(vp, VS_BLOCKED, VS_ACTIVE);
		if (!recycle) {
			/* Someone else took a reference in the meantime. */
			if (vtryrele(vp)) {
				mutex_exit(vp->v_interlock);
				return;
			}
		}

		/* Take care of space accounting. */
		if (vp->v_iflag & VI_EXECMAP) {
			atomic_add_int(&uvmexp.execpages,
			    -vp->v_uobj.uo_npages);
			atomic_add_int(&uvmexp.filepages,
			    vp->v_uobj.uo_npages);
		}
		vp->v_iflag &= ~(VI_TEXT|VI_EXECMAP|VI_WRMAP);
		vp->v_vflag &= ~VV_MAPPED;

		/*
		 * Recycle the vnode if the file is now unused (unlinked),
		 * otherwise just free it.
		 */
		if (recycle) {
			VSTATE_ASSERT(vp, VS_ACTIVE);
			vcache_reclaim(vp);
		}
		KASSERT(vp->v_usecount > 0);
	}

	if (atomic_dec_uint_nv(&vp->v_usecount) != 0) {
		/* Gained another reference while being reclaimed. */
		mutex_exit(vp->v_interlock);
		return;
	}

	if (VSTATE_GET(vp) == VS_RECLAIMED) {
		/*
		 * It's clean so destroy it.  It isn't referenced
		 * anywhere since it has been reclaimed.
		 */
		KASSERT(vp->v_holdcnt == 0);
		KASSERT(vp->v_writecount == 0);
		mutex_exit(vp->v_interlock);
		vfs_insmntque(vp, NULL);
		if (vp->v_type == VBLK || vp->v_type == VCHR) {
			spec_node_destroy(vp);
		}
		vcache_free(VNODE_TO_VIMPL(vp));
	} else {
		/*
		 * Otherwise, put it back onto the freelist.  It
		 * can't be destroyed while still associated with
		 * a file system.
		 */
		mutex_enter(&vnode_free_list_lock);
		if (vp->v_holdcnt > 0) {
			vp->v_freelisthd = &vnode_hold_list;
		} else {
			vp->v_freelisthd = &vnode_free_list;
		}
		TAILQ_INSERT_TAIL(vp->v_freelisthd, vp, v_freelist);
		mutex_exit(&vnode_free_list_lock);
		mutex_exit(vp->v_interlock);
	}
}
810
811void
812vrele(vnode_t *vp)
813{
814
815 if (vtryrele(vp)) {
816 return;
817 }
818 mutex_enter(vp->v_interlock);
819 vrelel(vp, 0);
820}
821
822/*
823 * Asynchronous vnode release, vnode is released in different context.
824 */
825void
826vrele_async(vnode_t *vp)
827{
828
829 if (vtryrele(vp)) {
830 return;
831 }
832 mutex_enter(vp->v_interlock);
833 vrelel(vp, VRELEL_ASYNC_RELE);
834}
835
/*
 * Kernel thread working off vrele_list: releases the vnodes whose
 * final release was deferred by vrelel().  The thread never returns.
 *
 * Vnodes for which fstrans_start_nowait() fails are parked on a local
 * skip list and put back onto vrele_list the next time the thread
 * has waited on an empty list.
 */
static void
vrele_thread(void *cookie)
{
	vnodelst_t skip_list;
	vnode_t *vp;
	struct mount *mp;

	TAILQ_INIT(&skip_list);

	mutex_enter(&vrele_lock);
	for (;;) {
		while (TAILQ_EMPTY(&vrele_list)) {
			/* Tell vrele_flush() waiters that a pass has completed. */
			vrele_gen++;
			cv_broadcast(&vrele_cv);
			cv_timedwait(&vrele_cv, &vrele_lock, hz);
			TAILQ_CONCAT(&vrele_list, &skip_list, v_freelist);
		}
		vp = TAILQ_FIRST(&vrele_list);
		mp = vp->v_mount;
		TAILQ_REMOVE(&vrele_list, vp, v_freelist);
		if (fstrans_start_nowait(mp, FSTRANS_LAZY) != 0) {
			/* Still counted in vrele_pending; retried later. */
			TAILQ_INSERT_TAIL(&skip_list, vp, v_freelist);
			continue;
		}
		vrele_pending--;
		mutex_exit(&vrele_lock);

		/*
		 * If not the last reference, then ignore the vnode
		 * and look for more work.
		 */
		mutex_enter(vp->v_interlock);
		vrelel(vp, 0);
		fstrans_done(mp);
		mutex_enter(&vrele_lock);
	}
}
873
874void
875vrele_flush(void)
876{
877 int gen;
878
879 mutex_enter(&vrele_lock);
880 gen = vrele_gen;
881 while (vrele_pending && gen == vrele_gen) {
882 cv_broadcast(&vrele_cv);
883 cv_wait(&vrele_cv, &vrele_lock);
884 }
885 mutex_exit(&vrele_lock);
886}
887
/*
 * Vnode reference, where a reference is already held by some other
 * object (for example, a file structure).
 *
 * Because the use count is already non-zero, it is incremented
 * atomically without taking the interlock.
 */
void
vref(vnode_t *vp)
{

	KASSERT(vp->v_usecount != 0);

	atomic_inc_uint(&vp->v_usecount);
}
900
901/*
902 * Page or buffer structure gets a reference.
903 * Called with v_interlock held.
904 */
905void
906vholdl(vnode_t *vp)
907{
908
909 KASSERT(mutex_owned(vp->v_interlock));
910
911 if (vp->v_holdcnt++ == 0 && vp->v_usecount == 0) {
912 mutex_enter(&vnode_free_list_lock);
913 KASSERT(vp->v_freelisthd == &vnode_free_list);
914 TAILQ_REMOVE(vp->v_freelisthd, vp, v_freelist);
915 vp->v_freelisthd = &vnode_hold_list;
916 TAILQ_INSERT_TAIL(vp->v_freelisthd, vp, v_freelist);
917 mutex_exit(&vnode_free_list_lock);
918 }
919}
920
921/*
922 * Page or buffer structure frees a reference.
923 * Called with v_interlock held.
924 */
925void
926holdrelel(vnode_t *vp)
927{
928
929 KASSERT(mutex_owned(vp->v_interlock));
930
931 if (vp->v_holdcnt <= 0) {
932 vnpanic(vp, "%s: holdcnt vp %p", __func__, vp);
933 }
934
935 vp->v_holdcnt--;
936 if (vp->v_holdcnt == 0 && vp->v_usecount == 0) {
937 mutex_enter(&vnode_free_list_lock);
938 KASSERT(vp->v_freelisthd == &vnode_hold_list);
939 TAILQ_REMOVE(vp->v_freelisthd, vp, v_freelist);
940 vp->v_freelisthd = &vnode_free_list;
941 TAILQ_INSERT_TAIL(vp->v_freelisthd, vp, v_freelist);
942 mutex_exit(&vnode_free_list_lock);
943 }
944}
945
946/*
947 * Recycle an unused vnode if caller holds the last reference.
948 */
949bool
950vrecycle(vnode_t *vp)
951{
952
953 if (vn_lock(vp, LK_EXCLUSIVE) != 0)
954 return false;
955
956 mutex_enter(vp->v_interlock);
957
958 if (vp->v_usecount != 1) {
959 mutex_exit(vp->v_interlock);
960 VOP_UNLOCK(vp);
961 return false;
962 }
963 vcache_reclaim(vp);
964 vrelel(vp, 0);
965 return true;
966}
967
968/*
969 * Eliminate all activity associated with the requested vnode
970 * and with all vnodes aliased to the requested vnode.
971 */
972void
973vrevoke(vnode_t *vp)
974{
975 vnode_t *vq;
976 enum vtype type;
977 dev_t dev;
978
979 KASSERT(vp->v_usecount > 0);
980
981 mutex_enter(vp->v_interlock);
982 VSTATE_WAIT_STABLE(vp);
983 if (VSTATE_GET(vp) == VS_RECLAIMED) {
984 mutex_exit(vp->v_interlock);
985 return;
986 } else if (vp->v_type != VBLK && vp->v_type != VCHR) {
987 atomic_inc_uint(&vp->v_usecount);
988 mutex_exit(vp->v_interlock);
989 vgone(vp);
990 return;
991 } else {
992 dev = vp->v_rdev;
993 type = vp->v_type;
994 mutex_exit(vp->v_interlock);
995 }
996
997 while (spec_node_lookup_by_dev(type, dev, &vq) == 0) {
998 vgone(vq);
999 }
1000}
1001
1002/*
1003 * Eliminate all activity associated with a vnode in preparation for
1004 * reuse. Drops a reference from the vnode.
1005 */
1006void
1007vgone(vnode_t *vp)
1008{
1009
1010 if (vn_lock(vp, LK_EXCLUSIVE) != 0) {
1011 VSTATE_ASSERT(vp, VS_RECLAIMED);
1012 vrele(vp);
1013 }
1014
1015 mutex_enter(vp->v_interlock);
1016 vcache_reclaim(vp);
1017 vrelel(vp, 0);
1018}
1019
1020static inline uint32_t
1021vcache_hash(const struct vcache_key *key)
1022{
1023 uint32_t hash = HASH32_BUF_INIT;
1024
1025 hash = hash32_buf(&key->vk_mount, sizeof(struct mount *), hash);
1026 hash = hash32_buf(key->vk_key, key->vk_key_len, hash);
1027 return hash;
1028}
1029
/*
 * Initialise the vnode cache: create the pool vnodes are allocated
 * from, the cache lock and condvar, and a hash table sized by
 * desiredvnodes.
 */
static void
vcache_init(void)
{

	vcache.pool = pool_cache_init(sizeof(vnode_impl_t), 0, 0, 0,
	    "vcachepl", NULL, IPL_NONE, NULL, NULL, NULL);
	KASSERT(vcache.pool != NULL);
	mutex_init(&vcache.lock, MUTEX_DEFAULT, IPL_NONE);
	cv_init(&vcache.cv, "vcache");
	vcache.hashtab = hashinit(desiredvnodes, HASH_SLIST, true,
	    &vcache.hashmask);
}
1042
1043static void
1044vcache_reinit(void)
1045{
1046 int i;
1047 uint32_t hash;
1048 u_long oldmask, newmask;
1049 struct hashhead *oldtab, *newtab;
1050 vnode_impl_t *node;
1051
1052 newtab = hashinit(desiredvnodes, HASH_SLIST, true, &newmask);
1053 mutex_enter(&vcache.lock);
1054 oldtab = vcache.hashtab;
1055 oldmask = vcache.hashmask;
1056 vcache.hashtab = newtab;
1057 vcache.hashmask = newmask;
1058 for (i = 0; i <= oldmask; i++) {
1059 while ((node = SLIST_FIRST(&oldtab[i])) != NULL) {
1060 SLIST_REMOVE(&oldtab[i], node, vnode_impl, vi_hash);
1061 hash = vcache_hash(&node->vi_key);
1062 SLIST_INSERT_HEAD(&newtab[hash & vcache.hashmask],
1063 node, vi_hash);
1064 }
1065 }
1066 mutex_exit(&vcache.lock);
1067 hashdone(oldtab, HASH_SLIST, oldmask);
1068}
1069
1070static inline vnode_impl_t *
1071vcache_hash_lookup(const struct vcache_key *key, uint32_t hash)
1072{
1073 struct hashhead *hashp;
1074 vnode_impl_t *node;
1075
1076 KASSERT(mutex_owned(&vcache.lock));
1077
1078 hashp = &vcache.hashtab[hash & vcache.hashmask];
1079 SLIST_FOREACH(node, hashp, vi_hash) {
1080 if (key->vk_mount != node->vi_key.vk_mount)
1081 continue;
1082 if (key->vk_key_len != node->vi_key.vk_key_len)
1083 continue;
1084 if (memcmp(key->vk_key, node->vi_key.vk_key, key->vk_key_len))
1085 continue;
1086 return node;
1087 }
1088 return NULL;
1089}
1090
/*
 * Allocate a new, uninitialized vcache node.
 *
 * The node comes zeroed from the pool and is returned with one
 * reference (v_usecount == 1), type VNON and state VS_LOADING.
 * numvnodes is incremented and the vdrain thread is signalled once
 * it exceeds desiredvnodes by more than ten percent.
 */
static vnode_impl_t *
vcache_alloc(void)
{
	vnode_impl_t *node;
	vnode_t *vp;

	node = pool_cache_get(vcache.pool, PR_WAITOK);
	memset(node, 0, sizeof(*node));

	/* SLIST_INIT(&node->vi_hash); */

	vp = VIMPL_TO_VNODE(node);
	uvm_obj_init(&vp->v_uobj, &uvm_vnodeops, true, 0);
	cv_init(&vp->v_cv, "vnode");
	/* LIST_INIT(&vp->v_nclist); */
	/* LIST_INIT(&vp->v_dnclist); */

	mutex_enter(&vnode_free_list_lock);
	numvnodes++;
	if (numvnodes > desiredvnodes + desiredvnodes / 10)
		cv_signal(&vdrain_cv);
	mutex_exit(&vnode_free_list_lock);

	rw_init(&vp->v_lock);
	vp->v_usecount = 1;
	vp->v_type = VNON;
	vp->v_size = vp->v_writesize = VSIZENOTSET;

	node->vi_state = VS_LOADING;

	return node;
}
1126
1127/*
1128 * Free an unused, unreferenced vcache node.
1129 */
1130static void
1131vcache_free(vnode_impl_t *node)
1132{
1133 vnode_t *vp;
1134
1135 vp = VIMPL_TO_VNODE(node);
1136
1137 KASSERT(vp->v_usecount == 0);
1138
1139 rw_destroy(&vp->v_lock);
1140 mutex_enter(&vnode_free_list_lock);
1141 numvnodes--;
1142 mutex_exit(&vnode_free_list_lock);
1143
1144 uvm_obj_destroy(&vp->v_uobj, true);
1145 cv_destroy(&vp->v_cv);
1146 pool_cache_put(vcache.pool, node);
1147}
1148
1149/*
1150 * Get a vnode / fs node pair by key and return it referenced through vpp.
1151 */
1152int
1153vcache_get(struct mount *mp, const void *key, size_t key_len,
1154 struct vnode **vpp)
1155{
1156 int error;
1157 uint32_t hash;
1158 const void *new_key;
1159 struct vnode *vp;
1160 struct vcache_key vcache_key;
1161 vnode_impl_t *node, *new_node;
1162
1163 new_key = NULL;
1164 *vpp = NULL;
1165
1166 vcache_key.vk_mount = mp;
1167 vcache_key.vk_key = key;
1168 vcache_key.vk_key_len = key_len;
1169 hash = vcache_hash(&vcache_key);
1170
1171again:
1172 mutex_enter(&vcache.lock);
1173 node = vcache_hash_lookup(&vcache_key, hash);
1174
1175 /* If found, take a reference or retry. */
1176 if (__predict_true(node != NULL)) {
1177 /*
1178 * If the vnode is loading we cannot take the v_interlock
1179 * here as it might change during load (see uvm_obj_setlock()).
1180 * As changing state from VS_LOADING requires both vcache.lock
1181 * and v_interlock it is safe to test with vcache.lock held.
1182 *
1183 * Wait for vnodes changing state from VS_LOADING and retry.
1184 */
1185 if (__predict_false(node->vi_state == VS_LOADING)) {
1186 cv_wait(&vcache.cv, &vcache.lock);
1187 mutex_exit(&vcache.lock);
1188 goto again;
1189 }
1190 vp = VIMPL_TO_VNODE(node);
1191 mutex_enter(vp->v_interlock);
1192 mutex_exit(&vcache.lock);
1193 error = vget(vp, 0, true /* wait */);
1194 if (error == ENOENT)
1195 goto again;
1196 if (error == 0)
1197 *vpp = vp;
1198 KASSERT((error != 0) == (*vpp == NULL));
1199 return error;
1200 }
1201 mutex_exit(&vcache.lock);
1202
1203 /* Allocate and initialize a new vcache / vnode pair. */
1204 error = vfs_busy(mp, NULL);
1205 if (error)
1206 return error;
1207 new_node = vcache_alloc();
1208 new_node->vi_key = vcache_key;
1209 vp = VIMPL_TO_VNODE(new_node);
1210 mutex_enter(&vcache.lock);
1211 node = vcache_hash_lookup(&vcache_key, hash);
1212 if (node == NULL) {
1213 SLIST_INSERT_HEAD(&vcache.hashtab[hash & vcache.hashmask],
1214 new_node, vi_hash);
1215 node = new_node;
1216 }
1217
1218 /* If another thread beat us inserting this node, retry. */
1219 if (node != new_node) {
1220 mutex_enter(vp->v_interlock);
1221 VSTATE_CHANGE(vp, VS_LOADING, VS_RECLAIMED);
1222 mutex_exit(&vcache.lock);
1223 vrelel(vp, 0);
1224 vfs_unbusy(mp, false, NULL);
1225 goto again;
1226 }
1227 mutex_exit(&vcache.lock);
1228
1229 /* Load the fs node. Exclusive as new_node is VS_LOADING. */
1230 error = VFS_LOADVNODE(mp, vp, key, key_len, &new_key);
1231 if (error) {
1232 mutex_enter(&vcache.lock);
1233 SLIST_REMOVE(&vcache.hashtab[hash & vcache.hashmask],
1234 new_node, vnode_impl, vi_hash);
1235 mutex_enter(vp->v_interlock);
1236 VSTATE_CHANGE(vp, VS_LOADING, VS_RECLAIMED);
1237 mutex_exit(&vcache.lock);
1238 vrelel(vp, 0);
1239 vfs_unbusy(mp, false, NULL);
1240 KASSERT(*vpp == NULL);
1241 return error;
1242 }
1243 KASSERT(new_key != NULL);
1244 KASSERT(memcmp(key, new_key, key_len) == 0);
1245 KASSERT(vp->v_op != NULL);
1246 vfs_insmntque(vp, mp);
1247 if ((mp->mnt_iflag & IMNT_MPSAFE) != 0)
1248 vp->v_vflag |= VV_MPSAFE;
1249 vfs_unbusy(mp, true, NULL);
1250
1251 /* Finished loading, finalize node. */
1252 mutex_enter(&vcache.lock);
1253 new_node->vi_key.vk_key = new_key;
1254 mutex_enter(vp->v_interlock);
1255 VSTATE_CHANGE(vp, VS_LOADING, VS_ACTIVE);
1256 mutex_exit(vp->v_interlock);
1257 mutex_exit(&vcache.lock);
1258 *vpp = vp;
1259 return 0;
1260}
1261
1262/*
1263 * Create a new vnode / fs node pair and return it referenced through vpp.
1264 */
1265int
1266vcache_new(struct mount *mp, struct vnode *dvp, struct vattr *vap,
1267 kauth_cred_t cred, struct vnode **vpp)
1268{
1269 int error;
1270 uint32_t hash;
1271 struct vnode *ovp, *vp;
1272 vnode_impl_t *new_node;
1273 vnode_impl_t *old_node __diagused;
1274
1275 *vpp = NULL;
1276
1277 /* Allocate and initialize a new vcache / vnode pair. */
1278 error = vfs_busy(mp, NULL);
1279 if (error)
1280 return error;
1281 new_node = vcache_alloc();
1282 new_node->vi_key.vk_mount = mp;
1283 vp = VIMPL_TO_VNODE(new_node);
1284
1285 /* Create and load the fs node. */
1286 error = VFS_NEWVNODE(mp, dvp, vp, vap, cred,
1287 &new_node->vi_key.vk_key_len, &new_node->vi_key.vk_key);
1288 if (error) {
1289 mutex_enter(&vcache.lock);
1290 mutex_enter(vp->v_interlock);
1291 VSTATE_CHANGE(vp, VS_LOADING, VS_RECLAIMED);
1292 mutex_exit(&vcache.lock);
1293 vrelel(vp, 0);
1294 vfs_unbusy(mp, false, NULL);
1295 KASSERT(*vpp == NULL);
1296 return error;
1297 }
1298 KASSERT(new_node->vi_key.vk_key != NULL);
1299 KASSERT(vp->v_op != NULL);
1300 hash = vcache_hash(&new_node->vi_key);
1301
1302 /* Wait for previous instance to be reclaimed, then insert new node. */
1303 mutex_enter(&vcache.lock);
1304 while ((old_node = vcache_hash_lookup(&new_node->vi_key, hash))) {
1305 ovp = VIMPL_TO_VNODE(old_node);
1306 mutex_enter(ovp->v_interlock);
1307 mutex_exit(&vcache.lock);
1308 error = vget(ovp, 0, true /* wait */);
1309 KASSERT(error == ENOENT);
1310 mutex_enter(&vcache.lock);
1311 }
1312 SLIST_INSERT_HEAD(&vcache.hashtab[hash & vcache.hashmask],
1313 new_node, vi_hash);
1314 mutex_exit(&vcache.lock);
1315 vfs_insmntque(vp, mp);
1316 if ((mp->mnt_iflag & IMNT_MPSAFE) != 0)
1317 vp->v_vflag |= VV_MPSAFE;
1318 vfs_unbusy(mp, true, NULL);
1319
1320 /* Finished loading, finalize node. */
1321 mutex_enter(&vcache.lock);
1322 mutex_enter(vp->v_interlock);
1323 VSTATE_CHANGE(vp, VS_LOADING, VS_ACTIVE);
1324 mutex_exit(&vcache.lock);
1325 mutex_exit(vp->v_interlock);
1326 *vpp = vp;
1327 return 0;
1328}
1329
1330/*
1331 * Prepare key change: lock old and new cache node.
1332 * Return an error if the new node already exists.
1333 */
1334int
1335vcache_rekey_enter(struct mount *mp, struct vnode *vp,
1336 const void *old_key, size_t old_key_len,
1337 const void *new_key, size_t new_key_len)
1338{
1339 uint32_t old_hash, new_hash;
1340 struct vcache_key old_vcache_key, new_vcache_key;
1341 vnode_impl_t *node, *new_node;
1342 struct vnode *tvp;
1343
1344 old_vcache_key.vk_mount = mp;
1345 old_vcache_key.vk_key = old_key;
1346 old_vcache_key.vk_key_len = old_key_len;
1347 old_hash = vcache_hash(&old_vcache_key);
1348
1349 new_vcache_key.vk_mount = mp;
1350 new_vcache_key.vk_key = new_key;
1351 new_vcache_key.vk_key_len = new_key_len;
1352 new_hash = vcache_hash(&new_vcache_key);
1353
1354 new_node = vcache_alloc();
1355 new_node->vi_key = new_vcache_key;
1356 tvp = VIMPL_TO_VNODE(new_node);
1357
1358 /* Insert locked new node used as placeholder. */
1359 mutex_enter(&vcache.lock);
1360 node = vcache_hash_lookup(&new_vcache_key, new_hash);
1361 if (node != NULL) {
1362 mutex_enter(tvp->v_interlock);
1363 VSTATE_CHANGE(tvp, VS_LOADING, VS_RECLAIMED);
1364 mutex_exit(&vcache.lock);
1365 vrelel(tvp, 0);
1366 return EEXIST;
1367 }
1368 SLIST_INSERT_HEAD(&vcache.hashtab[new_hash & vcache.hashmask],
1369 new_node, vi_hash);
1370
1371 /* Lock old node. */
1372 node = vcache_hash_lookup(&old_vcache_key, old_hash);
1373 KASSERT(node != NULL);
1374 KASSERT(VIMPL_TO_VNODE(node) == vp);
1375 mutex_enter(vp->v_interlock);
1376 VSTATE_CHANGE(vp, VS_ACTIVE, VS_BLOCKED);
1377 node->vi_key = old_vcache_key;
1378 mutex_exit(vp->v_interlock);
1379 mutex_exit(&vcache.lock);
1380 return 0;
1381}
1382
1383/*
1384 * Key change complete: remove old node and unlock new node.
1385 */
1386void
1387vcache_rekey_exit(struct mount *mp, struct vnode *vp,
1388 const void *old_key, size_t old_key_len,
1389 const void *new_key, size_t new_key_len)
1390{
1391 uint32_t old_hash, new_hash;
1392 struct vcache_key old_vcache_key, new_vcache_key;
1393 vnode_impl_t *old_node, *new_node;
1394 struct vnode *tvp;
1395
1396 old_vcache_key.vk_mount = mp;
1397 old_vcache_key.vk_key = old_key;
1398 old_vcache_key.vk_key_len = old_key_len;
1399 old_hash = vcache_hash(&old_vcache_key);
1400
1401 new_vcache_key.vk_mount = mp;
1402 new_vcache_key.vk_key = new_key;
1403 new_vcache_key.vk_key_len = new_key_len;
1404 new_hash = vcache_hash(&new_vcache_key);
1405
1406 mutex_enter(&vcache.lock);
1407
1408 /* Lookup old and new node. */
1409 old_node = vcache_hash_lookup(&old_vcache_key, old_hash);
1410 KASSERT(old_node != NULL);
1411 KASSERT(VIMPL_TO_VNODE(old_node) == vp);
1412 mutex_enter(vp->v_interlock);
1413 VSTATE_ASSERT(vp, VS_BLOCKED);
1414
1415 new_node = vcache_hash_lookup(&new_vcache_key, new_hash);
1416 KASSERT(new_node != NULL);
1417 KASSERT(new_node->vi_key.vk_key_len == new_key_len);
1418 tvp = VIMPL_TO_VNODE(new_node);
1419 mutex_enter(tvp->v_interlock);
1420 VSTATE_ASSERT(VIMPL_TO_VNODE(new_node), VS_LOADING);
1421
1422 /* Rekey old node and put it onto its new hashlist. */
1423 old_node->vi_key = new_vcache_key;
1424 if (old_hash != new_hash) {
1425 SLIST_REMOVE(&vcache.hashtab[old_hash & vcache.hashmask],
1426 old_node, vnode_impl, vi_hash);
1427 SLIST_INSERT_HEAD(&vcache.hashtab[new_hash & vcache.hashmask],
1428 old_node, vi_hash);
1429 }
1430 VSTATE_CHANGE(vp, VS_BLOCKED, VS_ACTIVE);
1431 mutex_exit(vp->v_interlock);
1432
1433 /* Remove new node used as placeholder. */
1434 SLIST_REMOVE(&vcache.hashtab[new_hash & vcache.hashmask],
1435 new_node, vnode_impl, vi_hash);
1436 VSTATE_CHANGE(tvp, VS_LOADING, VS_RECLAIMED);
1437 mutex_exit(&vcache.lock);
1438 vrelel(tvp, 0);
1439}
1440
1441/*
1442 * Disassociate the underlying file system from a vnode.
1443 *
1444 * Must be called with vnode locked and will return unlocked.
1445 * Must be called with the interlock held, and will return with it held.
1446 */
1447static void
1448vcache_reclaim(vnode_t *vp)
1449{
1450 lwp_t *l = curlwp;
1451 vnode_impl_t *node = VNODE_TO_VIMPL(vp);
1452 uint32_t hash;
1453 uint8_t temp_buf[64], *temp_key;
1454 size_t temp_key_len;
1455 bool recycle, active;
1456 int error;
1457
1458 KASSERT((vp->v_vflag & VV_LOCKSWORK) == 0 ||
1459 VOP_ISLOCKED(vp) == LK_EXCLUSIVE);
1460 KASSERT(mutex_owned(vp->v_interlock));
1461 KASSERT(vp->v_usecount != 0);
1462
1463 active = (vp->v_usecount > 1);
1464 temp_key_len = node->vi_key.vk_key_len;
1465 /*
1466 * Prevent the vnode from being recycled or brought into use
1467 * while we clean it out.
1468 */
1469 VSTATE_CHANGE(vp, VS_ACTIVE, VS_RECLAIMING);
1470 if (vp->v_iflag & VI_EXECMAP) {
1471 atomic_add_int(&uvmexp.execpages, -vp->v_uobj.uo_npages);
1472 atomic_add_int(&uvmexp.filepages, vp->v_uobj.uo_npages);
1473 }
1474 vp->v_iflag &= ~(VI_TEXT|VI_EXECMAP);
1475 mutex_exit(vp->v_interlock);
1476
1477 /* Replace the vnode key with a temporary copy. */
1478 if (node->vi_key.vk_key_len > sizeof(temp_buf)) {
1479 temp_key = kmem_alloc(temp_key_len, KM_SLEEP);
1480 } else {
1481 temp_key = temp_buf;
1482 }
1483 mutex_enter(&vcache.lock);
1484 memcpy(temp_key, node->vi_key.vk_key, temp_key_len);
1485 node->vi_key.vk_key = temp_key;
1486 mutex_exit(&vcache.lock);
1487
1488 /*
1489 * Clean out any cached data associated with the vnode.
1490 * If purging an active vnode, it must be closed and
1491 * deactivated before being reclaimed. Note that the
1492 * VOP_INACTIVE will unlock the vnode.
1493 */
1494 error = vinvalbuf(vp, V_SAVE, NOCRED, l, 0, 0);
1495 if (error != 0) {
1496 if (wapbl_vphaswapbl(vp))
1497 WAPBL_DISCARD(wapbl_vptomp(vp));
1498 error = vinvalbuf(vp, 0, NOCRED, l, 0, 0);
1499 }
1500 KASSERTMSG((error == 0), "vinvalbuf failed: %d", error);
1501 KASSERT((vp->v_iflag & VI_ONWORKLST) == 0);
1502 if (active && (vp->v_type == VBLK || vp->v_type == VCHR)) {
1503 spec_node_revoke(vp);
1504 }
1505 if (active) {
1506 VOP_INACTIVE(vp, &recycle);
1507 } else {
1508 /*
1509 * Any other processes trying to obtain this lock must first
1510 * wait for VS_RECLAIMED, then call the new lock operation.
1511 */
1512 VOP_UNLOCK(vp);
1513 }
1514
1515 /* Disassociate the underlying file system from the vnode. */
1516 if (VOP_RECLAIM(vp)) {
1517 vnpanic(vp, "%s: cannot reclaim", __func__);
1518 }
1519
1520 KASSERT(vp->v_data == NULL);
1521 KASSERT(vp->v_uobj.uo_npages == 0);
1522
1523 if (vp->v_type == VREG && vp->v_ractx != NULL) {
1524 uvm_ra_freectx(vp->v_ractx);
1525 vp->v_ractx = NULL;
1526 }
1527
1528 /* Purge name cache. */
1529 cache_purge(vp);
1530
1531 /* Move to dead mount. */
1532 vp->v_vflag &= ~VV_ROOT;
1533 atomic_inc_uint(&dead_rootmount->mnt_refcnt);
1534 vfs_insmntque(vp, dead_rootmount);
1535
1536 /* Remove from vnode cache. */
1537 hash = vcache_hash(&node->vi_key);
1538 mutex_enter(&vcache.lock);
1539 KASSERT(node == vcache_hash_lookup(&node->vi_key, hash));
1540 SLIST_REMOVE(&vcache.hashtab[hash & vcache.hashmask],
1541 node, vnode_impl, vi_hash);
1542 mutex_exit(&vcache.lock);
1543 if (temp_key != temp_buf)
1544 kmem_free(temp_key, temp_key_len);
1545
1546 /* Done with purge, notify sleepers of the grim news. */
1547 mutex_enter(vp->v_interlock);
1548 vp->v_op = dead_vnodeop_p;
1549 vp->v_vflag |= VV_LOCKSWORK;
1550 VSTATE_CHANGE(vp, VS_RECLAIMING, VS_RECLAIMED);
1551 vp->v_tag = VT_NON;
1552 KNOTE(&vp->v_klist, NOTE_REVOKE);
1553
1554 KASSERT((vp->v_iflag & VI_ONWORKLST) == 0);
1555}
1556
1557/*
1558 * Update outstanding I/O count and do wakeup if requested.
1559 */
1560void
1561vwakeup(struct buf *bp)
1562{
1563 vnode_t *vp;
1564
1565 if ((vp = bp->b_vp) == NULL)
1566 return;
1567
1568 KASSERT(bp->b_objlock == vp->v_interlock);
1569 KASSERT(mutex_owned(bp->b_objlock));
1570
1571 if (--vp->v_numoutput < 0)
1572 vnpanic(vp, "%s: neg numoutput, vp %p", __func__, vp);
1573 if (vp->v_numoutput == 0)
1574 cv_broadcast(&vp->v_cv);
1575}
1576
1577/*
1578 * Test a vnode for being or becoming dead. Returns one of:
1579 * EBUSY: vnode is becoming dead, with "flags == VDEAD_NOWAIT" only.
1580 * ENOENT: vnode is dead.
1581 * 0: otherwise.
1582 *
1583 * Whenever this function returns a non-zero value all future
1584 * calls will also return a non-zero value.
1585 */
1586int
1587vdead_check(struct vnode *vp, int flags)
1588{
1589
1590 KASSERT(mutex_owned(vp->v_interlock));
1591
1592 if (! ISSET(flags, VDEAD_NOWAIT))
1593 VSTATE_WAIT_STABLE(vp);
1594
1595 if (VSTATE_GET(vp) == VS_RECLAIMING) {
1596 KASSERT(ISSET(flags, VDEAD_NOWAIT));
1597 return EBUSY;
1598 } else if (VSTATE_GET(vp) == VS_RECLAIMED) {
1599 return ENOENT;
1600 }
1601
1602 return 0;
1603}
1604
1605int
1606vfs_drainvnodes(long target)
1607{
1608 int error;
1609
1610 mutex_enter(&vnode_free_list_lock);
1611
1612 while (numvnodes > target) {
1613 error = cleanvnode();
1614 if (error != 0)
1615 return error;
1616 mutex_enter(&vnode_free_list_lock);
1617 }
1618
1619 mutex_exit(&vnode_free_list_lock);
1620
1621 vcache_reinit();
1622
1623 return 0;
1624}
1625
1626void
1627vnpanic(vnode_t *vp, const char *fmt, ...)
1628{
1629 va_list ap;
1630
1631#ifdef DIAGNOSTIC
1632 vprint(NULL, vp);
1633#endif
1634 va_start(ap, fmt);
1635 vpanic(fmt, ap);
1636 va_end(ap);
1637}
1638