1 | /* $NetBSD: vfs_bio.c,v 1.262 2016/10/28 20:17:27 jdolecek Exp $ */ |
2 | |
3 | /*- |
4 | * Copyright (c) 2007, 2008, 2009 The NetBSD Foundation, Inc. |
5 | * All rights reserved. |
6 | * |
7 | * This code is derived from software contributed to The NetBSD Foundation |
8 | * by Andrew Doran, and by Wasabi Systems, Inc. |
9 | * |
10 | * Redistribution and use in source and binary forms, with or without |
11 | * modification, are permitted provided that the following conditions |
12 | * are met: |
13 | * 1. Redistributions of source code must retain the above copyright |
14 | * notice, this list of conditions and the following disclaimer. |
15 | * 2. Redistributions in binary form must reproduce the above copyright |
16 | * notice, this list of conditions and the following disclaimer in the |
17 | * documentation and/or other materials provided with the distribution. |
18 | * |
19 | * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS |
20 | * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED |
21 | * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR |
22 | * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS |
23 | * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR |
24 | * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF |
25 | * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS |
26 | * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN |
27 | * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) |
28 | * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE |
29 | * POSSIBILITY OF SUCH DAMAGE. |
30 | */ |
31 | |
32 | /*- |
33 | * Copyright (c) 1982, 1986, 1989, 1993 |
34 | * The Regents of the University of California. All rights reserved. |
35 | * (c) UNIX System Laboratories, Inc. |
36 | * All or some portions of this file are derived from material licensed |
37 | * to the University of California by American Telephone and Telegraph |
38 | * Co. or Unix System Laboratories, Inc. and are reproduced herein with |
39 | * the permission of UNIX System Laboratories, Inc. |
40 | * |
41 | * Redistribution and use in source and binary forms, with or without |
42 | * modification, are permitted provided that the following conditions |
43 | * are met: |
44 | * 1. Redistributions of source code must retain the above copyright |
45 | * notice, this list of conditions and the following disclaimer. |
46 | * 2. Redistributions in binary form must reproduce the above copyright |
47 | * notice, this list of conditions and the following disclaimer in the |
48 | * documentation and/or other materials provided with the distribution. |
49 | * 3. Neither the name of the University nor the names of its contributors |
50 | * may be used to endorse or promote products derived from this software |
51 | * without specific prior written permission. |
52 | * |
53 | * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND |
54 | * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE |
55 | * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE |
56 | * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE |
57 | * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL |
58 | * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS |
59 | * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) |
60 | * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT |
61 | * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY |
62 | * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF |
63 | * SUCH DAMAGE. |
64 | * |
65 | * @(#)vfs_bio.c 8.6 (Berkeley) 1/11/94 |
66 | */ |
67 | |
68 | /*- |
69 | * Copyright (c) 1994 Christopher G. Demetriou |
70 | * |
71 | * Redistribution and use in source and binary forms, with or without |
72 | * modification, are permitted provided that the following conditions |
73 | * are met: |
74 | * 1. Redistributions of source code must retain the above copyright |
75 | * notice, this list of conditions and the following disclaimer. |
76 | * 2. Redistributions in binary form must reproduce the above copyright |
77 | * notice, this list of conditions and the following disclaimer in the |
78 | * documentation and/or other materials provided with the distribution. |
79 | * 3. All advertising materials mentioning features or use of this software |
80 | * must display the following acknowledgement: |
81 | * This product includes software developed by the University of |
82 | * California, Berkeley and its contributors. |
83 | * 4. Neither the name of the University nor the names of its contributors |
84 | * may be used to endorse or promote products derived from this software |
85 | * without specific prior written permission. |
86 | * |
87 | * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND |
88 | * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE |
89 | * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE |
90 | * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE |
91 | * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL |
92 | * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS |
93 | * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) |
94 | * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT |
95 | * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY |
96 | * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF |
97 | * SUCH DAMAGE. |
98 | * |
99 | * @(#)vfs_bio.c 8.6 (Berkeley) 1/11/94 |
100 | */ |
101 | |
102 | /* |
103 | * The buffer cache subsystem. |
104 | * |
105 | * Some references: |
106 | * Bach: The Design of the UNIX Operating System (Prentice Hall, 1986) |
107 | * Leffler, et al.: The Design and Implementation of the 4.3BSD |
 *		UNIX Operating System (Addison Wesley, 1989)
109 | * |
110 | * Locking |
111 | * |
112 | * There are three locks: |
113 | * - bufcache_lock: protects global buffer cache state. |
114 | * - BC_BUSY: a long term per-buffer lock. |
115 | * - buf_t::b_objlock: lock on completion (biowait vs biodone). |
116 | * |
 * For buffers associated with vnodes (the most common case), b_objlock
 * points to vnode_t::v_interlock.  Otherwise, it points to the generic
 * buffer_lock.
119 | * |
120 | * Lock order: |
121 | * bufcache_lock -> |
122 | * buf_t::b_objlock |
123 | */ |
124 | |
125 | #include <sys/cdefs.h> |
__KERNEL_RCSID(0, "$NetBSD: vfs_bio.c,v 1.262 2016/10/28 20:17:27 jdolecek Exp $");
127 | |
128 | #ifdef _KERNEL_OPT |
129 | #include "opt_bufcache.h" |
130 | #include "opt_dtrace.h" |
131 | #endif |
132 | |
133 | #include <sys/param.h> |
134 | #include <sys/systm.h> |
135 | #include <sys/kernel.h> |
136 | #include <sys/proc.h> |
137 | #include <sys/buf.h> |
138 | #include <sys/vnode.h> |
139 | #include <sys/mount.h> |
140 | #include <sys/resourcevar.h> |
141 | #include <sys/sysctl.h> |
142 | #include <sys/conf.h> |
143 | #include <sys/kauth.h> |
144 | #include <sys/fstrans.h> |
145 | #include <sys/intr.h> |
146 | #include <sys/cpu.h> |
147 | #include <sys/wapbl.h> |
148 | #include <sys/bitops.h> |
149 | #include <sys/cprng.h> |
150 | #include <sys/sdt.h> |
151 | |
152 | #include <uvm/uvm.h> /* extern struct uvm uvm */ |
153 | |
154 | #include <miscfs/specfs/specdev.h> |
155 | |
156 | #ifndef BUFPAGES |
157 | # define BUFPAGES 0 |
158 | #endif |
159 | |
160 | #ifdef BUFCACHE |
161 | # if (BUFCACHE < 5) || (BUFCACHE > 95) |
162 | # error BUFCACHE is not between 5 and 95 |
163 | # endif |
164 | #else |
165 | # define BUFCACHE 15 |
166 | #endif |
167 | |
168 | u_int nbuf; /* desired number of buffer headers */ |
169 | u_int bufpages = BUFPAGES; /* optional hardwired count */ |
170 | u_int bufcache = BUFCACHE; /* max % of RAM to use for buffer cache */ |
171 | |
172 | /* Function prototypes */ |
173 | struct bqueue; |
174 | |
175 | static void buf_setwm(void); |
176 | static int buf_trim(void); |
177 | static void *bufpool_page_alloc(struct pool *, int); |
178 | static void bufpool_page_free(struct pool *, void *); |
179 | static buf_t *bio_doread(struct vnode *, daddr_t, int, int); |
180 | static buf_t *getnewbuf(int, int, int); |
181 | static int buf_lotsfree(void); |
182 | static int buf_canrelease(void); |
183 | static u_long buf_mempoolidx(u_long); |
184 | static u_long buf_roundsize(u_long); |
185 | static void *buf_alloc(size_t); |
186 | static void buf_mrelease(void *, size_t); |
187 | static void binsheadfree(buf_t *, struct bqueue *); |
188 | static void binstailfree(buf_t *, struct bqueue *); |
189 | #ifdef DEBUG |
190 | static int checkfreelist(buf_t *, struct bqueue *, int); |
191 | #endif |
192 | static void biointr(void *); |
193 | static void biodone2(buf_t *); |
194 | static void bref(buf_t *); |
195 | static void brele(buf_t *); |
196 | static void sysctl_kern_buf_setup(void); |
197 | static void sysctl_vm_buf_setup(void); |
198 | |
199 | /* |
200 | * Definitions for the buffer hash lists. |
201 | */ |
202 | #define BUFHASH(dvp, lbn) \ |
203 | (&bufhashtbl[(((long)(dvp) >> 8) + (int)(lbn)) & bufhash]) |
204 | LIST_HEAD(bufhashhdr, buf) *bufhashtbl, invalhash; |
205 | u_long bufhash; |
206 | struct bqueue bufqueues[BQUEUES]; |
207 | |
208 | static kcondvar_t needbuffer_cv; |
209 | |
210 | /* |
211 | * Buffer queue lock. |
212 | */ |
213 | kmutex_t bufcache_lock; |
214 | kmutex_t buffer_lock; |
215 | |
216 | /* Software ISR for completed transfers. */ |
217 | static void *biodone_sih; |
218 | |
219 | /* Buffer pool for I/O buffers. */ |
220 | static pool_cache_t buf_cache; |
221 | static pool_cache_t bufio_cache; |
222 | |
223 | #define MEMPOOL_INDEX_OFFSET (ilog2(DEV_BSIZE)) /* smallest pool is 512 bytes */ |
224 | #define NMEMPOOLS (ilog2(MAXBSIZE) - MEMPOOL_INDEX_OFFSET + 1) |
225 | __CTASSERT((1 << (NMEMPOOLS + MEMPOOL_INDEX_OFFSET - 1)) == MAXBSIZE); |
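
/*
 * Worked example of the pool sizing above, assuming the common
 * MAXBSIZE of 64KB: MEMPOOL_INDEX_OFFSET = ilog2(512) = 9 and
 * NMEMPOOLS = ilog2(65536) - 9 + 1 = 8, so the pools serve
 * power-of-2 sizes 512B, 1KB, 2KB, 4KB, 8KB, 16KB, 32KB and 64KB.
 */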
226 | |
227 | /* Buffer memory pools */ |
228 | static struct pool bmempools[NMEMPOOLS]; |
229 | |
230 | static struct vm_map *buf_map; |
231 | |
232 | /* |
233 | * Buffer memory pool allocator. |
234 | */ |
235 | static void * |
236 | bufpool_page_alloc(struct pool *pp, int flags) |
237 | { |
238 | |
239 | return (void *)uvm_km_alloc(buf_map, |
240 | MAXBSIZE, MAXBSIZE, |
241 | ((flags & PR_WAITOK) ? 0 : UVM_KMF_NOWAIT|UVM_KMF_TRYLOCK) |
242 | | UVM_KMF_WIRED); |
243 | } |
244 | |
245 | static void |
246 | bufpool_page_free(struct pool *pp, void *v) |
247 | { |
248 | |
249 | uvm_km_free(buf_map, (vaddr_t)v, MAXBSIZE, UVM_KMF_WIRED); |
250 | } |
251 | |
252 | static struct pool_allocator bufmempool_allocator = { |
253 | .pa_alloc = bufpool_page_alloc, |
254 | .pa_free = bufpool_page_free, |
255 | .pa_pagesz = MAXBSIZE, |
256 | }; |
257 | |
258 | /* Buffer memory management variables */ |
259 | u_long bufmem_valimit; |
260 | u_long bufmem_hiwater; |
261 | u_long bufmem_lowater; |
262 | u_long bufmem; |
263 | |
264 | /* |
265 | * MD code can call this to set a hard limit on the amount |
266 | * of virtual memory used by the buffer cache. |
267 | */ |
268 | int |
269 | buf_setvalimit(vsize_t sz) |
270 | { |
271 | |
272 | /* We need to accommodate at least NMEMPOOLS of MAXBSIZE each */ |
273 | if (sz < NMEMPOOLS * MAXBSIZE) |
274 | return EINVAL; |
275 | |
276 | bufmem_valimit = sz; |
277 | return 0; |
278 | } |
279 | |
280 | static void |
281 | buf_setwm(void) |
282 | { |
283 | |
284 | bufmem_hiwater = buf_memcalc(); |
285 | /* lowater is approx. 2% of memory (with bufcache = 15) */ |
286 | #define BUFMEM_WMSHIFT 3 |
287 | #define BUFMEM_HIWMMIN (64 * 1024 << BUFMEM_WMSHIFT) |
288 | if (bufmem_hiwater < BUFMEM_HIWMMIN) |
289 | /* Ensure a reasonable minimum value */ |
290 | bufmem_hiwater = BUFMEM_HIWMMIN; |
291 | bufmem_lowater = bufmem_hiwater >> BUFMEM_WMSHIFT; |
292 | } |
293 | |
294 | #ifdef DEBUG |
295 | int debug_verify_freelist = 0; |
296 | static int |
297 | checkfreelist(buf_t *bp, struct bqueue *dp, int ison) |
298 | { |
299 | buf_t *b; |
300 | |
301 | if (!debug_verify_freelist) |
302 | return 1; |
303 | |
304 | TAILQ_FOREACH(b, &dp->bq_queue, b_freelist) { |
305 | if (b == bp) |
306 | return ison ? 1 : 0; |
307 | } |
308 | |
309 | return ison ? 0 : 1; |
310 | } |
311 | #endif |
312 | |
313 | /* |
 * Insq/Remq for the buffer free lists.
315 | * Call with buffer queue locked. |
316 | */ |
317 | static void |
318 | binsheadfree(buf_t *bp, struct bqueue *dp) |
319 | { |
320 | |
321 | KASSERT(mutex_owned(&bufcache_lock)); |
322 | KASSERT(bp->b_freelistindex == -1); |
323 | TAILQ_INSERT_HEAD(&dp->bq_queue, bp, b_freelist); |
324 | dp->bq_bytes += bp->b_bufsize; |
325 | bp->b_freelistindex = dp - bufqueues; |
326 | } |
327 | |
328 | static void |
329 | binstailfree(buf_t *bp, struct bqueue *dp) |
330 | { |
331 | |
332 | KASSERT(mutex_owned(&bufcache_lock)); |
	KASSERTMSG(bp->b_freelistindex == -1, "double free of buffer? "
	    "bp=%p, b_freelistindex=%d\n", bp, bp->b_freelistindex);
335 | TAILQ_INSERT_TAIL(&dp->bq_queue, bp, b_freelist); |
336 | dp->bq_bytes += bp->b_bufsize; |
337 | bp->b_freelistindex = dp - bufqueues; |
338 | } |
339 | |
340 | void |
341 | bremfree(buf_t *bp) |
342 | { |
343 | struct bqueue *dp; |
344 | int bqidx = bp->b_freelistindex; |
345 | |
346 | KASSERT(mutex_owned(&bufcache_lock)); |
347 | |
348 | KASSERT(bqidx != -1); |
349 | dp = &bufqueues[bqidx]; |
350 | KDASSERT(checkfreelist(bp, dp, 1)); |
351 | KASSERT(dp->bq_bytes >= bp->b_bufsize); |
352 | TAILQ_REMOVE(&dp->bq_queue, bp, b_freelist); |
353 | dp->bq_bytes -= bp->b_bufsize; |
354 | |
355 | /* For the sysctl helper. */ |
356 | if (bp == dp->bq_marker) |
357 | dp->bq_marker = NULL; |
358 | |
359 | #if defined(DIAGNOSTIC) |
360 | bp->b_freelistindex = -1; |
361 | #endif /* defined(DIAGNOSTIC) */ |
362 | } |
363 | |
364 | /* |
 * Add a reference to a buffer structure that came from buf_cache.
366 | */ |
367 | static inline void |
368 | bref(buf_t *bp) |
369 | { |
370 | |
371 | KASSERT(mutex_owned(&bufcache_lock)); |
372 | KASSERT(bp->b_refcnt > 0); |
373 | |
374 | bp->b_refcnt++; |
375 | } |
376 | |
377 | /* |
378 | * Free an unused buffer structure that came from buf_cache. |
379 | */ |
380 | static inline void |
381 | brele(buf_t *bp) |
382 | { |
383 | |
384 | KASSERT(mutex_owned(&bufcache_lock)); |
385 | KASSERT(bp->b_refcnt > 0); |
386 | |
387 | if (bp->b_refcnt-- == 1) { |
388 | buf_destroy(bp); |
389 | #ifdef DEBUG |
390 | memset((char *)bp, 0, sizeof(*bp)); |
391 | #endif |
392 | pool_cache_put(buf_cache, bp); |
393 | } |
394 | } |
395 | |
396 | /* |
 * Note that for some ports this is used by pmap bootstrap code to
 * determine KVA size.
399 | */ |
400 | u_long |
401 | buf_memcalc(void) |
402 | { |
403 | u_long n; |
404 | vsize_t mapsz = 0; |
405 | |
406 | /* |
407 | * Determine the upper bound of memory to use for buffers. |
408 | * |
	 *	- If bufpages is specified, use that as the number
	 *	  of pages.
411 | * |
412 | * - Otherwise, use bufcache as the percentage of |
413 | * physical memory. |
414 | */ |
415 | if (bufpages != 0) { |
416 | n = bufpages; |
417 | } else { |
418 | if (bufcache < 5) { |
			printf("forcing bufcache %d -> 5", bufcache);
420 | bufcache = 5; |
421 | } |
422 | if (bufcache > 95) { |
			printf("forcing bufcache %d -> 95", bufcache);
424 | bufcache = 95; |
425 | } |
426 | if (buf_map != NULL) |
427 | mapsz = vm_map_max(buf_map) - vm_map_min(buf_map); |
428 | n = calc_cache_size(mapsz, bufcache, |
429 | (buf_map != kernel_map) ? 100 : BUFCACHE_VA_MAXPCT) |
430 | / PAGE_SIZE; |
431 | } |
432 | |
433 | n <<= PAGE_SHIFT; |
434 | if (bufmem_valimit != 0 && n > bufmem_valimit) |
435 | n = bufmem_valimit; |
436 | |
437 | return (n); |
438 | } |
439 | |
440 | /* |
441 | * Initialize buffers and hash links for buffers. |
442 | */ |
443 | void |
444 | bufinit(void) |
445 | { |
446 | struct bqueue *dp; |
447 | int use_std; |
448 | u_int i; |
449 | |
450 | biodone_vfs = biodone; |
451 | |
452 | mutex_init(&bufcache_lock, MUTEX_DEFAULT, IPL_NONE); |
453 | mutex_init(&buffer_lock, MUTEX_DEFAULT, IPL_NONE); |
	cv_init(&needbuffer_cv, "needbuf");
455 | |
456 | if (bufmem_valimit != 0) { |
457 | vaddr_t minaddr = 0, maxaddr; |
458 | buf_map = uvm_km_suballoc(kernel_map, &minaddr, &maxaddr, |
459 | bufmem_valimit, 0, false, 0); |
460 | if (buf_map == NULL) |
			panic("bufinit: cannot allocate submap");
462 | } else |
463 | buf_map = kernel_map; |
464 | |
465 | /* |
466 | * Initialize buffer cache memory parameters. |
467 | */ |
468 | bufmem = 0; |
469 | buf_setwm(); |
470 | |
471 | /* On "small" machines use small pool page sizes where possible */ |
472 | use_std = (physmem < atop(16*1024*1024)); |
473 | |
474 | /* |
475 | * Also use them on systems that can map the pool pages using |
476 | * a direct-mapped segment. |
477 | */ |
478 | #ifdef PMAP_MAP_POOLPAGE |
479 | use_std = 1; |
480 | #endif |
481 | |
	buf_cache = pool_cache_init(sizeof(buf_t), 0, 0, 0,
	    "bufpl", NULL, IPL_SOFTBIO, NULL, NULL, NULL);
	bufio_cache = pool_cache_init(sizeof(buf_t), 0, 0, 0,
	    "biopl", NULL, IPL_BIO, NULL, NULL, NULL);
486 | |
487 | for (i = 0; i < NMEMPOOLS; i++) { |
488 | struct pool_allocator *pa; |
489 | struct pool *pp = &bmempools[i]; |
490 | u_int size = 1 << (i + MEMPOOL_INDEX_OFFSET); |
491 | char *name = kmem_alloc(8, KM_SLEEP); /* XXX: never freed */ |
		if (__predict_false(size >= 1048576))
			(void)snprintf(name, 8, "buf%um", size / 1048576);
		else if (__predict_true(size >= 1024))
			(void)snprintf(name, 8, "buf%uk", size / 1024);
		else
			(void)snprintf(name, 8, "buf%ub", size);
498 | pa = (size <= PAGE_SIZE && use_std) |
499 | ? &pool_allocator_nointr |
500 | : &bufmempool_allocator; |
501 | pool_init(pp, size, 0, 0, 0, name, pa, IPL_NONE); |
502 | pool_setlowat(pp, 1); |
503 | pool_sethiwat(pp, 1); |
504 | } |
505 | |
506 | /* Initialize the buffer queues */ |
507 | for (dp = bufqueues; dp < &bufqueues[BQUEUES]; dp++) { |
508 | TAILQ_INIT(&dp->bq_queue); |
509 | dp->bq_bytes = 0; |
510 | } |
511 | |
512 | /* |
513 | * Estimate hash table size based on the amount of memory we |
514 | * intend to use for the buffer cache. The average buffer |
515 | * size is dependent on our clients (i.e. filesystems). |
516 | * |
517 | * For now, use an empirical 3K per buffer. |
518 | */ |
519 | nbuf = (bufmem_hiwater / 1024) / 3; |
520 | bufhashtbl = hashinit(nbuf, HASH_LIST, true, &bufhash); |
521 | |
522 | sysctl_kern_buf_setup(); |
523 | sysctl_vm_buf_setup(); |
524 | } |
525 | |
526 | void |
527 | bufinit2(void) |
528 | { |
529 | |
530 | biodone_sih = softint_establish(SOFTINT_BIO | SOFTINT_MPSAFE, biointr, |
531 | NULL); |
532 | if (biodone_sih == NULL) |
		panic("bufinit2: can't establish soft interrupt");
534 | } |
535 | |
536 | static int |
537 | buf_lotsfree(void) |
538 | { |
539 | u_long guess; |
540 | |
541 | /* Always allocate if less than the low water mark. */ |
542 | if (bufmem < bufmem_lowater) |
543 | return 1; |
544 | |
545 | /* Never allocate if greater than the high water mark. */ |
546 | if (bufmem > bufmem_hiwater) |
547 | return 0; |
548 | |
549 | /* If there's anything on the AGE list, it should be eaten. */ |
550 | if (TAILQ_FIRST(&bufqueues[BQ_AGE].bq_queue) != NULL) |
551 | return 0; |
552 | |
553 | /* |
	 * The probability of getting a new allocation is inversely
555 | * proportional to the current size of the cache above |
556 | * the low water mark. Divide the total first to avoid overflows |
557 | * in the product. |
558 | */ |
559 | guess = cprng_fast32() % 16; |
560 | |
561 | if ((bufmem_hiwater - bufmem_lowater) / 16 * guess >= |
562 | (bufmem - bufmem_lowater)) |
563 | return 1; |
564 | |
565 | /* Otherwise don't allocate. */ |
566 | return 0; |
567 | } |
568 | |
569 | /* |
570 | * Return estimate of bytes we think need to be |
571 | * released to help resolve low memory conditions. |
572 | * |
573 | * => called with bufcache_lock held. |
574 | */ |
575 | static int |
576 | buf_canrelease(void) |
577 | { |
578 | int pagedemand, ninvalid = 0; |
579 | |
580 | KASSERT(mutex_owned(&bufcache_lock)); |
581 | |
582 | if (bufmem < bufmem_lowater) |
583 | return 0; |
584 | |
585 | if (bufmem > bufmem_hiwater) |
586 | return bufmem - bufmem_hiwater; |
587 | |
588 | ninvalid += bufqueues[BQ_AGE].bq_bytes; |
589 | |
590 | pagedemand = uvmexp.freetarg - uvmexp.free; |
591 | if (pagedemand < 0) |
592 | return ninvalid; |
593 | return MAX(ninvalid, MIN(2 * MAXBSIZE, |
594 | MIN((bufmem - bufmem_lowater) / 16, pagedemand * PAGE_SIZE))); |
595 | } |
596 | |
597 | /* |
598 | * Buffer memory allocation helper functions |
599 | */ |
600 | static u_long |
601 | buf_mempoolidx(u_long size) |
602 | { |
603 | u_int n = 0; |
604 | |
605 | size -= 1; |
606 | size >>= MEMPOOL_INDEX_OFFSET; |
607 | while (size) { |
608 | size >>= 1; |
609 | n += 1; |
610 | } |
611 | if (n >= NMEMPOOLS) |
		panic("buf mem pool index %d", n);
613 | return n; |
614 | } |
615 | |
616 | static u_long |
617 | buf_roundsize(u_long size) |
618 | { |
619 | /* Round up to nearest power of 2 */ |
620 | return (1 << (buf_mempoolidx(size) + MEMPOOL_INDEX_OFFSET)); |
621 | } |
622 | |
623 | static void * |
624 | buf_alloc(size_t size) |
625 | { |
626 | u_int n = buf_mempoolidx(size); |
627 | void *addr; |
628 | |
629 | while (1) { |
630 | addr = pool_get(&bmempools[n], PR_NOWAIT); |
631 | if (addr != NULL) |
632 | break; |
633 | |
634 | /* No memory, see if we can free some. If so, try again */ |
635 | mutex_enter(&bufcache_lock); |
636 | if (buf_drain(1) > 0) { |
637 | mutex_exit(&bufcache_lock); |
638 | continue; |
639 | } |
640 | |
641 | if (curlwp == uvm.pagedaemon_lwp) { |
642 | mutex_exit(&bufcache_lock); |
643 | return NULL; |
644 | } |
645 | |
646 | /* Wait for buffers to arrive on the LRU queue */ |
647 | cv_timedwait(&needbuffer_cv, &bufcache_lock, hz / 4); |
648 | mutex_exit(&bufcache_lock); |
649 | } |
650 | |
651 | return addr; |
652 | } |
653 | |
654 | static void |
655 | buf_mrelease(void *addr, size_t size) |
656 | { |
657 | |
658 | pool_put(&bmempools[buf_mempoolidx(size)], addr); |
659 | } |
660 | |
661 | /* |
662 | * bread()/breadn() helper. |
663 | */ |
664 | static buf_t * |
665 | bio_doread(struct vnode *vp, daddr_t blkno, int size, int async) |
666 | { |
667 | buf_t *bp; |
668 | struct mount *mp; |
669 | |
670 | bp = getblk(vp, blkno, size, 0, 0); |
671 | |
672 | /* |
673 | * getblk() may return NULL if we are the pagedaemon. |
674 | */ |
675 | if (bp == NULL) { |
676 | KASSERT(curlwp == uvm.pagedaemon_lwp); |
677 | return NULL; |
678 | } |
679 | |
680 | /* |
681 | * If buffer does not have data valid, start a read. |
682 | * Note that if buffer is BC_INVAL, getblk() won't return it. |
683 | * Therefore, it's valid if its I/O has completed or been delayed. |
684 | */ |
685 | if (!ISSET(bp->b_oflags, (BO_DONE | BO_DELWRI))) { |
686 | /* Start I/O for the buffer. */ |
687 | SET(bp->b_flags, B_READ | async); |
688 | if (async) |
689 | BIO_SETPRIO(bp, BPRIO_TIMELIMITED); |
690 | else |
691 | BIO_SETPRIO(bp, BPRIO_TIMECRITICAL); |
692 | VOP_STRATEGY(vp, bp); |
693 | |
694 | /* Pay for the read. */ |
695 | curlwp->l_ru.ru_inblock++; |
696 | } else if (async) |
697 | brelse(bp, 0); |
698 | |
699 | if (vp->v_type == VBLK) |
700 | mp = spec_node_getmountedfs(vp); |
701 | else |
702 | mp = vp->v_mount; |
703 | |
704 | /* |
705 | * Collect statistics on synchronous and asynchronous reads. |
706 | * Reads from block devices are charged to their associated |
707 | * filesystem (if any). |
708 | */ |
709 | if (mp != NULL) { |
710 | if (async == 0) |
711 | mp->mnt_stat.f_syncreads++; |
712 | else |
713 | mp->mnt_stat.f_asyncreads++; |
714 | } |
715 | |
716 | return (bp); |
717 | } |
718 | |
719 | /* |
720 | * Read a disk block. |
721 | * This algorithm described in Bach (p.54). |
722 | */ |
723 | int |
724 | bread(struct vnode *vp, daddr_t blkno, int size, int flags, buf_t **bpp) |
725 | { |
726 | buf_t *bp; |
727 | int error; |
728 | |
729 | /* Get buffer for block. */ |
730 | bp = *bpp = bio_doread(vp, blkno, size, 0); |
731 | if (bp == NULL) |
732 | return ENOMEM; |
733 | |
734 | /* Wait for the read to complete, and return result. */ |
735 | error = biowait(bp); |
736 | if (error == 0 && (flags & B_MODIFY) != 0) |
737 | error = fscow_run(bp, true); |
738 | if (error) { |
739 | brelse(bp, 0); |
740 | *bpp = NULL; |
741 | } |
742 | |
743 | return error; |
744 | } |
745 | |
746 | /* |
747 | * Read-ahead multiple disk blocks. The first is sync, the rest async. |
748 | * Trivial modification to the breada algorithm presented in Bach (p.55). |
749 | */ |
750 | int |
751 | breadn(struct vnode *vp, daddr_t blkno, int size, daddr_t *rablks, |
752 | int *rasizes, int nrablks, int flags, buf_t **bpp) |
753 | { |
754 | buf_t *bp; |
755 | int error, i; |
756 | |
757 | bp = *bpp = bio_doread(vp, blkno, size, 0); |
758 | if (bp == NULL) |
759 | return ENOMEM; |
760 | |
761 | /* |
762 | * For each of the read-ahead blocks, start a read, if necessary. |
763 | */ |
764 | mutex_enter(&bufcache_lock); |
765 | for (i = 0; i < nrablks; i++) { |
766 | /* If it's in the cache, just go on to next one. */ |
767 | if (incore(vp, rablks[i])) |
768 | continue; |
769 | |
770 | /* Get a buffer for the read-ahead block */ |
771 | mutex_exit(&bufcache_lock); |
772 | (void) bio_doread(vp, rablks[i], rasizes[i], B_ASYNC); |
773 | mutex_enter(&bufcache_lock); |
774 | } |
775 | mutex_exit(&bufcache_lock); |
776 | |
777 | /* Otherwise, we had to start a read for it; wait until it's valid. */ |
778 | error = biowait(bp); |
779 | if (error == 0 && (flags & B_MODIFY) != 0) |
780 | error = fscow_run(bp, true); |
781 | if (error) { |
782 | brelse(bp, 0); |
783 | *bpp = NULL; |
784 | } |
785 | |
786 | return error; |
787 | } |
788 | |
789 | /* |
790 | * Block write. Described in Bach (p.56) |
791 | */ |
792 | int |
793 | bwrite(buf_t *bp) |
794 | { |
795 | int rv, sync, wasdelayed; |
796 | struct vnode *vp; |
797 | struct mount *mp; |
798 | |
799 | KASSERT(ISSET(bp->b_cflags, BC_BUSY)); |
800 | KASSERT(!cv_has_waiters(&bp->b_done)); |
801 | |
802 | vp = bp->b_vp; |
803 | |
804 | /* |
805 | * dholland 20160728 AFAICT vp==NULL must be impossible as it |
806 | * will crash upon reaching VOP_STRATEGY below... see further |
807 | * analysis on tech-kern. |
808 | */ |
	KASSERTMSG(vp != NULL, "bwrite given buffer with null vnode");
810 | |
811 | if (vp != NULL) { |
812 | KASSERT(bp->b_objlock == vp->v_interlock); |
813 | if (vp->v_type == VBLK) |
814 | mp = spec_node_getmountedfs(vp); |
815 | else |
816 | mp = vp->v_mount; |
817 | } else { |
818 | mp = NULL; |
819 | } |
820 | |
821 | if (mp && mp->mnt_wapbl) { |
822 | if (bp->b_iodone != mp->mnt_wapbl_op->wo_wapbl_biodone) { |
823 | bdwrite(bp); |
824 | return 0; |
825 | } |
826 | } |
827 | |
828 | /* |
829 | * Remember buffer type, to switch on it later. If the write was |
830 | * synchronous, but the file system was mounted with MNT_ASYNC, |
831 | * convert it to a delayed write. |
832 | * XXX note that this relies on delayed tape writes being converted |
833 | * to async, not sync writes (which is safe, but ugly). |
834 | */ |
835 | sync = !ISSET(bp->b_flags, B_ASYNC); |
836 | if (sync && mp != NULL && ISSET(mp->mnt_flag, MNT_ASYNC)) { |
837 | bdwrite(bp); |
838 | return (0); |
839 | } |
840 | |
841 | /* |
842 | * Collect statistics on synchronous and asynchronous writes. |
843 | * Writes to block devices are charged to their associated |
844 | * filesystem (if any). |
845 | */ |
846 | if (mp != NULL) { |
847 | if (sync) |
848 | mp->mnt_stat.f_syncwrites++; |
849 | else |
850 | mp->mnt_stat.f_asyncwrites++; |
851 | } |
852 | |
853 | /* |
854 | * Pay for the I/O operation and make sure the buf is on the correct |
855 | * vnode queue. |
856 | */ |
857 | bp->b_error = 0; |
858 | wasdelayed = ISSET(bp->b_oflags, BO_DELWRI); |
859 | CLR(bp->b_flags, B_READ); |
860 | if (wasdelayed) { |
861 | mutex_enter(&bufcache_lock); |
862 | mutex_enter(bp->b_objlock); |
863 | CLR(bp->b_oflags, BO_DONE | BO_DELWRI); |
864 | reassignbuf(bp, bp->b_vp); |
865 | mutex_exit(&bufcache_lock); |
866 | } else { |
867 | curlwp->l_ru.ru_oublock++; |
868 | mutex_enter(bp->b_objlock); |
869 | CLR(bp->b_oflags, BO_DONE | BO_DELWRI); |
870 | } |
871 | if (vp != NULL) |
872 | vp->v_numoutput++; |
873 | mutex_exit(bp->b_objlock); |
874 | |
875 | /* Initiate disk write. */ |
876 | if (sync) |
877 | BIO_SETPRIO(bp, BPRIO_TIMECRITICAL); |
878 | else |
879 | BIO_SETPRIO(bp, BPRIO_TIMELIMITED); |
880 | |
881 | VOP_STRATEGY(vp, bp); |
882 | |
883 | if (sync) { |
884 | /* If I/O was synchronous, wait for it to complete. */ |
885 | rv = biowait(bp); |
886 | |
887 | /* Release the buffer. */ |
888 | brelse(bp, 0); |
889 | |
890 | return (rv); |
891 | } else { |
892 | return (0); |
893 | } |
894 | } |
895 | |
896 | int |
897 | vn_bwrite(void *v) |
898 | { |
899 | struct vop_bwrite_args *ap = v; |
900 | |
901 | return (bwrite(ap->a_bp)); |
902 | } |
903 | |
904 | /* |
905 | * Delayed write. |
906 | * |
907 | * The buffer is marked dirty, but is not queued for I/O. |
908 | * This routine should be used when the buffer is expected |
909 | * to be modified again soon, typically a small write that |
910 | * partially fills a buffer. |
911 | * |
912 | * NB: magnetic tapes cannot be delayed; they must be |
913 | * written in the order that the writes are requested. |
914 | * |
915 | * Described in Leffler, et al. (pp. 208-213). |
916 | */ |
917 | void |
918 | bdwrite(buf_t *bp) |
919 | { |
920 | |
921 | KASSERT(bp->b_vp == NULL || bp->b_vp->v_tag != VT_UFS || |
922 | bp->b_vp->v_type == VBLK || ISSET(bp->b_flags, B_COWDONE)); |
923 | KASSERT(ISSET(bp->b_cflags, BC_BUSY)); |
924 | KASSERT(!cv_has_waiters(&bp->b_done)); |
925 | |
926 | /* If this is a tape block, write the block now. */ |
927 | if (bdev_type(bp->b_dev) == D_TAPE) { |
928 | bawrite(bp); |
929 | return; |
930 | } |
931 | |
932 | if (wapbl_vphaswapbl(bp->b_vp)) { |
933 | struct mount *mp = wapbl_vptomp(bp->b_vp); |
934 | |
935 | if (bp->b_iodone != mp->mnt_wapbl_op->wo_wapbl_biodone) { |
936 | WAPBL_ADD_BUF(mp, bp); |
937 | } |
938 | } |
939 | |
940 | /* |
941 | * If the block hasn't been seen before: |
942 | * (1) Mark it as having been seen, |
943 | * (2) Charge for the write, |
944 | * (3) Make sure it's on its vnode's correct block list. |
945 | */ |
946 | KASSERT(bp->b_vp == NULL || bp->b_objlock == bp->b_vp->v_interlock); |
947 | |
948 | if (!ISSET(bp->b_oflags, BO_DELWRI)) { |
949 | mutex_enter(&bufcache_lock); |
950 | mutex_enter(bp->b_objlock); |
951 | SET(bp->b_oflags, BO_DELWRI); |
952 | curlwp->l_ru.ru_oublock++; |
953 | reassignbuf(bp, bp->b_vp); |
954 | mutex_exit(&bufcache_lock); |
955 | } else { |
956 | mutex_enter(bp->b_objlock); |
957 | } |
958 | /* Otherwise, the "write" is done, so mark and release the buffer. */ |
959 | CLR(bp->b_oflags, BO_DONE); |
960 | mutex_exit(bp->b_objlock); |
961 | |
962 | brelse(bp, 0); |
963 | } |
964 | |
965 | /* |
966 | * Asynchronous block write; just an asynchronous bwrite(). |
967 | */ |
968 | void |
969 | bawrite(buf_t *bp) |
970 | { |
971 | |
972 | KASSERT(ISSET(bp->b_cflags, BC_BUSY)); |
973 | KASSERT(bp->b_vp != NULL); |
974 | |
975 | SET(bp->b_flags, B_ASYNC); |
976 | VOP_BWRITE(bp->b_vp, bp); |
977 | } |
978 | |
979 | /* |
980 | * Release a buffer on to the free lists. |
981 | * Described in Bach (p. 46). |
982 | */ |
983 | void |
984 | brelsel(buf_t *bp, int set) |
985 | { |
986 | struct bqueue *bufq; |
987 | struct vnode *vp; |
988 | |
989 | KASSERT(bp != NULL); |
990 | KASSERT(mutex_owned(&bufcache_lock)); |
991 | KASSERT(!cv_has_waiters(&bp->b_done)); |
992 | KASSERT(bp->b_refcnt > 0); |
993 | |
994 | SET(bp->b_cflags, set); |
995 | |
996 | KASSERT(ISSET(bp->b_cflags, BC_BUSY)); |
997 | KASSERT(bp->b_iodone == NULL); |
998 | |
999 | /* Wake up any processes waiting for any buffer to become free. */ |
1000 | cv_signal(&needbuffer_cv); |
1001 | |
	/* Wake up any processes waiting for _this_ buffer to become free */
1003 | if (ISSET(bp->b_cflags, BC_WANTED)) |
1004 | CLR(bp->b_cflags, BC_WANTED|BC_AGE); |
1005 | |
1006 | /* If it's clean clear the copy-on-write flag. */ |
1007 | if (ISSET(bp->b_flags, B_COWDONE)) { |
1008 | mutex_enter(bp->b_objlock); |
1009 | if (!ISSET(bp->b_oflags, BO_DELWRI)) |
1010 | CLR(bp->b_flags, B_COWDONE); |
1011 | mutex_exit(bp->b_objlock); |
1012 | } |
1013 | |
1014 | /* |
1015 | * Determine which queue the buffer should be on, then put it there. |
1016 | */ |
1017 | |
1018 | /* If it's locked, don't report an error; try again later. */ |
1019 | if (ISSET(bp->b_flags, B_LOCKED)) |
1020 | bp->b_error = 0; |
1021 | |
1022 | /* If it's not cacheable, or an error, mark it invalid. */ |
1023 | if (ISSET(bp->b_cflags, BC_NOCACHE) || bp->b_error != 0) |
1024 | SET(bp->b_cflags, BC_INVAL); |
1025 | |
1026 | if (ISSET(bp->b_cflags, BC_VFLUSH)) { |
1027 | /* |
1028 | * This is a delayed write buffer that was just flushed to |
1029 | * disk. It is still on the LRU queue. If it's become |
1030 | * invalid, then we need to move it to a different queue; |
1031 | * otherwise leave it in its current position. |
1032 | */ |
1033 | CLR(bp->b_cflags, BC_VFLUSH); |
1034 | if (!ISSET(bp->b_cflags, BC_INVAL|BC_AGE) && |
1035 | !ISSET(bp->b_flags, B_LOCKED) && bp->b_error == 0) { |
1036 | KDASSERT(checkfreelist(bp, &bufqueues[BQ_LRU], 1)); |
1037 | goto already_queued; |
1038 | } else { |
1039 | bremfree(bp); |
1040 | } |
1041 | } |
1042 | |
1043 | KDASSERT(checkfreelist(bp, &bufqueues[BQ_AGE], 0)); |
1044 | KDASSERT(checkfreelist(bp, &bufqueues[BQ_LRU], 0)); |
1045 | KDASSERT(checkfreelist(bp, &bufqueues[BQ_LOCKED], 0)); |
1046 | |
1047 | if ((bp->b_bufsize <= 0) || ISSET(bp->b_cflags, BC_INVAL)) { |
1048 | /* |
1049 | * If it's invalid or empty, dissociate it from its vnode |
		 * and put it on the head of the appropriate queue.
1051 | */ |
1052 | if (ISSET(bp->b_flags, B_LOCKED)) { |
1053 | if (wapbl_vphaswapbl(vp = bp->b_vp)) { |
1054 | struct mount *mp = wapbl_vptomp(vp); |
1055 | |
1056 | KASSERT(bp->b_iodone |
1057 | != mp->mnt_wapbl_op->wo_wapbl_biodone); |
1058 | WAPBL_REMOVE_BUF(mp, bp); |
1059 | } |
1060 | } |
1061 | |
1062 | mutex_enter(bp->b_objlock); |
1063 | CLR(bp->b_oflags, BO_DONE|BO_DELWRI); |
1064 | if ((vp = bp->b_vp) != NULL) { |
1065 | KASSERT(bp->b_objlock == vp->v_interlock); |
1066 | reassignbuf(bp, bp->b_vp); |
1067 | brelvp(bp); |
1068 | mutex_exit(vp->v_interlock); |
1069 | } else { |
1070 | KASSERT(bp->b_objlock == &buffer_lock); |
1071 | mutex_exit(bp->b_objlock); |
1072 | } |
1073 | |
1074 | if (bp->b_bufsize <= 0) |
1075 | /* no data */ |
1076 | goto already_queued; |
1077 | else |
1078 | /* invalid data */ |
1079 | bufq = &bufqueues[BQ_AGE]; |
1080 | binsheadfree(bp, bufq); |
1081 | } else { |
1082 | /* |
1083 | * It has valid data. Put it on the end of the appropriate |
1084 | * queue, so that it'll stick around for as long as possible. |
1085 | * If buf is AGE, but has dependencies, must put it on last |
1086 | * bufqueue to be scanned, ie LRU. This protects against the |
1087 | * livelock where BQ_AGE only has buffers with dependencies, |
1088 | * and we thus never get to the dependent buffers in BQ_LRU. |
1089 | */ |
1090 | if (ISSET(bp->b_flags, B_LOCKED)) { |
1091 | /* locked in core */ |
1092 | bufq = &bufqueues[BQ_LOCKED]; |
1093 | } else if (!ISSET(bp->b_cflags, BC_AGE)) { |
1094 | /* valid data */ |
1095 | bufq = &bufqueues[BQ_LRU]; |
1096 | } else { |
1097 | /* stale but valid data */ |
1098 | bufq = &bufqueues[BQ_AGE]; |
1099 | } |
1100 | binstailfree(bp, bufq); |
1101 | } |
1102 | already_queued: |
1103 | /* Unlock the buffer. */ |
1104 | CLR(bp->b_cflags, BC_AGE|BC_BUSY|BC_NOCACHE); |
1105 | CLR(bp->b_flags, B_ASYNC); |
1106 | cv_broadcast(&bp->b_busy); |
1107 | |
1108 | if (bp->b_bufsize <= 0) |
1109 | brele(bp); |
1110 | } |
1111 | |
1112 | void |
1113 | brelse(buf_t *bp, int set) |
1114 | { |
1115 | |
1116 | mutex_enter(&bufcache_lock); |
1117 | brelsel(bp, set); |
1118 | mutex_exit(&bufcache_lock); |
1119 | } |
1120 | |
1121 | /* |
1122 | * Determine if a block is in the cache. |
 * Just look at what would be its hash chain.  If it's there, return
1124 | * a pointer to it, unless it's marked invalid. If it's marked invalid, |
1125 | * we normally don't return the buffer, unless the caller explicitly |
1126 | * wants us to. |
1127 | */ |
1128 | buf_t * |
1129 | incore(struct vnode *vp, daddr_t blkno) |
1130 | { |
1131 | buf_t *bp; |
1132 | |
1133 | KASSERT(mutex_owned(&bufcache_lock)); |
1134 | |
1135 | /* Search hash chain */ |
1136 | LIST_FOREACH(bp, BUFHASH(vp, blkno), b_hash) { |
1137 | if (bp->b_lblkno == blkno && bp->b_vp == vp && |
1138 | !ISSET(bp->b_cflags, BC_INVAL)) { |
1139 | KASSERT(bp->b_objlock == vp->v_interlock); |
1140 | return (bp); |
1141 | } |
1142 | } |
1143 | |
1144 | return (NULL); |
1145 | } |
1146 | |
1147 | /* |
1148 | * Get a block of requested size that is associated with |
1149 | * a given vnode and block offset. If it is found in the |
1150 | * block cache, mark it as having been found, make it busy |
1151 | * and return it. Otherwise, return an empty block of the |
 * correct size.  It is up to the caller to ensure that the
 * cached blocks are of the correct size.
1154 | */ |
1155 | buf_t * |
1156 | getblk(struct vnode *vp, daddr_t blkno, int size, int slpflag, int slptimeo) |
1157 | { |
1158 | int err, preserve; |
1159 | buf_t *bp; |
1160 | |
1161 | mutex_enter(&bufcache_lock); |
1162 | loop: |
1163 | bp = incore(vp, blkno); |
1164 | if (bp != NULL) { |
1165 | err = bbusy(bp, ((slpflag & PCATCH) != 0), slptimeo, NULL); |
1166 | if (err != 0) { |
1167 | if (err == EPASSTHROUGH) |
1168 | goto loop; |
1169 | mutex_exit(&bufcache_lock); |
1170 | return (NULL); |
1171 | } |
1172 | KASSERT(!cv_has_waiters(&bp->b_done)); |
1173 | #ifdef DIAGNOSTIC |
1174 | if (ISSET(bp->b_oflags, BO_DONE|BO_DELWRI) && |
1175 | bp->b_bcount < size && vp->v_type != VBLK) |
			panic("getblk: block size invariant failed");
1177 | #endif |
1178 | bremfree(bp); |
1179 | preserve = 1; |
1180 | } else { |
1181 | if ((bp = getnewbuf(slpflag, slptimeo, 0)) == NULL) |
1182 | goto loop; |
1183 | |
1184 | if (incore(vp, blkno) != NULL) { |
1185 | /* The block has come into memory in the meantime. */ |
1186 | brelsel(bp, 0); |
1187 | goto loop; |
1188 | } |
1189 | |
1190 | LIST_INSERT_HEAD(BUFHASH(vp, blkno), bp, b_hash); |
1191 | bp->b_blkno = bp->b_lblkno = bp->b_rawblkno = blkno; |
1192 | mutex_enter(vp->v_interlock); |
1193 | bgetvp(vp, bp); |
1194 | mutex_exit(vp->v_interlock); |
1195 | preserve = 0; |
1196 | } |
1197 | mutex_exit(&bufcache_lock); |
1198 | |
1199 | /* |
1200 | * LFS can't track total size of B_LOCKED buffer (locked_queue_bytes) |
1201 | * if we re-size buffers here. |
1202 | */ |
1203 | if (ISSET(bp->b_flags, B_LOCKED)) { |
1204 | KASSERT(bp->b_bufsize >= size); |
1205 | } else { |
1206 | if (allocbuf(bp, size, preserve)) { |
1207 | mutex_enter(&bufcache_lock); |
1208 | LIST_REMOVE(bp, b_hash); |
1209 | mutex_exit(&bufcache_lock); |
1210 | brelse(bp, BC_INVAL); |
1211 | return NULL; |
1212 | } |
1213 | } |
1214 | BIO_SETPRIO(bp, BPRIO_DEFAULT); |
1215 | return (bp); |
1216 | } |
1217 | |
1218 | /* |
1219 | * Get an empty, disassociated buffer of given size. |
1220 | */ |
1221 | buf_t * |
1222 | geteblk(int size) |
1223 | { |
1224 | buf_t *bp; |
1225 | int error __diagused; |
1226 | |
1227 | mutex_enter(&bufcache_lock); |
1228 | while ((bp = getnewbuf(0, 0, 0)) == NULL) |
1229 | ; |
1230 | |
1231 | SET(bp->b_cflags, BC_INVAL); |
1232 | LIST_INSERT_HEAD(&invalhash, bp, b_hash); |
1233 | mutex_exit(&bufcache_lock); |
1234 | BIO_SETPRIO(bp, BPRIO_DEFAULT); |
1235 | error = allocbuf(bp, size, 0); |
1236 | KASSERT(error == 0); |
1237 | return (bp); |
1238 | } |
1239 | |
1240 | /* |
1241 | * Expand or contract the actual memory allocated to a buffer. |
1242 | * |
1243 | * If the buffer shrinks, data is lost, so it's up to the |
1244 | * caller to have written it out *first*; this routine will not |
1245 | * start a write. If the buffer grows, it's the callers |
1246 | * responsibility to fill out the buffer's additional contents. |
1247 | */ |
1248 | int |
1249 | allocbuf(buf_t *bp, int size, int preserve) |
1250 | { |
1251 | void *addr; |
1252 | vsize_t oldsize, desired_size; |
1253 | int oldcount; |
1254 | int delta; |
1255 | |
1256 | desired_size = buf_roundsize(size); |
1257 | if (desired_size > MAXBSIZE) |
		printf("allocbuf: buffer larger than MAXBSIZE requested");
1259 | |
1260 | oldcount = bp->b_bcount; |
1261 | |
1262 | bp->b_bcount = size; |
1263 | |
1264 | oldsize = bp->b_bufsize; |
1265 | if (oldsize == desired_size) { |
1266 | /* |
1267 | * Do not short cut the WAPBL resize, as the buffer length |
1268 | * could still have changed and this would corrupt the |
1269 | * tracking of the transaction length. |
1270 | */ |
1271 | goto out; |
1272 | } |
1273 | |
1274 | /* |
1275 | * If we want a buffer of a different size, re-allocate the |
1276 | * buffer's memory; copy old content only if needed. |
1277 | */ |
1278 | addr = buf_alloc(desired_size); |
1279 | if (addr == NULL) |
1280 | return ENOMEM; |
1281 | if (preserve) |
1282 | memcpy(addr, bp->b_data, MIN(oldsize,desired_size)); |
1283 | if (bp->b_data != NULL) |
1284 | buf_mrelease(bp->b_data, oldsize); |
1285 | bp->b_data = addr; |
1286 | bp->b_bufsize = desired_size; |
1287 | |
1288 | /* |
1289 | * Update overall buffer memory counter (protected by bufcache_lock) |
1290 | */ |
1291 | delta = (long)desired_size - (long)oldsize; |
1292 | |
1293 | mutex_enter(&bufcache_lock); |
1294 | if ((bufmem += delta) > bufmem_hiwater) { |
1295 | /* |
1296 | * Need to trim overall memory usage. |
1297 | */ |
1298 | while (buf_canrelease()) { |
1299 | if (curcpu()->ci_schedstate.spc_flags & |
1300 | SPCF_SHOULDYIELD) { |
1301 | mutex_exit(&bufcache_lock); |
1302 | preempt(); |
1303 | mutex_enter(&bufcache_lock); |
1304 | } |
1305 | if (buf_trim() == 0) |
1306 | break; |
1307 | } |
1308 | } |
1309 | mutex_exit(&bufcache_lock); |
1310 | |
1311 | out: |
1312 | if (wapbl_vphaswapbl(bp->b_vp)) |
1313 | WAPBL_RESIZE_BUF(wapbl_vptomp(bp->b_vp), bp, oldsize, oldcount); |
1314 | |
1315 | return 0; |
1316 | } |
1317 | |
1318 | /* |
1319 | * Find a buffer which is available for use. |
1320 | * Select something from a free list. |
1321 | * Preference is to AGE list, then LRU list. |
1322 | * |
1323 | * Called with the buffer queues locked. |
1324 | * Return buffer locked. |
1325 | */ |
1326 | buf_t * |
1327 | getnewbuf(int slpflag, int slptimeo, int from_bufq) |
1328 | { |
1329 | buf_t *bp; |
1330 | struct vnode *vp; |
1331 | |
1332 | start: |
1333 | KASSERT(mutex_owned(&bufcache_lock)); |
1334 | |
1335 | /* |
1336 | * Get a new buffer from the pool. |
1337 | */ |
1338 | if (!from_bufq && buf_lotsfree()) { |
1339 | mutex_exit(&bufcache_lock); |
1340 | bp = pool_cache_get(buf_cache, PR_NOWAIT); |
1341 | if (bp != NULL) { |
1342 | memset((char *)bp, 0, sizeof(*bp)); |
1343 | buf_init(bp); |
1344 | SET(bp->b_cflags, BC_BUSY); /* mark buffer busy */ |
1345 | mutex_enter(&bufcache_lock); |
1346 | #if defined(DIAGNOSTIC) |
1347 | bp->b_freelistindex = -1; |
1348 | #endif /* defined(DIAGNOSTIC) */ |
1349 | return (bp); |
1350 | } |
1351 | mutex_enter(&bufcache_lock); |
1352 | } |
1353 | |
1354 | KASSERT(mutex_owned(&bufcache_lock)); |
1355 | if ((bp = TAILQ_FIRST(&bufqueues[BQ_AGE].bq_queue)) != NULL || |
1356 | (bp = TAILQ_FIRST(&bufqueues[BQ_LRU].bq_queue)) != NULL) { |
1357 | KASSERT(!ISSET(bp->b_cflags, BC_BUSY) || ISSET(bp->b_cflags, BC_VFLUSH)); |
1358 | bremfree(bp); |
1359 | |
1360 | /* Buffer is no longer on free lists. */ |
1361 | SET(bp->b_cflags, BC_BUSY); |
1362 | } else { |
1363 | /* |
1364 | * XXX: !from_bufq should be removed. |
1365 | */ |
1366 | if (!from_bufq || curlwp != uvm.pagedaemon_lwp) { |
1367 | /* wait for a free buffer of any kind */ |
1368 | if ((slpflag & PCATCH) != 0) |
1369 | (void)cv_timedwait_sig(&needbuffer_cv, |
1370 | &bufcache_lock, slptimeo); |
1371 | else |
1372 | (void)cv_timedwait(&needbuffer_cv, |
1373 | &bufcache_lock, slptimeo); |
1374 | } |
1375 | return (NULL); |
1376 | } |
1377 | |
1378 | #ifdef DIAGNOSTIC |
1379 | if (bp->b_bufsize <= 0) |
		panic("buffer %p: on queue but empty", bp);
1381 | #endif |
1382 | |
1383 | if (ISSET(bp->b_cflags, BC_VFLUSH)) { |
1384 | /* |
1385 | * This is a delayed write buffer being flushed to disk. Make |
1386 | * sure it gets aged out of the queue when it's finished, and |
1387 | * leave it off the LRU queue. |
1388 | */ |
1389 | CLR(bp->b_cflags, BC_VFLUSH); |
1390 | SET(bp->b_cflags, BC_AGE); |
1391 | goto start; |
1392 | } |
1393 | |
1394 | KASSERT(ISSET(bp->b_cflags, BC_BUSY)); |
1395 | KASSERT(bp->b_refcnt > 0); |
1396 | KASSERT(!cv_has_waiters(&bp->b_done)); |
1397 | |
1398 | /* |
1399 | * If buffer was a delayed write, start it and return NULL |
1400 | * (since we might sleep while starting the write). |
1401 | */ |
1402 | if (ISSET(bp->b_oflags, BO_DELWRI)) { |
1403 | /* |
1404 | * This buffer has gone through the LRU, so make sure it gets |
1405 | * reused ASAP. |
1406 | */ |
1407 | SET(bp->b_cflags, BC_AGE); |
1408 | mutex_exit(&bufcache_lock); |
1409 | bawrite(bp); |
1410 | mutex_enter(&bufcache_lock); |
1411 | return (NULL); |
1412 | } |
1413 | |
1414 | vp = bp->b_vp; |
1415 | |
1416 | /* clear out various other fields */ |
1417 | bp->b_cflags = BC_BUSY; |
1418 | bp->b_oflags = 0; |
1419 | bp->b_flags = 0; |
1420 | bp->b_dev = NODEV; |
1421 | bp->b_blkno = 0; |
1422 | bp->b_lblkno = 0; |
1423 | bp->b_rawblkno = 0; |
1424 | bp->b_iodone = 0; |
1425 | bp->b_error = 0; |
1426 | bp->b_resid = 0; |
1427 | bp->b_bcount = 0; |
1428 | |
1429 | LIST_REMOVE(bp, b_hash); |
1430 | |
1431 | /* Disassociate us from our vnode, if we had one... */ |
1432 | if (vp != NULL) { |
1433 | mutex_enter(vp->v_interlock); |
1434 | brelvp(bp); |
1435 | mutex_exit(vp->v_interlock); |
1436 | } |
1437 | |
1438 | return (bp); |
1439 | } |
1440 | |
1441 | /* |
1442 | * Attempt to free an aged buffer off the queues. |
1443 | * Called with queue lock held. |
1444 | * Returns the amount of buffer memory freed. |
1445 | */ |
1446 | static int |
1447 | buf_trim(void) |
1448 | { |
1449 | buf_t *bp; |
1450 | long size; |
1451 | |
1452 | KASSERT(mutex_owned(&bufcache_lock)); |
1453 | |
1454 | /* Instruct getnewbuf() to get buffers off the queues */ |
1455 | if ((bp = getnewbuf(PCATCH, 1, 1)) == NULL) |
1456 | return 0; |
1457 | |
1458 | KASSERT((bp->b_cflags & BC_WANTED) == 0); |
1459 | size = bp->b_bufsize; |
1460 | bufmem -= size; |
1461 | if (size > 0) { |
1462 | buf_mrelease(bp->b_data, size); |
1463 | bp->b_bcount = bp->b_bufsize = 0; |
1464 | } |
1465 | /* brelse() will return the buffer to the global buffer pool */ |
1466 | brelsel(bp, 0); |
1467 | return size; |
1468 | } |
1469 | |
1470 | int |
1471 | buf_drain(int n) |
1472 | { |
1473 | int size = 0, sz; |
1474 | |
1475 | KASSERT(mutex_owned(&bufcache_lock)); |
1476 | |
1477 | while (size < n && bufmem > bufmem_lowater) { |
1478 | sz = buf_trim(); |
1479 | if (sz <= 0) |
1480 | break; |
1481 | size += sz; |
1482 | } |
1483 | |
1484 | return size; |
1485 | } |
1486 | |
1487 | SDT_PROVIDER_DEFINE(io); |
1488 | |
SDT_PROBE_DEFINE1(io, kernel, , wait__start, "struct buf *"/*bp*/);
SDT_PROBE_DEFINE1(io, kernel, , wait__done, "struct buf *"/*bp*/);
1491 | |
1492 | /* |
1493 | * Wait for operations on the buffer to complete. |
1494 | * When they do, extract and return the I/O's error value. |
1495 | */ |
1496 | int |
1497 | biowait(buf_t *bp) |
1498 | { |
1499 | |
1500 | KASSERT(ISSET(bp->b_cflags, BC_BUSY)); |
1501 | KASSERT(bp->b_refcnt > 0); |
1502 | |
1503 | SDT_PROBE1(io, kernel, , wait__start, bp); |
1504 | |
1505 | mutex_enter(bp->b_objlock); |
1506 | while (!ISSET(bp->b_oflags, BO_DONE | BO_DELWRI)) |
1507 | cv_wait(&bp->b_done, bp->b_objlock); |
1508 | mutex_exit(bp->b_objlock); |
1509 | |
1510 | SDT_PROBE1(io, kernel, , wait__done, bp); |
1511 | |
1512 | return bp->b_error; |
1513 | } |
1514 | |
1515 | /* |
1516 | * Mark I/O complete on a buffer. |
1517 | * |
1518 | * If a callback has been requested, e.g. the pageout |
1519 | * daemon, do so. Otherwise, awaken waiting processes. |
1520 | * |
1521 | * [ Leffler, et al., says on p.247: |
1522 | * "This routine wakes up the blocked process, frees the buffer |
1523 | * for an asynchronous write, or, for a request by the pagedaemon |
1524 | * process, invokes a procedure specified in the buffer structure" ] |
1525 | * |
1526 | * In real life, the pagedaemon (or other system processes) wants |
 * to do async stuff too, and doesn't want the buffer brelse()'d.
1528 | * (for swap pager, that puts swap buffers on the free lists (!!!), |
1529 | * for the vn device, that puts allocated buffers on the free lists!) |
1530 | */ |
1531 | void |
1532 | biodone(buf_t *bp) |
1533 | { |
1534 | int s; |
1535 | |
1536 | KASSERT(!ISSET(bp->b_oflags, BO_DONE)); |
1537 | |
1538 | if (cpu_intr_p()) { |
1539 | /* From interrupt mode: defer to a soft interrupt. */ |
1540 | s = splvm(); |
1541 | TAILQ_INSERT_TAIL(&curcpu()->ci_data.cpu_biodone, bp, b_actq); |
1542 | softint_schedule(biodone_sih); |
1543 | splx(s); |
1544 | } else { |
1545 | /* Process now - the buffer may be freed soon. */ |
1546 | biodone2(bp); |
1547 | } |
1548 | } |
1549 | |
SDT_PROBE_DEFINE1(io, kernel, , done, "struct buf *"/*bp*/);
1551 | |
1552 | static void |
1553 | biodone2(buf_t *bp) |
1554 | { |
1555 | void (*callout)(buf_t *); |
1556 | |
	SDT_PROBE1(io, kernel, , done, bp);
1558 | |
1559 | mutex_enter(bp->b_objlock); |
1560 | /* Note that the transfer is done. */ |
1561 | if (ISSET(bp->b_oflags, BO_DONE)) |
		panic("biodone2 already");
1563 | CLR(bp->b_flags, B_COWDONE); |
1564 | SET(bp->b_oflags, BO_DONE); |
1565 | BIO_SETPRIO(bp, BPRIO_DEFAULT); |
1566 | |
1567 | /* Wake up waiting writers. */ |
1568 | if (!ISSET(bp->b_flags, B_READ)) |
1569 | vwakeup(bp); |
1570 | |
1571 | if ((callout = bp->b_iodone) != NULL) { |
1572 | /* Note callout done, then call out. */ |
1573 | KASSERT(!cv_has_waiters(&bp->b_done)); |
1574 | KERNEL_LOCK(1, NULL); /* XXXSMP */ |
1575 | bp->b_iodone = NULL; |
1576 | mutex_exit(bp->b_objlock); |
1577 | (*callout)(bp); |
1578 | KERNEL_UNLOCK_ONE(NULL); /* XXXSMP */ |
1579 | } else if (ISSET(bp->b_flags, B_ASYNC)) { |
1580 | /* If async, release. */ |
1581 | KASSERT(!cv_has_waiters(&bp->b_done)); |
1582 | mutex_exit(bp->b_objlock); |
1583 | brelse(bp, 0); |
1584 | } else { |
1585 | /* Otherwise just wake up waiters in biowait(). */ |
1586 | cv_broadcast(&bp->b_done); |
1587 | mutex_exit(bp->b_objlock); |
1588 | } |
1589 | } |
1590 | |
1591 | static void |
1592 | biointr(void *cookie) |
1593 | { |
1594 | struct cpu_info *ci; |
1595 | buf_t *bp; |
1596 | int s; |
1597 | |
1598 | ci = curcpu(); |
1599 | |
1600 | while (!TAILQ_EMPTY(&ci->ci_data.cpu_biodone)) { |
1601 | KASSERT(curcpu() == ci); |
1602 | |
1603 | s = splvm(); |
1604 | bp = TAILQ_FIRST(&ci->ci_data.cpu_biodone); |
1605 | TAILQ_REMOVE(&ci->ci_data.cpu_biodone, bp, b_actq); |
1606 | splx(s); |
1607 | |
1608 | biodone2(bp); |
1609 | } |
1610 | } |
1611 | |
1612 | /* |
1613 | * Wait for all buffers to complete I/O |
1614 | * Return the number of "stuck" buffers. |
1615 | */ |
1616 | int |
1617 | buf_syncwait(void) |
1618 | { |
1619 | buf_t *bp; |
1620 | int iter, nbusy, nbusy_prev = 0, ihash; |
1621 | |
1622 | for (iter = 0; iter < 20;) { |
1623 | mutex_enter(&bufcache_lock); |
1624 | nbusy = 0; |
1625 | for (ihash = 0; ihash < bufhash+1; ihash++) { |
1626 | LIST_FOREACH(bp, &bufhashtbl[ihash], b_hash) { |
1627 | if ((bp->b_cflags & (BC_BUSY|BC_INVAL)) == BC_BUSY) |
1628 | nbusy += ((bp->b_flags & B_READ) == 0); |
1629 | } |
1630 | } |
1631 | mutex_exit(&bufcache_lock); |
1632 | |
1633 | if (nbusy == 0) |
1634 | break; |
1635 | if (nbusy_prev == 0) |
1636 | nbusy_prev = nbusy; |
		printf("%d ", nbusy);
		kpause("bflush", false, MAX(1, hz / 25 * iter), NULL);
1639 | if (nbusy >= nbusy_prev) /* we didn't flush anything */ |
1640 | iter++; |
1641 | else |
1642 | nbusy_prev = nbusy; |
1643 | } |
1644 | |
1645 | if (nbusy) { |
1646 | #if defined(DEBUG) || defined(DEBUG_HALT_BUSY) |
		printf("giving up\nPrinting vnodes for busy buffers\n");
1648 | for (ihash = 0; ihash < bufhash+1; ihash++) { |
1649 | LIST_FOREACH(bp, &bufhashtbl[ihash], b_hash) { |
1650 | if ((bp->b_cflags & (BC_BUSY|BC_INVAL)) == BC_BUSY && |
1651 | (bp->b_flags & B_READ) == 0) |
1652 | vprint(NULL, bp->b_vp); |
1653 | } |
1654 | } |
1655 | #endif |
1656 | } |
1657 | |
1658 | return nbusy; |
1659 | } |
1660 | |
1661 | static void |
1662 | sysctl_fillbuf(buf_t *i, struct buf_sysctl *o) |
1663 | { |
1664 | |
1665 | o->b_flags = i->b_flags | i->b_cflags | i->b_oflags; |
1666 | o->b_error = i->b_error; |
1667 | o->b_prio = i->b_prio; |
1668 | o->b_dev = i->b_dev; |
1669 | o->b_bufsize = i->b_bufsize; |
1670 | o->b_bcount = i->b_bcount; |
1671 | o->b_resid = i->b_resid; |
1672 | o->b_addr = PTRTOUINT64(i->b_data); |
1673 | o->b_blkno = i->b_blkno; |
1674 | o->b_rawblkno = i->b_rawblkno; |
1675 | o->b_iodone = PTRTOUINT64(i->b_iodone); |
1676 | o->b_proc = PTRTOUINT64(i->b_proc); |
1677 | o->b_vp = PTRTOUINT64(i->b_vp); |
1678 | o->b_saveaddr = PTRTOUINT64(i->b_saveaddr); |
1679 | o->b_lblkno = i->b_lblkno; |
1680 | } |
1681 | |
1682 | #define KERN_BUFSLOP 20 |
1683 | static int |
1684 | sysctl_dobuf(SYSCTLFN_ARGS) |
1685 | { |
1686 | buf_t *bp; |
1687 | struct buf_sysctl bs; |
1688 | struct bqueue *bq; |
1689 | char *dp; |
1690 | u_int i, op, arg; |
1691 | size_t len, needed, elem_size, out_size; |
1692 | int error, elem_count, retries; |
1693 | |
1694 | if (namelen == 1 && name[0] == CTL_QUERY) |
1695 | return (sysctl_query(SYSCTLFN_CALL(rnode))); |
1696 | |
1697 | if (namelen != 4) |
1698 | return (EINVAL); |
1699 | |
1700 | retries = 100; |
1701 | retry: |
1702 | dp = oldp; |
1703 | len = (oldp != NULL) ? *oldlenp : 0; |
1704 | op = name[0]; |
1705 | arg = name[1]; |
1706 | elem_size = name[2]; |
1707 | elem_count = name[3]; |
1708 | out_size = MIN(sizeof(bs), elem_size); |
1709 | |
1710 | /* |
1711 | * at the moment, these are just "placeholders" to make the |
1712 | * API for retrieving kern.buf data more extensible in the |
1713 | * future. |
1714 | * |
1715 | * XXX kern.buf currently has "netbsd32" issues. hopefully |
1716 | * these will be resolved at a later point. |
1717 | */ |
1718 | if (op != KERN_BUF_ALL || arg != KERN_BUF_ALL || |
1719 | elem_size < 1 || elem_count < 0) |
1720 | return (EINVAL); |
1721 | |
1722 | error = 0; |
1723 | needed = 0; |
1724 | sysctl_unlock(); |
1725 | mutex_enter(&bufcache_lock); |
1726 | for (i = 0; i < BQUEUES; i++) { |
1727 | bq = &bufqueues[i]; |
1728 | TAILQ_FOREACH(bp, &bq->bq_queue, b_freelist) { |
1729 | bq->bq_marker = bp; |
1730 | if (len >= elem_size && elem_count > 0) { |
1731 | sysctl_fillbuf(bp, &bs); |
1732 | mutex_exit(&bufcache_lock); |
1733 | error = copyout(&bs, dp, out_size); |
1734 | mutex_enter(&bufcache_lock); |
1735 | if (error) |
1736 | break; |
1737 | if (bq->bq_marker != bp) { |
1738 | /* |
1739 | * This sysctl node is only for |
1740 | * statistics. Retry; if the |
1741 | * queue keeps changing, then |
1742 | * bail out. |
1743 | */ |
1744 | if (retries-- == 0) { |
1745 | error = EAGAIN; |
1746 | break; |
1747 | } |
1748 | mutex_exit(&bufcache_lock); |
1749 | sysctl_relock(); |
1750 | goto retry; |
1751 | } |
1752 | dp += elem_size; |
1753 | len -= elem_size; |
1754 | } |
1755 | needed += elem_size; |
1756 | if (elem_count > 0 && elem_count != INT_MAX) |
1757 | elem_count--; |
1758 | } |
1759 | if (error != 0) |
1760 | break; |
1761 | } |
1762 | mutex_exit(&bufcache_lock); |
1763 | sysctl_relock(); |
1764 | |
1765 | *oldlenp = needed; |
1766 | if (oldp == NULL) |
1767 | *oldlenp += KERN_BUFSLOP * sizeof(buf_t); |
1768 | |
1769 | return (error); |
1770 | } |

static int
sysctl_bufvm_update(SYSCTLFN_ARGS)
{
	int error, rv;
	struct sysctlnode node;
	unsigned int temp_bufcache;
	unsigned long temp_water;

	/* Take a copy of the supplied node and its data */
	node = *rnode;
	if (node.sysctl_data == &bufcache) {
		node.sysctl_data = &temp_bufcache;
		temp_bufcache = *(unsigned int *)rnode->sysctl_data;
	} else {
		node.sysctl_data = &temp_water;
		temp_water = *(unsigned long *)rnode->sysctl_data;
	}

	/* Update the copy */
	error = sysctl_lookup(SYSCTLFN_CALL(&node));
	if (error || newp == NULL)
		return (error);

	if (rnode->sysctl_data == &bufcache) {
		if (temp_bufcache > 100)
			return (EINVAL);
		bufcache = temp_bufcache;
		buf_setwm();
	} else if (rnode->sysctl_data == &bufmem_lowater) {
		if (bufmem_hiwater - temp_water < 16)
			return (EINVAL);
		bufmem_lowater = temp_water;
	} else if (rnode->sysctl_data == &bufmem_hiwater) {
		if (temp_water - bufmem_lowater < 16)
			return (EINVAL);
		bufmem_hiwater = temp_water;
	} else
		return (EINVAL);

	/* Drain until below new high water mark */
	sysctl_unlock();
	mutex_enter(&bufcache_lock);
	while (bufmem > bufmem_hiwater) {
		rv = buf_drain((bufmem - bufmem_hiwater) / (2 * 1024));
		if (rv <= 0)
			break;
	}
	mutex_exit(&bufcache_lock);
	sysctl_relock();

	return 0;
}
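
/*
 * Usage sketch (userland side; illustrative only, not compiled here):
 * raising vm.bufmem_hiwater through this handler.  The 64MB value is
 * an arbitrary example.
 */
#if 0
	long hiwater = 64L * 1024 * 1024;

	if (sysctlbyname("vm.bufmem_hiwater", NULL, NULL,
	    &hiwater, sizeof(hiwater)) == -1)
		err(1, "sysctlbyname");
#endif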

static struct sysctllog *vfsbio_sysctllog;

static void
sysctl_kern_buf_setup(void)
{

	sysctl_createv(&vfsbio_sysctllog, 0, NULL, NULL,
	    CTLFLAG_PERMANENT,
	    CTLTYPE_NODE, "buf",
	    SYSCTL_DESCR("Kernel buffer cache information"),
	    sysctl_dobuf, 0, NULL, 0,
	    CTL_KERN, KERN_BUF, CTL_EOL);
}

static void
sysctl_vm_buf_setup(void)
{

	sysctl_createv(&vfsbio_sysctllog, 0, NULL, NULL,
	    CTLFLAG_PERMANENT|CTLFLAG_READWRITE,
	    CTLTYPE_INT, "bufcache",
	    SYSCTL_DESCR("Percentage of physical memory to use for "
		"buffer cache"),
	    sysctl_bufvm_update, 0, &bufcache, 0,
	    CTL_VM, CTL_CREATE, CTL_EOL);
	sysctl_createv(&vfsbio_sysctllog, 0, NULL, NULL,
	    CTLFLAG_PERMANENT|CTLFLAG_READONLY,
	    CTLTYPE_LONG, "bufmem",
	    SYSCTL_DESCR("Amount of kernel memory used by buffer "
		"cache"),
	    NULL, 0, &bufmem, 0,
	    CTL_VM, CTL_CREATE, CTL_EOL);
	sysctl_createv(&vfsbio_sysctllog, 0, NULL, NULL,
	    CTLFLAG_PERMANENT|CTLFLAG_READWRITE,
	    CTLTYPE_LONG, "bufmem_lowater",
	    SYSCTL_DESCR("Minimum amount of kernel memory to "
		"reserve for buffer cache"),
	    sysctl_bufvm_update, 0, &bufmem_lowater, 0,
	    CTL_VM, CTL_CREATE, CTL_EOL);
	sysctl_createv(&vfsbio_sysctllog, 0, NULL, NULL,
	    CTLFLAG_PERMANENT|CTLFLAG_READWRITE,
	    CTLTYPE_LONG, "bufmem_hiwater",
	    SYSCTL_DESCR("Maximum amount of kernel memory to use "
		"for buffer cache"),
	    sysctl_bufvm_update, 0, &bufmem_hiwater, 0,
	    CTL_VM, CTL_CREATE, CTL_EOL);
}

#ifdef DEBUG
/*
 * Print out statistics on the current allocation of the buffer pool.
 * Printing on every ``sync'' can be enabled by setting the "syncprt"
 * variable in vfs_syscalls.c via sysctl.
 */
void
vfs_bufstats(void)
{
	int i, j, count;
	buf_t *bp;
	struct bqueue *dp;
	int counts[MAXBSIZE / MIN_PAGE_SIZE + 1];
	static const char *bname[BQUEUES] = { "LOCKED", "LRU", "AGE" };

	for (dp = bufqueues, i = 0; dp < &bufqueues[BQUEUES]; dp++, i++) {
		count = 0;
		memset(counts, 0, sizeof(counts));
		TAILQ_FOREACH(bp, &dp->bq_queue, b_freelist) {
			counts[bp->b_bufsize / PAGE_SIZE]++;
			count++;
		}
		printf("%s: total-%d", bname[i], count);
		for (j = 0; j <= MAXBSIZE / PAGE_SIZE; j++)
			if (counts[j] != 0)
				printf(", %d-%d", j * PAGE_SIZE, counts[j]);
		printf("\n");
	}
}
#endif /* DEBUG */
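
/*
 * Example output of the above (illustrative values only), one line
 * per free list, with a "size-count" pair for each buffer size seen:
 *
 *	LOCKED: total-0
 *	LRU: total-261, 4096-253, 8192-8
 *	AGE: total-14, 4096-14
 */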

/* ------------------------------ */

/*
 * getiobuf: allocate an I/O buffer, waiting for memory if 'waitok'.
 * If 'vp' is given, the buffer uses the vnode's interlock as its
 * object lock; otherwise it uses the global buffer_lock.
 */
buf_t *
getiobuf(struct vnode *vp, bool waitok)
{
	buf_t *bp;

	bp = pool_cache_get(bufio_cache, (waitok ? PR_WAITOK : PR_NOWAIT));
	if (bp == NULL)
		return bp;

	buf_init(bp);

	if ((bp->b_vp = vp) == NULL)
		bp->b_objlock = &buffer_lock;
	else
		bp->b_objlock = vp->v_interlock;

	return bp;
}

/*
 * putiobuf: release a buffer allocated with getiobuf.
 */
void
putiobuf(buf_t *bp)
{

	buf_destroy(bp);
	pool_cache_put(bufio_cache, bp);
}

/*
 * nestiobuf_iodone: b_iodone callback for nested buffers.
 */

void
nestiobuf_iodone(buf_t *bp)
{
	buf_t *mbp = bp->b_private;
	int error;
	int donebytes;

	KASSERT(bp->b_bcount <= bp->b_bufsize);
	KASSERT(mbp != bp);

	error = bp->b_error;
	if (bp->b_error == 0 &&
	    (bp->b_bcount < bp->b_bufsize || bp->b_resid > 0)) {
		/*
		 * Not all of the data was transferred; raise an error.
		 * We have no way to propagate a short transfer to mbp.
		 */
		error = EIO;
	}

	donebytes = bp->b_bufsize;

	putiobuf(bp);
	nestiobuf_done(mbp, donebytes, error);
}

/*
 * nestiobuf_setup: set up a "nested" buffer.
 *
 * => 'mbp' is a "master" buffer which is being divided into sub-pieces.
 * => 'bp' should be a buffer allocated by getiobuf.
 * => 'offset' is a byte offset within the master buffer.
 * => 'size' is the size in bytes of this nested buffer.
 */

void
nestiobuf_setup(buf_t *mbp, buf_t *bp, int offset, size_t size)
{
	const int b_read = mbp->b_flags & B_READ;
	struct vnode *vp = mbp->b_vp;

	KASSERT(mbp->b_bcount >= offset + size);
	bp->b_vp = vp;
	bp->b_dev = mbp->b_dev;
	bp->b_objlock = mbp->b_objlock;
	bp->b_cflags = BC_BUSY;
	bp->b_flags = B_ASYNC | b_read;
	bp->b_iodone = nestiobuf_iodone;
	bp->b_data = (char *)mbp->b_data + offset;
	bp->b_resid = bp->b_bcount = size;
	bp->b_bufsize = bp->b_bcount;
	bp->b_private = mbp;
	BIO_COPYPRIO(bp, mbp);
	if (!b_read && vp != NULL) {
		mutex_enter(vp->v_interlock);
		vp->v_numoutput++;
		mutex_exit(vp->v_interlock);
	}
}
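
/*
 * Usage sketch (illustrative only, not compiled here): fanning a
 * master buffer out into MAXPHYS-sized nested buffers.  The
 * hypothetical submit_strategy() stands in for however the caller
 * issues the chunk I/O; real users of this interface are drivers
 * that split one logical request across underlying devices.
 */
#if 0
static void
split_request(buf_t *mbp)
{
	int offset = 0;
	size_t resid, sz;
	buf_t *bp;

	mbp->b_resid = mbp->b_bcount;	/* nestiobuf_done counts this down */
	for (resid = mbp->b_bcount; resid > 0; resid -= sz) {
		sz = MIN(resid, MAXPHYS);
		bp = getiobuf(mbp->b_vp, true);	/* may sleep */
		nestiobuf_setup(mbp, bp, offset, sz);
		bp->b_blkno = mbp->b_blkno + btodb(offset);
		submit_strategy(bp);	/* hypothetical submission */
		offset += sz;
	}
}
#endif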

/*
 * nestiobuf_done: propagate completion to the master buffer.
 *
 * => 'donebytes' specifies how many bytes of the master buffer have
 *    been completed.
 * => 'error' is the errno(2) with which those bytes completed.
 */

void
nestiobuf_done(buf_t *mbp, int donebytes, int error)
{

	if (donebytes == 0) {
		return;
	}
	mutex_enter(mbp->b_objlock);
	KASSERT(mbp->b_resid >= donebytes);
	mbp->b_resid -= donebytes;
	if (error)
		mbp->b_error = error;
	if (mbp->b_resid == 0) {
		/* On error, report the whole transfer as residual. */
		if (mbp->b_error)
			mbp->b_resid = mbp->b_bcount;
		mutex_exit(mbp->b_objlock);
		biodone(mbp);
	} else
		mutex_exit(mbp->b_objlock);
}

void
buf_init(buf_t *bp)
{

	cv_init(&bp->b_busy, "biolock");
	cv_init(&bp->b_done, "biowait");
	bp->b_dev = NODEV;
	bp->b_error = 0;
	bp->b_flags = 0;
	bp->b_cflags = 0;
	bp->b_oflags = 0;
	bp->b_objlock = &buffer_lock;
	bp->b_iodone = NULL;
	bp->b_refcnt = 1;
	bp->b_vnbufs.le_next = NOLIST;
	BIO_SETPRIO(bp, BPRIO_DEFAULT);
}

void
buf_destroy(buf_t *bp)
{

	cv_destroy(&bp->b_done);
	cv_destroy(&bp->b_busy);
}

/*
 * bbusy: mark a buffer busy, if necessary waiting (interruptibly when
 * 'intr' is set) for up to 'timo' ticks for it to become free.
 * Returns 0 with BC_BUSY set on success, EPASSTHROUGH if it slept and
 * the caller must re-look-up the buffer, EDEADLK if called by the
 * page daemon on a busy buffer, or an error from the timed wait.
 */
int
bbusy(buf_t *bp, bool intr, int timo, kmutex_t *interlock)
{
	int error;

	KASSERT(mutex_owned(&bufcache_lock));

	if ((bp->b_cflags & BC_BUSY) != 0) {
		if (curlwp == uvm.pagedaemon_lwp)
			return EDEADLK;
		bp->b_cflags |= BC_WANTED;
		bref(bp);
		if (interlock != NULL)
			mutex_exit(interlock);
		if (intr) {
			error = cv_timedwait_sig(&bp->b_busy, &bufcache_lock,
			    timo);
		} else {
			error = cv_timedwait(&bp->b_busy, &bufcache_lock,
			    timo);
		}
		brele(bp);
		if (interlock != NULL)
			mutex_enter(interlock);
		if (error != 0)
			return error;
		return EPASSTHROUGH;
	}
	bp->b_cflags |= BC_BUSY;

	return 0;
}
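/*
 * Usage sketch (illustrative only, not compiled here): the
 * EPASSTHROUGH return tells the caller that it slept, so the buffer
 * may have been reused and must be looked up again.  'vp' and 'blkno'
 * are assumed inputs.
 */
#if 0
	buf_t *bp;
	int error;

	mutex_enter(&bufcache_lock);
	for (;;) {
		bp = incore(vp, blkno);		/* re-do the lookup */
		if (bp == NULL)
			break;			/* no such buffer */
		error = bbusy(bp, false, 0, NULL);
		if (error == 0)
			break;			/* we now own BC_BUSY */
		if (error != EPASSTHROUGH)
			break;			/* EDEADLK or wait error */
	}
	mutex_exit(&bufcache_lock);
#endif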