/*	$NetBSD: vfs_wapbl.c,v 1.86 2016/11/10 20:56:32 jdolecek Exp $	*/

/*-
 * Copyright (c) 2003, 2008, 2009 The NetBSD Foundation, Inc.
 * All rights reserved.
 *
 * This code is derived from software contributed to The NetBSD Foundation
 * by Wasabi Systems, Inc.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */
/*
 * This implements file-system-independent write-ahead logging (WAPBL).
 */

#define WAPBL_INTERNAL

#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: vfs_wapbl.c,v 1.86 2016/11/10 20:56:32 jdolecek Exp $");

#include <sys/param.h>
#include <sys/bitops.h>
#include <sys/time.h>
#include <sys/wapbl.h>
#include <sys/wapbl_replay.h>

#ifdef _KERNEL

#include <sys/atomic.h>
#include <sys/conf.h>
#include <sys/file.h>
#include <sys/kauth.h>
#include <sys/kernel.h>
#include <sys/module.h>
#include <sys/mount.h>
#include <sys/mutex.h>
#include <sys/namei.h>
#include <sys/proc.h>
#include <sys/resourcevar.h>
#include <sys/sysctl.h>
#include <sys/uio.h>
#include <sys/vnode.h>

#include <miscfs/specfs/specdev.h>

#define	wapbl_alloc(s)		kmem_alloc((s), KM_SLEEP)
#define	wapbl_free(a, s)	kmem_free((a), (s))
#define	wapbl_calloc(n, s)	kmem_zalloc((n)*(s), KM_SLEEP)

static struct sysctllog *wapbl_sysctl;
static int wapbl_flush_disk_cache = 1;
static int wapbl_verbose_commit = 0;

static inline size_t wapbl_space_free(size_t, off_t, off_t);

#else /* !_KERNEL */

#include <assert.h>
#include <errno.h>
#include <stdbool.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#define	KDASSERT(x)		assert(x)
#define	KASSERT(x)		assert(x)
#define	wapbl_alloc(s)		malloc(s)
#define	wapbl_free(a, s)	free(a)
#define	wapbl_calloc(n, s)	calloc((n), (s))

#endif /* !_KERNEL */

/*
 * INTERNAL DATA STRUCTURES
 */

/*
 * This structure holds per-mount log information.
 *
 * Legend:	a = atomic access only
 *		r = read-only after init
 *		l = rwlock held
 *		m = mutex held
 *		lm = rwlock held writing or mutex held
 *		u = unlocked access ok
 *		b = bufcache_lock held
 */
LIST_HEAD(wapbl_ino_head, wapbl_ino);
struct wapbl {
	struct vnode *wl_logvp;	/* r:	log here */
	struct vnode *wl_devvp;	/* r:	log on this device */
	struct mount *wl_mount;	/* r:	mountpoint wl is associated with */
	daddr_t wl_logpbn;	/* r:	Physical block number of start of log */
	int wl_log_dev_bshift;	/* r:	logarithm of device block size of log
					device */
	int wl_fs_dev_bshift;	/* r:	logarithm of device block size of
					filesystem device */

	unsigned wl_lock_count;	/* m:	Count of transactions in progress */

	size_t wl_circ_size;	/* r:	Number of bytes in buffer of log */
	size_t wl_circ_off;	/* r:	Number of bytes reserved at start */

	size_t wl_bufcount_max;	/* r:	Number of buffers reserved for log */
	size_t wl_bufbytes_max;	/* r:	Number of buf bytes reserved for log */

	off_t wl_head;		/* l:	Byte offset of log head */
	off_t wl_tail;		/* l:	Byte offset of log tail */
	/*
	 * WAPBL log layout, stored on wl_devvp at wl_logpbn:
	 *
	 *  ___________________ wl_circ_size __________________
	 * /                                                   \
	 * +---------+---------+-------+--------------+--------+
	 * [ commit0 | commit1 | CCWCW | EEEEEEEEEEEE | CCCWCW ]
	 * +---------+---------+-------+--------------+--------+
	 *       wl_circ_off --^       ^-- wl_head    ^-- wl_tail
	 *
	 * commit0 and commit1 are commit headers.  A commit header has
	 * a generation number, indicating which of the two headers is
	 * more recent, and an assignment of head and tail pointers.
	 * The rest is a circular queue of log records, starting at
	 * the byte offset wl_circ_off.
	 *
	 * E marks empty space for records.
	 * W marks records for block writes issued but waiting.
	 * C marks completed records.
	 *
	 * wapbl_flush writes new records to empty `E' spaces after
	 * wl_head from the current transaction in memory.
	 *
	 * wapbl_truncate advances wl_tail past any completed `C'
	 * records, freeing them up for use.
	 *
	 * head == tail == 0 means log is empty.
	 * head == tail != 0 means log is full.
	 *
	 * See assertions in wapbl_advance() for other boundary
	 * conditions.
	 *
	 * Only wapbl_flush moves the head, except when wapbl_truncate
	 * sets it to 0 to indicate that the log is empty.
	 *
	 * Only wapbl_truncate moves the tail, except when wapbl_flush
	 * sets it to wl_circ_off to indicate that the log is full.
	 */
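
	/*
	 * Worked example (illustrative numbers only, not from this
	 * source): with 512-byte log device blocks (wl_log_dev_bshift
	 * = 9), the two commit headers occupy bytes 0..1023, so
	 * wl_circ_off = 1024.  A 64 KiB log then has wl_circ_size =
	 * 65536 - 1024 = 64512.  An empty log has head = tail = 0;
	 * after a first flush writes 8192 bytes of records, head =
	 * 1024 + 8192 = 9216 and tail = 1024.
	 */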

	struct wapbl_wc_header *wl_wc_header;	/* l	*/
	void *wl_wc_scratch;	/* l:	scratch space (XXX: why?!?) */

	kmutex_t wl_mtx;	/* u:	short-term lock */
	krwlock_t wl_rwlock;	/* u:	File system transaction lock */

	/*
	 * wl_mtx must be held while accessing the buffer counts
	 * (wl_bufcount et al.), wl_bufs, and the head and tail offsets.
	 */

	/*
	 * Callback called from within the flush routine to flush any extra
	 * bits.  Note that flush may be skipped without calling this if
	 * there are no outstanding buffers in the transaction.
	 */
#ifdef _KERNEL
	wapbl_flush_fn_t wl_flush;	/* r	*/
	wapbl_flush_fn_t wl_flush_abort;/* r	*/
#endif

	size_t wl_bufbytes;	/* m:	Byte count of pages in wl_bufs */
	size_t wl_bufcount;	/* m:	Count of buffers in wl_bufs */
	size_t wl_bcount;	/* m:	Total bcount of wl_bufs */

	LIST_HEAD(, buf) wl_bufs; /* m:	Buffers in current transaction */

	kcondvar_t wl_reclaimable_cv;	/* m (obviously) */
	size_t wl_reclaimable_bytes;	/* m:	Amount of space available for
					   reclamation by truncate */
	int wl_error_count;	/* m:	# of wl_entries with errors */
	size_t wl_reserved_bytes; /* never truncate log smaller than this */

#ifdef WAPBL_DEBUG_BUFBYTES
	size_t wl_unsynced_bufbytes; /* Byte count of unsynced buffers */
#endif

#ifdef _KERNEL
	int wl_brperjblock;	/* r:	Block records per journal block */
#endif

	TAILQ_HEAD(, wapbl_dealloc) wl_dealloclist;	/* lm:	list head */
	int wl_dealloccnt;				/* lm:	total count */
	int wl_dealloclim;				/* r:	max count */

	/* hashtable of inode numbers for allocated but unlinked inodes */
	/* synch ??? */
	struct wapbl_ino_head *wl_inohash;
	u_long wl_inohashmask;
	int wl_inohashcnt;

	SIMPLEQ_HEAD(, wapbl_entry) wl_entries; /* On disk transaction
						   accounting */

	u_char *wl_buffer;	/* l:	buffer for wapbl_buffered_write() */
	daddr_t wl_buffer_dblk;	/* l:	buffer disk block address */
	size_t wl_buffer_used;	/* l:	buffer current use */
};

#ifdef WAPBL_DEBUG_PRINT
int wapbl_debug_print = WAPBL_DEBUG_PRINT;
#endif

232#ifdef _KERNEL
233
234#ifdef WAPBL_DEBUG
235struct wapbl *wapbl_debug_wl;
236#endif
237
238static int wapbl_write_commit(struct wapbl *wl, off_t head, off_t tail);
239static int wapbl_write_blocks(struct wapbl *wl, off_t *offp);
240static int wapbl_write_revocations(struct wapbl *wl, off_t *offp);
241static int wapbl_write_inodes(struct wapbl *wl, off_t *offp);
242#endif /* _KERNEL */
243
244static int wapbl_replay_process(struct wapbl_replay *wr, off_t, off_t);
245
246static inline size_t wapbl_space_used(size_t avail, off_t head,
247 off_t tail);
248
249#ifdef _KERNEL
250
251static struct pool wapbl_entry_pool;
252static struct pool wapbl_dealloc_pool;
253
254#define WAPBL_INODETRK_SIZE 83
255static int wapbl_ino_pool_refcount;
256static struct pool wapbl_ino_pool;
257struct wapbl_ino {
258 LIST_ENTRY(wapbl_ino) wi_hash;
259 ino_t wi_ino;
260 mode_t wi_mode;
261};
262
263static void wapbl_inodetrk_init(struct wapbl *wl, u_int size);
264static void wapbl_inodetrk_free(struct wapbl *wl);
265static struct wapbl_ino *wapbl_inodetrk_get(struct wapbl *wl, ino_t ino);
266
267static size_t wapbl_transaction_len(struct wapbl *wl);
268static inline size_t wapbl_transaction_inodes_len(struct wapbl *wl);
269
270static void wapbl_deallocation_free(struct wapbl *, struct wapbl_dealloc *,
271 bool);
272
273#if 0
274int wapbl_replay_verify(struct wapbl_replay *, struct vnode *);
275#endif
276
277static int wapbl_replay_isopen1(struct wapbl_replay *);
278
279struct wapbl_ops wapbl_ops = {
280 .wo_wapbl_discard = wapbl_discard,
281 .wo_wapbl_replay_isopen = wapbl_replay_isopen1,
282 .wo_wapbl_replay_can_read = wapbl_replay_can_read,
283 .wo_wapbl_replay_read = wapbl_replay_read,
284 .wo_wapbl_add_buf = wapbl_add_buf,
285 .wo_wapbl_remove_buf = wapbl_remove_buf,
286 .wo_wapbl_resize_buf = wapbl_resize_buf,
287 .wo_wapbl_begin = wapbl_begin,
288 .wo_wapbl_end = wapbl_end,
289 .wo_wapbl_junlock_assert= wapbl_junlock_assert,
290
291 /* XXX: the following is only used to say "this is a wapbl buf" */
292 .wo_wapbl_biodone = wapbl_biodone,
293};
294

static int
wapbl_sysctl_init(void)
{
	int rv;
	const struct sysctlnode *rnode, *cnode;

	wapbl_sysctl = NULL;

	rv = sysctl_createv(&wapbl_sysctl, 0, NULL, &rnode,
		       CTLFLAG_PERMANENT,
		       CTLTYPE_NODE, "wapbl",
		       SYSCTL_DESCR("WAPBL journaling options"),
		       NULL, 0, NULL, 0,
		       CTL_VFS, CTL_CREATE, CTL_EOL);
	if (rv)
		return rv;

	rv = sysctl_createv(&wapbl_sysctl, 0, &rnode, &cnode,
		       CTLFLAG_PERMANENT|CTLFLAG_READWRITE,
		       CTLTYPE_INT, "flush_disk_cache",
		       SYSCTL_DESCR("flush disk cache"),
		       NULL, 0, &wapbl_flush_disk_cache, 0,
		       CTL_CREATE, CTL_EOL);
	if (rv)
		return rv;

	rv = sysctl_createv(&wapbl_sysctl, 0, &rnode, &cnode,
		       CTLFLAG_PERMANENT|CTLFLAG_READWRITE,
		       CTLTYPE_INT, "verbose_commit",
		       SYSCTL_DESCR("show time and size of wapbl log commits"),
		       NULL, 0, &wapbl_verbose_commit, 0,
		       CTL_CREATE, CTL_EOL);
	return rv;
}

static void
wapbl_init(void)
{

	pool_init(&wapbl_entry_pool, sizeof(struct wapbl_entry), 0, 0, 0,
	    "wapblentrypl", &pool_allocator_kmem, IPL_VM);
	pool_init(&wapbl_dealloc_pool, sizeof(struct wapbl_dealloc), 0, 0, 0,
	    "wapbldealloc", &pool_allocator_nointr, IPL_NONE);

	wapbl_sysctl_init();
}

static int
wapbl_fini(void)
{

	if (wapbl_sysctl != NULL)
		sysctl_teardown(&wapbl_sysctl);

	pool_destroy(&wapbl_dealloc_pool);
	pool_destroy(&wapbl_entry_pool);

	return 0;
}

static int
wapbl_start_flush_inodes(struct wapbl *wl, struct wapbl_replay *wr)
{
	int error, i;

	WAPBL_PRINTF(WAPBL_PRINT_REPLAY,
	    ("wapbl_start: reusing log with %d inodes\n", wr->wr_inodescnt));

	/*
	 * It's only valid to reuse the replay log if it's
	 * the same as the new log we just opened.
	 */
	KDASSERT(!wapbl_replay_isopen(wr));
	KASSERT(wl->wl_devvp->v_type == VBLK);
	KASSERT(wr->wr_devvp->v_type == VBLK);
	KASSERT(wl->wl_devvp->v_rdev == wr->wr_devvp->v_rdev);
	KASSERT(wl->wl_logpbn == wr->wr_logpbn);
	KASSERT(wl->wl_circ_size == wr->wr_circ_size);
	KASSERT(wl->wl_circ_off == wr->wr_circ_off);
	KASSERT(wl->wl_log_dev_bshift == wr->wr_log_dev_bshift);
	KASSERT(wl->wl_fs_dev_bshift == wr->wr_fs_dev_bshift);

	wl->wl_wc_header->wc_generation = wr->wr_generation + 1;

	for (i = 0; i < wr->wr_inodescnt; i++)
		wapbl_register_inode(wl, wr->wr_inodes[i].wr_inumber,
		    wr->wr_inodes[i].wr_imode);

	/* Make sure new transaction won't overwrite old inodes list */
	KDASSERT(wapbl_transaction_len(wl) <=
	    wapbl_space_free(wl->wl_circ_size, wr->wr_inodeshead,
	    wr->wr_inodestail));

	wl->wl_head = wl->wl_tail = wr->wr_inodeshead;
	wl->wl_reclaimable_bytes = wl->wl_reserved_bytes =
	    wapbl_transaction_len(wl);

	error = wapbl_write_inodes(wl, &wl->wl_head);
	if (error)
		return error;

	KASSERT(wl->wl_head != wl->wl_tail);
	KASSERT(wl->wl_head != 0);

	return 0;
}

int
wapbl_start(struct wapbl ** wlp, struct mount *mp, struct vnode *vp,
	daddr_t off, size_t count, size_t blksize, struct wapbl_replay *wr,
	wapbl_flush_fn_t flushfn, wapbl_flush_fn_t flushabortfn)
{
	struct wapbl *wl;
	struct vnode *devvp;
	daddr_t logpbn;
	int error;
	int log_dev_bshift = ilog2(blksize);
	int fs_dev_bshift = log_dev_bshift;
	int run;

	WAPBL_PRINTF(WAPBL_PRINT_OPEN, ("wapbl_start: vp=%p off=%" PRId64
	    " count=%zu blksize=%zu\n", vp, off, count, blksize));

	if (log_dev_bshift > fs_dev_bshift) {
		WAPBL_PRINTF(WAPBL_PRINT_OPEN,
			("wapbl: log device's block size cannot be larger "
			 "than filesystem's\n"));
		/*
		 * Not currently implemented, although it could be if
		 * needed someday.
		 */
		return ENOSYS;
	}

	if (off < 0)
		return EINVAL;

	if (blksize < DEV_BSIZE)
		return EINVAL;
	if (blksize % DEV_BSIZE)
		return EINVAL;

	/* XXXTODO: verify that the full load is writable */

	/*
	 * XXX check for minimum log size
	 * minimum is governed by minimum amount of space
	 * to complete a transaction. (probably truncate)
	 */
	/* XXX for now pick something minimal */
	if ((count * blksize) < MAXPHYS) {
		return ENOSPC;
	}

	if ((error = VOP_BMAP(vp, off, &devvp, &logpbn, &run)) != 0) {
		return error;
	}

	wl = wapbl_calloc(1, sizeof(*wl));
	rw_init(&wl->wl_rwlock);
	mutex_init(&wl->wl_mtx, MUTEX_DEFAULT, IPL_NONE);
	cv_init(&wl->wl_reclaimable_cv, "wapblrec");
	LIST_INIT(&wl->wl_bufs);
	SIMPLEQ_INIT(&wl->wl_entries);

	wl->wl_logvp = vp;
	wl->wl_devvp = devvp;
	wl->wl_mount = mp;
	wl->wl_logpbn = logpbn;
	wl->wl_log_dev_bshift = log_dev_bshift;
	wl->wl_fs_dev_bshift = fs_dev_bshift;

	wl->wl_flush = flushfn;
	wl->wl_flush_abort = flushabortfn;

	/* Reserve two log device blocks for the commit headers */
	wl->wl_circ_off = 2<<wl->wl_log_dev_bshift;
	wl->wl_circ_size = ((count * blksize) - wl->wl_circ_off);
	/* truncate the log usage to a multiple of log_dev_bshift */
	wl->wl_circ_size >>= wl->wl_log_dev_bshift;
	wl->wl_circ_size <<= wl->wl_log_dev_bshift;

	/*
	 * wl_bufbytes_max limits the size of the in-memory transaction space.
	 * - Since buffers are allocated and accounted for in units of
	 *   PAGE_SIZE it is required to be a multiple of PAGE_SIZE
	 *   (i.e. 1<<PAGE_SHIFT)
	 * - Since the log device has to be written in units of
	 *   1<<wl_log_dev_bshift it is required to be a multiple of
	 *   1<<wl_log_dev_bshift.
	 * - Since the filesystem will provide data in units of
	 *   1<<wl_fs_dev_bshift, it is convenient for it to be a
	 *   multiple of 1<<wl_fs_dev_bshift.
	 * Therefore it must be a multiple of the least common multiple of
	 * those three quantities.  Fortunately, all of those quantities are
	 * guaranteed to be a power of two, and the least common multiple of
	 * a set of numbers which are all powers of two is simply the maximum
	 * of those numbers.  Finally, the maximum logarithm of a power of
	 * two is the same as the log of the maximum power of two.  So we
	 * can do the following operations to size wl_bufbytes_max:
	 */

	/* XXX fix actual number of pages reserved per filesystem. */
	wl->wl_bufbytes_max = MIN(wl->wl_circ_size, buf_memcalc() / 2);

	/*
	 * Round wl_bufbytes_max down to a multiple of each of the
	 * power-of-two constraints (in effect, the largest of them).
	 */
	wl->wl_bufbytes_max >>= PAGE_SHIFT;
	wl->wl_bufbytes_max <<= PAGE_SHIFT;
	wl->wl_bufbytes_max >>= wl->wl_log_dev_bshift;
	wl->wl_bufbytes_max <<= wl->wl_log_dev_bshift;
	wl->wl_bufbytes_max >>= wl->wl_fs_dev_bshift;
	wl->wl_bufbytes_max <<= wl->wl_fs_dev_bshift;
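
	/*
	 * For illustration (hypothetical values): with PAGE_SHIFT = 12
	 * and both bshifts = 9, PAGE_SIZE is the dominant constraint,
	 * so a candidate limit of 70000 bytes rounds down to 69632
	 * (17 pages); the subsequent 9-bit shift pairs then leave it
	 * unchanged, since 69632 is already a multiple of 512.
	 */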

	/* XXX maybe use filesystem fragment size instead of 1024 */
	/* XXX fix actual number of buffers reserved per filesystem. */
	wl->wl_bufcount_max = (nbuf / 2) * 1024;

	wl->wl_brperjblock = ((1<<wl->wl_log_dev_bshift)
	    - offsetof(struct wapbl_wc_blocklist, wc_blocks)) /
	    sizeof(((struct wapbl_wc_blocklist *)0)->wc_blocks[0]);
	KASSERT(wl->wl_brperjblock > 0);

	/* XXX tie this into resource estimation */
	wl->wl_dealloclim = wl->wl_bufbytes_max / mp->mnt_stat.f_bsize / 2;
	TAILQ_INIT(&wl->wl_dealloclist);

	wl->wl_buffer = wapbl_alloc(MAXPHYS);
	wl->wl_buffer_used = 0;

	wapbl_inodetrk_init(wl, WAPBL_INODETRK_SIZE);

	/* Initialize the commit header */
	{
		struct wapbl_wc_header *wc;
		size_t len = 1 << wl->wl_log_dev_bshift;
		wc = wapbl_calloc(1, len);
		wc->wc_type = WAPBL_WC_HEADER;
		wc->wc_len = len;
		wc->wc_circ_off = wl->wl_circ_off;
		wc->wc_circ_size = wl->wl_circ_size;
		/* XXX wc->wc_fsid */
		wc->wc_log_dev_bshift = wl->wl_log_dev_bshift;
		wc->wc_fs_dev_bshift = wl->wl_fs_dev_bshift;
		wl->wl_wc_header = wc;
		wl->wl_wc_scratch = wapbl_alloc(len);
	}

	/*
	 * if there was an existing set of unlinked but
	 * allocated inodes, preserve it in the new
	 * log.
	 */
	if (wr && wr->wr_inodescnt) {
		error = wapbl_start_flush_inodes(wl, wr);
		if (error)
			goto errout;
	}

	error = wapbl_write_commit(wl, wl->wl_head, wl->wl_tail);
	if (error) {
		goto errout;
	}

	*wlp = wl;
#if defined(WAPBL_DEBUG)
	wapbl_debug_wl = wl;
#endif

	return 0;
 errout:
	wapbl_discard(wl);
	wapbl_free(wl->wl_wc_scratch, wl->wl_wc_header->wc_len);
	wapbl_free(wl->wl_wc_header, wl->wl_wc_header->wc_len);
	wapbl_free(wl->wl_buffer, MAXPHYS);
	wapbl_inodetrk_free(wl);
	wapbl_free(wl, sizeof(*wl));

	return error;
}

/*
 * wapbl_discard(wl)
 *
 *	Like wapbl_flush, but discards the transaction completely.
 */

void
wapbl_discard(struct wapbl *wl)
{
	struct wapbl_entry *we;
	struct wapbl_dealloc *wd;
	struct buf *bp;
	int i;

	/*
	 * XXX we may consider using upgrade here
	 * if we want to call flush from inside a transaction
	 */
	rw_enter(&wl->wl_rwlock, RW_WRITER);
	wl->wl_flush(wl->wl_mount, TAILQ_FIRST(&wl->wl_dealloclist));

#ifdef WAPBL_DEBUG_PRINT
	{
		pid_t pid = -1;
		lwpid_t lid = -1;
		if (curproc)
			pid = curproc->p_pid;
		if (curlwp)
			lid = curlwp->l_lid;
#ifdef WAPBL_DEBUG_BUFBYTES
		WAPBL_PRINTF(WAPBL_PRINT_DISCARD,
		    ("wapbl_discard: thread %d.%d discarding "
		    "transaction\n"
		    "\tbufcount=%zu bufbytes=%zu bcount=%zu "
		    "deallocs=%d inodes=%d\n"
		    "\terrcnt = %u, reclaimable=%zu reserved=%zu "
		    "unsynced=%zu\n",
		    pid, lid, wl->wl_bufcount, wl->wl_bufbytes,
		    wl->wl_bcount, wl->wl_dealloccnt,
		    wl->wl_inohashcnt, wl->wl_error_count,
		    wl->wl_reclaimable_bytes, wl->wl_reserved_bytes,
		    wl->wl_unsynced_bufbytes));
		SIMPLEQ_FOREACH(we, &wl->wl_entries, we_entries) {
			WAPBL_PRINTF(WAPBL_PRINT_DISCARD,
			    ("\tentry: bufcount = %zu, reclaimable = %zu, "
			    "error = %d, unsynced = %zu\n",
			    we->we_bufcount, we->we_reclaimable_bytes,
			    we->we_error, we->we_unsynced_bufbytes));
		}
#else /* !WAPBL_DEBUG_BUFBYTES */
		WAPBL_PRINTF(WAPBL_PRINT_DISCARD,
		    ("wapbl_discard: thread %d.%d discarding transaction\n"
		    "\tbufcount=%zu bufbytes=%zu bcount=%zu "
		    "deallocs=%d inodes=%d\n"
		    "\terrcnt = %u, reclaimable=%zu reserved=%zu\n",
		    pid, lid, wl->wl_bufcount, wl->wl_bufbytes,
		    wl->wl_bcount, wl->wl_dealloccnt,
		    wl->wl_inohashcnt, wl->wl_error_count,
		    wl->wl_reclaimable_bytes, wl->wl_reserved_bytes));
		SIMPLEQ_FOREACH(we, &wl->wl_entries, we_entries) {
			WAPBL_PRINTF(WAPBL_PRINT_DISCARD,
			    ("\tentry: bufcount = %zu, reclaimable = %zu, "
			    "error = %d\n",
			    we->we_bufcount, we->we_reclaimable_bytes,
			    we->we_error));
		}
#endif /* !WAPBL_DEBUG_BUFBYTES */
	}
#endif /* WAPBL_DEBUG_PRINT */

	for (i = 0; i <= wl->wl_inohashmask; i++) {
		struct wapbl_ino_head *wih;
		struct wapbl_ino *wi;

		wih = &wl->wl_inohash[i];
		while ((wi = LIST_FIRST(wih)) != NULL) {
			LIST_REMOVE(wi, wi_hash);
			pool_put(&wapbl_ino_pool, wi);
			KASSERT(wl->wl_inohashcnt > 0);
			wl->wl_inohashcnt--;
		}
	}

	/*
	 * clean buffer list
	 */
	mutex_enter(&bufcache_lock);
	mutex_enter(&wl->wl_mtx);
	while ((bp = LIST_FIRST(&wl->wl_bufs)) != NULL) {
		if (bbusy(bp, 0, 0, &wl->wl_mtx) == 0) {
			/*
			 * The buffer will be unlocked and
			 * removed from the transaction in brelse
			 */
			mutex_exit(&wl->wl_mtx);
			brelsel(bp, 0);
			mutex_enter(&wl->wl_mtx);
		}
	}
	mutex_exit(&wl->wl_mtx);
	mutex_exit(&bufcache_lock);

	/*
	 * Remove references to this wl from wl_entries, free any which
	 * no longer have buffers, others will be freed in wapbl_biodone
	 * when they no longer have any buffers.
	 */
	while ((we = SIMPLEQ_FIRST(&wl->wl_entries)) != NULL) {
		SIMPLEQ_REMOVE_HEAD(&wl->wl_entries, we_entries);
		/* XXX should we be accumulating wl_error_count
		 * and increasing reclaimable bytes ? */
		we->we_wapbl = NULL;
		if (we->we_bufcount == 0) {
#ifdef WAPBL_DEBUG_BUFBYTES
			KASSERT(we->we_unsynced_bufbytes == 0);
#endif
			pool_put(&wapbl_entry_pool, we);
		}
	}

	/* Discard list of deallocs */
	while ((wd = TAILQ_FIRST(&wl->wl_dealloclist)) != NULL)
		wapbl_deallocation_free(wl, wd, true);

	/* XXX should we clear wl_reserved_bytes? */

	KASSERT(wl->wl_bufbytes == 0);
	KASSERT(wl->wl_bcount == 0);
	KASSERT(wl->wl_bufcount == 0);
	KASSERT(LIST_EMPTY(&wl->wl_bufs));
	KASSERT(SIMPLEQ_EMPTY(&wl->wl_entries));
	KASSERT(wl->wl_inohashcnt == 0);
	KASSERT(TAILQ_EMPTY(&wl->wl_dealloclist));
	KASSERT(wl->wl_dealloccnt == 0);

	rw_exit(&wl->wl_rwlock);
}

int
wapbl_stop(struct wapbl *wl, int force)
{
	int error;

	WAPBL_PRINTF(WAPBL_PRINT_OPEN, ("wapbl_stop called\n"));
	error = wapbl_flush(wl, 1);
	if (error) {
		if (force)
			wapbl_discard(wl);
		else
			return error;
	}

	/* Unlinked inodes persist after a flush */
	if (wl->wl_inohashcnt) {
		if (force) {
			wapbl_discard(wl);
		} else {
			return EBUSY;
		}
	}

	KASSERT(wl->wl_bufbytes == 0);
	KASSERT(wl->wl_bcount == 0);
	KASSERT(wl->wl_bufcount == 0);
	KASSERT(LIST_EMPTY(&wl->wl_bufs));
	KASSERT(wl->wl_dealloccnt == 0);
	KASSERT(SIMPLEQ_EMPTY(&wl->wl_entries));
	KASSERT(wl->wl_inohashcnt == 0);
	KASSERT(TAILQ_EMPTY(&wl->wl_dealloclist));
	KASSERT(wl->wl_dealloccnt == 0);

	wapbl_free(wl->wl_wc_scratch, wl->wl_wc_header->wc_len);
	wapbl_free(wl->wl_wc_header, wl->wl_wc_header->wc_len);
	wapbl_free(wl->wl_buffer, MAXPHYS);
	wapbl_inodetrk_free(wl);

	cv_destroy(&wl->wl_reclaimable_cv);
	mutex_destroy(&wl->wl_mtx);
	rw_destroy(&wl->wl_rwlock);
	wapbl_free(wl, sizeof(*wl));

	return 0;
}

/****************************************************************/
/*
 * Unbuffered disk I/O
 */

static int
wapbl_doio(void *data, size_t len, struct vnode *devvp, daddr_t pbn, int flags)
{
	struct pstats *pstats = curlwp->l_proc->p_stats;
	struct buf *bp;
	int error;

	KASSERT((flags & ~(B_WRITE | B_READ)) == 0);
	KASSERT(devvp->v_type == VBLK);

	if ((flags & (B_WRITE | B_READ)) == B_WRITE) {
		mutex_enter(devvp->v_interlock);
		devvp->v_numoutput++;
		mutex_exit(devvp->v_interlock);
		pstats->p_ru.ru_oublock++;
	} else {
		pstats->p_ru.ru_inblock++;
	}

	bp = getiobuf(devvp, true);
	bp->b_flags = flags;
	bp->b_cflags = BC_BUSY;	/* silly & dubious */
	bp->b_dev = devvp->v_rdev;
	bp->b_data = data;
	bp->b_bufsize = bp->b_resid = bp->b_bcount = len;
	bp->b_blkno = pbn;
	BIO_SETPRIO(bp, BPRIO_TIMECRITICAL);

	WAPBL_PRINTF(WAPBL_PRINT_IO,
	    ("wapbl_doio: %s %d bytes at block %"PRId64" on dev 0x%"PRIx64"\n",
	    BUF_ISWRITE(bp) ? "write" : "read", bp->b_bcount,
	    bp->b_blkno, bp->b_dev));

	VOP_STRATEGY(devvp, bp);

	error = biowait(bp);
	putiobuf(bp);

	if (error) {
		WAPBL_PRINTF(WAPBL_PRINT_ERROR,
		    ("wapbl_doio: %s %zu bytes at block %" PRId64
		    " on dev 0x%"PRIx64" failed with error %d\n",
		    (((flags & (B_WRITE | B_READ)) == B_WRITE) ?
		     "write" : "read"),
		    len, pbn, devvp->v_rdev, error));
	}

	return error;
}

/*
 * wapbl_write(data, len, devvp, pbn)
 *
 *	Synchronously write len bytes from data to physical block pbn
 *	on devvp.
 */
int
wapbl_write(void *data, size_t len, struct vnode *devvp, daddr_t pbn)
{

	return wapbl_doio(data, len, devvp, pbn, B_WRITE);
}

/*
 * wapbl_read(data, len, devvp, pbn)
 *
 *	Synchronously read len bytes into data from physical block pbn
 *	on devvp.
 */
int
wapbl_read(void *data, size_t len, struct vnode *devvp, daddr_t pbn)
{

	return wapbl_doio(data, len, devvp, pbn, B_READ);
}

/****************************************************************/
/*
 * Buffered disk writes -- try to coalesce writes and emit
 * MAXPHYS-aligned blocks.
 */

/*
 * wapbl_buffered_flush(wl)
 *
 *	Flush any buffered writes from wapbl_buffered_write.
 */
static int
wapbl_buffered_flush(struct wapbl *wl)
{
	int error;

	if (wl->wl_buffer_used == 0)
		return 0;

	error = wapbl_doio(wl->wl_buffer, wl->wl_buffer_used,
	    wl->wl_devvp, wl->wl_buffer_dblk, B_WRITE);
	wl->wl_buffer_used = 0;

	return error;
}

/*
 * wapbl_buffered_write(data, len, wl, pbn)
 *
 *	Write len bytes from data to physical block pbn on
 *	wl->wl_devvp.  The write may not complete until
 *	wapbl_buffered_flush.
 */
static int
wapbl_buffered_write(void *data, size_t len, struct wapbl *wl, daddr_t pbn)
{
	int error;
	size_t resid;

	/*
	 * If not adjacent to the buffered data, flush first.  The disk
	 * block address is always valid for a non-empty buffer.
	 */
	if (wl->wl_buffer_used > 0 &&
	    pbn != wl->wl_buffer_dblk + btodb(wl->wl_buffer_used)) {
		error = wapbl_buffered_flush(wl);
		if (error)
			return error;
	}
	/*
	 * If this write goes to an empty buffer, we have to
	 * save the disk block address first.
	 */
	if (wl->wl_buffer_used == 0)
		wl->wl_buffer_dblk = pbn;
	/*
	 * Compute the remaining space so that this buffer ends on a
	 * MAXPHYS boundary.
	 *
	 * This cannot be zero or negative, because in that case the
	 * buffer would already have been flushed by the previous call.
	 */
	resid = MAXPHYS - dbtob(wl->wl_buffer_dblk % btodb(MAXPHYS)) -
	    wl->wl_buffer_used;
	KASSERT(resid > 0);
	KASSERT(dbtob(btodb(resid)) == resid);
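
	/*
	 * For example (assuming MAXPHYS = 64 KiB and DEV_BSIZE = 512):
	 * if wl_buffer_dblk is 10 disk blocks past a MAXPHYS boundary
	 * and wl_buffer_used is 1024, then resid = 65536 - 5120 - 1024
	 * = 59392, i.e. 59392 more bytes fit in this buffer before it
	 * must be written out to keep the output MAXPHYS-aligned.
	 */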
	if (len >= resid) {
		memcpy(wl->wl_buffer + wl->wl_buffer_used, data, resid);
		wl->wl_buffer_used += resid;
		error = wapbl_doio(wl->wl_buffer, wl->wl_buffer_used,
		    wl->wl_devvp, wl->wl_buffer_dblk, B_WRITE);
		data = (uint8_t *)data + resid;
		len -= resid;
		wl->wl_buffer_dblk = pbn + btodb(resid);
		wl->wl_buffer_used = 0;
		if (error)
			return error;
	}
	KASSERT(len < MAXPHYS);
	if (len > 0) {
		memcpy(wl->wl_buffer + wl->wl_buffer_used, data, len);
		wl->wl_buffer_used += len;
	}

	return 0;
}

/*
 * wapbl_circ_write(wl, data, len, offp)
 *
 *	Write len bytes from data to the circular queue of wl, starting
 *	at linear byte offset *offp, and returning the new linear byte
 *	offset in *offp.
 *
 *	If the starting linear byte offset precedes wl->wl_circ_off,
 *	the write instead begins at wl->wl_circ_off.  XXX WTF?  This
 *	should be a KASSERT, not a conditional.
 *
 *	The write is buffered in wl and must be flushed with
 *	wapbl_buffered_flush before it will be submitted to the disk.
 */
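/*
 * Example of the wraparound case (illustrative numbers): with
 * wl_circ_off = 1024 and wl_circ_size = 64512, a 4096-byte write
 * starting at *offp = 64000 has only slen = 1024 + 64512 - 64000 =
 * 1536 bytes left before the end of the circle, so 1536 bytes are
 * written there and the remaining 2560 bytes continue at wl_circ_off;
 * *offp comes back as 1024 + 2560 = 3584.
 */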
static int
wapbl_circ_write(struct wapbl *wl, void *data, size_t len, off_t *offp)
{
	size_t slen;
	off_t off = *offp;
	int error;
	daddr_t pbn;

	KDASSERT(((len >> wl->wl_log_dev_bshift) <<
	    wl->wl_log_dev_bshift) == len);

	if (off < wl->wl_circ_off)
		off = wl->wl_circ_off;
	slen = wl->wl_circ_off + wl->wl_circ_size - off;
	if (slen < len) {
		pbn = wl->wl_logpbn + (off >> wl->wl_log_dev_bshift);
#ifdef _KERNEL
		pbn = btodb(pbn << wl->wl_log_dev_bshift);
#endif
		error = wapbl_buffered_write(data, slen, wl, pbn);
		if (error)
			return error;
		data = (uint8_t *)data + slen;
		len -= slen;
		off = wl->wl_circ_off;
	}
	pbn = wl->wl_logpbn + (off >> wl->wl_log_dev_bshift);
#ifdef _KERNEL
	pbn = btodb(pbn << wl->wl_log_dev_bshift);
#endif
	error = wapbl_buffered_write(data, len, wl, pbn);
	if (error)
		return error;
	off += len;
	if (off >= wl->wl_circ_off + wl->wl_circ_size)
		off = wl->wl_circ_off;
	*offp = off;
	return 0;
}

/****************************************************************/
/*
 * WAPBL transactions: entering, adding/removing bufs, and exiting
 */

int
wapbl_begin(struct wapbl *wl, const char *file, int line)
{
	int doflush;
	unsigned lockcount;

	KDASSERT(wl);

	/*
	 * XXX this needs to be made much more sophisticated.
	 * perhaps each wapbl_begin could reserve a specified
	 * number of buffers and bytes.
	 */
	mutex_enter(&wl->wl_mtx);
	lockcount = wl->wl_lock_count;
	doflush = ((wl->wl_bufbytes + (lockcount * MAXPHYS)) >
		   wl->wl_bufbytes_max / 2) ||
		  ((wl->wl_bufcount + (lockcount * 10)) >
		   wl->wl_bufcount_max / 2) ||
		  (wapbl_transaction_len(wl) > wl->wl_circ_size / 2) ||
		  (wl->wl_dealloccnt >= (wl->wl_dealloclim / 2));
	mutex_exit(&wl->wl_mtx);

	if (doflush) {
		WAPBL_PRINTF(WAPBL_PRINT_FLUSH,
		    ("force flush lockcnt=%d bufbytes=%zu "
		    "(max=%zu) bufcount=%zu (max=%zu) "
		    "dealloccnt %d (lim=%d)\n",
		    lockcount, wl->wl_bufbytes,
		    wl->wl_bufbytes_max, wl->wl_bufcount,
		    wl->wl_bufcount_max,
		    wl->wl_dealloccnt, wl->wl_dealloclim));
	}

	if (doflush) {
		int error = wapbl_flush(wl, 0);
		if (error)
			return error;
	}

	rw_enter(&wl->wl_rwlock, RW_READER);
	mutex_enter(&wl->wl_mtx);
	wl->wl_lock_count++;
	mutex_exit(&wl->wl_mtx);

#if defined(WAPBL_DEBUG_PRINT)
	WAPBL_PRINTF(WAPBL_PRINT_TRANSACTION,
	    ("wapbl_begin thread %d.%d with bufcount=%zu "
	    "bufbytes=%zu bcount=%zu at %s:%d\n",
	    curproc->p_pid, curlwp->l_lid, wl->wl_bufcount,
	    wl->wl_bufbytes, wl->wl_bcount, file, line));
#endif

	return 0;
}

void
wapbl_end(struct wapbl *wl)
{

#if defined(WAPBL_DEBUG_PRINT)
	WAPBL_PRINTF(WAPBL_PRINT_TRANSACTION,
	    ("wapbl_end thread %d.%d with bufcount=%zu "
	    "bufbytes=%zu bcount=%zu\n",
	    curproc->p_pid, curlwp->l_lid, wl->wl_bufcount,
	    wl->wl_bufbytes, wl->wl_bcount));
#endif

	/*
	 * XXX this could be handled more gracefully, perhaps place
	 * only a partial transaction in the log and allow the
	 * remaining to flush without the protection of the journal.
	 */
	KASSERTMSG((wapbl_transaction_len(wl) <=
		(wl->wl_circ_size - wl->wl_reserved_bytes)),
	    "wapbl_end: current transaction too big to flush");

	mutex_enter(&wl->wl_mtx);
	KASSERT(wl->wl_lock_count > 0);
	wl->wl_lock_count--;
	mutex_exit(&wl->wl_mtx);

	rw_exit(&wl->wl_rwlock);
}

void
wapbl_add_buf(struct wapbl *wl, struct buf * bp)
{

	KASSERT(bp->b_cflags & BC_BUSY);
	KASSERT(bp->b_vp);

	wapbl_jlock_assert(wl);

#if 0
	/*
	 * XXX this might be an issue for swapfiles.
	 * see uvm_swap.c:1702
	 *
	 * XXX2 why require it then? leap of semantics?
	 */
	KASSERT((bp->b_cflags & BC_NOCACHE) == 0);
#endif

	mutex_enter(&wl->wl_mtx);
	if (bp->b_flags & B_LOCKED) {
		LIST_REMOVE(bp, b_wapbllist);
		WAPBL_PRINTF(WAPBL_PRINT_BUFFER2,
		   ("wapbl_add_buf thread %d.%d re-adding buf %p "
		    "with %d bytes %d bcount\n",
		    curproc->p_pid, curlwp->l_lid, bp, bp->b_bufsize,
		    bp->b_bcount));
	} else {
		/* unlocked but dirty buffers shouldn't exist */
		KASSERT(!(bp->b_oflags & BO_DELWRI));
		wl->wl_bufbytes += bp->b_bufsize;
		wl->wl_bcount += bp->b_bcount;
		wl->wl_bufcount++;
		WAPBL_PRINTF(WAPBL_PRINT_BUFFER,
		   ("wapbl_add_buf thread %d.%d adding buf %p "
		    "with %d bytes %d bcount\n",
		    curproc->p_pid, curlwp->l_lid, bp, bp->b_bufsize,
		    bp->b_bcount));
	}
	LIST_INSERT_HEAD(&wl->wl_bufs, bp, b_wapbllist);
	mutex_exit(&wl->wl_mtx);

	bp->b_flags |= B_LOCKED;
}

static void
wapbl_remove_buf_locked(struct wapbl * wl, struct buf *bp)
{

	KASSERT(mutex_owned(&wl->wl_mtx));
	KASSERT(bp->b_cflags & BC_BUSY);
	wapbl_jlock_assert(wl);

#if 0
	/*
	 * XXX this might be an issue for swapfiles.
	 * see uvm_swap.c:1725
	 *
	 * XXXdeux: see above
	 */
	KASSERT((bp->b_flags & BC_NOCACHE) == 0);
#endif
	KASSERT(bp->b_flags & B_LOCKED);

	WAPBL_PRINTF(WAPBL_PRINT_BUFFER,
	   ("wapbl_remove_buf thread %d.%d removing buf %p with "
	    "%d bytes %d bcount\n",
	    curproc->p_pid, curlwp->l_lid, bp, bp->b_bufsize, bp->b_bcount));

	KASSERT(wl->wl_bufbytes >= bp->b_bufsize);
	wl->wl_bufbytes -= bp->b_bufsize;
	KASSERT(wl->wl_bcount >= bp->b_bcount);
	wl->wl_bcount -= bp->b_bcount;
	KASSERT(wl->wl_bufcount > 0);
	wl->wl_bufcount--;
	KASSERT((wl->wl_bufcount == 0) == (wl->wl_bufbytes == 0));
	KASSERT((wl->wl_bufcount == 0) == (wl->wl_bcount == 0));
	LIST_REMOVE(bp, b_wapbllist);

	bp->b_flags &= ~B_LOCKED;
}

/* called from brelsel() in vfs_bio among other places */
void
wapbl_remove_buf(struct wapbl * wl, struct buf *bp)
{

	mutex_enter(&wl->wl_mtx);
	wapbl_remove_buf_locked(wl, bp);
	mutex_exit(&wl->wl_mtx);
}

void
wapbl_resize_buf(struct wapbl *wl, struct buf *bp, long oldsz, long oldcnt)
{

	KASSERT(bp->b_cflags & BC_BUSY);

	/*
	 * XXX: why does this depend on B_LOCKED?  otherwise the buf
	 * is not for a transaction?  if so, why is this called in the
	 * first place?
	 */
	if (bp->b_flags & B_LOCKED) {
		mutex_enter(&wl->wl_mtx);
		wl->wl_bufbytes += bp->b_bufsize - oldsz;
		wl->wl_bcount += bp->b_bcount - oldcnt;
		mutex_exit(&wl->wl_mtx);
	}
}

#endif /* _KERNEL */

/****************************************************************/
/* Some utility inlines */

/*
 * wapbl_space_used(avail, head, tail)
 *
 *	Number of bytes used in a circular queue of avail total bytes,
 *	from tail to head.
 */
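/*
 * E.g. (illustrative): with avail = 100, tail = 30 and head = 40,
 * ((40 + 99 - 30) % 100) + 1 = 10 bytes are used; with a wrapped
 * head = 20, ((20 + 99 - 30) % 100) + 1 = 90; and head == tail != 0
 * yields avail, i.e. the queue is full.
 */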
static inline size_t
wapbl_space_used(size_t avail, off_t head, off_t tail)
{

	if (tail == 0) {
		KASSERT(head == 0);
		return 0;
	}
	return ((head + (avail - 1) - tail) % avail) + 1;
}

#ifdef _KERNEL
/*
 * wapbl_advance(size, off, oldoff, delta)
 *
 *	Given a byte offset oldoff into a circular queue of size bytes
 *	starting at off, return a new byte offset oldoff + delta into
 *	the circular queue.
 */
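/*
 * E.g. (illustrative): with size = 64512 and off = 1024, advancing
 * oldoff = 65000 by delta = 1000 wraps around to (65000 + 1000) -
 * 64512 = 1488, while advancing oldoff = 0 (empty queue) by a
 * nonzero delta starts counting at off, giving off + delta.
 */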
static inline off_t
wapbl_advance(size_t size, size_t off, off_t oldoff, size_t delta)
{
	off_t newoff;

	/* Define acceptable ranges for inputs. */
	KASSERT(delta <= (size_t)size);
	KASSERT((oldoff == 0) || ((size_t)oldoff >= off));
	KASSERT(oldoff < (off_t)(size + off));

	if ((oldoff == 0) && (delta != 0))
		newoff = off + delta;
	else if ((oldoff + delta) < (size + off))
		newoff = oldoff + delta;
	else
		newoff = (oldoff + delta) - size;

	/* Note some interesting axioms */
	KASSERT((delta != 0) || (newoff == oldoff));
	KASSERT((delta == 0) || (newoff != 0));
	KASSERT((delta != (size)) || (newoff == oldoff));

	/* Define acceptable ranges for output. */
	KASSERT((newoff == 0) || ((size_t)newoff >= off));
	KASSERT((size_t)newoff < (size + off));
	return newoff;
}

/*
 * wapbl_space_free(avail, head, tail)
 *
 *	Number of bytes free in a circular queue of avail total bytes,
 *	in which everything from tail to head is used.
 */
static inline size_t
wapbl_space_free(size_t avail, off_t head, off_t tail)
{

	return avail - wapbl_space_used(avail, head, tail);
}

/*
 * wapbl_advance_head(size, off, delta, headp, tailp)
 *
 *	In a circular queue of size bytes starting at off, given the
 *	old head and tail offsets *headp and *tailp, store the new head
 *	and tail offsets in *headp and *tailp resulting from adding
 *	delta bytes of data to the head.
 */
static inline void
wapbl_advance_head(size_t size, size_t off, size_t delta, off_t *headp,
		   off_t *tailp)
{
	off_t head = *headp;
	off_t tail = *tailp;

	KASSERT(delta <= wapbl_space_free(size, head, tail));
	head = wapbl_advance(size, off, head, delta);
	if ((tail == 0) && (head != 0))
		tail = off;
	*headp = head;
	*tailp = tail;
}

/*
 * wapbl_advance_tail(size, off, delta, headp, tailp)
 *
 *	In a circular queue of size bytes starting at off, given the
 *	old head and tail offsets *headp and *tailp, store the new head
 *	and tail offsets in *headp and *tailp resulting from removing
 *	delta bytes of data from the tail.
 */
static inline void
wapbl_advance_tail(size_t size, size_t off, size_t delta, off_t *headp,
		   off_t *tailp)
{
	off_t head = *headp;
	off_t tail = *tailp;

	KASSERT(delta <= wapbl_space_used(size, head, tail));
	tail = wapbl_advance(size, off, tail, delta);
	if (head == tail) {
		head = tail = 0;
	}
	*headp = head;
	*tailp = tail;
}
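
/*
 * Putting the two together (a sketch, not new behavior): wapbl_flush
 * adds flushsize bytes at the head via wapbl_advance_head; as the
 * asynchronous block writes complete, wapbl_biodone accumulates
 * reclaimable bytes, and a later wapbl_truncate removes them from the
 * tail via wapbl_advance_tail, which resets head = tail = 0 once the
 * whole log has been reclaimed.
 */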


/****************************************************************/

/*
 * wapbl_truncate(wl, minfree)
 *
 *	Wait until at least minfree bytes are available in the log.
 *
 *	If it was necessary to wait for writes to complete,
 *	advance the circular queue tail to reflect the new write
 *	completions and issue a write commit to the log.
 *
 *	=> Caller must hold wl->wl_rwlock writer lock.
 */
static int
wapbl_truncate(struct wapbl *wl, size_t minfree)
{
	size_t delta;
	size_t avail;
	off_t head;
	off_t tail;
	int error = 0;

	KASSERT(minfree <= (wl->wl_circ_size - wl->wl_reserved_bytes));
	KASSERT(rw_write_held(&wl->wl_rwlock));

	mutex_enter(&wl->wl_mtx);

	/*
	 * First check to see if we have to do a commit
	 * at all.
	 */
	avail = wapbl_space_free(wl->wl_circ_size, wl->wl_head, wl->wl_tail);
	if (minfree < avail) {
		mutex_exit(&wl->wl_mtx);
		return 0;
	}
	minfree -= avail;
	while ((wl->wl_error_count == 0) &&
	    (wl->wl_reclaimable_bytes < minfree)) {
		WAPBL_PRINTF(WAPBL_PRINT_TRUNCATE,
		    ("wapbl_truncate: sleeping on %p wl=%p bytes=%zd "
		    "minfree=%zd\n",
		    &wl->wl_reclaimable_bytes, wl, wl->wl_reclaimable_bytes,
		    minfree));

		cv_wait(&wl->wl_reclaimable_cv, &wl->wl_mtx);
	}
	if (wl->wl_reclaimable_bytes < minfree) {
		KASSERT(wl->wl_error_count);
		/* XXX maybe get actual error from buffer instead someday? */
		error = EIO;
	}
	head = wl->wl_head;
	tail = wl->wl_tail;
	delta = wl->wl_reclaimable_bytes;

	/*
	 * If all of the entries are flushed, then be sure to keep
	 * the reserved bytes reserved.  Watch out for discarded
	 * transactions, which could leave more bytes reserved than
	 * are reclaimable.
	 */
	if (SIMPLEQ_EMPTY(&wl->wl_entries) &&
	    (delta >= wl->wl_reserved_bytes)) {
		delta -= wl->wl_reserved_bytes;
	}
	wapbl_advance_tail(wl->wl_circ_size, wl->wl_circ_off, delta, &head,
	    &tail);
	KDASSERT(wl->wl_reserved_bytes <=
		wapbl_space_used(wl->wl_circ_size, head, tail));
	mutex_exit(&wl->wl_mtx);

	if (error)
		return error;

	/*
	 * This is where head, tail and delta are unprotected
	 * from races against itself or flush.  This is ok since
	 * we only call this routine from inside flush itself.
	 *
	 * XXX: how can it race against itself when accessed only
	 * from behind the write-locked rwlock?
	 */
	error = wapbl_write_commit(wl, head, tail);
	if (error)
		return error;

	wl->wl_head = head;
	wl->wl_tail = tail;

	mutex_enter(&wl->wl_mtx);
	KASSERT(wl->wl_reclaimable_bytes >= delta);
	wl->wl_reclaimable_bytes -= delta;
	mutex_exit(&wl->wl_mtx);
	WAPBL_PRINTF(WAPBL_PRINT_TRUNCATE,
	    ("wapbl_truncate thread %d.%d truncating %zu bytes\n",
	    curproc->p_pid, curlwp->l_lid, delta));

	return 0;
}

/****************************************************************/

void
wapbl_biodone(struct buf *bp)
{
	struct wapbl_entry *we = bp->b_private;
	struct wapbl *wl = we->we_wapbl;
#ifdef WAPBL_DEBUG_BUFBYTES
	const int bufsize = bp->b_bufsize;
#endif

	/*
	 * Handle possible flushing of buffers after log has been
	 * decommissioned.
	 */
	if (!wl) {
		KASSERT(we->we_bufcount > 0);
		we->we_bufcount--;
#ifdef WAPBL_DEBUG_BUFBYTES
		KASSERT(we->we_unsynced_bufbytes >= bufsize);
		we->we_unsynced_bufbytes -= bufsize;
#endif

		if (we->we_bufcount == 0) {
#ifdef WAPBL_DEBUG_BUFBYTES
			KASSERT(we->we_unsynced_bufbytes == 0);
#endif
			pool_put(&wapbl_entry_pool, we);
		}

		brelse(bp, 0);
		return;
	}

#ifdef ohbother
	KDASSERT(bp->b_oflags & BO_DONE);
	KDASSERT(!(bp->b_oflags & BO_DELWRI));
	KDASSERT(bp->b_flags & B_ASYNC);
	KDASSERT(bp->b_cflags & BC_BUSY);
	KDASSERT(!(bp->b_flags & B_LOCKED));
	KDASSERT(!(bp->b_flags & B_READ));
	KDASSERT(!(bp->b_cflags & BC_INVAL));
	KDASSERT(!(bp->b_cflags & BC_NOCACHE));
#endif

	if (bp->b_error) {
		/*
		 * If an error occurs, it would be nice to leave the buffer
		 * as a delayed write on the LRU queue so that we can retry
		 * it later.  But buffercache(9) can't handle dirty buffer
		 * reuse, so just mark the log permanently errored out.
		 */
		mutex_enter(&wl->wl_mtx);
		if (wl->wl_error_count == 0) {
			wl->wl_error_count++;
			cv_broadcast(&wl->wl_reclaimable_cv);
		}
		mutex_exit(&wl->wl_mtx);
	}

	/*
	 * Release the buffer here.  wapbl_flush() may wait for the
	 * log to become empty and we better unbusy the buffer before
	 * wapbl_flush() returns.
	 */
	brelse(bp, 0);

	mutex_enter(&wl->wl_mtx);

	KASSERT(we->we_bufcount > 0);
	we->we_bufcount--;
#ifdef WAPBL_DEBUG_BUFBYTES
	KASSERT(we->we_unsynced_bufbytes >= bufsize);
	we->we_unsynced_bufbytes -= bufsize;
	KASSERT(wl->wl_unsynced_bufbytes >= bufsize);
	wl->wl_unsynced_bufbytes -= bufsize;
#endif

	/*
	 * If the current transaction can be reclaimed, start
	 * at the beginning and reclaim any consecutive reclaimable
	 * transactions.  If we successfully reclaim anything,
	 * then wakeup anyone waiting for the reclaim.
	 */
	if (we->we_bufcount == 0) {
		size_t delta = 0;
		int errcnt = 0;
#ifdef WAPBL_DEBUG_BUFBYTES
		KDASSERT(we->we_unsynced_bufbytes == 0);
#endif
		/*
		 * Clear any posted error, since the buffer it came
		 * from has been successfully flushed by now.
		 */
		while ((we = SIMPLEQ_FIRST(&wl->wl_entries)) &&
		       (we->we_bufcount == 0)) {
			delta += we->we_reclaimable_bytes;
			if (we->we_error)
				errcnt++;
			SIMPLEQ_REMOVE_HEAD(&wl->wl_entries, we_entries);
			pool_put(&wapbl_entry_pool, we);
		}

		if (delta) {
			wl->wl_reclaimable_bytes += delta;
			KASSERT(wl->wl_error_count >= errcnt);
			wl->wl_error_count -= errcnt;
			cv_broadcast(&wl->wl_reclaimable_cv);
		}
	}

	mutex_exit(&wl->wl_mtx);
}

/*
 * wapbl_flush(wl, wait)
 *
 *	Flush pending block writes, deallocations, and inodes from
 *	the current transaction in memory to the log on disk:
 *
 *	1. Call the file system's wl_flush callback to flush any
 *	   per-file-system pending updates.
 *	2. Wait for enough space in the log for the current transaction.
 *	3. Synchronously write the new log records, advancing the
 *	   circular queue head.
 *	4. Issue the pending block writes asynchronously, now that they
 *	   are recorded in the log and can be replayed after crash.
 *	5. If wait is true, wait for all writes to complete and for the
 *	   log to become empty.
 *
 *	On failure, call the file system's wl_flush_abort callback.
 */
int
wapbl_flush(struct wapbl *wl, int waitfor)
{
	struct buf *bp;
	struct wapbl_entry *we;
	off_t off;
	off_t head;
	off_t tail;
	size_t delta = 0;
	size_t flushsize;
	size_t reserved;
	int error = 0;

	/*
	 * Do a quick check to see if a full flush can be skipped.
	 * This assumes that the flush callback does not need to be called
	 * unless there are other outstanding bufs.
	 */
	if (!waitfor) {
		size_t nbufs;
		mutex_enter(&wl->wl_mtx);	/* XXX need mutex here to
						   protect the KASSERTS */
		nbufs = wl->wl_bufcount;
		KASSERT((wl->wl_bufcount == 0) == (wl->wl_bufbytes == 0));
		KASSERT((wl->wl_bufcount == 0) == (wl->wl_bcount == 0));
		mutex_exit(&wl->wl_mtx);
		if (nbufs == 0)
			return 0;
	}

	/*
	 * XXX we may consider using LK_UPGRADE here
	 * if we want to call flush from inside a transaction
	 */
	rw_enter(&wl->wl_rwlock, RW_WRITER);
	wl->wl_flush(wl->wl_mount, TAILQ_FIRST(&wl->wl_dealloclist));

	/*
	 * Now that we are exclusively locked and the file system has
	 * issued any deferred block writes for this transaction, check
	 * whether there are any blocks to write to the log.  If not,
	 * skip waiting for space or writing any log entries.
	 *
	 * XXX Shouldn't this also check wl_dealloccnt and
	 * wl_inohashcnt?  Perhaps wl_dealloccnt doesn't matter if the
	 * file system didn't produce any blocks as a consequence of
	 * it, but the same does not seem to be so of wl_inohashcnt.
	 */
	if (wl->wl_bufcount == 0) {
		goto wait_out;
	}

#if 0
	WAPBL_PRINTF(WAPBL_PRINT_FLUSH,
	    ("wapbl_flush thread %d.%d flushing entries with "
	     "bufcount=%zu bufbytes=%zu\n",
	     curproc->p_pid, curlwp->l_lid, wl->wl_bufcount,
	     wl->wl_bufbytes));
#endif

	/* Calculate amount of space needed to flush */
	flushsize = wapbl_transaction_len(wl);
	if (wapbl_verbose_commit) {
		struct timespec ts;
		getnanotime(&ts);
		printf("%s: %lld.%09ld this transaction = %zu bytes\n",
		    __func__, (long long)ts.tv_sec,
		    (long)ts.tv_nsec, flushsize);
	}

	if (flushsize > (wl->wl_circ_size - wl->wl_reserved_bytes)) {
		/*
		 * XXX this could be handled more gracefully, perhaps place
		 * only a partial transaction in the log and allow the
		 * remaining to flush without the protection of the journal.
		 */
		panic("wapbl_flush: current transaction too big to flush");
	}

	error = wapbl_truncate(wl, flushsize);
	if (error)
		goto out;

	off = wl->wl_head;
	KASSERT((off == 0) || (off >= wl->wl_circ_off));
	KASSERT((off == 0) || (off < wl->wl_circ_off + wl->wl_circ_size));
	error = wapbl_write_blocks(wl, &off);
	if (error)
		goto out;
	error = wapbl_write_revocations(wl, &off);
	if (error)
		goto out;
	error = wapbl_write_inodes(wl, &off);
	if (error)
		goto out;

	reserved = 0;
	if (wl->wl_inohashcnt)
		reserved = wapbl_transaction_inodes_len(wl);

	head = wl->wl_head;
	tail = wl->wl_tail;

	wapbl_advance_head(wl->wl_circ_size, wl->wl_circ_off, flushsize,
	    &head, &tail);

	KASSERTMSG(head == off,
	    "lost head! head=%"PRIdMAX" tail=%" PRIdMAX
	    " off=%"PRIdMAX" flush=%zu",
	    (intmax_t)head, (intmax_t)tail, (intmax_t)off,
	    flushsize);

	/* Opportunistically move the tail forward if we can */
	mutex_enter(&wl->wl_mtx);
	delta = wl->wl_reclaimable_bytes;
	mutex_exit(&wl->wl_mtx);
	wapbl_advance_tail(wl->wl_circ_size, wl->wl_circ_off, delta,
	    &head, &tail);

	error = wapbl_write_commit(wl, head, tail);
	if (error)
		goto out;

	we = pool_get(&wapbl_entry_pool, PR_WAITOK);

#ifdef WAPBL_DEBUG_BUFBYTES
	WAPBL_PRINTF(WAPBL_PRINT_FLUSH,
		("wapbl_flush: thread %d.%d head+=%zu tail+=%zu used=%zu"
		 " unsynced=%zu"
		 "\n\tbufcount=%zu bufbytes=%zu bcount=%zu deallocs=%d "
		 "inodes=%d\n",
		 curproc->p_pid, curlwp->l_lid, flushsize, delta,
		 wapbl_space_used(wl->wl_circ_size, head, tail),
		 wl->wl_unsynced_bufbytes, wl->wl_bufcount,
		 wl->wl_bufbytes, wl->wl_bcount, wl->wl_dealloccnt,
		 wl->wl_inohashcnt));
#else
	WAPBL_PRINTF(WAPBL_PRINT_FLUSH,
		("wapbl_flush: thread %d.%d head+=%zu tail+=%zu used=%zu"
		 "\n\tbufcount=%zu bufbytes=%zu bcount=%zu deallocs=%d "
		 "inodes=%d\n",
		 curproc->p_pid, curlwp->l_lid, flushsize, delta,
		 wapbl_space_used(wl->wl_circ_size, head, tail),
		 wl->wl_bufcount, wl->wl_bufbytes, wl->wl_bcount,
		 wl->wl_dealloccnt, wl->wl_inohashcnt));
#endif


	mutex_enter(&bufcache_lock);
	mutex_enter(&wl->wl_mtx);

	wl->wl_reserved_bytes = reserved;
	wl->wl_head = head;
	wl->wl_tail = tail;
	KASSERT(wl->wl_reclaimable_bytes >= delta);
	wl->wl_reclaimable_bytes -= delta;
	KDASSERT(wl->wl_dealloccnt == 0);
#ifdef WAPBL_DEBUG_BUFBYTES
	wl->wl_unsynced_bufbytes += wl->wl_bufbytes;
#endif

	we->we_wapbl = wl;
	we->we_bufcount = wl->wl_bufcount;
#ifdef WAPBL_DEBUG_BUFBYTES
	we->we_unsynced_bufbytes = wl->wl_bufbytes;
#endif
	we->we_reclaimable_bytes = flushsize;
	we->we_error = 0;
	SIMPLEQ_INSERT_TAIL(&wl->wl_entries, we, we_entries);

	/*
	 * This flushes bufs in the reverse of the order in which they
	 * were queued.  It shouldn't matter, but if we cared we could
	 * use a TAILQ instead.  XXX Note they will get put on the lru
	 * queue when they flush, so we might actually want to change
	 * this to preserve order.
	 */
	while ((bp = LIST_FIRST(&wl->wl_bufs)) != NULL) {
		if (bbusy(bp, 0, 0, &wl->wl_mtx)) {
			continue;
		}
		bp->b_iodone = wapbl_biodone;
		bp->b_private = we;
		bremfree(bp);
		wapbl_remove_buf_locked(wl, bp);
		mutex_exit(&wl->wl_mtx);
		mutex_exit(&bufcache_lock);
		bawrite(bp);
		mutex_enter(&bufcache_lock);
		mutex_enter(&wl->wl_mtx);
	}
	mutex_exit(&wl->wl_mtx);
	mutex_exit(&bufcache_lock);

#if 0
	WAPBL_PRINTF(WAPBL_PRINT_FLUSH,
	    ("wapbl_flush thread %d.%d done flushing entries...\n",
	    curproc->p_pid, curlwp->l_lid));
#endif

 wait_out:

	/*
	 * If the waitfor flag is set, don't return until everything is
	 * fully flushed and the on disk log is empty.
	 */
	if (waitfor) {
		error = wapbl_truncate(wl, wl->wl_circ_size -
			wl->wl_reserved_bytes);
	}

 out:
	if (error) {
		wl->wl_flush_abort(wl->wl_mount,
		    TAILQ_FIRST(&wl->wl_dealloclist));
	}

#ifdef WAPBL_DEBUG_PRINT
	if (error) {
		pid_t pid = -1;
		lwpid_t lid = -1;
		if (curproc)
			pid = curproc->p_pid;
		if (curlwp)
			lid = curlwp->l_lid;
		mutex_enter(&wl->wl_mtx);
#ifdef WAPBL_DEBUG_BUFBYTES
		WAPBL_PRINTF(WAPBL_PRINT_ERROR,
		    ("wapbl_flush: thread %d.%d aborted flush: "
		    "error = %d\n"
		    "\tbufcount=%zu bufbytes=%zu bcount=%zu "
		    "deallocs=%d inodes=%d\n"
		    "\terrcnt = %d, reclaimable=%zu reserved=%zu "
		    "unsynced=%zu\n",
		    pid, lid, error, wl->wl_bufcount,
		    wl->wl_bufbytes, wl->wl_bcount,
		    wl->wl_dealloccnt, wl->wl_inohashcnt,
		    wl->wl_error_count, wl->wl_reclaimable_bytes,
		    wl->wl_reserved_bytes, wl->wl_unsynced_bufbytes));
		SIMPLEQ_FOREACH(we, &wl->wl_entries, we_entries) {
			WAPBL_PRINTF(WAPBL_PRINT_ERROR,
			    ("\tentry: bufcount = %zu, reclaimable = %zu, "
			    "error = %d, unsynced = %zu\n",
			    we->we_bufcount, we->we_reclaimable_bytes,
			    we->we_error, we->we_unsynced_bufbytes));
		}
#else
		WAPBL_PRINTF(WAPBL_PRINT_ERROR,
		    ("wapbl_flush: thread %d.%d aborted flush: "
		    "error = %d\n"
		    "\tbufcount=%zu bufbytes=%zu bcount=%zu "
		    "deallocs=%d inodes=%d\n"
		    "\terrcnt = %d, reclaimable=%zu reserved=%zu\n",
		    pid, lid, error, wl->wl_bufcount,
		    wl->wl_bufbytes, wl->wl_bcount,
		    wl->wl_dealloccnt, wl->wl_inohashcnt,
		    wl->wl_error_count, wl->wl_reclaimable_bytes,
		    wl->wl_reserved_bytes));
		SIMPLEQ_FOREACH(we, &wl->wl_entries, we_entries) {
			WAPBL_PRINTF(WAPBL_PRINT_ERROR,
			    ("\tentry: bufcount = %zu, reclaimable = %zu, "
			    "error = %d\n", we->we_bufcount,
			    we->we_reclaimable_bytes, we->we_error));
		}
#endif
		mutex_exit(&wl->wl_mtx);
	}
#endif

	rw_exit(&wl->wl_rwlock);
	return error;
}

/****************************************************************/

void
wapbl_jlock_assert(struct wapbl *wl)
{

	KASSERT(rw_lock_held(&wl->wl_rwlock));
}

void
wapbl_junlock_assert(struct wapbl *wl)
{

	KASSERT(!rw_write_held(&wl->wl_rwlock));
}

/****************************************************************/

/* locks missing */
void
wapbl_print(struct wapbl *wl,
	int full,
	void (*pr)(const char *, ...))
{
	struct buf *bp;
	struct wapbl_entry *we;
	(*pr)("wapbl %p", wl);
	(*pr)("\nlogvp = %p, devvp = %p, logpbn = %"PRId64"\n",
	      wl->wl_logvp, wl->wl_devvp, wl->wl_logpbn);
	(*pr)("circ = %zu, header = %zu, head = %"PRIdMAX" tail = %"PRIdMAX"\n",
	      wl->wl_circ_size, wl->wl_circ_off,
	      (intmax_t)wl->wl_head, (intmax_t)wl->wl_tail);
	(*pr)("log_dev_bshift = %d, fs_dev_bshift = %d\n",
	      wl->wl_log_dev_bshift, wl->wl_fs_dev_bshift);
#ifdef WAPBL_DEBUG_BUFBYTES
	(*pr)("bufcount = %zu, bufbytes = %zu bcount = %zu reclaimable = %zu "
	      "reserved = %zu errcnt = %d unsynced = %zu\n",
	      wl->wl_bufcount, wl->wl_bufbytes, wl->wl_bcount,
	      wl->wl_reclaimable_bytes, wl->wl_reserved_bytes,
	      wl->wl_error_count, wl->wl_unsynced_bufbytes);
#else
	(*pr)("bufcount = %zu, bufbytes = %zu bcount = %zu reclaimable = %zu "
	      "reserved = %zu errcnt = %d\n", wl->wl_bufcount, wl->wl_bufbytes,
	      wl->wl_bcount, wl->wl_reclaimable_bytes, wl->wl_reserved_bytes,
	      wl->wl_error_count);
#endif
	(*pr)("\tdealloccnt = %d, dealloclim = %d\n",
	      wl->wl_dealloccnt, wl->wl_dealloclim);
	(*pr)("\tinohashcnt = %d, inohashmask = 0x%08x\n",
	      wl->wl_inohashcnt, wl->wl_inohashmask);
	(*pr)("entries:\n");
	SIMPLEQ_FOREACH(we, &wl->wl_entries, we_entries) {
#ifdef WAPBL_DEBUG_BUFBYTES
		(*pr)("\tbufcount = %zu, reclaimable = %zu, error = %d, "
		      "unsynced = %zu\n",
		      we->we_bufcount, we->we_reclaimable_bytes,
		      we->we_error, we->we_unsynced_bufbytes);
#else
		(*pr)("\tbufcount = %zu, reclaimable = %zu, error = %d\n",
		      we->we_bufcount, we->we_reclaimable_bytes, we->we_error);
#endif
	}
	if (full) {
		int cnt = 0;
		(*pr)("bufs =");
		LIST_FOREACH(bp, &wl->wl_bufs, b_wapbllist) {
			if (!LIST_NEXT(bp, b_wapbllist)) {
				(*pr)(" %p", bp);
			} else if ((++cnt % 6) == 0) {
				(*pr)(" %p,\n\t", bp);
			} else {
				(*pr)(" %p,", bp);
			}
		}
		(*pr)("\n");

		(*pr)("dealloced blks = ");
		{
			struct wapbl_dealloc *wd;
			cnt = 0;
			TAILQ_FOREACH(wd, &wl->wl_dealloclist, wd_entries) {
				(*pr)(" %"PRId64":%d,",
				      wd->wd_blkno,
				      wd->wd_len);
				if ((++cnt % 4) == 0) {
					(*pr)("\n\t");
				}
			}
		}
		(*pr)("\n");

		(*pr)("registered inodes = ");
		{
			int i;
			cnt = 0;
			for (i = 0; i <= wl->wl_inohashmask; i++) {
				struct wapbl_ino_head *wih;
				struct wapbl_ino *wi;

				wih = &wl->wl_inohash[i];
				LIST_FOREACH(wi, wih, wi_hash) {
					if (wi->wi_ino == 0)
						continue;
					(*pr)(" %"PRIu64"/0%06"PRIo32",",
					    wi->wi_ino, wi->wi_mode);
					if ((++cnt % 4) == 0) {
						(*pr)("\n\t");
					}
				}
			}
			(*pr)("\n");
		}
	}
}

#if defined(WAPBL_DEBUG) || defined(DDB)
void
wapbl_dump(struct wapbl *wl)
{
#if defined(WAPBL_DEBUG)
	if (!wl)
		wl = wapbl_debug_wl;
#endif
	if (!wl)
		return;
	wapbl_print(wl, 1, printf);
}
#endif

/****************************************************************/

int
wapbl_register_deallocation(struct wapbl *wl, daddr_t blk, int len, bool force,
    void **cookiep)
{
	struct wapbl_dealloc *wd;
	int error = 0;

	wapbl_jlock_assert(wl);

	mutex_enter(&wl->wl_mtx);

	if (__predict_false(wl->wl_dealloccnt >= wl->wl_dealloclim)) {
		if (!force) {
			error = EAGAIN;
			goto out;
		}

		/*
		 * Forced registration can only be used when:
		 * 1) the caller can't cope with failure, and
		 * 2) the path can only be triggered a small, bounded
		 *    number of times per transaction.
		 * If these conditions are not fulfilled and the path is
		 * triggered many times, the maximum transaction size
		 * could be exceeded, causing a panic later.
		 */
1958 printf("%s: forced dealloc registration over limit: %d >= %d\n",
1959 wl->wl_mount->mnt_stat.f_mntonname,
1960 wl->wl_dealloccnt, wl->wl_dealloclim);
1961 }
1962
1963 wl->wl_dealloccnt++;
1964 mutex_exit(&wl->wl_mtx);
1965
1966 wd = pool_get(&wapbl_dealloc_pool, PR_WAITOK);
1967 wd->wd_blkno = blk;
1968 wd->wd_len = len;
1969
1970 mutex_enter(&wl->wl_mtx);
1971 TAILQ_INSERT_TAIL(&wl->wl_dealloclist, wd, wd_entries);
1972
1973 if (cookiep)
1974 *cookiep = wd;
1975
1976 out:
1977 mutex_exit(&wl->wl_mtx);
1978
1979 WAPBL_PRINTF(WAPBL_PRINT_ALLOC,
1980 ("wapbl_register_deallocation: blk=%"PRId64" len=%d error=%d\n",
1981 blk, len, error));
1982
1983 return error;
1984}
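
/*
 * Example (a sketch, not a caller from this file; "blkno" and
 * "frsize" are hypothetical names): a filesystem that can back out
 * registers with force=false and retries after forcing out the
 * current transaction:
 *
 *	void *cookie;
 *	error = wapbl_register_deallocation(wl, blkno, frsize,
 *	    false, &cookie);
 *	if (error == EAGAIN) {
 *		... flush the log, then retry the registration ...
 *	}
 *
 * Only callers that satisfy conditions 1) and 2) above should pass
 * force=true.
 */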
1985
1986static void
1987wapbl_deallocation_free(struct wapbl *wl, struct wapbl_dealloc *wd,
1988 bool locked)
1989{
1990 KASSERT(!locked
1991 || rw_lock_held(&wl->wl_rwlock) || mutex_owned(&wl->wl_mtx));
1992
1993 if (!locked)
1994 mutex_enter(&wl->wl_mtx);
1995
1996 TAILQ_REMOVE(&wl->wl_dealloclist, wd, wd_entries);
1997 wl->wl_dealloccnt--;
1998
1999 if (!locked)
2000 mutex_exit(&wl->wl_mtx);
2001
2002 pool_put(&wapbl_dealloc_pool, wd);
2003}
2004
2005void
2006wapbl_unregister_deallocation(struct wapbl *wl, void *cookie)
2007{
2008 KASSERT(cookie != NULL);
2009 wapbl_deallocation_free(wl, cookie, false);
2010}
2011
2012/****************************************************************/
2013
2014static void
2015wapbl_inodetrk_init(struct wapbl *wl, u_int size)
2016{
2017
2018 wl->wl_inohash = hashinit(size, HASH_LIST, true, &wl->wl_inohashmask);
2019 if (atomic_inc_uint_nv(&wapbl_ino_pool_refcount) == 1) {
2020 pool_init(&wapbl_ino_pool, sizeof(struct wapbl_ino), 0, 0, 0,
2021 "wapblinopl", &pool_allocator_nointr, IPL_NONE);
2022 }
2023}
2024
2025static void
2026wapbl_inodetrk_free(struct wapbl *wl)
2027{
2028
2029 /* XXX this KASSERT needs locking/mutex analysis */
2030 KASSERT(wl->wl_inohashcnt == 0);
2031 hashdone(wl->wl_inohash, HASH_LIST, wl->wl_inohashmask);
2032 if (atomic_dec_uint_nv(&wapbl_ino_pool_refcount) == 0) {
2033 pool_destroy(&wapbl_ino_pool);
2034 }
2035}
2036
2037static struct wapbl_ino *
2038wapbl_inodetrk_get(struct wapbl *wl, ino_t ino)
2039{
2040 struct wapbl_ino_head *wih;
2041 struct wapbl_ino *wi;
2042
2043 KASSERT(mutex_owned(&wl->wl_mtx));
2044
2045 wih = &wl->wl_inohash[ino & wl->wl_inohashmask];
2046 LIST_FOREACH(wi, wih, wi_hash) {
2047 if (ino == wi->wi_ino)
2048 return wi;
2049 }
	return NULL;
2051}
2052
2053void
2054wapbl_register_inode(struct wapbl *wl, ino_t ino, mode_t mode)
2055{
2056 struct wapbl_ino_head *wih;
2057 struct wapbl_ino *wi;
2058
2059 wi = pool_get(&wapbl_ino_pool, PR_WAITOK);
2060
2061 mutex_enter(&wl->wl_mtx);
2062 if (wapbl_inodetrk_get(wl, ino) == NULL) {
2063 wi->wi_ino = ino;
2064 wi->wi_mode = mode;
2065 wih = &wl->wl_inohash[ino & wl->wl_inohashmask];
2066 LIST_INSERT_HEAD(wih, wi, wi_hash);
2067 wl->wl_inohashcnt++;
2068 WAPBL_PRINTF(WAPBL_PRINT_INODE,
2069 ("wapbl_register_inode: ino=%"PRId64"\n", ino));
2070 mutex_exit(&wl->wl_mtx);
2071 } else {
2072 mutex_exit(&wl->wl_mtx);
2073 pool_put(&wapbl_ino_pool, wi);
2074 }
2075}
2076
2077void
2078wapbl_unregister_inode(struct wapbl *wl, ino_t ino, mode_t mode)
2079{
2080 struct wapbl_ino *wi;
2081
2082 mutex_enter(&wl->wl_mtx);
2083 wi = wapbl_inodetrk_get(wl, ino);
2084 if (wi) {
2085 WAPBL_PRINTF(WAPBL_PRINT_INODE,
2086 ("wapbl_unregister_inode: ino=%"PRId64"\n", ino));
2087 KASSERT(wl->wl_inohashcnt > 0);
2088 wl->wl_inohashcnt--;
2089 LIST_REMOVE(wi, wi_hash);
2090 mutex_exit(&wl->wl_mtx);
2091
2092 pool_put(&wapbl_ino_pool, wi);
2093 } else {
2094 mutex_exit(&wl->wl_mtx);
2095 }
2096}
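
/*
 * Example (a sketch; "ip->i_number" and "ip->i_mode" are hypothetical
 * names): a filesystem registers an inode while it is allocated but
 * unlinked, so that replay can finish freeing it after a crash, and
 * unregisters it once the inode is relinked or fully freed:
 *
 *	wapbl_register_inode(wl, ip->i_number, ip->i_mode);
 *	...
 *	wapbl_unregister_inode(wl, ip->i_number, ip->i_mode);
 */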
2097
2098/****************************************************************/
2099
2100/*
2101 * wapbl_transaction_inodes_len(wl)
2102 *
2103 * Calculate the number of bytes required for inode registration
2104 * log records in wl.
2105 */
2106static inline size_t
2107wapbl_transaction_inodes_len(struct wapbl *wl)
2108{
2109 int blocklen = 1<<wl->wl_log_dev_bshift;
2110 int iph;
2111
	/* Calculate the number of inodes described in an inodelist header */
2113 iph = (blocklen - offsetof(struct wapbl_wc_inodelist, wc_inodes)) /
2114 sizeof(((struct wapbl_wc_inodelist *)0)->wc_inodes[0]);
2115
2116 KASSERT(iph > 0);
2117
2118 return MAX(1, howmany(wl->wl_inohashcnt, iph)) * blocklen;
2119}
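
/*
 * Worked example (hypothetical sizes: a 2048-byte log block,
 * offsetof(struct wapbl_wc_inodelist, wc_inodes) == 16, and an 8-byte
 * wc_inodes[] entry): iph = (2048 - 16) / 8 = 254, so 500 registered
 * inodes need howmany(500, 254) = 2 blocks = 4096 bytes, while zero
 * registered inodes still reserve MAX(1, 0) = 1 block for the
 * mandatory (possibly empty) inodelist record.
 */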
2120
2121
2122/*
2123 * wapbl_transaction_len(wl)
2124 *
2125 * Calculate number of bytes required for all log records in wl.
2126 */
2127static size_t
2128wapbl_transaction_len(struct wapbl *wl)
2129{
2130 int blocklen = 1<<wl->wl_log_dev_bshift;
2131 size_t len;
2132
	/*
	 * Buffer data, plus a blocklist header block per wl_brperjblock
	 * buffers, a revocation block per wl_brperjblock deallocations,
	 * and the inode registration records.
	 */
2134 len = wl->wl_bcount;
2135 len += howmany(wl->wl_bufcount, wl->wl_brperjblock) * blocklen;
2136 len += howmany(wl->wl_dealloccnt, wl->wl_brperjblock) * blocklen;
2137 len += wapbl_transaction_inodes_len(wl);
2138
2139 return len;
2140}
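
/*
 * Worked example (hypothetical numbers): with blocklen = 2048,
 * wl_brperjblock = 100, 250 buffers totalling wl_bcount = 1048576
 * bytes, 10 deallocations, and no registered inodes, the transaction
 * needs 1048576 bytes of data, howmany(250, 100) = 3 blocklist header
 * blocks, howmany(10, 100) = 1 revocation block, and 1 (empty)
 * inodelist block: 1048576 + 5 * 2048 bytes in total.
 */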
2141
2142/*
2143 * wapbl_cache_sync(wl, msg)
2144 *
2145 * Issue DIOCCACHESYNC to wl->wl_devvp.
2146 *
2147 * If sysctl(vfs.wapbl.verbose_commit) >= 2, print a message
2148 * including msg about the duration of the cache sync.
2149 */
2150static int
2151wapbl_cache_sync(struct wapbl *wl, const char *msg)
2152{
2153 const bool verbose = wapbl_verbose_commit >= 2;
2154 struct bintime start_time;
2155 int force = 1;
2156 int error;
2157
2158 if (!wapbl_flush_disk_cache) {
2159 return 0;
2160 }
2161 if (verbose) {
2162 bintime(&start_time);
2163 }
2164 error = VOP_IOCTL(wl->wl_devvp, DIOCCACHESYNC, &force,
2165 FWRITE, FSCRED);
2166 if (error) {
2167 WAPBL_PRINTF(WAPBL_PRINT_ERROR,
2168 ("wapbl_cache_sync: DIOCCACHESYNC on dev 0x%jx "
2169 "returned %d\n", (uintmax_t)wl->wl_devvp->v_rdev, error));
2170 }
2171 if (verbose) {
2172 struct bintime d;
2173 struct timespec ts;
2174
2175 bintime(&d);
2176 bintime_sub(&d, &start_time);
2177 bintime2timespec(&d, &ts);
2178 printf("wapbl_cache_sync: %s: dev 0x%jx %ju.%09lu\n",
2179 msg, (uintmax_t)wl->wl_devvp->v_rdev,
2180 (uintmax_t)ts.tv_sec, ts.tv_nsec);
2181 }
2182 return error;
2183}
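
/*
 * Note: both knobs above are runtime sysctls set up elsewhere in this
 * file; e.g. vfs.wapbl.flush_disk_cache=0 makes this function a no-op,
 * which is only safe when the disk's write cache is non-volatile, and
 * vfs.wapbl.verbose_commit=2 reports the duration of every sync as
 * described above.
 */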
2184
2185/*
2186 * wapbl_write_commit(wl, head, tail)
2187 *
2188 * Issue a disk cache sync to wait for all pending writes to the
2189 * log to complete, and then synchronously commit the current
2190 * circular queue head and tail to the log, in the next of two
2191 * locations for commit headers on disk.
2192 *
2193 * Increment the generation number. If the generation number
2194 * rolls over to zero, then a subsequent commit would appear to
2195 * have an older generation than this one -- in that case, issue a
2196 * duplicate commit to avoid this.
2197 *
2198 * => Caller must have exclusive access to wl, either by holding
2199 * wl->wl_rwlock for writer or by being wapbl_start before anyone
2200 * else has seen wl.
2201 */
2202static int
2203wapbl_write_commit(struct wapbl *wl, off_t head, off_t tail)
2204{
2205 struct wapbl_wc_header *wc = wl->wl_wc_header;
2206 struct timespec ts;
2207 int error;
2208 daddr_t pbn;
2209
2210 error = wapbl_buffered_flush(wl);
2211 if (error)
2212 return error;
	/*
	 * Flush disk cache to ensure that the blocks we've written are
	 * actually on stable storage before the commit header.
	 *
	 * XXX Ideally we would compute a checksum over the commit
	 * header here instead; for now the cache flush has to do.
	 */
2219 wapbl_cache_sync(wl, "1");
2220
2221 wc->wc_head = head;
2222 wc->wc_tail = tail;
2223 wc->wc_checksum = 0;
2224 wc->wc_version = 1;
2225 getnanotime(&ts);
2226 wc->wc_time = ts.tv_sec;
2227 wc->wc_timensec = ts.tv_nsec;
2228
2229 WAPBL_PRINTF(WAPBL_PRINT_WRITE,
2230 ("wapbl_write_commit: head = %"PRIdMAX "tail = %"PRIdMAX"\n",
2231 (intmax_t)head, (intmax_t)tail));
2232
2233 /*
2234 * write the commit header.
2235 *
2236 * XXX if generation will rollover, then first zero
2237 * over second commit header before trying to write both headers.
2238 */
2239
2240 pbn = wl->wl_logpbn + (wc->wc_generation % 2);
2241#ifdef _KERNEL
2242 pbn = btodb(pbn << wc->wc_log_dev_bshift);
2243#endif
2244 error = wapbl_buffered_write(wc, wc->wc_len, wl, pbn);
2245 if (error)
2246 return error;
2247 error = wapbl_buffered_flush(wl);
2248 if (error)
2249 return error;
2250
	/*
	 * Flush disk cache to ensure that the commit header is actually
	 * written before the metadata blocks.
	 */
2255 wapbl_cache_sync(wl, "2");
2256
	/*
	 * If the generation number was zero, write it out a second time.
	 * This handles initialization and generation number rollover.
	 */
2261 if (wc->wc_generation++ == 0) {
2262 error = wapbl_write_commit(wl, head, tail);
		/*
		 * This panic could be removed if we did the zeroing
		 * mentioned above and were certain to roll the
		 * generation number back on failure.
		 */
2268 if (error)
2269 panic("wapbl_write_commit: error writing duplicate "
2270 "log header: %d", error);
2271 }
2272 return 0;
2273}
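
/*
 * Illustration (a restatement of the invariant above, not code from
 * this file): commits alternate between the two on-disk header slots
 * by generation parity,
 *
 *	slot = wc->wc_generation % 2;	(before the post-write increment)
 *
 * so after a successful commit the slot holding the larger generation
 * number is always the most recent one; wapbl_replay_start() below
 * reads both slots and trusts the newer.
 */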
2274
2275/*
2276 * wapbl_write_blocks(wl, offp)
2277 *
2278 * Write all pending physical blocks in the current transaction
2279 * from wapbl_add_buf to the log on disk, adding to the circular
2280 * queue head at byte offset *offp, and returning the new head's
2281 * byte offset in *offp.
2282 */
2283static int
2284wapbl_write_blocks(struct wapbl *wl, off_t *offp)
2285{
2286 struct wapbl_wc_blocklist *wc =
2287 (struct wapbl_wc_blocklist *)wl->wl_wc_scratch;
2288 int blocklen = 1<<wl->wl_log_dev_bshift;
2289 struct buf *bp;
2290 off_t off = *offp;
2291 int error;
2292 size_t padding;
2293
2294 KASSERT(rw_write_held(&wl->wl_rwlock));
2295
2296 bp = LIST_FIRST(&wl->wl_bufs);
2297
2298 while (bp) {
2299 int cnt;
2300 struct buf *obp = bp;
2301
2302 KASSERT(bp->b_flags & B_LOCKED);
2303
2304 wc->wc_type = WAPBL_WC_BLOCKS;
2305 wc->wc_len = blocklen;
2306 wc->wc_blkcount = 0;
2307 while (bp && (wc->wc_blkcount < wl->wl_brperjblock)) {
2308 /*
2309 * Make sure all the physical block numbers are up to
2310 * date. If this is not always true on a given
2311 * filesystem, then VOP_BMAP must be called. We
2312 * could call VOP_BMAP here, or else in the filesystem
2313 * specific flush callback, although neither of those
2314 * solutions allow us to take the vnode lock. If a
2315 * filesystem requires that we must take the vnode lock
2316 * to call VOP_BMAP, then we can probably do it in
2317 * bwrite when the vnode lock should already be held
2318 * by the invoking code.
2319 */
2320 KASSERT((bp->b_vp->v_type == VBLK) ||
2321 (bp->b_blkno != bp->b_lblkno));
2322 KASSERT(bp->b_blkno > 0);
2323
2324 wc->wc_blocks[wc->wc_blkcount].wc_daddr = bp->b_blkno;
2325 wc->wc_blocks[wc->wc_blkcount].wc_dlen = bp->b_bcount;
2326 wc->wc_len += bp->b_bcount;
2327 wc->wc_blkcount++;
2328 bp = LIST_NEXT(bp, b_wapbllist);
2329 }
2330 if (wc->wc_len % blocklen != 0) {
2331 padding = blocklen - wc->wc_len % blocklen;
2332 wc->wc_len += padding;
2333 } else {
2334 padding = 0;
2335 }
2336
2337 WAPBL_PRINTF(WAPBL_PRINT_WRITE,
2338 ("wapbl_write_blocks: len = %u (padding %zu) off = %"PRIdMAX"\n",
2339 wc->wc_len, padding, (intmax_t)off));
2340
2341 error = wapbl_circ_write(wl, wc, blocklen, &off);
2342 if (error)
2343 return error;
2344 bp = obp;
2345 cnt = 0;
2346 while (bp && (cnt++ < wl->wl_brperjblock)) {
2347 error = wapbl_circ_write(wl, bp->b_data,
2348 bp->b_bcount, &off);
2349 if (error)
2350 return error;
2351 bp = LIST_NEXT(bp, b_wapbllist);
2352 }
2353 if (padding) {
2354 void *zero;
2355
2356 zero = wapbl_alloc(padding);
2357 memset(zero, 0, padding);
2358 error = wapbl_circ_write(wl, zero, padding, &off);
2359 wapbl_free(zero, padding);
2360 if (error)
2361 return error;
2362 }
2363 }
2364 *offp = off;
2365 return 0;
2366}
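
/*
 * Padding example (hypothetical sizes): with blocklen = 2048 and a
 * chunk whose header plus buffer data come to wc_len = 5120 bytes,
 * the record is padded with 2048 - (5120 % 2048) = 1024 zero bytes so
 * that the next record starts on a log-block boundary, which
 * wapbl_circ_read() assumes during replay.
 */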
2367
2368/*
2369 * wapbl_write_revocations(wl, offp)
2370 *
2371 * Write all pending deallocations in the current transaction from
2372 * wapbl_register_deallocation to the log on disk, adding to the
2373 * circular queue's head at byte offset *offp, and returning the
2374 * new head's byte offset in *offp.
2375 */
2376static int
2377wapbl_write_revocations(struct wapbl *wl, off_t *offp)
2378{
2379 struct wapbl_wc_blocklist *wc =
2380 (struct wapbl_wc_blocklist *)wl->wl_wc_scratch;
2381 struct wapbl_dealloc *wd, *lwd;
2382 int blocklen = 1<<wl->wl_log_dev_bshift;
2383 off_t off = *offp;
2384 int error;
2385
2386 if (wl->wl_dealloccnt == 0)
2387 return 0;
2388
2389 while ((wd = TAILQ_FIRST(&wl->wl_dealloclist)) != NULL) {
2390 wc->wc_type = WAPBL_WC_REVOCATIONS;
2391 wc->wc_len = blocklen;
2392 wc->wc_blkcount = 0;
2393 while (wd && (wc->wc_blkcount < wl->wl_brperjblock)) {
2394 wc->wc_blocks[wc->wc_blkcount].wc_daddr =
2395 wd->wd_blkno;
2396 wc->wc_blocks[wc->wc_blkcount].wc_dlen =
2397 wd->wd_len;
2398 wc->wc_blkcount++;
2399
2400 wd = TAILQ_NEXT(wd, wd_entries);
2401 }
2402 WAPBL_PRINTF(WAPBL_PRINT_WRITE,
2403 ("wapbl_write_revocations: len = %u off = %"PRIdMAX"\n",
2404 wc->wc_len, (intmax_t)off));
2405 error = wapbl_circ_write(wl, wc, blocklen, &off);
2406 if (error)
2407 return error;
2408
2409 /* free all successfully written deallocs */
2410 lwd = wd;
2411 while ((wd = TAILQ_FIRST(&wl->wl_dealloclist)) != NULL) {
2412 if (wd == lwd)
2413 break;
2414 wapbl_deallocation_free(wl, wd, true);
2415 }
2416 }
2417 *offp = off;
2418 return 0;
2419}
2420
2421/*
2422 * wapbl_write_inodes(wl, offp)
2423 *
2424 * Write all pending inode allocations in the current transaction
2425 * from wapbl_register_inode to the log on disk, adding to the
2426 * circular queue's head at byte offset *offp and returning the
2427 * new head's byte offset in *offp.
2428 */
2429static int
2430wapbl_write_inodes(struct wapbl *wl, off_t *offp)
2431{
2432 struct wapbl_wc_inodelist *wc =
2433 (struct wapbl_wc_inodelist *)wl->wl_wc_scratch;
2434 int i;
2435 int blocklen = 1 << wl->wl_log_dev_bshift;
2436 off_t off = *offp;
2437 int error;
2438
2439 struct wapbl_ino_head *wih;
2440 struct wapbl_ino *wi;
2441 int iph;
2442
2443 iph = (blocklen - offsetof(struct wapbl_wc_inodelist, wc_inodes)) /
2444 sizeof(((struct wapbl_wc_inodelist *)0)->wc_inodes[0]);
2445
2446 i = 0;
2447 wih = &wl->wl_inohash[0];
	wi = NULL;
2449 do {
2450 wc->wc_type = WAPBL_WC_INODES;
2451 wc->wc_len = blocklen;
2452 wc->wc_inocnt = 0;
2453 wc->wc_clear = (i == 0);
2454 while ((i < wl->wl_inohashcnt) && (wc->wc_inocnt < iph)) {
2455 while (!wi) {
2456 KASSERT((wih - &wl->wl_inohash[0])
2457 <= wl->wl_inohashmask);
2458 wi = LIST_FIRST(wih++);
2459 }
2460 wc->wc_inodes[wc->wc_inocnt].wc_inumber = wi->wi_ino;
2461 wc->wc_inodes[wc->wc_inocnt].wc_imode = wi->wi_mode;
2462 wc->wc_inocnt++;
2463 i++;
2464 wi = LIST_NEXT(wi, wi_hash);
2465 }
2466 WAPBL_PRINTF(WAPBL_PRINT_WRITE,
2467 ("wapbl_write_inodes: len = %u off = %"PRIdMAX"\n",
2468 wc->wc_len, (intmax_t)off));
2469 error = wapbl_circ_write(wl, wc, blocklen, &off);
2470 if (error)
2471 return error;
2472 } while (i < wl->wl_inohashcnt);
2473
2474 *offp = off;
2475 return 0;
2476}
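
/*
 * Note: wc_clear is set only on the first inodelist record of a
 * transaction; wapbl_replay_process_inodes() below relies on it to
 * discard the previous transaction's inode list before accumulating
 * the records that follow.
 */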
2477
2478#endif /* _KERNEL */
2479
2480/****************************************************************/
2481
2482struct wapbl_blk {
2483 LIST_ENTRY(wapbl_blk) wb_hash;
2484 daddr_t wb_blk;
2485 off_t wb_off; /* Offset of this block in the log */
2486};
2487#define WAPBL_BLKPOOL_MIN 83
2488
2489static void
2490wapbl_blkhash_init(struct wapbl_replay *wr, u_int size)
2491{
2492 if (size < WAPBL_BLKPOOL_MIN)
2493 size = WAPBL_BLKPOOL_MIN;
2494 KASSERT(wr->wr_blkhash == 0);
2495#ifdef _KERNEL
2496 wr->wr_blkhash = hashinit(size, HASH_LIST, true, &wr->wr_blkhashmask);
2497#else /* ! _KERNEL */
2498 /* Manually implement hashinit */
2499 {
2500 unsigned long i, hashsize;
2501 for (hashsize = 1; hashsize < size; hashsize <<= 1)
2502 continue;
2503 wr->wr_blkhash = wapbl_alloc(hashsize * sizeof(*wr->wr_blkhash));
2504 for (i = 0; i < hashsize; i++)
2505 LIST_INIT(&wr->wr_blkhash[i]);
2506 wr->wr_blkhashmask = hashsize - 1;
2507 }
2508#endif /* ! _KERNEL */
2509}
2510
2511static void
2512wapbl_blkhash_free(struct wapbl_replay *wr)
2513{
2514 KASSERT(wr->wr_blkhashcnt == 0);
2515#ifdef _KERNEL
2516 hashdone(wr->wr_blkhash, HASH_LIST, wr->wr_blkhashmask);
2517#else /* ! _KERNEL */
2518 wapbl_free(wr->wr_blkhash,
2519 (wr->wr_blkhashmask + 1) * sizeof(*wr->wr_blkhash));
2520#endif /* ! _KERNEL */
2521}
2522
2523static struct wapbl_blk *
2524wapbl_blkhash_get(struct wapbl_replay *wr, daddr_t blk)
2525{
2526 struct wapbl_blk_head *wbh;
2527 struct wapbl_blk *wb;
2528 wbh = &wr->wr_blkhash[blk & wr->wr_blkhashmask];
2529 LIST_FOREACH(wb, wbh, wb_hash) {
2530 if (blk == wb->wb_blk)
2531 return wb;
2532 }
	return NULL;
2534}
2535
2536static void
2537wapbl_blkhash_ins(struct wapbl_replay *wr, daddr_t blk, off_t off)
2538{
2539 struct wapbl_blk_head *wbh;
2540 struct wapbl_blk *wb;
2541 wb = wapbl_blkhash_get(wr, blk);
2542 if (wb) {
2543 KASSERT(wb->wb_blk == blk);
2544 wb->wb_off = off;
2545 } else {
2546 wb = wapbl_alloc(sizeof(*wb));
2547 wb->wb_blk = blk;
2548 wb->wb_off = off;
2549 wbh = &wr->wr_blkhash[blk & wr->wr_blkhashmask];
2550 LIST_INSERT_HEAD(wbh, wb, wb_hash);
2551 wr->wr_blkhashcnt++;
2552 }
2553}
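
/*
 * Note: on a duplicate insertion wapbl_blkhash_ins() simply
 * overwrites wb_off, so because wapbl_replay_process() scans the log
 * from tail to head, each hash entry ends up pointing at the newest
 * logged copy of its block, which is exactly the copy replay must
 * write out.
 */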
2554
2555static void
2556wapbl_blkhash_rem(struct wapbl_replay *wr, daddr_t blk)
2557{
2558 struct wapbl_blk *wb = wapbl_blkhash_get(wr, blk);
2559 if (wb) {
2560 KASSERT(wr->wr_blkhashcnt > 0);
2561 wr->wr_blkhashcnt--;
2562 LIST_REMOVE(wb, wb_hash);
2563 wapbl_free(wb, sizeof(*wb));
2564 }
2565}
2566
2567static void
2568wapbl_blkhash_clear(struct wapbl_replay *wr)
2569{
2570 unsigned long i;
2571 for (i = 0; i <= wr->wr_blkhashmask; i++) {
2572 struct wapbl_blk *wb;
2573
2574 while ((wb = LIST_FIRST(&wr->wr_blkhash[i]))) {
2575 KASSERT(wr->wr_blkhashcnt > 0);
2576 wr->wr_blkhashcnt--;
2577 LIST_REMOVE(wb, wb_hash);
2578 wapbl_free(wb, sizeof(*wb));
2579 }
2580 }
2581 KASSERT(wr->wr_blkhashcnt == 0);
2582}
2583
2584/****************************************************************/
2585
2586/*
2587 * wapbl_circ_read(wr, data, len, offp)
2588 *
2589 * Read len bytes into data from the circular queue of wr,
2590 * starting at the linear byte offset *offp, and returning the new
2591 * linear byte offset in *offp.
2592 *
2593 * If the starting linear byte offset precedes wr->wr_circ_off,
2594 * the read instead begins at wr->wr_circ_off. XXX WTF? This
2595 * should be a KASSERT, not a conditional.
2596 */
2597static int
2598wapbl_circ_read(struct wapbl_replay *wr, void *data, size_t len, off_t *offp)
2599{
2600 size_t slen;
2601 off_t off = *offp;
2602 int error;
2603 daddr_t pbn;
2604
2605 KASSERT(((len >> wr->wr_log_dev_bshift) <<
2606 wr->wr_log_dev_bshift) == len);
2607
2608 if (off < wr->wr_circ_off)
2609 off = wr->wr_circ_off;
2610 slen = wr->wr_circ_off + wr->wr_circ_size - off;
2611 if (slen < len) {
2612 pbn = wr->wr_logpbn + (off >> wr->wr_log_dev_bshift);
2613#ifdef _KERNEL
2614 pbn = btodb(pbn << wr->wr_log_dev_bshift);
2615#endif
2616 error = wapbl_read(data, slen, wr->wr_devvp, pbn);
2617 if (error)
2618 return error;
2619 data = (uint8_t *)data + slen;
2620 len -= slen;
2621 off = wr->wr_circ_off;
2622 }
2623 pbn = wr->wr_logpbn + (off >> wr->wr_log_dev_bshift);
2624#ifdef _KERNEL
2625 pbn = btodb(pbn << wr->wr_log_dev_bshift);
2626#endif
2627 error = wapbl_read(data, len, wr->wr_devvp, pbn);
2628 if (error)
2629 return error;
2630 off += len;
2631 if (off >= wr->wr_circ_off + wr->wr_circ_size)
2632 off = wr->wr_circ_off;
2633 *offp = off;
2634 return 0;
2635}
2636
2637/*
2638 * wapbl_circ_advance(wr, len, offp)
2639 *
2640 * Compute the linear byte offset of the circular queue of wr that
2641 * is len bytes past *offp, and store it in *offp.
2642 *
2643 * This is as if wapbl_circ_read, but without actually reading
2644 * anything.
2645 *
2646 * If the starting linear byte offset precedes wr->wr_circ_off, it
2647 * is taken to be wr->wr_circ_off instead. XXX WTF? This should
2648 * be a KASSERT, not a conditional.
2649 */
2650static void
2651wapbl_circ_advance(struct wapbl_replay *wr, size_t len, off_t *offp)
2652{
2653 size_t slen;
2654 off_t off = *offp;
2655
2656 KASSERT(((len >> wr->wr_log_dev_bshift) <<
2657 wr->wr_log_dev_bshift) == len);
2658
2659 if (off < wr->wr_circ_off)
2660 off = wr->wr_circ_off;
2661 slen = wr->wr_circ_off + wr->wr_circ_size - off;
2662 if (slen < len) {
2663 len -= slen;
2664 off = wr->wr_circ_off;
2665 }
2666 off += len;
2667 if (off >= wr->wr_circ_off + wr->wr_circ_size)
2668 off = wr->wr_circ_off;
2669 *offp = off;
2670}
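
/*
 * Wraparound example (hypothetical geometry): with wr_circ_off = 2048
 * and wr_circ_size = 8192, a 4096-byte read starting at *offp = 8192
 * has only slen = (2048 + 8192) - 8192 = 2048 bytes left before the
 * end of the circle, so it reads those, wraps to offset 2048, reads
 * the remaining 2048 bytes, and leaves *offp = 4096.
 */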
2671
2672/****************************************************************/
2673
2674int
2675wapbl_replay_start(struct wapbl_replay **wrp, struct vnode *vp,
2676 daddr_t off, size_t count, size_t blksize)
2677{
2678 struct wapbl_replay *wr;
2679 int error;
2680 struct vnode *devvp;
2681 daddr_t logpbn;
2682 uint8_t *scratch;
2683 struct wapbl_wc_header *wch;
2684 struct wapbl_wc_header *wch2;
2685 /* Use this until we read the actual log header */
2686 int log_dev_bshift = ilog2(blksize);
2687 size_t used;
2688 daddr_t pbn;
2689
2690 WAPBL_PRINTF(WAPBL_PRINT_REPLAY,
2691 ("wapbl_replay_start: vp=%p off=%"PRId64 " count=%zu blksize=%zu\n",
2692 vp, off, count, blksize));
2693
2694 if (off < 0)
2695 return EINVAL;
2696
2697 if (blksize < DEV_BSIZE)
2698 return EINVAL;
2699 if (blksize % DEV_BSIZE)
2700 return EINVAL;
2701
2702#ifdef _KERNEL
2703#if 0
2704 /* XXX vp->v_size isn't reliably set for VBLK devices,
2705 * especially root. However, we might still want to verify
2706 * that the full load is readable */
2707 if ((off + count) * blksize > vp->v_size)
2708 return EINVAL;
2709#endif
2710 if ((error = VOP_BMAP(vp, off, &devvp, &logpbn, 0)) != 0) {
2711 return error;
2712 }
2713#else /* ! _KERNEL */
2714 devvp = vp;
2715 logpbn = off;
2716#endif /* ! _KERNEL */
2717
2718 scratch = wapbl_alloc(MAXBSIZE);
2719
2720 pbn = logpbn;
2721#ifdef _KERNEL
2722 pbn = btodb(pbn << log_dev_bshift);
2723#endif
2724 error = wapbl_read(scratch, 2<<log_dev_bshift, devvp, pbn);
2725 if (error)
2726 goto errout;
2727
2728 wch = (struct wapbl_wc_header *)scratch;
2729 wch2 =
2730 (struct wapbl_wc_header *)(scratch + (1<<log_dev_bshift));
2731 /* XXX verify checksums and magic numbers */
2732 if (wch->wc_type != WAPBL_WC_HEADER) {
2733 printf("Unrecognized wapbl magic: 0x%08x\n", wch->wc_type);
2734 error = EFTYPE;
2735 goto errout;
2736 }
2737
2738 if (wch2->wc_generation > wch->wc_generation)
2739 wch = wch2;
2740
2741 wr = wapbl_calloc(1, sizeof(*wr));
2742
2743 wr->wr_logvp = vp;
2744 wr->wr_devvp = devvp;
2745 wr->wr_logpbn = logpbn;
2746
2747 wr->wr_scratch = scratch;
2748
2749 wr->wr_log_dev_bshift = wch->wc_log_dev_bshift;
2750 wr->wr_fs_dev_bshift = wch->wc_fs_dev_bshift;
2751 wr->wr_circ_off = wch->wc_circ_off;
2752 wr->wr_circ_size = wch->wc_circ_size;
2753 wr->wr_generation = wch->wc_generation;
2754
2755 used = wapbl_space_used(wch->wc_circ_size, wch->wc_head, wch->wc_tail);
2756
2757 WAPBL_PRINTF(WAPBL_PRINT_REPLAY,
2758 ("wapbl_replay: head=%"PRId64" tail=%"PRId64" off=%"PRId64
2759 " len=%"PRId64" used=%zu\n",
2760 wch->wc_head, wch->wc_tail, wch->wc_circ_off,
2761 wch->wc_circ_size, used));
2762
2763 wapbl_blkhash_init(wr, (used >> wch->wc_fs_dev_bshift));
2764
2765 error = wapbl_replay_process(wr, wch->wc_head, wch->wc_tail);
2766 if (error) {
2767 wapbl_replay_stop(wr);
2768 wapbl_replay_free(wr);
2769 return error;
2770 }
2771
2772 *wrp = wr;
2773 return 0;
2774
2775 errout:
2776 wapbl_free(scratch, MAXBSIZE);
2777 return error;
2778}
2779
2780void
2781wapbl_replay_stop(struct wapbl_replay *wr)
2782{
2783
2784 if (!wapbl_replay_isopen(wr))
2785 return;
2786
2787 WAPBL_PRINTF(WAPBL_PRINT_REPLAY, ("wapbl_replay_stop called\n"));
2788
2789 wapbl_free(wr->wr_scratch, MAXBSIZE);
2790 wr->wr_scratch = NULL;
2791
2792 wr->wr_logvp = NULL;
2793
2794 wapbl_blkhash_clear(wr);
2795 wapbl_blkhash_free(wr);
2796}
2797
2798void
2799wapbl_replay_free(struct wapbl_replay *wr)
2800{
2801
2802 KDASSERT(!wapbl_replay_isopen(wr));
2803
2804 if (wr->wr_inodes)
2805 wapbl_free(wr->wr_inodes,
2806 wr->wr_inodescnt * sizeof(wr->wr_inodes[0]));
2807 wapbl_free(wr, sizeof(*wr));
2808}
2809
2810#ifdef _KERNEL
2811int
2812wapbl_replay_isopen1(struct wapbl_replay *wr)
2813{
2814
2815 return wapbl_replay_isopen(wr);
2816}
2817#endif
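
/*
 * Typical replay sequence (a sketch of how a mount path might drive
 * the API; error handling elided):
 *
 *	struct wapbl_replay *wr;
 *	error = wapbl_replay_start(&wr, vp, off, count, blksize);
 *	if (error == 0) {
 *		error = wapbl_replay_write(wr, fsdevvp);
 *		wapbl_replay_stop(wr);
 *		wapbl_replay_free(wr);
 *	}
 */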
2818
/*
 * Calculate the disk address for the i'th block in the wc_blocklist,
 * offset by j blocks of size blen.
 *
 * wc_daddr is always a kernel disk address in DEV_BSIZE units that
 * was written to the journal.
 *
 * The kernel needs that address plus the offset in DEV_BSIZE units.
 *
 * Userland needs that address plus the offset in blen units.
 */
2831static daddr_t
2832wapbl_block_daddr(struct wapbl_wc_blocklist *wc, int i, int j, int blen)
2833{
2834 daddr_t pbn;
2835
2836#ifdef _KERNEL
2837 pbn = wc->wc_blocks[i].wc_daddr + btodb(j * blen);
2838#else
2839 pbn = dbtob(wc->wc_blocks[i].wc_daddr) / blen + j;
2840#endif
2841
2842 return pbn;
2843}
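
/*
 * Worked example (assuming DEV_BSIZE = 512 and blen = 4096): for a
 * journalled block with wc_daddr = 1024 and j = 1, the kernel gets
 * 1024 + btodb(4096) = 1024 + 8 = 1032 (DEV_BSIZE units), while
 * userland gets dbtob(1024) / 4096 + 1 = 129 (4096-byte units); both
 * name the same byte offset, 524288 + 4096.
 */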
2844
2845static void
2846wapbl_replay_process_blocks(struct wapbl_replay *wr, off_t *offp)
2847{
2848 struct wapbl_wc_blocklist *wc =
2849 (struct wapbl_wc_blocklist *)wr->wr_scratch;
2850 int fsblklen = 1 << wr->wr_fs_dev_bshift;
2851 int i, j, n;
2852
2853 for (i = 0; i < wc->wc_blkcount; i++) {
2854 /*
2855 * Enter each physical block into the hashtable independently.
2856 */
2857 n = wc->wc_blocks[i].wc_dlen >> wr->wr_fs_dev_bshift;
2858 for (j = 0; j < n; j++) {
2859 wapbl_blkhash_ins(wr, wapbl_block_daddr(wc, i, j, fsblklen),
2860 *offp);
2861 wapbl_circ_advance(wr, fsblklen, offp);
2862 }
2863 }
2864}
2865
2866static void
2867wapbl_replay_process_revocations(struct wapbl_replay *wr)
2868{
2869 struct wapbl_wc_blocklist *wc =
2870 (struct wapbl_wc_blocklist *)wr->wr_scratch;
2871 int fsblklen = 1 << wr->wr_fs_dev_bshift;
2872 int i, j, n;
2873
2874 for (i = 0; i < wc->wc_blkcount; i++) {
2875 /*
2876 * Remove any blocks found from the hashtable.
2877 */
2878 n = wc->wc_blocks[i].wc_dlen >> wr->wr_fs_dev_bshift;
2879 for (j = 0; j < n; j++)
2880 wapbl_blkhash_rem(wr, wapbl_block_daddr(wc, i, j, fsblklen));
2881 }
2882}
2883
2884static void
2885wapbl_replay_process_inodes(struct wapbl_replay *wr, off_t oldoff, off_t newoff)
2886{
2887 struct wapbl_wc_inodelist *wc =
2888 (struct wapbl_wc_inodelist *)wr->wr_scratch;
2889 void *new_inodes;
2890 const size_t oldsize = wr->wr_inodescnt * sizeof(wr->wr_inodes[0]);
2891
2892 KASSERT(sizeof(wr->wr_inodes[0]) == sizeof(wc->wc_inodes[0]));
2893
	/*
	 * Keep track of where we found this inode list so that its
	 * location won't be overwritten.
	 */
2898 if (wc->wc_clear) {
2899 wr->wr_inodestail = oldoff;
2900 wr->wr_inodescnt = 0;
2901 if (wr->wr_inodes != NULL) {
2902 wapbl_free(wr->wr_inodes, oldsize);
2903 wr->wr_inodes = NULL;
2904 }
2905 }
2906 wr->wr_inodeshead = newoff;
2907 if (wc->wc_inocnt == 0)
2908 return;
2909
2910 new_inodes = wapbl_alloc((wr->wr_inodescnt + wc->wc_inocnt) *
2911 sizeof(wr->wr_inodes[0]));
2912 if (wr->wr_inodes != NULL) {
2913 memcpy(new_inodes, wr->wr_inodes, oldsize);
2914 wapbl_free(wr->wr_inodes, oldsize);
2915 }
2916 wr->wr_inodes = new_inodes;
2917 memcpy(&wr->wr_inodes[wr->wr_inodescnt], wc->wc_inodes,
2918 wc->wc_inocnt * sizeof(wr->wr_inodes[0]));
2919 wr->wr_inodescnt += wc->wc_inocnt;
2920}
2921
2922static int
2923wapbl_replay_process(struct wapbl_replay *wr, off_t head, off_t tail)
2924{
2925 off_t off;
2926 int error;
2927
2928 int logblklen = 1 << wr->wr_log_dev_bshift;
2929
2930 wapbl_blkhash_clear(wr);
2931
2932 off = tail;
2933 while (off != head) {
2934 struct wapbl_wc_null *wcn;
2935 off_t saveoff = off;
2936 error = wapbl_circ_read(wr, wr->wr_scratch, logblklen, &off);
2937 if (error)
2938 goto errout;
2939 wcn = (struct wapbl_wc_null *)wr->wr_scratch;
2940 switch (wcn->wc_type) {
2941 case WAPBL_WC_BLOCKS:
2942 wapbl_replay_process_blocks(wr, &off);
2943 break;
2944
2945 case WAPBL_WC_REVOCATIONS:
2946 wapbl_replay_process_revocations(wr);
2947 break;
2948
2949 case WAPBL_WC_INODES:
2950 wapbl_replay_process_inodes(wr, saveoff, off);
2951 break;
2952
2953 default:
2954 printf("Unrecognized wapbl type: 0x%08x\n",
2955 wcn->wc_type);
2956 error = EFTYPE;
2957 goto errout;
2958 }
2959 wapbl_circ_advance(wr, wcn->wc_len, &saveoff);
2960 if (off != saveoff) {
2961 printf("wapbl_replay: corrupted records\n");
2962 error = EFTYPE;
2963 goto errout;
2964 }
2965 }
2966 return 0;
2967
2968 errout:
2969 wapbl_blkhash_clear(wr);
2970 return error;
2971}
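
/*
 * Record framing note (fields as assumed here, from the common record
 * prefix): every record begins with
 *
 *	wc_type		one of WAPBL_WC_{BLOCKS,REVOCATIONS,INODES}
 *	wc_len		total record length, a log-block multiple
 *
 * so the loop above reads one log block, dispatches on wc_type, and
 * then verifies that advancing the pre-record offset by wc_len lands
 * exactly where the handler stopped consuming data.
 */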
2972
2973#if 0
2974int
2975wapbl_replay_verify(struct wapbl_replay *wr, struct vnode *fsdevvp)
2976{
2977 off_t off;
2978 int mismatchcnt = 0;
2979 int logblklen = 1 << wr->wr_log_dev_bshift;
2980 int fsblklen = 1 << wr->wr_fs_dev_bshift;
2981 void *scratch1 = wapbl_alloc(MAXBSIZE);
2982 void *scratch2 = wapbl_alloc(MAXBSIZE);
2983 int error = 0;
2984
2985 KDASSERT(wapbl_replay_isopen(wr));
2986
	/*
	 * XXX "wch" is not declared in this disabled code; it needs
	 * updating for the current wapbl_replay layout before it can
	 * be re-enabled.
	 */
	off = wch->wc_tail;
2988 while (off != wch->wc_head) {
2989 struct wapbl_wc_null *wcn;
2990#ifdef DEBUG
2991 off_t saveoff = off;
2992#endif
2993 error = wapbl_circ_read(wr, wr->wr_scratch, logblklen, &off);
2994 if (error)
2995 goto out;
2996 wcn = (struct wapbl_wc_null *)wr->wr_scratch;
2997 switch (wcn->wc_type) {
2998 case WAPBL_WC_BLOCKS:
2999 {
3000 struct wapbl_wc_blocklist *wc =
3001 (struct wapbl_wc_blocklist *)wr->wr_scratch;
3002 int i;
3003 for (i = 0; i < wc->wc_blkcount; i++) {
3004 int foundcnt = 0;
3005 int dirtycnt = 0;
3006 int j, n;
				/*
				 * Check each physical block against
				 * the hashtable independently.
				 */
3011 n = wc->wc_blocks[i].wc_dlen >>
3012 wch->wc_fs_dev_bshift;
3013 for (j = 0; j < n; j++) {
3014 struct wapbl_blk *wb =
3015 wapbl_blkhash_get(wr,
3016 wapbl_block_daddr(wc, i, j, fsblklen));
3017 if (wb && (wb->wb_off == off)) {
3018 foundcnt++;
3019 error =
3020 wapbl_circ_read(wr,
3021 scratch1, fsblklen,
3022 &off);
3023 if (error)
3024 goto out;
3025 error =
3026 wapbl_read(scratch2,
3027 fsblklen, fsdevvp,
3028 wb->wb_blk);
3029 if (error)
3030 goto out;
3031 if (memcmp(scratch1,
3032 scratch2,
3033 fsblklen)) {
3034 printf(
3035 "wapbl_verify: mismatch block %"PRId64" at off %"PRIdMAX"\n",
3036 wb->wb_blk, (intmax_t)off);
3037 dirtycnt++;
3038 mismatchcnt++;
3039 }
3040 } else {
3041 wapbl_circ_advance(wr,
3042 fsblklen, &off);
3043 }
3044 }
3045#if 0
3046 /*
3047 * If all of the blocks in an entry
3048 * are clean, then remove all of its
3049 * blocks from the hashtable since they
3050 * never will need replay.
3051 */
3052 if ((foundcnt != 0) &&
3053 (dirtycnt == 0)) {
3054 off = saveoff;
3055 wapbl_circ_advance(wr,
3056 logblklen, &off);
3057 for (j = 0; j < n; j++) {
3058 struct wapbl_blk *wb =
3059 wapbl_blkhash_get(wr,
3060 wapbl_block_daddr(wc, i, j, fsblklen));
3061 if (wb &&
3062 (wb->wb_off == off)) {
3063 wapbl_blkhash_rem(wr, wb->wb_blk);
3064 }
3065 wapbl_circ_advance(wr,
3066 fsblklen, &off);
3067 }
3068 }
3069#endif
3070 }
3071 }
3072 break;
3073 case WAPBL_WC_REVOCATIONS:
3074 case WAPBL_WC_INODES:
3075 break;
3076 default:
3077 KASSERT(0);
3078 }
3079#ifdef DEBUG
3080 wapbl_circ_advance(wr, wcn->wc_len, &saveoff);
3081 KASSERT(off == saveoff);
3082#endif
3083 }
3084 out:
3085 wapbl_free(scratch1, MAXBSIZE);
3086 wapbl_free(scratch2, MAXBSIZE);
3087 if (!error && mismatchcnt)
3088 error = EFTYPE;
3089 return error;
3090}
3091#endif
3092
3093int
3094wapbl_replay_write(struct wapbl_replay *wr, struct vnode *fsdevvp)
3095{
3096 struct wapbl_blk *wb;
3097 size_t i;
3098 off_t off;
3099 void *scratch;
3100 int error = 0;
3101 int fsblklen = 1 << wr->wr_fs_dev_bshift;
3102
3103 KDASSERT(wapbl_replay_isopen(wr));
3104
3105 scratch = wapbl_alloc(MAXBSIZE);
3106
3107 for (i = 0; i <= wr->wr_blkhashmask; ++i) {
3108 LIST_FOREACH(wb, &wr->wr_blkhash[i], wb_hash) {
3109 off = wb->wb_off;
3110 error = wapbl_circ_read(wr, scratch, fsblklen, &off);
3111 if (error)
3112 break;
3113 error = wapbl_write(scratch, fsblklen, fsdevvp,
3114 wb->wb_blk);
3115 if (error)
3116 break;
3117 }
3118 }
3119
3120 wapbl_free(scratch, MAXBSIZE);
3121 return error;
3122}
3123
3124int
3125wapbl_replay_can_read(struct wapbl_replay *wr, daddr_t blk, long len)
3126{
3127 int fsblklen = 1 << wr->wr_fs_dev_bshift;
3128
3129 KDASSERT(wapbl_replay_isopen(wr));
3130 KASSERT((len % fsblklen) == 0);
3131
	while (len != 0) {
		struct wapbl_blk *wb = wapbl_blkhash_get(wr, blk);
		if (wb)
			return 1;
		len -= fsblklen;
		blk++;		/* check every block in the range */
	}
3138 return 0;
3139}
3140
3141int
3142wapbl_replay_read(struct wapbl_replay *wr, void *data, daddr_t blk, long len)
3143{
3144 int fsblklen = 1 << wr->wr_fs_dev_bshift;
3145
3146 KDASSERT(wapbl_replay_isopen(wr));
3147
3148 KASSERT((len % fsblklen) == 0);
3149
3150 while (len != 0) {
3151 struct wapbl_blk *wb = wapbl_blkhash_get(wr, blk);
3152 if (wb) {
3153 off_t off = wb->wb_off;
3154 int error;
3155 error = wapbl_circ_read(wr, data, fsblklen, &off);
3156 if (error)
3157 return error;
3158 }
3159 data = (uint8_t *)data + fsblklen;
3160 len -= fsblklen;
3161 blk++;
3162 }
3163 return 0;
3164}
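
/*
 * Example (a sketch of a caller, not from this file): during mount,
 * metadata reads can be overlaid with not-yet-replayed journal
 * contents:
 *
 *	if (wapbl_replay_can_read(wr, blk, len))
 *		error = wapbl_replay_read(wr, data, blk, len);
 *
 * so the caller sees the newest logged copy of a block even before
 * wapbl_replay_write() has flushed it to the filesystem device.
 */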
3165
3166#ifdef _KERNEL
3167
3168MODULE(MODULE_CLASS_VFS, wapbl, NULL);
3169
3170static int
3171wapbl_modcmd(modcmd_t cmd, void *arg)
3172{
3173
3174 switch (cmd) {
3175 case MODULE_CMD_INIT:
3176 wapbl_init();
3177 return 0;
3178 case MODULE_CMD_FINI:
3179 return wapbl_fini();
3180 default:
3181 return ENOTTY;
3182 }
3183}
3184#endif /* _KERNEL */
3185