lfs_segment.c source code [src/src/sys/ufs/lfs/lfs_segment.c]

/*	$NetBSD: lfs_segment.c,v 1.263 2015/10/19 04:21:48 dholland Exp $	*/
2
/*-
4	* Copyright (c) 1999, 2000, 2001, 2002, 2003 The NetBSD Foundation, Inc.
5	* All rights reserved.
6	*
7	* This code is derived from software contributed to The NetBSD Foundation
8	* by Konrad E. Schroder <perseant@hhhh.org>.
9	*
10	* Redistribution and use in source and binary forms, with or without
11	* modification, are permitted provided that the following conditions
12	* are met:
13	* 1. Redistributions of source code must retain the above copyright
14	* notice, this list of conditions and the following disclaimer.
15	* 2. Redistributions in binary form must reproduce the above copyright
16	* notice, this list of conditions and the following disclaimer in the
17	* documentation and/or other materials provided with the distribution.
18	*
19	* THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
20	* ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
21	* TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
22	* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
23	* BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
24	* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
25	* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
26	* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
27	* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
28	* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
29	* POSSIBILITY OF SUCH DAMAGE.
30	*/
31	/*
32	* Copyright (c) 1991, 1993
33	* The Regents of the University of California. All rights reserved.
34	*
35	* Redistribution and use in source and binary forms, with or without
36	* modification, are permitted provided that the following conditions
37	* are met:
38	* 1. Redistributions of source code must retain the above copyright
39	* notice, this list of conditions and the following disclaimer.
40	* 2. Redistributions in binary form must reproduce the above copyright
41	* notice, this list of conditions and the following disclaimer in the
42	* documentation and/or other materials provided with the distribution.
43	* 3. Neither the name of the University nor the names of its contributors
44	* may be used to endorse or promote products derived from this software
45	* without specific prior written permission.
46	*
47	* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
48	* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
49	* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
50	* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
51	* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
52	* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
53	* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
54	* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
55	* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
56	* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
57	* SUCH DAMAGE.
58	*
59	* @(#)lfs_segment.c 8.10 (Berkeley) 6/10/95
60	*/
61
62	#include <sys/cdefs.h>
63	__KERNEL_RCSID(`0`, "$NetBSD: lfs_segment.c,v 1.263 2015/10/19 04:21:48 dholland Exp $");
64
65	#ifdef DEBUG
66	# define vndebug(vp, str) do { \
67	if (VTOI(vp)->i_flag & IN_CLEANING) \
68	DLOG((DLOG_WVNODE, "not writing ino %d because %s (op %d)\n", \
69	VTOI(vp)->i_number, (str), op)); \
70	} while(0)
71	#else
72	# define vndebug(vp, str)
73	#endif
74	#define ivndebug(vp, str) \
75	DLOG((DLOG_WVNODE, "ino %d: %s\n", VTOI(vp)->i_number, (str)))
76
77	#if defined(_KERNEL_OPT)
78	#include "opt_ddb.h"
79	#endif
80
81	#include <sys/param.h>
82	#include <sys/systm.h>
83	#include <sys/namei.h>
84	#include <sys/kernel.h>
85	#include <sys/resourcevar.h>
86	#include <sys/file.h>
87	#include <sys/stat.h>
88	#include <sys/buf.h>
89	#include <sys/proc.h>
90	#include <sys/vnode.h>
91	#include <sys/mount.h>
92	#include <sys/kauth.h>
93	#include <sys/syslog.h>
94
95	#include <miscfs/specfs/specdev.h>
96	#include <miscfs/fifofs/fifo.h>
97
98	#include <ufs/lfs/ulfs_inode.h>
99	#include <ufs/lfs/ulfsmount.h>
100	#include <ufs/lfs/ulfs_extern.h>
101
102	#include <ufs/lfs/lfs.h>
103	#include <ufs/lfs/lfs_accessors.h>
104	#include <ufs/lfs/lfs_kernel.h>
105	#include <ufs/lfs/lfs_extern.h>
106
107	#include <uvm/uvm.h>
108	#include <uvm/uvm_extern.h>
109
110	MALLOC_JUSTDEFINE(M_SEGMENT, "LFS segment", "Segment for LFS");
111
112	static void lfs_generic_callback(struct buf , void* ()(struct* buf *));
113	static void lfs_free_aiodone(struct buf *);
114	static void lfs_super_aiodone(struct buf *);
115	static void lfs_cluster_aiodone(struct buf *);
116	static void lfs_cluster_callback(struct buf *);
117
/*
 * Can another partial segment be started in the current segment, or do
 * we have to move on to a new one?  True when the room left in the
 * segment (fsbpseg minus how far the write offset has advanced past the
 * start of the current segment) exceeds lfs_sb_getfrag(fs).
 */
#define	LFS_PARTIAL_FITS(fs)						\
	((lfs_sb_getfsbpseg(fs) -					\
	  (lfs_sb_getoffset(fs) - lfs_sb_getcurseg(fs))) >		\
	 lfs_sb_getfrag(fs))
126
/*
 * Figure out whether we should do a checkpoint write or go ahead with
 * an ordinary write.
 *
 * Never checkpoint when SEGM_CLEAN is set in flags.  Otherwise checkpoint
 * if too many segments are active, if the caller asked for it (SEGM_CKP),
 * or if the number of clean segments has dropped below LFS_MAX_ACTIVE.
 *
 * Both arguments are fully parenthesized so that expressions such as
 * "a | b" may be passed as flags without precedence surprises.
 */
#define	LFS_SHOULD_CHECKPOINT(fs, flags)				\
	((((flags) & SEGM_CLEAN) == 0) &&				\
	 (((fs)->lfs_nactive > LFS_MAX_ACTIVE) ||			\
	  ((flags) & SEGM_CKP) ||					\
	  (lfs_sb_getnclean(fs) < LFS_MAX_ACTIVE)))
136
137	int lfs_match_fake(struct lfs , struct* buf *);
138	void lfs_newseg(struct lfs *);
139	void lfs_supercallback(struct buf *);
140	void lfs_updatemeta(struct segment *);
141	void lfs_writesuper(struct lfs *, daddr_t);
142	int lfs_writevnodes(struct lfs fs, struct* mount *mp,
143	struct segment sp, int* dirops);
144
145	static void lfs_shellsort(struct lfs , struct* buf , union** lfs_blocks *,
146	int, int);
147
148	int lfs_allclean_wakeup; / Cleaner wakeup address. /
149	int lfs_writeindir = `1`; / whether to flush indir on non-ckp /
150	int lfs_clean_vnhead = `0`; / Allow freeing to head of vn list /
151	int lfs_dirvcount = `0`; / # active dirops /
152
153	/ Statistics Counters /
154	int lfs_dostats = `1`;
155	struct lfs_stats lfs_stats;
156
/*
 * op values to lfs_writevnodes: they select which vnodes
 * lfs_writevnodes_selector() accepts.
 */
#define	VN_REG		0	/* everything except dirop vnodes */
#define	VN_DIROP	1	/* only vnodes marked VU_DIROP */
#define	VN_EMPTY	2	/* only vnodes for which VPISEMPTY() holds */
#define	VN_CLEAN	3	/* Ifile, vnode being flushed, IN_CLEANING */
162
163	/*
164	* XXX KS - Set modification time on the Ifile, so the cleaner can
165	* read the fs mod time off of it. We don't set IN_UPDATE here,
166	* since we don't really need this to be flushed to disk (and in any
167	* case that wouldn't happen to the Ifile until we checkpoint).
168	*/
169	void
170	lfs_imtime(struct lfs *fs)
171	{
172	struct timespec ts;
173	struct inode *ip;
174
175	ASSERT_MAYBE_SEGLOCK(fs);
176	vfs_timestamp(&ts);
177	ip = VTOI(fs->lfs_ivnode);
178	lfs_dino_setmtime(fs, ip->i_din, ts.tv_sec);
179	lfs_dino_setmtimensec(fs, ip->i_din, ts.tv_nsec);
180	}
181
/*
 * Ifile and meta data blocks are not marked busy, so segment writes MUST
 * be single threaded.  Currently, there are two paths into lfs_segwrite:
 * sync() and getnewbuf().  They both mark the file system busy, and
 * lfs_vflush() explicitly marks the file system busy.  So lfs_segwrite is
 * safe.  I think.
 */

/* True when vp is the vnode recorded in fs->lfs_flushvp. */
#define	IS_FLUSHING(fs, vp)	((vp) == (fs)->lfs_flushvp)
190
191	int
192	lfs_vflush(struct vnode *vp)
193	{
194	struct inode *ip;
195	struct lfs *fs;
196	struct segment *sp;
197	struct buf bp, nbp, tbp, tnbp;
198	int error;
199	int flushed;
200	int relock;
201
202	ip = VTOI(vp);
203	fs = VFSTOULFS(vp->v_mount)->um_lfs;
204	relock = `0`;
205
206	top:
207	KASSERT(mutex_owned(vp->v_interlock) == false);
208	KASSERT(mutex_owned(&lfs_lock) == false);
209	KASSERT(mutex_owned(&bufcache_lock) == false);
210	ASSERT_NO_SEGLOCK(fs);
211	if (ip->i_flag & IN_CLEANING) {
212	ivndebug(vp,"vflush/in_cleaning");
213	mutex_enter(&lfs_lock);
214	LFS_CLR_UINO(ip, IN_CLEANING);
215	LFS_SET_UINO(ip, IN_MODIFIED);
216	mutex_exit(&lfs_lock);
217
218	/*
219	* Toss any cleaning buffers that have real counterparts
220	* to avoid losing new data.
221	*/
222	mutex_enter(vp->v_interlock);
223	for (bp = LIST_FIRST(&vp->v_dirtyblkhd); bp; bp = nbp) {
224	nbp = LIST_NEXT(bp, b_vnbufs);
225	if (!LFS_IS_MALLOC_BUF(bp))
226	continue;
227	/*
228	* Look for pages matching the range covered
229	* by cleaning blocks. It's okay if more dirty
230	* pages appear, so long as none disappear out
231	* from under us.
232	*/
233	if (bp->b_lblkno > `0` && vp->v_type == VREG &&
234	vp != fs->lfs_ivnode) {
235	struct vm_page *pg;
236	voff_t off;
237
238	for (off = lfs_lblktosize(fs, bp->b_lblkno);
239	off < lfs_lblktosize(fs, bp->b_lblkno + `1`);
240	off += PAGE_SIZE) {
241	pg = uvm_pagelookup(&vp->v_uobj, off);
242	if (pg == NULL)
243	continue;
244	if ((pg->flags & PG_CLEAN) == `0` \|\|
245	pmap_is_modified(pg)) {
246	lfs_sb_addavail(fs,
247	lfs_btofsb(fs,
248	bp->b_bcount));
249	wakeup(&fs->lfs_availsleep);
250	mutex_exit(vp->v_interlock);
251	lfs_freebuf(fs, bp);
252	mutex_enter(vp->v_interlock);
253	bp = NULL;
254	break;
255	}
256	}
257	}
258	for (tbp = LIST_FIRST(&vp->v_dirtyblkhd); tbp;
259	tbp = tnbp)
260	{
261	tnbp = LIST_NEXT(tbp, b_vnbufs);
262	if (tbp->b_vp == bp->b_vp
263	&& tbp->b_lblkno == bp->b_lblkno
264	&& tbp != bp)
265	{
266	lfs_sb_addavail(fs, lfs_btofsb(fs,
267	bp->b_bcount));
268	wakeup(&fs->lfs_availsleep);
269	mutex_exit(vp->v_interlock);
270	lfs_freebuf(fs, bp);
271	mutex_enter(vp->v_interlock);
272	bp = NULL;
273	break;
274	}
275	}
276	}
277	} else {
278	mutex_enter(vp->v_interlock);
279	}
280
281	/ If the node is being written, wait until that is done /
282	while (WRITEINPROG(vp)) {
283	ivndebug(vp,"vflush/writeinprog");
284	cv_wait(&vp->v_cv, vp->v_interlock);
285	}
286	error = vdead_check(vp, VDEAD_NOWAIT);
287	mutex_exit(vp->v_interlock);
288
289	/ Protect against deadlock in vinvalbuf() /
290	lfs_seglock(fs, SEGM_SYNC \| ((error != `0`) ? SEGM_RECLAIM : `0`));
291	if (error != `0`) {
292	fs->lfs_reclino = ip->i_number;
293	}
294
295	/ If we're supposed to flush a freed inode, just toss it /
296	if (ip->i_lfs_iflags & LFSI_DELETED) {
297	DLOG((DLOG_VNODE, "lfs_vflush: ino %d freed, not flushing\n",
298	ip->i_number));
299	/ Drain v_numoutput /
300	mutex_enter(vp->v_interlock);
301	while (vp->v_numoutput > `0`) {
302	cv_wait(&vp->v_cv, vp->v_interlock);
303	}
304	KASSERT(vp->v_numoutput == `0`);
305	mutex_exit(vp->v_interlock);
306
307	mutex_enter(&bufcache_lock);
308	for (bp = LIST_FIRST(&vp->v_dirtyblkhd); bp; bp = nbp) {
309	nbp = LIST_NEXT(bp, b_vnbufs);
310
311	KASSERT((bp->b_flags & B_GATHERED) == `0`);
312	if (bp->b_oflags & BO_DELWRI) { / XXX always true? /
313	lfs_sb_addavail(fs, lfs_btofsb(fs, bp->b_bcount));
314	wakeup(&fs->lfs_availsleep);
315	}
316	/ Copied from lfs_writeseg /
317	if (bp->b_iodone != NULL) {
318	mutex_exit(&bufcache_lock);
319	biodone(bp);
320	mutex_enter(&bufcache_lock);
321	} else {
322	bremfree(bp);
323	LFS_UNLOCK_BUF(bp);
324	mutex_enter(vp->v_interlock);
325	bp->b_flags &= ~(B_READ \| B_GATHERED);
326	bp->b_oflags = (bp->b_oflags & ~BO_DELWRI) \| BO_DONE;
327	bp->b_error = `0`;
328	reassignbuf(bp, vp);
329	mutex_exit(vp->v_interlock);
330	brelse(bp, `0`);
331	}
332	}
333	mutex_exit(&bufcache_lock);
334	LFS_CLR_UINO(ip, IN_CLEANING);
335	LFS_CLR_UINO(ip, IN_MODIFIED \| IN_ACCESSED);
336	ip->i_flag &= ~IN_ALLMOD;
337	DLOG((DLOG_VNODE, "lfs_vflush: done not flushing ino %d\n",
338	ip->i_number));
339	lfs_segunlock(fs);
340
341	KASSERT(LIST_FIRST(&vp->v_dirtyblkhd) == NULL);
342
343	return `0`;
344	}
345
346	fs->lfs_flushvp = vp;
347	if (LFS_SHOULD_CHECKPOINT(fs, fs->lfs_sp->seg_flags)) {
348	error = lfs_segwrite(vp->v_mount, SEGM_CKP \| SEGM_SYNC);
349	fs->lfs_flushvp = NULL;
350	KASSERT(fs->lfs_flushvp_fakevref == `0`);
351	lfs_segunlock(fs);
352
353	/ Make sure that any pending buffers get written /
354	mutex_enter(vp->v_interlock);
355	while (vp->v_numoutput > `0`) {
356	cv_wait(&vp->v_cv, vp->v_interlock);
357	}
358	KASSERT(LIST_FIRST(&vp->v_dirtyblkhd) == NULL);
359	KASSERT(vp->v_numoutput == `0`);
360	mutex_exit(vp->v_interlock);
361
362	return error;
363	}
364	sp = fs->lfs_sp;
365
366	flushed = `0`;
367	if (VPISEMPTY(vp)) {
368	lfs_writevnodes(fs, vp->v_mount, sp, VN_EMPTY);
369	++flushed;
370	} else if ((ip->i_flag & IN_CLEANING) &&
371	(fs->lfs_sp->seg_flags & SEGM_CLEAN)) {
372	ivndebug(vp,"vflush/clean");
373	lfs_writevnodes(fs, vp->v_mount, sp, VN_CLEAN);
374	++flushed;
375	} else if (lfs_dostats) {
376	if (!VPISEMPTY(vp) \|\| (VTOI(vp)->i_flag & IN_ALLMOD))
377	++lfs_stats.vflush_invoked;
378	ivndebug(vp,"vflush");
379	}
380
381	#ifdef DIAGNOSTIC
382	if (vp->v_uflag & VU_DIROP) {
383	DLOG((DLOG_VNODE, "lfs_vflush: flushing VU_DIROP\n"));
384	/ panic("lfs_vflush: VU_DIROP being flushed...this can\'t happen"); /
385	}
386	#endif
387
388	do {
389	#ifdef DEBUG
390	int loopcount = `0`;
391	#endif
392	do {
393	if (LIST_FIRST(&vp->v_dirtyblkhd) != NULL) {
394	relock = lfs_writefile(fs, sp, vp);
395	if (relock && vp != fs->lfs_ivnode) {
396	/*
397	* Might have to wait for the
398	* cleaner to run; but we're
399	* still not done with this vnode.
400	* XXX we can do better than this.
401	*/
402	KDASSERT(ip->i_number != LFS_IFILE_INUM);
403	lfs_writeinode(fs, sp, ip);
404	mutex_enter(&lfs_lock);
405	LFS_SET_UINO(ip, IN_MODIFIED);
406	mutex_exit(&lfs_lock);
407	lfs_writeseg(fs, sp);
408	lfs_segunlock(fs);
409	lfs_segunlock_relock(fs);
410	goto top;
411	}
412	}
413	/*
414	* If we begin a new segment in the middle of writing
415	* the Ifile, it creates an inconsistent checkpoint,
416	* since the Ifile information for the new segment
417	* is not up-to-date. Take care of this here by
418	* sending the Ifile through again in case there
419	* are newly dirtied blocks. But wait, there's more!
420	* This second Ifile write could also cross a segment
421	* boundary, if the first one was large. The second
422	* one is guaranteed to be no more than 8 blocks,
423	* though (two segment blocks and supporting indirects)
424	* so the third write will not cross the boundary.
425	*/
426	if (vp == fs->lfs_ivnode) {
427	lfs_writefile(fs, sp, vp);
428	lfs_writefile(fs, sp, vp);
429	}
430	#ifdef DEBUG
431	if (++loopcount > `2`)
432	log(LOG_NOTICE, "lfs_vflush: looping count=%d\n", loopcount);
433	#endif
434	} while (lfs_writeinode(fs, sp, ip));
435	} while (lfs_writeseg(fs, sp) && ip->i_number == LFS_IFILE_INUM);
436
437	if (lfs_dostats) {
438	++lfs_stats.nwrites;
439	if (sp->seg_flags & SEGM_SYNC)
440	++lfs_stats.nsync_writes;
441	if (sp->seg_flags & SEGM_CKP)
442	++lfs_stats.ncheckpoints;
443	}
444	/*
445	* If we were called from somewhere that has already held the seglock
446	* (e.g., lfs_markv()), the lfs_segunlock will not wait for
447	* the write to complete because we are still locked.
448	* Since lfs_vflush() must return the vnode with no dirty buffers,
449	* we must explicitly wait, if that is the case.
450	*
451	* We compare the iocount against 1, not 0, because it is
452	* artificially incremented by lfs_seglock().
453	*/
454	mutex_enter(&lfs_lock);
455	if (fs->lfs_seglock > `1`) {
456	while (fs->lfs_iocount > `1`)
457	(void)mtsleep(&fs->lfs_iocount, PRIBIO + `1`,
458	"lfs_vflush", `0`, &lfs_lock);
459	}
460	mutex_exit(&lfs_lock);
461
462	lfs_segunlock(fs);
463
464	/ Wait for these buffers to be recovered by aiodoned /
465	mutex_enter(vp->v_interlock);
466	while (vp->v_numoutput > `0`) {
467	cv_wait(&vp->v_cv, vp->v_interlock);
468	}
469	KASSERT(LIST_FIRST(&vp->v_dirtyblkhd) == NULL);
470	KASSERT(vp->v_numoutput == `0`);
471	mutex_exit(vp->v_interlock);
472
473	fs->lfs_flushvp = NULL;
474	KASSERT(fs->lfs_flushvp_fakevref == `0`);
475
476	return (`0`);
477	}
478
479	struct lfs_writevnodes_ctx {
480	int op;
481	struct lfs *fs;
482	};
483	static bool
484	lfs_writevnodes_selector(void cl, struct* vnode *vp)
485	{
486	struct lfs_writevnodes_ctx *c = cl;
487	struct inode *ip = VTOI(vp);
488	int op = c->op;
489
490	if (ip == NULL \|\| vp->v_type == VNON)
491	return false;
492	if ((op == VN_DIROP && !(vp->v_uflag & VU_DIROP)) \|\|
493	(op != VN_DIROP && op != VN_CLEAN && (vp->v_uflag & VU_DIROP))) {
494	vndebug(vp, "dirop");
495	return false;
496	}
497	if (op == VN_EMPTY && !VPISEMPTY(vp)) {
498	vndebug(vp,"empty");
499	return false;;
500	}
501	if (op == VN_CLEAN && ip->i_number != LFS_IFILE_INUM &&
502	vp != c->fs->lfs_flushvp && !(ip->i_flag & IN_CLEANING)) {
503	vndebug(vp,"cleaning");
504	return false;
505	}
506	mutex_enter(&lfs_lock);
507	if (vp == c->fs->lfs_unlockvp) {
508	mutex_exit(&lfs_lock);
509	return false;
510	}
511	mutex_exit(&lfs_lock);
512
513	return true;
514	}
515
516	int
517	lfs_writevnodes(struct lfs fs, struct* mount mp, struct* segment sp, int* op)
518	{
519	struct inode *ip;
520	struct vnode *vp;
521	struct vnode_iterator *marker;
522	struct lfs_writevnodes_ctx ctx;
523	int inodes_written = `0`;
524	int error = `0`;
525
526	/*
527	* XXX This was TAILQ_FOREACH_REVERSE on &mp->mnt_vnodelist.
528	* XXX The rationale is unclear, the initial commit had no information.
529	* XXX If the order really matters we have to sort the vnodes first.
530	*/
531
532	ASSERT_SEGLOCK(fs);
533	vfs_vnode_iterator_init(mp, &marker);
534	ctx.op = op;
535	ctx.fs = fs;
536	while ((vp = vfs_vnode_iterator_next(marker,
537	lfs_writevnodes_selector, &ctx)) != NULL) {
538	ip = VTOI(vp);
539
540	/*
541	* Write the inode/file if dirty and it's not the IFILE.
542	*/
543	if (((ip->i_flag & IN_ALLMOD) \|\| !VPISEMPTY(vp)) &&
544	ip->i_number != LFS_IFILE_INUM) {
545	error = lfs_writefile(fs, sp, vp);
546	if (error) {
547	vrele(vp);
548	if (error == EAGAIN) {
549	/*
550	* This error from lfs_putpages
551	* indicates we need to drop
552	* the segment lock and start
553	* over after the cleaner has
554	* had a chance to run.
555	*/
556	lfs_writeinode(fs, sp, ip);
557	lfs_writeseg(fs, sp);
558	if (!VPISEMPTY(vp) &&
559	!WRITEINPROG(vp) &&
560	!(ip->i_flag & IN_ALLMOD)) {
561	mutex_enter(&lfs_lock);
562	LFS_SET_UINO(ip, IN_MODIFIED);
563	mutex_exit(&lfs_lock);
564	}
565	break;
566	}
567	error = `0`; / XXX not quite right /
568	continue;
569	}
570
571	if (!VPISEMPTY(vp)) {
572	if (WRITEINPROG(vp)) {
573	ivndebug(vp,"writevnodes/write2");
574	} else if (!(ip->i_flag & IN_ALLMOD)) {
575	mutex_enter(&lfs_lock);
576	LFS_SET_UINO(ip, IN_MODIFIED);
577	mutex_exit(&lfs_lock);
578	}
579	}
580	(void) lfs_writeinode(fs, sp, ip);
581	inodes_written++;
582	}
583	vrele(vp);
584	}
585	vfs_vnode_iterator_destroy(marker);
586	return error;
587	}
588
589	/*
590	* Do a checkpoint.
591	*/
592	int
593	lfs_segwrite(struct mount mp, int* flags)
594	{
595	struct buf *bp;
596	struct inode *ip;
597	struct lfs *fs;
598	struct segment *sp;
599	struct vnode *vp;
600	SEGUSE *segusep;
601	int do_ckp, did_ckp, error;
602	unsigned n, segleft, maxseg, sn, i, curseg;
603	int writer_set = `0`;
604	int dirty;
605	int redo;
606	SEGSUM *ssp;
607	int um_error;
608
609	fs = VFSTOULFS(mp)->um_lfs;
610	ASSERT_MAYBE_SEGLOCK(fs);
611
612	if (fs->lfs_ronly)
613	return EROFS;
614
615	lfs_imtime(fs);
616
617	/*
618	* Allocate a segment structure and enough space to hold pointers to
619	* the maximum possible number of buffers which can be described in a
620	* single summary block.
621	*/
622	do_ckp = LFS_SHOULD_CHECKPOINT(fs, flags);
623
624	/ We can't do a partial write and checkpoint at the same time. /
625	if (do_ckp)
626	flags &= ~SEGM_SINGLE;
627
628	lfs_seglock(fs, flags \| (do_ckp ? SEGM_CKP : `0`));
629	sp = fs->lfs_sp;
630	if (sp->seg_flags & (SEGM_CLEAN \| SEGM_CKP))
631	do_ckp = `1`;
632
633	/*
634	* If lfs_flushvp is non-NULL, we are called from lfs_vflush,
635	* in which case we have to flush all buffers off of this vnode.
636	* We don't care about other nodes, but write any non-dirop nodes
637	* anyway in anticipation of another getnewvnode().
638	*
639	* If we're cleaning we only write cleaning and ifile blocks, and
640	* no dirops, since otherwise we'd risk corruption in a crash.
641	*/
642	if (sp->seg_flags & SEGM_CLEAN)
643	lfs_writevnodes(fs, mp, sp, VN_CLEAN);
644	else if (!(sp->seg_flags & SEGM_FORCE_CKP)) {
645	do {
646	um_error = lfs_writevnodes(fs, mp, sp, VN_REG);
647	if ((sp->seg_flags & SEGM_SINGLE) &&
648	lfs_sb_getcurseg(fs) != fs->lfs_startseg) {
649	DLOG((DLOG_SEG, "lfs_segwrite: breaking out of segment write at daddr 0x%jx\n", (uintmax_t)lfs_sb_getoffset(fs)));
650	break;
651	}
652
653	if (do_ckp \|\| fs->lfs_dirops == `0`) {
654	if (!writer_set) {
655	lfs_writer_enter(fs, "lfs writer");
656	writer_set = `1`;
657	}
658	error = lfs_writevnodes(fs, mp, sp, VN_DIROP);
659	if (um_error == `0`)
660	um_error = error;
661	/ In case writevnodes errored out /
662	lfs_flush_dirops(fs);
663	ssp = (SEGSUM *)(sp->segsum);
664	lfs_ss_setflags(fs, ssp,
665	lfs_ss_getflags(fs, ssp) & ~(SS_CONT));
666	lfs_finalize_fs_seguse(fs);
667	}
668	if (do_ckp && um_error) {
669	lfs_segunlock_relock(fs);
670	sp = fs->lfs_sp;
671	}
672	} while (do_ckp && um_error != `0`);
673	}
674
675	/*
676	* If we are doing a checkpoint, mark everything since the
677	* last checkpoint as no longer ACTIVE.
678	*/
679	if (do_ckp \|\| fs->lfs_doifile) {
680	segleft = lfs_sb_getnseg(fs);
681	curseg = `0`;
682	for (n = `0`; n < lfs_sb_getsegtabsz(fs); n++) {
683	int bread_error;
684
685	dirty = `0`;
686	bread_error = bread(fs->lfs_ivnode,
687	lfs_sb_getcleansz(fs) + n,
688	lfs_sb_getbsize(fs), B_MODIFY, &bp);
689	if (bread_error)
690	panic("lfs_segwrite: ifile read: "
691	"seguse %u: error %d\n",
692	n, bread_error);
693	segusep = (SEGUSE *)bp->b_data;
694	maxseg = min(segleft, lfs_sb_getsepb(fs));
695	for (i = `0`; i < maxseg; i++) {
696	sn = curseg + i;
697	if (sn != lfs_dtosn(fs, lfs_sb_getcurseg(fs)) &&
698	segusep->su_flags & SEGUSE_ACTIVE) {
699	segusep->su_flags &= ~SEGUSE_ACTIVE;
700	--fs->lfs_nactive;
701	++dirty;
702	}
703	fs->lfs_suflags[fs->lfs_activesb][sn] =
704	segusep->su_flags;
705	if (lfs_sb_getversion(fs) > `1`)
706	++segusep;
707	else
708	segusep = (SEGUSE *)
709	((SEGUSE_V1 *)segusep + `1`);
710	}
711
712	if (dirty)
713	error = LFS_BWRITE_LOG(bp); / Ifile /
714	else
715	brelse(bp, `0`);
716	segleft -= lfs_sb_getsepb(fs);
717	curseg += lfs_sb_getsepb(fs);
718	}
719	}
720
721	KASSERT(LFS_SEGLOCK_HELD(fs));
722
723	did_ckp = `0`;
724	if (do_ckp \|\| fs->lfs_doifile) {
725	vp = fs->lfs_ivnode;
726	#ifdef DEBUG
727	int loopcount = `0`;
728	#endif
729	do {
730	#ifdef DEBUG
731	LFS_ENTER_LOG("pretend", __FILE__, __LINE__, `0`, `0`, curproc->p_pid);
732	#endif
733	mutex_enter(&lfs_lock);
734	fs->lfs_flags &= ~LFS_IFDIRTY;
735	mutex_exit(&lfs_lock);
736
737	ip = VTOI(vp);
738
739	if (LIST_FIRST(&vp->v_dirtyblkhd) != NULL) {
740	/*
741	* Ifile has no pages, so we don't need
742	* to check error return here.
743	*/
744	lfs_writefile(fs, sp, vp);
745	/*
746	* Ensure the Ifile takes the current segment
747	* into account. See comment in lfs_vflush.
748	*/
749	lfs_writefile(fs, sp, vp);
750	lfs_writefile(fs, sp, vp);
751	}
752
753	if (ip->i_flag & IN_ALLMOD)
754	++did_ckp;
755	#if 0
756	redo = (do_ckp ? lfs_writeinode(fs, sp, ip) : `0`);
757	#else
758	redo = lfs_writeinode(fs, sp, ip);
759	#endif
760	redo += lfs_writeseg(fs, sp);
761	mutex_enter(&lfs_lock);
762	redo += (fs->lfs_flags & LFS_IFDIRTY);
763	mutex_exit(&lfs_lock);
764	#ifdef DEBUG
765	if (++loopcount > `2`)
766	log(LOG_NOTICE, "lfs_segwrite: looping count=%d\n",
767	loopcount);
768	#endif
769	} while (redo && do_ckp);
770
771	/*
772	* Unless we are unmounting, the Ifile may continue to have
773	* dirty blocks even after a checkpoint, due to changes to
774	* inodes' atime. If we're checkpointing, it's "impossible"
775	* for other parts of the Ifile to be dirty after the loop
776	* above, since we hold the segment lock.
777	*/
778	mutex_enter(vp->v_interlock);
779	if (LIST_EMPTY(&vp->v_dirtyblkhd)) {
780	LFS_CLR_UINO(ip, IN_ALLMOD);
781	}
782	#ifdef DIAGNOSTIC
783	else if (do_ckp) {
784	int do_panic = `0`;
785	LIST_FOREACH(bp, &vp->v_dirtyblkhd, b_vnbufs) {
786	if (bp->b_lblkno < lfs_sb_getcleansz(fs) +
787	lfs_sb_getsegtabsz(fs) &&
788	!(bp->b_flags & B_GATHERED)) {
789	printf("ifile lbn %ld still dirty (flags %lx)\n",
790	(long)bp->b_lblkno,
791	(long)bp->b_flags);
792	++do_panic;
793	}
794	}
795	if (do_panic)
796	panic("dirty blocks");
797	}
798	#endif
799	mutex_exit(vp->v_interlock);
800	} else {
801	(void) lfs_writeseg(fs, sp);
802	}
803
804	/ Note Ifile no longer needs to be written /
805	fs->lfs_doifile = `0`;
806	if (writer_set)
807	lfs_writer_leave(fs);
808
809	/*
810	* If we didn't write the Ifile, we didn't really do anything.
811	* That means that (1) there is a checkpoint on disk and (2)
812	* nothing has changed since it was written.
813	*
814	* Take the flags off of the segment so that lfs_segunlock
815	* doesn't have to write the superblock either.
816	*/
817	if (do_ckp && !did_ckp) {
818	sp->seg_flags &= ~SEGM_CKP;
819	}
820
821	if (lfs_dostats) {
822	++lfs_stats.nwrites;
823	if (sp->seg_flags & SEGM_SYNC)
824	++lfs_stats.nsync_writes;
825	if (sp->seg_flags & SEGM_CKP)
826	++lfs_stats.ncheckpoints;
827	}
828	lfs_segunlock(fs);
829	return (`0`);
830	}
831
832	/*
833	* Write the dirty blocks associated with a vnode.
834	*/
835	int
836	lfs_writefile(struct lfs fs, struct* segment sp, struct* vnode *vp)
837	{
838	struct inode *ip;
839	int i, frag;
840	SEGSUM *ssp;
841	int error;
842
843	ASSERT_SEGLOCK(fs);
844	error = `0`;
845	ip = VTOI(vp);
846
847	lfs_acquire_finfo(fs, ip->i_number, ip->i_gen);
848
849	if (vp->v_uflag & VU_DIROP) {
850	ssp = (SEGSUM *)sp->segsum;
851	lfs_ss_setflags(fs, ssp,
852	lfs_ss_getflags(fs, ssp) \| (SS_DIROP\|SS_CONT));
853	}
854
855	if (sp->seg_flags & SEGM_CLEAN) {
856	lfs_gather(fs, sp, vp, lfs_match_fake);
857	/*
858	* For a file being flushed, we need to write all blocks.
859	* This means writing the cleaning blocks first, and then
860	* immediately following with any non-cleaning blocks.
861	* The same is true of the Ifile since checkpoints assume
862	* that all valid Ifile blocks are written.
863	*/
864	if (IS_FLUSHING(fs, vp) \|\| vp == fs->lfs_ivnode) {
865	lfs_gather(fs, sp, vp, lfs_match_data);
866	/*
867	* Don't call VOP_PUTPAGES: if we're flushing,
868	* we've already done it, and the Ifile doesn't
869	* use the page cache.
870	*/
871	}
872	} else {
873	lfs_gather(fs, sp, vp, lfs_match_data);
874	/*
875	* If we're flushing, we've already called VOP_PUTPAGES
876	* so don't do it again. Otherwise, we want to write
877	* everything we've got.
878	*/
879	if (!IS_FLUSHING(fs, vp)) {
880	mutex_enter(vp->v_interlock);
881	error = VOP_PUTPAGES(vp, `0`, `0`,
882	PGO_CLEANIT \| PGO_ALLPAGES \| PGO_LOCKED);
883	}
884	}
885
886	/*
887	* It may not be necessary to write the meta-data blocks at this point,
888	* as the roll-forward recovery code should be able to reconstruct the
889	* list.
890	*
891	* We have to write them anyway, though, under two conditions: (1) the
892	* vnode is being flushed (for reuse by vinvalbuf); or (2) we are
893	* checkpointing.
894	*
895	* BUT if we are cleaning, we might have indirect blocks that refer to
896	* new blocks not being written yet, in addition to fragments being
897	* moved out of a cleaned segment. If that is the case, don't
898	* write the indirect blocks, or the finfo will have a small block
899	* in the middle of it!
900	* XXX in this case isn't the inode size wrong too?
901	*/
902	frag = `0`;
903	if (sp->seg_flags & SEGM_CLEAN) {
904	for (i = `0`; i < ULFS_NDADDR; i++)
905	if (ip->i_lfs_fragsize[i] > `0` &&
906	ip->i_lfs_fragsize[i] < lfs_sb_getbsize(fs))
907	++frag;
908	}
909	#ifdef DIAGNOSTIC
910	if (frag > `1`)
911	panic("lfs_writefile: more than one fragment!");
912	#endif
913	if (IS_FLUSHING(fs, vp) \|\|
914	(frag == `0` && (lfs_writeindir \|\| (sp->seg_flags & SEGM_CKP)))) {
915	lfs_gather(fs, sp, vp, lfs_match_indir);
916	lfs_gather(fs, sp, vp, lfs_match_dindir);
917	lfs_gather(fs, sp, vp, lfs_match_tindir);
918	}
919	lfs_release_finfo(fs);
920
921	return error;
922	}
923
924	/*
925	* Update segment accounting to reflect this inode's change of address.
926	*/
927	static int
928	lfs_update_iaddr(struct lfs fs, struct* segment sp, struct* inode *ip, daddr_t ndaddr)
929	{
930	struct buf *bp;
931	daddr_t daddr;
932	IFILE *ifp;
933	SEGUSE *sup;
934	ino_t ino;
935	int redo_ifile;
936	u_int32_t sn;
937
938	redo_ifile = `0`;
939
940	/*
941	* If updating the ifile, update the super-block. Update the disk
942	* address and access times for this inode in the ifile.
943	*/
944	ino = ip->i_number;
945	if (ino == LFS_IFILE_INUM) {
946	daddr = lfs_sb_getidaddr(fs);
947	lfs_sb_setidaddr(fs, LFS_DBTOFSB(fs, ndaddr));
948	} else {
949	LFS_IENTRY(ifp, fs, ino, bp);
950	daddr = lfs_if_getdaddr(fs, ifp);
951	lfs_if_setdaddr(fs, ifp, LFS_DBTOFSB(fs, ndaddr));
952	(void)LFS_BWRITE_LOG(bp); / Ifile /
953	}
954
955	/*
956	* If this is the Ifile and lfs_offset is set to the first block
957	* in the segment, dirty the new segment's accounting block
958	* (XXX should already be dirty?) and tell the caller to do it again.
959	*/
960	if (ip->i_number == LFS_IFILE_INUM) {
961	sn = lfs_dtosn(fs, lfs_sb_getoffset(fs));
962	if (lfs_sntod(fs, sn) + lfs_btofsb(fs, lfs_sb_getsumsize(fs)) ==
963	lfs_sb_getoffset(fs)) {
964	LFS_SEGENTRY(sup, fs, sn, bp);
965	KASSERT(bp->b_oflags & BO_DELWRI);
966	LFS_WRITESEGENTRY(sup, fs, sn, bp);
967	/ fs->lfs_flags \|= LFS_IFDIRTY; /
968	redo_ifile \|= `1`;
969	}
970	}
971
972	/*
973	* The inode's last address should not be in the current partial
974	* segment, except under exceptional circumstances (lfs_writevnodes
975	* had to start over, and in the meantime more blocks were written
976	* to a vnode). Both inodes will be accounted to this segment
977	* in lfs_writeseg so we need to subtract the earlier version
978	* here anyway. The segment count can temporarily dip below
979	* zero here; keep track of how many duplicates we have in
980	* "dupino" so we don't panic below.
981	*/
982	if (daddr >= lfs_sb_getlastpseg(fs) && daddr <= lfs_sb_getoffset(fs)) {
983	++sp->ndupino;
984	DLOG((DLOG_SEG, "lfs_writeinode: last inode addr in current pseg "
985	"(ino %d daddr 0x%llx) ndupino=%d\n", ino,
986	(long long)daddr, sp->ndupino));
987	}
988	/*
989	* Account the inode: it no longer belongs to its former segment,
990	* though it will not belong to the new segment until that segment
991	* is actually written.
992	*/
993	if (daddr != LFS_UNUSED_DADDR) {
994	u_int32_t oldsn = lfs_dtosn(fs, daddr);
995	#ifdef DIAGNOSTIC
996	int ndupino = (sp->seg_number == oldsn) ? sp->ndupino : `0`;
997	#endif
998	LFS_SEGENTRY(sup, fs, oldsn, bp);
999	#ifdef DIAGNOSTIC
1000	if (sup->su_nbytes + DINOSIZE(fs) * ndupino < DINOSIZE(fs)) {
1001	printf("lfs_writeinode: negative bytes "
1002	"(segment %" PRIu32 " short by %d, "
1003	"oldsn=%" PRIu32 ", cursn=%" PRIu32
1004	", daddr=%" PRId64 ", su_nbytes=%u, "
1005	"ndupino=%d)\n",
1006	lfs_dtosn(fs, daddr),
1007	(int)DINOSIZE(fs) *
1008	(`1` - sp->ndupino) - sup->su_nbytes,
1009	oldsn, sp->seg_number, daddr,
1010	(unsigned int)sup->su_nbytes,
1011	sp->ndupino);
1012	panic("lfs_writeinode: negative bytes");
1013	sup->su_nbytes = DINOSIZE(fs);
1014	}
1015	#endif
1016	DLOG((DLOG_SU, "seg %d -= %d for ino %d inode\n",
1017	lfs_dtosn(fs, daddr), DINOSIZE(fs), ino));
1018	sup->su_nbytes -= DINOSIZE(fs);
1019	redo_ifile \|=
1020	(ino == LFS_IFILE_INUM && !(bp->b_flags & B_GATHERED));
1021	if (redo_ifile) {
1022	mutex_enter(&lfs_lock);
1023	fs->lfs_flags \|= LFS_IFDIRTY;
1024	mutex_exit(&lfs_lock);
1025	/ Don't double-account /
1026	lfs_sb_setidaddr(fs, `0x0`);
1027	}
1028	LFS_WRITESEGENTRY(sup, fs, oldsn, bp); / Ifile /
1029	}
1030
1031	return redo_ifile;
1032	}
1033
1034	int
1035	lfs_writeinode(struct lfs fs, struct* segment sp, struct* inode *ip)
1036	{
1037	struct buf *bp;
1038	union lfs_dinode *cdp;
1039	struct vnode *vp = ITOV(ip);
1040	daddr_t daddr;
1041	IINFO *iip;
1042	int i;
1043	int redo_ifile = `0`;
1044	int gotblk = `0`;
1045	int count;
1046	SEGSUM *ssp;
1047
1048	ASSERT_SEGLOCK(fs);
1049	if (!(ip->i_flag & IN_ALLMOD) && !(vp->v_uflag & VU_DIROP))
1050	return (`0`);
1051
1052	/ Can't write ifile when writer is not set /
1053	KASSERT(ip->i_number != LFS_IFILE_INUM \|\| fs->lfs_writer > `0` \|\|
1054	(sp->seg_flags & SEGM_CLEAN));
1055
1056	/*
1057	* If this is the Ifile, see if writing it here will generate a
1058	* temporary misaccounting. If it will, do the accounting and write
1059	* the blocks, postponing the inode write until the accounting is
1060	* solid.
1061	*/
1062	count = `0`;
1063	while (vp == fs->lfs_ivnode) {
1064	int redo = `0`;
1065
1066	if (sp->idp == NULL && sp->ibp == NULL &&
1067	(sp->seg_bytes_left < lfs_sb_getibsize(fs) \|\|
1068	sp->sum_bytes_left < sizeof(int32_t))) {
1069	(void) lfs_writeseg(fs, sp);
1070	continue;
1071	}
1072
1073	/ Look for dirty Ifile blocks /
1074	LIST_FOREACH(bp, &fs->lfs_ivnode->v_dirtyblkhd, b_vnbufs) {
1075	if (!(bp->b_flags & B_GATHERED)) {
1076	redo = `1`;
1077	break;
1078	}
1079	}
1080
1081	if (redo == `0`)
1082	redo = lfs_update_iaddr(fs, sp, ip, `0x0`);
1083	if (redo == `0`)
1084	break;
1085
1086	if (sp->idp) {
1087	lfs_dino_setinumber(fs, sp->idp, `0`);
1088	sp->idp = NULL;
1089	}
1090	++count;
1091	if (count > `2`)
1092	log(LOG_NOTICE, "lfs_writeinode: looping count=%d\n", count);
1093	lfs_writefile(fs, sp, fs->lfs_ivnode);
1094	}
1095
1096	/ Allocate a new inode block if necessary. /
1097	if ((ip->i_number != LFS_IFILE_INUM \|\| sp->idp == NULL) &&
1098	sp->ibp == NULL) {
1099	/ Allocate a new segment if necessary. /
1100	if (sp->seg_bytes_left < lfs_sb_getibsize(fs) \|\|
1101	sp->sum_bytes_left < sizeof(int32_t))
1102	(void) lfs_writeseg(fs, sp);
1103
1104	/ Get next inode block. /
1105	daddr = lfs_sb_getoffset(fs);
1106	lfs_sb_addoffset(fs, lfs_btofsb(fs, lfs_sb_getibsize(fs)));
1107	sp->ibp = *sp->cbpp++ =
1108	getblk(VTOI(fs->lfs_ivnode)->i_devvp,
1109	LFS_FSBTODB(fs, daddr), lfs_sb_getibsize(fs), `0`, `0`);
1110	gotblk++;
1111
1112	/ Zero out inode numbers /
1113	for (i = `0`; i < LFS_INOPB(fs); ++i) {
1114	union lfs_dinode *tmpdi;
1115
1116	tmpdi = (union lfs_dinode )((char* *)sp->ibp->b_data +
1117	DINOSIZE(fs) * i);
1118	lfs_dino_setinumber(fs, tmpdi, `0`);
1119	}
1120
1121	++sp->start_bpp;
1122	lfs_sb_subavail(fs, lfs_btofsb(fs, lfs_sb_getibsize(fs)));
1123	/ Set remaining space counters. /
1124	sp->seg_bytes_left -= lfs_sb_getibsize(fs);
1125	sp->sum_bytes_left -= sizeof(int32_t);
1126
1127	/ Store the address in the segment summary. /
1128	iip = NTH_IINFO(fs, sp->segsum, sp->ninodes / LFS_INOPB(fs));
1129	lfs_ii_setblock(fs, iip, daddr);
1130	}
1131
1132	/ Check VU_DIROP in case there is a new file with no data blocks /
1133	if (vp->v_uflag & VU_DIROP) {
1134	ssp = (SEGSUM *)sp->segsum;
1135	lfs_ss_setflags(fs, ssp,
1136	lfs_ss_getflags(fs, ssp) \| (SS_DIROP\|SS_CONT));
1137	}
1138
1139	/ Update the inode times and copy the inode onto the inode page. /
1140	/ XXX kludge --- don't redirty the ifile just to put times on it /
1141	if (ip->i_number != LFS_IFILE_INUM)
1142	LFS_ITIMES(ip, NULL, NULL, NULL);
1143
1144	/*
1145	* If this is the Ifile, and we've already written the Ifile in this
1146	* partial segment, just overwrite it (it's not on disk yet) and
1147	* continue.
1148	*
1149	* XXX we know that the bp that we get the second time around has
1150	* already been gathered.
1151	*/
1152	if (ip->i_number == LFS_IFILE_INUM && sp->idp) {
1153	lfs_copy_dinode(fs, sp->idp, ip->i_din);
1154	ip->i_lfs_osize = ip->i_size;
1155	return `0`;
1156	}
1157
1158	bp = sp->ibp;
1159	cdp = DINO_IN_BLOCK(fs, bp->b_data, sp->ninodes % LFS_INOPB(fs));
1160	lfs_copy_dinode(fs, cdp, ip->i_din);
1161
1162	/*
1163	* This inode is on its way to disk; clear its VU_DIROP status when
1164	* the write is complete.
1165	*/
1166	if (vp->v_uflag & VU_DIROP) {
1167	if (!(sp->seg_flags & SEGM_CLEAN))
1168	ip->i_flag \|= IN_CDIROP;
1169	else {
1170	DLOG((DLOG_DIROP, "lfs_writeinode: not clearing dirop for cleaned ino %d\n", (int)ip->i_number));
1171	}
1172	}
1173
1174	/*
1175	* If cleaning, link counts and directory file sizes cannot change,
1176	* since those would be directory operations---even if the file
1177	* we are writing is marked VU_DIROP we should write the old values.
1178	* If we're not cleaning, of course, update the values so we get
1179	* current values the next time we clean.
1180	*/
1181	if (sp->seg_flags & SEGM_CLEAN) {
1182	if (vp->v_uflag & VU_DIROP) {
1183	lfs_dino_setnlink(fs, cdp, ip->i_lfs_odnlink);
1184	/ if (vp->v_type == VDIR) /
1185	lfs_dino_setsize(fs, cdp, ip->i_lfs_osize);
1186	}
1187	} else {
1188	ip->i_lfs_odnlink = lfs_dino_getnlink(fs, cdp);
1189	ip->i_lfs_osize = ip->i_size;
1190	}
1191
1192
1193	/ We can finish the segment accounting for truncations now /
1194	lfs_finalize_ino_seguse(fs, ip);
1195
1196	/*
1197	* If we are cleaning, ensure that we don't write UNWRITTEN disk
1198	* addresses to disk; possibly change the on-disk record of
1199	* the inode size, either by reverting to the previous size
1200	* (in the case of cleaning) or by verifying the inode's block
1201	* holdings (in the case of files being allocated as they are being
1202	* written).
1203	* XXX By not writing UNWRITTEN blocks, we are making the lfs_avail
1204	* XXX count on disk wrong by the same amount. We should be
1205	* XXX able to "borrow" from lfs_avail and return it after the
1206	* XXX Ifile is written. See also in lfs_writeseg.
1207	*/
1208
1209	/ Check file size based on highest allocated block /
1210	if (((lfs_dino_getmode(fs, ip->i_din) & LFS_IFMT) == LFS_IFREG \|\|
1211	(lfs_dino_getmode(fs, ip->i_din) & LFS_IFMT) == LFS_IFDIR) &&
1212	ip->i_size > ((ip->i_lfs_hiblk + `1`) << lfs_sb_getbshift(fs))) {
1213	lfs_dino_setsize(fs, cdp, (ip->i_lfs_hiblk + `1`) << lfs_sb_getbshift(fs));
1214	DLOG((DLOG_SEG, "lfs_writeinode: ino %d size %" PRId64 " -> %"
1215	PRId64 "\n", (int)ip->i_number, ip->i_size, lfs_dino_getsize(fs, cdp)));
1216	}
1217	if (ip->i_lfs_effnblks != lfs_dino_getblocks(fs, ip->i_din)) {
1218	DLOG((DLOG_SEG, "lfs_writeinode: cleansing ino %d eff %jd != nblk %d)"
1219	" at %jx\n", ip->i_number, (intmax_t)ip->i_lfs_effnblks,
1220	lfs_dino_getblocks(fs, ip->i_din), (uintmax_t)lfs_sb_getoffset(fs)));
1221	for (i=`0`; i<ULFS_NDADDR; i++) {
1222	if (lfs_dino_getdb(fs, cdp, i) == UNWRITTEN) {
1223	DLOG((DLOG_SEG, "lfs_writeinode: wiping UNWRITTEN\n"));
1224	lfs_dino_setdb(fs, cdp, i, `0`);
1225	}
1226	}
1227	for (i=`0`; i<ULFS_NIADDR; i++) {
1228	if (lfs_dino_getib(fs, cdp, i) == UNWRITTEN) {
1229	DLOG((DLOG_SEG, "lfs_writeinode: wiping UNWRITTEN\n"));
1230	lfs_dino_setib(fs, cdp, i, `0`);
1231	}
1232	}
1233	}
1234
1235	#ifdef DIAGNOSTIC
1236	/*
1237	* Check dinode held blocks against dinode size.
1238	* This should be identical to the check in lfs_vget().
1239	*/
1240	for (i = (lfs_dino_getsize(fs, cdp) + lfs_sb_getbsize(fs) - `1`) >> lfs_sb_getbshift(fs);
1241	i < ULFS_NDADDR; i++) {
1242	KASSERT(i >= `0`);
1243	if ((lfs_dino_getmode(fs, cdp) & LFS_IFMT) == LFS_IFLNK)
1244	continue;
1245	if (((lfs_dino_getmode(fs, cdp) & LFS_IFMT) == LFS_IFBLK \|\|
1246	(lfs_dino_getmode(fs, cdp) & LFS_IFMT) == LFS_IFCHR) && i == `0`)
1247	continue;
1248	if (lfs_dino_getdb(fs, cdp, i) != `0`) {
1249	# ifdef DEBUG
1250	lfs_dump_dinode(fs, cdp);
1251	# endif
1252	panic("writing inconsistent inode");
1253	}
1254	}
1255	#endif /* DIAGNOSTIC */
1256
1257	if (ip->i_flag & IN_CLEANING)
1258	LFS_CLR_UINO(ip, IN_CLEANING);
1259	else {
1260	/ XXX IN_ALLMOD /
1261	LFS_CLR_UINO(ip, IN_ACCESSED \| IN_ACCESS \| IN_CHANGE \|
1262	IN_UPDATE \| IN_MODIFY);
1263	if (ip->i_lfs_effnblks == lfs_dino_getblocks(fs, ip->i_din))
1264	LFS_CLR_UINO(ip, IN_MODIFIED);
1265	else {
1266	DLOG((DLOG_VNODE, "lfs_writeinode: ino %d: real "
1267	"blks=%d, eff=%jd\n", ip->i_number,
1268	lfs_dino_getblocks(fs, ip->i_din), (intmax_t)ip->i_lfs_effnblks));
1269	}
1270	}
1271
1272	if (ip->i_number == LFS_IFILE_INUM) {
1273	/ We know sp->idp == NULL /
1274	sp->idp = DINO_IN_BLOCK(fs, bp, sp->ninodes % LFS_INOPB(fs));
1275
1276	/ Not dirty any more /
1277	mutex_enter(&lfs_lock);
1278	fs->lfs_flags &= ~LFS_IFDIRTY;
1279	mutex_exit(&lfs_lock);
1280	}
1281
1282	if (gotblk) {
1283	mutex_enter(&bufcache_lock);
1284	LFS_LOCK_BUF(bp);
1285	brelsel(bp, `0`);
1286	mutex_exit(&bufcache_lock);
1287	}
1288
1289	/ Increment inode count in segment summary block. /
1290
1291	ssp = (SEGSUM *)sp->segsum;
1292	lfs_ss_setninos(fs, ssp, lfs_ss_getninos(fs, ssp) + `1`);
1293
1294	/ If this page is full, set flag to allocate a new page. /
1295	if (++sp->ninodes % LFS_INOPB(fs) == `0`)
1296	sp->ibp = NULL;
1297
1298	redo_ifile = lfs_update_iaddr(fs, sp, ip, bp->b_blkno);
1299
1300	KASSERT(redo_ifile == `0`);
1301	return (redo_ifile);
1302	}
1303
1304	int
1305	lfs_gatherblock(struct segment sp, struct* buf bp, kmutex_t mptr)
1306	{
1307	struct lfs *fs;
1308	int vers;
1309	int j, blksinblk;
1310
1311	ASSERT_SEGLOCK(sp->fs);
1312	/*
1313	* If full, finish this segment. We may be doing I/O, so
1314	* release and reacquire the splbio().
1315	*/
1316	#ifdef DIAGNOSTIC
1317	if (sp->vp == NULL)
1318	panic ("lfs_gatherblock: Null vp in segment");
1319	#endif
1320	fs = sp->fs;
1321	blksinblk = howmany(bp->b_bcount, lfs_sb_getbsize(fs));
1322	if (sp->sum_bytes_left < sizeof(int32_t) * blksinblk \|\|
1323	sp->seg_bytes_left < bp->b_bcount) {
1324	if (mptr)
1325	mutex_exit(mptr);
1326	lfs_updatemeta(sp);
1327
1328	vers = lfs_fi_getversion(fs, sp->fip);
1329	(void) lfs_writeseg(fs, sp);
1330
1331	/ Add the current file to the segment summary. /
1332	lfs_acquire_finfo(fs, VTOI(sp->vp)->i_number, vers);
1333
1334	if (mptr)
1335	mutex_enter(mptr);
1336	return (`1`);
1337	}
1338
1339	if (bp->b_flags & B_GATHERED) {
1340	DLOG((DLOG_SEG, "lfs_gatherblock: already gathered! Ino %ju,"
1341	" lbn %" PRId64 "\n",
1342	(uintmax_t)lfs_fi_getino(fs, sp->fip), bp->b_lblkno));
1343	return (`0`);
1344	}
1345
1346	/ Insert into the buffer list, update the FINFO block. /
1347	bp->b_flags \|= B_GATHERED;
1348
1349	*sp->cbpp++ = bp;
1350	for (j = `0`; j < blksinblk; j++) {
1351	unsigned bn;
1352
1353	bn = lfs_fi_getnblocks(fs, sp->fip);
1354	lfs_fi_setnblocks(fs, sp->fip, bn+`1`);
1355	lfs_fi_setblock(fs, sp->fip, bn, bp->b_lblkno + j);
1356	/ This block's accounting moves from lfs_favail to lfs_avail /
1357	lfs_deregister_block(sp->vp, bp->b_lblkno + j);
1358	}
1359
1360	sp->sum_bytes_left -= sizeof(int32_t) * blksinblk;
1361	sp->seg_bytes_left -= bp->b_bcount;
1362	return (`0`);
1363	}
1364
1365	int
1366	lfs_gather(struct lfs fs, struct* segment sp, struct* vnode *vp,
1367	int (match)(struct* lfs , struct* buf *))
1368	{
1369	struct buf bp, nbp;
1370	int count = `0`;
1371
1372	ASSERT_SEGLOCK(fs);
1373	if (vp->v_type == VBLK)
1374	return `0`;
1375	KASSERT(sp->vp == NULL);
1376	sp->vp = vp;
1377	mutex_enter(&bufcache_lock);
1378
1379	#ifndef LFS_NO_BACKBUF_HACK
1380	/ This is a hack to see if ordering the blocks in LFS makes a difference. /
1381	# define BUF_OFFSET \
1382	(((char )&LIST_NEXT(bp, b_vnbufs)) - (char )bp)
1383	# define BACK_BUF(BP) \
1384	((struct buf )(((char )(BP)->b_vnbufs.le_prev) - BUF_OFFSET))
1385	# define BEG_OF_LIST \
1386	((struct buf )(((char )&LIST_FIRST(&vp->v_dirtyblkhd)) - BUF_OFFSET))
1387
1388	loop:
1389	/ Find last buffer. /
1390	for (bp = LIST_FIRST(&vp->v_dirtyblkhd);
1391	bp && LIST_NEXT(bp, b_vnbufs) != NULL;
1392	bp = LIST_NEXT(bp, b_vnbufs))
1393	/ nothing /;
1394	for (; bp && bp != BEG_OF_LIST; bp = nbp) {
1395	nbp = BACK_BUF(bp);
1396	#else /* LFS_NO_BACKBUF_HACK */
1397	loop:
1398	for (bp = LIST_FIRST(&vp->v_dirtyblkhd); bp; bp = nbp) {
1399	nbp = LIST_NEXT(bp, b_vnbufs);
1400	#endif /* LFS_NO_BACKBUF_HACK */
1401	if ((bp->b_cflags & BC_BUSY) != `0` \|\|
1402	(bp->b_flags & B_GATHERED) != `0` \|\| !match(fs, bp)) {
1403	#ifdef DEBUG
1404	if (vp == fs->lfs_ivnode &&
1405	(bp->b_cflags & BC_BUSY) != `0` &&
1406	(bp->b_flags & B_GATHERED) == `0`)
1407	log(LOG_NOTICE, "lfs_gather: ifile lbn %"
1408	PRId64 " busy (%x) at 0x%jx",
1409	bp->b_lblkno, bp->b_flags,
1410	(uintmax_t)lfs_sb_getoffset(fs));
1411	#endif
1412	continue;
1413	}
1414	#ifdef DIAGNOSTIC
1415	# ifdef LFS_USE_B_INVAL
1416	if ((bp->b_flags & BC_INVAL) != `0` && bp->b_iodone == NULL) {
1417	DLOG((DLOG_SEG, "lfs_gather: lbn %" PRId64
1418	" is BC_INVAL\n", bp->b_lblkno));
1419	VOP_PRINT(bp->b_vp);
1420	}
1421	# endif /* LFS_USE_B_INVAL */
1422	if (!(bp->b_oflags & BO_DELWRI))
1423	panic("lfs_gather: bp not BO_DELWRI");
1424	if (!(bp->b_flags & B_LOCKED)) {
1425	DLOG((DLOG_SEG, "lfs_gather: lbn %" PRId64
1426	" blk %" PRId64 " not B_LOCKED\n",
1427	bp->b_lblkno,
1428	LFS_DBTOFSB(fs, bp->b_blkno)));
1429	VOP_PRINT(bp->b_vp);
1430	panic("lfs_gather: bp not B_LOCKED");
1431	}
1432	#endif
1433	if (lfs_gatherblock(sp, bp, &bufcache_lock)) {
1434	goto loop;
1435	}
1436	count++;
1437	}
1438	mutex_exit(&bufcache_lock);
1439	lfs_updatemeta(sp);
1440	KASSERT(sp->vp == vp);
1441	sp->vp = NULL;
1442	return count;
1443	}
1444
/*
 * DEBUG_OOFF(n): debugging aid for lfs_update_single().  When the old
 * on-disk address (ooff) read from the inode or indirect block is 0, log
 * that a block is being given an address where none was recorded.  The
 * macro expands in a scope that provides ooff, ip, lbn, ndaddr and daddr;
 * n identifies the call site.  It expands to nothing without DEBUG.
 *
 * ndaddr and daddr are daddr_t and are printed with 64-bit conversions;
 * the inode number is narrowed to int to match its %d conversion.
 */
#ifdef DEBUG
# define DEBUG_OOFF(n) do {						\
	if (ooff == 0) {						\
		DLOG((DLOG_SEG, "lfs_updatemeta[%d]: warning: writing "	\
			"ino %d lbn %" PRId64 " at 0x%" PRIx64		\
			", was 0x0 (or %" PRId64 ")\n",			\
			(n), (int)ip->i_number, lbn, ndaddr, daddr));	\
	}								\
} while (0)
#else
# define DEBUG_OOFF(n)
#endif
1457
1458	/*
1459	* Change the given block's address to ndaddr, finding its previous
1460	* location using ulfs_bmaparray().
1461	*
1462	* Account for this change in the segment table.
1463	*
1464	* called with sp == NULL by roll-forwarding code.
1465	*/
1466	void
1467	lfs_update_single(struct lfs fs, struct* segment *sp,
1468	struct vnode vp, daddr_t lbn, daddr_t ndaddr, int* size)
1469	{
1470	SEGUSE *sup;
1471	struct buf *bp;
1472	struct indir a[ULFS_NIADDR + `2`], *ap;
1473	struct inode *ip;
1474	daddr_t daddr, ooff;
1475	int num, error;
1476	int bb, osize, obb;
1477
1478	ASSERT_SEGLOCK(fs);
1479	KASSERT(sp == NULL \|\| sp->vp == vp);
1480	ip = VTOI(vp);
1481
1482	error = ulfs_bmaparray(vp, lbn, &daddr, a, &num, NULL, NULL);
1483	if (error)
1484	panic("lfs_updatemeta: ulfs_bmaparray returned %d", error);
1485
1486	KASSERT(daddr <= LFS_MAX_DADDR(fs));
1487	if (daddr > `0`)
1488	daddr = LFS_DBTOFSB(fs, daddr);
1489
1490	bb = lfs_numfrags(fs, size);
1491	switch (num) {
1492	case `0`:
1493	ooff = lfs_dino_getdb(fs, ip->i_din, lbn);
1494	DEBUG_OOFF(`0`);
1495	if (ooff == UNWRITTEN)
1496	lfs_dino_setblocks(fs, ip->i_din,
1497	lfs_dino_getblocks(fs, ip->i_din) + bb);
1498	else {
1499	/ possible fragment truncation or extension /
1500	obb = lfs_btofsb(fs, ip->i_lfs_fragsize[lbn]);
1501	lfs_dino_setblocks(fs, ip->i_din,
1502	lfs_dino_getblocks(fs, ip->i_din) + (bb-obb));
1503	}
1504	lfs_dino_setdb(fs, ip->i_din, lbn, ndaddr);
1505	break;
1506	case `1`:
1507	ooff = lfs_dino_getib(fs, ip->i_din, a[`0`].in_off);
1508	DEBUG_OOFF(`1`);
1509	if (ooff == UNWRITTEN)
1510	lfs_dino_setblocks(fs, ip->i_din,
1511	lfs_dino_getblocks(fs, ip->i_din) + bb);
1512	lfs_dino_setib(fs, ip->i_din, a[`0`].in_off, ndaddr);
1513	break;
1514	default:
1515	ap = &a[num - `1`];
1516	if (bread(vp, ap->in_lbn, lfs_sb_getbsize(fs),
1517	B_MODIFY, &bp))
1518	panic("lfs_updatemeta: bread bno %" PRId64,
1519	ap->in_lbn);
1520
1521	ooff = lfs_iblock_get(fs, bp->b_data, ap->in_off);
1522	DEBUG_OOFF(num);
1523	if (ooff == UNWRITTEN)
1524	lfs_dino_setblocks(fs, ip->i_din,
1525	lfs_dino_getblocks(fs, ip->i_din) + bb);
1526	lfs_iblock_set(fs, bp->b_data, ap->in_off, ndaddr);
1527	(void) VOP_BWRITE(bp->b_vp, bp);
1528	}
1529
1530	KASSERT(ooff == `0` \|\| ooff == UNWRITTEN \|\| ooff == daddr);
1531
1532	/ Update hiblk when extending the file /
1533	if (lbn > ip->i_lfs_hiblk)
1534	ip->i_lfs_hiblk = lbn;
1535
1536	/*
1537	* Though we'd rather it couldn't, this can happen right now
1538	* if cleaning blocks and regular blocks coexist.
1539	*/
1540	/ KASSERT(daddr < fs->lfs_lastpseg \|\| daddr > ndaddr); /
1541
1542	/*
1543	* Update segment usage information, based on old size
1544	* and location.
1545	*/
1546	if (daddr > `0`) {
1547	u_int32_t oldsn = lfs_dtosn(fs, daddr);
1548	#ifdef DIAGNOSTIC
1549	int ndupino;
1550
1551	if (sp && sp->seg_number == oldsn) {
1552	ndupino = sp->ndupino;
1553	} else {
1554	ndupino = `0`;
1555	}
1556	#endif
1557	KASSERT(oldsn < lfs_sb_getnseg(fs));
1558	if (lbn >= `0` && lbn < ULFS_NDADDR)
1559	osize = ip->i_lfs_fragsize[lbn];
1560	else
1561	osize = lfs_sb_getbsize(fs);
1562	LFS_SEGENTRY(sup, fs, oldsn, bp);
1563	#ifdef DIAGNOSTIC
1564	if (sup->su_nbytes + DINOSIZE(fs) * ndupino < osize) {
1565	printf("lfs_updatemeta: negative bytes "
1566	"(segment %" PRIu32 " short by %" PRId64
1567	")\n", lfs_dtosn(fs, daddr),
1568	(int64_t)osize -
1569	(DINOSIZE(fs) * ndupino + sup->su_nbytes));
1570	printf("lfs_updatemeta: ino %llu, lbn %" PRId64
1571	", addr = 0x%" PRIx64 "\n",
1572	(unsigned long long)ip->i_number, lbn, daddr);
1573	printf("lfs_updatemeta: ndupino=%d\n", ndupino);
1574	panic("lfs_updatemeta: negative bytes");
1575	sup->su_nbytes = osize -
1576	DINOSIZE(fs) * ndupino;
1577	}
1578	#endif
1579	DLOG((DLOG_SU, "seg %" PRIu32 " -= %d for ino %d lbn %" PRId64
1580	" db 0x%" PRIx64 "\n",
1581	lfs_dtosn(fs, daddr), osize,
1582	ip->i_number, lbn, daddr));
1583	sup->su_nbytes -= osize;
1584	if (!(bp->b_flags & B_GATHERED)) {
1585	mutex_enter(&lfs_lock);
1586	fs->lfs_flags \|= LFS_IFDIRTY;
1587	mutex_exit(&lfs_lock);
1588	}
1589	LFS_WRITESEGENTRY(sup, fs, oldsn, bp);
1590	}
1591	/*
1592	* Now that this block has a new address, and its old
1593	* segment no longer owns it, we can forget about its
1594	* old size.
1595	*/
1596	if (lbn >= `0` && lbn < ULFS_NDADDR)
1597	ip->i_lfs_fragsize[lbn] = size;
1598	}
1599
1600	/*
1601	* Update the metadata that points to the blocks listed in the FINFO
1602	* array.
1603	*/
1604	void
1605	lfs_updatemeta(struct segment *sp)
1606	{
1607	struct buf *sbp;
1608	struct lfs *fs;
1609	struct vnode *vp;
1610	daddr_t lbn;
1611	int i, nblocks, num;
1612	int __diagused nblocks_orig;
1613	int bb;
1614	int bytesleft, size;
1615	unsigned lastlength;
1616	union lfs_blocks tmpptr;
1617
1618	fs = sp->fs;
1619	vp = sp->vp;
1620	ASSERT_SEGLOCK(fs);
1621
1622	/*
1623	* This used to be:
1624	*
1625	* nblocks = &sp->fip->fi_blocks[sp->fip->fi_nblocks] - sp->start_lbp;
1626	*
1627	* that is, it allowed for the possibility that start_lbp did
1628	* not point to the beginning of the finfo block pointer area.
1629	* This particular formulation is six kinds of painful in the
1630	* lfs64 world where we have two sizes of block pointer, so
1631	* unless/until everything can be cleaned up to not move
1632	* start_lbp around but instead use an offset, we do the
1633	* following:
1634	* 1. Get NEXT_FINFO(sp->fip). This is the same pointer as
1635	* &sp->fip->fi_blocks[sp->fip->fi_nblocks], just the wrong
1636	* type. (Ugh.)
1637	* 2. Cast it to void *, then assign it to a temporary
1638	* union lfs_blocks.
1639	* 3. Subtract start_lbp from that.
1640	* 4. Save the value of nblocks in blocks_orig so we can
1641	* assert below that it hasn't changed without repeating this
1642	* rubbish.
1643	*
1644	* XXX.
1645	*/
1646	lfs_blocks_fromvoid(fs, &tmpptr, (void *)NEXT_FINFO(fs, sp->fip));
1647	nblocks = lfs_blocks_sub(fs, &tmpptr, &sp->start_lbp);
1648	nblocks_orig = nblocks;
1649
1650	KASSERT(nblocks >= `0`);
1651	KASSERT(vp != NULL);
1652	if (nblocks == `0`)
1653	return;
1654
1655	/*
1656	* This count may be high due to oversize blocks from lfs_gop_write.
1657	* Correct for this. (XXX we should be able to keep track of these.)
1658	*/
1659	for (i = `0`; i < nblocks; i++) {
1660	if (sp->start_bpp[i] == NULL) {
1661	DLOG((DLOG_SEG, "lfs_updatemeta: nblocks = %d, not %d\n", i, nblocks));
1662	nblocks = i;
1663	break;
1664	}
1665	num = howmany(sp->start_bpp[i]->b_bcount, lfs_sb_getbsize(fs));
1666	KASSERT(sp->start_bpp[i]->b_lblkno >= `0` \|\| num == `1`);
1667	nblocks -= num - `1`;
1668	}
1669
1670	#if 0
1671	/ pre-lfs64 assertion /
1672	KASSERT(vp->v_type == VREG \|\|
1673	nblocks == &sp->fip->fi_blocks[sp->fip->fi_nblocks] - sp->start_lbp);
1674	#else
1675	KASSERT(vp->v_type == VREG \|\| nblocks == nblocks_orig);
1676	#endif
1677	KASSERT(nblocks == sp->cbpp - sp->start_bpp);
1678
1679	/*
1680	* Sort the blocks.
1681	*
1682	* We have to sort even if the blocks come from the
1683	* cleaner, because there might be other pending blocks on the
1684	* same inode...and if we don't sort, and there are fragments
1685	* present, blocks may be written in the wrong place.
1686	*/
1687	lfs_shellsort(fs, sp->start_bpp, &sp->start_lbp, nblocks, lfs_sb_getbsize(fs));
1688
1689	/*
1690	* Record the length of the last block in case it's a fragment.
1691	* If there are indirect blocks present, they sort last. An
1692	* indirect block will be lfs_bsize and its presence indicates
1693	* that you cannot have fragments.
1694	*
1695	* XXX This last is a lie. A cleaned fragment can coexist with
1696	* XXX a later indirect block. This will continue to be
1697	* XXX true until lfs_markv is fixed to do everything with
1698	* XXX fake blocks (including fake inodes and fake indirect blocks).
1699	*/
1700	lastlength = ((sp->start_bpp[nblocks - `1`]->b_bcount - `1`) &
1701	lfs_sb_getbmask(fs)) + `1`;
1702	lfs_fi_setlastlength(fs, sp->fip, lastlength);
1703
1704	/*
1705	* Assign disk addresses, and update references to the logical
1706	* block and the segment usage information.
1707	*/
1708	for (i = nblocks; i--; ++sp->start_bpp) {
1709	sbp = *sp->start_bpp;
1710	lbn = lfs_blocks_get(fs, &sp->start_lbp, `0`);
1711	KASSERT(sbp->b_lblkno == lbn);
1712
1713	sbp->b_blkno = LFS_FSBTODB(fs, lfs_sb_getoffset(fs));
1714
1715	/*
1716	* If we write a frag in the wrong place, the cleaner won't
1717	* be able to correctly identify its size later, and the
1718	* segment will be uncleanable. (Even worse, it will assume
1719	* that the indirect block that actually ends the list
1720	* is of a smaller size!)
1721	*/
1722	if ((sbp->b_bcount & lfs_sb_getbmask(fs)) && i != `0`)
1723	panic("lfs_updatemeta: fragment is not last block");
1724
1725	/*
1726	* For each subblock in this possibly oversized block,
1727	* update its address on disk.
1728	*/
1729	KASSERT(lbn >= `0` \|\| sbp->b_bcount == lfs_sb_getbsize(fs));
1730	KASSERT(vp == sbp->b_vp);
1731	for (bytesleft = sbp->b_bcount; bytesleft > `0`;
1732	bytesleft -= lfs_sb_getbsize(fs)) {
1733	size = MIN(bytesleft, lfs_sb_getbsize(fs));
1734	bb = lfs_numfrags(fs, size);
1735	lbn = lfs_blocks_get(fs, &sp->start_lbp, `0`);
1736	lfs_blocks_inc(fs, &sp->start_lbp);
1737	lfs_update_single(fs, sp, sp->vp, lbn, lfs_sb_getoffset(fs),
1738	size);
1739	lfs_sb_addoffset(fs, bb);
1740	}
1741
1742	}
1743
1744	/ This inode has been modified /
1745	LFS_SET_UINO(VTOI(vp), IN_MODIFIED);
1746	}
1747
1748	/*
1749	* Move lfs_offset to a segment earlier than newsn.
1750	*/
1751	int
1752	lfs_rewind(struct lfs fs, int* newsn)
1753	{
1754	int sn, osn, isdirty;
1755	struct buf *bp;
1756	SEGUSE *sup;
1757
1758	ASSERT_SEGLOCK(fs);
1759
1760	osn = lfs_dtosn(fs, lfs_sb_getoffset(fs));
1761	if (osn < newsn)
1762	return `0`;
1763
1764	/ lfs_avail eats the remaining space in this segment /
1765	lfs_sb_subavail(fs, lfs_sb_getfsbpseg(fs) - (lfs_sb_getoffset(fs) - lfs_sb_getcurseg(fs)));
1766
1767	/ Find a low-numbered segment /
1768	for (sn = `0`; sn < lfs_sb_getnseg(fs); ++sn) {
1769	LFS_SEGENTRY(sup, fs, sn, bp);
1770	isdirty = sup->su_flags & SEGUSE_DIRTY;
1771	brelse(bp, `0`);
1772
1773	if (!isdirty)
1774	break;
1775	}
1776	if (sn == lfs_sb_getnseg(fs))
1777	panic("lfs_rewind: no clean segments");
1778	if (newsn >= `0` && sn >= newsn)
1779	return ENOENT;
1780	lfs_sb_setnextseg(fs, lfs_sntod(fs, sn));
1781	lfs_newseg(fs);
1782	lfs_sb_setoffset(fs, lfs_sb_getcurseg(fs));
1783
1784	return `0`;
1785	}
1786
1787	/*
1788	* Start a new partial segment.
1789	*
1790	* Return 1 when we entered to a new segment.
1791	* Otherwise, return 0.
1792	*/
1793	int
1794	lfs_initseg(struct lfs *fs)
1795	{
1796	struct segment *sp = fs->lfs_sp;
1797	SEGSUM *ssp;
1798	struct buf sbp; /* buffer for SEGSUM /
1799	int repeat = `0`; / return value /
1800
1801	ASSERT_SEGLOCK(fs);
1802	/ Advance to the next segment. /
1803	if (!LFS_PARTIAL_FITS(fs)) {
1804	SEGUSE *sup;
1805	struct buf *bp;
1806
1807	/ lfs_avail eats the remaining space /
1808	lfs_sb_subavail(fs, lfs_sb_getfsbpseg(fs) - (lfs_sb_getoffset(fs) -
1809	lfs_sb_getcurseg(fs)));
1810	/ Wake up any cleaning procs waiting on this file system. /
1811	lfs_wakeup_cleaner(fs);
1812	lfs_newseg(fs);
1813	repeat = `1`;
1814	lfs_sb_setoffset(fs, lfs_sb_getcurseg(fs));
1815
1816	sp->seg_number = lfs_dtosn(fs, lfs_sb_getcurseg(fs));
1817	sp->seg_bytes_left = lfs_fsbtob(fs, lfs_sb_getfsbpseg(fs));
1818
1819	/*
1820	* If the segment contains a superblock, update the offset
1821	* and summary address to skip over it.
1822	*/
1823	LFS_SEGENTRY(sup, fs, sp->seg_number, bp);
1824	if (sup->su_flags & SEGUSE_SUPERBLOCK) {
1825	lfs_sb_addoffset(fs, lfs_btofsb(fs, LFS_SBPAD));
1826	sp->seg_bytes_left -= LFS_SBPAD;
1827	}
1828	brelse(bp, `0`);
1829	/ Segment zero could also contain the labelpad /
1830	if (lfs_sb_getversion(fs) > `1` && sp->seg_number == `0` &&
1831	lfs_sb_gets0addr(fs) < lfs_btofsb(fs, LFS_LABELPAD)) {
1832	lfs_sb_addoffset(fs,
1833	lfs_btofsb(fs, LFS_LABELPAD) - lfs_sb_gets0addr(fs));
1834	sp->seg_bytes_left -=
1835	LFS_LABELPAD - lfs_fsbtob(fs, lfs_sb_gets0addr(fs));
1836	}
1837	} else {
1838	sp->seg_number = lfs_dtosn(fs, lfs_sb_getcurseg(fs));
1839	sp->seg_bytes_left = lfs_fsbtob(fs, lfs_sb_getfsbpseg(fs) -
1840	(lfs_sb_getoffset(fs) - lfs_sb_getcurseg(fs)));
1841	}
1842	lfs_sb_setlastpseg(fs, lfs_sb_getoffset(fs));
1843
1844	/ Record first address of this partial segment /
1845	if (sp->seg_flags & SEGM_CLEAN) {
1846	fs->lfs_cleanint[fs->lfs_cleanind] = lfs_sb_getoffset(fs);
1847	if (++fs->lfs_cleanind >= LFS_MAX_CLEANIND) {
1848	/ "1" is the artificial inc in lfs_seglock /
1849	mutex_enter(&lfs_lock);
1850	while (fs->lfs_iocount > `1`) {
1851	mtsleep(&fs->lfs_iocount, PRIBIO + `1`,
1852	"lfs_initseg", `0`, &lfs_lock);
1853	}
1854	mutex_exit(&lfs_lock);
1855	fs->lfs_cleanind = `0`;
1856	}
1857	}
1858
1859	sp->fs = fs;
1860	sp->ibp = NULL;
1861	sp->idp = NULL;
1862	sp->ninodes = `0`;
1863	sp->ndupino = `0`;
1864
1865	sp->cbpp = sp->bpp;
1866
1867	/ Get a new buffer for SEGSUM /
1868	sbp = lfs_newbuf(fs, VTOI(fs->lfs_ivnode)->i_devvp,
1869	LFS_FSBTODB(fs, lfs_sb_getoffset(fs)), lfs_sb_getsumsize(fs), LFS_NB_SUMMARY);
1870
1871	/ ... and enter it into the buffer list. /
1872	*sp->cbpp = sbp;
1873	sp->cbpp++;
1874	lfs_sb_addoffset(fs, lfs_btofsb(fs, lfs_sb_getsumsize(fs)));
1875
1876	sp->start_bpp = sp->cbpp;
1877
1878	/ Set point to SEGSUM, initialize it. /
1879	ssp = sp->segsum = sbp->b_data;
1880	memset(ssp, `0`, lfs_sb_getsumsize(fs));
1881	lfs_ss_setnext(fs, ssp, lfs_sb_getnextseg(fs));
1882	lfs_ss_setnfinfo(fs, ssp, `0`);
1883	lfs_ss_setninos(fs, ssp, `0`);
1884	lfs_ss_setmagic(fs, ssp, SS_MAGIC);
1885
1886	/ Set pointer to first FINFO, initialize it. /
1887	sp->fip = SEGSUM_FINFOBASE(fs, sp->segsum);
1888	lfs_fi_setnblocks(fs, sp->fip, `0`);
1889	lfs_fi_setlastlength(fs, sp->fip, `0`);
1890	lfs_blocks_fromfinfo(fs, &sp->start_lbp, sp->fip);
1891
1892	sp->seg_bytes_left -= lfs_sb_getsumsize(fs);
1893	sp->sum_bytes_left = lfs_sb_getsumsize(fs) - SEGSUM_SIZE(fs);
1894
1895	return (repeat);
1896	}
1897
1898	/*
1899	* Remove SEGUSE_INVAL from all segments.
1900	*/
1901	void
1902	lfs_unset_inval_all(struct lfs *fs)
1903	{
1904	SEGUSE *sup;
1905	struct buf *bp;
1906	int i;
1907
1908	for (i = `0`; i < lfs_sb_getnseg(fs); i++) {
1909	LFS_SEGENTRY(sup, fs, i, bp);
1910	if (sup->su_flags & SEGUSE_INVAL) {
1911	sup->su_flags &= ~SEGUSE_INVAL;
1912	LFS_WRITESEGENTRY(sup, fs, i, bp);
1913	} else
1914	brelse(bp, `0`);
1915	}
1916	}
1917
1918	/*
1919	* Return the next segment to write.
1920	*/
1921	void
1922	lfs_newseg(struct lfs *fs)
1923	{
1924	CLEANERINFO *cip;
1925	SEGUSE *sup;
1926	struct buf *bp;
1927	int curseg, isdirty, sn, skip_inval;
1928
1929	ASSERT_SEGLOCK(fs);
1930
1931	/ Honor LFCNWRAPSTOP /
1932	mutex_enter(&lfs_lock);
1933	while (lfs_sb_getnextseg(fs) < lfs_sb_getcurseg(fs) && fs->lfs_nowrap) {
1934	if (fs->lfs_wrappass) {
1935	log(LOG_NOTICE, "%s: wrappass=%d\n",
1936	lfs_sb_getfsmnt(fs), fs->lfs_wrappass);
1937	fs->lfs_wrappass = `0`;
1938	break;
1939	}
1940	fs->lfs_wrapstatus = LFS_WRAP_WAITING;
1941	wakeup(&fs->lfs_nowrap);
1942	log(LOG_NOTICE, "%s: waiting at log wrap\n", lfs_sb_getfsmnt(fs));
1943	mtsleep(&fs->lfs_wrappass, PVFS, "newseg", `10` * hz,
1944	&lfs_lock);
1945	}
1946	fs->lfs_wrapstatus = LFS_WRAP_GOING;
1947	mutex_exit(&lfs_lock);
1948
1949	LFS_SEGENTRY(sup, fs, lfs_dtosn(fs, lfs_sb_getnextseg(fs)), bp);
1950	DLOG((DLOG_SU, "lfs_newseg: seg %d := 0 in newseg\n",
1951	lfs_dtosn(fs, lfs_sb_getnextseg(fs))));
1952	sup->su_flags \|= SEGUSE_DIRTY \| SEGUSE_ACTIVE;
1953	sup->su_nbytes = `0`;
1954	sup->su_nsums = `0`;
1955	sup->su_ninos = `0`;
1956	LFS_WRITESEGENTRY(sup, fs, lfs_dtosn(fs, lfs_sb_getnextseg(fs)), bp);
1957
1958	LFS_CLEANERINFO(cip, fs, bp);
1959	lfs_ci_shiftcleantodirty(fs, cip, `1`);
1960	lfs_sb_setnclean(fs, lfs_ci_getclean(fs, cip));
1961	LFS_SYNC_CLEANERINFO(cip, fs, bp, `1`);
1962
1963	lfs_sb_setlastseg(fs, lfs_sb_getcurseg(fs));
1964	lfs_sb_setcurseg(fs, lfs_sb_getnextseg(fs));
1965	skip_inval = `1`;
1966	for (sn = curseg = lfs_dtosn(fs, lfs_sb_getcurseg(fs)) + lfs_sb_getinterleave(fs);;) {
1967	sn = (sn + `1`) % lfs_sb_getnseg(fs);
1968
1969	if (sn == curseg) {
1970	if (skip_inval)
1971	skip_inval = `0`;
1972	else
1973	panic("lfs_nextseg: no clean segments");
1974	}
1975	LFS_SEGENTRY(sup, fs, sn, bp);
1976	isdirty = sup->su_flags & (SEGUSE_DIRTY \| (skip_inval ? SEGUSE_INVAL : `0`));
1977	/ Check SEGUSE_EMPTY as we go along /
1978	if (isdirty && sup->su_nbytes == `0` &&
1979	!(sup->su_flags & SEGUSE_EMPTY))
1980	LFS_WRITESEGENTRY(sup, fs, sn, bp);
1981	else
1982	brelse(bp, `0`);
1983
1984	if (!isdirty)
1985	break;
1986	}
1987	if (skip_inval == `0`)
1988	lfs_unset_inval_all(fs);
1989
1990	++fs->lfs_nactive;
1991	lfs_sb_setnextseg(fs, lfs_sntod(fs, sn));
1992	if (lfs_dostats) {
1993	++lfs_stats.segsused;
1994	}
1995	}
1996
1997	static struct buf *
1998	lfs_newclusterbuf(struct lfs fs, struct* vnode *vp, daddr_t addr,
1999	int n)
2000	{
2001	struct lfs_cluster *cl;
2002	struct buf *bpp, bp;
2003
2004	ASSERT_SEGLOCK(fs);
2005	cl = (struct lfs_cluster *)pool_get(&fs->lfs_clpool, PR_WAITOK);
2006	bpp = (struct buf **)pool_get(&fs->lfs_bpppool, PR_WAITOK);
2007	memset(cl, `0`, sizeof(*cl));
2008	cl->fs = fs;
2009	cl->bpp = bpp;
2010	cl->bufcount = `0`;
2011	cl->bufsize = `0`;
2012
2013	/ If this segment is being written synchronously, note that /
2014	if (fs->lfs_sp->seg_flags & SEGM_SYNC) {
2015	cl->flags \|= LFS_CL_SYNC;
2016	cl->seg = fs->lfs_sp;
2017	++cl->seg->seg_iocount;
2018	}
2019
2020	/ Get an empty buffer header, or maybe one with something on it /
2021	bp = getiobuf(vp, true);
2022	bp->b_dev = NODEV;
2023	bp->b_blkno = bp->b_lblkno = addr;
2024	bp->b_iodone = lfs_cluster_callback;
2025	bp->b_private = cl;
2026
2027	return bp;
2028	}
2029
2030	int
2031	lfs_writeseg(struct lfs fs, struct* segment *sp)
2032	{
2033	struct buf *bpp, bp, cbp, newbp, *unbusybp;
2034	SEGUSE *sup;
2035	SEGSUM *ssp;
2036	int i;
2037	int do_again, nblocks, byteoffset;
2038	size_t el_size;
2039	struct lfs_cluster *cl;
2040	u_short ninos;
2041	struct vnode *devvp;
2042	char *p = NULL;
2043	struct vnode *vp;
2044	unsigned ibindex, iblimit;
2045	int changed;
2046	u_int32_t sum;
2047	size_t sumstart;
2048	#ifdef DEBUG
2049	FINFO *fip;
2050	int findex;
2051	#endif
2052
2053	ASSERT_SEGLOCK(fs);
2054
2055	ssp = (SEGSUM *)sp->segsum;
2056
2057	/*
2058	* If there are no buffers other than the segment summary to write,
2059	* don't do anything. If we are the end of a dirop sequence, however,
2060	* write the empty segment summary anyway, to help out the
2061	* roll-forward agent.
2062	*/
2063	if ((nblocks = sp->cbpp - sp->bpp) == `1`) {
2064	if ((lfs_ss_getflags(fs, ssp) & (SS_DIROP \| SS_CONT)) != SS_DIROP)
2065	return `0`;
2066	}
2067
2068	/ Note if partial segment is being written by the cleaner /
2069	if (sp->seg_flags & SEGM_CLEAN)
2070	lfs_ss_setflags(fs, ssp, lfs_ss_getflags(fs, ssp) \| SS_CLEAN);
2071
2072	/ Note if we are writing to reclaim /
2073	if (sp->seg_flags & SEGM_RECLAIM) {
2074	lfs_ss_setflags(fs, ssp, lfs_ss_getflags(fs, ssp) \| SS_RECLAIM);
2075	lfs_ss_setreclino(fs, ssp, fs->lfs_reclino);
2076	}
2077
2078	devvp = VTOI(fs->lfs_ivnode)->i_devvp;
2079
2080	/ Update the segment usage information. /
2081	LFS_SEGENTRY(sup, fs, sp->seg_number, bp);
2082
2083	/ Loop through all blocks, except the segment summary. /
2084	for (bpp = sp->bpp; ++bpp < sp->cbpp; ) {
2085	if ((*bpp)->b_vp != devvp) {
2086	sup->su_nbytes += (*bpp)->b_bcount;
2087	DLOG((DLOG_SU, "seg %" PRIu32 " += %ld for ino %d"
2088	" lbn %" PRId64 " db 0x%" PRIx64 "\n",
2089	sp->seg_number, (*bpp)->b_bcount,
2090	VTOI((bpp)->b_vp)->i_number, (bpp)->b_lblkno,
2091	(*bpp)->b_blkno));
2092	}
2093	}
2094
2095	#ifdef DEBUG
2096	/ Check for zero-length and zero-version FINFO entries. /
2097	fip = SEGSUM_FINFOBASE(fs, ssp);
2098	for (findex = `0`; findex < lfs_ss_getnfinfo(fs, ssp); findex++) {
2099	KDASSERT(lfs_fi_getnblocks(fs, fip) > `0`);
2100	KDASSERT(lfs_fi_getversion(fs, fip) > `0`);
2101	fip = NEXT_FINFO(fs, fip);
2102	}
2103	#endif /* DEBUG */
2104
2105	ninos = (lfs_ss_getninos(fs, ssp) + LFS_INOPB(fs) - `1`) / LFS_INOPB(fs);
2106	DLOG((DLOG_SU, "seg %d += %d for %d inodes\n",
2107	sp->seg_number,
2108	lfs_ss_getninos(fs, ssp) * DINOSIZE(fs),
2109	lfs_ss_getninos(fs, ssp)));
2110	sup->su_nbytes += lfs_ss_getninos(fs, ssp) * DINOSIZE(fs);
2111	/ sup->su_nbytes += lfs_sb_getsumsize(fs); /
2112	if (lfs_sb_getversion(fs) == `1`)
2113	sup->su_olastmod = time_second;
2114	else
2115	sup->su_lastmod = time_second;
2116	sup->su_ninos += ninos;
2117	++sup->su_nsums;
2118	lfs_sb_subavail(fs, lfs_btofsb(fs, lfs_sb_getsumsize(fs)));
2119
2120	do_again = !(bp->b_flags & B_GATHERED);
2121	LFS_WRITESEGENTRY(sup, fs, sp->seg_number, bp); / Ifile /
2122
2123	/*
2124	* Mark blocks B_BUSY, to prevent then from being changed between
2125	* the checksum computation and the actual write.
2126	*
2127	* If we are cleaning, check indirect blocks for UNWRITTEN, and if
2128	* there are any, replace them with copies that have UNASSIGNED
2129	* instead.
2130	*/
2131	mutex_enter(&bufcache_lock);
2132	for (bpp = sp->bpp, i = nblocks - `1`; i--;) {
2133	++bpp;
2134	bp = *bpp;
2135	if (bp->b_iodone != NULL) { / UBC or malloced buffer /
2136	bp->b_cflags \|= BC_BUSY;
2137	continue;
2138	}
2139
2140	while (bp->b_cflags & BC_BUSY) {
2141	DLOG((DLOG_SEG, "lfs_writeseg: avoiding potential"
2142	" data summary corruption for ino %d, lbn %"
2143	PRId64 "\n",
2144	VTOI(bp->b_vp)->i_number, bp->b_lblkno));
2145	bp->b_cflags \|= BC_WANTED;
2146	cv_wait(&bp->b_busy, &bufcache_lock);
2147	}
2148	bp->b_cflags \|= BC_BUSY;
2149	mutex_exit(&bufcache_lock);
2150	unbusybp = NULL;
2151
2152	/*
2153	* Check and replace indirect block UNWRITTEN bogosity.
2154	* XXX See comment in lfs_writefile.
2155	*/
2156	if (bp->b_lblkno < `0` && bp->b_vp != devvp && bp->b_vp &&
2157	lfs_dino_getblocks(fs, VTOI(bp->b_vp)->i_din) !=
2158	VTOI(bp->b_vp)->i_lfs_effnblks) {
2159	DLOG((DLOG_VNODE, "lfs_writeseg: cleansing ino %d (%jd != %d)\n",
2160	VTOI(bp->b_vp)->i_number,
2161	(intmax_t)VTOI(bp->b_vp)->i_lfs_effnblks,
2162	lfs_dino_getblocks(fs, VTOI(bp->b_vp)->i_din)));
2163	/ Make a copy we'll make changes to /
2164	newbp = lfs_newbuf(fs, bp->b_vp, bp->b_lblkno,
2165	bp->b_bcount, LFS_NB_IBLOCK);
2166	newbp->b_blkno = bp->b_blkno;
2167	memcpy(newbp->b_data, bp->b_data,
2168	newbp->b_bcount);
2169
2170	changed = `0`;
2171	iblimit = newbp->b_bcount / LFS_BLKPTRSIZE(fs);
2172	for (ibindex = `0`; ibindex < iblimit; ibindex++) {
2173	if (lfs_iblock_get(fs, newbp->b_data, ibindex) == UNWRITTEN) {
2174	++changed;
2175	lfs_iblock_set(fs, newbp->b_data,
2176	ibindex, `0`);
2177	}
2178	}
2179	/*
2180	* Get rid of the old buffer. Don't mark it clean,
2181	* though, if it still has dirty data on it.
2182	*/
2183	if (changed) {
2184	DLOG((DLOG_SEG, "lfs_writeseg: replacing UNWRITTEN(%d):"
2185	" bp = %p newbp = %p\n", changed, bp,
2186	newbp));
2187	*bpp = newbp;
2188	bp->b_flags &= ~B_GATHERED;
2189	bp->b_error = `0`;
2190	if (bp->b_iodone != NULL) {
2191	DLOG((DLOG_SEG, "lfs_writeseg: "
2192	"indir bp should not be B_CALL\n"));
2193	biodone(bp);
2194	bp = NULL;
2195	} else {
2196	/ Still on free list, leave it there /
2197	unbusybp = bp;
2198	/*
2199	* We have to re-decrement lfs_avail
2200	* since this block is going to come
2201	* back around to us in the next
2202	* segment.
2203	*/
2204	lfs_sb_subavail(fs,
2205	lfs_btofsb(fs, bp->b_bcount));
2206	}
2207	} else {
2208	lfs_freebuf(fs, newbp);
2209	}
2210	}
2211	mutex_enter(&bufcache_lock);
2212	if (unbusybp != NULL) {
2213	unbusybp->b_cflags &= ~BC_BUSY;
2214	if (unbusybp->b_cflags & BC_WANTED)
2215	cv_broadcast(&bp->b_busy);
2216	}
2217	}
2218	mutex_exit(&bufcache_lock);
2219
2220	/*
2221	* Compute checksum across data and then across summary; the first
2222	* block (the summary block) is skipped. Set the create time here
2223	* so that it's guaranteed to be later than the inode mod times.
2224	*/
2225	sum = `0`;
2226	if (lfs_sb_getversion(fs) == `1`)
2227	el_size = sizeof(u_long);
2228	else
2229	el_size = sizeof(u_int32_t);
2230	for (bpp = sp->bpp, i = nblocks - `1`; i--; ) {
2231	++bpp;
2232	/ Loop through gop_write cluster blocks /
2233	for (byteoffset = `0`; byteoffset < (*bpp)->b_bcount;
2234	byteoffset += lfs_sb_getbsize(fs)) {
2235	#ifdef LFS_USE_B_INVAL
2236	if (((*bpp)->b_cflags & BC_INVAL) != `0` &&
2237	(*bpp)->b_iodone != NULL) {
2238	if (copyin((void )(bpp)->b_saveaddr +
2239	byteoffset, dp, el_size)) {
2240	panic("lfs_writeseg: copyin failed [1]:"
2241	" ino %" PRIu64 " blk %" PRId64,
2242	VTOI((*bpp)->b_vp)->i_number,
2243	(*bpp)->b_lblkno);
2244	}
2245	} else
2246	#endif /* LFS_USE_B_INVAL */
2247	{
2248	sum = lfs_cksum_part((char *)
2249	(*bpp)->b_data + byteoffset, el_size, sum);
2250	}
2251	}
2252	}
2253	if (lfs_sb_getversion(fs) == `1`)
2254	lfs_ss_setocreate(fs, ssp, time_second);
2255	else {
2256	lfs_ss_setcreate(fs, ssp, time_second);
2257	lfs_sb_addserial(fs, `1`);
2258	lfs_ss_setserial(fs, ssp, lfs_sb_getserial(fs));
2259	lfs_ss_setident(fs, ssp, lfs_sb_getident(fs));
2260	}
2261	lfs_ss_setdatasum(fs, ssp, lfs_cksum_fold(sum));
2262	sumstart = lfs_ss_getsumstart(fs);
2263	lfs_ss_setsumsum(fs, ssp, cksum((char *)ssp + sumstart,
2264	lfs_sb_getsumsize(fs) - sumstart));
2265
2266	mutex_enter(&lfs_lock);
2267	lfs_sb_subbfree(fs, (lfs_btofsb(fs, ninos * lfs_sb_getibsize(fs)) +
2268	lfs_btofsb(fs, lfs_sb_getsumsize(fs))));
2269	lfs_sb_adddmeta(fs, (lfs_btofsb(fs, ninos * lfs_sb_getibsize(fs)) +
2270	lfs_btofsb(fs, lfs_sb_getsumsize(fs))));
2271	mutex_exit(&lfs_lock);
2272
2273	/*
2274	* When we simply write the blocks we lose a rotation for every block
2275	* written. To avoid this problem, we cluster the buffers into a
2276	* chunk and write the chunk. MAXPHYS is the largest size I/O
2277	* devices can handle, use that for the size of the chunks.
2278	*
2279	* Blocks that are already clusters (from GOP_WRITE), however, we
2280	* don't bother to copy into other clusters.
2281	*/
2282
2283	#define CHUNKSIZE MAXPHYS
2284
2285	if (devvp == NULL)
2286	panic("devvp is NULL");
2287	for (bpp = sp->bpp, i = nblocks; i;) {
2288	cbp = lfs_newclusterbuf(fs, devvp, (*bpp)->b_blkno, i);
2289	cl = cbp->b_private;
2290
2291	cbp->b_flags \|= B_ASYNC;
2292	cbp->b_cflags \|= BC_BUSY;
2293	cbp->b_bcount = `0`;
2294
2295	#if defined(DEBUG) && defined(DIAGNOSTIC)
2296	if (bpp - sp->bpp > (lfs_sb_getsumsize(fs) - SEGSUM_SIZE(fs))
2297	/ sizeof(int32_t)) {
2298	panic("lfs_writeseg: real bpp overwrite");
2299	}
2300	if (bpp - sp->bpp > lfs_segsize(fs) / lfs_sb_getfsize(fs)) {
2301	panic("lfs_writeseg: theoretical bpp overwrite");
2302	}
2303	#endif
2304
2305	/*
2306	* Construct the cluster.
2307	*/
2308	mutex_enter(&lfs_lock);
2309	++fs->lfs_iocount;
2310	mutex_exit(&lfs_lock);
2311	while (i && cbp->b_bcount < CHUNKSIZE) {
2312	bp = *bpp;
2313
2314	if (bp->b_bcount > (CHUNKSIZE - cbp->b_bcount))
2315	break;
2316	if (cbp->b_bcount > `0` && !(cl->flags & LFS_CL_MALLOC))
2317	break;
2318
2319	/ Clusters from GOP_WRITE are expedited /
2320	if (bp->b_bcount > lfs_sb_getbsize(fs)) {
2321	if (cbp->b_bcount > `0`)
2322	/ Put in its own buffer /
2323	break;
2324	else {
2325	cbp->b_data = bp->b_data;
2326	}
2327	} else if (cbp->b_bcount == `0`) {
2328	p = cbp->b_data = lfs_malloc(fs, CHUNKSIZE,
2329	LFS_NB_CLUSTER);
2330	cl->flags \|= LFS_CL_MALLOC;
2331	}
2332	#ifdef DIAGNOSTIC
2333	if (lfs_dtosn(fs, LFS_DBTOFSB(fs, bp->b_blkno +
2334	btodb(bp->b_bcount - `1`))) !=
2335	sp->seg_number) {
2336	printf("blk size %d daddr %" PRIx64
2337	" not in seg %d\n",
2338	bp->b_bcount, bp->b_blkno,
2339	sp->seg_number);
2340	panic("segment overwrite");
2341	}
2342	#endif
2343
2344	#ifdef LFS_USE_B_INVAL
2345	/*
2346	* Fake buffers from the cleaner are marked as B_INVAL.
2347	* We need to copy the data from user space rather than
2348	* from the buffer indicated.
2349	* XXX == what do I do on an error?
2350	*/
2351	if ((bp->b_cflags & BC_INVAL) != `0` &&
2352	bp->b_iodone != NULL) {
2353	if (copyin(bp->b_saveaddr, p, bp->b_bcount))
2354	panic("lfs_writeseg: "
2355	"copyin failed [2]");
2356	} else
2357	#endif /* LFS_USE_B_INVAL */
2358	if (cl->flags & LFS_CL_MALLOC) {
2359	/ copy data into our cluster. /
2360	memcpy(p, bp->b_data, bp->b_bcount);
2361	p += bp->b_bcount;
2362	}
2363
2364	cbp->b_bcount += bp->b_bcount;
2365	cl->bufsize += bp->b_bcount;
2366
2367	bp->b_flags &= ~B_READ;
2368	bp->b_error = `0`;
2369	cl->bpp[cl->bufcount++] = bp;
2370
2371	vp = bp->b_vp;
2372	mutex_enter(&bufcache_lock);
2373	mutex_enter(vp->v_interlock);
2374	bp->b_oflags &= ~(BO_DELWRI \| BO_DONE);
2375	reassignbuf(bp, vp);
2376	vp->v_numoutput++;
2377	mutex_exit(vp->v_interlock);
2378	mutex_exit(&bufcache_lock);
2379
2380	bpp++;
2381	i--;
2382	}
2383	if (fs->lfs_sp->seg_flags & SEGM_SYNC)
2384	BIO_SETPRIO(cbp, BPRIO_TIMECRITICAL);
2385	else
2386	BIO_SETPRIO(cbp, BPRIO_TIMELIMITED);
2387	mutex_enter(devvp->v_interlock);
2388	devvp->v_numoutput++;
2389	mutex_exit(devvp->v_interlock);
2390	VOP_STRATEGY(devvp, cbp);
2391	curlwp->l_ru.ru_oublock++;
2392	}
2393
2394	if (lfs_dostats) {
2395	++lfs_stats.psegwrites;
2396	lfs_stats.blocktot += nblocks - `1`;
2397	if (fs->lfs_sp->seg_flags & SEGM_SYNC)
2398	++lfs_stats.psyncwrites;
2399	if (fs->lfs_sp->seg_flags & SEGM_CLEAN) {
2400	++lfs_stats.pcleanwrites;
2401	lfs_stats.cleanblocks += nblocks - `1`;
2402	}
2403	}
2404
2405	return (lfs_initseg(fs) \|\| do_again);
2406	}
2407
2408	void
2409	lfs_writesuper(struct lfs *fs, daddr_t daddr)
2410	{
2411	struct buf *bp;
2412	struct vnode *devvp = VTOI(fs->lfs_ivnode)->i_devvp;
2413	int s;
2414
2415	ASSERT_MAYBE_SEGLOCK(fs);
2416	#ifdef DIAGNOSTIC
2417	if (fs->lfs_is64) {
2418	KASSERT(fs->lfs_dlfs_u.u_64.dlfs_magic == LFS64_MAGIC);
2419	} else {
2420	KASSERT(fs->lfs_dlfs_u.u_32.dlfs_magic == LFS_MAGIC);
2421	}
2422	#endif
2423	/*
2424	* If we can write one superblock while another is in
2425	* progress, we risk not having a complete checkpoint if we crash.
2426	* So, block here if a superblock write is in progress.
2427	*/
2428	mutex_enter(&lfs_lock);
2429	s = splbio();
2430	while (fs->lfs_sbactive) {
2431	mtsleep(&fs->lfs_sbactive, PRIBIO+`1`, "lfs sb", `0`,
2432	&lfs_lock);
2433	}
2434	fs->lfs_sbactive = daddr;
2435	splx(s);
2436	mutex_exit(&lfs_lock);
2437
2438	/ Set timestamp of this version of the superblock /
2439	if (lfs_sb_getversion(fs) == `1`)
2440	lfs_sb_setotstamp(fs, time_second);
2441	lfs_sb_settstamp(fs, time_second);
2442
2443	/ The next chunk of code relies on this assumption /
2444	CTASSERT(sizeof(struct dlfs) == sizeof(struct dlfs64));
2445
2446	/ Checksum the superblock and copy it into a buffer. /
2447	lfs_sb_setcksum(fs, lfs_sb_cksum(fs));
2448	bp = lfs_newbuf(fs, devvp,
2449	LFS_FSBTODB(fs, daddr), LFS_SBPAD, LFS_NB_SBLOCK);
2450	memcpy(bp->b_data, &fs->lfs_dlfs_u, sizeof(struct dlfs));
2451	memset((char )bp->b_data + sizeof(struct* dlfs), `0`,
2452	LFS_SBPAD - sizeof(struct dlfs));
2453
2454	bp->b_cflags \|= BC_BUSY;
2455	bp->b_flags = (bp->b_flags & ~B_READ) \| B_ASYNC;
2456	bp->b_oflags &= ~(BO_DONE \| BO_DELWRI);
2457	bp->b_error = `0`;
2458	bp->b_iodone = lfs_supercallback;
2459
2460	if (fs->lfs_sp != NULL && fs->lfs_sp->seg_flags & SEGM_SYNC)
2461	BIO_SETPRIO(bp, BPRIO_TIMECRITICAL);
2462	else
2463	BIO_SETPRIO(bp, BPRIO_TIMELIMITED);
2464	curlwp->l_ru.ru_oublock++;
2465
2466	mutex_enter(devvp->v_interlock);
2467	devvp->v_numoutput++;
2468	mutex_exit(devvp->v_interlock);
2469
2470	mutex_enter(&lfs_lock);
2471	++fs->lfs_iocount;
2472	mutex_exit(&lfs_lock);
2473	VOP_STRATEGY(devvp, bp);
2474	}
2475
2476	/*
2477	* Logical block number match routines used when traversing the dirty block
2478	* chain.
2479	*/
/* Match buffers for which LFS_IS_MALLOC_BUF() is true. */
int
lfs_match_fake(struct lfs *fs, struct buf *bp)
{

	ASSERT_SEGLOCK(fs);
	return LFS_IS_MALLOC_BUF(bp);
}
2487
#if 0
/* Match data buffers that are not malloced ("fake") buffers.  Disabled. */
int
lfs_match_real(struct lfs *fs, struct buf *bp)
{

	ASSERT_SEGLOCK(fs);
	return (lfs_match_data(fs, bp) && !lfs_match_fake(fs, bp));
}
#endif
2497
2498	int
2499	lfs_match_data(struct lfs fs, struct* buf *bp)
2500	{
2501
2502	ASSERT_SEGLOCK(fs);
2503	return (bp->b_lblkno >= `0`);
2504	}
2505
2506	int
2507	lfs_match_indir(struct lfs fs, struct* buf *bp)
2508	{
2509	daddr_t lbn;
2510
2511	ASSERT_SEGLOCK(fs);
2512	lbn = bp->b_lblkno;
2513	return (lbn < `0` && (-lbn - ULFS_NDADDR) % LFS_NINDIR(fs) == `0`);
2514	}
2515
2516	int
2517	lfs_match_dindir(struct lfs fs, struct* buf *bp)
2518	{
2519	daddr_t lbn;
2520
2521	ASSERT_SEGLOCK(fs);
2522	lbn = bp->b_lblkno;
2523	return (lbn < `0` && (-lbn - ULFS_NDADDR) % LFS_NINDIR(fs) == `1`);
2524	}
2525
2526	int
2527	lfs_match_tindir(struct lfs fs, struct* buf *bp)
2528	{
2529	daddr_t lbn;
2530
2531	ASSERT_SEGLOCK(fs);
2532	lbn = bp->b_lblkno;
2533	return (lbn < `0` && (-lbn - ULFS_NDADDR) % LFS_NINDIR(fs) == `2`);
2534	}
2535
2536	static void
2537	lfs_free_aiodone(struct buf *bp)
2538	{
2539	struct lfs *fs;
2540
2541	KERNEL_LOCK(`1`, curlwp);
2542	fs = bp->b_private;
2543	ASSERT_NO_SEGLOCK(fs);
2544	lfs_freebuf(fs, bp);
2545	KERNEL_UNLOCK_LAST(curlwp);
2546	}
2547
2548	static void
2549	lfs_super_aiodone(struct buf *bp)
2550	{
2551	struct lfs *fs;
2552
2553	KERNEL_LOCK(`1`, curlwp);
2554	fs = bp->b_private;
2555	ASSERT_NO_SEGLOCK(fs);
2556	mutex_enter(&lfs_lock);
2557	fs->lfs_sbactive = `0`;
2558	if (--fs->lfs_iocount <= `1`)
2559	wakeup(&fs->lfs_iocount);
2560	wakeup(&fs->lfs_sbactive);
2561	mutex_exit(&lfs_lock);
2562	lfs_freebuf(fs, bp);
2563	KERNEL_UNLOCK_LAST(curlwp);
2564	}
2565
2566	static void
2567	lfs_cluster_aiodone(struct buf *bp)
2568	{
2569	struct lfs_cluster *cl;
2570	struct lfs *fs;
2571	struct buf tbp, fbp;
2572	struct vnode vp, devvp, *ovp;
2573	struct inode *ip;
2574	int error;
2575
2576	KERNEL_LOCK(`1`, curlwp);
2577
2578	error = bp->b_error;
2579	cl = bp->b_private;
2580	fs = cl->fs;
2581	devvp = VTOI(fs->lfs_ivnode)->i_devvp;
2582	ASSERT_NO_SEGLOCK(fs);
2583
2584	/ Put the pages back, and release the buffer /
2585	while (cl->bufcount--) {
2586	tbp = cl->bpp[cl->bufcount];
2587	KASSERT(tbp->b_cflags & BC_BUSY);
2588	if (error) {
2589	tbp->b_error = error;
2590	}
2591
2592	/*
2593	* We're done with tbp. If it has not been re-dirtied since
2594	* the cluster was written, free it. Otherwise, keep it on
2595	* the locked list to be written again.
2596	*/
2597	vp = tbp->b_vp;
2598
2599	tbp->b_flags &= ~B_GATHERED;
2600
2601	LFS_BCLEAN_LOG(fs, tbp);
2602
2603	mutex_enter(&bufcache_lock);
2604	if (tbp->b_iodone == NULL) {
2605	KASSERT(tbp->b_flags & B_LOCKED);
2606	bremfree(tbp);
2607	if (vp) {
2608	mutex_enter(vp->v_interlock);
2609	reassignbuf(tbp, vp);
2610	mutex_exit(vp->v_interlock);
2611	}
2612	tbp->b_flags \|= B_ASYNC; / for biodone /
2613	}
2614
2615	if (((tbp->b_flags \| tbp->b_oflags) &
2616	(B_LOCKED \| BO_DELWRI)) == B_LOCKED)
2617	LFS_UNLOCK_BUF(tbp);
2618
2619	if (tbp->b_oflags & BO_DONE) {
2620	DLOG((DLOG_SEG, "blk %d biodone already (flags %lx)\n",
2621	cl->bufcount, (long)tbp->b_flags));
2622	}
2623
2624	if (tbp->b_iodone != NULL && !LFS_IS_MALLOC_BUF(tbp)) {
2625	/*
2626	* A buffer from the page daemon.
2627	* We use the same iodone as it does,
2628	* so we must manually disassociate its
2629	* buffers from the vp.
2630	*/
2631	if ((ovp = tbp->b_vp) != NULL) {
2632	/ This is just silly /
2633	mutex_enter(ovp->v_interlock);
2634	brelvp(tbp);
2635	mutex_exit(ovp->v_interlock);
2636	tbp->b_vp = vp;
2637	tbp->b_objlock = vp->v_interlock;
2638	}
2639	/ Put it back the way it was /
2640	tbp->b_flags \|= B_ASYNC;
2641	/ Master buffers have BC_AGE /
2642	if (tbp->b_private == tbp)
2643	tbp->b_cflags \|= BC_AGE;
2644	}
2645	mutex_exit(&bufcache_lock);
2646
2647	biodone(tbp);
2648
2649	/*
2650	* If this is the last block for this vnode, but
2651	* there are other blocks on its dirty list,
2652	* set IN_MODIFIED/IN_CLEANING depending on what
2653	* sort of block. Only do this for our mount point,
2654	* not for, e.g., inode blocks that are attached to
2655	* the devvp.
2656	* XXX KS - Shouldn't we set both if both types
2657	* of blocks are present (traverse the dirty list?)
2658	*/
2659	mutex_enter(vp->v_interlock);
2660	mutex_enter(&lfs_lock);
2661	if (vp != devvp && vp->v_numoutput == `0` &&
2662	(fbp = LIST_FIRST(&vp->v_dirtyblkhd)) != NULL) {
2663	ip = VTOI(vp);
2664	DLOG((DLOG_SEG, "lfs_cluster_aiodone: mark ino %d\n",
2665	ip->i_number));
2666	if (LFS_IS_MALLOC_BUF(fbp))
2667	LFS_SET_UINO(ip, IN_CLEANING);
2668	else
2669	LFS_SET_UINO(ip, IN_MODIFIED);
2670	}
2671	cv_broadcast(&vp->v_cv);
2672	mutex_exit(&lfs_lock);
2673	mutex_exit(vp->v_interlock);
2674	}
2675
2676	/ Fix up the cluster buffer, and release it /
2677	if (cl->flags & LFS_CL_MALLOC)
2678	lfs_free(fs, bp->b_data, LFS_NB_CLUSTER);
2679	putiobuf(bp);
2680
2681	/ Note i/o done /
2682	if (cl->flags & LFS_CL_SYNC) {
2683	if (--cl->seg->seg_iocount == `0`)
2684	wakeup(&cl->seg->seg_iocount);
2685	}
2686	mutex_enter(&lfs_lock);
2687	#ifdef DIAGNOSTIC
2688	if (fs->lfs_iocount == `0`)
2689	panic("lfs_cluster_aiodone: zero iocount");
2690	#endif
2691	if (--fs->lfs_iocount <= `1`)
2692	wakeup(&fs->lfs_iocount);
2693	mutex_exit(&lfs_lock);
2694
2695	KERNEL_UNLOCK_LAST(curlwp);
2696
2697	pool_put(&fs->lfs_bpppool, cl->bpp);
2698	cl->bpp = NULL;
2699	pool_put(&fs->lfs_clpool, cl);
2700	}
2701
2702	static void
2703	lfs_generic_callback(struct buf bp, void* (aiodone)(struct* buf *))
2704	{
2705	/ reset b_iodone for when this is a single-buf i/o. /
2706	bp->b_iodone = aiodone;
2707
2708	workqueue_enqueue(uvm.aiodone_queue, &bp->b_work, NULL);
2709	}
2710
2711	static void
2712	lfs_cluster_callback(struct buf *bp)
2713	{
2714
2715	lfs_generic_callback(bp, lfs_cluster_aiodone);
2716	}
2717
2718	void
2719	lfs_supercallback(struct buf *bp)
2720	{
2721
2722	lfs_generic_callback(bp, lfs_super_aiodone);
2723	}
2724
2725	/*
2726	* The only buffers that are going to hit these functions are the
2727	* segment write blocks, or the segment summaries, or the superblocks.
2728	*
2729	* All of the above are created by lfs_newbuf, and so do not need to be
2730	* released via brelse.
2731	*/
2732	void
2733	lfs_callback(struct buf *bp)
2734	{
2735
2736	lfs_generic_callback(bp, lfs_free_aiodone);
2737	}
2738
2739	/*
2740	* Shellsort (diminishing increment sort) from Data Structures and
2741	* Algorithms, Aho, Hopcraft and Ullman, 1983 Edition, page 290;
2742	* see also Knuth Vol. 3, page 84. The increments are selected from
2743	* formula (8), page 95. Roughly O(N^3/2).
2744	*/
2745	/*
2746	* This is our own private copy of shellsort because we want to sort
2747	* two parallel arrays (the array of buffer pointers and the array of
2748	* logical block numbers) simultaneously. Note that we cast the array
2749	* of logical block numbers to a unsigned in this routine so that the
2750	* negative block numbers (meta data blocks) sort AFTER the data blocks.
2751	*/
2752
2753	static void
2754	lfs_shellsort(struct lfs *fs,
2755	struct buf bp_array, union** lfs_blocks *lb_array,
2756	int nmemb, int size)
2757	{
2758	static int __rsshell_increments[] = { `4`, `1`, `0` };
2759	int incr, *incrp, t1, t2;
2760	struct buf *bp_temp;
2761
2762	#ifdef DEBUG
2763	incr = `0`;
2764	for (t1 = `0`; t1 < nmemb; t1++) {
2765	for (t2 = `0`; t2 * size < bp_array[t1]->b_bcount; t2++) {
2766	if (lfs_blocks_get(fs, lb_array, incr++) != bp_array[t1]->b_lblkno + t2) {
2767	/ dump before panic /
2768	printf("lfs_shellsort: nmemb=%d, size=%d\n",
2769	nmemb, size);
2770	incr = `0`;
2771	for (t1 = `0`; t1 < nmemb; t1++) {
2772	const struct buf *bp = bp_array[t1];
2773
2774	printf("bp[%d]: lbn=%" PRIu64 ", size=%"
2775	PRIu64 "\n", t1,
2776	(uint64_t)bp->b_bcount,
2777	(uint64_t)bp->b_lblkno);
2778	printf("lbns:");
2779	for (t2 = `0`; t2 * size < bp->b_bcount;
2780	t2++) {
2781	printf(" %jd",
2782	(intmax_t)lfs_blocks_get(fs, lb_array, incr++));
2783	}
2784	printf("\n");
2785	}
2786	panic("lfs_shellsort: inconsistent input");
2787	}
2788	}
2789	}
2790	#endif
2791
2792	for (incrp = __rsshell_increments; (incr = *incrp++) != `0`;)
2793	for (t1 = incr; t1 < nmemb; ++t1)
2794	for (t2 = t1 - incr; t2 >= `0`;)
2795	if ((u_int64_t)bp_array[t2]->b_lblkno >
2796	(u_int64_t)bp_array[t2 + incr]->b_lblkno) {
2797	bp_temp = bp_array[t2];
2798	bp_array[t2] = bp_array[t2 + incr];
2799	bp_array[t2 + incr] = bp_temp;
2800	t2 -= incr;
2801	} else
2802	break;
2803
2804	/ Reform the list of logical blocks /
2805	incr = `0`;
2806	for (t1 = `0`; t1 < nmemb; t1++) {
2807	for (t2 = `0`; t2 * size < bp_array[t1]->b_bcount; t2++) {
2808	lfs_blocks_set(fs, lb_array, incr++,
2809	bp_array[t1]->b_lblkno + t2);
2810	}
2811	}
2812	}
2813
2814	/*
2815	* Set up an FINFO entry for a new file. The fip pointer is assumed to
2816	* point at uninitialized space.
2817	*/
2818	void
2819	lfs_acquire_finfo(struct lfs fs, ino_t ino, int* vers)
2820	{
2821	struct segment *sp = fs->lfs_sp;
2822	SEGSUM *ssp;
2823
2824	KASSERT(vers > `0`);
2825
2826	if (sp->seg_bytes_left < lfs_sb_getbsize(fs) \|\|
2827	sp->sum_bytes_left < FINFOSIZE(fs) + LFS_BLKPTRSIZE(fs))
2828	(void) lfs_writeseg(fs, fs->lfs_sp);
2829
2830	sp->sum_bytes_left -= FINFOSIZE(fs);
2831	ssp = (SEGSUM *)sp->segsum;
2832	lfs_ss_setnfinfo(fs, ssp, lfs_ss_getnfinfo(fs, ssp) + `1`);
2833	lfs_fi_setnblocks(fs, sp->fip, `0`);
2834	lfs_fi_setino(fs, sp->fip, ino);
2835	lfs_fi_setversion(fs, sp->fip, vers);
2836	}
2837
2838	/*
2839	* Release the FINFO entry, either clearing out an unused entry or
2840	* advancing us to the next available entry.
2841	*/
2842	void
2843	lfs_release_finfo(struct lfs *fs)
2844	{
2845	struct segment *sp = fs->lfs_sp;
2846	SEGSUM *ssp;
2847
2848	if (lfs_fi_getnblocks(fs, sp->fip) != `0`) {
2849	sp->fip = NEXT_FINFO(fs, sp->fip);
2850	lfs_blocks_fromfinfo(fs, &sp->start_lbp, sp->fip);
2851	} else {
2852	/ XXX shouldn't this update sp->fip? /
2853	sp->sum_bytes_left += FINFOSIZE(fs);
2854	ssp = (SEGSUM *)sp->segsum;
2855	lfs_ss_setnfinfo(fs, ssp, lfs_ss_getnfinfo(fs, ssp) - `1`);
2856	}
2857	}
2858

Browse the source code of src/src/sys/ufs/lfs/lfs_segment.c