ffs_snapshot.c source code [src/src/sys/ufs/ffs/ffs_snapshot.c]

/*	$NetBSD: ffs_snapshot.c,v 1.143 2016/10/28 20:38:12 jdolecek Exp $	*/
2
/*
 * Copyright 2000 Marshall Kirk McKusick. All Rights Reserved.
 *
 * Further information about snapshots can be obtained from:
 *
 * Marshall Kirk McKusick http://www.mckusick.com/softdep/
 * 1614 Oxford Street mckusick@mckusick.com
 * Berkeley, CA 94709-1608 +1-510-843-9542
 * USA
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY MARSHALL KIRK MCKUSICK ``AS IS'' AND ANY
 * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 * DISCLAIMED. IN NO EVENT SHALL MARSHALL KIRK MCKUSICK BE LIABLE FOR
 * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 * @(#)ffs_snapshot.c 8.11 (McKusick) 7/23/00
 *
 * from FreeBSD: ffs_snapshot.c,v 1.79 2004/02/13 02:02:06 kuriyama Exp
 */
39
40	#include <sys/cdefs.h>
41	__KERNEL_RCSID(`0`, "$NetBSD: ffs_snapshot.c,v 1.143 2016/10/28 20:38:12 jdolecek Exp $");
42
43	#if defined(_KERNEL_OPT)
44	#include "opt_ffs.h"
45	#include "opt_quota.h"
46	#endif
47
48	#include <sys/param.h>
49	#include <sys/kernel.h>
50	#include <sys/systm.h>
51	#include <sys/conf.h>
52	#include <sys/buf.h>
53	#include <sys/proc.h>
54	#include <sys/namei.h>
55	#include <sys/sched.h>
56	#include <sys/stat.h>
57	#include <sys/malloc.h>
58	#include <sys/mount.h>
59	#include <sys/resource.h>
60	#include <sys/resourcevar.h>
61	#include <sys/vnode.h>
62	#include <sys/kauth.h>
63	#include <sys/fstrans.h>
64	#include <sys/wapbl.h>
65
66	#include <miscfs/specfs/specdev.h>
67
68	#include <ufs/ufs/quota.h>
69	#include <ufs/ufs/ufsmount.h>
70	#include <ufs/ufs/inode.h>
71	#include <ufs/ufs/ufs_extern.h>
72	#include <ufs/ufs/ufs_bswap.h>
73	#include <ufs/ufs/ufs_wapbl.h>
74
75	#include <ufs/ffs/fs.h>
76	#include <ufs/ffs/ffs_extern.h>
77
78	#include <uvm/uvm.h>
79
80	TAILQ_HEAD(inodelst, inode); / List of active snapshots /
81
82	struct snap_info {
83	kmutex_t si_lock; / Lock this snapinfo /
84	kmutex_t si_snaplock; / Snapshot vnode common lock /
85	lwp_t si_owner; /* Snaplock owner /
86	struct inodelst si_snapshots; / List of active snapshots /
87	daddr_t si_snapblklist; /* Snapshot block hints list /
88	uint32_t si_gen; / Incremented on change /
89	};
90
91	#if !defined(FFS_NO_SNAPSHOT)
92	typedef int (*acctfunc_t)
93	(struct vnode , void* , int, int, struct* fs , daddr_t, int*);
94
95	static int snapshot_setup(struct mount , struct* vnode *);
96	static int snapshot_copyfs(struct mount , struct* vnode , void* **);
97	static int snapshot_expunge(struct mount , struct* vnode *,
98	struct fs , daddr_t , daddr_t **);
99	static int snapshot_expunge_snap(struct mount , struct* vnode *,
100	struct fs *, daddr_t);
101	static int snapshot_writefs(struct mount , struct* vnode , void* *);
102	static int cgaccount(struct vnode , int, int* *);
103	static int cgaccount1(int, struct vnode , void* , int*);
104	static int expunge(struct vnode , struct* inode , struct* fs *,
105	acctfunc_t, int);
106	static int indiracct(struct vnode , struct* vnode , int*, daddr_t,
107	daddr_t, daddr_t, daddr_t, daddr_t, struct fs , acctfunc_t, int*);
108	static int fullacct(struct vnode , void* , int, int, struct* fs *,
109	daddr_t, int);
110	static int snapacct(struct vnode , void* , int, int, struct* fs *,
111	daddr_t, int);
112	static int mapacct(struct vnode , void* , int, int, struct* fs *,
113	daddr_t, int);
114	#endif /* !defined(FFS_NO_SNAPSHOT) */
115
116	static int ffs_copyonwrite(void , struct* buf *, bool);
117	static int snapblkaddr(struct vnode , daddr_t, daddr_t );
118	static int rwfsblk(struct vnode , int, void* *, daddr_t);
119	static int syncsnap(struct vnode *);
120	static int wrsnapblk(struct vnode , void* *, daddr_t);
121	#if !defined(FFS_NO_SNAPSHOT)
122	static int blocks_in_journal(struct fs *);
123	#endif
124
125	static inline bool is_active_snapshot(struct snap_info , struct* inode *);
126	static inline daddr_t db_get(struct inode , int*);
127	static inline void db_assign(struct inode , int*, daddr_t);
128	static inline daddr_t ib_get(struct inode , int*);
129	static inline daddr_t idb_get(struct inode , void* , int*);
130	static inline void idb_assign(struct inode , void* , int*, daddr_t);
131
#ifdef DEBUG
/* When non-zero, snapshot_expunge_selector() prints each vnode it examines. */
static int snapdebug = 0;
#endif
135
136	int
137	ffs_snapshot_init(struct ufsmount *ump)
138	{
139	struct snap_info *si;
140
141	si = ump->um_snapinfo = kmem_alloc(sizeof(*si), KM_SLEEP);
142	if (si == NULL)
143	return ENOMEM;
144
145	TAILQ_INIT(&si->si_snapshots);
146	mutex_init(&si->si_lock, MUTEX_DEFAULT, IPL_NONE);
147	mutex_init(&si->si_snaplock, MUTEX_DEFAULT, IPL_NONE);
148	si->si_owner = NULL;
149	si->si_gen = `0`;
150	si->si_snapblklist = NULL;
151
152	return `0`;
153	}
154
155	void
156	ffs_snapshot_fini(struct ufsmount *ump)
157	{
158	struct snap_info *si;
159
160	si = ump->um_snapinfo;
161	ump->um_snapinfo = NULL;
162
163	KASSERT(TAILQ_EMPTY(&si->si_snapshots));
164	mutex_destroy(&si->si_lock);
165	mutex_destroy(&si->si_snaplock);
166	KASSERT(si->si_snapblklist == NULL);
167	kmem_free(si, sizeof(*si));
168	}
169
170	/*
171	* Create a snapshot file and initialize it for the filesystem.
172	* Vnode is locked on entry and return.
173	*/
174	int
175	ffs_snapshot(struct mount mp, struct* vnode vp, struct* timespec *ctime)
176	{
177	#if defined(FFS_NO_SNAPSHOT)
178	return EOPNOTSUPP;
179	}
180	#else /* defined(FFS_NO_SNAPSHOT) */
181	bool suspended = false;
182	int error, redo = `0`, snaploc;
183	void *sbbuf = NULL;
184	daddr_t *snaplist = NULL, snaplistsize = `0`;
185	struct buf bp, nbp;
186	struct fs *copy_fs = NULL;
187	struct fs *fs = VFSTOUFS(mp)->um_fs;
188	struct inode *ip = VTOI(vp);
189	struct lwp *l = curlwp;
190	struct snap_info *si = VFSTOUFS(mp)->um_snapinfo;
191	struct timespec ts;
192	struct timeval starttime;
193	#ifdef DEBUG
194	struct timeval endtime;
195	#endif
196	struct vnode *devvp = ip->i_devvp;
197
198	/*
199	* If the vnode already is a snapshot, return.
200	*/
201	if ((ip->i_flags & SF_SNAPSHOT)) {
202	if ((ip->i_flags & SF_SNAPINVAL))
203	return EINVAL;
204	if (ctime) {
205	ctime->tv_sec = DIP(ip, mtime);
206	ctime->tv_nsec = DIP(ip, mtimensec);
207	}
208	return `0`;
209	}
210	/*
211	* Check for free snapshot slot in the superblock.
212	*/
213	for (snaploc = `0`; snaploc < FSMAXSNAP; snaploc++)
214	if (fs->fs_snapinum[snaploc] == `0`)
215	break;
216	if (snaploc == FSMAXSNAP)
217	return (ENOSPC);
218	/*
219	* Prepare the vnode to become a snapshot.
220	*/
221	error = snapshot_setup(mp, vp);
222	if (error)
223	goto out;
224
225	/*
226	* Copy all the cylinder group maps. Although the
227	* filesystem is still active, we hope that only a few
228	* cylinder groups will change between now and when we
229	* suspend operations. Thus, we will be able to quickly
230	* touch up the few cylinder groups that changed during
231	* the suspension period.
232	*/
233	error = cgaccount(vp, `1`, NULL);
234	if (error)
235	goto out;
236
237	/*
238	* snapshot is now valid
239	*/
240	ip->i_flags &= ~SF_SNAPINVAL;
241	DIP_ASSIGN(ip, flags, ip->i_flags);
242	ip->i_flag \|= IN_CHANGE \| IN_UPDATE;
243
244	/*
245	* Ensure that the snapshot is completely on disk.
246	* Since we have marked it as a snapshot it is safe to
247	* unlock it as no process will be allowed to write to it.
248	*/
249	error = VOP_FSYNC(vp, l->l_cred, FSYNC_WAIT, `0`, `0`);
250	if (error)
251	goto out;
252	VOP_UNLOCK(vp);
253	/*
254	* All allocations are done, so we can now suspend the filesystem.
255	*/
256	error = vfs_suspend(vp->v_mount, `0`);
257	vn_lock(vp, LK_EXCLUSIVE \| LK_RETRY);
258	if (error)
259	goto out;
260	suspended = true;
261	getmicrotime(&starttime);
262	/*
263	* First, copy all the cylinder group maps that have changed.
264	*/
265	error = cgaccount(vp, `2`, &redo);
266	if (error)
267	goto out;
268	/*
269	* Create a copy of the superblock and its summary information.
270	*/
271	error = snapshot_copyfs(mp, vp, &sbbuf);
272	if (error)
273	goto out;
274	copy_fs = (struct fs )((char* *)sbbuf + ffs_blkoff(fs, fs->fs_sblockloc));
275	/*
276	* Expunge unlinked files from our view.
277	*/
278	error = snapshot_expunge(mp, vp, copy_fs, &snaplistsize, &snaplist);
279	if (error)
280	goto out;
281	/*
282	* Record snapshot inode. Since this is the newest snapshot,
283	* it must be placed at the end of the list.
284	*/
285	if (ip->i_nlink > `0`)
286	fs->fs_snapinum[snaploc] = ip->i_number;
287
288	mutex_enter(&si->si_lock);
289	if (is_active_snapshot(si, ip))
290	panic("ffs_snapshot: %"PRIu64" already on list", ip->i_number);
291	TAILQ_INSERT_TAIL(&si->si_snapshots, ip, i_nextsnap);
292	if (TAILQ_FIRST(&si->si_snapshots) == ip) {
293	/*
294	* If this is the first snapshot on this filesystem, put the
295	* preliminary list in place and establish the cow handler.
296	*/
297	si->si_snapblklist = snaplist;
298	fscow_establish(mp, ffs_copyonwrite, devvp);
299	}
300	si->si_gen++;
301	mutex_exit(&si->si_lock);
302
303	vp->v_vflag \|= VV_SYSTEM;
304	/*
305	* Set the mtime to the time the snapshot has been taken.
306	*/
307	TIMEVAL_TO_TIMESPEC(&starttime, &ts);
308	if (ctime)
309	*ctime = ts;
310	DIP_ASSIGN(ip, mtime, ts.tv_sec);
311	DIP_ASSIGN(ip, mtimensec, ts.tv_nsec);
312	ip->i_flag \|= IN_CHANGE \| IN_UPDATE;
313	/*
314	* Copy allocation information from all snapshots and then
315	* expunge them from our view.
316	*/
317	error = snapshot_expunge_snap(mp, vp, copy_fs, snaplistsize);
318	if (error)
319	goto out;
320	/*
321	* Write the superblock and its summary information to the snapshot.
322	*/
323	error = snapshot_writefs(mp, vp, sbbuf);
324	if (error)
325	goto out;
326	/*
327	* We're nearly done, ensure that the snapshot is completely on disk.
328	*/
329	error = VOP_FSYNC(vp, l->l_cred, FSYNC_WAIT, `0`, `0`);
330	if (error)
331	goto out;
332	/*
333	* Invalidate and free all pages on the snapshot vnode.
334	* We will read and write through the buffercache.
335	*/
336	mutex_enter(vp->v_interlock);
337	error = VOP_PUTPAGES(vp, `0`, `0`,
338	PGO_ALLPAGES \| PGO_CLEANIT \| PGO_SYNCIO \| PGO_FREE);
339	if (error)
340	goto out;
341	/*
342	* Invalidate short ( < fs_bsize ) buffers. We will always read
343	* full size buffers later.
344	*/
345	mutex_enter(&bufcache_lock);
346	KASSERT(LIST_FIRST(&vp->v_dirtyblkhd) == NULL);
347	for (bp = LIST_FIRST(&vp->v_cleanblkhd); bp; bp = nbp) {
348	nbp = LIST_NEXT(bp, b_vnbufs);
349	if (bp->b_bcount == fs->fs_bsize)
350	continue;
351	error = bbusy(bp, false, `0`, NULL);
352	if (error != `0`) {
353	if (error == EPASSTHROUGH) {
354	nbp = LIST_FIRST(&vp->v_cleanblkhd);
355	continue;
356	}
357	break;
358	}
359	brelsel(bp, BC_INVAL \| BC_VFLUSH);
360	}
361	mutex_exit(&bufcache_lock);
362
363	out:
364	if (sbbuf != NULL) {
365	free(copy_fs->fs_csp, M_UFSMNT);
366	free(sbbuf, M_UFSMNT);
367	}
368	if (fs->fs_active != NULL) {
369	free(fs->fs_active, M_DEVBUF);
370	fs->fs_active = NULL;
371	}
372
373	mutex_enter(&si->si_lock);
374	if (snaplist != NULL) {
375	if (si->si_snapblklist == snaplist)
376	si->si_snapblklist = NULL;
377	free(snaplist, M_UFSMNT);
378	}
379	if (error) {
380	fs->fs_snapinum[snaploc] = `0`;
381	} else {
382	/*
383	* As this is the newest list, it is the most inclusive, so
384	* should replace the previous list.
385	*/
386	si->si_snapblklist = ip->i_snapblklist;
387	}
388	si->si_gen++;
389	mutex_exit(&si->si_lock);
390
391	if (suspended) {
392	VOP_UNLOCK(vp);
393	vfs_resume(vp->v_mount);
394	vn_lock(vp, LK_EXCLUSIVE \| LK_RETRY);
395	#ifdef DEBUG
396	getmicrotime(&endtime);
397	timersub(&endtime, &starttime, &endtime);
398	printf("%s: suspended %lld.%03d sec, redo %d of %d\n",
399	mp->mnt_stat.f_mntonname, (long long)endtime.tv_sec,
400	endtime.tv_usec / `1000`, redo, fs->fs_ncg);
401	#endif
402	}
403	if (error) {
404	if (UFS_WAPBL_BEGIN(mp) == `0`) {
405	/*
406	* We depend on ffs_truncate() to call ffs_snapremove()
407	* before it may return an error. On failed
408	* ffs_truncate() we have normal file with leaked
409	* (meta-) data, but no snapshot to use.
410	*/
411	(void) ffs_truncate(vp, (off_t)`0`, `0`, NOCRED);
412	UFS_WAPBL_END(mp);
413	}
414	} else if (ip->i_nlink > `0`)
415	vref(vp);
416	return (error);
417	}
418
419	/*
420	* Prepare vnode to become a snapshot.
421	*/
422	static int
423	snapshot_setup(struct mount mp, struct* vnode *vp)
424	{
425	int error, n, len, loc, cg;
426	daddr_t blkno, numblks;
427	struct buf ibp, nbp;
428	struct fs *fs = VFSTOUFS(mp)->um_fs;
429	struct lwp *l = curlwp;
430	const int wbreak = blocks_in_journal(fs)/`8`;
431	struct inode *ip = VTOI(vp);
432
433	/*
434	* Check mount, readonly reference and owner.
435	*/
436	if (vp->v_mount != mp)
437	return EXDEV;
438	if (vp->v_writecount != `0`)
439	return EBUSY;
440	error = kauth_authorize_system(l->l_cred, KAUTH_SYSTEM_FS_SNAPSHOT,
441	`0`, mp, vp, NULL);
442	if (error)
443	return EACCES;
444
445	if (vp->v_size != `0`) {
446	/*
447	* Must completely truncate the file here. Allocated
448	* blocks on a snapshot mean that block has been copied
449	* on write, see ffs_copyonwrite() testing "blkno != 0"
450	*/
451	error = ufs_truncate_retry(vp, `0`, NOCRED);
452	if (error)
453	return error;
454	}
455
456	/ Change inode to snapshot type file. /
457	error = UFS_WAPBL_BEGIN(mp);
458	if (error)
459	return error;
460	#if defined(QUOTA) \|\| defined(QUOTA2)
461	/ shapshot inodes are not accounted in quotas /
462	chkiq(ip, -`1`, l->l_cred, `0`);
463	#endif
464	ip->i_flags \|= (SF_SNAPSHOT \| SF_SNAPINVAL);
465	DIP_ASSIGN(ip, flags, ip->i_flags);
466	ip->i_flag \|= IN_CHANGE \| IN_UPDATE;
467	ffs_update(vp, NULL, NULL, UPDATE_WAIT);
468	UFS_WAPBL_END(mp);
469
470	KASSERT(ip->i_flags & SF_SNAPSHOT);
471	/*
472	* Write an empty list of preallocated blocks to the end of
473	* the snapshot to set size to at least that of the filesystem.
474	*/
475	numblks = howmany(fs->fs_size, fs->fs_frag);
476	blkno = `1`;
477	blkno = ufs_rw64(blkno, UFS_FSNEEDSWAP(fs));
478	error = vn_rdwr(UIO_WRITE, vp,
479	(void )&blkno, sizeof*(blkno), ffs_lblktosize(fs, (off_t)numblks),
480	UIO_SYSSPACE, IO_NODELOCKED\|IO_UNIT, l->l_cred, NULL, NULL);
481	if (error)
482	return error;
483	/*
484	* Preallocate critical data structures so that we can copy
485	* them in without further allocation after we suspend all
486	* operations on the filesystem. We would like to just release
487	* the allocated buffers without writing them since they will
488	* be filled in below once we are ready to go, but this upsets
489	* the soft update code, so we go ahead and write the new buffers.
490	*
491	* Allocate all indirect blocks and mark all of them as not
492	* needing to be copied.
493	*/
494	error = UFS_WAPBL_BEGIN(mp);
495	if (error)
496	return error;
497	for (blkno = UFS_NDADDR, n = `0`; blkno < numblks; blkno += FFS_NINDIR(fs)) {
498	error = ffs_balloc(vp, ffs_lblktosize(fs, (off_t)blkno),
499	fs->fs_bsize, l->l_cred, B_METAONLY, &ibp);
500	if (error)
501	goto out;
502	brelse(ibp, `0`);
503	if (wbreak > `0` && (++n % wbreak) == `0`) {
504	UFS_WAPBL_END(mp);
505	error = UFS_WAPBL_BEGIN(mp);
506	if (error)
507	return error;
508	}
509	}
510	/*
511	* Allocate copies for the superblock and its summary information.
512	*/
513	error = ffs_balloc(vp, fs->fs_sblockloc, fs->fs_sbsize, l->l_cred,
514	`0`, &nbp);
515	if (error)
516	goto out;
517	bawrite(nbp);
518	blkno = ffs_fragstoblks(fs, fs->fs_csaddr);
519	len = howmany(fs->fs_cssize, fs->fs_bsize);
520	for (loc = `0`; loc < len; loc++) {
521	error = ffs_balloc(vp, ffs_lblktosize(fs, (off_t)(blkno + loc)),
522	fs->fs_bsize, l->l_cred, `0`, &nbp);
523	if (error)
524	goto out;
525	bawrite(nbp);
526	if (wbreak > `0` && (++n % wbreak) == `0`) {
527	UFS_WAPBL_END(mp);
528	error = UFS_WAPBL_BEGIN(mp);
529	if (error)
530	return error;
531	}
532	}
533	/*
534	* Allocate all cylinder group blocks.
535	*/
536	for (cg = `0`; cg < fs->fs_ncg; cg++) {
537	error = ffs_balloc(vp, ffs_lfragtosize(fs, cgtod(fs, cg)),
538	fs->fs_bsize, l->l_cred, `0`, &nbp);
539	if (error)
540	goto out;
541	bawrite(nbp);
542	if (wbreak > `0` && (++n % wbreak) == `0`) {
543	UFS_WAPBL_END(mp);
544	error = UFS_WAPBL_BEGIN(mp);
545	if (error)
546	return error;
547	}
548	}
549
550	out:
551	UFS_WAPBL_END(mp);
552	return error;
553	}
554
555	/*
556	* Create a copy of the superblock and its summary information.
557	* It is up to the caller to free copyfs and copy_fs->fs_csp.
558	*/
559	static int
560	snapshot_copyfs(struct mount mp, struct* vnode vp, void* **sbbuf)
561	{
562	int error, i, len, loc, size;
563	void *space;
564	int32_t *lp;
565	struct buf *bp;
566	struct fs copyfs, fs = VFSTOUFS(mp)->um_fs;
567	struct vnode *devvp = VTOI(vp)->i_devvp;
568
569	/*
570	* Grab a copy of the superblock and its summary information.
571	* We delay writing it until the suspension is released below.
572	*/
573	*sbbuf = malloc(fs->fs_bsize, M_UFSMNT, M_WAITOK);
574	loc = ffs_blkoff(fs, fs->fs_sblockloc);
575	if (loc > `0`)
576	memset(*sbbuf, `0`, loc);
577	copyfs = (struct fs )((char* )(sbbuf) + loc);
578	memcpy(copyfs, fs, fs->fs_sbsize);
579	size = fs->fs_bsize < SBLOCKSIZE ? fs->fs_bsize : SBLOCKSIZE;
580	if (fs->fs_sbsize < size)
581	memset((char )(sbbuf) + loc + fs->fs_sbsize, `0`,
582	size - fs->fs_sbsize);
583	size = ffs_blkroundup(fs, fs->fs_cssize);
584	if (fs->fs_contigsumsize > `0`)
585	size += fs->fs_ncg * sizeof(int32_t);
586	space = malloc(size, M_UFSMNT, M_WAITOK);
587	copyfs->fs_csp = space;
588	memcpy(copyfs->fs_csp, fs->fs_csp, fs->fs_cssize);
589	space = (char *)space + fs->fs_cssize;
590	loc = howmany(fs->fs_cssize, fs->fs_fsize);
591	i = fs->fs_frag - loc % fs->fs_frag;
592	len = (i == fs->fs_frag) ? `0` : i * fs->fs_fsize;
593	if (len > `0`) {
594	if ((error = bread(devvp, FFS_FSBTODB(fs, fs->fs_csaddr + loc),
595	len, `0`, &bp)) != `0`) {
596	free(copyfs->fs_csp, M_UFSMNT);
597	free(*sbbuf, M_UFSMNT);
598	*sbbuf = NULL;
599	return error;
600	}
601	memcpy(space, bp->b_data, (u_int)len);
602	space = (char *)space + len;
603	brelse(bp, BC_INVAL \| BC_NOCACHE);
604	}
605	if (fs->fs_contigsumsize > `0`) {
606	copyfs->fs_maxcluster = lp = space;
607	for (i = `0`; i < fs->fs_ncg; i++)
608	*lp++ = fs->fs_contigsumsize;
609	}
610	if (mp->mnt_wapbl)
611	copyfs->fs_flags &= ~FS_DOWAPBL;
612	return `0`;
613	}
614
/*
 * Arguments passed from snapshot_expunge() to snapshot_expunge_selector()
 * through the vnode iterator.
 */
struct snapshot_expunge_ctx {
	struct vnode *logvp;	/* in-filesystem journal vnode, or NULL */
	struct lwp *l;		/* lwp whose credentials are used */
	struct vnode *vp;	/* the snapshot vnode */
	struct fs *copy_fs;	/* the snapshot's superblock copy */
};
621
622	static bool
623	snapshot_expunge_selector(void cl, struct* vnode *xvp)
624	{
625	struct vattr vat;
626	struct snapshot_expunge_ctx *c = cl;
627	struct inode *xp;
628
629	xp = VTOI(xvp);
630	if (xvp->v_type == VNON \|\| VTOI(xvp) == NULL \|\|
631	(xp->i_flags & SF_SNAPSHOT))
632	return false;
633	#ifdef DEBUG
634	if (snapdebug)
635	vprint("ffs_snapshot: busy vnode", xvp);
636	#endif
637
638	if (xvp == c->logvp)
639	return true;
640
641	if (VOP_GETATTR(xvp, &vat, c->l->l_cred) == `0` &&
642	vat.va_nlink > `0`)
643	return false;
644
645	if (ffs_checkfreefile(c->copy_fs, c->vp, xp->i_number))
646	return false;
647
648	return true;
649	}
650
651	/*
652	* We must check for active files that have been unlinked (e.g., with a zero
653	* link count). We have to expunge all trace of these files from the snapshot
654	* so that they are not reclaimed prematurely by fsck or unnecessarily dumped.
655	* Note that we skip unlinked snapshot files as they will be handled separately.
656	* Calculate the snapshot list size and create a preliminary list.
657	*/
658	static int
659	snapshot_expunge(struct mount mp, struct* vnode vp, struct* fs *copy_fs,
660	daddr_t snaplistsize, daddr_t *snaplist)
661	{
662	int cg, error = `0`, len, loc;
663	daddr_t blkno, *blkp;
664	struct fs *fs = VFSTOUFS(mp)->um_fs;
665	struct inode *xp;
666	struct lwp *l = curlwp;
667	struct vnode logvp = NULL, xvp;
668	struct vnode_iterator *marker;
669	struct snapshot_expunge_ctx ctx;
670
671	*snaplist = NULL;
672	/*
673	* Get the log inode if any.
674	*/
675	if ((fs->fs_flags & FS_DOWAPBL) &&
676	fs->fs_journal_location == UFS_WAPBL_JOURNALLOC_IN_FILESYSTEM) {
677	error = VFS_VGET(mp,
678	fs->fs_journallocs[UFS_WAPBL_INFS_INO], &logvp);
679	if (error)
680	goto out;
681	}
682	/*
683	* We also calculate the needed size for the snapshot list.
684	*/
685	*snaplistsize = fs->fs_ncg + howmany(fs->fs_cssize, fs->fs_bsize) +
686	FSMAXSNAP + `1` / superblock / + `1` / last block / + `1` / size /;
687
688	vfs_vnode_iterator_init(mp, &marker);
689	ctx.logvp = logvp;
690	ctx.l = l;
691	ctx.vp = vp;
692	ctx.copy_fs = copy_fs;
693	while ((xvp = vfs_vnode_iterator_next(marker, snapshot_expunge_selector,
694	&ctx)))
695	{
696	/*
697	* If there is a fragment, clear it here.
698	*/
699	xp = VTOI(xvp);
700	blkno = `0`;
701	loc = howmany(xp->i_size, fs->fs_bsize) - `1`;
702	if (loc < UFS_NDADDR) {
703	len = ffs_fragroundup(fs, ffs_blkoff(fs, xp->i_size));
704	if (len > `0` && len < fs->fs_bsize) {
705	error = UFS_WAPBL_BEGIN(mp);
706	if (error) {
707	vrele(xvp);
708	vfs_vnode_iterator_destroy(marker);
709	goto out;
710	}
711	ffs_blkfree_snap(copy_fs, vp, db_get(xp, loc),
712	len, xp->i_number);
713	blkno = db_get(xp, loc);
714	db_assign(xp, loc, `0`);
715	UFS_WAPBL_END(mp);
716	}
717	}
718	*snaplistsize += `1`;
719	error = expunge(vp, xp, copy_fs, fullacct, BLK_NOCOPY);
720	if (blkno)
721	db_assign(xp, loc, blkno);
722	if (!error) {
723	error = UFS_WAPBL_BEGIN(mp);
724	if (!error) {
725	error = ffs_freefile_snap(copy_fs, vp,
726	xp->i_number, xp->i_mode);
727	UFS_WAPBL_END(mp);
728	}
729	}
730	vrele(xvp);
731	if (error) {
732	vfs_vnode_iterator_destroy(marker);
733	goto out;
734	}
735	}
736	vfs_vnode_iterator_destroy(marker);
737
738	/*
739	* Create a preliminary list of preallocated snapshot blocks.
740	*/
741	snaplist = malloc(snaplistsize * sizeof(daddr_t), M_UFSMNT, M_WAITOK);
742	blkp = &(*snaplist)[`1`];
743	*blkp++ = ffs_lblkno(fs, fs->fs_sblockloc);
744	blkno = ffs_fragstoblks(fs, fs->fs_csaddr);
745	for (cg = `0`; cg < fs->fs_ncg; cg++) {
746	if (ffs_fragstoblks(fs, cgtod(fs, cg)) > blkno)
747	break;
748	*blkp++ = ffs_fragstoblks(fs, cgtod(fs, cg));
749	}
750	len = howmany(fs->fs_cssize, fs->fs_bsize);
751	for (loc = `0`; loc < len; loc++)
752	*blkp++ = blkno + loc;
753	for (; cg < fs->fs_ncg; cg++)
754	*blkp++ = ffs_fragstoblks(fs, cgtod(fs, cg));
755	(snaplist)[`0`] = blkp - &(snaplist)[`0`];
756
757	out:
758	if (logvp != NULL)
759	vput(logvp);
760	if (error && *snaplist != NULL) {
761	free(*snaplist, M_UFSMNT);
762	*snaplist = NULL;
763	}
764
765	return error;
766	}
767
768	/*
769	* Copy allocation information from all the snapshots in this snapshot and
770	* then expunge them from its view. Also, collect the list of allocated
771	* blocks in i_snapblklist.
772	*/
773	static int
774	snapshot_expunge_snap(struct mount mp, struct* vnode *vp,
775	struct fs *copy_fs, daddr_t snaplistsize)
776	{
777	int error = `0`, i;
778	daddr_t numblks, *snaplist = NULL;
779	struct fs *fs = VFSTOUFS(mp)->um_fs;
780	struct inode ip = VTOI(vp), xp;
781	struct lwp *l = curlwp;
782	struct snap_info *si = VFSTOUFS(mp)->um_snapinfo;
783
784	TAILQ_FOREACH(xp, &si->si_snapshots, i_nextsnap) {
785	if (xp != ip) {
786	error = expunge(vp, xp, fs, snapacct, BLK_SNAP);
787	if (error)
788	break;
789	}
790	if (xp->i_nlink != `0`)
791	continue;
792	error = UFS_WAPBL_BEGIN(mp);
793	if (error)
794	break;
795	error = ffs_freefile_snap(copy_fs, vp, xp->i_number, xp->i_mode);
796	UFS_WAPBL_END(mp);
797	if (error)
798	break;
799	}
800	if (error)
801	goto out;
802	/*
803	* Allocate space for the full list of preallocated snapshot blocks.
804	*/
805	snaplist = malloc(snaplistsize * sizeof(daddr_t), M_UFSMNT, M_WAITOK);
806	ip->i_snapblklist = &snaplist[`1`];
807	/*
808	* Expunge the blocks used by the snapshots from the set of
809	* blocks marked as used in the snapshot bitmaps. Also, collect
810	* the list of allocated blocks in i_snapblklist.
811	*/
812	error = expunge(vp, ip, copy_fs, mapacct, BLK_SNAP);
813	if (error)
814	goto out;
815	if (snaplistsize < ip->i_snapblklist - snaplist)
816	panic("ffs_snapshot: list too small");
817	snaplistsize = ip->i_snapblklist - snaplist;
818	snaplist[`0`] = snaplistsize;
819	ip->i_snapblklist = &snaplist[`0`];
820	/*
821	* Write out the list of allocated blocks to the end of the snapshot.
822	*/
823	numblks = howmany(fs->fs_size, fs->fs_frag);
824	for (i = `0`; i < snaplistsize; i++)
825	snaplist[i] = ufs_rw64(snaplist[i], UFS_FSNEEDSWAP(fs));
826	error = vn_rdwr(UIO_WRITE, vp, (void *)snaplist,
827	snaplistsize * sizeof(daddr_t), ffs_lblktosize(fs, (off_t)numblks),
828	UIO_SYSSPACE, IO_NODELOCKED \| IO_UNIT, l->l_cred, NULL, NULL);
829	for (i = `0`; i < snaplistsize; i++)
830	snaplist[i] = ufs_rw64(snaplist[i], UFS_FSNEEDSWAP(fs));
831	out:
832	if (error && snaplist != NULL) {
833	free(snaplist, M_UFSMNT);
834	ip->i_snapblklist = NULL;
835	}
836	return error;
837	}
838
839	/*
840	* Write the superblock and its summary information to the snapshot.
841	* Make sure, the first UFS_NDADDR blocks get copied to the snapshot.
842	*/
843	static int
844	snapshot_writefs(struct mount mp, struct* vnode vp, void* *sbbuf)
845	{
846	int error, len, loc;
847	void *space;
848	daddr_t blkno;
849	struct buf *bp;
850	struct fs copyfs, fs = VFSTOUFS(mp)->um_fs;
851	struct inode *ip = VTOI(vp);
852	struct lwp *l = curlwp;
853
854	copyfs = (struct fs )((char* *)sbbuf + ffs_blkoff(fs, fs->fs_sblockloc));
855
856	/*
857	* Write the superblock and its summary information
858	* to the snapshot.
859	*/
860	blkno = ffs_fragstoblks(fs, fs->fs_csaddr);
861	len = howmany(fs->fs_cssize, fs->fs_bsize);
862	space = copyfs->fs_csp;
863	#ifdef FFS_EI
864	if (UFS_FSNEEDSWAP(fs)) {
865	ffs_sb_swap(copyfs, copyfs);
866	ffs_csum_swap(space, space, fs->fs_cssize);
867	}
868	#endif
869	error = UFS_WAPBL_BEGIN(mp);
870	if (error)
871	return error;
872	for (loc = `0`; loc < len; loc++) {
873	error = bread(vp, blkno + loc, fs->fs_bsize,
874	B_MODIFY, &bp);
875	if (error) {
876	break;
877	}
878	memcpy(bp->b_data, space, fs->fs_bsize);
879	space = (char *)space + fs->fs_bsize;
880	bawrite(bp);
881	}
882	if (error)
883	goto out;
884	error = bread(vp, ffs_lblkno(fs, fs->fs_sblockloc),
885	fs->fs_bsize, B_MODIFY, &bp);
886	if (error) {
887	goto out;
888	} else {
889	memcpy(bp->b_data, sbbuf, fs->fs_bsize);
890	bawrite(bp);
891	}
892	/*
893	* Copy the first UFS_NDADDR blocks to the snapshot so
894	* ffs_copyonwrite() and ffs_snapblkfree() will always work on
895	* indirect blocks.
896	*/
897	for (loc = `0`; loc < UFS_NDADDR; loc++) {
898	if (db_get(ip, loc) != `0`)
899	continue;
900	error = ffs_balloc(vp, ffs_lblktosize(fs, (off_t)loc),
901	fs->fs_bsize, l->l_cred, `0`, &bp);
902	if (error)
903	break;
904	error = rwfsblk(vp, B_READ, bp->b_data, loc);
905	if (error) {
906	brelse(bp, `0`);
907	break;
908	}
909	bawrite(bp);
910	}
911
912	out:
913	UFS_WAPBL_END(mp);
914	return error;
915	}
916
917	/*
918	* Copy all cylinder group maps.
919	*/
920	static int
921	cgaccount(struct vnode vp, int* passno, int *redo)
922	{
923	int cg, error = `0`;
924	struct buf *nbp;
925	struct fs *fs = VTOI(vp)->i_fs;
926
927	if (redo != NULL)
928	*redo = `0`;
929	if (passno == `1`)
930	fs->fs_active = malloc(howmany(fs->fs_ncg, NBBY),
931	M_DEVBUF, M_WAITOK \| M_ZERO);
932	for (cg = `0`; cg < fs->fs_ncg; cg++) {
933	if (passno == `2` && ACTIVECG_ISSET(fs, cg))
934	continue;
935
936	if (redo != NULL)
937	*redo += `1`;
938	error = UFS_WAPBL_BEGIN(vp->v_mount);
939	if (error)
940	return error;
941	error = ffs_balloc(vp, ffs_lfragtosize(fs, cgtod(fs, cg)),
942	fs->fs_bsize, curlwp->l_cred, `0`, &nbp);
943	if (error) {
944	UFS_WAPBL_END(vp->v_mount);
945	break;
946	}
947	error = cgaccount1(cg, vp, nbp->b_data, passno);
948	bawrite(nbp);
949	UFS_WAPBL_END(vp->v_mount);
950	if (error)
951	break;
952	}
953	return error;
954	}
955
956	/*
957	* Copy a cylinder group map. All the unallocated blocks are marked
958	* BLK_NOCOPY so that the snapshot knows that it need not copy them
959	* if they are later written. If passno is one, then this is a first
960	* pass, so only setting needs to be done. If passno is 2, then this
961	* is a revision to a previous pass which must be undone as the
962	* replacement pass is done.
963	*/
964	static int
965	cgaccount1(int cg, struct vnode vp, void* data, int* passno)
966	{
967	struct buf bp, ibp;
968	struct inode *ip;
969	struct cg *cgp;
970	struct fs *fs;
971	struct lwp *l = curlwp;
972	daddr_t base, numblks;
973	int error, len, loc, ns __unused, indiroff;
974
975	ip = VTOI(vp);
976	fs = ip->i_fs;
977	ns = UFS_FSNEEDSWAP(fs);
978	error = bread(ip->i_devvp, FFS_FSBTODB(fs, cgtod(fs, cg)),
979	(int)fs->fs_cgsize, `0`, &bp);
980	if (error) {
981	return (error);
982	}
983	cgp = (struct cg *)bp->b_data;
984	if (!cg_chkmagic(cgp, ns)) {
985	brelse(bp, `0`);
986	return (EIO);
987	}
988	ACTIVECG_SET(fs, cg);
989
990	memcpy(data, bp->b_data, fs->fs_cgsize);
991	brelse(bp, `0`);
992	if (fs->fs_cgsize < fs->fs_bsize)
993	memset((char *)data + fs->fs_cgsize, `0`,
994	fs->fs_bsize - fs->fs_cgsize);
995	numblks = howmany(fs->fs_size, fs->fs_frag);
996	len = howmany(fs->fs_fpg, fs->fs_frag);
997	base = cg * fs->fs_fpg / fs->fs_frag;
998	if (base + len >= numblks)
999	len = numblks - base - `1`;
1000	loc = `0`;
1001	if (base < UFS_NDADDR) {
1002	for ( ; loc < UFS_NDADDR; loc++) {
1003	if (ffs_isblock(fs, cg_blksfree(cgp, ns), loc))
1004	db_assign(ip, loc, BLK_NOCOPY);
1005	else if (db_get(ip, loc) == BLK_NOCOPY) {
1006	if (passno == `2`)
1007	db_assign(ip, loc, `0`);
1008	else if (passno == `1`)
1009	panic("ffs_snapshot: lost direct block");
1010	}
1011	}
1012	}
1013	if ((error = ffs_balloc(vp, ffs_lblktosize(fs, (off_t)(base + loc)),
1014	fs->fs_bsize, l->l_cred, B_METAONLY, &ibp)) != `0`)
1015	return (error);
1016	indiroff = (base + loc - UFS_NDADDR) % FFS_NINDIR(fs);
1017	for ( ; loc < len; loc++, indiroff++) {
1018	if (indiroff >= FFS_NINDIR(fs)) {
1019	bawrite(ibp);
1020	if ((error = ffs_balloc(vp,
1021	ffs_lblktosize(fs, (off_t)(base + loc)),
1022	fs->fs_bsize, l->l_cred, B_METAONLY, &ibp)) != `0`)
1023	return (error);
1024	indiroff = `0`;
1025	}
1026	if (ffs_isblock(fs, cg_blksfree(cgp, ns), loc))
1027	idb_assign(ip, ibp->b_data, indiroff, BLK_NOCOPY);
1028	else if (idb_get(ip, ibp->b_data, indiroff) == BLK_NOCOPY) {
1029	if (passno == `2`)
1030	idb_assign(ip, ibp->b_data, indiroff, `0`);
1031	else if (passno == `1`)
1032	panic("ffs_snapshot: lost indirect block");
1033	}
1034	}
1035	bdwrite(ibp);
1036	return (`0`);
1037	}
1038
1039	/*
1040	* Before expunging a snapshot inode, note all the
1041	* blocks that it claims with BLK_SNAP so that fsck will
1042	* be able to account for those blocks properly and so
1043	* that this snapshot knows that it need not copy them
1044	* if the other snapshot holding them is freed.
1045	*/
1046	static int
1047	expunge(struct vnode snapvp, struct* inode cancelip, struct* fs *fs,
1048	acctfunc_t acctfunc, int expungetype)
1049	{
1050	int i, error, ns __unused;
1051	daddr_t lbn, rlbn;
1052	daddr_t len, blkno, numblks, blksperindir;
1053	struct ufs1_dinode *dip1;
1054	struct ufs2_dinode *dip2;
1055	struct lwp *l = curlwp;
1056	void *bap;
1057	struct buf *bp;
1058	struct mount *mp;
1059
1060	ns = UFS_FSNEEDSWAP(fs);
1061	mp = snapvp->v_mount;
1062
1063	error = UFS_WAPBL_BEGIN(mp);
1064	if (error)
1065	return error;
1066	/*
1067	* Prepare to expunge the inode. If its inode block has not
1068	* yet been copied, then allocate and fill the copy.
1069	*/
1070	lbn = ffs_fragstoblks(fs, ino_to_fsba(fs, cancelip->i_number));
1071	error = snapblkaddr(snapvp, lbn, &blkno);
1072	if (error)
1073	return error;
1074	if (blkno != `0`) {
1075	error = bread(snapvp, lbn, fs->fs_bsize,
1076	B_MODIFY, &bp);
1077	} else {
1078	error = ffs_balloc(snapvp, ffs_lblktosize(fs, (off_t)lbn),
1079	fs->fs_bsize, l->l_cred, `0`, &bp);
1080	if (! error)
1081	error = rwfsblk(snapvp, B_READ, bp->b_data, lbn);
1082	}
1083	if (error) {
1084	UFS_WAPBL_END(mp);
1085	return error;
1086	}
1087	/*
1088	* Set a snapshot inode to be a zero length file, regular files
1089	* or unlinked snapshots to be completely unallocated.
1090	*/
1091	if (fs->fs_magic == FS_UFS1_MAGIC) {
1092	dip1 = (struct ufs1_dinode *)bp->b_data +
1093	ino_to_fsbo(fs, cancelip->i_number);
1094	if (cancelip->i_flags & SF_SNAPSHOT) {
1095	dip1->di_flags =
1096	ufs_rw32(ufs_rw32(dip1->di_flags, ns) \|
1097	SF_SNAPINVAL, ns);
1098	}
1099	if (expungetype == BLK_NOCOPY \|\| cancelip->i_nlink == `0`)
1100	dip1->di_mode = `0`;
1101	dip1->di_size = `0`;
1102	dip1->di_blocks = `0`;
1103	memset(&dip1->di_db[`0`], `0`, (UFS_NDADDR + UFS_NIADDR) * sizeof(int32_t));
1104	} else {
1105	dip2 = (struct ufs2_dinode *)bp->b_data +
1106	ino_to_fsbo(fs, cancelip->i_number);
1107	if (cancelip->i_flags & SF_SNAPSHOT) {
1108	dip2->di_flags =
1109	ufs_rw32(ufs_rw32(dip2->di_flags, ns) \|
1110	SF_SNAPINVAL, ns);
1111	}
1112	if (expungetype == BLK_NOCOPY \|\| cancelip->i_nlink == `0`)
1113	dip2->di_mode = `0`;
1114	dip2->di_size = `0`;
1115	dip2->di_blocks = `0`;
1116	memset(&dip2->di_db[`0`], `0`, (UFS_NDADDR + UFS_NIADDR) * sizeof(int64_t));
1117	}
1118	bdwrite(bp);
1119	UFS_WAPBL_END(mp);
1120	/*
1121	* Now go through and expunge all the blocks in the file
1122	* using the function requested.
1123	*/
1124	numblks = howmany(cancelip->i_size, fs->fs_bsize);
1125	if (fs->fs_magic == FS_UFS1_MAGIC)
1126	bap = &cancelip->i_ffs1_db[`0`];
1127	else
1128	bap = &cancelip->i_ffs2_db[`0`];
1129	error = (*acctfunc)(snapvp, bap, `0`, UFS_NDADDR, fs, `0`, expungetype);
1130	if (error)
1131	return (error);
1132	if (fs->fs_magic == FS_UFS1_MAGIC)
1133	bap = &cancelip->i_ffs1_ib[`0`];
1134	else
1135	bap = &cancelip->i_ffs2_ib[`0`];
1136	error = (*acctfunc)(snapvp, bap, `0`, UFS_NIADDR, fs, -`1`, expungetype);
1137	if (error)
1138	return (error);
1139	blksperindir = `1`;
1140	lbn = -UFS_NDADDR;
1141	len = numblks - UFS_NDADDR;
1142	rlbn = UFS_NDADDR;
1143	for (i = `0`; len > `0` && i < UFS_NIADDR; i++) {
1144	error = indiracct(snapvp, ITOV(cancelip), i,
1145	ib_get(cancelip, i), lbn, rlbn, len,
1146	blksperindir, fs, acctfunc, expungetype);
1147	if (error)
1148	return (error);
1149	blksperindir *= FFS_NINDIR(fs);
1150	lbn -= blksperindir + `1`;
1151	len -= blksperindir;
1152	rlbn += blksperindir;
1153	}
1154	return (`0`);
1155	}
1156
1157	/*
1158	* Descend an indirect block chain for vnode cancelvp accounting for all
1159	* its indirect blocks in snapvp.
1160	*/
1161	static int
1162	indiracct(struct vnode snapvp, struct* vnode cancelvp, int* level,
1163	daddr_t blkno, daddr_t lbn, daddr_t rlbn, daddr_t remblks,
1164	daddr_t blksperindir, struct fs fs, acctfunc_t acctfunc, int* expungetype)
1165	{
1166	int error, num, i;
1167	daddr_t subblksperindir;
1168	struct indir indirs[UFS_NIADDR + `2`];
1169	daddr_t last;
1170	void *bap;
1171	struct buf *bp;
1172
1173	if (blkno == `0`) {
1174	if (expungetype == BLK_NOCOPY)
1175	return (`0`);
1176	panic("indiracct: missing indir");
1177	}
1178	if ((error = ufs_getlbns(cancelvp, rlbn, indirs, &num)) != `0`)
1179	return (error);
1180	if (lbn != indirs[num - `1` - level].in_lbn \|\| num < `2`)
1181	panic("indiracct: botched params");
1182	/*
1183	* We have to expand bread here since it will deadlock looking
1184	* up the block number for any blocks that are not in the cache.
1185	*/
1186	error = ffs_getblk(cancelvp, lbn, FFS_FSBTODB(fs, blkno), fs->fs_bsize,
1187	false, &bp);
1188	if (error)
1189	return error;
1190	if ((bp->b_oflags & (BO_DONE \| BO_DELWRI)) == `0` && (error =
1191	rwfsblk(bp->b_vp, B_READ, bp->b_data, ffs_fragstoblks(fs, blkno)))) {
1192	brelse(bp, `0`);
1193	return (error);
1194	}
1195	/*
1196	* Account for the block pointers in this indirect block.
1197	*/
1198	last = howmany(remblks, blksperindir);
1199	if (last > FFS_NINDIR(fs))
1200	last = FFS_NINDIR(fs);
1201	bap = malloc(fs->fs_bsize, M_DEVBUF, M_WAITOK \| M_ZERO);
1202	memcpy((void *)bap, bp->b_data, fs->fs_bsize);
1203	brelse(bp, `0`);
1204	error = (*acctfunc)(snapvp, bap, `0`, last,
1205	fs, level == `0` ? rlbn : -`1`, expungetype);
1206	if (error \|\| level == `0`)
1207	goto out;
1208	/*
1209	* Account for the block pointers in each of the indirect blocks
1210	* in the levels below us.
1211	*/
1212	subblksperindir = blksperindir / FFS_NINDIR(fs);
1213	for (lbn++, level--, i = `0`; i < last; i++) {
1214	error = indiracct(snapvp, cancelvp, level,
1215	idb_get(VTOI(snapvp), bap, i), lbn, rlbn, remblks,
1216	subblksperindir, fs, acctfunc, expungetype);
1217	if (error)
1218	goto out;
1219	rlbn += blksperindir;
1220	lbn -= blksperindir;
1221	remblks -= blksperindir;
1222	}
1223	out:
1224	free(bap, M_DEVBUF);
1225	return (error);
1226	}
1227
1228	/*
1229	* Do both snap accounting and map accounting.
1230	*/
1231	static int
1232	fullacct(struct vnode vp, void* bap, int* oldblkp, int lastblkp,
1233	struct fs *fs, daddr_t lblkno,
1234	int exptype / BLK_SNAP or BLK_NOCOPY /)
1235	{
1236	int error;
1237
1238	if ((error = snapacct(vp, bap, oldblkp, lastblkp, fs, lblkno, exptype)))
1239	return (error);
1240	return (mapacct(vp, bap, oldblkp, lastblkp, fs, lblkno, exptype));
1241	}
1242
1243	/*
1244	* Identify a set of blocks allocated in a snapshot inode.
1245	*/
1246	static int
1247	snapacct(struct vnode vp, void* bap, int* oldblkp, int lastblkp,
1248	struct fs *fs, daddr_t lblkno,
1249	int expungetype / BLK_SNAP or BLK_NOCOPY /)
1250	{
1251	struct inode *ip = VTOI(vp);
1252	struct lwp *l = curlwp;
1253	struct mount *mp = vp->v_mount;
1254	daddr_t blkno;
1255	daddr_t lbn;
1256	struct buf *ibp;
1257	int error, n;
1258	const int wbreak = blocks_in_journal(VFSTOUFS(mp)->um_fs)/`8`;
1259
1260	error = UFS_WAPBL_BEGIN(mp);
1261	if (error)
1262	return error;
1263	for ( n = `0`; oldblkp < lastblkp; oldblkp++) {
1264	blkno = idb_get(ip, bap, oldblkp);
1265	if (blkno == `0` \|\| blkno == BLK_NOCOPY \|\| blkno == BLK_SNAP)
1266	continue;
1267	lbn = ffs_fragstoblks(fs, blkno);
1268	if (lbn < UFS_NDADDR) {
1269	blkno = db_get(ip, lbn);
1270	ip->i_flag \|= IN_CHANGE \| IN_UPDATE;
1271	} else {
1272	error = ffs_balloc(vp, ffs_lblktosize(fs, (off_t)lbn),
1273	fs->fs_bsize, l->l_cred, B_METAONLY, &ibp);
1274	if (error)
1275	break;
1276	blkno = idb_get(ip, ibp->b_data,
1277	(lbn - UFS_NDADDR) % FFS_NINDIR(fs));
1278	}
1279	/*
1280	* If we are expunging a snapshot vnode and we
1281	* find a block marked BLK_NOCOPY, then it is
1282	* one that has been allocated to this snapshot after
1283	* we took our current snapshot and can be ignored.
1284	*/
1285	if (expungetype == BLK_SNAP && blkno == BLK_NOCOPY) {
1286	if (lbn >= UFS_NDADDR)
1287	brelse(ibp, `0`);
1288	} else {
1289	if (blkno != `0`)
1290	panic("snapacct: bad block");
1291	if (lbn < UFS_NDADDR)
1292	db_assign(ip, lbn, expungetype);
1293	else {
1294	idb_assign(ip, ibp->b_data,
1295	(lbn - UFS_NDADDR) % FFS_NINDIR(fs), expungetype);
1296	bdwrite(ibp);
1297	}
1298	}
1299	if (wbreak > `0` && (++n % wbreak) == `0`) {
1300	UFS_WAPBL_END(mp);
1301	error = UFS_WAPBL_BEGIN(mp);
1302	if (error)
1303	return error;
1304	}
1305	}
1306	UFS_WAPBL_END(mp);
1307	return error;
1308	}
1309
1310	/*
1311	* Account for a set of blocks allocated in a snapshot inode.
1312	*/
1313	static int
1314	mapacct(struct vnode vp, void* bap, int* oldblkp, int lastblkp,
1315	struct fs fs, daddr_t lblkno, int* expungetype)
1316	{
1317	daddr_t blkno;
1318	struct inode *ip;
1319	struct mount *mp = vp->v_mount;
1320	ino_t inum;
1321	int acctit, error, n;
1322	const int wbreak = blocks_in_journal(VFSTOUFS(mp)->um_fs)/`8`;
1323
1324	error = UFS_WAPBL_BEGIN(mp);
1325	if (error)
1326	return error;
1327	ip = VTOI(vp);
1328	inum = ip->i_number;
1329	if (lblkno == -`1`)
1330	acctit = `0`;
1331	else
1332	acctit = `1`;
1333	for ( n = `0`; oldblkp < lastblkp; oldblkp++, lblkno++) {
1334	blkno = idb_get(ip, bap, oldblkp);
1335	if (blkno == `0` \|\| blkno == BLK_NOCOPY)
1336	continue;
1337	if (acctit && expungetype == BLK_SNAP && blkno != BLK_SNAP)
1338	*ip->i_snapblklist++ = lblkno;
1339	if (blkno == BLK_SNAP)
1340	blkno = ffs_blkstofrags(fs, lblkno);
1341	ffs_blkfree_snap(fs, vp, blkno, fs->fs_bsize, inum);
1342	if (wbreak > `0` && (++n % wbreak) == `0`) {
1343	UFS_WAPBL_END(mp);
1344	error = UFS_WAPBL_BEGIN(mp);
1345	if (error)
1346	return error;
1347	}
1348	}
1349	UFS_WAPBL_END(mp);
1350	return (`0`);
1351	}
1352
1353	/*
1354	* Number of blocks that fit into the journal or zero if not logging.
1355	*/
1356	static int
1357	blocks_in_journal(struct fs *fs)
1358	{
1359	off_t bpj;
1360
1361	if ((fs->fs_flags & FS_DOWAPBL) == `0`)
1362	return `0`;
1363	bpj = `1`;
1364	if (fs->fs_journal_version == UFS_WAPBL_VERSION) {
1365	switch (fs->fs_journal_location) {
1366	case UFS_WAPBL_JOURNALLOC_END_PARTITION:
1367	bpj = (off_t)fs->fs_journallocs[UFS_WAPBL_EPART_BLKSZ]*
1368	fs->fs_journallocs[UFS_WAPBL_EPART_COUNT];
1369	break;
1370	case UFS_WAPBL_JOURNALLOC_IN_FILESYSTEM:
1371	bpj = (off_t)fs->fs_journallocs[UFS_WAPBL_INFS_BLKSZ]*
1372	fs->fs_journallocs[UFS_WAPBL_INFS_COUNT];
1373	break;
1374	}
1375	}
1376	bpj /= fs->fs_bsize;
1377	return (bpj > `0` ? bpj : `1`);
1378	}
1379	#endif /* defined(FFS_NO_SNAPSHOT) */
1380
1381	/*
1382	* Decrement extra reference on snapshot when last name is removed.
1383	* It will not be freed until the last open reference goes away.
1384	*/
1385	void
1386	ffs_snapgone(struct vnode *vp)
1387	{
1388	struct inode xp, ip = VTOI(vp);
1389	struct mount *mp = spec_node_getmountedfs(ip->i_devvp);
1390	struct fs *fs;
1391	struct snap_info *si;
1392	int snaploc;
1393
1394	si = VFSTOUFS(mp)->um_snapinfo;
1395
1396	/*
1397	* Find snapshot in incore list.
1398	*/
1399	mutex_enter(&si->si_lock);
1400	TAILQ_FOREACH(xp, &si->si_snapshots, i_nextsnap)
1401	if (xp == ip)
1402	break;
1403	mutex_exit(&si->si_lock);
1404	if (xp != NULL)
1405	vrele(ITOV(ip));
1406	#ifdef DEBUG
1407	else if (snapdebug)
1408	printf("ffs_snapgone: lost snapshot vnode %llu\n",
1409	(unsigned long long)ip->i_number);
1410	#endif
1411	/*
1412	* Delete snapshot inode from superblock. Keep list dense.
1413	*/
1414	mutex_enter(&si->si_lock);
1415	fs = ip->i_fs;
1416	for (snaploc = `0`; snaploc < FSMAXSNAP; snaploc++)
1417	if (fs->fs_snapinum[snaploc] == ip->i_number)
1418	break;
1419	if (snaploc < FSMAXSNAP) {
1420	for (snaploc++; snaploc < FSMAXSNAP; snaploc++) {
1421	if (fs->fs_snapinum[snaploc] == `0`)
1422	break;
1423	fs->fs_snapinum[snaploc - `1`] = fs->fs_snapinum[snaploc];
1424	}
1425	fs->fs_snapinum[snaploc - `1`] = `0`;
1426	}
1427	si->si_gen++;
1428	mutex_exit(&si->si_lock);
1429	}
1430
1431	/*
1432	* Prepare a snapshot file for being removed.
1433	*/
1434	void
1435	ffs_snapremove(struct vnode *vp)
1436	{
1437	struct inode ip = VTOI(vp), xp;
1438	struct vnode *devvp = ip->i_devvp;
1439	struct fs *fs = ip->i_fs;
1440	struct mount *mp = spec_node_getmountedfs(devvp);
1441	struct buf *ibp;
1442	struct snap_info *si;
1443	struct lwp *l = curlwp;
1444	daddr_t numblks, blkno, dblk;
1445	int error, loc, last;
1446
1447	si = VFSTOUFS(mp)->um_snapinfo;
1448	/*
1449	* If active, delete from incore list (this snapshot may
1450	* already have been in the process of being deleted, so
1451	* would not have been active).
1452	*
1453	* Clear copy-on-write flag if last snapshot.
1454	*/
1455	mutex_enter(&si->si_snaplock);
1456	mutex_enter(&si->si_lock);
1457	if (is_active_snapshot(si, ip)) {
1458	TAILQ_REMOVE(&si->si_snapshots, ip, i_nextsnap);
1459	if (TAILQ_FIRST(&si->si_snapshots) != `0`) {
1460	/ Roll back the list of preallocated blocks. /
1461	xp = TAILQ_LAST(&si->si_snapshots, inodelst);
1462	si->si_snapblklist = xp->i_snapblklist;
1463	si->si_gen++;
1464	mutex_exit(&si->si_lock);
1465	mutex_exit(&si->si_snaplock);
1466	} else {
1467	si->si_snapblklist = `0`;
1468	si->si_gen++;
1469	mutex_exit(&si->si_lock);
1470	mutex_exit(&si->si_snaplock);
1471	fscow_disestablish(mp, ffs_copyonwrite, devvp);
1472	}
1473	if (ip->i_snapblklist != NULL) {
1474	free(ip->i_snapblklist, M_UFSMNT);
1475	ip->i_snapblklist = NULL;
1476	}
1477	} else {
1478	mutex_exit(&si->si_lock);
1479	mutex_exit(&si->si_snaplock);
1480	}
1481	/*
1482	* Clear all BLK_NOCOPY fields. Pass any block claims to other
1483	* snapshots that want them (see ffs_snapblkfree below).
1484	*/
1485	for (blkno = `1`; blkno < UFS_NDADDR; blkno++) {
1486	dblk = db_get(ip, blkno);
1487	if (dblk == BLK_NOCOPY \|\| dblk == BLK_SNAP)
1488	db_assign(ip, blkno, `0`);
1489	else if ((dblk == ffs_blkstofrags(fs, blkno) &&
1490	ffs_snapblkfree(fs, ip->i_devvp, dblk, fs->fs_bsize,
1491	ip->i_number))) {
1492	DIP_ADD(ip, blocks, -btodb(fs->fs_bsize));
1493	db_assign(ip, blkno, `0`);
1494	}
1495	}
1496	numblks = howmany(ip->i_size, fs->fs_bsize);
1497	for (blkno = UFS_NDADDR; blkno < numblks; blkno += FFS_NINDIR(fs)) {
1498	error = ffs_balloc(vp, ffs_lblktosize(fs, (off_t)blkno),
1499	fs->fs_bsize, l->l_cred, B_METAONLY, &ibp);
1500	if (error)
1501	continue;
1502	if (fs->fs_size - blkno > FFS_NINDIR(fs))
1503	last = FFS_NINDIR(fs);
1504	else
1505	last = fs->fs_size - blkno;
1506	for (loc = `0`; loc < last; loc++) {
1507	dblk = idb_get(ip, ibp->b_data, loc);
1508	if (dblk == BLK_NOCOPY \|\| dblk == BLK_SNAP)
1509	idb_assign(ip, ibp->b_data, loc, `0`);
1510	else if (dblk == ffs_blkstofrags(fs, blkno) &&
1511	ffs_snapblkfree(fs, ip->i_devvp, dblk,
1512	fs->fs_bsize, ip->i_number)) {
1513	DIP_ADD(ip, blocks, -btodb(fs->fs_bsize));
1514	idb_assign(ip, ibp->b_data, loc, `0`);
1515	}
1516	}
1517	bawrite(ibp);
1518	UFS_WAPBL_END(mp);
1519	error = UFS_WAPBL_BEGIN(mp);
1520	KASSERT(error == `0`);
1521	}
1522	/*
1523	* Clear snapshot flag and drop reference.
1524	*/
1525	ip->i_flags &= ~(SF_SNAPSHOT \| SF_SNAPINVAL);
1526	DIP_ASSIGN(ip, flags, ip->i_flags);
1527	ip->i_flag \|= IN_CHANGE \| IN_UPDATE;
1528	#if defined(QUOTA) \|\| defined(QUOTA2)
1529	chkdq(ip, DIP(ip, blocks), l->l_cred, FORCE);
1530	chkiq(ip, `1`, l->l_cred, FORCE);
1531	#endif
1532	}
1533
1534	/*
1535	* Notification that a block is being freed. Return zero if the free
1536	* should be allowed to proceed. Return non-zero if the snapshot file
1537	* wants to claim the block. The block will be claimed if it is an
1538	* uncopied part of one of the snapshots. It will be freed if it is
1539	* either a BLK_NOCOPY or has already been copied in all of the snapshots.
1540	* If a fragment is being freed, then all snapshots that care about
1541	* it must make a copy since a snapshot file can only claim full sized
1542	* blocks. Note that if more than one snapshot file maps the block,
1543	* we can pick one at random to claim it. Since none of the snapshots
1544	* can change, we are assurred that they will all see the same unmodified
1545	* image. When deleting a snapshot file (see ffs_snapremove above), we
1546	* must push any of these claimed blocks to one of the other snapshots
1547	* that maps it. These claimed blocks are easily identified as they will
1548	* have a block number equal to their logical block number within the
1549	* snapshot. A copied block can never have this property because they
1550	* must always have been allocated from a BLK_NOCOPY location.
1551	*/
1552	int
1553	ffs_snapblkfree(struct fs fs, struct* vnode *devvp, daddr_t bno,
1554	long size, ino_t inum)
1555	{
1556	struct mount *mp = spec_node_getmountedfs(devvp);
1557	struct buf *ibp;
1558	struct inode *ip;
1559	struct vnode *vp = NULL;
1560	struct snap_info *si;
1561	void *saved_data = NULL;
1562	daddr_t lbn;
1563	daddr_t blkno;
1564	uint32_t gen;
1565	int indiroff = `0`, error = `0`, claimedblk = `0`;
1566
1567	si = VFSTOUFS(mp)->um_snapinfo;
1568	lbn = ffs_fragstoblks(fs, bno);
1569	mutex_enter(&si->si_snaplock);
1570	mutex_enter(&si->si_lock);
1571	si->si_owner = curlwp;
1572
1573	retry:
1574	gen = si->si_gen;
1575	TAILQ_FOREACH(ip, &si->si_snapshots, i_nextsnap) {
1576	vp = ITOV(ip);
1577	/*
1578	* Lookup block being written.
1579	*/
1580	if (lbn < UFS_NDADDR) {
1581	blkno = db_get(ip, lbn);
1582	} else {
1583	mutex_exit(&si->si_lock);
1584	error = ffs_balloc(vp, ffs_lblktosize(fs, (off_t)lbn),
1585	fs->fs_bsize, FSCRED, B_METAONLY, &ibp);
1586	if (error) {
1587	mutex_enter(&si->si_lock);
1588	break;
1589	}
1590	indiroff = (lbn - UFS_NDADDR) % FFS_NINDIR(fs);
1591	blkno = idb_get(ip, ibp->b_data, indiroff);
1592	mutex_enter(&si->si_lock);
1593	if (gen != si->si_gen) {
1594	brelse(ibp, `0`);
1595	goto retry;
1596	}
1597	}
1598	/*
1599	* Check to see if block needs to be copied.
1600	*/
1601	if (blkno == `0`) {
1602	/*
1603	* A block that we map is being freed. If it has not
1604	* been claimed yet, we will claim or copy it (below).
1605	*/
1606	claimedblk = `1`;
1607	} else if (blkno == BLK_SNAP) {
1608	/*
1609	* No previous snapshot claimed the block,
1610	* so it will be freed and become a BLK_NOCOPY
1611	* (don't care) for us.
1612	*/
1613	if (claimedblk)
1614	panic("snapblkfree: inconsistent block type");
1615	if (lbn < UFS_NDADDR) {
1616	db_assign(ip, lbn, BLK_NOCOPY);
1617	ip->i_flag \|= IN_CHANGE \| IN_UPDATE;
1618	} else {
1619	idb_assign(ip, ibp->b_data, indiroff,
1620	BLK_NOCOPY);
1621	mutex_exit(&si->si_lock);
1622	if (ip->i_nlink > `0`)
1623	bwrite(ibp);
1624	else
1625	bdwrite(ibp);
1626	mutex_enter(&si->si_lock);
1627	if (gen != si->si_gen)
1628	goto retry;
1629	}
1630	continue;
1631	} else / BLK_NOCOPY or default / {
1632	/*
1633	* If the snapshot has already copied the block
1634	* (default), or does not care about the block,
1635	* it is not needed.
1636	*/
1637	if (lbn >= UFS_NDADDR)
1638	brelse(ibp, `0`);
1639	continue;
1640	}
1641	/*
1642	* If this is a full size block, we will just grab it
1643	* and assign it to the snapshot inode. Otherwise we
1644	* will proceed to copy it. See explanation for this
1645	* routine as to why only a single snapshot needs to
1646	* claim this block.
1647	*/
1648	if (size == fs->fs_bsize) {
1649	#ifdef DEBUG
1650	if (snapdebug)
1651	printf("%s %llu lbn %" PRId64
1652	"from inum %llu\n",
1653	"Grabonremove: snapino",
1654	(unsigned long long)ip->i_number,
1655	lbn, (unsigned long long)inum);
1656	#endif
1657	mutex_exit(&si->si_lock);
1658	if (lbn < UFS_NDADDR) {
1659	db_assign(ip, lbn, bno);
1660	} else {
1661	idb_assign(ip, ibp->b_data, indiroff, bno);
1662	if (ip->i_nlink > `0`)
1663	bwrite(ibp);
1664	else
1665	bdwrite(ibp);
1666	}
1667	DIP_ADD(ip, blocks, btodb(size));
1668	ip->i_flag \|= IN_CHANGE \| IN_UPDATE;
1669	if (ip->i_nlink > `0` && mp->mnt_wapbl)
1670	error = syncsnap(vp);
1671	else
1672	error = `0`;
1673	mutex_enter(&si->si_lock);
1674	si->si_owner = NULL;
1675	mutex_exit(&si->si_lock);
1676	mutex_exit(&si->si_snaplock);
1677	return (error == `0`);
1678	}
1679	if (lbn >= UFS_NDADDR)
1680	brelse(ibp, `0`);
1681	#ifdef DEBUG
1682	if (snapdebug)
1683	printf("%s%llu lbn %" PRId64 " %s %llu size %ld\n",
1684	"Copyonremove: snapino ",
1685	(unsigned long long)ip->i_number,
1686	lbn, "for inum", (unsigned long long)inum, size);
1687	#endif
1688	/*
1689	* If we have already read the old block contents, then
1690	* simply copy them to the new block. Note that we need
1691	* to synchronously write snapshots that have not been
1692	* unlinked, and hence will be visible after a crash,
1693	* to ensure their integrity.
1694	*/
1695	mutex_exit(&si->si_lock);
1696	if (saved_data == NULL) {
1697	saved_data = malloc(fs->fs_bsize, M_UFSMNT, M_WAITOK);
1698	error = rwfsblk(vp, B_READ, saved_data, lbn);
1699	if (error) {
1700	free(saved_data, M_UFSMNT);
1701	saved_data = NULL;
1702	mutex_enter(&si->si_lock);
1703	break;
1704	}
1705	}
1706	error = wrsnapblk(vp, saved_data, lbn);
1707	if (error == `0` && ip->i_nlink > `0` && mp->mnt_wapbl)
1708	error = syncsnap(vp);
1709	mutex_enter(&si->si_lock);
1710	if (error)
1711	break;
1712	if (gen != si->si_gen)
1713	goto retry;
1714	}
1715	si->si_owner = NULL;
1716	mutex_exit(&si->si_lock);
1717	mutex_exit(&si->si_snaplock);
1718	if (saved_data)
1719	free(saved_data, M_UFSMNT);
1720	/*
1721	* If we have been unable to allocate a block in which to do
1722	* the copy, then return non-zero so that the fragment will
1723	* not be freed. Although space will be lost, the snapshot
1724	* will stay consistent.
1725	*/
1726	return (error);
1727	}
1728
1729	/*
1730	* Associate snapshot files when mounting.
1731	*/
1732	void
1733	ffs_snapshot_mount(struct mount *mp)
1734	{
1735	struct vnode *devvp = VFSTOUFS(mp)->um_devvp;
1736	struct fs *fs = VFSTOUFS(mp)->um_fs;
1737	struct lwp *l = curlwp;
1738	struct vnode *vp;
1739	struct inode ip, xp;
1740	struct snap_info *si;
1741	daddr_t snaplistsize, *snapblklist;
1742	int i, error, ns __unused, snaploc, loc;
1743
1744	/*
1745	* No persistent snapshots on apple ufs file systems.
1746	*/
1747	if (UFS_MPISAPPLEUFS(VFSTOUFS(mp)))
1748	return;
1749
1750	si = VFSTOUFS(mp)->um_snapinfo;
1751	ns = UFS_FSNEEDSWAP(fs);
1752	/*
1753	* XXX The following needs to be set before ffs_truncate or
1754	* VOP_READ can be called.
1755	*/
1756	mp->mnt_stat.f_iosize = fs->fs_bsize;
1757	/*
1758	* Process each snapshot listed in the superblock.
1759	*/
1760	vp = NULL;
1761	mutex_enter(&si->si_lock);
1762	for (snaploc = `0`; snaploc < FSMAXSNAP; snaploc++) {
1763	if (fs->fs_snapinum[snaploc] == `0`)
1764	break;
1765	if ((error = VFS_VGET(mp, fs->fs_snapinum[snaploc],
1766	&vp)) != `0`) {
1767	printf("ffs_snapshot_mount: vget failed %d\n", error);
1768	continue;
1769	}
1770	ip = VTOI(vp);
1771	if ((ip->i_flags & (SF_SNAPSHOT \| SF_SNAPINVAL)) !=
1772	SF_SNAPSHOT) {
1773	printf("ffs_snapshot_mount: non-snapshot inode %d\n",
1774	fs->fs_snapinum[snaploc]);
1775	vput(vp);
1776	vp = NULL;
1777	for (loc = snaploc + `1`; loc < FSMAXSNAP; loc++) {
1778	if (fs->fs_snapinum[loc] == `0`)
1779	break;
1780	fs->fs_snapinum[loc - `1`] = fs->fs_snapinum[loc];
1781	}
1782	fs->fs_snapinum[loc - `1`] = `0`;
1783	snaploc--;
1784	continue;
1785	}
1786
1787	/*
1788	* Read the block hints list. Use an empty list on
1789	* read errors.
1790	*/
1791	error = vn_rdwr(UIO_READ, vp,
1792	(void )&snaplistsize, sizeof*(snaplistsize),
1793	ffs_lblktosize(fs, howmany(fs->fs_size, fs->fs_frag)),
1794	UIO_SYSSPACE, IO_NODELOCKED\|IO_UNIT\|IO_ALTSEMANTICS,
1795	l->l_cred, NULL, NULL);
1796	if (error) {
1797	printf("ffs_snapshot_mount: read_1 failed %d\n", error);
1798	snaplistsize = `1`;
1799	} else
1800	snaplistsize = ufs_rw64(snaplistsize, ns);
1801	snapblklist = malloc(
1802	snaplistsize * sizeof(daddr_t), M_UFSMNT, M_WAITOK);
1803	if (error)
1804	snapblklist[`0`] = `1`;
1805	else {
1806	error = vn_rdwr(UIO_READ, vp, (void *)snapblklist,
1807	snaplistsize * sizeof(daddr_t),
1808	ffs_lblktosize(fs, howmany(fs->fs_size, fs->fs_frag)),
1809	UIO_SYSSPACE, IO_NODELOCKED\|IO_UNIT\|IO_ALTSEMANTICS,
1810	l->l_cred, NULL, NULL);
1811	for (i = `0`; i < snaplistsize; i++)
1812	snapblklist[i] = ufs_rw64(snapblklist[i], ns);
1813	if (error) {
1814	printf("ffs_snapshot_mount: read_2 failed %d\n",
1815	error);
1816	snapblklist[`0`] = `1`;
1817	}
1818	}
1819	ip->i_snapblklist = &snapblklist[`0`];
1820
1821	/*
1822	* Link it onto the active snapshot list.
1823	*/
1824	if (is_active_snapshot(si, ip))
1825	panic("ffs_snapshot_mount: %"PRIu64" already on list",
1826	ip->i_number);
1827	else
1828	TAILQ_INSERT_TAIL(&si->si_snapshots, ip, i_nextsnap);
1829	vp->v_vflag \|= VV_SYSTEM;
1830	VOP_UNLOCK(vp);
1831	}
1832	/*
1833	* No usable snapshots found.
1834	*/
1835	if (vp == NULL) {
1836	mutex_exit(&si->si_lock);
1837	return;
1838	}
1839	/*
1840	* Attach the block hints list. We always want to
1841	* use the list from the newest snapshot.
1842	*/
1843	xp = TAILQ_LAST(&si->si_snapshots, inodelst);
1844	si->si_snapblklist = xp->i_snapblklist;
1845	fscow_establish(mp, ffs_copyonwrite, devvp);
1846	si->si_gen++;
1847	mutex_exit(&si->si_lock);
1848	}
1849
1850	/*
1851	* Disassociate snapshot files when unmounting.
1852	*/
1853	void
1854	ffs_snapshot_unmount(struct mount *mp)
1855	{
1856	struct vnode *devvp = VFSTOUFS(mp)->um_devvp;
1857	struct inode *xp;
1858	struct vnode *vp = NULL;
1859	struct snap_info *si;
1860
1861	si = VFSTOUFS(mp)->um_snapinfo;
1862	mutex_enter(&si->si_lock);
1863	while ((xp = TAILQ_FIRST(&si->si_snapshots)) != `0`) {
1864	vp = ITOV(xp);
1865	TAILQ_REMOVE(&si->si_snapshots, xp, i_nextsnap);
1866	if (xp->i_snapblklist == si->si_snapblklist)
1867	si->si_snapblklist = NULL;
1868	free(xp->i_snapblklist, M_UFSMNT);
1869	if (xp->i_nlink > `0`) {
1870	si->si_gen++;
1871	mutex_exit(&si->si_lock);
1872	vrele(vp);
1873	mutex_enter(&si->si_lock);
1874	}
1875	}
1876	si->si_gen++;
1877	mutex_exit(&si->si_lock);
1878	if (vp)
1879	fscow_disestablish(mp, ffs_copyonwrite, devvp);
1880	}
1881
1882	/*
1883	* Check for need to copy block that is about to be written,
1884	* copying the block if necessary.
1885	*/
1886	static int
1887	ffs_copyonwrite(void v, struct* buf *bp, bool data_valid)
1888	{
1889	struct fs *fs;
1890	struct inode *ip;
1891	struct vnode devvp = v, vp = NULL;
1892	struct mount *mp = spec_node_getmountedfs(devvp);
1893	struct snap_info *si;
1894	void *saved_data = NULL;
1895	daddr_t lbn, blkno, *snapblklist;
1896	uint32_t gen;
1897	int lower, upper, mid, snapshot_locked = `0`, error = `0`;
1898
1899	/*
1900	* Check for valid snapshots.
1901	*/
1902	si = VFSTOUFS(mp)->um_snapinfo;
1903	mutex_enter(&si->si_lock);
1904	ip = TAILQ_FIRST(&si->si_snapshots);
1905	if (ip == NULL) {
1906	mutex_exit(&si->si_lock);
1907	return `0`;
1908	}
1909	/*
1910	* First check to see if it is after the file system,
1911	* in the journal or in the preallocated list.
1912	* By doing these checks we avoid several potential deadlocks.
1913	*/
1914	fs = ip->i_fs;
1915	lbn = ffs_fragstoblks(fs, FFS_DBTOFSB(fs, bp->b_blkno));
1916	if (bp->b_blkno >= FFS_FSBTODB(fs, fs->fs_size)) {
1917	mutex_exit(&si->si_lock);
1918	return `0`;
1919	}
1920	if ((fs->fs_flags & FS_DOWAPBL) &&
1921	fs->fs_journal_location == UFS_WAPBL_JOURNALLOC_IN_FILESYSTEM) {
1922	off_t blk_off, log_start, log_end;
1923
1924	log_start = (off_t)fs->fs_journallocs[UFS_WAPBL_INFS_ADDR] *
1925	fs->fs_journallocs[UFS_WAPBL_INFS_BLKSZ];
1926	log_end = log_start + fs->fs_journallocs[UFS_WAPBL_INFS_COUNT] *
1927	fs->fs_journallocs[UFS_WAPBL_INFS_BLKSZ];
1928	blk_off = dbtob(bp->b_blkno);
1929	if (blk_off >= log_start && blk_off < log_end) {
1930	mutex_exit(&si->si_lock);
1931	return `0`;
1932	}
1933	}
1934	snapblklist = si->si_snapblklist;
1935	upper = (snapblklist != NULL ? snapblklist[`0`] - `1` : `0`);
1936	lower = `1`;
1937	while (lower <= upper) {
1938	mid = (lower + upper) / `2`;
1939	if (snapblklist[mid] == lbn)
1940	break;
1941	if (snapblklist[mid] < lbn)
1942	lower = mid + `1`;
1943	else
1944	upper = mid - `1`;
1945	}
1946	if (lower <= upper) {
1947	mutex_exit(&si->si_lock);
1948	return `0`;
1949	}
1950	/*
1951	* Not in the precomputed list, so check the snapshots.
1952	*/
1953	if (si->si_owner != curlwp) {
1954	if (!mutex_tryenter(&si->si_snaplock)) {
1955	mutex_exit(&si->si_lock);
1956	mutex_enter(&si->si_snaplock);
1957	mutex_enter(&si->si_lock);
1958	}
1959	si->si_owner = curlwp;
1960	snapshot_locked = `1`;
1961	}
1962	if (data_valid && bp->b_bcount == fs->fs_bsize)
1963	saved_data = bp->b_data;
1964	retry:
1965	gen = si->si_gen;
1966	TAILQ_FOREACH(ip, &si->si_snapshots, i_nextsnap) {
1967	vp = ITOV(ip);
1968	/*
1969	* We ensure that everything of our own that needs to be
1970	* copied will be done at the time that ffs_snapshot is
1971	* called. Thus we can skip the check here which can
1972	* deadlock in doing the lookup in ffs_balloc.
1973	*/
1974	if (bp->b_vp == vp)
1975	continue;
1976	/*
1977	* Check to see if block needs to be copied.
1978	*/
1979	if (lbn < UFS_NDADDR) {
1980	blkno = db_get(ip, lbn);
1981	} else {
1982	mutex_exit(&si->si_lock);
1983	blkno = `0`; / XXX: GCC /
1984	if ((error = snapblkaddr(vp, lbn, &blkno)) != `0`) {
1985	mutex_enter(&si->si_lock);
1986	break;
1987	}
1988	mutex_enter(&si->si_lock);
1989	if (gen != si->si_gen)
1990	goto retry;
1991	}
1992	#ifdef DIAGNOSTIC
1993	if (blkno == BLK_SNAP && bp->b_lblkno >= `0`)
1994	panic("ffs_copyonwrite: bad copy block");
1995	#endif
1996	if (blkno != `0`)
1997	continue;
1998
1999	if (curlwp == uvm.pagedaemon_lwp) {
2000	error = ENOMEM;
2001	break;
2002	}
2003	/ Only one level of recursion allowed. /
2004	KASSERT(snapshot_locked);
2005	/*
2006	* Allocate the block into which to do the copy. Since
2007	* multiple processes may all try to copy the same block,
2008	* we have to recheck our need to do a copy if we sleep
2009	* waiting for the lock.
2010	*
2011	* Because all snapshots on a filesystem share a single
2012	* lock, we ensure that we will never be in competition
2013	* with another process to allocate a block.
2014	*/
2015	#ifdef DEBUG
2016	if (snapdebug) {
2017	printf("Copyonwrite: snapino %llu lbn %" PRId64 " for ",
2018	(unsigned long long)ip->i_number, lbn);
2019	if (bp->b_vp == devvp)
2020	printf("fs metadata");
2021	else
2022	printf("inum %llu", (unsigned long long)
2023	VTOI(bp->b_vp)->i_number);
2024	printf(" lblkno %" PRId64 "\n", bp->b_lblkno);
2025	}
2026	#endif
2027	/*
2028	* If we have already read the old block contents, then
2029	* simply copy them to the new block. Note that we need
2030	* to synchronously write snapshots that have not been
2031	* unlinked, and hence will be visible after a crash,
2032	* to ensure their integrity.
2033	*/
2034	mutex_exit(&si->si_lock);
2035	if (saved_data == NULL) {
2036	saved_data = malloc(fs->fs_bsize, M_UFSMNT, M_WAITOK);
2037	error = rwfsblk(vp, B_READ, saved_data, lbn);
2038	if (error) {
2039	free(saved_data, M_UFSMNT);
2040	saved_data = NULL;
2041	mutex_enter(&si->si_lock);
2042	break;
2043	}
2044	}
2045	error = wrsnapblk(vp, saved_data, lbn);
2046	if (error == `0` && ip->i_nlink > `0` && mp->mnt_wapbl)
2047	error = syncsnap(vp);
2048	mutex_enter(&si->si_lock);
2049	if (error)
2050	break;
2051	if (gen != si->si_gen)
2052	goto retry;
2053	}
2054	/*
2055	* Note that we need to synchronously write snapshots that
2056	* have not been unlinked, and hence will be visible after
2057	* a crash, to ensure their integrity.
2058	*/
2059	if (snapshot_locked) {
2060	si->si_owner = NULL;
2061	mutex_exit(&si->si_lock);
2062	mutex_exit(&si->si_snaplock);
2063	} else
2064	mutex_exit(&si->si_lock);
2065	if (saved_data && saved_data != bp->b_data)
2066	free(saved_data, M_UFSMNT);
2067	return error;
2068	}
2069
2070	/*
2071	* Read from a snapshot.
2072	*/
2073	int
2074	ffs_snapshot_read(struct vnode vp, struct* uio uio, int* ioflag)
2075	{
2076	struct inode *ip = VTOI(vp);
2077	struct fs *fs = ip->i_fs;
2078	struct snap_info *si = VFSTOUFS(vp->v_mount)->um_snapinfo;
2079	struct buf *bp;
2080	daddr_t lbn, nextlbn;
2081	off_t fsbytes, bytesinfile;
2082	long size, xfersize, blkoffset;
2083	int error;
2084
2085	fstrans_start(vp->v_mount, FSTRANS_SHARED);
2086	mutex_enter(&si->si_snaplock);
2087
2088	if (ioflag & IO_ALTSEMANTICS)
2089	fsbytes = ip->i_size;
2090	else
2091	fsbytes = ffs_lfragtosize(fs, fs->fs_size);
2092	for (error = `0`, bp = NULL; uio->uio_resid > `0`; bp = NULL) {
2093	bytesinfile = fsbytes - uio->uio_offset;
2094	if (bytesinfile <= `0`)
2095	break;
2096	lbn = ffs_lblkno(fs, uio->uio_offset);
2097	nextlbn = lbn + `1`;
2098	size = fs->fs_bsize;
2099	blkoffset = ffs_blkoff(fs, uio->uio_offset);
2100	xfersize = MIN(MIN(fs->fs_bsize - blkoffset, uio->uio_resid),
2101	bytesinfile);
2102
2103	if (ffs_lblktosize(fs, nextlbn + `1`) >= fsbytes) {
2104	if (ffs_lblktosize(fs, lbn) + size > fsbytes)
2105	size = ffs_fragroundup(fs,
2106	fsbytes - ffs_lblktosize(fs, lbn));
2107	error = bread(vp, lbn, size, `0`, &bp);
2108	} else {
2109	int nextsize = fs->fs_bsize;
2110	error = breadn(vp, lbn,
2111	size, &nextlbn, &nextsize, `1`, `0`, &bp);
2112	}
2113	if (error)
2114	break;
2115
2116	/*
2117	* We should only get non-zero b_resid when an I/O error
2118	* has occurred, which should cause us to break above.
2119	* However, if the short read did not cause an error,
2120	* then we want to ensure that we do not uiomove bad
2121	* or uninitialized data.
2122	*/
2123	size -= bp->b_resid;
2124	if (size < blkoffset + xfersize) {
2125	xfersize = size - blkoffset;
2126	if (xfersize <= `0`)
2127	break;
2128	}
2129	error = uiomove((char *)bp->b_data + blkoffset, xfersize, uio);
2130	if (error)
2131	break;
2132	brelse(bp, BC_AGE);
2133	}
2134	if (bp != NULL)
2135	brelse(bp, BC_AGE);
2136
2137	mutex_exit(&si->si_snaplock);
2138	fstrans_done(vp->v_mount);
2139	return error;
2140	}
2141
2142	/*
2143	* Lookup a snapshots data block address.
2144	* Simpler than UFS_BALLOC() as we know all metadata is already allocated
2145	* and safe even for the pagedaemon where we cannot bread().
2146	*/
2147	static int
2148	snapblkaddr(struct vnode vp, daddr_t lbn, daddr_t res)
2149	{
2150	struct indir indirs[UFS_NIADDR + `2`];
2151	struct inode *ip = VTOI(vp);
2152	struct fs *fs = ip->i_fs;
2153	struct buf *bp;
2154	int error, num;
2155
2156	KASSERT(lbn >= `0`);
2157
2158	if (lbn < UFS_NDADDR) {
2159	*res = db_get(ip, lbn);
2160	return `0`;
2161	}
2162	if ((error = ufs_getlbns(vp, lbn, indirs, &num)) != `0`)
2163	return error;
2164	if (curlwp == uvm.pagedaemon_lwp) {
2165	mutex_enter(&bufcache_lock);
2166	bp = incore(vp, indirs[num-`1`].in_lbn);
2167	if (bp && (bp->b_oflags & (BO_DONE \| BO_DELWRI))) {
2168	*res = idb_get(ip, bp->b_data, indirs[num-`1`].in_off);
2169	error = `0`;
2170	} else
2171	error = ENOMEM;
2172	mutex_exit(&bufcache_lock);
2173	return error;
2174	}
2175	error = bread(vp, indirs[num-`1`].in_lbn, fs->fs_bsize, `0`, &bp);
2176	if (error == `0`) {
2177	*res = idb_get(ip, bp->b_data, indirs[num-`1`].in_off);
2178	brelse(bp, `0`);
2179	}
2180
2181	return error;
2182	}
2183
2184	/*
2185	* Read or write the specified block of the filesystem vp resides on
2186	* from or to the disk bypassing the buffer cache.
2187	*/
2188	static int
2189	rwfsblk(struct vnode vp, int* flags, void *data, daddr_t lbn)
2190	{
2191	int error;
2192	struct inode *ip = VTOI(vp);
2193	struct fs *fs = ip->i_fs;
2194	struct buf *nbp;
2195
2196	nbp = getiobuf(NULL, true);
2197	nbp->b_flags = flags;
2198	nbp->b_bcount = nbp->b_bufsize = fs->fs_bsize;
2199	nbp->b_error = `0`;
2200	nbp->b_data = data;
2201	nbp->b_blkno = nbp->b_rawblkno = FFS_FSBTODB(fs, ffs_blkstofrags(fs, lbn));
2202	nbp->b_proc = NULL;
2203	nbp->b_dev = ip->i_devvp->v_rdev;
2204	SET(nbp->b_cflags, BC_BUSY); / mark buffer busy /
2205
2206	bdev_strategy(nbp);
2207
2208	error = biowait(nbp);
2209
2210	putiobuf(nbp);
2211
2212	return error;
2213	}
2214
2215	/*
2216	* Write all dirty buffers to disk and invalidate them.
2217	*/
2218	static int
2219	syncsnap(struct vnode *vp)
2220	{
2221	int error;
2222	buf_t *bp;
2223	struct fs *fs = VTOI(vp)->i_fs;
2224
2225	mutex_enter(&bufcache_lock);
2226	while ((bp = LIST_FIRST(&vp->v_dirtyblkhd))) {
2227	error = bbusy(bp, false, `0`, NULL);
2228	if (error == EPASSTHROUGH)
2229	continue;
2230	else if (error != `0`) {
2231	mutex_exit(&bufcache_lock);
2232	return error;
2233	}
2234	KASSERT(bp->b_bcount == fs->fs_bsize);
2235	mutex_exit(&bufcache_lock);
2236	error = rwfsblk(vp, B_WRITE, bp->b_data,
2237	ffs_fragstoblks(fs, FFS_DBTOFSB(fs, bp->b_blkno)));
2238	brelse(bp, BC_INVAL \| BC_VFLUSH);
2239	if (error)
2240	return error;
2241	mutex_enter(&bufcache_lock);
2242	}
2243	mutex_exit(&bufcache_lock);
2244
2245	return `0`;
2246	}
2247
2248	/*
2249	* Write the specified block to a snapshot.
2250	*/
2251	static int
2252	wrsnapblk(struct vnode vp, void* *data, daddr_t lbn)
2253	{
2254	struct inode *ip = VTOI(vp);
2255	struct fs *fs = ip->i_fs;
2256	struct buf *bp;
2257	int error;
2258
2259	error = ffs_balloc(vp, ffs_lblktosize(fs, (off_t)lbn), fs->fs_bsize,
2260	FSCRED, (ip->i_nlink > `0` ? B_SYNC : `0`), &bp);
2261	if (error)
2262	return error;
2263	memcpy(bp->b_data, data, fs->fs_bsize);
2264	if (ip->i_nlink > `0`)
2265	error = bwrite(bp);
2266	else
2267	bawrite(bp);
2268
2269	return error;
2270	}
2271
2272	/*
2273	* Check if this inode is present on the active snapshot list.
2274	* Must be called with snapinfo locked.
2275	*/
2276	static inline bool
2277	is_active_snapshot(struct snap_info si, struct* inode *ip)
2278	{
2279	struct inode *xp;
2280
2281	KASSERT(mutex_owned(&si->si_lock));
2282
2283	TAILQ_FOREACH(xp, &si->si_snapshots, i_nextsnap)
2284	if (xp == ip)
2285	return true;
2286	return false;
2287	}
2288
2289	/*
2290	* Get/Put direct block from inode or buffer containing disk addresses. Take
2291	* care for fs type (UFS1/UFS2) and byte swapping. These functions should go
2292	* into a global include.
2293	*/
2294	static inline daddr_t
2295	db_get(struct inode ip, int* loc)
2296	{
2297	if (ip->i_ump->um_fstype == UFS1)
2298	return ufs_rw32(ip->i_ffs1_db[loc], UFS_IPNEEDSWAP(ip));
2299	else
2300	return ufs_rw64(ip->i_ffs2_db[loc], UFS_IPNEEDSWAP(ip));
2301	}
2302
2303	static inline void
2304	db_assign(struct inode ip, int* loc, daddr_t val)
2305	{
2306	if (ip->i_ump->um_fstype == UFS1)
2307	ip->i_ffs1_db[loc] = ufs_rw32(val, UFS_IPNEEDSWAP(ip));
2308	else
2309	ip->i_ffs2_db[loc] = ufs_rw64(val, UFS_IPNEEDSWAP(ip));
2310	}
2311
2312	__unused static inline daddr_t
2313	ib_get(struct inode ip, int* loc)
2314	{
2315	if (ip->i_ump->um_fstype == UFS1)
2316	return ufs_rw32(ip->i_ffs1_ib[loc], UFS_IPNEEDSWAP(ip));
2317	else
2318	return ufs_rw64(ip->i_ffs2_ib[loc], UFS_IPNEEDSWAP(ip));
2319	}
2320
2321	static inline daddr_t
2322	idb_get(struct inode ip, void* bf, int* loc)
2323	{
2324	if (ip->i_ump->um_fstype == UFS1)
2325	return ufs_rw32(((int32_t *)(bf))[loc], UFS_IPNEEDSWAP(ip));
2326	else
2327	return ufs_rw64(((int64_t *)(bf))[loc], UFS_IPNEEDSWAP(ip));
2328	}
2329
2330	static inline void
2331	idb_assign(struct inode ip, void* bf, int* loc, daddr_t val)
2332	{
2333	if (ip->i_ump->um_fstype == UFS1)
2334	((int32_t *)(bf))[loc] = ufs_rw32(val, UFS_IPNEEDSWAP(ip));
2335	else
2336	((int64_t *)(bf))[loc] = ufs_rw64(val, UFS_IPNEEDSWAP(ip));
2337	}
2338

Browse the source code of src/src/sys/ufs/ffs/ffs_snapshot.c