1/* $NetBSD: fss.c,v 1.95 2016/07/31 12:17:36 hannken Exp $ */
2
3/*-
4 * Copyright (c) 2003 The NetBSD Foundation, Inc.
5 * All rights reserved.
6 *
7 * This code is derived from software contributed to The NetBSD Foundation
8 * by Juergen Hannken-Illjes.
9 *
10 * Redistribution and use in source and binary forms, with or without
11 * modification, are permitted provided that the following conditions
12 * are met:
13 * 1. Redistributions of source code must retain the above copyright
14 * notice, this list of conditions and the following disclaimer.
15 * 2. Redistributions in binary form must reproduce the above copyright
16 * notice, this list of conditions and the following disclaimer in the
17 * documentation and/or other materials provided with the distribution.
18 *
19 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
20 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
21 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
22 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
23 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
24 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
25 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
26 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
27 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
28 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
29 * POSSIBILITY OF SUCH DAMAGE.
30 */
31
32/*
33 * File system snapshot disk driver.
34 *
35 * Block/character interface to the snapshot of a mounted file system.
36 */
37
38#include <sys/cdefs.h>
39__KERNEL_RCSID(0, "$NetBSD: fss.c,v 1.95 2016/07/31 12:17:36 hannken Exp $");
40
41#include <sys/param.h>
42#include <sys/systm.h>
43#include <sys/namei.h>
44#include <sys/proc.h>
45#include <sys/errno.h>
46#include <sys/malloc.h>
47#include <sys/buf.h>
48#include <sys/ioctl.h>
49#include <sys/disklabel.h>
50#include <sys/device.h>
51#include <sys/disk.h>
52#include <sys/stat.h>
53#include <sys/mount.h>
54#include <sys/vnode.h>
55#include <sys/file.h>
56#include <sys/uio.h>
57#include <sys/conf.h>
58#include <sys/kthread.h>
59#include <sys/fstrans.h>
60#include <sys/vfs_syscalls.h> /* For do_sys_unlink(). */
61
62#include <miscfs/specfs/specdev.h>
63
64#include <dev/fssvar.h>
65
66#include <uvm/uvm.h>
67
68#include "ioconf.h"
69
70dev_type_open(fss_open);
71dev_type_close(fss_close);
72dev_type_read(fss_read);
73dev_type_write(fss_write);
74dev_type_ioctl(fss_ioctl);
75dev_type_strategy(fss_strategy);
76dev_type_dump(fss_dump);
77dev_type_size(fss_size);
78
79static void fss_unmount_hook(struct mount *);
80static int fss_copy_on_write(void *, struct buf *, bool);
81static inline void fss_error(struct fss_softc *, const char *);
82static int fss_create_files(struct fss_softc *, struct fss_set *,
83 off_t *, struct lwp *);
84static int fss_create_snapshot(struct fss_softc *, struct fss_set *,
85 struct lwp *);
86static int fss_delete_snapshot(struct fss_softc *, struct lwp *);
87static int fss_softc_alloc(struct fss_softc *);
88static void fss_softc_free(struct fss_softc *);
89static int fss_read_cluster(struct fss_softc *, u_int32_t);
90static void fss_bs_thread(void *);
91static int fss_bs_io(struct fss_softc *, fss_io_type,
92 u_int32_t, off_t, int, void *);
93static u_int32_t *fss_bs_indir(struct fss_softc *, u_int32_t);
94
static kmutex_t fss_device_lock; /* Protect all units. */
static int fss_num_attached = 0; /* Number of attached devices. */
/* Invalidate active snapshots when their file system unmounts. */
static struct vfs_hooks fss_vfs_hooks = {
	.vh_unmount = fss_unmount_hook
};
100
/* Block device switch: a read-only, MP-safe disk device. */
const struct bdevsw fss_bdevsw = {
	.d_open = fss_open,
	.d_close = fss_close,
	.d_strategy = fss_strategy,
	.d_ioctl = fss_ioctl,
	.d_dump = fss_dump,
	.d_psize = fss_size,
	.d_discard = nodiscard,
	.d_flag = D_DISK | D_MPSAFE
};
111
/* Character device switch: raw access plus the configuration ioctls. */
const struct cdevsw fss_cdevsw = {
	.d_open = fss_open,
	.d_close = fss_close,
	.d_read = fss_read,
	.d_write = fss_write,
	.d_ioctl = fss_ioctl,
	.d_stop = nostop,
	.d_tty = notty,
	.d_poll = nopoll,
	.d_mmap = nommap,
	.d_kqfilter = nokqfilter,
	.d_discard = nodiscard,
	.d_flag = D_DISK | D_MPSAFE
};
126
/* Autoconfiguration glue for the fss pseudo-device. */
static int fss_match(device_t, cfdata_t, void *);
static void fss_attach(device_t, device_t, void *);
static int fss_detach(device_t, int);

CFATTACH_DECL_NEW(fss, sizeof(struct fss_softc),
    fss_match, fss_attach, fss_detach, NULL);
extern struct cfdriver fss_cd;
134
/*
 * Pseudo-device attach routine, called once at boot.
 * Initializes the global unit lock and registers the cfattach so
 * that units can be created on demand from fss_open().
 * The "num" argument (requested unit count) is unused; units are
 * attached lazily instead.
 */
void
fssattach(int num)
{

	mutex_init(&fss_device_lock, MUTEX_DEFAULT, IPL_NONE);
	if (config_cfattach_attach(fss_cd.cd_name, &fss_ca))
		aprint_error("%s: unable to register\n", fss_cd.cd_name);
}
143
144static int
145fss_match(device_t self, cfdata_t cfdata, void *aux)
146{
147 return 1;
148}
149
/*
 * Autoconfiguration attach: initialize a unit's softc -- locks,
 * condition variables, FCFS buffer queue and disk(9) structure.
 * The first unit attached also registers the VFS unmount hook.
 */
static void
fss_attach(device_t parent, device_t self, void *aux)
{
	struct fss_softc *sc = device_private(self);

	sc->sc_dev = self;
	sc->sc_bdev = NODEV;
	mutex_init(&sc->sc_slock, MUTEX_DEFAULT, IPL_NONE);
	mutex_init(&sc->sc_lock, MUTEX_DEFAULT, IPL_NONE);
	cv_init(&sc->sc_work_cv, "fssbs");
	cv_init(&sc->sc_cache_cv, "cowwait");
	bufq_alloc(&sc->sc_bufq, "fcfs", 0);
	sc->sc_dkdev = malloc(sizeof(*sc->sc_dkdev), M_DEVBUF, M_WAITOK);
	sc->sc_dkdev->dk_info = NULL;
	disk_init(sc->sc_dkdev, device_xname(self), NULL);
	if (!pmf_device_register(self, NULL, NULL))
		aprint_error_dev(self, "couldn't establish power handler\n");

	if (fss_num_attached++ == 0)
		vfs_hooks_attach(&fss_vfs_hooks);
}
171
/*
 * Autoconfiguration detach: refuse while a snapshot is active,
 * otherwise release everything allocated in fss_attach().
 * The last unit detached also removes the VFS unmount hook.
 */
static int
fss_detach(device_t self, int flags)
{
	struct fss_softc *sc = device_private(self);

	/* Cannot detach a unit with a configured snapshot. */
	if (sc->sc_flags & FSS_ACTIVE)
		return EBUSY;

	if (--fss_num_attached == 0)
		vfs_hooks_detach(&fss_vfs_hooks);

	pmf_device_deregister(self);
	mutex_destroy(&sc->sc_slock);
	mutex_destroy(&sc->sc_lock);
	cv_destroy(&sc->sc_work_cv);
	cv_destroy(&sc->sc_cache_cv);
	bufq_drain(sc->sc_bufq);
	bufq_free(sc->sc_bufq);
	disk_destroy(sc->sc_dkdev);
	free(sc->sc_dkdev, M_DEVBUF);

	return 0;
}
195
196int
197fss_open(dev_t dev, int flags, int mode, struct lwp *l)
198{
199 int mflag;
200 cfdata_t cf;
201 struct fss_softc *sc;
202
203 mflag = (mode == S_IFCHR ? FSS_CDEV_OPEN : FSS_BDEV_OPEN);
204
205 mutex_enter(&fss_device_lock);
206
207 sc = device_lookup_private(&fss_cd, minor(dev));
208 if (sc == NULL) {
209 cf = malloc(sizeof(*cf), M_DEVBUF, M_WAITOK);
210 cf->cf_name = fss_cd.cd_name;
211 cf->cf_atname = fss_cd.cd_name;
212 cf->cf_unit = minor(dev);
213 cf->cf_fstate = FSTATE_STAR;
214 sc = device_private(config_attach_pseudo(cf));
215 if (sc == NULL) {
216 mutex_exit(&fss_device_lock);
217 return ENOMEM;
218 }
219 }
220
221 mutex_enter(&sc->sc_slock);
222
223 sc->sc_flags |= mflag;
224
225 mutex_exit(&sc->sc_slock);
226 mutex_exit(&fss_device_lock);
227
228 return 0;
229}
230
/*
 * Close the block or character device.  On last close of an idle
 * unit the pseudo-device is detached again and its cfdata freed.
 * If the snapshot was configured with FSS_UNCONFIG_ON_CLOSE it is
 * torn down here via a recursive FSSIOCCLR ioctl.
 */
int
fss_close(dev_t dev, int flags, int mode, struct lwp *l)
{
	int mflag, error;
	cfdata_t cf;
	struct fss_softc *sc = device_lookup_private(&fss_cd, minor(dev));

	mflag = (mode == S_IFCHR ? FSS_CDEV_OPEN : FSS_BDEV_OPEN);
	error = 0;

	mutex_enter(&fss_device_lock);
restart:
	mutex_enter(&sc->sc_slock);
	/* Not the last close: just clear this flavour's open flag. */
	if ((sc->sc_flags & (FSS_CDEV_OPEN|FSS_BDEV_OPEN)) != mflag) {
		sc->sc_flags &= ~mflag;
		mutex_exit(&sc->sc_slock);
		mutex_exit(&fss_device_lock);
		return 0;
	}
	/*
	 * Last close with auto-unconfigure requested: drop the snapshot
	 * (sc_slock must be released around the ioctl) and re-evaluate.
	 */
	if ((sc->sc_flags & FSS_ACTIVE) != 0 &&
	    (sc->sc_uflags & FSS_UNCONFIG_ON_CLOSE) != 0) {
		sc->sc_uflags &= ~FSS_UNCONFIG_ON_CLOSE;
		mutex_exit(&sc->sc_slock);
		error = fss_ioctl(dev, FSSIOCCLR, NULL, FWRITE, l);
		goto restart;
	}
	/* Still active (and no auto-unconfigure): keep the unit attached. */
	if ((sc->sc_flags & FSS_ACTIVE) != 0) {
		mutex_exit(&sc->sc_slock);
		mutex_exit(&fss_device_lock);
		return error;
	}

	/* Last close of an idle unit: detach it and free its cfdata. */
	KASSERT((sc->sc_flags & FSS_ACTIVE) == 0);
	KASSERT((sc->sc_flags & (FSS_CDEV_OPEN|FSS_BDEV_OPEN)) == mflag);
	mutex_exit(&sc->sc_slock);
	cf = device_cfdata(sc->sc_dev);
	error = config_detach(sc->sc_dev, DETACH_QUIET);
	if (! error)
		free(cf, M_DEVBUF);
	mutex_exit(&fss_device_lock);

	return error;
}
274
275void
276fss_strategy(struct buf *bp)
277{
278 const bool write = ((bp->b_flags & B_READ) != B_READ);
279 struct fss_softc *sc = device_lookup_private(&fss_cd, minor(bp->b_dev));
280
281 mutex_enter(&sc->sc_slock);
282
283 if (write || !FSS_ISVALID(sc)) {
284
285 mutex_exit(&sc->sc_slock);
286
287 bp->b_error = (write ? EROFS : ENXIO);
288 bp->b_resid = bp->b_bcount;
289 biodone(bp);
290 return;
291 }
292
293 bp->b_rawblkno = bp->b_blkno;
294 bufq_put(sc->sc_bufq, bp);
295 cv_signal(&sc->sc_work_cv);
296
297 mutex_exit(&sc->sc_slock);
298}
299
/*
 * Character device read: raw I/O through physio(9), which queues
 * the transfer via fss_strategy().
 */
int
fss_read(dev_t dev, struct uio *uio, int flags)
{
	return physio(fss_strategy, NULL, dev, B_READ, minphys, uio);
}
305
/*
 * Character device write: routed through physio(9) like fss_read();
 * fss_strategy() rejects all writes with EROFS.
 */
int
fss_write(dev_t dev, struct uio *uio, int flags)
{
	return physio(fss_strategy, NULL, dev, B_WRITE, minphys, uio);
}
311
/*
 * Device ioctl handler:
 *
 *	FSSIOCSET/FSSIOCSET50	configure (create) a snapshot
 *	FSSIOCCLR		unconfigure (delete) the snapshot
 *	FSSIOCGET/FSSIOCGET50	query snapshot state
 *	FSSIOFSET/FSSIOFGET	set/get the user flags (sc_uflags)
 *
 * The *50 variants are NetBSD 5.0 compatibility versions of the
 * structures; FSSIOCGET50 only differs on 32-bit platforms.
 * Configuration changes are serialized by sc_lock, flag accesses
 * by sc_slock.
 */
int
fss_ioctl(dev_t dev, u_long cmd, void *data, int flag, struct lwp *l)
{
	int error;
	struct fss_softc *sc = device_lookup_private(&fss_cd, minor(dev));
	struct fss_set _fss;
	struct fss_set *fss = (struct fss_set *)data;
	struct fss_set50 *fss50 = (struct fss_set50 *)data;
	struct fss_get *fsg = (struct fss_get *)data;
#ifndef _LP64
	struct fss_get50 *fsg50 = (struct fss_get50 *)data;
#endif

	switch (cmd) {
	case FSSIOCSET50:
		/* Translate the pre-5.0 request into a current fss_set. */
		fss = &_fss;
		fss->fss_mount = fss50->fss_mount;
		fss->fss_bstore = fss50->fss_bstore;
		fss->fss_csize = fss50->fss_csize;
		fss->fss_flags = 0;
		/* Fall through */
	case FSSIOCSET:
		mutex_enter(&sc->sc_lock);
		if ((flag & FWRITE) == 0)
			error = EPERM;
		else if ((sc->sc_flags & FSS_ACTIVE) != 0)
			error = EBUSY;
		else
			error = fss_create_snapshot(sc, fss, l);
		if (error == 0)
			sc->sc_uflags = fss->fss_flags;
		mutex_exit(&sc->sc_lock);
		break;

	case FSSIOCCLR:
		mutex_enter(&sc->sc_lock);
		if ((flag & FWRITE) == 0)
			error = EPERM;
		else if ((sc->sc_flags & FSS_ACTIVE) == 0)
			error = ENXIO;
		else
			error = fss_delete_snapshot(sc, l);
		mutex_exit(&sc->sc_lock);
		break;

#ifndef _LP64
	case FSSIOCGET50:
		/* Pre-5.0 query, needed only where time_t changed size. */
		mutex_enter(&sc->sc_lock);
		switch (sc->sc_flags & (FSS_PERSISTENT | FSS_ACTIVE)) {
		case FSS_ACTIVE:
			memcpy(fsg50->fsg_mount, sc->sc_mntname, MNAMELEN);
			fsg50->fsg_csize = FSS_CLSIZE(sc);
			timeval_to_timeval50(&sc->sc_time, &fsg50->fsg_time);
			fsg50->fsg_mount_size = sc->sc_clcount;
			fsg50->fsg_bs_size = sc->sc_clnext;
			error = 0;
			break;
		case FSS_PERSISTENT | FSS_ACTIVE:
			/* File system internal snapshots have no clusters. */
			memcpy(fsg50->fsg_mount, sc->sc_mntname, MNAMELEN);
			fsg50->fsg_csize = 0;
			timeval_to_timeval50(&sc->sc_time, &fsg50->fsg_time);
			fsg50->fsg_mount_size = 0;
			fsg50->fsg_bs_size = 0;
			error = 0;
			break;
		default:
			error = ENXIO;
			break;
		}
		mutex_exit(&sc->sc_lock);
		break;
#endif /* _LP64 */

	case FSSIOCGET:
		mutex_enter(&sc->sc_lock);
		switch (sc->sc_flags & (FSS_PERSISTENT | FSS_ACTIVE)) {
		case FSS_ACTIVE:
			memcpy(fsg->fsg_mount, sc->sc_mntname, MNAMELEN);
			fsg->fsg_csize = FSS_CLSIZE(sc);
			fsg->fsg_time = sc->sc_time;
			fsg->fsg_mount_size = sc->sc_clcount;
			fsg->fsg_bs_size = sc->sc_clnext;
			error = 0;
			break;
		case FSS_PERSISTENT | FSS_ACTIVE:
			/* File system internal snapshots have no clusters. */
			memcpy(fsg->fsg_mount, sc->sc_mntname, MNAMELEN);
			fsg->fsg_csize = 0;
			fsg->fsg_time = sc->sc_time;
			fsg->fsg_mount_size = 0;
			fsg->fsg_bs_size = 0;
			error = 0;
			break;
		default:
			error = ENXIO;
			break;
		}
		mutex_exit(&sc->sc_lock);
		break;

	case FSSIOFSET:
		mutex_enter(&sc->sc_slock);
		sc->sc_uflags = *(int *)data;
		mutex_exit(&sc->sc_slock);
		error = 0;
		break;

	case FSSIOFGET:
		mutex_enter(&sc->sc_slock);
		*(int *)data = sc->sc_uflags;
		mutex_exit(&sc->sc_slock);
		error = 0;
		break;

	default:
		error = EINVAL;
		break;
	}

	return error;
}
432
/*
 * Return the partition size for dumping; snapshots cannot be used
 * as dump devices, so report that no size is available.
 */
int
fss_size(dev_t dev)
{
	return -1;
}
438
/*
 * Crash dump entry point: a snapshot is read-only, so refuse.
 */
int
fss_dump(dev_t dev, daddr_t blkno, void *va,
    size_t size)
{
	return EROFS;
}
445
446/*
447 * An error occurred reading or writing the snapshot or backing store.
448 * If it is the first error log to console and disestablish cow handler.
449 * The caller holds the mutex.
450 */
static inline void
fss_error(struct fss_softc *sc, const char *msg)
{

	/* Only act on the first error of an active snapshot. */
	if ((sc->sc_flags & (FSS_ACTIVE | FSS_ERROR)) != FSS_ACTIVE)
		return;

	aprint_error_dev(sc->sc_dev, "snapshot invalid: %s\n", msg);
	/* Persistent snapshots never had a copy-on-write handler. */
	if ((sc->sc_flags & FSS_PERSISTENT) == 0)
		fscow_disestablish(sc->sc_mount, fss_copy_on_write, sc);
	sc->sc_flags |= FSS_ERROR;
}
463
464/*
465 * Allocate the variable sized parts of the softc and
466 * fork the kernel thread.
467 *
468 * The fields sc_clcount, sc_clshift, sc_cache_size and sc_indir_size
469 * must be initialized.
470 */
471static int
472fss_softc_alloc(struct fss_softc *sc)
473{
474 int i, error;
475
476 if ((sc->sc_flags & FSS_PERSISTENT) == 0) {
477 sc->sc_copied =
478 kmem_zalloc(howmany(sc->sc_clcount, NBBY), KM_SLEEP);
479 if (sc->sc_copied == NULL)
480 return(ENOMEM);
481
482 sc->sc_cache = kmem_alloc(sc->sc_cache_size *
483 sizeof(struct fss_cache), KM_SLEEP);
484 if (sc->sc_cache == NULL)
485 return(ENOMEM);
486
487 for (i = 0; i < sc->sc_cache_size; i++) {
488 sc->sc_cache[i].fc_type = FSS_CACHE_FREE;
489 sc->sc_cache[i].fc_data =
490 kmem_alloc(FSS_CLSIZE(sc), KM_SLEEP);
491 if (sc->sc_cache[i].fc_data == NULL)
492 return(ENOMEM);
493 cv_init(&sc->sc_cache[i].fc_state_cv, "cowwait1");
494 }
495
496 sc->sc_indir_valid =
497 kmem_zalloc(howmany(sc->sc_indir_size, NBBY), KM_SLEEP);
498 if (sc->sc_indir_valid == NULL)
499 return(ENOMEM);
500
501 sc->sc_indir_data = kmem_zalloc(FSS_CLSIZE(sc), KM_SLEEP);
502 if (sc->sc_indir_data == NULL)
503 return(ENOMEM);
504 } else {
505 sc->sc_copied = NULL;
506 sc->sc_cache = NULL;
507 sc->sc_indir_valid = NULL;
508 sc->sc_indir_data = NULL;
509 }
510
511 sc->sc_flags |= FSS_BS_THREAD;
512 if ((error = kthread_create(PRI_BIO, KTHREAD_MUSTJOIN, NULL,
513 fss_bs_thread, sc, &sc->sc_bs_lwp,
514 "%s", device_xname(sc->sc_dev))) != 0) {
515 sc->sc_flags &= ~FSS_BS_THREAD;
516 return error;
517 }
518
519 disk_attach(sc->sc_dkdev);
520
521 return 0;
522}
523
524/*
525 * Free the variable sized parts of the softc.
526 */
static void
fss_softc_free(struct fss_softc *sc)
{
	int i;

	/*
	 * Stop the worker thread first: clear FSS_BS_THREAD, wake the
	 * thread so it notices, and join it before tearing anything down.
	 */
	if ((sc->sc_flags & FSS_BS_THREAD) != 0) {
		mutex_enter(&sc->sc_slock);
		sc->sc_flags &= ~FSS_BS_THREAD;
		cv_signal(&sc->sc_work_cv);
		mutex_exit(&sc->sc_slock);
		kthread_join(sc->sc_bs_lwp);

		disk_detach(sc->sc_dkdev);
	}

	/* All pointers are NULLed so this function is idempotent. */
	if (sc->sc_copied != NULL)
		kmem_free(sc->sc_copied, howmany(sc->sc_clcount, NBBY));
	sc->sc_copied = NULL;

	if (sc->sc_cache != NULL) {
		for (i = 0; i < sc->sc_cache_size; i++)
			if (sc->sc_cache[i].fc_data != NULL) {
				cv_destroy(&sc->sc_cache[i].fc_state_cv);
				kmem_free(sc->sc_cache[i].fc_data,
				    FSS_CLSIZE(sc));
			}
		kmem_free(sc->sc_cache,
		    sc->sc_cache_size*sizeof(struct fss_cache));
	}
	sc->sc_cache = NULL;

	if (sc->sc_indir_valid != NULL)
		kmem_free(sc->sc_indir_valid, howmany(sc->sc_indir_size, NBBY));
	sc->sc_indir_valid = NULL;

	if (sc->sc_indir_data != NULL)
		kmem_free(sc->sc_indir_data, FSS_CLSIZE(sc));
	sc->sc_indir_data = NULL;
}
566
567/*
568 * Set all active snapshots on this file system into ERROR state.
569 */
570static void
571fss_unmount_hook(struct mount *mp)
572{
573 int i;
574 struct fss_softc *sc;
575
576 mutex_enter(&fss_device_lock);
577 for (i = 0; i < fss_cd.cd_ndevs; i++) {
578 if ((sc = device_lookup_private(&fss_cd, i)) == NULL)
579 continue;
580 mutex_enter(&sc->sc_slock);
581 if ((sc->sc_flags & FSS_ACTIVE) != 0 && sc->sc_mount == mp)
582 fss_error(sc, "forced by unmount");
583 mutex_exit(&sc->sc_slock);
584 }
585 mutex_exit(&fss_device_lock);
586}
587
588/*
589 * A buffer is written to the snapshotted block device. Copy to
590 * backing store if needed.
591 */
static int
fss_copy_on_write(void *v, struct buf *bp, bool data_valid)
{
	int error;
	u_int32_t cl, ch, c;
	struct fss_softc *sc = v;

	mutex_enter(&sc->sc_slock);
	/* Nothing to preserve once the snapshot is gone or errored. */
	if (!FSS_ISVALID(sc)) {
		mutex_exit(&sc->sc_slock);
		return 0;
	}

	/* Cluster range [cl, ch] touched by this write. */
	cl = FSS_BTOCL(sc, dbtob(bp->b_blkno));
	ch = FSS_BTOCL(sc, dbtob(bp->b_blkno)+bp->b_bcount-1);
	error = 0;
	/*
	 * The pagedaemon must not sleep copying clusters -- if any
	 * needed cluster is not yet saved, fail with ENOMEM instead.
	 */
	if (curlwp == uvm.pagedaemon_lwp) {
		for (c = cl; c <= ch; c++)
			if (isclr(sc->sc_copied, c)) {
				error = ENOMEM;
				break;
			}
	}
	mutex_exit(&sc->sc_slock);

	/* Copy each affected cluster to the cache before the write. */
	if (error == 0)
		for (c = cl; c <= ch; c++) {
			error = fss_read_cluster(sc, c);
			if (error)
				break;
		}

	return error;
}
626
627/*
628 * Lookup and open needed files.
629 *
630 * For file system internal snapshot initializes sc_mntname, sc_mount,
631 * sc_bs_vp and sc_time.
632 *
633 * Otherwise returns dev and size of the underlying block device.
634 * Initializes sc_mntname, sc_mount, sc_bdev, sc_bs_vp and sc_mount
635 */
static int
fss_create_files(struct fss_softc *sc, struct fss_set *fss,
    off_t *bsize, struct lwp *l)
{
	int error, bits, fsbsize;
	uint64_t numsec;
	unsigned int secsize;
	struct timespec ts;
	/* distinct nd2/pb2 names so partially updated namei calls stand out */
	struct pathbuf *pb2;
	struct nameidata nd2;
	struct vnode *vp;

	/*
	 * Get the mounted file system.
	 */

	error = namei_simple_user(fss->fss_mount,
	    NSM_FOLLOW_NOEMULROOT, &vp);
	if (error != 0)
		return error;

	/* fss_mount must name the root of a mounted file system. */
	if ((vp->v_vflag & VV_ROOT) != VV_ROOT) {
		vrele(vp);
		return EINVAL;
	}

	sc->sc_mount = vp->v_mount;
	memcpy(sc->sc_mntname, sc->sc_mount->mnt_stat.f_mntonname, MNAMELEN);

	vrele(vp);

	/*
	 * Check for file system internal snapshot: the backing store is
	 * a regular file on the snapshotted file system itself.
	 */

	error = namei_simple_user(fss->fss_bstore,
	    NSM_FOLLOW_NOEMULROOT, &vp);
	if (error != 0)
		return error;

	if (vp->v_type == VREG && vp->v_mount == sc->sc_mount) {
		/* vp's reference is kept in sc_bs_vp; caller cleans up. */
		sc->sc_flags |= FSS_PERSISTENT;
		sc->sc_bs_vp = vp;

		/* Find the shift matching the file system block size. */
		fsbsize = sc->sc_bs_vp->v_mount->mnt_stat.f_iosize;
		bits = sizeof(sc->sc_bs_bshift)*NBBY;
		for (sc->sc_bs_bshift = 1; sc->sc_bs_bshift < bits;
		    sc->sc_bs_bshift++)
			if (FSS_FSBSIZE(sc) == fsbsize)
				break;
		if (sc->sc_bs_bshift >= bits)
			return EINVAL;

		sc->sc_bs_bmask = FSS_FSBSIZE(sc)-1;
		sc->sc_clshift = 0;

		if ((fss->fss_flags & FSS_UNLINK_ON_CREATE) != 0) {
			error = do_sys_unlink(fss->fss_bstore, UIO_USERSPACE);
			if (error)
				return error;
		}
		error = vn_lock(vp, LK_EXCLUSIVE);
		if (error != 0)
			return error;
		/* Let the file system take the snapshot itself. */
		error = VFS_SNAPSHOT(sc->sc_mount, sc->sc_bs_vp, &ts);
		TIMESPEC_TO_TIMEVAL(&sc->sc_time, &ts);

		VOP_UNLOCK(sc->sc_bs_vp);

		return error;
	}
	vrele(vp);

	/*
	 * Get the block device it is mounted on and its size.
	 */

	error = spec_node_lookup_by_mount(sc->sc_mount, &vp);
	if (error)
		return error;
	sc->sc_bdev = vp->v_rdev;

	error = getdisksize(vp, &numsec, &secsize);
	vrele(vp);
	if (error)
		return error;

	*bsize = (off_t)numsec*secsize;

	/*
	 * Get the backing store
	 */

	error = pathbuf_copyin(fss->fss_bstore, &pb2);
	if (error) {
		return error;
	}
	NDINIT(&nd2, LOOKUP, FOLLOW, pb2);
	if ((error = vn_open(&nd2, FREAD|FWRITE, 0)) != 0) {
		pathbuf_destroy(pb2);
		return error;
	}
	VOP_UNLOCK(nd2.ni_vp);

	/* Open reference kept in sc_bs_vp; caller closes it on error. */
	sc->sc_bs_vp = nd2.ni_vp;

	if (nd2.ni_vp->v_type != VREG && nd2.ni_vp->v_type != VCHR) {
		pathbuf_destroy(pb2);
		return EINVAL;
	}
	pathbuf_destroy(pb2);

	if ((fss->fss_flags & FSS_UNLINK_ON_CREATE) != 0) {
		error = do_sys_unlink(fss->fss_bstore, UIO_USERSPACE);
		if (error)
			return error;
	}
	if (sc->sc_bs_vp->v_type == VREG) {
		/* Backing store block size must be a power of two. */
		fsbsize = sc->sc_bs_vp->v_mount->mnt_stat.f_iosize;
		if (fsbsize & (fsbsize-1)) /* No power of two */
			return EINVAL;
		for (sc->sc_bs_bshift = 1; sc->sc_bs_bshift < 32;
		    sc->sc_bs_bshift++)
			if (FSS_FSBSIZE(sc) == fsbsize)
				break;
		if (sc->sc_bs_bshift >= 32)
			return EINVAL;
		sc->sc_bs_bmask = FSS_FSBSIZE(sc)-1;
	} else {
		/* Character/block device backing store: use DEV_BSIZE. */
		sc->sc_bs_bshift = DEV_BSHIFT;
		sc->sc_bs_bmask = FSS_FSBSIZE(sc)-1;
	}

	return 0;
}
772
773/*
774 * Create a snapshot.
775 */
776static int
777fss_create_snapshot(struct fss_softc *sc, struct fss_set *fss, struct lwp *l)
778{
779 int len, error;
780 u_int32_t csize;
781 off_t bsize;
782
783 bsize = 0; /* XXX gcc */
784
785 /*
786 * Open needed files.
787 */
788 if ((error = fss_create_files(sc, fss, &bsize, l)) != 0)
789 goto bad;
790
791 if (sc->sc_flags & FSS_PERSISTENT) {
792 fss_softc_alloc(sc);
793 sc->sc_flags |= FSS_ACTIVE;
794 return 0;
795 }
796
797 /*
798 * Set cluster size. Must be a power of two and
799 * a multiple of backing store block size.
800 */
801 if (fss->fss_csize <= 0)
802 csize = MAXPHYS;
803 else
804 csize = fss->fss_csize;
805 if (bsize/csize > FSS_CLUSTER_MAX)
806 csize = bsize/FSS_CLUSTER_MAX+1;
807
808 for (sc->sc_clshift = sc->sc_bs_bshift; sc->sc_clshift < 32;
809 sc->sc_clshift++)
810 if (FSS_CLSIZE(sc) >= csize)
811 break;
812 if (sc->sc_clshift >= 32) {
813 error = EINVAL;
814 goto bad;
815 }
816 sc->sc_clmask = FSS_CLSIZE(sc)-1;
817
818 /*
819 * Set number of cache slots.
820 */
821 if (FSS_CLSIZE(sc) <= 8192)
822 sc->sc_cache_size = 32;
823 else if (FSS_CLSIZE(sc) <= 65536)
824 sc->sc_cache_size = 8;
825 else
826 sc->sc_cache_size = 4;
827
828 /*
829 * Set number of clusters and size of last cluster.
830 */
831 sc->sc_clcount = FSS_BTOCL(sc, bsize-1)+1;
832 sc->sc_clresid = FSS_CLOFF(sc, bsize-1)+1;
833
834 /*
835 * Set size of indirect table.
836 */
837 len = sc->sc_clcount*sizeof(u_int32_t);
838 sc->sc_indir_size = FSS_BTOCL(sc, len)+1;
839 sc->sc_clnext = sc->sc_indir_size;
840 sc->sc_indir_cur = 0;
841
842 if ((error = fss_softc_alloc(sc)) != 0)
843 goto bad;
844
845 /*
846 * Activate the snapshot.
847 */
848
849 if ((error = vfs_suspend(sc->sc_mount, 0)) != 0)
850 goto bad;
851
852 microtime(&sc->sc_time);
853
854 error = fscow_establish(sc->sc_mount, fss_copy_on_write, sc);
855 if (error == 0)
856 sc->sc_flags |= FSS_ACTIVE;
857
858 vfs_resume(sc->sc_mount);
859
860 if (error != 0)
861 goto bad;
862
863 aprint_debug_dev(sc->sc_dev, "%s snapshot active\n", sc->sc_mntname);
864 aprint_debug_dev(sc->sc_dev,
865 "%u clusters of %u, %u cache slots, %u indir clusters\n",
866 sc->sc_clcount, FSS_CLSIZE(sc),
867 sc->sc_cache_size, sc->sc_indir_size);
868
869 return 0;
870
871bad:
872 fss_softc_free(sc);
873 if (sc->sc_bs_vp != NULL) {
874 if (sc->sc_flags & FSS_PERSISTENT)
875 vrele(sc->sc_bs_vp);
876 else
877 vn_close(sc->sc_bs_vp, FREAD|FWRITE, l->l_cred);
878 }
879 sc->sc_bs_vp = NULL;
880
881 return error;
882}
883
884/*
885 * Delete a snapshot.
886 */
static int
fss_delete_snapshot(struct fss_softc *sc, struct lwp *l)
{

	/*
	 * Remove the copy-on-write handler first unless it is already
	 * gone (persistent snapshots never had one; fss_error() removes
	 * it when setting FSS_ERROR).
	 */
	if ((sc->sc_flags & (FSS_PERSISTENT | FSS_ERROR)) == 0)
		fscow_disestablish(sc->sc_mount, fss_copy_on_write, sc);

	mutex_enter(&sc->sc_slock);
	sc->sc_flags &= ~(FSS_ACTIVE|FSS_ERROR);
	sc->sc_mount = NULL;
	sc->sc_bdev = NODEV;
	mutex_exit(&sc->sc_slock);

	/* Stop the worker thread, free buffers, release backing store. */
	fss_softc_free(sc);
	if (sc->sc_flags & FSS_PERSISTENT)
		vrele(sc->sc_bs_vp);
	else
		vn_close(sc->sc_bs_vp, FREAD|FWRITE, l->l_cred);
	sc->sc_bs_vp = NULL;
	sc->sc_flags &= ~FSS_PERSISTENT;

	return 0;
}
910
911/*
912 * Read a cluster from the snapshotted block device to the cache.
913 */
static int
fss_read_cluster(struct fss_softc *sc, u_int32_t cl)
{
	int error, todo, offset, len;
	daddr_t dblk;
	struct buf *bp, *mbp;
	struct fss_cache *scp, *scl;

	/*
	 * Get a free cache slot.
	 */
	scl = sc->sc_cache+sc->sc_cache_size;

	mutex_enter(&sc->sc_slock);

restart:
	/* Already saved, or snapshot gone: nothing to do. */
	if (isset(sc->sc_copied, cl) || !FSS_ISVALID(sc)) {
		mutex_exit(&sc->sc_slock);
		return 0;
	}

	/*
	 * If another thread is already handling this cluster, either
	 * it is done (VALID) or we wait for it to finish (BUSY).
	 */
	for (scp = sc->sc_cache; scp < scl; scp++)
		if (scp->fc_cluster == cl) {
			if (scp->fc_type == FSS_CACHE_VALID) {
				mutex_exit(&sc->sc_slock);
				return 0;
			} else if (scp->fc_type == FSS_CACHE_BUSY) {
				cv_wait(&scp->fc_state_cv, &sc->sc_slock);
				goto restart;
			}
		}

	/* Claim a free slot, or wait until one is released. */
	for (scp = sc->sc_cache; scp < scl; scp++)
		if (scp->fc_type == FSS_CACHE_FREE) {
			scp->fc_type = FSS_CACHE_BUSY;
			scp->fc_cluster = cl;
			break;
		}
	if (scp >= scl) {
		cv_wait(&sc->sc_cache_cv, &sc->sc_slock);
		goto restart;
	}

	mutex_exit(&sc->sc_slock);

	/*
	 * Start the read.
	 */
	dblk = btodb(FSS_CLTOB(sc, cl));
	if (cl == sc->sc_clcount-1) {
		/* Last cluster may be partial: zero-fill the tail. */
		todo = sc->sc_clresid;
		memset((char *)scp->fc_data + todo, 0, FSS_CLSIZE(sc) - todo);
	} else
		todo = FSS_CLSIZE(sc);
	offset = 0;
	/* Master buffer; MAXPHYS-sized pieces are nested below it. */
	mbp = getiobuf(NULL, true);
	mbp->b_bufsize = todo;
	mbp->b_data = scp->fc_data;
	mbp->b_resid = mbp->b_bcount = todo;
	mbp->b_flags = B_READ;
	mbp->b_cflags = BC_BUSY;
	mbp->b_dev = sc->sc_bdev;
	while (todo > 0) {
		len = todo;
		if (len > MAXPHYS)
			len = MAXPHYS;
		/* Single-piece transfer: use the master buffer directly. */
		if (btodb(FSS_CLTOB(sc, cl)) == dblk && len == todo)
			bp = mbp;
		else {
			bp = getiobuf(NULL, true);
			nestiobuf_setup(mbp, bp, offset, len);
		}
		bp->b_lblkno = 0;
		bp->b_blkno = dblk;
		bdev_strategy(bp);
		dblk += btodb(len);
		offset += len;
		todo -= len;
	}
	error = biowait(mbp);
	putiobuf(mbp);

	/* Publish the result and wake anyone waiting on this cluster. */
	mutex_enter(&sc->sc_slock);
	scp->fc_type = (error ? FSS_CACHE_FREE : FSS_CACHE_VALID);
	cv_broadcast(&scp->fc_state_cv);
	if (error == 0) {
		setbit(sc->sc_copied, scp->fc_cluster);
		cv_signal(&sc->sc_work_cv);
	}
	mutex_exit(&sc->sc_slock);

	return error;
}
1007
1008/*
1009 * Read/write clusters from/to backing store.
1010 * For persistent snapshots must be called with cl == 0. off is the
1011 * offset into the snapshot.
1012 */
static int
fss_bs_io(struct fss_softc *sc, fss_io_type rw,
    u_int32_t cl, off_t off, int len, void *data)
{
	int error;

	/* Translate (cluster, offset) into a byte offset. */
	off += FSS_CLTOB(sc, cl);

	vn_lock(sc->sc_bs_vp, LK_EXCLUSIVE|LK_RETRY);

	error = vn_rdwr((rw == FSS_READ ? UIO_READ : UIO_WRITE), sc->sc_bs_vp,
	    data, len, off, UIO_SYSSPACE,
	    IO_ADV_ENCODE(POSIX_FADV_NOREUSE) | IO_NODELOCKED,
	    sc->sc_bs_lwp->l_cred, NULL, NULL);
	if (error == 0) {
		/* Flush and free the pages to keep them out of the cache. */
		mutex_enter(sc->sc_bs_vp->v_interlock);
		error = VOP_PUTPAGES(sc->sc_bs_vp, trunc_page(off),
		    round_page(off+len), PGO_CLEANIT | PGO_FREE | PGO_SYNCIO);
	}

	VOP_UNLOCK(sc->sc_bs_vp);

	return error;
}
1037
1038/*
1039 * Get a pointer to the indirect slot for this cluster.
1040 */
static u_int32_t *
fss_bs_indir(struct fss_softc *sc, u_int32_t cl)
{
	u_int32_t icl;
	int ioff;

	/* Which indirect cluster holds this entry, and where in it. */
	icl = cl/(FSS_CLSIZE(sc)/sizeof(u_int32_t));
	ioff = cl%(FSS_CLSIZE(sc)/sizeof(u_int32_t));

	/* Fast path: the wanted indirect cluster is already in core. */
	if (sc->sc_indir_cur == icl)
		return &sc->sc_indir_data[ioff];

	/* Write back the current indirect cluster if it was modified. */
	if (sc->sc_indir_dirty) {
		if (fss_bs_io(sc, FSS_WRITE, sc->sc_indir_cur, 0,
		    FSS_CLSIZE(sc), (void *)sc->sc_indir_data) != 0)
			return NULL;
		setbit(sc->sc_indir_valid, sc->sc_indir_cur);
	}

	sc->sc_indir_dirty = 0;
	sc->sc_indir_cur = icl;

	/* Load the wanted cluster, or start fresh if never written. */
	if (isset(sc->sc_indir_valid, sc->sc_indir_cur)) {
		if (fss_bs_io(sc, FSS_READ, sc->sc_indir_cur, 0,
		    FSS_CLSIZE(sc), (void *)sc->sc_indir_data) != 0)
			return NULL;
	} else
		memset(sc->sc_indir_data, 0, FSS_CLSIZE(sc));

	return &sc->sc_indir_data[ioff];
}
1072
1073/*
1074 * The kernel thread (one for every active snapshot).
1075 *
1076 * After wakeup it cleans the cache and runs the I/O requests.
1077 */
static void
fss_bs_thread(void *arg)
{
	bool thread_idle, is_valid;
	int error, i, todo, len, crotor, is_read;
	long off;
	char *addr;
	u_int32_t c, cl, ch, *indirp;
	struct buf *bp, *nbp;
	struct fss_softc *sc;
	struct fss_cache *scp, *scl;

	sc = arg;
	scl = sc->sc_cache+sc->sc_cache_size;
	crotor = 0;		/* rotor for round-robin cache cleaning */
	thread_idle = false;

	mutex_enter(&sc->sc_slock);

	/* Main loop; sc_slock is held except across actual I/O. */
	for (;;) {
		if (thread_idle)
			cv_wait(&sc->sc_work_cv, &sc->sc_slock);
		thread_idle = true;
		/* FSS_BS_THREAD cleared by fss_softc_free(): exit. */
		if ((sc->sc_flags & FSS_BS_THREAD) == 0) {
			mutex_exit(&sc->sc_slock);
			kthread_exit(0);
		}

		/*
		 * Process I/O requests (persistent)
		 */

		if (sc->sc_flags & FSS_PERSISTENT) {
			if ((bp = bufq_get(sc->sc_bufq)) == NULL)
				continue;
			is_valid = FSS_ISVALID(sc);
			is_read = (bp->b_flags & B_READ);
			thread_idle = false;
			mutex_exit(&sc->sc_slock);

			/* Read directly from the snapshot file. */
			if (is_valid) {
				disk_busy(sc->sc_dkdev);
				error = fss_bs_io(sc, FSS_READ, 0,
				    dbtob(bp->b_blkno), bp->b_bcount,
				    bp->b_data);
				disk_unbusy(sc->sc_dkdev,
				    (error ? 0 : bp->b_bcount), is_read);
			} else
				error = ENXIO;

			bp->b_error = error;
			bp->b_resid = (error ? bp->b_bcount : 0);
			biodone(bp);

			mutex_enter(&sc->sc_slock);
			continue;
		}

		/*
		 * Clean the cache: flush at most one VALID slot to the
		 * backing store per iteration, round-robin via crotor.
		 */
		for (i = 0; i < sc->sc_cache_size; i++) {
			crotor = (crotor + 1) % sc->sc_cache_size;
			scp = sc->sc_cache + crotor;
			if (scp->fc_type != FSS_CACHE_VALID)
				continue;
			mutex_exit(&sc->sc_slock);

			thread_idle = false;
			indirp = fss_bs_indir(sc, scp->fc_cluster);
			if (indirp != NULL) {
				error = fss_bs_io(sc, FSS_WRITE, sc->sc_clnext,
				    0, FSS_CLSIZE(sc), scp->fc_data);
			} else
				error = EIO;

			mutex_enter(&sc->sc_slock);
			if (error == 0) {
				/* Record where the cluster landed. */
				*indirp = sc->sc_clnext++;
				sc->sc_indir_dirty = 1;
			} else
				fss_error(sc, "write error on backing store");

			scp->fc_type = FSS_CACHE_FREE;
			cv_broadcast(&sc->sc_cache_cv);
			break;
		}

		/*
		 * Process I/O requests
		 */
		if ((bp = bufq_get(sc->sc_bufq)) == NULL)
			continue;
		is_valid = FSS_ISVALID(sc);
		is_read = (bp->b_flags & B_READ);
		thread_idle = false;

		if (!is_valid) {
			mutex_exit(&sc->sc_slock);

			bp->b_error = ENXIO;
			bp->b_resid = bp->b_bcount;
			biodone(bp);

			mutex_enter(&sc->sc_slock);
			continue;
		}

		disk_busy(sc->sc_dkdev);

		/*
		 * First read from the snapshotted block device unless
		 * this request is completely covered by backing store.
		 */

		cl = FSS_BTOCL(sc, dbtob(bp->b_blkno));
		off = FSS_CLOFF(sc, dbtob(bp->b_blkno));
		ch = FSS_BTOCL(sc, dbtob(bp->b_blkno)+bp->b_bcount-1);
		error = 0;
		bp->b_resid = 0;
		bp->b_error = 0;
		for (c = cl; c <= ch; c++) {
			if (isset(sc->sc_copied, c))
				continue;
			mutex_exit(&sc->sc_slock);

			/* Not on backing store, read from device. */
			nbp = getiobuf(NULL, true);
			nbp->b_flags = B_READ;
			nbp->b_resid = nbp->b_bcount = bp->b_bcount;
			nbp->b_bufsize = bp->b_bcount;
			nbp->b_data = bp->b_data;
			nbp->b_blkno = bp->b_blkno;
			nbp->b_lblkno = 0;
			nbp->b_dev = sc->sc_bdev;
			SET(nbp->b_cflags, BC_BUSY); /* mark buffer busy */

			bdev_strategy(nbp);

			error = biowait(nbp);
			if (error != 0) {
				bp->b_resid = bp->b_bcount;
				bp->b_error = nbp->b_error;
				disk_unbusy(sc->sc_dkdev, 0, is_read);
				biodone(bp);
			}
			putiobuf(nbp);

			mutex_enter(&sc->sc_slock);
			break;
		}
		/* Device read failed: bp already completed above. */
		if (error)
			continue;

		/*
		 * Replace those parts that have been saved to backing store.
		 */

		addr = bp->b_data;
		todo = bp->b_bcount;
		for (c = cl; c <= ch; c++, off = 0, todo -= len, addr += len) {
			len = FSS_CLSIZE(sc)-off;
			if (len > todo)
				len = todo;
			if (isclr(sc->sc_copied, c))
				continue;
			mutex_exit(&sc->sc_slock);

			indirp = fss_bs_indir(sc, c);
			if (indirp == NULL || *indirp == 0) {
				/*
				 * Not on backing store. Either in cache
				 * or hole in the snapshotted block device.
				 */

				mutex_enter(&sc->sc_slock);
				for (scp = sc->sc_cache; scp < scl; scp++)
					if (scp->fc_type == FSS_CACHE_VALID &&
					    scp->fc_cluster == c)
						break;
				if (scp < scl)
					memcpy(addr, (char *)scp->fc_data+off,
					    len);
				else
					memset(addr, 0, len);
				continue;
			}

			/*
			 * Read from backing store.
			 */
			error =
			    fss_bs_io(sc, FSS_READ, *indirp, off, len, addr);

			mutex_enter(&sc->sc_slock);
			if (error) {
				bp->b_resid = bp->b_bcount;
				bp->b_error = error;
				break;
			}
		}
		mutex_exit(&sc->sc_slock);

		disk_unbusy(sc->sc_dkdev, (error ? 0 : bp->b_bcount), is_read);
		biodone(bp);

		mutex_enter(&sc->sc_slock);
	}
}
1287
1288#ifdef _MODULE
1289
1290#include <sys/module.h>
1291
/* Loadable module glue: driver class module with its own cfdriver. */
MODULE(MODULE_CLASS_DRIVER, fss, NULL);
CFDRIVER_DECL(fss, DV_DISK, NULL);

/* Device majors assigned by devsw_attach() at module load. */
devmajor_t fss_bmajor = -1, fss_cmajor = -1;
1296
/*
 * Module control: register (INIT) or unregister (FINI) the cfdriver,
 * cfattach and device switch entries.  Each failure path unwinds the
 * registrations made so far, in reverse order.
 */
static int
fss_modcmd(modcmd_t cmd, void *arg)
{
	int error = 0;

	switch (cmd) {
	case MODULE_CMD_INIT:
		mutex_init(&fss_device_lock, MUTEX_DEFAULT, IPL_NONE);
		error = config_cfdriver_attach(&fss_cd);
		if (error) {
			mutex_destroy(&fss_device_lock);
			break;
		}
		error = config_cfattach_attach(fss_cd.cd_name, &fss_ca);
		if (error) {
			config_cfdriver_detach(&fss_cd);
			mutex_destroy(&fss_device_lock);
			break;
		}
		error = devsw_attach(fss_cd.cd_name,
		    &fss_bdevsw, &fss_bmajor, &fss_cdevsw, &fss_cmajor);

		if (error) {
			config_cfattach_detach(fss_cd.cd_name, &fss_ca);
			config_cfdriver_detach(&fss_cd);
			mutex_destroy(&fss_device_lock);
			break;
		}
		break;

	case MODULE_CMD_FINI:
		devsw_detach(&fss_bdevsw, &fss_cdevsw);
		error = config_cfattach_detach(fss_cd.cd_name, &fss_ca);
		if (error) {
			/* Units still attached: re-register and refuse. */
			devsw_attach(fss_cd.cd_name, &fss_bdevsw, &fss_bmajor,
			    &fss_cdevsw, &fss_cmajor);
			break;
		}
		config_cfdriver_detach(&fss_cd);
		mutex_destroy(&fss_device_lock);
		break;

	default:
		error = ENOTTY;
		break;
	}

	return error;
}
1346
1347#endif /* _MODULE */
1348