1 | /* $NetBSD: vfs_vnops.c,v 1.193 2015/02/04 07:09:37 msaitoh Exp $ */ |
2 | |
3 | /*- |
4 | * Copyright (c) 2009 The NetBSD Foundation, Inc. |
5 | * All rights reserved. |
6 | * |
7 | * This code is derived from software contributed to The NetBSD Foundation |
8 | * by Andrew Doran. |
9 | * |
10 | * Redistribution and use in source and binary forms, with or without |
11 | * modification, are permitted provided that the following conditions |
12 | * are met: |
13 | * 1. Redistributions of source code must retain the above copyright |
14 | * notice, this list of conditions and the following disclaimer. |
15 | * 2. Redistributions in binary form must reproduce the above copyright |
16 | * notice, this list of conditions and the following disclaimer in the |
17 | * documentation and/or other materials provided with the distribution. |
18 | * |
19 | * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS |
20 | * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED |
21 | * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR |
22 | * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS |
23 | * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR |
24 | * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF |
25 | * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS |
26 | * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN |
27 | * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) |
28 | * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE |
29 | * POSSIBILITY OF SUCH DAMAGE. |
30 | */ |
31 | |
32 | /* |
33 | * Copyright (c) 1982, 1986, 1989, 1993 |
34 | * The Regents of the University of California. All rights reserved. |
35 | * (c) UNIX System Laboratories, Inc. |
36 | * All or some portions of this file are derived from material licensed |
37 | * to the University of California by American Telephone and Telegraph |
38 | * Co. or Unix System Laboratories, Inc. and are reproduced herein with |
39 | * the permission of UNIX System Laboratories, Inc. |
40 | * |
41 | * Redistribution and use in source and binary forms, with or without |
42 | * modification, are permitted provided that the following conditions |
43 | * are met: |
44 | * 1. Redistributions of source code must retain the above copyright |
45 | * notice, this list of conditions and the following disclaimer. |
46 | * 2. Redistributions in binary form must reproduce the above copyright |
47 | * notice, this list of conditions and the following disclaimer in the |
48 | * documentation and/or other materials provided with the distribution. |
49 | * 3. Neither the name of the University nor the names of its contributors |
50 | * may be used to endorse or promote products derived from this software |
51 | * without specific prior written permission. |
52 | * |
53 | * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND |
54 | * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE |
55 | * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE |
56 | * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE |
57 | * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL |
58 | * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS |
59 | * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) |
60 | * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT |
61 | * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY |
62 | * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF |
63 | * SUCH DAMAGE. |
64 | * |
65 | * @(#)vfs_vnops.c 8.14 (Berkeley) 6/15/95 |
66 | */ |
67 | |
68 | #include <sys/cdefs.h> |
69 | __KERNEL_RCSID(0, "$NetBSD: vfs_vnops.c,v 1.193 2015/02/04 07:09:37 msaitoh Exp $" ); |
70 | |
71 | #include "veriexec.h" |
72 | |
73 | #include <sys/param.h> |
74 | #include <sys/systm.h> |
75 | #include <sys/kernel.h> |
76 | #include <sys/file.h> |
77 | #include <sys/stat.h> |
78 | #include <sys/buf.h> |
79 | #include <sys/proc.h> |
80 | #include <sys/mount.h> |
81 | #include <sys/namei.h> |
82 | #include <sys/vnode.h> |
83 | #include <sys/ioctl.h> |
84 | #include <sys/tty.h> |
85 | #include <sys/poll.h> |
86 | #include <sys/kauth.h> |
87 | #include <sys/syslog.h> |
88 | #include <sys/fstrans.h> |
89 | #include <sys/atomic.h> |
90 | #include <sys/filedesc.h> |
91 | #include <sys/wapbl.h> |
92 | #include <sys/mman.h> |
93 | |
94 | #include <miscfs/specfs/specdev.h> |
95 | #include <miscfs/fifofs/fifo.h> |
96 | |
97 | #include <uvm/uvm_extern.h> |
98 | #include <uvm/uvm_readahead.h> |
99 | #include <uvm/uvm_device.h> |
100 | |
101 | #ifdef UNION |
102 | #include <fs/union/union.h> |
103 | #endif |
104 | |
/*
 * Fallback for COMPAT_ZERODEV(): vn_mmap() uses it to recognise extra
 * device numbers that should behave like the zero device.  When nothing
 * has defined it before this point, no additional device matches.
 */
#if !defined(COMPAT_ZERODEV)
#define	COMPAT_ZERODEV(dev)	(0)
#endif
108 | |
/*
 * Optional hook consulted by vn_readdir() when a directory read returns
 * no entries.  The hook may substitute a different vnode through its
 * first argument, in which case the read is retried on that vnode.
 */
int (*vn_union_readdir_hook)(struct vnode **, struct file *, struct lwp *);
110 | |
111 | #include <sys/verified_exec.h> |
112 | |
113 | static int vn_read(file_t *fp, off_t *offset, struct uio *uio, |
114 | kauth_cred_t cred, int flags); |
115 | static int vn_write(file_t *fp, off_t *offset, struct uio *uio, |
116 | kauth_cred_t cred, int flags); |
117 | static int vn_closefile(file_t *fp); |
118 | static int vn_poll(file_t *fp, int events); |
119 | static int vn_fcntl(file_t *fp, u_int com, void *data); |
120 | static int vn_statfile(file_t *fp, struct stat *sb); |
121 | static int vn_ioctl(file_t *fp, u_long com, void *data); |
122 | static int vn_mmap(struct file *, off_t *, size_t, int, int *, int *, |
123 | struct uvm_object **, int *); |
124 | |
125 | const struct fileops vnops = { |
126 | .fo_read = vn_read, |
127 | .fo_write = vn_write, |
128 | .fo_ioctl = vn_ioctl, |
129 | .fo_fcntl = vn_fcntl, |
130 | .fo_poll = vn_poll, |
131 | .fo_stat = vn_statfile, |
132 | .fo_close = vn_closefile, |
133 | .fo_kqfilter = vn_kqfilter, |
134 | .fo_restart = fnullop_restart, |
135 | .fo_mmap = vn_mmap, |
136 | }; |
137 | |
138 | /* |
139 | * Common code for vnode open operations. |
140 | * Check permissions, and call the VOP_OPEN or VOP_CREATE routine. |
141 | */ |
142 | int |
143 | vn_open(struct nameidata *ndp, int fmode, int cmode) |
144 | { |
145 | struct vnode *vp; |
146 | struct lwp *l = curlwp; |
147 | kauth_cred_t cred = l->l_cred; |
148 | struct vattr va; |
149 | int error; |
150 | const char *pathstring; |
151 | |
152 | if ((fmode & (O_CREAT | O_DIRECTORY)) == (O_CREAT | O_DIRECTORY)) |
153 | return EINVAL; |
154 | |
155 | ndp->ni_cnd.cn_flags &= TRYEMULROOT | NOCHROOT; |
156 | |
157 | if (fmode & O_CREAT) { |
158 | ndp->ni_cnd.cn_nameiop = CREATE; |
159 | ndp->ni_cnd.cn_flags |= LOCKPARENT | LOCKLEAF; |
160 | if ((fmode & O_EXCL) == 0 && |
161 | ((fmode & O_NOFOLLOW) == 0)) |
162 | ndp->ni_cnd.cn_flags |= FOLLOW; |
163 | } else { |
164 | ndp->ni_cnd.cn_nameiop = LOOKUP; |
165 | ndp->ni_cnd.cn_flags |= LOCKLEAF; |
166 | if ((fmode & O_NOFOLLOW) == 0) |
167 | ndp->ni_cnd.cn_flags |= FOLLOW; |
168 | } |
169 | |
170 | pathstring = pathbuf_stringcopy_get(ndp->ni_pathbuf); |
171 | if (pathstring == NULL) { |
172 | return ENOMEM; |
173 | } |
174 | |
175 | error = namei(ndp); |
176 | if (error) |
177 | goto out; |
178 | |
179 | vp = ndp->ni_vp; |
180 | |
181 | #if NVERIEXEC > 0 |
182 | error = veriexec_openchk(l, ndp->ni_vp, pathstring, fmode); |
183 | if (error) { |
184 | /* We have to release the locks ourselves */ |
185 | if (fmode & O_CREAT) { |
186 | if (vp == NULL) { |
187 | vput(ndp->ni_dvp); |
188 | } else { |
189 | VOP_ABORTOP(ndp->ni_dvp, &ndp->ni_cnd); |
190 | if (ndp->ni_dvp == ndp->ni_vp) |
191 | vrele(ndp->ni_dvp); |
192 | else |
193 | vput(ndp->ni_dvp); |
194 | ndp->ni_dvp = NULL; |
195 | vput(vp); |
196 | } |
197 | } else { |
198 | vput(vp); |
199 | } |
200 | goto out; |
201 | } |
202 | #endif /* NVERIEXEC > 0 */ |
203 | |
204 | if (fmode & O_CREAT) { |
205 | if (ndp->ni_vp == NULL) { |
206 | vattr_null(&va); |
207 | va.va_type = VREG; |
208 | va.va_mode = cmode; |
209 | if (fmode & O_EXCL) |
210 | va.va_vaflags |= VA_EXCLUSIVE; |
211 | error = VOP_CREATE(ndp->ni_dvp, &ndp->ni_vp, |
212 | &ndp->ni_cnd, &va); |
213 | vput(ndp->ni_dvp); |
214 | if (error) |
215 | goto out; |
216 | fmode &= ~O_TRUNC; |
217 | vp = ndp->ni_vp; |
218 | vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); |
219 | } else { |
220 | VOP_ABORTOP(ndp->ni_dvp, &ndp->ni_cnd); |
221 | if (ndp->ni_dvp == ndp->ni_vp) |
222 | vrele(ndp->ni_dvp); |
223 | else |
224 | vput(ndp->ni_dvp); |
225 | ndp->ni_dvp = NULL; |
226 | vp = ndp->ni_vp; |
227 | if (fmode & O_EXCL) { |
228 | error = EEXIST; |
229 | goto bad; |
230 | } |
231 | fmode &= ~O_CREAT; |
232 | } |
233 | } else { |
234 | vp = ndp->ni_vp; |
235 | } |
236 | if (vp->v_type == VSOCK) { |
237 | error = EOPNOTSUPP; |
238 | goto bad; |
239 | } |
240 | if (ndp->ni_vp->v_type == VLNK) { |
241 | error = EFTYPE; |
242 | goto bad; |
243 | } |
244 | |
245 | if ((fmode & O_CREAT) == 0) { |
246 | error = vn_openchk(vp, cred, fmode); |
247 | if (error != 0) |
248 | goto bad; |
249 | } |
250 | |
251 | if (fmode & O_TRUNC) { |
252 | vattr_null(&va); |
253 | va.va_size = 0; |
254 | error = VOP_SETATTR(vp, &va, cred); |
255 | if (error != 0) |
256 | goto bad; |
257 | } |
258 | if ((error = VOP_OPEN(vp, fmode, cred)) != 0) |
259 | goto bad; |
260 | if (fmode & FWRITE) { |
261 | mutex_enter(vp->v_interlock); |
262 | vp->v_writecount++; |
263 | mutex_exit(vp->v_interlock); |
264 | } |
265 | |
266 | bad: |
267 | if (error) |
268 | vput(vp); |
269 | out: |
270 | pathbuf_stringcopy_put(ndp->ni_pathbuf, pathstring); |
271 | return (error); |
272 | } |
273 | |
274 | /* |
275 | * Check for write permissions on the specified vnode. |
276 | * Prototype text segments cannot be written. |
277 | */ |
278 | int |
279 | vn_writechk(struct vnode *vp) |
280 | { |
281 | |
282 | /* |
283 | * If the vnode is in use as a process's text, |
284 | * we can't allow writing. |
285 | */ |
286 | if (vp->v_iflag & VI_TEXT) |
287 | return (ETXTBSY); |
288 | return (0); |
289 | } |
290 | |
291 | int |
292 | vn_openchk(struct vnode *vp, kauth_cred_t cred, int fflags) |
293 | { |
294 | int permbits = 0; |
295 | int error; |
296 | |
297 | if ((fflags & O_DIRECTORY) != 0 && vp->v_type != VDIR) |
298 | return ENOTDIR; |
299 | |
300 | if ((fflags & FREAD) != 0) { |
301 | permbits = VREAD; |
302 | } |
303 | if ((fflags & (FWRITE | O_TRUNC)) != 0) { |
304 | permbits |= VWRITE; |
305 | if (vp->v_type == VDIR) { |
306 | error = EISDIR; |
307 | goto bad; |
308 | } |
309 | error = vn_writechk(vp); |
310 | if (error != 0) |
311 | goto bad; |
312 | } |
313 | error = VOP_ACCESS(vp, permbits, cred); |
314 | bad: |
315 | return error; |
316 | } |
317 | |
318 | /* |
319 | * Mark a vnode as having executable mappings. |
320 | */ |
321 | void |
322 | vn_markexec(struct vnode *vp) |
323 | { |
324 | |
325 | if ((vp->v_iflag & VI_EXECMAP) != 0) { |
326 | /* Safe unlocked, as long as caller holds a reference. */ |
327 | return; |
328 | } |
329 | |
330 | mutex_enter(vp->v_interlock); |
331 | if ((vp->v_iflag & VI_EXECMAP) == 0) { |
332 | atomic_add_int(&uvmexp.filepages, -vp->v_uobj.uo_npages); |
333 | atomic_add_int(&uvmexp.execpages, vp->v_uobj.uo_npages); |
334 | vp->v_iflag |= VI_EXECMAP; |
335 | } |
336 | mutex_exit(vp->v_interlock); |
337 | } |
338 | |
339 | /* |
340 | * Mark a vnode as being the text of a process. |
341 | * Fail if the vnode is currently writable. |
342 | */ |
343 | int |
344 | vn_marktext(struct vnode *vp) |
345 | { |
346 | |
347 | if ((vp->v_iflag & (VI_TEXT|VI_EXECMAP)) == (VI_TEXT|VI_EXECMAP)) { |
348 | /* Safe unlocked, as long as caller holds a reference. */ |
349 | return (0); |
350 | } |
351 | |
352 | mutex_enter(vp->v_interlock); |
353 | if (vp->v_writecount != 0) { |
354 | KASSERT((vp->v_iflag & VI_TEXT) == 0); |
355 | mutex_exit(vp->v_interlock); |
356 | return (ETXTBSY); |
357 | } |
358 | if ((vp->v_iflag & VI_EXECMAP) == 0) { |
359 | atomic_add_int(&uvmexp.filepages, -vp->v_uobj.uo_npages); |
360 | atomic_add_int(&uvmexp.execpages, vp->v_uobj.uo_npages); |
361 | } |
362 | vp->v_iflag |= (VI_TEXT | VI_EXECMAP); |
363 | mutex_exit(vp->v_interlock); |
364 | return (0); |
365 | } |
366 | |
367 | /* |
368 | * Vnode close call |
369 | * |
370 | * Note: takes an unlocked vnode, while VOP_CLOSE takes a locked node. |
371 | */ |
372 | int |
373 | vn_close(struct vnode *vp, int flags, kauth_cred_t cred) |
374 | { |
375 | int error; |
376 | |
377 | if (flags & FWRITE) { |
378 | mutex_enter(vp->v_interlock); |
379 | KASSERT(vp->v_writecount > 0); |
380 | vp->v_writecount--; |
381 | mutex_exit(vp->v_interlock); |
382 | } |
383 | vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); |
384 | error = VOP_CLOSE(vp, flags, cred); |
385 | vput(vp); |
386 | return (error); |
387 | } |
388 | |
389 | static int |
390 | enforce_rlimit_fsize(struct vnode *vp, struct uio *uio, int ioflag) |
391 | { |
392 | struct lwp *l = curlwp; |
393 | off_t testoff; |
394 | |
395 | if (uio->uio_rw != UIO_WRITE || vp->v_type != VREG) |
396 | return 0; |
397 | |
398 | KASSERT(VOP_ISLOCKED(vp) == LK_EXCLUSIVE); |
399 | if (ioflag & IO_APPEND) |
400 | testoff = vp->v_size; |
401 | else |
402 | testoff = uio->uio_offset; |
403 | |
404 | if (testoff + uio->uio_resid > |
405 | l->l_proc->p_rlimit[RLIMIT_FSIZE].rlim_cur) { |
406 | mutex_enter(proc_lock); |
407 | psignal(l->l_proc, SIGXFSZ); |
408 | mutex_exit(proc_lock); |
409 | return EFBIG; |
410 | } |
411 | |
412 | return 0; |
413 | } |
414 | |
415 | /* |
416 | * Package up an I/O request on a vnode into a uio and do it. |
417 | */ |
418 | int |
419 | vn_rdwr(enum uio_rw rw, struct vnode *vp, void *base, int len, off_t offset, |
420 | enum uio_seg segflg, int ioflg, kauth_cred_t cred, size_t *aresid, |
421 | struct lwp *l) |
422 | { |
423 | struct uio auio; |
424 | struct iovec aiov; |
425 | int error; |
426 | |
427 | if ((ioflg & IO_NODELOCKED) == 0) { |
428 | if (rw == UIO_READ) { |
429 | vn_lock(vp, LK_SHARED | LK_RETRY); |
430 | } else /* UIO_WRITE */ { |
431 | vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); |
432 | } |
433 | } |
434 | auio.uio_iov = &aiov; |
435 | auio.uio_iovcnt = 1; |
436 | aiov.iov_base = base; |
437 | aiov.iov_len = len; |
438 | auio.uio_resid = len; |
439 | auio.uio_offset = offset; |
440 | auio.uio_rw = rw; |
441 | if (segflg == UIO_SYSSPACE) { |
442 | UIO_SETUP_SYSSPACE(&auio); |
443 | } else { |
444 | auio.uio_vmspace = l->l_proc->p_vmspace; |
445 | } |
446 | |
447 | if ((error = enforce_rlimit_fsize(vp, &auio, ioflg)) != 0) |
448 | goto out; |
449 | |
450 | if (rw == UIO_READ) { |
451 | error = VOP_READ(vp, &auio, ioflg, cred); |
452 | } else { |
453 | error = VOP_WRITE(vp, &auio, ioflg, cred); |
454 | } |
455 | |
456 | if (aresid) |
457 | *aresid = auio.uio_resid; |
458 | else |
459 | if (auio.uio_resid && error == 0) |
460 | error = EIO; |
461 | |
462 | out: |
463 | if ((ioflg & IO_NODELOCKED) == 0) { |
464 | VOP_UNLOCK(vp); |
465 | } |
466 | return (error); |
467 | } |
468 | |
469 | int |
470 | vn_readdir(file_t *fp, char *bf, int segflg, u_int count, int *done, |
471 | struct lwp *l, off_t **cookies, int *ncookies) |
472 | { |
473 | struct vnode *vp = fp->f_vnode; |
474 | struct iovec aiov; |
475 | struct uio auio; |
476 | int error, eofflag; |
477 | |
478 | /* Limit the size on any kernel buffers used by VOP_READDIR */ |
479 | count = min(MAXBSIZE, count); |
480 | |
481 | unionread: |
482 | if (vp->v_type != VDIR) |
483 | return (EINVAL); |
484 | aiov.iov_base = bf; |
485 | aiov.iov_len = count; |
486 | auio.uio_iov = &aiov; |
487 | auio.uio_iovcnt = 1; |
488 | auio.uio_rw = UIO_READ; |
489 | if (segflg == UIO_SYSSPACE) { |
490 | UIO_SETUP_SYSSPACE(&auio); |
491 | } else { |
492 | KASSERT(l == curlwp); |
493 | auio.uio_vmspace = l->l_proc->p_vmspace; |
494 | } |
495 | auio.uio_resid = count; |
496 | vn_lock(vp, LK_SHARED | LK_RETRY); |
497 | auio.uio_offset = fp->f_offset; |
498 | error = VOP_READDIR(vp, &auio, fp->f_cred, &eofflag, cookies, |
499 | ncookies); |
500 | mutex_enter(&fp->f_lock); |
501 | fp->f_offset = auio.uio_offset; |
502 | mutex_exit(&fp->f_lock); |
503 | VOP_UNLOCK(vp); |
504 | if (error) |
505 | return (error); |
506 | |
507 | if (count == auio.uio_resid && vn_union_readdir_hook) { |
508 | struct vnode *ovp = vp; |
509 | |
510 | error = (*vn_union_readdir_hook)(&vp, fp, l); |
511 | if (error) |
512 | return (error); |
513 | if (vp != ovp) |
514 | goto unionread; |
515 | } |
516 | |
517 | if (count == auio.uio_resid && (vp->v_vflag & VV_ROOT) && |
518 | (vp->v_mount->mnt_flag & MNT_UNION)) { |
519 | struct vnode *tvp = vp; |
520 | vp = vp->v_mount->mnt_vnodecovered; |
521 | vref(vp); |
522 | mutex_enter(&fp->f_lock); |
523 | fp->f_vnode = vp; |
524 | fp->f_offset = 0; |
525 | mutex_exit(&fp->f_lock); |
526 | vrele(tvp); |
527 | goto unionread; |
528 | } |
529 | *done = count - auio.uio_resid; |
530 | return error; |
531 | } |
532 | |
533 | /* |
534 | * File table vnode read routine. |
535 | */ |
536 | static int |
537 | vn_read(file_t *fp, off_t *offset, struct uio *uio, kauth_cred_t cred, |
538 | int flags) |
539 | { |
540 | struct vnode *vp = fp->f_vnode; |
541 | int error, ioflag, fflag; |
542 | size_t count; |
543 | |
544 | ioflag = IO_ADV_ENCODE(fp->f_advice); |
545 | fflag = fp->f_flag; |
546 | if (fflag & FNONBLOCK) |
547 | ioflag |= IO_NDELAY; |
548 | if ((fflag & (FFSYNC | FRSYNC)) == (FFSYNC | FRSYNC)) |
549 | ioflag |= IO_SYNC; |
550 | if (fflag & FALTIO) |
551 | ioflag |= IO_ALTSEMANTICS; |
552 | if (fflag & FDIRECT) |
553 | ioflag |= IO_DIRECT; |
554 | vn_lock(vp, LK_SHARED | LK_RETRY); |
555 | uio->uio_offset = *offset; |
556 | count = uio->uio_resid; |
557 | error = VOP_READ(vp, uio, ioflag, cred); |
558 | if (flags & FOF_UPDATE_OFFSET) |
559 | *offset += count - uio->uio_resid; |
560 | VOP_UNLOCK(vp); |
561 | return (error); |
562 | } |
563 | |
564 | /* |
565 | * File table vnode write routine. |
566 | */ |
567 | static int |
568 | vn_write(file_t *fp, off_t *offset, struct uio *uio, kauth_cred_t cred, |
569 | int flags) |
570 | { |
571 | struct vnode *vp = fp->f_vnode; |
572 | int error, ioflag, fflag; |
573 | size_t count; |
574 | |
575 | ioflag = IO_ADV_ENCODE(fp->f_advice) | IO_UNIT; |
576 | fflag = fp->f_flag; |
577 | if (vp->v_type == VREG && (fflag & O_APPEND)) |
578 | ioflag |= IO_APPEND; |
579 | if (fflag & FNONBLOCK) |
580 | ioflag |= IO_NDELAY; |
581 | if (fflag & FFSYNC || |
582 | (vp->v_mount && (vp->v_mount->mnt_flag & MNT_SYNCHRONOUS))) |
583 | ioflag |= IO_SYNC; |
584 | else if (fflag & FDSYNC) |
585 | ioflag |= IO_DSYNC; |
586 | if (fflag & FALTIO) |
587 | ioflag |= IO_ALTSEMANTICS; |
588 | if (fflag & FDIRECT) |
589 | ioflag |= IO_DIRECT; |
590 | vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); |
591 | uio->uio_offset = *offset; |
592 | count = uio->uio_resid; |
593 | |
594 | if ((error = enforce_rlimit_fsize(vp, uio, ioflag)) != 0) |
595 | goto out; |
596 | |
597 | error = VOP_WRITE(vp, uio, ioflag, cred); |
598 | |
599 | if (flags & FOF_UPDATE_OFFSET) { |
600 | if (ioflag & IO_APPEND) { |
601 | /* |
602 | * SUSv3 describes behaviour for count = 0 as following: |
603 | * "Before any action ... is taken, and if nbyte is zero |
604 | * and the file is a regular file, the write() function |
605 | * ... in the absence of errors ... shall return zero |
606 | * and have no other results." |
607 | */ |
608 | if (count) |
609 | *offset = uio->uio_offset; |
610 | } else |
611 | *offset += count - uio->uio_resid; |
612 | } |
613 | |
614 | out: |
615 | VOP_UNLOCK(vp); |
616 | return (error); |
617 | } |
618 | |
619 | /* |
620 | * File table vnode stat routine. |
621 | */ |
622 | static int |
623 | vn_statfile(file_t *fp, struct stat *sb) |
624 | { |
625 | struct vnode *vp = fp->f_vnode; |
626 | int error; |
627 | |
628 | vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); |
629 | error = vn_stat(vp, sb); |
630 | VOP_UNLOCK(vp); |
631 | return error; |
632 | } |
633 | |
634 | int |
635 | vn_stat(struct vnode *vp, struct stat *sb) |
636 | { |
637 | struct vattr va; |
638 | int error; |
639 | mode_t mode; |
640 | |
641 | memset(&va, 0, sizeof(va)); |
642 | error = VOP_GETATTR(vp, &va, kauth_cred_get()); |
643 | if (error) |
644 | return (error); |
645 | /* |
646 | * Copy from vattr table |
647 | */ |
648 | memset(sb, 0, sizeof(*sb)); |
649 | sb->st_dev = va.va_fsid; |
650 | sb->st_ino = va.va_fileid; |
651 | mode = va.va_mode; |
652 | switch (vp->v_type) { |
653 | case VREG: |
654 | mode |= S_IFREG; |
655 | break; |
656 | case VDIR: |
657 | mode |= S_IFDIR; |
658 | break; |
659 | case VBLK: |
660 | mode |= S_IFBLK; |
661 | break; |
662 | case VCHR: |
663 | mode |= S_IFCHR; |
664 | break; |
665 | case VLNK: |
666 | mode |= S_IFLNK; |
667 | break; |
668 | case VSOCK: |
669 | mode |= S_IFSOCK; |
670 | break; |
671 | case VFIFO: |
672 | mode |= S_IFIFO; |
673 | break; |
674 | default: |
675 | return (EBADF); |
676 | } |
677 | sb->st_mode = mode; |
678 | sb->st_nlink = va.va_nlink; |
679 | sb->st_uid = va.va_uid; |
680 | sb->st_gid = va.va_gid; |
681 | sb->st_rdev = va.va_rdev; |
682 | sb->st_size = va.va_size; |
683 | sb->st_atimespec = va.va_atime; |
684 | sb->st_mtimespec = va.va_mtime; |
685 | sb->st_ctimespec = va.va_ctime; |
686 | sb->st_birthtimespec = va.va_birthtime; |
687 | sb->st_blksize = va.va_blocksize; |
688 | sb->st_flags = va.va_flags; |
689 | sb->st_gen = 0; |
690 | sb->st_blocks = va.va_bytes / S_BLKSIZE; |
691 | return (0); |
692 | } |
693 | |
694 | /* |
695 | * File table vnode fcntl routine. |
696 | */ |
697 | static int |
698 | vn_fcntl(file_t *fp, u_int com, void *data) |
699 | { |
700 | struct vnode *vp = fp->f_vnode; |
701 | int error; |
702 | |
703 | error = VOP_FCNTL(vp, com, data, fp->f_flag, kauth_cred_get()); |
704 | return (error); |
705 | } |
706 | |
707 | /* |
708 | * File table vnode ioctl routine. |
709 | */ |
710 | static int |
711 | vn_ioctl(file_t *fp, u_long com, void *data) |
712 | { |
713 | struct vnode *vp = fp->f_vnode, *ovp; |
714 | struct vattr vattr; |
715 | int error; |
716 | |
717 | switch (vp->v_type) { |
718 | |
719 | case VREG: |
720 | case VDIR: |
721 | if (com == FIONREAD) { |
722 | vn_lock(vp, LK_SHARED | LK_RETRY); |
723 | error = VOP_GETATTR(vp, &vattr, kauth_cred_get()); |
724 | VOP_UNLOCK(vp); |
725 | if (error) |
726 | return (error); |
727 | *(int *)data = vattr.va_size - fp->f_offset; |
728 | return (0); |
729 | } |
730 | if ((com == FIONWRITE) || (com == FIONSPACE)) { |
731 | /* |
732 | * Files don't have send queues, so there never |
733 | * are any bytes in them, nor is there any |
734 | * open space in them. |
735 | */ |
736 | *(int *)data = 0; |
737 | return (0); |
738 | } |
739 | if (com == FIOGETBMAP) { |
740 | daddr_t *block; |
741 | |
742 | if (*(daddr_t *)data < 0) |
743 | return (EINVAL); |
744 | block = (daddr_t *)data; |
745 | return (VOP_BMAP(vp, *block, NULL, block, NULL)); |
746 | } |
747 | if (com == OFIOGETBMAP) { |
748 | daddr_t ibn, obn; |
749 | |
750 | if (*(int32_t *)data < 0) |
751 | return (EINVAL); |
752 | ibn = (daddr_t)*(int32_t *)data; |
753 | error = VOP_BMAP(vp, ibn, NULL, &obn, NULL); |
754 | *(int32_t *)data = (int32_t)obn; |
755 | return error; |
756 | } |
757 | if (com == FIONBIO || com == FIOASYNC) /* XXX */ |
758 | return (0); /* XXX */ |
759 | /* fall into ... */ |
760 | case VFIFO: |
761 | case VCHR: |
762 | case VBLK: |
763 | error = VOP_IOCTL(vp, com, data, fp->f_flag, |
764 | kauth_cred_get()); |
765 | if (error == 0 && com == TIOCSCTTY) { |
766 | vref(vp); |
767 | mutex_enter(proc_lock); |
768 | ovp = curproc->p_session->s_ttyvp; |
769 | curproc->p_session->s_ttyvp = vp; |
770 | mutex_exit(proc_lock); |
771 | if (ovp != NULL) |
772 | vrele(ovp); |
773 | } |
774 | return (error); |
775 | |
776 | default: |
777 | return (EPASSTHROUGH); |
778 | } |
779 | } |
780 | |
781 | /* |
782 | * File table vnode poll routine. |
783 | */ |
784 | static int |
785 | vn_poll(file_t *fp, int events) |
786 | { |
787 | |
788 | return (VOP_POLL(fp->f_vnode, events)); |
789 | } |
790 | |
791 | /* |
792 | * File table vnode kqfilter routine. |
793 | */ |
794 | int |
795 | vn_kqfilter(file_t *fp, struct knote *kn) |
796 | { |
797 | |
798 | return (VOP_KQFILTER(fp->f_vnode, kn)); |
799 | } |
800 | |
801 | static int |
802 | vn_mmap(struct file *fp, off_t *offp, size_t size, int prot, int *flagsp, |
803 | int *advicep, struct uvm_object **uobjp, int *maxprotp) |
804 | { |
805 | struct uvm_object *uobj; |
806 | struct vnode *vp; |
807 | struct vattr va; |
808 | struct lwp *l; |
809 | vm_prot_t maxprot; |
810 | off_t off; |
811 | int error, flags; |
812 | bool needwritemap; |
813 | |
814 | l = curlwp; |
815 | |
816 | off = *offp; |
817 | flags = *flagsp; |
818 | maxprot = VM_PROT_EXECUTE; |
819 | |
820 | vp = fp->f_vnode; |
821 | if (vp->v_type != VREG && vp->v_type != VCHR && |
822 | vp->v_type != VBLK) { |
823 | /* only REG/CHR/BLK support mmap */ |
824 | return ENODEV; |
825 | } |
826 | if (vp->v_type != VCHR && off < 0) { |
827 | return EINVAL; |
828 | } |
829 | if (vp->v_type != VCHR && (off_t)(off + size) < off) { |
830 | /* no offset wrapping */ |
831 | return EOVERFLOW; |
832 | } |
833 | |
834 | /* special case: catch SunOS style /dev/zero */ |
835 | if (vp->v_type == VCHR && |
836 | (vp->v_rdev == zerodev || COMPAT_ZERODEV(vp->v_rdev))) { |
837 | *uobjp = NULL; |
838 | *maxprotp = VM_PROT_ALL; |
839 | return 0; |
840 | } |
841 | |
842 | /* |
843 | * Old programs may not select a specific sharing type, so |
844 | * default to an appropriate one. |
845 | * |
846 | * XXX: how does MAP_ANON fit in the picture? |
847 | */ |
848 | if ((flags & (MAP_SHARED|MAP_PRIVATE)) == 0) { |
849 | #if defined(DEBUG) |
850 | struct proc *p = l->l_proc; |
851 | printf("WARNING: defaulted mmap() share type to " |
852 | "%s (pid %d command %s)\n" , vp->v_type == VCHR ? |
853 | "MAP_SHARED" : "MAP_PRIVATE" , p->p_pid, |
854 | p->p_comm); |
855 | #endif |
856 | if (vp->v_type == VCHR) |
857 | flags |= MAP_SHARED; /* for a device */ |
858 | else |
859 | flags |= MAP_PRIVATE; /* for a file */ |
860 | } |
861 | |
862 | /* |
863 | * MAP_PRIVATE device mappings don't make sense (and aren't |
864 | * supported anyway). However, some programs rely on this, |
865 | * so just change it to MAP_SHARED. |
866 | */ |
867 | if (vp->v_type == VCHR && (flags & MAP_PRIVATE) != 0) { |
868 | flags = (flags & ~MAP_PRIVATE) | MAP_SHARED; |
869 | } |
870 | |
871 | /* |
872 | * now check protection |
873 | */ |
874 | |
875 | /* check read access */ |
876 | if (fp->f_flag & FREAD) |
877 | maxprot |= VM_PROT_READ; |
878 | else if (prot & PROT_READ) { |
879 | return EACCES; |
880 | } |
881 | |
882 | /* check write access, shared case first */ |
883 | if (flags & MAP_SHARED) { |
884 | /* |
885 | * if the file is writable, only add PROT_WRITE to |
886 | * maxprot if the file is not immutable, append-only. |
887 | * otherwise, if we have asked for PROT_WRITE, return |
888 | * EPERM. |
889 | */ |
890 | if (fp->f_flag & FWRITE) { |
891 | vn_lock(vp, LK_SHARED | LK_RETRY); |
892 | error = VOP_GETATTR(vp, &va, l->l_cred); |
893 | VOP_UNLOCK(vp); |
894 | if (error) { |
895 | return error; |
896 | } |
897 | if ((va.va_flags & |
898 | (SF_SNAPSHOT|IMMUTABLE|APPEND)) == 0) |
899 | maxprot |= VM_PROT_WRITE; |
900 | else if (prot & PROT_WRITE) { |
901 | return EPERM; |
902 | } |
903 | } else if (prot & PROT_WRITE) { |
904 | return EACCES; |
905 | } |
906 | } else { |
907 | /* MAP_PRIVATE mappings can always write to */ |
908 | maxprot |= VM_PROT_WRITE; |
909 | } |
910 | |
911 | /* |
912 | * Don't allow mmap for EXEC if the file system |
913 | * is mounted NOEXEC. |
914 | */ |
915 | if ((prot & PROT_EXEC) != 0 && |
916 | (vp->v_mount->mnt_flag & MNT_NOEXEC) != 0) { |
917 | return EACCES; |
918 | } |
919 | |
920 | if (vp->v_type != VCHR) { |
921 | error = VOP_MMAP(vp, prot, curlwp->l_cred); |
922 | if (error) { |
923 | return error; |
924 | } |
925 | vref(vp); |
926 | uobj = &vp->v_uobj; |
927 | |
928 | /* |
929 | * If the vnode is being mapped with PROT_EXEC, |
930 | * then mark it as text. |
931 | */ |
932 | if (prot & PROT_EXEC) { |
933 | vn_markexec(vp); |
934 | } |
935 | } else { |
936 | int i = maxprot; |
937 | |
938 | /* |
939 | * XXX Some devices don't like to be mapped with |
940 | * XXX PROT_EXEC or PROT_WRITE, but we don't really |
941 | * XXX have a better way of handling this, right now |
942 | */ |
943 | do { |
944 | uobj = udv_attach(vp->v_rdev, |
945 | (flags & MAP_SHARED) ? i : |
946 | (i & ~VM_PROT_WRITE), off, size); |
947 | i--; |
948 | } while ((uobj == NULL) && (i > 0)); |
949 | if (uobj == NULL) { |
950 | return EINVAL; |
951 | } |
952 | *advicep = UVM_ADV_RANDOM; |
953 | } |
954 | |
955 | /* |
956 | * Set vnode flags to indicate the new kinds of mapping. |
957 | * We take the vnode lock in exclusive mode here to serialize |
958 | * with direct I/O. |
959 | * |
960 | * Safe to check for these flag values without a lock, as |
961 | * long as a reference to the vnode is held. |
962 | */ |
963 | needwritemap = (vp->v_iflag & VI_WRMAP) == 0 && |
964 | (flags & MAP_SHARED) != 0 && |
965 | (maxprot & VM_PROT_WRITE) != 0; |
966 | if ((vp->v_vflag & VV_MAPPED) == 0 || needwritemap) { |
967 | vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); |
968 | vp->v_vflag |= VV_MAPPED; |
969 | if (needwritemap) { |
970 | mutex_enter(vp->v_interlock); |
971 | vp->v_iflag |= VI_WRMAP; |
972 | mutex_exit(vp->v_interlock); |
973 | } |
974 | VOP_UNLOCK(vp); |
975 | } |
976 | |
977 | #if NVERIEXEC > 0 |
978 | |
979 | /* |
980 | * Check if the file can be executed indirectly. |
981 | * |
982 | * XXX: This gives false warnings about "Incorrect access type" |
983 | * XXX: if the mapping is not executable. Harmless, but will be |
984 | * XXX: fixed as part of other changes. |
985 | */ |
986 | if (veriexec_verify(l, vp, "(mmap)" , VERIEXEC_INDIRECT, |
987 | NULL)) { |
988 | |
989 | /* |
990 | * Don't allow executable mappings if we can't |
991 | * indirectly execute the file. |
992 | */ |
993 | if (prot & VM_PROT_EXECUTE) { |
994 | return EPERM; |
995 | } |
996 | |
997 | /* |
998 | * Strip the executable bit from 'maxprot' to make sure |
999 | * it can't be made executable later. |
1000 | */ |
1001 | maxprot &= ~VM_PROT_EXECUTE; |
1002 | } |
1003 | #endif /* NVERIEXEC > 0 */ |
1004 | |
1005 | *uobjp = uobj; |
1006 | *maxprotp = maxprot; |
1007 | *flagsp = flags; |
1008 | |
1009 | return 0; |
1010 | } |
1011 | |
1012 | |
1013 | |
1014 | /* |
1015 | * Check that the vnode is still valid, and if so |
1016 | * acquire requested lock. |
1017 | */ |
1018 | int |
1019 | vn_lock(struct vnode *vp, int flags) |
1020 | { |
1021 | int error; |
1022 | |
1023 | #if 0 |
1024 | KASSERT(vp->v_usecount > 0 || (vp->v_iflag & VI_ONWORKLST) != 0); |
1025 | #endif |
1026 | KASSERT((flags & ~(LK_SHARED|LK_EXCLUSIVE|LK_NOWAIT|LK_RETRY)) == 0); |
1027 | KASSERT(!mutex_owned(vp->v_interlock)); |
1028 | |
1029 | #ifdef DIAGNOSTIC |
1030 | if (wapbl_vphaswapbl(vp)) |
1031 | WAPBL_JUNLOCK_ASSERT(wapbl_vptomp(vp)); |
1032 | #endif |
1033 | |
1034 | error = VOP_LOCK(vp, flags); |
1035 | if ((flags & LK_RETRY) != 0 && error == ENOENT) |
1036 | error = VOP_LOCK(vp, flags); |
1037 | |
1038 | KASSERT((flags & LK_RETRY) == 0 || (flags & LK_NOWAIT) != 0 || |
1039 | error == 0); |
1040 | |
1041 | return error; |
1042 | } |
1043 | |
1044 | /* |
1045 | * File table vnode close routine. |
1046 | */ |
1047 | static int |
1048 | vn_closefile(file_t *fp) |
1049 | { |
1050 | |
1051 | return vn_close(fp->f_vnode, fp->f_flag, fp->f_cred); |
1052 | } |
1053 | |
1054 | /* |
1055 | * Simplified in-kernel wrapper calls for extended attribute access. |
1056 | * Both calls pass in a NULL credential, authorizing a "kernel" access. |
1057 | * Set IO_NODELOCKED in ioflg if the vnode is already locked. |
1058 | */ |
1059 | int |
1060 | vn_extattr_get(struct vnode *vp, int ioflg, int attrnamespace, |
1061 | const char *attrname, size_t *buflen, void *bf, struct lwp *l) |
1062 | { |
1063 | struct uio auio; |
1064 | struct iovec aiov; |
1065 | int error; |
1066 | |
1067 | aiov.iov_len = *buflen; |
1068 | aiov.iov_base = bf; |
1069 | |
1070 | auio.uio_iov = &aiov; |
1071 | auio.uio_iovcnt = 1; |
1072 | auio.uio_rw = UIO_READ; |
1073 | auio.uio_offset = 0; |
1074 | auio.uio_resid = *buflen; |
1075 | UIO_SETUP_SYSSPACE(&auio); |
1076 | |
1077 | if ((ioflg & IO_NODELOCKED) == 0) |
1078 | vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); |
1079 | |
1080 | error = VOP_GETEXTATTR(vp, attrnamespace, attrname, &auio, NULL, NULL); |
1081 | |
1082 | if ((ioflg & IO_NODELOCKED) == 0) |
1083 | VOP_UNLOCK(vp); |
1084 | |
1085 | if (error == 0) |
1086 | *buflen = *buflen - auio.uio_resid; |
1087 | |
1088 | return (error); |
1089 | } |
1090 | |
1091 | /* |
1092 | * XXX Failure mode if partially written? |
1093 | */ |
1094 | int |
1095 | vn_extattr_set(struct vnode *vp, int ioflg, int attrnamespace, |
1096 | const char *attrname, size_t buflen, const void *bf, struct lwp *l) |
1097 | { |
1098 | struct uio auio; |
1099 | struct iovec aiov; |
1100 | int error; |
1101 | |
1102 | aiov.iov_len = buflen; |
1103 | aiov.iov_base = __UNCONST(bf); /* XXXUNCONST kills const */ |
1104 | |
1105 | auio.uio_iov = &aiov; |
1106 | auio.uio_iovcnt = 1; |
1107 | auio.uio_rw = UIO_WRITE; |
1108 | auio.uio_offset = 0; |
1109 | auio.uio_resid = buflen; |
1110 | UIO_SETUP_SYSSPACE(&auio); |
1111 | |
1112 | if ((ioflg & IO_NODELOCKED) == 0) { |
1113 | vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); |
1114 | } |
1115 | |
1116 | error = VOP_SETEXTATTR(vp, attrnamespace, attrname, &auio, NULL); |
1117 | |
1118 | if ((ioflg & IO_NODELOCKED) == 0) { |
1119 | VOP_UNLOCK(vp); |
1120 | } |
1121 | |
1122 | return (error); |
1123 | } |
1124 | |
1125 | int |
1126 | vn_extattr_rm(struct vnode *vp, int ioflg, int attrnamespace, |
1127 | const char *attrname, struct lwp *l) |
1128 | { |
1129 | int error; |
1130 | |
1131 | if ((ioflg & IO_NODELOCKED) == 0) { |
1132 | vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); |
1133 | } |
1134 | |
1135 | error = VOP_DELETEEXTATTR(vp, attrnamespace, attrname, NULL); |
1136 | if (error == EOPNOTSUPP) |
1137 | error = VOP_SETEXTATTR(vp, attrnamespace, attrname, NULL, NULL); |
1138 | |
1139 | if ((ioflg & IO_NODELOCKED) == 0) { |
1140 | VOP_UNLOCK(vp); |
1141 | } |
1142 | |
1143 | return (error); |
1144 | } |
1145 | |
1146 | void |
1147 | vn_ra_allocctx(struct vnode *vp) |
1148 | { |
1149 | struct uvm_ractx *ra = NULL; |
1150 | |
1151 | KASSERT(mutex_owned(vp->v_interlock)); |
1152 | |
1153 | if (vp->v_type != VREG) { |
1154 | return; |
1155 | } |
1156 | if (vp->v_ractx != NULL) { |
1157 | return; |
1158 | } |
1159 | if (vp->v_ractx == NULL) { |
1160 | mutex_exit(vp->v_interlock); |
1161 | ra = uvm_ra_allocctx(); |
1162 | mutex_enter(vp->v_interlock); |
1163 | if (ra != NULL && vp->v_ractx == NULL) { |
1164 | vp->v_ractx = ra; |
1165 | ra = NULL; |
1166 | } |
1167 | } |
1168 | if (ra != NULL) { |
1169 | uvm_ra_freectx(ra); |
1170 | } |
1171 | } |
1172 | |
1173 | int |
1174 | vn_fifo_bypass(void *v) |
1175 | { |
1176 | struct vop_generic_args *ap = v; |
1177 | |
1178 | return VOCALL(fifo_vnodeop_p, ap->a_desc->vdesc_offset, v); |
1179 | } |
1180 | |