1 | /* $NetBSD: sys_select.c,v 1.39 2014/04/25 15:52:45 pooka Exp $ */ |
2 | |
3 | /*- |
4 | * Copyright (c) 2007, 2008, 2009, 2010 The NetBSD Foundation, Inc. |
5 | * All rights reserved. |
6 | * |
7 | * This code is derived from software contributed to The NetBSD Foundation |
8 | * by Andrew Doran and Mindaugas Rasiukevicius. |
9 | * |
10 | * Redistribution and use in source and binary forms, with or without |
11 | * modification, are permitted provided that the following conditions |
12 | * are met: |
13 | * 1. Redistributions of source code must retain the above copyright |
14 | * notice, this list of conditions and the following disclaimer. |
15 | * 2. Redistributions in binary form must reproduce the above copyright |
16 | * notice, this list of conditions and the following disclaimer in the |
17 | * documentation and/or other materials provided with the distribution. |
18 | * |
19 | * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS |
20 | * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED |
21 | * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR |
22 | * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS |
23 | * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR |
24 | * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF |
25 | * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS |
26 | * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN |
27 | * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) |
28 | * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE |
29 | * POSSIBILITY OF SUCH DAMAGE. |
30 | */ |
31 | |
32 | /* |
33 | * Copyright (c) 1982, 1986, 1989, 1993 |
34 | * The Regents of the University of California. All rights reserved. |
35 | * (c) UNIX System Laboratories, Inc. |
36 | * All or some portions of this file are derived from material licensed |
37 | * to the University of California by American Telephone and Telegraph |
38 | * Co. or Unix System Laboratories, Inc. and are reproduced herein with |
39 | * the permission of UNIX System Laboratories, Inc. |
40 | * |
41 | * Redistribution and use in source and binary forms, with or without |
42 | * modification, are permitted provided that the following conditions |
43 | * are met: |
44 | * 1. Redistributions of source code must retain the above copyright |
45 | * notice, this list of conditions and the following disclaimer. |
46 | * 2. Redistributions in binary form must reproduce the above copyright |
47 | * notice, this list of conditions and the following disclaimer in the |
48 | * documentation and/or other materials provided with the distribution. |
49 | * 3. Neither the name of the University nor the names of its contributors |
50 | * may be used to endorse or promote products derived from this software |
51 | * without specific prior written permission. |
52 | * |
53 | * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND |
54 | * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE |
55 | * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE |
56 | * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE |
57 | * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL |
58 | * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS |
59 | * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) |
60 | * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT |
61 | * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY |
62 | * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF |
63 | * SUCH DAMAGE. |
64 | * |
65 | * @(#)sys_generic.c 8.9 (Berkeley) 2/14/95 |
66 | */ |
67 | |
68 | /* |
69 | * System calls of synchronous I/O multiplexing subsystem. |
70 | * |
71 | * Locking |
72 | * |
73 | * Two locks are used: <object-lock> and selcluster_t::sc_lock. |
74 | * |
75 | * The <object-lock> might be a device driver or another subsystem, e.g. |
76 | * socket or pipe. This lock is not exported, and thus invisible to this |
77 | * subsystem. Mainly, synchronisation between selrecord() and selnotify() |
78 | * routines depends on this lock, as it will be described in the comments. |
79 | * |
80 | * Lock order |
81 | * |
82 | * <object-lock> -> |
83 | * selcluster_t::sc_lock |
84 | */ |
85 | |
86 | #include <sys/cdefs.h> |
87 | __KERNEL_RCSID(0, "$NetBSD: sys_select.c,v 1.39 2014/04/25 15:52:45 pooka Exp $" ); |
88 | |
89 | #include <sys/param.h> |
90 | #include <sys/systm.h> |
91 | #include <sys/filedesc.h> |
92 | #include <sys/file.h> |
93 | #include <sys/proc.h> |
94 | #include <sys/socketvar.h> |
95 | #include <sys/signalvar.h> |
96 | #include <sys/uio.h> |
97 | #include <sys/kernel.h> |
98 | #include <sys/lwp.h> |
99 | #include <sys/poll.h> |
100 | #include <sys/mount.h> |
101 | #include <sys/syscallargs.h> |
102 | #include <sys/cpu.h> |
103 | #include <sys/atomic.h> |
104 | #include <sys/socketvar.h> |
105 | #include <sys/sleepq.h> |
106 | #include <sys/sysctl.h> |
107 | |
/*
 * Flags for lwp::l_selflag.  sel_do_scan() sets SEL_SCANNING before each
 * scan pass and SEL_BLOCKING just before sleeping; selnotify() and
 * sel_setevents() move the LWP to SEL_RESET or SEL_EVENT.
 */
#define	SEL_RESET	0	/* awoken, interrupted, or not yet polling */
#define	SEL_SCANNING	1	/* polling descriptors */
#define	SEL_BLOCKING	2	/* blocking and waiting for event */
#define	SEL_EVENT	3	/* interrupted, events set directly */

/* Operations: either select() or poll().  Passed as `op' to sel_do_scan(). */
#define	SELOP_SELECT	1
#define	SELOP_POLL	2
117 | |
/*
 * Per-cluster state for select()/poll().  selsysinit() assigns each CPU
 * the cluster at index (cpu_index & SELCLUSTERMASK), so a system with at
 * most SELCLUSTERS CPUs gets one cluster per CPU; with more CPUs the
 * clusters are shared.
 */
#define	SELCLUSTERS	32
#define	SELCLUSTERMASK	(SELCLUSTERS - 1)

typedef struct selcluster {
	kmutex_t	*sc_lock;	/* spin mutex guarding the cluster */
	sleepq_t	sc_sleepq;	/* LWPs blocked in sel_do_scan() */
	int		sc_ncoll;	/* collision count, bumped by selnotify() */
	uint32_t	sc_mask;	/* cluster's bit in selinfo::sel_collision */
} selcluster_t;
131 | |
/* Forward declarations of helpers that are used before their definition. */
static inline int	selscan(char *, const int, const size_t, register_t *);
static inline int	pollscan(struct pollfd *, const int, register_t *);
static void		selclear(void);
135 | |
/*
 * poll(2) event masks tested for each of the three select(2) descriptor
 * sets.  The index matches the order in which selcommon() lays the sets
 * out in its buffer: 0 = `in', 1 = `ou', 2 = `ex'.
 */
static const int sel_flag[] = {
	POLLRDNORM | POLLHUP | POLLERR,
	POLLWRNORM | POLLHUP | POLLERR,
	POLLRDBAND
};
141 | |
/*
 * Synchronisation object handed to sleepq_enqueue() when an LWP goes to
 * sleep in sel_do_scan().
 */
static syncobj_t select_sobj = {
	SOBJ_SLEEPQ_FIFO,
	sleepq_unsleep,
	sleepq_changepri,
	sleepq_lendpri,
	syncobj_noowner,
};
149 | |
/* Cluster table, indexed by cluster number; populated by selsysinit(). */
static selcluster_t	*selcluster[SELCLUSTERS] __read_mostly;
/*
 * When non-zero, selnotify() tries to deliver events directly through
 * sel_setevents() instead of always requesting a re-scan.  Exposed as
 * the "direct_select" sysctl node.
 */
static int		direct_select __read_mostly = 0;
152 | |
153 | /* |
154 | * Select system call. |
155 | */ |
156 | int |
157 | sys___pselect50(struct lwp *l, const struct sys___pselect50_args *uap, |
158 | register_t *retval) |
159 | { |
160 | /* { |
161 | syscallarg(int) nd; |
162 | syscallarg(fd_set *) in; |
163 | syscallarg(fd_set *) ou; |
164 | syscallarg(fd_set *) ex; |
165 | syscallarg(const struct timespec *) ts; |
166 | syscallarg(sigset_t *) mask; |
167 | } */ |
168 | struct timespec ats, *ts = NULL; |
169 | sigset_t amask, *mask = NULL; |
170 | int error; |
171 | |
172 | if (SCARG(uap, ts)) { |
173 | error = copyin(SCARG(uap, ts), &ats, sizeof(ats)); |
174 | if (error) |
175 | return error; |
176 | ts = &ats; |
177 | } |
178 | if (SCARG(uap, mask) != NULL) { |
179 | error = copyin(SCARG(uap, mask), &amask, sizeof(amask)); |
180 | if (error) |
181 | return error; |
182 | mask = &amask; |
183 | } |
184 | |
185 | return selcommon(retval, SCARG(uap, nd), SCARG(uap, in), |
186 | SCARG(uap, ou), SCARG(uap, ex), ts, mask); |
187 | } |
188 | |
189 | int |
190 | sys___select50(struct lwp *l, const struct sys___select50_args *uap, |
191 | register_t *retval) |
192 | { |
193 | /* { |
194 | syscallarg(int) nd; |
195 | syscallarg(fd_set *) in; |
196 | syscallarg(fd_set *) ou; |
197 | syscallarg(fd_set *) ex; |
198 | syscallarg(struct timeval *) tv; |
199 | } */ |
200 | struct timeval atv; |
201 | struct timespec ats, *ts = NULL; |
202 | int error; |
203 | |
204 | if (SCARG(uap, tv)) { |
205 | error = copyin(SCARG(uap, tv), (void *)&atv, sizeof(atv)); |
206 | if (error) |
207 | return error; |
208 | TIMEVAL_TO_TIMESPEC(&atv, &ats); |
209 | ts = &ats; |
210 | } |
211 | |
212 | return selcommon(retval, SCARG(uap, nd), SCARG(uap, in), |
213 | SCARG(uap, ou), SCARG(uap, ex), ts, NULL); |
214 | } |
215 | |
/*
 * sel_do_scan: common code to perform the scan on descriptors.
 *
 * => op:	SELOP_SELECT or SELOP_POLL; picks selscan() or pollscan().
 * => fds:	kernel copy of the select bit arrays or of the pollfd array.
 * => nf:	number of descriptors (select) or pollfd entries (poll).
 * => ni:	size argument forwarded to selscan(); recorded in the LWP
 *		for select only.
 * => ts:	optional timeout; when NULL no timeout is computed and
 *		`timo' stays 0.
 * => mask:	optional signal mask; if non-NULL it is passed to
 *		sigsuspendsetup() before scanning and undone with
 *		sigsuspendteardown() afterwards.
 * => retval:	receives the number of ready descriptors.
 *
 * Returns 0 or an errno value.  EINVAL is returned if inittimeleft()
 * rejects the timeout; ERESTART is converted to EINTR and EWOULDBLOCK
 * to 0 before returning.
 */
static int
sel_do_scan(const int op, void *fds, const int nf, const size_t ni,
    struct timespec *ts, sigset_t *mask, register_t *retval)
{
	lwp_t		* const l = curlwp;
	selcluster_t	*sc;
	kmutex_t	*lock;
	struct timespec	sleepts;
	int		error, timo;

	timo = 0;
	if (ts && inittimeleft(ts, &sleepts) == -1) {
		return EINVAL;
	}

	if (__predict_false(mask))
		sigsuspendsetup(l, mask);

	/* Use the current CPU's cluster for the whole call. */
	sc = curcpu()->ci_data.cpu_selcluster;
	lock = sc->sc_lock;
	l->l_selcluster = sc;
	if (op == SELOP_SELECT) {
		/* sel_setevents() uses these to find the select bit arrays. */
		l->l_selbits = fds;
		l->l_selni = ni;
	} else {
		/* A NULL l_selbits makes sel_setevents() take the poll path. */
		l->l_selbits = NULL;
	}

	for (;;) {
		int ncoll;

		/* Start each pass with an empty list of recorded selinfos. */
		SLIST_INIT(&l->l_selwait);
		l->l_selret = 0;

		/*
		 * No need to lock.  If this is overwritten by another value
		 * while scanning, we will retry below.  We only need to see
		 * exact state from the descriptors that we are about to poll,
		 * and lock activity resulting from fo_poll is enough to
		 * provide an up to date value for new polling activity.
		 */
		l->l_selflag = SEL_SCANNING;
		ncoll = sc->sc_ncoll;

		if (op == SELOP_SELECT) {
			error = selscan((char *)fds, nf, ni, retval);
		} else {
			error = pollscan((struct pollfd *)fds, nf, retval);
		}
		/* Done if the scan failed or found something ready. */
		if (error || *retval)
			break;
		/* Done if a timeout was given and it has already run out. */
		if (ts && (timo = gettimeleft(ts, &sleepts)) <= 0)
			break;
		/*
		 * Acquire the lock and perform the (re)checks.  Note, if
		 * collision has occurred, then our state does not matter,
		 * as we must perform re-scan.  Therefore, check it first.
		 */
state_check:
		mutex_spin_enter(lock);
		if (__predict_false(sc->sc_ncoll != ncoll)) {
			/* Collision: perform re-scan. */
			mutex_spin_exit(lock);
			selclear();
			continue;
		}
		if (__predict_true(l->l_selflag == SEL_EVENT)) {
			/* Events occurred, they are set directly. */
			mutex_spin_exit(lock);
			break;
		}
		if (__predict_true(l->l_selflag == SEL_RESET)) {
			/* Events occurred, but re-scan is requested. */
			mutex_spin_exit(lock);
			selclear();
			continue;
		}
		/*
		 * Nothing happened, therefore - sleep.
		 *
		 * NOTE(review): the cluster lock is still held here and is
		 * handed to sleepq_enter(); presumably it is dropped inside
		 * sleepq_block(), since state_check re-acquires it - confirm
		 * against sleepq(9).
		 */
		l->l_selflag = SEL_BLOCKING;
		l->l_kpriority = true;
		sleepq_enter(&sc->sc_sleepq, l, lock);
		sleepq_enqueue(&sc->sc_sleepq, sc, "select", &select_sobj);
		error = sleepq_block(timo, true);
		if (error != 0) {
			break;
		}
		/* Awoken: need to check the state. */
		goto state_check;
	}
	/* Detach from every selinfo recorded during the last pass. */
	selclear();

	/* Add direct events if any. */
	if (l->l_selflag == SEL_EVENT) {
		KASSERT(l->l_selret != 0);
		*retval += l->l_selret;
	}

	if (__predict_false(mask))
		sigsuspendteardown(l);

	/* select and poll are not restarted after signals... */
	if (error == ERESTART)
		return EINTR;
	if (error == EWOULDBLOCK)
		return 0;
	return error;
}
326 | |
327 | int |
328 | selcommon(register_t *retval, int nd, fd_set *u_in, fd_set *u_ou, |
329 | fd_set *u_ex, struct timespec *ts, sigset_t *mask) |
330 | { |
331 | char smallbits[howmany(FD_SETSIZE, NFDBITS) * |
332 | sizeof(fd_mask) * 6]; |
333 | char *bits; |
334 | int error, nf; |
335 | size_t ni; |
336 | |
337 | if (nd < 0) |
338 | return (EINVAL); |
339 | nf = curlwp->l_fd->fd_dt->dt_nfiles; |
340 | if (nd > nf) { |
341 | /* forgiving; slightly wrong */ |
342 | nd = nf; |
343 | } |
344 | ni = howmany(nd, NFDBITS) * sizeof(fd_mask); |
345 | if (ni * 6 > sizeof(smallbits)) { |
346 | bits = kmem_alloc(ni * 6, KM_SLEEP); |
347 | if (bits == NULL) |
348 | return ENOMEM; |
349 | } else |
350 | bits = smallbits; |
351 | |
352 | #define getbits(name, x) \ |
353 | if (u_ ## name) { \ |
354 | error = copyin(u_ ## name, bits + ni * x, ni); \ |
355 | if (error) \ |
356 | goto fail; \ |
357 | } else \ |
358 | memset(bits + ni * x, 0, ni); |
359 | getbits(in, 0); |
360 | getbits(ou, 1); |
361 | getbits(ex, 2); |
362 | #undef getbits |
363 | |
364 | error = sel_do_scan(SELOP_SELECT, bits, nd, ni, ts, mask, retval); |
365 | if (error == 0 && u_in != NULL) |
366 | error = copyout(bits + ni * 3, u_in, ni); |
367 | if (error == 0 && u_ou != NULL) |
368 | error = copyout(bits + ni * 4, u_ou, ni); |
369 | if (error == 0 && u_ex != NULL) |
370 | error = copyout(bits + ni * 5, u_ex, ni); |
371 | fail: |
372 | if (bits != smallbits) |
373 | kmem_free(bits, ni * 6); |
374 | return (error); |
375 | } |
376 | |
377 | static inline int |
378 | selscan(char *bits, const int nfd, const size_t ni, register_t *retval) |
379 | { |
380 | fd_mask *ibitp, *obitp; |
381 | int msk, i, j, fd, n; |
382 | file_t *fp; |
383 | |
384 | ibitp = (fd_mask *)(bits + ni * 0); |
385 | obitp = (fd_mask *)(bits + ni * 3); |
386 | n = 0; |
387 | |
388 | memset(obitp, 0, ni * 3); |
389 | for (msk = 0; msk < 3; msk++) { |
390 | for (i = 0; i < nfd; i += NFDBITS) { |
391 | fd_mask ibits, obits; |
392 | |
393 | ibits = *ibitp; |
394 | obits = 0; |
395 | while ((j = ffs(ibits)) && (fd = i + --j) < nfd) { |
396 | ibits &= ~(1 << j); |
397 | if ((fp = fd_getfile(fd)) == NULL) |
398 | return (EBADF); |
399 | /* |
400 | * Setup an argument to selrecord(), which is |
401 | * a file descriptor number. |
402 | */ |
403 | curlwp->l_selrec = fd; |
404 | if ((*fp->f_ops->fo_poll)(fp, sel_flag[msk])) { |
405 | obits |= (1 << j); |
406 | n++; |
407 | } |
408 | fd_putfile(fd); |
409 | } |
410 | if (obits != 0) { |
411 | if (direct_select) { |
412 | kmutex_t *lock; |
413 | lock = curlwp->l_selcluster->sc_lock; |
414 | mutex_spin_enter(lock); |
415 | *obitp |= obits; |
416 | mutex_spin_exit(lock); |
417 | } else { |
418 | *obitp |= obits; |
419 | } |
420 | } |
421 | ibitp++; |
422 | obitp++; |
423 | } |
424 | } |
425 | *retval = n; |
426 | return (0); |
427 | } |
428 | |
429 | /* |
430 | * Poll system call. |
431 | */ |
432 | int |
433 | sys_poll(struct lwp *l, const struct sys_poll_args *uap, register_t *retval) |
434 | { |
435 | /* { |
436 | syscallarg(struct pollfd *) fds; |
437 | syscallarg(u_int) nfds; |
438 | syscallarg(int) timeout; |
439 | } */ |
440 | struct timespec ats, *ts = NULL; |
441 | |
442 | if (SCARG(uap, timeout) != INFTIM) { |
443 | ats.tv_sec = SCARG(uap, timeout) / 1000; |
444 | ats.tv_nsec = (SCARG(uap, timeout) % 1000) * 1000000; |
445 | ts = &ats; |
446 | } |
447 | |
448 | return pollcommon(retval, SCARG(uap, fds), SCARG(uap, nfds), ts, NULL); |
449 | } |
450 | |
451 | /* |
452 | * Poll system call. |
453 | */ |
454 | int |
455 | sys___pollts50(struct lwp *l, const struct sys___pollts50_args *uap, |
456 | register_t *retval) |
457 | { |
458 | /* { |
459 | syscallarg(struct pollfd *) fds; |
460 | syscallarg(u_int) nfds; |
461 | syscallarg(const struct timespec *) ts; |
462 | syscallarg(const sigset_t *) mask; |
463 | } */ |
464 | struct timespec ats, *ts = NULL; |
465 | sigset_t amask, *mask = NULL; |
466 | int error; |
467 | |
468 | if (SCARG(uap, ts)) { |
469 | error = copyin(SCARG(uap, ts), &ats, sizeof(ats)); |
470 | if (error) |
471 | return error; |
472 | ts = &ats; |
473 | } |
474 | if (SCARG(uap, mask)) { |
475 | error = copyin(SCARG(uap, mask), &amask, sizeof(amask)); |
476 | if (error) |
477 | return error; |
478 | mask = &amask; |
479 | } |
480 | |
481 | return pollcommon(retval, SCARG(uap, fds), SCARG(uap, nfds), ts, mask); |
482 | } |
483 | |
484 | int |
485 | pollcommon(register_t *retval, struct pollfd *u_fds, u_int nfds, |
486 | struct timespec *ts, sigset_t *mask) |
487 | { |
488 | struct pollfd smallfds[32]; |
489 | struct pollfd *fds; |
490 | int error; |
491 | size_t ni; |
492 | |
493 | if (nfds > 1000 + curlwp->l_fd->fd_dt->dt_nfiles) { |
494 | /* |
495 | * Either the user passed in a very sparse 'fds' or junk! |
496 | * The kmem_alloc() call below would be bad news. |
497 | * We could process the 'fds' array in chunks, but that |
498 | * is a lot of code that isn't normally useful. |
499 | * (Or just move the copyin/out into pollscan().) |
500 | * Historically the code silently truncated 'fds' to |
501 | * dt_nfiles entries - but that does cause issues. |
502 | */ |
503 | return EINVAL; |
504 | } |
505 | ni = nfds * sizeof(struct pollfd); |
506 | if (ni > sizeof(smallfds)) { |
507 | fds = kmem_alloc(ni, KM_SLEEP); |
508 | if (fds == NULL) |
509 | return ENOMEM; |
510 | } else |
511 | fds = smallfds; |
512 | |
513 | error = copyin(u_fds, fds, ni); |
514 | if (error) |
515 | goto fail; |
516 | |
517 | error = sel_do_scan(SELOP_POLL, fds, nfds, ni, ts, mask, retval); |
518 | if (error == 0) |
519 | error = copyout(fds, u_fds, ni); |
520 | fail: |
521 | if (fds != smallfds) |
522 | kmem_free(fds, ni); |
523 | return (error); |
524 | } |
525 | |
526 | static inline int |
527 | pollscan(struct pollfd *fds, const int nfd, register_t *retval) |
528 | { |
529 | file_t *fp; |
530 | int i, n = 0, revents; |
531 | |
532 | for (i = 0; i < nfd; i++, fds++) { |
533 | fds->revents = 0; |
534 | if (fds->fd < 0) { |
535 | revents = 0; |
536 | } else if ((fp = fd_getfile(fds->fd)) == NULL) { |
537 | revents = POLLNVAL; |
538 | } else { |
539 | /* |
540 | * Perform poll: registers select request or returns |
541 | * the events which are set. Setup an argument for |
542 | * selrecord(), which is a pointer to struct pollfd. |
543 | */ |
544 | curlwp->l_selrec = (uintptr_t)fds; |
545 | revents = (*fp->f_ops->fo_poll)(fp, |
546 | fds->events | POLLERR | POLLHUP); |
547 | fd_putfile(fds->fd); |
548 | } |
549 | if (revents) { |
550 | fds->revents = revents; |
551 | n++; |
552 | } |
553 | } |
554 | *retval = n; |
555 | return (0); |
556 | } |
557 | |
558 | int |
559 | seltrue(dev_t dev, int events, lwp_t *l) |
560 | { |
561 | |
562 | return (events & (POLLIN | POLLOUT | POLLRDNORM | POLLWRNORM)); |
563 | } |
564 | |
/*
 * Record a select request.  Concurrency issues:
 *
 * The caller holds the same lock across calls to selrecord() and
 * selnotify(), so we don't need to consider a concurrent wakeup
 * while in this routine.
 *
 * The only activity we need to guard against is selclear(), called by
 * another thread that is exiting sel_do_scan().
 * `sel_lwp' can only become non-NULL while the caller's lock is held,
 * so it cannot become non-NULL due to a change made by another thread
 * while we are in this routine.  It can only become _NULL_ due to a
 * call to selclear().
 *
 * If it is non-NULL and != selector there is the potential for
 * selclear() to be called by another thread.  If either of those
 * conditions is true, we're not interested in touching the `named
 * waiter' part of the selinfo record because we need to record a
 * collision.  Hence there is no need for additional locking in this
 * routine.
 *
 * => selector:	the LWP recording the request; must be curlwp.
 * => sip:	the selinfo record of the object being polled.
 */
void
selrecord(lwp_t *selector, struct selinfo *sip)
{
	selcluster_t *sc;
	lwp_t *other;

	KASSERT(selector == curlwp);

	sc = selector->l_selcluster;
	/* Sample the named waiter once; the branches below act on it. */
	other = sip->sel_lwp;

	if (other == selector) {
		/* 1. We (selector) already claimed to be the first LWP. */
		KASSERT(sip->sel_cluster == sc);
	} else if (other == NULL) {
		/*
		 * 2. No first LWP, therefore we (selector) are the first.
		 *
		 * There may be unnamed waiters (collisions).  Issue a memory
		 * barrier to ensure that we access sel_lwp (above) before
		 * other fields - this guards against a call to selclear().
		 */
		membar_enter();
		sip->sel_lwp = selector;
		/* Link the record so that selclear() can find it later. */
		SLIST_INSERT_HEAD(&selector->l_selwait, sip, sel_chain);
		/* Copy the argument, which is for selnotify(). */
		sip->sel_fdinfo = selector->l_selrec;
		/* Replace selinfo's lock with the chosen cluster's lock. */
		sip->sel_cluster = sc;
	} else {
		/*
		 * 3. Multiple waiters: record a collision.  The bit added
		 * is that of our own cluster, which selnotify() will wake.
		 */
		sip->sel_collision |= sc->sc_mask;
		KASSERT(sip->sel_cluster != NULL);
	}
}
621 | |
622 | /* |
623 | * sel_setevents: a helper function for selnotify(), to set the events |
624 | * for LWP sleeping in selcommon() or pollcommon(). |
625 | */ |
626 | static inline bool |
627 | sel_setevents(lwp_t *l, struct selinfo *sip, const int events) |
628 | { |
629 | const int oflag = l->l_selflag; |
630 | int ret = 0; |
631 | |
632 | /* |
633 | * If we require re-scan or it was required by somebody else, |
634 | * then just (re)set SEL_RESET and return. |
635 | */ |
636 | if (__predict_false(events == 0 || oflag == SEL_RESET)) { |
637 | l->l_selflag = SEL_RESET; |
638 | return true; |
639 | } |
640 | /* |
641 | * Direct set. Note: select state of LWP is locked. First, |
642 | * determine whether it is selcommon() or pollcommon(). |
643 | */ |
644 | if (l->l_selbits != NULL) { |
645 | const size_t ni = l->l_selni; |
646 | fd_mask *fds = (fd_mask *)l->l_selbits; |
647 | fd_mask *ofds = (fd_mask *)((char *)fds + ni * 3); |
648 | const int fd = sip->sel_fdinfo, fbit = 1 << (fd & __NFDMASK); |
649 | const int idx = fd >> __NFDSHIFT; |
650 | int n; |
651 | |
652 | for (n = 0; n < 3; n++) { |
653 | if ((fds[idx] & fbit) != 0 && |
654 | (ofds[idx] & fbit) == 0 && |
655 | (sel_flag[n] & events)) { |
656 | ofds[idx] |= fbit; |
657 | ret++; |
658 | } |
659 | fds = (fd_mask *)((char *)fds + ni); |
660 | ofds = (fd_mask *)((char *)ofds + ni); |
661 | } |
662 | } else { |
663 | struct pollfd *pfd = (void *)sip->sel_fdinfo; |
664 | int revents = events & (pfd->events | POLLERR | POLLHUP); |
665 | |
666 | if (revents) { |
667 | if (pfd->revents == 0) |
668 | ret = 1; |
669 | pfd->revents |= revents; |
670 | } |
671 | } |
672 | /* Check whether there are any events to return. */ |
673 | if (!ret) { |
674 | return false; |
675 | } |
676 | /* Indicate direct set and note the event (cluster lock is held). */ |
677 | l->l_selflag = SEL_EVENT; |
678 | l->l_selret += ret; |
679 | return true; |
680 | } |
681 | |
682 | /* |
683 | * Do a wakeup when a selectable event occurs. Concurrency issues: |
684 | * |
685 | * As per selrecord(), the caller's object lock is held. If there |
686 | * is a named waiter, we must acquire the associated selcluster's lock |
687 | * in order to synchronize with selclear() and pollers going to sleep |
688 | * in sel_do_scan(). |
689 | * |
690 | * sip->sel_cluser cannot change at this point, as it is only changed |
691 | * in selrecord(), and concurrent calls to selrecord() are locked |
692 | * out by the caller. |
693 | */ |
694 | void |
695 | selnotify(struct selinfo *sip, int events, long knhint) |
696 | { |
697 | selcluster_t *sc; |
698 | uint32_t mask; |
699 | int index, oflag; |
700 | lwp_t *l; |
701 | kmutex_t *lock; |
702 | |
703 | KNOTE(&sip->sel_klist, knhint); |
704 | |
705 | if (sip->sel_lwp != NULL) { |
706 | /* One named LWP is waiting. */ |
707 | sc = sip->sel_cluster; |
708 | lock = sc->sc_lock; |
709 | mutex_spin_enter(lock); |
710 | /* Still there? */ |
711 | if (sip->sel_lwp != NULL) { |
712 | /* |
713 | * Set the events for our LWP and indicate that. |
714 | * Otherwise, request for a full re-scan. |
715 | */ |
716 | l = sip->sel_lwp; |
717 | oflag = l->l_selflag; |
718 | |
719 | if (!direct_select) { |
720 | l->l_selflag = SEL_RESET; |
721 | } else if (!sel_setevents(l, sip, events)) { |
722 | /* No events to return. */ |
723 | mutex_spin_exit(lock); |
724 | return; |
725 | } |
726 | |
727 | /* |
728 | * If thread is sleeping, wake it up. If it's not |
729 | * yet asleep, it will notice the change in state |
730 | * and will re-poll the descriptors. |
731 | */ |
732 | if (oflag == SEL_BLOCKING && l->l_mutex == lock) { |
733 | KASSERT(l->l_wchan == sc); |
734 | sleepq_unsleep(l, false); |
735 | } |
736 | } |
737 | mutex_spin_exit(lock); |
738 | } |
739 | |
740 | if ((mask = sip->sel_collision) != 0) { |
741 | /* |
742 | * There was a collision (multiple waiters): we must |
743 | * inform all potentially interested waiters. |
744 | */ |
745 | sip->sel_collision = 0; |
746 | do { |
747 | index = ffs(mask) - 1; |
748 | mask &= ~(1 << index); |
749 | sc = selcluster[index]; |
750 | lock = sc->sc_lock; |
751 | mutex_spin_enter(lock); |
752 | sc->sc_ncoll++; |
753 | sleepq_wake(&sc->sc_sleepq, sc, (u_int)-1, lock); |
754 | } while (__predict_false(mask != 0)); |
755 | } |
756 | } |
757 | |
/*
 * Remove an LWP from all objects that it is waiting for.  Concurrency
 * issues:
 *
 * The object owner's (e.g. device driver) lock is not held here.  Calls
 * can be made to selrecord() and we do not synchronize against those
 * directly using locks.  However, we use `sel_lwp' to lock out changes.
 * Before clearing it we must use memory barriers to ensure that we can
 * safely traverse the list of selinfo records.
 *
 * Operates on curlwp.  The list head l_selwait itself is not reset
 * here; sel_do_scan() re-initialises it at the start of each pass.
 */
static void
selclear(void)
{
	struct selinfo *sip, *next;
	selcluster_t *sc;
	lwp_t *l;
	kmutex_t *lock;

	l = curlwp;
	sc = l->l_selcluster;
	lock = sc->sc_lock;

	/* The cluster lock serialises us with selnotify() and seldestroy(). */
	mutex_spin_enter(lock);
	for (sip = SLIST_FIRST(&l->l_selwait); sip != NULL; sip = next) {
		KASSERT(sip->sel_lwp == l);
		KASSERT(sip->sel_cluster == l->l_selcluster);

		/*
		 * Read link to next selinfo record, if any.
		 * It's no longer safe to touch `sip' after clearing
		 * `sel_lwp', so ensure that the read of `sel_chain'
		 * completes before the clearing of sel_lwp becomes
		 * globally visible.
		 */
		next = SLIST_NEXT(sip, sel_chain);
		membar_exit();
		/* Release the record for another named waiter to use. */
		sip->sel_lwp = NULL;
	}
	mutex_spin_exit(lock);
}
799 | |
800 | /* |
801 | * Initialize the select/poll system calls. Called once for each |
802 | * CPU in the system, as they are attached. |
803 | */ |
804 | void |
805 | selsysinit(struct cpu_info *ci) |
806 | { |
807 | selcluster_t *sc; |
808 | u_int index; |
809 | |
810 | /* If already a cluster in place for this bit, re-use. */ |
811 | index = cpu_index(ci) & SELCLUSTERMASK; |
812 | sc = selcluster[index]; |
813 | if (sc == NULL) { |
814 | sc = kmem_alloc(roundup2(sizeof(selcluster_t), |
815 | coherency_unit) + coherency_unit, KM_SLEEP); |
816 | sc = (void *)roundup2((uintptr_t)sc, coherency_unit); |
817 | sc->sc_lock = mutex_obj_alloc(MUTEX_DEFAULT, IPL_SCHED); |
818 | sleepq_init(&sc->sc_sleepq); |
819 | sc->sc_ncoll = 0; |
820 | sc->sc_mask = (1 << index); |
821 | selcluster[index] = sc; |
822 | } |
823 | ci->ci_data.cpu_selcluster = sc; |
824 | } |
825 | |
826 | /* |
827 | * Initialize a selinfo record. |
828 | */ |
829 | void |
830 | selinit(struct selinfo *sip) |
831 | { |
832 | |
833 | memset(sip, 0, sizeof(*sip)); |
834 | } |
835 | |
836 | /* |
837 | * Destroy a selinfo record. The owning object must not gain new |
838 | * references while this is in progress: all activity on the record |
839 | * must be stopped. |
840 | * |
841 | * Concurrency issues: we only need guard against a call to selclear() |
842 | * by a thread exiting sel_do_scan(). The caller has prevented further |
843 | * references being made to the selinfo record via selrecord(), and it |
844 | * will not call selnotify() again. |
845 | */ |
846 | void |
847 | seldestroy(struct selinfo *sip) |
848 | { |
849 | selcluster_t *sc; |
850 | kmutex_t *lock; |
851 | lwp_t *l; |
852 | |
853 | if (sip->sel_lwp == NULL) |
854 | return; |
855 | |
856 | /* |
857 | * Lock out selclear(). The selcluster pointer can't change while |
858 | * we are here since it is only ever changed in selrecord(), |
859 | * and that will not be entered again for this record because |
860 | * it is dying. |
861 | */ |
862 | KASSERT(sip->sel_cluster != NULL); |
863 | sc = sip->sel_cluster; |
864 | lock = sc->sc_lock; |
865 | mutex_spin_enter(lock); |
866 | if ((l = sip->sel_lwp) != NULL) { |
867 | /* |
868 | * This should rarely happen, so although SLIST_REMOVE() |
869 | * is slow, using it here is not a problem. |
870 | */ |
871 | KASSERT(l->l_selcluster == sc); |
872 | SLIST_REMOVE(&l->l_selwait, sip, selinfo, sel_chain); |
873 | sip->sel_lwp = NULL; |
874 | } |
875 | mutex_spin_exit(lock); |
876 | } |
877 | |
/*
 * System control nodes.
 *
 * Creates the permanent, read-write integer node "direct_select" under
 * CTL_KERN, backed directly by the direct_select variable.  The return
 * value of sysctl_createv() is not checked.
 */
SYSCTL_SETUP(sysctl_select_setup, "sysctl select setup")
{

	sysctl_createv(clog, 0, NULL, NULL,
		CTLFLAG_PERMANENT | CTLFLAG_READWRITE,
		CTLTYPE_INT, "direct_select",
		SYSCTL_DESCR("Enable/disable direct select (for testing)"),
		NULL, 0, &direct_select, 0,
		CTL_KERN, CTL_CREATE, CTL_EOL);
}
891 | |