kern_event.c source code [src/src/sys/kern/kern_event.c]

1	/ $NetBSD: kern_event.c,v 1.88 2016/07/14 18:16:51 christos Exp $ /
2
3	/-*
4	* Copyright (c) 2008, 2009 The NetBSD Foundation, Inc.
5	* All rights reserved.
6	*
7	* This code is derived from software contributed to The NetBSD Foundation
8	* by Andrew Doran.
9	*
10	* Redistribution and use in source and binary forms, with or without
11	* modification, are permitted provided that the following conditions
12	* are met:
13	* 1. Redistributions of source code must retain the above copyright
14	* notice, this list of conditions and the following disclaimer.
15	* 2. Redistributions in binary form must reproduce the above copyright
16	* notice, this list of conditions and the following disclaimer in the
17	* documentation and/or other materials provided with the distribution.
18	*
19	* THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
20	* ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
21	* TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
22	* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
23	* BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
24	* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
25	* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
26	* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
27	* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
28	* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
29	* POSSIBILITY OF SUCH DAMAGE.
30	*/
31
32	/-*
33	* Copyright (c) 1999,2000,2001 Jonathan Lemon <jlemon@FreeBSD.org>
34	* All rights reserved.
35	*
36	* Redistribution and use in source and binary forms, with or without
37	* modification, are permitted provided that the following conditions
38	* are met:
39	* 1. Redistributions of source code must retain the above copyright
40	* notice, this list of conditions and the following disclaimer.
41	* 2. Redistributions in binary form must reproduce the above copyright
42	* notice, this list of conditions and the following disclaimer in the
43	* documentation and/or other materials provided with the distribution.
44	*
45	* THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
46	* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
47	* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
48	* ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
49	* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
50	* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
51	* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
52	* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
53	* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
54	* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
55	* SUCH DAMAGE.
56	*
57	* FreeBSD: src/sys/kern/kern_event.c,v 1.27 2001/07/05 17:10:44 rwatson Exp
58	*/
59
60	#include <sys/cdefs.h>
61	__KERNEL_RCSID(`0`, "$NetBSD: kern_event.c,v 1.88 2016/07/14 18:16:51 christos Exp $");
62
63	#include <sys/param.h>
64	#include <sys/systm.h>
65	#include <sys/kernel.h>
66	#include <sys/wait.h>
67	#include <sys/proc.h>
68	#include <sys/file.h>
69	#include <sys/select.h>
70	#include <sys/queue.h>
71	#include <sys/event.h>
72	#include <sys/eventvar.h>
73	#include <sys/poll.h>
74	#include <sys/kmem.h>
75	#include <sys/stat.h>
76	#include <sys/filedesc.h>
77	#include <sys/syscallargs.h>
78	#include <sys/kauth.h>
79	#include <sys/conf.h>
80	#include <sys/atomic.h>
81
82	static int kqueue_scan(file_t , size_t, struct* kevent *,
83	const struct timespec , register_t ,
84	const struct kevent_ops , struct* kevent *,
85	size_t);
86	static int kqueue_ioctl(file_t , u_long, void* *);
87	static int kqueue_fcntl(file_t , u_int, void* *);
88	static int kqueue_poll(file_t , int*);
89	static int kqueue_kqfilter(file_t , struct* knote *);
90	static int kqueue_stat(file_t , struct* stat *);
91	static int kqueue_close(file_t *);
92	static int kqueue_register(struct kqueue , struct* kevent *);
93	static void kqueue_doclose(struct kqueue , struct* klist , int*);
94
95	static void knote_detach(struct knote , filedesc_t fdp, bool);
96	static void knote_enqueue(struct knote *);
97	static void knote_activate(struct knote *);
98
99	static void filt_kqdetach(struct knote *);
100	static int filt_kqueue(struct knote , long* hint);
101	static int filt_procattach(struct knote *);
102	static void filt_procdetach(struct knote *);
103	static int filt_proc(struct knote , long* hint);
104	static int filt_fileattach(struct knote *);
105	static void filt_timerexpire(void *x);
106	static int filt_timerattach(struct knote *);
107	static void filt_timerdetach(struct knote *);
108	static int filt_timer(struct knote , long* hint);
109
110	static const struct fileops kqueueops = {
111	.fo_read = (void *)enxio,
112	.fo_write = (void *)enxio,
113	.fo_ioctl = kqueue_ioctl,
114	.fo_fcntl = kqueue_fcntl,
115	.fo_poll = kqueue_poll,
116	.fo_stat = kqueue_stat,
117	.fo_close = kqueue_close,
118	.fo_kqfilter = kqueue_kqfilter,
119	.fo_restart = fnullop_restart,
120	};
121
122	static const struct filterops kqread_filtops =
123	{ `1`, NULL, filt_kqdetach, filt_kqueue };
124	static const struct filterops proc_filtops =
125	{ `0`, filt_procattach, filt_procdetach, filt_proc };
126	static const struct filterops file_filtops =
127	{ `1`, filt_fileattach, NULL, NULL };
128	static const struct filterops timer_filtops =
129	{ `0`, filt_timerattach, filt_timerdetach, filt_timer };
130
131	static u_int kq_ncallouts = `0`;
132	static int kq_calloutmax = (`4` * `1024`);
133
#define	KN_HASHSIZE		64		/* XXX should be tunable */
/*
 * Hash a knote identifier into a bucket index: fold the value with itself
 * shifted right by 8 bits, then mask.  Both arguments are fully
 * parenthesized so that an expression argument (e.g. "a | b") is shifted
 * as a whole rather than being split by operator precedence.
 */
#define	KN_HASH(val, mask)	(((val) ^ ((val) >> 8)) & (mask))
136
/* Operations for EVFILT_SIGNAL; defined outside this file. */
extern const struct filterops sig_filtops;
138
/*
 * Table for all system-defined filters.
 * These should be listed in the numeric order of the EVFILT_* defines.
 * If filtops is NULL, the filter isn't implemented in NetBSD.
 * End of list is when name is NULL.
 *
 * Note that 'refcnt' is meaningless for built-in filters.
 */
struct kfilter {
	const char	*name;		/* name of filter */
	uint32_t	filter;		/* id of filter */
	unsigned	refcnt;		/* reference count */
	const struct filterops *filtops;/* operations for filter */
	size_t		namelen;	/* length of name string */
};
154
155	/ System defined filters /
156	static struct kfilter sys_kfilters[] = {
157	{ "EVFILT_READ", EVFILT_READ, `0`, &file_filtops, `0` },
158	{ "EVFILT_WRITE", EVFILT_WRITE, `0`, &file_filtops, `0`, },
159	{ "EVFILT_AIO", EVFILT_AIO, `0`, NULL, `0` },
160	{ "EVFILT_VNODE", EVFILT_VNODE, `0`, &file_filtops, `0` },
161	{ "EVFILT_PROC", EVFILT_PROC, `0`, &proc_filtops, `0` },
162	{ "EVFILT_SIGNAL", EVFILT_SIGNAL, `0`, &sig_filtops, `0` },
163	{ "EVFILT_TIMER", EVFILT_TIMER, `0`, &timer_filtops, `0` },
164	{ NULL, `0`, `0`, NULL, `0` },
165	};
166
167	/ User defined kfilters /
168	static struct kfilter user_kfilters; /* array /
169	static int user_kfilterc; / current offset /
170	static int user_kfiltermaxc; / max size so far /
171	static size_t user_kfiltersz; / size of allocated memory /
172
173	/ Locks /
174	static krwlock_t kqueue_filter_lock; / lock on filter lists /
175	static kmutex_t kqueue_misc_lock; / miscellaneous /
176
177	static kauth_listener_t kqueue_listener;
178
179	static int
180	kqueue_listener_cb(kauth_cred_t cred, kauth_action_t action, void *cookie,
181	void arg0, void* arg1, void* arg2, void* *arg3)
182	{
183	struct proc *p;
184	int result;
185
186	result = KAUTH_RESULT_DEFER;
187	p = arg0;
188
189	if (action != KAUTH_PROCESS_KEVENT_FILTER)
190	return result;
191
192	if ((kauth_cred_getuid(p->p_cred) != kauth_cred_getuid(cred) \|\|
193	ISSET(p->p_flag, PK_SUGID)))
194	return result;
195
196	result = KAUTH_RESULT_ALLOW;
197
198	return result;
199	}
200
201	/*
202	* Initialize the kqueue subsystem.
203	*/
204	void
205	kqueue_init(void)
206	{
207
208	rw_init(&kqueue_filter_lock);
209	mutex_init(&kqueue_misc_lock, MUTEX_DEFAULT, IPL_NONE);
210
211	kqueue_listener = kauth_listen_scope(KAUTH_SCOPE_PROCESS,
212	kqueue_listener_cb, NULL);
213	}
214
215	/*
216	* Find kfilter entry by name, or NULL if not found.
217	*/
218	static struct kfilter *
219	kfilter_byname_sys(const char *name)
220	{
221	int i;
222
223	KASSERT(rw_lock_held(&kqueue_filter_lock));
224
225	for (i = `0`; sys_kfilters[i].name != NULL; i++) {
226	if (strcmp(name, sys_kfilters[i].name) == `0`)
227	return &sys_kfilters[i];
228	}
229	return NULL;
230	}
231
232	static struct kfilter *
233	kfilter_byname_user(const char *name)
234	{
235	int i;
236
237	KASSERT(rw_lock_held(&kqueue_filter_lock));
238
239	/ user filter slots have a NULL name if previously deregistered /
240	for (i = `0`; i < user_kfilterc ; i++) {
241	if (user_kfilters[i].name != NULL &&
242	strcmp(name, user_kfilters[i].name) == `0`)
243	return &user_kfilters[i];
244	}
245	return NULL;
246	}
247
248	static struct kfilter *
249	kfilter_byname(const char *name)
250	{
251	struct kfilter *kfilter;
252
253	KASSERT(rw_lock_held(&kqueue_filter_lock));
254
255	if ((kfilter = kfilter_byname_sys(name)) != NULL)
256	return kfilter;
257
258	return kfilter_byname_user(name);
259	}
260
261	/*
262	* Find kfilter entry by filter id, or NULL if not found.
263	* Assumes entries are indexed in filter id order, for speed.
264	*/
265	static struct kfilter *
266	kfilter_byfilter(uint32_t filter)
267	{
268	struct kfilter *kfilter;
269
270	KASSERT(rw_lock_held(&kqueue_filter_lock));
271
272	if (filter < EVFILT_SYSCOUNT) / it's a system filter /
273	kfilter = &sys_kfilters[filter];
274	else if (user_kfilters != NULL &&
275	filter < EVFILT_SYSCOUNT + user_kfilterc)
276	/ it's a user filter /
277	kfilter = &user_kfilters[filter - EVFILT_SYSCOUNT];
278	else
279	return (NULL); / out of range /
280	KASSERT(kfilter->filter == filter); / sanity check! /
281	return (kfilter);
282	}
283
284	/*
285	* Register a new kfilter. Stores the entry in user_kfilters.
286	* Returns 0 if operation succeeded, or an appropriate errno(2) otherwise.
287	* If retfilter != NULL, the new filterid is returned in it.
288	*/
289	int
290	kfilter_register(const char name, const* struct filterops *filtops,
291	int *retfilter)
292	{
293	struct kfilter *kfilter;
294	size_t len;
295	int i;
296
297	if (name == NULL \|\| name[`0`] == `'\0'` \|\| filtops == NULL)
298	return (EINVAL); / invalid args /
299
300	rw_enter(&kqueue_filter_lock, RW_WRITER);
301	if (kfilter_byname(name) != NULL) {
302	rw_exit(&kqueue_filter_lock);
303	return (EEXIST); / already exists /
304	}
305	if (user_kfilterc > `0xffffffff` - EVFILT_SYSCOUNT) {
306	rw_exit(&kqueue_filter_lock);
307	return (EINVAL); / too many /
308	}
309
310	for (i = `0`; i < user_kfilterc; i++) {
311	kfilter = &user_kfilters[i];
312	if (kfilter->name == NULL) {
313	/ Previously deregistered slot. Reuse. /
314	goto reuse;
315	}
316	}
317
318	/ check if need to grow user_kfilters /
319	if (user_kfilterc + `1` > user_kfiltermaxc) {
320	/ Grow in KFILTER_EXTENT chunks. /
321	user_kfiltermaxc += KFILTER_EXTENT;
322	len = user_kfiltermaxc * sizeof(*kfilter);
323	kfilter = kmem_alloc(len, KM_SLEEP);
324	memset((char *)kfilter + user_kfiltersz, `0`, len - user_kfiltersz);
325	if (user_kfilters != NULL) {
326	memcpy(kfilter, user_kfilters, user_kfiltersz);
327	kmem_free(user_kfilters, user_kfiltersz);
328	}
329	user_kfiltersz = len;
330	user_kfilters = kfilter;
331	}
332	/ Adding new slot /
333	kfilter = &user_kfilters[user_kfilterc++];
334	reuse:
335	kfilter->namelen = strlen(name) + `1`;
336	kfilter->name = kmem_alloc(kfilter->namelen, KM_SLEEP);
337	memcpy(__UNCONST(kfilter->name), name, kfilter->namelen);
338
339	kfilter->filter = (kfilter - user_kfilters) + EVFILT_SYSCOUNT;
340
341	kfilter->filtops = kmem_alloc(sizeof(*filtops), KM_SLEEP);
342	memcpy(__UNCONST(kfilter->filtops), filtops, sizeof(*filtops));
343
344	if (retfilter != NULL)
345	*retfilter = kfilter->filter;
346	rw_exit(&kqueue_filter_lock);
347
348	return (`0`);
349	}
350
351	/*
352	* Unregister a kfilter previously registered with kfilter_register.
353	* This retains the filter id, but clears the name and frees filtops (filter
354	* operations), so that the number isn't reused during a boot.
355	* Returns 0 if operation succeeded, or an appropriate errno(2) otherwise.
356	*/
357	int
358	kfilter_unregister(const char *name)
359	{
360	struct kfilter *kfilter;
361
362	if (name == NULL \|\| name[`0`] == `'\0'`)
363	return (EINVAL); / invalid name /
364
365	rw_enter(&kqueue_filter_lock, RW_WRITER);
366	if (kfilter_byname_sys(name) != NULL) {
367	rw_exit(&kqueue_filter_lock);
368	return (EINVAL); / can't detach system filters /
369	}
370
371	kfilter = kfilter_byname_user(name);
372	if (kfilter == NULL) {
373	rw_exit(&kqueue_filter_lock);
374	return (ENOENT);
375	}
376	if (kfilter->refcnt != `0`) {
377	rw_exit(&kqueue_filter_lock);
378	return (EBUSY);
379	}
380
381	/ Cast away const (but we know it's safe. /
382	kmem_free(__UNCONST(kfilter->name), kfilter->namelen);
383	kfilter->name = NULL; / mark as `not implemented' /
384
385	if (kfilter->filtops != NULL) {
386	/ Cast away const (but we know it's safe. /
387	kmem_free(__UNCONST(kfilter->filtops),
388	sizeof(*kfilter->filtops));
389	kfilter->filtops = NULL; / mark as `not implemented' /
390	}
391	rw_exit(&kqueue_filter_lock);
392
393	return (`0`);
394	}
395
396
397	/*
398	* Filter attach method for EVFILT_READ and EVFILT_WRITE on normal file
399	* descriptors. Calls fileops kqfilter method for given file descriptor.
400	*/
401	static int
402	filt_fileattach(struct knote *kn)
403	{
404	file_t *fp;
405
406	fp = kn->kn_obj;
407
408	return (*fp->f_ops->fo_kqfilter)(fp, kn);
409	}
410
411	/*
412	* Filter detach method for EVFILT_READ on kqueue descriptor.
413	*/
414	static void
415	filt_kqdetach(struct knote *kn)
416	{
417	struct kqueue *kq;
418
419	kq = ((file_t *)kn->kn_obj)->f_kqueue;
420
421	mutex_spin_enter(&kq->kq_lock);
422	SLIST_REMOVE(&kq->kq_sel.sel_klist, kn, knote, kn_selnext);
423	mutex_spin_exit(&kq->kq_lock);
424	}
425
426	/*
427	* Filter event method for EVFILT_READ on kqueue descriptor.
428	*/
429	/ARGSUSED/
430	static int
431	filt_kqueue(struct knote kn, long* hint)
432	{
433	struct kqueue *kq;
434	int rv;
435
436	kq = ((file_t *)kn->kn_obj)->f_kqueue;
437
438	if (hint != NOTE_SUBMIT)
439	mutex_spin_enter(&kq->kq_lock);
440	kn->kn_data = kq->kq_count;
441	rv = (kn->kn_data > `0`);
442	if (hint != NOTE_SUBMIT)
443	mutex_spin_exit(&kq->kq_lock);
444
445	return rv;
446	}
447
448	/*
449	* Filter attach method for EVFILT_PROC.
450	*/
451	static int
452	filt_procattach(struct knote *kn)
453	{
454	struct proc *p;
455	struct lwp *curl;
456
457	curl = curlwp;
458
459	mutex_enter(proc_lock);
460	if (kn->kn_flags & EV_FLAG1) {
461	/*
462	* NOTE_TRACK attaches to the child process too early
463	* for proc_find, so do a raw look up and check the state
464	* explicitly.
465	*/
466	p = proc_find_raw(kn->kn_id);
467	if (p != NULL && p->p_stat != SIDL)
468	p = NULL;
469	} else {
470	p = proc_find(kn->kn_id);
471	}
472
473	if (p == NULL) {
474	mutex_exit(proc_lock);
475	return ESRCH;
476	}
477
478	/*
479	* Fail if it's not owned by you, or the last exec gave us
480	* setuid/setgid privs (unless you're root).
481	*/
482	mutex_enter(p->p_lock);
483	mutex_exit(proc_lock);
484	if (kauth_authorize_process(curl->l_cred, KAUTH_PROCESS_KEVENT_FILTER,
485	p, NULL, NULL, NULL) != `0`) {
486	mutex_exit(p->p_lock);
487	return EACCES;
488	}
489
490	kn->kn_obj = p;
491	kn->kn_flags \|= EV_CLEAR; / automatically set /
492
493	/*
494	* internal flag indicating registration done by kernel
495	*/
496	if (kn->kn_flags & EV_FLAG1) {
497	kn->kn_data = kn->kn_sdata; / ppid /
498	kn->kn_fflags = NOTE_CHILD;
499	kn->kn_flags &= ~EV_FLAG1;
500	}
501	SLIST_INSERT_HEAD(&p->p_klist, kn, kn_selnext);
502	mutex_exit(p->p_lock);
503
504	return `0`;
505	}
506
507	/*
508	* Filter detach method for EVFILT_PROC.
509	*
510	* The knote may be attached to a different process, which may exit,
511	* leaving nothing for the knote to be attached to. So when the process
512	* exits, the knote is marked as DETACHED and also flagged as ONESHOT so
513	* it will be deleted when read out. However, as part of the knote deletion,
514	* this routine is called, so a check is needed to avoid actually performing
515	* a detach, because the original process might not exist any more.
516	*/
517	static void
518	filt_procdetach(struct knote *kn)
519	{
520	struct proc *p;
521
522	if (kn->kn_status & KN_DETACHED)
523	return;
524
525	p = kn->kn_obj;
526
527	mutex_enter(p->p_lock);
528	SLIST_REMOVE(&p->p_klist, kn, knote, kn_selnext);
529	mutex_exit(p->p_lock);
530	}
531
532	/*
533	* Filter event method for EVFILT_PROC.
534	*/
535	static int
536	filt_proc(struct knote kn, long* hint)
537	{
538	u_int event, fflag;
539	struct kevent kev;
540	struct kqueue *kq;
541	int error;
542
543	event = (u_int)hint & NOTE_PCTRLMASK;
544	kq = kn->kn_kq;
545	fflag = `0`;
546
547	/ If the user is interested in this event, record it. /
548	if (kn->kn_sfflags & event)
549	fflag \|= event;
550
551	if (event == NOTE_EXIT) {
552	struct proc *p = kn->kn_obj;
553
554	if (p != NULL)
555	kn->kn_data = P_WAITSTATUS(p);
556	/*
557	* Process is gone, so flag the event as finished.
558	*
559	* Detach the knote from watched process and mark
560	* it as such. We can't leave this to kqueue_scan(),
561	* since the process might not exist by then. And we
562	* have to do this now, since psignal KNOTE() is called
563	* also for zombies and we might end up reading freed
564	* memory if the kevent would already be picked up
565	* and knote g/c'ed.
566	*/
567	filt_procdetach(kn);
568
569	mutex_spin_enter(&kq->kq_lock);
570	kn->kn_status \|= KN_DETACHED;
571	/ Mark as ONESHOT, so that the knote it g/c'ed when read /
572	kn->kn_flags \|= (EV_EOF \| EV_ONESHOT);
573	kn->kn_fflags \|= fflag;
574	mutex_spin_exit(&kq->kq_lock);
575
576	return `1`;
577	}
578
579	mutex_spin_enter(&kq->kq_lock);
580	if ((event == NOTE_FORK) && (kn->kn_sfflags & NOTE_TRACK)) {
581	/*
582	* Process forked, and user wants to track the new process,
583	* so attach a new knote to it, and immediately report an
584	* event with the parent's pid. Register knote with new
585	* process.
586	*/
587	kev.ident = hint & NOTE_PDATAMASK; / pid /
588	kev.filter = kn->kn_filter;
589	kev.flags = kn->kn_flags \| EV_ADD \| EV_ENABLE \| EV_FLAG1;
590	kev.fflags = kn->kn_sfflags;
591	kev.data = kn->kn_id; / parent /
592	kev.udata = kn->kn_kevent.udata; / preserve udata /
593	mutex_spin_exit(&kq->kq_lock);
594	error = kqueue_register(kq, &kev);
595	mutex_spin_enter(&kq->kq_lock);
596	if (error != `0`)
597	kn->kn_fflags \|= NOTE_TRACKERR;
598	}
599	kn->kn_fflags \|= fflag;
600	fflag = kn->kn_fflags;
601	mutex_spin_exit(&kq->kq_lock);
602
603	return fflag != `0`;
604	}
605
606	static void
607	filt_timerexpire(void *knx)
608	{
609	struct knote *kn = knx;
610	int tticks;
611
612	mutex_enter(&kqueue_misc_lock);
613	kn->kn_data++;
614	knote_activate(kn);
615	if ((kn->kn_flags & EV_ONESHOT) == `0`) {
616	tticks = mstohz(kn->kn_sdata);
617	if (tticks <= `0`)
618	tticks = `1`;
619	callout_schedule((callout_t *)kn->kn_hook, tticks);
620	}
621	mutex_exit(&kqueue_misc_lock);
622	}
623
624	/*
625	* data contains amount of time to sleep, in milliseconds
626	*/
627	static int
628	filt_timerattach(struct knote *kn)
629	{
630	callout_t *calloutp;
631	struct kqueue *kq;
632	int tticks;
633
634	tticks = mstohz(kn->kn_sdata);
635
636	/ if the supplied value is under our resolution, use 1 tick /
637	if (tticks == `0`) {
638	if (kn->kn_sdata == `0`)
639	return EINVAL;
640	tticks = `1`;
641	}
642
643	if (atomic_inc_uint_nv(&kq_ncallouts) >= kq_calloutmax \|\|
644	(calloutp = kmem_alloc(sizeof(*calloutp), KM_NOSLEEP)) == NULL) {
645	atomic_dec_uint(&kq_ncallouts);
646	return ENOMEM;
647	}
648	callout_init(calloutp, CALLOUT_MPSAFE);
649
650	kq = kn->kn_kq;
651	mutex_spin_enter(&kq->kq_lock);
652	kn->kn_flags \|= EV_CLEAR; / automatically set /
653	kn->kn_hook = calloutp;
654	mutex_spin_exit(&kq->kq_lock);
655
656	callout_reset(calloutp, tticks, filt_timerexpire, kn);
657
658	return (`0`);
659	}
660
661	static void
662	filt_timerdetach(struct knote *kn)
663	{
664	callout_t *calloutp;
665
666	calloutp = (callout_t *)kn->kn_hook;
667	callout_halt(calloutp, NULL);
668	callout_destroy(calloutp);
669	kmem_free(calloutp, sizeof(*calloutp));
670	atomic_dec_uint(&kq_ncallouts);
671	}
672
673	static int
674	filt_timer(struct knote kn, long* hint)
675	{
676	int rv;
677
678	mutex_enter(&kqueue_misc_lock);
679	rv = (kn->kn_data != `0`);
680	mutex_exit(&kqueue_misc_lock);
681
682	return rv;
683	}
684
685	/*
686	* filt_seltrue:
687	*
688	* This filter "event" routine simulates seltrue().
689	*/
690	int
691	filt_seltrue(struct knote kn, long* hint)
692	{
693
694	/*
695	* We don't know how much data can be read/written,
696	* but we know that it can be. This is about as
697	* good as select/poll does as well.
698	*/
699	kn->kn_data = `0`;
700	return (`1`);
701	}
702
703	/*
704	* This provides full kqfilter entry for device switch tables, which
705	* has same effect as filter using filt_seltrue() as filter method.
706	*/
707	static void
708	filt_seltruedetach(struct knote *kn)
709	{
710	/ Nothing to do /
711	}
712
713	const struct filterops seltrue_filtops =
714	{ `1`, NULL, filt_seltruedetach, filt_seltrue };
715
716	int
717	seltrue_kqfilter(dev_t dev, struct knote *kn)
718	{
719	switch (kn->kn_filter) {
720	case EVFILT_READ:
721	case EVFILT_WRITE:
722	kn->kn_fop = &seltrue_filtops;
723	break;
724	default:
725	return (EINVAL);
726	}
727
728	/ Nothing more to do /
729	return (`0`);
730	}
731
732	/*
733	* kqueue(2) system call.
734	*/
735	static int
736	kqueue1(struct lwp l, int* flags, register_t *retval)
737	{
738	struct kqueue *kq;
739	file_t *fp;
740	int fd, error;
741
742	if ((error = fd_allocfile(&fp, &fd)) != `0`)
743	return error;
744	fp->f_flag = FREAD \| FWRITE \| (flags & (FNONBLOCK\|FNOSIGPIPE));
745	fp->f_type = DTYPE_KQUEUE;
746	fp->f_ops = &kqueueops;
747	kq = kmem_zalloc(sizeof(*kq), KM_SLEEP);
748	mutex_init(&kq->kq_lock, MUTEX_DEFAULT, IPL_SCHED);
749	cv_init(&kq->kq_cv, "kqueue");
750	selinit(&kq->kq_sel);
751	TAILQ_INIT(&kq->kq_head);
752	fp->f_kqueue = kq;
753	*retval = fd;
754	kq->kq_fdp = curlwp->l_fd;
755	fd_set_exclose(l, fd, (flags & O_CLOEXEC) != `0`);
756	fd_affix(curproc, fp, fd);
757	return error;
758	}
759
760	/*
761	* kqueue(2) system call.
762	*/
763	int
764	sys_kqueue(struct lwp l, const* void v, register_t retval)
765	{
766	return kqueue1(l, `0`, retval);
767	}
768
769	int
770	sys_kqueue1(struct lwp l, const* struct sys_kqueue1_args *uap,
771	register_t *retval)
772	{
773	/ {*
774	syscallarg(int) flags;
775	} /*
776	return kqueue1(l, SCARG(uap, flags), retval);
777	}
778
779	/*
780	* kevent(2) system call.
781	*/
782	int
783	kevent_fetch_changes(void ctx, const* struct kevent *changelist,
784	struct kevent changes, size_t index, int* n)
785	{
786
787	return copyin(changelist + index, changes, n * sizeof(*changes));
788	}
789
790	int
791	kevent_put_events(void ctx, struct* kevent *events,
792	struct kevent eventlist, size_t index, int* n)
793	{
794
795	return copyout(events, eventlist + index, n * sizeof(*events));
796	}
797
798	static const struct kevent_ops kevent_native_ops = {
799	.keo_private = NULL,
800	.keo_fetch_timeout = copyin,
801	.keo_fetch_changes = kevent_fetch_changes,
802	.keo_put_events = kevent_put_events,
803	};
804
805	int
806	sys___kevent50(struct lwp l, const* struct sys___kevent50_args *uap,
807	register_t *retval)
808	{
809	/ {*
810	syscallarg(int) fd;
811	syscallarg(const struct kevent ) changelist;*
812	syscallarg(size_t) nchanges;
813	syscallarg(struct kevent ) eventlist;*
814	syscallarg(size_t) nevents;
815	syscallarg(const struct timespec ) timeout;*
816	} /*
817
818	return kevent1(retval, SCARG(uap, fd), SCARG(uap, changelist),
819	SCARG(uap, nchanges), SCARG(uap, eventlist), SCARG(uap, nevents),
820	SCARG(uap, timeout), &kevent_native_ops);
821	}
822
823	int
824	kevent1(register_t retval, int* fd,
825	const struct kevent *changelist, size_t nchanges,
826	struct kevent *eventlist, size_t nevents,
827	const struct timespec *timeout,
828	const struct kevent_ops *keops)
829	{
830	struct kevent *kevp;
831	struct kqueue *kq;
832	struct timespec ts;
833	size_t i, n, ichange;
834	int nerrors, error;
835	struct kevent kevbuf[KQ_NEVENTS]; / approx 300 bytes on 64-bit /
836	file_t *fp;
837
838	/ check that we're dealing with a kq /
839	fp = fd_getfile(fd);
840	if (fp == NULL)
841	return (EBADF);
842
843	if (fp->f_type != DTYPE_KQUEUE) {
844	fd_putfile(fd);
845	return (EBADF);
846	}
847
848	if (timeout != NULL) {
849	error = (keops->keo_fetch_timeout)(timeout, &ts, sizeof*(ts));
850	if (error)
851	goto done;
852	timeout = &ts;
853	}
854
855	kq = fp->f_kqueue;
856	nerrors = `0`;
857	ichange = `0`;
858
859	/ traverse list of events to register /
860	while (nchanges > `0`) {
861	n = MIN(nchanges, __arraycount(kevbuf));
862	error = (*keops->keo_fetch_changes)(keops->keo_private,
863	changelist, kevbuf, ichange, n);
864	if (error)
865	goto done;
866	for (i = `0`; i < n; i++) {
867	kevp = &kevbuf[i];
868	kevp->flags &= ~EV_SYSFLAGS;
869	/ register each knote /
870	error = kqueue_register(kq, kevp);
871	if (error \|\| (kevp->flags & EV_RECEIPT)) {
872	if (nevents != `0`) {
873	kevp->flags = EV_ERROR;
874	kevp->data = error;
875	error = (*keops->keo_put_events)
876	(keops->keo_private, kevp,
877	eventlist, nerrors, `1`);
878	if (error)
879	goto done;
880	nevents--;
881	nerrors++;
882	} else {
883	goto done;
884	}
885	}
886	}
887	nchanges -= n; / update the results /
888	ichange += n;
889	}
890	if (nerrors) {
891	*retval = nerrors;
892	error = `0`;
893	goto done;
894	}
895
896	/ actually scan through the events /
897	error = kqueue_scan(fp, nevents, eventlist, timeout, retval, keops,
898	kevbuf, __arraycount(kevbuf));
899	done:
900	fd_putfile(fd);
901	return (error);
902	}
903
904	/*
905	* Register a given kevent kev onto the kqueue
906	*/
907	static int
908	kqueue_register(struct kqueue kq, struct* kevent *kev)
909	{
910	struct kfilter *kfilter;
911	filedesc_t *fdp;
912	file_t *fp;
913	fdfile_t *ff;
914	struct knote kn, newkn;
915	struct klist *list;
916	int error, fd, rv;
917
918	fdp = kq->kq_fdp;
919	fp = NULL;
920	kn = NULL;
921	error = `0`;
922	fd = `0`;
923
924	newkn = kmem_zalloc(sizeof(*newkn), KM_SLEEP);
925
926	rw_enter(&kqueue_filter_lock, RW_READER);
927	kfilter = kfilter_byfilter(kev->filter);
928	if (kfilter == NULL \|\| kfilter->filtops == NULL) {
929	/ filter not found nor implemented /
930	rw_exit(&kqueue_filter_lock);
931	kmem_free(newkn, sizeof(*newkn));
932	return (EINVAL);
933	}
934
935	/ search if knote already exists /
936	if (kfilter->filtops->f_isfd) {
937	/ monitoring a file descriptor /
938	/ validate descriptor /
939	if (kev->ident > INT_MAX
940	\|\| (fp = fd_getfile(fd = kev->ident)) == NULL) {
941	rw_exit(&kqueue_filter_lock);
942	kmem_free(newkn, sizeof(*newkn));
943	return EBADF;
944	}
945	mutex_enter(&fdp->fd_lock);
946	ff = fdp->fd_dt->dt_ff[fd];
947	if (fd <= fdp->fd_lastkqfile) {
948	SLIST_FOREACH(kn, &ff->ff_knlist, kn_link) {
949	if (kq == kn->kn_kq &&
950	kev->filter == kn->kn_filter)
951	break;
952	}
953	}
954	} else {
955	/*
956	* not monitoring a file descriptor, so
957	* lookup knotes in internal hash table
958	*/
959	mutex_enter(&fdp->fd_lock);
960	if (fdp->fd_knhashmask != `0`) {
961	list = &fdp->fd_knhash[
962	KN_HASH((u_long)kev->ident, fdp->fd_knhashmask)];
963	SLIST_FOREACH(kn, list, kn_link) {
964	if (kev->ident == kn->kn_id &&
965	kq == kn->kn_kq &&
966	kev->filter == kn->kn_filter)
967	break;
968	}
969	}
970	}
971
972	/*
973	* kn now contains the matching knote, or NULL if no match
974	*/
975	if (kev->flags & EV_ADD) {
976	if (kn == NULL) {
977	/ create new knote /
978	kn = newkn;
979	newkn = NULL;
980	kn->kn_obj = fp;
981	kn->kn_id = kev->ident;
982	kn->kn_kq = kq;
983	kn->kn_fop = kfilter->filtops;
984	kn->kn_kfilter = kfilter;
985	kn->kn_sfflags = kev->fflags;
986	kn->kn_sdata = kev->data;
987	kev->fflags = `0`;
988	kev->data = `0`;
989	kn->kn_kevent = *kev;
990
991	KASSERT(kn->kn_fop != NULL);
992	/*
993	* apply reference count to knote structure, and
994	* do not release it at the end of this routine.
995	*/
996	fp = NULL;
997
998	if (!kn->kn_fop->f_isfd) {
999	/*
1000	* If knote is not on an fd, store on
1001	* internal hash table.
1002	*/
1003	if (fdp->fd_knhashmask == `0`) {
1004	/ XXXAD can block with fd_lock held /
1005	fdp->fd_knhash = hashinit(KN_HASHSIZE,
1006	HASH_LIST, true,
1007	&fdp->fd_knhashmask);
1008	}
1009	list = &fdp->fd_knhash[KN_HASH(kn->kn_id,
1010	fdp->fd_knhashmask)];
1011	} else {
1012	/ Otherwise, knote is on an fd. /
1013	list = (struct klist *)
1014	&fdp->fd_dt->dt_ff[kn->kn_id]->ff_knlist;
1015	if ((int)kn->kn_id > fdp->fd_lastkqfile)
1016	fdp->fd_lastkqfile = kn->kn_id;
1017	}
1018	SLIST_INSERT_HEAD(list, kn, kn_link);
1019
1020	KERNEL_LOCK(`1`, NULL); / XXXSMP /
1021	error = (*kfilter->filtops->f_attach)(kn);
1022	KERNEL_UNLOCK_ONE(NULL); / XXXSMP /
1023	if (error != `0`) {
1024	#ifdef DIAGNOSTIC
1025	printf("%s: event not supported for file type"
1026	" %d\n", __func__, fp ? fp->f_type : -`1`);
1027	#endif
1028	/ knote_detach() drops fdp->fd_lock /
1029	knote_detach(kn, fdp, false);
1030	goto done;
1031	}
1032	atomic_inc_uint(&kfilter->refcnt);
1033	} else {
1034	/*
1035	* The user may change some filter values after the
1036	* initial EV_ADD, but doing so will not reset any
1037	* filter which have already been triggered.
1038	*/
1039	kn->kn_sfflags = kev->fflags;
1040	kn->kn_sdata = kev->data;
1041	kn->kn_kevent.udata = kev->udata;
1042	}
1043	/*
1044	* We can get here if we are trying to attach
1045	* an event to a file descriptor that does not
1046	* support events, and the attach routine is
1047	* broken and does not return an error.
1048	*/
1049	KASSERT(kn->kn_fop != NULL);
1050	KASSERT(kn->kn_fop->f_event != NULL);
1051	KERNEL_LOCK(`1`, NULL); / XXXSMP /
1052	rv = (*kn->kn_fop->f_event)(kn, `0`);
1053	KERNEL_UNLOCK_ONE(NULL); / XXXSMP /
1054	if (rv)
1055	knote_activate(kn);
1056	} else {
1057	if (kn == NULL) {
1058	error = ENOENT;
1059	mutex_exit(&fdp->fd_lock);
1060	goto done;
1061	}
1062	if (kev->flags & EV_DELETE) {
1063	/ knote_detach() drops fdp->fd_lock /
1064	knote_detach(kn, fdp, true);
1065	goto done;
1066	}
1067	}
1068
1069	/ disable knote /
1070	if ((kev->flags & EV_DISABLE)) {
1071	mutex_spin_enter(&kq->kq_lock);
1072	if ((kn->kn_status & KN_DISABLED) == `0`)
1073	kn->kn_status \|= KN_DISABLED;
1074	mutex_spin_exit(&kq->kq_lock);
1075	}
1076
1077	/ enable knote /
1078	if ((kev->flags & EV_ENABLE)) {
1079	knote_enqueue(kn);
1080	}
1081	mutex_exit(&fdp->fd_lock);
1082	done:
1083	rw_exit(&kqueue_filter_lock);
1084	if (newkn != NULL)
1085	kmem_free(newkn, sizeof(*newkn));
1086	if (fp != NULL)
1087	fd_putfile(fd);
1088	return (error);
1089	}
1090
#if defined(DEBUG)
/*
 * Consistency check of a kqueue's pending list (DEBUG kernels only).
 * The caller must hold kq->kq_lock.  Panics when an entry on kq_head is
 * neither a marker nor queued, when a non-marker entry belongs to another
 * kqueue or is not active, or when the number of non-marker entries does
 * not match kq_count.
 */
static void
kq_check(struct kqueue *kq)
{
	const struct knote *kn;
	int count;
	int nmarker;

	KASSERT(mutex_owned(&kq->kq_lock));
	KASSERT(kq->kq_count >= 0);

	count = 0;
	nmarker = 0;
	TAILQ_FOREACH(kn, &kq->kq_head, kn_tqe) {
		if ((kn->kn_status & (KN_MARKER | KN_QUEUED)) == 0) {
			panic("%s: kq=%p kn=%p inconsist 1", __func__, kq, kn);
		}
		if ((kn->kn_status & KN_MARKER) == 0) {
			if (kn->kn_kq != kq) {
				panic("%s: kq=%p kn=%p inconsist 2",
				    __func__, kq, kn);
			}
			if ((kn->kn_status & KN_ACTIVE) == 0) {
				panic("%s: kq=%p kn=%p: not active",
				    __func__, kq, kn);
			}
			count++;
			/* Bail out early rather than walking a corrupt list. */
			if (count > kq->kq_count) {
				goto bad;
			}
		} else {
			nmarker++;
#if 0
			if (nmarker > 10000) {
				panic("%s: kq=%p too many markers: %d != %d, "
				    "nmarker=%d",
				    __func__, kq, kq->kq_count, count, nmarker);
			}
#endif
		}
	}
	if (kq->kq_count != count) {
bad:
		panic("%s: kq=%p inconsist 3: %d != %d, nmarker=%d",
		    __func__, kq, kq->kq_count, count, nmarker);
	}
}
#else /* defined(DEBUG) */
#define	kq_check(a)	/* nothing */
#endif /* defined(DEBUG) */
1141
1142	/*
1143	* Scan through the list of events on fp (for a maximum of maxevents),
1144	* returning the results in to ulistp. Timeout is determined by tsp; if
1145	* NULL, wait indefinitely, if 0 valued, perform a poll, otherwise wait
1146	* as appropriate.
1147	*/
1148	static int
1149	kqueue_scan(file_t fp, size_t maxevents, struct* kevent *ulistp,
1150	const struct timespec tsp, register_t retval,
1151	const struct kevent_ops keops, struct* kevent *kevbuf,
1152	size_t kevcnt)
1153	{
1154	struct kqueue *kq;
1155	struct kevent *kevp;
1156	struct timespec ats, sleepts;
1157	struct knote kn, marker, morker;
1158	size_t count, nkev, nevents;
1159	int timeout, error, rv;
1160	filedesc_t *fdp;
1161
1162	fdp = curlwp->l_fd;
1163	kq = fp->f_kqueue;
1164	count = maxevents;
1165	nkev = nevents = error = `0`;
1166	if (count == `0`) {
1167	*retval = `0`;
1168	return `0`;
1169	}
1170
1171	if (tsp) { / timeout supplied /
1172	ats = *tsp;
1173	if (inittimeleft(&ats, &sleepts) == -`1`) {
1174	*retval = maxevents;
1175	return EINVAL;
1176	}
1177	timeout = tstohz(&ats);
1178	if (timeout <= `0`)
1179	timeout = -`1`; / do poll /
1180	} else {
1181	/ no timeout, wait forever /
1182	timeout = `0`;
1183	}
1184
1185	memset(&morker, `0`, sizeof(morker));
1186	marker = &morker;
1187	marker->kn_status = KN_MARKER;
1188	mutex_spin_enter(&kq->kq_lock);
1189	retry:
1190	kevp = kevbuf;
1191	if (kq->kq_count == `0`) {
1192	if (timeout >= `0`) {
1193	error = cv_timedwait_sig(&kq->kq_cv,
1194	&kq->kq_lock, timeout);
1195	if (error == `0`) {
1196	if (tsp == NULL \|\| (timeout =
1197	gettimeleft(&ats, &sleepts)) > `0`)
1198	goto retry;
1199	} else {
1200	/ don't restart after signals... /
1201	if (error == ERESTART)
1202	error = EINTR;
1203	if (error == EWOULDBLOCK)
1204	error = `0`;
1205	}
1206	}
1207	} else {
1208	/ mark end of knote list /
1209	TAILQ_INSERT_TAIL(&kq->kq_head, marker, kn_tqe);
1210
1211	while (count != `0`) {
1212	kn = TAILQ_FIRST(&kq->kq_head); / get next knote /
1213	while ((kn->kn_status & KN_MARKER) != `0`) {
1214	if (kn == marker) {
1215	/ it's our marker, stop /
1216	TAILQ_REMOVE(&kq->kq_head, kn, kn_tqe);
1217	if (count < maxevents \|\| (tsp != NULL &&
1218	(timeout = gettimeleft(&ats,
1219	&sleepts)) <= `0`))
1220	goto done;
1221	goto retry;
1222	}
1223	/ someone else's marker. /
1224	kn = TAILQ_NEXT(kn, kn_tqe);
1225	}
1226	kq_check(kq);
1227	kq->kq_count--;
1228	TAILQ_REMOVE(&kq->kq_head, kn, kn_tqe);
1229	kn->kn_status &= ~KN_QUEUED;
1230	kn->kn_status \|= KN_BUSY;
1231	kq_check(kq);
1232	if (kn->kn_status & KN_DISABLED) {
1233	kn->kn_status &= ~KN_BUSY;
1234	/ don't want disabled events /
1235	continue;
1236	}
1237	if ((kn->kn_flags & EV_ONESHOT) == `0`) {
1238	mutex_spin_exit(&kq->kq_lock);
1239	KASSERT(kn->kn_fop != NULL);
1240	KASSERT(kn->kn_fop->f_event != NULL);
1241	KERNEL_LOCK(`1`, NULL); / XXXSMP /
1242	rv = (*kn->kn_fop->f_event)(kn, `0`);
1243	KERNEL_UNLOCK_ONE(NULL); / XXXSMP /
1244	mutex_spin_enter(&kq->kq_lock);
1245	/ Re-poll if note was re-enqueued. /
1246	if ((kn->kn_status & KN_QUEUED) != `0`) {
1247	kn->kn_status &= ~KN_BUSY;
1248	continue;
1249	}
1250	if (rv == `0`) {
1251	/*
1252	* non-ONESHOT event that hasn't
1253	* triggered again, so de-queue.
1254	*/
1255	kn->kn_status &= ~(KN_ACTIVE\|KN_BUSY);
1256	continue;
1257	}
1258	}
1259	/ XXXAD should be got from f_event if !oneshot. /
1260	*kevp++ = kn->kn_kevent;
1261	nkev++;
1262	if (kn->kn_flags & EV_ONESHOT) {
1263	/ delete ONESHOT events after retrieval /
1264	mutex_spin_exit(&kq->kq_lock);
1265	mutex_enter(&fdp->fd_lock);
1266	kn->kn_status &= ~KN_BUSY;
1267	knote_detach(kn, fdp, true);
1268	mutex_spin_enter(&kq->kq_lock);
1269	} else if (kn->kn_flags & EV_CLEAR) {
1270	/ clear state after retrieval /
1271	kn->kn_data = `0`;
1272	kn->kn_fflags = `0`;
1273	kn->kn_status &= ~(KN_QUEUED\|KN_ACTIVE\|KN_BUSY);
1274	} else if (kn->kn_flags & EV_DISPATCH) {
1275	kn->kn_status \|= KN_DISABLED;
1276	kn->kn_status &= ~(KN_QUEUED\|KN_ACTIVE\|KN_BUSY);
1277	} else {
1278	/ add event back on list /
1279	kq_check(kq);
1280	kn->kn_status \|= KN_QUEUED;
1281	kn->kn_status &= ~KN_BUSY;
1282	TAILQ_INSERT_TAIL(&kq->kq_head, kn, kn_tqe);
1283	kq->kq_count++;
1284	kq_check(kq);
1285	}
1286	if (nkev == kevcnt) {
1287	/ do copyouts in kevcnt chunks /
1288	mutex_spin_exit(&kq->kq_lock);
1289	error = (*keops->keo_put_events)
1290	(keops->keo_private,
1291	kevbuf, ulistp, nevents, nkev);
1292	mutex_spin_enter(&kq->kq_lock);
1293	nevents += nkev;
1294	nkev = `0`;
1295	kevp = kevbuf;
1296	}
1297	count--;
1298	if (error != `0` \|\| count == `0`) {
1299	/ remove marker /
1300	TAILQ_REMOVE(&kq->kq_head, marker, kn_tqe);
1301	break;
1302	}
1303	}
1304	}
1305	done:
1306	mutex_spin_exit(&kq->kq_lock);
1307	if (nkev != `0`) {
1308	/ copyout remaining events /
1309	error = (*keops->keo_put_events)(keops->keo_private,
1310	kevbuf, ulistp, nevents, nkev);
1311	}
1312	*retval = maxevents - count;
1313
1314	return error;
1315	}
1316
1317	/*
1318	* fileops ioctl method for a kqueue descriptor.
1319	*
1320	* Two ioctls are currently supported. They both use struct kfilter_mapping:
1321	* KFILTER_BYNAME find name for filter, and return result in
1322	* name, which is of size len.
1323	* KFILTER_BYFILTER find filter for name. len is ignored.
1324	*/
1325	/ARGSUSED/
1326	static int
1327	kqueue_ioctl(file_t fp, u_long com, void* *data)
1328	{
1329	struct kfilter_mapping *km;
1330	const struct kfilter *kfilter;
1331	char *name;
1332	int error;
1333
1334	km = data;
1335	error = `0`;
1336	name = kmem_alloc(KFILTER_MAXNAME, KM_SLEEP);
1337
1338	switch (com) {
1339	case KFILTER_BYFILTER: / convert filter -> name /
1340	rw_enter(&kqueue_filter_lock, RW_READER);
1341	kfilter = kfilter_byfilter(km->filter);
1342	if (kfilter != NULL) {
1343	strlcpy(name, kfilter->name, KFILTER_MAXNAME);
1344	rw_exit(&kqueue_filter_lock);
1345	error = copyoutstr(name, km->name, km->len, NULL);
1346	} else {
1347	rw_exit(&kqueue_filter_lock);
1348	error = ENOENT;
1349	}
1350	break;
1351
1352	case KFILTER_BYNAME: / convert name -> filter /
1353	error = copyinstr(km->name, name, KFILTER_MAXNAME, NULL);
1354	if (error) {
1355	break;
1356	}
1357	rw_enter(&kqueue_filter_lock, RW_READER);
1358	kfilter = kfilter_byname(name);
1359	if (kfilter != NULL)
1360	km->filter = kfilter->filter;
1361	else
1362	error = ENOENT;
1363	rw_exit(&kqueue_filter_lock);
1364	break;
1365
1366	default:
1367	error = ENOTTY;
1368	break;
1369
1370	}
1371	kmem_free(name, KFILTER_MAXNAME);
1372	return (error);
1373	}
1374
1375	/*
1376	* fileops fcntl method for a kqueue descriptor.
1377	*/
1378	static int
1379	kqueue_fcntl(file_t fp, u_int com, void* *data)
1380	{
1381
1382	return (ENOTTY);
1383	}
1384
1385	/*
1386	* fileops poll method for a kqueue descriptor.
1387	* Determine if kqueue has events pending.
1388	*/
1389	static int
1390	kqueue_poll(file_t fp, int* events)
1391	{
1392	struct kqueue *kq;
1393	int revents;
1394
1395	kq = fp->f_kqueue;
1396
1397	revents = `0`;
1398	if (events & (POLLIN \| POLLRDNORM)) {
1399	mutex_spin_enter(&kq->kq_lock);
1400	if (kq->kq_count != `0`) {
1401	revents \|= events & (POLLIN \| POLLRDNORM);
1402	} else {
1403	selrecord(curlwp, &kq->kq_sel);
1404	}
1405	kq_check(kq);
1406	mutex_spin_exit(&kq->kq_lock);
1407	}
1408
1409	return revents;
1410	}
1411
1412	/*
1413	* fileops stat method for a kqueue descriptor.
1414	* Returns dummy info, with st_size being number of events pending.
1415	*/
1416	static int
1417	kqueue_stat(file_t fp, struct* stat *st)
1418	{
1419	struct kqueue *kq;
1420
1421	kq = fp->f_kqueue;
1422
1423	memset(st, `0`, sizeof(*st));
1424	st->st_size = kq->kq_count;
1425	st->st_blksize = sizeof(struct kevent);
1426	st->st_mode = S_IFIFO;
1427
1428	return `0`;
1429	}
1430
1431	static void
1432	kqueue_doclose(struct kqueue kq, struct* klist list, int* fd)
1433	{
1434	struct knote *kn;
1435	filedesc_t *fdp;
1436
1437	fdp = kq->kq_fdp;
1438
1439	KASSERT(mutex_owned(&fdp->fd_lock));
1440
1441	for (kn = SLIST_FIRST(list); kn != NULL;) {
1442	if (kq != kn->kn_kq) {
1443	kn = SLIST_NEXT(kn, kn_link);
1444	continue;
1445	}
1446	knote_detach(kn, fdp, true);
1447	mutex_enter(&fdp->fd_lock);
1448	kn = SLIST_FIRST(list);
1449	}
1450	}
1451
1452
1453	/*
1454	* fileops close method for a kqueue descriptor.
1455	*/
1456	static int
1457	kqueue_close(file_t *fp)
1458	{
1459	struct kqueue *kq;
1460	filedesc_t *fdp;
1461	fdfile_t *ff;
1462	int i;
1463
1464	kq = fp->f_kqueue;
1465	fp->f_kqueue = NULL;
1466	fp->f_type = `0`;
1467	fdp = curlwp->l_fd;
1468
1469	mutex_enter(&fdp->fd_lock);
1470	for (i = `0`; i <= fdp->fd_lastkqfile; i++) {
1471	if ((ff = fdp->fd_dt->dt_ff[i]) == NULL)
1472	continue;
1473	kqueue_doclose(kq, (struct klist *)&ff->ff_knlist, i);
1474	}
1475	if (fdp->fd_knhashmask != `0`) {
1476	for (i = `0`; i < fdp->fd_knhashmask + `1`; i++) {
1477	kqueue_doclose(kq, &fdp->fd_knhash[i], -`1`);
1478	}
1479	}
1480	mutex_exit(&fdp->fd_lock);
1481
1482	KASSERT(kq->kq_count == `0`);
1483	mutex_destroy(&kq->kq_lock);
1484	cv_destroy(&kq->kq_cv);
1485	seldestroy(&kq->kq_sel);
1486	kmem_free(kq, sizeof(*kq));
1487
1488	return (`0`);
1489	}
1490
1491	/*
1492	* struct fileops kqfilter method for a kqueue descriptor.
1493	* Event triggered when monitored kqueue changes.
1494	*/
1495	static int
1496	kqueue_kqfilter(file_t fp, struct* knote *kn)
1497	{
1498	struct kqueue *kq;
1499
1500	kq = ((file_t *)kn->kn_obj)->f_kqueue;
1501
1502	KASSERT(fp == kn->kn_obj);
1503
1504	if (kn->kn_filter != EVFILT_READ)
1505	return `1`;
1506
1507	kn->kn_fop = &kqread_filtops;
1508	mutex_enter(&kq->kq_lock);
1509	SLIST_INSERT_HEAD(&kq->kq_sel.sel_klist, kn, kn_selnext);
1510	mutex_exit(&kq->kq_lock);
1511
1512	return `0`;
1513	}
1514
1515
1516	/*
1517	* Walk down a list of knotes, activating them if their event has
1518	* triggered. The caller's object lock (e.g. device driver lock)
1519	* must be held.
1520	*/
1521	void
1522	knote(struct klist list, long* hint)
1523	{
1524	struct knote kn, tmpkn;
1525
1526	SLIST_FOREACH_SAFE(kn, list, kn_selnext, tmpkn) {
1527	KASSERT(kn->kn_fop != NULL);
1528	KASSERT(kn->kn_fop->f_event != NULL);
1529	if ((*kn->kn_fop->f_event)(kn, hint))
1530	knote_activate(kn);
1531	}
1532	}
1533
1534	/*
1535	* Remove all knotes referencing a specified fd
1536	*/
1537	void
1538	knote_fdclose(int fd)
1539	{
1540	struct klist *list;
1541	struct knote *kn;
1542	filedesc_t *fdp;
1543
1544	fdp = curlwp->l_fd;
1545	list = (struct klist *)&fdp->fd_dt->dt_ff[fd]->ff_knlist;
1546	mutex_enter(&fdp->fd_lock);
1547	while ((kn = SLIST_FIRST(list)) != NULL) {
1548	knote_detach(kn, fdp, true);
1549	mutex_enter(&fdp->fd_lock);
1550	}
1551	mutex_exit(&fdp->fd_lock);
1552	}
1553
1554	/*
1555	* Drop knote. Called with fdp->fd_lock held, and will drop before
1556	* returning.
1557	*/
1558	static void
1559	knote_detach(struct knote kn, filedesc_t fdp, bool dofop)
1560	{
1561	struct klist *list;
1562	struct kqueue *kq;
1563
1564	kq = kn->kn_kq;
1565
1566	KASSERT((kn->kn_status & KN_MARKER) == `0`);
1567	KASSERT(mutex_owned(&fdp->fd_lock));
1568
1569	KASSERT(kn->kn_fop != NULL);
1570	/ Remove from monitored object. /
1571	if (dofop) {
1572	KASSERT(kn->kn_fop->f_detach != NULL);
1573	KERNEL_LOCK(`1`, NULL); / XXXSMP /
1574	(*kn->kn_fop->f_detach)(kn);
1575	KERNEL_UNLOCK_ONE(NULL); / XXXSMP /
1576	}
1577
1578	/ Remove from descriptor table. /
1579	if (kn->kn_fop->f_isfd)
1580	list = (struct klist *)&fdp->fd_dt->dt_ff[kn->kn_id]->ff_knlist;
1581	else
1582	list = &fdp->fd_knhash[KN_HASH(kn->kn_id, fdp->fd_knhashmask)];
1583
1584	SLIST_REMOVE(list, kn, knote, kn_link);
1585
1586	/ Remove from kqueue. /
1587	again:
1588	mutex_spin_enter(&kq->kq_lock);
1589	if ((kn->kn_status & KN_QUEUED) != `0`) {
1590	kq_check(kq);
1591	kq->kq_count--;
1592	TAILQ_REMOVE(&kq->kq_head, kn, kn_tqe);
1593	kn->kn_status &= ~KN_QUEUED;
1594	kq_check(kq);
1595	} else if (kn->kn_status & KN_BUSY) {
1596	mutex_spin_exit(&kq->kq_lock);
1597	goto again;
1598	}
1599	mutex_spin_exit(&kq->kq_lock);
1600
1601	mutex_exit(&fdp->fd_lock);
1602	if (kn->kn_fop->f_isfd)
1603	fd_putfile(kn->kn_id);
1604	atomic_dec_uint(&kn->kn_kfilter->refcnt);
1605	kmem_free(kn, sizeof(*kn));
1606	}
1607
1608	/*
1609	* Queue new event for knote.
1610	*/
1611	static void
1612	knote_enqueue(struct knote *kn)
1613	{
1614	struct kqueue *kq;
1615
1616	KASSERT((kn->kn_status & KN_MARKER) == `0`);
1617
1618	kq = kn->kn_kq;
1619
1620	mutex_spin_enter(&kq->kq_lock);
1621	if ((kn->kn_status & KN_DISABLED) != `0`) {
1622	kn->kn_status &= ~KN_DISABLED;
1623	}
1624	if ((kn->kn_status & (KN_ACTIVE \| KN_QUEUED)) == KN_ACTIVE) {
1625	kq_check(kq);
1626	kn->kn_status \|= KN_QUEUED;
1627	TAILQ_INSERT_TAIL(&kq->kq_head, kn, kn_tqe);
1628	kq->kq_count++;
1629	kq_check(kq);
1630	cv_broadcast(&kq->kq_cv);
1631	selnotify(&kq->kq_sel, `0`, NOTE_SUBMIT);
1632	}
1633	mutex_spin_exit(&kq->kq_lock);
1634	}
1635	/*
1636	* Queue new event for knote.
1637	*/
1638	static void
1639	knote_activate(struct knote *kn)
1640	{
1641	struct kqueue *kq;
1642
1643	KASSERT((kn->kn_status & KN_MARKER) == `0`);
1644
1645	kq = kn->kn_kq;
1646
1647	mutex_spin_enter(&kq->kq_lock);
1648	kn->kn_status \|= KN_ACTIVE;
1649	if ((kn->kn_status & (KN_QUEUED \| KN_DISABLED)) == `0`) {
1650	kq_check(kq);
1651	kn->kn_status \|= KN_QUEUED;
1652	TAILQ_INSERT_TAIL(&kq->kq_head, kn, kn_tqe);
1653	kq->kq_count++;
1654	kq_check(kq);
1655	cv_broadcast(&kq->kq_cv);
1656	selnotify(&kq->kq_sel, `0`, NOTE_SUBMIT);
1657	}
1658	mutex_spin_exit(&kq->kq_lock);
1659	}
1660

Browse the source code of src/src/sys/kern/kern_event.c

Definitions