sched_4bsd.c source code [src/src/sys/kern/sched_4bsd.c]

1	/ $NetBSD: sched_4bsd.c,v 1.30 2014/06/24 10:08:45 maxv Exp $ /
2
3	/-*
4	* Copyright (c) 1999, 2000, 2004, 2006, 2007, 2008 The NetBSD Foundation, Inc.
5	* All rights reserved.
6	*
7	* This code is derived from software contributed to The NetBSD Foundation
8	* by Jason R. Thorpe of the Numerical Aerospace Simulation Facility,
9	* NASA Ames Research Center, by Charles M. Hannum, Andrew Doran, and
10	* Daniel Sieger.
11	*
12	* Redistribution and use in source and binary forms, with or without
13	* modification, are permitted provided that the following conditions
14	* are met:
15	* 1. Redistributions of source code must retain the above copyright
16	* notice, this list of conditions and the following disclaimer.
17	* 2. Redistributions in binary form must reproduce the above copyright
18	* notice, this list of conditions and the following disclaimer in the
19	* documentation and/or other materials provided with the distribution.
20	*
21	* THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
22	* ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
23	* TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
24	* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
25	* BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
26	* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
27	* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
28	* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
29	* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
30	* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
31	* POSSIBILITY OF SUCH DAMAGE.
32	*/
33
34	/-*
35	* Copyright (c) 1982, 1986, 1990, 1991, 1993
36	* The Regents of the University of California. All rights reserved.
37	* (c) UNIX System Laboratories, Inc.
38	* All or some portions of this file are derived from material licensed
39	* to the University of California by American Telephone and Telegraph
40	* Co. or Unix System Laboratories, Inc. and are reproduced herein with
41	* the permission of UNIX System Laboratories, Inc.
42	*
43	* Redistribution and use in source and binary forms, with or without
44	* modification, are permitted provided that the following conditions
45	* are met:
46	* 1. Redistributions of source code must retain the above copyright
47	* notice, this list of conditions and the following disclaimer.
48	* 2. Redistributions in binary form must reproduce the above copyright
49	* notice, this list of conditions and the following disclaimer in the
50	* documentation and/or other materials provided with the distribution.
51	* 3. Neither the name of the University nor the names of its contributors
52	* may be used to endorse or promote products derived from this software
53	* without specific prior written permission.
54	*
55	* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
56	* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
57	* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
58	* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
59	* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
60	* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
61	* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
62	* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
63	* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
64	* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
65	* SUCH DAMAGE.
66	*
67	* @(#)kern_synch.c 8.9 (Berkeley) 5/19/95
68	*/
69
70	#include <sys/cdefs.h>
71	__KERNEL_RCSID(`0`, "$NetBSD: sched_4bsd.c,v 1.30 2014/06/24 10:08:45 maxv Exp $");
72
73	#include "opt_ddb.h"
74	#include "opt_lockdebug.h"
75	#include "opt_perfctrs.h"
76
77	#include <sys/param.h>
78	#include <sys/systm.h>
79	#include <sys/callout.h>
80	#include <sys/cpu.h>
81	#include <sys/proc.h>
82	#include <sys/kernel.h>
83	#include <sys/signalvar.h>
84	#include <sys/resourcevar.h>
85	#include <sys/sched.h>
86	#include <sys/sysctl.h>
87	#include <sys/kauth.h>
88	#include <sys/lockdebug.h>
89	#include <sys/kmem.h>
90	#include <sys/intr.h>
91
92	static void updatepri(struct lwp *);
93	static void resetpriority(struct lwp *);
94
95	extern unsigned int sched_pstats_ticks; / defined in kern_synch.c /
96
97	/ Number of hardclock ticks per sched_tick() /
98	static int rrticks;
99
100	/*
101	* Force switch among equal priority processes every 100ms.
102	* Called from hardclock every hz/10 == rrticks hardclock ticks.
103	*
104	* There's no need to lock anywhere in this routine, as it's
105	* CPU-local and runs at IPL_SCHED (called from clock interrupt).
106	*/
107	/ ARGSUSED /
108	void
109	sched_tick(struct cpu_info *ci)
110	{
111	struct schedstate_percpu *spc = &ci->ci_schedstate;
112	lwp_t *l;
113
114	spc->spc_ticks = rrticks;
115
116	if (CURCPU_IDLE_P()) {
117	cpu_need_resched(ci, `0`);
118	return;
119	}
120	l = ci->ci_data.cpu_onproc;
121	if (l == NULL) {
122	return;
123	}
124	switch (l->l_class) {
125	case SCHED_FIFO:
126	/ No timeslicing for FIFO jobs. /
127	break;
128	case SCHED_RR:
129	/ Force it into mi_switch() to look for other jobs to run. /
130	cpu_need_resched(ci, RESCHED_KPREEMPT);
131	break;
132	default:
133	if (spc->spc_flags & SPCF_SHOULDYIELD) {
134	/*
135	* Process is stuck in kernel somewhere, probably
136	* due to buggy or inefficient code. Force a
137	* kernel preemption.
138	*/
139	cpu_need_resched(ci, RESCHED_KPREEMPT);
140	} else if (spc->spc_flags & SPCF_SEENRR) {
141	/*
142	* The process has already been through a roundrobin
143	* without switching and may be hogging the CPU.
144	* Indicate that the process should yield.
145	*/
146	spc->spc_flags \|= SPCF_SHOULDYIELD;
147	cpu_need_resched(ci, `0`);
148	} else {
149	spc->spc_flags \|= SPCF_SEENRR;
150	}
151	break;
152	}
153	}
154
155	/*
156	* Why PRIO_MAX - 2? From setpriority(2):
157	*
158	* prio is a value in the range -20 to 20. The default priority is
159	* 0; lower priorities cause more favorable scheduling. A value of
160	* 19 or 20 will schedule a process only when nothing at priority <=
161	* 0 is runnable.
162	*
163	* This gives estcpu influence over 18 priority levels, and leaves nice
164	* with 40 levels. One way to think about it is that nice has 20 levels
165	* either side of estcpu's 18.
166	*/
167	#define ESTCPU_SHIFT 11
168	#define ESTCPU_MAX ((PRIO_MAX - 2) << ESTCPU_SHIFT)
169	#define ESTCPU_ACCUM (1 << (ESTCPU_SHIFT - 1))
170	#define ESTCPULIM(e) min((e), ESTCPU_MAX)
171
172	/*
173	* Constants for digital decay and forget:
174	* 90% of (l_estcpu) usage in 5 * loadav time
175	* 95% of (l_pctcpu) usage in 60 seconds (load insensitive)
176	* Note that, as ps(1) mentions, this can let percentages
177	* total over 100% (I've seen 137.9% for 3 processes).
178	*
179	* Note that hardclock updates l_estcpu and l_cpticks independently.
180	*
181	* We wish to decay away 90% of l_estcpu in (5 * loadavg) seconds.
182	* That is, the system wants to compute a value of decay such
183	* that the following for loop:
184	* for (i = 0; i < (5 * loadavg); i++)
185	* l_estcpu *= decay;
186	* will compute
187	* l_estcpu *= 0.1;
188	* for all values of loadavg:
189	*
190	* Mathematically this loop can be expressed by saying:
191	* decay ** (5 * loadavg) ~= .1
192	*
193	* The system computes decay as:
194	* decay = (2 * loadavg) / (2 * loadavg + 1)
195	*
196	* We wish to prove that the system's computation of decay
197	* will always fulfill the equation:
198	* decay ** (5 * loadavg) ~= .1
199	*
200	* If we compute b as:
201	* b = 2 * loadavg
202	* then
203	* decay = b / (b + 1)
204	*
205	* We now need to prove two things:
206	* 1) Given factor ** (5 * loadavg) ~= .1, prove factor == b/(b+1)
207	* 2) Given b/(b+1) ** power ~= .1, prove power == (5 * loadavg)
208	*
209	* Facts:
210	* For x close to zero, exp(x) =~ 1 + x, since
211	* exp(x) = 0! + x1/1! + x2/2! + ... .
212	* therefore exp(-1/b) =~ 1 - (1/b) = (b-1)/b.
213	* For x close to zero, ln(1+x) =~ x, since
214	* ln(1+x) = x - x2/2 + x3/3 - ... -1 < x < 1
215	* therefore ln(b/(b+1)) = ln(1 - 1/(b+1)) =~ -1/(b+1).
216	* ln(.1) =~ -2.30
217	*
218	* Proof of (1):
219	* Solve (factor)*(power) =~ .1 given power (5loadav):
220	* solving for factor,
221	* ln(factor) =~ (-2.30/5*loadav), or
222	* factor =~ exp(-1/((5/2.30)loadav)) =~ exp(-1/(2loadav)) =
223	* exp(-1/b) =~ (b-1)/b =~ b/(b+1). QED
224	*
225	* Proof of (2):
226	* Solve (factor)**(power) =~ .1 given factor == (b/(b+1)):
227	* solving for power,
228	* power*ln(b/(b+1)) =~ -2.30, or
229	* power =~ 2.3 * (b + 1) = 4.6loadav + 2.3 =~ 5loadav. QED
230	*
231	* Actual power values for the implemented algorithm are as follows:
232	* loadav: 1 2 3 4
233	* power: 5.68 10.32 14.94 19.55
234	*/
235
236	/ calculations for digital decay to forget 90% of usage in 5loadav sec /*
237	#define loadfactor(loadav) (2 * (loadav) / ncpu)
238
239	static fixpt_t
240	decay_cpu(fixpt_t loadfac, fixpt_t estcpu)
241	{
242
243	if (estcpu == `0`) {
244	return `0`;
245	}
246
247	#if !defined(_LP64)
248	/ avoid 64bit arithmetics. /
249	#define FIXPT_MAX ((fixpt_t)((UINTMAX_C(1) << sizeof(fixpt_t) * CHAR_BIT) - 1))
250	if (__predict_true(loadfac <= FIXPT_MAX / ESTCPU_MAX)) {
251	return estcpu * loadfac / (loadfac + FSCALE);
252	}
253	#endif /* !defined(_LP64) */
254
255	return (uint64_t)estcpu * loadfac / (loadfac + FSCALE);
256	}
257
258	/*
259	* For all load averages >= 1 and max l_estcpu of (255 << ESTCPU_SHIFT),
260	* sleeping for at least seven times the loadfactor will decay l_estcpu to
261	* less than (1 << ESTCPU_SHIFT).
262	*
263	* note that our ESTCPU_MAX is actually much smaller than (255 << ESTCPU_SHIFT).
264	*/
265	static fixpt_t
266	decay_cpu_batch(fixpt_t loadfac, fixpt_t estcpu, unsigned int n)
267	{
268
269	if ((n << FSHIFT) >= `7` * loadfac) {
270	return `0`;
271	}
272
273	while (estcpu != `0` && n > `1`) {
274	estcpu = decay_cpu(loadfac, estcpu);
275	n--;
276	}
277
278	return estcpu;
279	}
280
281	/*
282	* sched_pstats_hook:
283	*
284	* Periodically called from sched_pstats(); used to recalculate priorities.
285	*/
286	void
287	sched_pstats_hook(struct lwp l, int* batch)
288	{
289	fixpt_t loadfac;
290
291	/*
292	* If the LWP has slept an entire second, stop recalculating
293	* its priority until it wakes up.
294	*/
295	KASSERT(lwp_locked(l, NULL));
296	if (l->l_stat == LSSLEEP \|\| l->l_stat == LSSTOP \|\|
297	l->l_stat == LSSUSPENDED) {
298	if (l->l_slptime > `1`) {
299	return;
300	}
301	}
302	loadfac = `2` * (averunnable.ldavg[`0`]);
303	l->l_estcpu = decay_cpu(loadfac, l->l_estcpu);
304	resetpriority(l);
305	}
306
307	/*
308	* Recalculate the priority of a process after it has slept for a while.
309	*/
310	static void
311	updatepri(struct lwp *l)
312	{
313	fixpt_t loadfac;
314
315	KASSERT(lwp_locked(l, NULL));
316	KASSERT(l->l_slptime > `1`);
317
318	loadfac = loadfactor(averunnable.ldavg[`0`]);
319
320	l->l_slptime--; / the first time was done in sched_pstats /
321	l->l_estcpu = decay_cpu_batch(loadfac, l->l_estcpu, l->l_slptime);
322	resetpriority(l);
323	}
324
325	void
326	sched_rqinit(void)
327	{
328
329	}
330
331	void
332	sched_setrunnable(struct lwp *l)
333	{
334
335	if (l->l_slptime > `1`)
336	updatepri(l);
337	}
338
339	void
340	sched_nice(struct proc p, int* n)
341	{
342	struct lwp *l;
343
344	KASSERT(mutex_owned(p->p_lock));
345
346	p->p_nice = n;
347	LIST_FOREACH(l, &p->p_lwps, l_sibling) {
348	lwp_lock(l);
349	resetpriority(l);
350	lwp_unlock(l);
351	}
352	}
353
354	/*
355	* Recompute the priority of an LWP. Arrange to reschedule if
356	* the resulting priority is better than that of the current LWP.
357	*/
358	static void
359	resetpriority(struct lwp *l)
360	{
361	pri_t pri;
362	struct proc *p = l->l_proc;
363
364	KASSERT(lwp_locked(l, NULL));
365
366	if (l->l_class != SCHED_OTHER)
367	return;
368
369	/ See comments above ESTCPU_SHIFT definition. /
370	pri = (PRI_KERNEL - `1`) - (l->l_estcpu >> ESTCPU_SHIFT) - p->p_nice;
371	pri = imax(pri, `0`);
372	if (pri != l->l_priority)
373	lwp_changepri(l, pri);
374	}
375
376	/*
377	* We adjust the priority of the current LWP. The priority of a LWP
378	* gets worse as it accumulates CPU time. The CPU usage estimator (l_estcpu)
379	* is increased here. The formula for computing priorities will compute a
380	* different value each time l_estcpu increases. This can cause a switch,
381	* but unless the priority crosses a PPQ boundary the actual queue will not
382	* change. The CPU usage estimator ramps up quite quickly when the process
383	* is running (linearly), and decays away exponentially, at a rate which is
384	* proportionally slower when the system is busy. The basic principle is
385	* that the system will 90% forget that the process used a lot of CPU time
386	* in 5 * loadav seconds. This causes the system to favor processes which
387	* haven't run much recently, and to round-robin among other processes.
388	*/
389
390	void
391	sched_schedclock(struct lwp *l)
392	{
393
394	if (l->l_class != SCHED_OTHER)
395	return;
396
397	KASSERT(!CURCPU_IDLE_P());
398	l->l_estcpu = ESTCPULIM(l->l_estcpu + ESTCPU_ACCUM);
399	lwp_lock(l);
400	resetpriority(l);
401	lwp_unlock(l);
402	}
403
404	/*
405	* sched_proc_fork:
406	*
407	* Inherit the parent's scheduler history.
408	*/
409	void
410	sched_proc_fork(struct proc parent, struct* proc *child)
411	{
412	lwp_t *pl;
413
414	KASSERT(mutex_owned(parent->p_lock));
415
416	pl = LIST_FIRST(&parent->p_lwps);
417	child->p_estcpu_inherited = pl->l_estcpu;
418	child->p_forktime = sched_pstats_ticks;
419	}
420
421	/*
422	* sched_proc_exit:
423	*
424	* Chargeback parents for the sins of their children.
425	*/
426	void
427	sched_proc_exit(struct proc parent, struct* proc *child)
428	{
429	fixpt_t loadfac = loadfactor(averunnable.ldavg[`0`]);
430	fixpt_t estcpu;
431	lwp_t pl, cl;
432
433	/ XXX Only if parent != init?? /
434
435	mutex_enter(parent->p_lock);
436	pl = LIST_FIRST(&parent->p_lwps);
437	cl = LIST_FIRST(&child->p_lwps);
438	estcpu = decay_cpu_batch(loadfac, child->p_estcpu_inherited,
439	sched_pstats_ticks - child->p_forktime);
440	if (cl->l_estcpu > estcpu) {
441	lwp_lock(pl);
442	pl->l_estcpu = ESTCPULIM(pl->l_estcpu + cl->l_estcpu - estcpu);
443	lwp_unlock(pl);
444	}
445	mutex_exit(parent->p_lock);
446	}
447
448	void
449	sched_wakeup(struct lwp *l)
450	{
451
452	}
453
454	void
455	sched_slept(struct lwp *l)
456	{
457
458	}
459
460	void
461	sched_lwp_fork(struct lwp l1, struct* lwp *l2)
462	{
463
464	l2->l_estcpu = l1->l_estcpu;
465	}
466
467	void
468	sched_lwp_collect(struct lwp *t)
469	{
470	lwp_t *l;
471
472	/ Absorb estcpu value of collected LWP. /
473	l = curlwp;
474	lwp_lock(l);
475	l->l_estcpu += t->l_estcpu;
476	lwp_unlock(l);
477	}
478
479	void
480	sched_oncpu(lwp_t *l)
481	{
482
483	}
484
485	void
486	sched_newts(lwp_t *l)
487	{
488
489	}
490
491	/*
492	* Sysctl nodes and initialization.
493	*/
494
495	static int
496	sysctl_sched_rtts(SYSCTLFN_ARGS)
497	{
498	struct sysctlnode node;
499	int rttsms = hztoms(rrticks);
500
501	node = *rnode;
502	node.sysctl_data = &rttsms;
503	return sysctl_lookup(SYSCTLFN_CALL(&node));
504	}
505
506	SYSCTL_SETUP(sysctl_sched_4bsd_setup, "sysctl sched setup")
507	{
508	const struct sysctlnode *node = NULL;
509
510	sysctl_createv(clog, `0`, NULL, &node,
511	CTLFLAG_PERMANENT,
512	CTLTYPE_NODE, "sched",
513	SYSCTL_DESCR("Scheduler options"),
514	NULL, `0`, NULL, `0`,
515	CTL_KERN, CTL_CREATE, CTL_EOL);
516
517	if (node == NULL)
518	return;
519
520	rrticks = hz / `10`;
521
522	sysctl_createv(NULL, `0`, &node, NULL,
523	CTLFLAG_PERMANENT,
524	CTLTYPE_STRING, "name", NULL,
525	NULL, `0`, __UNCONST("4.4BSD"), `0`,
526	CTL_CREATE, CTL_EOL);
527	sysctl_createv(NULL, `0`, &node, NULL,
528	CTLFLAG_PERMANENT,
529	CTLTYPE_INT, "rtts",
530	SYSCTL_DESCR("Round-robin time quantum (in milliseconds)"),
531	sysctl_sched_rtts, `0`, NULL, `0`,
532	CTL_CREATE, CTL_EOL);
533	}
534

Browse the source code of src/src/sys/kern/sched_4bsd.c