1 | /* $NetBSD: sched_4bsd.c,v 1.30 2014/06/24 10:08:45 maxv Exp $ */ |
2 | |
3 | /*- |
4 | * Copyright (c) 1999, 2000, 2004, 2006, 2007, 2008 The NetBSD Foundation, Inc. |
5 | * All rights reserved. |
6 | * |
7 | * This code is derived from software contributed to The NetBSD Foundation |
8 | * by Jason R. Thorpe of the Numerical Aerospace Simulation Facility, |
9 | * NASA Ames Research Center, by Charles M. Hannum, Andrew Doran, and |
10 | * Daniel Sieger. |
11 | * |
12 | * Redistribution and use in source and binary forms, with or without |
13 | * modification, are permitted provided that the following conditions |
14 | * are met: |
15 | * 1. Redistributions of source code must retain the above copyright |
16 | * notice, this list of conditions and the following disclaimer. |
17 | * 2. Redistributions in binary form must reproduce the above copyright |
18 | * notice, this list of conditions and the following disclaimer in the |
19 | * documentation and/or other materials provided with the distribution. |
20 | * |
21 | * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS |
22 | * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED |
23 | * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR |
24 | * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS |
25 | * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR |
26 | * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF |
27 | * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS |
28 | * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN |
29 | * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) |
30 | * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE |
31 | * POSSIBILITY OF SUCH DAMAGE. |
32 | */ |
33 | |
34 | /*- |
35 | * Copyright (c) 1982, 1986, 1990, 1991, 1993 |
36 | * The Regents of the University of California. All rights reserved. |
37 | * (c) UNIX System Laboratories, Inc. |
38 | * All or some portions of this file are derived from material licensed |
39 | * to the University of California by American Telephone and Telegraph |
40 | * Co. or Unix System Laboratories, Inc. and are reproduced herein with |
41 | * the permission of UNIX System Laboratories, Inc. |
42 | * |
43 | * Redistribution and use in source and binary forms, with or without |
44 | * modification, are permitted provided that the following conditions |
45 | * are met: |
46 | * 1. Redistributions of source code must retain the above copyright |
47 | * notice, this list of conditions and the following disclaimer. |
48 | * 2. Redistributions in binary form must reproduce the above copyright |
49 | * notice, this list of conditions and the following disclaimer in the |
50 | * documentation and/or other materials provided with the distribution. |
51 | * 3. Neither the name of the University nor the names of its contributors |
52 | * may be used to endorse or promote products derived from this software |
53 | * without specific prior written permission. |
54 | * |
55 | * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND |
56 | * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE |
57 | * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE |
58 | * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE |
59 | * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL |
60 | * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS |
61 | * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) |
62 | * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT |
63 | * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY |
64 | * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF |
65 | * SUCH DAMAGE. |
66 | * |
67 | * @(#)kern_synch.c 8.9 (Berkeley) 5/19/95 |
68 | */ |
69 | |
70 | #include <sys/cdefs.h> |
71 | __KERNEL_RCSID(0, "$NetBSD: sched_4bsd.c,v 1.30 2014/06/24 10:08:45 maxv Exp $" ); |
72 | |
73 | #include "opt_ddb.h" |
74 | #include "opt_lockdebug.h" |
75 | #include "opt_perfctrs.h" |
76 | |
77 | #include <sys/param.h> |
78 | #include <sys/systm.h> |
79 | #include <sys/callout.h> |
80 | #include <sys/cpu.h> |
81 | #include <sys/proc.h> |
82 | #include <sys/kernel.h> |
83 | #include <sys/signalvar.h> |
84 | #include <sys/resourcevar.h> |
85 | #include <sys/sched.h> |
86 | #include <sys/sysctl.h> |
87 | #include <sys/kauth.h> |
88 | #include <sys/lockdebug.h> |
89 | #include <sys/kmem.h> |
90 | #include <sys/intr.h> |
91 | |
92 | static void updatepri(struct lwp *); |
93 | static void resetpriority(struct lwp *); |
94 | |
95 | extern unsigned int sched_pstats_ticks; /* defined in kern_synch.c */ |
96 | |
97 | /* Number of hardclock ticks per sched_tick() */ |
98 | static int rrticks; |
99 | |
100 | /* |
101 | * Force switch among equal priority processes every 100ms. |
102 | * Called from hardclock every hz/10 == rrticks hardclock ticks. |
103 | * |
104 | * There's no need to lock anywhere in this routine, as it's |
105 | * CPU-local and runs at IPL_SCHED (called from clock interrupt). |
106 | */ |
107 | /* ARGSUSED */ |
108 | void |
109 | sched_tick(struct cpu_info *ci) |
110 | { |
111 | struct schedstate_percpu *spc = &ci->ci_schedstate; |
112 | lwp_t *l; |
113 | |
114 | spc->spc_ticks = rrticks; |
115 | |
116 | if (CURCPU_IDLE_P()) { |
117 | cpu_need_resched(ci, 0); |
118 | return; |
119 | } |
120 | l = ci->ci_data.cpu_onproc; |
121 | if (l == NULL) { |
122 | return; |
123 | } |
124 | switch (l->l_class) { |
125 | case SCHED_FIFO: |
126 | /* No timeslicing for FIFO jobs. */ |
127 | break; |
128 | case SCHED_RR: |
129 | /* Force it into mi_switch() to look for other jobs to run. */ |
130 | cpu_need_resched(ci, RESCHED_KPREEMPT); |
131 | break; |
132 | default: |
133 | if (spc->spc_flags & SPCF_SHOULDYIELD) { |
134 | /* |
135 | * Process is stuck in kernel somewhere, probably |
136 | * due to buggy or inefficient code. Force a |
137 | * kernel preemption. |
138 | */ |
139 | cpu_need_resched(ci, RESCHED_KPREEMPT); |
140 | } else if (spc->spc_flags & SPCF_SEENRR) { |
141 | /* |
142 | * The process has already been through a roundrobin |
143 | * without switching and may be hogging the CPU. |
144 | * Indicate that the process should yield. |
145 | */ |
146 | spc->spc_flags |= SPCF_SHOULDYIELD; |
147 | cpu_need_resched(ci, 0); |
148 | } else { |
149 | spc->spc_flags |= SPCF_SEENRR; |
150 | } |
151 | break; |
152 | } |
153 | } |
154 | |
155 | /* |
156 | * Why PRIO_MAX - 2? From setpriority(2): |
157 | * |
158 | * prio is a value in the range -20 to 20. The default priority is |
159 | * 0; lower priorities cause more favorable scheduling. A value of |
160 | * 19 or 20 will schedule a process only when nothing at priority <= |
161 | * 0 is runnable. |
162 | * |
163 | * This gives estcpu influence over 18 priority levels, and leaves nice |
164 | * with 40 levels. One way to think about it is that nice has 20 levels |
165 | * either side of estcpu's 18. |
166 | */ |
167 | #define ESTCPU_SHIFT 11 |
168 | #define ESTCPU_MAX ((PRIO_MAX - 2) << ESTCPU_SHIFT) |
169 | #define ESTCPU_ACCUM (1 << (ESTCPU_SHIFT - 1)) |
170 | #define ESTCPULIM(e) min((e), ESTCPU_MAX) |
171 | |
172 | /* |
173 | * Constants for digital decay and forget: |
174 | * 90% of (l_estcpu) usage in 5 * loadav time |
175 | * 95% of (l_pctcpu) usage in 60 seconds (load insensitive) |
176 | * Note that, as ps(1) mentions, this can let percentages |
177 | * total over 100% (I've seen 137.9% for 3 processes). |
178 | * |
179 | * Note that hardclock updates l_estcpu and l_cpticks independently. |
180 | * |
181 | * We wish to decay away 90% of l_estcpu in (5 * loadavg) seconds. |
182 | * That is, the system wants to compute a value of decay such |
183 | * that the following for loop: |
184 | * for (i = 0; i < (5 * loadavg); i++) |
185 | * l_estcpu *= decay; |
186 | * will compute |
187 | * l_estcpu *= 0.1; |
188 | * for all values of loadavg: |
189 | * |
190 | * Mathematically this loop can be expressed by saying: |
191 | * decay ** (5 * loadavg) ~= .1 |
192 | * |
193 | * The system computes decay as: |
194 | * decay = (2 * loadavg) / (2 * loadavg + 1) |
195 | * |
196 | * We wish to prove that the system's computation of decay |
197 | * will always fulfill the equation: |
198 | * decay ** (5 * loadavg) ~= .1 |
199 | * |
200 | * If we compute b as: |
201 | * b = 2 * loadavg |
202 | * then |
203 | * decay = b / (b + 1) |
204 | * |
205 | * We now need to prove two things: |
206 | * 1) Given factor ** (5 * loadavg) ~= .1, prove factor == b/(b+1) |
207 | * 2) Given b/(b+1) ** power ~= .1, prove power == (5 * loadavg) |
208 | * |
209 | * Facts: |
210 | * For x close to zero, exp(x) =~ 1 + x, since |
211 | * exp(x) = 0! + x**1/1! + x**2/2! + ... . |
212 | * therefore exp(-1/b) =~ 1 - (1/b) = (b-1)/b. |
213 | * For x close to zero, ln(1+x) =~ x, since |
214 | * ln(1+x) = x - x**2/2 + x**3/3 - ... -1 < x < 1 |
215 | * therefore ln(b/(b+1)) = ln(1 - 1/(b+1)) =~ -1/(b+1). |
216 | * ln(.1) =~ -2.30 |
217 | * |
218 | * Proof of (1): |
219 | * Solve (factor)**(power) =~ .1 given power (5*loadav): |
220 | * solving for factor, |
221 | * ln(factor) =~ (-2.30/5*loadav), or |
222 | * factor =~ exp(-1/((5/2.30)*loadav)) =~ exp(-1/(2*loadav)) = |
223 | * exp(-1/b) =~ (b-1)/b =~ b/(b+1). QED |
224 | * |
225 | * Proof of (2): |
226 | * Solve (factor)**(power) =~ .1 given factor == (b/(b+1)): |
227 | * solving for power, |
228 | * power*ln(b/(b+1)) =~ -2.30, or |
229 | * power =~ 2.3 * (b + 1) = 4.6*loadav + 2.3 =~ 5*loadav. QED |
230 | * |
231 | * Actual power values for the implemented algorithm are as follows: |
232 | * loadav: 1 2 3 4 |
233 | * power: 5.68 10.32 14.94 19.55 |
234 | */ |
235 | |
236 | /* calculations for digital decay to forget 90% of usage in 5*loadav sec */ |
237 | #define loadfactor(loadav) (2 * (loadav) / ncpu) |
238 | |
239 | static fixpt_t |
240 | decay_cpu(fixpt_t loadfac, fixpt_t estcpu) |
241 | { |
242 | |
243 | if (estcpu == 0) { |
244 | return 0; |
245 | } |
246 | |
247 | #if !defined(_LP64) |
248 | /* avoid 64bit arithmetics. */ |
249 | #define FIXPT_MAX ((fixpt_t)((UINTMAX_C(1) << sizeof(fixpt_t) * CHAR_BIT) - 1)) |
250 | if (__predict_true(loadfac <= FIXPT_MAX / ESTCPU_MAX)) { |
251 | return estcpu * loadfac / (loadfac + FSCALE); |
252 | } |
253 | #endif /* !defined(_LP64) */ |
254 | |
255 | return (uint64_t)estcpu * loadfac / (loadfac + FSCALE); |
256 | } |
257 | |
258 | /* |
259 | * For all load averages >= 1 and max l_estcpu of (255 << ESTCPU_SHIFT), |
260 | * sleeping for at least seven times the loadfactor will decay l_estcpu to |
261 | * less than (1 << ESTCPU_SHIFT). |
262 | * |
263 | * note that our ESTCPU_MAX is actually much smaller than (255 << ESTCPU_SHIFT). |
264 | */ |
265 | static fixpt_t |
266 | decay_cpu_batch(fixpt_t loadfac, fixpt_t estcpu, unsigned int n) |
267 | { |
268 | |
269 | if ((n << FSHIFT) >= 7 * loadfac) { |
270 | return 0; |
271 | } |
272 | |
273 | while (estcpu != 0 && n > 1) { |
274 | estcpu = decay_cpu(loadfac, estcpu); |
275 | n--; |
276 | } |
277 | |
278 | return estcpu; |
279 | } |
280 | |
281 | /* |
282 | * sched_pstats_hook: |
283 | * |
284 | * Periodically called from sched_pstats(); used to recalculate priorities. |
285 | */ |
286 | void |
287 | sched_pstats_hook(struct lwp *l, int batch) |
288 | { |
289 | fixpt_t loadfac; |
290 | |
291 | /* |
292 | * If the LWP has slept an entire second, stop recalculating |
293 | * its priority until it wakes up. |
294 | */ |
295 | KASSERT(lwp_locked(l, NULL)); |
296 | if (l->l_stat == LSSLEEP || l->l_stat == LSSTOP || |
297 | l->l_stat == LSSUSPENDED) { |
298 | if (l->l_slptime > 1) { |
299 | return; |
300 | } |
301 | } |
302 | loadfac = 2 * (averunnable.ldavg[0]); |
303 | l->l_estcpu = decay_cpu(loadfac, l->l_estcpu); |
304 | resetpriority(l); |
305 | } |
306 | |
307 | /* |
308 | * Recalculate the priority of a process after it has slept for a while. |
309 | */ |
310 | static void |
311 | updatepri(struct lwp *l) |
312 | { |
313 | fixpt_t loadfac; |
314 | |
315 | KASSERT(lwp_locked(l, NULL)); |
316 | KASSERT(l->l_slptime > 1); |
317 | |
318 | loadfac = loadfactor(averunnable.ldavg[0]); |
319 | |
320 | l->l_slptime--; /* the first time was done in sched_pstats */ |
321 | l->l_estcpu = decay_cpu_batch(loadfac, l->l_estcpu, l->l_slptime); |
322 | resetpriority(l); |
323 | } |
324 | |
325 | void |
326 | sched_rqinit(void) |
327 | { |
328 | |
329 | } |
330 | |
331 | void |
332 | sched_setrunnable(struct lwp *l) |
333 | { |
334 | |
335 | if (l->l_slptime > 1) |
336 | updatepri(l); |
337 | } |
338 | |
339 | void |
340 | sched_nice(struct proc *p, int n) |
341 | { |
342 | struct lwp *l; |
343 | |
344 | KASSERT(mutex_owned(p->p_lock)); |
345 | |
346 | p->p_nice = n; |
347 | LIST_FOREACH(l, &p->p_lwps, l_sibling) { |
348 | lwp_lock(l); |
349 | resetpriority(l); |
350 | lwp_unlock(l); |
351 | } |
352 | } |
353 | |
354 | /* |
355 | * Recompute the priority of an LWP. Arrange to reschedule if |
356 | * the resulting priority is better than that of the current LWP. |
357 | */ |
358 | static void |
359 | resetpriority(struct lwp *l) |
360 | { |
361 | pri_t pri; |
362 | struct proc *p = l->l_proc; |
363 | |
364 | KASSERT(lwp_locked(l, NULL)); |
365 | |
366 | if (l->l_class != SCHED_OTHER) |
367 | return; |
368 | |
369 | /* See comments above ESTCPU_SHIFT definition. */ |
370 | pri = (PRI_KERNEL - 1) - (l->l_estcpu >> ESTCPU_SHIFT) - p->p_nice; |
371 | pri = imax(pri, 0); |
372 | if (pri != l->l_priority) |
373 | lwp_changepri(l, pri); |
374 | } |
375 | |
376 | /* |
377 | * We adjust the priority of the current LWP. The priority of a LWP |
378 | * gets worse as it accumulates CPU time. The CPU usage estimator (l_estcpu) |
379 | * is increased here. The formula for computing priorities will compute a |
380 | * different value each time l_estcpu increases. This can cause a switch, |
381 | * but unless the priority crosses a PPQ boundary the actual queue will not |
382 | * change. The CPU usage estimator ramps up quite quickly when the process |
383 | * is running (linearly), and decays away exponentially, at a rate which is |
384 | * proportionally slower when the system is busy. The basic principle is |
385 | * that the system will 90% forget that the process used a lot of CPU time |
386 | * in 5 * loadav seconds. This causes the system to favor processes which |
387 | * haven't run much recently, and to round-robin among other processes. |
388 | */ |
389 | |
390 | void |
391 | sched_schedclock(struct lwp *l) |
392 | { |
393 | |
394 | if (l->l_class != SCHED_OTHER) |
395 | return; |
396 | |
397 | KASSERT(!CURCPU_IDLE_P()); |
398 | l->l_estcpu = ESTCPULIM(l->l_estcpu + ESTCPU_ACCUM); |
399 | lwp_lock(l); |
400 | resetpriority(l); |
401 | lwp_unlock(l); |
402 | } |
403 | |
404 | /* |
405 | * sched_proc_fork: |
406 | * |
407 | * Inherit the parent's scheduler history. |
408 | */ |
409 | void |
410 | sched_proc_fork(struct proc *parent, struct proc *child) |
411 | { |
412 | lwp_t *pl; |
413 | |
414 | KASSERT(mutex_owned(parent->p_lock)); |
415 | |
416 | pl = LIST_FIRST(&parent->p_lwps); |
417 | child->p_estcpu_inherited = pl->l_estcpu; |
418 | child->p_forktime = sched_pstats_ticks; |
419 | } |
420 | |
421 | /* |
422 | * sched_proc_exit: |
423 | * |
424 | * Chargeback parents for the sins of their children. |
425 | */ |
426 | void |
427 | sched_proc_exit(struct proc *parent, struct proc *child) |
428 | { |
429 | fixpt_t loadfac = loadfactor(averunnable.ldavg[0]); |
430 | fixpt_t estcpu; |
431 | lwp_t *pl, *cl; |
432 | |
433 | /* XXX Only if parent != init?? */ |
434 | |
435 | mutex_enter(parent->p_lock); |
436 | pl = LIST_FIRST(&parent->p_lwps); |
437 | cl = LIST_FIRST(&child->p_lwps); |
438 | estcpu = decay_cpu_batch(loadfac, child->p_estcpu_inherited, |
439 | sched_pstats_ticks - child->p_forktime); |
440 | if (cl->l_estcpu > estcpu) { |
441 | lwp_lock(pl); |
442 | pl->l_estcpu = ESTCPULIM(pl->l_estcpu + cl->l_estcpu - estcpu); |
443 | lwp_unlock(pl); |
444 | } |
445 | mutex_exit(parent->p_lock); |
446 | } |
447 | |
448 | void |
449 | sched_wakeup(struct lwp *l) |
450 | { |
451 | |
452 | } |
453 | |
454 | void |
455 | sched_slept(struct lwp *l) |
456 | { |
457 | |
458 | } |
459 | |
460 | void |
461 | sched_lwp_fork(struct lwp *l1, struct lwp *l2) |
462 | { |
463 | |
464 | l2->l_estcpu = l1->l_estcpu; |
465 | } |
466 | |
467 | void |
468 | sched_lwp_collect(struct lwp *t) |
469 | { |
470 | lwp_t *l; |
471 | |
472 | /* Absorb estcpu value of collected LWP. */ |
473 | l = curlwp; |
474 | lwp_lock(l); |
475 | l->l_estcpu += t->l_estcpu; |
476 | lwp_unlock(l); |
477 | } |
478 | |
479 | void |
480 | sched_oncpu(lwp_t *l) |
481 | { |
482 | |
483 | } |
484 | |
485 | void |
486 | sched_newts(lwp_t *l) |
487 | { |
488 | |
489 | } |
490 | |
491 | /* |
492 | * Sysctl nodes and initialization. |
493 | */ |
494 | |
495 | static int |
496 | sysctl_sched_rtts(SYSCTLFN_ARGS) |
497 | { |
498 | struct sysctlnode node; |
499 | int rttsms = hztoms(rrticks); |
500 | |
501 | node = *rnode; |
502 | node.sysctl_data = &rttsms; |
503 | return sysctl_lookup(SYSCTLFN_CALL(&node)); |
504 | } |
505 | |
506 | SYSCTL_SETUP(sysctl_sched_4bsd_setup, "sysctl sched setup" ) |
507 | { |
508 | const struct sysctlnode *node = NULL; |
509 | |
510 | sysctl_createv(clog, 0, NULL, &node, |
511 | CTLFLAG_PERMANENT, |
512 | CTLTYPE_NODE, "sched" , |
513 | SYSCTL_DESCR("Scheduler options" ), |
514 | NULL, 0, NULL, 0, |
515 | CTL_KERN, CTL_CREATE, CTL_EOL); |
516 | |
517 | if (node == NULL) |
518 | return; |
519 | |
520 | rrticks = hz / 10; |
521 | |
522 | sysctl_createv(NULL, 0, &node, NULL, |
523 | CTLFLAG_PERMANENT, |
524 | CTLTYPE_STRING, "name" , NULL, |
525 | NULL, 0, __UNCONST("4.4BSD" ), 0, |
526 | CTL_CREATE, CTL_EOL); |
527 | sysctl_createv(NULL, 0, &node, NULL, |
528 | CTLFLAG_PERMANENT, |
529 | CTLTYPE_INT, "rtts" , |
530 | SYSCTL_DESCR("Round-robin time quantum (in milliseconds)" ), |
531 | sysctl_sched_rtts, 0, NULL, 0, |
532 | CTL_CREATE, CTL_EOL); |
533 | } |
534 | |