1/* $NetBSD: kern_clock.c,v 1.134 2015/04/22 16:46:58 pooka Exp $ */
2
3/*-
4 * Copyright (c) 2000, 2004, 2006, 2007, 2008 The NetBSD Foundation, Inc.
5 * All rights reserved.
6 *
7 * This code is derived from software contributed to The NetBSD Foundation
8 * by Jason R. Thorpe of the Numerical Aerospace Simulation Facility,
9 * NASA Ames Research Center.
10 * This code is derived from software contributed to The NetBSD Foundation
11 * by Charles M. Hannum.
12 *
13 * Redistribution and use in source and binary forms, with or without
14 * modification, are permitted provided that the following conditions
15 * are met:
16 * 1. Redistributions of source code must retain the above copyright
17 * notice, this list of conditions and the following disclaimer.
18 * 2. Redistributions in binary form must reproduce the above copyright
19 * notice, this list of conditions and the following disclaimer in the
20 * documentation and/or other materials provided with the distribution.
21 *
22 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
23 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
24 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
25 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
26 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
27 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
28 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
29 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
30 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
31 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
32 * POSSIBILITY OF SUCH DAMAGE.
33 */
34
35/*-
36 * Copyright (c) 1982, 1986, 1991, 1993
37 * The Regents of the University of California. All rights reserved.
38 * (c) UNIX System Laboratories, Inc.
39 * All or some portions of this file are derived from material licensed
40 * to the University of California by American Telephone and Telegraph
41 * Co. or Unix System Laboratories, Inc. and are reproduced herein with
42 * the permission of UNIX System Laboratories, Inc.
43 *
44 * Redistribution and use in source and binary forms, with or without
45 * modification, are permitted provided that the following conditions
46 * are met:
47 * 1. Redistributions of source code must retain the above copyright
48 * notice, this list of conditions and the following disclaimer.
49 * 2. Redistributions in binary form must reproduce the above copyright
50 * notice, this list of conditions and the following disclaimer in the
51 * documentation and/or other materials provided with the distribution.
52 * 3. Neither the name of the University nor the names of its contributors
53 * may be used to endorse or promote products derived from this software
54 * without specific prior written permission.
55 *
56 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
57 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
58 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
59 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
60 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
61 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
62 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
63 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
64 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
65 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
66 * SUCH DAMAGE.
67 *
68 * @(#)kern_clock.c 8.5 (Berkeley) 1/21/94
69 */
70
71#include <sys/cdefs.h>
72__KERNEL_RCSID(0, "$NetBSD: kern_clock.c,v 1.134 2015/04/22 16:46:58 pooka Exp $");
73
74#ifdef _KERNEL_OPT
75#include "opt_dtrace.h"
76#include "opt_perfctrs.h"
77#endif
78
79#include <sys/param.h>
80#include <sys/systm.h>
81#include <sys/callout.h>
82#include <sys/kernel.h>
83#include <sys/proc.h>
84#include <sys/resourcevar.h>
85#include <sys/signalvar.h>
86#include <sys/sysctl.h>
87#include <sys/timex.h>
88#include <sys/sched.h>
89#include <sys/time.h>
90#include <sys/timetc.h>
91#include <sys/cpu.h>
92#include <sys/atomic.h>
93
94#ifdef GPROF
95#include <sys/gmon.h>
96#endif
97
98#ifdef KDTRACE_HOOKS
99#include <sys/dtrace_bsd.h>
100#include <sys/cpu.h>
101
/*
 * Per-CPU hook table, indexed by cpu_index().  hardclock() calls the
 * entry for the current CPU, if it is non-NULL, after
 * callout_hardclock() has run.
 */
cyclic_clock_func_t cyclic_clock_func[MAXCPUS];
103#endif
104
/* Handler for the "clockrate" sysctl node created in initclocks(). */
static int sysctl_kern_clockrate(SYSCTLFN_PROTO);
106
107/*
108 * Clock handling routines.
109 *
110 * This code is written to operate with two timers that run independently of
111 * each other. The main clock, running hz times per second, is used to keep
112 * track of real time. The second timer handles kernel and user profiling,
113 * and does resource use estimation. If the second timer is programmable,
114 * it is randomized to avoid aliasing between the two clocks. For example,
115 * the randomization prevents an adversary from always giving up the CPU
116 * just before its quantum expires. Otherwise, it would never accumulate
117 * CPU ticks. The mean frequency of the second timer is stathz.
118 *
119 * If no second timer exists, stathz will be zero; in this case we drive
120 * profiling and statistics off the main clock. This WILL NOT be accurate;
121 * do not do it unless absolutely necessary.
122 *
123 * The statistics clock may (or may not) be run at a higher rate while
124 * profiling. This profile clock runs at profhz. We require that profhz
125 * be an integral multiple of stathz.
126 *
127 * If the statistics clock is running fast, it must be divided by the ratio
128 * profhz/stathz for statistics. (For profiling, every tick counts.)
129 */
130
int stathz; /* mean statistics clock frequency; 0 if no second timer exists */
int profhz; /* profiling clock frequency; initclocks() fills it in if 0 */
int profsrc; /* profiling source; statclock() samples only for PROFSRC_CLOCK */
int schedhz; /* if 0, hardclock() calls schedclock() itself */
int profprocs; /* processes with PST_PROFIL set; see startprofclock() */
int hardclock_ticks; /* hardclock() calls counted on the primary CPU */
static int hardscheddiv; /* hard => sched divider (used if schedhz == 0) */
static int psdiv; /* prof => stat divider */
int psratio; /* ratio: prof / stat */
140
static u_int get_intr_timecount(struct timecounter *);

/*
 * Minimum default timecounter, backed by hardclock_ticks.  initclocks()
 * sets its frequency to hz and registers it with tc_init() before
 * calling cpu_initclocks().
 */
static struct timecounter intr_timecounter = {
	get_intr_timecount, /* get_timecount */
	0, /* no poll_pps */
	~0u, /* counter_mask */
	0, /* frequency; set to hz in initclocks() */
	"clockinterrupt", /* name */
	0, /* quality - minimum implementation level for a clock */
	NULL, /* prev -- NOTE(review): confirm which member this slot sets */
	NULL, /* next */
};
153
154static u_int
155get_intr_timecount(struct timecounter *tc)
156{
157
158 return (u_int)hardclock_ticks;
159}
160
161/*
162 * Initialize clock frequencies and start both clocks running.
163 */
164void
165initclocks(void)
166{
167 static struct sysctllog *clog;
168 int i;
169
170 /*
171 * Set divisors to 1 (normal case) and let the machine-specific
172 * code do its bit.
173 */
174 psdiv = 1;
175 /*
176 * provide minimum default time counter
177 * will only run at interrupt resolution
178 */
179 intr_timecounter.tc_frequency = hz;
180 tc_init(&intr_timecounter);
181 cpu_initclocks();
182
183 /*
184 * Compute profhz and stathz, fix profhz if needed.
185 */
186 i = stathz ? stathz : hz;
187 if (profhz == 0)
188 profhz = i;
189 psratio = profhz / i;
190 if (schedhz == 0) {
191 /* 16Hz is best */
192 hardscheddiv = hz / 16;
193 if (hardscheddiv <= 0)
194 panic("hardscheddiv");
195 }
196
197 sysctl_createv(&clog, 0, NULL, NULL,
198 CTLFLAG_PERMANENT,
199 CTLTYPE_STRUCT, "clockrate",
200 SYSCTL_DESCR("Kernel clock rates"),
201 sysctl_kern_clockrate, 0, NULL,
202 sizeof(struct clockinfo),
203 CTL_KERN, KERN_CLOCKRATE, CTL_EOL);
204 sysctl_createv(&clog, 0, NULL, NULL,
205 CTLFLAG_PERMANENT,
206 CTLTYPE_INT, "hardclock_ticks",
207 SYSCTL_DESCR("Number of hardclock ticks"),
208 NULL, 0, &hardclock_ticks, sizeof(hardclock_ticks),
209 CTL_KERN, KERN_HARDCLOCK_TICKS, CTL_EOL);
210}
211
212/*
213 * The real-time timer, interrupting hz times per second.
214 */
215void
216hardclock(struct clockframe *frame)
217{
218 struct lwp *l;
219 struct cpu_info *ci;
220
221 ci = curcpu();
222 l = ci->ci_data.cpu_onproc;
223
224 timer_tick(l, CLKF_USERMODE(frame));
225
226 /*
227 * If no separate statistics clock is available, run it from here.
228 */
229 if (stathz == 0)
230 statclock(frame);
231 /*
232 * If no separate schedclock is provided, call it here
233 * at about 16 Hz.
234 */
235 if (schedhz == 0) {
236 if ((int)(--ci->ci_schedstate.spc_schedticks) <= 0) {
237 schedclock(l);
238 ci->ci_schedstate.spc_schedticks = hardscheddiv;
239 }
240 }
241 if ((--ci->ci_schedstate.spc_ticks) <= 0)
242 sched_tick(ci);
243
244 if (CPU_IS_PRIMARY(ci)) {
245 hardclock_ticks++;
246 tc_ticktock();
247 }
248
249 /*
250 * Update real-time timeout queue.
251 */
252 callout_hardclock();
253
254#ifdef KDTRACE_HOOKS
255 cyclic_clock_func_t func = cyclic_clock_func[cpu_index(ci)];
256 if (func) {
257 (*func)((struct clockframe *)frame);
258 }
259#endif
260}
261
262/*
263 * Start profiling on a process.
264 *
265 * Kernel profiling passes proc0 which never exits and hence
266 * keeps the profile clock running constantly.
267 */
268void
269startprofclock(struct proc *p)
270{
271
272 KASSERT(mutex_owned(&p->p_stmutex));
273
274 if ((p->p_stflag & PST_PROFIL) == 0) {
275 p->p_stflag |= PST_PROFIL;
276 /*
277 * This is only necessary if using the clock as the
278 * profiling source.
279 */
280 if (++profprocs == 1 && stathz != 0)
281 psdiv = psratio;
282 }
283}
284
285/*
286 * Stop profiling on a process.
287 */
288void
289stopprofclock(struct proc *p)
290{
291
292 KASSERT(mutex_owned(&p->p_stmutex));
293
294 if (p->p_stflag & PST_PROFIL) {
295 p->p_stflag &= ~PST_PROFIL;
296 /*
297 * This is only necessary if using the clock as the
298 * profiling source.
299 */
300 if (--profprocs == 0 && stathz != 0)
301 psdiv = 1;
302 }
303}
304
305#if defined(PERFCTRS)
306/*
307 * Independent profiling "tick" in case we're using a separate
308 * clock or profiling event source. Currently, that's just
309 * performance counters--hence the wrapper.
310 */
311void
312proftick(struct clockframe *frame)
313{
314#ifdef GPROF
315 struct gmonparam *g;
316 intptr_t i;
317#endif
318 struct lwp *l;
319 struct proc *p;
320
321 l = curcpu()->ci_data.cpu_onproc;
322 p = (l ? l->l_proc : NULL);
323 if (CLKF_USERMODE(frame)) {
324 mutex_spin_enter(&p->p_stmutex);
325 if (p->p_stflag & PST_PROFIL)
326 addupc_intr(l, CLKF_PC(frame));
327 mutex_spin_exit(&p->p_stmutex);
328 } else {
329#ifdef GPROF
330 g = &_gmonparam;
331 if (g->state == GMON_PROF_ON) {
332 i = CLKF_PC(frame) - g->lowpc;
333 if (i < g->textsize) {
334 i /= HISTFRACTION * sizeof(*g->kcount);
335 g->kcount[i]++;
336 }
337 }
338#endif
339#ifdef LWP_PC
340 if (p != NULL && (p->p_stflag & PST_PROFIL) != 0)
341 addupc_intr(l, LWP_PC(l));
342#endif
343 }
344}
345#endif
346
347void
348schedclock(struct lwp *l)
349{
350 if ((l->l_flag & LW_IDLE) != 0)
351 return;
352
353 sched_schedclock(l);
354}
355
/*
 * Statistics clock.  Grab profile sample, and if divider reaches 0,
 * do process and kernel statistics.
 *
 * hardclock() calls this directly when stathz is zero.  All counters
 * touched here belong to the current CPU (spc) or to the process that
 * owns the LWP currently on it.
 */
void
statclock(struct clockframe *frame)
{
#ifdef GPROF
	struct gmonparam *g;
	intptr_t i;
#endif
	struct cpu_info *ci = curcpu();
	struct schedstate_percpu *spc = &ci->ci_schedstate;
	struct proc *p;
	struct lwp *l;

	/*
	 * Notice changes in divisor frequency, and adjust clock
	 * frequency accordingly.  psdiv is changed by startprofclock()
	 * and stopprofclock(); each CPU picks the new value up here and
	 * restarts its countdown.
	 */
	if (spc->spc_psdiv != psdiv) {
		spc->spc_psdiv = psdiv;
		spc->spc_pscnt = psdiv;
		if (psdiv == 1) {
			setstatclockrate(stathz);
		} else {
			setstatclockrate(profhz);
		}
	}
	l = ci->ci_data.cpu_onproc;
	if ((l->l_flag & LW_IDLE) != 0) {
		/*
		 * don't account idle lwps as swapper.
		 */
		p = NULL;
	} else {
		/*
		 * p->p_stmutex stays held until one of the exit paths
		 * below releases it.
		 */
		p = l->l_proc;
		mutex_spin_enter(&p->p_stmutex);
	}

	if (CLKF_USERMODE(frame)) {
		/*
		 * NOTE(review): p is used without a NULL check on this
		 * path; presumably the idle LWP is never on the CPU when
		 * the frame is from user mode -- confirm.
		 */
		if ((p->p_stflag & PST_PROFIL) && profsrc == PROFSRC_CLOCK)
			addupc_intr(l, CLKF_PC(frame));
		/* Divider has not reached 0 yet: profile sample only. */
		if (--spc->spc_pscnt > 0) {
			mutex_spin_exit(&p->p_stmutex);
			return;
		}

		/*
		 * Came from user mode; CPU was in user state.
		 * Charge the tick to the process as user time, and to
		 * this CPU as nice or user time depending on p_nice.
		 */
		p->p_uticks++;
		if (p->p_nice > NZERO)
			spc->spc_cp_time[CP_NICE]++;
		else
			spc->spc_cp_time[CP_USER]++;
	} else {
#ifdef GPROF
		/*
		 * Kernel statistics are just like addupc_intr, only easier.
		 */
		g = &_gmonparam;
		if (profsrc == PROFSRC_CLOCK && g->state == GMON_PROF_ON) {
			i = CLKF_PC(frame) - g->lowpc;
			if (i < g->textsize) {
				i /= HISTFRACTION * sizeof(*g->kcount);
				g->kcount[i]++;
			}
		}
#endif
#ifdef LWP_PC
		if (p != NULL && profsrc == PROFSRC_CLOCK &&
		    (p->p_stflag & PST_PROFIL)) {
			addupc_intr(l, LWP_PC(l));
		}
#endif
		/* Divider has not reached 0 yet: profile sample only. */
		if (--spc->spc_pscnt > 0) {
			if (p != NULL)
				mutex_spin_exit(&p->p_stmutex);
			return;
		}
		/*
		 * Came from kernel mode, so we were:
		 * - handling an interrupt,
		 * - doing syscall or trap work on behalf of the current
		 *   user process, or
		 * - spinning in the idle loop.
		 * Whichever it is, charge the time as appropriate.
		 * Note that we charge interrupts to the current process,
		 * regardless of whether they are ``for'' that process,
		 * so that we know how much of its real time was spent
		 * in ``non-process'' (i.e., interrupt) work.
		 */
		if (CLKF_INTR(frame) || (curlwp->l_pflag & LP_INTR) != 0) {
			if (p != NULL) {
				p->p_iticks++;
			}
			spc->spc_cp_time[CP_INTR]++;
		} else if (p != NULL) {
			p->p_sticks++;
			spc->spc_cp_time[CP_SYS]++;
		} else {
			spc->spc_cp_time[CP_IDLE]++;
		}
	}
	/* A statistics tick was taken: restart the divider countdown. */
	spc->spc_pscnt = psdiv;

	if (p != NULL) {
		/* Count the tick against the LWP, then drop the lock. */
		atomic_inc_uint(&l->l_cpticks);
		mutex_spin_exit(&p->p_stmutex);
	}
}
469
470/*
471 * sysctl helper routine for kern.clockrate. Assembles a struct on
472 * the fly to be returned to the caller.
473 */
474static int
475sysctl_kern_clockrate(SYSCTLFN_ARGS)
476{
477 struct clockinfo clkinfo;
478 struct sysctlnode node;
479
480 clkinfo.tick = tick;
481 clkinfo.tickadj = tickadj;
482 clkinfo.hz = hz;
483 clkinfo.profhz = profhz;
484 clkinfo.stathz = stathz ? stathz : hz;
485
486 node = *rnode;
487 node.sysctl_data = &clkinfo;
488 return (sysctl_lookup(SYSCTLFN_CALL(&node)));
489}
490