1 | /* $NetBSD: x86_machdep.c,v 1.76 2016/11/15 15:00:56 maxv Exp $ */ |
2 | |
3 | /*- |
4 | * Copyright (c) 2002, 2006, 2007 YAMAMOTO Takashi, |
5 | * Copyright (c) 2005, 2008, 2009 The NetBSD Foundation, Inc. |
6 | * All rights reserved. |
7 | * |
8 | * This code is derived from software contributed to The NetBSD Foundation |
9 | * by Julio M. Merino Vidal. |
10 | * |
11 | * Redistribution and use in source and binary forms, with or without |
12 | * modification, are permitted provided that the following conditions |
13 | * are met: |
14 | * 1. Redistributions of source code must retain the above copyright |
15 | * notice, this list of conditions and the following disclaimer. |
16 | * 2. Redistributions in binary form must reproduce the above copyright |
17 | * notice, this list of conditions and the following disclaimer in the |
18 | * documentation and/or other materials provided with the distribution. |
19 | * |
20 | * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS |
21 | * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED |
22 | * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR |
23 | * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS |
24 | * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR |
25 | * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF |
26 | * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS |
27 | * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN |
28 | * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) |
29 | * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE |
30 | * POSSIBILITY OF SUCH DAMAGE. |
31 | */ |
32 | |
33 | #include <sys/cdefs.h> |
34 | __KERNEL_RCSID(0, "$NetBSD: x86_machdep.c,v 1.76 2016/11/15 15:00:56 maxv Exp $" ); |
35 | |
36 | #include "opt_modular.h" |
37 | #include "opt_physmem.h" |
38 | #include "opt_splash.h" |
39 | |
40 | #include <sys/types.h> |
41 | #include <sys/param.h> |
42 | #include <sys/systm.h> |
43 | #include <sys/kcore.h> |
44 | #include <sys/errno.h> |
45 | #include <sys/kauth.h> |
46 | #include <sys/mutex.h> |
47 | #include <sys/cpu.h> |
48 | #include <sys/intr.h> |
49 | #include <sys/atomic.h> |
50 | #include <sys/module.h> |
51 | #include <sys/sysctl.h> |
52 | #include <sys/extent.h> |
53 | #include <sys/rnd.h> |
54 | |
55 | #include <x86/cpuvar.h> |
56 | #include <x86/cputypes.h> |
57 | #include <x86/machdep.h> |
58 | #include <x86/nmi.h> |
59 | #include <x86/pio.h> |
60 | |
61 | #include <dev/splash/splash.h> |
62 | #include <dev/isa/isareg.h> |
63 | #include <dev/ic/i8042reg.h> |
64 | #include <dev/mm.h> |
65 | |
66 | #include <machine/bootinfo.h> |
67 | #include <machine/vmparam.h> |
68 | |
69 | #include <uvm/uvm_extern.h> |
70 | |
71 | #include "acpica.h" |
72 | #if NACPICA > 0 |
73 | #include <dev/acpi/acpivar.h> |
74 | #endif |
75 | |
76 | #include "opt_md.h" |
77 | #if defined(MEMORY_DISK_HOOKS) && defined(MEMORY_DISK_DYNAMIC) |
78 | #include <dev/md.h> |
79 | #endif |
80 | |
/*
 * Idle-loop selection state.  All three variables are written together
 * by x86_cpu_idle_set().
 */
void (*x86_cpu_idle)(void);		/* currently selected idle routine */
static bool x86_cpu_idle_ipi;		/* consulted by cpu_need_resched() before kicking an idle CPU */
static char x86_cpu_idle_text[16];	/* name of the mechanism; exported as machdep.idle-mechanism */

#ifdef XEN
/* Candidate values for module_machine; module_init_md() picks one. */
char module_machine_amd64_xen[] = "amd64-xen";
char module_machine_i386_xen[] = "i386-xen";
char module_machine_i386pae_xen[] = "i386pae-xen";
#endif
90 | |
91 | |
92 | /* --------------------------------------------------------------------- */ |
93 | |
/*
 * Main bootinfo structure.  This is filled in by the bootstrap process
 * done in locore.S based on the information passed by the boot loader.
 * Individual entries are retrieved by type with lookup_bootinfo().
 */
struct bootinfo bootinfo;
99 | |
100 | /* --------------------------------------------------------------------- */ |
101 | |
/* Handle returned by kauth_listen_scope(); set once in machdep_init(). */
static kauth_listener_t x86_listener;
103 | |
104 | /* |
105 | * Given the type of a bootinfo entry, looks for a matching item inside |
106 | * the bootinfo structure. If found, returns a pointer to it (which must |
107 | * then be casted to the appropriate bootinfo_* type); otherwise, returns |
108 | * NULL. |
109 | */ |
110 | void * |
111 | lookup_bootinfo(int type) |
112 | { |
113 | bool found; |
114 | int i; |
115 | struct btinfo_common *bic; |
116 | |
117 | bic = (struct btinfo_common *)(bootinfo.bi_data); |
118 | found = FALSE; |
119 | for (i = 0; i < bootinfo.bi_nentries && !found; i++) { |
120 | if (bic->type == type) |
121 | found = TRUE; |
122 | else |
123 | bic = (struct btinfo_common *) |
124 | ((uint8_t *)bic + bic->len); |
125 | } |
126 | |
127 | return found ? bic : NULL; |
128 | } |
129 | |
#ifdef notyet
/*
 * Printable names of the bootinfo entry types, indexed by type number.
 */
static const char *btinfo_str[] = {
	BTINFO_STR
};

/*
 * aprint_bootinfo: list the available bootinfo entries on one line.
 *
 * Each entry is printed by name when its type indexes btinfo_str[],
 * and as a plain number otherwise.
 */
void
aprint_bootinfo(void)
{
	struct btinfo_common *entry;
	int n;

	aprint_normal("bootinfo:");

	entry = (struct btinfo_common *)(bootinfo.bi_data);
	for (n = 0; n < bootinfo.bi_nentries; n++) {
		const bool named =
		    entry->type >= 0 && entry->type < __arraycount(btinfo_str);

		if (named)
			aprint_normal(" %s", btinfo_str[entry->type]);
		else
			aprint_normal(" %d", entry->type);

		entry = (struct btinfo_common *)
		    ((uint8_t *)entry + entry->len);
	}

	aprint_normal("\n");
}
#endif /* notyet */
157 | |
158 | /* |
159 | * mm_md_physacc: check if given pa is accessible. |
160 | */ |
161 | int |
162 | mm_md_physacc(paddr_t pa, vm_prot_t prot) |
163 | { |
164 | extern phys_ram_seg_t mem_clusters[VM_PHYSSEG_MAX]; |
165 | extern int mem_cluster_cnt; |
166 | int i; |
167 | |
168 | for (i = 0; i < mem_cluster_cnt; i++) { |
169 | const phys_ram_seg_t *seg = &mem_clusters[i]; |
170 | paddr_t lstart = seg->start; |
171 | |
172 | if (lstart <= pa && pa - lstart <= seg->size) { |
173 | return 0; |
174 | } |
175 | } |
176 | return kauth_authorize_machdep(kauth_cred_get(), |
177 | KAUTH_MACHDEP_UNMANAGEDMEM, NULL, NULL, NULL, NULL); |
178 | } |
179 | |
180 | #ifdef MODULAR |
181 | /* |
182 | * Push any modules loaded by the boot loader. |
183 | */ |
184 | void |
185 | module_init_md(void) |
186 | { |
187 | struct btinfo_modulelist *biml; |
188 | struct bi_modulelist_entry *bi, *bimax; |
189 | |
190 | /* setup module path for XEN kernels */ |
191 | #ifdef XEN |
192 | #if defined(amd64) |
193 | module_machine = module_machine_amd64_xen; |
194 | #elif defined(i386) |
195 | #ifdef PAE |
196 | module_machine = module_machine_i386pae_xen; |
197 | #else |
198 | module_machine = module_machine_i386_xen; |
199 | #endif |
200 | #endif |
201 | #endif |
202 | |
203 | biml = lookup_bootinfo(BTINFO_MODULELIST); |
204 | if (biml == NULL) { |
205 | aprint_debug("No module info at boot\n" ); |
206 | return; |
207 | } |
208 | |
209 | bi = (struct bi_modulelist_entry *)((uint8_t *)biml + sizeof(*biml)); |
210 | bimax = bi + biml->num; |
211 | for (; bi < bimax; bi++) { |
212 | switch (bi->type) { |
213 | case BI_MODULE_ELF: |
214 | aprint_debug("Prep module path=%s len=%d pa=%x\n" , |
215 | bi->path, bi->len, bi->base); |
216 | KASSERT(trunc_page(bi->base) == bi->base); |
217 | module_prime(bi->path, |
218 | (void *)((uintptr_t)bi->base + KERNBASE), |
219 | bi->len); |
220 | break; |
221 | case BI_MODULE_IMAGE: |
222 | #ifdef SPLASHSCREEN |
223 | aprint_debug("Splash image path=%s len=%d pa=%x\n" , |
224 | bi->path, bi->len, bi->base); |
225 | KASSERT(trunc_page(bi->base) == bi->base); |
226 | splash_setimage( |
227 | (void *)((uintptr_t)bi->base + KERNBASE), bi->len); |
228 | #endif |
229 | break; |
230 | case BI_MODULE_RND: |
231 | aprint_debug("Random seed data path=%s len=%d pa=%x\n" , |
232 | bi->path, bi->len, bi->base); |
233 | KASSERT(trunc_page(bi->base) == bi->base); |
234 | rnd_seed( |
235 | (void *)((uintptr_t)bi->base + KERNBASE), |
236 | bi->len); |
237 | break; |
238 | case BI_MODULE_FS: |
239 | aprint_debug("File-system image path=%s len=%d pa=%x\n" , |
240 | bi->path, bi->len, bi->base); |
241 | KASSERT(trunc_page(bi->base) == bi->base); |
242 | #if defined(MEMORY_DISK_HOOKS) && defined(MEMORY_DISK_DYNAMIC) |
243 | md_root_setconf((void *)((uintptr_t)bi->base + KERNBASE), |
244 | bi->len); |
245 | #endif |
246 | break; |
247 | default: |
248 | aprint_debug("Skipping non-ELF module\n" ); |
249 | break; |
250 | } |
251 | } |
252 | } |
253 | #endif /* MODULAR */ |
254 | |
255 | void |
256 | cpu_need_resched(struct cpu_info *ci, int flags) |
257 | { |
258 | struct cpu_info *cur; |
259 | lwp_t *l; |
260 | |
261 | KASSERT(kpreempt_disabled()); |
262 | cur = curcpu(); |
263 | l = ci->ci_data.cpu_onproc; |
264 | ci->ci_want_resched |= flags; |
265 | |
266 | if (__predict_false((l->l_pflag & LP_INTR) != 0)) { |
267 | /* |
268 | * No point doing anything, it will switch soon. |
269 | * Also here to prevent an assertion failure in |
270 | * kpreempt() due to preemption being set on a |
271 | * soft interrupt LWP. |
272 | */ |
273 | return; |
274 | } |
275 | |
276 | if (l == ci->ci_data.cpu_idlelwp) { |
277 | if (ci == cur) |
278 | return; |
279 | if (x86_cpu_idle_ipi != false) { |
280 | cpu_kick(ci); |
281 | } |
282 | return; |
283 | } |
284 | |
285 | if ((flags & RESCHED_KPREEMPT) != 0) { |
286 | #ifdef __HAVE_PREEMPTION |
287 | atomic_or_uint(&l->l_dopreempt, DOPREEMPT_ACTIVE); |
288 | if (ci == cur) { |
289 | softint_trigger(1 << SIR_PREEMPT); |
290 | } else { |
291 | x86_send_ipi(ci, X86_IPI_KPREEMPT); |
292 | } |
293 | return; |
294 | #endif |
295 | } |
296 | |
297 | aston(l, X86_AST_PREEMPT); |
298 | if (ci == cur) { |
299 | return; |
300 | } |
301 | if ((flags & RESCHED_IMMED) != 0) { |
302 | cpu_kick(ci); |
303 | } |
304 | } |
305 | |
306 | void |
307 | cpu_signotify(struct lwp *l) |
308 | { |
309 | |
310 | KASSERT(kpreempt_disabled()); |
311 | aston(l, X86_AST_GENERIC); |
312 | if (l->l_cpu != curcpu()) |
313 | cpu_kick(l->l_cpu); |
314 | } |
315 | |
316 | void |
317 | cpu_need_proftick(struct lwp *l) |
318 | { |
319 | |
320 | KASSERT(kpreempt_disabled()); |
321 | KASSERT(l->l_cpu == curcpu()); |
322 | |
323 | l->l_pflag |= LP_OWEUPC; |
324 | aston(l, X86_AST_GENERIC); |
325 | } |
326 | |
327 | bool |
328 | cpu_intr_p(void) |
329 | { |
330 | int idepth; |
331 | |
332 | kpreempt_disable(); |
333 | idepth = curcpu()->ci_idepth; |
334 | kpreempt_enable(); |
335 | return (idepth >= 0); |
336 | } |
337 | |
338 | #ifdef __HAVE_PREEMPTION |
339 | /* |
340 | * Called to check MD conditions that would prevent preemption, and to |
341 | * arrange for those conditions to be rechecked later. |
342 | */ |
343 | bool |
344 | cpu_kpreempt_enter(uintptr_t where, int s) |
345 | { |
346 | struct pcb *pcb; |
347 | lwp_t *l; |
348 | |
349 | KASSERT(kpreempt_disabled()); |
350 | l = curlwp; |
351 | |
352 | /* |
353 | * If SPL raised, can't go. Note this implies that spin |
354 | * mutexes at IPL_NONE are _not_ valid to use. |
355 | */ |
356 | if (s > IPL_PREEMPT) { |
357 | softint_trigger(1 << SIR_PREEMPT); |
358 | aston(l, X86_AST_PREEMPT); /* paranoid */ |
359 | return false; |
360 | } |
361 | |
362 | /* Must save cr2 or it could be clobbered. */ |
363 | pcb = lwp_getpcb(l); |
364 | pcb->pcb_cr2 = rcr2(); |
365 | |
366 | return true; |
367 | } |
368 | |
369 | /* |
370 | * Called after returning from a kernel preemption, and called with |
371 | * preemption disabled. |
372 | */ |
373 | void |
374 | cpu_kpreempt_exit(uintptr_t where) |
375 | { |
376 | extern char x86_copyfunc_start, x86_copyfunc_end; |
377 | struct pcb *pcb; |
378 | |
379 | KASSERT(kpreempt_disabled()); |
380 | |
381 | /* |
382 | * If we interrupted any of the copy functions we must reload |
383 | * the pmap when resuming, as they cannot tolerate it being |
384 | * swapped out. |
385 | */ |
386 | if (where >= (uintptr_t)&x86_copyfunc_start && |
387 | where < (uintptr_t)&x86_copyfunc_end) { |
388 | pmap_load(); |
389 | } |
390 | |
391 | /* Restore cr2 only after the pmap, as pmap_load can block. */ |
392 | pcb = lwp_getpcb(curlwp); |
393 | lcr2(pcb->pcb_cr2); |
394 | } |
395 | |
396 | /* |
397 | * Return true if preemption is disabled for MD reasons. Must be called |
398 | * with preemption disabled, and thus is only for diagnostic checks. |
399 | */ |
400 | bool |
401 | cpu_kpreempt_disabled(void) |
402 | { |
403 | |
404 | return curcpu()->ci_ilevel > IPL_NONE; |
405 | } |
406 | #endif /* __HAVE_PREEMPTION */ |
407 | |
408 | SYSCTL_SETUP(sysctl_machdep_cpu_idle, "sysctl machdep cpu_idle" ) |
409 | { |
410 | const struct sysctlnode *mnode, *node; |
411 | |
412 | sysctl_createv(NULL, 0, NULL, &mnode, |
413 | CTLFLAG_PERMANENT, CTLTYPE_NODE, "machdep" , NULL, |
414 | NULL, 0, NULL, 0, CTL_MACHDEP, CTL_EOL); |
415 | |
416 | sysctl_createv(NULL, 0, &mnode, &node, |
417 | CTLFLAG_PERMANENT, CTLTYPE_STRING, "idle-mechanism" , |
418 | SYSCTL_DESCR("Mechanism used for the idle loop." ), |
419 | NULL, 0, x86_cpu_idle_text, 0, |
420 | CTL_CREATE, CTL_EOL); |
421 | } |
422 | |
423 | void |
424 | x86_cpu_idle_init(void) |
425 | { |
426 | |
427 | #ifndef XEN |
428 | if ((cpu_feature[1] & CPUID2_MONITOR) == 0 || |
429 | cpu_vendor == CPUVENDOR_AMD) |
430 | x86_cpu_idle_set(x86_cpu_idle_halt, "halt" , true); |
431 | else |
432 | x86_cpu_idle_set(x86_cpu_idle_mwait, "mwait" , false); |
433 | #else |
434 | x86_cpu_idle_set(x86_cpu_idle_xen, "xen" , true); |
435 | #endif |
436 | } |
437 | |
438 | void |
439 | x86_cpu_idle_get(void (**func)(void), char *text, size_t len) |
440 | { |
441 | |
442 | *func = x86_cpu_idle; |
443 | |
444 | (void)strlcpy(text, x86_cpu_idle_text, len); |
445 | } |
446 | |
447 | void |
448 | x86_cpu_idle_set(void (*func)(void), const char *text, bool ipi) |
449 | { |
450 | |
451 | x86_cpu_idle = func; |
452 | x86_cpu_idle_ipi = ipi; |
453 | |
454 | (void)strlcpy(x86_cpu_idle_text, text, sizeof(x86_cpu_idle_text)); |
455 | } |
456 | |
457 | #ifndef XEN |
458 | |
/* Unit conversions to bytes; the argument is converted to size_t first. */
#define KBTOB(x)	((size_t)(x) * 1024UL)		/* kilobytes -> bytes */
#define MBTOB(x)	((size_t)(x) * 1024UL * 1024UL)	/* megabytes -> bytes */
461 | |
/*
 * Table pairing each UVM freelist with the physical address limit it
 * serves.  Entries run from the largest limit to the smallest, with the
 * default freelist (limit 0) first.
 *
 * x86_select_freelist() scans the table forwards; x86_load_region()
 * scans it backwards, i.e. smallest limit first.  init_x86_vm() rewrites
 * the freelist member to VM_FREELIST_DEFAULT for every entry whose limit
 * is above avail_end.
 */
static struct {
	int freelist;		/* VM_FREELIST_* identifier */
	uint64_t limit;		/* addresses below this value belong to the list */
} x86_freelists[VM_NFREELIST] = {
	{ VM_FREELIST_DEFAULT, 0 },
#ifdef VM_FREELIST_FIRST1T
	/* 40-bit addresses needed for modern graphics. */
	{ VM_FREELIST_FIRST1T,	1ULL * 1024 * 1024 * 1024 * 1024 },
#endif
#ifdef VM_FREELIST_FIRST64G
	/* 36-bit addresses needed for oldish graphics. */
	{ VM_FREELIST_FIRST64G, 64ULL * 1024 * 1024 * 1024 },
#endif
#ifdef VM_FREELIST_FIRST4G
	/* 32-bit addresses needed for PCI 32-bit DMA and old graphics. */
	{ VM_FREELIST_FIRST4G,  4ULL * 1024 * 1024 * 1024 },
#endif
	/* 30-bit addresses needed for ancient graphics. */
	{ VM_FREELIST_FIRST1G,	1ULL * 1024 * 1024 * 1024 },
	/* 24-bit addresses needed for ISA DMA. */
	{ VM_FREELIST_FIRST16,	16 * 1024 * 1024 },
};
484 | |
485 | extern paddr_t avail_start, avail_end; |
486 | |
487 | int |
488 | x86_select_freelist(uint64_t maxaddr) |
489 | { |
490 | unsigned int i; |
491 | |
492 | if (avail_end <= maxaddr) |
493 | return VM_NFREELIST; |
494 | |
495 | for (i = 0; i < __arraycount(x86_freelists); i++) { |
496 | if ((x86_freelists[i].limit - 1) <= maxaddr) |
497 | return x86_freelists[i].freelist; |
498 | } |
499 | |
500 | panic("no freelist for maximum address %" PRIx64, maxaddr); |
501 | } |
502 | |
503 | static int |
504 | x86_add_cluster(struct extent *iomem_ex, uint64_t seg_start, uint64_t seg_end, |
505 | uint32_t type) |
506 | { |
507 | uint64_t new_physmem = 0; |
508 | phys_ram_seg_t *cluster; |
509 | int i; |
510 | |
511 | #ifdef i386 |
512 | #ifdef PAE |
513 | #define TOPLIMIT 0x1000000000ULL /* 64GB */ |
514 | #else |
515 | #define TOPLIMIT 0x100000000ULL /* 4GB */ |
516 | #endif |
517 | #else |
518 | #define TOPLIMIT 0x100000000000ULL /* 16TB */ |
519 | #endif |
520 | |
521 | if (seg_end > TOPLIMIT) { |
522 | aprint_verbose("WARNING: skipping large memory map entry: " |
523 | "0x%" PRIx64"/0x%" PRIx64"/0x%x\n" , |
524 | seg_start, (seg_end - seg_start), type); |
525 | return 0; |
526 | } |
527 | |
528 | /* |
529 | * XXX: Chop the last page off the size so that it can fit in avail_end. |
530 | */ |
531 | if (seg_end == TOPLIMIT) |
532 | seg_end -= PAGE_SIZE; |
533 | |
534 | if (seg_end <= seg_start) |
535 | return 0; |
536 | |
537 | for (i = 0; i < mem_cluster_cnt; i++) { |
538 | cluster = &mem_clusters[i]; |
539 | if ((cluster->start == round_page(seg_start)) && |
540 | (cluster->size == trunc_page(seg_end) - cluster->start)) { |
541 | #ifdef DEBUG_MEMLOAD |
542 | printf("WARNING: skipping duplicate segment entry\n" ); |
543 | #endif |
544 | return 0; |
545 | } |
546 | } |
547 | |
548 | /* |
549 | * Allocate the physical addresses used by RAM from the iomem extent |
550 | * map. This is done before the addresses are page rounded just to make |
551 | * sure we get them all. |
552 | */ |
553 | if (seg_start < 0x100000000ULL) { |
554 | uint64_t io_end; |
555 | |
556 | if (seg_end > 0x100000000ULL) |
557 | io_end = 0x100000000ULL; |
558 | else |
559 | io_end = seg_end; |
560 | |
561 | if (iomem_ex != NULL && extent_alloc_region(iomem_ex, seg_start, |
562 | io_end - seg_start, EX_NOWAIT)) { |
563 | /* XXX What should we do? */ |
564 | printf("WARNING: CAN't ALLOCATE MEMORY SEGMENT " |
565 | "(0x%" PRIx64"/0x%" PRIx64"/0x%x) FROM " |
566 | "IOMEM EXTENT MAP!\n" , |
567 | seg_start, seg_end - seg_start, type); |
568 | return 0; |
569 | } |
570 | } |
571 | |
572 | /* If it's not free memory, skip it. */ |
573 | if (type != BIM_Memory) |
574 | return 0; |
575 | |
576 | if (mem_cluster_cnt >= VM_PHYSSEG_MAX) { |
577 | panic("%s: too many memory segments (increase VM_PHYSSEG_MAX)" , |
578 | __func__); |
579 | } |
580 | |
581 | #ifdef PHYSMEM_MAX_ADDR |
582 | if (seg_start >= MBTOB(PHYSMEM_MAX_ADDR)) |
583 | return 0; |
584 | if (seg_end > MBTOB(PHYSMEM_MAX_ADDR)) |
585 | seg_end = MBTOB(PHYSMEM_MAX_ADDR); |
586 | #endif |
587 | |
588 | seg_start = round_page(seg_start); |
589 | seg_end = trunc_page(seg_end); |
590 | |
591 | if (seg_start == seg_end) |
592 | return 0; |
593 | |
594 | cluster = &mem_clusters[mem_cluster_cnt]; |
595 | cluster->start = seg_start; |
596 | if (iomem_ex != NULL) |
597 | new_physmem = physmem + atop(seg_end - seg_start); |
598 | |
599 | #ifdef PHYSMEM_MAX_SIZE |
600 | if (iomem_ex != NULL) { |
601 | if (physmem >= atop(MBTOB(PHYSMEM_MAX_SIZE))) |
602 | return 0; |
603 | if (new_physmem > atop(MBTOB(PHYSMEM_MAX_SIZE))) { |
604 | seg_end = seg_start + MBTOB(PHYSMEM_MAX_SIZE) - ptoa(physmem); |
605 | new_physmem = atop(MBTOB(PHYSMEM_MAX_SIZE)); |
606 | } |
607 | } |
608 | #endif |
609 | |
610 | cluster->size = seg_end - seg_start; |
611 | |
612 | if (iomem_ex != NULL) { |
613 | if (avail_end < seg_end) |
614 | avail_end = seg_end; |
615 | physmem = new_physmem; |
616 | } |
617 | mem_cluster_cnt++; |
618 | |
619 | return 0; |
620 | } |
621 | |
622 | static int |
623 | x86_parse_clusters(struct btinfo_memmap *bim, struct extent *iomem_ex) |
624 | { |
625 | uint64_t seg_start, seg_end; |
626 | uint64_t addr, size; |
627 | uint32_t type; |
628 | int x; |
629 | |
630 | KASSERT(bim != NULL); |
631 | KASSERT(bim->num > 0); |
632 | |
633 | #ifdef DEBUG_MEMLOAD |
634 | printf("BIOS MEMORY MAP (%d ENTRIES):\n" , bim->num); |
635 | #endif |
636 | |
637 | for (x = 0; x < bim->num; x++) { |
638 | addr = bim->entry[x].addr; |
639 | size = bim->entry[x].size; |
640 | type = bim->entry[x].type; |
641 | #ifdef DEBUG_MEMLOAD |
642 | printf(" addr 0x%" PRIx64" size 0x%" PRIx64" type 0x%x\n" , |
643 | addr, size, type); |
644 | #endif |
645 | |
646 | /* If the segment is not memory, skip it. */ |
647 | switch (type) { |
648 | case BIM_Memory: |
649 | case BIM_ACPI: |
650 | case BIM_NVS: |
651 | break; |
652 | default: |
653 | continue; |
654 | } |
655 | |
656 | /* If the segment is smaller than a page, skip it. */ |
657 | if (size < PAGE_SIZE) |
658 | continue; |
659 | |
660 | seg_start = addr; |
661 | seg_end = addr + size; |
662 | |
663 | /* |
664 | * XXX XXX: Avoid compatibility holes. |
665 | * |
666 | * Holes within memory space that allow access to be directed |
667 | * to the PC-compatible frame buffer (0xa0000-0xbffff), to |
668 | * adapter ROM space (0xc0000-0xdffff), and to system BIOS |
669 | * space (0xe0000-0xfffff). |
670 | * |
671 | * Some laptop (for example, Toshiba Satellite2550X) report |
672 | * this area and occurred problems, so we avoid this area. |
673 | */ |
674 | if (seg_start < 0x100000 && seg_end > 0xa0000) { |
675 | printf("WARNING: memory map entry overlaps " |
676 | "with ``Compatibility Holes'': " |
677 | "0x%" PRIx64"/0x%" PRIx64"/0x%x\n" , seg_start, |
678 | seg_end - seg_start, type); |
679 | |
680 | x86_add_cluster(iomem_ex, seg_start, 0xa0000, type); |
681 | x86_add_cluster(iomem_ex, 0x100000, seg_end, type); |
682 | } else { |
683 | x86_add_cluster(iomem_ex, seg_start, seg_end, type); |
684 | } |
685 | } |
686 | |
687 | return 0; |
688 | } |
689 | |
690 | static int |
691 | x86_fake_clusters(struct extent *iomem_ex) |
692 | { |
693 | phys_ram_seg_t *cluster; |
694 | KASSERT(mem_cluster_cnt == 0); |
695 | |
696 | /* |
697 | * Allocate the physical addresses used by RAM from the iomem extent |
698 | * map. This is done before the addresses are page rounded just to make |
699 | * sure we get them all. |
700 | */ |
701 | if (extent_alloc_region(iomem_ex, 0, KBTOB(biosbasemem), EX_NOWAIT)) { |
702 | /* XXX What should we do? */ |
703 | printf("WARNING: CAN'T ALLOCATE BASE MEMORY FROM " |
704 | "IOMEM EXTENT MAP!\n" ); |
705 | } |
706 | |
707 | cluster = &mem_clusters[0]; |
708 | cluster->start = 0; |
709 | cluster->size = trunc_page(KBTOB(biosbasemem)); |
710 | physmem += atop(cluster->size); |
711 | |
712 | if (extent_alloc_region(iomem_ex, IOM_END, KBTOB(biosextmem), |
713 | EX_NOWAIT)) { |
714 | /* XXX What should we do? */ |
715 | printf("WARNING: CAN'T ALLOCATE EXTENDED MEMORY FROM " |
716 | "IOMEM EXTENT MAP!\n" ); |
717 | } |
718 | |
719 | #if NISADMA > 0 |
720 | /* |
721 | * Some motherboards/BIOSes remap the 384K of RAM that would |
722 | * normally be covered by the ISA hole to the end of memory |
723 | * so that it can be used. However, on a 16M system, this |
724 | * would cause bounce buffers to be allocated and used. |
725 | * This is not desirable behaviour, as more than 384K of |
726 | * bounce buffers might be allocated. As a work-around, |
727 | * we round memory down to the nearest 1M boundary if |
728 | * we're using any isadma devices and the remapped memory |
729 | * is what puts us over 16M. |
730 | */ |
731 | if (biosextmem > (15*1024) && biosextmem < (16*1024)) { |
732 | char pbuf[9]; |
733 | |
734 | format_bytes(pbuf, sizeof(pbuf), biosextmem - (15*1024)); |
735 | printf("Warning: ignoring %s of remapped memory\n" , pbuf); |
736 | biosextmem = (15*1024); |
737 | } |
738 | #endif |
739 | |
740 | cluster = &mem_clusters[1]; |
741 | cluster->start = IOM_END; |
742 | cluster->size = trunc_page(KBTOB(biosextmem)); |
743 | physmem += atop(cluster->size); |
744 | |
745 | mem_cluster_cnt = 2; |
746 | |
747 | avail_end = IOM_END + trunc_page(KBTOB(biosextmem)); |
748 | |
749 | return 0; |
750 | } |
751 | |
752 | /* |
753 | * x86_load_region: load the physical memory region from seg_start to seg_end |
754 | * into the VM system. |
755 | */ |
756 | static void |
757 | x86_load_region(uint64_t seg_start, uint64_t seg_end) |
758 | { |
759 | unsigned int i; |
760 | uint64_t tmp; |
761 | |
762 | i = __arraycount(x86_freelists); |
763 | while (i--) { |
764 | if (x86_freelists[i].limit <= seg_start) |
765 | continue; |
766 | if (x86_freelists[i].freelist == VM_FREELIST_DEFAULT) |
767 | continue; |
768 | tmp = MIN(x86_freelists[i].limit, seg_end); |
769 | if (tmp == seg_start) |
770 | continue; |
771 | |
772 | #ifdef DEBUG_MEMLOAD |
773 | printf("loading freelist %d 0x%" PRIx64"-0x%" PRIx64 |
774 | " (0x%" PRIx64"-0x%" PRIx64")\n" , x86_freelists[i].freelist, |
775 | seg_start, tmp, (uint64_t)atop(seg_start), |
776 | (uint64_t)atop(tmp)); |
777 | #endif |
778 | |
779 | uvm_page_physload(atop(seg_start), atop(tmp), atop(seg_start), |
780 | atop(tmp), x86_freelists[i].freelist); |
781 | seg_start = tmp; |
782 | } |
783 | |
784 | if (seg_start != seg_end) { |
785 | #ifdef DEBUG_MEMLOAD |
786 | printf("loading default 0x%" PRIx64"-0x%" PRIx64 |
787 | " (0x%" PRIx64"-0x%" PRIx64")\n" , seg_start, seg_end, |
788 | (uint64_t)atop(seg_start), (uint64_t)atop(seg_end)); |
789 | #endif |
790 | uvm_page_physload(atop(seg_start), atop(seg_end), |
791 | atop(seg_start), atop(seg_end), VM_FREELIST_DEFAULT); |
792 | } |
793 | } |
794 | |
795 | /* |
796 | * init_x86_clusters: retrieve the memory clusters provided by the BIOS, and |
797 | * initialize mem_clusters. |
798 | */ |
799 | void |
800 | init_x86_clusters(void) |
801 | { |
802 | extern struct extent *iomem_ex; |
803 | struct btinfo_memmap *bim; |
804 | |
805 | /* |
806 | * Check to see if we have a memory map from the BIOS (passed to us by |
807 | * the boot program). |
808 | */ |
809 | #ifdef i386 |
810 | extern int biosmem_implicit; |
811 | bim = lookup_bootinfo(BTINFO_MEMMAP); |
812 | if ((biosmem_implicit || (biosbasemem == 0 && biosextmem == 0)) && |
813 | bim != NULL && bim->num > 0) |
814 | x86_parse_clusters(bim, iomem_ex); |
815 | #else |
816 | #if !defined(REALBASEMEM) && !defined(REALEXTMEM) |
817 | bim = lookup_bootinfo(BTINFO_MEMMAP); |
818 | if (bim != NULL && bim->num > 0) |
819 | x86_parse_clusters(bim, iomem_ex); |
820 | #else |
821 | (void)bim, (void)iomem_ex; |
822 | #endif |
823 | #endif |
824 | |
825 | if (mem_cluster_cnt == 0) { |
826 | /* |
827 | * If x86_parse_clusters didn't find any valid segment, create |
828 | * fake clusters. |
829 | */ |
830 | x86_fake_clusters(iomem_ex); |
831 | } |
832 | } |
833 | |
834 | /* |
835 | * init_x86_vm: initialize the VM system on x86. We basically internalize as |
836 | * many physical pages as we can, starting at avail_start, but we don't |
837 | * internalize the kernel physical pages (from IOM_END to pa_kend). |
838 | */ |
839 | int |
840 | init_x86_vm(paddr_t pa_kend) |
841 | { |
842 | uint64_t seg_start, seg_end; |
843 | uint64_t seg_start1, seg_end1; |
844 | int x; |
845 | unsigned i; |
846 | |
847 | for (i = 0; i < __arraycount(x86_freelists); i++) { |
848 | if (avail_end < x86_freelists[i].limit) |
849 | x86_freelists[i].freelist = VM_FREELIST_DEFAULT; |
850 | } |
851 | |
852 | #ifdef amd64 |
853 | extern vaddr_t kern_end; |
854 | extern vaddr_t module_start, module_end; |
855 | |
856 | module_start = kern_end; |
857 | module_end = KERNBASE + NKL2_KIMG_ENTRIES * NBPD_L2; |
858 | #endif |
859 | |
860 | /* |
861 | * Now, load the memory clusters (which have already been rounded and |
862 | * truncated) into the VM system. |
863 | * |
864 | * NOTE: we assume that memory starts at 0 and that the kernel is |
865 | * loaded at IOM_END (1MB). |
866 | */ |
867 | for (x = 0; x < mem_cluster_cnt; x++) { |
868 | const phys_ram_seg_t *cluster = &mem_clusters[x]; |
869 | |
870 | seg_start = cluster->start; |
871 | seg_end = cluster->start + cluster->size; |
872 | seg_start1 = 0; |
873 | seg_end1 = 0; |
874 | |
875 | /* Skip memory before our available starting point. */ |
876 | if (seg_end <= avail_start) |
877 | continue; |
878 | |
879 | if (seg_start <= avail_start && avail_start < seg_end) { |
880 | seg_start = avail_start; |
881 | if (seg_start == seg_end) |
882 | continue; |
883 | } |
884 | |
885 | /* |
886 | * If this segment contains the kernel, split it in two, around |
887 | * the kernel. |
888 | */ |
889 | if (seg_start <= IOM_END && pa_kend <= seg_end) { |
890 | seg_start1 = pa_kend; |
891 | seg_end1 = seg_end; |
892 | seg_end = IOM_END; |
893 | KASSERT(seg_end < seg_end1); |
894 | } |
895 | |
896 | /* First hunk */ |
897 | if (seg_start != seg_end) { |
898 | x86_load_region(seg_start, seg_end); |
899 | } |
900 | |
901 | /* Second hunk */ |
902 | if (seg_start1 != seg_end1) { |
903 | x86_load_region(seg_start1, seg_end1); |
904 | } |
905 | } |
906 | |
907 | return 0; |
908 | } |
909 | |
910 | #endif /* !XEN */ |
911 | |
912 | void |
913 | x86_reset(void) |
914 | { |
915 | uint8_t b; |
916 | |
917 | #if NACPICA > 0 |
918 | /* |
919 | * If ACPI is active, try to reset using the reset register |
920 | * defined in the FADT. |
921 | */ |
922 | if (acpi_active) { |
923 | if (acpi_reset() == 0) { |
924 | delay(500000); /* wait 0.5 sec to see if that did it */ |
925 | } |
926 | } |
927 | #endif |
928 | |
929 | /* |
930 | * The keyboard controller has 4 random output pins, one of which is |
931 | * connected to the RESET pin on the CPU in many PCs. We tell the |
932 | * keyboard controller to pulse this line a couple of times. |
933 | */ |
934 | outb(IO_KBD + KBCMDP, KBC_PULSE0); |
935 | delay(100000); |
936 | outb(IO_KBD + KBCMDP, KBC_PULSE0); |
937 | delay(100000); |
938 | |
939 | /* |
940 | * Attempt to force a reset via the Reset Control register at |
941 | * I/O port 0xcf9. Bit 2 forces a system reset when it |
942 | * transitions from 0 to 1. Bit 1 selects the type of reset |
943 | * to attempt: 0 selects a "soft" reset, and 1 selects a |
944 | * "hard" reset. We try a "hard" reset. The first write sets |
945 | * bit 1 to select a "hard" reset and clears bit 2. The |
946 | * second write forces a 0 -> 1 transition in bit 2 to trigger |
947 | * a reset. |
948 | */ |
949 | outb(0xcf9, 0x2); |
950 | outb(0xcf9, 0x6); |
951 | DELAY(500000); /* wait 0.5 sec to see if that did it */ |
952 | |
953 | /* |
954 | * Attempt to force a reset via the Fast A20 and Init register |
955 | * at I/O port 0x92. Bit 1 serves as an alternate A20 gate. |
956 | * Bit 0 asserts INIT# when set to 1. We are careful to only |
957 | * preserve bit 1 while setting bit 0. We also must clear bit |
958 | * 0 before setting it if it isn't already clear. |
959 | */ |
960 | b = inb(0x92); |
961 | if (b != 0xff) { |
962 | if ((b & 0x1) != 0) |
963 | outb(0x92, b & 0xfe); |
964 | outb(0x92, b | 0x1); |
965 | DELAY(500000); /* wait 0.5 sec to see if that did it */ |
966 | } |
967 | } |
968 | |
969 | static int |
970 | x86_listener_cb(kauth_cred_t cred, kauth_action_t action, void *cookie, |
971 | void *arg0, void *arg1, void *arg2, void *arg3) |
972 | { |
973 | int result; |
974 | |
975 | result = KAUTH_RESULT_DEFER; |
976 | |
977 | switch (action) { |
978 | case KAUTH_MACHDEP_IOPERM_GET: |
979 | case KAUTH_MACHDEP_LDT_GET: |
980 | case KAUTH_MACHDEP_LDT_SET: |
981 | case KAUTH_MACHDEP_MTRR_GET: |
982 | result = KAUTH_RESULT_ALLOW; |
983 | |
984 | break; |
985 | |
986 | default: |
987 | break; |
988 | } |
989 | |
990 | return result; |
991 | } |
992 | |
993 | void |
994 | machdep_init(void) |
995 | { |
996 | |
997 | x86_listener = kauth_listen_scope(KAUTH_SCOPE_MACHDEP, |
998 | x86_listener_cb, NULL); |
999 | } |
1000 | |
/*
 * x86_startup: x86 common startup routine, called by cpu_startup.
 *
 * On non-XEN kernels this initializes NMI handling; on XEN kernels it
 * does nothing.
 */
void
x86_startup(void)
{

#ifndef XEN
	nmi_init();
#endif /* !XEN */
}
1015 | |
1016 | /* |
1017 | * machine dependent system variables. |
1018 | */ |
1019 | static int |
1020 | sysctl_machdep_booted_kernel(SYSCTLFN_ARGS) |
1021 | { |
1022 | struct btinfo_bootpath *bibp; |
1023 | struct sysctlnode node; |
1024 | |
1025 | bibp = lookup_bootinfo(BTINFO_BOOTPATH); |
1026 | if(!bibp) |
1027 | return ENOENT; /* ??? */ |
1028 | |
1029 | node = *rnode; |
1030 | node.sysctl_data = bibp->bootpath; |
1031 | node.sysctl_size = sizeof(bibp->bootpath); |
1032 | return sysctl_lookup(SYSCTLFN_CALL(&node)); |
1033 | } |
1034 | |
1035 | static int |
1036 | sysctl_machdep_diskinfo(SYSCTLFN_ARGS) |
1037 | { |
1038 | struct sysctlnode node; |
1039 | extern struct bi_devmatch *x86_alldisks; |
1040 | extern int x86_ndisks; |
1041 | |
1042 | if (x86_alldisks == NULL) |
1043 | return EOPNOTSUPP; |
1044 | |
1045 | node = *rnode; |
1046 | node.sysctl_data = x86_alldisks; |
1047 | node.sysctl_size = sizeof(struct disklist) + |
1048 | (x86_ndisks - 1) * sizeof(struct nativedisk_info); |
1049 | return sysctl_lookup(SYSCTLFN_CALL(&node)); |
1050 | } |
1051 | |
1052 | static void |
1053 | const_sysctl(struct sysctllog **clog, const char *name, int type, |
1054 | u_quad_t value, int tag) |
1055 | { |
1056 | (sysctl_createv)(clog, 0, NULL, NULL, |
1057 | CTLFLAG_PERMANENT | CTLFLAG_IMMEDIATE, |
1058 | type, name, NULL, NULL, value, NULL, 0, |
1059 | CTL_MACHDEP, tag, CTL_EOL); |
1060 | } |
1061 | |
1062 | SYSCTL_SETUP(sysctl_machdep_setup, "sysctl machdep subtree setup" ) |
1063 | { |
1064 | extern uint64_t tsc_freq; |
1065 | extern int sparse_dump; |
1066 | |
1067 | sysctl_createv(clog, 0, NULL, NULL, |
1068 | CTLFLAG_PERMANENT, |
1069 | CTLTYPE_NODE, "machdep" , NULL, |
1070 | NULL, 0, NULL, 0, |
1071 | CTL_MACHDEP, CTL_EOL); |
1072 | |
1073 | sysctl_createv(clog, 0, NULL, NULL, |
1074 | CTLFLAG_PERMANENT, |
1075 | CTLTYPE_STRUCT, "console_device" , NULL, |
1076 | sysctl_consdev, 0, NULL, sizeof(dev_t), |
1077 | CTL_MACHDEP, CPU_CONSDEV, CTL_EOL); |
1078 | sysctl_createv(clog, 0, NULL, NULL, |
1079 | CTLFLAG_PERMANENT, |
1080 | CTLTYPE_STRING, "booted_kernel" , NULL, |
1081 | sysctl_machdep_booted_kernel, 0, NULL, 0, |
1082 | CTL_MACHDEP, CPU_BOOTED_KERNEL, CTL_EOL); |
1083 | sysctl_createv(clog, 0, NULL, NULL, |
1084 | CTLFLAG_PERMANENT, |
1085 | CTLTYPE_STRUCT, "diskinfo" , NULL, |
1086 | sysctl_machdep_diskinfo, 0, NULL, 0, |
1087 | CTL_MACHDEP, CPU_DISKINFO, CTL_EOL); |
1088 | |
1089 | sysctl_createv(clog, 0, NULL, NULL, |
1090 | CTLFLAG_PERMANENT, |
1091 | CTLTYPE_STRING, "cpu_brand" , NULL, |
1092 | NULL, 0, cpu_brand_string, 0, |
1093 | CTL_MACHDEP, CTL_CREATE, CTL_EOL); |
1094 | sysctl_createv(clog, 0, NULL, NULL, |
1095 | CTLFLAG_PERMANENT|CTLFLAG_READWRITE, |
1096 | CTLTYPE_INT, "sparse_dump" , NULL, |
1097 | NULL, 0, &sparse_dump, 0, |
1098 | CTL_MACHDEP, CTL_CREATE, CTL_EOL); |
1099 | sysctl_createv(clog, 0, NULL, NULL, |
1100 | CTLFLAG_PERMANENT, |
1101 | CTLTYPE_QUAD, "tsc_freq" , NULL, |
1102 | NULL, 0, &tsc_freq, 0, |
1103 | CTL_MACHDEP, CTL_CREATE, CTL_EOL); |
1104 | sysctl_createv(clog, 0, NULL, NULL, |
1105 | CTLFLAG_PERMANENT, |
1106 | CTLTYPE_INT, "pae" , |
1107 | SYSCTL_DESCR("Whether the kernel uses PAE" ), |
1108 | NULL, 0, &use_pae, 0, |
1109 | CTL_MACHDEP, CTL_CREATE, CTL_EOL); |
1110 | |
1111 | /* None of these can ever change once the system has booted */ |
1112 | const_sysctl(clog, "fpu_present" , CTLTYPE_INT, i386_fpu_present, |
1113 | CPU_FPU_PRESENT); |
1114 | const_sysctl(clog, "osfxsr" , CTLTYPE_INT, i386_use_fxsave, |
1115 | CPU_OSFXSR); |
1116 | const_sysctl(clog, "sse" , CTLTYPE_INT, i386_has_sse, |
1117 | CPU_SSE); |
1118 | const_sysctl(clog, "sse2" , CTLTYPE_INT, i386_has_sse2, |
1119 | CPU_SSE2); |
1120 | |
1121 | const_sysctl(clog, "fpu_save" , CTLTYPE_INT, x86_fpu_save, |
1122 | CTL_CREATE); |
1123 | const_sysctl(clog, "fpu_save_size" , CTLTYPE_INT, x86_fpu_save_size, |
1124 | CTL_CREATE); |
1125 | const_sysctl(clog, "xsave_features" , CTLTYPE_QUAD, x86_xsave_features, |
1126 | CTL_CREATE); |
1127 | |
1128 | #ifndef XEN |
1129 | const_sysctl(clog, "biosbasemem" , CTLTYPE_INT, biosbasemem, |
1130 | CPU_BIOSBASEMEM); |
1131 | const_sysctl(clog, "biosextmem" , CTLTYPE_INT, biosextmem, |
1132 | CPU_BIOSEXTMEM); |
1133 | #endif |
1134 | } |
1135 | |