1/* $NetBSD: machdep.c,v 1.233 2016/11/17 16:26:08 maxv Exp $ */
2
3/*-
4 * Copyright (c) 1996, 1997, 1998, 2000, 2006, 2007, 2008, 2011
5 * The NetBSD Foundation, Inc.
6 * All rights reserved.
7 *
8 * This code is derived from software contributed to The NetBSD Foundation
9 * by Charles M. Hannum and by Jason R. Thorpe of the Numerical Aerospace
10 * Simulation Facility, NASA Ames Research Center.
11 *
12 * This code is derived from software contributed to The NetBSD Foundation
13 * by Coyote Point Systems, Inc. which was written under contract to Coyote
14 * Point by Jed Davis and Devon O'Dell.
15 *
16 * Redistribution and use in source and binary forms, with or without
17 * modification, are permitted provided that the following conditions
18 * are met:
19 * 1. Redistributions of source code must retain the above copyright
20 * notice, this list of conditions and the following disclaimer.
21 * 2. Redistributions in binary form must reproduce the above copyright
22 * notice, this list of conditions and the following disclaimer in the
23 * documentation and/or other materials provided with the distribution.
24 *
25 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
26 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
27 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
28 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
29 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
30 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
31 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
32 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
33 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
34 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
35 * POSSIBILITY OF SUCH DAMAGE.
36 */
37
38/*
39 * Copyright (c) 2006 Mathieu Ropert <mro@adviseo.fr>
40 *
41 * Permission to use, copy, modify, and distribute this software for any
42 * purpose with or without fee is hereby granted, provided that the above
43 * copyright notice and this permission notice appear in all copies.
44 *
45 * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
46 * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
47 * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
48 * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
49 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
50 * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
51 * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
52 */
53
54/*
55 * Copyright (c) 2007 Manuel Bouyer.
56 *
57 * Redistribution and use in source and binary forms, with or without
58 * modification, are permitted provided that the following conditions
59 * are met:
60 * 1. Redistributions of source code must retain the above copyright
61 * notice, this list of conditions and the following disclaimer.
62 * 2. Redistributions in binary form must reproduce the above copyright
63 * notice, this list of conditions and the following disclaimer in the
64 * documentation and/or other materials provided with the distribution.
65 *
66 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
67 * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
68 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
69 * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
70 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
71 * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
72 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
73 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
74 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
75 * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
76 *
77 */
78
79/*-
80 * Copyright (c) 1982, 1987, 1990 The Regents of the University of California.
81 * All rights reserved.
82 *
83 * This code is derived from software contributed to Berkeley by
84 * William Jolitz.
85 *
86 * Redistribution and use in source and binary forms, with or without
87 * modification, are permitted provided that the following conditions
88 * are met:
89 * 1. Redistributions of source code must retain the above copyright
90 * notice, this list of conditions and the following disclaimer.
91 * 2. Redistributions in binary form must reproduce the above copyright
92 * notice, this list of conditions and the following disclaimer in the
93 * documentation and/or other materials provided with the distribution.
94 * 3. Neither the name of the University nor the names of its contributors
95 * may be used to endorse or promote products derived from this software
96 * without specific prior written permission.
97 *
98 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
99 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
100 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
101 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
102 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
103 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
104 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
105 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
106 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
107 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
108 * SUCH DAMAGE.
109 *
110 * @(#)machdep.c 7.4 (Berkeley) 6/3/91
111 */
112
113#include <sys/cdefs.h>
114__KERNEL_RCSID(0, "$NetBSD: machdep.c,v 1.233 2016/11/17 16:26:08 maxv Exp $");
115
116/* #define XENDEBUG_LOW */
117
118#include "opt_modular.h"
119#include "opt_user_ldt.h"
120#include "opt_ddb.h"
121#include "opt_kgdb.h"
122#include "opt_cpureset_delay.h"
123#include "opt_mtrr.h"
124#include "opt_realmem.h"
125#include "opt_xen.h"
126#ifndef XEN
127#include "opt_physmem.h"
128#endif
129#include "isa.h"
130#include "pci.h"
131
132#include <sys/param.h>
133#include <sys/systm.h>
134#include <sys/signal.h>
135#include <sys/signalvar.h>
136#include <sys/kernel.h>
137#include <sys/cpu.h>
138#include <sys/exec.h>
139#include <sys/exec_aout.h> /* for MID_* */
140#include <sys/reboot.h>
141#include <sys/conf.h>
142#include <sys/mbuf.h>
143#include <sys/msgbuf.h>
144#include <sys/mount.h>
145#include <sys/core.h>
146#include <sys/kcore.h>
147#include <sys/ucontext.h>
148#include <machine/kcore.h>
149#include <sys/ras.h>
150#include <sys/syscallargs.h>
151#include <sys/ksyms.h>
152#include <sys/device.h>
153#include <sys/lwp.h>
154#include <sys/proc.h>
155
156#ifdef KGDB
157#include <sys/kgdb.h>
158#endif
159
160#include <dev/cons.h>
161#include <dev/mm.h>
162
163#include <uvm/uvm.h>
164#include <uvm/uvm_page.h>
165
166#include <sys/sysctl.h>
167
168#include <machine/cpu.h>
169#include <machine/cpufunc.h>
170#include <machine/gdt.h>
171#include <machine/intr.h>
172#include <machine/pio.h>
173#include <machine/psl.h>
174#include <machine/reg.h>
175#include <machine/specialreg.h>
176#include <machine/bootinfo.h>
177#include <x86/fpu.h>
178#include <machine/mtrr.h>
179#include <machine/mpbiosvar.h>
180
181#include <x86/cputypes.h>
182#include <x86/cpuvar.h>
183#include <x86/machdep.h>
184
185#include <x86/x86/tsc.h>
186
187#include <dev/isa/isareg.h>
188#include <machine/isa_machdep.h>
189#include <dev/ic/i8042reg.h>
190
191#ifdef XEN
192#include <xen/xen.h>
193#include <xen/hypervisor.h>
194#include <xen/evtchn.h>
195#endif
196
197#ifdef DDB
198#include <machine/db_machdep.h>
199#include <ddb/db_extern.h>
200#include <ddb/db_output.h>
201#include <ddb/db_interface.h>
202#endif
203
204#include "acpica.h"
205
206#if NACPICA > 0
207#include <dev/acpi/acpivar.h>
208#define ACPI_MACHDEP_PRIVATE
209#include <machine/acpi_machdep.h>
210#endif
211
212#include "isa.h"
213#include "isadma.h"
214#include "ksyms.h"
215
216/* the following is used externally (sysctl_hw) */
217char machine[] = "amd64"; /* CPU "architecture" */
218char machine_arch[] = "x86_64"; /* machine == machine_arch */
219
220#ifdef CPURESET_DELAY
221int cpureset_delay = CPURESET_DELAY;
222#else
223int cpureset_delay = 2000; /* default to 2s */
224#endif
225
226int cpu_class = CPUCLASS_686;
227
228#ifdef MTRR
229struct mtrr_funcs *mtrr_funcs;
230#endif
231
232uint64_t dumpmem_low;
233uint64_t dumpmem_high;
234int cpu_class;
235int use_pae;
236
237#ifndef NO_SPARSE_DUMP
238int sparse_dump = 1;
239
240paddr_t max_paddr = 0;
241unsigned char *sparse_dump_physmap;
242#endif
243
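/*
 * The dump header staging buffer: dump_header_addbytes() appends data
 * to it, and dump_header_flush() writes it out to the dump device
 * whenever it fills up (or at the end, via dump_header_finish()).
 */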
244char *dump_headerbuf, *dump_headerbuf_ptr;
245#define dump_headerbuf_size PAGE_SIZE
246#define dump_headerbuf_end (dump_headerbuf + dump_headerbuf_size)
247#define dump_headerbuf_avail (dump_headerbuf_end - dump_headerbuf_ptr)
248daddr_t dump_header_blkno;
249
250size_t dump_nmemsegs;
251size_t dump_npages;
252size_t dump_header_size;
253size_t dump_totalbytesleft;
254
255vaddr_t msgbuf_vaddr;
256
257struct {
258 paddr_t paddr;
259 psize_t sz;
260} msgbuf_p_seg[VM_PHYSSEG_MAX];
261unsigned int msgbuf_p_cnt = 0;
262
263vaddr_t idt_vaddr;
264paddr_t idt_paddr;
265vaddr_t gdt_vaddr;
266paddr_t gdt_paddr;
267vaddr_t ldt_vaddr;
268paddr_t ldt_paddr;
269
270vaddr_t module_start, module_end;
271static struct vm_map module_map_store;
272extern struct vm_map *module_map;
273vaddr_t kern_end;
274
275struct vm_map *phys_map = NULL;
276
277extern paddr_t avail_start, avail_end;
278#ifdef XEN
279extern paddr_t pmap_pa_start, pmap_pa_end;
280#endif
281
282#ifndef XEN
283void (*delay_func)(unsigned int) = i8254_delay;
284void (*initclock_func)(void) = i8254_initclocks;
285#else /* XEN */
286void (*delay_func)(unsigned int) = xen_delay;
287void (*initclock_func)(void) = xen_initclocks;
288#endif
289
290
291/*
292 * Size of memory segments, before any memory is stolen.
293 */
294phys_ram_seg_t mem_clusters[VM_PHYSSEG_MAX];
295int mem_cluster_cnt;
296
297char x86_64_doubleflt_stack[4096];
298
299int cpu_dump(void);
300int cpu_dumpsize(void);
301u_long cpu_dump_mempagecnt(void);
302void dodumpsys(void);
303void dumpsys(void);
304
305extern int time_adjusted; /* XXX no common header */
306
307void dump_misc_init(void);
308void dump_seg_prep(void);
309int dump_seg_iter(int (*)(paddr_t, paddr_t));
310
311#ifndef NO_SPARSE_DUMP
312void sparse_dump_reset(void);
313void sparse_dump_mark(void);
314void cpu_dump_prep_sparse(void);
315#endif
316
317void dump_header_start(void);
318int dump_header_flush(void);
319int dump_header_addbytes(const void*, size_t);
320int dump_header_addseg(paddr_t, paddr_t);
321int dump_header_finish(void);
322
323int dump_seg_count_range(paddr_t, paddr_t);
324int dumpsys_seg(paddr_t, paddr_t);
325
326void init_x86_64(paddr_t);
327
328static int valid_user_selector(struct lwp *, uint64_t);
329
330/*
331 * Machine-dependent startup code
332 */
333void
334cpu_startup(void)
335{
336 int x, y;
337 vaddr_t minaddr, maxaddr;
338 psize_t sz;
339
340 /*
341 * For console drivers that require uvm and pmap to be initialized,
342 * we'll give them one more chance here...
343 */
344 consinit();
345
346 /*
347	 * Initialize error message buffer (at end of core).
348 */
349 if (msgbuf_p_cnt == 0)
350 panic("msgbuf paddr map has not been set up");
351 for (x = 0, sz = 0; x < msgbuf_p_cnt; sz += msgbuf_p_seg[x++].sz)
352 continue;
353
354 msgbuf_vaddr = uvm_km_alloc(kernel_map, sz, 0, UVM_KMF_VAONLY);
355 if (msgbuf_vaddr == 0)
356 panic("failed to valloc msgbuf_vaddr");
357
358 for (y = 0, sz = 0; y < msgbuf_p_cnt; y++) {
359 for (x = 0; x < btoc(msgbuf_p_seg[y].sz); x++, sz += PAGE_SIZE)
360 pmap_kenter_pa((vaddr_t)msgbuf_vaddr + sz,
361 msgbuf_p_seg[y].paddr + x * PAGE_SIZE,
362 VM_PROT_READ|VM_PROT_WRITE, 0);
363 }
364
365 pmap_update(pmap_kernel());
366
367 initmsgbuf((void *)msgbuf_vaddr, round_page(sz));
368
369 minaddr = 0;
370
371 /*
372 * Allocate a submap for physio.
373 */
374 phys_map = uvm_km_suballoc(kernel_map, &minaddr, &maxaddr,
375 VM_PHYS_SIZE, 0, false, NULL);
376
377 /*
378 * Create the module map.
379 *
380 * The kernel uses RIP-relative addressing with a maximum offset of
381 * 2GB. The problem is, kernel_map is too far away in memory from
382 * the kernel .text. So we cannot use it, and have to create a
383 * special module_map.
384 *
385 * The module map is taken as what is left of the bootstrap memory
386 * created in locore.S. This memory is right above the kernel
387 * image, so this is the best place to put our modules.
388 */
389 uvm_map_setup(&module_map_store, module_start, module_end, 0);
390 module_map_store.pmap = pmap_kernel();
391 module_map = &module_map_store;
392
393 /* Say hello. */
394 banner();
395
396#if NISA > 0 || NPCI > 0
397 /* Safe for i/o port / memory space allocation to use malloc now. */
398 x86_bus_space_mallocok();
399#endif
400
401 gdt_init();
402 x86_64_proc0_tss_ldt_init();
403
404 cpu_init_tss(&cpu_info_primary);
405#if !defined(XEN)
406 ltr(cpu_info_primary.ci_tss_sel);
407#endif /* !defined(XEN) */
408
409 x86_startup();
410}
411
412#ifdef XEN
413/* used in assembly */
414void hypervisor_callback(void);
415void failsafe_callback(void);
416void x86_64_switch_context(struct pcb *);
417void x86_64_tls_switch(struct lwp *);
418
419void
420x86_64_switch_context(struct pcb *new)
421{
422 HYPERVISOR_stack_switch(GSEL(GDATA_SEL, SEL_KPL), new->pcb_rsp0);
423 struct physdev_op physop;
424 physop.cmd = PHYSDEVOP_SET_IOPL;
425 physop.u.set_iopl.iopl = new->pcb_iopl;
426 HYPERVISOR_physdev_op(&physop);
427}
428
429void
430x86_64_tls_switch(struct lwp *l)
431{
432 struct cpu_info *ci = curcpu();
433 struct pcb *pcb = lwp_getpcb(l);
434 struct trapframe *tf = l->l_md.md_regs;
435
436 /*
437 * Raise the IPL to IPL_HIGH.
438 * FPU IPIs can alter the LWP's saved cr0. Dropping the priority
439 * is deferred until mi_switch(), when cpu_switchto() returns.
440 */
441 (void)splhigh();
442 /*
443 * If our floating point registers are on a different CPU,
444 * set CR0_TS so we'll trap rather than reuse bogus state.
445 */
446 if (l != ci->ci_fpcurlwp) {
447 HYPERVISOR_fpu_taskswitch(1);
448 }
449
450 /* Update TLS segment pointers */
451 if (pcb->pcb_flags & PCB_COMPAT32) {
452 update_descriptor(&curcpu()->ci_gdt[GUFS_SEL], &pcb->pcb_fs);
453 update_descriptor(&curcpu()->ci_gdt[GUGS_SEL], &pcb->pcb_gs);
454 setfs(tf->tf_fs);
455 HYPERVISOR_set_segment_base(SEGBASE_GS_USER_SEL, tf->tf_gs);
456 } else {
457 setfs(0);
458 HYPERVISOR_set_segment_base(SEGBASE_GS_USER_SEL, 0);
459 HYPERVISOR_set_segment_base(SEGBASE_FS, pcb->pcb_fs);
460 HYPERVISOR_set_segment_base(SEGBASE_GS_USER, pcb->pcb_gs);
461 }
462}
463#endif /* XEN */
464
465/*
466 * Set up proc0's TSS and LDT.
467 */
468void
469x86_64_proc0_tss_ldt_init(void)
470{
471 struct lwp *l = &lwp0;
472 struct pcb *pcb = lwp_getpcb(l);
473
474 pcb->pcb_flags = 0;
475 pcb->pcb_fs = 0;
476 pcb->pcb_gs = 0;
477 pcb->pcb_rsp0 = (uvm_lwp_getuarea(l) + USPACE - 16) & ~0xf;
478 pcb->pcb_iopl = SEL_KPL;
479
480 pmap_kernel()->pm_ldt_sel = GSYSSEL(GLDT_SEL, SEL_KPL);
481 pcb->pcb_cr0 = rcr0() & ~CR0_TS;
482 l->l_md.md_regs = (struct trapframe *)pcb->pcb_rsp0 - 1;
483
484#if !defined(XEN)
485 lldt(pmap_kernel()->pm_ldt_sel);
486#else
487 {
488 struct physdev_op physop;
489 xen_set_ldt((vaddr_t) ldtstore, LDT_SIZE >> 3);
490 /* Reset TS bit and set kernel stack for interrupt handlers */
491 HYPERVISOR_fpu_taskswitch(1);
492 HYPERVISOR_stack_switch(GSEL(GDATA_SEL, SEL_KPL), pcb->pcb_rsp0);
493 physop.cmd = PHYSDEVOP_SET_IOPL;
494 physop.u.set_iopl.iopl = pcb->pcb_iopl;
495 HYPERVISOR_physdev_op(&physop);
496 }
497#endif /* XEN */
498}
499
500/*
501 * Set up TSS and I/O bitmap.
502 */
503void
504cpu_init_tss(struct cpu_info *ci)
505{
506 struct x86_64_tss *tss = &ci->ci_tss;
507 uintptr_t p;
508
509 tss->tss_iobase = IOMAP_INVALOFF << 16;
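	/*
	 * An I/O map base beyond the TSS limit means no I/O permission
	 * bitmap is in effect, so userland port access traps.
	 */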
510 /* tss->tss_ist[0] is filled by cpu_intr_init */
511
512 /* double fault */
513 tss->tss_ist[1] = (uint64_t)x86_64_doubleflt_stack + PAGE_SIZE - 16;
514
515 /* NMI */
516 p = uvm_km_alloc(kernel_map, PAGE_SIZE, 0, UVM_KMF_WIRED);
517 tss->tss_ist[2] = p + PAGE_SIZE - 16;
518 ci->ci_tss_sel = tss_alloc(tss);
519}
520
521void
522buildcontext(struct lwp *l, void *catcher, void *f)
523{
524 struct trapframe *tf = l->l_md.md_regs;
525
526 tf->tf_ds = GSEL(GUDATA_SEL, SEL_UPL);
527 tf->tf_es = GSEL(GUDATA_SEL, SEL_UPL);
528 tf->tf_fs = GSEL(GUDATA_SEL, SEL_UPL);
529 tf->tf_gs = GSEL(GUDATA_SEL, SEL_UPL);
530
531 tf->tf_rip = (uint64_t)catcher;
532 tf->tf_cs = GSEL(GUCODE_SEL, SEL_UPL);
533 tf->tf_rflags &= ~PSL_CLEARSIG;
534 tf->tf_rsp = (uint64_t)f;
535 tf->tf_ss = GSEL(GUDATA_SEL, SEL_UPL);
536
537 /* Ensure FP state is sane */
538 fpu_save_area_reset(l);
539}
540
541void
542sendsig_sigcontext(const ksiginfo_t *ksi, const sigset_t *mask)
543{
544
545 printf("sendsig_sigcontext: illegal\n");
546 sigexit(curlwp, SIGILL);
547}
548
549void
550sendsig_siginfo(const ksiginfo_t *ksi, const sigset_t *mask)
551{
552 struct lwp *l = curlwp;
553 struct proc *p = l->l_proc;
554 struct sigacts *ps = p->p_sigacts;
555 int onstack, error;
556 int sig = ksi->ksi_signo;
557 struct sigframe_siginfo *fp, frame;
558 sig_t catcher = SIGACTION(p, sig).sa_handler;
559 struct trapframe *tf = l->l_md.md_regs;
560 char *sp;
561
562 KASSERT(mutex_owned(p->p_lock));
563
564 /* Do we need to jump onto the signal stack? */
565 onstack =
566 (l->l_sigstk.ss_flags & (SS_DISABLE | SS_ONSTACK)) == 0 &&
567 (SIGACTION(p, sig).sa_flags & SA_ONSTACK) != 0;
568
569 /* Allocate space for the signal handler context. */
570 if (onstack)
571 sp = ((char *)l->l_sigstk.ss_sp + l->l_sigstk.ss_size);
572 else
573		/* AMD64 ABI 128-byte "red zone". */
574 sp = (char *)tf->tf_rsp - 128;
575
576 sp -= sizeof(struct sigframe_siginfo);
577	/* Round down the stack pointer to a multiple of 16 for the ABI. */
578 fp = (struct sigframe_siginfo *)(((unsigned long)sp & ~15) - 8);
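	/*
	 * The frame address is now congruent to 8 modulo 16, exactly as if
	 * a CALL had just pushed sf_ra, so the handler starts out with the
	 * 16-byte stack alignment the ABI expects.
	 */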
579
580 frame.sf_ra = (uint64_t)ps->sa_sigdesc[sig].sd_tramp;
581 frame.sf_si._info = ksi->ksi_info;
582 frame.sf_uc.uc_flags = _UC_SIGMASK;
583 frame.sf_uc.uc_sigmask = *mask;
584 frame.sf_uc.uc_link = l->l_ctxlink;
585 frame.sf_uc.uc_flags |= (l->l_sigstk.ss_flags & SS_ONSTACK)
586 ? _UC_SETSTACK : _UC_CLRSTACK;
587 memset(&frame.sf_uc.uc_stack, 0, sizeof(frame.sf_uc.uc_stack));
588 sendsig_reset(l, sig);
589
590 mutex_exit(p->p_lock);
591 cpu_getmcontext(l, &frame.sf_uc.uc_mcontext, &frame.sf_uc.uc_flags);
592	/* Copy out all the FP registers; the signal handler might expect them. */
593 error = copyout(&frame, fp, sizeof frame);
594 mutex_enter(p->p_lock);
595
596 if (error != 0) {
597 /*
598 * Process has trashed its stack; give it an illegal
599 * instruction to halt it in its tracks.
600 */
601 sigexit(l, SIGILL);
602 /* NOTREACHED */
603 }
604
605 buildcontext(l, catcher, fp);
606
607 tf->tf_rdi = sig;
608 tf->tf_rsi = (uint64_t)&fp->sf_si;
609 tf->tf_rdx = tf->tf_r15 = (uint64_t)&fp->sf_uc;
610
611 /* Remember that we're now on the signal stack. */
612 if (onstack)
613 l->l_sigstk.ss_flags |= SS_ONSTACK;
614
615 if ((vaddr_t)catcher >= VM_MAXUSER_ADDRESS) {
616 /*
617		 * The process has given an invalid address for the
618		 * handler. Stop it, but only now, so that the correct info
619		 * has already been returned to userland (or to the core dump).
620 */
621 sigexit(l, SIGILL);
622 /* NOTREACHED */
623 }
624}
625
626struct pcb dumppcb;
627
628void
629cpu_reboot(int howto, char *bootstr)
630{
631 static bool syncdone = false;
632 int s = IPL_NONE;
633 __USE(s); /* ugly otherwise */
634
635 if (cold) {
636 howto |= RB_HALT;
637 goto haltsys;
638 }
639
640 boothowto = howto;
641
642 /* i386 maybe_dump() */
643
644 /*
645 * If we've panic'd, don't make the situation potentially
646 * worse by syncing or unmounting the file systems.
647 */
648 if ((howto & RB_NOSYNC) == 0 && panicstr == NULL) {
649 if (!syncdone) {
650 syncdone = true;
651 /* XXX used to force unmount as well, here */
652 vfs_sync_all(curlwp);
653 /*
654 * If we've been adjusting the clock, the todr
655 * will be out of synch; adjust it now.
656 *
657 * XXX used to do this after unmounting all
658 * filesystems with vfs_shutdown().
659 */
660 if (time_adjusted != 0)
661 resettodr();
662 }
663
664 while (vfs_unmountall1(curlwp, false, false) ||
665 config_detach_all(boothowto) ||
666 vfs_unmount_forceone(curlwp))
667 ; /* do nothing */
668 } else
669 suspendsched();
670
671 pmf_system_shutdown(boothowto);
672
673 /* Disable interrupts. */
674 s = splhigh();
675
676 /* Do a dump if requested. */
677 if ((howto & (RB_DUMP | RB_HALT)) == RB_DUMP)
678 dumpsys();
679
680haltsys:
681 doshutdownhooks();
682
683 if ((howto & RB_POWERDOWN) == RB_POWERDOWN) {
684#if NACPICA > 0
685 if (s != IPL_NONE)
686 splx(s);
687
688 acpi_enter_sleep_state(ACPI_STATE_S5);
689#endif
690#ifdef XEN
691 HYPERVISOR_shutdown();
692#endif /* XEN */
693 }
694
695 cpu_broadcast_halt();
696
697 if (howto & RB_HALT) {
698#if NACPICA > 0
699 acpi_disable();
700#endif
701
702 printf("\n");
703 printf("The operating system has halted.\n");
704 printf("Please press any key to reboot.\n\n");
705 cnpollc(1); /* for proper keyboard command handling */
706 if (cngetc() == 0) {
707 /* no console attached, so just hlt */
708 printf("No keyboard - cannot reboot after all.\n");
709 for(;;) {
710 x86_hlt();
711 }
712 }
713 cnpollc(0);
714 }
715
716 printf("rebooting...\n");
717 if (cpureset_delay > 0)
718 delay(cpureset_delay * 1000);
719 cpu_reset();
720 for(;;) ;
721 /*NOTREACHED*/
722}
723
724/*
725 * XXXfvdl share dumpcode.
726 */
727
728/*
729 * Perform assorted dump-related initialization tasks. Assumes that
730 * the maximum physical memory address will not increase afterwards.
731 */
732void
733dump_misc_init(void)
734{
735#ifndef NO_SPARSE_DUMP
736 int i;
737#endif
738
739 if (dump_headerbuf != NULL)
740 return; /* already called */
741
742#ifndef NO_SPARSE_DUMP
743 for (i = 0; i < mem_cluster_cnt; ++i) {
744 paddr_t top = mem_clusters[i].start + mem_clusters[i].size;
745 if (max_paddr < top)
746 max_paddr = top;
747 }
748#ifdef DEBUG
749 printf("dump_misc_init: max_paddr = 0x%lx\n",
750 (unsigned long)max_paddr);
751#endif
752 if (max_paddr == 0) {
753 printf("Your machine does not initialize mem_clusters; "
754 "sparse_dumps disabled\n");
755 sparse_dump = 0;
756 } else {
757 sparse_dump_physmap = (void *)uvm_km_alloc(kernel_map,
758 roundup(max_paddr / (PAGE_SIZE * NBBY), PAGE_SIZE),
759 PAGE_SIZE, UVM_KMF_WIRED|UVM_KMF_ZERO);
760 }
761#endif
762 dump_headerbuf = (void *)uvm_km_alloc(kernel_map,
763 dump_headerbuf_size,
764 PAGE_SIZE, UVM_KMF_WIRED|UVM_KMF_ZERO);
765 /* XXXjld should check for failure here, disable dumps if so. */
766}
767
768#ifndef NO_SPARSE_DUMP
769/*
770 * Clear the set of pages to include in a sparse dump.
771 */
772void
773sparse_dump_reset(void)
774{
775 memset(sparse_dump_physmap, 0,
776 roundup(max_paddr / (PAGE_SIZE * NBBY), PAGE_SIZE));
777}
778
779/*
780 * Include or exclude pages in a sparse dump.
781 */
782void
783sparse_dump_mark(void)
784{
785 paddr_t p, pstart, pend;
786 struct vm_page *pg;
787 int i;
788
789 /*
790 * Mark all memory pages, then unmark pages that are uninteresting.
791	 * Dereferencing pg->uobject might crash again if another CPU
792 * frees the object out from under us, but we can't lock anything
793 * so it's a risk we have to take.
794 */
795
796 for (i = 0; i < mem_cluster_cnt; ++i) {
797 pstart = mem_clusters[i].start / PAGE_SIZE;
798 pend = pstart + mem_clusters[i].size / PAGE_SIZE;
799
800 for (p = pstart; p < pend; p++) {
801 setbit(sparse_dump_physmap, p);
802 }
803 }
804 for (i = 0; i < vm_nphysseg; i++) {
805 struct vm_physseg *seg = VM_PHYSMEM_PTR(i);
806
807 for (pg = seg->pgs; pg < seg->lastpg; pg++) {
808 if (pg->uanon || (pg->pqflags & PQ_FREE) ||
809 (pg->uobject && pg->uobject->pgops)) {
810 p = VM_PAGE_TO_PHYS(pg) / PAGE_SIZE;
811 clrbit(sparse_dump_physmap, p);
812 }
813 }
814 }
815}
816
817/*
818 * Machine-dependently decides on the contents of a sparse dump, using
819 * the above.
820 */
821void
822cpu_dump_prep_sparse(void)
823{
824 sparse_dump_reset();
825 /* XXX could the alternate recursive page table be skipped? */
826 sparse_dump_mark();
827 /* Memory for I/O buffers could be unmarked here, for example. */
828 /* The kernel text could also be unmarked, but gdb would be upset. */
829}
830#endif
831
832/*
833 * Abstractly iterate over the collection of memory segments to be
834 * dumped; the callback lacks the customary environment-pointer
835 * argument because none of the current users really need one.
836 *
837 * To be used only after dump_seg_prep is called to set things up.
838 */
839int
840dump_seg_iter(int (*callback)(paddr_t, paddr_t))
841{
842 int error, i;
843
844#define CALLBACK(start,size) do { \
845 error = callback(start,size); \
846 if (error) \
847 return error; \
848} while(0)
849
850 for (i = 0; i < mem_cluster_cnt; ++i) {
851#ifndef NO_SPARSE_DUMP
852 /*
853 * The bitmap is scanned within each memory segment,
854 * rather than over its entire domain, in case any
855 * pages outside of the memory proper have been mapped
856 * into kva; they might be devices that wouldn't
857 * appreciate being arbitrarily read, and including
858 * them could also break the assumption that a sparse
859 * dump will always be smaller than a full one.
860 */
861 if (sparse_dump && sparse_dump_physmap) {
862 paddr_t p, start, end;
863 int lastset;
864
865 start = mem_clusters[i].start;
866 end = start + mem_clusters[i].size;
867 start = rounddown(start, PAGE_SIZE); /* unnecessary? */
868 lastset = 0;
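			/*
			 * Coalesce runs of consecutively marked pages and
			 * hand each run to the callback as one segment.
			 */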
869 for (p = start; p < end; p += PAGE_SIZE) {
870 int thisset = isset(sparse_dump_physmap,
871 p/PAGE_SIZE);
872
873 if (!lastset && thisset)
874 start = p;
875 if (lastset && !thisset)
876 CALLBACK(start, p - start);
877 lastset = thisset;
878 }
879 if (lastset)
880 CALLBACK(start, p - start);
881 } else
882#endif
883 CALLBACK(mem_clusters[i].start, mem_clusters[i].size);
884 }
885 return 0;
886#undef CALLBACK
887}
888
889/*
890 * Prepare for an impending core dump: decide what's being dumped and
891 * how much space it will take up.
892 */
893void
894dump_seg_prep(void)
895{
896#ifndef NO_SPARSE_DUMP
897 if (sparse_dump && sparse_dump_physmap)
898 cpu_dump_prep_sparse();
899#endif
900
901 dump_nmemsegs = 0;
902 dump_npages = 0;
903 dump_seg_iter(dump_seg_count_range);
904
905 dump_header_size = ALIGN(sizeof(kcore_seg_t)) +
906 ALIGN(sizeof(cpu_kcore_hdr_t)) +
907 ALIGN(dump_nmemsegs * sizeof(phys_ram_seg_t));
908 dump_header_size = roundup(dump_header_size, dbtob(1));
909
910 /*
911 * savecore(8) will read this to decide how many pages to
912 * copy, and cpu_dumpconf has already used the pessimistic
913 * value to set dumplo, so it's time to tell the truth.
914 */
915 dumpsize = dump_npages; /* XXX could these just be one variable? */
916}
917
918int
919dump_seg_count_range(paddr_t start, paddr_t size)
920{
921 ++dump_nmemsegs;
922 dump_npages += size / PAGE_SIZE;
923 return 0;
924}
925
926/*
927 * A sparse dump's header may be rather large, due to the number of
928 * "segments" emitted. These routines manage a simple output buffer,
929 * so that the header can be written to disk incrementally.
930 */
931void
932dump_header_start(void)
933{
934 dump_headerbuf_ptr = dump_headerbuf;
935 dump_header_blkno = dumplo;
936}
937
938int
939dump_header_flush(void)
940{
941 const struct bdevsw *bdev;
942 size_t to_write;
943 int error;
944
945 bdev = bdevsw_lookup(dumpdev);
946 to_write = roundup(dump_headerbuf_ptr - dump_headerbuf, dbtob(1));
947 error = bdev->d_dump(dumpdev, dump_header_blkno,
948 dump_headerbuf, to_write);
949 dump_header_blkno += btodb(to_write);
950 dump_headerbuf_ptr = dump_headerbuf;
951 return error;
952}
953
954int
955dump_header_addbytes(const void* vptr, size_t n)
956{
957 const char* ptr = vptr;
958 int error;
959
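	/*
	 * While the data overflows the staging buffer, top the buffer up,
	 * flush it to the dump device and retry; the final partial chunk
	 * is only buffered.
	 */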
960 while (n > dump_headerbuf_avail) {
961 memcpy(dump_headerbuf_ptr, ptr, dump_headerbuf_avail);
962 ptr += dump_headerbuf_avail;
963 n -= dump_headerbuf_avail;
964 dump_headerbuf_ptr = dump_headerbuf_end;
965 error = dump_header_flush();
966 if (error)
967 return error;
968 }
969 memcpy(dump_headerbuf_ptr, ptr, n);
970 dump_headerbuf_ptr += n;
971
972 return 0;
973}
974
975int
976dump_header_addseg(paddr_t start, paddr_t size)
977{
978 phys_ram_seg_t seg = { start, size };
979
980 return dump_header_addbytes(&seg, sizeof(seg));
981}
982
983int
984dump_header_finish(void)
985{
986 memset(dump_headerbuf_ptr, 0, dump_headerbuf_avail);
987 return dump_header_flush();
988}
989
990
991/*
992 * These variables are needed by /sbin/savecore
993 */
994uint32_t dumpmag = 0x8fca0101; /* magic number */
995int dumpsize = 0; /* pages */
996long dumplo = 0; /* blocks */
997
998/*
999 * cpu_dumpsize: calculate size of machine-dependent kernel core dump headers
1000 * for a full (non-sparse) dump.
1001 */
1002int
1003cpu_dumpsize(void)
1004{
1005 int size;
1006
1007 size = ALIGN(sizeof(kcore_seg_t)) + ALIGN(sizeof(cpu_kcore_hdr_t)) +
1008 ALIGN(mem_cluster_cnt * sizeof(phys_ram_seg_t));
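	/* The MD headers of a full dump must fit in a single disk block. */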
1009 if (roundup(size, dbtob(1)) != dbtob(1))
1010 return (-1);
1011
1012 return (1);
1013}
1014
1015/*
1016 * cpu_dump_mempagecnt: calculate the size of RAM (in pages) to be dumped
1017 * for a full (non-sparse) dump.
1018 */
1019u_long
1020cpu_dump_mempagecnt(void)
1021{
1022 u_long i, n;
1023
1024 n = 0;
1025 for (i = 0; i < mem_cluster_cnt; i++)
1026 n += atop(mem_clusters[i].size);
1027 return (n);
1028}
1029
1030/*
1031 * cpu_dump: dump the machine-dependent kernel core dump headers.
1032 */
1033int
1034cpu_dump(void)
1035{
1036 kcore_seg_t seg;
1037 cpu_kcore_hdr_t cpuhdr;
1038 const struct bdevsw *bdev;
1039
1040 bdev = bdevsw_lookup(dumpdev);
1041 if (bdev == NULL)
1042 return (ENXIO);
1043
1044 /*
1045 * Generate a segment header.
1046 */
1047 CORE_SETMAGIC(seg, KCORE_MAGIC, MID_MACHINE, CORE_CPU);
1048 seg.c_size = dump_header_size - ALIGN(sizeof(seg));
1049 (void)dump_header_addbytes(&seg, ALIGN(sizeof(seg)));
1050
1051 /*
1052 * Add the machine-dependent header info.
1053 */
1054 cpuhdr.ptdpaddr = PDPpaddr;
1055 cpuhdr.nmemsegs = dump_nmemsegs;
1056 (void)dump_header_addbytes(&cpuhdr, ALIGN(sizeof(cpuhdr)));
1057
1058 /*
1059 * Write out the memory segment descriptors.
1060 */
1061 return dump_seg_iter(dump_header_addseg);
1062}
1063
1064/*
1065 * Doadump comes here after turning off memory management and
1066 * getting on the dump stack, either when called above, or by
1067 * the auto-restart code.
1068 */
1069#define BYTES_PER_DUMP PAGE_SIZE /* must be a multiple of pagesize XXX small */
1070static vaddr_t dumpspace;
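/*
 * dumpspace is a single BYTES_PER_DUMP-sized VA window set aside by
 * reserve_dumppages(); dumpsys_seg() maps each physical chunk into it
 * before writing the chunk to the dump device.
 */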
1071
1072vaddr_t
1073reserve_dumppages(vaddr_t p)
1074{
1075
1076 dumpspace = p;
1077 return (p + BYTES_PER_DUMP);
1078}
1079
1080int
1081dumpsys_seg(paddr_t maddr, paddr_t bytes)
1082{
1083 u_long i, m, n;
1084 daddr_t blkno;
1085 const struct bdevsw *bdev;
1086 int (*dump)(dev_t, daddr_t, void *, size_t);
1087 int error;
1088
1089 if (dumpdev == NODEV)
1090 return ENODEV;
1091 bdev = bdevsw_lookup(dumpdev);
1092 if (bdev == NULL || bdev->d_psize == NULL)
1093 return ENODEV;
1094
1095 dump = bdev->d_dump;
1096
1097 blkno = dump_header_blkno;
1098 for (i = 0; i < bytes; i += n, dump_totalbytesleft -= n) {
1099 /* Print out how many MBs we have left to go. */
1100 if ((dump_totalbytesleft % (1024*1024)) == 0)
1101 printf_nolog("%lu ", (unsigned long)
1102 (dump_totalbytesleft / (1024 * 1024)));
1103
1104 /* Limit size for next transfer. */
1105 n = bytes - i;
1106 if (n > BYTES_PER_DUMP)
1107 n = BYTES_PER_DUMP;
1108
1109 for (m = 0; m < n; m += NBPG)
1110 pmap_kenter_pa(dumpspace + m, maddr + m,
1111 VM_PROT_READ, 0);
1112 pmap_update(pmap_kernel());
1113
1114 error = (*dump)(dumpdev, blkno, (void *)dumpspace, n);
1115 pmap_kremove_local(dumpspace, n);
1116 if (error)
1117 return error;
1118 maddr += n;
1119 blkno += btodb(n); /* XXX? */
1120
1121#if 0 /* XXX this doesn't work. grr. */
1122 /* operator aborting dump? */
1123 if (sget() != NULL)
1124 return EINTR;
1125#endif
1126 }
1127 dump_header_blkno = blkno;
1128
1129 return 0;
1130}
1131
1132void
1133dodumpsys(void)
1134{
1135 const struct bdevsw *bdev;
1136 int dumpend, psize;
1137 int error;
1138
1139 if (dumpdev == NODEV)
1140 return;
1141
1142 bdev = bdevsw_lookup(dumpdev);
1143 if (bdev == NULL || bdev->d_psize == NULL)
1144 return;
1145 /*
1146	 * For dumps during autoconfiguration: if the dump device has
1147	 * already been configured...
1148 */
1149 if (dumpsize == 0)
1150 cpu_dumpconf();
1151
1152 printf("\ndumping to dev %llu,%llu (offset=%ld, size=%d):",
1153 (unsigned long long)major(dumpdev),
1154 (unsigned long long)minor(dumpdev), dumplo, dumpsize);
1155
1156 if (dumplo <= 0 || dumpsize <= 0) {
1157 printf(" not possible\n");
1158 return;
1159 }
1160
1161 psize = bdev_size(dumpdev);
1162 printf("\ndump ");
1163 if (psize == -1) {
1164 printf("area unavailable\n");
1165 return;
1166 }
1167
1168#if 0 /* XXX this doesn't work. grr. */
1169 /* toss any characters present prior to dump */
1170 while (sget() != NULL); /*syscons and pccons differ */
1171#endif
1172
1173 dump_seg_prep();
1174 dumpend = dumplo + btodb(dump_header_size) + ctod(dump_npages);
1175 if (dumpend > psize) {
1176 printf("failed: insufficient space (%d < %d)\n",
1177 psize, dumpend);
1178 goto failed;
1179 }
1180
1181 dump_header_start();
1182 if ((error = cpu_dump()) != 0)
1183 goto err;
1184 if ((error = dump_header_finish()) != 0)
1185 goto err;
1186
1187 if (dump_header_blkno != dumplo + btodb(dump_header_size)) {
1188 printf("BAD header size (%ld [written] != %ld [expected])\n",
1189 (long)(dump_header_blkno - dumplo),
1190 (long)btodb(dump_header_size));
1191 goto failed;
1192 }
1193
1194 dump_totalbytesleft = roundup(ptoa(dump_npages), BYTES_PER_DUMP);
1195 error = dump_seg_iter(dumpsys_seg);
1196
1197 if (error == 0 && dump_header_blkno != dumpend) {
1198 printf("BAD dump size (%ld [written] != %ld [expected])\n",
1199 (long)(dumpend - dumplo),
1200 (long)(dump_header_blkno - dumplo));
1201 goto failed;
1202 }
1203
1204err:
1205 switch (error) {
1206
1207 case ENXIO:
1208 printf("device bad\n");
1209 break;
1210
1211 case EFAULT:
1212 printf("device not ready\n");
1213 break;
1214
1215 case EINVAL:
1216 printf("area improper\n");
1217 break;
1218
1219 case EIO:
1220 printf("i/o error\n");
1221 break;
1222
1223 case EINTR:
1224 printf("aborted from console\n");
1225 break;
1226
1227 case 0:
1228 printf("succeeded\n");
1229 break;
1230
1231 default:
1232 printf("error %d\n", error);
1233 break;
1234 }
1235failed:
1236 printf("\n\n");
1237 delay(5000000); /* 5 seconds */
1238}
1239
1240/*
1241 * This is called by main to set dumplo and dumpsize.
1242 * Dumps always skip the first PAGE_SIZE of disk space
1243 * in case there might be a disk label stored there.
1244 * If there is extra space, put dump at the end to
1245 * reduce the chance that swapping trashes it.
1246 *
1247 * Sparse dumps can't be placed as close to the end as possible, because
1248 * savecore(8) has to know where to start reading in the dump device
1249 * before it has access to any of the crashed system's state.
1250 *
1251 * Note also that a sparse dump will never be larger than a full one:
1252 * in order to add a phys_ram_seg_t to the header, at least one page
1253 * must be removed.
1254 */
1255void
1256cpu_dumpconf(void)
1257{
1258 int nblks, dumpblks; /* size of dump area */
1259
1260 if (dumpdev == NODEV)
1261 goto bad;
1262 nblks = bdev_size(dumpdev);
1263 if (nblks <= ctod(1))
1264 goto bad;
1265
1266 dumpblks = cpu_dumpsize();
1267 if (dumpblks < 0)
1268 goto bad;
1269
1270 /* dumpsize is in page units, and doesn't include headers. */
1271 dumpsize = cpu_dump_mempagecnt();
1272
1273 dumpblks += ctod(dumpsize);
1274
1275 /* If dump won't fit (incl. room for possible label), punt. */
1276 if (dumpblks > (nblks - ctod(1))) {
1277#ifndef NO_SPARSE_DUMP
1278 /* A sparse dump might (and hopefully will) fit. */
1279 dumplo = ctod(1);
1280#else
1281 /* But if we're not configured for that, punt. */
1282 goto bad;
1283#endif
1284 } else {
1285 /* Put dump at end of partition */
1286 dumplo = nblks - dumpblks;
1287 }
1288
1289
1290 /* Now that we've decided this will work, init ancillary stuff. */
1291 dump_misc_init();
1292 return;
1293
1294 bad:
1295 dumpsize = 0;
1296}
1297
1298/*
1299 * Clear registers on exec
1300 */
1301void
1302setregs(struct lwp *l, struct exec_package *pack, vaddr_t stack)
1303{
1304 struct pcb *pcb = lwp_getpcb(l);
1305 struct trapframe *tf;
1306
1307#ifdef USER_LDT
1308 pmap_ldt_cleanup(l);
1309#endif
1310
1311 fpu_save_area_clear(l, pack->ep_osversion >= 699002600
1312 ? __NetBSD_NPXCW__ : __NetBSD_COMPAT_NPXCW__);
1313 pcb->pcb_flags = 0;
1314
1315 l->l_proc->p_flag &= ~PK_32;
1316
1317 tf = l->l_md.md_regs;
1318 tf->tf_ds = LSEL(LUDATA_SEL, SEL_UPL);
1319 tf->tf_es = LSEL(LUDATA_SEL, SEL_UPL);
1320 cpu_fsgs_zero(l);
1321 tf->tf_rdi = 0;
1322 tf->tf_rsi = 0;
1323 tf->tf_rbp = 0;
1324 tf->tf_rbx = l->l_proc->p_psstrp;
1325 tf->tf_rdx = 0;
1326 tf->tf_rcx = 0;
1327 tf->tf_rax = 0;
1328 tf->tf_rip = pack->ep_entry;
1329 tf->tf_cs = LSEL(LUCODE_SEL, SEL_UPL);
1330 tf->tf_rflags = PSL_USERSET;
1331 tf->tf_rsp = stack;
1332 tf->tf_ss = LSEL(LUDATA_SEL, SEL_UPL);
1333}
1334
1335/*
1336 * Initialize segments and descriptor tables
1337 */
1338
1339#ifdef XEN
1340struct trap_info *xen_idt;
1341int xen_idt_idx;
1342#endif
1343char *ldtstore;
1344char *gdtstore;
1345
1346void
1347setgate(struct gate_descriptor *gd, void *func, int ist, int type, int dpl, int sel)
1348{
1349
1350 kpreempt_disable();
1351 pmap_changeprot_local(idt_vaddr, VM_PROT_READ|VM_PROT_WRITE);
1352
1353 gd->gd_looffset = (uint64_t)func & 0xffff;
1354 gd->gd_selector = sel;
1355 gd->gd_ist = ist;
1356 gd->gd_type = type;
1357 gd->gd_dpl = dpl;
1358 gd->gd_p = 1;
1359 gd->gd_hioffset = (uint64_t)func >> 16;
1360 gd->gd_zero = 0;
1361 gd->gd_xx1 = 0;
1362 gd->gd_xx2 = 0;
1363 gd->gd_xx3 = 0;
1364
1365 pmap_changeprot_local(idt_vaddr, VM_PROT_READ);
1366 kpreempt_enable();
1367}
1368
1369void
1370unsetgate(struct gate_descriptor *gd)
1371{
1372
1373 kpreempt_disable();
1374 pmap_changeprot_local(idt_vaddr, VM_PROT_READ|VM_PROT_WRITE);
1375
1376 memset(gd, 0, sizeof (*gd));
1377
1378 pmap_changeprot_local(idt_vaddr, VM_PROT_READ);
1379 kpreempt_enable();
1380}
1381
1382void
1383setregion(struct region_descriptor *rd, void *base, uint16_t limit)
1384{
1385 rd->rd_limit = limit;
1386 rd->rd_base = (uint64_t)base;
1387}
1388
1389/*
1390 * Note that the base and limit fields are ignored in long mode.
1391 */
1392void
1393set_mem_segment(struct mem_segment_descriptor *sd, void *base, size_t limit,
1394 int type, int dpl, int gran, int def32, int is64)
1395{
1396 sd->sd_lolimit = (unsigned)limit;
1397 sd->sd_lobase = (unsigned long)base;
1398 sd->sd_type = type;
1399 sd->sd_dpl = dpl;
1400 sd->sd_p = 1;
1401 sd->sd_hilimit = (unsigned)limit >> 16;
1402 sd->sd_avl = 0;
1403 sd->sd_long = is64;
1404 sd->sd_def32 = def32;
1405 sd->sd_gran = gran;
1406 sd->sd_hibase = (unsigned long)base >> 24;
1407}
1408
1409void
1410set_sys_segment(struct sys_segment_descriptor *sd, void *base, size_t limit,
1411 int type, int dpl, int gran)
1412{
1413 memset(sd, 0, sizeof *sd);
1414 sd->sd_lolimit = (unsigned)limit;
1415 sd->sd_lobase = (uint64_t)base;
1416 sd->sd_type = type;
1417 sd->sd_dpl = dpl;
1418 sd->sd_p = 1;
1419 sd->sd_hilimit = (unsigned)limit >> 16;
1420 sd->sd_gran = gran;
1421 sd->sd_hibase = (uint64_t)base >> 24;
1422}
1423
1424void
1425cpu_init_idt(void)
1426{
1427#ifndef XEN
1428 struct region_descriptor region;
1429
1430 setregion(&region, idt, NIDT * sizeof(idt[0]) - 1);
1431 lidt(&region);
1432#else
1433 if (HYPERVISOR_set_trap_table(xen_idt))
1434 panic("HYPERVISOR_set_trap_table() failed");
1435#endif
1436}
1437
1438#define IDTVEC(name) __CONCAT(X, name)
1439typedef void (vector)(void);
1440extern vector IDTVEC(syscall);
1441extern vector IDTVEC(syscall32);
1442extern vector IDTVEC(osyscall);
1443extern vector IDTVEC(oosyscall);
1444extern vector *IDTVEC(exceptions)[];
1445
1446static void
1447init_x86_64_msgbuf(void)
1448{
1449 /* Message buffer is located at end of core. */
1450 struct vm_physseg *vps;
1451 psize_t sz = round_page(MSGBUFSIZE);
1452 psize_t reqsz = sz;
1453 int x;
1454
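	/*
	 * Steal pages for the message buffer from the top of the highest
	 * physical segment.  If that segment is too small, take what it
	 * has and loop to carve the remainder from the next-highest one,
	 * until the requested size is reserved or msgbuf_p_seg[] is full.
	 */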
1455 search_again:
1456 vps = NULL;
1457
1458 for (x = 0; x < vm_nphysseg; x++) {
1459 vps = VM_PHYSMEM_PTR(x);
1460 if (ctob(vps->avail_end) == avail_end)
1461 break;
1462 }
1463 if (x == vm_nphysseg)
1464 panic("init_x86_64: can't find end of memory");
1465
1466 /* Shrink so it'll fit in the last segment. */
1467 if ((vps->avail_end - vps->avail_start) < atop(sz))
1468 sz = ctob(vps->avail_end - vps->avail_start);
1469
1470 vps->avail_end -= atop(sz);
1471 vps->end -= atop(sz);
1472 msgbuf_p_seg[msgbuf_p_cnt].sz = sz;
1473 msgbuf_p_seg[msgbuf_p_cnt++].paddr = ctob(vps->avail_end);
1474
1475 /* Remove the last segment if it now has no pages. */
1476 if (vps->start == vps->end) {
1477 for (vm_nphysseg--; x < vm_nphysseg; x++)
1478 VM_PHYSMEM_PTR_SWAP(x, x + 1);
1479 }
1480
1481 /* Now find where the new avail_end is. */
1482 for (avail_end = 0, x = 0; x < vm_nphysseg; x++)
1483 if (VM_PHYSMEM_PTR(x)->avail_end > avail_end)
1484 avail_end = VM_PHYSMEM_PTR(x)->avail_end;
1485 avail_end = ctob(avail_end);
1486
1487 if (sz == reqsz)
1488 return;
1489
1490 reqsz -= sz;
1491 if (msgbuf_p_cnt == VM_PHYSSEG_MAX) {
1492 /* No more segments available, bail out. */
1493 printf("WARNING: MSGBUFSIZE (%zu) too large, using %zu.\n",
1494 (size_t)MSGBUFSIZE, (size_t)(MSGBUFSIZE - reqsz));
1495 return;
1496 }
1497
1498 sz = reqsz;
1499 goto search_again;
1500}
1501
1502static void
1503init_x86_64_ksyms(void)
1504{
1505#if NKSYMS || defined(DDB) || defined(MODULAR)
1506 extern int end;
1507 extern int *esym;
1508#ifndef XEN
1509 struct btinfo_symtab *symtab;
1510 vaddr_t tssym, tesym;
1511#endif
1512
1513#ifdef DDB
1514 db_machine_init();
1515#endif
1516
1517#ifndef XEN
1518 symtab = lookup_bootinfo(BTINFO_SYMTAB);
1519 if (symtab) {
1520 tssym = (vaddr_t)symtab->ssym + KERNBASE;
1521 tesym = (vaddr_t)symtab->esym + KERNBASE;
1522 ksyms_addsyms_elf(symtab->nsym, (void *)tssym, (void *)tesym);
1523 } else
1524 ksyms_addsyms_elf(*(long *)(void *)&end,
1525 ((long *)(void *)&end) + 1, esym);
1526#else /* XEN */
1527 esym = xen_start_info.mod_start ?
1528 (void *)xen_start_info.mod_start :
1529 (void *)xen_start_info.mfn_list;
1530 ksyms_addsyms_elf(*(int *)(void *)&end,
1531 ((int *)(void *)&end) + 1, esym);
1532#endif /* XEN */
1533#endif
1534}
1535
1536void
1537init_x86_64(paddr_t first_avail)
1538{
1539 extern void consinit(void);
1540 struct region_descriptor region;
1541 struct mem_segment_descriptor *ldt_segp;
1542 int x;
1543#ifndef XEN
1544 int ist;
1545#endif
1546
1547 KASSERT(first_avail % PAGE_SIZE == 0);
1548
1549#ifdef XEN
1550 KASSERT(HYPERVISOR_shared_info != NULL);
1551 cpu_info_primary.ci_vcpu = &HYPERVISOR_shared_info->vcpu_info[0];
1552
1553 __PRINTK(("init_x86_64(0x%lx)\n", first_avail));
1554#endif /* XEN */
1555
1556 cpu_probe(&cpu_info_primary);
1557 cpu_init_msrs(&cpu_info_primary, true);
1558
1559 use_pae = 1; /* PAE always enabled in long mode */
1560
1561#ifdef XEN
1562 struct pcb *pcb = lwp_getpcb(&lwp0);
1563 mutex_init(&pte_lock, MUTEX_DEFAULT, IPL_VM);
1564 pcb->pcb_cr3 = xen_start_info.pt_base - KERNBASE;
1565 __PRINTK(("pcb_cr3 0x%lx\n", xen_start_info.pt_base - KERNBASE));
1566#endif
1567
1568#if NISA > 0 || NPCI > 0
1569 x86_bus_space_init();
1570#endif
1571
1572 consinit(); /* XXX SHOULD NOT BE DONE HERE */
1573
1574 /*
1575 * Initialize PAGE_SIZE-dependent variables.
1576 */
1577 uvm_setpagesize();
1578
1579 uvmexp.ncolors = 2;
1580
1581#ifndef XEN
1582 /*
1583 * Low memory reservations:
1584 * Page 0: BIOS data
1585 * Page 1: BIOS callback (not used yet, for symmetry with i386)
1586 * Page 2: MP bootstrap code (MP_TRAMPOLINE)
1587 * Page 3: ACPI wakeup code (ACPI_WAKEUP_ADDR)
1588 * Page 4: Temporary page table for 0MB-4MB
1589 * Page 5: Temporary page directory
1590 * Page 6: Temporary page map level 3
1591 * Page 7: Temporary page map level 4
1592 */
1593 avail_start = 8 * PAGE_SIZE;
1594
1595	/* Initialize the memory clusters (needed in pmap_bootstrap). */
1596 init_x86_clusters();
1597#else /* XEN */
1598 /* Parse Xen command line (replace bootinfo) */
1599 xen_parse_cmdline(XEN_PARSE_BOOTFLAGS, NULL);
1600
1601 /* Determine physical address space */
1602 avail_start = first_avail;
1603 avail_end = ctob(xen_start_info.nr_pages);
1604 pmap_pa_start = (KERNTEXTOFF - KERNBASE);
1605 pmap_pa_end = avail_end;
1606 __PRINTK(("pmap_pa_start 0x%lx avail_start 0x%lx avail_end 0x%lx\n",
1607 pmap_pa_start, avail_start, avail_end));
1608#endif /* !XEN */
1609
1610 /* End of the virtual space we have created so far. */
1611 kern_end = KERNBASE + first_avail;
1612
1613 /*
1614 * Call pmap initialization to make new kernel address space.
1615 * We must do this before loading pages into the VM system.
1616 */
1617 pmap_bootstrap(VM_MIN_KERNEL_ADDRESS);
1618
1619#ifndef XEN
1620 /* Internalize the physical pages into the VM system. */
1621 init_x86_vm(first_avail);
1622#else /* XEN */
1623 physmem = xen_start_info.nr_pages;
1624
1625 uvm_page_physload(atop(avail_start),
1626 atop(avail_end), atop(avail_start),
1627 atop(avail_end), VM_FREELIST_DEFAULT);
1628#endif /* !XEN */
1629
1630 init_x86_64_msgbuf();
1631
1632 pmap_growkernel(VM_MIN_KERNEL_ADDRESS + 32 * 1024 * 1024);
1633
1634 kpreempt_disable();
1635
1636 pmap_kenter_pa(idt_vaddr, idt_paddr, VM_PROT_READ|VM_PROT_WRITE, 0);
1637 pmap_kenter_pa(gdt_vaddr, gdt_paddr, VM_PROT_READ|VM_PROT_WRITE, 0);
1638 pmap_kenter_pa(ldt_vaddr, ldt_paddr, VM_PROT_READ|VM_PROT_WRITE, 0);
1639 pmap_update(pmap_kernel());
1640 memset((void *)idt_vaddr, 0, PAGE_SIZE);
1641 memset((void *)gdt_vaddr, 0, PAGE_SIZE);
1642 memset((void *)ldt_vaddr, 0, PAGE_SIZE);
1643
1644#ifndef XEN
1645 pmap_changeprot_local(idt_vaddr, VM_PROT_READ);
1646#endif
1647
1648 pmap_update(pmap_kernel());
1649
1650#ifndef XEN
1651 idt = (struct gate_descriptor *)idt_vaddr;
1652#else
1653 xen_idt = (struct trap_info *)idt_vaddr;
1654 xen_idt_idx = 0;
1655#endif
1656 gdtstore = (char *)gdt_vaddr;
1657 ldtstore = (char *)ldt_vaddr;
1658
1659 /*
1660 * Make GDT gates and memory segments.
1661 */
1662 set_mem_segment(GDT_ADDR_MEM(gdtstore, GCODE_SEL), 0,
1663 0xfffff, SDT_MEMERA, SEL_KPL, 1, 0, 1);
1664
1665 set_mem_segment(GDT_ADDR_MEM(gdtstore, GDATA_SEL), 0,
1666 0xfffff, SDT_MEMRWA, SEL_KPL, 1, 0, 1);
1667
1668 set_mem_segment(GDT_ADDR_MEM(gdtstore, GUCODE_SEL), 0,
1669 x86_btop(VM_MAXUSER_ADDRESS) - 1, SDT_MEMERA, SEL_UPL, 1, 0, 1);
1670
1671 set_mem_segment(GDT_ADDR_MEM(gdtstore, GUDATA_SEL), 0,
1672 x86_btop(VM_MAXUSER_ADDRESS) - 1, SDT_MEMRWA, SEL_UPL, 1, 0, 1);
1673
1674#ifndef XEN
1675 set_sys_segment(GDT_ADDR_SYS(gdtstore, GLDT_SEL), ldtstore,
1676 LDT_SIZE - 1, SDT_SYSLDT, SEL_KPL, 0);
1677#endif
1678
1679 /*
1680 * Make LDT gates and memory segments.
1681 */
1682 setgate((struct gate_descriptor *)(ldtstore + LSYS5CALLS_SEL),
1683 &IDTVEC(oosyscall), 0, SDT_SYS386CGT, SEL_UPL,
1684 GSEL(GCODE_SEL, SEL_KPL));
1685 *(struct mem_segment_descriptor *)(ldtstore + LUCODE_SEL) =
1686 *GDT_ADDR_MEM(gdtstore, GUCODE_SEL);
1687 *(struct mem_segment_descriptor *)(ldtstore + LUDATA_SEL) =
1688 *GDT_ADDR_MEM(gdtstore, GUDATA_SEL);
1689
1690 /*
1691 * 32 bit GDT entries.
1692 */
1693 set_mem_segment(GDT_ADDR_MEM(gdtstore, GUCODE32_SEL), 0,
1694 x86_btop(VM_MAXUSER_ADDRESS32) - 1, SDT_MEMERA, SEL_UPL, 1, 1, 0);
1695
1696 set_mem_segment(GDT_ADDR_MEM(gdtstore, GUDATA32_SEL), 0,
1697 x86_btop(VM_MAXUSER_ADDRESS32) - 1, SDT_MEMRWA, SEL_UPL, 1, 1, 0);
1698
1699 set_mem_segment(GDT_ADDR_MEM(gdtstore, GUFS_SEL), 0,
1700 x86_btop(VM_MAXUSER_ADDRESS32) - 1, SDT_MEMRWA, SEL_UPL, 1, 1, 0);
1701
1702 set_mem_segment(GDT_ADDR_MEM(gdtstore, GUGS_SEL), 0,
1703 x86_btop(VM_MAXUSER_ADDRESS32) - 1, SDT_MEMRWA, SEL_UPL, 1, 1, 0);
1704
1705 /*
1706 * 32 bit LDT entries.
1707 */
1708 ldt_segp = (struct mem_segment_descriptor *)(ldtstore + LUCODE32_SEL);
1709 set_mem_segment(ldt_segp, 0, x86_btop(VM_MAXUSER_ADDRESS32) - 1,
1710 SDT_MEMERA, SEL_UPL, 1, 1, 0);
1711 ldt_segp = (struct mem_segment_descriptor *)(ldtstore + LUDATA32_SEL);
1712 set_mem_segment(ldt_segp, 0, x86_btop(VM_MAXUSER_ADDRESS32) - 1,
1713 SDT_MEMRWA, SEL_UPL, 1, 1, 0);
1714
1715 /*
1716 * Other LDT entries.
1717 */
1718 memcpy((struct gate_descriptor *)(ldtstore + LSOL26CALLS_SEL),
1719 (struct gate_descriptor *)(ldtstore + LSYS5CALLS_SEL),
1720 sizeof (struct gate_descriptor));
1721 memcpy((struct gate_descriptor *)(ldtstore + LBSDICALLS_SEL),
1722 (struct gate_descriptor *)(ldtstore + LSYS5CALLS_SEL),
1723 sizeof (struct gate_descriptor));
1724
1725 /* CPU-specific IDT exceptions. */
1726 for (x = 0; x < NCPUIDT; x++) {
1727#ifndef XEN
1728 idt_vec_reserve(x);
1729 switch (x) {
1730 case 2: /* NMI */
1731 ist = 3;
1732 break;
1733 case 8: /* double fault */
1734 ist = 2;
1735 break;
1736 default:
1737 ist = 0;
1738 break;
1739 }
1740 setgate(&idt[x], IDTVEC(exceptions)[x], ist, SDT_SYS386IGT,
1741 (x == 3 || x == 4) ? SEL_UPL : SEL_KPL,
1742 GSEL(GCODE_SEL, SEL_KPL));
1743#else /* XEN */
1744 pmap_changeprot_local(idt_vaddr, VM_PROT_READ|VM_PROT_WRITE);
1745 xen_idt[xen_idt_idx].vector = x;
1746
1747 switch (x) {
1748 case 2: /* NMI */
1749 case 18: /* MCA */
1750 TI_SET_IF(&(xen_idt[xen_idt_idx]), 2);
1751 break;
1752 case 3:
1753 case 4:
1754 xen_idt[xen_idt_idx].flags = SEL_UPL;
1755 break;
1756 default:
1757 xen_idt[xen_idt_idx].flags = SEL_KPL;
1758 break;
1759 }
1760
1761 xen_idt[xen_idt_idx].cs = GSEL(GCODE_SEL, SEL_KPL);
1762 xen_idt[xen_idt_idx].address =
1763 (unsigned long)IDTVEC(exceptions)[x];
1764 xen_idt_idx++;
1765#endif /* XEN */
1766 }
1767
1768 /* new-style interrupt gate for syscalls */
1769#ifndef XEN
1770 idt_vec_reserve(128);
1771 setgate(&idt[128], &IDTVEC(osyscall), 0, SDT_SYS386IGT, SEL_UPL,
1772 GSEL(GCODE_SEL, SEL_KPL));
1773#else
1774 xen_idt[xen_idt_idx].vector = 128;
1775 xen_idt[xen_idt_idx].flags = SEL_KPL;
1776 xen_idt[xen_idt_idx].cs = GSEL(GCODE_SEL, SEL_KPL);
1777 xen_idt[xen_idt_idx].address = (unsigned long) &IDTVEC(osyscall);
1778 xen_idt_idx++;
1779 pmap_changeprot_local(idt_vaddr, VM_PROT_READ);
1780#endif /* XEN */
1781 kpreempt_enable();
1782
1783 setregion(&region, gdtstore, DYNSEL_START - 1);
1784 lgdt(&region);
1785
1786#ifdef XEN
1787 /* Init Xen callbacks and syscall handlers */
1788 if (HYPERVISOR_set_callbacks(
1789 (unsigned long) hypervisor_callback,
1790 (unsigned long) failsafe_callback,
1791 (unsigned long) Xsyscall))
1792 panic("HYPERVISOR_set_callbacks() failed");
1793#endif /* XEN */
1794 cpu_init_idt();
1795
1796 init_x86_64_ksyms();
1797
1798#ifndef XEN
1799 intr_default_setup();
1800#else
1801 events_default_setup();
1802#endif
1803
1804 splraise(IPL_HIGH);
1805 x86_enable_intr();
1806
1807#ifdef DDB
1808 if (boothowto & RB_KDB)
1809 Debugger();
1810#endif
1811#ifdef KGDB
1812 kgdb_port_init();
1813 if (boothowto & RB_KDB) {
1814 kgdb_debug_init = 1;
1815 kgdb_connect(1);
1816 }
1817#endif
1818}
1819
1820void
1821cpu_reset(void)
1822{
1823 x86_disable_intr();
1824
1825#ifdef XEN
1826 HYPERVISOR_reboot();
1827#else
1828
1829 x86_reset();
1830
1831 /*
1832 * Try to cause a triple fault and watchdog reset by making the IDT
1833 * invalid and causing a fault.
1834 */
1835 kpreempt_disable();
1836 pmap_changeprot_local(idt_vaddr, VM_PROT_READ|VM_PROT_WRITE);
1837 memset((void *)idt, 0, NIDT * sizeof(idt[0]));
1838 kpreempt_enable();
1839 breakpoint();
1840
1841#if 0
1842 /*
1843 * Try to cause a triple fault and watchdog reset by unmapping the
1844 * entire address space and doing a TLB flush.
1845 */
1846 memset((void *)PTD, 0, PAGE_SIZE);
1847 tlbflush();
1848#endif
1849#endif /* XEN */
1850
1851 for (;;);
1852}
1853
1854void
1855cpu_getmcontext(struct lwp *l, mcontext_t *mcp, unsigned int *flags)
1856{
1857 const struct trapframe *tf = l->l_md.md_regs;
1858 __greg_t ras_rip;
1859
1860 /* Copy general registers member by member */
1861#define copy_from_tf(reg, REG, idx) mcp->__gregs[_REG_##REG] = tf->tf_##reg;
1862 _FRAME_GREG(copy_from_tf)
1863#undef copy_from_tf
1864
1865 if ((ras_rip = (__greg_t)ras_lookup(l->l_proc,
1866 (void *) mcp->__gregs[_REG_RIP])) != -1)
1867 mcp->__gregs[_REG_RIP] = ras_rip;
1868
1869 *flags |= _UC_CPU;
1870
1871 mcp->_mc_tlsbase = (uintptr_t)l->l_private;
1872 *flags |= _UC_TLSBASE;
1873
1874 process_read_fpregs_xmm(l, (struct fxsave *)&mcp->__fpregs);
1875 *flags |= _UC_FPU;
1876}
1877
1878int
1879cpu_setmcontext(struct lwp *l, const mcontext_t *mcp, unsigned int flags)
1880{
1881 struct trapframe *tf = l->l_md.md_regs;
1882 const __greg_t *gr = mcp->__gregs;
1883 struct proc *p = l->l_proc;
1884 int error;
1885 int err, trapno;
1886 int64_t rflags;
1887
1888 CTASSERT(sizeof (mcontext_t) == 26 * 8 + 8 + 512);
1889
1890 if ((flags & _UC_CPU) != 0) {
1891 error = cpu_mcontext_validate(l, mcp);
1892 if (error != 0)
1893 return error;
1894 /*
1895 * save and restore some values we don't want to change.
1896 * _FRAME_GREG(copy_to_tf) below overwrites them.
1897 *
1898 * XXX maybe inline this.
1899 */
1900 rflags = tf->tf_rflags;
1901 err = tf->tf_err;
1902 trapno = tf->tf_trapno;
1903
1904 /* Copy general registers member by member */
1905#define copy_to_tf(reg, REG, idx) tf->tf_##reg = gr[_REG_##REG];
1906 _FRAME_GREG(copy_to_tf)
1907#undef copy_to_tf
1908
1909#ifdef XEN
1910 /*
1911 * Xen has its own way of dealing with %cs and %ss,
1912 * reset it to proper values.
1913 */
1914 tf->tf_ss = GSEL(GUDATA_SEL, SEL_UPL);
1915 tf->tf_cs = GSEL(GUCODE_SEL, SEL_UPL);
1916#endif
1917 rflags &= ~PSL_USER;
1918 tf->tf_rflags = rflags | (gr[_REG_RFLAGS] & PSL_USER);
1919 tf->tf_err = err;
1920 tf->tf_trapno = trapno;
1921
1922 l->l_md.md_flags |= MDL_IRET;
1923 }
1924
1925 if ((flags & _UC_FPU) != 0)
1926 process_write_fpregs_xmm(l, (const struct fxsave *)&mcp->__fpregs);
1927
1928 if ((flags & _UC_TLSBASE) != 0)
1929 lwp_setprivate(l, (void *)(uintptr_t)mcp->_mc_tlsbase);
1930
1931 mutex_enter(p->p_lock);
1932 if (flags & _UC_SETSTACK)
1933 l->l_sigstk.ss_flags |= SS_ONSTACK;
1934 if (flags & _UC_CLRSTACK)
1935 l->l_sigstk.ss_flags &= ~SS_ONSTACK;
1936 mutex_exit(p->p_lock);
1937
1938 return 0;
1939}
1940
1941int
1942cpu_mcontext_validate(struct lwp *l, const mcontext_t *mcp)
1943{
1944 const __greg_t *gr;
1945 uint16_t sel;
1946 int error;
1947 struct pmap *pmap = l->l_proc->p_vmspace->vm_map.pmap;
1948 struct proc *p = l->l_proc;
1949 struct trapframe *tf = l->l_md.md_regs;
1950
1951 gr = mcp->__gregs;
1952
1953 if (((gr[_REG_RFLAGS] ^ tf->tf_rflags) & PSL_USERSTATIC) != 0)
1954 return EINVAL;
1955
1956 if (__predict_false(pmap->pm_ldt != NULL)) {
1957 error = valid_user_selector(l, gr[_REG_ES]);
1958 if (error != 0)
1959 return error;
1960
1961 error = valid_user_selector(l, gr[_REG_FS]);
1962 if (error != 0)
1963 return error;
1964
1965 error = valid_user_selector(l, gr[_REG_GS]);
1966 if (error != 0)
1967 return error;
1968
1969 if ((gr[_REG_DS] & 0xffff) == 0)
1970 return EINVAL;
1971 error = valid_user_selector(l, gr[_REG_DS]);
1972 if (error != 0)
1973 return error;
1974
1975#ifndef XEN
1976 if ((gr[_REG_SS] & 0xffff) == 0)
1977 return EINVAL;
1978 error = valid_user_selector(l, gr[_REG_SS]);
1979 if (error != 0)
1980 return error;
1981#endif
1982 } else {
1983#define VUD(sel) \
1984 ((p->p_flag & PK_32) ? VALID_USER_DSEL32(sel) : VALID_USER_DSEL(sel))
1985 sel = gr[_REG_ES] & 0xffff;
1986 if (sel != 0 && !VUD(sel))
1987 return EINVAL;
1988
1989/* XXX: Shouldn't this be FSEL32? */
1990#define VUF(sel) \
1991 ((p->p_flag & PK_32) ? VALID_USER_DSEL32(sel) : VALID_USER_DSEL(sel))
1992 sel = gr[_REG_FS] & 0xffff;
1993 if (sel != 0 && !VUF(sel))
1994 return EINVAL;
1995
1996#define VUG(sel) \
1997 ((p->p_flag & PK_32) ? VALID_USER_GSEL32(sel) : VALID_USER_DSEL(sel))
1998 sel = gr[_REG_GS] & 0xffff;
1999 if (sel != 0 && !VUG(sel))
2000 return EINVAL;
2001
2002 sel = gr[_REG_DS] & 0xffff;
2003 if (!VUD(sel))
2004 return EINVAL;
2005
2006#ifndef XEN
2007 sel = gr[_REG_SS] & 0xffff;
2008 if (!VUD(sel))
2009 return EINVAL;
2010#endif
2011
2012 }
2013
2014#ifndef XEN
2015#define VUC(sel) \
2016 ((p->p_flag & PK_32) ? VALID_USER_CSEL32(sel) : VALID_USER_CSEL(sel))
2017 sel = gr[_REG_CS] & 0xffff;
2018 if (!VUC(sel))
2019 return EINVAL;
2020#endif
2021
2022 if (gr[_REG_RIP] >= VM_MAXUSER_ADDRESS)
2023 return EINVAL;
2024 return 0;
2025}
2026
2027void
2028cpu_initclocks(void)
2029{
2030 (*initclock_func)();
2031}
2032
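/*
 * Check that a user-supplied segment selector refers to a present
 * memory segment whose base lies below VM_MAXUSER_ADDRESS.  A null
 * selector is always fine; only LDT selectors are accepted otherwise,
 * and anything else fails with EINVAL.
 */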
2033static int
2034valid_user_selector(struct lwp *l, uint64_t seg)
2035{
2036 int off, len;
2037 char *dt;
2038 struct mem_segment_descriptor *sdp;
2039 struct proc *p = l->l_proc;
2040 struct pmap *pmap= p->p_vmspace->vm_map.pmap;
2041 uint64_t base;
2042
2043 seg &= 0xffff;
2044
2045 if (seg == 0)
2046 return 0;
2047
2048 off = (seg & 0xfff8);
2049 if (seg & SEL_LDT) {
2050 if (pmap->pm_ldt != NULL) {
2051 len = pmap->pm_ldt_len; /* XXX broken */
2052 dt = (char *)pmap->pm_ldt;
2053 } else {
2054 dt = ldtstore;
2055 len = LDT_SIZE;
2056 }
2057
2058 if (off > (len - 8))
2059 return EINVAL;
2060 } else {
2061 CTASSERT(GUDATA_SEL & SEL_LDT);
2062 KASSERT(seg != GUDATA_SEL);
2063 CTASSERT(GUDATA32_SEL & SEL_LDT);
2064 KASSERT(seg != GUDATA32_SEL);
2065 return EINVAL;
2066 }
2067
2068 sdp = (struct mem_segment_descriptor *)(dt + off);
2069 if (sdp->sd_type < SDT_MEMRO || sdp->sd_p == 0)
2070 return EINVAL;
2071
2072 base = ((uint64_t)sdp->sd_hibase << 32) | ((uint64_t)sdp->sd_lobase);
2073 if (sdp->sd_gran == 1)
2074 base <<= PAGE_SHIFT;
2075
2076 if (base >= VM_MAXUSER_ADDRESS)
2077 return EINVAL;
2078
2079 return 0;
2080}
2081
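/*
 * mm(4) access check for kernel virtual addresses: writes to the text
 * and rodata segments are refused, module map addresses are checked
 * against their mapping protection, and addresses outside the kernel
 * image and module map are left for the caller (*handled = false).
 */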
2082int
2083mm_md_kernacc(void *ptr, vm_prot_t prot, bool *handled)
2084{
2085 extern int start, __data_start;
2086 const vaddr_t v = (vaddr_t)ptr;
2087
2088 if (v >= (vaddr_t)&start && v < (vaddr_t)kern_end) {
2089 *handled = true;
2090 /* Either the text or rodata segment */
2091 if (v < (vaddr_t)&__data_start && (prot & VM_PROT_WRITE))
2092 return EFAULT;
2093
2094 } else if (v >= module_start && v < module_end) {
2095 *handled = true;
2096 if (!uvm_map_checkprot(module_map, v, v + 1, prot))
2097 return EFAULT;
2098 } else {
2099 *handled = false;
2100 }
2101 return 0;
2102}
2103
2104/*
2105 * Zero out an LWP's TLS context (%fs and %gs and associated stuff).
2106 * Used when exec'ing a new program.
2107 */
2108
2109void
2110cpu_fsgs_zero(struct lwp *l)
2111{
2112 struct trapframe * const tf = l->l_md.md_regs;
2113 struct pcb *pcb;
2114 uint64_t zero = 0;
2115
2116 pcb = lwp_getpcb(l);
2117 if (l == curlwp) {
2118 kpreempt_disable();
2119 tf->tf_fs = 0;
2120 tf->tf_gs = 0;
2121 setfs(0);
2122#ifndef XEN
2123 setusergs(0);
2124#else
2125 HYPERVISOR_set_segment_base(SEGBASE_GS_USER_SEL, 0);
2126#endif
2127 if ((l->l_proc->p_flag & PK_32) == 0) {
2128#ifndef XEN
2129 wrmsr(MSR_FSBASE, 0);
2130 wrmsr(MSR_KERNELGSBASE, 0);
2131#else
2132 HYPERVISOR_set_segment_base(SEGBASE_FS, 0);
2133 HYPERVISOR_set_segment_base(SEGBASE_GS_USER, 0);
2134#endif
2135 }
2136 pcb->pcb_fs = 0;
2137 pcb->pcb_gs = 0;
2138 update_descriptor(&curcpu()->ci_gdt[GUFS_SEL], &zero);
2139 update_descriptor(&curcpu()->ci_gdt[GUGS_SEL], &zero);
2140 kpreempt_enable();
2141 } else {
2142 tf->tf_fs = 0;
2143 tf->tf_gs = 0;
2144 pcb->pcb_fs = 0;
2145 pcb->pcb_gs = 0;
2146 }
2147
2148}
2149
2150/*
2151 * Load an LWP's TLS context, possibly changing the %fs and %gs selectors.
2152 * Used only for 32-bit processes.
2153 */
2154
2155void
2156cpu_fsgs_reload(struct lwp *l, int fssel, int gssel)
2157{
2158 struct trapframe *tf;
2159 struct pcb *pcb;
2160
2161 KASSERT(l->l_proc->p_flag & PK_32);
2162 tf = l->l_md.md_regs;
2163 if (l == curlwp) {
2164 pcb = lwp_getpcb(l);
2165 kpreempt_disable();
2166 update_descriptor(&curcpu()->ci_gdt[GUFS_SEL], &pcb->pcb_fs);
2167 update_descriptor(&curcpu()->ci_gdt[GUGS_SEL], &pcb->pcb_gs);
2168 setfs(fssel);
2169#ifndef XEN
2170 setusergs(gssel);
2171#else
2172 HYPERVISOR_set_segment_base(SEGBASE_GS_USER_SEL, gssel);
2173#endif
2174 tf->tf_fs = fssel;
2175 tf->tf_gs = gssel;
2176 kpreempt_enable();
2177 } else {
2178 tf->tf_fs = fssel;
2179 tf->tf_gs = gssel;
2180 }
2181}
2182
2183
2184#ifdef __HAVE_DIRECT_MAP
2185bool
2186mm_md_direct_mapped_io(void *addr, paddr_t *paddr)
2187{
2188 vaddr_t va = (vaddr_t)addr;
2189
2190 if (va >= PMAP_DIRECT_BASE && va < PMAP_DIRECT_END) {
2191 *paddr = PMAP_DIRECT_UNMAP(va);
2192 return true;
2193 }
2194 return false;
2195}
2196
2197bool
2198mm_md_direct_mapped_phys(paddr_t paddr, vaddr_t *vaddr)
2199{
2200 *vaddr = PMAP_DIRECT_MAP(paddr);
2201 return true;
2202}
2203#endif
2204