1/* $NetBSD: pmap.c,v 1.227 2016/11/17 16:32:06 maxv Exp $ */
2
3/*-
4 * Copyright (c) 2008, 2010, 2016 The NetBSD Foundation, Inc.
5 * All rights reserved.
6 *
7 * This code is derived from software contributed to The NetBSD Foundation
8 * by Andrew Doran, and by Maxime Villard.
9 *
10 * Redistribution and use in source and binary forms, with or without
11 * modification, are permitted provided that the following conditions
12 * are met:
13 * 1. Redistributions of source code must retain the above copyright
14 * notice, this list of conditions and the following disclaimer.
15 * 2. Redistributions in binary form must reproduce the above copyright
16 * notice, this list of conditions and the following disclaimer in the
17 * documentation and/or other materials provided with the distribution.
18 *
19 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
20 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
21 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
22 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
23 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
24 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
25 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
26 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
27 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
28 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
29 * POSSIBILITY OF SUCH DAMAGE.
30 */
31
32/*
33 * Copyright (c) 2007 Manuel Bouyer.
34 *
35 * Redistribution and use in source and binary forms, with or without
36 * modification, are permitted provided that the following conditions
37 * are met:
38 * 1. Redistributions of source code must retain the above copyright
39 * notice, this list of conditions and the following disclaimer.
40 * 2. Redistributions in binary form must reproduce the above copyright
41 * notice, this list of conditions and the following disclaimer in the
42 * documentation and/or other materials provided with the distribution.
43 *
44 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
45 * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
46 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
47 * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
48 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
49 * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
50 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
51 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
52 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
53 * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
54 *
55 */
56
57/*
58 * Copyright (c) 2006 Mathieu Ropert <mro@adviseo.fr>
59 *
60 * Permission to use, copy, modify, and distribute this software for any
61 * purpose with or without fee is hereby granted, provided that the above
62 * copyright notice and this permission notice appear in all copies.
63 *
64 * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
65 * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
66 * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
67 * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
68 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
69 * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
70 * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
71 */
72
73/*
74 * Copyright (c) 1997 Charles D. Cranor and Washington University.
75 * All rights reserved.
76 *
77 * Redistribution and use in source and binary forms, with or without
78 * modification, are permitted provided that the following conditions
79 * are met:
80 * 1. Redistributions of source code must retain the above copyright
81 * notice, this list of conditions and the following disclaimer.
82 * 2. Redistributions in binary form must reproduce the above copyright
83 * notice, this list of conditions and the following disclaimer in the
84 * documentation and/or other materials provided with the distribution.
85 *
86 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
87 * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
88 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
89 * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
90 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
91 * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
92 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
93 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
94 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
95 * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
96 */
97
98/*
99 * Copyright 2001 (c) Wasabi Systems, Inc.
100 * All rights reserved.
101 *
102 * Written by Frank van der Linden for Wasabi Systems, Inc.
103 *
104 * Redistribution and use in source and binary forms, with or without
105 * modification, are permitted provided that the following conditions
106 * are met:
107 * 1. Redistributions of source code must retain the above copyright
108 * notice, this list of conditions and the following disclaimer.
109 * 2. Redistributions in binary form must reproduce the above copyright
110 * notice, this list of conditions and the following disclaimer in the
111 * documentation and/or other materials provided with the distribution.
112 * 3. All advertising materials mentioning features or use of this software
113 * must display the following acknowledgement:
114 * This product includes software developed for the NetBSD Project by
115 * Wasabi Systems, Inc.
116 * 4. The name of Wasabi Systems, Inc. may not be used to endorse
117 * or promote products derived from this software without specific prior
118 * written permission.
119 *
120 * THIS SOFTWARE IS PROVIDED BY WASABI SYSTEMS, INC. ``AS IS'' AND
121 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
122 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
123 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL WASABI SYSTEMS, INC
124 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
125 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
126 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
127 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
128 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
129 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
130 * POSSIBILITY OF SUCH DAMAGE.
131 */
132
133/*
134 * This is the i386 pmap modified and generalized to support x86-64
135 * as well. The idea is to hide the upper N levels of the page tables
136 * inside pmap_get_ptp, pmap_free_ptp and pmap_growkernel. The rest
137 * is mostly untouched, except that it uses some more generalized
138 * macros and interfaces.
139 *
140 * This pmap has been tested on the i386 as well, and it can be easily
141 * adapted to PAE.
142 *
143 * fvdl@wasabisystems.com 18-Jun-2001
144 */
145
146/*
147 * pmap.c: i386 pmap module rewrite
148 * Chuck Cranor <chuck@netbsd>
149 * 11-Aug-97
150 *
151 * history of this pmap module: in addition to my own input, i used
152 * the following references for this rewrite of the i386 pmap:
153 *
154 * [1] the NetBSD i386 pmap. this pmap appears to be based on the
155 * BSD hp300 pmap done by Mike Hibler at University of Utah.
156 * it was then ported to the i386 by William Jolitz of UUNET
157 * Technologies, Inc. Then Charles M. Hannum of the NetBSD
158 * project fixed some bugs and provided some speed ups.
159 *
160 * [2] the FreeBSD i386 pmap. this pmap seems to be the
161 * Hibler/Jolitz pmap, as modified for FreeBSD by John S. Dyson
162 * and David Greenman.
163 *
164 * [3] the Mach pmap. this pmap, from CMU, seems to have migrated
165 * between several processors. the VAX version was done by
166 * Avadis Tevanian, Jr., and Michael Wayne Young. the i386
167 * version was done by Lance Berc, Mike Kupfer, Bob Baron,
168 * David Golub, and Richard Draves. the alpha version was
169 * done by Alessandro Forin (CMU/Mach) and Chris Demetriou
170 * (NetBSD/alpha).
171 */
172
173#include <sys/cdefs.h>
174__KERNEL_RCSID(0, "$NetBSD: pmap.c,v 1.227 2016/11/17 16:32:06 maxv Exp $");
175
176#include "opt_user_ldt.h"
177#include "opt_lockdebug.h"
178#include "opt_multiprocessor.h"
179#include "opt_xen.h"
180#if !defined(__x86_64__)
181#include "opt_kstack_dr0.h"
182#endif /* !defined(__x86_64__) */
183
184#include <sys/param.h>
185#include <sys/systm.h>
186#include <sys/proc.h>
187#include <sys/pool.h>
188#include <sys/kernel.h>
189#include <sys/atomic.h>
190#include <sys/cpu.h>
191#include <sys/intr.h>
192#include <sys/xcall.h>
193#include <sys/kcore.h>
194
195#include <uvm/uvm.h>
196#include <uvm/pmap/pmap_pvt.h>
197
198#include <dev/isa/isareg.h>
199
200#include <machine/specialreg.h>
201#include <machine/gdt.h>
202#include <machine/isa_machdep.h>
203#include <machine/cpuvar.h>
204#include <machine/cputypes.h>
205
206#include <x86/pmap.h>
207#include <x86/pmap_pv.h>
208
209#include <x86/i82489reg.h>
210#include <x86/i82489var.h>
211
212#ifdef XEN
213#include <xen/xen-public/xen.h>
214#include <xen/hypervisor.h>
215#endif
216
217/*
218 * general info:
219 *
220 * - for an explanation of how the i386 MMU hardware works see
221 * the comments in <machine/pte.h>.
222 *
223 * - for an explanation of the general memory structure used by
224 * this pmap (including the recursive mapping), see the comments
225 * in <machine/pmap.h>.
226 *
227 * this file contains the code for the "pmap module." the module's
228 * job is to manage the hardware's virtual to physical address mappings.
229 * note that there are two levels of mapping in the VM system:
230 *
231 * [1] the upper layer of the VM system uses vm_map's and vm_map_entry's
232 * to map ranges of virtual address space to objects/files. for
233 * example, the vm_map may say: "map VA 0x1000 to 0x22000 read-only
234 * to the file /bin/ls starting at offset zero." note that
235 * the upper layer mapping is not concerned with how individual
236 * vm_pages are mapped.
237 *
238 * [2] the lower layer of the VM system (the pmap) maintains the mappings
239 * from virtual addresses. it is concerned with which vm_page is
240 * mapped where. for example, when you run /bin/ls and start
241 * at page 0x1000 the fault routine may lookup the correct page
242 * of the /bin/ls file and then ask the pmap layer to establish
243 * a mapping for it.
244 *
245 * note that information in the lower layer of the VM system can be
246 * thrown away since it can easily be reconstructed from the info
247 * in the upper layer.
248 *
249 * data structures we use include:
250 *
251 * - struct pmap: describes the address space of one thread
252 * - struct pmap_page: describes one pv-tracked page, without
253 * necessarily a corresponding vm_page
254 * - struct pv_entry: describes one <PMAP,VA> mapping of a PA
255 * - struct pv_head: there is one pv_head per pv-tracked page of
256 * physical memory. the pv_head points to a list of pv_entry
257 * structures which describe all the <PMAP,VA> pairs that this
258 * page is mapped in. this is critical for page based operations
259 * such as pmap_page_protect() [change protection on _all_ mappings
260 * of a page]
261 */
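/*
 * concrete example: a physical page mapped at VA 0x1000 in two different
 * processes has two pv_entry's on its pv list, one per <PMAP,VA> pair;
 * pmap_page_protect() walks exactly that list.
 */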
262
263/*
264 * memory allocation
265 *
266 * - there are three data structures that we must dynamically allocate:
267 *
268 * [A] new process' page directory page (PDP)
269 * - plan 1: done at pmap_create() we use
270 * uvm_km_alloc(kernel_map, PAGE_SIZE) [fka kmem_alloc] to do this
271 * allocation.
272 *
273 * if we are low in free physical memory then we sleep in
274 * uvm_km_alloc -- in this case this is ok since we are creating
275 * a new pmap and should not be holding any locks.
276 *
277 * if the kernel is totally out of virtual space
278 * (i.e. uvm_km_alloc returns NULL), then we panic.
279 *
280 * [B] new page tables pages (PTP)
281 * - call uvm_pagealloc()
282 * => success: zero page, add to pm_pdir
283 * => failure: we are out of free vm_pages, let pmap_enter()
284 * tell UVM about it.
285 *
286 * note: for kernel PTPs, we start with NKPTP of them. as we map
287 * kernel memory (at uvm_map time) we check to see if we've grown
288 * the kernel pmap. if so, we call the optional function
289 * pmap_growkernel() to grow the kernel PTPs in advance.
290 *
291 * [C] pv_entry structures
292 */
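/*
 * Illustrative sketch (not compiled): the three allocation paths above
 * reduced to their core calls, roughly as the rest of this file uses them
 * ("pmap" and "va" are placeholders; locking and error handling omitted).
 */
#if 0
	/* [A] PDP: from the PDP pool_cache (may sleep). */
	pd_entry_t *pdir = pool_cache_get(&pmap_pdp_cache, PR_WAITOK);

	/* [B] PTP: one zeroed vm_page owned by the pmap's pm_obj. */
	struct vm_page *ptp = uvm_pagealloc(&pmap->pm_obj[0],
	    ptp_va2o(va, 1), NULL, UVM_PGA_USERESERVE | UVM_PGA_ZERO);

	/* [C] pv_entry: from the pv_entry pool_cache. */
	struct pv_entry *pve = pool_cache_get(&pmap_pv_cache, PR_NOWAIT);
#endif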
293
294/*
295 * locking
296 *
297 * we have the following locks that we must contend with:
298 *
299 * mutexes:
300 *
301 * - pmap lock (per pmap, part of uvm_object)
302 * this lock protects the fields in the pmap structure including
303 * the non-kernel PDEs in the PDP, and the PTEs. it also locks
304 * in the alternate PTE space (since that is determined by the
305 * entry in the PDP).
306 *
307 * - pvh_lock (per pv_head)
308 * this lock protects the pv_entry list which is chained off the
309 * pv_head structure for a specific pv-tracked PA. it is locked
310 * when traversing the list (e.g. adding/removing mappings,
311 * syncing R/M bits, etc.)
312 *
313 * - pmaps_lock
314 * this lock protects the list of active pmaps (headed by "pmaps").
315 * we lock it when adding or removing pmaps from this list.
316 */
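/*
 * Illustrative sketch (not compiled): the pmaps_lock rule above in
 * practice, e.g. when every active pmap must be visited (as
 * pmap_growkernel() does when extending the kernel page tables).
 */
#if 0
	struct pmap *pm;

	mutex_enter(&pmaps_lock);
	LIST_FOREACH(pm, &pmaps, pm_list) {
		/* per-pmap work; must not sleep while holding pmaps_lock */
	}
	mutex_exit(&pmaps_lock);
#endif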
317
318const vaddr_t ptp_masks[] = PTP_MASK_INITIALIZER;
319const int ptp_shifts[] = PTP_SHIFT_INITIALIZER;
320const long nkptpmax[] = NKPTPMAX_INITIALIZER;
321const long nbpd[] = NBPD_INITIALIZER;
322pd_entry_t * const normal_pdes[] = PDES_INITIALIZER;
323
324long nkptp[] = NKPTP_INITIALIZER;
325
326struct pmap_head pmaps;
327kmutex_t pmaps_lock;
328
329static vaddr_t pmap_maxkvaddr;
330
331/*
332 * XXX kludge: dummy locking to make KASSERTs in uvm_page.c comfortable.
333 * actual locking is done by pm_lock.
334 */
335#if defined(DIAGNOSTIC)
336#define PMAP_SUBOBJ_LOCK(pm, idx) \
337 KASSERT(mutex_owned((pm)->pm_lock)); \
338 if ((idx) != 0) \
339 mutex_enter((pm)->pm_obj[(idx)].vmobjlock)
340#define PMAP_SUBOBJ_UNLOCK(pm, idx) \
341 KASSERT(mutex_owned((pm)->pm_lock)); \
342 if ((idx) != 0) \
343 mutex_exit((pm)->pm_obj[(idx)].vmobjlock)
344#else /* defined(DIAGNOSTIC) */
345#define PMAP_SUBOBJ_LOCK(pm, idx) /* nothing */
346#define PMAP_SUBOBJ_UNLOCK(pm, idx) /* nothing */
347#endif /* defined(DIAGNOSTIC) */
348
349/*
350 * Misc. event counters.
351 */
352struct evcnt pmap_iobmp_evcnt;
353struct evcnt pmap_ldt_evcnt;
354
355/*
356 * PAT
357 */
358#define PATENTRY(n, type) (type << ((n) * 8))
359#define PAT_UC 0x0ULL
360#define PAT_WC 0x1ULL
361#define PAT_WT 0x4ULL
362#define PAT_WP 0x5ULL
363#define PAT_WB 0x6ULL
364#define PAT_UCMINUS 0x7ULL
365
366static bool cpu_pat_enabled __read_mostly = false;
367
368/*
369 * Global data structures
370 */
371
372static struct pmap kernel_pmap_store; /* the kernel's pmap (proc0) */
373struct pmap *const kernel_pmap_ptr = &kernel_pmap_store;
374
375/*
376 * pmap_pg_nx: if our processor supports PG_NX in the PTE then we
377 * set pmap_pg_nx to PG_NX (otherwise it is zero).
378 */
379pd_entry_t pmap_pg_nx __read_mostly = 0;
380
381/*
382 * pmap_pg_g: if our processor supports PG_G in the PTE then we
383 * set pmap_pg_g to PG_G (otherwise it is zero).
384 */
385pd_entry_t pmap_pg_g __read_mostly = 0;
386
387/*
388 * pmap_largepages: if our processor supports PG_PS and we are
389 * using it, this is set to true.
390 */
391int pmap_largepages __read_mostly = 0;
392
393/*
394 * i386 physical memory comes in a big contig chunk with a small
395 * hole toward the front of it... the following two paddr_t's
396 * (shared with machdep.c) describe the physical address space
397 * of this machine.
398 */
399paddr_t avail_start __read_mostly; /* PA of first available physical page */
400paddr_t avail_end __read_mostly; /* PA of last available physical page */
401
402#ifdef XEN
403#ifdef __x86_64__
404/* Dummy PGD for user cr3, used between pmap_deactivate() and pmap_activate() */
405static paddr_t xen_dummy_user_pgd;
406#endif /* __x86_64__ */
407paddr_t pmap_pa_start; /* PA of first physical page for this domain */
408paddr_t pmap_pa_end; /* PA of last physical page for this domain */
409#endif /* XEN */
410
411#define VM_PAGE_TO_PP(pg) (&(pg)->mdpage.mp_pp)
412
413#define PV_HASH_SIZE 32768
414#define PV_HASH_LOCK_CNT 32
415
416struct pv_hash_lock {
417 kmutex_t lock;
418} __aligned(CACHE_LINE_SIZE) pv_hash_locks[PV_HASH_LOCK_CNT]
419 __aligned(CACHE_LINE_SIZE);
420
421struct pv_hash_head {
422 SLIST_HEAD(, pv_entry) hh_list;
423} pv_hash_heads[PV_HASH_SIZE];
424
425static u_int
426pvhash_hash(struct vm_page *ptp, vaddr_t va)
427{
428
429 return (uintptr_t)ptp / sizeof(*ptp) + (va >> PAGE_SHIFT);
430}
431
432static struct pv_hash_head *
433pvhash_head(u_int hash)
434{
435
436 return &pv_hash_heads[hash % PV_HASH_SIZE];
437}
438
439static kmutex_t *
440pvhash_lock(u_int hash)
441{
442
443 return &pv_hash_locks[hash % PV_HASH_LOCK_CNT].lock;
444}
445
446static struct pv_entry *
447pvhash_remove(struct pv_hash_head *hh, struct vm_page *ptp, vaddr_t va)
448{
449 struct pv_entry *pve;
450 struct pv_entry *prev;
451
452 prev = NULL;
453 SLIST_FOREACH(pve, &hh->hh_list, pve_hash) {
454 if (pve->pve_pte.pte_ptp == ptp &&
455 pve->pve_pte.pte_va == va) {
456 if (prev != NULL) {
457 SLIST_REMOVE_AFTER(prev, pve_hash);
458 } else {
459 SLIST_REMOVE_HEAD(&hh->hh_list, pve_hash);
460 }
461 break;
462 }
463 prev = pve;
464 }
465 return pve;
466}
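/*
 * Illustrative sketch (not compiled): how the helpers above are combined
 * by insert_pv() and pmap_remove_pv() further down ("ptp" and "va" are
 * placeholders for the page table page and VA of the mapping).
 */
#if 0
	struct pv_entry *pve;
	u_int hash = pvhash_hash(ptp, va);
	kmutex_t *lock = pvhash_lock(hash);
	struct pv_hash_head *hh = pvhash_head(hash);

	mutex_spin_enter(lock);
	pve = pvhash_remove(hh, ptp, va);	/* or SLIST_INSERT_HEAD() */
	mutex_spin_exit(lock);
#endif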
467
468/*
469 * Other data structures
470 */
471
472static pt_entry_t protection_codes[8] __read_mostly;
473
474static bool pmap_initialized __read_mostly = false; /* pmap_init done yet? */
475
476/*
477 * The following two vaddr_t's are used during system startup to keep track of
478 * how much of the kernel's VM space we have used. Once the system is started,
479 * the management of the remaining kernel VM space is turned over to the
480 * kernel_map vm_map.
481 */
482static vaddr_t virtual_avail __read_mostly; /* VA of first free KVA */
483static vaddr_t virtual_end __read_mostly; /* VA of last free KVA */
484
485/*
486 * pool that pmap structures are allocated from
487 */
488static struct pool_cache pmap_cache;
489
490/*
491 * pv_entry cache
492 */
493static struct pool_cache pmap_pv_cache;
494
495#ifndef __HAVE_DIRECT_MAP
496/*
497 * MULTIPROCESSOR: special VAs and PTEs are actually allocated inside a
498 * (maxcpus * NPTECL) array of PTE, to avoid cache line thrashing due to
499 * false sharing.
500 */
501#ifdef MULTIPROCESSOR
502#define PTESLEW(pte, id) ((pte)+(id)*NPTECL)
503#define VASLEW(va,id) ((va)+(id)*NPTECL*PAGE_SIZE)
504#else
505#define PTESLEW(pte, id) ((void)id, pte)
506#define VASLEW(va,id) ((void)id, va)
507#endif
508
509/*
510 * Special VAs and the PTEs that map them
511 */
512static pt_entry_t *csrc_pte, *cdst_pte, *zero_pte, *ptp_pte, *early_zero_pte;
513static char *csrcp, *cdstp, *zerop, *ptpp;
514#ifdef XEN
515char *early_zerop; /* also referenced from xen_locore() */
516#else
517static char *early_zerop;
518#endif
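/*
 * Illustrative sketch (not compiled): how the slewed VA/PTE pairs are
 * used, e.g. to zero a physical page "pa" (placeholder) through this
 * CPU's private zerop window, roughly what pmap_zero_page() does on
 * kernels without a direct map.
 */
#if 0
	int id = cpu_number();
	pt_entry_t *zpte = PTESLEW(zero_pte, id);
	char *zva = VASLEW(zerop, id);

	*zpte = pmap_pa2pte(pa) | PG_V | PG_RW | PG_M | PG_U | PG_k;
	pmap_pte_flush();
	pmap_update_pg((vaddr_t)zva);		/* flush stale TLB entry */
	memset(zva, 0, PAGE_SIZE);
	*zpte = 0;
	pmap_pte_flush();
#endif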
519
520#endif
521
522int pmap_enter_default(pmap_t, vaddr_t, paddr_t, vm_prot_t, u_int);
523
524/* PDP pool_cache(9) and its callbacks */
525struct pool_cache pmap_pdp_cache;
526static int pmap_pdp_ctor(void *, void *, int);
527static void pmap_pdp_dtor(void *, void *);
528#ifdef PAE
529/* need to allocate items of 4 pages */
530static void *pmap_pdp_alloc(struct pool *, int);
531static void pmap_pdp_free(struct pool *, void *);
532static struct pool_allocator pmap_pdp_allocator = {
533 .pa_alloc = pmap_pdp_alloc,
534 .pa_free = pmap_pdp_free,
535 .pa_pagesz = PAGE_SIZE * PDP_SIZE,
536};
537#endif /* PAE */
538
539extern vaddr_t idt_vaddr;
540extern paddr_t idt_paddr;
541extern vaddr_t gdt_vaddr;
542extern paddr_t gdt_paddr;
543extern vaddr_t ldt_vaddr;
544extern paddr_t ldt_paddr;
545
546extern int end;
547
548#ifdef i386
549/* stuff to fix the pentium f00f bug */
550extern vaddr_t pentium_idt_vaddr;
551#endif
552
553/*
554 * Local prototypes
555 */
556
557#ifdef __HAVE_DIRECT_MAP
558static void pmap_init_directmap(struct pmap *);
559#endif
560#ifndef XEN
561static void pmap_remap_largepages(void);
562#endif
563
564static struct vm_page *pmap_get_ptp(struct pmap *, vaddr_t,
565 pd_entry_t * const *);
566static struct vm_page *pmap_find_ptp(struct pmap *, vaddr_t, paddr_t, int);
567static void pmap_freepage(struct pmap *, struct vm_page *, int);
568static void pmap_free_ptp(struct pmap *, struct vm_page *, vaddr_t,
569 pt_entry_t *, pd_entry_t * const *);
570static bool pmap_remove_pte(struct pmap *, struct vm_page *, pt_entry_t *,
571 vaddr_t, struct pv_entry **);
572static void pmap_remove_ptes(struct pmap *, struct vm_page *, vaddr_t, vaddr_t,
573 vaddr_t, struct pv_entry **);
574
575static paddr_t pmap_get_physpage(void);
576static void pmap_alloc_level(vaddr_t, long *);
577
578static bool pmap_reactivate(struct pmap *);
579
580/*
581 * p m a p h e l p e r f u n c t i o n s
582 */
583
584static inline void
585pmap_stats_update(struct pmap *pmap, int resid_diff, int wired_diff)
586{
587
588 if (pmap == pmap_kernel()) {
589 atomic_add_long(&pmap->pm_stats.resident_count, resid_diff);
590 atomic_add_long(&pmap->pm_stats.wired_count, wired_diff);
591 } else {
592 KASSERT(mutex_owned(pmap->pm_lock));
593 pmap->pm_stats.resident_count += resid_diff;
594 pmap->pm_stats.wired_count += wired_diff;
595 }
596}
597
598static inline void
599pmap_stats_update_bypte(struct pmap *pmap, pt_entry_t npte, pt_entry_t opte)
600{
601 int resid_diff = ((npte & PG_V) ? 1 : 0) - ((opte & PG_V) ? 1 : 0);
602 int wired_diff = ((npte & PG_W) ? 1 : 0) - ((opte & PG_W) ? 1 : 0);
603
604 KASSERT((npte & (PG_V | PG_W)) != PG_W);
605 KASSERT((opte & (PG_V | PG_W)) != PG_W);
606
607 pmap_stats_update(pmap, resid_diff, wired_diff);
608}
609
610/*
611 * ptp_to_pmap: lookup pmap by ptp
612 */
613
614static struct pmap *
615ptp_to_pmap(struct vm_page *ptp)
616{
617 struct pmap *pmap;
618
619 if (ptp == NULL) {
620 return pmap_kernel();
621 }
622 pmap = (struct pmap *)ptp->uobject;
623 KASSERT(pmap != NULL);
624 KASSERT(&pmap->pm_obj[0] == ptp->uobject);
625 return pmap;
626}
627
628static inline struct pv_pte *
629pve_to_pvpte(struct pv_entry *pve)
630{
631
632 KASSERT((void *)&pve->pve_pte == (void *)pve);
633 return &pve->pve_pte;
634}
635
636static inline struct pv_entry *
637pvpte_to_pve(struct pv_pte *pvpte)
638{
639 struct pv_entry *pve = (void *)pvpte;
640
641 KASSERT(pve_to_pvpte(pve) == pvpte);
642 return pve;
643}
644
645/*
646 * pv_pte_first, pv_pte_next: PV list iterator.
647 */
648
649static struct pv_pte *
650pv_pte_first(struct pmap_page *pp)
651{
652
653 if ((pp->pp_flags & PP_EMBEDDED) != 0) {
654 return &pp->pp_pte;
655 }
656 return pve_to_pvpte(LIST_FIRST(&pp->pp_head.pvh_list));
657}
658
659static struct pv_pte *
660pv_pte_next(struct pmap_page *pp, struct pv_pte *pvpte)
661{
662
663 KASSERT(pvpte != NULL);
664 if (pvpte == &pp->pp_pte) {
665 KASSERT((pp->pp_flags & PP_EMBEDDED) != 0);
666 return NULL;
667 }
668 KASSERT((pp->pp_flags & PP_EMBEDDED) == 0);
669 return pve_to_pvpte(LIST_NEXT(pvpte_to_pve(pvpte), pve_list));
670}
671
672/*
673 * pmap_is_curpmap: is this pmap the one currently loaded [in %cr3]?
674 * of course the kernel is always loaded
675 */
676
677bool
678pmap_is_curpmap(struct pmap *pmap)
679{
680 return((pmap == pmap_kernel()) ||
681 (pmap == curcpu()->ci_pmap));
682}
683
684/*
685 * Add a reference to the specified pmap.
686 */
687
688void
689pmap_reference(struct pmap *pmap)
690{
691
692 atomic_inc_uint(&pmap->pm_obj[0].uo_refs);
693}
694
695/*
696 * pmap_map_ptes: map a pmap's PTEs into KVM and lock them in
697 *
698 * there are several pmaps involved. some or all of them might be same.
699 *
700 * - the pmap given by the first argument
701 * our caller wants to access this pmap's PTEs.
702 *
703 * - pmap_kernel()
704 * the kernel pmap. note that it only contains the kernel part
705 * of the address space which is shared by any pmap. ie. any
706 * pmap can be used instead of pmap_kernel() for our purpose.
707 *
708 * - ci->ci_pmap
709 * pmap currently loaded on the cpu.
710 *
711 * - vm_map_pmap(&curproc->p_vmspace->vm_map)
712 * current process' pmap.
713 *
714 * => we lock enough pmaps to keep things locked in
715 * => must be undone with pmap_unmap_ptes before returning
716 */
717
718void
719pmap_map_ptes(struct pmap *pmap, struct pmap **pmap2,
720 pd_entry_t **ptepp, pd_entry_t * const **pdeppp)
721{
722 struct pmap *curpmap;
723 struct cpu_info *ci;
724 lwp_t *l;
725
726 /* The kernel's pmap is always accessible. */
727 if (pmap == pmap_kernel()) {
728 *pmap2 = NULL;
729 *ptepp = PTE_BASE;
730 *pdeppp = normal_pdes;
731 return;
732 }
733 KASSERT(kpreempt_disabled());
734
735 l = curlwp;
736 retry:
737 mutex_enter(pmap->pm_lock);
738 ci = curcpu();
739 curpmap = ci->ci_pmap;
740 if (vm_map_pmap(&l->l_proc->p_vmspace->vm_map) == pmap) {
741 /* Our own pmap so just load it: easy. */
742 if (__predict_false(ci->ci_want_pmapload)) {
743 mutex_exit(pmap->pm_lock);
744 pmap_load();
745 goto retry;
746 }
747 KASSERT(pmap == curpmap);
748 } else if (pmap == curpmap) {
749 /*
750 * Already on the CPU: make it valid. This is very
751 * often the case during exit(), when we have switched
752 * to the kernel pmap in order to destroy a user pmap.
753 */
754 if (!pmap_reactivate(pmap)) {
755 u_int gen = uvm_emap_gen_return();
756 tlbflush();
757 uvm_emap_update(gen);
758 }
759 } else {
760 /*
761 * Toss current pmap from CPU, but keep a reference to it.
762 * The reference will be dropped by pmap_unmap_ptes().
763 * Can happen if we block during exit().
764 */
765 const cpuid_t cid = cpu_index(ci);
766
767 kcpuset_atomic_clear(curpmap->pm_cpus, cid);
768 kcpuset_atomic_clear(curpmap->pm_kernel_cpus, cid);
769 ci->ci_pmap = pmap;
770 ci->ci_tlbstate = TLBSTATE_VALID;
771 kcpuset_atomic_set(pmap->pm_cpus, cid);
772 kcpuset_atomic_set(pmap->pm_kernel_cpus, cid);
773 cpu_load_pmap(pmap, curpmap);
774 }
775 pmap->pm_ncsw = l->l_ncsw;
776 *pmap2 = curpmap;
777 *ptepp = PTE_BASE;
778#if defined(XEN) && defined(__x86_64__)
779 KASSERT(ci->ci_normal_pdes[PTP_LEVELS - 2] == L4_BASE);
780 ci->ci_normal_pdes[PTP_LEVELS - 2] = pmap->pm_pdir;
781 *pdeppp = ci->ci_normal_pdes;
782#else /* XEN && __x86_64__ */
783 *pdeppp = normal_pdes;
784#endif /* XEN && __x86_64__ */
785}
786
787/*
788 * pmap_unmap_ptes: unlock the PTE mapping of "pmap"
789 */
790
791void
792pmap_unmap_ptes(struct pmap *pmap, struct pmap *pmap2)
793{
794 struct cpu_info *ci;
795 struct pmap *mypmap;
796
797 KASSERT(kpreempt_disabled());
798
799 /* The kernel's pmap is always accessible. */
800 if (pmap == pmap_kernel()) {
801 return;
802 }
803
804 ci = curcpu();
805#if defined(XEN) && defined(__x86_64__)
806 /* Reset per-cpu normal_pdes */
807 KASSERT(ci->ci_normal_pdes[PTP_LEVELS - 2] != L4_BASE);
808 ci->ci_normal_pdes[PTP_LEVELS - 2] = L4_BASE;
809#endif /* XEN && __x86_64__ */
810 /*
811 * We cannot tolerate context switches while mapped in.
812 * If it is our own pmap all we have to do is unlock.
813 */
814 KASSERT(pmap->pm_ncsw == curlwp->l_ncsw);
815 mypmap = vm_map_pmap(&curproc->p_vmspace->vm_map);
816 if (pmap == mypmap) {
817 mutex_exit(pmap->pm_lock);
818 return;
819 }
820
821 /*
822 * Mark whatever's on the CPU now as lazy and unlock.
823 * If the pmap was already installed, we are done.
824 */
825 ci->ci_tlbstate = TLBSTATE_LAZY;
826 ci->ci_want_pmapload = (mypmap != pmap_kernel());
827 mutex_exit(pmap->pm_lock);
828 if (pmap == pmap2) {
829 return;
830 }
831
832 /*
833 * We installed another pmap on the CPU. Grab a reference to
834 * it and leave in place. Toss the evicted pmap (can block).
835 */
836 pmap_reference(pmap);
837 pmap_destroy(pmap2);
838}
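/*
 * Illustrative sketch (not compiled): the usual calling pattern for the
 * two functions above, as used by pmap_enter()/pmap_remove() elsewhere in
 * this file ("pmap" and "va" are placeholders).
 */
#if 0
	struct pmap *pmap2;
	pt_entry_t *ptes;
	pd_entry_t * const *pdes;

	kpreempt_disable();
	pmap_map_ptes(pmap, &pmap2, &ptes, &pdes);	/* locks pmap */
	/* ... examine or modify ptes[pl1_i(va)] ... */
	pmap_unmap_ptes(pmap, pmap2);			/* unlocks pmap */
	kpreempt_enable();
#endif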
839
840
841inline static void
842pmap_exec_account(struct pmap *pm, vaddr_t va, pt_entry_t opte, pt_entry_t npte)
843{
844
845#if !defined(__x86_64__)
846 if (curproc == NULL || curproc->p_vmspace == NULL ||
847 pm != vm_map_pmap(&curproc->p_vmspace->vm_map))
848 return;
849
850 if ((opte ^ npte) & PG_X)
851 pmap_update_pg(va);
852
853 /*
854 * Executability was removed on the last executable change.
855 * Reset the code segment to something conservative and
856 * let the trap handler deal with setting the right limit.
857 * We can't do that because of locking constraints on the vm map.
858 */
859
860 if ((opte & PG_X) && (npte & PG_X) == 0 && va == pm->pm_hiexec) {
861 struct trapframe *tf = curlwp->l_md.md_regs;
862
863 tf->tf_cs = GSEL(GUCODE_SEL, SEL_UPL);
864 pm->pm_hiexec = I386_MAX_EXE_ADDR;
865 }
866#endif /* !defined(__x86_64__) */
867}
868
869#if !defined(__x86_64__)
870/*
871 * Fixup the code segment to cover all potential executable mappings.
872 * returns 0 if no changes to the code segment were made.
873 */
874
875int
876pmap_exec_fixup(struct vm_map *map, struct trapframe *tf, struct pcb *pcb)
877{
878 struct vm_map_entry *ent;
879 struct pmap *pm = vm_map_pmap(map);
880 vaddr_t va = 0;
881
882 vm_map_lock_read(map);
883 for (ent = (&map->header)->next; ent != &map->header; ent = ent->next) {
884
885 /*
886 * This entry has greater va than the entries before.
887 * We need to make it point to the last page, not past it.
888 */
889
890 if (ent->protection & VM_PROT_EXECUTE)
891 va = trunc_page(ent->end) - PAGE_SIZE;
892 }
893 vm_map_unlock_read(map);
894 if (va == pm->pm_hiexec && tf->tf_cs == GSEL(GUCODEBIG_SEL, SEL_UPL))
895 return (0);
896
897 pm->pm_hiexec = va;
898 if (pm->pm_hiexec > I386_MAX_EXE_ADDR) {
899 tf->tf_cs = GSEL(GUCODEBIG_SEL, SEL_UPL);
900 } else {
901 tf->tf_cs = GSEL(GUCODE_SEL, SEL_UPL);
902 return (0);
903 }
904 return (1);
905}
906#endif /* !defined(__x86_64__) */
907
908void
909pat_init(struct cpu_info *ci)
910{
911 uint64_t pat;
912
913 if (!(ci->ci_feat_val[0] & CPUID_PAT))
914 return;
915
916 /* We change WT to WC. Leave all other entries the default values. */
917 pat = PATENTRY(0, PAT_WB) | PATENTRY(1, PAT_WC) |
918 PATENTRY(2, PAT_UCMINUS) | PATENTRY(3, PAT_UC) |
919 PATENTRY(4, PAT_WB) | PATENTRY(5, PAT_WC) |
920 PATENTRY(6, PAT_UCMINUS) | PATENTRY(7, PAT_UC);
921
922 wrmsr(MSR_CR_PAT, pat);
923 cpu_pat_enabled = true;
924 aprint_debug_dev(ci->ci_dev, "PAT enabled\n");
925}
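/*
 * For reference: with the PATENTRY() encoding above, the value written to
 * MSR_CR_PAT by pat_init() works out to 0x0007010600070106ULL, i.e.
 * entries 0/4 = WB, 1/5 = WC, 2/6 = UC- and 3/7 = UC.
 */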
926
927static pt_entry_t
928pmap_pat_flags(u_int flags)
929{
930 u_int cacheflags = (flags & PMAP_CACHE_MASK);
931
932 if (!cpu_pat_enabled) {
933 switch (cacheflags) {
934 case PMAP_NOCACHE:
935 case PMAP_NOCACHE_OVR:
			/*
			 * results in PGC_UCMINUS on cpus which have
			 * the cpuid PAT but PAT "disabled"
			 */
939 return PG_N;
940 default:
941 return 0;
942 }
943 }
944
945 switch (cacheflags) {
946 case PMAP_NOCACHE:
947 return PGC_UC;
948 case PMAP_WRITE_COMBINE:
949 return PGC_WC;
950 case PMAP_WRITE_BACK:
951 return PGC_WB;
952 case PMAP_NOCACHE_OVR:
953 return PGC_UCMINUS;
954 }
955
956 return 0;
957}
958
959/*
960 * p m a p k e n t e r f u n c t i o n s
961 *
962 * functions to quickly enter/remove pages from the kernel address
963 * space. pmap_kremove is exported to MI kernel. we make use of
964 * the recursive PTE mappings.
965 */
966
967/*
968 * pmap_kenter_pa: enter a kernel mapping without R/M (pv_entry) tracking
969 *
970 * => no need to lock anything, assume va is already allocated
971 * => should be faster than normal pmap enter function
972 */
973
974void
975pmap_kenter_pa(vaddr_t va, paddr_t pa, vm_prot_t prot, u_int flags)
976{
977 pt_entry_t *pte, opte, npte;
978
979 KASSERT(!(prot & ~VM_PROT_ALL));
980
981 if (va < VM_MIN_KERNEL_ADDRESS)
982 pte = vtopte(va);
983 else
984 pte = kvtopte(va);
985#ifdef DOM0OPS
986 if (pa < pmap_pa_start || pa >= pmap_pa_end) {
987#ifdef DEBUG
988 printf_nolog("%s: pa 0x%" PRIx64 " for va 0x%" PRIx64
989 " outside range\n", __func__, (int64_t)pa, (int64_t)va);
990#endif /* DEBUG */
991 npte = pa;
992 } else
993#endif /* DOM0OPS */
994 npte = pmap_pa2pte(pa);
995 npte |= protection_codes[prot] | PG_k | PG_V | pmap_pg_g;
996 npte |= pmap_pat_flags(flags);
997 opte = pmap_pte_testset(pte, npte); /* zap! */
998#if defined(DIAGNOSTIC)
999 /*
1000 * XXX: make sure we are not dealing with a large page, since the only
1001 * large pages created are for the kernel image, and they should never
1002 * be kentered.
1003 */
1004 if (opte & PG_PS)
1005 panic("%s: PG_PS", __func__);
1006#endif
1007 if ((opte & (PG_V | PG_U)) == (PG_V | PG_U)) {
1008 /* This should not happen. */
1009 printf_nolog("%s: mapping already present\n", __func__);
1010 kpreempt_disable();
1011 pmap_tlb_shootdown(pmap_kernel(), va, opte, TLBSHOOT_KENTER);
1012 kpreempt_enable();
1013 }
1014}
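/*
 * Illustrative sketch (not compiled): typical use of pmap_kenter_pa() by
 * MD or driver code to map a known physical page "pa" (placeholder) into
 * kernel VA space, with the pmap_update() calls the interface expects.
 */
#if 0
	vaddr_t va = uvm_km_alloc(kernel_map, PAGE_SIZE, 0,
	    UVM_KMF_VAONLY | UVM_KMF_WAITVA);

	pmap_kenter_pa(va, pa, VM_PROT_READ | VM_PROT_WRITE, 0);
	pmap_update(pmap_kernel());
	/* ... use (void *)va ... */
	pmap_kremove(va, PAGE_SIZE);
	pmap_update(pmap_kernel());
	uvm_km_free(kernel_map, va, PAGE_SIZE, UVM_KMF_VAONLY);
#endif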
1015
1016void
1017pmap_emap_enter(vaddr_t va, paddr_t pa, vm_prot_t prot)
1018{
1019 pt_entry_t *pte, npte;
1020
1021 KASSERT((prot & ~VM_PROT_ALL) == 0);
1022 pte = (va < VM_MIN_KERNEL_ADDRESS) ? vtopte(va) : kvtopte(va);
1023
1024#ifdef DOM0OPS
1025 if (pa < pmap_pa_start || pa >= pmap_pa_end) {
1026 npte = pa;
1027 } else
1028#endif
1029 npte = pmap_pa2pte(pa);
1032 npte |= protection_codes[prot] | PG_k | PG_V;
1033 pmap_pte_set(pte, npte);
1034}
1035
1036/*
1037 * pmap_emap_sync: perform TLB flush or pmap load, if it was deferred.
1038 */
1039void
1040pmap_emap_sync(bool canload)
1041{
1042 struct cpu_info *ci = curcpu();
1043 struct pmap *pmap;
1044
1045 KASSERT(kpreempt_disabled());
1046 if (__predict_true(ci->ci_want_pmapload && canload)) {
1047 /*
1048 * XXX: Hint for pmap_reactivate(), which might suggest to
1049 * not perform TLB flush, if state has not changed.
1050 */
1051 pmap = vm_map_pmap(&curlwp->l_proc->p_vmspace->vm_map);
1052 if (__predict_false(pmap == ci->ci_pmap)) {
1053 kcpuset_atomic_clear(pmap->pm_cpus, cpu_index(ci));
1054 }
1055 pmap_load();
1056 KASSERT(ci->ci_want_pmapload == 0);
1057 } else {
1058 tlbflush();
1059 }
1060}
1061
1062void
1063pmap_emap_remove(vaddr_t sva, vsize_t len)
1064{
1065 pt_entry_t *pte;
1066 vaddr_t va, eva = sva + len;
1067
1068 for (va = sva; va < eva; va += PAGE_SIZE) {
1069 pte = (va < VM_MIN_KERNEL_ADDRESS) ? vtopte(va) : kvtopte(va);
1070 pmap_pte_set(pte, 0);
1071 }
1072}
1073
1074__strict_weak_alias(pmap_kenter_ma, pmap_kenter_pa);
1075
1076#if defined(__x86_64__)
1077/*
1078 * Change protection for a virtual address. Local for a CPU only, don't
1079 * care about TLB shootdowns.
1080 *
1081 * => must be called with preemption disabled
1082 */
1083void
1084pmap_changeprot_local(vaddr_t va, vm_prot_t prot)
1085{
1086 pt_entry_t *pte, opte, npte;
1087
1088 KASSERT(kpreempt_disabled());
1089
1090 if (va < VM_MIN_KERNEL_ADDRESS)
1091 pte = vtopte(va);
1092 else
1093 pte = kvtopte(va);
1094
1095 npte = opte = *pte;
1096
1097 if ((prot & VM_PROT_WRITE) != 0)
1098 npte |= PG_RW;
1099 else
1100 npte &= ~PG_RW;
1101
1102 if (opte != npte) {
1103 pmap_pte_set(pte, npte);
1104 pmap_pte_flush();
1105 invlpg(va);
1106 }
1107}
1108#endif /* defined(__x86_64__) */
1109
1110/*
1111 * pmap_kremove: remove a kernel mapping(s) without R/M (pv_entry) tracking
1112 *
1113 * => no need to lock anything
1114 * => caller must dispose of any vm_page mapped in the va range
1115 * => note: not an inline function
1116 * => we assume the va is page aligned and the len is a multiple of PAGE_SIZE
1117 * => we assume kernel only unmaps valid addresses and thus don't bother
1118 * checking the valid bit before doing TLB flushing
1119 * => must be followed by call to pmap_update() before reuse of page
1120 */
1121
1122static inline void
1123pmap_kremove1(vaddr_t sva, vsize_t len, bool localonly)
1124{
1125 pt_entry_t *pte, opte;
1126 vaddr_t va, eva;
1127
1128 eva = sva + len;
1129
1130 kpreempt_disable();
1131 for (va = sva; va < eva; va += PAGE_SIZE) {
1132 pte = kvtopte(va);
1133 opte = pmap_pte_testset(pte, 0); /* zap! */
1134 if ((opte & (PG_V | PG_U)) == (PG_V | PG_U) && !localonly) {
1135 pmap_tlb_shootdown(pmap_kernel(), va, opte,
1136 TLBSHOOT_KREMOVE);
1137 }
1138 KASSERT((opte & PG_PS) == 0);
1139 KASSERT((opte & PG_PVLIST) == 0);
1140 }
1141 if (localonly) {
1142 tlbflushg();
1143 }
1144 kpreempt_enable();
1145}
1146
1147void
1148pmap_kremove(vaddr_t sva, vsize_t len)
1149{
1150
1151 pmap_kremove1(sva, len, false);
1152}
1153
1154/*
1155 * pmap_kremove_local: like pmap_kremove(), but only worry about
1156 * TLB invalidations on the current CPU. this is only intended
1157 * for use while writing kernel crash dumps.
1158 */
1159
1160void
1161pmap_kremove_local(vaddr_t sva, vsize_t len)
1162{
1163
1164 KASSERT(panicstr != NULL);
1165 pmap_kremove1(sva, len, true);
1166}
1167
1168/*
1169 * p m a p i n i t f u n c t i o n s
1170 *
1171 * pmap_bootstrap and pmap_init are called during system startup
1172 * to init the pmap module. pmap_bootstrap() does a low level
1173 * init just to get things rolling. pmap_init() finishes the job.
1174 */
1175
1176/*
1177 * pmap_bootstrap_valloc: allocate a virtual address in the bootstrap area.
1178 * This function is to be used before any VM system has been set up.
1179 *
1180 * The va is taken from virtual_avail.
1181 */
1182static vaddr_t
1183pmap_bootstrap_valloc(size_t npages)
1184{
1185 vaddr_t va = virtual_avail;
1186 virtual_avail += npages * PAGE_SIZE;
1187 return va;
1188}
1189
1190/*
1191 * pmap_bootstrap_palloc: allocate a physical address in the bootstrap area.
1192 * This function is to be used before any VM system has been set up.
1193 *
1194 * The pa is taken from avail_start.
1195 */
1196static paddr_t
1197pmap_bootstrap_palloc(size_t npages)
1198{
1199 paddr_t pa = avail_start;
1200 avail_start += npages * PAGE_SIZE;
1201 return pa;
1202}
1203
1204/*
1205 * pmap_bootstrap: get the system in a state where it can run with VM properly
1206 * enabled (called before main()). The VM system is fully init'd later.
1207 *
1208 * => on i386, locore.S has already enabled the MMU by allocating a PDP for the
1209 * kernel, and nkpde PTP's for the kernel.
1210 * => kva_start is the first free virtual address in kernel space.
1211 */
1212void
1213pmap_bootstrap(vaddr_t kva_start)
1214{
1215 struct pmap *kpm;
1216 int i;
1217 vaddr_t kva;
1218#ifndef XEN
1219 unsigned long p1i;
1220 vaddr_t kva_end;
1221#endif
1222
1223 pmap_pg_nx = (cpu_feature[2] & CPUID_NOX ? PG_NX : 0);
1224
1225 /*
1226 * Set up our local static global vars that keep track of the usage of
1227 * KVM before kernel_map is set up.
1228 */
1229 virtual_avail = kva_start; /* first free KVA */
1230 virtual_end = VM_MAX_KERNEL_ADDRESS; /* last KVA */
1231
1232 /*
1233 * Set up protection_codes: we need to be able to convert from a MI
1234 * protection code (some combo of VM_PROT...) to something we can jam
1235 * into a x86 PTE.
1236 */
1237 protection_codes[VM_PROT_NONE] = pmap_pg_nx;
1238 protection_codes[VM_PROT_EXECUTE] = PG_RO | PG_X;
1239 protection_codes[VM_PROT_READ] = PG_RO | pmap_pg_nx;
1240 protection_codes[VM_PROT_READ|VM_PROT_EXECUTE] = PG_RO | PG_X;
1241 protection_codes[VM_PROT_WRITE] = PG_RW | pmap_pg_nx;
1242 protection_codes[VM_PROT_WRITE|VM_PROT_EXECUTE] = PG_RW | PG_X;
1243 protection_codes[VM_PROT_WRITE|VM_PROT_READ] = PG_RW | pmap_pg_nx;
1244 protection_codes[VM_PROT_ALL] = PG_RW | PG_X;
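	/*
	 * Example: a later pmap_kenter_pa(va, pa, VM_PROT_READ | VM_PROT_WRITE,
	 * 0) therefore ORs protection_codes[VM_PROT_READ | VM_PROT_WRITE] =
	 * PG_RW | pmap_pg_nx into the PTE: writable, and non-executable
	 * whenever the CPU supports NX.
	 */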
1245
1246 /*
1247 * Now we init the kernel's pmap.
1248 *
1249 * The kernel pmap's pm_obj is not used for much. However, in user pmaps
1250 * the pm_obj contains the list of active PTPs.
1251 *
1252 * The pm_obj currently does not have a pager. It might be possible to
1253 * add a pager that would allow a process to read-only mmap its own page
1254 * tables (fast user-level vtophys?). This may or may not be useful.
1255 */
1256 kpm = pmap_kernel();
1257 for (i = 0; i < PTP_LEVELS - 1; i++) {
1258 mutex_init(&kpm->pm_obj_lock[i], MUTEX_DEFAULT, IPL_NONE);
1259 uvm_obj_init(&kpm->pm_obj[i], NULL, false, 1);
1260 uvm_obj_setlock(&kpm->pm_obj[i], &kpm->pm_obj_lock[i]);
1261 kpm->pm_ptphint[i] = NULL;
1262 }
1263 memset(&kpm->pm_list, 0, sizeof(kpm->pm_list)); /* pm_list not used */
1264
1265 kpm->pm_pdir = (pd_entry_t *)(PDPpaddr + KERNBASE);
1266 for (i = 0; i < PDP_SIZE; i++)
1267 kpm->pm_pdirpa[i] = PDPpaddr + PAGE_SIZE * i;
1268
1269 kpm->pm_stats.wired_count = kpm->pm_stats.resident_count =
1270 x86_btop(kva_start - VM_MIN_KERNEL_ADDRESS);
1271
1272 kcpuset_create(&kpm->pm_cpus, true);
1273 kcpuset_create(&kpm->pm_kernel_cpus, true);
1274
1275 /*
1276 * the above is just a rough estimate and not critical to the proper
1277 * operation of the system.
1278 */
1279
1280#ifndef XEN
1281 /*
1282 * Begin to enable global TLB entries if they are supported.
1283 * The G bit has no effect until the CR4_PGE bit is set in CR4,
1284 * which happens in cpu_init(), which is run on each cpu
1285 * (and happens later)
1286 */
1287 if (cpu_feature[0] & CPUID_PGE) {
1288 pmap_pg_g = PG_G; /* enable software */
1289
1290 /* add PG_G attribute to already mapped kernel pages */
1291
1292 if (KERNBASE == VM_MIN_KERNEL_ADDRESS) {
1293 /* i386 only */
1294 kva_end = virtual_avail;
1295 } else {
1296 /* amd64 only */
1297 extern vaddr_t kern_end;
1298 kva_end = kern_end;
1299 }
1300
1301 for (kva = KERNBASE; kva < kva_end; kva += PAGE_SIZE) {
1302 p1i = pl1_i(kva);
1303 if (pmap_valid_entry(PTE_BASE[p1i]))
1304 PTE_BASE[p1i] |= PG_G;
1305 }
1306 }
1307
1308 /*
1309 * Enable large pages if they are supported.
1310 */
1311 if (cpu_feature[0] & CPUID_PSE) {
1312 lcr4(rcr4() | CR4_PSE); /* enable hardware (via %cr4) */
1313 pmap_largepages = 1; /* enable software */
1314
1315 /*
1316 * The TLB must be flushed after enabling large pages on Pentium
1317 * CPUs, according to section 3.6.2.2 of "Intel Architecture
1318 * Software Developer's Manual, Volume 3: System Programming".
1319 */
1320 tlbflushg();
1321
1322 /* Remap the kernel. */
1323 pmap_remap_largepages();
1324 }
1325#endif /* !XEN */
1326
1327#ifdef __HAVE_DIRECT_MAP
1328 pmap_init_directmap(kpm);
1329#else
1330 if (VM_MIN_KERNEL_ADDRESS != KERNBASE) {
1331 /*
1332 * zero_pte is stuck at the end of mapped space for the kernel
1333 * image (disjunct from kva space). This is done so that it
1334 * can safely be used in pmap_growkernel (pmap_get_physpage),
1335 * when it's called for the first time.
1336 * XXXfvdl fix this for MULTIPROCESSOR later.
1337 */
1338#ifdef XEN
1339 /* early_zerop initialized in xen_locore() */
1340#else
1341 early_zerop = (void *)(KERNBASE + NKL2_KIMG_ENTRIES * NBPD_L2);
1342#endif
1343 early_zero_pte = PTE_BASE + pl1_i((vaddr_t)early_zerop);
1344 }
1345
1346 /*
1347 * Now we allocate the "special" VAs which are used for tmp mappings
1348 * by the pmap (and other modules). We allocate the VAs by advancing
1349 * virtual_avail (note that there are no pages mapped at these VAs).
1350 * we find the PTE that maps the allocated VA via the linear PTE
1351 * mapping.
1352 */
1353
1354 pt_entry_t *pte = PTE_BASE + pl1_i(virtual_avail);
1355
1356#ifdef MULTIPROCESSOR
1357 /*
1358 * Waste some VA space to avoid false sharing of cache lines
1359 * for page table pages: Give each possible CPU a cache line
1360 * of PTE's (8) to play with, though we only need 4. We could
1361 * recycle some of this waste by putting the idle stacks here
1362 * as well; we could waste less space if we knew the largest
1363 * CPU ID beforehand.
1364 */
1365 csrcp = (char *) virtual_avail; csrc_pte = pte;
1366
1367 cdstp = (char *) virtual_avail+PAGE_SIZE; cdst_pte = pte+1;
1368
1369 zerop = (char *) virtual_avail+PAGE_SIZE*2; zero_pte = pte+2;
1370
1371 ptpp = (char *) virtual_avail+PAGE_SIZE*3; ptp_pte = pte+3;
1372
1373 virtual_avail += PAGE_SIZE * maxcpus * NPTECL;
1374 pte += maxcpus * NPTECL;
1375#else
1376 csrcp = (void *) virtual_avail; csrc_pte = pte; /* allocate */
1377 virtual_avail += PAGE_SIZE; pte++; /* advance */
1378
1379 cdstp = (void *) virtual_avail; cdst_pte = pte;
1380 virtual_avail += PAGE_SIZE; pte++;
1381
1382 zerop = (void *) virtual_avail; zero_pte = pte;
1383 virtual_avail += PAGE_SIZE; pte++;
1384
1385 ptpp = (void *) virtual_avail; ptp_pte = pte;
1386 virtual_avail += PAGE_SIZE; pte++;
1387#endif
1388
1389 if (VM_MIN_KERNEL_ADDRESS == KERNBASE) {
1390 early_zerop = zerop;
1391 early_zero_pte = zero_pte;
1392 }
1393#endif
1394
1395#if defined(XEN) && defined(__x86_64__)
1396 /*
1397 * We want a dummy page directory for Xen: when deactivating a pmap, Xen
1398 * will still consider it active. So we set user PGD to this one to lift
1399 * all protection on the now inactive page tables set.
1400 */
1401 xen_dummy_user_pgd = pmap_bootstrap_palloc(1);
1402
1403 /* Zero fill it, the less checks in Xen it requires the better */
1404 memset((void *) (xen_dummy_user_pgd + KERNBASE), 0, PAGE_SIZE);
1405 /* Mark read-only */
1406 HYPERVISOR_update_va_mapping(xen_dummy_user_pgd + KERNBASE,
1407 pmap_pa2pte(xen_dummy_user_pgd) | PG_u | PG_V, UVMF_INVLPG);
1408 /* Pin as L4 */
1409 xpq_queue_pin_l4_table(xpmap_ptom_masked(xen_dummy_user_pgd));
1410#endif
1411
1412 /*
1413 * Allocate space for the IDT, GDT and LDT.
1414 */
1415 idt_vaddr = pmap_bootstrap_valloc(1);
1416 idt_paddr = pmap_bootstrap_palloc(1);
1417
1418 gdt_vaddr = pmap_bootstrap_valloc(1);
1419 gdt_paddr = pmap_bootstrap_palloc(1);
1420
1421 ldt_vaddr = pmap_bootstrap_valloc(1);
1422 ldt_paddr = pmap_bootstrap_palloc(1);
1423
1424#if !defined(__x86_64__) && !defined(XEN)
1425 /* pentium f00f bug stuff */
1426 pentium_idt_vaddr = pmap_bootstrap_valloc(1);
1427#endif
1428
1429 /*
1430 * Now we reserve some VM for mapping pages when doing a crash dump.
1431 */
1432 virtual_avail = reserve_dumppages(virtual_avail);
1433
1434 /*
1435 * Init the static-global locks and global lists.
1436 *
1437 * => pventry::pvh_lock (initialized elsewhere) must also be
1438 * a spin lock, again at IPL_VM to prevent deadlock, and
1439 * again is never taken from interrupt context.
1440 */
1441 mutex_init(&pmaps_lock, MUTEX_DEFAULT, IPL_NONE);
1442 LIST_INIT(&pmaps);
1443
1444 /*
1445 * Ensure the TLB is sync'd with reality by flushing it...
1446 */
1447 tlbflushg();
1448
1449 /*
1450 * Calculate pmap_maxkvaddr from nkptp[].
1451 */
1452 kva = VM_MIN_KERNEL_ADDRESS;
1453 for (i = PTP_LEVELS - 1; i >= 1; i--) {
1454 kva += nkptp[i] * nbpd[i];
1455 }
1456 pmap_maxkvaddr = kva;
1457}
1458
1459#ifdef __HAVE_DIRECT_MAP
1460/*
1461 * Create the amd64 direct map. Called only once at boot time.
1462 */
1463static void
1464pmap_init_directmap(struct pmap *kpm)
1465{
1466 extern phys_ram_seg_t mem_clusters[];
1467 extern int mem_cluster_cnt;
1468
1469 paddr_t lastpa, dm_pd, dm_pdp, pdp;
1470 vaddr_t tmpva;
1471 pt_entry_t *pte;
1472 pd_entry_t *pde;
1473 phys_ram_seg_t *mc;
1474 long n_dm_pdp;
1475 int i;
1476
1477 const pd_entry_t pteflags = PG_V | PG_KW | pmap_pg_nx;
1478
1479 /* Get the last physical address available */
1480 lastpa = 0;
1481 for (i = 0; i < mem_cluster_cnt; i++) {
1482 mc = &mem_clusters[i];
1483 lastpa = MAX(lastpa, mc->start + mc->size);
1484 }
1485
1486 /*
1487 * We allocate only one L4 entry for the direct map (PDIR_SLOT_DIRECT),
1488 * so we cannot map more than 512GB.
1489 */
1490 if (lastpa > NBPD_L4) {
1491 panic("RAM limit reached: > 512GB not supported");
1492 }
1493
1494 /* Allocate L3. */
1495 dm_pdp = pmap_bootstrap_palloc(1);
1496
1497 /* Number of L3 entries. */
1498 n_dm_pdp = (lastpa + NBPD_L3 - 1) >> L3_SHIFT;
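	/*
	 * Example: with 16GB of RAM this gives n_dm_pdp = 16, i.e. one L3
	 * entry per 1GB of physical memory; with 1GB pages each entry is a
	 * leaf, otherwise each one gets its own L2 page allocated below.
	 */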
1499
1500 /* In locore.S, we allocated a tmp va. Use it now. */
1501 tmpva = (KERNBASE + NKL2_KIMG_ENTRIES * NBPD_L2);
1502 pte = PTE_BASE + pl1_i(tmpva);
1503 *pte = dm_pdp | pteflags;
1504 pmap_update_pg(tmpva);
1505 memset((void *)tmpva, 0, PAGE_SIZE);
1506
1507 /*
1508 * Map the direct map RW. Use super pages (1GB) or large pages (2MB) if
1509 * they are supported. Note: PG_G is not allowed on non-leaf PTPs.
1510 */
1511 if (cpu_feature[2] & CPUID_P1GB) {
1512 /* Super pages are supported. Just create L3. */
1513 for (i = 0; i < n_dm_pdp; i++) {
1514 pdp = (paddr_t)&(((pd_entry_t *)dm_pdp)[i]);
1515 *pte = (pdp & PG_FRAME) | pteflags;
1516 pmap_update_pg(tmpva);
1517
1518 pde = (pd_entry_t *)(tmpva + (pdp & ~PG_FRAME));
1519 *pde = ((paddr_t)i << L3_SHIFT) | pteflags | PG_U |
1520 PG_PS | PG_G;
1521 }
1522 } else {
1523 /* Allocate L2. */
1524 dm_pd = pmap_bootstrap_palloc(n_dm_pdp);
1525
1526 /* Zero out the L2 pages. */
1527 for (i = 0; i < n_dm_pdp; i++) {
1528 pdp = dm_pd + i * PAGE_SIZE;
1529 *pte = (pdp & PG_FRAME) | pteflags;
1530 pmap_update_pg(tmpva);
1531
1532 memset((void *)tmpva, 0, PAGE_SIZE);
1533 }
1534
1535 KASSERT(pmap_largepages != 0);
1536
1537 /* Large pages are supported. Just create L2. */
1538 for (i = 0; i < NPDPG * n_dm_pdp; i++) {
1539 pdp = (paddr_t)&(((pd_entry_t *)dm_pd)[i]);
1540 *pte = (pdp & PG_FRAME) | pteflags;
1541 pmap_update_pg(tmpva);
1542
1543 pde = (pd_entry_t *)(tmpva + (pdp & ~PG_FRAME));
1544 *pde = ((paddr_t)i << L2_SHIFT) | pteflags |
1545 PG_U | PG_PS | PG_G;
1546 }
1547
1548 /* Fill in the L3 entries, linked to L2. */
1549 for (i = 0; i < n_dm_pdp; i++) {
1550 pdp = (paddr_t)&(((pd_entry_t *)dm_pdp)[i]);
1551 *pte = (pdp & PG_FRAME) | pteflags;
1552 pmap_update_pg(tmpva);
1553
1554 pde = (pd_entry_t *)(tmpva + (pdp & ~PG_FRAME));
1555 *pde = (dm_pd + (i << PAGE_SHIFT)) | pteflags | PG_U;
1556 }
1557 }
1558
1559 kpm->pm_pdir[PDIR_SLOT_DIRECT] = dm_pdp | pteflags | PG_U;
1560
1561 *pte = 0;
1562 pmap_update_pg(tmpva);
1563
1564 tlbflush();
1565}
1566#endif /* __HAVE_DIRECT_MAP */
1567
1568#ifndef XEN
1569/*
1570 * Remap several kernel segments with large pages. We cover as many pages as we
1571 * can. Called only once at boot time, if the CPU supports large pages.
1572 */
1573static void
1574pmap_remap_largepages(void)
1575{
1576 extern char __rodata_start;
1577 extern char __data_start;
1578 extern char __kernel_end;
1579 pd_entry_t *pde;
1580 vaddr_t kva, kva_end;
1581 paddr_t pa;
1582
1583 /* Remap the kernel text using large pages. */
1584 kva = KERNBASE;
1585 kva_end = rounddown((vaddr_t)&__rodata_start, NBPD_L1);
1586 pa = kva - KERNBASE;
1587 for (/* */; kva + NBPD_L2 <= kva_end; kva += NBPD_L2, pa += NBPD_L2) {
1588 pde = &L2_BASE[pl2_i(kva)];
1589 *pde = pa | pmap_pg_g | PG_PS | PG_KR | PG_V;
1590 tlbflushg();
1591 }
1592#if defined(DEBUG)
1593 aprint_normal("kernel text is mapped with %" PRIuPSIZE " large "
1594 "pages and %" PRIuPSIZE " normal pages\n",
1595 howmany(kva - KERNBASE, NBPD_L2),
1596 howmany((vaddr_t)&__rodata_start - kva, NBPD_L1));
1597#endif /* defined(DEBUG) */
1598
1599 /* Remap the kernel rodata using large pages. */
1600 kva = roundup((vaddr_t)&__rodata_start, NBPD_L2);
1601 kva_end = rounddown((vaddr_t)&__data_start, NBPD_L1);
1602 pa = kva - KERNBASE;
1603 for (/* */; kva + NBPD_L2 <= kva_end; kva += NBPD_L2, pa += NBPD_L2) {
1604 pde = &L2_BASE[pl2_i(kva)];
1605 *pde = pa | pmap_pg_g | PG_PS | pmap_pg_nx | PG_KR | PG_V;
1606 tlbflushg();
1607 }
1608
1609 /* Remap the kernel data+bss using large pages. */
1610 /*
1611 * XXX: we need to make sure the first page (PAGE_SIZE) of .data is not
1612 * mapped with a large page. As bizarre as it might seem, this first
1613 * page is used as the VA for the LAPIC page.
1614 */
1615 kva = roundup((vaddr_t)&__data_start+PAGE_SIZE, NBPD_L2);
1616 kva_end = rounddown((vaddr_t)&__kernel_end, NBPD_L1);
1617 pa = kva - KERNBASE;
1618 for (/* */; kva + NBPD_L2 <= kva_end; kva += NBPD_L2, pa += NBPD_L2) {
1619 pde = &L2_BASE[pl2_i(kva)];
1620 *pde = pa | pmap_pg_g | PG_PS | pmap_pg_nx | PG_KW | PG_V;
1621 tlbflushg();
1622 }
1623}
1624#endif /* !XEN */
1625
1626/*
1627 * pmap_init: called from uvm_init, our job is to get the pmap
1628 * system ready to manage mappings...
1629 */
1630
1631void
1632pmap_init(void)
1633{
1634 int i, flags;
1635
1636 for (i = 0; i < PV_HASH_SIZE; i++) {
1637 SLIST_INIT(&pv_hash_heads[i].hh_list);
1638 }
1639 for (i = 0; i < PV_HASH_LOCK_CNT; i++) {
1640 mutex_init(&pv_hash_locks[i].lock, MUTEX_NODEBUG, IPL_VM);
1641 }
1642
1643 /*
1644 * initialize caches.
1645 */
1646
1647 pool_cache_bootstrap(&pmap_cache, sizeof(struct pmap), 0, 0, 0,
1648 "pmappl", NULL, IPL_NONE, NULL, NULL, NULL);
1649
1650#ifdef XEN
1651 /*
1652 * pool_cache(9) should not touch cached objects, since they
1653 * are pinned on xen and R/O for the domU
1654 */
1655 flags = PR_NOTOUCH;
1656#else /* XEN */
1657 flags = 0;
1658#endif /* XEN */
1659#ifdef PAE
1660 pool_cache_bootstrap(&pmap_pdp_cache, PAGE_SIZE * PDP_SIZE, 0, 0, flags,
1661 "pdppl", &pmap_pdp_allocator, IPL_NONE,
1662 pmap_pdp_ctor, pmap_pdp_dtor, NULL);
1663#else /* PAE */
1664 pool_cache_bootstrap(&pmap_pdp_cache, PAGE_SIZE, 0, 0, flags,
1665 "pdppl", NULL, IPL_NONE, pmap_pdp_ctor, pmap_pdp_dtor, NULL);
1666#endif /* PAE */
1667 pool_cache_bootstrap(&pmap_pv_cache, sizeof(struct pv_entry), 0, 0,
1668 PR_LARGECACHE, "pvpl", &pool_allocator_kmem, IPL_NONE, NULL,
1669 NULL, NULL);
1670
1671 pmap_tlb_init();
1672
1673 /* XXX: Since cpu_hatch() is only for secondary CPUs. */
1674 pmap_tlb_cpu_init(curcpu());
1675
1676 evcnt_attach_dynamic(&pmap_iobmp_evcnt, EVCNT_TYPE_MISC,
1677 NULL, "x86", "io bitmap copy");
1678 evcnt_attach_dynamic(&pmap_ldt_evcnt, EVCNT_TYPE_MISC,
1679 NULL, "x86", "ldt sync");
1680
1681 /*
1682 * done: pmap module is up (and ready for business)
1683 */
1684
1685 pmap_initialized = true;
1686}
1687
1688/*
1689 * pmap_cpu_init_late: perform late per-CPU initialization.
1690 */
1691
1692#ifndef XEN
1693void
1694pmap_cpu_init_late(struct cpu_info *ci)
1695{
1696 /*
1697 * The BP has already its own PD page allocated during early
1698 * MD startup.
1699 */
1700 if (ci == &cpu_info_primary)
1701 return;
1702
1703#ifdef PAE
1704 cpu_alloc_l3_page(ci);
1705#endif
1706}
1707#endif
1708
1709/*
1710 * p v _ e n t r y f u n c t i o n s
1711 */
1712
1713/*
1714 * pmap_free_pvs: free a list of pv_entrys
1715 */
1716
1717static void
1718pmap_free_pvs(struct pv_entry *pve)
1719{
1720 struct pv_entry *next;
1721
1722 for ( /* null */ ; pve != NULL ; pve = next) {
1723 next = pve->pve_next;
1724 pool_cache_put(&pmap_pv_cache, pve);
1725 }
1726}
1727
1728/*
1729 * main pv_entry manipulation functions:
1730 * pmap_enter_pv: enter a mapping onto a pv_head list
1731 * pmap_remove_pv: remove a mapping from a pv_head list
1732 *
1733 * NOTE: Both pmap_enter_pv and pmap_remove_pv expect the caller to lock
1734 * the pvh before calling
1735 */
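/*
 * Illustrative sketch (not compiled): how a caller such as pmap_enter()
 * preallocates a pv_entry plus the spare needed to undo PP_EMBEDDED
 * before taking locks, and frees whatever pmap_enter_pv() below did not
 * consume ("pp", "ptp" and "va" are placeholders).
 */
#if 0
	struct pv_entry *pve = pool_cache_get(&pmap_pv_cache, PR_NOWAIT);
	struct pv_entry *sparepve = pool_cache_get(&pmap_pv_cache, PR_NOWAIT);

	/* ... with the pv head locked: */
	pve = pmap_enter_pv(pp, pve, &sparepve, ptp, va);

	/* ... after unlocking: */
	if (pve != NULL)
		pool_cache_put(&pmap_pv_cache, pve);
	if (sparepve != NULL)
		pool_cache_put(&pmap_pv_cache, sparepve);
#endif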
1736
1737/*
1738 * insert_pv: a helper of pmap_enter_pv
1739 */
1740
1741static void
1742insert_pv(struct pmap_page *pp, struct pv_entry *pve)
1743{
1744 struct pv_hash_head *hh;
1745 kmutex_t *lock;
1746 u_int hash;
1747
1748 hash = pvhash_hash(pve->pve_pte.pte_ptp, pve->pve_pte.pte_va);
1749 lock = pvhash_lock(hash);
1750 hh = pvhash_head(hash);
1751 mutex_spin_enter(lock);
1752 SLIST_INSERT_HEAD(&hh->hh_list, pve, pve_hash);
1753 mutex_spin_exit(lock);
1754
1755 LIST_INSERT_HEAD(&pp->pp_head.pvh_list, pve, pve_list);
1756}
1757
1758/*
 * pmap_enter_pv: enter a mapping onto a pv_head list
1760 *
1761 * => caller should adjust ptp's wire_count before calling
1762 * => caller has preallocated pve and *sparepve for us
1763 */
1764
1765static struct pv_entry *
1766pmap_enter_pv(struct pmap_page *pp, struct pv_entry *pve,
1767 struct pv_entry **sparepve, struct vm_page *ptp, vaddr_t va)
1768{
1769
1770 KASSERT(ptp == NULL || ptp->wire_count >= 2);
1771 KASSERT(ptp == NULL || ptp->uobject != NULL);
1772 KASSERT(ptp == NULL || ptp_va2o(va, 1) == ptp->offset);
1773
1774 if ((pp->pp_flags & PP_EMBEDDED) == 0) {
1775 if (LIST_EMPTY(&pp->pp_head.pvh_list)) {
1776 pp->pp_flags |= PP_EMBEDDED;
1777 pp->pp_pte.pte_ptp = ptp;
1778 pp->pp_pte.pte_va = va;
1779
1780 return pve;
1781 }
1782 } else {
1783 struct pv_entry *pve2;
1784
1785 pve2 = *sparepve;
1786 *sparepve = NULL;
1787
1788 pve2->pve_pte = pp->pp_pte;
1789 pp->pp_flags &= ~PP_EMBEDDED;
1790 LIST_INIT(&pp->pp_head.pvh_list);
1791 insert_pv(pp, pve2);
1792 }
1793
1794 pve->pve_pte.pte_ptp = ptp;
1795 pve->pve_pte.pte_va = va;
1796 insert_pv(pp, pve);
1797
1798 return NULL;
1799}
1800
1801/*
1802 * pmap_remove_pv: try to remove a mapping from a pv_list
1803 *
1804 * => caller should adjust ptp's wire_count and free PTP if needed
1805 * => we return the removed pve
1806 */
1807
1808static struct pv_entry *
1809pmap_remove_pv(struct pmap_page *pp, struct vm_page *ptp, vaddr_t va)
1810{
1811 struct pv_hash_head *hh;
1812 struct pv_entry *pve;
1813 kmutex_t *lock;
1814 u_int hash;
1815
1816 KASSERT(ptp == NULL || ptp->uobject != NULL);
1817 KASSERT(ptp == NULL || ptp_va2o(va, 1) == ptp->offset);
1818
1819 if ((pp->pp_flags & PP_EMBEDDED) != 0) {
1820 KASSERT(pp->pp_pte.pte_ptp == ptp);
1821 KASSERT(pp->pp_pte.pte_va == va);
1822
1823 pp->pp_flags &= ~PP_EMBEDDED;
1824 LIST_INIT(&pp->pp_head.pvh_list);
1825
1826 return NULL;
1827 }
1828
1829 hash = pvhash_hash(ptp, va);
1830 lock = pvhash_lock(hash);
1831 hh = pvhash_head(hash);
1832 mutex_spin_enter(lock);
1833 pve = pvhash_remove(hh, ptp, va);
1834 mutex_spin_exit(lock);
1835
1836 LIST_REMOVE(pve, pve_list);
1837
1838 return pve;
1839}
1840
1841/*
1842 * p t p f u n c t i o n s
1843 */
1844
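/*
 * pmap_find_ptp: look up the PTP mapping 'va' at the given level.  The
 * per-level hint is checked first (against 'pa', if known) before
 * searching the pmap's uvm_object.
 *
 * => pmap must be locked
 */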
1845static inline struct vm_page *
1846pmap_find_ptp(struct pmap *pmap, vaddr_t va, paddr_t pa, int level)
1847{
1848 int lidx = level - 1;
1849 struct vm_page *pg;
1850
1851 KASSERT(mutex_owned(pmap->pm_lock));
1852
1853 if (pa != (paddr_t)-1 && pmap->pm_ptphint[lidx] &&
1854 pa == VM_PAGE_TO_PHYS(pmap->pm_ptphint[lidx])) {
1855 return (pmap->pm_ptphint[lidx]);
1856 }
1857 PMAP_SUBOBJ_LOCK(pmap, lidx);
1858 pg = uvm_pagelookup(&pmap->pm_obj[lidx], ptp_va2o(va, level));
1859 PMAP_SUBOBJ_UNLOCK(pmap, lidx);
1860
1861 KASSERT(pg == NULL || pg->wire_count >= 1);
1862 return pg;
1863}
1864
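/*
 * pmap_freepage: detach a no-longer-used PTP from the pmap's uvm_object
 * and queue it on the current LWP's deferred-free list (md_gc_ptp).
 * The page itself is freed later, once it is safe to do so.
 */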
1865static inline void
1866pmap_freepage(struct pmap *pmap, struct vm_page *ptp, int level)
1867{
1868 lwp_t *l;
1869 int lidx;
1870 struct uvm_object *obj;
1871
1872 KASSERT(ptp->wire_count == 1);
1873
1874 lidx = level - 1;
1875
1876 obj = &pmap->pm_obj[lidx];
1877 pmap_stats_update(pmap, -1, 0);
1878 if (lidx != 0)
1879 mutex_enter(obj->vmobjlock);
1880 if (pmap->pm_ptphint[lidx] == ptp)
1881 pmap->pm_ptphint[lidx] = TAILQ_FIRST(&obj->memq);
1882 ptp->wire_count = 0;
1883 uvm_pagerealloc(ptp, NULL, 0);
1884 l = curlwp;
1885 KASSERT((l->l_pflag & LP_INTR) == 0);
1886 VM_PAGE_TO_PP(ptp)->pp_link = l->l_md.md_gc_ptp;
1887 l->l_md.md_gc_ptp = ptp;
1888 if (lidx != 0)
1889 mutex_exit(obj->vmobjlock);
1890}
1891
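/*
 * pmap_free_ptp: free the level 1 PTP backing 'va' and walk upwards,
 * freeing any parent PTP that loses its last reference as a result.
 * The corresponding PDEs are zapped and TLB shootdowns are issued for
 * the recursive PTE mappings.
 */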
1892static void
1893pmap_free_ptp(struct pmap *pmap, struct vm_page *ptp, vaddr_t va,
1894 pt_entry_t *ptes, pd_entry_t * const *pdes)
1895{
1896 unsigned long index;
1897 int level;
1898 vaddr_t invaladdr;
1899 pd_entry_t opde;
1900
1901 KASSERT(pmap != pmap_kernel());
1902 KASSERT(mutex_owned(pmap->pm_lock));
1903 KASSERT(kpreempt_disabled());
1904
1905 level = 1;
1906 do {
1907 index = pl_i(va, level + 1);
1908 opde = pmap_pte_testset(&pdes[level - 1][index], 0);
1909#if defined(XEN)
1910# if defined(__x86_64__)
1911 /*
1912 * If ptp is a L3 currently mapped in kernel space,
1913 * on any cpu, clear it before freeing
1914 */
1915 if (level == PTP_LEVELS - 1) {
1916 /*
1917 * Update the per-cpu PD on all cpus the current
1918 * pmap is active on
1919 */
1920 xen_kpm_sync(pmap, index);
1921 }
1922# endif /*__x86_64__ */
1923 invaladdr = level == 1 ? (vaddr_t)ptes :
1924 (vaddr_t)pdes[level - 2];
1925 pmap_tlb_shootdown(pmap, invaladdr + index * PAGE_SIZE,
1926 opde, TLBSHOOT_FREE_PTP1);
1927 pmap_tlb_shootnow();
1928#else /* XEN */
1929 invaladdr = level == 1 ? (vaddr_t)ptes :
1930 (vaddr_t)pdes[level - 2];
1931 pmap_tlb_shootdown(pmap, invaladdr + index * PAGE_SIZE,
1932 opde, TLBSHOOT_FREE_PTP1);
1933#endif /* XEN */
1934 pmap_freepage(pmap, ptp, level);
1935 if (level < PTP_LEVELS - 1) {
1936 ptp = pmap_find_ptp(pmap, va, (paddr_t)-1, level + 1);
1937 ptp->wire_count--;
1938 if (ptp->wire_count > 1)
1939 break;
1940 }
1941 } while (++level < PTP_LEVELS);
1942 pmap_pte_flush();
1943}
1944
1945/*
1946 * pmap_get_ptp: get a PTP (if there isn't one, allocate a new one)
1947 *
1948 * => pmap should NOT be pmap_kernel()
1949 * => pmap should be locked
1950 * => preemption should be disabled
1951 */
1952
1953static struct vm_page *
1954pmap_get_ptp(struct pmap *pmap, vaddr_t va, pd_entry_t * const *pdes)
1955{
1956 struct vm_page *ptp, *pptp;
1957 int i;
1958 unsigned long index;
1959 pd_entry_t *pva;
1960 paddr_t ppa, pa;
1961 struct uvm_object *obj;
1962
1963 KASSERT(pmap != pmap_kernel());
1964 KASSERT(mutex_owned(pmap->pm_lock));
1965 KASSERT(kpreempt_disabled());
1966
1967 ptp = NULL;
1968 pa = (paddr_t)-1;
1969
1970 /*
1971 * Loop through all page table levels seeing if we need to
1972 * add a new page to that level.
1973 */
1974 for (i = PTP_LEVELS; i > 1; i--) {
1975 /*
1976 * Save values from previous round.
1977 */
1978 pptp = ptp;
1979 ppa = pa;
1980
1981 index = pl_i(va, i);
1982 pva = pdes[i - 2];
1983
1984 if (pmap_valid_entry(pva[index])) {
1985 ppa = pmap_pte2pa(pva[index]);
1986 ptp = NULL;
1987 continue;
1988 }
1989
1990 obj = &pmap->pm_obj[i-2];
1991 PMAP_SUBOBJ_LOCK(pmap, i - 2);
1992 ptp = uvm_pagealloc(obj, ptp_va2o(va, i - 1), NULL,
1993 UVM_PGA_USERESERVE|UVM_PGA_ZERO);
1994 PMAP_SUBOBJ_UNLOCK(pmap, i - 2);
1995
1996 if (ptp == NULL)
1997 return NULL;
1998
1999 ptp->flags &= ~PG_BUSY; /* never busy */
2000 ptp->wire_count = 1;
2001 pmap->pm_ptphint[i - 2] = ptp;
2002 pa = VM_PAGE_TO_PHYS(ptp);
2003 pmap_pte_set(&pva[index], (pd_entry_t)
2004 (pmap_pa2pte(pa) | PG_u | PG_RW | PG_V));
2005#if defined(XEN) && defined(__x86_64__)
		if (i == PTP_LEVELS) {
2007 /*
2008 * Update the per-cpu PD on all cpus the current
2009 * pmap is active on
2010 */
2011 xen_kpm_sync(pmap, index);
2012 }
2013#endif
2014 pmap_pte_flush();
2015 pmap_stats_update(pmap, 1, 0);
2016 /*
2017 * If we're not in the top level, increase the
2018 * wire count of the parent page.
2019 */
2020 if (i < PTP_LEVELS) {
2021 if (pptp == NULL) {
2022 pptp = pmap_find_ptp(pmap, va, ppa, i);
2023 KASSERT(pptp != NULL);
2024 }
2025 pptp->wire_count++;
2026 }
2027 }
2028
2029 /*
2030 * PTP is not NULL if we just allocated a new PTP. If it is
2031 * still NULL, we must look up the existing one.
2032 */
2033 if (ptp == NULL) {
2034 ptp = pmap_find_ptp(pmap, va, ppa, 1);
		KASSERTMSG(ptp != NULL, "pmap_get_ptp: va %" PRIxVADDR
		    " ppa %" PRIxPADDR "\n", va, ppa);
2037 }
2038
2039 pmap->pm_ptphint[0] = ptp;
2040 return ptp;
2041}
2042
2043/*
2044 * p m a p l i f e c y c l e f u n c t i o n s
2045 */
2046
2047/*
2048 * pmap_pdp_ctor: constructor for the PDP cache.
2049 */
2050static int
2051pmap_pdp_ctor(void *arg, void *v, int flags)
2052{
2053 pd_entry_t *pdir = v;
2054 paddr_t pdirpa = 0;
2055 vaddr_t object;
2056 int i;
2057
2058#if !defined(XEN) || !defined(__x86_64__)
2059 int npde;
2060#endif
2061#ifdef XEN
2062 int s;
2063#endif
2064
2065 /*
2066 * NOTE: The `pmaps_lock' is held when the PDP is allocated.
2067 */
2068
2069#if defined(XEN) && defined(__x86_64__)
2070 /* Fetch the physical address of the page directory */
2071 (void)pmap_extract(pmap_kernel(), (vaddr_t)pdir, &pdirpa);
2072
2073 /* Zero the area */
2074 memset(pdir, 0, PAGE_SIZE); /* Xen wants a clean page */
2075
2076 /*
2077 * This pdir will NEVER be active in kernel mode, so mark
2078 * recursive entry invalid.
2079 */
2080 pdir[PDIR_SLOT_PTE] = pmap_pa2pte(pdirpa) | PG_u;
2081
2082 /*
2083 * PDP constructed this way won't be for the kernel, hence we
2084 * don't put kernel mappings on Xen.
2085 *
2086 * But we need to make pmap_create() happy, so put a dummy
2087 * (without PG_V) value at the right place.
2088 */
2089 pdir[PDIR_SLOT_KERN + nkptp[PTP_LEVELS - 1] - 1] =
2090 (pd_entry_t)-1 & PG_FRAME;
2091#else /* XEN && __x86_64__*/
2092 /* Zero the area */
2093 memset(pdir, 0, PDIR_SLOT_PTE * sizeof(pd_entry_t));
2094
2095 object = (vaddr_t)v;
2096 for (i = 0; i < PDP_SIZE; i++, object += PAGE_SIZE) {
2097 /* Fetch the physical address of the page directory */
2098 (void)pmap_extract(pmap_kernel(), object, &pdirpa);
2099
2100 /* Put in recursive PDE to map the PTEs */
2101 pdir[PDIR_SLOT_PTE + i] = pmap_pa2pte(pdirpa) | PG_V |
2102 pmap_pg_nx;
2103#ifndef XEN
2104 pdir[PDIR_SLOT_PTE + i] |= PG_KW;
2105#endif
2106 }
2107
2108 /* Copy the kernel's top level PDE */
2109 npde = nkptp[PTP_LEVELS - 1];
2110
2111 memcpy(&pdir[PDIR_SLOT_KERN], &PDP_BASE[PDIR_SLOT_KERN],
2112 npde * sizeof(pd_entry_t));
2113
2114 /* Zero the rest */
2115 memset(&pdir[PDIR_SLOT_KERN + npde], 0, (PAGE_SIZE * PDP_SIZE) -
2116 (PDIR_SLOT_KERN + npde) * sizeof(pd_entry_t));
2117
2118 if (VM_MIN_KERNEL_ADDRESS != KERNBASE) {
2119 int idx = pl_i(KERNBASE, PTP_LEVELS);
2120 pdir[idx] = PDP_BASE[idx];
2121 }
2122
2123#ifdef __HAVE_DIRECT_MAP
2124 pdir[PDIR_SLOT_DIRECT] = PDP_BASE[PDIR_SLOT_DIRECT];
2125#endif
2126#endif /* XEN && __x86_64__*/
2127
2128#ifdef XEN
2129 s = splvm();
2130 object = (vaddr_t)v;
2131 pmap_protect(pmap_kernel(), object, object + (PAGE_SIZE * PDP_SIZE),
2132 VM_PROT_READ);
2133 pmap_update(pmap_kernel());
2134 for (i = 0; i < PDP_SIZE; i++, object += PAGE_SIZE) {
2135 /*
		 * Pin as an L2/L4 page. We have to do the page with the
		 * PDIR_SLOT_PTE entries last.
2138 */
2139#ifdef PAE
2140 if (i == l2tol3(PDIR_SLOT_PTE))
2141 continue;
2142#endif
2143
2144 (void) pmap_extract(pmap_kernel(), object, &pdirpa);
2145#ifdef __x86_64__
2146 xpq_queue_pin_l4_table(xpmap_ptom_masked(pdirpa));
2147#else
2148 xpq_queue_pin_l2_table(xpmap_ptom_masked(pdirpa));
2149#endif
2150 }
2151#ifdef PAE
2152 object = ((vaddr_t)pdir) + PAGE_SIZE * l2tol3(PDIR_SLOT_PTE);
2153 (void)pmap_extract(pmap_kernel(), object, &pdirpa);
2154 xpq_queue_pin_l2_table(xpmap_ptom_masked(pdirpa));
2155#endif
2156 splx(s);
2157#endif /* XEN */
2158
2159 return (0);
2160}
2161
2162/*
2163 * pmap_pdp_dtor: destructor for the PDP cache.
2164 */
2165
2166static void
2167pmap_pdp_dtor(void *arg, void *v)
2168{
2169#ifdef XEN
2170 paddr_t pdirpa = 0; /* XXX: GCC */
2171 vaddr_t object = (vaddr_t)v;
2172 int i;
2173 int s = splvm();
2174 pt_entry_t *pte;
2175
2176 for (i = 0; i < PDP_SIZE; i++, object += PAGE_SIZE) {
2177 /* fetch the physical address of the page directory. */
2178 (void) pmap_extract(pmap_kernel(), object, &pdirpa);
2179 /* unpin page table */
2180 xpq_queue_unpin_table(xpmap_ptom_masked(pdirpa));
2181 }
2182 object = (vaddr_t)v;
2183 for (i = 0; i < PDP_SIZE; i++, object += PAGE_SIZE) {
2184 /* Set page RW again */
2185 pte = kvtopte(object);
2186 pmap_pte_set(pte, *pte | PG_RW);
2187 xen_bcast_invlpg((vaddr_t)object);
2188 }
2189 splx(s);
2190#endif /* XEN */
2191}
2192
2193#ifdef PAE
2194
2195/* pmap_pdp_alloc: Allocate a page for the pdp memory pool. */
2196
2197static void *
2198pmap_pdp_alloc(struct pool *pp, int flags)
2199{
2200 return (void *)uvm_km_alloc(kernel_map,
2201 PAGE_SIZE * PDP_SIZE, PAGE_SIZE * PDP_SIZE,
2202 ((flags & PR_WAITOK) ? 0 : UVM_KMF_NOWAIT | UVM_KMF_TRYLOCK)
2203 | UVM_KMF_WIRED);
2204}
2205
2206/*
2207 * pmap_pdp_free: free a PDP
2208 */
2209
2210static void
2211pmap_pdp_free(struct pool *pp, void *v)
2212{
2213 uvm_km_free(kernel_map, (vaddr_t)v, PAGE_SIZE * PDP_SIZE,
2214 UVM_KMF_WIRED);
2215}
2216#endif /* PAE */
2217
2218/*
2219 * pmap_create: create a pmap object.
2220 */
2221struct pmap *
2222pmap_create(void)
2223{
2224 struct pmap *pmap;
2225 int i;
2226
2227 pmap = pool_cache_get(&pmap_cache, PR_WAITOK);
2228
2229 /* init uvm_object */
2230 for (i = 0; i < PTP_LEVELS - 1; i++) {
2231 mutex_init(&pmap->pm_obj_lock[i], MUTEX_DEFAULT, IPL_NONE);
2232 uvm_obj_init(&pmap->pm_obj[i], NULL, false, 1);
2233 uvm_obj_setlock(&pmap->pm_obj[i], &pmap->pm_obj_lock[i]);
2234 pmap->pm_ptphint[i] = NULL;
2235 }
2236 pmap->pm_stats.wired_count = 0;
2237 /* count the PDP allocd below */
2238 pmap->pm_stats.resident_count = PDP_SIZE;
2239#if !defined(__x86_64__)
2240 pmap->pm_hiexec = 0;
2241#endif /* !defined(__x86_64__) */
2242 pmap->pm_flags = 0;
2243 pmap->pm_gc_ptp = NULL;
2244
2245 kcpuset_create(&pmap->pm_cpus, true);
2246 kcpuset_create(&pmap->pm_kernel_cpus, true);
2247#ifdef XEN
2248 kcpuset_create(&pmap->pm_xen_ptp_cpus, true);
2249#endif
2250 /* init the LDT */
2251 pmap->pm_ldt = NULL;
2252 pmap->pm_ldt_len = 0;
2253 pmap->pm_ldt_sel = GSYSSEL(GLDT_SEL, SEL_KPL);
2254
2255 /* allocate PDP */
2256 try_again:
2257 pmap->pm_pdir = pool_cache_get(&pmap_pdp_cache, PR_WAITOK);
2258
2259 mutex_enter(&pmaps_lock);
2260
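	/*
	 * A cached PDP constructed before the kernel last grew
	 * (pmap_growkernel) lacks the newest kernel PDEs.  If the last
	 * expected kernel slot is still empty, destruct this PDP and
	 * construct a fresh one.
	 */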
2261 if (pmap->pm_pdir[PDIR_SLOT_KERN + nkptp[PTP_LEVELS - 1] - 1] == 0) {
2262 mutex_exit(&pmaps_lock);
2263 pool_cache_destruct_object(&pmap_pdp_cache, pmap->pm_pdir);
2264 goto try_again;
2265 }
2266
2267 for (i = 0; i < PDP_SIZE; i++)
2268 pmap->pm_pdirpa[i] =
2269 pmap_pte2pa(pmap->pm_pdir[PDIR_SLOT_PTE + i]);
2270
2271 LIST_INSERT_HEAD(&pmaps, pmap, pm_list);
2272
2273 mutex_exit(&pmaps_lock);
2274
2275 return (pmap);
2276}
2277
2278/*
2279 * pmap_free_ptps: put a list of ptps back to the freelist.
2280 */
2281
2282void
2283pmap_free_ptps(struct vm_page *empty_ptps)
2284{
2285 struct vm_page *ptp;
2286 struct pmap_page *pp;
2287
2288 while ((ptp = empty_ptps) != NULL) {
2289 pp = VM_PAGE_TO_PP(ptp);
2290 empty_ptps = pp->pp_link;
2291 LIST_INIT(&pp->pp_head.pvh_list);
2292 uvm_pagefree(ptp);
2293 }
2294}
2295
2296/*
2297 * pmap_destroy: drop reference count on pmap. free pmap if
2298 * reference count goes to zero.
2299 */
2300
2301void
2302pmap_destroy(struct pmap *pmap)
2303{
2304 lwp_t *l;
2305 int i;
2306
2307 /*
	 * If the current thread has torn down this pmap (pmap_remove_all),
	 * handle the deferred PTP frees and invalidations.  Process them
	 * now if the system is low on memory; otherwise hand them over to
	 * the pmap, to be freed once it is finally destroyed, thus
	 * avoiding a TLB shootdown.
2312 */
2313 l = curlwp;
2314 if (__predict_false(l->l_md.md_gc_pmap == pmap)) {
2315 if (uvmexp.free < uvmexp.freetarg) {
2316 pmap_update(pmap);
2317 } else {
2318 KASSERT(pmap->pm_gc_ptp == NULL);
2319 pmap->pm_gc_ptp = l->l_md.md_gc_ptp;
2320 l->l_md.md_gc_ptp = NULL;
2321 l->l_md.md_gc_pmap = NULL;
2322 }
2323 }
2324
2325 /*
2326 * drop reference count
2327 */
2328
2329 if (atomic_dec_uint_nv(&pmap->pm_obj[0].uo_refs) > 0) {
2330 return;
2331 }
2332
2333#ifdef DIAGNOSTIC
2334 CPU_INFO_ITERATOR cii;
2335 struct cpu_info *ci;
2336
2337 for (CPU_INFO_FOREACH(cii, ci)) {
2338 if (ci->ci_pmap == pmap)
2339 panic("destroying pmap being used");
2340#if defined(XEN) && defined(__x86_64__)
2341 for (i = 0; i < PDIR_SLOT_PTE; i++) {
2342 if (pmap->pm_pdir[i] != 0 &&
2343 ci->ci_kpm_pdir[i] == pmap->pm_pdir[i]) {
2344 printf("pmap_destroy(%p) pmap_kernel %p "
2345 "curcpu %d cpu %d ci_pmap %p "
2346 "ci->ci_kpm_pdir[%d]=%" PRIx64
2347 " pmap->pm_pdir[%d]=%" PRIx64 "\n",
2348 pmap, pmap_kernel(), curcpu()->ci_index,
2349 ci->ci_index, ci->ci_pmap,
2350 i, ci->ci_kpm_pdir[i],
2351 i, pmap->pm_pdir[i]);
2352 panic("pmap_destroy: used pmap");
2353 }
2354 }
2355#endif
2356 }
2357#endif /* DIAGNOSTIC */
2358
2359 /*
2360 * Reference count is zero, free pmap resources and then free pmap.
2361 * First, remove it from global list of pmaps.
2362 */
2363
2364 mutex_enter(&pmaps_lock);
2365 LIST_REMOVE(pmap, pm_list);
2366 mutex_exit(&pmaps_lock);
2367
2368 /*
2369 * Process deferred PTP frees. No TLB shootdown required, as the
2370 * PTP pages are no longer visible to any CPU.
2371 */
2372
2373 pmap_free_ptps(pmap->pm_gc_ptp);
2374
2375 /*
2376 * destroyed pmap shouldn't have remaining PTPs
2377 */
2378
2379 for (i = 0; i < PTP_LEVELS - 1; i++) {
2380 KASSERT(pmap->pm_obj[i].uo_npages == 0);
2381 KASSERT(TAILQ_EMPTY(&pmap->pm_obj[i].memq));
2382 }
2383
2384 pool_cache_put(&pmap_pdp_cache, pmap->pm_pdir);
2385
2386#ifdef USER_LDT
2387 if (pmap->pm_ldt != NULL) {
2388 /*
2389 * no need to switch the LDT; this address space is gone,
2390 * nothing is using it.
2391 *
2392 * No need to lock the pmap for ldt_free (or anything else),
2393 * we're the last one to use it.
2394 */
2395 mutex_enter(&cpu_lock);
2396 ldt_free(pmap->pm_ldt_sel);
2397 mutex_exit(&cpu_lock);
2398 uvm_km_free(kernel_map, (vaddr_t)pmap->pm_ldt,
2399 pmap->pm_ldt_len, UVM_KMF_WIRED);
2400 }
2401#endif
2402
2403 for (i = 0; i < PTP_LEVELS - 1; i++) {
2404 uvm_obj_destroy(&pmap->pm_obj[i], false);
2405 mutex_destroy(&pmap->pm_obj_lock[i]);
2406 }
2407 kcpuset_destroy(pmap->pm_cpus);
2408 kcpuset_destroy(pmap->pm_kernel_cpus);
2409#ifdef XEN
2410 kcpuset_destroy(pmap->pm_xen_ptp_cpus);
2411#endif
2412 pool_cache_put(&pmap_cache, pmap);
2413}
2414
2415/*
2416 * pmap_remove_all: pmap is being torn down by the current thread.
2417 * avoid unnecessary invalidations.
2418 */
2419
2420void
2421pmap_remove_all(struct pmap *pmap)
2422{
2423 lwp_t *l = curlwp;
2424
2425 KASSERT(l->l_md.md_gc_pmap == NULL);
2426
2427 l->l_md.md_gc_pmap = pmap;
2428}
2429
2430#if defined(PMAP_FORK)
2431/*
2432 * pmap_fork: perform any necessary data structure manipulation when
2433 * a VM space is forked.
2434 */
2435
2436void
2437pmap_fork(struct pmap *pmap1, struct pmap *pmap2)
2438{
2439#ifdef USER_LDT
2440 union descriptor *new_ldt;
2441 size_t len;
2442 int sel;
2443
2444 if (__predict_true(pmap1->pm_ldt == NULL)) {
2445 return;
2446 }
2447
2448 /*
2449 * Copy the LDT into the new process.
2450 *
2451 * Read pmap1's ldt pointer and length unlocked; if it changes
2452 * behind our back we'll retry. This will starve if there's a
2453 * stream of LDT changes in another thread but that should not
2454 * happen.
2455 */
2456
2457 retry:
2458 if (pmap1->pm_ldt != NULL) {
2459 len = pmap1->pm_ldt_len;
2460 /* Allocate space for the new process's LDT */
2461 new_ldt = (union descriptor *)uvm_km_alloc(kernel_map, len, 0,
2462 UVM_KMF_WIRED);
2463 if (new_ldt == NULL) {
2464 printf("WARNING: pmap_fork: "
2465 "unable to allocate LDT space\n");
2466 return;
2467 }
2468 mutex_enter(&cpu_lock);
2469 /* Get a GDT slot for it */
2470 sel = ldt_alloc(new_ldt, len);
2471 if (sel == -1) {
2472 mutex_exit(&cpu_lock);
2473 uvm_km_free(kernel_map, (vaddr_t)new_ldt, len,
2474 UVM_KMF_WIRED);
2475 printf("WARNING: pmap_fork: "
2476 "unable to allocate LDT selector\n");
2477 return;
2478 }
2479 } else {
2480 /* Wasn't anything there after all. */
2481 len = -1;
2482 new_ldt = NULL;
2483 sel = -1;
2484 mutex_enter(&cpu_lock);
2485 }
2486
2487 /* If there's still something there now that we have cpu_lock... */
2488 if (pmap1->pm_ldt != NULL) {
2489 if (len != pmap1->pm_ldt_len) {
2490 /* Oops, it changed. Drop what we did and try again */
2491 if (len != -1) {
2492 ldt_free(sel);
2493 uvm_km_free(kernel_map, (vaddr_t)new_ldt,
2494 len, UVM_KMF_WIRED);
2495 }
2496 mutex_exit(&cpu_lock);
2497 goto retry;
2498 }
2499
2500 /* Copy the LDT data and install it in pmap2 */
2501 memcpy(new_ldt, pmap1->pm_ldt, len);
2502 pmap2->pm_ldt = new_ldt;
2503 pmap2->pm_ldt_len = pmap1->pm_ldt_len;
2504 pmap2->pm_ldt_sel = sel;
2505 len = -1;
2506 }
2507
2508 if (len != -1) {
		/* The LDT went away in the meantime, so mop up. */
2510 ldt_free(sel);
2511 mutex_exit(&cpu_lock);
2512 uvm_km_free(kernel_map, (vaddr_t)new_ldt, len,
2513 UVM_KMF_WIRED);
2514 } else {
2515 mutex_exit(&cpu_lock);
2516 }
2517#endif /* USER_LDT */
2518}
2519#endif /* PMAP_FORK */
2520
2521#ifdef USER_LDT
2522
2523/*
2524 * pmap_ldt_xcall: cross call used by pmap_ldt_sync. if the named pmap
2525 * is active, reload LDTR.
2526 */
2527static void
2528pmap_ldt_xcall(void *arg1, void *arg2)
2529{
2530 struct pmap *pm;
2531
2532 kpreempt_disable();
2533 pm = arg1;
2534 if (curcpu()->ci_pmap == pm) {
2535 lldt(pm->pm_ldt_sel);
2536 }
2537 kpreempt_enable();
2538}
2539
2540/*
2541 * pmap_ldt_sync: LDT selector for the named pmap is changing. swap
2542 * in the new selector on all CPUs.
2543 */
2544void
2545pmap_ldt_sync(struct pmap *pm)
2546{
2547 uint64_t where;
2548
2549 KASSERT(mutex_owned(&cpu_lock));
2550
2551 pmap_ldt_evcnt.ev_count++;
2552 where = xc_broadcast(0, pmap_ldt_xcall, pm, NULL);
2553 xc_wait(where);
2554}
2555
2556/*
2557 * pmap_ldt_cleanup: if the pmap has a local LDT, deallocate it, and
2558 * restore the default.
2559 */
2560
2561void
2562pmap_ldt_cleanup(struct lwp *l)
2563{
2564 pmap_t pmap = l->l_proc->p_vmspace->vm_map.pmap;
2565 union descriptor *dp = NULL;
2566 size_t len = 0;
2567 int sel = -1;
2568
2569 if (__predict_true(pmap->pm_ldt == NULL)) {
2570 return;
2571 }
2572
2573 mutex_enter(&cpu_lock);
2574 if (pmap->pm_ldt != NULL) {
2575 sel = pmap->pm_ldt_sel;
2576 dp = pmap->pm_ldt;
2577 len = pmap->pm_ldt_len;
2578 pmap->pm_ldt_sel = GSYSSEL(GLDT_SEL, SEL_KPL);
2579 pmap->pm_ldt = NULL;
2580 pmap->pm_ldt_len = 0;
2581 pmap_ldt_sync(pmap);
2582 ldt_free(sel);
2583 uvm_km_free(kernel_map, (vaddr_t)dp, len, UVM_KMF_WIRED);
2584 }
2585 mutex_exit(&cpu_lock);
2586}
2587#endif /* USER_LDT */
2588
2589/*
2590 * pmap_activate: activate a process' pmap
2591 *
2592 * => must be called with kernel preemption disabled
2593 * => if lwp is the curlwp, then set ci_want_pmapload so that
2594 * actual MMU context switch will be done by pmap_load() later
2595 */
2596
2597void
2598pmap_activate(struct lwp *l)
2599{
2600 struct cpu_info *ci;
2601 struct pmap *pmap = vm_map_pmap(&l->l_proc->p_vmspace->vm_map);
2602
2603 KASSERT(kpreempt_disabled());
2604
2605 ci = curcpu();
2606
2607 if (l == ci->ci_curlwp) {
2608 KASSERT(ci->ci_want_pmapload == 0);
2609 KASSERT(ci->ci_tlbstate != TLBSTATE_VALID);
2610#ifdef KSTACK_CHECK_DR0
2611 /*
2612 * setup breakpoint on the top of stack
2613 */
2614 if (l == &lwp0)
2615 dr0(0, 0, 0, 0);
2616 else
2617 dr0(KSTACK_LOWEST_ADDR(l), 1, 3, 1);
2618#endif
2619
2620 /*
2621 * no need to switch to kernel vmspace because
2622 * it's a subset of any vmspace.
2623 */
2624
2625 if (pmap == pmap_kernel()) {
2626 ci->ci_want_pmapload = 0;
2627 return;
2628 }
2629
2630 ci->ci_want_pmapload = 1;
2631 }
2632}
2633
2634/*
2635 * pmap_reactivate: try to regain reference to the pmap.
2636 *
2637 * => Must be called with kernel preemption disabled.
2638 */
2639
2640static bool
2641pmap_reactivate(struct pmap *pmap)
2642{
2643 struct cpu_info * const ci = curcpu();
2644 const cpuid_t cid = cpu_index(ci);
2645 bool result;
2646
2647 KASSERT(kpreempt_disabled());
2648#if defined(XEN) && defined(__x86_64__)
2649 KASSERT(pmap_pdirpa(pmap, 0) == ci->ci_xen_current_user_pgd);
2650#elif defined(PAE)
2651 KASSERT(pmap_pdirpa(pmap, 0) == pmap_pte2pa(ci->ci_pae_l3_pdir[0]));
2652#elif !defined(XEN)
2653 KASSERT(pmap_pdirpa(pmap, 0) == pmap_pte2pa(rcr3()));
2654#endif
2655
2656 /*
2657 * If we still have a lazy reference to this pmap, we can assume
2658 * that there was no TLB shootdown for this pmap in the meantime.
2659 *
2660 * The order of events here is important as we must synchronize
2661 * with TLB shootdown interrupts. Declare interest in invalidations
2662 * (TLBSTATE_VALID) and then check the CPU set, which the IPIs can
2663 * change only when the state is TLBSTATE_LAZY.
2664 */
2665
2666 ci->ci_tlbstate = TLBSTATE_VALID;
2667 KASSERT(kcpuset_isset(pmap->pm_kernel_cpus, cid));
2668
2669 if (kcpuset_isset(pmap->pm_cpus, cid)) {
2670 /* We have the reference, state is valid. */
2671 result = true;
2672 } else {
2673 /* Must reload the TLB. */
2674 kcpuset_atomic_set(pmap->pm_cpus, cid);
2675 result = false;
2676 }
2677 return result;
2678}
2679
2680/*
2681 * pmap_load: perform the actual pmap switch, i.e. fill in %cr3 register
2682 * and relevant LDT info.
2683 *
2684 * Ensures that the current process' pmap is loaded on the current CPU's
2685 * MMU and that there are no stale TLB entries.
2686 *
2687 * => The caller should disable kernel preemption or do check-and-retry
2688 * to prevent a preemption from undoing our efforts.
2689 * => This function may block.
2690 */
2691void
2692pmap_load(void)
2693{
2694 struct cpu_info *ci;
2695 struct pmap *pmap, *oldpmap;
2696 struct lwp *l;
2697 struct pcb *pcb;
2698 cpuid_t cid;
2699 uint64_t ncsw;
2700
2701 kpreempt_disable();
2702 retry:
2703 ci = curcpu();
2704 if (!ci->ci_want_pmapload) {
2705 kpreempt_enable();
2706 return;
2707 }
2708 l = ci->ci_curlwp;
2709 ncsw = l->l_ncsw;
2710
2711 /* should be able to take ipis. */
2712 KASSERT(ci->ci_ilevel < IPL_HIGH);
2713#ifdef XEN
2714 /* Check to see if interrupts are enabled (ie; no events are masked) */
2715 KASSERT(x86_read_psl() == 0);
2716#else
2717 KASSERT((x86_read_psl() & PSL_I) != 0);
2718#endif
2719
2720 KASSERT(l != NULL);
2721 pmap = vm_map_pmap(&l->l_proc->p_vmspace->vm_map);
2722 KASSERT(pmap != pmap_kernel());
2723 oldpmap = ci->ci_pmap;
2724 pcb = lwp_getpcb(l);
2725
2726 if (pmap == oldpmap) {
2727 if (!pmap_reactivate(pmap)) {
2728 u_int gen = uvm_emap_gen_return();
2729
2730 /*
			 * The pmap has been changed while it was
			 * deactivated; our TLB may be stale.
2733 */
2734
2735 tlbflush();
2736 uvm_emap_update(gen);
2737 }
2738
2739 ci->ci_want_pmapload = 0;
2740 kpreempt_enable();
2741 return;
2742 }
2743
2744 /*
2745 * Acquire a reference to the new pmap and perform the switch.
2746 */
2747
2748 pmap_reference(pmap);
2749
2750 cid = cpu_index(ci);
2751 kcpuset_atomic_clear(oldpmap->pm_cpus, cid);
2752 kcpuset_atomic_clear(oldpmap->pm_kernel_cpus, cid);
2753
2754#if defined(XEN) && defined(__x86_64__)
2755 KASSERT(pmap_pdirpa(oldpmap, 0) == ci->ci_xen_current_user_pgd ||
2756 oldpmap == pmap_kernel());
2757#elif defined(PAE)
2758 KASSERT(pmap_pdirpa(oldpmap, 0) == pmap_pte2pa(ci->ci_pae_l3_pdir[0]));
2759#elif !defined(XEN)
2760 KASSERT(pmap_pdirpa(oldpmap, 0) == pmap_pte2pa(rcr3()));
2761#endif
2762 KASSERT(!kcpuset_isset(pmap->pm_cpus, cid));
2763 KASSERT(!kcpuset_isset(pmap->pm_kernel_cpus, cid));
2764
2765 /*
2766 * Mark the pmap in use by this CPU. Again, we must synchronize
2767 * with TLB shootdown interrupts, so set the state VALID first,
2768 * then register us for shootdown events on this pmap.
2769 */
2770 ci->ci_tlbstate = TLBSTATE_VALID;
2771 kcpuset_atomic_set(pmap->pm_cpus, cid);
2772 kcpuset_atomic_set(pmap->pm_kernel_cpus, cid);
2773 ci->ci_pmap = pmap;
2774
2775 /*
2776 * update tss. now that we have registered for invalidations
2777 * from other CPUs, we're good to load the page tables.
2778 */
2779#ifdef PAE
2780 pcb->pcb_cr3 = ci->ci_pae_l3_pdirpa;
2781#else
2782 pcb->pcb_cr3 = pmap_pdirpa(pmap, 0);
2783#endif
2784
2785#ifdef i386
2786#ifndef XEN
2787 ci->ci_tss.tss_ldt = pmap->pm_ldt_sel;
2788 ci->ci_tss.tss_cr3 = pcb->pcb_cr3;
2789#endif /* !XEN */
2790#endif /* i386 */
2791
2792 lldt(pmap->pm_ldt_sel);
2793
2794 u_int gen = uvm_emap_gen_return();
2795 cpu_load_pmap(pmap, oldpmap);
2796 uvm_emap_update(gen);
2797
2798 ci->ci_want_pmapload = 0;
2799
2800 /*
2801 * we're now running with the new pmap. drop the reference
2802 * to the old pmap. if we block, we need to go around again.
2803 */
2804
2805 pmap_destroy(oldpmap);
2806 if (l->l_ncsw != ncsw) {
2807 goto retry;
2808 }
2809
2810 kpreempt_enable();
2811}
2812
2813/*
2814 * pmap_deactivate: deactivate a process' pmap.
2815 *
2816 * => Must be called with kernel preemption disabled (high IPL is enough).
2817 */
2818void
2819pmap_deactivate(struct lwp *l)
2820{
2821 struct pmap *pmap;
2822 struct cpu_info *ci;
2823
2824 KASSERT(kpreempt_disabled());
2825
2826 if (l != curlwp) {
2827 return;
2828 }
2829
2830 /*
2831 * Wait for pending TLB shootdowns to complete. Necessary because
2832 * TLB shootdown state is per-CPU, and the LWP may be coming off
2833 * the CPU before it has a chance to call pmap_update(), e.g. due
2834 * to kernel preemption or blocking routine in between.
2835 */
2836 pmap_tlb_shootnow();
2837
2838 ci = curcpu();
2839
2840 if (ci->ci_want_pmapload) {
2841 /*
2842 * ci_want_pmapload means that our pmap is not loaded on
2843 * the CPU or TLB might be stale. note that pmap_kernel()
2844 * is always considered loaded.
2845 */
2846 KASSERT(vm_map_pmap(&l->l_proc->p_vmspace->vm_map)
2847 != pmap_kernel());
2848 KASSERT(vm_map_pmap(&l->l_proc->p_vmspace->vm_map)
2849 != ci->ci_pmap || ci->ci_tlbstate != TLBSTATE_VALID);
2850
2851 /*
2852 * userspace has not been touched.
2853 * nothing to do here.
2854 */
2855
2856 ci->ci_want_pmapload = 0;
2857 return;
2858 }
2859
2860 pmap = vm_map_pmap(&l->l_proc->p_vmspace->vm_map);
2861
2862 if (pmap == pmap_kernel()) {
2863 return;
2864 }
2865
2866#if defined(XEN) && defined(__x86_64__)
2867 KASSERT(pmap_pdirpa(pmap, 0) == ci->ci_xen_current_user_pgd);
2868#elif defined(PAE)
2869 KASSERT(pmap_pdirpa(pmap, 0) == pmap_pte2pa(ci->ci_pae_l3_pdir[0]));
2870#elif !defined(XEN)
2871 KASSERT(pmap_pdirpa(pmap, 0) == pmap_pte2pa(rcr3()));
2872#endif
2873 KASSERT(ci->ci_pmap == pmap);
2874
2875 /*
2876 * we aren't interested in TLB invalidations for this pmap,
2877 * at least for the time being.
2878 */
2879
2880 KASSERT(ci->ci_tlbstate == TLBSTATE_VALID);
2881 ci->ci_tlbstate = TLBSTATE_LAZY;
2882}
2883
2884/*
2885 * end of lifecycle functions
2886 */
2887
2888/*
2889 * some misc. functions
2890 */
2891
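/*
 * pmap_pdes_invalid: check that the PDEs mapping 'va' are valid at every
 * level above the PTEs.  Returns 0 if they all are, storing the
 * lowest-level PDE in *lastpde if requested; otherwise returns the level
 * of the first invalid PDE found while walking down from the top.
 */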
2892int
2893pmap_pdes_invalid(vaddr_t va, pd_entry_t * const *pdes, pd_entry_t *lastpde)
2894{
2895 int i;
2896 unsigned long index;
2897 pd_entry_t pde;
2898
2899 for (i = PTP_LEVELS; i > 1; i--) {
2900 index = pl_i(va, i);
2901 pde = pdes[i - 2][index];
2902 if ((pde & PG_V) == 0)
2903 return i;
2904 }
2905 if (lastpde != NULL)
2906 *lastpde = pde;
2907 return 0;
2908}
2909
2910/*
2911 * pmap_extract: extract a PA for the given VA
2912 */
2913
2914bool
2915pmap_extract(struct pmap *pmap, vaddr_t va, paddr_t *pap)
2916{
2917 pt_entry_t *ptes, pte;
2918 pd_entry_t pde;
2919 pd_entry_t * const *pdes;
2920 struct pmap *pmap2;
2921 struct cpu_info *ci;
2922 paddr_t pa;
2923 lwp_t *l;
2924 bool hard, rv;
2925
2926#ifdef __HAVE_DIRECT_MAP
2927 if (va >= PMAP_DIRECT_BASE && va < PMAP_DIRECT_END) {
2928 if (pap != NULL) {
2929 *pap = va - PMAP_DIRECT_BASE;
2930 }
2931 return true;
2932 }
2933#endif
2934
2935 rv = false;
2936 pa = 0;
2937 l = curlwp;
2938
2939 kpreempt_disable();
2940 ci = l->l_cpu;
2941 if (__predict_true(!ci->ci_want_pmapload && ci->ci_pmap == pmap) ||
2942 pmap == pmap_kernel()) {
2943 /*
2944 * no need to lock, because it's pmap_kernel() or our
2945 * own pmap and is active. if a user pmap, the caller
2946 * will hold the vm_map write/read locked and so prevent
2947 * entries from disappearing while we are here. ptps
2948 * can disappear via pmap_remove() and pmap_protect(),
2949 * but they are called with the vm_map write locked.
2950 */
2951 hard = false;
2952 ptes = PTE_BASE;
2953 pdes = normal_pdes;
2954 } else {
2955 /* we lose, do it the hard way. */
2956 hard = true;
2957 pmap_map_ptes(pmap, &pmap2, &ptes, &pdes);
2958 }
2959 if (pmap_pdes_valid(va, pdes, &pde)) {
2960 pte = ptes[pl1_i(va)];
2961 if (pde & PG_PS) {
2962 pa = (pde & PG_LGFRAME) | (va & (NBPD_L2 - 1));
2963 rv = true;
2964 } else if (__predict_true((pte & PG_V) != 0)) {
2965 pa = pmap_pte2pa(pte) | (va & (NBPD_L1 - 1));
2966 rv = true;
2967 }
2968 }
2969 if (__predict_false(hard)) {
2970 pmap_unmap_ptes(pmap, pmap2);
2971 }
2972 kpreempt_enable();
2973 if (pap != NULL) {
2974 *pap = pa;
2975 }
2976 return rv;
2977}
2978
2979
2980/*
2981 * vtophys: virtual address to physical address. For use by
2982 * machine-dependent code only.
2983 */
2984
2985paddr_t
2986vtophys(vaddr_t va)
2987{
2988 paddr_t pa;
2989
2990 if (pmap_extract(pmap_kernel(), va, &pa) == true)
2991 return (pa);
2992 return (0);
2993}
2994
2995__strict_weak_alias(pmap_extract_ma, pmap_extract);
2996
2997#ifdef XEN
2998
2999/*
3000 * vtomach: virtual address to machine address. For use by
3001 * machine-dependent code only.
3002 */
3003
3004paddr_t
3005vtomach(vaddr_t va)
3006{
3007 paddr_t pa;
3008
3009 if (pmap_extract_ma(pmap_kernel(), va, &pa) == true)
3010 return (pa);
3011 return (0);
3012}
3013
3014#endif /* XEN */
3015
3016/*
3017 * pmap_virtual_space: used during bootup [pmap_steal_memory] to
 * determine the bounds of the kernel virtual address space.
3019 */
3020
3021void
3022pmap_virtual_space(vaddr_t *startp, vaddr_t *endp)
3023{
3024 *startp = virtual_avail;
3025 *endp = virtual_end;
3026}
3027
3028/*
3029 * pmap_zero_page: zero a page
3030 */
3031
3032void
3033pmap_zero_page(paddr_t pa)
3034{
3035#if defined(__HAVE_DIRECT_MAP)
3036 pagezero(PMAP_DIRECT_MAP(pa));
3037#else
3038#if defined(XEN)
	if (XEN_VERSION_SUPPORTED(3, 4)) {
		xen_pagezero(pa);
		return;
	}
#endif
3042 pt_entry_t *zpte;
3043 void *zerova;
3044 int id;
3045
3046 const pd_entry_t pteflags = PG_V | PG_RW | pmap_pg_nx | PG_M | PG_U |
3047 PG_k;
3048
3049 kpreempt_disable();
3050 id = cpu_number();
3051 zpte = PTESLEW(zero_pte, id);
3052 zerova = VASLEW(zerop, id);
3053
3054#ifdef DIAGNOSTIC
3055 if (*zpte)
3056 panic("pmap_zero_page: lock botch");
3057#endif
3058
3059 pmap_pte_set(zpte, pmap_pa2pte(pa) | pteflags);
3060 pmap_pte_flush();
3061 pmap_update_pg((vaddr_t)zerova); /* flush TLB */
3062
3063 memset(zerova, 0, PAGE_SIZE);
3064
3065#if defined(DIAGNOSTIC) || defined(XEN)
3066 pmap_pte_set(zpte, 0); /* zap ! */
3067 pmap_pte_flush();
3068#endif
3069
3070 kpreempt_enable();
3071#endif /* defined(__HAVE_DIRECT_MAP) */
3072}
3073
3074/*
 * pmap_pageidlezero: the same, for the idle loop page zero'er.
3076 * Returns true if the page was zero'd, false if we aborted for
3077 * some reason.
3078 */
3079
3080bool
3081pmap_pageidlezero(paddr_t pa)
3082{
3083#ifdef __HAVE_DIRECT_MAP
3084 KASSERT(cpu_feature[0] & CPUID_SSE2);
3085 return sse2_idlezero_page((void *)PMAP_DIRECT_MAP(pa));
3086#else
3087 pt_entry_t *zpte;
3088 void *zerova;
3089 bool rv;
3090 int id;
3091
3092 const pd_entry_t pteflags = PG_V | PG_RW | pmap_pg_nx | PG_M | PG_U |
3093 PG_k;
3094
3095 id = cpu_number();
3096 zpte = PTESLEW(zero_pte, id);
3097 zerova = VASLEW(zerop, id);
3098
3099 KASSERT(cpu_feature[0] & CPUID_SSE2);
3100 KASSERT(*zpte == 0);
3101
3102 pmap_pte_set(zpte, pmap_pa2pte(pa) | pteflags);
3103 pmap_pte_flush();
3104 pmap_update_pg((vaddr_t)zerova); /* flush TLB */
3105
3106 rv = sse2_idlezero_page(zerova);
3107
3108#if defined(DIAGNOSTIC) || defined(XEN)
3109 pmap_pte_set(zpte, 0); /* zap ! */
3110 pmap_pte_flush();
3111#endif
3112
3113 return rv;
3114#endif
3115}
3116
3117/*
3118 * pmap_copy_page: copy a page
3119 */
3120
3121void
3122pmap_copy_page(paddr_t srcpa, paddr_t dstpa)
3123{
3124#if defined(__HAVE_DIRECT_MAP)
3125 vaddr_t srcva = PMAP_DIRECT_MAP(srcpa);
3126 vaddr_t dstva = PMAP_DIRECT_MAP(dstpa);
3127
3128 memcpy((void *)dstva, (void *)srcva, PAGE_SIZE);
3129#else
3130#if defined(XEN)
3131 if (XEN_VERSION_SUPPORTED(3, 4)) {
3132 xen_copy_page(srcpa, dstpa);
3133 return;
3134 }
3135#endif
3136 pt_entry_t *spte;
3137 pt_entry_t *dpte;
3138 void *csrcva;
3139 void *cdstva;
3140 int id;
3141
3142 const pd_entry_t pteflags = PG_V | PG_RW | pmap_pg_nx | PG_U | PG_k;
3143
3144 kpreempt_disable();
3145 id = cpu_number();
	spte = PTESLEW(csrc_pte, id);
	dpte = PTESLEW(cdst_pte, id);
3148 csrcva = VASLEW(csrcp, id);
3149 cdstva = VASLEW(cdstp, id);
3150
3151 KASSERT(*spte == 0 && *dpte == 0);
3152
3153 pmap_pte_set(spte, pmap_pa2pte(srcpa) | pteflags);
3154 pmap_pte_set(dpte, pmap_pa2pte(dstpa) | pteflags | PG_M);
3155 pmap_pte_flush();
3156 pmap_update_2pg((vaddr_t)csrcva, (vaddr_t)cdstva);
3157
3158 memcpy(cdstva, csrcva, PAGE_SIZE);
3159
3160#if defined(DIAGNOSTIC) || defined(XEN)
3161 pmap_pte_set(spte, 0);
3162 pmap_pte_set(dpte, 0);
3163 pmap_pte_flush();
3164#endif
3165
3166 kpreempt_enable();
3167#endif /* defined(__HAVE_DIRECT_MAP) */
3168}
3169
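/*
 * pmap_map_ptp: map a PTP into the current CPU's address space so that
 * its PTEs can be examined and modified.  Uses the direct map when
 * available, otherwise this CPU's private ptp_pte/ptpp slot.
 */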
3170static pt_entry_t *
3171pmap_map_ptp(struct vm_page *ptp)
3172{
3173#ifdef __HAVE_DIRECT_MAP
3174 return (void *)PMAP_DIRECT_MAP(VM_PAGE_TO_PHYS(ptp));
3175#else
3176 pt_entry_t *ptppte;
3177 void *ptpva;
3178 int id;
3179
3180 KASSERT(kpreempt_disabled());
3181
3182#ifndef XEN
3183 const pd_entry_t pteflags = PG_V | PG_RW | pmap_pg_nx | PG_U | PG_M |
3184 PG_k;
3185#else
3186 const pd_entry_t pteflags = PG_V | pmap_pg_nx | PG_U | PG_M | PG_k;
3187#endif
3188
3189 id = cpu_number();
3190 ptppte = PTESLEW(ptp_pte, id);
3191 ptpva = VASLEW(ptpp, id);
3192 pmap_pte_set(ptppte, pmap_pa2pte(VM_PAGE_TO_PHYS(ptp)) | pteflags);
3193
3194 pmap_pte_flush();
3195 pmap_update_pg((vaddr_t)ptpva);
3196
3197 return (pt_entry_t *)ptpva;
3198#endif
3199}
3200
3201static void
3202pmap_unmap_ptp(void)
3203{
3204#ifndef __HAVE_DIRECT_MAP
3205#if defined(DIAGNOSTIC) || defined(XEN)
3206 pt_entry_t *pte;
3207
3208 KASSERT(kpreempt_disabled());
3209
3210 pte = PTESLEW(ptp_pte, cpu_number());
3211 if (*pte != 0) {
3212 pmap_pte_set(pte, 0);
3213 pmap_pte_flush();
3214 }
3215#endif
3216#endif
3217}
3218
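/*
 * pmap_map_pte: return a pointer to the PTE mapping 'va'.  For the
 * current pmap the recursive PTE area is used directly; otherwise the
 * given PTP is temporarily mapped with pmap_map_ptp().
 */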
3219static pt_entry_t *
3220pmap_map_pte(struct pmap *pmap, struct vm_page *ptp, vaddr_t va)
3221{
3222
3223 KASSERT(kpreempt_disabled());
3224 if (pmap_is_curpmap(pmap)) {
3225 return &PTE_BASE[pl1_i(va)]; /* (k)vtopte */
3226 }
3227 KASSERT(ptp != NULL);
3228 return pmap_map_ptp(ptp) + pl1_pi(va);
3229}
3230
3231static void
3232pmap_unmap_pte(void)
3233{
3234
3235 KASSERT(kpreempt_disabled());
3236
3237 pmap_unmap_ptp();
3238}
3239
3240/*
3241 * p m a p r e m o v e f u n c t i o n s
3242 *
3243 * functions that remove mappings
3244 */
3245
3246/*
3247 * pmap_remove_ptes: remove PTEs from a PTP
3248 *
3249 * => caller must hold pmap's lock
3250 * => PTP must be mapped into KVA
3251 * => PTP should be null if pmap == pmap_kernel()
3252 * => must be called with kernel preemption disabled
 * => TLB shootdowns are issued as needed for each removed mapping
3254 */
3255
3256static void
3257pmap_remove_ptes(struct pmap *pmap, struct vm_page *ptp, vaddr_t ptpva,
3258 vaddr_t startva, vaddr_t endva, struct pv_entry **pv_tofree)
3259{
3260 pt_entry_t *pte = (pt_entry_t *)ptpva;
3261
3262 KASSERT(pmap == pmap_kernel() || mutex_owned(pmap->pm_lock));
3263 KASSERT(kpreempt_disabled());
3264
3265 /*
3266 * note that ptpva points to the PTE that maps startva. this may
3267 * or may not be the first PTE in the PTP.
3268 *
3269 * we loop through the PTP while there are still PTEs to look at
3270 * and the wire_count is greater than 1 (because we use the wire_count
3271 * to keep track of the number of real PTEs in the PTP).
3272 */
3273 while (startva < endva && (ptp == NULL || ptp->wire_count > 1)) {
3274 (void)pmap_remove_pte(pmap, ptp, pte, startva, pv_tofree);
3275 startva += PAGE_SIZE;
3276 pte++;
3277 }
3278}
3279
3280
3281/*
3282 * pmap_remove_pte: remove a single PTE from a PTP.
3283 *
3284 * => caller must hold pmap's lock
3285 * => PTP must be mapped into KVA
3286 * => PTP should be null if pmap == pmap_kernel()
3287 * => returns true if we removed a mapping
3288 * => must be called with kernel preemption disabled
3289 */
3290static bool
3291pmap_remove_pte(struct pmap *pmap, struct vm_page *ptp, pt_entry_t *pte,
3292 vaddr_t va, struct pv_entry **pv_tofree)
3293{
3294 struct pv_entry *pve;
3295 struct vm_page *pg;
3296 struct pmap_page *pp;
3297 pt_entry_t opte;
3298
3299 KASSERT(pmap == pmap_kernel() || mutex_owned(pmap->pm_lock));
3300 KASSERT(kpreempt_disabled());
3301
3302 if (!pmap_valid_entry(*pte)) {
3303 /* VA not mapped. */
3304 return false;
3305 }
3306
3307 /* Atomically save the old PTE and zap it. */
3308 opte = pmap_pte_testset(pte, 0);
3309 if (!pmap_valid_entry(opte)) {
3310 return false;
3311 }
3312
3313 pmap_exec_account(pmap, va, opte, 0);
3314 pmap_stats_update_bypte(pmap, 0, opte);
3315
3316 if (ptp) {
3317 /*
3318 * Dropping a PTE. Make sure that the PDE is flushed.
3319 */
3320 ptp->wire_count--;
3321 if (ptp->wire_count <= 1) {
3322 opte |= PG_U;
3323 }
3324 }
3325
3326 if ((opte & PG_U) != 0) {
3327 pmap_tlb_shootdown(pmap, va, opte, TLBSHOOT_REMOVE_PTE);
3328 }
3329
3330 /*
3331 * If we are not on a pv_head list - we are done.
3332 */
3333 if ((opte & PG_PVLIST) == 0) {
3334#if defined(DIAGNOSTIC) && !defined(DOM0OPS)
3335 if (PHYS_TO_VM_PAGE(pmap_pte2pa(opte)) != NULL ||
3336 pmap_pv_tracked(pmap_pte2pa(opte)) != NULL)
3337 panic("pmap_remove_pte: managed or pv-tracked page"
3338 " without PG_PVLIST for %#"PRIxVADDR, va);
3339#endif
3340 return true;
3341 }
3342
3343 if ((pg = PHYS_TO_VM_PAGE(pmap_pte2pa(opte))) != NULL) {
3344 KASSERT(uvm_page_locked_p(pg));
3345 pp = VM_PAGE_TO_PP(pg);
3346 } else if ((pp = pmap_pv_tracked(pmap_pte2pa(opte))) == NULL) {
3347 paddr_t pa = pmap_pte2pa(opte);
3348 panic("pmap_remove_pte: PG_PVLIST with pv-untracked page"
3349 " va = 0x%"PRIxVADDR
3350 " pa = 0x%"PRIxPADDR" (0x%"PRIxPADDR")",
3351 va, pa, atop(pa));
3352 }
3353
3354 /* Sync R/M bits. */
3355 pp->pp_attrs |= opte;
3356 pve = pmap_remove_pv(pp, ptp, va);
3357
3358 if (pve) {
3359 pve->pve_next = *pv_tofree;
3360 *pv_tofree = pve;
3361 }
3362 return true;
3363}
3364
3365/*
3366 * pmap_remove: mapping removal function.
3367 *
3368 * => caller should not be holding any pmap locks
3369 */
3370
3371void
3372pmap_remove(struct pmap *pmap, vaddr_t sva, vaddr_t eva)
3373{
3374 pt_entry_t *ptes;
3375 pd_entry_t pde;
3376 pd_entry_t * const *pdes;
3377 struct pv_entry *pv_tofree = NULL;
3378 bool result;
3379 int i;
3380 paddr_t ptppa;
3381 vaddr_t blkendva, va = sva;
3382 struct vm_page *ptp;
3383 struct pmap *pmap2;
3384
3385 kpreempt_disable();
3386 pmap_map_ptes(pmap, &pmap2, &ptes, &pdes); /* locks pmap */
3387
3388 /*
3389 * removing one page? take shortcut function.
3390 */
3391
3392 if (va + PAGE_SIZE == eva) {
3393 if (pmap_pdes_valid(va, pdes, &pde)) {
3394
3395 /* PA of the PTP */
3396 ptppa = pmap_pte2pa(pde);
3397
3398 /* Get PTP if non-kernel mapping. */
3399 if (pmap != pmap_kernel()) {
3400 ptp = pmap_find_ptp(pmap, va, ptppa, 1);
3401 KASSERTMSG(ptp != NULL,
3402 "pmap_remove: unmanaged PTP detected");
3403 } else {
3404 /* Never free kernel PTPs. */
3405 ptp = NULL;
3406 }
3407
3408 result = pmap_remove_pte(pmap, ptp,
3409 &ptes[pl1_i(va)], va, &pv_tofree);
3410
3411 /*
3412 * if mapping removed and the PTP is no longer
3413 * being used, free it!
3414 */
3415
3416 if (result && ptp && ptp->wire_count <= 1)
3417 pmap_free_ptp(pmap, ptp, va, ptes, pdes);
3418 }
3419 } else for (/* null */ ; va < eva ; va = blkendva) {
3420 int lvl;
3421
3422 /* determine range of block */
3423 blkendva = x86_round_pdr(va+1);
3424 if (blkendva > eva)
3425 blkendva = eva;
3426
3427 /*
3428 * Our PTE mappings should never be removed with pmap_remove.
3429 *
3430 * XXXmaxv: still needed?
3431 *
3432 * A long term solution is to move the PTEs out of user address
3433 * space, and into kernel address space. Then we can set
3434 * VM_MAXUSER_ADDRESS to be VM_MAX_ADDRESS.
3435 */
3436 for (i = 0; i < PDP_SIZE; i++) {
3437 if (pl_i(va, PTP_LEVELS) == PDIR_SLOT_PTE+i)
3438 panic("PTE space accessed");
3439 }
3440
3441 lvl = pmap_pdes_invalid(va, pdes, &pde);
3442 if (lvl != 0) {
3443 /*
3444 * skip a range corresponding to an invalid pde.
3445 */
3446 blkendva = (va & ptp_masks[lvl - 1]) + nbpd[lvl - 1];
3447 continue;
3448 }
3449
3450 /* PA of the PTP */
3451 ptppa = pmap_pte2pa(pde);
3452
3453 /* Get PTP if non-kernel mapping. */
3454 if (pmap != pmap_kernel()) {
3455 ptp = pmap_find_ptp(pmap, va, ptppa, 1);
3456 KASSERTMSG(ptp != NULL,
3457 "pmap_remove: unmanaged PTP detected");
3458 } else {
3459 /* Never free kernel PTPs. */
3460 ptp = NULL;
3461 }
3462
3463 pmap_remove_ptes(pmap, ptp, (vaddr_t)&ptes[pl1_i(va)], va,
3464 blkendva, &pv_tofree);
3465
3466 /* if PTP is no longer being used, free it! */
3467 if (ptp && ptp->wire_count <= 1) {
3468 pmap_free_ptp(pmap, ptp, va, ptes, pdes);
3469 }
3470 }
3471 pmap_unmap_ptes(pmap, pmap2); /* unlock pmap */
3472 kpreempt_enable();
3473
3474 /* Now we free unused PVs */
3475 if (pv_tofree)
3476 pmap_free_pvs(pv_tofree);
3477}
3478
3479/*
3480 * pmap_sync_pv: clear pte bits and return the old value of the pte.
3481 *
3482 * => Caller should disable kernel preemption.
3483 * => issues tlb shootdowns if necessary.
3484 */
3485
3486static int
3487pmap_sync_pv(struct pv_pte *pvpte, pt_entry_t expect, int clearbits,
3488 pt_entry_t *optep)
3489{
3490 struct pmap *pmap;
3491 struct vm_page *ptp;
3492 vaddr_t va;
3493 pt_entry_t *ptep;
3494 pt_entry_t opte;
3495 pt_entry_t npte;
3496 bool need_shootdown;
3497
3498 ptp = pvpte->pte_ptp;
3499 va = pvpte->pte_va;
3500 KASSERT(ptp == NULL || ptp->uobject != NULL);
3501 KASSERT(ptp == NULL || ptp_va2o(va, 1) == ptp->offset);
3502 pmap = ptp_to_pmap(ptp);
3503
3504 KASSERT((expect & ~(PG_FRAME | PG_V)) == 0);
3505 KASSERT((expect & PG_V) != 0);
3506 KASSERT(clearbits == ~0 || (clearbits & ~(PG_M | PG_U | PG_RW)) == 0);
3507 KASSERT(kpreempt_disabled());
3508
3509 ptep = pmap_map_pte(pmap, ptp, va);
3510 do {
3511 opte = *ptep;
3512 KASSERT((opte & (PG_M | PG_U)) != PG_M);
3513 KASSERT((opte & (PG_U | PG_V)) != PG_U);
3514 KASSERT(opte == 0 || (opte & PG_V) != 0);
3515 if ((opte & (PG_FRAME | PG_V)) != expect) {
3516
3517 /*
3518 * we lost a race with a V->P operation like
3519 * pmap_remove(). wait for the competitor
3520 * reflecting pte bits into mp_attrs.
3521 *
3522 * issue a redundant TLB shootdown so that
3523 * we can wait for its completion.
3524 */
3525
3526 pmap_unmap_pte();
3527 if (clearbits != 0) {
3528 pmap_tlb_shootdown(pmap, va,
3529 (pmap == pmap_kernel() ? PG_G : 0),
3530 TLBSHOOT_SYNC_PV1);
3531 }
3532 return EAGAIN;
3533 }
3534
3535 /*
3536 * check if there's anything to do on this pte.
3537 */
3538
3539 if ((opte & clearbits) == 0) {
3540 need_shootdown = false;
3541 break;
3542 }
3543
3544 /*
3545 * we need a shootdown if the pte is cached. (PG_U)
3546 *
3547 * ...unless we are clearing only the PG_RW bit and
3548 * it isn't cached as RW. (PG_M)
3549 */
3550
3551 need_shootdown = (opte & PG_U) != 0 &&
3552 !(clearbits == PG_RW && (opte & PG_M) == 0);
3553
3554 npte = opte & ~clearbits;
3555
3556 /*
3557 * if we need a shootdown anyway, clear PG_U and PG_M.
3558 */
3559
3560 if (need_shootdown) {
3561 npte &= ~(PG_U | PG_M);
3562 }
3563 KASSERT((npte & (PG_M | PG_U)) != PG_M);
3564 KASSERT((npte & (PG_U | PG_V)) != PG_U);
3565 KASSERT(npte == 0 || (opte & PG_V) != 0);
3566 } while (pmap_pte_cas(ptep, opte, npte) != opte);
3567
3568 if (need_shootdown) {
3569 pmap_tlb_shootdown(pmap, va, opte, TLBSHOOT_SYNC_PV2);
3570 }
3571 pmap_unmap_pte();
3572
3573 *optep = opte;
3574 return 0;
3575}
3576
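/*
 * pmap_pp_remove: remove all mappings of a pmap_page; worker for
 * pmap_page_remove() and pmap_pv_remove().
 *
 * => R/M bits of the removed PTEs are synced into pp_attrs
 */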
3577static void
3578pmap_pp_remove(struct pmap_page *pp, paddr_t pa)
3579{
3580 struct pv_pte *pvpte;
3581 struct pv_entry *killlist = NULL;
3582 struct vm_page *ptp;
3583 pt_entry_t expect;
3584 int count;
3585
3586 expect = pmap_pa2pte(pa) | PG_V;
3587 count = SPINLOCK_BACKOFF_MIN;
3588 kpreempt_disable();
3589startover:
3590 while ((pvpte = pv_pte_first(pp)) != NULL) {
3591 struct pmap *pmap;
3592 struct pv_entry *pve;
3593 pt_entry_t opte;
3594 vaddr_t va;
3595 int error;
3596
3597 /*
3598 * add a reference to the pmap before clearing the pte.
3599 * otherwise the pmap can disappear behind us.
3600 */
3601
3602 ptp = pvpte->pte_ptp;
3603 pmap = ptp_to_pmap(ptp);
3604 if (ptp != NULL) {
3605 pmap_reference(pmap);
3606 }
3607
3608 error = pmap_sync_pv(pvpte, expect, ~0, &opte);
3609 if (error == EAGAIN) {
3610 int hold_count;
3611 KERNEL_UNLOCK_ALL(curlwp, &hold_count);
3612 if (ptp != NULL) {
3613 pmap_destroy(pmap);
3614 }
3615 SPINLOCK_BACKOFF(count);
3616 KERNEL_LOCK(hold_count, curlwp);
3617 goto startover;
3618 }
3619
3620 pp->pp_attrs |= opte;
3621 va = pvpte->pte_va;
3622 pve = pmap_remove_pv(pp, ptp, va);
3623
3624 /* update the PTP reference count. free if last reference. */
3625 if (ptp != NULL) {
3626 struct pmap *pmap2;
3627 pt_entry_t *ptes;
3628 pd_entry_t * const *pdes;
3629
3630 KASSERT(pmap != pmap_kernel());
3631
3632 pmap_tlb_shootnow();
3633 pmap_map_ptes(pmap, &pmap2, &ptes, &pdes);
3634 pmap_stats_update_bypte(pmap, 0, opte);
3635 ptp->wire_count--;
3636 if (ptp->wire_count <= 1) {
3637 pmap_free_ptp(pmap, ptp, va, ptes, pdes);
3638 }
3639 pmap_unmap_ptes(pmap, pmap2);
3640 pmap_destroy(pmap);
3641 } else {
3642 KASSERT(pmap == pmap_kernel());
3643 pmap_stats_update_bypte(pmap, 0, opte);
3644 }
3645
3646 if (pve != NULL) {
3647 pve->pve_next = killlist; /* mark it for death */
3648 killlist = pve;
3649 }
3650 }
3651 pmap_tlb_shootnow();
3652 kpreempt_enable();
3653
3654 /* Now free unused pvs. */
3655 pmap_free_pvs(killlist);
3656}
3657
3658/*
3659 * pmap_page_remove: remove a managed vm_page from all pmaps that map it
3660 *
3661 * => R/M bits are sync'd back to attrs
3662 */
3663
3664void
3665pmap_page_remove(struct vm_page *pg)
3666{
3667 struct pmap_page *pp;
3668 paddr_t pa;
3669
3670 KASSERT(uvm_page_locked_p(pg));
3671
3672 pp = VM_PAGE_TO_PP(pg);
3673 pa = VM_PAGE_TO_PHYS(pg);
3674 pmap_pp_remove(pp, pa);
3675}
3676
3677/*
3678 * pmap_pv_remove: remove an unmanaged pv-tracked page from all pmaps
3679 * that map it
3680 */
3681
3682void
3683pmap_pv_remove(paddr_t pa)
3684{
3685 struct pmap_page *pp;
3686
3687 pp = pmap_pv_tracked(pa);
3688 if (pp == NULL)
		panic("pmap_pv_remove: page not pv-tracked: 0x%"PRIxPADDR,
3690 pa);
3691 pmap_pp_remove(pp, pa);
3692}
3693
3694/*
3695 * p m a p a t t r i b u t e f u n c t i o n s
3696 * functions that test/change managed page's attributes
3697 * since a page can be mapped multiple times we must check each PTE that
3698 * maps it by going down the pv lists.
3699 */
3700
3701/*
3702 * pmap_test_attrs: test a page's attributes
3703 */
3704
3705bool
3706pmap_test_attrs(struct vm_page *pg, unsigned testbits)
3707{
3708 struct pmap_page *pp;
3709 struct pv_pte *pvpte;
3710 pt_entry_t expect;
3711 u_int result;
3712
3713 KASSERT(uvm_page_locked_p(pg));
3714
3715 pp = VM_PAGE_TO_PP(pg);
3716 if ((pp->pp_attrs & testbits) != 0) {
3717 return true;
3718 }
3719 expect = pmap_pa2pte(VM_PAGE_TO_PHYS(pg)) | PG_V;
3720 kpreempt_disable();
3721 for (pvpte = pv_pte_first(pp); pvpte; pvpte = pv_pte_next(pp, pvpte)) {
3722 pt_entry_t opte;
3723 int error;
3724
3725 if ((pp->pp_attrs & testbits) != 0) {
3726 break;
3727 }
3728 error = pmap_sync_pv(pvpte, expect, 0, &opte);
3729 if (error == 0) {
3730 pp->pp_attrs |= opte;
3731 }
3732 }
3733 result = pp->pp_attrs & testbits;
3734 kpreempt_enable();
3735
3736 /*
	 * note that we will exit the for loop with a non-null pvpte if
	 * we have found the bits we are testing for.
3739 */
3740
3741 return result != 0;
3742}
3743
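/*
 * pmap_pp_clear_attrs: worker for pmap_clear_attrs() and
 * pmap_pv_clear_attrs(): clear the named bits in every PTE mapping the
 * page, then in the page's attribute word.  Returns true if any of the
 * requested bits was found set.
 */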
3744static bool
3745pmap_pp_clear_attrs(struct pmap_page *pp, paddr_t pa, unsigned clearbits)
3746{
3747 struct pv_pte *pvpte;
3748 u_int result;
3749 pt_entry_t expect;
3750 int count;
3751
3752 expect = pmap_pa2pte(pa) | PG_V;
3753 count = SPINLOCK_BACKOFF_MIN;
3754 kpreempt_disable();
3755startover:
3756 for (pvpte = pv_pte_first(pp); pvpte; pvpte = pv_pte_next(pp, pvpte)) {
3757 pt_entry_t opte;
3758 int error;
3759
3760 error = pmap_sync_pv(pvpte, expect, clearbits, &opte);
3761 if (error == EAGAIN) {
3762 int hold_count;
3763 KERNEL_UNLOCK_ALL(curlwp, &hold_count);
3764 SPINLOCK_BACKOFF(count);
3765 KERNEL_LOCK(hold_count, curlwp);
3766 goto startover;
3767 }
3768 pp->pp_attrs |= opte;
3769 }
3770 result = pp->pp_attrs & clearbits;
3771 pp->pp_attrs &= ~clearbits;
3772 pmap_tlb_shootnow();
3773 kpreempt_enable();
3774
3775 return result != 0;
3776}
3777
3778/*
3779 * pmap_clear_attrs: clear the specified attribute for a page.
3780 *
3781 * => we return true if we cleared one of the bits we were asked to
3782 */
3783
3784bool
3785pmap_clear_attrs(struct vm_page *pg, unsigned clearbits)
3786{
3787 struct pmap_page *pp;
3788 paddr_t pa;
3789
3790 KASSERT(uvm_page_locked_p(pg));
3791
3792 pp = VM_PAGE_TO_PP(pg);
3793 pa = VM_PAGE_TO_PHYS(pg);
3794
3795 return pmap_pp_clear_attrs(pp, pa, clearbits);
3796}
3797
3798/*
3799 * pmap_pv_clear_attrs: clear the specified attributes for an unmanaged
3800 * pv-tracked page.
3801 */
3802
3803bool
3804pmap_pv_clear_attrs(paddr_t pa, unsigned clearbits)
3805{
3806 struct pmap_page *pp;
3807
3808 pp = pmap_pv_tracked(pa);
3809 if (pp == NULL)
		panic("pmap_pv_clear_attrs: page not pv-tracked: 0x%"PRIxPADDR,
3811 pa);
3812
3813 return pmap_pp_clear_attrs(pp, pa, clearbits);
3814}
3815
3816/*
3817 * p m a p p r o t e c t i o n f u n c t i o n s
3818 */
3819
3820/*
3821 * pmap_page_protect: change the protection of all recorded mappings
3822 * of a managed page
3823 *
3824 * => NOTE: this is an inline function in pmap.h
3825 */
3826
3827/* see pmap.h */
3828
3829/*
3830 * pmap_pv_protect: change the protection of all recorded mappings
3831 * of an unmanaged pv-tracked page
3832 *
3833 * => NOTE: this is an inline function in pmap.h
3834 */
3835
3836/* see pmap.h */
3837
3838/*
 * pmap_protect: set the protection of the pages in a pmap
3840 *
3841 * => NOTE: this is an inline function in pmap.h
3842 */
3843
3844/* see pmap.h */
3845
3846/*
3847 * pmap_write_protect: write-protect pages in a pmap.
3848 */
3849void
3850pmap_write_protect(struct pmap *pmap, vaddr_t sva, vaddr_t eva, vm_prot_t prot)
3851{
3852 pt_entry_t bit_rem, bit_put;
3853 pt_entry_t *ptes;
	pd_entry_t * const *pdes;
3855 struct pmap *pmap2;
3856 vaddr_t blockend, va;
3857
3858 KASSERT(curlwp->l_md.md_gc_pmap != pmap);
3859
3860 bit_rem = 0;
3861 if (!(prot & VM_PROT_WRITE))
3862 bit_rem = PG_RW;
3863
3864 bit_put = 0;
3865 if (!(prot & VM_PROT_EXECUTE))
3866 bit_put = pmap_pg_nx;
3867
3868 sva &= PG_FRAME;
3869 eva &= PG_FRAME;
3870
3871 /* Acquire pmap. */
3872 kpreempt_disable();
3873 pmap_map_ptes(pmap, &pmap2, &ptes, &pdes);
3874
3875 for (va = sva ; va < eva; va = blockend) {
3876 pt_entry_t *spte, *epte;
3877 int i;
3878
3879 blockend = x86_round_pdr(va + 1);
3880 if (blockend > eva)
3881 blockend = eva;
3882
3883 /*
3884 * Our PTE mappings should never be write-protected.
3885 *
3886 * XXXmaxv: still needed?
3887 *
3888 * A long term solution is to move the PTEs out of user address
3889 * space, and into kernel address space. Then we can set
3890 * VM_MAXUSER_ADDRESS to be VM_MAX_ADDRESS.
3891 */
3892 for (i = 0; i < PDP_SIZE; i++) {
3893 if (pl_i(va, PTP_LEVELS) == PDIR_SLOT_PTE+i)
3894 panic("PTE space accessed");
3895 }
3896
3897 /* Is it a valid block? */
3898 if (!pmap_pdes_valid(va, pdes, NULL)) {
3899 continue;
3900 }
3901 KASSERT(va < VM_MAXUSER_ADDRESS || va >= VM_MAX_ADDRESS);
3902
3903 spte = &ptes[pl1_i(va)];
3904 epte = &ptes[pl1_i(blockend)];
3905
3906 for (/* */; spte < epte; spte++) {
3907 pt_entry_t opte, npte;
3908
3909 do {
3910 opte = *spte;
3911 if (!pmap_valid_entry(opte)) {
3912 goto next;
3913 }
3914 npte = (opte & ~bit_rem) | bit_put;
3915 } while (pmap_pte_cas(spte, opte, npte) != opte);
3916
3917 if ((opte & PG_M) != 0) {
3918 vaddr_t tva = x86_ptob(spte - ptes);
3919 pmap_tlb_shootdown(pmap, tva, opte,
3920 TLBSHOOT_WRITE_PROTECT);
3921 }
3922next:;
3923 }
3924 }
3925
3926 /* Release pmap. */
3927 pmap_unmap_ptes(pmap, pmap2);
3928 kpreempt_enable();
3929}
3930
3931/*
3932 * pmap_unwire: clear the wired bit in the PTE.
3933 *
3934 * => Mapping should already be present.
3935 */
3936void
3937pmap_unwire(struct pmap *pmap, vaddr_t va)
3938{
3939 pt_entry_t *ptes, *ptep, opte;
3940 pd_entry_t * const *pdes;
3941 struct pmap *pmap2;
3942
3943 /* Acquire pmap. */
3944 kpreempt_disable();
3945 pmap_map_ptes(pmap, &pmap2, &ptes, &pdes);
3946
3947 if (!pmap_pdes_valid(va, pdes, NULL)) {
3948 panic("pmap_unwire: invalid PDE");
3949 }
3950
3951 ptep = &ptes[pl1_i(va)];
3952 opte = *ptep;
3953 KASSERT(pmap_valid_entry(opte));
3954
3955 if (opte & PG_W) {
3956 pt_entry_t npte = opte & ~PG_W;
3957
3958 opte = pmap_pte_testset(ptep, npte);
3959 pmap_stats_update_bypte(pmap, npte, opte);
3960 } else {
3961 printf("pmap_unwire: wiring for pmap %p va 0x%lx "
3962 "did not change!\n", pmap, va);
3963 }
3964
3965 /* Release pmap. */
3966 pmap_unmap_ptes(pmap, pmap2);
3967 kpreempt_enable();
3968}
3969
3970/*
3971 * pmap_copy: copy mappings from one pmap to another
3972 *
3973 * => optional function
3974 * void pmap_copy(dst_pmap, src_pmap, dst_addr, len, src_addr)
3975 */
3976
3977/*
3978 * defined as macro in pmap.h
3979 */
3980
3981__strict_weak_alias(pmap_enter, pmap_enter_default);
3982
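/*
 * pmap_enter_default: the default pmap_enter(), for the case where the
 * machine address of the page is the same as its physical address.
 */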
3983int
3984pmap_enter_default(pmap_t pmap, vaddr_t va, paddr_t pa, vm_prot_t prot,
3985 u_int flags)
3986{
3987 return pmap_enter_ma(pmap, va, pa, pa, prot, flags, 0);
3988}
3989
3990/*
3991 * pmap_enter: enter a mapping into a pmap
3992 *
3993 * => must be done "now" ... no lazy-evaluation
3994 * => we set pmap => pv_head locking
3995 */
3996int
3997pmap_enter_ma(struct pmap *pmap, vaddr_t va, paddr_t ma, paddr_t pa,
3998 vm_prot_t prot, u_int flags, int domid)
3999{
4000 pt_entry_t *ptes, opte, npte;
4001 pt_entry_t *ptep;
4002 pd_entry_t * const *pdes;
4003 struct vm_page *ptp;
4004 struct vm_page *new_pg, *old_pg;
4005 struct pmap_page *new_pp, *old_pp;
4006 struct pv_entry *old_pve = NULL;
4007 struct pv_entry *new_pve;
4008 struct pv_entry *new_sparepve;
4009 int error;
4010 bool wired = (flags & PMAP_WIRED) != 0;
4011 struct pmap *pmap2;
4012
4013 KASSERT(pmap_initialized);
4014 KASSERT(curlwp->l_md.md_gc_pmap != pmap);
4015 KASSERT(va < VM_MAX_KERNEL_ADDRESS);
4016 KASSERTMSG(va != (vaddr_t)PDP_BASE,
4017 "pmap_enter: trying to map over PDP!");
4018 KASSERTMSG(va < VM_MIN_KERNEL_ADDRESS ||
4019 pmap_valid_entry(pmap->pm_pdir[pl_i(va, PTP_LEVELS)]),
4020 "pmap_enter: missing kernel PTP for VA %lx!", va);
4021
4022#ifdef XEN
4023 KASSERT(domid == DOMID_SELF || pa == 0);
4024#endif /* XEN */
4025
4026 npte = ma | protection_codes[prot] | PG_V;
4027 npte |= pmap_pat_flags(flags);
4028 if (wired)
4029 npte |= PG_W;
4030 if (va < VM_MAXUSER_ADDRESS)
4031 npte |= PG_u;
4032 else if (va < VM_MAX_ADDRESS)
4033 panic("PTE space accessed"); /* XXXmaxv: no longer needed? */
4034 else
4035 npte |= PG_k;
4036 if (pmap == pmap_kernel())
4037 npte |= pmap_pg_g;
4038 if (flags & VM_PROT_ALL) {
4039 npte |= PG_U;
4040 if (flags & VM_PROT_WRITE) {
4041 KASSERT((npte & PG_RW) != 0);
4042 npte |= PG_M;
4043 }
4044 }
4045
4046#ifdef XEN
4047 if (domid != DOMID_SELF)
4048 new_pg = NULL;
4049 else
4050#endif
4051 new_pg = PHYS_TO_VM_PAGE(pa);
4052 if (new_pg != NULL) {
4053 /* This is a managed page */
4054 npte |= PG_PVLIST;
4055 new_pp = VM_PAGE_TO_PP(new_pg);
4056 } else if ((new_pp = pmap_pv_tracked(pa)) != NULL) {
4057 /* This is an unmanaged pv-tracked page */
4058 npte |= PG_PVLIST;
4059 } else {
4060 new_pp = NULL;
4061 }
4062
4063	/* Get pves: one for the new mapping plus a spare, before locking. */
4064 new_pve = pool_cache_get(&pmap_pv_cache, PR_NOWAIT);
4065 new_sparepve = pool_cache_get(&pmap_pv_cache, PR_NOWAIT);
4066 if (new_pve == NULL || new_sparepve == NULL) {
4067 if (flags & PMAP_CANFAIL) {
4068 error = ENOMEM;
4069 goto out2;
4070 }
4071 panic("pmap_enter: pve allocation failed");
4072 }
4073
4074 kpreempt_disable();
4075 pmap_map_ptes(pmap, &pmap2, &ptes, &pdes); /* locks pmap */
4076 if (pmap == pmap_kernel()) {
4077 ptp = NULL;
4078 } else {
4079 ptp = pmap_get_ptp(pmap, va, pdes);
4080 if (ptp == NULL) {
4081 pmap_unmap_ptes(pmap, pmap2);
4082 if (flags & PMAP_CANFAIL) {
4083 error = ENOMEM;
4084 goto out;
4085 }
4086 panic("pmap_enter: get ptp failed");
4087 }
4088 }
4089
4090	/*
4091	 * Update the PTE; retry the swap if it changed under us (PG_U/PG_M).
4092	 */
4093
4094 ptep = &ptes[pl1_i(va)];
4095 do {
4096 opte = *ptep;
4097
4098 /*
4099 * if the same page, inherit PG_U and PG_M.
4100 */
4101 if (((opte ^ npte) & (PG_FRAME | PG_V)) == 0) {
4102 npte |= opte & (PG_U | PG_M);
4103 }
4104#if defined(XEN)
4105 if (domid != DOMID_SELF) {
4106 /* pmap_pte_cas with error handling */
4107 int s = splvm();
4108 if (opte != *ptep) {
4109 splx(s);
4110 continue;
4111 }
4112 error = xpq_update_foreign(
4113 vtomach((vaddr_t)ptep), npte, domid);
4114 splx(s);
4115 if (error) {
4116 if (ptp != NULL && ptp->wire_count <= 1) {
4117 pmap_free_ptp(pmap, ptp, va, ptes, pdes);
4118 }
4119 pmap_unmap_ptes(pmap, pmap2);
4120 goto out;
4121 }
4122 break;
4123 }
4124#endif /* defined(XEN) */
4125 } while (pmap_pte_cas(ptep, opte, npte) != opte);
4126
4127 /*
4128 * update statistics and PTP's reference count.
4129 */
4130
4131 pmap_stats_update_bypte(pmap, npte, opte);
4132 if (ptp != NULL && !pmap_valid_entry(opte)) {
4133 ptp->wire_count++;
4134 }
4135 KASSERT(ptp == NULL || ptp->wire_count > 1);
4136
4137 /*
4138 * if the same page, we can skip pv_entry handling.
4139 */
4140
4141 if (((opte ^ npte) & (PG_FRAME | PG_V)) == 0) {
4142 KASSERT(((opte ^ npte) & PG_PVLIST) == 0);
4143 goto same_pa;
4144 }
4145
4146 /*
4147	 * if the old page is managed or pv-tracked, remove its pv_entry.
4148 */
4149
4150 if ((~opte & (PG_V | PG_PVLIST)) == 0) {
4151 if ((old_pg = PHYS_TO_VM_PAGE(pmap_pte2pa(opte))) != NULL) {
4152 KASSERT(uvm_page_locked_p(old_pg));
4153 old_pp = VM_PAGE_TO_PP(old_pg);
4154 } else if ((old_pp = pmap_pv_tracked(pmap_pte2pa(opte)))
4155 == NULL) {
4156 pa = pmap_pte2pa(opte);
4157 panic("pmap_enter: PG_PVLIST with pv-untracked page"
4158 " va = 0x%"PRIxVADDR
4159 " pa = 0x%" PRIxPADDR " (0x%" PRIxPADDR ")",
4160 va, pa, atop(pa));
4161 }
4162
4163 old_pve = pmap_remove_pv(old_pp, ptp, va);
4164 old_pp->pp_attrs |= opte;
4165 }
4166
4167 /*
4168	 * if the new page is managed or pv-tracked, insert a pv_entry into its list.
4169 */
4170
4171 if (new_pp) {
4172 new_pve = pmap_enter_pv(new_pp, new_pve, &new_sparepve, ptp, va);
4173 }
4174
4175same_pa:
4176 pmap_unmap_ptes(pmap, pmap2);
4177
4178 /*
4179 * shootdown tlb if necessary.
4180 */
4181
4182 if ((~opte & (PG_V | PG_U)) == 0 &&
4183 ((opte ^ npte) & (PG_FRAME | PG_RW)) != 0) {
4184 pmap_tlb_shootdown(pmap, va, opte, TLBSHOOT_ENTER);
4185 }
4186
4187 error = 0;
4188out:
4189 kpreempt_enable();
4190out2:
4191 if (old_pve != NULL) {
4192 pool_cache_put(&pmap_pv_cache, old_pve);
4193 }
4194 if (new_pve != NULL) {
4195 pool_cache_put(&pmap_pv_cache, new_pve);
4196 }
4197 if (new_sparepve != NULL) {
4198 pool_cache_put(&pmap_pv_cache, new_sparepve);
4199 }
4200
4201 return error;
4202}
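
/*
 * Example (illustrative sketch): under Xen, a privileged driver mapping a
 * foreign domain's machine page passes the machine address and the owning
 * domain id, with pa forced to 0 as asserted above.  "pm", "va", "ma" and
 * "domid" are hypothetical.
 *
 *	error = pmap_enter_ma(pm, va, ma, 0,
 *	    VM_PROT_READ | VM_PROT_WRITE,
 *	    VM_PROT_READ | VM_PROT_WRITE | PMAP_CANFAIL, domid);
 *	if (error == 0)
 *		pmap_update(pm);
 */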
4203
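/*
 * pmap_get_physpage: allocate and zero one physical page for use as a
 * page table page, and charge it to the kernel pmap's resident count.
 *
 * => before uvm_page_init() the page is stolen with uvm_page_physget()
 *    and zeroed through the direct map or a temporary early PTE
 * => afterwards it comes pre-zeroed from uvm_pagealloc()
 */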
4204static paddr_t
4205pmap_get_physpage(void)
4206{
4207 struct vm_page *ptp;
4208 struct pmap *kpm = pmap_kernel();
4209 paddr_t pa;
4210
4211 if (!uvm.page_init_done) {
4212 /*
4213 * We're growing the kernel pmap early (from
4214 * uvm_pageboot_alloc()). This case must be
4215 * handled a little differently.
4216 */
4217
4218 if (!uvm_page_physget(&pa))
4219 panic("pmap_get_physpage: out of memory");
4220#if defined(__HAVE_DIRECT_MAP)
4221 pagezero(PMAP_DIRECT_MAP(pa));
4222#else
4223#if defined(XEN)
4224 if (XEN_VERSION_SUPPORTED(3, 4)) {
4225 xen_pagezero(pa);
4226 return pa;
4227 }
4228#endif
4229 kpreempt_disable();
4230 pmap_pte_set(early_zero_pte, pmap_pa2pte(pa) | PG_V |
4231 PG_RW | pmap_pg_nx | PG_k);
4232 pmap_pte_flush();
4233 pmap_update_pg((vaddr_t)early_zerop);
4234 memset(early_zerop, 0, PAGE_SIZE);
4235#if defined(DIAGNOSTIC) || defined(XEN)
4236 pmap_pte_set(early_zero_pte, 0);
4237 pmap_pte_flush();
4238#endif /* defined(DIAGNOSTIC) || defined(XEN) */
4239 kpreempt_enable();
4240#endif /* defined(__HAVE_DIRECT_MAP) */
4241 } else {
4242 /* XXX */
4243 ptp = uvm_pagealloc(NULL, 0, NULL,
4244 UVM_PGA_USERESERVE|UVM_PGA_ZERO);
4245 if (ptp == NULL)
4246 panic("pmap_get_physpage: out of memory");
4247 ptp->flags &= ~PG_BUSY;
4248 ptp->wire_count = 1;
4249 pa = VM_PAGE_TO_PHYS(ptp);
4250 }
4251 pmap_stats_update(kpm, 1, 0);
4252
4253 return pa;
4254}
4255
4256/*
4257 * Expand the page tree with the specified number of PTPs, mapping virtual
4258 * addresses starting at kva. We populate all the levels but the last one
4259 * (L1). The nodes of the tree are created as RWX, but the pages covered
4260 * will be kentered in L1, with proper permissions.
4261 *
4262 * Used only by pmap_growkernel.
4263 */
4264static void
4265pmap_alloc_level(vaddr_t kva, long *needed_ptps)
4266{
4267 unsigned long i;
4268 paddr_t pa;
4269 unsigned long index, endindex;
4270 int level;
4271 pd_entry_t *pdep;
4272#ifdef XEN
4273 int s = splvm(); /* protect xpq_* */
4274#endif
4275
4276 for (level = PTP_LEVELS; level > 1; level--) {
4277 if (level == PTP_LEVELS)
4278 pdep = pmap_kernel()->pm_pdir;
4279 else
4280 pdep = normal_pdes[level - 2];
4281 index = pl_i_roundup(kva, level);
4282 endindex = index + needed_ptps[level - 1] - 1;
4283
4284 for (i = index; i <= endindex; i++) {
4285 pt_entry_t pte;
4286
4287 KASSERT(!pmap_valid_entry(pdep[i]));
4288 pa = pmap_get_physpage();
4289 pte = pmap_pa2pte(pa) | PG_k | PG_V | PG_RW;
4290 pmap_pte_set(&pdep[i], pte);
4291
4292#if defined(XEN) && (defined(PAE) || defined(__x86_64__))
4293 if (level == PTP_LEVELS && i >= PDIR_SLOT_KERN) {
4294 if (__predict_true(
4295 cpu_info_primary.ci_flags & CPUF_PRESENT)) {
4296 /* update per-cpu PMDs on all cpus */
4297 xen_kpm_sync(pmap_kernel(), i);
4298 } else {
4299 /*
4300 * too early; update primary CPU
4301 * PMD only (without locks)
4302 */
4303#ifdef PAE
4304 pd_entry_t *cpu_pdep =
4305 &cpu_info_primary.ci_kpm_pdir[l2tol2(i)];
4306#endif
4307#ifdef __x86_64__
4308 pd_entry_t *cpu_pdep =
4309 &cpu_info_primary.ci_kpm_pdir[i];
4310#endif
4311 pmap_pte_set(cpu_pdep, pte);
4312 }
4313 }
4314#endif /* XEN && (PAE || __x86_64__) */
4315
4316 KASSERT(level != PTP_LEVELS || nkptp[level - 1] +
4317 pl_i(VM_MIN_KERNEL_ADDRESS, level) == i);
4318 nkptp[level - 1]++;
4319 }
4320 pmap_pte_flush();
4321 }
4322#ifdef XEN
4323 splx(s);
4324#endif
4325}
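
/*
 * Worked example (illustration only; amd64 with 4 KB pages and 512 entries
 * per level): each new L2 slot created here installs one L1 PTP covering
 * 2 MB of KVA, each new L3 slot covers 1 GB, and each new L4 slot covers
 * 512 GB.  Growing the kernel map by about 10 MB across a 1 GB boundary
 * would therefore need needed_ptps[1] = 5 or 6 and needed_ptps[2] = 1,
 * while the L1 PTEs themselves are filled in later by pmap_kenter_pa()
 * or pmap_enter().
 */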
4326
4327/*
4328 * pmap_growkernel: increase usage of KVM space.
4329 *
4330 * => we allocate new PTPs for the kernel and install them in all
4331 * the pmaps on the system.
4332 */
4333
4334vaddr_t
4335pmap_growkernel(vaddr_t maxkvaddr)
4336{
4337 struct pmap *kpm = pmap_kernel();
4338#if !defined(XEN) || !defined(__x86_64__)
4339 struct pmap *pm;
4340 long old;
4341#endif
4342 int s, i;
4343 long needed_kptp[PTP_LEVELS], target_nptp;
4344 bool invalidate = false;
4345
4346 s = splvm(); /* to be safe */
4347 mutex_enter(kpm->pm_lock);
4348
4349 if (maxkvaddr <= pmap_maxkvaddr) {
4350 mutex_exit(kpm->pm_lock);
4351 splx(s);
4352 return pmap_maxkvaddr;
4353 }
4354
4355 maxkvaddr = x86_round_pdr(maxkvaddr);
4356#if !defined(XEN) || !defined(__x86_64__)
4357 old = nkptp[PTP_LEVELS - 1];
4358#endif
4359
4360 /* Initialize needed_kptp. */
4361 for (i = PTP_LEVELS - 1; i >= 1; i--) {
4362 target_nptp = pl_i_roundup(maxkvaddr, i + 1) -
4363 pl_i_roundup(VM_MIN_KERNEL_ADDRESS, i + 1);
4364
4365 if (target_nptp > nkptpmax[i])
4366 panic("out of KVA space");
4367 KASSERT(target_nptp >= nkptp[i]);
4368 needed_kptp[i] = target_nptp - nkptp[i];
4369 }
4370
4371 pmap_alloc_level(pmap_maxkvaddr, needed_kptp);
4372
4373 /*
4374 * If the number of top level entries changed, update all pmaps.
4375 */
4376 if (needed_kptp[PTP_LEVELS - 1] != 0) {
4377#ifdef XEN
4378#ifdef __x86_64__
4379 /* nothing, kernel entries are never entered in user pmap */
4380#else /* __x86_64__ */
4381 mutex_enter(&pmaps_lock);
4382 LIST_FOREACH(pm, &pmaps, pm_list) {
4383 int pdkidx;
4384 for (pdkidx = PDIR_SLOT_KERN + old;
4385 pdkidx < PDIR_SLOT_KERN + nkptp[PTP_LEVELS - 1];
4386 pdkidx++) {
4387 pmap_pte_set(&pm->pm_pdir[pdkidx],
4388 kpm->pm_pdir[pdkidx]);
4389 }
4390 pmap_pte_flush();
4391 }
4392 mutex_exit(&pmaps_lock);
4393#endif /* __x86_64__ */
4394#else /* XEN */
4395 unsigned newpdes;
4396 newpdes = nkptp[PTP_LEVELS - 1] - old;
4397 mutex_enter(&pmaps_lock);
4398 LIST_FOREACH(pm, &pmaps, pm_list) {
4399 memcpy(&pm->pm_pdir[PDIR_SLOT_KERN + old],
4400 &kpm->pm_pdir[PDIR_SLOT_KERN + old],
4401 newpdes * sizeof (pd_entry_t));
4402 }
4403 mutex_exit(&pmaps_lock);
4404#endif
4405 invalidate = true;
4406 }
4407 pmap_maxkvaddr = maxkvaddr;
4408 mutex_exit(kpm->pm_lock);
4409 splx(s);
4410
4411 if (invalidate && pmap_initialized) {
4412 /* Invalidate the PDP cache. */
4413 pool_cache_invalidate(&pmap_pdp_cache);
4414 }
4415
4416 return maxkvaddr;
4417}
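
/*
 * Example (illustrative sketch): a caller about to use kernel VA up to
 * "kva_end" (hypothetical) makes sure the page tree reaches that far and
 * remembers the rounded-up ceiling that is returned.
 *
 *	new_ceiling = pmap_growkernel(kva_end);
 *	KASSERT(new_ceiling >= kva_end);
 */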
4418
4419#ifdef DEBUG
4420void pmap_dump(struct pmap *, vaddr_t, vaddr_t);
4421
4422/*
4423 * pmap_dump: dump all the mappings from a pmap
4424 *
4425 * => caller should not be holding any pmap locks
4426 */
4427
4428void
4429pmap_dump(struct pmap *pmap, vaddr_t sva, vaddr_t eva)
4430{
4431 pt_entry_t *ptes, *pte;
4432 pd_entry_t * const *pdes;
4433 struct pmap *pmap2;
4434 vaddr_t blkendva;
4435
4436 /*
4437	 * if end is out of range, truncate it.
4438	 * if end is not beyond start, dump up to VM_MAXUSER_ADDRESS.
4439 */
4440
4441 if (eva > VM_MAXUSER_ADDRESS || eva <= sva)
4442 eva = VM_MAXUSER_ADDRESS;
4443
4444 /*
4445 * we lock in the pmap => pv_head direction
4446 */
4447
4448 kpreempt_disable();
4449 pmap_map_ptes(pmap, &pmap2, &ptes, &pdes); /* locks pmap */
4450
4451 /*
4452	 * dumping a range of pages: we dump in PTP-sized blocks (4MB on i386, 2MB on PAE/amd64)
4453 */
4454
4455 for (/* null */ ; sva < eva ; sva = blkendva) {
4456
4457 /* determine range of block */
4458 blkendva = x86_round_pdr(sva+1);
4459 if (blkendva > eva)
4460 blkendva = eva;
4461
4462 /* valid block? */
4463 if (!pmap_pdes_valid(sva, pdes, NULL))
4464 continue;
4465
4466 pte = &ptes[pl1_i(sva)];
4467 for (/* null */; sva < blkendva ; sva += PAGE_SIZE, pte++) {
4468 if (!pmap_valid_entry(*pte))
4469 continue;
4470 printf("va %#" PRIxVADDR " -> pa %#" PRIxPADDR
4471 " (pte=%#" PRIxPADDR ")\n",
4472 sva, (paddr_t)pmap_pte2pa(*pte), (paddr_t)*pte);
4473 }
4474 }
4475 pmap_unmap_ptes(pmap, pmap2);
4476 kpreempt_enable();
4477}
4478#endif
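
/*
 * Example (illustrative sketch, DEBUG kernels only): dump the user-space
 * mappings of the current process, e.g. from ddb or ad-hoc debug code.
 *
 *	pmap_dump(curproc->p_vmspace->vm_map.pmap, 0, VM_MAXUSER_ADDRESS);
 */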
4479
4480/*
4481 * pmap_update: process deferred invalidations and frees.
4482 */
4483
4484void
4485pmap_update(struct pmap *pmap)
4486{
4487 struct vm_page *empty_ptps;
4488 lwp_t *l = curlwp;
4489
4490 /*
4491 * If we have torn down this pmap, invalidate non-global TLB
4492 * entries on any processors using it.
4493 */
4494 kpreempt_disable();
4495 if (__predict_false(l->l_md.md_gc_pmap == pmap)) {
4496 l->l_md.md_gc_pmap = NULL;
4497 pmap_tlb_shootdown(pmap, (vaddr_t)-1LL, 0, TLBSHOOT_UPDATE);
4498 }
4499 /*
4500 * Initiate any pending TLB shootdowns. Wait for them to
4501 * complete before returning control to the caller.
4502 */
4503 pmap_tlb_shootnow();
4504 kpreempt_enable();
4505
4506 /*
4507 * Now that shootdowns are complete, process deferred frees,
4508 * but not from interrupt context.
4509 */
4510 if (l->l_md.md_gc_ptp != NULL) {
4511 KASSERT((l->l_pflag & LP_INTR) == 0);
4512 if (cpu_intr_p()) {
4513 return;
4514 }
4515 empty_ptps = l->l_md.md_gc_ptp;
4516 l->l_md.md_gc_ptp = NULL;
4517 pmap_free_ptps(empty_ptps);
4518 }
4519}
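
/*
 * Example (illustrative sketch): a typical unmap sequence.  Stale TLB
 * entries and deferred PTP frees are only guaranteed to be gone once
 * pmap_update() returns, so call it before reusing the backing pages.
 * "pm", "sva" and "eva" are hypothetical.
 *
 *	pmap_remove(pm, sva, eva);
 *	pmap_update(pm);
 */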
4520
4521#if PTP_LEVELS > 4
4522#error "Unsupported number of page table levels"
4523#endif
4524
4525paddr_t
4526pmap_init_tmp_pgtbl(paddr_t pg)
4527{
4528 static bool maps_loaded;
4529 static const paddr_t x86_tmp_pml_paddr[] = {
4530 4 * PAGE_SIZE, /* L1 */
4531 5 * PAGE_SIZE, /* L2 */
4532 6 * PAGE_SIZE, /* L3 */
4533 7 * PAGE_SIZE /* L4 */
4534 };
4535 static vaddr_t x86_tmp_pml_vaddr[] = { 0, 0, 0, 0 };
4536
4537 pd_entry_t *tmp_pml, *kernel_pml;
4538
4539 int level;
4540
4541 if (!maps_loaded) {
4542 for (level = 0; level < PTP_LEVELS; ++level) {
4543 x86_tmp_pml_vaddr[level] =
4544 uvm_km_alloc(kernel_map, PAGE_SIZE, 0,
4545 UVM_KMF_VAONLY);
4546
4547 if (x86_tmp_pml_vaddr[level] == 0)
4548				panic("mapping of real mode PML failed");
4549 pmap_kenter_pa(x86_tmp_pml_vaddr[level],
4550 x86_tmp_pml_paddr[level],
4551 VM_PROT_READ | VM_PROT_WRITE, 0);
4552 }
4553 pmap_update(pmap_kernel());
4554 maps_loaded = true;
4555 }
4556
4557 /* Zero levels 1-3 */
4558 for (level = 0; level < PTP_LEVELS - 1; ++level) {
4559 tmp_pml = (void *)x86_tmp_pml_vaddr[level];
4560 memset(tmp_pml, 0, PAGE_SIZE);
4561 }
4562
4563 /* Copy PML4 */
4564 kernel_pml = pmap_kernel()->pm_pdir;
4565 tmp_pml = (void *)x86_tmp_pml_vaddr[PTP_LEVELS - 1];
4566 memcpy(tmp_pml, kernel_pml, PAGE_SIZE);
4567
4568#ifdef PAE
4569 /*
4570 * Use the last 4 entries of the L2 page as L3 PD entries. These
4571 * last entries are unlikely to be used for temporary mappings.
4572 * 508: maps 0->1GB (userland)
4573 * 509: unused
4574 * 510: unused
4575 * 511: maps 3->4GB (kernel)
4576 */
4577 tmp_pml[508] = x86_tmp_pml_paddr[PTP_LEVELS - 1] | PG_V;
4578 tmp_pml[509] = 0;
4579 tmp_pml[510] = 0;
4580 tmp_pml[511] = pmap_pdirpa(pmap_kernel(), PDIR_SLOT_KERN) | PG_V;
4581#endif
4582
4583 for (level = PTP_LEVELS - 1; level > 0; --level) {
4584 tmp_pml = (void *)x86_tmp_pml_vaddr[level];
4585
4586 tmp_pml[pl_i(pg, level + 1)] =
4587 (x86_tmp_pml_paddr[level - 1] & PG_FRAME) | PG_RW | PG_V;
4588 }
4589
4590 tmp_pml = (void *)x86_tmp_pml_vaddr[0];
4591 tmp_pml[pl_i(pg, 1)] = (pg & PG_FRAME) | PG_RW | PG_V;
4592
4593#ifdef PAE
4594 /* Return the PA of the L3 page (entry 508 of the L2 page) */
4595 return x86_tmp_pml_paddr[PTP_LEVELS - 1] + 508 * sizeof(pd_entry_t);
4596#endif
4597
4598 return x86_tmp_pml_paddr[PTP_LEVELS - 1];
4599}
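
/*
 * Example (illustrative sketch): code that bounces through a low-memory
 * trampoline page "tramp_pa" (hypothetical) can build a throwaway page
 * tree mapping that page and hand the result to the trampoline as its
 * initial %cr3 (or, with PAE, as the L3 pointer).
 *
 *	paddr_t tmp_cr3 = pmap_init_tmp_pgtbl(tramp_pa);
 *	... pass tmp_cr3 to the AP bootstrap or wakeup code ...
 */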
4600
4601u_int
4602x86_mmap_flags(paddr_t mdpgno)
4603{
4604 u_int nflag = (mdpgno >> X86_MMAP_FLAG_SHIFT) & X86_MMAP_FLAG_MASK;
4605 u_int pflag = 0;
4606
4607 if (nflag & X86_MMAP_FLAG_PREFETCH)
4608 pflag |= PMAP_WRITE_COMBINE;
4609
4610 return pflag;
4611}
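
/*
 * Example (illustrative sketch): a device mmap routine could ask for a
 * write-combining mapping of a frame buffer by folding the flag into the
 * page-number cookie it returns; x86_mmap_flags() then recovers it as
 * PMAP_WRITE_COMBINE.  "fb_pa" and this particular encoding are
 * assumptions, not a fixed driver API.
 *
 *	return x86_btop(fb_pa) |
 *	    ((paddr_t)X86_MMAP_FLAG_PREFETCH << X86_MMAP_FLAG_SHIFT);
 */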
4612