head 1.2; access; symbols pkgsrc-2020Q3:1.1.0.2; locks; strict; comment @# @; 1.2 date 2020.11.06.21.45.49; author bouyer; state dead; branches; next 1.1; commitid WDPVMlrHGXeceSuC; 1.1 date 2020.10.21.09.04.10; author bouyer; state Exp; branches 1.1.2.1; next ; commitid z0QsOseMMzGbyKsC; 1.1.2.1 date 2020.10.21.09.04.10; author bsiegert; state dead; branches; next 1.1.2.2; commitid 4LJlEfpW0QY3ZUsC; 1.1.2.2 date 2020.10.22.16.29.05; author bsiegert; state Exp; branches; next ; commitid 4LJlEfpW0QY3ZUsC; desc @@ 1.2 log @Update xenkernel413 and xentools413 to 4.13.2. This includes fixes for XSA up to XSA347, and an improved fix for XSA 286. @ text @$NetBSD: patch-XSA286,v 1.1 2020/10/21 09:04:10 bouyer Exp $ From: Jan Beulich Subject: x86/mm: split L4 and L3 parts of the walk out of do_page_walk() The L3 one at least is going to be re-used by a subsequent patch, and splitting the L4 one then as well seems only natural. This is part of XSA-286. Signed-off-by: Jan Beulich Reviewed-by: George Dunlap Reviewed-by: Andrew Cooper diff --git a/xen/arch/x86/x86_64/mm.c b/xen/arch/x86/x86_64/mm.c index db4f035d8d..b1582b56fb 100644 --- xen/arch/x86/x86_64/mm.c.orig +++ xen/arch/x86/x86_64/mm.c @@@@ -44,26 +44,47 @@@@ unsigned int __read_mostly m2p_compat_vstart = __HYPERVISOR_COMPAT_VIRT_START; l2_pgentry_t *compat_idle_pg_table_l2; -void *do_page_walk(struct vcpu *v, unsigned long addr) +static l4_pgentry_t page_walk_get_l4e(pagetable_t root, unsigned long addr) { - unsigned long mfn = pagetable_get_pfn(v->arch.guest_table); - l4_pgentry_t l4e, *l4t; - l3_pgentry_t l3e, *l3t; - l2_pgentry_t l2e, *l2t; - l1_pgentry_t l1e, *l1t; + unsigned long mfn = pagetable_get_pfn(root); + l4_pgentry_t *l4t, l4e; - if ( !is_pv_vcpu(v) || !is_canonical_address(addr) ) - return NULL; + if ( !is_canonical_address(addr) ) + return l4e_empty(); l4t = map_domain_page(_mfn(mfn)); l4e = l4t[l4_table_offset(addr)]; unmap_domain_page(l4t); + + return l4e; +} + +static l3_pgentry_t page_walk_get_l3e(pagetable_t root, unsigned long addr) +{ + l4_pgentry_t l4e = page_walk_get_l4e(root, addr); + l3_pgentry_t *l3t, l3e; + if ( !(l4e_get_flags(l4e) & _PAGE_PRESENT) ) - return NULL; + return l3e_empty(); l3t = map_l3t_from_l4e(l4e); l3e = l3t[l3_table_offset(addr)]; unmap_domain_page(l3t); + + return l3e; +} + +void *do_page_walk(struct vcpu *v, unsigned long addr) +{ + l3_pgentry_t l3e; + l2_pgentry_t l2e, *l2t; + l1_pgentry_t l1e, *l1t; + unsigned long mfn; + + if ( !is_pv_vcpu(v) ) + return NULL; + + l3e = page_walk_get_l3e(v->arch.guest_table, addr); mfn = l3e_get_pfn(l3e); if ( !(l3e_get_flags(l3e) & _PAGE_PRESENT) || !mfn_valid(_mfn(mfn)) ) return NULL; From: Jan Beulich Subject: x86/mm: check page types in do_page_walk() For page table entries read to be guaranteed valid, transiently locking the pages and validating their types is necessary. Note that guest use of linear page tables is intentionally not taken into account here, as ordinary data (guest stacks) can't possibly live inside page tables. This is part of XSA-286. Signed-off-by: Jan Beulich Reviewed-by: George Dunlap Reviewed-by: Andrew Cooper diff --git a/xen/arch/x86/x86_64/mm.c b/xen/arch/x86/x86_64/mm.c index b1582b56fb..7d439639b7 100644 --- xen/arch/x86/x86_64/mm.c.orig +++ xen/arch/x86/x86_64/mm.c @@@@ -46,15 +46,29 @@@@ l2_pgentry_t *compat_idle_pg_table_l2; static l4_pgentry_t page_walk_get_l4e(pagetable_t root, unsigned long addr) { - unsigned long mfn = pagetable_get_pfn(root); - l4_pgentry_t *l4t, l4e; + mfn_t mfn = pagetable_get_mfn(root); + /* current's root page table can't disappear under our feet. */ + bool need_lock = !mfn_eq(mfn, pagetable_get_mfn(current->arch.guest_table)); + struct page_info *pg; + l4_pgentry_t l4e = l4e_empty(); if ( !is_canonical_address(addr) ) return l4e_empty(); - l4t = map_domain_page(_mfn(mfn)); - l4e = l4t[l4_table_offset(addr)]; - unmap_domain_page(l4t); + pg = mfn_to_page(mfn); + if ( need_lock && !page_lock(pg) ) + return l4e_empty(); + + if ( (pg->u.inuse.type_info & PGT_type_mask) == PGT_l4_page_table ) + { + l4_pgentry_t *l4t = map_domain_page(mfn); + + l4e = l4t[l4_table_offset(addr)]; + unmap_domain_page(l4t); + } + + if ( need_lock ) + page_unlock(pg); return l4e; } @@@@ -62,14 +76,26 @@@@ static l4_pgentry_t page_walk_get_l4e(pagetable_t root, unsigned long addr) static l3_pgentry_t page_walk_get_l3e(pagetable_t root, unsigned long addr) { l4_pgentry_t l4e = page_walk_get_l4e(root, addr); - l3_pgentry_t *l3t, l3e; + mfn_t mfn = l4e_get_mfn(l4e); + struct page_info *pg; + l3_pgentry_t l3e = l3e_empty(); if ( !(l4e_get_flags(l4e) & _PAGE_PRESENT) ) return l3e_empty(); - l3t = map_l3t_from_l4e(l4e); - l3e = l3t[l3_table_offset(addr)]; - unmap_domain_page(l3t); + pg = mfn_to_page(mfn); + if ( !page_lock(pg) ) + return l3e_empty(); + + if ( (pg->u.inuse.type_info & PGT_type_mask) == PGT_l3_page_table ) + { + l3_pgentry_t *l3t = map_domain_page(mfn); + + l3e = l3t[l3_table_offset(addr)]; + unmap_domain_page(l3t); + } + + page_unlock(pg); return l3e; } @@@@ -77,44 +103,67 @@@@ static l3_pgentry_t page_walk_get_l3e(pagetable_t root, unsigned long addr) void *do_page_walk(struct vcpu *v, unsigned long addr) { l3_pgentry_t l3e; - l2_pgentry_t l2e, *l2t; - l1_pgentry_t l1e, *l1t; - unsigned long mfn; + l2_pgentry_t l2e = l2e_empty(); + l1_pgentry_t l1e = l1e_empty(); + mfn_t mfn; + struct page_info *pg; if ( !is_pv_vcpu(v) ) return NULL; l3e = page_walk_get_l3e(v->arch.guest_table, addr); - mfn = l3e_get_pfn(l3e); - if ( !(l3e_get_flags(l3e) & _PAGE_PRESENT) || !mfn_valid(_mfn(mfn)) ) + mfn = l3e_get_mfn(l3e); + if ( !(l3e_get_flags(l3e) & _PAGE_PRESENT) || !mfn_valid(mfn) ) return NULL; if ( (l3e_get_flags(l3e) & _PAGE_PSE) ) { - mfn += PFN_DOWN(addr & ((1UL << L3_PAGETABLE_SHIFT) - 1)); + mfn = mfn_add(mfn, PFN_DOWN(addr & ((1UL << L3_PAGETABLE_SHIFT) - 1))); goto ret; } - l2t = map_domain_page(_mfn(mfn)); - l2e = l2t[l2_table_offset(addr)]; - unmap_domain_page(l2t); - mfn = l2e_get_pfn(l2e); - if ( !(l2e_get_flags(l2e) & _PAGE_PRESENT) || !mfn_valid(_mfn(mfn)) ) + pg = mfn_to_page(mfn); + if ( !page_lock(pg) ) + return NULL; + + if ( (pg->u.inuse.type_info & PGT_type_mask) == PGT_l2_page_table ) + { + const l2_pgentry_t *l2t = map_domain_page(mfn); + + l2e = l2t[l2_table_offset(addr)]; + unmap_domain_page(l2t); + } + + page_unlock(pg); + + mfn = l2e_get_mfn(l2e); + if ( !(l2e_get_flags(l2e) & _PAGE_PRESENT) || !mfn_valid(mfn) ) return NULL; if ( (l2e_get_flags(l2e) & _PAGE_PSE) ) { - mfn += PFN_DOWN(addr & ((1UL << L2_PAGETABLE_SHIFT) - 1)); + mfn = mfn_add(mfn, PFN_DOWN(addr & ((1UL << L2_PAGETABLE_SHIFT) - 1))); goto ret; } - l1t = map_domain_page(_mfn(mfn)); - l1e = l1t[l1_table_offset(addr)]; - unmap_domain_page(l1t); - mfn = l1e_get_pfn(l1e); - if ( !(l1e_get_flags(l1e) & _PAGE_PRESENT) || !mfn_valid(_mfn(mfn)) ) + pg = mfn_to_page(mfn); + if ( !page_lock(pg) ) + return NULL; + + if ( (pg->u.inuse.type_info & PGT_type_mask) == PGT_l1_page_table ) + { + const l1_pgentry_t *l1t = map_domain_page(mfn); + + l1e = l1t[l1_table_offset(addr)]; + unmap_domain_page(l1t); + } + + page_unlock(pg); + + mfn = l1e_get_mfn(l1e); + if ( !(l1e_get_flags(l1e) & _PAGE_PRESENT) || !mfn_valid(mfn) ) return NULL; ret: - return map_domain_page(_mfn(mfn)) + (addr & ~PAGE_MASK); + return map_domain_page(mfn) + (addr & ~PAGE_MASK); } /* From: Jan Beulich Subject: x86/mm: avoid using linear page tables in map_guest_l1e() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Replace the linear L2 table access by an actual page walk. This is part of XSA-286. Reported-by: Jann Horn Signed-off-by: Jan Beulich Signed-off-by: Roger Pau Monné Reviewed-by: George Dunlap Reviewed-by: Andrew Cooper diff --git a/xen/arch/x86/pv/mm.c b/xen/arch/x86/pv/mm.c index 2b0dadc8da..acebf9e957 100644 --- xen/arch/x86/pv/mm.c.orig +++ xen/arch/x86/pv/mm.c @@@@ -40,11 +40,14 @@@@ l1_pgentry_t *map_guest_l1e(unsigned long linear, mfn_t *gl1mfn) if ( unlikely(!__addr_ok(linear)) ) return NULL; - /* Find this l1e and its enclosing l1mfn in the linear map. */ - if ( __copy_from_user(&l2e, - &__linear_l2_table[l2_linear_offset(linear)], - sizeof(l2_pgentry_t)) ) + if ( unlikely(!(current->arch.flags & TF_kernel_mode)) ) + { + ASSERT_UNREACHABLE(); return NULL; + } + + /* Find this l1e and its enclosing l1mfn. */ + l2e = page_walk_get_l2e(current->arch.guest_table, linear); /* Check flags that it will be safe to read the l1e. */ if ( (l2e_get_flags(l2e) & (_PAGE_PRESENT | _PAGE_PSE)) != _PAGE_PRESENT ) diff --git a/xen/arch/x86/x86_64/mm.c b/xen/arch/x86/x86_64/mm.c index 7d439639b7..670aa3f892 100644 --- xen/arch/x86/x86_64/mm.c.orig +++ xen/arch/x86/x86_64/mm.c @@@@ -100,6 +100,34 @@@@ static l3_pgentry_t page_walk_get_l3e(pagetable_t root, unsigned long addr) return l3e; } +l2_pgentry_t page_walk_get_l2e(pagetable_t root, unsigned long addr) +{ + l3_pgentry_t l3e = page_walk_get_l3e(root, addr); + mfn_t mfn = l3e_get_mfn(l3e); + struct page_info *pg; + l2_pgentry_t l2e = l2e_empty(); + + if ( !(l3e_get_flags(l3e) & _PAGE_PRESENT) || + (l3e_get_flags(l3e) & _PAGE_PSE) ) + return l2e_empty(); + + pg = mfn_to_page(mfn); + if ( !page_lock(pg) ) + return l2e_empty(); + + if ( (pg->u.inuse.type_info & PGT_type_mask) == PGT_l2_page_table ) + { + l2_pgentry_t *l2t = map_domain_page(mfn); + + l2e = l2t[l2_table_offset(addr)]; + unmap_domain_page(l2t); + } + + page_unlock(pg); + + return l2e; +} + void *do_page_walk(struct vcpu *v, unsigned long addr) { l3_pgentry_t l3e; diff --git a/xen/include/asm-x86/mm.h b/xen/include/asm-x86/mm.h index 320c6cd196..cd3e7ec501 100644 --- xen/include/asm-x86/mm.h.orig +++ xen/include/asm-x86/mm.h @@@@ -577,7 +577,9 @@@@ void audit_domains(void); void make_cr3(struct vcpu *v, mfn_t mfn); void update_cr3(struct vcpu *v); int vcpu_destroy_pagetables(struct vcpu *); + void *do_page_walk(struct vcpu *v, unsigned long addr); +l2_pgentry_t page_walk_get_l2e(pagetable_t root, unsigned long addr); int __sync_local_execstate(void); From: Jan Beulich Subject: x86/mm: avoid using linear page tables in guest_get_eff_kern_l1e() First of all drop guest_get_eff_l1e() entirely - there's no actual user of it: pv_ro_page_fault() has a guest_kernel_mode() conditional around its only call site. Then replace the linear L1 table access by an actual page walk. This is part of XSA-286. Reported-by: Jann Horn Signed-off-by: Jan Beulich Reviewed-by: George Dunlap Reviewed-by: Andrew Cooper diff --git a/xen/arch/x86/pv/mm.c b/xen/arch/x86/pv/mm.c index acebf9e957..7624447246 100644 --- xen/arch/x86/pv/mm.c.orig +++ xen/arch/x86/pv/mm.c @@@@ -59,27 +59,6 @@@@ l1_pgentry_t *map_guest_l1e(unsigned long linear, mfn_t *gl1mfn) } /* - * Read the guest's l1e that maps this address, from the kernel-mode - * page tables. - */ -static l1_pgentry_t guest_get_eff_kern_l1e(unsigned long linear) -{ - struct vcpu *curr = current; - const bool user_mode = !(curr->arch.flags & TF_kernel_mode); - l1_pgentry_t l1e; - - if ( user_mode ) - toggle_guest_pt(curr); - - l1e = guest_get_eff_l1e(linear); - - if ( user_mode ) - toggle_guest_pt(curr); - - return l1e; -} - -/* * Map a guest's LDT page (covering the byte at @@offset from start of the LDT) * into Xen's virtual range. Returns true if the mapping changed, false * otherwise. diff --git a/xen/arch/x86/pv/mm.h b/xen/arch/x86/pv/mm.h index a1bd473b29..43d33a1fd1 100644 --- xen/arch/x86/pv/mm.h.orig +++ xen/arch/x86/pv/mm.h @@@@ -5,19 +5,19 @@@@ l1_pgentry_t *map_guest_l1e(unsigned long linear, mfn_t *gl1mfn); int new_guest_cr3(mfn_t mfn); -/* Read a PV guest's l1e that maps this linear address. */ -static inline l1_pgentry_t guest_get_eff_l1e(unsigned long linear) +/* + * Read the guest's l1e that maps this address, from the kernel-mode + * page tables. + */ +static inline l1_pgentry_t guest_get_eff_kern_l1e(unsigned long linear) { - l1_pgentry_t l1e; + l1_pgentry_t l1e = l1e_empty(); ASSERT(!paging_mode_translate(current->domain)); ASSERT(!paging_mode_external(current->domain)); - if ( unlikely(!__addr_ok(linear)) || - __copy_from_user(&l1e, - &__linear_l1_table[l1_linear_offset(linear)], - sizeof(l1_pgentry_t)) ) - l1e = l1e_empty(); + if ( likely(__addr_ok(linear)) ) + l1e = page_walk_get_l1e(current->arch.guest_table, linear); return l1e; } diff --git a/xen/arch/x86/pv/ro-page-fault.c b/xen/arch/x86/pv/ro-page-fault.c index a920fb5e15..2bf4497a16 100644 --- xen/arch/x86/pv/ro-page-fault.c.orig +++ xen/arch/x86/pv/ro-page-fault.c @@@@ -357,7 +357,7 @@@@ int pv_ro_page_fault(unsigned long addr, struct cpu_user_regs *regs) bool mmio_ro; /* Attempt to read the PTE that maps the VA being accessed. */ - pte = guest_get_eff_l1e(addr); + pte = guest_get_eff_kern_l1e(addr); /* We are only looking for read-only mappings */ if ( ((l1e_get_flags(pte) & (_PAGE_PRESENT | _PAGE_RW)) != _PAGE_PRESENT) ) diff --git a/xen/arch/x86/x86_64/mm.c b/xen/arch/x86/x86_64/mm.c index 670aa3f892..c5686e0d25 100644 --- xen/arch/x86/x86_64/mm.c.orig +++ xen/arch/x86/x86_64/mm.c @@@@ -128,6 +128,62 @@@@ l2_pgentry_t page_walk_get_l2e(pagetable_t root, unsigned long addr) return l2e; } +/* + * For now no "set_accessed" parameter, as all callers want it set to true. + * For now also no "set_dirty" parameter, as all callers deal with r/o + * mappings, and we don't want to set the dirty bit there (conflicts with + * CET-SS). However, as there are CPUs which may set the dirty bit on r/o + * PTEs, the logic below tolerates the bit becoming set "behind our backs". + */ +l1_pgentry_t page_walk_get_l1e(pagetable_t root, unsigned long addr) +{ + l2_pgentry_t l2e = page_walk_get_l2e(root, addr); + mfn_t mfn = l2e_get_mfn(l2e); + struct page_info *pg; + l1_pgentry_t l1e = l1e_empty(); + + if ( !(l2e_get_flags(l2e) & _PAGE_PRESENT) || + (l2e_get_flags(l2e) & _PAGE_PSE) ) + return l1e_empty(); + + pg = mfn_to_page(mfn); + if ( !page_lock(pg) ) + return l1e_empty(); + + if ( (pg->u.inuse.type_info & PGT_type_mask) == PGT_l1_page_table ) + { + l1_pgentry_t *l1t = map_domain_page(mfn); + + l1e = l1t[l1_table_offset(addr)]; + + if ( (l1e_get_flags(l1e) & (_PAGE_ACCESSED | _PAGE_PRESENT)) == + _PAGE_PRESENT ) + { + l1_pgentry_t ol1e = l1e; + + l1e_add_flags(l1e, _PAGE_ACCESSED); + /* + * Best effort only; with the lock held the page shouldn't + * change anyway, except for the dirty bit to perhaps become set. + */ + while ( cmpxchg(&l1e_get_intpte(l1t[l1_table_offset(addr)]), + l1e_get_intpte(ol1e), l1e_get_intpte(l1e)) != + l1e_get_intpte(ol1e) && + !(l1e_get_flags(l1e) & _PAGE_DIRTY) ) + { + l1e_add_flags(ol1e, _PAGE_DIRTY); + l1e_add_flags(l1e, _PAGE_DIRTY); + } + } + + unmap_domain_page(l1t); + } + + page_unlock(pg); + + return l1e; +} + void *do_page_walk(struct vcpu *v, unsigned long addr) { l3_pgentry_t l3e; diff --git a/xen/include/asm-x86/mm.h b/xen/include/asm-x86/mm.h index cd3e7ec501..865db999c1 100644 --- xen/include/asm-x86/mm.h.orig +++ xen/include/asm-x86/mm.h @@@@ -580,6 +580,7 @@@@ int vcpu_destroy_pagetables(struct vcpu *); void *do_page_walk(struct vcpu *v, unsigned long addr); l2_pgentry_t page_walk_get_l2e(pagetable_t root, unsigned long addr); +l1_pgentry_t page_walk_get_l1e(pagetable_t root, unsigned long addr); int __sync_local_execstate(void); From: Jan Beulich Subject: x86/mm: avoid using top level linear page tables in {,un}map_domain_page() Move the page table recursion two levels down. This entails avoiding to free the recursive mapping prematurely in free_perdomain_mappings(). This is part of XSA-286. Reported-by: Jann Horn Signed-off-by: Jan Beulich Reviewed-by: George Dunlap Reviewed-by: Andrew Cooper diff --git a/xen/arch/x86/domain_page.c b/xen/arch/x86/domain_page.c index 4a07cfb18e..660bd06aaf 100644 --- xen/arch/x86/domain_page.c.orig +++ xen/arch/x86/domain_page.c @@@@ -65,7 +65,8 @@@@ void __init mapcache_override_current(struct vcpu *v) #define mapcache_l2_entry(e) ((e) >> PAGETABLE_ORDER) #define MAPCACHE_L2_ENTRIES (mapcache_l2_entry(MAPCACHE_ENTRIES - 1) + 1) #define MAPCACHE_L1ENT(idx) \ - __linear_l1_table[l1_linear_offset(MAPCACHE_VIRT_START + pfn_to_paddr(idx))] + ((l1_pgentry_t *)(MAPCACHE_VIRT_START | \ + ((L2_PAGETABLE_ENTRIES - 1) << L2_PAGETABLE_SHIFT)))[idx] void *map_domain_page(mfn_t mfn) { @@@@ -235,6 +236,7 @@@@ int mapcache_domain_init(struct domain *d) { struct mapcache_domain *dcache = &d->arch.pv.mapcache; unsigned int bitmap_pages; + int rc; ASSERT(is_pv_domain(d)); @@@@ -243,8 +245,10 @@@@ int mapcache_domain_init(struct domain *d) return 0; #endif + BUILD_BUG_ON(MAPCACHE_VIRT_START & ((1 << L3_PAGETABLE_SHIFT) - 1)); BUILD_BUG_ON(MAPCACHE_VIRT_END + PAGE_SIZE * (3 + - 2 * PFN_UP(BITS_TO_LONGS(MAPCACHE_ENTRIES) * sizeof(long))) > + 2 * PFN_UP(BITS_TO_LONGS(MAPCACHE_ENTRIES) * sizeof(long))) + + (1U << L2_PAGETABLE_SHIFT) > MAPCACHE_VIRT_START + (PERDOMAIN_SLOT_MBYTES << 20)); bitmap_pages = PFN_UP(BITS_TO_LONGS(MAPCACHE_ENTRIES) * sizeof(long)); dcache->inuse = (void *)MAPCACHE_VIRT_END + PAGE_SIZE; @@@@ -253,9 +257,25 @@@@ int mapcache_domain_init(struct domain *d) spin_lock_init(&dcache->lock); - return create_perdomain_mapping(d, (unsigned long)dcache->inuse, - 2 * bitmap_pages + 1, - NIL(l1_pgentry_t *), NULL); + rc = create_perdomain_mapping(d, (unsigned long)dcache->inuse, + 2 * bitmap_pages + 1, + NIL(l1_pgentry_t *), NULL); + if ( !rc ) + { + /* + * Install mapping of our L2 table into its own last slot, for easy + * access to the L1 entries via MAPCACHE_L1ENT(). + */ + l3_pgentry_t *l3t = __map_domain_page(d->arch.perdomain_l3_pg); + l3_pgentry_t l3e = l3t[l3_table_offset(MAPCACHE_VIRT_END)]; + l2_pgentry_t *l2t = map_l2t_from_l3e(l3e); + + l2e_get_intpte(l2t[L2_PAGETABLE_ENTRIES - 1]) = l3e_get_intpte(l3e); + unmap_domain_page(l2t); + unmap_domain_page(l3t); + } + + return rc; } int mapcache_vcpu_init(struct vcpu *v) @@@@ -346,7 +366,7 @@@@ mfn_t domain_page_map_to_mfn(const void *ptr) else { ASSERT(va >= MAPCACHE_VIRT_START && va < MAPCACHE_VIRT_END); - pl1e = &__linear_l1_table[l1_linear_offset(va)]; + pl1e = &MAPCACHE_L1ENT(PFN_DOWN(va - MAPCACHE_VIRT_START)); } return l1e_get_mfn(*pl1e); diff --git a/xen/arch/x86/mm.c b/xen/arch/x86/mm.c index 30dffb68e8..279664a83e 100644 --- xen/arch/x86/mm.c.orig +++ xen/arch/x86/mm.c @@@@ -6031,6 +6031,10 @@@@ void free_perdomain_mappings(struct domain *d) { struct page_info *l1pg = l2e_get_page(l2tab[j]); + /* mapcache_domain_init() installs a recursive entry. */ + if ( l1pg == l2pg ) + continue; + if ( l2e_get_flags(l2tab[j]) & _PAGE_AVAIL0 ) { l1_pgentry_t *l1tab = __map_domain_page(l1pg); From: Jan Beulich Subject: x86/mm: restrict use of linear page tables to shadow mode code Other code does not require them to be set up anymore, so restrict when to populate the respective L4 slot and reduce visibility of the accessors. While with the removal of all uses the vulnerability is actually fixed, removing the creation of the linear mapping adds an extra layer of protection. Similarly reducing visibility of the accessors mostly eliminates the risk of undue re-introduction of uses of the linear mappings. This is (not strictly) part of XSA-286. Signed-off-by: Jan Beulich Reviewed-by: George Dunlap Reviewed-by: Andrew Cooper diff --git a/xen/arch/x86/mm.c b/xen/arch/x86/mm.c index 279664a83e..fa0f813d29 100644 --- xen/arch/x86/mm.c.orig +++ xen/arch/x86/mm.c @@@@ -1757,9 +1757,10 @@@@ void init_xen_l4_slots(l4_pgentry_t *l4t, mfn_t l4mfn, l4t[l4_table_offset(PCI_MCFG_VIRT_START)] = idle_pg_table[l4_table_offset(PCI_MCFG_VIRT_START)]; - /* Slot 258: Self linear mappings. */ + /* Slot 258: Self linear mappings (shadow pt only). */ ASSERT(!mfn_eq(l4mfn, INVALID_MFN)); l4t[l4_table_offset(LINEAR_PT_VIRT_START)] = + !shadow_mode_external(d) ? l4e_empty() : l4e_from_mfn(l4mfn, __PAGE_HYPERVISOR_RW); /* Slot 259: Shadow linear mappings (if applicable) .*/ diff --git a/xen/arch/x86/mm/shadow/private.h b/xen/arch/x86/mm/shadow/private.h index 3217777921..b214087194 100644 --- xen/arch/x86/mm/shadow/private.h.orig +++ xen/arch/x86/mm/shadow/private.h @@@@ -135,6 +135,15 @@@@ enum { # define GUEST_PTE_SIZE 4 #endif +/* Where to find each level of the linear mapping */ +#define __linear_l1_table ((l1_pgentry_t *)(LINEAR_PT_VIRT_START)) +#define __linear_l2_table \ + ((l2_pgentry_t *)(__linear_l1_table + l1_linear_offset(LINEAR_PT_VIRT_START))) +#define __linear_l3_table \ + ((l3_pgentry_t *)(__linear_l2_table + l2_linear_offset(LINEAR_PT_VIRT_START))) +#define __linear_l4_table \ + ((l4_pgentry_t *)(__linear_l3_table + l3_linear_offset(LINEAR_PT_VIRT_START))) + /****************************************************************************** * Auditing routines */ diff --git a/xen/arch/x86/x86_64/mm.c b/xen/arch/x86/x86_64/mm.c index c5686e0d25..dcb20d1d9d 100644 --- xen/arch/x86/x86_64/mm.c.orig +++ xen/arch/x86/x86_64/mm.c @@@@ -833,9 +833,6 @@@@ void __init paging_init(void) machine_to_phys_mapping_valid = 1; - /* Set up linear page table mapping. */ - l4e_write(&idle_pg_table[l4_table_offset(LINEAR_PT_VIRT_START)], - l4e_from_paddr(__pa(idle_pg_table), __PAGE_HYPERVISOR_RW)); return; nomem: diff --git a/xen/include/asm-x86/config.h b/xen/include/asm-x86/config.h index 8d79a71398..9d587a076a 100644 --- xen/include/asm-x86/config.h.orig +++ xen/include/asm-x86/config.h @@@@ -193,7 +193,7 @@@@ extern unsigned char boot_edid_info[128]; */ #define PCI_MCFG_VIRT_START (PML4_ADDR(257)) #define PCI_MCFG_VIRT_END (PCI_MCFG_VIRT_START + PML4_ENTRY_BYTES) -/* Slot 258: linear page table (guest table). */ +/* Slot 258: linear page table (monitor table, HVM only). */ #define LINEAR_PT_VIRT_START (PML4_ADDR(258)) #define LINEAR_PT_VIRT_END (LINEAR_PT_VIRT_START + PML4_ENTRY_BYTES) /* Slot 259: linear page table (shadow table). */ diff --git a/xen/include/asm-x86/page.h b/xen/include/asm-x86/page.h index c1e92937c0..e72c277b9f 100644 --- xen/include/asm-x86/page.h.orig +++ xen/include/asm-x86/page.h @@@@ -274,19 +274,6 @@@@ void copy_page_sse2(void *, const void *); #define vmap_to_mfn(va) _mfn(l1e_get_pfn(*virt_to_xen_l1e((unsigned long)(va)))) #define vmap_to_page(va) mfn_to_page(vmap_to_mfn(va)) -#endif /* !defined(__ASSEMBLY__) */ - -/* Where to find each level of the linear mapping */ -#define __linear_l1_table ((l1_pgentry_t *)(LINEAR_PT_VIRT_START)) -#define __linear_l2_table \ - ((l2_pgentry_t *)(__linear_l1_table + l1_linear_offset(LINEAR_PT_VIRT_START))) -#define __linear_l3_table \ - ((l3_pgentry_t *)(__linear_l2_table + l2_linear_offset(LINEAR_PT_VIRT_START))) -#define __linear_l4_table \ - ((l4_pgentry_t *)(__linear_l3_table + l3_linear_offset(LINEAR_PT_VIRT_START))) - - -#ifndef __ASSEMBLY__ extern root_pgentry_t idle_pg_table[ROOT_PAGETABLE_ENTRIES]; extern l2_pgentry_t *compat_idle_pg_table_l2; extern unsigned int m2p_compat_vstart; @ 1.1 log @Add upstream security patches for XSA286, XSA345, XSA346, XSA347. Bump PKGREVISION. @ text @d1 1 a1 1 $NetBSD: $ @ 1.1.2.1 log @file patch-XSA286 was added on branch pkgsrc-2020Q3 on 2020-10-22 16:29:05 +0000 @ text @d1 716 @ 1.1.2.2 log @Pullup ticket #6355 - requested by bouyer sysutils/xenkernel411: security fix sysutils/xenkernel413: security fix Revisions pulled up: - sysutils/xenkernel411/Makefile 1.17 - sysutils/xenkernel411/distinfo 1.15 - sysutils/xenkernel411/patches/patch-XSA286 1.1 - sysutils/xenkernel411/patches/patch-XSA345 1.1 - sysutils/xenkernel411/patches/patch-XSA346 1.1 - sysutils/xenkernel411/patches/patch-XSA347 1.1 - sysutils/xenkernel413/Makefile 1.6 - sysutils/xenkernel413/distinfo 1.4 - sysutils/xenkernel413/patches/patch-XSA286 1.1 - sysutils/xenkernel413/patches/patch-XSA345 1.1 - sysutils/xenkernel413/patches/patch-XSA346 1.1 - sysutils/xenkernel413/patches/patch-XSA347 1.1 --- Module Name: pkgsrc Committed By: bouyer Date: Wed Oct 21 09:03:05 UTC 2020 Modified Files: pkgsrc/sysutils/xenkernel411: Makefile distinfo Added Files: pkgsrc/sysutils/xenkernel411/patches: patch-XSA286 patch-XSA345 patch-XSA346 patch-XSA347 Log Message: Add upstream security patches for XSA286, XSA345, XSA346, XSA347. Bump PKGREVISION. --- Module Name: pkgsrc Committed By: bouyer Date: Wed Oct 21 09:04:10 UTC 2020 Modified Files: pkgsrc/sysutils/xenkernel413: Makefile distinfo Added Files: pkgsrc/sysutils/xenkernel413/patches: patch-XSA286 patch-XSA345 patch-XSA346 patch-XSA347 Log Message: Add upstream security patches for XSA286, XSA345, XSA346, XSA347. Bump PKGREVISION. @ text @a0 716 $NetBSD: patch-XSA286,v 1.1 2020/10/21 09:04:10 bouyer Exp $ From: Jan Beulich Subject: x86/mm: split L4 and L3 parts of the walk out of do_page_walk() The L3 one at least is going to be re-used by a subsequent patch, and splitting the L4 one then as well seems only natural. This is part of XSA-286. Signed-off-by: Jan Beulich Reviewed-by: George Dunlap Reviewed-by: Andrew Cooper diff --git a/xen/arch/x86/x86_64/mm.c b/xen/arch/x86/x86_64/mm.c index db4f035d8d..b1582b56fb 100644 --- xen/arch/x86/x86_64/mm.c.orig +++ xen/arch/x86/x86_64/mm.c @@@@ -44,26 +44,47 @@@@ unsigned int __read_mostly m2p_compat_vstart = __HYPERVISOR_COMPAT_VIRT_START; l2_pgentry_t *compat_idle_pg_table_l2; -void *do_page_walk(struct vcpu *v, unsigned long addr) +static l4_pgentry_t page_walk_get_l4e(pagetable_t root, unsigned long addr) { - unsigned long mfn = pagetable_get_pfn(v->arch.guest_table); - l4_pgentry_t l4e, *l4t; - l3_pgentry_t l3e, *l3t; - l2_pgentry_t l2e, *l2t; - l1_pgentry_t l1e, *l1t; + unsigned long mfn = pagetable_get_pfn(root); + l4_pgentry_t *l4t, l4e; - if ( !is_pv_vcpu(v) || !is_canonical_address(addr) ) - return NULL; + if ( !is_canonical_address(addr) ) + return l4e_empty(); l4t = map_domain_page(_mfn(mfn)); l4e = l4t[l4_table_offset(addr)]; unmap_domain_page(l4t); + + return l4e; +} + +static l3_pgentry_t page_walk_get_l3e(pagetable_t root, unsigned long addr) +{ + l4_pgentry_t l4e = page_walk_get_l4e(root, addr); + l3_pgentry_t *l3t, l3e; + if ( !(l4e_get_flags(l4e) & _PAGE_PRESENT) ) - return NULL; + return l3e_empty(); l3t = map_l3t_from_l4e(l4e); l3e = l3t[l3_table_offset(addr)]; unmap_domain_page(l3t); + + return l3e; +} + +void *do_page_walk(struct vcpu *v, unsigned long addr) +{ + l3_pgentry_t l3e; + l2_pgentry_t l2e, *l2t; + l1_pgentry_t l1e, *l1t; + unsigned long mfn; + + if ( !is_pv_vcpu(v) ) + return NULL; + + l3e = page_walk_get_l3e(v->arch.guest_table, addr); mfn = l3e_get_pfn(l3e); if ( !(l3e_get_flags(l3e) & _PAGE_PRESENT) || !mfn_valid(_mfn(mfn)) ) return NULL; From: Jan Beulich Subject: x86/mm: check page types in do_page_walk() For page table entries read to be guaranteed valid, transiently locking the pages and validating their types is necessary. Note that guest use of linear page tables is intentionally not taken into account here, as ordinary data (guest stacks) can't possibly live inside page tables. This is part of XSA-286. Signed-off-by: Jan Beulich Reviewed-by: George Dunlap Reviewed-by: Andrew Cooper diff --git a/xen/arch/x86/x86_64/mm.c b/xen/arch/x86/x86_64/mm.c index b1582b56fb..7d439639b7 100644 --- xen/arch/x86/x86_64/mm.c.orig +++ xen/arch/x86/x86_64/mm.c @@@@ -46,15 +46,29 @@@@ l2_pgentry_t *compat_idle_pg_table_l2; static l4_pgentry_t page_walk_get_l4e(pagetable_t root, unsigned long addr) { - unsigned long mfn = pagetable_get_pfn(root); - l4_pgentry_t *l4t, l4e; + mfn_t mfn = pagetable_get_mfn(root); + /* current's root page table can't disappear under our feet. */ + bool need_lock = !mfn_eq(mfn, pagetable_get_mfn(current->arch.guest_table)); + struct page_info *pg; + l4_pgentry_t l4e = l4e_empty(); if ( !is_canonical_address(addr) ) return l4e_empty(); - l4t = map_domain_page(_mfn(mfn)); - l4e = l4t[l4_table_offset(addr)]; - unmap_domain_page(l4t); + pg = mfn_to_page(mfn); + if ( need_lock && !page_lock(pg) ) + return l4e_empty(); + + if ( (pg->u.inuse.type_info & PGT_type_mask) == PGT_l4_page_table ) + { + l4_pgentry_t *l4t = map_domain_page(mfn); + + l4e = l4t[l4_table_offset(addr)]; + unmap_domain_page(l4t); + } + + if ( need_lock ) + page_unlock(pg); return l4e; } @@@@ -62,14 +76,26 @@@@ static l4_pgentry_t page_walk_get_l4e(pagetable_t root, unsigned long addr) static l3_pgentry_t page_walk_get_l3e(pagetable_t root, unsigned long addr) { l4_pgentry_t l4e = page_walk_get_l4e(root, addr); - l3_pgentry_t *l3t, l3e; + mfn_t mfn = l4e_get_mfn(l4e); + struct page_info *pg; + l3_pgentry_t l3e = l3e_empty(); if ( !(l4e_get_flags(l4e) & _PAGE_PRESENT) ) return l3e_empty(); - l3t = map_l3t_from_l4e(l4e); - l3e = l3t[l3_table_offset(addr)]; - unmap_domain_page(l3t); + pg = mfn_to_page(mfn); + if ( !page_lock(pg) ) + return l3e_empty(); + + if ( (pg->u.inuse.type_info & PGT_type_mask) == PGT_l3_page_table ) + { + l3_pgentry_t *l3t = map_domain_page(mfn); + + l3e = l3t[l3_table_offset(addr)]; + unmap_domain_page(l3t); + } + + page_unlock(pg); return l3e; } @@@@ -77,44 +103,67 @@@@ static l3_pgentry_t page_walk_get_l3e(pagetable_t root, unsigned long addr) void *do_page_walk(struct vcpu *v, unsigned long addr) { l3_pgentry_t l3e; - l2_pgentry_t l2e, *l2t; - l1_pgentry_t l1e, *l1t; - unsigned long mfn; + l2_pgentry_t l2e = l2e_empty(); + l1_pgentry_t l1e = l1e_empty(); + mfn_t mfn; + struct page_info *pg; if ( !is_pv_vcpu(v) ) return NULL; l3e = page_walk_get_l3e(v->arch.guest_table, addr); - mfn = l3e_get_pfn(l3e); - if ( !(l3e_get_flags(l3e) & _PAGE_PRESENT) || !mfn_valid(_mfn(mfn)) ) + mfn = l3e_get_mfn(l3e); + if ( !(l3e_get_flags(l3e) & _PAGE_PRESENT) || !mfn_valid(mfn) ) return NULL; if ( (l3e_get_flags(l3e) & _PAGE_PSE) ) { - mfn += PFN_DOWN(addr & ((1UL << L3_PAGETABLE_SHIFT) - 1)); + mfn = mfn_add(mfn, PFN_DOWN(addr & ((1UL << L3_PAGETABLE_SHIFT) - 1))); goto ret; } - l2t = map_domain_page(_mfn(mfn)); - l2e = l2t[l2_table_offset(addr)]; - unmap_domain_page(l2t); - mfn = l2e_get_pfn(l2e); - if ( !(l2e_get_flags(l2e) & _PAGE_PRESENT) || !mfn_valid(_mfn(mfn)) ) + pg = mfn_to_page(mfn); + if ( !page_lock(pg) ) + return NULL; + + if ( (pg->u.inuse.type_info & PGT_type_mask) == PGT_l2_page_table ) + { + const l2_pgentry_t *l2t = map_domain_page(mfn); + + l2e = l2t[l2_table_offset(addr)]; + unmap_domain_page(l2t); + } + + page_unlock(pg); + + mfn = l2e_get_mfn(l2e); + if ( !(l2e_get_flags(l2e) & _PAGE_PRESENT) || !mfn_valid(mfn) ) return NULL; if ( (l2e_get_flags(l2e) & _PAGE_PSE) ) { - mfn += PFN_DOWN(addr & ((1UL << L2_PAGETABLE_SHIFT) - 1)); + mfn = mfn_add(mfn, PFN_DOWN(addr & ((1UL << L2_PAGETABLE_SHIFT) - 1))); goto ret; } - l1t = map_domain_page(_mfn(mfn)); - l1e = l1t[l1_table_offset(addr)]; - unmap_domain_page(l1t); - mfn = l1e_get_pfn(l1e); - if ( !(l1e_get_flags(l1e) & _PAGE_PRESENT) || !mfn_valid(_mfn(mfn)) ) + pg = mfn_to_page(mfn); + if ( !page_lock(pg) ) + return NULL; + + if ( (pg->u.inuse.type_info & PGT_type_mask) == PGT_l1_page_table ) + { + const l1_pgentry_t *l1t = map_domain_page(mfn); + + l1e = l1t[l1_table_offset(addr)]; + unmap_domain_page(l1t); + } + + page_unlock(pg); + + mfn = l1e_get_mfn(l1e); + if ( !(l1e_get_flags(l1e) & _PAGE_PRESENT) || !mfn_valid(mfn) ) return NULL; ret: - return map_domain_page(_mfn(mfn)) + (addr & ~PAGE_MASK); + return map_domain_page(mfn) + (addr & ~PAGE_MASK); } /* From: Jan Beulich Subject: x86/mm: avoid using linear page tables in map_guest_l1e() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Replace the linear L2 table access by an actual page walk. This is part of XSA-286. Reported-by: Jann Horn Signed-off-by: Jan Beulich Signed-off-by: Roger Pau Monné Reviewed-by: George Dunlap Reviewed-by: Andrew Cooper diff --git a/xen/arch/x86/pv/mm.c b/xen/arch/x86/pv/mm.c index 2b0dadc8da..acebf9e957 100644 --- xen/arch/x86/pv/mm.c.orig +++ xen/arch/x86/pv/mm.c @@@@ -40,11 +40,14 @@@@ l1_pgentry_t *map_guest_l1e(unsigned long linear, mfn_t *gl1mfn) if ( unlikely(!__addr_ok(linear)) ) return NULL; - /* Find this l1e and its enclosing l1mfn in the linear map. */ - if ( __copy_from_user(&l2e, - &__linear_l2_table[l2_linear_offset(linear)], - sizeof(l2_pgentry_t)) ) + if ( unlikely(!(current->arch.flags & TF_kernel_mode)) ) + { + ASSERT_UNREACHABLE(); return NULL; + } + + /* Find this l1e and its enclosing l1mfn. */ + l2e = page_walk_get_l2e(current->arch.guest_table, linear); /* Check flags that it will be safe to read the l1e. */ if ( (l2e_get_flags(l2e) & (_PAGE_PRESENT | _PAGE_PSE)) != _PAGE_PRESENT ) diff --git a/xen/arch/x86/x86_64/mm.c b/xen/arch/x86/x86_64/mm.c index 7d439639b7..670aa3f892 100644 --- xen/arch/x86/x86_64/mm.c.orig +++ xen/arch/x86/x86_64/mm.c @@@@ -100,6 +100,34 @@@@ static l3_pgentry_t page_walk_get_l3e(pagetable_t root, unsigned long addr) return l3e; } +l2_pgentry_t page_walk_get_l2e(pagetable_t root, unsigned long addr) +{ + l3_pgentry_t l3e = page_walk_get_l3e(root, addr); + mfn_t mfn = l3e_get_mfn(l3e); + struct page_info *pg; + l2_pgentry_t l2e = l2e_empty(); + + if ( !(l3e_get_flags(l3e) & _PAGE_PRESENT) || + (l3e_get_flags(l3e) & _PAGE_PSE) ) + return l2e_empty(); + + pg = mfn_to_page(mfn); + if ( !page_lock(pg) ) + return l2e_empty(); + + if ( (pg->u.inuse.type_info & PGT_type_mask) == PGT_l2_page_table ) + { + l2_pgentry_t *l2t = map_domain_page(mfn); + + l2e = l2t[l2_table_offset(addr)]; + unmap_domain_page(l2t); + } + + page_unlock(pg); + + return l2e; +} + void *do_page_walk(struct vcpu *v, unsigned long addr) { l3_pgentry_t l3e; diff --git a/xen/include/asm-x86/mm.h b/xen/include/asm-x86/mm.h index 320c6cd196..cd3e7ec501 100644 --- xen/include/asm-x86/mm.h.orig +++ xen/include/asm-x86/mm.h @@@@ -577,7 +577,9 @@@@ void audit_domains(void); void make_cr3(struct vcpu *v, mfn_t mfn); void update_cr3(struct vcpu *v); int vcpu_destroy_pagetables(struct vcpu *); + void *do_page_walk(struct vcpu *v, unsigned long addr); +l2_pgentry_t page_walk_get_l2e(pagetable_t root, unsigned long addr); int __sync_local_execstate(void); From: Jan Beulich Subject: x86/mm: avoid using linear page tables in guest_get_eff_kern_l1e() First of all drop guest_get_eff_l1e() entirely - there's no actual user of it: pv_ro_page_fault() has a guest_kernel_mode() conditional around its only call site. Then replace the linear L1 table access by an actual page walk. This is part of XSA-286. Reported-by: Jann Horn Signed-off-by: Jan Beulich Reviewed-by: George Dunlap Reviewed-by: Andrew Cooper diff --git a/xen/arch/x86/pv/mm.c b/xen/arch/x86/pv/mm.c index acebf9e957..7624447246 100644 --- xen/arch/x86/pv/mm.c.orig +++ xen/arch/x86/pv/mm.c @@@@ -59,27 +59,6 @@@@ l1_pgentry_t *map_guest_l1e(unsigned long linear, mfn_t *gl1mfn) } /* - * Read the guest's l1e that maps this address, from the kernel-mode - * page tables. - */ -static l1_pgentry_t guest_get_eff_kern_l1e(unsigned long linear) -{ - struct vcpu *curr = current; - const bool user_mode = !(curr->arch.flags & TF_kernel_mode); - l1_pgentry_t l1e; - - if ( user_mode ) - toggle_guest_pt(curr); - - l1e = guest_get_eff_l1e(linear); - - if ( user_mode ) - toggle_guest_pt(curr); - - return l1e; -} - -/* * Map a guest's LDT page (covering the byte at @@offset from start of the LDT) * into Xen's virtual range. Returns true if the mapping changed, false * otherwise. diff --git a/xen/arch/x86/pv/mm.h b/xen/arch/x86/pv/mm.h index a1bd473b29..43d33a1fd1 100644 --- xen/arch/x86/pv/mm.h.orig +++ xen/arch/x86/pv/mm.h @@@@ -5,19 +5,19 @@@@ l1_pgentry_t *map_guest_l1e(unsigned long linear, mfn_t *gl1mfn); int new_guest_cr3(mfn_t mfn); -/* Read a PV guest's l1e that maps this linear address. */ -static inline l1_pgentry_t guest_get_eff_l1e(unsigned long linear) +/* + * Read the guest's l1e that maps this address, from the kernel-mode + * page tables. + */ +static inline l1_pgentry_t guest_get_eff_kern_l1e(unsigned long linear) { - l1_pgentry_t l1e; + l1_pgentry_t l1e = l1e_empty(); ASSERT(!paging_mode_translate(current->domain)); ASSERT(!paging_mode_external(current->domain)); - if ( unlikely(!__addr_ok(linear)) || - __copy_from_user(&l1e, - &__linear_l1_table[l1_linear_offset(linear)], - sizeof(l1_pgentry_t)) ) - l1e = l1e_empty(); + if ( likely(__addr_ok(linear)) ) + l1e = page_walk_get_l1e(current->arch.guest_table, linear); return l1e; } diff --git a/xen/arch/x86/pv/ro-page-fault.c b/xen/arch/x86/pv/ro-page-fault.c index a920fb5e15..2bf4497a16 100644 --- xen/arch/x86/pv/ro-page-fault.c.orig +++ xen/arch/x86/pv/ro-page-fault.c @@@@ -357,7 +357,7 @@@@ int pv_ro_page_fault(unsigned long addr, struct cpu_user_regs *regs) bool mmio_ro; /* Attempt to read the PTE that maps the VA being accessed. */ - pte = guest_get_eff_l1e(addr); + pte = guest_get_eff_kern_l1e(addr); /* We are only looking for read-only mappings */ if ( ((l1e_get_flags(pte) & (_PAGE_PRESENT | _PAGE_RW)) != _PAGE_PRESENT) ) diff --git a/xen/arch/x86/x86_64/mm.c b/xen/arch/x86/x86_64/mm.c index 670aa3f892..c5686e0d25 100644 --- xen/arch/x86/x86_64/mm.c.orig +++ xen/arch/x86/x86_64/mm.c @@@@ -128,6 +128,62 @@@@ l2_pgentry_t page_walk_get_l2e(pagetable_t root, unsigned long addr) return l2e; } +/* + * For now no "set_accessed" parameter, as all callers want it set to true. + * For now also no "set_dirty" parameter, as all callers deal with r/o + * mappings, and we don't want to set the dirty bit there (conflicts with + * CET-SS). However, as there are CPUs which may set the dirty bit on r/o + * PTEs, the logic below tolerates the bit becoming set "behind our backs". + */ +l1_pgentry_t page_walk_get_l1e(pagetable_t root, unsigned long addr) +{ + l2_pgentry_t l2e = page_walk_get_l2e(root, addr); + mfn_t mfn = l2e_get_mfn(l2e); + struct page_info *pg; + l1_pgentry_t l1e = l1e_empty(); + + if ( !(l2e_get_flags(l2e) & _PAGE_PRESENT) || + (l2e_get_flags(l2e) & _PAGE_PSE) ) + return l1e_empty(); + + pg = mfn_to_page(mfn); + if ( !page_lock(pg) ) + return l1e_empty(); + + if ( (pg->u.inuse.type_info & PGT_type_mask) == PGT_l1_page_table ) + { + l1_pgentry_t *l1t = map_domain_page(mfn); + + l1e = l1t[l1_table_offset(addr)]; + + if ( (l1e_get_flags(l1e) & (_PAGE_ACCESSED | _PAGE_PRESENT)) == + _PAGE_PRESENT ) + { + l1_pgentry_t ol1e = l1e; + + l1e_add_flags(l1e, _PAGE_ACCESSED); + /* + * Best effort only; with the lock held the page shouldn't + * change anyway, except for the dirty bit to perhaps become set. + */ + while ( cmpxchg(&l1e_get_intpte(l1t[l1_table_offset(addr)]), + l1e_get_intpte(ol1e), l1e_get_intpte(l1e)) != + l1e_get_intpte(ol1e) && + !(l1e_get_flags(l1e) & _PAGE_DIRTY) ) + { + l1e_add_flags(ol1e, _PAGE_DIRTY); + l1e_add_flags(l1e, _PAGE_DIRTY); + } + } + + unmap_domain_page(l1t); + } + + page_unlock(pg); + + return l1e; +} + void *do_page_walk(struct vcpu *v, unsigned long addr) { l3_pgentry_t l3e; diff --git a/xen/include/asm-x86/mm.h b/xen/include/asm-x86/mm.h index cd3e7ec501..865db999c1 100644 --- xen/include/asm-x86/mm.h.orig +++ xen/include/asm-x86/mm.h @@@@ -580,6 +580,7 @@@@ int vcpu_destroy_pagetables(struct vcpu *); void *do_page_walk(struct vcpu *v, unsigned long addr); l2_pgentry_t page_walk_get_l2e(pagetable_t root, unsigned long addr); +l1_pgentry_t page_walk_get_l1e(pagetable_t root, unsigned long addr); int __sync_local_execstate(void); From: Jan Beulich Subject: x86/mm: avoid using top level linear page tables in {,un}map_domain_page() Move the page table recursion two levels down. This entails avoiding to free the recursive mapping prematurely in free_perdomain_mappings(). This is part of XSA-286. Reported-by: Jann Horn Signed-off-by: Jan Beulich Reviewed-by: George Dunlap Reviewed-by: Andrew Cooper diff --git a/xen/arch/x86/domain_page.c b/xen/arch/x86/domain_page.c index 4a07cfb18e..660bd06aaf 100644 --- xen/arch/x86/domain_page.c.orig +++ xen/arch/x86/domain_page.c @@@@ -65,7 +65,8 @@@@ void __init mapcache_override_current(struct vcpu *v) #define mapcache_l2_entry(e) ((e) >> PAGETABLE_ORDER) #define MAPCACHE_L2_ENTRIES (mapcache_l2_entry(MAPCACHE_ENTRIES - 1) + 1) #define MAPCACHE_L1ENT(idx) \ - __linear_l1_table[l1_linear_offset(MAPCACHE_VIRT_START + pfn_to_paddr(idx))] + ((l1_pgentry_t *)(MAPCACHE_VIRT_START | \ + ((L2_PAGETABLE_ENTRIES - 1) << L2_PAGETABLE_SHIFT)))[idx] void *map_domain_page(mfn_t mfn) { @@@@ -235,6 +236,7 @@@@ int mapcache_domain_init(struct domain *d) { struct mapcache_domain *dcache = &d->arch.pv.mapcache; unsigned int bitmap_pages; + int rc; ASSERT(is_pv_domain(d)); @@@@ -243,8 +245,10 @@@@ int mapcache_domain_init(struct domain *d) return 0; #endif + BUILD_BUG_ON(MAPCACHE_VIRT_START & ((1 << L3_PAGETABLE_SHIFT) - 1)); BUILD_BUG_ON(MAPCACHE_VIRT_END + PAGE_SIZE * (3 + - 2 * PFN_UP(BITS_TO_LONGS(MAPCACHE_ENTRIES) * sizeof(long))) > + 2 * PFN_UP(BITS_TO_LONGS(MAPCACHE_ENTRIES) * sizeof(long))) + + (1U << L2_PAGETABLE_SHIFT) > MAPCACHE_VIRT_START + (PERDOMAIN_SLOT_MBYTES << 20)); bitmap_pages = PFN_UP(BITS_TO_LONGS(MAPCACHE_ENTRIES) * sizeof(long)); dcache->inuse = (void *)MAPCACHE_VIRT_END + PAGE_SIZE; @@@@ -253,9 +257,25 @@@@ int mapcache_domain_init(struct domain *d) spin_lock_init(&dcache->lock); - return create_perdomain_mapping(d, (unsigned long)dcache->inuse, - 2 * bitmap_pages + 1, - NIL(l1_pgentry_t *), NULL); + rc = create_perdomain_mapping(d, (unsigned long)dcache->inuse, + 2 * bitmap_pages + 1, + NIL(l1_pgentry_t *), NULL); + if ( !rc ) + { + /* + * Install mapping of our L2 table into its own last slot, for easy + * access to the L1 entries via MAPCACHE_L1ENT(). + */ + l3_pgentry_t *l3t = __map_domain_page(d->arch.perdomain_l3_pg); + l3_pgentry_t l3e = l3t[l3_table_offset(MAPCACHE_VIRT_END)]; + l2_pgentry_t *l2t = map_l2t_from_l3e(l3e); + + l2e_get_intpte(l2t[L2_PAGETABLE_ENTRIES - 1]) = l3e_get_intpte(l3e); + unmap_domain_page(l2t); + unmap_domain_page(l3t); + } + + return rc; } int mapcache_vcpu_init(struct vcpu *v) @@@@ -346,7 +366,7 @@@@ mfn_t domain_page_map_to_mfn(const void *ptr) else { ASSERT(va >= MAPCACHE_VIRT_START && va < MAPCACHE_VIRT_END); - pl1e = &__linear_l1_table[l1_linear_offset(va)]; + pl1e = &MAPCACHE_L1ENT(PFN_DOWN(va - MAPCACHE_VIRT_START)); } return l1e_get_mfn(*pl1e); diff --git a/xen/arch/x86/mm.c b/xen/arch/x86/mm.c index 30dffb68e8..279664a83e 100644 --- xen/arch/x86/mm.c.orig +++ xen/arch/x86/mm.c @@@@ -6031,6 +6031,10 @@@@ void free_perdomain_mappings(struct domain *d) { struct page_info *l1pg = l2e_get_page(l2tab[j]); + /* mapcache_domain_init() installs a recursive entry. */ + if ( l1pg == l2pg ) + continue; + if ( l2e_get_flags(l2tab[j]) & _PAGE_AVAIL0 ) { l1_pgentry_t *l1tab = __map_domain_page(l1pg); From: Jan Beulich Subject: x86/mm: restrict use of linear page tables to shadow mode code Other code does not require them to be set up anymore, so restrict when to populate the respective L4 slot and reduce visibility of the accessors. While with the removal of all uses the vulnerability is actually fixed, removing the creation of the linear mapping adds an extra layer of protection. Similarly reducing visibility of the accessors mostly eliminates the risk of undue re-introduction of uses of the linear mappings. This is (not strictly) part of XSA-286. Signed-off-by: Jan Beulich Reviewed-by: George Dunlap Reviewed-by: Andrew Cooper diff --git a/xen/arch/x86/mm.c b/xen/arch/x86/mm.c index 279664a83e..fa0f813d29 100644 --- xen/arch/x86/mm.c.orig +++ xen/arch/x86/mm.c @@@@ -1757,9 +1757,10 @@@@ void init_xen_l4_slots(l4_pgentry_t *l4t, mfn_t l4mfn, l4t[l4_table_offset(PCI_MCFG_VIRT_START)] = idle_pg_table[l4_table_offset(PCI_MCFG_VIRT_START)]; - /* Slot 258: Self linear mappings. */ + /* Slot 258: Self linear mappings (shadow pt only). */ ASSERT(!mfn_eq(l4mfn, INVALID_MFN)); l4t[l4_table_offset(LINEAR_PT_VIRT_START)] = + !shadow_mode_external(d) ? l4e_empty() : l4e_from_mfn(l4mfn, __PAGE_HYPERVISOR_RW); /* Slot 259: Shadow linear mappings (if applicable) .*/ diff --git a/xen/arch/x86/mm/shadow/private.h b/xen/arch/x86/mm/shadow/private.h index 3217777921..b214087194 100644 --- xen/arch/x86/mm/shadow/private.h.orig +++ xen/arch/x86/mm/shadow/private.h @@@@ -135,6 +135,15 @@@@ enum { # define GUEST_PTE_SIZE 4 #endif +/* Where to find each level of the linear mapping */ +#define __linear_l1_table ((l1_pgentry_t *)(LINEAR_PT_VIRT_START)) +#define __linear_l2_table \ + ((l2_pgentry_t *)(__linear_l1_table + l1_linear_offset(LINEAR_PT_VIRT_START))) +#define __linear_l3_table \ + ((l3_pgentry_t *)(__linear_l2_table + l2_linear_offset(LINEAR_PT_VIRT_START))) +#define __linear_l4_table \ + ((l4_pgentry_t *)(__linear_l3_table + l3_linear_offset(LINEAR_PT_VIRT_START))) + /****************************************************************************** * Auditing routines */ diff --git a/xen/arch/x86/x86_64/mm.c b/xen/arch/x86/x86_64/mm.c index c5686e0d25..dcb20d1d9d 100644 --- xen/arch/x86/x86_64/mm.c.orig +++ xen/arch/x86/x86_64/mm.c @@@@ -833,9 +833,6 @@@@ void __init paging_init(void) machine_to_phys_mapping_valid = 1; - /* Set up linear page table mapping. */ - l4e_write(&idle_pg_table[l4_table_offset(LINEAR_PT_VIRT_START)], - l4e_from_paddr(__pa(idle_pg_table), __PAGE_HYPERVISOR_RW)); return; nomem: diff --git a/xen/include/asm-x86/config.h b/xen/include/asm-x86/config.h index 8d79a71398..9d587a076a 100644 --- xen/include/asm-x86/config.h.orig +++ xen/include/asm-x86/config.h @@@@ -193,7 +193,7 @@@@ extern unsigned char boot_edid_info[128]; */ #define PCI_MCFG_VIRT_START (PML4_ADDR(257)) #define PCI_MCFG_VIRT_END (PCI_MCFG_VIRT_START + PML4_ENTRY_BYTES) -/* Slot 258: linear page table (guest table). */ +/* Slot 258: linear page table (monitor table, HVM only). */ #define LINEAR_PT_VIRT_START (PML4_ADDR(258)) #define LINEAR_PT_VIRT_END (LINEAR_PT_VIRT_START + PML4_ENTRY_BYTES) /* Slot 259: linear page table (shadow table). */ diff --git a/xen/include/asm-x86/page.h b/xen/include/asm-x86/page.h index c1e92937c0..e72c277b9f 100644 --- xen/include/asm-x86/page.h.orig +++ xen/include/asm-x86/page.h @@@@ -274,19 +274,6 @@@@ void copy_page_sse2(void *, const void *); #define vmap_to_mfn(va) _mfn(l1e_get_pfn(*virt_to_xen_l1e((unsigned long)(va)))) #define vmap_to_page(va) mfn_to_page(vmap_to_mfn(va)) -#endif /* !defined(__ASSEMBLY__) */ - -/* Where to find each level of the linear mapping */ -#define __linear_l1_table ((l1_pgentry_t *)(LINEAR_PT_VIRT_START)) -#define __linear_l2_table \ - ((l2_pgentry_t *)(__linear_l1_table + l1_linear_offset(LINEAR_PT_VIRT_START))) -#define __linear_l3_table \ - ((l3_pgentry_t *)(__linear_l2_table + l2_linear_offset(LINEAR_PT_VIRT_START))) -#define __linear_l4_table \ - ((l4_pgentry_t *)(__linear_l3_table + l3_linear_offset(LINEAR_PT_VIRT_START))) - - -#ifndef __ASSEMBLY__ extern root_pgentry_t idle_pg_table[ROOT_PAGETABLE_ENTRIES]; extern l2_pgentry_t *compat_idle_pg_table_l2; extern unsigned int m2p_compat_vstart; @