head	1.3;
access;
symbols
	pkgsrc-2017Q4:1.2.0.2
	pkgsrc-2017Q4-base:1.2
	pkgsrc-2017Q3:1.1.0.2;
locks; strict;
comment	@# @;


1.3
date	2018.01.24.23.29.32;	author bouyer;	state dead;
branches;
next	1.2;
commitid	WktdwS8UoS8bvboA;

1.2
date	2017.12.15.14.02.15;	author bouyer;	state Exp;
branches
	1.2.2.1;
next	1.1;
commitid	0ZGmm4R3nBw9FZiA;

1.1
date	2017.10.17.08.42.30;	author bouyer;	state Exp;
branches
	1.1.2.1;
next	;
commitid	OJpItiWkoMToMnbA;

1.2.2.1
date	2018.01.28.15.23.24;	author bsiegert;	state dead;
branches;
next	;
commitid	hLOFEOUtck6sHEoA;

1.1.2.1
date	2017.10.17.08.42.30;	author bsiegert;	state dead;
branches;
next	1.1.2.2;
commitid	fTgNkUKfFJMQdrbA;

1.1.2.2
date	2017.10.17.19.02.25;	author bsiegert;	state Exp;
branches;
next	;
commitid	fTgNkUKfFJMQdrbA;


desc
@@


1.3
log
@Update xen 4.8 packages to 4.8.3. Changes since 4.8.2: include patches from
all security advisory up to and including XSA254.

While there pass XEN_VENDORVERSION=nb${PKGREVISION} to make so that
'xl info' shows the NetBSD PKGREVISION. If PKGREVISION is not available,
define this as 'nb0'.
@
text
@$NetBSD: patch-XSA240,v 1.2 2017/12/15 14:02:15 bouyer Exp $

From 2315b8c651e0cc31c9153d09c9912b8fbe632ad2 Mon Sep 17 00:00:00 2001
From: Jan Beulich <jbeulich@@suse.com>
Date: Thu, 28 Sep 2017 15:17:25 +0100
Subject: [PATCH 1/2] x86: limit linear page table use to a single level

That's the only way that they're meant to be used. Without such a
restriction arbitrarily long chains of same-level page tables can be
built, tearing down of which may then cause arbitrarily deep recursion,
causing a stack overflow. To facilitate this restriction, a counter is
being introduced to track both the number of same-level entries in a
page table as well as the number of uses of a page table in another
same-level one (counting into positive and negative direction
respectively, utilizing the fact that both counts can't be non-zero at
the same time).

Note that the added accounting introduces a restriction on the number
of times a page can be used in other same-level page tables - more than
32k of such uses are no longer possible.

Note also that some put_page_and_type[_preemptible]() calls are
replaced with open-coded equivalents.  This seemed preferrable to
adding "parent_table" to the matrix of functions.

Note further that cross-domain same-level page table references are no
longer permitted (they probably never should have been).

This is XSA-240.

Reported-by: Jann Horn <jannh@@google.com>
Signed-off-by: Jan Beulich <jbeulich@@suse.com>
Signed-off-by: George Dunlap <george.dunlap@@citrix.com>
---
 xen/arch/x86/domain.c        |   1 +
 xen/arch/x86/mm.c            | 171 ++++++++++++++++++++++++++++++++++++++-----
 xen/include/asm-x86/domain.h |   2 +
 xen/include/asm-x86/mm.h     |  25 +++++--
 4 files changed, 175 insertions(+), 24 deletions(-)

diff --git a/xen/arch/x86/domain.c b/xen/arch/x86/domain.c
index a725b43a67..5265b0496c 100644
--- xen/arch/x86/domain.c.orig
+++ xen/arch/x86/domain.c
@@@@ -1245,6 +1245,7 @@@@ int arch_set_info_guest(
                     rc = -ERESTART;
                     /* Fallthrough */
                 case -ERESTART:
+                    v->arch.old_guest_ptpg = NULL;
                     v->arch.old_guest_table =
                         pagetable_get_page(v->arch.guest_table);
                     v->arch.guest_table = pagetable_null();
diff --git a/xen/arch/x86/mm.c b/xen/arch/x86/mm.c
index a40461d4d6..31d4a03840 100644
--- xen/arch/x86/mm.c.orig
+++ xen/arch/x86/mm.c
@@@@ -733,6 +733,61 @@@@ static void put_data_page(
         put_page(page);
 }
 
+static bool inc_linear_entries(struct page_info *pg)
+{
+    typeof(pg->linear_pt_count) nc = read_atomic(&pg->linear_pt_count), oc;
+
+    do {
+        /*
+         * The check below checks for the "linear use" count being non-zero
+         * as well as overflow.  Signed integer overflow is undefined behavior
+         * according to the C spec.  However, as long as linear_pt_count is
+         * smaller in size than 'int', the arithmetic operation of the
+         * increment below won't overflow; rather the result will be truncated
+         * when stored.  Ensure that this is always true.
+         */
+        BUILD_BUG_ON(sizeof(nc) >= sizeof(int));
+        oc = nc++;
+        if ( nc <= 0 )
+            return false;
+        nc = cmpxchg(&pg->linear_pt_count, oc, nc);
+    } while ( oc != nc );
+
+    return true;
+}
+
+static void dec_linear_entries(struct page_info *pg)
+{
+    typeof(pg->linear_pt_count) oc;
+
+    oc = arch_fetch_and_add(&pg->linear_pt_count, -1);
+    ASSERT(oc > 0);
+}
+
+static bool inc_linear_uses(struct page_info *pg)
+{
+    typeof(pg->linear_pt_count) nc = read_atomic(&pg->linear_pt_count), oc;
+
+    do {
+        /* See the respective comment in inc_linear_entries(). */
+        BUILD_BUG_ON(sizeof(nc) >= sizeof(int));
+        oc = nc--;
+        if ( nc >= 0 )
+            return false;
+        nc = cmpxchg(&pg->linear_pt_count, oc, nc);
+    } while ( oc != nc );
+
+    return true;
+}
+
+static void dec_linear_uses(struct page_info *pg)
+{
+    typeof(pg->linear_pt_count) oc;
+
+    oc = arch_fetch_and_add(&pg->linear_pt_count, 1);
+    ASSERT(oc < 0);
+}
+
 /*
  * We allow root tables to map each other (a.k.a. linear page tables). It
  * needs some special care with reference counts and access permissions:
@@@@ -762,15 +817,35 @@@@ get_##level##_linear_pagetable(                                             \
                                                                             \
     if ( (pfn = level##e_get_pfn(pde)) != pde_pfn )                         \
     {                                                                       \
+        struct page_info *ptpg = mfn_to_page(pde_pfn);                      \
+                                                                            \
+        /* Make sure the page table belongs to the correct domain. */       \
+        if ( unlikely(page_get_owner(ptpg) != d) )                          \
+            return 0;                                                       \
+                                                                            \
         /* Make sure the mapped frame belongs to the correct domain. */     \
         if ( unlikely(!get_page_from_pagenr(pfn, d)) )                      \
             return 0;                                                       \
                                                                             \
         /*                                                                  \
-         * Ensure that the mapped frame is an already-validated page table. \
+         * Ensure that the mapped frame is an already-validated page table  \
+         * and is not itself having linear entries, as well as that the     \
+         * containing page table is not iself in use as a linear page table \
+         * elsewhere.                                                       \
          * If so, atomically increment the count (checking for overflow).   \
          */                                                                 \
         page = mfn_to_page(pfn);                                            \
+        if ( !inc_linear_entries(ptpg) )                                    \
+        {                                                                   \
+            put_page(page);                                                 \
+            return 0;                                                       \
+        }                                                                   \
+        if ( !inc_linear_uses(page) )                                       \
+        {                                                                   \
+            dec_linear_entries(ptpg);                                       \
+            put_page(page);                                                 \
+            return 0;                                                       \
+        }                                                                   \
         y = page->u.inuse.type_info;                                        \
         do {                                                                \
             x = y;                                                          \
@@@@ -778,6 +853,8 @@@@ get_##level##_linear_pagetable(                                             \
                  unlikely((x & (PGT_type_mask|PGT_validated)) !=            \
                           (PGT_##level##_page_table|PGT_validated)) )       \
             {                                                               \
+                dec_linear_uses(page);                                      \
+                dec_linear_entries(ptpg);                                   \
                 put_page(page);                                             \
                 return 0;                                                   \
             }                                                               \
@@@@ -1202,6 +1279,9 @@@@ get_page_from_l4e(
             l3e_remove_flags((pl3e), _PAGE_USER|_PAGE_RW|_PAGE_ACCESSED);   \
     } while ( 0 )
 
+static int _put_page_type(struct page_info *page, bool preemptible,
+                          struct page_info *ptpg);
+
 void put_page_from_l1e(l1_pgentry_t l1e, struct domain *l1e_owner)
 {
     unsigned long     pfn = l1e_get_pfn(l1e);
@@@@ -1271,17 +1351,22 @@@@ static int put_page_from_l2e(l2_pgentry_t l2e, unsigned long pfn)
     if ( l2e_get_flags(l2e) & _PAGE_PSE )
         put_superpage(l2e_get_pfn(l2e));
     else
-        put_page_and_type(l2e_get_page(l2e));
+    {
+        struct page_info *pg = l2e_get_page(l2e);
+        int rc = _put_page_type(pg, false, mfn_to_page(pfn));
+
+        ASSERT(!rc);
+        put_page(pg);
+    }
 
     return 0;
 }
 
-static int __put_page_type(struct page_info *, int preemptible);
-
 static int put_page_from_l3e(l3_pgentry_t l3e, unsigned long pfn,
                              int partial, bool_t defer)
 {
     struct page_info *pg;
+    int rc;
 
     if ( !(l3e_get_flags(l3e) & _PAGE_PRESENT) || (l3e_get_pfn(l3e) == pfn) )
         return 1;
@@@@ -1304,21 +1389,28 @@@@ static int put_page_from_l3e(l3_pgentry_t l3e, unsigned long pfn,
     if ( unlikely(partial > 0) )
     {
         ASSERT(!defer);
-        return __put_page_type(pg, 1);
+        return _put_page_type(pg, true, mfn_to_page(pfn));
     }
 
     if ( defer )
     {
+        current->arch.old_guest_ptpg = mfn_to_page(pfn);
         current->arch.old_guest_table = pg;
         return 0;
     }
 
-    return put_page_and_type_preemptible(pg);
+    rc = _put_page_type(pg, true, mfn_to_page(pfn));
+    if ( likely(!rc) )
+        put_page(pg);
+
+    return rc;
 }
 
 static int put_page_from_l4e(l4_pgentry_t l4e, unsigned long pfn,
                              int partial, bool_t defer)
 {
+    int rc = 1;
+
     if ( (l4e_get_flags(l4e) & _PAGE_PRESENT) && 
          (l4e_get_pfn(l4e) != pfn) )
     {
@@@@ -1327,18 +1419,22 @@@@ static int put_page_from_l4e(l4_pgentry_t l4e, unsigned long pfn,
         if ( unlikely(partial > 0) )
         {
             ASSERT(!defer);
-            return __put_page_type(pg, 1);
+            return _put_page_type(pg, true, mfn_to_page(pfn));
         }
 
         if ( defer )
         {
+            current->arch.old_guest_ptpg = mfn_to_page(pfn);
             current->arch.old_guest_table = pg;
             return 0;
         }
 
-        return put_page_and_type_preemptible(pg);
+        rc = _put_page_type(pg, true, mfn_to_page(pfn));
+        if ( likely(!rc) )
+            put_page(pg);
     }
-    return 1;
+
+    return rc;
 }
 
 static int alloc_l1_table(struct page_info *page)
@@@@ -1536,6 +1632,7 @@@@ static int alloc_l3_table(struct page_info *page)
         {
             page->nr_validated_ptes = i;
             page->partial_pte = 0;
+            current->arch.old_guest_ptpg = NULL;
             current->arch.old_guest_table = page;
         }
         while ( i-- > 0 )
@@@@ -1628,6 +1725,7 @@@@ static int alloc_l4_table(struct page_info *page)
                 {
                     if ( current->arch.old_guest_table )
                         page->nr_validated_ptes++;
+                    current->arch.old_guest_ptpg = NULL;
                     current->arch.old_guest_table = page;
                 }
             }
@@@@ -2370,14 +2468,20 @@@@ int free_page_type(struct page_info *pag
 }
 
 
-static int __put_final_page_type(
-    struct page_info *page, unsigned long type, int preemptible)
+static int _put_final_page_type(struct page_info *page, unsigned long type,
+                                bool preemptible, struct page_info *ptpg)
 {
     int rc = free_page_type(page, type, preemptible);
 
     /* No need for atomic update of type_info here: noone else updates it. */
     if ( rc == 0 )
     {
+        if ( ptpg && PGT_type_equal(type, ptpg->u.inuse.type_info) )
+        {
+            dec_linear_uses(page);
+            dec_linear_entries(ptpg);
+        }
+        ASSERT(!page->linear_pt_count || page_get_owner(page)->is_dying);
         /*
          * Record TLB information for flush later. We do not stamp page tables
          * when running in shadow mode:
@@@@ -2413,8 +2517,8 @@@@ static int __put_final_page_type(
 }
 
 
-static int __put_page_type(struct page_info *page,
-                           int preemptible)
+static int _put_page_type(struct page_info *page, bool preemptible,
+                          struct page_info *ptpg)
 {
     unsigned long nx, x, y = page->u.inuse.type_info;
     int rc = 0;
@@@@ -2441,12 +2545,28 @@@@ static int __put_page_type(struct page_info *page,
                                            x, nx)) != x) )
                     continue;
                 /* We cleared the 'valid bit' so we do the clean up. */
-                rc = __put_final_page_type(page, x, preemptible);
+                rc = _put_final_page_type(page, x, preemptible, ptpg);
+                ptpg = NULL;
                 if ( x & PGT_partial )
                     put_page(page);
                 break;
             }
 
+            if ( ptpg && PGT_type_equal(x, ptpg->u.inuse.type_info) )
+            {
+                /*
+                 * page_set_tlbflush_timestamp() accesses the same union
+                 * linear_pt_count lives in. Unvalidated page table pages,
+                 * however, should occur during domain destruction only
+                 * anyway.  Updating of linear_pt_count luckily is not
+                 * necessary anymore for a dying domain.
+                 */
+                ASSERT(page_get_owner(page)->is_dying);
+                ASSERT(page->linear_pt_count < 0);
+                ASSERT(ptpg->linear_pt_count > 0);
+                ptpg = NULL;
+            }
+
             /*
              * Record TLB information for flush later. We do not stamp page
              * tables when running in shadow mode:
@@@@ -2466,6 +2586,13 @@@@ static int __put_page_type(struct page_info *page,
             return -EINTR;
     }
 
+    if ( ptpg && PGT_type_equal(x, ptpg->u.inuse.type_info) )
+    {
+        ASSERT(!rc);
+        dec_linear_uses(page);
+        dec_linear_entries(ptpg);
+    }
+
     return rc;
 }
 
@@@@ -2600,6 +2727,7 @@@@ static int __get_page_type(struct page_info *page, unsigned long type,
             page->nr_validated_ptes = 0;
             page->partial_pte = 0;
         }
+        page->linear_pt_count = 0;
         rc = alloc_page_type(page, type, preemptible);
     }
 
@@@@ -2614,7 +2742,7 @@@@ static int __get_page_type(struct page_info *page, unsigned long type,
 
 void put_page_type(struct page_info *page)
 {
-    int rc = __put_page_type(page, 0);
+    int rc = _put_page_type(page, false, NULL);
     ASSERT(rc == 0);
     (void)rc;
 }
@@@@ -2630,7 +2758,7 @@@@ int get_page_type(struct page_info *page, unsigned long type)
 
 int put_page_type_preemptible(struct page_info *page)
 {
-    return __put_page_type(page, 1);
+    return _put_page_type(page, true, NULL);
 }
 
 int get_page_type_preemptible(struct page_info *page, unsigned long type)
@@@@ -2836,11 +2964,14 @@@@ int put_old_guest_table(struct vcpu *v)
     if ( !v->arch.old_guest_table )
         return 0;
 
-    switch ( rc = put_page_and_type_preemptible(v->arch.old_guest_table) )
+    switch ( rc = _put_page_type(v->arch.old_guest_table, true,
+                                 v->arch.old_guest_ptpg) )
     {
     case -EINTR:
     case -ERESTART:
         return -ERESTART;
+    case 0:
+        put_page(v->arch.old_guest_table);
     }
 
     v->arch.old_guest_table = NULL;
@@@@ -2997,6 +3128,7 @@@@ int new_guest_cr3(unsigned long mfn)
                 rc = -ERESTART;
                 /* fallthrough */
             case -ERESTART:
+                curr->arch.old_guest_ptpg = NULL;
                 curr->arch.old_guest_table = page;
                 break;
             default:
@@@@ -3264,7 +3396,10 @@@@ long do_mmuext_op(
                     if ( type == PGT_l1_page_table )
                         put_page_and_type(page);
                     else
+                    {
+                        curr->arch.old_guest_ptpg = NULL;
                         curr->arch.old_guest_table = page;
+                    }
                 }
             }
 
@@@@ -3297,6 +3432,7 @@@@ long do_mmuext_op(
             {
             case -EINTR:
             case -ERESTART:
+                curr->arch.old_guest_ptpg = NULL;
                 curr->arch.old_guest_table = page;
                 rc = 0;
                 break;
@@@@ -3375,6 +3511,7 @@@@ long do_mmuext_op(
                         rc = -ERESTART;
                         /* fallthrough */
                     case -ERESTART:
+                        curr->arch.old_guest_ptpg = NULL;
                         curr->arch.old_guest_table = page;
                         break;
                     default:
diff --git a/xen/include/asm-x86/domain.h b/xen/include/asm-x86/domain.h
index f6a40eb881..60bb8c9014 100644
--- xen/include/asm-x86/domain.h.orig
+++ xen/include/asm-x86/domain.h
@@@@ -531,6 +531,8 @@@@ struct arch_vcpu
     pagetable_t guest_table_user;       /* (MFN) x86/64 user-space pagetable */
     pagetable_t guest_table;            /* (MFN) guest notion of cr3 */
     struct page_info *old_guest_table;  /* partially destructed pagetable */
+    struct page_info *old_guest_ptpg;   /* containing page table of the */
+                                        /* former, if any */
     /* guest_table holds a ref to the page, and also a type-count unless
      * shadow refcounts are in use */
     pagetable_t shadow_table[4];        /* (MFN) shadow(s) of guest */
diff --git a/xen/include/asm-x86/mm.h b/xen/include/asm-x86/mm.h
index 6687dbc985..63590a7716 100644
--- xen/include/asm-x86/mm.h.orig
+++ xen/include/asm-x86/mm.h
@@@@ -125,11 +125,11 @@@@ struct page_info
         u32 tlbflush_timestamp;
 
         /*
-         * When PGT_partial is true then this field is valid and indicates
-         * that PTEs in the range [0, @@nr_validated_ptes) have been validated.
-         * An extra page reference must be acquired (or not dropped) whenever
-         * PGT_partial gets set, and it must be dropped when the flag gets
-         * cleared. This is so that a get() leaving a page in partially
+         * When PGT_partial is true then the first two fields are valid and
+         * indicate that PTEs in the range [0, @@nr_validated_ptes) have been
+         * validated. An extra page reference must be acquired (or not dropped)
+         * whenever PGT_partial gets set, and it must be dropped when the flag
+         * gets cleared. This is so that a get() leaving a page in partially
          * validated state (where the caller would drop the reference acquired
          * due to the getting of the type [apparently] failing [-ERESTART])
          * would not accidentally result in a page left with zero general
@@@@ -153,10 +153,18 @@@@ struct page_info
          * put_page_from_lNe() (due to the apparent failure), and hence it
          * must be dropped when the put operation is resumed (and completes),
          * but it must not be acquired if picking up the page for validation.
+         *
+         * The 3rd field, @@linear_pt_count, indicates
+         * - by a positive value, how many same-level page table entries a page
+         *   table has,
+         * - by a negative value, in how many same-level page tables a page is
+         *   in use.
          */
         struct {
-            u16 nr_validated_ptes;
-            s8 partial_pte;
+            u16 nr_validated_ptes:PAGETABLE_ORDER + 1;
+            u16 :16 - PAGETABLE_ORDER - 1 - 2;
+            s16 partial_pte:2;
+            s16 linear_pt_count;
         };
 
         /*
@@@@ -207,6 +215,9 @@@@ struct page_info
 #define PGT_count_width   PG_shift(9)
 #define PGT_count_mask    ((1UL<<PGT_count_width)-1)
 
+/* Are the 'type mask' bits identical? */
+#define PGT_type_equal(x, y) (!(((x) ^ (y)) & PGT_type_mask))
+
  /* Cleared when the owning guest 'frees' this page. */
 #define _PGC_allocated    PG_shift(1)
 #define PGC_allocated     PG_mask(1, 1)
-- 
2.14.1

From 41d579aad2fee971e5ce0279a9b559a0fdc74452 Mon Sep 17 00:00:00 2001
From: George Dunlap <george.dunlap@@citrix.com>
Date: Fri, 22 Sep 2017 11:46:55 +0100
Subject: [PATCH 2/2] x86/mm: Disable PV linear pagetables by default

Allowing pagetables to point to other pagetables of the same level
(often called 'linear pagetables') has been included in Xen since its
inception.  But it is not used by the most common PV guests (Linux,
NetBSD, minios), and has been the source of a number of subtle
reference-counting bugs.

Add a command-line option to control whether PV linear pagetables are
allowed (disabled by default).

Reported-by: Jann Horn <jannh@@google.com>
Signed-off-by: George Dunlap <george.dunlap@@citrix.com>
Reviewed-by: Andrew Cooper <andrew.cooper3@@citrix.com>
---
Changes since v2:
- s/_/-/; in command-line option
- Added __read_mostly
---
 docs/misc/xen-command-line.markdown | 15 +++++++++++++++
 xen/arch/x86/mm.c                   |  9 +++++++++
 2 files changed, 24 insertions(+)

diff --git a/docs/misc/xen-command-line.markdown b/docs/misc/xen-command-line.markdown
index 54acc60723..ffa66eb146 100644
--- docs/misc/xen-command-line.markdown.orig
+++ docs/misc/xen-command-line.markdown
@@@@ -1350,6 +1350,21 @@@@ The following resources are available:
     CDP, one COS will corespond two CBMs other than one with CAT, due to the
     sum of CBMs is fixed, that means actual `cos_max` in use will automatically
     reduce to half when CDP is enabled.
+
+### pv-linear-pt
+> `= <boolean>`
+
+> Default: `false`
+
+Allow PV guests to have pagetable entries pointing to other pagetables
+of the same level (i.e., allowing L2 PTEs to point to other L2 pages).
+This technique is often called "linear pagetables", and is sometimes
+used to allow operating systems a simple way to consistently map the
+current process's pagetables into its own virtual address space.
+
+None of the most common PV operating systems (Linux, NetBSD, MiniOS)
+use this technique, but there may be custom operating systems which
+do.
 
 ### reboot
 > `= t[riple] | k[bd] | a[cpi] | p[ci] | P[ower] | e[fi] | n[o] [, [w]arm | [c]old]`
diff --git a/xen/arch/x86/mm.c b/xen/arch/x86/mm.c
index 31d4a03840..5d125cff3a 100644
--- xen/arch/x86/mm.c.orig
+++ xen/arch/x86/mm.c
@@@@ -800,6 +800,9 @@@@ static void dec_linear_uses(struct page_info *pg)
  *     frame if it is mapped by a different root table. This is sufficient and
  *     also necessary to allow validation of a root table mapping itself.
  */
+static bool __read_mostly pv_linear_pt_enable = true;
+boolean_param("pv-linear-pt", pv_linear_pt_enable);
+
 #define define_get_linear_pagetable(level)                                  \
 static int                                                                  \
 get_##level##_linear_pagetable(                                             \
@@@@ -809,6 +812,12 @@@@ get_##level##_linear_pagetable(                                             \
     struct page_info *page;                                                 \
     unsigned long pfn;                                                      \
                                                                             \
+    if ( !pv_linear_pt_enable )                                             \
+    {                                                                       \
+        MEM_LOG("Attempt to create linear p.t. (feature disabled)");        \
+        return 0;                                                           \
+    }                                                                       \
+                                                                            \
     if ( (level##e_get_flags(pde) & _PAGE_RW) )                             \
     {                                                                       \
         MEM_LOG("Attempt to create linear p.t. with write perms");          \
-- 
2.14.1

From: Jan Beulich <jbeulich@@suse.com>
Subject: x86: don't wrongly trigger linear page table assertion

_put_page_type() may do multiple iterations until its cmpxchg()
succeeds. It invokes set_tlbflush_timestamp() on the first
iteration, however. Code inside the function takes care of this, but
- the assertion in _put_final_page_type() would trigger on the second
  iteration if time stamps in a debug build are permitted to be
  sufficiently much wider than the default 6 bits (see WRAP_MASK in
  flushtlb.c),
- it returning -EINTR (for a continuation to be scheduled) would leave
  the page inconsistent state (until the re-invocation completes).
Make the set_tlbflush_timestamp() invocation conditional, bypassing it
(for now) only in the case we really can't tolerate the stamp to be
stored.

This is part of XSA-240.

Signed-off-by: Jan Beulich <jbeulich@@suse.com>
Reviewed-by: George Dunlap <george.dunlap@@citrix.com>

--- xen/arch/x86/mm.c.orig
+++ xen/arch/x86/mm.c
@@@@ -2561,30 +2561,21 @@@@
                 break;
             }
 
-            if ( ptpg && PGT_type_equal(x, ptpg->u.inuse.type_info) )
-            {
-                /*
-                 * page_set_tlbflush_timestamp() accesses the same union
-                 * linear_pt_count lives in. Unvalidated page table pages,
-                 * however, should occur during domain destruction only
-                 * anyway.  Updating of linear_pt_count luckily is not
-                 * necessary anymore for a dying domain.
-                 */
-                ASSERT(page_get_owner(page)->is_dying);
-                ASSERT(page->linear_pt_count < 0);
-                ASSERT(ptpg->linear_pt_count > 0);
-                ptpg = NULL;
-            }
-
             /*
              * Record TLB information for flush later. We do not stamp page
              * tables when running in shadow mode:
              *  1. Pointless, since it's the shadow pt's which must be tracked.
              *  2. Shadow mode reuses this field for shadowed page tables to
              *     store flags info -- we don't want to conflict with that.
+             * Also page_set_tlbflush_timestamp() accesses the same union
+             * linear_pt_count lives in. Pages (including page table ones),
+             * however, don't need their flush time stamp set except when
+             * the last reference is being dropped. For page table pages
+             * this happens in _put_final_page_type().
              */
-            if ( !(shadow_mode_enabled(page_get_owner(page)) &&
+            if ( (!ptpg || !PGT_type_equal(x, ptpg->u.inuse.type_info)) &&  
+                 !(shadow_mode_enabled(page_get_owner(page)) &&
                    (page->count_info & PGC_page_table)) )
                 page->tlbflush_timestamp = tlbflush_current_time();
         }
 
From: Jan Beulich <jbeulich@@suse.com>
Subject: x86: don't wrongly trigger linear page table assertion (2)

_put_final_page_type(), when free_page_type() has exited early to allow
for preemption, should not update the time stamp, as the page continues
to retain the typ which is in the process of being unvalidated. I can't
see why the time stamp update was put on that path in the first place
(albeit it may well have been me who had put it there years ago).

This is part of XSA-240.

Signed-off-by: Jan Beulich <jbeulich@@suse.com>
Reviewed-by: <George Dunlap <george.dunlap.com>

--- xen/arch/x86/mm.c.orig
+++ xen/arch/x86/mm.c
@@@@ -2560,9 +2560,6 @@@@ static int _put_final_page_type(struct p
     {
         ASSERT((page->u.inuse.type_info &
                 (PGT_count_mask|PGT_validated|PGT_partial)) == 1);
-        if ( !(shadow_mode_enabled(page_get_owner(page)) &&
-               (page->count_info & PGC_page_table)) )
-            page->tlbflush_timestamp = tlbflush_current_time();
         wmb();
         page->u.inuse.type_info |= PGT_validated;
     }
@


1.2
log
@Apply patches from upstream, fixing security issues XSA246 up to XSA251.
Also update patch-XSA240 from upstream, fixing issues in linear page table
handling introduced by the original XSA240 patch.
Bump PKGREVISION
@
text
@d1 1
a1 1
$NetBSD: patch-XSA240,v 1.1 2017/10/17 08:42:30 bouyer Exp $
@


1.2.2.1
log
@Pullup ticket #5693 - requested by bouyer
sysutils/xenkernel48: security fix
sysutils/xentools48: security fix

Revisions pulled up:
- sysutils/xenkernel48/Makefile                                 1.12
- sysutils/xenkernel48/distinfo                                 1.6
- sysutils/xenkernel48/patches/patch-XSA231                     deleted
- sysutils/xenkernel48/patches/patch-XSA232                     deleted
- sysutils/xenkernel48/patches/patch-XSA234                     deleted
- sysutils/xenkernel48/patches/patch-XSA237                     deleted
- sysutils/xenkernel48/patches/patch-XSA238                     deleted
- sysutils/xenkernel48/patches/patch-XSA239                     deleted
- sysutils/xenkernel48/patches/patch-XSA240                     deleted
- sysutils/xenkernel48/patches/patch-XSA241                     deleted
- sysutils/xenkernel48/patches/patch-XSA242                     deleted
- sysutils/xenkernel48/patches/patch-XSA243                     deleted
- sysutils/xenkernel48/patches/patch-XSA244                     deleted
- sysutils/xenkernel48/patches/patch-XSA246                     deleted
- sysutils/xenkernel48/patches/patch-XSA247                     deleted
- sysutils/xenkernel48/patches/patch-XSA248                     deleted
- sysutils/xenkernel48/patches/patch-XSA249                     deleted
- sysutils/xenkernel48/patches/patch-XSA250                     deleted
- sysutils/xenkernel48/patches/patch-XSA251                     deleted
- sysutils/xenkernel48/patches/patch-XSA254-1                   deleted
- sysutils/xenkernel48/patches/patch-XSA254-2                   deleted
- sysutils/xenkernel48/patches/patch-XSA254-3                   deleted
- sysutils/xenkernel48/patches/patch-XSA254-4                   deleted
- sysutils/xentools48/Makefile                                  1.16
- sysutils/xentools48/distinfo                                  1.7-1.8
- sysutils/xentools48/patches/patch-XSA233                      deleted
- sysutils/xentools48/patches/patch-XSA240                      deleted

---
   Module Name:	pkgsrc
   Committed By:	bouyer
   Date:		Wed Jan 24 23:29:33 UTC 2018

   Modified Files:
   	pkgsrc/sysutils/xenkernel48: Makefile distinfo
   	pkgsrc/sysutils/xentools48: Makefile distinfo
   Removed Files:
   	pkgsrc/sysutils/xenkernel48/patches: patch-XSA231 patch-XSA232
   	    patch-XSA234 patch-XSA237 patch-XSA238 patch-XSA239 patch-XSA240
   	    patch-XSA241 patch-XSA242 patch-XSA243 patch-XSA244 patch-XSA246
   	    patch-XSA247 patch-XSA248 patch-XSA249 patch-XSA250 patch-XSA251
   	    patch-XSA254-1 patch-XSA254-2 patch-XSA254-3 patch-XSA254-4
   	pkgsrc/sysutils/xentools48/patches: patch-XSA233 patch-XSA240

   Log Message:
   Update xen 4.8 packages to 4.8.3. Changes since 4.8.2: include patches from
   all security advisory up to and including XSA254.

   While there pass XEN_VENDORVERSION=nb${PKGREVISION} to make so that
   'xl info' shows the NetBSD PKGREVISION. If PKGREVISION is not available,
   define this as 'nb0'.

---
   Module Name:	pkgsrc
   Committed By:	bouyer
   Date:		Sat Jan 27 16:44:40 UTC 2018

   Modified Files:
   	pkgsrc/sysutils/xentools48: distinfo

   Log Message:
   Remove entries for patch-XSA233 and patch-XSA240 which have been deleted.
@
text
@d1 1
a1 1
$NetBSD: patch-XSA240,v 1.2 2017/12/15 14:02:15 bouyer Exp $
@


1.1
log
@Update xentools48 and xenkernel48 to 4.8.2, and apply security patches up
to XSA244. Keep PKGREVISION to 1 to account for the fact that it's
not a stock Xen 4.8.2.

Note that, unlike upstream, pv-linear-pt defaults to true, so that
NetBSD PV guests (including dom0) will continue to boot without changes
to boot.cfg
@
text
@d1 1
a1 1
$NetBSD: $
d535 1
a535 1
+> Default: `true`
d543 3
a545 3
+None of the most common PV operating systems (Linux, MiniOS)
+use this technique, but NetBSD in PV mode,  and maybe custom operating
+systems which do.
d579 87
@


1.1.2.1
log
@file patch-XSA240 was added on branch pkgsrc-2017Q3 on 2017-10-17 19:02:25 +0000
@
text
@d1 578
@


1.1.2.2
log
@Pullup ticket #5579 - requested by bouyer
sysutils/xenkernel48, sysutils/xentools48: security fix

Revisions pulled up:
- sysutils/xenkernel48/MESSAGE                                  1.2
- sysutils/xenkernel48/Makefile                                 1.6
- sysutils/xenkernel48/distinfo                                 1.3
- sysutils/xenkernel48/patches/patch-XSA-212                    deleted
- sysutils/xenkernel48/patches/patch-XSA231                     1.1
- sysutils/xenkernel48/patches/patch-XSA232                     1.1
- sysutils/xenkernel48/patches/patch-XSA234                     1.1
- sysutils/xenkernel48/patches/patch-XSA237                     1.1
- sysutils/xenkernel48/patches/patch-XSA238                     1.1
- sysutils/xenkernel48/patches/patch-XSA239                     1.1
- sysutils/xenkernel48/patches/patch-XSA240                     1.1
- sysutils/xenkernel48/patches/patch-XSA241                     1.1
- sysutils/xenkernel48/patches/patch-XSA242                     1.1
- sysutils/xenkernel48/patches/patch-XSA243                     1.1
- sysutils/xenkernel48/patches/patch-XSA244                     1.1
- sysutils/xentools48/Makefile                                  1.8
- sysutils/xentools48/distinfo                                  1.4
- sysutils/xentools48/patches/patch-XSA-211-1                   deleted
- sysutils/xentools48/patches/patch-XSA-211-2                   deleted
- sysutils/xentools48/patches/patch-XSA233                      1.1
- sysutils/xentools48/patches/patch-XSA240                      1.1

---
   Module Name:	pkgsrc
   Committed By:	bouyer
   Date:		Tue Oct 17 08:42:30 UTC 2017

   Modified Files:
   	pkgsrc/sysutils/xenkernel48: MESSAGE Makefile distinfo
   	pkgsrc/sysutils/xentools48: Makefile distinfo
   Added Files:
   	pkgsrc/sysutils/xenkernel48/patches: patch-XSA231 patch-XSA232
   	    patch-XSA234 patch-XSA237 patch-XSA238 patch-XSA239 patch-XSA240
   	    patch-XSA241 patch-XSA242 patch-XSA243 patch-XSA244
   	pkgsrc/sysutils/xentools48/patches: patch-XSA233 patch-XSA240
   Removed Files:
   	pkgsrc/sysutils/xenkernel48/patches: patch-XSA-212
   	pkgsrc/sysutils/xentools48/patches: patch-XSA-211-1 patch-XSA-211-2

   Log Message:
   Update xentools48 and xenkernel48 to 4.8.2, and apply security patches up
   to XSA244. Keep PKGREVISION to 1 to account for the fact that it's
   not a stock Xen 4.8.2.

   Note that, unlike upstream, pv-linear-pt defaults to true, so that
   NetBSD PV guests (including dom0) will continue to boot without changes
   to boot.cfg
@
text
@a0 578
$NetBSD: patch-XSA240,v 1.1 2017/10/17 08:42:30 bouyer Exp $

From 2315b8c651e0cc31c9153d09c9912b8fbe632ad2 Mon Sep 17 00:00:00 2001
From: Jan Beulich <jbeulich@@suse.com>
Date: Thu, 28 Sep 2017 15:17:25 +0100
Subject: [PATCH 1/2] x86: limit linear page table use to a single level

That's the only way that they're meant to be used. Without such a
restriction arbitrarily long chains of same-level page tables can be
built, tearing down of which may then cause arbitrarily deep recursion,
causing a stack overflow. To facilitate this restriction, a counter is
being introduced to track both the number of same-level entries in a
page table as well as the number of uses of a page table in another
same-level one (counting into positive and negative direction
respectively, utilizing the fact that both counts can't be non-zero at
the same time).

Note that the added accounting introduces a restriction on the number
of times a page can be used in other same-level page tables - more than
32k of such uses are no longer possible.

Note also that some put_page_and_type[_preemptible]() calls are
replaced with open-coded equivalents.  This seemed preferrable to
adding "parent_table" to the matrix of functions.

Note further that cross-domain same-level page table references are no
longer permitted (they probably never should have been).

This is XSA-240.

Reported-by: Jann Horn <jannh@@google.com>
Signed-off-by: Jan Beulich <jbeulich@@suse.com>
Signed-off-by: George Dunlap <george.dunlap@@citrix.com>
---
 xen/arch/x86/domain.c        |   1 +
 xen/arch/x86/mm.c            | 171 ++++++++++++++++++++++++++++++++++++++-----
 xen/include/asm-x86/domain.h |   2 +
 xen/include/asm-x86/mm.h     |  25 +++++--
 4 files changed, 175 insertions(+), 24 deletions(-)

diff --git a/xen/arch/x86/domain.c b/xen/arch/x86/domain.c
index a725b43a67..5265b0496c 100644
--- xen/arch/x86/domain.c.orig
+++ xen/arch/x86/domain.c
@@@@ -1245,6 +1245,7 @@@@ int arch_set_info_guest(
                     rc = -ERESTART;
                     /* Fallthrough */
                 case -ERESTART:
+                    v->arch.old_guest_ptpg = NULL;
                     v->arch.old_guest_table =
                         pagetable_get_page(v->arch.guest_table);
                     v->arch.guest_table = pagetable_null();
diff --git a/xen/arch/x86/mm.c b/xen/arch/x86/mm.c
index a40461d4d6..31d4a03840 100644
--- xen/arch/x86/mm.c.orig
+++ xen/arch/x86/mm.c
@@@@ -733,6 +733,61 @@@@ static void put_data_page(
         put_page(page);
 }
 
+static bool inc_linear_entries(struct page_info *pg)
+{
+    typeof(pg->linear_pt_count) nc = read_atomic(&pg->linear_pt_count), oc;
+
+    do {
+        /*
+         * The check below checks for the "linear use" count being non-zero
+         * as well as overflow.  Signed integer overflow is undefined behavior
+         * according to the C spec.  However, as long as linear_pt_count is
+         * smaller in size than 'int', the arithmetic operation of the
+         * increment below won't overflow; rather the result will be truncated
+         * when stored.  Ensure that this is always true.
+         */
+        BUILD_BUG_ON(sizeof(nc) >= sizeof(int));
+        oc = nc++;
+        if ( nc <= 0 )
+            return false;
+        nc = cmpxchg(&pg->linear_pt_count, oc, nc);
+    } while ( oc != nc );
+
+    return true;
+}
+
+static void dec_linear_entries(struct page_info *pg)
+{
+    typeof(pg->linear_pt_count) oc;
+
+    oc = arch_fetch_and_add(&pg->linear_pt_count, -1);
+    ASSERT(oc > 0);
+}
+
+static bool inc_linear_uses(struct page_info *pg)
+{
+    typeof(pg->linear_pt_count) nc = read_atomic(&pg->linear_pt_count), oc;
+
+    do {
+        /* See the respective comment in inc_linear_entries(). */
+        BUILD_BUG_ON(sizeof(nc) >= sizeof(int));
+        oc = nc--;
+        if ( nc >= 0 )
+            return false;
+        nc = cmpxchg(&pg->linear_pt_count, oc, nc);
+    } while ( oc != nc );
+
+    return true;
+}
+
+static void dec_linear_uses(struct page_info *pg)
+{
+    typeof(pg->linear_pt_count) oc;
+
+    oc = arch_fetch_and_add(&pg->linear_pt_count, 1);
+    ASSERT(oc < 0);
+}
+
 /*
  * We allow root tables to map each other (a.k.a. linear page tables). It
  * needs some special care with reference counts and access permissions:
@@@@ -762,15 +817,35 @@@@ get_##level##_linear_pagetable(                                             \
                                                                             \
     if ( (pfn = level##e_get_pfn(pde)) != pde_pfn )                         \
     {                                                                       \
+        struct page_info *ptpg = mfn_to_page(pde_pfn);                      \
+                                                                            \
+        /* Make sure the page table belongs to the correct domain. */       \
+        if ( unlikely(page_get_owner(ptpg) != d) )                          \
+            return 0;                                                       \
+                                                                            \
         /* Make sure the mapped frame belongs to the correct domain. */     \
         if ( unlikely(!get_page_from_pagenr(pfn, d)) )                      \
             return 0;                                                       \
                                                                             \
         /*                                                                  \
-         * Ensure that the mapped frame is an already-validated page table. \
+         * Ensure that the mapped frame is an already-validated page table  \
+         * and is not itself having linear entries, as well as that the     \
+         * containing page table is not iself in use as a linear page table \
+         * elsewhere.                                                       \
          * If so, atomically increment the count (checking for overflow).   \
          */                                                                 \
         page = mfn_to_page(pfn);                                            \
+        if ( !inc_linear_entries(ptpg) )                                    \
+        {                                                                   \
+            put_page(page);                                                 \
+            return 0;                                                       \
+        }                                                                   \
+        if ( !inc_linear_uses(page) )                                       \
+        {                                                                   \
+            dec_linear_entries(ptpg);                                       \
+            put_page(page);                                                 \
+            return 0;                                                       \
+        }                                                                   \
         y = page->u.inuse.type_info;                                        \
         do {                                                                \
             x = y;                                                          \
@@@@ -778,6 +853,8 @@@@ get_##level##_linear_pagetable(                                             \
                  unlikely((x & (PGT_type_mask|PGT_validated)) !=            \
                           (PGT_##level##_page_table|PGT_validated)) )       \
             {                                                               \
+                dec_linear_uses(page);                                      \
+                dec_linear_entries(ptpg);                                   \
                 put_page(page);                                             \
                 return 0;                                                   \
             }                                                               \
@@@@ -1202,6 +1279,9 @@@@ get_page_from_l4e(
             l3e_remove_flags((pl3e), _PAGE_USER|_PAGE_RW|_PAGE_ACCESSED);   \
     } while ( 0 )
 
+static int _put_page_type(struct page_info *page, bool preemptible,
+                          struct page_info *ptpg);
+
 void put_page_from_l1e(l1_pgentry_t l1e, struct domain *l1e_owner)
 {
     unsigned long     pfn = l1e_get_pfn(l1e);
@@@@ -1271,17 +1351,22 @@@@ static int put_page_from_l2e(l2_pgentry_t l2e, unsigned long pfn)
     if ( l2e_get_flags(l2e) & _PAGE_PSE )
         put_superpage(l2e_get_pfn(l2e));
     else
-        put_page_and_type(l2e_get_page(l2e));
+    {
+        struct page_info *pg = l2e_get_page(l2e);
+        int rc = _put_page_type(pg, false, mfn_to_page(pfn));
+
+        ASSERT(!rc);
+        put_page(pg);
+    }
 
     return 0;
 }
 
-static int __put_page_type(struct page_info *, int preemptible);
-
 static int put_page_from_l3e(l3_pgentry_t l3e, unsigned long pfn,
                              int partial, bool_t defer)
 {
     struct page_info *pg;
+    int rc;
 
     if ( !(l3e_get_flags(l3e) & _PAGE_PRESENT) || (l3e_get_pfn(l3e) == pfn) )
         return 1;
@@@@ -1304,21 +1389,28 @@@@ static int put_page_from_l3e(l3_pgentry_t l3e, unsigned long pfn,
     if ( unlikely(partial > 0) )
     {
         ASSERT(!defer);
-        return __put_page_type(pg, 1);
+        return _put_page_type(pg, true, mfn_to_page(pfn));
     }
 
     if ( defer )
     {
+        current->arch.old_guest_ptpg = mfn_to_page(pfn);
         current->arch.old_guest_table = pg;
         return 0;
     }
 
-    return put_page_and_type_preemptible(pg);
+    rc = _put_page_type(pg, true, mfn_to_page(pfn));
+    if ( likely(!rc) )
+        put_page(pg);
+
+    return rc;
 }
 
 static int put_page_from_l4e(l4_pgentry_t l4e, unsigned long pfn,
                              int partial, bool_t defer)
 {
+    int rc = 1;
+
     if ( (l4e_get_flags(l4e) & _PAGE_PRESENT) && 
          (l4e_get_pfn(l4e) != pfn) )
     {
@@@@ -1327,18 +1419,22 @@@@ static int put_page_from_l4e(l4_pgentry_t l4e, unsigned long pfn,
         if ( unlikely(partial > 0) )
         {
             ASSERT(!defer);
-            return __put_page_type(pg, 1);
+            return _put_page_type(pg, true, mfn_to_page(pfn));
         }
 
         if ( defer )
         {
+            current->arch.old_guest_ptpg = mfn_to_page(pfn);
             current->arch.old_guest_table = pg;
             return 0;
         }
 
-        return put_page_and_type_preemptible(pg);
+        rc = _put_page_type(pg, true, mfn_to_page(pfn));
+        if ( likely(!rc) )
+            put_page(pg);
     }
-    return 1;
+
+    return rc;
 }
 
 static int alloc_l1_table(struct page_info *page)
@@@@ -1536,6 +1632,7 @@@@ static int alloc_l3_table(struct page_info *page)
         {
             page->nr_validated_ptes = i;
             page->partial_pte = 0;
+            current->arch.old_guest_ptpg = NULL;
             current->arch.old_guest_table = page;
         }
         while ( i-- > 0 )
@@@@ -1628,6 +1725,7 @@@@ static int alloc_l4_table(struct page_info *page)
                 {
                     if ( current->arch.old_guest_table )
                         page->nr_validated_ptes++;
+                    current->arch.old_guest_ptpg = NULL;
                     current->arch.old_guest_table = page;
                 }
             }
@@@@ -2370,14 +2468,20 @@@@ int free_page_type(struct page_info *pag
 }
 
 
-static int __put_final_page_type(
-    struct page_info *page, unsigned long type, int preemptible)
+static int _put_final_page_type(struct page_info *page, unsigned long type,
+                                bool preemptible, struct page_info *ptpg)
 {
     int rc = free_page_type(page, type, preemptible);
 
     /* No need for atomic update of type_info here: noone else updates it. */
     if ( rc == 0 )
     {
+        if ( ptpg && PGT_type_equal(type, ptpg->u.inuse.type_info) )
+        {
+            dec_linear_uses(page);
+            dec_linear_entries(ptpg);
+        }
+        ASSERT(!page->linear_pt_count || page_get_owner(page)->is_dying);
         /*
          * Record TLB information for flush later. We do not stamp page tables
          * when running in shadow mode:
@@@@ -2413,8 +2517,8 @@@@ static int __put_final_page_type(
 }
 
 
-static int __put_page_type(struct page_info *page,
-                           int preemptible)
+static int _put_page_type(struct page_info *page, bool preemptible,
+                          struct page_info *ptpg)
 {
     unsigned long nx, x, y = page->u.inuse.type_info;
     int rc = 0;
@@@@ -2441,12 +2545,28 @@@@ static int __put_page_type(struct page_info *page,
                                            x, nx)) != x) )
                     continue;
                 /* We cleared the 'valid bit' so we do the clean up. */
-                rc = __put_final_page_type(page, x, preemptible);
+                rc = _put_final_page_type(page, x, preemptible, ptpg);
+                ptpg = NULL;
                 if ( x & PGT_partial )
                     put_page(page);
                 break;
             }
 
+            if ( ptpg && PGT_type_equal(x, ptpg->u.inuse.type_info) )
+            {
+                /*
+                 * page_set_tlbflush_timestamp() accesses the same union
+                 * linear_pt_count lives in. Unvalidated page table pages,
+                 * however, should occur during domain destruction only
+                 * anyway.  Updating of linear_pt_count luckily is not
+                 * necessary anymore for a dying domain.
+                 */
+                ASSERT(page_get_owner(page)->is_dying);
+                ASSERT(page->linear_pt_count < 0);
+                ASSERT(ptpg->linear_pt_count > 0);
+                ptpg = NULL;
+            }
+
             /*
              * Record TLB information for flush later. We do not stamp page
              * tables when running in shadow mode:
@@@@ -2466,6 +2586,13 @@@@ static int __put_page_type(struct page_info *page,
             return -EINTR;
     }
 
+    if ( ptpg && PGT_type_equal(x, ptpg->u.inuse.type_info) )
+    {
+        ASSERT(!rc);
+        dec_linear_uses(page);
+        dec_linear_entries(ptpg);
+    }
+
     return rc;
 }
 
@@@@ -2600,6 +2727,7 @@@@ static int __get_page_type(struct page_info *page, unsigned long type,
             page->nr_validated_ptes = 0;
             page->partial_pte = 0;
         }
+        page->linear_pt_count = 0;
         rc = alloc_page_type(page, type, preemptible);
     }
 
@@@@ -2614,7 +2742,7 @@@@ static int __get_page_type(struct page_info *page, unsigned long type,
 
 void put_page_type(struct page_info *page)
 {
-    int rc = __put_page_type(page, 0);
+    int rc = _put_page_type(page, false, NULL);
     ASSERT(rc == 0);
     (void)rc;
 }
@@@@ -2630,7 +2758,7 @@@@ int get_page_type(struct page_info *page, unsigned long type)
 
 int put_page_type_preemptible(struct page_info *page)
 {
-    return __put_page_type(page, 1);
+    return _put_page_type(page, true, NULL);
 }
 
 int get_page_type_preemptible(struct page_info *page, unsigned long type)
@@@@ -2836,11 +2964,14 @@@@ int put_old_guest_table(struct vcpu *v)
     if ( !v->arch.old_guest_table )
         return 0;
 
-    switch ( rc = put_page_and_type_preemptible(v->arch.old_guest_table) )
+    switch ( rc = _put_page_type(v->arch.old_guest_table, true,
+                                 v->arch.old_guest_ptpg) )
     {
     case -EINTR:
     case -ERESTART:
         return -ERESTART;
+    case 0:
+        put_page(v->arch.old_guest_table);
     }
 
     v->arch.old_guest_table = NULL;
@@@@ -2997,6 +3128,7 @@@@ int new_guest_cr3(unsigned long mfn)
                 rc = -ERESTART;
                 /* fallthrough */
             case -ERESTART:
+                curr->arch.old_guest_ptpg = NULL;
                 curr->arch.old_guest_table = page;
                 break;
             default:
@@@@ -3264,7 +3396,10 @@@@ long do_mmuext_op(
                     if ( type == PGT_l1_page_table )
                         put_page_and_type(page);
                     else
+                    {
+                        curr->arch.old_guest_ptpg = NULL;
                         curr->arch.old_guest_table = page;
+                    }
                 }
             }
 
@@@@ -3297,6 +3432,7 @@@@ long do_mmuext_op(
             {
             case -EINTR:
             case -ERESTART:
+                curr->arch.old_guest_ptpg = NULL;
                 curr->arch.old_guest_table = page;
                 rc = 0;
                 break;
@@@@ -3375,6 +3511,7 @@@@ long do_mmuext_op(
                         rc = -ERESTART;
                         /* fallthrough */
                     case -ERESTART:
+                        curr->arch.old_guest_ptpg = NULL;
                         curr->arch.old_guest_table = page;
                         break;
                     default:
diff --git a/xen/include/asm-x86/domain.h b/xen/include/asm-x86/domain.h
index f6a40eb881..60bb8c9014 100644
--- xen/include/asm-x86/domain.h.orig
+++ xen/include/asm-x86/domain.h
@@@@ -531,6 +531,8 @@@@ struct arch_vcpu
     pagetable_t guest_table_user;       /* (MFN) x86/64 user-space pagetable */
     pagetable_t guest_table;            /* (MFN) guest notion of cr3 */
     struct page_info *old_guest_table;  /* partially destructed pagetable */
+    struct page_info *old_guest_ptpg;   /* containing page table of the */
+                                        /* former, if any */
     /* guest_table holds a ref to the page, and also a type-count unless
      * shadow refcounts are in use */
     pagetable_t shadow_table[4];        /* (MFN) shadow(s) of guest */
diff --git a/xen/include/asm-x86/mm.h b/xen/include/asm-x86/mm.h
index 6687dbc985..63590a7716 100644
--- xen/include/asm-x86/mm.h.orig
+++ xen/include/asm-x86/mm.h
@@@@ -125,11 +125,11 @@@@ struct page_info
         u32 tlbflush_timestamp;
 
         /*
-         * When PGT_partial is true then this field is valid and indicates
-         * that PTEs in the range [0, @@nr_validated_ptes) have been validated.
-         * An extra page reference must be acquired (or not dropped) whenever
-         * PGT_partial gets set, and it must be dropped when the flag gets
-         * cleared. This is so that a get() leaving a page in partially
+         * When PGT_partial is true then the first two fields are valid and
+         * indicate that PTEs in the range [0, @@nr_validated_ptes) have been
+         * validated. An extra page reference must be acquired (or not dropped)
+         * whenever PGT_partial gets set, and it must be dropped when the flag
+         * gets cleared. This is so that a get() leaving a page in partially
          * validated state (where the caller would drop the reference acquired
          * due to the getting of the type [apparently] failing [-ERESTART])
          * would not accidentally result in a page left with zero general
@@@@ -153,10 +153,18 @@@@ struct page_info
          * put_page_from_lNe() (due to the apparent failure), and hence it
          * must be dropped when the put operation is resumed (and completes),
          * but it must not be acquired if picking up the page for validation.
+         *
+         * The 3rd field, @@linear_pt_count, indicates
+         * - by a positive value, how many same-level page table entries a page
+         *   table has,
+         * - by a negative value, in how many same-level page tables a page is
+         *   in use.
          */
         struct {
-            u16 nr_validated_ptes;
-            s8 partial_pte;
+            u16 nr_validated_ptes:PAGETABLE_ORDER + 1;
+            u16 :16 - PAGETABLE_ORDER - 1 - 2;
+            s16 partial_pte:2;
+            s16 linear_pt_count;
         };
 
         /*
@@@@ -207,6 +215,9 @@@@ struct page_info
 #define PGT_count_width   PG_shift(9)
 #define PGT_count_mask    ((1UL<<PGT_count_width)-1)
 
+/* Are the 'type mask' bits identical? */
+#define PGT_type_equal(x, y) (!(((x) ^ (y)) & PGT_type_mask))
+
  /* Cleared when the owning guest 'frees' this page. */
 #define _PGC_allocated    PG_shift(1)
 #define PGC_allocated     PG_mask(1, 1)
-- 
2.14.1

From 41d579aad2fee971e5ce0279a9b559a0fdc74452 Mon Sep 17 00:00:00 2001
From: George Dunlap <george.dunlap@@citrix.com>
Date: Fri, 22 Sep 2017 11:46:55 +0100
Subject: [PATCH 2/2] x86/mm: Disable PV linear pagetables by default

Allowing pagetables to point to other pagetables of the same level
(often called 'linear pagetables') has been included in Xen since its
inception.  But it is not used by the most common PV guests (Linux,
NetBSD, minios), and has been the source of a number of subtle
reference-counting bugs.

Add a command-line option to control whether PV linear pagetables are
allowed (disabled by default).

Reported-by: Jann Horn <jannh@@google.com>
Signed-off-by: George Dunlap <george.dunlap@@citrix.com>
Reviewed-by: Andrew Cooper <andrew.cooper3@@citrix.com>
---
Changes since v2:
- s/_/-/; in command-line option
- Added __read_mostly
---
 docs/misc/xen-command-line.markdown | 15 +++++++++++++++
 xen/arch/x86/mm.c                   |  9 +++++++++
 2 files changed, 24 insertions(+)

diff --git a/docs/misc/xen-command-line.markdown b/docs/misc/xen-command-line.markdown
index 54acc60723..ffa66eb146 100644
--- docs/misc/xen-command-line.markdown.orig
+++ docs/misc/xen-command-line.markdown
@@@@ -1350,6 +1350,21 @@@@ The following resources are available:
     CDP, one COS will corespond two CBMs other than one with CAT, due to the
     sum of CBMs is fixed, that means actual `cos_max` in use will automatically
     reduce to half when CDP is enabled.
+
+### pv-linear-pt
+> `= <boolean>`
+
+> Default: `true`
+
+Allow PV guests to have pagetable entries pointing to other pagetables
+of the same level (i.e., allowing L2 PTEs to point to other L2 pages).
+This technique is often called "linear pagetables", and is sometimes
+used to allow operating systems a simple way to consistently map the
+current process's pagetables into its own virtual address space.
+
+None of the most common PV operating systems (Linux, MiniOS)
+use this technique, but NetBSD in PV mode,  and maybe custom operating
+systems which do.
 
 ### reboot
 > `= t[riple] | k[bd] | a[cpi] | p[ci] | P[ower] | e[fi] | n[o] [, [w]arm | [c]old]`
diff --git a/xen/arch/x86/mm.c b/xen/arch/x86/mm.c
index 31d4a03840..5d125cff3a 100644
--- xen/arch/x86/mm.c.orig
+++ xen/arch/x86/mm.c
@@@@ -800,6 +800,9 @@@@ static void dec_linear_uses(struct page_info *pg)
  *     frame if it is mapped by a different root table. This is sufficient and
  *     also necessary to allow validation of a root table mapping itself.
  */
+static bool __read_mostly pv_linear_pt_enable = true;
+boolean_param("pv-linear-pt", pv_linear_pt_enable);
+
 #define define_get_linear_pagetable(level)                                  \
 static int                                                                  \
 get_##level##_linear_pagetable(                                             \
@@@@ -809,6 +812,12 @@@@ get_##level##_linear_pagetable(                                             \
     struct page_info *page;                                                 \
     unsigned long pfn;                                                      \
                                                                             \
+    if ( !pv_linear_pt_enable )                                             \
+    {                                                                       \
+        MEM_LOG("Attempt to create linear p.t. (feature disabled)");        \
+        return 0;                                                           \
+    }                                                                       \
+                                                                            \
     if ( (level##e_get_flags(pde) & _PAGE_RW) )                             \
     {                                                                       \
         MEM_LOG("Attempt to create linear p.t. with write perms");          \
-- 
2.14.1

@


