head	1.2;
access;
symbols
	pkgsrc-2020Q2:1.1.0.8
	pkgsrc-2020Q2-base:1.1
	pkgsrc-2020Q1:1.1.0.4
	pkgsrc-2020Q1-base:1.1
	pkgsrc-2019Q4:1.1.0.6
	pkgsrc-2019Q4-base:1.1
	pkgsrc-2019Q3:1.1.0.2;
locks; strict;
comment	@# @;


1.2
date	2020.08.24.10.35.35;	author bouyer;	state dead;
branches;
next	1.1;
commitid	MGsqrLPx72UHUilC;

1.1
date	2019.12.13.13.44.21;	author bouyer;	state Exp;
branches
	1.1.2.1;
next	;
commitid	w6P0WFKdEprc9yOB;

1.1.2.1
date	2019.12.13.13.44.21;	author bsiegert;	state dead;
branches;
next	1.1.2.2;
commitid	TcQZmrJvhj3Y6WOB;

1.1.2.2
date	2019.12.16.13.51.58;	author bsiegert;	state Exp;
branches;
next	;
commitid	TcQZmrJvhj3Y6WOB;


desc
@@


1.2
log
@Update to 4.11.4nb1
Keep PKGREVISION at 1 to reflect that it's not a stock Xen 4.11.4 kernel,
we have additinnal security fixes (all relevant patches from upstream to
date).
Changes: mosly bug fixes and improvements; better support for newer AMD CPUs.
full changelog at https://xenproject.org/downloads/xen-project-archives/xen-proj
ect-4-11-series/xen-project-4-11-4/
@
text
@$NetBSD: patch-XSA311,v 1.1 2019/12/13 13:44:21 bouyer Exp $

From: Andrew Cooper <andrew.cooper3@@citrix.com>
Subject: AMD/IOMMU: Cease using a dynamic height for the IOMMU pagetables

update_paging_mode() has multiple bugs:

 1) Booting with iommu=debug will cause it to inform you that that it called
    without the pdev_list lock held.
 2) When growing by more than a single level, it leaks the newly allocated
    table(s) in the case of a further error.

Furthermore, the choice of default level for a domain has issues:

 1) All HVM guests grow from 2 to 3 levels during construction because of the
    position of the VRAM just below the 4G boundary, so defaulting to 2 is a
    waste of effort.
 2) The limit for PV guests doesn't take memory hotplug into account, and
    isn't dynamic at runtime like HVM guests.  This means that a PV guest may
    get RAM which it can't map in the IOMMU.

The dynamic height is a property unique to AMD, and adds a substantial
quantity of complexity for what is a marginal performance improvement.  Remove
the complexity by removing the dynamic height.

PV guests now get 3 or 4 levels based on any hotplug regions in the host.
This only makes a difference for hardware which previously had all RAM below
the 512G boundary, and a hotplug region above.

HVM guests now get 4 levels (which will be sufficient until 256TB guests
become a thing), because we don't currently have the information to know when
3 would be safe to use.

The overhead of this extra level is not expected to be noticeable.  It costs
one page (4k) per domain, and one extra IO-TLB paging structure cache entry
which is very hot and less likely to be evicted.

This is XSA-311.

Reported-by: XXX PERSON <XXX EMAIL>3
Signed-off-by: Andrew Cooper <andrew.cooper3@@citrix.com>
Signed-off-by: Andrew Cooper <andrew.cooper3@@citrix.com>
Acked-by: Jan Beulich <jbeulich@@suse.com>

--- xen/drivers/passthrough/amd/iommu_map.c.orig
+++ xen/drivers/passthrough/amd/iommu_map.c
@@@@ -569,97 +569,6 @@@@ static int iommu_pde_from_gfn(struct dom
     return 0;
 }
 
-static int update_paging_mode(struct domain *d, unsigned long gfn)
-{
-    u16 bdf;
-    void *device_entry;
-    unsigned int req_id, level, offset;
-    unsigned long flags;
-    struct pci_dev *pdev;
-    struct amd_iommu *iommu = NULL;
-    struct page_info *new_root = NULL;
-    struct page_info *old_root = NULL;
-    void *new_root_vaddr;
-    unsigned long old_root_mfn;
-    struct domain_iommu *hd = dom_iommu(d);
-
-    if ( gfn == gfn_x(INVALID_GFN) )
-        return -EADDRNOTAVAIL;
-    ASSERT(!(gfn >> DEFAULT_DOMAIN_ADDRESS_WIDTH));
-
-    level = hd->arch.paging_mode;
-    old_root = hd->arch.root_table;
-    offset = gfn >> (PTE_PER_TABLE_SHIFT * (level - 1));
-
-    ASSERT(spin_is_locked(&hd->arch.mapping_lock) && is_hvm_domain(d));
-
-    while ( offset >= PTE_PER_TABLE_SIZE )
-    {
-        /* Allocate and install a new root table.
-         * Only upper I/O page table grows, no need to fix next level bits */
-        new_root = alloc_amd_iommu_pgtable();
-        if ( new_root == NULL )
-        {
-            AMD_IOMMU_DEBUG("%s Cannot allocate I/O page table\n",
-                            __func__);
-            return -ENOMEM;
-        }
-
-        new_root_vaddr = __map_domain_page(new_root);
-        old_root_mfn = mfn_x(page_to_mfn(old_root));
-        set_iommu_pde_present(new_root_vaddr, old_root_mfn, level,
-                              !!IOMMUF_writable, !!IOMMUF_readable);
-        level++;
-        old_root = new_root;
-        offset >>= PTE_PER_TABLE_SHIFT;
-        unmap_domain_page(new_root_vaddr);
-    }
-
-    if ( new_root != NULL )
-    {
-        hd->arch.paging_mode = level;
-        hd->arch.root_table = new_root;
-
-        if ( !pcidevs_locked() )
-            AMD_IOMMU_DEBUG("%s Try to access pdev_list "
-                            "without aquiring pcidevs_lock.\n", __func__);
-
-        /* Update device table entries using new root table and paging mode */
-        for_each_pdev( d, pdev )
-        {
-            bdf = PCI_BDF2(pdev->bus, pdev->devfn);
-            iommu = find_iommu_for_device(pdev->seg, bdf);
-            if ( !iommu )
-            {
-                AMD_IOMMU_DEBUG("%s Fail to find iommu.\n", __func__);
-                return -ENODEV;
-            }
-
-            spin_lock_irqsave(&iommu->lock, flags);
-            do {
-                req_id = get_dma_requestor_id(pdev->seg, bdf);
-                device_entry = iommu->dev_table.buffer +
-                               (req_id * IOMMU_DEV_TABLE_ENTRY_SIZE);
-
-                /* valid = 0 only works for dom0 passthrough mode */
-                amd_iommu_set_root_page_table((u32 *)device_entry,
-                                              page_to_maddr(hd->arch.root_table),
-                                              d->domain_id,
-                                              hd->arch.paging_mode, 1);
-
-                amd_iommu_flush_device(iommu, req_id);
-                bdf += pdev->phantom_stride;
-            } while ( PCI_DEVFN2(bdf) != pdev->devfn &&
-                      PCI_SLOT(bdf) == PCI_SLOT(pdev->devfn) );
-            spin_unlock_irqrestore(&iommu->lock, flags);
-        }
-
-        /* For safety, invalidate all entries */
-        amd_iommu_flush_all_pages(d);
-    }
-    return 0;
-}
-
 int amd_iommu_map_page(struct domain *d, unsigned long gfn, unsigned long mfn,
                        unsigned int flags)
 {
@@@@ -685,19 +594,6 @@@@ int amd_iommu_map_page(struct domain *d,
         return rc;
     }
 
-    /* Since HVM domain is initialized with 2 level IO page table,
-     * we might need a deeper page table for lager gfn now */
-    if ( is_hvm_domain(d) )
-    {
-        if ( update_paging_mode(d, gfn) )
-        {
-            spin_unlock(&hd->arch.mapping_lock);
-            AMD_IOMMU_DEBUG("Update page mode failed gfn = %lx\n", gfn);
-            domain_crash(d);
-            return -EFAULT;
-        }
-    }
-
     if ( iommu_pde_from_gfn(d, gfn, pt_mfn, true) || (pt_mfn[1] == 0) )
     {
         spin_unlock(&hd->arch.mapping_lock);
--- xen/drivers/passthrough/amd/pci_amd_iommu.c.orig
+++ xen/drivers/passthrough/amd/pci_amd_iommu.c
@@@@ -242,11 +242,17 @@@@ static int amd_iommu_domain_init(struct
 {
     struct domain_iommu *hd = dom_iommu(d);
 
-    /* For pv and dom0, stick with get_paging_mode(max_page)
-     * For HVM dom0, use 2 level page table at first */
-    hd->arch.paging_mode = is_hvm_domain(d) ?
-                      IOMMU_PAGING_MODE_LEVEL_2 :
-                      get_paging_mode(max_page);
+    /*
+     * Choose the number of levels for the IOMMU page tables.
+     * - PV needs 3 or 4, depending on whether there is RAM (including hotplug
+     *   RAM) above the 512G boundary.
+     * - HVM could in principle use 3 or 4 depending on how much guest
+     *   physical address space we give it, but this isn't known yet so use 4
+     *   unilaterally.
+     */
+    hd->arch.paging_mode = is_hvm_domain(d)
+        ? IOMMU_PAGING_MODE_LEVEL_4 : get_paging_mode(get_upper_mfn_bound());
+
     return 0;
 }
 
@


1.1
log
@Update xenkernel411 to 4.11.3nb1, and xentools411 to 4.11.3
(PKGREVISION not reset on xenkernel411 on purpose, to enphasis that it's
not a stock Xen 4.11.3 kernel).
Changes since 4.11.2:
- includes all security patches up to XSA306
- other minor bug fixes, hardware support and performances improvements

In addition, xenkernel411 includes all security patches released since 4.11.3,
up to XSA311
@
text
@d1 1
a1 1
$NetBSD: $
@


1.1.2.1
log
@file patch-XSA311 was added on branch pkgsrc-2019Q3 on 2019-12-16 13:51:58 +0000
@
text
@d1 189
@


1.1.2.2
log
@Pullup ticket #6104 - requested by bouyer
sysutils/xenkernel411, sysutils/xentools411: security fix

Revisions pulled up:
- sysutils/xenkernel411/Makefile                                1.12
- sysutils/xenkernel411/distinfo                                1.9
- sysutils/xenkernel411/patches/patch-XSA298                    deleted
- sysutils/xenkernel411/patches/patch-XSA299                    deleted
- sysutils/xenkernel411/patches/patch-XSA302                    deleted
- sysutils/xenkernel411/patches/patch-XSA304                    deleted
- sysutils/xenkernel411/patches/patch-XSA305                    deleted
- sysutils/xenkernel411/patches/patch-XSA306                    deleted
- sysutils/xenkernel411/patches/patch-XSA307                    1.1
- sysutils/xenkernel411/patches/patch-XSA308                    1.1
- sysutils/xenkernel411/patches/patch-XSA309                    1.1
- sysutils/xenkernel411/patches/patch-XSA310                    1.1
- sysutils/xenkernel411/patches/patch-XSA311                    1.1
- sysutils/xentools411/Makefile                                 1.12
- sysutils/xentools411/distinfo                                 1.8

---
   Module Name:	pkgsrc
   Committed By:	bouyer
   Date:		Fri Dec 13 13:44:21 UTC 2019

   Modified Files:
   	pkgsrc/sysutils/xenkernel411: Makefile distinfo
   	pkgsrc/sysutils/xentools411: Makefile distinfo
   Added Files:
   	pkgsrc/sysutils/xenkernel411/patches: patch-XSA307 patch-XSA308
   	    patch-XSA309 patch-XSA310 patch-XSA311
   Removed Files:
   	pkgsrc/sysutils/xenkernel411/patches: patch-XSA298 patch-XSA299
   	    patch-XSA302 patch-XSA304 patch-XSA305 patch-XSA306

   Log Message:
   Update xenkernel411 to 4.11.3nb1, and xentools411 to 4.11.3
   (PKGREVISION not reset on xenkernel411 on purpose, to enphasis that it's
   not a stock Xen 4.11.3 kernel).
   Changes since 4.11.2:
   - includes all security patches up to XSA306
   - other minor bug fixes, hardware support and performances improvements

   In addition, xenkernel411 includes all security patches released since 4.11.3,
   up to XSA311
@
text
@a0 189
$NetBSD: patch-XSA311,v 1.1 2019/12/13 13:44:21 bouyer Exp $

From: Andrew Cooper <andrew.cooper3@@citrix.com>
Subject: AMD/IOMMU: Cease using a dynamic height for the IOMMU pagetables

update_paging_mode() has multiple bugs:

 1) Booting with iommu=debug will cause it to inform you that that it called
    without the pdev_list lock held.
 2) When growing by more than a single level, it leaks the newly allocated
    table(s) in the case of a further error.

Furthermore, the choice of default level for a domain has issues:

 1) All HVM guests grow from 2 to 3 levels during construction because of the
    position of the VRAM just below the 4G boundary, so defaulting to 2 is a
    waste of effort.
 2) The limit for PV guests doesn't take memory hotplug into account, and
    isn't dynamic at runtime like HVM guests.  This means that a PV guest may
    get RAM which it can't map in the IOMMU.

The dynamic height is a property unique to AMD, and adds a substantial
quantity of complexity for what is a marginal performance improvement.  Remove
the complexity by removing the dynamic height.

PV guests now get 3 or 4 levels based on any hotplug regions in the host.
This only makes a difference for hardware which previously had all RAM below
the 512G boundary, and a hotplug region above.

HVM guests now get 4 levels (which will be sufficient until 256TB guests
become a thing), because we don't currently have the information to know when
3 would be safe to use.

The overhead of this extra level is not expected to be noticeable.  It costs
one page (4k) per domain, and one extra IO-TLB paging structure cache entry
which is very hot and less likely to be evicted.

This is XSA-311.

Reported-by: XXX PERSON <XXX EMAIL>3
Signed-off-by: Andrew Cooper <andrew.cooper3@@citrix.com>
Signed-off-by: Andrew Cooper <andrew.cooper3@@citrix.com>
Acked-by: Jan Beulich <jbeulich@@suse.com>

--- xen/drivers/passthrough/amd/iommu_map.c.orig
+++ xen/drivers/passthrough/amd/iommu_map.c
@@@@ -569,97 +569,6 @@@@ static int iommu_pde_from_gfn(struct dom
     return 0;
 }
 
-static int update_paging_mode(struct domain *d, unsigned long gfn)
-{
-    u16 bdf;
-    void *device_entry;
-    unsigned int req_id, level, offset;
-    unsigned long flags;
-    struct pci_dev *pdev;
-    struct amd_iommu *iommu = NULL;
-    struct page_info *new_root = NULL;
-    struct page_info *old_root = NULL;
-    void *new_root_vaddr;
-    unsigned long old_root_mfn;
-    struct domain_iommu *hd = dom_iommu(d);
-
-    if ( gfn == gfn_x(INVALID_GFN) )
-        return -EADDRNOTAVAIL;
-    ASSERT(!(gfn >> DEFAULT_DOMAIN_ADDRESS_WIDTH));
-
-    level = hd->arch.paging_mode;
-    old_root = hd->arch.root_table;
-    offset = gfn >> (PTE_PER_TABLE_SHIFT * (level - 1));
-
-    ASSERT(spin_is_locked(&hd->arch.mapping_lock) && is_hvm_domain(d));
-
-    while ( offset >= PTE_PER_TABLE_SIZE )
-    {
-        /* Allocate and install a new root table.
-         * Only upper I/O page table grows, no need to fix next level bits */
-        new_root = alloc_amd_iommu_pgtable();
-        if ( new_root == NULL )
-        {
-            AMD_IOMMU_DEBUG("%s Cannot allocate I/O page table\n",
-                            __func__);
-            return -ENOMEM;
-        }
-
-        new_root_vaddr = __map_domain_page(new_root);
-        old_root_mfn = mfn_x(page_to_mfn(old_root));
-        set_iommu_pde_present(new_root_vaddr, old_root_mfn, level,
-                              !!IOMMUF_writable, !!IOMMUF_readable);
-        level++;
-        old_root = new_root;
-        offset >>= PTE_PER_TABLE_SHIFT;
-        unmap_domain_page(new_root_vaddr);
-    }
-
-    if ( new_root != NULL )
-    {
-        hd->arch.paging_mode = level;
-        hd->arch.root_table = new_root;
-
-        if ( !pcidevs_locked() )
-            AMD_IOMMU_DEBUG("%s Try to access pdev_list "
-                            "without aquiring pcidevs_lock.\n", __func__);
-
-        /* Update device table entries using new root table and paging mode */
-        for_each_pdev( d, pdev )
-        {
-            bdf = PCI_BDF2(pdev->bus, pdev->devfn);
-            iommu = find_iommu_for_device(pdev->seg, bdf);
-            if ( !iommu )
-            {
-                AMD_IOMMU_DEBUG("%s Fail to find iommu.\n", __func__);
-                return -ENODEV;
-            }
-
-            spin_lock_irqsave(&iommu->lock, flags);
-            do {
-                req_id = get_dma_requestor_id(pdev->seg, bdf);
-                device_entry = iommu->dev_table.buffer +
-                               (req_id * IOMMU_DEV_TABLE_ENTRY_SIZE);
-
-                /* valid = 0 only works for dom0 passthrough mode */
-                amd_iommu_set_root_page_table((u32 *)device_entry,
-                                              page_to_maddr(hd->arch.root_table),
-                                              d->domain_id,
-                                              hd->arch.paging_mode, 1);
-
-                amd_iommu_flush_device(iommu, req_id);
-                bdf += pdev->phantom_stride;
-            } while ( PCI_DEVFN2(bdf) != pdev->devfn &&
-                      PCI_SLOT(bdf) == PCI_SLOT(pdev->devfn) );
-            spin_unlock_irqrestore(&iommu->lock, flags);
-        }
-
-        /* For safety, invalidate all entries */
-        amd_iommu_flush_all_pages(d);
-    }
-    return 0;
-}
-
 int amd_iommu_map_page(struct domain *d, unsigned long gfn, unsigned long mfn,
                        unsigned int flags)
 {
@@@@ -685,19 +594,6 @@@@ int amd_iommu_map_page(struct domain *d,
         return rc;
     }
 
-    /* Since HVM domain is initialized with 2 level IO page table,
-     * we might need a deeper page table for lager gfn now */
-    if ( is_hvm_domain(d) )
-    {
-        if ( update_paging_mode(d, gfn) )
-        {
-            spin_unlock(&hd->arch.mapping_lock);
-            AMD_IOMMU_DEBUG("Update page mode failed gfn = %lx\n", gfn);
-            domain_crash(d);
-            return -EFAULT;
-        }
-    }
-
     if ( iommu_pde_from_gfn(d, gfn, pt_mfn, true) || (pt_mfn[1] == 0) )
     {
         spin_unlock(&hd->arch.mapping_lock);
--- xen/drivers/passthrough/amd/pci_amd_iommu.c.orig
+++ xen/drivers/passthrough/amd/pci_amd_iommu.c
@@@@ -242,11 +242,17 @@@@ static int amd_iommu_domain_init(struct
 {
     struct domain_iommu *hd = dom_iommu(d);
 
-    /* For pv and dom0, stick with get_paging_mode(max_page)
-     * For HVM dom0, use 2 level page table at first */
-    hd->arch.paging_mode = is_hvm_domain(d) ?
-                      IOMMU_PAGING_MODE_LEVEL_2 :
-                      get_paging_mode(max_page);
+    /*
+     * Choose the number of levels for the IOMMU page tables.
+     * - PV needs 3 or 4, depending on whether there is RAM (including hotplug
+     *   RAM) above the 512G boundary.
+     * - HVM could in principle use 3 or 4 depending on how much guest
+     *   physical address space we give it, but this isn't known yet so use 4
+     *   unilaterally.
+     */
+    hd->arch.paging_mode = is_hvm_domain(d)
+        ? IOMMU_PAGING_MODE_LEVEL_4 : get_paging_mode(get_upper_mfn_bound());
+
     return 0;
 }
 
@