author    | Ben Hutchings <ben@decadent.org.uk> | 2018-09-28 04:01:44 +0100
committer | Ben Hutchings <ben@decadent.org.uk> | 2018-09-28 04:49:54 +0100
commit    | ac5ee8cb77fe9723357b26d1b605a5e6907d4607
tree      | dc46252790d2d3a6d232c417e6d01719d5845fb3
parent    | 84e721a6cc8601a459500305fc8fe4c01905696d
download  | linux-stable-queue-ac5ee8cb77fe9723357b26d1b605a5e6907d4607.tar.gz
Backport L1TF mitigation
32 files changed, 3182 insertions, 0 deletions
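Before the individual patches, a hedged standalone sketch of the idea that gives this mitigation its "Page Table Inversion" name (plain C, illustrative only; not the kernel's exact code, and the bit masks are simplified): a non-present PTE still carries a valid physical address in its address bits, which an L1TF attack can use to read data from the L1D cache. The backported patches store those bits inverted, so speculation resolves to an address outside populated memory.

    #include <stdint.h>
    #include <stdio.h>

    #define PAGE_PRESENT 0x1ULL
    /* bits 12-51 carry the physical address on x86-64 */
    #define ADDR_MASK    0x000ffffffffff000ULL

    /* clear Present and invert the physical address bits */
    static uint64_t pte_mknotpresent_inverted(uint64_t pte)
    {
            return (pte & ~(PAGE_PRESENT | ADDR_MASK)) | (~pte & ADDR_MASK);
    }

    int main(void)
    {
            uint64_t pte = 0x12345000ULL | PAGE_PRESENT;

            printf("present:     %#018llx\n", (unsigned long long)pte);
            printf("not present: %#018llx\n",
                   (unsigned long long)pte_mknotpresent_inverted(pte));
            return 0;
    }

Applying the same transformation again restores the original address bits, which is why the kernel can still recover the PFN from an inverted entry.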
diff --git a/queue-3.16/drm-drivers-add-support-for-using-the-arch-wc-mapping-api.patch b/queue-3.16/drm-drivers-add-support-for-using-the-arch-wc-mapping-api.patch new file mode 100644 index 00000000..76c52315 --- /dev/null +++ b/queue-3.16/drm-drivers-add-support-for-using-the-arch-wc-mapping-api.patch @@ -0,0 +1,157 @@ +From: Dave Airlie <airlied@redhat.com> +Date: Mon, 24 Oct 2016 15:37:48 +1000 +Subject: drm/drivers: add support for using the arch wc mapping API. +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +commit 7cf321d118a825c1541b43ca45294126fd474efa upstream. + +This fixes a regression in all these drivers since the cache +mode tracking was fixed for mixed mappings. It uses the new +arch API to add the VRAM range to the PAT mapping tracking +tables. + +Fixes: 87744ab3832 (mm: fix cache mode tracking in vm_insert_mixed()) +Reviewed-by: Christian König <christian.koenig@amd.com>. +Signed-off-by: Dave Airlie <airlied@redhat.com> +[bwh: Backported to 3.16: + - Drop changes in amdgpu + - In nouveau, use struct nouveau_device * and nv_device_resource_{start,len}() + - Adjust context] +Signed-off-by: Ben Hutchings <ben@decadent.org.uk> +--- +--- a/drivers/gpu/drm/ast/ast_ttm.c ++++ b/drivers/gpu/drm/ast/ast_ttm.c +@@ -275,6 +275,8 @@ int ast_mm_init(struct ast_private *ast) + return ret; + } + ++ arch_io_reserve_memtype_wc(pci_resource_start(dev->pdev, 0), ++ pci_resource_len(dev->pdev, 0)); + ast->fb_mtrr = arch_phys_wc_add(pci_resource_start(dev->pdev, 0), + pci_resource_len(dev->pdev, 0)); + +@@ -283,11 +285,15 @@ int ast_mm_init(struct ast_private *ast) + + void ast_mm_fini(struct ast_private *ast) + { ++ struct drm_device *dev = ast->dev; ++ + ttm_bo_device_release(&ast->ttm.bdev); + + ast_ttm_global_release(ast); + + arch_phys_wc_del(ast->fb_mtrr); ++ arch_io_free_memtype_wc(pci_resource_start(dev->pdev, 0), ++ pci_resource_len(dev->pdev, 0)); + } + + void ast_ttm_placement(struct ast_bo *bo, int domain) +--- a/drivers/gpu/drm/cirrus/cirrus_ttm.c ++++ b/drivers/gpu/drm/cirrus/cirrus_ttm.c +@@ -275,6 +275,9 @@ int cirrus_mm_init(struct cirrus_device + return ret; + } + ++ arch_io_reserve_memtype_wc(pci_resource_start(dev->pdev, 0), ++ pci_resource_len(dev->pdev, 0)); ++ + cirrus->fb_mtrr = arch_phys_wc_add(pci_resource_start(dev->pdev, 0), + pci_resource_len(dev->pdev, 0)); + +@@ -284,6 +287,8 @@ int cirrus_mm_init(struct cirrus_device + + void cirrus_mm_fini(struct cirrus_device *cirrus) + { ++ struct drm_device *dev = cirrus->dev; ++ + if (!cirrus->mm_inited) + return; + +@@ -293,6 +298,8 @@ void cirrus_mm_fini(struct cirrus_device + + arch_phys_wc_del(cirrus->fb_mtrr); + cirrus->fb_mtrr = 0; ++ arch_io_free_memtype_wc(pci_resource_start(dev->pdev, 0), ++ pci_resource_len(dev->pdev, 0)); + } + + void cirrus_ttm_placement(struct cirrus_bo *bo, int domain) +--- a/drivers/gpu/drm/mgag200/mgag200_ttm.c ++++ b/drivers/gpu/drm/mgag200/mgag200_ttm.c +@@ -274,6 +274,9 @@ int mgag200_mm_init(struct mga_device *m + return ret; + } + ++ arch_io_reserve_memtype_wc(pci_resource_start(dev->pdev, 0), ++ pci_resource_len(dev->pdev, 0)); ++ + mdev->fb_mtrr = arch_phys_wc_add(pci_resource_start(dev->pdev, 0), + pci_resource_len(dev->pdev, 0)); + +@@ -282,10 +285,14 @@ int mgag200_mm_init(struct mga_device *m + + void mgag200_mm_fini(struct mga_device *mdev) + { ++ struct drm_device *dev = mdev->dev; ++ + ttm_bo_device_release(&mdev->ttm.bdev); + + mgag200_ttm_global_release(mdev); + ++ arch_io_free_memtype_wc(pci_resource_start(dev->pdev, 0), ++ 
pci_resource_len(dev->pdev, 0)); + arch_phys_wc_del(mdev->fb_mtrr); + mdev->fb_mtrr = 0; + } +--- a/drivers/gpu/drm/nouveau/nouveau_ttm.c ++++ b/drivers/gpu/drm/nouveau/nouveau_ttm.c +@@ -397,6 +397,9 @@ nouveau_ttm_init(struct nouveau_drm *drm + drm->gem.vram_available = nouveau_fb(drm->device)->ram->size; + drm->gem.vram_available -= nouveau_instmem(drm->device)->reserved; + ++ arch_io_reserve_memtype_wc(nv_device_resource_start(device, 1), ++ nv_device_resource_len(device, 1)); ++ + ret = ttm_bo_init_mm(&drm->ttm.bdev, TTM_PL_VRAM, + drm->gem.vram_available >> PAGE_SHIFT); + if (ret) { +@@ -429,6 +432,8 @@ nouveau_ttm_init(struct nouveau_drm *drm + void + nouveau_ttm_fini(struct nouveau_drm *drm) + { ++ struct nouveau_device *device = nv_device(drm->device); ++ + mutex_lock(&drm->dev->struct_mutex); + ttm_bo_clean_mm(&drm->ttm.bdev, TTM_PL_VRAM); + ttm_bo_clean_mm(&drm->ttm.bdev, TTM_PL_TT); +@@ -440,4 +445,7 @@ nouveau_ttm_fini(struct nouveau_drm *drm + + arch_phys_wc_del(drm->ttm.mtrr); + drm->ttm.mtrr = 0; ++ arch_io_free_memtype_wc(nv_device_resource_start(device, 1), ++ nv_device_resource_len(device, 1)); ++ + } +--- a/drivers/gpu/drm/radeon/radeon_object.c ++++ b/drivers/gpu/drm/radeon/radeon_object.c +@@ -359,6 +359,10 @@ void radeon_bo_force_delete(struct radeo + + int radeon_bo_init(struct radeon_device *rdev) + { ++ /* reserve PAT memory space to WC for VRAM */ ++ arch_io_reserve_memtype_wc(rdev->mc.aper_base, ++ rdev->mc.aper_size); ++ + /* Add an MTRR for the VRAM */ + if (!rdev->fastfb_working) { + rdev->mc.vram_mtrr = arch_phys_wc_add(rdev->mc.aper_base, +@@ -376,6 +380,7 @@ void radeon_bo_fini(struct radeon_device + { + radeon_ttm_fini(rdev); + arch_phys_wc_del(rdev->mc.vram_mtrr); ++ arch_io_free_memtype_wc(rdev->mc.aper_base, rdev->mc.aper_size); + } + + /* Returns how many bytes TTM can move per IB. diff --git a/queue-3.16/mm-add-vm_insert_pfn_prot.patch b/queue-3.16/mm-add-vm_insert_pfn_prot.patch new file mode 100644 index 00000000..603d67fe --- /dev/null +++ b/queue-3.16/mm-add-vm_insert_pfn_prot.patch @@ -0,0 +1,97 @@ +From: Andy Lutomirski <luto@kernel.org> +Date: Tue, 29 Dec 2015 20:12:20 -0800 +Subject: mm: Add vm_insert_pfn_prot() + +commit 1745cbc5d0dee0749a6bc0ea8e872c5db0074061 upstream. + +The x86 vvar vma contains pages with differing cacheability +flags. x86 currently implements this by manually inserting all +the ptes using (io_)remap_pfn_range when the vma is set up. + +x86 wants to move to using .fault with VM_FAULT_NOPAGE to set up +the mappings as needed. The correct API to use to insert a pfn +in .fault is vm_insert_pfn(), but vm_insert_pfn() can't override the +vma's cache mode, and the HPET page in particular needs to be +uncached despite the fact that the rest of the VMA is cached. + +Add vm_insert_pfn_prot() to support varying cacheability within +the same non-COW VMA in a more sane manner. + +x86 could alternatively use multiple VMAs, but that's messy, +would break CRIU, and would create unnecessary VMAs that would +waste memory. + +Signed-off-by: Andy Lutomirski <luto@kernel.org> +Reviewed-by: Kees Cook <keescook@chromium.org> +Acked-by: Andrew Morton <akpm@linux-foundation.org> +Cc: Andy Lutomirski <luto@amacapital.net> +Cc: Borislav Petkov <bp@alien8.de> +Cc: Dave Hansen <dave.hansen@linux.intel.com> +Cc: Fenghua Yu <fenghua.yu@intel.com> +Cc: H. 
Peter Anvin <hpa@zytor.com> +Cc: Linus Torvalds <torvalds@linux-foundation.org> +Cc: Oleg Nesterov <oleg@redhat.com> +Cc: Peter Zijlstra <peterz@infradead.org> +Cc: Quentin Casasnovas <quentin.casasnovas@oracle.com> +Cc: Thomas Gleixner <tglx@linutronix.de> +Link: http://lkml.kernel.org/r/d2938d1eb37be7a5e4f86182db646551f11e45aa.1451446564.git.luto@kernel.org +Signed-off-by: Ingo Molnar <mingo@kernel.org> +Signed-off-by: Ben Hutchings <ben@decadent.org.uk> +--- + include/linux/mm.h | 2 ++ + mm/memory.c | 25 +++++++++++++++++++++++-- + 2 files changed, 25 insertions(+), 2 deletions(-) + +--- a/include/linux/mm.h ++++ b/include/linux/mm.h +@@ -1965,6 +1965,8 @@ int remap_pfn_range(struct vm_area_struc + int vm_insert_page(struct vm_area_struct *, unsigned long addr, struct page *); + int vm_insert_pfn(struct vm_area_struct *vma, unsigned long addr, + unsigned long pfn); ++int vm_insert_pfn_prot(struct vm_area_struct *vma, unsigned long addr, ++ unsigned long pfn, pgprot_t pgprot); + int vm_insert_mixed(struct vm_area_struct *vma, unsigned long addr, + unsigned long pfn); + int vm_iomap_memory(struct vm_area_struct *vma, phys_addr_t start, unsigned long len); +--- a/mm/memory.c ++++ b/mm/memory.c +@@ -1574,8 +1574,29 @@ out: + int vm_insert_pfn(struct vm_area_struct *vma, unsigned long addr, + unsigned long pfn) + { ++ return vm_insert_pfn_prot(vma, addr, pfn, vma->vm_page_prot); ++} ++EXPORT_SYMBOL(vm_insert_pfn); ++ ++/** ++ * vm_insert_pfn_prot - insert single pfn into user vma with specified pgprot ++ * @vma: user vma to map to ++ * @addr: target user address of this page ++ * @pfn: source kernel pfn ++ * @pgprot: pgprot flags for the inserted page ++ * ++ * This is exactly like vm_insert_pfn, except that it allows drivers to ++ * to override pgprot on a per-page basis. ++ * ++ * This only makes sense for IO mappings, and it makes no sense for ++ * cow mappings. In general, using multiple vmas is preferable; ++ * vm_insert_pfn_prot should only be used if using multiple VMAs is ++ * impractical. ++ */ ++int vm_insert_pfn_prot(struct vm_area_struct *vma, unsigned long addr, ++ unsigned long pfn, pgprot_t pgprot) ++{ + int ret; +- pgprot_t pgprot = vma->vm_page_prot; + /* + * Technically, architectures with pte_special can avoid all these + * restrictions (same for remap_pfn_range). However we would like +@@ -1597,7 +1618,7 @@ int vm_insert_pfn(struct vm_area_struct + + return ret; + } +-EXPORT_SYMBOL(vm_insert_pfn); ++EXPORT_SYMBOL(vm_insert_pfn_prot); + + int vm_insert_mixed(struct vm_area_struct *vma, unsigned long addr, + unsigned long pfn) diff --git a/queue-3.16/mm-fix-cache-mode-tracking-in-vm_insert_mixed.patch b/queue-3.16/mm-fix-cache-mode-tracking-in-vm_insert_mixed.patch new file mode 100644 index 00000000..f74904b8 --- /dev/null +++ b/queue-3.16/mm-fix-cache-mode-tracking-in-vm_insert_mixed.patch @@ -0,0 +1,59 @@ +From: Dan Williams <dan.j.williams@intel.com> +Date: Fri, 7 Oct 2016 17:00:18 -0700 +Subject: mm: fix cache mode tracking in vm_insert_mixed() + +commit 9ac0dc7d949db7afd4116d55fa4fcf6a66d820f0 upstream. + +commit 87744ab3832b83ba71b931f86f9cfdb000d07da5 upstream + +vm_insert_mixed() unlike vm_insert_pfn_prot() and vmf_insert_pfn_pmd(), +fails to check the pgprot_t it uses for the mapping against the one +recorded in the memtype tracking tree. Add the missing call to +track_pfn_insert() to preclude cases where incompatible aliased mappings +are established for a given physical address range. 
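As a hedged illustration of the call pattern these two mm patches enable, here is a sketch of a driver ->fault handler inserting one PFN with a per-page pgprot. The my_* helpers are hypothetical stand-ins, not from any patch in this queue, and the vma is assumed to be VM_PFNMAP:

    #include <linux/mm.h>

    /* hypothetical driver logic */
    static bool my_page_is_uncached(pgoff_t pgoff);
    static unsigned long my_dev_pfn(pgoff_t pgoff);

    static int my_dev_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
    {
            pgprot_t prot = vma->vm_page_prot;

            /* override the cache mode for selected pages only */
            if (my_page_is_uncached(vmf->pgoff))
                    prot = pgprot_noncached(prot);

            if (vm_insert_pfn_prot(vma, (unsigned long)vmf->virtual_address,
                                   my_dev_pfn(vmf->pgoff), prot))
                    return VM_FAULT_SIGBUS;
            return VM_FAULT_NOPAGE;
    }

This is exactly the x86 vvar use case described above: one VMA, mostly cached, with the HPET page uncached.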
+ +[groeck: Backport to v4.4.y] + +Link: http://lkml.kernel.org/r/147328717909.35069.14256589123570653697.stgit@dwillia2-desk3.amr.corp.intel.com +Signed-off-by: Dan Williams <dan.j.williams@intel.com> +Cc: David Airlie <airlied@linux.ie> +Cc: Matthew Wilcox <mawilcox@microsoft.com> +Cc: Ross Zwisler <ross.zwisler@linux.intel.com> +Signed-off-by: Andrew Morton <akpm@linux-foundation.org> +Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org> +Signed-off-by: Guenter Roeck <linux@roeck-us.net> +Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org> +Signed-off-by: Ben Hutchings <ben@decadent.org.uk> +--- + mm/memory.c | 8 ++++++-- + 1 file changed, 6 insertions(+), 2 deletions(-) + +--- a/mm/memory.c ++++ b/mm/memory.c +@@ -1623,10 +1623,14 @@ EXPORT_SYMBOL(vm_insert_pfn_prot); + int vm_insert_mixed(struct vm_area_struct *vma, unsigned long addr, + unsigned long pfn) + { ++ pgprot_t pgprot = vma->vm_page_prot; ++ + BUG_ON(!(vma->vm_flags & VM_MIXEDMAP)); + + if (addr < vma->vm_start || addr >= vma->vm_end) + return -EFAULT; ++ if (track_pfn_insert(vma, &pgprot, pfn)) ++ return -EINVAL; + + /* + * If we don't have pte special, then we have to use the pfn_valid() +@@ -1639,9 +1643,9 @@ int vm_insert_mixed(struct vm_area_struc + struct page *page; + + page = pfn_to_page(pfn); +- return insert_page(vma, addr, page, vma->vm_page_prot); ++ return insert_page(vma, addr, page, pgprot); + } +- return insert_pfn(vma, addr, pfn, vma->vm_page_prot); ++ return insert_pfn(vma, addr, pfn, pgprot); + } + EXPORT_SYMBOL(vm_insert_mixed); + diff --git a/queue-3.16/mm-pagewalk-remove-pgd_entry-and-pud_entry.patch b/queue-3.16/mm-pagewalk-remove-pgd_entry-and-pud_entry.patch new file mode 100644 index 00000000..9e9400cf --- /dev/null +++ b/queue-3.16/mm-pagewalk-remove-pgd_entry-and-pud_entry.patch @@ -0,0 +1,74 @@ +From: Naoya Horiguchi <n-horiguchi@ah.jp.nec.com> +Date: Wed, 11 Feb 2015 15:27:34 -0800 +Subject: mm/pagewalk: remove pgd_entry() and pud_entry() + +commit 0b1fbfe50006c41014cc25660c0e735d21c34939 upstream. + +Currently no user of page table walker sets ->pgd_entry() or +->pud_entry(), so checking their existence in each loop is just wasting +CPU cycle. So let's remove it to reduce overhead. + +Signed-off-by: Naoya Horiguchi <n-horiguchi@ah.jp.nec.com> +Acked-by: Kirill A. Shutemov <kirill.shutemov@linux.intel.com> +Cc: Andrea Arcangeli <aarcange@redhat.com> +Cc: Cyrill Gorcunov <gorcunov@openvz.org> +Cc: Dave Hansen <dave.hansen@intel.com> +Cc: Kirill A. Shutemov <kirill.shutemov@linux.intel.com> +Cc: Pavel Emelyanov <xemul@parallels.com> +Cc: Benjamin Herrenschmidt <benh@kernel.crashing.org> +Signed-off-by: Andrew Morton <akpm@linux-foundation.org> +Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org> +[bwh: Backported to 3.16 as dependency of L1TF mitigation] +Signed-off-by: Ben Hutchings <ben@decadent.org.uk> +--- + include/linux/mm.h | 6 ------ + mm/pagewalk.c | 9 ++------- + 2 files changed, 2 insertions(+), 13 deletions(-) + +--- a/include/linux/mm.h ++++ b/include/linux/mm.h +@@ -1114,8 +1114,6 @@ void unmap_vmas(struct mmu_gather *tlb, + + /** + * mm_walk - callbacks for walk_page_range +- * @pgd_entry: if set, called for each non-empty PGD (top-level) entry +- * @pud_entry: if set, called for each non-empty PUD (2nd-level) entry + * @pmd_entry: if set, called for each non-empty PMD (3rd-level) entry + * this handler is required to be able to handle + * pmd_trans_huge() pmds. 
They may simply choose to +@@ -1129,10 +1127,6 @@ void unmap_vmas(struct mmu_gather *tlb, + * (see walk_page_range for more details) + */ + struct mm_walk { +- int (*pgd_entry)(pgd_t *pgd, unsigned long addr, +- unsigned long next, struct mm_walk *walk); +- int (*pud_entry)(pud_t *pud, unsigned long addr, +- unsigned long next, struct mm_walk *walk); + int (*pmd_entry)(pmd_t *pmd, unsigned long addr, + unsigned long next, struct mm_walk *walk); + int (*pte_entry)(pte_t *pte, unsigned long addr, +--- a/mm/pagewalk.c ++++ b/mm/pagewalk.c +@@ -86,9 +86,7 @@ static int walk_pud_range(pgd_t *pgd, un + break; + continue; + } +- if (walk->pud_entry) +- err = walk->pud_entry(pud, addr, next, walk); +- if (!err && (walk->pmd_entry || walk->pte_entry)) ++ if (walk->pmd_entry || walk->pte_entry) + err = walk_pmd_range(pud, addr, next, walk); + if (err) + break; +@@ -237,10 +235,7 @@ int walk_page_range(unsigned long addr, + pgd++; + continue; + } +- if (walk->pgd_entry) +- err = walk->pgd_entry(pgd, addr, next, walk); +- if (!err && +- (walk->pud_entry || walk->pmd_entry || walk->pte_entry)) ++ if (walk->pmd_entry || walk->pte_entry) + err = walk_pud_range(pgd, addr, next, walk); + if (err) + break; diff --git a/queue-3.16/mm-x86-move-_page_swp_soft_dirty-from-bit-7-to-bit-1.patch b/queue-3.16/mm-x86-move-_page_swp_soft_dirty-from-bit-7-to-bit-1.patch new file mode 100644 index 00000000..704eb1c8 --- /dev/null +++ b/queue-3.16/mm-x86-move-_page_swp_soft_dirty-from-bit-7-to-bit-1.patch @@ -0,0 +1,97 @@ +From: Naoya Horiguchi <n-horiguchi@ah.jp.nec.com> +Date: Fri, 8 Sep 2017 16:10:46 -0700 +Subject: mm: x86: move _PAGE_SWP_SOFT_DIRTY from bit 7 to bit 1 + +commit eee4818baac0f2b37848fdf90e4b16430dc536ac upstream. + +_PAGE_PSE is used to distinguish between a truly non-present +(_PAGE_PRESENT=0) PMD, and a PMD which is undergoing a THP split and +should be treated as present. + +But _PAGE_SWP_SOFT_DIRTY currently uses the _PAGE_PSE bit, which would +cause confusion between one of those PMDs undergoing a THP split, and a +soft-dirty PMD. Dropping _PAGE_PSE check in pmd_present() does not work +well, because it can hurt optimization of tlb handling in thp split. + +Thus, we need to move the bit. + +In the current kernel, bits 1-4 are not used in non-present format since +commit 00839ee3b299 ("x86/mm: Move swap offset/type up in PTE to work +around erratum"). So let's move _PAGE_SWP_SOFT_DIRTY to bit 1. Bit 7 +is used as reserved (always clear), so please don't use it for other +purpose. + +Link: http://lkml.kernel.org/r/20170717193955.20207-3-zi.yan@sent.com +Signed-off-by: Naoya Horiguchi <n-horiguchi@ah.jp.nec.com> +Signed-off-by: Zi Yan <zi.yan@cs.rutgers.edu> +Acked-by: Dave Hansen <dave.hansen@intel.com> +Cc: "H. Peter Anvin" <hpa@zytor.com> +Cc: Anshuman Khandual <khandual@linux.vnet.ibm.com> +Cc: David Nellans <dnellans@nvidia.com> +Cc: Ingo Molnar <mingo@elte.hu> +Cc: Kirill A. 
Shutemov <kirill.shutemov@linux.intel.com> +Cc: Mel Gorman <mgorman@techsingularity.net> +Cc: Minchan Kim <minchan@kernel.org> +Cc: Thomas Gleixner <tglx@linutronix.de> +Cc: Vlastimil Babka <vbabka@suse.cz> +Cc: Andrea Arcangeli <aarcange@redhat.com> +Cc: Michal Hocko <mhocko@kernel.org> +Signed-off-by: Andrew Morton <akpm@linux-foundation.org> +Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org> +[bwh: Backported to 3.16: Bit 9 may be reserved for PAGE_BIT_NUMA here] +Signed-off-by: Ben Hutchings <ben@decadent.org.uk> +--- + arch/x86/include/asm/pgtable_64.h | 12 +++++++++--- + arch/x86/include/asm/pgtable_types.h | 10 +++++----- + 2 files changed, 14 insertions(+), 8 deletions(-) + +--- a/arch/x86/include/asm/pgtable_64.h ++++ b/arch/x86/include/asm/pgtable_64.h +@@ -165,15 +165,21 @@ static inline int pgd_large(pgd_t pgd) { + /* + * Encode and de-code a swap entry + * +- * | ... | 11| 10| 9|8|7|6|5| 4| 3|2|1|0| <- bit number +- * | ... |SW3|SW2|SW1|G|L|D|A|CD|WT|U|W|P| <- bit names +- * | OFFSET (15->63) | TYPE (10-14) | 0 |0|X|X|X| X| X|X|X|0| <- swp entry ++ * | ... | 11| 10| 9|8|7|6|5| 4| 3|2| 1|0| <- bit number ++ * | ... |SW3|SW2|SW1|G|L|D|A|CD|WT|U| W|P| <- bit names ++ * | OFFSET (15->63) | TYPE (10-14) | 0 |0|0|X|X| X| X|X|SD|0| <- swp entry + * + * G (8) is aliased and used as a PROT_NONE indicator for + * !present ptes. We need to start storing swap entries above + * there. We also need to avoid using A and D because of an + * erratum where they can be incorrectly set by hardware on + * non-present PTEs. ++ * ++ * SD (1) in swp entry is used to store soft dirty bit, which helps us ++ * remember soft dirty over page migration ++ * ++ * Bit 7 in swp entry should be 0 because pmd_present checks not only P, ++ * but also L and G. + */ + #ifdef CONFIG_NUMA_BALANCING + /* Automatic NUMA balancing needs to be distinguishable from swap entries */ +--- a/arch/x86/include/asm/pgtable_types.h ++++ b/arch/x86/include/asm/pgtable_types.h +@@ -94,15 +94,15 @@ + /* + * Tracking soft dirty bit when a page goes to a swap is tricky. + * We need a bit which can be stored in pte _and_ not conflict +- * with swap entry format. On x86 bits 6 and 7 are *not* involved +- * into swap entry computation, but bit 6 is used for nonlinear +- * file mapping, so we borrow bit 7 for soft dirty tracking. ++ * with swap entry format. On x86 bits 1-4 are *not* involved ++ * into swap entry computation, but bit 7 is used for thp migration, ++ * so we borrow bit 1 for soft dirty tracking. + * + * Please note that this bit must be treated as swap dirty page +- * mark if and only if the PTE has present bit clear! ++ * mark if and only if the PTE/PMD has present bit clear! + */ + #ifdef CONFIG_MEM_SOFT_DIRTY +-#define _PAGE_SWP_SOFT_DIRTY _PAGE_PSE ++#define _PAGE_SWP_SOFT_DIRTY _PAGE_RW + #else + #define _PAGE_SWP_SOFT_DIRTY (_AT(pteval_t, 0)) + #endif diff --git a/queue-3.16/pagewalk-improve-vma-handling.patch b/queue-3.16/pagewalk-improve-vma-handling.patch new file mode 100644 index 00000000..79f7d2f9 --- /dev/null +++ b/queue-3.16/pagewalk-improve-vma-handling.patch @@ -0,0 +1,341 @@ +From: Naoya Horiguchi <n-horiguchi@ah.jp.nec.com> +Date: Wed, 11 Feb 2015 15:27:37 -0800 +Subject: pagewalk: improve vma handling + +commit fafaa4264eba49fd10695c193a82760558d093f4 upstream. + +Current implementation of page table walker has a fundamental problem in +vma handling, which started when we tried to handle vma(VM_HUGETLB). 
+Because it's done in pgd loop, considering vma boundary makes code +complicated and bug-prone. + +From the users viewpoint, some user checks some vma-related condition to +determine whether the user really does page walk over the vma. + +In order to solve these, this patch moves vma check outside pgd loop and +introduce a new callback ->test_walk(). + +Signed-off-by: Naoya Horiguchi <n-horiguchi@ah.jp.nec.com> +Acked-by: Kirill A. Shutemov <kirill.shutemov@linux.intel.com> +Cc: "Kirill A. Shutemov" <kirill.shutemov@linux.intel.com> +Cc: Andrea Arcangeli <aarcange@redhat.com> +Cc: Cyrill Gorcunov <gorcunov@openvz.org> +Cc: Dave Hansen <dave.hansen@intel.com> +Cc: Pavel Emelyanov <xemul@parallels.com> +Cc: Benjamin Herrenschmidt <benh@kernel.crashing.org> +Signed-off-by: Andrew Morton <akpm@linux-foundation.org> +Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org> +[bwh: Backported to 3.16 as dependency of L1TF mitigation] +Signed-off-by: Ben Hutchings <ben@decadent.org.uk> +--- + include/linux/mm.h | 15 +++- + mm/pagewalk.c | 206 +++++++++++++++++++++++++-------------------- + 2 files changed, 129 insertions(+), 92 deletions(-) + +--- a/include/linux/mm.h ++++ b/include/linux/mm.h +@@ -1121,10 +1121,16 @@ void unmap_vmas(struct mmu_gather *tlb, + * @pte_entry: if set, called for each non-empty PTE (4th-level) entry + * @pte_hole: if set, called for each hole at all levels + * @hugetlb_entry: if set, called for each hugetlb entry +- * *Caution*: The caller must hold mmap_sem() if @hugetlb_entry +- * is used. ++ * @test_walk: caller specific callback function to determine whether ++ * we walk over the current vma or not. A positive returned ++ * value means "do page table walk over the current vma," ++ * and a negative one means "abort current page table walk ++ * right now." 0 means "skip the current vma." 
++ * @mm: mm_struct representing the target process of page table walk ++ * @vma: vma currently walked (NULL if walking outside vmas) ++ * @private: private data for callbacks' usage + * +- * (see walk_page_range for more details) ++ * (see the comment on walk_page_range() for more details) + */ + struct mm_walk { + int (*pmd_entry)(pmd_t *pmd, unsigned long addr, +@@ -1136,7 +1142,10 @@ struct mm_walk { + int (*hugetlb_entry)(pte_t *pte, unsigned long hmask, + unsigned long addr, unsigned long next, + struct mm_walk *walk); ++ int (*test_walk)(unsigned long addr, unsigned long next, ++ struct mm_walk *walk); + struct mm_struct *mm; ++ struct vm_area_struct *vma; + void *private; + }; + +--- a/mm/pagewalk.c ++++ b/mm/pagewalk.c +@@ -59,7 +59,7 @@ again: + continue; + + split_huge_page_pmd_mm(walk->mm, addr, pmd); +- if (pmd_none_or_trans_huge_or_clear_bad(pmd)) ++ if (pmd_trans_unstable(pmd)) + goto again; + err = walk_pte_range(pmd, addr, next, walk); + if (err) +@@ -95,6 +95,32 @@ static int walk_pud_range(pgd_t *pgd, un + return err; + } + ++static int walk_pgd_range(unsigned long addr, unsigned long end, ++ struct mm_walk *walk) ++{ ++ pgd_t *pgd; ++ unsigned long next; ++ int err = 0; ++ ++ pgd = pgd_offset(walk->mm, addr); ++ do { ++ next = pgd_addr_end(addr, end); ++ if (pgd_none_or_clear_bad(pgd)) { ++ if (walk->pte_hole) ++ err = walk->pte_hole(addr, next, walk); ++ if (err) ++ break; ++ continue; ++ } ++ if (walk->pmd_entry || walk->pte_entry) ++ err = walk_pud_range(pgd, addr, next, walk); ++ if (err) ++ break; ++ } while (pgd++, addr = next, addr != end); ++ ++ return err; ++} ++ + #ifdef CONFIG_HUGETLB_PAGE + static unsigned long hugetlb_entry_end(struct hstate *h, unsigned long addr, + unsigned long end) +@@ -103,10 +129,10 @@ static unsigned long hugetlb_entry_end(s + return boundary < end ? boundary : end; + } + +-static int walk_hugetlb_range(struct vm_area_struct *vma, +- unsigned long addr, unsigned long end, ++static int walk_hugetlb_range(unsigned long addr, unsigned long end, + struct mm_walk *walk) + { ++ struct vm_area_struct *vma = walk->vma; + struct hstate *h = hstate_vma(vma); + unsigned long next; + unsigned long hmask = huge_page_mask(h); +@@ -119,15 +145,14 @@ static int walk_hugetlb_range(struct vm_ + if (pte && walk->hugetlb_entry) + err = walk->hugetlb_entry(pte, hmask, addr, next, walk); + if (err) +- return err; ++ break; + } while (addr = next, addr != end); + +- return 0; ++ return err; + } + + #else /* CONFIG_HUGETLB_PAGE */ +-static int walk_hugetlb_range(struct vm_area_struct *vma, +- unsigned long addr, unsigned long end, ++static int walk_hugetlb_range(unsigned long addr, unsigned long end, + struct mm_walk *walk) + { + return 0; +@@ -135,112 +160,115 @@ static int walk_hugetlb_range(struct vm_ + + #endif /* CONFIG_HUGETLB_PAGE */ + ++/* ++ * Decide whether we really walk over the current vma on [@start, @end) ++ * or skip it via the returned value. Return 0 if we do walk over the ++ * current vma, and return 1 if we skip the vma. Negative values means ++ * error, where we abort the current walk. ++ * ++ * Default check (only VM_PFNMAP check for now) is used when the caller ++ * doesn't define test_walk() callback. ++ */ ++static int walk_page_test(unsigned long start, unsigned long end, ++ struct mm_walk *walk) ++{ ++ struct vm_area_struct *vma = walk->vma; + ++ if (walk->test_walk) ++ return walk->test_walk(start, end, walk); ++ ++ /* ++ * Do not walk over vma(VM_PFNMAP), because we have no valid struct ++ * page backing a VM_PFNMAP range. 
See also commit a9ff785e4437. ++ */ ++ if (vma->vm_flags & VM_PFNMAP) ++ return 1; ++ return 0; ++} ++ ++static int __walk_page_range(unsigned long start, unsigned long end, ++ struct mm_walk *walk) ++{ ++ int err = 0; ++ struct vm_area_struct *vma = walk->vma; ++ ++ if (vma && is_vm_hugetlb_page(vma)) { ++ if (walk->hugetlb_entry) ++ err = walk_hugetlb_range(start, end, walk); ++ } else ++ err = walk_pgd_range(start, end, walk); ++ ++ return err; ++} + + /** +- * walk_page_range - walk a memory map's page tables with a callback +- * @addr: starting address +- * @end: ending address +- * @walk: set of callbacks to invoke for each level of the tree ++ * walk_page_range - walk page table with caller specific callbacks + * +- * Recursively walk the page table for the memory area in a VMA, +- * calling supplied callbacks. Callbacks are called in-order (first +- * PGD, first PUD, first PMD, first PTE, second PTE... second PMD, +- * etc.). If lower-level callbacks are omitted, walking depth is reduced. ++ * Recursively walk the page table tree of the process represented by @walk->mm ++ * within the virtual address range [@start, @end). During walking, we can do ++ * some caller-specific works for each entry, by setting up pmd_entry(), ++ * pte_entry(), and/or hugetlb_entry(). If you don't set up for some of these ++ * callbacks, the associated entries/pages are just ignored. ++ * The return values of these callbacks are commonly defined like below: ++ * - 0 : succeeded to handle the current entry, and if you don't reach the ++ * end address yet, continue to walk. ++ * - >0 : succeeded to handle the current entry, and return to the caller ++ * with caller specific value. ++ * - <0 : failed to handle the current entry, and return to the caller ++ * with error code. + * +- * Each callback receives an entry pointer and the start and end of the +- * associated range, and a copy of the original mm_walk for access to +- * the ->private or ->mm fields. ++ * Before starting to walk page table, some callers want to check whether ++ * they really want to walk over the current vma, typically by checking ++ * its vm_flags. walk_page_test() and @walk->test_walk() are used for this ++ * purpose. + * +- * Usually no locks are taken, but splitting transparent huge page may +- * take page table lock. And the bottom level iterator will map PTE +- * directories from highmem if necessary. ++ * struct mm_walk keeps current values of some common data like vma and pmd, ++ * which are useful for the access from callbacks. If you want to pass some ++ * caller-specific data to callbacks, @walk->private should be helpful. + * +- * If any callback returns a non-zero value, the walk is aborted and +- * the return value is propagated back to the caller. Otherwise 0 is returned. +- * +- * walk->mm->mmap_sem must be held for at least read if walk->hugetlb_entry +- * is !NULL. ++ * Locking: ++ * Callers of walk_page_range() and walk_page_vma() should hold ++ * @walk->mm->mmap_sem, because these function traverse vma list and/or ++ * access to vma's data. 
+ */ +-int walk_page_range(unsigned long addr, unsigned long end, ++int walk_page_range(unsigned long start, unsigned long end, + struct mm_walk *walk) + { +- pgd_t *pgd; +- unsigned long next; + int err = 0; ++ unsigned long next; ++ struct vm_area_struct *vma; + +- if (addr >= end) +- return err; ++ if (start >= end) ++ return -EINVAL; + + if (!walk->mm) + return -EINVAL; + + VM_BUG_ON(!rwsem_is_locked(&walk->mm->mmap_sem)); + +- pgd = pgd_offset(walk->mm, addr); ++ vma = find_vma(walk->mm, start); + do { +- struct vm_area_struct *vma = NULL; +- +- next = pgd_addr_end(addr, end); ++ if (!vma) { /* after the last vma */ ++ walk->vma = NULL; ++ next = end; ++ } else if (start < vma->vm_start) { /* outside vma */ ++ walk->vma = NULL; ++ next = min(end, vma->vm_start); ++ } else { /* inside vma */ ++ walk->vma = vma; ++ next = min(end, vma->vm_end); ++ vma = vma->vm_next; + +- /* +- * This function was not intended to be vma based. +- * But there are vma special cases to be handled: +- * - hugetlb vma's +- * - VM_PFNMAP vma's +- */ +- vma = find_vma(walk->mm, addr); +- if (vma) { +- /* +- * There are no page structures backing a VM_PFNMAP +- * range, so do not allow split_huge_page_pmd(). +- */ +- if ((vma->vm_start <= addr) && +- (vma->vm_flags & VM_PFNMAP)) { +- if (walk->pte_hole) +- err = walk->pte_hole(addr, next, walk); +- if (err) +- break; +- pgd = pgd_offset(walk->mm, next); +- continue; +- } +- /* +- * Handle hugetlb vma individually because pagetable +- * walk for the hugetlb page is dependent on the +- * architecture and we can't handled it in the same +- * manner as non-huge pages. +- */ +- if (walk->hugetlb_entry && (vma->vm_start <= addr) && +- is_vm_hugetlb_page(vma)) { +- if (vma->vm_end < next) +- next = vma->vm_end; +- /* +- * Hugepage is very tightly coupled with vma, +- * so walk through hugetlb entries within a +- * given vma. 
+- */ +- err = walk_hugetlb_range(vma, addr, next, walk); +- if (err) +- break; +- pgd = pgd_offset(walk->mm, next); ++ err = walk_page_test(start, next, walk); ++ if (err > 0) + continue; +- } +- } +- +- if (pgd_none_or_clear_bad(pgd)) { +- if (walk->pte_hole) +- err = walk->pte_hole(addr, next, walk); +- if (err) ++ if (err < 0) + break; +- pgd++; +- continue; + } +- if (walk->pmd_entry || walk->pte_entry) +- err = walk_pud_range(pgd, addr, next, walk); ++ if (walk->vma || walk->pte_hole) ++ err = __walk_page_range(start, next, walk); + if (err) + break; +- pgd++; +- } while (addr = next, addr < end); +- ++ } while (start = next, start < end); + return err; + } diff --git a/queue-3.16/series b/queue-3.16/series index 5de06fd9..2bad97d0 100644 --- a/queue-3.16/series +++ b/queue-3.16/series @@ -88,3 +88,34 @@ unicore32-drop-pte_file-related-helpers.patch x86-drop-_page_file-and-pte_file-related-helpers.patch xtensa-drop-_page_file-and-pte_file-related-helpers.patch powerpc-drop-_page_file-and-pte_file-related-helpers.patch +x86-speculation-l1tf-increase-32bit-pae-__physical_page_shift.patch +x86-mm-move-swap-offset-type-up-in-pte-to-work-around-erratum.patch +mm-x86-move-_page_swp_soft_dirty-from-bit-7-to-bit-1.patch +x86-speculation-l1tf-change-order-of-offset-type-in-swap-entry.patch +x86-speculation-l1tf-protect-swap-entries-against-l1tf.patch +x86-mm-add-pud-functions.patch +x86-speculation-l1tf-protect-prot_none-ptes-against-speculation.patch +x86-speculation-l1tf-make-sure-the-first-page-is-always-reserved.patch +x86-speculation-l1tf-add-sysfs-reporting-for-l1tf.patch +mm-add-vm_insert_pfn_prot.patch +mm-fix-cache-mode-tracking-in-vm_insert_mixed.patch +x86-io-add-interface-to-reserve-io-memtype-for-a-resource-range.patch +drm-drivers-add-support-for-using-the-arch-wc-mapping-api.patch +mm-pagewalk-remove-pgd_entry-and-pud_entry.patch +pagewalk-improve-vma-handling.patch +x86-speculation-l1tf-disallow-non-privileged-high-mmio-prot_none.patch +x86-speculation-l1tf-limit-swap-file-size-to-max_pa-2.patch +x86-init-fix-build-with-config_swap-n.patch +x86-bugs-move-the-l1tf-function-and-define-pr_fmt-properly.patch +x86-speculation-l1tf-extend-64bit-swap-file-size-limit.patch +x86-speculation-l1tf-protect-pae-swap-entries-against-l1tf.patch +x86-speculation-l1tf-fix-overflow-in-l1tf_pfn_limit-on-32bit.patch +x86-speculation-l1tf-fix-off-by-one-error-when-warning-that-system.patch +x86-speculation-l1tf-fix-up-pte-pfn-conversion-for-pae.patch +x86-speculation-l1tf-unbreak-__have_arch_pfn_modify_allowed.patch +x86-speculation-l1tf-invert-all-not-present-mappings.patch +x86-speculation-l1tf-exempt-zeroed-ptes-from-inversion.patch +x86-speculation-l1tf-make-pmd-pud_mknotpresent-invert.patch +x86-mm-pat-make-set_memory_np-l1tf-safe.patch +x86-mm-kmmio-make-the-tracer-robust-against-l1tf.patch +x86-speculation-l1tf-suggest-what-to-do-on-systems-with-too-much-ram.patch diff --git a/queue-3.16/x86-bugs-move-the-l1tf-function-and-define-pr_fmt-properly.patch b/queue-3.16/x86-bugs-move-the-l1tf-function-and-define-pr_fmt-properly.patch new file mode 100644 index 00000000..e8858c70 --- /dev/null +++ b/queue-3.16/x86-bugs-move-the-l1tf-function-and-define-pr_fmt-properly.patch @@ -0,0 +1,93 @@ +From: Konrad Rzeszutek Wilk <konrad.wilk@oracle.com> +Date: Wed, 20 Jun 2018 16:42:57 -0400 +Subject: x86/bugs: Move the l1tf function and define pr_fmt properly + +commit 56563f53d3066afa9e63d6c997bf67e76a8b05c0 upstream. 
+ +The pr_warn in l1tf_select_mitigation would have used the prior pr_fmt +which was defined as "Spectre V2 : ". + +Move the function to be past SSBD and also define the pr_fmt. + +Fixes: 17dbca119312 ("x86/speculation/l1tf: Add sysfs reporting for l1tf") +Signed-off-by: Konrad Rzeszutek Wilk <konrad.wilk@oracle.com> +Signed-off-by: Thomas Gleixner <tglx@linutronix.de> +Signed-off-by: David Woodhouse <dwmw@amazon.co.uk> +Signed-off-by: Guenter Roeck <linux@roeck-us.net> +Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org> +Signed-off-by: Ben Hutchings <ben@decadent.org.uk> +--- + arch/x86/kernel/cpu/bugs.c | 55 ++++++++++++++++++++------------------ + 1 file changed, 29 insertions(+), 26 deletions(-) + +--- a/arch/x86/kernel/cpu/bugs.c ++++ b/arch/x86/kernel/cpu/bugs.c +@@ -270,32 +270,6 @@ static void x86_amd_ssb_disable(void) + wrmsrl(MSR_AMD64_LS_CFG, msrval); + } + +-static void __init l1tf_select_mitigation(void) +-{ +- u64 half_pa; +- +- if (!boot_cpu_has_bug(X86_BUG_L1TF)) +- return; +- +-#if defined(CONFIG_X86_32) && !defined(CONFIG_X86_PAE) +- pr_warn("Kernel not compiled for PAE. No mitigation for L1TF\n"); +- return; +-#endif +- +- /* +- * This is extremely unlikely to happen because almost all +- * systems have far more MAX_PA/2 than RAM can be fit into +- * DIMM slots. +- */ +- half_pa = (u64)l1tf_pfn_limit() << PAGE_SHIFT; +- if (e820_any_mapped(half_pa, ULLONG_MAX - half_pa, E820_RAM)) { +- pr_warn("System has more than MAX_PA/2 memory. L1TF mitigation not effective.\n"); +- return; +- } +- +- setup_force_cpu_cap(X86_FEATURE_L1TF_PTEINV); +-} +- + #ifdef RETPOLINE + static bool spectre_v2_bad_module; + +@@ -721,6 +695,35 @@ void x86_spec_ctrl_setup_ap(void) + x86_amd_ssb_disable(); + } + ++#undef pr_fmt ++#define pr_fmt(fmt) "L1TF: " fmt ++static void __init l1tf_select_mitigation(void) ++{ ++ u64 half_pa; ++ ++ if (!boot_cpu_has_bug(X86_BUG_L1TF)) ++ return; ++ ++#if defined(CONFIG_X86_32) && !defined(CONFIG_X86_PAE) ++ pr_warn("Kernel not compiled for PAE. No mitigation for L1TF\n"); ++ return; ++#endif ++ ++ /* ++ * This is extremely unlikely to happen because almost all ++ * systems have far more MAX_PA/2 than RAM can be fit into ++ * DIMM slots. ++ */ ++ half_pa = (u64)l1tf_pfn_limit() << PAGE_SHIFT; ++ if (e820_any_mapped(half_pa, ULLONG_MAX - half_pa, E820_RAM)) { ++ pr_warn("System has more than MAX_PA/2 memory. L1TF mitigation not effective.\n"); ++ return; ++ } ++ ++ setup_force_cpu_cap(X86_FEATURE_L1TF_PTEINV); ++} ++#undef pr_fmt ++ + #ifdef CONFIG_SYSFS + + static ssize_t cpu_show_common(struct device *dev, struct device_attribute *attr, diff --git a/queue-3.16/x86-init-fix-build-with-config_swap-n.patch b/queue-3.16/x86-init-fix-build-with-config_swap-n.patch new file mode 100644 index 00000000..8e886556 --- /dev/null +++ b/queue-3.16/x86-init-fix-build-with-config_swap-n.patch @@ -0,0 +1,35 @@ +From: Vlastimil Babka <vbabka@suse.cz> +Date: Tue, 14 Aug 2018 20:50:47 +0200 +Subject: x86/init: fix build with CONFIG_SWAP=n + +commit 792adb90fa724ce07c0171cbc96b9215af4b1045 upstream. + +The introduction of generic_max_swapfile_size and arch-specific versions has +broken linking on x86 with CONFIG_SWAP=n due to undefined reference to +'generic_max_swapfile_size'. Fix it by compiling the x86-specific +max_swapfile_size() only with CONFIG_SWAP=y. 
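A hedged sketch of the breakage pattern fixed here (signatures simplified): mm/swapfile.c, which is built only when CONFIG_SWAP=y, provides generic_max_swapfile_size() plus a __weak max_swapfile_size() default, and the L1TF series added an unguarded override in always-built x86 code that references the generic symbol. The fix makes the override conditional:

    /* arch/x86/mm/init.c after the fix: the override exists only
     * when the generic symbol it calls is compiled in */
    #ifdef CONFIG_SWAP
    unsigned long max_swapfile_size(void)
    {
            unsigned long pages = generic_max_swapfile_size();

            /* L1TF clamping of 'pages', as added by the earlier patch */
            return pages;
    }
    #endif

With CONFIG_SWAP=n the weak default is absent as well, but so are all callers, so nothing references the missing function and the link succeeds.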
+ +Reported-by: Tomas Pruzina <pruzinat@gmail.com> +Fixes: 377eeaa8e11f ("x86/speculation/l1tf: Limit swap file size to MAX_PA/2") +Signed-off-by: Vlastimil Babka <vbabka@suse.cz> +Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org> +Signed-off-by: Ben Hutchings <ben@decadent.org.uk> +--- + arch/x86/mm/init.c | 2 ++ + 1 file changed, 2 insertions(+) + +--- a/arch/x86/mm/init.c ++++ b/arch/x86/mm/init.c +@@ -701,6 +701,7 @@ void __init zone_sizes_init(void) + free_area_init_nodes(max_zone_pfns); + } + ++#ifdef CONFIG_SWAP + unsigned long max_swapfile_size(void) + { + unsigned long pages; +@@ -713,3 +714,4 @@ unsigned long max_swapfile_size(void) + } + return pages; + } ++#endif diff --git a/queue-3.16/x86-io-add-interface-to-reserve-io-memtype-for-a-resource-range.patch b/queue-3.16/x86-io-add-interface-to-reserve-io-memtype-for-a-resource-range.patch new file mode 100644 index 00000000..50109452 --- /dev/null +++ b/queue-3.16/x86-io-add-interface-to-reserve-io-memtype-for-a-resource-range.patch @@ -0,0 +1,118 @@ +From: Dave Airlie <airlied@redhat.com> +Date: Mon, 24 Oct 2016 15:27:59 +1000 +Subject: x86/io: add interface to reserve io memtype for a resource range. + (v1.1) + +commit 8ef4227615e158faa4ee85a1d6466782f7e22f2f upstream. + +A recent change to the mm code in: +87744ab3832b mm: fix cache mode tracking in vm_insert_mixed() + +started enforcing checking the memory type against the registered list for +amixed pfn insertion mappings. It happens that the drm drivers for a number +of gpus relied on this being broken. Currently the driver only inserted +VRAM mappings into the tracking table when they came from the kernel, +and userspace mappings never landed in the table. This led to a regression +where all the mapping end up as UC instead of WC now. + +I've considered a number of solutions but since this needs to be fixed +in fixes and not next, and some of the solutions were going to introduce +overhead that hadn't been there before I didn't consider them viable at +this stage. These mainly concerned hooking into the TTM io reserve APIs, +but these API have a bunch of fast paths I didn't want to unwind to add +this to. + +The solution I've decided on is to add a new API like the arch_phys_wc +APIs (these would have worked but wc_del didn't take a range), and +use them from the drivers to add a WC compatible mapping to the table +for all VRAM on those GPUs. This means we can then create userspace +mapping that won't get degraded to UC. + +v1.1: use CONFIG_X86_PAT + add some comments in io.h + +Cc: Toshi Kani <toshi.kani@hp.com> +Cc: Borislav Petkov <bp@alien8.de> +Cc: H. Peter Anvin <hpa@zytor.com> +Cc: Andy Lutomirski <luto@kernel.org> +Cc: Denys Vlasenko <dvlasenk@redhat.com> +Cc: Brian Gerst <brgerst@gmail.com> +Cc: x86@kernel.org +Cc: mcgrof@suse.com +Cc: Dan Williams <dan.j.williams@intel.com> +Acked-by: Ingo Molnar <mingo@kernel.org> +Reviewed-by: Thomas Gleixner <tglx@linutronix.de> +Signed-off-by: Dave Airlie <airlied@redhat.com> +[bwh: Backported to 3.16: Memory types have type unsigned long, and the + constant is named _PAGE_CACHE_WC instead of _PAGE_CACHE_MODE_WC.] 
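As a hedged sketch of the intended pairing (this mirrors the DRM patch at the top of this commit, with hypothetical my_* wrappers around an assumed VRAM BAR):

    #include <linux/io.h>
    #include <linux/pci.h>

    static int my_vram_init(struct pci_dev *pdev, int bar, int *mtrr)
    {
            resource_size_t start = pci_resource_start(pdev, bar);
            resource_size_t len = pci_resource_len(pdev, bar);

            /* record WC in the PAT memtype tree, then add the MTRR */
            arch_io_reserve_memtype_wc(start, len);
            *mtrr = arch_phys_wc_add(start, len);
            return 0;
    }

    static void my_vram_fini(struct pci_dev *pdev, int bar, int mtrr)
    {
            /* release in reverse order */
            arch_phys_wc_del(mtrr);
            arch_io_free_memtype_wc(pci_resource_start(pdev, bar),
                                    pci_resource_len(pdev, bar));
    }

On non-PAT configurations the include/linux/io.h fallbacks below turn both new calls into no-ops, so drivers can use them unconditionally.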
+Signed-off-by: Ben Hutchings <ben@decadent.org.uk> +--- + arch/x86/include/asm/io.h | 6 ++++++ + arch/x86/mm/pat.c | 14 ++++++++++++++ + include/linux/io.h | 22 ++++++++++++++++++++++ + 3 files changed, 42 insertions(+) + +--- a/arch/x86/include/asm/io.h ++++ b/arch/x86/include/asm/io.h +@@ -340,4 +340,10 @@ extern void arch_phys_wc_del(int handle) + #define arch_phys_wc_add arch_phys_wc_add + #endif + ++#ifdef CONFIG_X86_PAT ++extern int arch_io_reserve_memtype_wc(resource_size_t start, resource_size_t size); ++extern void arch_io_free_memtype_wc(resource_size_t start, resource_size_t size); ++#define arch_io_reserve_memtype_wc arch_io_reserve_memtype_wc ++#endif ++ + #endif /* _ASM_X86_IO_H */ +--- a/arch/x86/mm/pat.c ++++ b/arch/x86/mm/pat.c +@@ -481,6 +481,20 @@ void io_free_memtype(resource_size_t sta + free_memtype(start, end); + } + ++int arch_io_reserve_memtype_wc(resource_size_t start, resource_size_t size) ++{ ++ unsigned long type = _PAGE_CACHE_WC; ++ ++ return io_reserve_memtype(start, start + size, &type); ++} ++EXPORT_SYMBOL(arch_io_reserve_memtype_wc); ++ ++void arch_io_free_memtype_wc(resource_size_t start, resource_size_t size) ++{ ++ io_free_memtype(start, start + size); ++} ++EXPORT_SYMBOL(arch_io_free_memtype_wc); ++ + pgprot_t phys_mem_access_prot(struct file *file, unsigned long pfn, + unsigned long size, pgprot_t vma_prot) + { +--- a/include/linux/io.h ++++ b/include/linux/io.h +@@ -101,4 +101,26 @@ static inline void arch_phys_wc_del(int + #define arch_phys_wc_add arch_phys_wc_add + #endif + ++/* ++ * On x86 PAT systems we have memory tracking that keeps track of ++ * the allowed mappings on memory ranges. This tracking works for ++ * all the in-kernel mapping APIs (ioremap*), but where the user ++ * wishes to map a range from a physical device into user memory ++ * the tracking won't be updated. This API is to be used by ++ * drivers which remap physical device pages into userspace, ++ * and wants to make sure they are mapped WC and not UC. ++ */ ++#ifndef arch_io_reserve_memtype_wc ++static inline int arch_io_reserve_memtype_wc(resource_size_t base, ++ resource_size_t size) ++{ ++ return 0; ++} ++ ++static inline void arch_io_free_memtype_wc(resource_size_t base, ++ resource_size_t size) ++{ ++} ++#endif ++ + #endif /* _LINUX_IO_H */ diff --git a/queue-3.16/x86-mm-add-pud-functions.patch b/queue-3.16/x86-mm-add-pud-functions.patch new file mode 100644 index 00000000..c985d260 --- /dev/null +++ b/queue-3.16/x86-mm-add-pud-functions.patch @@ -0,0 +1,51 @@ +From: Ben Hutchings <ben@decadent.org.uk> +Date: Fri, 28 Sep 2018 01:15:29 +0100 +Subject: x86: mm: Add PUD functions + +These are extracted from commit a00cc7d9dd93 "mm, x86: add support for +PUD-sized transparent hugepages" and will be used by later patches. 
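A hedged pointer to where these helpers land: the set_memory_np() fix later in this commit is their first user, building a 1G mapping entry along the lines of:

    set_pud(pud, pud_mkhuge(pfn_pud(pfn, canon_pgprot(pgprot))));

Going through pfn_pud()/pud_mkhuge() rather than an open-coded pudval gives the later L1TF inversion patches a single place to hook the not-present handling.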
+ +Signed-off-by: Ben Hutchings <ben@decadent.org.uk> +--- + +--- a/arch/x86/include/asm/pgtable.h ++++ b/arch/x86/include/asm/pgtable.h +@@ -303,6 +303,25 @@ static inline pmd_t pmd_mknotpresent(pmd + return pmd_clear_flags(pmd, _PAGE_PRESENT); + } + ++static inline pud_t pud_set_flags(pud_t pud, pudval_t set) ++{ ++ pudval_t v = native_pud_val(pud); ++ ++ return __pud(v | set); ++} ++ ++static inline pud_t pud_clear_flags(pud_t pud, pudval_t clear) ++{ ++ pudval_t v = native_pud_val(pud); ++ ++ return __pud(v & ~clear); ++} ++ ++static inline pud_t pud_mkhuge(pud_t pud) ++{ ++ return pud_set_flags(pud, _PAGE_PSE); ++} ++ + #ifdef CONFIG_HAVE_ARCH_SOFT_DIRTY + static inline int pte_soft_dirty(pte_t pte) + { +@@ -352,6 +371,12 @@ static inline pmd_t pfn_pmd(unsigned lon + massage_pgprot(pgprot)); + } + ++static inline pud_t pfn_pud(unsigned long page_nr, pgprot_t pgprot) ++{ ++ return __pud(((phys_addr_t)page_nr << PAGE_SHIFT) | ++ massage_pgprot(pgprot)); ++} ++ + static inline pte_t pte_modify(pte_t pte, pgprot_t newprot) + { + pteval_t val = pte_val(pte); diff --git a/queue-3.16/x86-mm-kmmio-make-the-tracer-robust-against-l1tf.patch b/queue-3.16/x86-mm-kmmio-make-the-tracer-robust-against-l1tf.patch new file mode 100644 index 00000000..923a8667 --- /dev/null +++ b/queue-3.16/x86-mm-kmmio-make-the-tracer-robust-against-l1tf.patch @@ -0,0 +1,66 @@ +From: Andi Kleen <ak@linux.intel.com> +Date: Tue, 7 Aug 2018 15:09:38 -0700 +Subject: x86/mm/kmmio: Make the tracer robust against L1TF + +commit 1063711b57393c1999248cccb57bebfaf16739e7 upstream. + +The mmio tracer sets io mapping PTEs and PMDs to non present when enabled +without inverting the address bits, which makes the PTE entry vulnerable +for L1TF. + +Make it use the right low level macros to actually invert the address bits +to protect against L1TF. + +In principle this could be avoided because MMIO tracing is not likely to be +enabled on production machines, but the fix is straigt forward and for +consistency sake it's better to get rid of the open coded PTE manipulation. 
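Read against the inversion sketch near the top of this commit, the hunks below show the difference plainly: the old code toggled only the Present bit, leaving a real physical address in a non-present entry, while the new code goes through the protected helper:

    v &= ~_PAGE_PRESENT;                  /* old: address bits survive */

    new_pmd = pmd_mknotpresent(*pmd);     /* new: address bits inverted */

For the PTE side, the new code simply zeroes the entry with pte_clear() and restores the saved value when the page is re-armed; zeroed entries are exempt from inversion and carry no address to leak.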
+ +Signed-off-by: Andi Kleen <ak@linux.intel.com> +Signed-off-by: Thomas Gleixner <tglx@linutronix.de> +Signed-off-by: Ben Hutchings <ben@decadent.org.uk> +--- + arch/x86/mm/kmmio.c | 25 +++++++++++++++---------- + 1 file changed, 15 insertions(+), 10 deletions(-) + +--- a/arch/x86/mm/kmmio.c ++++ b/arch/x86/mm/kmmio.c +@@ -114,24 +114,29 @@ static struct kmmio_fault_page *get_kmmi + + static void clear_pmd_presence(pmd_t *pmd, bool clear, pmdval_t *old) + { ++ pmd_t new_pmd; + pmdval_t v = pmd_val(*pmd); + if (clear) { +- *old = v & _PAGE_PRESENT; +- v &= ~_PAGE_PRESENT; +- } else /* presume this has been called with clear==true previously */ +- v |= *old; +- set_pmd(pmd, __pmd(v)); ++ *old = v; ++ new_pmd = pmd_mknotpresent(*pmd); ++ } else { ++ /* Presume this has been called with clear==true previously */ ++ new_pmd = __pmd(*old); ++ } ++ set_pmd(pmd, new_pmd); + } + + static void clear_pte_presence(pte_t *pte, bool clear, pteval_t *old) + { + pteval_t v = pte_val(*pte); + if (clear) { +- *old = v & _PAGE_PRESENT; +- v &= ~_PAGE_PRESENT; +- } else /* presume this has been called with clear==true previously */ +- v |= *old; +- set_pte_atomic(pte, __pte(v)); ++ *old = v; ++ /* Nothing should care about address */ ++ pte_clear(&init_mm, 0, pte); ++ } else { ++ /* Presume this has been called with clear==true previously */ ++ set_pte_atomic(pte, __pte(*old)); ++ } + } + + static int clear_page_presence(struct kmmio_fault_page *f, bool clear) diff --git a/queue-3.16/x86-mm-move-swap-offset-type-up-in-pte-to-work-around-erratum.patch b/queue-3.16/x86-mm-move-swap-offset-type-up-in-pte-to-work-around-erratum.patch new file mode 100644 index 00000000..32cdfe4f --- /dev/null +++ b/queue-3.16/x86-mm-move-swap-offset-type-up-in-pte-to-work-around-erratum.patch @@ -0,0 +1,104 @@ +From: Dave Hansen <dave.hansen@linux.intel.com> +Date: Thu, 7 Jul 2016 17:19:11 -0700 +Subject: x86/mm: Move swap offset/type up in PTE to work around erratum + +commit 00839ee3b299303c6a5e26a0a2485427a3afcbbf upstream. + +This erratum can result in Accessed/Dirty getting set by the hardware +when we do not expect them to be (on !Present PTEs). + +Instead of trying to fix them up after this happens, we just +allow the bits to get set and try to ignore them. We do this by +shifting the layout of the bits we use for swap offset/type in +our 64-bit PTEs. + +It looks like this: + + bitnrs: | ... | 11| 10| 9|8|7|6|5| 4| 3|2|1|0| + names: | ... |SW3|SW2|SW1|G|L|D|A|CD|WT|U|W|P| + before: | OFFSET (9-63) |0|X|X| TYPE(1-5) |0| + after: | OFFSET (14-63) | TYPE (9-13) |0|X|X|X| X| X|X|X|0| + +Note that D was already a don't care (X) even before. We just +move TYPE up and turn its old spot (which could be hit by the +A bit) into all don't cares. + +We take 5 bits away from the offset, but that still leaves us +with 50 bits which lets us index into a 62-bit swapfile (4 EiB). +I think that's probably fine for the moment. We could +theoretically reclaim 5 of the bits (1, 2, 3, 4, 7) but it +doesn't gain us anything. + +Signed-off-by: Dave Hansen <dave.hansen@linux.intel.com> +Cc: Andrew Morton <akpm@linux-foundation.org> +Cc: Andy Lutomirski <luto@kernel.org> +Cc: Borislav Petkov <bp@alien8.de> +Cc: Brian Gerst <brgerst@gmail.com> +Cc: Dave Hansen <dave@sr71.net> +Cc: Denys Vlasenko <dvlasenk@redhat.com> +Cc: H. Peter Anvin <hpa@zytor.com> +Cc: Josh Poimboeuf <jpoimboe@redhat.com> +Cc: Linus Torvalds <torvalds@linux-foundation.org> +Cc: Luis R. 
Rodriguez <mcgrof@suse.com> +Cc: Peter Zijlstra <peterz@infradead.org> +Cc: Thomas Gleixner <tglx@linutronix.de> +Cc: Toshi Kani <toshi.kani@hp.com> +Cc: dave.hansen@intel.com +Cc: linux-mm@kvack.org +Cc: mhocko@suse.com +Link: http://lkml.kernel.org/r/20160708001911.9A3FD2B6@viggo.jf.intel.com +Signed-off-by: Ingo Molnar <mingo@kernel.org> +[bwh: Backported to 3.16: Bit 9 may be reserved for PAGE_BIT_NUMA, which + no longer exists upstream. Adjust the bit numbers accordingly, + incorporating commit ace7fab7a6cd "x86/mm: Fix swap entry comment and + macro".] +Signed-off-by: Ben Hutchings <ben@decadent.org.uk> +--- +--- a/arch/x86/include/asm/pgtable_64.h ++++ b/arch/x86/include/asm/pgtable_64.h +@@ -162,23 +162,37 @@ static inline int pgd_large(pgd_t pgd) { + #define pte_offset_map(dir, address) pte_offset_kernel((dir), (address)) + #define pte_unmap(pte) ((void)(pte))/* NOP */ + +-/* Encode and de-code a swap entry */ +-#define SWP_TYPE_BITS 5 ++/* ++ * Encode and de-code a swap entry ++ * ++ * | ... | 11| 10| 9|8|7|6|5| 4| 3|2|1|0| <- bit number ++ * | ... |SW3|SW2|SW1|G|L|D|A|CD|WT|U|W|P| <- bit names ++ * | OFFSET (15->63) | TYPE (10-14) | 0 |0|X|X|X| X| X|X|X|0| <- swp entry ++ * ++ * G (8) is aliased and used as a PROT_NONE indicator for ++ * !present ptes. We need to start storing swap entries above ++ * there. We also need to avoid using A and D because of an ++ * erratum where they can be incorrectly set by hardware on ++ * non-present PTEs. ++ */ + #ifdef CONFIG_NUMA_BALANCING + /* Automatic NUMA balancing needs to be distinguishable from swap entries */ +-#define SWP_OFFSET_SHIFT (_PAGE_BIT_PROTNONE + 2) ++#define SWP_TYPE_FIRST_SHIFT (_PAGE_BIT_PROTNONE + 2) + #else +-#define SWP_OFFSET_SHIFT (_PAGE_BIT_PROTNONE + 1) ++#define SWP_TYPE_FIRST_SHIFT (_PAGE_BIT_PROTNONE + 1) + #endif ++#define SWP_TYPE_BITS 5 ++/* Place the offset above the type: */ ++#define SWP_OFFSET_FIRST_BIT (SWP_TYPE_FIRST_BIT + SWP_TYPE_BITS) + + #define MAX_SWAPFILES_CHECK() BUILD_BUG_ON(MAX_SWAPFILES_SHIFT > SWP_TYPE_BITS) + +-#define __swp_type(x) (((x).val >> (_PAGE_BIT_PRESENT + 1)) \ ++#define __swp_type(x) (((x).val >> (SWP_TYPE_FIRST_BIT)) \ + & ((1U << SWP_TYPE_BITS) - 1)) +-#define __swp_offset(x) ((x).val >> SWP_OFFSET_SHIFT) ++#define __swp_offset(x) ((x).val >> SWP_OFFSET_FIRST_BIT) + #define __swp_entry(type, offset) ((swp_entry_t) { \ +- ((type) << (_PAGE_BIT_PRESENT + 1)) \ +- | ((offset) << SWP_OFFSET_SHIFT) }) ++ ((type) << (SWP_TYPE_FIRST_BIT)) \ ++ | ((offset) << SWP_OFFSET_FIRST_BIT) }) + #define __pte_to_swp_entry(pte) ((swp_entry_t) { pte_val((pte)) }) + #define __swp_entry_to_pte(x) ((pte_t) { .pte = (x).val }) + diff --git a/queue-3.16/x86-mm-pat-make-set_memory_np-l1tf-safe.patch b/queue-3.16/x86-mm-pat-make-set_memory_np-l1tf-safe.patch new file mode 100644 index 00000000..8cedb8d4 --- /dev/null +++ b/queue-3.16/x86-mm-pat-make-set_memory_np-l1tf-safe.patch @@ -0,0 +1,45 @@ +From: Andi Kleen <ak@linux.intel.com> +Date: Tue, 7 Aug 2018 15:09:39 -0700 +Subject: x86/mm/pat: Make set_memory_np() L1TF safe + +commit 958f79b9ee55dfaf00c8106ed1c22a2919e0028b upstream + +set_memory_np() is used to mark kernel mappings not present, but it has +it's own open coded mechanism which does not have the L1TF protection of +inverting the address bits. + +Replace the open coded PTE manipulation with the L1TF protecting low level +PTE routines. + +Passes the CPA self test. 
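One hedged note on the backport detail flagged above: in 3.16, cpa->pfn at this point actually holds a physical address rather than a page frame number, so the replacement calls in the hunks below must shift it first, e.g.:

    set_pmd(pmd, pmd_mkhuge(pfn_pmd(cpa->pfn >> PAGE_SHIFT,
                                    canon_pgprot(pgprot))));

Without the shift, pfn_pmd() would treat the physical address as a PFN and construct a mapping to the wrong location.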
+ +Signed-off-by: Andi Kleen <ak@linux.intel.com> +Signed-off-by: Thomas Gleixner <tglx@linutronix.de> +[bwh: Backported to 3.16: + - cpa->pfn is actually a physical address here and needs to be shifted to + produce a PFN + - Adjust context] +Signed-off-by: Ben Hutchings <ben@decadent.org.uk> +--- +--- a/arch/x86/mm/pageattr.c ++++ b/arch/x86/mm/pageattr.c +@@ -952,7 +952,8 @@ static int populate_pmd(struct cpa_data + + pmd = pmd_offset(pud, start); + +- set_pmd(pmd, __pmd(cpa->pfn | _PAGE_PSE | massage_pgprot(pgprot))); ++ set_pmd(pmd, pmd_mkhuge(pfn_pmd(cpa->pfn >> PAGE_SHIFT, ++ canon_pgprot(pgprot)))); + + start += PMD_SIZE; + cpa->pfn += PMD_SIZE; +@@ -1022,7 +1023,8 @@ static int populate_pud(struct cpa_data + * Map everything starting from the Gb boundary, possibly with 1G pages + */ + while (end - start >= PUD_SIZE) { +- set_pud(pud, __pud(cpa->pfn | _PAGE_PSE | massage_pgprot(pgprot))); ++ set_pud(pud, pud_mkhuge(pfn_pud(cpa->pfn >> PAGE_SHIFT, ++ canon_pgprot(pgprot)))); + + start += PUD_SIZE; + cpa->pfn += PUD_SIZE; diff --git a/queue-3.16/x86-speculation-l1tf-add-sysfs-reporting-for-l1tf.patch b/queue-3.16/x86-speculation-l1tf-add-sysfs-reporting-for-l1tf.patch new file mode 100644 index 00000000..889f66d7 --- /dev/null +++ b/queue-3.16/x86-speculation-l1tf-add-sysfs-reporting-for-l1tf.patch @@ -0,0 +1,235 @@ +From: Andi Kleen <ak@linux.intel.com> +Date: Wed, 13 Jun 2018 15:48:26 -0700 +Subject: x86/speculation/l1tf: Add sysfs reporting for l1tf + +commit 17dbca119312b4e8173d4e25ff64262119fcef38 upstream + +L1TF core kernel workarounds are cheap and normally always enabled, However +they still should be reported in sysfs if the system is vulnerable or +mitigated. Add the necessary CPU feature/bug bits. + +- Extend the existing checks for Meltdowns to determine if the system is + vulnerable. All CPUs which are not vulnerable to Meltdown are also not + vulnerable to L1TF + +- Check for 32bit non PAE and emit a warning as there is no practical way + for mitigation due to the limited physical address bits + +- If the system has more than MAX_PA/2 physical memory the invert page + workarounds don't protect the system against the L1TF attack anymore, + because an inverted physical address will also point to valid + memory. Print a warning in this case and report that the system is + vulnerable. + +Add a function which returns the PFN limit for the L1TF mitigation, which +will be used in follow up patches for sanity and range checks. 
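The limit function added below is simple enough to check by hand; a hedged standalone demo follows (plain C, assuming a 64-bit build, with 46 physical address bits picked as an example value):

    #include <stdio.h>

    #define PAGE_SHIFT 12

    /* mirrors the kernel's l1tf_pfn_limit(): last PFN below MAX_PA/2 */
    static unsigned long l1tf_pfn_limit(int phys_bits)
    {
            return (1UL << (phys_bits - 1 - PAGE_SHIFT)) - 1;
    }

    int main(void)
    {
            int bits = 46;
            unsigned long limit = l1tf_pfn_limit(bits);

            printf("phys bits %d -> pfn limit %#lx (%lu GiB safe)\n",
                   bits, limit, ((limit + 1UL) << PAGE_SHIFT) >> 30);
            return 0;
    }

With 46 bits this prints a 32768 GiB boundary (MAX_PA/2 = 32 TiB), which is why the warning below about systems with more than MAX_PA/2 memory almost never fires on real hardware. Note that on a 32-bit build the shift in this expression overflows, which is precisely what the later x86-speculation-l1tf-fix-overflow-in-l1tf_pfn_limit-on-32bit patch in this series addresses.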
+ +[ tglx: Renamed the CPU feature bit to L1TF_PTEINV ] +[ dwmw2: Backport to 4.9 (cpufeatures.h, E820) ] + +Signed-off-by: Andi Kleen <ak@linux.intel.com> +Signed-off-by: Thomas Gleixner <tglx@linutronix.de> +Reviewed-by: Josh Poimboeuf <jpoimboe@redhat.com> +Acked-by: Dave Hansen <dave.hansen@intel.com> +Signed-off-by: David Woodhouse <dwmw@amazon.co.uk> +Signed-off-by: Guenter Roeck <linux@roeck-us.net> +Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org> +[bwh: Backported to 3.16: + - Assign the next available bits from feature word 7 and bug word 0 + - CONFIG_PGTABLE_LEVELS is not defined; use other config symbols in the + condition + - Adjust context] +Signed-off-by: Ben Hutchings <ben@decadent.org.uk> +--- + arch/x86/include/asm/cpufeature.h | 3 ++- + arch/x86/include/asm/processor.h | 5 ++++ + arch/x86/kernel/cpu/bugs.c | 40 ++++++++++++++++++++++++++++++ + arch/x86/kernel/cpu/common.c | 20 +++++++++++++++ + drivers/base/cpu.c | 8 ++++++ + include/linux/cpu.h | 2 ++ + 6 files changed, 77 insertions(+), 1 deletion(-) + +--- a/arch/x86/include/asm/cpufeature.h ++++ b/arch/x86/include/asm/cpufeature.h +@@ -199,6 +199,7 @@ + #define X86_FEATURE_MSR_SPEC_CTRL (7*32+19) /* "" MSR SPEC_CTRL is implemented */ + #define X86_FEATURE_SSBD (7*32+20) /* Speculative Store Bypass Disable */ + #define X86_FEATURE_ZEN (7*32+21) /* "" CPU is AMD family 0x17 (Zen) */ ++#define X86_FEATURE_L1TF_PTEINV (7*32+22) /* "" L1TF workaround PTE inversion */ + + #define X86_FEATURE_RETPOLINE (7*32+29) /* "" Generic Retpoline mitigation for Spectre variant 2 */ + #define X86_FEATURE_RETPOLINE_AMD (7*32+30) /* "" AMD Retpoline mitigation for Spectre variant 2 */ +@@ -271,6 +272,7 @@ + #define X86_BUG_SPECTRE_V1 X86_BUG(6) /* CPU is affected by Spectre variant 1 attack with conditional branches */ + #define X86_BUG_SPECTRE_V2 X86_BUG(7) /* CPU is affected by Spectre variant 2 attack with indirect branches */ + #define X86_BUG_SPEC_STORE_BYPASS X86_BUG(8) /* CPU is affected by speculative store bypass attack */ ++#define X86_BUG_L1TF X86_BUG(9) /* CPU is affected by L1 Terminal Fault */ + + #if defined(__KERNEL__) && !defined(__ASSEMBLY__) + +--- a/arch/x86/include/asm/processor.h ++++ b/arch/x86/include/asm/processor.h +@@ -165,6 +165,11 @@ extern const struct seq_operations cpuin + extern void cpu_detect(struct cpuinfo_x86 *c); + extern void fpu_detect(struct cpuinfo_x86 *c); + ++static inline unsigned long l1tf_pfn_limit(void) ++{ ++ return BIT(boot_cpu_data.x86_phys_bits - 1 - PAGE_SHIFT) - 1; ++} ++ + extern void early_cpu_init(void); + extern void identify_boot_cpu(void); + extern void identify_secondary_cpu(struct cpuinfo_x86 *); +--- a/arch/x86/kernel/cpu/bugs.c ++++ b/arch/x86/kernel/cpu/bugs.c +@@ -26,9 +26,11 @@ + #include <asm/pgtable.h> + #include <asm/cacheflush.h> + #include <asm/intel-family.h> ++#include <asm/e820.h> + + static void __init spectre_v2_select_mitigation(void); + static void __init ssb_select_mitigation(void); ++static void __init l1tf_select_mitigation(void); + + /* + * Our boot-time value of the SPEC_CTRL MSR. We read it once so that any +@@ -138,6 +140,8 @@ void __init check_bugs(void) + */ + ssb_select_mitigation(); + ++ l1tf_select_mitigation(); ++ + #ifdef CONFIG_X86_32 + /* + * Check whether we are able to run this kernel safely on SMP. 
+@@ -266,6 +270,32 @@ static void x86_amd_ssb_disable(void) + wrmsrl(MSR_AMD64_LS_CFG, msrval); + } + ++static void __init l1tf_select_mitigation(void) ++{ ++ u64 half_pa; ++ ++ if (!boot_cpu_has_bug(X86_BUG_L1TF)) ++ return; ++ ++#if defined(CONFIG_X86_32) && !defined(CONFIG_X86_PAE) ++ pr_warn("Kernel not compiled for PAE. No mitigation for L1TF\n"); ++ return; ++#endif ++ ++ /* ++ * This is extremely unlikely to happen because almost all ++ * systems have far more MAX_PA/2 than RAM can be fit into ++ * DIMM slots. ++ */ ++ half_pa = (u64)l1tf_pfn_limit() << PAGE_SHIFT; ++ if (e820_any_mapped(half_pa, ULLONG_MAX - half_pa, E820_RAM)) { ++ pr_warn("System has more than MAX_PA/2 memory. L1TF mitigation not effective.\n"); ++ return; ++ } ++ ++ setup_force_cpu_cap(X86_FEATURE_L1TF_PTEINV); ++} ++ + #ifdef RETPOLINE + static bool spectre_v2_bad_module; + +@@ -718,6 +748,11 @@ static ssize_t cpu_show_common(struct de + case X86_BUG_SPEC_STORE_BYPASS: + return sprintf(buf, "%s\n", ssb_strings[ssb_mode]); + ++ case X86_BUG_L1TF: ++ if (boot_cpu_has(X86_FEATURE_L1TF_PTEINV)) ++ return sprintf(buf, "Mitigation: Page Table Inversion\n"); ++ break; ++ + default: + break; + } +@@ -744,4 +779,9 @@ ssize_t cpu_show_spec_store_bypass(struc + { + return cpu_show_common(dev, attr, buf, X86_BUG_SPEC_STORE_BYPASS); + } ++ ++ssize_t cpu_show_l1tf(struct device *dev, struct device_attribute *attr, char *buf) ++{ ++ return cpu_show_common(dev, attr, buf, X86_BUG_L1TF); ++} + #endif +--- a/arch/x86/kernel/cpu/common.c ++++ b/arch/x86/kernel/cpu/common.c +@@ -842,6 +842,21 @@ static const __initconst struct x86_cpu_ + {} + }; + ++static const __initconst struct x86_cpu_id cpu_no_l1tf[] = { ++ /* in addition to cpu_no_speculation */ ++ { X86_VENDOR_INTEL, 6, INTEL_FAM6_ATOM_SILVERMONT1 }, ++ { X86_VENDOR_INTEL, 6, INTEL_FAM6_ATOM_SILVERMONT2 }, ++ { X86_VENDOR_INTEL, 6, INTEL_FAM6_ATOM_AIRMONT }, ++ { X86_VENDOR_INTEL, 6, INTEL_FAM6_ATOM_MERRIFIELD }, ++ { X86_VENDOR_INTEL, 6, INTEL_FAM6_ATOM_MOOREFIELD }, ++ { X86_VENDOR_INTEL, 6, INTEL_FAM6_ATOM_GOLDMONT }, ++ { X86_VENDOR_INTEL, 6, INTEL_FAM6_ATOM_DENVERTON }, ++ { X86_VENDOR_INTEL, 6, INTEL_FAM6_ATOM_GEMINI_LAKE }, ++ { X86_VENDOR_INTEL, 6, INTEL_FAM6_XEON_PHI_KNL }, ++ { X86_VENDOR_INTEL, 6, INTEL_FAM6_XEON_PHI_KNM }, ++ {} ++}; ++ + static void __init cpu_set_bug_bits(struct cpuinfo_x86 *c) + { + u64 ia32_cap = 0; +@@ -867,6 +882,11 @@ static void __init cpu_set_bug_bits(stru + return; + + setup_force_cpu_bug(X86_BUG_CPU_MELTDOWN); ++ ++ if (x86_match_cpu(cpu_no_l1tf)) ++ return; ++ ++ setup_force_cpu_bug(X86_BUG_L1TF); + } + + /* +--- a/drivers/base/cpu.c ++++ b/drivers/base/cpu.c +@@ -444,16 +444,24 @@ ssize_t __weak cpu_show_spec_store_bypas + return sprintf(buf, "Not affected\n"); + } + ++ssize_t __weak cpu_show_l1tf(struct device *dev, ++ struct device_attribute *attr, char *buf) ++{ ++ return sprintf(buf, "Not affected\n"); ++} ++ + static DEVICE_ATTR(meltdown, 0444, cpu_show_meltdown, NULL); + static DEVICE_ATTR(spectre_v1, 0444, cpu_show_spectre_v1, NULL); + static DEVICE_ATTR(spectre_v2, 0444, cpu_show_spectre_v2, NULL); + static DEVICE_ATTR(spec_store_bypass, 0444, cpu_show_spec_store_bypass, NULL); ++static DEVICE_ATTR(l1tf, 0444, cpu_show_l1tf, NULL); + + static struct attribute *cpu_root_vulnerabilities_attrs[] = { + &dev_attr_meltdown.attr, + &dev_attr_spectre_v1.attr, + &dev_attr_spectre_v2.attr, + &dev_attr_spec_store_bypass.attr, ++ &dev_attr_l1tf.attr, + NULL + }; + +--- a/include/linux/cpu.h ++++ b/include/linux/cpu.h +@@ -47,6 +47,8 @@ 
extern ssize_t cpu_show_spectre_v2(struc
+ 				   struct device_attribute *attr, char *buf);
+ extern ssize_t cpu_show_spec_store_bypass(struct device *dev,
+ 					  struct device_attribute *attr, char *buf);
++extern ssize_t cpu_show_l1tf(struct device *dev,
++			     struct device_attribute *attr, char *buf);
+ 
+ #ifdef CONFIG_HOTPLUG_CPU
+ extern void unregister_cpu(struct cpu *cpu);
diff --git a/queue-3.16/x86-speculation-l1tf-change-order-of-offset-type-in-swap-entry.patch b/queue-3.16/x86-speculation-l1tf-change-order-of-offset-type-in-swap-entry.patch
new file mode 100644
index 00000000..83bf5c3e
--- /dev/null
+++ b/queue-3.16/x86-speculation-l1tf-change-order-of-offset-type-in-swap-entry.patch
@@ -0,0 +1,108 @@
+From: Linus Torvalds <torvalds@linux-foundation.org>
+Date: Wed, 13 Jun 2018 15:48:22 -0700
+Subject: x86/speculation/l1tf: Change order of offset/type in swap entry
+
+commit bcd11afa7adad8d720e7ba5ef58bdcd9775cf45f upstream.
+
+If pages are swapped out, the swap entry is stored in the corresponding
+PTE, which has the Present bit cleared. CPUs vulnerable to L1TF speculate
+on such PTE entries as if the Present bit were set and would treat the
+swap entry as a physical address (PFN). To mitigate that, the upper bits
+of the PTE must be set so the PTE points to non-existent memory.
+
+The swap entry stores the type and the offset of a swapped out page in the
+PTE. The type is stored in bits 9-13 and the offset in bits 14-63. The
+hardware ignores the bits beyond the physical address space limit, so to
+make the mitigation effective it's required to start 'offset' at the lowest
+possible bit so that even large swap offsets do not reach into the physical
+address space limit bits.
+
+Move the offset to bits 9-58 and the type to bits 59-63, which are the bits
+that hardware generally doesn't care about.
+
+That, in turn, means that if you are on a desktop chip with only 40 bits of
+physical addressing, now that the offset starts at bit 9, there needs to be
+30 bits of offset actually *in use* until bit 39 ends up being set, which
+means when inverted it will again point into existing memory.
+
+So that's 4 terabytes of swap space (because the offset is counted in pages,
+so 30 bits of offset is 42 bits of actual coverage). With bigger physical
+addressing, that obviously grows further, until the limit of the offset is
+hit (at 50 bits of offset - 62 bits of actual swap file coverage).
+
+This is a preparatory change for the actual swap entry inversion to protect
+against L1TF.
+
+[ AK: Updated description and minor tweaks. Split into two parts ]
+[ tglx: Massaged changelog ]
+
+Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
+Signed-off-by: Andi Kleen <ak@linux.intel.com>
+Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
+Tested-by: Andi Kleen <ak@linux.intel.com>
+Reviewed-by: Josh Poimboeuf <jpoimboe@redhat.com>
+Acked-by: Michal Hocko <mhocko@suse.com>
+Acked-by: Vlastimil Babka <vbabka@suse.cz>
+Acked-by: Dave Hansen <dave.hansen@intel.com>
+[bwh: Backported to 3.16: Bit 9 may be reserved for PAGE_BIT_NUMA here]
+Signed-off-by: Ben Hutchings <ben@decadent.org.uk>
+---
+ arch/x86/include/asm/pgtable_64.h | 31 ++++++++++++++++++-----------
+ 1 file changed, 20 insertions(+), 11 deletions(-)
+
+--- a/arch/x86/include/asm/pgtable_64.h
++++ b/arch/x86/include/asm/pgtable_64.h
+@@ -167,7 +167,7 @@ static inline int pgd_large(pgd_t pgd) {
+  *
+  * | ... | 11| 10|  9|8|7|6|5| 4| 3|2| 1|0| <- bit number
+  * | ... 
|SW3|SW2|SW1|G|L|D|A|CD|WT|U| W|P| <- bit names +- * | OFFSET (15->63) | TYPE (10-14) | 0 |0|0|X|X| X| X|X|SD|0| <- swp entry ++ * | TYPE (59-63) | OFFSET (10-58) | 0 |0|0|X|X| X| X|X|SD|0| <- swp entry + * + * G (8) is aliased and used as a PROT_NONE indicator for + * !present ptes. We need to start storing swap entries above +@@ -181,24 +181,33 @@ static inline int pgd_large(pgd_t pgd) { + * Bit 7 in swp entry should be 0 because pmd_present checks not only P, + * but also L and G. + */ ++#define SWP_TYPE_BITS 5 ++ + #ifdef CONFIG_NUMA_BALANCING + /* Automatic NUMA balancing needs to be distinguishable from swap entries */ +-#define SWP_TYPE_FIRST_SHIFT (_PAGE_BIT_PROTNONE + 2) ++#define SWP_OFFSET_FIRST_BIT (_PAGE_BIT_PROTNONE + 2) + #else +-#define SWP_TYPE_FIRST_SHIFT (_PAGE_BIT_PROTNONE + 1) ++#define SWP_OFFSET_FIRST_BIT (_PAGE_BIT_PROTNONE + 1) + #endif +-#define SWP_TYPE_BITS 5 +-/* Place the offset above the type: */ +-#define SWP_OFFSET_FIRST_BIT (SWP_TYPE_FIRST_BIT + SWP_TYPE_BITS) ++ ++/* We always extract/encode the offset by shifting it all the way up, and then down again */ ++#define SWP_OFFSET_SHIFT (SWP_OFFSET_FIRST_BIT+SWP_TYPE_BITS) + + #define MAX_SWAPFILES_CHECK() BUILD_BUG_ON(MAX_SWAPFILES_SHIFT > SWP_TYPE_BITS) + +-#define __swp_type(x) (((x).val >> (SWP_TYPE_FIRST_BIT)) \ +- & ((1U << SWP_TYPE_BITS) - 1)) +-#define __swp_offset(x) ((x).val >> SWP_OFFSET_FIRST_BIT) +-#define __swp_entry(type, offset) ((swp_entry_t) { \ +- ((type) << (SWP_TYPE_FIRST_BIT)) \ +- | ((offset) << SWP_OFFSET_FIRST_BIT) }) ++/* Extract the high bits for type */ ++#define __swp_type(x) ((x).val >> (64 - SWP_TYPE_BITS)) ++ ++/* Shift up (to get rid of type), then down to get value */ ++#define __swp_offset(x) ((x).val << SWP_TYPE_BITS >> SWP_OFFSET_SHIFT) ++ ++/* ++ * Shift the offset up "too far" by TYPE bits, then down again ++ */ ++#define __swp_entry(type, offset) ((swp_entry_t) { \ ++ ((unsigned long)(offset) << SWP_OFFSET_SHIFT >> SWP_TYPE_BITS) \ ++ | ((unsigned long)(type) << (64-SWP_TYPE_BITS)) }) ++ + #define __pte_to_swp_entry(pte) ((swp_entry_t) { pte_val((pte)) }) + #define __swp_entry_to_pte(x) ((pte_t) { .pte = (x).val }) + diff --git a/queue-3.16/x86-speculation-l1tf-disallow-non-privileged-high-mmio-prot_none.patch b/queue-3.16/x86-speculation-l1tf-disallow-non-privileged-high-mmio-prot_none.patch new file mode 100644 index 00000000..38919fde --- /dev/null +++ b/queue-3.16/x86-speculation-l1tf-disallow-non-privileged-high-mmio-prot_none.patch @@ -0,0 +1,274 @@ +From: Andi Kleen <ak@linux.intel.com> +Date: Wed, 13 Jun 2018 15:48:27 -0700 +Subject: x86/speculation/l1tf: Disallow non privileged high MMIO PROT_NONE + mappings + +commit 42e4089c7890725fcd329999252dc489b72f2921 upstream + +For L1TF PROT_NONE mappings are protected by inverting the PFN in the page +table entry. This sets the high bits in the CPU's address space, thus +making sure to point to not point an unmapped entry to valid cached memory. + +Some server system BIOSes put the MMIO mappings high up in the physical +address space. If such an high mapping was mapped to unprivileged users +they could attack low memory by setting such a mapping to PROT_NONE. This +could happen through a special device driver which is not access +protected. Normal /dev/mem is of course access protected. + +To avoid this forbid PROT_NONE mappings or mprotect for high MMIO mappings. + +Valid page mappings are allowed because the system is then unsafe anyways. + +It's not expected that users commonly use PROT_NONE on MMIO. 
But to
+minimize any impact, this is only enforced if the mapping actually refers to
+a high MMIO address (defined as the MAX_PA-1 bit being set), and the check
+is also skipped for root.
+
+For mmaps this is straightforward and can be handled in vm_insert_pfn and
+in remap_pfn_range().
+
+For mprotect it's a bit trickier. At the point where the actual PTEs are
+accessed a lot of state has been changed and it would be difficult to undo
+on an error. Since this is an uncommon case, use a separate early page
+table walk pass for MMIO PROT_NONE mappings that checks for this condition
+early. For non-MMIO and non-PROT_NONE mappings there are no changes.
+
+[dwmw2: Backport to 4.9]
+[groeck: Backport to 4.4]
+
+Signed-off-by: Andi Kleen <ak@linux.intel.com>
+Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
+Reviewed-by: Josh Poimboeuf <jpoimboe@redhat.com>
+Acked-by: Dave Hansen <dave.hansen@intel.com>
+Signed-off-by: David Woodhouse <dwmw@amazon.co.uk>
+Signed-off-by: Guenter Roeck <linux@roeck-us.net>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+Signed-off-by: Ben Hutchings <ben@decadent.org.uk>
+---
+ arch/x86/include/asm/pgtable.h |  8 ++++++
+ arch/x86/mm/mmap.c             | 21 +++++++++++++++
+ include/asm-generic/pgtable.h  | 12 +++++++++
+ mm/memory.c                    | 29 +++++++++++++++-----
+ mm/mprotect.c                  | 49 ++++++++++++++++++++++++++++++++++
+ 5 files changed, 112 insertions(+), 7 deletions(-)
+
+--- a/arch/x86/include/asm/pgtable.h
++++ b/arch/x86/include/asm/pgtable.h
+@@ -940,6 +940,14 @@ static inline pte_t pte_swp_clear_soft_d
+ }
+ #endif
+ 
++#define __HAVE_ARCH_PFN_MODIFY_ALLOWED 1
++extern bool pfn_modify_allowed(unsigned long pfn, pgprot_t prot);
++
++static inline bool arch_has_pfn_modify_check(void)
++{
++	return boot_cpu_has_bug(X86_BUG_L1TF);
++}
++
+ #include <asm-generic/pgtable.h>
+ #endif	/* __ASSEMBLY__ */
+ 
+--- a/arch/x86/mm/mmap.c
++++ b/arch/x86/mm/mmap.c
+@@ -114,3 +114,24 @@ void arch_pick_mmap_layout(struct mm_str
+ 		mm->get_unmapped_area = arch_get_unmapped_area_topdown;
+ 	}
+ }
++
++/*
++ * Only allow root to set high MMIO mappings to PROT_NONE.
++ * This prevents an unpriv. user to set them to PROT_NONE and invert
++ * them, then pointing to valid memory for L1TF speculation.
++ *
++ * Note: for locked down kernels may want to disable the root override.
++ */ ++bool pfn_modify_allowed(unsigned long pfn, pgprot_t prot) ++{ ++ if (!boot_cpu_has_bug(X86_BUG_L1TF)) ++ return true; ++ if (!__pte_needs_invert(pgprot_val(prot))) ++ return true; ++ /* If it's real memory always allow */ ++ if (pfn_valid(pfn)) ++ return true; ++ if (pfn > l1tf_pfn_limit() && !capable(CAP_SYS_ADMIN)) ++ return false; ++ return true; ++} +--- a/include/asm-generic/pgtable.h ++++ b/include/asm-generic/pgtable.h +@@ -812,4 +812,16 @@ static inline void pmdp_set_numa(struct + #define io_remap_pfn_range remap_pfn_range + #endif + ++#ifndef __HAVE_ARCH_PFN_MODIFY_ALLOWED ++static inline bool pfn_modify_allowed(unsigned long pfn, pgprot_t prot) ++{ ++ return true; ++} ++ ++static inline bool arch_has_pfn_modify_check(void) ++{ ++ return false; ++} ++#endif ++ + #endif /* _ASM_GENERIC_PGTABLE_H */ +--- a/mm/memory.c ++++ b/mm/memory.c +@@ -1614,6 +1614,9 @@ int vm_insert_pfn_prot(struct vm_area_st + if (track_pfn_insert(vma, &pgprot, pfn)) + return -EINVAL; + ++ if (!pfn_modify_allowed(pfn, pgprot)) ++ return -EACCES; ++ + ret = insert_pfn(vma, addr, pfn, pgprot); + + return ret; +@@ -1632,6 +1635,9 @@ int vm_insert_mixed(struct vm_area_struc + if (track_pfn_insert(vma, &pgprot, pfn)) + return -EINVAL; + ++ if (!pfn_modify_allowed(pfn, pgprot)) ++ return -EACCES; ++ + /* + * If we don't have pte special, then we have to use the pfn_valid() + * based VM_MIXEDMAP scheme (see vm_normal_page), and thus we *must* +@@ -1660,6 +1666,7 @@ static int remap_pte_range(struct mm_str + { + pte_t *pte; + spinlock_t *ptl; ++ int err = 0; + + pte = pte_alloc_map_lock(mm, pmd, addr, &ptl); + if (!pte) +@@ -1667,12 +1674,16 @@ static int remap_pte_range(struct mm_str + arch_enter_lazy_mmu_mode(); + do { + BUG_ON(!pte_none(*pte)); ++ if (!pfn_modify_allowed(pfn, prot)) { ++ err = -EACCES; ++ break; ++ } + set_pte_at(mm, addr, pte, pte_mkspecial(pfn_pte(pfn, prot))); + pfn++; + } while (pte++, addr += PAGE_SIZE, addr != end); + arch_leave_lazy_mmu_mode(); + pte_unmap_unlock(pte - 1, ptl); +- return 0; ++ return err; + } + + static inline int remap_pmd_range(struct mm_struct *mm, pud_t *pud, +@@ -1681,6 +1692,7 @@ static inline int remap_pmd_range(struct + { + pmd_t *pmd; + unsigned long next; ++ int err; + + pfn -= addr >> PAGE_SHIFT; + pmd = pmd_alloc(mm, pud, addr); +@@ -1689,9 +1701,10 @@ static inline int remap_pmd_range(struct + VM_BUG_ON(pmd_trans_huge(*pmd)); + do { + next = pmd_addr_end(addr, end); +- if (remap_pte_range(mm, pmd, addr, next, +- pfn + (addr >> PAGE_SHIFT), prot)) +- return -ENOMEM; ++ err = remap_pte_range(mm, pmd, addr, next, ++ pfn + (addr >> PAGE_SHIFT), prot); ++ if (err) ++ return err; + } while (pmd++, addr = next, addr != end); + return 0; + } +@@ -1702,6 +1715,7 @@ static inline int remap_pud_range(struct + { + pud_t *pud; + unsigned long next; ++ int err; + + pfn -= addr >> PAGE_SHIFT; + pud = pud_alloc(mm, pgd, addr); +@@ -1709,9 +1723,10 @@ static inline int remap_pud_range(struct + return -ENOMEM; + do { + next = pud_addr_end(addr, end); +- if (remap_pmd_range(mm, pud, addr, next, +- pfn + (addr >> PAGE_SHIFT), prot)) +- return -ENOMEM; ++ err = remap_pmd_range(mm, pud, addr, next, ++ pfn + (addr >> PAGE_SHIFT), prot); ++ if (err) ++ return err; + } while (pud++, addr = next, addr != end); + return 0; + } +--- a/mm/mprotect.c ++++ b/mm/mprotect.c +@@ -258,6 +258,42 @@ unsigned long change_protection(struct v + return pages; + } + ++static int prot_none_pte_entry(pte_t *pte, unsigned long addr, ++ unsigned long next, struct mm_walk *walk) ++{ ++ return 
pfn_modify_allowed(pte_pfn(*pte), *(pgprot_t *)(walk->private)) ? ++ 0 : -EACCES; ++} ++ ++static int prot_none_hugetlb_entry(pte_t *pte, unsigned long hmask, ++ unsigned long addr, unsigned long next, ++ struct mm_walk *walk) ++{ ++ return pfn_modify_allowed(pte_pfn(*pte), *(pgprot_t *)(walk->private)) ? ++ 0 : -EACCES; ++} ++ ++static int prot_none_test(unsigned long addr, unsigned long next, ++ struct mm_walk *walk) ++{ ++ return 0; ++} ++ ++static int prot_none_walk(struct vm_area_struct *vma, unsigned long start, ++ unsigned long end, unsigned long newflags) ++{ ++ pgprot_t new_pgprot = vm_get_page_prot(newflags); ++ struct mm_walk prot_none_walk = { ++ .pte_entry = prot_none_pte_entry, ++ .hugetlb_entry = prot_none_hugetlb_entry, ++ .test_walk = prot_none_test, ++ .mm = current->mm, ++ .private = &new_pgprot, ++ }; ++ ++ return walk_page_range(start, end, &prot_none_walk); ++} ++ + int + mprotect_fixup(struct vm_area_struct *vma, struct vm_area_struct **pprev, + unsigned long start, unsigned long end, unsigned long newflags) +@@ -276,6 +312,19 @@ mprotect_fixup(struct vm_area_struct *vm + } + + /* ++ * Do PROT_NONE PFN permission checks here when we can still ++ * bail out without undoing a lot of state. This is a rather ++ * uncommon case, so doesn't need to be very optimized. ++ */ ++ if (arch_has_pfn_modify_check() && ++ (vma->vm_flags & (VM_PFNMAP|VM_MIXEDMAP)) && ++ (newflags & (VM_READ|VM_WRITE|VM_EXEC)) == 0) { ++ error = prot_none_walk(vma, start, end, newflags); ++ if (error) ++ return error; ++ } ++ ++ /* + * If we make a private mapping writable we increase our commit; + * but (without finer accounting) cannot reduce our commit if we + * make it unwritable again. hugetlb mapping were accounted for diff --git a/queue-3.16/x86-speculation-l1tf-exempt-zeroed-ptes-from-inversion.patch b/queue-3.16/x86-speculation-l1tf-exempt-zeroed-ptes-from-inversion.patch new file mode 100644 index 00000000..0fdd9585 --- /dev/null +++ b/queue-3.16/x86-speculation-l1tf-exempt-zeroed-ptes-from-inversion.patch @@ -0,0 +1,69 @@ +From: Sean Christopherson <sean.j.christopherson@intel.com> +Date: Fri, 17 Aug 2018 10:27:36 -0700 +Subject: x86/speculation/l1tf: Exempt zeroed PTEs from inversion + +commit f19f5c49bbc3ffcc9126cc245fc1b24cc29f4a37 upstream. + +It turns out that we should *not* invert all not-present mappings, +because the all zeroes case is obviously special. + +clear_page() does not undergo the XOR logic to invert the address bits, +i.e. PTE, PMD and PUD entries that have not been individually written +will have val=0 and so will trigger __pte_needs_invert(). As a result, +{pte,pmd,pud}_pfn() will return the wrong PFN value, i.e. all ones +(adjusted by the max PFN mask) instead of zero. A zeroed entry is ok +because the page at physical address 0 is reserved early in boot +specifically to mitigate L1TF, so explicitly exempt them from the +inversion when reading the PFN. + +Manifested as an unexpected mprotect(..., PROT_NONE) failure when called +on a VMA that has VM_PFNMAP and was mmap'd to as something other than +PROT_NONE but never used. mprotect() sends the PROT_NONE request down +prot_none_walk(), which walks the PTEs to check the PFNs. +prot_none_pte_entry() gets the bogus PFN from pte_pfn() and returns +-EACCES because it thinks mprotect() is trying to adjust a high MMIO +address. 
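/*
 * Editorial illustration, not part of the patch: a user-space sketch of
 * the regression described above, assuming the x86-64 layout with a
 * 40-bit PFN field. Under the unconditional "invert every !present
 * entry" rule a cleared PTE (val == 0) decodes to the all-ones PFN,
 * while the zero exemption restores PFN 0.
 */
#include <stdio.h>
#include <stdint.h>
#include <stdbool.h>

#define _PAGE_PRESENT	0x001ULL
#define PTE_PFN_MASK	0x000ffffffffff000ULL	/* bits 12-51 */
#define PAGE_SHIFT	12

static uint64_t pfn_of(uint64_t val, bool zero_exempt)
{
	bool invert = zero_exempt ? (val && !(val & _PAGE_PRESENT))
				  : !(val & _PAGE_PRESENT);

	if (invert)
		val = ~val;
	return (val & PTE_PFN_MASK) >> PAGE_SHIFT;
}

int main(void)
{
	/* the bogus huge pfn is what tripped prot_none_pte_entry() */
	printf("zero pte, unconditional invert: pfn 0x%llx\n",
	       (unsigned long long)pfn_of(0, false));
	printf("zero pte, zero exempt:          pfn 0x%llx\n",
	       (unsigned long long)pfn_of(0, true));
	return 0;
}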
+ +[ This is a very modified version of Sean's original patch, but all + credit goes to Sean for doing this and also pointing out that + sometimes the __pte_needs_invert() function only gets the protection + bits, not the full eventual pte. But zero remains special even in + just protection bits, so that's ok. - Linus ] + +Fixes: f22cc87f6c1f ("x86/speculation/l1tf: Invert all not present mappings") +Signed-off-by: Sean Christopherson <sean.j.christopherson@intel.com> +Acked-by: Andi Kleen <ak@linux.intel.com> +Cc: Thomas Gleixner <tglx@linutronix.de> +Cc: Josh Poimboeuf <jpoimboe@redhat.com> +Cc: Michal Hocko <mhocko@suse.com> +Cc: Vlastimil Babka <vbabka@suse.cz> +Cc: Dave Hansen <dave.hansen@intel.com> +Cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org> +Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org> +Signed-off-by: Ben Hutchings <ben@decadent.org.uk> +--- + arch/x86/include/asm/pgtable-invert.h | 11 ++++++++++- + 1 file changed, 10 insertions(+), 1 deletion(-) + +--- a/arch/x86/include/asm/pgtable-invert.h ++++ b/arch/x86/include/asm/pgtable-invert.h +@@ -4,9 +4,18 @@ + + #ifndef __ASSEMBLY__ + ++/* ++ * A clear pte value is special, and doesn't get inverted. ++ * ++ * Note that even users that only pass a pgprot_t (rather ++ * than a full pte) won't trigger the special zero case, ++ * because even PAGE_NONE has _PAGE_PROTNONE | _PAGE_ACCESSED ++ * set. So the all zero case really is limited to just the ++ * cleared page table entry case. ++ */ + static inline bool __pte_needs_invert(u64 val) + { +- return !(val & _PAGE_PRESENT); ++ return val && !(val & _PAGE_PRESENT); + } + + /* Get a mask to xor with the page table entry to get the correct pfn. */ diff --git a/queue-3.16/x86-speculation-l1tf-extend-64bit-swap-file-size-limit.patch b/queue-3.16/x86-speculation-l1tf-extend-64bit-swap-file-size-limit.patch new file mode 100644 index 00000000..96858d10 --- /dev/null +++ b/queue-3.16/x86-speculation-l1tf-extend-64bit-swap-file-size-limit.patch @@ -0,0 +1,42 @@ +From: Vlastimil Babka <vbabka@suse.cz> +Date: Thu, 21 Jun 2018 12:36:29 +0200 +Subject: x86/speculation/l1tf: Extend 64bit swap file size limit + +commit 1a7ed1ba4bba6c075d5ad61bb75e3fbc870840d6 upstream. + +The previous patch has limited swap file size so that large offsets cannot +clear bits above MAX_PA/2 in the pte and interfere with L1TF mitigation. + +It assumed that offsets are encoded starting with bit 12, same as pfn. But +on x86_64, offsets are encoded starting with bit 9. + +Thus the limit can be raised by 3 bits. That means 16TB with 42bit MAX_PA +and 256TB with 46bit MAX_PA. + +Fixes: 377eeaa8e11f ("x86/speculation/l1tf: Limit swap file size to MAX_PA/2") +Signed-off-by: Vlastimil Babka <vbabka@suse.cz> +Signed-off-by: Thomas Gleixner <tglx@linutronix.de> +Signed-off-by: Ben Hutchings <ben@decadent.org.uk> +--- + arch/x86/mm/init.c | 10 +++++++++- + 1 file changed, 9 insertions(+), 1 deletion(-) + +--- a/arch/x86/mm/init.c ++++ b/arch/x86/mm/init.c +@@ -710,7 +710,15 @@ unsigned long max_swapfile_size(void) + + if (boot_cpu_has_bug(X86_BUG_L1TF)) { + /* Limit the swap file size to MAX_PA/2 for L1TF workaround */ +- pages = min_t(unsigned long, l1tf_pfn_limit() + 1, pages); ++ unsigned long l1tf_limit = l1tf_pfn_limit() + 1; ++ /* ++ * We encode swap offsets also with 3 bits below those for pfn ++ * which makes the usable limit higher. 
++ */ ++#ifdef CONFIG_X86_64 ++ l1tf_limit <<= PAGE_SHIFT - SWP_OFFSET_FIRST_BIT; ++#endif ++ pages = min_t(unsigned long, l1tf_limit, pages); + } + return pages; + } diff --git a/queue-3.16/x86-speculation-l1tf-fix-off-by-one-error-when-warning-that-system.patch b/queue-3.16/x86-speculation-l1tf-fix-off-by-one-error-when-warning-that-system.patch new file mode 100644 index 00000000..0b76a3fb --- /dev/null +++ b/queue-3.16/x86-speculation-l1tf-fix-off-by-one-error-when-warning-that-system.patch @@ -0,0 +1,78 @@ +From: Vlastimil Babka <vbabka@suse.cz> +Date: Thu, 23 Aug 2018 15:44:18 +0200 +Subject: x86/speculation/l1tf: Fix off-by-one error when warning that system + has too much RAM + +commit b0a182f875689647b014bc01d36b340217792852 upstream. + +Two users have reported [1] that they have an "extremely unlikely" system +with more than MAX_PA/2 memory and L1TF mitigation is not effective. In +fact it's a CPU with 36bits phys limit (64GB) and 32GB memory, but due to +holes in the e820 map, the main region is almost 500MB over the 32GB limit: + +[ 0.000000] BIOS-e820: [mem 0x0000000100000000-0x000000081effffff] usable + +Suggestions to use 'mem=32G' to enable the L1TF mitigation while losing the +500MB revealed, that there's an off-by-one error in the check in +l1tf_select_mitigation(). + +l1tf_pfn_limit() returns the last usable pfn (inclusive) and the range +check in the mitigation path does not take this into account. + +Instead of amending the range check, make l1tf_pfn_limit() return the first +PFN which is over the limit which is less error prone. Adjust the other +users accordingly. + +[1] https://bugzilla.suse.com/show_bug.cgi?id=1105536 + +Fixes: 17dbca119312 ("x86/speculation/l1tf: Add sysfs reporting for l1tf") +Reported-by: George Anchev <studio@anchev.net> +Reported-by: Christopher Snowhill <kode54@gmail.com> +Signed-off-by: Vlastimil Babka <vbabka@suse.cz> +Signed-off-by: Thomas Gleixner <tglx@linutronix.de> +Cc: "H . Peter Anvin" <hpa@zytor.com> +Cc: Linus Torvalds <torvalds@linux-foundation.org> +Cc: Andi Kleen <ak@linux.intel.com> +Cc: Dave Hansen <dave.hansen@intel.com> +Cc: Michal Hocko <mhocko@kernel.org> +Link: https://lkml.kernel.org/r/20180823134418.17008-1-vbabka@suse.cz +Signed-off-by: Ben Hutchings <ben@decadent.org.uk> +--- + arch/x86/include/asm/processor.h | 2 +- + arch/x86/mm/init.c | 2 +- + arch/x86/mm/mmap.c | 2 +- + 3 files changed, 3 insertions(+), 3 deletions(-) + +--- a/arch/x86/include/asm/processor.h ++++ b/arch/x86/include/asm/processor.h +@@ -167,7 +167,7 @@ extern void fpu_detect(struct cpuinfo_x8 + + static inline unsigned long long l1tf_pfn_limit(void) + { +- return BIT_ULL(boot_cpu_data.x86_phys_bits - 1 - PAGE_SHIFT) - 1; ++ return BIT_ULL(boot_cpu_data.x86_phys_bits - 1 - PAGE_SHIFT); + } + + extern void early_cpu_init(void); +--- a/arch/x86/mm/init.c ++++ b/arch/x86/mm/init.c +@@ -710,7 +710,7 @@ unsigned long max_swapfile_size(void) + + if (boot_cpu_has_bug(X86_BUG_L1TF)) { + /* Limit the swap file size to MAX_PA/2 for L1TF workaround */ +- unsigned long long l1tf_limit = l1tf_pfn_limit() + 1; ++ unsigned long long l1tf_limit = l1tf_pfn_limit(); + /* + * We encode swap offsets also with 3 bits below those for pfn + * which makes the usable limit higher. 
+--- a/arch/x86/mm/mmap.c ++++ b/arch/x86/mm/mmap.c +@@ -131,7 +131,7 @@ bool pfn_modify_allowed(unsigned long pf + /* If it's real memory always allow */ + if (pfn_valid(pfn)) + return true; +- if (pfn > l1tf_pfn_limit() && !capable(CAP_SYS_ADMIN)) ++ if (pfn >= l1tf_pfn_limit() && !capable(CAP_SYS_ADMIN)) + return false; + return true; + } diff --git a/queue-3.16/x86-speculation-l1tf-fix-overflow-in-l1tf_pfn_limit-on-32bit.patch b/queue-3.16/x86-speculation-l1tf-fix-overflow-in-l1tf_pfn_limit-on-32bit.patch new file mode 100644 index 00000000..9fbc2d17 --- /dev/null +++ b/queue-3.16/x86-speculation-l1tf-fix-overflow-in-l1tf_pfn_limit-on-32bit.patch @@ -0,0 +1,70 @@ +From: Vlastimil Babka <vbabka@suse.cz> +Date: Mon, 20 Aug 2018 11:58:35 +0200 +Subject: x86/speculation/l1tf: Fix overflow in l1tf_pfn_limit() on 32bit + +commit 9df9516940a61d29aedf4d91b483ca6597e7d480 upstream. + +On 32bit PAE kernels on 64bit hardware with enough physical bits, +l1tf_pfn_limit() will overflow unsigned long. This in turn affects +max_swapfile_size() and can lead to swapon returning -EINVAL. This has been +observed in a 32bit guest with 42 bits physical address size, where +max_swapfile_size() overflows exactly to 1 << 32, thus zero, and produces +the following warning to dmesg: + +[ 6.396845] Truncating oversized swap area, only using 0k out of 2047996k + +Fix this by using unsigned long long instead. + +Fixes: 17dbca119312 ("x86/speculation/l1tf: Add sysfs reporting for l1tf") +Fixes: 377eeaa8e11f ("x86/speculation/l1tf: Limit swap file size to MAX_PA/2") +Reported-by: Dominique Leuenberger <dimstar@suse.de> +Reported-by: Adrian Schroeter <adrian@suse.de> +Signed-off-by: Vlastimil Babka <vbabka@suse.cz> +Signed-off-by: Thomas Gleixner <tglx@linutronix.de> +Acked-by: Andi Kleen <ak@linux.intel.com> +Acked-by: Michal Hocko <mhocko@suse.com> +Cc: "H . Peter Anvin" <hpa@zytor.com> +Cc: Linus Torvalds <torvalds@linux-foundation.org> +Cc: Dave Hansen <dave.hansen@intel.com> +Cc: Michal Hocko <mhocko@kernel.org> +Link: https://lkml.kernel.org/r/20180820095835.5298-1-vbabka@suse.cz +Signed-off-by: Ben Hutchings <ben@decadent.org.uk> +--- + arch/x86/include/asm/processor.h | 4 ++-- + arch/x86/mm/init.c | 4 ++-- + 2 files changed, 4 insertions(+), 4 deletions(-) + +--- a/arch/x86/include/asm/processor.h ++++ b/arch/x86/include/asm/processor.h +@@ -165,9 +165,9 @@ extern const struct seq_operations cpuin + extern void cpu_detect(struct cpuinfo_x86 *c); + extern void fpu_detect(struct cpuinfo_x86 *c); + +-static inline unsigned long l1tf_pfn_limit(void) ++static inline unsigned long long l1tf_pfn_limit(void) + { +- return BIT(boot_cpu_data.x86_phys_bits - 1 - PAGE_SHIFT) - 1; ++ return BIT_ULL(boot_cpu_data.x86_phys_bits - 1 - PAGE_SHIFT) - 1; + } + + extern void early_cpu_init(void); +--- a/arch/x86/mm/init.c ++++ b/arch/x86/mm/init.c +@@ -710,7 +710,7 @@ unsigned long max_swapfile_size(void) + + if (boot_cpu_has_bug(X86_BUG_L1TF)) { + /* Limit the swap file size to MAX_PA/2 for L1TF workaround */ +- unsigned long l1tf_limit = l1tf_pfn_limit() + 1; ++ unsigned long long l1tf_limit = l1tf_pfn_limit() + 1; + /* + * We encode swap offsets also with 3 bits below those for pfn + * which makes the usable limit higher. 
+@@ -718,7 +718,7 @@ unsigned long max_swapfile_size(void) + #if defined(CONFIG_X86_64) || defined(CONFIG_X86_PAE) + l1tf_limit <<= PAGE_SHIFT - SWP_OFFSET_FIRST_BIT; + #endif +- pages = min_t(unsigned long, l1tf_limit, pages); ++ pages = min_t(unsigned long long, l1tf_limit, pages); + } + return pages; + } diff --git a/queue-3.16/x86-speculation-l1tf-fix-up-pte-pfn-conversion-for-pae.patch b/queue-3.16/x86-speculation-l1tf-fix-up-pte-pfn-conversion-for-pae.patch new file mode 100644 index 00000000..96858342 --- /dev/null +++ b/queue-3.16/x86-speculation-l1tf-fix-up-pte-pfn-conversion-for-pae.patch @@ -0,0 +1,81 @@ +From: Michal Hocko <mhocko@suse.cz> +Date: Wed, 27 Jun 2018 17:46:50 +0200 +Subject: x86/speculation/l1tf: Fix up pte->pfn conversion for PAE + +commit e14d7dfb41f5807a0c1c26a13f2b8ef16af24935 upstream + +Jan has noticed that pte_pfn and co. resp. pfn_pte are incorrect for +CONFIG_PAE because phys_addr_t is wider than unsigned long and so the +pte_val reps. shift left would get truncated. Fix this up by using proper +types. + +[dwmw2: Backport to 4.9] + +Fixes: 6b28baca9b1f ("x86/speculation/l1tf: Protect PROT_NONE PTEs against speculation") +Reported-by: Jan Beulich <JBeulich@suse.com> +Signed-off-by: Michal Hocko <mhocko@suse.com> +Signed-off-by: Thomas Gleixner <tglx@linutronix.de> +Acked-by: Vlastimil Babka <vbabka@suse.cz> +Signed-off-by: David Woodhouse <dwmw@amazon.co.uk> +Signed-off-by: Guenter Roeck <linux@roeck-us.net> +Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org> +[bwh: Backported to 3.16: Adjust context. Also restore the fix to pfn_pud().] +Signed-off-by: Ben Hutchings <ben@decadent.org.uk> +--- + arch/x86/include/asm/pgtable.h | 10 +++++----- + 1 file changed, 5 insertions(+), 5 deletions(-) + +--- a/arch/x86/include/asm/pgtable.h ++++ b/arch/x86/include/asm/pgtable.h +@@ -147,21 +147,21 @@ static inline u64 protnone_mask(u64 val) + + static inline unsigned long pte_pfn(pte_t pte) + { +- unsigned long pfn = pte_val(pte); ++ phys_addr_t pfn = pte_val(pte); + pfn ^= protnone_mask(pfn); + return (pfn & PTE_PFN_MASK) >> PAGE_SHIFT; + } + + static inline unsigned long pmd_pfn(pmd_t pmd) + { +- unsigned long pfn = pmd_val(pmd); ++ phys_addr_t pfn = pmd_val(pmd); + pfn ^= protnone_mask(pfn); + return (pfn & PTE_PFN_MASK) >> PAGE_SHIFT; + } + + static inline unsigned long pud_pfn(pud_t pud) + { +- unsigned long pfn = pud_val(pud); ++ phys_addr_t pfn = pud_val(pud); + pfn ^= protnone_mask(pfn); + return (pfn & PTE_PFN_MASK) >> PAGE_SHIFT; + } +@@ -371,7 +371,7 @@ static inline pgprotval_t massage_pgprot + + static inline pte_t pfn_pte(unsigned long page_nr, pgprot_t pgprot) + { +- phys_addr_t pfn = page_nr << PAGE_SHIFT; ++ phys_addr_t pfn = (phys_addr_t)page_nr << PAGE_SHIFT; + pfn ^= protnone_mask(pgprot_val(pgprot)); + pfn &= PTE_PFN_MASK; + return __pte(pfn | massage_pgprot(pgprot)); +@@ -379,7 +379,7 @@ static inline pte_t pfn_pte(unsigned lon + + static inline pmd_t pfn_pmd(unsigned long page_nr, pgprot_t pgprot) + { +- phys_addr_t pfn = page_nr << PAGE_SHIFT; ++ phys_addr_t pfn = (phys_addr_t)page_nr << PAGE_SHIFT; + pfn ^= protnone_mask(pgprot_val(pgprot)); + pfn &= PTE_PFN_MASK; + return __pmd(pfn | massage_pgprot(pgprot)); +@@ -387,7 +387,7 @@ static inline pmd_t pfn_pmd(unsigned lon + + static inline pud_t pfn_pud(unsigned long page_nr, pgprot_t pgprot) + { +- phys_addr_t pfn = page_nr << PAGE_SHIFT; ++ phys_addr_t pfn = (phys_addr_t)page_nr << PAGE_SHIFT; + pfn ^= protnone_mask(pgprot_val(pgprot)); + pfn &= PTE_PFN_MASK; + return __pud(pfn | 
massage_pgprot(pgprot)); diff --git a/queue-3.16/x86-speculation-l1tf-increase-32bit-pae-__physical_page_shift.patch b/queue-3.16/x86-speculation-l1tf-increase-32bit-pae-__physical_page_shift.patch new file mode 100644 index 00000000..ef156edc --- /dev/null +++ b/queue-3.16/x86-speculation-l1tf-increase-32bit-pae-__physical_page_shift.patch @@ -0,0 +1,77 @@ +From: Andi Kleen <ak@linux.intel.com> +Date: Wed, 13 Jun 2018 15:48:21 -0700 +Subject: x86/speculation/l1tf: Increase 32bit PAE __PHYSICAL_PAGE_SHIFT + +commit 50896e180c6aa3a9c61a26ced99e15d602666a4c upstream. + +L1 Terminal Fault (L1TF) is a speculation related vulnerability. The CPU +speculates on PTE entries which do not have the PRESENT bit set, if the +content of the resulting physical address is available in the L1D cache. + +The OS side mitigation makes sure that a !PRESENT PTE entry points to a +physical address outside the actually existing and cachable memory +space. This is achieved by inverting the upper bits of the PTE. Due to the +address space limitations this only works for 64bit and 32bit PAE kernels, +but not for 32bit non PAE. + +This mitigation applies to both host and guest kernels, but in case of a +64bit host (hypervisor) and a 32bit PAE guest, inverting the upper bits of +the PAE address space (44bit) is not enough if the host has more than 43 +bits of populated memory address space, because the speculation treats the +PTE content as a physical host address bypassing EPT. + +The host (hypervisor) protects itself against the guest by flushing L1D as +needed, but pages inside the guest are not protected against attacks from +other processes inside the same guest. + +For the guest the inverted PTE mask has to match the host to provide the +full protection for all pages the host could possibly map into the +guest. The hosts populated address space is not known to the guest, so the +mask must cover the possible maximal host address space, i.e. 52 bit. + +On 32bit PAE the maximum PTE mask is currently set to 44 bit because that +is the limit imposed by 32bit unsigned long PFNs in the VMs. This limits +the mask to be below what the host could possible use for physical pages. + +The L1TF PROT_NONE protection code uses the PTE masks to determine which +bits to invert to make sure the higher bits are set for unmapped entries to +prevent L1TF speculation attacks against EPT inside guests. + +In order to invert all bits that could be used by the host, increase +__PHYSICAL_PAGE_SHIFT to 52 to match 64bit. + +The real limit for a 32bit PAE kernel is still 44 bits because all Linux +PTEs are created from unsigned long PFNs, so they cannot be higher than 44 +bits on a 32bit kernel. So these extra PFN bits should be never set. The +only users of this macro are using it to look at PTEs, so it's safe. 
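/*
 * Editorial illustration, not part of the patch, assuming the usual
 * derivation of the PTE PFN mask from __PHYSICAL_MASK_SHIFT: the bits
 * that PROT_NONE inversion can set are bounded by that shift. With the
 * old 44-bit value a 32bit PAE guest leaves host bits 44-51 clear;
 * raising the shift to 52 lets the inversion cover the full possible
 * host address space.
 */
#include <stdio.h>
#include <stdint.h>

#define PAGE_SHIFT 12

static uint64_t pte_pfn_mask(unsigned int physical_mask_shift)
{
	uint64_t physical_mask = (1ULL << physical_mask_shift) - 1;

	return physical_mask & ~((1ULL << PAGE_SHIFT) - 1);
}

int main(void)
{
	printf("invertible pfn bits, shift 44: 0x%016llx\n",
	       (unsigned long long)pte_pfn_mask(44));
	printf("invertible pfn bits, shift 52: 0x%016llx\n",
	       (unsigned long long)pte_pfn_mask(52));
	return 0;
}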
+ +[ tglx: Massaged changelog ] + +Signed-off-by: Andi Kleen <ak@linux.intel.com> +Signed-off-by: Thomas Gleixner <tglx@linutronix.de> +Reviewed-by: Josh Poimboeuf <jpoimboe@redhat.com> +Acked-by: Michal Hocko <mhocko@suse.com> +Acked-by: Dave Hansen <dave.hansen@intel.com> +Signed-off-by: Ben Hutchings <ben@decadent.org.uk> +--- + arch/x86/include/asm/page_32_types.h | 9 +++++++-- + 1 file changed, 7 insertions(+), 2 deletions(-) + +--- a/arch/x86/include/asm/page_32_types.h ++++ b/arch/x86/include/asm/page_32_types.h +@@ -27,8 +27,13 @@ + #define N_EXCEPTION_STACKS 1 + + #ifdef CONFIG_X86_PAE +-/* 44=32+12, the limit we can fit into an unsigned long pfn */ +-#define __PHYSICAL_MASK_SHIFT 44 ++/* ++ * This is beyond the 44 bit limit imposed by the 32bit long pfns, ++ * but we need the full mask to make sure inverted PROT_NONE ++ * entries have all the host bits set in a guest. ++ * The real limit is still 44 bits. ++ */ ++#define __PHYSICAL_MASK_SHIFT 52 + #define __VIRTUAL_MASK_SHIFT 32 + + #else /* !CONFIG_X86_PAE */ diff --git a/queue-3.16/x86-speculation-l1tf-invert-all-not-present-mappings.patch b/queue-3.16/x86-speculation-l1tf-invert-all-not-present-mappings.patch new file mode 100644 index 00000000..9a5eb67e --- /dev/null +++ b/queue-3.16/x86-speculation-l1tf-invert-all-not-present-mappings.patch @@ -0,0 +1,31 @@ +From: Andi Kleen <ak@linux.intel.com> +Date: Tue, 7 Aug 2018 15:09:36 -0700 +Subject: x86/speculation/l1tf: Invert all not present mappings + +commit f22cc87f6c1f771b57c407555cfefd811cdd9507 upstream. + +For kernel mappings PAGE_PROTNONE is not necessarily set for a non present +mapping, but the inversion logic explicitely checks for !PRESENT and +PROT_NONE. + +Remove the PROT_NONE check and make the inversion unconditional for all not +present mappings. + +Signed-off-by: Andi Kleen <ak@linux.intel.com> +Signed-off-by: Thomas Gleixner <tglx@linutronix.de> +Signed-off-by: Ben Hutchings <ben@decadent.org.uk> +--- + arch/x86/include/asm/pgtable-invert.h | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +--- a/arch/x86/include/asm/pgtable-invert.h ++++ b/arch/x86/include/asm/pgtable-invert.h +@@ -6,7 +6,7 @@ + + static inline bool __pte_needs_invert(u64 val) + { +- return (val & (_PAGE_PRESENT|_PAGE_PROTNONE)) == _PAGE_PROTNONE; ++ return !(val & _PAGE_PRESENT); + } + + /* Get a mask to xor with the page table entry to get the correct pfn. */ diff --git a/queue-3.16/x86-speculation-l1tf-limit-swap-file-size-to-max_pa-2.patch b/queue-3.16/x86-speculation-l1tf-limit-swap-file-size-to-max_pa-2.patch new file mode 100644 index 00000000..3f61b90a --- /dev/null +++ b/queue-3.16/x86-speculation-l1tf-limit-swap-file-size-to-max_pa-2.patch @@ -0,0 +1,128 @@ +From: Andi Kleen <ak@linux.intel.com> +Date: Wed, 13 Jun 2018 15:48:28 -0700 +Subject: x86/speculation/l1tf: Limit swap file size to MAX_PA/2 + +commit 377eeaa8e11fe815b1d07c81c4a0e2843a8c15eb upstream. + +For the L1TF workaround its necessary to limit the swap file size to below +MAX_PA/2, so that the higher bits of the swap offset inverted never point +to valid memory. + +Add a mechanism for the architecture to override the swap file size check +in swapfile.c and add a x86 specific max swapfile check function that +enforces that limit. + +The check is only enabled if the CPU is vulnerable to L1TF. + +In VMs with 42bit MAX_PA the typical limit is 2TB now, on a native system +with 46bit PA it is 32TB. 
The limit is only per individual swap file, so +it's always possible to exceed these limits with multiple swap files or +partitions. + +Signed-off-by: Andi Kleen <ak@linux.intel.com> +Signed-off-by: Thomas Gleixner <tglx@linutronix.de> +Reviewed-by: Josh Poimboeuf <jpoimboe@redhat.com> +Acked-by: Michal Hocko <mhocko@suse.com> +Acked-by: Dave Hansen <dave.hansen@intel.com> +[bwh: Backported to 3.16: adjust context] +Signed-off-by: Ben Hutchings <ben@decadent.org.uk> +--- +--- a/arch/x86/mm/init.c ++++ b/arch/x86/mm/init.c +@@ -4,6 +4,8 @@ + #include <linux/swap.h> + #include <linux/memblock.h> + #include <linux/bootmem.h> /* for max_low_pfn */ ++#include <linux/swapfile.h> ++#include <linux/swapops.h> + + #include <asm/cacheflush.h> + #include <asm/e820.h> +@@ -699,3 +701,15 @@ void __init zone_sizes_init(void) + free_area_init_nodes(max_zone_pfns); + } + ++unsigned long max_swapfile_size(void) ++{ ++ unsigned long pages; ++ ++ pages = generic_max_swapfile_size(); ++ ++ if (boot_cpu_has_bug(X86_BUG_L1TF)) { ++ /* Limit the swap file size to MAX_PA/2 for L1TF workaround */ ++ pages = min_t(unsigned long, l1tf_pfn_limit() + 1, pages); ++ } ++ return pages; ++} +--- a/include/linux/swapfile.h ++++ b/include/linux/swapfile.h +@@ -9,5 +9,7 @@ extern spinlock_t swap_lock; + extern struct plist_head swap_active_head; + extern struct swap_info_struct *swap_info[]; + extern int try_to_unuse(unsigned int, bool, unsigned long); ++extern unsigned long generic_max_swapfile_size(void); ++extern unsigned long max_swapfile_size(void); + + #endif /* _LINUX_SWAPFILE_H */ +--- a/mm/swapfile.c ++++ b/mm/swapfile.c +@@ -2166,6 +2166,35 @@ static int claim_swapfile(struct swap_in + return 0; + } + ++ ++/* ++ * Find out how many pages are allowed for a single swap device. There ++ * are two limiting factors: ++ * 1) the number of bits for the swap offset in the swp_entry_t type, and ++ * 2) the number of bits in the swap pte, as defined by the different ++ * architectures. ++ * ++ * In order to find the largest possible bit mask, a swap entry with ++ * swap type 0 and swap offset ~0UL is created, encoded to a swap pte, ++ * decoded to a swp_entry_t again, and finally the swap offset is ++ * extracted. ++ * ++ * This will mask all the bits from the initial ~0UL mask that can't ++ * be encoded in either the swp_entry_t or the architecture definition ++ * of a swap pte. ++ */ ++unsigned long generic_max_swapfile_size(void) ++{ ++ return swp_offset(pte_to_swp_entry( ++ swp_entry_to_pte(swp_entry(0, ~0UL)))) + 1; ++} ++ ++/* Can be overridden by an architecture for additional checks. */ ++__weak unsigned long max_swapfile_size(void) ++{ ++ return generic_max_swapfile_size(); ++} ++ + static unsigned long read_swap_header(struct swap_info_struct *p, + union swap_header *swap_header, + struct inode *inode) +@@ -2201,22 +2230,7 @@ static unsigned long read_swap_header(st + p->cluster_next = 1; + p->cluster_nr = 0; + +- /* +- * Find out how many pages are allowed for a single swap +- * device. There are two limiting factors: 1) the number +- * of bits for the swap offset in the swp_entry_t type, and +- * 2) the number of bits in the swap pte as defined by the +- * different architectures. In order to find the +- * largest possible bit mask, a swap entry with swap type 0 +- * and swap offset ~0UL is created, encoded to a swap pte, +- * decoded to a swp_entry_t again, and finally the swap +- * offset is extracted. 
This will mask all the bits from
+-	 * the initial ~0UL mask that can't be encoded in either
+-	 * the swp_entry_t or the architecture definition of a
+-	 * swap pte.
+-	 */
+-	maxpages = swp_offset(pte_to_swp_entry(
+-			swp_entry_to_pte(swp_entry(0, ~0UL)))) + 1;
++	maxpages = max_swapfile_size();
+ 	last_page = swap_header->info.last_page;
+ 	if (last_page > maxpages) {
+ 		pr_warn("Truncating oversized swap area, only using %luk out of %luk\n",
diff --git a/queue-3.16/x86-speculation-l1tf-make-pmd-pud_mknotpresent-invert.patch b/queue-3.16/x86-speculation-l1tf-make-pmd-pud_mknotpresent-invert.patch
new file mode 100644
index 00000000..744870c4
--- /dev/null
+++ b/queue-3.16/x86-speculation-l1tf-make-pmd-pud_mknotpresent-invert.patch
@@ -0,0 +1,55 @@
+From: Andi Kleen <ak@linux.intel.com>
+Date: Tue, 7 Aug 2018 15:09:37 -0700
+Subject: x86/speculation/l1tf: Make pmd/pud_mknotpresent() invert
+
+commit 0768f91530ff46683e0b372df14fd79fe8d156e5 upstream.
+
+Some cases in THP like:
+  - MADV_FREE
+  - mprotect
+  - split
+
+temporarily mark the PMD non-present to prevent races. The window for
+an L1TF attack in these contexts is very small, but it wants to be fixed
+for correctness' sake.
+
+Use the proper low level functions for pmd/pud_mknotpresent() to address
+this.
+
+Signed-off-by: Andi Kleen <ak@linux.intel.com>
+Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
+[bwh: Backported to 3.16:
+ - Drop change to pud_mknotpresent()
+ - pmd_mknotpresent() does not touch _PAGE_NONE]
+Signed-off-by: Ben Hutchings <ben@decadent.org.uk>
+---
+ arch/x86/include/asm/pgtable.h | 11 ++++++-----
+ 1 file changed, 6 insertions(+), 5 deletions(-)
+
+--- a/arch/x86/include/asm/pgtable.h
++++ b/arch/x86/include/asm/pgtable.h
+@@ -308,11 +308,6 @@ static inline pmd_t pmd_mkwrite(pmd_t pm
+ 	return pmd_set_flags(pmd, _PAGE_RW);
+ }
+ 
+-static inline pmd_t pmd_mknotpresent(pmd_t pmd)
+-{
+-	return pmd_clear_flags(pmd, _PAGE_PRESENT);
+-}
+-
+ static inline pud_t pud_set_flags(pud_t pud, pudval_t set)
+ {
+ 	pudval_t v = native_pud_val(pud);
+@@ -393,6 +388,12 @@ static inline pud_t pfn_pud(unsigned lon
+ 	return __pud(pfn | massage_pgprot(pgprot));
+ }
+ 
++static inline pmd_t pmd_mknotpresent(pmd_t pmd)
++{
++	return pfn_pmd(pmd_pfn(pmd),
++		       __pgprot(pmd_flags(pmd) & ~_PAGE_PRESENT));
++}
++
+ static inline u64 flip_protnone_guard(u64 oldval, u64 val, u64 mask);
+ 
+ static inline pte_t pte_modify(pte_t pte, pgprot_t newprot)
diff --git a/queue-3.16/x86-speculation-l1tf-make-sure-the-first-page-is-always-reserved.patch b/queue-3.16/x86-speculation-l1tf-make-sure-the-first-page-is-always-reserved.patch
new file mode 100644
index 00000000..d5f661e6
--- /dev/null
+++ b/queue-3.16/x86-speculation-l1tf-make-sure-the-first-page-is-always-reserved.patch
@@ -0,0 +1,39 @@
+From: Andi Kleen <ak@linux.intel.com>
+Date: Wed, 13 Jun 2018 15:48:25 -0700
+Subject: x86/speculation/l1tf: Make sure the first page is always reserved
+
+commit 10a70416e1f067f6c4efda6ffd8ea96002ac4223 upstream.
+
+The L1TF workaround doesn't make any attempt to mitigate speculative
+accesses to the first physical page for zeroed PTEs. Normally it only
+contains some data from the early real mode BIOS.
+
+It's not entirely clear that the first page is reserved in all
+configurations, so add an extra reservation call to make sure it is really
+reserved. In most configurations (e.g. with the standard reservations)
+it's likely a nop.
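/*
 * Editorial illustration, not part of the patch: at this point in the
 * series only PROT_NONE entries are inverted, so a cleared PTE still
 * decodes to PFN 0, i.e. the first physical page. Reserving that page
 * ensures a speculative L1TF load through a zeroed entry cannot hit
 * anything worth leaking. Assumes x86 _PAGE_PROTNONE == bit 8 (the
 * aliased global bit) and a 40-bit PFN field.
 */
#include <stdio.h>
#include <stdint.h>
#include <stdbool.h>

#define _PAGE_PRESENT	0x001ULL
#define _PAGE_PROTNONE	0x100ULL
#define PTE_PFN_MASK	0x000ffffffffff000ULL
#define PAGE_SHIFT	12

static bool pte_needs_invert(uint64_t val)
{
	return (val & (_PAGE_PRESENT | _PAGE_PROTNONE)) == _PAGE_PROTNONE;
}

static uint64_t pte_pfn(uint64_t val)
{
	if (pte_needs_invert(val))
		val = ~val;
	return (val & PTE_PFN_MASK) >> PAGE_SHIFT;
}

int main(void)
{
	/* a zeroed PTE targets physical page 0 */
	printf("cleared pte -> pfn %llu\n", (unsigned long long)pte_pfn(0));
	return 0;
}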
+ +Signed-off-by: Andi Kleen <ak@linux.intel.com> +Signed-off-by: Thomas Gleixner <tglx@linutronix.de> +Reviewed-by: Josh Poimboeuf <jpoimboe@redhat.com> +Acked-by: Dave Hansen <dave.hansen@intel.com> +Signed-off-by: Ben Hutchings <ben@decadent.org.uk> +--- + arch/x86/kernel/setup.c | 6 ++++++ + 1 file changed, 6 insertions(+) + +--- a/arch/x86/kernel/setup.c ++++ b/arch/x86/kernel/setup.c +@@ -860,6 +860,12 @@ void __init setup_arch(char **cmdline_p) + memblock_reserve(__pa_symbol(_text), + (unsigned long)__bss_stop - (unsigned long)_text); + ++ /* ++ * Make sure page 0 is always reserved because on systems with ++ * L1TF its contents can be leaked to user processes. ++ */ ++ memblock_reserve(0, PAGE_SIZE); ++ + early_reserve_initrd(); + + /* diff --git a/queue-3.16/x86-speculation-l1tf-protect-pae-swap-entries-against-l1tf.patch b/queue-3.16/x86-speculation-l1tf-protect-pae-swap-entries-against-l1tf.patch new file mode 100644 index 00000000..580794a8 --- /dev/null +++ b/queue-3.16/x86-speculation-l1tf-protect-pae-swap-entries-against-l1tf.patch @@ -0,0 +1,88 @@ +From: Vlastimil Babka <vbabka@suse.cz> +Date: Fri, 22 Jun 2018 17:39:33 +0200 +Subject: x86/speculation/l1tf: Protect PAE swap entries against L1TF + +commit 0d0f6249058834ffe1ceaad0bb31464af66f6e7a upstream. + +The PAE 3-level paging code currently doesn't mitigate L1TF by flipping the +offset bits, and uses the high PTE word, thus bits 32-36 for type, 37-63 for +offset. The lower word is zeroed, thus systems with less than 4GB memory are +safe. With 4GB to 128GB the swap type selects the memory locations vulnerable +to L1TF; with even more memory, also the swap offfset influences the address. +This might be a problem with 32bit PAE guests running on large 64bit hosts. + +By continuing to keep the whole swap entry in either high or low 32bit word of +PTE we would limit the swap size too much. Thus this patch uses the whole PAE +PTE with the same layout as the 64bit version does. The macros just become a +bit tricky since they assume the arch-dependent swp_entry_t to be 32bit. + +Signed-off-by: Vlastimil Babka <vbabka@suse.cz> +Signed-off-by: Thomas Gleixner <tglx@linutronix.de> +Acked-by: Michal Hocko <mhocko@suse.com> +[bwh: Backported to 3.16: CONFIG_PGTABLE_LEVELS is not defined; use other + config symbols in the condition.] +Signed-off-by: Ben Hutchings <ben@decadent.org.uk> +--- + arch/x86/include/asm/pgtable-3level.h | 35 +++++++++++++++++++++++++-- + arch/x86/mm/init.c | 2 +- + 2 files changed, 34 insertions(+), 3 deletions(-) + +--- a/arch/x86/include/asm/pgtable-3level.h ++++ b/arch/x86/include/asm/pgtable-3level.h +@@ -177,12 +177,43 @@ static inline pmd_t native_pmdp_get_and_ + #endif + + /* Encode and de-code a swap entry */ ++#define SWP_TYPE_BITS 5 ++ ++#define SWP_OFFSET_FIRST_BIT (_PAGE_BIT_PROTNONE + 1) ++ ++/* We always extract/encode the offset by shifting it all the way up, and then down again */ ++#define SWP_OFFSET_SHIFT (SWP_OFFSET_FIRST_BIT + SWP_TYPE_BITS) ++ + #define MAX_SWAPFILES_CHECK() BUILD_BUG_ON(MAX_SWAPFILES_SHIFT > 5) + #define __swp_type(x) (((x).val) & 0x1f) + #define __swp_offset(x) ((x).val >> 5) + #define __swp_entry(type, offset) ((swp_entry_t){(type) | (offset) << 5}) +-#define __pte_to_swp_entry(pte) ((swp_entry_t){ (pte).pte_high }) +-#define __swp_entry_to_pte(x) ((pte_t){ { .pte_high = (x).val } }) ++ ++/* ++ * Normally, __swp_entry() converts from arch-independent swp_entry_t to ++ * arch-dependent swp_entry_t, and __swp_entry_to_pte() just stores the result ++ * to pte. 
But here we have 32bit swp_entry_t and 64bit pte, and need to use the
++ * whole 64 bits. Thus, we shift the "real" arch-dependent conversion to
++ * __swp_entry_to_pte() through the following helper macro based on 64bit
++ * __swp_entry().
++ */
++#define __swp_pteval_entry(type, offset) ((pteval_t) { \
++	(~(pteval_t)(offset) << SWP_OFFSET_SHIFT >> SWP_TYPE_BITS) \
++	| ((pteval_t)(type) << (64 - SWP_TYPE_BITS)) })
++
++#define __swp_entry_to_pte(x)	((pte_t){ .pte = \
++		__swp_pteval_entry(__swp_type(x), __swp_offset(x)) })
++/*
++ * Analogically, __pte_to_swp_entry() doesn't just extract the arch-dependent
++ * swp_entry_t, but also has to convert it from 64bit to the 32bit
++ * intermediate representation, using the following macros based on 64bit
++ * __swp_type() and __swp_offset().
++ */
++#define __pteval_swp_type(x) ((unsigned long)((x).pte >> (64 - SWP_TYPE_BITS)))
++#define __pteval_swp_offset(x) ((unsigned long)(~((x).pte) << SWP_TYPE_BITS >> SWP_OFFSET_SHIFT))
++
++#define __pte_to_swp_entry(pte)	(__swp_entry(__pteval_swp_type(pte), \
++					     __pteval_swp_offset(pte)))
+ 
+ #include <asm/pgtable-invert.h>
+ 
+--- a/arch/x86/mm/init.c
++++ b/arch/x86/mm/init.c
+@@ -715,7 +715,7 @@ unsigned long max_swapfile_size(void)
+ 		 * We encode swap offsets also with 3 bits below those for pfn
+ 		 * which makes the usable limit higher.
+ 		 */
+-#ifdef CONFIG_X86_64
++#if defined(CONFIG_X86_64) || defined(CONFIG_X86_PAE)
+ 		l1tf_limit <<= PAGE_SHIFT - SWP_OFFSET_FIRST_BIT;
+ #endif
+ 		pages = min_t(unsigned long, l1tf_limit, pages);
diff --git a/queue-3.16/x86-speculation-l1tf-protect-prot_none-ptes-against-speculation.patch b/queue-3.16/x86-speculation-l1tf-protect-prot_none-ptes-against-speculation.patch
new file mode 100644
index 00000000..23a73c64
--- /dev/null
+++ b/queue-3.16/x86-speculation-l1tf-protect-prot_none-ptes-against-speculation.patch
@@ -0,0 +1,254 @@
+From: Andi Kleen <ak@linux.intel.com>
+Date: Wed, 13 Jun 2018 15:48:24 -0700
+Subject: x86/speculation/l1tf: Protect PROT_NONE PTEs against speculation
+
+commit 6b28baca9b1f0d4a42b865da7a05b1c81424bd5c upstream.
+
+When PTEs are set to PROT_NONE the kernel just clears the Present bit and
+preserves the PFN, which creates attack surface for L1TF speculation
+attacks.
+
+This is important inside guests, because L1TF speculation bypasses physical
+page remapping. While the host has its own mitigations preventing leaking
+data from other VMs into the guest, this would still risk leaking the wrong
+page inside the current guest.
+
+This uses the same technique as Linus' swap entry patch: while an entry is
+in PROTNONE state, invert the complete PFN part of it. This ensures that
+the highest bit will point to non-existing memory.
+
+The invert is done by pte/pmd_modify and pfn/pmd/pud_pte for PROTNONE and
+pte/pmd/pud_pfn undo it.
+
+This assumes that no code path touches the PFN part of a PTE directly
+without using these primitives.
+
+This doesn't handle the case that MMIO is at the top of the CPU physical
+memory. If such an MMIO region was exposed by an unprivileged driver for
+mmap it would be possible to attack some real memory. However this
+situation is all rather unlikely.
+
+For 32bit non-PAE the inversion is not done because there are really not
+enough bits to protect anything.
+
+Q: Why does the guest need to be protected when the hypervisor already has
+   L1TF mitigations?
+
+A: Here's an example:
+
+   Physical pages 1 and 2 get mapped into a guest as
+     GPA 1 -> PA 2
+     GPA 2 -> PA 1
+   through EPT. 
+ + The L1TF speculation ignores the EPT remapping. + + Now the guest kernel maps GPA 1 to process A and GPA 2 to process B, and + they belong to different users and should be isolated. + + A sets the GPA 1 PA 2 PTE to PROT_NONE to bypass the EPT remapping and + gets read access to the underlying physical page. Which in this case + points to PA 2, so it can read process B's data, if it happened to be in + L1, so isolation inside the guest is broken. + + There's nothing the hypervisor can do about this. This mitigation has to + be done in the guest itself. + +[ tglx: Massaged changelog ] + +Signed-off-by: Andi Kleen <ak@linux.intel.com> +Signed-off-by: Thomas Gleixner <tglx@linutronix.de> +Reviewed-by: Josh Poimboeuf <jpoimboe@redhat.com> +Acked-by: Michal Hocko <mhocko@suse.com> +Acked-by: Vlastimil Babka <vbabka@suse.cz> +Acked-by: Dave Hansen <dave.hansen@intel.com> +[bwh: Backported to 3.16: + - s/check_pgprot/massage_pgprot/ + - Keep using PTE_PFN_MASK to extract PFN from pmd_pfn() and pud_pfn(), + as we don't need to worry about the PAT bit being set here] +Signed-off-by: Ben Hutchings <ben@decadent.org.uk> +--- + arch/x86/include/asm/pgtable-2level.h | 17 +++++++++++ + arch/x86/include/asm/pgtable-3level.h | 2 ++ + arch/x86/include/asm/pgtable-invert.h | 32 +++++++++++++++++++ + arch/x86/include/asm/pgtable.h | 44 +++++++++++++++++++-------- + arch/x86/include/asm/pgtable_64.h | 2 ++ + 5 files changed, 84 insertions(+), 13 deletions(-) + create mode 100644 arch/x86/include/asm/pgtable-invert.h + +--- a/arch/x86/include/asm/pgtable-2level.h ++++ b/arch/x86/include/asm/pgtable-2level.h +@@ -77,4 +77,21 @@ static inline unsigned long pte_bitop(un + #define __pte_to_swp_entry(pte) ((swp_entry_t) { (pte).pte_low }) + #define __swp_entry_to_pte(x) ((pte_t) { .pte = (x).val }) + ++/* No inverted PFNs on 2 level page tables */ ++ ++static inline u64 protnone_mask(u64 val) ++{ ++ return 0; ++} ++ ++static inline u64 flip_protnone_guard(u64 oldval, u64 val, u64 mask) ++{ ++ return val; ++} ++ ++static inline bool __pte_needs_invert(u64 val) ++{ ++ return false; ++} ++ + #endif /* _ASM_X86_PGTABLE_2LEVEL_H */ +--- a/arch/x86/include/asm/pgtable-3level.h ++++ b/arch/x86/include/asm/pgtable-3level.h +@@ -184,4 +184,6 @@ static inline pmd_t native_pmdp_get_and_ + #define __pte_to_swp_entry(pte) ((swp_entry_t){ (pte).pte_high }) + #define __swp_entry_to_pte(x) ((pte_t){ { .pte_high = (x).val } }) + ++#include <asm/pgtable-invert.h> ++ + #endif /* _ASM_X86_PGTABLE_3LEVEL_H */ +--- /dev/null ++++ b/arch/x86/include/asm/pgtable-invert.h +@@ -0,0 +1,32 @@ ++/* SPDX-License-Identifier: GPL-2.0 */ ++#ifndef _ASM_PGTABLE_INVERT_H ++#define _ASM_PGTABLE_INVERT_H 1 ++ ++#ifndef __ASSEMBLY__ ++ ++static inline bool __pte_needs_invert(u64 val) ++{ ++ return (val & (_PAGE_PRESENT|_PAGE_PROTNONE)) == _PAGE_PROTNONE; ++} ++ ++/* Get a mask to xor with the page table entry to get the correct pfn. */ ++static inline u64 protnone_mask(u64 val) ++{ ++ return __pte_needs_invert(val) ? ~0ull : 0; ++} ++ ++static inline u64 flip_protnone_guard(u64 oldval, u64 val, u64 mask) ++{ ++ /* ++ * When a PTE transitions from NONE to !NONE or vice-versa ++ * invert the PFN part to stop speculation. ++ * pte_pfn undoes this when needed. 
++ */ ++ if (__pte_needs_invert(oldval) != __pte_needs_invert(val)) ++ val = (val & ~mask) | (~val & mask); ++ return val; ++} ++ ++#endif /* __ASSEMBLY__ */ ++ ++#endif +--- a/arch/x86/include/asm/pgtable.h ++++ b/arch/x86/include/asm/pgtable.h +@@ -141,19 +141,29 @@ static inline int pte_special(pte_t pte) + (pte_flags(pte) & (_PAGE_PRESENT|_PAGE_PROTNONE)); + } + ++/* Entries that were set to PROT_NONE are inverted */ ++ ++static inline u64 protnone_mask(u64 val); ++ + static inline unsigned long pte_pfn(pte_t pte) + { +- return (pte_val(pte) & PTE_PFN_MASK) >> PAGE_SHIFT; ++ unsigned long pfn = pte_val(pte); ++ pfn ^= protnone_mask(pfn); ++ return (pfn & PTE_PFN_MASK) >> PAGE_SHIFT; + } + + static inline unsigned long pmd_pfn(pmd_t pmd) + { +- return (pmd_val(pmd) & PTE_PFN_MASK) >> PAGE_SHIFT; ++ unsigned long pfn = pmd_val(pmd); ++ pfn ^= protnone_mask(pfn); ++ return (pfn & PTE_PFN_MASK) >> PAGE_SHIFT; + } + + static inline unsigned long pud_pfn(pud_t pud) + { +- return (pud_val(pud) & PTE_PFN_MASK) >> PAGE_SHIFT; ++ unsigned long pfn = pud_val(pud); ++ pfn ^= protnone_mask(pfn); ++ return (pfn & PTE_PFN_MASK) >> PAGE_SHIFT; + } + + #define pte_page(pte) pfn_to_page(pte_pfn(pte)) +@@ -361,25 +371,33 @@ static inline pgprotval_t massage_pgprot + + static inline pte_t pfn_pte(unsigned long page_nr, pgprot_t pgprot) + { +- return __pte(((phys_addr_t)page_nr << PAGE_SHIFT) | +- massage_pgprot(pgprot)); ++ phys_addr_t pfn = page_nr << PAGE_SHIFT; ++ pfn ^= protnone_mask(pgprot_val(pgprot)); ++ pfn &= PTE_PFN_MASK; ++ return __pte(pfn | massage_pgprot(pgprot)); + } + + static inline pmd_t pfn_pmd(unsigned long page_nr, pgprot_t pgprot) + { +- return __pmd(((phys_addr_t)page_nr << PAGE_SHIFT) | +- massage_pgprot(pgprot)); ++ phys_addr_t pfn = page_nr << PAGE_SHIFT; ++ pfn ^= protnone_mask(pgprot_val(pgprot)); ++ pfn &= PTE_PFN_MASK; ++ return __pmd(pfn | massage_pgprot(pgprot)); + } + + static inline pud_t pfn_pud(unsigned long page_nr, pgprot_t pgprot) + { +- return __pud(((phys_addr_t)page_nr << PAGE_SHIFT) | +- massage_pgprot(pgprot)); ++ phys_addr_t pfn = page_nr << PAGE_SHIFT; ++ pfn ^= protnone_mask(pgprot_val(pgprot)); ++ pfn &= PTE_PFN_MASK; ++ return __pud(pfn | massage_pgprot(pgprot)); + } + ++static inline u64 flip_protnone_guard(u64 oldval, u64 val, u64 mask); ++ + static inline pte_t pte_modify(pte_t pte, pgprot_t newprot) + { +- pteval_t val = pte_val(pte); ++ pteval_t val = pte_val(pte), oldval = val; + + /* + * Chop off the NX bit (if present), and add the NX portion of +@@ -387,17 +405,17 @@ static inline pte_t pte_modify(pte_t pte + */ + val &= _PAGE_CHG_MASK; + val |= massage_pgprot(newprot) & ~_PAGE_CHG_MASK; +- ++ val = flip_protnone_guard(oldval, val, PTE_PFN_MASK); + return __pte(val); + } + + static inline pmd_t pmd_modify(pmd_t pmd, pgprot_t newprot) + { +- pmdval_t val = pmd_val(pmd); ++ pmdval_t val = pmd_val(pmd), oldval = val; + + val &= _HPAGE_CHG_MASK; + val |= massage_pgprot(newprot) & ~_HPAGE_CHG_MASK; +- ++ val = flip_protnone_guard(oldval, val, PTE_PFN_MASK); + return __pmd(val); + } + +--- a/arch/x86/include/asm/pgtable_64.h ++++ b/arch/x86/include/asm/pgtable_64.h +@@ -239,6 +239,8 @@ extern void cleanup_highmap(void); + extern void init_extra_mapping_uc(unsigned long phys, unsigned long size); + extern void init_extra_mapping_wb(unsigned long phys, unsigned long size); + ++#include <asm/pgtable-invert.h> ++ + #endif /* !__ASSEMBLY__ */ + + #endif /* _ASM_X86_PGTABLE_64_H */ diff --git 
a/queue-3.16/x86-speculation-l1tf-protect-swap-entries-against-l1tf.patch b/queue-3.16/x86-speculation-l1tf-protect-swap-entries-against-l1tf.patch new file mode 100644 index 00000000..68ea59b8 --- /dev/null +++ b/queue-3.16/x86-speculation-l1tf-protect-swap-entries-against-l1tf.patch @@ -0,0 +1,81 @@
+From: Linus Torvalds <torvalds@linux-foundation.org>
+Date: Wed, 13 Jun 2018 15:48:23 -0700
+Subject: x86/speculation/l1tf: Protect swap entries against L1TF
+
+commit 2f22b4cd45b67b3496f4aa4c7180a1271c6452f6 upstream.
+
+With L1 terminal fault the CPU speculates into unmapped PTEs, and the
+resulting side effects allow reading the memory the PTE is pointing to,
+if its values are still in the L1 cache.
+
+For swapped-out pages Linux uses unmapped PTEs and stores a swap entry into
+them.
+
+To protect against L1TF it must be ensured that the swap entry is not
+pointing to valid memory, which requires setting higher bits (between bit
+36 and bit 45) that are inside the CPU's physical address space, but outside
+any real memory.
+
+To do this invert the offset to make sure the higher bits are always set,
+as long as the swap file is not too big.
+
+Note there is no workaround for 32bit !PAE, or on systems which have more
+than MAX_PA/2 worth of memory. The latter case is very unlikely to happen on
+real systems.
+
+[AK: updated description and minor tweaks. Split out from the original
+ patch]
+
+Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
+Signed-off-by: Andi Kleen <ak@linux.intel.com>
+Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
+Tested-by: Andi Kleen <ak@linux.intel.com>
+Reviewed-by: Josh Poimboeuf <jpoimboe@redhat.com>
+Acked-by: Michal Hocko <mhocko@suse.com>
+Acked-by: Vlastimil Babka <vbabka@suse.cz>
+Acked-by: Dave Hansen <dave.hansen@intel.com>
+[bwh: Backported to 3.16: Bit 9 may be reserved for PAGE_BIT_NUMA here]
+Signed-off-by: Ben Hutchings <ben@decadent.org.uk>
+---
+ arch/x86/include/asm/pgtable_64.h | 11 ++++++++---
+ 1 file changed, 8 insertions(+), 3 deletions(-)
+
+--- a/arch/x86/include/asm/pgtable_64.h
++++ b/arch/x86/include/asm/pgtable_64.h
+@@ -167,7 +167,7 @@ static inline int pgd_large(pgd_t pgd) {
+ *
+ * | ... | 11| 10| 9|8|7|6|5| 4| 3|2| 1|0| <- bit number
+ * | ... |SW3|SW2|SW1|G|L|D|A|CD|WT|U| W|P| <- bit names
+- * | TYPE (59-63) | OFFSET (10-58) | 0 |0|0|X|X| X| X|X|SD|0| <- swp entry
++ * | TYPE (59-63) | ~OFFSET (10-58) | 0 |0|0|X|X| X| X|X|SD|0| <- swp entry
+ *
+ * G (8) is aliased and used as a PROT_NONE indicator for
+ * !present ptes. We need to start storing swap entries above
+@@ -180,6 +180,9 @@ static inline int pgd_large(pgd_t pgd) {
+ *
+ * Bit 7 in swp entry should be 0 because pmd_present checks not only P,
+ * but also L and G.
++ *
++ * The offset is inverted by a binary not operation to make the high
++ * physical bits set.
+ */
+ #define SWP_TYPE_BITS 5
+
+@@ -199,13 +202,15 @@ static inline int pgd_large(pgd_t pgd) {
+ #define __swp_type(x) ((x).val >> (64 - SWP_TYPE_BITS))
+
+ /* Shift up (to get rid of type), then down to get value */
+-#define __swp_offset(x) ((x).val << SWP_TYPE_BITS >> SWP_OFFSET_SHIFT)
++#define __swp_offset(x) (~(x).val << SWP_TYPE_BITS >> SWP_OFFSET_SHIFT)
+
+ /*
+ * Shift the offset up "too far" by TYPE bits, then down again
++ * The offset is inverted by a binary not operation to make the high
++ * physical bits set.
+ */
+ #define __swp_entry(type, offset) ((swp_entry_t) { \
+- ((unsigned long)(offset) << SWP_OFFSET_SHIFT >> SWP_TYPE_BITS) \
++ (~(unsigned long)(offset) << SWP_OFFSET_SHIFT >> SWP_TYPE_BITS) \
+ | ((unsigned long)(type) << (64-SWP_TYPE_BITS)) })
+
+ #define __pte_to_swp_entry(pte) ((swp_entry_t) { pte_val((pte)) }) diff --git a/queue-3.16/x86-speculation-l1tf-suggest-what-to-do-on-systems-with-too-much-ram.patch b/queue-3.16/x86-speculation-l1tf-suggest-what-to-do-on-systems-with-too-much-ram.patch new file mode 100644 index 00000000..34c4327d --- /dev/null +++ b/queue-3.16/x86-speculation-l1tf-suggest-what-to-do-on-systems-with-too-much-ram.patch @@ -0,0 +1,41 @@
+From: Vlastimil Babka <vbabka@suse.cz>
+Date: Thu, 23 Aug 2018 16:21:29 +0200
+Subject: x86/speculation/l1tf: Suggest what to do on systems with too much RAM
+
+commit 6a012288d6906fee1dbc244050ade1dafe4a9c8d upstream.
+
+Two users have reported [1] that they have an "extremely unlikely" system
+with more than MAX_PA/2 memory, on which the L1TF mitigation is not
+effective.
+
+Make the warning more helpful by suggesting the proper mem=X kernel boot
+parameter that makes the mitigation effective, and by linking to the L1TF
+document to help decide whether the mitigation is worth the unusable RAM.
+
+[1] https://bugzilla.suse.com/show_bug.cgi?id=1105536
+
+Suggested-by: Michal Hocko <mhocko@suse.com>
+Signed-off-by: Vlastimil Babka <vbabka@suse.cz>
+Acked-by: Michal Hocko <mhocko@suse.com>
+Cc: "H . Peter Anvin" <hpa@zytor.com>
+Cc: Linus Torvalds <torvalds@linux-foundation.org>
+Cc: Andi Kleen <ak@linux.intel.com>
+Cc: Dave Hansen <dave.hansen@intel.com>
+Link: https://lkml.kernel.org/r/966571f0-9d7f-43dc-92c6-a10eec7a1254@suse.cz
+Signed-off-by: Ben Hutchings <ben@decadent.org.uk>
+---
+ arch/x86/kernel/cpu/bugs.c | 4 ++++
+ 1 file changed, 4 insertions(+)
+
+--- a/arch/x86/kernel/cpu/bugs.c
++++ b/arch/x86/kernel/cpu/bugs.c
+@@ -717,6 +717,10 @@ static void __init l1tf_select_mitigatio
+ half_pa = (u64)l1tf_pfn_limit() << PAGE_SHIFT;
+ if (e820_any_mapped(half_pa, ULLONG_MAX - half_pa, E820_RAM)) {
+ pr_warn("System has more than MAX_PA/2 memory. L1TF mitigation not effective.\n");
++ pr_info("You may make it effective by booting the kernel with mem=%llu parameter.\n",
++ half_pa);
++ pr_info("However, doing so will make a part of your RAM unusable.\n");
++ pr_info("Reading https://www.kernel.org/doc/html/latest/admin-guide/l1tf.html might help you decide.\n");
+ return;
+ }
+
diff --git a/queue-3.16/x86-speculation-l1tf-unbreak-__have_arch_pfn_modify_allowed.patch b/queue-3.16/x86-speculation-l1tf-unbreak-__have_arch_pfn_modify_allowed.patch new file mode 100644 index 00000000..f8995787 --- /dev/null +++ b/queue-3.16/x86-speculation-l1tf-unbreak-__have_arch_pfn_modify_allowed.patch @@ -0,0 +1,63 @@
+From: Jiri Kosina <jkosina@suse.cz>
+Date: Sat, 14 Jul 2018 21:56:13 +0200
+Subject: x86/speculation/l1tf: Unbreak !__HAVE_ARCH_PFN_MODIFY_ALLOWED
+ architectures
+
+commit 8f2adf3d2118cc0822b83a7bb43475f9149a1d26 upstream.
+
+commit 6c26fcd2abfe0a56bbd95271fce02df2896cfd24 upstream.
+
+pfn_modify_allowed() and arch_has_pfn_modify_check() are outside of the
+!__ASSEMBLY__ section in include/asm-generic/pgtable.h, which confuses
+the assembler on archs that don't have __HAVE_ARCH_PFN_MODIFY_ALLOWED (e.g.
+ia64) and breaks build: + + include/asm-generic/pgtable.h: Assembler messages: + include/asm-generic/pgtable.h:538: Error: Unknown opcode `static inline bool pfn_modify_allowed(unsigned long pfn,pgprot_t prot)' + include/asm-generic/pgtable.h:540: Error: Unknown opcode `return true' + include/asm-generic/pgtable.h:543: Error: Unknown opcode `static inline bool arch_has_pfn_modify_check(void)' + include/asm-generic/pgtable.h:545: Error: Unknown opcode `return false' + arch/ia64/kernel/entry.S:69: Error: `mov' does not fit into bundle + +Move those two static inlines into the !__ASSEMBLY__ section so that they +don't confuse the asm build pass. + +Fixes: 42e4089c7890 ("x86/speculation/l1tf: Disallow non privileged high MMIO PROT_NONE mappings") +Signed-off-by: Jiri Kosina <jkosina@suse.cz> +Signed-off-by: Thomas Gleixner <tglx@linutronix.de> +Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org> +[groeck: Context changes] +Signed-off-by: Guenter Roeck <linux@roeck-us.net> +Signed-off-by: Ben Hutchings <ben@decadent.org.uk> +--- + include/asm-generic/pgtable.h | 12 ++++++------ + 1 file changed, 6 insertions(+), 6 deletions(-) + +--- a/include/asm-generic/pgtable.h ++++ b/include/asm-generic/pgtable.h +@@ -806,12 +806,6 @@ static inline void pmdp_set_numa(struct + + #endif /* CONFIG_MMU */ + +-#endif /* !__ASSEMBLY__ */ +- +-#ifndef io_remap_pfn_range +-#define io_remap_pfn_range remap_pfn_range +-#endif +- + #ifndef __HAVE_ARCH_PFN_MODIFY_ALLOWED + static inline bool pfn_modify_allowed(unsigned long pfn, pgprot_t prot) + { +@@ -822,6 +816,12 @@ static inline bool arch_has_pfn_modify_c + { + return false; + } ++#endif /* !_HAVE_ARCH_PFN_MODIFY_ALLOWED */ ++ ++#endif /* !__ASSEMBLY__ */ ++ ++#ifndef io_remap_pfn_range ++#define io_remap_pfn_range remap_pfn_range + #endif + + #endif /* _ASM_GENERIC_PGTABLE_H */ |
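
The PFN inversion that this series applies to PROT_NONE entries (and, via a plain binary not, to swap offsets) can be tried out in isolation. The sketch below is a minimal user-space model, not kernel code: PAGE_SHIFT, _PAGE_PROTNONE and PTE_PFN_MASK are simplified stand-ins for the real 3.16 definitions, pte_needs_invert() mirrors __pte_needs_invert() from the patches above, and main() plays the roles of pte_modify() and pte_pfn(). It only illustrates the invariant the mitigation relies on: while an entry is in PROT_NONE state its raw PFN bits point away from the original page, and the inversion is undone transparently when the PFN is read back.

/* Build with: cc -o invert invert.c && ./invert */
#include <assert.h>
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

#define PAGE_SHIFT     12                      /* assumed 4k pages */
#define _PAGE_PRESENT  0x001ULL
#define _PAGE_PROTNONE 0x100ULL                /* global bit, reused when not present */
#define PTE_PFN_MASK   0x000ffffffffff000ULL   /* assumed 52-bit PA, 4k pages */

/* A PROT_NONE entry is not present but still has PROTNONE set. */
static bool pte_needs_invert(uint64_t val)
{
        return (val & (_PAGE_PRESENT | _PAGE_PROTNONE)) == _PAGE_PROTNONE;
}

/* Mask to XOR with the entry so a PROT_NONE PFN points outside real memory. */
static uint64_t protnone_mask(uint64_t val)
{
        return pte_needs_invert(val) ? ~0ULL : 0;
}

/* Invert the PFN part whenever an entry transitions NONE <-> !NONE. */
static uint64_t flip_protnone_guard(uint64_t oldval, uint64_t val, uint64_t mask)
{
        if (pte_needs_invert(oldval) != pte_needs_invert(val))
                val = (val & ~mask) | (~val & mask);
        return val;
}

/* Read the PFN back, undoing the inversion for PROT_NONE entries. */
static uint64_t pte_pfn(uint64_t pte)
{
        uint64_t v = pte ^ protnone_mask(pte);
        return (v & PTE_PFN_MASK) >> PAGE_SHIFT;
}

int main(void)
{
        uint64_t pfn = 0x12345;
        uint64_t pte = (pfn << PAGE_SHIFT) | _PAGE_PRESENT;

        /* mprotect(PROT_NONE): present -> protnone, so the PFN is inverted. */
        uint64_t none = (pte & ~_PAGE_PRESENT) | _PAGE_PROTNONE;
        none = flip_protnone_guard(pte, none, PTE_PFN_MASK);

        /* The raw PFN bits no longer name the original page... */
        assert(((none & PTE_PFN_MASK) >> PAGE_SHIFT) != pfn);
        /* ...but pte_pfn() recovers it for legitimate users. */
        assert(pte_pfn(none) == pfn);

        printf("pfn 0x%llx survives the PROT_NONE round trip\n",
               (unsigned long long)pte_pfn(none));
        return 0;
}

The swap-entry patch relies on the same trick: as long as the machine has less than MAX_PA/2 memory, an inverted swap offset always has high physical-address bits set, so a load speculated through the unmapped PTE cannot hit an L1 line that belongs to real memory.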