diff options
author | Paul Gortmaker <paul.gortmaker@windriver.com> | 2018-07-31 09:56:16 -0400 |
---|---|---|
committer | Paul Gortmaker <paul.gortmaker@windriver.com> | 2018-07-31 09:56:16 -0400 |
commit | c74f87441bf22916cc728f6302a5443b3a92b02b (patch) | |
tree | cc9401c605df3125016ecff3c029b9f1479093f0 | |
parent | 20cfbaef16ddf21a30fccc84544f2483d5cd70b4 (diff) | |
download | longterm-queue-4.12-c74f87441bf22916cc728f6302a5443b3a92b02b.tar.gz |
raw import of commits seen deployed in 4.14.x
Signed-off-by: Paul Gortmaker <paul.gortmaker@windriver.com>
175 files changed, 18284 insertions, 0 deletions
diff --git a/queue/ACPI-APEI-Replace-ioremap_page_range-with-fixmap.patch b/queue/ACPI-APEI-Replace-ioremap_page_range-with-fixmap.patch new file mode 100644 index 0000000..347ba4c --- /dev/null +++ b/queue/ACPI-APEI-Replace-ioremap_page_range-with-fixmap.patch @@ -0,0 +1,177 @@ +From 4f89fa286f6729312e227e7c2d764e8e7b9d340e Mon Sep 17 00:00:00 2001 +From: James Morse <james.morse@arm.com> +Date: Mon, 6 Nov 2017 18:44:24 +0000 +Subject: [PATCH] ACPI / APEI: Replace ioremap_page_range() with fixmap + +commit 4f89fa286f6729312e227e7c2d764e8e7b9d340e upstream. + +Replace ghes_io{re,un}map_pfn_{nmi,irq}()'s use of ioremap_page_range() +with __set_fixmap() as ioremap_page_range() may sleep to allocate a new +level of page-table, even if it's passed an existing final-address to +use in the mapping. + +The GHES driver can only be enabled for architectures that select +HAVE_ACPI_APEI: Add fixmap entries to both x86 and arm64. + +clear_fixmap() does the TLB invalidation in __set_fixmap() for arm64 +and __set_pte_vaddr() for x86. In each case it's the same as the +respective arch_apei_flush_tlb_one(). + +Reported-by: Fengguang Wu <fengguang.wu@intel.com> +Suggested-by: Linus Torvalds <torvalds@linux-foundation.org> +Signed-off-by: James Morse <james.morse@arm.com> +Reviewed-by: Borislav Petkov <bp@suse.de> +Tested-by: Tyler Baicar <tbaicar@codeaurora.org> +Tested-by: Toshi Kani <toshi.kani@hpe.com> +[ For the arm64 bits: ] +Acked-by: Will Deacon <will.deacon@arm.com> +[ For the x86 bits: ] +Acked-by: Ingo Molnar <mingo@kernel.org> +Signed-off-by: Rafael J. 
Wysocki <rafael.j.wysocki@intel.com> +Cc: All applicable <stable@vger.kernel.org> + +diff --git a/arch/arm64/include/asm/fixmap.h b/arch/arm64/include/asm/fixmap.h +index caf86be815ba..4052ec39e8db 100644 +--- a/arch/arm64/include/asm/fixmap.h ++++ b/arch/arm64/include/asm/fixmap.h +@@ -51,6 +51,13 @@ enum fixed_addresses { + + FIX_EARLYCON_MEM_BASE, + FIX_TEXT_POKE0, ++ ++#ifdef CONFIG_ACPI_APEI_GHES ++ /* Used for GHES mapping from assorted contexts */ ++ FIX_APEI_GHES_IRQ, ++ FIX_APEI_GHES_NMI, ++#endif /* CONFIG_ACPI_APEI_GHES */ ++ + __end_of_permanent_fixed_addresses, + + /* +diff --git a/arch/x86/include/asm/fixmap.h b/arch/x86/include/asm/fixmap.h +index dcd9fb55e679..b0c505fe9a95 100644 +--- a/arch/x86/include/asm/fixmap.h ++++ b/arch/x86/include/asm/fixmap.h +@@ -104,6 +104,12 @@ enum fixed_addresses { + FIX_GDT_REMAP_BEGIN, + FIX_GDT_REMAP_END = FIX_GDT_REMAP_BEGIN + NR_CPUS - 1, + ++#ifdef CONFIG_ACPI_APEI_GHES ++ /* Used for GHES mapping from assorted contexts */ ++ FIX_APEI_GHES_IRQ, ++ FIX_APEI_GHES_NMI, ++#endif ++ + __end_of_permanent_fixed_addresses, + + /* +diff --git a/drivers/acpi/apei/ghes.c b/drivers/acpi/apei/ghes.c +index cb7aceae3553..572b6c7303ed 100644 +--- a/drivers/acpi/apei/ghes.c ++++ b/drivers/acpi/apei/ghes.c +@@ -51,6 +51,7 @@ + #include <acpi/actbl1.h> + #include <acpi/ghes.h> + #include <acpi/apei.h> ++#include <asm/fixmap.h> + #include <asm/tlbflush.h> + #include <ras/ras_event.h> + +@@ -112,7 +113,7 @@ static DEFINE_MUTEX(ghes_list_mutex); + * Because the memory area used to transfer hardware error information + * from BIOS to Linux can be determined only in NMI, IRQ or timer + * handler, but general ioremap can not be used in atomic context, so +- * a special version of atomic ioremap is implemented for that. ++ * the fixmap is used instead. 
+ */ + + /* +@@ -126,8 +127,8 @@ static DEFINE_MUTEX(ghes_list_mutex); + /* virtual memory area for atomic ioremap */ + static struct vm_struct *ghes_ioremap_area; + /* +- * These 2 spinlock is used to prevent atomic ioremap virtual memory +- * area from being mapped simultaneously. ++ * These 2 spinlocks are used to prevent the fixmap entries from being used ++ * simultaneously. + */ + static DEFINE_RAW_SPINLOCK(ghes_ioremap_lock_nmi); + static DEFINE_SPINLOCK(ghes_ioremap_lock_irq); +@@ -159,53 +160,36 @@ static void ghes_ioremap_exit(void) + + static void __iomem *ghes_ioremap_pfn_nmi(u64 pfn) + { +- unsigned long vaddr; + phys_addr_t paddr; + pgprot_t prot; + +- vaddr = (unsigned long)GHES_IOREMAP_NMI_PAGE(ghes_ioremap_area->addr); +- + paddr = pfn << PAGE_SHIFT; + prot = arch_apei_get_mem_attribute(paddr); +- ioremap_page_range(vaddr, vaddr + PAGE_SIZE, paddr, prot); ++ __set_fixmap(FIX_APEI_GHES_NMI, paddr, prot); + +- return (void __iomem *)vaddr; ++ return (void __iomem *) fix_to_virt(FIX_APEI_GHES_NMI); + } + + static void __iomem *ghes_ioremap_pfn_irq(u64 pfn) + { +- unsigned long vaddr; + phys_addr_t paddr; + pgprot_t prot; + +- vaddr = (unsigned long)GHES_IOREMAP_IRQ_PAGE(ghes_ioremap_area->addr); +- + paddr = pfn << PAGE_SHIFT; + prot = arch_apei_get_mem_attribute(paddr); ++ __set_fixmap(FIX_APEI_GHES_IRQ, paddr, prot); + +- ioremap_page_range(vaddr, vaddr + PAGE_SIZE, paddr, prot); +- +- return (void __iomem *)vaddr; ++ return (void __iomem *) fix_to_virt(FIX_APEI_GHES_IRQ); + } + +-static void ghes_iounmap_nmi(void __iomem *vaddr_ptr) ++static void ghes_iounmap_nmi(void) + { +- unsigned long vaddr = (unsigned long __force)vaddr_ptr; +- void *base = ghes_ioremap_area->addr; +- +- BUG_ON(vaddr != (unsigned long)GHES_IOREMAP_NMI_PAGE(base)); +- unmap_kernel_range_noflush(vaddr, PAGE_SIZE); +- arch_apei_flush_tlb_one(vaddr); ++ clear_fixmap(FIX_APEI_GHES_NMI); + } + +-static void ghes_iounmap_irq(void __iomem *vaddr_ptr) ++static void 
ghes_iounmap_irq(void) + { +- unsigned long vaddr = (unsigned long __force)vaddr_ptr; +- void *base = ghes_ioremap_area->addr; +- +- BUG_ON(vaddr != (unsigned long)GHES_IOREMAP_IRQ_PAGE(base)); +- unmap_kernel_range_noflush(vaddr, PAGE_SIZE); +- arch_apei_flush_tlb_one(vaddr); ++ clear_fixmap(FIX_APEI_GHES_IRQ); + } + + static int ghes_estatus_pool_init(void) +@@ -361,10 +345,10 @@ static void ghes_copy_tofrom_phys(void *buffer, u64 paddr, u32 len, + paddr += trunk; + buffer += trunk; + if (in_nmi) { +- ghes_iounmap_nmi(vaddr); ++ ghes_iounmap_nmi(); + raw_spin_unlock(&ghes_ioremap_lock_nmi); + } else { +- ghes_iounmap_irq(vaddr); ++ ghes_iounmap_irq(); + spin_unlock_irqrestore(&ghes_ioremap_lock_irq, flags); + } + } +-- +2.15.0 + diff --git a/queue/ACPI-APEI-adjust-a-local-variable-type-in-ghes_iorem.patch b/queue/ACPI-APEI-adjust-a-local-variable-type-in-ghes_iorem.patch new file mode 100644 index 0000000..e40a975 --- /dev/null +++ b/queue/ACPI-APEI-adjust-a-local-variable-type-in-ghes_iorem.patch @@ -0,0 +1,32 @@ +From 095f613c6b386a1704b73a549e9ba66c1d5381ae Mon Sep 17 00:00:00 2001 +From: Jan Beulich <JBeulich@suse.com> +Date: Mon, 25 Sep 2017 02:06:19 -0600 +Subject: [PATCH] ACPI / APEI: adjust a local variable type in + ghes_ioremap_pfn_irq() + +commit 095f613c6b386a1704b73a549e9ba66c1d5381ae upstream. + +Match up with what 7edda0886b ("acpi: apei: handle SEA notification +type for ARMv8") did for ghes_ioremap_pfn_nmi(). + +Signed-off-by: Jan Beulich <jbeulich@suse.com> +Reviewed-by: Borislav Petkov <bp@suse.de> +Signed-off-by: Rafael J. 
Wysocki <rafael.j.wysocki@intel.com> + +diff --git a/drivers/acpi/apei/ghes.c b/drivers/acpi/apei/ghes.c +index 3c3a37b8503b..69ef0b6bf25d 100644 +--- a/drivers/acpi/apei/ghes.c ++++ b/drivers/acpi/apei/ghes.c +@@ -174,7 +174,8 @@ static void __iomem *ghes_ioremap_pfn_nmi(u64 pfn) + + static void __iomem *ghes_ioremap_pfn_irq(u64 pfn) + { +- unsigned long vaddr, paddr; ++ unsigned long vaddr; ++ phys_addr_t paddr; + pgprot_t prot; + + vaddr = (unsigned long)GHES_IOREMAP_IRQ_PAGE(ghes_ioremap_area->addr); +-- +2.15.0 + diff --git a/queue/ACPI-APEI-remove-the-unused-dead-code-for-SEA-NMI-no.patch b/queue/ACPI-APEI-remove-the-unused-dead-code-for-SEA-NMI-no.patch new file mode 100644 index 0000000..5a853d7 --- /dev/null +++ b/queue/ACPI-APEI-remove-the-unused-dead-code-for-SEA-NMI-no.patch @@ -0,0 +1,77 @@ +From c49870e89f4d2c21c76ebe90568246bb0f3572b7 Mon Sep 17 00:00:00 2001 +From: Dongjiu Geng <gengdongjiu@huawei.com> +Date: Tue, 17 Oct 2017 16:02:20 +0800 +Subject: [PATCH] ACPI / APEI: remove the unused dead-code for SEA/NMI + notification type + +commit c49870e89f4d2c21c76ebe90568246bb0f3572b7 upstream. + +For the SEA notification, the two functions ghes_sea_add() and +ghes_sea_remove() are only called when CONFIG_ACPI_APEI_SEA +is defined. If not, it will return errors in the ghes_probe() +and not continue. If the probe is failed, the ghes_sea_remove() +also has no chance to be called. Hence, remove the unnecessary +handling when CONFIG_ACPI_APEI_SEA is not defined. + +For the NMI notification, it has the same issue as SEA notification, +so also remove the unused dead-code for it. + +Signed-off-by: Dongjiu Geng <gengdongjiu@huawei.com> +Tested-by: Tyler Baicar <tbaicar@codeaurora.org> +Reviewed-by: Borislav Petkov <bp@suse.de> +Signed-off-by: Rafael J. 
Wysocki <rafael.j.wysocki@intel.com> + +diff --git a/drivers/acpi/apei/ghes.c b/drivers/acpi/apei/ghes.c +index 69ef0b6bf25d..cb7aceae3553 100644 +--- a/drivers/acpi/apei/ghes.c ++++ b/drivers/acpi/apei/ghes.c +@@ -852,17 +852,8 @@ static void ghes_sea_remove(struct ghes *ghes) + synchronize_rcu(); + } + #else /* CONFIG_ACPI_APEI_SEA */ +-static inline void ghes_sea_add(struct ghes *ghes) +-{ +- pr_err(GHES_PFX "ID: %d, trying to add SEA notification which is not supported\n", +- ghes->generic->header.source_id); +-} +- +-static inline void ghes_sea_remove(struct ghes *ghes) +-{ +- pr_err(GHES_PFX "ID: %d, trying to remove SEA notification which is not supported\n", +- ghes->generic->header.source_id); +-} ++static inline void ghes_sea_add(struct ghes *ghes) { } ++static inline void ghes_sea_remove(struct ghes *ghes) { } + #endif /* CONFIG_ACPI_APEI_SEA */ + + #ifdef CONFIG_HAVE_ACPI_APEI_NMI +@@ -1064,23 +1055,9 @@ static void ghes_nmi_init_cxt(void) + init_irq_work(&ghes_proc_irq_work, ghes_proc_in_irq); + } + #else /* CONFIG_HAVE_ACPI_APEI_NMI */ +-static inline void ghes_nmi_add(struct ghes *ghes) +-{ +- pr_err(GHES_PFX "ID: %d, trying to add NMI notification which is not supported!\n", +- ghes->generic->header.source_id); +- BUG(); +-} +- +-static inline void ghes_nmi_remove(struct ghes *ghes) +-{ +- pr_err(GHES_PFX "ID: %d, trying to remove NMI notification which is not supported!\n", +- ghes->generic->header.source_id); +- BUG(); +-} +- +-static inline void ghes_nmi_init_cxt(void) +-{ +-} ++static inline void ghes_nmi_add(struct ghes *ghes) { } ++static inline void ghes_nmi_remove(struct ghes *ghes) { } ++static inline void ghes_nmi_init_cxt(void) { } + #endif /* CONFIG_HAVE_ACPI_APEI_NMI */ + + static int ghes_probe(struct platform_device *ghes_dev) +-- +2.15.0 + diff --git a/queue/ARM-exynos_defconfig-Enable-UAS-support-for-Odroid-H.patch b/queue/ARM-exynos_defconfig-Enable-UAS-support-for-Odroid-H.patch new file mode 100644 index 0000000..af60844 --- 
/dev/null +++ b/queue/ARM-exynos_defconfig-Enable-UAS-support-for-Odroid-H.patch @@ -0,0 +1,34 @@ +From a99897f550de96841aecb811455a67ad7a4e39a7 Mon Sep 17 00:00:00 2001 +From: Marek Szyprowski <m.szyprowski@samsung.com> +Date: Mon, 2 Oct 2017 08:39:35 +0200 +Subject: [PATCH] ARM: exynos_defconfig: Enable UAS support for Odroid HC1 + board + +commit a99897f550de96841aecb811455a67ad7a4e39a7 upstream. + +Odroid HC1 board has built-in JMicron USB to SATA bridge, which supports +UAS protocol. Compile-in support for it (instead of enabling it as module) +to make sure that all built-in storage devices are available for rootfs. +The bridge itself also supports fallback to standard USB Mass Storage +protocol, but USB Mass Storage class doesn't bind to it when UAS is +compiled as module and modules are not (yet) available. + +Signed-off-by: Marek Szyprowski <m.szyprowski@samsung.com> +Signed-off-by: Krzysztof Kozlowski <krzk@kernel.org> + +diff --git a/arch/arm/configs/exynos_defconfig b/arch/arm/configs/exynos_defconfig +index 8c2a2619971b..f1d7834990ec 100644 +--- a/arch/arm/configs/exynos_defconfig ++++ b/arch/arm/configs/exynos_defconfig +@@ -244,7 +244,7 @@ CONFIG_USB_STORAGE_ONETOUCH=m + CONFIG_USB_STORAGE_KARMA=m + CONFIG_USB_STORAGE_CYPRESS_ATACB=m + CONFIG_USB_STORAGE_ENE_UB6250=m +-CONFIG_USB_UAS=m ++CONFIG_USB_UAS=y + CONFIG_USB_DWC3=y + CONFIG_USB_DWC2=y + CONFIG_USB_HSIC_USB3503=y +-- +2.15.0 + diff --git a/queue/ASoC-codecs-msm8916-wcd-analog-fix-micbias-level.patch b/queue/ASoC-codecs-msm8916-wcd-analog-fix-micbias-level.patch new file mode 100644 index 0000000..ff71974 --- /dev/null +++ b/queue/ASoC-codecs-msm8916-wcd-analog-fix-micbias-level.patch @@ -0,0 +1,49 @@ +From 664611e7e02f76fbc5470ef545b2657ed25c292b Mon Sep 17 00:00:00 2001 +From: =?UTF-8?q?Jean-Fran=C3=A7ois=20T=C3=AAtu?= + <jean-francois.tetu@savoirfairelinux.com> +Date: Fri, 29 Sep 2017 16:19:44 -0400 +Subject: [PATCH] ASoC: codecs: msm8916-wcd-analog: fix micbias level +MIME-Version: 1.0 
+Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +commit 664611e7e02f76fbc5470ef545b2657ed25c292b upstream. + +The macro used to set the microphone bias level causes the +snd_soc_write() call to overwrite other fields in the CDC_A_MICB_1_VAL +register. The macro also does not return the proper level value +to use. This fixes this by preserving all bits from the register +that are not the level while setting the level. + +Signed-off-by: Jean-François Têtu <jean-francois.tetu@savoirfairelinux.com> +Acked-by: Srinivas Kandagatla <srinivas.kandagatla@linaro.org> +Signed-off-by: Mark Brown <broonie@kernel.org> + +diff --git a/sound/soc/codecs/msm8916-wcd-analog.c b/sound/soc/codecs/msm8916-wcd-analog.c +index 549c269acc7d..a42f8ebb9670 100644 +--- a/sound/soc/codecs/msm8916-wcd-analog.c ++++ b/sound/soc/codecs/msm8916-wcd-analog.c +@@ -104,7 +104,7 @@ + #define CDC_A_MICB_1_VAL (0xf141) + #define MICB_MIN_VAL 1600 + #define MICB_STEP_SIZE 50 +-#define MICB_VOLTAGE_REGVAL(v) ((v - MICB_MIN_VAL)/MICB_STEP_SIZE) ++#define MICB_VOLTAGE_REGVAL(v) (((v - MICB_MIN_VAL)/MICB_STEP_SIZE) << 3) + #define MICB_1_VAL_MICB_OUT_VAL_MASK GENMASK(7, 3) + #define MICB_1_VAL_MICB_OUT_VAL_V2P70V ((0x16) << 3) + #define MICB_1_VAL_MICB_OUT_VAL_V1P80V ((0x4) << 3) +@@ -349,8 +349,9 @@ static void pm8916_wcd_analog_micbias_enable(struct snd_soc_codec *codec) + | MICB_1_CTL_EXT_PRECHARG_EN_ENABLE); + + if (wcd->micbias_mv) { +- snd_soc_write(codec, CDC_A_MICB_1_VAL, +- MICB_VOLTAGE_REGVAL(wcd->micbias_mv)); ++ snd_soc_update_bits(codec, CDC_A_MICB_1_VAL, ++ MICB_1_VAL_MICB_OUT_VAL_MASK, ++ MICB_VOLTAGE_REGVAL(wcd->micbias_mv)); + /* + * Special headset needs MICBIAS as 2.7V so wait for + * 50 msec for the MICBIAS to reach 2.7 volts. 
+-- +2.15.0 + diff --git a/queue/ASoC-codecs-msm8916-wcd-analog-fix-module-autoload.patch b/queue/ASoC-codecs-msm8916-wcd-analog-fix-module-autoload.patch new file mode 100644 index 0000000..9154c34 --- /dev/null +++ b/queue/ASoC-codecs-msm8916-wcd-analog-fix-module-autoload.patch @@ -0,0 +1,43 @@ +From 46d69e141d479585c105a4d5b2337cd2ce6967e5 Mon Sep 17 00:00:00 2001 +From: Nicolas Dechesne <nicolas.dechesne@linaro.org> +Date: Tue, 3 Oct 2017 11:49:51 +0200 +Subject: [PATCH] ASoC: codecs: msm8916-wcd-analog: fix module autoload + +commit 46d69e141d479585c105a4d5b2337cd2ce6967e5 upstream. + +If the driver is built as a module, autoload won't work because the module +alias information is not filled. So user-space can't match the registered +device with the corresponding module. + +Export the module alias information using the MODULE_DEVICE_TABLE() macro. + +Before this patch: + +$ modinfo snd_soc_msm8916_analog | grep alias +$ + +After this patch: + +$ modinfo snd_soc_msm8916_analog | grep alias +alias: of:N*T*Cqcom,pm8916-wcd-analog-codecC* +alias: of:N*T*Cqcom,pm8916-wcd-analog-codec + +Signed-off-by: Nicolas Dechesne <nicolas.dechesne@linaro.org> +Signed-off-by: Mark Brown <broonie@kernel.org> + +diff --git a/sound/soc/codecs/msm8916-wcd-analog.c b/sound/soc/codecs/msm8916-wcd-analog.c +index f562f2d86907..3593c578e3e7 100644 +--- a/sound/soc/codecs/msm8916-wcd-analog.c ++++ b/sound/soc/codecs/msm8916-wcd-analog.c +@@ -1239,6 +1239,8 @@ static const struct of_device_id pm8916_wcd_analog_spmi_match_table[] = { + { } + }; + ++MODULE_DEVICE_TABLE(of, pm8916_wcd_analog_spmi_match_table); ++ + static struct platform_driver pm8916_wcd_analog_spmi_driver = { + .driver = { + .name = "qcom,pm8916-wcd-spmi-codec", +-- +2.15.0 + diff --git a/queue/ASoC-img-parallel-out-Add-pm_runtime_get-put-to-set_.patch b/queue/ASoC-img-parallel-out-Add-pm_runtime_get-put-to-set_.patch new file mode 100644 index 0000000..70cb9f2 --- /dev/null +++ 
b/queue/ASoC-img-parallel-out-Add-pm_runtime_get-put-to-set_.patch @@ -0,0 +1,35 @@ +From c70458890ff15d858bd347fa9f563818bcd6e457 Mon Sep 17 00:00:00 2001 +From: Ed Blake <ed.blake@sondrel.com> +Date: Mon, 2 Oct 2017 11:00:33 +0100 +Subject: [PATCH] ASoC: img-parallel-out: Add pm_runtime_get/put to set_fmt + callback + +commit c70458890ff15d858bd347fa9f563818bcd6e457 upstream. + +Add pm_runtime_get_sync and pm_runtime_put calls to set_fmt callback +function. This fixes a bus error during boot when CONFIG_SUSPEND is +defined when this function gets called while the device is runtime +disabled and device registers are accessed while the clock is disabled. + +Signed-off-by: Ed Blake <ed.blake@sondrel.com> +Signed-off-by: Mark Brown <broonie@kernel.org> + +diff --git a/sound/soc/img/img-parallel-out.c b/sound/soc/img/img-parallel-out.c +index 23b0f0f6ec9c..2fc8a6372206 100644 +--- a/sound/soc/img/img-parallel-out.c ++++ b/sound/soc/img/img-parallel-out.c +@@ -164,9 +164,11 @@ static int img_prl_out_set_fmt(struct snd_soc_dai *dai, unsigned int fmt) + return -EINVAL; + } + ++ pm_runtime_get_sync(prl->dev); + reg = img_prl_out_readl(prl, IMG_PRL_OUT_CTL); + reg = (reg & ~IMG_PRL_OUT_CTL_EDGE_MASK) | control_set; + img_prl_out_writel(prl, reg, IMG_PRL_OUT_CTL); ++ pm_runtime_put(prl->dev); + + return 0; + } +-- +2.15.0 + diff --git a/queue/Bluetooth-hci_bcm-Fix-setting-of-irq-trigger-type.patch b/queue/Bluetooth-hci_bcm-Fix-setting-of-irq-trigger-type.patch new file mode 100644 index 0000000..f38832f --- /dev/null +++ b/queue/Bluetooth-hci_bcm-Fix-setting-of-irq-trigger-type.patch @@ -0,0 +1,132 @@ +From 227630cccdbb8f8a1b24ac26517b75079c9a69c9 Mon Sep 17 00:00:00 2001 +From: Hans de Goede <hdegoede@redhat.com> +Date: Wed, 4 Oct 2017 20:43:36 +0200 +Subject: [PATCH] Bluetooth: hci_bcm: Fix setting of irq trigger type + +commit 227630cccdbb8f8a1b24ac26517b75079c9a69c9 upstream. 
+ +This commit fixes 2 issues with host-wake irq trigger type handling +in hci_bcm: + +1) bcm_setup_sleep sets sleep_params.host_wake_active based on +bcm_device.irq_polarity, but bcm_request_irq was always requesting +IRQF_TRIGGER_RISING as trigger type independent of irq_polarity. + +This was a problem when the irq is described as a GpioInt rather than +an Interrupt in the DSDT as for GpioInt-s the value passed to request_irq +is honored. This commit fixes this by requesting the correct trigger +type depending on bcm_device.irq_polarity. + +2) bcm_device.irq_polarity was used to directly store an ACPI polarity +value (ACPI_ACTIVE_*). This is undesirable because hci_bcm is also +used with device-tree and checking for something like ACPI_ACTIVE_LOW +in a non ACPI specific function like bcm_request_irq feels wrong. + +This commit fixes this by renaming irq_polarity to irq_active_low +and changing its type to a bool. + +Signed-off-by: Hans de Goede <hdegoede@redhat.com> +Signed-off-by: Marcel Holtmann <marcel@holtmann.org> + +diff --git a/drivers/bluetooth/hci_bcm.c b/drivers/bluetooth/hci_bcm.c +index e2540113d0da..73d2d88ddc03 100644 +--- a/drivers/bluetooth/hci_bcm.c ++++ b/drivers/bluetooth/hci_bcm.c +@@ -68,7 +68,7 @@ struct bcm_device { + u32 init_speed; + u32 oper_speed; + int irq; +- u8 irq_polarity; ++ bool irq_active_low; + + #ifdef CONFIG_PM + struct hci_uart *hu; +@@ -213,7 +213,9 @@ static int bcm_request_irq(struct bcm_data *bcm) + } + + err = devm_request_irq(&bdev->pdev->dev, bdev->irq, bcm_host_wake, +- IRQF_TRIGGER_RISING, "host_wake", bdev); ++ bdev->irq_active_low ? 
IRQF_TRIGGER_FALLING : ++ IRQF_TRIGGER_RISING, ++ "host_wake", bdev); + if (err) + goto unlock; + +@@ -253,7 +255,7 @@ static int bcm_setup_sleep(struct hci_uart *hu) + struct sk_buff *skb; + struct bcm_set_sleep_mode sleep_params = default_sleep_params; + +- sleep_params.host_wake_active = !bcm->dev->irq_polarity; ++ sleep_params.host_wake_active = !bcm->dev->irq_active_low; + + skb = __hci_cmd_sync(hu->hdev, 0xfc27, sizeof(sleep_params), + &sleep_params, HCI_INIT_TIMEOUT); +@@ -690,10 +692,8 @@ static const struct acpi_gpio_mapping acpi_bcm_int_first_gpios[] = { + }; + + #ifdef CONFIG_ACPI +-static u8 acpi_active_low = ACPI_ACTIVE_LOW; +- + /* IRQ polarity of some chipsets are not defined correctly in ACPI table. */ +-static const struct dmi_system_id bcm_wrong_irq_dmi_table[] = { ++static const struct dmi_system_id bcm_active_low_irq_dmi_table[] = { + { + .ident = "Asus T100TA", + .matches = { +@@ -701,7 +701,6 @@ static const struct dmi_system_id bcm_wrong_irq_dmi_table[] = { + "ASUSTeK COMPUTER INC."), + DMI_EXACT_MATCH(DMI_PRODUCT_NAME, "T100TA"), + }, +- .driver_data = &acpi_active_low, + }, + { + .ident = "Asus T100CHI", +@@ -710,7 +709,6 @@ static const struct dmi_system_id bcm_wrong_irq_dmi_table[] = { + "ASUSTeK COMPUTER INC."), + DMI_EXACT_MATCH(DMI_PRODUCT_NAME, "T100CHI"), + }, +- .driver_data = &acpi_active_low, + }, + { /* Handle ThinkPad 8 tablets with BCM2E55 chipset ACPI ID */ + .ident = "Lenovo ThinkPad 8", +@@ -718,7 +716,6 @@ static const struct dmi_system_id bcm_wrong_irq_dmi_table[] = { + DMI_EXACT_MATCH(DMI_SYS_VENDOR, "LENOVO"), + DMI_EXACT_MATCH(DMI_PRODUCT_VERSION, "ThinkPad 8"), + }, +- .driver_data = &acpi_active_low, + }, + { } + }; +@@ -733,13 +730,13 @@ static int bcm_resource(struct acpi_resource *ares, void *data) + switch (ares->type) { + case ACPI_RESOURCE_TYPE_EXTENDED_IRQ: + irq = &ares->data.extended_irq; +- dev->irq_polarity = irq->polarity; ++ dev->irq_active_low = irq->polarity == ACPI_ACTIVE_LOW; + break; + + case 
ACPI_RESOURCE_TYPE_GPIO: + gpio = &ares->data.gpio; + if (gpio->connection_type == ACPI_RESOURCE_GPIO_TYPE_INT) +- dev->irq_polarity = gpio->polarity; ++ dev->irq_active_low = gpio->polarity == ACPI_ACTIVE_LOW; + break; + + case ACPI_RESOURCE_TYPE_SERIAL_BUS: +@@ -834,11 +831,11 @@ static int bcm_acpi_probe(struct bcm_device *dev) + return ret; + acpi_dev_free_resource_list(&resources); + +- dmi_id = dmi_first_match(bcm_wrong_irq_dmi_table); ++ dmi_id = dmi_first_match(bcm_active_low_irq_dmi_table); + if (dmi_id) { + bt_dev_warn(dev, "%s: Overwriting IRQ polarity to active low", + dmi_id->ident); +- dev->irq_polarity = *(u8 *)dmi_id->driver_data; ++ dev->irq_active_low = true; + } + + return 0; +-- +2.15.0 + diff --git a/queue/Bluetooth-hci_uart_set_flow_control-Fix-NULL-deref-w.patch b/queue/Bluetooth-hci_uart_set_flow_control-Fix-NULL-deref-w.patch new file mode 100644 index 0000000..fc93aa6 --- /dev/null +++ b/queue/Bluetooth-hci_uart_set_flow_control-Fix-NULL-deref-w.patch @@ -0,0 +1,42 @@ +From 7841d554809b518a22349e7e39b6b63f8a48d0fb Mon Sep 17 00:00:00 2001 +From: Hans de Goede <hdegoede@redhat.com> +Date: Wed, 4 Oct 2017 20:43:35 +0200 +Subject: [PATCH] Bluetooth: hci_uart_set_flow_control: Fix NULL deref when + using serdev + +commit 7841d554809b518a22349e7e39b6b63f8a48d0fb upstream. + +Fix a NULL pointer deref (hu->tty) when calling hci_uart_set_flow_control +on hci_uart-s using serdev. 
+ +Signed-off-by: Hans de Goede <hdegoede@redhat.com> +Signed-off-by: Marcel Holtmann <marcel@holtmann.org> + +diff --git a/drivers/bluetooth/hci_ldisc.c b/drivers/bluetooth/hci_ldisc.c +index a746627e784e..eec95019f15c 100644 +--- a/drivers/bluetooth/hci_ldisc.c ++++ b/drivers/bluetooth/hci_ldisc.c +@@ -41,6 +41,7 @@ + #include <linux/ioctl.h> + #include <linux/skbuff.h> + #include <linux/firmware.h> ++#include <linux/serdev.h> + + #include <net/bluetooth/bluetooth.h> + #include <net/bluetooth/hci_core.h> +@@ -298,6 +299,12 @@ void hci_uart_set_flow_control(struct hci_uart *hu, bool enable) + unsigned int set = 0; + unsigned int clear = 0; + ++ if (hu->serdev) { ++ serdev_device_set_flow_control(hu->serdev, !enable); ++ serdev_device_set_rts(hu->serdev, !enable); ++ return; ++ } ++ + if (enable) { + /* Disable hardware flow control */ + ktermios = tty->termios; +-- +2.15.0 + diff --git a/queue/IB-opa_vnic-Properly-clear-Mac-Table-Digest.patch b/queue/IB-opa_vnic-Properly-clear-Mac-Table-Digest.patch new file mode 100644 index 0000000..430f811 --- /dev/null +++ b/queue/IB-opa_vnic-Properly-clear-Mac-Table-Digest.patch @@ -0,0 +1,29 @@ +From 4bbdfe25600c1909c26747d0b5c39fd0e409bb87 Mon Sep 17 00:00:00 2001 +From: Scott Franco <safranco@intel.com> +Date: Tue, 26 Sep 2017 06:44:13 -0700 +Subject: [PATCH] IB/opa_vnic: Properly clear Mac Table Digest + +commit 4bbdfe25600c1909c26747d0b5c39fd0e409bb87 upstream. + +Clear the MAC table digest when the MAC table is freed. 
+ +Reviewed-by: Niranjana Vishwanathapura <niranjana.vishwanathapura@intel.com> +Signed-off-by: Scott Franco <safranco@intel.com> +Signed-off-by: Dennis Dalessandro <dennis.dalessandro@intel.com> +Signed-off-by: Doug Ledford <dledford@redhat.com> + +diff --git a/drivers/infiniband/ulp/opa_vnic/opa_vnic_encap.c b/drivers/infiniband/ulp/opa_vnic/opa_vnic_encap.c +index afa938bd26d6..a72278e9cd27 100644 +--- a/drivers/infiniband/ulp/opa_vnic/opa_vnic_encap.c ++++ b/drivers/infiniband/ulp/opa_vnic/opa_vnic_encap.c +@@ -139,6 +139,7 @@ void opa_vnic_release_mac_tbl(struct opa_vnic_adapter *adapter) + rcu_assign_pointer(adapter->mactbl, NULL); + synchronize_rcu(); + opa_vnic_free_mac_tbl(mactbl); ++ adapter->info.vport.mac_tbl_digest = 0; + mutex_unlock(&adapter->mactbl_lock); + } + +-- +2.15.0 + diff --git a/queue/IB-opa_vnic-Properly-return-the-total-MACs-in-UC-MAC.patch b/queue/IB-opa_vnic-Properly-return-the-total-MACs-in-UC-MAC.patch new file mode 100644 index 0000000..dc37db7 --- /dev/null +++ b/queue/IB-opa_vnic-Properly-return-the-total-MACs-in-UC-MAC.patch @@ -0,0 +1,52 @@ +From b77eb45e0d9c324245d165656ab3b38b6f386436 Mon Sep 17 00:00:00 2001 +From: Niranjana Vishwanathapura <niranjana.vishwanathapura@intel.com> +Date: Tue, 26 Sep 2017 06:44:07 -0700 +Subject: [PATCH] IB/opa_vnic: Properly return the total MACs in UC MAC list + +commit b77eb45e0d9c324245d165656ab3b38b6f386436 upstream. + +Do not include EM specified MAC address in total MACs of the +UC MAC list. 
+ +Reviewed-by: Sudeep Dutt <sudeep.dutt@intel.com> +Signed-off-by: Niranjana Vishwanathapura <niranjana.vishwanathapura@intel.com> +Signed-off-by: Dennis Dalessandro <dennis.dalessandro@intel.com> +Signed-off-by: Doug Ledford <dledford@redhat.com> + +diff --git a/drivers/infiniband/ulp/opa_vnic/opa_vnic_vema_iface.c b/drivers/infiniband/ulp/opa_vnic/opa_vnic_vema_iface.c +index 5856ae3a0d7b..5553900848e3 100644 +--- a/drivers/infiniband/ulp/opa_vnic/opa_vnic_vema_iface.c ++++ b/drivers/infiniband/ulp/opa_vnic/opa_vnic_vema_iface.c +@@ -342,7 +342,7 @@ void opa_vnic_query_mcast_macs(struct opa_vnic_adapter *adapter, + void opa_vnic_query_ucast_macs(struct opa_vnic_adapter *adapter, + struct opa_veswport_iface_macs *macs) + { +- u16 start_idx, tot_macs, num_macs, idx = 0, count = 0; ++ u16 start_idx, tot_macs, num_macs, idx = 0, count = 0, em_macs = 0; + struct netdev_hw_addr *ha; + + start_idx = be16_to_cpu(macs->start_idx); +@@ -353,8 +353,10 @@ void opa_vnic_query_ucast_macs(struct opa_vnic_adapter *adapter, + + /* Do not include EM specified MAC address */ + if (!memcmp(adapter->info.vport.base_mac_addr, ha->addr, +- ARRAY_SIZE(adapter->info.vport.base_mac_addr))) ++ ARRAY_SIZE(adapter->info.vport.base_mac_addr))) { ++ em_macs++; + continue; ++ } + + if (start_idx > idx++) + continue; +@@ -377,7 +379,7 @@ void opa_vnic_query_ucast_macs(struct opa_vnic_adapter *adapter, + } + + tot_macs = netdev_hw_addr_list_count(&adapter->netdev->dev_addrs) + +- netdev_uc_count(adapter->netdev); ++ netdev_uc_count(adapter->netdev) - em_macs; + macs->tot_macs_in_lst = cpu_to_be16(tot_macs); + macs->num_macs_in_msg = cpu_to_be16(count); + macs->gen_count = cpu_to_be16(adapter->info.vport.uc_macs_gen_count); +-- +2.15.0 + diff --git a/queue/IB-rxe-check-for-allocation-failure-on-elem.patch b/queue/IB-rxe-check-for-allocation-failure-on-elem.patch new file mode 100644 index 0000000..ff983a0 --- /dev/null +++ b/queue/IB-rxe-check-for-allocation-failure-on-elem.patch @@ -0,0 +1,33 @@ 
+From 4831ca9e4a8e48cb27e0a792f73250390827a228 Mon Sep 17 00:00:00 2001 +From: Colin Ian King <colin.king@canonical.com> +Date: Fri, 8 Sep 2017 15:37:45 +0100 +Subject: [PATCH] IB/rxe: check for allocation failure on elem + +commit 4831ca9e4a8e48cb27e0a792f73250390827a228 upstream. + +The allocation for elem may fail (especially because we're using +GFP_ATOMIC) so best to check for a null return. This fixes a potential +null pointer dereference when assigning elem->pool. + +Detected by CoverityScan CID#1357507 ("Dereference null return value") + +Fixes: 8700e3e7c485 ("Soft RoCE driver") +Signed-off-by: Colin Ian King <colin.king@canonical.com> +Signed-off-by: Doug Ledford <dledford@redhat.com> + +diff --git a/drivers/infiniband/sw/rxe/rxe_pool.c b/drivers/infiniband/sw/rxe/rxe_pool.c +index c1b5f38f31a5..3b4916680018 100644 +--- a/drivers/infiniband/sw/rxe/rxe_pool.c ++++ b/drivers/infiniband/sw/rxe/rxe_pool.c +@@ -404,6 +404,8 @@ void *rxe_alloc(struct rxe_pool *pool) + elem = kmem_cache_zalloc(pool_cache(pool), + (pool->flags & RXE_POOL_ATOMIC) ? + GFP_ATOMIC : GFP_KERNEL); ++ if (!elem) ++ return NULL; + + elem->pool = pool; + kref_init(&elem->ref_cnt); +-- +2.15.0 + diff --git a/queue/PCI-AER-Report-non-fatal-errors-only-to-the-affected.patch b/queue/PCI-AER-Report-non-fatal-errors-only-to-the-affected.patch new file mode 100644 index 0000000..696ef7b --- /dev/null +++ b/queue/PCI-AER-Report-non-fatal-errors-only-to-the-affected.patch @@ -0,0 +1,66 @@ +From 86acc790717fb60fb51ea3095084e331d8711c74 Mon Sep 17 00:00:00 2001 +From: Gabriele Paoloni <gabriele.paoloni@huawei.com> +Date: Thu, 28 Sep 2017 15:33:05 +0100 +Subject: [PATCH] PCI/AER: Report non-fatal errors only to the affected + endpoint + +commit 86acc790717fb60fb51ea3095084e331d8711c74 upstream. + +Previously, if a non-fatal error was reported by an endpoint, we +called report_error_detected() for the endpoint, every sibling on the +bus, and their descendants. 
If any of them did not implement the +.error_detected() method, do_recovery() failed, leaving all these +devices unrecovered. + +For example, the system described in the bugzilla below has two devices: + + 0000:74:02.0 [19e5:a230] SAS controller, driver has .error_detected() + 0000:74:03.0 [19e5:a235] SATA controller, driver lacks .error_detected() + +When a device such as 74:02.0 reported a non-fatal error, do_recovery() +failed because 74:03.0 lacked an .error_detected() method. But per PCIe +r3.1, sec 6.2.2.2.2, such an error does not compromise the Link and +does not affect 74:03.0: + + Non-fatal errors are uncorrectable errors which cause a particular + transaction to be unreliable but the Link is otherwise fully functional. + Isolating Non-fatal from Fatal errors provides Requester/Receiver logic + in a device or system management software the opportunity to recover from + the error without resetting the components on the Link and disturbing + other transactions in progress. Devices not associated with the + transaction in error are not impacted by the error. + +Report non-fatal errors only to the endpoint that reported them. We really +want to check for AER_NONFATAL here, but the current code structure doesn't +allow that. Looking for pci_channel_io_normal is the best we can do now. 
+ +Link: https://bugzilla.kernel.org/show_bug.cgi?id=197055 +Fixes: 6c2b374d7485 ("PCI-Express AER implemetation: AER core and aerdriver") +Signed-off-by: Gabriele Paoloni <gabriele.paoloni@huawei.com> +Signed-off-by: Dongdong Liu <liudongdong3@huawei.com> +[bhelgaas: changelog] +Signed-off-by: Bjorn Helgaas <bhelgaas@google.com> + +diff --git a/drivers/pci/pcie/aer/aerdrv_core.c b/drivers/pci/pcie/aer/aerdrv_core.c +index 890efcc574cb..744805232155 100644 +--- a/drivers/pci/pcie/aer/aerdrv_core.c ++++ b/drivers/pci/pcie/aer/aerdrv_core.c +@@ -390,7 +390,14 @@ static pci_ers_result_t broadcast_error_message(struct pci_dev *dev, + * If the error is reported by an end point, we think this + * error is related to the upstream link of the end point. + */ +- pci_walk_bus(dev->bus, cb, &result_data); ++ if (state == pci_channel_io_normal) ++ /* ++ * the error is non fatal so the bus is ok, just invoke ++ * the callback for the function that logged the error. ++ */ ++ cb(dev, &result_data); ++ else ++ pci_walk_bus(dev->bus, cb, &result_data); + } + + return result_data.result; +-- +2.15.0 + diff --git a/queue/PCI-Avoid-bus-reset-if-bridge-itself-is-broken.patch b/queue/PCI-Avoid-bus-reset-if-bridge-itself-is-broken.patch new file mode 100644 index 0000000..f2e5ab5 --- /dev/null +++ b/queue/PCI-Avoid-bus-reset-if-bridge-itself-is-broken.patch @@ -0,0 +1,43 @@ +From 357027786f3523d26f42391aa4c075b8495e5d28 Mon Sep 17 00:00:00 2001 +From: David Daney <david.daney@cavium.com> +Date: Fri, 8 Sep 2017 10:10:31 +0200 +Subject: [PATCH] PCI: Avoid bus reset if bridge itself is broken + +commit 357027786f3523d26f42391aa4c075b8495e5d28 upstream. + +When checking to see if a PCI bus can safely be reset, we previously +checked to see if any of the children had their PCI_DEV_FLAGS_NO_BUS_RESET +flag set. Children marked with that flag are known not to behave well +after a bus reset. 
+ +Some PCIe root port bridges also do not behave well after a bus reset, +sometimes causing the devices behind the bridge to become unusable. + +Add a check for PCI_DEV_FLAGS_NO_BUS_RESET being set in the bridge device +to allow these bridges to be flagged, and prevent their secondary buses +from being reset. + +Signed-off-by: David Daney <david.daney@cavium.com> +[jglauber@cavium.com: fixed typo] +Signed-off-by: Jan Glauber <jglauber@cavium.com> +Signed-off-by: Bjorn Helgaas <bhelgaas@google.com> +Reviewed-by: Alex Williamson <alex.williamson@redhat.com> + +diff --git a/drivers/pci/pci.c b/drivers/pci/pci.c +index 6078dfc11b11..74f1c57ab93b 100644 +--- a/drivers/pci/pci.c ++++ b/drivers/pci/pci.c +@@ -4356,6 +4356,10 @@ static bool pci_bus_resetable(struct pci_bus *bus) + { + struct pci_dev *dev; + ++ ++ if (bus->self && (bus->self->dev_flags & PCI_DEV_FLAGS_NO_BUS_RESET)) ++ return false; ++ + list_for_each_entry(dev, &bus->devices, bus_list) { + if (dev->dev_flags & PCI_DEV_FLAGS_NO_BUS_RESET || + (dev->subordinate && !pci_bus_resetable(dev->subordinate))) +-- +2.15.0 + diff --git a/queue/PCI-Create-SR-IOV-virtfn-physfn-links-before-attachi.patch b/queue/PCI-Create-SR-IOV-virtfn-physfn-links-before-attachi.patch new file mode 100644 index 0000000..fa62e91 --- /dev/null +++ b/queue/PCI-Create-SR-IOV-virtfn-physfn-links-before-attachi.patch @@ -0,0 +1,42 @@ +From 27d6162944b9b34c32cd5841acd21786637ee743 Mon Sep 17 00:00:00 2001 +From: Stuart Hayes <stuart.w.hayes@gmail.com> +Date: Wed, 4 Oct 2017 10:57:52 -0500 +Subject: [PATCH] PCI: Create SR-IOV virtfn/physfn links before attaching + driver + +commit 27d6162944b9b34c32cd5841acd21786637ee743 upstream. + +When creating virtual functions, create the "virtfn%u" and "physfn" links +in sysfs *before* attaching the driver instead of after. 
When we attach +the driver to the new virtual network interface first, there is a race: +the driver attaches to the new device and sends out an "add" udev event, and the +network interface naming software (biosdevname or systemd, for example) +tries to look at these links before they exist. + +Signed-off-by: Stuart Hayes <stuart.w.hayes@gmail.com> +Signed-off-by: Bjorn Helgaas <bhelgaas@google.com> + +diff --git a/drivers/pci/iov.c b/drivers/pci/iov.c +index 7492a65baba9..ce24cf235f01 100644 +--- a/drivers/pci/iov.c ++++ b/drivers/pci/iov.c +@@ -159,7 +159,6 @@ int pci_iov_add_virtfn(struct pci_dev *dev, int id) + + pci_device_add(virtfn, virtfn->bus); + +- pci_bus_add_device(virtfn); + sprintf(buf, "virtfn%u", id); + rc = sysfs_create_link(&dev->dev.kobj, &virtfn->dev.kobj, buf); + if (rc) +@@ -170,6 +169,8 @@ int pci_iov_add_virtfn(struct pci_dev *dev, int id) + + kobject_uevent(&virtfn->dev.kobj, KOBJ_CHANGE); + ++ pci_bus_add_device(virtfn); ++ + return 0; + + failed2: +-- +2.15.0 + diff --git a/queue/PM-OPP-Move-error-message-to-debug-level.patch b/queue/PM-OPP-Move-error-message-to-debug-level.patch new file mode 100644 index 0000000..ba463e3 --- /dev/null +++ b/queue/PM-OPP-Move-error-message-to-debug-level.patch @@ -0,0 +1,43 @@ +From 035ed07208dc501d023873447113f3f178592156 Mon Sep 17 00:00:00 2001 +From: Fabio Estevam <fabio.estevam@nxp.com> +Date: Fri, 29 Sep 2017 14:39:49 -0300 +Subject: [PATCH] PM / OPP: Move error message to debug level + +commit 035ed07208dc501d023873447113f3f178592156 upstream. + +On some i.MX6 platforms which do not have speed grading +check, opp table will not be created in platform code, +so cpufreq driver prints the following error message: + +cpu cpu0: dev_pm_opp_get_opp_count: OPP table not found (-19) + +However, this is not really an error in this case because the +imx6q-cpufreq driver first calls dev_pm_opp_get_opp_count() +and if it fails, it means that platform code does not provide +OPP and then dev_pm_opp_of_add_table() will be called. 
+ +In order to avoid such confusing error message, move it to +debug level. + +It is up to the caller of dev_pm_opp_get_opp_count() to check its +return value and decide if it will print an error or not. + +Signed-off-by: Fabio Estevam <fabio.estevam@nxp.com> +Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com> + +diff --git a/drivers/opp/core.c b/drivers/opp/core.c +index a6de32530693..0459b1204694 100644 +--- a/drivers/opp/core.c ++++ b/drivers/opp/core.c +@@ -296,7 +296,7 @@ int dev_pm_opp_get_opp_count(struct device *dev) + opp_table = _find_opp_table(dev); + if (IS_ERR(opp_table)) { + count = PTR_ERR(opp_table); +- dev_err(dev, "%s: OPP table not found (%d)\n", ++ dev_dbg(dev, "%s: OPP table not found (%d)\n", + __func__, count); + return count; + } +-- +2.15.0 + diff --git a/queue/RDMA-hns-Avoid-NULL-pointer-exception.patch b/queue/RDMA-hns-Avoid-NULL-pointer-exception.patch new file mode 100644 index 0000000..ec9f81c --- /dev/null +++ b/queue/RDMA-hns-Avoid-NULL-pointer-exception.patch @@ -0,0 +1,41 @@ +From 5e437b1d7e8d31ff9a4b8e898eb3a6cee309edd9 Mon Sep 17 00:00:00 2001 +From: "Wei Hu(Xavier)" <xavier.huwei@huawei.com> +Date: Fri, 29 Sep 2017 23:10:12 +0800 +Subject: [PATCH] RDMA/hns: Avoid NULL pointer exception + +commit 5e437b1d7e8d31ff9a4b8e898eb3a6cee309edd9 upstream. + +After the loop in hns_roce_v1_mr_free_work_fn function, it is possible that +all qps will have been freed (in which case ne will be 0). If that +happens, then later in the function when we dereference hr_qp we will +get an exception. Check ne is not 0 to make sure we actually have an +hr_qp left to work on. 
+ +This patch fixes the smatch error as below: +drivers/infiniband/hw/hns/hns_roce_hw_v1.c:1009 hns_roce_v1_mr_free_work_fn() +error: we previously assumed 'hr_qp' could be null + +Signed-off-by: Wei Hu (Xavier) <xavier.huwei@huawei.com> +Signed-off-by: Lijun Ou <oulijun@huawei.com> +Signed-off-by: Shaobo Xu <xushaobo2@huawei.com> +Signed-off-by: Doug Ledford <dledford@redhat.com> + +diff --git a/drivers/infiniband/hw/hns/hns_roce_hw_v1.c b/drivers/infiniband/hw/hns/hns_roce_hw_v1.c +index 98887dd8ccb3..852db18ec128 100644 +--- a/drivers/infiniband/hw/hns/hns_roce_hw_v1.c ++++ b/drivers/infiniband/hw/hns/hns_roce_hw_v1.c +@@ -1004,6 +1004,11 @@ static void hns_roce_v1_mr_free_work_fn(struct work_struct *work) + } + } + ++ if (!ne) { ++ dev_err(dev, "Reseved loop qp is absent!\n"); ++ goto free_work; ++ } ++ + do { + ret = hns_roce_v1_poll_cq(&mr_free_cq->ib_cq, ne, wc); + if (ret < 0) { +-- +2.15.0 + diff --git a/queue/backlight-pwm_bl-Fix-overflow-condition.patch b/queue/backlight-pwm_bl-Fix-overflow-condition.patch new file mode 100644 index 0000000..c786bda --- /dev/null +++ b/queue/backlight-pwm_bl-Fix-overflow-condition.patch @@ -0,0 +1,43 @@ +From 5d0c49acebc9488e37db95f1d4a55644e545ffe7 Mon Sep 17 00:00:00 2001 +From: Derek Basehore <dbasehore@chromium.org> +Date: Tue, 29 Aug 2017 13:34:34 -0700 +Subject: [PATCH] backlight: pwm_bl: Fix overflow condition + +commit 5d0c49acebc9488e37db95f1d4a55644e545ffe7 upstream. + +This fixes an overflow condition that can happen with high max +brightness and period values in compute_duty_cycle. This fixes it by +using a 64 bit variable for computing the duty cycle. 
+ +Signed-off-by: Derek Basehore <dbasehore@chromium.org> +Acked-by: Thierry Reding <thierry.reding@gmail.com> +Reviewed-by: Brian Norris <briannorris@chromium.org> +Signed-off-by: Lee Jones <lee.jones@linaro.org> + +diff --git a/drivers/video/backlight/pwm_bl.c b/drivers/video/backlight/pwm_bl.c +index 9bd17682655a..1c2289ddd555 100644 +--- a/drivers/video/backlight/pwm_bl.c ++++ b/drivers/video/backlight/pwm_bl.c +@@ -79,14 +79,17 @@ static void pwm_backlight_power_off(struct pwm_bl_data *pb) + static int compute_duty_cycle(struct pwm_bl_data *pb, int brightness) + { + unsigned int lth = pb->lth_brightness; +- int duty_cycle; ++ u64 duty_cycle; + + if (pb->levels) + duty_cycle = pb->levels[brightness]; + else + duty_cycle = brightness; + +- return (duty_cycle * (pb->period - lth) / pb->scale) + lth; ++ duty_cycle *= pb->period - lth; ++ do_div(duty_cycle, pb->scale); ++ ++ return duty_cycle + lth; + } + + static int pwm_backlight_update_status(struct backlight_device *bl) +-- +2.15.0 + diff --git a/queue/bitops-Add-clear-set_bit32-to-linux-bitops.h.patch b/queue/bitops-Add-clear-set_bit32-to-linux-bitops.h.patch new file mode 100644 index 0000000..d11f49d --- /dev/null +++ b/queue/bitops-Add-clear-set_bit32-to-linux-bitops.h.patch @@ -0,0 +1,58 @@ +From cbe96375025e14fc76f9ed42ee5225120d7210f8 Mon Sep 17 00:00:00 2001 +From: Andi Kleen <ak@linux.intel.com> +Date: Fri, 13 Oct 2017 14:56:41 -0700 +Subject: [PATCH] bitops: Add clear/set_bit32() to linux/bitops.h + +commit cbe96375025e14fc76f9ed42ee5225120d7210f8 upstream. + +Add two simple wrappers around set_bit/clear_bit() that accept +the common case of an u32 array. This avoids writing +casts in all callers. 
+ +Signed-off-by: Andi Kleen <ak@linux.intel.com> +Reviewed-by: Thomas Gleixner <tglx@linutronix.de> +Cc: Linus Torvalds <torvalds@linux-foundation.org> +Cc: Peter Zijlstra <peterz@infradead.org> +Link: http://lkml.kernel.org/r/20171013215645.23166-2-andi@firstfloor.org +Signed-off-by: Ingo Molnar <mingo@kernel.org> + +diff --git a/include/linux/bitops.h b/include/linux/bitops.h +index 8fbe259b197c..36794f058ba6 100644 +--- a/include/linux/bitops.h ++++ b/include/linux/bitops.h +@@ -227,6 +227,32 @@ static inline unsigned long __ffs64(u64 word) + return __ffs((unsigned long)word); + } + ++/* ++ * clear_bit32 - Clear a bit in memory for u32 array ++ * @nr: Bit to clear ++ * @addr: u32 * address of bitmap ++ * ++ * Same as clear_bit, but avoids needing casts for u32 arrays. ++ */ ++ ++static __always_inline void clear_bit32(long nr, volatile u32 *addr) ++{ ++ clear_bit(nr, (volatile unsigned long *)addr); ++} ++ ++/* ++ * set_bit32 - Set a bit in memory for u32 array ++ * @nr: Bit to clear ++ * @addr: u32 * address of bitmap ++ * ++ * Same as set_bit, but avoids needing casts for u32 arrays. ++ */ ++ ++static __always_inline void set_bit32(long nr, volatile u32 *addr) ++{ ++ set_bit(nr, (volatile unsigned long *)addr); ++} ++ + #ifdef __KERNEL__ + + #ifndef set_mask_bits +-- +2.15.0 + diff --git a/queue/bitops-Revert-cbe96375025e-bitops-Add-clear-set_bit3.patch b/queue/bitops-Revert-cbe96375025e-bitops-Add-clear-set_bit3.patch new file mode 100644 index 0000000..8dd6518 --- /dev/null +++ b/queue/bitops-Revert-cbe96375025e-bitops-Add-clear-set_bit3.patch @@ -0,0 +1,56 @@ +From 1943dc07b45e347c52c1bfdd4a37e04a86e399aa Mon Sep 17 00:00:00 2001 +From: Thomas Gleixner <tglx@linutronix.de> +Date: Thu, 2 Nov 2017 13:30:03 +0100 +Subject: [PATCH] bitops: Revert cbe96375025e ("bitops: Add clear/set_bit32() + to linux/bitops.h") + +commit 1943dc07b45e347c52c1bfdd4a37e04a86e399aa upstream. 
+ +These ops are not endian safe and may break on architectures which have +alignment requirements. + +Reverts: cbe96375025e ("bitops: Add clear/set_bit32() to linux/bitops.h") +Reported-by: Peter Zijlstra <peterz@infradead.org> +Signed-off-by: Thomas Gleixner <tglx@linutronix.de> +Cc: Andi Kleen <ak@linux.intel.com> + +diff --git a/include/linux/bitops.h b/include/linux/bitops.h +index 36794f058ba6..8fbe259b197c 100644 +--- a/include/linux/bitops.h ++++ b/include/linux/bitops.h +@@ -227,32 +227,6 @@ static inline unsigned long __ffs64(u64 word) + return __ffs((unsigned long)word); + } + +-/* +- * clear_bit32 - Clear a bit in memory for u32 array +- * @nr: Bit to clear +- * @addr: u32 * address of bitmap +- * +- * Same as clear_bit, but avoids needing casts for u32 arrays. +- */ +- +-static __always_inline void clear_bit32(long nr, volatile u32 *addr) +-{ +- clear_bit(nr, (volatile unsigned long *)addr); +-} +- +-/* +- * set_bit32 - Set a bit in memory for u32 array +- * @nr: Bit to clear +- * @addr: u32 * address of bitmap +- * +- * Same as set_bit, but avoids needing casts for u32 arrays. +- */ +- +-static __always_inline void set_bit32(long nr, volatile u32 *addr) +-{ +- set_bit(nr, (volatile unsigned long *)addr); +-} +- + #ifdef __KERNEL__ + + #ifndef set_mask_bits +-- +2.15.0 + diff --git a/queue/block-bfq-Disable-writeback-throttling.patch b/queue/block-bfq-Disable-writeback-throttling.patch new file mode 100644 index 0000000..856b155 --- /dev/null +++ b/queue/block-bfq-Disable-writeback-throttling.patch @@ -0,0 +1,56 @@ +From b5dc5d4d1f4ff9032eb6c21a3c571a1317dc9289 Mon Sep 17 00:00:00 2001 +From: Luca Miccio <lucmiccio@gmail.com> +Date: Mon, 9 Oct 2017 16:27:21 +0200 +Subject: [PATCH] block,bfq: Disable writeback throttling + +commit b5dc5d4d1f4ff9032eb6c21a3c571a1317dc9289 upstream. + +Similarly to CFQ, BFQ has its write-throttling heuristics, and it +is better not to combine them with further write-throttling +heuristics of a different nature. 
+So this commit disables write-back throttling for a device if BFQ +is used as I/O scheduler for that device. + +Signed-off-by: Luca Miccio <lucmiccio@gmail.com> +Signed-off-by: Paolo Valente <paolo.valente@linaro.org> +Tested-by: Oleksandr Natalenko <oleksandr@natalenko.name> +Tested-by: Lee Tibbert <lee.tibbert@gmail.com> +Signed-off-by: Jens Axboe <axboe@kernel.dk> + +diff --git a/block/bfq-iosched.c b/block/bfq-iosched.c +index 70f9177c4f5b..261f98695910 100644 +--- a/block/bfq-iosched.c ++++ b/block/bfq-iosched.c +@@ -108,6 +108,7 @@ + #include "blk-mq-tag.h" + #include "blk-mq-sched.h" + #include "bfq-iosched.h" ++#include "blk-wbt.h" + + #define BFQ_BFQQ_FNS(name) \ + void bfq_mark_bfqq_##name(struct bfq_queue *bfqq) \ +@@ -4810,7 +4811,7 @@ static int bfq_init_queue(struct request_queue *q, struct elevator_type *e) + bfq_init_root_group(bfqd->root_group, bfqd); + bfq_init_entity(&bfqd->oom_bfqq.entity, bfqd->root_group); + +- ++ wbt_disable_default(q); + return 0; + + out_free: +diff --git a/block/blk-wbt.c b/block/blk-wbt.c +index 6a9a0f03a67b..e59d59c11ebb 100644 +--- a/block/blk-wbt.c ++++ b/block/blk-wbt.c +@@ -654,7 +654,7 @@ void wbt_set_write_cache(struct rq_wb *rwb, bool write_cache_on) + } + + /* +- * Disable wbt, if enabled by default. Only called from CFQ. ++ * Disable wbt, if enabled by default. + */ + void wbt_disable_default(struct request_queue *q) + { +-- +2.15.0 + diff --git a/queue/bpf-don-t-prune-branches-when-a-scalar-is-replaced-w.patch b/queue/bpf-don-t-prune-branches-when-a-scalar-is-replaced-w.patch new file mode 100644 index 0000000..a472206 --- /dev/null +++ b/queue/bpf-don-t-prune-branches-when-a-scalar-is-replaced-w.patch @@ -0,0 +1,48 @@ +From 179d1c5602997fef5a940c6ddcf31212cbfebd14 Mon Sep 17 00:00:00 2001 +From: Jann Horn <jannh@google.com> +Date: Mon, 18 Dec 2017 20:11:59 -0800 +Subject: [PATCH] bpf: don't prune branches when a scalar is replaced with a + pointer + +commit 179d1c5602997fef5a940c6ddcf31212cbfebd14 upstream. 
+ +This could be made safe by passing through a reference to env and checking +for env->allow_ptr_leaks, but it would only work one way and is probably +not worth the hassle - not doing it will not directly lead to program +rejection. + +Fixes: f1174f77b50c ("bpf/verifier: rework value tracking") +Signed-off-by: Jann Horn <jannh@google.com> +Signed-off-by: Alexei Starovoitov <ast@kernel.org> +Signed-off-by: Daniel Borkmann <daniel@iogearbox.net> + +diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c +index 102c519836f6..982bd9ec721a 100644 +--- a/kernel/bpf/verifier.c ++++ b/kernel/bpf/verifier.c +@@ -3467,15 +3467,14 @@ static bool regsafe(struct bpf_reg_state *rold, struct bpf_reg_state *rcur, + return range_within(rold, rcur) && + tnum_in(rold->var_off, rcur->var_off); + } else { +- /* if we knew anything about the old value, we're not +- * equal, because we can't know anything about the +- * scalar value of the pointer in the new value. ++ /* We're trying to use a pointer in place of a scalar. ++ * Even if the scalar was unbounded, this could lead to ++ * pointer leaks because scalars are allowed to leak ++ * while pointers are not. We could make this safe in ++ * special cases if root is calling us, but it's ++ * probably not worth the hassle. 
+ */ +- return rold->umin_value == 0 && +- rold->umax_value == U64_MAX && +- rold->smin_value == S64_MIN && +- rold->smax_value == S64_MAX && +- tnum_is_unknown(rold->var_off); ++ return false; + } + case PTR_TO_MAP_VALUE: + /* If the new min/max/var_off satisfy the old ones and +-- +2.15.0 + diff --git a/queue/bpf-fix-32-bit-ALU-op-verification.patch b/queue/bpf-fix-32-bit-ALU-op-verification.patch new file mode 100644 index 0000000..3c7d210 --- /dev/null +++ b/queue/bpf-fix-32-bit-ALU-op-verification.patch @@ -0,0 +1,86 @@ +From 468f6eafa6c44cb2c5d8aad35e12f06c240a812a Mon Sep 17 00:00:00 2001 +From: Jann Horn <jannh@google.com> +Date: Mon, 18 Dec 2017 20:11:56 -0800 +Subject: [PATCH] bpf: fix 32-bit ALU op verification + +commit 468f6eafa6c44cb2c5d8aad35e12f06c240a812a upstream. + +32-bit ALU ops operate on 32-bit values and have 32-bit outputs. +Adjust the verifier accordingly. + +Fixes: f1174f77b50c ("bpf/verifier: rework value tracking") +Signed-off-by: Jann Horn <jannh@google.com> +Signed-off-by: Alexei Starovoitov <ast@kernel.org> +Signed-off-by: Daniel Borkmann <daniel@iogearbox.net> + +diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c +index f716bdf29dd0..ecdc265244ca 100644 +--- a/kernel/bpf/verifier.c ++++ b/kernel/bpf/verifier.c +@@ -2017,6 +2017,10 @@ static int adjust_ptr_min_max_vals(struct bpf_verifier_env *env, + return 0; + } + ++/* WARNING: This function does calculations on 64-bit values, but the actual ++ * execution may occur on 32-bit values. Therefore, things like bitshifts ++ * need extra checks in the 32-bit case. ++ */ + static int adjust_scalar_min_max_vals(struct bpf_verifier_env *env, + struct bpf_insn *insn, + struct bpf_reg_state *dst_reg, +@@ -2027,12 +2031,8 @@ static int adjust_scalar_min_max_vals(struct bpf_verifier_env *env, + bool src_known, dst_known; + s64 smin_val, smax_val; + u64 umin_val, umax_val; ++ u64 insn_bitness = (BPF_CLASS(insn->code) == BPF_ALU64) ? 
64 : 32; + +- if (BPF_CLASS(insn->code) != BPF_ALU64) { +- /* 32-bit ALU ops are (32,32)->64 */ +- coerce_reg_to_size(dst_reg, 4); +- coerce_reg_to_size(&src_reg, 4); +- } + smin_val = src_reg.smin_value; + smax_val = src_reg.smax_value; + umin_val = src_reg.umin_value; +@@ -2168,9 +2168,9 @@ static int adjust_scalar_min_max_vals(struct bpf_verifier_env *env, + __update_reg_bounds(dst_reg); + break; + case BPF_LSH: +- if (umax_val > 63) { +- /* Shifts greater than 63 are undefined. This includes +- * shifts by a negative number. ++ if (umax_val >= insn_bitness) { ++ /* Shifts greater than 31 or 63 are undefined. ++ * This includes shifts by a negative number. + */ + mark_reg_unknown(env, regs, insn->dst_reg); + break; +@@ -2196,9 +2196,9 @@ static int adjust_scalar_min_max_vals(struct bpf_verifier_env *env, + __update_reg_bounds(dst_reg); + break; + case BPF_RSH: +- if (umax_val > 63) { +- /* Shifts greater than 63 are undefined. This includes +- * shifts by a negative number. ++ if (umax_val >= insn_bitness) { ++ /* Shifts greater than 31 or 63 are undefined. ++ * This includes shifts by a negative number. 
+ */ + mark_reg_unknown(env, regs, insn->dst_reg); + break; +@@ -2234,6 +2234,12 @@ static int adjust_scalar_min_max_vals(struct bpf_verifier_env *env, + break; + } + ++ if (BPF_CLASS(insn->code) != BPF_ALU64) { ++ /* 32-bit ALU ops are (32,32)->32 */ ++ coerce_reg_to_size(dst_reg, 4); ++ coerce_reg_to_size(&src_reg, 4); ++ } ++ + __reg_deduce_bounds(dst_reg); + __reg_bound_offset(dst_reg); + return 0; +-- +2.15.0 + diff --git a/queue/bpf-fix-branch-pruning-logic.patch b/queue/bpf-fix-branch-pruning-logic.patch new file mode 100644 index 0000000..cfed4a8 --- /dev/null +++ b/queue/bpf-fix-branch-pruning-logic.patch @@ -0,0 +1,117 @@ +From c131187db2d3fa2f8bf32fdf4e9a4ef805168467 Mon Sep 17 00:00:00 2001 +From: Alexei Starovoitov <ast@fb.com> +Date: Wed, 22 Nov 2017 16:42:05 -0800 +Subject: [PATCH] bpf: fix branch pruning logic + +commit c131187db2d3fa2f8bf32fdf4e9a4ef805168467 upstream. + +when the verifier detects that register contains a runtime constant +and it's compared with another constant it will prune exploration +of the branch that is guaranteed not to be taken at runtime. +This is all correct, but malicious program may be constructed +in such a way that it always has a constant comparison and +the other branch is never taken under any conditions. +In this case such path through the program will not be explored +by the verifier. It won't be taken at run-time either, but since +all instructions are JITed the malicious program may cause JITs +to complain about using reserved fields, etc. +To fix the issue we have to track the instructions explored by +the verifier and sanitize instructions that are dead at run time +with NOPs. We cannot reject such dead code, since llvm generates +it for valid C code, since it doesn't do as much data flow +analysis as the verifier does. 
+ +Fixes: 17a5267067f3 ("bpf: verifier (add verifier core)") +Signed-off-by: Alexei Starovoitov <ast@kernel.org> +Acked-by: Daniel Borkmann <daniel@iogearbox.net> +Signed-off-by: Daniel Borkmann <daniel@iogearbox.net> + +diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h +index b61482d354a2..c561b986bab0 100644 +--- a/include/linux/bpf_verifier.h ++++ b/include/linux/bpf_verifier.h +@@ -115,7 +115,7 @@ struct bpf_insn_aux_data { + struct bpf_map *map_ptr; /* pointer for call insn into lookup_elem */ + }; + int ctx_field_size; /* the ctx field size for load insn, maybe 0 */ +- int converted_op_size; /* the valid value width after perceived conversion */ ++ bool seen; /* this insn was processed by the verifier */ + }; + + #define MAX_USED_MAPS 64 /* max number of maps accessed by one eBPF program */ +diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c +index 308b0638ec5d..d4593571c404 100644 +--- a/kernel/bpf/verifier.c ++++ b/kernel/bpf/verifier.c +@@ -3827,6 +3827,7 @@ static int do_check(struct bpf_verifier_env *env) + return err; + + regs = cur_regs(env); ++ env->insn_aux_data[insn_idx].seen = true; + if (class == BPF_ALU || class == BPF_ALU64) { + err = check_alu_op(env, insn); + if (err) +@@ -4022,6 +4023,7 @@ static int do_check(struct bpf_verifier_env *env) + return err; + + insn_idx++; ++ env->insn_aux_data[insn_idx].seen = true; + } else { + verbose(env, "invalid BPF_LD mode\n"); + return -EINVAL; +@@ -4204,6 +4206,7 @@ static int adjust_insn_aux_data(struct bpf_verifier_env *env, u32 prog_len, + u32 off, u32 cnt) + { + struct bpf_insn_aux_data *new_data, *old_data = env->insn_aux_data; ++ int i; + + if (cnt == 1) + return 0; +@@ -4213,6 +4216,8 @@ static int adjust_insn_aux_data(struct bpf_verifier_env *env, u32 prog_len, + memcpy(new_data, old_data, sizeof(struct bpf_insn_aux_data) * off); + memcpy(new_data + off + cnt - 1, old_data + off, + sizeof(struct bpf_insn_aux_data) * (prog_len - off - cnt + 1)); ++ for (i = off; i < 
off + cnt - 1; i++) ++ new_data[i].seen = true; + env->insn_aux_data = new_data; + vfree(old_data); + return 0; +@@ -4231,6 +4236,25 @@ static struct bpf_prog *bpf_patch_insn_data(struct bpf_verifier_env *env, u32 of + return new_prog; + } + ++/* The verifier does more data flow analysis than llvm and will not explore ++ * branches that are dead at run time. Malicious programs can have dead code ++ * too. Therefore replace all dead at-run-time code with nops. ++ */ ++static void sanitize_dead_code(struct bpf_verifier_env *env) ++{ ++ struct bpf_insn_aux_data *aux_data = env->insn_aux_data; ++ struct bpf_insn nop = BPF_MOV64_REG(BPF_REG_0, BPF_REG_0); ++ struct bpf_insn *insn = env->prog->insnsi; ++ const int insn_cnt = env->prog->len; ++ int i; ++ ++ for (i = 0; i < insn_cnt; i++) { ++ if (aux_data[i].seen) ++ continue; ++ memcpy(insn + i, &nop, sizeof(nop)); ++ } ++} ++ + /* convert load instructions that access fields of 'struct __sk_buff' + * into sequence of instructions that access fields of 'struct sk_buff' + */ +@@ -4557,6 +4581,9 @@ int bpf_check(struct bpf_prog **prog, union bpf_attr *attr) + while (!pop_stack(env, NULL, NULL)); + free_states(env); + ++ if (ret == 0) ++ sanitize_dead_code(env); ++ + if (ret == 0) + /* program is valid, convert *(u32*)(ctx + off) accesses */ + ret = convert_ctx_accesses(env); +-- +2.15.0 + diff --git a/queue/bpf-fix-build-issues-on-um-due-to-mising-bpf_perf_ev.patch b/queue/bpf-fix-build-issues-on-um-due-to-mising-bpf_perf_ev.patch new file mode 100644 index 0000000..9c2dbfa --- /dev/null +++ b/queue/bpf-fix-build-issues-on-um-due-to-mising-bpf_perf_ev.patch @@ -0,0 +1,57 @@ +From ab95477e7cb35557ecfc837687007b646bab9a9f Mon Sep 17 00:00:00 2001 +From: Daniel Borkmann <daniel@iogearbox.net> +Date: Tue, 12 Dec 2017 02:25:31 +0100 +Subject: [PATCH] bpf: fix build issues on um due to mising bpf_perf_event.h + +commit ab95477e7cb35557ecfc837687007b646bab9a9f upstream. 
+ +[ Note, this is a Git cherry-pick of the following commit: + + a23f06f06dbe ("bpf: fix build issues on um due to mising bpf_perf_event.h") + + ... for easier x86 PTI code testing and back-porting. ] + +Since c895f6f703ad ("bpf: correct broken uapi for +BPF_PROG_TYPE_PERF_EVENT program type") um (uml) won't build +on i386 or x86_64: + + [...] + CC init/main.o + In file included from ../include/linux/perf_event.h:18:0, + from ../include/linux/trace_events.h:10, + from ../include/trace/syscall.h:7, + from ../include/linux/syscalls.h:82, + from ../init/main.c:20: + ../include/uapi/linux/bpf_perf_event.h:11:32: fatal error: + asm/bpf_perf_event.h: No such file or directory #include + <asm/bpf_perf_event.h> + [...] + +Lets add missing bpf_perf_event.h also to um arch. This seems +to be the only one still missing. + +Fixes: c895f6f703ad ("bpf: correct broken uapi for BPF_PROG_TYPE_PERF_EVENT program type") +Reported-by: Randy Dunlap <rdunlap@infradead.org> +Suggested-by: Richard Weinberger <richard@sigma-star.at> +Signed-off-by: Daniel Borkmann <daniel@iogearbox.net> +Tested-by: Randy Dunlap <rdunlap@infradead.org> +Cc: Hendrik Brueckner <brueckner@linux.vnet.ibm.com> +Cc: Richard Weinberger <richard@sigma-star.at> +Acked-by: Alexei Starovoitov <ast@kernel.org> +Acked-by: Richard Weinberger <richard@nod.at> +Signed-off-by: Alexei Starovoitov <ast@kernel.org> +Signed-off-by: Ingo Molnar <mingo@kernel.org> + +diff --git a/arch/um/include/asm/Kbuild b/arch/um/include/asm/Kbuild +index 50a32c33d729..73c57f614c9e 100644 +--- a/arch/um/include/asm/Kbuild ++++ b/arch/um/include/asm/Kbuild +@@ -1,4 +1,5 @@ + generic-y += barrier.h ++generic-y += bpf_perf_event.h + generic-y += bug.h + generic-y += clkdev.h + generic-y += current.h +-- +2.15.0 + diff --git a/queue/bpf-fix-corruption-on-concurrent-perf_event_output-c.patch b/queue/bpf-fix-corruption-on-concurrent-perf_event_output-c.patch new file mode 100644 index 0000000..70c786d --- /dev/null +++ 
b/queue/bpf-fix-corruption-on-concurrent-perf_event_output-c.patch @@ -0,0 +1,108 @@ +From 283ca526a9bd75aed7350220d7b1f8027d99c3fd Mon Sep 17 00:00:00 2001 +From: Daniel Borkmann <daniel@iogearbox.net> +Date: Tue, 12 Dec 2017 02:25:30 +0100 +Subject: [PATCH] bpf: fix corruption on concurrent perf_event_output calls + +commit 283ca526a9bd75aed7350220d7b1f8027d99c3fd upstream. + +When tracing and networking programs are both attached in the +system and both use event-output helpers that eventually call +into perf_event_output(), then we could end up in a situation +where the tracing attached program runs in user context while +a cls_bpf program is triggered on that same CPU out of softirq +context. + +Since both rely on the same per-cpu perf_sample_data, we could +potentially corrupt it. This can only ever happen in a combination +of the two types; all tracing programs use a bpf_prog_active +counter to bail out in case a program is already running on +that CPU out of a different context. XDP and cls_bpf programs +by themselves don't have this issue as they run in the same +context only. Therefore, split both perf_sample_data so they +cannot be accessed from each other. 
+ +Fixes: 20b9d7ac4852 ("bpf: avoid excessive stack usage for perf_sample_data") +Reported-by: Alexei Starovoitov <ast@fb.com> +Signed-off-by: Daniel Borkmann <daniel@iogearbox.net> +Tested-by: Song Liu <songliubraving@fb.com> +Acked-by: Alexei Starovoitov <ast@kernel.org> +Signed-off-by: Alexei Starovoitov <ast@kernel.org> + +diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c +index 0ce99c379c30..40207c2a4113 100644 +--- a/kernel/trace/bpf_trace.c ++++ b/kernel/trace/bpf_trace.c +@@ -343,14 +343,13 @@ static const struct bpf_func_proto bpf_perf_event_read_value_proto = { + .arg4_type = ARG_CONST_SIZE, + }; + +-static DEFINE_PER_CPU(struct perf_sample_data, bpf_sd); ++static DEFINE_PER_CPU(struct perf_sample_data, bpf_trace_sd); + + static __always_inline u64 + __bpf_perf_event_output(struct pt_regs *regs, struct bpf_map *map, +- u64 flags, struct perf_raw_record *raw) ++ u64 flags, struct perf_sample_data *sd) + { + struct bpf_array *array = container_of(map, struct bpf_array, map); +- struct perf_sample_data *sd = this_cpu_ptr(&bpf_sd); + unsigned int cpu = smp_processor_id(); + u64 index = flags & BPF_F_INDEX_MASK; + struct bpf_event_entry *ee; +@@ -373,8 +372,6 @@ __bpf_perf_event_output(struct pt_regs *regs, struct bpf_map *map, + if (unlikely(event->oncpu != cpu)) + return -EOPNOTSUPP; + +- perf_sample_data_init(sd, 0, 0); +- sd->raw = raw; + perf_event_output(event, sd, regs); + return 0; + } +@@ -382,6 +379,7 @@ __bpf_perf_event_output(struct pt_regs *regs, struct bpf_map *map, + BPF_CALL_5(bpf_perf_event_output, struct pt_regs *, regs, struct bpf_map *, map, + u64, flags, void *, data, u64, size) + { ++ struct perf_sample_data *sd = this_cpu_ptr(&bpf_trace_sd); + struct perf_raw_record raw = { + .frag = { + .size = size, +@@ -392,7 +390,10 @@ BPF_CALL_5(bpf_perf_event_output, struct pt_regs *, regs, struct bpf_map *, map, + if (unlikely(flags & ~(BPF_F_INDEX_MASK))) + return -EINVAL; + +- return __bpf_perf_event_output(regs, map, flags, 
&raw); ++ perf_sample_data_init(sd, 0, 0); ++ sd->raw = &raw; ++ ++ return __bpf_perf_event_output(regs, map, flags, sd); + } + + static const struct bpf_func_proto bpf_perf_event_output_proto = { +@@ -407,10 +408,12 @@ static const struct bpf_func_proto bpf_perf_event_output_proto = { + }; + + static DEFINE_PER_CPU(struct pt_regs, bpf_pt_regs); ++static DEFINE_PER_CPU(struct perf_sample_data, bpf_misc_sd); + + u64 bpf_event_output(struct bpf_map *map, u64 flags, void *meta, u64 meta_size, + void *ctx, u64 ctx_size, bpf_ctx_copy_t ctx_copy) + { ++ struct perf_sample_data *sd = this_cpu_ptr(&bpf_misc_sd); + struct pt_regs *regs = this_cpu_ptr(&bpf_pt_regs); + struct perf_raw_frag frag = { + .copy = ctx_copy, +@@ -428,8 +431,10 @@ u64 bpf_event_output(struct bpf_map *map, u64 flags, void *meta, u64 meta_size, + }; + + perf_fetch_caller_regs(regs); ++ perf_sample_data_init(sd, 0, 0); ++ sd->raw = &raw; + +- return __bpf_perf_event_output(regs, map, flags, &raw); ++ return __bpf_perf_event_output(regs, map, flags, sd); + } + + BPF_CALL_0(bpf_get_current_task) +-- +2.15.0 + diff --git a/queue/bpf-fix-incorrect-sign-extension-in-check_alu_op.patch b/queue/bpf-fix-incorrect-sign-extension-in-check_alu_op.patch new file mode 100644 index 0000000..25178f6 --- /dev/null +++ b/queue/bpf-fix-incorrect-sign-extension-in-check_alu_op.patch @@ -0,0 +1,48 @@ +From 95a762e2c8c942780948091f8f2a4f32fce1ac6f Mon Sep 17 00:00:00 2001 +From: Jann Horn <jannh@google.com> +Date: Mon, 18 Dec 2017 20:11:54 -0800 +Subject: [PATCH] bpf: fix incorrect sign extension in check_alu_op() + +commit 95a762e2c8c942780948091f8f2a4f32fce1ac6f upstream. + +Distinguish between +BPF_ALU64|BPF_MOV|BPF_K (load 32-bit immediate, sign-extended to 64-bit) +and BPF_ALU|BPF_MOV|BPF_K (load 32-bit immediate, zero-padded to 64-bit); +only perform sign extension in the first case. + +Starting with v4.14, this is exploitable by unprivileged users as long as +the unprivileged_bpf_disabled sysctl isn't set. 
+ +Debian assigned CVE-2017-16995 for this issue. + +v3: + - add CVE number (Ben Hutchings) + +Fixes: 484611357c19 ("bpf: allow access into map value arrays") +Signed-off-by: Jann Horn <jannh@google.com> +Acked-by: Edward Cree <ecree@solarflare.com> +Signed-off-by: Alexei Starovoitov <ast@kernel.org> +Signed-off-by: Daniel Borkmann <daniel@iogearbox.net> + +diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c +index 625e358ca765..c086010ae51e 100644 +--- a/kernel/bpf/verifier.c ++++ b/kernel/bpf/verifier.c +@@ -2408,7 +2408,13 @@ static int check_alu_op(struct bpf_verifier_env *env, struct bpf_insn *insn) + * remember the value we stored into this reg + */ + regs[insn->dst_reg].type = SCALAR_VALUE; +- __mark_reg_known(regs + insn->dst_reg, insn->imm); ++ if (BPF_CLASS(insn->code) == BPF_ALU64) { ++ __mark_reg_known(regs + insn->dst_reg, ++ insn->imm); ++ } else { ++ __mark_reg_known(regs + insn->dst_reg, ++ (u32)insn->imm); ++ } + } + + } else if (opcode > BPF_END) { +-- +2.15.0 + diff --git a/queue/bpf-fix-incorrect-tracking-of-register-size-truncati.patch b/queue/bpf-fix-incorrect-tracking-of-register-size-truncati.patch new file mode 100644 index 0000000..2013381 --- /dev/null +++ b/queue/bpf-fix-incorrect-tracking-of-register-size-truncati.patch @@ -0,0 +1,122 @@ +From 0c17d1d2c61936401f4702e1846e2c19b200f958 Mon Sep 17 00:00:00 2001 +From: Jann Horn <jannh@google.com> +Date: Mon, 18 Dec 2017 20:11:55 -0800 +Subject: [PATCH] bpf: fix incorrect tracking of register size truncation + +commit 0c17d1d2c61936401f4702e1846e2c19b200f958 upstream. + +Properly handle register truncation to a smaller size. + +The old code first mirrors the clearing of the high 32 bits in the bitwise +tristate representation, which is correct. But then, it computes the new +arithmetic bounds as the intersection between the old arithmetic bounds and +the bounds resulting from the bitwise tristate representation. 
Therefore, +when coerce_reg_to_32() is called on a number with bounds +[0xffff'fff8, 0x1'0000'0007], the verifier computes +[0xffff'fff8, 0xffff'ffff] as bounds of the truncated number. +This is incorrect: The truncated number could also be in the range [0, 7], +and no meaningful arithmetic bounds can be computed in that case apart from +the obvious [0, 0xffff'ffff]. + +Starting with v4.14, this is exploitable by unprivileged users as long as +the unprivileged_bpf_disabled sysctl isn't set. + +Debian assigned CVE-2017-16996 for this issue. + +v2: + - flip the mask during arithmetic bounds calculation (Ben Hutchings) +v3: + - add CVE number (Ben Hutchings) + +Fixes: b03c9f9fdc37 ("bpf/verifier: track signed and unsigned min/max values") +Signed-off-by: Jann Horn <jannh@google.com> +Acked-by: Edward Cree <ecree@solarflare.com> +Signed-off-by: Alexei Starovoitov <ast@kernel.org> +Signed-off-by: Daniel Borkmann <daniel@iogearbox.net> + +diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c +index c086010ae51e..f716bdf29dd0 100644 +--- a/kernel/bpf/verifier.c ++++ b/kernel/bpf/verifier.c +@@ -1067,6 +1067,29 @@ static int check_ptr_alignment(struct bpf_verifier_env *env, + strict); + } + ++/* truncate register to smaller size (in bytes) ++ * must be called with size < BPF_REG_SIZE ++ */ ++static void coerce_reg_to_size(struct bpf_reg_state *reg, int size) ++{ ++ u64 mask; ++ ++ /* clear high bits in bit representation */ ++ reg->var_off = tnum_cast(reg->var_off, size); ++ ++ /* fix arithmetic bounds */ ++ mask = ((u64)1 << (size * 8)) - 1; ++ if ((reg->umin_value & ~mask) == (reg->umax_value & ~mask)) { ++ reg->umin_value &= mask; ++ reg->umax_value &= mask; ++ } else { ++ reg->umin_value = 0; ++ reg->umax_value = mask; ++ } ++ reg->smin_value = reg->umin_value; ++ reg->smax_value = reg->umax_value; ++} ++ + /* check whether memory at (regno + off) is accessible for t = (read | write) + * if t==write, value_regno is a register which value is stored into memory + * 
if t==read, value_regno is a register which will receive the value from memory +@@ -1200,9 +1223,7 @@ static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regn + if (!err && size < BPF_REG_SIZE && value_regno >= 0 && t == BPF_READ && + regs[value_regno].type == SCALAR_VALUE) { + /* b/h/w load zero-extends, mark upper bits as known 0 */ +- regs[value_regno].var_off = +- tnum_cast(regs[value_regno].var_off, size); +- __update_reg_bounds(&regs[value_regno]); ++ coerce_reg_to_size(&regs[value_regno], size); + } + return err; + } +@@ -1772,14 +1793,6 @@ static int check_call(struct bpf_verifier_env *env, int func_id, int insn_idx) + return 0; + } + +-static void coerce_reg_to_32(struct bpf_reg_state *reg) +-{ +- /* clear high 32 bits */ +- reg->var_off = tnum_cast(reg->var_off, 4); +- /* Update bounds */ +- __update_reg_bounds(reg); +-} +- + static bool signed_add_overflows(s64 a, s64 b) + { + /* Do the add in u64, where overflow is well-defined */ +@@ -2017,8 +2030,8 @@ static int adjust_scalar_min_max_vals(struct bpf_verifier_env *env, + + if (BPF_CLASS(insn->code) != BPF_ALU64) { + /* 32-bit ALU ops are (32,32)->64 */ +- coerce_reg_to_32(dst_reg); +- coerce_reg_to_32(&src_reg); ++ coerce_reg_to_size(dst_reg, 4); ++ coerce_reg_to_size(&src_reg, 4); + } + smin_val = src_reg.smin_value; + smax_val = src_reg.smax_value; +@@ -2398,10 +2411,7 @@ static int check_alu_op(struct bpf_verifier_env *env, struct bpf_insn *insn) + return -EACCES; + } + mark_reg_unknown(env, regs, insn->dst_reg); +- /* high 32 bits are known zero. 
*/ +- regs[insn->dst_reg].var_off = tnum_cast( +- regs[insn->dst_reg].var_off, 4); +- __update_reg_bounds(&regs[insn->dst_reg]); ++ coerce_reg_to_size(&regs[insn->dst_reg], 4); + } + } else { + /* case: R = imm +-- +2.15.0 + diff --git a/queue/bpf-fix-integer-overflows.patch b/queue/bpf-fix-integer-overflows.patch new file mode 100644 index 0000000..f431312 --- /dev/null +++ b/queue/bpf-fix-integer-overflows.patch @@ -0,0 +1,126 @@ +From bb7f0f989ca7de1153bd128a40a71709e339fa03 Mon Sep 17 00:00:00 2001 +From: Alexei Starovoitov <ast@kernel.org> +Date: Mon, 18 Dec 2017 20:12:00 -0800 +Subject: [PATCH] bpf: fix integer overflows + +commit bb7f0f989ca7de1153bd128a40a71709e339fa03 upstream. + +There were various issues related to the limited size of integers used in +the verifier: + - `off + size` overflow in __check_map_access() + - `off + reg->off` overflow in check_mem_access() + - `off + reg->var_off.value` overflow or 32-bit truncation of + `reg->var_off.value` in check_mem_access() + - 32-bit truncation in check_stack_boundary() + +Make sure that any integer math cannot overflow by not allowing +pointer math with large values. + +Also reduce the scope of "scalar op scalar" tracking. + +Fixes: f1174f77b50c ("bpf/verifier: rework value tracking") +Reported-by: Jann Horn <jannh@google.com> +Signed-off-by: Alexei Starovoitov <ast@kernel.org> +Signed-off-by: Daniel Borkmann <daniel@iogearbox.net> + +diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h +index c561b986bab0..1632bb13ad8a 100644 +--- a/include/linux/bpf_verifier.h ++++ b/include/linux/bpf_verifier.h +@@ -15,11 +15,11 @@ + * In practice this is far bigger than any realistic pointer offset; this limit + * ensures that umax_value + (int)off + (int)size cannot overflow a u64. + */ +-#define BPF_MAX_VAR_OFF (1ULL << 31) ++#define BPF_MAX_VAR_OFF (1 << 29) + /* Maximum variable size permitted for ARG_CONST_SIZE[_OR_ZERO]. This ensures + * that converting umax_value to int cannot overflow. 
+ */ +-#define BPF_MAX_VAR_SIZ INT_MAX ++#define BPF_MAX_VAR_SIZ (1 << 29) + + /* Liveness marks, used for registers and spilled-regs (in stack slots). + * Read marks propagate upwards until they find a write mark; they record that +diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c +index 982bd9ec721a..86dfe6b5c243 100644 +--- a/kernel/bpf/verifier.c ++++ b/kernel/bpf/verifier.c +@@ -1819,6 +1819,41 @@ static bool signed_sub_overflows(s64 a, s64 b) + return res > a; + } + ++static bool check_reg_sane_offset(struct bpf_verifier_env *env, ++ const struct bpf_reg_state *reg, ++ enum bpf_reg_type type) ++{ ++ bool known = tnum_is_const(reg->var_off); ++ s64 val = reg->var_off.value; ++ s64 smin = reg->smin_value; ++ ++ if (known && (val >= BPF_MAX_VAR_OFF || val <= -BPF_MAX_VAR_OFF)) { ++ verbose(env, "math between %s pointer and %lld is not allowed\n", ++ reg_type_str[type], val); ++ return false; ++ } ++ ++ if (reg->off >= BPF_MAX_VAR_OFF || reg->off <= -BPF_MAX_VAR_OFF) { ++ verbose(env, "%s pointer offset %d is not allowed\n", ++ reg_type_str[type], reg->off); ++ return false; ++ } ++ ++ if (smin == S64_MIN) { ++ verbose(env, "math between %s pointer and register with unbounded min value is not allowed\n", ++ reg_type_str[type]); ++ return false; ++ } ++ ++ if (smin >= BPF_MAX_VAR_OFF || smin <= -BPF_MAX_VAR_OFF) { ++ verbose(env, "value %lld makes %s pointer be out of bounds\n", ++ smin, reg_type_str[type]); ++ return false; ++ } ++ ++ return true; ++} ++ + /* Handles arithmetic on a pointer and a scalar: computes new min/max and var_off. + * Caller should also handle BPF_MOV case separately. 
+ * If we return -EACCES, caller may want to try again treating pointer as a +@@ -1887,6 +1922,10 @@ static int adjust_ptr_min_max_vals(struct bpf_verifier_env *env, + dst_reg->type = ptr_reg->type; + dst_reg->id = ptr_reg->id; + ++ if (!check_reg_sane_offset(env, off_reg, ptr_reg->type) || ++ !check_reg_sane_offset(env, ptr_reg, ptr_reg->type)) ++ return -EINVAL; ++ + switch (opcode) { + case BPF_ADD: + /* We can take a fixed offset as long as it doesn't overflow +@@ -2017,6 +2056,9 @@ static int adjust_ptr_min_max_vals(struct bpf_verifier_env *env, + return -EACCES; + } + ++ if (!check_reg_sane_offset(env, dst_reg, ptr_reg->type)) ++ return -EINVAL; ++ + __update_reg_bounds(dst_reg); + __reg_deduce_bounds(dst_reg); + __reg_bound_offset(dst_reg); +@@ -2046,6 +2088,12 @@ static int adjust_scalar_min_max_vals(struct bpf_verifier_env *env, + src_known = tnum_is_const(src_reg.var_off); + dst_known = tnum_is_const(dst_reg->var_off); + ++ if (!src_known && ++ opcode != BPF_ADD && opcode != BPF_SUB && opcode != BPF_AND) { ++ __mark_reg_unknown(dst_reg); ++ return 0; ++ } ++ + switch (opcode) { + case BPF_ADD: + if (signed_add_overflows(dst_reg->smin_value, smin_val) || +-- +2.15.0 + diff --git a/queue/bpf-force-strict-alignment-checks-for-stack-pointers.patch b/queue/bpf-force-strict-alignment-checks-for-stack-pointers.patch new file mode 100644 index 0000000..717ef8b --- /dev/null +++ b/queue/bpf-force-strict-alignment-checks-for-stack-pointers.patch @@ -0,0 +1,35 @@ +From a5ec6ae161d72f01411169a938fa5f8baea16e8f Mon Sep 17 00:00:00 2001 +From: Jann Horn <jannh@google.com> +Date: Mon, 18 Dec 2017 20:11:58 -0800 +Subject: [PATCH] bpf: force strict alignment checks for stack pointers + +commit a5ec6ae161d72f01411169a938fa5f8baea16e8f upstream. + +Force strict alignment checks for stack pointers because the tracking of +stack spills relies on it; unaligned stack accesses can lead to corruption +of spilled registers, which is exploitable. 
+ +Fixes: f1174f77b50c ("bpf/verifier: rework value tracking") +Signed-off-by: Jann Horn <jannh@google.com> +Signed-off-by: Alexei Starovoitov <ast@kernel.org> +Signed-off-by: Daniel Borkmann <daniel@iogearbox.net> + +diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c +index 77e4b5223867..102c519836f6 100644 +--- a/kernel/bpf/verifier.c ++++ b/kernel/bpf/verifier.c +@@ -1059,6 +1059,11 @@ static int check_ptr_alignment(struct bpf_verifier_env *env, + break; + case PTR_TO_STACK: + pointer_desc = "stack "; ++ /* The stack spill tracking logic in check_stack_write() ++ * and check_stack_read() relies on stack accesses being ++ * aligned. ++ */ ++ strict = true; + break; + default: + break; +-- +2.15.0 + diff --git a/queue/bpf-ppc64-do-not-reload-skb-pointers-in-non-skb-cont.patch b/queue/bpf-ppc64-do-not-reload-skb-pointers-in-non-skb-cont.patch new file mode 100644 index 0000000..f99cad9 --- /dev/null +++ b/queue/bpf-ppc64-do-not-reload-skb-pointers-in-non-skb-cont.patch @@ -0,0 +1,52 @@ +From 87338c8e2cbb317b5f757e6172f94e2e3799cd20 Mon Sep 17 00:00:00 2001 +From: Daniel Borkmann <daniel@iogearbox.net> +Date: Thu, 14 Dec 2017 21:07:24 +0100 +Subject: [PATCH] bpf, ppc64: do not reload skb pointers in non-skb context + +commit 87338c8e2cbb317b5f757e6172f94e2e3799cd20 upstream. + +The assumption of unconditionally reloading skb pointers on +BPF helper calls where bpf_helper_changes_pkt_data() holds +true is wrong. There can be different contexts where the helper +would enforce a reload such as in case of XDP. Here, we do +have a struct xdp_buff instead of struct sk_buff as context, +thus this will access garbage. + +JITs only ever need to deal with cached skb pointer reload +when ld_abs/ind was seen, therefore guard the reload behind +SEEN_SKB. + +Fixes: 156d0e290e96 ("powerpc/ebpf/jit: Implement JIT compiler for extended BPF") +Signed-off-by: Daniel Borkmann <daniel@iogearbox.net> +Reviewed-by: Naveen N. 
Rao <naveen.n.rao@linux.vnet.ibm.com> +Acked-by: Alexei Starovoitov <ast@kernel.org> +Tested-by: Sandipan Das <sandipan@linux.vnet.ibm.com> +Signed-off-by: Alexei Starovoitov <ast@kernel.org> + +diff --git a/arch/powerpc/net/bpf_jit_comp64.c b/arch/powerpc/net/bpf_jit_comp64.c +index 46d74e81aff1..d183b4801bdb 100644 +--- a/arch/powerpc/net/bpf_jit_comp64.c ++++ b/arch/powerpc/net/bpf_jit_comp64.c +@@ -763,7 +763,8 @@ static int bpf_jit_build_body(struct bpf_prog *fp, u32 *image, + func = (u8 *) __bpf_call_base + imm; + + /* Save skb pointer if we need to re-cache skb data */ +- if (bpf_helper_changes_pkt_data(func)) ++ if ((ctx->seen & SEEN_SKB) && ++ bpf_helper_changes_pkt_data(func)) + PPC_BPF_STL(3, 1, bpf_jit_stack_local(ctx)); + + bpf_jit_emit_func_call(image, ctx, (u64)func); +@@ -772,7 +773,8 @@ static int bpf_jit_build_body(struct bpf_prog *fp, u32 *image, + PPC_MR(b2p[BPF_REG_0], 3); + + /* refresh skb cache */ +- if (bpf_helper_changes_pkt_data(func)) { ++ if ((ctx->seen & SEEN_SKB) && ++ bpf_helper_changes_pkt_data(func)) { + /* reload skb pointer to r3 */ + PPC_BPF_LL(3, 1, bpf_jit_stack_local(ctx)); + bpf_jit_emit_skb_loads(image, ctx); +-- +2.15.0 + diff --git a/queue/bpf-s390x-do-not-reload-skb-pointers-in-non-skb-cont.patch b/queue/bpf-s390x-do-not-reload-skb-pointers-in-non-skb-cont.patch new file mode 100644 index 0000000..f73fb28 --- /dev/null +++ b/queue/bpf-s390x-do-not-reload-skb-pointers-in-non-skb-cont.patch @@ -0,0 +1,67 @@ +From 6d59b7dbf72ed20d0138e2f9b75ca3d4a9d4faca Mon Sep 17 00:00:00 2001 +From: Daniel Borkmann <daniel@iogearbox.net> +Date: Thu, 14 Dec 2017 21:07:23 +0100 +Subject: [PATCH] bpf, s390x: do not reload skb pointers in non-skb context + +commit 6d59b7dbf72ed20d0138e2f9b75ca3d4a9d4faca upstream. + +The assumption of unconditionally reloading skb pointers on +BPF helper calls where bpf_helper_changes_pkt_data() holds +true is wrong. 
There can be different contexts where the +BPF helper would enforce a reload such as in case of XDP. +Here, we do have a struct xdp_buff instead of struct sk_buff +as context, thus this will access garbage. + +JITs only ever need to deal with cached skb pointer reload +when ld_abs/ind was seen, therefore guard the reload behind +SEEN_SKB only. Tested on s390x. + +Fixes: 9db7f2b81880 ("s390/bpf: recache skb->data/hlen for skb_vlan_push/pop") +Signed-off-by: Daniel Borkmann <daniel@iogearbox.net> +Acked-by: Alexei Starovoitov <ast@kernel.org> +Cc: Michael Holzheu <holzheu@linux.vnet.ibm.com> +Signed-off-by: Alexei Starovoitov <ast@kernel.org> + +diff --git a/arch/s390/net/bpf_jit_comp.c b/arch/s390/net/bpf_jit_comp.c +index e81c16838b90..9557d8b516df 100644 +--- a/arch/s390/net/bpf_jit_comp.c ++++ b/arch/s390/net/bpf_jit_comp.c +@@ -55,8 +55,7 @@ struct bpf_jit { + #define SEEN_LITERAL 8 /* code uses literals */ + #define SEEN_FUNC 16 /* calls C functions */ + #define SEEN_TAIL_CALL 32 /* code uses tail calls */ +-#define SEEN_SKB_CHANGE 64 /* code changes skb data */ +-#define SEEN_REG_AX 128 /* code uses constant blinding */ ++#define SEEN_REG_AX 64 /* code uses constant blinding */ + #define SEEN_STACK (SEEN_FUNC | SEEN_MEM | SEEN_SKB) + + /* +@@ -448,12 +447,12 @@ static void bpf_jit_prologue(struct bpf_jit *jit, u32 stack_depth) + EMIT6_DISP_LH(0xe3000000, 0x0024, REG_W1, REG_0, + REG_15, 152); + } +- if (jit->seen & SEEN_SKB) ++ if (jit->seen & SEEN_SKB) { + emit_load_skb_data_hlen(jit); +- if (jit->seen & SEEN_SKB_CHANGE) + /* stg %b1,ST_OFF_SKBP(%r0,%r15) */ + EMIT6_DISP_LH(0xe3000000, 0x0024, BPF_REG_1, REG_0, REG_15, + STK_OFF_SKBP); ++ } + } + + /* +@@ -983,8 +982,8 @@ static noinline int bpf_jit_insn(struct bpf_jit *jit, struct bpf_prog *fp, int i + EMIT2(0x0d00, REG_14, REG_W1); + /* lgr %b0,%r2: load return value into %b0 */ + EMIT4(0xb9040000, BPF_REG_0, REG_2); +- if (bpf_helper_changes_pkt_data((void *)func)) { +- jit->seen |= SEEN_SKB_CHANGE; ++ if 
((jit->seen & SEEN_SKB) && ++ bpf_helper_changes_pkt_data((void *)func)) { + /* lg %b1,ST_OFF_SKBP(%r15) */ + EMIT6_DISP_LH(0xe3000000, 0x0004, BPF_REG_1, REG_0, + REG_15, STK_OFF_SKBP); +-- +2.15.0 + diff --git a/queue/bpf-sparc-fix-usage-of-wrong-reg-for-load_skb_regs-a.patch b/queue/bpf-sparc-fix-usage-of-wrong-reg-for-load_skb_regs-a.patch new file mode 100644 index 0000000..935d530 --- /dev/null +++ b/queue/bpf-sparc-fix-usage-of-wrong-reg-for-load_skb_regs-a.patch @@ -0,0 +1,50 @@ +From 07aee94394547721ac168cbf4e1c09c14a5fe671 Mon Sep 17 00:00:00 2001 +From: Daniel Borkmann <daniel@iogearbox.net> +Date: Thu, 14 Dec 2017 21:07:26 +0100 +Subject: [PATCH] bpf, sparc: fix usage of wrong reg for load_skb_regs after + call + +commit 07aee94394547721ac168cbf4e1c09c14a5fe671 upstream. + +When LD_ABS/IND is used in the program, and we have a BPF helper +call that changes packet data (bpf_helper_changes_pkt_data() returns +true), then in case of sparc JIT, we try to reload cached skb data +from bpf2sparc[BPF_REG_6]. However, there is no such guarantee or +assumption that skb sits in R6 at this point, all helpers changing +skb data only have a guarantee that skb sits in R1. Therefore, +store BPF R1 in L7 temporarily and after procedure call use L7 to +reload cached skb data. skb sitting in R6 is only true at the time +when LD_ABS/IND is executed. + +Fixes: 7a12b5031c6b ("sparc64: Add eBPF JIT.") +Signed-off-by: Daniel Borkmann <daniel@iogearbox.net> +Acked-by: David S. 
Miller <davem@davemloft.net> +Acked-by: Alexei Starovoitov <ast@kernel.org> +Signed-off-by: Alexei Starovoitov <ast@kernel.org> + +diff --git a/arch/sparc/net/bpf_jit_comp_64.c b/arch/sparc/net/bpf_jit_comp_64.c +index 5765e7e711f7..ff5f9cb3039a 100644 +--- a/arch/sparc/net/bpf_jit_comp_64.c ++++ b/arch/sparc/net/bpf_jit_comp_64.c +@@ -1245,14 +1245,16 @@ static int build_insn(const struct bpf_insn *insn, struct jit_ctx *ctx) + u8 *func = ((u8 *)__bpf_call_base) + imm; + + ctx->saw_call = true; ++ if (ctx->saw_ld_abs_ind && bpf_helper_changes_pkt_data(func)) ++ emit_reg_move(bpf2sparc[BPF_REG_1], L7, ctx); + + emit_call((u32 *)func, ctx); + emit_nop(ctx); + + emit_reg_move(O0, bpf2sparc[BPF_REG_0], ctx); + +- if (bpf_helper_changes_pkt_data(func) && ctx->saw_ld_abs_ind) +- load_skb_regs(ctx, bpf2sparc[BPF_REG_6]); ++ if (ctx->saw_ld_abs_ind && bpf_helper_changes_pkt_data(func)) ++ load_skb_regs(ctx, L7); + break; + } + +-- +2.15.0 + diff --git a/queue/bpf-verifier-fix-bounds-calculation-on-BPF_RSH.patch b/queue/bpf-verifier-fix-bounds-calculation-on-BPF_RSH.patch new file mode 100644 index 0000000..b336f66 --- /dev/null +++ b/queue/bpf-verifier-fix-bounds-calculation-on-BPF_RSH.patch @@ -0,0 +1,65 @@ +From 4374f256ce8182019353c0c639bb8d0695b4c941 Mon Sep 17 00:00:00 2001 +From: Edward Cree <ecree@solarflare.com> +Date: Mon, 18 Dec 2017 20:11:53 -0800 +Subject: [PATCH] bpf/verifier: fix bounds calculation on BPF_RSH + +commit 4374f256ce8182019353c0c639bb8d0695b4c941 upstream. + +Incorrect signed bounds were being computed. +If the old upper signed bound was positive and the old lower signed bound was +negative, this could cause the new upper signed bound to be too low, +leading to security issues. 
+ +Fixes: b03c9f9fdc37 ("bpf/verifier: track signed and unsigned min/max values") +Reported-by: Jann Horn <jannh@google.com> +Signed-off-by: Edward Cree <ecree@solarflare.com> +Acked-by: Alexei Starovoitov <ast@kernel.org> +[jannh@google.com: changed description to reflect bug impact] +Signed-off-by: Jann Horn <jannh@google.com> +Signed-off-by: Alexei Starovoitov <ast@kernel.org> +Signed-off-by: Daniel Borkmann <daniel@iogearbox.net> + +diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c +index e39b01317b6f..625e358ca765 100644 +--- a/kernel/bpf/verifier.c ++++ b/kernel/bpf/verifier.c +@@ -2190,20 +2190,22 @@ static int adjust_scalar_min_max_vals(struct bpf_verifier_env *env, + mark_reg_unknown(env, regs, insn->dst_reg); + break; + } +- /* BPF_RSH is an unsigned shift, so make the appropriate casts */ +- if (dst_reg->smin_value < 0) { +- if (umin_val) { +- /* Sign bit will be cleared */ +- dst_reg->smin_value = 0; +- } else { +- /* Lost sign bit information */ +- dst_reg->smin_value = S64_MIN; +- dst_reg->smax_value = S64_MAX; +- } +- } else { +- dst_reg->smin_value = +- (u64)(dst_reg->smin_value) >> umax_val; +- } ++ /* BPF_RSH is an unsigned shift. If the value in dst_reg might ++ * be negative, then either: ++ * 1) src_reg might be zero, so the sign bit of the result is ++ * unknown, so we lose our signed bounds ++ * 2) it's known negative, thus the unsigned bounds capture the ++ * signed bounds ++ * 3) the signed bounds cross zero, so they tell us nothing ++ * about the result ++ * If the value in dst_reg is known nonnegative, then again the ++ * unsigned bounts capture the signed bounds. ++ * Thus, in all cases it suffices to blow away our signed bounds ++ * and rely on inferring new ones from the unsigned bounds and ++ * var_off of the result. 
++ */ ++ dst_reg->smin_value = S64_MIN; ++ dst_reg->smax_value = S64_MAX; + if (src_known) + dst_reg->var_off = tnum_rshift(dst_reg->var_off, + umin_val); +-- +2.15.0 + diff --git a/queue/clk-sunxi-ng-nm-Check-if-requested-rate-is-supported.patch b/queue/clk-sunxi-ng-nm-Check-if-requested-rate-is-supported.patch new file mode 100644 index 0000000..6413819 --- /dev/null +++ b/queue/clk-sunxi-ng-nm-Check-if-requested-rate-is-supported.patch @@ -0,0 +1,38 @@ +From 4cdbc40d64d4b8303a97e29a52862e4d99502beb Mon Sep 17 00:00:00 2001 +From: Chen-Yu Tsai <wens@csie.org> +Date: Thu, 12 Oct 2017 16:36:58 +0800 +Subject: [PATCH] clk: sunxi-ng: nm: Check if requested rate is supported by + fractional clock + +commit 4cdbc40d64d4b8303a97e29a52862e4d99502beb upstream. + +The round_rate callback for N-M-factor style clocks does not check if +the requested clock rate is supported by the fractional clock mode. +While this doesn't affect usage in practice, since the clock rates +are also supported through N-M factors, it does not match the set_rate +code. + +Add a check to the round_rate callback so it matches the set_rate +callback. 
+ +Fixes: 6174a1e24b0d ("clk: sunxi-ng: Add N-M-factor clock support") +Signed-off-by: Chen-Yu Tsai <wens@csie.org> +Signed-off-by: Maxime Ripard <maxime.ripard@free-electrons.com> + +diff --git a/drivers/clk/sunxi-ng/ccu_nm.c b/drivers/clk/sunxi-ng/ccu_nm.c +index a32158e8f2e3..84a5e7f17f6f 100644 +--- a/drivers/clk/sunxi-ng/ccu_nm.c ++++ b/drivers/clk/sunxi-ng/ccu_nm.c +@@ -99,6 +99,9 @@ static long ccu_nm_round_rate(struct clk_hw *hw, unsigned long rate, + struct ccu_nm *nm = hw_to_ccu_nm(hw); + struct _ccu_nm _nm; + ++ if (ccu_frac_helper_has_rate(&nm->common, &nm->frac, rate)) ++ return rate; ++ + _nm.min_n = nm->n.min ?: 1; + _nm.max_n = nm->n.max ?: 1 << nm->n.width; + _nm.min_m = 1; +-- +2.15.0 + diff --git a/queue/clk-sunxi-ng-sun5i-Fix-bit-offset-of-audio-PLL-post-.patch b/queue/clk-sunxi-ng-sun5i-Fix-bit-offset-of-audio-PLL-post-.patch new file mode 100644 index 0000000..6a05589 --- /dev/null +++ b/queue/clk-sunxi-ng-sun5i-Fix-bit-offset-of-audio-PLL-post-.patch @@ -0,0 +1,41 @@ +From d51fe3ba9773c8b6fc79f82bbe75d64baf604292 Mon Sep 17 00:00:00 2001 +From: Chen-Yu Tsai <wens@csie.org> +Date: Thu, 12 Oct 2017 16:36:57 +0800 +Subject: [PATCH] clk: sunxi-ng: sun5i: Fix bit offset of audio PLL + post-divider + +commit d51fe3ba9773c8b6fc79f82bbe75d64baf604292 upstream. + +The post-divider for the audio PLL is in bits [29:26], as specified +in the user manual, not [19:16] as currently programmed in the code. +The post-divider has a default register value of 2, i.e. a divider +of 3. This means the clock rate fed to the audio codec would be off. + +This was discovered when porting sigma-delta modulation for the PLL +to sun5i, which needs the post-divider to be 1. + +Fix the bit offset, so we do actually force the post-divider to a +certain value. 
+ +Fixes: 5e73761786d6 ("clk: sunxi-ng: Add sun5i CCU driver") +Signed-off-by: Chen-Yu Tsai <wens@csie.org> +Signed-off-by: Maxime Ripard <maxime.ripard@free-electrons.com> + +diff --git a/drivers/clk/sunxi-ng/ccu-sun5i.c b/drivers/clk/sunxi-ng/ccu-sun5i.c +index ab9e850b3707..2f385a57cd91 100644 +--- a/drivers/clk/sunxi-ng/ccu-sun5i.c ++++ b/drivers/clk/sunxi-ng/ccu-sun5i.c +@@ -982,8 +982,8 @@ static void __init sun5i_ccu_init(struct device_node *node, + + /* Force the PLL-Audio-1x divider to 4 */ + val = readl(reg + SUN5I_PLL_AUDIO_REG); +- val &= ~GENMASK(19, 16); +- writel(val | (3 << 16), reg + SUN5I_PLL_AUDIO_REG); ++ val &= ~GENMASK(29, 26); ++ writel(val | (3 << 26), reg + SUN5I_PLL_AUDIO_REG); + + /* + * Use the peripheral PLL as the AHB parent, instead of CPU / +-- +2.15.0 + diff --git a/queue/clk-sunxi-ng-sun6i-Rename-HDMI-DDC-clock-to-avoid-na.patch b/queue/clk-sunxi-ng-sun6i-Rename-HDMI-DDC-clock-to-avoid-na.patch new file mode 100644 index 0000000..46151fb --- /dev/null +++ b/queue/clk-sunxi-ng-sun6i-Rename-HDMI-DDC-clock-to-avoid-na.patch @@ -0,0 +1,34 @@ +From 7f3ed79188f2f094d0ee366fa858857fb7f511ba Mon Sep 17 00:00:00 2001 +From: Chen-Yu Tsai <wens@csie.org> +Date: Fri, 29 Sep 2017 16:22:54 +0800 +Subject: [PATCH] clk: sunxi-ng: sun6i: Rename HDMI DDC clock to avoid name + collision + +commit 7f3ed79188f2f094d0ee366fa858857fb7f511ba upstream. + +The HDMI DDC clock found in the CCU is the parent of the actual DDC +clock within the HDMI controller. That clock is also named "hdmi-ddc". + +Rename the one in the CCU to "ddc". This makes more sense than renaming +the one in the HDMI controller to something else. 
+ +Fixes: c6e6c96d8fa6 ("clk: sunxi-ng: Add A31/A31s clocks") +Signed-off-by: Chen-Yu Tsai <wens@csie.org> +Signed-off-by: Maxime Ripard <maxime.ripard@free-electrons.com> + +diff --git a/drivers/clk/sunxi-ng/ccu-sun6i-a31.c b/drivers/clk/sunxi-ng/ccu-sun6i-a31.c +index 8af434815fba..241fb13f1c06 100644 +--- a/drivers/clk/sunxi-ng/ccu-sun6i-a31.c ++++ b/drivers/clk/sunxi-ng/ccu-sun6i-a31.c +@@ -608,7 +608,7 @@ static SUNXI_CCU_M_WITH_MUX_GATE(hdmi_clk, "hdmi", lcd_ch1_parents, + 0x150, 0, 4, 24, 2, BIT(31), + CLK_SET_RATE_PARENT); + +-static SUNXI_CCU_GATE(hdmi_ddc_clk, "hdmi-ddc", "osc24M", 0x150, BIT(30), 0); ++static SUNXI_CCU_GATE(hdmi_ddc_clk, "ddc", "osc24M", 0x150, BIT(30), 0); + + static SUNXI_CCU_GATE(ps_clk, "ps", "lcd1-ch1", 0x140, BIT(31), 0); + +-- +2.15.0 + diff --git a/queue/cpuidle-fix-broadcast-control-when-broadcast-can-not.patch b/queue/cpuidle-fix-broadcast-control-when-broadcast-can-not.patch new file mode 100644 index 0000000..866972e --- /dev/null +++ b/queue/cpuidle-fix-broadcast-control-when-broadcast-can-not.patch @@ -0,0 +1,37 @@ +From f187851b9b4a76952b1158b86434563dd2031103 Mon Sep 17 00:00:00 2001 +From: Nicholas Piggin <npiggin@gmail.com> +Date: Fri, 1 Sep 2017 14:29:56 +1000 +Subject: [PATCH] cpuidle: fix broadcast control when broadcast can not be + entered + +commit f187851b9b4a76952b1158b86434563dd2031103 upstream. + +When failing to enter broadcast timer mode for an idle state that +requires it, a new state is selected that does not require broadcast, +but the broadcast variable remains set. This causes +tick_broadcast_exit to be called despite not having entered broadcast +mode. + +This causes the WARN_ON_ONCE(!irqs_disabled()) to trigger in some +cases. It does not appear to cause problems for code today, but seems +to violate the interface so should be fixed. + +Signed-off-by: Nicholas Piggin <npiggin@gmail.com> +Reviewed-by: Thomas Gleixner <tglx@linutronix.de> +Signed-off-by: Rafael J. 
Wysocki <rafael.j.wysocki@intel.com> + +diff --git a/drivers/cpuidle/cpuidle.c b/drivers/cpuidle/cpuidle.c +index 484cc8909d5c..ed4df58a855e 100644 +--- a/drivers/cpuidle/cpuidle.c ++++ b/drivers/cpuidle/cpuidle.c +@@ -208,6 +208,7 @@ int cpuidle_enter_state(struct cpuidle_device *dev, struct cpuidle_driver *drv, + return -EBUSY; + } + target_state = &drv->states[index]; ++ broadcast = false; + } + + /* Take note of the planned idle state. */ +-- +2.15.0 + diff --git a/queue/crypto-crypto4xx-increase-context-and-scatter-ring-b.patch b/queue/crypto-crypto4xx-increase-context-and-scatter-ring-b.patch new file mode 100644 index 0000000..f7becbc --- /dev/null +++ b/queue/crypto-crypto4xx-increase-context-and-scatter-ring-b.patch @@ -0,0 +1,55 @@ +From 778f81d6cdb7d25360f082ac0384d5103f04eca5 Mon Sep 17 00:00:00 2001 +From: Christian Lamparter <chunkeey@gmail.com> +Date: Wed, 4 Oct 2017 01:00:08 +0200 +Subject: [PATCH] crypto: crypto4xx - increase context and scatter ring buffer + elements + +commit 778f81d6cdb7d25360f082ac0384d5103f04eca5 upstream. + +If crypto4xx is used in conjunction with dm-crypt, the available +ring buffer elements are not enough to handle the load properly. + +On an aes-cbc-essiv:sha256 encrypted swap partition the read +performance is abyssal: (tested with hdparm -t) + +/dev/mapper/swap_crypt: + Timing buffered disk reads: 14 MB in 3.68 seconds = 3.81 MB/sec + +The patch increases both PPC4XX_NUM_SD and PPC4XX_NUM_PD to 256. +This improves the performance considerably: + +/dev/mapper/swap_crypt: + Timing buffered disk reads: 104 MB in 3.03 seconds = 34.31 MB/sec + +Furthermore, PPC4XX_LAST_SD, PPC4XX_LAST_GD and PPC4XX_LAST_PD +can be easily calculated from their respective PPC4XX_NUM_* +constant. 
+ +Signed-off-by: Christian Lamparter <chunkeey@gmail.com> +Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au> + +diff --git a/drivers/crypto/amcc/crypto4xx_core.h b/drivers/crypto/amcc/crypto4xx_core.h +index 97fb8288ab30..27e439c1f5bf 100644 +--- a/drivers/crypto/amcc/crypto4xx_core.h ++++ b/drivers/crypto/amcc/crypto4xx_core.h +@@ -36,12 +36,12 @@ + #define PPC405EX_CE_RESET 0x00000008 + + #define CRYPTO4XX_CRYPTO_PRIORITY 300 +-#define PPC4XX_LAST_PD 63 +-#define PPC4XX_NUM_PD 64 +-#define PPC4XX_LAST_GD 1023 ++#define PPC4XX_NUM_PD 256 ++#define PPC4XX_LAST_PD (PPC4XX_NUM_PD - 1) + #define PPC4XX_NUM_GD 1024 +-#define PPC4XX_LAST_SD 63 +-#define PPC4XX_NUM_SD 64 ++#define PPC4XX_LAST_GD (PPC4XX_NUM_GD - 1) ++#define PPC4XX_NUM_SD 256 ++#define PPC4XX_LAST_SD (PPC4XX_NUM_SD - 1) + #define PPC4XX_SD_BUFFER_SIZE 2048 + + #define PD_ENTRY_INUSE 1 +-- +2.15.0 + diff --git a/queue/crypto-lrw-Fix-an-error-handling-path-in-create.patch b/queue/crypto-lrw-Fix-an-error-handling-path-in-create.patch new file mode 100644 index 0000000..e7f03a7 --- /dev/null +++ b/queue/crypto-lrw-Fix-an-error-handling-path-in-create.patch @@ -0,0 +1,34 @@ +From 616129cc6e75fb4da6681c16c981fa82dfe5e4c7 Mon Sep 17 00:00:00 2001 +From: Christophe Jaillet <christophe.jaillet@wanadoo.fr> +Date: Sun, 8 Oct 2017 11:39:49 +0200 +Subject: [PATCH] crypto: lrw - Fix an error handling path in 'create()' + +commit 616129cc6e75fb4da6681c16c981fa82dfe5e4c7 upstream. + +All error handling paths 'goto err_drop_spawn' except this one. +In order to avoid some resources leak, we should do it as well here. 
+ +Fixes: 700cb3f5fe75 ("crypto: lrw - Convert to skcipher") +Signed-off-by: Christophe JAILLET <christophe.jaillet@wanadoo.fr> +Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au> + +diff --git a/crypto/lrw.c b/crypto/lrw.c +index a8bfae4451bf..eb681e9fe574 100644 +--- a/crypto/lrw.c ++++ b/crypto/lrw.c +@@ -610,8 +610,10 @@ static int create(struct crypto_template *tmpl, struct rtattr **tb) + ecb_name[len - 1] = 0; + + if (snprintf(inst->alg.base.cra_name, CRYPTO_MAX_ALG_NAME, +- "lrw(%s)", ecb_name) >= CRYPTO_MAX_ALG_NAME) +- return -ENAMETOOLONG; ++ "lrw(%s)", ecb_name) >= CRYPTO_MAX_ALG_NAME) { ++ err = -ENAMETOOLONG; ++ goto err_drop_spawn; ++ } + } + + inst->alg.base.cra_flags = alg->base.cra_flags & CRYPTO_ALG_ASYNC; +-- +2.15.0 + diff --git a/queue/drivers-misc-intel-pti-Rename-the-header-file-to-fre.patch b/queue/drivers-misc-intel-pti-Rename-the-header-file-to-fre.patch new file mode 100644 index 0000000..81b0f1d --- /dev/null +++ b/queue/drivers-misc-intel-pti-Rename-the-header-file-to-fre.patch @@ -0,0 +1,60 @@ +From 1784f9144b143a1e8b19fe94083b040aa559182b Mon Sep 17 00:00:00 2001 +From: Ingo Molnar <mingo@kernel.org> +Date: Tue, 5 Dec 2017 14:14:47 +0100 +Subject: [PATCH] drivers/misc/intel/pti: Rename the header file to free up the + namespace + +commit 1784f9144b143a1e8b19fe94083b040aa559182b upstream. + +We'd like to use the 'PTI' acronym for 'Page Table Isolation' - free up the +namespace by renaming the <linux/pti.h> driver header to <linux/intel-pti.h>. + +(Also standardize the header guard name while at it.) 
+ +Cc: Peter Zijlstra <peterz@infradead.org> +Cc: Thomas Gleixner <tglx@linutronix.de> +Cc: J Freyensee <james_p_freyensee@linux.intel.com> +Cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org> +Cc: linux-kernel@vger.kernel.org +Signed-off-by: Ingo Molnar <mingo@kernel.org> + +diff --git a/drivers/misc/pti.c b/drivers/misc/pti.c +index eda38cbe8530..41f2a9f6851d 100644 +--- a/drivers/misc/pti.c ++++ b/drivers/misc/pti.c +@@ -32,7 +32,7 @@ + #include <linux/pci.h> + #include <linux/mutex.h> + #include <linux/miscdevice.h> +-#include <linux/pti.h> ++#include <linux/intel-pti.h> + #include <linux/slab.h> + #include <linux/uaccess.h> + +diff --git a/include/linux/pti.h b/include/linux/intel-pti.h +similarity index 94% +rename from include/linux/pti.h +rename to include/linux/intel-pti.h +index b3ea01a3197e..2710d72de3c9 100644 +--- a/include/linux/pti.h ++++ b/include/linux/intel-pti.h +@@ -22,8 +22,8 @@ + * interface to write out it's contents for debugging a mobile system. + */ + +-#ifndef PTI_H_ +-#define PTI_H_ ++#ifndef LINUX_INTEL_PTI_H_ ++#define LINUX_INTEL_PTI_H_ + + /* offset for last dword of any PTI message. Part of MIPI P1149.7 */ + #define PTI_LASTDWORD_DTS 0x30 +@@ -40,4 +40,4 @@ struct pti_masterchannel *pti_request_masterchannel(u8 type, + const char *thread_name); + void pti_release_masterchannel(struct pti_masterchannel *mc); + +-#endif /*PTI_H_*/ ++#endif /* LINUX_INTEL_PTI_H_ */ +-- +2.15.0 + diff --git a/queue/drm-Add-retries-for-lspcon-mode-detection.patch b/queue/drm-Add-retries-for-lspcon-mode-detection.patch new file mode 100644 index 0000000..b38e513 --- /dev/null +++ b/queue/drm-Add-retries-for-lspcon-mode-detection.patch @@ -0,0 +1,74 @@ +From f687e25a7a245952349f1f9f9cc238ac5a3be258 Mon Sep 17 00:00:00 2001 +From: Shashank Sharma <shashank.sharma@intel.com> +Date: Thu, 12 Oct 2017 22:10:08 +0530 +Subject: [PATCH] drm: Add retries for lspcon mode detection + +commit f687e25a7a245952349f1f9f9cc238ac5a3be258 upstream. 
+ +From the CI builds, its been observed that during a driver +reload/insert, dp dual mode read function sometimes fails to +read from LSPCON device over i2c-over-aux channel. + +This patch: +- adds some delay and few retries, allowing a scope for these + devices to settle down and respond. +- changes one error log's level from ERROR->DEBUG as we want + to call it an error only after all the retries are exhausted. + +V2: Addressed review comments from Jani (for loop for retry) +V3: Addressed review comments from Imre (break on partial read too) +V3: Addressed review comments from Ville/Imre (Add the retries + exclusively for LSPCON, not for all dp_dual_mode devices) +V4: Added r-b from Imre, sending it to dri-devel (Jani) + +Bugzilla: https://bugs.freedesktop.org/show_bug.cgi?id=102294 +Bugzilla: https://bugs.freedesktop.org/show_bug.cgi?id=102295 +Bugzilla: https://bugs.freedesktop.org/show_bug.cgi?id=102359 +Bugzilla: https://bugs.freedesktop.org/show_bug.cgi?id=103186 +Cc: Ville Syrjala <ville.syrjala@linux.intel.com> +Cc: Imre Deak <imre.deak@intel.com> +Cc: Jani Nikula <jani.nikula@linux.intel.com> +Reviewed-by: Imre Deak <imre.deak@intel.com> +Acked-by: Dave Airlie <airlied@gmail.com> +Signed-off-by: Shashank Sharma <shashank.sharma@intel.com> +Signed-off-by: Jani Nikula <jani.nikula@intel.com> +Link: https://patchwork.freedesktop.org/patch/msgid/1507826408-19322-1-git-send-email-shashank.sharma@intel.com + +diff --git a/drivers/gpu/drm/drm_dp_dual_mode_helper.c b/drivers/gpu/drm/drm_dp_dual_mode_helper.c +index 0ef9011a1856..02a50929af67 100644 +--- a/drivers/gpu/drm/drm_dp_dual_mode_helper.c ++++ b/drivers/gpu/drm/drm_dp_dual_mode_helper.c +@@ -410,6 +410,7 @@ int drm_lspcon_get_mode(struct i2c_adapter *adapter, + { + u8 data; + int ret = 0; ++ int retry; + + if (!mode) { + DRM_ERROR("NULL input\n"); +@@ -417,10 +418,19 @@ int drm_lspcon_get_mode(struct i2c_adapter *adapter, + } + + /* Read Status: i2c over aux */ +- ret = drm_dp_dual_mode_read(adapter, 
DP_DUAL_MODE_LSPCON_CURRENT_MODE, +- &data, sizeof(data)); ++ for (retry = 0; retry < 6; retry++) { ++ if (retry) ++ usleep_range(500, 1000); ++ ++ ret = drm_dp_dual_mode_read(adapter, ++ DP_DUAL_MODE_LSPCON_CURRENT_MODE, ++ &data, sizeof(data)); ++ if (!ret) ++ break; ++ } ++ + if (ret < 0) { +- DRM_ERROR("LSPCON read(0x80, 0x41) failed\n"); ++ DRM_DEBUG_KMS("LSPCON read(0x80, 0x41) failed\n"); + return -EFAULT; + } + +-- +2.15.0 + diff --git a/queue/drm-vc4-Avoid-using-vrefresh-0-mode-in-DSI-htotal-ma.patch b/queue/drm-vc4-Avoid-using-vrefresh-0-mode-in-DSI-htotal-ma.patch new file mode 100644 index 0000000..da7a3d0 --- /dev/null +++ b/queue/drm-vc4-Avoid-using-vrefresh-0-mode-in-DSI-htotal-ma.patch @@ -0,0 +1,40 @@ +From af2eca53206c59ce9308a4f5f46c4a104a179b6b Mon Sep 17 00:00:00 2001 +From: Eric Anholt <eric@anholt.net> +Date: Tue, 15 Aug 2017 16:47:19 -0700 +Subject: [PATCH] drm/vc4: Avoid using vrefresh==0 mode in DSI htotal math. + +commit af2eca53206c59ce9308a4f5f46c4a104a179b6b upstream. + +The incoming mode might have a missing vrefresh field if it came from +drmModeSetCrtc(), which the kernel is supposed to calculate using +drm_mode_vrefresh(). We could either use that or the adjusted_mode's +original vrefresh value. + +However, we can maintain a more exact vrefresh value (not just the +integer approximation), by scaling by the ratio of our clocks. + +v2: Use math suggested by Andrzej Hajda instead. +v3: Simplify math now that adjusted_mode->clock isn't padded. +v4: Drop some parens. 
+ +Signed-off-by: Eric Anholt <eric@anholt.net> +Link: https://patchwork.freedesktop.org/patch/msgid/20170815234722.20700-2-eric@anholt.net +Reviewed-by: Andrzej Hajda <a.hajda@samsung.com> + +diff --git a/drivers/gpu/drm/vc4/vc4_dsi.c b/drivers/gpu/drm/vc4/vc4_dsi.c +index d1e0dc908048..04796d7d0fdb 100644 +--- a/drivers/gpu/drm/vc4/vc4_dsi.c ++++ b/drivers/gpu/drm/vc4/vc4_dsi.c +@@ -866,7 +866,8 @@ static bool vc4_dsi_encoder_mode_fixup(struct drm_encoder *encoder, + adjusted_mode->clock = pixel_clock_hz / 1000 + 1; + + /* Given the new pixel clock, adjust HFP to keep vrefresh the same. */ +- adjusted_mode->htotal = pixel_clock_hz / (mode->vrefresh * mode->vtotal); ++ adjusted_mode->htotal = adjusted_mode->clock * mode->htotal / ++ mode->clock; + adjusted_mode->hsync_end += adjusted_mode->htotal - mode->htotal; + adjusted_mode->hsync_start += adjusted_mode->htotal - mode->htotal; + +-- +2.15.0 + diff --git a/queue/fm10k-ensure-we-process-SM-mbx-when-processing-VF-mb.patch b/queue/fm10k-ensure-we-process-SM-mbx-when-processing-VF-mb.patch new file mode 100644 index 0000000..a3a6e25 --- /dev/null +++ b/queue/fm10k-ensure-we-process-SM-mbx-when-processing-VF-mb.patch @@ -0,0 +1,38 @@ +From 17a91809942ca32c70026d2d5ba3348a2c4fdf8f Mon Sep 17 00:00:00 2001 +From: Jacob Keller <jacob.e.keller@intel.com> +Date: Mon, 2 Oct 2017 07:17:50 -0700 +Subject: [PATCH] fm10k: ensure we process SM mbx when processing VF mbx + +commit 17a91809942ca32c70026d2d5ba3348a2c4fdf8f upstream. + +When we process VF mailboxes, the driver is likely going to also queue +up messages to the switch manager. This process merely queues up the +FIFO, but doesn't actually begin the transmission process. Because we +hold the mailbox lock during this VF processing, the PF<->SM mailbox is +not getting processed at this time. Ensure that we actually process the +PF<->SM mailbox in between each PF<->VF mailbox. 
+ +This should ensure prompt transmission of the messages queued up after +each VF message is received and handled. + +Signed-off-by: Jacob Keller <jacob.e.keller@intel.com> +Tested-by: Krishneil Singh <krishneil.k.singh@intel.com> +Signed-off-by: Jeff Kirsher <jeffrey.t.kirsher@intel.com> + +diff --git a/drivers/net/ethernet/intel/fm10k/fm10k_iov.c b/drivers/net/ethernet/intel/fm10k/fm10k_iov.c +index 5f4dac0d36ef..2ec49116fe91 100644 +--- a/drivers/net/ethernet/intel/fm10k/fm10k_iov.c ++++ b/drivers/net/ethernet/intel/fm10k/fm10k_iov.c +@@ -126,6 +126,9 @@ s32 fm10k_iov_mbx(struct fm10k_intfc *interface) + struct fm10k_mbx_info *mbx = &vf_info->mbx; + u16 glort = vf_info->glort; + ++ /* process the SM mailbox first to drain outgoing messages */ ++ hw->mbx.ops.process(hw, &hw->mbx); ++ + /* verify port mapping is valid, if not reset port */ + if (vf_info->vf_flags && !fm10k_glort_valid_pf(hw, glort)) + hw->iov.ops.reset_lport(hw, vf_info); +-- +2.15.0 + diff --git a/queue/fm10k-fix-mis-ordered-parameters-in-declaration-for-.patch b/queue/fm10k-fix-mis-ordered-parameters-in-declaration-for-.patch new file mode 100644 index 0000000..dfdea1c --- /dev/null +++ b/queue/fm10k-fix-mis-ordered-parameters-in-declaration-for-.patch @@ -0,0 +1,75 @@ +From 3e256ac5b1ec307e5dd5a4c99fbdbc651446c738 Mon Sep 17 00:00:00 2001 +From: Jacob Keller <jacob.e.keller@intel.com> +Date: Fri, 11 Aug 2017 11:14:58 -0700 +Subject: [PATCH] fm10k: fix mis-ordered parameters in declaration for + .ndo_set_vf_bw + +commit 3e256ac5b1ec307e5dd5a4c99fbdbc651446c738 upstream. + +We've had support for setting both a minimum and maximum bandwidth via +.ndo_set_vf_bw since commit 883a9ccbae56 ("fm10k: Add support for SR-IOV +to driver", 2014-09-20). + +Likely because we do not support minimum rates, the declaration +mis-ordered the "unused" parameter, which causes warnings when analyzed +with cppcheck. 
+ +Fix this warning by properly declaring the min_rate and max_rate +variables in the declaration and definition (rather than using +"unused"). Also rename "rate" to max_rate so as to clarify that we only +support setting the maximum rate. + +Signed-off-by: Jacob Keller <jacob.e.keller@intel.com> +Tested-by: Krishneil Singh <krishneil.k.singh@intel.com> +Signed-off-by: Jeff Kirsher <jeffrey.t.kirsher@intel.com> + +diff --git a/drivers/net/ethernet/intel/fm10k/fm10k.h b/drivers/net/ethernet/intel/fm10k/fm10k.h +index 40856bc0f3b9..46973fb234c5 100644 +--- a/drivers/net/ethernet/intel/fm10k/fm10k.h ++++ b/drivers/net/ethernet/intel/fm10k/fm10k.h +@@ -562,8 +562,8 @@ s32 fm10k_iov_update_pvid(struct fm10k_intfc *interface, u16 glort, u16 pvid); + int fm10k_ndo_set_vf_mac(struct net_device *netdev, int vf_idx, u8 *mac); + int fm10k_ndo_set_vf_vlan(struct net_device *netdev, + int vf_idx, u16 vid, u8 qos, __be16 vlan_proto); +-int fm10k_ndo_set_vf_bw(struct net_device *netdev, int vf_idx, int rate, +- int unused); ++int fm10k_ndo_set_vf_bw(struct net_device *netdev, int vf_idx, ++ int __always_unused min_rate, int max_rate); + int fm10k_ndo_get_vf_config(struct net_device *netdev, + int vf_idx, struct ifla_vf_info *ivi); + +diff --git a/drivers/net/ethernet/intel/fm10k/fm10k_iov.c b/drivers/net/ethernet/intel/fm10k/fm10k_iov.c +index 4a17cc903eed..ea3ab24265ee 100644 +--- a/drivers/net/ethernet/intel/fm10k/fm10k_iov.c ++++ b/drivers/net/ethernet/intel/fm10k/fm10k_iov.c +@@ -613,7 +613,7 @@ int fm10k_ndo_set_vf_vlan(struct net_device *netdev, int vf_idx, u16 vid, + } + + int fm10k_ndo_set_vf_bw(struct net_device *netdev, int vf_idx, +- int __always_unused unused, int rate) ++ int __always_unused min_rate, int max_rate) + { + struct fm10k_intfc *interface = netdev_priv(netdev); + struct fm10k_iov_data *iov_data = interface->iov_data; +@@ -624,14 +624,15 @@ int fm10k_ndo_set_vf_bw(struct net_device *netdev, int vf_idx, + return -EINVAL; + + /* rate limit cannot be less 
than 10Mbs or greater than link speed */ +- if (rate && ((rate < FM10K_VF_TC_MIN) || rate > FM10K_VF_TC_MAX)) ++ if (max_rate && ++ (max_rate < FM10K_VF_TC_MIN || max_rate > FM10K_VF_TC_MAX)) + return -EINVAL; + + /* store values */ +- iov_data->vf_info[vf_idx].rate = rate; ++ iov_data->vf_info[vf_idx].rate = max_rate; + + /* update hardware configuration */ +- hw->iov.ops.configure_tc(hw, vf_idx, rate); ++ hw->iov.ops.configure_tc(hw, vf_idx, max_rate); + + return 0; + } +-- +2.15.0 + diff --git a/queue/i40e-fix-client-notify-of-VF-reset.patch b/queue/i40e-fix-client-notify-of-VF-reset.patch new file mode 100644 index 0000000..b2fdb6a --- /dev/null +++ b/queue/i40e-fix-client-notify-of-VF-reset.patch @@ -0,0 +1,43 @@ +From c53d11f669c0e7d0daf46a717b6712ad0b09de99 Mon Sep 17 00:00:00 2001 +From: Alan Brady <alan.brady@intel.com> +Date: Tue, 22 Aug 2017 06:57:53 -0400 +Subject: [PATCH] i40e: fix client notify of VF reset + +commit c53d11f669c0e7d0daf46a717b6712ad0b09de99 upstream. + +Currently there is a bug in which the PF driver fails to inform clients +of a VF reset which then causes clients to leak resources. The bug +exists because we were incorrectly checking the I40E_VF_STATE_PRE_ENABLE +bit. + +When a VF is first init we go through a reset to initialize variables +and allocate resources but we don't want to inform clients of this first +reset since the client isn't fully enabled yet so we set a state bit +signifying we're in a "pre-enabled" client state. During the first +reset we should be clearing the bit, allowing all following resets to +notify the client of the reset when the bit is not set. This patch +fixes the issue by negating the 'test_and_clear_bit' check to accurately +reflect the behavior we want. 
+ +Signed-off-by: Alan Brady <alan.brady@intel.com> +Tested-by: Andrew Bowers <andrewx.bowers@intel.com> +Signed-off-by: Jeff Kirsher <jeffrey.t.kirsher@intel.com> + +diff --git a/drivers/net/ethernet/intel/i40e/i40e_virtchnl_pf.c b/drivers/net/ethernet/intel/i40e/i40e_virtchnl_pf.c +index 989a65d60ac9..04568137e029 100644 +--- a/drivers/net/ethernet/intel/i40e/i40e_virtchnl_pf.c ++++ b/drivers/net/ethernet/intel/i40e/i40e_virtchnl_pf.c +@@ -1050,8 +1050,8 @@ static void i40e_cleanup_reset_vf(struct i40e_vf *vf) + set_bit(I40E_VF_STATE_ACTIVE, &vf->vf_states); + clear_bit(I40E_VF_STATE_DISABLED, &vf->vf_states); + /* Do not notify the client during VF init */ +- if (test_and_clear_bit(I40E_VF_STATE_PRE_ENABLE, +- &vf->vf_states)) ++ if (!test_and_clear_bit(I40E_VF_STATE_PRE_ENABLE, ++ &vf->vf_states)) + i40e_notify_client_of_vf_reset(pf, abs_vf_id); + vf->num_vlan = 0; + } +-- +2.15.0 + diff --git a/queue/i40e-i40evf-spread-CPU-affinity-hints-across-online-.patch b/queue/i40e-i40evf-spread-CPU-affinity-hints-across-online-.patch new file mode 100644 index 0000000..0dcd14a --- /dev/null +++ b/queue/i40e-i40evf-spread-CPU-affinity-hints-across-online-.patch @@ -0,0 +1,128 @@ +From be664cbefc50977aaefc868ba6a1109ec9b7449d Mon Sep 17 00:00:00 2001 +From: Jacob Keller <jacob.e.keller@intel.com> +Date: Tue, 29 Aug 2017 05:32:31 -0400 +Subject: [PATCH] i40e/i40evf: spread CPU affinity hints across online CPUs + only + +commit be664cbefc50977aaefc868ba6a1109ec9b7449d upstream. + +Currently, when setting up the IRQ for a q_vector, we set an affinity +hint based on the v_idx of that q_vector. Meaning a loop iterates on +v_idx, which is an incremental value, and the cpumask is created based +on this value. + +This is a problem in systems with multiple logical CPUs per core (like in +simultaneous multithreading (SMT) scenarios). 
If we disable some logical +CPUs, by turning SMT off for example, we will end up with a sparse +cpu_online_mask, i.e., only the first CPU in a core is online, and +incremental filling in q_vector cpumask might lead to multiple offline +CPUs being assigned to q_vectors. + +Example: if we have a system with 8 cores each one containing 8 logical +CPUs (SMT == 8 in this case), we have 64 CPUs in total. But if SMT is +disabled, only the 1st CPU in each core remains online, so the +cpu_online_mask in this case would have only 8 bits set, in a sparse way. + +In general case, when SMT is off the cpu_online_mask has only C bits set: +0, 1*N, 2*N, ..., C*(N-1) where +C == # of cores; +N == # of logical CPUs per core. +In our example, only bits 0, 8, 16, 24, 32, 40, 48, 56 would be set. + +Instead, we should only assign hints for CPUs which are online. Even +better, the kernel already provides a function, cpumask_local_spread() +which takes an index and returns a CPU, spreading the interrupts across +local NUMA nodes first, and then remote ones if necessary. + +Since we generally have a 1:1 mapping between vectors and CPUs, there +is no real advantage to spreading vectors to local CPUs first. In order +to avoid mismatch of the default XPS hints, we'll pass -1 so that it +spreads across all CPUs without regard to the node locality. + +Note that we don't need to change the q_vector->affinity_mask as this is +initialized to cpu_possible_mask, until an actual affinity is set and +then notified back to us. 
+ +Signed-off-by: Jacob Keller <jacob.e.keller@intel.com> +Tested-by: Andrew Bowers <andrewx.bowers@intel.com> +Signed-off-by: Jeff Kirsher <jeffrey.t.kirsher@intel.com> + +diff --git a/drivers/net/ethernet/intel/i40e/i40e_main.c b/drivers/net/ethernet/intel/i40e/i40e_main.c +index b539469f576f..d2bb4f17c89e 100644 +--- a/drivers/net/ethernet/intel/i40e/i40e_main.c ++++ b/drivers/net/ethernet/intel/i40e/i40e_main.c +@@ -2885,14 +2885,15 @@ static void i40e_vsi_free_rx_resources(struct i40e_vsi *vsi) + static void i40e_config_xps_tx_ring(struct i40e_ring *ring) + { + struct i40e_vsi *vsi = ring->vsi; ++ int cpu; + + if (!ring->q_vector || !ring->netdev) + return; + + if ((vsi->tc_config.numtc <= 1) && + !test_and_set_bit(__I40E_TX_XPS_INIT_DONE, &ring->state)) { +- netif_set_xps_queue(ring->netdev, +- get_cpu_mask(ring->q_vector->v_idx), ++ cpu = cpumask_local_spread(ring->q_vector->v_idx, -1); ++ netif_set_xps_queue(ring->netdev, get_cpu_mask(cpu), + ring->queue_index); + } + +@@ -3482,6 +3483,7 @@ static int i40e_vsi_request_irq_msix(struct i40e_vsi *vsi, char *basename) + int tx_int_idx = 0; + int vector, err; + int irq_num; ++ int cpu; + + for (vector = 0; vector < q_vectors; vector++) { + struct i40e_q_vector *q_vector = vsi->q_vectors[vector]; +@@ -3517,10 +3519,14 @@ static int i40e_vsi_request_irq_msix(struct i40e_vsi *vsi, char *basename) + q_vector->affinity_notify.notify = i40e_irq_affinity_notify; + q_vector->affinity_notify.release = i40e_irq_affinity_release; + irq_set_affinity_notifier(irq_num, &q_vector->affinity_notify); +- /* get_cpu_mask returns a static constant mask with +- * a permanent lifetime so it's ok to use here. ++ /* Spread affinity hints out across online CPUs. ++ * ++ * get_cpu_mask returns a static constant mask with ++ * a permanent lifetime so it's ok to pass to ++ * irq_set_affinity_hint without making a copy. 
+ */ +- irq_set_affinity_hint(irq_num, get_cpu_mask(q_vector->v_idx)); ++ cpu = cpumask_local_spread(q_vector->v_idx, -1); ++ irq_set_affinity_hint(irq_num, get_cpu_mask(cpu)); + } + + vsi->irqs_ready = true; +diff --git a/drivers/net/ethernet/intel/i40evf/i40evf_main.c b/drivers/net/ethernet/intel/i40evf/i40evf_main.c +index f2f1e754c2ce..bc76378a71e2 100644 +--- a/drivers/net/ethernet/intel/i40evf/i40evf_main.c ++++ b/drivers/net/ethernet/intel/i40evf/i40evf_main.c +@@ -515,6 +515,7 @@ i40evf_request_traffic_irqs(struct i40evf_adapter *adapter, char *basename) + unsigned int vector, q_vectors; + unsigned int rx_int_idx = 0, tx_int_idx = 0; + int irq_num, err; ++ int cpu; + + i40evf_irq_disable(adapter); + /* Decrement for Other and TCP Timer vectors */ +@@ -553,10 +554,12 @@ i40evf_request_traffic_irqs(struct i40evf_adapter *adapter, char *basename) + q_vector->affinity_notify.release = + i40evf_irq_affinity_release; + irq_set_affinity_notifier(irq_num, &q_vector->affinity_notify); +- /* get_cpu_mask returns a static constant mask with +- * a permanent lifetime so it's ok to use here. ++ /* Spread the IRQ affinity hints across online CPUs. Note that ++ * get_cpu_mask returns a mask with a permanent lifetime so ++ * it's safe to use as a hint for irq_set_affinity_hint. 
+ */ +- irq_set_affinity_hint(irq_num, get_cpu_mask(q_vector->v_idx)); ++ cpu = cpumask_local_spread(q_vector->v_idx, -1); ++ irq_set_affinity_hint(irq_num, get_cpu_mask(cpu)); + } + + return 0; +-- +2.15.0 + diff --git a/queue/i40e-use-the-safe-hash-table-iterator-when-deleting-.patch b/queue/i40e-use-the-safe-hash-table-iterator-when-deleting-.patch new file mode 100644 index 0000000..09b680c --- /dev/null +++ b/queue/i40e-use-the-safe-hash-table-iterator-when-deleting-.patch @@ -0,0 +1,43 @@ +From 784548c40d6f43eff2297220ad7800dc04be03c6 Mon Sep 17 00:00:00 2001 +From: Lihong Yang <lihong.yang@intel.com> +Date: Thu, 7 Sep 2017 08:05:46 -0400 +Subject: [PATCH] i40e: use the safe hash table iterator when deleting mac + filters + +commit 784548c40d6f43eff2297220ad7800dc04be03c6 upstream. + +This patch replaces hash_for_each function with hash_for_each_safe +when calling __i40e_del_filter. The hash_for_each_safe function is +the right one to use when iterating over a hash table to safely remove +a hash entry. Otherwise, incorrect values may be read from freed memory. + +Detected by CoverityScan, CID 1402048 Read from pointer after free + +Signed-off-by: Lihong Yang <lihong.yang@intel.com> +Tested-by: Andrew Bowers <andrewx.bowers@intel.com> +Signed-off-by: Jeff Kirsher <jeffrey.t.kirsher@intel.com> + +diff --git a/drivers/net/ethernet/intel/i40e/i40e_virtchnl_pf.c b/drivers/net/ethernet/intel/i40e/i40e_virtchnl_pf.c +index 04568137e029..c062d74d21f3 100644 +--- a/drivers/net/ethernet/intel/i40e/i40e_virtchnl_pf.c ++++ b/drivers/net/ethernet/intel/i40e/i40e_virtchnl_pf.c +@@ -2883,6 +2883,7 @@ int i40e_ndo_set_vf_mac(struct net_device *netdev, int vf_id, u8 *mac) + struct i40e_mac_filter *f; + struct i40e_vf *vf; + int ret = 0; ++ struct hlist_node *h; + int bkt; + + /* validate the request */ +@@ -2921,7 +2922,7 @@ int i40e_ndo_set_vf_mac(struct net_device *netdev, int vf_id, u8 *mac) + /* Delete all the filters for this VSI - we're going to kill it + * anyway. 
+ */ +- hash_for_each(vsi->mac_filter_hash, bkt, f, hlist) ++ hash_for_each_safe(vsi->mac_filter_hash, bkt, h, f, hlist) + __i40e_del_filter(vsi, f); + + spin_unlock_bh(&vsi->mac_filter_hash_lock); +-- +2.15.0 + diff --git a/queue/ibmvnic-Set-state-UP.patch b/queue/ibmvnic-Set-state-UP.patch new file mode 100644 index 0000000..b85b8ce --- /dev/null +++ b/queue/ibmvnic-Set-state-UP.patch @@ -0,0 +1,37 @@ +From e876a8a7e9dd89dc88c12ca2e81beb478dbe9897 Mon Sep 17 00:00:00 2001 +From: Mick Tarsel <mjtarsel@linux.vnet.ibm.com> +Date: Thu, 28 Sep 2017 13:53:18 -0700 +Subject: [PATCH] ibmvnic: Set state UP + +commit e876a8a7e9dd89dc88c12ca2e81beb478dbe9897 upstream. + +State is initially reported as UNKNOWN. Before register call +netif_carrier_off(). Once the device is opened, call netif_carrier_on() in +order to set the state to UP. + +Signed-off-by: Mick Tarsel <mjtarsel@linux.vnet.ibm.com> +Signed-off-by: David S. Miller <davem@davemloft.net> + +diff --git a/drivers/net/ethernet/ibm/ibmvnic.c b/drivers/net/ethernet/ibm/ibmvnic.c +index cb8182f4fdfa..4bc14a901571 100644 +--- a/drivers/net/ethernet/ibm/ibmvnic.c ++++ b/drivers/net/ethernet/ibm/ibmvnic.c +@@ -927,6 +927,7 @@ static int ibmvnic_open(struct net_device *netdev) + } + + rc = __ibmvnic_open(netdev); ++ netif_carrier_on(netdev); + mutex_unlock(&adapter->reset_lock); + + return rc; +@@ -3899,6 +3900,7 @@ static int ibmvnic_probe(struct vio_dev *dev, const struct vio_device_id *id) + if (rc) + goto ibmvnic_init_fail; + ++ netif_carrier_off(netdev); + rc = register_netdev(netdev); + if (rc) { + dev_err(&dev->dev, "failed to register netdev rc=%d\n", rc); +-- +2.15.0 + diff --git a/queue/igb-check-memory-allocation-failure.patch b/queue/igb-check-memory-allocation-failure.patch new file mode 100644 index 0000000..0004cc9 --- /dev/null +++ b/queue/igb-check-memory-allocation-failure.patch @@ -0,0 +1,33 @@ +From 18eb86362a52f0af933cc0fd5e37027317eb2d1c Mon Sep 17 00:00:00 2001 +From: Christophe JAILLET 
<christophe.jaillet@wanadoo.fr> +Date: Sun, 27 Aug 2017 08:39:51 +0200 +Subject: [PATCH] igb: check memory allocation failure + +commit 18eb86362a52f0af933cc0fd5e37027317eb2d1c upstream. + +Check memory allocation failures and return -ENOMEM in such cases, as +already done for other memory allocations in this function. + +This avoids NULL pointers dereference. + +Signed-off-by: Christophe JAILLET <christophe.jaillet@wanadoo.fr> +Tested-by: Aaron Brown <aaron.f.brown@intel.com> +Acked-by: PJ Waskiewicz <peter.waskiewicz.jr@intel.com> +Signed-off-by: Jeff Kirsher <jeffrey.t.kirsher@intel.com> + +diff --git a/drivers/net/ethernet/intel/igb/igb_main.c b/drivers/net/ethernet/intel/igb/igb_main.c +index fd4a46b03cc8..837d9b46a390 100644 +--- a/drivers/net/ethernet/intel/igb/igb_main.c ++++ b/drivers/net/ethernet/intel/igb/igb_main.c +@@ -3162,6 +3162,8 @@ static int igb_sw_init(struct igb_adapter *adapter) + /* Setup and initialize a copy of the hw vlan table array */ + adapter->shadow_vfta = kcalloc(E1000_VLAN_FILTER_TBL_SIZE, sizeof(u32), + GFP_ATOMIC); ++ if (!adapter->shadow_vfta) ++ return -ENOMEM; + + /* This call may decrease the number of queues */ + if (igb_init_interrupt_scheme(adapter, true)) { +-- +2.15.0 + diff --git a/queue/iio-st_sensors-add-register-mask-for-status-register.patch b/queue/iio-st_sensors-add-register-mask-for-status-register.patch new file mode 100644 index 0000000..3ca800d --- /dev/null +++ b/queue/iio-st_sensors-add-register-mask-for-status-register.patch @@ -0,0 +1,311 @@ +From e72a060151e5bb673af24993665e270fc4f674a7 Mon Sep 17 00:00:00 2001 +From: Lorenzo Bianconi <lorenzo.bianconi83@gmail.com> +Date: Wed, 30 Aug 2017 13:50:39 +0200 +Subject: [PATCH] iio: st_sensors: add register mask for status register + +commit e72a060151e5bb673af24993665e270fc4f674a7 upstream. + +Introduce register mask for data-ready status register since +pressure sensors (e.g. 
LPS22HB) export just two channels +(BIT(0) and BIT(1)) and BIT(2) is marked reserved while in +st_sensors_new_samples_available() value read from status register +is masked using 0x7. +Moreover do not mask status register using active_scan_mask since +now status value is properly masked and if the result is not zero the +interrupt has to be consumed by the driver. This fix an issue on LPS25H +and LPS331AP where channel definition is swapped respect to status +register. +Furthermore that change allows to properly support new devices +(e.g LIS2DW12) that report just ZYXDA (data-ready) field in status register +to figure out if the interrupt has been generated by the device. + +Fixes: 97865fe41322 (iio: st_sensors: verify interrupt event to status) +Signed-off-by: Lorenzo Bianconi <lorenzo.bianconi@st.com> +Reviewed-by: Linus Walleij <linus.walleij@linaro.org> +Signed-off-by: Jonathan Cameron <Jonathan.Cameron@huawei.com> + +diff --git a/drivers/iio/accel/st_accel_core.c b/drivers/iio/accel/st_accel_core.c +index 1a2e54ff473a..140ba26f6131 100644 +--- a/drivers/iio/accel/st_accel_core.c ++++ b/drivers/iio/accel/st_accel_core.c +@@ -164,7 +164,10 @@ static const struct st_sensor_settings st_accel_sensors_settings[] = { + .mask_int2 = 0x00, + .addr_ihl = 0x25, + .mask_ihl = 0x02, +- .addr_stat_drdy = ST_SENSORS_DEFAULT_STAT_ADDR, ++ .stat_drdy = { ++ .addr = ST_SENSORS_DEFAULT_STAT_ADDR, ++ .mask = 0x07, ++ }, + }, + .sim = { + .addr = 0x23, +@@ -236,7 +239,10 @@ static const struct st_sensor_settings st_accel_sensors_settings[] = { + .mask_ihl = 0x80, + .addr_od = 0x22, + .mask_od = 0x40, +- .addr_stat_drdy = ST_SENSORS_DEFAULT_STAT_ADDR, ++ .stat_drdy = { ++ .addr = ST_SENSORS_DEFAULT_STAT_ADDR, ++ .mask = 0x07, ++ }, + }, + .sim = { + .addr = 0x23, +@@ -318,7 +324,10 @@ static const struct st_sensor_settings st_accel_sensors_settings[] = { + .mask_int2 = 0x00, + .addr_ihl = 0x23, + .mask_ihl = 0x40, +- .addr_stat_drdy = ST_SENSORS_DEFAULT_STAT_ADDR, ++ .stat_drdy = { 
++ .addr = ST_SENSORS_DEFAULT_STAT_ADDR, ++ .mask = 0x07, ++ }, + .ig1 = { + .en_addr = 0x23, + .en_mask = 0x08, +@@ -389,7 +398,10 @@ static const struct st_sensor_settings st_accel_sensors_settings[] = { + .drdy_irq = { + .addr = 0x21, + .mask_int1 = 0x04, +- .addr_stat_drdy = ST_SENSORS_DEFAULT_STAT_ADDR, ++ .stat_drdy = { ++ .addr = ST_SENSORS_DEFAULT_STAT_ADDR, ++ .mask = 0x07, ++ }, + }, + .sim = { + .addr = 0x21, +@@ -451,7 +463,10 @@ static const struct st_sensor_settings st_accel_sensors_settings[] = { + .mask_ihl = 0x80, + .addr_od = 0x22, + .mask_od = 0x40, +- .addr_stat_drdy = ST_SENSORS_DEFAULT_STAT_ADDR, ++ .stat_drdy = { ++ .addr = ST_SENSORS_DEFAULT_STAT_ADDR, ++ .mask = 0x07, ++ }, + }, + .sim = { + .addr = 0x21, +@@ -569,7 +584,10 @@ static const struct st_sensor_settings st_accel_sensors_settings[] = { + .drdy_irq = { + .addr = 0x21, + .mask_int1 = 0x04, +- .addr_stat_drdy = ST_SENSORS_DEFAULT_STAT_ADDR, ++ .stat_drdy = { ++ .addr = ST_SENSORS_DEFAULT_STAT_ADDR, ++ .mask = 0x07, ++ }, + }, + .sim = { + .addr = 0x21, +@@ -640,7 +658,10 @@ static const struct st_sensor_settings st_accel_sensors_settings[] = { + .mask_int2 = 0x00, + .addr_ihl = 0x25, + .mask_ihl = 0x02, +- .addr_stat_drdy = ST_SENSORS_DEFAULT_STAT_ADDR, ++ .stat_drdy = { ++ .addr = ST_SENSORS_DEFAULT_STAT_ADDR, ++ .mask = 0x07, ++ }, + }, + .sim = { + .addr = 0x23, +diff --git a/drivers/iio/common/st_sensors/st_sensors_core.c b/drivers/iio/common/st_sensors/st_sensors_core.c +index 02e833b14db0..34115f05d5c4 100644 +--- a/drivers/iio/common/st_sensors/st_sensors_core.c ++++ b/drivers/iio/common/st_sensors/st_sensors_core.c +@@ -470,7 +470,7 @@ int st_sensors_set_dataready_irq(struct iio_dev *indio_dev, bool enable) + * different one. 
Take into account irq status register + * to understand if irq trigger can be properly supported + */ +- if (sdata->sensor_settings->drdy_irq.addr_stat_drdy) ++ if (sdata->sensor_settings->drdy_irq.stat_drdy.addr) + sdata->hw_irq_trigger = enable; + return 0; + } +diff --git a/drivers/iio/common/st_sensors/st_sensors_trigger.c b/drivers/iio/common/st_sensors/st_sensors_trigger.c +index fa73e6795359..fdcc5a891958 100644 +--- a/drivers/iio/common/st_sensors/st_sensors_trigger.c ++++ b/drivers/iio/common/st_sensors/st_sensors_trigger.c +@@ -31,7 +31,7 @@ static int st_sensors_new_samples_available(struct iio_dev *indio_dev, + int ret; + + /* How would I know if I can't check it? */ +- if (!sdata->sensor_settings->drdy_irq.addr_stat_drdy) ++ if (!sdata->sensor_settings->drdy_irq.stat_drdy.addr) + return -EINVAL; + + /* No scan mask, no interrupt */ +@@ -39,23 +39,15 @@ static int st_sensors_new_samples_available(struct iio_dev *indio_dev, + return 0; + + ret = sdata->tf->read_byte(&sdata->tb, sdata->dev, +- sdata->sensor_settings->drdy_irq.addr_stat_drdy, ++ sdata->sensor_settings->drdy_irq.stat_drdy.addr, + &status); + if (ret < 0) { + dev_err(sdata->dev, + "error checking samples available\n"); + return ret; + } +- /* +- * the lower bits of .active_scan_mask[0] is directly mapped +- * to the channels on the sensor: either bit 0 for +- * one-dimensional sensors, or e.g. x,y,z for accelerometers, +- * gyroscopes or magnetometers. No sensor use more than 3 +- * channels, so cut the other status bits here. +- */ +- status &= 0x07; + +- if (status & (u8)indio_dev->active_scan_mask[0]) ++ if (status & sdata->sensor_settings->drdy_irq.stat_drdy.mask) + return 1; + + return 0; +@@ -212,7 +204,7 @@ int st_sensors_allocate_trigger(struct iio_dev *indio_dev, + * it was "our" interrupt. 
+ */ + if (sdata->int_pin_open_drain && +- sdata->sensor_settings->drdy_irq.addr_stat_drdy) ++ sdata->sensor_settings->drdy_irq.stat_drdy.addr) + irq_trig |= IRQF_SHARED; + + err = request_threaded_irq(sdata->get_irq_data_ready(indio_dev), +diff --git a/drivers/iio/gyro/st_gyro_core.c b/drivers/iio/gyro/st_gyro_core.c +index 4cf85aa01dde..22c0c1732996 100644 +--- a/drivers/iio/gyro/st_gyro_core.c ++++ b/drivers/iio/gyro/st_gyro_core.c +@@ -118,7 +118,10 @@ static const struct st_sensor_settings st_gyro_sensors_settings[] = { + * drain settings, but only for INT1 and not + * for the DRDY line on INT2. + */ +- .addr_stat_drdy = ST_SENSORS_DEFAULT_STAT_ADDR, ++ .stat_drdy = { ++ .addr = ST_SENSORS_DEFAULT_STAT_ADDR, ++ .mask = 0x07, ++ }, + }, + .multi_read_bit = true, + .bootime = 2, +@@ -188,7 +191,10 @@ static const struct st_sensor_settings st_gyro_sensors_settings[] = { + * drain settings, but only for INT1 and not + * for the DRDY line on INT2. + */ +- .addr_stat_drdy = ST_SENSORS_DEFAULT_STAT_ADDR, ++ .stat_drdy = { ++ .addr = ST_SENSORS_DEFAULT_STAT_ADDR, ++ .mask = 0x07, ++ }, + }, + .multi_read_bit = true, + .bootime = 2, +@@ -253,7 +259,10 @@ static const struct st_sensor_settings st_gyro_sensors_settings[] = { + * drain settings, but only for INT1 and not + * for the DRDY line on INT2. 
+ */ +- .addr_stat_drdy = ST_SENSORS_DEFAULT_STAT_ADDR, ++ .stat_drdy = { ++ .addr = ST_SENSORS_DEFAULT_STAT_ADDR, ++ .mask = 0x07, ++ }, + }, + .multi_read_bit = true, + .bootime = 2, +diff --git a/drivers/iio/magnetometer/st_magn_core.c b/drivers/iio/magnetometer/st_magn_core.c +index 703de313c418..ace72c57f53c 100644 +--- a/drivers/iio/magnetometer/st_magn_core.c ++++ b/drivers/iio/magnetometer/st_magn_core.c +@@ -317,7 +317,10 @@ static const struct st_sensor_settings st_magn_sensors_settings[] = { + }, + .drdy_irq = { + /* drdy line is routed drdy pin */ +- .addr_stat_drdy = ST_SENSORS_DEFAULT_STAT_ADDR, ++ .stat_drdy = { ++ .addr = ST_SENSORS_DEFAULT_STAT_ADDR, ++ .mask = 0x07, ++ }, + }, + .multi_read_bit = true, + .bootime = 2, +@@ -361,7 +364,10 @@ static const struct st_sensor_settings st_magn_sensors_settings[] = { + .drdy_irq = { + .addr = 0x62, + .mask_int1 = 0x01, +- .addr_stat_drdy = 0x67, ++ .stat_drdy = { ++ .addr = 0x67, ++ .mask = 0x07, ++ }, + }, + .multi_read_bit = false, + .bootime = 2, +diff --git a/drivers/iio/pressure/st_pressure_core.c b/drivers/iio/pressure/st_pressure_core.c +index 86120715913b..5f8358e23f5b 100644 +--- a/drivers/iio/pressure/st_pressure_core.c ++++ b/drivers/iio/pressure/st_pressure_core.c +@@ -287,7 +287,10 @@ static const struct st_sensor_settings st_press_sensors_settings[] = { + .mask_ihl = 0x80, + .addr_od = 0x22, + .mask_od = 0x40, +- .addr_stat_drdy = ST_SENSORS_DEFAULT_STAT_ADDR, ++ .stat_drdy = { ++ .addr = ST_SENSORS_DEFAULT_STAT_ADDR, ++ .mask = 0x03, ++ }, + }, + .multi_read_bit = true, + .bootime = 2, +@@ -395,7 +398,10 @@ static const struct st_sensor_settings st_press_sensors_settings[] = { + .mask_ihl = 0x80, + .addr_od = 0x22, + .mask_od = 0x40, +- .addr_stat_drdy = ST_SENSORS_DEFAULT_STAT_ADDR, ++ .stat_drdy = { ++ .addr = ST_SENSORS_DEFAULT_STAT_ADDR, ++ .mask = 0x03, ++ }, + }, + .multi_read_bit = true, + .bootime = 2, +@@ -456,7 +462,10 @@ static const struct st_sensor_settings 
st_press_sensors_settings[] = { + .mask_ihl = 0x80, + .addr_od = 0x12, + .mask_od = 0x40, +- .addr_stat_drdy = ST_SENSORS_DEFAULT_STAT_ADDR, ++ .stat_drdy = { ++ .addr = ST_SENSORS_DEFAULT_STAT_ADDR, ++ .mask = 0x03, ++ }, + }, + .multi_read_bit = false, + .bootime = 2, +diff --git a/include/linux/iio/common/st_sensors.h b/include/linux/iio/common/st_sensors.h +index 7b0fa8b5c120..ce0ef1c0a30a 100644 +--- a/include/linux/iio/common/st_sensors.h ++++ b/include/linux/iio/common/st_sensors.h +@@ -139,7 +139,7 @@ struct st_sensor_das { + * @mask_ihl: mask to enable/disable active low on the INT lines. + * @addr_od: address to enable/disable Open Drain on the INT lines. + * @mask_od: mask to enable/disable Open Drain on the INT lines. +- * @addr_stat_drdy: address to read status of DRDY (data ready) interrupt ++ * struct stat_drdy - status register of DRDY (data ready) interrupt. + * struct ig1 - represents the Interrupt Generator 1 of sensors. + * @en_addr: address of the enable ig1 register. + * @en_mask: mask to write the on/off value for enable. +@@ -152,7 +152,10 @@ struct st_sensor_data_ready_irq { + u8 mask_ihl; + u8 addr_od; + u8 mask_od; +- u8 addr_stat_drdy; ++ struct { ++ u8 addr; ++ u8 mask; ++ } stat_drdy; + struct { + u8 en_addr; + u8 en_mask; +-- +2.15.0 + diff --git a/queue/ip_gre-check-packet-length-and-mtu-correctly-in-ersp.patch b/queue/ip_gre-check-packet-length-and-mtu-correctly-in-ersp.patch new file mode 100644 index 0000000..e608279 --- /dev/null +++ b/queue/ip_gre-check-packet-length-and-mtu-correctly-in-ersp.patch @@ -0,0 +1,48 @@ +From f192970de860d3ab90aa9e2a22853201a57bde78 Mon Sep 17 00:00:00 2001 +From: William Tu <u9012063@gmail.com> +Date: Thu, 5 Oct 2017 12:07:12 -0700 +Subject: [PATCH] ip_gre: check packet length and mtu correctly in erspan tx + +commit f192970de860d3ab90aa9e2a22853201a57bde78 upstream. + +Similarly to early patch for erspan_xmit(), the ARPHDR_ETHER device +is the length of the whole ether packet. 
So skb->len should subtract +the dev->hard_header_len. + +Fixes: 1a66a836da63 ("gre: add collect_md mode to ERSPAN tunnel") +Fixes: 84e54fe0a5ea ("gre: introduce native tunnel support for ERSPAN") +Signed-off-by: William Tu <u9012063@gmail.com> +Cc: Xin Long <lucien.xin@gmail.com> +Cc: David Laight <David.Laight@aculab.com> +Reviewed-by: Xin Long <lucien.xin@gmail.com> +Signed-off-by: David S. Miller <davem@davemloft.net> + +diff --git a/net/ipv4/ip_gre.c b/net/ipv4/ip_gre.c +index dc2317776499..c105a315b1a3 100644 +--- a/net/ipv4/ip_gre.c ++++ b/net/ipv4/ip_gre.c +@@ -579,8 +579,8 @@ static void erspan_fb_xmit(struct sk_buff *skb, struct net_device *dev, + if (gre_handle_offloads(skb, false)) + goto err_free_rt; + +- if (skb->len > dev->mtu) { +- pskb_trim(skb, dev->mtu); ++ if (skb->len > dev->mtu + dev->hard_header_len) { ++ pskb_trim(skb, dev->mtu + dev->hard_header_len); + truncate = true; + } + +@@ -731,8 +731,8 @@ static netdev_tx_t erspan_xmit(struct sk_buff *skb, + if (skb_cow_head(skb, dev->needed_headroom)) + goto free_skb; + +- if (skb->len - dev->hard_header_len > dev->mtu) { +- pskb_trim(skb, dev->mtu); ++ if (skb->len > dev->mtu + dev->hard_header_len) { ++ pskb_trim(skb, dev->mtu + dev->hard_header_len); + truncate = true; + } + +-- +2.15.0 + diff --git a/queue/ipv6-grab-rt-rt6i_ref-before-allocating-pcpu-rt.patch b/queue/ipv6-grab-rt-rt6i_ref-before-allocating-pcpu-rt.patch new file mode 100644 index 0000000..566315e --- /dev/null +++ b/queue/ipv6-grab-rt-rt6i_ref-before-allocating-pcpu-rt.patch @@ -0,0 +1,121 @@ +From a94b9367e044ba672c9f4105eb1516ff6ff4948a Mon Sep 17 00:00:00 2001 +From: Wei Wang <weiwan@google.com> +Date: Fri, 6 Oct 2017 12:06:04 -0700 +Subject: [PATCH] ipv6: grab rt->rt6i_ref before allocating pcpu rt + +commit a94b9367e044ba672c9f4105eb1516ff6ff4948a upstream. + +After rwlock is replaced with rcu and spinlock, ip6_pol_route() will be +called with only rcu held. 
That means rt6 route deletion could happen +simultaneously with rt6_make_pcpu_rt(). This could potentially cause +memory leak if rt6_release() is called right before rt6_make_pcpu_rt() +on the same route. + +This patch grabs rt->rt6i_ref safely before calling rt6_make_pcpu_rt() +to make sure rt6_release() will not get triggered while +rt6_make_pcpu_rt() is in progress. And rt6_release() is called after +rt6_make_pcpu_rt() is finished. + +Note: As we are incrementing rt->rt6i_ref in ip6_pol_route(), there is a +very slim chance that fib6_purge_rt() will be triggered unnecessarily +when deleting a route if ip6_pol_route() running on another thread picks +this route as well and tries to make pcpu cache for it. + +Signed-off-by: Wei Wang <weiwan@google.com> +Signed-off-by: Martin KaFai Lau <kafai@fb.com> +Signed-off-by: Eric Dumazet <edumazet@google.com> +Signed-off-by: David S. Miller <davem@davemloft.net> + +diff --git a/net/ipv6/route.c b/net/ipv6/route.c +index 65130dde276a..941c062389d2 100644 +--- a/net/ipv6/route.c ++++ b/net/ipv6/route.c +@@ -1070,7 +1070,6 @@ static struct rt6_info *rt6_get_pcpu_route(struct rt6_info *rt) + + static struct rt6_info *rt6_make_pcpu_route(struct rt6_info *rt) + { +- struct fib6_table *table = rt->rt6i_table; + struct rt6_info *pcpu_rt, *prev, **p; + + pcpu_rt = ip6_rt_pcpu_alloc(rt); +@@ -1081,28 +1080,20 @@ static struct rt6_info *rt6_make_pcpu_route(struct rt6_info *rt) + return net->ipv6.ip6_null_entry; + } + +- read_lock_bh(&table->tb6_lock); +- if (rt->rt6i_pcpu) { +- p = this_cpu_ptr(rt->rt6i_pcpu); +- prev = cmpxchg(p, NULL, pcpu_rt); +- if (prev) { +- /* If someone did it before us, return prev instead */ +- dst_release_immediate(&pcpu_rt->dst); +- pcpu_rt = prev; +- } +- } else { +- /* rt has been removed from the fib6 tree +- * before we have a chance to acquire the read_lock. +- * In this case, don't brother to create a pcpu rt +- * since rt is going away anyway. The next +- * dst_check() will trigger a re-lookup. 
+- */ ++ dst_hold(&pcpu_rt->dst); ++ p = this_cpu_ptr(rt->rt6i_pcpu); ++ prev = cmpxchg(p, NULL, pcpu_rt); ++ if (prev) { ++ /* If someone did it before us, return prev instead */ ++ /* release refcnt taken by ip6_rt_pcpu_alloc() */ ++ dst_release_immediate(&pcpu_rt->dst); ++ /* release refcnt taken by above dst_hold() */ + dst_release_immediate(&pcpu_rt->dst); +- pcpu_rt = rt; ++ dst_hold(&prev->dst); ++ pcpu_rt = prev; + } +- dst_hold(&pcpu_rt->dst); ++ + rt6_dst_from_metrics_check(pcpu_rt); +- read_unlock_bh(&table->tb6_lock); + return pcpu_rt; + } + +@@ -1683,19 +1674,28 @@ struct rt6_info *ip6_pol_route(struct net *net, struct fib6_table *table, + if (pcpu_rt) { + read_unlock_bh(&table->tb6_lock); + } else { +- /* We have to do the read_unlock first +- * because rt6_make_pcpu_route() may trigger +- * ip6_dst_gc() which will take the write_lock. +- */ +- dst_hold(&rt->dst); +- read_unlock_bh(&table->tb6_lock); +- pcpu_rt = rt6_make_pcpu_route(rt); +- dst_release(&rt->dst); ++ /* atomic_inc_not_zero() is needed when using rcu */ ++ if (atomic_inc_not_zero(&rt->rt6i_ref)) { ++ /* We have to do the read_unlock first ++ * because rt6_make_pcpu_route() may trigger ++ * ip6_dst_gc() which will take the write_lock. ++ * ++ * No dst_hold() on rt is needed because grabbing ++ * rt->rt6i_ref makes sure rt can't be released. 
++ */ ++ read_unlock_bh(&table->tb6_lock); ++ pcpu_rt = rt6_make_pcpu_route(rt); ++ rt6_release(rt); ++ } else { ++ /* rt is already removed from tree */ ++ read_unlock_bh(&table->tb6_lock); ++ pcpu_rt = net->ipv6.ip6_null_entry; ++ dst_hold(&pcpu_rt->dst); ++ } + } + + trace_fib6_table_lookup(net, pcpu_rt, table->tb6_id, fl6); + return pcpu_rt; +- + } + } + EXPORT_SYMBOL_GPL(ip6_pol_route); +-- +2.15.0 + diff --git a/queue/ixgbe-fix-use-of-uninitialized-padding.patch b/queue/ixgbe-fix-use-of-uninitialized-padding.patch new file mode 100644 index 0000000..8f5e63a --- /dev/null +++ b/queue/ixgbe-fix-use-of-uninitialized-padding.patch @@ -0,0 +1,51 @@ +From dcfd6b839c998bc9838e2a47f44f37afbdf3099c Mon Sep 17 00:00:00 2001 +From: Emil Tantilov <emil.s.tantilov@intel.com> +Date: Mon, 11 Sep 2017 14:21:31 -0700 +Subject: [PATCH] ixgbe: fix use of uninitialized padding + +commit dcfd6b839c998bc9838e2a47f44f37afbdf3099c upstream. + +This patch is resolving Coverity hits where padding in a structure could +be used uninitialized. 
+ +- Initialize fw_cmd.pad/2 before ixgbe_calculate_checksum() + +- Initialize buffer.pad2/3 before ixgbe_hic_unlocked() + +Signed-off-by: Emil Tantilov <emil.s.tantilov@intel.com> +Tested-by: Andrew Bowers <andrewx.bowers@intel.com> +Signed-off-by: Jeff Kirsher <jeffrey.t.kirsher@intel.com> + +diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe_common.c b/drivers/net/ethernet/intel/ixgbe/ixgbe_common.c +index 2c19070d2a0b..041940c4bb2b 100644 +--- a/drivers/net/ethernet/intel/ixgbe/ixgbe_common.c ++++ b/drivers/net/ethernet/intel/ixgbe/ixgbe_common.c +@@ -3800,10 +3800,10 @@ s32 ixgbe_set_fw_drv_ver_generic(struct ixgbe_hw *hw, u8 maj, u8 min, + fw_cmd.ver_build = build; + fw_cmd.ver_sub = sub; + fw_cmd.hdr.checksum = 0; +- fw_cmd.hdr.checksum = ixgbe_calculate_checksum((u8 *)&fw_cmd, +- (FW_CEM_HDR_LEN + fw_cmd.hdr.buf_len)); + fw_cmd.pad = 0; + fw_cmd.pad2 = 0; ++ fw_cmd.hdr.checksum = ixgbe_calculate_checksum((u8 *)&fw_cmd, ++ (FW_CEM_HDR_LEN + fw_cmd.hdr.buf_len)); + + for (i = 0; i <= FW_CEM_MAX_RETRIES; i++) { + ret_val = ixgbe_host_interface_command(hw, &fw_cmd, +diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe_x550.c b/drivers/net/ethernet/intel/ixgbe/ixgbe_x550.c +index 8cea53b62e1b..cb7da5f9c4da 100644 +--- a/drivers/net/ethernet/intel/ixgbe/ixgbe_x550.c ++++ b/drivers/net/ethernet/intel/ixgbe/ixgbe_x550.c +@@ -900,6 +900,8 @@ static s32 ixgbe_read_ee_hostif_buffer_X550(struct ixgbe_hw *hw, + /* convert offset from words to bytes */ + buffer.address = cpu_to_be32((offset + current_word) * 2); + buffer.length = cpu_to_be16(words_to_read * 2); ++ buffer.pad2 = 0; ++ buffer.pad3 = 0; + + status = ixgbe_hic_unlocked(hw, (u32 *)&buffer, sizeof(buffer), + IXGBE_HI_COMMAND_TIMEOUT); +-- +2.15.0 + diff --git a/queue/kvm-mm-account-kvm-related-kmem-slabs-to-kmemcg.patch b/queue/kvm-mm-account-kvm-related-kmem-slabs-to-kmemcg.patch new file mode 100644 index 0000000..2847631 --- /dev/null +++ b/queue/kvm-mm-account-kvm-related-kmem-slabs-to-kmemcg.patch @@ 
-0,0 +1,54 @@ +From 46bea48ac241fe0b413805952dda74dd0c09ba8b Mon Sep 17 00:00:00 2001 +From: Shakeel Butt <shakeelb@google.com> +Date: Thu, 5 Oct 2017 18:07:24 -0700 +Subject: [PATCH] kvm, mm: account kvm related kmem slabs to kmemcg + +commit 46bea48ac241fe0b413805952dda74dd0c09ba8b upstream. + +The kvm slabs can consume a significant amount of system memory +and indeed in our production environment we have observed that +a lot of machines are spending significant amount of memory that +can not be left as system memory overhead. Also the allocations +from these slabs can be triggered directly by user space applications +which has access to kvm and thus a buggy application can leak +such memory. So, these caches should be accounted to kmemcg. + +Signed-off-by: Shakeel Butt <shakeelb@google.com> +Signed-off-by: Paolo Bonzini <pbonzini@redhat.com> + +diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c +index 3b7f94715c57..cdedf5320145 100644 +--- a/arch/x86/kvm/mmu.c ++++ b/arch/x86/kvm/mmu.c +@@ -5475,13 +5475,13 @@ int kvm_mmu_module_init(void) + + pte_list_desc_cache = kmem_cache_create("pte_list_desc", + sizeof(struct pte_list_desc), +- 0, 0, NULL); ++ 0, SLAB_ACCOUNT, NULL); + if (!pte_list_desc_cache) + goto nomem; + + mmu_page_header_cache = kmem_cache_create("kvm_mmu_page_header", + sizeof(struct kvm_mmu_page), +- 0, 0, NULL); ++ 0, SLAB_ACCOUNT, NULL); + if (!mmu_page_header_cache) + goto nomem; + +diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c +index 9deb5a245b83..3d73299e05f2 100644 +--- a/virt/kvm/kvm_main.c ++++ b/virt/kvm/kvm_main.c +@@ -4010,7 +4010,7 @@ int kvm_init(void *opaque, unsigned vcpu_size, unsigned vcpu_align, + if (!vcpu_align) + vcpu_align = __alignof__(struct kvm_vcpu); + kvm_vcpu_cache = kmem_cache_create("kvm_vcpu", vcpu_size, vcpu_align, +- 0, NULL); ++ SLAB_ACCOUNT, NULL); + if (!kvm_vcpu_cache) { + r = -ENOMEM; + goto out_free_3; +-- +2.15.0 + diff --git a/queue/leds-pca955x-Don-t-invert-requested-value-in-pca955x.patch 
b/queue/leds-pca955x-Don-t-invert-requested-value-in-pca955x.patch new file mode 100644 index 0000000..417208b --- /dev/null +++ b/queue/leds-pca955x-Don-t-invert-requested-value-in-pca955x.patch @@ -0,0 +1,188 @@ +From 52ca7d0f7bdad832b291ed979146443533ee79c0 Mon Sep 17 00:00:00 2001 +From: Andrew Jeffery <andrew@aj.id.au> +Date: Fri, 1 Sep 2017 15:08:58 +0930 +Subject: [PATCH] leds: pca955x: Don't invert requested value in + pca955x_gpio_set_value() +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +commit 52ca7d0f7bdad832b291ed979146443533ee79c0 upstream. + +The PCA9552 lines can be used either for driving LEDs or as GPIOs. The +manual states that for LEDs, the operation is open-drain: + + The LSn LED select registers determine the source of the LED data. + + 00 = output is set LOW (LED on) + 01 = output is set high-impedance (LED off; default) + 10 = output blinks at PWM0 rate + 11 = output blinks at PWM1 rate + +For GPIOs it suggests a pull-up so that the open-case drives the line +high: + + For use as output, connect external pull-up resistor to the pin + and size it according to the DC recommended operating + characteristics. LED output pin is HIGH when the output is + programmed as high-impedance, and LOW when the output is + programmed LOW through the ‘LED selector’ register. The output + can be pulse-width controlled when PWM0 or PWM1 are used. + +Now, I have a hardware design that uses the LED controller to control +LEDs. However, for $reasons, we're using the leds-gpio driver to drive +them. The reasons here are a tangent but lead to the discovery +of the inversion, which manifested as the LEDs being set to full +brightness at boot when we expected them to be off. + +As we're driving the LEDs through leds-gpio, this means wending our way +through the gpiochip abstractions. So with that in mind we need to +describe an active-low GPIO configuration to drive the LEDs as though +they were GPIOs. 
+ +The set() gpiochip callback in leds-pca955x does the following: + + ... + if (val) + pca955x_led_set(&led->led_cdev, LED_FULL); + else + pca955x_led_set(&led->led_cdev, LED_OFF); + ... + +Where LED_FULL = 255. pca955x_led_set() in turn does: + + ... + switch (value) { + case LED_FULL: + ls = pca955x_ledsel(ls, ls_led, PCA955X_LS_LED_ON); + break; + ... + +Where PCA955X_LS_LED_ON is defined as: + + #define PCA955X_LS_LED_ON 0x0 /* Output LOW */ + +So here we have some type confusion: We've crossed domains from GPIO +behaviour to LED behaviour without accounting for possible inversions +in the process. + +Stepping back to leds-gpio for a moment, during probe() we call +create_gpio_led(), which eventually executes: + + if (template->default_state == LEDS_GPIO_DEFSTATE_KEEP) { + state = gpiod_get_value_cansleep(led_dat->gpiod); + if (state < 0) + return state; + } else { + state = (template->default_state == LEDS_GPIO_DEFSTATE_ON); + } + ... + ret = gpiod_direction_output(led_dat->gpiod, state); + +In the devicetree the GPIO is annotated as active-low, and +gpiod_get_value_cansleep() handles this for us: + + int gpiod_get_value_cansleep(const struct gpio_desc *desc) + { + int value; + + might_sleep_if(extra_checks); + VALIDATE_DESC(desc); + value = _gpiod_get_raw_value(desc); + if (value < 0) + return value; + + if (test_bit(FLAG_ACTIVE_LOW, &desc->flags)) + value = !value; + + return value; + } + +_gpiod_get_raw_value() in turn calls through the get() callback for the +gpiochip implementation, so returning to our get() implementation in +leds-pca955x we find we extract the raw value from hardware: + + static int pca955x_gpio_get_value(struct gpio_chip *gc, unsigned int offset) + { + struct pca955x *pca955x = gpiochip_get_data(gc); + struct pca955x_led *led = &pca955x->leds[offset]; + u8 reg = pca955x_read_input(pca955x->client, led->led_num / 8); + + return !!(reg & (1 << (led->led_num % 8))); + } + +This behaviour is not symmetric with that of set(), where the val 
is +inverted by the driver. + +Closing the loop on the GPIO_ACTIVE_LOW inversions, +gpiod_direction_output(), like gpiod_get_value_cansleep(), handles it +for us: + + int gpiod_direction_output(struct gpio_desc *desc, int value) + { + VALIDATE_DESC(desc); + if (test_bit(FLAG_ACTIVE_LOW, &desc->flags)) + value = !value; + else + value = !!value; + return _gpiod_direction_output_raw(desc, value); + } + +All-in-all, with a value of 'keep' for default-state property in a +leds-gpio child node, the current state of the hardware will in-fact be +inverted; precisely the opposite of what was intended. + +Rework leds-pca955x so that we avoid the incorrect inversion and clarify +the semantics with respect to GPIO. + +Signed-off-by: Andrew Jeffery <andrew@aj.id.au> +Reviewed-by: Cédric Le Goater <clg@kaod.org> +Tested-by: Joel Stanley <joel@jms.id.au> +Tested-by: Matt Spinler <mspinler@linux.vnet.ibm.com> +Signed-off-by: Jacek Anaszewski <jacek.anaszewski@gmail.com> + +diff --git a/drivers/leds/leds-pca955x.c b/drivers/leds/leds-pca955x.c +index 905729191d3e..78183f90820e 100644 +--- a/drivers/leds/leds-pca955x.c ++++ b/drivers/leds/leds-pca955x.c +@@ -61,6 +61,10 @@ + #define PCA955X_LS_BLINK0 0x2 /* Blink at PWM0 rate */ + #define PCA955X_LS_BLINK1 0x3 /* Blink at PWM1 rate */ + ++#define PCA955X_GPIO_INPUT LED_OFF ++#define PCA955X_GPIO_HIGH LED_OFF ++#define PCA955X_GPIO_LOW LED_FULL ++ + enum pca955x_type { + pca9550, + pca9551, +@@ -329,9 +333,9 @@ static int pca955x_set_value(struct gpio_chip *gc, unsigned int offset, + struct pca955x_led *led = &pca955x->leds[offset]; + + if (val) +- return pca955x_led_set(&led->led_cdev, LED_FULL); +- else +- return pca955x_led_set(&led->led_cdev, LED_OFF); ++ return pca955x_led_set(&led->led_cdev, PCA955X_GPIO_HIGH); ++ ++ return pca955x_led_set(&led->led_cdev, PCA955X_GPIO_LOW); + } + + static void pca955x_gpio_set_value(struct gpio_chip *gc, unsigned int offset, +@@ -355,8 +359,11 @@ static int pca955x_gpio_get_value(struct 
gpio_chip *gc, unsigned int offset) + static int pca955x_gpio_direction_input(struct gpio_chip *gc, + unsigned int offset) + { +- /* To use as input ensure pin is not driven */ +- return pca955x_set_value(gc, offset, 0); ++ struct pca955x *pca955x = gpiochip_get_data(gc); ++ struct pca955x_led *led = &pca955x->leds[offset]; ++ ++ /* To use as input ensure pin is not driven. */ ++ return pca955x_led_set(&led->led_cdev, PCA955X_GPIO_INPUT); + } + + static int pca955x_gpio_direction_output(struct gpio_chip *gc, +-- +2.15.0 + diff --git a/queue/linux-compiler.h-Split-into-compiler.h-and-compiler_.patch b/queue/linux-compiler.h-Split-into-compiler.h-and-compiler_.patch new file mode 100644 index 0000000..5bb5da6 --- /dev/null +++ b/queue/linux-compiler.h-Split-into-compiler.h-and-compiler_.patch @@ -0,0 +1,729 @@ +From d15155824c5014803d91b829736d249c500bdda6 Mon Sep 17 00:00:00 2001 +From: Will Deacon <will.deacon@arm.com> +Date: Tue, 24 Oct 2017 11:22:46 +0100 +Subject: [PATCH] linux/compiler.h: Split into compiler.h and compiler_types.h + +commit d15155824c5014803d91b829736d249c500bdda6 upstream. + +linux/compiler.h is included indirectly by linux/types.h via +uapi/linux/types.h -> uapi/linux/posix_types.h -> linux/stddef.h +-> uapi/linux/stddef.h and is needed to provide a proper definition of +offsetof. 
+ +Unfortunately, compiler.h requires a definition of +smp_read_barrier_depends() for defining lockless_dereference() and soon +for defining READ_ONCE(), which means that all +users of READ_ONCE() will need to include asm/barrier.h to avoid splats +such as: + + In file included from include/uapi/linux/stddef.h:1:0, + from include/linux/stddef.h:4, + from arch/h8300/kernel/asm-offsets.c:11: + include/linux/list.h: In function 'list_empty': +>> include/linux/compiler.h:343:2: error: implicit declaration of function 'smp_read_barrier_depends' [-Werror=implicit-function-declaration] + smp_read_barrier_depends(); /* Enforce dependency ordering from x */ \ + ^ + +A better alternative is to include asm/barrier.h in linux/compiler.h, +but this requires a type definition for "bool" on some architectures +(e.g. x86), which is defined later by linux/types.h. Type "bool" is also +used directly in linux/compiler.h, so the whole thing is pretty fragile. + +This patch splits compiler.h in two: compiler_types.h contains type +annotations, definitions and the compiler-specific parts, whereas +compiler.h #includes compiler_types.h and additionally defines macros +such as {READ,WRITE,ACCESS}_ONCE(). + +uapi/linux/stddef.h and linux/linkage.h are then moved over to include +linux/compiler_types.h, which fixes the build for h8 and blackfin. + +Signed-off-by: Will Deacon <will.deacon@arm.com> +Cc: Linus Torvalds <torvalds@linux-foundation.org> +Cc: Paul E. 
McKenney <paulmck@linux.vnet.ibm.com> +Cc: Peter Zijlstra <peterz@infradead.org> +Cc: Thomas Gleixner <tglx@linutronix.de> +Link: http://lkml.kernel.org/r/1508840570-22169-2-git-send-email-will.deacon@arm.com +Signed-off-by: Ingo Molnar <mingo@kernel.org> + +diff --git a/arch/arm/include/asm/ptrace.h b/arch/arm/include/asm/ptrace.h +index e9c9a117bd25..c7cdbb43ae7c 100644 +--- a/arch/arm/include/asm/ptrace.h ++++ b/arch/arm/include/asm/ptrace.h +@@ -126,8 +126,7 @@ extern unsigned long profile_pc(struct pt_regs *regs); + /* + * kprobe-based event tracer support + */ +-#include <linux/stddef.h> +-#include <linux/types.h> ++#include <linux/compiler.h> + #define MAX_REG_OFFSET (offsetof(struct pt_regs, ARM_ORIG_r0)) + + extern int regs_query_register_offset(const char *name); +diff --git a/arch/sparc/include/asm/ptrace.h b/arch/sparc/include/asm/ptrace.h +index d73428e4333c..b383484edcd3 100644 +--- a/arch/sparc/include/asm/ptrace.h ++++ b/arch/sparc/include/asm/ptrace.h +@@ -6,6 +6,7 @@ + #if defined(__sparc__) && defined(__arch64__) + #ifndef __ASSEMBLY__ + ++#include <linux/compiler.h> + #include <linux/threads.h> + #include <asm/switch_to.h> + +diff --git a/arch/um/include/shared/init.h b/arch/um/include/shared/init.h +index 233e2593eee0..094e96ce653b 100644 +--- a/arch/um/include/shared/init.h ++++ b/arch/um/include/shared/init.h +@@ -40,7 +40,7 @@ + typedef int (*initcall_t)(void); + typedef void (*exitcall_t)(void); + +-#include <linux/compiler.h> ++#include <linux/compiler_types.h> + + /* These are for everybody (although not all archs will actually + discard it in modules) */ +diff --git a/include/linux/compiler-clang.h b/include/linux/compiler-clang.h +index de179993e039..5947a3e6c0e6 100644 +--- a/include/linux/compiler-clang.h ++++ b/include/linux/compiler-clang.h +@@ -1,4 +1,4 @@ +-#ifndef __LINUX_COMPILER_H ++#ifndef __LINUX_COMPILER_TYPES_H + #error "Please don't include <linux/compiler-clang.h> directly, include <linux/compiler.h> instead." 
+ #endif + +diff --git a/include/linux/compiler-gcc.h b/include/linux/compiler-gcc.h +index 16d41de92ee3..ce8e965646ef 100644 +--- a/include/linux/compiler-gcc.h ++++ b/include/linux/compiler-gcc.h +@@ -1,4 +1,4 @@ +-#ifndef __LINUX_COMPILER_H ++#ifndef __LINUX_COMPILER_TYPES_H + #error "Please don't include <linux/compiler-gcc.h> directly, include <linux/compiler.h> instead." + #endif + +diff --git a/include/linux/compiler-intel.h b/include/linux/compiler-intel.h +index d4c71132d07f..e438ac89c692 100644 +--- a/include/linux/compiler-intel.h ++++ b/include/linux/compiler-intel.h +@@ -1,4 +1,4 @@ +-#ifndef __LINUX_COMPILER_H ++#ifndef __LINUX_COMPILER_TYPES_H + #error "Please don't include <linux/compiler-intel.h> directly, include <linux/compiler.h> instead." + #endif + +diff --git a/include/linux/compiler.h b/include/linux/compiler.h +index e95a2631e545..08083186e54f 100644 +--- a/include/linux/compiler.h ++++ b/include/linux/compiler.h +@@ -1,111 +1,12 @@ + #ifndef __LINUX_COMPILER_H + #define __LINUX_COMPILER_H + +-#ifndef __ASSEMBLY__ ++#include <linux/compiler_types.h> + +-#ifdef __CHECKER__ +-# define __user __attribute__((noderef, address_space(1))) +-# define __kernel __attribute__((address_space(0))) +-# define __safe __attribute__((safe)) +-# define __force __attribute__((force)) +-# define __nocast __attribute__((nocast)) +-# define __iomem __attribute__((noderef, address_space(2))) +-# define __must_hold(x) __attribute__((context(x,1,1))) +-# define __acquires(x) __attribute__((context(x,0,1))) +-# define __releases(x) __attribute__((context(x,1,0))) +-# define __acquire(x) __context__(x,1) +-# define __release(x) __context__(x,-1) +-# define __cond_lock(x,c) ((c) ? 
({ __acquire(x); 1; }) : 0) +-# define __percpu __attribute__((noderef, address_space(3))) +-# define __rcu __attribute__((noderef, address_space(4))) +-# define __private __attribute__((noderef)) +-extern void __chk_user_ptr(const volatile void __user *); +-extern void __chk_io_ptr(const volatile void __iomem *); +-# define ACCESS_PRIVATE(p, member) (*((typeof((p)->member) __force *) &(p)->member)) +-#else /* __CHECKER__ */ +-# ifdef STRUCTLEAK_PLUGIN +-# define __user __attribute__((user)) +-# else +-# define __user +-# endif +-# define __kernel +-# define __safe +-# define __force +-# define __nocast +-# define __iomem +-# define __chk_user_ptr(x) (void)0 +-# define __chk_io_ptr(x) (void)0 +-# define __builtin_warning(x, y...) (1) +-# define __must_hold(x) +-# define __acquires(x) +-# define __releases(x) +-# define __acquire(x) (void)0 +-# define __release(x) (void)0 +-# define __cond_lock(x,c) (c) +-# define __percpu +-# define __rcu +-# define __private +-# define ACCESS_PRIVATE(p, member) ((p)->member) +-#endif /* __CHECKER__ */ +- +-/* Indirect macros required for expanded argument pasting, eg. __LINE__. */ +-#define ___PASTE(a,b) a##b +-#define __PASTE(a,b) ___PASTE(a,b) ++#ifndef __ASSEMBLY__ + + #ifdef __KERNEL__ + +-#ifdef __GNUC__ +-#include <linux/compiler-gcc.h> +-#endif +- +-#if defined(CC_USING_HOTPATCH) && !defined(__CHECKER__) +-#define notrace __attribute__((hotpatch(0,0))) +-#else +-#define notrace __attribute__((no_instrument_function)) +-#endif +- +-/* Intel compiler defines __GNUC__. So we will overwrite implementations +- * coming from above header files here +- */ +-#ifdef __INTEL_COMPILER +-# include <linux/compiler-intel.h> +-#endif +- +-/* Clang compiler defines __GNUC__. So we will overwrite implementations +- * coming from above header files here +- */ +-#ifdef __clang__ +-#include <linux/compiler-clang.h> +-#endif +- +-/* +- * Generic compiler-dependent macros required for kernel +- * build go below this comment. 
Actual compiler/compiler version +- * specific implementations come from the above header files +- */ +- +-struct ftrace_branch_data { +- const char *func; +- const char *file; +- unsigned line; +- union { +- struct { +- unsigned long correct; +- unsigned long incorrect; +- }; +- struct { +- unsigned long miss; +- unsigned long hit; +- }; +- unsigned long miss_hit[2]; +- }; +-}; +- +-struct ftrace_likely_data { +- struct ftrace_branch_data data; +- unsigned long constant; +-}; +- + /* + * Note: DISABLE_BRANCH_PROFILING can be used by special lowlevel code + * to disable branch tracing on a per file basis. +@@ -332,6 +233,7 @@ static __always_inline void __write_once_size(volatile void *p, void *res, int s + * with an explicit memory barrier or atomic instruction that provides the + * required ordering. + */ ++#include <asm/barrier.h> + + #define __READ_ONCE(x, check) \ + ({ \ +@@ -362,167 +264,6 @@ static __always_inline void __write_once_size(volatile void *p, void *res, int s + + #endif /* __ASSEMBLY__ */ + +-#ifdef __KERNEL__ +-/* +- * Allow us to mark functions as 'deprecated' and have gcc emit a nice +- * warning for each use, in hopes of speeding the functions removal. +- * Usage is: +- * int __deprecated foo(void) +- */ +-#ifndef __deprecated +-# define __deprecated /* unimplemented */ +-#endif +- +-#ifdef MODULE +-#define __deprecated_for_modules __deprecated +-#else +-#define __deprecated_for_modules +-#endif +- +-#ifndef __must_check +-#define __must_check +-#endif +- +-#ifndef CONFIG_ENABLE_MUST_CHECK +-#undef __must_check +-#define __must_check +-#endif +-#ifndef CONFIG_ENABLE_WARN_DEPRECATED +-#undef __deprecated +-#undef __deprecated_for_modules +-#define __deprecated +-#define __deprecated_for_modules +-#endif +- +-#ifndef __malloc +-#define __malloc +-#endif +- +-/* +- * Allow us to avoid 'defined but not used' warnings on functions and data, +- * as well as force them to be emitted to the assembly file. 
+- * +- * As of gcc 3.4, static functions that are not marked with attribute((used)) +- * may be elided from the assembly file. As of gcc 3.4, static data not so +- * marked will not be elided, but this may change in a future gcc version. +- * +- * NOTE: Because distributions shipped with a backported unit-at-a-time +- * compiler in gcc 3.3, we must define __used to be __attribute__((used)) +- * for gcc >=3.3 instead of 3.4. +- * +- * In prior versions of gcc, such functions and data would be emitted, but +- * would be warned about except with attribute((unused)). +- * +- * Mark functions that are referenced only in inline assembly as __used so +- * the code is emitted even though it appears to be unreferenced. +- */ +-#ifndef __used +-# define __used /* unimplemented */ +-#endif +- +-#ifndef __maybe_unused +-# define __maybe_unused /* unimplemented */ +-#endif +- +-#ifndef __always_unused +-# define __always_unused /* unimplemented */ +-#endif +- +-#ifndef noinline +-#define noinline +-#endif +- +-/* +- * Rather then using noinline to prevent stack consumption, use +- * noinline_for_stack instead. For documentation reasons. +- */ +-#define noinline_for_stack noinline +- +-#ifndef __always_inline +-#define __always_inline inline +-#endif +- +-#endif /* __KERNEL__ */ +- +-/* +- * From the GCC manual: +- * +- * Many functions do not examine any values except their arguments, +- * and have no effects except the return value. Basically this is +- * just slightly more strict class than the `pure' attribute above, +- * since function is not allowed to read global memory. +- * +- * Note that a function that has pointer arguments and examines the +- * data pointed to must _not_ be declared `const'. Likewise, a +- * function that calls a non-`const' function usually must not be +- * `const'. It does not make sense for a `const' function to return +- * `void'. 
+- */ +-#ifndef __attribute_const__ +-# define __attribute_const__ /* unimplemented */ +-#endif +- +-#ifndef __designated_init +-# define __designated_init +-#endif +- +-#ifndef __latent_entropy +-# define __latent_entropy +-#endif +- +-#ifndef __randomize_layout +-# define __randomize_layout __designated_init +-#endif +- +-#ifndef __no_randomize_layout +-# define __no_randomize_layout +-#endif +- +-#ifndef randomized_struct_fields_start +-# define randomized_struct_fields_start +-# define randomized_struct_fields_end +-#endif +- +-/* +- * Tell gcc if a function is cold. The compiler will assume any path +- * directly leading to the call is unlikely. +- */ +- +-#ifndef __cold +-#define __cold +-#endif +- +-/* Simple shorthand for a section definition */ +-#ifndef __section +-# define __section(S) __attribute__ ((__section__(#S))) +-#endif +- +-#ifndef __visible +-#define __visible +-#endif +- +-#ifndef __nostackprotector +-# define __nostackprotector +-#endif +- +-/* +- * Assume alignment of return value. +- */ +-#ifndef __assume_aligned +-#define __assume_aligned(a, ...) +-#endif +- +- +-/* Are two types/vars the same type (ignoring qualifiers)? 
*/ +-#ifndef __same_type +-# define __same_type(a, b) __builtin_types_compatible_p(typeof(a), typeof(b)) +-#endif +- +-/* Is this type a native word size -- useful for atomic operations */ +-#ifndef __native_word +-# define __native_word(t) (sizeof(t) == sizeof(char) || sizeof(t) == sizeof(short) || sizeof(t) == sizeof(int) || sizeof(t) == sizeof(long)) +-#endif +- + /* Compile time object size, -1 for unknown */ + #ifndef __compiletime_object_size + # define __compiletime_object_size(obj) -1 +diff --git a/include/linux/compiler_types.h b/include/linux/compiler_types.h +new file mode 100644 +index 000000000000..6b79a9bba9a7 +--- /dev/null ++++ b/include/linux/compiler_types.h +@@ -0,0 +1,274 @@ ++#ifndef __LINUX_COMPILER_TYPES_H ++#define __LINUX_COMPILER_TYPES_H ++ ++#ifndef __ASSEMBLY__ ++ ++#ifdef __CHECKER__ ++# define __user __attribute__((noderef, address_space(1))) ++# define __kernel __attribute__((address_space(0))) ++# define __safe __attribute__((safe)) ++# define __force __attribute__((force)) ++# define __nocast __attribute__((nocast)) ++# define __iomem __attribute__((noderef, address_space(2))) ++# define __must_hold(x) __attribute__((context(x,1,1))) ++# define __acquires(x) __attribute__((context(x,0,1))) ++# define __releases(x) __attribute__((context(x,1,0))) ++# define __acquire(x) __context__(x,1) ++# define __release(x) __context__(x,-1) ++# define __cond_lock(x,c) ((c) ? 
({ __acquire(x); 1; }) : 0) ++# define __percpu __attribute__((noderef, address_space(3))) ++# define __rcu __attribute__((noderef, address_space(4))) ++# define __private __attribute__((noderef)) ++extern void __chk_user_ptr(const volatile void __user *); ++extern void __chk_io_ptr(const volatile void __iomem *); ++# define ACCESS_PRIVATE(p, member) (*((typeof((p)->member) __force *) &(p)->member)) ++#else /* __CHECKER__ */ ++# ifdef STRUCTLEAK_PLUGIN ++# define __user __attribute__((user)) ++# else ++# define __user ++# endif ++# define __kernel ++# define __safe ++# define __force ++# define __nocast ++# define __iomem ++# define __chk_user_ptr(x) (void)0 ++# define __chk_io_ptr(x) (void)0 ++# define __builtin_warning(x, y...) (1) ++# define __must_hold(x) ++# define __acquires(x) ++# define __releases(x) ++# define __acquire(x) (void)0 ++# define __release(x) (void)0 ++# define __cond_lock(x,c) (c) ++# define __percpu ++# define __rcu ++# define __private ++# define ACCESS_PRIVATE(p, member) ((p)->member) ++#endif /* __CHECKER__ */ ++ ++/* Indirect macros required for expanded argument pasting, eg. __LINE__. */ ++#define ___PASTE(a,b) a##b ++#define __PASTE(a,b) ___PASTE(a,b) ++ ++#ifdef __KERNEL__ ++ ++#ifdef __GNUC__ ++#include <linux/compiler-gcc.h> ++#endif ++ ++#if defined(CC_USING_HOTPATCH) && !defined(__CHECKER__) ++#define notrace __attribute__((hotpatch(0,0))) ++#else ++#define notrace __attribute__((no_instrument_function)) ++#endif ++ ++/* Intel compiler defines __GNUC__. So we will overwrite implementations ++ * coming from above header files here ++ */ ++#ifdef __INTEL_COMPILER ++# include <linux/compiler-intel.h> ++#endif ++ ++/* Clang compiler defines __GNUC__. So we will overwrite implementations ++ * coming from above header files here ++ */ ++#ifdef __clang__ ++#include <linux/compiler-clang.h> ++#endif ++ ++/* ++ * Generic compiler-dependent macros required for kernel ++ * build go below this comment. 
Actual compiler/compiler version ++ * specific implementations come from the above header files ++ */ ++ ++struct ftrace_branch_data { ++ const char *func; ++ const char *file; ++ unsigned line; ++ union { ++ struct { ++ unsigned long correct; ++ unsigned long incorrect; ++ }; ++ struct { ++ unsigned long miss; ++ unsigned long hit; ++ }; ++ unsigned long miss_hit[2]; ++ }; ++}; ++ ++struct ftrace_likely_data { ++ struct ftrace_branch_data data; ++ unsigned long constant; ++}; ++ ++#endif /* __KERNEL__ */ ++ ++#endif /* __ASSEMBLY__ */ ++ ++#ifdef __KERNEL__ ++/* ++ * Allow us to mark functions as 'deprecated' and have gcc emit a nice ++ * warning for each use, in hopes of speeding the functions removal. ++ * Usage is: ++ * int __deprecated foo(void) ++ */ ++#ifndef __deprecated ++# define __deprecated /* unimplemented */ ++#endif ++ ++#ifdef MODULE ++#define __deprecated_for_modules __deprecated ++#else ++#define __deprecated_for_modules ++#endif ++ ++#ifndef __must_check ++#define __must_check ++#endif ++ ++#ifndef CONFIG_ENABLE_MUST_CHECK ++#undef __must_check ++#define __must_check ++#endif ++#ifndef CONFIG_ENABLE_WARN_DEPRECATED ++#undef __deprecated ++#undef __deprecated_for_modules ++#define __deprecated ++#define __deprecated_for_modules ++#endif ++ ++#ifndef __malloc ++#define __malloc ++#endif ++ ++/* ++ * Allow us to avoid 'defined but not used' warnings on functions and data, ++ * as well as force them to be emitted to the assembly file. ++ * ++ * As of gcc 3.4, static functions that are not marked with attribute((used)) ++ * may be elided from the assembly file. As of gcc 3.4, static data not so ++ * marked will not be elided, but this may change in a future gcc version. ++ * ++ * NOTE: Because distributions shipped with a backported unit-at-a-time ++ * compiler in gcc 3.3, we must define __used to be __attribute__((used)) ++ * for gcc >=3.3 instead of 3.4. 
++ * ++ * In prior versions of gcc, such functions and data would be emitted, but ++ * would be warned about except with attribute((unused)). ++ * ++ * Mark functions that are referenced only in inline assembly as __used so ++ * the code is emitted even though it appears to be unreferenced. ++ */ ++#ifndef __used ++# define __used /* unimplemented */ ++#endif ++ ++#ifndef __maybe_unused ++# define __maybe_unused /* unimplemented */ ++#endif ++ ++#ifndef __always_unused ++# define __always_unused /* unimplemented */ ++#endif ++ ++#ifndef noinline ++#define noinline ++#endif ++ ++/* ++ * Rather then using noinline to prevent stack consumption, use ++ * noinline_for_stack instead. For documentation reasons. ++ */ ++#define noinline_for_stack noinline ++ ++#ifndef __always_inline ++#define __always_inline inline ++#endif ++ ++#endif /* __KERNEL__ */ ++ ++/* ++ * From the GCC manual: ++ * ++ * Many functions do not examine any values except their arguments, ++ * and have no effects except the return value. Basically this is ++ * just slightly more strict class than the `pure' attribute above, ++ * since function is not allowed to read global memory. ++ * ++ * Note that a function that has pointer arguments and examines the ++ * data pointed to must _not_ be declared `const'. Likewise, a ++ * function that calls a non-`const' function usually must not be ++ * `const'. It does not make sense for a `const' function to return ++ * `void'. 
++ */ ++#ifndef __attribute_const__ ++# define __attribute_const__ /* unimplemented */ ++#endif ++ ++#ifndef __designated_init ++# define __designated_init ++#endif ++ ++#ifndef __latent_entropy ++# define __latent_entropy ++#endif ++ ++#ifndef __randomize_layout ++# define __randomize_layout __designated_init ++#endif ++ ++#ifndef __no_randomize_layout ++# define __no_randomize_layout ++#endif ++ ++#ifndef randomized_struct_fields_start ++# define randomized_struct_fields_start ++# define randomized_struct_fields_end ++#endif ++ ++/* ++ * Tell gcc if a function is cold. The compiler will assume any path ++ * directly leading to the call is unlikely. ++ */ ++ ++#ifndef __cold ++#define __cold ++#endif ++ ++/* Simple shorthand for a section definition */ ++#ifndef __section ++# define __section(S) __attribute__ ((__section__(#S))) ++#endif ++ ++#ifndef __visible ++#define __visible ++#endif ++ ++#ifndef __nostackprotector ++# define __nostackprotector ++#endif ++ ++/* ++ * Assume alignment of return value. ++ */ ++#ifndef __assume_aligned ++#define __assume_aligned(a, ...) ++#endif ++ ++ ++/* Are two types/vars the same type (ignoring qualifiers)? 
*/ ++#ifndef __same_type ++# define __same_type(a, b) __builtin_types_compatible_p(typeof(a), typeof(b)) ++#endif ++ ++/* Is this type a native word size -- useful for atomic operations */ ++#ifndef __native_word ++# define __native_word(t) (sizeof(t) == sizeof(char) || sizeof(t) == sizeof(short) || sizeof(t) == sizeof(int) || sizeof(t) == sizeof(long)) ++#endif ++ ++#endif /* __LINUX_COMPILER_TYPES_H */ +diff --git a/include/linux/linkage.h b/include/linux/linkage.h +index a6a42dd02466..ebd61b80fed4 100644 +--- a/include/linux/linkage.h ++++ b/include/linux/linkage.h +@@ -1,7 +1,7 @@ + #ifndef _LINUX_LINKAGE_H + #define _LINUX_LINKAGE_H + +-#include <linux/compiler.h> ++#include <linux/compiler_types.h> + #include <linux/stringify.h> + #include <linux/export.h> + #include <asm/linkage.h> +diff --git a/include/uapi/linux/stddef.h b/include/uapi/linux/stddef.h +index 621fa8ac4425..d1f7cb732dfc 100644 +--- a/include/uapi/linux/stddef.h ++++ b/include/uapi/linux/stddef.h +@@ -1,4 +1,4 @@ +-#include <linux/compiler.h> ++#include <linux/compiler_types.h> + + #ifndef __always_inline + #define __always_inline inline +diff --git a/scripts/headers_install.sh b/scripts/headers_install.sh +index fdebd66f8fc1..63b8cc26456a 100755 +--- a/scripts/headers_install.sh ++++ b/scripts/headers_install.sh +@@ -33,7 +33,7 @@ do + sed -r \ + -e 's/([ \t(])(__user|__force|__iomem)[ \t]/\1/g' \ + -e 's/__attribute_const__([ \t]|$)/\1/g' \ +- -e 's@^#include <linux/compiler.h>@@' \ ++ -e 's@^#include <linux/compiler(|_types).h>@@' \ + -e 's/(^|[^a-zA-Z0-9])__packed([^a-zA-Z0-9_]|$)/\1__attribute__((packed))\2/g' \ + -e 's/(^|[ \t(])(inline|asm|volatile)([ \t(]|$)/\1__\2__\3/g' \ + -e 's@#(ifndef|define|endif[ \t]*/[*])[ \t]*_UAPI@#\1 @' \ +-- +2.15.0 + diff --git a/queue/locking-barriers-Add-implicit-smp_read_barrier_depen.patch b/queue/locking-barriers-Add-implicit-smp_read_barrier_depen.patch new file mode 100644 index 0000000..a2aa1bb --- /dev/null +++ 
b/queue/locking-barriers-Add-implicit-smp_read_barrier_depen.patch @@ -0,0 +1,42 @@ +From c2bc66082e1048c7573d72e62f597bdc5ce13fea Mon Sep 17 00:00:00 2001 +From: Will Deacon <will.deacon@arm.com> +Date: Tue, 24 Oct 2017 11:22:47 +0100 +Subject: [PATCH] locking/barriers: Add implicit smp_read_barrier_depends() to + READ_ONCE() + +commit c2bc66082e1048c7573d72e62f597bdc5ce13fea upstream. + +[ Note, this is a Git cherry-pick of the following commit: + + 76ebbe78f739 ("locking/barriers: Add implicit smp_read_barrier_depends() to READ_ONCE()") + + ... for easier x86 PTI code testing and back-porting. ] + +In preparation for the removal of lockless_dereference(), which is the +same as READ_ONCE() on all architectures other than Alpha, add an +implicit smp_read_barrier_depends() to READ_ONCE() so that it can be +used to head dependency chains on all architectures. + +Signed-off-by: Will Deacon <will.deacon@arm.com> +Cc: Linus Torvalds <torvalds@linux-foundation.org> +Cc: Paul E. McKenney <paulmck@linux.vnet.ibm.com> +Cc: Peter Zijlstra <peterz@infradead.org> +Cc: Thomas Gleixner <tglx@linutronix.de> +Link: http://lkml.kernel.org/r/1508840570-22169-3-git-send-email-will.deacon@arm.com +Signed-off-by: Ingo Molnar <mingo@kernel.org> + +diff --git a/include/linux/compiler.h b/include/linux/compiler.h +index 202710420d6d..712cd8bb00b4 100644 +--- a/include/linux/compiler.h ++++ b/include/linux/compiler.h +@@ -341,6 +341,7 @@ static __always_inline void __write_once_size(volatile void *p, void *res, int s + __read_once_size(&(x), __u.__c, sizeof(x)); \ + else \ + __read_once_size_nocheck(&(x), __u.__c, sizeof(x)); \ ++ smp_read_barrier_depends(); /* Enforce dependency ordering from x */ \ + __u.__val; \ + }) + #define READ_ONCE(x) __READ_ONCE(x, 1) +-- +2.15.0 + diff --git a/queue/locking-barriers-Convert-users-of-lockless_dereferen.patch b/queue/locking-barriers-Convert-users-of-lockless_dereferen.patch new file mode 100644 index 0000000..fca1536 --- /dev/null +++ 
b/queue/locking-barriers-Convert-users-of-lockless_dereferen.patch @@ -0,0 +1,301 @@ +From 3382290ed2d5e275429cef510ab21889d3ccd164 Mon Sep 17 00:00:00 2001 +From: Will Deacon <will.deacon@arm.com> +Date: Tue, 24 Oct 2017 11:22:48 +0100 +Subject: [PATCH] locking/barriers: Convert users of lockless_dereference() to + READ_ONCE() + +commit 3382290ed2d5e275429cef510ab21889d3ccd164 upstream. + +[ Note, this is a Git cherry-pick of the following commit: + + 506458efaf15 ("locking/barriers: Convert users of lockless_dereference() to READ_ONCE()") + + ... for easier x86 PTI code testing and back-porting. ] + +READ_ONCE() now has an implicit smp_read_barrier_depends() call, so it +can be used instead of lockless_dereference() without any change in +semantics. + +Signed-off-by: Will Deacon <will.deacon@arm.com> +Cc: Linus Torvalds <torvalds@linux-foundation.org> +Cc: Paul E. McKenney <paulmck@linux.vnet.ibm.com> +Cc: Peter Zijlstra <peterz@infradead.org> +Cc: Thomas Gleixner <tglx@linutronix.de> +Link: http://lkml.kernel.org/r/1508840570-22169-4-git-send-email-will.deacon@arm.com +Signed-off-by: Ingo Molnar <mingo@kernel.org> + +diff --git a/arch/x86/events/core.c b/arch/x86/events/core.c +index 80534d3c2480..589af1eec7c1 100644 +--- a/arch/x86/events/core.c ++++ b/arch/x86/events/core.c +@@ -2371,7 +2371,7 @@ static unsigned long get_segment_base(unsigned int segment) + struct ldt_struct *ldt; + + /* IRQs are off, so this synchronizes with smp_store_release */ +- ldt = lockless_dereference(current->active_mm->context.ldt); ++ ldt = READ_ONCE(current->active_mm->context.ldt); + if (!ldt || idx >= ldt->nr_entries) + return 0; + +diff --git a/arch/x86/include/asm/mmu_context.h b/arch/x86/include/asm/mmu_context.h +index 6699fc441644..6d16d15d09a0 100644 +--- a/arch/x86/include/asm/mmu_context.h ++++ b/arch/x86/include/asm/mmu_context.h +@@ -73,8 +73,8 @@ static inline void load_mm_ldt(struct mm_struct *mm) + #ifdef CONFIG_MODIFY_LDT_SYSCALL + struct ldt_struct *ldt; + +- /* 
lockless_dereference synchronizes with smp_store_release */ +- ldt = lockless_dereference(mm->context.ldt); ++ /* READ_ONCE synchronizes with smp_store_release */ ++ ldt = READ_ONCE(mm->context.ldt); + + /* + * Any change to mm->context.ldt is followed by an IPI to all +diff --git a/arch/x86/kernel/ldt.c b/arch/x86/kernel/ldt.c +index ae5615b03def..1c1eae961340 100644 +--- a/arch/x86/kernel/ldt.c ++++ b/arch/x86/kernel/ldt.c +@@ -103,7 +103,7 @@ static void finalize_ldt_struct(struct ldt_struct *ldt) + static void install_ldt(struct mm_struct *current_mm, + struct ldt_struct *ldt) + { +- /* Synchronizes with lockless_dereference in load_mm_ldt. */ ++ /* Synchronizes with READ_ONCE in load_mm_ldt. */ + smp_store_release(&current_mm->context.ldt, ldt); + + /* Activate the LDT for all CPUs using current_mm. */ +diff --git a/drivers/md/dm-mpath.c b/drivers/md/dm-mpath.c +index 11f273d2f018..3f88c9d32f7e 100644 +--- a/drivers/md/dm-mpath.c ++++ b/drivers/md/dm-mpath.c +@@ -366,7 +366,7 @@ static struct pgpath *choose_path_in_pg(struct multipath *m, + + pgpath = path_to_pgpath(path); + +- if (unlikely(lockless_dereference(m->current_pg) != pg)) { ++ if (unlikely(READ_ONCE(m->current_pg) != pg)) { + /* Only update current_pgpath if pg changed */ + spin_lock_irqsave(&m->lock, flags); + m->current_pgpath = pgpath; +@@ -390,7 +390,7 @@ static struct pgpath *choose_pgpath(struct multipath *m, size_t nr_bytes) + } + + /* Were we instructed to switch PG? 
*/ +- if (lockless_dereference(m->next_pg)) { ++ if (READ_ONCE(m->next_pg)) { + spin_lock_irqsave(&m->lock, flags); + pg = m->next_pg; + if (!pg) { +@@ -406,7 +406,7 @@ static struct pgpath *choose_pgpath(struct multipath *m, size_t nr_bytes) + + /* Don't change PG until it has no remaining paths */ + check_current_pg: +- pg = lockless_dereference(m->current_pg); ++ pg = READ_ONCE(m->current_pg); + if (pg) { + pgpath = choose_path_in_pg(m, pg, nr_bytes); + if (!IS_ERR_OR_NULL(pgpath)) +@@ -473,7 +473,7 @@ static int multipath_clone_and_map(struct dm_target *ti, struct request *rq, + struct request *clone; + + /* Do we need to select a new pgpath? */ +- pgpath = lockless_dereference(m->current_pgpath); ++ pgpath = READ_ONCE(m->current_pgpath); + if (!pgpath || !test_bit(MPATHF_QUEUE_IO, &m->flags)) + pgpath = choose_pgpath(m, nr_bytes); + +@@ -535,7 +535,7 @@ static int __multipath_map_bio(struct multipath *m, struct bio *bio, struct dm_m + bool queue_io; + + /* Do we need to select a new pgpath? 
*/ +- pgpath = lockless_dereference(m->current_pgpath); ++ pgpath = READ_ONCE(m->current_pgpath); + queue_io = test_bit(MPATHF_QUEUE_IO, &m->flags); + if (!pgpath || !queue_io) + pgpath = choose_pgpath(m, nr_bytes); +@@ -1804,7 +1804,7 @@ static int multipath_prepare_ioctl(struct dm_target *ti, + struct pgpath *current_pgpath; + int r; + +- current_pgpath = lockless_dereference(m->current_pgpath); ++ current_pgpath = READ_ONCE(m->current_pgpath); + if (!current_pgpath) + current_pgpath = choose_pgpath(m, 0); + +@@ -1826,7 +1826,7 @@ static int multipath_prepare_ioctl(struct dm_target *ti, + } + + if (r == -ENOTCONN) { +- if (!lockless_dereference(m->current_pg)) { ++ if (!READ_ONCE(m->current_pg)) { + /* Path status changed, redo selection */ + (void) choose_pgpath(m, 0); + } +@@ -1895,9 +1895,9 @@ static int multipath_busy(struct dm_target *ti) + return (m->queue_mode != DM_TYPE_MQ_REQUEST_BASED); + + /* Guess which priority_group will be used at next mapping time */ +- pg = lockless_dereference(m->current_pg); +- next_pg = lockless_dereference(m->next_pg); +- if (unlikely(!lockless_dereference(m->current_pgpath) && next_pg)) ++ pg = READ_ONCE(m->current_pg); ++ next_pg = READ_ONCE(m->next_pg); ++ if (unlikely(!READ_ONCE(m->current_pgpath) && next_pg)) + pg = next_pg; + + if (!pg) { +diff --git a/fs/dcache.c b/fs/dcache.c +index f90141387f01..34c852af215c 100644 +--- a/fs/dcache.c ++++ b/fs/dcache.c +@@ -231,7 +231,7 @@ static inline int dentry_cmp(const struct dentry *dentry, const unsigned char *c + { + /* + * Be careful about RCU walk racing with rename: +- * use 'lockless_dereference' to fetch the name pointer. ++ * use 'READ_ONCE' to fetch the name pointer. + * + * NOTE! Even if a rename will mean that the length + * was not loaded atomically, we don't care. 
The +@@ -245,7 +245,7 @@ static inline int dentry_cmp(const struct dentry *dentry, const unsigned char *c + * early because the data cannot match (there can + * be no NUL in the ct/tcount data) + */ +- const unsigned char *cs = lockless_dereference(dentry->d_name.name); ++ const unsigned char *cs = READ_ONCE(dentry->d_name.name); + + return dentry_string_cmp(cs, ct, tcount); + } +diff --git a/fs/overlayfs/ovl_entry.h b/fs/overlayfs/ovl_entry.h +index 25d9b5adcd42..36b49bd09264 100644 +--- a/fs/overlayfs/ovl_entry.h ++++ b/fs/overlayfs/ovl_entry.h +@@ -77,5 +77,5 @@ static inline struct ovl_inode *OVL_I(struct inode *inode) + + static inline struct dentry *ovl_upperdentry_dereference(struct ovl_inode *oi) + { +- return lockless_dereference(oi->__upperdentry); ++ return READ_ONCE(oi->__upperdentry); + } +diff --git a/fs/overlayfs/readdir.c b/fs/overlayfs/readdir.c +index 698b74dd750e..c310e3ff7f3f 100644 +--- a/fs/overlayfs/readdir.c ++++ b/fs/overlayfs/readdir.c +@@ -754,7 +754,7 @@ static int ovl_dir_fsync(struct file *file, loff_t start, loff_t end, + if (!od->is_upper && OVL_TYPE_UPPER(ovl_path_type(dentry))) { + struct inode *inode = file_inode(file); + +- realfile = lockless_dereference(od->upperfile); ++ realfile = READ_ONCE(od->upperfile); + if (!realfile) { + struct path upperpath; + +diff --git a/include/linux/rculist.h b/include/linux/rculist.h +index c2cdd45a880a..127f534fec94 100644 +--- a/include/linux/rculist.h ++++ b/include/linux/rculist.h +@@ -275,7 +275,7 @@ static inline void list_splice_tail_init_rcu(struct list_head *list, + * primitives such as list_add_rcu() as long as it's guarded by rcu_read_lock(). + */ + #define list_entry_rcu(ptr, type, member) \ +- container_of(lockless_dereference(ptr), type, member) ++ container_of(READ_ONCE(ptr), type, member) + + /* + * Where are list_empty_rcu() and list_first_entry_rcu()? 
+@@ -368,7 +368,7 @@ static inline void list_splice_tail_init_rcu(struct list_head *list, + * example is when items are added to the list, but never deleted. + */ + #define list_entry_lockless(ptr, type, member) \ +- container_of((typeof(ptr))lockless_dereference(ptr), type, member) ++ container_of((typeof(ptr))READ_ONCE(ptr), type, member) + + /** + * list_for_each_entry_lockless - iterate over rcu list of given type +diff --git a/include/linux/rcupdate.h b/include/linux/rcupdate.h +index 1a9f70d44af9..a6ddc42f87a5 100644 +--- a/include/linux/rcupdate.h ++++ b/include/linux/rcupdate.h +@@ -346,7 +346,7 @@ static inline void rcu_preempt_sleep_check(void) { } + #define __rcu_dereference_check(p, c, space) \ + ({ \ + /* Dependency order vs. p above. */ \ +- typeof(*p) *________p1 = (typeof(*p) *__force)lockless_dereference(p); \ ++ typeof(*p) *________p1 = (typeof(*p) *__force)READ_ONCE(p); \ + RCU_LOCKDEP_WARN(!(c), "suspicious rcu_dereference_check() usage"); \ + rcu_dereference_sparse(p, space); \ + ((typeof(*p) __force __kernel *)(________p1)); \ +@@ -360,7 +360,7 @@ static inline void rcu_preempt_sleep_check(void) { } + #define rcu_dereference_raw(p) \ + ({ \ + /* Dependency order vs. p above. */ \ +- typeof(p) ________p1 = lockless_dereference(p); \ ++ typeof(p) ________p1 = READ_ONCE(p); \ + ((typeof(*p) __force __kernel *)(________p1)); \ + }) + +diff --git a/kernel/events/core.c b/kernel/events/core.c +index 10cdb9c26b5d..6eee4ed97af0 100644 +--- a/kernel/events/core.c ++++ b/kernel/events/core.c +@@ -4233,7 +4233,7 @@ static void perf_remove_from_owner(struct perf_event *event) + * indeed free this event, otherwise we need to serialize on + * owner->perf_event_mutex. 
+ */ +- owner = lockless_dereference(event->owner); ++ owner = READ_ONCE(event->owner); + if (owner) { + /* + * Since delayed_put_task_struct() also drops the last +@@ -4330,7 +4330,7 @@ int perf_event_release_kernel(struct perf_event *event) + * Cannot change, child events are not migrated, see the + * comment with perf_event_ctx_lock_nested(). + */ +- ctx = lockless_dereference(child->ctx); ++ ctx = READ_ONCE(child->ctx); + /* + * Since child_mutex nests inside ctx::mutex, we must jump + * through hoops. We start by grabbing a reference on the ctx. +diff --git a/kernel/seccomp.c b/kernel/seccomp.c +index 418a1c045933..5f0dfb2abb8d 100644 +--- a/kernel/seccomp.c ++++ b/kernel/seccomp.c +@@ -190,7 +190,7 @@ static u32 seccomp_run_filters(const struct seccomp_data *sd, + u32 ret = SECCOMP_RET_ALLOW; + /* Make sure cross-thread synced filter points somewhere sane. */ + struct seccomp_filter *f = +- lockless_dereference(current->seccomp.filter); ++ READ_ONCE(current->seccomp.filter); + + /* Ensure unexpected behavior doesn't result in failing open. */ + if (unlikely(WARN_ON(f == NULL))) +diff --git a/kernel/task_work.c b/kernel/task_work.c +index 5718b3ea202a..0fef395662a6 100644 +--- a/kernel/task_work.c ++++ b/kernel/task_work.c +@@ -68,7 +68,7 @@ task_work_cancel(struct task_struct *task, task_work_func_t func) + * we raced with task_work_run(), *pprev == NULL/exited. + */ + raw_spin_lock_irqsave(&task->pi_lock, flags); +- while ((work = lockless_dereference(*pprev))) { ++ while ((work = READ_ONCE(*pprev))) { + if (work->func != func) + pprev = &work->next; + else if (cmpxchg(pprev, work, work->next) == work) +diff --git a/mm/slab.h b/mm/slab.h +index 028cdc7df67e..86d7c7d860f9 100644 +--- a/mm/slab.h ++++ b/mm/slab.h +@@ -259,7 +259,7 @@ cache_from_memcg_idx(struct kmem_cache *s, int idx) + * memcg_caches issues a write barrier to match this (see + * memcg_create_kmem_cache()). 
+ */ +- cachep = lockless_dereference(arr->entries[idx]); ++ cachep = READ_ONCE(arr->entries[idx]); + rcu_read_unlock(); + + return cachep; +-- +2.15.0 + diff --git a/queue/md-always-set-THREAD_WAKEUP-and-wake-up-wqueue-if-th.patch b/queue/md-always-set-THREAD_WAKEUP-and-wake-up-wqueue-if-th.patch new file mode 100644 index 0000000..13a85d7 --- /dev/null +++ b/queue/md-always-set-THREAD_WAKEUP-and-wake-up-wqueue-if-th.patch @@ -0,0 +1,59 @@ +From d1d90147c9680aaec4a5757932c2103c42c9c23b Mon Sep 17 00:00:00 2001 +From: Guoqing Jiang <gqjiang@suse.com> +Date: Mon, 9 Oct 2017 10:32:48 +0800 +Subject: [PATCH] md: always set THREAD_WAKEUP and wake up wqueue if thread + existed + +commit d1d90147c9680aaec4a5757932c2103c42c9c23b upstream. + +Since commit 4ad23a976413 ("MD: use per-cpu counter for writes_pending"), +the wait_queue is only got invoked if THREAD_WAKEUP is not set previously. + +With above change, I can see process_metadata_update could always hang on +the wait queue, because mddev->thread could stay on 'D' status and the +THREAD_WAKEUP flag is not cleared since there are lots of place to wake up +mddev->thread. Then deadlock happened as follows: + +linux175:~ # ps aux|grep md|grep D +root 20117 0.0 0.0 0 0 ? D 03:45 0:00 [md0_raid1] +root 20125 0.0 0.0 0 0 ? 
D 03:45 0:00 [md0_cluster_rec] +linux175:~ # cat /proc/20117/stack +[<ffffffffa0635604>] dlm_lock_sync+0x94/0xd0 [md_cluster] +[<ffffffffa0635674>] lock_token+0x34/0xd0 [md_cluster] +[<ffffffffa0635804>] metadata_update_start+0x64/0x110 [md_cluster] +[<ffffffffa04d985b>] md_update_sb.part.58+0x9b/0x860 [md_mod] +[<ffffffffa04da035>] md_update_sb+0x15/0x30 [md_mod] +[<ffffffffa04dc066>] md_check_recovery+0x266/0x490 [md_mod] +[<ffffffffa06450e2>] raid1d+0x42/0x810 [raid1] +[<ffffffffa04d2252>] md_thread+0x122/0x150 [md_mod] +[<ffffffff81091741>] kthread+0x101/0x140 +linux175:~ # cat /proc/20125/stack +[<ffffffffa0636679>] recv_daemon+0x3f9/0x5c0 [md_cluster] +[<ffffffffa04d2252>] md_thread+0x122/0x150 [md_mod] +[<ffffffff81091741>] kthread+0x101/0x140 + +So let's revert the part of code in the commit to resovle the problem since +we can't get lots of benefits of previous change. + +Fixes: 4ad23a976413 ("MD: use per-cpu counter for writes_pending") +Signed-off-by: Guoqing Jiang <gqjiang@suse.com> +Signed-off-by: Shaohua Li <shli@fb.com> + +diff --git a/drivers/md/md.c b/drivers/md/md.c +index 8b2eb0f4122f..707471e3cb01 100644 +--- a/drivers/md/md.c ++++ b/drivers/md/md.c +@@ -7468,8 +7468,8 @@ void md_wakeup_thread(struct md_thread *thread) + { + if (thread) { + pr_debug("md: waking up MD thread %s.\n", thread->tsk->comm); +- if (!test_and_set_bit(THREAD_WAKEUP, &thread->flags)) +- wake_up(&thread->wqueue); ++ set_bit(THREAD_WAKEUP, &thread->flags); ++ wake_up(&thread->wqueue); + } + } + EXPORT_SYMBOL(md_wakeup_thread); +-- +2.15.0 + diff --git a/queue/mm-sparsemem-Allocate-mem_section-at-runtime-for-CON.patch b/queue/mm-sparsemem-Allocate-mem_section-at-runtime-for-CON.patch new file mode 100644 index 0000000..0b77620 --- /dev/null +++ b/queue/mm-sparsemem-Allocate-mem_section-at-runtime-for-CON.patch @@ -0,0 +1,128 @@ +From 83e3c48729d9ebb7af5a31a504f3fd6aff0348c4 Mon Sep 17 00:00:00 2001 +From: "Kirill A. 
Shutemov" <kirill.shutemov@linux.intel.com> +Date: Fri, 29 Sep 2017 17:08:16 +0300 +Subject: [PATCH] mm/sparsemem: Allocate mem_section at runtime for + CONFIG_SPARSEMEM_EXTREME=y + +commit 83e3c48729d9ebb7af5a31a504f3fd6aff0348c4 upstream. + +Size of the mem_section[] array depends on the size of the physical address space. + +In preparation for boot-time switching between paging modes on x86-64 +we need to make the allocation of mem_section[] dynamic, because otherwise +we waste a lot of RAM: with CONFIG_NODE_SHIFT=10, mem_section[] size is 32kB +for 4-level paging and 2MB for 5-level paging mode. + +The patch allocates the array on the first call to sparse_memory_present_with_active_regions(). + +Signed-off-by: Kirill A. Shutemov <kirill.shutemov@linux.intel.com> +Cc: Andrew Morton <akpm@linux-foundation.org> +Cc: Andy Lutomirski <luto@amacapital.net> +Cc: Borislav Petkov <bp@suse.de> +Cc: Cyrill Gorcunov <gorcunov@openvz.org> +Cc: Linus Torvalds <torvalds@linux-foundation.org> +Cc: Peter Zijlstra <peterz@infradead.org> +Cc: Thomas Gleixner <tglx@linutronix.de> +Cc: linux-mm@kvack.org +Link: http://lkml.kernel.org/r/20170929140821.37654-2-kirill.shutemov@linux.intel.com +Signed-off-by: Ingo Molnar <mingo@kernel.org> + +diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h +index c8f89417740b..e796edf1296f 100644 +--- a/include/linux/mmzone.h ++++ b/include/linux/mmzone.h +@@ -1150,13 +1150,17 @@ struct mem_section { + #define SECTION_ROOT_MASK (SECTIONS_PER_ROOT - 1) + + #ifdef CONFIG_SPARSEMEM_EXTREME +-extern struct mem_section *mem_section[NR_SECTION_ROOTS]; ++extern struct mem_section **mem_section; + #else + extern struct mem_section mem_section[NR_SECTION_ROOTS][SECTIONS_PER_ROOT]; + #endif + + static inline struct mem_section *__nr_to_section(unsigned long nr) + { ++#ifdef CONFIG_SPARSEMEM_EXTREME ++ if (!mem_section) ++ return NULL; ++#endif + if (!mem_section[SECTION_NR_TO_ROOT(nr)]) + return NULL; + return &mem_section[SECTION_NR_TO_ROOT(nr)][nr 
& SECTION_ROOT_MASK]; +diff --git a/mm/page_alloc.c b/mm/page_alloc.c +index 77e4d3c5c57b..8dfd13f724d9 100644 +--- a/mm/page_alloc.c ++++ b/mm/page_alloc.c +@@ -5646,6 +5646,16 @@ void __init sparse_memory_present_with_active_regions(int nid) + unsigned long start_pfn, end_pfn; + int i, this_nid; + ++#ifdef CONFIG_SPARSEMEM_EXTREME ++ if (!mem_section) { ++ unsigned long size, align; ++ ++ size = sizeof(struct mem_section) * NR_SECTION_ROOTS; ++ align = 1 << (INTERNODE_CACHE_SHIFT); ++ mem_section = memblock_virt_alloc(size, align); ++ } ++#endif ++ + for_each_mem_pfn_range(i, nid, &start_pfn, &end_pfn, &this_nid) + memory_present(this_nid, start_pfn, end_pfn); + } +diff --git a/mm/sparse.c b/mm/sparse.c +index 83b3bf6461af..b00a97398795 100644 +--- a/mm/sparse.c ++++ b/mm/sparse.c +@@ -22,8 +22,7 @@ + * 1) mem_section - memory sections, mem_map's for valid memory + */ + #ifdef CONFIG_SPARSEMEM_EXTREME +-struct mem_section *mem_section[NR_SECTION_ROOTS] +- ____cacheline_internodealigned_in_smp; ++struct mem_section **mem_section; + #else + struct mem_section mem_section[NR_SECTION_ROOTS][SECTIONS_PER_ROOT] + ____cacheline_internodealigned_in_smp; +@@ -100,7 +99,7 @@ static inline int sparse_index_init(unsigned long section_nr, int nid) + int __section_nr(struct mem_section* ms) + { + unsigned long root_nr; +- struct mem_section* root; ++ struct mem_section *root = NULL; + + for (root_nr = 0; root_nr < NR_SECTION_ROOTS; root_nr++) { + root = __nr_to_section(root_nr * SECTIONS_PER_ROOT); +@@ -111,7 +110,7 @@ int __section_nr(struct mem_section* ms) + break; + } + +- VM_BUG_ON(root_nr == NR_SECTION_ROOTS); ++ VM_BUG_ON(!root); + + return (root_nr * SECTIONS_PER_ROOT) + (ms - root); + } +@@ -329,11 +328,17 @@ sparse_early_usemaps_alloc_pgdat_section(struct pglist_data *pgdat, + static void __init check_usemap_section_nr(int nid, unsigned long *usemap) + { + unsigned long usemap_snr, pgdat_snr; +- static unsigned long old_usemap_snr = NR_MEM_SECTIONS; +- static 
unsigned long old_pgdat_snr = NR_MEM_SECTIONS; ++ static unsigned long old_usemap_snr; ++ static unsigned long old_pgdat_snr; + struct pglist_data *pgdat = NODE_DATA(nid); + int usemap_nid; + ++ /* First call */ ++ if (!old_usemap_snr) { ++ old_usemap_snr = NR_MEM_SECTIONS; ++ old_pgdat_snr = NR_MEM_SECTIONS; ++ } ++ + usemap_snr = pfn_to_section_nr(__pa(usemap) >> PAGE_SHIFT); + pgdat_snr = pfn_to_section_nr(__pa(pgdat) >> PAGE_SHIFT); + if (usemap_snr == pgdat_snr) +-- +2.15.0 + diff --git a/queue/mm-sparsemem-Fix-ARM64-boot-crash-when-CONFIG_SPARSE.patch b/queue/mm-sparsemem-Fix-ARM64-boot-crash-when-CONFIG_SPARSE.patch new file mode 100644 index 0000000..b90edd5 --- /dev/null +++ b/queue/mm-sparsemem-Fix-ARM64-boot-crash-when-CONFIG_SPARSE.patch @@ -0,0 +1,76 @@ +From 629a359bdb0e0652a8227b4ff3125431995fec6e Mon Sep 17 00:00:00 2001 +From: "Kirill A. Shutemov" <kirill.shutemov@linux.intel.com> +Date: Tue, 7 Nov 2017 11:33:37 +0300 +Subject: [PATCH] mm/sparsemem: Fix ARM64 boot crash when + CONFIG_SPARSEMEM_EXTREME=y + +commit 629a359bdb0e0652a8227b4ff3125431995fec6e upstream. + +Since commit: + + 83e3c48729d9 ("mm/sparsemem: Allocate mem_section at runtime for CONFIG_SPARSEMEM_EXTREME=y") + +we allocate the mem_section array dynamically in sparse_memory_present_with_active_regions(), +but some architectures, like arm64, don't call the routine to initialize sparsemem. + +Let's move the initialization into memory_present() it should cover all +architectures. + +Reported-and-tested-by: Sudeep Holla <sudeep.holla@arm.com> +Tested-by: Bjorn Andersson <bjorn.andersson@linaro.org> +Signed-off-by: Kirill A. 
Shutemov <kirill.shutemov@linux.intel.com> +Acked-by: Will Deacon <will.deacon@arm.com> +Cc: Andrew Morton <akpm@linux-foundation.org> +Cc: Linus Torvalds <torvalds@linux-foundation.org> +Cc: Peter Zijlstra <peterz@infradead.org> +Cc: Thomas Gleixner <tglx@linutronix.de> +Cc: linux-mm@kvack.org +Fixes: 83e3c48729d9 ("mm/sparsemem: Allocate mem_section at runtime for CONFIG_SPARSEMEM_EXTREME=y") +Link: http://lkml.kernel.org/r/20171107083337.89952-1-kirill.shutemov@linux.intel.com +Signed-off-by: Ingo Molnar <mingo@kernel.org> + +diff --git a/mm/page_alloc.c b/mm/page_alloc.c +index 8dfd13f724d9..77e4d3c5c57b 100644 +--- a/mm/page_alloc.c ++++ b/mm/page_alloc.c +@@ -5646,16 +5646,6 @@ void __init sparse_memory_present_with_active_regions(int nid) + unsigned long start_pfn, end_pfn; + int i, this_nid; + +-#ifdef CONFIG_SPARSEMEM_EXTREME +- if (!mem_section) { +- unsigned long size, align; +- +- size = sizeof(struct mem_section) * NR_SECTION_ROOTS; +- align = 1 << (INTERNODE_CACHE_SHIFT); +- mem_section = memblock_virt_alloc(size, align); +- } +-#endif +- + for_each_mem_pfn_range(i, nid, &start_pfn, &end_pfn, &this_nid) + memory_present(this_nid, start_pfn, end_pfn); + } +diff --git a/mm/sparse.c b/mm/sparse.c +index b00a97398795..d294148ba395 100644 +--- a/mm/sparse.c ++++ b/mm/sparse.c +@@ -206,6 +206,16 @@ void __init memory_present(int nid, unsigned long start, unsigned long end) + { + unsigned long pfn; + ++#ifdef CONFIG_SPARSEMEM_EXTREME ++ if (unlikely(!mem_section)) { ++ unsigned long size, align; ++ ++ size = sizeof(struct mem_section) * NR_SECTION_ROOTS; ++ align = 1 << (INTERNODE_CACHE_SHIFT); ++ mem_section = memblock_virt_alloc(size, align); ++ } ++#endif ++ + start &= PAGE_SECTION_MASK; + mminit_validate_memmodel_limits(&start, &end); + for (pfn = start; pfn < end; pfn += PAGES_PER_SECTION) { +-- +2.15.0 + diff --git a/queue/net-ipv6-send-NS-for-DAD-when-link-operationally-up.patch b/queue/net-ipv6-send-NS-for-DAD-when-link-operationally-up.patch new 
file mode 100644 index 0000000..ad709c3 --- /dev/null +++ b/queue/net-ipv6-send-NS-for-DAD-when-link-operationally-up.patch @@ -0,0 +1,70 @@ +From 1f372c7bfb23286d2bf4ce0423ab488e86b74bb2 Mon Sep 17 00:00:00 2001 +From: Mike Manning <mmanning@brocade.com> +Date: Mon, 25 Sep 2017 22:01:36 +0100 +Subject: [PATCH] net: ipv6: send NS for DAD when link operationally up + +commit 1f372c7bfb23286d2bf4ce0423ab488e86b74bb2 upstream. + +The NS for DAD are sent on admin up as long as a valid qdisc is found. +A race condition exists by which these packets will not egress the +interface if the operational state of the lower device is not yet up. +The solution is to delay DAD until the link is operationally up +according to RFC2863. Rather than only doing this, follow the existing +code checks by deferring IPv6 device initialization altogether. The fix +allows DAD on devices like tunnels that are controlled by userspace +control plane. The fix has no impact on regular deployments, but means +that there is no IPv6 connectivity until the port has been opened in +the case of port-based network access control, which should be +desirable. + +Signed-off-by: Mike Manning <mmanning@brocade.com> +Signed-off-by: David S. 
Miller <davem@davemloft.net> + +diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c +index 13c3b697f8c0..f553f72d0bee 100644 +--- a/net/ipv6/addrconf.c ++++ b/net/ipv6/addrconf.c +@@ -303,10 +303,10 @@ static struct ipv6_devconf ipv6_devconf_dflt __read_mostly = { + .disable_policy = 0, + }; + +-/* Check if a valid qdisc is available */ +-static inline bool addrconf_qdisc_ok(const struct net_device *dev) ++/* Check if link is ready: is it up and is a valid qdisc available */ ++static inline bool addrconf_link_ready(const struct net_device *dev) + { +- return !qdisc_tx_is_noop(dev); ++ return netif_oper_up(dev) && !qdisc_tx_is_noop(dev); + } + + static void addrconf_del_rs_timer(struct inet6_dev *idev) +@@ -451,7 +451,7 @@ static struct inet6_dev *ipv6_add_dev(struct net_device *dev) + + ndev->token = in6addr_any; + +- if (netif_running(dev) && addrconf_qdisc_ok(dev)) ++ if (netif_running(dev) && addrconf_link_ready(dev)) + ndev->if_flags |= IF_READY; + + ipv6_mc_init_dev(ndev); +@@ -3403,7 +3403,7 @@ static int addrconf_notify(struct notifier_block *this, unsigned long event, + /* restore routes for permanent addresses */ + addrconf_permanent_addr(dev); + +- if (!addrconf_qdisc_ok(dev)) { ++ if (!addrconf_link_ready(dev)) { + /* device is not ready yet. */ + pr_info("ADDRCONF(NETDEV_UP): %s: link is not ready\n", + dev->name); +@@ -3418,7 +3418,7 @@ static int addrconf_notify(struct notifier_block *this, unsigned long event, + run_pending = 1; + } + } else if (event == NETDEV_CHANGE) { +- if (!addrconf_qdisc_ok(dev)) { ++ if (!addrconf_link_ready(dev)) { + /* device is still not ready. 
*/ + break; + } +-- +2.15.0 + diff --git a/queue/net-phy-at803x-Change-error-to-EINVAL-for-invalid-MA.patch b/queue/net-phy-at803x-Change-error-to-EINVAL-for-invalid-MA.patch new file mode 100644 index 0000000..2d4ef26 --- /dev/null +++ b/queue/net-phy-at803x-Change-error-to-EINVAL-for-invalid-MA.patch @@ -0,0 +1,29 @@ +From fc7556877d1748ac00958822a0a3bba1d4bd9e0d Mon Sep 17 00:00:00 2001 +From: Dan Murphy <dmurphy@ti.com> +Date: Tue, 10 Oct 2017 12:42:56 -0500 +Subject: [PATCH] net: phy: at803x: Change error to EINVAL for invalid MAC + +commit fc7556877d1748ac00958822a0a3bba1d4bd9e0d upstream. + +Change the return error code to EINVAL if the MAC +address is not valid in the set_wol function. + +Signed-off-by: Dan Murphy <dmurphy@ti.com> +Signed-off-by: David S. Miller <davem@davemloft.net> + +diff --git a/drivers/net/phy/at803x.c b/drivers/net/phy/at803x.c +index c1e52b9dc58d..5f93e6add563 100644 +--- a/drivers/net/phy/at803x.c ++++ b/drivers/net/phy/at803x.c +@@ -167,7 +167,7 @@ static int at803x_set_wol(struct phy_device *phydev, + mac = (const u8 *) ndev->dev_addr; + + if (!is_valid_ether_addr(mac)) +- return -EFAULT; ++ return -EINVAL; + + for (i = 0; i < 3; i++) { + phy_write(phydev, AT803X_MMD_ACCESS_CONTROL, +-- +2.15.0 + diff --git a/queue/objtool-Don-t-report-end-of-section-error-after-an-e.patch b/queue/objtool-Don-t-report-end-of-section-error-after-an-e.patch new file mode 100644 index 0000000..8a868d0 --- /dev/null +++ b/queue/objtool-Don-t-report-end-of-section-error-after-an-e.patch @@ -0,0 +1,47 @@ +From 00d96180dc38ef872ac471c2d3e14b067cbd895d Mon Sep 17 00:00:00 2001 +From: Josh Poimboeuf <jpoimboe@redhat.com> +Date: Mon, 18 Sep 2017 21:43:30 -0500 +Subject: [PATCH] objtool: Don't report end of section error after an empty + unwind hint + +commit 00d96180dc38ef872ac471c2d3e14b067cbd895d upstream. + +If asm code specifies an UNWIND_HINT_EMPTY hint, don't warn if the +section ends unexpectedly. 
This can happen with the xen-head.S code +because the hypercall_page is "text" but it's all zeros. + +Signed-off-by: Josh Poimboeuf <jpoimboe@redhat.com> +Cc: Andy Lutomirski <luto@kernel.org> +Cc: Boris Ostrovsky <boris.ostrovsky@oracle.com> +Cc: Jiri Slaby <jslaby@suse.cz> +Cc: Juergen Gross <jgross@suse.com> +Cc: Linus Torvalds <torvalds@linux-foundation.org> +Cc: Peter Zijlstra <peterz@infradead.org> +Cc: Thomas Gleixner <tglx@linutronix.de> +Link: http://lkml.kernel.org/r/ddafe199dd8797e40e3c2777373347eba1d65572.1505764066.git.jpoimboe@redhat.com +Signed-off-by: Ingo Molnar <mingo@kernel.org> + +diff --git a/tools/objtool/check.c b/tools/objtool/check.c +index a0c518ecf085..83f370fa00c2 100644 +--- a/tools/objtool/check.c ++++ b/tools/objtool/check.c +@@ -1752,11 +1752,14 @@ static int validate_branch(struct objtool_file *file, struct instruction *first, + if (insn->dead_end) + return 0; + +- insn = next_insn; +- if (!insn) { ++ if (!next_insn) { ++ if (state.cfa.base == CFI_UNDEFINED) ++ return 0; + WARN("%s: unexpected end of section", sec->name); + return 1; + } ++ ++ insn = next_insn; + } + + return 0; +-- +2.15.0 + diff --git a/queue/objtool-Print-top-level-commands-on-incorrect-usage.patch b/queue/objtool-Print-top-level-commands-on-incorrect-usage.patch new file mode 100644 index 0000000..4ce56e4 --- /dev/null +++ b/queue/objtool-Print-top-level-commands-on-incorrect-usage.patch @@ -0,0 +1,62 @@ +From 6a93bb7e4a7d6670677d5b0eb980936eb9cc5d2e Mon Sep 17 00:00:00 2001 +From: Kamalesh Babulal <kamalesh@linux.vnet.ibm.com> +Date: Sat, 14 Oct 2017 20:17:54 +0530 +Subject: [PATCH] objtool: Print top level commands on incorrect usage + +commit 6a93bb7e4a7d6670677d5b0eb980936eb9cc5d2e upstream. + +Print top-level objtool commands, along with the error on incorrect +command line usage. Objtool command line parser exit's with code 129, +for incorrect usage. Convert the cmd_usage() exit code also, to maintain +consistency across objtool. 
+ +After the patch: + + $ ./objtool -j + + Unknown option: -j + + usage: objtool COMMAND [ARGS] + + Commands: + check Perform stack metadata validation on an object file + orc Generate in-place ORC unwind tables for an object file + + $ echo $? + 129 + +Signed-off-by: Kamalesh Babulal <kamalesh@linux.vnet.ibm.com> +Acked-by: Josh Poimboeuf <jpoimboe@redhat.com> +Cc: Linus Torvalds <torvalds@linux-foundation.org> +Cc: Peter Zijlstra <peterz@infradead.org> +Cc: Thomas Gleixner <tglx@linutronix.de> +Link: http://lkml.kernel.org/r/1507992474-16142-1-git-send-email-kamalesh@linux.vnet.ibm.com +Signed-off-by: Ingo Molnar <mingo@kernel.org> + +diff --git a/tools/objtool/objtool.c b/tools/objtool/objtool.c +index 31e0f9143840..07f329919828 100644 +--- a/tools/objtool/objtool.c ++++ b/tools/objtool/objtool.c +@@ -70,7 +70,7 @@ static void cmd_usage(void) + + printf("\n"); + +- exit(1); ++ exit(129); + } + + static void handle_options(int *argc, const char ***argv) +@@ -86,9 +86,7 @@ static void handle_options(int *argc, const char ***argv) + break; + } else { + fprintf(stderr, "Unknown option: %s\n", cmd); +- fprintf(stderr, "\n Usage: %s\n", +- objtool_usage_string); +- exit(1); ++ cmd_usage(); + } + + (*argv)++; +-- +2.15.0 + diff --git a/queue/optee-fix-invalid-of_node_put-in-optee_driver_init.patch b/queue/optee-fix-invalid-of_node_put-in-optee_driver_init.patch new file mode 100644 index 0000000..ba83ecb --- /dev/null +++ b/queue/optee-fix-invalid-of_node_put-in-optee_driver_init.patch @@ -0,0 +1,32 @@ +From f044113113dd95ba73916bde10e804d3cdfa2662 Mon Sep 17 00:00:00 2001 +From: Jens Wiklander <jens.wiklander@linaro.org> +Date: Mon, 9 Oct 2017 11:11:49 +0200 +Subject: [PATCH] optee: fix invalid of_node_put() in optee_driver_init() + +commit f044113113dd95ba73916bde10e804d3cdfa2662 upstream. + +The first node supplied to of_find_matching_node() has its reference +counter decreased as part of call to that function. 
In optee_driver_init() +after calling of_find_matching_node() it's invalid to call of_node_put() on +the supplied node again. + +So remove the invalid call to of_node_put(). + +Reported-by: Alex Shi <alex.shi@linaro.org> +Signed-off-by: Jens Wiklander <jens.wiklander@linaro.org> + +diff --git a/drivers/tee/optee/core.c b/drivers/tee/optee/core.c +index 7952357df9c8..edb6e4e9ef3a 100644 +--- a/drivers/tee/optee/core.c ++++ b/drivers/tee/optee/core.c +@@ -590,7 +590,6 @@ static int __init optee_driver_init(void) + return -ENODEV; + + np = of_find_matching_node(fw_np, optee_match); +- of_node_put(fw_np); + if (!np) + return -ENODEV; + +-- +2.15.0 + diff --git a/queue/perf-x86-Enable-free-running-PEBS-for-REGS_USER-INTR.patch b/queue/perf-x86-Enable-free-running-PEBS-for-REGS_USER-INTR.patch new file mode 100644 index 0000000..b651aee --- /dev/null +++ b/queue/perf-x86-Enable-free-running-PEBS-for-REGS_USER-INTR.patch @@ -0,0 +1,97 @@ +From 2fe1bc1f501d55e5925b4035bcd85781adc76c63 Mon Sep 17 00:00:00 2001 +From: Andi Kleen <ak@linux.intel.com> +Date: Thu, 31 Aug 2017 14:46:30 -0700 +Subject: [PATCH] perf/x86: Enable free running PEBS for REGS_USER/INTR + +commit 2fe1bc1f501d55e5925b4035bcd85781adc76c63 upstream. + +[ Note, this is a Git cherry-pick of the following commit: + + a47ba4d77e12 ("perf/x86: Enable free running PEBS for REGS_USER/INTR") + + ... for easier x86 PTI code testing and back-porting. ] + +Currently free running PEBS is disabled when user or interrupt +registers are requested. Most of the registers are actually +available in the PEBS record and can be supported. + +So we just need to check for the supported registers and then +allow it: it is all except for the segment register. + +For user registers this only works when the counter is limited +to ring 3 only, so this also needs to be checked. 
+ +Signed-off-by: Andi Kleen <ak@linux.intel.com> +Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org> +Cc: Linus Torvalds <torvalds@linux-foundation.org> +Cc: Peter Zijlstra <peterz@infradead.org> +Cc: Thomas Gleixner <tglx@linutronix.de> +Link: http://lkml.kernel.org/r/20170831214630.21892-1-andi@firstfloor.org +Signed-off-by: Ingo Molnar <mingo@kernel.org> + +diff --git a/arch/x86/events/intel/core.c b/arch/x86/events/intel/core.c +index 9fb9a1f1e47b..43445da30cea 100644 +--- a/arch/x86/events/intel/core.c ++++ b/arch/x86/events/intel/core.c +@@ -2958,6 +2958,10 @@ static unsigned long intel_pmu_free_running_flags(struct perf_event *event) + + if (event->attr.use_clockid) + flags &= ~PERF_SAMPLE_TIME; ++ if (!event->attr.exclude_kernel) ++ flags &= ~PERF_SAMPLE_REGS_USER; ++ if (event->attr.sample_regs_user & ~PEBS_REGS) ++ flags &= ~(PERF_SAMPLE_REGS_USER | PERF_SAMPLE_REGS_INTR); + return flags; + } + +diff --git a/arch/x86/events/perf_event.h b/arch/x86/events/perf_event.h +index 4196f81ec0e1..f7aaadf9331f 100644 +--- a/arch/x86/events/perf_event.h ++++ b/arch/x86/events/perf_event.h +@@ -85,13 +85,15 @@ struct amd_nb { + * Flags PEBS can handle without an PMI. + * + * TID can only be handled by flushing at context switch. ++ * REGS_USER can be handled for events limited to ring 3. + * + */ + #define PEBS_FREERUNNING_FLAGS \ + (PERF_SAMPLE_IP | PERF_SAMPLE_TID | PERF_SAMPLE_ADDR | \ + PERF_SAMPLE_ID | PERF_SAMPLE_CPU | PERF_SAMPLE_STREAM_ID | \ + PERF_SAMPLE_DATA_SRC | PERF_SAMPLE_IDENTIFIER | \ +- PERF_SAMPLE_TRANSACTION | PERF_SAMPLE_PHYS_ADDR) ++ PERF_SAMPLE_TRANSACTION | PERF_SAMPLE_PHYS_ADDR | \ ++ PERF_SAMPLE_REGS_INTR | PERF_SAMPLE_REGS_USER) + + /* + * A debug store configuration. 
+@@ -110,6 +112,26 @@ struct debug_store { + u64 pebs_event_reset[MAX_PEBS_EVENTS]; + }; + ++#define PEBS_REGS \ ++ (PERF_REG_X86_AX | \ ++ PERF_REG_X86_BX | \ ++ PERF_REG_X86_CX | \ ++ PERF_REG_X86_DX | \ ++ PERF_REG_X86_DI | \ ++ PERF_REG_X86_SI | \ ++ PERF_REG_X86_SP | \ ++ PERF_REG_X86_BP | \ ++ PERF_REG_X86_IP | \ ++ PERF_REG_X86_FLAGS | \ ++ PERF_REG_X86_R8 | \ ++ PERF_REG_X86_R9 | \ ++ PERF_REG_X86_R10 | \ ++ PERF_REG_X86_R11 | \ ++ PERF_REG_X86_R12 | \ ++ PERF_REG_X86_R13 | \ ++ PERF_REG_X86_R14 | \ ++ PERF_REG_X86_R15) ++ + /* + * Per register state. + */ +-- +2.15.0 + diff --git a/queue/platform-x86-asus-wireless-send-an-EV_SYN-SYN_REPORT.patch b/queue/platform-x86-asus-wireless-send-an-EV_SYN-SYN_REPORT.patch new file mode 100644 index 0000000..327c8b2 --- /dev/null +++ b/queue/platform-x86-asus-wireless-send-an-EV_SYN-SYN_REPORT.patch @@ -0,0 +1,36 @@ +From bff5bf9db1c9453ffd0a78abed3e2d040c092fd9 Mon Sep 17 00:00:00 2001 +From: Peter Hutterer <peter.hutterer@who-t.net> +Date: Mon, 4 Dec 2017 10:26:17 +1000 +Subject: [PATCH] platform/x86: asus-wireless: send an EV_SYN/SYN_REPORT + between state changes + +commit bff5bf9db1c9453ffd0a78abed3e2d040c092fd9 upstream. + +Sending the switch state change twice within the same frame is invalid +evdev protocol and only works if the client handles keys immediately as +well. Processing events immediately is incorrect, it forces a fake +order of events that does not exist on the device. + +Recent versions of libinput changed to only process the device state and +SYN_REPORT time, so now the key event is lost. 
+ +https://bugs.freedesktop.org/show_bug.cgi?id=104041 + +Signed-off-by: Peter Hutterer <peter.hutterer@who-t.net> +Signed-off-by: Darren Hart (VMware) <dvhart@infradead.org> + +diff --git a/drivers/platform/x86/asus-wireless.c b/drivers/platform/x86/asus-wireless.c +index f3796164329e..d4aeac3477f5 100644 +--- a/drivers/platform/x86/asus-wireless.c ++++ b/drivers/platform/x86/asus-wireless.c +@@ -118,6 +118,7 @@ static void asus_wireless_notify(struct acpi_device *adev, u32 event) + return; + } + input_report_key(data->idev, KEY_RFKILL, 1); ++ input_sync(data->idev); + input_report_key(data->idev, KEY_RFKILL, 0); + input_sync(data->idev); + } +-- +2.15.0 + diff --git a/queue/powerpc-watchdog-Do-not-trigger-SMP-crash-from-touch.patch b/queue/powerpc-watchdog-Do-not-trigger-SMP-crash-from-touch.patch new file mode 100644 index 0000000..32e3fd4 --- /dev/null +++ b/queue/powerpc-watchdog-Do-not-trigger-SMP-crash-from-touch.patch @@ -0,0 +1,42 @@ +From 80e4d70b06863e0104e5a0dc78aa3710297fbd4b Mon Sep 17 00:00:00 2001 +From: Nicholas Piggin <npiggin@gmail.com> +Date: Fri, 29 Sep 2017 13:29:39 +1000 +Subject: [PATCH] powerpc/watchdog: Do not trigger SMP crash from + touch_nmi_watchdog + +commit 80e4d70b06863e0104e5a0dc78aa3710297fbd4b upstream. + +In xmon, touch_nmi_watchdog() is not expected to be checking that +other CPUs have not touched the watchdog, so the code will just call +touch_nmi_watchdog() once before re-enabling hard interrupts. + +Just update our CPU's state, and ignore apparently stuck SMP threads. + +Arguably touch_nmi_watchdog should check for SMP lockups, and callers +should be fixed, but that's not trivial for the input code of xmon. 
+ +Signed-off-by: Nicholas Piggin <npiggin@gmail.com> +Signed-off-by: Michael Ellerman <mpe@ellerman.id.au> + +diff --git a/arch/powerpc/kernel/watchdog.c b/arch/powerpc/kernel/watchdog.c +index 920e61c79f47..1fb9379dc683 100644 +--- a/arch/powerpc/kernel/watchdog.c ++++ b/arch/powerpc/kernel/watchdog.c +@@ -277,9 +277,12 @@ void arch_touch_nmi_watchdog(void) + { + unsigned long ticks = tb_ticks_per_usec * wd_timer_period_ms * 1000; + int cpu = smp_processor_id(); ++ u64 tb = get_tb(); + +- if (get_tb() - per_cpu(wd_timer_tb, cpu) >= ticks) +- watchdog_timer_interrupt(cpu); ++ if (tb - per_cpu(wd_timer_tb, cpu) >= ticks) { ++ per_cpu(wd_timer_tb, cpu) = tb; ++ wd_smp_clear_cpu_pending(cpu, tb); ++ } + } + EXPORT_SYMBOL(arch_touch_nmi_watchdog); + +-- +2.15.0 + diff --git a/queue/powerpc-xmon-Avoid-tripping-SMP-hardlockup-watchdog.patch b/queue/powerpc-xmon-Avoid-tripping-SMP-hardlockup-watchdog.patch new file mode 100644 index 0000000..5061c75 --- /dev/null +++ b/queue/powerpc-xmon-Avoid-tripping-SMP-hardlockup-watchdog.patch @@ -0,0 +1,74 @@ +From 064996d62a33ffe10264b5af5dca92d54f60f806 Mon Sep 17 00:00:00 2001 +From: Nicholas Piggin <npiggin@gmail.com> +Date: Fri, 29 Sep 2017 13:29:40 +1000 +Subject: [PATCH] powerpc/xmon: Avoid tripping SMP hardlockup watchdog + +commit 064996d62a33ffe10264b5af5dca92d54f60f806 upstream. + +The SMP hardlockup watchdog cross-checks other CPUs for lockups, which +causes xmon headaches because it's assuming interrupts hard disabled +means no watchdog troubles. Try to improve that by calling +touch_nmi_watchdog() in obvious places where secondaries are spinning. + +Also annotate these spin loops with spin_begin/end calls. 
+ +Signed-off-by: Nicholas Piggin <npiggin@gmail.com> +Signed-off-by: Michael Ellerman <mpe@ellerman.id.au> + +diff --git a/arch/powerpc/xmon/xmon.c b/arch/powerpc/xmon/xmon.c +index 33351c6704b1..d9a12102b111 100644 +--- a/arch/powerpc/xmon/xmon.c ++++ b/arch/powerpc/xmon/xmon.c +@@ -530,14 +530,19 @@ static int xmon_core(struct pt_regs *regs, int fromipi) + + waiting: + secondary = 1; ++ spin_begin(); + while (secondary && !xmon_gate) { + if (in_xmon == 0) { +- if (fromipi) ++ if (fromipi) { ++ spin_end(); + goto leave; ++ } + secondary = test_and_set_bit(0, &in_xmon); + } +- barrier(); ++ spin_cpu_relax(); ++ touch_nmi_watchdog(); + } ++ spin_end(); + + if (!secondary && !xmon_gate) { + /* we are the first cpu to come in */ +@@ -568,21 +573,25 @@ static int xmon_core(struct pt_regs *regs, int fromipi) + mb(); + xmon_gate = 1; + barrier(); ++ touch_nmi_watchdog(); + } + + cmdloop: + while (in_xmon) { + if (secondary) { ++ spin_begin(); + if (cpu == xmon_owner) { + if (!test_and_set_bit(0, &xmon_taken)) { + secondary = 0; ++ spin_end(); + continue; + } + /* missed it */ + while (cpu == xmon_owner) +- barrier(); ++ spin_cpu_relax(); + } +- barrier(); ++ spin_cpu_relax(); ++ touch_nmi_watchdog(); + } else { + cmd = cmds(regs); + if (cmd != 0) { +-- +2.15.0 + diff --git a/queue/ptrace-x86-Make-user_64bit_mode-available-to-32-bit-.patch b/queue/ptrace-x86-Make-user_64bit_mode-available-to-32-bit-.patch new file mode 100644 index 0000000..1a579ec --- /dev/null +++ b/queue/ptrace-x86-Make-user_64bit_mode-available-to-32-bit-.patch @@ -0,0 +1,80 @@ +From e27c310af5c05cf876d9cad006928076c27f54d4 Mon Sep 17 00:00:00 2001 +From: Ricardo Neri <ricardo.neri-calderon@linux.intel.com> +Date: Fri, 27 Oct 2017 13:25:30 -0700 +Subject: [PATCH] ptrace,x86: Make user_64bit_mode() available to 32-bit builds + +commit e27c310af5c05cf876d9cad006928076c27f54d4 upstream. + +In its current form, user_64bit_mode() can only be used when CONFIG_X86_64 +is selected. 
This implies that code built with CONFIG_X86_64=n cannot use +it. If a piece of code needs to be built for both CONFIG_X86_64=y and +CONFIG_X86_64=n and wants to use this function, it needs to wrap it in +an #ifdef/#endif; potentially, in multiple places. + +This can be easily avoided with a single #ifdef/#endif pair within +user_64bit_mode() itself. + +Suggested-by: Borislav Petkov <bp@suse.de> +Signed-off-by: Ricardo Neri <ricardo.neri-calderon@linux.intel.com> +Signed-off-by: Thomas Gleixner <tglx@linutronix.de> +Reviewed-by: Borislav Petkov <bp@suse.de> +Cc: "Michael S. Tsirkin" <mst@redhat.com> +Cc: Peter Zijlstra <peterz@infradead.org> +Cc: Dave Hansen <dave.hansen@linux.intel.com> +Cc: ricardo.neri@intel.com +Cc: Adrian Hunter <adrian.hunter@intel.com> +Cc: Paul Gortmaker <paul.gortmaker@windriver.com> +Cc: Huang Rui <ray.huang@amd.com> +Cc: Qiaowei Ren <qiaowei.ren@intel.com> +Cc: Shuah Khan <shuah@kernel.org> +Cc: Kees Cook <keescook@chromium.org> +Cc: Jonathan Corbet <corbet@lwn.net> +Cc: Jiri Slaby <jslaby@suse.cz> +Cc: Dmitry Vyukov <dvyukov@google.com> +Cc: "Ravi V. 
Shankar" <ravi.v.shankar@intel.com> +Cc: Chris Metcalf <cmetcalf@mellanox.com> +Cc: Brian Gerst <brgerst@gmail.com> +Cc: Arnaldo Carvalho de Melo <acme@redhat.com> +Cc: Andy Lutomirski <luto@kernel.org> +Cc: Colin Ian King <colin.king@canonical.com> +Cc: Chen Yucong <slaoub@gmail.com> +Cc: Adam Buchbinder <adam.buchbinder@gmail.com> +Cc: Vlastimil Babka <vbabka@suse.cz> +Cc: Lorenzo Stoakes <lstoakes@gmail.com> +Cc: Masami Hiramatsu <mhiramat@kernel.org> +Cc: Paolo Bonzini <pbonzini@redhat.com> +Cc: Andrew Morton <akpm@linux-foundation.org> +Cc: Thomas Garnier <thgarnie@google.com> +Link: https://lkml.kernel.org/r/1509135945-13762-4-git-send-email-ricardo.neri-calderon@linux.intel.com + +diff --git a/arch/x86/include/asm/ptrace.h b/arch/x86/include/asm/ptrace.h +index 91c04c8e67fa..e2afbf689309 100644 +--- a/arch/x86/include/asm/ptrace.h ++++ b/arch/x86/include/asm/ptrace.h +@@ -135,9 +135,9 @@ static inline int v8086_mode(struct pt_regs *regs) + #endif + } + +-#ifdef CONFIG_X86_64 + static inline bool user_64bit_mode(struct pt_regs *regs) + { ++#ifdef CONFIG_X86_64 + #ifndef CONFIG_PARAVIRT + /* + * On non-paravirt systems, this is the only long mode CPL 3 +@@ -148,8 +148,12 @@ static inline bool user_64bit_mode(struct pt_regs *regs) + /* Headers are too twisted for this to go in paravirt.h. 
*/ + return regs->cs == __USER_CS || regs->cs == pv_info.extra_user_64bit_cs; + #endif ++#else /* !CONFIG_X86_64 */ ++ return false; ++#endif + } + ++#ifdef CONFIG_X86_64 + #define current_user_stack_pointer() current_pt_regs()->sp + #define compat_user_stack_pointer() current_pt_regs()->sp + #endif +-- +2.15.0 + diff --git a/queue/rtc-pl031-make-interrupt-optional.patch b/queue/rtc-pl031-make-interrupt-optional.patch new file mode 100644 index 0000000..cc2f3a0 --- /dev/null +++ b/queue/rtc-pl031-make-interrupt-optional.patch @@ -0,0 +1,50 @@ +From 5b64a2965dfdfca8039e93303c64e2b15c19ff0c Mon Sep 17 00:00:00 2001 +From: Russell King <rmk+kernel@armlinux.org.uk> +Date: Fri, 29 Sep 2017 11:22:15 +0100 +Subject: [PATCH] rtc: pl031: make interrupt optional + +commit 5b64a2965dfdfca8039e93303c64e2b15c19ff0c upstream. + +On some platforms, the interrupt for the PL031 is optional. Avoid +trying to claim the interrupt if it's not specified. + +Reviewed-by: Linus Walleij <linus.walleij@linaro.org> +Signed-off-by: Russell King <rmk+kernel@armlinux.org.uk> +Signed-off-by: Alexandre Belloni <alexandre.belloni@free-electrons.com> + +diff --git a/drivers/rtc/rtc-pl031.c b/drivers/rtc/rtc-pl031.c +index 64c77ec1b4ea..82eb7da2c478 100644 +--- a/drivers/rtc/rtc-pl031.c ++++ b/drivers/rtc/rtc-pl031.c +@@ -308,7 +308,8 @@ static int pl031_remove(struct amba_device *adev) + + dev_pm_clear_wake_irq(&adev->dev); + device_init_wakeup(&adev->dev, false); +- free_irq(adev->irq[0], ldata); ++ if (adev->irq[0]) ++ free_irq(adev->irq[0], ldata); + rtc_device_unregister(ldata->rtc); + amba_release_regions(adev); + +@@ -389,12 +390,13 @@ static int pl031_probe(struct amba_device *adev, const struct amba_id *id) + goto out; + } + +- if (request_irq(adev->irq[0], pl031_interrupt, +- vendor->irqflags, "rtc-pl031", ldata)) { +- ret = -EIO; +- goto out_no_irq; ++ if (adev->irq[0]) { ++ ret = request_irq(adev->irq[0], pl031_interrupt, ++ vendor->irqflags, "rtc-pl031", ldata); ++ if (ret) ++ goto 
out_no_irq; ++ dev_pm_set_wake_irq(&adev->dev, adev->irq[0]); + } +- dev_pm_set_wake_irq(&adev->dev, adev->irq[0]); + return 0; + + out_no_irq: +-- +2.15.0 + diff --git a/queue/rtc-set-the-alarm-to-the-next-expiring-timer.patch b/queue/rtc-set-the-alarm-to-the-next-expiring-timer.patch new file mode 100644 index 0000000..75727d9 --- /dev/null +++ b/queue/rtc-set-the-alarm-to-the-next-expiring-timer.patch @@ -0,0 +1,32 @@ +From 74717b28cb32e1ad3c1042cafd76b264c8c0f68d Mon Sep 17 00:00:00 2001 +From: Alexandre Belloni <alexandre.belloni@free-electrons.com> +Date: Thu, 28 Sep 2017 13:53:27 +0200 +Subject: [PATCH] rtc: set the alarm to the next expiring timer + +commit 74717b28cb32e1ad3c1042cafd76b264c8c0f68d upstream. + +If there is any non expired timer in the queue, the RTC alarm is never set. +This is an issue when adding a timer that expires before the next non +expired timer. + +Ensure the RTC alarm is set in that case. + +Fixes: 2b2f5ff00f63 ("rtc: interface: ignore expired timers when enqueuing new timers") +Signed-off-by: Alexandre Belloni <alexandre.belloni@free-electrons.com> + +diff --git a/drivers/rtc/interface.c b/drivers/rtc/interface.c +index 8cec9a02c0b8..9eb32ead63db 100644 +--- a/drivers/rtc/interface.c ++++ b/drivers/rtc/interface.c +@@ -779,7 +779,7 @@ static int rtc_timer_enqueue(struct rtc_device *rtc, struct rtc_timer *timer) + } + + timerqueue_add(&rtc->timerqueue, &timer->node); +- if (!next) { ++ if (!next || ktime_before(timer->node.expires, next->expires)) { + struct rtc_wkalrm alarm; + int err; + alarm.time = rtc_ktime_to_tm(timer->node.expires); +-- +2.15.0 + diff --git a/queue/scsi-cxgb4i-fix-Tx-skb-leak.patch b/queue/scsi-cxgb4i-fix-Tx-skb-leak.patch new file mode 100644 index 0000000..130fe9d --- /dev/null +++ b/queue/scsi-cxgb4i-fix-Tx-skb-leak.patch @@ -0,0 +1,29 @@ +From 9b3a081fb62158b50bcc90522ca2423017544367 Mon Sep 17 00:00:00 2001 +From: Varun Prakash <varun@chelsio.com> +Date: Wed, 11 Oct 2017 19:33:07 +0530 +Subject: [PATCH] 
scsi: cxgb4i: fix Tx skb leak + +commit 9b3a081fb62158b50bcc90522ca2423017544367 upstream. + +In case of connection reset Tx skb queue can have some skbs which are +not transmitted so purge Tx skb queue in release_offload_resources() to +avoid skb leak. + +Signed-off-by: Varun Prakash <varun@chelsio.com> +Signed-off-by: Martin K. Petersen <martin.petersen@oracle.com> + +diff --git a/drivers/scsi/cxgbi/cxgb4i/cxgb4i.c b/drivers/scsi/cxgbi/cxgb4i/cxgb4i.c +index 1d02cf9fe06c..30d5f0ef29bb 100644 +--- a/drivers/scsi/cxgbi/cxgb4i/cxgb4i.c ++++ b/drivers/scsi/cxgbi/cxgb4i/cxgb4i.c +@@ -1575,6 +1575,7 @@ static void release_offload_resources(struct cxgbi_sock *csk) + csk, csk->state, csk->flags, csk->tid); + + cxgbi_sock_free_cpl_skbs(csk); ++ cxgbi_sock_purge_write_queue(csk); + if (csk->wr_cred != csk->wr_max_cred) { + cxgbi_sock_purge_wr_queue(csk); + cxgbi_sock_reset_wr_list(csk); +-- +2.15.0 + diff --git a/queue/scsi-lpfc-Fix-secure-firmware-updates.patch b/queue/scsi-lpfc-Fix-secure-firmware-updates.patch new file mode 100644 index 0000000..3f0b048 --- /dev/null +++ b/queue/scsi-lpfc-Fix-secure-firmware-updates.patch @@ -0,0 +1,34 @@ +From 184fc2b9a8bcbda9c14d0a1e7fbecfc028c7702e Mon Sep 17 00:00:00 2001 +From: Dick Kennedy <dick.kennedy@broadcom.com> +Date: Fri, 29 Sep 2017 17:34:42 -0700 +Subject: [PATCH] scsi: lpfc: Fix secure firmware updates + +commit 184fc2b9a8bcbda9c14d0a1e7fbecfc028c7702e upstream. + +Firmware update fails with: status x17 add_status x56 on the final write + +If multiple DMA buffers are used for the download, some firmware revs +have difficulty with signatures and crcs split across the dma buffer +boundaries. Resolve by making all writes be a single 4k page in length. + +Signed-off-by: Dick Kennedy <dick.kennedy@broadcom.com> +Signed-off-by: James Smart <james.smart@broadcom.com> +Reviewed-by: Johannes Thumshirn <jthumshirn@suse.de> +Signed-off-by: Martin K. 
Petersen <martin.petersen@oracle.com> + +diff --git a/drivers/scsi/lpfc/lpfc_hw4.h b/drivers/scsi/lpfc/lpfc_hw4.h +index 1db0a38683f4..2b145966c73f 100644 +--- a/drivers/scsi/lpfc/lpfc_hw4.h ++++ b/drivers/scsi/lpfc/lpfc_hw4.h +@@ -3636,7 +3636,7 @@ struct lpfc_mbx_get_port_name { + #define MB_CEQ_STATUS_QUEUE_FLUSHING 0x4 + #define MB_CQE_STATUS_DMA_FAILED 0x5 + +-#define LPFC_MBX_WR_CONFIG_MAX_BDE 8 ++#define LPFC_MBX_WR_CONFIG_MAX_BDE 1 + struct lpfc_mbx_wr_object { + struct mbox_header header; + union { +-- +2.15.0 + diff --git a/queue/scsi-lpfc-Fix-warning-messages-when-NVME_TARGET_FC-n.patch b/queue/scsi-lpfc-Fix-warning-messages-when-NVME_TARGET_FC-n.patch new file mode 100644 index 0000000..6038851 --- /dev/null +++ b/queue/scsi-lpfc-Fix-warning-messages-when-NVME_TARGET_FC-n.patch @@ -0,0 +1,43 @@ +From 2299e4323d2bf6e0728fdc6b9e8e9704978d2dd7 Mon Sep 17 00:00:00 2001 +From: Dick Kennedy <dick.kennedy@broadcom.com> +Date: Fri, 29 Sep 2017 17:34:31 -0700 +Subject: [PATCH] scsi: lpfc: Fix warning messages when NVME_TARGET_FC not + defined + +commit 2299e4323d2bf6e0728fdc6b9e8e9704978d2dd7 upstream. + +Warning messages when NVME_TARGET_FC not defined on ppc builds + +The lpfc_nvmet_replenish_context() function is only meaningful when NVME +target mode enabled. Surround the function body with ifdefs for target +mode enablement. + +Signed-off-by: Dick Kennedy <dick.kennedy@broadcom.com> +Signed-off-by: James Smart <james.smart@broadcom.com> +Reported-by: Stephen Rothwell <sfr@canb.auug.org.au> +Reviewed-by: Johannes Thumshirn <jthumshirn@suse.de> +Signed-off-by: Martin K. 
Petersen <martin.petersen@oracle.com> + +diff --git a/drivers/scsi/lpfc/lpfc_nvmet.c b/drivers/scsi/lpfc/lpfc_nvmet.c +index 0b7c1a49e203..313d5c10e0ad 100644 +--- a/drivers/scsi/lpfc/lpfc_nvmet.c ++++ b/drivers/scsi/lpfc/lpfc_nvmet.c +@@ -1457,6 +1457,7 @@ static struct lpfc_nvmet_ctxbuf * + lpfc_nvmet_replenish_context(struct lpfc_hba *phba, + struct lpfc_nvmet_ctx_info *current_infop) + { ++#if (IS_ENABLED(CONFIG_NVME_TARGET_FC)) + struct lpfc_nvmet_ctxbuf *ctx_buf = NULL; + struct lpfc_nvmet_ctx_info *get_infop; + int i; +@@ -1504,6 +1505,7 @@ lpfc_nvmet_replenish_context(struct lpfc_hba *phba, + get_infop = get_infop->nvmet_ctx_next_cpu; + } + ++#endif + /* Nothing found, all contexts for the MRQ are in-flight */ + return NULL; + } +-- +2.15.0 + diff --git a/queue/scsi-lpfc-PLOGI-failures-during-NPIV-testing.patch b/queue/scsi-lpfc-PLOGI-failures-during-NPIV-testing.patch new file mode 100644 index 0000000..1fbd964 --- /dev/null +++ b/queue/scsi-lpfc-PLOGI-failures-during-NPIV-testing.patch @@ -0,0 +1,36 @@ +From e8bcf0ae4c0346fdc78ebefe0eefcaa6a6622d38 Mon Sep 17 00:00:00 2001 +From: Dick Kennedy <dick.kennedy@broadcom.com> +Date: Fri, 29 Sep 2017 17:34:32 -0700 +Subject: [PATCH] scsi: lpfc: PLOGI failures during NPIV testing + +commit e8bcf0ae4c0346fdc78ebefe0eefcaa6a6622d38 upstream. + +Local Reject/Invalid RPI errors seen during discovery. + +Temporary RPI cleanup was occurring regardless of SLI rev. It's only +necessary on SLI-4. + +Adjust the test for whether cleanup is necessary. + +Signed-off-by: Dick Kennedy <dick.kennedy@broadcom.com> +Signed-off-by: James Smart <james.smart@broadcom.com> +Reviewed-by: Johannes Thumshirn <jthumshirn@suse.de> +Signed-off-by: Martin K. 
Petersen <martin.petersen@oracle.com> + +diff --git a/drivers/scsi/lpfc/lpfc_hbadisc.c b/drivers/scsi/lpfc/lpfc_hbadisc.c +index 499df9d17339..d9a03beb76a4 100644 +--- a/drivers/scsi/lpfc/lpfc_hbadisc.c ++++ b/drivers/scsi/lpfc/lpfc_hbadisc.c +@@ -4983,7 +4983,8 @@ lpfc_nlp_remove(struct lpfc_vport *vport, struct lpfc_nodelist *ndlp) + lpfc_cancel_retry_delay_tmo(vport, ndlp); + if ((ndlp->nlp_flag & NLP_DEFER_RM) && + !(ndlp->nlp_flag & NLP_REG_LOGIN_SEND) && +- !(ndlp->nlp_flag & NLP_RPI_REGISTERED)) { ++ !(ndlp->nlp_flag & NLP_RPI_REGISTERED) && ++ phba->sli_rev != LPFC_SLI_REV4) { + /* For this case we need to cleanup the default rpi + * allocated by the firmware. + */ +-- +2.15.0 + diff --git a/queue/scsi-mpt3sas-Fix-IO-error-occurs-on-pulling-out-a-dr.patch b/queue/scsi-mpt3sas-Fix-IO-error-occurs-on-pulling-out-a-dr.patch new file mode 100644 index 0000000..3bf188e --- /dev/null +++ b/queue/scsi-mpt3sas-Fix-IO-error-occurs-on-pulling-out-a-dr.patch @@ -0,0 +1,41 @@ +From 2ce9a3645299ba1752873d333d73f67620f4550b Mon Sep 17 00:00:00 2001 +From: Sreekanth Reddy <sreekanth.reddy@broadcom.com> +Date: Tue, 10 Oct 2017 18:41:18 +0530 +Subject: [PATCH] scsi: mpt3sas: Fix IO error occurs on pulling out a drive + from RAID1 volume created on two SATA drive + +commit 2ce9a3645299ba1752873d333d73f67620f4550b upstream. + +Whenever an I/O for a RAID volume fails with IOCStatus +MPI2_IOCSTATUS_SCSI_IOC_TERMINATED and SCSIStatus equal to +(MPI2_SCSI_STATE_TERMINATED | MPI2_SCSI_STATE_NO_SCSI_STATUS) then +return the I/O to SCSI midlayer with "DID_RESET" (i.e. retry the IO +infinite times) set in the host byte. + +Previously, the driver was completing the I/O with "DID_SOFT_ERROR" +which causes the I/O to be quickly retried. However, firmware needed +more time and hence I/Os were failing. + +Signed-off-by: Sreekanth Reddy <Sreekanth.Reddy@broadcom.com> +Reviewed-by: Tomas Henzl <thenzl@redhat.com> +Signed-off-by: Martin K. 
Petersen <martin.petersen@oracle.com> + +diff --git a/drivers/scsi/mpt3sas/mpt3sas_scsih.c b/drivers/scsi/mpt3sas/mpt3sas_scsih.c +index dd2d63b16c7c..814b974001f7 100644 +--- a/drivers/scsi/mpt3sas/mpt3sas_scsih.c ++++ b/drivers/scsi/mpt3sas/mpt3sas_scsih.c +@@ -4807,6 +4807,11 @@ _scsih_io_done(struct MPT3SAS_ADAPTER *ioc, u16 smid, u8 msix_index, u32 reply) + } else if (log_info == VIRTUAL_IO_FAILED_RETRY) { + scmd->result = DID_RESET << 16; + break; ++ } else if ((scmd->device->channel == RAID_CHANNEL) && ++ (scsi_state == (MPI2_SCSI_STATE_TERMINATED | ++ MPI2_SCSI_STATE_NO_SCSI_STATUS))) { ++ scmd->result = DID_RESET << 16; ++ break; + } + scmd->result = DID_SOFT_ERROR << 16; + break; +-- +2.15.0 + diff --git a/queue/sctp-silence-warns-on-sctp_stream_init-allocations.patch b/queue/sctp-silence-warns-on-sctp_stream_init-allocations.patch new file mode 100644 index 0000000..f9374b0 --- /dev/null +++ b/queue/sctp-silence-warns-on-sctp_stream_init-allocations.patch @@ -0,0 +1,51 @@ +From 1ae2eaaa229bc350b6f38fbf4ab9c873532aecfb Mon Sep 17 00:00:00 2001 +From: Marcelo Ricardo Leitner <marcelo.leitner@gmail.com> +Date: Tue, 3 Oct 2017 19:20:08 -0300 +Subject: [PATCH] sctp: silence warns on sctp_stream_init allocations + +commit 1ae2eaaa229bc350b6f38fbf4ab9c873532aecfb upstream. + +As SCTP supports up to 65535 streams, that can lead to very large +allocations in sctp_stream_init(). As Xin Long noticed, systems with +small amounts of memory are more prone to not have enough memory and +dump warnings on dmesg initiated by user actions. Thus, silence them. + +Also, if the reallocation of stream->out is not necessary, skip it and +keep the memory we already have. + +Reported-by: Xin Long <lucien.xin@gmail.com> +Tested-by: Xin Long <lucien.xin@gmail.com> +Signed-off-by: Marcelo Ricardo Leitner <marcelo.leitner@gmail.com> +Signed-off-by: David S. 
Miller <davem@davemloft.net> + +diff --git a/net/sctp/stream.c b/net/sctp/stream.c +index 63ea15503714..1afa95558083 100644 +--- a/net/sctp/stream.c ++++ b/net/sctp/stream.c +@@ -40,9 +40,14 @@ int sctp_stream_init(struct sctp_stream *stream, __u16 outcnt, __u16 incnt, + { + int i; + ++ gfp |= __GFP_NOWARN; ++ + /* Initial stream->out size may be very big, so free it and alloc +- * a new one with new outcnt to save memory. ++ * a new one with new outcnt to save memory if needed. + */ ++ if (outcnt == stream->outcnt) ++ goto in; ++ + kfree(stream->out); + + stream->out = kcalloc(outcnt, sizeof(*stream->out), gfp); +@@ -53,6 +58,7 @@ int sctp_stream_init(struct sctp_stream *stream, __u16 outcnt, __u16 incnt, + for (i = 0; i < stream->outcnt; i++) + stream->out[i].state = SCTP_STREAM_OPEN; + ++in: + if (!incnt) + return 0; + +-- +2.15.0 + diff --git a/queue/selftests-bpf-add-tests-for-recent-bugfixes.patch b/queue/selftests-bpf-add-tests-for-recent-bugfixes.patch new file mode 100644 index 0000000..ac5fcd8 --- /dev/null +++ b/queue/selftests-bpf-add-tests-for-recent-bugfixes.patch @@ -0,0 +1,700 @@ +From 2255f8d520b0a318fc6d387d0940854b2f522a7f Mon Sep 17 00:00:00 2001 +From: Jann Horn <jannh@google.com> +Date: Mon, 18 Dec 2017 20:12:01 -0800 +Subject: [PATCH] selftests/bpf: add tests for recent bugfixes + +commit 2255f8d520b0a318fc6d387d0940854b2f522a7f upstream. + +These tests should cover the following cases: + + - MOV with both zero-extended and sign-extended immediates + - implicit truncation of register contents via ALU32/MOV32 + - implicit 32-bit truncation of ALU32 output + - oversized register source operand for ALU32 shift + - right-shift of a number that could be positive or negative + - map access where adding the operation size to the offset causes signed + 32-bit overflow + - direct stack access at a ~4GiB offset + +Also remove the F_LOAD_WITH_STRICT_ALIGNMENT flag from a bunch of tests +that should fail independent of what flags userspace passes. 
+ +Signed-off-by: Jann Horn <jannh@google.com> +Signed-off-by: Alexei Starovoitov <ast@kernel.org> +Signed-off-by: Daniel Borkmann <daniel@iogearbox.net> + +diff --git a/tools/testing/selftests/bpf/test_verifier.c b/tools/testing/selftests/bpf/test_verifier.c +index b03ecfd7185b..961c1426fbf2 100644 +--- a/tools/testing/selftests/bpf/test_verifier.c ++++ b/tools/testing/selftests/bpf/test_verifier.c +@@ -606,7 +606,6 @@ static struct bpf_test tests[] = { + }, + .errstr = "misaligned stack access", + .result = REJECT, +- .flags = F_LOAD_WITH_STRICT_ALIGNMENT, + }, + { + "invalid map_fd for function call", +@@ -1797,7 +1796,6 @@ static struct bpf_test tests[] = { + }, + .result = REJECT, + .errstr = "misaligned stack access off (0x0; 0x0)+-8+2 size 8", +- .flags = F_LOAD_WITH_STRICT_ALIGNMENT, + }, + { + "PTR_TO_STACK store/load - bad alignment on reg", +@@ -1810,7 +1808,6 @@ static struct bpf_test tests[] = { + }, + .result = REJECT, + .errstr = "misaligned stack access off (0x0; 0x0)+-10+8 size 8", +- .flags = F_LOAD_WITH_STRICT_ALIGNMENT, + }, + { + "PTR_TO_STACK store/load - out of bounds low", +@@ -6324,7 +6321,7 @@ static struct bpf_test tests[] = { + BPF_EXIT_INSN(), + }, + .fixup_map1 = { 3 }, +- .errstr = "R0 min value is negative", ++ .errstr = "unbounded min value", + .result = REJECT, + }, + { +@@ -6348,7 +6345,7 @@ static struct bpf_test tests[] = { + BPF_EXIT_INSN(), + }, + .fixup_map1 = { 3 }, +- .errstr = "R0 min value is negative", ++ .errstr = "unbounded min value", + .result = REJECT, + }, + { +@@ -6374,7 +6371,7 @@ static struct bpf_test tests[] = { + BPF_EXIT_INSN(), + }, + .fixup_map1 = { 3 }, +- .errstr = "R8 invalid mem access 'inv'", ++ .errstr = "unbounded min value", + .result = REJECT, + }, + { +@@ -6399,7 +6396,7 @@ static struct bpf_test tests[] = { + BPF_EXIT_INSN(), + }, + .fixup_map1 = { 3 }, +- .errstr = "R8 invalid mem access 'inv'", ++ .errstr = "unbounded min value", + .result = REJECT, + }, + { +@@ -6447,7 +6444,7 @@ static 
struct bpf_test tests[] = { + BPF_EXIT_INSN(), + }, + .fixup_map1 = { 3 }, +- .errstr = "R0 min value is negative", ++ .errstr = "unbounded min value", + .result = REJECT, + }, + { +@@ -6518,7 +6515,7 @@ static struct bpf_test tests[] = { + BPF_EXIT_INSN(), + }, + .fixup_map1 = { 3 }, +- .errstr = "R0 min value is negative", ++ .errstr = "unbounded min value", + .result = REJECT, + }, + { +@@ -6569,7 +6566,7 @@ static struct bpf_test tests[] = { + BPF_EXIT_INSN(), + }, + .fixup_map1 = { 3 }, +- .errstr = "R0 min value is negative", ++ .errstr = "unbounded min value", + .result = REJECT, + }, + { +@@ -6596,7 +6593,7 @@ static struct bpf_test tests[] = { + BPF_EXIT_INSN(), + }, + .fixup_map1 = { 3 }, +- .errstr = "R0 min value is negative", ++ .errstr = "unbounded min value", + .result = REJECT, + }, + { +@@ -6622,7 +6619,7 @@ static struct bpf_test tests[] = { + BPF_EXIT_INSN(), + }, + .fixup_map1 = { 3 }, +- .errstr = "R0 min value is negative", ++ .errstr = "unbounded min value", + .result = REJECT, + }, + { +@@ -6651,7 +6648,7 @@ static struct bpf_test tests[] = { + BPF_EXIT_INSN(), + }, + .fixup_map1 = { 3 }, +- .errstr = "R0 min value is negative", ++ .errstr = "unbounded min value", + .result = REJECT, + }, + { +@@ -6681,7 +6678,7 @@ static struct bpf_test tests[] = { + BPF_JMP_IMM(BPF_JA, 0, 0, -7), + }, + .fixup_map1 = { 4 }, +- .errstr = "R0 min value is negative", ++ .errstr = "unbounded min value", + .result = REJECT, + }, + { +@@ -6709,8 +6706,7 @@ static struct bpf_test tests[] = { + BPF_EXIT_INSN(), + }, + .fixup_map1 = { 3 }, +- .errstr_unpriv = "R0 pointer comparison prohibited", +- .errstr = "R0 min value is negative", ++ .errstr = "unbounded min value", + .result = REJECT, + .result_unpriv = REJECT, + }, +@@ -6765,6 +6761,462 @@ static struct bpf_test tests[] = { + .errstr = "R0 min value is negative, either use unsigned index or do a if (index >=0) check.", + .result = REJECT, + }, ++ { ++ "bounds check based on zero-extended MOV", ++ .insns = { 
++ BPF_ST_MEM(BPF_DW, BPF_REG_10, -8, 0), ++ BPF_MOV64_REG(BPF_REG_2, BPF_REG_10), ++ BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8), ++ BPF_LD_MAP_FD(BPF_REG_1, 0), ++ BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, ++ BPF_FUNC_map_lookup_elem), ++ BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 4), ++ /* r2 = 0x0000'0000'ffff'ffff */ ++ BPF_MOV32_IMM(BPF_REG_2, 0xffffffff), ++ /* r2 = 0 */ ++ BPF_ALU64_IMM(BPF_RSH, BPF_REG_2, 32), ++ /* no-op */ ++ BPF_ALU64_REG(BPF_ADD, BPF_REG_0, BPF_REG_2), ++ /* access at offset 0 */ ++ BPF_LDX_MEM(BPF_B, BPF_REG_0, BPF_REG_0, 0), ++ /* exit */ ++ BPF_MOV64_IMM(BPF_REG_0, 0), ++ BPF_EXIT_INSN(), ++ }, ++ .fixup_map1 = { 3 }, ++ .result = ACCEPT ++ }, ++ { ++ "bounds check based on sign-extended MOV. test1", ++ .insns = { ++ BPF_ST_MEM(BPF_DW, BPF_REG_10, -8, 0), ++ BPF_MOV64_REG(BPF_REG_2, BPF_REG_10), ++ BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8), ++ BPF_LD_MAP_FD(BPF_REG_1, 0), ++ BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, ++ BPF_FUNC_map_lookup_elem), ++ BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 4), ++ /* r2 = 0xffff'ffff'ffff'ffff */ ++ BPF_MOV64_IMM(BPF_REG_2, 0xffffffff), ++ /* r2 = 0xffff'ffff */ ++ BPF_ALU64_IMM(BPF_RSH, BPF_REG_2, 32), ++ /* r0 = <oob pointer> */ ++ BPF_ALU64_REG(BPF_ADD, BPF_REG_0, BPF_REG_2), ++ /* access to OOB pointer */ ++ BPF_LDX_MEM(BPF_B, BPF_REG_0, BPF_REG_0, 0), ++ /* exit */ ++ BPF_MOV64_IMM(BPF_REG_0, 0), ++ BPF_EXIT_INSN(), ++ }, ++ .fixup_map1 = { 3 }, ++ .errstr = "map_value pointer and 4294967295", ++ .result = REJECT ++ }, ++ { ++ "bounds check based on sign-extended MOV. 
test2", ++ .insns = { ++ BPF_ST_MEM(BPF_DW, BPF_REG_10, -8, 0), ++ BPF_MOV64_REG(BPF_REG_2, BPF_REG_10), ++ BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8), ++ BPF_LD_MAP_FD(BPF_REG_1, 0), ++ BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, ++ BPF_FUNC_map_lookup_elem), ++ BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 4), ++ /* r2 = 0xffff'ffff'ffff'ffff */ ++ BPF_MOV64_IMM(BPF_REG_2, 0xffffffff), ++ /* r2 = 0xfff'ffff */ ++ BPF_ALU64_IMM(BPF_RSH, BPF_REG_2, 36), ++ /* r0 = <oob pointer> */ ++ BPF_ALU64_REG(BPF_ADD, BPF_REG_0, BPF_REG_2), ++ /* access to OOB pointer */ ++ BPF_LDX_MEM(BPF_B, BPF_REG_0, BPF_REG_0, 0), ++ /* exit */ ++ BPF_MOV64_IMM(BPF_REG_0, 0), ++ BPF_EXIT_INSN(), ++ }, ++ .fixup_map1 = { 3 }, ++ .errstr = "R0 min value is outside of the array range", ++ .result = REJECT ++ }, ++ { ++ "bounds check based on reg_off + var_off + insn_off. test1", ++ .insns = { ++ BPF_LDX_MEM(BPF_W, BPF_REG_6, BPF_REG_1, ++ offsetof(struct __sk_buff, mark)), ++ BPF_ST_MEM(BPF_DW, BPF_REG_10, -8, 0), ++ BPF_MOV64_REG(BPF_REG_2, BPF_REG_10), ++ BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8), ++ BPF_LD_MAP_FD(BPF_REG_1, 0), ++ BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, ++ BPF_FUNC_map_lookup_elem), ++ BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 4), ++ BPF_ALU64_IMM(BPF_AND, BPF_REG_6, 1), ++ BPF_ALU64_IMM(BPF_ADD, BPF_REG_6, (1 << 29) - 1), ++ BPF_ALU64_REG(BPF_ADD, BPF_REG_0, BPF_REG_6), ++ BPF_ALU64_IMM(BPF_ADD, BPF_REG_0, (1 << 29) - 1), ++ BPF_LDX_MEM(BPF_B, BPF_REG_0, BPF_REG_0, 3), ++ BPF_MOV64_IMM(BPF_REG_0, 0), ++ BPF_EXIT_INSN(), ++ }, ++ .fixup_map1 = { 4 }, ++ .errstr = "value_size=8 off=1073741825", ++ .result = REJECT, ++ .prog_type = BPF_PROG_TYPE_SCHED_CLS, ++ }, ++ { ++ "bounds check based on reg_off + var_off + insn_off. 
test2", ++ .insns = { ++ BPF_LDX_MEM(BPF_W, BPF_REG_6, BPF_REG_1, ++ offsetof(struct __sk_buff, mark)), ++ BPF_ST_MEM(BPF_DW, BPF_REG_10, -8, 0), ++ BPF_MOV64_REG(BPF_REG_2, BPF_REG_10), ++ BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8), ++ BPF_LD_MAP_FD(BPF_REG_1, 0), ++ BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, ++ BPF_FUNC_map_lookup_elem), ++ BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 4), ++ BPF_ALU64_IMM(BPF_AND, BPF_REG_6, 1), ++ BPF_ALU64_IMM(BPF_ADD, BPF_REG_6, (1 << 30) - 1), ++ BPF_ALU64_REG(BPF_ADD, BPF_REG_0, BPF_REG_6), ++ BPF_ALU64_IMM(BPF_ADD, BPF_REG_0, (1 << 29) - 1), ++ BPF_LDX_MEM(BPF_B, BPF_REG_0, BPF_REG_0, 3), ++ BPF_MOV64_IMM(BPF_REG_0, 0), ++ BPF_EXIT_INSN(), ++ }, ++ .fixup_map1 = { 4 }, ++ .errstr = "value 1073741823", ++ .result = REJECT, ++ .prog_type = BPF_PROG_TYPE_SCHED_CLS, ++ }, ++ { ++ "bounds check after truncation of non-boundary-crossing range", ++ .insns = { ++ BPF_ST_MEM(BPF_DW, BPF_REG_10, -8, 0), ++ BPF_MOV64_REG(BPF_REG_2, BPF_REG_10), ++ BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8), ++ BPF_LD_MAP_FD(BPF_REG_1, 0), ++ BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, ++ BPF_FUNC_map_lookup_elem), ++ BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 9), ++ /* r1 = [0x00, 0xff] */ ++ BPF_LDX_MEM(BPF_B, BPF_REG_1, BPF_REG_0, 0), ++ BPF_MOV64_IMM(BPF_REG_2, 1), ++ /* r2 = 0x10'0000'0000 */ ++ BPF_ALU64_IMM(BPF_LSH, BPF_REG_2, 36), ++ /* r1 = [0x10'0000'0000, 0x10'0000'00ff] */ ++ BPF_ALU64_REG(BPF_ADD, BPF_REG_1, BPF_REG_2), ++ /* r1 = [0x10'7fff'ffff, 0x10'8000'00fe] */ ++ BPF_ALU64_IMM(BPF_ADD, BPF_REG_1, 0x7fffffff), ++ /* r1 = [0x00, 0xff] */ ++ BPF_ALU32_IMM(BPF_SUB, BPF_REG_1, 0x7fffffff), ++ /* r1 = 0 */ ++ BPF_ALU64_IMM(BPF_RSH, BPF_REG_1, 8), ++ /* no-op */ ++ BPF_ALU64_REG(BPF_ADD, BPF_REG_0, BPF_REG_1), ++ /* access at offset 0 */ ++ BPF_LDX_MEM(BPF_B, BPF_REG_0, BPF_REG_0, 0), ++ /* exit */ ++ BPF_MOV64_IMM(BPF_REG_0, 0), ++ BPF_EXIT_INSN(), ++ }, ++ .fixup_map1 = { 3 }, ++ .result = ACCEPT ++ }, ++ { ++ "bounds check after truncation of boundary-crossing range 
(1)", ++ .insns = { ++ BPF_ST_MEM(BPF_DW, BPF_REG_10, -8, 0), ++ BPF_MOV64_REG(BPF_REG_2, BPF_REG_10), ++ BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8), ++ BPF_LD_MAP_FD(BPF_REG_1, 0), ++ BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, ++ BPF_FUNC_map_lookup_elem), ++ BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 9), ++ /* r1 = [0x00, 0xff] */ ++ BPF_LDX_MEM(BPF_B, BPF_REG_1, BPF_REG_0, 0), ++ BPF_ALU64_IMM(BPF_ADD, BPF_REG_1, 0xffffff80 >> 1), ++ /* r1 = [0xffff'ff80, 0x1'0000'007f] */ ++ BPF_ALU64_IMM(BPF_ADD, BPF_REG_1, 0xffffff80 >> 1), ++ /* r1 = [0xffff'ff80, 0xffff'ffff] or ++ * [0x0000'0000, 0x0000'007f] ++ */ ++ BPF_ALU32_IMM(BPF_ADD, BPF_REG_1, 0), ++ BPF_ALU64_IMM(BPF_SUB, BPF_REG_1, 0xffffff80 >> 1), ++ /* r1 = [0x00, 0xff] or ++ * [0xffff'ffff'0000'0080, 0xffff'ffff'ffff'ffff] ++ */ ++ BPF_ALU64_IMM(BPF_SUB, BPF_REG_1, 0xffffff80 >> 1), ++ /* r1 = 0 or ++ * [0x00ff'ffff'ff00'0000, 0x00ff'ffff'ffff'ffff] ++ */ ++ BPF_ALU64_IMM(BPF_RSH, BPF_REG_1, 8), ++ /* no-op or OOB pointer computation */ ++ BPF_ALU64_REG(BPF_ADD, BPF_REG_0, BPF_REG_1), ++ /* potentially OOB access */ ++ BPF_LDX_MEM(BPF_B, BPF_REG_0, BPF_REG_0, 0), ++ /* exit */ ++ BPF_MOV64_IMM(BPF_REG_0, 0), ++ BPF_EXIT_INSN(), ++ }, ++ .fixup_map1 = { 3 }, ++ /* not actually fully unbounded, but the bound is very high */ ++ .errstr = "R0 unbounded memory access", ++ .result = REJECT ++ }, ++ { ++ "bounds check after truncation of boundary-crossing range (2)", ++ .insns = { ++ BPF_ST_MEM(BPF_DW, BPF_REG_10, -8, 0), ++ BPF_MOV64_REG(BPF_REG_2, BPF_REG_10), ++ BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8), ++ BPF_LD_MAP_FD(BPF_REG_1, 0), ++ BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, ++ BPF_FUNC_map_lookup_elem), ++ BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 9), ++ /* r1 = [0x00, 0xff] */ ++ BPF_LDX_MEM(BPF_B, BPF_REG_1, BPF_REG_0, 0), ++ BPF_ALU64_IMM(BPF_ADD, BPF_REG_1, 0xffffff80 >> 1), ++ /* r1 = [0xffff'ff80, 0x1'0000'007f] */ ++ BPF_ALU64_IMM(BPF_ADD, BPF_REG_1, 0xffffff80 >> 1), ++ /* r1 = [0xffff'ff80, 0xffff'ffff] or ++ * 
[0x0000'0000, 0x0000'007f] ++ * difference to previous test: truncation via MOV32 ++ * instead of ALU32. ++ */ ++ BPF_MOV32_REG(BPF_REG_1, BPF_REG_1), ++ BPF_ALU64_IMM(BPF_SUB, BPF_REG_1, 0xffffff80 >> 1), ++ /* r1 = [0x00, 0xff] or ++ * [0xffff'ffff'0000'0080, 0xffff'ffff'ffff'ffff] ++ */ ++ BPF_ALU64_IMM(BPF_SUB, BPF_REG_1, 0xffffff80 >> 1), ++ /* r1 = 0 or ++ * [0x00ff'ffff'ff00'0000, 0x00ff'ffff'ffff'ffff] ++ */ ++ BPF_ALU64_IMM(BPF_RSH, BPF_REG_1, 8), ++ /* no-op or OOB pointer computation */ ++ BPF_ALU64_REG(BPF_ADD, BPF_REG_0, BPF_REG_1), ++ /* potentially OOB access */ ++ BPF_LDX_MEM(BPF_B, BPF_REG_0, BPF_REG_0, 0), ++ /* exit */ ++ BPF_MOV64_IMM(BPF_REG_0, 0), ++ BPF_EXIT_INSN(), ++ }, ++ .fixup_map1 = { 3 }, ++ /* not actually fully unbounded, but the bound is very high */ ++ .errstr = "R0 unbounded memory access", ++ .result = REJECT ++ }, ++ { ++ "bounds check after wrapping 32-bit addition", ++ .insns = { ++ BPF_ST_MEM(BPF_DW, BPF_REG_10, -8, 0), ++ BPF_MOV64_REG(BPF_REG_2, BPF_REG_10), ++ BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8), ++ BPF_LD_MAP_FD(BPF_REG_1, 0), ++ BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, ++ BPF_FUNC_map_lookup_elem), ++ BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 5), ++ /* r1 = 0x7fff'ffff */ ++ BPF_MOV64_IMM(BPF_REG_1, 0x7fffffff), ++ /* r1 = 0xffff'fffe */ ++ BPF_ALU64_IMM(BPF_ADD, BPF_REG_1, 0x7fffffff), ++ /* r1 = 0 */ ++ BPF_ALU32_IMM(BPF_ADD, BPF_REG_1, 2), ++ /* no-op */ ++ BPF_ALU64_REG(BPF_ADD, BPF_REG_0, BPF_REG_1), ++ /* access at offset 0 */ ++ BPF_LDX_MEM(BPF_B, BPF_REG_0, BPF_REG_0, 0), ++ /* exit */ ++ BPF_MOV64_IMM(BPF_REG_0, 0), ++ BPF_EXIT_INSN(), ++ }, ++ .fixup_map1 = { 3 }, ++ .result = ACCEPT ++ }, ++ { ++ "bounds check after shift with oversized count operand", ++ .insns = { ++ BPF_ST_MEM(BPF_DW, BPF_REG_10, -8, 0), ++ BPF_MOV64_REG(BPF_REG_2, BPF_REG_10), ++ BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8), ++ BPF_LD_MAP_FD(BPF_REG_1, 0), ++ BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, ++ BPF_FUNC_map_lookup_elem), ++ 
BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 6), ++ BPF_MOV64_IMM(BPF_REG_2, 32), ++ BPF_MOV64_IMM(BPF_REG_1, 1), ++ /* r1 = (u32)1 << (u32)32 = ? */ ++ BPF_ALU32_REG(BPF_LSH, BPF_REG_1, BPF_REG_2), ++ /* r1 = [0x0000, 0xffff] */ ++ BPF_ALU64_IMM(BPF_AND, BPF_REG_1, 0xffff), ++ /* computes unknown pointer, potentially OOB */ ++ BPF_ALU64_REG(BPF_ADD, BPF_REG_0, BPF_REG_1), ++ /* potentially OOB access */ ++ BPF_LDX_MEM(BPF_B, BPF_REG_0, BPF_REG_0, 0), ++ /* exit */ ++ BPF_MOV64_IMM(BPF_REG_0, 0), ++ BPF_EXIT_INSN(), ++ }, ++ .fixup_map1 = { 3 }, ++ .errstr = "R0 max value is outside of the array range", ++ .result = REJECT ++ }, ++ { ++ "bounds check after right shift of maybe-negative number", ++ .insns = { ++ BPF_ST_MEM(BPF_DW, BPF_REG_10, -8, 0), ++ BPF_MOV64_REG(BPF_REG_2, BPF_REG_10), ++ BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8), ++ BPF_LD_MAP_FD(BPF_REG_1, 0), ++ BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, ++ BPF_FUNC_map_lookup_elem), ++ BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 6), ++ /* r1 = [0x00, 0xff] */ ++ BPF_LDX_MEM(BPF_B, BPF_REG_1, BPF_REG_0, 0), ++ /* r1 = [-0x01, 0xfe] */ ++ BPF_ALU64_IMM(BPF_SUB, BPF_REG_1, 1), ++ /* r1 = 0 or 0xff'ffff'ffff'ffff */ ++ BPF_ALU64_IMM(BPF_RSH, BPF_REG_1, 8), ++ /* r1 = 0 or 0xffff'ffff'ffff */ ++ BPF_ALU64_IMM(BPF_RSH, BPF_REG_1, 8), ++ /* computes unknown pointer, potentially OOB */ ++ BPF_ALU64_REG(BPF_ADD, BPF_REG_0, BPF_REG_1), ++ /* potentially OOB access */ ++ BPF_LDX_MEM(BPF_B, BPF_REG_0, BPF_REG_0, 0), ++ /* exit */ ++ BPF_MOV64_IMM(BPF_REG_0, 0), ++ BPF_EXIT_INSN(), ++ }, ++ .fixup_map1 = { 3 }, ++ .errstr = "R0 unbounded memory access", ++ .result = REJECT ++ }, ++ { ++ "bounds check map access with off+size signed 32bit overflow. 
test1", ++ .insns = { ++ BPF_ST_MEM(BPF_DW, BPF_REG_10, -8, 0), ++ BPF_MOV64_REG(BPF_REG_2, BPF_REG_10), ++ BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8), ++ BPF_LD_MAP_FD(BPF_REG_1, 0), ++ BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, ++ BPF_FUNC_map_lookup_elem), ++ BPF_JMP_IMM(BPF_JNE, BPF_REG_0, 0, 1), ++ BPF_EXIT_INSN(), ++ BPF_ALU64_IMM(BPF_ADD, BPF_REG_0, 0x7ffffffe), ++ BPF_LDX_MEM(BPF_DW, BPF_REG_0, BPF_REG_0, 0), ++ BPF_JMP_A(0), ++ BPF_EXIT_INSN(), ++ }, ++ .fixup_map1 = { 3 }, ++ .errstr = "map_value pointer and 2147483646", ++ .result = REJECT ++ }, ++ { ++ "bounds check map access with off+size signed 32bit overflow. test2", ++ .insns = { ++ BPF_ST_MEM(BPF_DW, BPF_REG_10, -8, 0), ++ BPF_MOV64_REG(BPF_REG_2, BPF_REG_10), ++ BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8), ++ BPF_LD_MAP_FD(BPF_REG_1, 0), ++ BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, ++ BPF_FUNC_map_lookup_elem), ++ BPF_JMP_IMM(BPF_JNE, BPF_REG_0, 0, 1), ++ BPF_EXIT_INSN(), ++ BPF_ALU64_IMM(BPF_ADD, BPF_REG_0, 0x1fffffff), ++ BPF_ALU64_IMM(BPF_ADD, BPF_REG_0, 0x1fffffff), ++ BPF_ALU64_IMM(BPF_ADD, BPF_REG_0, 0x1fffffff), ++ BPF_LDX_MEM(BPF_DW, BPF_REG_0, BPF_REG_0, 0), ++ BPF_JMP_A(0), ++ BPF_EXIT_INSN(), ++ }, ++ .fixup_map1 = { 3 }, ++ .errstr = "pointer offset 1073741822", ++ .result = REJECT ++ }, ++ { ++ "bounds check map access with off+size signed 32bit overflow. 
test3", ++ .insns = { ++ BPF_ST_MEM(BPF_DW, BPF_REG_10, -8, 0), ++ BPF_MOV64_REG(BPF_REG_2, BPF_REG_10), ++ BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8), ++ BPF_LD_MAP_FD(BPF_REG_1, 0), ++ BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, ++ BPF_FUNC_map_lookup_elem), ++ BPF_JMP_IMM(BPF_JNE, BPF_REG_0, 0, 1), ++ BPF_EXIT_INSN(), ++ BPF_ALU64_IMM(BPF_SUB, BPF_REG_0, 0x1fffffff), ++ BPF_ALU64_IMM(BPF_SUB, BPF_REG_0, 0x1fffffff), ++ BPF_LDX_MEM(BPF_DW, BPF_REG_0, BPF_REG_0, 2), ++ BPF_JMP_A(0), ++ BPF_EXIT_INSN(), ++ }, ++ .fixup_map1 = { 3 }, ++ .errstr = "pointer offset -1073741822", ++ .result = REJECT ++ }, ++ { ++ "bounds check map access with off+size signed 32bit overflow. test4", ++ .insns = { ++ BPF_ST_MEM(BPF_DW, BPF_REG_10, -8, 0), ++ BPF_MOV64_REG(BPF_REG_2, BPF_REG_10), ++ BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8), ++ BPF_LD_MAP_FD(BPF_REG_1, 0), ++ BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, ++ BPF_FUNC_map_lookup_elem), ++ BPF_JMP_IMM(BPF_JNE, BPF_REG_0, 0, 1), ++ BPF_EXIT_INSN(), ++ BPF_MOV64_IMM(BPF_REG_1, 1000000), ++ BPF_ALU64_IMM(BPF_MUL, BPF_REG_1, 1000000), ++ BPF_ALU64_REG(BPF_ADD, BPF_REG_0, BPF_REG_1), ++ BPF_LDX_MEM(BPF_DW, BPF_REG_0, BPF_REG_0, 2), ++ BPF_JMP_A(0), ++ BPF_EXIT_INSN(), ++ }, ++ .fixup_map1 = { 3 }, ++ .errstr = "map_value pointer and 1000000000000", ++ .result = REJECT ++ }, ++ { ++ "pointer/scalar confusion in state equality check (way 1)", ++ .insns = { ++ BPF_ST_MEM(BPF_DW, BPF_REG_10, -8, 0), ++ BPF_MOV64_REG(BPF_REG_2, BPF_REG_10), ++ BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8), ++ BPF_LD_MAP_FD(BPF_REG_1, 0), ++ BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, ++ BPF_FUNC_map_lookup_elem), ++ BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 2), ++ BPF_LDX_MEM(BPF_DW, BPF_REG_0, BPF_REG_0, 0), ++ BPF_JMP_A(1), ++ BPF_MOV64_REG(BPF_REG_0, BPF_REG_10), ++ BPF_JMP_A(0), ++ BPF_EXIT_INSN(), ++ }, ++ .fixup_map1 = { 3 }, ++ .result = ACCEPT, ++ .result_unpriv = REJECT, ++ .errstr_unpriv = "R0 leaks addr as return value" ++ }, ++ { ++ "pointer/scalar confusion in state 
equality check (way 2)", ++ .insns = { ++ BPF_ST_MEM(BPF_DW, BPF_REG_10, -8, 0), ++ BPF_MOV64_REG(BPF_REG_2, BPF_REG_10), ++ BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8), ++ BPF_LD_MAP_FD(BPF_REG_1, 0), ++ BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, ++ BPF_FUNC_map_lookup_elem), ++ BPF_JMP_IMM(BPF_JNE, BPF_REG_0, 0, 2), ++ BPF_MOV64_REG(BPF_REG_0, BPF_REG_10), ++ BPF_JMP_A(1), ++ BPF_LDX_MEM(BPF_DW, BPF_REG_0, BPF_REG_0, 0), ++ BPF_EXIT_INSN(), ++ }, ++ .fixup_map1 = { 3 }, ++ .result = ACCEPT, ++ .result_unpriv = REJECT, ++ .errstr_unpriv = "R0 leaks addr as return value" ++ }, + { + "variable-offset ctx access", + .insns = { +@@ -6806,6 +7258,71 @@ static struct bpf_test tests[] = { + .result = REJECT, + .prog_type = BPF_PROG_TYPE_LWT_IN, + }, ++ { ++ "indirect variable-offset stack access", ++ .insns = { ++ /* Fill the top 8 bytes of the stack */ ++ BPF_ST_MEM(BPF_DW, BPF_REG_10, -8, 0), ++ /* Get an unknown value */ ++ BPF_LDX_MEM(BPF_W, BPF_REG_2, BPF_REG_1, 0), ++ /* Make it small and 4-byte aligned */ ++ BPF_ALU64_IMM(BPF_AND, BPF_REG_2, 4), ++ BPF_ALU64_IMM(BPF_SUB, BPF_REG_2, 8), ++ /* add it to fp. We now have either fp-4 or fp-8, but ++ * we don't know which ++ */ ++ BPF_ALU64_REG(BPF_ADD, BPF_REG_2, BPF_REG_10), ++ /* dereference it indirectly */ ++ BPF_LD_MAP_FD(BPF_REG_1, 0), ++ BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, ++ BPF_FUNC_map_lookup_elem), ++ BPF_MOV64_IMM(BPF_REG_0, 0), ++ BPF_EXIT_INSN(), ++ }, ++ .fixup_map1 = { 5 }, ++ .errstr = "variable stack read R2", ++ .result = REJECT, ++ .prog_type = BPF_PROG_TYPE_LWT_IN, ++ }, ++ { ++ "direct stack access with 32-bit wraparound. 
test1", ++ .insns = { ++ BPF_MOV64_REG(BPF_REG_1, BPF_REG_10), ++ BPF_ALU64_IMM(BPF_ADD, BPF_REG_1, 0x7fffffff), ++ BPF_ALU64_IMM(BPF_ADD, BPF_REG_1, 0x7fffffff), ++ BPF_MOV32_IMM(BPF_REG_0, 0), ++ BPF_STX_MEM(BPF_B, BPF_REG_1, BPF_REG_0, 0), ++ BPF_EXIT_INSN() ++ }, ++ .errstr = "fp pointer and 2147483647", ++ .result = REJECT ++ }, ++ { ++ "direct stack access with 32-bit wraparound. test2", ++ .insns = { ++ BPF_MOV64_REG(BPF_REG_1, BPF_REG_10), ++ BPF_ALU64_IMM(BPF_ADD, BPF_REG_1, 0x3fffffff), ++ BPF_ALU64_IMM(BPF_ADD, BPF_REG_1, 0x3fffffff), ++ BPF_MOV32_IMM(BPF_REG_0, 0), ++ BPF_STX_MEM(BPF_B, BPF_REG_1, BPF_REG_0, 0), ++ BPF_EXIT_INSN() ++ }, ++ .errstr = "fp pointer and 1073741823", ++ .result = REJECT ++ }, ++ { ++ "direct stack access with 32-bit wraparound. test3", ++ .insns = { ++ BPF_MOV64_REG(BPF_REG_1, BPF_REG_10), ++ BPF_ALU64_IMM(BPF_ADD, BPF_REG_1, 0x1fffffff), ++ BPF_ALU64_IMM(BPF_ADD, BPF_REG_1, 0x1fffffff), ++ BPF_MOV32_IMM(BPF_REG_0, 0), ++ BPF_STX_MEM(BPF_B, BPF_REG_1, BPF_REG_0, 0), ++ BPF_EXIT_INSN() ++ }, ++ .errstr = "fp pointer offset 1073741822", ++ .result = REJECT ++ }, + { + "liveness pruning and write screening", + .insns = { +-- +2.15.0 + diff --git a/queue/selftests-x86-ldt_gdt-Add-infrastructure-to-test-set.patch b/queue/selftests-x86-ldt_gdt-Add-infrastructure-to-test-set.patch new file mode 100644 index 0000000..968516e --- /dev/null +++ b/queue/selftests-x86-ldt_gdt-Add-infrastructure-to-test-set.patch @@ -0,0 +1,103 @@ +From d744dcad39094c9187075e274d1cdef79c57c8b5 Mon Sep 17 00:00:00 2001 +From: Andy Lutomirski <luto@kernel.org> +Date: Sat, 4 Nov 2017 04:19:50 -0700 +Subject: [PATCH] selftests/x86/ldt_gdt: Add infrastructure to test + set_thread_area() + +commit d744dcad39094c9187075e274d1cdef79c57c8b5 upstream. + +Much of the test design could apply to set_thread_area() (i.e. GDT), +not just modify_ldt(). Add set_thread_area() to the +install_valid_mode() helper. 
+ +Signed-off-by: Andy Lutomirski <luto@kernel.org> +Cc: Borislav Petkov <bpetkov@suse.de> +Cc: Linus Torvalds <torvalds@linux-foundation.org> +Cc: Peter Zijlstra <peterz@infradead.org> +Cc: Thomas Gleixner <tglx@linutronix.de> +Link: http://lkml.kernel.org/r/02c23f8fba5547007f741dc24c3926e5284ede02.1509794321.git.luto@kernel.org +Signed-off-by: Ingo Molnar <mingo@kernel.org> + +diff --git a/tools/testing/selftests/x86/ldt_gdt.c b/tools/testing/selftests/x86/ldt_gdt.c +index b0334338a4b0..45f30249133f 100644 +--- a/tools/testing/selftests/x86/ldt_gdt.c ++++ b/tools/testing/selftests/x86/ldt_gdt.c +@@ -137,30 +137,51 @@ static void check_valid_segment(uint16_t index, int ldt, + } + } + +-static bool install_valid_mode(const struct user_desc *desc, uint32_t ar, +- bool oldmode) ++static bool install_valid_mode(const struct user_desc *d, uint32_t ar, ++ bool oldmode, bool ldt) + { +- int ret = syscall(SYS_modify_ldt, oldmode ? 1 : 0x11, +- desc, sizeof(*desc)); +- if (ret < -1) +- errno = -ret; ++ struct user_desc desc = *d; ++ int ret; ++ ++ if (!ldt) { ++#ifndef __i386__ ++ /* No point testing set_thread_area in a 64-bit build */ ++ return false; ++#endif ++ if (!gdt_entry_num) ++ return false; ++ desc.entry_number = gdt_entry_num; ++ ++ ret = syscall(SYS_set_thread_area, &desc); ++ } else { ++ ret = syscall(SYS_modify_ldt, oldmode ? 
1 : 0x11, ++ &desc, sizeof(desc)); ++ ++ if (ret < -1) ++ errno = -ret; ++ ++ if (ret != 0 && errno == ENOSYS) { ++ printf("[OK]\tmodify_ldt returned -ENOSYS\n"); ++ return false; ++ } ++ } ++ + if (ret == 0) { +- uint32_t limit = desc->limit; +- if (desc->limit_in_pages) ++ uint32_t limit = desc.limit; ++ if (desc.limit_in_pages) + limit = (limit << 12) + 4095; +- check_valid_segment(desc->entry_number, 1, ar, limit, true); ++ check_valid_segment(desc.entry_number, ldt, ar, limit, true); + return true; +- } else if (errno == ENOSYS) { +- printf("[OK]\tmodify_ldt returned -ENOSYS\n"); +- return false; + } else { +- if (desc->seg_32bit) { +- printf("[FAIL]\tUnexpected modify_ldt failure %d\n", ++ if (desc.seg_32bit) { ++ printf("[FAIL]\tUnexpected %s failure %d\n", ++ ldt ? "modify_ldt" : "set_thread_area", + errno); + nerrs++; + return false; + } else { +- printf("[OK]\tmodify_ldt rejected 16 bit segment\n"); ++ printf("[OK]\t%s rejected 16 bit segment\n", ++ ldt ? "modify_ldt" : "set_thread_area"); + return false; + } + } +@@ -168,7 +189,7 @@ static bool install_valid_mode(const struct user_desc *desc, uint32_t ar, + + static bool install_valid(const struct user_desc *desc, uint32_t ar) + { +- return install_valid_mode(desc, ar, false); ++ return install_valid_mode(desc, ar, false, true); + } + + static void install_invalid(const struct user_desc *desc, bool oldmode) +-- +2.15.0 + diff --git a/queue/selftests-x86-ldt_gdt-Run-most-existing-LDT-test-cas.patch b/queue/selftests-x86-ldt_gdt-Run-most-existing-LDT-test-cas.patch new file mode 100644 index 0000000..837bd31 --- /dev/null +++ b/queue/selftests-x86-ldt_gdt-Run-most-existing-LDT-test-cas.patch @@ -0,0 +1,43 @@ +From adedf2893c192dd09b1cc2f2dcfdd7cad99ec49d Mon Sep 17 00:00:00 2001 +From: Andy Lutomirski <luto@kernel.org> +Date: Sat, 4 Nov 2017 04:19:51 -0700 +Subject: [PATCH] selftests/x86/ldt_gdt: Run most existing LDT test cases + against the GDT as well + +commit adedf2893c192dd09b1cc2f2dcfdd7cad99ec49d 
upstream. + +Now that the main test infrastructure supports the GDT, run tests +that will pass the kernel's GDT permission tests against the GDT. + +Signed-off-by: Andy Lutomirski <luto@kernel.org> +Cc: Borislav Petkov <bpetkov@suse.de> +Cc: Linus Torvalds <torvalds@linux-foundation.org> +Cc: Peter Zijlstra <peterz@infradead.org> +Cc: Thomas Gleixner <tglx@linutronix.de> +Link: http://lkml.kernel.org/r/686a1eda63414da38fcecc2412db8dba1ae40581.1509794321.git.luto@kernel.org +Signed-off-by: Ingo Molnar <mingo@kernel.org> + +diff --git a/tools/testing/selftests/x86/ldt_gdt.c b/tools/testing/selftests/x86/ldt_gdt.c +index 45f30249133f..3bb42fff5d66 100644 +--- a/tools/testing/selftests/x86/ldt_gdt.c ++++ b/tools/testing/selftests/x86/ldt_gdt.c +@@ -189,7 +189,15 @@ static bool install_valid_mode(const struct user_desc *d, uint32_t ar, + + static bool install_valid(const struct user_desc *desc, uint32_t ar) + { +- return install_valid_mode(desc, ar, false, true); ++ bool ret = install_valid_mode(desc, ar, false, true); ++ ++ if (desc->contents <= 1 && desc->seg_32bit && ++ !desc->seg_not_present) { ++ /* Should work in the GDT, too. 
*/ ++ install_valid_mode(desc, ar, false, false); ++ } ++ ++ return ret; + } + + static void install_invalid(const struct user_desc *desc, bool oldmode) +-- +2.15.0 + diff --git a/queue/series b/queue/series new file mode 100644 index 0000000..1dc30c3 --- /dev/null +++ b/queue/series @@ -0,0 +1,174 @@ +x86-asm-Remove-unnecessary-n-t-in-front-of-CC_SET-fr.patch +objtool-Don-t-report-end-of-section-error-after-an-e.patch +x86-head-Remove-confusing-comment.patch +x86-head-Remove-unused-bad_address-code.patch +x86-head-Fix-head-ELF-function-annotations.patch +x86-boot-Annotate-verify_cpu-as-a-callable-function.patch +x86-xen-Fix-xen-head-ELF-annotations.patch +x86-xen-Add-unwind-hint-annotations.patch +x86-head-Add-unwind-hint-annotations.patch +ACPI-APEI-adjust-a-local-variable-type-in-ghes_iorem.patch +x86-unwinder-Make-CONFIG_UNWINDER_ORC-y-the-default-.patch +x86-fpu-debug-Remove-unused-x86_fpu_state-and-x86_fp.patch +x86-unwind-Rename-unwinder-config-options-to-CONFIG_.patch +x86-unwind-Make-CONFIG_UNWINDER_ORC-y-the-default-in.patch +bitops-Add-clear-set_bit32-to-linux-bitops.h.patch +x86-cpuid-Add-generic-table-for-CPUID-dependencies.patch +x86-fpu-Parse-clearcpuid-as-early-XSAVE-argument.patch +x86-fpu-Make-XSAVE-check-the-base-CPUID-features-bef.patch +x86-fpu-Remove-the-explicit-clearing-of-XSAVE-depend.patch +x86-platform-UV-Convert-timers-to-use-timer_setup.patch +objtool-Print-top-level-commands-on-incorrect-usage.patch +x86-cpuid-Prevent-out-of-bound-access-in-do_clear_cp.patch +mm-sparsemem-Allocate-mem_section-at-runtime-for-CON.patch +x86-kasan-Use-the-same-shadow-offset-for-4-and-5-lev.patch +x86-xen-Provide-pre-built-page-tables-only-for-CONFI.patch +x86-xen-Drop-5-level-paging-support-code-from-the-XE.patch +ACPI-APEI-remove-the-unused-dead-code-for-SEA-NMI-no.patch +x86-asm-Don-t-use-the-confusing-.ifeq-directive.patch +x86-build-Beautify-build-log-of-syscall-headers.patch +x86-mm-64-Rename-the-register_page_bootmem_memmap-si.patch 
+x86-cpufeatures-Enable-new-SSE-AVX-AVX512-CPU-featur.patch +x86-mm-Relocate-page-fault-error-codes-to-traps.h.patch +x86-boot-Relocate-definition-of-the-initial-state-of.patch +ptrace-x86-Make-user_64bit_mode-available-to-32-bit-.patch +x86-entry-64-Remove-the-restore_c_regs_and_iret-labe.patch +x86-entry-64-Split-the-IRET-to-user-and-IRET-to-kern.patch +x86-entry-64-Move-SWAPGS-into-the-common-IRET-to-use.patch +x86-entry-64-Simplify-reg-restore-code-in-the-standa.patch +x86-entry-64-Shrink-paranoid_exit_restore-and-make-l.patch +x86-entry-64-Use-pop-instead-of-movq-in-syscall_retu.patch +x86-entry-64-Merge-the-fast-and-slow-SYSRET-paths.patch +x86-entry-64-Use-POP-instead-of-MOV-to-restore-regs-.patch +x86-entry-64-Remove-the-RESTORE_._REGS-infrastructur.patch +xen-x86-entry-64-Add-xen-NMI-trap-entry.patch +x86-entry-64-De-Xen-ify-our-NMI-code.patch +x86-entry-32-Pull-the-MSR_IA32_SYSENTER_CS-update-co.patch +x86-entry-64-Pass-SP0-directly-to-load_sp0.patch +x86-entry-Add-task_top_of_stack-to-find-the-top-of-a.patch +x86-xen-64-x86-entry-64-Clean-up-SP-code-in-cpu_init.patch +x86-entry-64-Stop-initializing-TSS.sp0-at-boot.patch +x86-entry-64-Remove-all-remaining-direct-thread_stru.patch +x86-entry-32-Fix-cpu_current_top_of_stack-initializa.patch +x86-entry-64-Remove-thread_struct-sp0.patch +x86-traps-Use-a-new-on_thread_stack-helper-to-clean-.patch +x86-entry-64-Shorten-TEST-instructions.patch +x86-cpuid-Replace-set-clear_bit32.patch +bitops-Revert-cbe96375025e-bitops-Add-clear-set_bit3.patch +x86-mm-Define-_PAGE_TABLE-using-_KERNPG_TABLE.patch +x86-cpufeatures-Re-tabulate-the-X86_FEATURE-definiti.patch +x86-cpufeatures-Fix-various-details-in-the-feature-d.patch +selftests-x86-ldt_gdt-Add-infrastructure-to-test-set.patch +selftests-x86-ldt_gdt-Run-most-existing-LDT-test-cas.patch +ACPI-APEI-Replace-ioremap_page_range-with-fixmap.patch +x86-virt-x86-platform-Merge-struct-x86_hyper-into-st.patch +x86-virt-Add-enum-for-hypervisors-to-replace-x86_hyp.patch 
+drivers-misc-intel-pti-Rename-the-header-file-to-fre.patch +x86-cpufeature-Add-User-Mode-Instruction-Prevention-.patch +x86-Make-X86_BUG_FXSAVE_LEAK-detectable-in-CPUID-on-.patch +perf-x86-Enable-free-running-PEBS-for-REGS_USER-INTR.patch +bpf-fix-build-issues-on-um-due-to-mising-bpf_perf_ev.patch +locking-barriers-Add-implicit-smp_read_barrier_depen.patch +locking-barriers-Convert-users-of-lockless_dereferen.patch +x86-mm-kasan-Don-t-use-vmemmap_populate-to-initializ.patch +x86-entry-64-paravirt-Use-paravirt-safe-macro-to-acc.patch +x86-unwinder-orc-Dont-bail-on-stack-overflow.patch +x86-unwinder-Handle-stack-overflows-more-gracefully.patch +x86-irq-Remove-an-old-outdated-comment-about-context.patch +x86-irq-64-Print-the-offending-IP-in-the-stack-overf.patch +x86-entry-64-Allocate-and-enable-the-SYSENTER-stack.patch +x86-dumpstack-Add-get_stack_info-support-for-the-SYS.patch +x86-entry-gdt-Put-per-CPU-GDT-remaps-in-ascending-or.patch +x86-mm-fixmap-Generalize-the-GDT-fixmap-mechanism-in.patch +x86-kasan-64-Teach-KASAN-about-the-cpu_entry_area.patch +x86-entry-Fix-assumptions-that-the-HW-TSS-is-at-the-.patch +x86-dumpstack-Handle-stack-overflow-on-all-stacks.patch +x86-entry-Move-SYSENTER_stack-to-the-beginning-of-st.patch +x86-entry-Remap-the-TSS-into-the-CPU-entry-area.patch +x86-entry-64-Separate-cpu_current_top_of_stack-from-.patch +x86-espfix-64-Stop-assuming-that-pt_regs-is-on-the-e.patch +x86-entry-64-Use-a-per-CPU-trampoline-stack-for-IDT-.patch +x86-entry-64-Return-to-userspace-from-the-trampoline.patch +x86-entry-64-Create-a-per-CPU-SYSCALL-entry-trampoli.patch +x86-entry-64-Move-the-IST-stacks-into-struct-cpu_ent.patch +x86-entry-64-Remove-the-SYSENTER-stack-canary.patch +x86-entry-Clean-up-the-SYSENTER_stack-code.patch +x86-entry-64-Make-cpu_entry_area.tss-read-only.patch +x86-paravirt-Dont-patch-flush_tlb_single.patch +x86-paravirt-Provide-a-way-to-check-for-hypervisors.patch +x86-cpufeatures-Make-CPU-bugs-sticky.patch 
+optee-fix-invalid-of_node_put-in-optee_driver_init.patch +backlight-pwm_bl-Fix-overflow-condition.patch +drm-Add-retries-for-lspcon-mode-detection.patch +clk-sunxi-ng-nm-Check-if-requested-rate-is-supported.patch +clk-sunxi-ng-sun5i-Fix-bit-offset-of-audio-PLL-post-.patch +crypto-crypto4xx-increase-context-and-scatter-ring-b.patch +crypto-lrw-Fix-an-error-handling-path-in-create.patch +rtc-pl031-make-interrupt-optional.patch +kvm-mm-account-kvm-related-kmem-slabs-to-kmemcg.patch +net-phy-at803x-Change-error-to-EINVAL-for-invalid-MA.patch +PCI-Avoid-bus-reset-if-bridge-itself-is-broken.patch +scsi-cxgb4i-fix-Tx-skb-leak.patch +scsi-mpt3sas-Fix-IO-error-occurs-on-pulling-out-a-dr.patch +PCI-Create-SR-IOV-virtfn-physfn-links-before-attachi.patch +PM-OPP-Move-error-message-to-debug-level.patch +igb-check-memory-allocation-failure.patch +i40e-use-the-safe-hash-table-iterator-when-deleting-.patch +iio-st_sensors-add-register-mask-for-status-register.patch +ixgbe-fix-use-of-uninitialized-padding.patch +IB-rxe-check-for-allocation-failure-on-elem.patch +block-bfq-Disable-writeback-throttling.patch +md-always-set-THREAD_WAKEUP-and-wake-up-wqueue-if-th.patch +ip_gre-check-packet-length-and-mtu-correctly-in-ersp.patch +ipv6-grab-rt-rt6i_ref-before-allocating-pcpu-rt.patch +leds-pca955x-Don-t-invert-requested-value-in-pca955x.patch +Bluetooth-hci_uart_set_flow_control-Fix-NULL-deref-w.patch +Bluetooth-hci_bcm-Fix-setting-of-irq-trigger-type.patch +i40e-i40evf-spread-CPU-affinity-hints-across-online-.patch +PCI-AER-Report-non-fatal-errors-only-to-the-affected.patch +tracing-Exclude-generic-fields-from-histograms.patch +ASoC-codecs-msm8916-wcd-analog-fix-micbias-level.patch +ASoC-img-parallel-out-Add-pm_runtime_get-put-to-set_.patch +powerpc-xmon-Avoid-tripping-SMP-hardlockup-watchdog.patch +powerpc-watchdog-Do-not-trigger-SMP-crash-from-touch.patch +sctp-silence-warns-on-sctp_stream_init-allocations.patch +ASoC-codecs-msm8916-wcd-analog-fix-module-autoload.patch 
+fm10k-fix-mis-ordered-parameters-in-declaration-for-.patch +scsi-lpfc-Fix-secure-firmware-updates.patch +scsi-lpfc-PLOGI-failures-during-NPIV-testing.patch +scsi-lpfc-Fix-warning-messages-when-NVME_TARGET_FC-n.patch +i40e-fix-client-notify-of-VF-reset.patch +vfio-pci-Virtualize-Maximum-Payload-Size.patch +ARM-exynos_defconfig-Enable-UAS-support-for-Odroid-H.patch +fm10k-ensure-we-process-SM-mbx-when-processing-VF-mb.patch +ibmvnic-Set-state-UP.patch +net-ipv6-send-NS-for-DAD-when-link-operationally-up.patch +RDMA-hns-Avoid-NULL-pointer-exception.patch +staging-greybus-light-Release-memory-obtained-by-kas.patch +clk-sunxi-ng-sun6i-Rename-HDMI-DDC-clock-to-avoid-na.patch +tcp-fix-under-evaluated-ssthresh-in-TCP-Vegas.patch +rtc-set-the-alarm-to-the-next-expiring-timer.patch +cpuidle-fix-broadcast-control-when-broadcast-can-not.patch +drm-vc4-Avoid-using-vrefresh-0-mode-in-DSI-htotal-ma.patch +IB-opa_vnic-Properly-clear-Mac-Table-Digest.patch +IB-opa_vnic-Properly-return-the-total-MACs-in-UC-MAC.patch +thermal-drivers-hisi-Fix-missing-interrupt-enablemen.patch +thermal-drivers-hisi-Fix-kernel-panic-on-alarm-inter.patch +thermal-drivers-hisi-Simplify-the-temperature-step-c.patch +thermal-drivers-hisi-Fix-multiple-alarm-interrupts-f.patch +platform-x86-asus-wireless-send-an-EV_SYN-SYN_REPORT.patch +mm-sparsemem-Fix-ARM64-boot-crash-when-CONFIG_SPARSE.patch +bpf-fix-branch-pruning-logic.patch +bpf-fix-corruption-on-concurrent-perf_event_output-c.patch +bpf-s390x-do-not-reload-skb-pointers-in-non-skb-cont.patch +bpf-ppc64-do-not-reload-skb-pointers-in-non-skb-cont.patch +bpf-sparc-fix-usage-of-wrong-reg-for-load_skb_regs-a.patch +bpf-verifier-fix-bounds-calculation-on-BPF_RSH.patch +bpf-fix-incorrect-sign-extension-in-check_alu_op.patch +bpf-fix-incorrect-tracking-of-register-size-truncati.patch +bpf-fix-32-bit-ALU-op-verification.patch +bpf-force-strict-alignment-checks-for-stack-pointers.patch +bpf-don-t-prune-branches-when-a-scalar-is-replaced-w.patch 
+bpf-fix-integer-overflows.patch +selftests-bpf-add-tests-for-recent-bugfixes.patch +linux-compiler.h-Split-into-compiler.h-and-compiler_.patch diff --git a/queue/staging-greybus-light-Release-memory-obtained-by-kas.patch b/queue/staging-greybus-light-Release-memory-obtained-by-kas.patch new file mode 100644 index 0000000..289357b --- /dev/null +++ b/queue/staging-greybus-light-Release-memory-obtained-by-kas.patch @@ -0,0 +1,29 @@ +From 04820da21050b35eed68aa046115d810163ead0c Mon Sep 17 00:00:00 2001 +From: Arvind Yadav <arvind.yadav.cs@gmail.com> +Date: Sat, 23 Sep 2017 13:25:30 +0530 +Subject: [PATCH] staging: greybus: light: Release memory obtained by kasprintf + +commit 04820da21050b35eed68aa046115d810163ead0c upstream. + +Free memory region, if gb_lights_channel_config is not successful. + +Signed-off-by: Arvind Yadav <arvind.yadav.cs@gmail.com> +Reviewed-by: Rui Miguel Silva <rmfrfs@gmail.com> +Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org> + +diff --git a/drivers/staging/greybus/light.c b/drivers/staging/greybus/light.c +index 3f4148c92308..0f538b8c3a07 100644 +--- a/drivers/staging/greybus/light.c ++++ b/drivers/staging/greybus/light.c +@@ -925,6 +925,8 @@ static void __gb_lights_led_unregister(struct gb_channel *channel) + return; + + led_classdev_unregister(cdev); ++ kfree(cdev->name); ++ cdev->name = NULL; + channel->led = NULL; + } + +-- +2.15.0 + diff --git a/queue/tcp-fix-under-evaluated-ssthresh-in-TCP-Vegas.patch b/queue/tcp-fix-under-evaluated-ssthresh-in-TCP-Vegas.patch new file mode 100644 index 0000000..92172bd --- /dev/null +++ b/queue/tcp-fix-under-evaluated-ssthresh-in-TCP-Vegas.patch @@ -0,0 +1,30 @@ +From cf5d74b85ef40c202c76d90959db4d850f301b95 Mon Sep 17 00:00:00 2001 +From: Hoang Tran <tranviethoang.vn@gmail.com> +Date: Wed, 27 Sep 2017 18:30:58 +0200 +Subject: [PATCH] tcp: fix under-evaluated ssthresh in TCP Vegas + +commit cf5d74b85ef40c202c76d90959db4d850f301b95 upstream. 
+ +With the commit 76174004a0f19785 (tcp: do not slow start when cwnd equals +ssthresh), the comparison to the reduced cwnd in tcp_vegas_ssthresh() would +under-evaluate the ssthresh. + +Signed-off-by: Hoang Tran <hoang.tran@uclouvain.be> +Signed-off-by: David S. Miller <davem@davemloft.net> + +diff --git a/net/ipv4/tcp_vegas.c b/net/ipv4/tcp_vegas.c +index 218cfcc77650..ee113ff15fd0 100644 +--- a/net/ipv4/tcp_vegas.c ++++ b/net/ipv4/tcp_vegas.c +@@ -158,7 +158,7 @@ EXPORT_SYMBOL_GPL(tcp_vegas_cwnd_event); + + static inline u32 tcp_vegas_ssthresh(struct tcp_sock *tp) + { +- return min(tp->snd_ssthresh, tp->snd_cwnd-1); ++ return min(tp->snd_ssthresh, tp->snd_cwnd); + } + + static void tcp_vegas_cong_avoid(struct sock *sk, u32 ack, u32 acked) +-- +2.15.0 + diff --git a/queue/thermal-drivers-hisi-Fix-kernel-panic-on-alarm-inter.patch b/queue/thermal-drivers-hisi-Fix-kernel-panic-on-alarm-inter.patch new file mode 100644 index 0000000..7f375f4 --- /dev/null +++ b/queue/thermal-drivers-hisi-Fix-kernel-panic-on-alarm-inter.patch @@ -0,0 +1,57 @@ +From 2cb4de785c40d4a2132cfc13e63828f5a28c3351 Mon Sep 17 00:00:00 2001 +From: Daniel Lezcano <daniel.lezcano@linaro.org> +Date: Thu, 19 Oct 2017 19:05:45 +0200 +Subject: [PATCH] thermal/drivers/hisi: Fix kernel panic on alarm interrupt + +commit 2cb4de785c40d4a2132cfc13e63828f5a28c3351 upstream. + +The threaded interrupt for the alarm interrupt is requested before the +temperature controller is setup. This one can fire an interrupt immediately +leading to a kernel panic as the sensor data is not initialized. + +In order to prevent that, move the threaded irq after the Tsensor is setup. 
+ +Signed-off-by: Daniel Lezcano <daniel.lezcano@linaro.org> +Reviewed-by: Leo Yan <leo.yan@linaro.org> +Tested-by: Leo Yan <leo.yan@linaro.org> +Signed-off-by: Eduardo Valentin <edubezval@gmail.com> + +diff --git a/drivers/thermal/hisi_thermal.c b/drivers/thermal/hisi_thermal.c +index 725d0d434d83..f69aea0b2fe3 100644 +--- a/drivers/thermal/hisi_thermal.c ++++ b/drivers/thermal/hisi_thermal.c +@@ -287,15 +287,6 @@ static int hisi_thermal_probe(struct platform_device *pdev) + if (data->irq < 0) + return data->irq; + +- ret = devm_request_threaded_irq(&pdev->dev, data->irq, +- hisi_thermal_alarm_irq, +- hisi_thermal_alarm_irq_thread, +- 0, "hisi_thermal", data); +- if (ret < 0) { +- dev_err(&pdev->dev, "failed to request alarm irq: %d\n", ret); +- return ret; +- } +- + platform_set_drvdata(pdev, data); + + data->clk = devm_clk_get(&pdev->dev, "thermal_clk"); +@@ -328,6 +319,15 @@ static int hisi_thermal_probe(struct platform_device *pdev) + + hisi_thermal_toggle_sensor(&data->sensors, true); + ++ ret = devm_request_threaded_irq(&pdev->dev, data->irq, ++ hisi_thermal_alarm_irq, ++ hisi_thermal_alarm_irq_thread, ++ 0, "hisi_thermal", data); ++ if (ret < 0) { ++ dev_err(&pdev->dev, "failed to request alarm irq: %d\n", ret); ++ return ret; ++ } ++ + enable_irq(data->irq); + + return 0; +-- +2.15.0 + diff --git a/queue/thermal-drivers-hisi-Fix-missing-interrupt-enablemen.patch b/queue/thermal-drivers-hisi-Fix-missing-interrupt-enablemen.patch new file mode 100644 index 0000000..f8f2ddb --- /dev/null +++ b/queue/thermal-drivers-hisi-Fix-missing-interrupt-enablemen.patch @@ -0,0 +1,50 @@ +From c176b10b025acee4dc8f2ab1cd64eb73b5ccef53 Mon Sep 17 00:00:00 2001 +From: Daniel Lezcano <daniel.lezcano@linaro.org> +Date: Thu, 19 Oct 2017 19:05:43 +0200 +Subject: [PATCH] thermal/drivers/hisi: Fix missing interrupt enablement + +commit c176b10b025acee4dc8f2ab1cd64eb73b5ccef53 upstream. 
+ +The interrupt for the temperature threshold is not enabled at the end of the +probe function, enable it after the setup is complete. + +On the other side, the irq_enabled is not correctly set as we are checking if +the interrupt is masked where 'yes' means irq_enabled=false. + + irq_get_irqchip_state(data->irq, IRQCHIP_STATE_MASKED, + &data->irq_enabled); + +As we are always enabling the interrupt, it is pointless to check if +the interrupt is masked or not, just set irq_enabled to 'true'. + +Signed-off-by: Daniel Lezcano <daniel.lezcano@linaro.org> +Reviewed-by: Leo Yan <leo.yan@linaro.org> +Tested-by: Leo Yan <leo.yan@linaro.org> +Signed-off-by: Eduardo Valentin <edubezval@gmail.com> + +diff --git a/drivers/thermal/hisi_thermal.c b/drivers/thermal/hisi_thermal.c +index bd3572c41585..8381696241d6 100644 +--- a/drivers/thermal/hisi_thermal.c ++++ b/drivers/thermal/hisi_thermal.c +@@ -345,8 +345,7 @@ static int hisi_thermal_probe(struct platform_device *pdev) + } + + hisi_thermal_enable_bind_irq_sensor(data); +- irq_get_irqchip_state(data->irq, IRQCHIP_STATE_MASKED, +- &data->irq_enabled); ++ data->irq_enabled = true; + + for (i = 0; i < HISI_MAX_SENSORS; ++i) { + ret = hisi_thermal_register_sensor(pdev, data, +@@ -358,6 +357,8 @@ static int hisi_thermal_probe(struct platform_device *pdev) + hisi_thermal_toggle_sensor(&data->sensors[i], true); + } + ++ enable_irq(data->irq); ++ + return 0; + } + +-- +2.15.0 + diff --git a/queue/thermal-drivers-hisi-Fix-multiple-alarm-interrupts-f.patch b/queue/thermal-drivers-hisi-Fix-multiple-alarm-interrupts-f.patch new file mode 100644 index 0000000..be9de08 --- /dev/null +++ b/queue/thermal-drivers-hisi-Fix-multiple-alarm-interrupts-f.patch @@ -0,0 +1,73 @@ +From db2b0332608c8e648ea1e44727d36ad37cdb56cb Mon Sep 17 00:00:00 2001 +From: Daniel Lezcano <daniel.lezcano@linaro.org> +Date: Thu, 19 Oct 2017 19:05:47 +0200 +Subject: [PATCH] thermal/drivers/hisi: Fix multiple alarm interrupts firing + +commit 
db2b0332608c8e648ea1e44727d36ad37cdb56cb upstream. + +The DT specifies a threshold of 65000, we set up the register with a value in +the temperature resolution for the controller, 64656. + +When we reach 64656, the interrupt fires, the interrupt is disabled. Then the +irq thread runs and calls thermal_zone_device_update() which will call in turn +hisi_thermal_get_temp(). + +The function will look if the temperature decreased, assuming it was more than +65000, but that is not the case because the current temperature is 64656 +(because of the rounding when setting the threshold). This condition being +true, we re-enable the interrupt which fires immediately after exiting the irq +thread. That happens again and again until the temperature goes to more than +65000. + +Potentially, there is an interrupt storm here if the temperature stabilizes at +this temperature. A very unlikely case but possible. + +In any case, it does not make sense to handle dozens of alarm interrupts for +nothing. + +Fix this by rounding the threshold value to the controller resolution so the +check against the threshold is consistent with the one set in the controller. 
+ +Signed-off-by: Daniel Lezcano <daniel.lezcano@linaro.org> +Reviewed-by: Leo Yan <leo.yan@linaro.org> +Tested-by: Leo Yan <leo.yan@linaro.org> +Signed-off-by: Eduardo Valentin <edubezval@gmail.com> + +diff --git a/drivers/thermal/hisi_thermal.c b/drivers/thermal/hisi_thermal.c +index 583bc1934127..f5231974504c 100644 +--- a/drivers/thermal/hisi_thermal.c ++++ b/drivers/thermal/hisi_thermal.c +@@ -90,6 +90,12 @@ static inline long hisi_thermal_temp_to_step(long temp) + return (temp - HISI_TEMP_BASE) / HISI_TEMP_STEP; + } + ++static inline long hisi_thermal_round_temp(int temp) ++{ ++ return hisi_thermal_step_to_temp( ++ hisi_thermal_temp_to_step(temp)); ++} ++ + static long hisi_thermal_get_sensor_temp(struct hisi_thermal_data *data, + struct hisi_thermal_sensor *sensor) + { +@@ -221,7 +227,7 @@ static irqreturn_t hisi_thermal_alarm_irq_thread(int irq, void *dev) + sensor = &data->sensors; + + dev_crit(&data->pdev->dev, "THERMAL ALARM: T > %d\n", +- sensor->thres_temp / 1000); ++ sensor->thres_temp); + mutex_unlock(&data->thermal_lock); + + thermal_zone_device_update(data->sensors.tzd, +@@ -255,7 +261,7 @@ static int hisi_thermal_register_sensor(struct platform_device *pdev, + + for (i = 0; i < of_thermal_get_ntrips(sensor->tzd); i++) { + if (trip[i].type == THERMAL_TRIP_PASSIVE) { +- sensor->thres_temp = trip[i].temperature; ++ sensor->thres_temp = hisi_thermal_round_temp(trip[i].temperature); + break; + } + } +-- +2.15.0 + diff --git a/queue/thermal-drivers-hisi-Simplify-the-temperature-step-c.patch b/queue/thermal-drivers-hisi-Simplify-the-temperature-step-c.patch new file mode 100644 index 0000000..bf43688 --- /dev/null +++ b/queue/thermal-drivers-hisi-Simplify-the-temperature-step-c.patch @@ -0,0 +1,104 @@ +From 48880b979cdc9ef5a70af020f42b8ba1e51dbd34 Mon Sep 17 00:00:00 2001 +From: Daniel Lezcano <daniel.lezcano@linaro.org> +Date: Thu, 19 Oct 2017 19:05:46 +0200 +Subject: [PATCH] thermal/drivers/hisi: Simplify the temperature/step + computation + +commit 
48880b979cdc9ef5a70af020f42b8ba1e51dbd34 upstream. + +The step and the base temperature are fixed values, we can simplify the +computation by converting the base temperature to milli celsius and use a +pre-computed step value. That saves us a lot of mult + div for nothing at +runtime. + +Take also the opportunity to change the function names to be consistent with +the rest of the code. + +Signed-off-by: Daniel Lezcano <daniel.lezcano@linaro.org> +Reviewed-by: Leo Yan <leo.yan@linaro.org> +Tested-by: Leo Yan <leo.yan@linaro.org> +Signed-off-by: Eduardo Valentin <edubezval@gmail.com> + +diff --git a/drivers/thermal/hisi_thermal.c b/drivers/thermal/hisi_thermal.c +index f69aea0b2fe3..583bc1934127 100644 +--- a/drivers/thermal/hisi_thermal.c ++++ b/drivers/thermal/hisi_thermal.c +@@ -35,8 +35,9 @@ + #define TEMP0_RST_MSK (0x1C) + #define TEMP0_VALUE (0x28) + +-#define HISI_TEMP_BASE (-60) ++#define HISI_TEMP_BASE (-60000) + #define HISI_TEMP_RESET (100000) ++#define HISI_TEMP_STEP (784) + + #define HISI_MAX_SENSORS 4 + #define HISI_DEFAULT_SENSOR 2 +@@ -61,19 +62,32 @@ struct hisi_thermal_data { + void __iomem *regs; + }; + +-/* in millicelsius */ +-static inline int _step_to_temp(int step) ++/* ++ * The temperature computation on the tsensor is as follow: ++ * Unit: millidegree Celsius ++ * Step: 255/200 (0.7843) ++ * Temperature base: -60°C ++ * ++ * The register is programmed in temperature steps, every step is 784 ++ * millidegree and begins at -60 000 m°C ++ * ++ * The temperature from the steps: ++ * ++ * Temp = TempBase + (steps x 784) ++ * ++ * and the steps from the temperature: ++ * ++ * steps = (Temp - TempBase) / 784 ++ * ++ */ ++static inline int hisi_thermal_step_to_temp(int step) + { +- /* +- * Every step equals (1 * 200) / 255 celsius, and finally +- * need convert to millicelsius. 
+- */ +- return (HISI_TEMP_BASE * 1000 + (step * 200000 / 255)); ++ return HISI_TEMP_BASE + (step * HISI_TEMP_STEP); + } + +-static inline long _temp_to_step(long temp) ++static inline long hisi_thermal_temp_to_step(long temp) + { +- return ((temp - HISI_TEMP_BASE * 1000) * 255) / 200000; ++ return (temp - HISI_TEMP_BASE) / HISI_TEMP_STEP; + } + + static long hisi_thermal_get_sensor_temp(struct hisi_thermal_data *data, +@@ -99,7 +113,7 @@ static long hisi_thermal_get_sensor_temp(struct hisi_thermal_data *data, + usleep_range(3000, 5000); + + val = readl(data->regs + TEMP0_VALUE); +- val = _step_to_temp(val); ++ val = hisi_thermal_step_to_temp(val); + + mutex_unlock(&data->thermal_lock); + +@@ -126,10 +140,11 @@ static void hisi_thermal_enable_bind_irq_sensor + writel((sensor->id << 12), data->regs + TEMP0_CFG); + + /* enable for interrupt */ +- writel(_temp_to_step(sensor->thres_temp) | 0x0FFFFFF00, ++ writel(hisi_thermal_temp_to_step(sensor->thres_temp) | 0x0FFFFFF00, + data->regs + TEMP0_TH); + +- writel(_temp_to_step(HISI_TEMP_RESET), data->regs + TEMP0_RST_TH); ++ writel(hisi_thermal_temp_to_step(HISI_TEMP_RESET), ++ data->regs + TEMP0_RST_TH); + + /* enable module */ + writel(0x1, data->regs + TEMP0_RST_MSK); +-- +2.15.0 + diff --git a/queue/tracing-Exclude-generic-fields-from-histograms.patch b/queue/tracing-Exclude-generic-fields-from-histograms.patch new file mode 100644 index 0000000..33220d2 --- /dev/null +++ b/queue/tracing-Exclude-generic-fields-from-histograms.patch @@ -0,0 +1,44 @@ +From a15f7fc20389a8827d5859907568b201234d4b79 Mon Sep 17 00:00:00 2001 +From: Tom Zanussi <tom.zanussi@linux.intel.com> +Date: Fri, 22 Sep 2017 14:58:17 -0500 +Subject: [PATCH] tracing: Exclude 'generic fields' from histograms + +commit a15f7fc20389a8827d5859907568b201234d4b79 upstream. + +There are a small number of 'generic fields' (comm/COMM/cpu/CPU) that +are found by trace_find_event_field() but are only meant for +filtering. 
Specifically, unlike normal fields, they have a size +of 0 and thus wreak havoc when used as a histogram key. + +Exclude these (return -EINVAL) when used as histogram keys. + +Link: http://lkml.kernel.org/r/956154cbc3e8a4f0633d619b886c97f0f0edf7b4.1506105045.git.tom.zanussi@linux.intel.com + +Signed-off-by: Tom Zanussi <tom.zanussi@linux.intel.com> +Signed-off-by: Steven Rostedt (VMware) <rostedt@goodmis.org> + +diff --git a/kernel/trace/trace_events_hist.c b/kernel/trace/trace_events_hist.c +index f123b5d0c226..121d56850f24 100644 +--- a/kernel/trace/trace_events_hist.c ++++ b/kernel/trace/trace_events_hist.c +@@ -450,7 +450,7 @@ static int create_val_field(struct hist_trigger_data *hist_data, + } + + field = trace_find_event_field(file->event_call, field_name); +- if (!field) { ++ if (!field || !field->size) { + ret = -EINVAL; + goto out; + } +@@ -548,7 +548,7 @@ static int create_key_field(struct hist_trigger_data *hist_data, + } + + field = trace_find_event_field(file->event_call, field_name); +- if (!field) { ++ if (!field || !field->size) { + ret = -EINVAL; + goto out; + } +-- +2.15.0 + diff --git a/queue/vfio-pci-Virtualize-Maximum-Payload-Size.patch b/queue/vfio-pci-Virtualize-Maximum-Payload-Size.patch new file mode 100644 index 0000000..fa79f41 --- /dev/null +++ b/queue/vfio-pci-Virtualize-Maximum-Payload-Size.patch @@ -0,0 +1,44 @@ +From 523184972b282cd9ca17a76f6ca4742394856818 Mon Sep 17 00:00:00 2001 +From: Alex Williamson <alex.williamson@redhat.com> +Date: Mon, 2 Oct 2017 12:39:09 -0600 +Subject: [PATCH] vfio/pci: Virtualize Maximum Payload Size + +commit 523184972b282cd9ca17a76f6ca4742394856818 upstream. + +With virtual PCI-Express chipsets, we now see userspace/guest drivers +trying to match the physical MPS setting to a virtual downstream port. +Of course a lone physical device surrounded by virtual interconnects +cannot make a correct decision for a proper MPS setting. 
Instead, +let's virtualize the MPS control register so that writes through to +hardware are disallowed. Userspace drivers like QEMU assume they can +write anything to the device and we'll filter out anything dangerous. +Since mismatched MPS can lead to AER and other faults, let's add it +to the kernel side rather than relying on userspace virtualization to +handle it. + +Signed-off-by: Alex Williamson <alex.williamson@redhat.com> +Reviewed-by: Eric Auger <eric.auger@redhat.com> + +diff --git a/drivers/vfio/pci/vfio_pci_config.c b/drivers/vfio/pci/vfio_pci_config.c +index 5628fe114347..91335e6de88a 100644 +--- a/drivers/vfio/pci/vfio_pci_config.c ++++ b/drivers/vfio/pci/vfio_pci_config.c +@@ -849,11 +849,13 @@ static int __init init_pci_cap_exp_perm(struct perm_bits *perm) + + /* + * Allow writes to device control fields, except devctl_phantom, +- * which could confuse IOMMU, and the ARI bit in devctl2, which ++ * which could confuse IOMMU, MPS, which can break communication ++ * with other physical devices, and the ARI bit in devctl2, which + * is set at probe time. FLR gets virtualized via our writefn. + */ + p_setw(perm, PCI_EXP_DEVCTL, +- PCI_EXP_DEVCTL_BCR_FLR, ~PCI_EXP_DEVCTL_PHANTOM); ++ PCI_EXP_DEVCTL_BCR_FLR | PCI_EXP_DEVCTL_PAYLOAD, ++ ~PCI_EXP_DEVCTL_PHANTOM); + p_setw(perm, PCI_EXP_DEVCTL2, NO_VIRT, ~PCI_EXP_DEVCTL2_ARI); + return 0; + } +-- +2.15.0 + diff --git a/queue/x86-Make-X86_BUG_FXSAVE_LEAK-detectable-in-CPUID-on-.patch b/queue/x86-Make-X86_BUG_FXSAVE_LEAK-detectable-in-CPUID-on-.patch new file mode 100644 index 0000000..fc65d2d --- /dev/null +++ b/queue/x86-Make-X86_BUG_FXSAVE_LEAK-detectable-in-CPUID-on-.patch @@ -0,0 +1,61 @@ +From f2dbad36c55e5d3a91dccbde6e8cae345fe5632f Mon Sep 17 00:00:00 2001 +From: Rudolf Marek <r.marek@assembler.cz> +Date: Tue, 28 Nov 2017 22:01:06 +0100 +Subject: [PATCH] x86: Make X86_BUG_FXSAVE_LEAK detectable in CPUID on AMD + +commit f2dbad36c55e5d3a91dccbde6e8cae345fe5632f upstream. 
+ +[ Note, this is a Git cherry-pick of the following commit: + + 2b67799bdf25 ("x86: Make X86_BUG_FXSAVE_LEAK detectable in CPUID on AMD") + + ... for easier x86 PTI code testing and back-porting. ] + +The latest AMD AMD64 Architecture Programmer's Manual +adds a CPUID feature XSaveErPtr (CPUID_Fn80000008_EBX[2]). + +If this feature is set, the FXSAVE, XSAVE, FXSAVEOPT, XSAVEC, XSAVES +/ FXRSTOR, XRSTOR, XRSTORS always save/restore error pointers, +thus making the X86_BUG_FXSAVE_LEAK workaround obsolete on such CPUs. + +Signed-Off-By: Rudolf Marek <r.marek@assembler.cz> +Signed-off-by: Thomas Gleixner <tglx@linutronix.de> +Reviewed-by: Borislav Petkov <bp@suse.de> +Tested-by: Borislav Petkov <bp@suse.de> +Cc: Andy Lutomirski <luto@amacapital.net> +Link: https://lkml.kernel.org/r/bdcebe90-62c5-1f05-083c-eba7f08b2540@assembler.cz +Signed-off-by: Ingo Molnar <mingo@kernel.org> + +diff --git a/arch/x86/include/asm/cpufeatures.h b/arch/x86/include/asm/cpufeatures.h +index c0b0e9e8aa66..800104c8a3ed 100644 +--- a/arch/x86/include/asm/cpufeatures.h ++++ b/arch/x86/include/asm/cpufeatures.h +@@ -266,6 +266,7 @@ + /* AMD-defined CPU features, CPUID level 0x80000008 (EBX), word 13 */ + #define X86_FEATURE_CLZERO (13*32+ 0) /* CLZERO instruction */ + #define X86_FEATURE_IRPERF (13*32+ 1) /* Instructions Retired Count */ ++#define X86_FEATURE_XSAVEERPTR (13*32+ 2) /* Always save/restore FP error pointers */ + + /* Thermal and Power Management Leaf, CPUID level 0x00000006 (EAX), word 14 */ + #define X86_FEATURE_DTHERM (14*32+ 0) /* Digital Thermal Sensor */ +diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c +index d58184b7cd44..bcb75dc97d44 100644 +--- a/arch/x86/kernel/cpu/amd.c ++++ b/arch/x86/kernel/cpu/amd.c +@@ -804,8 +804,11 @@ static void init_amd(struct cpuinfo_x86 *c) + case 0x17: init_amd_zn(c); break; + } + +- /* Enable workaround for FXSAVE leak */ +- if (c->x86 >= 6) ++ /* ++ * Enable workaround for FXSAVE leak on CPUs ++ * without a XSaveErPtr 
feature ++ */ ++ if ((c->x86 >= 6) && (!cpu_has(c, X86_FEATURE_XSAVEERPTR))) + set_cpu_bug(c, X86_BUG_FXSAVE_LEAK); + + cpu_detect_cache_sizes(c); +-- +2.15.0 + diff --git a/queue/x86-asm-Don-t-use-the-confusing-.ifeq-directive.patch b/queue/x86-asm-Don-t-use-the-confusing-.ifeq-directive.patch new file mode 100644 index 0000000..2b4a457 --- /dev/null +++ b/queue/x86-asm-Don-t-use-the-confusing-.ifeq-directive.patch @@ -0,0 +1,65 @@ +From 82c62fa0c49aa305104013cee4468772799bb391 Mon Sep 17 00:00:00 2001 +From: Josh Poimboeuf <jpoimboe@redhat.com> +Date: Fri, 20 Oct 2017 11:21:35 -0500 +Subject: [PATCH] x86/asm: Don't use the confusing '.ifeq' directive + +commit 82c62fa0c49aa305104013cee4468772799bb391 upstream. + +I find the '.ifeq <expression>' directive to be confusing. Reading it +quickly seems to suggest its opposite meaning, or that it's missing an +argument. + +Improve readability by replacing all of its x86 uses with +'.if <expression> == 0'. + +Signed-off-by: Josh Poimboeuf <jpoimboe@redhat.com> +Cc: Andrei Vagin <avagin@virtuozzo.com> +Cc: Andy Lutomirski <luto@kernel.org> +Cc: Linus Torvalds <torvalds@linux-foundation.org> +Cc: Peter Zijlstra <peterz@infradead.org> +Cc: Thomas Gleixner <tglx@linutronix.de> +Link: http://lkml.kernel.org/r/757da028e802c7e98d23fbab8d234b1063e161cf.1508516398.git.jpoimboe@redhat.com +Signed-off-by: Ingo Molnar <mingo@kernel.org> + +diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S +index f6cdb7a1455e..846e84a1d1f7 100644 +--- a/arch/x86/entry/entry_64.S ++++ b/arch/x86/entry/entry_64.S +@@ -817,7 +817,7 @@ ENTRY(\sym) + + ASM_CLAC + +- .ifeq \has_error_code ++ .if \has_error_code == 0 + pushq $-1 /* ORIG_RAX: no syscall to restart */ + .endif + +diff --git a/arch/x86/kernel/head_32.S b/arch/x86/kernel/head_32.S +index 9ed3074d0d27..6e50f87765e5 100644 +--- a/arch/x86/kernel/head_32.S ++++ b/arch/x86/kernel/head_32.S +@@ -401,7 +401,7 @@ ENTRY(early_idt_handler_array) + # 24(%rsp) error code + i = 0 + .rept 
NUM_EXCEPTION_VECTORS +- .ifeq (EXCEPTION_ERRCODE_MASK >> i) & 1 ++ .if ((EXCEPTION_ERRCODE_MASK >> i) & 1) == 0 + pushl $0 # Dummy error code, to make stack frame uniform + .endif + pushl $i # 20(%esp) Vector number +diff --git a/arch/x86/kernel/head_64.S b/arch/x86/kernel/head_64.S +index 42e32c2e51bb..311db1a73c11 100644 +--- a/arch/x86/kernel/head_64.S ++++ b/arch/x86/kernel/head_64.S +@@ -273,7 +273,7 @@ ENDPROC(start_cpu0) + ENTRY(early_idt_handler_array) + i = 0 + .rept NUM_EXCEPTION_VECTORS +- .ifeq (EXCEPTION_ERRCODE_MASK >> i) & 1 ++ .if ((EXCEPTION_ERRCODE_MASK >> i) & 1) == 0 + UNWIND_HINT_IRET_REGS + pushq $0 # Dummy error code, to make stack frame uniform + .else +-- +2.15.0 + diff --git a/queue/x86-asm-Remove-unnecessary-n-t-in-front-of-CC_SET-fr.patch b/queue/x86-asm-Remove-unnecessary-n-t-in-front-of-CC_SET-fr.patch new file mode 100644 index 0000000..a89504e --- /dev/null +++ b/queue/x86-asm-Remove-unnecessary-n-t-in-front-of-CC_SET-fr.patch @@ -0,0 +1,135 @@ +From 3c52b5c64326d9dcfee4e10611c53ec1b1b20675 Mon Sep 17 00:00:00 2001 +From: Uros Bizjak <ubizjak@gmail.com> +Date: Wed, 6 Sep 2017 17:18:08 +0200 +Subject: [PATCH] x86/asm: Remove unnecessary \n\t in front of CC_SET() from + asm templates + +commit 3c52b5c64326d9dcfee4e10611c53ec1b1b20675 upstream. + +There is no need for \n\t in front of CC_SET(), as the macro already includes these two. 
+ +Signed-off-by: Uros Bizjak <ubizjak@gmail.com> +Cc: Linus Torvalds <torvalds@linux-foundation.org> +Cc: Peter Zijlstra <peterz@infradead.org> +Cc: Thomas Gleixner <tglx@linutronix.de> +Link: http://lkml.kernel.org/r/20170906151808.5634-1-ubizjak@gmail.com +Signed-off-by: Ingo Molnar <mingo@kernel.org> + +diff --git a/arch/x86/include/asm/archrandom.h b/arch/x86/include/asm/archrandom.h +index 5b0579abb398..3ac991d81e74 100644 +--- a/arch/x86/include/asm/archrandom.h ++++ b/arch/x86/include/asm/archrandom.h +@@ -45,7 +45,7 @@ static inline bool rdrand_long(unsigned long *v) + bool ok; + unsigned int retry = RDRAND_RETRY_LOOPS; + do { +- asm volatile(RDRAND_LONG "\n\t" ++ asm volatile(RDRAND_LONG + CC_SET(c) + : CC_OUT(c) (ok), "=a" (*v)); + if (ok) +@@ -59,7 +59,7 @@ static inline bool rdrand_int(unsigned int *v) + bool ok; + unsigned int retry = RDRAND_RETRY_LOOPS; + do { +- asm volatile(RDRAND_INT "\n\t" ++ asm volatile(RDRAND_INT + CC_SET(c) + : CC_OUT(c) (ok), "=a" (*v)); + if (ok) +@@ -71,7 +71,7 @@ static inline bool rdrand_int(unsigned int *v) + static inline bool rdseed_long(unsigned long *v) + { + bool ok; +- asm volatile(RDSEED_LONG "\n\t" ++ asm volatile(RDSEED_LONG + CC_SET(c) + : CC_OUT(c) (ok), "=a" (*v)); + return ok; +@@ -80,7 +80,7 @@ static inline bool rdseed_long(unsigned long *v) + static inline bool rdseed_int(unsigned int *v) + { + bool ok; +- asm volatile(RDSEED_INT "\n\t" ++ asm volatile(RDSEED_INT + CC_SET(c) + : CC_OUT(c) (ok), "=a" (*v)); + return ok; +diff --git a/arch/x86/include/asm/bitops.h b/arch/x86/include/asm/bitops.h +index 854022772c5b..8cee8db6dffb 100644 +--- a/arch/x86/include/asm/bitops.h ++++ b/arch/x86/include/asm/bitops.h +@@ -142,7 +142,7 @@ static __always_inline void __clear_bit(long nr, volatile unsigned long *addr) + static __always_inline bool clear_bit_unlock_is_negative_byte(long nr, volatile unsigned long *addr) + { + bool negative; +- asm volatile(LOCK_PREFIX "andb %2,%1\n\t" ++ asm volatile(LOCK_PREFIX "andb 
%2,%1" + CC_SET(s) + : CC_OUT(s) (negative), ADDR + : "ir" ((char) ~(1 << nr)) : "memory"); +@@ -245,7 +245,7 @@ static __always_inline bool __test_and_set_bit(long nr, volatile unsigned long * + { + bool oldbit; + +- asm("bts %2,%1\n\t" ++ asm("bts %2,%1" + CC_SET(c) + : CC_OUT(c) (oldbit), ADDR + : "Ir" (nr)); +@@ -285,7 +285,7 @@ static __always_inline bool __test_and_clear_bit(long nr, volatile unsigned long + { + bool oldbit; + +- asm volatile("btr %2,%1\n\t" ++ asm volatile("btr %2,%1" + CC_SET(c) + : CC_OUT(c) (oldbit), ADDR + : "Ir" (nr)); +@@ -297,7 +297,7 @@ static __always_inline bool __test_and_change_bit(long nr, volatile unsigned lon + { + bool oldbit; + +- asm volatile("btc %2,%1\n\t" ++ asm volatile("btc %2,%1" + CC_SET(c) + : CC_OUT(c) (oldbit), ADDR + : "Ir" (nr) : "memory"); +@@ -328,7 +328,7 @@ static __always_inline bool variable_test_bit(long nr, volatile const unsigned l + { + bool oldbit; + +- asm volatile("bt %2,%1\n\t" ++ asm volatile("bt %2,%1" + CC_SET(c) + : CC_OUT(c) (oldbit) + : "m" (*(unsigned long *)addr), "Ir" (nr)); +diff --git a/arch/x86/include/asm/percpu.h b/arch/x86/include/asm/percpu.h +index 9fa03604b2b3..b21a475fd7ed 100644 +--- a/arch/x86/include/asm/percpu.h ++++ b/arch/x86/include/asm/percpu.h +@@ -525,7 +525,7 @@ static inline bool x86_this_cpu_variable_test_bit(int nr, + { + bool oldbit; + +- asm volatile("bt "__percpu_arg(2)",%1\n\t" ++ asm volatile("bt "__percpu_arg(2)",%1" + CC_SET(c) + : CC_OUT(c) (oldbit) + : "m" (*(unsigned long __percpu *)addr), "Ir" (nr)); +diff --git a/arch/x86/include/asm/rmwcc.h b/arch/x86/include/asm/rmwcc.h +index 045f99211a99..0c411c8bbdbd 100644 +--- a/arch/x86/include/asm/rmwcc.h ++++ b/arch/x86/include/asm/rmwcc.h +@@ -28,7 +28,7 @@ cc_label: \ + #define __GEN_RMWcc(fullop, var, cc, clobbers, ...) 
\ + do { \ + bool c; \ +- asm volatile (fullop ";" CC_SET(cc) \ ++ asm volatile (fullop CC_SET(cc) \ + : [counter] "+m" (var), CC_OUT(cc) (c) \ + : __VA_ARGS__ : clobbers); \ + return c; \ +-- +2.15.0 + diff --git a/queue/x86-boot-Annotate-verify_cpu-as-a-callable-function.patch b/queue/x86-boot-Annotate-verify_cpu-as-a-callable-function.patch new file mode 100644 index 0000000..022e2d4 --- /dev/null +++ b/queue/x86-boot-Annotate-verify_cpu-as-a-callable-function.patch @@ -0,0 +1,41 @@ +From e93db75a0054b23a874a12c63376753544f3fe9e Mon Sep 17 00:00:00 2001 +From: Josh Poimboeuf <jpoimboe@redhat.com> +Date: Mon, 18 Sep 2017 21:43:34 -0500 +Subject: [PATCH] x86/boot: Annotate verify_cpu() as a callable function + +commit e93db75a0054b23a874a12c63376753544f3fe9e upstream. + +verify_cpu() is a callable function. Annotate it as such. + +Signed-off-by: Josh Poimboeuf <jpoimboe@redhat.com> +Cc: Andy Lutomirski <luto@kernel.org> +Cc: Boris Ostrovsky <boris.ostrovsky@oracle.com> +Cc: Jiri Slaby <jslaby@suse.cz> +Cc: Juergen Gross <jgross@suse.com> +Cc: Linus Torvalds <torvalds@linux-foundation.org> +Cc: Peter Zijlstra <peterz@infradead.org> +Cc: Thomas Gleixner <tglx@linutronix.de> +Link: http://lkml.kernel.org/r/293024b8a080832075312f38c07ccc970fc70292.1505764066.git.jpoimboe@redhat.com +Signed-off-by: Ingo Molnar <mingo@kernel.org> + +diff --git a/arch/x86/kernel/verify_cpu.S b/arch/x86/kernel/verify_cpu.S +index 014ea59aa153..3d3c2f71f617 100644 +--- a/arch/x86/kernel/verify_cpu.S ++++ b/arch/x86/kernel/verify_cpu.S +@@ -33,7 +33,7 @@ + #include <asm/cpufeatures.h> + #include <asm/msr-index.h> + +-verify_cpu: ++ENTRY(verify_cpu) + pushf # Save caller passed flags + push $0 # Kill any dangerous flags + popf +@@ -139,3 +139,4 @@ verify_cpu: + popf # Restore caller passed flags + xorl %eax, %eax + ret ++ENDPROC(verify_cpu) +-- +2.15.0 + diff --git a/queue/x86-boot-Relocate-definition-of-the-initial-state-of.patch 
b/queue/x86-boot-Relocate-definition-of-the-initial-state-of.patch new file mode 100644 index 0000000..4962926 --- /dev/null +++ b/queue/x86-boot-Relocate-definition-of-the-initial-state-of.patch @@ -0,0 +1,89 @@ +From b0ce5b8c95c83a7b98c679b117e3d6ae6f97154b Mon Sep 17 00:00:00 2001 +From: Ricardo Neri <ricardo.neri-calderon@linux.intel.com> +Date: Fri, 27 Oct 2017 13:25:29 -0700 +Subject: [PATCH] x86/boot: Relocate definition of the initial state of CR0 + +commit b0ce5b8c95c83a7b98c679b117e3d6ae6f97154b upstream. + +Both head_32.S and head_64.S utilize the same value to initialize the +control register CR0. Also, other parts of the kernel might want to access +this initial definition (e.g., emulation code for User-Mode Instruction +Prevention uses this state to provide a sane dummy value for CR0 when +emulating the smsw instruction). Thus, relocate this definition to a +header file from which it can be conveniently accessed. + +Suggested-by: Borislav Petkov <bp@alien8.de> +Signed-off-by: Ricardo Neri <ricardo.neri-calderon@linux.intel.com> +Signed-off-by: Thomas Gleixner <tglx@linutronix.de> +Reviewed-by: Borislav Petkov <bp@suse.de> +Reviewed-by: Andy Lutomirski <luto@kernel.org> +Cc: "Michael S. Tsirkin" <mst@redhat.com> +Cc: Peter Zijlstra <peterz@infradead.org> +Cc: Dave Hansen <dave.hansen@linux.intel.com> +Cc: ricardo.neri@intel.com +Cc: linux-mm@kvack.org +Cc: Paul Gortmaker <paul.gortmaker@windriver.com> +Cc: Huang Rui <ray.huang@amd.com> +Cc: Shuah Khan <shuah@kernel.org> +Cc: linux-arch@vger.kernel.org +Cc: Jonathan Corbet <corbet@lwn.net> +Cc: Jiri Slaby <jslaby@suse.cz> +Cc: "Ravi V. 
Shankar" <ravi.v.shankar@intel.com> +Cc: Denys Vlasenko <dvlasenk@redhat.com> +Cc: Chris Metcalf <cmetcalf@mellanox.com> +Cc: Brian Gerst <brgerst@gmail.com> +Cc: Josh Poimboeuf <jpoimboe@redhat.com> +Cc: Chen Yucong <slaoub@gmail.com> +Cc: Vlastimil Babka <vbabka@suse.cz> +Cc: Dave Hansen <dave.hansen@intel.com> +Cc: Andy Lutomirski <luto@amacapital.net> +Cc: Masami Hiramatsu <mhiramat@kernel.org> +Cc: Paolo Bonzini <pbonzini@redhat.com> +Cc: Andrew Morton <akpm@linux-foundation.org> +Cc: Linus Torvalds <torvalds@linux-foundation.org> +Link: https://lkml.kernel.org/r/1509135945-13762-3-git-send-email-ricardo.neri-calderon@linux.intel.com + +diff --git a/arch/x86/include/uapi/asm/processor-flags.h b/arch/x86/include/uapi/asm/processor-flags.h +index 185f3d10c194..39946d0a1d41 100644 +--- a/arch/x86/include/uapi/asm/processor-flags.h ++++ b/arch/x86/include/uapi/asm/processor-flags.h +@@ -151,5 +151,8 @@ + #define CX86_ARR_BASE 0xc4 + #define CX86_RCR_BASE 0xdc + ++#define CR0_STATE (X86_CR0_PE | X86_CR0_MP | X86_CR0_ET | \ ++ X86_CR0_NE | X86_CR0_WP | X86_CR0_AM | \ ++ X86_CR0_PG) + + #endif /* _UAPI_ASM_X86_PROCESSOR_FLAGS_H */ +diff --git a/arch/x86/kernel/head_32.S b/arch/x86/kernel/head_32.S +index 9ed3074d0d27..c3cfc655f551 100644 +--- a/arch/x86/kernel/head_32.S ++++ b/arch/x86/kernel/head_32.S +@@ -211,9 +211,6 @@ ENTRY(startup_32_smp) + #endif + + .Ldefault_entry: +-#define CR0_STATE (X86_CR0_PE | X86_CR0_MP | X86_CR0_ET | \ +- X86_CR0_NE | X86_CR0_WP | X86_CR0_AM | \ +- X86_CR0_PG) + movl $(CR0_STATE & ~X86_CR0_PG),%eax + movl %eax,%cr0 + +diff --git a/arch/x86/kernel/head_64.S b/arch/x86/kernel/head_64.S +index 513cbb012ecc..5e1bfdd86b5b 100644 +--- a/arch/x86/kernel/head_64.S ++++ b/arch/x86/kernel/head_64.S +@@ -149,9 +149,6 @@ ENTRY(secondary_startup_64) + 1: wrmsr /* Make changes effective */ + + /* Setup cr0 */ +-#define CR0_STATE (X86_CR0_PE | X86_CR0_MP | X86_CR0_ET | \ +- X86_CR0_NE | X86_CR0_WP | X86_CR0_AM | \ +- X86_CR0_PG) + movl $CR0_STATE, 
%eax + /* Make changes effective */ + movq %rax, %cr0 +-- +2.15.0 + diff --git a/queue/x86-build-Beautify-build-log-of-syscall-headers.patch b/queue/x86-build-Beautify-build-log-of-syscall-headers.patch new file mode 100644 index 0000000..4d4ea41 --- /dev/null +++ b/queue/x86-build-Beautify-build-log-of-syscall-headers.patch @@ -0,0 +1,51 @@ +From af8e947079a7dab0480b5d6db6b093fd04b86fc9 Mon Sep 17 00:00:00 2001 +From: Masahiro Yamada <yamada.masahiro@socionext.com> +Date: Fri, 27 Oct 2017 13:11:10 +0900 +Subject: [PATCH] x86/build: Beautify build log of syscall headers + +commit af8e947079a7dab0480b5d6db6b093fd04b86fc9 upstream. + +This makes the build log look nicer. + +Before: + SYSTBL arch/x86/entry/syscalls/../../include/generated/asm/syscalls_32.h + SYSHDR arch/x86/entry/syscalls/../../include/generated/asm/unistd_32_ia32.h + SYSHDR arch/x86/entry/syscalls/../../include/generated/asm/unistd_64_x32.h + SYSTBL arch/x86/entry/syscalls/../../include/generated/asm/syscalls_64.h + SYSHDR arch/x86/entry/syscalls/../../include/generated/uapi/asm/unistd_32.h + SYSHDR arch/x86/entry/syscalls/../../include/generated/uapi/asm/unistd_64.h + SYSHDR arch/x86/entry/syscalls/../../include/generated/uapi/asm/unistd_x32.h + +After: + SYSTBL arch/x86/include/generated/asm/syscalls_32.h + SYSHDR arch/x86/include/generated/asm/unistd_32_ia32.h + SYSHDR arch/x86/include/generated/asm/unistd_64_x32.h + SYSTBL arch/x86/include/generated/asm/syscalls_64.h + SYSHDR arch/x86/include/generated/uapi/asm/unistd_32.h + SYSHDR arch/x86/include/generated/uapi/asm/unistd_64.h + SYSHDR arch/x86/include/generated/uapi/asm/unistd_x32.h + +Signed-off-by: Masahiro Yamada <yamada.masahiro@socionext.com> +Acked-by: Thomas Gleixner <tglx@linutronix.de> +Cc: Linus Torvalds <torvalds@linux-foundation.org> +Cc: Peter Zijlstra <peterz@infradead.org> +Cc: "H. 
Peter Anvin" <hpa@zytor.com> +Cc: linux-kbuild@vger.kernel.org +Link: http://lkml.kernel.org/r/1509077470-2735-1-git-send-email-yamada.masahiro@socionext.com +Signed-off-by: Ingo Molnar <mingo@kernel.org> + +diff --git a/arch/x86/entry/syscalls/Makefile b/arch/x86/entry/syscalls/Makefile +index 57aa59fd140c..e34c7a931994 100644 +--- a/arch/x86/entry/syscalls/Makefile ++++ b/arch/x86/entry/syscalls/Makefile +@@ -1,5 +1,5 @@ +-out := $(obj)/../../include/generated/asm +-uapi := $(obj)/../../include/generated/uapi/asm ++out := arch/$(SRCARCH)/include/generated/asm ++uapi := arch/$(SRCARCH)/include/generated/uapi/asm + + # Create output directory if not already present + _dummy := $(shell [ -d '$(out)' ] || mkdir -p '$(out)') \ +-- +2.15.0 + diff --git a/queue/x86-cpufeature-Add-User-Mode-Instruction-Prevention-.patch b/queue/x86-cpufeature-Add-User-Mode-Instruction-Prevention-.patch new file mode 100644 index 0000000..280924a --- /dev/null +++ b/queue/x86-cpufeature-Add-User-Mode-Instruction-Prevention-.patch @@ -0,0 +1,77 @@ +From a8b4db562e7283a1520f9e9730297ecaab7622ea Mon Sep 17 00:00:00 2001 +From: Ricardo Neri <ricardo.neri-calderon@linux.intel.com> +Date: Sun, 5 Nov 2017 18:27:51 -0800 +Subject: [PATCH] x86/cpufeature: Add User-Mode Instruction Prevention + definitions + +commit a8b4db562e7283a1520f9e9730297ecaab7622ea upstream. + +[ Note, this is a Git cherry-pick of the following commit: (limited to the cpufeatures.h file) + + 3522c2a6a4f3 ("x86/cpufeature: Add User-Mode Instruction Prevention definitions") + + ... for easier x86 PTI code testing and back-porting. ] + +User-Mode Instruction Prevention is a security feature present in new +Intel processors that, when set, prevents the execution of a subset of +instructions if such instructions are executed in user mode (CPL > 0). +Attempting to execute such instructions causes a general protection +exception. 
+ +The subset of instructions comprises: + + * SGDT - Store Global Descriptor Table + * SIDT - Store Interrupt Descriptor Table + * SLDT - Store Local Descriptor Table + * SMSW - Store Machine Status Word + * STR - Store Task Register + +This feature is also added to the list of disabled-features to allow +a cleaner handling of build-time configuration. + +Signed-off-by: Ricardo Neri <ricardo.neri-calderon@linux.intel.com> +Reviewed-by: Thomas Gleixner <tglx@linutronix.de> +Reviewed-by: Borislav Petkov <bp@suse.de> +Cc: Andrew Morton <akpm@linux-foundation.org> +Cc: Andy Lutomirski <luto@kernel.org> +Cc: Borislav Petkov <bp@alien8.de> +Cc: Brian Gerst <brgerst@gmail.com> +Cc: Chen Yucong <slaoub@gmail.com> +Cc: Chris Metcalf <cmetcalf@mellanox.com> +Cc: Dave Hansen <dave.hansen@linux.intel.com> +Cc: Denys Vlasenko <dvlasenk@redhat.com> +Cc: Fenghua Yu <fenghua.yu@intel.com> +Cc: H. Peter Anvin <hpa@zytor.com> +Cc: Huang Rui <ray.huang@amd.com> +Cc: Jiri Slaby <jslaby@suse.cz> +Cc: Jonathan Corbet <corbet@lwn.net> +Cc: Josh Poimboeuf <jpoimboe@redhat.com> +Cc: Linus Torvalds <torvalds@linux-foundation.org> +Cc: Masami Hiramatsu <mhiramat@kernel.org> +Cc: Michael S. Tsirkin <mst@redhat.com> +Cc: Paolo Bonzini <pbonzini@redhat.com> +Cc: Paul Gortmaker <paul.gortmaker@windriver.com> +Cc: Peter Zijlstra <peterz@infradead.org> +Cc: Ravi V. 
Shankar <ravi.v.shankar@intel.com> +Cc: Shuah Khan <shuah@kernel.org> +Cc: Tony Luck <tony.luck@intel.com> +Cc: Vlastimil Babka <vbabka@suse.cz> +Cc: ricardo.neri@intel.com +Link: http://lkml.kernel.org/r/1509935277-22138-7-git-send-email-ricardo.neri-calderon@linux.intel.com +Signed-off-by: Ingo Molnar <mingo@kernel.org> + +diff --git a/arch/x86/include/asm/cpufeatures.h b/arch/x86/include/asm/cpufeatures.h +index cdf5be866863..c0b0e9e8aa66 100644 +--- a/arch/x86/include/asm/cpufeatures.h ++++ b/arch/x86/include/asm/cpufeatures.h +@@ -296,6 +296,7 @@ + + /* Intel-defined CPU features, CPUID level 0x00000007:0 (ECX), word 16 */ + #define X86_FEATURE_AVX512VBMI (16*32+ 1) /* AVX512 Vector Bit Manipulation instructions*/ ++#define X86_FEATURE_UMIP (16*32+ 2) /* User Mode Instruction Protection */ + #define X86_FEATURE_PKU (16*32+ 3) /* Protection Keys for Userspace */ + #define X86_FEATURE_OSPKE (16*32+ 4) /* OS Protection Keys Enable */ + #define X86_FEATURE_AVX512_VBMI2 (16*32+ 6) /* Additional AVX512 Vector Bit Manipulation Instructions */ +-- +2.15.0 + diff --git a/queue/x86-cpufeatures-Enable-new-SSE-AVX-AVX512-CPU-featur.patch b/queue/x86-cpufeatures-Enable-new-SSE-AVX-AVX512-CPU-featur.patch new file mode 100644 index 0000000..dd69582 --- /dev/null +++ b/queue/x86-cpufeatures-Enable-new-SSE-AVX-AVX512-CPU-featur.patch @@ -0,0 +1,74 @@ +From c128dbfa0f879f8ce7b79054037889b0b2240728 Mon Sep 17 00:00:00 2001 +From: Gayatri Kammela <gayatri.kammela@intel.com> +Date: Mon, 30 Oct 2017 18:20:29 -0700 +Subject: [PATCH] x86/cpufeatures: Enable new SSE/AVX/AVX512 CPU features + +commit c128dbfa0f879f8ce7b79054037889b0b2240728 upstream. + +Add a few new SSE/AVX/AVX512 instruction groups/features for enumeration +in /proc/cpuinfo: AVX512_VBMI2, GFNI, VAES, VPCLMULQDQ, AVX512_VNNI, +AVX512_BITALG. 
+ + CPUID.(EAX=7,ECX=0):ECX[bit 6] AVX512_VBMI2 + CPUID.(EAX=7,ECX=0):ECX[bit 8] GFNI + CPUID.(EAX=7,ECX=0):ECX[bit 9] VAES + CPUID.(EAX=7,ECX=0):ECX[bit 10] VPCLMULQDQ + CPUID.(EAX=7,ECX=0):ECX[bit 11] AVX512_VNNI + CPUID.(EAX=7,ECX=0):ECX[bit 12] AVX512_BITALG + +Detailed information of CPUID bits for these features can be found +in the Intel Architecture Instruction Set Extensions and Future Features +Programming Interface document (refer to Table 1-1. and Table 1-2.). +A copy of this document is available at +https://bugzilla.kernel.org/show_bug.cgi?id=197239 + +Signed-off-by: Gayatri Kammela <gayatri.kammela@intel.com> +Acked-by: Thomas Gleixner <tglx@linutronix.de> +Cc: Andi Kleen <andi.kleen@intel.com> +Cc: Fenghua Yu <fenghua.yu@intel.com> +Cc: Linus Torvalds <torvalds@linux-foundation.org> +Cc: Peter Zijlstra <peterz@infradead.org> +Cc: Ravi Shankar <ravi.v.shankar@intel.com> +Cc: Ricardo Neri <ricardo.neri@intel.com> +Cc: Yang Zhong <yang.zhong@intel.com> +Cc: bp@alien8.de +Link: http://lkml.kernel.org/r/1509412829-23380-1-git-send-email-gayatri.kammela@intel.com +Signed-off-by: Ingo Molnar <mingo@kernel.org> + +diff --git a/arch/x86/include/asm/cpufeatures.h b/arch/x86/include/asm/cpufeatures.h +index 401a70992060..b0556f882aa8 100644 +--- a/arch/x86/include/asm/cpufeatures.h ++++ b/arch/x86/include/asm/cpufeatures.h +@@ -299,6 +299,12 @@ + #define X86_FEATURE_AVX512VBMI (16*32+ 1) /* AVX512 Vector Bit Manipulation instructions*/ + #define X86_FEATURE_PKU (16*32+ 3) /* Protection Keys for Userspace */ + #define X86_FEATURE_OSPKE (16*32+ 4) /* OS Protection Keys Enable */ ++#define X86_FEATURE_AVX512_VBMI2 (16*32+ 6) /* Additional AVX512 Vector Bit Manipulation Instructions */ ++#define X86_FEATURE_GFNI (16*32+ 8) /* Galois Field New Instructions */ ++#define X86_FEATURE_VAES (16*32+ 9) /* Vector AES */ ++#define X86_FEATURE_VPCLMULQDQ (16*32+ 10) /* Carry-Less Multiplication Double Quadword */ ++#define X86_FEATURE_AVX512_VNNI (16*32+ 11) /* Vector 
Neural Network Instructions */ ++#define X86_FEATURE_AVX512_BITALG (16*32+12) /* Support for VPOPCNT[B,W] and VPSHUF-BITQMB */ + #define X86_FEATURE_AVX512_VPOPCNTDQ (16*32+14) /* POPCNT for vectors of DW/QW */ + #define X86_FEATURE_LA57 (16*32+16) /* 5-level page tables */ + #define X86_FEATURE_RDPID (16*32+22) /* RDPID instruction */ +diff --git a/arch/x86/kernel/cpu/cpuid-deps.c b/arch/x86/kernel/cpu/cpuid-deps.c +index c1d49842a411..c21f22d836ad 100644 +--- a/arch/x86/kernel/cpu/cpuid-deps.c ++++ b/arch/x86/kernel/cpu/cpuid-deps.c +@@ -50,6 +50,12 @@ const static struct cpuid_dep cpuid_deps[] = { + { X86_FEATURE_AVX512BW, X86_FEATURE_AVX512F }, + { X86_FEATURE_AVX512VL, X86_FEATURE_AVX512F }, + { X86_FEATURE_AVX512VBMI, X86_FEATURE_AVX512F }, ++ { X86_FEATURE_AVX512_VBMI2, X86_FEATURE_AVX512VL }, ++ { X86_FEATURE_GFNI, X86_FEATURE_AVX512VL }, ++ { X86_FEATURE_VAES, X86_FEATURE_AVX512VL }, ++ { X86_FEATURE_VPCLMULQDQ, X86_FEATURE_AVX512VL }, ++ { X86_FEATURE_AVX512_VNNI, X86_FEATURE_AVX512VL }, ++ { X86_FEATURE_AVX512_BITALG, X86_FEATURE_AVX512VL }, + { X86_FEATURE_AVX512_4VNNIW, X86_FEATURE_AVX512F }, + { X86_FEATURE_AVX512_4FMAPS, X86_FEATURE_AVX512F }, + { X86_FEATURE_AVX512_VPOPCNTDQ, X86_FEATURE_AVX512F }, +-- +2.15.0 + diff --git a/queue/x86-cpufeatures-Fix-various-details-in-the-feature-d.patch b/queue/x86-cpufeatures-Fix-various-details-in-the-feature-d.patch new file mode 100644 index 0000000..647c431 --- /dev/null +++ b/queue/x86-cpufeatures-Fix-various-details-in-the-feature-d.patch @@ -0,0 +1,358 @@ +From f3a624e901c633593156f7b00ca743a6204a29bc Mon Sep 17 00:00:00 2001 +From: Ingo Molnar <mingo@kernel.org> +Date: Tue, 31 Oct 2017 13:17:23 +0100 +Subject: [PATCH] x86/cpufeatures: Fix various details in the feature + definitions + +commit f3a624e901c633593156f7b00ca743a6204a29bc upstream. 
+ +Kept this commit separate from the re-tabulation changes, to make +the changes easier to review: + + - add better explanation for entries with no explanation + - fix/enhance the text of some of the entries + - fix the vertical alignment of some of the feature number definitions + - fix inconsistent capitalization + - ... and lots of other small details + +i.e. make it all more of a coherent unit, instead of a patchwork of years of additions. + +Cc: Andrew Morton <akpm@linux-foundation.org> +Cc: Andy Lutomirski <luto@amacapital.net> +Cc: Andy Lutomirski <luto@kernel.org> +Cc: Borislav Petkov <bp@alien8.de> +Cc: Brian Gerst <brgerst@gmail.com> +Cc: Denys Vlasenko <dvlasenk@redhat.com> +Cc: Josh Poimboeuf <jpoimboe@redhat.com> +Cc: Linus Torvalds <torvalds@linux-foundation.org> +Cc: Peter Zijlstra <peterz@infradead.org> +Cc: Thomas Gleixner <tglx@linutronix.de> +Link: http://lkml.kernel.org/r/20171031121723.28524-4-mingo@kernel.org +Signed-off-by: Ingo Molnar <mingo@kernel.org> + +diff --git a/arch/x86/include/asm/cpufeatures.h b/arch/x86/include/asm/cpufeatures.h +index ad1b835001cc..cdf5be866863 100644 +--- a/arch/x86/include/asm/cpufeatures.h ++++ b/arch/x86/include/asm/cpufeatures.h +@@ -20,14 +20,12 @@ + * Note: If the comment begins with a quoted string, that string is used + * in /proc/cpuinfo instead of the macro name. If the string is "", + * this feature bit is not displayed in /proc/cpuinfo at all. +- */ +- +-/* ++ * + * When adding new features here that depend on other features, +- * please update the table in kernel/cpu/cpuid-deps.c ++ * please update the table in kernel/cpu/cpuid-deps.c as well. 
+ */ + +-/* Intel-defined CPU features, CPUID level 0x00000001 (edx), word 0 */ ++/* Intel-defined CPU features, CPUID level 0x00000001 (EDX), word 0 */ + #define X86_FEATURE_FPU ( 0*32+ 0) /* Onboard FPU */ + #define X86_FEATURE_VME ( 0*32+ 1) /* Virtual Mode Extensions */ + #define X86_FEATURE_DE ( 0*32+ 2) /* Debugging Extensions */ +@@ -42,8 +40,7 @@ + #define X86_FEATURE_MTRR ( 0*32+12) /* Memory Type Range Registers */ + #define X86_FEATURE_PGE ( 0*32+13) /* Page Global Enable */ + #define X86_FEATURE_MCA ( 0*32+14) /* Machine Check Architecture */ +-#define X86_FEATURE_CMOV ( 0*32+15) /* CMOV instructions */ +- /* (plus FCMOVcc, FCOMI with FPU) */ ++#define X86_FEATURE_CMOV ( 0*32+15) /* CMOV instructions (plus FCMOVcc, FCOMI with FPU) */ + #define X86_FEATURE_PAT ( 0*32+16) /* Page Attribute Table */ + #define X86_FEATURE_PSE36 ( 0*32+17) /* 36-bit PSEs */ + #define X86_FEATURE_PN ( 0*32+18) /* Processor serial number */ +@@ -63,15 +60,15 @@ + /* AMD-defined CPU features, CPUID level 0x80000001, word 1 */ + /* Don't duplicate feature flags which are redundant with Intel! */ + #define X86_FEATURE_SYSCALL ( 1*32+11) /* SYSCALL/SYSRET */ +-#define X86_FEATURE_MP ( 1*32+19) /* MP Capable. */ ++#define X86_FEATURE_MP ( 1*32+19) /* MP Capable */ + #define X86_FEATURE_NX ( 1*32+20) /* Execute Disable */ + #define X86_FEATURE_MMXEXT ( 1*32+22) /* AMD MMX extensions */ + #define X86_FEATURE_FXSR_OPT ( 1*32+25) /* FXSAVE/FXRSTOR optimizations */ + #define X86_FEATURE_GBPAGES ( 1*32+26) /* "pdpe1gb" GB pages */ + #define X86_FEATURE_RDTSCP ( 1*32+27) /* RDTSCP */ +-#define X86_FEATURE_LM ( 1*32+29) /* Long Mode (x86-64) */ +-#define X86_FEATURE_3DNOWEXT ( 1*32+30) /* AMD 3DNow! extensions */ +-#define X86_FEATURE_3DNOW ( 1*32+31) /* 3DNow! 
*/ ++#define X86_FEATURE_LM ( 1*32+29) /* Long Mode (x86-64, 64-bit support) */ ++#define X86_FEATURE_3DNOWEXT ( 1*32+30) /* AMD 3DNow extensions */ ++#define X86_FEATURE_3DNOW ( 1*32+31) /* 3DNow */ + + /* Transmeta-defined CPU features, CPUID level 0x80860001, word 2 */ + #define X86_FEATURE_RECOVERY ( 2*32+ 0) /* CPU in recovery mode */ +@@ -84,66 +81,67 @@ + #define X86_FEATURE_K6_MTRR ( 3*32+ 1) /* AMD K6 nonstandard MTRRs */ + #define X86_FEATURE_CYRIX_ARR ( 3*32+ 2) /* Cyrix ARRs (= MTRRs) */ + #define X86_FEATURE_CENTAUR_MCR ( 3*32+ 3) /* Centaur MCRs (= MTRRs) */ +-/* cpu types for specific tunings: */ ++ ++/* CPU types for specific tunings: */ + #define X86_FEATURE_K8 ( 3*32+ 4) /* "" Opteron, Athlon64 */ + #define X86_FEATURE_K7 ( 3*32+ 5) /* "" Athlon */ + #define X86_FEATURE_P3 ( 3*32+ 6) /* "" P3 */ + #define X86_FEATURE_P4 ( 3*32+ 7) /* "" P4 */ + #define X86_FEATURE_CONSTANT_TSC ( 3*32+ 8) /* TSC ticks at a constant rate */ +-#define X86_FEATURE_UP ( 3*32+ 9) /* smp kernel running on up */ +-#define X86_FEATURE_ART ( 3*32+10) /* Platform has always running timer (ART) */ ++#define X86_FEATURE_UP ( 3*32+ 9) /* SMP kernel running on UP */ ++#define X86_FEATURE_ART ( 3*32+10) /* Always running timer (ART) */ + #define X86_FEATURE_ARCH_PERFMON ( 3*32+11) /* Intel Architectural PerfMon */ + #define X86_FEATURE_PEBS ( 3*32+12) /* Precise-Event Based Sampling */ + #define X86_FEATURE_BTS ( 3*32+13) /* Branch Trace Store */ +-#define X86_FEATURE_SYSCALL32 ( 3*32+14) /* "" syscall in ia32 userspace */ +-#define X86_FEATURE_SYSENTER32 ( 3*32+15) /* "" sysenter in ia32 userspace */ +-#define X86_FEATURE_REP_GOOD ( 3*32+16) /* rep microcode works well */ +-#define X86_FEATURE_MFENCE_RDTSC ( 3*32+17) /* "" Mfence synchronizes RDTSC */ +-#define X86_FEATURE_LFENCE_RDTSC ( 3*32+18) /* "" Lfence synchronizes RDTSC */ ++#define X86_FEATURE_SYSCALL32 ( 3*32+14) /* "" syscall in IA32 userspace */ ++#define X86_FEATURE_SYSENTER32 ( 3*32+15) /* "" sysenter in IA32 
userspace */ ++#define X86_FEATURE_REP_GOOD ( 3*32+16) /* REP microcode works well */ ++#define X86_FEATURE_MFENCE_RDTSC ( 3*32+17) /* "" MFENCE synchronizes RDTSC */ ++#define X86_FEATURE_LFENCE_RDTSC ( 3*32+18) /* "" LFENCE synchronizes RDTSC */ + #define X86_FEATURE_ACC_POWER ( 3*32+19) /* AMD Accumulated Power Mechanism */ + #define X86_FEATURE_NOPL ( 3*32+20) /* The NOPL (0F 1F) instructions */ + #define X86_FEATURE_ALWAYS ( 3*32+21) /* "" Always-present feature */ +-#define X86_FEATURE_XTOPOLOGY ( 3*32+22) /* cpu topology enum extensions */ ++#define X86_FEATURE_XTOPOLOGY ( 3*32+22) /* CPU topology enum extensions */ + #define X86_FEATURE_TSC_RELIABLE ( 3*32+23) /* TSC is known to be reliable */ + #define X86_FEATURE_NONSTOP_TSC ( 3*32+24) /* TSC does not stop in C states */ + #define X86_FEATURE_CPUID ( 3*32+25) /* CPU has CPUID instruction itself */ +-#define X86_FEATURE_EXTD_APICID ( 3*32+26) /* has extended APICID (8 bits) */ +-#define X86_FEATURE_AMD_DCM ( 3*32+27) /* multi-node processor */ +-#define X86_FEATURE_APERFMPERF ( 3*32+28) /* APERFMPERF */ ++#define X86_FEATURE_EXTD_APICID ( 3*32+26) /* Extended APICID (8 bits) */ ++#define X86_FEATURE_AMD_DCM ( 3*32+27) /* AMD multi-node processor */ ++#define X86_FEATURE_APERFMPERF ( 3*32+28) /* P-State hardware coordination feedback capability (APERF/MPERF MSRs) */ + #define X86_FEATURE_NONSTOP_TSC_S3 ( 3*32+30) /* TSC doesn't stop in S3 state */ + #define X86_FEATURE_TSC_KNOWN_FREQ ( 3*32+31) /* TSC has known frequency */ + +-/* Intel-defined CPU features, CPUID level 0x00000001 (ecx), word 4 */ ++/* Intel-defined CPU features, CPUID level 0x00000001 (ECX), word 4 */ + #define X86_FEATURE_XMM3 ( 4*32+ 0) /* "pni" SSE-3 */ + #define X86_FEATURE_PCLMULQDQ ( 4*32+ 1) /* PCLMULQDQ instruction */ + #define X86_FEATURE_DTES64 ( 4*32+ 2) /* 64-bit Debug Store */ +-#define X86_FEATURE_MWAIT ( 4*32+ 3) /* "monitor" Monitor/Mwait support */ +-#define X86_FEATURE_DSCPL ( 4*32+ 4) /* "ds_cpl" CPL Qual. 
Debug Store */ ++#define X86_FEATURE_MWAIT ( 4*32+ 3) /* "monitor" MONITOR/MWAIT support */ ++#define X86_FEATURE_DSCPL ( 4*32+ 4) /* "ds_cpl" CPL-qualified (filtered) Debug Store */ + #define X86_FEATURE_VMX ( 4*32+ 5) /* Hardware virtualization */ +-#define X86_FEATURE_SMX ( 4*32+ 6) /* Safer mode */ ++#define X86_FEATURE_SMX ( 4*32+ 6) /* Safer Mode eXtensions */ + #define X86_FEATURE_EST ( 4*32+ 7) /* Enhanced SpeedStep */ + #define X86_FEATURE_TM2 ( 4*32+ 8) /* Thermal Monitor 2 */ + #define X86_FEATURE_SSSE3 ( 4*32+ 9) /* Supplemental SSE-3 */ + #define X86_FEATURE_CID ( 4*32+10) /* Context ID */ + #define X86_FEATURE_SDBG ( 4*32+11) /* Silicon Debug */ + #define X86_FEATURE_FMA ( 4*32+12) /* Fused multiply-add */ +-#define X86_FEATURE_CX16 ( 4*32+13) /* CMPXCHG16B */ ++#define X86_FEATURE_CX16 ( 4*32+13) /* CMPXCHG16B instruction */ + #define X86_FEATURE_XTPR ( 4*32+14) /* Send Task Priority Messages */ +-#define X86_FEATURE_PDCM ( 4*32+15) /* Performance Capabilities */ ++#define X86_FEATURE_PDCM ( 4*32+15) /* Perf/Debug Capabilities MSR */ + #define X86_FEATURE_PCID ( 4*32+17) /* Process Context Identifiers */ + #define X86_FEATURE_DCA ( 4*32+18) /* Direct Cache Access */ + #define X86_FEATURE_XMM4_1 ( 4*32+19) /* "sse4_1" SSE-4.1 */ + #define X86_FEATURE_XMM4_2 ( 4*32+20) /* "sse4_2" SSE-4.2 */ +-#define X86_FEATURE_X2APIC ( 4*32+21) /* x2APIC */ ++#define X86_FEATURE_X2APIC ( 4*32+21) /* X2APIC */ + #define X86_FEATURE_MOVBE ( 4*32+22) /* MOVBE instruction */ + #define X86_FEATURE_POPCNT ( 4*32+23) /* POPCNT instruction */ +-#define X86_FEATURE_TSC_DEADLINE_TIMER ( 4*32+24) /* Tsc deadline timer */ ++#define X86_FEATURE_TSC_DEADLINE_TIMER ( 4*32+24) /* TSC deadline timer */ + #define X86_FEATURE_AES ( 4*32+25) /* AES instructions */ +-#define X86_FEATURE_XSAVE ( 4*32+26) /* XSAVE/XRSTOR/XSETBV/XGETBV */ +-#define X86_FEATURE_OSXSAVE ( 4*32+27) /* "" XSAVE enabled in the OS */ ++#define X86_FEATURE_XSAVE ( 4*32+26) /* XSAVE/XRSTOR/XSETBV/XGETBV 
instructions */ ++#define X86_FEATURE_OSXSAVE ( 4*32+27) /* "" XSAVE instruction enabled in the OS */ + #define X86_FEATURE_AVX ( 4*32+28) /* Advanced Vector Extensions */ +-#define X86_FEATURE_F16C ( 4*32+29) /* 16-bit fp conversions */ +-#define X86_FEATURE_RDRAND ( 4*32+30) /* The RDRAND instruction */ ++#define X86_FEATURE_F16C ( 4*32+29) /* 16-bit FP conversions */ ++#define X86_FEATURE_RDRAND ( 4*32+30) /* RDRAND instruction */ + #define X86_FEATURE_HYPERVISOR ( 4*32+31) /* Running on a hypervisor */ + + /* VIA/Cyrix/Centaur-defined CPU features, CPUID level 0xC0000001, word 5 */ +@@ -158,10 +156,10 @@ + #define X86_FEATURE_PMM ( 5*32+12) /* PadLock Montgomery Multiplier */ + #define X86_FEATURE_PMM_EN ( 5*32+13) /* PMM enabled */ + +-/* More extended AMD flags: CPUID level 0x80000001, ecx, word 6 */ ++/* More extended AMD flags: CPUID level 0x80000001, ECX, word 6 */ + #define X86_FEATURE_LAHF_LM ( 6*32+ 0) /* LAHF/SAHF in long mode */ + #define X86_FEATURE_CMP_LEGACY ( 6*32+ 1) /* If yes HyperThreading not valid */ +-#define X86_FEATURE_SVM ( 6*32+ 2) /* Secure virtual machine */ ++#define X86_FEATURE_SVM ( 6*32+ 2) /* Secure Virtual Machine */ + #define X86_FEATURE_EXTAPIC ( 6*32+ 3) /* Extended APIC space */ + #define X86_FEATURE_CR8_LEGACY ( 6*32+ 4) /* CR8 in 32-bit mode */ + #define X86_FEATURE_ABM ( 6*32+ 5) /* Advanced bit manipulation */ +@@ -175,16 +173,16 @@ + #define X86_FEATURE_WDT ( 6*32+13) /* Watchdog timer */ + #define X86_FEATURE_LWP ( 6*32+15) /* Light Weight Profiling */ + #define X86_FEATURE_FMA4 ( 6*32+16) /* 4 operands MAC instructions */ +-#define X86_FEATURE_TCE ( 6*32+17) /* translation cache extension */ ++#define X86_FEATURE_TCE ( 6*32+17) /* Translation Cache Extension */ + #define X86_FEATURE_NODEID_MSR ( 6*32+19) /* NodeId MSR */ +-#define X86_FEATURE_TBM ( 6*32+21) /* trailing bit manipulations */ +-#define X86_FEATURE_TOPOEXT ( 6*32+22) /* topology extensions CPUID leafs */ +-#define X86_FEATURE_PERFCTR_CORE ( 6*32+23) /* 
core performance counter extensions */ ++#define X86_FEATURE_TBM ( 6*32+21) /* Trailing Bit Manipulations */ ++#define X86_FEATURE_TOPOEXT ( 6*32+22) /* Topology extensions CPUID leafs */ ++#define X86_FEATURE_PERFCTR_CORE ( 6*32+23) /* Core performance counter extensions */ + #define X86_FEATURE_PERFCTR_NB ( 6*32+24) /* NB performance counter extensions */ +-#define X86_FEATURE_BPEXT (6*32+26) /* data breakpoint extension */ +-#define X86_FEATURE_PTSC ( 6*32+27) /* performance time-stamp counter */ ++#define X86_FEATURE_BPEXT ( 6*32+26) /* Data breakpoint extension */ ++#define X86_FEATURE_PTSC ( 6*32+27) /* Performance time-stamp counter */ + #define X86_FEATURE_PERFCTR_LLC ( 6*32+28) /* Last Level Cache performance counter extensions */ +-#define X86_FEATURE_MWAITX ( 6*32+29) /* MWAIT extension (MONITORX/MWAITX) */ ++#define X86_FEATURE_MWAITX ( 6*32+29) /* MWAIT extension (MONITORX/MWAITX instructions) */ + + /* + * Auxiliary flags: Linux defined - For features scattered in various +@@ -192,7 +190,7 @@ + * + * Reuse free bits when adding new feature flags! 
+ */ +-#define X86_FEATURE_RING3MWAIT ( 7*32+ 0) /* Ring 3 MONITOR/MWAIT */ ++#define X86_FEATURE_RING3MWAIT ( 7*32+ 0) /* Ring 3 MONITOR/MWAIT instructions */ + #define X86_FEATURE_CPUID_FAULT ( 7*32+ 1) /* Intel CPUID faulting */ + #define X86_FEATURE_CPB ( 7*32+ 2) /* AMD Core Performance Boost */ + #define X86_FEATURE_EPB ( 7*32+ 3) /* IA32_ENERGY_PERF_BIAS support */ +@@ -206,8 +204,8 @@ + + #define X86_FEATURE_INTEL_PPIN ( 7*32+14) /* Intel Processor Inventory Number */ + #define X86_FEATURE_INTEL_PT ( 7*32+15) /* Intel Processor Trace */ +-#define X86_FEATURE_AVX512_4VNNIW (7*32+16) /* AVX-512 Neural Network Instructions */ +-#define X86_FEATURE_AVX512_4FMAPS (7*32+17) /* AVX-512 Multiply Accumulation Single precision */ ++#define X86_FEATURE_AVX512_4VNNIW ( 7*32+16) /* AVX-512 Neural Network Instructions */ ++#define X86_FEATURE_AVX512_4FMAPS ( 7*32+17) /* AVX-512 Multiply Accumulation Single precision */ + + #define X86_FEATURE_MBA ( 7*32+18) /* Memory Bandwidth Allocation */ + +@@ -218,19 +216,19 @@ + #define X86_FEATURE_EPT ( 8*32+ 3) /* Intel Extended Page Table */ + #define X86_FEATURE_VPID ( 8*32+ 4) /* Intel Virtual Processor ID */ + +-#define X86_FEATURE_VMMCALL ( 8*32+15) /* Prefer vmmcall to vmcall */ ++#define X86_FEATURE_VMMCALL ( 8*32+15) /* Prefer VMMCALL to VMCALL */ + #define X86_FEATURE_XENPV ( 8*32+16) /* "" Xen paravirtual guest */ + + +-/* Intel-defined CPU features, CPUID level 0x00000007:0 (ebx), word 9 */ +-#define X86_FEATURE_FSGSBASE ( 9*32+ 0) /* {RD/WR}{FS/GS}BASE instructions*/ +-#define X86_FEATURE_TSC_ADJUST ( 9*32+ 1) /* TSC adjustment MSR 0x3b */ ++/* Intel-defined CPU features, CPUID level 0x00000007:0 (EBX), word 9 */ ++#define X86_FEATURE_FSGSBASE ( 9*32+ 0) /* RDFSBASE, WRFSBASE, RDGSBASE, WRGSBASE instructions*/ ++#define X86_FEATURE_TSC_ADJUST ( 9*32+ 1) /* TSC adjustment MSR 0x3B */ + #define X86_FEATURE_BMI1 ( 9*32+ 3) /* 1st group bit manipulation extensions */ + #define X86_FEATURE_HLE ( 9*32+ 4) /* Hardware Lock 
Elision */ + #define X86_FEATURE_AVX2 ( 9*32+ 5) /* AVX2 instructions */ + #define X86_FEATURE_SMEP ( 9*32+ 7) /* Supervisor Mode Execution Protection */ + #define X86_FEATURE_BMI2 ( 9*32+ 8) /* 2nd group bit manipulation extensions */ +-#define X86_FEATURE_ERMS ( 9*32+ 9) /* Enhanced REP MOVSB/STOSB */ ++#define X86_FEATURE_ERMS ( 9*32+ 9) /* Enhanced REP MOVSB/STOSB instructions */ + #define X86_FEATURE_INVPCID ( 9*32+10) /* Invalidate Processor Context ID */ + #define X86_FEATURE_RTM ( 9*32+11) /* Restricted Transactional Memory */ + #define X86_FEATURE_CQM ( 9*32+12) /* Cache QoS Monitoring */ +@@ -238,8 +236,8 @@ + #define X86_FEATURE_RDT_A ( 9*32+15) /* Resource Director Technology Allocation */ + #define X86_FEATURE_AVX512F ( 9*32+16) /* AVX-512 Foundation */ + #define X86_FEATURE_AVX512DQ ( 9*32+17) /* AVX-512 DQ (Double/Quad granular) Instructions */ +-#define X86_FEATURE_RDSEED ( 9*32+18) /* The RDSEED instruction */ +-#define X86_FEATURE_ADX ( 9*32+19) /* The ADCX and ADOX instructions */ ++#define X86_FEATURE_RDSEED ( 9*32+18) /* RDSEED instruction */ ++#define X86_FEATURE_ADX ( 9*32+19) /* ADCX and ADOX instructions */ + #define X86_FEATURE_SMAP ( 9*32+20) /* Supervisor Mode Access Prevention */ + #define X86_FEATURE_AVX512IFMA ( 9*32+21) /* AVX-512 Integer Fused Multiply-Add instructions */ + #define X86_FEATURE_CLFLUSHOPT ( 9*32+23) /* CLFLUSHOPT instruction */ +@@ -251,25 +249,25 @@ + #define X86_FEATURE_AVX512BW ( 9*32+30) /* AVX-512 BW (Byte/Word granular) Instructions */ + #define X86_FEATURE_AVX512VL ( 9*32+31) /* AVX-512 VL (128/256 Vector Length) Extensions */ + +-/* Extended state features, CPUID level 0x0000000d:1 (eax), word 10 */ +-#define X86_FEATURE_XSAVEOPT (10*32+ 0) /* XSAVEOPT */ +-#define X86_FEATURE_XSAVEC (10*32+ 1) /* XSAVEC */ +-#define X86_FEATURE_XGETBV1 (10*32+ 2) /* XGETBV with ECX = 1 */ +-#define X86_FEATURE_XSAVES (10*32+ 3) /* XSAVES/XRSTORS */ ++/* Extended state features, CPUID level 0x0000000d:1 (EAX), word 10 */ 
++#define X86_FEATURE_XSAVEOPT (10*32+ 0) /* XSAVEOPT instruction */ ++#define X86_FEATURE_XSAVEC (10*32+ 1) /* XSAVEC instruction */ ++#define X86_FEATURE_XGETBV1 (10*32+ 2) /* XGETBV with ECX = 1 instruction */ ++#define X86_FEATURE_XSAVES (10*32+ 3) /* XSAVES/XRSTORS instructions */ + +-/* Intel-defined CPU QoS Sub-leaf, CPUID level 0x0000000F:0 (edx), word 11 */ ++/* Intel-defined CPU QoS Sub-leaf, CPUID level 0x0000000F:0 (EDX), word 11 */ + #define X86_FEATURE_CQM_LLC (11*32+ 1) /* LLC QoS if 1 */ + +-/* Intel-defined CPU QoS Sub-leaf, CPUID level 0x0000000F:1 (edx), word 12 */ +-#define X86_FEATURE_CQM_OCCUP_LLC (12*32+ 0) /* LLC occupancy monitoring if 1 */ ++/* Intel-defined CPU QoS Sub-leaf, CPUID level 0x0000000F:1 (EDX), word 12 */ ++#define X86_FEATURE_CQM_OCCUP_LLC (12*32+ 0) /* LLC occupancy monitoring */ + #define X86_FEATURE_CQM_MBM_TOTAL (12*32+ 1) /* LLC Total MBM monitoring */ + #define X86_FEATURE_CQM_MBM_LOCAL (12*32+ 2) /* LLC Local MBM monitoring */ + +-/* AMD-defined CPU features, CPUID level 0x80000008 (ebx), word 13 */ +-#define X86_FEATURE_CLZERO (13*32+0) /* CLZERO instruction */ +-#define X86_FEATURE_IRPERF (13*32+1) /* Instructions Retired Count */ ++/* AMD-defined CPU features, CPUID level 0x80000008 (EBX), word 13 */ ++#define X86_FEATURE_CLZERO (13*32+ 0) /* CLZERO instruction */ ++#define X86_FEATURE_IRPERF (13*32+ 1) /* Instructions Retired Count */ + +-/* Thermal and Power Management Leaf, CPUID level 0x00000006 (eax), word 14 */ ++/* Thermal and Power Management Leaf, CPUID level 0x00000006 (EAX), word 14 */ + #define X86_FEATURE_DTHERM (14*32+ 0) /* Digital Thermal Sensor */ + #define X86_FEATURE_IDA (14*32+ 1) /* Intel Dynamic Acceleration */ + #define X86_FEATURE_ARAT (14*32+ 2) /* Always Running APIC Timer */ +@@ -281,7 +279,7 @@ + #define X86_FEATURE_HWP_EPP (14*32+10) /* HWP Energy Perf. 
Preference */ + #define X86_FEATURE_HWP_PKG_REQ (14*32+11) /* HWP Package Level Request */ + +-/* AMD SVM Feature Identification, CPUID level 0x8000000a (edx), word 15 */ ++/* AMD SVM Feature Identification, CPUID level 0x8000000a (EDX), word 15 */ + #define X86_FEATURE_NPT (15*32+ 0) /* Nested Page Table support */ + #define X86_FEATURE_LBRV (15*32+ 1) /* LBR Virtualization support */ + #define X86_FEATURE_SVML (15*32+ 2) /* "svm_lock" SVM locking MSR */ +@@ -296,24 +294,24 @@ + #define X86_FEATURE_V_VMSAVE_VMLOAD (15*32+15) /* Virtual VMSAVE VMLOAD */ + #define X86_FEATURE_VGIF (15*32+16) /* Virtual GIF */ + +-/* Intel-defined CPU features, CPUID level 0x00000007:0 (ecx), word 16 */ ++/* Intel-defined CPU features, CPUID level 0x00000007:0 (ECX), word 16 */ + #define X86_FEATURE_AVX512VBMI (16*32+ 1) /* AVX512 Vector Bit Manipulation instructions*/ + #define X86_FEATURE_PKU (16*32+ 3) /* Protection Keys for Userspace */ + #define X86_FEATURE_OSPKE (16*32+ 4) /* OS Protection Keys Enable */ + #define X86_FEATURE_AVX512_VBMI2 (16*32+ 6) /* Additional AVX512 Vector Bit Manipulation Instructions */ + #define X86_FEATURE_GFNI (16*32+ 8) /* Galois Field New Instructions */ + #define X86_FEATURE_VAES (16*32+ 9) /* Vector AES */ +-#define X86_FEATURE_VPCLMULQDQ (16*32+ 10) /* Carry-Less Multiplication Double Quadword */ +-#define X86_FEATURE_AVX512_VNNI (16*32+ 11) /* Vector Neural Network Instructions */ +-#define X86_FEATURE_AVX512_BITALG (16*32+12) /* Support for VPOPCNT[B,W] and VPSHUF-BITQMB */ ++#define X86_FEATURE_VPCLMULQDQ (16*32+10) /* Carry-Less Multiplication Double Quadword */ ++#define X86_FEATURE_AVX512_VNNI (16*32+11) /* Vector Neural Network Instructions */ ++#define X86_FEATURE_AVX512_BITALG (16*32+12) /* Support for VPOPCNT[B,W] and VPSHUF-BITQMB instructions */ + #define X86_FEATURE_AVX512_VPOPCNTDQ (16*32+14) /* POPCNT for vectors of DW/QW */ + #define X86_FEATURE_LA57 (16*32+16) /* 5-level page tables */ + #define X86_FEATURE_RDPID (16*32+22) /* 
RDPID instruction */ + +-/* AMD-defined CPU features, CPUID level 0x80000007 (ebx), word 17 */ +-#define X86_FEATURE_OVERFLOW_RECOV (17*32+0) /* MCA overflow recovery support */ +-#define X86_FEATURE_SUCCOR (17*32+1) /* Uncorrectable error containment and recovery */ +-#define X86_FEATURE_SMCA (17*32+3) /* Scalable MCA */ ++/* AMD-defined CPU features, CPUID level 0x80000007 (EBX), word 17 */ ++#define X86_FEATURE_OVERFLOW_RECOV (17*32+ 0) /* MCA overflow recovery support */ ++#define X86_FEATURE_SUCCOR (17*32+ 1) /* Uncorrectable error containment and recovery */ ++#define X86_FEATURE_SMCA (17*32+ 3) /* Scalable MCA */ + + /* + * BUG word(s) +@@ -340,4 +338,5 @@ + #define X86_BUG_SWAPGS_FENCE X86_BUG(11) /* SWAPGS without input dep on GS */ + #define X86_BUG_MONITOR X86_BUG(12) /* IPI required to wake up remote CPU */ + #define X86_BUG_AMD_E400 X86_BUG(13) /* CPU is among the affected by Erratum 400 */ ++ + #endif /* _ASM_X86_CPUFEATURES_H */ +-- +2.15.0 + diff --git a/queue/x86-cpufeatures-Make-CPU-bugs-sticky.patch b/queue/x86-cpufeatures-Make-CPU-bugs-sticky.patch new file mode 100644 index 0000000..a56233a --- /dev/null +++ b/queue/x86-cpufeatures-Make-CPU-bugs-sticky.patch @@ -0,0 +1,95 @@ +From 6cbd2171e89b13377261d15e64384df60ecb530e Mon Sep 17 00:00:00 2001 +From: Thomas Gleixner <tglx@linutronix.de> +Date: Mon, 4 Dec 2017 15:07:32 +0100 +Subject: [PATCH] x86/cpufeatures: Make CPU bugs sticky + +commit 6cbd2171e89b13377261d15e64384df60ecb530e upstream. + +There is currently no way to force CPU bug bits like CPU feature bits. That +makes it impossible to set a bug bit once at boot and have it stick for all +upcoming CPUs. + +Extend the force set/clear arrays to handle bug bits as well. 
+ +Signed-off-by: Thomas Gleixner <tglx@linutronix.de> +Reviewed-by: Borislav Petkov <bp@suse.de> +Cc: Andy Lutomirski <luto@kernel.org> +Cc: Boris Ostrovsky <boris.ostrovsky@oracle.com> +Cc: Borislav Petkov <bp@alien8.de> +Cc: Borislav Petkov <bpetkov@suse.de> +Cc: Brian Gerst <brgerst@gmail.com> +Cc: Dave Hansen <dave.hansen@intel.com> +Cc: Dave Hansen <dave.hansen@linux.intel.com> +Cc: David Laight <David.Laight@aculab.com> +Cc: Denys Vlasenko <dvlasenk@redhat.com> +Cc: Eduardo Valentin <eduval@amazon.com> +Cc: Greg KH <gregkh@linuxfoundation.org> +Cc: H. Peter Anvin <hpa@zytor.com> +Cc: Josh Poimboeuf <jpoimboe@redhat.com> +Cc: Juergen Gross <jgross@suse.com> +Cc: Linus Torvalds <torvalds@linux-foundation.org> +Cc: Peter Zijlstra <peterz@infradead.org> +Cc: Rik van Riel <riel@redhat.com> +Cc: Will Deacon <will.deacon@arm.com> +Cc: aliguori@amazon.com +Cc: daniel.gruss@iaik.tugraz.at +Cc: hughd@google.com +Cc: keescook@google.com +Link: https://lkml.kernel.org/r/20171204150606.992156574@linutronix.de +Signed-off-by: Ingo Molnar <mingo@kernel.org> + +diff --git a/arch/x86/include/asm/cpufeature.h b/arch/x86/include/asm/cpufeature.h +index bf6a76202a77..ea9a7dde62e5 100644 +--- a/arch/x86/include/asm/cpufeature.h ++++ b/arch/x86/include/asm/cpufeature.h +@@ -135,6 +135,8 @@ extern void clear_cpu_cap(struct cpuinfo_x86 *c, unsigned int bit); + set_bit(bit, (unsigned long *)cpu_caps_set); \ + } while (0) + ++#define setup_force_cpu_bug(bit) setup_force_cpu_cap(bit) ++ + #if defined(CC_HAVE_ASM_GOTO) && defined(CONFIG_X86_FAST_FEATURE_TESTS) + /* + * Static testing of CPU features. Used the same as boot_cpu_has(). 
+diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h +index e8991d7f7034..da943411d3d8 100644 +--- a/arch/x86/include/asm/processor.h ++++ b/arch/x86/include/asm/processor.h +@@ -163,8 +163,8 @@ extern struct cpuinfo_x86 boot_cpu_data; + extern struct cpuinfo_x86 new_cpu_data; + + extern struct x86_hw_tss doublefault_tss; +-extern __u32 cpu_caps_cleared[NCAPINTS]; +-extern __u32 cpu_caps_set[NCAPINTS]; ++extern __u32 cpu_caps_cleared[NCAPINTS + NBUGINTS]; ++extern __u32 cpu_caps_set[NCAPINTS + NBUGINTS]; + + #ifdef CONFIG_SMP + DECLARE_PER_CPU_READ_MOSTLY(struct cpuinfo_x86, cpu_info); +diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c +index c2eada1056de..034900623adf 100644 +--- a/arch/x86/kernel/cpu/common.c ++++ b/arch/x86/kernel/cpu/common.c +@@ -452,8 +452,8 @@ static const char *table_lookup_model(struct cpuinfo_x86 *c) + return NULL; /* Not found */ + } + +-__u32 cpu_caps_cleared[NCAPINTS]; +-__u32 cpu_caps_set[NCAPINTS]; ++__u32 cpu_caps_cleared[NCAPINTS + NBUGINTS]; ++__u32 cpu_caps_set[NCAPINTS + NBUGINTS]; + + void load_percpu_segment(int cpu) + { +@@ -812,7 +812,7 @@ static void apply_forced_caps(struct cpuinfo_x86 *c) + { + int i; + +- for (i = 0; i < NCAPINTS; i++) { ++ for (i = 0; i < NCAPINTS + NBUGINTS; i++) { + c->x86_capability[i] &= ~cpu_caps_cleared[i]; + c->x86_capability[i] |= cpu_caps_set[i]; + } +-- +2.15.0 + diff --git a/queue/x86-cpufeatures-Re-tabulate-the-X86_FEATURE-definiti.patch b/queue/x86-cpufeatures-Re-tabulate-the-X86_FEATURE-definiti.patch new file mode 100644 index 0000000..c80055f --- /dev/null +++ b/queue/x86-cpufeatures-Re-tabulate-the-X86_FEATURE-definiti.patch @@ -0,0 +1,614 @@ +From acbc845ffefd9fb70466182cd8555a26189462b2 Mon Sep 17 00:00:00 2001 +From: Ingo Molnar <mingo@kernel.org> +Date: Tue, 31 Oct 2017 13:17:22 +0100 +Subject: [PATCH] x86/cpufeatures: Re-tabulate the X86_FEATURE definitions + +commit acbc845ffefd9fb70466182cd8555a26189462b2 upstream. 
+ +Over the years asm/cpufeatures.h has become somewhat of a mess: the original +tabulation style was too narrow, while x86 feature names also kept growing +in length, creating frequent field width overflows. + +Re-tabulate it to make it wider and easier to read/modify. Also harmonize +the tabulation of the other defines in this file to match it. + +Cc: Andrew Morton <akpm@linux-foundation.org> +Cc: Andy Lutomirski <luto@amacapital.net> +Cc: Andy Lutomirski <luto@kernel.org> +Cc: Borislav Petkov <bp@alien8.de> +Cc: Brian Gerst <brgerst@gmail.com> +Cc: Denys Vlasenko <dvlasenk@redhat.com> +Cc: Josh Poimboeuf <jpoimboe@redhat.com> +Cc: Linus Torvalds <torvalds@linux-foundation.org> +Cc: Peter Zijlstra <peterz@infradead.org> +Cc: Thomas Gleixner <tglx@linutronix.de> +Link: http://lkml.kernel.org/r/20171031121723.28524-3-mingo@kernel.org +Signed-off-by: Ingo Molnar <mingo@kernel.org> + +diff --git a/arch/x86/include/asm/cpufeatures.h b/arch/x86/include/asm/cpufeatures.h +index 74370734663c..ad1b835001cc 100644 +--- a/arch/x86/include/asm/cpufeatures.h ++++ b/arch/x86/include/asm/cpufeatures.h +@@ -13,8 +13,8 @@ + /* + * Defines x86 CPU feature bits + */ +-#define NCAPINTS 18 /* N 32-bit words worth of info */ +-#define NBUGINTS 1 /* N 32-bit bug flags */ ++#define NCAPINTS 18 /* N 32-bit words worth of info */ ++#define NBUGINTS 1 /* N 32-bit bug flags */ + + /* + * Note: If the comment begins with a quoted string, that string is used +@@ -28,163 +28,163 @@ + */ + + /* Intel-defined CPU features, CPUID level 0x00000001 (edx), word 0 */ +-#define X86_FEATURE_FPU ( 0*32+ 0) /* Onboard FPU */ +-#define X86_FEATURE_VME ( 0*32+ 1) /* Virtual Mode Extensions */ +-#define X86_FEATURE_DE ( 0*32+ 2) /* Debugging Extensions */ +-#define X86_FEATURE_PSE ( 0*32+ 3) /* Page Size Extensions */ +-#define X86_FEATURE_TSC ( 0*32+ 4) /* Time Stamp Counter */ +-#define X86_FEATURE_MSR ( 0*32+ 5) /* Model-Specific Registers */ +-#define X86_FEATURE_PAE ( 0*32+ 6) /* Physical Address 
Extensions */ +-#define X86_FEATURE_MCE ( 0*32+ 7) /* Machine Check Exception */ +-#define X86_FEATURE_CX8 ( 0*32+ 8) /* CMPXCHG8 instruction */ +-#define X86_FEATURE_APIC ( 0*32+ 9) /* Onboard APIC */ +-#define X86_FEATURE_SEP ( 0*32+11) /* SYSENTER/SYSEXIT */ +-#define X86_FEATURE_MTRR ( 0*32+12) /* Memory Type Range Registers */ +-#define X86_FEATURE_PGE ( 0*32+13) /* Page Global Enable */ +-#define X86_FEATURE_MCA ( 0*32+14) /* Machine Check Architecture */ +-#define X86_FEATURE_CMOV ( 0*32+15) /* CMOV instructions */ ++#define X86_FEATURE_FPU ( 0*32+ 0) /* Onboard FPU */ ++#define X86_FEATURE_VME ( 0*32+ 1) /* Virtual Mode Extensions */ ++#define X86_FEATURE_DE ( 0*32+ 2) /* Debugging Extensions */ ++#define X86_FEATURE_PSE ( 0*32+ 3) /* Page Size Extensions */ ++#define X86_FEATURE_TSC ( 0*32+ 4) /* Time Stamp Counter */ ++#define X86_FEATURE_MSR ( 0*32+ 5) /* Model-Specific Registers */ ++#define X86_FEATURE_PAE ( 0*32+ 6) /* Physical Address Extensions */ ++#define X86_FEATURE_MCE ( 0*32+ 7) /* Machine Check Exception */ ++#define X86_FEATURE_CX8 ( 0*32+ 8) /* CMPXCHG8 instruction */ ++#define X86_FEATURE_APIC ( 0*32+ 9) /* Onboard APIC */ ++#define X86_FEATURE_SEP ( 0*32+11) /* SYSENTER/SYSEXIT */ ++#define X86_FEATURE_MTRR ( 0*32+12) /* Memory Type Range Registers */ ++#define X86_FEATURE_PGE ( 0*32+13) /* Page Global Enable */ ++#define X86_FEATURE_MCA ( 0*32+14) /* Machine Check Architecture */ ++#define X86_FEATURE_CMOV ( 0*32+15) /* CMOV instructions */ + /* (plus FCMOVcc, FCOMI with FPU) */ +-#define X86_FEATURE_PAT ( 0*32+16) /* Page Attribute Table */ +-#define X86_FEATURE_PSE36 ( 0*32+17) /* 36-bit PSEs */ +-#define X86_FEATURE_PN ( 0*32+18) /* Processor serial number */ +-#define X86_FEATURE_CLFLUSH ( 0*32+19) /* CLFLUSH instruction */ +-#define X86_FEATURE_DS ( 0*32+21) /* "dts" Debug Store */ +-#define X86_FEATURE_ACPI ( 0*32+22) /* ACPI via MSR */ +-#define X86_FEATURE_MMX ( 0*32+23) /* Multimedia Extensions */ +-#define X86_FEATURE_FXSR ( 
0*32+24) /* FXSAVE/FXRSTOR, CR4.OSFXSR */ +-#define X86_FEATURE_XMM ( 0*32+25) /* "sse" */ +-#define X86_FEATURE_XMM2 ( 0*32+26) /* "sse2" */ +-#define X86_FEATURE_SELFSNOOP ( 0*32+27) /* "ss" CPU self snoop */ +-#define X86_FEATURE_HT ( 0*32+28) /* Hyper-Threading */ +-#define X86_FEATURE_ACC ( 0*32+29) /* "tm" Automatic clock control */ +-#define X86_FEATURE_IA64 ( 0*32+30) /* IA-64 processor */ +-#define X86_FEATURE_PBE ( 0*32+31) /* Pending Break Enable */ ++#define X86_FEATURE_PAT ( 0*32+16) /* Page Attribute Table */ ++#define X86_FEATURE_PSE36 ( 0*32+17) /* 36-bit PSEs */ ++#define X86_FEATURE_PN ( 0*32+18) /* Processor serial number */ ++#define X86_FEATURE_CLFLUSH ( 0*32+19) /* CLFLUSH instruction */ ++#define X86_FEATURE_DS ( 0*32+21) /* "dts" Debug Store */ ++#define X86_FEATURE_ACPI ( 0*32+22) /* ACPI via MSR */ ++#define X86_FEATURE_MMX ( 0*32+23) /* Multimedia Extensions */ ++#define X86_FEATURE_FXSR ( 0*32+24) /* FXSAVE/FXRSTOR, CR4.OSFXSR */ ++#define X86_FEATURE_XMM ( 0*32+25) /* "sse" */ ++#define X86_FEATURE_XMM2 ( 0*32+26) /* "sse2" */ ++#define X86_FEATURE_SELFSNOOP ( 0*32+27) /* "ss" CPU self snoop */ ++#define X86_FEATURE_HT ( 0*32+28) /* Hyper-Threading */ ++#define X86_FEATURE_ACC ( 0*32+29) /* "tm" Automatic clock control */ ++#define X86_FEATURE_IA64 ( 0*32+30) /* IA-64 processor */ ++#define X86_FEATURE_PBE ( 0*32+31) /* Pending Break Enable */ + + /* AMD-defined CPU features, CPUID level 0x80000001, word 1 */ + /* Don't duplicate feature flags which are redundant with Intel! */ +-#define X86_FEATURE_SYSCALL ( 1*32+11) /* SYSCALL/SYSRET */ +-#define X86_FEATURE_MP ( 1*32+19) /* MP Capable. 
*/ +-#define X86_FEATURE_NX ( 1*32+20) /* Execute Disable */ +-#define X86_FEATURE_MMXEXT ( 1*32+22) /* AMD MMX extensions */ +-#define X86_FEATURE_FXSR_OPT ( 1*32+25) /* FXSAVE/FXRSTOR optimizations */ +-#define X86_FEATURE_GBPAGES ( 1*32+26) /* "pdpe1gb" GB pages */ +-#define X86_FEATURE_RDTSCP ( 1*32+27) /* RDTSCP */ +-#define X86_FEATURE_LM ( 1*32+29) /* Long Mode (x86-64) */ +-#define X86_FEATURE_3DNOWEXT ( 1*32+30) /* AMD 3DNow! extensions */ +-#define X86_FEATURE_3DNOW ( 1*32+31) /* 3DNow! */ ++#define X86_FEATURE_SYSCALL ( 1*32+11) /* SYSCALL/SYSRET */ ++#define X86_FEATURE_MP ( 1*32+19) /* MP Capable. */ ++#define X86_FEATURE_NX ( 1*32+20) /* Execute Disable */ ++#define X86_FEATURE_MMXEXT ( 1*32+22) /* AMD MMX extensions */ ++#define X86_FEATURE_FXSR_OPT ( 1*32+25) /* FXSAVE/FXRSTOR optimizations */ ++#define X86_FEATURE_GBPAGES ( 1*32+26) /* "pdpe1gb" GB pages */ ++#define X86_FEATURE_RDTSCP ( 1*32+27) /* RDTSCP */ ++#define X86_FEATURE_LM ( 1*32+29) /* Long Mode (x86-64) */ ++#define X86_FEATURE_3DNOWEXT ( 1*32+30) /* AMD 3DNow! extensions */ ++#define X86_FEATURE_3DNOW ( 1*32+31) /* 3DNow! 
*/ + + /* Transmeta-defined CPU features, CPUID level 0x80860001, word 2 */ +-#define X86_FEATURE_RECOVERY ( 2*32+ 0) /* CPU in recovery mode */ +-#define X86_FEATURE_LONGRUN ( 2*32+ 1) /* Longrun power control */ +-#define X86_FEATURE_LRTI ( 2*32+ 3) /* LongRun table interface */ ++#define X86_FEATURE_RECOVERY ( 2*32+ 0) /* CPU in recovery mode */ ++#define X86_FEATURE_LONGRUN ( 2*32+ 1) /* Longrun power control */ ++#define X86_FEATURE_LRTI ( 2*32+ 3) /* LongRun table interface */ + + /* Other features, Linux-defined mapping, word 3 */ + /* This range is used for feature bits which conflict or are synthesized */ +-#define X86_FEATURE_CXMMX ( 3*32+ 0) /* Cyrix MMX extensions */ +-#define X86_FEATURE_K6_MTRR ( 3*32+ 1) /* AMD K6 nonstandard MTRRs */ +-#define X86_FEATURE_CYRIX_ARR ( 3*32+ 2) /* Cyrix ARRs (= MTRRs) */ +-#define X86_FEATURE_CENTAUR_MCR ( 3*32+ 3) /* Centaur MCRs (= MTRRs) */ ++#define X86_FEATURE_CXMMX ( 3*32+ 0) /* Cyrix MMX extensions */ ++#define X86_FEATURE_K6_MTRR ( 3*32+ 1) /* AMD K6 nonstandard MTRRs */ ++#define X86_FEATURE_CYRIX_ARR ( 3*32+ 2) /* Cyrix ARRs (= MTRRs) */ ++#define X86_FEATURE_CENTAUR_MCR ( 3*32+ 3) /* Centaur MCRs (= MTRRs) */ + /* cpu types for specific tunings: */ +-#define X86_FEATURE_K8 ( 3*32+ 4) /* "" Opteron, Athlon64 */ +-#define X86_FEATURE_K7 ( 3*32+ 5) /* "" Athlon */ +-#define X86_FEATURE_P3 ( 3*32+ 6) /* "" P3 */ +-#define X86_FEATURE_P4 ( 3*32+ 7) /* "" P4 */ +-#define X86_FEATURE_CONSTANT_TSC ( 3*32+ 8) /* TSC ticks at a constant rate */ +-#define X86_FEATURE_UP ( 3*32+ 9) /* smp kernel running on up */ +-#define X86_FEATURE_ART ( 3*32+10) /* Platform has always running timer (ART) */ +-#define X86_FEATURE_ARCH_PERFMON ( 3*32+11) /* Intel Architectural PerfMon */ +-#define X86_FEATURE_PEBS ( 3*32+12) /* Precise-Event Based Sampling */ +-#define X86_FEATURE_BTS ( 3*32+13) /* Branch Trace Store */ +-#define X86_FEATURE_SYSCALL32 ( 3*32+14) /* "" syscall in ia32 userspace */ +-#define X86_FEATURE_SYSENTER32 ( 
3*32+15) /* "" sysenter in ia32 userspace */ +-#define X86_FEATURE_REP_GOOD ( 3*32+16) /* rep microcode works well */ +-#define X86_FEATURE_MFENCE_RDTSC ( 3*32+17) /* "" Mfence synchronizes RDTSC */ +-#define X86_FEATURE_LFENCE_RDTSC ( 3*32+18) /* "" Lfence synchronizes RDTSC */ +-#define X86_FEATURE_ACC_POWER ( 3*32+19) /* AMD Accumulated Power Mechanism */ +-#define X86_FEATURE_NOPL ( 3*32+20) /* The NOPL (0F 1F) instructions */ +-#define X86_FEATURE_ALWAYS ( 3*32+21) /* "" Always-present feature */ +-#define X86_FEATURE_XTOPOLOGY ( 3*32+22) /* cpu topology enum extensions */ +-#define X86_FEATURE_TSC_RELIABLE ( 3*32+23) /* TSC is known to be reliable */ +-#define X86_FEATURE_NONSTOP_TSC ( 3*32+24) /* TSC does not stop in C states */ +-#define X86_FEATURE_CPUID ( 3*32+25) /* CPU has CPUID instruction itself */ +-#define X86_FEATURE_EXTD_APICID ( 3*32+26) /* has extended APICID (8 bits) */ +-#define X86_FEATURE_AMD_DCM ( 3*32+27) /* multi-node processor */ +-#define X86_FEATURE_APERFMPERF ( 3*32+28) /* APERFMPERF */ +-#define X86_FEATURE_NONSTOP_TSC_S3 ( 3*32+30) /* TSC doesn't stop in S3 state */ +-#define X86_FEATURE_TSC_KNOWN_FREQ ( 3*32+31) /* TSC has known frequency */ ++#define X86_FEATURE_K8 ( 3*32+ 4) /* "" Opteron, Athlon64 */ ++#define X86_FEATURE_K7 ( 3*32+ 5) /* "" Athlon */ ++#define X86_FEATURE_P3 ( 3*32+ 6) /* "" P3 */ ++#define X86_FEATURE_P4 ( 3*32+ 7) /* "" P4 */ ++#define X86_FEATURE_CONSTANT_TSC ( 3*32+ 8) /* TSC ticks at a constant rate */ ++#define X86_FEATURE_UP ( 3*32+ 9) /* smp kernel running on up */ ++#define X86_FEATURE_ART ( 3*32+10) /* Platform has always running timer (ART) */ ++#define X86_FEATURE_ARCH_PERFMON ( 3*32+11) /* Intel Architectural PerfMon */ ++#define X86_FEATURE_PEBS ( 3*32+12) /* Precise-Event Based Sampling */ ++#define X86_FEATURE_BTS ( 3*32+13) /* Branch Trace Store */ ++#define X86_FEATURE_SYSCALL32 ( 3*32+14) /* "" syscall in ia32 userspace */ ++#define X86_FEATURE_SYSENTER32 ( 3*32+15) /* "" sysenter in ia32 
userspace */ ++#define X86_FEATURE_REP_GOOD ( 3*32+16) /* rep microcode works well */ ++#define X86_FEATURE_MFENCE_RDTSC ( 3*32+17) /* "" Mfence synchronizes RDTSC */ ++#define X86_FEATURE_LFENCE_RDTSC ( 3*32+18) /* "" Lfence synchronizes RDTSC */ ++#define X86_FEATURE_ACC_POWER ( 3*32+19) /* AMD Accumulated Power Mechanism */ ++#define X86_FEATURE_NOPL ( 3*32+20) /* The NOPL (0F 1F) instructions */ ++#define X86_FEATURE_ALWAYS ( 3*32+21) /* "" Always-present feature */ ++#define X86_FEATURE_XTOPOLOGY ( 3*32+22) /* cpu topology enum extensions */ ++#define X86_FEATURE_TSC_RELIABLE ( 3*32+23) /* TSC is known to be reliable */ ++#define X86_FEATURE_NONSTOP_TSC ( 3*32+24) /* TSC does not stop in C states */ ++#define X86_FEATURE_CPUID ( 3*32+25) /* CPU has CPUID instruction itself */ ++#define X86_FEATURE_EXTD_APICID ( 3*32+26) /* has extended APICID (8 bits) */ ++#define X86_FEATURE_AMD_DCM ( 3*32+27) /* multi-node processor */ ++#define X86_FEATURE_APERFMPERF ( 3*32+28) /* APERFMPERF */ ++#define X86_FEATURE_NONSTOP_TSC_S3 ( 3*32+30) /* TSC doesn't stop in S3 state */ ++#define X86_FEATURE_TSC_KNOWN_FREQ ( 3*32+31) /* TSC has known frequency */ + + /* Intel-defined CPU features, CPUID level 0x00000001 (ecx), word 4 */ +-#define X86_FEATURE_XMM3 ( 4*32+ 0) /* "pni" SSE-3 */ +-#define X86_FEATURE_PCLMULQDQ ( 4*32+ 1) /* PCLMULQDQ instruction */ +-#define X86_FEATURE_DTES64 ( 4*32+ 2) /* 64-bit Debug Store */ +-#define X86_FEATURE_MWAIT ( 4*32+ 3) /* "monitor" Monitor/Mwait support */ +-#define X86_FEATURE_DSCPL ( 4*32+ 4) /* "ds_cpl" CPL Qual. 
Debug Store */ +-#define X86_FEATURE_VMX ( 4*32+ 5) /* Hardware virtualization */ +-#define X86_FEATURE_SMX ( 4*32+ 6) /* Safer mode */ +-#define X86_FEATURE_EST ( 4*32+ 7) /* Enhanced SpeedStep */ +-#define X86_FEATURE_TM2 ( 4*32+ 8) /* Thermal Monitor 2 */ +-#define X86_FEATURE_SSSE3 ( 4*32+ 9) /* Supplemental SSE-3 */ +-#define X86_FEATURE_CID ( 4*32+10) /* Context ID */ +-#define X86_FEATURE_SDBG ( 4*32+11) /* Silicon Debug */ +-#define X86_FEATURE_FMA ( 4*32+12) /* Fused multiply-add */ +-#define X86_FEATURE_CX16 ( 4*32+13) /* CMPXCHG16B */ +-#define X86_FEATURE_XTPR ( 4*32+14) /* Send Task Priority Messages */ +-#define X86_FEATURE_PDCM ( 4*32+15) /* Performance Capabilities */ +-#define X86_FEATURE_PCID ( 4*32+17) /* Process Context Identifiers */ +-#define X86_FEATURE_DCA ( 4*32+18) /* Direct Cache Access */ +-#define X86_FEATURE_XMM4_1 ( 4*32+19) /* "sse4_1" SSE-4.1 */ +-#define X86_FEATURE_XMM4_2 ( 4*32+20) /* "sse4_2" SSE-4.2 */ +-#define X86_FEATURE_X2APIC ( 4*32+21) /* x2APIC */ +-#define X86_FEATURE_MOVBE ( 4*32+22) /* MOVBE instruction */ +-#define X86_FEATURE_POPCNT ( 4*32+23) /* POPCNT instruction */ ++#define X86_FEATURE_XMM3 ( 4*32+ 0) /* "pni" SSE-3 */ ++#define X86_FEATURE_PCLMULQDQ ( 4*32+ 1) /* PCLMULQDQ instruction */ ++#define X86_FEATURE_DTES64 ( 4*32+ 2) /* 64-bit Debug Store */ ++#define X86_FEATURE_MWAIT ( 4*32+ 3) /* "monitor" Monitor/Mwait support */ ++#define X86_FEATURE_DSCPL ( 4*32+ 4) /* "ds_cpl" CPL Qual. 
Debug Store */ ++#define X86_FEATURE_VMX ( 4*32+ 5) /* Hardware virtualization */ ++#define X86_FEATURE_SMX ( 4*32+ 6) /* Safer mode */ ++#define X86_FEATURE_EST ( 4*32+ 7) /* Enhanced SpeedStep */ ++#define X86_FEATURE_TM2 ( 4*32+ 8) /* Thermal Monitor 2 */ ++#define X86_FEATURE_SSSE3 ( 4*32+ 9) /* Supplemental SSE-3 */ ++#define X86_FEATURE_CID ( 4*32+10) /* Context ID */ ++#define X86_FEATURE_SDBG ( 4*32+11) /* Silicon Debug */ ++#define X86_FEATURE_FMA ( 4*32+12) /* Fused multiply-add */ ++#define X86_FEATURE_CX16 ( 4*32+13) /* CMPXCHG16B */ ++#define X86_FEATURE_XTPR ( 4*32+14) /* Send Task Priority Messages */ ++#define X86_FEATURE_PDCM ( 4*32+15) /* Performance Capabilities */ ++#define X86_FEATURE_PCID ( 4*32+17) /* Process Context Identifiers */ ++#define X86_FEATURE_DCA ( 4*32+18) /* Direct Cache Access */ ++#define X86_FEATURE_XMM4_1 ( 4*32+19) /* "sse4_1" SSE-4.1 */ ++#define X86_FEATURE_XMM4_2 ( 4*32+20) /* "sse4_2" SSE-4.2 */ ++#define X86_FEATURE_X2APIC ( 4*32+21) /* x2APIC */ ++#define X86_FEATURE_MOVBE ( 4*32+22) /* MOVBE instruction */ ++#define X86_FEATURE_POPCNT ( 4*32+23) /* POPCNT instruction */ + #define X86_FEATURE_TSC_DEADLINE_TIMER ( 4*32+24) /* Tsc deadline timer */ +-#define X86_FEATURE_AES ( 4*32+25) /* AES instructions */ +-#define X86_FEATURE_XSAVE ( 4*32+26) /* XSAVE/XRSTOR/XSETBV/XGETBV */ +-#define X86_FEATURE_OSXSAVE ( 4*32+27) /* "" XSAVE enabled in the OS */ +-#define X86_FEATURE_AVX ( 4*32+28) /* Advanced Vector Extensions */ +-#define X86_FEATURE_F16C ( 4*32+29) /* 16-bit fp conversions */ +-#define X86_FEATURE_RDRAND ( 4*32+30) /* The RDRAND instruction */ +-#define X86_FEATURE_HYPERVISOR ( 4*32+31) /* Running on a hypervisor */ ++#define X86_FEATURE_AES ( 4*32+25) /* AES instructions */ ++#define X86_FEATURE_XSAVE ( 4*32+26) /* XSAVE/XRSTOR/XSETBV/XGETBV */ ++#define X86_FEATURE_OSXSAVE ( 4*32+27) /* "" XSAVE enabled in the OS */ ++#define X86_FEATURE_AVX ( 4*32+28) /* Advanced Vector Extensions */ ++#define X86_FEATURE_F16C 
( 4*32+29) /* 16-bit fp conversions */ ++#define X86_FEATURE_RDRAND ( 4*32+30) /* The RDRAND instruction */ ++#define X86_FEATURE_HYPERVISOR ( 4*32+31) /* Running on a hypervisor */ + + /* VIA/Cyrix/Centaur-defined CPU features, CPUID level 0xC0000001, word 5 */ +-#define X86_FEATURE_XSTORE ( 5*32+ 2) /* "rng" RNG present (xstore) */ +-#define X86_FEATURE_XSTORE_EN ( 5*32+ 3) /* "rng_en" RNG enabled */ +-#define X86_FEATURE_XCRYPT ( 5*32+ 6) /* "ace" on-CPU crypto (xcrypt) */ +-#define X86_FEATURE_XCRYPT_EN ( 5*32+ 7) /* "ace_en" on-CPU crypto enabled */ +-#define X86_FEATURE_ACE2 ( 5*32+ 8) /* Advanced Cryptography Engine v2 */ +-#define X86_FEATURE_ACE2_EN ( 5*32+ 9) /* ACE v2 enabled */ +-#define X86_FEATURE_PHE ( 5*32+10) /* PadLock Hash Engine */ +-#define X86_FEATURE_PHE_EN ( 5*32+11) /* PHE enabled */ +-#define X86_FEATURE_PMM ( 5*32+12) /* PadLock Montgomery Multiplier */ +-#define X86_FEATURE_PMM_EN ( 5*32+13) /* PMM enabled */ ++#define X86_FEATURE_XSTORE ( 5*32+ 2) /* "rng" RNG present (xstore) */ ++#define X86_FEATURE_XSTORE_EN ( 5*32+ 3) /* "rng_en" RNG enabled */ ++#define X86_FEATURE_XCRYPT ( 5*32+ 6) /* "ace" on-CPU crypto (xcrypt) */ ++#define X86_FEATURE_XCRYPT_EN ( 5*32+ 7) /* "ace_en" on-CPU crypto enabled */ ++#define X86_FEATURE_ACE2 ( 5*32+ 8) /* Advanced Cryptography Engine v2 */ ++#define X86_FEATURE_ACE2_EN ( 5*32+ 9) /* ACE v2 enabled */ ++#define X86_FEATURE_PHE ( 5*32+10) /* PadLock Hash Engine */ ++#define X86_FEATURE_PHE_EN ( 5*32+11) /* PHE enabled */ ++#define X86_FEATURE_PMM ( 5*32+12) /* PadLock Montgomery Multiplier */ ++#define X86_FEATURE_PMM_EN ( 5*32+13) /* PMM enabled */ + + /* More extended AMD flags: CPUID level 0x80000001, ecx, word 6 */ +-#define X86_FEATURE_LAHF_LM ( 6*32+ 0) /* LAHF/SAHF in long mode */ +-#define X86_FEATURE_CMP_LEGACY ( 6*32+ 1) /* If yes HyperThreading not valid */ +-#define X86_FEATURE_SVM ( 6*32+ 2) /* Secure virtual machine */ +-#define X86_FEATURE_EXTAPIC ( 6*32+ 3) /* Extended APIC space */ 
+-#define X86_FEATURE_CR8_LEGACY ( 6*32+ 4) /* CR8 in 32-bit mode */ +-#define X86_FEATURE_ABM ( 6*32+ 5) /* Advanced bit manipulation */ +-#define X86_FEATURE_SSE4A ( 6*32+ 6) /* SSE-4A */ +-#define X86_FEATURE_MISALIGNSSE ( 6*32+ 7) /* Misaligned SSE mode */ +-#define X86_FEATURE_3DNOWPREFETCH ( 6*32+ 8) /* 3DNow prefetch instructions */ +-#define X86_FEATURE_OSVW ( 6*32+ 9) /* OS Visible Workaround */ +-#define X86_FEATURE_IBS ( 6*32+10) /* Instruction Based Sampling */ +-#define X86_FEATURE_XOP ( 6*32+11) /* extended AVX instructions */ +-#define X86_FEATURE_SKINIT ( 6*32+12) /* SKINIT/STGI instructions */ +-#define X86_FEATURE_WDT ( 6*32+13) /* Watchdog timer */ +-#define X86_FEATURE_LWP ( 6*32+15) /* Light Weight Profiling */ +-#define X86_FEATURE_FMA4 ( 6*32+16) /* 4 operands MAC instructions */ +-#define X86_FEATURE_TCE ( 6*32+17) /* translation cache extension */ +-#define X86_FEATURE_NODEID_MSR ( 6*32+19) /* NodeId MSR */ +-#define X86_FEATURE_TBM ( 6*32+21) /* trailing bit manipulations */ +-#define X86_FEATURE_TOPOEXT ( 6*32+22) /* topology extensions CPUID leafs */ +-#define X86_FEATURE_PERFCTR_CORE ( 6*32+23) /* core performance counter extensions */ +-#define X86_FEATURE_PERFCTR_NB ( 6*32+24) /* NB performance counter extensions */ +-#define X86_FEATURE_BPEXT (6*32+26) /* data breakpoint extension */ +-#define X86_FEATURE_PTSC ( 6*32+27) /* performance time-stamp counter */ +-#define X86_FEATURE_PERFCTR_LLC ( 6*32+28) /* Last Level Cache performance counter extensions */ +-#define X86_FEATURE_MWAITX ( 6*32+29) /* MWAIT extension (MONITORX/MWAITX) */ ++#define X86_FEATURE_LAHF_LM ( 6*32+ 0) /* LAHF/SAHF in long mode */ ++#define X86_FEATURE_CMP_LEGACY ( 6*32+ 1) /* If yes HyperThreading not valid */ ++#define X86_FEATURE_SVM ( 6*32+ 2) /* Secure virtual machine */ ++#define X86_FEATURE_EXTAPIC ( 6*32+ 3) /* Extended APIC space */ ++#define X86_FEATURE_CR8_LEGACY ( 6*32+ 4) /* CR8 in 32-bit mode */ ++#define X86_FEATURE_ABM ( 6*32+ 5) /* Advanced bit 
manipulation */ ++#define X86_FEATURE_SSE4A ( 6*32+ 6) /* SSE-4A */ ++#define X86_FEATURE_MISALIGNSSE ( 6*32+ 7) /* Misaligned SSE mode */ ++#define X86_FEATURE_3DNOWPREFETCH ( 6*32+ 8) /* 3DNow prefetch instructions */ ++#define X86_FEATURE_OSVW ( 6*32+ 9) /* OS Visible Workaround */ ++#define X86_FEATURE_IBS ( 6*32+10) /* Instruction Based Sampling */ ++#define X86_FEATURE_XOP ( 6*32+11) /* extended AVX instructions */ ++#define X86_FEATURE_SKINIT ( 6*32+12) /* SKINIT/STGI instructions */ ++#define X86_FEATURE_WDT ( 6*32+13) /* Watchdog timer */ ++#define X86_FEATURE_LWP ( 6*32+15) /* Light Weight Profiling */ ++#define X86_FEATURE_FMA4 ( 6*32+16) /* 4 operands MAC instructions */ ++#define X86_FEATURE_TCE ( 6*32+17) /* translation cache extension */ ++#define X86_FEATURE_NODEID_MSR ( 6*32+19) /* NodeId MSR */ ++#define X86_FEATURE_TBM ( 6*32+21) /* trailing bit manipulations */ ++#define X86_FEATURE_TOPOEXT ( 6*32+22) /* topology extensions CPUID leafs */ ++#define X86_FEATURE_PERFCTR_CORE ( 6*32+23) /* core performance counter extensions */ ++#define X86_FEATURE_PERFCTR_NB ( 6*32+24) /* NB performance counter extensions */ ++#define X86_FEATURE_BPEXT (6*32+26) /* data breakpoint extension */ ++#define X86_FEATURE_PTSC ( 6*32+27) /* performance time-stamp counter */ ++#define X86_FEATURE_PERFCTR_LLC ( 6*32+28) /* Last Level Cache performance counter extensions */ ++#define X86_FEATURE_MWAITX ( 6*32+29) /* MWAIT extension (MONITORX/MWAITX) */ + + /* + * Auxiliary flags: Linux defined - For features scattered in various +@@ -192,152 +192,152 @@ + * + * Reuse free bits when adding new feature flags! 
+ */ +-#define X86_FEATURE_RING3MWAIT ( 7*32+ 0) /* Ring 3 MONITOR/MWAIT */ +-#define X86_FEATURE_CPUID_FAULT ( 7*32+ 1) /* Intel CPUID faulting */ +-#define X86_FEATURE_CPB ( 7*32+ 2) /* AMD Core Performance Boost */ +-#define X86_FEATURE_EPB ( 7*32+ 3) /* IA32_ENERGY_PERF_BIAS support */ +-#define X86_FEATURE_CAT_L3 ( 7*32+ 4) /* Cache Allocation Technology L3 */ +-#define X86_FEATURE_CAT_L2 ( 7*32+ 5) /* Cache Allocation Technology L2 */ +-#define X86_FEATURE_CDP_L3 ( 7*32+ 6) /* Code and Data Prioritization L3 */ ++#define X86_FEATURE_RING3MWAIT ( 7*32+ 0) /* Ring 3 MONITOR/MWAIT */ ++#define X86_FEATURE_CPUID_FAULT ( 7*32+ 1) /* Intel CPUID faulting */ ++#define X86_FEATURE_CPB ( 7*32+ 2) /* AMD Core Performance Boost */ ++#define X86_FEATURE_EPB ( 7*32+ 3) /* IA32_ENERGY_PERF_BIAS support */ ++#define X86_FEATURE_CAT_L3 ( 7*32+ 4) /* Cache Allocation Technology L3 */ ++#define X86_FEATURE_CAT_L2 ( 7*32+ 5) /* Cache Allocation Technology L2 */ ++#define X86_FEATURE_CDP_L3 ( 7*32+ 6) /* Code and Data Prioritization L3 */ + +-#define X86_FEATURE_HW_PSTATE ( 7*32+ 8) /* AMD HW-PState */ +-#define X86_FEATURE_PROC_FEEDBACK ( 7*32+ 9) /* AMD ProcFeedbackInterface */ +-#define X86_FEATURE_SME ( 7*32+10) /* AMD Secure Memory Encryption */ ++#define X86_FEATURE_HW_PSTATE ( 7*32+ 8) /* AMD HW-PState */ ++#define X86_FEATURE_PROC_FEEDBACK ( 7*32+ 9) /* AMD ProcFeedbackInterface */ ++#define X86_FEATURE_SME ( 7*32+10) /* AMD Secure Memory Encryption */ + +-#define X86_FEATURE_INTEL_PPIN ( 7*32+14) /* Intel Processor Inventory Number */ +-#define X86_FEATURE_INTEL_PT ( 7*32+15) /* Intel Processor Trace */ +-#define X86_FEATURE_AVX512_4VNNIW (7*32+16) /* AVX-512 Neural Network Instructions */ +-#define X86_FEATURE_AVX512_4FMAPS (7*32+17) /* AVX-512 Multiply Accumulation Single precision */ ++#define X86_FEATURE_INTEL_PPIN ( 7*32+14) /* Intel Processor Inventory Number */ ++#define X86_FEATURE_INTEL_PT ( 7*32+15) /* Intel Processor Trace */ ++#define 
X86_FEATURE_AVX512_4VNNIW (7*32+16) /* AVX-512 Neural Network Instructions */ ++#define X86_FEATURE_AVX512_4FMAPS (7*32+17) /* AVX-512 Multiply Accumulation Single precision */ + +-#define X86_FEATURE_MBA ( 7*32+18) /* Memory Bandwidth Allocation */ ++#define X86_FEATURE_MBA ( 7*32+18) /* Memory Bandwidth Allocation */ + + /* Virtualization flags: Linux defined, word 8 */ +-#define X86_FEATURE_TPR_SHADOW ( 8*32+ 0) /* Intel TPR Shadow */ +-#define X86_FEATURE_VNMI ( 8*32+ 1) /* Intel Virtual NMI */ +-#define X86_FEATURE_FLEXPRIORITY ( 8*32+ 2) /* Intel FlexPriority */ +-#define X86_FEATURE_EPT ( 8*32+ 3) /* Intel Extended Page Table */ +-#define X86_FEATURE_VPID ( 8*32+ 4) /* Intel Virtual Processor ID */ ++#define X86_FEATURE_TPR_SHADOW ( 8*32+ 0) /* Intel TPR Shadow */ ++#define X86_FEATURE_VNMI ( 8*32+ 1) /* Intel Virtual NMI */ ++#define X86_FEATURE_FLEXPRIORITY ( 8*32+ 2) /* Intel FlexPriority */ ++#define X86_FEATURE_EPT ( 8*32+ 3) /* Intel Extended Page Table */ ++#define X86_FEATURE_VPID ( 8*32+ 4) /* Intel Virtual Processor ID */ + +-#define X86_FEATURE_VMMCALL ( 8*32+15) /* Prefer vmmcall to vmcall */ +-#define X86_FEATURE_XENPV ( 8*32+16) /* "" Xen paravirtual guest */ ++#define X86_FEATURE_VMMCALL ( 8*32+15) /* Prefer vmmcall to vmcall */ ++#define X86_FEATURE_XENPV ( 8*32+16) /* "" Xen paravirtual guest */ + + + /* Intel-defined CPU features, CPUID level 0x00000007:0 (ebx), word 9 */ +-#define X86_FEATURE_FSGSBASE ( 9*32+ 0) /* {RD/WR}{FS/GS}BASE instructions*/ +-#define X86_FEATURE_TSC_ADJUST ( 9*32+ 1) /* TSC adjustment MSR 0x3b */ +-#define X86_FEATURE_BMI1 ( 9*32+ 3) /* 1st group bit manipulation extensions */ +-#define X86_FEATURE_HLE ( 9*32+ 4) /* Hardware Lock Elision */ +-#define X86_FEATURE_AVX2 ( 9*32+ 5) /* AVX2 instructions */ +-#define X86_FEATURE_SMEP ( 9*32+ 7) /* Supervisor Mode Execution Protection */ +-#define X86_FEATURE_BMI2 ( 9*32+ 8) /* 2nd group bit manipulation extensions */ +-#define X86_FEATURE_ERMS ( 9*32+ 9) /* Enhanced REP 
MOVSB/STOSB */ +-#define X86_FEATURE_INVPCID ( 9*32+10) /* Invalidate Processor Context ID */ +-#define X86_FEATURE_RTM ( 9*32+11) /* Restricted Transactional Memory */ +-#define X86_FEATURE_CQM ( 9*32+12) /* Cache QoS Monitoring */ +-#define X86_FEATURE_MPX ( 9*32+14) /* Memory Protection Extension */ +-#define X86_FEATURE_RDT_A ( 9*32+15) /* Resource Director Technology Allocation */ +-#define X86_FEATURE_AVX512F ( 9*32+16) /* AVX-512 Foundation */ +-#define X86_FEATURE_AVX512DQ ( 9*32+17) /* AVX-512 DQ (Double/Quad granular) Instructions */ +-#define X86_FEATURE_RDSEED ( 9*32+18) /* The RDSEED instruction */ +-#define X86_FEATURE_ADX ( 9*32+19) /* The ADCX and ADOX instructions */ +-#define X86_FEATURE_SMAP ( 9*32+20) /* Supervisor Mode Access Prevention */ +-#define X86_FEATURE_AVX512IFMA ( 9*32+21) /* AVX-512 Integer Fused Multiply-Add instructions */ +-#define X86_FEATURE_CLFLUSHOPT ( 9*32+23) /* CLFLUSHOPT instruction */ +-#define X86_FEATURE_CLWB ( 9*32+24) /* CLWB instruction */ +-#define X86_FEATURE_AVX512PF ( 9*32+26) /* AVX-512 Prefetch */ +-#define X86_FEATURE_AVX512ER ( 9*32+27) /* AVX-512 Exponential and Reciprocal */ +-#define X86_FEATURE_AVX512CD ( 9*32+28) /* AVX-512 Conflict Detection */ +-#define X86_FEATURE_SHA_NI ( 9*32+29) /* SHA1/SHA256 Instruction Extensions */ +-#define X86_FEATURE_AVX512BW ( 9*32+30) /* AVX-512 BW (Byte/Word granular) Instructions */ +-#define X86_FEATURE_AVX512VL ( 9*32+31) /* AVX-512 VL (128/256 Vector Length) Extensions */ ++#define X86_FEATURE_FSGSBASE ( 9*32+ 0) /* {RD/WR}{FS/GS}BASE instructions*/ ++#define X86_FEATURE_TSC_ADJUST ( 9*32+ 1) /* TSC adjustment MSR 0x3b */ ++#define X86_FEATURE_BMI1 ( 9*32+ 3) /* 1st group bit manipulation extensions */ ++#define X86_FEATURE_HLE ( 9*32+ 4) /* Hardware Lock Elision */ ++#define X86_FEATURE_AVX2 ( 9*32+ 5) /* AVX2 instructions */ ++#define X86_FEATURE_SMEP ( 9*32+ 7) /* Supervisor Mode Execution Protection */ ++#define X86_FEATURE_BMI2 ( 9*32+ 8) /* 2nd group bit 
manipulation extensions */ ++#define X86_FEATURE_ERMS ( 9*32+ 9) /* Enhanced REP MOVSB/STOSB */ ++#define X86_FEATURE_INVPCID ( 9*32+10) /* Invalidate Processor Context ID */ ++#define X86_FEATURE_RTM ( 9*32+11) /* Restricted Transactional Memory */ ++#define X86_FEATURE_CQM ( 9*32+12) /* Cache QoS Monitoring */ ++#define X86_FEATURE_MPX ( 9*32+14) /* Memory Protection Extension */ ++#define X86_FEATURE_RDT_A ( 9*32+15) /* Resource Director Technology Allocation */ ++#define X86_FEATURE_AVX512F ( 9*32+16) /* AVX-512 Foundation */ ++#define X86_FEATURE_AVX512DQ ( 9*32+17) /* AVX-512 DQ (Double/Quad granular) Instructions */ ++#define X86_FEATURE_RDSEED ( 9*32+18) /* The RDSEED instruction */ ++#define X86_FEATURE_ADX ( 9*32+19) /* The ADCX and ADOX instructions */ ++#define X86_FEATURE_SMAP ( 9*32+20) /* Supervisor Mode Access Prevention */ ++#define X86_FEATURE_AVX512IFMA ( 9*32+21) /* AVX-512 Integer Fused Multiply-Add instructions */ ++#define X86_FEATURE_CLFLUSHOPT ( 9*32+23) /* CLFLUSHOPT instruction */ ++#define X86_FEATURE_CLWB ( 9*32+24) /* CLWB instruction */ ++#define X86_FEATURE_AVX512PF ( 9*32+26) /* AVX-512 Prefetch */ ++#define X86_FEATURE_AVX512ER ( 9*32+27) /* AVX-512 Exponential and Reciprocal */ ++#define X86_FEATURE_AVX512CD ( 9*32+28) /* AVX-512 Conflict Detection */ ++#define X86_FEATURE_SHA_NI ( 9*32+29) /* SHA1/SHA256 Instruction Extensions */ ++#define X86_FEATURE_AVX512BW ( 9*32+30) /* AVX-512 BW (Byte/Word granular) Instructions */ ++#define X86_FEATURE_AVX512VL ( 9*32+31) /* AVX-512 VL (128/256 Vector Length) Extensions */ + + /* Extended state features, CPUID level 0x0000000d:1 (eax), word 10 */ +-#define X86_FEATURE_XSAVEOPT (10*32+ 0) /* XSAVEOPT */ +-#define X86_FEATURE_XSAVEC (10*32+ 1) /* XSAVEC */ +-#define X86_FEATURE_XGETBV1 (10*32+ 2) /* XGETBV with ECX = 1 */ +-#define X86_FEATURE_XSAVES (10*32+ 3) /* XSAVES/XRSTORS */ ++#define X86_FEATURE_XSAVEOPT (10*32+ 0) /* XSAVEOPT */ ++#define X86_FEATURE_XSAVEC (10*32+ 1) /* XSAVEC */ 
++#define X86_FEATURE_XGETBV1 (10*32+ 2) /* XGETBV with ECX = 1 */ ++#define X86_FEATURE_XSAVES (10*32+ 3) /* XSAVES/XRSTORS */ + + /* Intel-defined CPU QoS Sub-leaf, CPUID level 0x0000000F:0 (edx), word 11 */ +-#define X86_FEATURE_CQM_LLC (11*32+ 1) /* LLC QoS if 1 */ ++#define X86_FEATURE_CQM_LLC (11*32+ 1) /* LLC QoS if 1 */ + + /* Intel-defined CPU QoS Sub-leaf, CPUID level 0x0000000F:1 (edx), word 12 */ +-#define X86_FEATURE_CQM_OCCUP_LLC (12*32+ 0) /* LLC occupancy monitoring if 1 */ +-#define X86_FEATURE_CQM_MBM_TOTAL (12*32+ 1) /* LLC Total MBM monitoring */ +-#define X86_FEATURE_CQM_MBM_LOCAL (12*32+ 2) /* LLC Local MBM monitoring */ ++#define X86_FEATURE_CQM_OCCUP_LLC (12*32+ 0) /* LLC occupancy monitoring if 1 */ ++#define X86_FEATURE_CQM_MBM_TOTAL (12*32+ 1) /* LLC Total MBM monitoring */ ++#define X86_FEATURE_CQM_MBM_LOCAL (12*32+ 2) /* LLC Local MBM monitoring */ + + /* AMD-defined CPU features, CPUID level 0x80000008 (ebx), word 13 */ +-#define X86_FEATURE_CLZERO (13*32+0) /* CLZERO instruction */ +-#define X86_FEATURE_IRPERF (13*32+1) /* Instructions Retired Count */ ++#define X86_FEATURE_CLZERO (13*32+0) /* CLZERO instruction */ ++#define X86_FEATURE_IRPERF (13*32+1) /* Instructions Retired Count */ + + /* Thermal and Power Management Leaf, CPUID level 0x00000006 (eax), word 14 */ +-#define X86_FEATURE_DTHERM (14*32+ 0) /* Digital Thermal Sensor */ +-#define X86_FEATURE_IDA (14*32+ 1) /* Intel Dynamic Acceleration */ +-#define X86_FEATURE_ARAT (14*32+ 2) /* Always Running APIC Timer */ +-#define X86_FEATURE_PLN (14*32+ 4) /* Intel Power Limit Notification */ +-#define X86_FEATURE_PTS (14*32+ 6) /* Intel Package Thermal Status */ +-#define X86_FEATURE_HWP (14*32+ 7) /* Intel Hardware P-states */ +-#define X86_FEATURE_HWP_NOTIFY (14*32+ 8) /* HWP Notification */ +-#define X86_FEATURE_HWP_ACT_WINDOW (14*32+ 9) /* HWP Activity Window */ +-#define X86_FEATURE_HWP_EPP (14*32+10) /* HWP Energy Perf. 
Preference */ +-#define X86_FEATURE_HWP_PKG_REQ (14*32+11) /* HWP Package Level Request */ ++#define X86_FEATURE_DTHERM (14*32+ 0) /* Digital Thermal Sensor */ ++#define X86_FEATURE_IDA (14*32+ 1) /* Intel Dynamic Acceleration */ ++#define X86_FEATURE_ARAT (14*32+ 2) /* Always Running APIC Timer */ ++#define X86_FEATURE_PLN (14*32+ 4) /* Intel Power Limit Notification */ ++#define X86_FEATURE_PTS (14*32+ 6) /* Intel Package Thermal Status */ ++#define X86_FEATURE_HWP (14*32+ 7) /* Intel Hardware P-states */ ++#define X86_FEATURE_HWP_NOTIFY (14*32+ 8) /* HWP Notification */ ++#define X86_FEATURE_HWP_ACT_WINDOW (14*32+ 9) /* HWP Activity Window */ ++#define X86_FEATURE_HWP_EPP (14*32+10) /* HWP Energy Perf. Preference */ ++#define X86_FEATURE_HWP_PKG_REQ (14*32+11) /* HWP Package Level Request */ + + /* AMD SVM Feature Identification, CPUID level 0x8000000a (edx), word 15 */ +-#define X86_FEATURE_NPT (15*32+ 0) /* Nested Page Table support */ +-#define X86_FEATURE_LBRV (15*32+ 1) /* LBR Virtualization support */ +-#define X86_FEATURE_SVML (15*32+ 2) /* "svm_lock" SVM locking MSR */ +-#define X86_FEATURE_NRIPS (15*32+ 3) /* "nrip_save" SVM next_rip save */ +-#define X86_FEATURE_TSCRATEMSR (15*32+ 4) /* "tsc_scale" TSC scaling support */ +-#define X86_FEATURE_VMCBCLEAN (15*32+ 5) /* "vmcb_clean" VMCB clean bits support */ +-#define X86_FEATURE_FLUSHBYASID (15*32+ 6) /* flush-by-ASID support */ +-#define X86_FEATURE_DECODEASSISTS (15*32+ 7) /* Decode Assists support */ +-#define X86_FEATURE_PAUSEFILTER (15*32+10) /* filtered pause intercept */ +-#define X86_FEATURE_PFTHRESHOLD (15*32+12) /* pause filter threshold */ +-#define X86_FEATURE_AVIC (15*32+13) /* Virtual Interrupt Controller */ +-#define X86_FEATURE_V_VMSAVE_VMLOAD (15*32+15) /* Virtual VMSAVE VMLOAD */ +-#define X86_FEATURE_VGIF (15*32+16) /* Virtual GIF */ ++#define X86_FEATURE_NPT (15*32+ 0) /* Nested Page Table support */ ++#define X86_FEATURE_LBRV (15*32+ 1) /* LBR Virtualization support */ ++#define 
X86_FEATURE_SVML (15*32+ 2) /* "svm_lock" SVM locking MSR */ ++#define X86_FEATURE_NRIPS (15*32+ 3) /* "nrip_save" SVM next_rip save */ ++#define X86_FEATURE_TSCRATEMSR (15*32+ 4) /* "tsc_scale" TSC scaling support */ ++#define X86_FEATURE_VMCBCLEAN (15*32+ 5) /* "vmcb_clean" VMCB clean bits support */ ++#define X86_FEATURE_FLUSHBYASID (15*32+ 6) /* flush-by-ASID support */ ++#define X86_FEATURE_DECODEASSISTS (15*32+ 7) /* Decode Assists support */ ++#define X86_FEATURE_PAUSEFILTER (15*32+10) /* filtered pause intercept */ ++#define X86_FEATURE_PFTHRESHOLD (15*32+12) /* pause filter threshold */ ++#define X86_FEATURE_AVIC (15*32+13) /* Virtual Interrupt Controller */ ++#define X86_FEATURE_V_VMSAVE_VMLOAD (15*32+15) /* Virtual VMSAVE VMLOAD */ ++#define X86_FEATURE_VGIF (15*32+16) /* Virtual GIF */ + + /* Intel-defined CPU features, CPUID level 0x00000007:0 (ecx), word 16 */ +-#define X86_FEATURE_AVX512VBMI (16*32+ 1) /* AVX512 Vector Bit Manipulation instructions*/ +-#define X86_FEATURE_PKU (16*32+ 3) /* Protection Keys for Userspace */ +-#define X86_FEATURE_OSPKE (16*32+ 4) /* OS Protection Keys Enable */ +-#define X86_FEATURE_AVX512_VBMI2 (16*32+ 6) /* Additional AVX512 Vector Bit Manipulation Instructions */ +-#define X86_FEATURE_GFNI (16*32+ 8) /* Galois Field New Instructions */ +-#define X86_FEATURE_VAES (16*32+ 9) /* Vector AES */ +-#define X86_FEATURE_VPCLMULQDQ (16*32+ 10) /* Carry-Less Multiplication Double Quadword */ +-#define X86_FEATURE_AVX512_VNNI (16*32+ 11) /* Vector Neural Network Instructions */ +-#define X86_FEATURE_AVX512_BITALG (16*32+12) /* Support for VPOPCNT[B,W] and VPSHUF-BITQMB */ +-#define X86_FEATURE_AVX512_VPOPCNTDQ (16*32+14) /* POPCNT for vectors of DW/QW */ +-#define X86_FEATURE_LA57 (16*32+16) /* 5-level page tables */ +-#define X86_FEATURE_RDPID (16*32+22) /* RDPID instruction */ ++#define X86_FEATURE_AVX512VBMI (16*32+ 1) /* AVX512 Vector Bit Manipulation instructions*/ ++#define X86_FEATURE_PKU (16*32+ 3) /* Protection Keys for 
Userspace */ ++#define X86_FEATURE_OSPKE (16*32+ 4) /* OS Protection Keys Enable */ ++#define X86_FEATURE_AVX512_VBMI2 (16*32+ 6) /* Additional AVX512 Vector Bit Manipulation Instructions */ ++#define X86_FEATURE_GFNI (16*32+ 8) /* Galois Field New Instructions */ ++#define X86_FEATURE_VAES (16*32+ 9) /* Vector AES */ ++#define X86_FEATURE_VPCLMULQDQ (16*32+ 10) /* Carry-Less Multiplication Double Quadword */ ++#define X86_FEATURE_AVX512_VNNI (16*32+ 11) /* Vector Neural Network Instructions */ ++#define X86_FEATURE_AVX512_BITALG (16*32+12) /* Support for VPOPCNT[B,W] and VPSHUF-BITQMB */ ++#define X86_FEATURE_AVX512_VPOPCNTDQ (16*32+14) /* POPCNT for vectors of DW/QW */ ++#define X86_FEATURE_LA57 (16*32+16) /* 5-level page tables */ ++#define X86_FEATURE_RDPID (16*32+22) /* RDPID instruction */ + + /* AMD-defined CPU features, CPUID level 0x80000007 (ebx), word 17 */ +-#define X86_FEATURE_OVERFLOW_RECOV (17*32+0) /* MCA overflow recovery support */ +-#define X86_FEATURE_SUCCOR (17*32+1) /* Uncorrectable error containment and recovery */ +-#define X86_FEATURE_SMCA (17*32+3) /* Scalable MCA */ ++#define X86_FEATURE_OVERFLOW_RECOV (17*32+0) /* MCA overflow recovery support */ ++#define X86_FEATURE_SUCCOR (17*32+1) /* Uncorrectable error containment and recovery */ ++#define X86_FEATURE_SMCA (17*32+3) /* Scalable MCA */ + + /* + * BUG word(s) + */ +-#define X86_BUG(x) (NCAPINTS*32 + (x)) ++#define X86_BUG(x) (NCAPINTS*32 + (x)) + +-#define X86_BUG_F00F X86_BUG(0) /* Intel F00F */ +-#define X86_BUG_FDIV X86_BUG(1) /* FPU FDIV */ +-#define X86_BUG_COMA X86_BUG(2) /* Cyrix 6x86 coma */ +-#define X86_BUG_AMD_TLB_MMATCH X86_BUG(3) /* "tlb_mmatch" AMD Erratum 383 */ +-#define X86_BUG_AMD_APIC_C1E X86_BUG(4) /* "apic_c1e" AMD Erratum 400 */ +-#define X86_BUG_11AP X86_BUG(5) /* Bad local APIC aka 11AP */ +-#define X86_BUG_FXSAVE_LEAK X86_BUG(6) /* FXSAVE leaks FOP/FIP/FOP */ +-#define X86_BUG_CLFLUSH_MONITOR X86_BUG(7) /* AAI65, CLFLUSH required before MONITOR */ +-#define 
X86_BUG_SYSRET_SS_ATTRS X86_BUG(8) /* SYSRET doesn't fix up SS attrs */ ++#define X86_BUG_F00F X86_BUG(0) /* Intel F00F */ ++#define X86_BUG_FDIV X86_BUG(1) /* FPU FDIV */ ++#define X86_BUG_COMA X86_BUG(2) /* Cyrix 6x86 coma */ ++#define X86_BUG_AMD_TLB_MMATCH X86_BUG(3) /* "tlb_mmatch" AMD Erratum 383 */ ++#define X86_BUG_AMD_APIC_C1E X86_BUG(4) /* "apic_c1e" AMD Erratum 400 */ ++#define X86_BUG_11AP X86_BUG(5) /* Bad local APIC aka 11AP */ ++#define X86_BUG_FXSAVE_LEAK X86_BUG(6) /* FXSAVE leaks FOP/FIP/FOP */ ++#define X86_BUG_CLFLUSH_MONITOR X86_BUG(7) /* AAI65, CLFLUSH required before MONITOR */ ++#define X86_BUG_SYSRET_SS_ATTRS X86_BUG(8) /* SYSRET doesn't fix up SS attrs */ + #ifdef CONFIG_X86_32 + /* + * 64-bit kernels don't use X86_BUG_ESPFIX. Make the define conditional + * to avoid confusion. + */ +-#define X86_BUG_ESPFIX X86_BUG(9) /* "" IRET to 16-bit SS corrupts ESP/RSP high bits */ ++#define X86_BUG_ESPFIX X86_BUG(9) /* "" IRET to 16-bit SS corrupts ESP/RSP high bits */ + #endif +-#define X86_BUG_NULL_SEG X86_BUG(10) /* Nulling a selector preserves the base */ +-#define X86_BUG_SWAPGS_FENCE X86_BUG(11) /* SWAPGS without input dep on GS */ +-#define X86_BUG_MONITOR X86_BUG(12) /* IPI required to wake up remote CPU */ +-#define X86_BUG_AMD_E400 X86_BUG(13) /* CPU is among the affected by Erratum 400 */ ++#define X86_BUG_NULL_SEG X86_BUG(10) /* Nulling a selector preserves the base */ ++#define X86_BUG_SWAPGS_FENCE X86_BUG(11) /* SWAPGS without input dep on GS */ ++#define X86_BUG_MONITOR X86_BUG(12) /* IPI required to wake up remote CPU */ ++#define X86_BUG_AMD_E400 X86_BUG(13) /* CPU is among the affected by Erratum 400 */ + #endif /* _ASM_X86_CPUFEATURES_H */ +-- +2.15.0 + diff --git a/queue/x86-cpuid-Add-generic-table-for-CPUID-dependencies.patch b/queue/x86-cpuid-Add-generic-table-for-CPUID-dependencies.patch new file mode 100644 index 0000000..983bf0a --- /dev/null +++ b/queue/x86-cpuid-Add-generic-table-for-CPUID-dependencies.patch @@ -0,0 +1,206 
@@ +From 0b00de857a648dafe7020878c7a27cf776f5edf4 Mon Sep 17 00:00:00 2001 +From: Andi Kleen <ak@linux.intel.com> +Date: Fri, 13 Oct 2017 14:56:42 -0700 +Subject: [PATCH] x86/cpuid: Add generic table for CPUID dependencies + +commit 0b00de857a648dafe7020878c7a27cf776f5edf4 upstream. + +Some CPUID features depend on other features. Currently it's +possible to to clear dependent features, but not clear the base features, +which can cause various interesting problems. + +This patch implements a generic table to describe dependencies +between CPUID features, to be used by all code that clears +CPUID. + +Some subsystems (like XSAVE) had an own implementation of this, +but it's better to do it all in a single place for everyone. + +Then clear_cpu_cap and setup_clear_cpu_cap always look up +this table and clear all dependencies too. + +This is intended to be a practical table: only for features +that make sense to clear. If someone for example clears FPU, +or other features that are essentially part of the required +base feature set, not much is going to work. Handling +that is right now out of scope. We're only handling +features which can be usefully cleared. 
+ +Signed-off-by: Andi Kleen <ak@linux.intel.com> +Reviewed-by: Thomas Gleixner <tglx@linutronix.de> +Cc: Jonathan McDowell <noodles@earth.li> +Cc: Linus Torvalds <torvalds@linux-foundation.org> +Cc: Peter Zijlstra <peterz@infradead.org> +Link: http://lkml.kernel.org/r/20171013215645.23166-3-andi@firstfloor.org +Signed-off-by: Ingo Molnar <mingo@kernel.org> + +diff --git a/arch/x86/include/asm/cpufeature.h b/arch/x86/include/asm/cpufeature.h +index d59c15c3defd..225fd8374fae 100644 +--- a/arch/x86/include/asm/cpufeature.h ++++ b/arch/x86/include/asm/cpufeature.h +@@ -125,11 +125,10 @@ extern const char * const x86_bug_flags[NBUGINTS*32]; + #define boot_cpu_has(bit) cpu_has(&boot_cpu_data, bit) + + #define set_cpu_cap(c, bit) set_bit(bit, (unsigned long *)((c)->x86_capability)) +-#define clear_cpu_cap(c, bit) clear_bit(bit, (unsigned long *)((c)->x86_capability)) +-#define setup_clear_cpu_cap(bit) do { \ +- clear_cpu_cap(&boot_cpu_data, bit); \ +- set_bit(bit, (unsigned long *)cpu_caps_cleared); \ +-} while (0) ++ ++extern void setup_clear_cpu_cap(unsigned int bit); ++extern void clear_cpu_cap(struct cpuinfo_x86 *c, unsigned int bit); ++ + #define setup_force_cpu_cap(bit) do { \ + set_cpu_cap(&boot_cpu_data, bit); \ + set_bit(bit, (unsigned long *)cpu_caps_set); \ +diff --git a/arch/x86/include/asm/cpufeatures.h b/arch/x86/include/asm/cpufeatures.h +index 2519c6c801c9..401a70992060 100644 +--- a/arch/x86/include/asm/cpufeatures.h ++++ b/arch/x86/include/asm/cpufeatures.h +@@ -21,6 +21,11 @@ + * this feature bit is not displayed in /proc/cpuinfo at all. 
+ */ + ++/* ++ * When adding new features here that depend on other features, ++ * please update the table in kernel/cpu/cpuid-deps.c ++ */ ++ + /* Intel-defined CPU features, CPUID level 0x00000001 (edx), word 0 */ + #define X86_FEATURE_FPU ( 0*32+ 0) /* Onboard FPU */ + #define X86_FEATURE_VME ( 0*32+ 1) /* Virtual Mode Extensions */ +diff --git a/arch/x86/kernel/cpu/Makefile b/arch/x86/kernel/cpu/Makefile +index e17942c131c8..de260fae1017 100644 +--- a/arch/x86/kernel/cpu/Makefile ++++ b/arch/x86/kernel/cpu/Makefile +@@ -22,6 +22,7 @@ obj-y += rdrand.o + obj-y += match.o + obj-y += bugs.o + obj-$(CONFIG_CPU_FREQ) += aperfmperf.o ++obj-y += cpuid-deps.o + + obj-$(CONFIG_PROC_FS) += proc.o + obj-$(CONFIG_X86_FEATURE_NAMES) += capflags.o powerflags.o +diff --git a/arch/x86/kernel/cpu/cpuid-deps.c b/arch/x86/kernel/cpu/cpuid-deps.c +new file mode 100644 +index 000000000000..e48eb7313120 +--- /dev/null ++++ b/arch/x86/kernel/cpu/cpuid-deps.c +@@ -0,0 +1,113 @@ ++/* Declare dependencies between CPUIDs */ ++#include <linux/kernel.h> ++#include <linux/init.h> ++#include <linux/module.h> ++#include <asm/cpufeature.h> ++ ++struct cpuid_dep { ++ unsigned int feature; ++ unsigned int depends; ++}; ++ ++/* ++ * Table of CPUID features that depend on others. ++ * ++ * This only includes dependencies that can be usefully disabled, not ++ * features part of the base set (like FPU). ++ * ++ * Note this all is not __init / __initdata because it can be ++ * called from cpu hotplug. It shouldn't do anything in this case, ++ * but it's difficult to tell that to the init reference checker. 
++ */ ++const static struct cpuid_dep cpuid_deps[] = { ++ { X86_FEATURE_XSAVEOPT, X86_FEATURE_XSAVE }, ++ { X86_FEATURE_XSAVEC, X86_FEATURE_XSAVE }, ++ { X86_FEATURE_XSAVES, X86_FEATURE_XSAVE }, ++ { X86_FEATURE_AVX, X86_FEATURE_XSAVE }, ++ { X86_FEATURE_PKU, X86_FEATURE_XSAVE }, ++ { X86_FEATURE_MPX, X86_FEATURE_XSAVE }, ++ { X86_FEATURE_XGETBV1, X86_FEATURE_XSAVE }, ++ { X86_FEATURE_FXSR_OPT, X86_FEATURE_FXSR }, ++ { X86_FEATURE_XMM, X86_FEATURE_FXSR }, ++ { X86_FEATURE_XMM2, X86_FEATURE_XMM }, ++ { X86_FEATURE_XMM3, X86_FEATURE_XMM2 }, ++ { X86_FEATURE_XMM4_1, X86_FEATURE_XMM2 }, ++ { X86_FEATURE_XMM4_2, X86_FEATURE_XMM2 }, ++ { X86_FEATURE_XMM3, X86_FEATURE_XMM2 }, ++ { X86_FEATURE_PCLMULQDQ, X86_FEATURE_XMM2 }, ++ { X86_FEATURE_SSSE3, X86_FEATURE_XMM2, }, ++ { X86_FEATURE_F16C, X86_FEATURE_XMM2, }, ++ { X86_FEATURE_AES, X86_FEATURE_XMM2 }, ++ { X86_FEATURE_SHA_NI, X86_FEATURE_XMM2 }, ++ { X86_FEATURE_FMA, X86_FEATURE_AVX }, ++ { X86_FEATURE_AVX2, X86_FEATURE_AVX, }, ++ { X86_FEATURE_AVX512F, X86_FEATURE_AVX, }, ++ { X86_FEATURE_AVX512IFMA, X86_FEATURE_AVX512F }, ++ { X86_FEATURE_AVX512PF, X86_FEATURE_AVX512F }, ++ { X86_FEATURE_AVX512ER, X86_FEATURE_AVX512F }, ++ { X86_FEATURE_AVX512CD, X86_FEATURE_AVX512F }, ++ { X86_FEATURE_AVX512DQ, X86_FEATURE_AVX512F }, ++ { X86_FEATURE_AVX512BW, X86_FEATURE_AVX512F }, ++ { X86_FEATURE_AVX512VL, X86_FEATURE_AVX512F }, ++ { X86_FEATURE_AVX512VBMI, X86_FEATURE_AVX512F }, ++ { X86_FEATURE_AVX512_4VNNIW, X86_FEATURE_AVX512F }, ++ { X86_FEATURE_AVX512_4FMAPS, X86_FEATURE_AVX512F }, ++ { X86_FEATURE_AVX512_VPOPCNTDQ, X86_FEATURE_AVX512F }, ++ {} ++}; ++ ++static inline void __clear_cpu_cap(struct cpuinfo_x86 *c, unsigned int bit) ++{ ++ clear_bit32(bit, c->x86_capability); ++} ++ ++static inline void __setup_clear_cpu_cap(unsigned int bit) ++{ ++ clear_cpu_cap(&boot_cpu_data, bit); ++ set_bit32(bit, cpu_caps_cleared); ++} ++ ++static inline void clear_feature(struct cpuinfo_x86 *c, unsigned int feature) ++{ ++ if (!c) ++ 
__setup_clear_cpu_cap(feature); ++ else ++ __clear_cpu_cap(c, feature); ++} ++ ++static void do_clear_cpu_cap(struct cpuinfo_x86 *c, unsigned int feature) ++{ ++ bool changed; ++ DECLARE_BITMAP(disable, NCAPINTS * sizeof(u32) * 8); ++ const struct cpuid_dep *d; ++ ++ clear_feature(c, feature); ++ ++ /* Collect all features to disable, handling dependencies */ ++ memset(disable, 0, sizeof(disable)); ++ __set_bit(feature, disable); ++ ++ /* Loop until we get a stable state. */ ++ do { ++ changed = false; ++ for (d = cpuid_deps; d->feature; d++) { ++ if (!test_bit(d->depends, disable)) ++ continue; ++ if (__test_and_set_bit(d->feature, disable)) ++ continue; ++ ++ changed = true; ++ clear_feature(c, d->feature); ++ } ++ } while (changed); ++} ++ ++void clear_cpu_cap(struct cpuinfo_x86 *c, unsigned int feature) ++{ ++ do_clear_cpu_cap(c, feature); ++} ++ ++void setup_clear_cpu_cap(unsigned int feature) ++{ ++ do_clear_cpu_cap(NULL, feature); ++} +-- +2.15.0 + diff --git a/queue/x86-cpuid-Prevent-out-of-bound-access-in-do_clear_cp.patch b/queue/x86-cpuid-Prevent-out-of-bound-access-in-do_clear_cp.patch new file mode 100644 index 0000000..5f55b75 --- /dev/null +++ b/queue/x86-cpuid-Prevent-out-of-bound-access-in-do_clear_cp.patch @@ -0,0 +1,53 @@ +From 57b8b1a1856adaa849d02d547411a553a531022b Mon Sep 17 00:00:00 2001 +From: Thomas Gleixner <tglx@linutronix.de> +Date: Wed, 18 Oct 2017 19:39:35 +0200 +Subject: [PATCH] x86/cpuid: Prevent out of bound access in do_clear_cpu_cap() + +commit 57b8b1a1856adaa849d02d547411a553a531022b upstream. + +do_clear_cpu_cap() allocates a bitmap to keep track of disabled feature +dependencies. That bitmap is sized NCAPINTS * BITS_PER_INIT. The possible +'features' which can be handed in are larger than this, because after the +capabilities the bug 'feature' bits occupy another 32bit. Not really +obvious... 
+ +So clearing any of the misfeature bits, as 32bit does for the F00F bug, +accesses that bitmap out of bounds thereby corrupting the stack. + +Size the bitmap proper and add a sanity check to catch accidental out of +bound access. + +Fixes: 0b00de857a64 ("x86/cpuid: Add generic table for CPUID dependencies") +Reported-by: kernel test robot <xiaolong.ye@intel.com> +Signed-off-by: Thomas Gleixner <tglx@linutronix.de> +Cc: Andi Kleen <ak@linux.intel.com> +Cc: Borislav Petkov <bp@alien8.de> +Link: https://lkml.kernel.org/r/20171018022023.GA12058@yexl-desktop + +diff --git a/arch/x86/kernel/cpu/cpuid-deps.c b/arch/x86/kernel/cpu/cpuid-deps.c +index e48eb7313120..c1d49842a411 100644 +--- a/arch/x86/kernel/cpu/cpuid-deps.c ++++ b/arch/x86/kernel/cpu/cpuid-deps.c +@@ -75,11 +75,17 @@ static inline void clear_feature(struct cpuinfo_x86 *c, unsigned int feature) + __clear_cpu_cap(c, feature); + } + ++/* Take the capabilities and the BUG bits into account */ ++#define MAX_FEATURE_BITS ((NCAPINTS + NBUGINTS) * sizeof(u32) * 8) ++ + static void do_clear_cpu_cap(struct cpuinfo_x86 *c, unsigned int feature) + { +- bool changed; +- DECLARE_BITMAP(disable, NCAPINTS * sizeof(u32) * 8); ++ DECLARE_BITMAP(disable, MAX_FEATURE_BITS); + const struct cpuid_dep *d; ++ bool changed; ++ ++ if (WARN_ON(feature >= MAX_FEATURE_BITS)) ++ return; + + clear_feature(c, feature); + +-- +2.15.0 + diff --git a/queue/x86-cpuid-Replace-set-clear_bit32.patch b/queue/x86-cpuid-Replace-set-clear_bit32.patch new file mode 100644 index 0000000..bb49405 --- /dev/null +++ b/queue/x86-cpuid-Replace-set-clear_bit32.patch @@ -0,0 +1,60 @@ +From 06dd688ddda5819025e014b79aea9af6ab475fa2 Mon Sep 17 00:00:00 2001 +From: Thomas Gleixner <tglx@linutronix.de> +Date: Thu, 2 Nov 2017 13:22:35 +0100 +Subject: [PATCH] x86/cpuid: Replace set/clear_bit32() + +commit 06dd688ddda5819025e014b79aea9af6ab475fa2 upstream. + +Peter pointed out that the set/clear_bit32() variants are broken in various +aspects. 
+ +Replace them with open coded set/clear_bit() and type cast +cpu_info::x86_capability as it's done in all other places throughout x86. + +Fixes: 0b00de857a64 ("x86/cpuid: Add generic table for CPUID dependencies") +Reported-by: Peter Ziljstra <peterz@infradead.org> +Signed-off-by: Thomas Gleixner <tglx@linutronix.de> +Cc: Andi Kleen <ak@linux.intel.com> + +diff --git a/arch/x86/kernel/cpu/cpuid-deps.c b/arch/x86/kernel/cpu/cpuid-deps.c +index c21f22d836ad..904b0a3c4e53 100644 +--- a/arch/x86/kernel/cpu/cpuid-deps.c ++++ b/arch/x86/kernel/cpu/cpuid-deps.c +@@ -62,23 +62,19 @@ const static struct cpuid_dep cpuid_deps[] = { + {} + }; + +-static inline void __clear_cpu_cap(struct cpuinfo_x86 *c, unsigned int bit) +-{ +- clear_bit32(bit, c->x86_capability); +-} +- +-static inline void __setup_clear_cpu_cap(unsigned int bit) +-{ +- clear_cpu_cap(&boot_cpu_data, bit); +- set_bit32(bit, cpu_caps_cleared); +-} +- + static inline void clear_feature(struct cpuinfo_x86 *c, unsigned int feature) + { +- if (!c) +- __setup_clear_cpu_cap(feature); +- else +- __clear_cpu_cap(c, feature); ++ /* ++ * Note: This could use the non atomic __*_bit() variants, but the ++ * rest of the cpufeature code uses atomics as well, so keep it for ++ * consistency. Cleanup all of it separately. 
++ */ ++ if (!c) { ++ clear_cpu_cap(&boot_cpu_data, feature); ++ set_bit(feature, (unsigned long *)cpu_caps_cleared); ++ } else { ++ clear_bit(feature, (unsigned long *)c->x86_capability); ++ } + } + + /* Take the capabilities and the BUG bits into account */ +-- +2.15.0 + diff --git a/queue/x86-dumpstack-Add-get_stack_info-support-for-the-SYS.patch b/queue/x86-dumpstack-Add-get_stack_info-support-for-the-SYS.patch new file mode 100644 index 0000000..60b6410 --- /dev/null +++ b/queue/x86-dumpstack-Add-get_stack_info-support-for-the-SYS.patch @@ -0,0 +1,170 @@ +From 33a2f1a6c4d7c0a02d1c006fb0379cc5ca3b96bb Mon Sep 17 00:00:00 2001 +From: Andy Lutomirski <luto@kernel.org> +Date: Mon, 4 Dec 2017 15:07:13 +0100 +Subject: [PATCH] x86/dumpstack: Add get_stack_info() support for the SYSENTER + stack + +commit 33a2f1a6c4d7c0a02d1c006fb0379cc5ca3b96bb upstream. + +get_stack_info() doesn't currently know about the SYSENTER stack, so +unwinding will fail if we entered the kernel on the SYSENTER stack +and haven't fully switched off. Teach get_stack_info() about the +SYSENTER stack. + +With future patches applied that run part of the entry code on the +SYSENTER stack and introduce an intentional BUG(), I would get: + + PANIC: double fault, error_code: 0x0 + ... + RIP: 0010:do_error_trap+0x33/0x1c0 + ... + Call Trace: + Code: ... + +With this patch, I get: + + PANIC: double fault, error_code: 0x0 + ... + Call Trace: + <SYSENTER> + ? async_page_fault+0x36/0x60 + ? invalid_op+0x22/0x40 + ? async_page_fault+0x36/0x60 + ? sync_regs+0x3c/0x40 + ? sync_regs+0x2e/0x40 + ? error_entry+0x6c/0xd0 + ? async_page_fault+0x36/0x60 + </SYSENTER> + Code: ... + +which is a lot more informative. 
+ +Signed-off-by: Andy Lutomirski <luto@kernel.org> +Signed-off-by: Thomas Gleixner <tglx@linutronix.de> +Reviewed-by: Borislav Petkov <bp@suse.de> +Cc: Boris Ostrovsky <boris.ostrovsky@oracle.com> +Cc: Borislav Petkov <bp@alien8.de> +Cc: Borislav Petkov <bpetkov@suse.de> +Cc: Brian Gerst <brgerst@gmail.com> +Cc: Dave Hansen <dave.hansen@intel.com> +Cc: Dave Hansen <dave.hansen@linux.intel.com> +Cc: David Laight <David.Laight@aculab.com> +Cc: Denys Vlasenko <dvlasenk@redhat.com> +Cc: Eduardo Valentin <eduval@amazon.com> +Cc: Greg KH <gregkh@linuxfoundation.org> +Cc: H. Peter Anvin <hpa@zytor.com> +Cc: Josh Poimboeuf <jpoimboe@redhat.com> +Cc: Juergen Gross <jgross@suse.com> +Cc: Linus Torvalds <torvalds@linux-foundation.org> +Cc: Peter Zijlstra <peterz@infradead.org> +Cc: Rik van Riel <riel@redhat.com> +Cc: Will Deacon <will.deacon@arm.com> +Cc: aliguori@amazon.com +Cc: daniel.gruss@iaik.tugraz.at +Cc: hughd@google.com +Cc: keescook@google.com +Link: https://lkml.kernel.org/r/20171204150605.392711508@linutronix.de +Signed-off-by: Ingo Molnar <mingo@kernel.org> + +diff --git a/arch/x86/include/asm/stacktrace.h b/arch/x86/include/asm/stacktrace.h +index 8da111b3c342..f8062bfd43a0 100644 +--- a/arch/x86/include/asm/stacktrace.h ++++ b/arch/x86/include/asm/stacktrace.h +@@ -16,6 +16,7 @@ enum stack_type { + STACK_TYPE_TASK, + STACK_TYPE_IRQ, + STACK_TYPE_SOFTIRQ, ++ STACK_TYPE_SYSENTER, + STACK_TYPE_EXCEPTION, + STACK_TYPE_EXCEPTION_LAST = STACK_TYPE_EXCEPTION + N_EXCEPTION_STACKS-1, + }; +@@ -28,6 +29,8 @@ struct stack_info { + bool in_task_stack(unsigned long *stack, struct task_struct *task, + struct stack_info *info); + ++bool in_sysenter_stack(unsigned long *stack, struct stack_info *info); ++ + int get_stack_info(unsigned long *stack, struct task_struct *task, + struct stack_info *info, unsigned long *visit_mask); + +diff --git a/arch/x86/kernel/dumpstack.c b/arch/x86/kernel/dumpstack.c +index 0bc95be5c638..a33a1373a252 100644 +--- a/arch/x86/kernel/dumpstack.c 
++++ b/arch/x86/kernel/dumpstack.c +@@ -43,6 +43,25 @@ bool in_task_stack(unsigned long *stack, struct task_struct *task, + return true; + } + ++bool in_sysenter_stack(unsigned long *stack, struct stack_info *info) ++{ ++ struct tss_struct *tss = this_cpu_ptr(&cpu_tss); ++ ++ /* Treat the canary as part of the stack for unwinding purposes. */ ++ void *begin = &tss->SYSENTER_stack_canary; ++ void *end = (void *)&tss->SYSENTER_stack + sizeof(tss->SYSENTER_stack); ++ ++ if ((void *)stack < begin || (void *)stack >= end) ++ return false; ++ ++ info->type = STACK_TYPE_SYSENTER; ++ info->begin = begin; ++ info->end = end; ++ info->next_sp = NULL; ++ ++ return true; ++} ++ + static void printk_stack_address(unsigned long address, int reliable, + char *log_lvl) + { +diff --git a/arch/x86/kernel/dumpstack_32.c b/arch/x86/kernel/dumpstack_32.c +index daefae83a3aa..5ff13a6b3680 100644 +--- a/arch/x86/kernel/dumpstack_32.c ++++ b/arch/x86/kernel/dumpstack_32.c +@@ -26,6 +26,9 @@ const char *stack_type_name(enum stack_type type) + if (type == STACK_TYPE_SOFTIRQ) + return "SOFTIRQ"; + ++ if (type == STACK_TYPE_SYSENTER) ++ return "SYSENTER"; ++ + return NULL; + } + +@@ -93,6 +96,9 @@ int get_stack_info(unsigned long *stack, struct task_struct *task, + if (task != current) + goto unknown; + ++ if (in_sysenter_stack(stack, info)) ++ goto recursion_check; ++ + if (in_hardirq_stack(stack, info)) + goto recursion_check; + +diff --git a/arch/x86/kernel/dumpstack_64.c b/arch/x86/kernel/dumpstack_64.c +index 88ce2ffdb110..abc828f8c297 100644 +--- a/arch/x86/kernel/dumpstack_64.c ++++ b/arch/x86/kernel/dumpstack_64.c +@@ -37,6 +37,9 @@ const char *stack_type_name(enum stack_type type) + if (type == STACK_TYPE_IRQ) + return "IRQ"; + ++ if (type == STACK_TYPE_SYSENTER) ++ return "SYSENTER"; ++ + if (type >= STACK_TYPE_EXCEPTION && type <= STACK_TYPE_EXCEPTION_LAST) + return exception_stack_names[type - STACK_TYPE_EXCEPTION]; + +@@ -115,6 +118,9 @@ int get_stack_info(unsigned long *stack, 
struct task_struct *task, + if (in_irq_stack(stack, info)) + goto recursion_check; + ++ if (in_sysenter_stack(stack, info)) ++ goto recursion_check; ++ + goto unknown; + + recursion_check: +-- +2.15.0 + diff --git a/queue/x86-dumpstack-Handle-stack-overflow-on-all-stacks.patch b/queue/x86-dumpstack-Handle-stack-overflow-on-all-stacks.patch new file mode 100644 index 0000000..621b315 --- /dev/null +++ b/queue/x86-dumpstack-Handle-stack-overflow-on-all-stacks.patch @@ -0,0 +1,85 @@ +From 6e60e583426c2f8751c22c2dfe5c207083b4483a Mon Sep 17 00:00:00 2001 +From: Andy Lutomirski <luto@kernel.org> +Date: Mon, 4 Dec 2017 15:07:18 +0100 +Subject: [PATCH] x86/dumpstack: Handle stack overflow on all stacks + +commit 6e60e583426c2f8751c22c2dfe5c207083b4483a upstream. + +We currently special-case stack overflow on the task stack. We're +going to start putting special stacks in the fixmap with a custom +layout, so they'll have guard pages, too. Teach the unwinder to be +able to unwind an overflow of any of the stacks. + +Signed-off-by: Andy Lutomirski <luto@kernel.org> +Signed-off-by: Thomas Gleixner <tglx@linutronix.de> +Reviewed-by: Borislav Petkov <bp@suse.de> +Cc: Boris Ostrovsky <boris.ostrovsky@oracle.com> +Cc: Borislav Petkov <bp@alien8.de> +Cc: Borislav Petkov <bpetkov@suse.de> +Cc: Brian Gerst <brgerst@gmail.com> +Cc: Dave Hansen <dave.hansen@intel.com> +Cc: Dave Hansen <dave.hansen@linux.intel.com> +Cc: David Laight <David.Laight@aculab.com> +Cc: Denys Vlasenko <dvlasenk@redhat.com> +Cc: Eduardo Valentin <eduval@amazon.com> +Cc: Greg KH <gregkh@linuxfoundation.org> +Cc: H. 
Peter Anvin <hpa@zytor.com> +Cc: Josh Poimboeuf <jpoimboe@redhat.com> +Cc: Juergen Gross <jgross@suse.com> +Cc: Linus Torvalds <torvalds@linux-foundation.org> +Cc: Peter Zijlstra <peterz@infradead.org> +Cc: Rik van Riel <riel@redhat.com> +Cc: Will Deacon <will.deacon@arm.com> +Cc: aliguori@amazon.com +Cc: daniel.gruss@iaik.tugraz.at +Cc: hughd@google.com +Cc: keescook@google.com +Link: https://lkml.kernel.org/r/20171204150605.802057305@linutronix.de +Signed-off-by: Ingo Molnar <mingo@kernel.org> + +diff --git a/arch/x86/kernel/dumpstack.c b/arch/x86/kernel/dumpstack.c +index a33a1373a252..64f8ed2a4827 100644 +--- a/arch/x86/kernel/dumpstack.c ++++ b/arch/x86/kernel/dumpstack.c +@@ -112,24 +112,28 @@ void show_trace_log_lvl(struct task_struct *task, struct pt_regs *regs, + * - task stack + * - interrupt stack + * - HW exception stacks (double fault, nmi, debug, mce) ++ * - SYSENTER stack + * +- * x86-32 can have up to three stacks: ++ * x86-32 can have up to four stacks: + * - task stack + * - softirq stack + * - hardirq stack ++ * - SYSENTER stack + */ + for (regs = NULL; stack; stack = PTR_ALIGN(stack_info.next_sp, sizeof(long))) { + const char *stack_name; + +- /* +- * If we overflowed the task stack into a guard page, jump back +- * to the bottom of the usable stack. +- */ +- if (task_stack_page(task) - (void *)stack < PAGE_SIZE) +- stack = task_stack_page(task); +- +- if (get_stack_info(stack, task, &stack_info, &visit_mask)) +- break; ++ if (get_stack_info(stack, task, &stack_info, &visit_mask)) { ++ /* ++ * We weren't on a valid stack. It's possible that ++ * we overflowed a valid stack into a guard page. ++ * See if the next page up is valid so that we can ++ * generate some kind of backtrace if this happens. 
++ */ ++ stack = (unsigned long *)PAGE_ALIGN((unsigned long)stack); ++ if (get_stack_info(stack, task, &stack_info, &visit_mask)) ++ break; ++ } + + stack_name = stack_type_name(stack_info.type); + if (stack_name) +-- +2.15.0 + diff --git a/queue/x86-entry-32-Fix-cpu_current_top_of_stack-initializa.patch b/queue/x86-entry-32-Fix-cpu_current_top_of_stack-initializa.patch new file mode 100644 index 0000000..7078b5c --- /dev/null +++ b/queue/x86-entry-32-Fix-cpu_current_top_of_stack-initializa.patch @@ -0,0 +1,40 @@ +From cd493a6deb8b78eca280d05f7fa73fd69403ae29 Mon Sep 17 00:00:00 2001 +From: Andy Lutomirski <luto@kernel.org> +Date: Thu, 2 Nov 2017 00:59:15 -0700 +Subject: [PATCH] x86/entry/32: Fix cpu_current_top_of_stack initialization at + boot + +commit cd493a6deb8b78eca280d05f7fa73fd69403ae29 upstream. + +cpu_current_top_of_stack's initialization forgot about +TOP_OF_KERNEL_STACK_PADDING. This bug didn't matter because the +idle threads never enter user mode. + +Signed-off-by: Andy Lutomirski <luto@kernel.org> +Reviewed-by: Borislav Petkov <bp@suse.de> +Cc: Borislav Petkov <bpetkov@suse.de> +Cc: Brian Gerst <brgerst@gmail.com> +Cc: Dave Hansen <dave.hansen@intel.com> +Cc: Linus Torvalds <torvalds@linux-foundation.org> +Cc: Peter Zijlstra <peterz@infradead.org> +Cc: Thomas Gleixner <tglx@linutronix.de> +Link: http://lkml.kernel.org/r/e5e370a7e6e4fddd1c4e4cf619765d96bb874b21.1509609304.git.luto@kernel.org +Signed-off-by: Ingo Molnar <mingo@kernel.org> + +diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c +index ad59edd84de7..06c18fe1c09e 100644 +--- a/arch/x86/kernel/smpboot.c ++++ b/arch/x86/kernel/smpboot.c +@@ -961,8 +961,7 @@ void common_cpu_up(unsigned int cpu, struct task_struct *idle) + #ifdef CONFIG_X86_32 + /* Stack for startup_32 can be just as for start_secondary onwards */ + irq_ctx_init(cpu); +- per_cpu(cpu_current_top_of_stack, cpu) = +- (unsigned long)task_stack_page(idle) + THREAD_SIZE; ++ per_cpu(cpu_current_top_of_stack, cpu) = 
task_top_of_stack(idle); + #else + initial_gs = per_cpu_offset(cpu); + #endif +-- +2.15.0 + diff --git a/queue/x86-entry-32-Pull-the-MSR_IA32_SYSENTER_CS-update-co.patch b/queue/x86-entry-32-Pull-the-MSR_IA32_SYSENTER_CS-update-co.patch new file mode 100644 index 0000000..ec26dae --- /dev/null +++ b/queue/x86-entry-32-Pull-the-MSR_IA32_SYSENTER_CS-update-co.patch @@ -0,0 +1,130 @@ +From bd7dc5a6afac719d8ce4092391eef2c7e83c2a75 Mon Sep 17 00:00:00 2001 +From: Andy Lutomirski <luto@kernel.org> +Date: Thu, 2 Nov 2017 00:59:09 -0700 +Subject: [PATCH] x86/entry/32: Pull the MSR_IA32_SYSENTER_CS update code out + of native_load_sp0() + +commit bd7dc5a6afac719d8ce4092391eef2c7e83c2a75 upstream. + +This causes the MSR_IA32_SYSENTER_CS write to move out of the +paravirt callback. This shouldn't affect Xen PV: Xen already ignores +MSR_IA32_SYSENTER_ESP writes. In any event, Xen doesn't support +vm86() in a useful way. + +Note to any potential backporters: This patch won't break lguest, as +lguest didn't have any SYSENTER support at all. 
+ +Signed-off-by: Andy Lutomirski <luto@kernel.org> +Cc: Borislav Petkov <bpetkov@suse.de> +Cc: Brian Gerst <brgerst@gmail.com> +Cc: Dave Hansen <dave.hansen@intel.com> +Cc: Linus Torvalds <torvalds@linux-foundation.org> +Cc: Peter Zijlstra <peterz@infradead.org> +Cc: Thomas Gleixner <tglx@linutronix.de> +Link: http://lkml.kernel.org/r/75cf09fe03ae778532d0ca6c65aa58e66bc2f90c.1509609304.git.luto@kernel.org +Signed-off-by: Ingo Molnar <mingo@kernel.org> + +diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h +index b390ff76e58f..0167e3e35a57 100644 +--- a/arch/x86/include/asm/processor.h ++++ b/arch/x86/include/asm/processor.h +@@ -520,13 +520,6 @@ static inline void + native_load_sp0(struct tss_struct *tss, struct thread_struct *thread) + { + tss->x86_tss.sp0 = thread->sp0; +-#ifdef CONFIG_X86_32 +- /* Only happens when SEP is enabled, no need to test "SEP"arately: */ +- if (unlikely(tss->x86_tss.ss1 != thread->sysenter_cs)) { +- tss->x86_tss.ss1 = thread->sysenter_cs; +- wrmsr(MSR_IA32_SYSENTER_CS, thread->sysenter_cs, 0); +- } +-#endif + } + + static inline void native_swapgs(void) +diff --git a/arch/x86/include/asm/switch_to.h b/arch/x86/include/asm/switch_to.h +index fcc5cd387fd1..7ae8caffbada 100644 +--- a/arch/x86/include/asm/switch_to.h ++++ b/arch/x86/include/asm/switch_to.h +@@ -72,4 +72,16 @@ do { \ + ((last) = __switch_to_asm((prev), (next))); \ + } while (0) + ++#ifdef CONFIG_X86_32 ++static inline void refresh_sysenter_cs(struct thread_struct *thread) ++{ ++ /* Only happens when SEP is enabled, no need to test "SEP"arately: */ ++ if (unlikely(this_cpu_read(cpu_tss.x86_tss.ss1) == thread->sysenter_cs)) ++ return; ++ ++ this_cpu_write(cpu_tss.x86_tss.ss1, thread->sysenter_cs); ++ wrmsr(MSR_IA32_SYSENTER_CS, thread->sysenter_cs, 0); ++} ++#endif ++ + #endif /* _ASM_X86_SWITCH_TO_H */ +diff --git a/arch/x86/kernel/process_32.c b/arch/x86/kernel/process_32.c +index 11966251cd42..0936ed3da6b6 100644 +--- 
a/arch/x86/kernel/process_32.c ++++ b/arch/x86/kernel/process_32.c +@@ -284,9 +284,11 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p) + + /* + * Reload esp0 and cpu_current_top_of_stack. This changes +- * current_thread_info(). ++ * current_thread_info(). Refresh the SYSENTER configuration in ++ * case prev or next is vm86. + */ + load_sp0(tss, next); ++ refresh_sysenter_cs(next); + this_cpu_write(cpu_current_top_of_stack, + (unsigned long)task_stack_page(next_p) + + THREAD_SIZE); +diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c +index 302e7b2572d1..a6ff6d1a0110 100644 +--- a/arch/x86/kernel/process_64.c ++++ b/arch/x86/kernel/process_64.c +@@ -464,7 +464,7 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p) + */ + this_cpu_write(current_task, next_p); + +- /* Reload esp0 and ss1. This changes current_thread_info(). */ ++ /* Reload sp0. */ + load_sp0(tss, next); + + /* +diff --git a/arch/x86/kernel/vm86_32.c b/arch/x86/kernel/vm86_32.c +index 7924a5356c8a..5bc1c3ab6287 100644 +--- a/arch/x86/kernel/vm86_32.c ++++ b/arch/x86/kernel/vm86_32.c +@@ -54,6 +54,7 @@ + #include <asm/irq.h> + #include <asm/traps.h> + #include <asm/vm86.h> ++#include <asm/switch_to.h> + + /* + * Known problems: +@@ -149,6 +150,7 @@ void save_v86_state(struct kernel_vm86_regs *regs, int retval) + tsk->thread.sp0 = vm86->saved_sp0; + tsk->thread.sysenter_cs = __KERNEL_CS; + load_sp0(tss, &tsk->thread); ++ refresh_sysenter_cs(&tsk->thread); + vm86->saved_sp0 = 0; + put_cpu(); + +@@ -368,8 +370,10 @@ static long do_sys_vm86(struct vm86plus_struct __user *user_vm86, bool plus) + /* make room for real-mode segments */ + tsk->thread.sp0 += 16; + +- if (static_cpu_has(X86_FEATURE_SEP)) ++ if (static_cpu_has(X86_FEATURE_SEP)) { + tsk->thread.sysenter_cs = 0; ++ refresh_sysenter_cs(&tsk->thread); ++ } + + load_sp0(tss, &tsk->thread); + put_cpu(); +-- +2.15.0 + diff --git a/queue/x86-entry-64-Allocate-and-enable-the-SYSENTER-stack.patch 
b/queue/x86-entry-64-Allocate-and-enable-the-SYSENTER-stack.patch new file mode 100644 index 0000000..ad9bdde --- /dev/null +++ b/queue/x86-entry-64-Allocate-and-enable-the-SYSENTER-stack.patch @@ -0,0 +1,165 @@ +From 1a79797b58cddfa948420a7553241c79c013e3ca Mon Sep 17 00:00:00 2001 +From: Andy Lutomirski <luto@kernel.org> +Date: Mon, 4 Dec 2017 15:07:12 +0100 +Subject: [PATCH] x86/entry/64: Allocate and enable the SYSENTER stack + +commit 1a79797b58cddfa948420a7553241c79c013e3ca upstream. + +This will simplify future changes that want scratch variables early in +the SYSENTER handler -- they'll be able to spill registers to the +stack. It also lets us get rid of a SWAPGS_UNSAFE_STACK user. + +This does not depend on CONFIG_IA32_EMULATION=y because we'll want the +stack space even without IA32 emulation. + +As far as I can tell, the reason that this wasn't done from day 1 is +that we use IST for #DB and #BP, which is IMO rather nasty and causes +a lot more problems than it solves. But, since #DB uses IST, we don't +actually need a real stack for SYSENTER (because SYSENTER with TF set +will invoke #DB on the IST stack rather than the SYSENTER stack). + +I want to remove IST usage from these vectors some day, and this patch +is a prerequisite for that as well. + +Signed-off-by: Andy Lutomirski <luto@kernel.org> +Signed-off-by: Thomas Gleixner <tglx@linutronix.de> +Reviewed-by: Thomas Gleixner <tglx@linutronix.de> +Reviewed-by: Borislav Petkov <bp@suse.de> +Cc: Boris Ostrovsky <boris.ostrovsky@oracle.com> +Cc: Borislav Petkov <bp@alien8.de> +Cc: Borislav Petkov <bpetkov@suse.de> +Cc: Brian Gerst <brgerst@gmail.com> +Cc: Dave Hansen <dave.hansen@intel.com> +Cc: Dave Hansen <dave.hansen@linux.intel.com> +Cc: David Laight <David.Laight@aculab.com> +Cc: Denys Vlasenko <dvlasenk@redhat.com> +Cc: Eduardo Valentin <eduval@amazon.com> +Cc: Greg KH <gregkh@linuxfoundation.org> +Cc: H. 
Peter Anvin <hpa@zytor.com> +Cc: Josh Poimboeuf <jpoimboe@redhat.com> +Cc: Juergen Gross <jgross@suse.com> +Cc: Linus Torvalds <torvalds@linux-foundation.org> +Cc: Peter Zijlstra <peterz@infradead.org> +Cc: Rik van Riel <riel@redhat.com> +Cc: Will Deacon <will.deacon@arm.com> +Cc: aliguori@amazon.com +Cc: daniel.gruss@iaik.tugraz.at +Cc: hughd@google.com +Cc: keescook@google.com +Link: https://lkml.kernel.org/r/20171204150605.312726423@linutronix.de +Signed-off-by: Ingo Molnar <mingo@kernel.org> + +diff --git a/arch/x86/entry/entry_64_compat.S b/arch/x86/entry/entry_64_compat.S +index 568e130d932c..dcc6987f9bae 100644 +--- a/arch/x86/entry/entry_64_compat.S ++++ b/arch/x86/entry/entry_64_compat.S +@@ -48,7 +48,7 @@ + */ + ENTRY(entry_SYSENTER_compat) + /* Interrupts are off on entry. */ +- SWAPGS_UNSAFE_STACK ++ SWAPGS + movq PER_CPU_VAR(cpu_current_top_of_stack), %rsp + + /* +diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h +index 2db7cf720b04..789dad5da20f 100644 +--- a/arch/x86/include/asm/processor.h ++++ b/arch/x86/include/asm/processor.h +@@ -339,14 +339,11 @@ struct tss_struct { + */ + unsigned long io_bitmap[IO_BITMAP_LONGS + 1]; + +-#ifdef CONFIG_X86_32 + /* + * Space for the temporary SYSENTER stack. 
+ */ + unsigned long SYSENTER_stack_canary; + unsigned long SYSENTER_stack[64]; +-#endif +- + } ____cacheline_aligned; + + DECLARE_PER_CPU_SHARED_ALIGNED(struct tss_struct, cpu_tss); +diff --git a/arch/x86/kernel/asm-offsets.c b/arch/x86/kernel/asm-offsets.c +index 8ea78275480d..b275863128eb 100644 +--- a/arch/x86/kernel/asm-offsets.c ++++ b/arch/x86/kernel/asm-offsets.c +@@ -93,4 +93,9 @@ void common(void) { + + BLANK(); + DEFINE(PTREGS_SIZE, sizeof(struct pt_regs)); ++ ++ /* Offset from cpu_tss to SYSENTER_stack */ ++ OFFSET(CPU_TSS_SYSENTER_stack, tss_struct, SYSENTER_stack); ++ /* Size of SYSENTER_stack */ ++ DEFINE(SIZEOF_SYSENTER_stack, sizeof(((struct tss_struct *)0)->SYSENTER_stack)); + } +diff --git a/arch/x86/kernel/asm-offsets_32.c b/arch/x86/kernel/asm-offsets_32.c +index dedf428b20b6..52ce4ea16e53 100644 +--- a/arch/x86/kernel/asm-offsets_32.c ++++ b/arch/x86/kernel/asm-offsets_32.c +@@ -50,11 +50,6 @@ void foo(void) + DEFINE(TSS_sysenter_sp0, offsetof(struct tss_struct, x86_tss.sp0) - + offsetofend(struct tss_struct, SYSENTER_stack)); + +- /* Offset from cpu_tss to SYSENTER_stack */ +- OFFSET(CPU_TSS_SYSENTER_stack, tss_struct, SYSENTER_stack); +- /* Size of SYSENTER_stack */ +- DEFINE(SIZEOF_SYSENTER_stack, sizeof(((struct tss_struct *)0)->SYSENTER_stack)); +- + #ifdef CONFIG_CC_STACKPROTECTOR + BLANK(); + OFFSET(stack_canary_offset, stack_canary, canary); +diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c +index cdf79ab628c2..22f542170198 100644 +--- a/arch/x86/kernel/cpu/common.c ++++ b/arch/x86/kernel/cpu/common.c +@@ -1361,7 +1361,9 @@ void syscall_init(void) + * AMD doesn't allow SYSENTER in long mode (either 32- or 64-bit). 
+ */ + wrmsrl_safe(MSR_IA32_SYSENTER_CS, (u64)__KERNEL_CS); +- wrmsrl_safe(MSR_IA32_SYSENTER_ESP, 0ULL); ++ wrmsrl_safe(MSR_IA32_SYSENTER_ESP, ++ (unsigned long)this_cpu_ptr(&cpu_tss) + ++ offsetofend(struct tss_struct, SYSENTER_stack)); + wrmsrl_safe(MSR_IA32_SYSENTER_EIP, (u64)entry_SYSENTER_compat); + #else + wrmsrl(MSR_CSTAR, (unsigned long)ignore_sysret); +diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c +index 97fb3e5737f5..35d674157fda 100644 +--- a/arch/x86/kernel/process.c ++++ b/arch/x86/kernel/process.c +@@ -71,9 +71,7 @@ __visible DEFINE_PER_CPU_SHARED_ALIGNED(struct tss_struct, cpu_tss) = { + */ + .io_bitmap = { [0 ... IO_BITMAP_LONGS] = ~0 }, + #endif +-#ifdef CONFIG_X86_32 + .SYSENTER_stack_canary = STACK_END_MAGIC, +-#endif + }; + EXPORT_PER_CPU_SYMBOL(cpu_tss); + +diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c +index d366adfc61da..d3e3bbd5d3a0 100644 +--- a/arch/x86/kernel/traps.c ++++ b/arch/x86/kernel/traps.c +@@ -794,14 +794,13 @@ dotraplinkage void do_debug(struct pt_regs *regs, long error_code) + debug_stack_usage_dec(); + + exit: +-#if defined(CONFIG_X86_32) + /* + * This is the most likely code path that involves non-trivial use + * of the SYSENTER stack. Check that we haven't overrun it. 
+ */ + WARN(this_cpu_read(cpu_tss.SYSENTER_stack_canary) != STACK_END_MAGIC, + "Overran or corrupted SYSENTER stack\n"); +-#endif ++ + ist_exit(regs); + } + NOKPROBE_SYMBOL(do_debug); +-- +2.15.0 + diff --git a/queue/x86-entry-64-Create-a-per-CPU-SYSCALL-entry-trampoli.patch b/queue/x86-entry-64-Create-a-per-CPU-SYSCALL-entry-trampoli.patch new file mode 100644 index 0000000..59ce02e --- /dev/null +++ b/queue/x86-entry-64-Create-a-per-CPU-SYSCALL-entry-trampoli.patch @@ -0,0 +1,226 @@ +From 3386bc8aed825e9f1f65ce38df4b109b2019b71a Mon Sep 17 00:00:00 2001 +From: Andy Lutomirski <luto@kernel.org> +Date: Mon, 4 Dec 2017 15:07:25 +0100 +Subject: [PATCH] x86/entry/64: Create a per-CPU SYSCALL entry trampoline + +commit 3386bc8aed825e9f1f65ce38df4b109b2019b71a upstream. + +Handling SYSCALL is tricky: the SYSCALL handler is entered with every +single register (except FLAGS), including RSP, live. It somehow needs +to set RSP to point to a valid stack, which means it needs to save the +user RSP somewhere and find its own stack pointer. The canonical way +to do this is with SWAPGS, which lets us access percpu data using the +%gs prefix. + +With PAGE_TABLE_ISOLATION-like pagetable switching, this is +problematic. Without a scratch register, switching CR3 is impossible, so +%gs-based percpu memory would need to be mapped in the user pagetables. +Doing that without information leaks is difficult or impossible. + +Instead, use a different sneaky trick. Map a copy of the first part +of the SYSCALL asm at a different address for each CPU. Now RIP +varies depending on the CPU, so we can use RIP-relative memory access +to access percpu memory. By putting the relevant information (one +scratch slot and the stack address) at a constant offset relative to +RIP, we can make SYSCALL work without relying on %gs. + +A nice thing about this approach is that we can easily switch it on +and off if we want pagetable switching to be configurable. 
+ +The compat variant of SYSCALL doesn't have this problem in the first +place -- there are plenty of scratch registers, since we don't care +about preserving r8-r15. This patch therefore doesn't touch SYSCALL32 +at all. + +This patch actually seems to be a small speedup. With this patch, +SYSCALL touches an extra cache line and an extra virtual page, but +the pipeline no longer stalls waiting for SWAPGS. It seems that, at +least in a tight loop, the latter outweighs the former. + +Thanks to David Laight for an optimization tip. + +Signed-off-by: Andy Lutomirski <luto@kernel.org> +Signed-off-by: Thomas Gleixner <tglx@linutronix.de> +Reviewed-by: Thomas Gleixner <tglx@linutronix.de> +Reviewed-by: Borislav Petkov <bpetkov@suse.de> +Cc: Boris Ostrovsky <boris.ostrovsky@oracle.com> +Cc: Borislav Petkov <bp@alien8.de> +Cc: Brian Gerst <brgerst@gmail.com> +Cc: Dave Hansen <dave.hansen@intel.com> +Cc: Dave Hansen <dave.hansen@linux.intel.com> +Cc: David Laight <David.Laight@aculab.com> +Cc: Denys Vlasenko <dvlasenk@redhat.com> +Cc: Eduardo Valentin <eduval@amazon.com> +Cc: Greg KH <gregkh@linuxfoundation.org> +Cc: H. Peter Anvin <hpa@zytor.com> +Cc: Josh Poimboeuf <jpoimboe@redhat.com> +Cc: Juergen Gross <jgross@suse.com> +Cc: Linus Torvalds <torvalds@linux-foundation.org> +Cc: Peter Zijlstra <peterz@infradead.org> +Cc: Rik van Riel <riel@redhat.com> +Cc: Will Deacon <will.deacon@arm.com> +Cc: aliguori@amazon.com +Cc: daniel.gruss@iaik.tugraz.at +Cc: hughd@google.com +Cc: keescook@google.com +Link: https://lkml.kernel.org/r/20171204150606.403607157@linutronix.de +Signed-off-by: Ingo Molnar <mingo@kernel.org> + +diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S +index 42a9379f7acb..2582984ffb4b 100644 +--- a/arch/x86/entry/entry_64.S ++++ b/arch/x86/entry/entry_64.S +@@ -136,6 +136,64 @@ END(native_usergs_sysret64) + * with them due to bugs in both AMD and Intel CPUs. 
+ */ + ++ .pushsection .entry_trampoline, "ax" ++ ++/* ++ * The code in here gets remapped into cpu_entry_area's trampoline. This means ++ * that the assembler and linker have the wrong idea as to where this code ++ * lives (and, in fact, it's mapped more than once, so it's not even at a ++ * fixed address). So we can't reference any symbols outside the entry ++ * trampoline and expect it to work. ++ * ++ * Instead, we carefully abuse %rip-relative addressing. ++ * _entry_trampoline(%rip) refers to the start of the remapped) entry ++ * trampoline. We can thus find cpu_entry_area with this macro: ++ */ ++ ++#define CPU_ENTRY_AREA \ ++ _entry_trampoline - CPU_ENTRY_AREA_entry_trampoline(%rip) ++ ++/* The top word of the SYSENTER stack is hot and is usable as scratch space. */ ++#define RSP_SCRATCH CPU_ENTRY_AREA_tss + CPU_TSS_SYSENTER_stack + \ ++ SIZEOF_SYSENTER_stack - 8 + CPU_ENTRY_AREA ++ ++ENTRY(entry_SYSCALL_64_trampoline) ++ UNWIND_HINT_EMPTY ++ swapgs ++ ++ /* Stash the user RSP. */ ++ movq %rsp, RSP_SCRATCH ++ ++ /* Load the top of the task stack into RSP */ ++ movq CPU_ENTRY_AREA_tss + TSS_sp1 + CPU_ENTRY_AREA, %rsp ++ ++ /* Start building the simulated IRET frame. */ ++ pushq $__USER_DS /* pt_regs->ss */ ++ pushq RSP_SCRATCH /* pt_regs->sp */ ++ pushq %r11 /* pt_regs->flags */ ++ pushq $__USER_CS /* pt_regs->cs */ ++ pushq %rcx /* pt_regs->ip */ ++ ++ /* ++ * x86 lacks a near absolute jump, and we can't jump to the real ++ * entry text with a relative jump. We could push the target ++ * address and then use retq, but this destroys the pipeline on ++ * many CPUs (wasting over 20 cycles on Sandy Bridge). Instead, ++ * spill RDI and restore it in a second-stage trampoline. 
++ */ ++ pushq %rdi ++ movq $entry_SYSCALL_64_stage2, %rdi ++ jmp *%rdi ++END(entry_SYSCALL_64_trampoline) ++ ++ .popsection ++ ++ENTRY(entry_SYSCALL_64_stage2) ++ UNWIND_HINT_EMPTY ++ popq %rdi ++ jmp entry_SYSCALL_64_after_hwframe ++END(entry_SYSCALL_64_stage2) ++ + ENTRY(entry_SYSCALL_64) + UNWIND_HINT_EMPTY + /* +diff --git a/arch/x86/include/asm/fixmap.h b/arch/x86/include/asm/fixmap.h +index 84558b611ad3..6a699474c2c7 100644 +--- a/arch/x86/include/asm/fixmap.h ++++ b/arch/x86/include/asm/fixmap.h +@@ -61,6 +61,8 @@ struct cpu_entry_area { + * of the TSS region. + */ + struct tss_struct tss; ++ ++ char entry_trampoline[PAGE_SIZE]; + }; + + #define CPU_ENTRY_AREA_PAGES (sizeof(struct cpu_entry_area) / PAGE_SIZE) +diff --git a/arch/x86/kernel/asm-offsets.c b/arch/x86/kernel/asm-offsets.c +index 55858b277cf6..61b1af88ac07 100644 +--- a/arch/x86/kernel/asm-offsets.c ++++ b/arch/x86/kernel/asm-offsets.c +@@ -101,4 +101,5 @@ void common(void) { + + /* Layout info for cpu_entry_area */ + OFFSET(CPU_ENTRY_AREA_tss, cpu_entry_area, tss); ++ OFFSET(CPU_ENTRY_AREA_entry_trampoline, cpu_entry_area, entry_trampoline); + } +diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c +index 57968880e39b..430f950b0b7f 100644 +--- a/arch/x86/kernel/cpu/common.c ++++ b/arch/x86/kernel/cpu/common.c +@@ -486,6 +486,8 @@ DEFINE_PER_CPU(struct cpu_entry_area *, cpu_entry_area); + static inline void setup_cpu_entry_area(int cpu) + { + #ifdef CONFIG_X86_64 ++ extern char _entry_trampoline[]; ++ + /* On 64-bit systems, we use a read-only fixmap GDT. 
*/ + pgprot_t gdt_prot = PAGE_KERNEL_RO; + #else +@@ -532,6 +534,11 @@ static inline void setup_cpu_entry_area(int cpu) + #ifdef CONFIG_X86_32 + this_cpu_write(cpu_entry_area, get_cpu_entry_area(cpu)); + #endif ++ ++#ifdef CONFIG_X86_64 ++ __set_fixmap(get_cpu_entry_area_index(cpu, entry_trampoline), ++ __pa_symbol(_entry_trampoline), PAGE_KERNEL_RX); ++#endif + } + + /* Load the original GDT from the per-cpu structure */ +@@ -1395,10 +1402,16 @@ static DEFINE_PER_CPU_PAGE_ALIGNED(char, exception_stacks + /* May not be marked __init: used by software suspend */ + void syscall_init(void) + { ++ extern char _entry_trampoline[]; ++ extern char entry_SYSCALL_64_trampoline[]; ++ + int cpu = smp_processor_id(); ++ unsigned long SYSCALL64_entry_trampoline = ++ (unsigned long)get_cpu_entry_area(cpu)->entry_trampoline + ++ (entry_SYSCALL_64_trampoline - _entry_trampoline); + + wrmsr(MSR_STAR, 0, (__USER32_CS << 16) | __KERNEL_CS); +- wrmsrl(MSR_LSTAR, (unsigned long)entry_SYSCALL_64); ++ wrmsrl(MSR_LSTAR, SYSCALL64_entry_trampoline); + + #ifdef CONFIG_IA32_EMULATION + wrmsrl(MSR_CSTAR, (unsigned long)entry_SYSCALL_compat); +diff --git a/arch/x86/kernel/vmlinux.lds.S b/arch/x86/kernel/vmlinux.lds.S +index a4009fb9be87..d2a8b5a24a44 100644 +--- a/arch/x86/kernel/vmlinux.lds.S ++++ b/arch/x86/kernel/vmlinux.lds.S +@@ -107,6 +107,15 @@ SECTIONS + SOFTIRQENTRY_TEXT + *(.fixup) + *(.gnu.warning) ++ ++#ifdef CONFIG_X86_64 ++ . = ALIGN(PAGE_SIZE); ++ _entry_trampoline = .; ++ *(.entry_trampoline) ++ . = ALIGN(PAGE_SIZE); ++ ASSERT(. 
- _entry_trampoline == PAGE_SIZE, "entry trampoline is too big"); ++#endif ++ + /* End of text section */ + _etext = .; + } :text = 0x9090 +-- +2.15.0 + diff --git a/queue/x86-entry-64-De-Xen-ify-our-NMI-code.patch b/queue/x86-entry-64-De-Xen-ify-our-NMI-code.patch new file mode 100644 index 0000000..383dce6 --- /dev/null +++ b/queue/x86-entry-64-De-Xen-ify-our-NMI-code.patch @@ -0,0 +1,106 @@ +From 929bacec21478a72c78e4f29f98fb799bd00105a Mon Sep 17 00:00:00 2001 +From: Andy Lutomirski <luto@kernel.org> +Date: Thu, 2 Nov 2017 00:59:08 -0700 +Subject: [PATCH] x86/entry/64: De-Xen-ify our NMI code + +commit 929bacec21478a72c78e4f29f98fb799bd00105a upstream. + +Xen PV is fundamentally incompatible with our fancy NMI code: it +doesn't use IST at all, and Xen entries clobber two stack slots +below the hardware frame. + +Drop Xen PV support from our NMI code entirely. + +Signed-off-by: Andy Lutomirski <luto@kernel.org> +Reviewed-by: Borislav Petkov <bp@suse.de> +Acked-by: Juergen Gross <jgross@suse.com> +Cc: Boris Ostrovsky <boris.ostrovsky@oracle.com> +Cc: Borislav Petkov <bpetkov@suse.de> +Cc: Brian Gerst <brgerst@gmail.com> +Cc: Dave Hansen <dave.hansen@intel.com> +Cc: Linus Torvalds <torvalds@linux-foundation.org> +Cc: Peter Zijlstra <peterz@infradead.org> +Cc: Thomas Gleixner <tglx@linutronix.de> +Link: http://lkml.kernel.org/r/bfbe711b5ae03f672f8848999a8eb2711efc7f98.1509609304.git.luto@kernel.org +Signed-off-by: Ingo Molnar <mingo@kernel.org> + +diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S +index a3f76ab5d0ea..40e9933a2d33 100644 +--- a/arch/x86/entry/entry_64.S ++++ b/arch/x86/entry/entry_64.S +@@ -1240,9 +1240,13 @@ ENTRY(error_exit) + jmp retint_user + END(error_exit) + +-/* Runs on exception stack */ ++/* ++ * Runs on exception stack. Xen PV does not go through this path at all, ++ * so we can use real assembly here. ++ */ + ENTRY(nmi) + UNWIND_HINT_IRET_REGS ++ + /* + * We allow breakpoints in NMIs. 
If a breakpoint occurs, then + * the iretq it performs will take us out of NMI context. +@@ -1300,7 +1304,7 @@ ENTRY(nmi) + * stacks lest we corrupt the "NMI executing" variable. + */ + +- SWAPGS_UNSAFE_STACK ++ swapgs + cld + movq %rsp, %rdx + movq PER_CPU_VAR(cpu_current_top_of_stack), %rsp +@@ -1465,7 +1469,7 @@ nested_nmi_out: + popq %rdx + + /* We are returning to kernel mode, so this cannot result in a fault. */ +- INTERRUPT_RETURN ++ iretq + + first_nmi: + /* Restore rdx. */ +@@ -1496,7 +1500,7 @@ first_nmi: + pushfq /* RFLAGS */ + pushq $__KERNEL_CS /* CS */ + pushq $1f /* RIP */ +- INTERRUPT_RETURN /* continues at repeat_nmi below */ ++ iretq /* continues at repeat_nmi below */ + UNWIND_HINT_IRET_REGS + 1: + #endif +@@ -1571,20 +1575,22 @@ nmi_restore: + /* + * Clear "NMI executing". Set DF first so that we can easily + * distinguish the remaining code between here and IRET from +- * the SYSCALL entry and exit paths. On a native kernel, we +- * could just inspect RIP, but, on paravirt kernels, +- * INTERRUPT_RETURN can translate into a jump into a +- * hypercall page. ++ * the SYSCALL entry and exit paths. ++ * ++ * We arguably should just inspect RIP instead, but I (Andy) wrote ++ * this code when I had the misapprehension that Xen PV supported ++ * NMIs, and Xen PV would break that approach. + */ + std + movq $0, 5*8(%rsp) /* clear "NMI executing" */ + + /* +- * INTERRUPT_RETURN reads the "iret" frame and exits the NMI +- * stack in a single instruction. We are returning to kernel +- * mode, so this cannot result in a fault. ++ * iretq reads the "iret" frame and exits the NMI stack in a ++ * single instruction. We are returning to kernel mode, so this ++ * cannot result in a fault. Similarly, we don't need to worry ++ * about espfix64 on the way back to kernel mode. 
+ */ +- INTERRUPT_RETURN ++ iretq + END(nmi) + + ENTRY(ignore_sysret) +-- +2.15.0 + diff --git a/queue/x86-entry-64-Make-cpu_entry_area.tss-read-only.patch b/queue/x86-entry-64-Make-cpu_entry_area.tss-read-only.patch new file mode 100644 index 0000000..d3fd08a --- /dev/null +++ b/queue/x86-entry-64-Make-cpu_entry_area.tss-read-only.patch @@ -0,0 +1,466 @@ +From c482feefe1aeb150156248ba0fd3e029bc886605 Mon Sep 17 00:00:00 2001 +From: Andy Lutomirski <luto@kernel.org> +Date: Mon, 4 Dec 2017 15:07:29 +0100 +Subject: [PATCH] x86/entry/64: Make cpu_entry_area.tss read-only + +commit c482feefe1aeb150156248ba0fd3e029bc886605 upstream. + +The TSS is a fairly juicy target for exploits, and, now that the TSS +is in the cpu_entry_area, it's no longer protected by kASLR. Make it +read-only on x86_64. + +On x86_32, it can't be RO because it's written by the CPU during task +switches, and we use a task gate for double faults. I'd also be +nervous about errata if we tried to make it RO even on configurations +without double fault handling. + +[ tglx: AMD confirmed that there is no problem on 64-bit with TSS RO. So + it's probably safe to assume that it's a non issue, though Intel + might have been creative in that area. Still waiting for + confirmation. ] + +Signed-off-by: Andy Lutomirski <luto@kernel.org> +Signed-off-by: Thomas Gleixner <tglx@linutronix.de> +Reviewed-by: Borislav Petkov <bpetkov@suse.de> +Cc: Boris Ostrovsky <boris.ostrovsky@oracle.com> +Cc: Borislav Petkov <bp@alien8.de> +Cc: Brian Gerst <brgerst@gmail.com> +Cc: Dave Hansen <dave.hansen@intel.com> +Cc: Dave Hansen <dave.hansen@linux.intel.com> +Cc: David Laight <David.Laight@aculab.com> +Cc: Denys Vlasenko <dvlasenk@redhat.com> +Cc: Eduardo Valentin <eduval@amazon.com> +Cc: Greg KH <gregkh@linuxfoundation.org> +Cc: H. 
Peter Anvin <hpa@zytor.com> +Cc: Josh Poimboeuf <jpoimboe@redhat.com> +Cc: Juergen Gross <jgross@suse.com> +Cc: Kees Cook <keescook@chromium.org> +Cc: Linus Torvalds <torvalds@linux-foundation.org> +Cc: Peter Zijlstra <peterz@infradead.org> +Cc: Rik van Riel <riel@redhat.com> +Cc: Will Deacon <will.deacon@arm.com> +Cc: aliguori@amazon.com +Cc: daniel.gruss@iaik.tugraz.at +Cc: hughd@google.com +Cc: keescook@google.com +Link: https://lkml.kernel.org/r/20171204150606.733700132@linutronix.de +Signed-off-by: Ingo Molnar <mingo@kernel.org> + +diff --git a/arch/x86/entry/entry_32.S b/arch/x86/entry/entry_32.S +index 3629bcbf85a2..bd8b57a5c874 100644 +--- a/arch/x86/entry/entry_32.S ++++ b/arch/x86/entry/entry_32.S +@@ -942,7 +942,7 @@ ENTRY(debug) + + /* Are we currently on the SYSENTER stack? */ + movl PER_CPU_VAR(cpu_entry_area), %ecx +- addl $CPU_ENTRY_AREA_tss + TSS_STRUCT_SYSENTER_stack + SIZEOF_SYSENTER_stack, %ecx ++ addl $CPU_ENTRY_AREA_SYSENTER_stack + SIZEOF_SYSENTER_stack, %ecx + subl %eax, %ecx /* ecx = (end of SYSENTER_stack) - esp */ + cmpl $SIZEOF_SYSENTER_stack, %ecx + jb .Ldebug_from_sysenter_stack +@@ -986,7 +986,7 @@ ENTRY(nmi) + + /* Are we currently on the SYSENTER stack? */ + movl PER_CPU_VAR(cpu_entry_area), %ecx +- addl $CPU_ENTRY_AREA_tss + TSS_STRUCT_SYSENTER_stack + SIZEOF_SYSENTER_stack, %ecx ++ addl $CPU_ENTRY_AREA_SYSENTER_stack + SIZEOF_SYSENTER_stack, %ecx + subl %eax, %ecx /* ecx = (end of SYSENTER_stack) - esp */ + cmpl $SIZEOF_SYSENTER_stack, %ecx + jb .Lnmi_from_sysenter_stack +diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S +index 575b184f377f..2812ce043a7a 100644 +--- a/arch/x86/entry/entry_64.S ++++ b/arch/x86/entry/entry_64.S +@@ -154,7 +154,7 @@ END(native_usergs_sysret64) + _entry_trampoline - CPU_ENTRY_AREA_entry_trampoline(%rip) + + /* The top word of the SYSENTER stack is hot and is usable as scratch space. 
*/ +-#define RSP_SCRATCH CPU_ENTRY_AREA_tss + TSS_STRUCT_SYSENTER_stack + \ ++#define RSP_SCRATCH CPU_ENTRY_AREA_SYSENTER_stack + \ + SIZEOF_SYSENTER_stack - 8 + CPU_ENTRY_AREA + + ENTRY(entry_SYSCALL_64_trampoline) +@@ -390,7 +390,7 @@ syscall_return_via_sysret: + * Save old stack pointer and switch to trampoline stack. + */ + movq %rsp, %rdi +- movq PER_CPU_VAR(cpu_tss + TSS_sp0), %rsp ++ movq PER_CPU_VAR(cpu_tss_rw + TSS_sp0), %rsp + + pushq RSP-RDI(%rdi) /* RSP */ + pushq (%rdi) /* RDI */ +@@ -719,7 +719,7 @@ GLOBAL(swapgs_restore_regs_and_return_to_usermode) + * Save old stack pointer and switch to trampoline stack. + */ + movq %rsp, %rdi +- movq PER_CPU_VAR(cpu_tss + TSS_sp0), %rsp ++ movq PER_CPU_VAR(cpu_tss_rw + TSS_sp0), %rsp + + /* Copy the IRET frame to the trampoline stack. */ + pushq 6*8(%rdi) /* SS */ +@@ -934,7 +934,7 @@ apicinterrupt IRQ_WORK_VECTOR irq_work_interrupt smp_irq_work_interrupt + /* + * Exception entry points. + */ +-#define CPU_TSS_IST(x) PER_CPU_VAR(cpu_tss) + (TSS_ist + ((x) - 1) * 8) ++#define CPU_TSS_IST(x) PER_CPU_VAR(cpu_tss_rw) + (TSS_ist + ((x) - 1) * 8) + + /* + * Switch to the thread stack. This is called with the IRET frame and +diff --git a/arch/x86/include/asm/fixmap.h b/arch/x86/include/asm/fixmap.h +index cc5d98bdca37..94fc4fa14127 100644 +--- a/arch/x86/include/asm/fixmap.h ++++ b/arch/x86/include/asm/fixmap.h +@@ -56,9 +56,14 @@ struct cpu_entry_area { + char gdt[PAGE_SIZE]; + + /* +- * The GDT is just below cpu_tss and thus serves (on x86_64) as a +- * a read-only guard page for the SYSENTER stack at the bottom +- * of the TSS region. ++ * The GDT is just below SYSENTER_stack and thus serves (on x86_64) as ++ * a a read-only guard page. ++ */ ++ struct SYSENTER_stack_page SYSENTER_stack_page; ++ ++ /* ++ * On x86_64, the TSS is mapped RO. On x86_32, it's mapped RW because ++ * we need task switches to work, and task switches write to the TSS. 
+ */ + struct tss_struct tss; + +@@ -247,7 +252,7 @@ static inline struct cpu_entry_area *get_cpu_entry_area(int cpu) + + static inline struct SYSENTER_stack *cpu_SYSENTER_stack(int cpu) + { +- return &get_cpu_entry_area(cpu)->tss.SYSENTER_stack; ++ return &get_cpu_entry_area(cpu)->SYSENTER_stack_page.stack; + } + + #endif /* !__ASSEMBLY__ */ +diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h +index f933869470b8..e8991d7f7034 100644 +--- a/arch/x86/include/asm/processor.h ++++ b/arch/x86/include/asm/processor.h +@@ -340,13 +340,11 @@ struct SYSENTER_stack { + unsigned long words[64]; + }; + +-struct tss_struct { +- /* +- * Space for the temporary SYSENTER stack, used for SYSENTER +- * and the entry trampoline as well. +- */ +- struct SYSENTER_stack SYSENTER_stack; ++struct SYSENTER_stack_page { ++ struct SYSENTER_stack stack; ++} __aligned(PAGE_SIZE); + ++struct tss_struct { + /* + * The fixed hardware portion. This must not cross a page boundary + * at risk of violating the SDM's advice and potentially triggering +@@ -363,7 +361,7 @@ struct tss_struct { + unsigned long io_bitmap[IO_BITMAP_LONGS + 1]; + } __aligned(PAGE_SIZE); + +-DECLARE_PER_CPU_PAGE_ALIGNED(struct tss_struct, cpu_tss); ++DECLARE_PER_CPU_PAGE_ALIGNED(struct tss_struct, cpu_tss_rw); + + /* + * sizeof(unsigned long) coming from an extra "long" at the end +@@ -378,7 +376,8 @@ DECLARE_PER_CPU_PAGE_ALIGNED(struct tss_struct, cpu_tss); + #ifdef CONFIG_X86_32 + DECLARE_PER_CPU(unsigned long, cpu_current_top_of_stack); + #else +-#define cpu_current_top_of_stack cpu_tss.x86_tss.sp1 ++/* The RO copy can't be accessed with this_cpu_xyz(), so use the RW copy. 
*/ ++#define cpu_current_top_of_stack cpu_tss_rw.x86_tss.sp1 + #endif + + /* +@@ -538,7 +537,7 @@ static inline void native_set_iopl_mask(unsigned mask) + static inline void + native_load_sp0(unsigned long sp0) + { +- this_cpu_write(cpu_tss.x86_tss.sp0, sp0); ++ this_cpu_write(cpu_tss_rw.x86_tss.sp0, sp0); + } + + static inline void native_swapgs(void) +diff --git a/arch/x86/include/asm/switch_to.h b/arch/x86/include/asm/switch_to.h +index cbc71e73bd32..9b6df68d8fd1 100644 +--- a/arch/x86/include/asm/switch_to.h ++++ b/arch/x86/include/asm/switch_to.h +@@ -79,10 +79,10 @@ do { \ + static inline void refresh_sysenter_cs(struct thread_struct *thread) + { + /* Only happens when SEP is enabled, no need to test "SEP"arately: */ +- if (unlikely(this_cpu_read(cpu_tss.x86_tss.ss1) == thread->sysenter_cs)) ++ if (unlikely(this_cpu_read(cpu_tss_rw.x86_tss.ss1) == thread->sysenter_cs)) + return; + +- this_cpu_write(cpu_tss.x86_tss.ss1, thread->sysenter_cs); ++ this_cpu_write(cpu_tss_rw.x86_tss.ss1, thread->sysenter_cs); + wrmsr(MSR_IA32_SYSENTER_CS, thread->sysenter_cs, 0); + } + #endif +diff --git a/arch/x86/include/asm/thread_info.h b/arch/x86/include/asm/thread_info.h +index 44a04999791e..00223333821a 100644 +--- a/arch/x86/include/asm/thread_info.h ++++ b/arch/x86/include/asm/thread_info.h +@@ -207,7 +207,7 @@ static inline int arch_within_stack_frames(const void * const stack, + #else /* !__ASSEMBLY__ */ + + #ifdef CONFIG_X86_64 +-# define cpu_current_top_of_stack (cpu_tss + TSS_sp1) ++# define cpu_current_top_of_stack (cpu_tss_rw + TSS_sp1) + #endif + + #endif +diff --git a/arch/x86/kernel/asm-offsets.c b/arch/x86/kernel/asm-offsets.c +index 46c0995344aa..cd360a5e0dca 100644 +--- a/arch/x86/kernel/asm-offsets.c ++++ b/arch/x86/kernel/asm-offsets.c +@@ -94,10 +94,9 @@ void common(void) { + BLANK(); + DEFINE(PTREGS_SIZE, sizeof(struct pt_regs)); + +- OFFSET(TSS_STRUCT_SYSENTER_stack, tss_struct, SYSENTER_stack); +- DEFINE(SIZEOF_SYSENTER_stack, sizeof(struct 
SYSENTER_stack)); +- + /* Layout info for cpu_entry_area */ + OFFSET(CPU_ENTRY_AREA_tss, cpu_entry_area, tss); + OFFSET(CPU_ENTRY_AREA_entry_trampoline, cpu_entry_area, entry_trampoline); ++ OFFSET(CPU_ENTRY_AREA_SYSENTER_stack, cpu_entry_area, SYSENTER_stack_page); ++ DEFINE(SIZEOF_SYSENTER_stack, sizeof(struct SYSENTER_stack)); + } +diff --git a/arch/x86/kernel/asm-offsets_32.c b/arch/x86/kernel/asm-offsets_32.c +index 52ce4ea16e53..7d20d9c0b3d6 100644 +--- a/arch/x86/kernel/asm-offsets_32.c ++++ b/arch/x86/kernel/asm-offsets_32.c +@@ -47,8 +47,8 @@ void foo(void) + BLANK(); + + /* Offset from the sysenter stack to tss.sp0 */ +- DEFINE(TSS_sysenter_sp0, offsetof(struct tss_struct, x86_tss.sp0) - +- offsetofend(struct tss_struct, SYSENTER_stack)); ++ DEFINE(TSS_sysenter_sp0, offsetof(struct cpu_entry_area, tss.x86_tss.sp0) - ++ offsetofend(struct cpu_entry_area, SYSENTER_stack_page.stack)); + + #ifdef CONFIG_CC_STACKPROTECTOR + BLANK(); +diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c +index 3de7480e4f32..c2eada1056de 100644 +--- a/arch/x86/kernel/cpu/common.c ++++ b/arch/x86/kernel/cpu/common.c +@@ -487,6 +487,9 @@ static DEFINE_PER_CPU_PAGE_ALIGNED(char, exception_stacks + [(N_EXCEPTION_STACKS - 1) * EXCEPTION_STKSZ + DEBUG_STKSZ]); + #endif + ++static DEFINE_PER_CPU_PAGE_ALIGNED(struct SYSENTER_stack_page, ++ SYSENTER_stack_storage); ++ + static void __init + set_percpu_fixmap_pages(int idx, void *ptr, int pages, pgprot_t prot) + { +@@ -500,23 +503,29 @@ static void __init setup_cpu_entry_area(int cpu) + #ifdef CONFIG_X86_64 + extern char _entry_trampoline[]; + +- /* On 64-bit systems, we use a read-only fixmap GDT. */ ++ /* On 64-bit systems, we use a read-only fixmap GDT and TSS. 
*/ + pgprot_t gdt_prot = PAGE_KERNEL_RO; ++ pgprot_t tss_prot = PAGE_KERNEL_RO; + #else + /* + * On native 32-bit systems, the GDT cannot be read-only because + * our double fault handler uses a task gate, and entering through +- * a task gate needs to change an available TSS to busy. If the GDT +- * is read-only, that will triple fault. ++ * a task gate needs to change an available TSS to busy. If the ++ * GDT is read-only, that will triple fault. The TSS cannot be ++ * read-only because the CPU writes to it on task switches. + * +- * On Xen PV, the GDT must be read-only because the hypervisor requires +- * it. ++ * On Xen PV, the GDT must be read-only because the hypervisor ++ * requires it. + */ + pgprot_t gdt_prot = boot_cpu_has(X86_FEATURE_XENPV) ? + PAGE_KERNEL_RO : PAGE_KERNEL; ++ pgprot_t tss_prot = PAGE_KERNEL; + #endif + + __set_fixmap(get_cpu_entry_area_index(cpu, gdt), get_cpu_gdt_paddr(cpu), gdt_prot); ++ set_percpu_fixmap_pages(get_cpu_entry_area_index(cpu, SYSENTER_stack_page), ++ per_cpu_ptr(&SYSENTER_stack_storage, cpu), 1, ++ PAGE_KERNEL); + + /* + * The Intel SDM says (Volume 3, 7.2.1): +@@ -539,9 +548,9 @@ static void __init setup_cpu_entry_area(int cpu) + offsetofend(struct tss_struct, x86_tss)) & PAGE_MASK); + BUILD_BUG_ON(sizeof(struct tss_struct) % PAGE_SIZE != 0); + set_percpu_fixmap_pages(get_cpu_entry_area_index(cpu, tss), +- &per_cpu(cpu_tss, cpu), ++ &per_cpu(cpu_tss_rw, cpu), + sizeof(struct tss_struct) / PAGE_SIZE, +- PAGE_KERNEL); ++ tss_prot); + + #ifdef CONFIG_X86_32 + per_cpu(cpu_entry_area, cpu) = get_cpu_entry_area(cpu); +@@ -1305,7 +1314,7 @@ void enable_sep_cpu(void) + return; + + cpu = get_cpu(); +- tss = &per_cpu(cpu_tss, cpu); ++ tss = &per_cpu(cpu_tss_rw, cpu); + + /* + * We cache MSR_IA32_SYSENTER_CS's value in the TSS's ss1 field -- +@@ -1575,7 +1584,7 @@ void cpu_init(void) + if (cpu) + load_ucode_ap(); + +- t = &per_cpu(cpu_tss, cpu); ++ t = &per_cpu(cpu_tss_rw, cpu); + oist = &per_cpu(orig_ist, cpu); + + #ifdef 
CONFIG_NUMA +@@ -1667,7 +1676,7 @@ void cpu_init(void) + { + int cpu = smp_processor_id(); + struct task_struct *curr = current; +- struct tss_struct *t = &per_cpu(cpu_tss, cpu); ++ struct tss_struct *t = &per_cpu(cpu_tss_rw, cpu); + + wait_for_master_cpu(cpu); + +diff --git a/arch/x86/kernel/ioport.c b/arch/x86/kernel/ioport.c +index 3feb648781c4..2f723301eb58 100644 +--- a/arch/x86/kernel/ioport.c ++++ b/arch/x86/kernel/ioport.c +@@ -67,7 +67,7 @@ asmlinkage long sys_ioperm(unsigned long from, unsigned long num, int turn_on) + * because the ->io_bitmap_max value must match the bitmap + * contents: + */ +- tss = &per_cpu(cpu_tss, get_cpu()); ++ tss = &per_cpu(cpu_tss_rw, get_cpu()); + + if (turn_on) + bitmap_clear(t->io_bitmap_ptr, from, num); +diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c +index 6a04287f222b..517415978409 100644 +--- a/arch/x86/kernel/process.c ++++ b/arch/x86/kernel/process.c +@@ -47,7 +47,7 @@ + * section. Since TSS's are completely CPU-local, we want them + * on exact cacheline boundaries, to eliminate cacheline ping-pong. + */ +-__visible DEFINE_PER_CPU_SHARED_ALIGNED(struct tss_struct, cpu_tss) = { ++__visible DEFINE_PER_CPU_SHARED_ALIGNED(struct tss_struct, cpu_tss_rw) = { + .x86_tss = { + /* + * .sp0 is only used when entering ring 0 from a lower +@@ -82,7 +82,7 @@ __visible DEFINE_PER_CPU_SHARED_ALIGNED(struct tss_struct, cpu_tss) = { + .io_bitmap = { [0 ... 
IO_BITMAP_LONGS] = ~0 }, + #endif + }; +-EXPORT_PER_CPU_SYMBOL(cpu_tss); ++EXPORT_PER_CPU_SYMBOL(cpu_tss_rw); + + DEFINE_PER_CPU(bool, __tss_limit_invalid); + EXPORT_PER_CPU_SYMBOL_GPL(__tss_limit_invalid); +@@ -111,7 +111,7 @@ void exit_thread(struct task_struct *tsk) + struct fpu *fpu = &t->fpu; + + if (bp) { +- struct tss_struct *tss = &per_cpu(cpu_tss, get_cpu()); ++ struct tss_struct *tss = &per_cpu(cpu_tss_rw, get_cpu()); + + t->io_bitmap_ptr = NULL; + clear_thread_flag(TIF_IO_BITMAP); +diff --git a/arch/x86/kernel/process_32.c b/arch/x86/kernel/process_32.c +index 45bf0c5f93e1..5224c6099184 100644 +--- a/arch/x86/kernel/process_32.c ++++ b/arch/x86/kernel/process_32.c +@@ -234,7 +234,7 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p) + struct fpu *prev_fpu = &prev->fpu; + struct fpu *next_fpu = &next->fpu; + int cpu = smp_processor_id(); +- struct tss_struct *tss = &per_cpu(cpu_tss, cpu); ++ struct tss_struct *tss = &per_cpu(cpu_tss_rw, cpu); + + /* never put a printk in __switch_to... 
printk() calls wake_up*() indirectly */ + +diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c +index 157f81816915..c75466232016 100644 +--- a/arch/x86/kernel/process_64.c ++++ b/arch/x86/kernel/process_64.c +@@ -399,7 +399,7 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p) + struct fpu *prev_fpu = &prev->fpu; + struct fpu *next_fpu = &next->fpu; + int cpu = smp_processor_id(); +- struct tss_struct *tss = &per_cpu(cpu_tss, cpu); ++ struct tss_struct *tss = &per_cpu(cpu_tss_rw, cpu); + + WARN_ON_ONCE(IS_ENABLED(CONFIG_DEBUG_ENTRY) && + this_cpu_read(irq_count) != -1); +diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c +index 5ade4f89a6d1..74136fd16f49 100644 +--- a/arch/x86/kernel/traps.c ++++ b/arch/x86/kernel/traps.c +@@ -364,7 +364,7 @@ dotraplinkage void do_double_fault(struct pt_regs *regs, long error_code) + regs->cs == __KERNEL_CS && + regs->ip == (unsigned long)native_irq_return_iret) + { +- struct pt_regs *gpregs = (struct pt_regs *)this_cpu_read(cpu_tss.x86_tss.sp0) - 1; ++ struct pt_regs *gpregs = (struct pt_regs *)this_cpu_read(cpu_tss_rw.x86_tss.sp0) - 1; + + /* + * regs->sp points to the failing IRET frame on the +@@ -649,7 +649,7 @@ struct bad_iret_stack *fixup_bad_iret(struct bad_iret_stack *s) + * exception came from the IRET target. + */ + struct bad_iret_stack *new_stack = +- (struct bad_iret_stack *)this_cpu_read(cpu_tss.x86_tss.sp0) - 1; ++ (struct bad_iret_stack *)this_cpu_read(cpu_tss_rw.x86_tss.sp0) - 1; + + /* Copy the IRET target to the new stack. 
*/ + memmove(&new_stack->regs.ip, (void *)s->regs.sp, 5*8); +diff --git a/arch/x86/lib/delay.c b/arch/x86/lib/delay.c +index 553f8fd23cc4..4846eff7e4c8 100644 +--- a/arch/x86/lib/delay.c ++++ b/arch/x86/lib/delay.c +@@ -107,10 +107,10 @@ static void delay_mwaitx(unsigned long __loops) + delay = min_t(u64, MWAITX_MAX_LOOPS, loops); + + /* +- * Use cpu_tss as a cacheline-aligned, seldomly ++ * Use cpu_tss_rw as a cacheline-aligned, seldomly + * accessed per-cpu variable as the monitor target. + */ +- __monitorx(raw_cpu_ptr(&cpu_tss), 0, 0); ++ __monitorx(raw_cpu_ptr(&cpu_tss_rw), 0, 0); + + /* + * AMD, like Intel, supports the EAX hint and EAX=0xf +diff --git a/arch/x86/xen/enlighten_pv.c b/arch/x86/xen/enlighten_pv.c +index fbd054d6ac97..ae3a071e1d0f 100644 +--- a/arch/x86/xen/enlighten_pv.c ++++ b/arch/x86/xen/enlighten_pv.c +@@ -818,7 +818,7 @@ static void xen_load_sp0(unsigned long sp0) + mcs = xen_mc_entry(0); + MULTI_stack_switch(mcs.mc, __KERNEL_DS, sp0); + xen_mc_issue(PARAVIRT_LAZY_CPU); +- this_cpu_write(cpu_tss.x86_tss.sp0, sp0); ++ this_cpu_write(cpu_tss_rw.x86_tss.sp0, sp0); + } + + void xen_set_iopl_mask(unsigned mask) +-- +2.15.0 + diff --git a/queue/x86-entry-64-Merge-the-fast-and-slow-SYSRET-paths.patch b/queue/x86-entry-64-Merge-the-fast-and-slow-SYSRET-paths.patch new file mode 100644 index 0000000..2e56571 --- /dev/null +++ b/queue/x86-entry-64-Merge-the-fast-and-slow-SYSRET-paths.patch @@ -0,0 +1,49 @@ +From a512210643da8082cb44181dba8b18e752bd68f0 Mon Sep 17 00:00:00 2001 +From: Andy Lutomirski <luto@kernel.org> +Date: Thu, 2 Nov 2017 00:59:04 -0700 +Subject: [PATCH] x86/entry/64: Merge the fast and slow SYSRET paths + +commit a512210643da8082cb44181dba8b18e752bd68f0 upstream. + +They did almost the same thing. Remove a bunch of pointless +instructions (mostly hidden in macros) and reduce cognitive load by +merging them. 
+ +Signed-off-by: Andy Lutomirski <luto@kernel.org> +Cc: Borislav Petkov <bpetkov@suse.de> +Cc: Brian Gerst <brgerst@gmail.com> +Cc: Dave Hansen <dave.hansen@intel.com> +Cc: Linus Torvalds <torvalds@linux-foundation.org> +Cc: Peter Zijlstra <peterz@infradead.org> +Cc: Thomas Gleixner <tglx@linutronix.de> +Link: http://lkml.kernel.org/r/1204e20233fcab9130a1ba80b3b1879b5db3fc1f.1509609304.git.luto@kernel.org +Signed-off-by: Ingo Molnar <mingo@kernel.org> + +diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S +index 4f9b4463b3fc..b5a0ea63d391 100644 +--- a/arch/x86/entry/entry_64.S ++++ b/arch/x86/entry/entry_64.S +@@ -220,10 +220,9 @@ entry_SYSCALL_64_fastpath: + TRACE_IRQS_ON /* user mode is traced as IRQs on */ + movq RIP(%rsp), %rcx + movq EFLAGS(%rsp), %r11 +- RESTORE_C_REGS_EXCEPT_RCX_R11 +- movq RSP(%rsp), %rsp ++ addq $6*8, %rsp /* skip extra regs -- they were preserved */ + UNWIND_HINT_EMPTY +- USERGS_SYSRET64 ++ jmp .Lpop_c_regs_except_rcx_r11_and_sysret + + 1: + /* +@@ -317,6 +316,7 @@ syscall_return_via_sysret: + /* rcx and r11 are already restored (see code above) */ + UNWIND_HINT_EMPTY + POP_EXTRA_REGS ++.Lpop_c_regs_except_rcx_r11_and_sysret: + popq %rsi /* skip r11 */ + popq %r10 + popq %r9 +-- +2.15.0 + diff --git a/queue/x86-entry-64-Move-SWAPGS-into-the-common-IRET-to-use.patch b/queue/x86-entry-64-Move-SWAPGS-into-the-common-IRET-to-use.patch new file mode 100644 index 0000000..5b83301 --- /dev/null +++ b/queue/x86-entry-64-Move-SWAPGS-into-the-common-IRET-to-use.patch @@ -0,0 +1,144 @@ +From 8a055d7f411d41755ce30db5bb65b154777c4b78 Mon Sep 17 00:00:00 2001 +From: Andy Lutomirski <luto@kernel.org> +Date: Thu, 2 Nov 2017 00:59:00 -0700 +Subject: [PATCH] x86/entry/64: Move SWAPGS into the common IRET-to-usermode + path + +commit 8a055d7f411d41755ce30db5bb65b154777c4b78 upstream. + +All of the code paths that ended up doing IRET to usermode did +SWAPGS immediately beforehand. Move the SWAPGS into the common +code. 
+ +Signed-off-by: Andy Lutomirski <luto@kernel.org> +Cc: Borislav Petkov <bpetkov@suse.de> +Cc: Brian Gerst <brgerst@gmail.com> +Cc: Dave Hansen <dave.hansen@intel.com> +Cc: Linus Torvalds <torvalds@linux-foundation.org> +Cc: Peter Zijlstra <peterz@infradead.org> +Cc: Thomas Gleixner <tglx@linutronix.de> +Link: http://lkml.kernel.org/r/27fd6f45b7cd640de38fb9066fd0349bcd11f8e1.1509609304.git.luto@kernel.org +Signed-off-by: Ingo Molnar <mingo@kernel.org> + +diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S +index 3eeb1694210c..d6ffdc9afcbb 100644 +--- a/arch/x86/entry/entry_64.S ++++ b/arch/x86/entry/entry_64.S +@@ -249,12 +249,14 @@ return_from_SYSCALL_64: + + /* + * Try to use SYSRET instead of IRET if we're returning to +- * a completely clean 64-bit userspace context. ++ * a completely clean 64-bit userspace context. If we're not, ++ * go to the slow exit path. + */ + movq RCX(%rsp), %rcx + movq RIP(%rsp), %r11 +- cmpq %rcx, %r11 /* RCX == RIP */ +- jne opportunistic_sysret_failed ++ ++ cmpq %rcx, %r11 /* SYSRET requires RCX == RIP */ ++ jne swapgs_restore_regs_and_return_to_usermode + + /* + * On Intel CPUs, SYSRET with non-canonical RCX/RIP will #GP +@@ -272,14 +274,14 @@ return_from_SYSCALL_64: + + /* If this changed %rcx, it was not canonical */ + cmpq %rcx, %r11 +- jne opportunistic_sysret_failed ++ jne swapgs_restore_regs_and_return_to_usermode + + cmpq $__USER_CS, CS(%rsp) /* CS must match SYSRET */ +- jne opportunistic_sysret_failed ++ jne swapgs_restore_regs_and_return_to_usermode + + movq R11(%rsp), %r11 + cmpq %r11, EFLAGS(%rsp) /* R11 == RFLAGS */ +- jne opportunistic_sysret_failed ++ jne swapgs_restore_regs_and_return_to_usermode + + /* + * SYSCALL clears RF when it saves RFLAGS in R11 and SYSRET cannot +@@ -300,12 +302,12 @@ return_from_SYSCALL_64: + * would never get past 'stuck_here'. 
+ */ + testq $(X86_EFLAGS_RF|X86_EFLAGS_TF), %r11 +- jnz opportunistic_sysret_failed ++ jnz swapgs_restore_regs_and_return_to_usermode + + /* nothing to check for RSP */ + + cmpq $__USER_DS, SS(%rsp) /* SS must match SYSRET */ +- jne opportunistic_sysret_failed ++ jne swapgs_restore_regs_and_return_to_usermode + + /* + * We win! This label is here just for ease of understanding +@@ -318,10 +320,6 @@ syscall_return_via_sysret: + movq RSP(%rsp), %rsp + UNWIND_HINT_EMPTY + USERGS_SYSRET64 +- +-opportunistic_sysret_failed: +- SWAPGS +- jmp restore_regs_and_return_to_usermode + END(entry_SYSCALL_64) + + ENTRY(stub_ptregs_64) +@@ -422,8 +420,7 @@ ENTRY(ret_from_fork) + movq %rsp, %rdi + call syscall_return_slowpath /* returns with IRQs disabled */ + TRACE_IRQS_ON /* user mode is traced as IRQS on */ +- SWAPGS +- jmp restore_regs_and_return_to_usermode ++ jmp swapgs_restore_regs_and_return_to_usermode + + 1: + /* kernel thread */ +@@ -611,9 +608,8 @@ GLOBAL(retint_user) + mov %rsp,%rdi + call prepare_exit_to_usermode + TRACE_IRQS_IRETQ +- SWAPGS + +-GLOBAL(restore_regs_and_return_to_usermode) ++GLOBAL(swapgs_restore_regs_and_return_to_usermode) + #ifdef CONFIG_DEBUG_ENTRY + /* Assert that pt_regs indicates user mode. */ + testl $3, CS(%rsp) +@@ -621,6 +617,7 @@ GLOBAL(restore_regs_and_return_to_usermode) + ud2 + 1: + #endif ++ SWAPGS + RESTORE_EXTRA_REGS + RESTORE_C_REGS + REMOVE_PT_GPREGS_FROM_STACK 8 +@@ -1342,8 +1339,7 @@ ENTRY(nmi) + * Return back to user mode. We must *not* do the normal exit + * work, because we don't want to enable interrupts. + */ +- SWAPGS +- jmp restore_regs_and_return_to_usermode ++ jmp swapgs_restore_regs_and_return_to_usermode + + .Lnmi_from_kernel: + /* +diff --git a/arch/x86/entry/entry_64_compat.S b/arch/x86/entry/entry_64_compat.S +index 9ca014a99968..932b96ce1b06 100644 +--- a/arch/x86/entry/entry_64_compat.S ++++ b/arch/x86/entry/entry_64_compat.S +@@ -336,8 +336,7 @@ ENTRY(entry_INT80_compat) + + /* Go back to user mode. 
*/ + TRACE_IRQS_ON +- SWAPGS +- jmp restore_regs_and_return_to_usermode ++ jmp swapgs_restore_regs_and_return_to_usermode + END(entry_INT80_compat) + + ENTRY(stub32_clone) +-- +2.15.0 + diff --git a/queue/x86-entry-64-Move-the-IST-stacks-into-struct-cpu_ent.patch b/queue/x86-entry-64-Move-the-IST-stacks-into-struct-cpu_ent.patch new file mode 100644 index 0000000..fa740b4 --- /dev/null +++ b/queue/x86-entry-64-Move-the-IST-stacks-into-struct-cpu_ent.patch @@ -0,0 +1,221 @@ +From 40e7f949e0d9a33968ebde5d67f7e3a47c97742a Mon Sep 17 00:00:00 2001 +From: Andy Lutomirski <luto@kernel.org> +Date: Mon, 4 Dec 2017 15:07:26 +0100 +Subject: [PATCH] x86/entry/64: Move the IST stacks into struct cpu_entry_area + +commit 40e7f949e0d9a33968ebde5d67f7e3a47c97742a upstream. + +The IST stacks are needed when an IST exception occurs and are accessed +before any kernel code at all runs. Move them into struct cpu_entry_area. + +The IST stacks are unlike the rest of cpu_entry_area: they're used even for +entries from kernel mode. This means that they should be set up before we +load the final IDT. Move cpu_entry_area setup to trap_init() for the boot +CPU and set it up for all possible CPUs at once in native_smp_prepare_cpus(). + +Signed-off-by: Andy Lutomirski <luto@kernel.org> +Signed-off-by: Thomas Gleixner <tglx@linutronix.de> +Reviewed-by: Thomas Gleixner <tglx@linutronix.de> +Reviewed-by: Borislav Petkov <bp@suse.de> +Cc: Boris Ostrovsky <boris.ostrovsky@oracle.com> +Cc: Borislav Petkov <bp@alien8.de> +Cc: Borislav Petkov <bpetkov@suse.de> +Cc: Brian Gerst <brgerst@gmail.com> +Cc: Dave Hansen <dave.hansen@intel.com> +Cc: Dave Hansen <dave.hansen@linux.intel.com> +Cc: David Laight <David.Laight@aculab.com> +Cc: Denys Vlasenko <dvlasenk@redhat.com> +Cc: Eduardo Valentin <eduval@amazon.com> +Cc: Greg KH <gregkh@linuxfoundation.org> +Cc: H. 
Peter Anvin <hpa@zytor.com> +Cc: Josh Poimboeuf <jpoimboe@redhat.com> +Cc: Juergen Gross <jgross@suse.com> +Cc: Linus Torvalds <torvalds@linux-foundation.org> +Cc: Peter Zijlstra <peterz@infradead.org> +Cc: Rik van Riel <riel@redhat.com> +Cc: Will Deacon <will.deacon@arm.com> +Cc: aliguori@amazon.com +Cc: daniel.gruss@iaik.tugraz.at +Cc: hughd@google.com +Cc: keescook@google.com +Link: https://lkml.kernel.org/r/20171204150606.480598743@linutronix.de +Signed-off-by: Ingo Molnar <mingo@kernel.org> + +diff --git a/arch/x86/include/asm/fixmap.h b/arch/x86/include/asm/fixmap.h +index 6a699474c2c7..451da7d9a502 100644 +--- a/arch/x86/include/asm/fixmap.h ++++ b/arch/x86/include/asm/fixmap.h +@@ -63,10 +63,22 @@ struct cpu_entry_area { + struct tss_struct tss; + + char entry_trampoline[PAGE_SIZE]; ++ ++#ifdef CONFIG_X86_64 ++ /* ++ * Exception stacks used for IST entries. ++ * ++ * In the future, this should have a separate slot for each stack ++ * with guard pages between them. ++ */ ++ char exception_stacks[(N_EXCEPTION_STACKS - 1) * EXCEPTION_STKSZ + DEBUG_STKSZ]; ++#endif + }; + + #define CPU_ENTRY_AREA_PAGES (sizeof(struct cpu_entry_area) / PAGE_SIZE) + ++extern void setup_cpu_entry_areas(void); ++ + /* + * Here we define all the compile-time 'special' virtual + * addresses. The point is to have a constant address at +diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c +index 430f950b0b7f..fb01a8e5e9b7 100644 +--- a/arch/x86/kernel/cpu/common.c ++++ b/arch/x86/kernel/cpu/common.c +@@ -466,24 +466,36 @@ void load_percpu_segment(int cpu) + load_stack_canary_segment(); + } + +-static void set_percpu_fixmap_pages(int fixmap_index, void *ptr, +- int pages, pgprot_t prot) +-{ +- int i; +- +- for (i = 0; i < pages; i++) { +- __set_fixmap(fixmap_index - i, +- per_cpu_ptr_to_phys(ptr + i * PAGE_SIZE), prot); +- } +-} +- + #ifdef CONFIG_X86_32 + /* The 32-bit entry code needs to find cpu_entry_area. 
*/ + DEFINE_PER_CPU(struct cpu_entry_area *, cpu_entry_area); + #endif + ++#ifdef CONFIG_X86_64 ++/* ++ * Special IST stacks which the CPU switches to when it calls ++ * an IST-marked descriptor entry. Up to 7 stacks (hardware ++ * limit), all of them are 4K, except the debug stack which ++ * is 8K. ++ */ ++static const unsigned int exception_stack_sizes[N_EXCEPTION_STACKS] = { ++ [0 ... N_EXCEPTION_STACKS - 1] = EXCEPTION_STKSZ, ++ [DEBUG_STACK - 1] = DEBUG_STKSZ ++}; ++ ++static DEFINE_PER_CPU_PAGE_ALIGNED(char, exception_stacks ++ [(N_EXCEPTION_STACKS - 1) * EXCEPTION_STKSZ + DEBUG_STKSZ]); ++#endif ++ ++static void __init ++set_percpu_fixmap_pages(int idx, void *ptr, int pages, pgprot_t prot) ++{ ++ for ( ; pages; pages--, idx--, ptr += PAGE_SIZE) ++ __set_fixmap(idx, per_cpu_ptr_to_phys(ptr), prot); ++} ++ + /* Setup the fixmap mappings only once per-processor */ +-static inline void setup_cpu_entry_area(int cpu) ++static void __init setup_cpu_entry_area(int cpu) + { + #ifdef CONFIG_X86_64 + extern char _entry_trampoline[]; +@@ -532,15 +544,31 @@ static inline void setup_cpu_entry_area(int cpu) + PAGE_KERNEL); + + #ifdef CONFIG_X86_32 +- this_cpu_write(cpu_entry_area, get_cpu_entry_area(cpu)); ++ per_cpu(cpu_entry_area, cpu) = get_cpu_entry_area(cpu); + #endif + + #ifdef CONFIG_X86_64 ++ BUILD_BUG_ON(sizeof(exception_stacks) % PAGE_SIZE != 0); ++ BUILD_BUG_ON(sizeof(exception_stacks) != ++ sizeof(((struct cpu_entry_area *)0)->exception_stacks)); ++ set_percpu_fixmap_pages(get_cpu_entry_area_index(cpu, exception_stacks), ++ &per_cpu(exception_stacks, cpu), ++ sizeof(exception_stacks) / PAGE_SIZE, ++ PAGE_KERNEL); ++ + __set_fixmap(get_cpu_entry_area_index(cpu, entry_trampoline), + __pa_symbol(_entry_trampoline), PAGE_KERNEL_RX); + #endif + } + ++void __init setup_cpu_entry_areas(void) ++{ ++ unsigned int cpu; ++ ++ for_each_possible_cpu(cpu) ++ setup_cpu_entry_area(cpu); ++} ++ + /* Load the original GDT from the per-cpu structure */ + void load_direct_gdt(int 
cpu) + { +@@ -1385,20 +1413,6 @@ DEFINE_PER_CPU(unsigned int, irq_count) __visible = -1; + DEFINE_PER_CPU(int, __preempt_count) = INIT_PREEMPT_COUNT; + EXPORT_PER_CPU_SYMBOL(__preempt_count); + +-/* +- * Special IST stacks which the CPU switches to when it calls +- * an IST-marked descriptor entry. Up to 7 stacks (hardware +- * limit), all of them are 4K, except the debug stack which +- * is 8K. +- */ +-static const unsigned int exception_stack_sizes[N_EXCEPTION_STACKS] = { +- [0 ... N_EXCEPTION_STACKS - 1] = EXCEPTION_STKSZ, +- [DEBUG_STACK - 1] = DEBUG_STKSZ +-}; +- +-static DEFINE_PER_CPU_PAGE_ALIGNED(char, exception_stacks +- [(N_EXCEPTION_STACKS - 1) * EXCEPTION_STKSZ + DEBUG_STKSZ]); +- + /* May not be marked __init: used by software suspend */ + void syscall_init(void) + { +@@ -1607,7 +1621,7 @@ void cpu_init(void) + * set up and load the per-CPU TSS + */ + if (!oist->ist[0]) { +- char *estacks = per_cpu(exception_stacks, cpu); ++ char *estacks = get_cpu_entry_area(cpu)->exception_stacks; + + for (v = 0; v < N_EXCEPTION_STACKS; v++) { + estacks += exception_stack_sizes[v]; +@@ -1633,8 +1647,6 @@ void cpu_init(void) + initialize_tlbstate_and_flush(); + enter_lazy_tlb(&init_mm, me); + +- setup_cpu_entry_area(cpu); +- + /* + * Initialize the TSS. sp0 points to the entry trampoline stack + * regardless of what task is running. +@@ -1694,8 +1706,6 @@ void cpu_init(void) + initialize_tlbstate_and_flush(); + enter_lazy_tlb(&init_mm, curr); + +- setup_cpu_entry_area(cpu); +- + /* + * Initialize the TSS. Don't bother initializing sp0, as the initial + * task never enters user mode. 
+diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c +index ee9ca0ad4388..3e29aad5c7cc 100644 +--- a/arch/x86/kernel/traps.c ++++ b/arch/x86/kernel/traps.c +@@ -947,6 +947,9 @@ dotraplinkage void do_iret_error(struct pt_regs *regs, long error_code) + + void __init trap_init(void) + { ++ /* Init cpu_entry_area before IST entries are set up */ ++ setup_cpu_entry_areas(); ++ + idt_setup_traps(); + + /* +-- +2.15.0 + diff --git a/queue/x86-entry-64-Pass-SP0-directly-to-load_sp0.patch b/queue/x86-entry-64-Pass-SP0-directly-to-load_sp0.patch new file mode 100644 index 0000000..c94c9d6 --- /dev/null +++ b/queue/x86-entry-64-Pass-SP0-directly-to-load_sp0.patch @@ -0,0 +1,220 @@ +From da51da189a24bb9b7e2d5a123be096e51a4695a5 Mon Sep 17 00:00:00 2001 +From: Andy Lutomirski <luto@kernel.org> +Date: Thu, 2 Nov 2017 00:59:10 -0700 +Subject: [PATCH] x86/entry/64: Pass SP0 directly to load_sp0() + +commit da51da189a24bb9b7e2d5a123be096e51a4695a5 upstream. + +load_sp0() had an odd signature: + + void load_sp0(struct tss_struct *tss, struct thread_struct *thread); + +Simplify it to: + + void load_sp0(unsigned long sp0); + +Also simplify a few get_cpu()/put_cpu() sequences to +preempt_disable()/preempt_enable(). 
+ +Signed-off-by: Andy Lutomirski <luto@kernel.org> +Reviewed-by: Borislav Petkov <bp@suse.de> +Cc: Borislav Petkov <bpetkov@suse.de> +Cc: Brian Gerst <brgerst@gmail.com> +Cc: Dave Hansen <dave.hansen@intel.com> +Cc: Linus Torvalds <torvalds@linux-foundation.org> +Cc: Peter Zijlstra <peterz@infradead.org> +Cc: Thomas Gleixner <tglx@linutronix.de> +Link: http://lkml.kernel.org/r/2655d8b42ed940aa384fe18ee1129bbbcf730a08.1509609304.git.luto@kernel.org +Signed-off-by: Ingo Molnar <mingo@kernel.org> + +diff --git a/arch/x86/include/asm/paravirt.h b/arch/x86/include/asm/paravirt.h +index 12deec722cf0..43d4f90edebc 100644 +--- a/arch/x86/include/asm/paravirt.h ++++ b/arch/x86/include/asm/paravirt.h +@@ -15,10 +15,9 @@ + #include <linux/cpumask.h> + #include <asm/frame.h> + +-static inline void load_sp0(struct tss_struct *tss, +- struct thread_struct *thread) ++static inline void load_sp0(unsigned long sp0) + { +- PVOP_VCALL2(pv_cpu_ops.load_sp0, tss, thread); ++ PVOP_VCALL1(pv_cpu_ops.load_sp0, sp0); + } + + /* The paravirtualized CPUID instruction. 
*/ +diff --git a/arch/x86/include/asm/paravirt_types.h b/arch/x86/include/asm/paravirt_types.h +index 280d94c36dad..a916788ac478 100644 +--- a/arch/x86/include/asm/paravirt_types.h ++++ b/arch/x86/include/asm/paravirt_types.h +@@ -133,7 +133,7 @@ struct pv_cpu_ops { + void (*alloc_ldt)(struct desc_struct *ldt, unsigned entries); + void (*free_ldt)(struct desc_struct *ldt, unsigned entries); + +- void (*load_sp0)(struct tss_struct *tss, struct thread_struct *t); ++ void (*load_sp0)(unsigned long sp0); + + void (*set_iopl_mask)(unsigned mask); + +diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h +index 0167e3e35a57..064b84722166 100644 +--- a/arch/x86/include/asm/processor.h ++++ b/arch/x86/include/asm/processor.h +@@ -517,9 +517,9 @@ static inline void native_set_iopl_mask(unsigned mask) + } + + static inline void +-native_load_sp0(struct tss_struct *tss, struct thread_struct *thread) ++native_load_sp0(unsigned long sp0) + { +- tss->x86_tss.sp0 = thread->sp0; ++ this_cpu_write(cpu_tss.x86_tss.sp0, sp0); + } + + static inline void native_swapgs(void) +@@ -544,10 +544,9 @@ static inline unsigned long current_top_of_stack(void) + #else + #define __cpuid native_cpuid + +-static inline void load_sp0(struct tss_struct *tss, +- struct thread_struct *thread) ++static inline void load_sp0(unsigned long sp0) + { +- native_load_sp0(tss, thread); ++ native_load_sp0(sp0); + } + + #define set_iopl_mask native_set_iopl_mask +diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c +index 03bb004bb15e..4e7fb9c3bfa5 100644 +--- a/arch/x86/kernel/cpu/common.c ++++ b/arch/x86/kernel/cpu/common.c +@@ -1570,7 +1570,7 @@ void cpu_init(void) + initialize_tlbstate_and_flush(); + enter_lazy_tlb(&init_mm, me); + +- load_sp0(t, &current->thread); ++ load_sp0(current->thread.sp0); + set_tss_desc(cpu, t); + load_TR_desc(); + load_mm_ldt(&init_mm); +@@ -1625,7 +1625,7 @@ void cpu_init(void) + initialize_tlbstate_and_flush(); + enter_lazy_tlb(&init_mm, 
curr); + +- load_sp0(t, thread); ++ load_sp0(thread->sp0); + set_tss_desc(cpu, t); + load_TR_desc(); + load_mm_ldt(&init_mm); +diff --git a/arch/x86/kernel/process_32.c b/arch/x86/kernel/process_32.c +index 0936ed3da6b6..40b85870e429 100644 +--- a/arch/x86/kernel/process_32.c ++++ b/arch/x86/kernel/process_32.c +@@ -287,7 +287,7 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p) + * current_thread_info(). Refresh the SYSENTER configuration in + * case prev or next is vm86. + */ +- load_sp0(tss, next); ++ load_sp0(next->sp0); + refresh_sysenter_cs(next); + this_cpu_write(cpu_current_top_of_stack, + (unsigned long)task_stack_page(next_p) + +diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c +index a6ff6d1a0110..2124304fb77a 100644 +--- a/arch/x86/kernel/process_64.c ++++ b/arch/x86/kernel/process_64.c +@@ -465,7 +465,7 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p) + this_cpu_write(current_task, next_p); + + /* Reload sp0. */ +- load_sp0(tss, next); ++ load_sp0(next->sp0); + + /* + * Now maybe reload the debug registers and handle I/O bitmaps +diff --git a/arch/x86/kernel/vm86_32.c b/arch/x86/kernel/vm86_32.c +index 5bc1c3ab6287..0f1d92cd20ad 100644 +--- a/arch/x86/kernel/vm86_32.c ++++ b/arch/x86/kernel/vm86_32.c +@@ -94,7 +94,6 @@ + + void save_v86_state(struct kernel_vm86_regs *regs, int retval) + { +- struct tss_struct *tss; + struct task_struct *tsk = current; + struct vm86plus_struct __user *user; + struct vm86 *vm86 = current->thread.vm86; +@@ -146,13 +145,13 @@ void save_v86_state(struct kernel_vm86_regs *regs, int retval) + do_exit(SIGSEGV); + } + +- tss = &per_cpu(cpu_tss, get_cpu()); ++ preempt_disable(); + tsk->thread.sp0 = vm86->saved_sp0; + tsk->thread.sysenter_cs = __KERNEL_CS; +- load_sp0(tss, &tsk->thread); ++ load_sp0(tsk->thread.sp0); + refresh_sysenter_cs(&tsk->thread); + vm86->saved_sp0 = 0; +- put_cpu(); ++ preempt_enable(); + + memcpy(&regs->pt, &vm86->regs32, sizeof(struct 
pt_regs)); + +@@ -238,7 +237,6 @@ SYSCALL_DEFINE2(vm86, unsigned long, cmd, unsigned long, arg) + + static long do_sys_vm86(struct vm86plus_struct __user *user_vm86, bool plus) + { +- struct tss_struct *tss; + struct task_struct *tsk = current; + struct vm86 *vm86 = tsk->thread.vm86; + struct kernel_vm86_regs vm86regs; +@@ -366,8 +364,8 @@ static long do_sys_vm86(struct vm86plus_struct __user *user_vm86, bool plus) + vm86->saved_sp0 = tsk->thread.sp0; + lazy_save_gs(vm86->regs32.gs); + +- tss = &per_cpu(cpu_tss, get_cpu()); + /* make room for real-mode segments */ ++ preempt_disable(); + tsk->thread.sp0 += 16; + + if (static_cpu_has(X86_FEATURE_SEP)) { +@@ -375,8 +373,8 @@ static long do_sys_vm86(struct vm86plus_struct __user *user_vm86, bool plus) + refresh_sysenter_cs(&tsk->thread); + } + +- load_sp0(tss, &tsk->thread); +- put_cpu(); ++ load_sp0(tsk->thread.sp0); ++ preempt_enable(); + + if (vm86->flags & VM86_SCREEN_BITMAP) + mark_screen_rdonly(tsk->mm); +diff --git a/arch/x86/xen/enlighten_pv.c b/arch/x86/xen/enlighten_pv.c +index 8da4eff19c2a..e7b213047724 100644 +--- a/arch/x86/xen/enlighten_pv.c ++++ b/arch/x86/xen/enlighten_pv.c +@@ -810,15 +810,14 @@ static void __init xen_write_gdt_entry_boot(struct desc_struct *dt, int entry, + } + } + +-static void xen_load_sp0(struct tss_struct *tss, +- struct thread_struct *thread) ++static void xen_load_sp0(unsigned long sp0) + { + struct multicall_space mcs; + + mcs = xen_mc_entry(0); +- MULTI_stack_switch(mcs.mc, __KERNEL_DS, thread->sp0); ++ MULTI_stack_switch(mcs.mc, __KERNEL_DS, sp0); + xen_mc_issue(PARAVIRT_LAZY_CPU); +- tss->x86_tss.sp0 = thread->sp0; ++ this_cpu_write(cpu_tss.x86_tss.sp0, sp0); + } + + void xen_set_iopl_mask(unsigned mask) +-- +2.15.0 + diff --git a/queue/x86-entry-64-Remove-all-remaining-direct-thread_stru.patch b/queue/x86-entry-64-Remove-all-remaining-direct-thread_stru.patch new file mode 100644 index 0000000..14fe880 --- /dev/null +++ 
b/queue/x86-entry-64-Remove-all-remaining-direct-thread_stru.patch @@ -0,0 +1,89 @@ +From 46f5a10a721ce8dce8cc8fe55279b49e1c6b3288 Mon Sep 17 00:00:00 2001 +From: Andy Lutomirski <luto@kernel.org> +Date: Thu, 2 Nov 2017 00:59:14 -0700 +Subject: [PATCH] x86/entry/64: Remove all remaining direct thread_struct::sp0 + reads + +commit 46f5a10a721ce8dce8cc8fe55279b49e1c6b3288 upstream. + +The only remaining readers are in context switch code or vm86(), and +they all just want to update TSS.sp0 to match the current task. +Replace them all with a new helper update_sp0(). + +Signed-off-by: Andy Lutomirski <luto@kernel.org> +Reviewed-by: Borislav Petkov <bp@suse.de> +Cc: Borislav Petkov <bpetkov@suse.de> +Cc: Brian Gerst <brgerst@gmail.com> +Cc: Dave Hansen <dave.hansen@intel.com> +Cc: Linus Torvalds <torvalds@linux-foundation.org> +Cc: Peter Zijlstra <peterz@infradead.org> +Cc: Thomas Gleixner <tglx@linutronix.de> +Link: http://lkml.kernel.org/r/2d231687f4ff288c9d9e98d7861b7df374246ac3.1509609304.git.luto@kernel.org +Signed-off-by: Ingo Molnar <mingo@kernel.org> + +diff --git a/arch/x86/include/asm/switch_to.h b/arch/x86/include/asm/switch_to.h +index 7ae8caffbada..54e64d909725 100644 +--- a/arch/x86/include/asm/switch_to.h ++++ b/arch/x86/include/asm/switch_to.h +@@ -84,4 +84,10 @@ static inline void refresh_sysenter_cs(struct thread_struct *thread) + } + #endif + ++/* This is used when switching tasks or entering/exiting vm86 mode. */ ++static inline void update_sp0(struct task_struct *task) ++{ ++ load_sp0(task->thread.sp0); ++} ++ + #endif /* _ASM_X86_SWITCH_TO_H */ +diff --git a/arch/x86/kernel/process_32.c b/arch/x86/kernel/process_32.c +index 40b85870e429..45bf0c5f93e1 100644 +--- a/arch/x86/kernel/process_32.c ++++ b/arch/x86/kernel/process_32.c +@@ -287,7 +287,7 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p) + * current_thread_info(). Refresh the SYSENTER configuration in + * case prev or next is vm86. 
+ */ +- load_sp0(next->sp0); ++ update_sp0(next_p); + refresh_sysenter_cs(next); + this_cpu_write(cpu_current_top_of_stack, + (unsigned long)task_stack_page(next_p) + +diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c +index 2124304fb77a..45e380958392 100644 +--- a/arch/x86/kernel/process_64.c ++++ b/arch/x86/kernel/process_64.c +@@ -465,7 +465,7 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p) + this_cpu_write(current_task, next_p); + + /* Reload sp0. */ +- load_sp0(next->sp0); ++ update_sp0(next_p); + + /* + * Now maybe reload the debug registers and handle I/O bitmaps +diff --git a/arch/x86/kernel/vm86_32.c b/arch/x86/kernel/vm86_32.c +index 0f1d92cd20ad..a7b44c75c642 100644 +--- a/arch/x86/kernel/vm86_32.c ++++ b/arch/x86/kernel/vm86_32.c +@@ -148,7 +148,7 @@ void save_v86_state(struct kernel_vm86_regs *regs, int retval) + preempt_disable(); + tsk->thread.sp0 = vm86->saved_sp0; + tsk->thread.sysenter_cs = __KERNEL_CS; +- load_sp0(tsk->thread.sp0); ++ update_sp0(tsk); + refresh_sysenter_cs(&tsk->thread); + vm86->saved_sp0 = 0; + preempt_enable(); +@@ -373,7 +373,7 @@ static long do_sys_vm86(struct vm86plus_struct __user *user_vm86, bool plus) + refresh_sysenter_cs(&tsk->thread); + } + +- load_sp0(tsk->thread.sp0); ++ update_sp0(tsk); + preempt_enable(); + + if (vm86->flags & VM86_SCREEN_BITMAP) +-- +2.15.0 + diff --git a/queue/x86-entry-64-Remove-the-RESTORE_._REGS-infrastructur.patch b/queue/x86-entry-64-Remove-the-RESTORE_._REGS-infrastructur.patch new file mode 100644 index 0000000..f4460e9 --- /dev/null +++ b/queue/x86-entry-64-Remove-the-RESTORE_._REGS-infrastructur.patch @@ -0,0 +1,93 @@ +From c39858de696f0cc160a544455e8403d663d577e9 Mon Sep 17 00:00:00 2001 +From: Andy Lutomirski <luto@kernel.org> +Date: Thu, 2 Nov 2017 00:59:06 -0700 +Subject: [PATCH] x86/entry/64: Remove the RESTORE_..._REGS infrastructure + +commit c39858de696f0cc160a544455e8403d663d577e9 upstream. 
+ +All users of RESTORE_EXTRA_REGS, RESTORE_C_REGS and such, and +REMOVE_PT_GPREGS_FROM_STACK are gone. Delete the macros. + +Signed-off-by: Andy Lutomirski <luto@kernel.org> +Cc: Borislav Petkov <bpetkov@suse.de> +Cc: Brian Gerst <brgerst@gmail.com> +Cc: Dave Hansen <dave.hansen@intel.com> +Cc: Linus Torvalds <torvalds@linux-foundation.org> +Cc: Peter Zijlstra <peterz@infradead.org> +Cc: Thomas Gleixner <tglx@linutronix.de> +Link: http://lkml.kernel.org/r/c32672f6e47c561893316d48e06c7656b1039a36.1509609304.git.luto@kernel.org +Signed-off-by: Ingo Molnar <mingo@kernel.org> + +diff --git a/arch/x86/entry/calling.h b/arch/x86/entry/calling.h +index 0b9dd8123701..1895a685d3dd 100644 +--- a/arch/x86/entry/calling.h ++++ b/arch/x86/entry/calling.h +@@ -141,16 +141,6 @@ For 32-bit we have the following conventions - kernel is built with + UNWIND_HINT_REGS offset=\offset + .endm + +- .macro RESTORE_EXTRA_REGS offset=0 +- movq 0*8+\offset(%rsp), %r15 +- movq 1*8+\offset(%rsp), %r14 +- movq 2*8+\offset(%rsp), %r13 +- movq 3*8+\offset(%rsp), %r12 +- movq 4*8+\offset(%rsp), %rbp +- movq 5*8+\offset(%rsp), %rbx +- UNWIND_HINT_REGS offset=\offset extra=0 +- .endm +- + .macro POP_EXTRA_REGS + popq %r15 + popq %r14 +@@ -172,48 +162,6 @@ For 32-bit we have the following conventions - kernel is built with + popq %rdi + .endm + +- .macro RESTORE_C_REGS_HELPER rstor_rax=1, rstor_rcx=1, rstor_r11=1, rstor_r8910=1, rstor_rdx=1 +- .if \rstor_r11 +- movq 6*8(%rsp), %r11 +- .endif +- .if \rstor_r8910 +- movq 7*8(%rsp), %r10 +- movq 8*8(%rsp), %r9 +- movq 9*8(%rsp), %r8 +- .endif +- .if \rstor_rax +- movq 10*8(%rsp), %rax +- .endif +- .if \rstor_rcx +- movq 11*8(%rsp), %rcx +- .endif +- .if \rstor_rdx +- movq 12*8(%rsp), %rdx +- .endif +- movq 13*8(%rsp), %rsi +- movq 14*8(%rsp), %rdi +- UNWIND_HINT_IRET_REGS offset=16*8 +- .endm +- .macro RESTORE_C_REGS +- RESTORE_C_REGS_HELPER 1,1,1,1,1 +- .endm +- .macro RESTORE_C_REGS_EXCEPT_RAX +- RESTORE_C_REGS_HELPER 0,1,1,1,1 +- .endm +- .macro 
RESTORE_C_REGS_EXCEPT_RCX +- RESTORE_C_REGS_HELPER 1,0,1,1,1 +- .endm +- .macro RESTORE_C_REGS_EXCEPT_R11 +- RESTORE_C_REGS_HELPER 1,1,0,1,1 +- .endm +- .macro RESTORE_C_REGS_EXCEPT_RCX_R11 +- RESTORE_C_REGS_HELPER 1,0,0,1,1 +- .endm +- +- .macro REMOVE_PT_GPREGS_FROM_STACK addskip=0 +- subq $-(15*8+\addskip), %rsp +- .endm +- + .macro icebp + .byte 0xf1 + .endm +-- +2.15.0 + diff --git a/queue/x86-entry-64-Remove-the-SYSENTER-stack-canary.patch b/queue/x86-entry-64-Remove-the-SYSENTER-stack-canary.patch new file mode 100644 index 0000000..62872da --- /dev/null +++ b/queue/x86-entry-64-Remove-the-SYSENTER-stack-canary.patch @@ -0,0 +1,97 @@ +From 7fbbd5cbebf118a9e09f5453f686656a167c3d1c Mon Sep 17 00:00:00 2001 +From: Andy Lutomirski <luto@kernel.org> +Date: Mon, 4 Dec 2017 15:07:27 +0100 +Subject: [PATCH] x86/entry/64: Remove the SYSENTER stack canary + +commit 7fbbd5cbebf118a9e09f5453f686656a167c3d1c upstream. + +Now that the SYSENTER stack has a guard page, there's no need for a canary +to detect overflow after the fact. + +Signed-off-by: Andy Lutomirski <luto@kernel.org> +Signed-off-by: Thomas Gleixner <tglx@linutronix.de> +Reviewed-by: Thomas Gleixner <tglx@linutronix.de> +Reviewed-by: Borislav Petkov <bp@suse.de> +Cc: Boris Ostrovsky <boris.ostrovsky@oracle.com> +Cc: Borislav Petkov <bp@alien8.de> +Cc: Borislav Petkov <bpetkov@suse.de> +Cc: Brian Gerst <brgerst@gmail.com> +Cc: Dave Hansen <dave.hansen@intel.com> +Cc: Dave Hansen <dave.hansen@linux.intel.com> +Cc: David Laight <David.Laight@aculab.com> +Cc: Denys Vlasenko <dvlasenk@redhat.com> +Cc: Eduardo Valentin <eduval@amazon.com> +Cc: Greg KH <gregkh@linuxfoundation.org> +Cc: H. 
Peter Anvin <hpa@zytor.com> +Cc: Josh Poimboeuf <jpoimboe@redhat.com> +Cc: Juergen Gross <jgross@suse.com> +Cc: Linus Torvalds <torvalds@linux-foundation.org> +Cc: Peter Zijlstra <peterz@infradead.org> +Cc: Rik van Riel <riel@redhat.com> +Cc: Will Deacon <will.deacon@arm.com> +Cc: aliguori@amazon.com +Cc: daniel.gruss@iaik.tugraz.at +Cc: hughd@google.com +Cc: keescook@google.com +Link: https://lkml.kernel.org/r/20171204150606.572577316@linutronix.de +Signed-off-by: Ingo Molnar <mingo@kernel.org> + +diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h +index b0cf0612a454..d34ac13c5866 100644 +--- a/arch/x86/include/asm/processor.h ++++ b/arch/x86/include/asm/processor.h +@@ -341,7 +341,6 @@ struct tss_struct { + * Space for the temporary SYSENTER stack, used for SYSENTER + * and the entry trampoline as well. + */ +- unsigned long SYSENTER_stack_canary; + unsigned long SYSENTER_stack[64]; + + /* +diff --git a/arch/x86/kernel/dumpstack.c b/arch/x86/kernel/dumpstack.c +index 60267850125e..ae1ce2e3f132 100644 +--- a/arch/x86/kernel/dumpstack.c ++++ b/arch/x86/kernel/dumpstack.c +@@ -48,8 +48,7 @@ bool in_sysenter_stack(unsigned long *stack, struct stack_info *info) + int cpu = smp_processor_id(); + struct tss_struct *tss = &get_cpu_entry_area(cpu)->tss; + +- /* Treat the canary as part of the stack for unwinding purposes. */ +- void *begin = &tss->SYSENTER_stack_canary; ++ void *begin = &tss->SYSENTER_stack; + void *end = (void *)&tss->SYSENTER_stack + sizeof(tss->SYSENTER_stack); + + if ((void *)stack < begin || (void *)stack >= end) +diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c +index 86e83762e3b3..6a04287f222b 100644 +--- a/arch/x86/kernel/process.c ++++ b/arch/x86/kernel/process.c +@@ -81,7 +81,6 @@ __visible DEFINE_PER_CPU_SHARED_ALIGNED(struct tss_struct, cpu_tss) = { + */ + .io_bitmap = { [0 ... 
IO_BITMAP_LONGS] = ~0 }, + #endif +- .SYSENTER_stack_canary = STACK_END_MAGIC, + }; + EXPORT_PER_CPU_SYMBOL(cpu_tss); + +diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c +index 3e29aad5c7cc..5ade4f89a6d1 100644 +--- a/arch/x86/kernel/traps.c ++++ b/arch/x86/kernel/traps.c +@@ -814,13 +814,6 @@ dotraplinkage void do_debug(struct pt_regs *regs, long error_code) + debug_stack_usage_dec(); + + exit: +- /* +- * This is the most likely code path that involves non-trivial use +- * of the SYSENTER stack. Check that we haven't overrun it. +- */ +- WARN(this_cpu_read(cpu_tss.SYSENTER_stack_canary) != STACK_END_MAGIC, +- "Overran or corrupted SYSENTER stack\n"); +- + ist_exit(regs); + } + NOKPROBE_SYMBOL(do_debug); +-- +2.15.0 + diff --git a/queue/x86-entry-64-Remove-the-restore_c_regs_and_iret-labe.patch b/queue/x86-entry-64-Remove-the-restore_c_regs_and_iret-labe.patch new file mode 100644 index 0000000..560a3b9 --- /dev/null +++ b/queue/x86-entry-64-Remove-the-restore_c_regs_and_iret-labe.patch @@ -0,0 +1,63 @@ +From 9da78ba6b47b46428cfdfc0851511ab29c869798 Mon Sep 17 00:00:00 2001 +From: Andy Lutomirski <luto@kernel.org> +Date: Thu, 2 Nov 2017 00:58:58 -0700 +Subject: [PATCH] x86/entry/64: Remove the restore_c_regs_and_iret label + +commit 9da78ba6b47b46428cfdfc0851511ab29c869798 upstream. + +The only user was the 64-bit opportunistic SYSRET failure path, and +that path didn't really need it. This change makes the +opportunistic SYSRET code a bit more straightforward and gets rid of +the label. 
+ +Signed-off-by: Andy Lutomirski <luto@kernel.org> +Reviewed-by: Borislav Petkov <bp@suse.de> +Cc: Borislav Petkov <bpetkov@suse.de> +Cc: Brian Gerst <brgerst@gmail.com> +Cc: Dave Hansen <dave.hansen@intel.com> +Cc: Linus Torvalds <torvalds@linux-foundation.org> +Cc: Peter Zijlstra <peterz@infradead.org> +Cc: Thomas Gleixner <tglx@linutronix.de> +Link: http://lkml.kernel.org/r/be3006a7ad3326e3458cf1cc55d416252cbe1986.1509609304.git.luto@kernel.org +Signed-off-by: Ingo Molnar <mingo@kernel.org> + +diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S +index 846e84a1d1f7..e8ef83df46e6 100644 +--- a/arch/x86/entry/entry_64.S ++++ b/arch/x86/entry/entry_64.S +@@ -245,7 +245,6 @@ entry_SYSCALL64_slow_path: + call do_syscall_64 /* returns with IRQs disabled */ + + return_from_SYSCALL_64: +- RESTORE_EXTRA_REGS + TRACE_IRQS_IRETQ /* we're about to change IF */ + + /* +@@ -314,6 +313,7 @@ return_from_SYSCALL_64: + */ + syscall_return_via_sysret: + /* rcx and r11 are already restored (see code above) */ ++ RESTORE_EXTRA_REGS + RESTORE_C_REGS_EXCEPT_RCX_R11 + movq RSP(%rsp), %rsp + UNWIND_HINT_EMPTY +@@ -321,7 +321,7 @@ syscall_return_via_sysret: + + opportunistic_sysret_failed: + SWAPGS +- jmp restore_c_regs_and_iret ++ jmp restore_regs_and_iret + END(entry_SYSCALL_64) + + ENTRY(stub_ptregs_64) +@@ -638,7 +638,6 @@ retint_kernel: + */ + GLOBAL(restore_regs_and_iret) + RESTORE_EXTRA_REGS +-restore_c_regs_and_iret: + RESTORE_C_REGS + REMOVE_PT_GPREGS_FROM_STACK 8 + INTERRUPT_RETURN +-- +2.15.0 + diff --git a/queue/x86-entry-64-Remove-thread_struct-sp0.patch b/queue/x86-entry-64-Remove-thread_struct-sp0.patch new file mode 100644 index 0000000..35c9de3 --- /dev/null +++ b/queue/x86-entry-64-Remove-thread_struct-sp0.patch @@ -0,0 +1,140 @@ +From d375cf1530595e33961a8844192cddab913650e3 Mon Sep 17 00:00:00 2001 +From: Andy Lutomirski <luto@kernel.org> +Date: Thu, 2 Nov 2017 00:59:16 -0700 +Subject: [PATCH] x86/entry/64: Remove thread_struct::sp0 + +commit 
d375cf1530595e33961a8844192cddab913650e3 upstream. + +On x86_64, we can easily calculate sp0 when needed instead of +storing it in thread_struct. + +On x86_32, a similar cleanup would be possible, but it would require +cleaning up the vm86 code first, and that can wait for a later +cleanup series. + +Signed-off-by: Andy Lutomirski <luto@kernel.org> +Cc: Borislav Petkov <bpetkov@suse.de> +Cc: Brian Gerst <brgerst@gmail.com> +Cc: Dave Hansen <dave.hansen@intel.com> +Cc: Linus Torvalds <torvalds@linux-foundation.org> +Cc: Peter Zijlstra <peterz@infradead.org> +Cc: Thomas Gleixner <tglx@linutronix.de> +Link: http://lkml.kernel.org/r/719cd9c66c548c4350d98a90f050aee8b17f8919.1509609304.git.luto@kernel.org +Signed-off-by: Ingo Molnar <mingo@kernel.org> + +diff --git a/arch/x86/include/asm/compat.h b/arch/x86/include/asm/compat.h +index 5343c19814b3..948b6d8ec46f 100644 +--- a/arch/x86/include/asm/compat.h ++++ b/arch/x86/include/asm/compat.h +@@ -6,6 +6,7 @@ + */ + #include <linux/types.h> + #include <linux/sched.h> ++#include <linux/sched/task_stack.h> + #include <asm/processor.h> + #include <asm/user32.h> + #include <asm/unistd.h> +diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h +index ad59cec14239..ae2ae6d80674 100644 +--- a/arch/x86/include/asm/processor.h ++++ b/arch/x86/include/asm/processor.h +@@ -430,7 +430,9 @@ typedef struct { + struct thread_struct { + /* Cached TLS descriptors: */ + struct desc_struct tls_array[GDT_ENTRY_TLS_ENTRIES]; ++#ifdef CONFIG_X86_32 + unsigned long sp0; ++#endif + unsigned long sp; + #ifdef CONFIG_X86_32 + unsigned long sysenter_cs; +@@ -797,6 +799,13 @@ static inline void spin_lock_prefetch(const void *x) + + #define task_top_of_stack(task) ((unsigned long)(task_pt_regs(task) + 1)) + ++#define task_pt_regs(task) \ ++({ \ ++ unsigned long __ptr = (unsigned long)task_stack_page(task); \ ++ __ptr += THREAD_SIZE - TOP_OF_KERNEL_STACK_PADDING; \ ++ ((struct pt_regs *)__ptr) - 1; \ ++}) ++ + #ifdef 
CONFIG_X86_32 + /* + * User space process size: 3GB (default). +@@ -816,23 +825,6 @@ static inline void spin_lock_prefetch(const void *x) + .addr_limit = KERNEL_DS, \ + } + +-/* +- * TOP_OF_KERNEL_STACK_PADDING reserves 8 bytes on top of the ring0 stack. +- * This is necessary to guarantee that the entire "struct pt_regs" +- * is accessible even if the CPU haven't stored the SS/ESP registers +- * on the stack (interrupt gate does not save these registers +- * when switching to the same priv ring). +- * Therefore beware: accessing the ss/esp fields of the +- * "struct pt_regs" is possible, but they may contain the +- * completely wrong values. +- */ +-#define task_pt_regs(task) \ +-({ \ +- unsigned long __ptr = (unsigned long)task_stack_page(task); \ +- __ptr += THREAD_SIZE - TOP_OF_KERNEL_STACK_PADDING; \ +- ((struct pt_regs *)__ptr) - 1; \ +-}) +- + #define KSTK_ESP(task) (task_pt_regs(task)->sp) + + #else +@@ -866,11 +858,9 @@ static inline void spin_lock_prefetch(const void *x) + #define STACK_TOP_MAX TASK_SIZE_MAX + + #define INIT_THREAD { \ +- .sp0 = TOP_OF_INIT_STACK, \ + .addr_limit = KERNEL_DS, \ + } + +-#define task_pt_regs(tsk) ((struct pt_regs *)(tsk)->thread.sp0 - 1) + extern unsigned long KSTK_ESP(struct task_struct *task); + + #endif /* CONFIG_X86_64 */ +diff --git a/arch/x86/include/asm/switch_to.h b/arch/x86/include/asm/switch_to.h +index 54e64d909725..010cd6e4eafc 100644 +--- a/arch/x86/include/asm/switch_to.h ++++ b/arch/x86/include/asm/switch_to.h +@@ -1,6 +1,8 @@ + #ifndef _ASM_X86_SWITCH_TO_H + #define _ASM_X86_SWITCH_TO_H + ++#include <linux/sched/task_stack.h> ++ + struct task_struct; /* one of the stranger aspects of C forward declarations */ + + struct task_struct *__switch_to_asm(struct task_struct *prev, +@@ -87,7 +89,11 @@ static inline void refresh_sysenter_cs(struct thread_struct *thread) + /* This is used when switching tasks or entering/exiting vm86 mode. 
*/ + static inline void update_sp0(struct task_struct *task) + { ++#ifdef CONFIG_X86_32 + load_sp0(task->thread.sp0); ++#else ++ load_sp0(task_top_of_stack(task)); ++#endif + } + + #endif /* _ASM_X86_SWITCH_TO_H */ +diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c +index 45e380958392..eeeb34f85c25 100644 +--- a/arch/x86/kernel/process_64.c ++++ b/arch/x86/kernel/process_64.c +@@ -274,7 +274,6 @@ int copy_thread_tls(unsigned long clone_flags, unsigned long sp, + struct inactive_task_frame *frame; + struct task_struct *me = current; + +- p->thread.sp0 = (unsigned long)task_stack_page(p) + THREAD_SIZE; + childregs = task_pt_regs(p); + fork_frame = container_of(childregs, struct fork_frame, regs); + frame = &fork_frame->frame; +-- +2.15.0 + diff --git a/queue/x86-entry-64-Return-to-userspace-from-the-trampoline.patch b/queue/x86-entry-64-Return-to-userspace-from-the-trampoline.patch new file mode 100644 index 0000000..97d5535 --- /dev/null +++ b/queue/x86-entry-64-Return-to-userspace-from-the-trampoline.patch @@ -0,0 +1,122 @@ +From 3e3b9293d392c577b62e24e4bc9982320438e749 Mon Sep 17 00:00:00 2001 +From: Andy Lutomirski <luto@kernel.org> +Date: Mon, 4 Dec 2017 15:07:24 +0100 +Subject: [PATCH] x86/entry/64: Return to userspace from the trampoline stack + +commit 3e3b9293d392c577b62e24e4bc9982320438e749 upstream. + +By itself, this is useless. It gives us the ability to run some final code +before exit that cannot run on the kernel stack. This could include a CR3 +switch a la PAGE_TABLE_ISOLATION or some kernel stack erasing, for +example. (Or even weird things like *changing* which kernel stack gets +used as an ASLR-strengthening mechanism.) + +The SYSRET32 path is not covered yet. It could be in the future or +we could just ignore it and force the slow path if needed. 
+ +Signed-off-by: Andy Lutomirski <luto@kernel.org> +Signed-off-by: Thomas Gleixner <tglx@linutronix.de> +Reviewed-by: Thomas Gleixner <tglx@linutronix.de> +Reviewed-by: Borislav Petkov <bp@suse.de> +Cc: Boris Ostrovsky <boris.ostrovsky@oracle.com> +Cc: Borislav Petkov <bp@alien8.de> +Cc: Borislav Petkov <bpetkov@suse.de> +Cc: Brian Gerst <brgerst@gmail.com> +Cc: Dave Hansen <dave.hansen@intel.com> +Cc: Dave Hansen <dave.hansen@linux.intel.com> +Cc: David Laight <David.Laight@aculab.com> +Cc: Denys Vlasenko <dvlasenk@redhat.com> +Cc: Eduardo Valentin <eduval@amazon.com> +Cc: Greg KH <gregkh@linuxfoundation.org> +Cc: H. Peter Anvin <hpa@zytor.com> +Cc: Josh Poimboeuf <jpoimboe@redhat.com> +Cc: Juergen Gross <jgross@suse.com> +Cc: Linus Torvalds <torvalds@linux-foundation.org> +Cc: Peter Zijlstra <peterz@infradead.org> +Cc: Rik van Riel <riel@redhat.com> +Cc: Will Deacon <will.deacon@arm.com> +Cc: aliguori@amazon.com +Cc: daniel.gruss@iaik.tugraz.at +Cc: hughd@google.com +Cc: keescook@google.com +Link: https://lkml.kernel.org/r/20171204150606.306546484@linutronix.de +Signed-off-by: Ingo Molnar <mingo@kernel.org> + +diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S +index 35b8e949ac2f..42a9379f7acb 100644 +--- a/arch/x86/entry/entry_64.S ++++ b/arch/x86/entry/entry_64.S +@@ -326,8 +326,24 @@ syscall_return_via_sysret: + popq %rsi /* skip rcx */ + popq %rdx + popq %rsi ++ ++ /* ++ * Now all regs are restored except RSP and RDI. ++ * Save old stack pointer and switch to trampoline stack. ++ */ ++ movq %rsp, %rdi ++ movq PER_CPU_VAR(cpu_tss + TSS_sp0), %rsp ++ ++ pushq RSP-RDI(%rdi) /* RSP */ ++ pushq (%rdi) /* RDI */ ++ ++ /* ++ * We are on the trampoline stack. All regs except RDI are live. ++ * We can do future final exit work right here. 
++ */ ++ + popq %rdi +- movq RSP-ORIG_RAX(%rsp), %rsp ++ popq %rsp + USERGS_SYSRET64 + END(entry_SYSCALL_64) + +@@ -630,10 +646,41 @@ GLOBAL(swapgs_restore_regs_and_return_to_usermode) + ud2 + 1: + #endif +- SWAPGS + POP_EXTRA_REGS +- POP_C_REGS +- addq $8, %rsp /* skip regs->orig_ax */ ++ popq %r11 ++ popq %r10 ++ popq %r9 ++ popq %r8 ++ popq %rax ++ popq %rcx ++ popq %rdx ++ popq %rsi ++ ++ /* ++ * The stack is now user RDI, orig_ax, RIP, CS, EFLAGS, RSP, SS. ++ * Save old stack pointer and switch to trampoline stack. ++ */ ++ movq %rsp, %rdi ++ movq PER_CPU_VAR(cpu_tss + TSS_sp0), %rsp ++ ++ /* Copy the IRET frame to the trampoline stack. */ ++ pushq 6*8(%rdi) /* SS */ ++ pushq 5*8(%rdi) /* RSP */ ++ pushq 4*8(%rdi) /* EFLAGS */ ++ pushq 3*8(%rdi) /* CS */ ++ pushq 2*8(%rdi) /* RIP */ ++ ++ /* Push user RDI on the trampoline stack. */ ++ pushq (%rdi) ++ ++ /* ++ * We are on the trampoline stack. All regs except RDI are live. ++ * We can do future final exit work right here. ++ */ ++ ++ /* Restore RDI. */ ++ popq %rdi ++ SWAPGS + INTERRUPT_RETURN + + +-- +2.15.0 + diff --git a/queue/x86-entry-64-Separate-cpu_current_top_of_stack-from-.patch b/queue/x86-entry-64-Separate-cpu_current_top_of_stack-from-.patch new file mode 100644 index 0000000..0fc1dc0 --- /dev/null +++ b/queue/x86-entry-64-Separate-cpu_current_top_of_stack-from-.patch @@ -0,0 +1,146 @@ +From 9aaefe7b59ae00605256a7d6bd1c1456432495fc Mon Sep 17 00:00:00 2001 +From: Andy Lutomirski <luto@kernel.org> +Date: Mon, 4 Dec 2017 15:07:21 +0100 +Subject: [PATCH] x86/entry/64: Separate cpu_current_top_of_stack from TSS.sp0 + +commit 9aaefe7b59ae00605256a7d6bd1c1456432495fc upstream. + +On 64-bit kernels, we used to assume that TSS.sp0 was the current +top of stack. With the addition of an entry trampoline, this will +no longer be the case. Store the current top of stack in TSS.sp1, +which is otherwise unused but shares the same cacheline. 
+ +Signed-off-by: Andy Lutomirski <luto@kernel.org> +Signed-off-by: Thomas Gleixner <tglx@linutronix.de> +Reviewed-by: Thomas Gleixner <tglx@linutronix.de> +Reviewed-by: Borislav Petkov <bp@suse.de> +Cc: Boris Ostrovsky <boris.ostrovsky@oracle.com> +Cc: Borislav Petkov <bp@alien8.de> +Cc: Borislav Petkov <bpetkov@suse.de> +Cc: Brian Gerst <brgerst@gmail.com> +Cc: Dave Hansen <dave.hansen@intel.com> +Cc: Dave Hansen <dave.hansen@linux.intel.com> +Cc: David Laight <David.Laight@aculab.com> +Cc: Denys Vlasenko <dvlasenk@redhat.com> +Cc: Eduardo Valentin <eduval@amazon.com> +Cc: Greg KH <gregkh@linuxfoundation.org> +Cc: H. Peter Anvin <hpa@zytor.com> +Cc: Josh Poimboeuf <jpoimboe@redhat.com> +Cc: Juergen Gross <jgross@suse.com> +Cc: Linus Torvalds <torvalds@linux-foundation.org> +Cc: Peter Zijlstra <peterz@infradead.org> +Cc: Rik van Riel <riel@redhat.com> +Cc: Will Deacon <will.deacon@arm.com> +Cc: aliguori@amazon.com +Cc: daniel.gruss@iaik.tugraz.at +Cc: hughd@google.com +Cc: keescook@google.com +Link: https://lkml.kernel.org/r/20171204150606.050864668@linutronix.de +Signed-off-by: Ingo Molnar <mingo@kernel.org> + +diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h +index 759051251664..b0cf0612a454 100644 +--- a/arch/x86/include/asm/processor.h ++++ b/arch/x86/include/asm/processor.h +@@ -309,7 +309,13 @@ struct x86_hw_tss { + struct x86_hw_tss { + u32 reserved1; + u64 sp0; ++ ++ /* ++ * We store cpu_current_top_of_stack in sp1 so it's always accessible. ++ * Linux does not use ring 1, so sp1 is not otherwise needed. 
++ */ + u64 sp1; ++ + u64 sp2; + u64 reserved2; + u64 ist[7]; +@@ -368,6 +374,8 @@ DECLARE_PER_CPU_PAGE_ALIGNED(struct tss_struct, cpu_tss); + + #ifdef CONFIG_X86_32 + DECLARE_PER_CPU(unsigned long, cpu_current_top_of_stack); ++#else ++#define cpu_current_top_of_stack cpu_tss.x86_tss.sp1 + #endif + + /* +@@ -539,12 +547,12 @@ static inline void native_swapgs(void) + + static inline unsigned long current_top_of_stack(void) + { +-#ifdef CONFIG_X86_64 +- return this_cpu_read_stable(cpu_tss.x86_tss.sp0); +-#else +- /* sp0 on x86_32 is special in and around vm86 mode. */ ++ /* ++ * We can't read directly from tss.sp0: sp0 on x86_32 is special in ++ * and around vm86 mode and sp0 on x86_64 is special because of the ++ * entry trampoline. ++ */ + return this_cpu_read_stable(cpu_current_top_of_stack); +-#endif + } + + static inline bool on_thread_stack(void) +diff --git a/arch/x86/include/asm/thread_info.h b/arch/x86/include/asm/thread_info.h +index 70f425947dc5..44a04999791e 100644 +--- a/arch/x86/include/asm/thread_info.h ++++ b/arch/x86/include/asm/thread_info.h +@@ -207,7 +207,7 @@ static inline int arch_within_stack_frames(const void * const stack, + #else /* !__ASSEMBLY__ */ + + #ifdef CONFIG_X86_64 +-# define cpu_current_top_of_stack (cpu_tss + TSS_sp0) ++# define cpu_current_top_of_stack (cpu_tss + TSS_sp1) + #endif + + #endif +diff --git a/arch/x86/kernel/asm-offsets_64.c b/arch/x86/kernel/asm-offsets_64.c +index e3a5175a444b..bf51e51d808d 100644 +--- a/arch/x86/kernel/asm-offsets_64.c ++++ b/arch/x86/kernel/asm-offsets_64.c +@@ -66,6 +66,7 @@ int main(void) + + OFFSET(TSS_ist, tss_struct, x86_tss.ist); + OFFSET(TSS_sp0, tss_struct, x86_tss.sp0); ++ OFFSET(TSS_sp1, tss_struct, x86_tss.sp1); + BLANK(); + + #ifdef CONFIG_CC_STACKPROTECTOR +diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c +index 35d674157fda..86e83762e3b3 100644 +--- a/arch/x86/kernel/process.c ++++ b/arch/x86/kernel/process.c +@@ -56,6 +56,16 @@ __visible 
DEFINE_PER_CPU_SHARED_ALIGNED(struct tss_struct, cpu_tss) = { + * Poison it. + */ + .sp0 = (1UL << (BITS_PER_LONG-1)) + 1, ++ ++#ifdef CONFIG_X86_64 ++ /* ++ * .sp1 is cpu_current_top_of_stack. The init task never ++ * runs user code, but cpu_current_top_of_stack should still ++ * be well defined before the first context switch. ++ */ ++ .sp1 = TOP_OF_INIT_STACK, ++#endif ++ + #ifdef CONFIG_X86_32 + .ss0 = __KERNEL_DS, + .ss1 = __KERNEL_CS, +diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c +index 01b119bebb68..157f81816915 100644 +--- a/arch/x86/kernel/process_64.c ++++ b/arch/x86/kernel/process_64.c +@@ -461,6 +461,7 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p) + * Switch the PDA and FPU contexts. + */ + this_cpu_write(current_task, next_p); ++ this_cpu_write(cpu_current_top_of_stack, task_top_of_stack(next_p)); + + /* Reload sp0. */ + update_sp0(next_p); +-- +2.15.0 + diff --git a/queue/x86-entry-64-Shorten-TEST-instructions.patch b/queue/x86-entry-64-Shorten-TEST-instructions.patch new file mode 100644 index 0000000..90fa702 --- /dev/null +++ b/queue/x86-entry-64-Shorten-TEST-instructions.patch @@ -0,0 +1,46 @@ +From 1e4c4f610f774df6088d7c065b2dd4d22adba698 Mon Sep 17 00:00:00 2001 +From: Borislav Petkov <bp@suse.de> +Date: Thu, 2 Nov 2017 13:09:26 +0100 +Subject: [PATCH] x86/entry/64: Shorten TEST instructions + +commit 1e4c4f610f774df6088d7c065b2dd4d22adba698 upstream. + +Convert TESTL to TESTB and save 3 bytes per callsite. + +No functionality change. 
+ +Signed-off-by: Borislav Petkov <bp@suse.de> +Cc: Andy Lutomirski <luto@kernel.org> +Cc: Brian Gerst <brgerst@gmail.com> +Cc: Dave Hansen <dave.hansen@intel.com> +Cc: Linus Torvalds <torvalds@linux-foundation.org> +Cc: Peter Zijlstra <peterz@infradead.org> +Cc: Thomas Gleixner <tglx@linutronix.de> +Link: http://lkml.kernel.org/r/20171102120926.4srwerqrr7g72e2k@pd.tnic +Signed-off-by: Ingo Molnar <mingo@kernel.org> + +diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S +index 40e9933a2d33..84263c79a119 100644 +--- a/arch/x86/entry/entry_64.S ++++ b/arch/x86/entry/entry_64.S +@@ -620,7 +620,7 @@ GLOBAL(retint_user) + GLOBAL(swapgs_restore_regs_and_return_to_usermode) + #ifdef CONFIG_DEBUG_ENTRY + /* Assert that pt_regs indicates user mode. */ +- testl $3, CS(%rsp) ++ testb $3, CS(%rsp) + jnz 1f + ud2 + 1: +@@ -653,7 +653,7 @@ retint_kernel: + GLOBAL(restore_regs_and_return_to_kernel) + #ifdef CONFIG_DEBUG_ENTRY + /* Assert that pt_regs indicates kernel mode. */ +- testl $3, CS(%rsp) ++ testb $3, CS(%rsp) + jz 1f + ud2 + 1: +-- +2.15.0 + diff --git a/queue/x86-entry-64-Shrink-paranoid_exit_restore-and-make-l.patch b/queue/x86-entry-64-Shrink-paranoid_exit_restore-and-make-l.patch new file mode 100644 index 0000000..7315082 --- /dev/null +++ b/queue/x86-entry-64-Shrink-paranoid_exit_restore-and-make-l.patch @@ -0,0 +1,59 @@ +From e53178328c9b96fbdbc719e78c93b5687ee007c3 Mon Sep 17 00:00:00 2001 +From: Andy Lutomirski <luto@kernel.org> +Date: Thu, 2 Nov 2017 00:59:02 -0700 +Subject: [PATCH] x86/entry/64: Shrink paranoid_exit_restore and make labels + local + +commit e53178328c9b96fbdbc719e78c93b5687ee007c3 upstream. + +paranoid_exit_restore was a copy of restore_regs_and_return_to_kernel. +Merge them and make the paranoid_exit internal labels local. + +Keeping .Lparanoid_exit makes the code a bit shorter because it +allows a 2-byte jnz instead of a 5-byte jnz. + +Saves 96 bytes of text. 
+ +( This is still a bit suboptimal in a non-CONFIG_TRACE_IRQFLAGS + kernel, but fixing that would make the code rather messy. ) + +Signed-off-by: Andy Lutomirski <luto@kernel.org> +Cc: Borislav Petkov <bpetkov@suse.de> +Cc: Brian Gerst <brgerst@gmail.com> +Cc: Dave Hansen <dave.hansen@intel.com> +Cc: Linus Torvalds <torvalds@linux-foundation.org> +Cc: Peter Zijlstra <peterz@infradead.org> +Cc: Thomas Gleixner <tglx@linutronix.de> +Link: http://lkml.kernel.org/r/510d66a1895cda9473c84b1086f0bb974f22de6a.1509609304.git.luto@kernel.org +Signed-off-by: Ingo Molnar <mingo@kernel.org> + +diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S +index 925d56246071..155396443aaa 100644 +--- a/arch/x86/entry/entry_64.S ++++ b/arch/x86/entry/entry_64.S +@@ -1123,17 +1123,14 @@ ENTRY(paranoid_exit) + DISABLE_INTERRUPTS(CLBR_ANY) + TRACE_IRQS_OFF_DEBUG + testl %ebx, %ebx /* swapgs needed? */ +- jnz paranoid_exit_no_swapgs ++ jnz .Lparanoid_exit_no_swapgs + TRACE_IRQS_IRETQ + SWAPGS_UNSAFE_STACK +- jmp paranoid_exit_restore +-paranoid_exit_no_swapgs: ++ jmp .Lparanoid_exit_restore ++.Lparanoid_exit_no_swapgs: + TRACE_IRQS_IRETQ_DEBUG +-paranoid_exit_restore: +- RESTORE_EXTRA_REGS +- RESTORE_C_REGS +- REMOVE_PT_GPREGS_FROM_STACK 8 +- INTERRUPT_RETURN ++.Lparanoid_exit_restore: ++ jmp restore_regs_and_return_to_kernel + END(paranoid_exit) + + /* +-- +2.15.0 + diff --git a/queue/x86-entry-64-Simplify-reg-restore-code-in-the-standa.patch b/queue/x86-entry-64-Simplify-reg-restore-code-in-the-standa.patch new file mode 100644 index 0000000..d65b626 --- /dev/null +++ b/queue/x86-entry-64-Simplify-reg-restore-code-in-the-standa.patch @@ -0,0 +1,91 @@ +From e872045bfd9c465a8555bab4b8567d56a4d2d3bb Mon Sep 17 00:00:00 2001 +From: Andy Lutomirski <luto@kernel.org> +Date: Thu, 2 Nov 2017 00:59:01 -0700 +Subject: [PATCH] x86/entry/64: Simplify reg restore code in the standard IRET + paths + +commit e872045bfd9c465a8555bab4b8567d56a4d2d3bb upstream. 
+ +The old code restored all the registers with movq instead of pop. + +In theory, this was done because some CPUs have higher movq +throughput, but any gain there would be tiny and is almost certainly +outweighed by the higher text size. + +This saves 96 bytes of text. + +Signed-off-by: Andy Lutomirski <luto@kernel.org> +Cc: Borislav Petkov <bpetkov@suse.de> +Cc: Brian Gerst <brgerst@gmail.com> +Cc: Dave Hansen <dave.hansen@intel.com> +Cc: Linus Torvalds <torvalds@linux-foundation.org> +Cc: Peter Zijlstra <peterz@infradead.org> +Cc: Thomas Gleixner <tglx@linutronix.de> +Link: http://lkml.kernel.org/r/ad82520a207ccd851b04ba613f4f752b33ac05f7.1509609304.git.luto@kernel.org +Signed-off-by: Ingo Molnar <mingo@kernel.org> + +diff --git a/arch/x86/entry/calling.h b/arch/x86/entry/calling.h +index 640aafebdc00..0b9dd8123701 100644 +--- a/arch/x86/entry/calling.h ++++ b/arch/x86/entry/calling.h +@@ -151,6 +151,27 @@ For 32-bit we have the following conventions - kernel is built with + UNWIND_HINT_REGS offset=\offset extra=0 + .endm + ++ .macro POP_EXTRA_REGS ++ popq %r15 ++ popq %r14 ++ popq %r13 ++ popq %r12 ++ popq %rbp ++ popq %rbx ++ .endm ++ ++ .macro POP_C_REGS ++ popq %r11 ++ popq %r10 ++ popq %r9 ++ popq %r8 ++ popq %rax ++ popq %rcx ++ popq %rdx ++ popq %rsi ++ popq %rdi ++ .endm ++ + .macro RESTORE_C_REGS_HELPER rstor_rax=1, rstor_rcx=1, rstor_r11=1, rstor_r8910=1, rstor_rdx=1 + .if \rstor_r11 + movq 6*8(%rsp), %r11 +diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S +index d6ffdc9afcbb..925d56246071 100644 +--- a/arch/x86/entry/entry_64.S ++++ b/arch/x86/entry/entry_64.S +@@ -618,9 +618,9 @@ GLOBAL(swapgs_restore_regs_and_return_to_usermode) + 1: + #endif + SWAPGS +- RESTORE_EXTRA_REGS +- RESTORE_C_REGS +- REMOVE_PT_GPREGS_FROM_STACK 8 ++ POP_EXTRA_REGS ++ POP_C_REGS ++ addq $8, %rsp /* skip regs->orig_ax */ + INTERRUPT_RETURN + + +@@ -650,9 +650,9 @@ GLOBAL(restore_regs_and_return_to_kernel) + ud2 + 1: + #endif +- RESTORE_EXTRA_REGS +- 
RESTORE_C_REGS +- REMOVE_PT_GPREGS_FROM_STACK 8 ++ POP_EXTRA_REGS ++ POP_C_REGS ++ addq $8, %rsp /* skip regs->orig_ax */ + INTERRUPT_RETURN + + ENTRY(native_iret) +-- +2.15.0 + diff --git a/queue/x86-entry-64-Split-the-IRET-to-user-and-IRET-to-kern.patch b/queue/x86-entry-64-Split-the-IRET-to-user-and-IRET-to-kern.patch new file mode 100644 index 0000000..996e1bc --- /dev/null +++ b/queue/x86-entry-64-Split-the-IRET-to-user-and-IRET-to-kern.patch @@ -0,0 +1,121 @@ +From 26c4ef9c49d8a0341f6d97ce2cfdd55d1236ed29 Mon Sep 17 00:00:00 2001 +From: Andy Lutomirski <luto@kernel.org> +Date: Thu, 2 Nov 2017 00:58:59 -0700 +Subject: [PATCH] x86/entry/64: Split the IRET-to-user and IRET-to-kernel paths + +commit 26c4ef9c49d8a0341f6d97ce2cfdd55d1236ed29 upstream. + +These code paths will diverge soon. + +Signed-off-by: Andy Lutomirski <luto@kernel.org> +Cc: Borislav Petkov <bpetkov@suse.de> +Cc: Brian Gerst <brgerst@gmail.com> +Cc: Dave Hansen <dave.hansen@intel.com> +Cc: Linus Torvalds <torvalds@linux-foundation.org> +Cc: Peter Zijlstra <peterz@infradead.org> +Cc: Thomas Gleixner <tglx@linutronix.de> +Link: http://lkml.kernel.org/r/dccf8c7b3750199b4b30383c812d4e2931811509.1509609304.git.luto@kernel.org +Signed-off-by: Ingo Molnar <mingo@kernel.org> + +diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S +index e8ef83df46e6..3eeb1694210c 100644 +--- a/arch/x86/entry/entry_64.S ++++ b/arch/x86/entry/entry_64.S +@@ -321,7 +321,7 @@ syscall_return_via_sysret: + + opportunistic_sysret_failed: + SWAPGS +- jmp restore_regs_and_iret ++ jmp restore_regs_and_return_to_usermode + END(entry_SYSCALL_64) + + ENTRY(stub_ptregs_64) +@@ -423,7 +423,7 @@ ENTRY(ret_from_fork) + call syscall_return_slowpath /* returns with IRQs disabled */ + TRACE_IRQS_ON /* user mode is traced as IRQS on */ + SWAPGS +- jmp restore_regs_and_iret ++ jmp restore_regs_and_return_to_usermode + + 1: + /* kernel thread */ +@@ -612,7 +612,20 @@ GLOBAL(retint_user) + call prepare_exit_to_usermode + 
TRACE_IRQS_IRETQ + SWAPGS +- jmp restore_regs_and_iret ++ ++GLOBAL(restore_regs_and_return_to_usermode) ++#ifdef CONFIG_DEBUG_ENTRY ++ /* Assert that pt_regs indicates user mode. */ ++ testl $3, CS(%rsp) ++ jnz 1f ++ ud2 ++1: ++#endif ++ RESTORE_EXTRA_REGS ++ RESTORE_C_REGS ++ REMOVE_PT_GPREGS_FROM_STACK 8 ++ INTERRUPT_RETURN ++ + + /* Returning to kernel space */ + retint_kernel: +@@ -632,11 +645,14 @@ retint_kernel: + */ + TRACE_IRQS_IRETQ + +-/* +- * At this label, code paths which return to kernel and to user, +- * which come from interrupts/exception and from syscalls, merge. +- */ +-GLOBAL(restore_regs_and_iret) ++GLOBAL(restore_regs_and_return_to_kernel) ++#ifdef CONFIG_DEBUG_ENTRY ++ /* Assert that pt_regs indicates kernel mode. */ ++ testl $3, CS(%rsp) ++ jz 1f ++ ud2 ++1: ++#endif + RESTORE_EXTRA_REGS + RESTORE_C_REGS + REMOVE_PT_GPREGS_FROM_STACK 8 +@@ -1327,7 +1343,7 @@ ENTRY(nmi) + * work, because we don't want to enable interrupts. + */ + SWAPGS +- jmp restore_regs_and_iret ++ jmp restore_regs_and_return_to_usermode + + .Lnmi_from_kernel: + /* +diff --git a/arch/x86/entry/entry_64_compat.S b/arch/x86/entry/entry_64_compat.S +index e26c25ca7756..9ca014a99968 100644 +--- a/arch/x86/entry/entry_64_compat.S ++++ b/arch/x86/entry/entry_64_compat.S +@@ -337,7 +337,7 @@ ENTRY(entry_INT80_compat) + /* Go back to user mode. 
*/ + TRACE_IRQS_ON + SWAPGS +- jmp restore_regs_and_iret ++ jmp restore_regs_and_return_to_usermode + END(entry_INT80_compat) + + ENTRY(stub32_clone) +diff --git a/arch/x86/kernel/head_64.S b/arch/x86/kernel/head_64.S +index 189bf42dfa2b..08f067faa264 100644 +--- a/arch/x86/kernel/head_64.S ++++ b/arch/x86/kernel/head_64.S +@@ -326,7 +326,7 @@ early_idt_handler_common: + + 20: + decl early_recursion_flag(%rip) +- jmp restore_regs_and_iret ++ jmp restore_regs_and_return_to_kernel + END(early_idt_handler_common) + + __INITDATA +-- +2.15.0 + diff --git a/queue/x86-entry-64-Stop-initializing-TSS.sp0-at-boot.patch b/queue/x86-entry-64-Stop-initializing-TSS.sp0-at-boot.patch new file mode 100644 index 0000000..54b7b25 --- /dev/null +++ b/queue/x86-entry-64-Stop-initializing-TSS.sp0-at-boot.patch @@ -0,0 +1,90 @@ +From 20bb83443ea79087b5e5f8dab4e9d80bb9bf7acb Mon Sep 17 00:00:00 2001 +From: Andy Lutomirski <luto@kernel.org> +Date: Thu, 2 Nov 2017 00:59:13 -0700 +Subject: [PATCH] x86/entry/64: Stop initializing TSS.sp0 at boot + +commit 20bb83443ea79087b5e5f8dab4e9d80bb9bf7acb upstream. + +In my quest to get rid of thread_struct::sp0, I want to clean up or +remove all of its readers. Two of them are in cpu_init() (32-bit and +64-bit), and they aren't needed. This is because we never enter +userspace at all on the threads that CPUs are initialized in. + +Poison the initial TSS.sp0 and stop initializing it on CPU init. + +The comment text mostly comes from Dave Hansen. Thanks! 
+ +Signed-off-by: Andy Lutomirski <luto@kernel.org> +Cc: Borislav Petkov <bpetkov@suse.de> +Cc: Brian Gerst <brgerst@gmail.com> +Cc: Dave Hansen <dave.hansen@intel.com> +Cc: Linus Torvalds <torvalds@linux-foundation.org> +Cc: Peter Zijlstra <peterz@infradead.org> +Cc: Thomas Gleixner <tglx@linutronix.de> +Link: http://lkml.kernel.org/r/ee4a00540ad28c6cff475fbcc7769a4460acc861.1509609304.git.luto@kernel.org +Signed-off-by: Ingo Molnar <mingo@kernel.org> + +diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c +index 4e7fb9c3bfa5..cdf79ab628c2 100644 +--- a/arch/x86/kernel/cpu/common.c ++++ b/arch/x86/kernel/cpu/common.c +@@ -1570,9 +1570,13 @@ void cpu_init(void) + initialize_tlbstate_and_flush(); + enter_lazy_tlb(&init_mm, me); + +- load_sp0(current->thread.sp0); ++ /* ++ * Initialize the TSS. Don't bother initializing sp0, as the initial ++ * task never enters user mode. ++ */ + set_tss_desc(cpu, t); + load_TR_desc(); ++ + load_mm_ldt(&init_mm); + + clear_all_debug_regs(); +@@ -1594,7 +1598,6 @@ void cpu_init(void) + int cpu = smp_processor_id(); + struct task_struct *curr = current; + struct tss_struct *t = &per_cpu(cpu_tss, cpu); +- struct thread_struct *thread = &curr->thread; + + wait_for_master_cpu(cpu); + +@@ -1625,9 +1628,13 @@ void cpu_init(void) + initialize_tlbstate_and_flush(); + enter_lazy_tlb(&init_mm, curr); + +- load_sp0(thread->sp0); ++ /* ++ * Initialize the TSS. Don't bother initializing sp0, as the initial ++ * task never enters user mode. 
++ */ + set_tss_desc(cpu, t); + load_TR_desc(); ++ + load_mm_ldt(&init_mm); + + t->x86_tss.io_bitmap_base = offsetof(struct tss_struct, io_bitmap); +diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c +index bd6b85fac666..ff8a9acbcf8b 100644 +--- a/arch/x86/kernel/process.c ++++ b/arch/x86/kernel/process.c +@@ -48,7 +48,13 @@ + */ + __visible DEFINE_PER_CPU_SHARED_ALIGNED(struct tss_struct, cpu_tss) = { + .x86_tss = { +- .sp0 = TOP_OF_INIT_STACK, ++ /* ++ * .sp0 is only used when entering ring 0 from a lower ++ * privilege level. Since the init task never runs anything ++ * but ring 0 code, there is no need for a valid value here. ++ * Poison it. ++ */ ++ .sp0 = (1UL << (BITS_PER_LONG-1)) + 1, + #ifdef CONFIG_X86_32 + .ss0 = __KERNEL_DS, + .ss1 = __KERNEL_CS, +-- +2.15.0 + diff --git a/queue/x86-entry-64-Use-POP-instead-of-MOV-to-restore-regs-.patch b/queue/x86-entry-64-Use-POP-instead-of-MOV-to-restore-regs-.patch new file mode 100644 index 0000000..b59844b --- /dev/null +++ b/queue/x86-entry-64-Use-POP-instead-of-MOV-to-restore-regs-.patch @@ -0,0 +1,46 @@ +From 471ee4832209e986029b9fabdaad57b1eecb856b Mon Sep 17 00:00:00 2001 +From: Andy Lutomirski <luto@kernel.org> +Date: Thu, 2 Nov 2017 00:59:05 -0700 +Subject: [PATCH] x86/entry/64: Use POP instead of MOV to restore regs on NMI + return + +commit 471ee4832209e986029b9fabdaad57b1eecb856b upstream. + +This gets rid of the last user of the old RESTORE_..._REGS infrastructure. 
+ +Signed-off-by: Andy Lutomirski <luto@kernel.org> +Cc: Borislav Petkov <bpetkov@suse.de> +Cc: Brian Gerst <brgerst@gmail.com> +Cc: Dave Hansen <dave.hansen@intel.com> +Cc: Linus Torvalds <torvalds@linux-foundation.org> +Cc: Peter Zijlstra <peterz@infradead.org> +Cc: Thomas Gleixner <tglx@linutronix.de> +Link: http://lkml.kernel.org/r/652a260f17a160789bc6a41d997f98249b73e2ab.1509609304.git.luto@kernel.org +Signed-off-by: Ingo Molnar <mingo@kernel.org> + +diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S +index b5a0ea63d391..5b2f0bc661a0 100644 +--- a/arch/x86/entry/entry_64.S ++++ b/arch/x86/entry/entry_64.S +@@ -1559,11 +1559,14 @@ end_repeat_nmi: + nmi_swapgs: + SWAPGS_UNSAFE_STACK + nmi_restore: +- RESTORE_EXTRA_REGS +- RESTORE_C_REGS ++ POP_EXTRA_REGS ++ POP_C_REGS + +- /* Point RSP at the "iret" frame. */ +- REMOVE_PT_GPREGS_FROM_STACK 6*8 ++ /* ++ * Skip orig_ax and the "outermost" frame to point RSP at the "iret" ++ * at the "iret" frame. ++ */ ++ addq $6*8, %rsp + + /* + * Clear "NMI executing". Set DF first so that we can easily +-- +2.15.0 + diff --git a/queue/x86-entry-64-Use-a-per-CPU-trampoline-stack-for-IDT-.patch b/queue/x86-entry-64-Use-a-per-CPU-trampoline-stack-for-IDT-.patch new file mode 100644 index 0000000..ed4131a --- /dev/null +++ b/queue/x86-entry-64-Use-a-per-CPU-trampoline-stack-for-IDT-.patch @@ -0,0 +1,279 @@ +From 7f2590a110b837af5679d08fc25c6227c5a8c497 Mon Sep 17 00:00:00 2001 +From: Andy Lutomirski <luto@kernel.org> +Date: Mon, 4 Dec 2017 15:07:23 +0100 +Subject: [PATCH] x86/entry/64: Use a per-CPU trampoline stack for IDT entries + +commit 7f2590a110b837af5679d08fc25c6227c5a8c497 upstream. + +Historically, IDT entries from usermode have always gone directly +to the running task's kernel stack. Rearrange it so that we enter on +a per-CPU trampoline stack and then manually switch to the task's stack. 
+This touches a couple of extra cachelines, but it gives us a chance +to run some code before we touch the kernel stack. + +The asm isn't exactly beautiful, but I think that fully refactoring +it can wait. + +Signed-off-by: Andy Lutomirski <luto@kernel.org> +Signed-off-by: Thomas Gleixner <tglx@linutronix.de> +Reviewed-by: Borislav Petkov <bp@suse.de> +Reviewed-by: Thomas Gleixner <tglx@linutronix.de> +Cc: Boris Ostrovsky <boris.ostrovsky@oracle.com> +Cc: Borislav Petkov <bp@alien8.de> +Cc: Borislav Petkov <bpetkov@suse.de> +Cc: Brian Gerst <brgerst@gmail.com> +Cc: Dave Hansen <dave.hansen@intel.com> +Cc: Dave Hansen <dave.hansen@linux.intel.com> +Cc: David Laight <David.Laight@aculab.com> +Cc: Denys Vlasenko <dvlasenk@redhat.com> +Cc: Eduardo Valentin <eduval@amazon.com> +Cc: Greg KH <gregkh@linuxfoundation.org> +Cc: H. Peter Anvin <hpa@zytor.com> +Cc: Josh Poimboeuf <jpoimboe@redhat.com> +Cc: Juergen Gross <jgross@suse.com> +Cc: Linus Torvalds <torvalds@linux-foundation.org> +Cc: Peter Zijlstra <peterz@infradead.org> +Cc: Rik van Riel <riel@redhat.com> +Cc: Will Deacon <will.deacon@arm.com> +Cc: aliguori@amazon.com +Cc: daniel.gruss@iaik.tugraz.at +Cc: hughd@google.com +Cc: keescook@google.com +Link: https://lkml.kernel.org/r/20171204150606.225330557@linutronix.de +Signed-off-by: Ingo Molnar <mingo@kernel.org> + +diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S +index 32306788821c..35b8e949ac2f 100644 +--- a/arch/x86/entry/entry_64.S ++++ b/arch/x86/entry/entry_64.S +@@ -560,6 +560,13 @@ END(irq_entries_start) + /* 0(%rsp): ~(interrupt number) */ + .macro interrupt func + cld ++ ++ testb $3, CS-ORIG_RAX(%rsp) ++ jz 1f ++ SWAPGS ++ call switch_to_thread_stack ++1: ++ + ALLOC_PT_GPREGS_ON_STACK + SAVE_C_REGS + SAVE_EXTRA_REGS +@@ -569,12 +576,8 @@ END(irq_entries_start) + jz 1f + + /* +- * IRQ from user mode. Switch to kernel gsbase and inform context +- * tracking that we're in kernel mode. +- */ +- SWAPGS +- +- /* ++ * IRQ from user mode. 
++ * + * We need to tell lockdep that IRQs are off. We can't do this until + * we fix gsbase, and we should do it before enter_from_user_mode + * (which can take locks). Since TRACE_IRQS_OFF idempotent, +@@ -828,6 +831,32 @@ apicinterrupt IRQ_WORK_VECTOR irq_work_interrupt smp_irq_work_interrupt + */ + #define CPU_TSS_IST(x) PER_CPU_VAR(cpu_tss) + (TSS_ist + ((x) - 1) * 8) + ++/* ++ * Switch to the thread stack. This is called with the IRET frame and ++ * orig_ax on the stack. (That is, RDI..R12 are not on the stack and ++ * space has not been allocated for them.) ++ */ ++ENTRY(switch_to_thread_stack) ++ UNWIND_HINT_FUNC ++ ++ pushq %rdi ++ movq %rsp, %rdi ++ movq PER_CPU_VAR(cpu_current_top_of_stack), %rsp ++ UNWIND_HINT sp_offset=16 sp_reg=ORC_REG_DI ++ ++ pushq 7*8(%rdi) /* regs->ss */ ++ pushq 6*8(%rdi) /* regs->rsp */ ++ pushq 5*8(%rdi) /* regs->eflags */ ++ pushq 4*8(%rdi) /* regs->cs */ ++ pushq 3*8(%rdi) /* regs->ip */ ++ pushq 2*8(%rdi) /* regs->orig_ax */ ++ pushq 8(%rdi) /* return address */ ++ UNWIND_HINT_FUNC ++ ++ movq (%rdi), %rdi ++ ret ++END(switch_to_thread_stack) ++ + .macro idtentry sym do_sym has_error_code:req paranoid=0 shift_ist=-1 + ENTRY(\sym) + UNWIND_HINT_IRET_REGS offset=\has_error_code*8 +@@ -845,11 +874,12 @@ ENTRY(\sym) + + ALLOC_PT_GPREGS_ON_STACK + +- .if \paranoid +- .if \paranoid == 1 ++ .if \paranoid < 2 + testb $3, CS(%rsp) /* If coming from userspace, switch stacks */ +- jnz 1f ++ jnz .Lfrom_usermode_switch_stack_\@ + .endif ++ ++ .if \paranoid + call paranoid_entry + .else + call error_entry +@@ -891,20 +921,15 @@ ENTRY(\sym) + jmp error_exit + .endif + +- .if \paranoid == 1 ++ .if \paranoid < 2 + /* +- * Paranoid entry from userspace. Switch stacks and treat it ++ * Entry from userspace. Switch stacks and treat it + * as a normal entry. This means that paranoid handlers + * run in real process context if user_mode(regs). 
+ */ +-1: ++.Lfrom_usermode_switch_stack_\@: + call error_entry + +- +- movq %rsp, %rdi /* pt_regs pointer */ +- call sync_regs +- movq %rax, %rsp /* switch stack */ +- + movq %rsp, %rdi /* pt_regs pointer */ + + .if \has_error_code +@@ -1165,6 +1190,14 @@ ENTRY(error_entry) + SWAPGS + + .Lerror_entry_from_usermode_after_swapgs: ++ /* Put us onto the real thread stack. */ ++ popq %r12 /* save return addr in %12 */ ++ movq %rsp, %rdi /* arg0 = pt_regs pointer */ ++ call sync_regs ++ movq %rax, %rsp /* switch stack */ ++ ENCODE_FRAME_POINTER ++ pushq %r12 ++ + /* + * We need to tell lockdep that IRQs are off. We can't do this until + * we fix gsbase, and we should do it before enter_from_user_mode +diff --git a/arch/x86/entry/entry_64_compat.S b/arch/x86/entry/entry_64_compat.S +index dcc6987f9bae..95ad40eb7eff 100644 +--- a/arch/x86/entry/entry_64_compat.S ++++ b/arch/x86/entry/entry_64_compat.S +@@ -306,8 +306,11 @@ ENTRY(entry_INT80_compat) + */ + movl %eax, %eax + +- /* Construct struct pt_regs on stack (iret frame is already on stack) */ + pushq %rax /* pt_regs->orig_ax */ ++ ++ /* switch to thread stack expects orig_ax to be pushed */ ++ call switch_to_thread_stack ++ + pushq %rdi /* pt_regs->di */ + pushq %rsi /* pt_regs->si */ + pushq %rdx /* pt_regs->dx */ +diff --git a/arch/x86/include/asm/switch_to.h b/arch/x86/include/asm/switch_to.h +index 8c6bd6863db9..cbc71e73bd32 100644 +--- a/arch/x86/include/asm/switch_to.h ++++ b/arch/x86/include/asm/switch_to.h +@@ -90,10 +90,12 @@ static inline void refresh_sysenter_cs(struct thread_struct *thread) + /* This is used when switching tasks or entering/exiting vm86 mode. 
*/ + static inline void update_sp0(struct task_struct *task) + { ++ /* On x86_64, sp0 always points to the entry trampoline stack, which is constant: */ + #ifdef CONFIG_X86_32 + load_sp0(task->thread.sp0); + #else +- load_sp0(task_top_of_stack(task)); ++ if (static_cpu_has(X86_FEATURE_XENPV)) ++ load_sp0(task_top_of_stack(task)); + #endif + } + +diff --git a/arch/x86/include/asm/traps.h b/arch/x86/include/asm/traps.h +index 1fadd310ff68..31051f35cbb7 100644 +--- a/arch/x86/include/asm/traps.h ++++ b/arch/x86/include/asm/traps.h +@@ -75,7 +75,6 @@ dotraplinkage void do_segment_not_present(struct pt_regs *, long); + dotraplinkage void do_stack_segment(struct pt_regs *, long); + #ifdef CONFIG_X86_64 + dotraplinkage void do_double_fault(struct pt_regs *, long); +-asmlinkage struct pt_regs *sync_regs(struct pt_regs *); + #endif + dotraplinkage void do_general_protection(struct pt_regs *, long); + dotraplinkage void do_page_fault(struct pt_regs *, unsigned long); +diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c +index e5837bd6c672..57968880e39b 100644 +--- a/arch/x86/kernel/cpu/common.c ++++ b/arch/x86/kernel/cpu/common.c +@@ -1623,11 +1623,13 @@ void cpu_init(void) + setup_cpu_entry_area(cpu); + + /* +- * Initialize the TSS. Don't bother initializing sp0, as the initial +- * task never enters user mode. ++ * Initialize the TSS. sp0 points to the entry trampoline stack ++ * regardless of what task is running. 
+ */ + set_tss_desc(cpu, &get_cpu_entry_area(cpu)->tss.x86_tss); + load_TR_desc(); ++ load_sp0((unsigned long)&get_cpu_entry_area(cpu)->tss + ++ offsetofend(struct tss_struct, SYSENTER_stack)); + + load_mm_ldt(&init_mm); + +diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c +index f0029d17b14b..ee9ca0ad4388 100644 +--- a/arch/x86/kernel/traps.c ++++ b/arch/x86/kernel/traps.c +@@ -619,14 +619,15 @@ NOKPROBE_SYMBOL(do_int3); + + #ifdef CONFIG_X86_64 + /* +- * Help handler running on IST stack to switch off the IST stack if the +- * interrupted code was in user mode. The actual stack switch is done in +- * entry_64.S ++ * Help handler running on a per-cpu (IST or entry trampoline) stack ++ * to switch to the normal thread stack if the interrupted code was in ++ * user mode. The actual stack switch is done in entry_64.S + */ + asmlinkage __visible notrace struct pt_regs *sync_regs(struct pt_regs *eregs) + { +- struct pt_regs *regs = task_pt_regs(current); +- *regs = *eregs; ++ struct pt_regs *regs = (struct pt_regs *)this_cpu_read(cpu_current_top_of_stack) - 1; ++ if (regs != eregs) ++ *regs = *eregs; + return regs; + } + NOKPROBE_SYMBOL(sync_regs); +@@ -642,13 +643,13 @@ struct bad_iret_stack *fixup_bad_iret(struct bad_iret_stack *s) + /* + * This is called from entry_64.S early in handling a fault + * caused by a bad iret to user mode. To handle the fault +- * correctly, we want move our stack frame to task_pt_regs +- * and we want to pretend that the exception came from the +- * iret target. ++ * correctly, we want to move our stack frame to where it would ++ * be had we entered directly on the entry stack (rather than ++ * just below the IRET frame) and we want to pretend that the ++ * exception came from the IRET target. + */ + struct bad_iret_stack *new_stack = +- container_of(task_pt_regs(current), +- struct bad_iret_stack, regs); ++ (struct bad_iret_stack *)this_cpu_read(cpu_tss.x86_tss.sp0) - 1; + + /* Copy the IRET target to the new stack. 
*/ + memmove(&new_stack->regs.ip, (void *)s->regs.sp, 5*8); +-- +2.15.0 + diff --git a/queue/x86-entry-64-Use-pop-instead-of-movq-in-syscall_retu.patch b/queue/x86-entry-64-Use-pop-instead-of-movq-in-syscall_retu.patch new file mode 100644 index 0000000..0d6b58a --- /dev/null +++ b/queue/x86-entry-64-Use-pop-instead-of-movq-in-syscall_retu.patch @@ -0,0 +1,50 @@ +From 4fbb39108f972437c44e5ffa781b56635d496826 Mon Sep 17 00:00:00 2001 +From: Andy Lutomirski <luto@kernel.org> +Date: Thu, 2 Nov 2017 00:59:03 -0700 +Subject: [PATCH] x86/entry/64: Use pop instead of movq in + syscall_return_via_sysret + +commit 4fbb39108f972437c44e5ffa781b56635d496826 upstream. + +Saves 64 bytes. + +Signed-off-by: Andy Lutomirski <luto@kernel.org> +Reviewed-by: Borislav Petkov <bp@suse.de> +Cc: Borislav Petkov <bpetkov@suse.de> +Cc: Brian Gerst <brgerst@gmail.com> +Cc: Dave Hansen <dave.hansen@intel.com> +Cc: Linus Torvalds <torvalds@linux-foundation.org> +Cc: Peter Zijlstra <peterz@infradead.org> +Cc: Thomas Gleixner <tglx@linutronix.de> +Link: http://lkml.kernel.org/r/6609b7f74ab31c36604ad746e019ea8495aec76c.1509609304.git.luto@kernel.org +Signed-off-by: Ingo Molnar <mingo@kernel.org> + +diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S +index 155396443aaa..4f9b4463b3fc 100644 +--- a/arch/x86/entry/entry_64.S ++++ b/arch/x86/entry/entry_64.S +@@ -315,10 +315,18 @@ return_from_SYSCALL_64: + */ + syscall_return_via_sysret: + /* rcx and r11 are already restored (see code above) */ +- RESTORE_EXTRA_REGS +- RESTORE_C_REGS_EXCEPT_RCX_R11 +- movq RSP(%rsp), %rsp + UNWIND_HINT_EMPTY ++ POP_EXTRA_REGS ++ popq %rsi /* skip r11 */ ++ popq %r10 ++ popq %r9 ++ popq %r8 ++ popq %rax ++ popq %rsi /* skip rcx */ ++ popq %rdx ++ popq %rsi ++ popq %rdi ++ movq RSP-ORIG_RAX(%rsp), %rsp + USERGS_SYSRET64 + END(entry_SYSCALL_64) + +-- +2.15.0 + diff --git a/queue/x86-entry-64-paravirt-Use-paravirt-safe-macro-to-acc.patch b/queue/x86-entry-64-paravirt-Use-paravirt-safe-macro-to-acc.patch 
new file mode 100644 index 0000000..b051d41 --- /dev/null +++ b/queue/x86-entry-64-paravirt-Use-paravirt-safe-macro-to-acc.patch @@ -0,0 +1,115 @@ +From e17f8234538d1ff708673f287a42457c4dee720d Mon Sep 17 00:00:00 2001 +From: Boris Ostrovsky <boris.ostrovsky@oracle.com> +Date: Mon, 4 Dec 2017 15:07:07 +0100 +Subject: [PATCH] x86/entry/64/paravirt: Use paravirt-safe macro to access + eflags + +commit e17f8234538d1ff708673f287a42457c4dee720d upstream. + +Commit 1d3e53e8624a ("x86/entry/64: Refactor IRQ stacks and make them +NMI-safe") added DEBUG_ENTRY_ASSERT_IRQS_OFF macro that accesses eflags +using 'pushfq' instruction when testing for IF bit. On PV Xen guests +looking at IF flag directly will always see it set, resulting in 'ud2'. + +Introduce SAVE_FLAGS() macro that will use appropriate save_fl pv op when +running paravirt. + +Signed-off-by: Boris Ostrovsky <boris.ostrovsky@oracle.com> +Signed-off-by: Thomas Gleixner <tglx@linutronix.de> +Reviewed-by: Juergen Gross <jgross@suse.com> +Cc: Andy Lutomirski <luto@kernel.org> +Cc: Borislav Petkov <bp@alien8.de> +Cc: Borislav Petkov <bpetkov@suse.de> +Cc: Brian Gerst <brgerst@gmail.com> +Cc: Dave Hansen <dave.hansen@intel.com> +Cc: Dave Hansen <dave.hansen@linux.intel.com> +Cc: David Laight <David.Laight@aculab.com> +Cc: Denys Vlasenko <dvlasenk@redhat.com> +Cc: Eduardo Valentin <eduval@amazon.com> +Cc: Greg KH <gregkh@linuxfoundation.org> +Cc: H. 
Peter Anvin <hpa@zytor.com> +Cc: Josh Poimboeuf <jpoimboe@redhat.com> +Cc: Linus Torvalds <torvalds@linux-foundation.org> +Cc: Peter Zijlstra <peterz@infradead.org> +Cc: Rik van Riel <riel@redhat.com> +Cc: Will Deacon <will.deacon@arm.com> +Cc: aliguori@amazon.com +Cc: daniel.gruss@iaik.tugraz.at +Cc: hughd@google.com +Cc: keescook@google.com +Cc: xen-devel@lists.xenproject.org +Link: https://lkml.kernel.org/r/20171204150604.899457242@linutronix.de +Signed-off-by: Ingo Molnar <mingo@kernel.org> + +diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S +index a2b30ec69497..32306788821c 100644 +--- a/arch/x86/entry/entry_64.S ++++ b/arch/x86/entry/entry_64.S +@@ -462,12 +462,13 @@ END(irq_entries_start) + + .macro DEBUG_ENTRY_ASSERT_IRQS_OFF + #ifdef CONFIG_DEBUG_ENTRY +- pushfq +- testl $X86_EFLAGS_IF, (%rsp) ++ pushq %rax ++ SAVE_FLAGS(CLBR_RAX) ++ testl $X86_EFLAGS_IF, %eax + jz .Lokay_\@ + ud2 + .Lokay_\@: +- addq $8, %rsp ++ popq %rax + #endif + .endm + +diff --git a/arch/x86/include/asm/irqflags.h b/arch/x86/include/asm/irqflags.h +index c8ef23f2c28f..89f08955fff7 100644 +--- a/arch/x86/include/asm/irqflags.h ++++ b/arch/x86/include/asm/irqflags.h +@@ -142,6 +142,9 @@ static inline notrace unsigned long arch_local_irq_save(void) + swapgs; \ + sysretl + ++#ifdef CONFIG_DEBUG_ENTRY ++#define SAVE_FLAGS(x) pushfq; popq %rax ++#endif + #else + #define INTERRUPT_RETURN iret + #define ENABLE_INTERRUPTS_SYSEXIT sti; sysexit +diff --git a/arch/x86/include/asm/paravirt.h b/arch/x86/include/asm/paravirt.h +index 283efcaac8af..892df375b615 100644 +--- a/arch/x86/include/asm/paravirt.h ++++ b/arch/x86/include/asm/paravirt.h +@@ -927,6 +927,15 @@ extern void default_banner(void); + PARA_SITE(PARA_PATCH(pv_cpu_ops, PV_CPU_usergs_sysret64), \ + CLBR_NONE, \ + jmp PARA_INDIRECT(pv_cpu_ops+PV_CPU_usergs_sysret64)) ++ ++#ifdef CONFIG_DEBUG_ENTRY ++#define SAVE_FLAGS(clobbers) \ ++ PARA_SITE(PARA_PATCH(pv_irq_ops, PV_IRQ_save_fl), clobbers, \ ++ PV_SAVE_REGS(clobbers 
| CLBR_CALLEE_SAVE); \ ++ call PARA_INDIRECT(pv_irq_ops+PV_IRQ_save_fl); \ ++ PV_RESTORE_REGS(clobbers | CLBR_CALLEE_SAVE);) ++#endif ++ + #endif /* CONFIG_X86_32 */ + + #endif /* __ASSEMBLY__ */ +diff --git a/arch/x86/kernel/asm-offsets_64.c b/arch/x86/kernel/asm-offsets_64.c +index 630212fa9b9d..e3a5175a444b 100644 +--- a/arch/x86/kernel/asm-offsets_64.c ++++ b/arch/x86/kernel/asm-offsets_64.c +@@ -23,6 +23,9 @@ int main(void) + #ifdef CONFIG_PARAVIRT + OFFSET(PV_CPU_usergs_sysret64, pv_cpu_ops, usergs_sysret64); + OFFSET(PV_CPU_swapgs, pv_cpu_ops, swapgs); ++#ifdef CONFIG_DEBUG_ENTRY ++ OFFSET(PV_IRQ_save_fl, pv_irq_ops, save_fl); ++#endif + BLANK(); + #endif + +-- +2.15.0 + diff --git a/queue/x86-entry-Add-task_top_of_stack-to-find-the-top-of-a.patch b/queue/x86-entry-Add-task_top_of_stack-to-find-the-top-of-a.patch new file mode 100644 index 0000000..8f517f1 --- /dev/null +++ b/queue/x86-entry-Add-task_top_of_stack-to-find-the-top-of-a.patch @@ -0,0 +1,37 @@ +From 3500130b84a3cdc5b6796eba1daf178944935efe Mon Sep 17 00:00:00 2001 +From: Andy Lutomirski <luto@kernel.org> +Date: Thu, 2 Nov 2017 00:59:11 -0700 +Subject: [PATCH] x86/entry: Add task_top_of_stack() to find the top of a + task's stack + +commit 3500130b84a3cdc5b6796eba1daf178944935efe upstream. + +This will let us get rid of a few places that hardcode accesses to +thread.sp0. 
+ +Signed-off-by: Andy Lutomirski <luto@kernel.org> +Cc: Borislav Petkov <bpetkov@suse.de> +Cc: Brian Gerst <brgerst@gmail.com> +Cc: Dave Hansen <dave.hansen@intel.com> +Cc: Linus Torvalds <torvalds@linux-foundation.org> +Cc: Peter Zijlstra <peterz@infradead.org> +Cc: Thomas Gleixner <tglx@linutronix.de> +Link: http://lkml.kernel.org/r/b49b3f95a8ff858c40c9b0f5b32be0355324327d.1509609304.git.luto@kernel.org +Signed-off-by: Ingo Molnar <mingo@kernel.org> + +diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h +index 064b84722166..ad59cec14239 100644 +--- a/arch/x86/include/asm/processor.h ++++ b/arch/x86/include/asm/processor.h +@@ -795,6 +795,8 @@ static inline void spin_lock_prefetch(const void *x) + #define TOP_OF_INIT_STACK ((unsigned long)&init_stack + sizeof(init_stack) - \ + TOP_OF_KERNEL_STACK_PADDING) + ++#define task_top_of_stack(task) ((unsigned long)(task_pt_regs(task) + 1)) ++ + #ifdef CONFIG_X86_32 + /* + * User space process size: 3GB (default). +-- +2.15.0 + diff --git a/queue/x86-entry-Clean-up-the-SYSENTER_stack-code.patch b/queue/x86-entry-Clean-up-the-SYSENTER_stack-code.patch new file mode 100644 index 0000000..1f211ed --- /dev/null +++ b/queue/x86-entry-Clean-up-the-SYSENTER_stack-code.patch @@ -0,0 +1,188 @@ +From 0f9a48100fba3f189724ae88a450c2261bf91c80 Mon Sep 17 00:00:00 2001 +From: Andy Lutomirski <luto@kernel.org> +Date: Mon, 4 Dec 2017 15:07:28 +0100 +Subject: [PATCH] x86/entry: Clean up the SYSENTER_stack code + +commit 0f9a48100fba3f189724ae88a450c2261bf91c80 upstream. + +The existing code was a mess, mainly because C arrays are nasty. Turn +SYSENTER_stack into a struct, add a helper to find it, and do all the +obvious cleanups this enables. 
+ +Signed-off-by: Andy Lutomirski <luto@kernel.org> +Signed-off-by: Thomas Gleixner <tglx@linutronix.de> +Reviewed-by: Thomas Gleixner <tglx@linutronix.de> +Reviewed-by: Borislav Petkov <bpetkov@suse.de> +Cc: Boris Ostrovsky <boris.ostrovsky@oracle.com> +Cc: Borislav Petkov <bp@alien8.de> +Cc: Brian Gerst <brgerst@gmail.com> +Cc: Dave Hansen <dave.hansen@intel.com> +Cc: Dave Hansen <dave.hansen@linux.intel.com> +Cc: David Laight <David.Laight@aculab.com> +Cc: Denys Vlasenko <dvlasenk@redhat.com> +Cc: Eduardo Valentin <eduval@amazon.com> +Cc: Greg KH <gregkh@linuxfoundation.org> +Cc: H. Peter Anvin <hpa@zytor.com> +Cc: Josh Poimboeuf <jpoimboe@redhat.com> +Cc: Juergen Gross <jgross@suse.com> +Cc: Linus Torvalds <torvalds@linux-foundation.org> +Cc: Peter Zijlstra <peterz@infradead.org> +Cc: Rik van Riel <riel@redhat.com> +Cc: Will Deacon <will.deacon@arm.com> +Cc: aliguori@amazon.com +Cc: daniel.gruss@iaik.tugraz.at +Cc: hughd@google.com +Cc: keescook@google.com +Link: https://lkml.kernel.org/r/20171204150606.653244723@linutronix.de +Signed-off-by: Ingo Molnar <mingo@kernel.org> + +diff --git a/arch/x86/entry/entry_32.S b/arch/x86/entry/entry_32.S +index 0ab316c46806..3629bcbf85a2 100644 +--- a/arch/x86/entry/entry_32.S ++++ b/arch/x86/entry/entry_32.S +@@ -942,7 +942,7 @@ ENTRY(debug) + + /* Are we currently on the SYSENTER stack? */ + movl PER_CPU_VAR(cpu_entry_area), %ecx +- addl $CPU_ENTRY_AREA_tss + CPU_TSS_SYSENTER_stack + SIZEOF_SYSENTER_stack, %ecx ++ addl $CPU_ENTRY_AREA_tss + TSS_STRUCT_SYSENTER_stack + SIZEOF_SYSENTER_stack, %ecx + subl %eax, %ecx /* ecx = (end of SYSENTER_stack) - esp */ + cmpl $SIZEOF_SYSENTER_stack, %ecx + jb .Ldebug_from_sysenter_stack +@@ -986,7 +986,7 @@ ENTRY(nmi) + + /* Are we currently on the SYSENTER stack? 
*/ + movl PER_CPU_VAR(cpu_entry_area), %ecx +- addl $CPU_ENTRY_AREA_tss + CPU_TSS_SYSENTER_stack + SIZEOF_SYSENTER_stack, %ecx ++ addl $CPU_ENTRY_AREA_tss + TSS_STRUCT_SYSENTER_stack + SIZEOF_SYSENTER_stack, %ecx + subl %eax, %ecx /* ecx = (end of SYSENTER_stack) - esp */ + cmpl $SIZEOF_SYSENTER_stack, %ecx + jb .Lnmi_from_sysenter_stack +diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S +index 2582984ffb4b..575b184f377f 100644 +--- a/arch/x86/entry/entry_64.S ++++ b/arch/x86/entry/entry_64.S +@@ -154,7 +154,7 @@ END(native_usergs_sysret64) + _entry_trampoline - CPU_ENTRY_AREA_entry_trampoline(%rip) + + /* The top word of the SYSENTER stack is hot and is usable as scratch space. */ +-#define RSP_SCRATCH CPU_ENTRY_AREA_tss + CPU_TSS_SYSENTER_stack + \ ++#define RSP_SCRATCH CPU_ENTRY_AREA_tss + TSS_STRUCT_SYSENTER_stack + \ + SIZEOF_SYSENTER_stack - 8 + CPU_ENTRY_AREA + + ENTRY(entry_SYSCALL_64_trampoline) +diff --git a/arch/x86/include/asm/fixmap.h b/arch/x86/include/asm/fixmap.h +index 451da7d9a502..cc5d98bdca37 100644 +--- a/arch/x86/include/asm/fixmap.h ++++ b/arch/x86/include/asm/fixmap.h +@@ -245,5 +245,10 @@ static inline struct cpu_entry_area *get_cpu_entry_area(int cpu) + return (struct cpu_entry_area *)__fix_to_virt(__get_cpu_entry_area_page_index(cpu, 0)); + } + ++static inline struct SYSENTER_stack *cpu_SYSENTER_stack(int cpu) ++{ ++ return &get_cpu_entry_area(cpu)->tss.SYSENTER_stack; ++} ++ + #endif /* !__ASSEMBLY__ */ + #endif /* _ASM_X86_FIXMAP_H */ +diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h +index d34ac13c5866..f933869470b8 100644 +--- a/arch/x86/include/asm/processor.h ++++ b/arch/x86/include/asm/processor.h +@@ -336,12 +336,16 @@ struct x86_hw_tss { + #define IO_BITMAP_OFFSET (offsetof(struct tss_struct, io_bitmap) - offsetof(struct tss_struct, x86_tss)) + #define INVALID_IO_BITMAP_OFFSET 0x8000 + ++struct SYSENTER_stack { ++ unsigned long words[64]; ++}; ++ + struct tss_struct { + /* + * 
Space for the temporary SYSENTER stack, used for SYSENTER + * and the entry trampoline as well. + */ +- unsigned long SYSENTER_stack[64]; ++ struct SYSENTER_stack SYSENTER_stack; + + /* + * The fixed hardware portion. This must not cross a page boundary +diff --git a/arch/x86/kernel/asm-offsets.c b/arch/x86/kernel/asm-offsets.c +index 61b1af88ac07..46c0995344aa 100644 +--- a/arch/x86/kernel/asm-offsets.c ++++ b/arch/x86/kernel/asm-offsets.c +@@ -94,10 +94,8 @@ void common(void) { + BLANK(); + DEFINE(PTREGS_SIZE, sizeof(struct pt_regs)); + +- /* Offset from cpu_tss to SYSENTER_stack */ +- OFFSET(CPU_TSS_SYSENTER_stack, tss_struct, SYSENTER_stack); +- /* Size of SYSENTER_stack */ +- DEFINE(SIZEOF_SYSENTER_stack, sizeof(((struct tss_struct *)0)->SYSENTER_stack)); ++ OFFSET(TSS_STRUCT_SYSENTER_stack, tss_struct, SYSENTER_stack); ++ DEFINE(SIZEOF_SYSENTER_stack, sizeof(struct SYSENTER_stack)); + + /* Layout info for cpu_entry_area */ + OFFSET(CPU_ENTRY_AREA_tss, cpu_entry_area, tss); +diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c +index fb01a8e5e9b7..3de7480e4f32 100644 +--- a/arch/x86/kernel/cpu/common.c ++++ b/arch/x86/kernel/cpu/common.c +@@ -1314,12 +1314,7 @@ void enable_sep_cpu(void) + + tss->x86_tss.ss1 = __KERNEL_CS; + wrmsr(MSR_IA32_SYSENTER_CS, tss->x86_tss.ss1, 0); +- +- wrmsr(MSR_IA32_SYSENTER_ESP, +- (unsigned long)&get_cpu_entry_area(cpu)->tss + +- offsetofend(struct tss_struct, SYSENTER_stack), +- 0); +- ++ wrmsr(MSR_IA32_SYSENTER_ESP, (unsigned long)(cpu_SYSENTER_stack(cpu) + 1), 0); + wrmsr(MSR_IA32_SYSENTER_EIP, (unsigned long)entry_SYSENTER_32, 0); + + put_cpu(); +@@ -1436,9 +1431,7 @@ void syscall_init(void) + * AMD doesn't allow SYSENTER in long mode (either 32- or 64-bit). 
+ */ + wrmsrl_safe(MSR_IA32_SYSENTER_CS, (u64)__KERNEL_CS); +- wrmsrl_safe(MSR_IA32_SYSENTER_ESP, +- (unsigned long)&get_cpu_entry_area(cpu)->tss + +- offsetofend(struct tss_struct, SYSENTER_stack)); ++ wrmsrl_safe(MSR_IA32_SYSENTER_ESP, (unsigned long)(cpu_SYSENTER_stack(cpu) + 1)); + wrmsrl_safe(MSR_IA32_SYSENTER_EIP, (u64)entry_SYSENTER_compat); + #else + wrmsrl(MSR_CSTAR, (unsigned long)ignore_sysret); +@@ -1653,8 +1646,7 @@ void cpu_init(void) + */ + set_tss_desc(cpu, &get_cpu_entry_area(cpu)->tss.x86_tss); + load_TR_desc(); +- load_sp0((unsigned long)&get_cpu_entry_area(cpu)->tss + +- offsetofend(struct tss_struct, SYSENTER_stack)); ++ load_sp0((unsigned long)(cpu_SYSENTER_stack(cpu) + 1)); + + load_mm_ldt(&init_mm); + +diff --git a/arch/x86/kernel/dumpstack.c b/arch/x86/kernel/dumpstack.c +index ae1ce2e3f132..bbd6d986e2d0 100644 +--- a/arch/x86/kernel/dumpstack.c ++++ b/arch/x86/kernel/dumpstack.c +@@ -45,11 +45,10 @@ bool in_task_stack(unsigned long *stack, struct task_struct *task, + + bool in_sysenter_stack(unsigned long *stack, struct stack_info *info) + { +- int cpu = smp_processor_id(); +- struct tss_struct *tss = &get_cpu_entry_area(cpu)->tss; ++ struct SYSENTER_stack *ss = cpu_SYSENTER_stack(smp_processor_id()); + +- void *begin = &tss->SYSENTER_stack; +- void *end = (void *)&tss->SYSENTER_stack + sizeof(tss->SYSENTER_stack); ++ void *begin = ss; ++ void *end = ss + 1; + + if ((void *)stack < begin || (void *)stack >= end) + return false; +-- +2.15.0 + diff --git a/queue/x86-entry-Fix-assumptions-that-the-HW-TSS-is-at-the-.patch b/queue/x86-entry-Fix-assumptions-that-the-HW-TSS-is-at-the-.patch new file mode 100644 index 0000000..396a294 --- /dev/null +++ b/queue/x86-entry-Fix-assumptions-that-the-HW-TSS-is-at-the-.patch @@ -0,0 +1,211 @@ +From 7fb983b4dd569e08564134a850dfd4eb1c63d9b8 Mon Sep 17 00:00:00 2001 +From: Andy Lutomirski <luto@kernel.org> +Date: Mon, 4 Dec 2017 15:07:17 +0100 +Subject: [PATCH] x86/entry: Fix assumptions that the HW TSS is 
at the + beginning of cpu_tss + +commit 7fb983b4dd569e08564134a850dfd4eb1c63d9b8 upstream. + +A future patch will move SYSENTER_stack to the beginning of cpu_tss +to help detect overflow. Before this can happen, fix several code +paths that hardcode assumptions about the old layout. + +Signed-off-by: Andy Lutomirski <luto@kernel.org> +Signed-off-by: Thomas Gleixner <tglx@linutronix.de> +Reviewed-by: Borislav Petkov <bp@suse.de> +Reviewed-by: Dave Hansen <dave.hansen@intel.com> +Reviewed-by: Thomas Gleixner <tglx@linutronix.de> +Cc: Boris Ostrovsky <boris.ostrovsky@oracle.com> +Cc: Borislav Petkov <bp@alien8.de> +Cc: Borislav Petkov <bpetkov@suse.de> +Cc: Brian Gerst <brgerst@gmail.com> +Cc: Dave Hansen <dave.hansen@linux.intel.com> +Cc: David Laight <David.Laight@aculab.com> +Cc: Denys Vlasenko <dvlasenk@redhat.com> +Cc: Eduardo Valentin <eduval@amazon.com> +Cc: Greg KH <gregkh@linuxfoundation.org> +Cc: H. Peter Anvin <hpa@zytor.com> +Cc: Josh Poimboeuf <jpoimboe@redhat.com> +Cc: Juergen Gross <jgross@suse.com> +Cc: Linus Torvalds <torvalds@linux-foundation.org> +Cc: Peter Zijlstra <peterz@infradead.org> +Cc: Rik van Riel <riel@redhat.com> +Cc: Will Deacon <will.deacon@arm.com> +Cc: aliguori@amazon.com +Cc: daniel.gruss@iaik.tugraz.at +Cc: hughd@google.com +Cc: keescook@google.com +Link: https://lkml.kernel.org/r/20171204150605.722425540@linutronix.de +Signed-off-by: Ingo Molnar <mingo@kernel.org> + +diff --git a/arch/x86/include/asm/desc.h b/arch/x86/include/asm/desc.h +index f6f428432a68..2ace1f90d138 100644 +--- a/arch/x86/include/asm/desc.h ++++ b/arch/x86/include/asm/desc.h +@@ -178,7 +178,7 @@ static inline void set_tssldt_descriptor(void *d, unsigned long addr, + #endif + } + +-static inline void __set_tss_desc(unsigned cpu, unsigned int entry, void *addr) ++static inline void __set_tss_desc(unsigned cpu, unsigned int entry, struct x86_hw_tss *addr) + { + struct desc_struct *d = get_cpu_gdt_rw(cpu); + tss_desc tss; +diff --git 
a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h +index 789dad5da20f..555c9478f3df 100644 +--- a/arch/x86/include/asm/processor.h ++++ b/arch/x86/include/asm/processor.h +@@ -162,7 +162,7 @@ enum cpuid_regs_idx { + extern struct cpuinfo_x86 boot_cpu_data; + extern struct cpuinfo_x86 new_cpu_data; + +-extern struct tss_struct doublefault_tss; ++extern struct x86_hw_tss doublefault_tss; + extern __u32 cpu_caps_cleared[NCAPINTS]; + extern __u32 cpu_caps_set[NCAPINTS]; + +@@ -252,6 +252,11 @@ static inline void load_cr3(pgd_t *pgdir) + write_cr3(__sme_pa(pgdir)); + } + ++/* ++ * Note that while the legacy 'TSS' name comes from 'Task State Segment', ++ * on modern x86 CPUs the TSS also holds information important to 64-bit mode, ++ * unrelated to the task-switch mechanism: ++ */ + #ifdef CONFIG_X86_32 + /* This is the TSS defined by the hardware. */ + struct x86_hw_tss { +@@ -322,7 +327,7 @@ struct x86_hw_tss { + #define IO_BITMAP_BITS 65536 + #define IO_BITMAP_BYTES (IO_BITMAP_BITS/8) + #define IO_BITMAP_LONGS (IO_BITMAP_BYTES/sizeof(long)) +-#define IO_BITMAP_OFFSET offsetof(struct tss_struct, io_bitmap) ++#define IO_BITMAP_OFFSET (offsetof(struct tss_struct, io_bitmap) - offsetof(struct tss_struct, x86_tss)) + #define INVALID_IO_BITMAP_OFFSET 0x8000 + + struct tss_struct { +diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c +index 2cb394dc4153..3f285b973f50 100644 +--- a/arch/x86/kernel/cpu/common.c ++++ b/arch/x86/kernel/cpu/common.c +@@ -1557,7 +1557,7 @@ void cpu_init(void) + } + } + +- t->x86_tss.io_bitmap_base = offsetof(struct tss_struct, io_bitmap); ++ t->x86_tss.io_bitmap_base = IO_BITMAP_OFFSET; + + /* + * <= is required because the CPU will access up to +@@ -1576,7 +1576,7 @@ void cpu_init(void) + * Initialize the TSS. Don't bother initializing sp0, as the initial + * task never enters user mode. 
+ */ +- set_tss_desc(cpu, t); ++ set_tss_desc(cpu, &t->x86_tss); + load_TR_desc(); + + load_mm_ldt(&init_mm); +@@ -1634,12 +1634,12 @@ void cpu_init(void) + * Initialize the TSS. Don't bother initializing sp0, as the initial + * task never enters user mode. + */ +- set_tss_desc(cpu, t); ++ set_tss_desc(cpu, &t->x86_tss); + load_TR_desc(); + + load_mm_ldt(&init_mm); + +- t->x86_tss.io_bitmap_base = offsetof(struct tss_struct, io_bitmap); ++ t->x86_tss.io_bitmap_base = IO_BITMAP_OFFSET; + + #ifdef CONFIG_DOUBLEFAULT + /* Set up doublefault TSS pointer in the GDT */ +diff --git a/arch/x86/kernel/doublefault.c b/arch/x86/kernel/doublefault.c +index 0e662c55ae90..0b8cedb20d6d 100644 +--- a/arch/x86/kernel/doublefault.c ++++ b/arch/x86/kernel/doublefault.c +@@ -50,25 +50,23 @@ static void doublefault_fn(void) + cpu_relax(); + } + +-struct tss_struct doublefault_tss __cacheline_aligned = { +- .x86_tss = { +- .sp0 = STACK_START, +- .ss0 = __KERNEL_DS, +- .ldt = 0, +- .io_bitmap_base = INVALID_IO_BITMAP_OFFSET, +- +- .ip = (unsigned long) doublefault_fn, +- /* 0x2 bit is always set */ +- .flags = X86_EFLAGS_SF | 0x2, +- .sp = STACK_START, +- .es = __USER_DS, +- .cs = __KERNEL_CS, +- .ss = __KERNEL_DS, +- .ds = __USER_DS, +- .fs = __KERNEL_PERCPU, +- +- .__cr3 = __pa_nodebug(swapper_pg_dir), +- } ++struct x86_hw_tss doublefault_tss __cacheline_aligned = { ++ .sp0 = STACK_START, ++ .ss0 = __KERNEL_DS, ++ .ldt = 0, ++ .io_bitmap_base = INVALID_IO_BITMAP_OFFSET, ++ ++ .ip = (unsigned long) doublefault_fn, ++ /* 0x2 bit is always set */ ++ .flags = X86_EFLAGS_SF | 0x2, ++ .sp = STACK_START, ++ .es = __USER_DS, ++ .cs = __KERNEL_CS, ++ .ss = __KERNEL_DS, ++ .ds = __USER_DS, ++ .fs = __KERNEL_PERCPU, ++ ++ .__cr3 = __pa_nodebug(swapper_pg_dir), + }; + + /* dummy for do_double_fault() call */ +diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c +index a6f4f095f8f4..2abe0073b573 100644 +--- a/arch/x86/kvm/vmx.c ++++ b/arch/x86/kvm/vmx.c +@@ -2291,7 +2291,7 @@ static void 
vmx_vcpu_load(struct kvm_vcpu *vcpu, int cpu) + * processors. See 22.2.4. + */ + vmcs_writel(HOST_TR_BASE, +- (unsigned long)this_cpu_ptr(&cpu_tss)); ++ (unsigned long)this_cpu_ptr(&cpu_tss.x86_tss)); + vmcs_writel(HOST_GDTR_BASE, (unsigned long)gdt); /* 22.2.4 */ + + /* +diff --git a/arch/x86/power/cpu.c b/arch/x86/power/cpu.c +index 84fcfde53f8f..50593e138281 100644 +--- a/arch/x86/power/cpu.c ++++ b/arch/x86/power/cpu.c +@@ -165,12 +165,13 @@ static void fix_processor_context(void) + struct desc_struct *desc = get_cpu_gdt_rw(cpu); + tss_desc tss; + #endif +- set_tss_desc(cpu, t); /* +- * This just modifies memory; should not be +- * necessary. But... This is necessary, because +- * 386 hardware has concept of busy TSS or some +- * similar stupidity. +- */ ++ ++ /* ++ * This just modifies memory; should not be necessary. But... This is ++ * necessary, because 386 hardware has concept of busy TSS or some ++ * similar stupidity. ++ */ ++ set_tss_desc(cpu, &t->x86_tss); + + #ifdef CONFIG_X86_64 + memcpy(&tss, &desc[GDT_ENTRY_TSS], sizeof(tss_desc)); +-- +2.15.0 + diff --git a/queue/x86-entry-Move-SYSENTER_stack-to-the-beginning-of-st.patch b/queue/x86-entry-Move-SYSENTER_stack-to-the-beginning-of-st.patch new file mode 100644 index 0000000..ad78a49 --- /dev/null +++ b/queue/x86-entry-Move-SYSENTER_stack-to-the-beginning-of-st.patch @@ -0,0 +1,118 @@ +From 1a935bc3d4ea61556461a9e92a68ca3556232efd Mon Sep 17 00:00:00 2001 +From: Andy Lutomirski <luto@kernel.org> +Date: Mon, 4 Dec 2017 15:07:19 +0100 +Subject: [PATCH] x86/entry: Move SYSENTER_stack to the beginning of struct + tss_struct + +commit 1a935bc3d4ea61556461a9e92a68ca3556232efd upstream. + +SYSENTER_stack should have reliable overflow detection, which +means that it needs to be at the bottom of a page, not the top. +Move it to the beginning of struct tss_struct and page-align it. + +Also add an assertion to make sure that the fixed hardware TSS +doesn't cross a page boundary. 
+ +Signed-off-by: Andy Lutomirski <luto@kernel.org> +Signed-off-by: Thomas Gleixner <tglx@linutronix.de> +Reviewed-by: Thomas Gleixner <tglx@linutronix.de> +Reviewed-by: Borislav Petkov <bp@suse.de> +Cc: Boris Ostrovsky <boris.ostrovsky@oracle.com> +Cc: Borislav Petkov <bp@alien8.de> +Cc: Borislav Petkov <bpetkov@suse.de> +Cc: Brian Gerst <brgerst@gmail.com> +Cc: Dave Hansen <dave.hansen@intel.com> +Cc: Dave Hansen <dave.hansen@linux.intel.com> +Cc: David Laight <David.Laight@aculab.com> +Cc: Denys Vlasenko <dvlasenk@redhat.com> +Cc: Eduardo Valentin <eduval@amazon.com> +Cc: Greg KH <gregkh@linuxfoundation.org> +Cc: H. Peter Anvin <hpa@zytor.com> +Cc: Josh Poimboeuf <jpoimboe@redhat.com> +Cc: Juergen Gross <jgross@suse.com> +Cc: Linus Torvalds <torvalds@linux-foundation.org> +Cc: Peter Zijlstra <peterz@infradead.org> +Cc: Rik van Riel <riel@redhat.com> +Cc: Will Deacon <will.deacon@arm.com> +Cc: aliguori@amazon.com +Cc: daniel.gruss@iaik.tugraz.at +Cc: hughd@google.com +Cc: keescook@google.com +Link: https://lkml.kernel.org/r/20171204150605.881827433@linutronix.de +Signed-off-by: Ingo Molnar <mingo@kernel.org> + +diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h +index 555c9478f3df..759051251664 100644 +--- a/arch/x86/include/asm/processor.h ++++ b/arch/x86/include/asm/processor.h +@@ -332,7 +332,16 @@ struct x86_hw_tss { + + struct tss_struct { + /* +- * The hardware state: ++ * Space for the temporary SYSENTER stack, used for SYSENTER ++ * and the entry trampoline as well. ++ */ ++ unsigned long SYSENTER_stack_canary; ++ unsigned long SYSENTER_stack[64]; ++ ++ /* ++ * The fixed hardware portion. This must not cross a page boundary ++ * at risk of violating the SDM's advice and potentially triggering ++ * errata. + */ + struct x86_hw_tss x86_tss; + +@@ -343,15 +352,9 @@ struct tss_struct { + * be within the limit. 
+ */ + unsigned long io_bitmap[IO_BITMAP_LONGS + 1]; ++} __aligned(PAGE_SIZE); + +- /* +- * Space for the temporary SYSENTER stack. +- */ +- unsigned long SYSENTER_stack_canary; +- unsigned long SYSENTER_stack[64]; +-} ____cacheline_aligned; +- +-DECLARE_PER_CPU_SHARED_ALIGNED(struct tss_struct, cpu_tss); ++DECLARE_PER_CPU_PAGE_ALIGNED(struct tss_struct, cpu_tss); + + /* + * sizeof(unsigned long) coming from an extra "long" at the end +diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c +index 3f285b973f50..60b2dfd2a58b 100644 +--- a/arch/x86/kernel/cpu/common.c ++++ b/arch/x86/kernel/cpu/common.c +@@ -487,6 +487,27 @@ static inline void setup_cpu_entry_area(int cpu) + #endif + + __set_fixmap(get_cpu_entry_area_index(cpu, gdt), get_cpu_gdt_paddr(cpu), gdt_prot); ++ ++ /* ++ * The Intel SDM says (Volume 3, 7.2.1): ++ * ++ * Avoid placing a page boundary in the part of the TSS that the ++ * processor reads during a task switch (the first 104 bytes). The ++ * processor may not correctly perform address translations if a ++ * boundary occurs in this area. During a task switch, the processor ++ * reads and writes into the first 104 bytes of each TSS (using ++ * contiguous physical addresses beginning with the physical address ++ * of the first byte of the TSS). So, after TSS access begins, if ++ * part of the 104 bytes is not physically contiguous, the processor ++ * will access incorrect information without generating a page-fault ++ * exception. ++ * ++ * There are also a lot of errata involving the TSS spanning a page ++ * boundary. Assert that we're not doing that. 
++ */ ++ BUILD_BUG_ON((offsetof(struct tss_struct, x86_tss) ^ ++ offsetofend(struct tss_struct, x86_tss)) & PAGE_MASK); ++ + } + + /* Load the original GDT from the per-cpu structure */ +-- +2.15.0 + diff --git a/queue/x86-entry-Remap-the-TSS-into-the-CPU-entry-area.patch b/queue/x86-entry-Remap-the-TSS-into-the-CPU-entry-area.patch new file mode 100644 index 0000000..b8eebed --- /dev/null +++ b/queue/x86-entry-Remap-the-TSS-into-the-CPU-entry-area.patch @@ -0,0 +1,269 @@ +From 72f5e08dbba2d01aa90b592cf76c378ea233b00b Mon Sep 17 00:00:00 2001 +From: Andy Lutomirski <luto@kernel.org> +Date: Mon, 4 Dec 2017 15:07:20 +0100 +Subject: [PATCH] x86/entry: Remap the TSS into the CPU entry area + +commit 72f5e08dbba2d01aa90b592cf76c378ea233b00b upstream. + +This has a secondary purpose: it puts the entry stack into a region +with a well-controlled layout. A subsequent patch will take +advantage of this to streamline the SYSCALL entry code to be able to +find it more easily. + +Signed-off-by: Andy Lutomirski <luto@kernel.org> +Signed-off-by: Thomas Gleixner <tglx@linutronix.de> +Reviewed-by: Thomas Gleixner <tglx@linutronix.de> +Reviewed-by: Borislav Petkov <bpetkov@suse.de> +Cc: Boris Ostrovsky <boris.ostrovsky@oracle.com> +Cc: Borislav Petkov <bp@alien8.de> +Cc: Brian Gerst <brgerst@gmail.com> +Cc: Dave Hansen <dave.hansen@intel.com> +Cc: Dave Hansen <dave.hansen@linux.intel.com> +Cc: David Laight <David.Laight@aculab.com> +Cc: Denys Vlasenko <dvlasenk@redhat.com> +Cc: Eduardo Valentin <eduval@amazon.com> +Cc: Greg KH <gregkh@linuxfoundation.org> +Cc: H. 
Peter Anvin <hpa@zytor.com> +Cc: Josh Poimboeuf <jpoimboe@redhat.com> +Cc: Juergen Gross <jgross@suse.com> +Cc: Linus Torvalds <torvalds@linux-foundation.org> +Cc: Peter Zijlstra <peterz@infradead.org> +Cc: Rik van Riel <riel@redhat.com> +Cc: Will Deacon <will.deacon@arm.com> +Cc: aliguori@amazon.com +Cc: daniel.gruss@iaik.tugraz.at +Cc: hughd@google.com +Cc: keescook@google.com +Link: https://lkml.kernel.org/r/20171204150605.962042855@linutronix.de +Signed-off-by: Ingo Molnar <mingo@kernel.org> + +diff --git a/arch/x86/entry/entry_32.S b/arch/x86/entry/entry_32.S +index 4838037f97f6..0ab316c46806 100644 +--- a/arch/x86/entry/entry_32.S ++++ b/arch/x86/entry/entry_32.S +@@ -941,7 +941,8 @@ ENTRY(debug) + movl %esp, %eax # pt_regs pointer + + /* Are we currently on the SYSENTER stack? */ +- PER_CPU(cpu_tss + CPU_TSS_SYSENTER_stack + SIZEOF_SYSENTER_stack, %ecx) ++ movl PER_CPU_VAR(cpu_entry_area), %ecx ++ addl $CPU_ENTRY_AREA_tss + CPU_TSS_SYSENTER_stack + SIZEOF_SYSENTER_stack, %ecx + subl %eax, %ecx /* ecx = (end of SYSENTER_stack) - esp */ + cmpl $SIZEOF_SYSENTER_stack, %ecx + jb .Ldebug_from_sysenter_stack +@@ -984,7 +985,8 @@ ENTRY(nmi) + movl %esp, %eax # pt_regs pointer + + /* Are we currently on the SYSENTER stack? 
*/ +- PER_CPU(cpu_tss + CPU_TSS_SYSENTER_stack + SIZEOF_SYSENTER_stack, %ecx) ++ movl PER_CPU_VAR(cpu_entry_area), %ecx ++ addl $CPU_ENTRY_AREA_tss + CPU_TSS_SYSENTER_stack + SIZEOF_SYSENTER_stack, %ecx + subl %eax, %ecx /* ecx = (end of SYSENTER_stack) - esp */ + cmpl $SIZEOF_SYSENTER_stack, %ecx + jb .Lnmi_from_sysenter_stack +diff --git a/arch/x86/include/asm/fixmap.h b/arch/x86/include/asm/fixmap.h +index b61f0242f9d0..84558b611ad3 100644 +--- a/arch/x86/include/asm/fixmap.h ++++ b/arch/x86/include/asm/fixmap.h +@@ -54,6 +54,13 @@ extern unsigned long __FIXADDR_TOP; + */ + struct cpu_entry_area { + char gdt[PAGE_SIZE]; ++ ++ /* ++ * The GDT is just below cpu_tss and thus serves (on x86_64) as a ++ * a read-only guard page for the SYSENTER stack at the bottom ++ * of the TSS region. ++ */ ++ struct tss_struct tss; + }; + + #define CPU_ENTRY_AREA_PAGES (sizeof(struct cpu_entry_area) / PAGE_SIZE) +diff --git a/arch/x86/kernel/asm-offsets.c b/arch/x86/kernel/asm-offsets.c +index b275863128eb..55858b277cf6 100644 +--- a/arch/x86/kernel/asm-offsets.c ++++ b/arch/x86/kernel/asm-offsets.c +@@ -98,4 +98,7 @@ void common(void) { + OFFSET(CPU_TSS_SYSENTER_stack, tss_struct, SYSENTER_stack); + /* Size of SYSENTER_stack */ + DEFINE(SIZEOF_SYSENTER_stack, sizeof(((struct tss_struct *)0)->SYSENTER_stack)); ++ ++ /* Layout info for cpu_entry_area */ ++ OFFSET(CPU_ENTRY_AREA_tss, cpu_entry_area, tss); + } +diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c +index 60b2dfd2a58b..e5837bd6c672 100644 +--- a/arch/x86/kernel/cpu/common.c ++++ b/arch/x86/kernel/cpu/common.c +@@ -466,6 +466,22 @@ void load_percpu_segment(int cpu) + load_stack_canary_segment(); + } + ++static void set_percpu_fixmap_pages(int fixmap_index, void *ptr, ++ int pages, pgprot_t prot) ++{ ++ int i; ++ ++ for (i = 0; i < pages; i++) { ++ __set_fixmap(fixmap_index - i, ++ per_cpu_ptr_to_phys(ptr + i * PAGE_SIZE), prot); ++ } ++} ++ ++#ifdef CONFIG_X86_32 ++/* The 32-bit entry code needs to 
find cpu_entry_area. */ ++DEFINE_PER_CPU(struct cpu_entry_area *, cpu_entry_area); ++#endif ++ + /* Setup the fixmap mappings only once per-processor */ + static inline void setup_cpu_entry_area(int cpu) + { +@@ -507,7 +523,15 @@ static inline void setup_cpu_entry_area(int cpu) + */ + BUILD_BUG_ON((offsetof(struct tss_struct, x86_tss) ^ + offsetofend(struct tss_struct, x86_tss)) & PAGE_MASK); ++ BUILD_BUG_ON(sizeof(struct tss_struct) % PAGE_SIZE != 0); ++ set_percpu_fixmap_pages(get_cpu_entry_area_index(cpu, tss), ++ &per_cpu(cpu_tss, cpu), ++ sizeof(struct tss_struct) / PAGE_SIZE, ++ PAGE_KERNEL); + ++#ifdef CONFIG_X86_32 ++ this_cpu_write(cpu_entry_area, get_cpu_entry_area(cpu)); ++#endif + } + + /* Load the original GDT from the per-cpu structure */ +@@ -1257,7 +1281,8 @@ void enable_sep_cpu(void) + wrmsr(MSR_IA32_SYSENTER_CS, tss->x86_tss.ss1, 0); + + wrmsr(MSR_IA32_SYSENTER_ESP, +- (unsigned long)tss + offsetofend(struct tss_struct, SYSENTER_stack), ++ (unsigned long)&get_cpu_entry_area(cpu)->tss + ++ offsetofend(struct tss_struct, SYSENTER_stack), + 0); + + wrmsr(MSR_IA32_SYSENTER_EIP, (unsigned long)entry_SYSENTER_32, 0); +@@ -1370,6 +1395,8 @@ static DEFINE_PER_CPU_PAGE_ALIGNED(char, exception_stacks + /* May not be marked __init: used by software suspend */ + void syscall_init(void) + { ++ int cpu = smp_processor_id(); ++ + wrmsr(MSR_STAR, 0, (__USER32_CS << 16) | __KERNEL_CS); + wrmsrl(MSR_LSTAR, (unsigned long)entry_SYSCALL_64); + +@@ -1383,7 +1410,7 @@ void syscall_init(void) + */ + wrmsrl_safe(MSR_IA32_SYSENTER_CS, (u64)__KERNEL_CS); + wrmsrl_safe(MSR_IA32_SYSENTER_ESP, +- (unsigned long)this_cpu_ptr(&cpu_tss) + ++ (unsigned long)&get_cpu_entry_area(cpu)->tss + + offsetofend(struct tss_struct, SYSENTER_stack)); + wrmsrl_safe(MSR_IA32_SYSENTER_EIP, (u64)entry_SYSENTER_compat); + #else +@@ -1593,11 +1620,13 @@ void cpu_init(void) + initialize_tlbstate_and_flush(); + enter_lazy_tlb(&init_mm, me); + ++ setup_cpu_entry_area(cpu); ++ + /* + * Initialize the 
TSS. Don't bother initializing sp0, as the initial + * task never enters user mode. + */ +- set_tss_desc(cpu, &t->x86_tss); ++ set_tss_desc(cpu, &get_cpu_entry_area(cpu)->tss.x86_tss); + load_TR_desc(); + + load_mm_ldt(&init_mm); +@@ -1610,7 +1639,6 @@ void cpu_init(void) + if (is_uv_system()) + uv_cpu_init(); + +- setup_cpu_entry_area(cpu); + load_fixmap_gdt(cpu); + } + +@@ -1651,11 +1679,13 @@ void cpu_init(void) + initialize_tlbstate_and_flush(); + enter_lazy_tlb(&init_mm, curr); + ++ setup_cpu_entry_area(cpu); ++ + /* + * Initialize the TSS. Don't bother initializing sp0, as the initial + * task never enters user mode. + */ +- set_tss_desc(cpu, &t->x86_tss); ++ set_tss_desc(cpu, &get_cpu_entry_area(cpu)->tss.x86_tss); + load_TR_desc(); + + load_mm_ldt(&init_mm); +@@ -1672,7 +1702,6 @@ void cpu_init(void) + + fpu__init_cpu(); + +- setup_cpu_entry_area(cpu); + load_fixmap_gdt(cpu); + } + #endif +diff --git a/arch/x86/kernel/dumpstack.c b/arch/x86/kernel/dumpstack.c +index 64f8ed2a4827..60267850125e 100644 +--- a/arch/x86/kernel/dumpstack.c ++++ b/arch/x86/kernel/dumpstack.c +@@ -45,7 +45,8 @@ bool in_task_stack(unsigned long *stack, struct task_struct *task, + + bool in_sysenter_stack(unsigned long *stack, struct stack_info *info) + { +- struct tss_struct *tss = this_cpu_ptr(&cpu_tss); ++ int cpu = smp_processor_id(); ++ struct tss_struct *tss = &get_cpu_entry_area(cpu)->tss; + + /* Treat the canary as part of the stack for unwinding purposes. */ + void *begin = &tss->SYSENTER_stack_canary; +diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c +index 2abe0073b573..62ee4362e1c1 100644 +--- a/arch/x86/kvm/vmx.c ++++ b/arch/x86/kvm/vmx.c +@@ -2291,7 +2291,7 @@ static void vmx_vcpu_load(struct kvm_vcpu *vcpu, int cpu) + * processors. See 22.2.4. 
+ */ + vmcs_writel(HOST_TR_BASE, +- (unsigned long)this_cpu_ptr(&cpu_tss.x86_tss)); ++ (unsigned long)&get_cpu_entry_area(cpu)->tss.x86_tss); + vmcs_writel(HOST_GDTR_BASE, (unsigned long)gdt); /* 22.2.4 */ + + /* +diff --git a/arch/x86/power/cpu.c b/arch/x86/power/cpu.c +index 50593e138281..04d5157fe7f8 100644 +--- a/arch/x86/power/cpu.c ++++ b/arch/x86/power/cpu.c +@@ -160,18 +160,19 @@ static void do_fpu_end(void) + static void fix_processor_context(void) + { + int cpu = smp_processor_id(); +- struct tss_struct *t = &per_cpu(cpu_tss, cpu); + #ifdef CONFIG_X86_64 + struct desc_struct *desc = get_cpu_gdt_rw(cpu); + tss_desc tss; + #endif + + /* +- * This just modifies memory; should not be necessary. But... This is +- * necessary, because 386 hardware has concept of busy TSS or some +- * similar stupidity. ++ * We need to reload TR, which requires that we change the ++ * GDT entry to indicate "available" first. ++ * ++ * XXX: This could probably all be replaced by a call to ++ * force_reload_TR(). + */ +- set_tss_desc(cpu, &t->x86_tss); ++ set_tss_desc(cpu, &get_cpu_entry_area(cpu)->tss.x86_tss); + + #ifdef CONFIG_X86_64 + memcpy(&tss, &desc[GDT_ENTRY_TSS], sizeof(tss_desc)); +-- +2.15.0 + diff --git a/queue/x86-entry-gdt-Put-per-CPU-GDT-remaps-in-ascending-or.patch b/queue/x86-entry-gdt-Put-per-CPU-GDT-remaps-in-ascending-or.patch new file mode 100644 index 0000000..9145d6e --- /dev/null +++ b/queue/x86-entry-gdt-Put-per-CPU-GDT-remaps-in-ascending-or.patch @@ -0,0 +1,59 @@ +From aaeed3aeb39c1ba69f0a49baec8cb728121d0a91 Mon Sep 17 00:00:00 2001 +From: Andy Lutomirski <luto@kernel.org> +Date: Mon, 4 Dec 2017 15:07:14 +0100 +Subject: [PATCH] x86/entry/gdt: Put per-CPU GDT remaps in ascending order + +commit aaeed3aeb39c1ba69f0a49baec8cb728121d0a91 upstream. + +We currently have CPU 0's GDT at the top of the GDT range and +higher-numbered CPUs at lower addresses. This happens because the +fixmap is upside down (index 0 is the top of the fixmap). 
+ +Flip it so that GDTs are in ascending order by virtual address. +This will simplify a future patch that will generalize the GDT +remap to contain multiple pages. + +Signed-off-by: Andy Lutomirski <luto@kernel.org> +Signed-off-by: Thomas Gleixner <tglx@linutronix.de> +Reviewed-by: Borislav Petkov <bp@suse.de> +Reviewed-by: Thomas Gleixner <tglx@linutronix.de> +Cc: Boris Ostrovsky <boris.ostrovsky@oracle.com> +Cc: Borislav Petkov <bp@alien8.de> +Cc: Borislav Petkov <bpetkov@suse.de> +Cc: Brian Gerst <brgerst@gmail.com> +Cc: Dave Hansen <dave.hansen@intel.com> +Cc: Dave Hansen <dave.hansen@linux.intel.com> +Cc: David Laight <David.Laight@aculab.com> +Cc: Denys Vlasenko <dvlasenk@redhat.com> +Cc: Eduardo Valentin <eduval@amazon.com> +Cc: Greg KH <gregkh@linuxfoundation.org> +Cc: H. Peter Anvin <hpa@zytor.com> +Cc: Josh Poimboeuf <jpoimboe@redhat.com> +Cc: Juergen Gross <jgross@suse.com> +Cc: Linus Torvalds <torvalds@linux-foundation.org> +Cc: Peter Zijlstra <peterz@infradead.org> +Cc: Rik van Riel <riel@redhat.com> +Cc: Will Deacon <will.deacon@arm.com> +Cc: aliguori@amazon.com +Cc: daniel.gruss@iaik.tugraz.at +Cc: hughd@google.com +Cc: keescook@google.com +Link: https://lkml.kernel.org/r/20171204150605.471561421@linutronix.de +Signed-off-by: Ingo Molnar <mingo@kernel.org> + +diff --git a/arch/x86/include/asm/desc.h b/arch/x86/include/asm/desc.h +index 0a3e808b9123..01fd944fd721 100644 +--- a/arch/x86/include/asm/desc.h ++++ b/arch/x86/include/asm/desc.h +@@ -63,7 +63,7 @@ static inline struct desc_struct *get_current_gdt_rw(void) + /* Get the fixmap index for a specific processor */ + static inline unsigned int get_cpu_gdt_ro_index(int cpu) + { +- return FIX_GDT_REMAP_BEGIN + cpu; ++ return FIX_GDT_REMAP_END - cpu; + } + + /* Provide the fixmap address of the remapped GDT */ +-- +2.15.0 + diff --git a/queue/x86-espfix-64-Stop-assuming-that-pt_regs-is-on-the-e.patch b/queue/x86-espfix-64-Stop-assuming-that-pt_regs-is-on-the-e.patch new file mode 100644 index 
0000000..4ecf45e --- /dev/null +++ b/queue/x86-espfix-64-Stop-assuming-that-pt_regs-is-on-the-e.patch @@ -0,0 +1,113 @@ +From 6d9256f0a89eaff97fca6006100bcaea8d1d8bdb Mon Sep 17 00:00:00 2001 +From: Andy Lutomirski <luto@kernel.org> +Date: Mon, 4 Dec 2017 15:07:22 +0100 +Subject: [PATCH] x86/espfix/64: Stop assuming that pt_regs is on the entry + stack + +commit 6d9256f0a89eaff97fca6006100bcaea8d1d8bdb upstream. + +When we start using an entry trampoline, a #GP from userspace will +be delivered on the entry stack, not on the task stack. Fix the +espfix64 #DF fixup to set up #GP according to TSS.SP0, rather than +assuming that pt_regs + 1 == SP0. This won't change anything +without an entry stack, but it will make the code continue to work +when an entry stack is added. + +While we're at it, improve the comments to explain what's actually +going on. + +Signed-off-by: Andy Lutomirski <luto@kernel.org> +Signed-off-by: Thomas Gleixner <tglx@linutronix.de> +Reviewed-by: Thomas Gleixner <tglx@linutronix.de> +Reviewed-by: Borislav Petkov <bp@suse.de> +Cc: Boris Ostrovsky <boris.ostrovsky@oracle.com> +Cc: Borislav Petkov <bp@alien8.de> +Cc: Borislav Petkov <bpetkov@suse.de> +Cc: Brian Gerst <brgerst@gmail.com> +Cc: Dave Hansen <dave.hansen@intel.com> +Cc: Dave Hansen <dave.hansen@linux.intel.com> +Cc: David Laight <David.Laight@aculab.com> +Cc: Denys Vlasenko <dvlasenk@redhat.com> +Cc: Eduardo Valentin <eduval@amazon.com> +Cc: Greg KH <gregkh@linuxfoundation.org> +Cc: H. 
Peter Anvin <hpa@zytor.com> +Cc: Josh Poimboeuf <jpoimboe@redhat.com> +Cc: Juergen Gross <jgross@suse.com> +Cc: Linus Torvalds <torvalds@linux-foundation.org> +Cc: Peter Zijlstra <peterz@infradead.org> +Cc: Rik van Riel <riel@redhat.com> +Cc: Will Deacon <will.deacon@arm.com> +Cc: aliguori@amazon.com +Cc: daniel.gruss@iaik.tugraz.at +Cc: hughd@google.com +Cc: keescook@google.com +Link: https://lkml.kernel.org/r/20171204150606.130778051@linutronix.de +Signed-off-by: Ingo Molnar <mingo@kernel.org> + +diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c +index d3e3bbd5d3a0..f0029d17b14b 100644 +--- a/arch/x86/kernel/traps.c ++++ b/arch/x86/kernel/traps.c +@@ -348,9 +348,15 @@ dotraplinkage void do_double_fault(struct pt_regs *regs, long error_code) + + /* + * If IRET takes a non-IST fault on the espfix64 stack, then we +- * end up promoting it to a doublefault. In that case, modify +- * the stack to make it look like we just entered the #GP +- * handler from user space, similar to bad_iret. ++ * end up promoting it to a doublefault. In that case, take ++ * advantage of the fact that we're not using the normal (TSS.sp0) ++ * stack right now. We can write a fake #GP(0) frame at TSS.sp0 ++ * and then modify our own IRET frame so that, when we return, ++ * we land directly at the #GP(0) vector with the stack already ++ * set up according to its expectations. ++ * ++ * The net result is that our #GP handler will think that we ++ * entered from usermode with the bad user context. + * + * No need for ist_enter here because we don't use RCU. + */ +@@ -358,13 +364,26 @@ dotraplinkage void do_double_fault(struct pt_regs *regs, long error_code) + regs->cs == __KERNEL_CS && + regs->ip == (unsigned long)native_irq_return_iret) + { +- struct pt_regs *normal_regs = task_pt_regs(current); ++ struct pt_regs *gpregs = (struct pt_regs *)this_cpu_read(cpu_tss.x86_tss.sp0) - 1; ++ ++ /* ++ * regs->sp points to the failing IRET frame on the ++ * ESPFIX64 stack. 
Copy it to the entry stack. This fills ++ * in gpregs->ss through gpregs->ip. ++ * ++ */ ++ memmove(&gpregs->ip, (void *)regs->sp, 5*8); ++ gpregs->orig_ax = 0; /* Missing (lost) #GP error code */ + +- /* Fake a #GP(0) from userspace. */ +- memmove(&normal_regs->ip, (void *)regs->sp, 5*8); +- normal_regs->orig_ax = 0; /* Missing (lost) #GP error code */ ++ /* ++ * Adjust our frame so that we return straight to the #GP ++ * vector with the expected RSP value. This is safe because ++ * we won't enable interrupts or schedule before we invoke ++ * general_protection, so nothing will clobber the stack ++ * frame we just set up. ++ */ + regs->ip = (unsigned long)general_protection; +- regs->sp = (unsigned long)&normal_regs->orig_ax; ++ regs->sp = (unsigned long)&gpregs->orig_ax; + + return; + } +@@ -389,7 +408,7 @@ dotraplinkage void do_double_fault(struct pt_regs *regs, long error_code) + * + * Processors update CR2 whenever a page fault is detected. If a + * second page fault occurs while an earlier page fault is being +- * deliv- ered, the faulting linear address of the second fault will ++ * delivered, the faulting linear address of the second fault will + * overwrite the contents of CR2 (replacing the previous + * address). These updates to CR2 occur even if the page fault + * results in a double fault or occurs during the delivery of a +-- +2.15.0 + diff --git a/queue/x86-fpu-Make-XSAVE-check-the-base-CPUID-features-bef.patch b/queue/x86-fpu-Make-XSAVE-check-the-base-CPUID-features-bef.patch new file mode 100644 index 0000000..7465f90 --- /dev/null +++ b/queue/x86-fpu-Make-XSAVE-check-the-base-CPUID-features-bef.patch @@ -0,0 +1,79 @@ +From ccb18db2ab9d923df07e7495123fe5fb02329713 Mon Sep 17 00:00:00 2001 +From: Andi Kleen <ak@linux.intel.com> +Date: Fri, 13 Oct 2017 14:56:44 -0700 +Subject: [PATCH] x86/fpu: Make XSAVE check the base CPUID features before + enabling + +commit ccb18db2ab9d923df07e7495123fe5fb02329713 upstream. 
+ +Before enabling XSAVE, not only check the XSAVE specific CPUID bits, +but also the base CPUID features of the respective XSAVE feature. +This allows to disable individual XSAVE states using the existing +clearcpuid= option, which can be useful for performance testing +and debugging, and also in general avoids inconsistencies. + +Signed-off-by: Andi Kleen <ak@linux.intel.com> +Reviewed-by: Thomas Gleixner <tglx@linutronix.de> +Cc: Linus Torvalds <torvalds@linux-foundation.org> +Cc: Peter Zijlstra <peterz@infradead.org> +Link: http://lkml.kernel.org/r/20171013215645.23166-5-andi@firstfloor.org +Signed-off-by: Ingo Molnar <mingo@kernel.org> + +diff --git a/arch/x86/kernel/fpu/xstate.c b/arch/x86/kernel/fpu/xstate.c +index f1d5476c9022..fb581292975b 100644 +--- a/arch/x86/kernel/fpu/xstate.c ++++ b/arch/x86/kernel/fpu/xstate.c +@@ -15,6 +15,7 @@ + #include <asm/fpu/xstate.h> + + #include <asm/tlbflush.h> ++#include <asm/cpufeature.h> + + /* + * Although we spell it out in here, the Processor Trace +@@ -36,6 +37,19 @@ static const char *xfeature_names[] = + "unknown xstate feature" , + }; + ++static short xsave_cpuid_features[] __initdata = { ++ X86_FEATURE_FPU, ++ X86_FEATURE_XMM, ++ X86_FEATURE_AVX, ++ X86_FEATURE_MPX, ++ X86_FEATURE_MPX, ++ X86_FEATURE_AVX512F, ++ X86_FEATURE_AVX512F, ++ X86_FEATURE_AVX512F, ++ X86_FEATURE_INTEL_PT, ++ X86_FEATURE_PKU, ++}; ++ + /* + * Mask of xstate features supported by the CPU and the kernel: + */ +@@ -726,6 +740,7 @@ void __init fpu__init_system_xstate(void) + unsigned int eax, ebx, ecx, edx; + static int on_boot_cpu __initdata = 1; + int err; ++ int i; + + WARN_ON_FPU(!on_boot_cpu); + on_boot_cpu = 0; +@@ -759,6 +774,14 @@ void __init fpu__init_system_xstate(void) + goto out_disable; + } + ++ /* ++ * Clear XSAVE features that are disabled in the normal CPUID. 
++ */ ++ for (i = 0; i < ARRAY_SIZE(xsave_cpuid_features); i++) { ++ if (!boot_cpu_has(xsave_cpuid_features[i])) ++ xfeatures_mask &= ~BIT(i); ++ } ++ + xfeatures_mask &= fpu__get_supported_xfeatures_mask(); + + /* Enable xstate instructions to be able to continue with initialization: */ +-- +2.15.0 + diff --git a/queue/x86-fpu-Parse-clearcpuid-as-early-XSAVE-argument.patch b/queue/x86-fpu-Parse-clearcpuid-as-early-XSAVE-argument.patch new file mode 100644 index 0000000..a6cdfed --- /dev/null +++ b/queue/x86-fpu-Parse-clearcpuid-as-early-XSAVE-argument.patch @@ -0,0 +1,85 @@ +From 0c2a3913d6f50503f7c59d83a6219e39508cc898 Mon Sep 17 00:00:00 2001 +From: Andi Kleen <ak@linux.intel.com> +Date: Fri, 13 Oct 2017 14:56:43 -0700 +Subject: [PATCH] x86/fpu: Parse clearcpuid= as early XSAVE argument + +commit 0c2a3913d6f50503f7c59d83a6219e39508cc898 upstream. + +With a follow-on patch we want to make clearcpuid affect the XSAVE +configuration. But xsave is currently initialized before arguments +are parsed. Move the clearcpuid= parsing into the special +early xsave argument parsing code. + +Since clearcpuid= contains a = we need to keep the old __setup +around as a dummy, otherwise it would end up as an environment +variable in init's environment. 
+ +Signed-off-by: Andi Kleen <ak@linux.intel.com> +Reviewed-by: Thomas Gleixner <tglx@linutronix.de> +Cc: Linus Torvalds <torvalds@linux-foundation.org> +Cc: Peter Zijlstra <peterz@infradead.org> +Link: http://lkml.kernel.org/r/20171013215645.23166-4-andi@firstfloor.org +Signed-off-by: Ingo Molnar <mingo@kernel.org> + +diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c +index c9176bae7fd8..03bb004bb15e 100644 +--- a/arch/x86/kernel/cpu/common.c ++++ b/arch/x86/kernel/cpu/common.c +@@ -1301,18 +1301,16 @@ void print_cpu_info(struct cpuinfo_x86 *c) + pr_cont(")\n"); + } + +-static __init int setup_disablecpuid(char *arg) ++/* ++ * clearcpuid= was already parsed in fpu__init_parse_early_param. 
++ * But we need to keep a dummy __setup around otherwise it would ++ * show up as an environment variable for init. ++ */ ++static __init int setup_clearcpuid(char *arg) + { +- int bit; +- +- if (get_option(&arg, &bit) && bit >= 0 && bit < NCAPINTS * 32) +- setup_clear_cpu_cap(bit); +- else +- return 0; +- + return 1; + } +-__setup("clearcpuid=", setup_disablecpuid); ++__setup("clearcpuid=", setup_clearcpuid); + + #ifdef CONFIG_X86_64 + DEFINE_PER_CPU_FIRST(union irq_stack_union, +diff --git a/arch/x86/kernel/fpu/init.c b/arch/x86/kernel/fpu/init.c +index 7affb7e3d9a5..6abd83572b01 100644 +--- a/arch/x86/kernel/fpu/init.c ++++ b/arch/x86/kernel/fpu/init.c +@@ -249,6 +249,10 @@ static void __init fpu__init_system_ctx_switch(void) + */ + static void __init fpu__init_parse_early_param(void) + { ++ char arg[32]; ++ char *argptr = arg; ++ int bit; ++ + if (cmdline_find_option_bool(boot_command_line, "no387")) + setup_clear_cpu_cap(X86_FEATURE_FPU); + +@@ -266,6 +270,13 @@ static void __init fpu__init_parse_early_param(void) + + if (cmdline_find_option_bool(boot_command_line, "noxsaves")) + setup_clear_cpu_cap(X86_FEATURE_XSAVES); ++ ++ if (cmdline_find_option(boot_command_line, "clearcpuid", arg, ++ sizeof(arg)) && ++ get_option(&argptr, &bit) && ++ bit >= 0 && ++ bit < NCAPINTS * 32) ++ setup_clear_cpu_cap(bit); + } + + /* +-- +2.15.0 + diff --git a/queue/x86-fpu-Remove-the-explicit-clearing-of-XSAVE-depend.patch b/queue/x86-fpu-Remove-the-explicit-clearing-of-XSAVE-depend.patch new file mode 100644 index 0000000..08d075c --- /dev/null +++ b/queue/x86-fpu-Remove-the-explicit-clearing-of-XSAVE-depend.patch @@ -0,0 +1,59 @@ +From 73e3a7d2a7c3be29a5a22b85026f6cfa5664267f Mon Sep 17 00:00:00 2001 +From: Andi Kleen <ak@linux.intel.com> +Date: Fri, 13 Oct 2017 14:56:45 -0700 +Subject: [PATCH] x86/fpu: Remove the explicit clearing of XSAVE dependent + features + +commit 73e3a7d2a7c3be29a5a22b85026f6cfa5664267f upstream. 
+ +Clearing a CPU feature with setup_clear_cpu_cap() clears all features +which depend on it. Expressing feature dependencies in one place is +easier to maintain than keeping functions like +fpu__xstate_clear_all_cpu_caps() up to date. + +The features which depend on XSAVE have their dependency expressed in the +dependency table, so it's sufficient to clear X86_FEATURE_XSAVE. + +Remove the explicit clearing of XSAVE dependent features. + +Signed-off-by: Andi Kleen <ak@linux.intel.com> +Reviewed-by: Thomas Gleixner <tglx@linutronix.de> +Cc: Linus Torvalds <torvalds@linux-foundation.org> +Cc: Peter Zijlstra <peterz@infradead.org> +Link: http://lkml.kernel.org/r/20171013215645.23166-6-andi@firstfloor.org +Signed-off-by: Ingo Molnar <mingo@kernel.org> + +diff --git a/arch/x86/kernel/fpu/xstate.c b/arch/x86/kernel/fpu/xstate.c +index fb581292975b..87a57b7642d3 100644 +--- a/arch/x86/kernel/fpu/xstate.c ++++ b/arch/x86/kernel/fpu/xstate.c +@@ -73,26 +73,6 @@ unsigned int fpu_user_xstate_size; + void fpu__xstate_clear_all_cpu_caps(void) + { + setup_clear_cpu_cap(X86_FEATURE_XSAVE); +- setup_clear_cpu_cap(X86_FEATURE_XSAVEOPT); +- setup_clear_cpu_cap(X86_FEATURE_XSAVEC); +- setup_clear_cpu_cap(X86_FEATURE_XSAVES); +- setup_clear_cpu_cap(X86_FEATURE_AVX); +- setup_clear_cpu_cap(X86_FEATURE_AVX2); +- setup_clear_cpu_cap(X86_FEATURE_AVX512F); +- setup_clear_cpu_cap(X86_FEATURE_AVX512IFMA); +- setup_clear_cpu_cap(X86_FEATURE_AVX512PF); +- setup_clear_cpu_cap(X86_FEATURE_AVX512ER); +- setup_clear_cpu_cap(X86_FEATURE_AVX512CD); +- setup_clear_cpu_cap(X86_FEATURE_AVX512DQ); +- setup_clear_cpu_cap(X86_FEATURE_AVX512BW); +- setup_clear_cpu_cap(X86_FEATURE_AVX512VL); +- setup_clear_cpu_cap(X86_FEATURE_MPX); +- setup_clear_cpu_cap(X86_FEATURE_XGETBV1); +- setup_clear_cpu_cap(X86_FEATURE_AVX512VBMI); +- setup_clear_cpu_cap(X86_FEATURE_PKU); +- setup_clear_cpu_cap(X86_FEATURE_AVX512_4VNNIW); +- setup_clear_cpu_cap(X86_FEATURE_AVX512_4FMAPS); +- 
setup_clear_cpu_cap(X86_FEATURE_AVX512_VPOPCNTDQ); + } + + /* +-- +2.15.0 + diff --git a/queue/x86-fpu-debug-Remove-unused-x86_fpu_state-and-x86_fp.patch b/queue/x86-fpu-debug-Remove-unused-x86_fpu_state-and-x86_fp.patch new file mode 100644 index 0000000..4b0390e --- /dev/null +++ b/queue/x86-fpu-debug-Remove-unused-x86_fpu_state-and-x86_fp.patch @@ -0,0 +1,55 @@ +From 127a1bea40f7f2a36bc7207ea4d51bb6b4e936fa Mon Sep 17 00:00:00 2001 +From: "Steven Rostedt (VMware)" <rostedt@goodmis.org> +Date: Thu, 12 Oct 2017 18:06:19 -0400 +Subject: [PATCH] x86/fpu/debug: Remove unused 'x86_fpu_state' and + 'x86_fpu_deactivate_state' tracepoints + +commit 127a1bea40f7f2a36bc7207ea4d51bb6b4e936fa upstream. + +Commit: + + d1898b733619 ("x86/fpu: Add tracepoints to dump FPU state at key points") + +... added the 'x86_fpu_state' and 'x86_fpu_deactivate_state' trace points, +but never used them. Today they are still not used. As they take up +and waste memory, remove them. + +Signed-off-by: Steven Rostedt (VMware) <rostedt@goodmis.org> +Cc: Dave Hansen <dave.hansen@linux.intel.com> +Cc: Linus Torvalds <torvalds@linux-foundation.org> +Cc: Peter Zijlstra <peterz@infradead.org> +Cc: Thomas Gleixner <tglx@linutronix.de> +Link: http://lkml.kernel.org/r/20171012180619.670b68b6@gandalf.local.home +Signed-off-by: Ingo Molnar <mingo@kernel.org> + +diff --git a/arch/x86/include/asm/trace/fpu.h b/arch/x86/include/asm/trace/fpu.h +index 39f7a27bef13..6b086c39e6dc 100644 +--- a/arch/x86/include/asm/trace/fpu.h ++++ b/arch/x86/include/asm/trace/fpu.h +@@ -33,11 +33,6 @@ DECLARE_EVENT_CLASS(x86_fpu, + ) + ); + +-DEFINE_EVENT(x86_fpu, x86_fpu_state, +- TP_PROTO(struct fpu *fpu), +- TP_ARGS(fpu) +-); +- + DEFINE_EVENT(x86_fpu, x86_fpu_before_save, + TP_PROTO(struct fpu *fpu), + TP_ARGS(fpu) +@@ -73,11 +68,6 @@ DEFINE_EVENT(x86_fpu, x86_fpu_activate_state, + TP_ARGS(fpu) + ); + +-DEFINE_EVENT(x86_fpu, x86_fpu_deactivate_state, +- TP_PROTO(struct fpu *fpu), +- TP_ARGS(fpu) +-); +- + 
DEFINE_EVENT(x86_fpu, x86_fpu_init_state, + TP_PROTO(struct fpu *fpu), + TP_ARGS(fpu) +-- +2.15.0 + diff --git a/queue/x86-head-Add-unwind-hint-annotations.patch b/queue/x86-head-Add-unwind-hint-annotations.patch new file mode 100644 index 0000000..255d65d --- /dev/null +++ b/queue/x86-head-Add-unwind-hint-annotations.patch @@ -0,0 +1,122 @@ +From 2704fbb672d0d9a19414907fda7949283dcef6a1 Mon Sep 17 00:00:00 2001 +From: Josh Poimboeuf <jpoimboe@redhat.com> +Date: Mon, 18 Sep 2017 21:43:37 -0500 +Subject: [PATCH] x86/head: Add unwind hint annotations + +commit 2704fbb672d0d9a19414907fda7949283dcef6a1 upstream. + +Jiri Slaby reported an ORC issue when unwinding from an idle task. The +stack was: + + ffffffff811083c2 do_idle+0x142/0x1e0 + ffffffff8110861d cpu_startup_entry+0x5d/0x60 + ffffffff82715f58 start_kernel+0x3ff/0x407 + ffffffff827153e8 x86_64_start_kernel+0x14e/0x15d + ffffffff810001bf secondary_startup_64+0x9f/0xa0 + +The ORC unwinder errored out at secondary_startup_64 because the head +code isn't annotated yet so there wasn't a corresponding ORC entry. + +Fix that and any other head-related unwinding issues by adding unwind +hints to the head code. 
+ +Reported-by: Jiri Slaby <jslaby@suse.cz> +Tested-by: Jiri Slaby <jslaby@suse.cz> +Signed-off-by: Josh Poimboeuf <jpoimboe@redhat.com> +Cc: Andy Lutomirski <luto@kernel.org> +Cc: Boris Ostrovsky <boris.ostrovsky@oracle.com> +Cc: Juergen Gross <jgross@suse.com> +Cc: Linus Torvalds <torvalds@linux-foundation.org> +Cc: Peter Zijlstra <peterz@infradead.org> +Cc: Thomas Gleixner <tglx@linutronix.de> +Link: http://lkml.kernel.org/r/78ef000a2f68f545d6eef44ee912edceaad82ccf.1505764066.git.jpoimboe@redhat.com +Signed-off-by: Ingo Molnar <mingo@kernel.org> + +diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile +index fd0a7895b63f..d8e2b700d1db 100644 +--- a/arch/x86/kernel/Makefile ++++ b/arch/x86/kernel/Makefile +@@ -26,7 +26,6 @@ KASAN_SANITIZE_dumpstack.o := n + KASAN_SANITIZE_dumpstack_$(BITS).o := n + KASAN_SANITIZE_stacktrace.o := n + +-OBJECT_FILES_NON_STANDARD_head_$(BITS).o := y + OBJECT_FILES_NON_STANDARD_relocate_kernel_$(BITS).o := y + OBJECT_FILES_NON_STANDARD_ftrace_$(BITS).o := y + OBJECT_FILES_NON_STANDARD_test_nx.o := y +diff --git a/arch/x86/kernel/head_64.S b/arch/x86/kernel/head_64.S +index edacd579d504..42e32c2e51bb 100644 +--- a/arch/x86/kernel/head_64.S ++++ b/arch/x86/kernel/head_64.S +@@ -49,6 +49,7 @@ L3_START_KERNEL = pud_index(__START_KERNEL_map) + .code64 + .globl startup_64 + startup_64: ++ UNWIND_HINT_EMPTY + /* + * At this point the CPU runs in 64bit mode CS.L = 1 CS.D = 0, + * and someone has loaded an identity mapped page table +@@ -88,6 +89,7 @@ startup_64: + addq $(early_top_pgt - __START_KERNEL_map), %rax + jmp 1f + ENTRY(secondary_startup_64) ++ UNWIND_HINT_EMPTY + /* + * At this point the CPU runs in 64bit mode CS.L = 1 CS.D = 0, + * and someone has loaded a mapped page table. 
+@@ -132,6 +134,7 @@ ENTRY(secondary_startup_64) + movq $1f, %rax + jmp *%rax + 1: ++ UNWIND_HINT_EMPTY + + /* Check if nx is implemented */ + movl $0x80000001, %eax +@@ -246,6 +249,7 @@ END(secondary_startup_64) + */ + ENTRY(start_cpu0) + movq initial_stack(%rip), %rsp ++ UNWIND_HINT_EMPTY + jmp .Ljump_to_C_code + ENDPROC(start_cpu0) + #endif +@@ -270,13 +274,18 @@ ENTRY(early_idt_handler_array) + i = 0 + .rept NUM_EXCEPTION_VECTORS + .ifeq (EXCEPTION_ERRCODE_MASK >> i) & 1 +- pushq $0 # Dummy error code, to make stack frame uniform ++ UNWIND_HINT_IRET_REGS ++ pushq $0 # Dummy error code, to make stack frame uniform ++ .else ++ UNWIND_HINT_IRET_REGS offset=8 + .endif + pushq $i # 72(%rsp) Vector number + jmp early_idt_handler_common ++ UNWIND_HINT_IRET_REGS + i = i + 1 + .fill early_idt_handler_array + i*EARLY_IDT_HANDLER_SIZE - ., 1, 0xcc + .endr ++ UNWIND_HINT_IRET_REGS offset=16 + END(early_idt_handler_array) + + early_idt_handler_common: +@@ -305,6 +314,7 @@ early_idt_handler_common: + pushq %r13 /* pt_regs->r13 */ + pushq %r14 /* pt_regs->r14 */ + pushq %r15 /* pt_regs->r15 */ ++ UNWIND_HINT_REGS + + cmpq $14,%rsi /* Page fault? */ + jnz 10f +@@ -427,7 +437,7 @@ ENTRY(phys_base) + EXPORT_SYMBOL(phys_base) + + #include "../../x86/xen/xen-head.S" +- ++ + __PAGE_ALIGNED_BSS + NEXT_PAGE(empty_zero_page) + .skip PAGE_SIZE +-- +2.15.0 + diff --git a/queue/x86-head-Fix-head-ELF-function-annotations.patch b/queue/x86-head-Fix-head-ELF-function-annotations.patch new file mode 100644 index 0000000..5ba4aaa --- /dev/null +++ b/queue/x86-head-Fix-head-ELF-function-annotations.patch @@ -0,0 +1,55 @@ +From 015a2ea5478680fc5216d56b7ff306f2a74efaf9 Mon Sep 17 00:00:00 2001 +From: Josh Poimboeuf <jpoimboe@redhat.com> +Date: Mon, 18 Sep 2017 21:43:33 -0500 +Subject: [PATCH] x86/head: Fix head ELF function annotations + +commit 015a2ea5478680fc5216d56b7ff306f2a74efaf9 upstream. + +These functions aren't callable C-type functions, so don't annotate them +as such. 
+ +Signed-off-by: Josh Poimboeuf <jpoimboe@redhat.com> +Cc: Andy Lutomirski <luto@kernel.org> +Cc: Boris Ostrovsky <boris.ostrovsky@oracle.com> +Cc: Jiri Slaby <jslaby@suse.cz> +Cc: Juergen Gross <jgross@suse.com> +Cc: Linus Torvalds <torvalds@linux-foundation.org> +Cc: Peter Zijlstra <peterz@infradead.org> +Cc: Thomas Gleixner <tglx@linutronix.de> +Link: http://lkml.kernel.org/r/36eb182738c28514f8bf95e403d89b6413a88883.1505764066.git.jpoimboe@redhat.com +Signed-off-by: Ingo Molnar <mingo@kernel.org> + +diff --git a/arch/x86/kernel/head_64.S b/arch/x86/kernel/head_64.S +index afb0a1e22d41..edacd579d504 100644 +--- a/arch/x86/kernel/head_64.S ++++ b/arch/x86/kernel/head_64.S +@@ -234,7 +234,7 @@ ENTRY(secondary_startup_64) + pushq %rax # target address in negative space + lretq + .Lafter_lret: +-ENDPROC(secondary_startup_64) ++END(secondary_startup_64) + + #include "verify_cpu.S" + +@@ -277,7 +277,7 @@ ENTRY(early_idt_handler_array) + i = i + 1 + .fill early_idt_handler_array + i*EARLY_IDT_HANDLER_SIZE - ., 1, 0xcc + .endr +-ENDPROC(early_idt_handler_array) ++END(early_idt_handler_array) + + early_idt_handler_common: + /* +@@ -320,7 +320,7 @@ early_idt_handler_common: + 20: + decl early_recursion_flag(%rip) + jmp restore_regs_and_iret +-ENDPROC(early_idt_handler_common) ++END(early_idt_handler_common) + + __INITDATA + +-- +2.15.0 + diff --git a/queue/x86-head-Remove-confusing-comment.patch b/queue/x86-head-Remove-confusing-comment.patch new file mode 100644 index 0000000..fa9f41a --- /dev/null +++ b/queue/x86-head-Remove-confusing-comment.patch @@ -0,0 +1,43 @@ +From 17270717e80de33a884ad328fea5f407d87f6d6a Mon Sep 17 00:00:00 2001 +From: Josh Poimboeuf <jpoimboe@redhat.com> +Date: Mon, 18 Sep 2017 21:43:31 -0500 +Subject: [PATCH] x86/head: Remove confusing comment + +commit 17270717e80de33a884ad328fea5f407d87f6d6a upstream. + +This comment is actively wrong and confusing. 
It refers to the +registers' stack offsets after the pt_regs has been constructed on the +stack, but this code is *before* that. + +At this point the stack just has the standard iret frame, for which no +comment should be needed. + +Signed-off-by: Josh Poimboeuf <jpoimboe@redhat.com> +Cc: Andy Lutomirski <luto@kernel.org> +Cc: Boris Ostrovsky <boris.ostrovsky@oracle.com> +Cc: Jiri Slaby <jslaby@suse.cz> +Cc: Juergen Gross <jgross@suse.com> +Cc: Linus Torvalds <torvalds@linux-foundation.org> +Cc: Peter Zijlstra <peterz@infradead.org> +Cc: Thomas Gleixner <tglx@linutronix.de> +Link: http://lkml.kernel.org/r/a3c267b770fc56c9b86df9c11c552848248aace2.1505764066.git.jpoimboe@redhat.com +Signed-off-by: Ingo Molnar <mingo@kernel.org> + +diff --git a/arch/x86/kernel/head_64.S b/arch/x86/kernel/head_64.S +index 513cbb012ecc..3b04e4c99389 100644 +--- a/arch/x86/kernel/head_64.S ++++ b/arch/x86/kernel/head_64.S +@@ -270,10 +270,6 @@ bad_address: + + __INIT + ENTRY(early_idt_handler_array) +- # 104(%rsp) %rflags +- # 96(%rsp) %cs +- # 88(%rsp) %rip +- # 80(%rsp) error code + i = 0 + .rept NUM_EXCEPTION_VECTORS + .ifeq (EXCEPTION_ERRCODE_MASK >> i) & 1 +-- +2.15.0 + diff --git a/queue/x86-head-Remove-unused-bad_address-code.patch b/queue/x86-head-Remove-unused-bad_address-code.patch new file mode 100644 index 0000000..b7dae59 --- /dev/null +++ b/queue/x86-head-Remove-unused-bad_address-code.patch @@ -0,0 +1,37 @@ +From a8b88e84d124bc92c4808e72b8b8c0e0bb538630 Mon Sep 17 00:00:00 2001 +From: Josh Poimboeuf <jpoimboe@redhat.com> +Date: Mon, 18 Sep 2017 21:43:32 -0500 +Subject: [PATCH] x86/head: Remove unused 'bad_address' code + +commit a8b88e84d124bc92c4808e72b8b8c0e0bb538630 upstream. + +It's no longer possible for this code to be executed, so remove it. 
+ +Signed-off-by: Josh Poimboeuf <jpoimboe@redhat.com> +Cc: Andy Lutomirski <luto@kernel.org> +Cc: Boris Ostrovsky <boris.ostrovsky@oracle.com> +Cc: Jiri Slaby <jslaby@suse.cz> +Cc: Juergen Gross <jgross@suse.com> +Cc: Linus Torvalds <torvalds@linux-foundation.org> +Cc: Peter Zijlstra <peterz@infradead.org> +Cc: Thomas Gleixner <tglx@linutronix.de> +Link: http://lkml.kernel.org/r/32a46fe92d2083700599b36872b26e7dfd7b7965.1505764066.git.jpoimboe@redhat.com +Signed-off-by: Ingo Molnar <mingo@kernel.org> + +diff --git a/arch/x86/kernel/head_64.S b/arch/x86/kernel/head_64.S +index 3b04e4c99389..afb0a1e22d41 100644 +--- a/arch/x86/kernel/head_64.S ++++ b/arch/x86/kernel/head_64.S +@@ -265,9 +265,6 @@ ENDPROC(start_cpu0) + .quad init_thread_union + THREAD_SIZE - SIZEOF_PTREGS + __FINITDATA + +-bad_address: +- jmp bad_address +- + __INIT + ENTRY(early_idt_handler_array) + i = 0 +-- +2.15.0 + diff --git a/queue/x86-irq-64-Print-the-offending-IP-in-the-stack-overf.patch b/queue/x86-irq-64-Print-the-offending-IP-in-the-stack-overf.patch new file mode 100644 index 0000000..49d13c3 --- /dev/null +++ b/queue/x86-irq-64-Print-the-offending-IP-in-the-stack-overf.patch @@ -0,0 +1,59 @@ +From 4f3789e792296e21405f708cf3cb409d7c7d5683 Mon Sep 17 00:00:00 2001 +From: Andy Lutomirski <luto@kernel.org> +Date: Mon, 4 Dec 2017 15:07:11 +0100 +Subject: [PATCH] x86/irq/64: Print the offending IP in the stack overflow + warning + +commit 4f3789e792296e21405f708cf3cb409d7c7d5683 upstream. + +In case something goes wrong with unwind (not unlikely in case of +overflow), print the offending IP where we detected the overflow. 
+ +Signed-off-by: Andy Lutomirski <luto@kernel.org> +Signed-off-by: Thomas Gleixner <tglx@linutronix.de> +Reviewed-by: Borislav Petkov <bp@suse.de> +Reviewed-by: Thomas Gleixner <tglx@linutronix.de> +Cc: Boris Ostrovsky <boris.ostrovsky@oracle.com> +Cc: Borislav Petkov <bp@alien8.de> +Cc: Borislav Petkov <bpetkov@suse.de> +Cc: Brian Gerst <brgerst@gmail.com> +Cc: Dave Hansen <dave.hansen@intel.com> +Cc: Dave Hansen <dave.hansen@linux.intel.com> +Cc: David Laight <David.Laight@aculab.com> +Cc: Denys Vlasenko <dvlasenk@redhat.com> +Cc: Eduardo Valentin <eduval@amazon.com> +Cc: Greg KH <gregkh@linuxfoundation.org> +Cc: H. Peter Anvin <hpa@zytor.com> +Cc: Josh Poimboeuf <jpoimboe@redhat.com> +Cc: Juergen Gross <jgross@suse.com> +Cc: Linus Torvalds <torvalds@linux-foundation.org> +Cc: Peter Zijlstra <peterz@infradead.org> +Cc: Rik van Riel <riel@redhat.com> +Cc: Will Deacon <will.deacon@arm.com> +Cc: aliguori@amazon.com +Cc: daniel.gruss@iaik.tugraz.at +Cc: hughd@google.com +Cc: keescook@google.com +Link: https://lkml.kernel.org/r/20171204150605.231677119@linutronix.de +Signed-off-by: Ingo Molnar <mingo@kernel.org> + +diff --git a/arch/x86/kernel/irq_64.c b/arch/x86/kernel/irq_64.c +index 020efbf5786b..d86e344f5b3d 100644 +--- a/arch/x86/kernel/irq_64.c ++++ b/arch/x86/kernel/irq_64.c +@@ -57,10 +57,10 @@ static inline void stack_overflow_check(struct pt_regs *regs) + if (regs->sp >= estack_top && regs->sp <= estack_bottom) + return; + +- WARN_ONCE(1, "do_IRQ(): %s has overflown the kernel stack (cur:%Lx,sp:%lx,irq stk top-bottom:%Lx-%Lx,exception stk top-bottom:%Lx-%Lx)\n", ++ WARN_ONCE(1, "do_IRQ(): %s has overflown the kernel stack (cur:%Lx,sp:%lx,irq stk top-bottom:%Lx-%Lx,exception stk top-bottom:%Lx-%Lx,ip:%pF)\n", + current->comm, curbase, regs->sp, + irq_stack_top, irq_stack_bottom, +- estack_top, estack_bottom); ++ estack_top, estack_bottom, (void *)regs->ip); + + if (sysctl_panic_on_stackoverflow) + panic("low stack detected by irq handler - check 
messages\n"); +-- +2.15.0 + diff --git a/queue/x86-irq-Remove-an-old-outdated-comment-about-context.patch b/queue/x86-irq-Remove-an-old-outdated-comment-about-context.patch new file mode 100644 index 0000000..d4102b8 --- /dev/null +++ b/queue/x86-irq-Remove-an-old-outdated-comment-about-context.patch @@ -0,0 +1,64 @@ +From 6669a692605547892a026445e460bf233958bd7f Mon Sep 17 00:00:00 2001 +From: Andy Lutomirski <luto@kernel.org> +Date: Mon, 4 Dec 2017 15:07:10 +0100 +Subject: [PATCH] x86/irq: Remove an old outdated comment about context + tracking races + +commit 6669a692605547892a026445e460bf233958bd7f upstream. + +That race has been fixed and code cleaned up for a while now. + +Signed-off-by: Andy Lutomirski <luto@kernel.org> +Signed-off-by: Thomas Gleixner <tglx@linutronix.de> +Reviewed-by: Borislav Petkov <bp@suse.de> +Reviewed-by: Thomas Gleixner <tglx@linutronix.de> +Cc: Boris Ostrovsky <boris.ostrovsky@oracle.com> +Cc: Borislav Petkov <bp@alien8.de> +Cc: Borislav Petkov <bpetkov@suse.de> +Cc: Brian Gerst <brgerst@gmail.com> +Cc: Dave Hansen <dave.hansen@intel.com> +Cc: Dave Hansen <dave.hansen@linux.intel.com> +Cc: David Laight <David.Laight@aculab.com> +Cc: Denys Vlasenko <dvlasenk@redhat.com> +Cc: Eduardo Valentin <eduval@amazon.com> +Cc: Greg KH <gregkh@linuxfoundation.org> +Cc: H. 
Peter Anvin <hpa@zytor.com> +Cc: Josh Poimboeuf <jpoimboe@redhat.com> +Cc: Juergen Gross <jgross@suse.com> +Cc: Linus Torvalds <torvalds@linux-foundation.org> +Cc: Peter Zijlstra <peterz@infradead.org> +Cc: Rik van Riel <riel@redhat.com> +Cc: Will Deacon <will.deacon@arm.com> +Cc: aliguori@amazon.com +Cc: daniel.gruss@iaik.tugraz.at +Cc: hughd@google.com +Cc: keescook@google.com +Link: https://lkml.kernel.org/r/20171204150605.150551639@linutronix.de +Signed-off-by: Ingo Molnar <mingo@kernel.org> + +diff --git a/arch/x86/kernel/irq.c b/arch/x86/kernel/irq.c +index 52089c043160..aa9d51eea9d0 100644 +--- a/arch/x86/kernel/irq.c ++++ b/arch/x86/kernel/irq.c +@@ -219,18 +219,6 @@ __visible unsigned int __irq_entry do_IRQ(struct pt_regs *regs) + /* high bit used in ret_from_ code */ + unsigned vector = ~regs->orig_ax; + +- /* +- * NB: Unlike exception entries, IRQ entries do not reliably +- * handle context tracking in the low-level entry code. This is +- * because syscall entries execute briefly with IRQs on before +- * updating context tracking state, so we can take an IRQ from +- * kernel mode with CONTEXT_USER. The low-level entry code only +- * updates the context if we came from user mode, so we won't +- * switch to CONTEXT_KERNEL. We'll fix that once the syscall +- * code is cleaned up enough that we can cleanly defer enabling +- * IRQs. +- */ +- + entering_irq(); + + /* entering_irq() tells RCU that we're not quiescent. Check it. 
*/ +-- +2.15.0 + diff --git a/queue/x86-kasan-64-Teach-KASAN-about-the-cpu_entry_area.patch b/queue/x86-kasan-64-Teach-KASAN-about-the-cpu_entry_area.patch new file mode 100644 index 0000000..de66625 --- /dev/null +++ b/queue/x86-kasan-64-Teach-KASAN-about-the-cpu_entry_area.patch @@ -0,0 +1,80 @@ +From 21506525fb8ddb0342f2a2370812d47f6a1f3833 Mon Sep 17 00:00:00 2001 +From: Andy Lutomirski <luto@kernel.org> +Date: Mon, 4 Dec 2017 15:07:16 +0100 +Subject: [PATCH] x86/kasan/64: Teach KASAN about the cpu_entry_area + +commit 21506525fb8ddb0342f2a2370812d47f6a1f3833 upstream. + +The cpu_entry_area will contain stacks. Make sure that KASAN has +appropriate shadow mappings for them. + +Signed-off-by: Andy Lutomirski <luto@kernel.org> +Signed-off-by: Andrey Ryabinin <aryabinin@virtuozzo.com> +Signed-off-by: Thomas Gleixner <tglx@linutronix.de> +Cc: Alexander Potapenko <glider@google.com> +Cc: Boris Ostrovsky <boris.ostrovsky@oracle.com> +Cc: Borislav Petkov <bp@alien8.de> +Cc: Borislav Petkov <bpetkov@suse.de> +Cc: Brian Gerst <brgerst@gmail.com> +Cc: Dave Hansen <dave.hansen@intel.com> +Cc: Dave Hansen <dave.hansen@linux.intel.com> +Cc: David Laight <David.Laight@aculab.com> +Cc: Denys Vlasenko <dvlasenk@redhat.com> +Cc: Dmitry Vyukov <dvyukov@google.com> +Cc: Eduardo Valentin <eduval@amazon.com> +Cc: Greg KH <gregkh@linuxfoundation.org> +Cc: H. 
Peter Anvin <hpa@zytor.com> +Cc: Josh Poimboeuf <jpoimboe@redhat.com> +Cc: Juergen Gross <jgross@suse.com> +Cc: Linus Torvalds <torvalds@linux-foundation.org> +Cc: Peter Zijlstra <peterz@infradead.org> +Cc: Rik van Riel <riel@redhat.com> +Cc: Will Deacon <will.deacon@arm.com> +Cc: aliguori@amazon.com +Cc: daniel.gruss@iaik.tugraz.at +Cc: hughd@google.com +Cc: kasan-dev@googlegroups.com +Cc: keescook@google.com +Link: https://lkml.kernel.org/r/20171204150605.642806442@linutronix.de +Signed-off-by: Ingo Molnar <mingo@kernel.org> + +diff --git a/arch/x86/mm/kasan_init_64.c b/arch/x86/mm/kasan_init_64.c +index 99dfed6dfef8..9ec70d780f1f 100644 +--- a/arch/x86/mm/kasan_init_64.c ++++ b/arch/x86/mm/kasan_init_64.c +@@ -277,6 +277,7 @@ void __init kasan_early_init(void) + void __init kasan_init(void) + { + int i; ++ void *shadow_cpu_entry_begin, *shadow_cpu_entry_end; + + #ifdef CONFIG_KASAN_INLINE + register_die_notifier(&kasan_die_notifier); +@@ -329,8 +330,23 @@ void __init kasan_init(void) + (unsigned long)kasan_mem_to_shadow(_end), + early_pfn_to_nid(__pa(_stext))); + ++ shadow_cpu_entry_begin = (void *)__fix_to_virt(FIX_CPU_ENTRY_AREA_BOTTOM); ++ shadow_cpu_entry_begin = kasan_mem_to_shadow(shadow_cpu_entry_begin); ++ shadow_cpu_entry_begin = (void *)round_down((unsigned long)shadow_cpu_entry_begin, ++ PAGE_SIZE); ++ ++ shadow_cpu_entry_end = (void *)(__fix_to_virt(FIX_CPU_ENTRY_AREA_TOP) + PAGE_SIZE); ++ shadow_cpu_entry_end = kasan_mem_to_shadow(shadow_cpu_entry_end); ++ shadow_cpu_entry_end = (void *)round_up((unsigned long)shadow_cpu_entry_end, ++ PAGE_SIZE); ++ + kasan_populate_zero_shadow(kasan_mem_to_shadow((void *)MODULES_END), +- (void *)KASAN_SHADOW_END); ++ shadow_cpu_entry_begin); ++ ++ kasan_populate_shadow((unsigned long)shadow_cpu_entry_begin, ++ (unsigned long)shadow_cpu_entry_end, 0); ++ ++ kasan_populate_zero_shadow(shadow_cpu_entry_end, (void *)KASAN_SHADOW_END); + + load_cr3(init_top_pgt); + __flush_tlb_all(); +-- +2.15.0 + diff --git 
a/queue/x86-kasan-Use-the-same-shadow-offset-for-4-and-5-lev.patch b/queue/x86-kasan-Use-the-same-shadow-offset-for-4-and-5-lev.patch new file mode 100644 index 0000000..a23c5f4 --- /dev/null +++ b/queue/x86-kasan-Use-the-same-shadow-offset-for-4-and-5-lev.patch @@ -0,0 +1,230 @@ +From 12a8cc7fcf54a8575f094be1e99032ec38aa045c Mon Sep 17 00:00:00 2001 +From: Andrey Ryabinin <aryabinin@virtuozzo.com> +Date: Fri, 29 Sep 2017 17:08:18 +0300 +Subject: [PATCH] x86/kasan: Use the same shadow offset for 4- and 5-level + paging + +commit 12a8cc7fcf54a8575f094be1e99032ec38aa045c upstream. + +We are going to support boot-time switching between 4- and 5-level +paging. For KASAN it means we cannot have different KASAN_SHADOW_OFFSET +for different paging modes: the constant is passed to gcc to generate +code and cannot be changed at runtime. + +This patch changes KASAN code to use 0xdffffc0000000000 as shadow offset +for both 4- and 5-level paging. + +For 5-level paging it means that shadow memory region is not aligned to +PGD boundary anymore and we have to handle unaligned parts of the region +properly. + +In addition, we have to exclude paravirt code from KASAN instrumentation +as we now use set_pgd() before KASAN is fully ready. + +[kirill.shutemov@linux.intel.com: clenaup, changelog message] +Signed-off-by: Andrey Ryabinin <aryabinin@virtuozzo.com> +Signed-off-by: Kirill A. 
Shutemov <kirill.shutemov@linux.intel.com> +Cc: Andrew Morton <akpm@linux-foundation.org> +Cc: Andy Lutomirski <luto@amacapital.net> +Cc: Borislav Petkov <bp@suse.de> +Cc: Cyrill Gorcunov <gorcunov@openvz.org> +Cc: Linus Torvalds <torvalds@linux-foundation.org> +Cc: Peter Zijlstra <peterz@infradead.org> +Cc: Thomas Gleixner <tglx@linutronix.de> +Cc: linux-mm@kvack.org +Link: http://lkml.kernel.org/r/20170929140821.37654-4-kirill.shutemov@linux.intel.com +Signed-off-by: Ingo Molnar <mingo@kernel.org> + +diff --git a/Documentation/x86/x86_64/mm.txt b/Documentation/x86/x86_64/mm.txt +index b0798e281aa6..3448e675b462 100644 +--- a/Documentation/x86/x86_64/mm.txt ++++ b/Documentation/x86/x86_64/mm.txt +@@ -34,7 +34,7 @@ ff92000000000000 - ffd1ffffffffffff (=54 bits) vmalloc/ioremap space + ffd2000000000000 - ffd3ffffffffffff (=49 bits) hole + ffd4000000000000 - ffd5ffffffffffff (=49 bits) virtual memory map (512TB) + ... unused hole ... +-ffd8000000000000 - fff7ffffffffffff (=53 bits) kasan shadow memory (8PB) ++ffdf000000000000 - fffffc0000000000 (=53 bits) kasan shadow memory (8PB) + ... unused hole ... + ffffff0000000000 - ffffff7fffffffff (=39 bits) %esp fixup stacks + ... unused hole ... 
+diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig +index 971feac13506..32779beb56e2 100644 +--- a/arch/x86/Kconfig ++++ b/arch/x86/Kconfig +@@ -302,7 +302,6 @@ config ARCH_SUPPORTS_DEBUG_PAGEALLOC + config KASAN_SHADOW_OFFSET + hex + depends on KASAN +- default 0xdff8000000000000 if X86_5LEVEL + default 0xdffffc0000000000 + + config HAVE_INTEL_TXT +diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile +index fd0a7895b63f..a97a6b611531 100644 +--- a/arch/x86/kernel/Makefile ++++ b/arch/x86/kernel/Makefile +@@ -24,7 +24,8 @@ endif + KASAN_SANITIZE_head$(BITS).o := n + KASAN_SANITIZE_dumpstack.o := n + KASAN_SANITIZE_dumpstack_$(BITS).o := n +-KASAN_SANITIZE_stacktrace.o := n ++KASAN_SANITIZE_stacktrace.o := n ++KASAN_SANITIZE_paravirt.o := n + + OBJECT_FILES_NON_STANDARD_head_$(BITS).o := y + OBJECT_FILES_NON_STANDARD_relocate_kernel_$(BITS).o := y +diff --git a/arch/x86/mm/kasan_init_64.c b/arch/x86/mm/kasan_init_64.c +index bc84b73684b7..fe5760db7b19 100644 +--- a/arch/x86/mm/kasan_init_64.c ++++ b/arch/x86/mm/kasan_init_64.c +@@ -15,6 +15,8 @@ + + extern struct range pfn_mapped[E820_MAX_ENTRIES]; + ++static p4d_t tmp_p4d_table[PTRS_PER_P4D] __initdata __aligned(PAGE_SIZE); ++ + static int __init map_range(struct range *range) + { + unsigned long start; +@@ -30,8 +32,10 @@ static void __init clear_pgds(unsigned long start, + unsigned long end) + { + pgd_t *pgd; ++ /* See comment in kasan_init() */ ++ unsigned long pgd_end = end & PGDIR_MASK; + +- for (; start < end; start += PGDIR_SIZE) { ++ for (; start < pgd_end; start += PGDIR_SIZE) { + pgd = pgd_offset_k(start); + /* + * With folded p4d, pgd_clear() is nop, use p4d_clear() +@@ -42,29 +46,61 @@ static void __init clear_pgds(unsigned long start, + else + pgd_clear(pgd); + } ++ ++ pgd = pgd_offset_k(start); ++ for (; start < end; start += P4D_SIZE) ++ p4d_clear(p4d_offset(pgd, start)); ++} ++ ++static inline p4d_t *early_p4d_offset(pgd_t *pgd, unsigned long addr) ++{ ++ unsigned long p4d; ++ ++ if 
(!IS_ENABLED(CONFIG_X86_5LEVEL)) ++ return (p4d_t *)pgd; ++ ++ p4d = __pa_nodebug(pgd_val(*pgd)) & PTE_PFN_MASK; ++ p4d += __START_KERNEL_map - phys_base; ++ return (p4d_t *)p4d + p4d_index(addr); ++} ++ ++static void __init kasan_early_p4d_populate(pgd_t *pgd, ++ unsigned long addr, ++ unsigned long end) ++{ ++ pgd_t pgd_entry; ++ p4d_t *p4d, p4d_entry; ++ unsigned long next; ++ ++ if (pgd_none(*pgd)) { ++ pgd_entry = __pgd(_KERNPG_TABLE | __pa_nodebug(kasan_zero_p4d)); ++ set_pgd(pgd, pgd_entry); ++ } ++ ++ p4d = early_p4d_offset(pgd, addr); ++ do { ++ next = p4d_addr_end(addr, end); ++ ++ if (!p4d_none(*p4d)) ++ continue; ++ ++ p4d_entry = __p4d(_KERNPG_TABLE | __pa_nodebug(kasan_zero_pud)); ++ set_p4d(p4d, p4d_entry); ++ } while (p4d++, addr = next, addr != end && p4d_none(*p4d)); + } + + static void __init kasan_map_early_shadow(pgd_t *pgd) + { +- int i; +- unsigned long start = KASAN_SHADOW_START; ++ /* See comment in kasan_init() */ ++ unsigned long addr = KASAN_SHADOW_START & PGDIR_MASK; + unsigned long end = KASAN_SHADOW_END; ++ unsigned long next; + +- for (i = pgd_index(start); start < end; i++) { +- switch (CONFIG_PGTABLE_LEVELS) { +- case 4: +- pgd[i] = __pgd(__pa_nodebug(kasan_zero_pud) | +- _KERNPG_TABLE); +- break; +- case 5: +- pgd[i] = __pgd(__pa_nodebug(kasan_zero_p4d) | +- _KERNPG_TABLE); +- break; +- default: +- BUILD_BUG(); +- } +- start += PGDIR_SIZE; +- } ++ pgd += pgd_index(addr); ++ do { ++ next = pgd_addr_end(addr, end); ++ kasan_early_p4d_populate(pgd, addr, next); ++ } while (pgd++, addr = next, addr != end); + } + + #ifdef CONFIG_KASAN_INLINE +@@ -101,7 +137,7 @@ void __init kasan_early_init(void) + for (i = 0; i < PTRS_PER_PUD; i++) + kasan_zero_pud[i] = __pud(pud_val); + +- for (i = 0; CONFIG_PGTABLE_LEVELS >= 5 && i < PTRS_PER_P4D; i++) ++ for (i = 0; IS_ENABLED(CONFIG_X86_5LEVEL) && i < PTRS_PER_P4D; i++) + kasan_zero_p4d[i] = __p4d(p4d_val); + + kasan_map_early_shadow(early_top_pgt); +@@ -117,12 +153,35 @@ void __init 
kasan_init(void) + #endif + + memcpy(early_top_pgt, init_top_pgt, sizeof(early_top_pgt)); ++ ++ /* ++ * We use the same shadow offset for 4- and 5-level paging to ++ * facilitate boot-time switching between paging modes. ++ * As result in 5-level paging mode KASAN_SHADOW_START and ++ * KASAN_SHADOW_END are not aligned to PGD boundary. ++ * ++ * KASAN_SHADOW_START doesn't share PGD with anything else. ++ * We claim whole PGD entry to make things easier. ++ * ++ * KASAN_SHADOW_END lands in the last PGD entry and it collides with ++ * bunch of things like kernel code, modules, EFI mapping, etc. ++ * We need to take extra steps to not overwrite them. ++ */ ++ if (IS_ENABLED(CONFIG_X86_5LEVEL)) { ++ void *ptr; ++ ++ ptr = (void *)pgd_page_vaddr(*pgd_offset_k(KASAN_SHADOW_END)); ++ memcpy(tmp_p4d_table, (void *)ptr, sizeof(tmp_p4d_table)); ++ set_pgd(&early_top_pgt[pgd_index(KASAN_SHADOW_END)], ++ __pgd(__pa(tmp_p4d_table) | _KERNPG_TABLE)); ++ } ++ + load_cr3(early_top_pgt); + __flush_tlb_all(); + +- clear_pgds(KASAN_SHADOW_START, KASAN_SHADOW_END); ++ clear_pgds(KASAN_SHADOW_START & PGDIR_MASK, KASAN_SHADOW_END); + +- kasan_populate_zero_shadow((void *)KASAN_SHADOW_START, ++ kasan_populate_zero_shadow((void *)(KASAN_SHADOW_START & PGDIR_MASK), + kasan_mem_to_shadow((void *)PAGE_OFFSET)); + + for (i = 0; i < E820_MAX_ENTRIES; i++) { +-- +2.15.0 + diff --git a/queue/x86-mm-64-Rename-the-register_page_bootmem_memmap-si.patch b/queue/x86-mm-64-Rename-the-register_page_bootmem_memmap-si.patch new file mode 100644 index 0000000..0a12641 --- /dev/null +++ b/queue/x86-mm-64-Rename-the-register_page_bootmem_memmap-si.patch @@ -0,0 +1,78 @@ +From 15670bfe19905b1dcbb63137f40d718b59d84479 Mon Sep 17 00:00:00 2001 +From: Baoquan He <bhe@redhat.com> +Date: Sat, 28 Oct 2017 09:30:38 +0800 +Subject: [PATCH] x86/mm/64: Rename the register_page_bootmem_memmap() 'size' + parameter to 'nr_pages' + +commit 15670bfe19905b1dcbb63137f40d718b59d84479 upstream. 
+ +register_page_bootmem_memmap()'s 3rd 'size' parameter is named +in a somewhat misleading fashion - rename it to 'nr_pages' which +makes the units of it much clearer. + +Meanwhile rename the existing local variable 'nr_pages' to +'nr_pmd_pages', a more expressive name, to avoid conflict with +new function parameter 'nr_pages'. + +(Also clean up the unnecessary parentheses in which get_order() is called.) + +Signed-off-by: Baoquan He <bhe@redhat.com> +Acked-by: Thomas Gleixner <tglx@linutronix.de> +Cc: Linus Torvalds <torvalds@linux-foundation.org> +Cc: Peter Zijlstra <peterz@infradead.org> +Cc: akpm@linux-foundation.org +Link: http://lkml.kernel.org/r/1509154238-23250-1-git-send-email-bhe@redhat.com +Signed-off-by: Ingo Molnar <mingo@kernel.org> + +diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c +index 048fbe8fc274..adcea90a2046 100644 +--- a/arch/x86/mm/init_64.c ++++ b/arch/x86/mm/init_64.c +@@ -1426,16 +1426,16 @@ int __meminit vmemmap_populate(unsigned long start, unsigned long end, int node) + + #if defined(CONFIG_MEMORY_HOTPLUG_SPARSE) && defined(CONFIG_HAVE_BOOTMEM_INFO_NODE) + void register_page_bootmem_memmap(unsigned long section_nr, +- struct page *start_page, unsigned long size) ++ struct page *start_page, unsigned long nr_pages) + { + unsigned long addr = (unsigned long)start_page; +- unsigned long end = (unsigned long)(start_page + size); ++ unsigned long end = (unsigned long)(start_page + nr_pages); + unsigned long next; + pgd_t *pgd; + p4d_t *p4d; + pud_t *pud; + pmd_t *pmd; +- unsigned int nr_pages; ++ unsigned int nr_pmd_pages; + struct page *page; + + for (; addr < end; addr = next) { +@@ -1482,9 +1482,9 @@ void register_page_bootmem_memmap(unsigned long section_nr, + if (pmd_none(*pmd)) + continue; + +- nr_pages = 1 << (get_order(PMD_SIZE)); ++ nr_pmd_pages = 1 << get_order(PMD_SIZE); + page = pmd_page(*pmd); +- while (nr_pages--) ++ while (nr_pmd_pages--) + get_page_bootmem(section_nr, page++, + SECTION_INFO); + } +diff --git 
a/include/linux/mm.h b/include/linux/mm.h +index 065d99deb847..b2c7045e9604 100644 +--- a/include/linux/mm.h ++++ b/include/linux/mm.h +@@ -2495,7 +2495,7 @@ void vmemmap_populate_print_last(void); + void vmemmap_free(unsigned long start, unsigned long end); + #endif + void register_page_bootmem_memmap(unsigned long section_nr, struct page *map, +- unsigned long size); ++ unsigned long nr_pages); + + enum mf_flags { + MF_COUNT_INCREASED = 1 << 0, +-- +2.15.0 + diff --git a/queue/x86-mm-Define-_PAGE_TABLE-using-_KERNPG_TABLE.patch b/queue/x86-mm-Define-_PAGE_TABLE-using-_KERNPG_TABLE.patch new file mode 100644 index 0000000..414c74e --- /dev/null +++ b/queue/x86-mm-Define-_PAGE_TABLE-using-_KERNPG_TABLE.patch @@ -0,0 +1,37 @@ +From c7da092a1f243bfd1bfb4124f538e69e941882da Mon Sep 17 00:00:00 2001 +From: Borislav Petkov <bp@suse.de> +Date: Fri, 3 Nov 2017 11:20:28 +0100 +Subject: [PATCH] x86/mm: Define _PAGE_TABLE using _KERNPG_TABLE + +commit c7da092a1f243bfd1bfb4124f538e69e941882da upstream. + +... so that the difference is obvious. + +No functionality change. 
+ +Signed-off-by: Borislav Petkov <bp@suse.de> +Cc: Linus Torvalds <torvalds@linux-foundation.org> +Cc: Peter Zijlstra <peterz@infradead.org> +Cc: Thomas Gleixner <tglx@linutronix.de> +Link: http://lkml.kernel.org/r/20171103102028.20284-1-bp@alien8.de +Signed-off-by: Ingo Molnar <mingo@kernel.org> + +diff --git a/arch/x86/include/asm/pgtable_types.h b/arch/x86/include/asm/pgtable_types.h +index f1492473f10e..c33f80da8a79 100644 +--- a/arch/x86/include/asm/pgtable_types.h ++++ b/arch/x86/include/asm/pgtable_types.h +@@ -199,10 +199,9 @@ enum page_cache_mode { + + #define _PAGE_ENC (_AT(pteval_t, sme_me_mask)) + +-#define _PAGE_TABLE (_PAGE_PRESENT | _PAGE_RW | _PAGE_USER | \ +- _PAGE_ACCESSED | _PAGE_DIRTY | _PAGE_ENC) + #define _KERNPG_TABLE (_PAGE_PRESENT | _PAGE_RW | _PAGE_ACCESSED | \ + _PAGE_DIRTY | _PAGE_ENC) ++#define _PAGE_TABLE (_KERNPG_TABLE | _PAGE_USER) + + #define __PAGE_KERNEL_ENC (__PAGE_KERNEL | _PAGE_ENC) + #define __PAGE_KERNEL_ENC_WP (__PAGE_KERNEL_WP | _PAGE_ENC) +-- +2.15.0 + diff --git a/queue/x86-mm-Relocate-page-fault-error-codes-to-traps.h.patch b/queue/x86-mm-Relocate-page-fault-error-codes-to-traps.h.patch new file mode 100644 index 0000000..f6ab587 --- /dev/null +++ b/queue/x86-mm-Relocate-page-fault-error-codes-to-traps.h.patch @@ -0,0 +1,350 @@ +From 1067f030994c69ca1fba8c607437c8895dcf8509 Mon Sep 17 00:00:00 2001 +From: Ricardo Neri <ricardo.neri-calderon@linux.intel.com> +Date: Fri, 27 Oct 2017 13:25:28 -0700 +Subject: [PATCH] x86/mm: Relocate page fault error codes to traps.h + +commit 1067f030994c69ca1fba8c607437c8895dcf8509 upstream. + +Up to this point, only fault.c used the definitions of the page fault error +codes. Thus, it made sense to keep them within such file. Other portions of +code might be interested in those definitions too. 
For instance, the User- +Mode Instruction Prevention emulation code will use such definitions to +emulate a page fault when it is unable to successfully copy the results +of the emulated instructions to user space. + +While relocating the error code enumeration, the prefix X86_ is used to +make it consistent with the rest of the definitions in traps.h. Of course, +code using the enumeration had to be updated as well. No functional changes +were performed. + +Signed-off-by: Ricardo Neri <ricardo.neri-calderon@linux.intel.com> +Signed-off-by: Thomas Gleixner <tglx@linutronix.de> +Reviewed-by: Borislav Petkov <bp@suse.de> +Reviewed-by: Andy Lutomirski <luto@kernel.org> +Cc: "Michael S. Tsirkin" <mst@redhat.com> +Cc: Peter Zijlstra <peterz@infradead.org> +Cc: Dave Hansen <dave.hansen@linux.intel.com> +Cc: ricardo.neri@intel.com +Cc: Paul Gortmaker <paul.gortmaker@windriver.com> +Cc: Huang Rui <ray.huang@amd.com> +Cc: Shuah Khan <shuah@kernel.org> +Cc: Jonathan Corbet <corbet@lwn.net> +Cc: Jiri Slaby <jslaby@suse.cz> +Cc: "Ravi V. Shankar" <ravi.v.shankar@intel.com> +Cc: Chris Metcalf <cmetcalf@mellanox.com> +Cc: Brian Gerst <brgerst@gmail.com> +Cc: Josh Poimboeuf <jpoimboe@redhat.com> +Cc: Chen Yucong <slaoub@gmail.com> +Cc: Vlastimil Babka <vbabka@suse.cz> +Cc: Masami Hiramatsu <mhiramat@kernel.org> +Cc: Paolo Bonzini <pbonzini@redhat.com> +Cc: Andrew Morton <akpm@linux-foundation.org> +Cc: "Kirill A. 
Shutemov" <kirill.shutemov@linux.intel.com> +Link: https://lkml.kernel.org/r/1509135945-13762-2-git-send-email-ricardo.neri-calderon@linux.intel.com + +diff --git a/arch/x86/include/asm/traps.h b/arch/x86/include/asm/traps.h +index 5545f6459bf5..da3c3a3674a5 100644 +--- a/arch/x86/include/asm/traps.h ++++ b/arch/x86/include/asm/traps.h +@@ -144,4 +144,22 @@ enum { + X86_TRAP_IRET = 32, /* 32, IRET Exception */ + }; + ++/* ++ * Page fault error code bits: ++ * ++ * bit 0 == 0: no page found 1: protection fault ++ * bit 1 == 0: read access 1: write access ++ * bit 2 == 0: kernel-mode access 1: user-mode access ++ * bit 3 == 1: use of reserved bit detected ++ * bit 4 == 1: fault was an instruction fetch ++ * bit 5 == 1: protection keys block access ++ */ ++enum x86_pf_error_code { ++ X86_PF_PROT = 1 << 0, ++ X86_PF_WRITE = 1 << 1, ++ X86_PF_USER = 1 << 2, ++ X86_PF_RSVD = 1 << 3, ++ X86_PF_INSTR = 1 << 4, ++ X86_PF_PK = 1 << 5, ++}; + #endif /* _ASM_X86_TRAPS_H */ +diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c +index e2baeaa053a5..db71c73530bd 100644 +--- a/arch/x86/mm/fault.c ++++ b/arch/x86/mm/fault.c +@@ -28,26 +28,6 @@ + #define CREATE_TRACE_POINTS + #include <asm/trace/exceptions.h> + +-/* +- * Page fault error code bits: +- * +- * bit 0 == 0: no page found 1: protection fault +- * bit 1 == 0: read access 1: write access +- * bit 2 == 0: kernel-mode access 1: user-mode access +- * bit 3 == 1: use of reserved bit detected +- * bit 4 == 1: fault was an instruction fetch +- * bit 5 == 1: protection keys block access +- */ +-enum x86_pf_error_code { +- +- PF_PROT = 1 << 0, +- PF_WRITE = 1 << 1, +- PF_USER = 1 << 2, +- PF_RSVD = 1 << 3, +- PF_INSTR = 1 << 4, +- PF_PK = 1 << 5, +-}; +- + /* + * Returns 0 if mmiotrace is disabled, or if the fault is not + * handled by mmiotrace: +@@ -149,7 +129,7 @@ is_prefetch(struct pt_regs *regs, unsigned long error_code, unsigned long addr) + * If it was a exec (instruction fetch) fault on NX page, then + * do not ignore 
the fault: + */ +- if (error_code & PF_INSTR) ++ if (error_code & X86_PF_INSTR) + return 0; + + instr = (void *)convert_ip_to_linear(current, regs); +@@ -179,7 +159,7 @@ is_prefetch(struct pt_regs *regs, unsigned long error_code, unsigned long addr) + * siginfo so userspace can discover which protection key was set + * on the PTE. + * +- * If we get here, we know that the hardware signaled a PF_PK ++ * If we get here, we know that the hardware signaled a X86_PF_PK + * fault and that there was a VMA once we got in the fault + * handler. It does *not* guarantee that the VMA we find here + * was the one that we faulted on. +@@ -204,7 +184,7 @@ static void fill_sig_info_pkey(int si_code, siginfo_t *info, u32 *pkey) + /* + * force_sig_info_fault() is called from a number of + * contexts, some of which have a VMA and some of which +- * do not. The PF_PK handing happens after we have a ++ * do not. The X86_PF_PK handing happens after we have a + * valid VMA, so we should never reach this without a + * valid VMA. + */ +@@ -697,7 +677,7 @@ show_fault_oops(struct pt_regs *regs, unsigned long error_code, + if (!oops_may_print()) + return; + +- if (error_code & PF_INSTR) { ++ if (error_code & X86_PF_INSTR) { + unsigned int level; + pgd_t *pgd; + pte_t *pte; +@@ -779,7 +759,7 @@ no_context(struct pt_regs *regs, unsigned long error_code, + */ + if (current->thread.sig_on_uaccess_err && signal) { + tsk->thread.trap_nr = X86_TRAP_PF; +- tsk->thread.error_code = error_code | PF_USER; ++ tsk->thread.error_code = error_code | X86_PF_USER; + tsk->thread.cr2 = address; + + /* XXX: hwpoison faults will set the wrong code. 
*/ +@@ -897,7 +877,7 @@ __bad_area_nosemaphore(struct pt_regs *regs, unsigned long error_code, + struct task_struct *tsk = current; + + /* User mode accesses just cause a SIGSEGV */ +- if (error_code & PF_USER) { ++ if (error_code & X86_PF_USER) { + /* + * It's possible to have interrupts off here: + */ +@@ -918,7 +898,7 @@ __bad_area_nosemaphore(struct pt_regs *regs, unsigned long error_code, + * Instruction fetch faults in the vsyscall page might need + * emulation. + */ +- if (unlikely((error_code & PF_INSTR) && ++ if (unlikely((error_code & X86_PF_INSTR) && + ((address & ~0xfff) == VSYSCALL_ADDR))) { + if (emulate_vsyscall(regs, address)) + return; +@@ -931,7 +911,7 @@ __bad_area_nosemaphore(struct pt_regs *regs, unsigned long error_code, + * are always protection faults. + */ + if (address >= TASK_SIZE_MAX) +- error_code |= PF_PROT; ++ error_code |= X86_PF_PROT; + + if (likely(show_unhandled_signals)) + show_signal_msg(regs, error_code, address, tsk); +@@ -992,11 +972,11 @@ static inline bool bad_area_access_from_pkeys(unsigned long error_code, + + if (!boot_cpu_has(X86_FEATURE_OSPKE)) + return false; +- if (error_code & PF_PK) ++ if (error_code & X86_PF_PK) + return true; + /* this checks permission keys on the VMA: */ +- if (!arch_vma_access_permitted(vma, (error_code & PF_WRITE), +- (error_code & PF_INSTR), foreign)) ++ if (!arch_vma_access_permitted(vma, (error_code & X86_PF_WRITE), ++ (error_code & X86_PF_INSTR), foreign)) + return true; + return false; + } +@@ -1024,7 +1004,7 @@ do_sigbus(struct pt_regs *regs, unsigned long error_code, unsigned long address, + int code = BUS_ADRERR; + + /* Kernel mode? 
Handle exceptions or die: */ +- if (!(error_code & PF_USER)) { ++ if (!(error_code & X86_PF_USER)) { + no_context(regs, error_code, address, SIGBUS, BUS_ADRERR); + return; + } +@@ -1052,14 +1032,14 @@ static noinline void + mm_fault_error(struct pt_regs *regs, unsigned long error_code, + unsigned long address, u32 *pkey, unsigned int fault) + { +- if (fatal_signal_pending(current) && !(error_code & PF_USER)) { ++ if (fatal_signal_pending(current) && !(error_code & X86_PF_USER)) { + no_context(regs, error_code, address, 0, 0); + return; + } + + if (fault & VM_FAULT_OOM) { + /* Kernel mode? Handle exceptions or die: */ +- if (!(error_code & PF_USER)) { ++ if (!(error_code & X86_PF_USER)) { + no_context(regs, error_code, address, + SIGSEGV, SEGV_MAPERR); + return; +@@ -1084,16 +1064,16 @@ mm_fault_error(struct pt_regs *regs, unsigned long error_code, + + static int spurious_fault_check(unsigned long error_code, pte_t *pte) + { +- if ((error_code & PF_WRITE) && !pte_write(*pte)) ++ if ((error_code & X86_PF_WRITE) && !pte_write(*pte)) + return 0; + +- if ((error_code & PF_INSTR) && !pte_exec(*pte)) ++ if ((error_code & X86_PF_INSTR) && !pte_exec(*pte)) + return 0; + /* + * Note: We do not do lazy flushing on protection key +- * changes, so no spurious fault will ever set PF_PK. ++ * changes, so no spurious fault will ever set X86_PF_PK. + */ +- if ((error_code & PF_PK)) ++ if ((error_code & X86_PF_PK)) + return 1; + + return 1; +@@ -1139,8 +1119,8 @@ spurious_fault(unsigned long error_code, unsigned long address) + * change, so user accesses are not expected to cause spurious + * faults. 
+ */ +- if (error_code != (PF_WRITE | PF_PROT) +- && error_code != (PF_INSTR | PF_PROT)) ++ if (error_code != (X86_PF_WRITE | X86_PF_PROT) && ++ error_code != (X86_PF_INSTR | X86_PF_PROT)) + return 0; + + pgd = init_mm.pgd + pgd_index(address); +@@ -1200,19 +1180,19 @@ access_error(unsigned long error_code, struct vm_area_struct *vma) + * always an unconditional error and can never result in + * a follow-up action to resolve the fault, like a COW. + */ +- if (error_code & PF_PK) ++ if (error_code & X86_PF_PK) + return 1; + + /* + * Make sure to check the VMA so that we do not perform +- * faults just to hit a PF_PK as soon as we fill in a ++ * faults just to hit a X86_PF_PK as soon as we fill in a + * page. + */ +- if (!arch_vma_access_permitted(vma, (error_code & PF_WRITE), +- (error_code & PF_INSTR), foreign)) ++ if (!arch_vma_access_permitted(vma, (error_code & X86_PF_WRITE), ++ (error_code & X86_PF_INSTR), foreign)) + return 1; + +- if (error_code & PF_WRITE) { ++ if (error_code & X86_PF_WRITE) { + /* write, present and write, not present: */ + if (unlikely(!(vma->vm_flags & VM_WRITE))) + return 1; +@@ -1220,7 +1200,7 @@ access_error(unsigned long error_code, struct vm_area_struct *vma) + } + + /* read, present: */ +- if (unlikely(error_code & PF_PROT)) ++ if (unlikely(error_code & X86_PF_PROT)) + return 1; + + /* read, not present: */ +@@ -1243,7 +1223,7 @@ static inline bool smap_violation(int error_code, struct pt_regs *regs) + if (!static_cpu_has(X86_FEATURE_SMAP)) + return false; + +- if (error_code & PF_USER) ++ if (error_code & X86_PF_USER) + return false; + + if (!user_mode(regs) && (regs->flags & X86_EFLAGS_AC)) +@@ -1296,7 +1276,7 @@ __do_page_fault(struct pt_regs *regs, unsigned long error_code, + * protection error (error_code & 9) == 0. 
+ */ + if (unlikely(fault_in_kernel_space(address))) { +- if (!(error_code & (PF_RSVD | PF_USER | PF_PROT))) { ++ if (!(error_code & (X86_PF_RSVD | X86_PF_USER | X86_PF_PROT))) { + if (vmalloc_fault(address) >= 0) + return; + +@@ -1324,7 +1304,7 @@ __do_page_fault(struct pt_regs *regs, unsigned long error_code, + if (unlikely(kprobes_fault(regs))) + return; + +- if (unlikely(error_code & PF_RSVD)) ++ if (unlikely(error_code & X86_PF_RSVD)) + pgtable_bad(regs, error_code, address); + + if (unlikely(smap_violation(error_code, regs))) { +@@ -1350,7 +1330,7 @@ __do_page_fault(struct pt_regs *regs, unsigned long error_code, + */ + if (user_mode(regs)) { + local_irq_enable(); +- error_code |= PF_USER; ++ error_code |= X86_PF_USER; + flags |= FAULT_FLAG_USER; + } else { + if (regs->flags & X86_EFLAGS_IF) +@@ -1359,9 +1339,9 @@ __do_page_fault(struct pt_regs *regs, unsigned long error_code, + + perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS, 1, regs, address); + +- if (error_code & PF_WRITE) ++ if (error_code & X86_PF_WRITE) + flags |= FAULT_FLAG_WRITE; +- if (error_code & PF_INSTR) ++ if (error_code & X86_PF_INSTR) + flags |= FAULT_FLAG_INSTRUCTION; + + /* +@@ -1381,7 +1361,7 @@ __do_page_fault(struct pt_regs *regs, unsigned long error_code, + * space check, thus avoiding the deadlock: + */ + if (unlikely(!down_read_trylock(&mm->mmap_sem))) { +- if ((error_code & PF_USER) == 0 && ++ if (!(error_code & X86_PF_USER) && + !search_exception_tables(regs->ip)) { + bad_area_nosemaphore(regs, error_code, address, NULL); + return; +@@ -1408,7 +1388,7 @@ __do_page_fault(struct pt_regs *regs, unsigned long error_code, + bad_area(regs, error_code, address); + return; + } +- if (error_code & PF_USER) { ++ if (error_code & X86_PF_USER) { + /* + * Accessing the stack below %sp is always a bug. 
+ * The large cushion allows instructions like enter +-- +2.15.0 + diff --git a/queue/x86-mm-fixmap-Generalize-the-GDT-fixmap-mechanism-in.patch b/queue/x86-mm-fixmap-Generalize-the-GDT-fixmap-mechanism-in.patch new file mode 100644 index 0000000..c4a16cb --- /dev/null +++ b/queue/x86-mm-fixmap-Generalize-the-GDT-fixmap-mechanism-in.patch @@ -0,0 +1,192 @@ +From ef8813ab280507972bb57e4b1b502811ad4411e9 Mon Sep 17 00:00:00 2001 +From: Andy Lutomirski <luto@kernel.org> +Date: Mon, 4 Dec 2017 15:07:15 +0100 +Subject: [PATCH] x86/mm/fixmap: Generalize the GDT fixmap mechanism, introduce + struct cpu_entry_area + +commit ef8813ab280507972bb57e4b1b502811ad4411e9 upstream. + +Currently, the GDT is an ad-hoc array of pages, one per CPU, in the +fixmap. Generalize it to be an array of a new 'struct cpu_entry_area' +so that we can cleanly add new things to it. + +Signed-off-by: Andy Lutomirski <luto@kernel.org> +Signed-off-by: Thomas Gleixner <tglx@linutronix.de> +Reviewed-by: Thomas Gleixner <tglx@linutronix.de> +Reviewed-by: Borislav Petkov <bp@suse.de> +Cc: Boris Ostrovsky <boris.ostrovsky@oracle.com> +Cc: Borislav Petkov <bp@alien8.de> +Cc: Borislav Petkov <bpetkov@suse.de> +Cc: Brian Gerst <brgerst@gmail.com> +Cc: Dave Hansen <dave.hansen@intel.com> +Cc: Dave Hansen <dave.hansen@linux.intel.com> +Cc: David Laight <David.Laight@aculab.com> +Cc: Denys Vlasenko <dvlasenk@redhat.com> +Cc: Eduardo Valentin <eduval@amazon.com> +Cc: Greg KH <gregkh@linuxfoundation.org> +Cc: H. 
Peter Anvin <hpa@zytor.com> +Cc: Josh Poimboeuf <jpoimboe@redhat.com> +Cc: Juergen Gross <jgross@suse.com> +Cc: Linus Torvalds <torvalds@linux-foundation.org> +Cc: Peter Zijlstra <peterz@infradead.org> +Cc: Rik van Riel <riel@redhat.com> +Cc: Will Deacon <will.deacon@arm.com> +Cc: aliguori@amazon.com +Cc: daniel.gruss@iaik.tugraz.at +Cc: hughd@google.com +Cc: keescook@google.com +Link: https://lkml.kernel.org/r/20171204150605.563271721@linutronix.de +Signed-off-by: Ingo Molnar <mingo@kernel.org> + +diff --git a/arch/x86/include/asm/desc.h b/arch/x86/include/asm/desc.h +index 01fd944fd721..f6f428432a68 100644 +--- a/arch/x86/include/asm/desc.h ++++ b/arch/x86/include/asm/desc.h +@@ -60,17 +60,10 @@ static inline struct desc_struct *get_current_gdt_rw(void) + return this_cpu_ptr(&gdt_page)->gdt; + } + +-/* Get the fixmap index for a specific processor */ +-static inline unsigned int get_cpu_gdt_ro_index(int cpu) +-{ +- return FIX_GDT_REMAP_END - cpu; +-} +- + /* Provide the fixmap address of the remapped GDT */ + static inline struct desc_struct *get_cpu_gdt_ro(int cpu) + { +- unsigned int idx = get_cpu_gdt_ro_index(cpu); +- return (struct desc_struct *)__fix_to_virt(idx); ++ return (struct desc_struct *)&get_cpu_entry_area(cpu)->gdt; + } + + /* Provide the current read-only GDT */ +diff --git a/arch/x86/include/asm/fixmap.h b/arch/x86/include/asm/fixmap.h +index b0c505fe9a95..b61f0242f9d0 100644 +--- a/arch/x86/include/asm/fixmap.h ++++ b/arch/x86/include/asm/fixmap.h +@@ -44,6 +44,19 @@ extern unsigned long __FIXADDR_TOP; + PAGE_SIZE) + #endif + ++/* ++ * cpu_entry_area is a percpu region in the fixmap that contains things ++ * needed by the CPU and early entry/exit code. Real types aren't used ++ * for all fields here to avoid circular header dependencies. ++ * ++ * Every field is a virtual alias of some other allocated backing store. ++ * There is no direct allocation of a struct cpu_entry_area. 
++ */ ++struct cpu_entry_area { ++ char gdt[PAGE_SIZE]; ++}; ++ ++#define CPU_ENTRY_AREA_PAGES (sizeof(struct cpu_entry_area) / PAGE_SIZE) + + /* + * Here we define all the compile-time 'special' virtual +@@ -101,8 +114,8 @@ enum fixed_addresses { + FIX_LNW_VRTC, + #endif + /* Fixmap entries to remap the GDTs, one per processor. */ +- FIX_GDT_REMAP_BEGIN, +- FIX_GDT_REMAP_END = FIX_GDT_REMAP_BEGIN + NR_CPUS - 1, ++ FIX_CPU_ENTRY_AREA_TOP, ++ FIX_CPU_ENTRY_AREA_BOTTOM = FIX_CPU_ENTRY_AREA_TOP + (CPU_ENTRY_AREA_PAGES * NR_CPUS) - 1, + + #ifdef CONFIG_ACPI_APEI_GHES + /* Used for GHES mapping from assorted contexts */ +@@ -191,5 +204,25 @@ void __init *early_memremap_decrypted_wp(resource_size_t phys_addr, + void __early_set_fixmap(enum fixed_addresses idx, + phys_addr_t phys, pgprot_t flags); + ++static inline unsigned int __get_cpu_entry_area_page_index(int cpu, int page) ++{ ++ BUILD_BUG_ON(sizeof(struct cpu_entry_area) % PAGE_SIZE != 0); ++ ++ return FIX_CPU_ENTRY_AREA_BOTTOM - cpu*CPU_ENTRY_AREA_PAGES - page; ++} ++ ++#define __get_cpu_entry_area_offset_index(cpu, offset) ({ \ ++ BUILD_BUG_ON(offset % PAGE_SIZE != 0); \ ++ __get_cpu_entry_area_page_index(cpu, offset / PAGE_SIZE); \ ++ }) ++ ++#define get_cpu_entry_area_index(cpu, field) \ ++ __get_cpu_entry_area_offset_index((cpu), offsetof(struct cpu_entry_area, field)) ++ ++static inline struct cpu_entry_area *get_cpu_entry_area(int cpu) ++{ ++ return (struct cpu_entry_area *)__fix_to_virt(__get_cpu_entry_area_page_index(cpu, 0)); ++} ++ + #endif /* !__ASSEMBLY__ */ + #endif /* _ASM_X86_FIXMAP_H */ +diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c +index 22f542170198..2cb394dc4153 100644 +--- a/arch/x86/kernel/cpu/common.c ++++ b/arch/x86/kernel/cpu/common.c +@@ -466,12 +466,12 @@ void load_percpu_segment(int cpu) + load_stack_canary_segment(); + } + +-/* Setup the fixmap mapping only once per-processor */ +-static inline void setup_fixmap_gdt(int cpu) ++/* Setup the fixmap mappings only 
once per-processor */ ++static inline void setup_cpu_entry_area(int cpu) + { + #ifdef CONFIG_X86_64 + /* On 64-bit systems, we use a read-only fixmap GDT. */ +- pgprot_t prot = PAGE_KERNEL_RO; ++ pgprot_t gdt_prot = PAGE_KERNEL_RO; + #else + /* + * On native 32-bit systems, the GDT cannot be read-only because +@@ -482,11 +482,11 @@ static inline void setup_fixmap_gdt(int cpu) + * On Xen PV, the GDT must be read-only because the hypervisor requires + * it. + */ +- pgprot_t prot = boot_cpu_has(X86_FEATURE_XENPV) ? ++ pgprot_t gdt_prot = boot_cpu_has(X86_FEATURE_XENPV) ? + PAGE_KERNEL_RO : PAGE_KERNEL; + #endif + +- __set_fixmap(get_cpu_gdt_ro_index(cpu), get_cpu_gdt_paddr(cpu), prot); ++ __set_fixmap(get_cpu_entry_area_index(cpu, gdt), get_cpu_gdt_paddr(cpu), gdt_prot); + } + + /* Load the original GDT from the per-cpu structure */ +@@ -1589,7 +1589,7 @@ void cpu_init(void) + if (is_uv_system()) + uv_cpu_init(); + +- setup_fixmap_gdt(cpu); ++ setup_cpu_entry_area(cpu); + load_fixmap_gdt(cpu); + } + +@@ -1651,7 +1651,7 @@ void cpu_init(void) + + fpu__init_cpu(); + +- setup_fixmap_gdt(cpu); ++ setup_cpu_entry_area(cpu); + load_fixmap_gdt(cpu); + } + #endif +diff --git a/arch/x86/xen/mmu_pv.c b/arch/x86/xen/mmu_pv.c +index 2ccdaba31a07..c2454237fa67 100644 +--- a/arch/x86/xen/mmu_pv.c ++++ b/arch/x86/xen/mmu_pv.c +@@ -2272,7 +2272,7 @@ static void xen_set_fixmap(unsigned idx, phys_addr_t phys, pgprot_t prot) + #endif + case FIX_TEXT_POKE0: + case FIX_TEXT_POKE1: +- case FIX_GDT_REMAP_BEGIN ... FIX_GDT_REMAP_END: ++ case FIX_CPU_ENTRY_AREA_TOP ... 
FIX_CPU_ENTRY_AREA_BOTTOM: + /* All local page mappings */ + pte = pfn_pte(phys, prot); + break; +-- +2.15.0 + diff --git a/queue/x86-mm-kasan-Don-t-use-vmemmap_populate-to-initializ.patch b/queue/x86-mm-kasan-Don-t-use-vmemmap_populate-to-initializ.patch new file mode 100644 index 0000000..a454771 --- /dev/null +++ b/queue/x86-mm-kasan-Don-t-use-vmemmap_populate-to-initializ.patch @@ -0,0 +1,254 @@ +From 2aeb07365bcd489620f71390a7d2031cd4dfb83e Mon Sep 17 00:00:00 2001 +From: Andrey Ryabinin <aryabinin@virtuozzo.com> +Date: Wed, 15 Nov 2017 17:36:35 -0800 +Subject: [PATCH] x86/mm/kasan: Don't use vmemmap_populate() to initialize + shadow + +commit 2aeb07365bcd489620f71390a7d2031cd4dfb83e upstream. + +[ Note, this is a Git cherry-pick of the following commit: + + d17a1d97dc20: ("x86/mm/kasan: don't use vmemmap_populate() to initialize shadow") + + ... for easier x86 PTI code testing and back-porting. ] + +The KASAN shadow is currently mapped using vmemmap_populate() since that +provides a semi-convenient way to map pages into init_top_pgt. However, +since that no longer zeroes the mapped pages, it is not suitable for +KASAN, which requires zeroed shadow memory. + +Add kasan_populate_shadow() interface and use it instead of +vmemmap_populate(). Besides, this allows us to take advantage of +gigantic pages and use them to populate the shadow, which should save us +some memory wasted on page tables and reduce TLB pressure. 
+ +Link: http://lkml.kernel.org/r/20171103185147.2688-2-pasha.tatashin@oracle.com +Signed-off-by: Andrey Ryabinin <aryabinin@virtuozzo.com> +Signed-off-by: Pavel Tatashin <pasha.tatashin@oracle.com> +Cc: Andy Lutomirski <luto@kernel.org> +Cc: Steven Sistare <steven.sistare@oracle.com> +Cc: Daniel Jordan <daniel.m.jordan@oracle.com> +Cc: Bob Picco <bob.picco@oracle.com> +Cc: Michal Hocko <mhocko@suse.com> +Cc: Alexander Potapenko <glider@google.com> +Cc: Ard Biesheuvel <ard.biesheuvel@linaro.org> +Cc: Catalin Marinas <catalin.marinas@arm.com> +Cc: Christian Borntraeger <borntraeger@de.ibm.com> +Cc: David S. Miller <davem@davemloft.net> +Cc: Dmitry Vyukov <dvyukov@google.com> +Cc: Heiko Carstens <heiko.carstens@de.ibm.com> +Cc: "H. Peter Anvin" <hpa@zytor.com> +Cc: Ingo Molnar <mingo@redhat.com> +Cc: Mark Rutland <mark.rutland@arm.com> +Cc: Matthew Wilcox <willy@infradead.org> +Cc: Mel Gorman <mgorman@techsingularity.net> +Cc: Michal Hocko <mhocko@kernel.org> +Cc: Sam Ravnborg <sam@ravnborg.org> +Cc: Thomas Gleixner <tglx@linutronix.de> +Cc: Will Deacon <will.deacon@arm.com> +Signed-off-by: Andrew Morton <akpm@linux-foundation.org> +Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org> +Signed-off-by: Ingo Molnar <mingo@kernel.org> + +diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig +index 4ae940a0ed3b..665eba1b6103 100644 +--- a/arch/x86/Kconfig ++++ b/arch/x86/Kconfig +@@ -108,7 +108,7 @@ config X86 + select HAVE_ARCH_AUDITSYSCALL + select HAVE_ARCH_HUGE_VMAP if X86_64 || X86_PAE + select HAVE_ARCH_JUMP_LABEL +- select HAVE_ARCH_KASAN if X86_64 && SPARSEMEM_VMEMMAP ++ select HAVE_ARCH_KASAN if X86_64 + select HAVE_ARCH_KGDB + select HAVE_ARCH_KMEMCHECK + select HAVE_ARCH_MMAP_RND_BITS if MMU +diff --git a/arch/x86/mm/kasan_init_64.c b/arch/x86/mm/kasan_init_64.c +index 2b60dc6e64b1..99dfed6dfef8 100644 +--- a/arch/x86/mm/kasan_init_64.c ++++ b/arch/x86/mm/kasan_init_64.c +@@ -4,12 +4,14 @@ + #include <linux/bootmem.h> + #include <linux/kasan.h> + #include 
<linux/kdebug.h> ++#include <linux/memblock.h> + #include <linux/mm.h> + #include <linux/sched.h> + #include <linux/sched/task.h> + #include <linux/vmalloc.h> + + #include <asm/e820/types.h> ++#include <asm/pgalloc.h> + #include <asm/tlbflush.h> + #include <asm/sections.h> + #include <asm/pgtable.h> +@@ -18,7 +20,134 @@ extern struct range pfn_mapped[E820_MAX_ENTRIES]; + + static p4d_t tmp_p4d_table[PTRS_PER_P4D] __initdata __aligned(PAGE_SIZE); + +-static int __init map_range(struct range *range) ++static __init void *early_alloc(size_t size, int nid) ++{ ++ return memblock_virt_alloc_try_nid_nopanic(size, size, ++ __pa(MAX_DMA_ADDRESS), BOOTMEM_ALLOC_ACCESSIBLE, nid); ++} ++ ++static void __init kasan_populate_pmd(pmd_t *pmd, unsigned long addr, ++ unsigned long end, int nid) ++{ ++ pte_t *pte; ++ ++ if (pmd_none(*pmd)) { ++ void *p; ++ ++ if (boot_cpu_has(X86_FEATURE_PSE) && ++ ((end - addr) == PMD_SIZE) && ++ IS_ALIGNED(addr, PMD_SIZE)) { ++ p = early_alloc(PMD_SIZE, nid); ++ if (p && pmd_set_huge(pmd, __pa(p), PAGE_KERNEL)) ++ return; ++ else if (p) ++ memblock_free(__pa(p), PMD_SIZE); ++ } ++ ++ p = early_alloc(PAGE_SIZE, nid); ++ pmd_populate_kernel(&init_mm, pmd, p); ++ } ++ ++ pte = pte_offset_kernel(pmd, addr); ++ do { ++ pte_t entry; ++ void *p; ++ ++ if (!pte_none(*pte)) ++ continue; ++ ++ p = early_alloc(PAGE_SIZE, nid); ++ entry = pfn_pte(PFN_DOWN(__pa(p)), PAGE_KERNEL); ++ set_pte_at(&init_mm, addr, pte, entry); ++ } while (pte++, addr += PAGE_SIZE, addr != end); ++} ++ ++static void __init kasan_populate_pud(pud_t *pud, unsigned long addr, ++ unsigned long end, int nid) ++{ ++ pmd_t *pmd; ++ unsigned long next; ++ ++ if (pud_none(*pud)) { ++ void *p; ++ ++ if (boot_cpu_has(X86_FEATURE_GBPAGES) && ++ ((end - addr) == PUD_SIZE) && ++ IS_ALIGNED(addr, PUD_SIZE)) { ++ p = early_alloc(PUD_SIZE, nid); ++ if (p && pud_set_huge(pud, __pa(p), PAGE_KERNEL)) ++ return; ++ else if (p) ++ memblock_free(__pa(p), PUD_SIZE); ++ } ++ ++ p = early_alloc(PAGE_SIZE, 
nid); ++ pud_populate(&init_mm, pud, p); ++ } ++ ++ pmd = pmd_offset(pud, addr); ++ do { ++ next = pmd_addr_end(addr, end); ++ if (!pmd_large(*pmd)) ++ kasan_populate_pmd(pmd, addr, next, nid); ++ } while (pmd++, addr = next, addr != end); ++} ++ ++static void __init kasan_populate_p4d(p4d_t *p4d, unsigned long addr, ++ unsigned long end, int nid) ++{ ++ pud_t *pud; ++ unsigned long next; ++ ++ if (p4d_none(*p4d)) { ++ void *p = early_alloc(PAGE_SIZE, nid); ++ ++ p4d_populate(&init_mm, p4d, p); ++ } ++ ++ pud = pud_offset(p4d, addr); ++ do { ++ next = pud_addr_end(addr, end); ++ if (!pud_large(*pud)) ++ kasan_populate_pud(pud, addr, next, nid); ++ } while (pud++, addr = next, addr != end); ++} ++ ++static void __init kasan_populate_pgd(pgd_t *pgd, unsigned long addr, ++ unsigned long end, int nid) ++{ ++ void *p; ++ p4d_t *p4d; ++ unsigned long next; ++ ++ if (pgd_none(*pgd)) { ++ p = early_alloc(PAGE_SIZE, nid); ++ pgd_populate(&init_mm, pgd, p); ++ } ++ ++ p4d = p4d_offset(pgd, addr); ++ do { ++ next = p4d_addr_end(addr, end); ++ kasan_populate_p4d(p4d, addr, next, nid); ++ } while (p4d++, addr = next, addr != end); ++} ++ ++static void __init kasan_populate_shadow(unsigned long addr, unsigned long end, ++ int nid) ++{ ++ pgd_t *pgd; ++ unsigned long next; ++ ++ addr = addr & PAGE_MASK; ++ end = round_up(end, PAGE_SIZE); ++ pgd = pgd_offset_k(addr); ++ do { ++ next = pgd_addr_end(addr, end); ++ kasan_populate_pgd(pgd, addr, next, nid); ++ } while (pgd++, addr = next, addr != end); ++} ++ ++static void __init map_range(struct range *range) + { + unsigned long start; + unsigned long end; +@@ -26,7 +155,7 @@ static int __init map_range(struct range *range) + start = (unsigned long)kasan_mem_to_shadow(pfn_to_kaddr(range->start)); + end = (unsigned long)kasan_mem_to_shadow(pfn_to_kaddr(range->end)); + +- return vmemmap_populate(start, end, NUMA_NO_NODE); ++ kasan_populate_shadow(start, end, early_pfn_to_nid(range->start)); + } + + static void __init 
clear_pgds(unsigned long start, +@@ -189,16 +318,16 @@ void __init kasan_init(void) + if (pfn_mapped[i].end == 0) + break; + +- if (map_range(&pfn_mapped[i])) +- panic("kasan: unable to allocate shadow!"); ++ map_range(&pfn_mapped[i]); + } ++ + kasan_populate_zero_shadow( + kasan_mem_to_shadow((void *)PAGE_OFFSET + MAXMEM), + kasan_mem_to_shadow((void *)__START_KERNEL_map)); + +- vmemmap_populate((unsigned long)kasan_mem_to_shadow(_stext), +- (unsigned long)kasan_mem_to_shadow(_end), +- NUMA_NO_NODE); ++ kasan_populate_shadow((unsigned long)kasan_mem_to_shadow(_stext), ++ (unsigned long)kasan_mem_to_shadow(_end), ++ early_pfn_to_nid(__pa(_stext))); + + kasan_populate_zero_shadow(kasan_mem_to_shadow((void *)MODULES_END), + (void *)KASAN_SHADOW_END); +-- +2.15.0 + diff --git a/queue/x86-paravirt-Dont-patch-flush_tlb_single.patch b/queue/x86-paravirt-Dont-patch-flush_tlb_single.patch new file mode 100644 index 0000000..0512947 --- /dev/null +++ b/queue/x86-paravirt-Dont-patch-flush_tlb_single.patch @@ -0,0 +1,66 @@ +From a035795499ca1c2bd1928808d1a156eda1420383 Mon Sep 17 00:00:00 2001 +From: Thomas Gleixner <tglx@linutronix.de> +Date: Mon, 4 Dec 2017 15:07:30 +0100 +Subject: [PATCH] x86/paravirt: Dont patch flush_tlb_single + +commit a035795499ca1c2bd1928808d1a156eda1420383 upstream. + +native_flush_tlb_single() will be changed with the upcoming +PAGE_TABLE_ISOLATION feature. This requires to have more code in +there than INVLPG. + +Remove the paravirt patching for it. 
+ +Signed-off-by: Thomas Gleixner <tglx@linutronix.de> +Reviewed-by: Josh Poimboeuf <jpoimboe@redhat.com> +Reviewed-by: Juergen Gross <jgross@suse.com> +Acked-by: Peter Zijlstra <peterz@infradead.org> +Cc: Andy Lutomirski <luto@kernel.org> +Cc: Boris Ostrovsky <boris.ostrovsky@oracle.com> +Cc: Borislav Petkov <bp@alien8.de> +Cc: Borislav Petkov <bpetkov@suse.de> +Cc: Brian Gerst <brgerst@gmail.com> +Cc: Dave Hansen <dave.hansen@intel.com> +Cc: Dave Hansen <dave.hansen@linux.intel.com> +Cc: David Laight <David.Laight@aculab.com> +Cc: Denys Vlasenko <dvlasenk@redhat.com> +Cc: Eduardo Valentin <eduval@amazon.com> +Cc: Greg KH <gregkh@linuxfoundation.org> +Cc: H. Peter Anvin <hpa@zytor.com> +Cc: Linus Torvalds <torvalds@linux-foundation.org> +Cc: Rik van Riel <riel@redhat.com> +Cc: Will Deacon <will.deacon@arm.com> +Cc: aliguori@amazon.com +Cc: daniel.gruss@iaik.tugraz.at +Cc: hughd@google.com +Cc: keescook@google.com +Cc: linux-mm@kvack.org +Cc: michael.schwarz@iaik.tugraz.at +Cc: moritz.lipp@iaik.tugraz.at +Cc: richard.fellner@student.tugraz.at +Link: https://lkml.kernel.org/r/20171204150606.828111617@linutronix.de +Signed-off-by: Ingo Molnar <mingo@kernel.org> + +diff --git a/arch/x86/kernel/paravirt_patch_64.c b/arch/x86/kernel/paravirt_patch_64.c +index ac0be8283325..9edadabf04f6 100644 +--- a/arch/x86/kernel/paravirt_patch_64.c ++++ b/arch/x86/kernel/paravirt_patch_64.c +@@ -10,7 +10,6 @@ DEF_NATIVE(pv_irq_ops, save_fl, "pushfq; popq %rax"); + DEF_NATIVE(pv_mmu_ops, read_cr2, "movq %cr2, %rax"); + DEF_NATIVE(pv_mmu_ops, read_cr3, "movq %cr3, %rax"); + DEF_NATIVE(pv_mmu_ops, write_cr3, "movq %rdi, %cr3"); +-DEF_NATIVE(pv_mmu_ops, flush_tlb_single, "invlpg (%rdi)"); + DEF_NATIVE(pv_cpu_ops, wbinvd, "wbinvd"); + + DEF_NATIVE(pv_cpu_ops, usergs_sysret64, "swapgs; sysretq"); +@@ -60,7 +59,6 @@ unsigned native_patch(u8 type, u16 clobbers, void *ibuf, + PATCH_SITE(pv_mmu_ops, read_cr2); + PATCH_SITE(pv_mmu_ops, read_cr3); + PATCH_SITE(pv_mmu_ops, write_cr3); +- 
PATCH_SITE(pv_mmu_ops, flush_tlb_single); + PATCH_SITE(pv_cpu_ops, wbinvd); + #if defined(CONFIG_PARAVIRT_SPINLOCKS) + case PARAVIRT_PATCH(pv_lock_ops.queued_spin_unlock): +-- +2.15.0 + diff --git a/queue/x86-paravirt-Provide-a-way-to-check-for-hypervisors.patch b/queue/x86-paravirt-Provide-a-way-to-check-for-hypervisors.patch new file mode 100644 index 0000000..d9d05ba --- /dev/null +++ b/queue/x86-paravirt-Provide-a-way-to-check-for-hypervisors.patch @@ -0,0 +1,94 @@ +From 79cc74155218316b9a5d28577c7077b2adba8e58 Mon Sep 17 00:00:00 2001 +From: Thomas Gleixner <tglx@linutronix.de> +Date: Mon, 4 Dec 2017 15:07:31 +0100 +Subject: [PATCH] x86/paravirt: Provide a way to check for hypervisors + +commit 79cc74155218316b9a5d28577c7077b2adba8e58 upstream. + +There is no generic way to test whether a kernel is running on a specific +hypervisor. But that's required to prevent the upcoming user address space +separation feature in certain guest modes. + +Make the hypervisor type enum unconditionally available and provide a +helper function which allows to test for a specific type. + +Signed-off-by: Thomas Gleixner <tglx@linutronix.de> +Reviewed-by: Juergen Gross <jgross@suse.com> +Cc: Andy Lutomirski <luto@kernel.org> +Cc: Boris Ostrovsky <boris.ostrovsky@oracle.com> +Cc: Borislav Petkov <bp@alien8.de> +Cc: Borislav Petkov <bpetkov@suse.de> +Cc: Brian Gerst <brgerst@gmail.com> +Cc: Dave Hansen <dave.hansen@intel.com> +Cc: Dave Hansen <dave.hansen@linux.intel.com> +Cc: David Laight <David.Laight@aculab.com> +Cc: Denys Vlasenko <dvlasenk@redhat.com> +Cc: Eduardo Valentin <eduval@amazon.com> +Cc: Greg KH <gregkh@linuxfoundation.org> +Cc: H. 
Peter Anvin <hpa@zytor.com> +Cc: Josh Poimboeuf <jpoimboe@redhat.com> +Cc: Linus Torvalds <torvalds@linux-foundation.org> +Cc: Peter Zijlstra <peterz@infradead.org> +Cc: Rik van Riel <riel@redhat.com> +Cc: Will Deacon <will.deacon@arm.com> +Cc: aliguori@amazon.com +Cc: daniel.gruss@iaik.tugraz.at +Cc: hughd@google.com +Cc: keescook@google.com +Link: https://lkml.kernel.org/r/20171204150606.912938129@linutronix.de +Signed-off-by: Ingo Molnar <mingo@kernel.org> + +diff --git a/arch/x86/include/asm/hypervisor.h b/arch/x86/include/asm/hypervisor.h +index 1b0a5abcd8ae..96aa6b9884dc 100644 +--- a/arch/x86/include/asm/hypervisor.h ++++ b/arch/x86/include/asm/hypervisor.h +@@ -20,16 +20,7 @@ + #ifndef _ASM_X86_HYPERVISOR_H + #define _ASM_X86_HYPERVISOR_H + +-#ifdef CONFIG_HYPERVISOR_GUEST +- +-#include <asm/kvm_para.h> +-#include <asm/x86_init.h> +-#include <asm/xen/hypervisor.h> +- +-/* +- * x86 hypervisor information +- */ +- ++/* x86 hypervisor types */ + enum x86_hypervisor_type { + X86_HYPER_NATIVE = 0, + X86_HYPER_VMWARE, +@@ -39,6 +30,12 @@ enum x86_hypervisor_type { + X86_HYPER_KVM, + }; + ++#ifdef CONFIG_HYPERVISOR_GUEST ++ ++#include <asm/kvm_para.h> ++#include <asm/x86_init.h> ++#include <asm/xen/hypervisor.h> ++ + struct hypervisor_x86 { + /* Hypervisor name */ + const char *name; +@@ -58,7 +55,15 @@ struct hypervisor_x86 { + + extern enum x86_hypervisor_type x86_hyper_type; + extern void init_hypervisor_platform(void); ++static inline bool hypervisor_is_type(enum x86_hypervisor_type type) ++{ ++ return x86_hyper_type == type; ++} + #else + static inline void init_hypervisor_platform(void) { } ++static inline bool hypervisor_is_type(enum x86_hypervisor_type type) ++{ ++ return type == X86_HYPER_NATIVE; ++} + #endif /* CONFIG_HYPERVISOR_GUEST */ + #endif /* _ASM_X86_HYPERVISOR_H */ +-- +2.15.0 + diff --git a/queue/x86-platform-UV-Convert-timers-to-use-timer_setup.patch b/queue/x86-platform-UV-Convert-timers-to-use-timer_setup.patch new file mode 100644 index 
0000000..7ec2910 --- /dev/null +++ b/queue/x86-platform-UV-Convert-timers-to-use-timer_setup.patch @@ -0,0 +1,45 @@ +From 376f3bcebdc999cc737d9052109cc33b573b3a8b Mon Sep 17 00:00:00 2001 +From: Kees Cook <keescook@chromium.org> +Date: Mon, 16 Oct 2017 16:22:31 -0700 +Subject: [PATCH] x86/platform/UV: Convert timers to use timer_setup() + +commit 376f3bcebdc999cc737d9052109cc33b573b3a8b upstream. + +In preparation for unconditionally passing the struct timer_list pointer to +all timer callbacks, switch to using the new timer_setup() and from_timer() +to pass the timer pointer explicitly. + +Signed-off-by: Kees Cook <keescook@chromium.org> +Signed-off-by: Thomas Gleixner <tglx@linutronix.de> +Cc: Dimitri Sivanich <sivanich@hpe.com> +Cc: Russ Anderson <rja@hpe.com> +Cc: Mike Travis <mike.travis@hpe.com> +Link: https://lkml.kernel.org/r/20171016232231.GA100493@beast + +diff --git a/arch/x86/kernel/apic/x2apic_uv_x.c b/arch/x86/kernel/apic/x2apic_uv_x.c +index 0d57bb9079c9..c0b694810ff4 100644 +--- a/arch/x86/kernel/apic/x2apic_uv_x.c ++++ b/arch/x86/kernel/apic/x2apic_uv_x.c +@@ -920,9 +920,8 @@ static __init void uv_rtc_init(void) + /* + * percpu heartbeat timer + */ +-static void uv_heartbeat(unsigned long ignored) ++static void uv_heartbeat(struct timer_list *timer) + { +- struct timer_list *timer = &uv_scir_info->timer; + unsigned char bits = uv_scir_info->state; + + /* Flip heartbeat bit: */ +@@ -947,7 +946,7 @@ static int uv_heartbeat_enable(unsigned int cpu) + struct timer_list *timer = &uv_cpu_scir_info(cpu)->timer; + + uv_set_cpu_scir_bits(cpu, SCIR_CPU_HEARTBEAT|SCIR_CPU_ACTIVITY); +- setup_pinned_timer(timer, uv_heartbeat, cpu); ++ timer_setup(timer, uv_heartbeat, TIMER_PINNED); + timer->expires = jiffies + SCIR_CPU_HB_INTERVAL; + add_timer_on(timer, cpu); + uv_cpu_scir_info(cpu)->enabled = 1; +-- +2.15.0 + diff --git a/queue/x86-traps-Use-a-new-on_thread_stack-helper-to-clean-.patch b/queue/x86-traps-Use-a-new-on_thread_stack-helper-to-clean-.patch new 
file mode 100644 index 0000000..188c380 --- /dev/null +++ b/queue/x86-traps-Use-a-new-on_thread_stack-helper-to-clean-.patch @@ -0,0 +1,56 @@ +From 3383642c2f9d4f5b4fa37436db4a109a1a10018c Mon Sep 17 00:00:00 2001 +From: Andy Lutomirski <luto@kernel.org> +Date: Thu, 2 Nov 2017 00:59:17 -0700 +Subject: [PATCH] x86/traps: Use a new on_thread_stack() helper to clean up an + assertion + +commit 3383642c2f9d4f5b4fa37436db4a109a1a10018c upstream. + +Let's keep the stack-related logic together rather than open-coding +a comparison in an assertion in the traps code. + +Signed-off-by: Andy Lutomirski <luto@kernel.org> +Reviewed-by: Borislav Petkov <bp@suse.de> +Cc: Borislav Petkov <bpetkov@suse.de> +Cc: Brian Gerst <brgerst@gmail.com> +Cc: Dave Hansen <dave.hansen@intel.com> +Cc: Linus Torvalds <torvalds@linux-foundation.org> +Cc: Peter Zijlstra <peterz@infradead.org> +Cc: Thomas Gleixner <tglx@linutronix.de> +Link: http://lkml.kernel.org/r/856b15bee1f55017b8f79d3758b0d51c48a08cf8.1509609304.git.luto@kernel.org +Signed-off-by: Ingo Molnar <mingo@kernel.org> + +diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h +index ae2ae6d80674..f10dae14f951 100644 +--- a/arch/x86/include/asm/processor.h ++++ b/arch/x86/include/asm/processor.h +@@ -541,6 +541,12 @@ static inline unsigned long current_top_of_stack(void) + #endif + } + ++static inline bool on_thread_stack(void) ++{ ++ return (unsigned long)(current_top_of_stack() - ++ current_stack_pointer) < THREAD_SIZE; ++} ++ + #ifdef CONFIG_PARAVIRT + #include <asm/paravirt.h> + #else +diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c +index 67db4f43309e..42a9c4458f5d 100644 +--- a/arch/x86/kernel/traps.c ++++ b/arch/x86/kernel/traps.c +@@ -141,8 +141,7 @@ void ist_begin_non_atomic(struct pt_regs *regs) + * will catch asm bugs and any attempt to use ist_preempt_enable + * from double_fault. 
+ */ +- BUG_ON((unsigned long)(current_top_of_stack() - +- current_stack_pointer) >= THREAD_SIZE); ++ BUG_ON(!on_thread_stack()); + + preempt_enable_no_resched(); + } +-- +2.15.0 + diff --git a/queue/x86-unwind-Make-CONFIG_UNWINDER_ORC-y-the-default-in.patch b/queue/x86-unwind-Make-CONFIG_UNWINDER_ORC-y-the-default-in.patch new file mode 100644 index 0000000..6c6c6bc --- /dev/null +++ b/queue/x86-unwind-Make-CONFIG_UNWINDER_ORC-y-the-default-in.patch @@ -0,0 +1,79 @@ +From fc72ae40e30327aa24eb88a24b9c7058f938bd36 Mon Sep 17 00:00:00 2001 +From: Josh Poimboeuf <jpoimboe@redhat.com> +Date: Fri, 13 Oct 2017 15:02:01 -0500 +Subject: [PATCH] x86/unwind: Make CONFIG_UNWINDER_ORC=y the default in kconfig + for 64-bit + +commit fc72ae40e30327aa24eb88a24b9c7058f938bd36 upstream. + +The ORC unwinder has been stable in testing so far. Give it much wider +testing by making it the default in kconfig for x86_64. It's not yet +supported for 32-bit, so leave frame pointers as the default there. + +Suggested-by: Ingo Molnar <mingo@kernel.org> +Signed-off-by: Josh Poimboeuf <jpoimboe@redhat.com> +Cc: Linus Torvalds <torvalds@linux-foundation.org> +Cc: Peter Zijlstra <peterz@infradead.org> +Cc: Thomas Gleixner <tglx@linutronix.de> +Link: http://lkml.kernel.org/r/9b1237bbe7244ed9cdf8db2dcb1253e37e1c341e.1507924831.git.jpoimboe@redhat.com +Signed-off-by: Ingo Molnar <mingo@kernel.org> + +diff --git a/arch/x86/Kconfig.debug b/arch/x86/Kconfig.debug +index f274dbb87c26..a4ff214fb760 100644 +--- a/arch/x86/Kconfig.debug ++++ b/arch/x86/Kconfig.debug +@@ -358,27 +358,13 @@ config PUNIT_ATOM_DEBUG + + choice + prompt "Choose kernel unwinder" +- default UNWINDER_FRAME_POINTER ++ default UNWINDER_ORC if X86_64 ++ default UNWINDER_FRAME_POINTER if X86_32 + ---help--- + This determines which method will be used for unwinding kernel stack + traces for panics, oopses, bugs, warnings, perf, /proc/<pid>/stack, + livepatch, lockdep, and more. 
+ +-config UNWINDER_FRAME_POINTER +- bool "Frame pointer unwinder" +- select FRAME_POINTER +- ---help--- +- This option enables the frame pointer unwinder for unwinding kernel +- stack traces. +- +- The unwinder itself is fast and it uses less RAM than the ORC +- unwinder, but the kernel text size will grow by ~3% and the kernel's +- overall performance will degrade by roughly 5-10%. +- +- This option is recommended if you want to use the livepatch +- consistency model, as this is currently the only way to get a +- reliable stack trace (CONFIG_HAVE_RELIABLE_STACKTRACE). +- + config UNWINDER_ORC + bool "ORC unwinder" + depends on X86_64 +@@ -395,6 +381,21 @@ config UNWINDER_ORC + Enabling this option will increase the kernel's runtime memory usage + by roughly 2-4MB, depending on your kernel config. + ++config UNWINDER_FRAME_POINTER ++ bool "Frame pointer unwinder" ++ select FRAME_POINTER ++ ---help--- ++ This option enables the frame pointer unwinder for unwinding kernel ++ stack traces. ++ ++ The unwinder itself is fast and it uses less RAM than the ORC ++ unwinder, but the kernel text size will grow by ~3% and the kernel's ++ overall performance will degrade by roughly 5-10%. ++ ++ This option is recommended if you want to use the livepatch ++ consistency model, as this is currently the only way to get a ++ reliable stack trace (CONFIG_HAVE_RELIABLE_STACKTRACE). 
++ + config UNWINDER_GUESS + bool "Guess unwinder" + depends on EXPERT +-- +2.15.0 + diff --git a/queue/x86-unwind-Rename-unwinder-config-options-to-CONFIG_.patch b/queue/x86-unwind-Rename-unwinder-config-options-to-CONFIG_.patch new file mode 100644 index 0000000..25eb213 --- /dev/null +++ b/queue/x86-unwind-Rename-unwinder-config-options-to-CONFIG_.patch @@ -0,0 +1,251 @@ +From 11af847446ed0d131cf24d16a7ef3d5ea7a49554 Mon Sep 17 00:00:00 2001 +From: Josh Poimboeuf <jpoimboe@redhat.com> +Date: Fri, 13 Oct 2017 15:02:00 -0500 +Subject: [PATCH] x86/unwind: Rename unwinder config options to + 'CONFIG_UNWINDER_*' + +commit 11af847446ed0d131cf24d16a7ef3d5ea7a49554 upstream. + +Rename the unwinder config options from: + + CONFIG_ORC_UNWINDER + CONFIG_FRAME_POINTER_UNWINDER + CONFIG_GUESS_UNWINDER + +to: + + CONFIG_UNWINDER_ORC + CONFIG_UNWINDER_FRAME_POINTER + CONFIG_UNWINDER_GUESS + +... in order to give them a more logical config namespace. + +Suggested-by: Ingo Molnar <mingo@kernel.org> +Signed-off-by: Josh Poimboeuf <jpoimboe@redhat.com> +Cc: Linus Torvalds <torvalds@linux-foundation.org> +Cc: Peter Zijlstra <peterz@infradead.org> +Cc: Thomas Gleixner <tglx@linutronix.de> +Link: http://lkml.kernel.org/r/73972fc7e2762e91912c6b9584582703d6f1b8cc.1507924831.git.jpoimboe@redhat.com +Signed-off-by: Ingo Molnar <mingo@kernel.org> + +diff --git a/Documentation/x86/orc-unwinder.txt b/Documentation/x86/orc-unwinder.txt +index af0c9a4c65a6..cd4b29be29af 100644 +--- a/Documentation/x86/orc-unwinder.txt ++++ b/Documentation/x86/orc-unwinder.txt +@@ -4,7 +4,7 @@ ORC unwinder + Overview + -------- + +-The kernel CONFIG_ORC_UNWINDER option enables the ORC unwinder, which is ++The kernel CONFIG_UNWINDER_ORC option enables the ORC unwinder, which is + similar in concept to a DWARF unwinder. The difference is that the + format of the ORC data is much simpler than DWARF, which in turn allows + the ORC unwinder to be much simpler and faster. 
+diff --git a/Makefile b/Makefile +index bc5c79e8e3cf..c0f723f81c06 100644 +--- a/Makefile ++++ b/Makefile +@@ -933,8 +933,8 @@ ifdef CONFIG_STACK_VALIDATION + ifeq ($(has_libelf),1) + objtool_target := tools/objtool FORCE + else +- ifdef CONFIG_ORC_UNWINDER +- $(error "Cannot generate ORC metadata for CONFIG_ORC_UNWINDER=y, please install libelf-dev, libelf-devel or elfutils-libelf-devel") ++ ifdef CONFIG_UNWINDER_ORC ++ $(error "Cannot generate ORC metadata for CONFIG_UNWINDER_ORC=y, please install libelf-dev, libelf-devel or elfutils-libelf-devel") + else + $(warning "Cannot use CONFIG_STACK_VALIDATION=y, please install libelf-dev, libelf-devel or elfutils-libelf-devel") + endif +diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig +index 971feac13506..6b94ca0aa585 100644 +--- a/arch/x86/Kconfig ++++ b/arch/x86/Kconfig +@@ -170,7 +170,7 @@ config X86 + select HAVE_PERF_USER_STACK_DUMP + select HAVE_RCU_TABLE_FREE + select HAVE_REGS_AND_STACK_ACCESS_API +- select HAVE_RELIABLE_STACKTRACE if X86_64 && FRAME_POINTER_UNWINDER && STACK_VALIDATION ++ select HAVE_RELIABLE_STACKTRACE if X86_64 && UNWINDER_FRAME_POINTER && STACK_VALIDATION + select HAVE_STACK_VALIDATION if X86_64 + select HAVE_SYSCALL_TRACEPOINTS + select HAVE_UNSTABLE_SCHED_CLOCK +diff --git a/arch/x86/Kconfig.debug b/arch/x86/Kconfig.debug +index 71a48a30fc84..f274dbb87c26 100644 +--- a/arch/x86/Kconfig.debug ++++ b/arch/x86/Kconfig.debug +@@ -358,13 +358,13 @@ config PUNIT_ATOM_DEBUG + + choice + prompt "Choose kernel unwinder" +- default FRAME_POINTER_UNWINDER ++ default UNWINDER_FRAME_POINTER + ---help--- + This determines which method will be used for unwinding kernel stack + traces for panics, oopses, bugs, warnings, perf, /proc/<pid>/stack, + livepatch, lockdep, and more. 
+ +-config FRAME_POINTER_UNWINDER ++config UNWINDER_FRAME_POINTER + bool "Frame pointer unwinder" + select FRAME_POINTER + ---help--- +@@ -379,7 +379,7 @@ config FRAME_POINTER_UNWINDER + consistency model, as this is currently the only way to get a + reliable stack trace (CONFIG_HAVE_RELIABLE_STACKTRACE). + +-config ORC_UNWINDER ++config UNWINDER_ORC + bool "ORC unwinder" + depends on X86_64 + select STACK_VALIDATION +@@ -395,7 +395,7 @@ config ORC_UNWINDER + Enabling this option will increase the kernel's runtime memory usage + by roughly 2-4MB, depending on your kernel config. + +-config GUESS_UNWINDER ++config UNWINDER_GUESS + bool "Guess unwinder" + depends on EXPERT + ---help--- +@@ -410,7 +410,7 @@ config GUESS_UNWINDER + endchoice + + config FRAME_POINTER +- depends on !ORC_UNWINDER && !GUESS_UNWINDER ++ depends on !UNWINDER_ORC && !UNWINDER_GUESS + bool + + endmenu +diff --git a/arch/x86/configs/tiny.config b/arch/x86/configs/tiny.config +index 550cd5012b73..66c9e2aab16c 100644 +--- a/arch/x86/configs/tiny.config ++++ b/arch/x86/configs/tiny.config +@@ -1,5 +1,5 @@ + CONFIG_NOHIGHMEM=y + # CONFIG_HIGHMEM4G is not set + # CONFIG_HIGHMEM64G is not set +-CONFIG_GUESS_UNWINDER=y +-# CONFIG_FRAME_POINTER_UNWINDER is not set ++CONFIG_UNWINDER_GUESS=y ++# CONFIG_UNWINDER_FRAME_POINTER is not set +diff --git a/arch/x86/configs/x86_64_defconfig b/arch/x86/configs/x86_64_defconfig +index eb65c248708d..e32fc1f274d8 100644 +--- a/arch/x86/configs/x86_64_defconfig ++++ b/arch/x86/configs/x86_64_defconfig +@@ -299,7 +299,7 @@ CONFIG_DEBUG_STACKOVERFLOW=y + # CONFIG_DEBUG_RODATA_TEST is not set + CONFIG_DEBUG_BOOT_PARAMS=y + CONFIG_OPTIMIZE_INLINING=y +-CONFIG_ORC_UNWINDER=y ++CONFIG_UNWINDER_ORC=y + CONFIG_SECURITY=y + CONFIG_SECURITY_NETWORK=y + CONFIG_SECURITY_SELINUX=y +diff --git a/arch/x86/include/asm/module.h b/arch/x86/include/asm/module.h +index 9eb7c718aaf8..9f05a1002aa9 100644 +--- a/arch/x86/include/asm/module.h ++++ b/arch/x86/include/asm/module.h +@@ -5,7 
+5,7 @@ + #include <asm/orc_types.h> + + struct mod_arch_specific { +-#ifdef CONFIG_ORC_UNWINDER ++#ifdef CONFIG_UNWINDER_ORC + unsigned int num_orcs; + int *orc_unwind_ip; + struct orc_entry *orc_unwind; +diff --git a/arch/x86/include/asm/unwind.h b/arch/x86/include/asm/unwind.h +index e9f793e2df7a..35d67dc7b69f 100644 +--- a/arch/x86/include/asm/unwind.h ++++ b/arch/x86/include/asm/unwind.h +@@ -12,11 +12,11 @@ struct unwind_state { + struct task_struct *task; + int graph_idx; + bool error; +-#if defined(CONFIG_ORC_UNWINDER) ++#if defined(CONFIG_UNWINDER_ORC) + bool signal, full_regs; + unsigned long sp, bp, ip; + struct pt_regs *regs; +-#elif defined(CONFIG_FRAME_POINTER_UNWINDER) ++#elif defined(CONFIG_UNWINDER_FRAME_POINTER) + bool got_irq; + unsigned long *bp, *orig_sp, ip; + struct pt_regs *regs; +@@ -50,7 +50,7 @@ void unwind_start(struct unwind_state *state, struct task_struct *task, + __unwind_start(state, task, regs, first_frame); + } + +-#if defined(CONFIG_ORC_UNWINDER) || defined(CONFIG_FRAME_POINTER_UNWINDER) ++#if defined(CONFIG_UNWINDER_ORC) || defined(CONFIG_UNWINDER_FRAME_POINTER) + static inline struct pt_regs *unwind_get_entry_regs(struct unwind_state *state) + { + if (unwind_done(state)) +@@ -65,7 +65,7 @@ static inline struct pt_regs *unwind_get_entry_regs(struct unwind_state *state) + } + #endif + +-#ifdef CONFIG_ORC_UNWINDER ++#ifdef CONFIG_UNWINDER_ORC + void unwind_init(void); + void unwind_module_init(struct module *mod, void *orc_ip, size_t orc_ip_size, + void *orc, size_t orc_size); +diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile +index fd0a7895b63f..6209ab6deb50 100644 +--- a/arch/x86/kernel/Makefile ++++ b/arch/x86/kernel/Makefile +@@ -127,9 +127,9 @@ obj-$(CONFIG_PERF_EVENTS) += perf_regs.o + obj-$(CONFIG_TRACING) += tracepoint.o + obj-$(CONFIG_SCHED_MC_PRIO) += itmt.o + +-obj-$(CONFIG_ORC_UNWINDER) += unwind_orc.o +-obj-$(CONFIG_FRAME_POINTER_UNWINDER) += unwind_frame.o +-obj-$(CONFIG_GUESS_UNWINDER) += 
unwind_guess.o ++obj-$(CONFIG_UNWINDER_ORC) += unwind_orc.o ++obj-$(CONFIG_UNWINDER_FRAME_POINTER) += unwind_frame.o ++obj-$(CONFIG_UNWINDER_GUESS) += unwind_guess.o + + ### + # 64 bit specific files +diff --git a/include/asm-generic/vmlinux.lds.h b/include/asm-generic/vmlinux.lds.h +index 8acfc1e099e1..63e56f6c1877 100644 +--- a/include/asm-generic/vmlinux.lds.h ++++ b/include/asm-generic/vmlinux.lds.h +@@ -687,7 +687,7 @@ + #define BUG_TABLE + #endif + +-#ifdef CONFIG_ORC_UNWINDER ++#ifdef CONFIG_UNWINDER_ORC + #define ORC_UNWIND_TABLE \ + . = ALIGN(4); \ + .orc_unwind_ip : AT(ADDR(.orc_unwind_ip) - LOAD_OFFSET) { \ +diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug +index 2689b7c50c52..7566eff22236 100644 +--- a/lib/Kconfig.debug ++++ b/lib/Kconfig.debug +@@ -376,7 +376,7 @@ config STACK_VALIDATION + that runtime stack traces are more reliable. + + This is also a prerequisite for generation of ORC unwind data, which +- is needed for CONFIG_ORC_UNWINDER. ++ is needed for CONFIG_UNWINDER_ORC. + + For more information, see + tools/objtool/Documentation/stack-validation.txt. 
+diff --git a/scripts/Makefile.build b/scripts/Makefile.build +index 061d0c3a420a..f965f477832e 100644 +--- a/scripts/Makefile.build ++++ b/scripts/Makefile.build +@@ -258,7 +258,7 @@ ifneq ($(SKIP_STACK_VALIDATION),1) + + __objtool_obj := $(objtree)/tools/objtool/objtool + +-objtool_args = $(if $(CONFIG_ORC_UNWINDER),orc generate,check) ++objtool_args = $(if $(CONFIG_UNWINDER_ORC),orc generate,check) + + ifndef CONFIG_FRAME_POINTER + objtool_args += --no-fp +-- +2.15.0 + diff --git a/queue/x86-unwinder-Handle-stack-overflows-more-gracefully.patch b/queue/x86-unwinder-Handle-stack-overflows-more-gracefully.patch new file mode 100644 index 0000000..48371db --- /dev/null +++ b/queue/x86-unwinder-Handle-stack-overflows-more-gracefully.patch @@ -0,0 +1,320 @@ +From b02fcf9ba1211097754b286043cd87a8b4907e75 Mon Sep 17 00:00:00 2001 +From: Josh Poimboeuf <jpoimboe@redhat.com> +Date: Mon, 4 Dec 2017 15:07:09 +0100 +Subject: [PATCH] x86/unwinder: Handle stack overflows more gracefully + +commit b02fcf9ba1211097754b286043cd87a8b4907e75 upstream. + +There are at least two unwinder bugs hindering the debugging of +stack-overflow crashes: + +- It doesn't deal gracefully with the case where the stack overflows and + the stack pointer itself isn't on a valid stack but the + to-be-dereferenced data *is*. + +- The ORC oops dump code doesn't know how to print partial pt_regs, for the + case where if we get an interrupt/exception in *early* entry code + before the full pt_regs have been saved. + +Fix both issues. 
+ +http://lkml.kernel.org/r/20171126024031.uxi4numpbjm5rlbr@treble + +Signed-off-by: Josh Poimboeuf <jpoimboe@redhat.com> +Signed-off-by: Thomas Gleixner <tglx@linutronix.de> +Reviewed-by: Borislav Petkov <bpetkov@suse.de> +Cc: Andy Lutomirski <luto@kernel.org> +Cc: Boris Ostrovsky <boris.ostrovsky@oracle.com> +Cc: Borislav Petkov <bp@alien8.de> +Cc: Brian Gerst <brgerst@gmail.com> +Cc: Dave Hansen <dave.hansen@intel.com> +Cc: Dave Hansen <dave.hansen@linux.intel.com> +Cc: David Laight <David.Laight@aculab.com> +Cc: Denys Vlasenko <dvlasenk@redhat.com> +Cc: Eduardo Valentin <eduval@amazon.com> +Cc: Greg KH <gregkh@linuxfoundation.org> +Cc: H. Peter Anvin <hpa@zytor.com> +Cc: Juergen Gross <jgross@suse.com> +Cc: Linus Torvalds <torvalds@linux-foundation.org> +Cc: Peter Zijlstra <peterz@infradead.org> +Cc: Rik van Riel <riel@redhat.com> +Cc: Will Deacon <will.deacon@arm.com> +Cc: aliguori@amazon.com +Cc: daniel.gruss@iaik.tugraz.at +Cc: hughd@google.com +Cc: keescook@google.com +Link: https://lkml.kernel.org/r/20171204150605.071425003@linutronix.de +Signed-off-by: Ingo Molnar <mingo@kernel.org> + +diff --git a/arch/x86/include/asm/kdebug.h b/arch/x86/include/asm/kdebug.h +index f86a8caa561e..395c9631e000 100644 +--- a/arch/x86/include/asm/kdebug.h ++++ b/arch/x86/include/asm/kdebug.h +@@ -26,6 +26,7 @@ extern void die(const char *, struct pt_regs *,long); + extern int __must_check __die(const char *, struct pt_regs *, long); + extern void show_stack_regs(struct pt_regs *regs); + extern void __show_regs(struct pt_regs *regs, int all); ++extern void show_iret_regs(struct pt_regs *regs); + extern unsigned long oops_begin(void); + extern void oops_end(unsigned long, struct pt_regs *, int signr); + +diff --git a/arch/x86/include/asm/unwind.h b/arch/x86/include/asm/unwind.h +index e9cc6fe1fc6f..c1688c2d0a12 100644 +--- a/arch/x86/include/asm/unwind.h ++++ b/arch/x86/include/asm/unwind.h +@@ -7,6 +7,9 @@ + #include <asm/ptrace.h> + #include <asm/stacktrace.h> + ++#define 
IRET_FRAME_OFFSET (offsetof(struct pt_regs, ip)) ++#define IRET_FRAME_SIZE (sizeof(struct pt_regs) - IRET_FRAME_OFFSET) ++ + struct unwind_state { + struct stack_info stack_info; + unsigned long stack_mask; +@@ -52,6 +55,10 @@ void unwind_start(struct unwind_state *state, struct task_struct *task, + } + + #if defined(CONFIG_UNWINDER_ORC) || defined(CONFIG_UNWINDER_FRAME_POINTER) ++/* ++ * WARNING: The entire pt_regs may not be safe to dereference. In some cases, ++ * only the iret frame registers are accessible. Use with caution! ++ */ + static inline struct pt_regs *unwind_get_entry_regs(struct unwind_state *state) + { + if (unwind_done(state)) +diff --git a/arch/x86/kernel/dumpstack.c b/arch/x86/kernel/dumpstack.c +index f13b4c00a5de..0bc95be5c638 100644 +--- a/arch/x86/kernel/dumpstack.c ++++ b/arch/x86/kernel/dumpstack.c +@@ -50,6 +50,28 @@ static void printk_stack_address(unsigned long address, int reliable, + printk("%s %s%pB\n", log_lvl, reliable ? "" : "? ", (void *)address); + } + ++void show_iret_regs(struct pt_regs *regs) ++{ ++ printk(KERN_DEFAULT "RIP: %04x:%pS\n", (int)regs->cs, (void *)regs->ip); ++ printk(KERN_DEFAULT "RSP: %04x:%016lx EFLAGS: %08lx", (int)regs->ss, ++ regs->sp, regs->flags); ++} ++ ++static void show_regs_safe(struct stack_info *info, struct pt_regs *regs) ++{ ++ if (on_stack(info, regs, sizeof(*regs))) ++ __show_regs(regs, 0); ++ else if (on_stack(info, (void *)regs + IRET_FRAME_OFFSET, ++ IRET_FRAME_SIZE)) { ++ /* ++ * When an interrupt or exception occurs in entry code, the ++ * full pt_regs might not have been saved yet. In that case ++ * just print the iret frame. 
++ */ ++ show_iret_regs(regs); ++ } ++} ++ + void show_trace_log_lvl(struct task_struct *task, struct pt_regs *regs, + unsigned long *stack, char *log_lvl) + { +@@ -94,8 +116,8 @@ void show_trace_log_lvl(struct task_struct *task, struct pt_regs *regs, + if (stack_name) + printk("%s <%s>\n", log_lvl, stack_name); + +- if (regs && on_stack(&stack_info, regs, sizeof(*regs))) +- __show_regs(regs, 0); ++ if (regs) ++ show_regs_safe(&stack_info, regs); + + /* + * Scan the stack, printing any text addresses we find. At the +@@ -119,7 +141,7 @@ void show_trace_log_lvl(struct task_struct *task, struct pt_regs *regs, + + /* + * Don't print regs->ip again if it was already printed +- * by __show_regs() below. ++ * by show_regs_safe() below. + */ + if (regs && stack == &regs->ip) + goto next; +@@ -155,8 +177,8 @@ void show_trace_log_lvl(struct task_struct *task, struct pt_regs *regs, + + /* if the frame has entry regs, print them */ + regs = unwind_get_entry_regs(&state); +- if (regs && on_stack(&stack_info, regs, sizeof(*regs))) +- __show_regs(regs, 0); ++ if (regs) ++ show_regs_safe(&stack_info, regs); + } + + if (stack_name) +diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c +index eeeb34f85c25..01b119bebb68 100644 +--- a/arch/x86/kernel/process_64.c ++++ b/arch/x86/kernel/process_64.c +@@ -69,9 +69,8 @@ void __show_regs(struct pt_regs *regs, int all) + unsigned int fsindex, gsindex; + unsigned int ds, cs, es; + +- printk(KERN_DEFAULT "RIP: %04lx:%pS\n", regs->cs, (void *)regs->ip); +- printk(KERN_DEFAULT "RSP: %04lx:%016lx EFLAGS: %08lx", regs->ss, +- regs->sp, regs->flags); ++ show_iret_regs(regs); ++ + if (regs->orig_ax != -1) + pr_cont(" ORIG_RAX: %016lx\n", regs->orig_ax); + else +@@ -88,6 +87,9 @@ void __show_regs(struct pt_regs *regs, int all) + printk(KERN_DEFAULT "R13: %016lx R14: %016lx R15: %016lx\n", + regs->r13, regs->r14, regs->r15); + ++ if (!all) ++ return; ++ + asm("movl %%ds,%0" : "=r" (ds)); + asm("movl %%cs,%0" : "=r" (cs)); + 
asm("movl %%es,%0" : "=r" (es)); +@@ -98,9 +100,6 @@ void __show_regs(struct pt_regs *regs, int all) + rdmsrl(MSR_GS_BASE, gs); + rdmsrl(MSR_KERNEL_GS_BASE, shadowgs); + +- if (!all) +- return; +- + cr0 = read_cr0(); + cr2 = read_cr2(); + cr3 = __read_cr3(); +diff --git a/arch/x86/kernel/unwind_orc.c b/arch/x86/kernel/unwind_orc.c +index ff8e1132b2ae..be86a865087a 100644 +--- a/arch/x86/kernel/unwind_orc.c ++++ b/arch/x86/kernel/unwind_orc.c +@@ -253,22 +253,15 @@ unsigned long *unwind_get_return_address_ptr(struct unwind_state *state) + return NULL; + } + +-static bool stack_access_ok(struct unwind_state *state, unsigned long addr, ++static bool stack_access_ok(struct unwind_state *state, unsigned long _addr, + size_t len) + { + struct stack_info *info = &state->stack_info; ++ void *addr = (void *)_addr; + +- /* +- * If the address isn't on the current stack, switch to the next one. +- * +- * We may have to traverse multiple stacks to deal with the possibility +- * that info->next_sp could point to an empty stack and the address +- * could be on a subsequent stack. +- */ +- while (!on_stack(info, (void *)addr, len)) +- if (get_stack_info(info->next_sp, state->task, info, +- &state->stack_mask)) +- return false; ++ if (!on_stack(info, addr, len) && ++ (get_stack_info(addr, state->task, info, &state->stack_mask))) ++ return false; + + return true; + } +@@ -283,42 +276,32 @@ static bool deref_stack_reg(struct unwind_state *state, unsigned long addr, + return true; + } + +-#define REGS_SIZE (sizeof(struct pt_regs)) +-#define SP_OFFSET (offsetof(struct pt_regs, sp)) +-#define IRET_REGS_SIZE (REGS_SIZE - offsetof(struct pt_regs, ip)) +-#define IRET_SP_OFFSET (SP_OFFSET - offsetof(struct pt_regs, ip)) +- + static bool deref_stack_regs(struct unwind_state *state, unsigned long addr, +- unsigned long *ip, unsigned long *sp, bool full) ++ unsigned long *ip, unsigned long *sp) + { +- size_t regs_size = full ? REGS_SIZE : IRET_REGS_SIZE; +- size_t sp_offset = full ? 
SP_OFFSET : IRET_SP_OFFSET; +- struct pt_regs *regs = (struct pt_regs *)(addr + regs_size - REGS_SIZE); +- +- if (IS_ENABLED(CONFIG_X86_64)) { +- if (!stack_access_ok(state, addr, regs_size)) +- return false; +- +- *ip = regs->ip; +- *sp = regs->sp; ++ struct pt_regs *regs = (struct pt_regs *)addr; + +- return true; +- } ++ /* x86-32 support will be more complicated due to the &regs->sp hack */ ++ BUILD_BUG_ON(IS_ENABLED(CONFIG_X86_32)); + +- if (!stack_access_ok(state, addr, sp_offset)) ++ if (!stack_access_ok(state, addr, sizeof(struct pt_regs))) + return false; + + *ip = regs->ip; ++ *sp = regs->sp; ++ return true; ++} + +- if (user_mode(regs)) { +- if (!stack_access_ok(state, addr + sp_offset, +- REGS_SIZE - SP_OFFSET)) +- return false; ++static bool deref_stack_iret_regs(struct unwind_state *state, unsigned long addr, ++ unsigned long *ip, unsigned long *sp) ++{ ++ struct pt_regs *regs = (void *)addr - IRET_FRAME_OFFSET; + +- *sp = regs->sp; +- } else +- *sp = (unsigned long)&regs->sp; ++ if (!stack_access_ok(state, addr, IRET_FRAME_SIZE)) ++ return false; + ++ *ip = regs->ip; ++ *sp = regs->sp; + return true; + } + +@@ -327,7 +310,6 @@ bool unwind_next_frame(struct unwind_state *state) + unsigned long ip_p, sp, orig_ip, prev_sp = state->sp; + enum stack_type prev_type = state->stack_info.type; + struct orc_entry *orc; +- struct pt_regs *ptregs; + bool indirect = false; + + if (unwind_done(state)) +@@ -435,7 +417,7 @@ bool unwind_next_frame(struct unwind_state *state) + break; + + case ORC_TYPE_REGS: +- if (!deref_stack_regs(state, sp, &state->ip, &state->sp, true)) { ++ if (!deref_stack_regs(state, sp, &state->ip, &state->sp)) { + orc_warn("can't dereference registers at %p for ip %pB\n", + (void *)sp, (void *)orig_ip); + goto done; +@@ -447,20 +429,14 @@ bool unwind_next_frame(struct unwind_state *state) + break; + + case ORC_TYPE_REGS_IRET: +- if (!deref_stack_regs(state, sp, &state->ip, &state->sp, false)) { ++ if (!deref_stack_iret_regs(state, sp, &state->ip, 
&state->sp)) { + orc_warn("can't dereference iret registers at %p for ip %pB\n", + (void *)sp, (void *)orig_ip); + goto done; + } + +- ptregs = container_of((void *)sp, struct pt_regs, ip); +- if ((unsigned long)ptregs >= prev_sp && +- on_stack(&state->stack_info, ptregs, REGS_SIZE)) { +- state->regs = ptregs; +- state->full_regs = false; +- } else +- state->regs = NULL; +- ++ state->regs = (void *)sp - IRET_FRAME_OFFSET; ++ state->full_regs = false; + state->signal = true; + break; + +-- +2.15.0 + diff --git a/queue/x86-unwinder-Make-CONFIG_UNWINDER_ORC-y-the-default-.patch b/queue/x86-unwinder-Make-CONFIG_UNWINDER_ORC-y-the-default-.patch new file mode 100644 index 0000000..d633c7d --- /dev/null +++ b/queue/x86-unwinder-Make-CONFIG_UNWINDER_ORC-y-the-default-.patch @@ -0,0 +1,33 @@ +From 1e4078f0bba46ad61b69548abe6a6faf63b89380 Mon Sep 17 00:00:00 2001 +From: Ingo Molnar <mingo@kernel.org> +Date: Thu, 12 Oct 2017 09:24:30 +0200 +Subject: [PATCH] x86/unwinder: Make CONFIG_UNWINDER_ORC=y the default in the + 64-bit defconfig + +commit 1e4078f0bba46ad61b69548abe6a6faf63b89380 upstream. + +Increase testing coverage by turning on the primary x86 unwinder for +the 64-bit defconfig. 
+ +Cc: Josh Poimboeuf <jpoimboe@redhat.com> +Cc: Linus Torvalds <torvalds@linux-foundation.org> +Cc: Peter Zijlstra <peterz@infradead.org> +Cc: Thomas Gleixner <tglx@linutronix.de> +Cc: linux-kernel@vger.kernel.org +Signed-off-by: Ingo Molnar <mingo@kernel.org> + +diff --git a/arch/x86/configs/x86_64_defconfig b/arch/x86/configs/x86_64_defconfig +index 4a4b16e56d35..eb65c248708d 100644 +--- a/arch/x86/configs/x86_64_defconfig ++++ b/arch/x86/configs/x86_64_defconfig +@@ -299,6 +299,7 @@ CONFIG_DEBUG_STACKOVERFLOW=y + # CONFIG_DEBUG_RODATA_TEST is not set + CONFIG_DEBUG_BOOT_PARAMS=y + CONFIG_OPTIMIZE_INLINING=y ++CONFIG_ORC_UNWINDER=y + CONFIG_SECURITY=y + CONFIG_SECURITY_NETWORK=y + CONFIG_SECURITY_SELINUX=y +-- +2.15.0 + diff --git a/queue/x86-unwinder-orc-Dont-bail-on-stack-overflow.patch b/queue/x86-unwinder-orc-Dont-bail-on-stack-overflow.patch new file mode 100644 index 0000000..1cf4c3b --- /dev/null +++ b/queue/x86-unwinder-orc-Dont-bail-on-stack-overflow.patch @@ -0,0 +1,80 @@ +From d3a09104018cf2ad5973dfa8a9c138ef9f5015a3 Mon Sep 17 00:00:00 2001 +From: Andy Lutomirski <luto@kernel.org> +Date: Mon, 4 Dec 2017 15:07:08 +0100 +Subject: [PATCH] x86/unwinder/orc: Dont bail on stack overflow + +commit d3a09104018cf2ad5973dfa8a9c138ef9f5015a3 upstream. + +If the stack overflows into a guard page and the ORC unwinder should work +well: by construction, there can't be any meaningful data in the guard page +because no writes to the guard page will have succeeded. + +But there is a bug that prevents unwinding from working correctly: if the +starting register state has RSP pointing into a stack guard page, the ORC +unwinder bails out immediately. + +Instead of bailing out immediately check whether the next page up is a +valid check page and if so analyze that. As a result the ORC unwinder will +start the unwind. + +Tested by intentionally overflowing the task stack. The result is an +accurate call trace instead of a trace consisting purely of '?' entries. 
+ +There are a few other bugs that are triggered if the unwinder encounters a +stack overflow after the first step, but they are outside the scope of this +fix. + +Signed-off-by: Andy Lutomirski <luto@kernel.org> +Signed-off-by: Thomas Gleixner <tglx@linutronix.de> +Cc: Boris Ostrovsky <boris.ostrovsky@oracle.com> +Cc: Borislav Petkov <bp@alien8.de> +Cc: Borislav Petkov <bpetkov@suse.de> +Cc: Brian Gerst <brgerst@gmail.com> +Cc: Dave Hansen <dave.hansen@intel.com> +Cc: Dave Hansen <dave.hansen@linux.intel.com> +Cc: David Laight <David.Laight@aculab.com> +Cc: Denys Vlasenko <dvlasenk@redhat.com> +Cc: Eduardo Valentin <eduval@amazon.com> +Cc: Greg KH <gregkh@linuxfoundation.org> +Cc: H. Peter Anvin <hpa@zytor.com> +Cc: Josh Poimboeuf <jpoimboe@redhat.com> +Cc: Juergen Gross <jgross@suse.com> +Cc: Linus Torvalds <torvalds@linux-foundation.org> +Cc: Peter Zijlstra <peterz@infradead.org> +Cc: Rik van Riel <riel@redhat.com> +Cc: Will Deacon <will.deacon@arm.com> +Cc: aliguori@amazon.com +Cc: daniel.gruss@iaik.tugraz.at +Cc: hughd@google.com +Cc: keescook@google.com +Link: https://lkml.kernel.org/r/20171204150604.991389777@linutronix.de +Signed-off-by: Ingo Molnar <mingo@kernel.org> + +diff --git a/arch/x86/kernel/unwind_orc.c b/arch/x86/kernel/unwind_orc.c +index a3f973b2c97a..ff8e1132b2ae 100644 +--- a/arch/x86/kernel/unwind_orc.c ++++ b/arch/x86/kernel/unwind_orc.c +@@ -553,8 +553,18 @@ void __unwind_start(struct unwind_state *state, struct task_struct *task, + } + + if (get_stack_info((unsigned long *)state->sp, state->task, +- &state->stack_info, &state->stack_mask)) +- return; ++ &state->stack_info, &state->stack_mask)) { ++ /* ++ * We weren't on a valid stack. It's possible that ++ * we overflowed a valid stack into a guard page. ++ * See if the next page up is valid so that we can ++ * generate some kind of backtrace if this happens. 
++ */ ++ void *next_page = (void *)PAGE_ALIGN((unsigned long)state->sp); ++ if (get_stack_info(next_page, state->task, &state->stack_info, ++ &state->stack_mask)) ++ return; ++ } + + /* + * The caller can provide the address of the first frame directly +-- +2.15.0 + diff --git a/queue/x86-virt-Add-enum-for-hypervisors-to-replace-x86_hyp.patch b/queue/x86-virt-Add-enum-for-hypervisors-to-replace-x86_hyp.patch new file mode 100644 index 0000000..2d28e37 --- /dev/null +++ b/queue/x86-virt-Add-enum-for-hypervisors-to-replace-x86_hyp.patch @@ -0,0 +1,279 @@ +From 03b2a320b19f1424e9ac9c21696be9c60b6d0d93 Mon Sep 17 00:00:00 2001 +From: Juergen Gross <jgross@suse.com> +Date: Thu, 9 Nov 2017 14:27:36 +0100 +Subject: [PATCH] x86/virt: Add enum for hypervisors to replace x86_hyper + +commit 03b2a320b19f1424e9ac9c21696be9c60b6d0d93 upstream. + +The x86_hyper pointer is only used for checking whether a virtual +device is supporting the hypervisor the system is running on. + +Use an enum for that purpose instead and drop the x86_hyper pointer. 
+ +Signed-off-by: Juergen Gross <jgross@suse.com> +Acked-by: Thomas Gleixner <tglx@linutronix.de> +Acked-by: Xavier Deguillard <xdeguillard@vmware.com> +Cc: Linus Torvalds <torvalds@linux-foundation.org> +Cc: Peter Zijlstra <peterz@infradead.org> +Cc: akataria@vmware.com +Cc: arnd@arndb.de +Cc: boris.ostrovsky@oracle.com +Cc: devel@linuxdriverproject.org +Cc: dmitry.torokhov@gmail.com +Cc: gregkh@linuxfoundation.org +Cc: haiyangz@microsoft.com +Cc: kvm@vger.kernel.org +Cc: kys@microsoft.com +Cc: linux-graphics-maintainer@vmware.com +Cc: linux-input@vger.kernel.org +Cc: moltmann@vmware.com +Cc: pbonzini@redhat.com +Cc: pv-drivers@vmware.com +Cc: rkrcmar@redhat.com +Cc: sthemmin@microsoft.com +Cc: virtualization@lists.linux-foundation.org +Cc: xen-devel@lists.xenproject.org +Link: http://lkml.kernel.org/r/20171109132739.23465-3-jgross@suse.com +Signed-off-by: Ingo Molnar <mingo@kernel.org> + +diff --git a/arch/x86/hyperv/hv_init.c b/arch/x86/hyperv/hv_init.c +index a5db63f728a2..a0b86cf486e0 100644 +--- a/arch/x86/hyperv/hv_init.c ++++ b/arch/x86/hyperv/hv_init.c +@@ -113,7 +113,7 @@ void hyperv_init(void) + u64 guest_id; + union hv_x64_msr_hypercall_contents hypercall_msr; + +- if (x86_hyper != &x86_hyper_ms_hyperv) ++ if (x86_hyper_type != X86_HYPER_MS_HYPERV) + return; + + /* Allocate percpu VP index */ +diff --git a/arch/x86/include/asm/hypervisor.h b/arch/x86/include/asm/hypervisor.h +index 0eca7239a7aa..1b0a5abcd8ae 100644 +--- a/arch/x86/include/asm/hypervisor.h ++++ b/arch/x86/include/asm/hypervisor.h +@@ -29,6 +29,16 @@ + /* + * x86 hypervisor information + */ ++ ++enum x86_hypervisor_type { ++ X86_HYPER_NATIVE = 0, ++ X86_HYPER_VMWARE, ++ X86_HYPER_MS_HYPERV, ++ X86_HYPER_XEN_PV, ++ X86_HYPER_XEN_HVM, ++ X86_HYPER_KVM, ++}; ++ + struct hypervisor_x86 { + /* Hypervisor name */ + const char *name; +@@ -36,6 +46,9 @@ struct hypervisor_x86 { + /* Detection routine */ + uint32_t (*detect)(void); + ++ /* Hypervisor type */ ++ enum x86_hypervisor_type type; ++ + 
/* init time callbacks */ + struct x86_hyper_init init; + +@@ -43,15 +56,7 @@ struct hypervisor_x86 { + struct x86_hyper_runtime runtime; + }; + +-extern const struct hypervisor_x86 *x86_hyper; +- +-/* Recognized hypervisors */ +-extern const struct hypervisor_x86 x86_hyper_vmware; +-extern const struct hypervisor_x86 x86_hyper_ms_hyperv; +-extern const struct hypervisor_x86 x86_hyper_xen_pv; +-extern const struct hypervisor_x86 x86_hyper_xen_hvm; +-extern const struct hypervisor_x86 x86_hyper_kvm; +- ++extern enum x86_hypervisor_type x86_hyper_type; + extern void init_hypervisor_platform(void); + #else + static inline void init_hypervisor_platform(void) { } +diff --git a/arch/x86/kernel/cpu/hypervisor.c b/arch/x86/kernel/cpu/hypervisor.c +index 22226c1bf092..bea8d3e24f50 100644 +--- a/arch/x86/kernel/cpu/hypervisor.c ++++ b/arch/x86/kernel/cpu/hypervisor.c +@@ -26,6 +26,12 @@ + #include <asm/processor.h> + #include <asm/hypervisor.h> + ++extern const struct hypervisor_x86 x86_hyper_vmware; ++extern const struct hypervisor_x86 x86_hyper_ms_hyperv; ++extern const struct hypervisor_x86 x86_hyper_xen_pv; ++extern const struct hypervisor_x86 x86_hyper_xen_hvm; ++extern const struct hypervisor_x86 x86_hyper_kvm; ++ + static const __initconst struct hypervisor_x86 * const hypervisors[] = + { + #ifdef CONFIG_XEN_PV +@@ -41,8 +47,8 @@ static const __initconst struct hypervisor_x86 * const hypervisors[] = + #endif + }; + +-const struct hypervisor_x86 *x86_hyper; +-EXPORT_SYMBOL(x86_hyper); ++enum x86_hypervisor_type x86_hyper_type; ++EXPORT_SYMBOL(x86_hyper_type); + + static inline const struct hypervisor_x86 * __init + detect_hypervisor_vendor(void) +@@ -87,6 +93,6 @@ void __init init_hypervisor_platform(void) + copy_array(&h->init, &x86_init.hyper, sizeof(h->init)); + copy_array(&h->runtime, &x86_platform.hyper, sizeof(h->runtime)); + +- x86_hyper = h; ++ x86_hyper_type = h->type; + x86_init.hyper.init_platform(); + } +diff --git a/arch/x86/kernel/cpu/mshyperv.c 
b/arch/x86/kernel/cpu/mshyperv.c +index 6bb84d655e4b..85eb5fc180c8 100644 +--- a/arch/x86/kernel/cpu/mshyperv.c ++++ b/arch/x86/kernel/cpu/mshyperv.c +@@ -254,9 +254,9 @@ static void __init ms_hyperv_init_platform(void) + #endif + } + +-const __refconst struct hypervisor_x86 x86_hyper_ms_hyperv = { ++const __initconst struct hypervisor_x86 x86_hyper_ms_hyperv = { + .name = "Microsoft Hyper-V", + .detect = ms_hyperv_platform, ++ .type = X86_HYPER_MS_HYPERV, + .init.init_platform = ms_hyperv_init_platform, + }; +-EXPORT_SYMBOL(x86_hyper_ms_hyperv); +diff --git a/arch/x86/kernel/cpu/vmware.c b/arch/x86/kernel/cpu/vmware.c +index 4804c1d063c8..8e005329648b 100644 +--- a/arch/x86/kernel/cpu/vmware.c ++++ b/arch/x86/kernel/cpu/vmware.c +@@ -205,10 +205,10 @@ static bool __init vmware_legacy_x2apic_available(void) + (eax & (1 << VMWARE_PORT_CMD_LEGACY_X2APIC)) != 0; + } + +-const __refconst struct hypervisor_x86 x86_hyper_vmware = { ++const __initconst struct hypervisor_x86 x86_hyper_vmware = { + .name = "VMware", + .detect = vmware_platform, ++ .type = X86_HYPER_VMWARE, + .init.init_platform = vmware_platform_setup, + .init.x2apic_available = vmware_legacy_x2apic_available, + }; +-EXPORT_SYMBOL(x86_hyper_vmware); +diff --git a/arch/x86/kernel/kvm.c b/arch/x86/kernel/kvm.c +index 9dca8437c795..a94de09edbed 100644 +--- a/arch/x86/kernel/kvm.c ++++ b/arch/x86/kernel/kvm.c +@@ -544,12 +544,12 @@ static uint32_t __init kvm_detect(void) + return kvm_cpuid_base(); + } + +-const struct hypervisor_x86 x86_hyper_kvm __refconst = { ++const __initconst struct hypervisor_x86 x86_hyper_kvm = { + .name = "KVM", + .detect = kvm_detect, ++ .type = X86_HYPER_KVM, + .init.x2apic_available = kvm_para_available, + }; +-EXPORT_SYMBOL_GPL(x86_hyper_kvm); + + static __init int activate_jump_labels(void) + { +diff --git a/arch/x86/xen/enlighten_hvm.c b/arch/x86/xen/enlighten_hvm.c +index 7b1622089f96..754d5391d9fa 100644 +--- a/arch/x86/xen/enlighten_hvm.c ++++ b/arch/x86/xen/enlighten_hvm.c +@@ 
-226,12 +226,12 @@ static uint32_t __init xen_platform_hvm(void) + return xen_cpuid_base(); + } + +-const struct hypervisor_x86 x86_hyper_xen_hvm = { ++const __initconst struct hypervisor_x86 x86_hyper_xen_hvm = { + .name = "Xen HVM", + .detect = xen_platform_hvm, ++ .type = X86_HYPER_XEN_HVM, + .init.init_platform = xen_hvm_guest_init, + .init.x2apic_available = xen_x2apic_para_available, + .init.init_mem_mapping = xen_hvm_init_mem_mapping, + .runtime.pin_vcpu = xen_pin_vcpu, + }; +-EXPORT_SYMBOL(x86_hyper_xen_hvm); +diff --git a/arch/x86/xen/enlighten_pv.c b/arch/x86/xen/enlighten_pv.c +index 69d1a7054ddb..168efb2534c0 100644 +--- a/arch/x86/xen/enlighten_pv.c ++++ b/arch/x86/xen/enlighten_pv.c +@@ -1460,9 +1460,9 @@ static uint32_t __init xen_platform_pv(void) + return 0; + } + +-const struct hypervisor_x86 x86_hyper_xen_pv = { ++const __initconst struct hypervisor_x86 x86_hyper_xen_pv = { + .name = "Xen PV", + .detect = xen_platform_pv, ++ .type = X86_HYPER_XEN_PV, + .runtime.pin_vcpu = xen_pin_vcpu, + }; +-EXPORT_SYMBOL(x86_hyper_xen_pv); +diff --git a/drivers/hv/vmbus_drv.c b/drivers/hv/vmbus_drv.c +index 937801ac2fe0..2cd134dd94d2 100644 +--- a/drivers/hv/vmbus_drv.c ++++ b/drivers/hv/vmbus_drv.c +@@ -1534,7 +1534,7 @@ static int __init hv_acpi_init(void) + { + int ret, t; + +- if (x86_hyper != &x86_hyper_ms_hyperv) ++ if (x86_hyper_type != X86_HYPER_MS_HYPERV) + return -ENODEV; + + init_completion(&probe_event); +diff --git a/drivers/input/mouse/vmmouse.c b/drivers/input/mouse/vmmouse.c +index 0f586780ceb4..1ae5c1ef3f5b 100644 +--- a/drivers/input/mouse/vmmouse.c ++++ b/drivers/input/mouse/vmmouse.c +@@ -316,11 +316,9 @@ static int vmmouse_enable(struct psmouse *psmouse) + /* + * Array of supported hypervisors. 
+ */ +-static const struct hypervisor_x86 *vmmouse_supported_hypervisors[] = { +- &x86_hyper_vmware, +-#ifdef CONFIG_KVM_GUEST +- &x86_hyper_kvm, +-#endif ++static enum x86_hypervisor_type vmmouse_supported_hypervisors[] = { ++ X86_HYPER_VMWARE, ++ X86_HYPER_KVM, + }; + + /** +@@ -331,7 +329,7 @@ static bool vmmouse_check_hypervisor(void) + int i; + + for (i = 0; i < ARRAY_SIZE(vmmouse_supported_hypervisors); i++) +- if (vmmouse_supported_hypervisors[i] == x86_hyper) ++ if (vmmouse_supported_hypervisors[i] == x86_hyper_type) + return true; + + return false; +diff --git a/drivers/misc/vmw_balloon.c b/drivers/misc/vmw_balloon.c +index 1e688bfec567..9047c0a529b2 100644 +--- a/drivers/misc/vmw_balloon.c ++++ b/drivers/misc/vmw_balloon.c +@@ -1271,7 +1271,7 @@ static int __init vmballoon_init(void) + * Check if we are running on VMware's hypervisor and bail out + * if we are not. + */ +- if (x86_hyper != &x86_hyper_vmware) ++ if (x86_hyper_type != X86_HYPER_VMWARE) + return -ENODEV; + + for (is_2m_pages = 0; is_2m_pages < VMW_BALLOON_NUM_PAGE_SIZES; +-- +2.15.0 + diff --git a/queue/x86-virt-x86-platform-Merge-struct-x86_hyper-into-st.patch b/queue/x86-virt-x86-platform-Merge-struct-x86_hyper-into-st.patch new file mode 100644 index 0000000..d9c7be4 --- /dev/null +++ b/queue/x86-virt-x86-platform-Merge-struct-x86_hyper-into-st.patch @@ -0,0 +1,384 @@ +From f72e38e8ec8869ac0ba5a75d7d2f897d98a1454e Mon Sep 17 00:00:00 2001 +From: Juergen Gross <jgross@suse.com> +Date: Thu, 9 Nov 2017 14:27:35 +0100 +Subject: [PATCH] x86/virt, x86/platform: Merge 'struct x86_hyper' into 'struct + x86_platform' and 'struct x86_init' + +commit f72e38e8ec8869ac0ba5a75d7d2f897d98a1454e upstream. + +Instead of x86_hyper being either NULL on bare metal or a pointer to a +struct hypervisor_x86 in case of the kernel running as a guest merge +the struct into x86_platform and x86_init. + +This will remove the need for wrappers making it hard to find out what +is being called. 
With dummy functions added for all callbacks testing +for a NULL function pointer can be removed, too. + +Suggested-by: Ingo Molnar <mingo@kernel.org> +Signed-off-by: Juergen Gross <jgross@suse.com> +Acked-by: Thomas Gleixner <tglx@linutronix.de> +Cc: Linus Torvalds <torvalds@linux-foundation.org> +Cc: Peter Zijlstra <peterz@infradead.org> +Cc: akataria@vmware.com +Cc: boris.ostrovsky@oracle.com +Cc: devel@linuxdriverproject.org +Cc: haiyangz@microsoft.com +Cc: kvm@vger.kernel.org +Cc: kys@microsoft.com +Cc: pbonzini@redhat.com +Cc: rkrcmar@redhat.com +Cc: rusty@rustcorp.com.au +Cc: sthemmin@microsoft.com +Cc: virtualization@lists.linux-foundation.org +Cc: xen-devel@lists.xenproject.org +Link: http://lkml.kernel.org/r/20171109132739.23465-2-jgross@suse.com +Signed-off-by: Ingo Molnar <mingo@kernel.org> + +diff --git a/arch/x86/include/asm/hypervisor.h b/arch/x86/include/asm/hypervisor.h +index 0ead9dbb9130..0eca7239a7aa 100644 +--- a/arch/x86/include/asm/hypervisor.h ++++ b/arch/x86/include/asm/hypervisor.h +@@ -23,6 +23,7 @@ + #ifdef CONFIG_HYPERVISOR_GUEST + + #include <asm/kvm_para.h> ++#include <asm/x86_init.h> + #include <asm/xen/hypervisor.h> + + /* +@@ -35,17 +36,11 @@ struct hypervisor_x86 { + /* Detection routine */ + uint32_t (*detect)(void); + +- /* Platform setup (run once per boot) */ +- void (*init_platform)(void); ++ /* init time callbacks */ ++ struct x86_hyper_init init; + +- /* X2APIC detection (run once per boot) */ +- bool (*x2apic_available)(void); +- +- /* pin current vcpu to specified physical cpu (run rarely) */ +- void (*pin_vcpu)(int); +- +- /* called during init_mem_mapping() to setup early mappings. 
*/ +- void (*init_mem_mapping)(void); ++ /* runtime callbacks */ ++ struct x86_hyper_runtime runtime; + }; + + extern const struct hypervisor_x86 *x86_hyper; +@@ -58,17 +53,7 @@ extern const struct hypervisor_x86 x86_hyper_xen_hvm; + extern const struct hypervisor_x86 x86_hyper_kvm; + + extern void init_hypervisor_platform(void); +-extern bool hypervisor_x2apic_available(void); +-extern void hypervisor_pin_vcpu(int cpu); +- +-static inline void hypervisor_init_mem_mapping(void) +-{ +- if (x86_hyper && x86_hyper->init_mem_mapping) +- x86_hyper->init_mem_mapping(); +-} + #else + static inline void init_hypervisor_platform(void) { } +-static inline bool hypervisor_x2apic_available(void) { return false; } +-static inline void hypervisor_init_mem_mapping(void) { } + #endif /* CONFIG_HYPERVISOR_GUEST */ + #endif /* _ASM_X86_HYPERVISOR_H */ +diff --git a/arch/x86/include/asm/x86_init.h b/arch/x86/include/asm/x86_init.h +index 8a1ebf9540dd..ad15a0fda917 100644 +--- a/arch/x86/include/asm/x86_init.h ++++ b/arch/x86/include/asm/x86_init.h +@@ -114,6 +114,18 @@ struct x86_init_pci { + void (*fixup_irqs)(void); + }; + ++/** ++ * struct x86_hyper_init - x86 hypervisor init functions ++ * @init_platform: platform setup ++ * @x2apic_available: X2APIC detection ++ * @init_mem_mapping: setup early mappings during init_mem_mapping() ++ */ ++struct x86_hyper_init { ++ void (*init_platform)(void); ++ bool (*x2apic_available)(void); ++ void (*init_mem_mapping)(void); ++}; ++ + /** + * struct x86_init_ops - functions for platform specific setup + * +@@ -127,6 +139,7 @@ struct x86_init_ops { + struct x86_init_timers timers; + struct x86_init_iommu iommu; + struct x86_init_pci pci; ++ struct x86_hyper_init hyper; + }; + + /** +@@ -199,6 +212,15 @@ struct x86_legacy_features { + struct x86_legacy_devices devices; + }; + ++/** ++ * struct x86_hyper_runtime - x86 hypervisor specific runtime callbacks ++ * ++ * @pin_vcpu: pin current vcpu to specified physical cpu (run rarely) ++ */ ++struct 
x86_hyper_runtime { ++ void (*pin_vcpu)(int cpu); ++}; ++ + /** + * struct x86_platform_ops - platform specific runtime functions + * @calibrate_cpu: calibrate CPU +@@ -218,6 +240,7 @@ struct x86_legacy_features { + * possible in x86_early_init_platform_quirks() by + * only using the current x86_hardware_subarch + * semantics. ++ * @hyper: x86 hypervisor specific runtime callbacks + */ + struct x86_platform_ops { + unsigned long (*calibrate_cpu)(void); +@@ -233,6 +256,7 @@ struct x86_platform_ops { + void (*apic_post_init)(void); + struct x86_legacy_features legacy; + void (*set_legacy_features)(void); ++ struct x86_hyper_runtime hyper; + }; + + struct pci_dev; +diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c +index ff891772c9f8..89c7c8569e5e 100644 +--- a/arch/x86/kernel/apic/apic.c ++++ b/arch/x86/kernel/apic/apic.c +@@ -1645,7 +1645,7 @@ static __init void try_to_enable_x2apic(int remap_mode) + * under KVM + */ + if (max_physical_apicid > 255 || +- !hypervisor_x2apic_available()) { ++ !x86_init.hyper.x2apic_available()) { + pr_info("x2apic: IRQ remapping doesn't support X2APIC mode\n"); + x2apic_disable(); + return; +diff --git a/arch/x86/kernel/cpu/hypervisor.c b/arch/x86/kernel/cpu/hypervisor.c +index 4fa90006ac68..22226c1bf092 100644 +--- a/arch/x86/kernel/cpu/hypervisor.c ++++ b/arch/x86/kernel/cpu/hypervisor.c +@@ -44,51 +44,49 @@ static const __initconst struct hypervisor_x86 * const hypervisors[] = + const struct hypervisor_x86 *x86_hyper; + EXPORT_SYMBOL(x86_hyper); + +-static inline void __init ++static inline const struct hypervisor_x86 * __init + detect_hypervisor_vendor(void) + { +- const struct hypervisor_x86 *h, * const *p; ++ const struct hypervisor_x86 *h = NULL, * const *p; + uint32_t pri, max_pri = 0; + + for (p = hypervisors; p < hypervisors + ARRAY_SIZE(hypervisors); p++) { +- h = *p; +- pri = h->detect(); +- if (pri != 0 && pri > max_pri) { ++ pri = (*p)->detect(); ++ if (pri > max_pri) { + max_pri = pri; +- x86_hyper 
= h; ++ h = *p; + } + } + +- if (max_pri) +- pr_info("Hypervisor detected: %s\n", x86_hyper->name); ++ if (h) ++ pr_info("Hypervisor detected: %s\n", h->name); ++ ++ return h; + } + +-void __init init_hypervisor_platform(void) ++static void __init copy_array(const void *src, void *target, unsigned int size) + { ++ unsigned int i, n = size / sizeof(void *); ++ const void * const *from = (const void * const *)src; ++ const void **to = (const void **)target; + +- detect_hypervisor_vendor(); +- +- if (!x86_hyper) +- return; +- +- if (x86_hyper->init_platform) +- x86_hyper->init_platform(); ++ for (i = 0; i < n; i++) ++ if (from[i]) ++ to[i] = from[i]; + } + +-bool __init hypervisor_x2apic_available(void) ++void __init init_hypervisor_platform(void) + { +- return x86_hyper && +- x86_hyper->x2apic_available && +- x86_hyper->x2apic_available(); +-} ++ const struct hypervisor_x86 *h; + +-void hypervisor_pin_vcpu(int cpu) +-{ +- if (!x86_hyper) ++ h = detect_hypervisor_vendor(); ++ ++ if (!h) + return; + +- if (x86_hyper->pin_vcpu) +- x86_hyper->pin_vcpu(cpu); +- else +- WARN_ONCE(1, "vcpu pinning requested but not supported!\n"); ++ copy_array(&h->init, &x86_init.hyper, sizeof(h->init)); ++ copy_array(&h->runtime, &x86_platform.hyper, sizeof(h->runtime)); ++ ++ x86_hyper = h; ++ x86_init.hyper.init_platform(); + } +diff --git a/arch/x86/kernel/cpu/mshyperv.c b/arch/x86/kernel/cpu/mshyperv.c +index 236324e83a3a..6bb84d655e4b 100644 +--- a/arch/x86/kernel/cpu/mshyperv.c ++++ b/arch/x86/kernel/cpu/mshyperv.c +@@ -257,6 +257,6 @@ static void __init ms_hyperv_init_platform(void) + const __refconst struct hypervisor_x86 x86_hyper_ms_hyperv = { + .name = "Microsoft Hyper-V", + .detect = ms_hyperv_platform, +- .init_platform = ms_hyperv_init_platform, ++ .init.init_platform = ms_hyperv_init_platform, + }; + EXPORT_SYMBOL(x86_hyper_ms_hyperv); +diff --git a/arch/x86/kernel/cpu/vmware.c b/arch/x86/kernel/cpu/vmware.c +index 40ed26852ebd..4804c1d063c8 100644 +--- 
a/arch/x86/kernel/cpu/vmware.c ++++ b/arch/x86/kernel/cpu/vmware.c +@@ -208,7 +208,7 @@ static bool __init vmware_legacy_x2apic_available(void) + const __refconst struct hypervisor_x86 x86_hyper_vmware = { + .name = "VMware", + .detect = vmware_platform, +- .init_platform = vmware_platform_setup, +- .x2apic_available = vmware_legacy_x2apic_available, ++ .init.init_platform = vmware_platform_setup, ++ .init.x2apic_available = vmware_legacy_x2apic_available, + }; + EXPORT_SYMBOL(x86_hyper_vmware); +diff --git a/arch/x86/kernel/kvm.c b/arch/x86/kernel/kvm.c +index 8bb9594d0761..9dca8437c795 100644 +--- a/arch/x86/kernel/kvm.c ++++ b/arch/x86/kernel/kvm.c +@@ -547,7 +547,7 @@ static uint32_t __init kvm_detect(void) + const struct hypervisor_x86 x86_hyper_kvm __refconst = { + .name = "KVM", + .detect = kvm_detect, +- .x2apic_available = kvm_para_available, ++ .init.x2apic_available = kvm_para_available, + }; + EXPORT_SYMBOL_GPL(x86_hyper_kvm); + +diff --git a/arch/x86/kernel/x86_init.c b/arch/x86/kernel/x86_init.c +index a088b2c47f73..5b2d10c1973a 100644 +--- a/arch/x86/kernel/x86_init.c ++++ b/arch/x86/kernel/x86_init.c +@@ -28,6 +28,8 @@ void x86_init_noop(void) { } + void __init x86_init_uint_noop(unsigned int unused) { } + int __init iommu_init_noop(void) { return 0; } + void iommu_shutdown_noop(void) { } ++bool __init bool_x86_init_noop(void) { return false; } ++void x86_op_int_noop(int cpu) { } + + /* + * The platform setup functions are preset with the default functions +@@ -81,6 +83,12 @@ struct x86_init_ops x86_init __initdata = { + .init_irq = x86_default_pci_init_irq, + .fixup_irqs = x86_default_pci_fixup_irqs, + }, ++ ++ .hyper = { ++ .init_platform = x86_init_noop, ++ .x2apic_available = bool_x86_init_noop, ++ .init_mem_mapping = x86_init_noop, ++ }, + }; + + struct x86_cpuinit_ops x86_cpuinit = { +@@ -101,6 +109,7 @@ struct x86_platform_ops x86_platform __ro_after_init = { + .get_nmi_reason = default_get_nmi_reason, + .save_sched_clock_state = 
tsc_save_sched_clock_state, + .restore_sched_clock_state = tsc_restore_sched_clock_state, ++ .hyper.pin_vcpu = x86_op_int_noop, + }; + + EXPORT_SYMBOL_GPL(x86_platform); +diff --git a/arch/x86/mm/init.c b/arch/x86/mm/init.c +index af5c1ed21d43..a22c2b95e513 100644 +--- a/arch/x86/mm/init.c ++++ b/arch/x86/mm/init.c +@@ -671,7 +671,7 @@ void __init init_mem_mapping(void) + load_cr3(swapper_pg_dir); + __flush_tlb_all(); + +- hypervisor_init_mem_mapping(); ++ x86_init.hyper.init_mem_mapping(); + + early_memtest(0, max_pfn_mapped << PAGE_SHIFT); + } +diff --git a/arch/x86/xen/enlighten_hvm.c b/arch/x86/xen/enlighten_hvm.c +index de503c225ae1..7b1622089f96 100644 +--- a/arch/x86/xen/enlighten_hvm.c ++++ b/arch/x86/xen/enlighten_hvm.c +@@ -229,9 +229,9 @@ static uint32_t __init xen_platform_hvm(void) + const struct hypervisor_x86 x86_hyper_xen_hvm = { + .name = "Xen HVM", + .detect = xen_platform_hvm, +- .init_platform = xen_hvm_guest_init, +- .pin_vcpu = xen_pin_vcpu, +- .x2apic_available = xen_x2apic_para_available, +- .init_mem_mapping = xen_hvm_init_mem_mapping, ++ .init.init_platform = xen_hvm_guest_init, ++ .init.x2apic_available = xen_x2apic_para_available, ++ .init.init_mem_mapping = xen_hvm_init_mem_mapping, ++ .runtime.pin_vcpu = xen_pin_vcpu, + }; + EXPORT_SYMBOL(x86_hyper_xen_hvm); +diff --git a/arch/x86/xen/enlighten_pv.c b/arch/x86/xen/enlighten_pv.c +index d4396e27b1fb..69d1a7054ddb 100644 +--- a/arch/x86/xen/enlighten_pv.c ++++ b/arch/x86/xen/enlighten_pv.c +@@ -1463,6 +1463,6 @@ static uint32_t __init xen_platform_pv(void) + const struct hypervisor_x86 x86_hyper_xen_pv = { + .name = "Xen PV", + .detect = xen_platform_pv, +- .pin_vcpu = xen_pin_vcpu, ++ .runtime.pin_vcpu = xen_pin_vcpu, + }; + EXPORT_SYMBOL(x86_hyper_xen_pv); +diff --git a/include/linux/hypervisor.h b/include/linux/hypervisor.h +index b4054fd5b6f6..b19563f9a8eb 100644 +--- a/include/linux/hypervisor.h ++++ b/include/linux/hypervisor.h +@@ -7,8 +7,12 @@ + * Juergen Gross <jgross@suse.com> 
+ */ + +-#ifdef CONFIG_HYPERVISOR_GUEST +-#include <asm/hypervisor.h> ++#ifdef CONFIG_X86 ++#include <asm/x86_init.h> ++static inline void hypervisor_pin_vcpu(int cpu) ++{ ++ x86_platform.hyper.pin_vcpu(cpu); ++} + #else + static inline void hypervisor_pin_vcpu(int cpu) + { +-- +2.15.0 + diff --git a/queue/x86-xen-64-x86-entry-64-Clean-up-SP-code-in-cpu_init.patch b/queue/x86-xen-64-x86-entry-64-Clean-up-SP-code-in-cpu_init.patch new file mode 100644 index 0000000..2b4ac9a --- /dev/null +++ b/queue/x86-xen-64-x86-entry-64-Clean-up-SP-code-in-cpu_init.patch @@ -0,0 +1,88 @@ +From f16b3da1dc936c0f8121741d0a1731bf242f2f56 Mon Sep 17 00:00:00 2001 +From: Andy Lutomirski <luto@kernel.org> +Date: Thu, 2 Nov 2017 00:59:12 -0700 +Subject: [PATCH] x86/xen/64, x86/entry/64: Clean up SP code in + cpu_initialize_context() + +commit f16b3da1dc936c0f8121741d0a1731bf242f2f56 upstream. + +I'm removing thread_struct::sp0, and Xen's usage of it is slightly +dubious and unnecessary. Use appropriate helpers instead. + +While we're at at, reorder the code slightly to make it more obvious +what's going on. + +Signed-off-by: Andy Lutomirski <luto@kernel.org> +Reviewed-by: Juergen Gross <jgross@suse.com> +Cc: Boris Ostrovsky <boris.ostrovsky@oracle.com> +Cc: Borislav Petkov <bpetkov@suse.de> +Cc: Brian Gerst <brgerst@gmail.com> +Cc: Dave Hansen <dave.hansen@intel.com> +Cc: Juergen Gross <jgross@suse.com> +Cc: Linus Torvalds <torvalds@linux-foundation.org> +Cc: Peter Zijlstra <peterz@infradead.org> +Cc: Thomas Gleixner <tglx@linutronix.de> +Link: http://lkml.kernel.org/r/d5b9a3da2b47c68325bd2bbe8f82d9554dee0d0f.1509609304.git.luto@kernel.org +Signed-off-by: Ingo Molnar <mingo@kernel.org> + +diff --git a/arch/x86/xen/smp_pv.c b/arch/x86/xen/smp_pv.c +index 51471408fdd1..8c0e047d0b80 100644 +--- a/arch/x86/xen/smp_pv.c ++++ b/arch/x86/xen/smp_pv.c +@@ -13,6 +13,7 @@ + * single-threaded. 
+ */ + #include <linux/sched.h> ++#include <linux/sched/task_stack.h> + #include <linux/err.h> + #include <linux/slab.h> + #include <linux/smp.h> +@@ -293,12 +294,19 @@ cpu_initialize_context(unsigned int cpu, struct task_struct *idle) + #endif + memset(&ctxt->fpu_ctxt, 0, sizeof(ctxt->fpu_ctxt)); + ++ /* ++ * Bring up the CPU in cpu_bringup_and_idle() with the stack ++ * pointing just below where pt_regs would be if it were a normal ++ * kernel entry. ++ */ + ctxt->user_regs.eip = (unsigned long)cpu_bringup_and_idle; + ctxt->flags = VGCF_IN_KERNEL; + ctxt->user_regs.eflags = 0x1000; /* IOPL_RING1 */ + ctxt->user_regs.ds = __USER_DS; + ctxt->user_regs.es = __USER_DS; + ctxt->user_regs.ss = __KERNEL_DS; ++ ctxt->user_regs.cs = __KERNEL_CS; ++ ctxt->user_regs.esp = (unsigned long)task_pt_regs(idle); + + xen_copy_trap_info(ctxt->trap_ctxt); + +@@ -313,8 +321,13 @@ cpu_initialize_context(unsigned int cpu, struct task_struct *idle) + ctxt->gdt_frames[0] = gdt_mfn; + ctxt->gdt_ents = GDT_ENTRIES; + ++ /* ++ * Set SS:SP that Xen will use when entering guest kernel mode ++ * from guest user mode. Subsequent calls to load_sp0() can ++ * change this value. 
++ */ + ctxt->kernel_ss = __KERNEL_DS; +- ctxt->kernel_sp = idle->thread.sp0; ++ ctxt->kernel_sp = task_top_of_stack(idle); + + #ifdef CONFIG_X86_32 + ctxt->event_callback_cs = __KERNEL_CS; +@@ -326,10 +339,8 @@ cpu_initialize_context(unsigned int cpu, struct task_struct *idle) + (unsigned long)xen_hypervisor_callback; + ctxt->failsafe_callback_eip = + (unsigned long)xen_failsafe_callback; +- ctxt->user_regs.cs = __KERNEL_CS; + per_cpu(xen_cr3, cpu) = __pa(swapper_pg_dir); + +- ctxt->user_regs.esp = idle->thread.sp0 - sizeof(struct pt_regs); + ctxt->ctrlreg[3] = xen_pfn_to_cr3(virt_to_gfn(swapper_pg_dir)); + if (HYPERVISOR_vcpu_op(VCPUOP_initialise, xen_vcpu_nr(cpu), ctxt)) + BUG(); +-- +2.15.0 + diff --git a/queue/x86-xen-Add-unwind-hint-annotations.patch b/queue/x86-xen-Add-unwind-hint-annotations.patch new file mode 100644 index 0000000..a8292fc --- /dev/null +++ b/queue/x86-xen-Add-unwind-hint-annotations.patch @@ -0,0 +1,59 @@ +From abbe1cac6214d81d2f4e149aba64a8760703144e Mon Sep 17 00:00:00 2001 +From: Josh Poimboeuf <jpoimboe@redhat.com> +Date: Mon, 18 Sep 2017 21:43:36 -0500 +Subject: [PATCH] x86/xen: Add unwind hint annotations + +commit abbe1cac6214d81d2f4e149aba64a8760703144e upstream. + +Add unwind hint annotations to the xen head code so the ORC unwinder can +read head_64.o. + +hypercall_page needs empty annotations at 32-byte intervals to match the +'xen_hypercall_*' ELF functions at those locations. 
+ +Signed-off-by: Josh Poimboeuf <jpoimboe@redhat.com> +Cc: Andy Lutomirski <luto@kernel.org> +Cc: Boris Ostrovsky <boris.ostrovsky@oracle.com> +Cc: Jiri Slaby <jslaby@suse.cz> +Cc: Juergen Gross <jgross@suse.com> +Cc: Linus Torvalds <torvalds@linux-foundation.org> +Cc: Peter Zijlstra <peterz@infradead.org> +Cc: Thomas Gleixner <tglx@linutronix.de> +Link: http://lkml.kernel.org/r/70ed2eb516fe9266be766d953f93c2571bca88cc.1505764066.git.jpoimboe@redhat.com +Signed-off-by: Ingo Molnar <mingo@kernel.org> + +diff --git a/arch/x86/xen/xen-head.S b/arch/x86/xen/xen-head.S +index 9753225289e8..124941d09b2b 100644 +--- a/arch/x86/xen/xen-head.S ++++ b/arch/x86/xen/xen-head.S +@@ -9,6 +9,7 @@ + #include <asm/boot.h> + #include <asm/asm.h> + #include <asm/page_types.h> ++#include <asm/unwind_hints.h> + + #include <xen/interface/elfnote.h> + #include <xen/interface/features.h> +@@ -19,6 +20,7 @@ + #ifdef CONFIG_XEN_PV + __INIT + ENTRY(startup_xen) ++ UNWIND_HINT_EMPTY + cld + + /* Clear .bss */ +@@ -40,7 +42,10 @@ END(startup_xen) + .pushsection .text + .balign PAGE_SIZE + ENTRY(hypercall_page) +- .skip PAGE_SIZE ++ .rept (PAGE_SIZE / 32) ++ UNWIND_HINT_EMPTY ++ .skip 32 ++ .endr + + #define HYPERCALL(n) \ + .equ xen_hypercall_##n, hypercall_page + __HYPERVISOR_##n * 32; \ +-- +2.15.0 + diff --git a/queue/x86-xen-Drop-5-level-paging-support-code-from-the-XE.patch b/queue/x86-xen-Drop-5-level-paging-support-code-from-the-XE.patch new file mode 100644 index 0000000..da50f32 --- /dev/null +++ b/queue/x86-xen-Drop-5-level-paging-support-code-from-the-XE.patch @@ -0,0 +1,305 @@ +From 773dd2fca581b0a80e5a33332cc8ee67e5a79cba Mon Sep 17 00:00:00 2001 +From: "Kirill A. Shutemov" <kirill.shutemov@linux.intel.com> +Date: Fri, 29 Sep 2017 17:08:20 +0300 +Subject: [PATCH] x86/xen: Drop 5-level paging support code from the XEN_PV + code + +commit 773dd2fca581b0a80e5a33332cc8ee67e5a79cba upstream. + +It was decided 5-level paging is not going to be supported in XEN_PV. 
+ +Let's drop the dead code from the XEN_PV code. + +Tested-by: Juergen Gross <jgross@suse.com> +Signed-off-by: Kirill A. Shutemov <kirill.shutemov@linux.intel.com> +Reviewed-by: Juergen Gross <jgross@suse.com> +Cc: Andrew Morton <akpm@linux-foundation.org> +Cc: Andy Lutomirski <luto@amacapital.net> +Cc: Borislav Petkov <bp@suse.de> +Cc: Cyrill Gorcunov <gorcunov@openvz.org> +Cc: Linus Torvalds <torvalds@linux-foundation.org> +Cc: Peter Zijlstra <peterz@infradead.org> +Cc: Thomas Gleixner <tglx@linutronix.de> +Cc: linux-mm@kvack.org +Link: http://lkml.kernel.org/r/20170929140821.37654-6-kirill.shutemov@linux.intel.com +Signed-off-by: Ingo Molnar <mingo@kernel.org> + +diff --git a/arch/x86/xen/mmu_pv.c b/arch/x86/xen/mmu_pv.c +index 71495f1a86d7..2ccdaba31a07 100644 +--- a/arch/x86/xen/mmu_pv.c ++++ b/arch/x86/xen/mmu_pv.c +@@ -449,7 +449,7 @@ __visible pmd_t xen_make_pmd(pmdval_t pmd) + } + PV_CALLEE_SAVE_REGS_THUNK(xen_make_pmd); + +-#if CONFIG_PGTABLE_LEVELS == 4 ++#ifdef CONFIG_X86_64 + __visible pudval_t xen_pud_val(pud_t pud) + { + return pte_mfn_to_pfn(pud.pud); +@@ -538,7 +538,7 @@ static void xen_set_p4d(p4d_t *ptr, p4d_t val) + + xen_mc_issue(PARAVIRT_LAZY_MMU); + } +-#endif /* CONFIG_PGTABLE_LEVELS == 4 */ ++#endif /* CONFIG_X86_64 */ + + static int xen_pmd_walk(struct mm_struct *mm, pmd_t *pmd, + int (*func)(struct mm_struct *mm, struct page *, enum pt_level), +@@ -580,21 +580,17 @@ static int xen_p4d_walk(struct mm_struct *mm, p4d_t *p4d, + int (*func)(struct mm_struct *mm, struct page *, enum pt_level), + bool last, unsigned long limit) + { +- int i, nr, flush = 0; ++ int flush = 0; ++ pud_t *pud; + +- nr = last ? 
p4d_index(limit) + 1 : PTRS_PER_P4D; +- for (i = 0; i < nr; i++) { +- pud_t *pud; + +- if (p4d_none(p4d[i])) +- continue; ++ if (p4d_none(*p4d)) ++ return flush; + +- pud = pud_offset(&p4d[i], 0); +- if (PTRS_PER_PUD > 1) +- flush |= (*func)(mm, virt_to_page(pud), PT_PUD); +- flush |= xen_pud_walk(mm, pud, func, +- last && i == nr - 1, limit); +- } ++ pud = pud_offset(p4d, 0); ++ if (PTRS_PER_PUD > 1) ++ flush |= (*func)(mm, virt_to_page(pud), PT_PUD); ++ flush |= xen_pud_walk(mm, pud, func, last, limit); + return flush; + } + +@@ -644,8 +640,6 @@ static int __xen_pgd_walk(struct mm_struct *mm, pgd_t *pgd, + continue; + + p4d = p4d_offset(&pgd[i], 0); +- if (PTRS_PER_P4D > 1) +- flush |= (*func)(mm, virt_to_page(p4d), PT_P4D); + flush |= xen_p4d_walk(mm, p4d, func, i == nr - 1, limit); + } + +@@ -1176,22 +1170,14 @@ static void __init xen_cleanmfnmap(unsigned long vaddr) + { + pgd_t *pgd; + p4d_t *p4d; +- unsigned int i; + bool unpin; + + unpin = (vaddr == 2 * PGDIR_SIZE); + vaddr &= PMD_MASK; + pgd = pgd_offset_k(vaddr); + p4d = p4d_offset(pgd, 0); +- for (i = 0; i < PTRS_PER_P4D; i++) { +- if (p4d_none(p4d[i])) +- continue; +- xen_cleanmfnmap_p4d(p4d + i, unpin); +- } +- if (IS_ENABLED(CONFIG_X86_5LEVEL)) { +- set_pgd(pgd, __pgd(0)); +- xen_cleanmfnmap_free_pgtbl(p4d, unpin); +- } ++ if (!p4d_none(*p4d)) ++ xen_cleanmfnmap_p4d(p4d, unpin); + } + + static void __init xen_pagetable_p2m_free(void) +@@ -1692,7 +1678,7 @@ static void xen_release_pmd(unsigned long pfn) + xen_release_ptpage(pfn, PT_PMD); + } + +-#if CONFIG_PGTABLE_LEVELS >= 4 ++#ifdef CONFIG_X86_64 + static void xen_alloc_pud(struct mm_struct *mm, unsigned long pfn) + { + xen_alloc_ptpage(mm, pfn, PT_PUD); +@@ -2029,13 +2015,12 @@ static phys_addr_t __init xen_early_virt_to_phys(unsigned long vaddr) + */ + void __init xen_relocate_p2m(void) + { +- phys_addr_t size, new_area, pt_phys, pmd_phys, pud_phys, p4d_phys; ++ phys_addr_t size, new_area, pt_phys, pmd_phys, pud_phys; + unsigned long p2m_pfn, 
p2m_pfn_end, n_frames, pfn, pfn_end; +- int n_pte, n_pt, n_pmd, n_pud, n_p4d, idx_pte, idx_pt, idx_pmd, idx_pud, idx_p4d; ++ int n_pte, n_pt, n_pmd, n_pud, idx_pte, idx_pt, idx_pmd, idx_pud; + pte_t *pt; + pmd_t *pmd; + pud_t *pud; +- p4d_t *p4d = NULL; + pgd_t *pgd; + unsigned long *new_p2m; + int save_pud; +@@ -2045,11 +2030,7 @@ void __init xen_relocate_p2m(void) + n_pt = roundup(size, PMD_SIZE) >> PMD_SHIFT; + n_pmd = roundup(size, PUD_SIZE) >> PUD_SHIFT; + n_pud = roundup(size, P4D_SIZE) >> P4D_SHIFT; +- if (PTRS_PER_P4D > 1) +- n_p4d = roundup(size, PGDIR_SIZE) >> PGDIR_SHIFT; +- else +- n_p4d = 0; +- n_frames = n_pte + n_pt + n_pmd + n_pud + n_p4d; ++ n_frames = n_pte + n_pt + n_pmd + n_pud; + + new_area = xen_find_free_area(PFN_PHYS(n_frames)); + if (!new_area) { +@@ -2065,76 +2046,56 @@ void __init xen_relocate_p2m(void) + * To avoid any possible virtual address collision, just use + * 2 * PUD_SIZE for the new area. + */ +- p4d_phys = new_area; +- pud_phys = p4d_phys + PFN_PHYS(n_p4d); ++ pud_phys = new_area; + pmd_phys = pud_phys + PFN_PHYS(n_pud); + pt_phys = pmd_phys + PFN_PHYS(n_pmd); + p2m_pfn = PFN_DOWN(pt_phys) + n_pt; + + pgd = __va(read_cr3_pa()); + new_p2m = (unsigned long *)(2 * PGDIR_SIZE); +- idx_p4d = 0; + save_pud = n_pud; +- do { +- if (n_p4d > 0) { +- p4d = early_memremap(p4d_phys, PAGE_SIZE); +- clear_page(p4d); +- n_pud = min(save_pud, PTRS_PER_P4D); +- } +- for (idx_pud = 0; idx_pud < n_pud; idx_pud++) { +- pud = early_memremap(pud_phys, PAGE_SIZE); +- clear_page(pud); +- for (idx_pmd = 0; idx_pmd < min(n_pmd, PTRS_PER_PUD); +- idx_pmd++) { +- pmd = early_memremap(pmd_phys, PAGE_SIZE); +- clear_page(pmd); +- for (idx_pt = 0; idx_pt < min(n_pt, PTRS_PER_PMD); +- idx_pt++) { +- pt = early_memremap(pt_phys, PAGE_SIZE); +- clear_page(pt); +- for (idx_pte = 0; +- idx_pte < min(n_pte, PTRS_PER_PTE); +- idx_pte++) { +- set_pte(pt + idx_pte, +- pfn_pte(p2m_pfn, PAGE_KERNEL)); +- p2m_pfn++; +- } +- n_pte -= PTRS_PER_PTE; +- early_memunmap(pt, 
PAGE_SIZE); +- make_lowmem_page_readonly(__va(pt_phys)); +- pin_pagetable_pfn(MMUEXT_PIN_L1_TABLE, +- PFN_DOWN(pt_phys)); +- set_pmd(pmd + idx_pt, +- __pmd(_PAGE_TABLE | pt_phys)); +- pt_phys += PAGE_SIZE; ++ for (idx_pud = 0; idx_pud < n_pud; idx_pud++) { ++ pud = early_memremap(pud_phys, PAGE_SIZE); ++ clear_page(pud); ++ for (idx_pmd = 0; idx_pmd < min(n_pmd, PTRS_PER_PUD); ++ idx_pmd++) { ++ pmd = early_memremap(pmd_phys, PAGE_SIZE); ++ clear_page(pmd); ++ for (idx_pt = 0; idx_pt < min(n_pt, PTRS_PER_PMD); ++ idx_pt++) { ++ pt = early_memremap(pt_phys, PAGE_SIZE); ++ clear_page(pt); ++ for (idx_pte = 0; ++ idx_pte < min(n_pte, PTRS_PER_PTE); ++ idx_pte++) { ++ set_pte(pt + idx_pte, ++ pfn_pte(p2m_pfn, PAGE_KERNEL)); ++ p2m_pfn++; + } +- n_pt -= PTRS_PER_PMD; +- early_memunmap(pmd, PAGE_SIZE); +- make_lowmem_page_readonly(__va(pmd_phys)); +- pin_pagetable_pfn(MMUEXT_PIN_L2_TABLE, +- PFN_DOWN(pmd_phys)); +- set_pud(pud + idx_pmd, __pud(_PAGE_TABLE | pmd_phys)); +- pmd_phys += PAGE_SIZE; ++ n_pte -= PTRS_PER_PTE; ++ early_memunmap(pt, PAGE_SIZE); ++ make_lowmem_page_readonly(__va(pt_phys)); ++ pin_pagetable_pfn(MMUEXT_PIN_L1_TABLE, ++ PFN_DOWN(pt_phys)); ++ set_pmd(pmd + idx_pt, ++ __pmd(_PAGE_TABLE | pt_phys)); ++ pt_phys += PAGE_SIZE; + } +- n_pmd -= PTRS_PER_PUD; +- early_memunmap(pud, PAGE_SIZE); +- make_lowmem_page_readonly(__va(pud_phys)); +- pin_pagetable_pfn(MMUEXT_PIN_L3_TABLE, PFN_DOWN(pud_phys)); +- if (n_p4d > 0) +- set_p4d(p4d + idx_pud, __p4d(_PAGE_TABLE | pud_phys)); +- else +- set_pgd(pgd + 2 + idx_pud, __pgd(_PAGE_TABLE | pud_phys)); +- pud_phys += PAGE_SIZE; +- } +- if (n_p4d > 0) { +- save_pud -= PTRS_PER_P4D; +- early_memunmap(p4d, PAGE_SIZE); +- make_lowmem_page_readonly(__va(p4d_phys)); +- pin_pagetable_pfn(MMUEXT_PIN_L4_TABLE, PFN_DOWN(p4d_phys)); +- set_pgd(pgd + 2 + idx_p4d, __pgd(_PAGE_TABLE | p4d_phys)); +- p4d_phys += PAGE_SIZE; ++ n_pt -= PTRS_PER_PMD; ++ early_memunmap(pmd, PAGE_SIZE); ++ make_lowmem_page_readonly(__va(pmd_phys)); ++ 
pin_pagetable_pfn(MMUEXT_PIN_L2_TABLE, ++ PFN_DOWN(pmd_phys)); ++ set_pud(pud + idx_pmd, __pud(_PAGE_TABLE | pmd_phys)); ++ pmd_phys += PAGE_SIZE; + } +- } while (++idx_p4d < n_p4d); ++ n_pmd -= PTRS_PER_PUD; ++ early_memunmap(pud, PAGE_SIZE); ++ make_lowmem_page_readonly(__va(pud_phys)); ++ pin_pagetable_pfn(MMUEXT_PIN_L3_TABLE, PFN_DOWN(pud_phys)); ++ set_pgd(pgd + 2 + idx_pud, __pgd(_PAGE_TABLE | pud_phys)); ++ pud_phys += PAGE_SIZE; ++ } + + /* Now copy the old p2m info to the new area. */ + memcpy(new_p2m, xen_p2m_addr, size); +@@ -2361,7 +2322,7 @@ static void __init xen_post_allocator_init(void) + pv_mmu_ops.set_pte = xen_set_pte; + pv_mmu_ops.set_pmd = xen_set_pmd; + pv_mmu_ops.set_pud = xen_set_pud; +-#if CONFIG_PGTABLE_LEVELS >= 4 ++#ifdef CONFIG_X86_64 + pv_mmu_ops.set_p4d = xen_set_p4d; + #endif + +@@ -2371,7 +2332,7 @@ static void __init xen_post_allocator_init(void) + pv_mmu_ops.alloc_pmd = xen_alloc_pmd; + pv_mmu_ops.release_pte = xen_release_pte; + pv_mmu_ops.release_pmd = xen_release_pmd; +-#if CONFIG_PGTABLE_LEVELS >= 4 ++#ifdef CONFIG_X86_64 + pv_mmu_ops.alloc_pud = xen_alloc_pud; + pv_mmu_ops.release_pud = xen_release_pud; + #endif +@@ -2435,14 +2396,14 @@ static const struct pv_mmu_ops xen_mmu_ops __initconst = { + .make_pmd = PV_CALLEE_SAVE(xen_make_pmd), + .pmd_val = PV_CALLEE_SAVE(xen_pmd_val), + +-#if CONFIG_PGTABLE_LEVELS >= 4 ++#ifdef CONFIG_X86_64 + .pud_val = PV_CALLEE_SAVE(xen_pud_val), + .make_pud = PV_CALLEE_SAVE(xen_make_pud), + .set_p4d = xen_set_p4d_hyper, + + .alloc_pud = xen_alloc_pmd_init, + .release_pud = xen_release_pmd_init, +-#endif /* CONFIG_PGTABLE_LEVELS == 4 */ ++#endif /* CONFIG_X86_64 */ + + .activate_mm = xen_activate_mm, + .dup_mmap = xen_dup_mmap, +-- +2.15.0 + diff --git a/queue/x86-xen-Fix-xen-head-ELF-annotations.patch b/queue/x86-xen-Fix-xen-head-ELF-annotations.patch new file mode 100644 index 0000000..50f1759 --- /dev/null +++ b/queue/x86-xen-Fix-xen-head-ELF-annotations.patch @@ -0,0 +1,45 @@ +From 
2582d3df95c76d3b686453baf90b64d57e87d1e8 Mon Sep 17 00:00:00 2001 +From: Josh Poimboeuf <jpoimboe@redhat.com> +Date: Mon, 18 Sep 2017 21:43:35 -0500 +Subject: [PATCH] x86/xen: Fix xen head ELF annotations + +commit 2582d3df95c76d3b686453baf90b64d57e87d1e8 upstream. + +Mark the ends of the startup_xen and hypercall_page code sections. + +Signed-off-by: Josh Poimboeuf <jpoimboe@redhat.com> +Cc: Andy Lutomirski <luto@kernel.org> +Cc: Boris Ostrovsky <boris.ostrovsky@oracle.com> +Cc: Jiri Slaby <jslaby@suse.cz> +Cc: Juergen Gross <jgross@suse.com> +Cc: Linus Torvalds <torvalds@linux-foundation.org> +Cc: Peter Zijlstra <peterz@infradead.org> +Cc: Thomas Gleixner <tglx@linutronix.de> +Link: http://lkml.kernel.org/r/3a80a394d30af43d9cefa1a29628c45ed8420c97.1505764066.git.jpoimboe@redhat.com +Signed-off-by: Ingo Molnar <mingo@kernel.org> + +diff --git a/arch/x86/xen/xen-head.S b/arch/x86/xen/xen-head.S +index a7525e95d53f..9753225289e8 100644 +--- a/arch/x86/xen/xen-head.S ++++ b/arch/x86/xen/xen-head.S +@@ -33,7 +33,7 @@ ENTRY(startup_xen) + mov $init_thread_union+THREAD_SIZE, %_ASM_SP + + jmp xen_start_kernel +- ++END(startup_xen) + __FINIT + #endif + +@@ -47,7 +47,7 @@ ENTRY(hypercall_page) + .type xen_hypercall_##n, @function; .size xen_hypercall_##n, 32 + #include <asm/xen-hypercalls.h> + #undef HYPERCALL +- ++END(hypercall_page) + .popsection + + ELFNOTE(Xen, XEN_ELFNOTE_GUEST_OS, .asciz "linux") +-- +2.15.0 + diff --git a/queue/x86-xen-Provide-pre-built-page-tables-only-for-CONFI.patch b/queue/x86-xen-Provide-pre-built-page-tables-only-for-CONFI.patch new file mode 100644 index 0000000..4d01230 --- /dev/null +++ b/queue/x86-xen-Provide-pre-built-page-tables-only-for-CONFI.patch @@ -0,0 +1,69 @@ +From 4375c29985f155d7eb2346615d84e62d1b673682 Mon Sep 17 00:00:00 2001 +From: "Kirill A. 
Shutemov" <kirill.shutemov@linux.intel.com> +Date: Fri, 29 Sep 2017 17:08:19 +0300 +Subject: [PATCH] x86/xen: Provide pre-built page tables only for + CONFIG_XEN_PV=y and CONFIG_XEN_PVH=y + +commit 4375c29985f155d7eb2346615d84e62d1b673682 upstream. + +Looks like we only need pre-built page tables in the CONFIG_XEN_PV=y and +CONFIG_XEN_PVH=y cases. + +Let's not provide them for other configurations. + +Signed-off-by: Kirill A. Shutemov <kirill.shutemov@linux.intel.com> +Reviewed-by: Juergen Gross <jgross@suse.com> +Cc: Andrew Morton <akpm@linux-foundation.org> +Cc: Andy Lutomirski <luto@amacapital.net> +Cc: Borislav Petkov <bp@suse.de> +Cc: Cyrill Gorcunov <gorcunov@openvz.org> +Cc: Linus Torvalds <torvalds@linux-foundation.org> +Cc: Peter Zijlstra <peterz@infradead.org> +Cc: Thomas Gleixner <tglx@linutronix.de> +Cc: linux-mm@kvack.org +Link: http://lkml.kernel.org/r/20170929140821.37654-5-kirill.shutemov@linux.intel.com +Signed-off-by: Ingo Molnar <mingo@kernel.org> + +diff --git a/arch/x86/kernel/head_64.S b/arch/x86/kernel/head_64.S +index 513cbb012ecc..2be7d1e7fcf1 100644 +--- a/arch/x86/kernel/head_64.S ++++ b/arch/x86/kernel/head_64.S +@@ -37,11 +37,12 @@ + * + */ + +-#define p4d_index(x) (((x) >> P4D_SHIFT) & (PTRS_PER_P4D-1)) + #define pud_index(x) (((x) >> PUD_SHIFT) & (PTRS_PER_PUD-1)) + ++#if defined(CONFIG_XEN_PV) || defined(CONFIG_XEN_PVH) + PGD_PAGE_OFFSET = pgd_index(__PAGE_OFFSET_BASE) + PGD_START_KERNEL = pgd_index(__START_KERNEL_map) ++#endif + L3_START_KERNEL = pud_index(__START_KERNEL_map) + + .text +@@ -361,10 +362,7 @@ NEXT_PAGE(early_dynamic_pgts) + + .data + +-#ifndef CONFIG_XEN +-NEXT_PAGE(init_top_pgt) +- .fill 512,8,0 +-#else ++#if defined(CONFIG_XEN_PV) || defined(CONFIG_XEN_PVH) + NEXT_PAGE(init_top_pgt) + .quad level3_ident_pgt - __START_KERNEL_map + _KERNPG_TABLE_NOENC + .org init_top_pgt + PGD_PAGE_OFFSET*8, 0 +@@ -381,6 +379,9 @@ NEXT_PAGE(level2_ident_pgt) + * Don't set NX because code runs from these pages. 
+ */ + PMDS(0, __PAGE_KERNEL_IDENT_LARGE_EXEC, PTRS_PER_PMD) ++#else ++NEXT_PAGE(init_top_pgt) ++ .fill 512,8,0 + #endif + + #ifdef CONFIG_X86_5LEVEL +-- +2.15.0 + diff --git a/queue/xen-x86-entry-64-Add-xen-NMI-trap-entry.patch b/queue/xen-x86-entry-64-Add-xen-NMI-trap-entry.patch new file mode 100644 index 0000000..202a3f4 --- /dev/null +++ b/queue/xen-x86-entry-64-Add-xen-NMI-trap-entry.patch @@ -0,0 +1,91 @@ +From 43e4111086a70c78bedb6ad990bee97f17b27a6e Mon Sep 17 00:00:00 2001 +From: Juergen Gross <jgross@suse.com> +Date: Thu, 2 Nov 2017 00:59:07 -0700 +Subject: [PATCH] xen, x86/entry/64: Add xen NMI trap entry + +commit 43e4111086a70c78bedb6ad990bee97f17b27a6e upstream. + +Instead of trying to execute any NMI via the bare metal's NMI trap +handler use a Xen specific one for PV domains, like we do for e.g. +debug traps. As in a PV domain the NMI is handled via the normal +kernel stack this is the correct thing to do. + +This will enable us to get rid of the very fragile and questionable +dependencies between the bare metal NMI handler and Xen assumptions +believed to be broken anyway. 
+ +Signed-off-by: Juergen Gross <jgross@suse.com> +Signed-off-by: Andy Lutomirski <luto@kernel.org> +Cc: Borislav Petkov <bpetkov@suse.de> +Cc: Brian Gerst <brgerst@gmail.com> +Cc: Dave Hansen <dave.hansen@intel.com> +Cc: Linus Torvalds <torvalds@linux-foundation.org> +Cc: Peter Zijlstra <peterz@infradead.org> +Cc: Thomas Gleixner <tglx@linutronix.de> +Link: http://lkml.kernel.org/r/5baf5c0528d58402441550c5770b98e7961e7680.1509609304.git.luto@kernel.org +Signed-off-by: Ingo Molnar <mingo@kernel.org> + +diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S +index 5b2f0bc661a0..a3f76ab5d0ea 100644 +--- a/arch/x86/entry/entry_64.S ++++ b/arch/x86/entry/entry_64.S +@@ -1078,6 +1078,7 @@ idtentry int3 do_int3 has_error_code=0 paranoid=1 shift_ist=DEBUG_STACK + idtentry stack_segment do_stack_segment has_error_code=1 + + #ifdef CONFIG_XEN ++idtentry xennmi do_nmi has_error_code=0 + idtentry xendebug do_debug has_error_code=0 + idtentry xenint3 do_int3 has_error_code=0 + #endif +@@ -1240,7 +1241,6 @@ ENTRY(error_exit) + END(error_exit) + + /* Runs on exception stack */ +-/* XXX: broken on Xen PV */ + ENTRY(nmi) + UNWIND_HINT_IRET_REGS + /* +diff --git a/arch/x86/include/asm/traps.h b/arch/x86/include/asm/traps.h +index da3c3a3674a5..e76ce80ca18b 100644 +--- a/arch/x86/include/asm/traps.h ++++ b/arch/x86/include/asm/traps.h +@@ -37,9 +37,9 @@ asmlinkage void simd_coprocessor_error(void); + + #if defined(CONFIG_X86_64) && defined(CONFIG_XEN_PV) + asmlinkage void xen_divide_error(void); ++asmlinkage void xen_xennmi(void); + asmlinkage void xen_xendebug(void); + asmlinkage void xen_xenint3(void); +-asmlinkage void xen_nmi(void); + asmlinkage void xen_overflow(void); + asmlinkage void xen_bounds(void); + asmlinkage void xen_invalid_op(void); +diff --git a/arch/x86/xen/enlighten_pv.c b/arch/x86/xen/enlighten_pv.c +index 69b9deff7e5c..8da4eff19c2a 100644 +--- a/arch/x86/xen/enlighten_pv.c ++++ b/arch/x86/xen/enlighten_pv.c +@@ -600,7 +600,7 @@ static struct 
trap_array_entry trap_array[] = { + #ifdef CONFIG_X86_MCE + { machine_check, xen_machine_check, true }, + #endif +- { nmi, xen_nmi, true }, ++ { nmi, xen_xennmi, true }, + { overflow, xen_overflow, false }, + #ifdef CONFIG_IA32_EMULATION + { entry_INT80_compat, xen_entry_INT80_compat, false }, +diff --git a/arch/x86/xen/xen-asm_64.S b/arch/x86/xen/xen-asm_64.S +index dae2cc33afb5..286ecc198562 100644 +--- a/arch/x86/xen/xen-asm_64.S ++++ b/arch/x86/xen/xen-asm_64.S +@@ -29,7 +29,7 @@ xen_pv_trap debug + xen_pv_trap xendebug + xen_pv_trap int3 + xen_pv_trap xenint3 +-xen_pv_trap nmi ++xen_pv_trap xennmi + xen_pv_trap overflow + xen_pv_trap bounds + xen_pv_trap invalid_op +-- +2.15.0 + |