aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorLinus Torvalds <torvalds@linux-foundation.org>2024-03-12 09:58:57 -0700
committerLinus Torvalds <torvalds@linux-foundation.org>2024-03-12 09:58:57 -0700
commitb29f377119f68b942369a9366bdcb1fec82b2cda (patch)
tree1d5bca16e70aa3c1a0660825d0d8fc3f09abc262
parente66c58f743513119f703f3a47f0f93a8e82c0028 (diff)
parent2e2bc42c8381d2c0e9604b59e49264821da29368 (diff)
downloadlinux-can-b29f377119f68b942369a9366bdcb1fec82b2cda.tar.gz
Merge tag 'x86-boot-2024-03-12' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip
Pull x86 boot updates from Ingo Molnar: - Continuing work by Ard Biesheuvel to improve the x86 early startup code, with the long-term goal to make it position independent: - Get rid of early accesses to global objects, either by moving them to the stack, deferring the access until later, or dropping the globals entirely - Move all code that runs early via the 1:1 mapping into .head.text, and move code that does not out of it, so that build time checks can be added later to ensure that no inadvertent absolute references were emitted into code that does not tolerate them - Remove fixup_pointer() and occurrences of __pa_symbol(), which rely on the compiler emitting absolute references, which is not guaranteed - Improve the early console code - Add early console message about ignored NMIs, so that users are at least warned about their existence - even if we cannot do anything about them - Improve the kexec code's kernel load address handling - Enable more X86S (simplified x86) bits - Simplify early boot GDT handling - Micro-optimize the boot code a bit - Misc cleanups * tag 'x86-boot-2024-03-12' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip: (22 commits) x86/sev: Move early startup code into .head.text section x86/sme: Move early SME kernel encryption handling into .head.text x86/boot: Move mem_encrypt= parsing to the decompressor efi/libstub: Add generic support for parsing mem_encrypt= x86/startup_64: Simplify virtual switch on primary boot x86/startup_64: Simplify calculation of initial page table address x86/startup_64: Defer assignment of 5-level paging global variables x86/startup_64: Simplify CR4 handling in startup code x86/boot: Use 32-bit XOR to clear registers efi/x86: Set the PE/COFF header's NX compat flag unconditionally x86/boot/64: Load the final kernel GDT during early boot directly, remove startup_gdt[] x86/boot/64: Use RIP_REL_REF() to access early_top_pgt[] x86/boot/64: Use RIP_REL_REF() to access early page tables x86/boot/64: Use RIP_REL_REF() to access '__supported_pte_mask' x86/boot/64: Use RIP_REL_REF() to access early_dynamic_pgts[] x86/boot/64: Use RIP_REL_REF() to assign 'phys_base' x86/boot/64: Simplify global variable accesses in GDT/IDT programming x86/trampoline: Bypass compat mode in trampoline_start64() if not needed kexec: Allocate kernel above bzImage's pref_address x86/boot: Add a message about ignored early NMIs ...
-rw-r--r--Documentation/arch/x86/boot.rst3
-rw-r--r--arch/x86/Kconfig10
-rw-r--r--arch/x86/boot/compressed/ident_map_64.c2
-rw-r--r--arch/x86/boot/compressed/misc.c55
-rw-r--r--arch/x86/boot/compressed/misc.h3
-rw-r--r--arch/x86/boot/compressed/sev.c3
-rw-r--r--arch/x86/boot/header.S4
-rw-r--r--arch/x86/include/asm/desc.h1
-rw-r--r--arch/x86/include/asm/mem_encrypt.h8
-rw-r--r--arch/x86/include/asm/pgtable_64_types.h2
-rw-r--r--arch/x86/include/asm/setup.h2
-rw-r--r--arch/x86/include/asm/sev.h10
-rw-r--r--arch/x86/include/uapi/asm/bootparam.h1
-rw-r--r--arch/x86/kernel/head64.c179
-rw-r--r--arch/x86/kernel/head_64.S144
-rw-r--r--arch/x86/kernel/kexec-bzimage64.c5
-rw-r--r--arch/x86/kernel/sev-shared.c23
-rw-r--r--arch/x86/kernel/sev.c14
-rw-r--r--arch/x86/kernel/sev_verify_cbit.S2
-rw-r--r--arch/x86/lib/Makefile13
-rw-r--r--arch/x86/mm/mem_encrypt_identity.c74
-rw-r--r--arch/x86/realmode/rm/trampoline_64.S33
-rw-r--r--drivers/firmware/efi/libstub/efi-stub-helper.c8
-rw-r--r--drivers/firmware/efi/libstub/efistub.h2
-rw-r--r--drivers/firmware/efi/libstub/x86-stub.c3
25 files changed, 273 insertions, 331 deletions
diff --git a/Documentation/arch/x86/boot.rst b/Documentation/arch/x86/boot.rst
index c513855a54bb99..4fd492cb49704f 100644
--- a/Documentation/arch/x86/boot.rst
+++ b/Documentation/arch/x86/boot.rst
@@ -878,7 +878,8 @@ Protocol: 2.10+
address if possible.
A non-relocatable kernel will unconditionally move itself and to run
- at this address.
+ at this address. A relocatable kernel will move itself to this address if it
+ loaded below this address.
============ =======
Field name: init_size
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index 77e8c8c6795044..e88f6f7b6b4129 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -2111,11 +2111,11 @@ config PHYSICAL_START
help
This gives the physical address where the kernel is loaded.
- If kernel is a not relocatable (CONFIG_RELOCATABLE=n) then
- bzImage will decompress itself to above physical address and
- run from there. Otherwise, bzImage will run from the address where
- it has been loaded by the boot loader and will ignore above physical
- address.
+ If the kernel is not relocatable (CONFIG_RELOCATABLE=n) then bzImage
+ will decompress itself to above physical address and run from there.
+ Otherwise, bzImage will run from the address where it has been loaded
+ by the boot loader. The only exception is if it is loaded below the
+ above physical address, in which case it will relocate itself there.
In normal kdump cases one does not have to set/change this option
as now bzImage can be compiled as a completely relocatable image
diff --git a/arch/x86/boot/compressed/ident_map_64.c b/arch/x86/boot/compressed/ident_map_64.c
index ff09ca6dbb87ee..909f2a35b60c5d 100644
--- a/arch/x86/boot/compressed/ident_map_64.c
+++ b/arch/x86/boot/compressed/ident_map_64.c
@@ -389,5 +389,5 @@ void do_boot_page_fault(struct pt_regs *regs, unsigned long error_code)
void do_boot_nmi_trap(struct pt_regs *regs, unsigned long error_code)
{
- /* Empty handler to ignore NMI during early boot */
+ spurious_nmi_count++;
}
diff --git a/arch/x86/boot/compressed/misc.c b/arch/x86/boot/compressed/misc.c
index b99e08e6815b1a..408507e305be61 100644
--- a/arch/x86/boot/compressed/misc.c
+++ b/arch/x86/boot/compressed/misc.c
@@ -52,6 +52,7 @@ struct port_io_ops pio_ops;
memptr free_mem_ptr;
memptr free_mem_end_ptr;
+int spurious_nmi_count;
static char *vidmem;
static int vidport;
@@ -164,21 +165,34 @@ void __putstr(const char *s)
outb(0xff & (pos >> 1), vidport+1);
}
-void __puthex(unsigned long value)
+static noinline void __putnum(unsigned long value, unsigned int base,
+ int mindig)
{
- char alpha[2] = "0";
- int bits;
+ char buf[8*sizeof(value)+1];
+ char *p;
- for (bits = sizeof(value) * 8 - 4; bits >= 0; bits -= 4) {
- unsigned long digit = (value >> bits) & 0xf;
+ p = buf + sizeof(buf);
+ *--p = '\0';
- if (digit < 0xA)
- alpha[0] = '0' + digit;
- else
- alpha[0] = 'a' + (digit - 0xA);
+ while (mindig-- > 0 || value) {
+ unsigned char digit = value % base;
+ digit += (digit >= 10) ? ('a'-10) : '0';
+ *--p = digit;
- __putstr(alpha);
+ value /= base;
}
+
+ __putstr(p);
+}
+
+void __puthex(unsigned long value)
+{
+ __putnum(value, 16, sizeof(value)*2);
+}
+
+void __putdec(unsigned long value)
+{
+ __putnum(value, 10, 1);
}
#ifdef CONFIG_X86_NEED_RELOCS
@@ -358,6 +372,19 @@ unsigned long decompress_kernel(unsigned char *outbuf, unsigned long virt_addr,
}
/*
+ * Set the memory encryption xloadflag based on the mem_encrypt= command line
+ * parameter, if provided.
+ */
+static void parse_mem_encrypt(struct setup_header *hdr)
+{
+ int on = cmdline_find_option_bool("mem_encrypt=on");
+ int off = cmdline_find_option_bool("mem_encrypt=off");
+
+ if (on > off)
+ hdr->xloadflags |= XLF_MEM_ENCRYPTION;
+}
+
+/*
* The compressed kernel image (ZO), has been moved so that its position
* is against the end of the buffer used to hold the uncompressed kernel
* image (VO) and the execution environment (.bss, .brk), which makes sure
@@ -387,6 +414,8 @@ asmlinkage __visible void *extract_kernel(void *rmode, unsigned char *output)
/* Clear flags intended for solely in-kernel use. */
boot_params_ptr->hdr.loadflags &= ~KASLR_FLAG;
+ parse_mem_encrypt(&boot_params_ptr->hdr);
+
sanitize_boot_params(boot_params_ptr);
if (boot_params_ptr->screen_info.orig_video_mode == 7) {
@@ -493,6 +522,12 @@ asmlinkage __visible void *extract_kernel(void *rmode, unsigned char *output)
/* Disable exception handling before booting the kernel */
cleanup_exception_handling();
+ if (spurious_nmi_count) {
+ error_putstr("Spurious early NMIs ignored: ");
+ error_putdec(spurious_nmi_count);
+ error_putstr("\n");
+ }
+
return output + entry_offset;
}
diff --git a/arch/x86/boot/compressed/misc.h b/arch/x86/boot/compressed/misc.h
index bc2f0f17fb90ec..b353a7be380cb8 100644
--- a/arch/x86/boot/compressed/misc.h
+++ b/arch/x86/boot/compressed/misc.h
@@ -59,12 +59,15 @@ extern char _head[], _end[];
/* misc.c */
extern memptr free_mem_ptr;
extern memptr free_mem_end_ptr;
+extern int spurious_nmi_count;
void *malloc(int size);
void free(void *where);
void __putstr(const char *s);
void __puthex(unsigned long value);
+void __putdec(unsigned long value);
#define error_putstr(__x) __putstr(__x)
#define error_puthex(__x) __puthex(__x)
+#define error_putdec(__x) __putdec(__x)
#ifdef CONFIG_X86_VERBOSE_BOOTUP
diff --git a/arch/x86/boot/compressed/sev.c b/arch/x86/boot/compressed/sev.c
index 97561eabfbef12..ec71846d28c9ed 100644
--- a/arch/x86/boot/compressed/sev.c
+++ b/arch/x86/boot/compressed/sev.c
@@ -117,6 +117,9 @@ static bool fault_in_kernel_space(unsigned long address)
#undef __init
#define __init
+#undef __head
+#define __head
+
#define __BOOT_COMPRESSED
/* Basic instruction decoding support needed */
diff --git a/arch/x86/boot/header.S b/arch/x86/boot/header.S
index a1bbedd989e42e..b5c79f43359bcd 100644
--- a/arch/x86/boot/header.S
+++ b/arch/x86/boot/header.S
@@ -111,11 +111,7 @@ extra_header_fields:
.long salign # SizeOfHeaders
.long 0 # CheckSum
.word IMAGE_SUBSYSTEM_EFI_APPLICATION # Subsystem (EFI application)
-#ifdef CONFIG_EFI_DXE_MEM_ATTRIBUTES
.word IMAGE_DLL_CHARACTERISTICS_NX_COMPAT # DllCharacteristics
-#else
- .word 0 # DllCharacteristics
-#endif
#ifdef CONFIG_X86_32
.long 0 # SizeOfStackReserve
.long 0 # SizeOfStackCommit
diff --git a/arch/x86/include/asm/desc.h b/arch/x86/include/asm/desc.h
index ec95fe44fa3a03..62dc9f59ea768c 100644
--- a/arch/x86/include/asm/desc.h
+++ b/arch/x86/include/asm/desc.h
@@ -46,6 +46,7 @@ struct gdt_page {
} __attribute__((aligned(PAGE_SIZE)));
DECLARE_PER_CPU_PAGE_ALIGNED(struct gdt_page, gdt_page);
+DECLARE_INIT_PER_CPU(gdt_page);
/* Provide the original GDT */
static inline struct desc_struct *get_cpu_gdt_rw(unsigned int cpu)
diff --git a/arch/x86/include/asm/mem_encrypt.h b/arch/x86/include/asm/mem_encrypt.h
index b31eb9fd595446..f922b682b9b4c5 100644
--- a/arch/x86/include/asm/mem_encrypt.h
+++ b/arch/x86/include/asm/mem_encrypt.h
@@ -47,8 +47,8 @@ void __init sme_unmap_bootdata(char *real_mode_data);
void __init sme_early_init(void);
-void __init sme_encrypt_kernel(struct boot_params *bp);
-void __init sme_enable(struct boot_params *bp);
+void sme_encrypt_kernel(struct boot_params *bp);
+void sme_enable(struct boot_params *bp);
int __init early_set_memory_decrypted(unsigned long vaddr, unsigned long size);
int __init early_set_memory_encrypted(unsigned long vaddr, unsigned long size);
@@ -81,8 +81,8 @@ static inline void __init sme_unmap_bootdata(char *real_mode_data) { }
static inline void __init sme_early_init(void) { }
-static inline void __init sme_encrypt_kernel(struct boot_params *bp) { }
-static inline void __init sme_enable(struct boot_params *bp) { }
+static inline void sme_encrypt_kernel(struct boot_params *bp) { }
+static inline void sme_enable(struct boot_params *bp) { }
static inline void sev_es_init_vc_handling(void) { }
diff --git a/arch/x86/include/asm/pgtable_64_types.h b/arch/x86/include/asm/pgtable_64_types.h
index 38b54b992f32e3..9053dfe9fa03f9 100644
--- a/arch/x86/include/asm/pgtable_64_types.h
+++ b/arch/x86/include/asm/pgtable_64_types.h
@@ -21,9 +21,9 @@ typedef unsigned long pgprotval_t;
typedef struct { pteval_t pte; } pte_t;
typedef struct { pmdval_t pmd; } pmd_t;
-#ifdef CONFIG_X86_5LEVEL
extern unsigned int __pgtable_l5_enabled;
+#ifdef CONFIG_X86_5LEVEL
#ifdef USE_EARLY_PGTABLE_L5
/*
* cpu_feature_enabled() is not available in early boot code.
diff --git a/arch/x86/include/asm/setup.h b/arch/x86/include/asm/setup.h
index 5c83729c8e71ff..e61e68d71cba04 100644
--- a/arch/x86/include/asm/setup.h
+++ b/arch/x86/include/asm/setup.h
@@ -48,7 +48,7 @@ extern unsigned long saved_video_mode;
extern void reserve_standard_io_resources(void);
extern void i386_reserve_resources(void);
extern unsigned long __startup_64(unsigned long physaddr, struct boot_params *bp);
-extern void startup_64_setup_env(unsigned long physbase);
+extern void startup_64_setup_gdt_idt(void);
extern void early_setup_idt(void);
extern void __init do_early_exception(struct pt_regs *regs, int trapnr);
diff --git a/arch/x86/include/asm/sev.h b/arch/x86/include/asm/sev.h
index 10f9f1b259c322..9477b4053bce2c 100644
--- a/arch/x86/include/asm/sev.h
+++ b/arch/x86/include/asm/sev.h
@@ -214,16 +214,16 @@ static inline int pvalidate(unsigned long vaddr, bool rmp_psize, bool validate)
struct snp_guest_request_ioctl;
void setup_ghcb(void);
-void __init early_snp_set_memory_private(unsigned long vaddr, unsigned long paddr,
- unsigned long npages);
-void __init early_snp_set_memory_shared(unsigned long vaddr, unsigned long paddr,
- unsigned long npages);
+void early_snp_set_memory_private(unsigned long vaddr, unsigned long paddr,
+ unsigned long npages);
+void early_snp_set_memory_shared(unsigned long vaddr, unsigned long paddr,
+ unsigned long npages);
void __init snp_prep_memory(unsigned long paddr, unsigned int sz, enum psc_op op);
void snp_set_memory_shared(unsigned long vaddr, unsigned long npages);
void snp_set_memory_private(unsigned long vaddr, unsigned long npages);
void snp_set_wakeup_secondary_cpu(void);
bool snp_init(struct boot_params *bp);
-void __init __noreturn snp_abort(void);
+void __noreturn snp_abort(void);
int snp_issue_guest_request(u64 exit_code, struct snp_req_data *input, struct snp_guest_request_ioctl *rio);
void snp_accept_memory(phys_addr_t start, phys_addr_t end);
u64 snp_get_unsupported_features(u64 status);
diff --git a/arch/x86/include/uapi/asm/bootparam.h b/arch/x86/include/uapi/asm/bootparam.h
index 4a38e7917756fa..9b82eebd7add55 100644
--- a/arch/x86/include/uapi/asm/bootparam.h
+++ b/arch/x86/include/uapi/asm/bootparam.h
@@ -24,6 +24,7 @@
#define XLF_EFI_KEXEC (1<<4)
#define XLF_5LEVEL (1<<5)
#define XLF_5LEVEL_ENABLED (1<<6)
+#define XLF_MEM_ENCRYPTION (1<<7)
#ifndef __ASSEMBLY__
diff --git a/arch/x86/kernel/head64.c b/arch/x86/kernel/head64.c
index dc0956067944dc..212e8e06aeba23 100644
--- a/arch/x86/kernel/head64.c
+++ b/arch/x86/kernel/head64.c
@@ -22,6 +22,8 @@
#include <linux/cc_platform.h>
#include <linux/pgtable.h>
+#include <asm/asm.h>
+#include <asm/page_64.h>
#include <asm/processor.h>
#include <asm/proto.h>
#include <asm/smp.h>
@@ -67,42 +69,11 @@ unsigned long vmemmap_base __ro_after_init = __VMEMMAP_BASE_L4;
EXPORT_SYMBOL(vmemmap_base);
#endif
-/*
- * GDT used on the boot CPU before switching to virtual addresses.
- */
-static struct desc_struct startup_gdt[GDT_ENTRIES] __initdata = {
- [GDT_ENTRY_KERNEL32_CS] = GDT_ENTRY_INIT(DESC_CODE32, 0, 0xfffff),
- [GDT_ENTRY_KERNEL_CS] = GDT_ENTRY_INIT(DESC_CODE64, 0, 0xfffff),
- [GDT_ENTRY_KERNEL_DS] = GDT_ENTRY_INIT(DESC_DATA64, 0, 0xfffff),
-};
-
-/*
- * Address needs to be set at runtime because it references the startup_gdt
- * while the kernel still uses a direct mapping.
- */
-static struct desc_ptr startup_gdt_descr __initdata = {
- .size = sizeof(startup_gdt)-1,
- .address = 0,
-};
-
-static void __head *fixup_pointer(void *ptr, unsigned long physaddr)
-{
- return ptr - (void *)_text + (void *)physaddr;
-}
-
-static unsigned long __head *fixup_long(void *ptr, unsigned long physaddr)
-{
- return fixup_pointer(ptr, physaddr);
-}
-
-#ifdef CONFIG_X86_5LEVEL
-static unsigned int __head *fixup_int(void *ptr, unsigned long physaddr)
+static inline bool check_la57_support(void)
{
- return fixup_pointer(ptr, physaddr);
-}
+ if (!IS_ENABLED(CONFIG_X86_5LEVEL))
+ return false;
-static bool __head check_la57_support(unsigned long physaddr)
-{
/*
* 5-level paging is detected and enabled at kernel decompression
* stage. Only check if it has been enabled there.
@@ -110,21 +81,8 @@ static bool __head check_la57_support(unsigned long physaddr)
if (!(native_read_cr4() & X86_CR4_LA57))
return false;
- *fixup_int(&__pgtable_l5_enabled, physaddr) = 1;
- *fixup_int(&pgdir_shift, physaddr) = 48;
- *fixup_int(&ptrs_per_p4d, physaddr) = 512;
- *fixup_long(&page_offset_base, physaddr) = __PAGE_OFFSET_BASE_L5;
- *fixup_long(&vmalloc_base, physaddr) = __VMALLOC_BASE_L5;
- *fixup_long(&vmemmap_base, physaddr) = __VMEMMAP_BASE_L5;
-
return true;
}
-#else
-static bool __head check_la57_support(unsigned long physaddr)
-{
- return false;
-}
-#endif
static unsigned long __head sme_postprocess_startup(struct boot_params *bp, pmdval_t *pmd)
{
@@ -173,23 +131,22 @@ static unsigned long __head sme_postprocess_startup(struct boot_params *bp, pmdv
* doesn't have to generate PC-relative relocations when accessing globals from
* that function. Clang actually does not generate them, which leads to
* boot-time crashes. To work around this problem, every global pointer must
- * be adjusted using fixup_pointer().
+ * be accessed using RIP_REL_REF().
*/
unsigned long __head __startup_64(unsigned long physaddr,
struct boot_params *bp)
{
- unsigned long load_delta, *p;
+ pmd_t (*early_pgts)[PTRS_PER_PMD] = RIP_REL_REF(early_dynamic_pgts);
unsigned long pgtable_flags;
+ unsigned long load_delta;
pgdval_t *pgd;
p4dval_t *p4d;
pudval_t *pud;
pmdval_t *pmd, pmd_entry;
- pteval_t *mask_ptr;
bool la57;
int i;
- unsigned int *next_pgt_ptr;
- la57 = check_la57_support(physaddr);
+ la57 = check_la57_support();
/* Is the address too large? */
if (physaddr >> MAX_PHYSMEM_BITS)
@@ -200,6 +157,7 @@ unsigned long __head __startup_64(unsigned long physaddr,
* and the address I am actually running at.
*/
load_delta = physaddr - (unsigned long)(_text - __START_KERNEL_map);
+ RIP_REL_REF(phys_base) = load_delta;
/* Is the address not 2M aligned? */
if (load_delta & ~PMD_MASK)
@@ -210,26 +168,21 @@ unsigned long __head __startup_64(unsigned long physaddr,
/* Fixup the physical addresses in the page table */
- pgd = fixup_pointer(early_top_pgt, physaddr);
- p = pgd + pgd_index(__START_KERNEL_map);
- if (la57)
- *p = (unsigned long)level4_kernel_pgt;
- else
- *p = (unsigned long)level3_kernel_pgt;
- *p += _PAGE_TABLE_NOENC - __START_KERNEL_map + load_delta;
+ pgd = &RIP_REL_REF(early_top_pgt)->pgd;
+ pgd[pgd_index(__START_KERNEL_map)] += load_delta;
if (la57) {
- p4d = fixup_pointer(level4_kernel_pgt, physaddr);
- p4d[511] += load_delta;
+ p4d = (p4dval_t *)&RIP_REL_REF(level4_kernel_pgt);
+ p4d[MAX_PTRS_PER_P4D - 1] += load_delta;
+
+ pgd[pgd_index(__START_KERNEL_map)] = (pgdval_t)p4d | _PAGE_TABLE_NOENC;
}
- pud = fixup_pointer(level3_kernel_pgt, physaddr);
- pud[510] += load_delta;
- pud[511] += load_delta;
+ RIP_REL_REF(level3_kernel_pgt)[PTRS_PER_PUD - 2].pud += load_delta;
+ RIP_REL_REF(level3_kernel_pgt)[PTRS_PER_PUD - 1].pud += load_delta;
- pmd = fixup_pointer(level2_fixmap_pgt, physaddr);
for (i = FIXMAP_PMD_TOP; i > FIXMAP_PMD_TOP - FIXMAP_PMD_NUM; i--)
- pmd[i] += load_delta;
+ RIP_REL_REF(level2_fixmap_pgt)[i].pmd += load_delta;
/*
* Set up the identity mapping for the switchover. These
@@ -238,15 +191,14 @@ unsigned long __head __startup_64(unsigned long physaddr,
* it avoids problems around wraparound.
*/
- next_pgt_ptr = fixup_pointer(&next_early_pgt, physaddr);
- pud = fixup_pointer(early_dynamic_pgts[(*next_pgt_ptr)++], physaddr);
- pmd = fixup_pointer(early_dynamic_pgts[(*next_pgt_ptr)++], physaddr);
+ pud = &early_pgts[0]->pmd;
+ pmd = &early_pgts[1]->pmd;
+ RIP_REL_REF(next_early_pgt) = 2;
pgtable_flags = _KERNPG_TABLE_NOENC + sme_get_me_mask();
if (la57) {
- p4d = fixup_pointer(early_dynamic_pgts[(*next_pgt_ptr)++],
- physaddr);
+ p4d = &early_pgts[RIP_REL_REF(next_early_pgt)++]->pmd;
i = (physaddr >> PGDIR_SHIFT) % PTRS_PER_PGD;
pgd[i + 0] = (pgdval_t)p4d + pgtable_flags;
@@ -267,8 +219,7 @@ unsigned long __head __startup_64(unsigned long physaddr,
pmd_entry = __PAGE_KERNEL_LARGE_EXEC & ~_PAGE_GLOBAL;
/* Filter out unsupported __PAGE_KERNEL_* bits: */
- mask_ptr = fixup_pointer(&__supported_pte_mask, physaddr);
- pmd_entry &= *mask_ptr;
+ pmd_entry &= RIP_REL_REF(__supported_pte_mask);
pmd_entry += sme_get_me_mask();
pmd_entry += physaddr;
@@ -294,7 +245,7 @@ unsigned long __head __startup_64(unsigned long physaddr,
* error, causing the BIOS to halt the system.
*/
- pmd = fixup_pointer(level2_kernel_pgt, physaddr);
+ pmd = &RIP_REL_REF(level2_kernel_pgt)->pmd;
/* invalidate pages before the kernel image */
for (i = 0; i < pmd_index((unsigned long)_text); i++)
@@ -309,12 +260,6 @@ unsigned long __head __startup_64(unsigned long physaddr,
for (; i < PTRS_PER_PMD; i++)
pmd[i] &= ~_PAGE_PRESENT;
- /*
- * Fixup phys_base - remove the memory encryption mask to obtain
- * the true physical address.
- */
- *fixup_long(&phys_base, physaddr) += load_delta - sme_get_me_mask();
-
return sme_postprocess_startup(bp, pmd);
}
@@ -486,6 +431,15 @@ asmlinkage __visible void __init __noreturn x86_64_start_kernel(char * real_mode
(__START_KERNEL & PGDIR_MASK)));
BUILD_BUG_ON(__fix_to_virt(__end_of_fixed_addresses) <= MODULES_END);
+ if (check_la57_support()) {
+ __pgtable_l5_enabled = 1;
+ pgdir_shift = 48;
+ ptrs_per_p4d = 512;
+ page_offset_base = __PAGE_OFFSET_BASE_L5;
+ vmalloc_base = __VMALLOC_BASE_L5;
+ vmemmap_base = __VMEMMAP_BASE_L5;
+ }
+
cr4_init_shadow();
/* Kill off the identity-map trampoline */
@@ -569,62 +523,52 @@ void __init __noreturn x86_64_start_reservations(char *real_mode_data)
*/
static gate_desc bringup_idt_table[NUM_EXCEPTION_VECTORS] __page_aligned_data;
-static struct desc_ptr bringup_idt_descr = {
- .size = (NUM_EXCEPTION_VECTORS * sizeof(gate_desc)) - 1,
- .address = 0, /* Set at runtime */
-};
-
-static void set_bringup_idt_handler(gate_desc *idt, int n, void *handler)
+/* This may run while still in the direct mapping */
+static void __head startup_64_load_idt(void *vc_handler)
{
-#ifdef CONFIG_AMD_MEM_ENCRYPT
+ struct desc_ptr desc = {
+ .address = (unsigned long)&RIP_REL_REF(bringup_idt_table),
+ .size = sizeof(bringup_idt_table) - 1,
+ };
struct idt_data data;
- gate_desc desc;
-
- init_idt_data(&data, n, handler);
- idt_init_desc(&desc, &data);
- native_write_idt_entry(idt, n, &desc);
-#endif
-}
+ gate_desc idt_desc;
-/* This runs while still in the direct mapping */
-static void __head startup_64_load_idt(unsigned long physbase)
-{
- struct desc_ptr *desc = fixup_pointer(&bringup_idt_descr, physbase);
- gate_desc *idt = fixup_pointer(bringup_idt_table, physbase);
-
-
- if (IS_ENABLED(CONFIG_AMD_MEM_ENCRYPT)) {
- void *handler;
-
- /* VMM Communication Exception */
- handler = fixup_pointer(vc_no_ghcb, physbase);
- set_bringup_idt_handler(idt, X86_TRAP_VC, handler);
+ /* @vc_handler is set only for a VMM Communication Exception */
+ if (vc_handler) {
+ init_idt_data(&data, X86_TRAP_VC, vc_handler);
+ idt_init_desc(&idt_desc, &data);
+ native_write_idt_entry((gate_desc *)desc.address, X86_TRAP_VC, &idt_desc);
}
- desc->address = (unsigned long)idt;
- native_load_idt(desc);
+ native_load_idt(&desc);
}
/* This is used when running on kernel addresses */
void early_setup_idt(void)
{
- /* VMM Communication Exception */
+ void *handler = NULL;
+
if (IS_ENABLED(CONFIG_AMD_MEM_ENCRYPT)) {
setup_ghcb();
- set_bringup_idt_handler(bringup_idt_table, X86_TRAP_VC, vc_boot_ghcb);
+ handler = vc_boot_ghcb;
}
- bringup_idt_descr.address = (unsigned long)bringup_idt_table;
- native_load_idt(&bringup_idt_descr);
+ startup_64_load_idt(handler);
}
/*
* Setup boot CPU state needed before kernel switches to virtual addresses.
*/
-void __head startup_64_setup_env(unsigned long physbase)
+void __head startup_64_setup_gdt_idt(void)
{
+ void *handler = NULL;
+
+ struct desc_ptr startup_gdt_descr = {
+ .address = (unsigned long)&RIP_REL_REF(init_per_cpu_var(gdt_page.gdt)),
+ .size = GDT_SIZE - 1,
+ };
+
/* Load GDT */
- startup_gdt_descr.address = (unsigned long)fixup_pointer(startup_gdt, physbase);
native_load_gdt(&startup_gdt_descr);
/* New GDT is live - reload data segment registers */
@@ -632,5 +576,8 @@ void __head startup_64_setup_env(unsigned long physbase)
"movl %%eax, %%ss\n"
"movl %%eax, %%es\n" : : "a"(__KERNEL_DS) : "memory");
- startup_64_load_idt(physbase);
+ if (IS_ENABLED(CONFIG_AMD_MEM_ENCRYPT))
+ handler = &RIP_REL_REF(vc_no_ghcb);
+
+ startup_64_load_idt(handler);
}
diff --git a/arch/x86/kernel/head_64.S b/arch/x86/kernel/head_64.S
index 3dbd05f9385993..d8198fbd70e54e 100644
--- a/arch/x86/kernel/head_64.S
+++ b/arch/x86/kernel/head_64.S
@@ -40,7 +40,6 @@ L4_START_KERNEL = l4_index(__START_KERNEL_map)
L3_START_KERNEL = pud_index(__START_KERNEL_map)
- .text
__HEAD
.code64
SYM_CODE_START_NOALIGN(startup_64)
@@ -69,8 +68,6 @@ SYM_CODE_START_NOALIGN(startup_64)
/* Set up the stack for verify_cpu() */
leaq (__end_init_task - TOP_OF_KERNEL_STACK_PADDING - PTREGS_SIZE)(%rip), %rsp
- leaq _text(%rip), %rdi
-
/* Setup GSBASE to allow stack canary access for C code */
movl $MSR_GS_BASE, %ecx
leaq INIT_PER_CPU_VAR(fixed_percpu_data)(%rip), %rdx
@@ -78,7 +75,7 @@ SYM_CODE_START_NOALIGN(startup_64)
shrq $32, %rdx
wrmsr
- call startup_64_setup_env
+ call startup_64_setup_gdt_idt
/* Now switch to __KERNEL_CS so IRET works reliably */
pushq $__KERNEL_CS
@@ -114,13 +111,11 @@ SYM_CODE_START_NOALIGN(startup_64)
call __startup_64
/* Form the CR3 value being sure to include the CR3 modifier */
- addq $(early_top_pgt - __START_KERNEL_map), %rax
+ leaq early_top_pgt(%rip), %rcx
+ addq %rcx, %rax
#ifdef CONFIG_AMD_MEM_ENCRYPT
mov %rax, %rdi
- mov %rax, %r14
-
- addq phys_base(%rip), %rdi
/*
* For SEV guests: Verify that the C-bit is correct. A malicious
@@ -129,17 +124,23 @@ SYM_CODE_START_NOALIGN(startup_64)
* the next RET instruction.
*/
call sev_verify_cbit
+#endif
/*
- * Restore CR3 value without the phys_base which will be added
- * below, before writing %cr3.
+ * Switch to early_top_pgt which still has the identity mappings
+ * present.
*/
- mov %r14, %rax
-#endif
+ movq %rax, %cr3
- jmp 1f
+ /* Branch to the common startup code at its kernel virtual address */
+ ANNOTATE_RETPOLINE_SAFE
+ jmp *0f(%rip)
SYM_CODE_END(startup_64)
+ __INITRODATA
+0: .quad common_startup_64
+
+ .text
SYM_CODE_START(secondary_startup_64)
UNWIND_HINT_END_OF_STACK
ANNOTATE_NOENDBR
@@ -172,22 +173,39 @@ SYM_INNER_LABEL(secondary_startup_64_no_verify, SYM_L_GLOBAL)
ANNOTATE_NOENDBR
/* Clear %R15 which holds the boot_params pointer on the boot CPU */
- xorq %r15, %r15
+ xorl %r15d, %r15d
+
+ /* Derive the runtime physical address of init_top_pgt[] */
+ movq phys_base(%rip), %rax
+ addq $(init_top_pgt - __START_KERNEL_map), %rax
/*
* Retrieve the modifier (SME encryption mask if SME is active) to be
* added to the initial pgdir entry that will be programmed into CR3.
*/
#ifdef CONFIG_AMD_MEM_ENCRYPT
- movq sme_me_mask, %rax
-#else
- xorq %rax, %rax
+ addq sme_me_mask(%rip), %rax
#endif
+ /*
+ * Switch to the init_top_pgt here, away from the trampoline_pgd and
+ * unmap the identity mapped ranges.
+ */
+ movq %rax, %cr3
- /* Form the CR3 value being sure to include the CR3 modifier */
- addq $(init_top_pgt - __START_KERNEL_map), %rax
-1:
+SYM_INNER_LABEL(common_startup_64, SYM_L_LOCAL)
+ UNWIND_HINT_END_OF_STACK
+ ANNOTATE_NOENDBR
+ /*
+ * Create a mask of CR4 bits to preserve. Omit PGE in order to flush
+ * global 1:1 translations from the TLBs.
+ *
+ * From the SDM:
+ * "If CR4.PGE is changing from 0 to 1, there were no global TLB
+ * entries before the execution; if CR4.PGE is changing from 1 to 0,
+ * there will be no global TLB entries after the execution."
+ */
+ movl $(X86_CR4_PAE | X86_CR4_LA57), %edx
#ifdef CONFIG_X86_MCE
/*
* Preserve CR4.MCE if the kernel will enable #MC support.
@@ -196,52 +214,20 @@ SYM_INNER_LABEL(secondary_startup_64_no_verify, SYM_L_GLOBAL)
* configured will crash the system regardless of the CR4.MCE value set
* here.
*/
- movq %cr4, %rcx
- andl $X86_CR4_MCE, %ecx
-#else
- movl $0, %ecx
+ orl $X86_CR4_MCE, %edx
#endif
+ movq %cr4, %rcx
+ andl %edx, %ecx
- /* Enable PAE mode, PSE, PGE and LA57 */
- orl $(X86_CR4_PAE | X86_CR4_PSE | X86_CR4_PGE), %ecx
-#ifdef CONFIG_X86_5LEVEL
- testb $1, __pgtable_l5_enabled(%rip)
- jz 1f
- orl $X86_CR4_LA57, %ecx
-1:
-#endif
+ /* Even if ignored in long mode, set PSE uniformly on all logical CPUs. */
+ btsl $X86_CR4_PSE_BIT, %ecx
movq %rcx, %cr4
- /* Setup early boot stage 4-/5-level pagetables. */
- addq phys_base(%rip), %rax
-
/*
- * Switch to new page-table
- *
- * For the boot CPU this switches to early_top_pgt which still has the
- * identity mappings present. The secondary CPUs will switch to the
- * init_top_pgt here, away from the trampoline_pgd and unmap the
- * identity mapped ranges.
+ * Set CR4.PGE to re-enable global translations.
*/
- movq %rax, %cr3
-
- /*
- * Do a global TLB flush after the CR3 switch to make sure the TLB
- * entries from the identity mapping are flushed.
- */
- movq %cr4, %rcx
- movq %rcx, %rax
- xorq $X86_CR4_PGE, %rcx
+ btsl $X86_CR4_PGE_BIT, %ecx
movq %rcx, %cr4
- movq %rax, %cr4
-
- /* Ensure I am executing from virtual addresses */
- movq $1f, %rax
- ANNOTATE_RETPOLINE_SAFE
- jmp *%rax
-1:
- UNWIND_HINT_END_OF_STACK
- ANNOTATE_NOENDBR // above
#ifdef CONFIG_SMP
/*
@@ -298,7 +284,7 @@ SYM_INNER_LABEL(secondary_startup_64_no_verify, SYM_L_GLOBAL)
.Llookup_AP:
/* EAX contains the APIC ID of the current CPU */
- xorq %rcx, %rcx
+ xorl %ecx, %ecx
leaq cpuid_to_apicid(%rip), %rbx
.Lfind_cpunr:
@@ -429,39 +415,10 @@ SYM_INNER_LABEL(secondary_startup_64_no_verify, SYM_L_GLOBAL)
movq %r15, %rdi
.Ljump_to_C_code:
- /*
- * Jump to run C code and to be on a real kernel address.
- * Since we are running on identity-mapped space we have to jump
- * to the full 64bit address, this is only possible as indirect
- * jump. In addition we need to ensure %cs is set so we make this
- * a far return.
- *
- * Note: do not change to far jump indirect with 64bit offset.
- *
- * AMD does not support far jump indirect with 64bit offset.
- * AMD64 Architecture Programmer's Manual, Volume 3: states only
- * JMP FAR mem16:16 FF /5 Far jump indirect,
- * with the target specified by a far pointer in memory.
- * JMP FAR mem16:32 FF /5 Far jump indirect,
- * with the target specified by a far pointer in memory.
- *
- * Intel64 does support 64bit offset.
- * Software Developer Manual Vol 2: states:
- * FF /5 JMP m16:16 Jump far, absolute indirect,
- * address given in m16:16
- * FF /5 JMP m16:32 Jump far, absolute indirect,
- * address given in m16:32.
- * REX.W + FF /5 JMP m16:64 Jump far, absolute indirect,
- * address given in m16:64.
- */
- pushq $.Lafter_lret # put return address on stack for unwinder
xorl %ebp, %ebp # clear frame pointer
- movq initial_code(%rip), %rax
- pushq $__KERNEL_CS # set correct cs
- pushq %rax # target address in negative space
- lretq
-.Lafter_lret:
- ANNOTATE_NOENDBR
+ ANNOTATE_RETPOLINE_SAFE
+ callq *initial_code(%rip)
+ ud2
SYM_CODE_END(secondary_startup_64)
#include "verify_cpu.S"
@@ -656,7 +613,8 @@ SYM_CODE_END(vc_no_ghcb)
.balign 4
SYM_DATA_START_PTI_ALIGNED(early_top_pgt)
- .fill 512,8,0
+ .fill 511,8,0
+ .quad level3_kernel_pgt - __START_KERNEL_map + _PAGE_TABLE_NOENC
.fill PTI_USER_PGD_FILL,8,0
SYM_DATA_END(early_top_pgt)
diff --git a/arch/x86/kernel/kexec-bzimage64.c b/arch/x86/kernel/kexec-bzimage64.c
index 2a422e00ed4b42..cde167b0ea92ad 100644
--- a/arch/x86/kernel/kexec-bzimage64.c
+++ b/arch/x86/kernel/kexec-bzimage64.c
@@ -503,7 +503,10 @@ static void *bzImage64_load(struct kimage *image, char *kernel,
kbuf.bufsz = kernel_len - kern16_size;
kbuf.memsz = PAGE_ALIGN(header->init_size);
kbuf.buf_align = header->kernel_alignment;
- kbuf.buf_min = MIN_KERNEL_LOAD_ADDR;
+ if (header->pref_address < MIN_KERNEL_LOAD_ADDR)
+ kbuf.buf_min = MIN_KERNEL_LOAD_ADDR;
+ else
+ kbuf.buf_min = header->pref_address;
kbuf.mem = KEXEC_BUF_MEM_UNKNOWN;
ret = kexec_add_buffer(&kbuf);
if (ret)
diff --git a/arch/x86/kernel/sev-shared.c b/arch/x86/kernel/sev-shared.c
index a200bd72fadc55..8b04958da5e7d6 100644
--- a/arch/x86/kernel/sev-shared.c
+++ b/arch/x86/kernel/sev-shared.c
@@ -95,7 +95,8 @@ static bool __init sev_es_check_cpu_features(void)
return true;
}
-static void __noreturn sev_es_terminate(unsigned int set, unsigned int reason)
+static void __head __noreturn
+sev_es_terminate(unsigned int set, unsigned int reason)
{
u64 val = GHCB_MSR_TERM_REQ;
@@ -332,13 +333,7 @@ static int sev_cpuid_hv(struct ghcb *ghcb, struct es_em_ctxt *ctxt, struct cpuid
*/
static const struct snp_cpuid_table *snp_cpuid_get_table(void)
{
- void *ptr;
-
- asm ("lea cpuid_table_copy(%%rip), %0"
- : "=r" (ptr)
- : "p" (&cpuid_table_copy));
-
- return ptr;
+ return &RIP_REL_REF(cpuid_table_copy);
}
/*
@@ -397,7 +392,7 @@ static u32 snp_cpuid_calc_xsave_size(u64 xfeatures_en, bool compacted)
return xsave_size;
}
-static bool
+static bool __head
snp_cpuid_get_validated_func(struct cpuid_leaf *leaf)
{
const struct snp_cpuid_table *cpuid_table = snp_cpuid_get_table();
@@ -534,7 +529,8 @@ static int snp_cpuid_postprocess(struct ghcb *ghcb, struct es_em_ctxt *ctxt,
* Returns -EOPNOTSUPP if feature not enabled. Any other non-zero return value
* should be treated as fatal by caller.
*/
-static int snp_cpuid(struct ghcb *ghcb, struct es_em_ctxt *ctxt, struct cpuid_leaf *leaf)
+static int __head
+snp_cpuid(struct ghcb *ghcb, struct es_em_ctxt *ctxt, struct cpuid_leaf *leaf)
{
const struct snp_cpuid_table *cpuid_table = snp_cpuid_get_table();
@@ -576,7 +572,7 @@ static int snp_cpuid(struct ghcb *ghcb, struct es_em_ctxt *ctxt, struct cpuid_le
* page yet, so it only supports the MSR based communication with the
* hypervisor and only the CPUID exit-code.
*/
-void __init do_vc_no_ghcb(struct pt_regs *regs, unsigned long exit_code)
+void __head do_vc_no_ghcb(struct pt_regs *regs, unsigned long exit_code)
{
unsigned int subfn = lower_bits(regs->cx, 32);
unsigned int fn = lower_bits(regs->ax, 32);
@@ -1027,7 +1023,8 @@ struct cc_setup_data {
* Search for a Confidential Computing blob passed in as a setup_data entry
* via the Linux Boot Protocol.
*/
-static struct cc_blob_sev_info *find_cc_blob_setup_data(struct boot_params *bp)
+static __head
+struct cc_blob_sev_info *find_cc_blob_setup_data(struct boot_params *bp)
{
struct cc_setup_data *sd = NULL;
struct setup_data *hdr;
@@ -1054,7 +1051,7 @@ static struct cc_blob_sev_info *find_cc_blob_setup_data(struct boot_params *bp)
* mapping needs to be updated in sync with all the changes to virtual memory
* layout and related mapping facilities throughout the boot process.
*/
-static void __init setup_cpuid_table(const struct cc_blob_sev_info *cc_info)
+static void __head setup_cpuid_table(const struct cc_blob_sev_info *cc_info)
{
const struct snp_cpuid_table *cpuid_table_fw, *cpuid_table;
int i;
diff --git a/arch/x86/kernel/sev.c b/arch/x86/kernel/sev.c
index 7d242898852f5e..b59b09c2f28406 100644
--- a/arch/x86/kernel/sev.c
+++ b/arch/x86/kernel/sev.c
@@ -25,6 +25,7 @@
#include <linux/psp-sev.h>
#include <uapi/linux/sev-guest.h>
+#include <asm/init.h>
#include <asm/cpu_entry_area.h>
#include <asm/stacktrace.h>
#include <asm/sev.h>
@@ -701,8 +702,9 @@ static u64 __init get_jump_table_addr(void)
return ret;
}
-static void early_set_pages_state(unsigned long vaddr, unsigned long paddr,
- unsigned long npages, enum psc_op op)
+static void __head
+early_set_pages_state(unsigned long vaddr, unsigned long paddr,
+ unsigned long npages, enum psc_op op)
{
unsigned long paddr_end;
u64 val;
@@ -758,7 +760,7 @@ e_term:
sev_es_terminate(SEV_TERM_SET_LINUX, GHCB_TERM_PSC);
}
-void __init early_snp_set_memory_private(unsigned long vaddr, unsigned long paddr,
+void __head early_snp_set_memory_private(unsigned long vaddr, unsigned long paddr,
unsigned long npages)
{
/*
@@ -2081,7 +2083,7 @@ fail:
*
* Scan for the blob in that order.
*/
-static __init struct cc_blob_sev_info *find_cc_blob(struct boot_params *bp)
+static __head struct cc_blob_sev_info *find_cc_blob(struct boot_params *bp)
{
struct cc_blob_sev_info *cc_info;
@@ -2107,7 +2109,7 @@ found_cc_info:
return cc_info;
}
-bool __init snp_init(struct boot_params *bp)
+bool __head snp_init(struct boot_params *bp)
{
struct cc_blob_sev_info *cc_info;
@@ -2129,7 +2131,7 @@ bool __init snp_init(struct boot_params *bp)
return true;
}
-void __init __noreturn snp_abort(void)
+void __head __noreturn snp_abort(void)
{
sev_es_terminate(SEV_TERM_SET_GEN, GHCB_SNP_UNSUPPORTED);
}
diff --git a/arch/x86/kernel/sev_verify_cbit.S b/arch/x86/kernel/sev_verify_cbit.S
index 3355e27c69ebf8..1ab65f6c6ae7a1 100644
--- a/arch/x86/kernel/sev_verify_cbit.S
+++ b/arch/x86/kernel/sev_verify_cbit.S
@@ -77,7 +77,7 @@ SYM_FUNC_START(sev_verify_cbit)
* The check failed, prevent any forward progress to prevent ROP
* attacks, invalidate the stack and go into a hlt loop.
*/
- xorq %rsp, %rsp
+ xorl %esp, %esp
subq $0x1000, %rsp
2: hlt
jmp 2b
diff --git a/arch/x86/lib/Makefile b/arch/x86/lib/Makefile
index 72cc9c90e9f307..6da73513f02668 100644
--- a/arch/x86/lib/Makefile
+++ b/arch/x86/lib/Makefile
@@ -14,19 +14,6 @@ ifdef CONFIG_KCSAN
CFLAGS_REMOVE_delay.o = $(CC_FLAGS_FTRACE)
endif
-# Early boot use of cmdline; don't instrument it
-ifdef CONFIG_AMD_MEM_ENCRYPT
-KCOV_INSTRUMENT_cmdline.o := n
-KASAN_SANITIZE_cmdline.o := n
-KCSAN_SANITIZE_cmdline.o := n
-
-ifdef CONFIG_FUNCTION_TRACER
-CFLAGS_REMOVE_cmdline.o = -pg
-endif
-
-CFLAGS_cmdline.o := -fno-stack-protector -fno-jump-tables
-endif
-
inat_tables_script = $(srctree)/arch/x86/tools/gen-insn-attr-x86.awk
inat_tables_maps = $(srctree)/arch/x86/lib/x86-opcode-map.txt
quiet_cmd_inat_tables = GEN $@
diff --git a/arch/x86/mm/mem_encrypt_identity.c b/arch/x86/mm/mem_encrypt_identity.c
index 0166ab1780ccb1..64b5005d49e577 100644
--- a/arch/x86/mm/mem_encrypt_identity.c
+++ b/arch/x86/mm/mem_encrypt_identity.c
@@ -41,9 +41,9 @@
#include <linux/mem_encrypt.h>
#include <linux/cc_platform.h>
+#include <asm/init.h>
#include <asm/setup.h>
#include <asm/sections.h>
-#include <asm/cmdline.h>
#include <asm/coco.h>
#include <asm/sev.h>
@@ -95,10 +95,7 @@ struct sme_populate_pgd_data {
*/
static char sme_workarea[2 * PMD_SIZE] __section(".init.scratch");
-static char sme_cmdline_arg[] __initdata = "mem_encrypt";
-static char sme_cmdline_on[] __initdata = "on";
-
-static void __init sme_clear_pgd(struct sme_populate_pgd_data *ppd)
+static void __head sme_clear_pgd(struct sme_populate_pgd_data *ppd)
{
unsigned long pgd_start, pgd_end, pgd_size;
pgd_t *pgd_p;
@@ -113,7 +110,7 @@ static void __init sme_clear_pgd(struct sme_populate_pgd_data *ppd)
memset(pgd_p, 0, pgd_size);
}
-static pud_t __init *sme_prepare_pgd(struct sme_populate_pgd_data *ppd)
+static pud_t __head *sme_prepare_pgd(struct sme_populate_pgd_data *ppd)
{
pgd_t *pgd;
p4d_t *p4d;
@@ -150,7 +147,7 @@ static pud_t __init *sme_prepare_pgd(struct sme_populate_pgd_data *ppd)
return pud;
}
-static void __init sme_populate_pgd_large(struct sme_populate_pgd_data *ppd)
+static void __head sme_populate_pgd_large(struct sme_populate_pgd_data *ppd)
{
pud_t *pud;
pmd_t *pmd;
@@ -166,7 +163,7 @@ static void __init sme_populate_pgd_large(struct sme_populate_pgd_data *ppd)
set_pmd(pmd, __pmd(ppd->paddr | ppd->pmd_flags));
}
-static void __init sme_populate_pgd(struct sme_populate_pgd_data *ppd)
+static void __head sme_populate_pgd(struct sme_populate_pgd_data *ppd)
{
pud_t *pud;
pmd_t *pmd;
@@ -192,7 +189,7 @@ static void __init sme_populate_pgd(struct sme_populate_pgd_data *ppd)
set_pte(pte, __pte(ppd->paddr | ppd->pte_flags));
}
-static void __init __sme_map_range_pmd(struct sme_populate_pgd_data *ppd)
+static void __head __sme_map_range_pmd(struct sme_populate_pgd_data *ppd)
{
while (ppd->vaddr < ppd->vaddr_end) {
sme_populate_pgd_large(ppd);
@@ -202,7 +199,7 @@ static void __init __sme_map_range_pmd(struct sme_populate_pgd_data *ppd)
}
}
-static void __init __sme_map_range_pte(struct sme_populate_pgd_data *ppd)
+static void __head __sme_map_range_pte(struct sme_populate_pgd_data *ppd)
{
while (ppd->vaddr < ppd->vaddr_end) {
sme_populate_pgd(ppd);
@@ -212,7 +209,7 @@ static void __init __sme_map_range_pte(struct sme_populate_pgd_data *ppd)
}
}
-static void __init __sme_map_range(struct sme_populate_pgd_data *ppd,
+static void __head __sme_map_range(struct sme_populate_pgd_data *ppd,
pmdval_t pmd_flags, pteval_t pte_flags)
{
unsigned long vaddr_end;
@@ -236,22 +233,22 @@ static void __init __sme_map_range(struct sme_populate_pgd_data *ppd,
__sme_map_range_pte(ppd);
}
-static void __init sme_map_range_encrypted(struct sme_populate_pgd_data *ppd)
+static void __head sme_map_range_encrypted(struct sme_populate_pgd_data *ppd)
{
__sme_map_range(ppd, PMD_FLAGS_ENC, PTE_FLAGS_ENC);
}
-static void __init sme_map_range_decrypted(struct sme_populate_pgd_data *ppd)
+static void __head sme_map_range_decrypted(struct sme_populate_pgd_data *ppd)
{
__sme_map_range(ppd, PMD_FLAGS_DEC, PTE_FLAGS_DEC);
}
-static void __init sme_map_range_decrypted_wp(struct sme_populate_pgd_data *ppd)
+static void __head sme_map_range_decrypted_wp(struct sme_populate_pgd_data *ppd)
{
__sme_map_range(ppd, PMD_FLAGS_DEC_WP, PTE_FLAGS_DEC_WP);
}
-static unsigned long __init sme_pgtable_calc(unsigned long len)
+static unsigned long __head sme_pgtable_calc(unsigned long len)
{
unsigned long entries = 0, tables = 0;
@@ -288,7 +285,7 @@ static unsigned long __init sme_pgtable_calc(unsigned long len)
return entries + tables;
}
-void __init sme_encrypt_kernel(struct boot_params *bp)
+void __head sme_encrypt_kernel(struct boot_params *bp)
{
unsigned long workarea_start, workarea_end, workarea_len;
unsigned long execute_start, execute_end, execute_len;
@@ -323,9 +320,8 @@ void __init sme_encrypt_kernel(struct boot_params *bp)
* memory from being cached.
*/
- /* Physical addresses gives us the identity mapped virtual addresses */
- kernel_start = __pa_symbol(_text);
- kernel_end = ALIGN(__pa_symbol(_end), PMD_SIZE);
+ kernel_start = (unsigned long)RIP_REL_REF(_text);
+ kernel_end = ALIGN((unsigned long)RIP_REL_REF(_end), PMD_SIZE);
kernel_len = kernel_end - kernel_start;
initrd_start = 0;
@@ -343,14 +339,6 @@ void __init sme_encrypt_kernel(struct boot_params *bp)
#endif
/*
- * We're running identity mapped, so we must obtain the address to the
- * SME encryption workarea using rip-relative addressing.
- */
- asm ("lea sme_workarea(%%rip), %0"
- : "=r" (workarea_start)
- : "p" (sme_workarea));
-
- /*
* Calculate required number of workarea bytes needed:
* executable encryption area size:
* stack page (PAGE_SIZE)
@@ -359,7 +347,7 @@ void __init sme_encrypt_kernel(struct boot_params *bp)
* pagetable structures for the encryption of the kernel
* pagetable structures for workarea (in case not currently mapped)
*/
- execute_start = workarea_start;
+ execute_start = workarea_start = (unsigned long)RIP_REL_REF(sme_workarea);
execute_end = execute_start + (PAGE_SIZE * 2) + PMD_SIZE;
execute_len = execute_end - execute_start;
@@ -502,13 +490,11 @@ void __init sme_encrypt_kernel(struct boot_params *bp)
native_write_cr3(__native_read_cr3());
}
-void __init sme_enable(struct boot_params *bp)
+void __head sme_enable(struct boot_params *bp)
{
- const char *cmdline_ptr, *cmdline_arg, *cmdline_on;
unsigned int eax, ebx, ecx, edx;
unsigned long feature_mask;
unsigned long me_mask;
- char buffer[16];
bool snp;
u64 msr;
@@ -551,6 +537,9 @@ void __init sme_enable(struct boot_params *bp)
/* Check if memory encryption is enabled */
if (feature_mask == AMD_SME_BIT) {
+ if (!(bp->hdr.xloadflags & XLF_MEM_ENCRYPTION))
+ return;
+
/*
* No SME if Hypervisor bit is set. This check is here to
* prevent a guest from trying to enable SME. For running as a
@@ -570,31 +559,8 @@ void __init sme_enable(struct boot_params *bp)
msr = __rdmsr(MSR_AMD64_SYSCFG);
if (!(msr & MSR_AMD64_SYSCFG_MEM_ENCRYPT))
return;
- } else {
- /* SEV state cannot be controlled by a command line option */
- goto out;
}
- /*
- * Fixups have not been applied to phys_base yet and we're running
- * identity mapped, so we must obtain the address to the SME command
- * line argument data using rip-relative addressing.
- */
- asm ("lea sme_cmdline_arg(%%rip), %0"
- : "=r" (cmdline_arg)
- : "p" (sme_cmdline_arg));
- asm ("lea sme_cmdline_on(%%rip), %0"
- : "=r" (cmdline_on)
- : "p" (sme_cmdline_on));
-
- cmdline_ptr = (const char *)((u64)bp->hdr.cmd_line_ptr |
- ((u64)bp->ext_cmd_line_ptr << 32));
-
- if (cmdline_find_option(cmdline_ptr, cmdline_arg, buffer, sizeof(buffer)) < 0 ||
- strncmp(buffer, cmdline_on, sizeof(buffer)))
- return;
-
-out:
RIP_REL_REF(sme_me_mask) = me_mask;
physical_mask &= ~me_mask;
cc_vendor = CC_VENDOR_AMD;
diff --git a/arch/x86/realmode/rm/trampoline_64.S b/arch/x86/realmode/rm/trampoline_64.S
index c9f76fae902e4b..14d9c7daf90f48 100644
--- a/arch/x86/realmode/rm/trampoline_64.S
+++ b/arch/x86/realmode/rm/trampoline_64.S
@@ -37,13 +37,15 @@
.text
.code16
-.macro LOCK_AND_LOAD_REALMODE_ESP lock_pa=0
+.macro LOCK_AND_LOAD_REALMODE_ESP lock_pa=0 lock_rip=0
/*
* Make sure only one CPU fiddles with the realmode stack
*/
.Llock_rm\@:
.if \lock_pa
lock btsl $0, pa_tr_lock
+ .elseif \lock_rip
+ lock btsl $0, tr_lock(%rip)
.else
lock btsl $0, tr_lock
.endif
@@ -220,6 +222,35 @@ SYM_CODE_START(trampoline_start64)
lidt tr_idt(%rip)
lgdt tr_gdt64(%rip)
+ /* Check if paging mode has to be changed */
+ movq %cr4, %rax
+ xorl tr_cr4(%rip), %eax
+ testl $X86_CR4_LA57, %eax
+ jnz .L_switch_paging
+
+ /* Paging mode is correct proceed in 64-bit mode */
+
+ LOCK_AND_LOAD_REALMODE_ESP lock_rip=1
+
+ movw $__KERNEL_DS, %dx
+ movl %edx, %ss
+ addl $pa_real_mode_base, %esp
+ movl %edx, %ds
+ movl %edx, %es
+ movl %edx, %fs
+ movl %edx, %gs
+
+ movl $pa_trampoline_pgd, %eax
+ movq %rax, %cr3
+
+ pushq $__KERNEL_CS
+ pushq tr_start(%rip)
+ lretq
+.L_switch_paging:
+ /*
+ * To switch between 4- and 5-level paging modes, it is necessary
+ * to disable paging. This must be done in the compatibility mode.
+ */
ljmpl *tr_compat(%rip)
SYM_CODE_END(trampoline_start64)
diff --git a/drivers/firmware/efi/libstub/efi-stub-helper.c b/drivers/firmware/efi/libstub/efi-stub-helper.c
index bfa30625f5d031..3dc2f9aaf08db0 100644
--- a/drivers/firmware/efi/libstub/efi-stub-helper.c
+++ b/drivers/firmware/efi/libstub/efi-stub-helper.c
@@ -24,6 +24,8 @@ static bool efi_noinitrd;
static bool efi_nosoftreserve;
static bool efi_disable_pci_dma = IS_ENABLED(CONFIG_EFI_DISABLE_PCI_DMA);
+int efi_mem_encrypt;
+
bool __pure __efi_soft_reserve_enabled(void)
{
return !efi_nosoftreserve;
@@ -75,6 +77,12 @@ efi_status_t efi_parse_options(char const *cmdline)
efi_noinitrd = true;
} else if (IS_ENABLED(CONFIG_X86_64) && !strcmp(param, "no5lvl")) {
efi_no5lvl = true;
+ } else if (IS_ENABLED(CONFIG_ARCH_HAS_MEM_ENCRYPT) &&
+ !strcmp(param, "mem_encrypt") && val) {
+ if (parse_option_str(val, "on"))
+ efi_mem_encrypt = 1;
+ else if (parse_option_str(val, "off"))
+ efi_mem_encrypt = -1;
} else if (!strcmp(param, "efi") && val) {
efi_nochunk = parse_option_str(val, "nochunk");
efi_novamap |= parse_option_str(val, "novamap");
diff --git a/drivers/firmware/efi/libstub/efistub.h b/drivers/firmware/efi/libstub/efistub.h
index c04b82ea40f216..fc18fd649ed771 100644
--- a/drivers/firmware/efi/libstub/efistub.h
+++ b/drivers/firmware/efi/libstub/efistub.h
@@ -37,8 +37,8 @@ extern bool efi_no5lvl;
extern bool efi_nochunk;
extern bool efi_nokaslr;
extern int efi_loglevel;
+extern int efi_mem_encrypt;
extern bool efi_novamap;
-
extern const efi_system_table_t *efi_system_table;
typedef union efi_dxe_services_table efi_dxe_services_table_t;
diff --git a/drivers/firmware/efi/libstub/x86-stub.c b/drivers/firmware/efi/libstub/x86-stub.c
index 99429bc4b0c7eb..0336ed175e671a 100644
--- a/drivers/firmware/efi/libstub/x86-stub.c
+++ b/drivers/firmware/efi/libstub/x86-stub.c
@@ -884,6 +884,9 @@ void __noreturn efi_stub_entry(efi_handle_t handle,
}
}
+ if (efi_mem_encrypt > 0)
+ hdr->xloadflags |= XLF_MEM_ENCRYPTION;
+
status = efi_decompress_kernel(&kernel_entry);
if (status != EFI_SUCCESS) {
efi_err("Failed to decompress kernel\n");