linux/arch/x86/kernel/head32.c
Thomas Gleixner 0b62f6cb07 x86/microcode/32: Move early loading after paging enable
32-bit loads microcode before paging is enabled. The commit which
introduced that has zero justification in the changelog. The cover
letter has slightly more content, but it does not give any technical
justification either:

  "The problem in current microcode loading method is that we load a
   microcode way, way too late; ideally we should load it before turning
   paging on.  This may only be practical on 32 bits since we can't get
   to 64-bit mode without paging on, but we should still do it as early
   as at all possible."

Handwaving word salad with zero technical content.

Someone claimed in an offlist conversation that this is required for
curing the ATOM erratum AAE44/AAF40/AAG38/AAH41. That erratum requires
a microcode update in order to make the usage of PSE safe. But during
early boot, PSE is completely irrelevant and it is evaluated way later.

Neither is it relevant for the AP on single-core HT-enabled CPUs, as
the microcode loading on the AP does nothing.

On dual-core CPUs there is a theoretical problem if an executable large
page is split between enabling paging (including PSE) and loading the
microcode. But that is only theoretical; in practice it is irrelevant
because the affected dual-core CPUs are 64-bit enabled and therefore
have paging and PSE enabled before loading the microcode on the second
core. So why would it work on 64-bit but not on 32-bit?

The erratum:

  "AAG38 Code Fetch May Occur to Incorrect Address After a Large Page is
   Split Into 4-Kbyte Pages

   Problem: If software clears the PS (page size) bit in a present PDE
   (page directory entry), that will cause linear addresses mapped through
   this PDE to use 4-KByte pages instead of using a large page after old
   TLB entries are invalidated. Due to this erratum, if a code fetch uses
   this PDE before the TLB entry for the large page is invalidated then it
   may fetch from a different physical address than specified by either the
   old large page translation or the new 4-KByte page translation. This
   erratum may also cause speculative code fetches from incorrect addresses."

The practical relevance of this is exactly zero because there is no
splitting of large text pages during early boot, i.e. between paging
enable and microcode loading, nor during CPU hotplug.
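
For reference, "splitting" an executable large page means replacing a
present PDE which has the PS bit set with one pointing to a 4-Kbyte
page table covering the same range. A hypothetical sketch (illustrative
only, not the kernel's actual split code; alloc_early_pagetable() is a
stand-in allocator):

  pmd_t entry = *pmdp;                  /* present PDE with _PAGE_PSE set */
  pte_t *pt = alloc_early_pagetable();  /* hypothetical allocator */
  unsigned long pfn = pmd_pfn(entry);
  pgprot_t prot = __pgprot(pmd_flags(entry) & ~_PAGE_PSE);
  int i;

  /* Same physical layout, expressed as 4k PTEs */
  for (i = 0; i < PTRS_PER_PTE; i++)
          set_pte(&pt[i], pfn_pte(pfn + i, prot));

  /* Clearing PS by installing the page table based PDE ... */
  set_pmd(pmdp, __pmd(__pa(pt) | _PAGE_TABLE));

  /* ... opens the erratum window: a code fetch through this PDE before
   * the stale large page TLB entry is flushed may use a wrong address.
   */
  __flush_tlb_all();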

IOW, this loading of microcode before paging enable is yet another voodoo
programming solution in search of a problem. What's worse is that it causes
at least two serious problems:

 1) When stackprotector is enabled, the microcode loader code has the
    stackprotector mechanics enabled. The read of the per CPU variable
    __stack_chk_guard always goes through the virtual address, either
    directly on UP or via %fs on SMP. In physical address mode this
    results in an access to memory above 3GB, so it works only by
    chance: when there is no RAM at that physical address, the hardware
    returns the same value for both guard reads; when there is RAM
    populated above 3G, the reads happen to match because nothing
    changes that memory during the very early boot stage. That is not
    necessarily true during runtime CPU hotplug (see the sketch after
    this list).

 2) When function tracing is enabled, the relevant microcode loader
    functions and the functions invoked from there will call into the
    tracing code and evaluate global and per CPU variables in physical
    address mode. What could potentially go wrong?
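
To make #1 concrete, here is a minimal userspace sketch (hypothetical
demo code, not kernel code) of the stackprotector mechanics. Compiling
it with "gcc -m32 -O2 -fstack-protector-all -S canary.c" shows the
segment-relative guard accesses in the generated assembly; userspace
uses %gs:20 where the kernel reads the per CPU __stack_chk_guard via
%fs:

  /* canary.c - hypothetical demo, not kernel code */
  #include <string.h>

  void copy_string(char *dst, const char *src)
  {
          char buf[16];   /* on-stack array forces a canary */

          /* Compiler-emitted prologue (offsets vary):
           *      movl %gs:20, %eax       ; read guard via segment base
           *      movl %eax, -12(%ebp)    ; place canary on the stack
           */
          strncpy(buf, src, sizeof(buf) - 1);
          buf[sizeof(buf) - 1] = '\0';
          strcpy(dst, buf);
          /* Compiler-emitted epilogue:
           *      movl -12(%ebp), %eax
           *      xorl %gs:20, %eax       ; re-read guard and compare
           *      jne  __stack_chk_fail
           */
  }

Both guard reads go through a segment base, i.e. they are virtual
address accesses. Executed in physical address mode they hit whatever
sits at the corresponding physical address instead, which is exactly
the by-chance behaviour described in #1.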

Cure this by moving the microcode loading after the early paging enable,
using the new temporary initrd mapping, and removing the gunk in the
microcode loader that was required to handle physical address mode.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Borislav Petkov (AMD) <bp@alien8.de>
Link: https://lore.kernel.org/r/20231017211722.348298216@linutronix.de
2023-10-18 22:15:01 +02:00

// SPDX-License-Identifier: GPL-2.0
/*
 *  linux/arch/i386/kernel/head32.c -- prepare to run common code
 *
 *  Copyright (C) 2000 Andrea Arcangeli <andrea@suse.de> SuSE
 *  Copyright (C) 2007 Eric Biederman <ebiederm@xmission.com>
 */
#include <linux/init.h>
#include <linux/start_kernel.h>
#include <linux/mm.h>
#include <linux/memblock.h>

#include <asm/desc.h>
#include <asm/setup.h>
#include <asm/sections.h>
#include <asm/e820/api.h>
#include <asm/page.h>
#include <asm/apic.h>
#include <asm/io_apic.h>
#include <asm/bios_ebda.h>
#include <asm/microcode.h>
#include <asm/tlbflush.h>
#include <asm/bootparam_utils.h>

static void __init i386_default_early_setup(void)
{
	/* Initialize 32bit specific setup functions */
	x86_init.resources.reserve_resources = i386_reserve_resources;
	x86_init.mpparse.setup_ioapic_ids = setup_ioapic_ids_from_mpc;
}

#ifdef CONFIG_MICROCODE_INITRD32
unsigned long __initdata initrd_start_early;
static pte_t __initdata *initrd_pl2p_start, *initrd_pl2p_end;
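
/*
 * Zap the temporary initrd mapping installed by mk_early_pgtbl_32() once
 * the early microcode loader is done with it. On !PAE the PL2 entries
 * aliased at PAGE_OFFSET have to be cleared as well; in PAE mode the
 * identity and kernel views share the same kernel PMD, so clearing the
 * PMD entry covers both.
 */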
static void zap_early_initrd_mapping(void)
{
	pte_t *pl2p = initrd_pl2p_start;

	for (; pl2p < initrd_pl2p_end; pl2p++) {
		*pl2p = (pte_t){ .pte = 0 };

		if (!IS_ENABLED(CONFIG_X86_PAE))
			*(pl2p + ((PAGE_OFFSET >> PGDIR_SHIFT))) = (pte_t) {.pte = 0};
	}
}
#else
static inline void zap_early_initrd_mapping(void) { }
#endif

asmlinkage __visible void __init __noreturn i386_start_kernel(void)
{
	/* Make sure IDT is set up before any exception happens */
	idt_setup_early_handler();

	load_ucode_bsp();
	zap_early_initrd_mapping();

	cr4_init_shadow();

	sanitize_boot_params(&boot_params);

	x86_early_init_platform_quirks();

	/* Call the subarch specific early setup function */
	switch (boot_params.hdr.hardware_subarch) {
	case X86_SUBARCH_INTEL_MID:
		x86_intel_mid_early_setup();
		break;
	case X86_SUBARCH_CE4100:
		x86_ce4100_early_setup();
		break;
	default:
		i386_default_early_setup();
		break;
	}

	start_kernel();
}

/*
 * Initialize page tables. This creates a PDE and a set of page
 * tables, which are located immediately beyond __brk_base. The variable
 * _brk_end is set up to point to the first "safe" location.
 * Mappings are created both at virtual address 0 (identity mapping)
 * and PAGE_OFFSET for up to _end.
 *
 * In PAE mode initial_page_table is statically defined to contain
 * enough entries to cover the VMSPLIT option (that is the top 1, 2 or 3
 * entries). The identity mapping is handled by pointing two PGD entries
 * to the first kernel PMD. Note the upper half of each PMD or PTE are
 * always zero at this stage.
 */
#ifdef CONFIG_X86_PAE
typedef pmd_t			pl2_t;
#define pl2_base		initial_pg_pmd
#define SET_PL2(val)		{ .pmd = (val), }
#else
typedef pgd_t			pl2_t;
#define pl2_base		initial_page_table
#define SET_PL2(val)		{ .pgd = (val), }
#endif
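
/*
 * Fill page tables with 4k PTEs starting at @pte and hook each table
 * into the next consecutive PL2 (PGD or PMD) slot until @limit is
 * reached. On !PAE every slot is duplicated at the PAGE_OFFSET index so
 * the range is reachable through the kernel mapping as well. Returns
 * the first PTE value past the mapped range.
 */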
static __init __no_stack_protector pte_t init_map(pte_t pte, pte_t **ptep, pl2_t **pl2p,
						  const unsigned long limit)
{
	while ((pte.pte & PTE_PFN_MASK) < limit) {
		pl2_t pl2 = SET_PL2((unsigned long)*ptep | PDE_IDENT_ATTR);
		int i;

		**pl2p = pl2;
		if (!IS_ENABLED(CONFIG_X86_PAE)) {
			/* Kernel PDE entry */
			*(*pl2p + ((PAGE_OFFSET >> PGDIR_SHIFT))) = pl2;
		}

		for (i = 0; i < PTRS_PER_PTE; i++) {
			**ptep = pte;
			pte.pte += PAGE_SIZE;
			(*ptep)++;
		}
		(*pl2p)++;
	}
	return pte;
}
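
/*
 * Runs with paging disabled, so all globals have to be addressed via
 * __pa_nodebug(). Sets up the identity and PAGE_OFFSET mappings up to
 * _end plus the associated page tables and, for the 32-bit early
 * microcode loader, appends a temporary mapping of the initrd recorded
 * in initrd_start_early and the initrd_pl2p_{start,end} cleanup range.
 */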
void __init __no_stack_protector mk_early_pgtbl_32(void)
{
	/* Enough space to fit pagetables for the low memory linear map */
	unsigned long limit = __pa_nodebug(_end) +
		(PAGE_TABLE_SIZE(LOWMEM_PAGES) << PAGE_SHIFT);
	pte_t pte, *ptep = (pte_t *)__pa_nodebug(__brk_base);
	struct boot_params __maybe_unused *params;
	pl2_t *pl2p = (pl2_t *)__pa_nodebug(pl2_base);
	unsigned long *ptr;

	pte.pte = PTE_IDENT_ATTR;
	pte = init_map(pte, &ptep, &pl2p, limit);

	ptr = (unsigned long *)__pa_nodebug(&max_pfn_mapped);
	/* Can't use pte_pfn() since it's a call with CONFIG_PARAVIRT */
	*ptr = (pte.pte & PTE_PFN_MASK) >> PAGE_SHIFT;

	ptr = (unsigned long *)__pa_nodebug(&_brk_end);
	*ptr = (unsigned long)ptep + PAGE_OFFSET;

#ifdef CONFIG_MICROCODE_INITRD32
	/* Running on a hypervisor? */
	if (native_cpuid_ecx(1) & BIT(31))
		return;

	params = (struct boot_params *)__pa_nodebug(&boot_params);
	if (!params->hdr.ramdisk_size || !params->hdr.ramdisk_image)
		return;

	/* Save the virtual start address */
	ptr = (unsigned long *)__pa_nodebug(&initrd_start_early);
	*ptr = (pte.pte & PTE_PFN_MASK) + PAGE_OFFSET;
	*ptr += ((unsigned long)params->hdr.ramdisk_image) & ~PAGE_MASK;

	/* Save pl2p for cleanup */
	ptr = (unsigned long *)__pa_nodebug(&initrd_pl2p_start);
	*ptr = (unsigned long)pl2p + PAGE_OFFSET;

	limit = (unsigned long)params->hdr.ramdisk_image;
	pte.pte = PTE_IDENT_ATTR | PFN_ALIGN(limit);
	limit = (unsigned long)params->hdr.ramdisk_image + params->hdr.ramdisk_size;

	init_map(pte, &ptep, &pl2p, limit);

	ptr = (unsigned long *)__pa_nodebug(&initrd_pl2p_end);
	*ptr = (unsigned long)pl2p + PAGE_OFFSET;
#endif
}