linux/arch/x86/kernel/module.c
Linus Torvalds 94a855111e - Add the call depth tracking mitigation for Retbleed which has
been long in the making. It is a lighterweight software-only fix for
 Skylake-based cores where enabling IBRS is a big hammer and causes a
 significant performance impact.
 
 What it basically does is, it aligns all kernel functions to 16 bytes
 boundary and adds a 16-byte padding before the function, objtool
 collects all functions' locations and when the mitigation gets applied,
 it patches a call accounting thunk which is used to track the call depth
 of the stack at any time.
 
 When that call depth reaches a magical, microarchitecture-specific value
 for the Return Stack Buffer, the code stuffs that RSB and avoids its
 underflow which could otherwise lead to the Intel variant of Retbleed.
 
 This software-only solution brings a lot of the lost performance back,
 as benchmarks suggest:
 
   https://lore.kernel.org/all/20220915111039.092790446@infradead.org/
 
 That page above also contains a lot more detailed explanation of the
 whole mechanism
 
 - Implement a new control flow integrity scheme called FineIBT which is
 based on the software kCFI implementation and uses hardware IBT support
 where present to annotate and track indirect branches using a hash to
 validate them
 
 - Other misc fixes and cleanups
 -----BEGIN PGP SIGNATURE-----
 
 iQIzBAABCgAdFiEEzv7L6UO9uDPlPSfHEsHwGGHeVUoFAmOZp5EACgkQEsHwGGHe
 VUrZFxAAvi/+8L0IYSK4mKJvixGbTFjxN/Swo2JVOfs34LqGUT6JaBc+VUMwZxdb
 VMTFIZ3ttkKEodjhxGI7oGev6V8UfhI37SmO2lYKXpQVjXXnMlv/M+Vw3teE38CN
 gopi+xtGnT1IeWQ3tc/Tv18pleJ0mh5HKWiW+9KoqgXj0wgF9x4eRYDz1TDCDA/A
 iaBzs56j8m/FSykZHnrWZ/MvjKNPdGlfJASUCPeTM2dcrXQGJ93+X2hJctzDte0y
 Nuiw6Y0htfFBE7xoJn+sqm5Okr+McoUM18/CCprbgSKYk18iMYm3ZtAi6FUQZS1A
 ua4wQCf49loGp15PO61AS5d3OBf5D3q/WihQRbCaJvTVgPp9sWYnWwtcVUuhMllh
 ZQtBU9REcVJ/22bH09Q9CjBW0VpKpXHveqQdqRDViLJ6v/iI6EFGmD24SW/VxyRd
 73k9MBGrL/dOf1SbEzdsnvcSB3LGzp0Om8o/KzJWOomrVKjBCJy16bwTEsCZEJmP
 i406m92GPXeaN1GhTko7vmF0GnkEdJs1GVCZPluCAxxbhHukyxHnrjlQjI4vC80n
 Ylc0B3Kvitw7LGJsPqu+/jfNHADC/zhx1qz/30wb5cFmFbN1aRdp3pm8JYUkn+l/
 zri2Y6+O89gvE/9/xUhMohzHsWUO7xITiBavewKeTP9GSWybWUs=
 =cRy1
 -----END PGP SIGNATURE-----

Merge tag 'x86_core_for_v6.2' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip

Pull x86 core updates from Borislav Petkov:

 - Add the call depth tracking mitigation for Retbleed which has been
   long in the making. It is a lighterweight software-only fix for
   Skylake-based cores where enabling IBRS is a big hammer and causes a
   significant performance impact.

   What it basically does is, it aligns all kernel functions to 16 bytes
   boundary and adds a 16-byte padding before the function, objtool
   collects all functions' locations and when the mitigation gets
   applied, it patches a call accounting thunk which is used to track
   the call depth of the stack at any time.

   When that call depth reaches a magical, microarchitecture-specific
   value for the Return Stack Buffer, the code stuffs that RSB and
   avoids its underflow which could otherwise lead to the Intel variant
   of Retbleed.

   This software-only solution brings a lot of the lost performance
   back, as benchmarks suggest:

       https://lore.kernel.org/all/20220915111039.092790446@infradead.org/

   That page above also contains a lot more detailed explanation of the
   whole mechanism

 - Implement a new control flow integrity scheme called FineIBT which is
   based on the software kCFI implementation and uses hardware IBT
   support where present to annotate and track indirect branches using a
   hash to validate them

 - Other misc fixes and cleanups

* tag 'x86_core_for_v6.2' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip: (80 commits)
  x86/paravirt: Use common macro for creating simple asm paravirt functions
  x86/paravirt: Remove clobber bitmask from .parainstructions
  x86/debug: Include percpu.h in debugreg.h to get DECLARE_PER_CPU() et al
  x86/cpufeatures: Move X86_FEATURE_CALL_DEPTH from bit 18 to bit 19 of word 11, to leave space for WIP X86_FEATURE_SGX_EDECCSSA bit
  x86/Kconfig: Enable kernel IBT by default
  x86,pm: Force out-of-line memcpy()
  objtool: Fix weak hole vs prefix symbol
  objtool: Optimize elf_dirty_reloc_sym()
  x86/cfi: Add boot time hash randomization
  x86/cfi: Boot time selection of CFI scheme
  x86/ibt: Implement FineIBT
  objtool: Add --cfi to generate the .cfi_sites section
  x86: Add prefix symbols for function padding
  objtool: Add option to generate prefix symbols
  objtool: Avoid O(bloody terrible) behaviour -- an ode to libelf
  objtool: Slice up elf_create_section_symbol()
  kallsyms: Revert "Take callthunks into account"
  x86: Unconfuse CONFIG_ and X86_FEATURE_ namespaces
  x86/retpoline: Fix crash printing warning
  x86/paravirt: Fix a !PARAVIRT build warning
  ...
2022-12-14 15:03:00 -08:00

360 lines
8.8 KiB
C

// SPDX-License-Identifier: GPL-2.0-or-later
/* Kernel module help for x86.
Copyright (C) 2001 Rusty Russell.
*/
#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
#include <linux/moduleloader.h>
#include <linux/elf.h>
#include <linux/vmalloc.h>
#include <linux/fs.h>
#include <linux/string.h>
#include <linux/kernel.h>
#include <linux/kasan.h>
#include <linux/bug.h>
#include <linux/mm.h>
#include <linux/gfp.h>
#include <linux/jump_label.h>
#include <linux/random.h>
#include <linux/memory.h>
#include <asm/text-patching.h>
#include <asm/page.h>
#include <asm/setup.h>
#include <asm/unwind.h>
#if 0
#define DEBUGP(fmt, ...) \
printk(KERN_DEBUG fmt, ##__VA_ARGS__)
#else
#define DEBUGP(fmt, ...) \
do { \
if (0) \
printk(KERN_DEBUG fmt, ##__VA_ARGS__); \
} while (0)
#endif
#ifdef CONFIG_RANDOMIZE_BASE
static unsigned long module_load_offset;
/* Mutex protects the module_load_offset. */
static DEFINE_MUTEX(module_kaslr_mutex);
static unsigned long int get_module_load_offset(void)
{
if (kaslr_enabled()) {
mutex_lock(&module_kaslr_mutex);
/*
* Calculate the module_load_offset the first time this
* code is called. Once calculated it stays the same until
* reboot.
*/
if (module_load_offset == 0)
module_load_offset =
get_random_u32_inclusive(1, 1024) * PAGE_SIZE;
mutex_unlock(&module_kaslr_mutex);
}
return module_load_offset;
}
#else
static unsigned long int get_module_load_offset(void)
{
return 0;
}
#endif
void *module_alloc(unsigned long size)
{
gfp_t gfp_mask = GFP_KERNEL;
void *p;
if (PAGE_ALIGN(size) > MODULES_LEN)
return NULL;
p = __vmalloc_node_range(size, MODULE_ALIGN,
MODULES_VADDR + get_module_load_offset(),
MODULES_END, gfp_mask, PAGE_KERNEL,
VM_FLUSH_RESET_PERMS | VM_DEFER_KMEMLEAK,
NUMA_NO_NODE, __builtin_return_address(0));
if (p && (kasan_alloc_module_shadow(p, size, gfp_mask) < 0)) {
vfree(p);
return NULL;
}
return p;
}
#ifdef CONFIG_X86_32
int apply_relocate(Elf32_Shdr *sechdrs,
const char *strtab,
unsigned int symindex,
unsigned int relsec,
struct module *me)
{
unsigned int i;
Elf32_Rel *rel = (void *)sechdrs[relsec].sh_addr;
Elf32_Sym *sym;
uint32_t *location;
DEBUGP("Applying relocate section %u to %u\n",
relsec, sechdrs[relsec].sh_info);
for (i = 0; i < sechdrs[relsec].sh_size / sizeof(*rel); i++) {
/* This is where to make the change */
location = (void *)sechdrs[sechdrs[relsec].sh_info].sh_addr
+ rel[i].r_offset;
/* This is the symbol it is referring to. Note that all
undefined symbols have been resolved. */
sym = (Elf32_Sym *)sechdrs[symindex].sh_addr
+ ELF32_R_SYM(rel[i].r_info);
switch (ELF32_R_TYPE(rel[i].r_info)) {
case R_386_32:
/* We add the value into the location given */
*location += sym->st_value;
break;
case R_386_PC32:
case R_386_PLT32:
/* Add the value, subtract its position */
*location += sym->st_value - (uint32_t)location;
break;
default:
pr_err("%s: Unknown relocation: %u\n",
me->name, ELF32_R_TYPE(rel[i].r_info));
return -ENOEXEC;
}
}
return 0;
}
#else /*X86_64*/
static int __apply_relocate_add(Elf64_Shdr *sechdrs,
const char *strtab,
unsigned int symindex,
unsigned int relsec,
struct module *me,
void *(*write)(void *dest, const void *src, size_t len))
{
unsigned int i;
Elf64_Rela *rel = (void *)sechdrs[relsec].sh_addr;
Elf64_Sym *sym;
void *loc;
u64 val;
DEBUGP("Applying relocate section %u to %u\n",
relsec, sechdrs[relsec].sh_info);
for (i = 0; i < sechdrs[relsec].sh_size / sizeof(*rel); i++) {
/* This is where to make the change */
loc = (void *)sechdrs[sechdrs[relsec].sh_info].sh_addr
+ rel[i].r_offset;
/* This is the symbol it is referring to. Note that all
undefined symbols have been resolved. */
sym = (Elf64_Sym *)sechdrs[symindex].sh_addr
+ ELF64_R_SYM(rel[i].r_info);
DEBUGP("type %d st_value %Lx r_addend %Lx loc %Lx\n",
(int)ELF64_R_TYPE(rel[i].r_info),
sym->st_value, rel[i].r_addend, (u64)loc);
val = sym->st_value + rel[i].r_addend;
switch (ELF64_R_TYPE(rel[i].r_info)) {
case R_X86_64_NONE:
break;
case R_X86_64_64:
if (*(u64 *)loc != 0)
goto invalid_relocation;
write(loc, &val, 8);
break;
case R_X86_64_32:
if (*(u32 *)loc != 0)
goto invalid_relocation;
write(loc, &val, 4);
if (val != *(u32 *)loc)
goto overflow;
break;
case R_X86_64_32S:
if (*(s32 *)loc != 0)
goto invalid_relocation;
write(loc, &val, 4);
if ((s64)val != *(s32 *)loc)
goto overflow;
break;
case R_X86_64_PC32:
case R_X86_64_PLT32:
if (*(u32 *)loc != 0)
goto invalid_relocation;
val -= (u64)loc;
write(loc, &val, 4);
#if 0
if ((s64)val != *(s32 *)loc)
goto overflow;
#endif
break;
case R_X86_64_PC64:
if (*(u64 *)loc != 0)
goto invalid_relocation;
val -= (u64)loc;
write(loc, &val, 8);
break;
default:
pr_err("%s: Unknown rela relocation: %llu\n",
me->name, ELF64_R_TYPE(rel[i].r_info));
return -ENOEXEC;
}
}
return 0;
invalid_relocation:
pr_err("x86/modules: Skipping invalid relocation target, existing value is nonzero for type %d, loc %p, val %Lx\n",
(int)ELF64_R_TYPE(rel[i].r_info), loc, val);
return -ENOEXEC;
overflow:
pr_err("overflow in relocation type %d val %Lx\n",
(int)ELF64_R_TYPE(rel[i].r_info), val);
pr_err("`%s' likely not compiled with -mcmodel=kernel\n",
me->name);
return -ENOEXEC;
}
int apply_relocate_add(Elf64_Shdr *sechdrs,
const char *strtab,
unsigned int symindex,
unsigned int relsec,
struct module *me)
{
int ret;
bool early = me->state == MODULE_STATE_UNFORMED;
void *(*write)(void *, const void *, size_t) = memcpy;
if (!early) {
write = text_poke;
mutex_lock(&text_mutex);
}
ret = __apply_relocate_add(sechdrs, strtab, symindex, relsec, me,
write);
if (!early) {
text_poke_sync();
mutex_unlock(&text_mutex);
}
return ret;
}
#endif
int module_finalize(const Elf_Ehdr *hdr,
const Elf_Shdr *sechdrs,
struct module *me)
{
const Elf_Shdr *s, *alt = NULL, *locks = NULL,
*para = NULL, *orc = NULL, *orc_ip = NULL,
*retpolines = NULL, *returns = NULL, *ibt_endbr = NULL,
*calls = NULL, *cfi = NULL;
char *secstrings = (void *)hdr + sechdrs[hdr->e_shstrndx].sh_offset;
for (s = sechdrs; s < sechdrs + hdr->e_shnum; s++) {
if (!strcmp(".altinstructions", secstrings + s->sh_name))
alt = s;
if (!strcmp(".smp_locks", secstrings + s->sh_name))
locks = s;
if (!strcmp(".parainstructions", secstrings + s->sh_name))
para = s;
if (!strcmp(".orc_unwind", secstrings + s->sh_name))
orc = s;
if (!strcmp(".orc_unwind_ip", secstrings + s->sh_name))
orc_ip = s;
if (!strcmp(".retpoline_sites", secstrings + s->sh_name))
retpolines = s;
if (!strcmp(".return_sites", secstrings + s->sh_name))
returns = s;
if (!strcmp(".call_sites", secstrings + s->sh_name))
calls = s;
if (!strcmp(".cfi_sites", secstrings + s->sh_name))
cfi = s;
if (!strcmp(".ibt_endbr_seal", secstrings + s->sh_name))
ibt_endbr = s;
}
/*
* See alternative_instructions() for the ordering rules between the
* various patching types.
*/
if (para) {
void *pseg = (void *)para->sh_addr;
apply_paravirt(pseg, pseg + para->sh_size);
}
if (retpolines || cfi) {
void *rseg = NULL, *cseg = NULL;
unsigned int rsize = 0, csize = 0;
if (retpolines) {
rseg = (void *)retpolines->sh_addr;
rsize = retpolines->sh_size;
}
if (cfi) {
cseg = (void *)cfi->sh_addr;
csize = cfi->sh_size;
}
apply_fineibt(rseg, rseg + rsize, cseg, cseg + csize);
}
if (retpolines) {
void *rseg = (void *)retpolines->sh_addr;
apply_retpolines(rseg, rseg + retpolines->sh_size);
}
if (returns) {
void *rseg = (void *)returns->sh_addr;
apply_returns(rseg, rseg + returns->sh_size);
}
if (alt) {
/* patch .altinstructions */
void *aseg = (void *)alt->sh_addr;
apply_alternatives(aseg, aseg + alt->sh_size);
}
if (calls || para) {
struct callthunk_sites cs = {};
if (calls) {
cs.call_start = (void *)calls->sh_addr;
cs.call_end = (void *)calls->sh_addr + calls->sh_size;
}
if (para) {
cs.pv_start = (void *)para->sh_addr;
cs.pv_end = (void *)para->sh_addr + para->sh_size;
}
callthunks_patch_module_calls(&cs, me);
}
if (ibt_endbr) {
void *iseg = (void *)ibt_endbr->sh_addr;
apply_ibt_endbr(iseg, iseg + ibt_endbr->sh_size);
}
if (locks) {
void *lseg = (void *)locks->sh_addr;
void *text = me->core_layout.base;
void *text_end = text + me->core_layout.text_size;
alternatives_smp_module_add(me, me->name,
lseg, lseg + locks->sh_size,
text, text_end);
}
if (orc && orc_ip)
unwind_module_init(me, (void *)orc_ip->sh_addr, orc_ip->sh_size,
(void *)orc->sh_addr, orc->sh_size);
return 0;
}
void module_arch_cleanup(struct module *mod)
{
alternatives_smp_module_del(mod);
}