linux/kernel/seccomp.c
Linus Torvalds 8b5c6a3a49 audit/stable-4.18 PR 20180605
-----BEGIN PGP SIGNATURE-----
 
 iQJIBAABCAAyFiEEcQCq365ubpQNLgrWVeRaWujKfIoFAlsXFUEUHHBhdWxAcGF1
 bC1tb29yZS5jb20ACgkQVeRaWujKfIoomg//eRNpc6x9kxTijN670AC2uD0CBTlZ
 2z6mHuJaOhG8bTxjZxQfUBoo6/eZJ2YC1yq6ornGFNzw4sfKsR/j86ujJim2HAmo
 opUhziq3SILGEvjsxfPkREe/wb49jy0AA/WjZqciitB1ig8Hz7xzqi0lpNaEspFh
 QJFB6XXkojWGFGrRzruAVJnPS+pDWoTQR0qafs3JWKnpeinpOdZnl1hPsysAEHt5
 Ag8o4qS/P9xJM0khi7T+jWECmTyT/mtWqEtFcZ0o+JLOgt/EMvNX6DO4ETDiYRD2
 mVChga9x5r78bRgNy2U8IlEWWa76WpcQAEODvhzbijX4RxMAmjsmLE+e+udZSnMZ
 eCITl2f7ExxrL5SwNFC/5h7pAv0RJ+SOC19vcyeV4JDlQNNVjUy/aNKv5baV0aeg
 EmkeobneMWxqHx52aERz8RF1in5pT8gLOYoYnWfNpcDEmjLrwhuZLX2asIzUEqrS
 SoPJ8hxIDCxceHOWIIrz5Dqef7x28Dyi46w3QINC8bSy2RnR/H3q40DRegvXOGiS
 9WcbbwbhnM4Kau413qKicGCvdqTVYdeyZqo7fVelSciD139Vk7pZotyom4MuU25p
 fIyGfXa8/8gkl7fZ+HNkZbba0XWNfAZt//zT095qsp3CkhVnoybwe6OwG1xRqErq
 W7OOQbS7vvN/KGo=
 =10u6
 -----END PGP SIGNATURE-----

Merge tag 'audit-pr-20180605' of git://git.kernel.org/pub/scm/linux/kernel/git/pcmoore/audit

Pull audit updates from Paul Moore:
 "Another reasonable chunk of audit changes for v4.18, thirteen patches
  in total.

  The thirteen patches can mostly be broken down into one of four
  categories: general bug fixes, accessor functions for audit state
  stored in the task_struct, negative filter matches on executable
  names, and extending the (relatively) new seccomp logging knobs to the
  audit subsystem.

  The main driver for the accessor functions from Richard are the
  changes we're working on to associate audit events with containers,
  but I think they have some standalone value too so I figured it would
  be good to get them in now.

  The seccomp/audit patches from Tyler apply the seccomp logging
  improvements from a few releases ago to audit's seccomp logging;
  starting with this patchset the changes in
  /proc/sys/kernel/seccomp/actions_logged should apply to both the
  standard kernel logging and audit.

  As usual, everything passes the audit-testsuite and it happens to
  merge cleanly with your tree"

[ Heh, except it had trivial merge conflicts with the SELinux tree that
  also came in from Paul   - Linus ]

* tag 'audit-pr-20180605' of git://git.kernel.org/pub/scm/linux/kernel/git/pcmoore/audit:
  audit: Fix wrong task in comparison of session ID
  audit: use existing session info function
  audit: normalize loginuid read access
  audit: use new audit_context access funciton for seccomp_actions_logged
  audit: use inline function to set audit context
  audit: use inline function to get audit context
  audit: convert sessionid unset to a macro
  seccomp: Don't special case audited processes when logging
  seccomp: Audit attempts to modify the actions_logged sysctl
  seccomp: Configurable separator for the actions_logged string
  seccomp: Separate read and write code for actions_logged sysctl
  audit: allow not equal op for audit by executable
  audit: add syscall information to FEATURE_CHANGE records
2018-06-06 16:34:00 -07:00

1345 lines
34 KiB
C

// SPDX-License-Identifier: GPL-2.0
/*
* linux/kernel/seccomp.c
*
* Copyright 2004-2005 Andrea Arcangeli <andrea@cpushare.com>
*
* Copyright (C) 2012 Google, Inc.
* Will Drewry <wad@chromium.org>
*
* This defines a simple but solid secure-computing facility.
*
* Mode 1 uses a fixed list of allowed system calls.
* Mode 2 allows user-defined system call filters in the form
* of Berkeley Packet Filters/Linux Socket Filters.
*/
#include <linux/refcount.h>
#include <linux/audit.h>
#include <linux/compat.h>
#include <linux/coredump.h>
#include <linux/kmemleak.h>
#include <linux/nospec.h>
#include <linux/prctl.h>
#include <linux/sched.h>
#include <linux/sched/task_stack.h>
#include <linux/seccomp.h>
#include <linux/slab.h>
#include <linux/syscalls.h>
#include <linux/sysctl.h>
#ifdef CONFIG_HAVE_ARCH_SECCOMP_FILTER
#include <asm/syscall.h>
#endif
#ifdef CONFIG_SECCOMP_FILTER
#include <linux/filter.h>
#include <linux/pid.h>
#include <linux/ptrace.h>
#include <linux/security.h>
#include <linux/tracehook.h>
#include <linux/uaccess.h>
/**
* struct seccomp_filter - container for seccomp BPF programs
*
* @usage: reference count to manage the object lifetime.
* get/put helpers should be used when accessing an instance
* outside of a lifetime-guarded section. In general, this
* is only needed for handling filters shared across tasks.
* @log: true if all actions except for SECCOMP_RET_ALLOW should be logged
* @prev: points to a previously installed, or inherited, filter
* @prog: the BPF program to evaluate
*
* seccomp_filter objects are organized in a tree linked via the @prev
* pointer. For any task, it appears to be a singly-linked list starting
* with current->seccomp.filter, the most recently attached or inherited filter.
* However, multiple filters may share a @prev node, by way of fork(), which
* results in a unidirectional tree existing in memory. This is similar to
* how namespaces work.
*
* seccomp_filter objects should never be modified after being attached
* to a task_struct (other than @usage).
*/
struct seccomp_filter {
/* Reference count; __put_seccomp_filter() frees the object at zero. */
refcount_t usage;
/* Set at attach time when SECCOMP_FILTER_FLAG_LOG was requested. */
bool log;
/* Next (older) filter on the chain; NULL terminates the walk. */
struct seccomp_filter *prev;
/* BPF program run against the syscall data for this filter. */
struct bpf_prog *prog;
};
/*
 * Limit any path through the tree to 256KB worth of instructions:
 * (1 << 18) bytes divided by the size of one struct sock_filter gives the
 * instruction budget that seccomp_attach_filter() compares against.
 */
#define MAX_INSNS_PER_PATH ((1 << 18) / sizeof(struct sock_filter))
/*
 * Fill @sd from the current task: syscall number, architecture, the six
 * syscall arguments and the instruction pointer.
 *
 * Endianness is explicitly ignored and left for BPF program authors to
 * manage as per the specific architecture.
 */
static void populate_seccomp_data(struct seccomp_data *sd)
{
	struct task_struct *tsk = current;
	struct pt_regs *regs = task_pt_regs(tsk);
	unsigned long raw_args[6];
	int i;

	sd->nr = syscall_get_nr(tsk, regs);
	sd->arch = syscall_get_arch();

	/* Fetch all six arguments at once, then copy them slot by slot. */
	syscall_get_arguments(tsk, regs, 0, 6, raw_args);
	for (i = 0; i < 6; i++)
		sd->args[i] = raw_args[i];

	sd->instruction_pointer = KSTK_EIP(tsk);
}
/**
 * seccomp_check_filter - verify seccomp filter code
 * @filter: filter to verify
 * @flen: length of filter
 *
 * Takes a previously checked filter (by bpf_check_classic) and rewrites,
 * in place, every instruction that would load struct sk_buff data or its
 * length so that it refers to struct seccomp_data instead:
 *
 *  - BPF_LD | BPF_W | BPF_ABS becomes BPF_LDX | BPF_W | BPF_ABS; its
 *    offset must be 32-bit aligned and inside struct seccomp_data.
 *  - BPF_LD | BPF_W | BPF_LEN and BPF_LDX | BPF_W | BPF_LEN become
 *    immediate loads of sizeof(struct seccomp_data).
 *
 * Every other opcode must appear in the explicit list of allowed
 * instructions below; anything else is rejected.
 *
 * Returns 0 if the rule set is legal or -EINVAL if not.
 */
static int seccomp_check_filter(struct sock_filter *filter, unsigned int flen)
{
	/* Unsigned to match @flen and avoid a signed/unsigned comparison. */
	unsigned int pc;

	for (pc = 0; pc < flen; pc++) {
		struct sock_filter *ftest = &filter[pc];
		u16 code = ftest->code;
		u32 k = ftest->k;

		switch (code) {
		case BPF_LD | BPF_W | BPF_ABS:
			/*
			 * The opcode is rewritten before the offset check,
			 * so a rejected filter is left with this instruction
			 * already modified.
			 */
			ftest->code = BPF_LDX | BPF_W | BPF_ABS;
			/* 32-bit aligned and not out of bounds. */
			if (k >= sizeof(struct seccomp_data) || k & 3)
				return -EINVAL;
			continue;
		case BPF_LD | BPF_W | BPF_LEN:
			/* The "length" is the fixed size of the syscall data. */
			ftest->code = BPF_LD | BPF_IMM;
			ftest->k = sizeof(struct seccomp_data);
			continue;
		case BPF_LDX | BPF_W | BPF_LEN:
			ftest->code = BPF_LDX | BPF_IMM;
			ftest->k = sizeof(struct seccomp_data);
			continue;
		/* Explicitly include allowed calls. */
		case BPF_RET | BPF_K:
		case BPF_RET | BPF_A:
		case BPF_ALU | BPF_ADD | BPF_K:
		case BPF_ALU | BPF_ADD | BPF_X:
		case BPF_ALU | BPF_SUB | BPF_K:
		case BPF_ALU | BPF_SUB | BPF_X:
		case BPF_ALU | BPF_MUL | BPF_K:
		case BPF_ALU | BPF_MUL | BPF_X:
		case BPF_ALU | BPF_DIV | BPF_K:
		case BPF_ALU | BPF_DIV | BPF_X:
		case BPF_ALU | BPF_AND | BPF_K:
		case BPF_ALU | BPF_AND | BPF_X:
		case BPF_ALU | BPF_OR | BPF_K:
		case BPF_ALU | BPF_OR | BPF_X:
		case BPF_ALU | BPF_XOR | BPF_K:
		case BPF_ALU | BPF_XOR | BPF_X:
		case BPF_ALU | BPF_LSH | BPF_K:
		case BPF_ALU | BPF_LSH | BPF_X:
		case BPF_ALU | BPF_RSH | BPF_K:
		case BPF_ALU | BPF_RSH | BPF_X:
		case BPF_ALU | BPF_NEG:
		case BPF_LD | BPF_IMM:
		case BPF_LDX | BPF_IMM:
		case BPF_MISC | BPF_TAX:
		case BPF_MISC | BPF_TXA:
		case BPF_LD | BPF_MEM:
		case BPF_LDX | BPF_MEM:
		case BPF_ST:
		case BPF_STX:
		case BPF_JMP | BPF_JA:
		case BPF_JMP | BPF_JEQ | BPF_K:
		case BPF_JMP | BPF_JEQ | BPF_X:
		case BPF_JMP | BPF_JGE | BPF_K:
		case BPF_JMP | BPF_JGE | BPF_X:
		case BPF_JMP | BPF_JGT | BPF_K:
		case BPF_JMP | BPF_JGT | BPF_X:
		case BPF_JMP | BPF_JSET | BPF_K:
		case BPF_JMP | BPF_JSET | BPF_X:
			continue;
		default:
			return -EINVAL;
		}
	}
	return 0;
}
/**
* seccomp_run_filters - evaluates all seccomp filters against @sd
* @sd: optional seccomp data to be passed to filters
* @match: stores struct seccomp_filter that resulted in the return value,
* unless filter returned SECCOMP_RET_ALLOW, in which case it will
* be unchanged.
*
* Returns valid seccomp BPF response codes.
*/
/*
 * Keep only the action bits of a filter return value and view them as a
 * signed 32-bit number, so two actions can be ordered with a plain '<'.
 * NOTE(review): presumably this makes actions with the top bit set sort
 * lowest -- confirm against the SECCOMP_RET_* definitions.
 */
#define ACTION_ONLY(ret) ((s32)((ret) & (SECCOMP_RET_ACTION_FULL)))
static u32 seccomp_run_filters(const struct seccomp_data *sd,
struct seccomp_filter **match)
{
struct seccomp_data sd_local;
u32 ret = SECCOMP_RET_ALLOW;
/* Make sure cross-thread synced filter points somewhere sane. */
struct seccomp_filter *f =
READ_ONCE(current->seccomp.filter);
/* Ensure unexpected behavior doesn't result in failing open. */
if (unlikely(WARN_ON(f == NULL)))
return SECCOMP_RET_KILL_PROCESS;
/* No data supplied by the caller: build it locally. */
if (!sd) {
populate_seccomp_data(&sd_local);
sd = &sd_local;
}
/*
 * All filters in the list are evaluated and the lowest BPF return
 * value always takes priority (ignoring the DATA).
 */
for (; f; f = f->prev) {
u32 cur_ret = BPF_PROG_RUN(f->prog, sd);
/*
 * Strictly-less comparison: on a tie the earlier result, and the
 * filter recorded in *match, are kept.
 */
if (ACTION_ONLY(cur_ret) < ACTION_ONLY(ret)) {
ret = cur_ret;
*match = f;
}
}
return ret;
}
#endif /* CONFIG_SECCOMP_FILTER */
/*
 * Tell whether the current task may be put into @seccomp_mode: allowed
 * when no mode is active yet, or when the active mode already equals the
 * requested one. The caller must hold current->sighand->siglock.
 */
static inline bool seccomp_may_assign_mode(unsigned long seccomp_mode)
{
	assert_spin_locked(&current->sighand->siglock);

	return !current->seccomp.mode ||
	       current->seccomp.mode == seccomp_mode;
}
/*
 * Weak, empty default; a non-weak definition elsewhere replaces it.
 * Called from seccomp_assign_mode() unless SECCOMP_FILTER_FLAG_SPEC_ALLOW
 * is set.
 */
void __weak arch_seccomp_spec_mitigate(struct task_struct *task) { }
/*
 * Put @task into @seccomp_mode and flag it with TIF_SECCOMP.
 * @flags are the filter flags; only SECCOMP_FILTER_FLAG_SPEC_ALLOW is
 * looked at here. The caller must hold task->sighand->siglock.
 */
static inline void seccomp_assign_mode(struct task_struct *task,
unsigned long seccomp_mode,
unsigned long flags)
{
assert_spin_locked(&task->sighand->siglock);
task->seccomp.mode = seccomp_mode;
/*
 * Make sure TIF_SECCOMP cannot be set before the mode (and
 * filter) is set.
 */
smp_mb__before_atomic();
/* Assume default seccomp processes want spec flaw mitigation. */
if ((flags & SECCOMP_FILTER_FLAG_SPEC_ALLOW) == 0)
arch_seccomp_spec_mitigate(task);
/* Set the thread flag last, after the mode store and the barrier. */
set_tsk_thread_flag(task, TIF_SECCOMP);
}
#ifdef CONFIG_SECCOMP_FILTER
/*
 * Walk @child's ->prev chain looking for @parent.
 *
 * Returns 1 if @parent is NULL or is found on the chain (@child itself
 * counts), and 0 otherwise.
 */
static int is_ancestor(struct seccomp_filter *parent,
		       struct seccomp_filter *child)
{
	struct seccomp_filter *node = child;

	/* NULL is the root ancestor. */
	if (!parent)
		return 1;

	while (node) {
		if (node == parent)
			return 1;
		node = node->prev;
	}
	return 0;
}
/**
* seccomp_can_sync_threads: checks if all threads can be synchronized
*
* Expects sighand and cred_guard_mutex locks to be held.
*
* Returns 0 on success, -ve on error, or the pid of a thread which was
* either not in the correct seccomp mode or it did not have an ancestral
* seccomp filter.
*/
static inline pid_t seccomp_can_sync_threads(void)
{
	struct task_struct *thread, *caller;

	BUG_ON(!mutex_is_locked(&current->signal->cred_guard_mutex));
	assert_spin_locked(&current->sighand->siglock);

	caller = current;

	/* Every sibling must be eligible; stop at the first one that is not. */
	for_each_thread(caller, thread) {
		pid_t offender;
		bool eligible;

		/* The caller initiates the sync and needs no check. */
		if (thread == caller)
			continue;

		/*
		 * A thread can be synced if it has no seccomp mode yet, or
		 * if it is in filter mode and its filter is an ancestor of
		 * the caller's filter.
		 */
		eligible = thread->seccomp.mode == SECCOMP_MODE_DISABLED ||
			   (thread->seccomp.mode == SECCOMP_MODE_FILTER &&
			    is_ancestor(thread->seccomp.filter,
					caller->seccomp.filter));
		if (eligible)
			continue;

		/* Report the first thread that cannot be synchronized. */
		offender = task_pid_vnr(thread);
		/* If the pid cannot be resolved, then return -ESRCH */
		if (unlikely(WARN_ON(offender == 0)))
			return -ESRCH;
		return offender;
	}
	return 0;
}
/**
* seccomp_sync_threads: sets all threads to use current's filter
*
* Expects sighand and cred_guard_mutex locks to be held, and for
* seccomp_can_sync_threads() to have returned success already
* without dropping the locks.
*
*/
/*
 * @flags: filter flags, passed on to seccomp_assign_mode() for threads
 * that are switched into filter mode here.
 */
static inline void seccomp_sync_threads(unsigned long flags)
{
struct task_struct *thread, *caller;
BUG_ON(!mutex_is_locked(&current->signal->cred_guard_mutex));
assert_spin_locked(&current->sighand->siglock);
/* Synchronize all threads. */
caller = current;
for_each_thread(caller, thread) {
/* Skip current, since it needs no changes. */
if (thread == caller)
continue;
/* Get a task reference for the new leaf node. */
get_seccomp_filter(caller);
/*
 * Drop the task reference to the shared ancestor since
 * current's path will hold a reference. (This also
 * allows a put before the assignment.)
 */
put_seccomp_filter(thread);
/* Publish the caller's filter as this thread's filter. */
smp_store_release(&thread->seccomp.filter,
caller->seccomp.filter);
/*
 * Don't let an unprivileged task work around
 * the no_new_privs restriction by creating
 * a thread that sets it up, enters seccomp,
 * then dies.
 */
if (task_no_new_privs(caller))
task_set_no_new_privs(thread);
/*
 * Opt the other thread into seccomp if needed.
 * As threads are considered to be trust-realm
 * equivalent (see ptrace_may_access), it is safe to
 * allow one thread to transition the other.
 */
if (thread->seccomp.mode == SECCOMP_MODE_DISABLED)
seccomp_assign_mode(thread, SECCOMP_MODE_FILTER,
flags);
}
}
/**
* seccomp_prepare_filter: Prepares a seccomp filter for use.
* @fprog: BPF program to install
*
* Returns filter on success or an ERR_PTR on failure.
*/
static struct seccomp_filter *seccomp_prepare_filter(struct sock_fprog *fprog)
{
	struct seccomp_filter *new_filter;
	int err;

	/* Reject empty and oversized programs before doing any work. */
	if (!fprog->len || fprog->len > BPF_MAXINSNS)
		return ERR_PTR(-EINVAL);

	BUG_ON(INT_MAX / fprog->len < sizeof(struct sock_filter));

	/*
	 * Installing a seccomp filter requires that the task has
	 * CAP_SYS_ADMIN in its namespace or be running with no_new_privs.
	 * This avoids scenarios where unprivileged tasks can affect the
	 * behavior of privileged children.
	 */
	if (!task_no_new_privs(current) &&
	    security_capable_noaudit(current_cred(), current_user_ns(),
				     CAP_SYS_ADMIN) != 0)
		return ERR_PTR(-EACCES);

	/* Zeroed allocation: ->log and ->prev start out cleared. */
	new_filter = kzalloc(sizeof(*new_filter), GFP_KERNEL | __GFP_NOWARN);
	if (!new_filter)
		return ERR_PTR(-ENOMEM);

	/*
	 * Build the program from the user-supplied instructions; the last
	 * argument asks for the original program to be saved when
	 * CONFIG_CHECKPOINT_RESTORE is enabled.
	 */
	err = bpf_prog_create_from_user(&new_filter->prog, fprog,
					seccomp_check_filter,
					IS_ENABLED(CONFIG_CHECKPOINT_RESTORE));
	if (err < 0) {
		kfree(new_filter);
		return ERR_PTR(err);
	}

	/* The caller owns the single initial reference. */
	refcount_set(&new_filter->usage, 1);
	return new_filter;
}
/**
 * seccomp_prepare_user_filter - prepares a user-supplied sock_fprog
 * @user_filter: pointer to the user data containing a sock_fprog.
 *
 * Copies the sock_fprog descriptor from user space (using the compat
 * layout for compat syscalls) and hands it to seccomp_prepare_filter().
 *
 * Returns the prepared filter on success, ERR_PTR(-EFAULT) if the
 * descriptor cannot be copied, or the ERR_PTR returned by
 * seccomp_prepare_filter().
 */
static struct seccomp_filter *
seccomp_prepare_user_filter(const char __user *user_filter)
{
struct sock_fprog fprog;
/* Default result for both copy_from_user() failure paths. */
struct seccomp_filter *filter = ERR_PTR(-EFAULT);
#ifdef CONFIG_COMPAT
if (in_compat_syscall()) {
struct compat_sock_fprog fprog32;
if (copy_from_user(&fprog32, user_filter, sizeof(fprog32)))
goto out;
/* Convert the compat descriptor into the native form. */
fprog.len = fprog32.len;
fprog.filter = compat_ptr(fprog32.filter);
} else /* falls through to the if below. */
#endif
if (copy_from_user(&fprog, user_filter, sizeof(fprog)))
goto out;
filter = seccomp_prepare_filter(&fprog);
out:
return filter;
}
/**
* seccomp_attach_filter: validate and attach filter
* @flags: flags to change filter behavior
* @filter: seccomp filter to add to the current process
*
* Caller must be holding current->sighand->siglock lock.
*
* Returns 0 on success, -ve on error.
*/
static long seccomp_attach_filter(unsigned int flags,
struct seccomp_filter *filter)
{
unsigned long total_insns;
struct seccomp_filter *walker;
assert_spin_locked(&current->sighand->siglock);
/* Validate resulting filter length. */
total_insns = filter->prog->len;
for (walker = current->seccomp.filter; walker; walker = walker->prev)
total_insns += walker->prog->len + 4; /* 4 instr penalty */
if (total_insns > MAX_INSNS_PER_PATH)
return -ENOMEM;
/* If thread sync has been requested, check that it is possible. */
if (flags & SECCOMP_FILTER_FLAG_TSYNC) {
int ret;
ret = seccomp_can_sync_threads();
if (ret)
return ret;
}
/* Set log flag, if present. */
if (flags & SECCOMP_FILTER_FLAG_LOG)
filter->log = true;
/*
* If there is an existing filter, make it the prev and don't drop its
* task reference.
*/
filter->prev = current->seccomp.filter;
current->seccomp.filter = filter;
/* Now that the new filter is in place, synchronize to all threads. */
if (flags & SECCOMP_FILTER_FLAG_TSYNC)
seccomp_sync_threads(flags);
return 0;
}
/* Take one additional reference on @filter, which must not be NULL. */
static void __get_seccomp_filter(struct seccomp_filter *filter)
{
/* Reference count is bounded by the number of total processes. */
refcount_inc(&filter->usage);
}
/* get_seccomp_filter - take a reference on @tsk's filter, if it has one */
void get_seccomp_filter(struct task_struct *tsk)
{
	struct seccomp_filter *attached = tsk->seccomp.filter;

	if (attached)
		__get_seccomp_filter(attached);
}
/* Destroy @filter's BPF program and free the filter; NULL is a no-op. */
static inline void seccomp_filter_free(struct seccomp_filter *filter)
{
	if (!filter)
		return;

	bpf_prog_destroy(filter->prog);
	kfree(filter);
}
/*
 * Drop one reference on @orig (which may be NULL). Each filter whose
 * count reaches zero is freed and the walk continues with its ->prev;
 * it stops at the first filter that is still referenced.
 */
static void __put_seccomp_filter(struct seccomp_filter *orig)
{
	struct seccomp_filter *node = orig;

	/* Clean up single-reference branches iteratively. */
	for (;;) {
		struct seccomp_filter *dead;

		if (!node || !refcount_dec_and_test(&node->usage))
			break;

		/* Read ->prev before the node is freed. */
		dead = node;
		node = node->prev;
		seccomp_filter_free(dead);
	}
}
/* put_seccomp_filter - drop the reference @tsk holds on tsk->seccomp.filter */
void put_seccomp_filter(struct task_struct *tsk)
{
	struct seccomp_filter *attached = tsk->seccomp.filter;

	__put_seccomp_filter(attached);
}
/*
 * Initialise @info as a SIGSYS / SYS_SECCOMP siginfo for the current task.
 * @syscall is stored in si_syscall and @reason in si_errno.
 */
static void seccomp_init_siginfo(siginfo_t *info, int syscall, int reason)
{
	/* Start from a fully cleared structure. */
	clear_siginfo(info);

	/* Generic signal identification. */
	info->si_signo = SIGSYS;
	info->si_errno = reason;
	info->si_code = SYS_SECCOMP;

	/* Details of the syscall being reported. */
	info->si_syscall = syscall;
	info->si_arch = syscall_get_arch();
	info->si_call_addr = (void __user *)KSTK_EIP(current);
}
/**
* seccomp_send_sigsys - signals the task to allow in-process syscall emulation
* @syscall: syscall number to send to userland
* @reason: filter-supplied reason code to send to userland (via si_errno)
*
* Forces a SIGSYS with a code of SYS_SECCOMP and related sigsys info.
*/
static void seccomp_send_sigsys(int syscall, int reason)
{
struct siginfo info;
seccomp_init_siginfo(&info, syscall, reason);
force_sig_info(SIGSYS, &info, current);
}
#endif /* CONFIG_SECCOMP_FILTER */
/*
 * For use with seccomp_actions_logged: one bit per seccomp action.
 * seccomp_log() tests the bit matching the action being taken.
 */
#define SECCOMP_LOG_KILL_PROCESS (1 << 0)
#define SECCOMP_LOG_KILL_THREAD (1 << 1)
#define SECCOMP_LOG_TRAP (1 << 2)
#define SECCOMP_LOG_ERRNO (1 << 3)
#define SECCOMP_LOG_TRACE (1 << 4)
#define SECCOMP_LOG_LOG (1 << 5)
#define SECCOMP_LOG_ALLOW (1 << 6)
/* Default mask: every action bit except SECCOMP_LOG_ALLOW. */
static u32 seccomp_actions_logged = SECCOMP_LOG_KILL_PROCESS |
SECCOMP_LOG_KILL_THREAD |
SECCOMP_LOG_TRAP |
SECCOMP_LOG_ERRNO |
SECCOMP_LOG_TRACE |
SECCOMP_LOG_LOG;
/*
 * Emit an audit record for @action on @syscall if logging applies.
 *
 * @signr is passed through to audit_seccomp(). @requested tells whether
 * logging was asked for; it only matters for RET_TRAP, RET_ERRNO and
 * RET_TRACE, which are never logged without it.
 */
static inline void seccomp_log(unsigned long syscall, long signr, u32 action,
			       bool requested)
{
	bool needs_request = false;
	u32 log_bit;

	switch (action) {
	case SECCOMP_RET_ALLOW:
		/* Allowed syscalls are never logged from here. */
		return;
	case SECCOMP_RET_TRAP:
		log_bit = SECCOMP_LOG_TRAP;
		needs_request = true;
		break;
	case SECCOMP_RET_ERRNO:
		log_bit = SECCOMP_LOG_ERRNO;
		needs_request = true;
		break;
	case SECCOMP_RET_TRACE:
		log_bit = SECCOMP_LOG_TRACE;
		needs_request = true;
		break;
	case SECCOMP_RET_LOG:
		log_bit = SECCOMP_LOG_LOG;
		break;
	case SECCOMP_RET_KILL_THREAD:
		log_bit = SECCOMP_LOG_KILL_THREAD;
		break;
	case SECCOMP_RET_KILL_PROCESS:
	default:
		log_bit = SECCOMP_LOG_KILL_PROCESS;
		break;
	}

	if (needs_request && !requested)
		return;

	/*
	 * Emit an audit message when the action is RET_KILL_*, RET_LOG, or the
	 * FILTER_FLAG_LOG bit was set. The admin has the ability to silence
	 * any action from being logged by removing the action name from the
	 * seccomp_actions_logged sysctl.
	 */
	if (!(seccomp_actions_logged & log_bit))
		return;

	audit_seccomp(syscall, signr, action);
}
/*
 * Secure computing mode 1 allows only read/write/exit/sigreturn.
 * To be fully secure this must be combined with rlimit
 * to limit the stack allocations too.
 *
 * The list is scanned by __secure_computing_strict(), which stops at
 * the trailing 0 entry.
 */
static const int mode1_syscalls[] = {
__NR_seccomp_read, __NR_seccomp_write, __NR_seccomp_exit, __NR_seccomp_sigreturn,
0, /* null terminated */
};
/*
 * Enforce strict mode for @this_syscall: return quietly if the syscall is
 * on the applicable whitelist, otherwise log the violation and kill the
 * task with SIGKILL.
 */
static void __secure_computing_strict(int this_syscall)
{
	const int *allowed = mode1_syscalls;

#ifdef CONFIG_COMPAT
	/* Compat syscalls are checked against the compat list instead. */
	if (in_compat_syscall())
		allowed = get_compat_mode1_syscalls();
#endif
	/*
	 * The first entry is compared before any terminator test, so a
	 * leading entry whose value is 0 can still match.
	 */
	for (;;) {
		if (*allowed == this_syscall)
			return;
		allowed++;
		if (!*allowed)
			break;
	}

#ifdef SECCOMP_DEBUG
	dump_stack();
#endif
	seccomp_log(this_syscall, SIGKILL, SECCOMP_RET_KILL_THREAD, true);
	do_exit(SIGKILL);
}
#ifndef CONFIG_HAVE_ARCH_SECCOMP_FILTER
/*
 * Entry point used when CONFIG_HAVE_ARCH_SECCOMP_FILTER is not defined:
 * only the disabled and strict modes are valid here, anything else is
 * a BUG().
 */
void secure_computing_strict(int this_syscall)
{
	int mode = current->seccomp.mode;

	/* Seccomp may be suspended by a tracer under checkpoint/restore. */
	if (IS_ENABLED(CONFIG_CHECKPOINT_RESTORE) &&
	    unlikely(current->ptrace & PT_SUSPEND_SECCOMP))
		return;

	switch (mode) {
	case SECCOMP_MODE_DISABLED:
		return;
	case SECCOMP_MODE_STRICT:
		__secure_computing_strict(this_syscall);
		return;
	default:
		BUG();
	}
}
#else
#ifdef CONFIG_SECCOMP_FILTER
/*
 * __seccomp_filter - run the task's filters and act on the result
 * @this_syscall: number of the syscall being checked
 * @sd: seccomp data to evaluate, or NULL to have it read from the
 *      current registers by seccomp_run_filters()
 * @recheck_after_trace: true when this is the re-evaluation that follows
 *      a tracer notification
 *
 * Returns 0 if the syscall should go ahead and -1 if it must be skipped.
 * The kill actions end in do_exit()/do_group_exit() and do not return.
 */
static int __seccomp_filter(int this_syscall, const struct seccomp_data *sd,
const bool recheck_after_trace)
{
u32 filter_ret, action;
struct seccomp_filter *match = NULL;
int data;
/*
 * Make sure that any changes to mode from another thread have
 * been seen after TIF_SECCOMP was seen.
 */
rmb();
filter_ret = seccomp_run_filters(sd, &match);
/* Split the result into its data and action parts. */
data = filter_ret & SECCOMP_RET_DATA;
action = filter_ret & SECCOMP_RET_ACTION_FULL;
switch (action) {
case SECCOMP_RET_ERRNO:
/* Set low-order bits as an errno, capped at MAX_ERRNO. */
if (data > MAX_ERRNO)
data = MAX_ERRNO;
syscall_set_return_value(current, task_pt_regs(current),
-data, 0);
goto skip;
case SECCOMP_RET_TRAP:
/* Show the handler the original registers. */
syscall_rollback(current, task_pt_regs(current));
/* Let the filter pass back 16 bits of data. */
seccomp_send_sigsys(this_syscall, data);
goto skip;
case SECCOMP_RET_TRACE:
/* We've been put in this state by the ptracer already. */
if (recheck_after_trace)
return 0;
/* ENOSYS these calls if there is no tracer attached. */
if (!ptrace_event_enabled(current, PTRACE_EVENT_SECCOMP)) {
syscall_set_return_value(current,
task_pt_regs(current),
-ENOSYS, 0);
goto skip;
}
/* Allow the BPF to provide the event message */
ptrace_event(PTRACE_EVENT_SECCOMP, data);
/*
 * The delivery of a fatal signal during event
 * notification may silently skip tracer notification,
 * which could leave us with a potentially unmodified
 * syscall that the tracer would have liked to have
 * changed. Since the process is about to die, we just
 * force the syscall to be skipped and let the signal
 * kill the process and correctly handle any tracer exit
 * notifications.
 */
if (fatal_signal_pending(current))
goto skip;
/* Check if the tracer forced the syscall to be skipped. */
this_syscall = syscall_get_nr(current, task_pt_regs(current));
if (this_syscall < 0)
goto skip;
/*
 * Recheck the syscall, since it may have changed. This
 * intentionally uses a NULL struct seccomp_data to force
 * a reload of all registers. This does not goto skip since
 * a skip would have already been reported.
 */
if (__seccomp_filter(this_syscall, NULL, true))
return -1;
return 0;
case SECCOMP_RET_LOG:
/* Log the syscall and let it run. */
seccomp_log(this_syscall, 0, action, true);
return 0;
case SECCOMP_RET_ALLOW:
/*
 * Note that the "match" filter will always be NULL for
 * this action since SECCOMP_RET_ALLOW is the starting
 * state in seccomp_run_filters().
 */
return 0;
case SECCOMP_RET_KILL_THREAD:
case SECCOMP_RET_KILL_PROCESS:
default:
/* Unrecognised actions are handled here as well. */
seccomp_log(this_syscall, SIGSYS, action, true);
/* Dump core only if this is the last remaining thread. */
if (action == SECCOMP_RET_KILL_PROCESS ||
get_nr_threads(current) == 1) {
siginfo_t info;
/* Show the original registers in the dump. */
syscall_rollback(current, task_pt_regs(current));
/* Trigger a manual coredump since do_exit skips it. */
seccomp_init_siginfo(&info, this_syscall, data);
do_coredump(&info);
}
/* KILL_PROCESS exits the whole group, everything else this task only. */
if (action == SECCOMP_RET_KILL_PROCESS)
do_group_exit(SIGSYS);
else
do_exit(SIGSYS);
}
unreachable();
skip:
/* Skipped syscalls are logged only if the matching filter has ->log set. */
seccomp_log(this_syscall, 0, action, match ? match->log : false);
return -1;
}
#else
/*
 * Variant built when CONFIG_SECCOMP_FILTER is not defined: reaching it
 * is a BUG().
 */
static int __seccomp_filter(int this_syscall, const struct seccomp_data *sd,
const bool recheck_after_trace)
{
BUG();
}
#endif
/*
 * Dispatch a syscall check according to the current task's seccomp mode.
 * @sd may be NULL, in which case the syscall number is read from the
 * task's registers.
 *
 * Returns 0 when seccomp is suspended or strict mode lets the syscall
 * through, and otherwise whatever __seccomp_filter() returns. Any mode
 * other than strict or filter is a BUG().
 */
int __secure_computing(const struct seccomp_data *sd)
{
	int mode = current->seccomp.mode;
	int this_syscall;

	/* Seccomp may be suspended by a tracer under checkpoint/restore. */
	if (IS_ENABLED(CONFIG_CHECKPOINT_RESTORE) &&
	    unlikely(current->ptrace & PT_SUSPEND_SECCOMP))
		return 0;

	if (sd)
		this_syscall = sd->nr;
	else
		this_syscall = syscall_get_nr(current, task_pt_regs(current));

	if (mode == SECCOMP_MODE_STRICT) {
		__secure_computing_strict(this_syscall);  /* may call do_exit */
		return 0;
	}
	if (mode == SECCOMP_MODE_FILTER)
		return __seccomp_filter(this_syscall, sd, false);

	BUG();
}
#endif /* CONFIG_HAVE_ARCH_SECCOMP_FILTER */
/* Returns the current task's seccomp mode (current->seccomp.mode). */
long prctl_get_seccomp(void)
{
return current->seccomp.mode;
}
/**
 * seccomp_set_mode_strict: internal function for setting strict seccomp
 *
 * Switches the current task to SECCOMP_MODE_STRICT while holding its
 * siglock. Once current->seccomp.mode is non-zero, it may not be changed.
 *
 * Returns 0 on success or -EINVAL on failure.
 */
static long seccomp_set_mode_strict(void)
{
	const unsigned long wanted = SECCOMP_MODE_STRICT;
	long ret = -EINVAL;

	spin_lock_irq(&current->sighand->siglock);

	if (seccomp_may_assign_mode(wanted)) {
#ifdef TIF_NOTSC
		disable_TSC();
#endif
		seccomp_assign_mode(current, wanted, 0);
		ret = 0;
	}

	spin_unlock_irq(&current->sighand->siglock);

	return ret;
}
#ifdef CONFIG_SECCOMP_FILTER
/**
 * seccomp_set_mode_filter: internal function for setting seccomp filter
 * @flags: flags to change filter behavior
 * @filter: struct sock_fprog containing filter
 *
 * This function may be called repeatedly to install additional filters.
 * Every filter successfully installed will be evaluated (in reverse order)
 * for each system call the task makes.
 *
 * Once current->seccomp.mode is non-zero, it may not be changed.
 *
 * Returns 0 on success. On failure returns -EINVAL (unknown flags, a
 * conflicting seccomp mode, or an interrupted wait for cred_guard_mutex),
 * the error from seccomp_prepare_user_filter(), or the non-zero value
 * returned by seccomp_attach_filter().
 */
static long seccomp_set_mode_filter(unsigned int flags,
const char __user *filter)
{
const unsigned long seccomp_mode = SECCOMP_MODE_FILTER;
struct seccomp_filter *prepared = NULL;
long ret = -EINVAL;
/* Validate flags. */
if (flags & ~SECCOMP_FILTER_FLAG_MASK)
return -EINVAL;
/* Prepare the new filter before holding any locks. */
prepared = seccomp_prepare_user_filter(filter);
if (IS_ERR(prepared))
return PTR_ERR(prepared);
/*
 * Make sure we cannot change seccomp or nnp state via TSYNC
 * while another thread is in the middle of calling exec.
 */
if (flags & SECCOMP_FILTER_FLAG_TSYNC &&
mutex_lock_killable(&current->signal->cred_guard_mutex))
goto out_free;
spin_lock_irq(&current->sighand->siglock);
if (!seccomp_may_assign_mode(seccomp_mode))
goto out;
ret = seccomp_attach_filter(flags, prepared);
if (ret)
goto out;
/* Do not free the successfully attached filter. */
prepared = NULL;
seccomp_assign_mode(current, seccomp_mode, flags);
out:
spin_unlock_irq(&current->sighand->siglock);
/* The mutex was only taken when TSYNC was requested. */
if (flags & SECCOMP_FILTER_FLAG_TSYNC)
mutex_unlock(&current->signal->cred_guard_mutex);
out_free:
/* No-op when the filter was attached (prepared is NULL by then). */
seccomp_filter_free(prepared);
return ret;
}
#else
/*
 * Variant built when CONFIG_SECCOMP_FILTER is not defined: filter mode is
 * unavailable, so every request fails with -EINVAL.
 */
static inline long seccomp_set_mode_filter(unsigned int flags,
const char __user *filter)
{
return -EINVAL;
}
#endif
/*
 * Check whether the action value stored at @uaction is one of the known
 * SECCOMP_RET_* actions.
 *
 * Returns 0 if it is, -EOPNOTSUPP if it is not, and -EFAULT if the value
 * cannot be read from user space.
 */
static long seccomp_get_action_avail(const char __user *uaction)
{
	u32 requested;

	if (copy_from_user(&requested, uaction, sizeof(requested)))
		return -EFAULT;

	switch (requested) {
	case SECCOMP_RET_KILL_PROCESS:
	case SECCOMP_RET_KILL_THREAD:
	case SECCOMP_RET_TRAP:
	case SECCOMP_RET_ERRNO:
	case SECCOMP_RET_TRACE:
	case SECCOMP_RET_LOG:
	case SECCOMP_RET_ALLOW:
		return 0;
	default:
		return -EOPNOTSUPP;
	}
}
/*
 * Common entry point for both prctl and syscall.
 *
 * Validates the arguments each operation does not use (strict mode takes
 * neither flags nor uargs, the action query takes no flags) and then
 * dispatches on @op. Unknown operations yield -EINVAL.
 */
static long do_seccomp(unsigned int op, unsigned int flags,
		       const char __user *uargs)
{
	if (op == SECCOMP_SET_MODE_STRICT) {
		if (flags || uargs)
			return -EINVAL;
		return seccomp_set_mode_strict();
	}

	if (op == SECCOMP_SET_MODE_FILTER)
		return seccomp_set_mode_filter(flags, uargs);

	if (op == SECCOMP_GET_ACTION_AVAIL) {
		if (flags)
			return -EINVAL;
		return seccomp_get_action_avail(uargs);
	}

	return -EINVAL;
}
/* seccomp syscall entry point: forwards its arguments to do_seccomp(). */
SYSCALL_DEFINE3(seccomp, unsigned int, op, unsigned int, flags,
const char __user *, uargs)
{
return do_seccomp(op, flags, uargs);
}
/**
 * prctl_set_seccomp: configures current->seccomp.mode
 * @seccomp_mode: requested mode to use
 * @filter: optional struct sock_fprog for use with SECCOMP_MODE_FILTER
 *
 * Translates the prctl mode into the matching do_seccomp() operation.
 * The prctl interface doesn't have flags, so they are always zero.
 *
 * Returns the result of do_seccomp(), or -EINVAL for an unknown mode.
 */
long prctl_set_seccomp(unsigned long seccomp_mode, char __user *filter)
{
	switch (seccomp_mode) {
	case SECCOMP_MODE_STRICT:
		/*
		 * Setting strict mode through prctl always ignored filter,
		 * so pass NULL here to satisfy the internal check in
		 * do_seccomp().
		 */
		return do_seccomp(SECCOMP_SET_MODE_STRICT, 0, NULL);
	case SECCOMP_MODE_FILTER:
		return do_seccomp(SECCOMP_SET_MODE_FILTER, 0, filter);
	default:
		return -EINVAL;
	}
}
#if defined(CONFIG_SECCOMP_FILTER) && defined(CONFIG_CHECKPOINT_RESTORE)
/*
 * Look up one filter of @task by offset and return it with an extra
 * reference held; the caller releases it with __put_seccomp_filter().
 *
 * @filter_off counts from the far end of the ->prev chain: with N filters
 * on the chain, offset 0 selects the last node reached by following
 * ->prev and offset N - 1 selects task->seccomp.filter itself.
 *
 * Returns the filter, ERR_PTR(-EINVAL) if @task is not in filter mode, or
 * ERR_PTR(-ENOENT) if @filter_off is out of range.
 */
static struct seccomp_filter *get_nth_filter(struct task_struct *task,
unsigned long filter_off)
{
struct seccomp_filter *orig, *filter;
unsigned long count;
/*
 * Note: this is only correct because the caller should be the (ptrace)
 * tracer of the task, otherwise lock_task_sighand is needed.
 */
spin_lock_irq(&task->sighand->siglock);
if (task->seccomp.mode != SECCOMP_MODE_FILTER) {
spin_unlock_irq(&task->sighand->siglock);
return ERR_PTR(-EINVAL);
}
/* Pin the head of the chain so it can be walked without the lock. */
orig = task->seccomp.filter;
__get_seccomp_filter(orig);
spin_unlock_irq(&task->sighand->siglock);
/* First pass: count the filters on the chain. */
count = 0;
for (filter = orig; filter; filter = filter->prev)
count++;
if (filter_off >= count) {
filter = ERR_PTR(-ENOENT);
goto out;
}
/* Second pass: step (count - filter_off - 1) nodes from the head. */
count -= filter_off;
for (filter = orig; filter && count > 1; filter = filter->prev)
count--;
if (WARN_ON(count != 1 || !filter)) {
filter = ERR_PTR(-ENOENT);
goto out;
}
/* Reference handed to the caller. */
__get_seccomp_filter(filter);
out:
/* Drop the temporary reference taken on the head above. */
__put_seccomp_filter(orig);
return filter;
}
/*
 * Copy the original (classic BPF) program of @task's filter number
 * @filter_off to user space.
 *
 * The caller needs CAP_SYS_ADMIN and must not itself be under seccomp,
 * otherwise -EACCES is returned. When @data is NULL nothing is copied.
 *
 * Returns the program length in instructions on success, -EMEDIUMTYPE if
 * no original program was saved, -EFAULT if the copy fails, or the error
 * from get_nth_filter().
 */
long seccomp_get_filter(struct task_struct *task, unsigned long filter_off,
			void __user *data)
{
	struct seccomp_filter *filter;
	struct sock_fprog_kern *saved;
	long ret;

	if (!capable(CAP_SYS_ADMIN) ||
	    current->seccomp.mode != SECCOMP_MODE_DISABLED)
		return -EACCES;

	filter = get_nth_filter(task, filter_off);
	if (IS_ERR(filter))
		return PTR_ERR(filter);

	saved = filter->prog->orig_prog;
	if (!saved) {
		/*
		 * This must be a new non-cBPF filter, since every cBPF
		 * filter's orig_prog is saved when CONFIG_CHECKPOINT_RESTORE
		 * is enabled.
		 */
		ret = -EMEDIUMTYPE;
	} else {
		ret = saved->len;
		if (data && copy_to_user(data, saved->filter,
					 bpf_classic_proglen(saved)))
			ret = -EFAULT;
	}

	/* Release the reference taken by get_nth_filter(). */
	__put_seccomp_filter(filter);
	return ret;
}
/*
 * Report metadata about one of @task's filters. The filter offset is read
 * from the start of @data, and at most min(@size, sizeof(struct
 * seccomp_metadata)) bytes are written back to @data.
 *
 * The caller needs CAP_SYS_ADMIN and must not itself be under seccomp,
 * otherwise -EACCES is returned.
 *
 * Returns the number of bytes written on success, -EINVAL if @size cannot
 * hold the offset field, -EFAULT on a failed user copy, or the error from
 * get_nth_filter().
 */
long seccomp_get_metadata(struct task_struct *task,
			  unsigned long size, void __user *data)
{
	struct seccomp_metadata kmd = {};
	struct seccomp_filter *filter;
	long ret;

	if (!capable(CAP_SYS_ADMIN) ||
	    current->seccomp.mode != SECCOMP_MODE_DISABLED)
		return -EACCES;

	/* Never handle more than the kernel's structure size. */
	size = min_t(unsigned long, size, sizeof(kmd));
	if (size < sizeof(kmd.filter_off))
		return -EINVAL;

	if (copy_from_user(&kmd.filter_off, data, sizeof(kmd.filter_off)))
		return -EFAULT;

	filter = get_nth_filter(task, kmd.filter_off);
	if (IS_ERR(filter))
		return PTR_ERR(filter);

	if (filter->log)
		kmd.flags |= SECCOMP_FILTER_FLAG_LOG;

	if (copy_to_user(data, &kmd, size))
		ret = -EFAULT;
	else
		ret = size;

	/* Release the reference taken by get_nth_filter(). */
	__put_seccomp_filter(filter);
	return ret;
}
#endif
#ifdef CONFIG_SYSCTL
/* Human readable action names for friendly sysctl interaction */
#define SECCOMP_RET_KILL_PROCESS_NAME "kill_process"
#define SECCOMP_RET_KILL_THREAD_NAME "kill_thread"
#define SECCOMP_RET_TRAP_NAME "trap"
#define SECCOMP_RET_ERRNO_NAME "errno"
#define SECCOMP_RET_TRACE_NAME "trace"
#define SECCOMP_RET_LOG_NAME "log"
#define SECCOMP_RET_ALLOW_NAME "allow"
/*
 * All action names joined by single spaces. Its size is also used to
 * dimension the name buffers in the actions_logged read/write helpers.
 */
static const char seccomp_actions_avail[] =
SECCOMP_RET_KILL_PROCESS_NAME " "
SECCOMP_RET_KILL_THREAD_NAME " "
SECCOMP_RET_TRAP_NAME " "
SECCOMP_RET_ERRNO_NAME " "
SECCOMP_RET_TRACE_NAME " "
SECCOMP_RET_LOG_NAME " "
SECCOMP_RET_ALLOW_NAME;
/* Pairs one SECCOMP_LOG_* bit with its human readable action name. */
struct seccomp_log_name {
u32 log;
const char *name;
};
/*
 * Lookup table used to convert between log bits and names in both
 * directions. The empty last entry (NULL name) terminates iteration.
 */
static const struct seccomp_log_name seccomp_log_names[] = {
{ SECCOMP_LOG_KILL_PROCESS, SECCOMP_RET_KILL_PROCESS_NAME },
{ SECCOMP_LOG_KILL_THREAD, SECCOMP_RET_KILL_THREAD_NAME },
{ SECCOMP_LOG_TRAP, SECCOMP_RET_TRAP_NAME },
{ SECCOMP_LOG_ERRNO, SECCOMP_RET_ERRNO_NAME },
{ SECCOMP_LOG_TRACE, SECCOMP_RET_TRACE_NAME },
{ SECCOMP_LOG_LOG, SECCOMP_RET_LOG_NAME },
{ SECCOMP_LOG_ALLOW, SECCOMP_RET_ALLOW_NAME },
{ }
};
/*
 * Render an actions_logged bitmask as a list of action names.
 *
 * @names: destination buffer
 * @size: size of @names in bytes
 * @actions_logged: mask of SECCOMP_LOG_* bits to render
 * @sep: separator string written between consecutive names
 *
 * Names are emitted in seccomp_log_names table order. Nothing is written
 * when @size is 0 or no bit in @actions_logged matches a table entry.
 *
 * Returns true on success, false as soon as strscpy() reports an error
 * (negative return) for either a separator or a name.
 */
static bool seccomp_names_from_actions_logged(char *names, size_t size,
					      u32 actions_logged,
					      const char *sep)
{
	const struct seccomp_log_name *entry = seccomp_log_names;
	bool first = true;

	for (; entry->name && size; entry++) {
		ssize_t copied;

		if (!(actions_logged & entry->log))
			continue;

		/* Every name except the first is preceded by the separator. */
		if (!first) {
			copied = strscpy(names, sep, size);
			if (copied < 0)
				return false;

			names += copied;
			size -= copied;
		}
		first = false;

		copied = strscpy(names, entry->name, size);
		if (copied < 0)
			return false;

		names += copied;
		size -= copied;
	}

	return true;
}
/*
 * Translate a single action name into its SECCOMP_LOG_* bit.
 *
 * @action_logged: receives the bit on success; left untouched otherwise
 * @name: NUL-terminated action name, compared exactly with strcmp()
 *
 * Returns true if @name is found in seccomp_log_names, false otherwise.
 */
static bool seccomp_action_logged_from_name(u32 *action_logged,
					    const char *name)
{
	const struct seccomp_log_name *entry = seccomp_log_names;

	while (entry->name) {
		if (strcmp(entry->name, name) == 0) {
			*action_logged = entry->log;
			return true;
		}
		entry++;
	}

	return false;
}
/*
 * Parse a space-separated list of action names into a bitmask.
 *
 * @actions_logged: reset to 0, then OR-ed with the bit of each parsed name
 * @names: writable string; strsep() modifies it in place
 *
 * Parsing stops at the end of the string or at the first empty token
 * (for example one produced by two consecutive spaces); anything after
 * an empty token is not examined.
 *
 * Returns false on the first unknown name, in which case @actions_logged
 * holds the bits accumulated so far; returns true otherwise.
 */
static bool seccomp_actions_logged_from_names(u32 *actions_logged, char *names)
{
	*actions_logged = 0;

	for (;;) {
		u32 bit = 0;
		char *token = strsep(&names, " ");

		if (!token || !*token)
			break;

		if (!seccomp_action_logged_from_name(&bit, token))
			return false;

		*actions_logged |= bit;
	}

	return true;
}
/*
 * Read side of the "actions_logged" sysctl.
 *
 * Renders the current seccomp_actions_logged mask as space-separated
 * names into a zeroed local buffer, then lets proc_dostring() copy it
 * out through a private copy of @ro_table whose data/maxlen point at
 * that buffer. @ro_table itself is not modified.
 *
 * Returns -EINVAL if the mask cannot be rendered, otherwise whatever
 * proc_dostring() returns.
 */
static int read_actions_logged(struct ctl_table *ro_table, void __user *buffer,
			       size_t *lenp, loff_t *ppos)
{
	char names[sizeof(seccomp_actions_avail)] = { 0 };
	struct ctl_table table = *ro_table;
	bool rendered;

	rendered = seccomp_names_from_actions_logged(names, sizeof(names),
						     seccomp_actions_logged,
						     " ");
	if (!rendered)
		return -EINVAL;

	table.data = names;
	table.maxlen = sizeof(names);

	return proc_dostring(&table, 0, buffer, lenp, ppos);
}
/*
 * Write side of the "actions_logged" sysctl.
 *
 * @actions_logged: receives the mask parsed from the user's input
 *
 * Requires CAP_SYS_ADMIN (-EPERM otherwise). The user string is pulled
 * into a zeroed local buffer by proc_dostring() via a private copy of
 * @ro_table, then parsed as space-separated action names. Input naming
 * an unknown action, or including SECCOMP_LOG_ALLOW, is rejected with
 * -EINVAL. Only on success is the global seccomp_actions_logged updated.
 *
 * Returns 0 on success or a negative errno.
 */
static int write_actions_logged(struct ctl_table *ro_table, void __user *buffer,
				size_t *lenp, loff_t *ppos, u32 *actions_logged)
{
	char names[sizeof(seccomp_actions_avail)] = { 0 };
	struct ctl_table table;
	int err;

	if (!capable(CAP_SYS_ADMIN))
		return -EPERM;

	table = *ro_table;
	table.data = names;
	table.maxlen = sizeof(names);

	err = proc_dostring(&table, 1, buffer, lenp, ppos);
	if (err)
		return err;

	/* Unknown names and the "allow" action are both invalid input. */
	if (!seccomp_actions_logged_from_names(actions_logged, names) ||
	    (*actions_logged & SECCOMP_LOG_ALLOW))
		return -EINVAL;

	seccomp_actions_logged = *actions_logged;
	return 0;
}
/*
 * Emit an audit record describing a write to the "actions_logged" sysctl.
 *
 * @actions_logged: mask parsed from the write (only used when @ret is 0)
 * @old_actions_logged: mask that was in effect before the write
 * @ret: result of the write; non-zero means the write failed
 *
 * Both masks are rendered as comma-separated action names. An empty mask
 * is reported as "(none)". A failed write, or a mask that cannot be
 * rendered into its buffer, is reported as "?". Nothing is done when
 * auditing is disabled.
 */
static void audit_actions_logged(u32 actions_logged, u32 old_actions_logged,
				 int ret)
{
	char names[sizeof(seccomp_actions_avail)];
	char old_names[sizeof(seccomp_actions_avail)];
	const char *new = names;
	const char *old = old_names;

	if (!audit_enabled)
		return;

	memset(names, 0, sizeof(names));
	memset(old_names, 0, sizeof(old_names));

	if (ret)
		new = "?";
	else if (!actions_logged)
		new = "(none)";
	else if (!seccomp_names_from_actions_logged(names, sizeof(names),
						    actions_logged, ","))
		new = "?";

	if (!old_actions_logged)
		old = "(none)";
	else if (!seccomp_names_from_actions_logged(old_names,
						    sizeof(old_names),
						    old_actions_logged, ","))
		old = "?";

	/*
	 * This function returns void, so call the audit helper as a plain
	 * statement; "return <expr>;" is not valid ISO C here.
	 */
	audit_seccomp_actions_logged(new, old, !ret);
}
/*
 * proc handler for the "actions_logged" sysctl.
 *
 * Reads are forwarded to read_actions_logged(). Writes go through
 * write_actions_logged(), and the attempt is then passed to
 * audit_actions_logged() together with the previous mask and the write
 * result, whether or not the write succeeded.
 *
 * Returns the result of the read or write helper.
 */
static int seccomp_actions_logged_handler(struct ctl_table *ro_table, int write,
					  void __user *buffer, size_t *lenp,
					  loff_t *ppos)
{
	u32 actions_logged = 0;
	u32 old_actions_logged;
	int ret;

	if (!write)
		return read_actions_logged(ro_table, buffer, lenp, ppos);

	/* Snapshot the current mask before the write can replace it. */
	old_actions_logged = seccomp_actions_logged;

	ret = write_actions_logged(ro_table, buffer, lenp, ppos,
				   &actions_logged);
	audit_actions_logged(actions_logged, old_actions_logged, ret);

	return ret;
}
/*
 * Path components under which the seccomp sysctl table is registered:
 * "kernel" followed by "seccomp". The empty entry terminates the list.
 */
static struct ctl_path seccomp_sysctl_path[] = {
{ .procname = "kernel", },
{ .procname = "seccomp", },
{ }
};
/*
 * Sysctl entries registered under seccomp_sysctl_path:
 *  - "actions_avail": mode 0444, backed directly by the constant
 *    seccomp_actions_avail string and served by proc_dostring().
 *  - "actions_logged": mode 0644, with no .data/.maxlen of its own;
 *    all accesses go through seccomp_actions_logged_handler().
 * The empty entry terminates the table.
 */
static struct ctl_table seccomp_sysctl_table[] = {
{
.procname = "actions_avail",
/* cast drops const from the string; this entry's mode is 0444 */
.data = (void *) &seccomp_actions_avail,
.maxlen = sizeof(seccomp_actions_avail),
.mode = 0444,
.proc_handler = proc_dostring,
},
{
.procname = "actions_logged",
.mode = 0644,
.proc_handler = seccomp_actions_logged_handler,
},
{ }
};
/*
 * Register the seccomp sysctl table at init time.
 *
 * A registration failure is only warned about; the initcall always
 * returns 0. On success the returned header is not stored anywhere, so
 * it is passed to kmemleak_not_leak().
 */
static int __init seccomp_sysctl_init(void)
{
	struct ctl_table_header *hdr =
		register_sysctl_paths(seccomp_sysctl_path,
				      seccomp_sysctl_table);

	if (hdr)
		kmemleak_not_leak(hdr);
	else
		pr_warn("seccomp: sysctl registration failed\n");

	return 0;
}
device_initcall(seccomp_sysctl_init)
#endif /* CONFIG_SYSCTL */