linux/fs/nsfs.c
Christian Brauner ca567df74a nsfs: add pid translation ioctls
Add ioctl()s to translate pids between pid namespaces.

LXCFS is a tiny fuse filesystem used to virtualize various aspects of
procfs. LXCFS is run on the host. The files and directories it creates
can be bind-mounted by e.g. a container at startup and mounted over the
various procfs files the container wishes to have virtualized. When e.g.
a read request for uptime is received, LXCFS will receive the pid of the
reader. In order to virtualize the corresponding read, LXCFS needs to
know the pid of the init process of the reader's pid namespace. In order
to do this, LXCFS first needs to fork() two helper processes. The first
helper process setns() to the readers pid namespace. The second helper
process is needed to create a process that is a proper member of the pid
namespace. The second helper process then creates a ucred message with
ucred.pid set to 1 and sends it back to LXCFS. The kernel will translate
the ucred.pid field to the corresponding pid number in LXCFS's pid
namespace. This way LXCFS can learn the init pid number of the reader's
pid namespace and can go on to virtualize. Since these two forks() are
costly LXCFS maintains an init pid cache that caches a given pid for a
fixed amount of time. The cache is pruned during new read requests.
However, even with the cache the hit of the two forks() is singificant
when a very large number of containers are running. With this simple
patch we add an ns ioctl that let's a caller retrieve the init pid nr of
a pid namespace through its pid namespace fd. This significantly
improves performance with a very simple change.

Support translation of pids and tgids. Other concepts can be added but
there are no obvious users for this right now.

To protect against races pidfds can be used to check whether the process
is still valid. If needed, this can also be extended to work on pidfds
directly.

Link: https://lore.kernel.org/r/20240619-work-ns_ioctl-v1-1-7c0097e6bb6b@kernel.org
Reviewed-by: Alexander Mikhalitsyn <aleksandr.mikhalitsyn@canonical.com>
Signed-off-by: Christian Brauner <brauner@kernel.org>
2024-06-25 23:00:41 +02:00

298 lines
6.6 KiB
C

// SPDX-License-Identifier: GPL-2.0
#include <linux/mount.h>
#include <linux/pseudo_fs.h>
#include <linux/file.h>
#include <linux/fs.h>
#include <linux/proc_fs.h>
#include <linux/proc_ns.h>
#include <linux/magic.h>
#include <linux/ktime.h>
#include <linux/seq_file.h>
#include <linux/pid_namespace.h>
#include <linux/user_namespace.h>
#include <linux/nsfs.h>
#include <linux/uaccess.h>
#include "internal.h"
static struct vfsmount *nsfs_mnt;
static long ns_ioctl(struct file *filp, unsigned int ioctl,
unsigned long arg);
static const struct file_operations ns_file_operations = {
.llseek = no_llseek,
.unlocked_ioctl = ns_ioctl,
.compat_ioctl = compat_ptr_ioctl,
};
static char *ns_dname(struct dentry *dentry, char *buffer, int buflen)
{
struct inode *inode = d_inode(dentry);
struct ns_common *ns = inode->i_private;
const struct proc_ns_operations *ns_ops = ns->ops;
return dynamic_dname(buffer, buflen, "%s:[%lu]",
ns_ops->name, inode->i_ino);
}
const struct dentry_operations ns_dentry_operations = {
.d_delete = always_delete_dentry,
.d_dname = ns_dname,
.d_prune = stashed_dentry_prune,
};
static void nsfs_evict(struct inode *inode)
{
struct ns_common *ns = inode->i_private;
clear_inode(inode);
ns->ops->put(ns);
}
int ns_get_path_cb(struct path *path, ns_get_path_helper_t *ns_get_cb,
void *private_data)
{
struct ns_common *ns;
ns = ns_get_cb(private_data);
if (!ns)
return -ENOENT;
return path_from_stashed(&ns->stashed, nsfs_mnt, ns, path);
}
struct ns_get_path_task_args {
const struct proc_ns_operations *ns_ops;
struct task_struct *task;
};
static struct ns_common *ns_get_path_task(void *private_data)
{
struct ns_get_path_task_args *args = private_data;
return args->ns_ops->get(args->task);
}
int ns_get_path(struct path *path, struct task_struct *task,
const struct proc_ns_operations *ns_ops)
{
struct ns_get_path_task_args args = {
.ns_ops = ns_ops,
.task = task,
};
return ns_get_path_cb(path, ns_get_path_task, &args);
}
int open_related_ns(struct ns_common *ns,
struct ns_common *(*get_ns)(struct ns_common *ns))
{
struct path path = {};
struct ns_common *relative;
struct file *f;
int err;
int fd;
fd = get_unused_fd_flags(O_CLOEXEC);
if (fd < 0)
return fd;
relative = get_ns(ns);
if (IS_ERR(relative)) {
put_unused_fd(fd);
return PTR_ERR(relative);
}
err = path_from_stashed(&relative->stashed, nsfs_mnt, relative, &path);
if (err < 0) {
put_unused_fd(fd);
return err;
}
f = dentry_open(&path, O_RDONLY, current_cred());
path_put(&path);
if (IS_ERR(f)) {
put_unused_fd(fd);
fd = PTR_ERR(f);
} else
fd_install(fd, f);
return fd;
}
EXPORT_SYMBOL_GPL(open_related_ns);
static long ns_ioctl(struct file *filp, unsigned int ioctl,
unsigned long arg)
{
struct user_namespace *user_ns;
struct pid_namespace *pid_ns;
struct task_struct *tsk;
struct ns_common *ns = get_proc_ns(file_inode(filp));
uid_t __user *argp;
uid_t uid;
int ret;
switch (ioctl) {
case NS_GET_USERNS:
return open_related_ns(ns, ns_get_owner);
case NS_GET_PARENT:
if (!ns->ops->get_parent)
return -EINVAL;
return open_related_ns(ns, ns->ops->get_parent);
case NS_GET_NSTYPE:
return ns->ops->type;
case NS_GET_OWNER_UID:
if (ns->ops->type != CLONE_NEWUSER)
return -EINVAL;
user_ns = container_of(ns, struct user_namespace, ns);
argp = (uid_t __user *) arg;
uid = from_kuid_munged(current_user_ns(), user_ns->owner);
return put_user(uid, argp);
case NS_GET_PID_FROM_PIDNS:
fallthrough;
case NS_GET_TGID_FROM_PIDNS:
fallthrough;
case NS_GET_PID_IN_PIDNS:
fallthrough;
case NS_GET_TGID_IN_PIDNS:
if (ns->ops->type != CLONE_NEWPID)
return -EINVAL;
ret = -ESRCH;
pid_ns = container_of(ns, struct pid_namespace, ns);
rcu_read_lock();
if (ioctl == NS_GET_PID_IN_PIDNS ||
ioctl == NS_GET_TGID_IN_PIDNS)
tsk = find_task_by_vpid(arg);
else
tsk = find_task_by_pid_ns(arg, pid_ns);
if (!tsk)
break;
switch (ioctl) {
case NS_GET_PID_FROM_PIDNS:
ret = task_pid_vnr(tsk);
break;
case NS_GET_TGID_FROM_PIDNS:
ret = task_tgid_vnr(tsk);
break;
case NS_GET_PID_IN_PIDNS:
ret = task_pid_nr_ns(tsk, pid_ns);
break;
case NS_GET_TGID_IN_PIDNS:
ret = task_tgid_nr_ns(tsk, pid_ns);
break;
default:
ret = 0;
break;
}
rcu_read_unlock();
if (!ret)
ret = -ESRCH;
break;
default:
ret = -ENOTTY;
}
return ret;
}
int ns_get_name(char *buf, size_t size, struct task_struct *task,
const struct proc_ns_operations *ns_ops)
{
struct ns_common *ns;
int res = -ENOENT;
const char *name;
ns = ns_ops->get(task);
if (ns) {
name = ns_ops->real_ns_name ? : ns_ops->name;
res = snprintf(buf, size, "%s:[%u]", name, ns->inum);
ns_ops->put(ns);
}
return res;
}
bool proc_ns_file(const struct file *file)
{
return file->f_op == &ns_file_operations;
}
/**
* ns_match() - Returns true if current namespace matches dev/ino provided.
* @ns: current namespace
* @dev: dev_t from nsfs that will be matched against current nsfs
* @ino: ino_t from nsfs that will be matched against current nsfs
*
* Return: true if dev and ino matches the current nsfs.
*/
bool ns_match(const struct ns_common *ns, dev_t dev, ino_t ino)
{
return (ns->inum == ino) && (nsfs_mnt->mnt_sb->s_dev == dev);
}
static int nsfs_show_path(struct seq_file *seq, struct dentry *dentry)
{
struct inode *inode = d_inode(dentry);
const struct ns_common *ns = inode->i_private;
const struct proc_ns_operations *ns_ops = ns->ops;
seq_printf(seq, "%s:[%lu]", ns_ops->name, inode->i_ino);
return 0;
}
static const struct super_operations nsfs_ops = {
.statfs = simple_statfs,
.evict_inode = nsfs_evict,
.show_path = nsfs_show_path,
};
static int nsfs_init_inode(struct inode *inode, void *data)
{
struct ns_common *ns = data;
inode->i_private = data;
inode->i_mode |= S_IRUGO;
inode->i_fop = &ns_file_operations;
inode->i_ino = ns->inum;
return 0;
}
static void nsfs_put_data(void *data)
{
struct ns_common *ns = data;
ns->ops->put(ns);
}
static const struct stashed_operations nsfs_stashed_ops = {
.init_inode = nsfs_init_inode,
.put_data = nsfs_put_data,
};
static int nsfs_init_fs_context(struct fs_context *fc)
{
struct pseudo_fs_context *ctx = init_pseudo(fc, NSFS_MAGIC);
if (!ctx)
return -ENOMEM;
ctx->ops = &nsfs_ops;
ctx->dops = &ns_dentry_operations;
fc->s_fs_info = (void *)&nsfs_stashed_ops;
return 0;
}
static struct file_system_type nsfs = {
.name = "nsfs",
.init_fs_context = nsfs_init_fs_context,
.kill_sb = kill_anon_super,
};
void __init nsfs_init(void)
{
nsfs_mnt = kern_mount(&nsfs);
if (IS_ERR(nsfs_mnt))
panic("can't set nsfs up\n");
nsfs_mnt->mnt_sb->s_flags &= ~SB_NOUSER;
}