mirror of
https://github.com/torvalds/linux
synced 2024-11-05 18:23:50 +00:00
0cbee99269
Pull user namespace updates from Eric Biederman: "Long ago and far away when user namespaces were young it was realized that allowing fresh mounts of proc and sysfs with only user namespace permissions could violate the basic rule that only root gets to decide if proc or sysfs should be mounted at all. Some hacks were put in place to reduce the worst of the damage that could be done, and the common sense rule was adopted that fresh mounts of proc and sysfs should allow no more than bind mounts of proc and sysfs. Unfortunately that rule has not been fully enforced. There are two kinds of gaps in that enforcement. Only filesystems mounted on empty directories of proc and sysfs should be ignored but the test for empty directories was insufficient. So in my tree directories on proc, sysctl and sysfs that will always be empty are created specially. Every other technique is imperfect as an ordinary directory can have entries added even after a readdir returns and shows that the directory is empty. Special creation of directories for mount points makes the code in the kernel a smidge clearer about its purpose. I asked container developers from the various container projects to help test this and no holes were found in the set of mount points on proc and sysfs that are created specially. This set of changes also starts enforcing that the mount flags of fresh mounts of proc and sysfs are consistent with the existing mount of proc and sysfs. I expected this to be the boring part of the work but unfortunately unprivileged userspace winds up mounting fresh copies of proc and sysfs with noexec and nosuid clear when root set those flags on the previous mount of proc and sysfs. So for now only the atime, read-only and nodev attributes which userspace happens to keep consistent are enforced. Dealing with the noexec and nosuid attributes remains for another time. This set of changes also addresses an issue with how open file descriptors from /proc/<pid>/ns/* are displayed. 
Recently readlink of /proc/<pid>/fd has been triggering a WARN_ON that has not been meaningful since it was added (as all of the code in the kernel was converted) and is now actively wrong. There is also a short list of issues that have not been fixed yet that I will mention briefly. It is possible to rename a directory from below to above a bind mount. At which point any directory pointers below the renamed directory can be walked up to the root directory of the filesystem. With user namespaces enabled a bind mount of the bind mount can be created allowing the user to pick a directory whose children they can rename to outside of the bind mount. This is challenging to fix and doubly so because all obvious solutions must touch code that is in the performance part of pathname resolution. As mentioned above there is also a question of how to ensure that developers by accident or with purpose do not introduce executable files on sysfs and proc and in doing so introduce security regressions in the current userspace that will not be immediately obvious and as such are likely to require breaking userspace in painful ways once they are recognized" * 'for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/ebiederm/user-namespace: vfs: Remove incorrect debugging WARN in prepend_path mnt: Update fs_fully_visible to test for permanently empty directories sysfs: Create mountpoints with sysfs_create_mount_point sysfs: Add support for permanently empty directories to serve as mount points. kernfs: Add support for always empty directories. proc: Allow creating permanently empty directories that serve as mount points sysctl: Allow creating permanently empty directories that serve as mountpoints. fs: Add helper functions for permanently empty directories. vfs: Ignore unlocked mounts in fs_fully_visible mnt: Modify fs_fully_visible to deal with locked ro nodev and atime mnt: Refactor the logic for mounting sysfs and proc in a user namespace
488 lines
12 KiB
C
488 lines
12 KiB
C
/*
|
|
* linux/fs/proc/inode.c
|
|
*
|
|
* Copyright (C) 1991, 1992 Linus Torvalds
|
|
*/
|
|
|
|
#include <linux/time.h>
|
|
#include <linux/proc_fs.h>
|
|
#include <linux/kernel.h>
|
|
#include <linux/pid_namespace.h>
|
|
#include <linux/mm.h>
|
|
#include <linux/string.h>
|
|
#include <linux/stat.h>
|
|
#include <linux/completion.h>
|
|
#include <linux/poll.h>
|
|
#include <linux/printk.h>
|
|
#include <linux/file.h>
|
|
#include <linux/limits.h>
|
|
#include <linux/init.h>
|
|
#include <linux/module.h>
|
|
#include <linux/sysctl.h>
|
|
#include <linux/seq_file.h>
|
|
#include <linux/slab.h>
|
|
#include <linux/mount.h>
|
|
#include <linux/magic.h>
|
|
|
|
#include <asm/uaccess.h>
|
|
|
|
#include "internal.h"
|
|
|
|
static void proc_evict_inode(struct inode *inode)
|
|
{
|
|
struct proc_dir_entry *de;
|
|
struct ctl_table_header *head;
|
|
|
|
truncate_inode_pages_final(&inode->i_data);
|
|
clear_inode(inode);
|
|
|
|
/* Stop tracking associated processes */
|
|
put_pid(PROC_I(inode)->pid);
|
|
|
|
/* Let go of any associated proc directory entry */
|
|
de = PDE(inode);
|
|
if (de)
|
|
pde_put(de);
|
|
head = PROC_I(inode)->sysctl;
|
|
if (head) {
|
|
RCU_INIT_POINTER(PROC_I(inode)->sysctl, NULL);
|
|
sysctl_head_put(head);
|
|
}
|
|
}
|
|
|
|
/*
 * Slab cache backing every struct proc_inode: created by
 * proc_init_inodecache(), allocated from in proc_alloc_inode() and freed to
 * in proc_i_callback().
 */
static struct kmem_cache * proc_inode_cachep;
|
|
|
|
static struct inode *proc_alloc_inode(struct super_block *sb)
|
|
{
|
|
struct proc_inode *ei;
|
|
struct inode *inode;
|
|
|
|
ei = (struct proc_inode *)kmem_cache_alloc(proc_inode_cachep, GFP_KERNEL);
|
|
if (!ei)
|
|
return NULL;
|
|
ei->pid = NULL;
|
|
ei->fd = 0;
|
|
ei->op.proc_get_link = NULL;
|
|
ei->pde = NULL;
|
|
ei->sysctl = NULL;
|
|
ei->sysctl_entry = NULL;
|
|
ei->ns_ops = NULL;
|
|
inode = &ei->vfs_inode;
|
|
inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME;
|
|
return inode;
|
|
}
|
|
|
|
static void proc_i_callback(struct rcu_head *head)
|
|
{
|
|
struct inode *inode = container_of(head, struct inode, i_rcu);
|
|
kmem_cache_free(proc_inode_cachep, PROC_I(inode));
|
|
}
|
|
|
|
static void proc_destroy_inode(struct inode *inode)
|
|
{
|
|
call_rcu(&inode->i_rcu, proc_i_callback);
|
|
}
|
|
|
|
static void init_once(void *foo)
|
|
{
|
|
struct proc_inode *ei = (struct proc_inode *) foo;
|
|
|
|
inode_init_once(&ei->vfs_inode);
|
|
}
|
|
|
|
/*
 * Create the "proc_inode_cache" slab cache used for all proc inodes.
 * SLAB_PANIC is among the flags passed, so no failure check follows.
 */
void __init proc_init_inodecache(void)
{
	const unsigned long cache_flags =
		SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD | SLAB_PANIC;

	proc_inode_cachep = kmem_cache_create("proc_inode_cache",
					      sizeof(struct proc_inode), 0,
					      cache_flags, init_once);
}
|
|
|
|
static int proc_show_options(struct seq_file *seq, struct dentry *root)
|
|
{
|
|
struct super_block *sb = root->d_sb;
|
|
struct pid_namespace *pid = sb->s_fs_info;
|
|
|
|
if (!gid_eq(pid->pid_gid, GLOBAL_ROOT_GID))
|
|
seq_printf(seq, ",gid=%u", from_kgid_munged(&init_user_ns, pid->pid_gid));
|
|
if (pid->hide_pid != 0)
|
|
seq_printf(seq, ",hidepid=%u", pid->hide_pid);
|
|
|
|
return 0;
|
|
}
|
|
|
|
static const struct super_operations proc_sops = {
|
|
.alloc_inode = proc_alloc_inode,
|
|
.destroy_inode = proc_destroy_inode,
|
|
.drop_inode = generic_delete_inode,
|
|
.evict_inode = proc_evict_inode,
|
|
.statfs = simple_statfs,
|
|
.remount_fs = proc_remount,
|
|
.show_options = proc_show_options,
|
|
};
|
|
|
|
/*
 * Amount proc_entry_rundown() adds to pde->in_use.  unuse_pde() compares
 * the decremented counter against this value to decide when to signal
 * pde_unload_completion.
 */
enum {BIAS = -1U<<31};
|
|
|
|
static inline int use_pde(struct proc_dir_entry *pde)
|
|
{
|
|
return atomic_inc_unless_negative(&pde->in_use);
|
|
}
|
|
|
|
static void unuse_pde(struct proc_dir_entry *pde)
|
|
{
|
|
if (atomic_dec_return(&pde->in_use) == BIAS)
|
|
complete(pde->pde_unload_completion);
|
|
}
|
|
|
|
/* pde is locked */
|
|
static void close_pdeo(struct proc_dir_entry *pde, struct pde_opener *pdeo)
|
|
{
|
|
if (pdeo->closing) {
|
|
/* somebody else is doing that, just wait */
|
|
DECLARE_COMPLETION_ONSTACK(c);
|
|
pdeo->c = &c;
|
|
spin_unlock(&pde->pde_unload_lock);
|
|
wait_for_completion(&c);
|
|
spin_lock(&pde->pde_unload_lock);
|
|
} else {
|
|
struct file *file;
|
|
pdeo->closing = 1;
|
|
spin_unlock(&pde->pde_unload_lock);
|
|
file = pdeo->file;
|
|
pde->proc_fops->release(file_inode(file), file);
|
|
spin_lock(&pde->pde_unload_lock);
|
|
list_del_init(&pdeo->lh);
|
|
if (pdeo->c)
|
|
complete(pdeo->c);
|
|
kfree(pdeo);
|
|
}
|
|
}
|
|
|
|
void proc_entry_rundown(struct proc_dir_entry *de)
|
|
{
|
|
DECLARE_COMPLETION_ONSTACK(c);
|
|
/* Wait until all existing callers into module are done. */
|
|
de->pde_unload_completion = &c;
|
|
if (atomic_add_return(BIAS, &de->in_use) != BIAS)
|
|
wait_for_completion(&c);
|
|
|
|
spin_lock(&de->pde_unload_lock);
|
|
while (!list_empty(&de->pde_openers)) {
|
|
struct pde_opener *pdeo;
|
|
pdeo = list_first_entry(&de->pde_openers, struct pde_opener, lh);
|
|
close_pdeo(de, pdeo);
|
|
}
|
|
spin_unlock(&de->pde_unload_lock);
|
|
}
|
|
|
|
/*
 * ->llseek wrapper for regular proc files.
 *
 * Forwards to the entry's own ->llseek, or to default_llseek() when the
 * entry does not provide one, while holding a use on the entry.
 * Returns -EINVAL when use_pde() refuses the entry.
 */
static loff_t proc_reg_llseek(struct file *file, loff_t offset, int whence)
{
	struct proc_dir_entry *pde = PDE(file_inode(file));
	loff_t (*seek_fn)(struct file *, loff_t, int);
	loff_t pos;

	if (!use_pde(pde))
		return -EINVAL;

	seek_fn = pde->proc_fops->llseek;
	if (!seek_fn)
		seek_fn = default_llseek;
	pos = seek_fn(file, offset, whence);

	unuse_pde(pde);
	return pos;
}
|
|
|
|
/*
 * ->read wrapper for regular proc files.
 *
 * Forwards to the entry's ->read while holding a use on the entry.
 * Returns -EIO when use_pde() refuses the entry or when the entry has no
 * ->read method.
 */
static ssize_t proc_reg_read(struct file *file, char __user *buf, size_t count, loff_t *ppos)
{
	struct proc_dir_entry *pde = PDE(file_inode(file));
	ssize_t (*read_fn)(struct file *, char __user *, size_t, loff_t *);
	ssize_t ret = -EIO;

	if (!use_pde(pde))
		return ret;

	read_fn = pde->proc_fops->read;
	if (read_fn)
		ret = read_fn(file, buf, count, ppos);

	unuse_pde(pde);
	return ret;
}
|
|
|
|
/*
 * ->write wrapper for regular proc files.
 *
 * Forwards to the entry's ->write while holding a use on the entry.
 * Returns -EIO when use_pde() refuses the entry or when the entry has no
 * ->write method.
 */
static ssize_t proc_reg_write(struct file *file, const char __user *buf, size_t count, loff_t *ppos)
{
	struct proc_dir_entry *pde = PDE(file_inode(file));
	ssize_t (*write_fn)(struct file *, const char __user *, size_t, loff_t *);
	ssize_t ret = -EIO;

	if (!use_pde(pde))
		return ret;

	write_fn = pde->proc_fops->write;
	if (write_fn)
		ret = write_fn(file, buf, count, ppos);

	unuse_pde(pde);
	return ret;
}
|
|
|
|
static unsigned int proc_reg_poll(struct file *file, struct poll_table_struct *pts)
|
|
{
|
|
struct proc_dir_entry *pde = PDE(file_inode(file));
|
|
unsigned int rv = DEFAULT_POLLMASK;
|
|
unsigned int (*poll)(struct file *, struct poll_table_struct *);
|
|
if (use_pde(pde)) {
|
|
poll = pde->proc_fops->poll;
|
|
if (poll)
|
|
rv = poll(file, pts);
|
|
unuse_pde(pde);
|
|
}
|
|
return rv;
|
|
}
|
|
|
|
static long proc_reg_unlocked_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
|
|
{
|
|
struct proc_dir_entry *pde = PDE(file_inode(file));
|
|
long rv = -ENOTTY;
|
|
long (*ioctl)(struct file *, unsigned int, unsigned long);
|
|
if (use_pde(pde)) {
|
|
ioctl = pde->proc_fops->unlocked_ioctl;
|
|
if (ioctl)
|
|
rv = ioctl(file, cmd, arg);
|
|
unuse_pde(pde);
|
|
}
|
|
return rv;
|
|
}
|
|
|
|
#ifdef CONFIG_COMPAT
/*
 * ->compat_ioctl wrapper for regular proc files (CONFIG_COMPAT only).
 *
 * Forwards to the entry's ->compat_ioctl while holding a use on the entry.
 * Returns -ENOTTY when use_pde() refuses the entry or when the entry has
 * no ->compat_ioctl method.
 */
static long proc_reg_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
{
	struct proc_dir_entry *pde = PDE(file_inode(file));
	long (*ioctl_fn)(struct file *, unsigned int, unsigned long);
	long ret = -ENOTTY;

	if (!use_pde(pde))
		return ret;

	ioctl_fn = pde->proc_fops->compat_ioctl;
	if (ioctl_fn)
		ret = ioctl_fn(file, cmd, arg);

	unuse_pde(pde);
	return ret;
}
#endif
|
|
|
|
static int proc_reg_mmap(struct file *file, struct vm_area_struct *vma)
|
|
{
|
|
struct proc_dir_entry *pde = PDE(file_inode(file));
|
|
int rv = -EIO;
|
|
int (*mmap)(struct file *, struct vm_area_struct *);
|
|
if (use_pde(pde)) {
|
|
mmap = pde->proc_fops->mmap;
|
|
if (mmap)
|
|
rv = mmap(file, vma);
|
|
unuse_pde(pde);
|
|
}
|
|
return rv;
|
|
}
|
|
|
|
static unsigned long
|
|
proc_reg_get_unmapped_area(struct file *file, unsigned long orig_addr,
|
|
unsigned long len, unsigned long pgoff,
|
|
unsigned long flags)
|
|
{
|
|
struct proc_dir_entry *pde = PDE(file_inode(file));
|
|
unsigned long rv = -EIO;
|
|
|
|
if (use_pde(pde)) {
|
|
typeof(proc_reg_get_unmapped_area) *get_area;
|
|
|
|
get_area = pde->proc_fops->get_unmapped_area;
|
|
#ifdef CONFIG_MMU
|
|
if (!get_area)
|
|
get_area = current->mm->get_unmapped_area;
|
|
#endif
|
|
|
|
if (get_area)
|
|
rv = get_area(file, orig_addr, len, pgoff, flags);
|
|
else
|
|
rv = orig_addr;
|
|
unuse_pde(pde);
|
|
}
|
|
return rv;
|
|
}
|
|
|
|
static int proc_reg_open(struct inode *inode, struct file *file)
|
|
{
|
|
struct proc_dir_entry *pde = PDE(inode);
|
|
int rv = 0;
|
|
int (*open)(struct inode *, struct file *);
|
|
int (*release)(struct inode *, struct file *);
|
|
struct pde_opener *pdeo;
|
|
|
|
/*
|
|
* What for, you ask? Well, we can have open, rmmod, remove_proc_entry
|
|
* sequence. ->release won't be called because ->proc_fops will be
|
|
* cleared. Depending on complexity of ->release, consequences vary.
|
|
*
|
|
* We can't wait for mercy when close will be done for real, it's
|
|
* deadlockable: rmmod foo </proc/foo . So, we're going to do ->release
|
|
* by hand in remove_proc_entry(). For this, save opener's credentials
|
|
* for later.
|
|
*/
|
|
pdeo = kzalloc(sizeof(struct pde_opener), GFP_KERNEL);
|
|
if (!pdeo)
|
|
return -ENOMEM;
|
|
|
|
if (!use_pde(pde)) {
|
|
kfree(pdeo);
|
|
return -ENOENT;
|
|
}
|
|
open = pde->proc_fops->open;
|
|
release = pde->proc_fops->release;
|
|
|
|
if (open)
|
|
rv = open(inode, file);
|
|
|
|
if (rv == 0 && release) {
|
|
/* To know what to release. */
|
|
pdeo->file = file;
|
|
/* Strictly for "too late" ->release in proc_reg_release(). */
|
|
spin_lock(&pde->pde_unload_lock);
|
|
list_add(&pdeo->lh, &pde->pde_openers);
|
|
spin_unlock(&pde->pde_unload_lock);
|
|
} else
|
|
kfree(pdeo);
|
|
|
|
unuse_pde(pde);
|
|
return rv;
|
|
}
|
|
|
|
static int proc_reg_release(struct inode *inode, struct file *file)
|
|
{
|
|
struct proc_dir_entry *pde = PDE(inode);
|
|
struct pde_opener *pdeo;
|
|
spin_lock(&pde->pde_unload_lock);
|
|
list_for_each_entry(pdeo, &pde->pde_openers, lh) {
|
|
if (pdeo->file == file) {
|
|
close_pdeo(pde, pdeo);
|
|
break;
|
|
}
|
|
}
|
|
spin_unlock(&pde->pde_unload_lock);
|
|
return 0;
|
|
}
|
|
|
|
static const struct file_operations proc_reg_file_ops = {
|
|
.llseek = proc_reg_llseek,
|
|
.read = proc_reg_read,
|
|
.write = proc_reg_write,
|
|
.poll = proc_reg_poll,
|
|
.unlocked_ioctl = proc_reg_unlocked_ioctl,
|
|
#ifdef CONFIG_COMPAT
|
|
.compat_ioctl = proc_reg_compat_ioctl,
|
|
#endif
|
|
.mmap = proc_reg_mmap,
|
|
.get_unmapped_area = proc_reg_get_unmapped_area,
|
|
.open = proc_reg_open,
|
|
.release = proc_reg_release,
|
|
};
|
|
|
|
#ifdef CONFIG_COMPAT
/*
 * Same table as proc_reg_file_ops but without ->compat_ioctl.
 * proc_get_inode() selects it for entries whose proc_fops have no
 * compat_ioctl method.
 */
static const struct file_operations proc_reg_file_ops_no_compat = {
	.open			= proc_reg_open,
	.release		= proc_reg_release,
	.llseek			= proc_reg_llseek,
	.read			= proc_reg_read,
	.write			= proc_reg_write,
	.poll			= proc_reg_poll,
	.unlocked_ioctl		= proc_reg_unlocked_ioctl,
	.mmap			= proc_reg_mmap,
	.get_unmapped_area	= proc_reg_get_unmapped_area,
};
#endif
|
|
|
|
static const char *proc_follow_link(struct dentry *dentry, void **cookie)
|
|
{
|
|
struct proc_dir_entry *pde = PDE(d_inode(dentry));
|
|
if (unlikely(!use_pde(pde)))
|
|
return ERR_PTR(-EINVAL);
|
|
*cookie = pde;
|
|
return pde->data;
|
|
}
|
|
|
|
/*
 * ->put_link for proc symlinks: @p is the cookie stored by
 * proc_follow_link(), i.e. the proc directory entry whose use is dropped
 * here.
 */
static void proc_put_link(struct inode *unused, void *p)
{
	struct proc_dir_entry *pde = p;

	unuse_pde(pde);
}
|
|
|
|
const struct inode_operations proc_link_inode_operations = {
|
|
.readlink = generic_readlink,
|
|
.follow_link = proc_follow_link,
|
|
.put_link = proc_put_link,
|
|
};
|
|
|
|
struct inode *proc_get_inode(struct super_block *sb, struct proc_dir_entry *de)
|
|
{
|
|
struct inode *inode = new_inode_pseudo(sb);
|
|
|
|
if (inode) {
|
|
inode->i_ino = de->low_ino;
|
|
inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME;
|
|
PROC_I(inode)->pde = de;
|
|
|
|
if (is_empty_pde(de)) {
|
|
make_empty_dir_inode(inode);
|
|
return inode;
|
|
}
|
|
if (de->mode) {
|
|
inode->i_mode = de->mode;
|
|
inode->i_uid = de->uid;
|
|
inode->i_gid = de->gid;
|
|
}
|
|
if (de->size)
|
|
inode->i_size = de->size;
|
|
if (de->nlink)
|
|
set_nlink(inode, de->nlink);
|
|
WARN_ON(!de->proc_iops);
|
|
inode->i_op = de->proc_iops;
|
|
if (de->proc_fops) {
|
|
if (S_ISREG(inode->i_mode)) {
|
|
#ifdef CONFIG_COMPAT
|
|
if (!de->proc_fops->compat_ioctl)
|
|
inode->i_fop =
|
|
&proc_reg_file_ops_no_compat;
|
|
else
|
|
#endif
|
|
inode->i_fop = &proc_reg_file_ops;
|
|
} else {
|
|
inode->i_fop = de->proc_fops;
|
|
}
|
|
}
|
|
} else
|
|
pde_put(de);
|
|
return inode;
|
|
}
|
|
|
|
int proc_fill_super(struct super_block *s)
|
|
{
|
|
struct inode *root_inode;
|
|
int ret;
|
|
|
|
s->s_flags |= MS_NODIRATIME | MS_NOSUID | MS_NOEXEC;
|
|
s->s_blocksize = 1024;
|
|
s->s_blocksize_bits = 10;
|
|
s->s_magic = PROC_SUPER_MAGIC;
|
|
s->s_op = &proc_sops;
|
|
s->s_time_gran = 1;
|
|
|
|
pde_get(&proc_root);
|
|
root_inode = proc_get_inode(s, &proc_root);
|
|
if (!root_inode) {
|
|
pr_err("proc_fill_super: get root inode failed\n");
|
|
return -ENOMEM;
|
|
}
|
|
|
|
s->s_root = d_make_root(root_inode);
|
|
if (!s->s_root) {
|
|
pr_err("proc_fill_super: allocate dentry failed\n");
|
|
return -ENOMEM;
|
|
}
|
|
|
|
ret = proc_setup_self(s);
|
|
if (ret) {
|
|
return ret;
|
|
}
|
|
return proc_setup_thread_self(s);
|
|
}
|