linux/block/ioprio.c
Jens Axboe 3b7cb74547 block: move __get_task_ioprio() into header file
We call this once per IO, which can be millions of times per second.
Since nobody really uses io priorities, or at least it isn't very
common, this is all wasted time and can amount to as much as 3% of
the total kernel time.

Reviewed-by: Bart Van Assche <bvanassche@acm.org>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
2024-01-08 12:27:39 -07:00

254 lines
5.2 KiB
C

// SPDX-License-Identifier: GPL-2.0
/*
* fs/ioprio.c
*
* Copyright (C) 2004 Jens Axboe <axboe@kernel.dk>
*
* Helper functions for setting/querying io priorities of processes. The
* system calls closely mimmick getpriority/setpriority, see the man page for
* those. The prio argument is a composite of prio class and prio data, where
* the data argument has meaning within that class. The standard scheduling
* classes have 8 distinct prio levels, with 0 being the highest prio and 7
* being the lowest.
*
* IOW, setting BE scheduling class with prio 2 is done ala:
*
* unsigned int prio = (IOPRIO_CLASS_BE << IOPRIO_CLASS_SHIFT) | 2;
*
* ioprio_set(PRIO_PROCESS, pid, prio);
*
* See also Documentation/block/ioprio.rst
*
*/
#include <linux/gfp.h>
#include <linux/kernel.h>
#include <linux/ioprio.h>
#include <linux/cred.h>
#include <linux/blkdev.h>
#include <linux/capability.h>
#include <linux/syscalls.h>
#include <linux/security.h>
#include <linux/pid_namespace.h>
int ioprio_check_cap(int ioprio)
{
int class = IOPRIO_PRIO_CLASS(ioprio);
int level = IOPRIO_PRIO_LEVEL(ioprio);
switch (class) {
case IOPRIO_CLASS_RT:
/*
* Originally this only checked for CAP_SYS_ADMIN,
* which was implicitly allowed for pid 0 by security
* modules such as SELinux. Make sure we check
* CAP_SYS_ADMIN first to avoid a denial/avc for
* possibly missing CAP_SYS_NICE permission.
*/
if (!capable(CAP_SYS_ADMIN) && !capable(CAP_SYS_NICE))
return -EPERM;
fallthrough;
/* rt has prio field too */
case IOPRIO_CLASS_BE:
if (level >= IOPRIO_NR_LEVELS)
return -EINVAL;
break;
case IOPRIO_CLASS_IDLE:
break;
case IOPRIO_CLASS_NONE:
if (level)
return -EINVAL;
break;
case IOPRIO_CLASS_INVALID:
default:
return -EINVAL;
}
return 0;
}
SYSCALL_DEFINE3(ioprio_set, int, which, int, who, int, ioprio)
{
struct task_struct *p, *g;
struct user_struct *user;
struct pid *pgrp;
kuid_t uid;
int ret;
ret = ioprio_check_cap(ioprio);
if (ret)
return ret;
ret = -ESRCH;
rcu_read_lock();
switch (which) {
case IOPRIO_WHO_PROCESS:
if (!who)
p = current;
else
p = find_task_by_vpid(who);
if (p)
ret = set_task_ioprio(p, ioprio);
break;
case IOPRIO_WHO_PGRP:
if (!who)
pgrp = task_pgrp(current);
else
pgrp = find_vpid(who);
read_lock(&tasklist_lock);
do_each_pid_thread(pgrp, PIDTYPE_PGID, p) {
ret = set_task_ioprio(p, ioprio);
if (ret) {
read_unlock(&tasklist_lock);
goto out;
}
} while_each_pid_thread(pgrp, PIDTYPE_PGID, p);
read_unlock(&tasklist_lock);
break;
case IOPRIO_WHO_USER:
uid = make_kuid(current_user_ns(), who);
if (!uid_valid(uid))
break;
if (!who)
user = current_user();
else
user = find_user(uid);
if (!user)
break;
for_each_process_thread(g, p) {
if (!uid_eq(task_uid(p), uid) ||
!task_pid_vnr(p))
continue;
ret = set_task_ioprio(p, ioprio);
if (ret)
goto free_uid;
}
free_uid:
if (who)
free_uid(user);
break;
default:
ret = -EINVAL;
}
out:
rcu_read_unlock();
return ret;
}
static int get_task_ioprio(struct task_struct *p)
{
int ret;
ret = security_task_getioprio(p);
if (ret)
goto out;
task_lock(p);
ret = __get_task_ioprio(p);
task_unlock(p);
out:
return ret;
}
/*
* Return raw IO priority value as set by userspace. We use this for
* ioprio_get(pid, IOPRIO_WHO_PROCESS) so that we keep historical behavior and
* also so that userspace can distinguish unset IO priority (which just gets
* overriden based on task's nice value) from IO priority set to some value.
*/
static int get_task_raw_ioprio(struct task_struct *p)
{
int ret;
ret = security_task_getioprio(p);
if (ret)
goto out;
task_lock(p);
if (p->io_context)
ret = p->io_context->ioprio;
else
ret = IOPRIO_DEFAULT;
task_unlock(p);
out:
return ret;
}
static int ioprio_best(unsigned short aprio, unsigned short bprio)
{
return min(aprio, bprio);
}
SYSCALL_DEFINE2(ioprio_get, int, which, int, who)
{
struct task_struct *g, *p;
struct user_struct *user;
struct pid *pgrp;
kuid_t uid;
int ret = -ESRCH;
int tmpio;
rcu_read_lock();
switch (which) {
case IOPRIO_WHO_PROCESS:
if (!who)
p = current;
else
p = find_task_by_vpid(who);
if (p)
ret = get_task_raw_ioprio(p);
break;
case IOPRIO_WHO_PGRP:
if (!who)
pgrp = task_pgrp(current);
else
pgrp = find_vpid(who);
read_lock(&tasklist_lock);
do_each_pid_thread(pgrp, PIDTYPE_PGID, p) {
tmpio = get_task_ioprio(p);
if (tmpio < 0)
continue;
if (ret == -ESRCH)
ret = tmpio;
else
ret = ioprio_best(ret, tmpio);
} while_each_pid_thread(pgrp, PIDTYPE_PGID, p);
read_unlock(&tasklist_lock);
break;
case IOPRIO_WHO_USER:
uid = make_kuid(current_user_ns(), who);
if (!who)
user = current_user();
else
user = find_user(uid);
if (!user)
break;
for_each_process_thread(g, p) {
if (!uid_eq(task_uid(p), user->uid) ||
!task_pid_vnr(p))
continue;
tmpio = get_task_ioprio(p);
if (tmpio < 0)
continue;
if (ret == -ESRCH)
ret = tmpio;
else
ret = ioprio_best(ret, tmpio);
}
if (who)
free_uid(user);
break;
default:
ret = -EINVAL;
}
rcu_read_unlock();
return ret;
}