linux/kernel/tsacct.c
Dr. Thomas Orgis 0e0af57e0e taskstats: version 12 with thread group and exe info
The task exit struct needs some crucial information to be able to provide
an enhanced version of process and thread accounting.  This change
provides:

1. ac_tgid in additon to ac_pid
2. thread group execution walltime in ac_tgetime
3. flag AGROUP in ac_flag to indicate the last task
   in a thread group / process
4. device ID and inode of task's /proc/self/exe in
   ac_exe_dev and ac_exe_inode
5. tools/accounting/procacct as demonstrator

When a task exits, taskstats are reported to userspace including the
task's pid and ppid, but without the id of the thread group this task is
part of.  Without the tgid, the stats of single tasks cannot be correlated
to each other as a thread group (process).

The taskstats documentation suggests that on process exit a data set
consisting of accumulated stats for the whole group is produced.  But such
an additional set of stats is only produced for actually multithreaded
processes, not groups that had only one thread, and also those stats only
contain data about delay accounting and not the more basic information
about CPU and memory resource usage.  Adding the AGROUP flag to be set
when the last task of a group exited enables determination of process end
also for single-threaded processes.

My applicaton basically does enhanced process accounting with summed
cputime, biggest maxrss, tasks per process.  The data is not available
with the traditional BSD process accounting (which is not designed to be
extensible) and the taskstats interface allows more efficient on-the-fly
grouping and summing of the stats, anyway, without intermediate disk
writes.

Furthermore, I do carry statistics on which exact program binary is used
how often with associated resources, getting a picture on how important
which parts of a collection of installed scientific software in different
versions are, and how well they put load on the machine.  This is enabled
by providing information on /proc/self/exe for each task.  I assume the
two 64-bit fields for device ID and inode are more appropriate than the
possibly large resolved path to keep the data volume down.

Add the tgid to the stats to complete task identification, the flag AGROUP
to mark the last task of a group, the group wallclock time, and
inode-based identification of the associated executable file.

Add tools/accounting/procacct.c as a simplified fork of getdelays.c to
demonstrate process and thread accounting.

[thomas.orgis@uni-hamburg.de: fix version number in comment]
  Link: https://lkml.kernel.org/r/20220405003601.7a5f6008@plasteblaster
Link: https://lkml.kernel.org/r/20220331004106.64e5616b@plasteblaster
Signed-off-by: Dr. Thomas Orgis <thomas.orgis@uni-hamburg.de>
Reviewed-by: Ismael Luceno <ismael@iodev.co.uk>
Cc: Balbir Singh <bsingharora@gmail.com>
Cc: Eric W. Biederman <ebiederm@xmission.com>
Cc: xu xin <xu.xin16@zte.com.cn>
Cc: Yang Yang <yang.yang29@zte.com.cn>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2022-04-29 14:38:03 -07:00

183 lines
5.0 KiB
C

// SPDX-License-Identifier: GPL-2.0-or-later
/*
* tsacct.c - System accounting over taskstats interface
*
* Copyright (C) Jay Lan, <jlan@sgi.com>
*/
#include <linux/kernel.h>
#include <linux/sched/signal.h>
#include <linux/sched/mm.h>
#include <linux/sched/cputime.h>
#include <linux/tsacct_kern.h>
#include <linux/acct.h>
#include <linux/jiffies.h>
#include <linux/mm.h>
/*
* fill in basic accounting fields
*/
void bacct_add_tsk(struct user_namespace *user_ns,
struct pid_namespace *pid_ns,
struct taskstats *stats, struct task_struct *tsk)
{
const struct cred *tcred;
u64 utime, stime, utimescaled, stimescaled;
u64 now_ns, delta;
time64_t btime;
BUILD_BUG_ON(TS_COMM_LEN < TASK_COMM_LEN);
/* calculate task elapsed time in nsec */
now_ns = ktime_get_ns();
/* store whole group time first */
delta = now_ns - tsk->group_leader->start_time;
/* Convert to micro seconds */
do_div(delta, NSEC_PER_USEC);
stats->ac_tgetime = delta;
delta = now_ns - tsk->start_time;
do_div(delta, NSEC_PER_USEC);
stats->ac_etime = delta;
/* Convert to seconds for btime (note y2106 limit) */
btime = ktime_get_real_seconds() - div_u64(delta, USEC_PER_SEC);
stats->ac_btime = clamp_t(time64_t, btime, 0, U32_MAX);
stats->ac_btime64 = btime;
if (tsk->flags & PF_EXITING)
stats->ac_exitcode = tsk->exit_code;
if (thread_group_leader(tsk) && (tsk->flags & PF_FORKNOEXEC))
stats->ac_flag |= AFORK;
if (tsk->flags & PF_SUPERPRIV)
stats->ac_flag |= ASU;
if (tsk->flags & PF_DUMPCORE)
stats->ac_flag |= ACORE;
if (tsk->flags & PF_SIGNALED)
stats->ac_flag |= AXSIG;
stats->ac_nice = task_nice(tsk);
stats->ac_sched = tsk->policy;
stats->ac_pid = task_pid_nr_ns(tsk, pid_ns);
stats->ac_tgid = task_tgid_nr_ns(tsk, pid_ns);
rcu_read_lock();
tcred = __task_cred(tsk);
stats->ac_uid = from_kuid_munged(user_ns, tcred->uid);
stats->ac_gid = from_kgid_munged(user_ns, tcred->gid);
stats->ac_ppid = pid_alive(tsk) ?
task_tgid_nr_ns(rcu_dereference(tsk->real_parent), pid_ns) : 0;
rcu_read_unlock();
task_cputime(tsk, &utime, &stime);
stats->ac_utime = div_u64(utime, NSEC_PER_USEC);
stats->ac_stime = div_u64(stime, NSEC_PER_USEC);
task_cputime_scaled(tsk, &utimescaled, &stimescaled);
stats->ac_utimescaled = div_u64(utimescaled, NSEC_PER_USEC);
stats->ac_stimescaled = div_u64(stimescaled, NSEC_PER_USEC);
stats->ac_minflt = tsk->min_flt;
stats->ac_majflt = tsk->maj_flt;
strncpy(stats->ac_comm, tsk->comm, sizeof(stats->ac_comm));
}
#ifdef CONFIG_TASK_XACCT
#define KB 1024
#define MB (1024*KB)
#define KB_MASK (~(KB-1))
/*
* fill in extended accounting fields
*/
void xacct_add_tsk(struct taskstats *stats, struct task_struct *p)
{
struct mm_struct *mm;
/* convert pages-nsec/1024 to Mbyte-usec, see __acct_update_integrals */
stats->coremem = p->acct_rss_mem1 * PAGE_SIZE;
do_div(stats->coremem, 1000 * KB);
stats->virtmem = p->acct_vm_mem1 * PAGE_SIZE;
do_div(stats->virtmem, 1000 * KB);
mm = get_task_mm(p);
if (mm) {
/* adjust to KB unit */
stats->hiwater_rss = get_mm_hiwater_rss(mm) * PAGE_SIZE / KB;
stats->hiwater_vm = get_mm_hiwater_vm(mm) * PAGE_SIZE / KB;
mmput(mm);
}
stats->read_char = p->ioac.rchar & KB_MASK;
stats->write_char = p->ioac.wchar & KB_MASK;
stats->read_syscalls = p->ioac.syscr & KB_MASK;
stats->write_syscalls = p->ioac.syscw & KB_MASK;
#ifdef CONFIG_TASK_IO_ACCOUNTING
stats->read_bytes = p->ioac.read_bytes & KB_MASK;
stats->write_bytes = p->ioac.write_bytes & KB_MASK;
stats->cancelled_write_bytes = p->ioac.cancelled_write_bytes & KB_MASK;
#else
stats->read_bytes = 0;
stats->write_bytes = 0;
stats->cancelled_write_bytes = 0;
#endif
}
#undef KB
#undef MB
static void __acct_update_integrals(struct task_struct *tsk,
u64 utime, u64 stime)
{
u64 time, delta;
if (!likely(tsk->mm))
return;
time = stime + utime;
delta = time - tsk->acct_timexpd;
if (delta < TICK_NSEC)
return;
tsk->acct_timexpd = time;
/*
* Divide by 1024 to avoid overflow, and to avoid division.
* The final unit reported to userspace is Mbyte-usecs,
* the rest of the math is done in xacct_add_tsk.
*/
tsk->acct_rss_mem1 += delta * get_mm_rss(tsk->mm) >> 10;
tsk->acct_vm_mem1 += delta * READ_ONCE(tsk->mm->total_vm) >> 10;
}
/**
* acct_update_integrals - update mm integral fields in task_struct
* @tsk: task_struct for accounting
*/
void acct_update_integrals(struct task_struct *tsk)
{
u64 utime, stime;
unsigned long flags;
local_irq_save(flags);
task_cputime(tsk, &utime, &stime);
__acct_update_integrals(tsk, utime, stime);
local_irq_restore(flags);
}
/**
* acct_account_cputime - update mm integral after cputime update
* @tsk: task_struct for accounting
*/
void acct_account_cputime(struct task_struct *tsk)
{
__acct_update_integrals(tsk, tsk->utime, tsk->stime);
}
/**
* acct_clear_integrals - clear the mm integral fields in task_struct
* @tsk: task_struct whose accounting fields are cleared
*/
void acct_clear_integrals(struct task_struct *tsk)
{
tsk->acct_timexpd = 0;
tsk->acct_rss_mem1 = 0;
tsk->acct_vm_mem1 = 0;
}
#endif