linux/init/init_task.c
Linus Torvalds 88264981f2 sched_ext: Initial pull request for v6.12
This is the initial pull request of sched_ext. The v7 patchset
 (https://lkml.kernel.org/r/20240618212056.2833381-1-tj@kernel.org) is
 applied on top of tip/sched/core + bpf/master as of Jun 18th.
 
   tip/sched/core 793a62823d1c ("sched/core: Drop spinlocks on contention iff kernel is preempti
 ble")
   bpf/master f6afdaf72a ("Merge branch 'bpf-support-resilient-split-btf'")
 
 Since then, the following pulls were made:
 
 - v6.11-rc1 is pulled to keep up with the mainline.
 
 - tip/sched/core was pulled several times:
 
   - 7b9f6c864a, 0df340ceae, 5ac998574f, 0b1777f0fa: To resolve
     conflicts. See each commit for details on conflicts and their
     resolutions.
 
   - d7b01aef9d: To receive fd03c5b858 ("sched: Rework pick_next_task()")
     and related commits. @prev in added to sched_class->put_prev_task() and
     put_prev_task() is reordered after ->pick_task(), which makes
     sched_class->switch_class() unnecessary. The follow-up commits update
     sched_ext accordingly and drop sched_class->switch_class().
 
 - bpf/master was pulled to receive baebe9aaba ("bpf: allow passing struct
   bpf_iter_<type> as kfunc arguments") and related changes in preparation
   for the DSQ iterator patchset
 
 To obtain the net sched_ext changes, diff against:
 
   git://git.kernel.org/pub/scm/linux/kernel/git/tj/sched_ext.git for-6.12-base
 
 which is the merge of:
 
   tip/sched/core bc9057da1a ("sched/cpufreq: Use NSEC_PER_MSEC for deadline task")
   bpf/master 2ad6d23f46 ("selftests/bpf: Do not update vmlinux.h unnecessarily")
 
 Since the v7 patchset, the following changes were made:
 
 - cpuperf support which was a part of the v6 patchset was posted separately
   and then applied after reviews.
 
 - cgroup support which was a part of the v6 patchset was posted seprately,
   iterated and then applied.
 
 - Improve integration with sched core.
 
 - Double locking usage in migration paths dropped. Depend on
   TASK_ON_RQ_MIGRATING synchronization instead.
 
 - The BPF scheduler couldn't directly dispatch to the local DSQ of another
   CPU using a SCX_DSQ_LOCAL_ON verdict. This caused difficulties around
   handling non-wakeup enqueues. Updated so that SCX_DSQ_LOCAL_ON can be used
   in the enqueue path too.
 
 - DSQ iterator which was a part of the v6 patchset was posted separately.
   The iterator itself was applied after a couple revisions. The associated
   selective consumption kfunc can use further improvements and is still
   being worked on.
 
 - scx_bpf_dispatch[_vtime]_from_dsq() added to increase flexibility. A task
   can now be transferred between two DSQs from almost any context. This
   involved significant refactoring of migration code.
 
 - Various fixes and improvements.
 
 As the branch is based on top of tip/sched/core + bpf/master, please merge
 after both are applied.
 -----BEGIN PGP SIGNATURE-----
 
 iIQEABYKACwWIQTfIjM1kS57o3GsC/uxYfJx3gVYGQUCZuOSuA4cdGpAa2VybmVs
 Lm9yZwAKCRCxYfJx3gVYGVZyAQDBU3WPkYKB8gl6a6YQ+/PzBXorOK7mioS9A2iJ
 vBR3FgEAg1vtcss1S+2juWmVq7ItiFNWCqtXzUr/bVmL9CqqDwA=
 =bOOC
 -----END PGP SIGNATURE-----

Merge tag 'sched_ext-for-6.12' of git://git.kernel.org/pub/scm/linux/kernel/git/tj/sched_ext

Pull sched_ext support from Tejun Heo:
 "This implements a new scheduler class called ‘ext_sched_class’, or
  sched_ext, which allows scheduling policies to be implemented as BPF
  programs.

  The goals of this are:

   - Ease of experimentation and exploration: Enabling rapid iteration
     of new scheduling policies.

   - Customization: Building application-specific schedulers which
     implement policies that are not applicable to general-purpose
     schedulers.

   - Rapid scheduler deployments: Non-disruptive swap outs of scheduling
     policies in production environments"

See individual commits for more documentation, but also the cover letter
for the latest series:

Link: https://lore.kernel.org/all/20240618212056.2833381-1-tj@kernel.org/

* tag 'sched_ext-for-6.12' of git://git.kernel.org/pub/scm/linux/kernel/git/tj/sched_ext: (110 commits)
  sched: Move update_other_load_avgs() to kernel/sched/pelt.c
  sched_ext: Don't trigger ops.quiescent/runnable() on migrations
  sched_ext: Synchronize bypass state changes with rq lock
  scx_qmap: Implement highpri boosting
  sched_ext: Implement scx_bpf_dispatch[_vtime]_from_dsq()
  sched_ext: Compact struct bpf_iter_scx_dsq_kern
  sched_ext: Replace consume_local_task() with move_local_task_to_local_dsq()
  sched_ext: Move consume_local_task() upward
  sched_ext: Move sanity check and dsq_mod_nr() into task_unlink_from_dsq()
  sched_ext: Reorder args for consume_local/remote_task()
  sched_ext: Restructure dispatch_to_local_dsq()
  sched_ext: Fix processs_ddsp_deferred_locals() by unifying DTL_INVALID handling
  sched_ext: Make find_dsq_for_dispatch() handle SCX_DSQ_LOCAL_ON
  sched_ext: Refactor consume_remote_task()
  sched_ext: Rename scx_kfunc_set_sleepable to unlocked and relocate
  sched_ext: Add missing static to scx_dump_data
  sched_ext: Add missing static to scx_has_op[]
  sched_ext: Temporarily work around pick_task_scx() being called without balance_scx()
  sched_ext: Add a cgroup scheduler which uses flattened hierarchy
  sched_ext: Add cgroup support
  ...
2024-09-21 09:44:57 -07:00

232 lines
6.5 KiB
C

// SPDX-License-Identifier: GPL-2.0
#include <linux/init_task.h>
#include <linux/export.h>
#include <linux/mqueue.h>
#include <linux/sched.h>
#include <linux/sched/sysctl.h>
#include <linux/sched/rt.h>
#include <linux/sched/task.h>
#include <linux/sched/ext.h>
#include <linux/init.h>
#include <linux/fs.h>
#include <linux/mm.h>
#include <linux/audit.h>
#include <linux/numa.h>
#include <linux/scs.h>
#include <linux/plist.h>
#include <linux/uaccess.h>
static struct signal_struct init_signals = {
.nr_threads = 1,
.thread_head = LIST_HEAD_INIT(init_task.thread_node),
.wait_chldexit = __WAIT_QUEUE_HEAD_INITIALIZER(init_signals.wait_chldexit),
.shared_pending = {
.list = LIST_HEAD_INIT(init_signals.shared_pending.list),
.signal = {{0}}
},
.multiprocess = HLIST_HEAD_INIT,
.rlim = INIT_RLIMITS,
.cred_guard_mutex = __MUTEX_INITIALIZER(init_signals.cred_guard_mutex),
.exec_update_lock = __RWSEM_INITIALIZER(init_signals.exec_update_lock),
#ifdef CONFIG_POSIX_TIMERS
.posix_timers = HLIST_HEAD_INIT,
.cputimer = {
.cputime_atomic = INIT_CPUTIME_ATOMIC,
},
#endif
INIT_CPU_TIMERS(init_signals)
.pids = {
[PIDTYPE_PID] = &init_struct_pid,
[PIDTYPE_TGID] = &init_struct_pid,
[PIDTYPE_PGID] = &init_struct_pid,
[PIDTYPE_SID] = &init_struct_pid,
},
INIT_PREV_CPUTIME(init_signals)
};
static struct sighand_struct init_sighand = {
.count = REFCOUNT_INIT(1),
.action = { { { .sa_handler = SIG_DFL, } }, },
.siglock = __SPIN_LOCK_UNLOCKED(init_sighand.siglock),
.signalfd_wqh = __WAIT_QUEUE_HEAD_INITIALIZER(init_sighand.signalfd_wqh),
};
#ifdef CONFIG_SHADOW_CALL_STACK
unsigned long init_shadow_call_stack[SCS_SIZE / sizeof(long)] = {
[(SCS_SIZE / sizeof(long)) - 1] = SCS_END_MAGIC
};
#endif
/*
* Set up the first task table, touch at your own risk!. Base=0,
* limit=0x1fffff (=2MB)
*/
struct task_struct init_task __aligned(L1_CACHE_BYTES) = {
#ifdef CONFIG_THREAD_INFO_IN_TASK
.thread_info = INIT_THREAD_INFO(init_task),
.stack_refcount = REFCOUNT_INIT(1),
#endif
.__state = 0,
.stack = init_stack,
.usage = REFCOUNT_INIT(2),
.flags = PF_KTHREAD,
.prio = MAX_PRIO - 20,
.static_prio = MAX_PRIO - 20,
.normal_prio = MAX_PRIO - 20,
.policy = SCHED_NORMAL,
.cpus_ptr = &init_task.cpus_mask,
.user_cpus_ptr = NULL,
.cpus_mask = CPU_MASK_ALL,
.max_allowed_capacity = SCHED_CAPACITY_SCALE,
.nr_cpus_allowed= NR_CPUS,
.mm = NULL,
.active_mm = &init_mm,
.faults_disabled_mapping = NULL,
.restart_block = {
.fn = do_no_restart_syscall,
},
.se = {
.group_node = LIST_HEAD_INIT(init_task.se.group_node),
},
.rt = {
.run_list = LIST_HEAD_INIT(init_task.rt.run_list),
.time_slice = RR_TIMESLICE,
},
.tasks = LIST_HEAD_INIT(init_task.tasks),
#ifdef CONFIG_SMP
.pushable_tasks = PLIST_NODE_INIT(init_task.pushable_tasks, MAX_PRIO),
#endif
#ifdef CONFIG_CGROUP_SCHED
.sched_task_group = &root_task_group,
#endif
#ifdef CONFIG_SCHED_CLASS_EXT
.scx = {
.dsq_list.node = LIST_HEAD_INIT(init_task.scx.dsq_list.node),
.sticky_cpu = -1,
.holding_cpu = -1,
.runnable_node = LIST_HEAD_INIT(init_task.scx.runnable_node),
.runnable_at = INITIAL_JIFFIES,
.ddsp_dsq_id = SCX_DSQ_INVALID,
.slice = SCX_SLICE_DFL,
},
#endif
.ptraced = LIST_HEAD_INIT(init_task.ptraced),
.ptrace_entry = LIST_HEAD_INIT(init_task.ptrace_entry),
.real_parent = &init_task,
.parent = &init_task,
.children = LIST_HEAD_INIT(init_task.children),
.sibling = LIST_HEAD_INIT(init_task.sibling),
.group_leader = &init_task,
RCU_POINTER_INITIALIZER(real_cred, &init_cred),
RCU_POINTER_INITIALIZER(cred, &init_cred),
.comm = INIT_TASK_COMM,
.thread = INIT_THREAD,
.fs = &init_fs,
.files = &init_files,
#ifdef CONFIG_IO_URING
.io_uring = NULL,
#endif
.signal = &init_signals,
.sighand = &init_sighand,
.nsproxy = &init_nsproxy,
.pending = {
.list = LIST_HEAD_INIT(init_task.pending.list),
.signal = {{0}}
},
.blocked = {{0}},
.alloc_lock = __SPIN_LOCK_UNLOCKED(init_task.alloc_lock),
.journal_info = NULL,
INIT_CPU_TIMERS(init_task)
.pi_lock = __RAW_SPIN_LOCK_UNLOCKED(init_task.pi_lock),
.timer_slack_ns = 50000, /* 50 usec default slack */
.thread_pid = &init_struct_pid,
.thread_node = LIST_HEAD_INIT(init_signals.thread_head),
#ifdef CONFIG_AUDIT
.loginuid = INVALID_UID,
.sessionid = AUDIT_SID_UNSET,
#endif
#ifdef CONFIG_PERF_EVENTS
.perf_event_mutex = __MUTEX_INITIALIZER(init_task.perf_event_mutex),
.perf_event_list = LIST_HEAD_INIT(init_task.perf_event_list),
#endif
#ifdef CONFIG_PREEMPT_RCU
.rcu_read_lock_nesting = 0,
.rcu_read_unlock_special.s = 0,
.rcu_node_entry = LIST_HEAD_INIT(init_task.rcu_node_entry),
.rcu_blocked_node = NULL,
#endif
#ifdef CONFIG_TASKS_RCU
.rcu_tasks_holdout = false,
.rcu_tasks_holdout_list = LIST_HEAD_INIT(init_task.rcu_tasks_holdout_list),
.rcu_tasks_idle_cpu = -1,
.rcu_tasks_exit_list = LIST_HEAD_INIT(init_task.rcu_tasks_exit_list),
#endif
#ifdef CONFIG_TASKS_TRACE_RCU
.trc_reader_nesting = 0,
.trc_reader_special.s = 0,
.trc_holdout_list = LIST_HEAD_INIT(init_task.trc_holdout_list),
.trc_blkd_node = LIST_HEAD_INIT(init_task.trc_blkd_node),
#endif
#ifdef CONFIG_CPUSETS
.mems_allowed_seq = SEQCNT_SPINLOCK_ZERO(init_task.mems_allowed_seq,
&init_task.alloc_lock),
#endif
#ifdef CONFIG_RT_MUTEXES
.pi_waiters = RB_ROOT_CACHED,
.pi_top_task = NULL,
#endif
INIT_PREV_CPUTIME(init_task)
#ifdef CONFIG_VIRT_CPU_ACCOUNTING_GEN
.vtime.seqcount = SEQCNT_ZERO(init_task.vtime_seqcount),
.vtime.starttime = 0,
.vtime.state = VTIME_SYS,
#endif
#ifdef CONFIG_NUMA_BALANCING
.numa_preferred_nid = NUMA_NO_NODE,
.numa_group = NULL,
.numa_faults = NULL,
#endif
#if defined(CONFIG_KASAN_GENERIC) || defined(CONFIG_KASAN_SW_TAGS)
.kasan_depth = 1,
#endif
#ifdef CONFIG_KCSAN
.kcsan_ctx = {
.scoped_accesses = {LIST_POISON1, NULL},
},
#endif
#ifdef CONFIG_TRACE_IRQFLAGS
.softirqs_enabled = 1,
#endif
#ifdef CONFIG_LOCKDEP
.lockdep_depth = 0, /* no locks held yet */
.curr_chain_key = INITIAL_CHAIN_KEY,
.lockdep_recursion = 0,
#endif
#ifdef CONFIG_FUNCTION_GRAPH_TRACER
.ret_stack = NULL,
.tracing_graph_pause = ATOMIC_INIT(0),
#endif
#if defined(CONFIG_TRACING) && defined(CONFIG_PREEMPTION)
.trace_recursion = 0,
#endif
#ifdef CONFIG_LIVEPATCH
.patch_state = KLP_TRANSITION_IDLE,
#endif
#ifdef CONFIG_SECURITY
.security = NULL,
#endif
#ifdef CONFIG_SECCOMP_FILTER
.seccomp = { .filter_count = ATOMIC_INIT(0) },
#endif
};
EXPORT_SYMBOL(init_task);
/*
* Initial thread structure. Alignment of this is handled by a special
* linker map entry.
*/
#ifndef CONFIG_THREAD_INFO_IN_TASK
struct thread_info init_thread_info __init_thread_info = INIT_THREAD_INFO(init_task);
#endif