From bbbfeac92beff40eb86c7f682a7f1395f9f0ae52 Mon Sep 17 00:00:00 2001 From: Nathan Zimmer Date: Thu, 21 Feb 2013 15:15:09 -0800 Subject: [PATCH] sched: Fix /proc/sched_debug failure on very very large systems On systems with 4096 cores attemping to read /proc/sched_debug fails because we are trying to push all the data into a single kmalloc buffer. The issue is on these very large machines all the data will not fit in 4mb. A better solution is to not us the single_open mechanism but to provide our own seq_operations and treat each cpu as an individual record. The output should be identical to the previous version. Reported-by: Dave Jones Signed-off-by: Nathan Zimmer Cc: Peter Zijlstra ) [ Whitespace fixlet] [ Fix spello in comment] Signed-off-by: Andrew Morton Signed-off-by: Ingo Molnar --- kernel/sched/debug.c | 92 ++++++++++++++++++++++++++++++++++++++------ 1 file changed, 80 insertions(+), 12 deletions(-) diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c index 7ae4c4c5420e..c496eb3c6459 100644 --- a/kernel/sched/debug.c +++ b/kernel/sched/debug.c @@ -269,11 +269,11 @@ static void print_cpu(struct seq_file *m, int cpu) { unsigned int freq = cpu_khz ? : 1; - SEQ_printf(m, "\ncpu#%d, %u.%03u MHz\n", + SEQ_printf(m, "cpu#%d, %u.%03u MHz\n", cpu, freq / 1000, (freq % 1000)); } #else - SEQ_printf(m, "\ncpu#%d\n", cpu); + SEQ_printf(m, "cpu#%d\n", cpu); #endif #define P(x) \ @@ -330,6 +330,7 @@ do { \ print_rq(m, rq, cpu); rcu_read_unlock(); spin_unlock_irqrestore(&sched_debug_lock, flags); + SEQ_printf(m, "\n"); } static const char *sched_tunable_scaling_names[] = { @@ -338,11 +339,10 @@ static const char *sched_tunable_scaling_names[] = { "linear" }; -static int sched_debug_show(struct seq_file *m, void *v) +static void sched_debug_header(struct seq_file *m) { u64 ktime, sched_clk, cpu_clk; unsigned long flags; - int cpu; local_irq_save(flags); ktime = ktime_to_ns(ktime_get()); @@ -384,33 +384,101 @@ static int sched_debug_show(struct seq_file *m, void *v) #undef PN #undef P - SEQ_printf(m, " .%-40s: %d (%s)\n", "sysctl_sched_tunable_scaling", + SEQ_printf(m, " .%-40s: %d (%s)\n", + "sysctl_sched_tunable_scaling", sysctl_sched_tunable_scaling, sched_tunable_scaling_names[sysctl_sched_tunable_scaling]); - - for_each_online_cpu(cpu) - print_cpu(m, cpu); - SEQ_printf(m, "\n"); +} + +static int sched_debug_show(struct seq_file *m, void *v) +{ + int cpu = (unsigned long)(v - 2); + + if (cpu != -1) + print_cpu(m, cpu); + else + sched_debug_header(m); return 0; } void sysrq_sched_debug_show(void) { - sched_debug_show(NULL, NULL); + int cpu; + + sched_debug_header(NULL); + for_each_online_cpu(cpu) + print_cpu(NULL, cpu); + +} + +/* + * This itererator needs some explanation. + * It returns 1 for the header position. + * This means 2 is cpu 0. + * In a hotplugged system some cpus, including cpu 0, may be missing so we have + * to use cpumask_* to iterate over the cpus. + */ +static void *sched_debug_start(struct seq_file *file, loff_t *offset) +{ + unsigned long n = *offset; + + if (n == 0) + return (void *) 1; + + n--; + + if (n > 0) + n = cpumask_next(n - 1, cpu_online_mask); + else + n = cpumask_first(cpu_online_mask); + + *offset = n + 1; + + if (n < nr_cpu_ids) + return (void *)(unsigned long)(n + 2); + return NULL; +} + +static void *sched_debug_next(struct seq_file *file, void *data, loff_t *offset) +{ + (*offset)++; + return sched_debug_start(file, offset); +} + +static void sched_debug_stop(struct seq_file *file, void *data) +{ +} + +static const struct seq_operations sched_debug_sops = { + .start = sched_debug_start, + .next = sched_debug_next, + .stop = sched_debug_stop, + .show = sched_debug_show, +}; + +static int sched_debug_release(struct inode *inode, struct file *file) +{ + seq_release(inode, file); + + return 0; } static int sched_debug_open(struct inode *inode, struct file *filp) { - return single_open(filp, sched_debug_show, NULL); + int ret = 0; + + ret = seq_open(filp, &sched_debug_sops); + + return ret; } static const struct file_operations sched_debug_fops = { .open = sched_debug_open, .read = seq_read, .llseek = seq_lseek, - .release = single_release, + .release = sched_debug_release, }; static int __init init_sched_debug_procfs(void)