linux/arch/powerpc/include/asm/cputhreads.h
Srikar Dronamraju f3232321db powerpc/topology: Override cpu_smt_mask
On Power9, a pair of SMT4 cores can be presented by the firmware as a SMT8
core for backward compatibility reasons, with the fusion of two SMT4 cores.
Powerpc allows LPARs to be live migrated from Power8 to Power9.  Existing
software developed/configured for Power8, expects to see a SMT8 core.

In order to maintain userspace backward compatibility (with Power8 chips in
case of Power9) in enterprise Linux systems, the topology_sibling_cpumask
has to be set to SMT8 core.

cpu_smt_mask() should generally point to the cpu mask of the SMT4 core.
Hence override the default cpu_smt_mask() to be powerpc specific
allowing for better scheduling behaviour on Power.

schbench
(latency measured in usecs, so lesser is better)
Without patch                   With patch
Latency percentiles (usec)	Latency percentiles (usec)
	50.0000th: 34           	50.0000th: 38
	75.0000th: 47           	75.0000th: 52
	90.0000th: 54           	90.0000th: 60
	95.0000th: 57           	95.0000th: 64
	*99.0000th: 62          	*99.0000th: 72
	99.5000th: 65           	99.5000th: 75
	99.9000th: 76           	99.9000th: 3452
	min=0, max=9205         	min=0, max=9344

schbench (With Cede disabled)
Without patch                   With patch
Latency percentiles (usec) 	Latency percentiles (usec)
	50.0000th: 20           	50.0000th: 21
	75.0000th: 28           	75.0000th: 29
	90.0000th: 33           	90.0000th: 34
	95.0000th: 35           	95.0000th: 37
	*99.0000th: 40          	*99.0000th: 40
	99.5000th: 48           	99.5000th: 42
	99.9000th: 94           	99.9000th: 79
	min=0, max=791          	min=0, max=791

perf bench sched pipe
usec/ops : lesser is better
Without patch
  N           Min           Max        Median           Avg        Stddev
101      5.095113      5.595269      5.204842     5.2298776    0.10762713

5.10 - 5.15 : ##################################################   23% (24)
5.15 - 5.20 : #############################################        21% (22)
5.20 - 5.25 : ##################################################   23% (24)
5.25 - 5.30 : #########################                            11% (12)
5.30 - 5.35 : ##########                                            4% (5)
5.35 - 5.40 : ########                                              3% (4)
5.40 - 5.45 : ########                                              3% (4)
5.45 - 5.50 : ####                                                  1% (2)
5.50 - 5.55 : ##                                                    0% (1)
5.55 - 5.60 : ####                                                  1% (2)

With patch
  N           Min           Max        Median           Avg        Stddev
101      5.134675      8.524719      5.207658     5.2780985    0.34911969

5.1 - 5.5 : ##################################################   94% (95)
5.5 - 5.8 : ##                                                    3% (4)
5.8 - 6.2 :                                                       0% (1)
6.2 - 6.5 :
6.5 - 6.8 :
6.8 - 7.2 :
7.2 - 7.5 :
7.5 - 7.8 :
7.8 - 8.2 :
8.2 - 8.5 :

perf bench sched pipe (cede disabled)
usec/ops : lesser is better
Without patch
  N           Min           Max        Median           Avg        Stddev
101      7.884227     12.576538      7.956474     8.0170722    0.46159054

7.9 - 8.4 : ##################################################   99% (100)
8.4 - 8.8 :
8.8 - 9.3 :
9.3 - 9.8 :
9.8 - 10.2 :
10.2 - 10.7 :
10.7 - 11.2 :
11.2 - 11.6 :
11.6 - 12.1 :
12.1 - 12.6 :

With patch
  N           Min           Max        Median           Avg        Stddev
101      7.956021      8.217284      8.015615     8.0283866   0.049844967

7.96 - 7.98 : ######################                               12% (13)
7.98 - 8.01 : ##################################################   28% (29)
8.01 - 8.03 : ####################################                 20% (21)
8.03 - 8.06 : #########################                            14% (15)
8.06 - 8.09 : ######################                               12% (13)
8.09 - 8.11 : ######                                                3% (4)
8.11 - 8.14 : ###                                                   1% (2)
8.14 - 8.17 : ###                                                   1% (2)
8.17 - 8.19 :
8.19 - 8.22 : #                                                     0% (1)

Observations: With the patch, the initial run/iteration takes a slight
longer time. This can be attributed to the fact that now we pick a CPU
from a idle core which could be sleep mode. Once we remove the cede,
state the numbers improve in favour of the patch.

ebizzy:
transactions per second (higher is better)
without patch
  N           Min           Max        Median           Avg        Stddev
100       1018433       1304470       1193208     1182315.7     60018.733

1018433 - 1047037 : ######                                                3% (3)
1047037 - 1075640 : ########                                              4% (4)
1075640 - 1104244 : ########                                              4% (4)
1104244 - 1132848 : ###############                                       7% (7)
1132848 - 1161452 : ####################################                 17% (17)
1161452 - 1190055 : ##########################                           12% (12)
1190055 - 1218659 : #############################################        21% (21)
1218659 - 1247263 : ##################################################   23% (23)
1247263 - 1275866 : ########                                              4% (4)
1275866 - 1304470 : ########                                              4% (4)

with patch
  N           Min           Max        Median           Avg        Stddev
100        967014       1292938       1208819     1185281.8     69815.851

 967014 - 999606  : ##                                                    1% (1)
 999606 - 1032199 : ##                                                    1% (1)
1032199 - 1064791 : ############                                          6% (6)
1064791 - 1097384 : ##########                                            5% (5)
1097384 - 1129976 : ##################                                    9% (9)
1129976 - 1162568 : ####################                                 10% (10)
1162568 - 1195161 : ##########################                           13% (13)
1195161 - 1227753 : ############################################         22% (22)
1227753 - 1260346 : ##################################################   25% (25)
1260346 - 1292938 : ##############                                        7% (7)

Observations: Not much changes, ebizzy is not much impacted.

Signed-off-by: Srikar Dronamraju <srikar@linux.vnet.ibm.com>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
Link: https://lore.kernel.org/r/20200807074517.27957-2-srikar@linux.vnet.ibm.com
2020-09-16 22:05:19 +10:00

119 lines
2.9 KiB
C

/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _ASM_POWERPC_CPUTHREADS_H
#define _ASM_POWERPC_CPUTHREADS_H
#ifndef __ASSEMBLY__
#include <linux/cpumask.h>
#include <asm/cpu_has_feature.h>
/*
* Mapping of threads to cores
*
* Note: This implementation is limited to a power of 2 number of
* threads per core and the same number for each core in the system
* (though it would work if some processors had less threads as long
* as the CPU numbers are still allocated, just not brought online).
*
* However, the API allows for a different implementation in the future
* if needed, as long as you only use the functions and not the variables
* directly.
*/
#ifdef CONFIG_SMP
extern int threads_per_core;
extern int threads_per_subcore;
extern int threads_shift;
extern cpumask_t threads_core_mask;
#else
#define threads_per_core 1
#define threads_per_subcore 1
#define threads_shift 0
#define has_big_cores 0
#define threads_core_mask (*get_cpu_mask(0))
#endif
/* cpu_thread_mask_to_cores - Return a cpumask of one per cores
* hit by the argument
*
* @threads: a cpumask of online threads
*
* This function returns a cpumask which will have one online cpu's
* bit set for each core that has at least one thread set in the argument.
*
* This can typically be used for things like IPI for tlb invalidations
* since those need to be done only once per core/TLB
*/
static inline cpumask_t cpu_thread_mask_to_cores(const struct cpumask *threads)
{
cpumask_t tmp, res;
int i, cpu;
cpumask_clear(&res);
for (i = 0; i < NR_CPUS; i += threads_per_core) {
cpumask_shift_left(&tmp, &threads_core_mask, i);
if (cpumask_intersects(threads, &tmp)) {
cpu = cpumask_next_and(-1, &tmp, cpu_online_mask);
if (cpu < nr_cpu_ids)
cpumask_set_cpu(cpu, &res);
}
}
return res;
}
static inline int cpu_nr_cores(void)
{
return nr_cpu_ids >> threads_shift;
}
static inline cpumask_t cpu_online_cores_map(void)
{
return cpu_thread_mask_to_cores(cpu_online_mask);
}
#ifdef CONFIG_SMP
int cpu_core_index_of_thread(int cpu);
int cpu_first_thread_of_core(int core);
#else
static inline int cpu_core_index_of_thread(int cpu) { return cpu; }
static inline int cpu_first_thread_of_core(int core) { return core; }
#endif
static inline int cpu_thread_in_core(int cpu)
{
return cpu & (threads_per_core - 1);
}
static inline int cpu_thread_in_subcore(int cpu)
{
return cpu & (threads_per_subcore - 1);
}
static inline int cpu_first_thread_sibling(int cpu)
{
return cpu & ~(threads_per_core - 1);
}
static inline int cpu_last_thread_sibling(int cpu)
{
return cpu | (threads_per_core - 1);
}
static inline u32 get_tensr(void)
{
#ifdef CONFIG_BOOKE
if (cpu_has_feature(CPU_FTR_SMT))
return mfspr(SPRN_TENSR);
#endif
return 1;
}
void book3e_start_thread(int thread, unsigned long addr);
void book3e_stop_thread(int thread);
#endif /* __ASSEMBLY__ */
#define INVALID_THREAD_HWID 0x0fff
#endif /* _ASM_POWERPC_CPUTHREADS_H */