mirror of
https://github.com/torvalds/linux
synced 2024-11-05 18:23:50 +00:00
cbc917a1b0
Some platforms have 'cluster' topology and CPUs in the cluster will
share resources like L3 Cache Tag (for HiSilicon Kunpeng SoC) or L2
cache (for Intel Jacobsville). Currently parsing and building cluster
topology have been supported since [1].
perf stat already supports aggregation for other topologies like
die or socket, etc. It'll be useful to aggregate per-cluster to find
problems like L3T bandwidth contention.
This patch adds support for the "--per-cluster" option for per-cluster
aggregation. Also update the docs and related test. The output will
be like:
[root@localhost tmp]# perf stat -a -e LLC-load --per-cluster -- sleep 5
Performance counter stats for 'system wide':
S56-D0-CLS158 4 1,321,521,570 LLC-load
S56-D0-CLS594 4 794,211,453 LLC-load
S56-D0-CLS1030 4 41,623 LLC-load
S56-D0-CLS1466 4 41,646 LLC-load
S56-D0-CLS1902 4 16,863 LLC-load
S56-D0-CLS2338 4 15,721 LLC-load
S56-D0-CLS2774 4 22,671 LLC-load
[...]
On a legacy system without clusters or cluster support, the output will
look like:
[root@localhost perf]# perf stat -a -e cycles --per-cluster -- sleep 1
Performance counter stats for 'system wide':
S56-D0-CLS0 64 18,011,485 cycles
S7182-D0-CLS0 64 16,548,835 cycles
Note that this patch doesn't mix the cluster information in the outputs
of --per-core to avoid breaking any tools/scripts using it.
Note that perf recently gained support for "--per-cache" aggregation, but it's not
the same as the cluster, although cluster CPUs may share some cache
resources. For example on my machine all clusters within a die share the
same L3 cache:
$ cat /sys/devices/system/cpu/cpu0/cache/index3/shared_cpu_list
0-31
$ cat /sys/devices/system/cpu/cpu0/topology/cluster_cpus_list
0-3
[1] commit c5e22feffd
("topology: Represent clusters of CPUs within a die")
Tested-by: Jie Zhan <zhanjie9@hisilicon.com>
Reviewed-by: Tim Chen <tim.c.chen@linux.intel.com>
Reviewed-by: Ian Rogers <irogers@google.com>
Signed-off-by: Yicong Yang <yangyicong@hisilicon.com>
Cc: james.clark@arm.com
Cc: 21cnbao@gmail.com
Cc: prime.zeng@hisilicon.com
Cc: Jonathan.Cameron@huawei.com
Cc: fanghao11@huawei.com
Cc: linuxarm@huawei.com
Cc: tim.c.chen@intel.com
Cc: linux-arm-kernel@lists.infradead.org
Signed-off-by: Namhyung Kim <namhyung@kernel.org>
Link: https://lore.kernel.org/r/20240208024026.2691-1-yangyicong@huawei.com
164 lines
5.8 KiB
C
164 lines
5.8 KiB
C
/* SPDX-License-Identifier: GPL-2.0 */
|
|
#ifndef __PERF_CPUMAP_H
|
|
#define __PERF_CPUMAP_H
|
|
|
|
#include <stdbool.h>
|
|
#include <stdio.h>
|
|
#include <perf/cpumap.h>
|
|
#include <linux/refcount.h>
|
|
|
|
/** Identify where counts are aggregated, -1 implies not to aggregate. */
|
|
struct aggr_cpu_id {
|
|
/** A value in the range 0 to number of threads. */
|
|
int thread_idx;
|
|
/** The numa node X as read from /sys/devices/system/node/nodeX. */
|
|
int node;
|
|
/**
|
|
* The socket number as read from
|
|
* /sys/devices/system/cpu/cpuX/topology/physical_package_id.
|
|
*/
|
|
int socket;
|
|
/** The die id as read from /sys/devices/system/cpu/cpuX/topology/die_id. */
|
|
int die;
|
|
/** The cluster id as read from /sys/devices/system/cpu/cpuX/topology/cluster_id */
|
|
int cluster;
|
|
/** The cache level as read from /sys/devices/system/cpu/cpuX/cache/indexY/level */
|
|
int cache_lvl;
|
|
/**
|
|
* The cache instance ID, which is the first CPU in the
|
|
* /sys/devices/system/cpu/cpuX/cache/indexY/shared_cpu_list
|
|
*/
|
|
int cache;
|
|
/** The core id as read from /sys/devices/system/cpu/cpuX/topology/core_id. */
|
|
int core;
|
|
/** CPU aggregation, note there is one CPU for each SMT thread. */
|
|
struct perf_cpu cpu;
|
|
};
|
|
|
|
/** A collection of aggr_cpu_id values, the "built" version is sorted and uniqued. */
|
|
struct cpu_aggr_map {
|
|
refcount_t refcnt;
|
|
/** Number of valid entries. */
|
|
int nr;
|
|
/** The entries. */
|
|
struct aggr_cpu_id map[];
|
|
};
|
|
|
|
/**
 * cpu_aggr_map__for_each_idx - Iterate idx over 0 .. nr - 1 of aggr_map.
 * @idx: An integer lvalue used as the loop counter.
 * @aggr_map: A pointer expression whose target has an 'nr' member. It is
 *            parenthesised so that arguments such as '&maps[i]' or 'base + i'
 *            bind correctly to '->', and it is re-evaluated on every
 *            iteration.
 */
#define cpu_aggr_map__for_each_idx(idx, aggr_map)			\
	for ((idx) = 0; (idx) < (aggr_map)->nr; (idx)++)
|
|
|
|
struct perf_record_cpu_map_data;
|
|
|
|
bool perf_record_cpu_map_data__test_bit(int i, const struct perf_record_cpu_map_data *data);
|
|
|
|
struct perf_cpu_map *perf_cpu_map__empty_new(int nr);
|
|
|
|
struct perf_cpu_map *cpu_map__new_data(const struct perf_record_cpu_map_data *data);
|
|
size_t cpu_map__snprint(struct perf_cpu_map *map, char *buf, size_t size);
|
|
size_t cpu_map__snprint_mask(struct perf_cpu_map *map, char *buf, size_t size);
|
|
size_t cpu_map__fprintf(struct perf_cpu_map *map, FILE *fp);
|
|
struct perf_cpu_map *cpu_map__online(void); /* thread unsafe */
|
|
|
|
int cpu__setup_cpunode_map(void);
|
|
|
|
int cpu__max_node(void);
|
|
struct perf_cpu cpu__max_cpu(void);
|
|
struct perf_cpu cpu__max_present_cpu(void);
|
|
|
|
/**
|
|
* cpu_map__is_dummy - Events associated with a pid, rather than a CPU, use a single dummy map with an entry of -1.
|
|
*/
|
|
static inline bool cpu_map__is_dummy(const struct perf_cpu_map *cpus)
|
|
{
|
|
return perf_cpu_map__nr(cpus) == 1 && perf_cpu_map__cpu(cpus, 0).cpu == -1;
|
|
}
|
|
|
|
/**
|
|
* cpu__get_node - Returns the numa node X as read from
|
|
* /sys/devices/system/node/nodeX for the given CPU.
|
|
*/
|
|
int cpu__get_node(struct perf_cpu cpu);
|
|
/**
|
|
* cpu__get_socket_id - Returns the socket number as read from
|
|
* /sys/devices/system/cpu/cpuX/topology/physical_package_id for the given CPU.
|
|
*/
|
|
int cpu__get_socket_id(struct perf_cpu cpu);
|
|
/**
|
|
* cpu__get_die_id - Returns the die id as read from
|
|
* /sys/devices/system/cpu/cpuX/topology/die_id for the given CPU.
|
|
*/
|
|
int cpu__get_die_id(struct perf_cpu cpu);
|
|
/**
|
|
* cpu__get_cluster_id - Returns the cluster id as read from
|
|
* /sys/devices/system/cpu/cpuX/topology/cluster_id for the given CPU.
|
|
*/
|
|
int cpu__get_cluster_id(struct perf_cpu cpu);
|
|
/**
|
|
* cpu__get_core_id - Returns the core id as read from
|
|
* /sys/devices/system/cpu/cpuX/topology/core_id for the given CPU.
|
|
*/
|
|
int cpu__get_core_id(struct perf_cpu cpu);
|
|
|
|
/**
|
|
* cpu_aggr_map__empty_new - Create a cpu_aggr_map of size nr with every entry
|
|
* being empty.
|
|
*/
|
|
struct cpu_aggr_map *cpu_aggr_map__empty_new(int nr);
|
|
|
|
typedef struct aggr_cpu_id (*aggr_cpu_id_get_t)(struct perf_cpu cpu, void *data);
|
|
|
|
/**
|
|
* cpu_aggr_map__new - Create a cpu_aggr_map with an aggr_cpu_id for each cpu in
|
|
* cpus. The aggr_cpu_id is created with 'get_id' that may have a data value
|
|
* passed to it. The cpu_aggr_map is sorted with duplicate values removed.
|
|
*/
|
|
struct cpu_aggr_map *cpu_aggr_map__new(const struct perf_cpu_map *cpus,
|
|
aggr_cpu_id_get_t get_id,
|
|
void *data, bool needs_sort);
|
|
|
|
bool aggr_cpu_id__equal(const struct aggr_cpu_id *a, const struct aggr_cpu_id *b);
|
|
bool aggr_cpu_id__is_empty(const struct aggr_cpu_id *a);
|
|
struct aggr_cpu_id aggr_cpu_id__empty(void);
|
|
|
|
|
|
/**
|
|
* aggr_cpu_id__socket - Create an aggr_cpu_id with the socket populated with
|
|
* the socket for cpu. The function signature is compatible with
|
|
* aggr_cpu_id_get_t.
|
|
*/
|
|
struct aggr_cpu_id aggr_cpu_id__socket(struct perf_cpu cpu, void *data);
|
|
/**
|
|
* aggr_cpu_id__die - Create an aggr_cpu_id with the die and socket populated
|
|
* with the die and socket for cpu. The function signature is compatible with
|
|
* aggr_cpu_id_get_t.
|
|
*/
|
|
struct aggr_cpu_id aggr_cpu_id__die(struct perf_cpu cpu, void *data);
|
|
/**
|
|
* aggr_cpu_id__cluster - Create an aggr_cpu_id with cluster, die and socket
|
|
* populated with the cluster, die and socket for cpu. The function signature
|
|
* is compatible with aggr_cpu_id_get_t.
|
|
*/
|
|
struct aggr_cpu_id aggr_cpu_id__cluster(struct perf_cpu cpu, void *data);
|
|
/**
|
|
* aggr_cpu_id__core - Create an aggr_cpu_id with the core, cluster, die and
|
|
* socket populated with the core, cluster, die and socket for cpu. The function
|
|
* signature is compatible with aggr_cpu_id_get_t.
|
|
*/
|
|
struct aggr_cpu_id aggr_cpu_id__core(struct perf_cpu cpu, void *data);
|
|
/**
|
|
* aggr_cpu_id__cpu - Create an aggr_cpu_id with the cpu, core, die and socket
|
|
* populated with the cpu, core, die and socket for cpu. The function signature
|
|
* is compatible with aggr_cpu_id_get_t.
|
|
*/
|
|
struct aggr_cpu_id aggr_cpu_id__cpu(struct perf_cpu cpu, void *data);
|
|
/**
|
|
* aggr_cpu_id__node - Create an aggr_cpu_id with the numa node populated for
|
|
* cpu. The function signature is compatible with aggr_cpu_id_get_t.
|
|
*/
|
|
struct aggr_cpu_id aggr_cpu_id__node(struct perf_cpu cpu, void *data);
|
|
/**
|
|
* aggr_cpu_id__global - Create an aggr_cpu_id for global aggregation.
|
|
* The function signature is compatible with aggr_cpu_id_get_t.
|
|
*/
|
|
struct aggr_cpu_id aggr_cpu_id__global(struct perf_cpu cpu, void *data);
|
|
#endif /* __PERF_CPUMAP_H */
|