drm/amdkfd: Update logic for CU occupancy calculations

Currently, the code uses the IH_VMID_X_LUT register to map
a queue's VMID to the corresponding PASID. This logic is racy
since CP can update the VMID-PASID mapping at any time, especially
when there are more processes than VMIDs. Update the
logic to calculate CU occupancy by matching the doorbell offset of
each queue that has a non-zero wave count against the process's queues.

Signed-off-by: Mukul Joshi <mukul.joshi@amd.com>
Reviewed-by: Harish Kasiviswanathan <Harish.Kasiviswanathan@amd.com>
Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
This commit is contained in:
Mukul Joshi 2024-09-16 14:33:58 -04:00 committed by Alex Deucher
parent e1d27f7a9c
commit 6ae9e1aba9
6 changed files with 87 additions and 63 deletions

View file

@ -950,28 +950,30 @@ static void unlock_spi_csq_mutexes(struct amdgpu_device *adev)
* @inst: xcc's instance number on a multi-XCC setup
*/
static void get_wave_count(struct amdgpu_device *adev, int queue_idx,
int *wave_cnt, int *vmid, uint32_t inst)
struct kfd_cu_occupancy *queue_cnt, uint32_t inst)
{
int pipe_idx;
int queue_slot;
unsigned int reg_val;
unsigned int wave_cnt;
/*
* Program GRBM with appropriate MEID, PIPEID, QUEUEID and VMID
* parameters to read out waves in flight. Get the doorbell offset of
* the queue if there are non-zero waves in flight.
*/
*vmid = 0xFF;
*wave_cnt = 0;
pipe_idx = queue_idx / adev->gfx.mec.num_queue_per_pipe;
queue_slot = queue_idx % adev->gfx.mec.num_queue_per_pipe;
soc15_grbm_select(adev, 1, pipe_idx, queue_slot, 0, inst);
reg_val = RREG32_SOC15_IP(GC, SOC15_REG_OFFSET(GC, inst, mmSPI_CSQ_WF_ACTIVE_COUNT_0) +
queue_slot);
*wave_cnt = reg_val & SPI_CSQ_WF_ACTIVE_COUNT_0__COUNT_MASK;
if (*wave_cnt != 0)
*vmid = (RREG32_SOC15(GC, inst, mmCP_HQD_VMID) &
CP_HQD_VMID__VMID_MASK) >> CP_HQD_VMID__VMID__SHIFT;
reg_val = RREG32_SOC15_IP(GC, SOC15_REG_OFFSET(GC, inst,
mmSPI_CSQ_WF_ACTIVE_COUNT_0) + queue_slot);
wave_cnt = reg_val & SPI_CSQ_WF_ACTIVE_COUNT_0__COUNT_MASK;
if (wave_cnt != 0) {
queue_cnt->wave_cnt += wave_cnt;
queue_cnt->doorbell_off =
(RREG32_SOC15(GC, inst, mmCP_HQD_PQ_DOORBELL_CONTROL) &
CP_HQD_PQ_DOORBELL_CONTROL__DOORBELL_OFFSET_MASK) >>
CP_HQD_PQ_DOORBELL_CONTROL__DOORBELL_OFFSET__SHIFT;
}
}
/**
@ -981,9 +983,8 @@ static void get_wave_count(struct amdgpu_device *adev, int queue_idx,
* or more queues running and submitting waves to compute units.
*
* @adev: Handle of device from which to get number of waves in flight
* @pasid: Identifies the process for which this query call is invoked
* @pasid_wave_cnt: Output parameter updated with number of waves in flight that
* belong to process with given pasid
* @cu_occupancy: Array that gets filled with wave_cnt and doorbell offset
* for comparison later.
* @max_waves_per_cu: Output parameter updated with maximum number of waves
* possible per Compute Unit
* @inst: xcc's instance number on a multi-XCC setup
@ -1011,30 +1012,24 @@ static void get_wave_count(struct amdgpu_device *adev, int queue_idx,
* number of waves that are in flight for the queue at specified index. The
* index ranges from 0 to 7.
*
* If non-zero waves are in flight, read CP_HQD_VMID register to obtain VMID
* of the wave(s).
* If non-zero waves are in flight, store the corresponding doorbell offset
* of the queue, along with the wave count.
*
* Determine if VMID from above step maps to pasid provided as parameter. If
* it matches, aggregate the wave count. That the VMID will not match pasid is
* a normal condition, i.e. a device is expected to support multiple queues
* from multiple processes.
* Determine if the queue belongs to the process by comparing the doorbell
* offset against the process's queues. If it matches, aggregate the wave
* count for the process.
*
* Reading registers referenced above involves programming GRBM appropriately
*/
void kgd_gfx_v9_get_cu_occupancy(struct amdgpu_device *adev, int pasid,
int *pasid_wave_cnt, int *max_waves_per_cu, uint32_t inst)
void kgd_gfx_v9_get_cu_occupancy(struct amdgpu_device *adev,
struct kfd_cu_occupancy *cu_occupancy,
int *max_waves_per_cu, uint32_t inst)
{
int qidx;
int vmid;
int se_idx;
int sh_idx;
int se_cnt;
int sh_cnt;
int wave_cnt;
int queue_map;
int pasid_tmp;
int max_queue_cnt;
int vmid_wave_cnt = 0;
DECLARE_BITMAP(cp_queue_bitmap, AMDGPU_MAX_QUEUES);
lock_spi_csq_mutexes(adev);
@ -1048,42 +1043,30 @@ void kgd_gfx_v9_get_cu_occupancy(struct amdgpu_device *adev, int pasid,
AMDGPU_MAX_QUEUES);
max_queue_cnt = adev->gfx.mec.num_pipe_per_mec *
adev->gfx.mec.num_queue_per_pipe;
sh_cnt = adev->gfx.config.max_sh_per_se;
se_cnt = adev->gfx.config.max_shader_engines;
for (se_idx = 0; se_idx < se_cnt; se_idx++) {
for (sh_idx = 0; sh_idx < sh_cnt; sh_idx++) {
amdgpu_gfx_select_se_sh(adev, se_idx, 0, 0xffffffff, inst);
queue_map = RREG32_SOC15(GC, inst, mmSPI_CSQ_WF_ACTIVE_STATUS);
amdgpu_gfx_select_se_sh(adev, se_idx, sh_idx, 0xffffffff, inst);
queue_map = RREG32_SOC15(GC, inst, mmSPI_CSQ_WF_ACTIVE_STATUS);
/*
* Assumption: queue map encodes following schema: four
* pipes per each micro-engine, with each pipe mapping
* eight queues. This schema is true for GFX9 devices
* and must be verified for newer device families
/*
* Assumption: queue map encodes following schema: four
* pipes per each micro-engine, with each pipe mapping
* eight queues. This schema is true for GFX9 devices
* and must be verified for newer device families
*/
for (qidx = 0; qidx < max_queue_cnt; qidx++) {
/* Skip queues that are not associated with
* compute functions
*/
for (qidx = 0; qidx < max_queue_cnt; qidx++) {
if (!test_bit(qidx, cp_queue_bitmap))
continue;
/* Skip queues that are not associated with
* compute functions
*/
if (!test_bit(qidx, cp_queue_bitmap))
continue;
if (!(queue_map & (1 << qidx)))
continue;
if (!(queue_map & (1 << qidx)))
continue;
/* Get number of waves in flight and aggregate them */
get_wave_count(adev, qidx, &wave_cnt, &vmid,
inst);
if (wave_cnt != 0) {
pasid_tmp =
RREG32(SOC15_REG_OFFSET(OSSSYS, inst,
mmIH_VMID_0_LUT) + vmid);
if (pasid_tmp == pasid)
vmid_wave_cnt += wave_cnt;
}
}
/* Get number of waves in flight and aggregate them */
get_wave_count(adev, qidx, &cu_occupancy[qidx],
inst);
}
}
@ -1092,7 +1075,6 @@ void kgd_gfx_v9_get_cu_occupancy(struct amdgpu_device *adev, int pasid,
unlock_spi_csq_mutexes(adev);
/* Update the output parameters and return */
*pasid_wave_cnt = vmid_wave_cnt;
*max_waves_per_cu = adev->gfx.cu_info.simd_per_cu *
adev->gfx.cu_info.max_waves_per_simd;
}

View file

@ -52,8 +52,9 @@ bool kgd_gfx_v9_get_atc_vmid_pasid_mapping_info(struct amdgpu_device *adev,
uint8_t vmid, uint16_t *p_pasid);
void kgd_gfx_v9_set_vm_context_page_table_base(struct amdgpu_device *adev,
uint32_t vmid, uint64_t page_table_base);
void kgd_gfx_v9_get_cu_occupancy(struct amdgpu_device *adev, int pasid,
int *pasid_wave_cnt, int *max_waves_per_cu, uint32_t inst);
void kgd_gfx_v9_get_cu_occupancy(struct amdgpu_device *adev,
struct kfd_cu_occupancy *cu_occupancy,
int *max_waves_per_cu, uint32_t inst);
void kgd_gfx_v9_program_trap_handler_settings(struct amdgpu_device *adev,
uint32_t vmid, uint64_t tba_addr, uint64_t tma_addr,
uint32_t inst);

View file

@ -3540,6 +3540,26 @@ int debug_refresh_runlist(struct device_queue_manager *dqm)
return debug_map_and_unlock(dqm);
}
/**
 * kfd_dqm_is_queue_in_process - check whether a doorbell offset matches one
 * of a process's queues
 * @dqm: device queue manager; locked via dqm_lock() for the duration of the
 *       search
 * @qpd: per-process device data whose queues_list is searched
 * @doorbell_off: doorbell offset to look for
 *
 * Walks @qpd->queues_list and compares each queue's properties.doorbell_off
 * against @doorbell_off, stopping at the first match.
 *
 * Return: true if a queue with a matching doorbell offset is found,
 * false otherwise.
 */
bool kfd_dqm_is_queue_in_process(struct device_queue_manager *dqm,
				 struct qcm_process_device *qpd,
				 int doorbell_off)
{
	struct queue *cur;
	bool found = false;

	dqm_lock(dqm);

	list_for_each_entry(cur, &qpd->queues_list, list) {
		if (cur->properties.doorbell_off != doorbell_off)
			continue;

		/* First match is enough; leave the loop and drop the lock. */
		found = true;
		break;
	}

	dqm_unlock(dqm);

	return found;
}
#if defined(CONFIG_DEBUG_FS)
static void seq_reg_dump(struct seq_file *m,

View file

@ -324,6 +324,9 @@ void set_queue_snapshot_entry(struct queue *q,
int debug_lock_and_unmap(struct device_queue_manager *dqm);
int debug_map_and_unlock(struct device_queue_manager *dqm);
int debug_refresh_runlist(struct device_queue_manager *dqm);
bool kfd_dqm_is_queue_in_process(struct device_queue_manager *dqm,
struct qcm_process_device *qpd,
int doorbell_off);
static inline unsigned int get_sh_mem_bases_32(struct kfd_process_device *pdd)
{

View file

@ -270,6 +270,10 @@ static int kfd_get_cu_occupancy(struct attribute *attr, char *buffer)
struct kfd_node *dev = NULL;
struct kfd_process *proc = NULL;
struct kfd_process_device *pdd = NULL;
int i;
struct kfd_cu_occupancy cu_occupancy[AMDGPU_MAX_QUEUES];
memset(cu_occupancy, 0x0, sizeof(cu_occupancy));
pdd = container_of(attr, struct kfd_process_device, attr_cu_occupancy);
dev = pdd->dev;
@ -287,9 +291,17 @@ static int kfd_get_cu_occupancy(struct attribute *attr, char *buffer)
/* Collect wave count from device if it supports */
wave_cnt = 0;
max_waves_per_cu = 0;
dev->kfd2kgd->get_cu_occupancy(dev->adev, proc->pasid, &wave_cnt,
dev->kfd2kgd->get_cu_occupancy(dev->adev, cu_occupancy,
&max_waves_per_cu, 0);
for (i = 0; i < AMDGPU_MAX_QUEUES; i++) {
if (cu_occupancy[i].wave_cnt != 0 &&
kfd_dqm_is_queue_in_process(dev->dqm, &pdd->qpd,
cu_occupancy[i].doorbell_off))
wave_cnt += cu_occupancy[i].wave_cnt;
}
/* Translate wave count to number of compute units */
cu_cnt = (wave_cnt + (max_waves_per_cu - 1)) / max_waves_per_cu;
return snprintf(buffer, PAGE_SIZE, "%d\n", cu_cnt);

View file

@ -71,6 +71,11 @@ enum kgd_memory_pool {
KGD_POOL_FRAMEBUFFER = 3,
};
/**
 * struct kfd_cu_occupancy - wave occupancy sample for one hardware queue
 * @wave_cnt: number of waves in flight accumulated for the queue; zero means
 *            no active waves were seen and @doorbell_off was not updated
 * @doorbell_off: doorbell offset read from the queue's
 *                CP_HQD_PQ_DOORBELL_CONTROL register; compared against the
 *                doorbell offsets of a process's queues to attribute the
 *                waves to that process
 */
struct kfd_cu_occupancy {
u32 wave_cnt;
u32 doorbell_off;
};
/**
* enum kfd_sched_policy
*
@ -313,8 +318,9 @@ struct kfd2kgd_calls {
uint32_t grace_period,
uint32_t *reg_offset,
uint32_t *reg_data);
void (*get_cu_occupancy)(struct amdgpu_device *adev, int pasid,
int *wave_cnt, int *max_waves_per_cu, uint32_t inst);
void (*get_cu_occupancy)(struct amdgpu_device *adev,
struct kfd_cu_occupancy *cu_occupancy,
int *max_waves_per_cu, uint32_t inst);
void (*program_trap_handler_settings)(struct amdgpu_device *adev,
uint32_t vmid, uint64_t tba_addr, uint64_t tma_addr,
uint32_t inst);