winsys/amdgpu: fwm packet pre-emption for gfx 11.5

gfx 11.5 uses f32 firmware. f32 firmware requires a COND_EXEC
packet to flush the ring buffer when pre-emption occurs.

Reviewed-by: Marek Olšák <marek.olsak@amd.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/36700>
Author: Yogesh Mohan Marimuthu, 2025-07-18 11:01:55 +05:30, committed by Marge Bot
parent 37c7d19e46
commit 9beb668d8d
4 changed files with 76 additions and 10 deletions
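For orientation before the per-file hunks, here is a condensed sketch of the mechanism this commit adds. Only the helper names (amdgpu_pkt_begin, amdgpu_pkt_add_dw, amdgpu_pkt_get_ptr_skip_dw, amdgpu_pkt_get_next_wptr) and the defines (PKT3_COND_EXEC, COND_EXEC_USERQ_OVERRULE_CMD) are taken from the patch; the surrounding flow is simplified and is not the literal code:

uint32_t *count_dw_ptr;
uint64_t start_wptr;

amdgpu_pkt_begin();

/* 1) Emit a COND_EXEC packet with a zeroed payload; the exec count is not known yet. */
amdgpu_pkt_add_dw(PKT3(PKT3_COND_EXEC, 3, 0));
amdgpu_pkt_add_dw(0);
amdgpu_pkt_add_dw(0);
amdgpu_pkt_add_dw(0);

/* 2) Reserve the exec-count dword in the ring and remember where the packet ends. */
count_dw_ptr = amdgpu_pkt_get_ptr_skip_dw();
start_wptr = amdgpu_pkt_get_next_wptr();

/* ... emit FENCE_WAIT_MULTI waits, the IBs, fence updates and PROTECTED_FENCE_SIGNAL ... */

/* 3) Patch the skip count: the number of dwords between the COND_EXEC packet and the end of
 * the job, plus the user-queue overrule flag. When the f32 firmware pre-empts the queue, it
 * uses this count to flush (skip) the remaining packets of the job. */
*count_dw_ptr = (amdgpu_pkt_get_next_wptr() - start_wptr) | COND_EXEC_USERQ_OVERRULE_CMD;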


@@ -70,6 +70,7 @@
#define PREDICATION_OP_BOOL32 0x4
#define PREDICATION_CONTINUE (1 << 31)
#define PKT3_COND_EXEC 0x22
#define COND_EXEC_USERQ_OVERRULE_CMD (1 << 31)
#define PKT3_PRED_EXEC 0x23
#define PKT3_DRAW_INDIRECT 0x24
#define PKT3_DRAW_INDEX_INDIRECT 0x25
@@ -111,6 +112,9 @@
#define STRMOUT_SELECT_BUFFER(x) (((unsigned)(x)&0x3) << 8)
#define PKT3_DRAW_INDEX_OFFSET_2 0x35
#define PKT3_WRITE_DATA 0x37
#define WRITE_DATA_DST_SEL(x) (((unsigned)(x)&0xf) << 8)
#define WRITE_DATA_WR_CONFIRM (1 << 20)
#define WRITE_DATA_CACHE_POLICY(x) (x << 25)
#define PKT3_DRAW_INDEX_INDIRECT_MULTI 0x38
#define PKT3_MEM_SEMAPHORE 0x39
#define PKT3_MPEG_INDEX 0x3A /* GFX6 only */
@@ -250,6 +254,8 @@
#define PKT3_INCREMENT_CE_COUNTER 0x84
#define PKT3_INCREMENT_DE_COUNTER 0x85
#define PKT3_WAIT_ON_CE_COUNTER 0x86
#define PKT3_FRAME_CONTROL 0x90
#define S_FRAME_CONTROL_CMD(x) ((x) << 28)
#define PKT3_HDP_FLUSH 0x95
#define PKT3_SET_SH_REG_INDEX 0x9B
#define PKT3_LOAD_CONTEXT_REG_INDEX 0x9F /* GFX8+ */
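This hunk only adds defines. As a usage illustration, here is a hedged sketch of how the new WRITE_DATA field helpers might be combined to write one dword to GPU memory; the packet layout and the dst-sel value (5 = memory) follow common PM4 usage rather than anything shown in this commit, and va/value are hypothetical variables:

amdgpu_pkt_add_dw(PKT3(PKT3_WRITE_DATA, 3, 0));                    /* count = payload dwords - 1 */
amdgpu_pkt_add_dw(WRITE_DATA_DST_SEL(5) | WRITE_DATA_WR_CONFIRM);  /* dst-sel 5 = memory (assumed) */
amdgpu_pkt_add_dw(va);         /* destination GPU address, low 32 bits */
amdgpu_pkt_add_dw(va >> 32);   /* destination GPU address, high 32 bits */
amdgpu_pkt_add_dw(value);      /* the dword to write */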


@@ -1408,6 +1408,11 @@ static int amdgpu_cs_submit_ib_kernelq(struct amdgpu_cs *acs,
return r;
}
struct cond_exec_skip_count {
uint32_t *count_dw_ptr;
uint64_t start_wptr;
};
static void amdgpu_cs_add_userq_packets(struct amdgpu_winsys *aws,
struct amdgpu_userq *userq,
struct amdgpu_cs_context *csc,
@@ -1417,13 +1422,31 @@ static void amdgpu_cs_add_userq_packets(struct amdgpu_winsys *aws,
amdgpu_pkt_begin();
if (userq->ip_type == AMD_IP_GFX || userq->ip_type == AMD_IP_COMPUTE) {
struct cond_exec_skip_count *cond_exec_skip_counts = NULL;
if (csc->aws->info.gfx_level == GFX11_5 && userq->ip_type == AMD_IP_GFX) {
/* Index 0 holds the skip count for skipping the entire job. The remaining entries hold
* the skip counts used to jump to the end of the job when pre-emption happens during a
* FENCE_WAIT_MULTI packet.
*/
cond_exec_skip_counts = (struct cond_exec_skip_count*)alloca(
sizeof(struct cond_exec_skip_count) * (1 + DIV_ROUND_UP(num_fences, 4)));
amdgpu_pkt_add_dw(PKT3(PKT3_COND_EXEC, 3, 0));
amdgpu_pkt_add_dw(0);
amdgpu_pkt_add_dw(0);
amdgpu_pkt_add_dw(0);
cond_exec_skip_counts[0].count_dw_ptr = amdgpu_pkt_get_ptr_skip_dw();
cond_exec_skip_counts[0].start_wptr = amdgpu_pkt_get_next_wptr();
}
if (num_fences) {
unsigned max_num_fences_fwm;
unsigned num_fences_in_iter;
if (csc->aws->info.has_dedicated_vram || csc->aws->info.gfx_level >= GFX12)
max_num_fences_fwm = 32;
else
max_num_fences_fwm = 4;
for (unsigned i = 0; i < num_fences; i = i + max_num_fences_fwm) {
num_fences_in_iter = (i + max_num_fences_fwm > num_fences) ?
num_fences - i : max_num_fences_fwm;
@@ -1435,6 +1458,15 @@ static void amdgpu_cs_add_userq_packets(struct amdgpu_winsys *aws,
amdgpu_pkt_add_dw(fence_info[i + j].value);
amdgpu_pkt_add_dw(fence_info[i + j].value >> 32);
}
if (csc->aws->info.gfx_level == GFX11_5 && userq->ip_type == AMD_IP_GFX) {
amdgpu_pkt_add_dw(PKT3(PKT3_COND_EXEC, 3, 0));
amdgpu_pkt_add_dw(0);
amdgpu_pkt_add_dw(0);
amdgpu_pkt_add_dw(0);
cond_exec_skip_counts[1 + i / max_num_fences_fwm].count_dw_ptr = amdgpu_pkt_get_ptr_skip_dw();
cond_exec_skip_counts[1 + i / max_num_fences_fwm].start_wptr = amdgpu_pkt_get_next_wptr();
}
}
}
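To make the batching and skip-count bookkeeping above concrete, a short worked example using only what the hunks already show:

/* Worked example (GFX11.5 APU, num_fences = 10, so max_num_fences_fwm = 4):
 *   fence loop iterations: i = 0, 4, 8 -> three FENCE_WAIT_MULTI packets of 4, 4 and 2 fences
 *   cond_exec_skip_counts: 1 + DIV_ROUND_UP(10, 4) = 4 entries; entry 0 skips the entire job,
 *   the remaining entries skip from their FENCE_WAIT_MULTI packet to the end of the job.
 *   Every entry is later patched to (end wptr - start wptr) | COND_EXEC_USERQ_OVERRULE_CMD. */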
@@ -1469,11 +1501,31 @@ static void amdgpu_cs_add_userq_packets(struct amdgpu_winsys *aws,
amdgpu_pkt_add_dw(userq->user_fence_seq_num >> 32);
amdgpu_pkt_add_dw(0);
/* protected signal packet. This is trusted RELEASE_MEM packet. i.e. fence buffer
* is only accessible from kernel through VMID 0.
/* protected signal packet. This is a trusted RELEASE_MEM packet.
*
* The kernel allocates the memory for the protected fence and passes the protected fence
* address in the MQD (memory queue descriptor, where the static and dynamic queue states
* are stored). This fence memory is mapped as writable only for VMID 0. When the job
* completes, this packet writes the ring buffer's monotonic (non-wrapping) read pointer
* value to the fence address passed in the MQD.
*
* The protected fence memory is mapped read-only for the user VMID. The
* DRM_AMDGPU_USERQ_WAIT ioctl returns the read-only fence memory address along with the
* protected fence sequence number to wait on, both of which are used in the
* FENCE_WAIT_MULTI packet.
*
* The PKT3_PROTECTED_FENCE_SIGNAL packet should be the last packet before ringing the
* doorbell so that the mesa user fence sequence number matches the protected fence
* sequence number, which is helpful when debugging.
*/
amdgpu_pkt_add_dw(PKT3(PKT3_PROTECTED_FENCE_SIGNAL, 0, 0));
amdgpu_pkt_add_dw(0);
if (csc->aws->info.gfx_level == GFX11_5 && userq->ip_type == AMD_IP_GFX) {
for (unsigned i = 0; i < 1 + DIV_ROUND_UP(num_fences, 4); i++)
*cond_exec_skip_counts[i].count_dw_ptr = (amdgpu_pkt_get_next_wptr() -
cond_exec_skip_counts[i].start_wptr) |
COND_EXEC_USERQ_OVERRULE_CMD;
}
} else {
mesa_loge("amdgpu: unsupported userq ip submission = %d\n", userq->ip_type);
}
@@ -1496,7 +1548,7 @@ static int amdgpu_cs_submit_ib_userq(struct amdgpu_userq *userq,
/* Syncobj dependencies. */
unsigned num_syncobj_dependencies = csc->syncobj_dependencies.num;
uint32_t *syncobj_dependencies_list =
(uint32_t*)alloca(num_syncobj_dependencies * sizeof(uint32_t));
/* Currently only 1 vm timeline syncobj can be a dependency. */


@@ -58,14 +58,16 @@ amdgpu_userq_ring_init(struct amdgpu_winsys *aws, struct amdgpu_userq *userq,
*userq->wptr_bo_map = 0;
userq->next_wptr = 0;
userq->rptr_bo = amdgpu_bo_create(aws, aws->info.gart_page_size, 256, RADEON_DOMAIN_VRAM,
/* Allocate memory for rptr. */
userq->vram_bo = amdgpu_bo_create(aws, aws->info.gart_page_size, 256, RADEON_DOMAIN_VRAM,
RADEON_FLAG_CLEAR_VRAM | RADEON_FLAG_GL2_BYPASS |
RADEON_FLAG_NO_SUBALLOC |
RADEON_FLAG_NO_INTERPROCESS_SHARING);
if (!userq->rptr_bo)
if (!userq->vram_bo)
return false;
update_vm_timeline_point_to_wait(vm_timeline_point_to_wait, userq->rptr_bo);
update_vm_timeline_point_to_wait(vm_timeline_point_to_wait, userq->vram_bo);
userq->rptr_va = amdgpu_bo_get_va(userq->vram_bo);
return true;
}
@@ -77,7 +79,7 @@ amdgpu_userq_deinit(struct amdgpu_winsys *aws, struct amdgpu_userq *userq)
radeon_bo_reference(&aws->dummy_sws.base, &userq->gtt_bo, NULL);
radeon_bo_reference(&aws->dummy_sws.base, &userq->wptr_bo, NULL);
radeon_bo_reference(&aws->dummy_sws.base, &userq->rptr_bo, NULL);
radeon_bo_reference(&aws->dummy_sws.base, &userq->vram_bo, NULL);
radeon_bo_reference(&aws->dummy_sws.base, &userq->doorbell_bo, NULL);
switch (userq->ip_type) {
@@ -206,8 +208,8 @@ amdgpu_userq_init(struct amdgpu_winsys *aws, struct amdgpu_userq *userq, enum am
r = ac_drm_create_userqueue(aws->dev, hw_ip_type,
get_real_bo(amdgpu_winsys_bo(userq->doorbell_bo))->kms_handle,
AMDGPU_USERQ_DOORBELL_INDEX, ring_va, AMDGPU_USERQ_RING_SIZE,
amdgpu_bo_get_va(userq->wptr_bo), amdgpu_bo_get_va(userq->rptr_bo),
mqd, priority, &userq->userq_handle);
amdgpu_bo_get_va(userq->wptr_bo), userq->rptr_va, mqd, priority,
&userq->userq_handle);
if (r == -EACCES && priority == AMDGPU_USERQ_CREATE_FLAGS_QUEUE_PRIORITY_HIGH) {
/* Try again with a lower priority. */
priority = AMDGPU_USERQ_CREATE_FLAGS_QUEUE_PRIORITY_NORMAL_HIGH;


@@ -32,6 +32,11 @@ extern "C" {
userq->next_wptr = __next_wptr; \
} while (0)
#define amdgpu_pkt_get_ptr_skip_dw() \
(__ring_ptr + (__next_wptr++ & AMDGPU_USERQ_RING_SIZE_DW_MASK))
#define amdgpu_pkt_get_next_wptr() __next_wptr
struct amdgpu_winsys;
struct amdgpu_screen_winsys;
@@ -68,7 +73,8 @@ struct amdgpu_userq {
* (this avoids writing multiple times to the door bell for the same
* submission) */
uint64_t next_wptr;
struct pb_buffer_lean *rptr_bo;
struct pb_buffer_lean *vram_bo;
uint64_t rptr_va;
struct pb_buffer_lean *doorbell_bo;
uint64_t *doorbell_bo_map;
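The two helpers added to this header are what enable the deferred patching used in amdgpu_cs_add_userq_packets(). A brief usage sketch, assuming the __ring_ptr/__next_wptr environment that the existing amdgpu_pkt_* macros in this header establish; the variable names are illustrative:

/* Reserve one dword in the ring without writing it yet. __next_wptr increases monotonically,
 * so the ring offset is (__next_wptr & AMDGPU_USERQ_RING_SIZE_DW_MASK); the returned CPU
 * pointer can be written later, as long as the ring has not wrapped over that slot. */
uint32_t *patch_ptr = amdgpu_pkt_get_ptr_skip_dw();
uint64_t start = amdgpu_pkt_get_next_wptr();   /* wptr just past the reserved dword */

/* ... emit more packets ... */

/* Fill in the reserved dword once the distance to the end of the job is known. */
*patch_ptr = (amdgpu_pkt_get_next_wptr() - start) | COND_EXEC_USERQ_OVERRULE_CMD;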