From 9beb668d8d26ca054cbbe1c7554a87d502809c2d Mon Sep 17 00:00:00 2001 From: Yogesh Mohan Marimuthu Date: Fri, 18 Jul 2025 11:01:55 +0530 Subject: [PATCH] winsys/amdgpu: fwm packet pre-emption for gfx 11.5 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit gfx 11.5 uses f32 firmware. f32 firmware requires COND_EXEC packet to flush the ring buffer when pre-emption occurred. Reviewed-by: Marek Olšák Part-of: --- src/amd/common/sid.h | 6 ++ src/gallium/winsys/amdgpu/drm/amdgpu_cs.cpp | 58 +++++++++++++++++++- src/gallium/winsys/amdgpu/drm/amdgpu_userq.c | 14 +++-- src/gallium/winsys/amdgpu/drm/amdgpu_userq.h | 8 ++- 4 files changed, 76 insertions(+), 10 deletions(-) diff --git a/src/amd/common/sid.h b/src/amd/common/sid.h index 8b25ad7cb84..64c4406c44f 100644 --- a/src/amd/common/sid.h +++ b/src/amd/common/sid.h @@ -70,6 +70,7 @@ #define PREDICATION_OP_BOOL32 0x4 #define PREDICATION_CONTINUE (1 << 31) #define PKT3_COND_EXEC 0x22 +#define COND_EXEC_USERQ_OVERRULE_CMD (1 << 31) #define PKT3_PRED_EXEC 0x23 #define PKT3_DRAW_INDIRECT 0x24 #define PKT3_DRAW_INDEX_INDIRECT 0x25 @@ -111,6 +112,9 @@ #define STRMOUT_SELECT_BUFFER(x) (((unsigned)(x)&0x3) << 8) #define PKT3_DRAW_INDEX_OFFSET_2 0x35 #define PKT3_WRITE_DATA 0x37 +#define WRITE_DATA_DST_SEL(x) (((unsigned)(x)&0xf) << 8) +#define WRITE_DATA_WR_CONFIRM (1 << 20) +#define WRITE_DATA_CACHE_POLICY(x) (x << 25) #define PKT3_DRAW_INDEX_INDIRECT_MULTI 0x38 #define PKT3_MEM_SEMAPHORE 0x39 #define PKT3_MPEG_INDEX 0x3A /* GFX6 only */ @@ -250,6 +254,8 @@ #define PKT3_INCREMENT_CE_COUNTER 0x84 #define PKT3_INCREMENT_DE_COUNTER 0x85 #define PKT3_WAIT_ON_CE_COUNTER 0x86 +#define PKT3_FRAME_CONTROL 0x90 +#define S_FRAME_CONTROL_CMD(x) ((x) << 28) #define PKT3_HDP_FLUSH 0x95 #define PKT3_SET_SH_REG_INDEX 0x9B #define PKT3_LOAD_CONTEXT_REG_INDEX 0x9F /* GFX8+ */ diff --git a/src/gallium/winsys/amdgpu/drm/amdgpu_cs.cpp b/src/gallium/winsys/amdgpu/drm/amdgpu_cs.cpp index a02a7a8fc67..f2a3238d960 
100644 --- a/src/gallium/winsys/amdgpu/drm/amdgpu_cs.cpp +++ b/src/gallium/winsys/amdgpu/drm/amdgpu_cs.cpp @@ -1408,6 +1408,11 @@ static int amdgpu_cs_submit_ib_kernelq(struct amdgpu_cs *acs, return r; } +struct cond_exec_skip_count { + uint32_t *count_dw_ptr; + uint64_t start_wptr; +}; + static void amdgpu_cs_add_userq_packets(struct amdgpu_winsys *aws, struct amdgpu_userq *userq, struct amdgpu_cs_context *csc, @@ -1417,13 +1422,31 @@ static void amdgpu_cs_add_userq_packets(struct amdgpu_winsys *aws, amdgpu_pkt_begin(); if (userq->ip_type == AMD_IP_GFX || userq->ip_type == AMD_IP_COMPUTE) { + struct cond_exec_skip_count *cond_exec_skip_counts = NULL; + + if (csc->aws->info.gfx_level == GFX11_5 && userq->ip_type == AMD_IP_GFX) { + /* index 0 holds skip count for skipping the entire job. Rest for FENCE_WAIT_MULTI + * packet pre-emption going to end of the job. + */ + cond_exec_skip_counts = (struct cond_exec_skip_count*)alloca( + sizeof(struct cond_exec_skip_count) * (1 + DIV_ROUND_UP(num_fences, 4))); + amdgpu_pkt_add_dw(PKT3(PKT3_COND_EXEC, 3, 0)); + amdgpu_pkt_add_dw(0); + amdgpu_pkt_add_dw(0); + amdgpu_pkt_add_dw(0); + cond_exec_skip_counts[0].count_dw_ptr = amdgpu_pkt_get_ptr_skip_dw(); + cond_exec_skip_counts[0].start_wptr = amdgpu_pkt_get_next_wptr(); + } + if (num_fences) { unsigned max_num_fences_fwm; unsigned num_fences_in_iter; + if (csc->aws->info.has_dedicated_vram || csc->aws->info.gfx_level >= GFX12) max_num_fences_fwm = 32; else max_num_fences_fwm = 4; + for (unsigned i = 0; i < num_fences; i = i + max_num_fences_fwm) { num_fences_in_iter = (i + max_num_fences_fwm > num_fences) ? 
num_fences - i : max_num_fences_fwm; @@ -1435,6 +1458,15 @@ static void amdgpu_cs_add_userq_packets(struct amdgpu_winsys *aws, amdgpu_pkt_add_dw(fence_info[i + j].value); amdgpu_pkt_add_dw(fence_info[i + j].value >> 32); } + + if (csc->aws->info.gfx_level == GFX11_5 && userq->ip_type == AMD_IP_GFX) { + amdgpu_pkt_add_dw(PKT3(PKT3_COND_EXEC, 3, 0)); + amdgpu_pkt_add_dw(0); + amdgpu_pkt_add_dw(0); + amdgpu_pkt_add_dw(0); + cond_exec_skip_counts[1 + i].count_dw_ptr = amdgpu_pkt_get_ptr_skip_dw(); + cond_exec_skip_counts[1 + i].start_wptr = amdgpu_pkt_get_next_wptr(); + } } } @@ -1469,11 +1501,31 @@ static void amdgpu_cs_add_userq_packets(struct amdgpu_winsys *aws, amdgpu_pkt_add_dw(userq->user_fence_seq_num >> 32); amdgpu_pkt_add_dw(0); - /* protected signal packet. This is trusted RELEASE_MEM packet. i.e. fence buffer - * is only accessible from kernel through VMID 0. + /* protected signal packet. This is trusted RELEASE_MEM packet. + * + * Kernel allocates the memory for the protected fence and passes the protected fence address + * in MQD (memory queue descriptor - where static and dynamic queue states are stored). This + * fence memory is mapped as write, only for VMID 0. This packet writes the ring buffer + * monotonic (non-wrapping) read pointer value to the fence address passed in MQD when the + * job is completed. + * + * The protected fence memory is mapped as read only to the user VMID. The + * DRM_AMDGPU_USERQ_WAIT ioctl will return read only fence memory address along with protected + * fence sequence number to wait which is used in FENCE_WAIT_MULTI packet. + * + * PKT3_PROTECTED_FENCE_SIGNAL packet should be the last packet before ringing doorbell so + * that mesa user fence sequence number matches with protected fence sequence number. This + * is helpful in debugging. 
*/ amdgpu_pkt_add_dw(PKT3(PKT3_PROTECTED_FENCE_SIGNAL, 0, 0)); amdgpu_pkt_add_dw(0); + + if (csc->aws->info.gfx_level == GFX11_5 && userq->ip_type == AMD_IP_GFX) { + for (unsigned i = 0; i < 1 + DIV_ROUND_UP(num_fences, 4); i++) + *cond_exec_skip_counts[i].count_dw_ptr = (amdgpu_pkt_get_next_wptr() - + cond_exec_skip_counts[i].start_wptr) | + COND_EXEC_USERQ_OVERRULE_CMD; + } } else { mesa_loge("amdgpu: unsupported userq ip submission = %d\n", userq->ip_type); } @@ -1496,7 +1548,7 @@ static int amdgpu_cs_submit_ib_userq(struct amdgpu_userq *userq, /* Syncobj dependencies. */ unsigned num_syncobj_dependencies = csc->syncobj_dependencies.num; - uint32_t *syncobj_dependencies_list = + uint32_t *syncobj_dependencies_list = (uint32_t*)alloca(num_syncobj_dependencies * sizeof(uint32_t)); /* Currently only 1 vm timeline syncobj can be a dependency. */ diff --git a/src/gallium/winsys/amdgpu/drm/amdgpu_userq.c b/src/gallium/winsys/amdgpu/drm/amdgpu_userq.c index 73976855728..0595e5ac0a1 100644 --- a/src/gallium/winsys/amdgpu/drm/amdgpu_userq.c +++ b/src/gallium/winsys/amdgpu/drm/amdgpu_userq.c @@ -58,14 +58,16 @@ amdgpu_userq_ring_init(struct amdgpu_winsys *aws, struct amdgpu_userq *userq, *userq->wptr_bo_map = 0; userq->next_wptr = 0; - userq->rptr_bo = amdgpu_bo_create(aws, aws->info.gart_page_size, 256, RADEON_DOMAIN_VRAM, + /* Allocate memory for rptr. 
*/ + userq->vram_bo = amdgpu_bo_create(aws, aws->info.gart_page_size, 256, RADEON_DOMAIN_VRAM, RADEON_FLAG_CLEAR_VRAM | RADEON_FLAG_GL2_BYPASS | RADEON_FLAG_NO_SUBALLOC | RADEON_FLAG_NO_INTERPROCESS_SHARING); - if (!userq->rptr_bo) + if (!userq->vram_bo) return false; - update_vm_timeline_point_to_wait(vm_timeline_point_to_wait, userq->rptr_bo); + update_vm_timeline_point_to_wait(vm_timeline_point_to_wait, userq->vram_bo); + userq->rptr_va = amdgpu_bo_get_va(userq->vram_bo); return true; } @@ -77,7 +79,7 @@ amdgpu_userq_deinit(struct amdgpu_winsys *aws, struct amdgpu_userq *userq) radeon_bo_reference(&aws->dummy_sws.base, &userq->gtt_bo, NULL); radeon_bo_reference(&aws->dummy_sws.base, &userq->wptr_bo, NULL); - radeon_bo_reference(&aws->dummy_sws.base, &userq->rptr_bo, NULL); + radeon_bo_reference(&aws->dummy_sws.base, &userq->vram_bo, NULL); radeon_bo_reference(&aws->dummy_sws.base, &userq->doorbell_bo, NULL); switch (userq->ip_type) { @@ -206,8 +208,8 @@ amdgpu_userq_init(struct amdgpu_winsys *aws, struct amdgpu_userq *userq, enum am r = ac_drm_create_userqueue(aws->dev, hw_ip_type, get_real_bo(amdgpu_winsys_bo(userq->doorbell_bo))->kms_handle, AMDGPU_USERQ_DOORBELL_INDEX, ring_va, AMDGPU_USERQ_RING_SIZE, - amdgpu_bo_get_va(userq->wptr_bo), amdgpu_bo_get_va(userq->rptr_bo), - mqd, priority, &userq->userq_handle); + amdgpu_bo_get_va(userq->wptr_bo), userq->rptr_va, mqd, priority, + &userq->userq_handle); if (r == -EACCES && priority == AMDGPU_USERQ_CREATE_FLAGS_QUEUE_PRIORITY_HIGH) { /* Try again with a lower priority. 
*/ priority = AMDGPU_USERQ_CREATE_FLAGS_QUEUE_PRIORITY_NORMAL_HIGH; diff --git a/src/gallium/winsys/amdgpu/drm/amdgpu_userq.h b/src/gallium/winsys/amdgpu/drm/amdgpu_userq.h index 60342b6fc0b..018121eb76b 100644 --- a/src/gallium/winsys/amdgpu/drm/amdgpu_userq.h +++ b/src/gallium/winsys/amdgpu/drm/amdgpu_userq.h @@ -32,6 +32,11 @@ extern "C" { userq->next_wptr = __next_wptr; \ } while (0) +#define amdgpu_pkt_get_ptr_skip_dw() \ + (__ring_ptr + (__next_wptr++ & AMDGPU_USERQ_RING_SIZE_DW_MASK)) + +#define amdgpu_pkt_get_next_wptr() __next_wptr + struct amdgpu_winsys; struct amdgpu_screen_winsys; @@ -68,7 +73,8 @@ struct amdgpu_userq { * (this avoids writing multiple times to the door bell for the same * submission) */ uint64_t next_wptr; - struct pb_buffer_lean *rptr_bo; + struct pb_buffer_lean *vram_bo; + uint64_t rptr_va; struct pb_buffer_lean *doorbell_bo; uint64_t *doorbell_bo_map;