winsys/amdgpu: FENCE_WAIT_MULTI (fwm) packet pre-emption for gfx 11.5
gfx 11.5 uses f32 firmware. f32 firmware requires a COND_EXEC packet to flush the ring buffer when pre-emption occurs.

Reviewed-by: Marek Olšák <marek.olsak@amd.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/36700>
parent 37c7d19e46
commit 9beb668d8d

4 changed files with 76 additions and 10 deletions
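The change below implements this with a reserve-then-patch pattern: each COND_EXEC packet reserves a skip-count DW whose value is only known once the whole job has been written to the ring, and the count is filled in just before submission. A minimal standalone sketch of that pattern, with all names illustrative rather than Mesa's (ring wraparound is ignored for brevity):

#include <stdint.h>

#define SKIP_OVERRULE_CMD (1u << 31) /* same value as COND_EXEC_USERQ_OVERRULE_CMD below */

struct skip_patch {
   uint32_t *count_dw;   /* the reserved DW that is patched later */
   uint64_t start_wptr;  /* write pointer just past the COND_EXEC packet */
};

/* Reserve a COND_EXEC packet; its skip count is not known yet. */
static struct skip_patch begin_skippable_region(uint32_t *ring, uint64_t *wptr)
{
   struct skip_patch p;

   ring[(*wptr)++] = 0;  /* PKT3(PKT3_COND_EXEC, 3, 0) in the real code */
   ring[(*wptr)++] = 0;  /* payload DWs, left zero in this sketch */
   ring[(*wptr)++] = 0;
   ring[(*wptr)++] = 0;
   p.count_dw = &ring[(*wptr)++]; /* patched at the end of the job */
   p.start_wptr = *wptr;
   return p;
}

/* Once the job is fully written, record how many DWs the firmware may
 * skip when it resumes the queue after pre-emption. */
static void end_skippable_region(struct skip_patch p, uint64_t end_wptr)
{
   *p.count_dw = (uint32_t)(end_wptr - p.start_wptr) | SKIP_OVERRULE_CMD;
}

int main(void)
{
   uint32_t ring[64] = {0};
   uint64_t wptr = 0;

   struct skip_patch p = begin_skippable_region(ring, &wptr);
   ring[wptr++] = 0;  /* pretend job packets */
   ring[wptr++] = 0;
   end_skippable_region(p, wptr); /* skip count = 2, plus the overrule bit */
   return 0;
}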
@@ -70,6 +70,7 @@
 #define PREDICATION_OP_BOOL32 0x4
 #define PREDICATION_CONTINUE (1 << 31)
 #define PKT3_COND_EXEC 0x22
+#define COND_EXEC_USERQ_OVERRULE_CMD (1 << 31)
 #define PKT3_PRED_EXEC 0x23
 #define PKT3_DRAW_INDIRECT 0x24
 #define PKT3_DRAW_INDEX_INDIRECT 0x25
@@ -111,6 +112,9 @@
 #define STRMOUT_SELECT_BUFFER(x) (((unsigned)(x)&0x3) << 8)
 #define PKT3_DRAW_INDEX_OFFSET_2 0x35
 #define PKT3_WRITE_DATA 0x37
+#define WRITE_DATA_DST_SEL(x) (((unsigned)(x)&0xf) << 8)
+#define WRITE_DATA_WR_CONFIRM (1 << 20)
+#define WRITE_DATA_CACHE_POLICY(x) (x << 25)
 #define PKT3_DRAW_INDEX_INDIRECT_MULTI 0x38
 #define PKT3_MEM_SEMAPHORE 0x39
 #define PKT3_MPEG_INDEX 0x3A /* GFX6 only */
@@ -250,6 +254,8 @@
 #define PKT3_INCREMENT_CE_COUNTER 0x84
 #define PKT3_INCREMENT_DE_COUNTER 0x85
 #define PKT3_WAIT_ON_CE_COUNTER 0x86
+#define PKT3_FRAME_CONTROL 0x90
+#define S_FRAME_CONTROL_CMD(x) ((x) << 28)
 #define PKT3_HDP_FLUSH 0x95
 #define PKT3_SET_SH_REG_INDEX 0x9B
 #define PKT3_LOAD_CONTEXT_REG_INDEX 0x9F /* GFX8+ */
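The PKT3() macro used throughout the diff below builds a type-3 PM4 packet header from opcodes like the ones defined above. A small self-check of that encoding, assuming the usual layout (packet type in bits 31:30, payload DW count minus one in bits 29:16, IT opcode in bits 15:8, predicate in bit 0):

#include <assert.h>
#include <stdint.h>

static uint32_t pkt3_header(unsigned op, unsigned count, unsigned predicate)
{
   return (3u << 30) | ((count & 0x3fffu) << 16) |
          ((op & 0xffu) << 8) | (predicate & 1u);
}

int main(void)
{
   /* PKT3(PKT3_COND_EXEC, 3, 0): opcode 0x22 with four payload DWs. */
   assert(pkt3_header(0x22, 3, 0) == 0xc0032200u);
   return 0;
}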
@@ -1408,6 +1408,11 @@ static int amdgpu_cs_submit_ib_kernelq(struct amdgpu_cs *acs,
    return r;
 }
 
+struct cond_exec_skip_count {
+   uint32_t *count_dw_ptr;
+   uint64_t start_wptr;
+};
+
 static void amdgpu_cs_add_userq_packets(struct amdgpu_winsys *aws,
                                         struct amdgpu_userq *userq,
                                         struct amdgpu_cs_context *csc,
@@ -1417,13 +1422,31 @@ static void amdgpu_cs_add_userq_packets(struct amdgpu_winsys *aws,
    amdgpu_pkt_begin();
 
    if (userq->ip_type == AMD_IP_GFX || userq->ip_type == AMD_IP_COMPUTE) {
+      struct cond_exec_skip_count *cond_exec_skip_counts = NULL;
+
+      if (csc->aws->info.gfx_level == GFX11_5 && userq->ip_type == AMD_IP_GFX) {
+         /* Index 0 holds the skip count for skipping the entire job. The rest are for
+          * FENCE_WAIT_MULTI packet pre-emption, skipping to the end of the job.
+          */
+         cond_exec_skip_counts = (struct cond_exec_skip_count*)alloca(
+            sizeof(struct cond_exec_skip_count) * (1 + DIV_ROUND_UP(num_fences, 4)));
+         amdgpu_pkt_add_dw(PKT3(PKT3_COND_EXEC, 3, 0));
+         amdgpu_pkt_add_dw(0);
+         amdgpu_pkt_add_dw(0);
+         amdgpu_pkt_add_dw(0);
+         cond_exec_skip_counts[0].count_dw_ptr = amdgpu_pkt_get_ptr_skip_dw();
+         cond_exec_skip_counts[0].start_wptr = amdgpu_pkt_get_next_wptr();
+      }
+
       if (num_fences) {
          unsigned max_num_fences_fwm;
          unsigned num_fences_in_iter;
 
          if (csc->aws->info.has_dedicated_vram || csc->aws->info.gfx_level >= GFX12)
             max_num_fences_fwm = 32;
          else
             max_num_fences_fwm = 4;
 
         for (unsigned i = 0; i < num_fences; i = i + max_num_fences_fwm) {
            num_fences_in_iter = (i + max_num_fences_fwm > num_fences) ?
                                 num_fences - i : max_num_fences_fwm;
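The loop above splits the fence list into FENCE_WAIT_MULTI batches of at most max_num_fences_fwm entries. A tiny standalone example of just the iteration math, with illustrative values:

#include <stdio.h>

int main(void)
{
   unsigned num_fences = 10, max_num_fences_fwm = 4;

   /* Mirrors the batching above: 10 fences with a batch limit of 4
    * produce packets covering [0,4), [4,8) and [8,10). */
   for (unsigned i = 0; i < num_fences; i = i + max_num_fences_fwm) {
      unsigned num_fences_in_iter = (i + max_num_fences_fwm > num_fences) ?
                                    num_fences - i : max_num_fences_fwm;
      printf("packet for fences [%u, %u)\n", i, i + num_fences_in_iter);
   }
   return 0;
}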
@@ -1435,6 +1458,15 @@ static void amdgpu_cs_add_userq_packets(struct amdgpu_winsys *aws,
             amdgpu_pkt_add_dw(fence_info[i + j].value);
             amdgpu_pkt_add_dw(fence_info[i + j].value >> 32);
          }
+
+         if (csc->aws->info.gfx_level == GFX11_5 && userq->ip_type == AMD_IP_GFX) {
+            amdgpu_pkt_add_dw(PKT3(PKT3_COND_EXEC, 3, 0));
+            amdgpu_pkt_add_dw(0);
+            amdgpu_pkt_add_dw(0);
+            amdgpu_pkt_add_dw(0);
+            cond_exec_skip_counts[1 + i].count_dw_ptr = amdgpu_pkt_get_ptr_skip_dw();
+            cond_exec_skip_counts[1 + i].start_wptr = amdgpu_pkt_get_next_wptr();
+         }
       }
    }
 
@@ -1469,11 +1501,31 @@
       amdgpu_pkt_add_dw(userq->user_fence_seq_num >> 32);
       amdgpu_pkt_add_dw(0);
 
-      /* protected signal packet. This is trusted RELEASE_MEM packet. i.e. fence buffer
-       * is only accessible from kernel through VMID 0. */
+      /* Protected signal packet. This is a trusted RELEASE_MEM packet.
+       *
+       * The kernel allocates the memory for the protected fence and passes the protected
+       * fence address in the MQD (memory queue descriptor, where static and dynamic queue
+       * states are stored). This fence memory is mapped writable only for VMID 0. When the
+       * job completes, this packet writes the ring buffer's monotonic (non-wrapping) read
+       * pointer value to the fence address passed in the MQD.
+       *
+       * The protected fence memory is mapped read-only for the user VMID. The
+       * DRM_AMDGPU_USERQ_WAIT ioctl returns the read-only fence memory address along with
+       * the protected fence sequence number to wait on, which is used in the FENCE_WAIT_MULTI packet.
+       *
+       * The PKT3_PROTECTED_FENCE_SIGNAL packet should be the last packet before ringing
+       * the doorbell, so that the mesa user fence sequence number matches the protected
+       * fence sequence number. This is helpful in debugging.
+       */
       amdgpu_pkt_add_dw(PKT3(PKT3_PROTECTED_FENCE_SIGNAL, 0, 0));
       amdgpu_pkt_add_dw(0);
 
+      if (csc->aws->info.gfx_level == GFX11_5 && userq->ip_type == AMD_IP_GFX) {
+         for (unsigned i = 0; i < 1 + DIV_ROUND_UP(num_fences, 4); i++)
+            *cond_exec_skip_counts[i].count_dw_ptr = (amdgpu_pkt_get_next_wptr() -
+                                                      cond_exec_skip_counts[i].start_wptr) |
+                                                     COND_EXEC_USERQ_OVERRULE_CMD;
+      }
    } else {
       mesa_loge("amdgpu: unsupported userq ip submission = %d\n", userq->ip_type);
    }
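As the comment above describes, the protected fence is a monotonic (non-wrapping) 64-bit value that only VMID 0 can write and the user VMID can only read. Conceptually, a wait on it reduces to a comparison; a hypothetical sketch, not the actual firmware or kernel path:

#include <stdbool.h>
#include <stdint.h>

/* Illustration only: "signaled" means the read-only mapped fence value has
 * reached the sequence number returned by DRM_AMDGPU_USERQ_WAIT. */
static bool fence_signaled(const volatile uint64_t *fence_map, uint64_t seq)
{
   return *fence_map >= seq;
}

int main(void)
{
   uint64_t fence = 41;
   return fence_signaled(&fence, 42) ? 1 : 0; /* not signaled yet */
}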
@@ -1496,7 +1548,7 @@ static int amdgpu_cs_submit_ib_userq(struct amdgpu_userq *userq,
 
    /* Syncobj dependencies. */
    unsigned num_syncobj_dependencies = csc->syncobj_dependencies.num;
-   uint32_t *syncobj_dependencies_list =
+   uint32_t *syncobj_dependencies_list =
       (uint32_t*)alloca(num_syncobj_dependencies * sizeof(uint32_t));
 
    /* Currently only 1 vm timeline syncobj can be a dependency. */
@@ -58,14 +58,16 @@ amdgpu_userq_ring_init(struct amdgpu_winsys *aws, struct amdgpu_userq *userq,
    *userq->wptr_bo_map = 0;
    userq->next_wptr = 0;
 
-   userq->rptr_bo = amdgpu_bo_create(aws, aws->info.gart_page_size, 256, RADEON_DOMAIN_VRAM,
+   /* Allocate memory for rptr. */
+   userq->vram_bo = amdgpu_bo_create(aws, aws->info.gart_page_size, 256, RADEON_DOMAIN_VRAM,
                                      RADEON_FLAG_CLEAR_VRAM | RADEON_FLAG_GL2_BYPASS |
                                      RADEON_FLAG_NO_SUBALLOC |
                                      RADEON_FLAG_NO_INTERPROCESS_SHARING);
-   if (!userq->rptr_bo)
+   if (!userq->vram_bo)
       return false;
 
-   update_vm_timeline_point_to_wait(vm_timeline_point_to_wait, userq->rptr_bo);
+   update_vm_timeline_point_to_wait(vm_timeline_point_to_wait, userq->vram_bo);
+   userq->rptr_va = amdgpu_bo_get_va(userq->vram_bo);
    return true;
 }
@@ -77,7 +79,7 @@ amdgpu_userq_deinit(struct amdgpu_winsys *aws, struct amdgpu_userq *userq)
 
    radeon_bo_reference(&aws->dummy_sws.base, &userq->gtt_bo, NULL);
    radeon_bo_reference(&aws->dummy_sws.base, &userq->wptr_bo, NULL);
-   radeon_bo_reference(&aws->dummy_sws.base, &userq->rptr_bo, NULL);
+   radeon_bo_reference(&aws->dummy_sws.base, &userq->vram_bo, NULL);
    radeon_bo_reference(&aws->dummy_sws.base, &userq->doorbell_bo, NULL);
 
    switch (userq->ip_type) {
@@ -206,8 +208,8 @@ amdgpu_userq_init(struct amdgpu_winsys *aws, struct amdgpu_userq *userq, enum am
    r = ac_drm_create_userqueue(aws->dev, hw_ip_type,
                                get_real_bo(amdgpu_winsys_bo(userq->doorbell_bo))->kms_handle,
                                AMDGPU_USERQ_DOORBELL_INDEX, ring_va, AMDGPU_USERQ_RING_SIZE,
-                               amdgpu_bo_get_va(userq->wptr_bo), amdgpu_bo_get_va(userq->rptr_bo),
-                               mqd, priority, &userq->userq_handle);
+                               amdgpu_bo_get_va(userq->wptr_bo), userq->rptr_va, mqd, priority,
+                               &userq->userq_handle);
    if (r == -EACCES && priority == AMDGPU_USERQ_CREATE_FLAGS_QUEUE_PRIORITY_HIGH) {
       /* Try again with a lower priority. */
       priority = AMDGPU_USERQ_CREATE_FLAGS_QUEUE_PRIORITY_NORMAL_HIGH;
@@ -32,6 +32,11 @@ extern "C" {
       userq->next_wptr = __next_wptr; \
    } while (0)
 
+#define amdgpu_pkt_get_ptr_skip_dw() \
+   (__ring_ptr + (__next_wptr++ & AMDGPU_USERQ_RING_SIZE_DW_MASK))
+
+#define amdgpu_pkt_get_next_wptr() __next_wptr
+
 struct amdgpu_winsys;
 struct amdgpu_screen_winsys;
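amdgpu_pkt_get_ptr_skip_dw() above returns the address of the current ring DW and advances the monotonic __next_wptr; the AND with AMDGPU_USERQ_RING_SIZE_DW_MASK makes the pointer wrap inside the ring while the write pointer itself keeps growing. A standalone sketch of that masking, with an illustrative power-of-two ring size:

#include <assert.h>
#include <stdint.h>

#define RING_SIZE_DW      1024u          /* illustrative; must be a power of two */
#define RING_SIZE_DW_MASK (RING_SIZE_DW - 1)

int main(void)
{
   static uint32_t ring[RING_SIZE_DW];
   uint64_t wptr = RING_SIZE_DW - 1;     /* one DW before the wrap point */

   uint32_t *a = &ring[wptr++ & RING_SIZE_DW_MASK]; /* last slot */
   uint32_t *b = &ring[wptr++ & RING_SIZE_DW_MASK]; /* wraps to slot 0 */

   assert(a == &ring[RING_SIZE_DW - 1]);
   assert(b == &ring[0]);
   return 0;
}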
@@ -68,7 +73,8 @@ struct amdgpu_userq {
     * (this avoids writing multiple times to the door bell for the same
     * submission) */
    uint64_t next_wptr;
-   struct pb_buffer_lean *rptr_bo;
+   struct pb_buffer_lean *vram_bo;
+   uint64_t rptr_va;
 
    struct pb_buffer_lean *doorbell_bo;
    uint64_t *doorbell_bo_map;