diff --git a/src/gallium/winsys/amdgpu/drm/amdgpu_cs.cpp b/src/gallium/winsys/amdgpu/drm/amdgpu_cs.cpp
index 4449d3d5f6c..12aa8b12071 100644
--- a/src/gallium/winsys/amdgpu/drm/amdgpu_cs.cpp
+++ b/src/gallium/winsys/amdgpu/drm/amdgpu_cs.cpp
@@ -1416,6 +1416,19 @@ struct cond_exec_skip_count {
    uint64_t start_wptr;
 };
 
+/* Emit a WRITE_DATA packet that stores the 64-bit value "number" at
+ * userq->write_data_pkt_dbg_count_va; the userq job log thread prints that
+ * location. Expects "userq" to be in scope where the macro is expanded.
+ */
+#define add_dbg_count_write_data_pkt(number) do { \
+   amdgpu_pkt_add_dw(PKT3(PKT3_WRITE_DATA, 4, 0)); \
+   amdgpu_pkt_add_dw(WRITE_DATA_DST_SEL(5) | WRITE_DATA_WR_CONFIRM | WRITE_DATA_CACHE_POLICY(3)); \
+   amdgpu_pkt_add_dw(userq->write_data_pkt_dbg_count_va); \
+   amdgpu_pkt_add_dw(userq->write_data_pkt_dbg_count_va >> 32); \
+   amdgpu_pkt_add_dw(number); \
+   amdgpu_pkt_add_dw((uint64_t)(number) >> 32); \
+} while (0)
+
 static void amdgpu_cs_add_userq_packets(struct amdgpu_winsys *aws,
                                         struct amdgpu_userq *userq,
                                         struct amdgpu_cs_context *csc,
@@ -1441,6 +1454,9 @@ static void amdgpu_cs_add_userq_packets(struct amdgpu_winsys *aws,
       cond_exec_skip_counts[0].start_wptr = amdgpu_pkt_get_next_wptr();
    }
 
+   if (aws->userq_job_log)
+      add_dbg_count_write_data_pkt(1);
+
    if (num_fences) {
       unsigned max_num_fences_fwm;
       unsigned num_fences_in_iter;
@@ -1473,6 +1489,9 @@ static void amdgpu_cs_add_userq_packets(struct amdgpu_winsys *aws,
       }
    }
 
+   if (aws->userq_job_log)
+      add_dbg_count_write_data_pkt(2);
+
    amdgpu_pkt_add_dw(PKT3(PKT3_HDP_FLUSH, 0, 0));
    amdgpu_pkt_add_dw(0x0);
 
diff --git a/src/gallium/winsys/amdgpu/drm/amdgpu_userq.c b/src/gallium/winsys/amdgpu/drm/amdgpu_userq.c
index 1c80aaf7d35..e92ad158cec 100644
--- a/src/gallium/winsys/amdgpu/drm/amdgpu_userq.c
+++ b/src/gallium/winsys/amdgpu/drm/amdgpu_userq.c
@@ -29,7 +29,8 @@ static bool
 amdgpu_userq_ring_init(struct amdgpu_winsys *aws, struct amdgpu_userq *userq,
                        uint64_t *vm_timeline_point_to_wait)
 {
-   /* Allocate ring and user fence in one buffer. */
+   /* Allocate ring and user fence in one buffer. The WRITE_DATA packet debug count
+    * variable also lives in this buffer. */
    uint32_t gtt_bo_size = AMDGPU_USERQ_RING_SIZE + aws->info.gart_page_size;
    userq->gtt_bo = amdgpu_bo_create(aws, gtt_bo_size, 256, RADEON_DOMAIN_GTT,
                                     RADEON_FLAG_GL2_BYPASS | RADEON_FLAG_NO_INTERPROCESS_SHARING);
@@ -59,6 +60,12 @@ amdgpu_userq_ring_init(struct amdgpu_winsys *aws, struct amdgpu_userq *userq,
    *userq->wptr_bo_map = 0;
    userq->next_wptr = 0;
 
+   userq->write_data_pkt_dbg_count_ptr = (uint64_t*)(userq->gtt_bo_map +
+                                                     AMDGPU_USERQ_RING_SIZE + 8);
+   userq->write_data_pkt_dbg_count_va = amdgpu_bo_get_va(userq->gtt_bo) +
+                                        AMDGPU_USERQ_RING_SIZE + 8;
+   *userq->write_data_pkt_dbg_count_ptr = 0;
+
    /* Allocate memory for rptr. */
    userq->vram_bo = amdgpu_bo_create(aws, aws->info.gart_page_size, 256, RADEON_DOMAIN_VRAM,
                                      RADEON_FLAG_CLEAR_VRAM | RADEON_FLAG_GL2_BYPASS |
@@ -85,14 +92,18 @@ userq_job_log_thread(void *data)
          if (userq->userq_handle) {
             uint64_t last_submitted_job = *userq->wptr_bo_map;
             uint64_t last_completed_job = *userq->user_fence_ptr;
+            uint64_t last_write_data_pkt_dbg_count = *userq->write_data_pkt_dbg_count_ptr;
 
             if (userq->last_submitted_job != last_submitted_job ||
-                userq->last_completed_job != last_completed_job) {
-               mesa_logi("amdgpu: uq_log: %s: submitted_job=%llx completed_job=%llx\n",
-                         amdgpu_userq_str[i], (long long)last_submitted_job,
-                         (long long)last_completed_job);
+                userq->last_completed_job != last_completed_job ||
+                userq->last_write_data_pkt_dbg_count != last_write_data_pkt_dbg_count) {
+               mesa_logi("amdgpu: uq_log: %s: submitted_job=%llx completed_job=%llx"
+                         " write_data_pkt_dbg_count=%llx\n", amdgpu_userq_str[i],
+                         (long long)last_submitted_job, (long long)last_completed_job,
+                         (long long)last_write_data_pkt_dbg_count);
                userq->last_submitted_job = last_submitted_job;
                userq->last_completed_job = last_completed_job;
+               userq->last_write_data_pkt_dbg_count = last_write_data_pkt_dbg_count;
             }
          }
       }
diff --git a/src/gallium/winsys/amdgpu/drm/amdgpu_userq.h b/src/gallium/winsys/amdgpu/drm/amdgpu_userq.h
index ac888c84b89..fe4b376f451 100644
--- a/src/gallium/winsys/amdgpu/drm/amdgpu_userq.h
+++ b/src/gallium/winsys/amdgpu/drm/amdgpu_userq.h
@@ -79,6 +79,13 @@ struct amdgpu_userq {
    struct pb_buffer_lean *doorbell_bo;
    uint64_t *doorbell_bo_map;
 
+   /* To debug where the ring is stuck, a WRITE_DATA packet with a unique number is
+    * inserted into the ring. The number indicates which packets have been parsed by
+    * the CP. This value is printed in the job log.
+    */
+   uint64_t *write_data_pkt_dbg_count_ptr;
+   uint64_t write_data_pkt_dbg_count_va;
+
    /* In case of gfx11.5 shadow register address has to be initialized using LOAD_* packet.
     * Also for every new ib/job submission, the shadowed registers has to be loaded using LOAD_*
     * packets.
@@ -103,6 +110,7 @@ struct amdgpu_userq {
    /* Used in userq job log thread to only print if data has changed */
    uint64_t last_submitted_job;
    uint64_t last_completed_job;
+   uint64_t last_write_data_pkt_dbg_count;
 };
 
 void