diff --git a/docs/envvars.rst b/docs/envvars.rst
index 6a4a16cd5ee..c4f1d523788 100644
--- a/docs/envvars.rst
+++ b/docs/envvars.rst
@@ -1823,6 +1823,8 @@ RadeonSI driver environment variables
       Enable CP register shadowing in kernel queue.
    ``userqnoshadowregs``
       Disable register shadowing in userqueue. This will also disable userqueue mcbp.
+   ``userqjoblog``
+      Print info about submitted and completed jobs for userqueues.
    ``novideotiling``
       Disable tiling for video.
    ``nodectier1``
diff --git a/src/gallium/winsys/amdgpu/drm/amdgpu_cs.cpp b/src/gallium/winsys/amdgpu/drm/amdgpu_cs.cpp
index 7115a7671db..4449d3d5f6c 100644
--- a/src/gallium/winsys/amdgpu/drm/amdgpu_cs.cpp
+++ b/src/gallium/winsys/amdgpu/drm/amdgpu_cs.cpp
@@ -1628,6 +1628,18 @@ static int amdgpu_cs_submit_ib_userq(struct amdgpu_userq *userq,
    if (r)
       mesa_loge("amdgpu: getting wait fences failed\n");
 
+   if (aws->userq_job_log) {
+      for (unsigned i = 0; i < userq_wait_data.num_fences; i++) {
+         /* The uq_va memory is allocated by the kernel from a memory chunk that
+          * is mapped at the same address in all processes.  Once a uq_va has been
+          * matched to a given queue, cross-process/queue fence deps can be analyzed.
+          */
+         mesa_logi("amdgpu: uq_log: %s: num_wait_fences=%u uq_va=%llx job=%llx\n",
+                   amdgpu_userq_str[acs->queue_index], userq_wait_data.num_fences,
+                   (unsigned long long)fence_info[i].va, (unsigned long long)fence_info[i].value);
+      }
+   }
+
    simple_mtx_lock(&userq->lock);
    amdgpu_cs_add_userq_packets(aws, userq, csc, userq_wait_data.num_fences, fence_info);
    struct drm_amdgpu_userq_signal userq_signal_data = {
@@ -1658,6 +1670,11 @@ static int amdgpu_cs_submit_ib_userq(struct amdgpu_userq *userq,
    userq->doorbell_bo_map[AMDGPU_USERQ_DOORBELL_INDEX] = userq->next_wptr;
 
    r = ac_drm_userq_signal(aws->dev, &userq_signal_data);
+   if (aws->userq_job_log) {
+      mesa_logi("amdgpu: uq_log: %s: submitted_job=%llx\n", amdgpu_userq_str[acs->queue_index],
+                (unsigned long long)*userq->wptr_bo_map);
+   }
+
    *seq_no = userq->user_fence_seq_num;
    simple_mtx_unlock(&userq->lock);
 
diff --git a/src/gallium/winsys/amdgpu/drm/amdgpu_userq.c b/src/gallium/winsys/amdgpu/drm/amdgpu_userq.c
index 9355e0d2e60..1c80aaf7d35 100644
--- a/src/gallium/winsys/amdgpu/drm/amdgpu_userq.c
+++ b/src/gallium/winsys/amdgpu/drm/amdgpu_userq.c
@@ -72,6 +72,51 @@ amdgpu_userq_ring_init(struct amdgpu_winsys *aws, struct amdgpu_userq *userq,
    return true;
 }
 
+/* Debug thread (AMD_DEBUG=userqjoblog): polls every userqueue and logs the
+ * last submitted/completed job whenever either value changes.  The plain
+ * (unsynchronized) read of aws->userq_job_log is acceptable for this
+ * debug-only loop; do_winsys_deinit() clears it and joins the thread.
+ */
+static void *
+userq_job_log_thread(void *data)
+{
+   struct amdgpu_winsys *aws = data;
+   struct amdgpu_userq *userq;
+
+   while (aws->userq_job_log) {
+      /* Poll period: 700 ms. */
+      os_time_sleep(1000 * 700);
+      for (unsigned i = 0; i < AMDGPU_MAX_QUEUES; i++) {
+         userq = &aws->queues[i].userq;
+         if (userq->userq_handle) {
+            uint64_t last_submitted_job = *userq->wptr_bo_map;
+            uint64_t last_completed_job = *userq->user_fence_ptr;
+
+            if (userq->last_submitted_job != last_submitted_job ||
+                userq->last_completed_job != last_completed_job) {
+               mesa_logi("amdgpu: uq_log: %s: submitted_job=%llx completed_job=%llx\n",
+                         amdgpu_userq_str[i], (unsigned long long)last_submitted_job,
+                         (unsigned long long)last_completed_job);
+               userq->last_submitted_job = last_submitted_job;
+               userq->last_completed_job = last_completed_job;
+            }
+         }
+      }
+   }
+
+   return NULL;
+}
+
+void
+amdgpu_userq_start_job_log_thread(struct amdgpu_winsys *aws)
+{
+   /* On failure, clear the flag so that do_winsys_deinit() doesn't
+    * pthread_join() an uninitialized thread handle.
+    */
+   if (pthread_create(&aws->userq_job_log_thread, NULL, userq_job_log_thread, aws) != 0)
+      aws->userq_job_log = false;
+}
+
 void
 amdgpu_userq_deinit(struct amdgpu_winsys *aws, struct amdgpu_userq *userq)
 {
diff --git a/src/gallium/winsys/amdgpu/drm/amdgpu_userq.h b/src/gallium/winsys/amdgpu/drm/amdgpu_userq.h
index e2d19e2abf2..ac888c84b89 100644
--- a/src/gallium/winsys/amdgpu/drm/amdgpu_userq.h
+++ b/src/gallium/winsys/amdgpu/drm/amdgpu_userq.h
@@ -99,8 +99,15 @@ struct amdgpu_userq {
       struct amdgpu_userq_compute_data compute_data;
       struct amdgpu_userq_sdma_data sdma_data;
    };
+
+   /* Used by the userq job log thread to print only when the data changed. */
+   uint64_t last_submitted_job;
+   uint64_t last_completed_job;
 };
 
+void
+amdgpu_userq_start_job_log_thread(struct amdgpu_winsys *aws);
+
 bool
 amdgpu_userq_init(struct amdgpu_winsys *aws, struct amdgpu_userq *userq,
                   enum amd_ip_type ip_type, unsigned queue_index);
diff --git a/src/gallium/winsys/amdgpu/drm/amdgpu_winsys.c b/src/gallium/winsys/amdgpu/drm/amdgpu_winsys.c
index e15fd24bdeb..ada39e7f278 100644
--- a/src/gallium/winsys/amdgpu/drm/amdgpu_winsys.c
+++ b/src/gallium/winsys/amdgpu/drm/amdgpu_winsys.c
@@ -25,6 +25,13 @@
 #include <sys/stat.h>
 #include "sid.h"
 
+const char *amdgpu_userq_str[AMDGPU_MAX_QUEUES] = {
+   "gfx",
+   "gfx_hi",
+   "comp",
+   "sdma"
+};
+
 static struct hash_table *dev_tab = NULL;
 static simple_mtx_t dev_tab_mutex = SIMPLE_MTX_INITIALIZER;
 
@@ -59,6 +66,7 @@ static bool do_winsys_init(struct amdgpu_winsys *aws,
       strstr(debug_get_option("AMD_DEBUG", ""), "sqtt") != NULL;
    aws->zero_all_vram_allocs = strstr(debug_get_option("R600_DEBUG", ""), "zerovram") != NULL ||
                                driQueryOptionb(config->options, "radeonsi_zerovram");
+   aws->userq_job_log = strstr(debug_get_option("AMD_DEBUG", ""), "userqjoblog") != NULL;
 
    for (unsigned i = 0; i < ARRAY_SIZE(aws->queues); i++)
       simple_mtx_init(&aws->queues[i].userq.lock, mtx_plain);
@@ -67,6 +75,9 @@
    if (!aws->info.userq_ip_mask)
       aws->info.has_vm_always_valid = false;
 
+   if (aws->userq_job_log)
+      amdgpu_userq_start_job_log_thread(aws);
+
    return true;
 
 fail:
@@ -80,6 +91,11 @@ static void do_winsys_deinit(struct amdgpu_winsys *aws)
    if (aws->reserve_vmid)
       ac_drm_vm_unreserve_vmid(aws->dev, 0);
 
+   if (aws->userq_job_log) {
+      aws->userq_job_log = false;
+      pthread_join(aws->userq_job_log_thread, NULL);
+   }
+
    for (unsigned i = 0; i < ARRAY_SIZE(aws->queues); i++) {
       for (unsigned j = 0; j < ARRAY_SIZE(aws->queues[i].fences); j++)
          amdgpu_fence_reference(&aws->queues[i].fences[j], NULL);
diff --git a/src/gallium/winsys/amdgpu/drm/amdgpu_winsys.h b/src/gallium/winsys/amdgpu/drm/amdgpu_winsys.h
index 6f8ada7a01c..0f67f5efbd6 100644
--- a/src/gallium/winsys/amdgpu/drm/amdgpu_winsys.h
+++ b/src/gallium/winsys/amdgpu/drm/amdgpu_winsys.h
@@ -126,6 +126,8 @@ enum amdgpu_queue_index {
    AMDGPU_QUEUE_USES_ALT_FENCE = INT_MAX,
 };
 
+extern const char *amdgpu_userq_str[AMDGPU_MAX_QUEUES];
+
 /* This can use any integer type because the logic handles integer wraparounds robustly, but
  * uint8_t wraps around so quickly that some BOs might never become idle because we don't
  * remove idle fences from BOs, so they become "busy" again after a queue sequence number wraps
@@ -207,6 +209,8 @@ struct amdgpu_winsys {
 
    /* Protected by bo_fence_lock. */
    struct amdgpu_queue queues[AMDGPU_MAX_QUEUES];
+   pthread_t userq_job_log_thread;
+   bool userq_job_log; /* enable userq job log thread */
   struct pb_cache bo_cache;
   struct pb_slabs bo_slabs;   /* Slab allocator. */