radv: fix a synchronization issue with primitives generated query on RDNA1-2

Only RDNA1-2 are affected because, on those GPUs, RADV has to handle both
the legacy and the NGG paths for this query, and the NGG results are stored
in two extra 64-bit values.

Fixes flakes with
dEQP-VK.transform_feedback.primitives_generated_query.* since VKCTS
1.3.7.0.

Cc: mesa-stable
Signed-off-by: Samuel Pitoiset <samuel.pitoiset@gmail.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/25862>
(cherry picked from commit 7d96fe853e)
This commit is contained in:
Samuel Pitoiset 2023-10-03 17:34:06 +02:00 committed by Eric Engestrom
parent 8a50c841cf
commit c03932b6e6
3 changed files with 35 additions and 18 deletions

View file

@ -214,7 +214,7 @@
"description": "radv: fix a synchronization issue with primitives generated query on RDNA1-2",
"nominated": true,
"nomination_type": 0,
"resolution": 0,
"resolution": 1,
"main_sha": null,
"because_sha": null,
"notes": null

View file

@ -19,6 +19,3 @@ dEQP-VK.ray_tracing_pipeline.pipeline_library.configurations.multithreaded_compi
dEQP-VK.ray_tracing_pipeline.pipeline_library.configurations.multithreaded_compilation.*_check_capture_replay_handles
dEQP-VK.ray_tracing_pipeline.pipeline_library.configurations.singlethreaded_compilation.*_check_all_handles
dEQP-VK.ray_tracing_pipeline.pipeline_library.configurations.singlethreaded_compilation.*_check_capture_replay_handles
# New CTS flakes in 1.3.7.0
dEQP-VK.transform_feedback.primitives_generated_query.get.*

View file

@ -584,7 +584,7 @@ build_timestamp_query_shader(struct radv_device *device)
}
#define RADV_PGQ_STRIDE 32
#define RADV_PGQ_STRIDE_GDS (RADV_PGQ_STRIDE + 4 * 2)
#define RADV_PGQ_STRIDE_GDS (RADV_PGQ_STRIDE + 8 * 2)
static nir_shader *
build_pg_query_shader(struct radv_device *device)
@ -663,11 +663,21 @@ build_pg_query_shader(struct radv_device *device)
nir_def *avails[2];
avails[0] = nir_channel(&b, load1, 1);
avails[1] = nir_channel(&b, load2, 1);
nir_def *result_is_available =
nir_i2b(&b, nir_iand(&b, nir_iand(&b, avails[0], avails[1]), nir_imm_int(&b, 0x80000000)));
nir_store_var(&b, available, nir_i2b(&b, nir_iand_imm(&b, nir_iand(&b, avails[0], avails[1]), 0x80000000)), 0x1);
nir_push_if(&b, uses_gds);
{
nir_def *gds_avail_start = nir_load_ssbo(&b, 1, 32, src_buf, nir_iadd_imm(&b, input_base, 36), .align_mul = 4);
nir_def *gds_avail_end = nir_load_ssbo(&b, 1, 32, src_buf, nir_iadd_imm(&b, input_base, 44), .align_mul = 4);
nir_def *gds_result_available =
nir_i2b(&b, nir_iand_imm(&b, nir_iand(&b, gds_avail_start, gds_avail_end), 0x80000000));
nir_store_var(&b, available, nir_iand(&b, nir_load_var(&b, available), gds_result_available), 0x1);
}
nir_pop_if(&b, NULL);
/* Only compute result if available. */
nir_push_if(&b, result_is_available);
nir_push_if(&b, nir_load_var(&b, available));
/* Pack values. */
nir_def *packed64[2];
@ -684,7 +694,7 @@ build_pg_query_shader(struct radv_device *device)
nir_def *gds_start =
nir_load_ssbo(&b, 1, 32, src_buf, nir_iadd(&b, input_base, nir_imm_int(&b, 32)), .align_mul = 4);
nir_def *gds_end =
nir_load_ssbo(&b, 1, 32, src_buf, nir_iadd(&b, input_base, nir_imm_int(&b, 36)), .align_mul = 4);
nir_load_ssbo(&b, 1, 32, src_buf, nir_iadd(&b, input_base, nir_imm_int(&b, 40)), .align_mul = 4);
nir_def *ngg_gds_result = nir_isub(&b, gds_end, gds_start);
@ -692,8 +702,6 @@ build_pg_query_shader(struct radv_device *device)
}
nir_pop_if(&b, NULL);
nir_store_var(&b, available, nir_imm_true(&b), 0x1);
nir_pop_if(&b, NULL);
/* Determine if result is 64 or 32 bit. */
@ -1087,7 +1095,7 @@ radv_create_query_pool(struct radv_device *device, const VkQueryPoolCreateInfo *
case VK_QUERY_TYPE_PRIMITIVES_GENERATED_EXT:
if (pool->uses_gds && device->physical_device->rad_info.gfx_level < GFX11) {
/* When the hardware can use both the legacy and the NGG paths in the same begin/end pair,
* allocate 2x32-bit values for the GDS counters.
* allocate 2x64-bit values for the GDS counters.
*/
pool->stride = RADV_PGQ_STRIDE_GDS;
} else {
@ -1318,6 +1326,7 @@ radv_GetQueryPoolResults(VkDevice _device, VkQueryPool queryPool, uint32_t first
break;
}
case VK_QUERY_TYPE_PRIMITIVES_GENERATED_EXT: {
const bool uses_gds_query = pool->uses_gds && device->physical_device->rad_info.gfx_level < GFX11;
p_atomic_uint64_t const *src64 = (p_atomic_uint64_t const *)src;
uint64_t primitive_storage_needed;
@ -1333,6 +1342,10 @@ radv_GetQueryPoolResults(VkDevice _device, VkQueryPool queryPool, uint32_t first
!(p_atomic_read(src64 + 2) & 0x8000000000000000UL)) {
available = 0;
}
if (uses_gds_query && (!(p_atomic_read(src64 + 4) & 0x8000000000000000UL) ||
!(p_atomic_read(src64 + 5) & 0x8000000000000000UL))) {
available = 0;
}
} while (!available && (flags & VK_QUERY_RESULT_WAIT_BIT));
if (!available && !(flags & VK_QUERY_RESULT_PARTIAL_BIT))
@ -1340,11 +1353,9 @@ radv_GetQueryPoolResults(VkDevice _device, VkQueryPool queryPool, uint32_t first
primitive_storage_needed = src64[2] - src64[0];
if (pool->uses_gds && device->physical_device->rad_info.gfx_level < GFX11) {
uint32_t const *src32 = (uint32_t const *)src;
if (uses_gds_query) {
/* Accumulate the result that was copied from GDS in case NGG shader has been used. */
primitive_storage_needed += src32[9] - src32[8];
primitive_storage_needed += src64[5] - src64[4];
}
if (flags & VK_QUERY_RESULT_64_BIT) {
@ -1539,15 +1550,22 @@ radv_CmdCopyQueryPoolResults(VkCommandBuffer commandBuffer, VkQueryPool queryPoo
break;
case VK_QUERY_TYPE_PRIMITIVES_GENERATED_EXT:
if (flags & VK_QUERY_RESULT_WAIT_BIT) {
const bool uses_gds_query = pool->uses_gds && cmd_buffer->device->physical_device->rad_info.gfx_level < GFX11;
for (unsigned i = 0; i < queryCount; i++) {
unsigned query = firstQuery + i;
uint64_t src_va = va + query * pool->stride;
radeon_check_space(cmd_buffer->device->ws, cs, 7 * 2);
radeon_check_space(cmd_buffer->device->ws, cs, 7 * 4);
/* Wait on the upper word of the PrimitiveStorageNeeded result. */
radv_cp_wait_mem(cs, cmd_buffer->qf, WAIT_REG_MEM_GREATER_OR_EQUAL, src_va + 4, 0x80000000, 0xffffffff);
radv_cp_wait_mem(cs, cmd_buffer->qf, WAIT_REG_MEM_GREATER_OR_EQUAL, src_va + 20, 0x80000000, 0xffffffff);
if (uses_gds_query) {
radv_cp_wait_mem(cs, cmd_buffer->qf, WAIT_REG_MEM_GREATER_OR_EQUAL, src_va + 36, 0x80000000, 0xffffffff);
radv_cp_wait_mem(cs, cmd_buffer->qf, WAIT_REG_MEM_GREATER_OR_EQUAL, src_va + 44, 0x80000000, 0xffffffff);
}
}
}
@ -1829,6 +1847,7 @@ emit_begin_query(struct radv_cmd_buffer *cmd_buffer, struct radv_query_pool *poo
if (pool->uses_gds) {
/* generated prim counter */
gfx10_copy_gds_query(cmd_buffer, RADV_SHADER_QUERY_PRIM_GEN_OFFSET(index), va + 32);
radv_cs_write_data_imm(cs, V_370_ME, va + 36, 0x80000000);
/* Record that the command buffer needs GDS. */
cmd_buffer->gds_needed = true;
@ -1983,7 +2002,8 @@ emit_end_query(struct radv_cmd_buffer *cmd_buffer, struct radv_query_pool *pool,
if (pool->uses_gds) {
/* generated prim counter */
gfx10_copy_gds_query(cmd_buffer, RADV_SHADER_QUERY_PRIM_GEN_OFFSET(index), va + 36);
gfx10_copy_gds_query(cmd_buffer, RADV_SHADER_QUERY_PRIM_GEN_OFFSET(index), va + 40);
radv_cs_write_data_imm(cs, V_370_ME, va + 44, 0x80000000);
cmd_buffer->state.active_prims_gen_gds_queries--;