diff --git a/src/amd/vulkan/radv_query.c b/src/amd/vulkan/radv_query.c index 74e9efddcc3..5406196be80 100644 --- a/src/amd/vulkan/radv_query.c +++ b/src/amd/vulkan/radv_query.c @@ -1283,7 +1283,13 @@ radv_create_query_pool(struct radv_device *device, const VkQueryPoolCreateInfo * break; } case VK_QUERY_TYPE_MESH_PRIMITIVES_GENERATED_EXT: - pool->stride = 16; + if (device->physical_device->rad_info.gfx_level >= GFX11) { + /* GFX11 natively supports mesh generated primitives with pipeline statistics. */ + pool->stride = radv_get_pipelinestat_query_size(device) * 2; + } else { + assert(device->physical_device->emulate_mesh_shader_queries); + pool->stride = 16; + } break; default: unreachable("creating unhandled query type"); @@ -1291,7 +1297,9 @@ radv_create_query_pool(struct radv_device *device, const VkQueryPoolCreateInfo * pool->availability_offset = pool->stride * pCreateInfo->queryCount; pool->size = pool->availability_offset; - if (pCreateInfo->queryType == VK_QUERY_TYPE_PIPELINE_STATISTICS) + if (pCreateInfo->queryType == VK_QUERY_TYPE_PIPELINE_STATISTICS || + (pCreateInfo->queryType == VK_QUERY_TYPE_MESH_PRIMITIVES_GENERATED_EXT && + device->physical_device->rad_info.gfx_level >= GFX11)) pool->size += 4 * pCreateInfo->queryCount; result = device->ws->buffer_create(device->ws, pool->size, 64, RADEON_DOMAIN_GTT, @@ -1607,24 +1615,41 @@ radv_GetQueryPoolResults(VkDevice _device, VkQueryPool queryPool, uint32_t first break; } case VK_QUERY_TYPE_MESH_PRIMITIVES_GENERATED_EXT: { - p_atomic_uint64_t const *src64 = (p_atomic_uint64_t const *)src; uint64_t ms_prim_gen; - do { - available = 1; - if (!(p_atomic_read(&src64[0].value) & 0x8000000000000000UL) || - !(p_atomic_read(&src64[1].value) & 0x8000000000000000UL)) { - available = 0; - } - } while (!available && (flags & VK_QUERY_RESULT_WAIT_BIT) && !(timed_out = (atimeout < os_time_get_nano()))); + if (device->physical_device->rad_info.gfx_level >= GFX11) { + unsigned pipelinestat_block_size = radv_get_pipelinestat_query_size(device); + const uint32_t *avail_ptr = (const uint32_t *)(pool->ptr + pool->availability_offset + 4 * query); + + do { + available = p_atomic_read(avail_ptr); + } while (!available && (flags & VK_QUERY_RESULT_WAIT_BIT) && + !(timed_out = (atimeout < os_time_get_nano()))); + + const uint64_t *start = (uint64_t *)src; + const uint64_t *stop = (uint64_t *)(src + pipelinestat_block_size); + + ms_prim_gen = stop[pipeline_statistics_indices[13]] - start[pipeline_statistics_indices[13]]; + } else { + p_atomic_uint64_t const *src64 = (p_atomic_uint64_t const *)src; + + do { + available = 1; + if (!(p_atomic_read(&src64[0].value) & 0x8000000000000000UL) || + !(p_atomic_read(&src64[1].value) & 0x8000000000000000UL)) { + available = 0; + } + } while (!available && (flags & VK_QUERY_RESULT_WAIT_BIT) && + !(timed_out = (atimeout < os_time_get_nano()))); + + ms_prim_gen = p_atomic_read_relaxed(&src64[1].value) - p_atomic_read_relaxed(&src64[0].value); + } if (timed_out) result = VK_ERROR_DEVICE_LOST; else if (!available && !(flags & VK_QUERY_RESULT_PARTIAL_BIT)) result = VK_NOT_READY; - ms_prim_gen = p_atomic_read_relaxed(&src64[1].value) - p_atomic_read_relaxed(&src64[0].value); - if (flags & VK_QUERY_RESULT_64_BIT) { if (available || (flags & VK_QUERY_RESULT_PARTIAL_BIT)) *(uint64_t *)dest = ms_prim_gen; @@ -1844,22 +1869,41 @@ radv_CmdCopyQueryPoolResults(VkCommandBuffer commandBuffer, VkQueryPool queryPoo pool->uses_gds && cmd_buffer->device->physical_device->rad_info.gfx_level < GFX11); break; case VK_QUERY_TYPE_MESH_PRIMITIVES_GENERATED_EXT: - if (flags & VK_QUERY_RESULT_WAIT_BIT) { - for (unsigned i = 0; i < queryCount; i++) { - unsigned query = firstQuery + i; - uint64_t src_va = va + query * pool->stride; + if (cmd_buffer->device->physical_device->rad_info.gfx_level >= GFX11) { + if (flags & VK_QUERY_RESULT_WAIT_BIT) { + for (unsigned i = 0; i < queryCount; ++i, dest_va += stride) { + unsigned query = firstQuery + i; - radeon_check_space(cmd_buffer->device->ws, cs, 7 * 2); + radeon_check_space(cmd_buffer->device->ws, cs, 7); - /* Wait on the upper word. */ - radv_cp_wait_mem(cs, cmd_buffer->qf, WAIT_REG_MEM_GREATER_OR_EQUAL, src_va + 4, 0x80000000, 0xffffffff); - radv_cp_wait_mem(cs, cmd_buffer->qf, WAIT_REG_MEM_GREATER_OR_EQUAL, src_va + 12, 0x80000000, 0xffffffff); + uint64_t avail_va = va + pool->availability_offset + 4 * query; + + /* This waits on the ME. All copies below are done on the ME */ + radv_cp_wait_mem(cs, cmd_buffer->qf, WAIT_REG_MEM_EQUAL, avail_va, 1, 0xffffffff); + } } - } + radv_query_shader(cmd_buffer, &cmd_buffer->device->meta_state.query.pipeline_statistics_query_pipeline, + pool->bo, dst_buffer->bo, firstQuery * pool->stride, dst_buffer->offset + dstOffset, + pool->stride, stride, dst_size, queryCount, flags, 1 << 13, + pool->availability_offset + 4 * firstQuery, false); + } else { + if (flags & VK_QUERY_RESULT_WAIT_BIT) { + for (unsigned i = 0; i < queryCount; i++) { + unsigned query = firstQuery + i; + uint64_t src_va = va + query * pool->stride; - radv_query_shader(cmd_buffer, &cmd_buffer->device->meta_state.query.ms_prim_gen_query_pipeline, pool->bo, - dst_buffer->bo, firstQuery * pool->stride, dst_buffer->offset + dstOffset, pool->stride, stride, - dst_size, queryCount, flags, 0, 0, false); + radeon_check_space(cmd_buffer->device->ws, cs, 7 * 2); + + /* Wait on the upper word. */ + radv_cp_wait_mem(cs, cmd_buffer->qf, WAIT_REG_MEM_GREATER_OR_EQUAL, src_va + 4, 0x80000000, 0xffffffff); + radv_cp_wait_mem(cs, cmd_buffer->qf, WAIT_REG_MEM_GREATER_OR_EQUAL, src_va + 12, 0x80000000, 0xffffffff); + } + } + + radv_query_shader(cmd_buffer, &cmd_buffer->device->meta_state.query.ms_prim_gen_query_pipeline, pool->bo, + dst_buffer->bo, firstQuery * pool->stride, dst_buffer->offset + dstOffset, pool->stride, + stride, dst_size, queryCount, flags, 0, 0, false); + } break; default: unreachable("trying to get results of unhandled query type"); @@ -1898,7 +1942,9 @@ radv_CmdResetQueryPool(VkCommandBuffer commandBuffer, VkQueryPool queryPool, uin flush_bits |= radv_fill_buffer(cmd_buffer, NULL, pool->bo, radv_buffer_get_va(pool->bo) + firstQuery * pool->stride, queryCount * pool->stride, value); - if (pool->vk.query_type == VK_QUERY_TYPE_PIPELINE_STATISTICS) { + if (pool->vk.query_type == VK_QUERY_TYPE_PIPELINE_STATISTICS || + (pool->vk.query_type == VK_QUERY_TYPE_MESH_PRIMITIVES_GENERATED_EXT && + cmd_buffer->device->physical_device->rad_info.gfx_level >= GFX11)) { flush_bits |= radv_fill_buffer(cmd_buffer, NULL, pool->bo, radv_buffer_get_va(pool->bo) + pool->availability_offset + firstQuery * 4, queryCount * 4, 0); @@ -1914,6 +1960,7 @@ radv_CmdResetQueryPool(VkCommandBuffer commandBuffer, VkQueryPool queryPool, uin VKAPI_ATTR void VKAPI_CALL radv_ResetQueryPool(VkDevice _device, VkQueryPool queryPool, uint32_t firstQuery, uint32_t queryCount) { + RADV_FROM_HANDLE(radv_device, device, _device); RADV_FROM_HANDLE(radv_query_pool, pool, queryPool); uint32_t value = query_clear_value(pool->vk.query_type); @@ -1923,7 +1970,9 @@ radv_ResetQueryPool(VkDevice _device, VkQueryPool queryPool, uint32_t firstQuery for (uint32_t *p = data; p != data_end; ++p) *p = value; - if (pool->vk.query_type == VK_QUERY_TYPE_PIPELINE_STATISTICS) { + if (pool->vk.query_type == VK_QUERY_TYPE_PIPELINE_STATISTICS || + (pool->vk.query_type == VK_QUERY_TYPE_MESH_PRIMITIVES_GENERATED_EXT && + device->physical_device->rad_info.gfx_level >= GFX11)) { memset(pool->ptr + pool->availability_offset + firstQuery * 4, 0, queryCount * 4); } } @@ -2193,16 +2242,29 @@ emit_begin_query(struct radv_cmd_buffer *cmd_buffer, struct radv_query_pool *poo break; } case VK_QUERY_TYPE_MESH_PRIMITIVES_GENERATED_EXT: { - gfx10_copy_gds_query_gfx(cmd_buffer, RADV_SHADER_QUERY_MS_PRIM_GEN_OFFSET, va); - radv_cs_write_data_imm(cs, V_370_ME, va + 4, 0x80000000); + if (cmd_buffer->device->physical_device->rad_info.gfx_level >= GFX11) { + radeon_check_space(cmd_buffer->device->ws, cs, 4); - /* Record that the command buffer needs GDS. */ - cmd_buffer->gds_needed = true; + ++cmd_buffer->state.active_pipeline_queries; - if (!cmd_buffer->state.active_prims_gen_gds_queries) - cmd_buffer->state.dirty |= RADV_CMD_DIRTY_SHADER_QUERY; + radv_update_hw_pipelinestat(cmd_buffer); - cmd_buffer->state.active_prims_gen_gds_queries++; + radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 2, 0)); + radeon_emit(cs, EVENT_TYPE(V_028A90_SAMPLE_PIPELINESTAT) | EVENT_INDEX(2)); + radeon_emit(cs, va); + radeon_emit(cs, va >> 32); + } else { + gfx10_copy_gds_query_gfx(cmd_buffer, RADV_SHADER_QUERY_MS_PRIM_GEN_OFFSET, va); + radv_cs_write_data_imm(cs, V_370_ME, va + 4, 0x80000000); + + /* Record that the command buffer needs GDS. */ + cmd_buffer->gds_needed = true; + + if (!cmd_buffer->state.active_prims_gen_gds_queries) + cmd_buffer->state.dirty |= RADV_CMD_DIRTY_SHADER_QUERY; + + cmd_buffer->state.active_prims_gen_gds_queries++; + } break; } default: @@ -2381,13 +2443,34 @@ emit_end_query(struct radv_cmd_buffer *cmd_buffer, struct radv_query_pool *pool, break; } case VK_QUERY_TYPE_MESH_PRIMITIVES_GENERATED_EXT: { - gfx10_copy_gds_query_gfx(cmd_buffer, RADV_SHADER_QUERY_MS_PRIM_GEN_OFFSET, va + 8); - radv_cs_write_data_imm(cs, V_370_ME, va + 12, 0x80000000); + if (cmd_buffer->device->physical_device->rad_info.gfx_level >= GFX11) { + unsigned pipelinestat_block_size = radv_get_pipelinestat_query_size(cmd_buffer->device); - cmd_buffer->state.active_prims_gen_gds_queries--; + radeon_check_space(cmd_buffer->device->ws, cs, 16); - if (!cmd_buffer->state.active_prims_gen_gds_queries) - cmd_buffer->state.dirty |= RADV_CMD_DIRTY_SHADER_QUERY; + cmd_buffer->state.active_pipeline_queries--; + + radv_update_hw_pipelinestat(cmd_buffer); + + va += pipelinestat_block_size; + + radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 2, 0)); + radeon_emit(cs, EVENT_TYPE(V_028A90_SAMPLE_PIPELINESTAT) | EVENT_INDEX(2)); + radeon_emit(cs, va); + radeon_emit(cs, va >> 32); + + radv_cs_emit_write_event_eop(cs, cmd_buffer->device->physical_device->rad_info.gfx_level, cmd_buffer->qf, + V_028A90_BOTTOM_OF_PIPE_TS, 0, EOP_DST_SEL_MEM, EOP_DATA_SEL_VALUE_32BIT, + avail_va, 1, cmd_buffer->gfx9_eop_bug_va); + } else { + gfx10_copy_gds_query_gfx(cmd_buffer, RADV_SHADER_QUERY_MS_PRIM_GEN_OFFSET, va + 8); + radv_cs_write_data_imm(cs, V_370_ME, va + 12, 0x80000000); + + cmd_buffer->state.active_prims_gen_gds_queries--; + + if (!cmd_buffer->state.active_prims_gen_gds_queries) + cmd_buffer->state.dirty |= RADV_CMD_DIRTY_SHADER_QUERY; + } break; } default: