diff --git a/src/amd/vulkan/radv_cmd_buffer.c b/src/amd/vulkan/radv_cmd_buffer.c index ac6851dc908..1caf50522f2 100644 --- a/src/amd/vulkan/radv_cmd_buffer.c +++ b/src/amd/vulkan/radv_cmd_buffer.c @@ -509,6 +509,7 @@ radv_reset_cmd_buffer(struct radv_cmd_buffer *cmd_buffer) cmd_buffer->gsvs_ring_size_needed = 0; cmd_buffer->tess_rings_needed = false; cmd_buffer->task_rings_needed = false; + cmd_buffer->mesh_scratch_ring_needed = false; cmd_buffer->gds_needed = false; cmd_buffer->gds_oa_needed = false; cmd_buffer->sample_positions_needed = false; @@ -5260,6 +5261,9 @@ radv_CmdBindPipeline(VkCommandBuffer commandBuffer, VkPipelineBindPoint pipeline if (radv_pipeline_has_stage(graphics_pipeline, MESA_SHADER_TESS_CTRL)) cmd_buffer->tess_rings_needed = true; + if (mesh_shading) + cmd_buffer->mesh_scratch_ring_needed |= + pipeline->shaders[MESA_SHADER_MESH]->info.ms.needs_ms_scratch_ring; if (radv_pipeline_has_stage(graphics_pipeline, MESA_SHADER_TASK)) { cmd_buffer->task_rings_needed = true; @@ -5801,6 +5805,8 @@ radv_CmdExecuteCommands(VkCommandBuffer commandBuffer, uint32_t commandBufferCou primary->tess_rings_needed = true; if (secondary->task_rings_needed) primary->task_rings_needed = true; + if (secondary->mesh_scratch_ring_needed) + primary->mesh_scratch_ring_needed = true; if (secondary->sample_positions_needed) primary->sample_positions_needed = true; if (secondary->gds_needed) diff --git a/src/amd/vulkan/radv_constants.h b/src/amd/vulkan/radv_constants.h index 4b6d3b9667b..1953a5d2201 100644 --- a/src/amd/vulkan/radv_constants.h +++ b/src/amd/vulkan/radv_constants.h @@ -76,7 +76,8 @@ #define RING_HS_TESS_OFFCHIP 6 #define RING_TS_DRAW 7 #define RING_TS_PAYLOAD 8 -#define RING_PS_SAMPLE_POSITIONS 9 +#define RING_MS_SCRATCH 9 +#define RING_PS_SAMPLE_POSITIONS 10 /* max number of descriptor sets */ #define MAX_SETS 32 @@ -91,6 +92,28 @@ */ #define RADV_MAX_MEMORY_ALLOCATION_SIZE 0xFFFFFFFCull +/* Number of entries in the mesh shader scratch ring. 
+ * This depends on VGT_GS_MAX_WAVE_ID which is set by the kernel + * and is impossible to query. We size the ring for the maximum value + * because real applications are unlikely to use the scratch ring. + * + * The maximum ID on GFX10.3 is 2047 (0x7ff), so we need 2048 entries. + */ +#define RADV_MESH_SCRATCH_NUM_ENTRIES 2048 + +/* Size of each entry in the mesh shader scratch ring. + * We must ensure that the absolute maximum mesh shader output fits here. + * + * Mesh shaders can create up to 256 vertices/primitives per workgroup, + * and up to the following number of outputs: + * - 32 parameters + * - 4 positions (clip/cull distance, etc.) + * - 4 per-primitive built-in outputs (layer, view index, prim id, VRS rate) + * - primitive indices which are always kept in LDS + * That is a total of 32+4+4=40 output slots x 16 bytes per slot x 256 = 160 KiB. + */ +#define RADV_MESH_SCRATCH_ENTRY_BYTES (160 * 1024) + /* Number of invocations in each subgroup. */ #define RADV_SUBGROUP_SIZE 64 diff --git a/src/amd/vulkan/radv_device.c b/src/amd/vulkan/radv_device.c index eb66c4215cb..e92b56a8a05 100644 --- a/src/amd/vulkan/radv_device.c +++ b/src/amd/vulkan/radv_device.c @@ -3605,7 +3605,8 @@ radv_fill_shader_rings(struct radv_device *device, uint32_t *map, bool add_sampl uint32_t esgs_ring_size, struct radeon_winsys_bo *esgs_ring_bo, uint32_t gsvs_ring_size, struct radeon_winsys_bo *gsvs_ring_bo, struct radeon_winsys_bo *tess_rings_bo, - struct radeon_winsys_bo *task_rings_bo) + struct radeon_winsys_bo *task_rings_bo, + struct radeon_winsys_bo *mesh_scratch_ring_bo) { uint32_t *desc = &map[4]; @@ -3791,6 +3792,27 @@ radv_fill_shader_rings(struct radv_device *device, uint32_t *map, bool add_sampl desc += 8; + if (mesh_scratch_ring_bo) { + uint64_t va = radv_buffer_get_va(mesh_scratch_ring_bo); + + desc[0] = va; + desc[1] = S_008F04_BASE_ADDRESS_HI(va >> 32); + desc[2] = RADV_MESH_SCRATCH_NUM_ENTRIES * RADV_MESH_SCRATCH_ENTRY_BYTES; + desc[3] = S_008F0C_DST_SEL_X(V_008F0C_SQ_SEL_X) | 
S_008F0C_DST_SEL_Y(V_008F0C_SQ_SEL_Y) | + S_008F0C_DST_SEL_Z(V_008F0C_SQ_SEL_Z) | S_008F0C_DST_SEL_W(V_008F0C_SQ_SEL_W); + + if (device->physical_device->rad_info.gfx_level >= GFX11) { + desc[3] |= S_008F0C_FORMAT(V_008F0C_GFX11_FORMAT_32_UINT) | + S_008F0C_OOB_SELECT(V_008F0C_OOB_SELECT_DISABLED); + } else { + assert(device->physical_device->rad_info.gfx_level >= GFX10_3); + desc[3] |= S_008F0C_FORMAT(V_008F0C_GFX10_FORMAT_32_UINT) | + S_008F0C_OOB_SELECT(V_008F0C_OOB_SELECT_DISABLED) | S_008F0C_RESOURCE_LEVEL(1); + } + } + + desc += 4; + if (add_sample_positions) { /* add sample positions after all rings */ memcpy(desc, device->sample_locations_1x, 8); @@ -4083,6 +4105,7 @@ radv_update_preamble_cs(struct radv_queue_state *queue, struct radv_device *devi struct radeon_winsys_bo *gsvs_ring_bo = queue->gsvs_ring_bo; struct radeon_winsys_bo *tess_rings_bo = queue->tess_rings_bo; struct radeon_winsys_bo *task_rings_bo = queue->task_rings_bo; + struct radeon_winsys_bo *mesh_scratch_ring_bo = queue->mesh_scratch_ring_bo; struct radeon_winsys_bo *gds_bo = queue->gds_bo; struct radeon_winsys_bo *gds_oa_bo = queue->gds_oa_bo; struct radeon_cmdbuf *dest_cs[3] = {0}; @@ -4154,6 +4177,16 @@ radv_update_preamble_cs(struct radv_queue_state *queue, struct radv_device *devi goto fail; } + if (!queue->ring_info.mesh_scratch_ring && needs->mesh_scratch_ring) { + assert(device->physical_device->rad_info.gfx_level >= GFX10_3); + result = + ws->buffer_create(ws, RADV_MESH_SCRATCH_NUM_ENTRIES * RADV_MESH_SCRATCH_ENTRY_BYTES, 256, + RADEON_DOMAIN_VRAM, ring_bo_flags, RADV_BO_PRIORITY_SCRATCH, 0, &mesh_scratch_ring_bo); + + if (result != VK_SUCCESS) + goto fail; + } + if (!queue->ring_info.gds && needs->gds) { assert(device->physical_device->rad_info.gfx_level >= GFX10); @@ -4184,10 +4217,11 @@ radv_update_preamble_cs(struct radv_queue_state *queue, struct radv_device *devi if ((queue->qf == RADV_QUEUE_COMPUTE && !descriptor_bo && task_rings_bo) || scratch_bo != queue->scratch_bo || 
esgs_ring_bo != queue->esgs_ring_bo || gsvs_ring_bo != queue->gsvs_ring_bo || tess_rings_bo != queue->tess_rings_bo || - task_rings_bo != queue->task_rings_bo || add_sample_positions) { + task_rings_bo != queue->task_rings_bo || mesh_scratch_ring_bo != queue->mesh_scratch_ring_bo || + add_sample_positions) { uint32_t size = 0; - if (gsvs_ring_bo || esgs_ring_bo || tess_rings_bo || task_rings_bo || add_sample_positions) { - size = 144; /* 2 dword + 2 padding + 4 dword * 8 */ + if (gsvs_ring_bo || esgs_ring_bo || tess_rings_bo || task_rings_bo || mesh_scratch_ring_bo || add_sample_positions) { + size = 160; /* 2 dword + 2 padding + 4 dword * 9 */ if (add_sample_positions) size += 128; /* 64+32+16+8 = 120 bytes */ } else if (scratch_bo) { @@ -4220,10 +4254,10 @@ radv_update_preamble_cs(struct radv_queue_state *queue, struct radv_device *devi map[1] = rsrc1; } - if (esgs_ring_bo || gsvs_ring_bo || tess_rings_bo || task_rings_bo || add_sample_positions) + if (esgs_ring_bo || gsvs_ring_bo || tess_rings_bo || task_rings_bo || mesh_scratch_ring_bo || add_sample_positions) radv_fill_shader_rings(device, map, add_sample_positions, needs->esgs_ring_size, esgs_ring_bo, needs->gsvs_ring_size, gsvs_ring_bo, tess_rings_bo, - task_rings_bo); + task_rings_bo, mesh_scratch_ring_bo); ws->buffer_unmap(descriptor_bo); } @@ -4238,7 +4272,7 @@ radv_update_preamble_cs(struct radv_queue_state *queue, struct radv_device *devi /* Continue preamble is unnecessary when no shader rings are used. 
*/ if (!needs->scratch_size_per_wave && !needs->compute_scratch_size_per_wave && !needs->esgs_ring_size && !needs->gsvs_ring_size && !needs->tess_rings && - !needs->task_rings && !needs->gds && !needs->gds_oa && !needs->sample_positions) + !needs->task_rings && !needs->mesh_scratch_ring && !needs->gds && !needs->gds_oa && !needs->sample_positions) continue; } @@ -4368,6 +4402,7 @@ radv_update_preamble_cs(struct radv_queue_state *queue, struct radv_device *devi queue->tess_rings_bo = tess_rings_bo; queue->task_rings_bo = task_rings_bo; + queue->mesh_scratch_ring_bo = mesh_scratch_ring_bo; queue->gds_bo = gds_bo; queue->gds_oa_bo = gds_oa_bo; queue->ring_info = *needs; @@ -4539,6 +4574,7 @@ radv_update_preambles(struct radv_queue_state *queue, struct radv_device *device needs.gsvs_ring_size = MAX2(needs.gsvs_ring_size, cmd_buffer->gsvs_ring_size_needed); needs.tess_rings |= cmd_buffer->tess_rings_needed; needs.task_rings |= cmd_buffer->task_rings_needed; + needs.mesh_scratch_ring |= cmd_buffer->mesh_scratch_ring_needed; needs.gds |= cmd_buffer->gds_needed; needs.gds_oa |= cmd_buffer->gds_oa_needed; needs.sample_positions |= cmd_buffer->sample_positions_needed; @@ -4565,7 +4601,9 @@ radv_update_preambles(struct radv_queue_state *queue, struct radv_device *device queue->ring_info.esgs_ring_size == needs.esgs_ring_size && queue->ring_info.gsvs_ring_size == needs.gsvs_ring_size && queue->ring_info.tess_rings == needs.tess_rings && - queue->ring_info.task_rings == needs.task_rings && queue->ring_info.gds == needs.gds && + queue->ring_info.task_rings == needs.task_rings && + queue->ring_info.mesh_scratch_ring == needs.mesh_scratch_ring && + queue->ring_info.gds == needs.gds && queue->ring_info.gds_oa == needs.gds_oa && queue->ring_info.sample_positions == needs.sample_positions) return VK_SUCCESS; diff --git a/src/amd/vulkan/radv_nir_lower_abi.c b/src/amd/vulkan/radv_nir_lower_abi.c index 0570a0f1d88..7b6231114c4 100644 --- a/src/amd/vulkan/radv_nir_lower_abi.c +++ 
b/src/amd/vulkan/radv_nir_lower_abi.c @@ -173,6 +173,14 @@ lower_abi_instr(nir_builder *b, nir_instr *instr, void *state) case nir_intrinsic_load_ring_task_payload_amd: return load_ring(b, RING_TS_PAYLOAD, s); + case nir_intrinsic_load_ring_mesh_scratch_amd: + return load_ring(b, RING_MS_SCRATCH, s); + + case nir_intrinsic_load_ring_mesh_scratch_offset_amd: + /* gs_tg_info[0:11] is ordered_wave_id. Multiply by the ring entry size. */ + return nir_imul_imm(b, nir_iand_imm(b, ac_nir_load_arg(b, &s->args->ac, s->args->ac.gs_tg_info), 0xfff), + RADV_MESH_SCRATCH_ENTRY_BYTES); + case nir_intrinsic_load_task_ring_entry_amd: return ac_nir_load_arg(b, &s->args->ac, s->args->ac.task_ring_entry); @@ -230,6 +238,8 @@ filter_abi_instr(const nir_instr *instr, intrin->intrinsic == nir_intrinsic_load_viewport_y_offset || intrin->intrinsic == nir_intrinsic_load_ring_task_draw_amd || intrin->intrinsic == nir_intrinsic_load_ring_task_payload_amd || + intrin->intrinsic == nir_intrinsic_load_ring_mesh_scratch_amd || + intrin->intrinsic == nir_intrinsic_load_ring_mesh_scratch_offset_amd || intrin->intrinsic == nir_intrinsic_load_task_ring_entry_amd || intrin->intrinsic == nir_intrinsic_load_task_ib_addr || intrin->intrinsic == nir_intrinsic_load_task_ib_stride || diff --git a/src/amd/vulkan/radv_private.h b/src/amd/vulkan/radv_private.h index 9cad5b92241..0d409d49d58 100644 --- a/src/amd/vulkan/radv_private.h +++ b/src/amd/vulkan/radv_private.h @@ -717,6 +717,7 @@ struct radv_queue_ring_info { uint32_t gsvs_ring_size; bool tess_rings; bool task_rings; + bool mesh_scratch_ring; bool gds; bool gds_oa; bool sample_positions; @@ -733,6 +734,7 @@ struct radv_queue_state { struct radeon_winsys_bo *gsvs_ring_bo; struct radeon_winsys_bo *tess_rings_bo; struct radeon_winsys_bo *task_rings_bo; + struct radeon_winsys_bo *mesh_scratch_ring_bo; struct radeon_winsys_bo *gds_bo; struct radeon_winsys_bo *gds_oa_bo; @@ -1568,6 +1570,7 @@ struct radv_cmd_buffer { uint32_t gsvs_ring_size_needed; bool 
tess_rings_needed; bool task_rings_needed; + bool mesh_scratch_ring_needed; bool gds_needed; /* for GFX10 streamout and NGG GS queries */ bool gds_oa_needed; /* for GFX10 streamout */ bool sample_positions_needed; diff --git a/src/amd/vulkan/radv_shader.c b/src/amd/vulkan/radv_shader.c index 3f4d1b3551a..f7e5470aec7 100644 --- a/src/amd/vulkan/radv_shader.c +++ b/src/amd/vulkan/radv_shader.c @@ -1242,6 +1242,7 @@ void radv_lower_ngg(struct radv_device *device, struct radv_pipeline_stage *ngg_ } else if (nir->info.stage == MESA_SHADER_MESH) { bool scratch_ring = false; NIR_PASS_V(nir, ac_nir_lower_ngg_ms, &scratch_ring, info->wave_size, pl_key->has_multiview_view_index); + ngg_stage->info.ms.needs_ms_scratch_ring = scratch_ring; } else { unreachable("invalid SW stage passed to radv_lower_ngg"); } diff --git a/src/amd/vulkan/radv_shader.h b/src/amd/vulkan/radv_shader.h index 0044d1e98d6..fbd43140184 100644 --- a/src/amd/vulkan/radv_shader.h +++ b/src/amd/vulkan/radv_shader.h @@ -364,6 +364,7 @@ struct radv_shader_info { struct { struct radv_vs_output_info outinfo; enum shader_prim output_prim; + bool needs_ms_scratch_ring; } ms; struct radv_streamout_info so;