diff --git a/src/amd/common/ac_shader_args.h b/src/amd/common/ac_shader_args.h index 6b71048433a..2a71360099c 100644 --- a/src/amd/common/ac_shader_args.h +++ b/src/amd/common/ac_shader_args.h @@ -111,6 +111,7 @@ struct ac_shader_args { struct ac_arg es2gs_offset; /* separate legacy ES */ struct ac_arg gs2vs_offset; /* legacy GS */ struct ac_arg gs_wave_id; /* legacy GS */ + struct ac_arg gs_attr_offset; /* gfx11+: attribute ring offset in 512B increments */ struct ac_arg gs_vtx_offset[6]; /* GFX6-8: [0-5], GFX9+: [0-2] packed */ struct ac_arg gs_prim_id; struct ac_arg gs_invocation_id; diff --git a/src/gallium/drivers/radeonsi/gfx10_shader_ngg.c b/src/gallium/drivers/radeonsi/gfx10_shader_ngg.c index 1072d9b3a22..d17ad48d1b6 100644 --- a/src/gallium/drivers/radeonsi/gfx10_shader_ngg.c +++ b/src/gallium/drivers/radeonsi/gfx10_shader_ngg.c @@ -1322,6 +1322,8 @@ void gfx10_emit_ngg_culling_epilogue(struct ac_shader_abi *abi) ret = LLVMBuildInsertValue(ctx->ac.builder, ret, new_merged_wave_info, 3, ""); if (ctx->stage == MESA_SHADER_TESS_EVAL) ret = si_insert_input_ret(ctx, ret, ctx->args.tess_offchip_offset, 4); + if (ctx->ac.chip_class >= GFX11) + ret = si_insert_input_ret(ctx, ret, ctx->args.gs_attr_offset, 5); ret = si_insert_input_ptr(ctx, ret, ctx->internal_bindings, 8 + SI_SGPR_INTERNAL_BINDINGS); ret = si_insert_input_ptr(ctx, ret, ctx->bindless_samplers_and_images, @@ -1330,6 +1332,8 @@ void gfx10_emit_ngg_culling_epilogue(struct ac_shader_abi *abi) 8 + SI_SGPR_CONST_AND_SHADER_BUFFERS); ret = si_insert_input_ptr(ctx, ret, ctx->samplers_and_images, 8 + SI_SGPR_SAMPLERS_AND_IMAGES); ret = si_insert_input_ptr(ctx, ret, ctx->vs_state_bits, 8 + SI_SGPR_VS_STATE_BITS); + if (ctx->ac.chip_class >= GFX11) + ret = si_insert_input_ptr(ctx, ret, ctx->gs_attr_address, 8 + GFX9_SGPR_ATTRIBUTE_RING_ADDR); if (ctx->stage == MESA_SHADER_VERTEX) { ret = si_insert_input_ptr(ctx, ret, ctx->args.base_vertex, 8 + SI_SGPR_BASE_VERTEX); diff --git 
a/src/gallium/drivers/radeonsi/si_blit.c b/src/gallium/drivers/radeonsi/si_blit.c index 20083c1ce85..2c7d1db6f60 100644 --- a/src/gallium/drivers/radeonsi/si_blit.c +++ b/src/gallium/drivers/radeonsi/si_blit.c @@ -99,6 +99,9 @@ void si_blitter_end(struct si_context *sctx) * non-global VS user SGPRs. */ sctx->shader_pointers_dirty |= SI_DESCS_SHADER_MASK(VERTEX); + if (sctx->chip_class >= GFX11) + sctx->gs_attribute_ring_pointer_dirty = true; + /* Reset SI_SGPR_SMALL_PRIM_CULL_INFO: */ if (sctx->screen->use_ngg_culling) si_mark_atom_dirty(sctx, &sctx->atoms.s.ngg_cull_state); diff --git a/src/gallium/drivers/radeonsi/si_descriptors.c b/src/gallium/drivers/radeonsi/si_descriptors.c index ebdc8cddbd7..962fc5db024 100644 --- a/src/gallium/drivers/radeonsi/si_descriptors.c +++ b/src/gallium/drivers/radeonsi/si_descriptors.c @@ -2072,6 +2072,8 @@ void si_shader_pointers_mark_dirty(struct si_context *sctx) sctx->compute_bindless_pointer_dirty = sctx->bindless_descriptors.buffer != NULL; sctx->compute_shaderbuf_sgprs_dirty = true; sctx->compute_image_sgprs_dirty = true; + if (sctx->chip_class >= GFX11) + sctx->gs_attribute_ring_pointer_dirty = true; } /* Set a base register address for user data constants in the given shader. 
@@ -2227,6 +2229,13 @@ void si_emit_graphics_shader_pointers(struct si_context *sctx) sh_base[PIPE_SHADER_TESS_CTRL]); si_emit_consecutive_shader_pointers(sctx, SI_DESCS_SHADER_MASK(GEOMETRY), sh_base[PIPE_SHADER_GEOMETRY]); + + if (sctx->gs_attribute_ring_pointer_dirty) { + assert(sctx->chip_class >= GFX11); + radeon_set_sh_reg(R_00B230_SPI_SHADER_USER_DATA_GS_0 + GFX9_SGPR_ATTRIBUTE_RING_ADDR * 4, + sctx->screen->attribute_ring->gpu_address); + sctx->gs_attribute_ring_pointer_dirty = false; + } radeon_end(); sctx->shader_pointers_dirty &= ~u_bit_consecutive(SI_DESCS_INTERNAL, SI_DESCS_FIRST_COMPUTE); @@ -2791,6 +2800,9 @@ void si_init_all_descriptors(struct si_context *sctx) si_get_user_data_base(sctx->chip_class, TESS_OFF, GS_OFF, NGG_OFF, PIPE_SHADER_GEOMETRY)); si_set_user_data_base(sctx, PIPE_SHADER_FRAGMENT, R_00B030_SPI_SHADER_USER_DATA_PS_0); + + si_set_ring_buffer(sctx, SI_GS_ATTRIBUTE_RING, &sctx->screen->attribute_ring->b.b, + 0, ~0u, false, true, 16, 32, 0); } static bool si_upload_shader_descriptors(struct si_context *sctx, unsigned mask) diff --git a/src/gallium/drivers/radeonsi/si_gfx_cs.c b/src/gallium/drivers/radeonsi/si_gfx_cs.c index e8f5cb6302b..e902b311052 100644 --- a/src/gallium/drivers/radeonsi/si_gfx_cs.c +++ b/src/gallium/drivers/radeonsi/si_gfx_cs.c @@ -385,6 +385,10 @@ void si_begin_new_gfx_cs(struct si_context *ctx, bool first_cs) if (ctx->screen->info.has_vgt_flush_ngg_legacy_bug && !ctx->ngg) ctx->flags |= SI_CONTEXT_VGT_FLUSH; + if (ctx->screen->attribute_ring) { + radeon_add_to_buffer_list(ctx, &ctx->gfx_cs, ctx->screen->attribute_ring, + RADEON_USAGE_READWRITE | RADEON_PRIO_SHADER_RINGS); + } if (ctx->border_color_buffer) { radeon_add_to_buffer_list(ctx, &ctx->gfx_cs, ctx->border_color_buffer, RADEON_USAGE_READ | RADEON_PRIO_BORDER_COLORS); diff --git a/src/gallium/drivers/radeonsi/si_pipe.c b/src/gallium/drivers/radeonsi/si_pipe.c index 1952e3a208c..1a1a6bd54ba 100644 --- a/src/gallium/drivers/radeonsi/si_pipe.c +++ 
b/src/gallium/drivers/radeonsi/si_pipe.c @@ -878,6 +878,8 @@ static void si_destroy_screen(struct pipe_screen *pscreen) sscreen->num_disk_shader_cache_misses); } + si_resource_reference(&sscreen->attribute_ring, NULL); + simple_mtx_destroy(&sscreen->aux_context_lock); if (sscreen->aux_context) { @@ -1381,6 +1383,17 @@ static struct pipe_screen *radeonsi_screen_create_impl(struct radeon_winsys *ws, sscreen->ngg_subgroup_size = 128; + if (sscreen->info.chip_class >= GFX11) { + /* TODO: tweak this */ + unsigned attr_ring_size_per_se = align(1400000, 64 * 1024); + unsigned attr_ring_size = attr_ring_size_per_se * sscreen->info.max_se; + assert(attr_ring_size <= 16 * 1024 * 1024); /* maximum size */ + sscreen->attribute_ring = si_aligned_buffer_create(&sscreen->b, SI_RESOURCE_FLAG_32BIT, + PIPE_USAGE_DEFAULT, + /* TODO: remove the overallocation */ + attr_ring_size * 16, 2 * 1024 * 1024); + } + /* Create the auxiliary context. This must be done last. */ sscreen->aux_context = si_create_context( &sscreen->b, diff --git a/src/gallium/drivers/radeonsi/si_pipe.h b/src/gallium/drivers/radeonsi/si_pipe.h index 9ee492b3acd..f88b71334fc 100644 --- a/src/gallium/drivers/radeonsi/si_pipe.h +++ b/src/gallium/drivers/radeonsi/si_pipe.h @@ -707,6 +707,8 @@ struct si_screen { struct util_idalloc_mt buffer_ids; struct util_vertex_state_cache vertex_state_cache; + + struct si_resource *attribute_ring; }; struct si_sampler_view { @@ -1209,6 +1211,7 @@ struct si_context { bool bindless_descriptors_dirty; bool graphics_bindless_pointer_dirty; bool compute_bindless_pointer_dirty; + bool gs_attribute_ring_pointer_dirty; /* Allocated bindless handles */ struct hash_table *tex_handles; diff --git a/src/gallium/drivers/radeonsi/si_shader.c b/src/gallium/drivers/radeonsi/si_shader.c index 924114104ac..07f2932b4c4 100644 --- a/src/gallium/drivers/radeonsi/si_shader.c +++ b/src/gallium/drivers/radeonsi/si_shader.c @@ -542,7 +542,10 @@ void si_init_shader_args(struct si_shader_context *ctx, bool 
ngg_cull_shader) ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->args.merged_wave_info); ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->args.tess_offchip_offset); - ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->args.scratch_offset); + if (ctx->screen->info.chip_class >= GFX11) + ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->args.gs_attr_offset); + else + ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->args.scratch_offset); ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, NULL); /* unused */ ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, NULL); /* unused */ @@ -573,6 +576,10 @@ void si_init_shader_args(struct si_shader_context *ctx, bool ngg_cull_shader) } ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_CONST_DESC_PTR, &ctx->small_prim_cull_info); + if (ctx->screen->info.chip_class >= GFX11) + ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->gs_attr_address); + else + ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, NULL); /* unused */ if (ctx->stage == MESA_SHADER_VERTEX) declare_vb_descriptor_input_sgprs(ctx); diff --git a/src/gallium/drivers/radeonsi/si_shader.h b/src/gallium/drivers/radeonsi/si_shader.h index 98408fb508a..4a709afb599 100644 --- a/src/gallium/drivers/radeonsi/si_shader.h +++ b/src/gallium/drivers/radeonsi/si_shader.h @@ -215,6 +215,7 @@ enum SI_GSCOPY_NUM_USER_SGPR = SI_NUM_VS_STATE_RESOURCE_SGPRS, GFX9_SGPR_SMALL_PRIM_CULL_INFO = MAX2(SI_VS_NUM_USER_SGPR, SI_TES_NUM_USER_SGPR), + GFX9_SGPR_ATTRIBUTE_RING_ADDR, GFX9_GS_NUM_USER_SGPR, /* PS only */ diff --git a/src/gallium/drivers/radeonsi/si_shader_internal.h b/src/gallium/drivers/radeonsi/si_shader_internal.h index a9747c15b7f..9a7288f21d7 100644 --- a/src/gallium/drivers/radeonsi/si_shader_internal.h +++ b/src/gallium/drivers/radeonsi/si_shader_internal.h @@ -73,6 +73,7 @@ struct si_shader_context { struct ac_arg internal_bindings; struct ac_arg bindless_samplers_and_images; struct ac_arg small_prim_cull_info; + struct ac_arg 
gs_attr_address; /* API VS */ struct ac_arg vb_descriptors[5]; struct ac_arg vertex_index0; diff --git a/src/gallium/drivers/radeonsi/si_shader_llvm_gs.c b/src/gallium/drivers/radeonsi/si_shader_llvm_gs.c index 57f7523a038..0292a7f2145 100644 --- a/src/gallium/drivers/radeonsi/si_shader_llvm_gs.c +++ b/src/gallium/drivers/radeonsi/si_shader_llvm_gs.c @@ -110,14 +110,18 @@ static void si_set_es_return_value_for_gs(struct si_shader_context *ctx) else ret = si_insert_input_ret(ctx, ret, ctx->args.gs2vs_offset, 2); ret = si_insert_input_ret(ctx, ret, ctx->args.merged_wave_info, 3); - ret = si_insert_input_ret(ctx, ret, ctx->args.scratch_offset, 5); - + if (ctx->screen->info.chip_class >= GFX11) + ret = si_insert_input_ret(ctx, ret, ctx->args.gs_attr_offset, 5); + else + ret = si_insert_input_ret(ctx, ret, ctx->args.scratch_offset, 5); ret = si_insert_input_ptr(ctx, ret, ctx->internal_bindings, 8 + SI_SGPR_INTERNAL_BINDINGS); ret = si_insert_input_ptr(ctx, ret, ctx->bindless_samplers_and_images, 8 + SI_SGPR_BINDLESS_SAMPLERS_AND_IMAGES); if (ctx->screen->use_ngg) { ret = si_insert_input_ptr(ctx, ret, ctx->vs_state_bits, 8 + SI_SGPR_VS_STATE_BITS); ret = si_insert_input_ptr(ctx, ret, ctx->small_prim_cull_info, 8 + GFX9_SGPR_SMALL_PRIM_CULL_INFO); + if (ctx->screen->info.chip_class >= GFX11) + ret = si_insert_input_ptr(ctx, ret, ctx->gs_attr_address, 8 + GFX9_SGPR_ATTRIBUTE_RING_ADDR); } unsigned vgpr = 8 + GFX9_GS_NUM_USER_SGPR; diff --git a/src/gallium/drivers/radeonsi/si_shader_llvm_vs.c b/src/gallium/drivers/radeonsi/si_shader_llvm_vs.c index 90b169ff198..016f812f2b9 100644 --- a/src/gallium/drivers/radeonsi/si_shader_llvm_vs.c +++ b/src/gallium/drivers/radeonsi/si_shader_llvm_vs.c @@ -690,6 +690,9 @@ void si_llvm_build_vs_exports(struct si_shader_context *ctx, ac_build_export(&ctx->ac, &pos_args[i]); } + if (!shader->info.nr_param_exports) + return; + /* Build parameter exports. Use 2 loops to export params in ascending order. 
* 32 is the maximum number of parameter exports. */ @@ -707,8 +710,61 @@ void si_llvm_build_vs_exports(struct si_shader_context *ctx, &param_exports[offset]); } - for (unsigned i = 0; i < shader->info.nr_param_exports; i++) - ac_build_export(&ctx->ac, &param_exports[i]); + if (ctx->screen->info.chip_class >= GFX11) { + /* Get the attribute ring address and descriptor. */ + LLVMValueRef attr_address; + if (ctx->stage == MESA_SHADER_VERTEX && shader->selector->info.base.vs.blit_sgprs_amd) { + LLVMValueRef ptr = + LLVMBuildPointerCast(ctx->ac.builder, + ac_get_arg(&ctx->ac, ctx->internal_bindings), + LLVMPointerType(ctx->ac.i32, AC_ADDR_SPACE_CONST_32BIT), ""); + attr_address = ac_build_load_to_sgpr(&ctx->ac, ptr, + LLVMConstInt(ctx->ac.i32, SI_GS_ATTRIBUTE_RING * 4, 0)); + } else { + attr_address = ac_get_arg(&ctx->ac, ctx->gs_attr_address); + } + + unsigned stride = 16 * shader->info.nr_param_exports; + LLVMValueRef attr_desc[4] = { + attr_address, + LLVMConstInt(ctx->ac.i32, S_008F04_BASE_ADDRESS_HI(ctx->screen->info.address32_hi) | + S_008F04_STRIDE(stride) | + S_008F04_SWIZZLE_ENABLE_GFX11(3) /* 16B */, 0), + LLVMConstInt(ctx->ac.i32, 0xffffffff, 0), + LLVMConstInt(ctx->ac.i32, S_008F0C_DST_SEL_X(V_008F0C_SQ_SEL_X) | + S_008F0C_DST_SEL_Y(V_008F0C_SQ_SEL_Y) | + S_008F0C_DST_SEL_Z(V_008F0C_SQ_SEL_Z) | + S_008F0C_DST_SEL_W(V_008F0C_SQ_SEL_W) | + S_008F0C_FORMAT(V_008F0C_GFX11_FORMAT_32_32_32_32_FLOAT) | + S_008F0C_INDEX_STRIDE(2) /* 32 elements */, 0), + }; + LLVMValueRef attr_rsrc = ac_build_gather_values(&ctx->ac, attr_desc, 4); + LLVMValueRef attr_offset = LLVMBuildShl(ctx->ac.builder, + si_unpack_param(ctx, ctx->args.gs_attr_offset, 0, 15), + LLVMConstInt(ctx->ac.i32, 9, 0), ""); /* 512B increments */ + LLVMValueRef vindex = gfx10_get_thread_id_in_tg(ctx); + + LLVMValueRef soffset[32]; + + /* Compute scalar offsets first. 
*/ + for (unsigned i = 0; i < shader->info.nr_param_exports; i++) { + soffset[i] = LLVMBuildAdd(ctx->ac.builder, attr_offset, + LLVMConstInt(ctx->ac.i32, 32 * i * 16, 0), ""); + } + + /* Write attributes to the attribute ring buffer. */ + for (unsigned i = 0; i < shader->info.nr_param_exports; i++) { + LLVMValueRef vdata = ac_build_gather_values_extended(&ctx->ac, param_exports[i].out, + 4, 1, false); + + ac_build_buffer_store_dword(&ctx->ac, attr_rsrc, vdata, vindex, + ctx->ac.i32_0, soffset[i], ac_swizzled); + } + } else { + /* Export attributes using parameter exports. */ + for (unsigned i = 0; i < shader->info.nr_param_exports; i++) + ac_build_export(&ctx->ac, &param_exports[i]); + } } void si_llvm_emit_vs_epilogue(struct ac_shader_abi *abi) diff --git a/src/gallium/drivers/radeonsi/si_state.c b/src/gallium/drivers/radeonsi/si_state.c index 9c45531d3d5..5696b5ac6ef 100644 --- a/src/gallium/drivers/radeonsi/si_state.c +++ b/src/gallium/drivers/radeonsi/si_state.c @@ -5791,5 +5791,27 @@ void si_init_cs_preamble_state(struct si_context *sctx, bool uses_reg_shadowing) S_028848_SAMPLE_ITER_COMBINER_MODE(V_028848_VRS_COMB_MODE_OVERRIDE)); } + if (sctx->chip_class >= GFX11) { + /* We must wait for idle before changing the SPI attribute ring registers. */ + si_pm4_cmd_add(pm4, PKT3(PKT3_EVENT_WRITE, 0, 0)); + si_pm4_cmd_add(pm4, EVENT_TYPE(V_028A90_VS_PARTIAL_FLUSH) | EVENT_INDEX(4)); + + si_pm4_cmd_add(pm4, PKT3(PKT3_EVENT_WRITE, 0, 0)); + si_pm4_cmd_add(pm4, EVENT_TYPE(V_028A90_VGT_FLUSH) | EVENT_INDEX(0)); + + si_pm4_cmd_add(pm4, PKT3(PKT3_EVENT_WRITE, 0, 0)); + si_pm4_cmd_add(pm4, EVENT_TYPE(V_028A90_PS_PARTIAL_FLUSH) | EVENT_INDEX(4)); + + assert((sscreen->attribute_ring->gpu_address >> 32) == sscreen->info.address32_hi); + + /* The PS will read inputs from this address. 
*/ + si_pm4_set_reg(pm4, R_031118_SPI_ATTRIBUTE_RING_BASE, + sscreen->attribute_ring->gpu_address >> 16); + si_pm4_set_reg(pm4, R_03111C_SPI_ATTRIBUTE_RING_SIZE, + S_03111C_MEM_SIZE(((sscreen->attribute_ring->bo_size / + sscreen->info.max_se) >> 16) - 1) | + S_03111C_L1_POLICY(1)); + } + sctx->cs_preamble_state = pm4; } diff --git a/src/gallium/drivers/radeonsi/si_state.h b/src/gallium/drivers/radeonsi/si_state.h index 6dbec01861f..82e2cc35273 100644 --- a/src/gallium/drivers/radeonsi/si_state.h +++ b/src/gallium/drivers/radeonsi/si_state.h @@ -357,8 +357,8 @@ enum /* Image descriptor of color buffer 0 for KHR_blend_equation_advanced. */ SI_PS_IMAGE_COLORBUF0, SI_PS_IMAGE_COLORBUF0_HI, - SI_PS_IMAGE_COLORBUF0_FMASK, - SI_PS_IMAGE_COLORBUF0_FMASK_HI, + SI_PS_IMAGE_COLORBUF0_FMASK, /* gfx6-10 */ + SI_PS_IMAGE_COLORBUF0_FMASK_HI, /* gfx6-10 */ /* Internal constant buffers. */ SI_HS_CONST_DEFAULT_TESS_LEVELS, @@ -368,12 +368,17 @@ enum SI_PS_CONST_SAMPLE_POSITIONS, SI_RING_ESGS, /* gfx6-8 */ - SI_RING_GSVS, + SI_RING_GSVS, /* gfx6-10 */ SI_NUM_INTERNAL_BINDINGS, /* Aliases to reuse slots that are unused on other generations. */ SI_GS_QUERY_BUF = SI_RING_ESGS, /* gfx10+ */ + + /* Only u_blitter uses this (and compute should be used in most cases, so this shouldn't + * be used much). Normal draws get the address from a user SGPR. + */ + SI_GS_ATTRIBUTE_RING = SI_RING_GSVS, /* gfx11+ */ }; /* Indices into sctx->descriptors, laid out so that gfx and compute pipelines