diff --git a/src/gallium/drivers/radeonsi/si_compute.c b/src/gallium/drivers/radeonsi/si_compute.c index 0660813e3c6..6c8942d916c 100644 --- a/src/gallium/drivers/radeonsi/si_compute.c +++ b/src/gallium/drivers/radeonsi/si_compute.c @@ -100,12 +100,12 @@ static void si_create_compute_state_async(void *job, void *gdata, int thread_ind assert(thread_index < ARRAY_SIZE(sscreen->compiler)); compiler = &sscreen->compiler[thread_index]; - if (!sscreen->use_aco && !*compiler) - *compiler = si_create_llvm_compiler(sscreen); - assert(program->ir_type == PIPE_SHADER_IR_NIR); si_nir_scan_shader(sscreen, sel->nir, &sel->info); + if (!sel->info.base.use_aco_amd && !*compiler) + *compiler = si_create_llvm_compiler(sscreen); + si_get_active_slot_masks(sscreen, &sel->info, &sel->active_const_and_shader_buffers, &sel->active_samplers_and_images); diff --git a/src/gallium/drivers/radeonsi/si_nir_lower_abi.c b/src/gallium/drivers/radeonsi/si_nir_lower_abi.c index 4f1cd10417b..f1d91565d8f 100644 --- a/src/gallium/drivers/radeonsi/si_nir_lower_abi.c +++ b/src/gallium/drivers/radeonsi/si_nir_lower_abi.c @@ -695,7 +695,7 @@ static bool lower_intrinsic(nir_builder *b, nir_instr *instr, struct lower_abi_s break; case nir_intrinsic_load_tess_rel_patch_id_amd: /* LLVM need to replace patch id arg, so have to be done in LLVM backend. */ - if (!sel->screen->use_aco) + if (!sel->info.base.use_aco_amd) return false; if (stage == MESA_SHADER_TESS_CTRL) { @@ -776,7 +776,7 @@ static bool lower_tex(nir_builder *b, nir_instr *instr, struct lower_abi_state * */ /* LLVM keep non-uniform sampler as index, so can't do this in NIR. */ - if (tex->is_shadow && gfx_level >= GFX8 && gfx_level <= GFX9 && sel->screen->use_aco) { + if (tex->is_shadow && gfx_level >= GFX8 && gfx_level <= GFX9 && sel->info.base.use_aco_amd) { int samp_index = nir_tex_instr_src_index(tex, nir_tex_src_sampler_handle); int comp_index = nir_tex_instr_src_index(tex, nir_tex_src_comparator); assert(samp_index >= 0 && comp_index >= 0); diff --git a/src/gallium/drivers/radeonsi/si_shader.c b/src/gallium/drivers/radeonsi/si_shader.c index 8f29cb936df..c82537f3b3f 100644 --- a/src/gallium/drivers/radeonsi/si_shader.c +++ b/src/gallium/drivers/radeonsi/si_shader.c @@ -382,7 +382,7 @@ void si_init_shader_args(struct si_shader *shader, struct si_shader_args *args) } /* GFX11 set FLAT_SCRATCH directly instead of using this arg. */ - if (sel->screen->use_aco && sel->screen->info.gfx_level < GFX11) + if (sel->info.base.use_aco_amd && sel->screen->info.gfx_level < GFX11) ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, &args->ac.scratch_offset); /* VGPRs */ @@ -400,7 +400,7 @@ void si_init_shader_args(struct si_shader *shader, struct si_shader_args *args) ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, &args->ac.tcs_factor_offset); /* GFX11 set FLAT_SCRATCH directly instead of using this arg. */ - if (sel->screen->use_aco && sel->screen->info.gfx_level < GFX11) + if (sel->info.base.use_aco_amd && sel->screen->info.gfx_level < GFX11) ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, &args->ac.scratch_offset); /* VGPRs */ @@ -453,7 +453,7 @@ void si_init_shader_args(struct si_shader *shader, struct si_shader_args *args) ac_add_return(&args->ac, AC_ARG_VGPR); /* VS outputs passed via VGPRs to TCS. */ - if (shader->key.ge.opt.same_patch_vertices && !sel->screen->use_aco) { + if (shader->key.ge.opt.same_patch_vertices && !sel->info.base.use_aco_amd) { unsigned num_outputs = util_last_bit64(shader->selector->info.outputs_written_before_tes_gs); for (i = 0; i < num_outputs * 4; i++) ac_add_return(&args->ac, AC_ARG_VGPR); @@ -461,7 +461,7 @@ void si_init_shader_args(struct si_shader *shader, struct si_shader_args *args) } } else { /* TCS inputs are passed via VGPRs from VS. */ - if (shader->key.ge.opt.same_patch_vertices && !sel->screen->use_aco) { + if (shader->key.ge.opt.same_patch_vertices && !sel->info.base.use_aco_amd) { unsigned num_inputs = util_last_bit64(shader->previous_stage_sel->info.outputs_written_before_tes_gs); for (i = 0; i < num_inputs * 4; i++) ac_add_arg(&args->ac, AC_ARG_VGPR, 1, AC_ARG_FLOAT, NULL); @@ -574,7 +574,7 @@ void si_init_shader_args(struct si_shader *shader, struct si_shader_args *args) } /* GFX11 set FLAT_SCRATCH directly instead of using this arg. */ - if (sel->screen->use_aco && sel->screen->info.gfx_level < GFX11) + if (sel->info.base.use_aco_amd && sel->screen->info.gfx_level < GFX11) ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, &args->ac.scratch_offset); /* VGPRs */ @@ -588,7 +588,7 @@ void si_init_shader_args(struct si_shader *shader, struct si_shader_args *args) ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, &args->ac.gs_wave_id); /* GFX11 set FLAT_SCRATCH directly instead of using this arg. */ - if (sel->screen->use_aco && sel->screen->info.gfx_level < GFX11) + if (sel->info.base.use_aco_amd && sel->screen->info.gfx_level < GFX11) ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, &args->ac.scratch_offset); /* VGPRs */ @@ -641,7 +641,7 @@ void si_init_shader_args(struct si_shader *shader, struct si_shader_args *args) si_add_arg_checked(&args->ac, AC_ARG_VGPR, 1, AC_ARG_INT, &args->ac.pos_fixed_pt, SI_PARAM_POS_FIXED_PT); - if (sel->screen->use_aco) { + if (sel->info.base.use_aco_amd) { ac_compact_ps_vgpr_args(&args->ac, shader->config.spi_ps_input_addr); /* GFX11 set FLAT_SCRATCH directly instead of using this arg. */ @@ -728,7 +728,7 @@ void si_init_shader_args(struct si_shader *shader, struct si_shader_args *args) ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, &args->ac.tg_size); /* GFX11 set FLAT_SCRATCH directly instead of using this arg. */ - if (sel->screen->use_aco && sel->screen->info.gfx_level < GFX11) + if (sel->info.base.use_aco_amd && sel->screen->info.gfx_level < GFX11) ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, &args->ac.scratch_offset); /* Hardware VGPRs. */ @@ -2345,7 +2345,7 @@ struct nir_shader *si_get_nir_shader(struct si_shader *shader, NIR_PASS(progress, nir, ac_nir_lower_image_opcodes); /* LLVM does not work well with this, so is handled in llvm backend waterfall. */ - if (sel->screen->use_aco && sel->info.has_non_uniform_tex_access) { + if (sel->info.base.use_aco_amd && sel->info.has_non_uniform_tex_access) { nir_lower_non_uniform_access_options options = { .types = nir_lower_non_uniform_texture_access, }; @@ -2449,7 +2449,7 @@ struct nir_shader *si_get_nir_shader(struct si_shader *shader, ac_nir_lower_ps_options options = { .gfx_level = sel->screen->info.gfx_level, .family = sel->screen->info.family, - .use_aco = sel->screen->use_aco, + .use_aco = sel->info.base.use_aco_amd, .uses_discard = si_shader_uses_discard(shader), .alpha_to_coverage_via_mrtz = key->ps.part.epilog.alpha_to_coverage_via_mrtz, .dual_src_blend_swizzle = key->ps.part.epilog.dual_src_blend_swizzle, @@ -2538,7 +2538,7 @@ struct nir_shader *si_get_nir_shader(struct si_shader *shader, /* aco only accept scalar const, must be done after si_nir_late_opts() * which may generate vec const. */ - if (sel->screen->use_aco) + if (sel->info.base.use_aco_amd) NIR_PASS_V(nir, nir_lower_load_const_to_scalar); /* This helps LLVM form VMEM clauses and thus get more GPU cache hits. @@ -2643,7 +2643,7 @@ si_nir_generate_gs_copy_shader(struct si_screen *sscreen, si_nir_opts(gs_selector->screen, nir, false); /* aco only accept scalar const */ - if (sscreen->use_aco) + if (gsinfo->base.use_aco_amd) NIR_PASS_V(nir, nir_lower_load_const_to_scalar); if (si_can_dump_shader(sscreen, MESA_SHADER_GEOMETRY, SI_DUMP_NIR)) { @@ -2653,11 +2653,11 @@ si_nir_generate_gs_copy_shader(struct si_screen *sscreen, bool ok = #if AMD_LLVM_AVAILABLE - !sscreen->use_aco ? si_llvm_compile_shader(sscreen, compiler, shader, &args, debug, nir) : + !gs_selector->info.base.use_aco_amd ? si_llvm_compile_shader(sscreen, compiler, shader, + &args, debug, nir) : #endif si_aco_compile_shader(shader, &args, nir, debug); - if (ok) { assert(!shader->config.scratch_bytes_per_wave); ok = si_shader_binary_upload(sscreen, shader, 0) >= 0; @@ -2857,7 +2857,7 @@ bool si_compile_shader(struct si_screen *sscreen, struct ac_llvm_compiler *compi struct si_shader_selector *sel = shader->selector; /* ACO need spi_ps_input in advance to init args and used in compiler. */ - if (sel->stage == MESA_SHADER_FRAGMENT && sscreen->use_aco) + if (sel->stage == MESA_SHADER_FRAGMENT && sel->info.base.use_aco_amd) si_set_spi_ps_input_config(shader); /* We need this info only when legacy GS. */ @@ -2923,7 +2923,8 @@ bool si_compile_shader(struct si_screen *sscreen, struct ac_llvm_compiler *compi ret = #if AMD_LLVM_AVAILABLE - !sscreen->use_aco ? si_llvm_compile_shader(sscreen, compiler, shader, &args, debug, nir) : + !sel->info.base.use_aco_amd ? si_llvm_compile_shader(sscreen, compiler, shader, &args, + debug, nir) : #endif si_aco_compile_shader(shader, &args, nir, debug); @@ -3015,7 +3016,7 @@ bool si_compile_shader(struct si_screen *sscreen, struct ac_llvm_compiler *compi if (sel->screen->info.gfx_level < GFX11 && (sel->screen->info.family < CHIP_GFX940 || sel->screen->info.has_graphics) && !si_is_merged_shader(shader)) { - if (sscreen->use_aco) { + if (sel->info.base.use_aco_amd) { /* When aco scratch_offset arg is added explicitly at the beginning. * After compile if no scratch used, reduce the input sgpr count. */ @@ -3087,9 +3088,14 @@ si_get_shader_part(struct si_screen *sscreen, struct si_shader_part **list, result = CALLOC_STRUCT(si_shader_part); result->key = *key; + ASSERTED bool use_aco = sscreen->use_aco || + (stage == MESA_SHADER_FRAGMENT && + ((prolog && key->ps_prolog.use_aco) || + (!prolog && key->ps_epilog.use_aco))); + bool ok = #if AMD_LLVM_AVAILABLE - !sscreen->use_aco ? si_llvm_build_shader_part(sscreen, stage, prolog, compiler, debug, name, result) : + !use_aco ? si_llvm_build_shader_part(sscreen, stage, prolog, compiler, debug, name, result) : #endif si_aco_build_shader_part(sscreen, stage, prolog, debug, name, result); @@ -3144,6 +3150,7 @@ void si_get_ps_prolog_key(struct si_shader *shader, union si_shader_part_key *ke memset(key, 0, sizeof(*key)); key->ps_prolog.states = shader->key.ps.part.prolog; + key->ps_prolog.use_aco = info->base.use_aco_amd; key->ps_prolog.wave32 = shader->wave_size == 32; key->ps_prolog.colors_read = shader->info.ps_colors_read; key->ps_prolog.num_input_sgprs = shader->info.num_input_sgprs; @@ -3266,6 +3273,7 @@ void si_get_ps_epilog_key(struct si_shader *shader, union si_shader_part_key *ke { struct si_shader_info *info = &shader->selector->info; memset(key, 0, sizeof(*key)); + key->ps_epilog.use_aco = info->base.use_aco_amd; key->ps_epilog.wave32 = shader->wave_size == 32; key->ps_epilog.uses_discard = si_shader_uses_discard(shader); key->ps_epilog.colors_written = info->colors_written; diff --git a/src/gallium/drivers/radeonsi/si_shader.h b/src/gallium/drivers/radeonsi/si_shader.h index b0f93c6f2f4..817bb5c1fb0 100644 --- a/src/gallium/drivers/radeonsi/si_shader.h +++ b/src/gallium/drivers/radeonsi/si_shader.h @@ -642,6 +642,7 @@ struct si_ps_epilog_bits { union si_shader_part_key { struct { struct si_ps_prolog_bits states; + unsigned use_aco : 1; unsigned wave32 : 1; unsigned num_input_sgprs : 6; /* Color interpolation and two-side color selection. */ @@ -654,6 +655,7 @@ union si_shader_part_key { } ps_prolog; struct { struct si_ps_epilog_bits states; + unsigned use_aco : 1; unsigned wave32 : 1; unsigned uses_discard : 1; unsigned colors_written : 8; diff --git a/src/gallium/drivers/radeonsi/si_shader_info.c b/src/gallium/drivers/radeonsi/si_shader_info.c index a0d427ffa8c..2e639802501 100644 --- a/src/gallium/drivers/radeonsi/si_shader_info.c +++ b/src/gallium/drivers/radeonsi/si_shader_info.c @@ -9,6 +9,7 @@ #include "util/mesa-sha1.h" #include "sid.h" #include "nir.h" +#include "aco_interface.h" struct si_shader_profile si_shader_profiles[] = { @@ -619,6 +620,8 @@ void si_nir_scan_shader(struct si_screen *sscreen, const struct nir_shader *nir, { memset(info, 0, sizeof(*info)); info->base = nir->info; + info->base.use_aco_amd = aco_is_gpu_supported(&sscreen->info) && + (sscreen->use_aco || nir->info.use_aco_amd); /* Get options from shader profiles. */ for (unsigned i = 0; i < ARRAY_SIZE(si_shader_profiles); i++) { diff --git a/src/gallium/drivers/radeonsi/si_shader_nir.c b/src/gallium/drivers/radeonsi/si_shader_nir.c index 3480b765a42..5158307e91b 100644 --- a/src/gallium/drivers/radeonsi/si_shader_nir.c +++ b/src/gallium/drivers/radeonsi/si_shader_nir.c @@ -71,6 +71,7 @@ static unsigned si_lower_bit_size_callback(const nir_instr *instr, void *data) void si_nir_opts(struct si_screen *sscreen, struct nir_shader *nir, bool first) { + bool use_aco = sscreen->use_aco || nir->info.use_aco_amd; bool progress; do { @@ -80,7 +81,7 @@ void si_nir_opts(struct si_screen *sscreen, struct nir_shader *nir, bool first) NIR_PASS(progress, nir, nir_lower_vars_to_ssa); NIR_PASS(progress, nir, nir_lower_alu_to_scalar, - nir->options->lower_to_scalar_filter, (void *)sscreen->use_aco); + nir->options->lower_to_scalar_filter, (void *)use_aco); NIR_PASS(progress, nir, nir_lower_phis_to_scalar, false); if (first) { @@ -103,7 +104,7 @@ void si_nir_opts(struct si_screen *sscreen, struct nir_shader *nir, bool first) if (lower_alu_to_scalar) { NIR_PASS_V(nir, nir_lower_alu_to_scalar, - nir->options->lower_to_scalar_filter, (void *)sscreen->use_aco); + nir->options->lower_to_scalar_filter, (void *)use_aco); } if (lower_phis_to_scalar) NIR_PASS_V(nir, nir_lower_phis_to_scalar, false); @@ -145,10 +146,8 @@ void si_nir_opts(struct si_screen *sscreen, struct nir_shader *nir, bool first) if (nir->info.stage == MESA_SHADER_FRAGMENT) NIR_PASS_V(nir, nir_opt_move_discards_to_top); - if (sscreen->info.has_packed_math_16bit) { - NIR_PASS(progress, nir, nir_opt_vectorize, si_vectorize_callback, - (void *)sscreen->use_aco); - } + if (sscreen->info.has_packed_math_16bit) + NIR_PASS(progress, nir, nir_opt_vectorize, si_vectorize_callback, (void *)use_aco); } while (progress); NIR_PASS_V(nir, nir_lower_var_copies); diff --git a/src/gallium/drivers/radeonsi/si_shaderlib_nir.c b/src/gallium/drivers/radeonsi/si_shaderlib_nir.c index 0f9cbccc226..7572ef317a2 100644 --- a/src/gallium/drivers/radeonsi/si_shaderlib_nir.c +++ b/src/gallium/drivers/radeonsi/si_shaderlib_nir.c @@ -301,7 +301,7 @@ static void optimization_barrier_vgpr_array(struct si_context *sctx, nir_builder * barrier in the compute blit for GFX6-8 because the lack of A16 combined with optimization * barriers would unnecessarily increase VGPR usage for MSAA resources. */ - if (!sctx->screen->use_aco && sctx->gfx_level >= GFX10) { + if (!b->shader->info.use_aco_amd && sctx->gfx_level >= GFX10) { for (unsigned i = 0; i < num_elements; i++) { unsigned prev_num = array[i]->num_components; array[i] = nir_trim_vector(b, array[i], num_components); @@ -360,6 +360,7 @@ void *si_create_blit_cs(struct si_context *sctx, const union si_compute_blit_sha nir_builder b = nir_builder_init_simple_shader(MESA_SHADER_COMPUTE, nir_options, "blit_non_scaled_cs"); + b.shader->info.use_aco_amd = sctx->screen->use_aco; b.shader->info.num_images = options->is_clear ? 1 : 2; unsigned image_dst_index = b.shader->info.num_images - 1; if (!options->is_clear && options->src_is_msaa) @@ -609,7 +610,7 @@ void *si_create_blit_cs(struct si_context *sctx, const union si_compute_blit_sha * barriers waiting for image loads, i.e. after s_waitcnt vmcnt(0). */ nir_def *img_dst_desc = nir_image_deref_descriptor_amd(&b, 8, 32, deref_ssa(&b, img_dst)); - if (lane_size > 1 && !sctx->screen->use_aco) + if (lane_size > 1 && !b.shader->info.use_aco_amd) img_dst_desc = nir_optimization_barrier_sgpr_amd(&b, 32, img_dst_desc); /* Apply the blit output modifiers, once per sample. */ diff --git a/src/gallium/drivers/radeonsi/si_state_shaders.cpp b/src/gallium/drivers/radeonsi/si_state_shaders.cpp index f74c29bcb72..df03fd292d4 100644 --- a/src/gallium/drivers/radeonsi/si_state_shaders.cpp +++ b/src/gallium/drivers/radeonsi/si_state_shaders.cpp @@ -2823,7 +2823,7 @@ static void si_build_shader_variant(struct si_shader *shader, int thread_index, compiler = &shader->compiler_ctx_state.compiler; } - if (!sscreen->use_aco && !*compiler) + if (!sel->info.base.use_aco_amd && !*compiler) *compiler = si_create_llvm_compiler(sscreen); if (unlikely(!si_create_shader_variant(sscreen, *compiler, shader, debug))) { @@ -3039,7 +3039,7 @@ current_not_ready: util_queue_fence_init(&shader->ready); - if (!sscreen->use_aco && !sctx->compiler) + if (!sel->info.base.use_aco_amd && !sctx->compiler) sctx->compiler = si_create_llvm_compiler(sctx->screen); shader->selector = sel; @@ -3249,7 +3249,7 @@ static void si_init_shader_selector_async(void *job, void *gdata, int thread_ind assert(thread_index < (int)ARRAY_SIZE(sscreen->compiler)); compiler = &sscreen->compiler[thread_index]; - if (!sscreen->use_aco && !*compiler) + if (!sel->info.base.use_aco_amd && !*compiler) *compiler = si_create_llvm_compiler(sscreen); /* Serialize NIR to save memory. Monolithic shader variants