diff --git a/src/freedreno/ci/traces-freedreno.yml b/src/freedreno/ci/traces-freedreno.yml index 7b6d79e0526..aa19da2c903 100644 --- a/src/freedreno/ci/traces-freedreno.yml +++ b/src/freedreno/ci/traces-freedreno.yml @@ -51,10 +51,10 @@ traces: freedreno-a630: checksum: e0e18dcc50ab2e23cead650d64469178 zink-a618: - checksum: b589b5d9ddd3026cbde08f0abe840ea7 + checksum: b7e0cdb0db74ea9a31fb7a75ae0d76fc zink-a630: label: [skip, flakes] - checksum: b589b5d9ddd3026cbde08f0abe840ea7 + checksum: b7e0cdb0db74ea9a31fb7a75ae0d76fc text: seems to trigger oomkilling recently valve/counterstrike-source-v2.trace: diff --git a/src/freedreno/ir3/ir3_compiler_nir.c b/src/freedreno/ir3/ir3_compiler_nir.c index 2043214f2b4..44d3effa7ca 100644 --- a/src/freedreno/ir3/ir3_compiler_nir.c +++ b/src/freedreno/ir3/ir3_compiler_nir.c @@ -3854,7 +3854,7 @@ emit_tex(struct ir3_context *ctx, nir_tex_instr *tex) ir3_builder_at(ir3_before_terminator(ctx->in_block)); sam = ir3_SAM(&build, opc, type, MASK(ncomp), 0, NULL, get_barycentric(ctx, IJ_PERSP_PIXEL), 0); - sam->prefetch.input_offset = ir3_nir_coord_offset(tex->src[idx].src.ssa); + sam->prefetch.input_offset = ir3_nir_coord_offset(tex->src[idx].src.ssa, NULL); /* make sure not to add irrelevant flags like S2EN */ sam->flags = flags | (info.flags & IR3_INSTR_B); sam->prefetch.tex = info.tex_idx; @@ -5810,7 +5810,8 @@ ir3_compile_shader_nir(struct ir3_compiler *compiler, int idx = 0; foreach_input (instr, ir) { - if (instr->input.sysval != SYSTEM_VALUE_BARYCENTRIC_PERSP_PIXEL) + if (instr->input.sysval != + (SYSTEM_VALUE_BARYCENTRIC_PERSP_PIXEL + so->prefetch_bary_type)) continue; assert(idx < 2); diff --git a/src/freedreno/ir3/ir3_context.c b/src/freedreno/ir3/ir3_context.c index a590749a8a6..da406b5e7ac 100644 --- a/src/freedreno/ir3/ir3_context.c +++ b/src/freedreno/ir3/ir3_context.c @@ -109,8 +109,9 @@ ir3_context_init(struct ir3_compiler *compiler, struct ir3_shader *shader, /* Enable the texture pre-fetch feature only a4xx onwards. But * only enable it on generations that have been tested: */ - if ((so->type == MESA_SHADER_FRAGMENT) && compiler->has_fs_tex_prefetch) - NIR_PASS(_, ctx->s, ir3_nir_lower_tex_prefetch); + if ((so->type == MESA_SHADER_FRAGMENT) && compiler->has_fs_tex_prefetch) { + NIR_PASS(_, ctx->s, ir3_nir_lower_tex_prefetch, &so->prefetch_bary_type); + } bool vectorized = false; NIR_PASS(vectorized, ctx->s, nir_opt_vectorize, ir3_nir_vectorize_filter, diff --git a/src/freedreno/ir3/ir3_nir.h b/src/freedreno/ir3/ir3_nir.h index 87a3dc67a60..27c82d37777 100644 --- a/src/freedreno/ir3/ir3_nir.h +++ b/src/freedreno/ir3/ir3_nir.h @@ -27,8 +27,9 @@ bool ir3_nir_lower_push_consts_to_preamble(nir_shader *nir, bool ir3_nir_lower_driver_params_to_ubo(nir_shader *nir, struct ir3_shader_variant *v); bool ir3_nir_move_varying_inputs(nir_shader *shader); -int ir3_nir_coord_offset(nir_def *ssa); -bool ir3_nir_lower_tex_prefetch(nir_shader *shader); +int ir3_nir_coord_offset(nir_def *ssa, gl_system_value *bary_type); +bool ir3_nir_lower_tex_prefetch(nir_shader *shader, + enum ir3_bary *prefetch_bary_type); bool ir3_nir_lower_layer_id(nir_shader *shader); bool ir3_nir_lower_frag_shading_rate(nir_shader *shader); bool ir3_nir_lower_primitive_shading_rate(nir_shader *shader); diff --git a/src/freedreno/ir3/ir3_nir_lower_tex_prefetch.c b/src/freedreno/ir3/ir3_nir_lower_tex_prefetch.c index af9830788e2..a29313bc8f1 100644 --- a/src/freedreno/ir3/ir3_nir_lower_tex_prefetch.c +++ b/src/freedreno/ir3/ir3_nir_lower_tex_prefetch.c @@ -5,13 +5,25 @@ #include "ir3_nir.h" +#include "util/u_vector.h" + /** * A pass which detects tex instructions which are candidate to be executed * prior to FS shader start, and change them to nir_texop_tex_prefetch. */ +typedef struct { + nir_tex_instr *tex; + enum ir3_bary bary; +} tex_prefetch_candidate; + +typedef struct { + struct u_vector candidates; + uint32_t per_bary_candidates[IJ_COUNT]; +} ir3_prefetch_state; + static int -coord_offset(nir_def *ssa) +coord_offset(nir_def *ssa, gl_system_value *bary_type) { nir_instr *parent_instr = ssa->parent_instr; @@ -27,7 +39,7 @@ coord_offset(nir_def *ssa) if (alu->op != nir_op_vec2) return -1; - int base_src_offset = coord_offset(alu->src[0].src.ssa); + int base_src_offset = coord_offset(alu->src[0].src.ssa, bary_type); if (base_src_offset < 0) return -1; @@ -35,7 +47,7 @@ coord_offset(nir_def *ssa) /* NOTE it might be possible to support more than 2D? */ for (int i = 1; i < 2; i++) { - int nth_src_offset = coord_offset(alu->src[i].src.ssa); + int nth_src_offset = coord_offset(alu->src[i].src.ssa, bary_type); if (nth_src_offset < 0) return -1; int nth_offset = nth_src_offset + alu->src[i].swizzle[0]; @@ -62,20 +74,26 @@ coord_offset(nir_def *ssa) nir_intrinsic_instr *interp = nir_instr_as_intrinsic(input->src[0].ssa->parent_instr); - if (interp->intrinsic != nir_intrinsic_load_barycentric_pixel) + if (interp->intrinsic != nir_intrinsic_load_barycentric_pixel && + interp->intrinsic != nir_intrinsic_load_barycentric_sample && + interp->intrinsic != nir_intrinsic_load_barycentric_centroid) return -1; - /* interpolation modes such as noperspective aren't covered by the other + /* interpolation modes such as flat aren't covered by the other * test, we need to explicitly check for them here. */ unsigned interp_mode = nir_intrinsic_interp_mode(interp); - if (interp_mode != INTERP_MODE_NONE && interp_mode != INTERP_MODE_SMOOTH) + if (interp_mode != INTERP_MODE_NONE && interp_mode != INTERP_MODE_SMOOTH && + interp_mode != INTERP_MODE_NOPERSPECTIVE) return -1; /* we also need a const input offset: */ if (!nir_src_is_const(input->src[1])) return -1; + if (bary_type) + *bary_type = ir3_nir_intrinsic_barycentric_sysval(interp); + unsigned base = nir_src_as_uint(input->src[1]) + nir_intrinsic_base(input); unsigned comp = nir_intrinsic_component(input); @@ -83,11 +101,13 @@ coord_offset(nir_def *ssa) } int -ir3_nir_coord_offset(nir_def *ssa) +ir3_nir_coord_offset(nir_def *ssa, gl_system_value *bary_type) { assert(ssa->num_components == 2); - return coord_offset(ssa); + if (bary_type) + *bary_type = SYSTEM_VALUE_MAX; + return coord_offset(ssa, bary_type); } static bool @@ -136,7 +156,7 @@ ok_tex_samp(nir_tex_instr *tex) } static bool -lower_tex_prefetch_block(nir_block *block) +lower_tex_prefetch_block(nir_block *block, ir3_prefetch_state *state) { bool progress = false; @@ -168,8 +188,14 @@ lower_tex_prefetch_block(nir_block *block) /* First source should be the sampling coordinate. */ nir_tex_src *coord = &tex->src[idx]; - if (ir3_nir_coord_offset(coord->src.ssa) >= 0) { - tex->op = nir_texop_tex_prefetch; + gl_system_value bary_type; + if (ir3_nir_coord_offset(coord->src.ssa, &bary_type) >= 0) { + enum ir3_bary bary = bary_type - SYSTEM_VALUE_BARYCENTRIC_PERSP_PIXEL; + state->per_bary_candidates[bary]++; + + tex_prefetch_candidate *candidate = u_vector_add(&state->candidates); + candidate->tex = tex; + candidate->bary = bary; progress |= true; } @@ -179,7 +205,7 @@ lower_tex_prefetch_block(nir_block *block) } static bool -lower_tex_prefetch_func(nir_function_impl *impl) +lower_tex_prefetch_func(nir_function_impl *impl, ir3_prefetch_state *state) { /* Only instructions in the the outer-most block are considered eligible for * pre-dispatch, because they need to be move-able to the beginning of the @@ -201,18 +227,22 @@ lower_tex_prefetch_func(nir_function_impl *impl) } } - bool progress = lower_tex_prefetch_block(block); + bool progress = lower_tex_prefetch_block(block, state); return nir_progress(progress, impl, nir_metadata_control_flow); } bool -ir3_nir_lower_tex_prefetch(nir_shader *shader) +ir3_nir_lower_tex_prefetch(nir_shader *shader, + enum ir3_bary *prefetch_bary_type) { bool progress = false; assert(shader->info.stage == MESA_SHADER_FRAGMENT); + ir3_prefetch_state state = {}; + u_vector_init(&state.candidates, 4, sizeof(tex_prefetch_candidate)); + nir_foreach_function (function, shader) { /* Only texture sampling instructions inside the main function * are eligible for pre-dispatch. @@ -220,8 +250,36 @@ ir3_nir_lower_tex_prefetch(nir_shader *shader) if (!function->impl || !function->is_entrypoint) continue; - progress |= lower_tex_prefetch_func(function->impl); + progress |= lower_tex_prefetch_func(function->impl, &state); } + if (progress) { + /* We cannot prefetch tex ops that use different interpolation modes, + * so we have to choose a single mode to prefetch. We select the + * interpolation mode that would allow us to prefetch the most tex ops. + */ + uint32_t max_tex_with_bary = 0; + uint32_t chosen_bary = 0; + for (int i = 0; i < IJ_COUNT; i++) { + if (state.per_bary_candidates[i] > max_tex_with_bary) { + max_tex_with_bary = state.per_bary_candidates[i]; + chosen_bary = i; + } + } + + tex_prefetch_candidate *candidate; + u_vector_foreach(candidate, &state.candidates) { + if (candidate->bary == chosen_bary) { + candidate->tex->op = nir_texop_tex_prefetch; + } + } + + *prefetch_bary_type = chosen_bary; + } else { + *prefetch_bary_type = IJ_COUNT; + } + + u_vector_finish(&state.candidates); + return progress; } diff --git a/src/freedreno/ir3/ir3_shader.h b/src/freedreno/ir3/ir3_shader.h index 6bffc65a8a0..5b73403d8c1 100644 --- a/src/freedreno/ir3/ir3_shader.h +++ b/src/freedreno/ir3/ir3_shader.h @@ -879,6 +879,7 @@ struct ir3_shader_variant { /* texture sampler pre-dispatches */ uint32_t num_sampler_prefetch; struct ir3_sampler_prefetch sampler_prefetch[IR3_MAX_SAMPLER_PREFETCH]; + enum ir3_bary prefetch_bary_type; /* If true, the last use of helper invocations is the texture prefetch and * they should be disabled for the actual shader. Equivalent to adding diff --git a/src/freedreno/vulkan/tu_shader.cc b/src/freedreno/vulkan/tu_shader.cc index dc63f5bf41f..b588cdf2236 100644 --- a/src/freedreno/vulkan/tu_shader.cc +++ b/src/freedreno/vulkan/tu_shader.cc @@ -1685,9 +1685,9 @@ tu6_emit_fs_inputs(struct tu_cs *cs, const struct ir3_shader_variant *fs) ij_regid[i] = ir3_find_sysval_regid(fs, SYSTEM_VALUE_BARYCENTRIC_PERSP_PIXEL + i); if (fs->num_sampler_prefetch > 0) { - /* It seems like ij_pix is *required* to be r0.x */ - assert(!VALIDREG(ij_regid[IJ_PERSP_PIXEL]) || - ij_regid[IJ_PERSP_PIXEL] == regid(0, 0)); + /* FS prefetch reads coordinates from r0.x */ + assert(!VALIDREG(ij_regid[fs->prefetch_bary_type]) || + ij_regid[fs->prefetch_bary_type] == regid(0, 0)); } tu_cs_emit_pkt4(cs, REG_A6XX_SP_FS_PREFETCH_CNTL, 1 + fs->num_sampler_prefetch); diff --git a/src/gallium/drivers/freedreno/a6xx/fd6_program.cc b/src/gallium/drivers/freedreno/a6xx/fd6_program.cc index 5798a503403..08439fffbfe 100644 --- a/src/gallium/drivers/freedreno/a6xx/fd6_program.cc +++ b/src/gallium/drivers/freedreno/a6xx/fd6_program.cc @@ -939,9 +939,9 @@ emit_fs_inputs(struct fd_ringbuffer *ring, const struct program_builder *b) ij_regid[i] = ir3_find_sysval_regid(fs, SYSTEM_VALUE_BARYCENTRIC_PERSP_PIXEL + i); if (fs->num_sampler_prefetch > 0) { - /* It seems like ij_pix is *required* to be r0.x */ - assert(!VALIDREG(ij_regid[IJ_PERSP_PIXEL]) || - ij_regid[IJ_PERSP_PIXEL] == regid(0, 0)); + /* FS prefetch reads coordinates from r0.x */ + assert(!VALIDREG(ij_regid[fs->prefetch_bary_type]) || + ij_regid[fs->prefetch_bary_type] == regid(0, 0)); } OUT_PKT4(ring, REG_A6XX_SP_FS_PREFETCH_CNTL, 1 + fs->num_sampler_prefetch);