From 55c4595b2a850a2c846db4b29a8b51b550e3d384 Mon Sep 17 00:00:00 2001 From: Faith Ekstrand Date: Mon, 13 Apr 2026 14:36:38 -0400 Subject: [PATCH] pan/nir: Use the NIR lowering on Valhall+ Reviewed-by: Christoph Pillmayer Reviewed-by: Lorenzo Rossi Part-of: --- .../compiler/bifrost/bifrost_compile.c | 381 +++--------------- src/panfrost/compiler/bifrost/bifrost_nir.c | 3 + src/panfrost/compiler/pan_nir_lower_tex.c | 3 +- 3 files changed, 63 insertions(+), 324 deletions(-) diff --git a/src/panfrost/compiler/bifrost/bifrost_compile.c b/src/panfrost/compiler/bifrost/bifrost_compile.c index 58c35e9667d..4a3f3403a7f 100644 --- a/src/panfrost/compiler/bifrost/bifrost_compile.c +++ b/src/panfrost/compiler/bifrost/bifrost_compile.c @@ -18,6 +18,7 @@ #include "bi_quirks.h" #include "bifrost_compile.h" #include "bifrost_nir.h" +#include "pan_nir.h" #include "compiler.h" static void pan_stats_verbose(FILE *f, const char *prefix, bi_context *ctx, @@ -3750,56 +3751,6 @@ bi_emit_texc_offset_ms_index(bi_builder *b, nir_tex_instr *instr) return dest; } -/* - * Valhall specifies specifies texel offsets, multisample indices, and (for - * fetches) LOD together as a u8vec4 <offset.xyz, LOD>, where the third - * component is either offset.z or multisample index depending on context. Build - * this register. 
- */ -static bi_index -bi_emit_valhall_offsets(bi_builder *b, nir_tex_instr *instr) -{ - bi_index dest = bi_zero(); - - int offs_idx = nir_tex_instr_src_index(instr, nir_tex_src_offset); - int ms_idx = nir_tex_instr_src_index(instr, nir_tex_src_ms_index); - int lod_idx = nir_tex_instr_src_index(instr, nir_tex_src_lod); - - /* Components 0-2: offsets */ - if (offs_idx >= 0 && !nir_src_is_zero(instr->src[offs_idx].src)) { - unsigned nr = nir_src_num_components(instr->src[offs_idx].src); - bi_index idx = bi_src_index(&instr->src[offs_idx].src); - - /* No multisample index with 3D */ - assert((nr <= 2) || (ms_idx < 0)); - - /* Zero extend the Z byte so we can use it with MKVEC.v2i8 */ - bi_index z = (nr > 2) - ? bi_mkvec_v2i8(b, bi_byte(bi_extract(b, idx, 2), 0), - bi_imm_u8(0), bi_zero()) - : bi_zero(); - - dest = bi_mkvec_v2i8( - b, (nr > 0) ? bi_byte(bi_extract(b, idx, 0), 0) : bi_imm_u8(0), - (nr > 1) ? bi_byte(bi_extract(b, idx, 1), 0) : bi_imm_u8(0), z); - } - - /* Component 2: multisample index */ - if (ms_idx >= 0 && !nir_src_is_zero(instr->src[ms_idx].src)) { - bi_index ms = bi_src_index(&instr->src[ms_idx].src); - dest = bi_mkvec_v2i16(b, bi_half(dest, false), bi_half(ms, false)); - } - - /* Component 3: 8-bit LOD */ - if (lod_idx >= 0 && !nir_src_is_zero(instr->src[lod_idx].src) && - nir_tex_instr_src_type(instr, lod_idx) != nir_type_float) { - dest = bi_lshift_or_i32(b, bi_src_index(&instr->src[lod_idx].src), dest, - bi_imm_u8(24)); - } - - return dest; -} - static void bi_emit_cube_coord(bi_builder *b, bi_index coord, bi_index *face, bi_index *s, bi_index *t) @@ -4227,306 +4178,92 @@ bi_emit_texc(bi_builder *b, nir_tex_instr *instr) DIV_ROUND_UP(instr->def.num_components * res_size, 4)); } -/* Staging registers required by texturing in the order they appear (Valhall) */ - -enum valhall_tex_sreg { - VALHALL_TEX_SREG_X_COORD = 0, - VALHALL_TEX_SREG_Y_COORD = 1, - VALHALL_TEX_SREG_Z_COORD = 2, - VALHALL_TEX_SREG_Y_DELTAS = 3, - VALHALL_TEX_SREG_ARRAY = 4, - 
VALHALL_TEX_SREG_SHADOW = 5, - VALHALL_TEX_SREG_OFFSETMS = 6, - VALHALL_TEX_SREG_LOD = 7, - VALHALL_TEX_SREG_GRDESC0 = 8, - VALHALL_TEX_SREG_GRDESC1 = 9, - VALHALL_TEX_SREG_COUNT, -}; - static void -bi_emit_tex_valhall(bi_builder *b, nir_tex_instr *instr) +bi_emit_tex_valhall(bi_builder *b, nir_tex_instr *tex) { - bool explicit_offset = false; - enum bi_va_lod_mode lod_mode = BI_VA_LOD_MODE_COMPUTED_LOD; - - bool has_lod_mode = (instr->op == nir_texop_tex) || - (instr->op == nir_texop_txl) || - (instr->op == nir_texop_txd) || - (instr->op == nir_texop_txb); - - /* 32-bit indices to be allocated as consecutive staging registers */ - bi_index sregs[VALHALL_TEX_SREG_COUNT] = {}; - bi_index sampler = bi_imm_u32(instr->sampler_index); - bi_index texture = bi_imm_u32(instr->texture_index); - bi_index ddx = bi_null(); - bi_index ddy = bi_null(); - - for (unsigned i = 0; i < instr->num_srcs; ++i) { - bi_index index = bi_src_index(&instr->src[i].src); - unsigned sz = nir_src_bit_size(instr->src[i].src); - - switch (instr->src[i].src_type) { - case nir_tex_src_coord: { - bool is_array = instr->is_array && instr->op != nir_texop_lod; - unsigned components = nir_tex_instr_src_size(instr, i) - is_array; - - if (instr->sampler_dim == GLSL_SAMPLER_DIM_CUBE) { - sregs[VALHALL_TEX_SREG_X_COORD] = bi_emit_texc_cube_coord( - b, index, &sregs[VALHALL_TEX_SREG_Y_COORD]); - } else { - assert(components >= 1 && components <= 3); - - /* Copy XY (for 2D+) or XX (for 1D) */ - sregs[VALHALL_TEX_SREG_X_COORD] = index; - - if (components >= 2) - sregs[VALHALL_TEX_SREG_Y_COORD] = bi_extract(b, index, 1); - - if (components == 3) - sregs[VALHALL_TEX_SREG_Z_COORD] = bi_extract(b, index, 2); - } - - if (is_array) - sregs[VALHALL_TEX_SREG_ARRAY] = bi_extract(b, index, components); - - break; - } - - case nir_tex_src_lod: - if (nir_src_is_zero(instr->src[i].src)) { - lod_mode = BI_VA_LOD_MODE_ZERO_LOD; - } else if (has_lod_mode) { - lod_mode = BI_VA_LOD_MODE_EXPLICIT; - - assert(sz == 16 || sz == 
32); - sregs[VALHALL_TEX_SREG_LOD] = - bi_emit_texc_lod_88(b, index, sz == 16); - } - break; - - case nir_tex_src_ddx: - ddx = index; - break; - - case nir_tex_src_ddy: - ddy = index; - break; - - case nir_tex_src_bias: - /* Upper 16-bits interpreted as a clamp, leave zero */ - assert(sz == 16 || sz == 32); - sregs[VALHALL_TEX_SREG_LOD] = bi_emit_texc_lod_88(b, index, sz == 16); - - lod_mode = BI_VA_LOD_MODE_COMPUTED_BIAS; - break; - case nir_tex_src_ms_index: - case nir_tex_src_offset: - /* Handled below */ - break; - - case nir_tex_src_comparator: - sregs[VALHALL_TEX_SREG_SHADOW] = index; - break; - - case nir_tex_src_texture_offset: - /* This should always be 0 as lower_index_to_offset is expected to be - * set */ - assert(instr->texture_index == 0); - texture = index; - break; - - case nir_tex_src_sampler_offset: - /* This should always be 0 as lower_index_to_offset is expected to be - * set */ - assert(instr->sampler_index == 0); - sampler = index; - break; - - default: - UNREACHABLE("Unhandled src type in tex emit"); + nir_def *tex_h = NULL, *sr0 = NULL, *sr1 = NULL; + for (unsigned i = 0; i < tex->num_srcs; i++) { + switch (tex->src[i].src_type) { + case nir_tex_src_texture_handle: tex_h = tex->src[i].src.ssa; break; + case nir_tex_src_backend1: sr0 = tex->src[i].src.ssa; break; + case nir_tex_src_backend2: sr1 = tex->src[i].src.ssa; break; + default: UNREACHABLE("Unknown texture source"); } } - /* Generate packed offset + ms index + LOD register. These default to - * zero so we only need to encode if these features are actually in use. 
- */ - bi_index offsets = bi_emit_valhall_offsets(b, instr); + struct pan_va_tex_flags flags; + STATIC_ASSERT(sizeof(tex->backend_flags) == sizeof(flags)); + memcpy(&flags, &tex->backend_flags, sizeof(flags)); - if (!bi_is_equiv(offsets, bi_zero())) { - sregs[VALHALL_TEX_SREG_OFFSETMS] = offsets; - explicit_offset = true; - } + bi_index src0 = bi_extract(b, bi_def_index(tex_h), 0); + bi_index src1 = bi_extract(b, bi_def_index(tex_h), 1); - bool narrow_indices = va_is_valid_const_narrow_index(texture) && - va_is_valid_const_narrow_index(sampler); - - bi_index src0; - bi_index src1; - - if (narrow_indices) { - unsigned tex_set = - va_res_fold_table_idx(pan_res_handle_get_table(texture.value)); - unsigned sampler_set = - va_res_fold_table_idx(pan_res_handle_get_table(sampler.value)); - unsigned texture_index = pan_res_handle_get_index(texture.value); - unsigned sampler_index = pan_res_handle_get_index(sampler.value); - - unsigned packed_handle = (tex_set << 27) | (texture_index << 16) | - (sampler_set << 11) | sampler_index; - - src0 = bi_imm_u32(packed_handle); - - /* TODO: narrow offsetms. (only when offsetms is dynamically uniform) */ - src1 = bi_zero(); - } else { - src0 = sampler; - src1 = texture; - } - - enum bi_dimension dim = valhall_tex_dimension(instr->sampler_dim); - - if (!bi_is_null(ddx) || !bi_is_null(ddy)) { - unsigned coords_comp_count = - instr->coord_components - - (instr->is_array || instr->sampler_dim == GLSL_SAMPLER_DIM_CUBE); - assert(!bi_is_null(ddx) && !bi_is_null(ddy)); - - lod_mode = BI_VA_LOD_MODE_GRDESC; - - bi_index derivs[6] = { - bi_extract(b, ddx, 0), - bi_extract(b, ddy, 0), - coords_comp_count > 1 ? bi_extract(b, ddx, 1) : bi_null(), - coords_comp_count > 1 ? bi_extract(b, ddy, 1) : bi_null(), - coords_comp_count > 2 ? bi_extract(b, ddx, 2) : bi_null(), - coords_comp_count > 2 ? 
bi_extract(b, ddy, 2) : bi_null(), - }; - bi_index derivs_packed = bi_temp(b->shader); - bi_make_vec_to(b, derivs_packed, derivs, NULL, coords_comp_count * 2, 32); - bi_index grdesc = bi_temp(b->shader); - bi_instr *I = bi_tex_gradient_to(b, grdesc, derivs_packed, src0, src1, dim, - !narrow_indices, 3, coords_comp_count * 2); - I->derivative_enable = true; - I->force_delta_enable = false; - I->lod_clamp_disable = true; - I->lod_bias_disable = true; - I->register_format = BI_REGISTER_FORMAT_U32; - - bi_emit_cached_split_i32(b, grdesc, 2); - sregs[VALHALL_TEX_SREG_GRDESC0] = bi_extract(b, grdesc, 0); - sregs[VALHALL_TEX_SREG_GRDESC1] = bi_extract(b, grdesc, 1); - } - - /* Allocate staging registers contiguously by compacting the array. */ unsigned sr_count = 0; - for (unsigned i = 0; i < ARRAY_SIZE(sregs); ++i) { - if (!bi_is_null(sregs[i])) - sregs[sr_count++] = sregs[i]; + bi_index sr_comps[8]; + for (unsigned i = 0; i < sr0->num_components; i++) + sr_comps[sr_count++] = bi_extract(b, bi_def_index(sr0), i); + if (sr1 != NULL) { + for (unsigned i = 0; i < sr1->num_components; i++) + sr_comps[sr_count++] = bi_extract(b, bi_def_index(sr1), i); } - bi_index idx = sr_count ? 
bi_temp(b->shader) : bi_null(); + bi_index sr = bi_temp(b->shader); + bi_emit_collect_to(b, sr, sr_comps, sr_count); - if (sr_count) - bi_make_vec_to(b, idx, sregs, NULL, sr_count, 32); - - if (instr->op == nir_texop_lod) { - assert(instr->def.num_components == 2 && instr->def.bit_size == 32); - - bi_index res[2]; - - for (unsigned i = 0; i < 2; i++) { - bi_index grdesc = bi_temp(b->shader); - bi_instr *I = bi_tex_gradient_to(b, grdesc, idx, src0, src1, dim, - !narrow_indices, 1, sr_count); - I->derivative_enable = false; - I->force_delta_enable = true; - I->lod_clamp_disable = i != 0; - I->register_format = BI_REGISTER_FORMAT_U32; - bi_index lod; - - /* v11 removed S16_TO_F32 */ - if (b->shader->arch >= 11) { - lod = bi_s32_to_f32(b, bi_s16_to_s32(b, bi_half(grdesc, 0))); - } else { - lod = bi_s16_to_f32(b, bi_half(grdesc, 0)); - } - - lod = bi_fmul_f32(b, lod, bi_imm_f32(1.0f / 256)); - - if (i == 0) - lod = bi_fround_f32(b, lod, BI_ROUND_NONE); - - res[i] = lod; - } - - bi_make_vec_to(b, bi_def_index(&instr->def), res, NULL, 2, 32); - return; - } - - /* Only write the components that we actually read */ - unsigned mask = nir_def_components_read(&instr->def); - unsigned comps_per_reg = instr->def.bit_size == 16 ? 
2 : 1; - unsigned res_size = DIV_ROUND_UP(util_bitcount(mask), comps_per_reg); - - enum bi_register_format regfmt = bi_reg_fmt_for_nir(instr->dest_type); + const enum bi_dimension dim = valhall_tex_dimension(tex->sampler_dim); + const enum bi_register_format regfmt = bi_reg_fmt_for_nir(tex->dest_type); + const unsigned mask = nir_def_components_read(&tex->def); bi_index dest = bi_temp(b->shader); - switch (instr->op) { + switch (tex->op) { case nir_texop_tex: case nir_texop_txb: case nir_texop_txl: case nir_texop_txd: - bi_tex_single_to(b, dest, idx, src0, src1, instr->is_array, dim, regfmt, - instr->is_shadow, explicit_offset, lod_mode, - !narrow_indices, mask, sr_count); + bi_tex_single_to(b, dest, sr, src0, src1, flags.array_enable, dim, + regfmt, flags.compare_enable, flags.texel_offset, + flags.lod_mode, flags.wide_indices, + mask, sr_count); break; case nir_texop_txf: - case nir_texop_txf_ms: { - assert(instr->sampler_dim != GLSL_SAMPLER_DIM_BUF && - "Texel buffers should already have been lowered"); - /* On Valhall, TEX_FETCH doesn't have CUBE support. This is not a problem - * as a cube is just a 2D array in any cases. 
*/ - if (dim == BI_DIMENSION_CUBE) - dim = BI_DIMENSION_2D; - - bi_tex_fetch_to(b, dest, idx, src0, src1, instr->is_array, dim, regfmt, - explicit_offset, !narrow_indices, mask, sr_count); + case nir_texop_txf_ms: + bi_tex_fetch_to(b, dest, sr, src0, src1, flags.array_enable, dim, + regfmt, flags.texel_offset, flags.wide_indices, + mask, sr_count); + break; + case nir_texop_tg4: + bi_tex_gather_to(b, dest, sr, src0, src1, flags.array_enable, dim, + tex->component, false, regfmt, flags.compare_enable, + flags.texel_offset, flags.wide_indices, + mask, sr_count); + break; + case nir_texop_gradient_pan: { + bi_instr *I = + bi_tex_gradient_to(b, dest, sr, src0, src1, dim, + flags.wide_indices, + mask, sr_count); + I->force_delta_enable = flags.force_delta_enable; + I->derivative_enable = flags.derivative_enable; + I->lod_clamp_disable = flags.lod_clamp_disable; + I->lod_bias_disable = flags.lod_bias_disable; + I->register_format = BI_REGISTER_FORMAT_U32; break; } - case nir_texop_tg4: - bi_tex_gather_to(b, dest, idx, src0, src1, instr->is_array, dim, - instr->component, false, regfmt, instr->is_shadow, - explicit_offset, !narrow_indices, mask, sr_count); - break; default: UNREACHABLE("Unhandled Valhall texture op"); } - /* The hardware will write only what we read, and it will into - * contiguous registers without gaps (different from Bifrost). NIR - * expects the gaps, so fill in the holes (they'll be copypropped and - * DCE'd away later). 
- */ - bi_index unpacked[4] = {bi_null(), bi_null(), bi_null(), bi_null()}; + bi_emit_cached_split(b, dest, util_bitcount(mask) * tex->def.bit_size); + bi_index dest4[4] = { dest, dest, dest, dest }; - bi_emit_cached_split_i32(b, dest, res_size); - - /* Index into the packed component array */ - unsigned j = 0; - unsigned comps[4] = {0}; - unsigned nr_components = instr->def.num_components; - - for (unsigned i = 0; i < nr_components; ++i) { - if (mask & BITFIELD_BIT(i)) { - unpacked[i] = dest; - comps[i] = j++; - } else { - unpacked[i] = bi_zero(); - } + unsigned channel[4] = { }; + for (unsigned i = 0; i < tex->def.num_components; i++) { + if (mask & BITFIELD_BIT(i)) + channel[i] = util_bitcount(mask & BITFIELD_MASK(i)); } - bi_make_vec_to(b, bi_def_index(&instr->def), unpacked, comps, - instr->def.num_components, instr->def.bit_size); + bi_make_vec_to(b, bi_def_index(&tex->def), dest4, channel, + tex->def.num_components, tex->def.bit_size); } /* Simple textures ops correspond to NIR tex or txl with LOD = 0 on 2D/cube diff --git a/src/panfrost/compiler/bifrost/bifrost_nir.c b/src/panfrost/compiler/bifrost/bifrost_nir.c index fb24ad0f4b6..3d38df4c704 100644 --- a/src/panfrost/compiler/bifrost/bifrost_nir.c +++ b/src/panfrost/compiler/bifrost/bifrost_nir.c @@ -943,6 +943,9 @@ bifrost_postprocess_nir(nir_shader *nir, &info->vs.needs_extended_fifo); } + if (pan_arch(gpu_id) >= 9) + NIR_PASS(_, nir, pan_nir_lower_tex, gpu_id); + /* Our OpenCL compiler (src/panfrost/clc/pan_compile.c) has a very weird and * suboptimal optimization pipeline that results in a lot of unoptimized * memcpys and sparse scratch space. 
That code is still being used for diff --git a/src/panfrost/compiler/pan_nir_lower_tex.c b/src/panfrost/compiler/pan_nir_lower_tex.c index e81c7e7f353..5bb480e8d26 100644 --- a/src/panfrost/compiler/pan_nir_lower_tex.c +++ b/src/panfrost/compiler/pan_nir_lower_tex.c @@ -350,8 +350,7 @@ va_lower_tex(nir_builder *b, nir_tex_instr *tex, uint64_t gpu_id) if (tex->op == nir_texop_txf && srcs.lod) { comps[3] = nir_get_scalar(srcs.lod, 0); if (pan_arch(gpu_id) >= 11 && nir_scalar_is_const(comps[3])) { - /* On v11+, narrow_array_index is a 8.8 fixed-point value in - * bits [31:16] + /* On v11+, narrow_lod is an 8.8 fixed-point value in bits [31:16] */ uint32_t imm_lod = nir_scalar_as_uint(comps[3]); narrow_offset |= MIN2(imm_lod, UINT8_MAX) << 24;