pan/bi: Lower VS outputs in NIR

Co-authored-by: Lorenzo Rossi <lorenzo.rossi@collabora.com>
Reviewed-by: Lorenzo Rossi <lorenzo.rossi@collabora.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/40391>
Faith Ekstrand 2026-03-17 12:10:50 +01:00 committed by Marge Bot
parent 8127f5a88a
commit 3418525a82
6 changed files with 274 additions and 179 deletions

View file

@@ -755,6 +755,8 @@ visit_intrinsic(nir_intrinsic_instr *instr, struct divergence_state *state)
case nir_intrinsic_load_input_attachment_target_pan:
case nir_intrinsic_load_input_attachment_conv_pan:
case nir_intrinsic_load_global_cvt_pan:
case nir_intrinsic_lea_attr_pan:
case nir_intrinsic_lea_buf_pan:
case nir_intrinsic_atomic_counter_read:
case nir_intrinsic_atomic_counter_read_deref:
case nir_intrinsic_is_sparse_texels_resident:
@@ -1039,6 +1041,7 @@ visit_intrinsic(nir_intrinsic_instr *instr, struct divergence_state *state)
case nir_intrinsic_load_tile_res_pan:
case nir_intrinsic_load_cumulative_coverage_pan:
case nir_intrinsic_load_blend_input_pan:
case nir_intrinsic_load_idvs_output_buf_index_pan:
case nir_intrinsic_atest_pan:
case nir_intrinsic_zs_emit_pan:
case nir_intrinsic_load_return_param_amd:

View file

@@ -1741,6 +1741,17 @@ store("global_cvt_pan", [1, 1], indices=[SRC_TYPE, ACCESS])
# src[] = { value, address }
store("global_psiz_pan", [1], indices=[WRITE_MASK, ACCESS])
# Base index of the output buffer passed into the IDVS on Valhall.
system_value("idvs_output_buf_index_pan", 1, bit_sizes=[32])
# src[] = { handle, vertex_id, instance_id }
intrinsic("lea_attr_pan", [1, 1, 1], dest_comp=3, bit_sizes=[32],
indices=[SRC_TYPE], flags=[CAN_ELIMINATE, CAN_REORDER])
# src[] = { handle, index }
intrinsic("lea_buf_pan", [1, 1], dest_comp=2, bit_sizes=[32],
flags=[CAN_ELIMINATE, CAN_REORDER])
# Load the address and, potentially, the conversion descriptor for a texel
# buffer index. The 64-bit address is always in the first two channels; the
# 32-bit conversion descriptor occupies the last channel on Bifrost only.
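
Note that both new intrinsics return a multi-channel result rather than a 64-bit pointer: lea_buf_pan yields the address split across two 32-bit channels, and lea_attr_pan adds a third channel for the conversion descriptor. The lowering pass later in this commit recombines the halves with nir_pack_64_2x32. A minimal standalone sketch of that recombination, where pack64 is a stand-in helper and not Mesa API:

#include <assert.h>
#include <stdint.h>

/* pack64() mirrors what nir_pack_64_2x32 does at lowering time:
 * channel 0 is the low word, channel 1 the high word. */
static uint64_t
pack64(uint32_t lo, uint32_t hi)
{
   return ((uint64_t)hi << 32) | lo;
}

int
main(void)
{
   /* e.g. a 48-bit GPU VA split across the first two channels */
   uint32_t chan[3] = {0xdeadb000, 0x0000ffff, 0x12345678};
   uint64_t addr = pack64(chan[0], chan[1]);
   assert(addr == 0x0000ffffdeadb000ull);
   /* chan[2] would carry the Bifrost conversion descriptor */
   return 0;
}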

View file

@@ -580,6 +580,80 @@ bi_emit_load_attr(bi_builder *b, nir_intrinsic_instr *instr)
bi_copy_component(b, instr, dest);
}
static void
bi_emit_lea_attr(bi_builder *b, nir_intrinsic_instr *intr)
{
assert(intr->intrinsic == nir_intrinsic_lea_attr_pan);
const nir_alu_type src_fmt = nir_intrinsic_src_type(intr);
if (b->shader->arch < 9 && b->shader->idvs == BI_IDVS_POSITION) {
/* Bifrost position shaders have a fast path */
assert(nir_src_as_uint(intr->src[0]) == 0);
assert(src_fmt == nir_type_float32);
unsigned regfmt = BI_REGISTER_FORMAT_F32;
unsigned identity = (b->shader->arch == 6) ? 0x688 : 0;
unsigned snap4 = 0x5E;
uint32_t format = identity | (snap4 << 12) | (regfmt << 24);
bi_collect_v3i32_to(b, bi_def_index(&intr->def),
bi_preload(b, 58), bi_preload(b, 59),
bi_imm_u32(format));
return;
}
bi_index vertex_id = bi_src_index(&intr->src[1]);
bi_index instance_id = bi_src_index(&intr->src[2]);
enum bi_register_format regfmt = bi_reg_fmt_for_nir(src_fmt);
/* Check if the index can fit in LEA_ATTR_IMM */
uint32_t imm_res = 0;
bool use_imm_form = false;
if (nir_src_is_const(intr->src[0])) {
imm_res = nir_src_as_uint(intr->src[0]);
use_imm_form = pan_res_handle_get_index(imm_res) < 0x10;
}
bi_index address = bi_def_index(&intr->def);
if (use_imm_form) {
bi_instr *I = bi_lea_attr_imm_to(b, address, vertex_id, instance_id,
regfmt,
pan_res_handle_get_index(imm_res));
if (b->shader->arch >= 9)
I->table = va_res_fold_table_idx(pan_res_handle_get_table(imm_res));
} else {
bi_index res = bi_src_index(&intr->src[0]);
bi_lea_attr_to(b, address, vertex_id, instance_id, res, regfmt);
}
bi_split_def(b, &intr->def);
}
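
As a worked example of the fast-path format word built above: it is a plain bitfield combining the identity constant (0x688 on v6, 0 on v7+), snap4 (0x5E) at bit 12, and the register format at bit 24. A self-contained sketch, assuming these are the only fields; the numeric value of BI_REGISTER_FORMAT_F32 is not spelled out in the diff, so it is passed in as a parameter:

#include <stdint.h>
#include <stdio.h>

/* Mirrors: format = identity | (snap4 << 12) | (regfmt << 24).
 * identity is 0x688 on v6 and 0 on v7; snap4 is 0x5E; regfmt is a
 * BI_REGISTER_FORMAT_* enum value (F32 in the fast path). */
static uint32_t
bifrost_pos_format(unsigned arch, uint32_t regfmt_f32)
{
   uint32_t identity = (arch == 6) ? 0x688 : 0;
   uint32_t snap4 = 0x5E;
   return identity | (snap4 << 12) | (regfmt_f32 << 24);
}

int
main(void)
{
   /* 0 stands in for BI_REGISTER_FORMAT_F32's real enum value */
   printf("v6: 0x%08x\n", bifrost_pos_format(6, 0)); /* 0x0005e688 */
   printf("v7: 0x%08x\n", bifrost_pos_format(7, 0)); /* 0x0005e000 */
   return 0;
}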
static void
bi_emit_lea_buf(bi_builder *b, nir_intrinsic_instr *intr)
{
assert(intr->intrinsic == nir_intrinsic_lea_buf_pan);
assert(b->shader->arch >= 9);
bi_index index = bi_src_index(&intr->src[1]);
uint32_t imm_res;
bool use_imm_form = false;
if (nir_src_is_const(intr->src[0])) {
imm_res = nir_src_as_uint(intr->src[0]);
uint32_t table_index = pan_res_handle_get_table(imm_res);
uint32_t res_index = pan_res_handle_get_index(imm_res);
use_imm_form = va_is_valid_const_table(table_index) && res_index < 256;
}
bi_index address = bi_def_index(&intr->def);
if (use_imm_form) {
bi_instr *I = bi_lea_buf_imm_to(b, address, index);
I->table = va_res_fold_table_idx(pan_res_handle_get_table(imm_res));
I->index = pan_res_handle_get_index(imm_res);
} else {
bi_index res = bi_src_index(&intr->src[0]);
bi_lea_buf_to(b, address, index, res);
}
bi_split_def(b, &intr->def);
}
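
Both immediate forms above depend on how pan_res_handle packs a table and an index into one 32-bit handle. The sketch below assumes the table lives in the top byte with the index below it; the helpers are stand-ins for Mesa's pan_res_handle/pan_res_handle_get_* and the exact bit layout is an assumption, not a quote of the real definition:

#include <assert.h>
#include <stdint.h>

/* Assumed layout: table in bits [31:24], index in bits [23:0]. */
static uint32_t res_handle(uint32_t table, uint32_t index)
{
   return (table << 24) | index;
}
static uint32_t res_table(uint32_t h) { return h >> 24; }
static uint32_t res_index(uint32_t h) { return h & 0xffffff; }

int
main(void)
{
   /* Table 61, index 0: the IDVS varying buffer used by this commit */
   uint32_t h = res_handle(61, 0);
   assert(res_table(h) == 61 && res_index(h) == 0);
   /* The LEA_BUF_IMM path above additionally requires a constant
    * table and res_index(h) < 256; LEA_ATTR_IMM wants index < 0x10. */
   return 0;
}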
static void
bi_emit_load_var(bi_builder *b, nir_intrinsic_instr *intr)
{
@@ -1046,169 +1120,6 @@ bifrost_nir_lower_vs_atomics(nir_shader *shader)
nir_metadata_none, NULL);
}
static void
bi_emit_store_vary(bi_builder *b, nir_intrinsic_instr *instr)
{
/* In principle we can do better for 16-bit. At the moment we require
* mediump varyings to be 32-bit to permit the use of .auto, in order to
* force .u32 for flat varyings, to handle internal TGSI shaders that set
* flat in the VS but smooth in the FS.
*
* Explicit 16-bit types are unaffected, and written as 16-bit. */
ASSERTED nir_alu_type T = nir_intrinsic_src_type(instr);
ASSERTED unsigned T_size = nir_alu_type_get_type_size(T);
nir_io_semantics sem = nir_intrinsic_io_semantics(instr);
const struct pan_varying_slot *slot =
pan_varying_layout_find_slot(b->shader->varying_layout, sem.location);
ASSERTED unsigned base = nir_intrinsic_base(instr);
assert(slot == &b->shader->varying_layout->slots[base]);
unsigned imm_index = 0;
bool immediate = bi_is_intr_immediate(instr, &imm_index, 16);
/* Only look at the total components needed. In effect, we fill in all
* the intermediate "holes" in the write mask, since we can't mask off
* stores. Since nir_lower_io_vars_to_temporaries ensures each varying is
* written at most once, anything that's masked out is undefined, so it
* doesn't matter what we write there. So we may as well do the
* simplest thing possible. */
unsigned nr = util_last_bit(nir_intrinsic_write_mask(instr));
assert(nr > 0 && nr <= nir_intrinsic_src_components(instr, 0));
bi_index data = bi_src_index(&instr->src[0]);
/* To keep the vector dimensions consistent, we need to drop some
* components. This should be coalesced.
*
* TODO: This is ugly and maybe inefficient. Would we rather
* introduce a TRIM.i32 pseudoinstruction?
*/
if (nr < nir_intrinsic_src_components(instr, 0)) {
bi_index chans[4] = {bi_null(), bi_null(), bi_null(), bi_null()};
unsigned comps_per_reg = instr->def.bit_size == 16 ? 2 : 1;
unsigned src_comps =
DIV_ROUND_UP(nir_intrinsic_src_components(instr, 0), comps_per_reg);
unsigned dst_comps = DIV_ROUND_UP(nr, comps_per_reg);
bi_emit_split_i32(b, chans, data, src_comps);
bi_index tmp = bi_temp(b->shader);
bi_instr *collect = bi_collect_i32_to(b, tmp, dst_comps);
bi_foreach_src(collect, w)
collect->src[w] = chans[w];
data = tmp;
}
bi_index a[4] = {bi_null()};
if (b->shader->arch <= 8 && b->shader->idvs == BI_IDVS_POSITION) {
/* Bifrost position shaders have a fast path */
assert(T == nir_type_float32);
unsigned regfmt = BI_REGISTER_FORMAT_F32;
unsigned identity = (b->shader->arch == 6) ? 0x688 : 0;
unsigned snap4 = 0x5E;
uint32_t format = identity | (snap4 << 12) | (regfmt << 24);
bi_st_cvt(b, data, bi_preload(b, 58), bi_preload(b, 59),
bi_imm_u32(format), regfmt, nr - 1);
} else if (b->shader->arch >= 9 && b->shader->idvs != BI_IDVS_NONE) {
bi_index index = bi_preload(b, 59);
unsigned src_bit_sz = nir_src_bit_size(instr->src[0]);
unsigned index_offset = 0;
if (slot->section == PAN_VARYING_SECTION_ATTRIBS)
index_offset += 4;
if (instr->intrinsic == nir_intrinsic_store_per_view_output) {
unsigned view_index = nir_src_as_uint(instr->src[1]);
if (slot->section == PAN_VARYING_SECTION_GENERIC) {
index_offset += view_index * 4;
} else {
/* We don't patch these offsets in the no_psiz variant, so if
* multiview is enabled we can't switch to the basic format by
* using no_psiz */
const uint64_t outputs = b->shader->nir->info.outputs_written;
bool extended_position_fifo =
valhal_writes_extended_fifo(outputs, false, true);
/* Must be the same with and without no_psiz */
assert(valhal_writes_extended_fifo(outputs, true, true) ==
extended_position_fifo);
unsigned position_fifo_stride = extended_position_fifo ? 8 : 4;
index_offset += view_index * position_fifo_stride;
}
}
if (index_offset != 0)
index = bi_iadd_imm_i32(b, index, index_offset);
const enum va_memory_access mem_access =
slot->section == PAN_VARYING_SECTION_GENERIC ? VA_MEMORY_ACCESS_ESTREAM
: VA_MEMORY_ACCESS_ISTREAM;
nir_src *offset_src = nir_get_io_offset_src(instr);
assert(nir_src_is_const(*offset_src) && "assumes immediate offset");
unsigned offset = slot->offset + (nir_src_as_uint(*offset_src) * 16);
/* On Valhall, with IDVS, varyings are stored in a hardware-controlled
 * buffer through table 61 at index 0 */
bi_index address = bi_temp(b->shader);
bi_instr *I = bi_lea_buf_imm_to(b, address, index);
I->table = va_res_fold_table_idx(61);
I->index = 0;
/* On 5th Gen, the hardware-controlled buffer is at index 1 for varyings */
if (pan_arch(b->shader->inputs->gpu_id) >= 12 &&
slot->section == PAN_VARYING_SECTION_GENERIC) {
I->index = 1;
}
bi_emit_split_i32(b, a, address, 2);
bi_instr *S = bi_store(b, nr * src_bit_sz, data, a[0], a[1], BI_SEG_NONE,
offset);
S->mem_access = mem_access;
S->is_psiz_write = slot->location == VARYING_SLOT_PSIZ;
} else {
assert(T_size == 32 || T_size == 16);
enum bi_register_format regfmt = bi_reg_fmt_for_nir(T);
/* Since v9 we cannot have separate attribute descriptors for the VS and
 * FS. There might be a mismatch on Gallium where the VS thinks it is
 * storing an int, but the data is actually a float, and that's what the
 * FS expects. So, for v9 onwards, until gallium is fixed, use auto32.
 * This still avoids the midgard quirk, since we only do this from v9.
 * TODO: fix all bugs with gallium and remove this patch
 */
if (b->shader->arch >= 9 && T_size == 32)
regfmt = BI_REGISTER_FORMAT_AUTO;
if (immediate) {
bi_index address = bi_lea_attr_imm(b, bi_vertex_id(b),
bi_instance_id(b),
regfmt, imm_index);
bi_emit_split_i32(b, a, address, 3);
bi_st_cvt(b, data, a[0], a[1], a[2], regfmt, nr - 1);
} else {
bi_index idx = bi_iadd_u32(b,
bi_src_index(nir_get_io_offset_src(instr)),
bi_imm_u32(nir_intrinsic_base(instr)), false);
bi_index address =
bi_lea_attr(b, bi_vertex_id(b), bi_instance_id(b), idx, regfmt);
bi_emit_split_i32(b, a, address, 3);
bi_st_cvt(b, data, a[0], a[1], a[2], regfmt, nr - 1);
}
}
}
static void
bi_emit_load_ubo(bi_builder *b, nir_intrinsic_instr *instr)
{
@@ -2047,16 +1958,6 @@ bi_emit_intrinsic(bi_builder *b, nir_intrinsic_instr *instr)
bi_emit_load_var_buf(b, instr);
break;
case nir_intrinsic_store_output:
case nir_intrinsic_store_per_view_output:
if (stage == MESA_SHADER_FRAGMENT)
UNREACHABLE("Should have been lowered by pan_nir_lower_fs_outputs");
else if (stage == MESA_SHADER_VERTEX)
bi_emit_store_vary(b, instr);
else
UNREACHABLE("Unsupported shader stage");
break;
case nir_intrinsic_load_cumulative_coverage_pan:
bi_mov_i32_to(b, dst, bi_preload(b, 60));
break;
@@ -2335,6 +2236,18 @@ bi_emit_intrinsic(bi_builder *b, nir_intrinsic_instr *instr)
bi_emit_store_cvt(b, instr, va_memory_access_from_nir(instr));
break;
case nir_intrinsic_load_idvs_output_buf_index_pan:
bi_mov_i32_to(b, dst, bi_preload(b, 59));
break;
case nir_intrinsic_lea_attr_pan:
bi_emit_lea_attr(b, instr);
break;
case nir_intrinsic_lea_buf_pan:
bi_emit_lea_buf(b, instr);
break;
case nir_intrinsic_load_tile_pan:
case nir_intrinsic_load_tile_res_pan:
bi_emit_ld_tile(b, instr);
@@ -6631,10 +6544,6 @@ bi_compile_variant_nir(nir_shader *nir,
ctx->malloc_idvs = (ctx->arch >= 9) && !inputs->no_idvs;
ctx->fau_consts_count = info.init_fau_consts_count;
if (!mesa_shader_stage_is_compute(nir->info.stage)) {
ctx->varying_layout = inputs->varying_layout;
}
unsigned execution_mode = nir->info.float_controls_execution_mode;
ctx->rtz_fp16 = nir_is_rounding_mode_rtz(execution_mode, 16);
ctx->rtz_fp32 = nir_is_rounding_mode_rtz(execution_mode, 32);
@@ -7098,6 +7007,18 @@ bifrost_compile_shader_nir(nir_shader *nir,
NIR_PASS(_, nir, nir_opt_if, 0);
}
}
bool has_extended_fifo = false;
if (pan_arch(inputs->gpu_id) >= 9) {
const uint64_t outputs = nir->info.outputs_written;
has_extended_fifo = valhal_writes_extended_fifo(outputs, false, true);
/* Must be the same with and without no_psiz */
assert(valhal_writes_extended_fifo(outputs, true, true) ==
has_extended_fifo);
}
NIR_PASS(_, nir, pan_nir_lower_vs_outputs, inputs->gpu_id,
inputs->varying_layout, info->vs.idvs, has_extended_fifo);
}
if (nir->info.stage == MESA_SHADER_FRAGMENT) {

View file

@@ -1057,8 +1057,6 @@ typedef struct {
enum bi_idvs_mode idvs;
unsigned num_blocks;
const struct pan_varying_layout *varying_layout;
/* Floating point rounding mode controls */
bool rtz_fp16;
bool rtz_fp32;

View file

@@ -57,6 +57,10 @@ bool pan_nir_lower_frag_coord_zw(nir_shader *shader);
bool pan_nir_lower_noperspective_vs(nir_shader *shader);
bool pan_nir_lower_noperspective_fs(nir_shader *shader);
bool pan_nir_lower_vs_outputs(nir_shader *shader, unsigned gpu_id,
const struct pan_varying_layout *varying_layout,
bool has_idvs, bool has_extended_fifo);
bool pan_nir_lower_fs_inputs(nir_shader *shader, unsigned gpu_id,
const struct pan_varying_layout *varying_layout,
bool valhall_use_ld_var_buf);

View file

@@ -8,6 +8,164 @@
#include "panfrost/model/pan_model.h"
struct lower_vs_outputs_ctx {
unsigned arch;
const struct pan_varying_layout *varying_layout;
bool has_idvs;
bool has_extended_fifo;
};
static void
build_attr_buf_write(struct nir_builder *b, nir_def *data,
const struct pan_varying_slot *slot, uint32_t view_index,
const struct lower_vs_outputs_ctx *ctx)
{
/* We need the precise memory layout */
pan_varying_layout_require_layout(ctx->varying_layout);
nir_def *index = nir_load_idvs_output_buf_index_pan(b);
uint32_t res, view_stride;
if (slot->section == PAN_VARYING_SECTION_GENERIC) {
/* The varying buffer is bound at index 1 on v12+ */
uint32_t res_index = ctx->arch >= 12 ? 1 : 0;
res = pan_res_handle(61, res_index);
view_stride = 4;
} else {
res = pan_res_handle(61, 0);
view_stride = ctx->has_extended_fifo ? 8 : 4;
}
uint32_t index_offset = view_index * view_stride;
if (slot->section == PAN_VARYING_SECTION_ATTRIBS)
index_offset += 4;
/* v9+ cache hints: generic varyings don't need caching, while
 * position/attribute varyings are reused by other units inside the GPU.
 * TODO: Do we really want ESTREAM on generic varyings?
 */
enum gl_access_qualifier access =
slot->section == PAN_VARYING_SECTION_GENERIC ? ACCESS_ESTREAM_PAN :
ACCESS_ISTREAM_PAN;
index = nir_iadd_imm(b, index, index_offset);
nir_def *addr = nir_lea_buf_pan(b, nir_imm_int(b, res), index);
addr = nir_pack_64_2x32(b, addr);
addr = nir_iadd(b, addr, nir_imm_int64(b, slot->offset));
/* Tag writes to gl_PointSize with a special intrinsic */
if (slot->location == VARYING_SLOT_PSIZ) {
nir_store_global_psiz_pan(b, data, addr, .access = access);
} else {
nir_store_global(b, data, addr, .access = access);
}
}
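
The buffer-index arithmetic in build_attr_buf_write is worth checking by hand: each view strides by 4 entries for generic varyings, by 8 (or 4 without the extended FIFO) for the position section, and the attribute FIFO sits 4 entries past the positions. A self-contained sketch of the same arithmetic; the enum and function names are mine, not Mesa's:

#include <stdint.h>
#include <stdio.h>

enum section { POSITION, ATTRIBS, GENERIC }; /* stand-in names */

/* Mirrors build_attr_buf_write(): per-view stride, plus the +4 bias
 * that places the attribute FIFO after the position FIFO. */
static uint32_t
buf_index_offset(enum section s, uint32_t view, int extended_fifo)
{
   uint32_t stride = (s == GENERIC) ? 4 : (extended_fifo ? 8 : 4);
   uint32_t off = view * stride;
   if (s == ATTRIBS)
      off += 4;
   return off;
}

int
main(void)
{
   /* view 1 with an extended position FIFO */
   printf("%u %u %u\n",
          buf_index_offset(POSITION, 1, 1),  /* 8 */
          buf_index_offset(ATTRIBS, 1, 1),   /* 12 */
          buf_index_offset(GENERIC, 1, 1));  /* 4 */
   return 0;
}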
static void
build_attr_desc_write(struct nir_builder *b, nir_def *data, uint32_t base,
nir_alu_type src_type,
const struct lower_vs_outputs_ctx *ctx)
{
nir_def *index = nir_imm_int(b, base);
nir_def *vertex_id = nir_load_raw_vertex_id_pan(b);
nir_def *instance_id = nir_load_instance_id(b);
nir_def *addr_cvt = nir_lea_attr_pan(b, index, vertex_id, instance_id,
.src_type = src_type);
nir_def *addr = nir_pack_64_2x32(b, nir_trim_vector(b, addr_cvt, 2));
nir_def *cvt = nir_channel(b, addr_cvt, 2);
nir_store_global_cvt_pan(b, data, addr, cvt, .src_type = src_type);
}
static bool
lower_vs_output_store(struct nir_builder *b,
nir_intrinsic_instr *store, void *cb_data)
{
const struct lower_vs_outputs_ctx *ctx = cb_data;
if (store->intrinsic != nir_intrinsic_store_output &&
store->intrinsic != nir_intrinsic_store_per_view_output)
return false;
b->cursor = nir_instr_remove(&store->instr);
nir_io_semantics sem = nir_intrinsic_io_semantics(store);
nir_alu_type src_type = nir_intrinsic_src_type(store);
unsigned src_bit_size = nir_alu_type_get_type_size(src_type);
/* Indirect array varyings are not yet supported (num_slots > 1) */
assert(sem.num_slots == 1);
assert(nir_src_as_uint(*nir_get_io_offset_src(store)) == 0);
/* We need the slot section for cache hints */
pan_varying_layout_require_format(ctx->varying_layout);
const struct pan_varying_slot *slot =
pan_varying_layout_find_slot(ctx->varying_layout, sem.location);
/* Special slots are read only */
assert(slot && slot->section != PAN_VARYING_SECTION_SPECIAL);
/* From v9, IO is resized to the real size of the slot */
assert(ctx->arch < 9 ||
src_bit_size == nir_alu_type_get_type_size(slot->alu_type));
/* Since v9 we cannot have separate attribute descriptors for the VS and
 * FS. There might be a mismatch on Gallium where the VS thinks it is
 * storing an int, but the data is actually a float, and that's what the
 * FS expects. So, for v9 onwards, until gallium is fixed, use auto32.
 * This still avoids the midgard quirk, since we only do this from v9.
 * TODO: fix all bugs with gallium and remove this patch
 */
if (ctx->arch >= 9 && src_bit_size == 32)
src_type = 32;
nir_def *data = store->src[0].ssa;
assert(src_bit_size == data->bit_size);
/* Trim the input so we don't write extra channels at the end. In effect,
* we fill in all the intermediate "holes" in the write mask, since we
* can't mask off stores. Since nir_lower_io_vars_to_temporaries ensures
* each varying is written at most once, anything that's masked out is
* undefined, so it doesn't matter what we write there. So we may as well
* do the simplest thing possible.
*/
const nir_component_mask_t write_mask = nir_intrinsic_write_mask(store);
data = nir_trim_vector(b, data, util_last_bit(write_mask));
if (ctx->arch >= 9 && ctx->has_idvs) {
uint32_t view_index = 0;
if (store->intrinsic == nir_intrinsic_store_per_view_output)
view_index = nir_src_as_uint(store->src[1]);
build_attr_buf_write(b, data, slot, view_index, ctx);
} else {
uint32_t base = nir_intrinsic_base(store);
assert(store->intrinsic != nir_intrinsic_store_per_view_output);
build_attr_desc_write(b, data, base, src_type, ctx);
}
return true;
}
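
The trim in lower_vs_output_store keeps everything up to the highest written channel: util_last_bit returns the index of the most significant set bit plus one, so holes in the write mask are stored over rather than masked off. A standalone sketch of that behavior, with last_bit as a stand-in for Mesa's util_last_bit:

#include <assert.h>

/* Stand-in for util_last_bit(): one past the index of the highest
 * set bit, so a write mask of 0b0101 keeps channels 0..2. */
static unsigned
last_bit(unsigned v)
{
   unsigned n = 0;
   while (v) {
      n++;
      v >>= 1;
   }
   return n;
}

int
main(void)
{
   /* Writing x and z of a vec4 stores three channels; y is
    * undefined anyway per nir_lower_io_vars_to_temporaries. */
   assert(last_bit(0x5) == 3); /* 0b0101 -> vec3 store */
   assert(last_bit(0x8) == 4); /* 0b1000 -> full vec4 store */
   return 0;
}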
bool
pan_nir_lower_vs_outputs(nir_shader *shader, unsigned gpu_id,
const struct pan_varying_layout *varying_layout,
bool has_idvs,
bool has_extended_fifo)
{
assert(shader->info.stage == MESA_SHADER_VERTEX);
const struct lower_vs_outputs_ctx ctx = {
.arch = pan_arch(gpu_id),
.varying_layout = varying_layout,
.has_idvs = has_idvs,
.has_extended_fifo = has_extended_fifo,
};
return nir_shader_intrinsics_pass(shader, lower_vs_output_store,
nir_metadata_control_flow,
(void *)&ctx);
}
struct lower_fs_inputs_ctx {
unsigned arch;
const struct pan_varying_layout *varying_layout;