From 43ffcf06f453c3f415da44a207fc8042781a4ff3 Mon Sep 17 00:00:00 2001 From: Lorenzo Rossi Date: Wed, 11 Mar 2026 17:45:33 +0100 Subject: [PATCH] pan/bi,nir: Divide memory_access from segments Valhall removed Bifrost's memory segments and added in its place memory access. Those were bolted on reserved bits as "pseudo-segments" and the emitter would catch these and emit the right memory access. This commit cleans it up a bit by making memory_access available directly and exposing it to NIR (this will be useful later). Signed-off-by: Lorenzo Rossi Reviewed-by: Faith Ekstrand Part-of: --- src/compiler/nir/nir_print.c | 2 + src/compiler/shader_enums.h | 18 +++++ .../compiler/bifrost/bifrost_compile.c | 68 +++++++++++++------ src/panfrost/compiler/bifrost/compiler.h | 1 + .../bifrost/valhall/test/test-packing.cpp | 9 +-- .../compiler/bifrost/valhall/va_pack.c | 27 +++----- 6 files changed, 80 insertions(+), 45 deletions(-) diff --git a/src/compiler/nir/nir_print.c b/src/compiler/nir/nir_print.c index 797cf6e97c7..497103e1e39 100644 --- a/src/compiler/nir/nir_print.c +++ b/src/compiler/nir/nir_print.c @@ -900,6 +900,8 @@ print_access(enum gl_access_qualifier access, print_state *state, const char *se { ACCESS_ATOMIC, "atomic" }, { ACCESS_FUSED_EU_DISABLE_INTEL, "fused-eu-disable-intel" }, { ACCESS_SPARSE, "sparse" }, + { ACCESS_ISTREAM_PAN, "istream-pan" }, + { ACCESS_ESTREAM_PAN, "estream-pan" }, }; bool first = true; diff --git a/src/compiler/shader_enums.h b/src/compiler/shader_enums.h index 66a77a72466..a45fda6458a 100644 --- a/src/compiler/shader_enums.h +++ b/src/compiler/shader_enums.h @@ -1266,6 +1266,24 @@ enum gl_access_qualifier * intrinsics for sparse. */ ACCESS_SPARSE = (1 << 20), + + /** + * Internal streaming access (v9+) + * + * Whether the memory is accessed in a streaming fashion inside of the GPU. + * Since the data is likely to be read inside of the GPU, the hardware will + * try to store it in level 2 cache. + */ + ACCESS_ISTREAM_PAN = (1 << 21), + + /** + * External streaming access (v9+) + * + * Whether the memory is accessed in a streaming fashion outside of the GPU. + * This hints the hardware to not cache the data, it could be useful for + * one-time accesses or if the data is larger than what the memory can store. + */ + ACCESS_ESTREAM_PAN = (1 << 22), }; enum gl_tess_spacing diff --git a/src/panfrost/compiler/bifrost/bifrost_compile.c b/src/panfrost/compiler/bifrost/bifrost_compile.c index ff53e557cd2..dce290e1c12 100644 --- a/src/panfrost/compiler/bifrost/bifrost_compile.c +++ b/src/panfrost/compiler/bifrost/bifrost_compile.c @@ -1146,9 +1146,9 @@ bi_emit_store_vary(bi_builder *b, nir_intrinsic_instr *instr) if (index_offset != 0) index = bi_iadd_imm_i32(b, index, index_offset); - const enum bi_seg seg = - slot->section == PAN_VARYING_SECTION_GENERIC ? BI_SEG_VARY - : BI_SEG_POS; + const enum va_memory_access mem_access = + slot->section == PAN_VARYING_SECTION_GENERIC ? VA_MEMORY_ACCESS_ESTREAM + : VA_MEMORY_ACCESS_ISTREAM; nir_src *offset_src = nir_get_io_offset_src(instr); assert(nir_src_is_const(*offset_src) && "assumes immediate offset"); @@ -1169,7 +1169,9 @@ bi_emit_store_vary(bi_builder *b, nir_intrinsic_instr *instr) bi_emit_split_i32(b, a, address, 2); - bi_instr *S = bi_store(b, nr * src_bit_sz, data, a[0], a[1], seg, offset); + bi_instr *S = bi_store(b, nr * src_bit_sz, data, a[0], a[1], BI_SEG_NONE, + offset); + S->mem_access = mem_access; S->is_psiz_write = slot->location == VARYING_SLOT_PSIZ; } else { assert(T_size == 32 || T_size == 16); @@ -1310,7 +1312,8 @@ bi_handle_segment(bi_builder *b, bi_index *addr_lo, bi_index *addr_hi, } static void -bi_emit_load(bi_builder *b, nir_intrinsic_instr *instr, enum bi_seg seg) +bi_emit_load(bi_builder *b, nir_intrinsic_instr *instr, enum bi_seg seg, + enum va_memory_access mem_access) { int16_t offset = 0; unsigned bits = instr->num_components * instr->def.bit_size; @@ -1320,12 +1323,14 @@ bi_emit_load(bi_builder *b, nir_intrinsic_instr *instr, enum bi_seg seg) bi_handle_segment(b, &addr_lo, &addr_hi, seg, &offset); - bi_load_to(b, bits, dest, addr_lo, addr_hi, seg, offset); + bi_instr *I = bi_load_to(b, bits, dest, addr_lo, addr_hi, seg, offset); + I->mem_access = mem_access; bi_emit_cached_split(b, dest, bits); } static bi_instr * -bi_emit_store(bi_builder *b, nir_intrinsic_instr *instr, enum bi_seg seg) +bi_emit_store(bi_builder *b, nir_intrinsic_instr *instr, enum bi_seg seg, + enum va_memory_access mem_access) { /* Require contiguous masks, gauranteed by nir_lower_wrmasks */ assert(nir_intrinsic_write_mask(instr) == @@ -1340,6 +1345,7 @@ bi_emit_store(bi_builder *b, nir_intrinsic_instr *instr, enum bi_seg seg) bi_instr *I = bi_store(b, instr->num_components * nir_src_bit_size(instr->src[0]), bi_src_index(&instr->src[0]), addr_lo, addr_hi, seg, offset); + I->mem_access = mem_access; return I; } @@ -1744,19 +1750,23 @@ va_emit_load_texel_buf_index_address(bi_builder *b, bi_index dst, } static void -bi_emit_load_cvt(bi_builder *b, bi_index dst, nir_intrinsic_instr *instr) +bi_emit_load_cvt(bi_builder *b, bi_index dst, nir_intrinsic_instr *instr, + enum va_memory_access mem_access) { bi_index addr = bi_src_index(&instr->src[0]); bi_index icd = bi_src_index(&instr->src[1]); - bi_ld_cvt_to(b, dst, bi_extract(b, addr, 0), bi_extract(b, addr, 1), icd, - bi_reg_fmt_for_nir(nir_intrinsic_dest_type(instr)), - instr->def.num_components - 1); + bi_instr *I = + bi_ld_cvt_to(b, dst, bi_extract(b, addr, 0), bi_extract(b, addr, 1), icd, + bi_reg_fmt_for_nir(nir_intrinsic_dest_type(instr)), + instr->def.num_components - 1); + I->mem_access = mem_access; bi_emit_cached_split_i32(b, dst, instr->def.num_components); } static void -bi_emit_store_cvt(bi_builder *b, nir_intrinsic_instr *instr) +bi_emit_store_cvt(bi_builder *b, nir_intrinsic_instr *instr, + enum va_memory_access mem_access) { bi_index value = bi_src_index(&instr->src[0]); bi_index addr = bi_src_index(&instr->src[1]); @@ -1773,8 +1783,10 @@ bi_emit_store_cvt(bi_builder *b, nir_intrinsic_instr *instr) regfmt = bi_reg_fmt_for_nir(src_type); } - bi_st_cvt(b, value, bi_extract(b, addr, 0), bi_extract(b, addr, 1), icd, - regfmt, instr->num_components - 1); + bi_instr *I = + bi_st_cvt(b, value, bi_extract(b, addr, 0), bi_extract(b, addr, 1), icd, + regfmt, instr->num_components - 1); + I->mem_access = mem_access; } static void @@ -1980,6 +1992,17 @@ bi_subgroup_from_cluster_size(unsigned cluster_size) } } +static enum va_memory_access +va_memory_access_from_nir(const nir_intrinsic_instr *intr) +{ + const enum gl_access_qualifier access = nir_intrinsic_access(intr); + if (access & ACCESS_ISTREAM_PAN) + return VA_MEMORY_ACCESS_ISTREAM; + if (access & ACCESS_ESTREAM_PAN) + return VA_MEMORY_ACCESS_ESTREAM; + return VA_MEMORY_ACCESS_NONE; +} + static void bi_emit_intrinsic(bi_builder *b, nir_intrinsic_instr *instr) { @@ -2139,30 +2162,31 @@ bi_emit_intrinsic(bi_builder *b, nir_intrinsic_instr *instr) case nir_intrinsic_load_global: case nir_intrinsic_load_global_constant: - bi_emit_load(b, instr, BI_SEG_NONE); + bi_emit_load(b, instr, BI_SEG_NONE, va_memory_access_from_nir(instr)); break; case nir_intrinsic_store_global: case nir_intrinsic_store_global_psiz_pan: { - bi_instr *I = bi_emit_store(b, instr, BI_SEG_NONE); + bi_instr *I = bi_emit_store(b, instr, BI_SEG_NONE, + va_memory_access_from_nir(instr)); I->is_psiz_write = instr->intrinsic == nir_intrinsic_store_global_psiz_pan; break; } case nir_intrinsic_load_scratch: - bi_emit_load(b, instr, BI_SEG_TL); + bi_emit_load(b, instr, BI_SEG_TL, VA_MEMORY_ACCESS_FORCE); break; case nir_intrinsic_store_scratch: - bi_emit_store(b, instr, BI_SEG_TL); + bi_emit_store(b, instr, BI_SEG_TL, VA_MEMORY_ACCESS_FORCE); break; case nir_intrinsic_load_shared: - bi_emit_load(b, instr, BI_SEG_WLS); + bi_emit_load(b, instr, BI_SEG_WLS, VA_MEMORY_ACCESS_NONE); break; case nir_intrinsic_store_shared: - bi_emit_store(b, instr, BI_SEG_WLS); + bi_emit_store(b, instr, BI_SEG_WLS, VA_MEMORY_ACCESS_NONE); break; case nir_intrinsic_barrier: @@ -2304,11 +2328,11 @@ bi_emit_intrinsic(bi_builder *b, nir_intrinsic_instr *instr) break; case nir_intrinsic_load_global_cvt_pan: - bi_emit_load_cvt(b, dst, instr); + bi_emit_load_cvt(b, dst, instr, va_memory_access_from_nir(instr)); break; case nir_intrinsic_store_global_cvt_pan: - bi_emit_store_cvt(b, instr); + bi_emit_store_cvt(b, instr, va_memory_access_from_nir(instr)); break; case nir_intrinsic_load_tile_pan: diff --git a/src/panfrost/compiler/bifrost/compiler.h b/src/panfrost/compiler/bifrost/compiler.h index 514b67b9024..a9fe1ddd837 100644 --- a/src/panfrost/compiler/bifrost/compiler.h +++ b/src/panfrost/compiler/bifrost/compiler.h @@ -637,6 +637,7 @@ typedef struct { struct { enum bi_seg seg; /* LOAD, STORE, SEG_ADD, SEG_SUB */ + enum va_memory_access mem_access; /* LOAD, STORE, LD_CVT, ST_CVT */ bool preserve_null; /* SEG_ADD, SEG_SUB */ enum bi_extend extend; /* LOAD, IMUL */ }; diff --git a/src/panfrost/compiler/bifrost/valhall/test/test-packing.cpp b/src/panfrost/compiler/bifrost/valhall/test/test-packing.cpp index 85093247005..1f5003f9686 100644 --- a/src/panfrost/compiler/bifrost/valhall/test/test-packing.cpp +++ b/src/panfrost/compiler/bifrost/valhall/test/test-packing.cpp @@ -291,11 +291,12 @@ TEST_F(ValhallPacking, LeaBufImm) 0x005e84040000007b); } -TEST_F(ValhallPacking, StoreSegment) +TEST_F(ValhallPacking, StoreMemoryAccess) { - CASE(bi_store_i96(b, bi_register(0), bi_discard(bi_register(4)), - bi_discard(bi_register(5)), BI_SEG_VARY, 0), - 0x0061400632000044); + bi_instr *I = bi_store_i96(b, bi_register(0), bi_discard(bi_register(4)), + bi_discard(bi_register(5)), BI_SEG_NONE, 0); + I->mem_access = VA_MEMORY_ACCESS_ESTREAM; + CASE(I, 0x0061400632000044); } TEST_F(ValhallPacking, Convert16To32) diff --git a/src/panfrost/compiler/bifrost/valhall/va_pack.c b/src/panfrost/compiler/bifrost/valhall/va_pack.c index ad930e28014..9242da468e5 100644 --- a/src/panfrost/compiler/bifrost/valhall/va_pack.c +++ b/src/panfrost/compiler/bifrost/valhall/va_pack.c @@ -737,35 +737,21 @@ va_pack_load(const bi_instr *I, bool buffer_descriptor) hex |= va_pack_byte_offset(I); hex |= (uint64_t)va_pack_src(I, 0) << 0; + hex |= (uint64_t)I->mem_access << 24; if (buffer_descriptor) hex |= (uint64_t)va_pack_src(I, 1) << 8; return hex; } - -static uint64_t -va_pack_memory_access(const bi_instr *I) -{ - switch (I->seg) { - case BI_SEG_TL: - return VA_MEMORY_ACCESS_FORCE; - case BI_SEG_POS: - return VA_MEMORY_ACCESS_ISTREAM; - case BI_SEG_VARY: - return VA_MEMORY_ACCESS_ESTREAM; - default: - return VA_MEMORY_ACCESS_NONE; - } -} - static uint64_t va_pack_store(const bi_instr *I) { - uint64_t hex = va_pack_memory_access(I) << 24; + uint64_t hex = 0; va_validate_register_pair(I, 1); hex |= (uint64_t)va_pack_src(I, 1) << 0; + hex |= I->mem_access << 24; hex |= va_pack_byte_offset(I); @@ -971,15 +957,18 @@ va_pack_instr(const bi_instr *I, unsigned arch) /* Conversion descriptor */ hex |= (uint64_t)va_pack_src(I, 2) << 16; - hex |= va_pack_memory_access(I) << 37; + hex |= (uint64_t)I->mem_access << 37; break; case BI_OPCODE_ST_CVT: /* Staging read */ - hex |= va_pack_store(I); + va_validate_register_pair(I, 1); + hex |= (uint64_t)va_pack_src(I, 1) << 0; + hex |= va_pack_byte_offset(I); /* Conversion descriptor */ hex |= (uint64_t)va_pack_src(I, 3) << 16; + hex |= (uint64_t)I->mem_access << 37; break; case BI_OPCODE_BLEND: {