From 68ab25e6d4ef01d19b52a015b4240a69fa456f38 Mon Sep 17 00:00:00 2001 From: Job Noorman Date: Mon, 3 Mar 2025 09:58:54 +0100 Subject: [PATCH] ir3: split immediate state from rest of const state On a7xx, the immediates that get promoted to const registers will be initialized in the preamble instead of being part of the const state. So technically, we won't need the immediate state that is part of the const state anymore on a7xx. However, it is still a convenient place for ir3_cp to store the immediates that should be promoted to const registers before they are lowered to the preamble. This causes one issue: the binning pass isn't allowed to modify the const state while it's perfectly fine for it to use different immediates compared to the non-binning pass on a7xx. Even pre-a7xx this is fine as long as the size of the immediate buffer is the same. To allow the binning pass to modify its immediate state while keeping its const state immutable, this commit moves the fields related to immediates into a new struct. Runtime checks are added to enforce that the size of the immediate buffer is the same for the binning and non-binning variant pre-a7xx. Signed-off-by: Job Noorman Part-of: --- src/freedreno/computerator/a4xx.cc | 5 +- src/freedreno/computerator/a6xx.cc | 15 ++-- src/freedreno/ir3/ir3_compiler_nir.c | 12 +++ src/freedreno/ir3/ir3_disk_cache.c | 24 ++++-- src/freedreno/ir3/ir3_parser.y | 26 +++--- src/freedreno/ir3/ir3_shader.c | 83 +++++++++++++------ src/freedreno/ir3/ir3_shader.h | 18 +++- src/freedreno/vulkan/tu_shader.cc | 6 +- src/gallium/drivers/freedreno/ir3/ir3_const.h | 4 +- 9 files changed, 130 insertions(+), 63 deletions(-) diff --git a/src/freedreno/computerator/a4xx.cc b/src/freedreno/computerator/a4xx.cc index 30871ca781f..a5d2ca0e346 100644 --- a/src/freedreno/computerator/a4xx.cc +++ b/src/freedreno/computerator/a4xx.cc @@ -208,7 +208,8 @@ cs_const_emit(struct fd_ringbuffer *ring, struct kernel *kernel, const struct ir3_const_state *const_state = ir3_const_state(v); uint32_t base = const_state->allocs.max_const_offset_vec4; - int size = DIV_ROUND_UP(const_state->immediates_count, 4); + const struct ir3_imm_const_state *imm_state = &v->imm_state; + int size = DIV_ROUND_UP(imm_state->count, 4); /* truncate size to avoid writing constants that shader * does not use: @@ -220,7 +221,7 @@ cs_const_emit(struct fd_ringbuffer *ring, struct kernel *kernel, size *= 4; if (size > 0) { - emit_const(ring, kernel, base, size, const_state->immediates); + emit_const(ring, kernel, base, size, imm_state->values); } } diff --git a/src/freedreno/computerator/a6xx.cc b/src/freedreno/computerator/a6xx.cc index 2b38ad3cd4d..c07474c3620 100644 --- a/src/freedreno/computerator/a6xx.cc +++ b/src/freedreno/computerator/a6xx.cc @@ -317,14 +317,15 @@ cs_const_emit(struct fd_ringbuffer *ring, struct kernel *kernel, const struct ir3_const_state *const_state = ir3_const_state(v); uint32_t base = const_state->allocs.max_const_offset_vec4; - int size = DIV_ROUND_UP(const_state->immediates_count, 4); + const struct ir3_imm_const_state *imm_state = &v->imm_state; + int size = DIV_ROUND_UP(imm_state->count, 4); if (ir3_kernel->info.numwg != INVALID_REG) { assert((ir3_kernel->info.numwg & 0x3) == 0); int idx = ir3_kernel->info.numwg >> 2; - const_state->immediates[idx * 4 + 0] = grid[0]; - const_state->immediates[idx * 4 + 1] = grid[1]; - const_state->immediates[idx * 4 + 2] = grid[2]; + imm_state->values[idx * 4 + 0] = grid[0]; + imm_state->values[idx * 4 + 1] = grid[1]; + imm_state->values[idx * 4 + 2] = grid[2]; } for (int i = 0; i < MAX_BUFS; i++) { @@ -334,8 +335,8 @@ cs_const_emit(struct fd_ringbuffer *ring, struct kernel *kernel, uint64_t iova = fd_bo_get_iova(kernel->bufs[i]); - const_state->immediates[idx * 4 + 1] = iova >> 32; - const_state->immediates[idx * 4 + 0] = (iova << 32) >> 32; + imm_state->values[idx * 4 + 1] = iova >> 32; + imm_state->values[idx * 4 + 0] = (iova << 32) >> 32; } } @@ -349,7 +350,7 @@ cs_const_emit(struct fd_ringbuffer *ring, struct kernel *kernel, size *= 4; if (size > 0) { - emit_const(ring, base, size, const_state->immediates); + emit_const(ring, base, size, imm_state->values); } } diff --git a/src/freedreno/ir3/ir3_compiler_nir.c b/src/freedreno/ir3/ir3_compiler_nir.c index feb41ff4d3b..e9b692a9feb 100644 --- a/src/freedreno/ir3/ir3_compiler_nir.c +++ b/src/freedreno/ir3/ir3_compiler_nir.c @@ -5754,6 +5754,18 @@ ir3_compile_shader_nir(struct ir3_compiler *compiler, so->need_full_quad = true; } + /* If we're uploading immediates as part of the const state, we need to make + * sure the binning and non-binning variants have the same size. Pre-allocate + * for the binning variant, ir3_const_add_imm will ensure we don't add more + * immediates than allowed. + */ + if (so->binning_pass && !compiler->load_shader_consts_via_preamble && + so->nonbinning->imm_state.size) { + ASSERTED bool success = + ir3_const_ensure_imm_size(so, so->nonbinning->imm_state.size); + assert(success); + } + ir3_debug_print(ir, "AFTER: nir->ir3"); ir3_validate(ir); diff --git a/src/freedreno/ir3/ir3_disk_cache.c b/src/freedreno/ir3/ir3_disk_cache.c index 1d35b435eb2..1bb46460ccb 100644 --- a/src/freedreno/ir3/ir3_disk_cache.c +++ b/src/freedreno/ir3/ir3_disk_cache.c @@ -117,10 +117,14 @@ retrieve_variant(struct blob_reader *blob, struct ir3_shader_variant *v) if (!v->binning_pass) { blob_copy_bytes(blob, v->const_state, sizeof(*v->const_state)); - unsigned immeds_sz = v->const_state->immediates_size * - sizeof(v->const_state->immediates[0]); - v->const_state->immediates = ralloc_size(v->const_state, immeds_sz); - blob_copy_bytes(blob, v->const_state->immediates, immeds_sz); + } + + if (!v->compiler->load_shader_consts_via_preamble) { + v->imm_state.size = blob_read_uint32(blob); + v->imm_state.count = v->imm_state.size; + uint32_t immeds_sz = v->imm_state.size * sizeof(v->imm_state.values[0]); + v->imm_state.values = ralloc_size(v, immeds_sz); + blob_copy_bytes(blob, v->imm_state.values, immeds_sz); } } @@ -139,9 +143,15 @@ store_variant(struct blob *blob, const struct ir3_shader_variant *v) if (!v->binning_pass) { blob_write_bytes(blob, v->const_state, sizeof(*v->const_state)); - unsigned immeds_sz = v->const_state->immediates_size * - sizeof(v->const_state->immediates[0]); - blob_write_bytes(blob, v->const_state->immediates, immeds_sz); + } + + /* When load_shader_consts_via_preamble, immediates are loaded in the + * preamble and hence part of bin. + */ + if (!v->compiler->load_shader_consts_via_preamble) { + blob_write_uint32(blob, v->imm_state.size); + uint32_t immeds_sz = v->imm_state.size * sizeof(v->imm_state.values[0]); + blob_write_bytes(blob, v->imm_state.values, immeds_sz); } } diff --git a/src/freedreno/ir3/ir3_parser.y b/src/freedreno/ir3/ir3_parser.y index b46e3b9826d..6b890d799fa 100644 --- a/src/freedreno/ir3/ir3_parser.y +++ b/src/freedreno/ir3/ir3_parser.y @@ -243,23 +243,23 @@ static void fixup_cat5_s2en(void) static void add_const(unsigned reg, unsigned c0, unsigned c1, unsigned c2, unsigned c3) { - struct ir3_const_state *const_state = ir3_const_state_mut(variant); + struct ir3_imm_const_state *imm_state = &variant->imm_state; assert((reg & 0x7) == 0); int idx = reg >> (1 + 2); /* low bit is half vs full, next two bits are swiz */ - if (idx * 4 + 4 > const_state->immediates_size) { - const_state->immediates = rerzalloc(const_state, - const_state->immediates, - __typeof__(const_state->immediates[0]), - const_state->immediates_size, + if (idx * 4 + 4 > imm_state->size) { + imm_state->values = rerzalloc(imm_state, + imm_state->values, + __typeof__(imm_state->values[0]), + imm_state->size, idx * 4 + 4); - for (unsigned i = const_state->immediates_size; i < idx * 4; i++) - const_state->immediates[i] = 0xd0d0d0d0; - const_state->immediates_size = const_state->immediates_count = idx * 4 + 4; + for (unsigned i = imm_state->size; i < idx * 4; i++) + imm_state->values[i] = 0xd0d0d0d0; + imm_state->size = imm_state->count = idx * 4 + 4; } - const_state->immediates[idx * 4 + 0] = c0; - const_state->immediates[idx * 4 + 1] = c1; - const_state->immediates[idx * 4 + 2] = c2; - const_state->immediates[idx * 4 + 3] = c3; + imm_state->values[idx * 4 + 0] = c0; + imm_state->values[idx * 4 + 1] = c1; + imm_state->values[idx * 4 + 2] = c2; + imm_state->values[idx * 4 + 3] = c3; } static void add_buf_init_val(uint32_t val) diff --git a/src/freedreno/ir3/ir3_shader.c b/src/freedreno/ir3/ir3_shader.c index 24f9322da48..d831faa67e0 100644 --- a/src/freedreno/ir3/ir3_shader.c +++ b/src/freedreno/ir3/ir3_shader.c @@ -25,6 +25,45 @@ #include "disasm.h" +bool +ir3_const_ensure_imm_size(struct ir3_shader_variant *v, unsigned size) +{ + struct ir3_imm_const_state *imm_state = &v->imm_state; + + if (size <= imm_state->size) { + return true; + } + + /* Immediates are uploaded in units of vec4 so make sure our buffer is large + * enough. + */ + size = ALIGN(size, 4); + + /* Pre-a7xx, the immediates that get lowered to const registers are + * emitted as part of the const state so the total size of immediates + * should be the same for the binning and non-binning variants. Make sure + * we don't increase the size beyond that of the non-binning variant. + */ + if (v->binning_pass && !v->compiler->load_shader_consts_via_preamble && + size > v->nonbinning->imm_state.size) { + return false; + } + + imm_state->values = + rerzalloc(v, imm_state->values, __typeof__(imm_state->values[0]), + imm_state->size, size); + imm_state->size = size; + + /* Note that ir3 printing relies on having groups of 4 dwords, so we fill the + * unused slots with a dummy value. + */ + for (int i = imm_state->count; i < imm_state->size; i++) { + imm_state->values[i] = 0xd0d0d0d0; + } + + return true; +} + static uint16_t const_imm_index_to_reg(const struct ir3_const_state *const_state, unsigned i) { @@ -35,9 +74,10 @@ uint16_t ir3_const_find_imm(struct ir3_shader_variant *v, uint32_t imm) { const struct ir3_const_state *const_state = ir3_const_state(v); + const struct ir3_imm_const_state *imm_state = &v->imm_state; - for (unsigned i = 0; i < const_state->immediates_count; i++) { - if (const_state->immediates[i] == imm) + for (unsigned i = 0; i < imm_state->count; i++) { + if (imm_state->values[i] == imm) return const_imm_index_to_reg(const_state, i); } @@ -47,36 +87,26 @@ ir3_const_find_imm(struct ir3_shader_variant *v, uint32_t imm) uint16_t ir3_const_add_imm(struct ir3_shader_variant *v, uint32_t imm) { - struct ir3_const_state *const_state = ir3_const_state_mut(v); + const struct ir3_const_state *const_state = ir3_const_state(v); + struct ir3_imm_const_state *imm_state = &v->imm_state; - /* Reallocate for 4 more elements whenever it's necessary. Note that ir3 - * printing relies on having groups of 4 dwords, so we fill the unused - * slots with a dummy value. - */ - if (const_state->immediates_count == const_state->immediates_size) { - const_state->immediates = rerzalloc( - const_state, const_state->immediates, - __typeof__(const_state->immediates[0]), const_state->immediates_size, - const_state->immediates_size + 4); - const_state->immediates_size += 4; - - for (int i = const_state->immediates_count; - i < const_state->immediates_size; i++) { - const_state->immediates[i] = 0xd0d0d0d0; + /* Reallocate for 4 more elements whenever it's necessary. */ + if (imm_state->count == imm_state->size) { + if (!ir3_const_ensure_imm_size(v, imm_state->size + 4)) { + return INVALID_CONST_REG; } } /* Add on a new immediate to be pushed, if we have space left in the * constbuf. */ - if (const_state->allocs.max_const_offset_vec4 + - const_state->immediates_count / 4 >= + if (const_state->allocs.max_const_offset_vec4 + imm_state->count / 4 >= ir3_max_const(v)) { return INVALID_CONST_REG; } - const_state->immediates[const_state->immediates_count] = imm; - return const_imm_index_to_reg(const_state, const_state->immediates_count++); + imm_state->values[imm_state->count] = imm; + return const_imm_index_to_reg(const_state, imm_state->count++); } int @@ -912,14 +942,15 @@ ir3_shader_disasm(struct ir3_shader_variant *so, uint32_t *bin, FILE *out) } const struct ir3_const_state *const_state = ir3_const_state(so); - for (i = 0; i < DIV_ROUND_UP(const_state->immediates_count, 4); i++) { + const struct ir3_imm_const_state *imm_state = &so->imm_state; + for (i = 0; i < DIV_ROUND_UP(imm_state->count, 4); i++) { fprintf(out, "@const(c%d.x)\t", const_state->allocs.max_const_offset_vec4 + i); fprintf(out, "0x%08x, 0x%08x, 0x%08x, 0x%08x\n", - const_state->immediates[i * 4 + 0], - const_state->immediates[i * 4 + 1], - const_state->immediates[i * 4 + 2], - const_state->immediates[i * 4 + 3]); + imm_state->values[i * 4 + 0], + imm_state->values[i * 4 + 1], + imm_state->values[i * 4 + 2], + imm_state->values[i * 4 + 3]); } ir3_isa_disasm(bin, so->info.sizedwords * 4, out, diff --git a/src/freedreno/ir3/ir3_shader.h b/src/freedreno/ir3/ir3_shader.h index 64ccc949185..f73985cbdb5 100644 --- a/src/freedreno/ir3/ir3_shader.h +++ b/src/freedreno/ir3/ir3_shader.h @@ -260,6 +260,12 @@ struct ir3_const_image_dims { uint32_t off[IR3_MAX_SHADER_IMAGES]; }; +struct ir3_imm_const_state { + unsigned size; + unsigned count; + uint32_t *values; +}; + /** * Describes the layout of shader consts in the const register file * and additional info about individual allocations. @@ -289,10 +295,6 @@ struct ir3_const_state { struct ir3_const_image_dims image_dims; - unsigned immediates_count; - unsigned immediates_size; - uint32_t *immediates; - /* State of ubo access lowered to push consts: */ struct ir3_ubo_analysis_state ubo_state; enum ir3_push_consts_type push_consts_type; @@ -674,6 +676,13 @@ struct ir3_shader_variant { struct ir3_const_state *const_state; + /* Immediate values that will be lowered to const registers. Before a7xx, + * this will be uploaded together with the const_state. From a7xx on (where + * load_shader_consts_via_preamble is true), this will be lowered to const + * stores in the preamble. + */ + struct ir3_imm_const_state imm_state; + /* * The following macros are used by the shader disk cache save/ * restore paths to serialize/deserialize the variant. Any @@ -1083,6 +1092,7 @@ ir3_max_const(const struct ir3_shader_variant *v) return _ir3_max_const(v, v->key.safe_constlen); } +bool ir3_const_ensure_imm_size(struct ir3_shader_variant *v, unsigned size); uint16_t ir3_const_find_imm(struct ir3_shader_variant *v, uint32_t imm); uint16_t ir3_const_add_imm(struct ir3_shader_variant *v, uint32_t imm); diff --git a/src/freedreno/vulkan/tu_shader.cc b/src/freedreno/vulkan/tu_shader.cc index 14aa5579b55..f54de781a6a 100644 --- a/src/freedreno/vulkan/tu_shader.cc +++ b/src/freedreno/vulkan/tu_shader.cc @@ -1157,7 +1157,8 @@ tu_xs_get_immediates_packet_size_dwords(const struct ir3_shader_variant *xs) { const struct ir3_const_state *const_state = ir3_const_state(xs); uint32_t base = const_state->allocs.max_const_offset_vec4; - int32_t size = DIV_ROUND_UP(const_state->immediates_count, 4); + const struct ir3_imm_const_state *imm_state = &xs->imm_state; + int32_t size = DIV_ROUND_UP(imm_state->count, 4); /* truncate size to avoid writing constants that shader * does not use: @@ -1365,6 +1366,7 @@ tu6_emit_xs(struct tu_cs *cs, const struct ir3_const_state *const_state = ir3_const_state(xs); uint32_t base = const_state->allocs.max_const_offset_vec4; + const struct ir3_imm_const_state *imm_state = &xs->imm_state; unsigned immediate_size = tu_xs_get_immediates_packet_size_dwords(xs); if (immediate_size > 0) { @@ -1378,7 +1380,7 @@ tu6_emit_xs(struct tu_cs *cs, tu_cs_emit(cs, CP_LOAD_STATE6_1_EXT_SRC_ADDR(0)); tu_cs_emit(cs, CP_LOAD_STATE6_2_EXT_SRC_ADDR_HI(0)); - tu_cs_emit_array(cs, const_state->immediates, immediate_size); + tu_cs_emit_array(cs, imm_state->values, immediate_size); } if (const_state->consts_ubo.idx != -1) { diff --git a/src/gallium/drivers/freedreno/ir3/ir3_const.h b/src/gallium/drivers/freedreno/ir3/ir3_const.h index d430a0133ab..21f91fc3952 100644 --- a/src/gallium/drivers/freedreno/ir3/ir3_const.h +++ b/src/gallium/drivers/freedreno/ir3/ir3_const.h @@ -300,7 +300,7 @@ ir3_emit_immediates(const struct ir3_shader_variant *v, { const struct ir3_const_state *const_state = ir3_const_state(v); uint32_t base = const_state->allocs.max_const_offset_vec4; - int size = DIV_ROUND_UP(const_state->immediates_count, 4); + int size = DIV_ROUND_UP(v->imm_state.count, 4); /* truncate size to avoid writing constants that shader * does not use: @@ -312,7 +312,7 @@ ir3_emit_immediates(const struct ir3_shader_variant *v, size *= 4; if (size > 0) - emit_const_user(ring, v, base, size, const_state->immediates); + emit_const_user(ring, v, base, size, v->imm_state.values); /* NIR constant data has the same lifetime as immediates, so upload it * now, too.