diff --git a/src/freedreno/computerator/a4xx.cc b/src/freedreno/computerator/a4xx.cc index 30871ca781f..a5d2ca0e346 100644 --- a/src/freedreno/computerator/a4xx.cc +++ b/src/freedreno/computerator/a4xx.cc @@ -208,7 +208,8 @@ cs_const_emit(struct fd_ringbuffer *ring, struct kernel *kernel, const struct ir3_const_state *const_state = ir3_const_state(v); uint32_t base = const_state->allocs.max_const_offset_vec4; - int size = DIV_ROUND_UP(const_state->immediates_count, 4); + const struct ir3_imm_const_state *imm_state = &v->imm_state; + int size = DIV_ROUND_UP(imm_state->count, 4); /* truncate size to avoid writing constants that shader * does not use: @@ -220,7 +221,7 @@ cs_const_emit(struct fd_ringbuffer *ring, struct kernel *kernel, size *= 4; if (size > 0) { - emit_const(ring, kernel, base, size, const_state->immediates); + emit_const(ring, kernel, base, size, imm_state->values); } } diff --git a/src/freedreno/computerator/a6xx.cc b/src/freedreno/computerator/a6xx.cc index 2b38ad3cd4d..c07474c3620 100644 --- a/src/freedreno/computerator/a6xx.cc +++ b/src/freedreno/computerator/a6xx.cc @@ -317,14 +317,15 @@ cs_const_emit(struct fd_ringbuffer *ring, struct kernel *kernel, const struct ir3_const_state *const_state = ir3_const_state(v); uint32_t base = const_state->allocs.max_const_offset_vec4; - int size = DIV_ROUND_UP(const_state->immediates_count, 4); + const struct ir3_imm_const_state *imm_state = &v->imm_state; + int size = DIV_ROUND_UP(imm_state->count, 4); if (ir3_kernel->info.numwg != INVALID_REG) { assert((ir3_kernel->info.numwg & 0x3) == 0); int idx = ir3_kernel->info.numwg >> 2; - const_state->immediates[idx * 4 + 0] = grid[0]; - const_state->immediates[idx * 4 + 1] = grid[1]; - const_state->immediates[idx * 4 + 2] = grid[2]; + imm_state->values[idx * 4 + 0] = grid[0]; + imm_state->values[idx * 4 + 1] = grid[1]; + imm_state->values[idx * 4 + 2] = grid[2]; } for (int i = 0; i < MAX_BUFS; i++) { @@ -334,8 +335,8 @@ cs_const_emit(struct fd_ringbuffer *ring, struct kernel *kernel, uint64_t iova = fd_bo_get_iova(kernel->bufs[i]); - const_state->immediates[idx * 4 + 1] = iova >> 32; - const_state->immediates[idx * 4 + 0] = (iova << 32) >> 32; + imm_state->values[idx * 4 + 1] = iova >> 32; + imm_state->values[idx * 4 + 0] = (iova << 32) >> 32; } } @@ -349,7 +350,7 @@ cs_const_emit(struct fd_ringbuffer *ring, struct kernel *kernel, size *= 4; if (size > 0) { - emit_const(ring, base, size, const_state->immediates); + emit_const(ring, base, size, imm_state->values); } } diff --git a/src/freedreno/ir3/ir3_compiler_nir.c b/src/freedreno/ir3/ir3_compiler_nir.c index feb41ff4d3b..e9b692a9feb 100644 --- a/src/freedreno/ir3/ir3_compiler_nir.c +++ b/src/freedreno/ir3/ir3_compiler_nir.c @@ -5754,6 +5754,18 @@ ir3_compile_shader_nir(struct ir3_compiler *compiler, so->need_full_quad = true; } + /* If we're uploading immediates as part of the const state, we need to make + * sure the binning and non-binning variants have the same size. Pre-allocate + * for the binning variant, ir3_const_add_imm will ensure we don't add more + * immediates than allowed. + */ + if (so->binning_pass && !compiler->load_shader_consts_via_preamble && + so->nonbinning->imm_state.size) { + ASSERTED bool success = + ir3_const_ensure_imm_size(so, so->nonbinning->imm_state.size); + assert(success); + } + ir3_debug_print(ir, "AFTER: nir->ir3"); ir3_validate(ir); diff --git a/src/freedreno/ir3/ir3_disk_cache.c b/src/freedreno/ir3/ir3_disk_cache.c index 1d35b435eb2..1bb46460ccb 100644 --- a/src/freedreno/ir3/ir3_disk_cache.c +++ b/src/freedreno/ir3/ir3_disk_cache.c @@ -117,10 +117,14 @@ retrieve_variant(struct blob_reader *blob, struct ir3_shader_variant *v) if (!v->binning_pass) { blob_copy_bytes(blob, v->const_state, sizeof(*v->const_state)); - unsigned immeds_sz = v->const_state->immediates_size * - sizeof(v->const_state->immediates[0]); - v->const_state->immediates = ralloc_size(v->const_state, immeds_sz); - blob_copy_bytes(blob, v->const_state->immediates, immeds_sz); + } + + if (!v->compiler->load_shader_consts_via_preamble) { + v->imm_state.size = blob_read_uint32(blob); + v->imm_state.count = v->imm_state.size; + uint32_t immeds_sz = v->imm_state.size * sizeof(v->imm_state.values[0]); + v->imm_state.values = ralloc_size(v, immeds_sz); + blob_copy_bytes(blob, v->imm_state.values, immeds_sz); } } @@ -139,9 +143,15 @@ store_variant(struct blob *blob, const struct ir3_shader_variant *v) if (!v->binning_pass) { blob_write_bytes(blob, v->const_state, sizeof(*v->const_state)); - unsigned immeds_sz = v->const_state->immediates_size * - sizeof(v->const_state->immediates[0]); - blob_write_bytes(blob, v->const_state->immediates, immeds_sz); + } + + /* When load_shader_consts_via_preamble, immediates are loaded in the + * preamble and hence part of bin. + */ + if (!v->compiler->load_shader_consts_via_preamble) { + blob_write_uint32(blob, v->imm_state.size); + uint32_t immeds_sz = v->imm_state.size * sizeof(v->imm_state.values[0]); + blob_write_bytes(blob, v->imm_state.values, immeds_sz); } } diff --git a/src/freedreno/ir3/ir3_parser.y b/src/freedreno/ir3/ir3_parser.y index b46e3b9826d..6b890d799fa 100644 --- a/src/freedreno/ir3/ir3_parser.y +++ b/src/freedreno/ir3/ir3_parser.y @@ -243,23 +243,23 @@ static void fixup_cat5_s2en(void) static void add_const(unsigned reg, unsigned c0, unsigned c1, unsigned c2, unsigned c3) { - struct ir3_const_state *const_state = ir3_const_state_mut(variant); + struct ir3_imm_const_state *imm_state = &variant->imm_state; assert((reg & 0x7) == 0); int idx = reg >> (1 + 2); /* low bit is half vs full, next two bits are swiz */ - if (idx * 4 + 4 > const_state->immediates_size) { - const_state->immediates = rerzalloc(const_state, - const_state->immediates, - __typeof__(const_state->immediates[0]), - const_state->immediates_size, + if (idx * 4 + 4 > imm_state->size) { + imm_state->values = rerzalloc(imm_state, + imm_state->values, + __typeof__(imm_state->values[0]), + imm_state->size, idx * 4 + 4); - for (unsigned i = const_state->immediates_size; i < idx * 4; i++) - const_state->immediates[i] = 0xd0d0d0d0; - const_state->immediates_size = const_state->immediates_count = idx * 4 + 4; + for (unsigned i = imm_state->size; i < idx * 4; i++) + imm_state->values[i] = 0xd0d0d0d0; + imm_state->size = imm_state->count = idx * 4 + 4; } - const_state->immediates[idx * 4 + 0] = c0; - const_state->immediates[idx * 4 + 1] = c1; - const_state->immediates[idx * 4 + 2] = c2; - const_state->immediates[idx * 4 + 3] = c3; + imm_state->values[idx * 4 + 0] = c0; + imm_state->values[idx * 4 + 1] = c1; + imm_state->values[idx * 4 + 2] = c2; + imm_state->values[idx * 4 + 3] = c3; } static void add_buf_init_val(uint32_t val) diff --git a/src/freedreno/ir3/ir3_shader.c b/src/freedreno/ir3/ir3_shader.c index 24f9322da48..d831faa67e0 100644 --- a/src/freedreno/ir3/ir3_shader.c +++ b/src/freedreno/ir3/ir3_shader.c @@ -25,6 +25,45 @@ #include "disasm.h" +bool +ir3_const_ensure_imm_size(struct ir3_shader_variant *v, unsigned size) +{ + struct ir3_imm_const_state *imm_state = &v->imm_state; + + if (size <= imm_state->size) { + return true; + } + + /* Immediates are uploaded in units of vec4 so make sure our buffer is large + * enough. + */ + size = ALIGN(size, 4); + + /* Pre-a7xx, the immediates that get lowered to const registers are + * emitted as part of the const state so the total size of immediates + * should be the same for the binning and non-binning variants. Make sure + * we don't increase the size beyond that of the non-binning variant. + */ + if (v->binning_pass && !v->compiler->load_shader_consts_via_preamble && + size > v->nonbinning->imm_state.size) { + return false; + } + + imm_state->values = + rerzalloc(v, imm_state->values, __typeof__(imm_state->values[0]), + imm_state->size, size); + imm_state->size = size; + + /* Note that ir3 printing relies on having groups of 4 dwords, so we fill the + * unused slots with a dummy value. + */ + for (int i = imm_state->count; i < imm_state->size; i++) { + imm_state->values[i] = 0xd0d0d0d0; + } + + return true; +} + static uint16_t const_imm_index_to_reg(const struct ir3_const_state *const_state, unsigned i) { @@ -35,9 +74,10 @@ uint16_t ir3_const_find_imm(struct ir3_shader_variant *v, uint32_t imm) { const struct ir3_const_state *const_state = ir3_const_state(v); + const struct ir3_imm_const_state *imm_state = &v->imm_state; - for (unsigned i = 0; i < const_state->immediates_count; i++) { - if (const_state->immediates[i] == imm) + for (unsigned i = 0; i < imm_state->count; i++) { + if (imm_state->values[i] == imm) return const_imm_index_to_reg(const_state, i); } @@ -47,36 +87,26 @@ ir3_const_find_imm(struct ir3_shader_variant *v, uint32_t imm) uint16_t ir3_const_add_imm(struct ir3_shader_variant *v, uint32_t imm) { - struct ir3_const_state *const_state = ir3_const_state_mut(v); + const struct ir3_const_state *const_state = ir3_const_state(v); + struct ir3_imm_const_state *imm_state = &v->imm_state; - /* Reallocate for 4 more elements whenever it's necessary. Note that ir3 - * printing relies on having groups of 4 dwords, so we fill the unused - * slots with a dummy value. - */ - if (const_state->immediates_count == const_state->immediates_size) { - const_state->immediates = rerzalloc( - const_state, const_state->immediates, - __typeof__(const_state->immediates[0]), const_state->immediates_size, - const_state->immediates_size + 4); - const_state->immediates_size += 4; - - for (int i = const_state->immediates_count; - i < const_state->immediates_size; i++) { - const_state->immediates[i] = 0xd0d0d0d0; + /* Reallocate for 4 more elements whenever it's necessary. */ + if (imm_state->count == imm_state->size) { + if (!ir3_const_ensure_imm_size(v, imm_state->size + 4)) { + return INVALID_CONST_REG; } } /* Add on a new immediate to be pushed, if we have space left in the * constbuf. */ - if (const_state->allocs.max_const_offset_vec4 + - const_state->immediates_count / 4 >= + if (const_state->allocs.max_const_offset_vec4 + imm_state->count / 4 >= ir3_max_const(v)) { return INVALID_CONST_REG; } - const_state->immediates[const_state->immediates_count] = imm; - return const_imm_index_to_reg(const_state, const_state->immediates_count++); + imm_state->values[imm_state->count] = imm; + return const_imm_index_to_reg(const_state, imm_state->count++); } int @@ -912,14 +942,15 @@ ir3_shader_disasm(struct ir3_shader_variant *so, uint32_t *bin, FILE *out) } const struct ir3_const_state *const_state = ir3_const_state(so); - for (i = 0; i < DIV_ROUND_UP(const_state->immediates_count, 4); i++) { + const struct ir3_imm_const_state *imm_state = &so->imm_state; + for (i = 0; i < DIV_ROUND_UP(imm_state->count, 4); i++) { fprintf(out, "@const(c%d.x)\t", const_state->allocs.max_const_offset_vec4 + i); fprintf(out, "0x%08x, 0x%08x, 0x%08x, 0x%08x\n", - const_state->immediates[i * 4 + 0], - const_state->immediates[i * 4 + 1], - const_state->immediates[i * 4 + 2], - const_state->immediates[i * 4 + 3]); + imm_state->values[i * 4 + 0], + imm_state->values[i * 4 + 1], + imm_state->values[i * 4 + 2], + imm_state->values[i * 4 + 3]); } ir3_isa_disasm(bin, so->info.sizedwords * 4, out, diff --git a/src/freedreno/ir3/ir3_shader.h b/src/freedreno/ir3/ir3_shader.h index 64ccc949185..f73985cbdb5 100644 --- a/src/freedreno/ir3/ir3_shader.h +++ b/src/freedreno/ir3/ir3_shader.h @@ -260,6 +260,12 @@ struct ir3_const_image_dims { uint32_t off[IR3_MAX_SHADER_IMAGES]; }; +struct ir3_imm_const_state { + unsigned size; + unsigned count; + uint32_t *values; +}; + /** * Describes the layout of shader consts in the const register file * and additional info about individual allocations. @@ -289,10 +295,6 @@ struct ir3_const_state { struct ir3_const_image_dims image_dims; - unsigned immediates_count; - unsigned immediates_size; - uint32_t *immediates; - /* State of ubo access lowered to push consts: */ struct ir3_ubo_analysis_state ubo_state; enum ir3_push_consts_type push_consts_type; @@ -674,6 +676,13 @@ struct ir3_shader_variant { struct ir3_const_state *const_state; + /* Immediate values that will be lowered to const registers. Before a7xx, + * this will be uploaded together with the const_state. From a7xx on (where + * load_shader_consts_via_preamble is true), this will be lowered to const + * stores in the preamble. + */ + struct ir3_imm_const_state imm_state; + /* * The following macros are used by the shader disk cache save/ * restore paths to serialize/deserialize the variant. Any @@ -1083,6 +1092,7 @@ ir3_max_const(const struct ir3_shader_variant *v) return _ir3_max_const(v, v->key.safe_constlen); } +bool ir3_const_ensure_imm_size(struct ir3_shader_variant *v, unsigned size); uint16_t ir3_const_find_imm(struct ir3_shader_variant *v, uint32_t imm); uint16_t ir3_const_add_imm(struct ir3_shader_variant *v, uint32_t imm); diff --git a/src/freedreno/vulkan/tu_shader.cc b/src/freedreno/vulkan/tu_shader.cc index 14aa5579b55..f54de781a6a 100644 --- a/src/freedreno/vulkan/tu_shader.cc +++ b/src/freedreno/vulkan/tu_shader.cc @@ -1157,7 +1157,8 @@ tu_xs_get_immediates_packet_size_dwords(const struct ir3_shader_variant *xs) { const struct ir3_const_state *const_state = ir3_const_state(xs); uint32_t base = const_state->allocs.max_const_offset_vec4; - int32_t size = DIV_ROUND_UP(const_state->immediates_count, 4); + const struct ir3_imm_const_state *imm_state = &xs->imm_state; + int32_t size = DIV_ROUND_UP(imm_state->count, 4); /* truncate size to avoid writing constants that shader * does not use: @@ -1365,6 +1366,7 @@ tu6_emit_xs(struct tu_cs *cs, const struct ir3_const_state *const_state = ir3_const_state(xs); uint32_t base = const_state->allocs.max_const_offset_vec4; + const struct ir3_imm_const_state *imm_state = &xs->imm_state; unsigned immediate_size = tu_xs_get_immediates_packet_size_dwords(xs); if (immediate_size > 0) { @@ -1378,7 +1380,7 @@ tu6_emit_xs(struct tu_cs *cs, tu_cs_emit(cs, CP_LOAD_STATE6_1_EXT_SRC_ADDR(0)); tu_cs_emit(cs, CP_LOAD_STATE6_2_EXT_SRC_ADDR_HI(0)); - tu_cs_emit_array(cs, const_state->immediates, immediate_size); + tu_cs_emit_array(cs, imm_state->values, immediate_size); } if (const_state->consts_ubo.idx != -1) { diff --git a/src/gallium/drivers/freedreno/ir3/ir3_const.h b/src/gallium/drivers/freedreno/ir3/ir3_const.h index d430a0133ab..21f91fc3952 100644 --- a/src/gallium/drivers/freedreno/ir3/ir3_const.h +++ b/src/gallium/drivers/freedreno/ir3/ir3_const.h @@ -300,7 +300,7 @@ ir3_emit_immediates(const struct ir3_shader_variant *v, { const struct ir3_const_state *const_state = ir3_const_state(v); uint32_t base = const_state->allocs.max_const_offset_vec4; - int size = DIV_ROUND_UP(const_state->immediates_count, 4); + int size = DIV_ROUND_UP(v->imm_state.count, 4); /* truncate size to avoid writing constants that shader * does not use: @@ -312,7 +312,7 @@ ir3_emit_immediates(const struct ir3_shader_variant *v, size *= 4; if (size > 0) - emit_const_user(ring, v, base, size, const_state->immediates); + emit_const_user(ring, v, base, size, v->imm_state.values); /* NIR constant data has the same lifetime as immediates, so upload it * now, too.