diff --git a/src/freedreno/ir3/ir3_compiler_nir.c b/src/freedreno/ir3/ir3_compiler_nir.c
index c2420e7a017..113758aa2f3 100644
--- a/src/freedreno/ir3/ir3_compiler_nir.c
+++ b/src/freedreno/ir3/ir3_compiler_nir.c
@@ -1263,7 +1263,7 @@ emit_intrinsic_load_ubo(struct ir3_context *ctx, nir_intrinsic_instr *intr,
    struct ir3_builder *b = &ctx->build;
    struct ir3_instruction *base_lo, *base_hi, *addr, *src0, *src1;
    const struct ir3_const_state *const_state = ir3_const_state(ctx->so);
-   unsigned ubo = regid(const_state->offsets.ubo, 0);
+   unsigned ubo = ir3_const_reg(const_state, IR3_CONST_ALLOC_UBO_PTRS, 0);
    const unsigned ptrsz = ir3_pointer_size(ctx->compiler);
 
    int off = 0;
@@ -1283,9 +1283,10 @@ emit_intrinsic_load_ubo(struct ir3_context *ctx, nir_intrinsic_instr *intr,
        * at least big enough to cover all the UBO addresses, since the
        * assembler won't know what the max address reg is.
        */
-      ctx->so->constlen =
-         MAX2(ctx->so->constlen,
-              const_state->offsets.ubo + (ctx->s->info.num_ubos * ptrsz));
+      ctx->so->constlen = MAX2(
+         ctx->so->constlen,
+         const_state->allocs.consts[IR3_CONST_ALLOC_UBO_PTRS].offset_vec4 +
+            (ctx->s->info.num_ubos * ptrsz));
    }
 
    /* note: on 32bit gpu's base_hi is ignored and DCE'd */
@@ -2686,15 +2687,8 @@ emit_intrinsic(struct ir3_context *ctx, nir_intrinsic_instr *intr)
          if (ctx->compiler->has_scalar_alu && !intr->def.divergent)
             dst[i]->dsts[0]->flags |= IR3_REG_SHARED;
       }
-      /* NOTE: if relative addressing is used, we set
-       * constlen in the compiler (to worst-case value)
-       * since we don't know in the assembler what the max
-       * addr reg value can be:
-       */
-      ctx->so->constlen =
-         MAX2(ctx->so->constlen,
-              ctx->so->shader_options.num_reserved_user_consts +
-              const_state->ubo_state.size / 16);
+
+      ctx->has_relative_load_const_ir3 = true;
    }
    break;
@@ -5942,6 +5936,27 @@ ir3_compile_shader_nir(struct ir3_compiler *compiler,
 
    ctx->so->per_samp = ctx->s->info.fs.uses_sample_shading;
 
+   if (ctx->has_relative_load_const_ir3) {
+      /* NOTE: if relative addressing is used, we set
+       * constlen in the compiler (to worst-case value)
+       * since we don't know in the assembler what the max
+       * addr reg value can be:
+       */
+      const struct ir3_const_state *const_state = ir3_const_state(ctx->so);
+      const enum ir3_const_alloc_type rel_const_srcs[] = {
+         IR3_CONST_ALLOC_INLINE_UNIFORM_ADDRS, IR3_CONST_ALLOC_UBO_RANGES,
+         IR3_CONST_ALLOC_PREAMBLE, IR3_CONST_ALLOC_GLOBAL};
+      for (int i = 0; i < ARRAY_SIZE(rel_const_srcs); i++) {
+         const struct ir3_const_allocation *const_alloc =
+            &const_state->allocs.consts[rel_const_srcs[i]];
+         if (const_alloc->size_vec4 > 0) {
+            ctx->so->constlen =
+               MAX2(ctx->so->constlen,
+                    const_alloc->offset_vec4 + const_alloc->size_vec4);
+         }
+      }
+   }
+
    if (ctx->so->type == MESA_SHADER_FRAGMENT &&
        compiler->fs_must_have_non_zero_constlen_quirk) {
       so->constlen = MAX2(so->constlen, 4);
diff --git a/src/freedreno/ir3/ir3_context.h b/src/freedreno/ir3/ir3_context.h
index c944bfdeb3c..bcd1c258970 100644
--- a/src/freedreno/ir3/ir3_context.h
+++ b/src/freedreno/ir3/ir3_context.h
@@ -141,6 +141,8 @@ struct ir3_context {
 
    unsigned prefetch_limit;
 
+   bool has_relative_load_const_ir3;
+
    /* set if we encounter something we can't handle yet, so we
    * can bail cleanly and fallback to TGSI compiler f/e
    */
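Reviewer note: a minimal standalone sketch of what the new `ir3_const_reg()` helper (added to ir3_shader.h further down in this diff) computes for the `emit_intrinsic_load_ubo()` call above. The `regid_sketch()` packing mirrors ir3's `regid()` as I understand it (vec4 register number in the upper bits, component in the low two bits); all offsets below are invented:

```c
#include <assert.h>

/* Assumed mirror of ir3's regid() packing; illustrative only. */
static unsigned
regid_sketch(unsigned num, unsigned comp)
{
   return (num << 2) | (comp & 0x3);
}

int
main(void)
{
   /* Say IR3_CONST_ALLOC_UBO_PTRS was placed at c12.x (offset_vec4 = 12)
    * and the caller wants dword 6 within the allocation:
    * dword 6 -> vec4 register 12 + 6/4 = 13, component 6 % 4 = 2 (.z).
    */
   unsigned offset_vec4 = 12;
   unsigned offset_dwords = 6;
   unsigned reg = regid_sketch(offset_vec4 + offset_dwords / 4,
                               offset_dwords % 4);
   assert(reg == ((13u << 2) | 2));
   return 0;
}
```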
diff --git a/src/freedreno/ir3/ir3_nir.c b/src/freedreno/ir3/ir3_nir.c
index 3c8fb8a1fc9..4b4b44a090c 100644
--- a/src/freedreno/ir3/ir3_nir.c
+++ b/src/freedreno/ir3/ir3_nir.c
@@ -1151,6 +1151,10 @@ ir3_nir_lower_variant(struct ir3_shader_variant *so,
    progress |= OPT(s, ir3_nir_lower_io_offsets);
 
+   if (!so->binning_pass) {
+      ir3_const_alloc_all_reserved_space(&ir3_const_state_mut(so)->allocs);
+   }
+
    if (progress)
       ir3_optimize_loop(so->compiler, options, s);
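Reviewer note: the call above finalizes any allocations that were only reserved; the helpers it relies on are added in the next hunk. As a review aid, here is a self-contained model of the intended reserve/allocate lifecycle (simplified names and invented sizes, not the real API):

```c
#include <assert.h>
#include <stdint.h>

#define ALIGN_UP(v, a) (((v) + (a) - 1) / (a) * (a))

struct alloc_slot {
   uint32_t offset_vec4, size_vec4;
   uint32_t reserved_size, reserved_align;
};

struct allocs {
   struct alloc_slot slots[8];
   uint32_t max_offset_vec4; /* bump pointer */
   uint32_t reserved_vec4;   /* pessimistic sum of live reservations */
};

static void
reserve(struct allocs *a, int t, uint32_t size, uint32_t al)
{
   a->slots[t].reserved_size = size;
   a->slots[t].reserved_align = al;
   a->reserved_vec4 += size + al - 1; /* assume worst-case padding */
}

static void
unreserve(struct allocs *a, int t)
{
   a->reserved_vec4 -= a->slots[t].reserved_size + a->slots[t].reserved_align - 1;
   a->slots[t].reserved_size = 0;
}

static void
alloc(struct allocs *a, int t, uint32_t size, uint32_t al)
{
   a->max_offset_vec4 = ALIGN_UP(a->max_offset_vec4, al);
   a->slots[t].offset_vec4 = a->max_offset_vec4;
   a->slots[t].size_vec4 = size;
   a->max_offset_vec4 += size;
}

int
main(void)
{
   struct allocs a = {0};

   /* Pass 1 doesn't know its final size yet: reserve 4 vec4 worst case. */
   reserve(&a, 0, 4, 1);
   /* Pass 2 allocates for real; free-space queries subtract
    * a.reserved_vec4 in the meantime. */
   alloc(&a, 1, 10, 1);
   /* Pass 1 learns the real size is 2 vec4: swap the reservation for an
    * exact allocation, placed after everything allocated so far. */
   unreserve(&a, 0);
   alloc(&a, 0, 2, 1);

   assert(a.slots[1].offset_vec4 == 0 && a.slots[0].offset_vec4 == 10);
   assert(a.max_offset_vec4 == 12 && a.reserved_vec4 == 0);
   return 0;
}
```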
@@ -1364,6 +1368,61 @@ ir3_align_constoff(struct ir3_const_state *const_state, unsigned constoff,
    return constoff;
 }
 
+void
+ir3_const_alloc(struct ir3_const_allocations *const_alloc,
+                enum ir3_const_alloc_type type, uint32_t size_vec4,
+                uint32_t align_vec4)
+{
+   struct ir3_const_allocation *alloc = &const_alloc->consts[type];
+   assert(alloc->size_vec4 == 0);
+
+   const_alloc->max_const_offset_vec4 =
+      align(const_alloc->max_const_offset_vec4, align_vec4);
+   alloc->size_vec4 = size_vec4;
+   alloc->offset_vec4 = const_alloc->max_const_offset_vec4;
+   const_alloc->max_const_offset_vec4 += size_vec4;
+}
+
+void
+ir3_const_reserve_space(struct ir3_const_allocations *const_alloc,
+                        enum ir3_const_alloc_type type, uint32_t size_vec4,
+                        uint32_t align_vec4)
+{
+   struct ir3_const_allocation *alloc = &const_alloc->consts[type];
+   assert(alloc->size_vec4 == 0 && alloc->reserved_size_vec4 == 0);
+
+   alloc->reserved_size_vec4 = size_vec4;
+   alloc->reserved_align_vec4 = align_vec4;
+   /* Be pessimistic here and assume the worst case alignment is needed */
+   const_alloc->reserved_vec4 += size_vec4 + align_vec4 - 1;
+}
+
+void
+ir3_const_free_reserved_space(struct ir3_const_allocations *const_alloc,
+                              enum ir3_const_alloc_type type)
+{
+   struct ir3_const_allocation *alloc = &const_alloc->consts[type];
+   assert(const_alloc->reserved_vec4 >= alloc->reserved_size_vec4);
+
+   const_alloc->reserved_vec4 -=
+      alloc->reserved_size_vec4 + alloc->reserved_align_vec4 - 1;
+   alloc->reserved_size_vec4 = 0;
+}
+
+void
+ir3_const_alloc_all_reserved_space(struct ir3_const_allocations *const_alloc)
+{
+   for (int i = 0; i < IR3_CONST_ALLOC_MAX; i++) {
+      if (const_alloc->consts[i].reserved_size_vec4 > 0) {
+         ir3_const_alloc(const_alloc, i,
+                         const_alloc->consts[i].reserved_size_vec4,
+                         const_alloc->consts[i].reserved_align_vec4);
+         const_alloc->consts[i].reserved_size_vec4 = 0;
+      }
+   }
+   const_alloc->reserved_vec4 = 0;
+}
+
 /* Sets up the variant-dependent constant state for the ir3_shader.  Note
  * that it is also used from ir3_nir_analyze_ubo_ranges() to figure out the
  * maximum number of driver params that would eventually be used, to leave
@@ -1374,6 +1433,7 @@ ir3_setup_const_state(nir_shader *nir, struct ir3_shader_variant *v,
                       struct ir3_const_state *const_state)
 {
    struct ir3_compiler *compiler = v->compiler;
+   unsigned ptrsz = ir3_pointer_size(compiler);
 
    memset(&const_state->offsets, ~0, sizeof(const_state->offsets));
    const_state->required_consts_aligment_vec4 = 1;
@@ -1388,16 +1448,8 @@ ir3_setup_const_state(nir_shader *nir, struct ir3_shader_variant *v,
    const_state->num_ubos = nir->info.num_ubos;
 
    assert((const_state->ubo_state.size % 16) == 0);
-   unsigned constoff = v->shader_options.num_reserved_user_consts +
-                       const_state->ubo_state.size / 16 +
-                       const_state->preamble_size +
-                       const_state->global_size;
-   unsigned ptrsz = ir3_pointer_size(compiler);
-
-   if (const_state->num_ubos > 0 && compiler->gen < 6) {
-      const_state->offsets.ubo = constoff;
-      constoff += align(const_state->num_ubos * ptrsz, 4) / 4;
-   }
+   unsigned constoff = const_state->allocs.max_const_offset_vec4;
 
    if (const_state->image_dims.count > 0) {
       unsigned cnt = const_state->image_dims.count;
@@ -1482,9 +1534,12 @@
 uint32_t
 ir3_const_state_get_free_space(const struct ir3_shader_variant *v,
-                               const struct ir3_const_state *const_state)
+                               const struct ir3_const_state *const_state,
+                               uint32_t align_vec4)
 {
-   uint32_t free_space_vec4 = ir3_max_const(v) - const_state->offsets.immediate;
+   uint32_t free_space_vec4 =
+      ir3_max_const(v) - align(const_state->offsets.immediate, align_vec4) -
+      const_state->allocs.reserved_vec4;
    free_space_vec4 =
       (free_space_vec4 / const_state->required_consts_aligment_vec4) *
       const_state->required_consts_aligment_vec4;
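Reviewer note: to illustrate the new `align_vec4` parameter of `ir3_const_state_get_free_space()`, a worked example with invented numbers (512 vec4 of const space, immediates at c100, a 4-vec4 upload unit, 5 vec4 still reserved):

```c
#include <assert.h>
#include <stdint.h>

static uint32_t
align_up(uint32_t v, uint32_t a)
{
   return (v + a - 1) / a * a;
}

int
main(void)
{
   uint32_t max_const_vec4 = 512; /* invented stand-in for ir3_max_const(v) */
   uint32_t immediate_off = 100;  /* const_state->offsets.immediate */
   uint32_t align_vec4 = 4;       /* e.g. compiler->const_upload_unit */
   uint32_t reserved_vec4 = 5;    /* allocs.reserved_vec4 */

   /* 512 - align(100, 4) - 5 = 407 vec4 available to the caller; the real
    * function then also rounds down to required_consts_aligment_vec4,
    * omitted here. */
   uint32_t free_vec4 =
      max_const_vec4 - align_up(immediate_off, align_vec4) - reserved_vec4;
   assert(free_vec4 == 407);
   return 0;
}
```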
diff --git a/src/freedreno/ir3/ir3_nir.h b/src/freedreno/ir3/ir3_nir.h
index 417d3039ddc..cf9b1e4546c 100644
--- a/src/freedreno/ir3/ir3_nir.h
+++ b/src/freedreno/ir3/ir3_nir.h
@@ -77,7 +77,18 @@ void ir3_nir_lower_variant(struct ir3_shader_variant *so,
 void ir3_setup_const_state(nir_shader *nir, struct ir3_shader_variant *v,
                            struct ir3_const_state *const_state);
 uint32_t ir3_const_state_get_free_space(const struct ir3_shader_variant *v,
-                                        const struct ir3_const_state *const_state);
+                                        const struct ir3_const_state *const_state,
+                                        uint32_t align_vec4);
+void ir3_const_alloc(struct ir3_const_allocations *const_alloc,
+                     enum ir3_const_alloc_type type, uint32_t size_vec4,
+                     uint32_t align_vec4);
+void ir3_const_reserve_space(struct ir3_const_allocations *const_alloc,
+                             enum ir3_const_alloc_type type,
+                             uint32_t size_vec4, uint32_t align_vec4);
+void ir3_const_free_reserved_space(struct ir3_const_allocations *const_alloc,
+                                   enum ir3_const_alloc_type type);
+void ir3_const_alloc_all_reserved_space(struct ir3_const_allocations *const_alloc);
+
 bool ir3_nir_lower_load_constant(nir_shader *nir, struct ir3_shader_variant *v);
 void ir3_nir_analyze_ubo_ranges(nir_shader *nir, struct ir3_shader_variant *v);
 bool ir3_nir_lower_ubo_loads(nir_shader *nir, struct ir3_shader_variant *v);
diff --git a/src/freedreno/ir3/ir3_nir_analyze_ubo_ranges.c b/src/freedreno/ir3/ir3_nir_analyze_ubo_ranges.c
index eaa6e7db649..f1f3ffca6bd 100644
--- a/src/freedreno/ir3/ir3_nir_analyze_ubo_ranges.c
+++ b/src/freedreno/ir3/ir3_nir_analyze_ubo_ranges.c
@@ -553,15 +553,18 @@ ir3_nir_lower_const_global_loads(nir_shader *nir, struct ir3_shader_variant *v)
       return false;
 
    unsigned max_upload;
+   uint32_t global_offset = 0;
    if (v->binning_pass) {
-      max_upload = const_state->global_size * 16;
-   } else {
-      struct ir3_const_state worst_case_const_state = {
-         .preamble_size = const_state->preamble_size,
-      };
-      ir3_setup_const_state(nir, v, &worst_case_const_state);
       max_upload =
-         ir3_const_state_get_free_space(v, &worst_case_const_state) * 16;
+         const_state->allocs.consts[IR3_CONST_ALLOC_GLOBAL].size_vec4 * 16;
+      global_offset =
+         const_state->allocs.consts[IR3_CONST_ALLOC_GLOBAL].offset_vec4 * 16;
+   } else {
+      struct ir3_const_state *const_state = ir3_const_state_mut(v);
+      ir3_setup_const_state(nir, v, const_state);
+      global_offset = const_state->allocs.max_const_offset_vec4 * 16;
+      max_upload =
+         ir3_const_state_get_free_space(v, const_state, 1) * 16;
    }
 
    struct ir3_ubo_analysis_state state = {};
@@ -581,7 +584,6 @@ ir3_nir_lower_const_global_loads(nir_shader *nir, struct ir3_shader_variant *v)
       }
    }
 
-   uint32_t global_offset = v->shader_options.num_reserved_user_consts * 16;
    assign_offsets(&state, global_offset, max_upload);
 
    bool progress = copy_global_to_uniform(nir, &state);
@@ -612,8 +614,10 @@ ir3_nir_lower_const_global_loads(nir_shader *nir, struct ir3_shader_variant *v)
       }
    }
 
-   if (!v->binning_pass)
-      ir3_const_state_mut(v)->global_size = DIV_ROUND_UP(state.size, 16);
+   if (!v->binning_pass) {
+      ir3_const_alloc(&ir3_const_state_mut(v)->allocs, IR3_CONST_ALLOC_GLOBAL,
+                      DIV_ROUND_UP(state.size, 16), 1);
+   }
 
    return progress;
 }
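Reviewer note: a recurring source of confusion in these passes is the mix of units. The summary below is my reading of the code, not text from the patch:

```c
/* Unit conventions (reviewer notes; dword = 32 bits):
 *   1 vec4 const slot = 4 dwords = 16 bytes
 *   *_vec4 fields (allocator bookkeeping)     : vec4 slots
 *   state.size, max_upload, global_offset     : bytes  (hence the "* 16")
 *   DIV_ROUND_UP(state.size, 16)              : bytes -> vec4 slots, rounded up
 *   load/store_preamble bases (later hunks)   : dwords (hence the "* 4")
 */
```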
@@ -625,19 +629,26 @@ ir3_nir_analyze_ubo_ranges(nir_shader *nir, struct ir3_shader_variant *v)
    struct ir3_ubo_analysis_state *state = &const_state->ubo_state;
    struct ir3_compiler *compiler = v->compiler;
 
+   if (compiler->gen < 6 && const_state->num_ubos > 0) {
+      uint32_t ptrs_vec4 =
+         align(const_state->num_ubos * ir3_pointer_size(compiler), 4) / 4;
+      ir3_const_reserve_space(&const_state->allocs, IR3_CONST_ALLOC_UBO_PTRS,
+                              ptrs_vec4, 1);
+   }
+
+   uint32_t align_vec4 = compiler->load_shader_consts_via_preamble
+                            ? 1
+                            : compiler->const_upload_unit;
+
    /* Limit our uploads to the amount of constant buffer space available in
    * the hardware, minus what the shader compiler may need for various
    * driver params. We do this UBO-to-push-constant before the real
    * allocation of the driver params' const space, because UBO pointers can
    * be driver params but this pass usually eliminates them.
    */
-   struct ir3_const_state worst_case_const_state = {
-      .preamble_size = const_state->preamble_size,
-      .global_size = const_state->global_size,
-   };
-   ir3_setup_const_state(nir, v, &worst_case_const_state);
+   ir3_setup_const_state(nir, v, const_state);
    const uint32_t max_upload =
-      ir3_const_state_get_free_space(v, &worst_case_const_state) * 16;
+      ir3_const_state_get_free_space(v, const_state, align_vec4) * 16;
 
    memset(state, 0, sizeof(*state));
@@ -660,9 +671,13 @@ ir3_nir_analyze_ubo_ranges(nir_shader *nir, struct ir3_shader_variant *v)
       }
    }
 
-   uint32_t ubo_offset = v->shader_options.num_reserved_user_consts * 16 +
-                         const_state->global_size * 16;
+   uint32_t ubo_offset = align(const_state->allocs.max_const_offset_vec4, align_vec4) * 16;
    assign_offsets(state, ubo_offset, max_upload);
+
+   uint32_t upload_vec4 = state->size / 16;
+   if (upload_vec4 > 0)
+      ir3_const_alloc(&ir3_const_state_mut(v)->allocs,
+                      IR3_CONST_ALLOC_UBO_RANGES, upload_vec4, align_vec4);
 }
 
 bool
@@ -709,6 +724,22 @@ ir3_nir_lower_ubo_loads(nir_shader *nir, struct ir3_shader_variant *v)
    if (nir->info.first_ubo_is_default_ubo && !push_ubos && !has_preamble)
       nir->info.num_ubos = num_ubos;
 
+   if (!v->binning_pass) {
+      ir3_const_state_mut(v)->num_ubos = num_ubos;
+
+      if (compiler->gen < 6)
+         ir3_const_free_reserved_space(&ir3_const_state_mut(v)->allocs,
+                                       IR3_CONST_ALLOC_UBO_PTRS);
+
+      if (compiler->gen < 6 && const_state->num_ubos > 0) {
+         uint32_t upload_ptrs_vec4 =
+            align(const_state->num_ubos * ir3_pointer_size(compiler), 4) / 4;
+         ir3_const_alloc(&ir3_const_state_mut(v)->allocs,
+                         IR3_CONST_ALLOC_UBO_PTRS, upload_ptrs_vec4, 1);
+      }
+   }
+
    if (compiler->has_preamble && push_ubos)
       progress |= copy_ubo_to_uniform(
          nir, const_state, !compiler->load_shader_consts_via_preamble);
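Reviewer note: the pre-a6xx UBO-pointer handling above is a three-step dance spread across passes. A condensed sketch of the intended call order, using only the helpers this patch adds (control flow heavily simplified):

```c
/* Sketch of the ordering (not compilable against real ir3 state; function
 * names come from this patch, everything else is simplified):
 *
 * ir3_nir_analyze_ubo_ranges():
 *    ir3_const_reserve_space(allocs, IR3_CONST_ALLOC_UBO_PTRS, worst, 1);
 *    ... free-space queries now subtract the pessimistic reservation ...
 *    ir3_const_alloc(allocs, IR3_CONST_ALLOC_UBO_RANGES, upload_vec4,
 *                    align_vec4);
 *
 * ir3_nir_lower_ubo_loads():           num_ubos is now final
 *    ir3_const_free_reserved_space(allocs, IR3_CONST_ALLOC_UBO_PTRS);
 *    ir3_const_alloc(allocs, IR3_CONST_ALLOC_UBO_PTRS, exact_vec4, 1);
 *
 * ir3_nir_lower_variant():             end of variant lowering
 *    ir3_const_alloc_all_reserved_space(allocs);   finalize stragglers
 */
```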
diff --git a/src/freedreno/ir3/ir3_nir_opt_preamble.c b/src/freedreno/ir3/ir3_nir_opt_preamble.c
index f04169c4ade..1d42dd9fee1 100644
--- a/src/freedreno/ir3/ir3_nir_opt_preamble.c
+++ b/src/freedreno/ir3/ir3_nir_opt_preamble.c
@@ -284,11 +284,13 @@ ir3_nir_opt_preamble(nir_shader *nir, struct ir3_shader_variant *v)
    unsigned max_size;
    if (v->binning_pass) {
       const struct ir3_const_state *const_state = ir3_const_state(v);
-      max_size = const_state->preamble_size * 4;
+      max_size =
+         const_state->allocs.consts[IR3_CONST_ALLOC_PREAMBLE].size_vec4 * 4;
    } else {
-      struct ir3_const_state worst_case_const_state = {};
-      ir3_setup_const_state(nir, v, &worst_case_const_state);
-      max_size = ir3_const_state_get_free_space(v, &worst_case_const_state) * 4;
+      struct ir3_const_state *const_state = ir3_const_state_mut(v);
+      ir3_setup_const_state(nir, v, const_state);
+      max_size = ir3_const_state_get_free_space(
+                    v, const_state, v->compiler->const_upload_unit) * 4;
    }
 
    if (max_size == 0)
@@ -312,8 +314,10 @@
    progress |= nir_opt_preamble(nir, &options, &size);
 
    if (!v->binning_pass) {
-      struct ir3_const_state *const_state = ir3_const_state_mut(v);
-      const_state->preamble_size = DIV_ROUND_UP(size, 4);
+      uint32_t preamble_size_vec4 =
+         align(DIV_ROUND_UP(size, 4), v->compiler->const_upload_unit);
+      ir3_const_alloc(&ir3_const_state_mut(v)->allocs, IR3_CONST_ALLOC_PREAMBLE,
+                      preamble_size_vec4, v->compiler->const_upload_unit);
    }
 
    return progress;
@@ -605,8 +609,9 @@ ir3_nir_opt_prefetch_descriptors(nir_shader *nir, struct ir3_shader_variant *v)
    bool progress = false;
    struct prefetch_state state = {};
 
-   nir_def **preamble_defs = calloc(const_state->preamble_size * 4,
-                                    sizeof(nir_def *));
+   nir_def **preamble_defs =
+      calloc(const_state->allocs.consts[IR3_CONST_ALLOC_PREAMBLE].size_vec4 * 4,
+             sizeof(nir_def *));
 
    /* Collect preamble defs. This is useful if the computation of the offset has
    * already been hoisted to the preamble.
@@ -622,7 +627,9 @@ ir3_nir_opt_prefetch_descriptors(nir_shader *nir, struct ir3_shader_variant *v)
          if (intrin->intrinsic != nir_intrinsic_store_preamble)
            continue;
 
-         assert(nir_intrinsic_base(intrin) < const_state->preamble_size * 4);
+         assert(
+            nir_intrinsic_base(intrin) <
+            const_state->allocs.consts[IR3_CONST_ALLOC_PREAMBLE].size_vec4 * 4);
          preamble_defs[nir_intrinsic_base(intrin)] = intrin->src[0].ssa;
       }
    }
@@ -719,9 +726,10 @@ ir3_nir_lower_preamble(nir_shader *nir, struct ir3_shader_variant *v)
    /* First, lower load/store_preamble. */
    const struct ir3_const_state *const_state = ir3_const_state(v);
-   unsigned preamble_base = v->shader_options.num_reserved_user_consts * 4 +
-      const_state->ubo_state.size / 4 + const_state->global_size * 4;
-   unsigned preamble_size = const_state->preamble_size * 4;
+   unsigned preamble_base =
+      const_state->allocs.consts[IR3_CONST_ALLOC_PREAMBLE].offset_vec4 * 4;
+   unsigned preamble_size =
+      const_state->allocs.consts[IR3_CONST_ALLOC_PREAMBLE].size_vec4 * 4;
 
    BITSET_DECLARE(promoted_to_float, preamble_size);
    memset(promoted_to_float, 0, sizeof(promoted_to_float));
diff --git a/src/freedreno/ir3/ir3_shader.c b/src/freedreno/ir3/ir3_shader.c
index ef00c897843..5c3a2b776c6 100644
--- a/src/freedreno/ir3/ir3_shader.c
+++ b/src/freedreno/ir3/ir3_shader.c
@@ -340,6 +340,7 @@ alloc_variant(struct ir3_shader *shader, const struct ir3_shader_key *key,
    if (!v->binning_pass) {
       v->const_state = rzalloc_size(v, sizeof(*v->const_state));
+      v->const_state->allocs = shader->options.const_allocs;
       v->const_state->push_consts_type = shader->options.push_consts_type;
       v->const_state->consts_ubo.idx = -1;
       v->const_state->driver_params_ubo.idx = -1;
@@ -754,6 +755,31 @@ output_name(struct ir3_shader_variant *so, int i)
    }
 }
 
+static const char *
+ir3_const_alloc_type_to_string(enum ir3_const_alloc_type type)
+{
+   switch (type) {
+   case IR3_CONST_ALLOC_PUSH_CONSTS:
+      return "push_consts";
+   case IR3_CONST_ALLOC_DYN_DESCRIPTOR_OFFSET:
+      return "dyn_descriptor_offset";
+   case IR3_CONST_ALLOC_INLINE_UNIFORM_ADDRS:
+      return "inline_uniform_addresses";
+   case IR3_CONST_ALLOC_DRIVER_PARAMS:
+      return "driver_params";
+   case IR3_CONST_ALLOC_UBO_RANGES:
+      return "ubo_ranges";
+   case IR3_CONST_ALLOC_PREAMBLE:
+      return "preamble";
+   case IR3_CONST_ALLOC_GLOBAL:
+      return "global";
+   case IR3_CONST_ALLOC_UBO_PTRS:
+      return "ubo_ptrs";
+   default:
+      return "unknown";
+   }
+}
+
 static void
 dump_const_state(struct ir3_shader_variant *so, FILE *out)
 {
@@ -763,8 +789,16 @@ dump_const_state(struct ir3_shader_variant *so, FILE *out)
    fprintf(out, "; num_ubos: %u\n", cs->num_ubos);
    fprintf(out, "; num_driver_params: %u\n", cs->num_driver_params);
    fprintf(out, "; offsets:\n");
-   if (cs->offsets.ubo != ~0)
-      fprintf(out, "; ubo: c%u.x\n", cs->offsets.ubo);
+
+   for (uint32_t i = 0; i < IR3_CONST_ALLOC_MAX; i++) {
+      if (cs->allocs.consts[i].size_vec4) {
+         fprintf(out, "; %-26s c%u.x (%u vec4)\n",
+                 ir3_const_alloc_type_to_string(i),
+                 cs->allocs.consts[i].offset_vec4,
+                 cs->allocs.consts[i].size_vec4);
+      }
+   }
+
    if (cs->offsets.image_dims != ~0)
       fprintf(out, "; image_dims: c%u.x\n", cs->offsets.image_dims);
    if (cs->offsets.kernel_params != ~0)
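Reviewer note: with the `dump_const_state()` change, each live allocation is printed with its offset and size instead of only the UBO-pointer offset. A hypothetical example of the new output (all values invented; column width comes from the `%-26s` format):

```
; num_ubos: 2
; num_driver_params: 8
; offsets:
; ubo_ranges                 c0.x (8 vec4)
; preamble                   c8.x (4 vec4)
; ubo_ptrs                   c12.x (1 vec4)
; image_dims: c13.x
```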
diff --git a/src/freedreno/ir3/ir3_shader.h b/src/freedreno/ir3/ir3_shader.h
index 1b333675152..038e6abbfe9 100644
--- a/src/freedreno/ir3/ir3_shader.h
+++ b/src/freedreno/ir3/ir3_shader.h
@@ -185,6 +185,53 @@ struct ir3_driver_ubo {
    uint32_t size;
 };
 
+enum ir3_const_alloc_type {
+   /* Vulkan, push consts. */
+   IR3_CONST_ALLOC_PUSH_CONSTS = 0,
+   /* Vulkan, offsets required to calculate offsets of descriptors with dynamic
+    * offsets.
+    */
+   IR3_CONST_ALLOC_DYN_DESCRIPTOR_OFFSET = 1,
+   /* Vulkan, addresses of inline uniform buffers, to which we fall back when
+    * their size is unknown.
+    */
+   IR3_CONST_ALLOC_INLINE_UNIFORM_ADDRS = 2,
+   /* Common, stage-specific params uploaded by the driver/HW. */
+   IR3_CONST_ALLOC_DRIVER_PARAMS = 3,
+   /* Common, UBOs lowered to consts. */
+   IR3_CONST_ALLOC_UBO_RANGES = 4,
+   /* Common, consts produced by a preamble to be used in a main shader. */
+   IR3_CONST_ALLOC_PREAMBLE = 5,
+   /* Vulkan, inline uniforms loaded into consts in the preamble. */
+   IR3_CONST_ALLOC_GLOBAL = 6,
+   /* OpenGL, pre-a6xx; pointers to UBOs. */
+   IR3_CONST_ALLOC_UBO_PTRS = 7,
+   IR3_CONST_ALLOC_MAX = 8,
+};
+
+struct ir3_const_allocation {
+   uint32_t offset_vec4;
+   uint32_t size_vec4;
+
+   uint32_t reserved_size_vec4;
+   uint32_t reserved_align_vec4;
+};
+
+struct ir3_const_allocations {
+   struct ir3_const_allocation consts[IR3_CONST_ALLOC_MAX];
+   uint32_t max_const_offset_vec4;
+   uint32_t reserved_vec4;
+};
+
+static inline bool
+ir3_const_can_upload(const struct ir3_const_allocations *const_alloc,
+                     enum ir3_const_alloc_type type,
+                     uint32_t shader_const_size_vec4)
+{
+   return const_alloc->consts[type].size_vec4 > 0 &&
+          const_alloc->consts[type].offset_vec4 < shader_const_size_vec4;
+}
+
 /**
  * Describes the layout of shader consts in the const register file.
  *
@@ -192,8 +239,7 @@
  * that pointer size (ubo, etc) changes depending on generation.
  *
  * + user consts: only used for turnip push consts
- * + lowered UBO ranges
- * + preamble consts
+ * + Optional consts: ubo ranges, preamble, global, etc.
  * + UBO addresses: turnip is bindless and these are wasted
  * + image dimensions: a5xx only; needed to calculate pixel offset, but only
  *   for images that have image_{load,store,size,atomic*} intrinsics
@@ -228,11 +274,8 @@ struct ir3_const_state {
    */
    uint32_t required_consts_aligment_vec4;
 
-   int32_t constant_data_dynamic_offsets;
-
    struct {
-      /* user const start at zero */
-      unsigned ubo;
+      /* Required consts, cannot negotiate their size */
       unsigned image_dims;
       unsigned kernel_params;
      unsigned driver_param;
@@ -242,6 +285,8 @@ struct ir3_const_state {
       unsigned immediate;
    } offsets;
 
+   struct ir3_const_allocations allocs;
+
    struct {
       uint32_t mask;  /* bitmask of images that have image_store */
       uint32_t count; /* number of consts allocated */
@@ -257,9 +302,6 @@ struct ir3_const_state {
    unsigned immediates_size;
    uint32_t *immediates;
 
-   unsigned preamble_size;
-   unsigned global_size;
-
    /* State of ubo access lowered to push consts: */
    struct ir3_ubo_analysis_state ubo_state;
    enum ir3_push_consts_type push_consts_type;
@@ -555,7 +597,6 @@ struct ir3_shader_nir_options {
 };
 
 struct ir3_shader_options {
-   unsigned num_reserved_user_consts;
    /* What API-visible wavesizes are allowed. Even if only double wavesize is
    * allowed, we may still use the smaller wavesize "under the hood" and the
    * application simply sees the upper half as always disabled.
@@ -570,6 +611,9 @@ struct ir3_shader_options {
    uint32_t push_consts_base;
    uint32_t push_consts_dwords;
 
+   /* Some const allocations are required at API level. */
+   struct ir3_const_allocations const_allocs;
+
    struct ir3_shader_nir_options nir_options;
 };
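Reviewer note: the enum order does not by itself dictate placement; offsets fall out of the order in which passes call `ir3_const_alloc()`. A purely illustrative layout for a turnip shader (all sizes invented):

```
offset   allocation                    allocated by
c0.x     push_consts      (4 vec4)     tu_lower_io
c4.x     dyn_descriptor_offset (1)     tu_lower_io
c5.x     ubo_ranges       (8 vec4)     ir3_nir_analyze_ubo_ranges
c13.x    preamble         (4 vec4)     ir3_nir_opt_preamble
```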
@@ -1042,6 +1086,16 @@ ir3_max_const(const struct ir3_shader_variant *v)
 uint16_t ir3_const_find_imm(struct ir3_shader_variant *v, uint32_t imm);
 uint16_t ir3_const_add_imm(struct ir3_shader_variant *v, uint32_t imm);
 
+static inline unsigned
+ir3_const_reg(const struct ir3_const_state *const_state,
+              enum ir3_const_alloc_type type,
+              unsigned offset)
+{
+   unsigned n = const_state->allocs.consts[type].offset_vec4;
+   assert(const_state->allocs.consts[type].size_vec4 != 0);
+   return regid(n + offset / 4, offset % 4);
+}
+
 /* Return true if a variant may need to be recompiled due to exceeding the
  * maximum "safe" constlen.
  */
diff --git a/src/freedreno/vulkan/tu_clear_blit.cc b/src/freedreno/vulkan/tu_clear_blit.cc
index 59a7df8823f..74876dc763a 100644
--- a/src/freedreno/vulkan/tu_clear_blit.cc
+++ b/src/freedreno/vulkan/tu_clear_blit.cc
@@ -799,10 +799,14 @@ compile_shader(struct tu_device *dev, struct nir_shader *nir,
    nir_assign_io_var_locations(nir, nir_var_shader_in, &nir->num_inputs, nir->info.stage);
    nir_assign_io_var_locations(nir, nir_var_shader_out, &nir->num_outputs, nir->info.stage);
 
+   struct ir3_const_allocations const_allocs = {};
+   if (consts > 0)
+      ir3_const_alloc(&const_allocs, IR3_CONST_ALLOC_UBO_RANGES, align(consts, 8), 1);
+
    const struct ir3_shader_options options = {
-      .num_reserved_user_consts = align(consts, 8),
       .api_wavesize = IR3_SINGLE_OR_DOUBLE,
       .real_wavesize = IR3_SINGLE_OR_DOUBLE,
+      .const_allocs = const_allocs,
    };
 
    ir3_finalize_nir(dev->compiler, &options.nir_options, nir);
diff --git a/src/freedreno/vulkan/tu_shader.cc b/src/freedreno/vulkan/tu_shader.cc
index cff60ff1e04..b2549250b0f 100644
--- a/src/freedreno/vulkan/tu_shader.cc
+++ b/src/freedreno/vulkan/tu_shader.cc
@@ -826,7 +826,7 @@ tu_lower_io(nir_shader *shader, struct tu_device *dev,
             const struct tu_pipeline_layout *layout,
             uint32_t read_only_input_attachments,
             bool dynamic_renderpass,
-            unsigned *reserved_consts_vec4_out)
+            struct ir3_const_allocations *const_allocs)
 {
    tu_shader->const_state.push_consts = (struct tu_push_constant_range) {
       .lo = 0,
@@ -848,10 +848,13 @@ tu_lower_io(nir_shader *shader, struct tu_device *dev,
    }
 
    struct tu_const_state *const_state = &tu_shader->const_state;
-   unsigned reserved_consts_vec4 =
+   unsigned push_consts_vec4 =
       align(DIV_ROUND_UP(const_state->push_consts.dwords, 4),
             dev->compiler->const_upload_unit);
 
+   ir3_const_alloc(const_allocs, IR3_CONST_ALLOC_PUSH_CONSTS,
+                   push_consts_vec4, 1);
+
    bool unknown_dynamic_size = false;
    bool unknown_dynamic_offset = false;
    for (unsigned i = 0; i < layout->num_sets; i++) {
@@ -867,9 +870,12 @@ tu_lower_io(nir_shader *shader, struct tu_device *dev,
    }
 
    if (unknown_dynamic_offset) {
-      const_state->dynamic_offset_loc = reserved_consts_vec4 * 4;
+      const_state->dynamic_offset_loc =
+         const_allocs->max_const_offset_vec4 * 4;
       assert(dev->physical_device->reserved_set_idx >= 0);
-      reserved_consts_vec4 += DIV_ROUND_UP(dev->physical_device->reserved_set_idx, 4);
+      ir3_const_alloc(
+         const_allocs, IR3_CONST_ALLOC_DYN_DESCRIPTOR_OFFSET,
+         DIV_ROUND_UP(dev->physical_device->reserved_set_idx, 4), 1);
    } else {
       const_state->dynamic_offset_loc = UINT32_MAX;
    }
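Reviewer note: the push-constant sizing above converts dwords to vec4 slots and pads to the upload unit before allocating. A tiny worked example with invented numbers (52 dwords of push constants, a 4-vec4 upload unit):

```c
#include <assert.h>

#define DIV_ROUND_UP(n, d) (((n) + (d) - 1) / (d))
#define ALIGN_UP(v, a)     (DIV_ROUND_UP(v, a) * (a))

int
main(void)
{
   unsigned push_consts_dwords = 52; /* invented */
   unsigned const_upload_unit = 4;   /* invented vec4 upload granularity */

   /* 52 dwords -> 13 vec4, padded up to 16 vec4 for the upload unit. */
   unsigned push_consts_vec4 =
      ALIGN_UP(DIV_ROUND_UP(push_consts_dwords, 4), const_upload_unit);
   assert(push_consts_vec4 == 16);
   return 0;
}
```

The subsequent `ir3_const_alloc()` call passes an alignment of 1, presumably because the size has already been padded here.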
@@ -877,6 +883,7 @@ tu_lower_io(nir_shader *shader, struct tu_device *dev,
    /* Reserve space for inline uniforms, so we can always load them from
     * constants and not setup a UBO descriptor for them.
     */
+   size_t ldgk_consts = 0;
    bool use_ldg_k =
       dev->physical_device->info->a7xx.load_inline_uniforms_via_preamble_ldgk;
    for (unsigned set = 0; set < layout->num_sets; set++) {
@@ -918,20 +925,23 @@ tu_lower_io(nir_shader *shader, struct tu_device *dev,
          assert(const_state->num_inline_ubos < ARRAY_SIZE(const_state->ubos));
          unsigned size_vec4 = push_address ? 1 : DIV_ROUND_UP(binding->size, 16);
-         const_state->ubos[const_state->num_inline_ubos++] = (struct tu_inline_ubo) {
-            .base = set,
-            .offset = binding->offset,
-            .push_address = push_address,
-            .const_offset_vec4 = reserved_consts_vec4,
-            .size_vec4 = size_vec4,
-         };
+         const_state->ubos[const_state->num_inline_ubos++] =
+            (struct tu_inline_ubo) {
+               .base = set,
+               .offset = binding->offset,
+               .push_address = push_address,
+               .const_offset_vec4 =
+                  const_allocs->max_const_offset_vec4 + ldgk_consts,
+               .size_vec4 = size_vec4,
+            };
 
-         if (!use_ldg_k)
-            reserved_consts_vec4 += align(size_vec4, dev->compiler->const_upload_unit);
+         if (!use_ldg_k) {
+            ldgk_consts += align(size_vec4, dev->compiler->const_upload_unit);
+         }
       }
    }
 
-   *reserved_consts_vec4_out = reserved_consts_vec4;
+   ir3_const_alloc(const_allocs, IR3_CONST_ALLOC_INLINE_UNIFORM_ADDRS, ldgk_consts, 1);
 
    struct lower_instr_params params = {
       .dev = dev,
@@ -2527,10 +2537,10 @@ tu_shader_create(struct tu_device *dev,
       NIR_PASS_V(nir, nir_lower_mem_access_bit_sizes, &options);
    }
 
-   unsigned reserved_consts_vec4 = 0;
+   struct ir3_const_allocations const_allocs = {};
    NIR_PASS_V(nir, tu_lower_io, dev, shader, layout,
              key->read_only_input_attachments, key->dynamic_renderpass,
-              &reserved_consts_vec4);
+              &const_allocs);
 
    nir_shader_gather_info(nir, nir_shader_get_entrypoint(nir));
 
@@ -2540,12 +2550,12 @@ tu_shader_create(struct tu_device *dev,
    ir3_finalize_nir(dev->compiler, &nir_options, nir);
 
    const struct ir3_shader_options options = {
-      .num_reserved_user_consts = reserved_consts_vec4,
       .api_wavesize = key->api_wavesize,
       .real_wavesize = key->real_wavesize,
       .push_consts_type = shader->const_state.push_consts.type,
       .push_consts_base = shader->const_state.push_consts.lo,
       .push_consts_dwords = shader->const_state.push_consts.dwords,
+      .const_allocs = const_allocs,
       .nir_options = nir_options,
    };
diff --git a/src/gallium/drivers/freedreno/ir3/ir3_const.h b/src/gallium/drivers/freedreno/ir3/ir3_const.h
index f7d10fe2821..9a5d820aea7 100644
--- a/src/gallium/drivers/freedreno/ir3/ir3_const.h
+++ b/src/gallium/drivers/freedreno/ir3/ir3_const.h
@@ -188,7 +188,8 @@ ir3_emit_ubos(struct fd_context *ctx, const struct ir3_shader_variant *v,
               struct fd_ringbuffer *ring, struct fd_constbuf_stateobj *constbuf)
 {
    const struct ir3_const_state *const_state = ir3_const_state(v);
-   uint32_t offset = const_state->offsets.ubo;
+   uint32_t offset =
+      const_state->allocs.consts[IR3_CONST_ALLOC_UBO_PTRS].offset_vec4;
 
    /* a6xx+ uses UBO state and ldc instead of pointers emitted in
     * const state and ldg:
@@ -196,7 +197,8 @@ ir3_emit_ubos(struct fd_context *ctx, const struct ir3_shader_variant *v,
    if (ctx->screen->gen >= 6)
       return;
 
-   if (v->constlen > offset) {
+   if (ir3_const_can_upload(&const_state->allocs, IR3_CONST_ALLOC_UBO_PTRS,
+                            v->constlen)) {
       uint32_t params = const_state->num_ubos;
       uint32_t offsets[params];
       struct fd_bo *bos[params];