diff --git a/src/compiler/nir/nir_intrinsics.py b/src/compiler/nir/nir_intrinsics.py
index 814a78919c3..2492aa6d9aa 100644
--- a/src/compiler/nir/nir_intrinsics.py
+++ b/src/compiler/nir/nir_intrinsics.py
@@ -1354,6 +1354,11 @@ store("uniform_ir3", [], indices=[BASE])
 # vec4's.
 intrinsic("copy_ubo_to_uniform_ir3", [1, 1], indices=[BASE, RANGE])
 
+# IR3-specific intrinsic for ldg.k.
+# base is an offset to apply to the address in bytes, range_base is the
+# const file base in components, range is the amount to copy in vec4's.
+intrinsic("copy_global_to_uniform_ir3", [2], indices=[BASE, RANGE_BASE, RANGE])
+
 # IR3-specific intrinsic for stsc. Loads from push consts to constant file
 # Should be used in the shader preamble.
 intrinsic("copy_push_const_to_uniform_ir3", [1], indices=[BASE, RANGE])
diff --git a/src/freedreno/ir3/ir3.h b/src/freedreno/ir3/ir3.h
index d326de7c080..45d28d007bf 100644
--- a/src/freedreno/ir3/ir3.h
+++ b/src/freedreno/ir3/ir3.h
@@ -2582,6 +2582,7 @@ INSTR4(ATOMIC_S_AND)
 INSTR4(ATOMIC_S_OR)
 INSTR4(ATOMIC_S_XOR)
 #endif
+INSTR4NODST(LDG_K)
 
 /* cat7 instructions: */
 INSTR0(BAR)
diff --git a/src/freedreno/ir3/ir3_compiler_nir.c b/src/freedreno/ir3/ir3_compiler_nir.c
index 8216e95c9f1..e97ec561f37 100644
--- a/src/freedreno/ir3/ir3_compiler_nir.c
+++ b/src/freedreno/ir3/ir3_compiler_nir.c
@@ -920,6 +920,40 @@ emit_intrinsic_copy_ubo_to_uniform(struct ir3_context *ctx,
    array_insert(b, b->keeps, ldc);
 }
 
+static void
+emit_intrinsic_copy_global_to_uniform(struct ir3_context *ctx,
+                                      nir_intrinsic_instr *intr)
+{
+   struct ir3_block *b = ctx->block;
+
+   unsigned size = nir_intrinsic_range(intr);
+   unsigned dst = nir_intrinsic_range_base(intr);
+   unsigned addr_offset = nir_intrinsic_base(intr);
+   unsigned dst_lo = dst & 0xff;
+   unsigned dst_hi = dst >> 8;
+
+   struct ir3_instruction *a1 = NULL;
+   if (dst_hi)
+      a1 = ir3_get_addr1(ctx, dst_hi << 8);
+
+   struct ir3_instruction *addr_lo = ir3_get_src(ctx, &intr->src[0])[0];
+   struct ir3_instruction *addr_hi = ir3_get_src(ctx, &intr->src[0])[1];
+   struct ir3_instruction *addr = ir3_collect(b, addr_lo, addr_hi);
+   struct ir3_instruction *ldg = ir3_LDG_K(b, create_immed(b, dst_lo), 0, addr, 0,
+                                           create_immed(b, addr_offset), 0,
+                                           create_immed(b, size), 0);
+   ldg->barrier_class = ldg->barrier_conflict = IR3_BARRIER_CONST_W;
+   ldg->cat6.type = TYPE_U32;
+
+   if (a1) {
+      ir3_instr_set_address(ldg, a1);
+      ldg->flags |= IR3_INSTR_A1EN;
+   }
+
+   array_insert(b, b->keeps, ldg);
+}
+
+
 /* handles direct/indirect UBO reads: */
 static void
 emit_intrinsic_load_ubo(struct ir3_context *ctx, nir_intrinsic_instr *intr,
@@ -2277,6 +2311,9 @@ emit_intrinsic(struct ir3_context *ctx, nir_intrinsic_instr *intr)
    case nir_intrinsic_copy_ubo_to_uniform_ir3:
       emit_intrinsic_copy_ubo_to_uniform(ctx, intr);
       break;
+   case nir_intrinsic_copy_global_to_uniform_ir3:
+      emit_intrinsic_copy_global_to_uniform(ctx, intr);
+      break;
    case nir_intrinsic_load_frag_coord:
    case nir_intrinsic_load_frag_coord_unscaled_ir3:
       ir3_split_dest(b, dst, get_frag_coord(ctx, intr), 0, 4);
diff --git a/src/freedreno/ir3/ir3_nir.c b/src/freedreno/ir3/ir3_nir.c
index 46587509e0a..8dc68f10083 100644
--- a/src/freedreno/ir3/ir3_nir.c
+++ b/src/freedreno/ir3/ir3_nir.c
@@ -808,6 +808,13 @@ ir3_nir_lower_variant(struct ir3_shader_variant *so, nir_shader *s)
        !(ir3_shader_debug & IR3_DBG_NOPREAMBLE))
       progress |= OPT(s, ir3_nir_opt_preamble, so);
 
+   if (so->compiler->load_shader_consts_via_preamble)
+      progress |= OPT(s, ir3_nir_lower_driver_params_to_ubo, so);
+
+   /* TODO: ldg.k might also work on a6xx */
+   if (so->compiler->gen >= 7)
+      progress |= OPT(s, ir3_nir_lower_const_global_loads, so);
+
    if (!so->binning_pass)
       OPT_V(s, ir3_nir_analyze_ubo_ranges, so);
 
@@ -1053,7 +1060,8 @@ ir3_setup_const_state(nir_shader *nir, struct ir3_shader_variant *v,
    assert((const_state->ubo_state.size % 16) == 0);
    unsigned constoff = v->shader_options.num_reserved_user_consts +
                        const_state->ubo_state.size / 16 +
-                       const_state->preamble_size;
+                       const_state->preamble_size +
+                       const_state->global_size;
    unsigned ptrsz = ir3_pointer_size(compiler);
 
    if (const_state->num_ubos > 0 && compiler->gen < 6) {
diff --git a/src/freedreno/ir3/ir3_nir.h b/src/freedreno/ir3/ir3_nir.h
index af1ad07d93d..5320340ea2c 100644
--- a/src/freedreno/ir3/ir3_nir.h
+++ b/src/freedreno/ir3/ir3_nir.h
@@ -80,6 +80,7 @@ void ir3_setup_const_state(nir_shader *nir, struct ir3_shader_variant *v,
 bool ir3_nir_lower_load_constant(nir_shader *nir, struct ir3_shader_variant *v);
 void ir3_nir_analyze_ubo_ranges(nir_shader *nir, struct ir3_shader_variant *v);
 bool ir3_nir_lower_ubo_loads(nir_shader *nir, struct ir3_shader_variant *v);
+bool ir3_nir_lower_const_global_loads(nir_shader *nir, struct ir3_shader_variant *v);
 bool ir3_nir_fixup_load_uniform(nir_shader *nir);
 bool ir3_nir_opt_preamble(nir_shader *nir, struct ir3_shader_variant *v);
 bool ir3_nir_lower_preamble(nir_shader *nir, struct ir3_shader_variant *v);
diff --git a/src/freedreno/ir3/ir3_nir_analyze_ubo_ranges.c b/src/freedreno/ir3/ir3_nir_analyze_ubo_ranges.c
index f364c7cdf9d..4982fdc8c24 100644
--- a/src/freedreno/ir3/ir3_nir_analyze_ubo_ranges.c
+++ b/src/freedreno/ir3/ir3_nir_analyze_ubo_ranges.c
@@ -34,11 +34,18 @@ get_ubo_load_range(nir_shader *nir, nir_intrinsic_instr *instr,
    uint32_t offset = nir_intrinsic_range_base(instr);
    uint32_t size = nir_intrinsic_range(instr);
 
+   if (instr->intrinsic == nir_intrinsic_load_global_ir3) {
+      offset *= 4;
+      size *= 4;
+   }
+
    /* If the offset is constant, the range is trivial (and NIR may not have
    * figured it out).
    */
   if (nir_src_is_const(instr->src[1])) {
      offset = nir_src_as_uint(instr->src[1]);
+      if (instr->intrinsic == nir_intrinsic_load_global_ir3)
+         offset *= 4;
      size = nir_intrinsic_dest_components(instr) * 4;
   }
 
@@ -55,17 +62,28 @@
 static bool
 get_ubo_info(nir_intrinsic_instr *instr, struct ir3_ubo_info *ubo)
 {
-   if (nir_src_is_const(instr->src[0])) {
+   if (instr->intrinsic == nir_intrinsic_load_global_ir3) {
+      ubo->global_base = instr->src[0].ssa;
+      ubo->block = 0;
+      ubo->bindless_base = 0;
+      ubo->bindless = false;
+      ubo->global = true;
+      return true;
+   } else if (nir_src_is_const(instr->src[0])) {
+      ubo->global_base = NULL;
       ubo->block = nir_src_as_uint(instr->src[0]);
       ubo->bindless_base = 0;
       ubo->bindless = false;
+      ubo->global = false;
       return true;
    } else {
       nir_intrinsic_instr *rsrc = ir3_bindless_resource(instr->src[0]);
       if (rsrc && nir_src_is_const(rsrc->src[0])) {
+         ubo->global_base = NULL;
         ubo->block = nir_src_as_uint(rsrc->src[0]);
         ubo->bindless_base = nir_intrinsic_desc_set(rsrc);
         ubo->bindless = true;
+         ubo->global = false;
         return true;
      }
   }
@@ -273,7 +291,8 @@ lower_ubo_load_to_uniform(nir_intrinsic_instr *instr, nir_builder *b,
    struct ir3_ubo_range r;
    if (!get_ubo_load_range(b->shader, instr, alignment, &r)) {
-      track_ubo_use(instr, b, num_ubos);
+      if (instr->intrinsic == nir_intrinsic_load_ubo)
+         track_ubo_use(instr, b, num_ubos);
       return false;
    }
 
@@ -283,7 +302,8 @@ lower_ubo_load_to_uniform(nir_intrinsic_instr *instr, nir_builder *b,
    */
   const struct ir3_ubo_range *range = get_existing_range(instr, state, &r);
   if (!range) {
-      track_ubo_use(instr, b, num_ubos);
+      if (instr->intrinsic == nir_intrinsic_load_ubo)
+         track_ubo_use(instr, b, num_ubos);
      return false;
   }
 
@@ -292,20 +312,23 @@ lower_ubo_load_to_uniform(nir_intrinsic_instr *instr, nir_builder *b,
 
    handle_partial_const(b, &ubo_offset, &const_offset);
 
-   /* UBO offset is in bytes, but uniform offset is in units of
-    * dwords, so we need to divide by 4 (right-shift by 2). For ldc the
-    * offset is in units of 16 bytes, so we need to multiply by 4. And
-    * also the same for the constant part of the offset:
-    */
-   const int shift = -2;
-   nir_def *new_offset = ir3_nir_try_propagate_bit_shift(b, ubo_offset, -2);
-   nir_def *uniform_offset = NULL;
-   if (new_offset) {
-      uniform_offset = new_offset;
-   } else {
-      uniform_offset = shift > 0
-                          ? nir_ishl_imm(b, ubo_offset, shift)
-                          : nir_ushr_imm(b, ubo_offset, -shift);
+   nir_def *uniform_offset = ubo_offset;
+
+   if (instr->intrinsic == nir_intrinsic_load_ubo) {
+      /* UBO offset is in bytes, but uniform offset is in units of
+       * dwords, so we need to divide by 4 (right-shift by 2). For ldc the
+       * offset is in units of 16 bytes, so we need to multiply by 4. And
+       * also the same for the constant part of the offset:
+       */
+      const int shift = -2;
+      nir_def *new_offset = ir3_nir_try_propagate_bit_shift(b, ubo_offset, -2);
+      if (new_offset) {
+         uniform_offset = new_offset;
+      } else {
+         uniform_offset = shift > 0
+                             ? nir_ishl_imm(b, ubo_offset, shift)
+                             : nir_ushr_imm(b, ubo_offset, -shift);
+      }
    }
 
    assert(!(const_offset & 0x3));
@@ -336,6 +359,174 @@ lower_ubo_load_to_uniform(nir_intrinsic_instr *instr, nir_builder *b,
    return true;
 }
 
+/* This isn't nearly as comprehensive as what's done in nir_opt_preamble, but we
+ * need to hoist the load_global base into the preamble. Currently the only user
+ * is turnip with inline uniforms, so we can be simple and only handle a few
+ * uncomplicated intrinsics.
+ *
+ * TODO: Fold what this pass does into opt_preamble, which will give us a better
+ * heuristic for what to push and we won't need this.
+ */
+static bool
+def_is_rematerializable(nir_def *def)
+{
+   switch (def->parent_instr->type) {
+   case nir_instr_type_load_const:
+      return true;
+   case nir_instr_type_intrinsic: {
+      nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(def->parent_instr);
+      switch (intrin->intrinsic) {
+      case nir_intrinsic_load_ubo:
+         return def_is_rematerializable(intrin->src[0].ssa) &&
+                def_is_rematerializable(intrin->src[1].ssa);
+      case nir_intrinsic_bindless_resource_ir3:
+         return def_is_rematerializable(intrin->src[0].ssa);
+      default:
+         return false;
+      }
+   }
+   case nir_instr_type_alu: {
+      nir_alu_instr *alu = nir_instr_as_alu(def->parent_instr);
+      for (unsigned i = 0; i < nir_op_infos[alu->op].num_inputs; i++) {
+         if (!def_is_rematerializable(alu->src[i].src.ssa))
+            return false;
+      }
+      return true;
+   }
+   default:
+      return false;
+   }
+}
+
+static nir_def *
+_rematerialize_def(nir_builder *b, struct hash_table *remap_ht,
+                   nir_def *def)
+{
+   if (_mesa_hash_table_search(remap_ht, def->parent_instr))
+      return NULL;
+
+   switch (def->parent_instr->type) {
+   case nir_instr_type_load_const:
+      break;
+   case nir_instr_type_intrinsic: {
+      nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(def->parent_instr);
+      for (unsigned i = 0; i < nir_intrinsic_infos[intrin->intrinsic].num_srcs;
+           i++)
+         _rematerialize_def(b, remap_ht, intrin->src[i].ssa);
+      break;
+   }
+   case nir_instr_type_alu: {
+      nir_alu_instr *alu = nir_instr_as_alu(def->parent_instr);
+      for (unsigned i = 0; i < nir_op_infos[alu->op].num_inputs; i++)
+         _rematerialize_def(b, remap_ht, alu->src[i].src.ssa);
+      break;
+   }
+   default:
+      unreachable("should not get here");
+   }
+
+   nir_instr *instr = nir_instr_clone_deep(b->shader, def->parent_instr,
+                                           remap_ht);
+   nir_builder_instr_insert(b, instr);
+   return nir_instr_def(instr);
+}
+
+static nir_def *
+rematerialize_def(nir_builder *b, nir_def *def)
+{
+   struct hash_table *remap_ht = _mesa_pointer_hash_table_create(NULL);
+
+   nir_def *new_def = _rematerialize_def(b, remap_ht, def);
+
+   _mesa_hash_table_destroy(remap_ht, NULL);
+
+   return new_def;
+}
+
+static bool
+rematerialize_load_global_bases(nir_shader *nir,
+                                struct ir3_ubo_analysis_state *state)
+{
+   bool has_load_global = false;
+   for (unsigned i = 0; i < state->num_enabled; i++) {
+      if (state->range[i].ubo.global) {
+         has_load_global = true;
+         break;
+      }
+   }
+
+   if (!has_load_global)
+      return false;
+
+   nir_function_impl *preamble = nir_shader_get_preamble(nir);
+   nir_builder _b = nir_builder_at(nir_after_impl(preamble));
+   nir_builder *b = &_b;
+
+   for (unsigned i = 0; i < state->num_enabled; i++) {
+      struct ir3_ubo_range *range = &state->range[i];
+
+      if (!range->ubo.global)
+         continue;
+
+      range->ubo.global_base = rematerialize_def(b, range->ubo.global_base);
+   }
+
+   return true;
+}
+
+static bool
+copy_global_to_uniform(nir_shader *nir, struct ir3_ubo_analysis_state *state)
+{
+   if (state->num_enabled == 0)
+      return false;
+
+   nir_function_impl *preamble = nir_shader_get_preamble(nir);
+   nir_builder _b = nir_builder_at(nir_after_impl(preamble));
+   nir_builder *b = &_b;
+
+   for (unsigned i = 0; i < state->num_enabled; i++) {
+      const struct ir3_ubo_range *range = &state->range[i];
+      assert(range->ubo.global);
+
+      nir_def *base = rematerialize_def(b, range->ubo.global_base);
+      unsigned start = range->start;
+      if (start > (1 << 10)) {
+         /* This is happening pretty late, so we need to add the offset
+          * manually ourselves.
+          */
+         nir_def *start_val = nir_imm_int(b, start);
+         nir_def *base_lo = nir_channel(b, base, 0);
+         nir_def *base_hi = nir_channel(b, base, 1);
+         base_lo = nir_iadd(b, base_lo, start_val);
+         nir_def *carry = nir_b2i32(b, nir_ult(b, base_lo, start_val));
+         base_hi = nir_iadd(b, base_hi, carry);
+         base = nir_vec2(b, base_lo, base_hi);
+         start = 0;
+      }
+
+      unsigned size = (range->end - range->start);
+      for (unsigned offset = 0; offset < size; offset += 16) {
+         unsigned const_offset = range->offset / 4 + offset / 4;
+         if (const_offset < 256) {
+            nir_copy_global_to_uniform_ir3(b, base,
+                                           .base = start + offset,
+                                           .range_base = const_offset,
+                                           .range = 1);
+         } else {
+            /* It seems that the a1.x format doesn't work, so we need to
+             * decompose the ldg.k into ldg + stc.
+             */
+            nir_def *load =
+               nir_load_global_ir3(b, 4, 32, base,
+                                   nir_imm_int(b, (start + offset) / 4));
+            nir_store_uniform_ir3(b, load, .base = const_offset);
+         }
+      }
+   }
+
+   return true;
+}
+
 static bool
 copy_ubo_to_uniform(nir_shader *nir, const struct ir3_const_state *const_state,
                     bool const_data_via_cp)
@@ -402,6 +593,130 @@ instr_is_load_ubo(nir_instr *instr)
    return op == nir_intrinsic_load_ubo;
 }
 
+static bool
+instr_is_load_const(nir_instr *instr)
+{
+   if (instr->type != nir_instr_type_intrinsic)
+      return false;
+
+   nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr);
+   nir_intrinsic_op op = intrin->intrinsic;
+
+   if (op != nir_intrinsic_load_global_ir3)
+      return false;
+
+   /* TODO handle non-aligned accesses */
+   if (nir_intrinsic_align_mul(intrin) < 16 ||
+       nir_intrinsic_align_offset(intrin) % 16 != 0)
+      return false;
+
+   enum gl_access_qualifier access = nir_intrinsic_access(intrin);
+   return (access & ACCESS_NON_WRITEABLE) && (access & ACCESS_CAN_SPECULATE);
+}
+
+/* For now, everything we upload is accessed statically and thus will be
+ * used by the shader. Once we can upload dynamically indexed data, we may
+ * upload sparsely accessed arrays, at which point we probably want to
+ * give priority to smaller UBOs, on the assumption that big UBOs will be
+ * accessed dynamically. Alternatively, we can track statically and
+ * dynamically accessed ranges separately and upload static ranges
+ * first.
+ */
+static void
+assign_offsets(struct ir3_ubo_analysis_state *state, unsigned start,
+               unsigned max_upload)
+{
+   uint32_t offset = 0;
+   for (uint32_t i = 0; i < state->num_enabled; i++) {
+      uint32_t range_size = state->range[i].end - state->range[i].start;
+
+      assert(offset <= max_upload);
+      state->range[i].offset = offset + start;
+      assert(offset <= max_upload);
+      offset += range_size;
+   }
+   state->size = offset;
+}
+
+/* Lowering ldg to ldg.k + const uses the same infrastructure as lowering UBO
+ * loads, but must be done separately because the analysis and transform must be
+ * done in the same pass and we cannot reuse the main variant analysis for the
+ * binning variant.
+ */
+bool
+ir3_nir_lower_const_global_loads(nir_shader *nir, struct ir3_shader_variant *v)
+{
+   struct ir3_const_state *const_state = ir3_const_state(v);
+   struct ir3_compiler *compiler = v->compiler;
+
+   if (ir3_shader_debug & IR3_DBG_NOUBOOPT)
+      return false;
+
+   unsigned max_upload;
+   if (v->binning_pass) {
+      max_upload = const_state->global_size * 16;
+   } else {
+      struct ir3_const_state worst_case_const_state = {
+         .preamble_size = const_state->preamble_size,
+      };
+      ir3_setup_const_state(nir, v, &worst_case_const_state);
+      max_upload = (ir3_max_const(v) - worst_case_const_state.offsets.immediate) * 16;
+   }
+
+   struct ir3_ubo_analysis_state state = {};
+   uint32_t upload_remaining = max_upload;
+
+   nir_foreach_function (function, nir) {
+      if (function->impl && !function->is_preamble) {
+         nir_foreach_block (block, function->impl) {
+            nir_foreach_instr (instr, block) {
+               if (instr_is_load_const(instr) &&
+                   def_is_rematerializable(nir_instr_as_intrinsic(instr)->src[0].ssa))
+                  gather_ubo_ranges(nir, nir_instr_as_intrinsic(instr), &state,
+                                    compiler->const_upload_unit,
+                                    &upload_remaining);
+            }
+         }
+      }
+   }
+
+   uint32_t global_offset = v->shader_options.num_reserved_user_consts * 16;
+   assign_offsets(&state, global_offset, max_upload);
+
+   bool progress = copy_global_to_uniform(nir, &state);
+
+   if (progress) {
+      nir_foreach_function (function, nir) {
+         if (function->impl) {
+            if (function->is_preamble) {
+               nir_metadata_preserve(
+                  function->impl, nir_metadata_all);
+               continue;
+            }
+
+            nir_builder builder = nir_builder_create(function->impl);
+            nir_foreach_block (block, function->impl) {
+               nir_foreach_instr_safe (instr, block) {
+                  if (!instr_is_load_const(instr))
+                     continue;
+                  progress |= lower_ubo_load_to_uniform(
+                     nir_instr_as_intrinsic(instr), &builder, &state, NULL,
+                     compiler->const_upload_unit);
+               }
+            }
+
+            nir_metadata_preserve(
+               function->impl, nir_metadata_block_index | nir_metadata_dominance);
+         }
+      }
+   }
+
+   if (!v->binning_pass)
+      const_state->global_size = DIV_ROUND_UP(state.size, 16);
+
+   return progress;
+}
+
 void
 ir3_nir_analyze_ubo_ranges(nir_shader *nir, struct ir3_shader_variant *v)
 {
@@ -417,6 +732,7 @@ ir3_nir_analyze_ubo_ranges(nir_shader *nir, struct ir3_shader_variant *v)
    */
   struct ir3_const_state worst_case_const_state = {
      .preamble_size = const_state->preamble_size,
+      .global_size = const_state->global_size,
   };
   ir3_setup_const_state(nir, v, &worst_case_const_state);
   const uint32_t max_upload =
@@ -429,6 +745,7 @@ ir3_nir_analyze_ubo_ranges(nir_shader *nir, struct ir3_shader_variant *v)
    uint32_t upload_remaining = max_upload;
 
    bool push_ubos = compiler->options.push_ubo_with_preamble;
+
    nir_foreach_function (function, nir) {
       if (function->impl && (!push_ubos || !function->is_preamble)) {
          nir_foreach_block (block, function->impl) {
@@ -442,25 +759,9 @@ ir3_nir_analyze_ubo_ranges(nir_shader *nir, struct ir3_shader_variant *v)
          }
       }
    }
-   /* For now, everything we upload is accessed statically and thus will be
-    * used by the shader. Once we can upload dynamically indexed data, we may
-    * upload sparsely accessed arrays, at which point we probably want to
-    * give priority to smaller UBOs, on the assumption that big UBOs will be
-    * accessed dynamically. Alternatively, we can track statically and
-    * dynamically accessed ranges separately and upload static rangtes
-    * first.
-    */
-
-   uint32_t offset = 0;
-   for (uint32_t i = 0; i < state->num_enabled; i++) {
-      uint32_t range_size = state->range[i].end - state->range[i].start;
-
-      assert(offset <= max_upload);
-      state->range[i].offset = offset + v->shader_options.num_reserved_user_consts * 16;
-      assert(offset <= max_upload);
-      offset += range_size;
-   }
-   state->size = offset;
+   uint32_t ubo_offset = v->shader_options.num_reserved_user_consts * 16 +
+                         const_state->global_size * 16;
+   assign_offsets(state, ubo_offset, max_upload);
 }
 
 bool
diff --git a/src/freedreno/ir3/ir3_nir_opt_preamble.c b/src/freedreno/ir3/ir3_nir_opt_preamble.c
index cd7926ab252..1fa9baf8f44 100644
--- a/src/freedreno/ir3/ir3_nir_opt_preamble.c
+++ b/src/freedreno/ir3/ir3_nir_opt_preamble.c
@@ -349,7 +349,7 @@ ir3_nir_lower_preamble(nir_shader *nir, struct ir3_shader_variant *v)
    /* First, lower load/store_preamble. */
    const struct ir3_const_state *const_state = ir3_const_state(v);
    unsigned preamble_base = v->shader_options.num_reserved_user_consts * 4 +
-      const_state->ubo_state.size / 4;
+      const_state->ubo_state.size / 4 + const_state->global_size * 4;
    unsigned preamble_size = const_state->preamble_size * 4;
 
    BITSET_DECLARE(promoted_to_float, preamble_size);
diff --git a/src/freedreno/ir3/ir3_shader.h b/src/freedreno/ir3/ir3_shader.h
index 8f087f2dcc2..2b2cb198c65 100644
--- a/src/freedreno/ir3/ir3_shader.h
+++ b/src/freedreno/ir3/ir3_shader.h
@@ -123,10 +123,14 @@ enum ir3_wavesize_option {
 /**
  * Description of a lowered UBO.
  */
+struct nir_def;
+
 struct ir3_ubo_info {
+   struct nir_def *global_base; /* For global loads, the base address */
    uint32_t block;         /* Which constant block */
    uint16_t bindless_base; /* For bindless, which base register is used */
    bool bindless;
+   bool global;
 };
 
 /**
@@ -230,6 +234,7 @@ struct ir3_const_state {
    uint32_t *immediates;
 
    unsigned preamble_size;
+   unsigned global_size;
 
    /* State of ubo access lowered to push consts: */
    struct ir3_ubo_analysis_state ubo_state;