gallivm/nir: Add a short circuit uniform-offset mode for load_global.

If we know the offset is constant, we don't have to ask LLVM to loop over the
elements pulling the same value out over and over.

This doesn't seem to have produced a win in the testcase I was looking at,
but it was an easier entrypoint to figuring out how to do scalar memory
access than load_memory, and will probably affect some workload.

Reviewed-by: Dave Airlie <airlied@redhat.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/14999>
This commit is contained in:
Emma Anholt 2022-02-10 14:29:13 -08:00 committed by Marge Bot
parent d74606d440
commit 181f25aff4
3 changed files with 19 additions and 1 deletions

View file

@ -1695,9 +1695,10 @@ static void visit_load_global(struct lp_build_nir_context *bld_base,
nir_intrinsic_instr *instr, LLVMValueRef result[NIR_MAX_VEC_COMPONENTS])
{
LLVMValueRef addr = get_src(bld_base, instr->src[0]);
bool offset_is_uniform = nir_src_is_always_uniform(instr->src[0]);
bld_base->load_global(bld_base, nir_dest_num_components(instr->dest), nir_dest_bit_size(instr->dest),
nir_src_bit_size(instr->src[0]),
addr, result);
offset_is_uniform, addr, result);
}
static void visit_store_global(struct lp_build_nir_context *bld_base,

View file

@ -91,6 +91,7 @@ struct lp_build_nir_context
void (*load_global)(struct lp_build_nir_context *bld_base,
unsigned nc, unsigned bit_size,
unsigned offset_bit_size,
bool offset_is_global,
LLVMValueRef offset, LLVMValueRef result[NIR_MAX_VEC_COMPONENTS]);
void (*store_global)(struct lp_build_nir_context *bld_base,

View file

@ -816,6 +816,7 @@ static void emit_load_global(struct lp_build_nir_context *bld_base,
unsigned nc,
unsigned bit_size,
unsigned addr_bit_size,
bool offset_is_uniform,
LLVMValueRef addr,
LLVMValueRef outval[NIR_MAX_VEC_COMPONENTS])
{
@ -827,6 +828,21 @@ static void emit_load_global(struct lp_build_nir_context *bld_base,
res_bld = get_int_bld(bld_base, true, bit_size);
if (offset_is_uniform && invocation_0_must_be_active(bld_base)) {
/* If the offset is uniform, then use the address from invocation 0 to
* load, and broadcast to all invocations.
*/
LLVMValueRef addr_ptr = LLVMBuildExtractElement(gallivm->builder, addr,
lp_build_const_int32(gallivm, 0), "");
addr_ptr = global_addr_to_ptr(gallivm, addr_ptr, bit_size);
for (unsigned c = 0; c < nc; c++) {
LLVMValueRef scalar = lp_build_pointer_get(builder, addr_ptr, lp_build_const_int32(gallivm, c));
outval[c] = lp_build_broadcast_scalar(res_bld, scalar);
}
return;
}
for (unsigned c = 0; c < nc; c++) {
LLVMValueRef result = lp_build_alloca(gallivm, res_bld->vec_type, "");
struct lp_build_loop_state loop_state;