gallivm: add load/store scratch support.

Scratch space is per-thread, so allocate (scratch size * vector width)
bytes in total, and add a per-thread base offset to each scratch
load/store.

This is needed for the OpenCL private memory space.

Reviewed-by: Roland Scheidegger <sroland@vmware.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/7304>
This commit is contained in:
Dave Airlie 2020-10-08 11:09:11 +10:00
parent 9c1f6ed804
commit fb56fb02a1
4 changed files with 188 additions and 54 deletions

View file

@ -18,8 +18,6 @@ program/execute/amdgcn-callee-saved-registers: skip
program/execute/amdgcn-f16-inline-immediates: skip
program/execute/amdgcn-f32-inline-immediates/add integer 64: fail
program/execute/amdgcn-i16-inline-immediates: skip
program/execute/amdgcn-mubuf-negative-vaddr: crash
program/execute/amdgcn.sign_extend_inreg: crash
program/execute/atomic_int64_add-global: skip
program/execute/atomic_int64_add-global-return: skip
program/execute/atomic_int64_add-local: skip
@ -65,8 +63,7 @@ program/execute/builtin/builtin-char-mul_hi-1.0.generated/mul_hi char4: fail
program/execute/builtin/builtin-char-mul_hi-1.0.generated/mul_hi char8: fail
program/execute/builtin/builtin-char-popcount-1.2.generated: skip
program/execute/builtin/builtin-char-rotate-1.0.generated: crash
program/execute/builtin/builtin-float-cos-1.0.generated: crash
program/execute/builtin/builtin-float-fma-1.0.generated: crash
program/execute/builtin/builtin-float-cos-1.0.generated: timeout
program/execute/builtin/builtin-float-fmax-1.0.generated/fmax float1: fail
program/execute/builtin/builtin-float-fmax-1.0.generated/fmax float16: fail
program/execute/builtin/builtin-float-fmax-1.0.generated/fmax float2: fail
@ -85,13 +82,9 @@ program/execute/builtin/builtin-float-fmin-1.0.generated/tss_fmin float16: fail
program/execute/builtin/builtin-float-fmin-1.0.generated/tss_fmin float2: fail
program/execute/builtin/builtin-float-fmin-1.0.generated/tss_fmin float4: fail
program/execute/builtin/builtin-float-fmin-1.0.generated/tss_fmin float8: fail
program/execute/builtin/builtin-float-fract-1.0.generated: crash
program/execute/builtin/builtin-float-frexp-1.0.generated: crash
program/execute/builtin/builtin-float-isfinite-1.0.generated: crash
program/execute/builtin/builtin-float-isnormal-1.0.generated: crash
program/execute/builtin/builtin-float-ldexp-1.0.generated: fail
program/execute/builtin/builtin-float-lgamma-1.0.generated: crash
program/execute/builtin/builtin-float-lgamma_r-1.0.generated: crash
program/execute/builtin/builtin-float-maxmag-1.1.generated/maxmag float1: fail
program/execute/builtin/builtin-float-maxmag-1.1.generated/maxmag float16: fail
program/execute/builtin/builtin-float-maxmag-1.1.generated/maxmag float2: fail
@ -103,12 +96,9 @@ program/execute/builtin/builtin-float-minmag-1.1.generated/minmag float2: fail
program/execute/builtin/builtin-float-minmag-1.1.generated/minmag float4: fail
program/execute/builtin/builtin-float-minmag-1.1.generated/minmag float8: fail
program/execute/builtin/builtin-float-mix-1.0.generated: crash
program/execute/builtin/builtin-float-modf-1.0.generated: crash
program/execute/builtin/builtin-float-remquo-1.0.generated: crash
program/execute/builtin/builtin-float-sin-1.0.generated: crash
program/execute/builtin/builtin-float-sin-1.0.generated: timeout
program/execute/builtin/builtin-float-sincos-1.0.generated: timeout
program/execute/builtin/builtin-float-tan-1.0.generated: crash
program/execute/builtin/builtin-float-tgamma-1.0.generated: crash
program/execute/builtin/builtin-float-tan-1.0.generated: timeout
program/execute/builtin/builtin-int-popcount-1.2.generated: skip
program/execute/builtin/builtin-long-mad_hi-1.0.generated/mad_hi long1: fail
program/execute/builtin/builtin-long-mad_hi-1.0.generated/mad_hi long16: fail
@ -178,10 +168,9 @@ program/execute/builtin/builtin-ushort-popcount-1.2.generated: skip
program/execute/builtin/builtin-ushort-upsample-1.0.generated: crash
program/execute/call-clobbers-amdgcn: skip
program/execute/calls-large-struct: crash
program/execute/calls-struct: crash
program/execute/gegl-rgb-gamma-u8-to-ragabaf: crash
program/execute/calls-struct/regs struct: fail
program/execute/calls-struct/small struct in regs: fail
program/execute/global-offset/3d, input dependent: fail
program/execute/i32-stack-array: crash
program/execute/image-attributes: crash
program/execute/image-read-2d/read float from cl_float cl_rgba image.: fail
program/execute/image-read-2d/read signed integer from cl_signed_int8 cl_rgba image.: fail
@ -192,66 +181,33 @@ program/execute/load-hi16-generic: skip
program/execute/load-lo16: crash
program/execute/load-lo16-generic: skip
program/execute/mad-mix: skip
program/execute/multiple-stack-objects: crash
program/execute/negative-private-base-pointer: crash
program/execute/program-tester-check-local-size-test-should-skip/this test should skip: skip
program/execute/pyrit-wpa-psk: crash
program/execute/realign-stack: crash
program/execute/reference: crash
program/execute/sampler/read from image using clamp_to_edge addressing mode: fail
program/execute/sampler/read from image using linear filtering and normalized coords: fail
program/execute/sampler/read from image using linear filtering and unnormalized coords: fail
program/execute/scalar-logical-float: skip
program/execute/store-hi16-generic: skip
program/execute/v2i32-stack: crash
program/execute/v3i32-stack: crash
program/execute/v3i32-stack-array: crash
program/execute/v4i32-stack: crash
program/execute/vload/vload-char-private: crash
program/execute/vload/vload-double-private: crash
program/execute/vload/vload-float-private: crash
program/execute/vload/vload-half-constant: skip
program/execute/vload/vload-half-global: skip
program/execute/vload/vload-half-local: skip
program/execute/vload/vload-half-private: skip
program/execute/vload/vload-int-private: crash
program/execute/vload/vload-long-private: crash
program/execute/vload/vload-short-private: crash
program/execute/vload/vload-uchar-private: crash
program/execute/vload/vload-uint-private: crash
program/execute/vload/vload-ulong-private: crash
program/execute/vload/vload-ushort-private: crash
program/execute/vload/vload_half-float-private: crash
program/execute/vload/vloada_half-float-private: crash
program/execute/vstore/vstore-char-private: crash
program/execute/vstore/vstore-double-private: crash
program/execute/vstore/vstore-float-private: crash
program/execute/vstore/vstore-half-global: skip
program/execute/vstore/vstore-half-local: skip
program/execute/vstore/vstore-half-private: skip
program/execute/vstore/vstore-int-private: crash
program/execute/vstore/vstore-long-private: crash
program/execute/vstore/vstore-short-private: crash
program/execute/vstore/vstore-uchar-private: crash
program/execute/vstore/vstore-uint-private: crash
program/execute/vstore/vstore-ulong-private: crash
program/execute/vstore/vstore-ushort-private: crash
program/execute/vstore/vstore_half-double-global: crash
program/execute/vstore/vstore_half-double-local: crash
program/execute/vstore/vstore_half-double-private: crash
program/execute/vstore/vstore_half-float-private: crash
program/execute/vstore/vstorea_half-double-global: crash
program/execute/vstore/vstorea_half-double-local: crash
program/execute/vstore/vstorea_half-double-private: crash
program/execute/vstore/vstorea_half-float-private: crash
summary:
name: results
---- --------
pass: 2969
fail: 105
crash: 67
pass: 3560
fail: 107
crash: 18
skip: 73
timeout: 1
timeout: 4
warn: 0
incomplete: 0
dmesg-warn: 0
@ -259,4 +215,4 @@ summary:
changes: 0
fixes: 0
regressions: 0
total: 3215
total: 3762

View file

@ -1485,6 +1485,28 @@ static void visit_interp(struct lp_build_nir_context *bld_base,
bld_base->interp_at(bld_base, num_components, var, centroid, sample, const_index, indir_index, offsets, result);
}
/**
 * Translate a scratch-load intrinsic.
 *
 * The offset operand is taken from source 0; the component count and bit
 * size are taken from the instruction's destination.  The actual code
 * generation is delegated to the context's load_scratch hook, which
 * writes the loaded components into \p result.
 */
static void visit_load_scratch(struct lp_build_nir_context *bld_base,
                               nir_intrinsic_instr *instr,
                               LLVMValueRef result[NIR_MAX_VEC_COMPONENTS])
{
   LLVMValueRef scratch_offset = get_src(bld_base, instr->src[0]);
   unsigned num_components = nir_dest_num_components(instr->dest);
   unsigned dest_bits = nir_dest_bit_size(instr->dest);

   bld_base->load_scratch(bld_base, num_components, dest_bits,
                          scratch_offset, result);
}
/**
 * Translate a scratch-store intrinsic.
 *
 * Source 0 is the value to store and source 1 is the offset.  The write
 * mask is read from const_index[2]; the component count and bit size are
 * derived from the value source.  Code generation is delegated to the
 * context's store_scratch hook.
 */
static void visit_store_scratch(struct lp_build_nir_context *bld_base,
                                nir_intrinsic_instr *instr)
{
   LLVMValueRef store_value = get_src(bld_base, instr->src[0]);
   LLVMValueRef scratch_offset = get_src(bld_base, instr->src[1]);
   int write_mask = instr->const_index[2];
   int num_components = nir_src_num_components(instr->src[0]);
   int value_bits = nir_src_bit_size(instr->src[0]);

   bld_base->store_scratch(bld_base, write_mask, num_components, value_bits,
                           scratch_offset, store_value);
}
static void visit_intrinsic(struct lp_build_nir_context *bld_base,
nir_intrinsic_instr *instr)
{
@ -1648,6 +1670,12 @@ static void visit_intrinsic(struct lp_build_nir_context *bld_base,
case nir_intrinsic_interp_deref_at_sample:
visit_interp(bld_base, instr, result);
break;
case nir_intrinsic_load_scratch:
visit_load_scratch(bld_base, instr, result);
break;
case nir_intrinsic_store_scratch:
visit_store_scratch(bld_base, instr);
break;
default:
fprintf(stderr, "Unsupported intrinsic: ");
nir_print_instr(&instr->instr, stderr);

View file

@ -147,6 +147,15 @@ struct lp_build_nir_context
LLVMValueRef reg_storage,
LLVMValueRef dst[NIR_MAX_VEC_COMPONENTS]);
void (*load_scratch)(struct lp_build_nir_context *bld_base,
unsigned nc, unsigned bit_size,
LLVMValueRef offset,
LLVMValueRef result[NIR_MAX_VEC_COMPONENTS]);
void (*store_scratch)(struct lp_build_nir_context *bld_base,
unsigned writemask, unsigned nc,
unsigned bit_size, LLVMValueRef offset,
LLVMValueRef val);
void (*emit_var_decl)(struct lp_build_nir_context *bld_base,
nir_variable *var);
@ -209,6 +218,8 @@ struct lp_build_nir_soa_context
LLVMValueRef ssbo_sizes[LP_MAX_TGSI_SHADER_BUFFERS];
LLVMValueRef shared_ptr;
LLVMValueRef scratch_ptr;
unsigned scratch_size;
const struct lp_build_coro_suspend_info *coro;

View file

@ -1833,6 +1833,136 @@ emit_interp_at(struct lp_build_nir_context *bld_base,
}
}
/**
 * Build the constant per-lane base offsets into the scratch allocation.
 *
 * Lane i of the returned constant holds scratch_size * i, using the
 * integer element type that corresponds to \p type.  When \p type has a
 * single lane, a scalar zero constant is returned instead of a vector.
 */
static LLVMValueRef get_scratch_thread_offsets(struct gallivm_state *gallivm,
                                               struct lp_type type,
                                               unsigned scratch_size)
{
   LLVMTypeRef int_elem_type = lp_build_int_elem_type(gallivm, type);
   LLVMValueRef lane_base[LP_MAX_VECTOR_LENGTH];
   unsigned lane;

   /* Scalar case: there is only one lane, and it starts at offset 0. */
   if (type.length == 1)
      return LLVMConstInt(int_elem_type, 0, 0);

   for (lane = 0; lane < type.length; ++lane) {
      lane_base[lane] = LLVMConstInt(int_elem_type, scratch_size * lane, 0);
   }

   return LLVMConstVector(lane_base, type.length);
}
/**
 * Emit code for a load from scratch memory.
 *
 * \param nc        number of components to load
 * \param bit_size  bit width of each component
 * \param offset    per-lane offset vector into the lane's scratch region
 * \param outval    receives one result vector per component
 *
 * Each lane's offset is first biased by that lane's base offset (see
 * get_scratch_thread_offsets) and then shifted right by
 * bit_size_to_shift_size(bit_size), so that it can be used as an element
 * index into scratch_ptr viewed as an array of load_bld->elem_type
 * (assumes the helper returns log2 of the element size in bytes -- TODO
 * confirm).
 *
 * The load is performed one lane at a time in a generated loop: lanes
 * whose execution-mask entry is non-zero read from scratch, all other
 * lanes get a zero of the matching integer width.
 */
static void
emit_load_scratch(struct lp_build_nir_context *bld_base,
                  unsigned nc, unsigned bit_size,
                  LLVMValueRef offset,
                  LLVMValueRef outval[NIR_MAX_VEC_COMPONENTS])
{
   struct lp_build_nir_soa_context *bld = (struct lp_build_nir_soa_context *)bld_base;
   struct gallivm_state *gallivm = bld_base->base.gallivm;
   LLVMBuilderRef builder = gallivm->builder;
   struct lp_build_context *uint_bld = &bld_base->uint_bld;
   LLVMValueRef lane_bases =
      get_scratch_thread_offsets(gallivm, uint_bld->type, bld->scratch_size);
   uint32_t elem_shift = bit_size_to_shift_size(bit_size);
   struct lp_build_context *load_bld = get_int_bld(bld_base, true, bit_size);

   /* Move into this lane's slice of scratch, then convert to element units. */
   offset = lp_build_add(uint_bld, offset, lane_bases);
   offset = lp_build_shr_imm(uint_bld, offset, elem_shift);

   for (unsigned comp = 0; comp < nc; comp++) {
      LLVMValueRef comp_index =
         lp_build_add(uint_bld, offset,
                      lp_build_const_int_vec(gallivm, uint_bld->type, comp));
      LLVMValueRef exec_mask = mask_vec(bld_base);
      /* Stack slot that accumulates the result vector lane by lane. */
      LLVMValueRef result_slot = lp_build_alloca(gallivm, load_bld->vec_type, "");
      struct lp_build_loop_state lane_loop;
      struct lp_build_if_state lane_if;
      LLVMValueRef lane_index, lane_active, accum;

      lp_build_loop_begin(&lane_loop, gallivm, lp_build_const_int32(gallivm, 0));

      lane_index = LLVMBuildExtractElement(builder, comp_index,
                                           lane_loop.counter, "");

      lane_active = LLVMBuildICmp(builder, LLVMIntNE, exec_mask,
                                  uint_bld->zero, "");
      lane_active = LLVMBuildExtractElement(builder, lane_active,
                                            lane_loop.counter, "");

      lp_build_if(&lane_if, gallivm, lane_active);
      {
         /* Active lane: fetch the element and insert it into the result. */
         LLVMValueRef typed_scratch =
            LLVMBuildBitCast(builder, bld->scratch_ptr,
                             LLVMPointerType(load_bld->elem_type, 0), "");
         LLVMValueRef loaded = lp_build_pointer_get(builder, typed_scratch,
                                                    lane_index);

         accum = LLVMBuildLoad(builder, result_slot, "");
         accum = LLVMBuildInsertElement(builder, accum, loaded,
                                        lane_loop.counter, "");
         LLVMBuildStore(builder, accum, result_slot);
      }
      lp_build_else(&lane_if);
      {
         /* Inactive lane: fill with a zero of the access width. */
         LLVMValueRef zero_elem;

         accum = LLVMBuildLoad(builder, result_slot, "");

         switch (bit_size) {
         case 64:
            zero_elem = LLVMConstInt(LLVMInt64TypeInContext(gallivm->context), 0, 0);
            break;
         case 16:
            zero_elem = LLVMConstInt(LLVMInt16TypeInContext(gallivm->context), 0, 0);
            break;
         case 8:
            zero_elem = LLVMConstInt(LLVMInt8TypeInContext(gallivm->context), 0, 0);
            break;
         default:
            zero_elem = lp_build_const_int32(gallivm, 0);
            break;
         }

         accum = LLVMBuildInsertElement(builder, accum, zero_elem,
                                        lane_loop.counter, "");
         LLVMBuildStore(builder, accum, result_slot);
      }
      lp_build_endif(&lane_if);

      /* Loop bound is the number of lanes in the vector type. */
      lp_build_loop_end_cond(&lane_loop,
                             lp_build_const_int32(gallivm, uint_bld->type.length),
                             NULL, LLVMIntUGE);

      outval[comp] = LLVMBuildLoad(builder, result_slot, "");
   }
}
/**
 * Emit code for a store to scratch memory.
 *
 * \param writemask  bit c set means component c is written
 * \param nc         number of components in \p dst
 * \param bit_size   bit width of each component
 * \param offset     per-lane offset vector into the lane's scratch region
 * \param dst        value to store; used directly when nc == 1, otherwise
 *                   component c is obtained with LLVMBuildExtractValue
 *
 * Each lane's offset is biased by that lane's base offset (see
 * get_scratch_thread_offsets) and shifted right by
 * bit_size_to_shift_size(bit_size) so it indexes scratch_ptr viewed as an
 * array of store_bld->elem_type (assumes the helper returns log2 of the
 * element size in bytes -- TODO confirm).
 *
 * The store is performed one lane at a time in a generated loop, and only
 * for lanes whose execution-mask entry is non-zero.
 */
static void
emit_store_scratch(struct lp_build_nir_context *bld_base,
                   unsigned writemask, unsigned nc,
                   unsigned bit_size, LLVMValueRef offset,
                   LLVMValueRef dst)
{
   struct lp_build_nir_soa_context *bld = (struct lp_build_nir_soa_context *)bld_base;
   struct gallivm_state *gallivm = bld_base->base.gallivm;
   LLVMBuilderRef builder = gallivm->builder;
   struct lp_build_context *uint_bld = &bld_base->uint_bld;
   LLVMValueRef lane_bases =
      get_scratch_thread_offsets(gallivm, uint_bld->type, bld->scratch_size);
   uint32_t elem_shift = bit_size_to_shift_size(bit_size);
   struct lp_build_context *store_bld = get_int_bld(bld_base, true, bit_size);
   LLVMValueRef exec_mask = mask_vec(bld_base);

   /* Move into this lane's slice of scratch, then convert to element units. */
   offset = lp_build_add(uint_bld, offset, lane_bases);
   offset = lp_build_shr_imm(uint_bld, offset, elem_shift);

   for (unsigned comp = 0; comp < nc; comp++) {
      LLVMValueRef comp_value;
      LLVMValueRef comp_index;
      LLVMValueRef lane_value, lane_index, lane_active, typed_scratch;
      struct lp_build_loop_state lane_loop;
      struct lp_build_if_state lane_if;

      /* Components not selected by the write mask are left untouched. */
      if ((writemask & (1u << comp)) == 0)
         continue;

      if (nc == 1)
         comp_value = dst;
      else
         comp_value = LLVMBuildExtractValue(builder, dst, comp, "");

      comp_index =
         lp_build_add(uint_bld, offset,
                      lp_build_const_int_vec(gallivm, uint_bld->type, comp));

      lp_build_loop_begin(&lane_loop, gallivm, lp_build_const_int32(gallivm, 0));

      /* Pull out this lane's scalar and reinterpret it as the store type. */
      lane_value = LLVMBuildExtractElement(builder, comp_value,
                                           lane_loop.counter, "");
      lane_value = LLVMBuildBitCast(builder, lane_value,
                                    store_bld->elem_type, "");

      lane_index = LLVMBuildExtractElement(builder, comp_index,
                                           lane_loop.counter, "");

      lane_active = LLVMBuildICmp(builder, LLVMIntNE, exec_mask,
                                  uint_bld->zero, "");
      lane_active = LLVMBuildExtractElement(builder, lane_active,
                                            lane_loop.counter, "");

      lp_build_if(&lane_if, gallivm, lane_active);
      {
         typed_scratch =
            LLVMBuildBitCast(builder, bld->scratch_ptr,
                             LLVMPointerType(store_bld->elem_type, 0), "");
         lp_build_pointer_set(builder, typed_scratch, lane_index, lane_value);
      }
      lp_build_endif(&lane_if);

      /* Loop bound is the number of lanes in the vector type. */
      lp_build_loop_end_cond(&lane_loop,
                             lp_build_const_int32(gallivm, uint_bld->type.length),
                             NULL, LLVMIntUGE);
   }
}
void lp_build_nir_soa(struct gallivm_state *gallivm,
struct nir_shader *shader,
const struct lp_build_tgsi_params *params,
@ -1930,6 +2060,8 @@ void lp_build_nir_soa(struct gallivm_state *gallivm,
bld.bld_base.vote = emit_vote;
bld.bld_base.helper_invocation = emit_helper_invocation;
bld.bld_base.interp_at = emit_interp_at;
bld.bld_base.load_scratch = emit_load_scratch;
bld.bld_base.store_scratch = emit_store_scratch;
bld.mask = params->mask;
bld.inputs = params->inputs;
@ -1976,6 +2108,13 @@ void lp_build_nir_soa(struct gallivm_state *gallivm,
bld.bld_base.shader = shader;
if (shader->scratch_size) {
bld.scratch_ptr = lp_build_array_alloca(gallivm,
LLVMInt8TypeInContext(gallivm->context),
lp_build_const_int32(gallivm, shader->scratch_size * type.length),
"scratch");
}
bld.scratch_size = shader->scratch_size;
emit_prologue(&bld);
lp_build_nir_llvm(&bld.bld_base, shader);