freedreno/ir3: shared variable support

Signed-off-by: Rob Clark <robdclark@gmail.com>
This commit is contained in:
Rob Clark 2017-11-09 10:56:43 -05:00
parent dd75abc6f3
commit eaae81058c
3 changed files with 177 additions and 2 deletions

View file

@ -1432,6 +1432,149 @@ emit_intrinsic_atomic_ssbo(struct ir3_context *ctx, nir_intrinsic_instr *intr)
return atomic;
}
/* src[] = { offset }. const_index[] = { base } */
static void
emit_intrinsic_load_shared(struct ir3_context *ctx, nir_intrinsic_instr *intr,
		struct ir3_instruction **dst)
{
	struct ir3_block *blk = ctx->block;
	unsigned base = intr->const_index[0];
	struct ir3_instruction *off = get_src(ctx, &intr->src[0])[0];
	struct ir3_instruction *load;

	/* Emit a single LDL (load-local) covering all requested components,
	 * reading from shared/local memory at off + base:
	 */
	load = ir3_LDL(blk, off, 0, create_immed(blk, intr->num_components), 0);
	load->cat6.src_offset = base;
	load->cat6.type = TYPE_U32;

	/* the destination write-mask covers every loaded component: */
	load->regs[0]->wrmask = MASK(intr->num_components);

	mark_read(ctx, load);

	/* fan the vector result out into per-component dst registers: */
	split_dest(blk, dst, load, 0, intr->num_components);
}
/* src[] = { value, offset }. const_index[] = { base, write_mask } */
static void
emit_intrinsic_store_shared(struct ir3_context *ctx, nir_intrinsic_instr *intr)
{
	struct ir3_block *b = ctx->block;
	struct ir3_instruction *stl, *offset;
	struct ir3_instruction * const *value;
	unsigned base, wrmask;

	value = get_src(ctx, &intr->src[0]);
	offset = get_src(ctx, &intr->src[1])[0];

	base = intr->const_index[0];
	wrmask = intr->const_index[1];

	/* Combine groups of consecutive enabled channels in one write
	 * message. We use ffs to find the first enabled channel and then ffs on
	 * the bit-inverse, down-shifted writemask to determine the length of
	 * the block of enabled bits.
	 *
	 * (trick stolen from i965's fs_visitor::nir_emit_cs_intrinsic())
	 */
	while (wrmask) {
		unsigned first_component = ffs(wrmask) - 1;
		unsigned length = ffs(~(wrmask >> first_component)) - 1;

		/* one STL (store-local) per contiguous run of enabled channels: */
		stl = ir3_STL(b, offset, 0,
			create_collect(b, &value[first_component], length), 0,
			create_immed(b, length), 0);
		stl->cat6.dst_offset = first_component + base;
		stl->cat6.type = TYPE_U32;
		mark_write(ctx, stl);

		/* the store has no SSA result consumers, so pin it in b->keeps
		 * to keep DCE from dropping it:
		 */
		array_insert(b, b->keeps, stl);

		/* Clear the bits in the writemask that we just wrote, then try
		 * again to see if more channels are left.
		 *
		 * Note: clear *all* bits below first_component + length rather
		 * than masking with (15 << ...), which would silently truncate
		 * writemasks wider than four components.
		 */
		wrmask &= ~((1u << (first_component + length)) - 1);
	}
}
/*
* CS shared variable atomic intrinsics
*
* All of the shared variable atomic memory operations read a value from
* memory, compute a new value using one of the operations below, write the
* new value to memory, and return the original value read.
*
* All operations take 2 sources except CompSwap that takes 3. These
* sources represent:
*
* 0: The offset into the shared variable storage region that the atomic
* operation will operate on.
* 1: The data parameter to the atomic function (i.e. the value to add
* in shared_atomic_add, etc).
* 2: For CompSwap only: the second data parameter.
*/
/* Emit a compute-shader shared-variable atomic op (see comment block above
 * for the NIR source operand layout).  Returns the instruction producing
 * the pre-op value read from shared memory.
 */
static struct ir3_instruction *
emit_intrinsic_atomic_shared(struct ir3_context *ctx, nir_intrinsic_instr *intr)
{
	struct ir3_block *b = ctx->block;
	struct ir3_instruction *atomic, *src0, *src1;
	/* default to unsigned; only the signed min/max cases override this: */
	type_t type = TYPE_U32;

	src0 = get_src(ctx, &intr->src[0])[0]; /* offset into shared storage */
	src1 = get_src(ctx, &intr->src[1])[0]; /* data operand */

	switch (intr->intrinsic) {
	case nir_intrinsic_shared_atomic_add:
		atomic = ir3_ATOMIC_ADD(b, src0, 0, src1, 0);
		break;
	case nir_intrinsic_shared_atomic_imin:
		atomic = ir3_ATOMIC_MIN(b, src0, 0, src1, 0);
		/* signed compare for imin: */
		type = TYPE_S32;
		break;
	case nir_intrinsic_shared_atomic_umin:
		atomic = ir3_ATOMIC_MIN(b, src0, 0, src1, 0);
		break;
	case nir_intrinsic_shared_atomic_imax:
		atomic = ir3_ATOMIC_MAX(b, src0, 0, src1, 0);
		/* signed compare for imax: */
		type = TYPE_S32;
		break;
	case nir_intrinsic_shared_atomic_umax:
		atomic = ir3_ATOMIC_MAX(b, src0, 0, src1, 0);
		break;
	case nir_intrinsic_shared_atomic_and:
		atomic = ir3_ATOMIC_AND(b, src0, 0, src1, 0);
		break;
	case nir_intrinsic_shared_atomic_or:
		atomic = ir3_ATOMIC_OR(b, src0, 0, src1, 0);
		break;
	case nir_intrinsic_shared_atomic_xor:
		atomic = ir3_ATOMIC_XOR(b, src0, 0, src1, 0);
		break;
	case nir_intrinsic_shared_atomic_exchange:
		atomic = ir3_ATOMIC_XCHG(b, src0, 0, src1, 0);
		break;
	case nir_intrinsic_shared_atomic_comp_swap:
		/* for cmpxchg, src1 is [ui]vec2(data, compare): */
		/* NOTE(review): the collect places intr->src[2] first and the
		 * original src1 second; confirm against the hw cmpxchg operand
		 * order that this matches the (data, compare) layout described
		 * above.
		 */
		src1 = create_collect(b, (struct ir3_instruction*[]){
			get_src(ctx, &intr->src[2])[0],
			src1,
		}, 2);
		atomic = ir3_ATOMIC_CMPXCHG(b, src0, 0, src1, 0);
		break;
	default:
		unreachable("boo");
	}

	/* cat6 encoding bits; presumably iim_val/d describe the immediate
	 * operand count/dimensionality for the local (non-global) atomic
	 * form — TODO confirm against the ISA docs:
	 */
	atomic->cat6.iim_val = 1;
	atomic->cat6.d = 1;
	atomic->cat6.type = type;
	mark_write(ctx, atomic);

	/* even if nothing consume the result, we can't DCE the instruction: */
	array_insert(b, b->keeps, atomic);

	return atomic;
}
static void
emit_intrinsic_barrier(struct ir3_context *ctx, nir_intrinsic_instr *intr)
{
@ -1586,6 +1729,24 @@ emit_intrinsic(struct ir3_context *ctx, nir_intrinsic_instr *intr)
case nir_intrinsic_ssbo_atomic_comp_swap:
dst[0] = emit_intrinsic_atomic_ssbo(ctx, intr);
break;
case nir_intrinsic_load_shared:
emit_intrinsic_load_shared(ctx, intr, dst);
break;
case nir_intrinsic_store_shared:
emit_intrinsic_store_shared(ctx, intr);
break;
case nir_intrinsic_shared_atomic_add:
case nir_intrinsic_shared_atomic_imin:
case nir_intrinsic_shared_atomic_umin:
case nir_intrinsic_shared_atomic_imax:
case nir_intrinsic_shared_atomic_umax:
case nir_intrinsic_shared_atomic_and:
case nir_intrinsic_shared_atomic_or:
case nir_intrinsic_shared_atomic_xor:
case nir_intrinsic_shared_atomic_exchange:
case nir_intrinsic_shared_atomic_comp_swap:
dst[0] = emit_intrinsic_atomic_shared(ctx, intr);
break;
case nir_intrinsic_barrier:
case nir_intrinsic_memory_barrier:
case nir_intrinsic_group_memory_barrier:

View file

@ -194,11 +194,20 @@ static bool valid_flags(struct ir3_instruction *instr, unsigned n,
if (is_store(instr) && (n == 1))
return false;
if ((instr->opc == OPC_LDL) && (n != 1))
return false;
if ((instr->opc == OPC_STL) && (n != 2))
return false;
/* disallow CP into anything but the SSBO slot argument for
* atomics:
*/
if (is_atomic(instr->opc) && (n != 0))
return false;
if (is_atomic(instr->opc) && !(instr->flags & IR3_INSTR_G))
return false;
}
break;

View file

@ -191,13 +191,18 @@ legalize_block(struct ir3_legalize_ctx *ctx, struct ir3_block *block)
/* seems like ldlv needs (ss) bit instead?? which is odd but
* makes a bunch of flat-varying tests start working on a4xx.
*/
if (n->opc == OPC_LDLV)
if ((n->opc == OPC_LDLV) || (n->opc == OPC_LDL))
regmask_set(&needs_ss, n->regs[0]);
else
regmask_set(&needs_sy, n->regs[0]);
} else if (is_atomic(n->opc)) {
if (n->flags & IR3_INSTR_G)
regmask_set(&needs_sy, n->regs[0]);
else
regmask_set(&needs_ss, n->regs[0]);
}
if ((n->opc == OPC_LDGB) || (n->opc == OPC_STGB) || is_atomic(n->opc))
if (is_ssbo(n->opc) || (is_atomic(n->opc) && (n->flags & IR3_INSTR_G)))
ctx->has_ssbo = true;
/* both tex/sfu appear to not always immediately consume