diff --git a/src/broadcom/compiler/nir_to_vir.c b/src/broadcom/compiler/nir_to_vir.c
index 498d95e4be6..2bf591fbac5 100644
--- a/src/broadcom/compiler/nir_to_vir.c
+++ b/src/broadcom/compiler/nir_to_vir.c
@@ -202,6 +202,248 @@ v3d_general_tmu_op(nir_intrinsic_instr *instr)
         }
 }
 
+/**
+ * Checks if pipelining a new TMU operation requiring 'components' LDTMUs and
+ * 'writes' TMU register writes would overflow any of the TMU fifos.
+ */
+static bool
+tmu_fifo_overflow(struct v3d_compile *c, uint32_t components, uint32_t writes)
+{
+        if (c->tmu.input_fifo_size + writes > 16 / c->threads)
+                return true;
+
+        /* Output and Config fifos are only involved with TMU lookups */
+        if (components > 0 &&
+            (c->tmu.config_fifo_size + 1 > 8 / c->threads ||
+             c->tmu.output_fifo_size + components > 16 / c->threads)) {
+                return true;
+        }
+
+        return false;
+}
+
+/**
+ * Emits the thread switch and LDTMU/TMUWT for all outstanding TMU operations,
+ * popping all TMU fifo entries.
+ */
+void
+ntq_flush_tmu(struct v3d_compile *c)
+{
+        if (c->tmu.flush_count == 0)
+                return;
+
+        vir_emit_thrsw(c);
+
+        bool emitted_tmuwt = false;
+        for (int i = 0; i < c->tmu.flush_count; i++) {
+                if (c->tmu.flush[i].num_components > 0) {
+                        nir_dest *dest = c->tmu.flush[i].dest;
+                        assert(dest);
+
+                        for (int j = 0; j < c->tmu.flush[i].num_components; j++) {
+                                ntq_store_dest(c, dest, j,
+                                               vir_MOV(c, vir_LDTMU(c)));
+                        }
+                } else if (!emitted_tmuwt) {
+                        vir_TMUWT(c);
+                        emitted_tmuwt = true;
+                }
+        }
+
+        c->tmu.input_fifo_size = 0;
+        c->tmu.config_fifo_size = 0;
+        c->tmu.output_fifo_size = 0;
+        c->tmu.flush_count = 0;
+        _mesa_set_clear(c->tmu.outstanding_regs, NULL);
+}
+
+/**
+ * Queues a pending thread switch + LDTMU/TMUWT for a TMU operation. The caller
+ * is responsible for ensuring that doing this doesn't overflow the TMU fifos,
+ * and more specifically, the output fifo, since that can't stall.
+ */
+static void
+ntq_add_pending_tmu_flush(struct v3d_compile *c,
+                          nir_dest *dest,
+                          uint32_t num_components,
+                          uint32_t tmu_writes)
+{
+        assert(!tmu_fifo_overflow(c, num_components, tmu_writes));
+
+        c->tmu.input_fifo_size += tmu_writes;
+        if (num_components > 0) {
+                c->tmu.config_fifo_size += 1;
+                c->tmu.output_fifo_size += num_components;
+                if (!dest->is_ssa)
+                        _mesa_set_add(c->tmu.outstanding_regs, dest->reg.reg);
+        }
+
+        c->tmu.flush[c->tmu.flush_count].dest = dest;
+        c->tmu.flush[c->tmu.flush_count].num_components = num_components;
+        c->tmu.flush_count++;
+}
+
+enum emit_mode {
+    MODE_COUNT = 0,
+    MODE_EMIT,
+    MODE_LAST,
+};
+
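
The budgets checked by tmu_fifo_overflow() are per-thread slices of shared hardware: the input and output fifos hold 16 entries and the config fifo 8, divided among however many threads the program runs. A stand-alone sketch of that arithmetic, with hypothetical names and no compiler state (not part of the patch):

    #include <stdbool.h>
    #include <stdint.h>

    /* Toy model of the fifo budgets used by tmu_fifo_overflow() above:
     * 16 input/output entries and 8 config entries, shared by all threads.
     */
    struct fifo_state {
            uint32_t threads;       /* 1, 2 or 4 */
            uint32_t input_size;    /* entries already in flight */
            uint32_t config_size;
            uint32_t output_size;
    };

    static bool
    fifo_would_overflow(const struct fifo_state *s,
                        uint32_t components, uint32_t writes)
    {
            if (s->input_size + writes > 16 / s->threads)
                    return true;

            /* Only lookups (components > 0) touch the config/output fifos. */
            if (components > 0 &&
                (s->config_size + 1 > 8 / s->threads ||
                 s->output_size + components > 16 / s->threads))
                    return true;

            return false;
    }

At four threads only four input/output entries and two config entries are available, which is why the counting pass in ntq_emit_tmu_general() below may also have to lower c->threads before deciding whether to flush.
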
+/**
+ * For a TMU general store instruction:
+ *
+ * In MODE_COUNT mode, records the number of TMU writes required and flushes
+ * any outstanding TMU operations the instruction depends on, but it doesn't
+ * emit any actual register writes.
+ *
+ * In MODE_EMIT mode, emits the data register writes required by the
+ * instruction.
+ */
+static void
+emit_tmu_general_store_writes(struct v3d_compile *c,
+                              enum emit_mode mode,
+                              nir_intrinsic_instr *instr,
+                              uint32_t base_const_offset,
+                              uint32_t *writemask,
+                              uint32_t *const_offset,
+                              uint32_t *tmu_writes)
+{
+        struct qreg tmud = vir_reg(QFILE_MAGIC, V3D_QPU_WADDR_TMUD);
+
+        /* Find the first set of consecutive components that
+         * are enabled in the writemask and emit the TMUD
+         * instructions for them.
+         */
+        uint32_t first_component = ffs(*writemask) - 1;
+        uint32_t last_component = first_component;
+        while (*writemask & BITFIELD_BIT(last_component + 1))
+                last_component++;
+
+        assert(first_component >= 0 &&
+               first_component <= last_component &&
+               last_component < instr->num_components);
+
+        for (int i = first_component; i <= last_component; i++) {
+                struct qreg data = ntq_get_src(c, instr->src[0], i);
+                if (mode == MODE_COUNT)
+                        (*tmu_writes)++;
+                else
+                        vir_MOV_dest(c, tmud, data);
+        }
+
+        if (mode == MODE_EMIT) {
+                /* Update the offset for the TMU write based on the
+                 * first component we are writing.
+                 */
+                *const_offset = base_const_offset + first_component * 4;
+
+                /* Clear these components from the writemask */
+                uint32_t written_mask =
+                        BITFIELD_RANGE(first_component, *tmu_writes);
+                (*writemask) &= ~written_mask;
+        }
+}
+
+/**
+ * For a TMU general atomic instruction:
+ *
+ * In MODE_COUNT mode, records the number of TMU writes required and flushes
+ * any outstanding TMU operations the instruction depends on, but it doesn't
+ * emit any actual register writes.
+ *
+ * In MODE_EMIT mode, emits the data register writes required by the
+ * instruction.
+ */
+static void
+emit_tmu_general_atomic_writes(struct v3d_compile *c,
+                               enum emit_mode mode,
+                               nir_intrinsic_instr *instr,
+                               uint32_t tmu_op,
+                               bool has_index,
+                               uint32_t *tmu_writes)
+{
+        struct qreg tmud = vir_reg(QFILE_MAGIC, V3D_QPU_WADDR_TMUD);
+
+        struct qreg data = ntq_get_src(c, instr->src[1 + has_index], 0);
+        if (mode == MODE_COUNT)
+                (*tmu_writes)++;
+        else
+                vir_MOV_dest(c, tmud, data);
+
+        if (tmu_op == V3D_TMU_OP_WRITE_CMPXCHG_READ_FLUSH) {
+                data = ntq_get_src(c, instr->src[2 + has_index], 0);
+                if (mode == MODE_COUNT)
+                        (*tmu_writes)++;
+                else
+                        vir_MOV_dest(c, tmud, data);
+        }
+}
+
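
emit_tmu_general_store_writes() peels one maximal run of consecutive enabled components off the writemask per iteration of its caller's do/while loop. A host-side demonstration of the same peeling, in plain C with a hypothetical mask (ffs() comes from <strings.h> on POSIX systems):

    #include <stdio.h>
    #include <strings.h>   /* ffs() */

    #define BIT(b)      (1u << (b))
    #define RANGE(b, n) (((1u << (n)) - 1u) << (b))

    int main(void)
    {
            unsigned writemask = 0xb;   /* components 0, 1 and 3 */

            while (writemask) {
                    unsigned first = ffs(writemask) - 1;
                    unsigned last = first;
                    while (writemask & BIT(last + 1))
                            last++;

                    /* One TMU store sequence per consecutive run. */
                    printf("store components %u..%u\n", first, last);
                    writemask &= ~RANGE(first, last - first + 1);
            }
            return 0;
    }

For the mask 0xb this prints the runs 0..1 and 3..3, matching the two TMU store sequences the real code would emit.
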
+/**
+ * For any TMU general instruction:
+ *
+ * In MODE_COUNT mode, records the number of TMU writes required to emit the
+ * address parameter and flushes any outstanding TMU operations the instruction
+ * depends on, but it doesn't emit any actual register writes.
+ *
+ * In MODE_EMIT mode, emits the register writes required to set up the
+ * address.
+ */
+static void
+emit_tmu_general_address_write(struct v3d_compile *c,
+                               enum emit_mode mode,
+                               nir_intrinsic_instr *instr,
+                               uint32_t config,
+                               bool dynamic_src,
+                               int offset_src,
+                               struct qreg base_offset,
+                               uint32_t const_offset,
+                               uint32_t *tmu_writes)
+{
+        if (mode == MODE_COUNT) {
+                (*tmu_writes)++;
+                if (dynamic_src)
+                        ntq_get_src(c, instr->src[offset_src], 0);
+                return;
+        }
+
+        if (vir_in_nonuniform_control_flow(c)) {
+                vir_set_pf(vir_MOV_dest(c, vir_nop_reg(), c->execute),
+                           V3D_QPU_PF_PUSHZ);
+        }
+
+        struct qreg tmua;
+        if (config == ~0)
+                tmua = vir_reg(QFILE_MAGIC, V3D_QPU_WADDR_TMUA);
+        else
+                tmua = vir_reg(QFILE_MAGIC, V3D_QPU_WADDR_TMUAU);
+
+        struct qinst *tmu;
+        if (dynamic_src) {
+                struct qreg offset = base_offset;
+                if (const_offset != 0) {
+                        offset = vir_ADD(c, offset,
+                                         vir_uniform_ui(c, const_offset));
+                }
+                struct qreg data = ntq_get_src(c, instr->src[offset_src], 0);
+                tmu = vir_ADD_dest(c, tmua, offset, data);
+        } else {
+                if (const_offset != 0) {
+                        tmu = vir_ADD_dest(c, tmua, base_offset,
+                                           vir_uniform_ui(c, const_offset));
+                } else {
+                        tmu = vir_MOV_dest(c, tmua, base_offset);
+                }
+        }
+
+        if (config != ~0) {
+                tmu->uniform =
+                        vir_get_uniform_index(c, QUNIFORM_CONSTANT, config);
+        }
+
+        if (vir_in_nonuniform_control_flow(c))
+                vir_set_cond(tmu, V3D_QPU_COND_IFA);
+}
+
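
The three emit_tmu_general_*_writes() helpers above share one contract: with MODE_COUNT they only size the sequence (their ntq_get_src() calls also flush any TMU operations the sources depend on), and with MODE_EMIT they emit the same writes for real. A minimal stand-alone restatement of that count-then-emit idiom, with hypothetical names (the real driver loop is in ntq_emit_tmu_general() below):

    #include <stdint.h>
    #include <stdio.h>

    enum pass { PASS_COUNT, PASS_EMIT };

    /* In the counting pass only *writes is updated; nothing is emitted. */
    static void
    emit_data_writes(enum pass p, uint32_t n, uint32_t *writes)
    {
            for (uint32_t i = 0; i < n; i++) {
                    if (p == PASS_COUNT)
                            (*writes)++;
                    else
                            printf("mov tmud, data[%u]\n", i);
            }
    }

    int main(void)
    {
            uint32_t writes = 0;

            emit_data_writes(PASS_COUNT, 3, &writes);
            printf("need %u writes: check fifo budgets, flush if needed\n",
                   writes);
            emit_data_writes(PASS_EMIT, 3, &writes);
            return 0;
    }
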
 /**
  * Implements indirect uniform loads and SSBO accesses through the TMU general
  * memory access interface.
@@ -293,140 +535,98 @@ ntq_emit_tmu_general(struct v3d_compile *c, nir_intrinsic_instr *instr,
                                             1 : 0]));
         }
 
-        struct qreg tmud = vir_reg(QFILE_MAGIC, V3D_QPU_WADDR_TMUD);
-        unsigned writemask = is_store ? nir_intrinsic_write_mask(instr) : 0;
+        /* We are ready to emit TMU register writes now, but before we actually
+         * emit them we need to know the number of writes the operation will
+         * require, and we need to flush any outstanding TMU operations whose
+         * results those writes read, before emitting anything for the current
+         * operation, so we don't corrupt its TMU sequence. To do this we run
+         * this logic twice: the first pass counts register writes and flushes
+         * pending TMU requests when it finds a dependency, and the second
+         * pass emits the actual TMU writes.
+         */
+        const uint32_t dest_components = nir_intrinsic_dest_components(instr);
         uint32_t base_const_offset = const_offset;
-        int first_component = -1;
-        int last_component = -1;
+        uint32_t writemask = is_store ? nir_intrinsic_write_mask(instr) : 0;
         do {
-                int tmu_writes = 1; /* address */
+                uint32_t tmu_writes = 0;
+                for (enum emit_mode mode = MODE_COUNT; mode != MODE_LAST; mode++) {
+                        assert(mode == MODE_COUNT || tmu_writes > 0);
 
-                if (is_store) {
-                        /* Find the first set of consecutive components that
-                         * are enabled in the writemask and emit the TMUD
-                         * instructions for them.
+                        if (is_store) {
+                                emit_tmu_general_store_writes(c, mode, instr,
+                                                              base_const_offset,
+                                                              &writemask,
+                                                              &const_offset,
+                                                              &tmu_writes);
+                        } else if (!is_load && !atomic_add_replaced) {
+                                emit_tmu_general_atomic_writes(c, mode, instr,
+                                                               tmu_op,
+                                                               has_index,
+                                                               &tmu_writes);
+                        }
+
+                        /* The spec says that for atomics, the TYPE field is
+                         * ignored, but that doesn't seem to be the case for
+                         * CMPXCHG. Just use the number of tmud writes we did
+                         * to decide the type (or choose "32bit" for atomic
+                         * reads, which has been fine).
                          */
-                        first_component = ffs(writemask) - 1;
-                        last_component = first_component;
-                        while (writemask & BITFIELD_BIT(last_component + 1))
-                                last_component++;
+                        uint32_t config = 0;
+                        if (mode == MODE_EMIT) {
+                                uint32_t num_components;
+                                if (is_load || atomic_add_replaced)
+                                        num_components = instr->num_components;
+                                else {
+                                        assert(tmu_writes > 0);
+                                        num_components = tmu_writes - 1;
+                                }
 
-                        assert(first_component >= 0 &&
-                               first_component <= last_component &&
-                               last_component < instr->num_components);
+                                uint32_t perquad =
+                                        is_load && !vir_in_nonuniform_control_flow(c)
+                                        ? GENERAL_TMU_LOOKUP_PER_QUAD
+                                        : GENERAL_TMU_LOOKUP_PER_PIXEL;
+                                config = 0xffffff00 | tmu_op << 3 | perquad;
 
-                        struct qreg tmud = vir_reg(QFILE_MAGIC,
-                                                   V3D_QPU_WADDR_TMUD);
-                        for (int i = first_component; i <= last_component; i++) {
-                                struct qreg data =
-                                        ntq_get_src(c, instr->src[0], i);
-                                vir_MOV_dest(c, tmud, data);
-                                tmu_writes++;
+                                if (num_components == 1) {
+                                        config |= GENERAL_TMU_LOOKUP_TYPE_32BIT_UI;
+                                } else {
+                                        config |= GENERAL_TMU_LOOKUP_TYPE_VEC2 +
+                                                  num_components - 2;
+                                }
                         }
 
-                        /* Update the offset for the TMU write based on the
-                         * the first component we are writing.
-                         */
-                        const_offset = base_const_offset + first_component * 4;
+                        emit_tmu_general_address_write(c, mode, instr, config,
+                                                       dynamic_src,
+                                                       offset_src,
+                                                       base_offset,
+                                                       const_offset,
+                                                       &tmu_writes);
 
-                        /* Clear these components from the writemask */
-                        uint32_t written_mask =
-                                BITFIELD_RANGE(first_component, tmu_writes - 1);
-                        writemask &= ~written_mask;
-                } else if (!is_load && !atomic_add_replaced) {
-                        struct qreg data =
-                                ntq_get_src(c, instr->src[1 + has_index], 0);
-                        vir_MOV_dest(c, tmud, data);
-                        tmu_writes++;
-                        if (tmu_op == V3D_TMU_OP_WRITE_CMPXCHG_READ_FLUSH) {
-                                data = ntq_get_src(c, instr->src[2 + has_index],
-                                                   0);
-                                vir_MOV_dest(c, tmud, data);
-                                tmu_writes++;
-                        }
-                }
+                        assert(tmu_writes > 0);
+                        if (mode == MODE_COUNT) {
+                                /* Make sure we won't exceed the 16-entry TMU
+                                 * fifo if each thread is storing at the same
+                                 * time.
+                                 */
+                                while (tmu_writes > 16 / c->threads)
+                                        c->threads /= 2;
 
-                /* Make sure we won't exceed the 16-entry TMU fifo if each
-                 * thread is storing at the same time.
-                 */
-                while (tmu_writes > 16 / c->threads)
-                        c->threads /= 2;
-
-                /* The spec says that for atomics, the TYPE field is ignored,
-                 * but that doesn't seem to be the case for CMPXCHG. Just use
-                 * the number of tmud writes we did to decide the type (or
-                 * choose "32bit" for atomic reads, which has been fine).
-                 */
-                uint32_t num_components;
-                if (is_load || atomic_add_replaced) {
-                        num_components = instr->num_components;
-                } else {
-                        assert(tmu_writes > 1);
-                        num_components = tmu_writes - 1;
-                }
-
-                uint32_t perquad = is_load && !vir_in_nonuniform_control_flow(c)
-                                   ? GENERAL_TMU_LOOKUP_PER_QUAD
-                                   : GENERAL_TMU_LOOKUP_PER_PIXEL;
-                uint32_t config = (0xffffff00 |
-                                   tmu_op << 3|
-                                   perquad);
-                if (num_components == 1) {
-                        config |= GENERAL_TMU_LOOKUP_TYPE_32BIT_UI;
-                } else {
-                        config |= GENERAL_TMU_LOOKUP_TYPE_VEC2 +
-                                  num_components - 2;
-                }
-
-                if (vir_in_nonuniform_control_flow(c)) {
-                        vir_set_pf(vir_MOV_dest(c, vir_nop_reg(), c->execute),
-                                   V3D_QPU_PF_PUSHZ);
-                }
-
-                struct qreg tmua;
-                if (config == ~0)
-                        tmua = vir_reg(QFILE_MAGIC, V3D_QPU_WADDR_TMUA);
-                else
-                        tmua = vir_reg(QFILE_MAGIC, V3D_QPU_WADDR_TMUAU);
-
-                struct qinst *tmu;
-                if (dynamic_src) {
-                        struct qreg offset = base_offset;
-                        if (const_offset != 0) {
-                                offset = vir_ADD(c, offset,
-                                                 vir_uniform_ui(c, const_offset));
-                        }
-                        struct qreg data =
-                                ntq_get_src(c, instr->src[offset_src], 0);
-                        tmu = vir_ADD_dest(c, tmua, offset, data);
-                } else {
-                        if (const_offset != 0) {
-                                tmu = vir_ADD_dest(c, tmua, base_offset,
-                                                   vir_uniform_ui(c, const_offset));
+                                /* If pipelining this TMU operation would
+                                 * overflow TMU fifos, we need to flush.
+                                 */
+                                if (tmu_fifo_overflow(c, dest_components, tmu_writes))
+                                        ntq_flush_tmu(c);
                         } else {
-                                tmu = vir_MOV_dest(c, tmua, base_offset);
+                                /* Delay emission of the thread switch and
+                                 * LDTMU/TMUWT until we really need to do it to
+                                 * improve pipelining.
+                                 */
+                                ntq_add_pending_tmu_flush(c, &instr->dest,
+                                                          dest_components,
+                                                          tmu_writes);
                         }
                 }
-
-                if (config != ~0) {
-                        tmu->uniform =
-                                vir_get_uniform_index(c, QUNIFORM_CONSTANT,
-                                                      config);
-                }
-
-                if (vir_in_nonuniform_control_flow(c))
-                        vir_set_cond(tmu, V3D_QPU_COND_IFA);
-
-                vir_emit_thrsw(c);
-
-                /* Read the result, or wait for the TMU op to complete. */
-                for (int i = 0; i < nir_intrinsic_dest_components(instr); i++) {
-                        ntq_store_dest(c, &instr->dest, i,
-                                       vir_MOV(c, vir_LDTMU(c)));
-                }
-
-                if (nir_intrinsic_dest_components(instr) == 0)
-                        vir_TMUWT(c);
         } while (is_store && writemask != 0);
 }
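
In the MODE_EMIT pass above, the config word packs the opcode and per-quad flag into its low byte while keeping the upper bits set, and the lookup type is derived from the component count: one component selects the 32-bit scalar type, and vec2..vec4 map onto consecutive values starting at the vec2 type. A sketch of just that mapping; the numeric values here are placeholders, not the hardware encoding:

    #include <stdint.h>

    /* Placeholder values: only the 1-component special case and the
     * "consecutive from vec2" property are taken from the code above.
     */
    enum lookup_type {
            TYPE_32BIT_UI = 7,
            TYPE_VEC2,
            TYPE_VEC3,
            TYPE_VEC4,
    };

    static uint32_t
    lookup_type_for(uint32_t num_components)
    {
            if (num_components == 1)
                    return TYPE_32BIT_UI;
            return TYPE_VEC2 + num_components - 2;  /* vec2 -> +0, vec4 -> +2 */
    }

A config that ends up as ~0 means "all defaults": emit_tmu_general_address_write() then targets plain TMUA instead of TMUAU and skips the per-lookup config uniform.
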
@@ -532,20 +732,42 @@ ntq_store_dest(struct v3d_compile *c, nir_dest *dest, int chan,
         }
 }
 
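
ntq_get_src() below is where the deferred flush pays off: a pending TMU result is only materialized, with one thread switch plus a batch of LDTMUs, when a later instruction actually reads it. A toy model of that lazy bookkeeping, with hypothetical types and print statements standing in for VIR emission:

    #include <stdio.h>

    #define MAX_DEFS   16
    #define NOT_READY (-1)

    struct toy {
            int results[MAX_DEFS];   /* stand-in for the def_ht table */
            int pending[8];          /* defs queued behind the next flush */
            int pending_count;
    };

    static void
    toy_flush(struct toy *t)
    {
            if (t->pending_count == 0)
                    return;          /* cheap no-op, like ntq_flush_tmu() */
            puts("thrsw");
            for (int i = 0; i < t->pending_count; i++) {
                    printf("ldtmu -> def%d\n", t->pending[i]);
                    t->results[t->pending[i]] = 0;
            }
            t->pending_count = 0;
    }

    static int
    toy_get_src(struct toy *t, int def)
    {
            if (t->results[def] == NOT_READY)
                    toy_flush(t);    /* first use drains the whole queue */
            return t->results[def];
    }

    int main(void)
    {
            struct toy t = { .pending_count = 0 };
            for (int i = 0; i < MAX_DEFS; i++)
                    t.results[i] = NOT_READY;

            t.pending[t.pending_count++] = 0;  /* two queued "TMU loads" */
            t.pending[t.pending_count++] = 1;

            toy_get_src(&t, 1);   /* one thrsw, two ldtmu, not two thrsw */
            return 0;
    }

Queueing two loads and then reading either one drains both with a single thrsw, which is exactly the reduction in thread switches this patch is after.
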
+/**
+ * Looks up the qreg associated with a particular ssa/reg used as a source
+ * in any instruction.
+ *
+ * The definition for any NIR value read as a source is expected to have been
+ * emitted by a previous instruction. However, to improve pipelining, TMU
+ * operations postpone the thread switch and LDTMUs required to read their
+ * results until those results are actually used, so a def may be missing
+ * from the table here (for SSA defs) or may sit in the set of registers
+ * awaiting a TMU flush (for registers). In either case we need to flush the
+ * outstanding TMU operations to read the correct value.
+ */
 struct qreg
 ntq_get_src(struct v3d_compile *c, nir_src src, int i)
 {
         struct hash_entry *entry;
         if (src.is_ssa) {
-                entry = _mesa_hash_table_search(c->def_ht, src.ssa);
                 assert(i < src.ssa->num_components);
+
+                entry = _mesa_hash_table_search(c->def_ht, src.ssa);
+                if (!entry) {
+                        ntq_flush_tmu(c);
+                        entry = _mesa_hash_table_search(c->def_ht, src.ssa);
+                }
         } else {
                 nir_register *reg = src.reg.reg;
-                entry = _mesa_hash_table_search(c->def_ht, reg);
                 assert(reg->num_array_elems == 0);
                 assert(src.reg.base_offset == 0);
                 assert(i < reg->num_components);
+
+                if (_mesa_set_search(c->tmu.outstanding_regs, reg))
+                        ntq_flush_tmu(c);
+                entry = _mesa_hash_table_search(c->def_ht, reg);
         }
+        assert(entry);
 
         struct qreg *qregs = entry->data;
         return qregs[i];
@@ -2520,6 +2742,8 @@ ntq_emit_intrinsic(struct v3d_compile *c, nir_intrinsic_instr *instr)
                 break;
 
         case nir_intrinsic_discard:
+                ntq_flush_tmu(c);
+
                 if (vir_in_nonuniform_control_flow(c)) {
                         vir_set_pf(vir_MOV_dest(c, vir_nop_reg(), c->execute),
                                    V3D_QPU_PF_PUSHZ);
@@ -2533,6 +2757,8 @@ ntq_emit_intrinsic(struct v3d_compile *c, nir_intrinsic_instr *instr)
                 break;
 
         case nir_intrinsic_discard_if: {
+                ntq_flush_tmu(c);
+
                 enum v3d_qpu_cond cond = ntq_emit_bool_to_cond(c, instr->src[0]);
 
                 if (vir_in_nonuniform_control_flow(c)) {
@@ -2561,10 +2787,9 @@ ntq_emit_intrinsic(struct v3d_compile *c, nir_intrinsic_instr *instr)
                 /* We don't do any instruction scheduling of these NIR
                  * instructions between each other, so we just need to make
                  * sure that the TMU operations before the barrier are flushed
-                 * before the ones after the barrier. That is currently
-                 * handled by having a THRSW in each of them and a LDTMU
-                 * series or a TMUWT after.
+                 * before the ones after the barrier.
                  */
+                ntq_flush_tmu(c);
                 break;
 
         case nir_intrinsic_control_barrier:
@@ -2572,6 +2797,8 @@ ntq_emit_intrinsic(struct v3d_compile *c, nir_intrinsic_instr *instr)
                  * (actually supergroup) to block until the last invocation
                  * reaches the TSY op.
                  */
+                ntq_flush_tmu(c);
+
                 if (c->devinfo->ver >= 42) {
                         vir_BARRIERID_dest(c, vir_reg(QFILE_MAGIC,
                                                       V3D_QPU_WADDR_SYNCB));
@@ -3061,6 +3288,13 @@ ntq_emit_block(struct v3d_compile *c, nir_block *block)
         nir_foreach_instr(instr, block) {
                 ntq_emit_instr(c, instr);
         }
+
+        /* Always process pending TMU operations in the same block they were
+         * emitted: we can't emit TMU operations in a block and then emit a
+         * thread switch and LDTMU/TMUWT for them in another block, possibly
+         * under control flow.
+         */
+        ntq_flush_tmu(c);
 }
 
 static void ntq_emit_cf_list(struct v3d_compile *c, struct exec_list *list);
diff --git a/src/broadcom/compiler/v3d33_tex.c b/src/broadcom/compiler/v3d33_tex.c
index 1b24fa0db3c..386453289c3 100644
--- a/src/broadcom/compiler/v3d33_tex.c
+++ b/src/broadcom/compiler/v3d33_tex.c
@@ -33,6 +33,9 @@ void
 v3d33_vir_emit_tex(struct v3d_compile *c, nir_tex_instr *instr)
 {
+        /* FIXME: allow tex pipelining */
+        ntq_flush_tmu(c);
+
         unsigned unit = instr->texture_index;
 
         struct V3D33_TEXTURE_UNIFORM_PARAMETER_0_CFG_MODE1 p0_unpacked = {
diff --git a/src/broadcom/compiler/v3d40_tex.c b/src/broadcom/compiler/v3d40_tex.c
index e9737d6a826..f999c8b8619 100644
--- a/src/broadcom/compiler/v3d40_tex.c
+++ b/src/broadcom/compiler/v3d40_tex.c
@@ -61,6 +61,9 @@ static const struct V3D41_TMU_CONFIG_PARAMETER_2 p2_unpacked_default = {
 void
 v3d40_vir_emit_tex(struct v3d_compile *c, nir_tex_instr *instr)
 {
+        /* FIXME: allow tex pipelining */
+        ntq_flush_tmu(c);
+
         unsigned texture_idx = instr->texture_index;
         unsigned sampler_idx = instr->sampler_index;
 
@@ -343,6 +346,9 @@ void
 v3d40_vir_emit_image_load_store(struct v3d_compile *c,
                                 nir_intrinsic_instr *instr)
 {
+        /* FIXME: allow image load/store pipelining */
+        ntq_flush_tmu(c);
+
         unsigned format = nir_intrinsic_format(instr);
         unsigned unit = nir_src_as_uint(instr->src[0]);
         int tmu_writes = 0;
diff --git a/src/broadcom/compiler/v3d_compiler.h b/src/broadcom/compiler/v3d_compiler.h
index ec6087b1f50..d617168ddd5 100644
--- a/src/broadcom/compiler/v3d_compiler.h
+++ b/src/broadcom/compiler/v3d_compiler.h
@@ -566,6 +566,24 @@ struct v3d_compile {
         struct qinst **defs;
         uint32_t defs_array_size;
 
+        /* TMU pipelining tracking */
+        struct {
+                /* NIR registers that have been updated with a TMU operation
+                 * that has not been flushed yet.
+                 */
+                struct set *outstanding_regs;
+
+                uint32_t input_fifo_size;
+                uint32_t config_fifo_size;
+                uint32_t output_fifo_size;
+
+                struct {
+                        nir_dest *dest;
+                        uint32_t num_components;
+                } flush[8]; /* 16 input/output fifo entries / 2 threads (minimum) */
+                uint32_t flush_count;
+        } tmu;
+
         /**
          * Inputs to the shader, arranged by TGSI declaration order.
          *
@@ -918,6 +936,7 @@ uint8_t vir_channels_written(struct qinst *inst);
 struct qreg ntq_get_src(struct v3d_compile *c, nir_src src, int i);
 void ntq_store_dest(struct v3d_compile *c, nir_dest *dest, int chan,
                     struct qreg result);
+void ntq_flush_tmu(struct v3d_compile *c);
 void vir_emit_thrsw(struct v3d_compile *c);
 
 void vir_dump(struct v3d_compile *c);
diff --git a/src/broadcom/compiler/vir.c b/src/broadcom/compiler/vir.c
index a36be86a1f3..92f36a99aee 100644
--- a/src/broadcom/compiler/vir.c
+++ b/src/broadcom/compiler/vir.c
@@ -539,6 +539,8 @@ vir_compile_init(const struct v3d_compiler *compiler,
         c->def_ht = _mesa_hash_table_create(c, _mesa_hash_pointer,
                                             _mesa_key_pointer_equal);
 
+        c->tmu.outstanding_regs = _mesa_pointer_set_create(c);
+
         return c;
 }
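
One invariant behind the new fields is worth restating: each pending TMU operation consumes at least one input fifo entry, and the comment on flush[8] sizes the array for 16 fifo entries shared by the two-thread minimum. A compile-time restatement of that reasoning (C11; the macro names are hypothetical, the patch itself relies on the comment alone):

    #include <assert.h>

    #define TMU_FIFO_DEPTH 16   /* input/output fifo entries */
    #define MIN_THREADS     2   /* smallest thread count used with this path */

    /* Every pending TMU operation holds at least one input fifo entry, so
     * at most 16 / 2 = 8 operations can ever be awaiting a flush.
     */
    static_assert(TMU_FIFO_DEPTH / MIN_THREADS == 8,
                  "flush[8] covers the worst-case number of pending TMU ops");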