mirror of
https://gitlab.freedesktop.org/mesa/mesa.git
synced 2026-05-08 19:58:09 +02:00
broadcom/vc5: Use THRSW to enable multi-threaded shaders.
This is a major performance boost on all of V3D, but is required on V3D 4.x where shaders are always either 2- or 4-threaded.
This commit is contained in:
parent
86a12b4d5a
commit
90269ba353
9 changed files with 310 additions and 80 deletions
|
|
@ -700,13 +700,17 @@
|
||||||
<field name="Vertex Shader input VPM segment size" size="8" start="7b" type="uint"/>
|
<field name="Vertex Shader input VPM segment size" size="8" start="7b" type="uint"/>
|
||||||
<field name="Address of default attribute values" size="32" start="8b" type="address"/>
|
<field name="Address of default attribute values" size="32" start="8b" type="address"/>
|
||||||
<field name="Fragment Shader Code Address" size="29" start="99" type="address"/>
|
<field name="Fragment Shader Code Address" size="29" start="99" type="address"/>
|
||||||
<field name="2-way threadable" size="1" start="96" type="bool"/>
|
<field name="Fragment Shader 2-way threadable" size="1" start="96" type="bool"/>
|
||||||
<field name="4-way threadable" size="1" start="97" type="bool"/>
|
<field name="Fragment Shader 4-way threadable" size="1" start="97" type="bool"/>
|
||||||
<field name="Propagate NaNs" size="1" start="98" type="bool"/>
|
<field name="Propagate NaNs" size="1" start="98" type="bool"/>
|
||||||
<field name="Fragment Shader Uniforms Address" size="32" start="16b" type="address"/>
|
<field name="Fragment Shader Uniforms Address" size="32" start="16b" type="address"/>
|
||||||
<field name="Vertex Shader Code Address" size="32" start="20b" type="address"/>
|
<field name="Vertex Shader Code Address" size="32" start="20b" type="address"/>
|
||||||
|
<field name="Vertex Shader 2-way threadable" size="1" start="160" type="bool"/>
|
||||||
|
<field name="Vertex Shader 4-way threadable" size="1" start="161" type="bool"/>
|
||||||
<field name="Vertex Shader Uniforms Address" size="32" start="24b" type="address"/>
|
<field name="Vertex Shader Uniforms Address" size="32" start="24b" type="address"/>
|
||||||
<field name="Coordinate Shader Code Address" size="32" start="28b" type="address"/>
|
<field name="Coordinate Shader Code Address" size="32" start="28b" type="address"/>
|
||||||
|
<field name="Coordinate Shader 2-way threadable" size="1" start="224" type="bool"/>
|
||||||
|
<field name="Coordinate Shader 4-way threadable" size="1" start="225" type="bool"/>
|
||||||
<field name="Coordinate Shader Uniforms Address" size="32" start="32b" type="address"/>
|
<field name="Coordinate Shader Uniforms Address" size="32" start="32b" type="address"/>
|
||||||
</struct>
|
</struct>
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -65,6 +65,23 @@ resize_qreg_array(struct v3d_compile *c,
|
||||||
(*regs)[i] = c->undef;
|
(*regs)[i] = c->undef;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
static void
|
||||||
|
vir_emit_thrsw(struct v3d_compile *c)
|
||||||
|
{
|
||||||
|
if (c->threads == 1)
|
||||||
|
return;
|
||||||
|
|
||||||
|
/* Always thread switch after each texture operation for now.
|
||||||
|
*
|
||||||
|
* We could do better by batching a bunch of texture fetches up and
|
||||||
|
* then doing one thread switch and collecting all their results
|
||||||
|
* afterward.
|
||||||
|
*/
|
||||||
|
c->last_thrsw = vir_NOP(c);
|
||||||
|
c->last_thrsw->qpu.sig.thrsw = true;
|
||||||
|
c->last_thrsw_at_top_level = (c->execute.file == QFILE_NULL);
|
||||||
|
}
|
||||||
|
|
||||||
static struct qreg
|
static struct qreg
|
||||||
vir_SFU(struct v3d_compile *c, int waddr, struct qreg src)
|
vir_SFU(struct v3d_compile *c, int waddr, struct qreg src)
|
||||||
{
|
{
|
||||||
|
|
@ -118,6 +135,7 @@ indirect_uniform_load(struct v3d_compile *c, nir_intrinsic_instr *intr)
|
||||||
vir_uniform(c, QUNIFORM_UBO_ADDR, 0),
|
vir_uniform(c, QUNIFORM_UBO_ADDR, 0),
|
||||||
indirect_offset);
|
indirect_offset);
|
||||||
|
|
||||||
|
vir_emit_thrsw(c);
|
||||||
return vir_LDTMU(c);
|
return vir_LDTMU(c);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
@ -488,6 +506,8 @@ ntq_emit_tex(struct v3d_compile *c, nir_tex_instr *instr)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
vir_emit_thrsw(c);
|
||||||
|
|
||||||
struct qreg return_values[4];
|
struct qreg return_values[4];
|
||||||
for (int i = 0; i < 4; i++) {
|
for (int i = 0; i < 4; i++) {
|
||||||
/* Swizzling .zw of an RG texture should give undefined
|
/* Swizzling .zw of an RG texture should give undefined
|
||||||
|
|
@ -1685,6 +1705,8 @@ ntq_emit_intrinsic(struct v3d_compile *c, nir_intrinsic_instr *instr)
|
||||||
ntq_get_src(c, instr->src[1], 0),
|
ntq_get_src(c, instr->src[1], 0),
|
||||||
vir_uniform_ui(c, i * 4)));
|
vir_uniform_ui(c, i * 4)));
|
||||||
|
|
||||||
|
vir_emit_thrsw(c);
|
||||||
|
|
||||||
ntq_store_dest(c, &instr->dest, i, vir_LDTMU(c));
|
ntq_store_dest(c, &instr->dest, i, vir_LDTMU(c));
|
||||||
}
|
}
|
||||||
break;
|
break;
|
||||||
|
|
@ -2124,6 +2146,62 @@ count_nir_instrs(nir_shader *nir)
|
||||||
}
|
}
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
|
/**
|
||||||
|
* When demoting a shader down to single-threaded, removes the THRSW
|
||||||
|
* instructions (one will still be inserted at v3d_vir_to_qpu() for the
|
||||||
|
* program end).
|
||||||
|
*/
|
||||||
|
static void
|
||||||
|
vir_remove_thrsw(struct v3d_compile *c)
|
||||||
|
{
|
||||||
|
vir_for_each_block(block, c) {
|
||||||
|
vir_for_each_inst_safe(inst, block) {
|
||||||
|
if (inst->qpu.sig.thrsw)
|
||||||
|
vir_remove_instruction(c, inst);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
c->last_thrsw = NULL;
|
||||||
|
}
|
||||||
|
|
||||||
|
static void
|
||||||
|
vir_emit_last_thrsw(struct v3d_compile *c)
|
||||||
|
{
|
||||||
|
/* On V3D before 4.1, we need a TMU op to be outstanding when thread
|
||||||
|
* switching, so disable threads if we didn't do any TMU ops (each of
|
||||||
|
* which would have emitted a THRSW).
|
||||||
|
*/
|
||||||
|
if (!c->last_thrsw_at_top_level && c->devinfo->ver < 41) {
|
||||||
|
c->threads = 1;
|
||||||
|
if (c->last_thrsw)
|
||||||
|
vir_remove_thrsw(c);
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
|
/* If we're threaded and the last THRSW was in conditional code, then
|
||||||
|
* we need to emit another one so that we can flag it as the last
|
||||||
|
* thrsw.
|
||||||
|
*/
|
||||||
|
if (c->last_thrsw && !c->last_thrsw_at_top_level) {
|
||||||
|
assert(c->devinfo->ver >= 41);
|
||||||
|
vir_emit_thrsw(c);
|
||||||
|
}
|
||||||
|
|
||||||
|
/* If we're threaded, then we need to mark the last THRSW instruction
|
||||||
|
* so we can emit a pair of them at QPU emit time.
|
||||||
|
*
|
||||||
|
* For V3D 4.x, we can spawn the non-fragment shaders already in the
|
||||||
|
* post-last-THRSW state, so we can skip this.
|
||||||
|
*/
|
||||||
|
if (!c->last_thrsw && c->s->info.stage == MESA_SHADER_FRAGMENT) {
|
||||||
|
assert(c->devinfo->ver >= 41);
|
||||||
|
vir_emit_thrsw(c);
|
||||||
|
}
|
||||||
|
|
||||||
|
if (c->last_thrsw)
|
||||||
|
c->last_thrsw->is_last_thrsw = true;
|
||||||
|
}
|
||||||
|
|
||||||
void
|
void
|
||||||
v3d_nir_to_vir(struct v3d_compile *c)
|
v3d_nir_to_vir(struct v3d_compile *c)
|
||||||
{
|
{
|
||||||
|
|
@ -2137,6 +2215,9 @@ v3d_nir_to_vir(struct v3d_compile *c)
|
||||||
|
|
||||||
nir_to_vir(c);
|
nir_to_vir(c);
|
||||||
|
|
||||||
|
/* Emit the last THRSW before STVPM and TLB writes. */
|
||||||
|
vir_emit_last_thrsw(c);
|
||||||
|
|
||||||
switch (c->s->info.stage) {
|
switch (c->s->info.stage) {
|
||||||
case MESA_SHADER_FRAGMENT:
|
case MESA_SHADER_FRAGMENT:
|
||||||
emit_frag_end(c);
|
emit_frag_end(c);
|
||||||
|
|
@ -2171,5 +2252,33 @@ v3d_nir_to_vir(struct v3d_compile *c)
|
||||||
fprintf(stderr, "\n");
|
fprintf(stderr, "\n");
|
||||||
}
|
}
|
||||||
|
|
||||||
v3d_vir_to_qpu(c);
|
/* Compute the live ranges so we can figure out interference. */
|
||||||
|
vir_calculate_live_intervals(c);
|
||||||
|
|
||||||
|
/* Attempt to allocate registers for the temporaries. If we fail,
|
||||||
|
* reduce thread count and try again.
|
||||||
|
*/
|
||||||
|
int min_threads = (c->devinfo->ver >= 41) ? 2 : 1;
|
||||||
|
struct qpu_reg *temp_registers;
|
||||||
|
while (true) {
|
||||||
|
temp_registers = v3d_register_allocate(c);
|
||||||
|
|
||||||
|
if (temp_registers)
|
||||||
|
break;
|
||||||
|
|
||||||
|
if (c->threads == min_threads) {
|
||||||
|
fprintf(stderr, "Failed to register allocate at %d threads:\n",
|
||||||
|
c->threads);
|
||||||
|
vir_dump(c);
|
||||||
|
c->failed = true;
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
|
c->threads /= 2;
|
||||||
|
|
||||||
|
if (c->threads == 1)
|
||||||
|
vir_remove_thrsw(c);
|
||||||
|
}
|
||||||
|
|
||||||
|
v3d_vir_to_qpu(c, temp_registers);
|
||||||
}
|
}
|
||||||
|
|
|
||||||
|
|
@ -1097,12 +1097,29 @@ qpu_instruction_valid_in_thrend_slot(struct v3d_compile *c,
|
||||||
}
|
}
|
||||||
|
|
||||||
static bool
|
static bool
|
||||||
valid_thrend_sequence(struct v3d_compile *c,
|
valid_thrsw_sequence(struct v3d_compile *c,
|
||||||
struct qinst *qinst, int instructions_in_sequence)
|
struct qinst *qinst, int instructions_in_sequence,
|
||||||
|
bool is_thrend)
|
||||||
{
|
{
|
||||||
for (int slot = 0; slot < instructions_in_sequence; slot++) {
|
for (int slot = 0; slot < instructions_in_sequence; slot++) {
|
||||||
if (!qpu_instruction_valid_in_thrend_slot(c, qinst, slot))
|
/* No scheduling SFU when the result would land in the other
|
||||||
|
* thread. The simulator complains for safety, though it
|
||||||
|
* would only occur for dead code in our case.
|
||||||
|
*/
|
||||||
|
if (slot > 0 &&
|
||||||
|
qinst->qpu.type == V3D_QPU_INSTR_TYPE_ALU &&
|
||||||
|
(v3d_qpu_magic_waddr_is_sfu(qinst->qpu.alu.add.waddr) ||
|
||||||
|
v3d_qpu_magic_waddr_is_sfu(qinst->qpu.alu.mul.waddr))) {
|
||||||
return false;
|
return false;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (slot > 0 && qinst->qpu.sig.ldvary)
|
||||||
|
return false;
|
||||||
|
|
||||||
|
if (is_thrend &&
|
||||||
|
!qpu_instruction_valid_in_thrend_slot(c, qinst, slot)) {
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
|
||||||
/* Note that the list is circular, so we can only do this up
|
/* Note that the list is circular, so we can only do this up
|
||||||
* to instructions_in_sequence.
|
* to instructions_in_sequence.
|
||||||
|
|
@ -1121,7 +1138,8 @@ static int
|
||||||
emit_thrsw(struct v3d_compile *c,
|
emit_thrsw(struct v3d_compile *c,
|
||||||
struct qblock *block,
|
struct qblock *block,
|
||||||
struct choose_scoreboard *scoreboard,
|
struct choose_scoreboard *scoreboard,
|
||||||
struct qinst *inst)
|
struct qinst *inst,
|
||||||
|
bool is_thrend)
|
||||||
{
|
{
|
||||||
int time = 0;
|
int time = 0;
|
||||||
|
|
||||||
|
|
@ -1143,20 +1161,25 @@ emit_thrsw(struct v3d_compile *c,
|
||||||
if (!v3d_qpu_sig_pack(c->devinfo, &sig, &packed_sig))
|
if (!v3d_qpu_sig_pack(c->devinfo, &sig, &packed_sig))
|
||||||
break;
|
break;
|
||||||
|
|
||||||
if (!valid_thrend_sequence(c, prev_inst, slots_filled + 1))
|
if (!valid_thrsw_sequence(c, prev_inst, slots_filled + 1,
|
||||||
|
is_thrend)) {
|
||||||
break;
|
break;
|
||||||
|
}
|
||||||
|
|
||||||
merge_inst = prev_inst;
|
merge_inst = prev_inst;
|
||||||
if (++slots_filled == 3)
|
if (++slots_filled == 3)
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
bool needs_free = false;
|
||||||
if (merge_inst) {
|
if (merge_inst) {
|
||||||
merge_inst->qpu.sig.thrsw = true;
|
merge_inst->qpu.sig.thrsw = true;
|
||||||
|
needs_free = true;
|
||||||
} else {
|
} else {
|
||||||
insert_scheduled_instruction(c, block, scoreboard, inst);
|
insert_scheduled_instruction(c, block, scoreboard, inst);
|
||||||
time++;
|
time++;
|
||||||
slots_filled++;
|
slots_filled++;
|
||||||
|
merge_inst = inst;
|
||||||
}
|
}
|
||||||
|
|
||||||
/* Insert any extra delay slot NOPs we need. */
|
/* Insert any extra delay slot NOPs we need. */
|
||||||
|
|
@ -1165,10 +1188,19 @@ emit_thrsw(struct v3d_compile *c,
|
||||||
time++;
|
time++;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/* If we're emitting the last THRSW (other than program end), then
|
||||||
|
* signal that to the HW by emitting two THRSWs in a row.
|
||||||
|
*/
|
||||||
|
if (inst->is_last_thrsw) {
|
||||||
|
struct qinst *second_inst =
|
||||||
|
(struct qinst *)merge_inst->link.next;
|
||||||
|
second_inst->qpu.sig.thrsw = true;
|
||||||
|
}
|
||||||
|
|
||||||
/* If we put our THRSW into another instruction, free up the
|
/* If we put our THRSW into another instruction, free up the
|
||||||
* instruction that didn't end up scheduled into the list.
|
* instruction that didn't end up scheduled into the list.
|
||||||
*/
|
*/
|
||||||
if (merge_inst)
|
if (needs_free)
|
||||||
free(inst);
|
free(inst);
|
||||||
|
|
||||||
return time;
|
return time;
|
||||||
|
|
@ -1293,40 +1325,24 @@ schedule_instructions(struct v3d_compile *c,
|
||||||
free(merge->inst);
|
free(merge->inst);
|
||||||
}
|
}
|
||||||
|
|
||||||
if (0 && inst->sig.thrsw) {
|
if (inst->sig.thrsw) {
|
||||||
/* XXX emit_thrsw(c, scoreboard, qinst); */
|
time += emit_thrsw(c, block, scoreboard, qinst, false);
|
||||||
} else {
|
} else {
|
||||||
c->qpu_inst_count++;
|
insert_scheduled_instruction(c, block,
|
||||||
list_addtail(&qinst->link, &block->instructions);
|
scoreboard, qinst);
|
||||||
update_scoreboard_for_chosen(scoreboard, inst);
|
|
||||||
}
|
|
||||||
|
|
||||||
scoreboard->tick++;
|
if (inst->type == V3D_QPU_INSTR_TYPE_BRANCH) {
|
||||||
time++;
|
block->branch_qpu_ip = c->qpu_inst_count - 1;
|
||||||
|
/* Fill the delay slots.
|
||||||
if (inst->type == V3D_QPU_INSTR_TYPE_BRANCH ||
|
*
|
||||||
inst->sig.thrsw /* XXX */) {
|
* We should fill these with actual instructions,
|
||||||
block->branch_qpu_ip = c->qpu_inst_count - 1;
|
* instead, but that will probably need to be done
|
||||||
/* Fill the delay slots.
|
* after this, once we know what the leading
|
||||||
*
|
* instructions of the successors are (so we can
|
||||||
* We should fill these with actual instructions,
|
* handle A/B register file write latency)
|
||||||
* instead, but that will probably need to be done
|
*/
|
||||||
* after this, once we know what the leading
|
for (int i = 0; i < 3; i++)
|
||||||
* instructions of the successors are (so we can
|
emit_nop(c, block, scoreboard);
|
||||||
* handle A/B register file write latency)
|
|
||||||
*/
|
|
||||||
/* XXX: scoreboard */
|
|
||||||
int slots = (inst->type == V3D_QPU_INSTR_TYPE_BRANCH ?
|
|
||||||
3 : 2);
|
|
||||||
for (int i = 0; i < slots; i++) {
|
|
||||||
struct qinst *nop = vir_nop();
|
|
||||||
list_addtail(&nop->link, &block->instructions);
|
|
||||||
|
|
||||||
update_scoreboard_for_chosen(scoreboard,
|
|
||||||
&nop->qpu);
|
|
||||||
c->qpu_inst_count++;
|
|
||||||
scoreboard->tick++;
|
|
||||||
time++;
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
@ -1488,7 +1504,7 @@ v3d_qpu_schedule_instructions(struct v3d_compile *c)
|
||||||
/* Emit the program-end THRSW instruction. */;
|
/* Emit the program-end THRSW instruction. */;
|
||||||
struct qinst *thrsw = vir_nop();
|
struct qinst *thrsw = vir_nop();
|
||||||
thrsw->qpu.sig.thrsw = true;
|
thrsw->qpu.sig.thrsw = true;
|
||||||
emit_thrsw(c, end_block, &scoreboard, thrsw);
|
emit_thrsw(c, end_block, &scoreboard, thrsw, true);
|
||||||
|
|
||||||
qpu_set_branch_targets(c);
|
qpu_set_branch_targets(c);
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -39,6 +39,10 @@ struct v3d_qpu_validate_state {
|
||||||
const struct v3d_qpu_instr *last;
|
const struct v3d_qpu_instr *last;
|
||||||
int ip;
|
int ip;
|
||||||
int last_sfu_write;
|
int last_sfu_write;
|
||||||
|
int last_branch_ip;
|
||||||
|
int last_thrsw_ip;
|
||||||
|
bool last_thrsw_found;
|
||||||
|
int thrsw_count;
|
||||||
};
|
};
|
||||||
|
|
||||||
static void
|
static void
|
||||||
|
|
@ -62,6 +66,18 @@ fail_instr(struct v3d_qpu_validate_state *state, const char *msg)
|
||||||
abort();
|
abort();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
static bool
|
||||||
|
in_branch_delay_slots(struct v3d_qpu_validate_state *state)
|
||||||
|
{
|
||||||
|
return (state->ip - state->last_branch_ip) < 3;
|
||||||
|
}
|
||||||
|
|
||||||
|
static bool
|
||||||
|
in_thrsw_delay_slots(struct v3d_qpu_validate_state *state)
|
||||||
|
{
|
||||||
|
return (state->ip - state->last_thrsw_ip) < 3;
|
||||||
|
}
|
||||||
|
|
||||||
static bool
|
static bool
|
||||||
qpu_magic_waddr_matches(const struct v3d_qpu_instr *inst,
|
qpu_magic_waddr_matches(const struct v3d_qpu_instr *inst,
|
||||||
bool (*predicate)(enum v3d_qpu_waddr waddr))
|
bool (*predicate)(enum v3d_qpu_waddr waddr))
|
||||||
|
|
@ -136,6 +152,19 @@ qpu_validate_inst(struct v3d_qpu_validate_state *state, struct qinst *qinst)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
if (in_thrsw_delay_slots(state)) {
|
||||||
|
/* There's no way you want to start SFU during the THRSW delay
|
||||||
|
* slots, since the result would land in the other thread.
|
||||||
|
*/
|
||||||
|
if (sfu_writes) {
|
||||||
|
fail_instr(state,
|
||||||
|
"SFU write started during THRSW delay slots ");
|
||||||
|
}
|
||||||
|
|
||||||
|
if (inst->sig.ldvary)
|
||||||
|
fail_instr(state, "LDVARY during THRSW delay slots");
|
||||||
|
}
|
||||||
|
|
||||||
(void)qpu_magic_waddr_matches; /* XXX */
|
(void)qpu_magic_waddr_matches; /* XXX */
|
||||||
|
|
||||||
/* SFU r4 results come back two instructions later. No doing
|
/* SFU r4 results come back two instructions later. No doing
|
||||||
|
|
@ -170,6 +199,35 @@ qpu_validate_inst(struct v3d_qpu_validate_state *state, struct qinst *qinst)
|
||||||
|
|
||||||
if (sfu_writes)
|
if (sfu_writes)
|
||||||
state->last_sfu_write = state->ip;
|
state->last_sfu_write = state->ip;
|
||||||
|
|
||||||
|
if (inst->sig.thrsw) {
|
||||||
|
if (in_branch_delay_slots(state))
|
||||||
|
fail_instr(state, "THRSW in a branch delay slot.");
|
||||||
|
|
||||||
|
if (state->last_thrsw_ip == state->ip - 1) {
|
||||||
|
/* If it's the second THRSW in a row, then it's just a
|
||||||
|
* last-thrsw signal.
|
||||||
|
*/
|
||||||
|
if (state->last_thrsw_found)
|
||||||
|
fail_instr(state, "Two last-THRSW signals");
|
||||||
|
state->last_thrsw_found = true;
|
||||||
|
} else {
|
||||||
|
if (in_thrsw_delay_slots(state)) {
|
||||||
|
fail_instr(state,
|
||||||
|
"THRSW too close to another THRSW.");
|
||||||
|
}
|
||||||
|
state->thrsw_count++;
|
||||||
|
state->last_thrsw_ip = state->ip;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
if (inst->type == V3D_QPU_INSTR_TYPE_BRANCH) {
|
||||||
|
if (in_branch_delay_slots(state))
|
||||||
|
fail_instr(state, "branch in a branch delay slot.");
|
||||||
|
if (in_thrsw_delay_slots(state))
|
||||||
|
fail_instr(state, "branch in a THRSW delay slot.");
|
||||||
|
state->last_branch_ip = state->ip;
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
static void
|
static void
|
||||||
|
|
@ -201,10 +259,22 @@ qpu_validate(struct v3d_compile *c)
|
||||||
struct v3d_qpu_validate_state state = {
|
struct v3d_qpu_validate_state state = {
|
||||||
.c = c,
|
.c = c,
|
||||||
.last_sfu_write = -10,
|
.last_sfu_write = -10,
|
||||||
|
.last_thrsw_ip = -10,
|
||||||
|
.last_branch_ip = -10,
|
||||||
.ip = 0,
|
.ip = 0,
|
||||||
};
|
};
|
||||||
|
|
||||||
vir_for_each_block(block, c) {
|
vir_for_each_block(block, c) {
|
||||||
qpu_validate_block(&state, block);
|
qpu_validate_block(&state, block);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
if (state.thrsw_count > 1 && !state.last_thrsw_found) {
|
||||||
|
fail_instr(&state,
|
||||||
|
"thread switch found without last-THRSW in program");
|
||||||
|
}
|
||||||
|
|
||||||
|
if (state.thrsw_count == 0 ||
|
||||||
|
(state.last_thrsw_found && state.thrsw_count == 1)) {
|
||||||
|
fail_instr(&state, "No program-end THRSW found");
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
|
||||||
|
|
@ -134,6 +134,7 @@ struct qinst {
|
||||||
struct qreg src[3];
|
struct qreg src[3];
|
||||||
bool cond_is_exec_mask;
|
bool cond_is_exec_mask;
|
||||||
bool has_implicit_uniform;
|
bool has_implicit_uniform;
|
||||||
|
bool is_last_thrsw;
|
||||||
|
|
||||||
/* After vir_to_qpu.c: If instr reads a uniform, which uniform from
|
/* After vir_to_qpu.c: If instr reads a uniform, which uniform from
|
||||||
* the uncompiled stream it is.
|
* the uncompiled stream it is.
|
||||||
|
|
@ -522,12 +523,16 @@ struct v3d_compile {
|
||||||
uint32_t program_id;
|
uint32_t program_id;
|
||||||
uint32_t variant_id;
|
uint32_t variant_id;
|
||||||
|
|
||||||
/* Set to compile program in threaded FS mode, where SIG_THREAD_SWITCH
|
/* Set to compile program in 1x, 2x, or 4x threaded mode, where
|
||||||
* is used to hide texturing latency at the cost of limiting ourselves
|
* SIG_THREAD_SWITCH is used to hide texturing latency at the cost of
|
||||||
* to the bottom half of physical reg space.
|
* limiting ourselves to a part of the physical reg space.
|
||||||
|
*
|
||||||
|
* On V3D 3.x, 2x or 4x divide the physical reg space by 2x or 4x. On
|
||||||
|
* V3D 4.x, all shaders are 2x threaded, and 4x only divides the
|
||||||
|
* physical reg space in half.
|
||||||
*/
|
*/
|
||||||
bool fs_threaded;
|
uint8_t threads;
|
||||||
|
struct qinst *last_thrsw;
|
||||||
bool last_thrsw_at_top_level;
|
bool last_thrsw_at_top_level;
|
||||||
|
|
||||||
bool failed;
|
bool failed;
|
||||||
|
|
@ -547,7 +552,12 @@ struct v3d_prog_data {
|
||||||
uint32_t ubo_size;
|
uint32_t ubo_size;
|
||||||
|
|
||||||
uint8_t num_inputs;
|
uint8_t num_inputs;
|
||||||
|
uint8_t threads;
|
||||||
|
|
||||||
|
/* For threads > 1, whether the program should be dispatched in the
|
||||||
|
* after-final-THRSW state.
|
||||||
|
*/
|
||||||
|
bool single_seg;
|
||||||
};
|
};
|
||||||
|
|
||||||
struct v3d_vs_prog_data {
|
struct v3d_vs_prog_data {
|
||||||
|
|
@ -674,7 +684,7 @@ void v3d_nir_lower_io(nir_shader *s, struct v3d_compile *c);
|
||||||
void v3d_nir_lower_txf_ms(nir_shader *s, struct v3d_compile *c);
|
void v3d_nir_lower_txf_ms(nir_shader *s, struct v3d_compile *c);
|
||||||
void vir_lower_uniforms(struct v3d_compile *c);
|
void vir_lower_uniforms(struct v3d_compile *c);
|
||||||
|
|
||||||
void v3d_vir_to_qpu(struct v3d_compile *c);
|
void v3d_vir_to_qpu(struct v3d_compile *c, struct qpu_reg *temp_registers);
|
||||||
uint32_t v3d_qpu_schedule_instructions(struct v3d_compile *c);
|
uint32_t v3d_qpu_schedule_instructions(struct v3d_compile *c);
|
||||||
void qpu_validate(struct v3d_compile *c);
|
void qpu_validate(struct v3d_compile *c);
|
||||||
struct qpu_reg *v3d_register_allocate(struct v3d_compile *c);
|
struct qpu_reg *v3d_register_allocate(struct v3d_compile *c);
|
||||||
|
|
|
||||||
|
|
@ -109,7 +109,7 @@ vir_has_side_effects(struct v3d_compile *c, struct qinst *inst)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
if (inst->qpu.sig.ldtmu)
|
if (inst->qpu.sig.ldtmu || inst->qpu.sig.thrsw)
|
||||||
return true;
|
return true;
|
||||||
|
|
||||||
return false;
|
return false;
|
||||||
|
|
@ -528,6 +528,7 @@ vir_compile_init(const struct v3d_compiler *compiler,
|
||||||
c->key = key;
|
c->key = key;
|
||||||
c->program_id = program_id;
|
c->program_id = program_id;
|
||||||
c->variant_id = variant_id;
|
c->variant_id = variant_id;
|
||||||
|
c->threads = 4;
|
||||||
|
|
||||||
s = nir_shader_clone(c, s);
|
s = nir_shader_clone(c, s);
|
||||||
c->s = s;
|
c->s = s;
|
||||||
|
|
@ -637,6 +638,9 @@ static void
|
||||||
v3d_set_prog_data(struct v3d_compile *c,
|
v3d_set_prog_data(struct v3d_compile *c,
|
||||||
struct v3d_prog_data *prog_data)
|
struct v3d_prog_data *prog_data)
|
||||||
{
|
{
|
||||||
|
prog_data->threads = c->threads;
|
||||||
|
prog_data->single_seg = !c->last_thrsw;
|
||||||
|
|
||||||
v3d_set_prog_data_uniforms(c, prog_data);
|
v3d_set_prog_data_uniforms(c, prog_data);
|
||||||
v3d_set_prog_data_ubo(c, prog_data);
|
v3d_set_prog_data_ubo(c, prog_data);
|
||||||
}
|
}
|
||||||
|
|
|
||||||
|
|
@ -23,6 +23,7 @@
|
||||||
|
|
||||||
#include "util/ralloc.h"
|
#include "util/ralloc.h"
|
||||||
#include "util/register_allocate.h"
|
#include "util/register_allocate.h"
|
||||||
|
#include "common/v3d_device_info.h"
|
||||||
#include "v3d_compiler.h"
|
#include "v3d_compiler.h"
|
||||||
|
|
||||||
#define QPU_R(i) { .magic = false, .index = i }
|
#define QPU_R(i) { .magic = false, .index = i }
|
||||||
|
|
@ -35,15 +36,17 @@
|
||||||
bool
|
bool
|
||||||
vir_init_reg_sets(struct v3d_compiler *compiler)
|
vir_init_reg_sets(struct v3d_compiler *compiler)
|
||||||
{
|
{
|
||||||
|
/* Allocate up to 3 regfile classes, for the ways the physical
|
||||||
|
* register file can be divided up for fragment shader threading.
|
||||||
|
*/
|
||||||
|
int max_thread_index = (compiler->devinfo->ver >= 40 ? 2 : 3);
|
||||||
|
|
||||||
compiler->regs = ra_alloc_reg_set(compiler, PHYS_INDEX + PHYS_COUNT,
|
compiler->regs = ra_alloc_reg_set(compiler, PHYS_INDEX + PHYS_COUNT,
|
||||||
true);
|
true);
|
||||||
if (!compiler->regs)
|
if (!compiler->regs)
|
||||||
return false;
|
return false;
|
||||||
|
|
||||||
/* Allocate 3 regfile classes, for the ways the physical register file
|
for (int threads = 0; threads < max_thread_index; threads++) {
|
||||||
* can be divided up for fragment shader threading.
|
|
||||||
*/
|
|
||||||
for (int threads = 0; threads < 3; threads++) {
|
|
||||||
compiler->reg_class_phys_or_acc[threads] =
|
compiler->reg_class_phys_or_acc[threads] =
|
||||||
ra_alloc_reg_class(compiler->regs);
|
ra_alloc_reg_class(compiler->regs);
|
||||||
compiler->reg_class_phys[threads] =
|
compiler->reg_class_phys[threads] =
|
||||||
|
|
@ -105,6 +108,16 @@ v3d_register_allocate(struct v3d_compile *c)
|
||||||
struct ra_graph *g = ra_alloc_interference_graph(c->compiler->regs,
|
struct ra_graph *g = ra_alloc_interference_graph(c->compiler->regs,
|
||||||
c->num_temps +
|
c->num_temps +
|
||||||
ARRAY_SIZE(acc_nodes));
|
ARRAY_SIZE(acc_nodes));
|
||||||
|
/* Convert 1, 2, 4 threads to 0, 1, 2 index.
|
||||||
|
*
|
||||||
|
* V3D 4.x has double the physical register space, so 64 physical regs
|
||||||
|
* are available at both 1x and 2x threading, and 4x has 32.
|
||||||
|
*/
|
||||||
|
int thread_index = ffs(c->threads) - 1;
|
||||||
|
if (c->devinfo->ver >= 40) {
|
||||||
|
if (thread_index >= 1)
|
||||||
|
thread_index--;
|
||||||
|
}
|
||||||
|
|
||||||
/* Make some fixed nodes for the accumulators, which we will need to
|
/* Make some fixed nodes for the accumulators, which we will need to
|
||||||
* interfere with when ops have implied r3/r4 writes or for the thread
|
* interfere with when ops have implied r3/r4 writes or for the thread
|
||||||
|
|
@ -117,9 +130,6 @@ v3d_register_allocate(struct v3d_compile *c)
|
||||||
ra_set_node_reg(g, acc_nodes[i], ACC_INDEX + i);
|
ra_set_node_reg(g, acc_nodes[i], ACC_INDEX + i);
|
||||||
}
|
}
|
||||||
|
|
||||||
/* Compute the live ranges so we can figure out interference. */
|
|
||||||
vir_calculate_live_intervals(c);
|
|
||||||
|
|
||||||
for (uint32_t i = 0; i < c->num_temps; i++) {
|
for (uint32_t i = 0; i < c->num_temps; i++) {
|
||||||
map[i].temp = i;
|
map[i].temp = i;
|
||||||
map[i].priority = c->temp_end[i] - c->temp_start[i];
|
map[i].priority = c->temp_end[i] - c->temp_start[i];
|
||||||
|
|
@ -204,23 +214,15 @@ v3d_register_allocate(struct v3d_compile *c)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
#if 0
|
if (inst->qpu.sig.thrsw) {
|
||||||
switch (inst->op) {
|
|
||||||
case QOP_THRSW:
|
|
||||||
/* All accumulators are invalidated across a thread
|
/* All accumulators are invalidated across a thread
|
||||||
* switch.
|
* switch.
|
||||||
*/
|
*/
|
||||||
for (int i = 0; i < c->num_temps; i++) {
|
for (int i = 0; i < c->num_temps; i++) {
|
||||||
if (c->temp_start[i] < ip && c->temp_end[i] > ip)
|
if (c->temp_start[i] < ip && c->temp_end[i] > ip)
|
||||||
class_bits[i] &= ~(CLASS_BIT_R0_R3 |
|
class_bits[i] &= CLASS_BIT_PHYS;
|
||||||
CLASS_BIT_R4);
|
|
||||||
}
|
}
|
||||||
break;
|
|
||||||
|
|
||||||
default:
|
|
||||||
break;
|
|
||||||
}
|
}
|
||||||
#endif
|
|
||||||
|
|
||||||
ip++;
|
ip++;
|
||||||
}
|
}
|
||||||
|
|
@ -228,14 +230,14 @@ v3d_register_allocate(struct v3d_compile *c)
|
||||||
for (uint32_t i = 0; i < c->num_temps; i++) {
|
for (uint32_t i = 0; i < c->num_temps; i++) {
|
||||||
if (class_bits[i] == CLASS_BIT_PHYS) {
|
if (class_bits[i] == CLASS_BIT_PHYS) {
|
||||||
ra_set_node_class(g, temp_to_node[i],
|
ra_set_node_class(g, temp_to_node[i],
|
||||||
c->compiler->reg_class_phys[c->fs_threaded]);
|
c->compiler->reg_class_phys[thread_index]);
|
||||||
} else {
|
} else {
|
||||||
assert(class_bits[i] == (CLASS_BIT_PHYS |
|
assert(class_bits[i] == (CLASS_BIT_PHYS |
|
||||||
CLASS_BIT_R0_R2 |
|
CLASS_BIT_R0_R2 |
|
||||||
CLASS_BIT_R3 |
|
CLASS_BIT_R3 |
|
||||||
CLASS_BIT_R4));
|
CLASS_BIT_R4));
|
||||||
ra_set_node_class(g, temp_to_node[i],
|
ra_set_node_class(g, temp_to_node[i],
|
||||||
c->compiler->reg_class_phys_or_acc[c->fs_threaded]);
|
c->compiler->reg_class_phys_or_acc[thread_index]);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
@ -252,12 +254,6 @@ v3d_register_allocate(struct v3d_compile *c)
|
||||||
|
|
||||||
bool ok = ra_allocate(g);
|
bool ok = ra_allocate(g);
|
||||||
if (!ok) {
|
if (!ok) {
|
||||||
if (!c->fs_threaded) {
|
|
||||||
fprintf(stderr, "Failed to register allocate:\n");
|
|
||||||
vir_dump(c);
|
|
||||||
}
|
|
||||||
|
|
||||||
c->failed = true;
|
|
||||||
free(temp_registers);
|
free(temp_registers);
|
||||||
return NULL;
|
return NULL;
|
||||||
}
|
}
|
||||||
|
|
|
||||||
|
|
@ -319,10 +319,8 @@ v3d_dump_qpu(struct v3d_compile *c)
|
||||||
}
|
}
|
||||||
|
|
||||||
void
|
void
|
||||||
v3d_vir_to_qpu(struct v3d_compile *c)
|
v3d_vir_to_qpu(struct v3d_compile *c, struct qpu_reg *temp_registers)
|
||||||
{
|
{
|
||||||
struct qpu_reg *temp_registers = v3d_register_allocate(c);
|
|
||||||
|
|
||||||
/* Reset the uniform count to how many will be actually loaded by the
|
/* Reset the uniform count to how many will be actually loaded by the
|
||||||
* generated QPU code.
|
* generated QPU code.
|
||||||
*/
|
*/
|
||||||
|
|
|
||||||
|
|
@ -209,9 +209,32 @@ vc5_emit_gl_shader_state(struct vc5_context *vc5,
|
||||||
shader.fragment_shader_uniforms_address = fs_uniforms;
|
shader.fragment_shader_uniforms_address = fs_uniforms;
|
||||||
|
|
||||||
#if V3D_VERSION >= 41
|
#if V3D_VERSION >= 41
|
||||||
shader.coordinate_shader_start_in_final_thread_section = true;
|
shader.coordinate_shader_4_way_threadable =
|
||||||
shader.vertex_shader_start_in_final_thread_section = true;
|
vc5->prog.cs->prog_data.vs->base.threads == 4;
|
||||||
shader.fragment_shader_start_in_final_thread_section = true;
|
shader.vertex_shader_4_way_threadable =
|
||||||
|
vc5->prog.vs->prog_data.vs->base.threads == 4;
|
||||||
|
shader.fragment_shader_4_way_threadable =
|
||||||
|
vc5->prog.fs->prog_data.fs->base.threads == 4;
|
||||||
|
|
||||||
|
shader.coordinate_shader_start_in_final_thread_section =
|
||||||
|
vc5->prog.cs->prog_data.vs->base.single_seg;
|
||||||
|
shader.vertex_shader_start_in_final_thread_section =
|
||||||
|
vc5->prog.vs->prog_data.vs->base.single_seg;
|
||||||
|
shader.fragment_shader_start_in_final_thread_section =
|
||||||
|
vc5->prog.fs->prog_data.fs->base.single_seg;
|
||||||
|
#else
|
||||||
|
shader.coordinate_shader_4_way_threadable =
|
||||||
|
vc5->prog.cs->prog_data.vs->base.threads == 4;
|
||||||
|
shader.coordinate_shader_2_way_threadable =
|
||||||
|
vc5->prog.cs->prog_data.vs->base.threads == 2;
|
||||||
|
shader.vertex_shader_4_way_threadable =
|
||||||
|
vc5->prog.vs->prog_data.vs->base.threads == 4;
|
||||||
|
shader.vertex_shader_2_way_threadable =
|
||||||
|
vc5->prog.vs->prog_data.vs->base.threads == 2;
|
||||||
|
shader.fragment_shader_4_way_threadable =
|
||||||
|
vc5->prog.fs->prog_data.fs->base.threads == 4;
|
||||||
|
shader.fragment_shader_2_way_threadable =
|
||||||
|
vc5->prog.fs->prog_data.fs->base.threads == 2;
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
shader.vertex_id_read_by_coordinate_shader =
|
shader.vertex_id_read_by_coordinate_shader =
|
||||||
|
|
|
||||||
Loading…
Add table
Reference in a new issue