mirror of
https://gitlab.freedesktop.org/mesa/mesa.git
synced 2025-12-24 21:50:12 +01:00
vc4: Fix GPU hangs with >16 varying values.
Fixes glsl-routing in piglit and hangs in glbenchmark 2.0.2.
This commit is contained in:
parent
5277f25480
commit
87a88f2daa
2 changed files with 68 additions and 19 deletions
|
|
@ -494,6 +494,18 @@ struct vc4_compile {
|
|||
uint32_t qpu_inst_size;
|
||||
uint32_t num_inputs;
|
||||
|
||||
/**
|
||||
* Number of inputs from num_inputs remaining to be queued to the read
|
||||
* FIFO in the VS/CS.
|
||||
*/
|
||||
uint32_t num_inputs_remaining;
|
||||
|
||||
/* Number of inputs currently in the read FIFO for the VS/CS */
|
||||
uint32_t num_inputs_in_fifo;
|
||||
|
||||
/** Next offset in the VPM to read from in the VS/CS */
|
||||
uint32_t vpm_read_offset;
|
||||
|
||||
uint32_t program_id;
|
||||
uint32_t variant_id;
|
||||
};
|
||||
|
|
|
|||
|
|
@ -96,6 +96,60 @@ swap_file(struct qpu_reg *src)
|
|||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Sets up the VPM read FIFO before we do any VPM read.
|
||||
*
|
||||
* VPM reads (vertex attribute input) and VPM writes (varyings output) from
|
||||
* the QPU reuse the VRI (varying interpolation) block's FIFOs to talk to the
|
||||
* VPM block. In the VS/CS (unlike in the FS), the block starts out
|
||||
* uninitialized, and you need to emit setup to the block before any VPM
|
||||
* reads/writes.
|
||||
*
|
||||
* VRI has a FIFO in each direction, with each FIFO able to hold four
|
||||
* 32-bit-per-vertex values. VPM reads come through the read FIFO and VPM
|
||||
* writes go through the write FIFO. The read/write setup values from QPU go
|
||||
* through the write FIFO as well, with a sideband signal indicating that
|
||||
* they're setup values. Once a read setup reaches the other side of the
|
||||
* FIFO, the VPM block will start asynchronously reading vertex attributes and
|
||||
* filling the read FIFO -- that way hopefully the QPU doesn't have to block
|
||||
* on reads later.
|
||||
*
|
||||
* VPM read setup can configure 16 32-bit-per-vertex values to be read at a
|
||||
* time, which is 4 vec4s. If more than that is being read (since we support
|
||||
* 8 vec4 vertex attributes), then multiple read setup writes need to be done.
|
||||
*
|
||||
* The existence of the FIFO makes it seem like you should be able to emit
|
||||
* both setups for the 5-8 attribute cases and then do all the attribute
|
||||
* reads. However, once the setup value makes it to the other end of the
|
||||
* write FIFO, it will immediately update the VPM block's setup register.
|
||||
* That updated setup register would be used for read FIFO fills from then on,
|
||||
* breaking whatever remaining VPM values were supposed to be read into the
|
||||
* read FIFO from the previous attribute set.
|
||||
*
|
||||
* As a result, we need to emit the read setup, pull every VPM read value from
|
||||
* that setup, and only then emit the second setup if applicable.
|
||||
*/
|
||||
static void
|
||||
setup_for_vpm_read(struct vc4_compile *c, struct qblock *block)
|
||||
{
|
||||
if (c->num_inputs_in_fifo) {
|
||||
c->num_inputs_in_fifo--;
|
||||
return;
|
||||
}
|
||||
|
||||
c->num_inputs_in_fifo = MIN2(c->num_inputs_remaining, 16);
|
||||
|
||||
queue(block,
|
||||
qpu_load_imm_ui(qpu_vrsetup(),
|
||||
c->vpm_read_offset |
|
||||
0x00001a00 |
|
||||
((c->num_inputs_in_fifo & 0xf) << 20)));
|
||||
c->num_inputs_remaining -= c->num_inputs_in_fifo;
|
||||
c->vpm_read_offset += c->num_inputs_in_fifo;
|
||||
|
||||
c->num_inputs_in_fifo--;
|
||||
}
|
||||
|
||||
/**
|
||||
* This is used to resolve the fact that we might register-allocate two
|
||||
* different operands of an instruction to the same physical register file
|
||||
|
|
@ -268,6 +322,7 @@ vc4_generate_code_block(struct vc4_compile *c,
|
|||
assert(src[i].addr <= 47);
|
||||
break;
|
||||
case QFILE_VPM:
|
||||
setup_for_vpm_read(c, block);
|
||||
assert((int)qinst->src[i].index >=
|
||||
last_vpm_read_index);
|
||||
(void)last_vpm_read_index;
|
||||
|
|
@ -484,31 +539,13 @@ void
|
|||
vc4_generate_code(struct vc4_context *vc4, struct vc4_compile *c)
|
||||
{
|
||||
struct qpu_reg *temp_registers = vc4_register_allocate(vc4, c);
|
||||
uint32_t inputs_remaining = c->num_inputs;
|
||||
uint32_t vpm_read_fifo_count = 0;
|
||||
uint32_t vpm_read_offset = 0;
|
||||
struct qblock *start_block = list_first_entry(&c->blocks,
|
||||
struct qblock, link);
|
||||
|
||||
switch (c->stage) {
|
||||
case QSTAGE_VERT:
|
||||
case QSTAGE_COORD:
|
||||
/* There's a 4-entry FIFO for VPMVCD reads, each of which can
|
||||
* load up to 16 dwords (4 vec4s) per vertex.
|
||||
*/
|
||||
while (inputs_remaining) {
|
||||
uint32_t num_entries = MIN2(inputs_remaining, 16);
|
||||
queue(start_block,
|
||||
qpu_load_imm_ui(qpu_vrsetup(),
|
||||
vpm_read_offset |
|
||||
0x00001a00 |
|
||||
((num_entries & 0xf) << 20)));
|
||||
inputs_remaining -= num_entries;
|
||||
vpm_read_offset += num_entries;
|
||||
vpm_read_fifo_count++;
|
||||
}
|
||||
assert(vpm_read_fifo_count <= 4);
|
||||
|
||||
c->num_inputs_remaining = c->num_inputs;
|
||||
queue(start_block, qpu_load_imm_ui(qpu_vwsetup(), 0x00001a00));
|
||||
break;
|
||||
case QSTAGE_FRAG:
|
||||
|
|
|
|||
Loading…
Add table
Reference in a new issue