mirror of
https://gitlab.freedesktop.org/mesa/mesa.git
synced 2026-05-05 16:08:04 +02:00
pan/va: Support relaxed waits on read-only render targets
On Valhall we can optimize lower waits, which wait for both readers and writers, into resource_waits, which only wait for writers, allowing threads accessing read-only resources to execute concurrently. Let's use that on LD_TILE instructions so we can optimize the read-only case. Signed-off-by: Boris Brezillon <boris.brezillon@collabora.com> Reviewed-by: Lars-Ivar Hesselberg Simonsen <lars-ivar.simonsen@arm.com> Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/32540>
This commit is contained in:
parent
fbb2805575
commit
4f4ac56145
6 changed files with 58 additions and 2 deletions
|
|
@ -1554,6 +1554,13 @@ load("sampler_lod_parameters_pan", [1], flags=[CAN_ELIMINATE, CAN_REORDER])
|
|||
# and is ignored otherwise
|
||||
load("converted_output_pan", [1, 1, 1], indices=[DEST_TYPE, IO_SEMANTICS], flags=[CAN_ELIMINATE])
|
||||
|
||||
# Like converted_output_pan but for the case where the output is never written by the shader
|
||||
# This is used to relax waits on tile-buffer accesses when the target is read-only
|
||||
# src[] = { target, sample, conversion }
|
||||
# target must be in the [0..7] range when io_semantics.location is FRAG_RESULT_DATA0
|
||||
# and is ignored otherwise
|
||||
load("readonly_output_pan", [1, 1, 1], indices=[DEST_TYPE, IO_SEMANTICS], flags=[CAN_ELIMINATE])
|
||||
|
||||
# Load input attachment target
|
||||
# src[] = { input_attachment_index }
|
||||
# valid targets are:
|
||||
|
|
|
|||
|
|
@ -1841,6 +1841,9 @@ bi_emit_ld_tile(bi_builder *b, nir_intrinsic_instr *instr)
|
|||
if (is_zs)
|
||||
I->z_stencil = true;
|
||||
|
||||
if (instr->intrinsic == nir_intrinsic_load_readonly_output_pan)
|
||||
I->wait_resource = true;
|
||||
|
||||
bi_emit_cached_split(b, dest, size * nr);
|
||||
}
|
||||
|
||||
|
|
@ -2103,6 +2106,7 @@ bi_emit_intrinsic(bi_builder *b, nir_intrinsic_instr *instr)
|
|||
break;
|
||||
|
||||
case nir_intrinsic_load_converted_output_pan:
|
||||
case nir_intrinsic_load_readonly_output_pan:
|
||||
bi_emit_ld_tile(b, instr);
|
||||
break;
|
||||
|
||||
|
|
|
|||
|
|
@ -402,6 +402,9 @@ typedef struct {
|
|||
/* Flow control associated with a Valhall instruction */
|
||||
uint8_t flow;
|
||||
|
||||
/* Valhall-only property to relax waits on read-only resources */
|
||||
bool wait_resource;
|
||||
|
||||
/* Slot associated with a message-passing instruction */
|
||||
uint8_t slot;
|
||||
|
||||
|
|
|
|||
|
|
@ -95,7 +95,7 @@
|
|||
<value name="Wait on slots 0, 1, 2, 6, 7">wait</value>
|
||||
<value name="Perform branch reconverge">reconverge</value>
|
||||
<reserved/>
|
||||
<reserved/>
|
||||
<value name="Wait for resource read">wait_resource</value>
|
||||
<value name="Terminate discarded threads">discard</value>
|
||||
<reserved/>
|
||||
<value name="Return from shader">end</value>
|
||||
|
|
|
|||
|
|
@ -21,6 +21,8 @@
|
|||
* SOFTWARE.
|
||||
*/
|
||||
|
||||
#include "panfrost/lib/pan_props.h"
|
||||
|
||||
#include "bi_builder.h"
|
||||
#include "va_compiler.h"
|
||||
#include "valhall_enums.h"
|
||||
|
|
@ -441,11 +443,21 @@ va_insert_flow_control_nops(bi_context *ctx)
|
|||
* instructions. Wait for slot 0 immediately after the ATEST.
|
||||
*/
|
||||
case BI_OPCODE_BLEND:
|
||||
case BI_OPCODE_LD_TILE:
|
||||
case BI_OPCODE_ST_TILE:
|
||||
if (!ctx->inputs->is_blend)
|
||||
bi_flow(ctx, bi_before_instr(I), VA_FLOW_WAIT);
|
||||
break;
|
||||
|
||||
case BI_OPCODE_LD_TILE: {
|
||||
if (ctx->inputs->is_blend)
|
||||
break;
|
||||
|
||||
assert(!I->wait_resource || pan_arch(ctx->inputs->gpu_id) >= 10);
|
||||
bi_flow(ctx, bi_before_instr(I),
|
||||
I->wait_resource ? VA_FLOW_WAIT_RESOURCE : VA_FLOW_WAIT);
|
||||
break;
|
||||
}
|
||||
|
||||
case BI_OPCODE_ATEST:
|
||||
bi_flow(ctx, bi_before_instr(I), VA_FLOW_WAIT0126);
|
||||
bi_flow(ctx, bi_after_instr(I), VA_FLOW_WAIT0);
|
||||
|
|
|
|||
|
|
@ -161,6 +161,35 @@ merge_waits(bi_block *block)
|
|||
}
|
||||
}
|
||||
|
||||
static void
|
||||
merge_resource_waits(bi_block *block)
|
||||
{
|
||||
/* Most recent instruction with which we can merge, or NULL if none */
|
||||
bi_instr *last_free = NULL;
|
||||
|
||||
bi_foreach_instr_in_block_safe(block, I) {
|
||||
if (last_free != NULL && I->op == BI_OPCODE_NOP &&
|
||||
I->flow == VA_FLOW_WAIT_RESOURCE) {
|
||||
|
||||
/* Merge resource_waits with compatible instructions */
|
||||
last_free->flow = VA_FLOW_WAIT_RESOURCE;
|
||||
bi_remove_instruction(I);
|
||||
continue;
|
||||
}
|
||||
|
||||
/* Don't move waits past async instructions, since they might be what
|
||||
* we're waiting for. If we wanted to optimize this case, we could check
|
||||
* the signaled slots.
|
||||
*/
|
||||
if (bi_get_opcode_props(I)->message)
|
||||
last_free = NULL;
|
||||
|
||||
/* We can only merge with instructions whose flow control is a none. */
|
||||
if (I->flow == VA_FLOW_NONE)
|
||||
last_free = I;
|
||||
}
|
||||
}
|
||||
|
||||
static bool
|
||||
bi_is_first_instr(bi_block *block, bi_instr *I)
|
||||
{
|
||||
|
|
@ -221,6 +250,7 @@ va_merge_flow(bi_context *ctx)
|
|||
continue;
|
||||
|
||||
merge_end_reconverge(block);
|
||||
merge_resource_waits(block);
|
||||
merge_waits(block);
|
||||
|
||||
if (ctx->stage == MESA_SHADER_FRAGMENT && !ctx->inputs->is_blend)
|
||||
|
|
|
|||
Loading…
Add table
Reference in a new issue