iris: Emit EXECUTE_INDIRECT_DRAW when available

On newer platforms (Arrow Lake and above) we can issue an
EXECUTE_INDIRECT_DRAW command that allows us to:
  * Skip issuing MI load/store instructions for indirect parameters
  * Skip doing the indirect draw unroll on the CPU side when the
    appropriate stride is passed

Signed-off-by: Rohan Garg <rohan.garg@intel.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/26178>
This commit is contained in:
Rohan Garg 2022-06-21 15:51:31 +02:00 committed by Marge Bot
parent e5b7e16f3b
commit d1109f67bb
4 changed files with 231 additions and 53 deletions

View file

@ -1152,6 +1152,30 @@ int iris_get_driver_query_group_info(struct pipe_screen *pscreen,
void gfx9_toggle_preemption(struct iris_context *ice,
struct iris_batch *batch,
const struct pipe_draw_info *draw);
/**
 * Decide whether an indirect draw may be issued through the hardware
 * EXECUTE_INDIRECT_DRAW path.
 *
 * Returns true only when every one of the following holds:
 *  - an indirect descriptor was supplied,
 *  - the device reports has_indirect_unroll,
 *  - the stride is either 0 or exactly the size of the argument struct
 *    (five dwords for indexed draws, four dwords otherwise),
 *  - the draw count is not sourced from stream output,
 *  - the draw is not multiview (view_mask == 0),
 *  - the bound vertex shader uses none of firstvertex, baseinstance or
 *    drawid.
 *
 * \param ice       context whose screen and vertex shader are inspected
 * \param indirect  indirect draw descriptor; NULL yields false
 * \param draw      draw info supplying index_size and view_mask
 */
static inline bool
iris_execute_indirect_draw_supported(const struct iris_context *ice,
                                     const struct pipe_draw_indirect_info *indirect,
                                     const struct pipe_draw_info *draw)
{
   /* Nothing below can succeed without an indirect descriptor, so bail out
    * before touching any shader state.
    */
   if (!indirect)
      return false;

   const struct iris_screen *screen =
      (const struct iris_screen *)ice->ctx.screen;
   const struct brw_vs_prog_data *vs_prog_data = (void *)
      ice->shaders.prog[MESA_SHADER_VERTEX]->prog_data;
   const bool is_multiview = draw->view_mask != 0;

   /* Size of one argument record: 5 dwords when indexed, 4 otherwise. */
   const size_t struct_size = draw->index_size ?
      sizeof(uint32_t) * 5 :
      sizeof(uint32_t) * 4;
   const bool aligned_stride =
      indirect->stride == 0 || indirect->stride == struct_size;

   return (screen->devinfo->has_indirect_unroll &&
           aligned_stride &&
           !indirect->count_from_stream_output &&
           !is_multiview &&
           !(vs_prog_data->uses_firstvertex ||
             vs_prog_data->uses_baseinstance ||
             vs_prog_data->uses_drawid));
}
#ifdef genX
# include "iris_genx_protos.h"

View file

@ -181,59 +181,6 @@ iris_update_draw_parameters(struct iris_context *ice,
}
}
/**
 * Perform an indirect draw by looping on the CPU: one upload_render_state
 * call is made per draw, indirect.draw_count times, and the indirect buffer
 * offset is advanced by indirect.stride after each one.
 *
 * dinfo and dindirect are copied into locals first; only the local copy of
 * the indirect descriptor has its offset modified.
 */
static void
iris_indirect_draw_vbo(struct iris_context *ice,
const struct pipe_draw_info *dinfo,
unsigned drawid_offset,
const struct pipe_draw_indirect_info *dindirect,
const struct pipe_draw_start_count_bias *draw)
{
struct iris_batch *batch = &ice->batches[IRIS_BATCH_RENDER];
struct pipe_draw_info info = *dinfo;
struct pipe_draw_indirect_info indirect = *dindirect;
/* Barrier on the indirect argument buffer for the VF_READ domain. */
iris_emit_buffer_barrier_for(batch, iris_resource_bo(indirect.buffer),
IRIS_DOMAIN_VF_READ);
if (indirect.indirect_draw_count) {
/* The draw count lives in a buffer too; barrier it for OTHER_READ. */
struct iris_bo *draw_count_bo =
iris_resource_bo(indirect.indirect_draw_count);
iris_emit_buffer_barrier_for(batch, draw_count_bo,
IRIS_DOMAIN_OTHER_READ);
if (ice->state.predicate == IRIS_PREDICATE_STATE_USE_BIT) {
/* Stash MI_PREDICATE_RESULT in GPR15; it is restored after the loop. */
batch->screen->vtbl.load_register_reg64(batch, CS_GPR(15), MI_PREDICATE_RESULT);
}
}
/* Remember the dirty bits; the loop clears them and they are put back below. */
const uint64_t orig_dirty = ice->state.dirty;
const uint64_t orig_stage_dirty = ice->state.stage_dirty;
for (int i = 0; i < indirect.draw_count; i++) {
iris_batch_maybe_flush(batch, 1500);
iris_update_draw_parameters(ice, &info, drawid_offset + i, &indirect, draw);
batch->screen->vtbl.upload_render_state(ice, batch, &info, drawid_offset + i, &indirect, draw);
/* Clear the render dirty bits after each draw.
 * NOTE(review): presumably so later iterations do not re-emit unchanged
 * state -- confirm against upload_render_state.
 */
ice->state.dirty &= ~IRIS_ALL_DIRTY_FOR_RENDER;
ice->state.stage_dirty &= ~IRIS_ALL_STAGE_DIRTY_FOR_RENDER;
/* Step to the next argument record in the indirect buffer. */
indirect.offset += indirect.stride;
}
if (indirect.indirect_draw_count &&
ice->state.predicate == IRIS_PREDICATE_STATE_USE_BIT) {
/* Restore MI_PREDICATE_RESULT from the copy stashed in GPR15. */
batch->screen->vtbl.load_register_reg64(batch, MI_PREDICATE_RESULT, CS_GPR(15));
}
/* Put this back for post-draw resolves, we'll clear it again after. */
ice->state.dirty = orig_dirty;
ice->state.stage_dirty = orig_stage_dirty;
}
static void
iris_simple_draw_vbo(struct iris_context *ice,
const struct pipe_draw_info *draw,
@ -250,6 +197,64 @@ iris_simple_draw_vbo(struct iris_context *ice,
batch->screen->vtbl.upload_render_state(ice, batch, draw, drawid_offset, indirect, sc);
}
static void
iris_indirect_draw_vbo(struct iris_context *ice,
const struct pipe_draw_info *dinfo,
unsigned drawid_offset,
const struct pipe_draw_indirect_info *dindirect,
const struct pipe_draw_start_count_bias *draw)
{
struct iris_batch *batch = &ice->batches[IRIS_BATCH_RENDER];
struct pipe_draw_info info = *dinfo;
struct pipe_draw_indirect_info indirect = *dindirect;
const bool use_predicate =
ice->state.predicate == IRIS_PREDICATE_STATE_USE_BIT;
const uint64_t orig_dirty = ice->state.dirty;
const uint64_t orig_stage_dirty = ice->state.stage_dirty;
if (iris_execute_indirect_draw_supported(ice, &indirect, &info)) {
iris_batch_maybe_flush(batch, 1500);
iris_update_draw_parameters(ice, &info, drawid_offset, &indirect, draw);
batch->screen->vtbl.upload_indirect_render_state(ice, &info, &indirect, draw);
} else {
iris_emit_buffer_barrier_for(batch, iris_resource_bo(indirect.buffer),
IRIS_DOMAIN_VF_READ);
if (indirect.indirect_draw_count) {
struct iris_bo *draw_count_bo =
iris_resource_bo(indirect.indirect_draw_count);
iris_emit_buffer_barrier_for(batch, draw_count_bo,
IRIS_DOMAIN_OTHER_READ);
}
if (use_predicate) {
/* Upload MI_PREDICATE_RESULT to GPR15.*/
batch->screen->vtbl.load_register_reg64(batch, CS_GPR(15), MI_PREDICATE_RESULT);
}
for (int i = 0; i < indirect.draw_count; i++) {
iris_simple_draw_vbo(ice, &info, drawid_offset + i, &indirect, draw);
ice->state.dirty &= ~IRIS_ALL_DIRTY_FOR_RENDER;
ice->state.stage_dirty &= ~IRIS_ALL_STAGE_DIRTY_FOR_RENDER;
indirect.offset += indirect.stride;
}
if (use_predicate) {
/* Restore MI_PREDICATE_RESULT. */
batch->screen->vtbl.load_register_reg64(batch, MI_PREDICATE_RESULT, CS_GPR(15));
}
}
/* Put this back for post-draw resolves, we'll clear it again after. */
ice->state.dirty = orig_dirty;
ice->state.stage_dirty = orig_stage_dirty;
}
/**
* The pipe->draw_vbo() driver hook. Performs a draw on the GPU.
*/

View file

@ -69,6 +69,10 @@ struct iris_vtable {
unsigned drawid_offset,
const struct pipe_draw_indirect_info *indirect,
const struct pipe_draw_start_count_bias *sc);
void (*upload_indirect_render_state)(struct iris_context *ice,
const struct pipe_draw_info *draw,
const struct pipe_draw_indirect_info *indirect,
const struct pipe_draw_start_count_bias *sc);
void (*update_binder_address)(struct iris_batch *batch,
struct iris_binder *binder);
void (*upload_compute_state)(struct iris_context *ice,

View file

@ -8317,6 +8317,150 @@ iris_upload_render_state(struct iris_context *ice,
trace_intel_end_draw(&batch->trace, count);
}
/**
 * Upload render state for an indirect draw and emit it as a single
 * EXECUTE_INDIRECT_DRAW command.
 *
 * Only compiled for GFX_VERx10 >= 125; on other generations the function
 * body is unreachable().
 *
 * \param ice       context; the IRIS_BATCH_RENDER batch is used
 * \param draw      draw info (index size / index buffer, instance count)
 * \param indirect  indirect descriptor; must be non-NULL (asserted)
 * \param sc        start/count/bias.  Read for the user-index upload, the
 *                  3DPRIMITIVE workaround helper and the trace vertex
 *                  count; a NULL sc is treated as a count of 0 for the
 *                  latter two.
 */
static void
iris_upload_indirect_render_state(struct iris_context *ice,
                                  const struct pipe_draw_info *draw,
                                  const struct pipe_draw_indirect_info *indirect,
                                  const struct pipe_draw_start_count_bias *sc)
{
#if GFX_VERx10 >= 125
   assert(indirect);

   struct iris_batch *batch = &ice->batches[IRIS_BATCH_RENDER];
   UNUSED struct iris_screen *screen = batch->screen;
   UNUSED const struct intel_device_info *devinfo = screen->devinfo;
   const bool use_predicate =
      ice->state.predicate == IRIS_PREDICATE_STATE_USE_BIT;

   trace_intel_begin_draw(&batch->trace);

   if (ice->state.dirty & IRIS_DIRTY_VERTEX_BUFFER_FLUSHES)
      flush_vbos(ice, batch);

   iris_batch_sync_region_start(batch);

   /* Always pin the binder.  If we're emitting new binding table pointers,
    * we need it.  If not, we're probably inheriting old tables via the
    * context, and need it anyway.  Since true zero-bindings cases are
    * practically non-existent, just pin it and avoid last_res tracking.
    */
   iris_use_pinned_bo(batch, ice->state.binder.bo, false,
                      IRIS_DOMAIN_NONE);

   if (!batch->contains_draw) {
      /* Re-emit constants when starting a new batch buffer in order to
       * work around push constant corruption on context switch.
       *
       * XXX - Provide hardware spec quotation when available.
       */
      ice->state.stage_dirty |= (IRIS_STAGE_DIRTY_CONSTANTS_VS  |
                                 IRIS_STAGE_DIRTY_CONSTANTS_TCS |
                                 IRIS_STAGE_DIRTY_CONSTANTS_TES |
                                 IRIS_STAGE_DIRTY_CONSTANTS_GS  |
                                 IRIS_STAGE_DIRTY_CONSTANTS_FS);
      batch->contains_draw = true;
   }

   if (!batch->contains_draw_with_next_seqno) {
      iris_restore_render_saved_bos(ice, batch, draw);
      batch->contains_draw_with_next_seqno = true;
   }

   /* Wa_1306463417 - Send HS state for every primitive on gfx11.
    * Wa_16011107343 (same for gfx12)
    * We implement this by setting TCS dirty on each draw.
    */
   if ((INTEL_NEEDS_WA_1306463417 || INTEL_NEEDS_WA_16011107343) &&
       ice->shaders.prog[MESA_SHADER_TESS_CTRL]) {
      ice->state.stage_dirty |= IRIS_STAGE_DIRTY_TCS;
   }

   iris_upload_dirty_render_state(ice, batch, draw);

   if (draw->index_size > 0) {
      unsigned offset;

      if (draw->has_user_indices) {
         /* Copy the referenced range of user indices into the const
          * uploader, then bias the offset back so that index 0 still
          * corresponds to the start of the user array.
          */
         unsigned start_offset = draw->index_size * sc->start;

         u_upload_data(ice->ctx.const_uploader, start_offset,
                       sc->count * draw->index_size, 4,
                       (char*)draw->index.user + start_offset,
                       &offset, &ice->state.last_res.index_buffer);
         offset -= start_offset;
      } else {
         struct iris_resource *res = (void *) draw->index.resource;
         res->bind_history |= PIPE_BIND_INDEX_BUFFER;

         pipe_resource_reference(&ice->state.last_res.index_buffer,
                                 draw->index.resource);
         offset = 0;

         iris_emit_buffer_barrier_for(batch, res->bo, IRIS_DOMAIN_VF_READ);
      }

      struct iris_genx_state *genx = ice->state.genx;
      struct iris_bo *bo = iris_resource_bo(ice->state.last_res.index_buffer);

      uint32_t ib_packet[GENX(3DSTATE_INDEX_BUFFER_length)];
      iris_pack_command(GENX(3DSTATE_INDEX_BUFFER), ib_packet, ib) {
         ib.IndexFormat = draw->index_size >> 1;
         ib.MOCS = iris_mocs(bo, &batch->screen->isl_dev,
                             ISL_SURF_USAGE_INDEX_BUFFER_BIT);
         ib.BufferSize = bo->size - offset;
         ib.BufferStartingAddress = ro_bo(NULL, bo->address + offset);
         ib.L3BypassDisable = true;
      }

      /* Only emit 3DSTATE_INDEX_BUFFER when the packed packet differs from
       * the last one emitted.
       */
      if (memcmp(genx->last_index_buffer, ib_packet, sizeof(ib_packet)) != 0) {
         memcpy(genx->last_index_buffer, ib_packet, sizeof(ib_packet));
         iris_batch_emit(batch, ib_packet, sizeof(ib_packet));
         iris_use_pinned_bo(batch, bo, false, IRIS_DOMAIN_VF_READ);
      }
   }

   iris_measure_snapshot(ice, batch, INTEL_SNAPSHOT_DRAW, draw, indirect, sc);

   genX(maybe_emit_breakpoint)(batch, true);

   iris_emit_cmd(batch, GENX(EXECUTE_INDIRECT_DRAW), ind) {
      ind.ArgumentFormat =
         draw->index_size > 0 ? DRAWINDEXED : DRAW;
      ind.PredicateEnable = use_predicate;
      ind.TBIMREnabled = ice->state.use_tbimr;
      ind.MaxCount = indirect->draw_count;

      if (indirect->buffer) {
         struct iris_bo *bo = iris_resource_bo(indirect->buffer);
         ind.ArgumentBufferStartAddress = ro_bo(bo, indirect->offset);
         ind.MOCS = iris_mocs(bo, &screen->isl_dev, 0);
      } else {
         ind.MOCS = iris_mocs(NULL, &screen->isl_dev, 0);
      }

      if (indirect->indirect_draw_count) {
         struct iris_bo *draw_count_bo =
            iris_resource_bo(indirect->indirect_draw_count);
         ind.CountBufferIndirectEnable = true;
         ind.CountBufferAddress =
            ro_bo(draw_count_bo, indirect->indirect_draw_count_offset);
      }
   }

   /* Resolve the vertex count once, NULL-safely, and use it for both the
    * workaround helper and the trace event.  Previously sc was dereferenced
    * for the helper and only NULL-checked afterwards.
    */
   const uint32_t vertex_count = sc ? sc->count : 0;

   genX(emit_3dprimitive_was)(batch, indirect, ice->state.prim_mode,
                              vertex_count);
   genX(maybe_emit_breakpoint)(batch, false);

   iris_batch_sync_region_end(batch);

   /* Trace count is vertices times instances (instance_count 0 counts as 1). */
   uint32_t count = vertex_count;
   count *= draw->instance_count ? draw->instance_count : 1;
   trace_intel_end_draw(&batch->trace, count);
#else
   unreachable("Unsupported path");
#endif /* GFX_VERx10 >= 125 */
}
static void
iris_load_indirect_location(struct iris_context *ice,
struct iris_batch *batch,
@ -9728,6 +9872,7 @@ genX(init_screen_state)(struct iris_screen *screen)
screen->vtbl.init_render_context = iris_init_render_context;
screen->vtbl.init_compute_context = iris_init_compute_context;
screen->vtbl.upload_render_state = iris_upload_render_state;
screen->vtbl.upload_indirect_render_state = iris_upload_indirect_render_state;
screen->vtbl.update_binder_address = iris_update_binder_address;
screen->vtbl.upload_compute_state = iris_upload_compute_state;
screen->vtbl.emit_raw_pipe_control = iris_emit_raw_pipe_control;