diff --git a/src/gallium/drivers/freedreno/a6xx/fd6_compute.c b/src/gallium/drivers/freedreno/a6xx/fd6_compute.c index bf4c377079d..0134e93d46a 100644 --- a/src/gallium/drivers/freedreno/a6xx/fd6_compute.c +++ b/src/gallium/drivers/freedreno/a6xx/fd6_compute.c @@ -60,6 +60,10 @@ cs_program_emit(struct fd_context *ctx, struct fd_ringbuffer *ring, OUT_PKT4(ring, REG_A6XX_SP_CS_CONFIG, 2); OUT_RING(ring, A6XX_SP_CS_CONFIG_ENABLED | + COND(v->bindless_tex, A6XX_SP_CS_CONFIG_BINDLESS_TEX) | + COND(v->bindless_samp, A6XX_SP_CS_CONFIG_BINDLESS_SAMP) | + COND(v->bindless_ibo, A6XX_SP_CS_CONFIG_BINDLESS_IBO) | + COND(v->bindless_ubo, A6XX_SP_CS_CONFIG_BINDLESS_UBO) | A6XX_SP_CS_CONFIG_NIBO(ir3_shader_nibo(v)) | A6XX_SP_CS_CONFIG_NTEX(v->num_samp) | A6XX_SP_CS_CONFIG_NSAMP(v->num_samp)); /* SP_VS_CONFIG */ diff --git a/src/gallium/drivers/freedreno/a6xx/fd6_context.c b/src/gallium/drivers/freedreno/a6xx/fd6_context.c index 1a14afbe126..8454b25ba00 100644 --- a/src/gallium/drivers/freedreno/a6xx/fd6_context.c +++ b/src/gallium/drivers/freedreno/a6xx/fd6_context.c @@ -51,6 +51,10 @@ fd6_context_destroy(struct pipe_context *pctx) in_dt { struct fd6_context *fd6_ctx = fd6_context(fd_context(pctx)); + fd6_descriptor_set_invalidate(&fd6_ctx->cs_descriptor_set); + for (unsigned i = 0; i < ARRAY_SIZE(fd6_ctx->descriptor_sets); i++) + fd6_descriptor_set_invalidate(&fd6_ctx->descriptor_sets[i]); + if (fd6_ctx->streamout_disable_stateobj) fd_ringbuffer_del(fd6_ctx->streamout_disable_stateobj); @@ -184,6 +188,26 @@ setup_state_map(struct fd_context *ctx) fd_context_add_shader_map(ctx, PIPE_SHADER_FRAGMENT, FD_DIRTY_SHADER_TEX, BIT(FD6_GROUP_FS_TEX)); + fd_context_add_shader_map(ctx, PIPE_SHADER_VERTEX, + FD_DIRTY_SHADER_SSBO | FD_DIRTY_SHADER_IMAGE, + BIT(FD6_GROUP_VS_BINDLESS)); + fd_context_add_shader_map(ctx, PIPE_SHADER_TESS_CTRL, + FD_DIRTY_SHADER_SSBO | FD_DIRTY_SHADER_IMAGE, + BIT(FD6_GROUP_HS_BINDLESS)); + fd_context_add_shader_map(ctx, PIPE_SHADER_TESS_EVAL, + 
FD_DIRTY_SHADER_SSBO | FD_DIRTY_SHADER_IMAGE, + BIT(FD6_GROUP_DS_BINDLESS)); + fd_context_add_shader_map(ctx, PIPE_SHADER_GEOMETRY, + FD_DIRTY_SHADER_SSBO | FD_DIRTY_SHADER_IMAGE, + BIT(FD6_GROUP_GS_BINDLESS)); + /* NOTE: FD6_GROUP_FS_BINDLESS has a weak dependency on the program + * state (ie. it needs to be re-generated with fb-read descriptor + * patched in) but this special case is handled in fd6_emit_3d_state() + */ + fd_context_add_shader_map(ctx, PIPE_SHADER_FRAGMENT, + FD_DIRTY_SHADER_SSBO | FD_DIRTY_SHADER_IMAGE, + BIT(FD6_GROUP_FS_BINDLESS)); + /* NOTE: scissor enabled bit is part of rasterizer state, but * fd_rasterizer_state_bind() will mark scissor dirty if needed: */ diff --git a/src/gallium/drivers/freedreno/a6xx/fd6_context.h b/src/gallium/drivers/freedreno/a6xx/fd6_context.h index 37a1051f717..3bfccc800f8 100644 --- a/src/gallium/drivers/freedreno/a6xx/fd6_context.h +++ b/src/gallium/drivers/freedreno/a6xx/fd6_context.h @@ -62,8 +62,22 @@ struct fd6_descriptor_set { * resource has been rebound */ uint16_t seqno[IR3_BINDLESS_DESC_COUNT]; + + /** + * Current GPU copy of the descriptor set + */ + struct fd_bo *bo; }; + +static void +fd6_descriptor_set_invalidate(struct fd6_descriptor_set *set) +{ + if (!set->bo) + return; + fd_bo_del(set->bo); + set->bo = NULL; +} + struct fd6_context { struct fd_context base; diff --git a/src/gallium/drivers/freedreno/a6xx/fd6_emit.c b/src/gallium/drivers/freedreno/a6xx/fd6_emit.c index 3ce2d0e36e2..a3b86ccb05e 100644 --- a/src/gallium/drivers/freedreno/a6xx/fd6_emit.c +++ b/src/gallium/drivers/freedreno/a6xx/fd6_emit.c @@ -793,11 +793,13 @@ fd6_emit_3d_state(struct fd_ringbuffer *ring, struct fd6_emit *emit) emit_marker6(ring, 5); - /* NOTE: we track fb_read differently than _BLEND_ENABLED since we - * might decide to do sysmem in some cases when blend is enabled: + /* Special case, we need to re-emit bindless FS state w/ the + * fb-read state appended: */ - if (fs->fb_read) + if ((emit->dirty_groups & 
BIT(FD6_GROUP_PROG)) && fs->fb_read) { ctx->batch->gmem_reason |= FD_GMEM_FB_READ; + emit->dirty_groups |= BIT(FD6_GROUP_FS_BINDLESS); + } u_foreach_bit (b, emit->dirty_groups) { enum fd6_state_id group = b; @@ -862,6 +864,26 @@ fd6_emit_3d_state(struct fd_ringbuffer *ring, struct fd6_emit *emit) state = build_ibo(emit); fd6_state_take_group(&emit->state, state, FD6_GROUP_IBO); break; + case FD6_GROUP_VS_BINDLESS: + state = fd6_build_bindless_state(ctx, PIPE_SHADER_VERTEX, false); + fd6_state_take_group(&emit->state, state, FD6_GROUP_VS_BINDLESS); + break; + case FD6_GROUP_HS_BINDLESS: + state = fd6_build_bindless_state(ctx, PIPE_SHADER_TESS_CTRL, false); + fd6_state_take_group(&emit->state, state, FD6_GROUP_HS_BINDLESS); + break; + case FD6_GROUP_DS_BINDLESS: + state = fd6_build_bindless_state(ctx, PIPE_SHADER_TESS_EVAL, false); + fd6_state_take_group(&emit->state, state, FD6_GROUP_DS_BINDLESS); + break; + case FD6_GROUP_GS_BINDLESS: + state = fd6_build_bindless_state(ctx, PIPE_SHADER_GEOMETRY, false); + fd6_state_take_group(&emit->state, state, FD6_GROUP_GS_BINDLESS); + break; + case FD6_GROUP_FS_BINDLESS: + state = fd6_build_bindless_state(ctx, PIPE_SHADER_FRAGMENT, fs->fb_read); + fd6_state_take_group(&emit->state, state, FD6_GROUP_FS_BINDLESS); + break; case FD6_GROUP_CONST: state = fd6_build_user_consts(emit); fd6_state_take_group(&emit->state, state, FD6_GROUP_CONST); @@ -913,6 +935,7 @@ void fd6_emit_cs_state(struct fd_context *ctx, struct fd_ringbuffer *ring, struct ir3_shader_variant *cp) { + struct fd6_state state = {}; enum fd_dirty_shader_state dirty = ctx->dirty_shader[PIPE_SHADER_COMPUTE]; if (dirty & (FD_DIRTY_SHADER_TEX | FD_DIRTY_SHADER_PROG | @@ -957,6 +980,24 @@ fd6_emit_cs_state(struct fd_context *ctx, struct fd_ringbuffer *ring, fd_ringbuffer_del(state); } + + u_foreach_bit (b, ctx->gen_dirty) { + enum fd6_state_id group = b; + + switch (group) { + case FD6_GROUP_CS_BINDLESS: + fd6_state_take_group( + &state, + fd6_build_bindless_state(ctx, 
PIPE_SHADER_COMPUTE, false), + FD6_GROUP_CS_BINDLESS); + break; + default: + /* State-group unused for compute shaders */ + break; + } + } + + fd6_state_emit(&state, ring); } /* emit setup at begin of new cmdstream buffer (don't rely on previous diff --git a/src/gallium/drivers/freedreno/a6xx/fd6_emit.h b/src/gallium/drivers/freedreno/a6xx/fd6_emit.h index 6a67877425c..0f085b6c370 100644 --- a/src/gallium/drivers/freedreno/a6xx/fd6_emit.h +++ b/src/gallium/drivers/freedreno/a6xx/fd6_emit.h @@ -66,6 +66,11 @@ enum fd6_state_id { FD6_GROUP_BLEND_COLOR, FD6_GROUP_SO, FD6_GROUP_IBO, + FD6_GROUP_VS_BINDLESS, + FD6_GROUP_HS_BINDLESS, + FD6_GROUP_DS_BINDLESS, + FD6_GROUP_GS_BINDLESS, + FD6_GROUP_FS_BINDLESS, /* * Virtual state-groups, which don't turn into a CP_SET_DRAW_STATE group @@ -73,6 +78,12 @@ enum fd6_state_id { FD6_GROUP_PROG_KEY, /* Set for any state which could change shader key */ FD6_GROUP_NON_GROUP, /* placeholder group for state emit in IB2, keep last */ + + /* + * Note that since we don't interleave draws and grids in the same batch, + * the compute vs draw state groups can overlap: + */ + FD6_GROUP_CS_BINDLESS = FD6_GROUP_VS_BINDLESS, }; #define ENABLE_ALL \ @@ -133,6 +144,7 @@ fd6_state_take_group(struct fd6_state *state, struct fd_ringbuffer *stateobj, [FD6_GROUP_PROG_BINNING] = CP_SET_DRAW_STATE__0_BINNING, [FD6_GROUP_PROG_INTERP] = ENABLE_DRAW, [FD6_GROUP_FS_TEX] = ENABLE_DRAW, + [FD6_GROUP_FS_BINDLESS] = ENABLE_DRAW, }; assert(state->num_groups < ARRAY_SIZE(state->groups)); struct fd6_state_group *g = &state->groups[state->num_groups++]; diff --git a/src/gallium/drivers/freedreno/a6xx/fd6_image.c b/src/gallium/drivers/freedreno/a6xx/fd6_image.c index e13a7f81cad..157d0351e54 100644 --- a/src/gallium/drivers/freedreno/a6xx/fd6_image.c +++ b/src/gallium/drivers/freedreno/a6xx/fd6_image.c @@ -158,6 +158,17 @@ descriptor_set(struct fd_context *ctx, enum pipe_shader_type shader) static void clear_descriptor(struct fd6_descriptor_set *set, unsigned slot) { 
+ /* The 2nd dword of the descriptor contains the width and height. + * So a non-zero value means the slot was previously valid and + * must be cleared. We can't leave dangling descriptors as the + * shader could use variable indexing into the set of IBOs to + * get at them. See piglit arb_shader_image_load_store-invalid. + */ + if (!set->descriptor[slot][1]) + return; + + fd6_descriptor_set_invalidate(set); + memset(set->descriptor[slot], 0, sizeof(set->descriptor[slot])); } @@ -170,6 +181,8 @@ validate_image_descriptor(struct fd_context *ctx, struct fd6_descriptor_set *set if (!rsc || (rsc->seqno == set->seqno[slot])) return; + fd6_descriptor_set_invalidate(set); + fd6_image_descriptor(ctx, img, set->descriptor[slot]); set->seqno[slot] = rsc->seqno; } @@ -183,6 +196,8 @@ validate_buffer_descriptor(struct fd_context *ctx, struct fd6_descriptor_set *se if (!rsc || (rsc->seqno == set->seqno[slot])) return; + fd6_descriptor_set_invalidate(set); + fd6_ssbo_descriptor(ctx, buf, set->descriptor[slot]); set->seqno[slot] = rsc->seqno; } @@ -221,6 +236,182 @@ fd6_build_ibo_state(struct fd_context *ctx, const struct ir3_shader_variant *v, return state; } +/* Build bindless descriptor state, returns ownership of state reference */ +struct fd_ringbuffer * +fd6_build_bindless_state(struct fd_context *ctx, enum pipe_shader_type shader, + bool append_fb_read) +{ + struct fd_shaderbuf_stateobj *bufso = &ctx->shaderbuf[shader]; + struct fd_shaderimg_stateobj *imgso = &ctx->shaderimg[shader]; + struct fd6_descriptor_set *set = descriptor_set(ctx, shader); + + struct fd_ringbuffer *ring = fd_submit_new_ringbuffer( + ctx->batch->submit, 16 * 4, FD_RINGBUFFER_STREAMING); + + /* Don't re-use a previous descriptor set if appending the + * fb-read descriptor, as that can change across batches. + * The normal descriptor slots are safe to re-use even if + * the state is dirtied due to batch flush, but the fb-read + * slot is not. 
+ */ + if (unlikely(append_fb_read)) + fd6_descriptor_set_invalidate(set); + + /* + * Re-validate the descriptor slots, ie. in the case that + * the resource gets rebound due to use with non-UBWC + * compatible view format, etc. + * + * While we are at it, attach the BOs to the ring. + */ + + u_foreach_bit (b, bufso->enabled_mask) { + struct pipe_shader_buffer *buf = &bufso->sb[b]; + unsigned idx = b + IR3_BINDLESS_SSBO_OFFSET; + validate_buffer_descriptor(ctx, set, idx, buf); + if (buf->buffer) + fd_ringbuffer_attach_bo(ring, fd_resource(buf->buffer)->bo); + } + + u_foreach_bit (b, imgso->enabled_mask) { + struct pipe_image_view *img = &imgso->si[b]; + unsigned idx = b + IR3_BINDLESS_IMAGE_OFFSET; + validate_image_descriptor(ctx, set, idx, img); + if (img->resource) + fd_ringbuffer_attach_bo(ring, fd_resource(img->resource)->bo); + } + + if (!set->bo) { + set->bo = fd_bo_new( + ctx->dev, sizeof(set->descriptor), + /* Use same flags as ringbuffer so hits the same heap, + * because those will already have the FD_RELOC_DUMP + * flag set: + */ + FD_BO_GPUREADONLY | FD_BO_CACHED_COHERENT, + "%s bindless", _mesa_shader_stage_to_abbrev(shader)); + fd_bo_mark_for_dump(set->bo); + + uint32_t *desc_buf = fd_bo_map(set->bo); + + memcpy(desc_buf, set->descriptor, sizeof(set->descriptor)); + + if (unlikely(append_fb_read)) { + /* The last image slot is used for fb-read: */ + unsigned idx = IR3_BINDLESS_DESC_COUNT - 1; + + /* This is patched with the appropriate descriptor for GMEM or + * sysmem rendering path in fd6_gmem + */ + + struct fd_cs_patch patch = { + .cs = &desc_buf[idx * FDL6_TEX_CONST_DWORDS], + }; + util_dynarray_append(&ctx->batch->fb_read_patches, + __typeof__(patch), patch); + } + } + + /* + * Build stateobj emitting reg writes to configure the descriptor + * set and CP_LOAD_STATE packets to preload the state. 
+ * + * Note that unless the app is using the max # of SSBOs there will + * be a gap between the IBO descriptors used for SSBOs and for images, + * so emit this as two CP_LOAD_STATE packets: + */ + + unsigned idx = ir3_shader_descriptor_set(shader); + + if (shader == PIPE_SHADER_COMPUTE) { + OUT_REG(ring, A6XX_HLSQ_INVALIDATE_CMD(.cs_bindless = 0x1f)); + OUT_REG(ring, A6XX_SP_CS_BINDLESS_BASE_DESCRIPTOR( + idx, .desc_size = BINDLESS_DESCRIPTOR_64B, .bo = set->bo, + )); + OUT_REG(ring, A6XX_HLSQ_CS_BINDLESS_BASE_DESCRIPTOR( + idx, .desc_size = BINDLESS_DESCRIPTOR_64B, .bo = set->bo, + )); + + if (bufso->enabled_mask) { + OUT_PKT(ring, CP_LOAD_STATE6_FRAG, + CP_LOAD_STATE6_0( + .dst_off = IR3_BINDLESS_SSBO_OFFSET, + .state_type = ST6_IBO, + .state_src = SS6_BINDLESS, + .state_block = SB6_CS_SHADER, + .num_unit = util_last_bit(bufso->enabled_mask), + ), + CP_LOAD_STATE6_EXT_SRC_ADDR( + /* This isn't actually an address: */ + .qword = (idx << 28) | + IR3_BINDLESS_SSBO_OFFSET * FDL6_TEX_CONST_DWORDS, + ), + ); + } + + if (imgso->enabled_mask) { + OUT_PKT(ring, CP_LOAD_STATE6_FRAG, + CP_LOAD_STATE6_0( + .dst_off = IR3_BINDLESS_IMAGE_OFFSET, + .state_type = ST6_IBO, + .state_src = SS6_BINDLESS, + .state_block = SB6_CS_SHADER, + .num_unit = util_last_bit(imgso->enabled_mask), + ), + CP_LOAD_STATE6_EXT_SRC_ADDR( + /* This isn't actually an address: */ + .qword = (idx << 28) | + IR3_BINDLESS_IMAGE_OFFSET * FDL6_TEX_CONST_DWORDS, + ), + ); + } + } else { + OUT_REG(ring, A6XX_HLSQ_INVALIDATE_CMD(.gfx_bindless = 0x1f)); + OUT_REG(ring, A6XX_SP_BINDLESS_BASE_DESCRIPTOR( + idx, .desc_size = BINDLESS_DESCRIPTOR_64B, .bo = set->bo, + )); + OUT_REG(ring, A6XX_HLSQ_BINDLESS_BASE_DESCRIPTOR( + idx, .desc_size = BINDLESS_DESCRIPTOR_64B, .bo = set->bo, + )); + + if (bufso->enabled_mask) { + OUT_PKT(ring, CP_LOAD_STATE6, + CP_LOAD_STATE6_0( + .dst_off = IR3_BINDLESS_SSBO_OFFSET, + .state_type = ST6_SHADER, + .state_src = SS6_BINDLESS, + .state_block = SB6_IBO, + .num_unit = 
util_last_bit(bufso->enabled_mask), + ), + CP_LOAD_STATE6_EXT_SRC_ADDR( + /* This isn't actually an address: */ + .qword = (idx << 28) | + IR3_BINDLESS_SSBO_OFFSET * FDL6_TEX_CONST_DWORDS, + ), + ); + } + + if (imgso->enabled_mask) { + OUT_PKT(ring, CP_LOAD_STATE6, + CP_LOAD_STATE6_0( + .dst_off = IR3_BINDLESS_IMAGE_OFFSET, + .state_type = ST6_SHADER, + .state_src = SS6_BINDLESS, + .state_block = SB6_IBO, + .num_unit = util_last_bit(imgso->enabled_mask), + ), + CP_LOAD_STATE6_EXT_SRC_ADDR( + /* This isn't actually an address: */ + .qword = (idx << 28) | + IR3_BINDLESS_IMAGE_OFFSET * FDL6_TEX_CONST_DWORDS, + ), + ); + } + } + + return ring; +} + static void fd6_set_shader_buffers(struct pipe_context *pctx, enum pipe_shader_type shader, unsigned start, unsigned count, diff --git a/src/gallium/drivers/freedreno/a6xx/fd6_image.h b/src/gallium/drivers/freedreno/a6xx/fd6_image.h index 439317ea572..22235640e02 100644 --- a/src/gallium/drivers/freedreno/a6xx/fd6_image.h +++ b/src/gallium/drivers/freedreno/a6xx/fd6_image.h @@ -39,6 +39,9 @@ struct ir3_shader_variant; struct fd_ringbuffer * fd6_build_ibo_state(struct fd_context *ctx, const struct ir3_shader_variant *v, enum pipe_shader_type shader) assert_dt; +struct fd_ringbuffer * +fd6_build_bindless_state(struct fd_context *ctx, enum pipe_shader_type shader, + bool append_fb_read) assert_dt; void fd6_image_init(struct pipe_context *pctx); diff --git a/src/gallium/drivers/freedreno/a6xx/fd6_program.c b/src/gallium/drivers/freedreno/a6xx/fd6_program.c index 9ff022d5f5c..1b1eee0c570 100644 --- a/src/gallium/drivers/freedreno/a6xx/fd6_program.c +++ b/src/gallium/drivers/freedreno/a6xx/fd6_program.c @@ -289,6 +289,22 @@ setup_stream_out(struct fd_context *ctx, struct fd6_program_state *state, state->streamout_stateobj = ring; } +static uint32_t +sp_xs_config(struct ir3_shader_variant *v) +{ + if (!v) + return 0; + + return A6XX_SP_VS_CONFIG_ENABLED | + COND(v->bindless_tex, A6XX_SP_VS_CONFIG_BINDLESS_TEX) | + 
COND(v->bindless_samp, A6XX_SP_VS_CONFIG_BINDLESS_SAMP) | + COND(v->bindless_ibo, A6XX_SP_VS_CONFIG_BINDLESS_IBO) | + COND(v->bindless_ubo, A6XX_SP_VS_CONFIG_BINDLESS_UBO) | + A6XX_SP_VS_CONFIG_NIBO(ir3_shader_nibo(v)) | + A6XX_SP_VS_CONFIG_NTEX(v->num_samp) | + A6XX_SP_VS_CONFIG_NSAMP(v->num_samp); +} + static void setup_config_stateobj(struct fd_context *ctx, struct fd6_program_state *state) { @@ -318,37 +334,19 @@ setup_config_stateobj(struct fd_context *ctx, struct fd6_program_state *state) A6XX_HLSQ_FS_CNTL_ENABLED); OUT_PKT4(ring, REG_A6XX_SP_VS_CONFIG, 1); - OUT_RING(ring, COND(state->vs, A6XX_SP_VS_CONFIG_ENABLED) | - A6XX_SP_VS_CONFIG_NIBO(ir3_shader_nibo(state->vs)) | - A6XX_SP_VS_CONFIG_NTEX(state->vs->num_samp) | - A6XX_SP_VS_CONFIG_NSAMP(state->vs->num_samp)); + OUT_RING(ring, sp_xs_config(state->vs)); OUT_PKT4(ring, REG_A6XX_SP_HS_CONFIG, 1); - OUT_RING(ring, COND(state->hs, - A6XX_SP_HS_CONFIG_ENABLED | - A6XX_SP_HS_CONFIG_NIBO(ir3_shader_nibo(state->hs)) | - A6XX_SP_HS_CONFIG_NTEX(state->hs->num_samp) | - A6XX_SP_HS_CONFIG_NSAMP(state->hs->num_samp))); + OUT_RING(ring, sp_xs_config(state->hs)); OUT_PKT4(ring, REG_A6XX_SP_DS_CONFIG, 1); - OUT_RING(ring, COND(state->ds, - A6XX_SP_DS_CONFIG_ENABLED | - A6XX_SP_DS_CONFIG_NIBO(ir3_shader_nibo(state->ds)) | - A6XX_SP_DS_CONFIG_NTEX(state->ds->num_samp) | - A6XX_SP_DS_CONFIG_NSAMP(state->ds->num_samp))); + OUT_RING(ring, sp_xs_config(state->ds)); OUT_PKT4(ring, REG_A6XX_SP_GS_CONFIG, 1); - OUT_RING(ring, COND(state->gs, - A6XX_SP_GS_CONFIG_ENABLED | - A6XX_SP_GS_CONFIG_NIBO(ir3_shader_nibo(state->gs)) | - A6XX_SP_GS_CONFIG_NTEX(state->gs->num_samp) | - A6XX_SP_GS_CONFIG_NSAMP(state->gs->num_samp))); + OUT_RING(ring, sp_xs_config(state->gs)); OUT_PKT4(ring, REG_A6XX_SP_FS_CONFIG, 1); - OUT_RING(ring, COND(state->fs, A6XX_SP_FS_CONFIG_ENABLED) | - A6XX_SP_FS_CONFIG_NIBO(ir3_shader_nibo(state->fs)) | - A6XX_SP_FS_CONFIG_NTEX(state->fs->num_samp) | - A6XX_SP_FS_CONFIG_NSAMP(state->fs->num_samp)); + 
OUT_RING(ring, sp_xs_config(state->fs)); OUT_PKT4(ring, REG_A6XX_SP_IBO_COUNT, 1); OUT_RING(ring, ir3_shader_nibo(state->fs));