From 3ddcf8ab92a1d3ee3fc6ac64ea2cc18df4de7047 Mon Sep 17 00:00:00 2001 From: Rob Clark Date: Tue, 8 Jul 2025 09:48:50 -0700 Subject: [PATCH] freedreno/a6xx: Convert gallium to new cs builders Signed-off-by: Rob Clark Part-of: --- src/freedreno/common/fd6_pack.h | 7 + .../drivers/freedreno/a6xx/fd6_barrier.cc | 30 +- .../drivers/freedreno/a6xx/fd6_barrier.h | 14 +- .../drivers/freedreno/a6xx/fd6_blend.cc | 72 +- .../drivers/freedreno/a6xx/fd6_blitter.cc | 652 ++++---- .../drivers/freedreno/a6xx/fd6_blitter.h | 7 +- .../drivers/freedreno/a6xx/fd6_compute.cc | 295 ++-- .../drivers/freedreno/a6xx/fd6_const.cc | 209 ++- .../drivers/freedreno/a6xx/fd6_const.h | 14 +- .../drivers/freedreno/a6xx/fd6_context.cc | 48 +- .../drivers/freedreno/a6xx/fd6_context.h | 10 +- .../drivers/freedreno/a6xx/fd6_draw.cc | 236 +-- .../drivers/freedreno/a6xx/fd6_emit.cc | 640 ++++---- src/gallium/drivers/freedreno/a6xx/fd6_emit.h | 155 +- .../drivers/freedreno/a6xx/fd6_gmem.cc | 1450 +++++++++-------- .../drivers/freedreno/a6xx/fd6_image.cc | 159 +- .../drivers/freedreno/a6xx/fd6_program.cc | 1029 ++++++------ .../drivers/freedreno/a6xx/fd6_program.h | 4 +- .../drivers/freedreno/a6xx/fd6_query.cc | 354 ++-- .../drivers/freedreno/a6xx/fd6_rasterizer.cc | 64 +- .../drivers/freedreno/a6xx/fd6_resource.h | 2 - .../drivers/freedreno/a6xx/fd6_texture.cc | 164 +- src/gallium/drivers/freedreno/a6xx/fd6_zsa.cc | 121 +- src/gallium/drivers/freedreno/a6xx/fd6_zsa.h | 6 - 24 files changed, 2809 insertions(+), 2933 deletions(-) diff --git a/src/freedreno/common/fd6_pack.h b/src/freedreno/common/fd6_pack.h index 96d7d4536f3..63d555545f0 100644 --- a/src/freedreno/common/fd6_pack.h +++ b/src/freedreno/common/fd6_pack.h @@ -52,6 +52,13 @@ __reg_iova(const struct fd_reg_pair *reg) return iova << reg->bo_low; } +/* Special helper for building UBO descriptors inline with pkt7 */ +#define A6XX_UBO_DESC(_i, _bo, _bo_offset, _size_vec4s) { \ + .reg = 3 + (2 * _i), \ + .value = (uint64_t)A6XX_UBO_1_SIZE(_size_vec4s) << 32, \ + .bo = _bo, .bo_offset = _bo_offset, \ + }, {} + /** * Helper for various builders that use fd_ringbuffer. Not for direct use. */ diff --git a/src/gallium/drivers/freedreno/a6xx/fd6_barrier.cc b/src/gallium/drivers/freedreno/a6xx/fd6_barrier.cc index bdf05024041..f1a5b4733f0 100644 --- a/src/gallium/drivers/freedreno/a6xx/fd6_barrier.cc +++ b/src/gallium/drivers/freedreno/a6xx/fd6_barrier.cc @@ -12,8 +12,7 @@ template void -fd6_emit_flushes(struct fd_context *ctx, struct fd_ringbuffer *ring, - unsigned flushes) +fd6_emit_flushes(struct fd_context *ctx, fd_cs &cs, unsigned flushes) { /* Experiments show that invalidating CCU while it still has data in it * doesn't work, so make sure to always flush before invalidating in case @@ -21,43 +20,34 @@ fd6_emit_flushes(struct fd_context *ctx, struct fd_ringbuffer *ring, * However it does seem to work for UCHE. 
*/ if (flushes & (FD6_FLUSH_CCU_COLOR | FD6_INVALIDATE_CCU_COLOR)) - fd6_event_write(ctx, ring, FD_CCU_CLEAN_COLOR); + fd6_event_write(ctx, cs, FD_CCU_CLEAN_COLOR); if (flushes & (FD6_FLUSH_CCU_DEPTH | FD6_INVALIDATE_CCU_DEPTH)) - fd6_event_write(ctx, ring, FD_CCU_CLEAN_DEPTH); + fd6_event_write(ctx, cs, FD_CCU_CLEAN_DEPTH); if (flushes & FD6_INVALIDATE_CCU_COLOR) - fd6_event_write(ctx, ring, FD_CCU_INVALIDATE_COLOR); + fd6_event_write(ctx, cs, FD_CCU_INVALIDATE_COLOR); if (flushes & FD6_INVALIDATE_CCU_DEPTH) - fd6_event_write(ctx, ring, FD_CCU_INVALIDATE_DEPTH); + fd6_event_write(ctx, cs, FD_CCU_INVALIDATE_DEPTH); if (flushes & FD6_FLUSH_CACHE) - fd6_event_write(ctx, ring, FD_CACHE_CLEAN); + fd6_event_write(ctx, cs, FD_CACHE_CLEAN); if (flushes & FD6_INVALIDATE_CACHE) - fd6_event_write(ctx, ring, FD_CACHE_INVALIDATE); + fd6_event_write(ctx, cs, FD_CACHE_INVALIDATE); if (flushes & FD6_WAIT_MEM_WRITES) - OUT_PKT7(ring, CP_WAIT_MEM_WRITES, 0); + fd_pkt7(cs, CP_WAIT_MEM_WRITES, 0); if (flushes & FD6_WAIT_FOR_IDLE) - OUT_PKT7(ring, CP_WAIT_FOR_IDLE, 0); + fd_pkt7(cs, CP_WAIT_FOR_IDLE, 0); if (flushes & FD6_WAIT_FOR_ME) - OUT_PKT7(ring, CP_WAIT_FOR_ME, 0); + fd_pkt7(cs, CP_WAIT_FOR_ME, 0); } FD_GENX(fd6_emit_flushes); -template -void -fd6_barrier_flush(struct fd_batch *batch) -{ - fd6_emit_flushes(batch->ctx, batch->draw, batch->barrier); - batch->barrier = 0; -} -FD_GENX(fd6_barrier_flush); - static void add_flushes(struct pipe_context *pctx, unsigned flushes) assert_dt diff --git a/src/gallium/drivers/freedreno/a6xx/fd6_barrier.h b/src/gallium/drivers/freedreno/a6xx/fd6_barrier.h index 732c82bb7ae..017bb4a1f22 100644 --- a/src/gallium/drivers/freedreno/a6xx/fd6_barrier.h +++ b/src/gallium/drivers/freedreno/a6xx/fd6_barrier.h @@ -6,7 +6,9 @@ #ifndef FD6_BARRIER_H_ #define FD6_BARRIER_H_ +#include "freedreno_batch.h" #include "freedreno_context.h" +#include "fd6_pack.h" /** * Various flush operations that could be needed @@ -24,11 +26,17 @@ enum fd6_flush { }; template -void fd6_emit_flushes(struct fd_context *ctx, struct fd_ringbuffer *ring, - unsigned flushes); +void fd6_emit_flushes(struct fd_context *ctx, fd_cs &cs, unsigned flushes); template -void fd6_barrier_flush(struct fd_batch *batch) assert_dt; +static inline void +fd6_barrier_flush(fd_cs &cs, struct fd_batch *batch) +{ + if (!batch->barrier) + return; + fd6_emit_flushes(batch->ctx, cs, batch->barrier); + batch->barrier = 0; +} void fd6_barrier_init(struct pipe_context *pctx); diff --git a/src/gallium/drivers/freedreno/a6xx/fd6_blend.cc b/src/gallium/drivers/freedreno/a6xx/fd6_blend.cc index 716c207b2bd..6e16ccebc2a 100644 --- a/src/gallium/drivers/freedreno/a6xx/fd6_blend.cc +++ b/src/gallium/drivers/freedreno/a6xx/fd6_blend.cc @@ -60,9 +60,8 @@ __fd6_setup_blend_variant(struct fd6_blend_stateobj *blend, if (!so) return NULL; - struct fd_ringbuffer *ring = fd_ringbuffer_new_object( - blend->ctx->pipe, ((A6XX_MAX_RENDER_TARGETS * 4) + 6) * 4); - so->stateobj = ring; + unsigned nregs = (2 * A6XX_MAX_RENDER_TARGETS) + 3; + fd_crb crb(blend->ctx->pipe, nregs); for (unsigned i = 0; i <= cso->max_rt; i++) { const struct pipe_rt_blend_state *rt; @@ -72,25 +71,21 @@ __fd6_setup_blend_variant(struct fd6_blend_stateobj *blend, else rt = &cso->rt[0]; - OUT_REG(ring, - A6XX_RB_MRT_BLEND_CONTROL( - i, .rgb_src_factor = fd_blend_factor(rt->rgb_src_factor), + crb.add(A6XX_RB_MRT_BLEND_CONTROL(i, + .rgb_src_factor = fd_blend_factor(rt->rgb_src_factor), .rgb_blend_opcode = blend_func(rt->rgb_func), .rgb_dest_factor = 
fd_blend_factor(rt->rgb_dst_factor), .alpha_src_factor = fd_blend_factor(rt->alpha_src_factor), .alpha_blend_opcode = blend_func(rt->alpha_func), - .alpha_dest_factor = fd_blend_factor(rt->alpha_dst_factor), )); - - OUT_REG(ring, - A6XX_RB_MRT_CONTROL( - i, - .blend = rt->blend_enable, - .blend2 = rt->blend_enable, - .rop_enable = cso->logicop_enable, - .rop_code = rop, - .component_enable = rt->colormask, - ) - ); + .alpha_dest_factor = fd_blend_factor(rt->alpha_dst_factor), + )) + .add(A6XX_RB_MRT_CONTROL(i, + .blend = rt->blend_enable, + .blend2 = rt->blend_enable, + .rop_enable = cso->logicop_enable, + .rop_code = rop, + .component_enable = rt->colormask, + )); if (rt->blend_enable) { mrt_blend |= (1 << i); @@ -104,8 +99,7 @@ __fd6_setup_blend_variant(struct fd6_blend_stateobj *blend, /* sRGB + dither on a7xx goes badly: */ bool dither = (CHIP < A7XX) ? cso->dither : false; - OUT_REG(ring, - A6XX_RB_DITHER_CNTL( + crb.add(A6XX_RB_DITHER_CNTL( .dither_mode_mrt0 = dither ? DITHER_ALWAYS : DITHER_DISABLE, .dither_mode_mrt1 = dither ? DITHER_ALWAYS : DITHER_DISABLE, .dither_mode_mrt2 = dither ? DITHER_ALWAYS : DITHER_DISABLE, @@ -114,29 +108,23 @@ __fd6_setup_blend_variant(struct fd6_blend_stateobj *blend, .dither_mode_mrt5 = dither ? DITHER_ALWAYS : DITHER_DISABLE, .dither_mode_mrt6 = dither ? DITHER_ALWAYS : DITHER_DISABLE, .dither_mode_mrt7 = dither ? DITHER_ALWAYS : DITHER_DISABLE, - ) - ); - - OUT_REG(ring, - A6XX_SP_BLEND_CNTL( - .enable_blend = mrt_blend, - .unk8 = true, - .dual_color_in_enable = blend->use_dual_src_blend, - .alpha_to_coverage = cso->alpha_to_coverage, - ), - ); - - OUT_REG(ring, - A6XX_RB_BLEND_CNTL( - .blend_reads_dest = mrt_blend, - .independent_blend = cso->independent_blend_enable, - .dual_color_in_enable = blend->use_dual_src_blend, - .alpha_to_coverage = cso->alpha_to_coverage, - .alpha_to_one = cso->alpha_to_one, - .sample_mask = sample_mask, - ), - ); + )) + .add(A6XX_SP_BLEND_CNTL( + .enable_blend = mrt_blend, + .unk8 = true, + .dual_color_in_enable = blend->use_dual_src_blend, + .alpha_to_coverage = cso->alpha_to_coverage, + )) + .add(A6XX_RB_BLEND_CNTL( + .blend_reads_dest = mrt_blend, + .independent_blend = cso->independent_blend_enable, + .dual_color_in_enable = blend->use_dual_src_blend, + .alpha_to_coverage = cso->alpha_to_coverage, + .alpha_to_one = cso->alpha_to_one, + .sample_mask = sample_mask, + )); + so->stateobj = crb.ring(); so->sample_mask = sample_mask; util_dynarray_append(&blend->variants, struct fd6_blend_variant *, so); diff --git a/src/gallium/drivers/freedreno/a6xx/fd6_blitter.cc b/src/gallium/drivers/freedreno/a6xx/fd6_blitter.cc index 2ca07928602..044484de63b 100644 --- a/src/gallium/drivers/freedreno/a6xx/fd6_blitter.cc +++ b/src/gallium/drivers/freedreno/a6xx/fd6_blitter.cc @@ -259,44 +259,46 @@ can_do_clear(const struct pipe_resource *prsc, unsigned level, template static void -emit_setup(struct fd_batch *batch) +emit_setup(struct fd_context *ctx, fd_cs &cs) { - struct fd_ringbuffer *ring = batch->draw; - struct fd_screen *screen = batch->ctx->screen; - - fd6_emit_flushes(batch->ctx, ring, + fd6_emit_flushes(ctx, cs, FD6_FLUSH_CCU_COLOR | FD6_INVALIDATE_CCU_COLOR | FD6_FLUSH_CCU_DEPTH | FD6_INVALIDATE_CCU_DEPTH); /* normal BLIT_OP_SCALE operation needs bypass RB_CCU_CNTL */ - fd6_emit_ccu_cntl(ring, screen, false); + fd6_emit_ccu_cntl(cs, ctx->screen, false); } template static void -emit_blit_fini(struct fd_context *ctx, struct fd_ringbuffer *ring) +emit_blit_fini(struct fd_context *ctx, fd_cs &cs) { - fd6_event_write(ctx, ring, 
FD_LABEL); - OUT_WFI5(ring); + const struct fd_dev_info *info = ctx->screen->info; - OUT_PKT4(ring, REG_A6XX_RB_DBG_ECO_CNTL, 1); - OUT_RING(ring, ctx->screen->info->a6xx.magic.RB_DBG_ECO_CNTL_blit); + fd6_event_write(ctx, cs, FD_LABEL); - OUT_PKT7(ring, CP_BLIT, 1); - OUT_RING(ring, CP_BLIT_0_OP(BLIT_OP_SCALE)); + if (info->a6xx.magic.RB_DBG_ECO_CNTL != info->a6xx.magic.RB_DBG_ECO_CNTL_blit) { + fd_pkt7(cs, CP_WAIT_FOR_IDLE, 0); + fd_pkt4(cs, 1) + .add(A6XX_RB_DBG_ECO_CNTL(.dword = info->a6xx.magic.RB_DBG_ECO_CNTL_blit)); + } - OUT_WFI5(ring); + fd_pkt7(cs, CP_BLIT, 1) + .add(CP_BLIT_0(.op = BLIT_OP_SCALE)); - OUT_PKT4(ring, REG_A6XX_RB_DBG_ECO_CNTL, 1); - OUT_RING(ring, 0); /* RB_DBG_ECO_CNTL */ + if (info->a6xx.magic.RB_DBG_ECO_CNTL != info->a6xx.magic.RB_DBG_ECO_CNTL_blit) { + fd_pkt7(cs, CP_WAIT_FOR_IDLE, 0); + fd_pkt4(cs, 1) + .add(A6XX_RB_DBG_ECO_CNTL(.dword = info->a6xx.magic.RB_DBG_ECO_CNTL)); + } } -FD_GENX(emit_blit_fini); +/* nregs: 5 */ template static void -emit_blit_setup(struct fd_ringbuffer *ring, enum pipe_format pfmt, +emit_blit_setup(fd_ncrb &ncrb, enum pipe_format pfmt, bool scissor_enable, union pipe_color_union *color, uint32_t unknown_8c01, enum a6xx_rotation rotate) { @@ -316,17 +318,14 @@ emit_blit_setup(struct fd_ringbuffer *ring, enum pipe_format pfmt, COND(color, A6XX_RB_A2D_BLT_CNTL_SOLID_COLOR) | COND(scissor_enable, A6XX_RB_A2D_BLT_CNTL_SCISSOR); - OUT_PKT4(ring, REG_A6XX_RB_A2D_BLT_CNTL, 1); - OUT_RING(ring, blit_cntl); - - OUT_PKT4(ring, REG_A6XX_GRAS_A2D_BLT_CNTL, 1); - OUT_RING(ring, blit_cntl); + ncrb.add(A6XX_RB_A2D_BLT_CNTL(.dword = blit_cntl)); + ncrb.add(A6XX_GRAS_A2D_BLT_CNTL(.dword = blit_cntl)); if (CHIP >= A7XX) { - OUT_REG(ring, A7XX_TPL1_A2D_BLT_CNTL( - .raw_copy = false, - .start_offset_texels = 0, - .type = A6XX_TEX_2D, + ncrb.add(A7XX_TPL1_A2D_BLT_CNTL( + .raw_copy = false, + .start_offset_texels = 0, + .type = A6XX_TEX_2D, )); } @@ -345,34 +344,32 @@ emit_blit_setup(struct fd_ringbuffer *ring, enum pipe_format pfmt, * controlling the internal/accumulator format or something like * that. It's certainly not tied to only the src format. 
*/ - OUT_REG(ring, SP_A2D_OUTPUT_INFO( - CHIP, - .ifmt_type = output_ifmt_type, - .color_format = fmt, - .srgb = is_srgb, - .mask = 0xf, + ncrb.add(SP_A2D_OUTPUT_INFO(CHIP, + .ifmt_type = output_ifmt_type, + .color_format = fmt, + .srgb = is_srgb, + .mask = 0xf, )); - OUT_PKT4(ring, REG_A6XX_RB_A2D_PIXEL_CNTL, 1); - OUT_RING(ring, unknown_8c01); + ncrb.add(A6XX_RB_A2D_PIXEL_CNTL(.dword = unknown_8c01)); } +/* nregs: 4 */ +template static void -emit_blit_buffer_dst(struct fd_ringbuffer *ring, struct fd_resource *dst, +emit_blit_buffer_dst(fd_ncrb &ncrb, struct fd_resource *dst, unsigned off, unsigned size, a6xx_format color_format) { - OUT_REG(ring, - A6XX_RB_A2D_DEST_BUFFER_INFO( - .color_format = color_format, - .tile_mode = TILE6_LINEAR, - .color_swap = WZYX, - ), - A6XX_RB_A2D_DEST_BUFFER_BASE( - .bo = dst->bo, - .bo_offset = off, - ), - A6XX_RB_A2D_DEST_BUFFER_PITCH(size), - ); + ncrb.add(A6XX_RB_A2D_DEST_BUFFER_INFO( + .color_format = color_format, + .tile_mode = TILE6_LINEAR, + .color_swap = WZYX, + )); + ncrb.add(A6XX_RB_A2D_DEST_BUFFER_BASE( + .bo = dst->bo, + .bo_offset = off, + )); + ncrb.add(A6XX_RB_A2D_DEST_BUFFER_PITCH(size)); } /* buffers need to be handled specially since x/width can exceed the bounds @@ -380,8 +377,7 @@ emit_blit_buffer_dst(struct fd_ringbuffer *ring, struct fd_resource *dst, */ template static void -emit_blit_buffer(struct fd_context *ctx, struct fd_ringbuffer *ring, - const struct pipe_blit_info *info) +emit_blit_buffer(struct fd_context *ctx, fd_cs &cs, const struct pipe_blit_info *info) { const struct pipe_box *sbox = &info->src.box; const struct pipe_box *dbox = &info->dst.box; @@ -428,7 +424,8 @@ emit_blit_buffer(struct fd_context *ctx, struct fd_ringbuffer *ring, sshift = sbox->x & 0x3f; dshift = dbox->x & 0x3f; - emit_blit_setup(ring, PIPE_FORMAT_R8_UNORM, false, NULL, 0, ROTATE_0); + with_ncrb (cs, 5) + emit_blit_setup(ncrb, PIPE_FORMAT_R8_UNORM, false, NULL, 0, ROTATE_0); for (unsigned off = 0; off < sbox->width; off += (0x4000 - 0x40)) { unsigned soff, doff, w, p; @@ -442,94 +439,80 @@ emit_blit_buffer(struct fd_context *ctx, struct fd_ringbuffer *ring, assert((soff + w) <= fd_bo_size(src->bo)); assert((doff + w) <= fd_bo_size(dst->bo)); - /* - * Emit source: - */ - OUT_REG(ring, - TPL1_A2D_SRC_TEXTURE_INFO( - CHIP, - .color_format = FMT6_8_UNORM, - .tile_mode = TILE6_LINEAR, - .color_swap = WZYX, - .unk20 = true, - .unk22 = true, - ), - TPL1_A2D_SRC_TEXTURE_SIZE( - CHIP, - .width = sshift + w, - .height = 1, - ), - TPL1_A2D_SRC_TEXTURE_BASE( - CHIP, - .bo = src->bo, - .bo_offset = soff, - ), - TPL1_A2D_SRC_TEXTURE_PITCH( - CHIP, - .pitch = p, - ), - ); + with_ncrb (cs, 15) { + /* + * Emit source: + */ + ncrb.add(TPL1_A2D_SRC_TEXTURE_INFO(CHIP, + .color_format = FMT6_8_UNORM, + .tile_mode = TILE6_LINEAR, + .color_swap = WZYX, + .unk20 = true, + .unk22 = true, + )); + ncrb.add(TPL1_A2D_SRC_TEXTURE_SIZE(CHIP, + .width = sshift + w, + .height = 1, + )); + ncrb.add(TPL1_A2D_SRC_TEXTURE_BASE(CHIP, + .bo = src->bo, + .bo_offset = soff, + )); + ncrb.add(TPL1_A2D_SRC_TEXTURE_PITCH(CHIP, .pitch = p)); - /* - * Emit destination: - */ - emit_blit_buffer_dst(ring, dst, doff, p, FMT6_8_UNORM); + /* + * Emit destination: + */ + emit_blit_buffer_dst(ncrb, dst, doff, p, FMT6_8_UNORM); + + ncrb.add(A6XX_GRAS_A2D_SRC_XMIN(sshift)); + ncrb.add(A6XX_GRAS_A2D_SRC_XMAX(sshift + w - 1)); + ncrb.add(A6XX_GRAS_A2D_SRC_YMIN(0)); + ncrb.add(A6XX_GRAS_A2D_SRC_YMAX(0)); + + ncrb.add(A6XX_GRAS_A2D_DEST_TL(.x = dshift)); + ncrb.add(A6XX_GRAS_A2D_DEST_BR(.x = dshift + w - 1)); 
+ } /* * Blit command: */ - OUT_REG(ring, - A6XX_GRAS_A2D_SRC_XMIN(sshift), - A6XX_GRAS_A2D_SRC_XMAX(sshift + w - 1), - A6XX_GRAS_A2D_SRC_YMIN(0), - A6XX_GRAS_A2D_SRC_YMAX(0), - ); - - OUT_PKT4(ring, REG_A6XX_GRAS_A2D_DEST_TL, 2); - OUT_RING(ring, A6XX_GRAS_A2D_DEST_TL_X(dshift) | A6XX_GRAS_A2D_DEST_TL_Y(0)); - OUT_RING(ring, A6XX_GRAS_A2D_DEST_BR_X(dshift + w - 1) | - A6XX_GRAS_A2D_DEST_BR_Y(0)); - - emit_blit_fini(ctx, ring); + emit_blit_fini(ctx, cs); } } template static void -clear_ubwc_setup(struct fd_ringbuffer *ring) +clear_ubwc_setup(fd_cs &cs) { union pipe_color_union color = {}; + fd_ncrb ncrb(cs, 18); - emit_blit_setup(ring, PIPE_FORMAT_R8_UNORM, false, &color, 0, ROTATE_0); + emit_blit_setup(ncrb, PIPE_FORMAT_R8_UNORM, false, &color, 0, ROTATE_0); - OUT_REG(ring, - TPL1_A2D_SRC_TEXTURE_INFO(CHIP), - TPL1_A2D_SRC_TEXTURE_SIZE(CHIP), - TPL1_A2D_SRC_TEXTURE_BASE(CHIP), - TPL1_A2D_SRC_TEXTURE_PITCH(CHIP), - ); + ncrb.add(TPL1_A2D_SRC_TEXTURE_INFO(CHIP)); + ncrb.add(TPL1_A2D_SRC_TEXTURE_SIZE(CHIP)); + ncrb.add(TPL1_A2D_SRC_TEXTURE_BASE(CHIP)); + ncrb.add(TPL1_A2D_SRC_TEXTURE_PITCH(CHIP)); - OUT_PKT4(ring, REG_A6XX_RB_A2D_CLEAR_COLOR_DW0, 4); - OUT_RING(ring, 0x00000000); - OUT_RING(ring, 0x00000000); - OUT_RING(ring, 0x00000000); - OUT_RING(ring, 0x00000000); + ncrb.add(A6XX_RB_A2D_CLEAR_COLOR_DW0()); + ncrb.add(A6XX_RB_A2D_CLEAR_COLOR_DW1()); + ncrb.add(A6XX_RB_A2D_CLEAR_COLOR_DW2()); + ncrb.add(A6XX_RB_A2D_CLEAR_COLOR_DW3()); - OUT_REG(ring, - A6XX_GRAS_A2D_SRC_XMIN(0), - A6XX_GRAS_A2D_SRC_XMAX(0), - A6XX_GRAS_A2D_SRC_YMIN(0), - A6XX_GRAS_A2D_SRC_YMAX(0), - ); + ncrb.add(A6XX_GRAS_A2D_SRC_XMIN(0)); + ncrb.add(A6XX_GRAS_A2D_SRC_XMAX(0)); + ncrb.add(A6XX_GRAS_A2D_SRC_YMIN(0)); + ncrb.add(A6XX_GRAS_A2D_SRC_YMAX(0)); } template static void fd6_clear_ubwc(struct fd_batch *batch, struct fd_resource *rsc) assert_dt { - struct fd_ringbuffer *ring = fd_batch_get_prologue(batch); + fd_cs cs(fd_batch_get_prologue(batch)); - clear_ubwc_setup(ring); + clear_ubwc_setup(cs); unsigned size = rsc->layout.slices[0].offset; unsigned offset = 0; @@ -550,34 +533,35 @@ fd6_clear_ubwc(struct fd_batch *batch, struct fd_resource *rsc) assert_dt /* width is already aligned to a suitable pitch: */ const unsigned p = w; - /* - * Emit destination: - */ - emit_blit_buffer_dst(ring, rsc, offset, p, FMT6_8_UNORM); + with_ncrb (cs, 6) { + /* + * Emit destination: + */ + emit_blit_buffer_dst(ncrb, rsc, offset, p, FMT6_8_UNORM); + + ncrb.add(A6XX_GRAS_A2D_DEST_TL(.x = 0, .y = 0)); + ncrb.add(A6XX_GRAS_A2D_DEST_BR(.x = w - 1, .y = h - 1)); + } /* * Blit command: */ - - OUT_PKT4(ring, REG_A6XX_GRAS_A2D_DEST_TL, 2); - OUT_RING(ring, A6XX_GRAS_A2D_DEST_TL_X(0) | A6XX_GRAS_A2D_DEST_TL_Y(0)); - OUT_RING(ring, - A6XX_GRAS_A2D_DEST_BR_X(w - 1) | A6XX_GRAS_A2D_DEST_BR_Y(h - 1)); - - emit_blit_fini(batch->ctx, ring); + emit_blit_fini(batch->ctx, cs); offset += w * h; size -= w * h; } - fd6_emit_flushes(batch->ctx, ring, + fd6_emit_flushes(batch->ctx, cs, FD6_FLUSH_CCU_COLOR | FD6_FLUSH_CCU_DEPTH | FD6_FLUSH_CACHE | FD6_WAIT_FOR_IDLE); } +/* nregs: 10 */ +template static void -emit_blit_dst(struct fd_ringbuffer *ring, struct pipe_resource *prsc, +emit_blit_dst(fd_ncrb &ncrb, struct pipe_resource *prsc, enum pipe_format pfmt, unsigned level, unsigned layer) { struct fd_resource *dst = fd_resource(prsc); @@ -595,33 +579,36 @@ emit_blit_dst(struct fd_ringbuffer *ring, struct pipe_resource *prsc, if (fmt == FMT6_Z24_UNORM_S8_UINT) fmt = FMT6_Z24_UNORM_S8_UINT_AS_R8G8B8A8; - OUT_REG(ring, - A6XX_RB_A2D_DEST_BUFFER_INFO( - 
.color_format = fmt, - .tile_mode = tile, - .color_swap = swap, - .flags = ubwc_enabled, - .srgb = util_format_is_srgb(pfmt), - ), - A6XX_RB_A2D_DEST_BUFFER_BASE( - .bo = dst->bo, - .bo_offset = off, - ), - A6XX_RB_A2D_DEST_BUFFER_PITCH(pitch), - ); + ncrb.add(A6XX_RB_A2D_DEST_BUFFER_INFO( + .color_format = fmt, + .tile_mode = tile, + .color_swap = swap, + .flags = ubwc_enabled, + .srgb = util_format_is_srgb(pfmt), + )); + ncrb.add(A6XX_RB_A2D_DEST_BUFFER_BASE( + .bo = dst->bo, + .bo_offset = off, + )); + ncrb.add(A6XX_RB_A2D_DEST_BUFFER_PITCH(pitch)); if (ubwc_enabled) { - OUT_PKT4(ring, REG_A6XX_RB_A2D_DEST_FLAG_BUFFER_BASE, 6); - fd6_emit_flag_reference(ring, dst, level, layer); - OUT_RING(ring, 0x00000000); - OUT_RING(ring, 0x00000000); - OUT_RING(ring, 0x00000000); + ncrb.add(A6XX_RB_A2D_DEST_FLAG_BUFFER_BASE( + dst->bo, fd_resource_ubwc_offset(dst, level, layer) + )); + ncrb.add(A6XX_RB_A2D_DEST_FLAG_BUFFER_PITCH( + .pitch = fdl_ubwc_pitch(&dst->layout, level), + .array_pitch = dst->layout.ubwc_layer_size >> 2, + )); + ncrb.add(A6XX_RB_A2D_DEST_FLAG_BUFFER_BASE_1()); + ncrb.add(A6XX_RB_A2D_DEST_FLAG_BUFFER_PITCH_1()); } } +/* nregs: 8 */ template static void -emit_blit_src(struct fd_ringbuffer *ring, const struct pipe_blit_info *info, +emit_blit_src(fd_ncrb &ncrb, const struct pipe_blit_info *info, unsigned layer, unsigned nr_samples) { struct fd_resource *src = fd_resource(info->src.resource); @@ -641,52 +628,36 @@ emit_blit_src(struct fd_ringbuffer *ring, const struct pipe_blit_info *info, if (info->src.format == PIPE_FORMAT_A8_UNORM) sfmt = FMT6_A8_UNORM; - OUT_REG(ring, - TPL1_A2D_SRC_TEXTURE_INFO( - CHIP, - .color_format = sfmt, - .tile_mode = stile, - .color_swap = sswap, - .flags = subwc_enabled, - .srgb = util_format_is_srgb(info->src.format), - .samples = samples, - .filter = (info->filter == PIPE_TEX_FILTER_LINEAR), - .samples_average = (samples > MSAA_ONE) && !info->sample0_only, - .unk20 = true, - .unk22 = true, - ), - TPL1_A2D_SRC_TEXTURE_SIZE( - CHIP, - .width = width, - .height = height, - ), - TPL1_A2D_SRC_TEXTURE_BASE( - CHIP, - .bo = src->bo, - .bo_offset = soff, - ), - TPL1_A2D_SRC_TEXTURE_PITCH( - CHIP, - .pitch = pitch, - ), - ); + ncrb.add(TPL1_A2D_SRC_TEXTURE_INFO(CHIP, + .color_format = sfmt, + .tile_mode = stile, + .color_swap = sswap, + .flags = subwc_enabled, + .srgb = util_format_is_srgb(info->src.format), + .samples = samples, + .filter = (info->filter == PIPE_TEX_FILTER_LINEAR), + .samples_average = (samples > MSAA_ONE) && !info->sample0_only, + .unk20 = true, + .unk22 = true, + )); + ncrb.add(TPL1_A2D_SRC_TEXTURE_SIZE(CHIP, .width = width, .height = height)); + ncrb.add(TPL1_A2D_SRC_TEXTURE_BASE(CHIP, .bo = src->bo, .bo_offset = soff)); + ncrb.add(TPL1_A2D_SRC_TEXTURE_PITCH(CHIP, .pitch = pitch)); if (subwc_enabled && fd_resource_ubwc_enabled(src, info->src.level)) { - OUT_REG(ring, - TPL1_A2D_SRC_TEXTURE_FLAG_BASE( - CHIP, - .bo = src->bo, - .bo_offset = fd_resource_ubwc_offset(src, info->src.level, layer), - ), - TPL1_A2D_SRC_TEXTURE_FLAG_PITCH( - CHIP, fdl_ubwc_pitch(&src->layout, info->src.level)), - ); + ncrb.add(TPL1_A2D_SRC_TEXTURE_FLAG_BASE(CHIP, + .bo = src->bo, + .bo_offset = fd_resource_ubwc_offset(src, info->src.level, layer), + )); + ncrb.add(TPL1_A2D_SRC_TEXTURE_FLAG_PITCH(CHIP, + fdl_ubwc_pitch(&src->layout, info->src.level), + )); } } template static void -emit_blit_texture_setup(struct fd_ringbuffer *ring, const struct pipe_blit_info *info) +emit_blit_texture_setup(fd_cs &cs, const struct pipe_blit_info *info) { const struct pipe_box 
*sbox = &info->src.box; const struct pipe_box *dbox = &info->dst.box; @@ -717,35 +688,33 @@ emit_blit_texture_setup(struct fd_ringbuffer *ring, const struct pipe_blit_info enum a6xx_rotation rotate = rotates[mirror_y][mirror_x]; - OUT_REG(ring, - A6XX_GRAS_A2D_SRC_XMIN(MIN2(sx1, sx2)), - A6XX_GRAS_A2D_SRC_XMAX(MAX2(sx1, sx2) - 1), - A6XX_GRAS_A2D_SRC_YMIN(MIN2(sy1, sy2)), - A6XX_GRAS_A2D_SRC_YMAX(MAX2(sy1, sy2) - 1), - ); + fd_ncrb ncrb(cs, 13); - OUT_REG(ring, - A6XX_GRAS_A2D_DEST_TL(.x = MIN2(dx1, dx2), - .y = MIN2(dy1, dy2)), - A6XX_GRAS_A2D_DEST_BR(.x = MAX2(dx1, dx2) - 1, - .y = MAX2(dy1, dy2) - 1), - ); + ncrb.add(A6XX_GRAS_A2D_SRC_XMIN(MIN2(sx1, sx2))); + ncrb.add(A6XX_GRAS_A2D_SRC_XMAX(MAX2(sx1, sx2) - 1)); + ncrb.add(A6XX_GRAS_A2D_SRC_YMIN(MIN2(sy1, sy2))); + ncrb.add(A6XX_GRAS_A2D_SRC_YMAX(MAX2(sy1, sy2) - 1)); + + ncrb.add(A6XX_GRAS_A2D_DEST_TL(.x = MIN2(dx1, dx2), .y = MIN2(dy1, dy2))); + ncrb.add(A6XX_GRAS_A2D_DEST_BR(.x = MAX2(dx1, dx2) - 1, .y = MAX2(dy1, dy2) - 1)); if (info->scissor_enable) { - OUT_PKT4(ring, REG_A6XX_GRAS_A2D_SCISSOR_TL, 2); - OUT_RING(ring, A6XX_GRAS_A2D_SCISSOR_TL_X(info->scissor.minx) | - A6XX_GRAS_A2D_SCISSOR_TL_Y(info->scissor.miny)); - OUT_RING(ring, A6XX_GRAS_A2D_SCISSOR_TL_X(info->scissor.maxx - 1) | - A6XX_GRAS_A2D_SCISSOR_TL_Y(info->scissor.maxy - 1)); + ncrb.add(A6XX_GRAS_A2D_SCISSOR_TL( + .x = info->scissor.minx, + .y = info->scissor.miny, + )); + ncrb.add(A6XX_GRAS_A2D_SCISSOR_BR( + .x = info->scissor.maxx - 1, + .y = info->scissor.maxy - 1, + )); } - emit_blit_setup(ring, info->dst.format, info->scissor_enable, NULL, 0, rotate); + emit_blit_setup(ncrb, info->dst.format, info->scissor_enable, NULL, 0, rotate); } template static void -emit_blit_texture(struct fd_context *ctx, struct fd_ringbuffer *ring, - const struct pipe_blit_info *info) +emit_blit_texture(struct fd_context *ctx, fd_cs &cs, const struct pipe_blit_info *info) { const struct pipe_box *sbox = &info->src.box; const struct pipe_box *dbox = &info->dst.box; @@ -756,24 +725,27 @@ emit_blit_texture(struct fd_context *ctx, struct fd_ringbuffer *ring, dump_blit_info(info); } - emit_blit_texture_setup(ring, info); + emit_blit_texture_setup(cs, info); dst = fd_resource(info->dst.resource); uint32_t nr_samples = fd_resource_nr_samples(&dst->b.b); for (unsigned i = 0; i < info->dst.box.depth; i++) { + with_ncrb (cs, 18) { + emit_blit_src(ncrb, info, sbox->z + i, nr_samples); + emit_blit_dst(ncrb, info->dst.resource, info->dst.format, info->dst.level, + dbox->z + i); + } - emit_blit_src(ring, info, sbox->z + i, nr_samples); - emit_blit_dst(ring, info->dst.resource, info->dst.format, info->dst.level, - dbox->z + i); - - emit_blit_fini(ctx, ring); + emit_blit_fini(ctx, cs); } } +/* nregs: 4 */ +template static void -emit_clear_color(struct fd_ringbuffer *ring, enum pipe_format pfmt, +emit_clear_color(fd_ncrb &ncrb, enum pipe_format pfmt, union pipe_color_union *color) { switch (pfmt) { @@ -792,68 +764,65 @@ emit_clear_color(struct fd_ringbuffer *ring, enum pipe_format pfmt, break; } - OUT_PKT4(ring, REG_A6XX_RB_A2D_CLEAR_COLOR_DW0, 4); switch (fd6_ifmt(fd6_color_format(pfmt, TILE6_LINEAR))) { case R2D_UNORM8: case R2D_UNORM8_SRGB: /* The r2d ifmt is badly named, it also covers the signed case: */ if (util_format_is_snorm(pfmt)) { - OUT_RING(ring, float_to_byte_tex(color->f[0])); - OUT_RING(ring, float_to_byte_tex(color->f[1])); - OUT_RING(ring, float_to_byte_tex(color->f[2])); - OUT_RING(ring, float_to_byte_tex(color->f[3])); + 
ncrb.add(A6XX_RB_A2D_CLEAR_COLOR_DW0(float_to_byte_tex(color->f[0]))); + ncrb.add(A6XX_RB_A2D_CLEAR_COLOR_DW1(float_to_byte_tex(color->f[1]))); + ncrb.add(A6XX_RB_A2D_CLEAR_COLOR_DW2(float_to_byte_tex(color->f[2]))); + ncrb.add(A6XX_RB_A2D_CLEAR_COLOR_DW3(float_to_byte_tex(color->f[3]))); } else { - OUT_RING(ring, float_to_ubyte(color->f[0])); - OUT_RING(ring, float_to_ubyte(color->f[1])); - OUT_RING(ring, float_to_ubyte(color->f[2])); - OUT_RING(ring, float_to_ubyte(color->f[3])); + ncrb.add(A6XX_RB_A2D_CLEAR_COLOR_DW0(float_to_ubyte(color->f[0]))); + ncrb.add(A6XX_RB_A2D_CLEAR_COLOR_DW1(float_to_ubyte(color->f[1]))); + ncrb.add(A6XX_RB_A2D_CLEAR_COLOR_DW2(float_to_ubyte(color->f[2]))); + ncrb.add(A6XX_RB_A2D_CLEAR_COLOR_DW3(float_to_ubyte(color->f[3]))); } break; case R2D_FLOAT16: - OUT_RING(ring, _mesa_float_to_half(color->f[0])); - OUT_RING(ring, _mesa_float_to_half(color->f[1])); - OUT_RING(ring, _mesa_float_to_half(color->f[2])); - OUT_RING(ring, _mesa_float_to_half(color->f[3])); + ncrb.add(A6XX_RB_A2D_CLEAR_COLOR_DW0(_mesa_float_to_half(color->f[0]))); + ncrb.add(A6XX_RB_A2D_CLEAR_COLOR_DW1(_mesa_float_to_half(color->f[1]))); + ncrb.add(A6XX_RB_A2D_CLEAR_COLOR_DW2(_mesa_float_to_half(color->f[2]))); + ncrb.add(A6XX_RB_A2D_CLEAR_COLOR_DW3(_mesa_float_to_half(color->f[3]))); break; case R2D_FLOAT32: case R2D_INT32: case R2D_INT16: case R2D_INT8: default: - OUT_RING(ring, color->ui[0]); - OUT_RING(ring, color->ui[1]); - OUT_RING(ring, color->ui[2]); - OUT_RING(ring, color->ui[3]); + ncrb.add(A6XX_RB_A2D_CLEAR_COLOR_DW0(color->ui[0])); + ncrb.add(A6XX_RB_A2D_CLEAR_COLOR_DW1(color->ui[1])); + ncrb.add(A6XX_RB_A2D_CLEAR_COLOR_DW2(color->ui[2])); + ncrb.add(A6XX_RB_A2D_CLEAR_COLOR_DW3(color->ui[3])); break; } } template static void -clear_lrz_setup(struct fd_ringbuffer *ring, struct fd_resource *zsbuf, - struct fd_bo *lrz, double depth) +clear_lrz_setup(fd_cs &cs, struct fd_resource *zsbuf, struct fd_bo *lrz, double depth) { - OUT_PKT4(ring, REG_A6XX_GRAS_A2D_DEST_TL, 2); - OUT_RING(ring, A6XX_GRAS_A2D_DEST_TL_X(0) | A6XX_GRAS_A2D_DEST_TL_Y(0)); - OUT_RING(ring, A6XX_GRAS_A2D_DEST_BR_X(zsbuf->lrz_layout.lrz_pitch - 1) | - A6XX_GRAS_A2D_DEST_BR_Y(zsbuf->lrz_layout.lrz_height - 1)); + fd_ncrb ncrb(cs, 15); + + ncrb.add(A6XX_GRAS_A2D_DEST_TL(.x = 0, .y = 0)); + ncrb.add(A6XX_GRAS_A2D_DEST_BR( + .x = zsbuf->lrz_layout.lrz_pitch - 1, + .y = zsbuf->lrz_layout.lrz_height - 1, + )); union pipe_color_union clear_color = { .f = {depth} }; - emit_clear_color(ring, PIPE_FORMAT_Z16_UNORM, &clear_color); - emit_blit_setup(ring, PIPE_FORMAT_Z16_UNORM, false, &clear_color, 0, ROTATE_0); + emit_clear_color(ncrb, PIPE_FORMAT_Z16_UNORM, &clear_color); + emit_blit_setup(ncrb, PIPE_FORMAT_Z16_UNORM, false, &clear_color, 0, ROTATE_0); - OUT_REG(ring, - A6XX_RB_A2D_DEST_BUFFER_INFO( - .color_format = FMT6_16_UNORM, - .tile_mode = TILE6_LINEAR, - .color_swap = WZYX, - ), - A6XX_RB_A2D_DEST_BUFFER_BASE( - .bo = lrz, - ), - A6XX_RB_A2D_DEST_BUFFER_PITCH(zsbuf->lrz_layout.lrz_pitch * 2), - ); + ncrb.add(A6XX_RB_A2D_DEST_BUFFER_INFO( + .color_format = FMT6_16_UNORM, + .tile_mode = TILE6_LINEAR, + .color_swap = WZYX, + )); + ncrb.add(A6XX_RB_A2D_DEST_BUFFER_BASE(.bo = lrz)); + ncrb.add(A6XX_RB_A2D_DEST_BUFFER_PITCH(zsbuf->lrz_layout.lrz_pitch * 2)); } template @@ -861,7 +830,7 @@ void fd6_clear_lrz(struct fd_batch *batch, struct fd_resource *zsbuf, struct fd_bo *lrz, double depth) { - struct fd_ringbuffer *ring = fd_batch_get_prologue(batch); + fd_cs cs(fd_batch_get_prologue(batch)); if (DEBUG_BLIT) { fprintf(stderr, 
"lrz clear:\ndst resource: "); @@ -869,14 +838,14 @@ fd6_clear_lrz(struct fd_batch *batch, struct fd_resource *zsbuf, fprintf(stderr, "\n"); } - clear_lrz_setup(ring, zsbuf, lrz, depth); + clear_lrz_setup(cs, zsbuf, lrz, depth); /* * Blit command: */ - OUT_PKT7(ring, CP_BLIT, 1); - OUT_RING(ring, CP_BLIT_0_OP(BLIT_OP_SCALE)); + fd_pkt7(cs, CP_BLIT, 1) + .add(CP_BLIT_0(.op = BLIT_OP_SCALE)); } FD_GENX(fd6_clear_lrz); @@ -979,7 +948,7 @@ fd6_clear_buffer(struct pipe_context *pctx, struct fd_context *ctx = fd_context(pctx); struct fd_resource *rsc = fd_resource(prsc); struct fd_batch *batch = fd_bc_alloc_batch(ctx, true); - struct fd_ringbuffer *ring = batch->draw; + fd_cs cs(batch->draw); fd_screen_lock(ctx->screen); fd_batch_resource_write(batch, rsc); @@ -995,10 +964,12 @@ fd6_clear_buffer(struct pipe_context *pctx, fd_batch_update_queries(batch); - emit_setup(batch); + emit_setup(batch->ctx, cs); - emit_clear_color(ring, dst_fmt, &color); - emit_blit_setup(ring, dst_fmt, false, &color, 0, ROTATE_0); + with_ncrb (cs, 9) { + emit_clear_color(ncrb, dst_fmt, &color); + emit_blit_setup(ncrb, dst_fmt, false, &color, 0, ROTATE_0); + } /* * Buffers can have dimensions bigger than max width (0x4000), so @@ -1020,24 +991,24 @@ fd6_clear_buffer(struct pipe_context *pctx, uint32_t doff = offset & ~0x3f; uint32_t width = MIN2(blocks, 0x4000 - dst_x); - emit_blit_buffer_dst(ring, rsc, doff, 0, fmt); + with_ncrb (cs, 6) { + emit_blit_buffer_dst(ncrb, rsc, doff, 0, fmt); - OUT_PKT4(ring, REG_A6XX_GRAS_A2D_DEST_TL, 2); - OUT_RING(ring, A6XX_GRAS_A2D_DEST_TL_X(dst_x) | A6XX_GRAS_A2D_DEST_TL_Y(0)); - OUT_RING(ring, A6XX_GRAS_A2D_DEST_BR_X(dst_x + width - 1) | - A6XX_GRAS_A2D_DEST_BR_Y(0)); + ncrb.add(A6XX_GRAS_A2D_DEST_TL(.x = dst_x)); + ncrb.add(A6XX_GRAS_A2D_DEST_BR(.x = dst_x + width - 1)); + } - emit_blit_fini(ctx, ring); + emit_blit_fini(ctx, cs); offset += width * clear_value_size; blocks -= width; } - fd6_emit_flushes(batch->ctx, ring, - FD6_FLUSH_CCU_COLOR | - FD6_FLUSH_CCU_DEPTH | - FD6_FLUSH_CACHE | - FD6_WAIT_FOR_IDLE); + fd6_emit_flushes(batch->ctx, cs, + FD6_FLUSH_CCU_COLOR | + FD6_FLUSH_CCU_DEPTH | + FD6_FLUSH_CACHE | + FD6_WAIT_FOR_IDLE); fd_batch_flush(batch); fd_batch_reference(&batch, NULL); @@ -1050,26 +1021,31 @@ fd6_clear_buffer(struct pipe_context *pctx, template static void -clear_surface_setup(struct fd_ringbuffer *ring, struct pipe_surface *psurf, +clear_surface_setup(fd_cs &cs, struct pipe_surface *psurf, const struct pipe_box *box2d, union pipe_color_union *color, uint32_t unknown_8c01) { uint32_t nr_samples = fd_resource_nr_samples(psurf->texture); - OUT_PKT4(ring, REG_A6XX_GRAS_A2D_DEST_TL, 2); - OUT_RING(ring, A6XX_GRAS_A2D_DEST_TL_X(box2d->x * nr_samples) | - A6XX_GRAS_A2D_DEST_TL_Y(box2d->y)); - OUT_RING(ring, A6XX_GRAS_A2D_DEST_BR_X((box2d->x + box2d->width) * nr_samples - 1) | - A6XX_GRAS_A2D_DEST_BR_Y(box2d->y + box2d->height - 1)); + fd_ncrb ncrb(cs, 11); + + ncrb.add(A6XX_GRAS_A2D_DEST_TL( + .x = box2d->x * nr_samples, + .y = box2d->y, + )); + ncrb.add(A6XX_GRAS_A2D_DEST_BR( + .x = (box2d->x + box2d->width) * nr_samples - 1, + .y = box2d->y + box2d->height - 1, + )); union pipe_color_union clear_color = convert_color(psurf->format, color); - emit_clear_color(ring, psurf->format, &clear_color); - emit_blit_setup(ring, psurf->format, false, &clear_color, unknown_8c01, ROTATE_0); + emit_clear_color(ncrb, psurf->format, &clear_color); + emit_blit_setup(ncrb, psurf->format, false, &clear_color, unknown_8c01, ROTATE_0); } template void -fd6_clear_surface(struct fd_context 
*ctx, struct fd_ringbuffer *ring, +fd6_clear_surface(struct fd_context *ctx, fd_cs &cs, struct pipe_surface *psurf, const struct pipe_box *box2d, union pipe_color_union *color, uint32_t unknown_8c01) { @@ -1079,13 +1055,13 @@ fd6_clear_surface(struct fd_context *ctx, struct fd_ringbuffer *ring, fprintf(stderr, "\n"); } - clear_surface_setup(ring, psurf, box2d, color, unknown_8c01); + clear_surface_setup(cs, psurf, box2d, color, unknown_8c01); - for (unsigned i = psurf->first_layer; i <= psurf->last_layer; - i++) { - emit_blit_dst(ring, psurf->texture, psurf->format, psurf->level, i); + for (unsigned i = psurf->first_layer; i <= psurf->last_layer; i++) { + with_ncrb (cs, 10) + emit_blit_dst(ncrb, psurf->texture, psurf->format, psurf->level, i); - emit_blit_fini(ctx, ring); + emit_blit_fini(ctx, cs); } } FD_GENX(fd6_clear_surface); @@ -1149,7 +1125,9 @@ fd6_clear_texture(struct pipe_context *pctx, struct pipe_resource *prsc, fd_batch_update_queries(batch); - emit_setup(batch); + fd_cs cs(batch->draw); + + emit_setup(batch->ctx, cs); struct pipe_surface surf = { .format = prsc->format, @@ -1159,9 +1137,9 @@ fd6_clear_texture(struct pipe_context *pctx, struct pipe_resource *prsc, .texture = prsc, }; - fd6_clear_surface(ctx, batch->draw, &surf, box, &color, 0); + fd6_clear_surface(ctx, cs, &surf, box, &color, 0); - fd6_emit_flushes(batch->ctx, batch->draw, + fd6_emit_flushes(batch->ctx, cs, FD6_FLUSH_CCU_COLOR | FD6_FLUSH_CCU_DEPTH | FD6_FLUSH_CACHE | @@ -1178,8 +1156,8 @@ fd6_clear_texture(struct pipe_context *pctx, struct pipe_resource *prsc, template static void -resolve_tile_setup(struct fd_batch *batch, struct fd_ringbuffer *ring, - uint32_t base, struct pipe_surface *psurf, uint32_t unknown_8c01) +resolve_tile_setup(struct fd_batch *batch, fd_cs &cs, uint32_t base, + struct pipe_surface *psurf, uint32_t unknown_8c01) { const struct fd_gmem_stateobj *gmem = batch->gmem_state; uint64_t gmem_base = batch->ctx->screen->gmem_base + base; @@ -1187,85 +1165,71 @@ resolve_tile_setup(struct fd_batch *batch, struct fd_ringbuffer *ring, util_format_get_blocksize(psurf->format); unsigned width = pipe_surface_width(psurf); unsigned height = pipe_surface_height(psurf); + fd_ncrb ncrb(cs, 26); - OUT_PKT4(ring, REG_A6XX_GRAS_A2D_DEST_TL, 2); - OUT_RING(ring, A6XX_GRAS_A2D_DEST_TL_X(0) | A6XX_GRAS_A2D_DEST_TL_Y(0)); - OUT_RING(ring, A6XX_GRAS_A2D_DEST_BR_X(width - 1) | - A6XX_GRAS_A2D_DEST_BR_Y(height - 1)); + ncrb.add(A6XX_GRAS_A2D_DEST_TL(.x = 0, .y = 0)); + ncrb.add(A6XX_GRAS_A2D_DEST_BR(.x = width - 1, .y = height - 1)); - OUT_REG(ring, - A6XX_GRAS_A2D_SRC_XMIN(0), - A6XX_GRAS_A2D_SRC_XMAX(width - 1), - A6XX_GRAS_A2D_SRC_YMIN(0), - A6XX_GRAS_A2D_SRC_YMAX(height - 1), - ); + ncrb.add(A6XX_GRAS_A2D_SRC_XMIN(0)); + ncrb.add(A6XX_GRAS_A2D_SRC_XMAX(width - 1)); + ncrb.add(A6XX_GRAS_A2D_SRC_YMIN(0)); + ncrb.add(A6XX_GRAS_A2D_SRC_YMAX(height - 1)); /* Enable scissor bit, which will take into account the window scissor * which is set per-tile */ - emit_blit_setup(ring, psurf->format, true, NULL, unknown_8c01, ROTATE_0); + emit_blit_setup(ncrb, psurf->format, true, NULL, unknown_8c01, ROTATE_0); /* We shouldn't be using GMEM in the layered rendering case: */ assert(psurf->first_layer == psurf->last_layer); - emit_blit_dst(ring, psurf->texture, psurf->format, psurf->level, + emit_blit_dst(ncrb, psurf->texture, psurf->format, psurf->level, psurf->first_layer); enum a6xx_format sfmt = fd6_color_format(psurf->format, TILE6_LINEAR); enum a3xx_msaa_samples samples = fd_msaa_samples(batch->framebuffer.samples); - 
OUT_REG(ring, - TPL1_A2D_SRC_TEXTURE_INFO( - CHIP, - .color_format = sfmt, - .tile_mode = TILE6_2, - .color_swap = WZYX, - .srgb = util_format_is_srgb(psurf->format), - .samples = samples, - .samples_average = samples > MSAA_ONE, - .unk20 = true, - .unk22 = true, - ), - TPL1_A2D_SRC_TEXTURE_SIZE( - CHIP, - .width = width, - .height = height, - ), - TPL1_A2D_SRC_TEXTURE_BASE( - CHIP, - .qword = gmem_base, - ), - TPL1_A2D_SRC_TEXTURE_PITCH( - CHIP, - .pitch = gmem_pitch, - ), - ); + ncrb.add(TPL1_A2D_SRC_TEXTURE_INFO(CHIP, + .color_format = sfmt, + .tile_mode = TILE6_2, + .color_swap = WZYX, + .srgb = util_format_is_srgb(psurf->format), + .samples = samples, + .samples_average = samples > MSAA_ONE, + .unk20 = true, + .unk22 = true, + )); + ncrb.add(TPL1_A2D_SRC_TEXTURE_SIZE(CHIP, + .width = width, + .height = height, + )); + ncrb.add(TPL1_A2D_SRC_TEXTURE_BASE(CHIP, .qword = gmem_base)); + ncrb.add(TPL1_A2D_SRC_TEXTURE_PITCH(CHIP, .pitch = gmem_pitch)); } template void -fd6_resolve_tile(struct fd_batch *batch, struct fd_ringbuffer *ring, - uint32_t base, struct pipe_surface *psurf, uint32_t unknown_8c01) +fd6_resolve_tile(struct fd_batch *batch, fd_cs &cs, uint32_t base, + struct pipe_surface *psurf, uint32_t unknown_8c01) { - resolve_tile_setup(batch, ring, base, psurf, unknown_8c01); + resolve_tile_setup(batch, cs, base, psurf, unknown_8c01); /* sync GMEM writes with CACHE. */ - fd6_cache_inv(batch->ctx, ring); + fd6_cache_inv(batch->ctx, cs); /* Wait for CACHE_INVALIDATE to land */ - OUT_WFI5(ring); + fd_pkt7(cs, CP_WAIT_FOR_IDLE, 0); - OUT_PKT7(ring, CP_BLIT, 1); - OUT_RING(ring, CP_BLIT_0_OP(BLIT_OP_SCALE)); + fd_pkt7(cs, CP_BLIT, 1) + .add(CP_BLIT_0(.op = BLIT_OP_SCALE)); - OUT_WFI5(ring); + fd_pkt7(cs, CP_WAIT_FOR_IDLE, 0); /* CP_BLIT writes to the CCU, unlike CP_EVENT_WRITE::BLIT which writes to * sysmem, and we generally assume that GMEM renderpasses leave their * results in sysmem, so we need to flush manually here. 
*/ - fd6_emit_flushes(batch->ctx, ring, - FD6_FLUSH_CCU_COLOR | FD6_WAIT_FOR_IDLE); + fd6_emit_flushes(batch->ctx, cs, FD6_FLUSH_CCU_COLOR | FD6_WAIT_FOR_IDLE); } FD_GENX(fd6_resolve_tile); @@ -1306,28 +1270,30 @@ handle_rgba_blit(struct fd_context *ctx, const struct pipe_blit_info *info) fd_batch_update_queries(batch); - emit_setup(batch); + fd_cs cs(batch->draw); + + emit_setup(batch->ctx, cs); DBG_BLIT(info, batch); - trace_start_blit(&batch->trace, batch->draw, info->src.resource->target, + trace_start_blit(&batch->trace, cs.ring(), info->src.resource->target, info->dst.resource->target); if ((info->src.resource->target == PIPE_BUFFER) && (info->dst.resource->target == PIPE_BUFFER)) { assert(src->layout.tile_mode == TILE6_LINEAR); assert(dst->layout.tile_mode == TILE6_LINEAR); - emit_blit_buffer(ctx, batch->draw, info); + emit_blit_buffer(ctx, cs, info); } else { /* I don't *think* we need to handle blits between buffer <-> !buffer */ assert(info->src.resource->target != PIPE_BUFFER); assert(info->dst.resource->target != PIPE_BUFFER); - emit_blit_texture(ctx, batch->draw, info); + emit_blit_texture(ctx, cs, info); } - trace_end_blit(&batch->trace, batch->draw); + trace_end_blit(&batch->trace, cs.ring()); - fd6_emit_flushes(batch->ctx, batch->draw, + fd6_emit_flushes(batch->ctx, cs, FD6_FLUSH_CCU_COLOR | FD6_FLUSH_CCU_DEPTH | FD6_FLUSH_CACHE | diff --git a/src/gallium/drivers/freedreno/a6xx/fd6_blitter.h b/src/gallium/drivers/freedreno/a6xx/fd6_blitter.h index 5c4e694522f..35be5565784 100644 --- a/src/gallium/drivers/freedreno/a6xx/fd6_blitter.h +++ b/src/gallium/drivers/freedreno/a6xx/fd6_blitter.h @@ -14,6 +14,7 @@ #include "freedreno_context.h" +#include "fd6_pack.h" template void fd6_blitter_init(struct pipe_context *pctx); @@ -29,11 +30,11 @@ template void fd6_clear_lrz(struct fd_batch *batch, struct fd_resource *zsbuf, struct fd_bo *lrz, double depth) assert_dt; template -void fd6_clear_surface(struct fd_context *ctx, struct fd_ringbuffer *ring, +void fd6_clear_surface(struct fd_context *ctx, fd_cs &cs, struct pipe_surface *psurf, const struct pipe_box *box2d, union pipe_color_union *color, uint32_t unknown_8c01) assert_dt; template -void fd6_resolve_tile(struct fd_batch *batch, struct fd_ringbuffer *ring, - uint32_t base, struct pipe_surface *psurf, uint32_t unknown_8c01) assert_dt; +void fd6_resolve_tile(struct fd_batch *batch, fd_cs &cs, uint32_t base, + struct pipe_surface *psurf, uint32_t unknown_8c01) assert_dt; #endif /* FD6_BLIT_H_ */ diff --git a/src/gallium/drivers/freedreno/a6xx/fd6_compute.cc b/src/gallium/drivers/freedreno/a6xx/fd6_compute.cc index 6119a1b7336..b518e4f40e5 100644 --- a/src/gallium/drivers/freedreno/a6xx/fd6_compute.cc +++ b/src/gallium/drivers/freedreno/a6xx/fd6_compute.cc @@ -23,9 +23,10 @@ #include "fd6_emit.h" #include "fd6_pack.h" +/* nregs: 2 */ template static void -cs_program_emit_local_size(struct fd_context *ctx, struct fd_ringbuffer *ring, +cs_program_emit_local_size(struct fd_context *ctx, fd_crb &crb, struct ir3_shader_variant *v, uint16_t local_size[3]) { /* @@ -43,53 +44,50 @@ cs_program_emit_local_size(struct fd_context *ctx, struct fd_ringbuffer *ring, : (local_size[1] % 2 == 0) ? 
9 : 17; - OUT_REG(ring, - SP_CS_WGE_CNTL( - CHIP, - .linearlocalidregid = INVALID_REG, - .threadsize = thrsz_cs, - .workgrouprastorderzfirsten = true, - .wgtilewidth = 4, - .wgtileheight = tile_height, - ) - ); + crb.add(SP_CS_WGE_CNTL(CHIP, + .linearlocalidregid = INVALID_REG, + .threadsize = thrsz_cs, + .workgrouprastorderzfirsten = true, + .wgtilewidth = 4, + .wgtileheight = tile_height, + )); - OUT_REG(ring, - A7XX_SP_CS_NDRANGE_7( - .localsizex = local_size[0] - 1, - .localsizey = local_size[1] - 1, - .localsizez = local_size[2] - 1, - ) - ); + crb.add(A7XX_SP_CS_NDRANGE_7( + .localsizex = local_size[0] - 1, + .localsizey = local_size[1] - 1, + .localsizez = local_size[2] - 1, + )); } } +/* nregs: 9 */ template static void -cs_program_emit(struct fd_context *ctx, struct fd_ringbuffer *ring, - struct ir3_shader_variant *v) +cs_program_emit(struct fd_context *ctx, fd_crb &crb, struct ir3_shader_variant *v) assert_dt { - OUT_REG(ring, SP_UPDATE_CNTL(CHIP, .vs_state = true, .hs_state = true, - .ds_state = true, .gs_state = true, - .fs_state = true, .cs_state = true, - .cs_uav = true, .gfx_uav = true, )); - - OUT_REG(ring, SP_CS_CONST_CONFIG( - CHIP, - .constlen = v->constlen, - .enabled = true, + crb.add(SP_UPDATE_CNTL(CHIP, + .vs_state = true, .hs_state = true, + .ds_state = true, .gs_state = true, + .fs_state = true, .cs_state = true, + .cs_uav = true, .gfx_uav = true, )); - OUT_PKT4(ring, REG_A6XX_SP_CS_CONFIG, 1); - OUT_RING(ring, A6XX_SP_CS_CONFIG_ENABLED | - COND(v->bindless_tex, A6XX_SP_CS_CONFIG_BINDLESS_TEX) | - COND(v->bindless_samp, A6XX_SP_CS_CONFIG_BINDLESS_SAMP) | - COND(v->bindless_ibo, A6XX_SP_CS_CONFIG_BINDLESS_UAV) | - COND(v->bindless_ubo, A6XX_SP_CS_CONFIG_BINDLESS_UBO) | - A6XX_SP_CS_CONFIG_NUAV(ir3_shader_num_uavs(v)) | - A6XX_SP_CS_CONFIG_NTEX(v->num_samp) | - A6XX_SP_CS_CONFIG_NSAMP(v->num_samp)); /* SP_CS_CONFIG */ + crb.add(SP_CS_CONST_CONFIG(CHIP, + .constlen = v->constlen, + .enabled = true, + )); + + crb.add(A6XX_SP_CS_CONFIG( + .bindless_tex = v->bindless_tex, + .bindless_samp = v->bindless_samp, + .bindless_uav = v->bindless_ibo, + .bindless_ubo = v->bindless_ubo, + .enabled = true, + .ntex = v->num_samp, + .nsamp = v->num_samp, + .nuav = ir3_shader_num_uavs(v), + )); uint32_t local_invocation_id = v->cs.local_invocation_id; uint32_t work_group_id = v->cs.work_group_id; @@ -104,54 +102,53 @@ cs_program_emit(struct fd_context *ctx, struct fd_ringbuffer *ring, .supports_double_threadsize ? 
thrsz : THREAD128; if (CHIP == A6XX) { - OUT_PKT4(ring, REG_A6XX_SP_CS_CONST_CONFIG_0, 2); - OUT_RING(ring, A6XX_SP_CS_CONST_CONFIG_0_WGIDCONSTID(work_group_id) | - A6XX_SP_CS_CONST_CONFIG_0_WGSIZECONSTID(regid(63, 0)) | - A6XX_SP_CS_CONST_CONFIG_0_WGOFFSETCONSTID(regid(63, 0)) | - A6XX_SP_CS_CONST_CONFIG_0_LOCALIDREGID(local_invocation_id)); - OUT_RING(ring, A6XX_SP_CS_WGE_CNTL_LINEARLOCALIDREGID(regid(63, 0)) | - A6XX_SP_CS_WGE_CNTL_THREADSIZE(thrsz_cs)); + crb.add(A6XX_SP_CS_CONST_CONFIG_0( + .wgidconstid = work_group_id, + .wgsizeconstid = INVALID_REG, + .wgoffsetconstid = INVALID_REG, + .localidregid = local_invocation_id, + )); + crb.add(SP_CS_WGE_CNTL(CHIP, + .linearlocalidregid = INVALID_REG, + .threadsize = thrsz_cs, + )); + if (!ctx->screen->info->a6xx.supports_double_threadsize) { - OUT_PKT4(ring, REG_A6XX_SP_PS_WAVE_CNTL, 1); - OUT_RING(ring, A6XX_SP_PS_WAVE_CNTL_THREADSIZE(thrsz)); + crb.add(SP_PS_WAVE_CNTL(CHIP, .threadsize = thrsz)); } if (ctx->screen->info->a6xx.has_lpac) { - OUT_PKT4(ring, REG_A6XX_SP_CS_WIE_CNTL_0, 2); - OUT_RING(ring, A6XX_SP_CS_WIE_CNTL_0_WGIDCONSTID(work_group_id) | - A6XX_SP_CS_WIE_CNTL_0_WGSIZECONSTID(regid(63, 0)) | - A6XX_SP_CS_WIE_CNTL_0_WGOFFSETCONSTID(regid(63, 0)) | - A6XX_SP_CS_WIE_CNTL_0_LOCALIDREGID(local_invocation_id)); - OUT_RING(ring, A6XX_SP_CS_WIE_CNTL_1_LINEARLOCALIDREGID(regid(63, 0)) | - A6XX_SP_CS_WIE_CNTL_1_THREADSIZE(thrsz)); - } - } else { - OUT_REG(ring, SP_PS_WAVE_CNTL(CHIP, .threadsize = THREAD64)); - OUT_REG(ring, - A6XX_SP_CS_WIE_CNTL_0( + crb.add(A6XX_SP_CS_WIE_CNTL_0( .wgidconstid = work_group_id, .wgsizeconstid = INVALID_REG, .wgoffsetconstid = INVALID_REG, .localidregid = local_invocation_id, - ) - ); - OUT_REG(ring, - SP_CS_WIE_CNTL_1( - CHIP, + )); + crb.add(SP_CS_WIE_CNTL_1(CHIP, .linearlocalidregid = INVALID_REG, - .threadsize = thrsz_cs, - .workitemrastorder = - v->cs.force_linear_dispatch ? WORKITEMRASTORDER_LINEAR - : WORKITEMRASTORDER_TILED, - ) - ); - OUT_REG(ring, A7XX_SP_CS_UNKNOWN_A9BE(0)); // Sometimes is 0x08000000 + .threadsize = thrsz, + )); + } + } else { + crb.add(SP_PS_WAVE_CNTL(CHIP, .threadsize = THREAD64)); + crb.add(A6XX_SP_CS_WIE_CNTL_0( + .wgidconstid = work_group_id, + .wgsizeconstid = INVALID_REG, + .wgoffsetconstid = INVALID_REG, + .localidregid = local_invocation_id, + )); + crb.add(SP_CS_WIE_CNTL_1(CHIP, + .linearlocalidregid = INVALID_REG, + .threadsize = thrsz_cs, + .workitemrastorder = + v->cs.force_linear_dispatch ? 
WORKITEMRASTORDER_LINEAR + : WORKITEMRASTORDER_TILED, + )); + crb.add(A7XX_SP_CS_UNKNOWN_A9BE(0)); // Sometimes is 0x08000000 } if (!v->local_size_variable) - cs_program_emit_local_size(ctx, ring, v, v->local_size); - - fd6_emit_shader(ctx, ring, v); + cs_program_emit_local_size(ctx, crb, v, v->local_size); } template @@ -159,7 +156,7 @@ static void fd6_launch_grid(struct fd_context *ctx, const struct pipe_grid_info *info) in_dt { struct fd6_compute_state *cp = (struct fd6_compute_state *)ctx->compute; - struct fd_ringbuffer *ring = ctx->batch->draw; + fd_cs cs(ctx->batch->draw); if (unlikely(!cp->v)) { struct ir3_shader_state *hwcso = (struct ir3_shader_state *)cp->hwcso; @@ -170,16 +167,18 @@ fd6_launch_grid(struct fd_context *ctx, const struct pipe_grid_info *info) in_dt return; cp->stateobj = fd_ringbuffer_new_object(ctx->pipe, 0x1000); - cs_program_emit(ctx, cp->stateobj, cp->v); + fd_cs cs(cp->stateobj); + with_crb (cs, 9) + cs_program_emit(ctx, crb, cp->v); + fd6_emit_shader(ctx, cs, cp->v); } - trace_start_compute(&ctx->batch->trace, ring, !!info->indirect, info->work_dim, + trace_start_compute(&ctx->batch->trace, cs.ring(), !!info->indirect, info->work_dim, info->block[0], info->block[1], info->block[2], info->grid[0], info->grid[1], info->grid[2], cp->v->shader_id); - if (ctx->batch->barrier) - fd6_barrier_flush(ctx->batch); + fd6_barrier_flush(cs, ctx->batch); bool emit_instrlen_workaround = cp->v->instrlen > ctx->screen->info->a6xx.instr_cache_size; @@ -200,37 +199,22 @@ fd6_launch_grid(struct fd_context *ctx, const struct pipe_grid_info *info) in_dt * See https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/19023 */ if (emit_instrlen_workaround) { - OUT_REG(ring, A6XX_SP_PS_INSTR_SIZE(cp->v->instrlen)); - fd6_event_write(ctx, ring, FD_LABEL); + fd_pkt4(cs, 1) + .add(A6XX_SP_PS_INSTR_SIZE(cp->v->instrlen)); + fd6_event_write(ctx, cs, FD_LABEL); } if (ctx->gen_dirty) - fd6_emit_cs_state(ctx, ring, cp); + fd6_emit_cs_state(ctx, cs, cp); if (ctx->gen_dirty & BIT(FD6_GROUP_CONST)) - fd6_emit_cs_user_consts(ctx, ring, cp->v); + fd6_emit_cs_user_consts(ctx, cs, cp->v); if (cp->v->need_driver_params) - fd6_emit_cs_driver_params(ctx, ring, cp->v, info); + fd6_emit_cs_driver_params(ctx, cs, cp->v, info); - OUT_PKT7(ring, CP_SET_MARKER, 1); - OUT_RING(ring, A6XX_CP_SET_MARKER_0_MODE(RM6_COMPUTE)); - - uint32_t shared_size = - MAX2(((int)(cp->v->cs.req_local_mem + info->variable_shared_mem) - 1) / 1024, 1); - enum a6xx_const_ram_mode mode = - cp->v->constlen > 256 ? CONSTLEN_512 : - (cp->v->constlen > 192 ? CONSTLEN_256 : - (cp->v->constlen > 128 ? CONSTLEN_192 : CONSTLEN_128)); - OUT_PKT4(ring, REG_A6XX_SP_CS_CNTL_1, 1); - OUT_RING(ring, A6XX_SP_CS_CNTL_1_SHARED_SIZE(shared_size) | - A6XX_SP_CS_CNTL_1_CONSTANTRAMMODE(mode)); - - if (CHIP == A6XX && ctx->screen->info->a6xx.has_lpac) { - OUT_PKT4(ring, REG_A6XX_HLSQ_CS_CTRL_REG1, 1); - OUT_RING(ring, A6XX_HLSQ_CS_CTRL_REG1_SHARED_SIZE(shared_size) | - A6XX_HLSQ_CS_CTRL_REG1_CONSTANTRAMMODE(mode)); - } + fd_pkt7(cs, CP_SET_MARKER, 1) + .add(A6XX_CP_SET_MARKER_0_MODE(RM6_COMPUTE)); const unsigned *local_size = info->block; // v->shader->nir->info->workgroup_size; @@ -238,61 +222,74 @@ fd6_launch_grid(struct fd_context *ctx, const struct pipe_grid_info *info) in_dt /* for some reason, mesa/st doesn't set info->work_dim, so just assume 3: */ const unsigned work_dim = info->work_dim ? 
info->work_dim : 3; - if (cp->v->local_size_variable) { - uint16_t wg[] = {local_size[0], local_size[1], local_size[2]}; - cs_program_emit_local_size(ctx, ring, cp->v, wg); + with_crb (cs, 15) { + uint32_t shared_size = + MAX2(((int)(cp->v->cs.req_local_mem + info->variable_shared_mem) - 1) / 1024, 1); + enum a6xx_const_ram_mode mode = + cp->v->constlen > 256 ? CONSTLEN_512 : + (cp->v->constlen > 192 ? CONSTLEN_256 : + (cp->v->constlen > 128 ? CONSTLEN_192 : CONSTLEN_128)); + crb.add(A6XX_SP_CS_CNTL_1( + .shared_size = shared_size, + .constantrammode = mode, + )); + + if (CHIP == A6XX && ctx->screen->info->a6xx.has_lpac) { + crb.add(A6XX_HLSQ_CS_CTRL_REG1( + .shared_size = shared_size, + .constantrammode = mode, + )); + } + + if (cp->v->local_size_variable) { + uint16_t wg[] = {local_size[0], local_size[1], local_size[2]}; + cs_program_emit_local_size(ctx, crb, cp->v, wg); + } + + crb.add(SP_CS_NDRANGE_0(CHIP, + .kerneldim = work_dim, + .localsizex = local_size[0] - 1, + .localsizey = local_size[1] - 1, + .localsizez = local_size[2] - 1, + )); + crb.add(SP_CS_NDRANGE_1(CHIP, + .globalsize_x = local_size[0] * num_groups[0], + )); + crb.add(SP_CS_NDRANGE_2(CHIP, .globaloff_x = 0)); + crb.add(SP_CS_NDRANGE_3(CHIP, + .globalsize_y = local_size[1] * num_groups[1], + )); + crb.add(SP_CS_NDRANGE_4(CHIP, .globaloff_y = 0)); + crb.add(SP_CS_NDRANGE_5(CHIP, + .globalsize_z = local_size[2] * num_groups[2], + )); + crb.add(SP_CS_NDRANGE_6(CHIP, .globaloff_z = 0)); + + crb.add(SP_CS_KERNEL_GROUP_X(CHIP, 1)); + crb.add(SP_CS_KERNEL_GROUP_Y(CHIP, 1)); + crb.add(SP_CS_KERNEL_GROUP_Z(CHIP, 1)); } - OUT_REG(ring, - SP_CS_NDRANGE_0( - CHIP, - .kerneldim = work_dim, - .localsizex = local_size[0] - 1, - .localsizey = local_size[1] - 1, - .localsizez = local_size[2] - 1, - ), - SP_CS_NDRANGE_1( - CHIP, - .globalsize_x = local_size[0] * num_groups[0], - ), - SP_CS_NDRANGE_2(CHIP, .globaloff_x = 0), - SP_CS_NDRANGE_3( - CHIP, - .globalsize_y = local_size[1] * num_groups[1], - ), - SP_CS_NDRANGE_4(CHIP, .globaloff_y = 0), - SP_CS_NDRANGE_5( - CHIP, - .globalsize_z = local_size[2] * num_groups[2], - ), - SP_CS_NDRANGE_6(CHIP, .globaloff_z = 0), - ); - - OUT_REG(ring, - SP_CS_KERNEL_GROUP_X(CHIP, 1), - SP_CS_KERNEL_GROUP_Y(CHIP, 1), - SP_CS_KERNEL_GROUP_Z(CHIP, 1), - ); - if (info->indirect) { struct fd_resource *rsc = fd_resource(info->indirect); - OUT_PKT7(ring, CP_EXEC_CS_INDIRECT, 4); - OUT_RING(ring, 0x00000000); - OUT_RELOC(ring, rsc->bo, info->indirect_offset, 0, 0); /* ADDR_LO/HI */ - OUT_RING(ring, - A5XX_CP_EXEC_CS_INDIRECT_3_LOCALSIZEX(local_size[0] - 1) | - A5XX_CP_EXEC_CS_INDIRECT_3_LOCALSIZEY(local_size[1] - 1) | - A5XX_CP_EXEC_CS_INDIRECT_3_LOCALSIZEZ(local_size[2] - 1)); + fd_pkt7(cs, CP_EXEC_CS_INDIRECT, 4) + .add(A4XX_CP_EXEC_CS_INDIRECT_0()) + .add(A5XX_CP_EXEC_CS_INDIRECT_ADDR(rsc->bo, info->indirect_offset)) + .add(A5XX_CP_EXEC_CS_INDIRECT_3( + .localsizex = local_size[0] - 1, + .localsizey = local_size[1] - 1, + .localsizez = local_size[2] - 1, + )); } else { - OUT_PKT7(ring, CP_EXEC_CS, 4); - OUT_RING(ring, 0x00000000); - OUT_RING(ring, CP_EXEC_CS_1_NGROUPS_X(info->grid[0])); - OUT_RING(ring, CP_EXEC_CS_2_NGROUPS_Y(info->grid[1])); - OUT_RING(ring, CP_EXEC_CS_3_NGROUPS_Z(info->grid[2])); + fd_pkt7(cs, CP_EXEC_CS, 4) + .add(CP_EXEC_CS_0()) + .add(CP_EXEC_CS_1(info->grid[0])) + .add(CP_EXEC_CS_2(info->grid[1])) + .add(CP_EXEC_CS_3(info->grid[2])); } - trace_end_compute(&ctx->batch->trace, ring); + trace_end_compute(&ctx->batch->trace, cs.ring()); fd_context_all_clean(ctx); } diff --git 
a/src/gallium/drivers/freedreno/a6xx/fd6_const.cc b/src/gallium/drivers/freedreno/a6xx/fd6_const.cc index dcb036527f0..2c4614a9be6 100644 --- a/src/gallium/drivers/freedreno/a6xx/fd6_const.cc +++ b/src/gallium/drivers/freedreno/a6xx/fd6_const.cc @@ -11,37 +11,34 @@ #include "fd6_compute.h" #include "fd6_pack.h" -#define emit_const_user fd6_emit_const_user -#define emit_const_bo fd6_emit_const_bo #include "ir3_const.h" + static inline void -fd6_emit_driver_ubo(struct fd_ringbuffer *ring, const struct ir3_shader_variant *v, +fd6_emit_driver_ubo(fd_cs &cs, const struct ir3_shader_variant *v, int base, uint32_t sizedwords, unsigned buffer_offset, struct fd_bo *bo) { - enum a6xx_state_block block = fd6_stage2shadersb(v->type); + int size_vec4s = DIV_ROUND_UP(sizedwords, 4); /* base == ubo idx */ - OUT_PKT7(ring, fd6_stage2opcode(v->type), 5); - OUT_RING(ring, CP_LOAD_STATE6_0_DST_OFF(base) | - CP_LOAD_STATE6_0_STATE_TYPE(ST6_UBO) | - CP_LOAD_STATE6_0_STATE_SRC(SS6_DIRECT) | - CP_LOAD_STATE6_0_STATE_BLOCK(block) | - CP_LOAD_STATE6_0_NUM_UNIT(1)); - OUT_RING(ring, CP_LOAD_STATE6_1_EXT_SRC_ADDR(0)); - OUT_RING(ring, CP_LOAD_STATE6_2_EXT_SRC_ADDR_HI(0)); - - int size_vec4s = DIV_ROUND_UP(sizedwords, 4); - OUT_RELOC(ring, bo, buffer_offset, - ((uint64_t)A6XX_UBO_1_SIZE(size_vec4s) << 32), 0); + fd_pkt7(cs, fd6_stage2opcode(v->type), 5) + .add(CP_LOAD_STATE6_0( + .dst_off = base, + .state_type = ST6_UBO, + .state_src = SS6_DIRECT, + .state_block = fd6_stage2shadersb(v->type), + .num_unit = 1, + )) + .add(CP_LOAD_STATE6_EXT_SRC_ADDR()) + .add(A6XX_UBO_DESC(0, bo, buffer_offset, size_vec4s)); } /* A helper to upload driver-params to a UBO, for the case where constants are * loaded by shader preamble rather than ST6_CONSTANTS */ static void -fd6_upload_emit_driver_ubo(struct fd_context *ctx, struct fd_ringbuffer *ring, +fd6_upload_emit_driver_ubo(struct fd_context *ctx, fd_cs &cs, const struct ir3_shader_variant *v, int base, uint32_t sizedwords, const void *dwords) { @@ -63,9 +60,9 @@ fd6_upload_emit_driver_ubo(struct fd_context *ctx, struct fd_ringbuffer *ring, * this allocation happens outside of the context of batch resource * tracking. 
*/ - fd_ringbuffer_attach_bo(ring, fd_resource(buffer)->bo); + cs.attach_bo(fd_resource(buffer)->bo); - fd6_emit_driver_ubo(ring, v, base, sizedwords, buffer_offset, + fd6_emit_driver_ubo(cs, v, base, sizedwords, buffer_offset, fd_resource(buffer)->bo); pipe_resource_reference(&buffer, NULL); @@ -76,8 +73,7 @@ fd6_upload_emit_driver_ubo(struct fd_context *ctx, struct fd_ringbuffer *ring, * sizedwords: size of const value buffer */ void -fd6_emit_const_user(struct fd_ringbuffer *ring, - const struct ir3_shader_variant *v, uint32_t regid, +fd6_emit_const_user(fd_cs &cs, const struct ir3_shader_variant *v, uint32_t regid, uint32_t sizedwords, const uint32_t *dwords) { emit_const_asserts(v, regid, sizedwords); @@ -88,28 +84,29 @@ fd6_emit_const_user(struct fd_ringbuffer *ring, */ uint32_t align_sz = align(sizedwords, 4); - if (fd6_geom_stage(v->type)) { - OUT_PKTBUF(ring, CP_LOAD_STATE6_GEOM, dwords, align_sz, - CP_LOAD_STATE6_0(.dst_off = regid / 4, .state_type = ST6_CONSTANTS, - .state_src = SS6_DIRECT, - .state_block = fd6_stage2shadersb(v->type), - .num_unit = DIV_ROUND_UP(sizedwords, 4)), - CP_LOAD_STATE6_1(), - CP_LOAD_STATE6_2()); - } else { - OUT_PKTBUF(ring, CP_LOAD_STATE6_FRAG, dwords, align_sz, - CP_LOAD_STATE6_0(.dst_off = regid / 4, .state_type = ST6_CONSTANTS, - .state_src = SS6_DIRECT, - .state_block = fd6_stage2shadersb(v->type), - .num_unit = DIV_ROUND_UP(sizedwords, 4)), - CP_LOAD_STATE6_1(), - CP_LOAD_STATE6_2()); - } + fd_pkt7(cs, fd6_stage2opcode(v->type), 3 + align_sz) + .add(CP_LOAD_STATE6_0( + .dst_off = regid / 4, + .state_type = ST6_CONSTANTS, + .state_src = SS6_DIRECT, + .state_block = fd6_stage2shadersb(v->type), + .num_unit = DIV_ROUND_UP(sizedwords, 4) + )) + .add(CP_LOAD_STATE6_EXT_SRC_ADDR()) + .add(dwords, align_sz); +} + +static void +emit_const_user(struct fd_ringbuffer *ring, + const struct ir3_shader_variant *v, uint32_t regid, + uint32_t size, const uint32_t *user_buffer) +{ + fd_cs cs(ring); + fd6_emit_const_user(cs, v, regid, size, user_buffer); } void -fd6_emit_const_bo(struct fd_ringbuffer *ring, - const struct ir3_shader_variant *v, uint32_t regid, +fd6_emit_const_bo(fd_cs &cs, const struct ir3_shader_variant *v, uint32_t regid, uint32_t offset, uint32_t sizedwords, struct fd_bo *bo) { uint32_t dst_off = regid / 4; @@ -119,21 +116,23 @@ fd6_emit_const_bo(struct fd_ringbuffer *ring, emit_const_asserts(v, regid, sizedwords); - if (fd6_geom_stage(v->type)) { - OUT_PKT(ring, CP_LOAD_STATE6_GEOM, - CP_LOAD_STATE6_0(.dst_off = dst_off, .state_type = ST6_CONSTANTS, - .state_src = SS6_INDIRECT, - .state_block = fd6_stage2shadersb(v->type), - .num_unit = num_unit, ), - CP_LOAD_STATE6_EXT_SRC_ADDR(.bo = bo, .bo_offset = offset)); - } else { - OUT_PKT(ring, CP_LOAD_STATE6_FRAG, - CP_LOAD_STATE6_0(.dst_off = dst_off, .state_type = ST6_CONSTANTS, - .state_src = SS6_INDIRECT, - .state_block = fd6_stage2shadersb(v->type), - .num_unit = num_unit, ), - CP_LOAD_STATE6_EXT_SRC_ADDR(.bo = bo, .bo_offset = offset)); - } + fd_pkt7(cs, fd6_stage2opcode(v->type), 3) + .add(CP_LOAD_STATE6_0( + .dst_off = dst_off, .state_type = ST6_CONSTANTS, + .state_src = SS6_INDIRECT, + .state_block = fd6_stage2shadersb(v->type), + .num_unit = num_unit, + )) + .add(CP_LOAD_STATE6_EXT_SRC_ADDR(.bo = bo, .bo_offset = offset)); +} + +static void +emit_const_bo(struct fd_ringbuffer *ring, + const struct ir3_shader_variant *v, uint32_t regid, + uint32_t offset, uint32_t size, struct fd_bo *bo) +{ + fd_cs cs(ring); + fd6_emit_const_bo(cs, v, regid, offset, size, bo); } static bool @@ -158,7 
+157,7 @@ wait_mem_writes(struct fd_context *ctx) template static void -emit_stage_tess_consts(struct fd_ringbuffer *ring, const struct ir3_shader_variant *v, +emit_stage_tess_consts(fd_cs &cs, const struct ir3_shader_variant *v, struct fd_context *ctx, uint32_t *params, int num_params) { const struct ir3_const_state *const_state = ir3_const_state(v); @@ -166,7 +165,7 @@ emit_stage_tess_consts(struct fd_ringbuffer *ring, const struct ir3_shader_varia if (CHIP == A7XX && ctx->screen->info->a7xx.load_shader_consts_via_preamble) { int base = const_state->primitive_param_ubo.idx; - fd6_upload_emit_driver_ubo(ctx, ring, v, base, num_params, params); + fd6_upload_emit_driver_ubo(ctx, cs, v, base, num_params, params); } else if (ir3_const_can_upload(&const_state->allocs, IR3_CONST_ALLOC_PRIMITIVE_PARAM, v->constlen)) { @@ -174,7 +173,7 @@ emit_stage_tess_consts(struct fd_ringbuffer *ring, const struct ir3_shader_varia const_state->allocs.consts[IR3_CONST_ALLOC_PRIMITIVE_PARAM].offset_vec4; int size = MIN2(1 + regid, v->constlen) - regid; if (size > 0) - fd6_emit_const_user(ring, v, regid * 4, num_params, params); + fd6_emit_const_user(cs, v, regid * 4, num_params, params); } } @@ -183,8 +182,7 @@ struct fd_ringbuffer * fd6_build_tess_consts(struct fd6_emit *emit) { struct fd_context *ctx = emit->ctx; - struct fd_ringbuffer *constobj = fd_submit_new_ringbuffer( - ctx->batch->submit, 0x1000, FD_RINGBUFFER_STREAMING); + fd_cs constobj(ctx->batch->submit, 0x1000); /* VS sizes are in bytes since that's what STLW/LDLW use, while the HS * size is dwords, since that's what LDG/STG use. @@ -205,7 +203,7 @@ fd6_build_tess_consts(struct fd6_emit *emit) int64_t tess_factor_iova = fd_bo_get_iova(tess_bo); int64_t tess_param_iova = tess_factor_iova + FD6_TESS_FACTOR_SIZE; - fd_ringbuffer_attach_bo(constobj, tess_bo); + constobj.attach_bo(tess_bo); uint32_t hs_params[8] = { emit->vs->output_size * num_vertices * 4, /* vs primitive stride */ @@ -258,12 +256,12 @@ fd6_build_tess_consts(struct fd6_emit *emit) gs_params, ARRAY_SIZE(gs_params)); } - return constobj; + return constobj.ring(); } FD_GENX(fd6_build_tess_consts); static void -fd6_emit_ubos(const struct ir3_shader_variant *v, struct fd_ringbuffer *ring, +fd6_emit_ubos(const struct ir3_shader_variant *v, fd_cs &cs, struct fd_constbuf_stateobj *constbuf) { const struct ir3_const_state *const_state = ir3_const_state(v); @@ -272,25 +270,26 @@ fd6_emit_ubos(const struct ir3_shader_variant *v, struct fd_ringbuffer *ring, if (!num_ubos) return; - OUT_PKT7(ring, fd6_stage2opcode(v->type), 3 + (2 * num_ubos)); - OUT_RING(ring, CP_LOAD_STATE6_0_DST_OFF(0) | - CP_LOAD_STATE6_0_STATE_TYPE(ST6_UBO) | - CP_LOAD_STATE6_0_STATE_SRC(SS6_DIRECT) | - CP_LOAD_STATE6_0_STATE_BLOCK(fd6_stage2shadersb(v->type)) | - CP_LOAD_STATE6_0_NUM_UNIT(num_ubos)); - OUT_RING(ring, CP_LOAD_STATE6_1_EXT_SRC_ADDR(0)); - OUT_RING(ring, CP_LOAD_STATE6_2_EXT_SRC_ADDR_HI(0)); + fd_pkt7 pkt(cs, fd6_stage2opcode(v->type), 3 + (2 * num_ubos)); + + pkt.add(CP_LOAD_STATE6_0( + .dst_off = 0, + .state_type = ST6_UBO, + .state_src = SS6_DIRECT, + .state_block = fd6_stage2shadersb(v->type), + .num_unit = num_ubos, + )) + .add(CP_LOAD_STATE6_EXT_SRC_ADDR()); for (int i = 0; i < num_ubos; i++) { struct pipe_constant_buffer *cb = &constbuf->cb[i]; if (cb->buffer) { + struct fd_bo *bo = fd_resource(cb->buffer)->bo; int size_vec4s = DIV_ROUND_UP(cb->buffer_size, 16); - OUT_RELOC(ring, fd_resource(cb->buffer)->bo, cb->buffer_offset, - (uint64_t)A6XX_UBO_1_SIZE(size_vec4s) << 32, 0); + pkt.add(A6XX_UBO_DESC(i, 
bo, cb->buffer_offset, size_vec4s)); } else { - OUT_RING(ring, 0xbad00000 | (i << 16)); - OUT_RING(ring, A6XX_UBO_1_SIZE(0)); + pkt.add(A6XX_UBO_DESC(i, NULL, 0, 0)); } } } @@ -325,16 +324,15 @@ FD_GENX(fd6_user_consts_cmdstream_size); template static void -emit_user_consts(const struct ir3_shader_variant *v, - struct fd_ringbuffer *ring, +emit_user_consts(const struct ir3_shader_variant *v, fd_cs &cs, struct fd_constbuf_stateobj *constbuf) { - fd6_emit_ubos(v, ring, constbuf); + fd6_emit_ubos(v, cs, constbuf); if (CHIP == A7XX && v->compiler->load_shader_consts_via_preamble) return; - ir3_emit_user_consts(v, ring, constbuf); + ir3_emit_user_consts(v, cs.ring(), constbuf); } template @@ -344,8 +342,7 @@ fd6_build_user_consts(struct fd6_emit *emit) struct fd_context *ctx = emit->ctx; unsigned sz = emit->prog->user_consts_cmdstream_size; - struct fd_ringbuffer *constobj = - fd_submit_new_ringbuffer(ctx->batch->submit, sz, FD_RINGBUFFER_STREAMING); + fd_cs constobj(ctx->batch->submit, sz); emit_user_consts(emit->vs, constobj, &ctx->constbuf[MESA_SHADER_VERTEX]); @@ -360,7 +357,7 @@ fd6_build_user_consts(struct fd6_emit *emit) } emit_user_consts(emit->fs, constobj, &ctx->constbuf[MESA_SHADER_FRAGMENT]); - return constobj; + return constobj.ring(); } template struct fd_ringbuffer * fd6_build_user_consts(struct fd6_emit *emit); template struct fd_ringbuffer * fd6_build_user_consts(struct fd6_emit *emit); @@ -369,7 +366,7 @@ template struct fd_ringbuffer * fd6_build_user_consts(struct f template static inline void -emit_driver_params(const struct ir3_shader_variant *v, struct fd_ringbuffer *dpconstobj, +emit_driver_params(const struct ir3_shader_variant *v, fd_cs &dpconstobj, struct fd_context *ctx, const struct pipe_draw_info *info, const struct pipe_draw_indirect_info *indirect, const struct ir3_driver_params_vs *vertex_params) @@ -382,14 +379,13 @@ emit_driver_params(const struct ir3_shader_variant *v, struct fd_ringbuffer *dpc dword_sizeof(*vertex_params), vertex_params); } else { - ir3_emit_driver_params(v, dpconstobj, ctx, info, indirect, vertex_params); + ir3_emit_driver_params(v, dpconstobj.ring(), ctx, info, indirect, vertex_params); } } template static inline void -emit_hs_driver_params(const struct ir3_shader_variant *v, - struct fd_ringbuffer *dpconstobj, +emit_hs_driver_params(const struct ir3_shader_variant *v, fd_cs &dpconstobj, struct fd_context *ctx) { if (CHIP == A7XX && ctx->screen->info->a7xx.load_shader_consts_via_preamble) { @@ -401,7 +397,7 @@ emit_hs_driver_params(const struct ir3_shader_variant *v, dword_sizeof(hs_params), &hs_params); } else { - ir3_emit_hs_driver_params(v, dpconstobj, ctx); + ir3_emit_hs_driver_params(v, dpconstobj.ring(), ctx); } } @@ -442,12 +438,11 @@ fd6_build_driver_params(struct fd6_emit *emit) num_dp * (4 + dword_sizeof(p)) + /* 4dw PKT7 header */ num_ubo_dp * 6; /* 6dw per UBO descriptor */ - struct fd_ringbuffer *dpconstobj = fd_submit_new_ringbuffer( - ctx->batch->submit, size_dwords * 4, FD_RINGBUFFER_STREAMING); + fd_cs dpconstobj(ctx->batch->submit, size_dwords * 4); /* VS still works the old way*/ if (emit->vs->need_driver_params) { - ir3_emit_driver_params(emit->vs, dpconstobj, ctx, emit->info, emit->indirect, &p); + ir3_emit_driver_params(emit->vs, dpconstobj.ring(), ctx, emit->info, emit->indirect, &p); } if (PIPELINE == HAS_TESS_GS) { @@ -469,7 +464,7 @@ fd6_build_driver_params(struct fd6_emit *emit) fd6_ctx->has_dp_state = true; - return dpconstobj; + return dpconstobj.ring(); } template struct fd_ringbuffer * 
fd6_build_driver_params(struct fd6_emit *emit); @@ -479,8 +474,7 @@ template struct fd_ringbuffer * fd6_build_driver_params(struct template void -fd6_emit_cs_driver_params(struct fd_context *ctx, - struct fd_ringbuffer *ring, +fd6_emit_cs_driver_params(struct fd_context *ctx, fd_cs &cs, const struct ir3_shader_variant *v, const struct pipe_grid_info *info) { @@ -501,20 +495,20 @@ fd6_emit_cs_driver_params(struct fd_context *ctx, if (info->indirect) { /* Copy indirect params into UBO: */ - ctx->screen->mem_to_mem(ring, buffer, buffer_offset, info->indirect, + ctx->screen->mem_to_mem(cs.ring(), buffer, buffer_offset, info->indirect, info->indirect_offset, 3); wait_mem_writes(ctx); } else { - fd_ringbuffer_attach_bo(ring, fd_resource(buffer)->bo); + cs.attach_bo(fd_resource(buffer)->bo); } - fd6_emit_driver_ubo(ring, v, base, dword_sizeof(compute_params), + fd6_emit_driver_ubo(cs, v, base, dword_sizeof(compute_params), buffer_offset, fd_resource(buffer)->bo); pipe_resource_reference(&buffer, NULL); } else { - ir3_emit_cs_driver_params(v, ring, ctx, info); + ir3_emit_cs_driver_params(v, cs.ring(), ctx, info); if (info->indirect) wait_mem_writes(ctx); } @@ -523,50 +517,47 @@ FD_GENX(fd6_emit_cs_driver_params); template void -fd6_emit_cs_user_consts(struct fd_context *ctx, - struct fd_ringbuffer *ring, +fd6_emit_cs_user_consts(struct fd_context *ctx, fd_cs &cs, const struct ir3_shader_variant *v) { - emit_user_consts(v, ring, &ctx->constbuf[MESA_SHADER_COMPUTE]); + emit_user_consts(v, cs, &ctx->constbuf[MESA_SHADER_COMPUTE]); } FD_GENX(fd6_emit_cs_user_consts); template void -fd6_emit_immediates(const struct ir3_shader_variant *v, - struct fd_ringbuffer *ring) +fd6_emit_immediates(const struct ir3_shader_variant *v, fd_cs &cs) { const struct ir3_const_state *const_state = ir3_const_state(v); if (const_state->consts_ubo.idx >= 0) { int sizedwords = DIV_ROUND_UP(v->constant_data_size, 4); - fd6_emit_driver_ubo(ring, v, const_state->consts_ubo.idx, sizedwords, + fd6_emit_driver_ubo(cs, v, const_state->consts_ubo.idx, sizedwords, v->info.constant_data_offset, v->bo); } if (CHIP == A7XX && v->compiler->load_inline_uniforms_via_preamble_ldgk) return; - ir3_emit_immediates(v, ring); + ir3_emit_immediates(v, cs.ring()); } FD_GENX(fd6_emit_immediates); template void -fd6_emit_link_map(struct fd_context *ctx, +fd6_emit_link_map(struct fd_context *ctx, fd_cs &cs, const struct ir3_shader_variant *producer, - const struct ir3_shader_variant *consumer, - struct fd_ringbuffer *ring) + const struct ir3_shader_variant *consumer) { if (CHIP == A7XX && producer->compiler->load_shader_consts_via_preamble) { const struct ir3_const_state *const_state = ir3_const_state(consumer); int base = const_state->primitive_map_ubo.idx; uint32_t size = ALIGN(consumer->input_size, 4); - fd6_upload_emit_driver_ubo(ctx, ring, consumer, base, size, producer->output_loc); + fd6_upload_emit_driver_ubo(ctx, cs, consumer, base, size, producer->output_loc); } else { - ir3_emit_link_map(producer, consumer, ring); + ir3_emit_link_map(producer, consumer, cs.ring()); } } FD_GENX(fd6_emit_link_map); diff --git a/src/gallium/drivers/freedreno/a6xx/fd6_const.h b/src/gallium/drivers/freedreno/a6xx/fd6_const.h index 5bd53dc718b..962b7a948c8 100644 --- a/src/gallium/drivers/freedreno/a6xx/fd6_const.h +++ b/src/gallium/drivers/freedreno/a6xx/fd6_const.h @@ -22,21 +22,17 @@ struct fd_ringbuffer * fd6_build_driver_params(struct fd6_emit *emit) assert_dt; template -void fd6_emit_cs_driver_params(struct fd_context *ctx, - struct fd_ringbuffer *ring, 
+void fd6_emit_cs_driver_params(struct fd_context *ctx, fd_cs &cs, const struct ir3_shader_variant *v, const struct pipe_grid_info *info) assert_dt; template -void fd6_emit_cs_user_consts(struct fd_context *ctx, - struct fd_ringbuffer *ring, +void fd6_emit_cs_user_consts(struct fd_context *ctx, fd_cs &cs, const struct ir3_shader_variant *v) assert_dt; template -void fd6_emit_immediates(const struct ir3_shader_variant *v, - struct fd_ringbuffer *ring) assert_dt; +void fd6_emit_immediates(const struct ir3_shader_variant *v, fd_cs &cs) assert_dt; template -void fd6_emit_link_map(struct fd_context *ctx, +void fd6_emit_link_map(struct fd_context *ctx, fd_cs &cs, const struct ir3_shader_variant *producer, - const struct ir3_shader_variant *consumer, - struct fd_ringbuffer *ring) assert_dt; + const struct ir3_shader_variant *consumer) assert_dt; #endif /* FD6_CONST_H */ diff --git a/src/gallium/drivers/freedreno/a6xx/fd6_context.cc b/src/gallium/drivers/freedreno/a6xx/fd6_context.cc index 6cd91281459..25cca6d374b 100644 --- a/src/gallium/drivers/freedreno/a6xx/fd6_context.cc +++ b/src/gallium/drivers/freedreno/a6xx/fd6_context.cc @@ -74,11 +74,9 @@ fd6_vertex_state_create(struct pipe_context *pctx, unsigned num_elements, struct fd6_vertex_stateobj *state = CALLOC_STRUCT(fd6_vertex_stateobj); memcpy(state->base.pipe, elements, sizeof(*elements) * num_elements); state->base.num_elements = num_elements; - state->stateobj = - fd_ringbuffer_new_object(ctx->pipe, 4 * (num_elements * 4 + 1)); - struct fd_ringbuffer *ring = state->stateobj; - OUT_PKT4(ring, REG_A6XX_VFD_FETCH_INSTR(0), 2 * num_elements); + fd_crb crb(ctx->pipe, num_elements * 3); + for (int32_t i = 0; i < num_elements; i++) { const struct pipe_vertex_element *elem = &elements[i]; enum pipe_format pfmt = (enum pipe_format)elem->src_format; @@ -86,25 +84,26 @@ fd6_vertex_state_create(struct pipe_context *pctx, unsigned num_elements, bool isint = util_format_is_pure_integer(pfmt); assert(fmt != FMT6_NONE); - OUT_RING(ring, A6XX_VFD_FETCH_INSTR_INSTR_IDX(elem->vertex_buffer_index) | - A6XX_VFD_FETCH_INSTR_INSTR_OFFSET(elem->src_offset) | - A6XX_VFD_FETCH_INSTR_INSTR_FORMAT(fmt) | - COND(elem->instance_divisor, - A6XX_VFD_FETCH_INSTR_INSTR_INSTANCED) | - A6XX_VFD_FETCH_INSTR_INSTR_SWAP(fd6_vertex_swap(pfmt)) | - A6XX_VFD_FETCH_INSTR_INSTR_UNK30 | - COND(!isint, A6XX_VFD_FETCH_INSTR_INSTR_FLOAT)); - OUT_RING(ring, - MAX2(1, elem->instance_divisor)); /* VFD_FETCH_INSTR[j].STEP_RATE */ + crb.add(A6XX_VFD_FETCH_INSTR_INSTR(i, + .idx = elem->vertex_buffer_index, + .offset = elem->src_offset, + .instanced = elem->instance_divisor, + .format = fmt, + .swap = fd6_vertex_swap(pfmt), + .unk30 = true, + ._float = !isint, + )) + .add(A6XX_VFD_FETCH_INSTR_STEP_RATE(i, MAX2(1, elem->instance_divisor))); } for (int32_t i = 0; i < num_elements; i++) { const struct pipe_vertex_element *elem = &elements[i]; - OUT_PKT4(ring, REG_A6XX_VFD_VERTEX_BUFFER_STRIDE(elem->vertex_buffer_index), 1); - OUT_RING(ring, elem->src_stride); + crb.add(A6XX_VFD_VERTEX_BUFFER_STRIDE(elem->vertex_buffer_index, elem->src_stride)); } + state->stateobj = crb.ring(); + return state; } @@ -308,20 +307,19 @@ fd6_context_create(struct pipe_screen *pscreen, void *priv, fd6_blitter_init(pctx); - struct fd_ringbuffer *ring = - fd_ringbuffer_new_object(fd6_ctx->base.pipe, 6 * 4); + fd_crb crb(fd6_ctx->base.pipe, 3); - OUT_REG(ring, A6XX_GRAS_SC_MSAA_SAMPLE_POS_CNTL()); - OUT_REG(ring, A6XX_RB_MSAA_SAMPLE_POS_CNTL()); - OUT_REG(ring, A6XX_TPL1_MSAA_SAMPLE_POS_CNTL()); + 
crb.add(A6XX_GRAS_SC_MSAA_SAMPLE_POS_CNTL()) + .add(A6XX_RB_MSAA_SAMPLE_POS_CNTL()) + .add(A6XX_TPL1_MSAA_SAMPLE_POS_CNTL()); - fd6_ctx->sample_locations_disable_stateobj = ring; + fd6_ctx->sample_locations_disable_stateobj = crb.ring(); fd6_ctx->preamble = fd6_build_preemption_preamble(&fd6_ctx->base); - ring = fd_ringbuffer_new_object(fd6_ctx->base.pipe, 0x1000); - fd6_emit_static_regs(&fd6_ctx->base, ring); - fd6_ctx->restore = ring; + fd_cs restore(fd6_ctx->base.pipe, 0x1000); + fd6_emit_static_regs(restore, &fd6_ctx->base); + fd6_ctx->restore = restore.ring(); return fd_context_init_tc(pctx, flags); } diff --git a/src/gallium/drivers/freedreno/a6xx/fd6_context.h b/src/gallium/drivers/freedreno/a6xx/fd6_context.h index 62366247043..4d177206b4f 100644 --- a/src/gallium/drivers/freedreno/a6xx/fd6_context.h +++ b/src/gallium/drivers/freedreno/a6xx/fd6_context.h @@ -19,6 +19,7 @@ #include "ir3/ir3_descriptor.h" #include "fd6_hw.h" +#include "fd6_pack.h" struct fd6_lrz_state { union { @@ -165,14 +166,13 @@ struct fd6_control { (fd6_ctx)->control_mem, offsetof(struct fd6_control, member) static inline void -emit_marker6(struct fd_ringbuffer *ring, int scratch_idx) +emit_marker6(fd_cs &cs, int scratch_idx) { extern int32_t marker_cnt; - unsigned reg = REG_A6XX_CP_SCRATCH_REG(scratch_idx); if (__EMIT_MARKER) { - OUT_WFI5(ring); - OUT_PKT4(ring, reg, 1); - OUT_RING(ring, p_atomic_inc_return(&marker_cnt)); + fd_pkt7(cs, CP_WAIT_FOR_IDLE, 0); + fd_pkt4(cs, 1) + .add(A6XX_CP_SCRATCH_REG(scratch_idx, p_atomic_inc_return(&marker_cnt))); } } diff --git a/src/gallium/drivers/freedreno/a6xx/fd6_draw.cc b/src/gallium/drivers/freedreno/a6xx/fd6_draw.cc index a5d8f2a970d..0a4f8f1c77f 100644 --- a/src/gallium/drivers/freedreno/a6xx/fd6_draw.cc +++ b/src/gallium/drivers/freedreno/a6xx/fd6_draw.cc @@ -60,7 +60,7 @@ is_indexed(enum draw_type type) } static void -draw_emit_xfb(struct fd_ringbuffer *ring, struct CP_DRAW_INDX_OFFSET_0 *draw0, +draw_emit_xfb(fd_cs &cs, struct CP_DRAW_INDX_OFFSET_0 *draw0, const struct pipe_draw_info *info, const struct pipe_draw_indirect_info *indirect) { @@ -68,14 +68,13 @@ draw_emit_xfb(struct fd_ringbuffer *ring, struct CP_DRAW_INDX_OFFSET_0 *draw0, fd_stream_output_target(indirect->count_from_stream_output); struct fd_resource *offset = fd_resource(target->offset_buf); - OUT_PKT7(ring, CP_DRAW_AUTO, 6); - OUT_RING(ring, pack_CP_DRAW_INDX_OFFSET_0(*draw0).value); - OUT_RING(ring, info->instance_count); - OUT_RELOC(ring, offset->bo, 0, 0, 0); - OUT_RING( - ring, - 0); /* byte counter offset subtraced from the value read from above */ - OUT_RING(ring, target->stride); + fd_pkt7(cs, CP_DRAW_AUTO, 6) + .add(pack_CP_DRAW_INDX_OFFSET_0(*draw0)) + .add(CP_DRAW_AUTO_1(info->instance_count)) + .add(CP_DRAW_AUTO_NUM_VERTICES_BASE(offset->bo, 0)) + /* byte counter offset subtraced from the value read from above: */ + .add(CP_DRAW_AUTO_4(0)) + .add(CP_DRAW_AUTO_5(target->stride)); } static inline unsigned @@ -100,9 +99,7 @@ max_indices(const struct pipe_draw_info *info, unsigned index_offset) template static void -draw_emit_indirect(struct fd_context *ctx, - struct fd_ringbuffer *ring, - struct CP_DRAW_INDX_OFFSET_0 *draw0, +draw_emit_indirect(fd_cs &cs, struct CP_DRAW_INDX_OFFSET_0 *draw0, const struct pipe_draw_info *info, const struct pipe_draw_indirect_info *indirect, unsigned index_offset, uint32_t driver_param) @@ -110,59 +107,94 @@ draw_emit_indirect(struct fd_context *ctx, struct fd_resource *ind = fd_resource(indirect->buffer); if (DRAW == DRAW_INDIRECT_OP_INDIRECT_COUNT_INDEXED) 
{ - OUT_PKT7(ring, CP_DRAW_INDIRECT_MULTI, 11); - OUT_RING(ring, pack_CP_DRAW_INDX_OFFSET_0(*draw0).value); - OUT_RING(ring, - (A6XX_CP_DRAW_INDIRECT_MULTI_1_OPCODE(INDIRECT_OP_INDIRECT_COUNT_INDEXED) - | A6XX_CP_DRAW_INDIRECT_MULTI_1_DST_OFF(driver_param))); struct fd_resource *count_buf = fd_resource(indirect->indirect_draw_count); struct pipe_resource *idx = info->index.resource; - OUT_RING(ring, indirect->draw_count); - OUT_RELOC(ring, fd_resource(idx)->bo, index_offset, 0, 0); - OUT_RING(ring, max_indices(info, index_offset)); - OUT_RELOC(ring, ind->bo, indirect->offset, 0, 0); - OUT_RELOC(ring, count_buf->bo, indirect->indirect_draw_count_offset, 0, 0); - OUT_RING(ring, indirect->stride); + + fd_pkt7(cs, CP_DRAW_INDIRECT_MULTI, 11) + .add(pack_CP_DRAW_INDX_OFFSET_0(*draw0)) + .add(A6XX_CP_DRAW_INDIRECT_MULTI_1( + .opcode = INDIRECT_OP_INDIRECT_COUNT_INDEXED, + .dst_off = driver_param, + )) + .add(A6XX_CP_DRAW_INDIRECT_MULTI_DRAW_COUNT(indirect->draw_count)) + .add(INDIRECT_OP_INDIRECT_COUNT_INDEXED_CP_DRAW_INDIRECT_MULTI_INDEX( + fd_resource(idx)->bo, index_offset + )) + .add(INDIRECT_OP_INDIRECT_COUNT_INDEXED_CP_DRAW_INDIRECT_MULTI_MAX_INDICES( + max_indices(info, index_offset) + )) + .add(INDIRECT_OP_INDIRECT_COUNT_INDEXED_CP_DRAW_INDIRECT_MULTI_INDIRECT( + ind->bo, indirect->offset + )) + .add(INDIRECT_OP_INDIRECT_COUNT_INDEXED_CP_DRAW_INDIRECT_MULTI_INDIRECT_COUNT( + count_buf->bo, indirect->indirect_draw_count_offset + )) + .add(INDIRECT_OP_INDIRECT_COUNT_INDEXED_CP_DRAW_INDIRECT_MULTI_STRIDE( + indirect->stride + )); } else if (DRAW == DRAW_INDIRECT_OP_INDEXED) { - OUT_PKT7(ring, CP_DRAW_INDIRECT_MULTI, 9); - OUT_RING(ring, pack_CP_DRAW_INDX_OFFSET_0(*draw0).value); - OUT_RING(ring, - (A6XX_CP_DRAW_INDIRECT_MULTI_1_OPCODE(INDIRECT_OP_INDEXED) - | A6XX_CP_DRAW_INDIRECT_MULTI_1_DST_OFF(driver_param))); struct pipe_resource *idx = info->index.resource; - OUT_RING(ring, indirect->draw_count); - //index va - OUT_RELOC(ring, fd_resource(idx)->bo, index_offset, 0, 0); - //max indices - OUT_RING(ring, max_indices(info, index_offset)); - OUT_RELOC(ring, ind->bo, indirect->offset, 0, 0); - OUT_RING(ring, indirect->stride); + + fd_pkt7(cs, CP_DRAW_INDIRECT_MULTI, 9) + .add(pack_CP_DRAW_INDX_OFFSET_0(*draw0)) + .add(A6XX_CP_DRAW_INDIRECT_MULTI_1( + .opcode = INDIRECT_OP_INDEXED, + .dst_off = driver_param, + )) + .add(A6XX_CP_DRAW_INDIRECT_MULTI_DRAW_COUNT(indirect->draw_count)) + //index va + .add(INDIRECT_OP_INDEXED_CP_DRAW_INDIRECT_MULTI_INDEX( + fd_resource(idx)->bo, index_offset + )) + //max indices + .add(INDIRECT_OP_INDEXED_CP_DRAW_INDIRECT_MULTI_MAX_INDICES( + max_indices(info, index_offset) + )) + .add(INDIRECT_OP_INDEXED_CP_DRAW_INDIRECT_MULTI_INDIRECT( + ind->bo, indirect->offset + )) + .add(INDIRECT_OP_INDEXED_CP_DRAW_INDIRECT_MULTI_STRIDE( + indirect->stride + )); } else if(DRAW == DRAW_INDIRECT_OP_INDIRECT_COUNT) { - OUT_PKT7(ring, CP_DRAW_INDIRECT_MULTI, 8); - OUT_RING(ring, pack_CP_DRAW_INDX_OFFSET_0(*draw0).value); - OUT_RING(ring, - (A6XX_CP_DRAW_INDIRECT_MULTI_1_OPCODE(INDIRECT_OP_INDIRECT_COUNT) - | A6XX_CP_DRAW_INDIRECT_MULTI_1_DST_OFF(driver_param))); struct fd_resource *count_buf = fd_resource(indirect->indirect_draw_count); - OUT_RING(ring, indirect->draw_count); - OUT_RELOC(ring, ind->bo, indirect->offset, 0, 0); - OUT_RELOC(ring, count_buf->bo, indirect->indirect_draw_count_offset, 0, 0); - OUT_RING(ring, indirect->stride); + + fd_pkt7(cs, CP_DRAW_INDIRECT_MULTI, 8) + .add(pack_CP_DRAW_INDX_OFFSET_0(*draw0)) + .add(A6XX_CP_DRAW_INDIRECT_MULTI_1( + .opcode = 
INDIRECT_OP_INDIRECT_COUNT, + .dst_off = driver_param, + )) + .add(A6XX_CP_DRAW_INDIRECT_MULTI_DRAW_COUNT(indirect->draw_count)) + .add(INDIRECT_OP_INDIRECT_COUNT_CP_DRAW_INDIRECT_MULTI_INDIRECT( + ind->bo, indirect->offset + )) + .add(INDIRECT_OP_INDIRECT_COUNT_CP_DRAW_INDIRECT_MULTI_INDIRECT_COUNT( + count_buf->bo, indirect->indirect_draw_count_offset + )) + .add(INDIRECT_OP_INDIRECT_COUNT_CP_DRAW_INDIRECT_MULTI_STRIDE( + indirect->stride + )); } else if (DRAW == DRAW_INDIRECT_OP_NORMAL) { - OUT_PKT7(ring, CP_DRAW_INDIRECT_MULTI, 6); - OUT_RING(ring, pack_CP_DRAW_INDX_OFFSET_0(*draw0).value); - OUT_RING(ring, - (A6XX_CP_DRAW_INDIRECT_MULTI_1_OPCODE(INDIRECT_OP_NORMAL) - | A6XX_CP_DRAW_INDIRECT_MULTI_1_DST_OFF(driver_param))); - OUT_RING(ring, indirect->draw_count); - OUT_RELOC(ring, ind->bo, indirect->offset, 0, 0); - OUT_RING(ring, indirect->stride); + fd_pkt7(cs, CP_DRAW_INDIRECT_MULTI, 6) + .add(pack_CP_DRAW_INDX_OFFSET_0(*draw0)) + .add(A6XX_CP_DRAW_INDIRECT_MULTI_1( + .opcode = INDIRECT_OP_NORMAL, + .dst_off = driver_param, + )) + .add(A6XX_CP_DRAW_INDIRECT_MULTI_DRAW_COUNT(indirect->draw_count)) + .add(INDIRECT_OP_NORMAL_CP_DRAW_INDIRECT_MULTI_INDIRECT( + ind->bo, indirect->offset + )) + .add(INDIRECT_OP_NORMAL_CP_DRAW_INDIRECT_MULTI_STRIDE( + indirect->stride + )); } } template static void -draw_emit(struct fd_ringbuffer *ring, struct CP_DRAW_INDX_OFFSET_0 *draw0, +draw_emit(fd_cs &cs, struct CP_DRAW_INDX_OFFSET_0 *draw0, const struct pipe_draw_info *info, const struct pipe_draw_start_count_bias *draw, unsigned index_offset) { @@ -171,17 +203,21 @@ draw_emit(struct fd_ringbuffer *ring, struct CP_DRAW_INDX_OFFSET_0 *draw0, struct pipe_resource *idx_buffer = info->index.resource; - OUT_PKT(ring, CP_DRAW_INDX_OFFSET, pack_CP_DRAW_INDX_OFFSET_0(*draw0), - CP_DRAW_INDX_OFFSET_1(.num_instances = info->instance_count), - CP_DRAW_INDX_OFFSET_2(.num_indices = draw->count), - CP_DRAW_INDX_OFFSET_3(.first_indx = draw->start), - A5XX_CP_DRAW_INDX_OFFSET_INDX_BASE(fd_resource(idx_buffer)->bo, - index_offset), - A5XX_CP_DRAW_INDX_OFFSET_6(.max_indices = max_indices(info, index_offset))); + fd_pkt7(cs, CP_DRAW_INDX_OFFSET, 7) + .add(pack_CP_DRAW_INDX_OFFSET_0(*draw0)) + .add(CP_DRAW_INDX_OFFSET_1(.num_instances = info->instance_count)) + .add(CP_DRAW_INDX_OFFSET_2(.num_indices = draw->count)) + .add(CP_DRAW_INDX_OFFSET_3(.first_indx = draw->start)) + .add(A5XX_CP_DRAW_INDX_OFFSET_INDX_BASE( + fd_resource(idx_buffer)->bo, + index_offset + )) + .add(A5XX_CP_DRAW_INDX_OFFSET_6(.max_indices = max_indices(info, index_offset))); } else if (DRAW == DRAW_DIRECT_OP_NORMAL) { - OUT_PKT(ring, CP_DRAW_INDX_OFFSET, pack_CP_DRAW_INDX_OFFSET_0(*draw0), - CP_DRAW_INDX_OFFSET_1(.num_instances = info->instance_count), - CP_DRAW_INDX_OFFSET_2(.num_indices = draw->count)); + fd_pkt7(cs, CP_DRAW_INDX_OFFSET, 3) + .add(pack_CP_DRAW_INDX_OFFSET_0(*draw0)) + .add(CP_DRAW_INDX_OFFSET_1(.num_instances = info->instance_count)) + .add(CP_DRAW_INDX_OFFSET_2(.num_indices = draw->count)); } } @@ -258,18 +294,16 @@ get_program_state(struct fd_context *ctx, const struct pipe_draw_info *info) template static void -flush_streamout(struct fd_context *ctx, struct fd6_emit *emit) +flush_streamout(struct fd_context *ctx, fd_cs &cs, struct fd6_emit *emit) assert_dt { if (!emit->streamout_mask) return; - struct fd_ringbuffer *ring = ctx->batch->draw; - for (unsigned i = 0; i < PIPE_MAX_SO_BUFFERS; i++) { if (emit->streamout_mask & (1 << i)) { enum fd_gpu_event evt = (enum fd_gpu_event)(FD_FLUSH_SO_0 + i); - fd6_event_write(ctx, ring, 
evt); + fd6_event_write(ctx, cs, evt); } } } @@ -360,7 +394,7 @@ draw_vbos(struct fd_context *ctx, const struct pipe_draw_info *info, ctx->stats.fs_regs += ir3_shader_halfregs(emit.fs); } - struct fd_ringbuffer *ring = ctx->batch->draw; + fd_cs cs(ctx->batch->draw); struct CP_DRAW_INDX_OFFSET_0 draw0 = { .prim_type = ctx->screen->primtypes[info->mode], @@ -400,35 +434,36 @@ draw_vbos(struct fd_context *ctx, const struct pipe_draw_info *info, /* convert from # of patches to draw count */ subdraw_size *= ctx->patch_vertices; - OUT_PKT7(ring, CP_SET_SUBDRAW_SIZE, 1); - OUT_RING(ring, subdraw_size); + fd_pkt7(cs, CP_SET_SUBDRAW_SIZE, 1) + .add(subdraw_size); ctx->batch->tessellation = true; } - uint32_t index_start = is_indexed(DRAW) ? draws[0].index_bias : draws[0].start; - if (ctx->last.dirty || (ctx->last.index_start != index_start)) { - OUT_PKT4(ring, REG_A6XX_VFD_INDEX_OFFSET, 1); - OUT_RING(ring, index_start); /* VFD_INDEX_OFFSET */ - ctx->last.index_start = index_start; - } + { + fd_crb crb(cs, 3); - if (ctx->last.dirty || (ctx->last.instance_start != info->start_instance)) { - OUT_PKT4(ring, REG_A6XX_VFD_INSTANCE_START_OFFSET, 1); - OUT_RING(ring, info->start_instance); /* VFD_INSTANCE_START_OFFSET */ - ctx->last.instance_start = info->start_instance; - } + uint32_t index_start = is_indexed(DRAW) ? draws[0].index_bias : draws[0].start; + if (ctx->last.dirty || (ctx->last.index_start != index_start)) { + crb.add(A6XX_VFD_INDEX_OFFSET(index_start)); + ctx->last.index_start = index_start; + } - uint32_t restart_index = - info->primitive_restart ? info->restart_index : 0xffffffff; - if (ctx->last.dirty || (ctx->last.restart_index != restart_index)) { - OUT_PKT4(ring, REG_A6XX_PC_RESTART_INDEX, 1); - OUT_RING(ring, restart_index); /* PC_RESTART_INDEX */ - ctx->last.restart_index = restart_index; + if (ctx->last.dirty || (ctx->last.instance_start != info->start_instance)) { + crb.add(A6XX_VFD_INSTANCE_START_OFFSET(info->start_instance)); + ctx->last.instance_start = info->start_instance; + } + + uint32_t restart_index = + info->primitive_restart ? info->restart_index : 0xffffffff; + if (ctx->last.dirty || (ctx->last.restart_index != restart_index)) { + crb.add(A6XX_PC_RESTART_INDEX(restart_index)); + ctx->last.restart_index = restart_index; + } } if (emit.dirty_groups) - fd6_emit_3d_state(ring, &emit); + fd6_emit_3d_state(cs, &emit); /* All known firmware versions do not wait for WFI's with CP_DRAW_AUTO. * Plus, for the common case where the counter buffer is written by @@ -444,8 +479,7 @@ draw_vbos(struct fd_context *ctx, const struct pipe_draw_info *info, DRAW == DRAW_INDIRECT_OP_INDIRECT_COUNT) ctx->batch->barrier |= FD6_WAIT_FOR_ME; - if (ctx->batch->barrier) - fd6_barrier_flush(ctx->batch); + fd6_barrier_flush(cs, ctx->batch); /* for debug after a lock up, write a unique counter value * to scratch7 for each draw, to make it easier to match up @@ -453,12 +487,12 @@ draw_vbos(struct fd_context *ctx, const struct pipe_draw_info *info, * (scratch6) and DRAW is enough to "triangulate" the * particular draw that caused lockup. 
*/ - emit_marker6(ring, 7); + emit_marker6(cs, 7); if (is_indirect(DRAW)) { assert(num_draws == 1); /* only >1 for direct draws */ if (DRAW == DRAW_INDIRECT_OP_XFB) { - draw_emit_xfb(ring, &draw0, info, indirect); + draw_emit_xfb(cs, &draw0, info, indirect); } else { const struct ir3_const_state *const_state = ir3_const_state(emit.vs); uint32_t dst_offset_dp = @@ -470,10 +504,10 @@ draw_vbos(struct fd_context *ctx, const struct pipe_draw_info *info, emit.vs->constlen)) dst_offset_dp = 0; - draw_emit_indirect(ctx, ring, &draw0, info, indirect, index_offset, dst_offset_dp); + draw_emit_indirect(cs, &draw0, info, indirect, index_offset, dst_offset_dp); } } else { - draw_emit(ring, &draw0, info, &draws[0], index_offset); + draw_emit(cs, &draw0, info, &draws[0], index_offset); if (unlikely(num_draws > 1)) { @@ -492,14 +526,14 @@ draw_vbos(struct fd_context *ctx, const struct pipe_draw_info *info, uint32_t last_index_start = ctx->last.index_start; for (unsigned i = 1; i < num_draws; i++) { - flush_streamout(ctx, &emit); + flush_streamout(ctx, cs, &emit); fd6_vsc_update_sizes(ctx->batch, info, &draws[i]); uint32_t index_start = is_indexed(DRAW) ? draws[i].index_bias : draws[i].start; if (last_index_start != index_start) { - OUT_PKT4(ring, REG_A6XX_VFD_INDEX_OFFSET, 1); - OUT_RING(ring, index_start); /* VFD_INDEX_OFFSET */ + fd_pkt4(cs, 1) + .add(A6XX_VFD_INDEX_OFFSET(index_start)); last_index_start = index_start; } @@ -507,21 +541,21 @@ draw_vbos(struct fd_context *ctx, const struct pipe_draw_info *info, emit.state.num_groups = 0; emit.draw = &draws[i]; emit.draw_id = info->increment_draw_id ? i : 0; - fd6_emit_3d_state(ring, &emit); + fd6_emit_3d_state(cs, &emit); } assert(!index_offset); /* handled by util_draw_multi() */ - draw_emit(ring, &draw0, info, &draws[i], 0); + draw_emit(cs, &draw0, info, &draws[i], 0); } ctx->last.index_start = last_index_start; } } - emit_marker6(ring, 7); + emit_marker6(cs, 7); - flush_streamout(ctx, &emit); + flush_streamout(ctx, cs, &emit); fd_context_all_clean(ctx); } diff --git a/src/gallium/drivers/freedreno/a6xx/fd6_emit.cc b/src/gallium/drivers/freedreno/a6xx/fd6_emit.cc index 65ebfb27f88..c4d933887db 100644 --- a/src/gallium/drivers/freedreno/a6xx/fd6_emit.cc +++ b/src/gallium/drivers/freedreno/a6xx/fd6_emit.cc @@ -52,29 +52,26 @@ build_vbo_state(struct fd6_emit *emit) assert_dt const struct fd_vertex_state *vtx = &emit->ctx->vtx; const unsigned cnt = vtx->vertexbuf.count; - const unsigned dwords = cnt * 4; /* per vbo: reg64 + one reg32 + pkt hdr */ - struct fd_ringbuffer *ring = fd_submit_new_ringbuffer( - emit->ctx->batch->submit, 4 * dwords, FD_RINGBUFFER_STREAMING); + fd_crb crb(emit->ctx->batch->submit, 3 * cnt); for (int32_t j = 0; j < cnt; j++) { - OUT_PKT4(ring, REG_A6XX_VFD_VERTEX_BUFFER(j), 3); + const struct pipe_vertex_buffer *vb = &vtx->vertexbuf.vb[j]; struct fd_resource *rsc = fd_resource(vb->buffer.resource); if (rsc == NULL) { - OUT_RING(ring, 0); - OUT_RING(ring, 0); - OUT_RING(ring, 0); + crb.add(A6XX_VFD_VERTEX_BUFFER_BASE(j)); + crb.add(A6XX_VFD_VERTEX_BUFFER_SIZE(j)); } else { uint32_t off = vb->buffer_offset; uint32_t size = vb->buffer.resource->width0 - off; - OUT_RELOC(ring, rsc->bo, off, 0, 0); - OUT_RING(ring, size); /* VFD_VERTEX_BUFFER[j].SIZE */ + crb.add(A6XX_VFD_VERTEX_BUFFER_BASE(j, .bo = rsc->bo, .bo_offset = off)); + crb.add(A6XX_VFD_VERTEX_BUFFER_SIZE(j, size)); } } - return ring; + return crb.ring(); } static enum a6xx_ztest_mode @@ -227,46 +224,39 @@ build_lrz(struct fd6_emit *emit) assert_dt fd6_ctx->last.lrz = lrz; - 
unsigned ndwords = (CHIP >= A7XX) ? 10 : 8; - struct fd_ringbuffer *ring = fd_submit_new_ringbuffer( - ctx->batch->submit, ndwords * 4, FD_RINGBUFFER_STREAMING); + unsigned nregs = (CHIP >= A7XX) ? 5 : 4; + fd_crb crb(ctx->batch->submit, nregs); if (CHIP >= A7XX) { - OUT_REG(ring, - A6XX_GRAS_LRZ_CNTL( - .enable = lrz.enable, - .lrz_write = lrz.write, - .greater = lrz.direction == FD_LRZ_GREATER, - .z_write_enable = lrz.test, - .z_bounds_enable = lrz.z_bounds_enable, - ) - ); - OUT_REG(ring, - A7XX_GRAS_LRZ_CNTL2( - .disable_on_wrong_dir = false, - .fc_enable = false, - ) - ); + crb.add(A6XX_GRAS_LRZ_CNTL( + .enable = lrz.enable, + .lrz_write = lrz.write, + .greater = lrz.direction == FD_LRZ_GREATER, + .z_write_enable = lrz.test, + .z_bounds_enable = lrz.z_bounds_enable, + )) + .add(A7XX_GRAS_LRZ_CNTL2( + .disable_on_wrong_dir = false, + .fc_enable = false, + )); } else { - OUT_REG(ring, - A6XX_GRAS_LRZ_CNTL( - .enable = lrz.enable, - .lrz_write = lrz.write, - .greater = lrz.direction == FD_LRZ_GREATER, - .fc_enable = false, - .z_write_enable = lrz.test, - .z_bounds_enable = lrz.z_bounds_enable, - .disable_on_wrong_dir = false, + crb.add(A6XX_GRAS_LRZ_CNTL( + .enable = lrz.enable, + .lrz_write = lrz.write, + .greater = lrz.direction == FD_LRZ_GREATER, + .fc_enable = false, + .z_write_enable = lrz.test, + .z_bounds_enable = lrz.z_bounds_enable, + .disable_on_wrong_dir = false, ) ); } - OUT_REG(ring, A6XX_RB_LRZ_CNTL(.enable = lrz.enable, )); - OUT_REG(ring, A6XX_RB_DEPTH_PLANE_CNTL(.z_mode = lrz.z_mode, )); + crb.add(A6XX_RB_LRZ_CNTL(.enable = lrz.enable, )) + .add(A6XX_RB_DEPTH_PLANE_CNTL(.z_mode = lrz.z_mode, )) + .add(A6XX_GRAS_SU_DEPTH_PLANE_CNTL(.z_mode = lrz.z_mode, )); - OUT_REG(ring, A6XX_GRAS_SU_DEPTH_PLANE_CNTL(.z_mode = lrz.z_mode, )); - - return ring; + return crb.ring(); } static struct fd_ringbuffer * @@ -276,18 +266,14 @@ build_scissor(struct fd6_emit *emit) assert_dt struct pipe_scissor_state *scissors = fd_context_get_scissor(ctx); unsigned num_viewports = emit->prog->num_viewports; - struct fd_ringbuffer *ring = fd_submit_new_ringbuffer( - emit->ctx->batch->submit, (1 + (2 * num_viewports)) * 4, FD_RINGBUFFER_STREAMING); + fd_crb crb(emit->ctx->batch->submit, 2 * num_viewports); - OUT_PKT4(ring, REG_A6XX_GRAS_SC_SCREEN_SCISSOR_TL(0), 2 * num_viewports); for (unsigned i = 0; i < num_viewports; i++) { - OUT_RING(ring, A6XX_GRAS_SC_SCREEN_SCISSOR_TL_X(scissors[i].minx) | - A6XX_GRAS_SC_SCREEN_SCISSOR_TL_Y(scissors[i].miny)); - OUT_RING(ring, A6XX_GRAS_SC_SCREEN_SCISSOR_BR_X(scissors[i].maxx) | - A6XX_GRAS_SC_SCREEN_SCISSOR_BR_Y(scissors[i].maxy)); + crb.add(A6XX_GRAS_SC_SCREEN_SCISSOR_TL(i, .x = scissors[i].minx, .y = scissors[i].miny)) + .add(A6XX_GRAS_SC_SCREEN_SCISSOR_BR(i, .x = scissors[i].maxx, .y = scissors[i].maxy)); } - return ring; + return crb.ring(); } /* Combination of FD_DIRTY_FRAMEBUFFER | FD_DIRTY_RASTERIZER_DISCARD | @@ -301,8 +287,7 @@ build_prog_fb_rast(struct fd6_emit *emit) assert_dt const struct fd6_program_state *prog = fd6_emit_get_prog(emit); const struct ir3_shader_variant *fs = emit->fs; - struct fd_ringbuffer *ring = fd_submit_new_ringbuffer( - ctx->batch->submit, 9 * 4, FD_RINGBUFFER_STREAMING); + fd_crb crb(ctx->batch->submit, 5); unsigned nr = pfb->nr_cbufs; @@ -314,18 +299,14 @@ build_prog_fb_rast(struct fd6_emit *emit) assert_dt if (blend->use_dual_src_blend) nr++; - OUT_PKT4(ring, REG_A6XX_RB_PS_OUTPUT_CNTL, 2); - OUT_RING(ring, COND(fs->writes_pos, A6XX_RB_PS_OUTPUT_CNTL_FRAG_WRITES_Z) | - COND(fs->writes_smask && pfb->samples > 1, - 
A6XX_RB_PS_OUTPUT_CNTL_FRAG_WRITES_SAMPMASK) | - COND(fs->writes_stencilref, - A6XX_RB_PS_OUTPUT_CNTL_FRAG_WRITES_STENCILREF) | - COND(blend->use_dual_src_blend, - A6XX_RB_PS_OUTPUT_CNTL_DUAL_COLOR_IN_ENABLE)); - OUT_RING(ring, A6XX_RB_PS_MRT_CNTL_MRT(nr)); - - OUT_PKT4(ring, REG_A6XX_SP_PS_MRT_CNTL, 1); - OUT_RING(ring, A6XX_SP_PS_MRT_CNTL_MRT(nr)); + crb.add(A6XX_RB_PS_OUTPUT_CNTL( + .dual_color_in_enable = blend->use_dual_src_blend, + .frag_writes_z = fs->writes_pos, + .frag_writes_sampmask = fs->writes_smask && pfb->samples > 1, + .frag_writes_stencilref = fs->writes_stencilref, + )); + crb.add(A6XX_RB_PS_MRT_CNTL(.mrt = nr)); + crb.add(A6XX_SP_PS_MRT_CNTL(.mrt = nr)); unsigned mrt_components = 0; for (unsigned i = 0; i < pfb->nr_cbufs; i++) { @@ -340,10 +321,10 @@ build_prog_fb_rast(struct fd6_emit *emit) assert_dt mrt_components &= prog->mrt_components; - OUT_REG(ring, A6XX_SP_PS_OUTPUT_MASK(.dword = mrt_components)); - OUT_REG(ring, A6XX_RB_PS_OUTPUT_MASK(.dword = mrt_components)); + crb.add(A6XX_SP_PS_OUTPUT_MASK(.dword = mrt_components)) + .add(A6XX_RB_PS_OUTPUT_MASK(.dword = mrt_components)); - return ring; + return crb.ring(); } static struct fd_ringbuffer * @@ -351,15 +332,13 @@ build_blend_color(struct fd6_emit *emit) assert_dt { struct fd_context *ctx = emit->ctx; struct pipe_blend_color *bcolor = &ctx->blend_color; - struct fd_ringbuffer *ring = fd_submit_new_ringbuffer( - ctx->batch->submit, 5 * 4, FD_RINGBUFFER_STREAMING); - OUT_REG(ring, A6XX_RB_BLEND_CONSTANT_RED_FP32(bcolor->color[0]), - A6XX_RB_BLEND_CONSTANT_GREEN_FP32(bcolor->color[1]), - A6XX_RB_BLEND_CONSTANT_BLUE_FP32(bcolor->color[2]), - A6XX_RB_BLEND_CONSTANT_ALPHA_FP32(bcolor->color[3])); - - return ring; + return fd_crb(ctx->batch->submit, 4) + .add(A6XX_RB_BLEND_CONSTANT_RED_FP32(bcolor->color[0])) + .add(A6XX_RB_BLEND_CONSTANT_GREEN_FP32(bcolor->color[1])) + .add(A6XX_RB_BLEND_CONSTANT_BLUE_FP32(bcolor->color[2])) + .add(A6XX_RB_BLEND_CONSTANT_ALPHA_FP32(bcolor->color[3])) + .ring(); } static struct fd_ringbuffer * @@ -373,9 +352,6 @@ build_sample_locations(struct fd6_emit *emit) return fd_ringbuffer_ref(fd6_ctx->sample_locations_disable_stateobj); } - struct fd_ringbuffer *ring = fd_submit_new_ringbuffer( - ctx->batch->submit, 9 * 4, FD_RINGBUFFER_STREAMING); - uint32_t sample_locations = 0; for (int i = 0; i < 4; i++) { float x = (ctx->sample_locations[i] & 0xf) / 16.0f; @@ -389,21 +365,19 @@ build_sample_locations(struct fd6_emit *emit) A6XX_RB_PROGRAMMABLE_MSAA_POS_0_SAMPLE_0_Y(y)) << i*8; } - OUT_REG(ring, A6XX_GRAS_SC_MSAA_SAMPLE_POS_CNTL(.location_enable = true), - A6XX_GRAS_SC_PROGRAMMABLE_MSAA_POS_0(.dword = sample_locations)); - - OUT_REG(ring, A6XX_RB_MSAA_SAMPLE_POS_CNTL(.location_enable = true), - A6XX_RB_PROGRAMMABLE_MSAA_POS_0(.dword = sample_locations)); - - OUT_REG(ring, A6XX_TPL1_MSAA_SAMPLE_POS_CNTL(.location_enable = true), - A6XX_TPL1_PROGRAMMABLE_MSAA_POS_0(.dword = sample_locations)); - - return ring; + return fd_crb(ctx->batch->submit, 6) + .add(A6XX_GRAS_SC_MSAA_SAMPLE_POS_CNTL(.location_enable = true)) + .add(A6XX_GRAS_SC_PROGRAMMABLE_MSAA_POS_0(.dword = sample_locations)) + .add(A6XX_RB_MSAA_SAMPLE_POS_CNTL(.location_enable = true)) + .add(A6XX_RB_PROGRAMMABLE_MSAA_POS_0(.dword = sample_locations)) + .add(A6XX_TPL1_MSAA_SAMPLE_POS_CNTL(.location_enable = true)) + .add(A6XX_TPL1_PROGRAMMABLE_MSAA_POS_0(.dword = sample_locations)) + .ring(); } template static void -fd6_emit_streamout(struct fd_ringbuffer *ring, struct fd6_emit *emit) assert_dt +fd6_emit_streamout(fd_cs &cs, struct 
fd6_emit *emit) assert_dt { struct fd_context *ctx = emit->ctx; const struct fd6_program_state *prog = fd6_emit_get_prog(emit); @@ -423,34 +397,34 @@ fd6_emit_streamout(struct fd_ringbuffer *ring, struct fd6_emit *emit) assert_dt target->stride = info->stride[i]; - OUT_PKT4(ring, REG_A6XX_VPC_SO_BUFFER_BASE(i), 3); - /* VPC_SO[i].BUFFER_BASE_LO: */ - OUT_RELOC(ring, fd_resource(target->base.buffer)->bo, 0, 0, 0); - OUT_RING(ring, target->base.buffer_size + target->base.buffer_offset); + fd_pkt4(cs, 3) + .add(A6XX_VPC_SO_BUFFER_BASE(i, fd_resource(target->base.buffer)->bo)) + .add(A6XX_VPC_SO_BUFFER_SIZE(i, target->base.buffer_size + target->base.buffer_offset)); struct fd_bo *offset_bo = fd_resource(target->offset_buf)->bo; if (so->reset & (1 << i)) { assert(so->offsets[i] == 0); - OUT_PKT7(ring, CP_MEM_WRITE, 3); - OUT_RELOC(ring, offset_bo, 0, 0, 0); - OUT_RING(ring, target->base.buffer_offset); + fd_pkt7(cs, CP_MEM_WRITE, 3) + .add(CP_MEM_WRITE_ADDR(offset_bo)) + .add(target->base.buffer_offset); - OUT_PKT4(ring, REG_A6XX_VPC_SO_BUFFER_OFFSET(i), 1); - OUT_RING(ring, target->base.buffer_offset); + fd_pkt4(cs, 1) + .add(A6XX_VPC_SO_BUFFER_OFFSET(i,target->base.buffer_offset)); } else { - OUT_PKT7(ring, CP_MEM_TO_REG, 3); - OUT_RING(ring, CP_MEM_TO_REG_0_REG(REG_A6XX_VPC_SO_BUFFER_OFFSET(i)) | - COND(CHIP == A6XX, CP_MEM_TO_REG_0_SHIFT_BY_2) | - CP_MEM_TO_REG_0_UNK31 | - CP_MEM_TO_REG_0_CNT(0)); - OUT_RELOC(ring, offset_bo, 0, 0, 0); + fd_pkt7(cs, CP_MEM_TO_REG, 3) + .add(CP_MEM_TO_REG_0( + .reg = REG_A6XX_VPC_SO_BUFFER_OFFSET(i), + .shift_by_2 = CHIP == A6XX, + .unk31 = true, + )) + .add(CP_MEM_TO_REG_SRC(offset_bo)); } // After a draw HW would write the new offset to offset_bo - OUT_PKT4(ring, REG_A6XX_VPC_SO_FLUSH_BASE(i), 2); - OUT_RELOC(ring, offset_bo, 0, 0, 0); + fd_pkt4(cs, 2) + .add(A6XX_VPC_SO_FLUSH_BASE(i, offset_bo)); so->reset &= ~(1 << i); @@ -482,7 +456,7 @@ fd6_emit_streamout(struct fd_ringbuffer *ring, struct fd6_emit *emit) assert_dt * themselves. 
*/ if (ctx->dirty & FD_DIRTY_STREAMOUT) - OUT_WFI5(ring); + fd_pkt7(cs, CP_WAIT_FOR_IDLE, 0); ctx->last.streamout_mask = streamout_mask; emit->streamout_mask = streamout_mask; @@ -492,18 +466,18 @@ fd6_emit_streamout(struct fd_ringbuffer *ring, struct fd6_emit *emit) assert_dt * Stuff that less frequently changes and isn't (yet) moved into stategroups */ static void -fd6_emit_non_ring(struct fd_ringbuffer *ring, struct fd6_emit *emit) assert_dt +fd6_emit_non_group(fd_cs &cs, struct fd6_emit *emit) assert_dt { struct fd_context *ctx = emit->ctx; const enum fd_dirty_3d_state dirty = ctx->dirty; unsigned num_viewports = emit->prog->num_viewports; + fd_crb crb(cs, 324); + if (dirty & FD_DIRTY_STENCIL_REF) { struct pipe_stencil_ref *sr = &ctx->stencil_ref; - OUT_PKT4(ring, REG_A6XX_RB_STENCIL_REF_CNTL, 1); - OUT_RING(ring, A6XX_RB_STENCIL_REF_CNTL_REF(sr->ref_value[0]) | - A6XX_RB_STENCIL_REF_CNTL_BFREF(sr->ref_value[1])); + crb.add(A6XX_RB_STENCIL_REF_CNTL(.ref = sr->ref_value[0], .bfref = sr->ref_value[1])); } if (dirty & (FD_DIRTY_VIEWPORT | FD_DIRTY_PROG)) { @@ -511,25 +485,18 @@ fd6_emit_non_ring(struct fd_ringbuffer *ring, struct fd6_emit *emit) assert_dt struct pipe_scissor_state *scissor = &ctx->viewport_scissor[i]; struct pipe_viewport_state *vp = & ctx->viewport[i]; - OUT_REG(ring, A6XX_GRAS_CL_VIEWPORT_XOFFSET(i, vp->translate[0]), - A6XX_GRAS_CL_VIEWPORT_XSCALE(i, vp->scale[0]), - A6XX_GRAS_CL_VIEWPORT_YOFFSET(i, vp->translate[1]), - A6XX_GRAS_CL_VIEWPORT_YSCALE(i, vp->scale[1]), - A6XX_GRAS_CL_VIEWPORT_ZOFFSET(i, vp->translate[2]), - A6XX_GRAS_CL_VIEWPORT_ZSCALE(i, vp->scale[2])); - - OUT_REG( - ring, - A6XX_GRAS_SC_VIEWPORT_SCISSOR_TL(i, - .x = scissor->minx, - .y = scissor->miny), - A6XX_GRAS_SC_VIEWPORT_SCISSOR_BR(i, - .x = scissor->maxx, - .y = scissor->maxy)); + crb.add(A6XX_GRAS_CL_VIEWPORT_XOFFSET(i, vp->translate[0])); + crb.add(A6XX_GRAS_CL_VIEWPORT_XSCALE(i, vp->scale[0])); + crb.add(A6XX_GRAS_CL_VIEWPORT_YOFFSET(i, vp->translate[1])); + crb.add(A6XX_GRAS_CL_VIEWPORT_YSCALE(i, vp->scale[1])); + crb.add(A6XX_GRAS_CL_VIEWPORT_ZOFFSET(i, vp->translate[2])); + crb.add(A6XX_GRAS_CL_VIEWPORT_ZSCALE(i, vp->scale[2])); + crb.add(A6XX_GRAS_SC_VIEWPORT_SCISSOR_TL(i, .x = scissor->minx, .y = scissor->miny)); + crb.add(A6XX_GRAS_SC_VIEWPORT_SCISSOR_BR(i, .x = scissor->maxx, .y = scissor->maxy)); } - OUT_REG(ring, A6XX_GRAS_CL_GUARDBAND_CLIP_ADJ(.horz = ctx->guardband.x, - .vert = ctx->guardband.y)); + crb.add(A6XX_GRAS_CL_GUARDBAND_CLIP_ADJ(.horz = ctx->guardband.x, + .vert = ctx->guardband.y)); } /* The clamp ranges are only used when the rasterizer wants depth @@ -544,12 +511,14 @@ fd6_emit_non_ring(struct fd_ringbuffer *ring, struct fd6_emit *emit) assert_dt util_viewport_zmin_zmax(vp, ctx->rasterizer->clip_halfz, &zmin, &zmax); - OUT_REG(ring, A6XX_GRAS_CL_VIEWPORT_ZCLAMP_MIN(i, zmin), - A6XX_GRAS_CL_VIEWPORT_ZCLAMP_MAX(i, zmax)); + crb.add(A6XX_GRAS_CL_VIEWPORT_ZCLAMP_MIN(i, zmin)); + crb.add(A6XX_GRAS_CL_VIEWPORT_ZCLAMP_MAX(i, zmax)); /* TODO: what to do about this and multi viewport ? 
*/ - if (i == 0) - OUT_REG(ring, A6XX_RB_VIEWPORT_ZCLAMP_MIN(zmin), A6XX_RB_VIEWPORT_ZCLAMP_MAX(zmax)); + if (i == 0) { + crb.add(A6XX_RB_VIEWPORT_ZCLAMP_MIN(zmin)); + crb.add(A6XX_RB_VIEWPORT_ZCLAMP_MAX(zmax)); + } } } } @@ -558,8 +527,6 @@ static struct fd_ringbuffer* build_prim_mode(struct fd6_emit *emit, struct fd_context *ctx, bool gmem) assert_dt { - struct fd_ringbuffer *ring = - fd_submit_new_ringbuffer(emit->ctx->batch->submit, 2 * 4, FD_RINGBUFFER_STREAMING); uint32_t prim_mode = NO_FLUSH; if (emit->fs->fs.uses_fbfetch_output) { if (gmem) { @@ -571,21 +538,25 @@ build_prim_mode(struct fd6_emit *emit, struct fd_context *ctx, bool gmem) } else { prim_mode = NO_FLUSH; } - OUT_REG(ring, A6XX_GRAS_SC_CNTL(.ccusinglecachelinesize = 2, - .single_prim_mode = (enum a6xx_single_prim_mode)prim_mode)); - return ring; + + return fd_crb(ctx->batch->submit, 1) + .add(A6XX_GRAS_SC_CNTL( + .ccusinglecachelinesize = 2, + .single_prim_mode = (enum a6xx_single_prim_mode)prim_mode) + ) + .ring(); } template void -fd6_emit_3d_state(struct fd_ringbuffer *ring, struct fd6_emit *emit) +fd6_emit_3d_state(fd_cs &cs, struct fd6_emit *emit) { struct fd_context *ctx = emit->ctx; struct pipe_framebuffer_state *pfb = &ctx->batch->framebuffer; const struct fd6_program_state *prog = fd6_emit_get_prog(emit); const struct ir3_shader_variant *fs = emit->fs; - emit_marker6(ring, 5); + emit_marker6(cs, 5); /* Special case, we need to re-emit bindless FS state w/ the * fb-read state appended: @@ -713,7 +684,7 @@ fd6_emit_3d_state(struct fd_ringbuffer *ring, struct fd6_emit *emit) fd6_state_take_group(&emit->state, state, FD6_GROUP_FS_TEX); break; case FD6_GROUP_SO: - fd6_emit_streamout(ring, emit); + fd6_emit_streamout(cs, emit); break; case FD6_GROUP_PRIM_MODE_SYSMEM: state = build_prim_mode(emit, ctx, false); @@ -724,25 +695,24 @@ fd6_emit_3d_state(struct fd_ringbuffer *ring, struct fd6_emit *emit) fd6_state_take_group(&emit->state, state, FD6_GROUP_PRIM_MODE_GMEM); break; case FD6_GROUP_NON_GROUP: - fd6_emit_non_ring(ring, emit); + fd6_emit_non_group(cs, emit); break; default: break; } } - fd6_state_emit(&emit->state, ring); + fd6_state_emit(&emit->state, cs); } -template void fd6_emit_3d_state(struct fd_ringbuffer *ring, struct fd6_emit *emit); -template void fd6_emit_3d_state(struct fd_ringbuffer *ring, struct fd6_emit *emit); -template void fd6_emit_3d_state(struct fd_ringbuffer *ring, struct fd6_emit *emit); -template void fd6_emit_3d_state(struct fd_ringbuffer *ring, struct fd6_emit *emit); +template void fd6_emit_3d_state(fd_cs &cs, struct fd6_emit *emit); +template void fd6_emit_3d_state(fd_cs &cs, struct fd6_emit *emit); +template void fd6_emit_3d_state(fd_cs &cs, struct fd6_emit *emit); +template void fd6_emit_3d_state(fd_cs &cs, struct fd6_emit *emit); template void -fd6_emit_cs_state(struct fd_context *ctx, struct fd_ringbuffer *ring, - struct fd6_compute_state *cp) +fd6_emit_cs_state(struct fd_context *ctx, fd_cs &cs, struct fd6_compute_state *cp) { struct fd6_state state = {}; @@ -754,8 +724,8 @@ fd6_emit_cs_state(struct fd_context *ctx, struct fd_ringbuffer *ring, * const state, so it must execute before we start loading consts, rather * than be deferred until CP_EXEC_CS. 
*/ - OUT_PKT7(ring, CP_SET_MODE, 1); - OUT_RING(ring, 1); + fd_pkt7(cs, CP_SET_MODE, 1) + .add(1); uint32_t gen_dirty = ctx->gen_dirty & (BIT(FD6_GROUP_PROG) | BIT(FD6_GROUP_CS_TEX) | BIT(FD6_GROUP_CS_BINDLESS)); @@ -785,13 +755,13 @@ fd6_emit_cs_state(struct fd_context *ctx, struct fd_ringbuffer *ring, } } - fd6_state_emit(&state, ring); + fd6_state_emit(&state, cs); } FD_GENX(fd6_emit_cs_state); template void -fd6_emit_ccu_cntl(struct fd_ringbuffer *ring, struct fd_screen *screen, bool gmem) +fd6_emit_ccu_cntl(fd_cs &cs, struct fd_screen *screen, bool gmem) { const struct fd6_gmem_config *cfg = gmem ? &screen->config_gmem : &screen->config_sysmem; enum a6xx_ccu_cache_size color_cache_size = !gmem ? CCU_CACHE_SIZE_FULL : @@ -803,8 +773,8 @@ fd6_emit_ccu_cntl(struct fd_ringbuffer *ring, struct fd_screen *screen, bool gme uint32_t depth_offset_hi = cfg->depth_ccu_offset >> 21; if (CHIP == A7XX) { - OUT_REG(ring, - A7XX_RB_CCU_CACHE_CNTL( + fd_pkt4(cs, 1) + .add(A7XX_RB_CCU_CACHE_CNTL( .depth_offset_hi = depth_offset_hi, .color_offset_hi = color_offset_hi, .depth_cache_size = CCU_CACHE_SIZE_FULL, @@ -815,20 +785,16 @@ fd6_emit_ccu_cntl(struct fd_ringbuffer *ring, struct fd_screen *screen, bool gme ); if (screen->info->a7xx.has_gmem_vpc_attr_buf) { - OUT_REG(ring, - A7XX_VPC_ATTR_BUF_GMEM_SIZE(.size_gmem = cfg->vpc_attr_buf_size), - A7XX_VPC_ATTR_BUF_GMEM_BASE(.base_gmem = cfg->vpc_attr_buf_offset) - ); - OUT_REG(ring, - A7XX_PC_ATTR_BUF_GMEM_SIZE(.size_gmem = cfg->vpc_attr_buf_size) - ); + fd_crb(cs, 3) + .add(A7XX_VPC_ATTR_BUF_GMEM_SIZE(.size_gmem = cfg->vpc_attr_buf_size)) + .add(A7XX_VPC_ATTR_BUF_GMEM_BASE(.base_gmem = cfg->vpc_attr_buf_offset)) + .add(A7XX_PC_ATTR_BUF_GMEM_SIZE(.size_gmem = cfg->vpc_attr_buf_size)); } } else { - OUT_WFI5(ring); /* early a6xx (a630?) needed this */ + fd_pkt7(cs, CP_WAIT_FOR_IDLE, 0); - OUT_REG(ring, - RB_CCU_CNTL( - CHIP, + fd_pkt4(cs, 1) + .add(RB_CCU_CNTL(CHIP, .gmem_fast_clear_disable = !screen->info->a6xx.has_gmem_fast_clear, .concurrent_resolve = @@ -847,21 +813,22 @@ FD_GENX(fd6_emit_ccu_cntl); template static void -fd6_emit_stomp(struct fd_ringbuffer *ring, const uint16_t *regs, size_t count) +fd6_emit_stomp(fd_cs &cs, const uint16_t *regs, size_t count) { for (size_t i = 0; i < count; i++) { if (fd_reg_stomp_allowed(CHIP, regs[i])) { - WRITE(regs[i], 0xffffffff); + fd_pkt4(cs, 1).add({regs[i], 0xffffffff}); } } } - template -void -fd6_emit_static_regs(struct fd_context *ctx, struct fd_ringbuffer *ring) +static void +fd6_emit_static_non_context_regs(struct fd_context *ctx, fd_cs &cs) { struct fd_screen *screen = ctx->screen; + fd_ncrb ncrb(cs, 25 + ARRAY_SIZE(screen->info->a6xx.magic_raw)); + if (CHIP >= A7XX) { /* On A7XX, RB_CCU_CNTL was broken into two registers, RB_CCU_CNTL which has * static properties that can be set once, this requires a WFI to take effect. @@ -869,13 +836,10 @@ fd6_emit_static_regs(struct fd_context *ctx, struct fd_ringbuffer *ring) * change per-RP and don't require a WFI to take effect, only CCU inval/flush * events are required. 
*/ - OUT_REG(ring, - RB_CCU_CNTL( - CHIP, - .gmem_fast_clear_disable = true, // !screen->info->a6xx.has_gmem_fast_clear, - .concurrent_resolve = screen->info->a6xx.concurrent_resolve, - ) - ); + ncrb.add(RB_CCU_CNTL(CHIP, + .gmem_fast_clear_disable = true, // !screen->info->a6xx.has_gmem_fast_clear, + .concurrent_resolve = screen->info->a6xx.concurrent_resolve, + )); } for (size_t i = 0; i < ARRAY_SIZE(screen->info->a6xx.magic_raw); i++) { @@ -893,176 +857,163 @@ fd6_emit_static_regs(struct fd_context *ctx, struct fd_ringbuffer *ring) break; } - WRITE(magic_reg.reg, value); + ncrb.add({ .reg = magic_reg.reg, .value = value }); } - WRITE(REG_A6XX_RB_DBG_ECO_CNTL, screen->info->a6xx.magic.RB_DBG_ECO_CNTL); - WRITE(REG_A6XX_SP_NC_MODE_CNTL_2, A6XX_SP_NC_MODE_CNTL_2_F16_NO_INF); - WRITE(REG_A6XX_SP_DBG_ECO_CNTL, screen->info->a6xx.magic.SP_DBG_ECO_CNTL); - WRITE(REG_A6XX_SP_PERFCTR_SHADER_MASK, 0x3f); + ncrb.add(A6XX_RB_DBG_ECO_CNTL(.dword = screen->info->a6xx.magic.RB_DBG_ECO_CNTL)); + ncrb.add(A6XX_SP_NC_MODE_CNTL_2(.f16_no_inf = true)); + + ncrb.add(A6XX_SP_DBG_ECO_CNTL(.dword = screen->info->a6xx.magic.SP_DBG_ECO_CNTL)); + ncrb.add(A6XX_SP_PERFCTR_SHADER_MASK(.dword = 0x3f)); if (CHIP == A6XX && !screen->info->a6xx.is_a702) - WRITE(REG_A6XX_TPL1_UNKNOWN_B605, 0x44); - WRITE(REG_A6XX_TPL1_DBG_ECO_CNTL, screen->info->a6xx.magic.TPL1_DBG_ECO_CNTL); + ncrb.add(A6XX_TPL1_UNKNOWN_B605(.dword = 0x44)); + ncrb.add(A6XX_TPL1_DBG_ECO_CNTL(.dword = screen->info->a6xx.magic.TPL1_DBG_ECO_CNTL)); if (CHIP == A6XX) { - WRITE(REG_A6XX_HLSQ_UNKNOWN_BE00, 0x80); - WRITE(REG_A6XX_HLSQ_UNKNOWN_BE01, 0); + ncrb.add(A6XX_HLSQ_UNKNOWN_BE00(.dword = 0x80)); + ncrb.add(A6XX_HLSQ_UNKNOWN_BE01()); } - WRITE(REG_A6XX_VPC_DBG_ECO_CNTL, screen->info->a6xx.magic.VPC_DBG_ECO_CNTL); - WRITE(REG_A6XX_GRAS_DBG_ECO_CNTL, screen->info->a6xx.magic.GRAS_DBG_ECO_CNTL); + ncrb.add(A6XX_VPC_DBG_ECO_CNTL(.dword = screen->info->a6xx.magic.VPC_DBG_ECO_CNTL)); + ncrb.add(A6XX_GRAS_DBG_ECO_CNTL(.dword = screen->info->a6xx.magic.GRAS_DBG_ECO_CNTL)); if (CHIP == A6XX) - WRITE(REG_A6XX_HLSQ_DBG_ECO_CNTL, screen->info->a6xx.magic.HLSQ_DBG_ECO_CNTL); - WRITE(REG_A6XX_SP_CHICKEN_BITS, screen->info->a6xx.magic.SP_CHICKEN_BITS); - WRITE(REG_A6XX_SP_GFX_USIZE, 0); - WRITE(REG_A6XX_SP_UNKNOWN_B182, 0); - if (CHIP == A6XX) - WRITE(REG_A6XX_HLSQ_SHARED_CONSTS, 0); - WRITE(REG_A6XX_UCHE_UNKNOWN_0E12, screen->info->a6xx.magic.UCHE_UNKNOWN_0E12); - WRITE(REG_A6XX_UCHE_CLIENT_PF, screen->info->a6xx.magic.UCHE_CLIENT_PF); - WRITE(REG_A6XX_RB_UNKNOWN_8E01, screen->info->a6xx.magic.RB_UNKNOWN_8E01); - WRITE(REG_A6XX_SP_UNKNOWN_A9A8, 0); - OUT_REG(ring, - A6XX_SP_MODE_CNTL( + ncrb.add(A6XX_HLSQ_DBG_ECO_CNTL(.dword = screen->info->a6xx.magic.HLSQ_DBG_ECO_CNTL)); + ncrb.add(A6XX_SP_CHICKEN_BITS(.dword = screen->info->a6xx.magic.SP_CHICKEN_BITS)); + + ncrb.add(A6XX_UCHE_UNKNOWN_0E12(.dword = screen->info->a6xx.magic.UCHE_UNKNOWN_0E12)); + ncrb.add(A6XX_UCHE_CLIENT_PF(.dword = screen->info->a6xx.magic.UCHE_CLIENT_PF)); + + if (CHIP == A6XX) { + ncrb.add(A6XX_HLSQ_SHARED_CONSTS()); + ncrb.add(A6XX_VPC_UNKNOWN_9211()); + } + + ncrb.add(A6XX_GRAS_UNKNOWN_80AF()); + ncrb.add(A6XX_VPC_UNKNOWN_9602()); + + /* These regs are blocked (CP_PROTECT) on a6xx: */ + if (CHIP >= A7XX) { + ncrb.add(TPL1_BICUBIC_WEIGHTS_TABLE_0(CHIP, 0)); + ncrb.add(TPL1_BICUBIC_WEIGHTS_TABLE_1(CHIP, 0x3fe05ff4)); + ncrb.add(TPL1_BICUBIC_WEIGHTS_TABLE_2(CHIP, 0x3fa0ebee)); + ncrb.add(TPL1_BICUBIC_WEIGHTS_TABLE_3(CHIP, 0x3f5193ed)); + ncrb.add(TPL1_BICUBIC_WEIGHTS_TABLE_4(CHIP, 0x3f0243f0)); + 
} +} + +/** + * Note, CP_CONTEXT_REG_BUNCH can only write context regs, some of the static + * regs are non-context regs, attempting to write them with CRB will trigger + * CP_PROTECT errors. + */ +template +static void +fd6_emit_static_context_regs(struct fd_context *ctx, fd_cs &cs) +{ + struct fd_screen *screen = ctx->screen; + + fd_crb crb(cs, 80); + + crb.add(A6XX_SP_GFX_USIZE()); + crb.add(A6XX_SP_UNKNOWN_B182()); + + crb.add(A6XX_RB_UNKNOWN_8E01(.dword = screen->info->a6xx.magic.RB_UNKNOWN_8E01)); + crb.add(A6XX_SP_UNKNOWN_A9A8()); + + crb.add(A6XX_SP_MODE_CNTL( .constant_demotion_enable = true, .isammode = ISAMMODE_GL, .shared_consts_enable = false, ) ); - OUT_REG(ring, A6XX_VFD_MODE_CNTL(.vertex = true, .instance = true)); - WRITE(REG_A6XX_VPC_UNKNOWN_9107, 0); - WRITE(REG_A6XX_RB_UNKNOWN_8811, 0x00000010); - WRITE(REG_A6XX_PC_MODE_CNTL, screen->info->a6xx.magic.PC_MODE_CNTL); - WRITE(REG_A6XX_GRAS_LRZ_PS_INPUT_CNTL, 0); - WRITE(REG_A6XX_GRAS_LRZ_PS_SAMPLEFREQ_CNTL, 0); - WRITE(REG_A6XX_GRAS_UNKNOWN_8110, 0x2); + crb.add(A6XX_VFD_MODE_CNTL(.vertex = true, .instance = true)); + crb.add(A6XX_VPC_UNKNOWN_9107()); + crb.add(A6XX_RB_UNKNOWN_8811(.dword = 0x00000010)); + crb.add(A6XX_PC_MODE_CNTL(.dword=screen->info->a6xx.magic.PC_MODE_CNTL)); + crb.add(A6XX_GRAS_LRZ_PS_INPUT_CNTL()); + crb.add(A6XX_GRAS_LRZ_PS_SAMPLEFREQ_CNTL()); + crb.add(A6XX_GRAS_UNKNOWN_8110(.dword = 0x2)); - WRITE(REG_A6XX_RB_UNKNOWN_8818, 0); + crb.add(A6XX_RB_UNKNOWN_8818()); if (CHIP == A6XX) { - WRITE(REG_A6XX_RB_UNKNOWN_8819, 0); - WRITE(REG_A6XX_RB_UNKNOWN_881A, 0); - WRITE(REG_A6XX_RB_UNKNOWN_881B, 0); - WRITE(REG_A6XX_RB_UNKNOWN_881C, 0); - WRITE(REG_A6XX_RB_UNKNOWN_881D, 0); - WRITE(REG_A6XX_RB_UNKNOWN_881E, 0); + crb.add(A6XX_RB_UNKNOWN_8819()); + crb.add(A6XX_RB_UNKNOWN_881A()); + crb.add(A6XX_RB_UNKNOWN_881B()); + crb.add(A6XX_RB_UNKNOWN_881C()); + crb.add(A6XX_RB_UNKNOWN_881D()); + crb.add(A6XX_RB_UNKNOWN_881E()); } - WRITE(REG_A6XX_RB_UNKNOWN_88F0, 0); + crb.add(A6XX_RB_UNKNOWN_88F0()); + crb.add(A6XX_VPC_REPLACE_MODE_CNTL()); + crb.add(A6XX_VPC_UNKNOWN_9300()); + crb.add(A6XX_VPC_SO_OVERRIDE(true)); - WRITE(REG_A6XX_VPC_REPLACE_MODE_CNTL, A6XX_VPC_REPLACE_MODE_CNTL(0).value); - WRITE(REG_A6XX_VPC_UNKNOWN_9300, 0); - - WRITE(REG_A6XX_VPC_SO_OVERRIDE, A6XX_VPC_SO_OVERRIDE(true).value); - - OUT_REG(ring, VPC_RAST_STREAM_CNTL(CHIP)); + crb.add(VPC_RAST_STREAM_CNTL(CHIP)); if (CHIP == A7XX) - OUT_REG(ring, A7XX_VPC_RAST_STREAM_CNTL_V2()); + crb.add(A7XX_VPC_RAST_STREAM_CNTL_V2()); - WRITE(REG_A6XX_PC_STEREO_RENDERING_CNTL, 0); + crb.add(A6XX_PC_STEREO_RENDERING_CNTL()); + crb.add(A6XX_SP_UNKNOWN_B183()); + crb.add(A6XX_GRAS_SU_CONSERVATIVE_RAS_CNTL()); + crb.add(A6XX_GRAS_SU_VS_SIV_CNTL()); + crb.add(A6XX_GRAS_SC_CNTL(.ccusinglecachelinesize = 2)); - WRITE(REG_A6XX_SP_UNKNOWN_B183, 0); - - WRITE(REG_A6XX_GRAS_SU_CONSERVATIVE_RAS_CNTL, 0); - WRITE(REG_A6XX_GRAS_SU_VS_SIV_CNTL, 0); - WRITE(REG_A6XX_GRAS_SC_CNTL, A6XX_GRAS_SC_CNTL_CCUSINGLECACHELINESIZE(2)); - WRITE(REG_A6XX_GRAS_UNKNOWN_80AF, 0); if (CHIP == A6XX) { - WRITE(REG_A6XX_VPC_UNKNOWN_9210, 0); - WRITE(REG_A6XX_VPC_UNKNOWN_9211, 0); + crb.add(A6XX_VPC_UNKNOWN_9210()); } - WRITE(REG_A6XX_VPC_UNKNOWN_9602, 0); - WRITE(REG_A6XX_PC_UNKNOWN_9E72, 0); - /* NOTE blob seems to (mostly?) use 0xb2 for TPL1_MODE_CNTL - * but this seems to kill texture gather offsets. 
- */ - OUT_REG(ring, - A6XX_TPL1_MODE_CNTL( + + crb.add(A6XX_PC_UNKNOWN_9E72()); + + crb.add(A6XX_TPL1_MODE_CNTL( .isammode = ISAMMODE_GL, .texcoordroundmode = COORD_TRUNCATE, .nearestmipsnap = CLAMP_ROUND_TRUNCATE, - .destdatatypeoverride = true)); + .destdatatypeoverride = true, + )); - OUT_REG(ring, SP_REG_PROG_ID_3( + crb.add(SP_REG_PROG_ID_3( CHIP, .linelengthregid = INVALID_REG, .foveationqualityregid = INVALID_REG, )); - emit_marker6(ring, 7); + crb.add(A6XX_VFD_RENDER_MODE(RENDERING_PASS)); + crb.add(A6XX_VFD_STEREO_RENDERING_CNTL()); + crb.add(A6XX_VPC_SO_CNTL()); - OUT_REG(ring, A6XX_VFD_RENDER_MODE(RENDERING_PASS)); + crb.add(A6XX_GRAS_LRZ_CNTL()); + if (CHIP >= A7XX) + crb.add(A7XX_GRAS_LRZ_CNTL2()); - WRITE(REG_A6XX_VFD_STEREO_RENDERING_CNTL, 0); - - /* Clear any potential pending state groups to be safe: */ - OUT_PKT7(ring, CP_SET_DRAW_STATE, 3); - OUT_RING(ring, CP_SET_DRAW_STATE__0_COUNT(0) | - CP_SET_DRAW_STATE__0_DISABLE_ALL_GROUPS | - CP_SET_DRAW_STATE__0_GROUP_ID(0)); - OUT_RING(ring, CP_SET_DRAW_STATE__1_ADDR_LO(0)); - OUT_RING(ring, CP_SET_DRAW_STATE__2_ADDR_HI(0)); - - OUT_PKT4(ring, REG_A6XX_VPC_SO_CNTL, 1); - OUT_RING(ring, 0x00000000); /* VPC_SO_CNTL */ - - if (CHIP >= A7XX) { - OUT_REG(ring, A6XX_GRAS_LRZ_CNTL()); - OUT_REG(ring, A7XX_GRAS_LRZ_CNTL2()); - } else { - OUT_REG(ring, A6XX_GRAS_LRZ_CNTL()); - } - - OUT_REG(ring, A6XX_RB_LRZ_CNTL()); - OUT_REG(ring, A6XX_RB_DEPTH_PLANE_CNTL()); - OUT_REG(ring, A6XX_GRAS_SU_DEPTH_PLANE_CNTL()); - - OUT_PKT4(ring, REG_A6XX_GRAS_LRZ_CNTL, 1); - OUT_RING(ring, 0x00000000); - - OUT_PKT4(ring, REG_A6XX_RB_LRZ_CNTL, 1); - OUT_RING(ring, 0x00000000); + crb.add(A6XX_RB_LRZ_CNTL()); + crb.add(A6XX_RB_DEPTH_PLANE_CNTL()); + crb.add(A6XX_GRAS_SU_DEPTH_PLANE_CNTL()); /* Initialize VFD_VERTEX_BUFFER[n].SIZE to zero to avoid iova faults trying * to fetch from a VFD_VERTEX_BUFFER[n].BASE which we've potentially inherited * from another process: */ - for (int32_t i = 0; i < 32; i++) { - OUT_PKT4(ring, REG_A6XX_VFD_VERTEX_BUFFER_SIZE(i), 1); - OUT_RING(ring, 0); - } + for (int32_t i = 0; i < 32; i++) + crb.add(A6XX_VFD_VERTEX_BUFFER_SIZE(i, 0)); struct fd6_context *fd6_ctx = fd6_context(ctx); struct fd_bo *bcolor_mem = fd6_ctx->bcolor_mem; - OUT_PKT4(ring, REG_A6XX_TPL1_GFX_BORDER_COLOR_BASE, 2); - OUT_RELOC(ring, bcolor_mem, 0, 0, 0); - - OUT_PKT4(ring, REG_A6XX_TPL1_CS_BORDER_COLOR_BASE, 2); - OUT_RELOC(ring, bcolor_mem, 0, 0, 0); - - OUT_REG(ring, A6XX_PC_DGEN_SU_CONSERVATIVE_RAS_CNTL()); - - /* These regs are blocked (CP_PROTECT) on a6xx: */ - if (CHIP >= A7XX) { - OUT_REG(ring, - TPL1_BICUBIC_WEIGHTS_TABLE_0(CHIP, 0), - TPL1_BICUBIC_WEIGHTS_TABLE_1(CHIP, 0x3fe05ff4), - TPL1_BICUBIC_WEIGHTS_TABLE_2(CHIP, 0x3fa0ebee), - TPL1_BICUBIC_WEIGHTS_TABLE_3(CHIP, 0x3f5193ed), - TPL1_BICUBIC_WEIGHTS_TABLE_4(CHIP, 0x3f0243f0), - ); - } + crb.add(A6XX_TPL1_GFX_BORDER_COLOR_BASE(.bo = bcolor_mem)); + crb.add(A6XX_TPL1_CS_BORDER_COLOR_BASE(.bo = bcolor_mem)); + crb.add(A6XX_PC_DGEN_SU_CONSERVATIVE_RAS_CNTL()); if (CHIP >= A7XX) { /* Blob sets these two per draw. */ - OUT_REG(ring, A7XX_PC_HS_BUFFER_SIZE(FD6_TESS_PARAM_SIZE)); + crb.add(A7XX_PC_HS_BUFFER_SIZE(FD6_TESS_PARAM_SIZE)); /* Blob adds a bit more space ({0x10, 0x20, 0x30, 0x40} bytes) * but the meaning of this additional space is not known, * so we play safe and don't add it. 
*/ - OUT_REG(ring, A7XX_PC_TF_BUFFER_SIZE(FD6_TESS_FACTOR_SIZE)); + crb.add(A7XX_PC_TF_BUFFER_SIZE(FD6_TESS_FACTOR_SIZE)); } /* There is an optimization to skip executing draw states for draws with no @@ -1081,9 +1032,21 @@ fd6_emit_static_regs(struct fd_context *ctx, struct fd_ringbuffer *ring) * seem to be affected. */ if (screen->info->a6xx.has_early_preamble) { - WRITE(REG_A6XX_SP_PS_CNTL_0, 0); + crb.add(A6XX_SP_PS_CNTL_0()); } } + +template +void +fd6_emit_static_regs(fd_cs &cs, struct fd_context *ctx) +{ + fd6_emit_static_non_context_regs(ctx, cs); + fd6_emit_static_context_regs(ctx, cs); + + fd_pkt7(cs, CP_SET_DRAW_STATE, 3) + .add(CP_SET_DRAW_STATE__0(0, .disable_all_groups = true)) + .add(CP_SET_DRAW_STATE__ADDR(0)); +} FD_GENX(fd6_emit_static_regs); /* emit setup at begin of new cmdstream buffer (don't rely on previous @@ -1091,42 +1054,44 @@ FD_GENX(fd6_emit_static_regs); */ template void -fd6_emit_restore(struct fd_batch *batch, struct fd_ringbuffer *ring) +fd6_emit_restore(fd_cs &cs, struct fd_batch *batch) { struct fd_context *ctx = batch->ctx; struct fd_screen *screen = ctx->screen; if (!batch->nondraw) { - trace_start_state_restore(&batch->trace, ring); + trace_start_state_restore(&batch->trace, cs.ring()); } if (FD_DBG(STOMP)) { - fd6_emit_stomp(ring, &RP_BLIT_REGS[0], ARRAY_SIZE(RP_BLIT_REGS)); - fd6_emit_stomp(ring, &CMD_REGS[0], ARRAY_SIZE(CMD_REGS)); + fd6_emit_stomp(cs, &RP_BLIT_REGS[0], ARRAY_SIZE(RP_BLIT_REGS)); + fd6_emit_stomp(cs, &CMD_REGS[0], ARRAY_SIZE(CMD_REGS)); } - OUT_PKT7(ring, CP_SET_MODE, 1); - OUT_RING(ring, 0); + fd_pkt7(cs, CP_SET_MODE, 1) + .add(0x0); if (CHIP == A6XX) { - fd6_cache_inv(ctx, ring); + fd6_cache_inv(ctx, cs); } else { - OUT_PKT7(ring, CP_THREAD_CONTROL, 1); - OUT_RING(ring, CP_THREAD_CONTROL_0_THREAD(CP_SET_THREAD_BR) | - CP_THREAD_CONTROL_0_CONCURRENT_BIN_DISABLE); + fd_pkt7(cs, CP_THREAD_CONTROL, 1) + .add(CP_THREAD_CONTROL_0( + .thread = CP_SET_THREAD_BR, + .concurrent_bin_disable = true, + )); - fd6_event_write(ctx, ring, FD_CCU_INVALIDATE_COLOR); - fd6_event_write(ctx, ring, FD_CCU_INVALIDATE_DEPTH); + fd6_event_write(ctx, cs, FD_CCU_INVALIDATE_COLOR); + fd6_event_write(ctx, cs, FD_CCU_INVALIDATE_DEPTH); - OUT_PKT7(ring, CP_EVENT_WRITE, 1); - OUT_RING(ring, UNK_40); + fd_pkt7(cs, CP_EVENT_WRITE, 1) + .add(UNK_40); - fd6_event_write(ctx, ring, FD_CACHE_INVALIDATE); - OUT_WFI5(ring); + fd6_event_write(ctx, cs, FD_CACHE_INVALIDATE); + fd_pkt7(cs, CP_WAIT_FOR_IDLE, 0); } - OUT_REG(ring, - SP_UPDATE_CNTL(CHIP, + fd_pkt4(cs, 1) + .add(SP_UPDATE_CNTL(CHIP, .vs_state = true, .hs_state = true, .ds_state = true, .gs_state = true, .fs_state = true, .cs_state = true, @@ -1135,31 +1100,29 @@ fd6_emit_restore(struct fd_batch *batch, struct fd_ringbuffer *ring) .gfx_shared_const = true, .cs_bindless = CHIP == A6XX ? 0x1f : 0xff, .gfx_bindless = CHIP == A6XX ? 
0x1f : 0xff, - ) - ); + )); - OUT_WFI5(ring); + fd_pkt7(cs, CP_WAIT_FOR_IDLE, 0); - fd6_emit_ib(ring, fd6_context(ctx)->restore); - fd6_emit_ccu_cntl(ring, screen, false); + fd6_emit_ib(cs, fd6_context(ctx)->restore); + fd6_emit_ccu_cntl(cs, screen, false); - OUT_PKT7(ring, CP_SET_AMBLE, 3); - uint32_t dwords = fd_ringbuffer_emit_reloc_ring_full(ring, fd6_context(ctx)->preamble, 0) / 4; - OUT_RING(ring, CP_SET_AMBLE_2_DWORDS(dwords) | - CP_SET_AMBLE_2_TYPE(BIN_PREAMBLE_AMBLE_TYPE)); + uint32_t dwords; - OUT_PKT7(ring, CP_SET_AMBLE, 3); - OUT_RING(ring, 0x00000000); - OUT_RING(ring, 0x00000000); - OUT_RING(ring, CP_SET_AMBLE_2_TYPE(PREAMBLE_AMBLE_TYPE)); + fd_pkt7(cs, CP_SET_AMBLE, 3) + .add(fd6_context(ctx)->preamble, 0, &dwords) + .add(CP_SET_AMBLE_2(.dwords = dwords, .type = BIN_PREAMBLE_AMBLE_TYPE)); - OUT_PKT7(ring, CP_SET_AMBLE, 3); - OUT_RING(ring, 0x00000000); - OUT_RING(ring, 0x00000000); - OUT_RING(ring, CP_SET_AMBLE_2_TYPE(POSTAMBLE_AMBLE_TYPE)); + fd_pkt7(cs, CP_SET_AMBLE, 3) + .add(CP_SET_AMBLE_ADDR()) + .add(CP_SET_AMBLE_2(.type = PREAMBLE_AMBLE_TYPE)); + + fd_pkt7(cs, CP_SET_AMBLE, 3) + .add(CP_SET_AMBLE_ADDR()) + .add(CP_SET_AMBLE_2(.type = POSTAMBLE_AMBLE_TYPE)); if (!batch->nondraw) { - trace_end_state_restore(&batch->trace, ring); + trace_end_state_restore(&batch->trace, cs.ring()); } } FD_GENX(fd6_emit_restore); @@ -1171,16 +1134,17 @@ fd6_mem_to_mem(struct fd_ringbuffer *ring, struct pipe_resource *dst, { struct fd_bo *src_bo = fd_resource(src)->bo; struct fd_bo *dst_bo = fd_resource(dst)->bo; + fd_cs cs(ring); unsigned i; - fd_ringbuffer_attach_bo(ring, dst_bo); - fd_ringbuffer_attach_bo(ring, src_bo); + cs.attach_bo(dst_bo); + cs.attach_bo(src_bo); for (i = 0; i < sizedwords; i++) { - OUT_PKT7(ring, CP_MEM_TO_MEM, 5); - OUT_RING(ring, 0x00000000); - OUT_RELOC(ring, dst_bo, dst_off, 0, 0); - OUT_RELOC(ring, src_bo, src_off, 0, 0); + fd_pkt7(cs, CP_MEM_TO_MEM, 5) + .add(CP_MEM_TO_MEM_0()) + .add(CP_MEM_TO_MEM_DST(dst_bo, dst_off)) + .add(CP_MEM_TO_MEM_SRC_A(src_bo, src_off)); dst_off += 4; src_off += 4; diff --git a/src/gallium/drivers/freedreno/a6xx/fd6_emit.h b/src/gallium/drivers/freedreno/a6xx/fd6_emit.h index f012bcdbcc3..73995695d8e 100644 --- a/src/gallium/drivers/freedreno/a6xx/fd6_emit.h +++ b/src/gallium/drivers/freedreno/a6xx/fd6_emit.h @@ -104,28 +104,34 @@ struct fd6_state { }; static inline void -fd6_state_emit(struct fd6_state *state, struct fd_ringbuffer *ring) +fd6_state_emit(struct fd6_state *state, fd_cs &cs) { if (!state->num_groups) return; - OUT_PKT7(ring, CP_SET_DRAW_STATE, 3 * state->num_groups); + fd_pkt7 pkt(cs, CP_SET_DRAW_STATE, 3 * state->num_groups); + for (unsigned i = 0; i < state->num_groups; i++) { struct fd6_state_group *g = &state->groups[i]; - unsigned n = g->stateobj ? 
fd_ringbuffer_size(g->stateobj) / 4 : 0; assert((g->enable_mask & ~ENABLE_ALL) == 0); - if (n == 0) { - OUT_RING(ring, CP_SET_DRAW_STATE__0_COUNT(0) | - CP_SET_DRAW_STATE__0_DISABLE | g->enable_mask | - CP_SET_DRAW_STATE__0_GROUP_ID(g->group_id)); - OUT_RING(ring, 0x00000000); - OUT_RING(ring, 0x00000000); + if (g->stateobj) { + unsigned n = fd_ringbuffer_size(g->stateobj) / 4; + + pkt.add(CP_SET_DRAW_STATE__0(i, + .count = n, + .group_id = g->group_id, + .dword = g->enable_mask, + )); + pkt.add(g->stateobj, 0, NULL); } else { - OUT_RING(ring, CP_SET_DRAW_STATE__0_COUNT(n) | g->enable_mask | - CP_SET_DRAW_STATE__0_GROUP_ID(g->group_id)); - OUT_RB(ring, g->stateobj); + pkt.add(CP_SET_DRAW_STATE__0(i, + .disable = true, + .group_id = g->group_id, + .dword = g->enable_mask, + )); + pkt.add(CP_SET_DRAW_STATE__ADDR(i)); } if (g->stateobj) @@ -201,51 +207,55 @@ fd6_emit_get_prog(struct fd6_emit *emit) template static inline void -__event_write(struct fd_ringbuffer *ring, enum fd_gpu_event event, +__event_write(fd_cs &cs, enum fd_gpu_event event, enum event_write_src esrc, enum event_write_dst edst, uint32_t val, struct fd_bo *bo, uint32_t offset) { struct fd_gpu_event_info info = fd_gpu_events[event]; unsigned len = info.needs_seqno ? 4 : 1; + if ((CHIP == A7XX) && (event == FD_RB_DONE)) + len--; + + fd_pkt7 pkt(cs, CP_EVENT_WRITE, len); + if (CHIP == A6XX) { - OUT_PKT7(ring, CP_EVENT_WRITE, len); - OUT_RING(ring, CP_EVENT_WRITE_0_EVENT(info.raw_event) | + pkt.add(CP_EVENT_WRITE_0_EVENT(info.raw_event) | COND(info.needs_seqno, CP_EVENT_WRITE_0_TIMESTAMP)); } else if (CHIP == A7XX) { - if (event == FD_RB_DONE) - len--; - OUT_PKT7(ring, CP_EVENT_WRITE, len); - OUT_RING(ring, CP_EVENT_WRITE7_0_EVENT(info.raw_event) | - CP_EVENT_WRITE7_0_WRITE_SRC(esrc) | - CP_EVENT_WRITE7_0_WRITE_DST(edst) | - COND(info.needs_seqno, CP_EVENT_WRITE7_0_WRITE_ENABLED)); + pkt.add(CP_EVENT_WRITE7_0_EVENT(info.raw_event) | + CP_EVENT_WRITE7_0_WRITE_SRC(esrc) | + CP_EVENT_WRITE7_0_WRITE_DST(edst) | + COND(info.needs_seqno, CP_EVENT_WRITE7_0_WRITE_ENABLED)); } if (info.needs_seqno) { - OUT_RELOC(ring, bo, offset); /* ADDR_LO/HI */ + pkt.add(CP_EVENT_WRITE_ADDR( + .bo = bo, + .bo_offset = offset, + )); /* ADDR_LO/HI */ if (len == 4) - OUT_RING(ring, val); + pkt.add(val); } } template static inline void -fd6_record_ts(struct fd_ringbuffer *ring, struct fd_bo *bo, uint32_t offset) +fd6_record_ts(fd_cs &cs, struct fd_bo *bo, uint32_t offset) { - __event_write(ring, FD_RB_DONE, EV_WRITE_ALWAYSON, EV_DST_RAM, 0, bo, offset); + __event_write(cs, FD_RB_DONE, EV_WRITE_ALWAYSON, EV_DST_RAM, 0, bo, offset); } template static inline void -fd6_fence_write(struct fd_ringbuffer *ring, uint32_t val, struct fd_bo *bo, uint32_t offset) +fd6_fence_write(fd_cs &cs, uint32_t val, struct fd_bo *bo, uint32_t offset) { - __event_write(ring, FD_CACHE_CLEAN, EV_WRITE_USER_32B, EV_DST_RAM, val, bo, offset); + __event_write(cs, FD_CACHE_CLEAN, EV_WRITE_USER_32B, EV_DST_RAM, val, bo, offset); } template static inline unsigned -fd6_event_write(struct fd_context *ctx, struct fd_ringbuffer *ring, enum fd_gpu_event event) +fd6_event_write(struct fd_context *ctx, fd_cs &cs, enum fd_gpu_event event) { struct fd6_context *fd6_ctx = fd6_context(ctx); struct fd_gpu_event_info info = fd_gpu_events[event]; @@ -256,7 +266,7 @@ fd6_event_write(struct fd_context *ctx, struct fd_ringbuffer *ring, enum fd_gpu_ seqno = ++fd6_ctx->seqno; } - __event_write(ring, event, EV_WRITE_USER_32B, EV_DST_RAM, seqno, + __event_write(cs, event, EV_WRITE_USER_32B, EV_DST_RAM, 
seqno, control_ptr(fd6_ctx, seqno)); return seqno; @@ -264,45 +274,20 @@ fd6_event_write(struct fd_context *ctx, struct fd_ringbuffer *ring, enum fd_gpu_ template static inline void -fd6_cache_inv(struct fd_context *ctx, struct fd_ringbuffer *ring) +fd6_cache_inv(struct fd_context *ctx, fd_cs &cs) { - fd6_event_write(ctx, ring, FD_CCU_INVALIDATE_COLOR); - fd6_event_write(ctx, ring, FD_CCU_INVALIDATE_DEPTH); - fd6_event_write(ctx, ring, FD_CACHE_INVALIDATE); + fd6_event_write(ctx, cs, FD_CCU_INVALIDATE_COLOR); + fd6_event_write(ctx, cs, FD_CCU_INVALIDATE_DEPTH); + fd6_event_write(ctx, cs, FD_CACHE_INVALIDATE); } template static inline void -fd6_cache_flush(struct fd_context *ctx, struct fd_ringbuffer *ring) +fd6_emit_blit(struct fd_context *ctx, fd_cs &cs) { - struct fd6_context *fd6_ctx = fd6_context(ctx); - unsigned seqno; - - seqno = fd6_event_write(ctx, ring, FD_RB_DONE); - - OUT_PKT7(ring, CP_WAIT_REG_MEM, 6); - OUT_RING(ring, CP_WAIT_REG_MEM_0_FUNCTION(WRITE_EQ) | - CP_WAIT_REG_MEM_0_POLL(POLL_MEMORY)); - OUT_RELOC(ring, control_ptr(fd6_ctx, seqno)); - OUT_RING(ring, CP_WAIT_REG_MEM_3_REF(seqno)); - OUT_RING(ring, CP_WAIT_REG_MEM_4_MASK(~0)); - OUT_RING(ring, CP_WAIT_REG_MEM_5_DELAY_LOOP_CYCLES(16)); - - seqno = fd6_event_write(ctx, ring, FD_CACHE_CLEAN); - - OUT_PKT7(ring, CP_WAIT_MEM_GTE, 4); - OUT_RING(ring, CP_WAIT_MEM_GTE_0_RESERVED(0)); - OUT_RELOC(ring, control_ptr(fd6_ctx, seqno)); - OUT_RING(ring, CP_WAIT_MEM_GTE_3_REF(seqno)); -} - -template -static inline void -fd6_emit_blit(struct fd_context *ctx, struct fd_ringbuffer *ring) -{ - emit_marker6(ring, 7); - fd6_event_write(ctx, ring, FD_BLIT); - emit_marker6(ring, 7); + emit_marker6(cs, 7); + fd6_event_write(ctx, cs, FD_BLIT); + emit_marker6(cs, 7); } static inline bool @@ -323,7 +308,7 @@ fd6_geom_stage(mesa_shader_stage type) } } -static inline uint32_t +static inline enum adreno_pm4_type3_packets fd6_stage2opcode(mesa_shader_stage type) { return fd6_geom_stage(type) ? 
CP_LOAD_STATE6_GEOM : CP_LOAD_STATE6_FRAG; @@ -369,37 +354,45 @@ fd6_gl2spacing(enum gl_tess_spacing spacing) } template -void fd6_emit_3d_state(struct fd_ringbuffer *ring, - struct fd6_emit *emit) assert_dt; +void fd6_emit_3d_state(fd_cs &cs, struct fd6_emit *emit) assert_dt; struct fd6_compute_state; template -void fd6_emit_cs_state(struct fd_context *ctx, struct fd_ringbuffer *ring, - struct fd6_compute_state *cs) assert_dt; +void fd6_emit_cs_state(struct fd_context *ctx, fd_cs &cs, + struct fd6_compute_state *cp) assert_dt; template -void fd6_emit_ccu_cntl(struct fd_ringbuffer *ring, struct fd_screen *screen, bool gmem); +void fd6_emit_ccu_cntl(fd_cs &cs, struct fd_screen *screen, bool gmem); template -void fd6_emit_static_regs(struct fd_context *ctx, struct fd_ringbuffer *ring); +void fd6_emit_static_regs(fd_cs &cs, struct fd_context *ctx); template -void fd6_emit_restore(struct fd_batch *batch, struct fd_ringbuffer *ring); +void fd6_emit_restore(fd_cs &cs, struct fd_batch *batch); void fd6_emit_init_screen(struct pipe_screen *pscreen); static inline void -fd6_emit_ib(struct fd_ringbuffer *ring, struct fd_ringbuffer *target) +fd6_emit_ib(fd_cs &cs, struct fd_ringbuffer *target) { - emit_marker6(ring, 6); - __OUT_IB5(ring, target); - emit_marker6(ring, 6); + if (target->cur == target->start) + return; + + unsigned count = fd_ringbuffer_cmd_count(target); + + emit_marker6(cs, 6); + + for (unsigned i = 0; i < count; i++) { + uint32_t dwords; + + fd_pkt7(cs, CP_INDIRECT_BUFFER, 3) + .add(target, i, &dwords) + .add(A5XX_CP_INDIRECT_BUFFER_2(.ib_size = dwords)); + + assert(dwords > 0); + } + + emit_marker6(cs, 6); } -#define WRITE(reg, val) \ - do { \ - OUT_PKT4(ring, reg, 1); \ - OUT_RING(ring, val); \ - } while (0) - #endif /* FD6_EMIT_H */ diff --git a/src/gallium/drivers/freedreno/a6xx/fd6_gmem.cc b/src/gallium/drivers/freedreno/a6xx/fd6_gmem.cc index eeefd8b2b02..1da1b806f1c 100644 --- a/src/gallium/drivers/freedreno/a6xx/fd6_gmem.cc +++ b/src/gallium/drivers/freedreno/a6xx/fd6_gmem.cc @@ -33,31 +33,9 @@ #include "fd6_resource.h" #include "fd6_zsa.h" -/** - * Emits the flags registers, suitable for RB_COLOR_FLAG_BUFFER, - * RB_DEPTH_FLAG_BUFFER, TPL1_A2D_SRC_TEXTURE_FLAG_BASE, and RB_RESOLVE_SYSTEM_FLAG_BUFFER_BASE. - */ -void -fd6_emit_flag_reference(struct fd_ringbuffer *ring, struct fd_resource *rsc, - int level, int layer) -{ - if (fd_resource_ubwc_enabled(rsc, level)) { - OUT_RELOC(ring, rsc->bo, fd_resource_ubwc_offset(rsc, level, layer), 0, - 0); - OUT_RING(ring, A6XX_RB_COLOR_FLAG_BUFFER_PITCH_PITCH( - fdl_ubwc_pitch(&rsc->layout, level)) | - A6XX_RB_COLOR_FLAG_BUFFER_PITCH_ARRAY_PITCH( - rsc->layout.ubwc_layer_size >> 2)); - } else { - OUT_RING(ring, 0x00000000); /* RB_COLOR_FLAG_BUFFER[i].ADDR_LO */ - OUT_RING(ring, 0x00000000); /* RB_COLOR_FLAG_BUFFER[i].ADDR_HI */ - OUT_RING(ring, 0x00000000); - } -} - template static void -emit_mrt(struct fd_ringbuffer *ring, struct pipe_framebuffer_state *pfb, +emit_mrt(fd_crb &crb, struct pipe_framebuffer_state *pfb, const struct fd_gmem_stateobj *gmem) { unsigned srgb_cntl = 0; @@ -109,26 +87,34 @@ emit_mrt(struct fd_ringbuffer *ring, struct pipe_framebuffer_state *pfb, assert((offset + slice->size0) <= fd_bo_size(rsc->bo)); /* Batch with no draws? 
*/ - fd_ringbuffer_attach_bo(ring, rsc->bo); + crb.attach_bo(rsc->bo); - OUT_REG(ring, - RB_MRT_BUF_INFO(CHIP, i, - .color_format = format, - .color_tile_mode = tile_mode, - .color_swap = swap, - .losslesscompen = fd_resource_ubwc_enabled(rsc, psurf->level), - ), - A6XX_RB_MRT_PITCH(i, stride), - A6XX_RB_MRT_ARRAY_PITCH(i, array_stride), - A6XX_RB_MRT_BASE(i, .bo = rsc->bo, .bo_offset = offset), - A6XX_RB_MRT_BASE_GMEM(i, base)); + crb.add(RB_MRT_BUF_INFO(CHIP, i, + .color_format = format, + .color_tile_mode = tile_mode, + .color_swap = swap, + .losslesscompen = fd_resource_ubwc_enabled(rsc, psurf->level), + )); - OUT_REG(ring, A6XX_SP_PS_MRT_REG(i, .color_format = format, - .color_sint = sint, .color_uint = uint)); + crb.add(A6XX_RB_MRT_PITCH(i, stride)); + crb.add(A6XX_RB_MRT_ARRAY_PITCH(i, array_stride)); + crb.add(A6XX_RB_MRT_BASE(i, .bo = rsc->bo, .bo_offset = offset)); + crb.add(A6XX_RB_MRT_BASE_GMEM(i, base)); - OUT_PKT4(ring, REG_A6XX_RB_COLOR_FLAG_BUFFER(i), 3); - fd6_emit_flag_reference(ring, rsc, psurf->level, - psurf->first_layer); + crb.add(A6XX_SP_PS_MRT_REG(i, + .color_format = format, + .color_sint = sint, + .color_uint = uint + )); + + crb.add(A6XX_RB_COLOR_FLAG_BUFFER_ADDR(i, + .bo = rsc->bo, + .bo_offset = fd_resource_ubwc_offset(rsc, psurf->level, psurf->first_layer), + )); + crb.add(A6XX_RB_COLOR_FLAG_BUFFER_PITCH(i, + .pitch = fdl_ubwc_pitch(&rsc->layout, psurf->level), + .array_pitch = rsc->layout.ubwc_layer_size >> 2, + )); if (i == 0) mrt0_format = format; @@ -136,18 +122,16 @@ emit_mrt(struct fd_ringbuffer *ring, struct pipe_framebuffer_state *pfb, if (pfb->zsbuf.texture) max_layer_index = pfb->zsbuf.last_layer - pfb->zsbuf.first_layer; - OUT_REG(ring, A6XX_GRAS_LRZ_MRT_BUFFER_INFO_0(.color_format = mrt0_format)); + crb.add(A6XX_GRAS_LRZ_MRT_BUFFER_INFO_0(.color_format = mrt0_format)); + crb.add(A6XX_RB_SRGB_CNTL(.dword = srgb_cntl)); + crb.add(A6XX_SP_SRGB_CNTL(.dword = srgb_cntl)); - OUT_REG(ring, A6XX_RB_SRGB_CNTL(.dword = srgb_cntl)); - OUT_REG(ring, A6XX_SP_SRGB_CNTL(.dword = srgb_cntl)); - - OUT_REG(ring, A6XX_GRAS_CL_ARRAY_SIZE(max_layer_index)); + crb.add(A6XX_GRAS_CL_ARRAY_SIZE(max_layer_index)); } template static void -emit_zs(struct fd_context *ctx, struct fd_ringbuffer *ring, - struct pipe_surface *zsbuf, const struct fd_gmem_stateobj *gmem) +emit_zs(fd_crb &crb, struct pipe_surface *zsbuf, const struct fd_gmem_stateobj *gmem) { if (zsbuf->texture) { struct fd_resource *rsc = fd_resource(zsbuf->texture); @@ -162,45 +146,46 @@ emit_zs(struct fd_context *ctx, struct fd_ringbuffer *ring, * enabled, in which case it wouldn't have been part of the batch * resource tracking */ - fd_ringbuffer_attach_bo(ring, rsc->bo); + crb.attach_bo(rsc->bo); if (zsbuf->format == PIPE_FORMAT_S8_UINT) { /* S8 is implemented as Z32_S8 minus the Z32 plane: */ enum a6xx_depth_format fmt = DEPTH6_32; - OUT_REG(ring, - RB_DEPTH_BUFFER_INFO(CHIP, - .depth_format = fmt, - .tilemode = TILE6_3, - .losslesscompen = fd_resource_ubwc_enabled(rsc, zsbuf->level), - ), - A6XX_RB_DEPTH_BUFFER_PITCH(0), - A6XX_RB_DEPTH_BUFFER_ARRAY_PITCH(0), - A6XX_RB_DEPTH_BUFFER_BASE(.qword = 0), - A6XX_RB_DEPTH_GMEM_BASE(base)); - - OUT_REG(ring, A6XX_GRAS_SU_DEPTH_BUFFER_INFO(.depth_format = fmt)); + crb.add(RB_DEPTH_BUFFER_INFO(CHIP, + .depth_format = fmt, + .tilemode = TILE6_3, + .losslesscompen = fd_resource_ubwc_enabled(rsc, zsbuf->level), + )); + crb.add(A6XX_RB_DEPTH_BUFFER_PITCH()); + crb.add(A6XX_RB_DEPTH_BUFFER_ARRAY_PITCH()); + crb.add(A6XX_RB_DEPTH_BUFFER_BASE()); + 
crb.add(A6XX_RB_DEPTH_GMEM_BASE(base)); + crb.add(A6XX_GRAS_SU_DEPTH_BUFFER_INFO(.depth_format = fmt)); stencil = rsc; } else { enum a6xx_depth_format fmt = fd6_pipe2depth(zsbuf->format); - OUT_REG(ring, - RB_DEPTH_BUFFER_INFO(CHIP, - .depth_format = fmt, - .tilemode = TILE6_3, - .losslesscompen = fd_resource_ubwc_enabled(rsc, zsbuf->level), - ), - A6XX_RB_DEPTH_BUFFER_PITCH(stride), - A6XX_RB_DEPTH_BUFFER_ARRAY_PITCH(array_stride), - A6XX_RB_DEPTH_BUFFER_BASE(.bo = rsc->bo, .bo_offset = offset), - A6XX_RB_DEPTH_GMEM_BASE(base)); + crb.add(RB_DEPTH_BUFFER_INFO(CHIP, + .depth_format = fmt, + .tilemode = TILE6_3, + .losslesscompen = fd_resource_ubwc_enabled(rsc, zsbuf->level), + )); + crb.add(A6XX_RB_DEPTH_BUFFER_PITCH(stride)); + crb.add(A6XX_RB_DEPTH_BUFFER_ARRAY_PITCH(array_stride)); + crb.add(A6XX_RB_DEPTH_BUFFER_BASE(.bo = rsc->bo, .bo_offset = offset)); + crb.add(A6XX_RB_DEPTH_GMEM_BASE(base)); + crb.add(A6XX_GRAS_SU_DEPTH_BUFFER_INFO(.depth_format = fmt)); - OUT_REG(ring, A6XX_GRAS_SU_DEPTH_BUFFER_INFO(.depth_format = fmt)); - - OUT_PKT4(ring, REG_A6XX_RB_DEPTH_FLAG_BUFFER_BASE, 3); - fd6_emit_flag_reference(ring, rsc, zsbuf->level, - zsbuf->first_layer); + crb.add(A6XX_RB_DEPTH_FLAG_BUFFER_BASE( + .bo = rsc->bo, + .bo_offset = fd_resource_ubwc_offset(rsc, zsbuf->level, zsbuf->first_layer), + )); + crb.add(A6XX_RB_DEPTH_FLAG_BUFFER_PITCH( + .pitch = fdl_ubwc_pitch(&rsc->layout, zsbuf->level), + .array_pitch = rsc->layout.ubwc_layer_size >> 2, + )); } if (stencil) { @@ -210,54 +195,46 @@ emit_zs(struct fd_context *ctx, struct fd_ringbuffer *ring, uint32_t offset = fd_resource_offset(stencil, zsbuf->level, zsbuf->first_layer); - fd_ringbuffer_attach_bo(ring, stencil->bo); + crb.attach_bo(stencil->bo); - OUT_REG(ring, - RB_STENCIL_BUFFER_INFO( - CHIP, - .separate_stencil = true, - .tilemode = TILE6_3, - ), - A6XX_RB_STENCIL_BUFFER_PITCH(stride), - A6XX_RB_STENCIL_BUFFER_ARRAY_PITCH(array_stride), - A6XX_RB_STENCIL_BUFFER_BASE(.bo = stencil->bo, .bo_offset = offset), - A6XX_RB_STENCIL_GMEM_BASE(base) - ); + crb.add(RB_STENCIL_BUFFER_INFO(CHIP, + .separate_stencil = true, + .tilemode = TILE6_3, + )); + crb.add(A6XX_RB_STENCIL_BUFFER_PITCH(stride)); + crb.add(A6XX_RB_STENCIL_BUFFER_ARRAY_PITCH(array_stride)); + crb.add(A6XX_RB_STENCIL_BUFFER_BASE(.bo = stencil->bo, .bo_offset = offset)); + crb.add(A6XX_RB_STENCIL_GMEM_BASE(base)); } else { - OUT_REG(ring, RB_STENCIL_BUFFER_INFO(CHIP, 0)); + crb.add(RB_STENCIL_BUFFER_INFO(CHIP, 0)); } } else { - OUT_REG(ring, - RB_DEPTH_BUFFER_INFO( - CHIP, - .depth_format = DEPTH6_NONE, - ), - A6XX_RB_DEPTH_BUFFER_PITCH(), - A6XX_RB_DEPTH_BUFFER_ARRAY_PITCH(), - A6XX_RB_DEPTH_BUFFER_BASE(), - A6XX_RB_DEPTH_GMEM_BASE(), - ); - - OUT_REG(ring, - A6XX_GRAS_SU_DEPTH_BUFFER_INFO(.depth_format = DEPTH6_NONE)); - - OUT_REG(ring, RB_STENCIL_BUFFER_INFO(CHIP, 0)); + crb.add(RB_DEPTH_BUFFER_INFO(CHIP, + .depth_format = DEPTH6_NONE, + )); + crb.add(A6XX_RB_DEPTH_BUFFER_PITCH()); + crb.add(A6XX_RB_DEPTH_BUFFER_ARRAY_PITCH()); + crb.add(A6XX_RB_DEPTH_BUFFER_BASE()); + crb.add(A6XX_RB_DEPTH_GMEM_BASE()); + crb.add(A6XX_GRAS_SU_DEPTH_BUFFER_INFO(.depth_format = DEPTH6_NONE)); + crb.add(RB_STENCIL_BUFFER_INFO(CHIP, 0)); } } template static void -emit_lrz(struct fd_batch *batch, struct fd_batch_subpass *subpass) +emit_lrz(fd_cs &cs, struct fd_batch *batch, struct fd_batch_subpass *subpass) { struct pipe_framebuffer_state *pfb = &batch->framebuffer; - struct fd_ringbuffer *ring = batch->gmem; if (!subpass->lrz) { - OUT_REG(ring, A6XX_GRAS_LRZ_BUFFER_BASE(), - 
A6XX_GRAS_LRZ_BUFFER_PITCH(), - A6XX_GRAS_LRZ_FAST_CLEAR_BUFFER_BASE()); + fd_crb crb(cs, 6); + + crb.add(A6XX_GRAS_LRZ_BUFFER_BASE()); + crb.add(A6XX_GRAS_LRZ_BUFFER_PITCH()); + crb.add(A6XX_GRAS_LRZ_FAST_CLEAR_BUFFER_BASE()); if (CHIP >= A7XX) - OUT_REG(ring, A7XX_GRAS_LRZ_DEPTH_BUFFER_INFO()); + crb.add(A7XX_GRAS_LRZ_DEPTH_BUFFER_INFO()); return; } @@ -267,25 +244,25 @@ emit_lrz(struct fd_batch *batch, struct fd_batch_subpass *subpass) * we change the LRZ buffer after a sub-pass, but get a * cache-hit on stale data from the previous LRZ buffer. */ - fd6_event_write(batch->ctx, ring, FD_LRZ_FLUSH); + fd6_event_write(batch->ctx, cs, FD_LRZ_FLUSH); + + fd_crb crb(cs, 6); struct fd_resource *zsbuf = fd_resource(pfb->zsbuf.texture); - OUT_REG(ring, - A6XX_GRAS_LRZ_BUFFER_BASE(.bo = subpass->lrz), - A6XX_GRAS_LRZ_BUFFER_PITCH(.pitch = zsbuf->lrz_layout.lrz_pitch), - A6XX_GRAS_LRZ_FAST_CLEAR_BUFFER_BASE( - .bo = zsbuf->lrz_layout.lrz_fc_size ? subpass->lrz : NULL, - .bo_offset = zsbuf->lrz_layout.lrz_fc_offset - ), - ); - fd_ringbuffer_attach_bo(ring, subpass->lrz); + + crb.attach_bo(subpass->lrz); + + crb.add(A6XX_GRAS_LRZ_BUFFER_BASE(.bo = subpass->lrz)); + crb.add(A6XX_GRAS_LRZ_BUFFER_PITCH(.pitch = zsbuf->lrz_layout.lrz_pitch)); + crb.add(A6XX_GRAS_LRZ_FAST_CLEAR_BUFFER_BASE( + .bo = zsbuf->lrz_layout.lrz_fc_size ? subpass->lrz : NULL, + .bo_offset = zsbuf->lrz_layout.lrz_fc_offset + )); if (CHIP >= A7XX) { - OUT_REG(ring, - A7XX_GRAS_LRZ_DEPTH_BUFFER_INFO( - .depth_format = fd6_pipe2depth(pfb->zsbuf.format), - ) - ); + crb.add(A7XX_GRAS_LRZ_DEPTH_BUFFER_INFO( + .depth_format = fd6_pipe2depth(pfb->zsbuf.format), + )); } } @@ -319,21 +296,23 @@ emit_lrz_clears(struct fd_batch *batch) /* prep before first clear: */ if (count == 0) { - struct fd_ringbuffer *ring = fd_batch_get_prologue(batch); + fd_cs cs(fd_batch_get_prologue(batch)); - fd6_emit_ccu_cntl(ring, ctx->screen, false); + fd6_emit_ccu_cntl(cs, ctx->screen, false); - OUT_PKT7(ring, CP_SET_MARKER, 1); - OUT_RING(ring, A6XX_CP_SET_MARKER_0_MODE(RM6_BLIT2DSCALE)); + fd_pkt7(cs, CP_SET_MARKER, 1) + .add(A6XX_CP_SET_MARKER_0_MODE(RM6_BLIT2DSCALE)); - fd6_emit_flushes(ctx, ring, FD6_FLUSH_CACHE); + fd6_emit_flushes(ctx, cs, FD6_FLUSH_CACHE); if (ctx->screen->info->a6xx.magic.RB_DBG_ECO_CNTL_blit != ctx->screen->info->a6xx.magic.RB_DBG_ECO_CNTL) { /* This a non-context register, so we have to WFI before changing. 
*/ - OUT_WFI5(ring); - OUT_PKT4(ring, REG_A6XX_RB_DBG_ECO_CNTL, 1); - OUT_RING(ring, ctx->screen->info->a6xx.magic.RB_DBG_ECO_CNTL_blit); + fd_pkt7(cs, CP_WAIT_FOR_IDLE, 0); + fd_pkt4(cs, 1) + .add(A6XX_RB_DBG_ECO_CNTL( + .dword = ctx->screen->info->a6xx.magic.RB_DBG_ECO_CNTL_blit + )); } } @@ -344,13 +323,15 @@ emit_lrz_clears(struct fd_batch *batch) /* cleanup after last clear: */ if (count > 0) { - struct fd_ringbuffer *ring = fd_batch_get_prologue(batch); + fd_cs cs(fd_batch_get_prologue(batch)); if (ctx->screen->info->a6xx.magic.RB_DBG_ECO_CNTL_blit != ctx->screen->info->a6xx.magic.RB_DBG_ECO_CNTL) { - OUT_WFI5(ring); - OUT_PKT4(ring, REG_A6XX_RB_DBG_ECO_CNTL, 1); - OUT_RING(ring, ctx->screen->info->a6xx.magic.RB_DBG_ECO_CNTL); + fd_pkt7(cs, CP_WAIT_FOR_IDLE, 0); + fd_pkt4(cs, 1) + .add(A6XX_RB_DBG_ECO_CNTL( + .dword = ctx->screen->info->a6xx.magic.RB_DBG_ECO_CNTL + )); } /* Clearing writes via CCU color in the PS stage, and LRZ is read via @@ -360,7 +341,7 @@ emit_lrz_clears(struct fd_batch *batch) * has_ccu_flush_bug (and it is added by fd6_emit_flushes() already * in that case) */ - fd6_emit_flushes(batch->ctx, ring, + fd6_emit_flushes(batch->ctx, cs, FD6_FLUSH_CCU_COLOR | FD6_INVALIDATE_CACHE); } @@ -487,29 +468,24 @@ patch_fb_read_sysmem(struct fd_batch *batch) template static void -update_render_cntl(struct fd_batch *batch, struct pipe_framebuffer_state *pfb, +update_render_cntl(fd_cs &cs, struct fd_screen *screen, + struct pipe_framebuffer_state *pfb, bool binning) { - struct fd_ringbuffer *ring = batch->gmem; - if (CHIP >= A7XX) { - OUT_REG(ring, - RB_RENDER_CNTL( - CHIP, + with_crb (cs, 2) { + crb.add(RB_RENDER_CNTL(CHIP, .fs_disable = binning, .raster_mode = TYPE_TILED, .raster_direction = LR_TB - ) - ); - OUT_REG(ring, - A7XX_GRAS_SU_RENDER_CNTL( + )); + crb.add(A7XX_GRAS_SU_RENDER_CNTL( .fs_disable = binning, - ) - ); + )); + } return; } - struct fd_screen *screen = batch->ctx->screen; bool depth_ubwc_enable = false; uint32_t mrts_ubwc_enable = 0; int i; @@ -540,23 +516,23 @@ update_render_cntl(struct fd_batch *batch, struct pipe_framebuffer_state *pfb, ); if (screen->info->a6xx.has_cp_reg_write) { - OUT_PKT(ring, CP_REG_WRITE, - CP_REG_WRITE_0(TRACK_RENDER_CNTL), - CP_REG_WRITE_1(rb_render_cntl.reg), - CP_REG_WRITE_2(rb_render_cntl.value), - ); + fd_pkt7(cs, CP_REG_WRITE, 3) + .add(CP_REG_WRITE_0(TRACK_RENDER_CNTL)) + .add(CP_REG_WRITE_1(rb_render_cntl.reg)) + .add(CP_REG_WRITE_2(rb_render_cntl.value)); } else { - OUT_REG(ring, rb_render_cntl); + fd_pkt4(cs, 1) + .add(rb_render_cntl); } } +template static void -update_vsc_pipe(struct fd_batch *batch) +update_vsc_pipe(fd_cs &cs, struct fd_batch *batch) { struct fd_context *ctx = batch->ctx; struct fd6_context *fd6_ctx = fd6_context(ctx); const struct fd_gmem_stateobj *gmem = batch->gmem_state; - struct fd_ringbuffer *ring = batch->gmem; unsigned max_vsc_pipes = batch->ctx->screen->info->num_vsc_pipes; int i; @@ -599,34 +575,33 @@ update_vsc_pipe(struct fd_batch *batch) fd_bo_new(ctx->screen->dev, sz, FD_BO_NOMAP, "vsc_prim_strm"); } - fd_ringbuffer_attach_bo(ring, fd6_ctx->vsc_draw_strm); - fd_ringbuffer_attach_bo(ring, fd6_ctx->vsc_prim_strm); + cs.attach_bo(fd6_ctx->vsc_draw_strm); + cs.attach_bo(fd6_ctx->vsc_prim_strm); - OUT_REG(ring, A6XX_VSC_BIN_SIZE(.width = gmem->bin_w, .height = gmem->bin_h), - A6XX_VSC_SIZE_BASE(.bo = fd6_ctx->vsc_draw_strm, - .bo_offset = max_vsc_pipes * - fd6_ctx->vsc_draw_strm_pitch)); + fd_ncrb ncrb(cs, 12 + max_vsc_pipes); - OUT_REG(ring, A6XX_VSC_EXPANDED_BIN_CNTL(.nx = gmem->nbins_x, .ny = 
gmem->nbins_y)); + ncrb.add(A6XX_VSC_BIN_SIZE(.width = gmem->bin_w, .height = gmem->bin_h)); + ncrb.add(A6XX_VSC_SIZE_BASE( + .bo = fd6_ctx->vsc_draw_strm, + .bo_offset = max_vsc_pipes * fd6_ctx->vsc_draw_strm_pitch + )); + + ncrb.add(A6XX_VSC_EXPANDED_BIN_CNTL(.nx = gmem->nbins_x, .ny = gmem->nbins_y)); - OUT_PKT4(ring, REG_A6XX_VSC_PIPE_CONFIG_REG(0), max_vsc_pipes); for (i = 0; i < max_vsc_pipes; i++) { const struct fd_vsc_pipe *pipe = &gmem->vsc_pipe[i]; - OUT_RING(ring, A6XX_VSC_PIPE_CONFIG_REG_X(pipe->x) | - A6XX_VSC_PIPE_CONFIG_REG_Y(pipe->y) | - A6XX_VSC_PIPE_CONFIG_REG_W(pipe->w) | - A6XX_VSC_PIPE_CONFIG_REG_H(pipe->h)); + ncrb.add(A6XX_VSC_PIPE_CONFIG_REG(i, + .x = pipe->x, .y = pipe->y, + .w = pipe->w, .h = pipe->h)); } - OUT_REG( - ring, A6XX_VSC_PIPE_DATA_PRIM_BASE(.bo = fd6_ctx->vsc_prim_strm), - A6XX_VSC_PIPE_DATA_PRIM_STRIDE(.dword = fd6_ctx->vsc_prim_strm_pitch), - A6XX_VSC_PIPE_DATA_PRIM_LENGTH(.dword = fd6_ctx->vsc_prim_strm_pitch - 64)); + ncrb.add(A6XX_VSC_PIPE_DATA_PRIM_BASE(.bo = fd6_ctx->vsc_prim_strm)); + ncrb.add(A6XX_VSC_PIPE_DATA_PRIM_STRIDE(.dword = fd6_ctx->vsc_prim_strm_pitch)); + ncrb.add(A6XX_VSC_PIPE_DATA_PRIM_LENGTH(.dword = fd6_ctx->vsc_prim_strm_pitch - 64)); - OUT_REG( - ring, A6XX_VSC_PIPE_DATA_DRAW_BASE(.bo = fd6_ctx->vsc_draw_strm), - A6XX_VSC_PIPE_DATA_DRAW_STRIDE(.dword = fd6_ctx->vsc_draw_strm_pitch), - A6XX_VSC_PIPE_DATA_DRAW_LENGTH(.dword = fd6_ctx->vsc_draw_strm_pitch - 64)); + ncrb.add(A6XX_VSC_PIPE_DATA_DRAW_BASE(.bo = fd6_ctx->vsc_draw_strm)); + ncrb.add(A6XX_VSC_PIPE_DATA_DRAW_STRIDE(.dword = fd6_ctx->vsc_draw_strm_pitch)); + ncrb.add(A6XX_VSC_PIPE_DATA_DRAW_LENGTH(.dword = fd6_ctx->vsc_draw_strm_pitch - 64)); } /* @@ -641,41 +616,47 @@ update_vsc_pipe(struct fd_batch *batch) static void emit_vsc_overflow_test(struct fd_batch *batch) { - struct fd_ringbuffer *ring = batch->gmem; const struct fd_gmem_stateobj *gmem = batch->gmem_state; struct fd6_context *fd6_ctx = fd6_context(batch->ctx); + fd_cs cs(batch->gmem); assert((fd6_ctx->vsc_draw_strm_pitch & 0x3) == 0); assert((fd6_ctx->vsc_prim_strm_pitch & 0x3) == 0); /* Check for overflow, write vsc_scratch if detected: */ for (int i = 0; i < gmem->num_vsc_pipes; i++) { - OUT_PKT7(ring, CP_COND_WRITE5, 8); - OUT_RING(ring, CP_COND_WRITE5_0_FUNCTION(WRITE_GE) | - CP_COND_WRITE5_0_WRITE_MEMORY); - OUT_RING(ring, REG_A6XX_VSC_PIPE_DATA_DRAW_SIZE(i)); - OUT_RING(ring, 0); - OUT_RING(ring, CP_COND_WRITE5_3_REF(fd6_ctx->vsc_draw_strm_pitch - 64)); - OUT_RING(ring, CP_COND_WRITE5_4_MASK(~0)); - OUT_RELOC(ring, - control_ptr(fd6_ctx, vsc_overflow)); /* WRITE_ADDR_LO/HI */ - OUT_RING(ring, - CP_COND_WRITE5_7_WRITE_DATA(1 + fd6_ctx->vsc_draw_strm_pitch)); + fd_pkt7(cs, CP_COND_WRITE5, 8) + .add(CP_COND_WRITE5_0( + .function = WRITE_GE, + .write_memory = true, + )) + .add(CP_COND_WRITE5_POLL_ADDR( + .qword = REG_A6XX_VSC_PIPE_DATA_DRAW_SIZE(i), + )) + .add(CP_COND_WRITE5_3( + .ref = fd6_ctx->vsc_draw_strm_pitch - 64, + )) + .add(CP_COND_WRITE5_4(.mask = ~0)) + .add(CP_COND_WRITE5_WRITE_ADDR(control_ptr(fd6_ctx, vsc_overflow))) + .add(CP_COND_WRITE5_7(.write_data = 1 + fd6_ctx->vsc_draw_strm_pitch)); - OUT_PKT7(ring, CP_COND_WRITE5, 8); - OUT_RING(ring, CP_COND_WRITE5_0_FUNCTION(WRITE_GE) | - CP_COND_WRITE5_0_WRITE_MEMORY); - OUT_RING(ring, REG_A6XX_VSC_PIPE_DATA_PRIM_SIZE(i)); - OUT_RING(ring, 0); - OUT_RING(ring, CP_COND_WRITE5_3_REF(fd6_ctx->vsc_prim_strm_pitch - 64)); - OUT_RING(ring, CP_COND_WRITE5_4_MASK(~0)); - OUT_RELOC(ring, - control_ptr(fd6_ctx, vsc_overflow)); /* WRITE_ADDR_LO/HI */ - 
OUT_RING(ring,
-               CP_COND_WRITE5_7_WRITE_DATA(3 + fd6_ctx->vsc_prim_strm_pitch));
+      fd_pkt7(cs, CP_COND_WRITE5, 8)
+         .add(CP_COND_WRITE5_0(
+            .function = WRITE_GE,
+            .write_memory = true,
+         ))
+         .add(CP_COND_WRITE5_POLL_ADDR(
+            .qword = REG_A6XX_VSC_PIPE_DATA_PRIM_SIZE(i),
+         ))
+         .add(CP_COND_WRITE5_3(
+            .ref = fd6_ctx->vsc_prim_strm_pitch - 64,
+         ))
+         .add(CP_COND_WRITE5_4(.mask = ~0))
+         .add(CP_COND_WRITE5_WRITE_ADDR(control_ptr(fd6_ctx, vsc_overflow)))
+         .add(CP_COND_WRITE5_7(.write_data = 3 + fd6_ctx->vsc_prim_strm_pitch));
    }
 
-   OUT_PKT7(ring, CP_WAIT_MEM_WRITES, 0);
+   fd_pkt7(cs, CP_WAIT_MEM_WRITES, 0);
 }
 
 static void
@@ -741,83 +722,85 @@ check_vsc_overflow(struct fd_context *ctx)
 
 template <chip CHIP>
 static void
-emit_common_init(struct fd_batch *batch)
+emit_common_init(fd_cs &cs, struct fd_batch *batch)
 {
    struct fd_context *ctx = batch->ctx;
-   struct fd_ringbuffer *ring = batch->gmem;
    struct fd_autotune *at = &batch->ctx->autotune;
    struct fd_batch_result *result = batch->autotune_result;
 
    if (!result)
       return;
 
-   fd_ringbuffer_attach_bo(ring, at->results_mem);
+   cs.attach_bo(at->results_mem);
 
-   OUT_PKT4(ring, REG_A6XX_RB_SAMPLE_COUNTER_CNTL, 1);
-   OUT_RING(ring, A6XX_RB_SAMPLE_COUNTER_CNTL_COPY);
+   fd_pkt4(cs, 1)
+      .add(A6XX_RB_SAMPLE_COUNTER_CNTL(.copy = true));
 
    if (!ctx->screen->info->a7xx.has_event_write_sample_count) {
-      OUT_PKT4(ring, REG_A6XX_RB_SAMPLE_COUNTER_BASE, 2);
-      OUT_RELOC(ring, results_ptr(at, result[result->idx].samples_start));
+      fd_pkt4(cs, 2)
+         .add(A6XX_RB_SAMPLE_COUNTER_BASE(
+            results_ptr(at, result[result->idx].samples_start)
+         ));
 
-      fd6_event_write(ctx, ring, FD_ZPASS_DONE);
+      fd6_event_write(ctx, cs, FD_ZPASS_DONE);
 
       /* Copied from blob's cmdstream, not sure why it is done. */
       if (CHIP == A7XX) {
-         fd6_event_write(ctx, ring, FD_CCU_CLEAN_DEPTH);
+         fd6_event_write(ctx, cs, FD_CCU_CLEAN_DEPTH);
       }
    } else {
-      OUT_PKT(ring, CP_EVENT_WRITE7,
-         CP_EVENT_WRITE7_0(
+      fd_pkt7(cs, CP_EVENT_WRITE7, 3)
+         .add(CP_EVENT_WRITE7_0(
            .event = ZPASS_DONE,
           .write_sample_count = true,
-         ),
-         EV_DST_RAM_CP_EVENT_WRITE7_1(
+         ))
+         .add(EV_DST_RAM_CP_EVENT_WRITE7_1(
           results_ptr(at, result[result->idx].samples_start)
-         ),
-      );
+         ));
    }
 }
 
 template <chip CHIP>
 static void
-emit_common_fini(struct fd_batch *batch)
+emit_common_fini(fd_cs &cs, struct fd_batch *batch)
 {
   struct fd_context *ctx = batch->ctx;
-   struct fd_ringbuffer *ring = batch->gmem;
   struct fd_autotune *at = &batch->ctx->autotune;
   struct fd_batch_result *result = batch->autotune_result;
 
-   fd6_emit_flushes(batch->ctx, ring, batch->barrier);
+   fd6_emit_flushes(batch->ctx, cs, batch->barrier);
 
   if (!result)
      return;
 
-   fd_ringbuffer_attach_bo(ring, at->results_mem);
-
-   OUT_PKT4(ring, REG_A6XX_RB_SAMPLE_COUNTER_CNTL, 1);
-   OUT_RING(ring, A6XX_RB_SAMPLE_COUNTER_CNTL_COPY);
+   cs.attach_bo(at->results_mem);
 
   if (!ctx->screen->info->a7xx.has_event_write_sample_count) {
-      OUT_PKT4(ring, REG_A6XX_RB_SAMPLE_COUNTER_BASE, 2);
-      OUT_RELOC(ring, results_ptr(at, result[result->idx].samples_end));
+      with_crb (cs, 3) {
+         crb.add(A6XX_RB_SAMPLE_COUNTER_CNTL(.copy = true));
+         crb.add(A6XX_RB_SAMPLE_COUNTER_BASE(
+            results_ptr(at, result[result->idx].samples_end)
+         ));
+      }
 
-      fd6_event_write(batch->ctx, ring, FD_ZPASS_DONE);
+      fd6_event_write(batch->ctx, cs, FD_ZPASS_DONE);
   } else {
-      OUT_PKT(ring, CP_EVENT_WRITE7,
-         CP_EVENT_WRITE7_0(
+      fd_pkt4(cs, 1)
+         .add(A6XX_RB_SAMPLE_COUNTER_CNTL(.copy = true));
+
+      fd_pkt7(cs, CP_EVENT_WRITE7, 3)
+         .add(CP_EVENT_WRITE7_0(
           .event = ZPASS_DONE,
           .write_sample_count = true,
           .sample_count_end_offset = true,
          .write_accum_sample_count_diff =
true, - ), - EV_DST_RAM_CP_EVENT_WRITE7_1( + )) + .add(EV_DST_RAM_CP_EVENT_WRITE7_1( results_ptr(at, result[result->idx].samples_start) - ), - ); + )); } - fd6_fence_write(ring, result->fence, results_ptr(at, fence)); + fd6_fence_write(cs, result->fence, results_ptr(at, fence)); } /* @@ -827,63 +810,66 @@ emit_common_fini(struct fd_batch *batch) * If we aren't using binning pass, this just emits a normal IB. */ static void -emit_conditional_ib(struct fd_batch *batch, const struct fd_tile *tile, +emit_conditional_ib(fd_cs &cs, struct fd_batch *batch, const struct fd_tile *tile, struct fd_ringbuffer *target) { - struct fd_ringbuffer *ring = batch->gmem; - /* If we have fast clear, that won't count in the VSC state, so it * forces an unconditional IB (because we know there is something * to do for this tile) */ if (batch->cleared || !use_hw_binning(batch)) { - fd6_emit_ib(batch->gmem, target); + fd6_emit_ib(cs, target); return; } if (target->cur == target->start) return; - emit_marker6(ring, 6); + emit_marker6(cs, 6); unsigned count = fd_ringbuffer_cmd_count(target); - BEGIN_RING(ring, 5 + 4 * count); /* ensure conditional doesn't get split */ + BEGIN_RING(cs.ring(), 5 + 4 * count); /* ensure conditional doesn't get split */ - OUT_PKT7(ring, CP_REG_TEST, 1); - OUT_RING(ring, A6XX_CP_REG_TEST_0_REG(REG_A6XX_VSC_CHANNEL_VISIBILITY(tile->p)) | - A6XX_CP_REG_TEST_0_BIT(tile->n) | - A6XX_CP_REG_TEST_0_SKIP_WAIT_FOR_ME); + fd_pkt7(cs, CP_REG_TEST, 1) + .add(A6XX_CP_REG_TEST_0( + .reg = REG_A6XX_VSC_CHANNEL_VISIBILITY(tile->p), + .bit = tile->n, + .skip_wait_for_me = true, + )); - OUT_PKT7(ring, CP_COND_REG_EXEC, 2); - OUT_RING(ring, CP_COND_REG_EXEC_0_MODE(PRED_TEST)); - OUT_RING(ring, PRED_TEST_CP_COND_REG_EXEC_1_DWORDS(4 * count)); + fd_pkt7(cs, CP_COND_REG_EXEC, 2) + .add(CP_COND_REG_EXEC_0_MODE(PRED_TEST)) + .add(PRED_TEST_CP_COND_REG_EXEC_1_DWORDS(4 * count)); for (unsigned i = 0; i < count; i++) { uint32_t dwords; - OUT_PKT7(ring, CP_INDIRECT_BUFFER, 3); - dwords = fd_ringbuffer_emit_reloc_ring_full(ring, target, i) / 4; + + fd_pkt7(cs, CP_INDIRECT_BUFFER, 3) + .add(target, i, &dwords) + .add(A5XX_CP_INDIRECT_BUFFER_2(.ib_size = dwords)); + assert(dwords > 0); - OUT_RING(ring, dwords); } - emit_marker6(ring, 6); + emit_marker6(cs, 6); } static void -set_scissor(struct fd_ringbuffer *ring, uint32_t x1, uint32_t y1, uint32_t x2, - uint32_t y2) +set_scissor(fd_cs &cs, uint32_t x1, uint32_t y1, uint32_t x2, uint32_t y2) { - OUT_REG(ring, A6XX_GRAS_SC_WINDOW_SCISSOR_TL(.x = x1, .y = y1), - A6XX_GRAS_SC_WINDOW_SCISSOR_BR(.x = x2, .y = y2)); + fd_pkt4(cs, 2) + .add(A6XX_GRAS_SC_WINDOW_SCISSOR_TL(.x = x1, .y = y1)) + .add(A6XX_GRAS_SC_WINDOW_SCISSOR_BR(.x = x2, .y = y2)); - OUT_REG(ring, A6XX_GRAS_A2D_SCISSOR_TL(.x = x1, .y = y1), - A6XX_GRAS_A2D_SCISSOR_BR(.x = x2, .y = y2)); + fd_pkt4(cs, 2) + .add(A6XX_GRAS_A2D_SCISSOR_TL(.x = x1, .y = y1)) + .add(A6XX_GRAS_A2D_SCISSOR_BR(.x = x2, .y = y2)); } template static void -set_tessfactor_bo(struct fd_ringbuffer *ring, struct fd_batch *batch) +set_tessfactor_bo(fd_cs &cs, struct fd_batch *batch) { /* This happens after all drawing has been emitted to the draw CS, so we know * whether we need the tess BO pointers. 
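
The hunks above and below follow the same conversion recipe used throughout this patch: an OUT_PKT4()/OUT_PKT7() plus a run of OUT_RING()/OUT_RELOC() calls becomes a single packet builder whose payload dwords are chained through .add(), and runs of context-register writes are packed through fd_crb (CP_CONTEXT_REG_BUNCH), sized up front by register count. A minimal sketch of the idiom, assuming the fd_cs/fd_pkt7/fd_crb builders behave as introduced earlier in this series (the emit_example helper itself is illustrative, not part of the patch):

   /* Illustrative sketch only -- mirrors the builder idiom this patch
    * converts to; assumes the fd_cs/fd_pkt7/fd_crb helpers from fd6_pack.h.
    */
   static void
   emit_example(fd_cs &cs)
   {
      /* pkt7: opcode and payload dword count up front, payload (if any)
       * chained through .add():
       */
      fd_pkt7(cs, CP_WAIT_FOR_IDLE, 0);

      /* crb: sized by number of registers; each .add() packs one context
       * register.  Non-context registers have to go through fd_ncrb or a
       * plain fd_pkt4 instead, since writing them via CP_CONTEXT_REG_BUNCH
       * would trigger CP_PROTECT errors.
       */
      fd_crb crb(cs, 2);
      crb.add(A6XX_RB_WINDOW_OFFSET(.x = 0, .y = 0));
      crb.add(A6XX_TPL1_WINDOW_OFFSET(.x = 0, .y = 0));
   }
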
@@ -894,10 +880,13 @@ set_tessfactor_bo(struct fd_ringbuffer *ring, struct fd_batch *batch) struct fd_screen *screen = batch->ctx->screen; assert(screen->tess_bo); - fd_ringbuffer_attach_bo(ring, screen->tess_bo); - OUT_REG(ring, PC_TESS_BASE(CHIP, screen->tess_bo)); + cs.attach_bo(screen->tess_bo); + + fd_pkt4(cs, 2) + .add(PC_TESS_BASE(CHIP, screen->tess_bo)); + /* Updating PC_TESS_BASE could race with the next draw which uses it. */ - OUT_WFI5(ring); + fd_pkt7(cs, CP_WAIT_FOR_IDLE, 0); } struct bin_size_params { @@ -907,16 +896,16 @@ struct bin_size_params { enum a6xx_lrz_feedback_mask lrz_feedback_zmode_mask; }; +/* nregs: 3 */ template static void -set_bin_size(struct fd_ringbuffer *ring, const struct fd_gmem_stateobj *gmem, - struct bin_size_params p) +set_bin_size(fd_crb &crb, const struct fd_gmem_stateobj *gmem, struct bin_size_params p) { unsigned w = gmem ? gmem->bin_w : 0; unsigned h = gmem ? gmem->bin_h : 0; if (CHIP == A6XX) { - OUT_REG(ring, A6XX_GRAS_SC_BIN_CNTL( + crb.add(A6XX_GRAS_SC_BIN_CNTL( .binw = w, .binh = h, .render_mode = p.render_mode, .force_lrz_write_dis = p.force_lrz_write_dis, @@ -924,14 +913,14 @@ set_bin_size(struct fd_ringbuffer *ring, const struct fd_gmem_stateobj *gmem, .lrz_feedback_zmode_mask = p.lrz_feedback_zmode_mask, )); } else { - OUT_REG(ring, A6XX_GRAS_SC_BIN_CNTL( + crb.add(A6XX_GRAS_SC_BIN_CNTL( .binw = w, .binh = h, .render_mode = p.render_mode, .force_lrz_write_dis = p.force_lrz_write_dis, .lrz_feedback_zmode_mask = p.lrz_feedback_zmode_mask, )); } - OUT_REG(ring, RB_CNTL( + crb.add(RB_CNTL( CHIP, .binw = w, .binh = h, .render_mode = p.render_mode, @@ -940,70 +929,66 @@ set_bin_size(struct fd_ringbuffer *ring, const struct fd_gmem_stateobj *gmem, .lrz_feedback_zmode_mask = p.lrz_feedback_zmode_mask, )); /* no flag for RB_RESOLVE_CNTL_3... 
*/ - OUT_REG(ring, A6XX_RB_RESOLVE_CNTL_3(.binw = w, .binh = h)); + crb.add(A6XX_RB_RESOLVE_CNTL_3(.binw = w, .binh = h)); } template static void -emit_binning_pass(struct fd_batch *batch) assert_dt +emit_binning_pass(fd_cs &cs, struct fd_batch *batch) assert_dt { - struct fd_ringbuffer *ring = batch->gmem; const struct fd_gmem_stateobj *gmem = batch->gmem_state; struct fd_screen *screen = batch->ctx->screen; assert(!batch->tessellation); - set_scissor(ring, 0, 0, gmem->width - 1, gmem->height - 1); + set_scissor(cs, 0, 0, gmem->width - 1, gmem->height - 1); - emit_marker6(ring, 7); - OUT_PKT7(ring, CP_SET_MARKER, 1); - OUT_RING(ring, A6XX_CP_SET_MARKER_0_MODE(RM6_BIN_VISIBILITY)); - emit_marker6(ring, 7); + emit_marker6(cs, 7); + fd_pkt7(cs, CP_SET_MARKER, 1) + .add(A6XX_CP_SET_MARKER_0_MODE(RM6_BIN_VISIBILITY)); + emit_marker6(cs, 7); - OUT_PKT7(ring, CP_SET_VISIBILITY_OVERRIDE, 1); - OUT_RING(ring, 0x1); + fd_pkt7(cs, CP_SET_VISIBILITY_OVERRIDE, 1) + .add(0x1); - OUT_PKT7(ring, CP_SET_MODE, 1); - OUT_RING(ring, 0x1); + fd_pkt7(cs, CP_SET_MODE, 1) + .add(0x1); - OUT_WFI5(ring); + fd_pkt7(cs, CP_WAIT_FOR_IDLE, 0); - OUT_REG(ring, A6XX_VFD_RENDER_MODE(.render_mode = BINNING_PASS)); + fd_pkt4(cs, 1) + .add(A6XX_VFD_RENDER_MODE(.render_mode = BINNING_PASS)); - update_vsc_pipe(batch); + update_vsc_pipe(cs, batch); if (CHIP == A6XX) { - OUT_REG(ring, A6XX_PC_POWER_CNTL(screen->info->a6xx.magic.PC_POWER_CNTL)); - OUT_REG(ring, A6XX_VFD_POWER_CNTL(screen->info->a6xx.magic.PC_POWER_CNTL)); + fd_pkt4(cs, 1) + .add(A6XX_PC_POWER_CNTL(screen->info->a6xx.magic.PC_POWER_CNTL)); + fd_pkt4(cs, 1) + .add(A6XX_VFD_POWER_CNTL(screen->info->a6xx.magic.PC_POWER_CNTL)); } - OUT_PKT7(ring, CP_EVENT_WRITE, 1); - OUT_RING(ring, UNK_2C); + fd_pkt7(cs, CP_EVENT_WRITE, 1) + .add(UNK_2C); - OUT_PKT4(ring, REG_A6XX_RB_WINDOW_OFFSET, 1); - OUT_RING(ring, A6XX_RB_WINDOW_OFFSET_X(0) | A6XX_RB_WINDOW_OFFSET_Y(0)); - - OUT_PKT4(ring, REG_A6XX_TPL1_WINDOW_OFFSET, 1); - OUT_RING(ring, - A6XX_TPL1_WINDOW_OFFSET_X(0) | A6XX_TPL1_WINDOW_OFFSET_Y(0)); + fd_crb(cs, 2) + .add(A6XX_RB_WINDOW_OFFSET(.x = 0, .y = 0)) + .add(A6XX_TPL1_WINDOW_OFFSET(.x = 0, .y = 0)); /* emit IB to binning drawcmds: */ - trace_start_binning_ib(&batch->trace, ring); + trace_start_binning_ib(&batch->trace, cs.ring()); foreach_subpass (subpass, batch) { - emit_lrz(batch, subpass); - fd6_emit_ib(ring, subpass->draw); + emit_lrz(cs, batch, subpass); + fd6_emit_ib(cs, subpass->draw); } - trace_end_binning_ib(&batch->trace, ring); + trace_end_binning_ib(&batch->trace, cs.ring()); - OUT_PKT7(ring, CP_SET_DRAW_STATE, 3); - OUT_RING(ring, CP_SET_DRAW_STATE__0_COUNT(0) | - CP_SET_DRAW_STATE__0_DISABLE_ALL_GROUPS | - CP_SET_DRAW_STATE__0_GROUP_ID(0)); - OUT_RING(ring, CP_SET_DRAW_STATE__1_ADDR_LO(0)); - OUT_RING(ring, CP_SET_DRAW_STATE__2_ADDR_HI(0)); + fd_pkt7(cs, CP_SET_DRAW_STATE, 3) + .add(CP_SET_DRAW_STATE__0(0, .disable_all_groups = true)) + .add(CP_SET_DRAW_STATE__ADDR(0)); - OUT_PKT7(ring, CP_EVENT_WRITE, 1); - OUT_RING(ring, UNK_2D); + fd_pkt7(cs, CP_EVENT_WRITE, 1) + .add(UNK_2D); /* This flush is probably required because the VSC, which produces the * visibility stream, is a client of UCHE, whereas the CP needs to read @@ -1013,49 +998,49 @@ emit_binning_pass(struct fd_batch *batch) assert_dt * emit_vsc_overflow_test) or the VSC_DATA buffer directly (implicitly * as part of draws). 
*/ - fd6_emit_flushes(batch->ctx, ring, + fd6_emit_flushes(batch->ctx, cs, FD6_FLUSH_CACHE | FD6_WAIT_FOR_IDLE | FD6_WAIT_FOR_ME); - trace_start_vsc_overflow_test(&batch->trace, batch->gmem); + trace_start_vsc_overflow_test(&batch->trace, cs.ring()); emit_vsc_overflow_test(batch); - trace_end_vsc_overflow_test(&batch->trace, batch->gmem); + trace_end_vsc_overflow_test(&batch->trace, cs.ring()); - OUT_PKT7(ring, CP_SET_VISIBILITY_OVERRIDE, 1); - OUT_RING(ring, 0x0); + fd_pkt7(cs, CP_SET_VISIBILITY_OVERRIDE, 1) + .add(0x0); - OUT_PKT7(ring, CP_SET_MODE, 1); - OUT_RING(ring, 0x0); + fd_pkt7(cs, CP_SET_MODE, 1) + .add(0x0); - fd6_emit_ccu_cntl(ring, screen, true); + fd6_emit_ccu_cntl(cs, screen, true); } +/* nregs: 7 */ static void -emit_msaa(struct fd_ringbuffer *ring, unsigned nr) +emit_msaa(fd_crb &crb, unsigned nr) { enum a3xx_msaa_samples samples = fd_msaa_samples(nr); - OUT_PKT4(ring, REG_A6XX_TPL1_RAS_MSAA_CNTL, 2); - OUT_RING(ring, A6XX_TPL1_RAS_MSAA_CNTL_SAMPLES(samples)); - OUT_RING(ring, A6XX_TPL1_DEST_MSAA_CNTL_SAMPLES(samples) | - COND(samples == MSAA_ONE, - A6XX_TPL1_DEST_MSAA_CNTL_MSAA_DISABLE)); + crb.add(A6XX_TPL1_RAS_MSAA_CNTL(.samples = samples)); + crb.add(A6XX_TPL1_DEST_MSAA_CNTL( + .samples = samples, + .msaa_disable = (samples == MSAA_ONE), + )); - OUT_PKT4(ring, REG_A6XX_GRAS_SC_RAS_MSAA_CNTL, 2); - OUT_RING(ring, A6XX_GRAS_SC_RAS_MSAA_CNTL_SAMPLES(samples)); - OUT_RING(ring, A6XX_GRAS_SC_DEST_MSAA_CNTL_SAMPLES(samples) | - COND(samples == MSAA_ONE, - A6XX_GRAS_SC_DEST_MSAA_CNTL_MSAA_DISABLE)); + crb.add(A6XX_GRAS_SC_RAS_MSAA_CNTL(.samples = samples)); + crb.add(A6XX_GRAS_SC_DEST_MSAA_CNTL( + .samples = samples, + .msaa_disable = (samples == MSAA_ONE), + )); - OUT_PKT4(ring, REG_A6XX_RB_RAS_MSAA_CNTL, 2); - OUT_RING(ring, A6XX_RB_RAS_MSAA_CNTL_SAMPLES(samples)); - OUT_RING(ring, - A6XX_RB_DEST_MSAA_CNTL_SAMPLES(samples) | - COND(samples == MSAA_ONE, A6XX_RB_DEST_MSAA_CNTL_MSAA_DISABLE)); + crb.add(A6XX_RB_RAS_MSAA_CNTL(.samples = samples)); + crb.add(A6XX_RB_DEST_MSAA_CNTL( + .samples = samples, + .msaa_disable = (samples == MSAA_ONE), + )); - OUT_PKT4(ring, REG_A6XX_RB_RESOLVE_GMEM_BUFFER_INFO, 1); - OUT_RING(ring, A6XX_RB_RESOLVE_GMEM_BUFFER_INFO_SAMPLES(samples)); + crb.add(A6XX_RB_RESOLVE_GMEM_BUFFER_INFO(.samples = samples)); } template @@ -1063,15 +1048,18 @@ static void prepare_tile_setup(struct fd_batch *batch); template static void prepare_tile_fini(struct fd_batch *batch); +template static void -fd7_emit_static_binning_regs(struct fd_ringbuffer *ring) +fd7_emit_static_binning_regs(fd_cs &cs) { - OUT_REG(ring, A7XX_RB_UNKNOWN_8812(0x0)); - OUT_REG(ring, A7XX_RB_CCU_DBG_ECO_CNTL(0x0)); - OUT_REG(ring, A7XX_GRAS_UNKNOWN_8007(0x0)); - OUT_REG(ring, A6XX_GRAS_UNKNOWN_8110(0x2)); - OUT_REG(ring, A7XX_RB_UNKNOWN_8E09(0x4)); - OUT_REG(ring, A7XX_RB_CLEAR_TARGET(.clear_mode = CLEAR_MODE_GMEM)); + fd_ncrb ncrb(cs, 6); + + ncrb.add(A7XX_RB_UNKNOWN_8812(0x0)); + ncrb.add(A7XX_RB_CCU_DBG_ECO_CNTL(0x0)); + ncrb.add(A7XX_GRAS_UNKNOWN_8007(0x0)); + ncrb.add(A6XX_GRAS_UNKNOWN_8110(0x2)); + ncrb.add(A7XX_RB_UNKNOWN_8E09(0x4)); + ncrb.add(A7XX_RB_CLEAR_TARGET(.clear_mode = CLEAR_MODE_GMEM)); } template @@ -1079,29 +1067,35 @@ struct fd_ringbuffer * fd6_build_preemption_preamble(struct fd_context *ctx) { struct fd_screen *screen = ctx->screen; - struct fd_ringbuffer *ring; - ring = fd_ringbuffer_new_object(ctx->pipe, 0x1000); - fd6_emit_static_regs(ctx, ring); - fd6_emit_ccu_cntl(ring, screen, false); + fd_cs cs(ctx->pipe, 0x1000); + + fd6_emit_static_regs(cs, ctx); + 
fd6_emit_ccu_cntl(cs, screen, false); if (CHIP == A6XX) { - OUT_REG(ring, A6XX_PC_POWER_CNTL(screen->info->a6xx.magic.PC_POWER_CNTL)); - OUT_REG(ring, A6XX_VFD_POWER_CNTL(screen->info->a6xx.magic.PC_POWER_CNTL)); + fd_pkt4(cs, 1) + .add(A6XX_PC_POWER_CNTL(screen->info->a6xx.magic.PC_POWER_CNTL)); + fd_pkt4(cs, 1) + .add(A6XX_VFD_POWER_CNTL(screen->info->a6xx.magic.PC_POWER_CNTL)); } else if (CHIP >= A7XX) { - fd7_emit_static_binning_regs(ring); + fd7_emit_static_binning_regs(cs); } /* TODO use CP_MEM_TO_SCRATCH_MEM on a7xx. The VSC scratch mem should be * automatically saved, unlike GPU registers, so we wouldn't have to * manually restore this state. */ - OUT_PKT7(ring, CP_MEM_TO_REG, 3); - OUT_RING(ring, CP_MEM_TO_REG_0_REG(REG_A6XX_VSC_CHANNEL_VISIBILITY(0)) | - CP_MEM_TO_REG_0_CNT(32)); - OUT_RELOC(ring, control_ptr(fd6_context(ctx), vsc_state)); + fd_pkt7(cs, CP_MEM_TO_REG, 3) + .add(CP_MEM_TO_REG_0( + .reg = REG_A6XX_VSC_CHANNEL_VISIBILITY(0), + .cnt = 32, + )) + .add(CP_MEM_TO_REG_SRC( + control_ptr(fd6_context(ctx), vsc_state), + )); - return ring; + return cs.ring(); } FD_GENX(fd6_build_preemption_preamble); @@ -1110,127 +1104,138 @@ template static void fd6_emit_tile_init(struct fd_batch *batch) assert_dt { - struct fd_ringbuffer *ring = batch->gmem; struct pipe_framebuffer_state *pfb = &batch->framebuffer; const struct fd_gmem_stateobj *gmem = batch->gmem_state; struct fd_screen *screen = batch->ctx->screen; + fd_cs cs(batch->gmem); emit_lrz_clears(batch); - fd6_emit_restore(batch, ring); + fd6_emit_restore(cs, batch); - fd6_event_write(batch->ctx, ring, FD_LRZ_FLUSH); + fd6_event_write(batch->ctx, cs, FD_LRZ_FLUSH); if (batch->prologue) { - trace_start_prologue(&batch->trace, ring); - fd6_emit_ib(ring, batch->prologue); - trace_end_prologue(&batch->trace, ring); + trace_start_prologue(&batch->trace, cs.ring()); + fd6_emit_ib(cs, batch->prologue); + trace_end_prologue(&batch->trace, cs.ring()); } - fd6_cache_inv(batch->ctx, ring); + fd6_cache_inv(batch->ctx, cs); prepare_tile_setup(batch); prepare_tile_fini(batch); - OUT_PKT7(ring, CP_SKIP_IB2_ENABLE_GLOBAL, 1); - OUT_RING(ring, 0x0); + fd_pkt7(cs, CP_SKIP_IB2_ENABLE_GLOBAL, 1) + .add(0x0); /* blob controls "local" in IB2, but I think that is not required */ - OUT_PKT7(ring, CP_SKIP_IB2_ENABLE_LOCAL, 1); - OUT_RING(ring, 0x1); + fd_pkt7(cs, CP_SKIP_IB2_ENABLE_LOCAL, 1) + .add(0x1); - fd6_emit_ccu_cntl(ring, screen, true); + fd6_emit_ccu_cntl(cs, screen, true); + + with_crb (cs, 150) { + emit_zs(crb, &pfb->zsbuf, batch->gmem_state); + emit_mrt(crb, pfb, batch->gmem_state); + emit_msaa(crb, pfb->samples); + } - emit_zs(batch->ctx, ring, &pfb->zsbuf, batch->gmem_state); - emit_mrt(ring, pfb, batch->gmem_state); - emit_msaa(ring, pfb->samples); patch_fb_read_gmem(batch); - if (CHIP >= A7XX) - fd7_emit_static_binning_regs(ring); + if (CHIP >= A7XX) { + fd7_emit_static_binning_regs(cs); + } if (use_hw_binning(batch)) { /* enable stream-out during binning pass: */ - OUT_REG(ring, A6XX_VPC_SO_OVERRIDE(false)); + with_crb (cs, 4) { + crb.add(A6XX_VPC_SO_OVERRIDE(false)); - set_bin_size(ring, gmem, { - .render_mode = BINNING_PASS, - .buffers_location = BUFFERS_IN_GMEM, - .lrz_feedback_zmode_mask = LRZ_FEEDBACK_NONE, - }); - update_render_cntl(batch, pfb, true); - emit_binning_pass(batch); - - /* and disable stream-out for draw pass: */ - OUT_REG(ring, A6XX_VPC_SO_OVERRIDE(true)); - - /* - * NOTE: even if we detect VSC overflow and disable use of - * visibility stream in draw pass, it is still safe to execute - * the reset of these cmds: - 
*/
-
-      set_bin_size(ring, gmem, {
-         .render_mode = RENDERING_PASS,
-         .force_lrz_write_dis = !screen->info->a6xx.has_lrz_feedback,
-         .buffers_location = BUFFERS_IN_GMEM,
-         .lrz_feedback_zmode_mask = screen->info->a6xx.has_lrz_feedback
-            ? LRZ_FEEDBACK_EARLY_Z_LATE_Z
-            : LRZ_FEEDBACK_NONE,
-      });
-
-      OUT_REG(ring, A6XX_VFD_RENDER_MODE(RENDERING_PASS));
-
-      if (CHIP == A6XX) {
-         OUT_REG(ring, A6XX_PC_POWER_CNTL(screen->info->a6xx.magic.PC_POWER_CNTL));
-         OUT_REG(ring, A6XX_VFD_POWER_CNTL(screen->info->a6xx.magic.PC_POWER_CNTL));
+         set_bin_size(crb, gmem, {
+            .render_mode = BINNING_PASS,
+            .buffers_location = BUFFERS_IN_GMEM,
+            .lrz_feedback_zmode_mask = LRZ_FEEDBACK_NONE,
+         });
      }
 
-      OUT_PKT7(ring, CP_SKIP_IB2_ENABLE_GLOBAL, 1);
-      OUT_RING(ring, 0x1);
+      update_render_cntl(cs, screen, pfb, true);
+      emit_binning_pass(cs, batch);
+
+      with_crb (cs, 5) {
+         /* and disable stream-out for draw pass: */
+         crb.add(A6XX_VPC_SO_OVERRIDE(true));
+
+         /*
+          * NOTE: even if we detect VSC overflow and disable use of
+          * visibility stream in draw pass, it is still safe to execute
+          * the rest of these cmds:
+          */
+
+         set_bin_size(crb, gmem, {
+            .render_mode = RENDERING_PASS,
+            .force_lrz_write_dis = !screen->info->a6xx.has_lrz_feedback,
+            .buffers_location = BUFFERS_IN_GMEM,
+            .lrz_feedback_zmode_mask = screen->info->a6xx.has_lrz_feedback
+               ? LRZ_FEEDBACK_EARLY_Z_LATE_Z
+               : LRZ_FEEDBACK_NONE,
+         });
+
+         crb.add(A6XX_VFD_RENDER_MODE(RENDERING_PASS));
+      }
+
+      if (CHIP == A6XX) {
+         fd_pkt4(cs, 1)
+            .add(A6XX_PC_POWER_CNTL(screen->info->a6xx.magic.PC_POWER_CNTL));
+         fd_pkt4(cs, 1)
+            .add(A6XX_VFD_POWER_CNTL(screen->info->a6xx.magic.PC_POWER_CNTL));
+      }
+
+      fd_pkt7(cs, CP_SKIP_IB2_ENABLE_GLOBAL, 1)
+         .add(0x1);
 
      /* Upload state regs to memory to be restored on skipsaverestore
       * preemption.
       */
-      OUT_PKT7(ring, CP_REG_TO_MEM, 3);
-      OUT_RING(ring, CP_REG_TO_MEM_0_REG(REG_A6XX_VSC_CHANNEL_VISIBILITY(0)) |
-                        CP_REG_TO_MEM_0_CNT(32));
-      OUT_RELOC(ring, control_ptr(fd6_context(batch->ctx), vsc_state));
+      fd_pkt7(cs, CP_REG_TO_MEM, 3)
+         .add(CP_REG_TO_MEM_0(
+            .reg = REG_A6XX_VSC_CHANNEL_VISIBILITY(0),
+            .cnt = 32,
+         ))
+         .add(CP_REG_TO_MEM_DEST(
+            control_ptr(fd6_context(batch->ctx), vsc_state)
+         ));
   } else {
-      /* no binning pass, so enable stream-out for draw pass:: */
-      OUT_REG(ring, A6XX_VPC_SO_OVERRIDE(false));
+      with_crb (cs, 4) {
+         /* no binning pass, so enable stream-out for draw pass: */
+         crb.add(A6XX_VPC_SO_OVERRIDE(false));
 
-      set_bin_size(ring, gmem, {
-         .render_mode = RENDERING_PASS,
-         .force_lrz_write_dis = !screen->info->a6xx.has_lrz_feedback,
-         .buffers_location = BUFFERS_IN_GMEM,
-         .lrz_feedback_zmode_mask =
-            screen->info->a6xx.has_lrz_feedback
-               ? LRZ_FEEDBACK_EARLY_Z_OR_EARLY_Z_LATE_Z
-               : LRZ_FEEDBACK_NONE,
-      });
+         set_bin_size(crb, gmem, {
+            .render_mode = RENDERING_PASS,
+            .force_lrz_write_dis = !screen->info->a6xx.has_lrz_feedback,
+            .buffers_location = BUFFERS_IN_GMEM,
+            .lrz_feedback_zmode_mask =
+               screen->info->a6xx.has_lrz_feedback
+                  ?
LRZ_FEEDBACK_EARLY_Z_OR_EARLY_Z_LATE_Z + : LRZ_FEEDBACK_NONE, + }); + } } - update_render_cntl(batch, pfb, false); + update_render_cntl(cs, screen, pfb, false); - emit_common_init(batch); + emit_common_init(cs, batch); } +/* nregs: 4 */ template static void -set_window_offset(struct fd_ringbuffer *ring, uint32_t x1, uint32_t y1) +set_window_offset(fd_crb &crb, uint32_t x1, uint32_t y1) { - OUT_PKT4(ring, REG_A6XX_RB_WINDOW_OFFSET, 1); - OUT_RING(ring, A6XX_RB_WINDOW_OFFSET_X(x1) | A6XX_RB_WINDOW_OFFSET_Y(y1)); - - OUT_PKT4(ring, REG_A6XX_RB_RESOLVE_WINDOW_OFFSET, 1); - OUT_RING(ring, A6XX_RB_RESOLVE_WINDOW_OFFSET_X(x1) | A6XX_RB_RESOLVE_WINDOW_OFFSET_Y(y1)); - - OUT_REG(ring, SP_WINDOW_OFFSET(CHIP, .x = x1, .y = y1)); - - OUT_PKT4(ring, REG_A6XX_TPL1_WINDOW_OFFSET, 1); - OUT_RING(ring, - A6XX_TPL1_WINDOW_OFFSET_X(x1) | A6XX_TPL1_WINDOW_OFFSET_Y(y1)); + crb.add(A6XX_RB_WINDOW_OFFSET(.x = x1, .y = y1)); + crb.add(A6XX_RB_RESOLVE_WINDOW_OFFSET(.x = x1, .y = y1)); + crb.add(SP_WINDOW_OFFSET(CHIP, .x = x1, .y = y1)); + crb.add(A6XX_TPL1_WINDOW_OFFSET(.x = x1, .y = y1)); } /* before mem2gmem */ @@ -1243,63 +1248,104 @@ fd6_emit_tile_prep(struct fd_batch *batch, const struct fd_tile *tile) struct fd_context *ctx = batch->ctx; const struct fd_gmem_stateobj *gmem = batch->gmem_state; struct fd6_context *fd6_ctx = fd6_context(ctx); - struct fd_ringbuffer *ring = batch->gmem; + fd_cs cs(batch->gmem); - emit_marker6(ring, 7); - OUT_PKT7(ring, CP_SET_MARKER, 1); - OUT_RING(ring, A6XX_CP_SET_MARKER_0_MODE(RM6_BIN_RENDER_START) | + emit_marker6(cs, 7); + fd_pkt7(cs, CP_SET_MARKER, 1) + .add(A6XX_CP_SET_MARKER_0_MODE(RM6_BIN_RENDER_START) | A6XX_CP_SET_MARKER_0_USES_GMEM); - emit_marker6(ring, 7); + emit_marker6(cs, 7); uint32_t x1 = tile->xoff; uint32_t y1 = tile->yoff; uint32_t x2 = tile->xoff + tile->bin_w - 1; uint32_t y2 = tile->yoff + tile->bin_h - 1; - set_scissor(ring, x1, y1, x2, y2); - set_tessfactor_bo(ring, batch); + set_scissor(cs, x1, y1, x2, y2); + set_tessfactor_bo(cs, batch); - fd6_emit_ccu_cntl(ring, screen, true); + fd6_emit_ccu_cntl(cs, screen, true); - emit_zs(batch->ctx, ring, &pfb->zsbuf, batch->gmem_state); - emit_mrt(ring, pfb, batch->gmem_state); - emit_msaa(ring, pfb->samples); + with_crb (cs, 150) { + emit_zs(crb, &pfb->zsbuf, batch->gmem_state); + emit_mrt(crb, pfb, batch->gmem_state); + emit_msaa(crb, pfb->samples); + } if (use_hw_binning(batch)) { const struct fd_vsc_pipe *pipe = &gmem->vsc_pipe[tile->p]; - unsigned num_vsc_pipes = ctx->screen->info->num_vsc_pipes; + unsigned num_vsc_pipes = screen->info->num_vsc_pipes; - OUT_PKT7(ring, CP_WAIT_FOR_ME, 0); + fd_pkt7(cs, CP_WAIT_FOR_ME, 0); - OUT_PKT7(ring, CP_SET_MODE, 1); - OUT_RING(ring, 0x0); + fd_pkt7(cs, CP_SET_MODE, 1) + .add(0x0); - OUT_PKT7(ring, CP_SET_BIN_DATA5, 7); - /* A702 also sets BIT(0) but that hangchecks */ - OUT_RING(ring, CP_SET_BIN_DATA5_0_VSC_SIZE(pipe->w * pipe->h) | - CP_SET_BIN_DATA5_0_VSC_N(tile->n)); - OUT_RELOC(ring, fd6_ctx->vsc_draw_strm, /* per-pipe draw-stream address */ - (tile->p * fd6_ctx->vsc_draw_strm_pitch), 0, 0); - OUT_RELOC( - ring, fd6_ctx->vsc_draw_strm, /* VSC_PIPE_DATA_DRAW_BASE + (p * 4) */ - (tile->p * 4) + (num_vsc_pipes * fd6_ctx->vsc_draw_strm_pitch), - 0, 0); - OUT_RELOC(ring, fd6_ctx->vsc_prim_strm, - (tile->p * fd6_ctx->vsc_prim_strm_pitch), 0, 0); + fd_pkt7(cs, CP_SET_BIN_DATA5, 7) + /* A702 also sets BIT(0) but that hangchecks */ + .add(CP_SET_BIN_DATA5_0( + .vsc_size = pipe->w * pipe->h, + .vsc_n = tile->n, + )) + .add(NO_ABS_MASK_CP_SET_BIN_DATA5_BIN_DATA_ADDR( + .bo = 
fd6_ctx->vsc_draw_strm,
+            .bo_offset = tile->p * fd6_ctx->vsc_draw_strm_pitch,
+         ))
+         .add(NO_ABS_MASK_CP_SET_BIN_DATA5_BIN_SIZE_ADDR(
+            .bo = fd6_ctx->vsc_draw_strm, /* VSC_PIPE_DATA_DRAW_BASE + (p * 4) */
+            .bo_offset = (tile->p * 4) + (num_vsc_pipes * fd6_ctx->vsc_draw_strm_pitch),
+         ))
+         .add(NO_ABS_MASK_CP_SET_BIN_DATA5_BIN_PRIM_STRM(
+            .bo = fd6_ctx->vsc_prim_strm,
+            .bo_offset = tile->p * fd6_ctx->vsc_prim_strm_pitch,
+         ));
 
-      OUT_PKT7(ring, CP_SET_VISIBILITY_OVERRIDE, 1);
-      OUT_RING(ring, 0x0);
+      fd_pkt7(cs, CP_SET_VISIBILITY_OVERRIDE, 1)
+         .add(0x0);
 
-      /* and disable stream-out for draw pass: */
-      OUT_REG(ring, A6XX_VPC_SO_OVERRIDE(true));
+      with_crb (cs, 5) {
+         crb.add(A6XX_VPC_SO_OVERRIDE(true));
 
-      /*
-       * NOTE: even if we detect VSC overflow and disable use of
-       * visibility stream in draw pass, it is still safe to execute
-       * the reset of these cmds:
-       */
+         /*
+          * NOTE: even if we detect VSC overflow and disable use of
+          * visibility stream in draw pass, it is still safe to execute
+          * the rest of these cmds:
+          */
 
-      set_bin_size(ring, gmem, {
+         set_bin_size(crb, gmem, {
+            .render_mode = RENDERING_PASS,
+            .force_lrz_write_dis = !screen->info->a6xx.has_lrz_feedback,
+            .buffers_location = BUFFERS_IN_GMEM,
+            .lrz_feedback_zmode_mask = screen->info->a6xx.has_lrz_feedback
+               ? LRZ_FEEDBACK_EARLY_Z_LATE_Z
+               : LRZ_FEEDBACK_NONE,
+         });
+
+         crb.add(A6XX_VFD_RENDER_MODE(RENDERING_PASS));
+      }
+
+      if (CHIP == A6XX) {
+         fd_pkt4(cs, 1)
+            .add(A6XX_PC_POWER_CNTL(screen->info->a6xx.magic.PC_POWER_CNTL));
+         fd_pkt4(cs, 1)
+            .add(A6XX_VFD_POWER_CNTL(screen->info->a6xx.magic.PC_POWER_CNTL));
+      }
+
+      fd_pkt7(cs, CP_SKIP_IB2_ENABLE_GLOBAL, 1)
+         .add(0x1);
+   } else {
+      fd_pkt7(cs, CP_SET_VISIBILITY_OVERRIDE, 1)
+         .add(0x1);
+
+      /* no binning pass, so enable stream-out for draw pass: */
+      fd_pkt4(cs, 1)
+         .add(A6XX_VPC_SO_OVERRIDE(false));
+   }
+
+   with_crb (cs, 7) {
+      set_window_offset(crb, x1, y1);
+
+      set_bin_size(crb, gmem, {
         .render_mode = RENDERING_PASS,
         .force_lrz_write_dis = !screen->info->a6xx.has_lrz_feedback,
         .buffers_location = BUFFERS_IN_GMEM,
@@ -1307,52 +1353,14 @@ fd6_emit_tile_prep(struct fd_batch *batch, const struct fd_tile *tile)
            ? LRZ_FEEDBACK_EARLY_Z_LATE_Z
            : LRZ_FEEDBACK_NONE,
        });
-
-      OUT_REG(ring, A6XX_VFD_RENDER_MODE(RENDERING_PASS));
-
-      if (CHIP == A6XX) {
-         OUT_REG(ring, A6XX_PC_POWER_CNTL(screen->info->a6xx.magic.PC_POWER_CNTL));
-         OUT_REG(ring, A6XX_VFD_POWER_CNTL(screen->info->a6xx.magic.PC_POWER_CNTL));
-      }
-
-      OUT_PKT7(ring, CP_SKIP_IB2_ENABLE_GLOBAL, 1);
-      OUT_RING(ring, 0x1);
-
-   } else {
-      OUT_PKT7(ring, CP_SET_VISIBILITY_OVERRIDE, 1);
-      OUT_RING(ring, 0x1);
-
-      /* no binning pass, so enable stream-out for draw pass:: */
-      OUT_REG(ring, A6XX_VPC_SO_OVERRIDE(false));
-
-      set_bin_size(ring, gmem, {
-         .render_mode = RENDERING_PASS,
-         .force_lrz_write_dis = !screen->info->a6xx.has_lrz_feedback,
-         .buffers_location = BUFFERS_IN_GMEM,
-         .lrz_feedback_zmode_mask =
-            screen->info->a6xx.has_lrz_feedback
-               ? LRZ_FEEDBACK_EARLY_Z_OR_EARLY_Z_LATE_Z
-               : LRZ_FEEDBACK_NONE,
-      });
    }
 
-   set_window_offset(ring, x1, y1);
-
-   set_bin_size(ring, gmem, {
-      .render_mode = RENDERING_PASS,
-      .force_lrz_write_dis = !ctx->screen->info->a6xx.has_lrz_feedback,
-      .buffers_location = BUFFERS_IN_GMEM,
-      .lrz_feedback_zmode_mask = ctx->screen->info->a6xx.has_lrz_feedback
-         ?
LRZ_FEEDBACK_EARLY_Z_LATE_Z
-         : LRZ_FEEDBACK_NONE,
-   });
-
-   OUT_PKT7(ring, CP_SET_MODE, 1);
-   OUT_RING(ring, 0x0);
+   fd_pkt7(cs, CP_SET_MODE, 1)
+      .add(0x0);
 }

 static void
-set_blit_scissor(struct fd_batch *batch, struct fd_ringbuffer *ring)
+set_blit_scissor(struct fd_batch *batch, fd_cs &cs)
 {
    const struct pipe_framebuffer_state *pfb = &batch->framebuffer;
@@ -1363,16 +1371,15 @@ set_blit_scissor(struct fd_batch *batch, struct fd_ringbuffer *ring)
    blit_scissor.maxx = ALIGN(pfb->width, 16);
    blit_scissor.maxy = ALIGN(pfb->height, 4);

-   OUT_PKT4(ring, REG_A6XX_RB_RESOLVE_CNTL_1, 2);
-   OUT_RING(ring, A6XX_RB_RESOLVE_CNTL_1_X(blit_scissor.minx) |
-                  A6XX_RB_RESOLVE_CNTL_1_Y(blit_scissor.miny));
-   OUT_RING(ring, A6XX_RB_RESOLVE_CNTL_2_X(blit_scissor.maxx - 1) |
-                  A6XX_RB_RESOLVE_CNTL_2_Y(blit_scissor.maxy - 1));
+   fd_pkt4(cs, 2)
+      .add(A6XX_RB_RESOLVE_CNTL_1(.x = blit_scissor.minx, .y = blit_scissor.miny))
+      .add(A6XX_RB_RESOLVE_CNTL_2(.x = blit_scissor.maxx - 1, .y = blit_scissor.maxy - 1));
 }

+/* nregs: 10 */
 template <chip CHIP>
 static void
-emit_blit(struct fd_batch *batch, struct fd_ringbuffer *ring, uint32_t base,
+emit_blit(struct fd_batch *batch, fd_crb &crb, uint32_t base,
           struct pipe_surface *psurf, bool stencil)
 {
    struct fd_resource *rsc = fd_resource(psurf->texture);
@@ -1404,57 +1411,60 @@ emit_blit(struct fd_batch *batch, struct fd_ringbuffer *ring, uint32_t base,
                             false);
    enum a3xx_msaa_samples samples = fd_msaa_samples(rsc->b.b.nr_samples);

-   OUT_REG(ring,
-           A6XX_RB_RESOLVE_SYSTEM_BUFFER_INFO(
-                 .tile_mode = tile_mode,
-                 .flags = ubwc_enabled,
-                 .samples = samples,
-                 .color_swap = swap,
-                 .color_format = format,
-           ),
-           A6XX_RB_RESOLVE_SYSTEM_BUFFER_BASE(.bo = rsc->bo, .bo_offset = offset),
-           A6XX_RB_RESOLVE_SYSTEM_BUFFER_PITCH(stride),
-           A6XX_RB_RESOLVE_SYSTEM_BUFFER_ARRAY_PITCH(array_stride));
+   crb.add(A6XX_RB_RESOLVE_SYSTEM_BUFFER_INFO(
+      .tile_mode = tile_mode,
+      .flags = ubwc_enabled,
+      .samples = samples,
+      .color_swap = swap,
+      .color_format = format,
+   ));
+   crb.add(A6XX_RB_RESOLVE_SYSTEM_BUFFER_BASE(.bo = rsc->bo, .bo_offset = offset));
+   crb.add(A6XX_RB_RESOLVE_SYSTEM_BUFFER_PITCH(stride));
+   crb.add(A6XX_RB_RESOLVE_SYSTEM_BUFFER_ARRAY_PITCH(array_stride));

-   OUT_REG(ring, A6XX_RB_RESOLVE_GMEM_BUFFER_BASE(.dword = base));
+   crb.add(A6XX_RB_RESOLVE_GMEM_BUFFER_BASE(.dword = base));

    if (ubwc_enabled) {
-      OUT_PKT4(ring, REG_A6XX_RB_RESOLVE_SYSTEM_FLAG_BUFFER_BASE, 3);
-      fd6_emit_flag_reference(ring, rsc, psurf->level,
-                              psurf->first_layer);
+      crb.add(A6XX_RB_RESOLVE_SYSTEM_FLAG_BUFFER_BASE(
+         .bo = rsc->bo,
+         .bo_offset = fd_resource_ubwc_offset(rsc, psurf->level, psurf->first_layer),
+      ));
+      crb.add(A6XX_RB_RESOLVE_SYSTEM_FLAG_BUFFER_PITCH(
+         .pitch = fdl_ubwc_pitch(&rsc->layout, psurf->level),
+         .array_pitch = rsc->layout.ubwc_layer_size >> 2,
+      ));
    }

    if (CHIP >= A7XX)
-      OUT_REG(ring, A7XX_RB_CLEAR_TARGET(.clear_mode = CLEAR_MODE_GMEM));
-
-   fd6_emit_blit(batch->ctx, ring);
+      crb.add(A7XX_RB_CLEAR_TARGET(.clear_mode = CLEAR_MODE_GMEM));
 }

 template <chip CHIP>
 static void
-emit_restore_blit(struct fd_batch *batch, struct fd_ringbuffer *ring,
-                  uint32_t base, struct pipe_surface *psurf, unsigned buffer)
+emit_restore_blit(struct fd_batch *batch, fd_cs &cs, uint32_t base,
+                  struct pipe_surface *psurf, unsigned buffer)
 {
    bool stencil = (buffer == FD_BUFFER_STENCIL);

-   OUT_REG(ring,
-           A6XX_RB_RESOLVE_OPERATION(
-                 .type = BLIT_EVENT_LOAD,
-                 .sample_0 = util_format_is_pure_integer(psurf->format),
-                 .depth = (buffer == FD_BUFFER_DEPTH),
-           ),
-   );
+   with_crb (cs, 11) {
+
crb.add(A6XX_RB_RESOLVE_OPERATION( + .type = BLIT_EVENT_LOAD, + .sample_0 = util_format_is_pure_integer(psurf->format), + .depth = (buffer == FD_BUFFER_DEPTH), + )); - emit_blit(batch, ring, base, psurf, stencil); + emit_blit(batch, crb, base, psurf, stencil); + } + + fd6_emit_blit(batch->ctx, cs); } template static void -emit_subpass_clears(struct fd_batch *batch, struct fd_batch_subpass *subpass) +emit_subpass_clears(struct fd_batch *batch, fd_cs &cs, struct fd_batch_subpass *subpass) { struct pipe_framebuffer_state *pfb = &batch->framebuffer; const struct fd_gmem_stateobj *gmem = batch->gmem_state; - struct fd_ringbuffer *ring = subpass->subpass_clears; enum a3xx_msaa_samples samples = fd_msaa_samples(pfb->samples); uint32_t buffers = subpass->fast_cleared; @@ -1504,32 +1514,31 @@ emit_subpass_clears(struct fd_batch *batch, struct fd_batch_subpass *subpass) util_pack_color_union(pfmt, &uc, &swapped); - OUT_PKT4(ring, REG_A6XX_RB_RESOLVE_SYSTEM_BUFFER_INFO, 1); - OUT_RING(ring, - A6XX_RB_RESOLVE_SYSTEM_BUFFER_INFO_TILE_MODE(TILE6_LINEAR) | - A6XX_RB_RESOLVE_SYSTEM_BUFFER_INFO_SAMPLES(samples) | - A6XX_RB_RESOLVE_SYSTEM_BUFFER_INFO_COLOR_FORMAT(fd6_color_format(pfmt, TILE6_LINEAR))); + with_crb (cs, 9) { + crb.add(A6XX_RB_RESOLVE_SYSTEM_BUFFER_INFO( + .tile_mode = TILE6_LINEAR, + .samples = samples, + .color_format = fd6_color_format(pfmt, TILE6_LINEAR), + )); - OUT_PKT4(ring, REG_A6XX_RB_RESOLVE_OPERATION, 1); - OUT_RING(ring, A6XX_RB_RESOLVE_OPERATION_TYPE(BLIT_EVENT_CLEAR) | - A6XX_RB_RESOLVE_OPERATION_CLEAR_MASK(0xf)); + crb.add(A6XX_RB_RESOLVE_OPERATION( + .type = BLIT_EVENT_CLEAR, + .clear_mask = 0xf, + )); - OUT_PKT4(ring, REG_A6XX_RB_RESOLVE_GMEM_BUFFER_BASE, 1); - OUT_RING(ring, gmem->cbuf_base[i]); + crb.add(A6XX_RB_RESOLVE_GMEM_BUFFER_BASE(gmem->cbuf_base[i])); + crb.add(A6XX_RB_RESOLVE_CNTL_0()); - OUT_PKT4(ring, REG_A6XX_RB_RESOLVE_CNTL_0, 1); - OUT_RING(ring, 0); + crb.add(A6XX_RB_RESOLVE_CLEAR_COLOR_DW0(uc.ui[0])); + crb.add(A6XX_RB_RESOLVE_CLEAR_COLOR_DW1(uc.ui[1])); + crb.add(A6XX_RB_RESOLVE_CLEAR_COLOR_DW2(uc.ui[2])); + crb.add(A6XX_RB_RESOLVE_CLEAR_COLOR_DW3(uc.ui[3])); - OUT_PKT4(ring, REG_A6XX_RB_RESOLVE_CLEAR_COLOR_DW0, 4); - OUT_RING(ring, uc.ui[0]); - OUT_RING(ring, uc.ui[1]); - OUT_RING(ring, uc.ui[2]); - OUT_RING(ring, uc.ui[3]); + if (CHIP >= A7XX) + crb.add(A7XX_RB_CLEAR_TARGET(.clear_mode = CLEAR_MODE_GMEM)); + } - if (CHIP >= A7XX) - OUT_REG(ring, A7XX_RB_CLEAR_TARGET(.clear_mode = CLEAR_MODE_GMEM)); - - fd6_emit_blit(batch->ctx, ring); + fd6_emit_blit(batch->ctx, cs); } } @@ -1559,52 +1568,57 @@ emit_subpass_clears(struct fd_batch *batch, struct fd_batch_subpass *subpass) if (!has_separate_stencil && (buffers & PIPE_CLEAR_STENCIL)) mask |= 0x2; - OUT_PKT4(ring, REG_A6XX_RB_RESOLVE_SYSTEM_BUFFER_INFO, 1); - OUT_RING(ring, - A6XX_RB_RESOLVE_SYSTEM_BUFFER_INFO_TILE_MODE(TILE6_LINEAR) | - A6XX_RB_RESOLVE_SYSTEM_BUFFER_INFO_SAMPLES(samples) | - A6XX_RB_RESOLVE_SYSTEM_BUFFER_INFO_COLOR_FORMAT(fd6_color_format(pfmt, TILE6_LINEAR))); + with_crb (cs, 6) { + crb.add(A6XX_RB_RESOLVE_SYSTEM_BUFFER_INFO( + .tile_mode = TILE6_LINEAR, + .samples = samples, + .color_format = fd6_color_format(pfmt, TILE6_LINEAR), + )); - OUT_PKT4(ring, REG_A6XX_RB_RESOLVE_OPERATION, 1); - OUT_RING(ring, A6XX_RB_RESOLVE_OPERATION_TYPE(BLIT_EVENT_CLEAR) | - A6XX_RB_RESOLVE_OPERATION_DEPTH | - A6XX_RB_RESOLVE_OPERATION_CLEAR_MASK(mask)); + crb.add(A6XX_RB_RESOLVE_OPERATION( + .type = BLIT_EVENT_CLEAR, + .depth = true, + .clear_mask = mask, + )); - OUT_PKT4(ring, 
REG_A6XX_RB_RESOLVE_GMEM_BUFFER_BASE, 1); - OUT_RING(ring, gmem->zsbuf_base[0]); + crb.add(A6XX_RB_RESOLVE_GMEM_BUFFER_BASE(gmem->zsbuf_base[0])); + crb.add(A6XX_RB_RESOLVE_CNTL_0()); - OUT_PKT4(ring, REG_A6XX_RB_RESOLVE_CNTL_0, 1); - OUT_RING(ring, 0); + crb.add(A6XX_RB_RESOLVE_CLEAR_COLOR_DW0(clear_value)); - OUT_PKT4(ring, REG_A6XX_RB_RESOLVE_CLEAR_COLOR_DW0, 1); - OUT_RING(ring, clear_value); + if (CHIP >= A7XX) + crb.add(A7XX_RB_CLEAR_TARGET(.clear_mode = CLEAR_MODE_GMEM)); + } - fd6_emit_blit(batch->ctx, ring); + fd6_emit_blit(batch->ctx, cs); } /* Then clear the separate stencil buffer in case of 32 bit depth * formats with separate stencil. */ if (has_separate_stencil && (buffers & PIPE_CLEAR_STENCIL)) { - OUT_PKT4(ring, REG_A6XX_RB_RESOLVE_SYSTEM_BUFFER_INFO, 1); - OUT_RING(ring, A6XX_RB_RESOLVE_SYSTEM_BUFFER_INFO_TILE_MODE(TILE6_LINEAR) | - A6XX_RB_RESOLVE_SYSTEM_BUFFER_INFO_SAMPLES(samples) | - A6XX_RB_RESOLVE_SYSTEM_BUFFER_INFO_COLOR_FORMAT(FMT6_8_UINT)); + with_crb (cs, 6) { + crb.add(A6XX_RB_RESOLVE_SYSTEM_BUFFER_INFO( + .tile_mode = TILE6_LINEAR, + .samples = samples, + .color_format = FMT6_8_UINT, + )); - OUT_PKT4(ring, REG_A6XX_RB_RESOLVE_OPERATION, 1); - OUT_RING(ring, A6XX_RB_RESOLVE_OPERATION_TYPE(BLIT_EVENT_CLEAR) | - A6XX_RB_RESOLVE_OPERATION_DEPTH | - A6XX_RB_RESOLVE_OPERATION_CLEAR_MASK(0x1)); + crb.add(A6XX_RB_RESOLVE_OPERATION( + .type = BLIT_EVENT_CLEAR, + .depth = true, + .clear_mask = 0x1, + )); - OUT_PKT4(ring, REG_A6XX_RB_RESOLVE_GMEM_BUFFER_BASE, 1); - OUT_RING(ring, gmem->zsbuf_base[1]); + crb.add(A6XX_RB_RESOLVE_GMEM_BUFFER_BASE(gmem->zsbuf_base[1])); + crb.add(A6XX_RB_RESOLVE_CNTL_0()); - OUT_PKT4(ring, REG_A6XX_RB_RESOLVE_CNTL_0, 1); - OUT_RING(ring, 0); + crb.add(A6XX_RB_RESOLVE_CLEAR_COLOR_DW0(subpass->clear_stencil & 0xff)); - OUT_PKT4(ring, REG_A6XX_RB_RESOLVE_CLEAR_COLOR_DW0, 1); - OUT_RING(ring, subpass->clear_stencil & 0xff); + if (CHIP >= A7XX) + crb.add(A7XX_RB_CLEAR_TARGET(.clear_mode = CLEAR_MODE_GMEM)); + } - fd6_emit_blit(batch->ctx, ring); + fd6_emit_blit(batch->ctx, cs); } } @@ -1613,7 +1627,7 @@ emit_subpass_clears(struct fd_batch *batch, struct fd_batch_subpass *subpass) */ template static void -emit_restore_blits(struct fd_batch *batch, struct fd_ringbuffer *ring) +emit_restore_blits(struct fd_batch *batch, fd_cs &cs) { const struct fd_gmem_stateobj *gmem = batch->gmem_state; struct pipe_framebuffer_state *pfb = &batch->framebuffer; @@ -1625,7 +1639,7 @@ emit_restore_blits(struct fd_batch *batch, struct fd_ringbuffer *ring) continue; if (!(batch->restore & (PIPE_CLEAR_COLOR0 << i))) continue; - emit_restore_blit(batch, ring, gmem->cbuf_base[i], &pfb->cbufs[i], + emit_restore_blit(batch, cs, gmem->cbuf_base[i], &pfb->cbufs[i], FD_BUFFER_COLOR); } } @@ -1634,11 +1648,11 @@ emit_restore_blits(struct fd_batch *batch, struct fd_ringbuffer *ring) struct fd_resource *rsc = fd_resource(pfb->zsbuf.texture); if (!rsc->stencil || (batch->restore & FD_BUFFER_DEPTH)) { - emit_restore_blit(batch, ring, gmem->zsbuf_base[0], &pfb->zsbuf, + emit_restore_blit(batch, cs, gmem->zsbuf_base[0], &pfb->zsbuf, FD_BUFFER_DEPTH); } if (rsc->stencil && (batch->restore & FD_BUFFER_STENCIL)) { - emit_restore_blit(batch, ring, gmem->zsbuf_base[1], &pfb->zsbuf, + emit_restore_blit(batch, cs, gmem->zsbuf_base[1], &pfb->zsbuf, FD_BUFFER_STENCIL); } } @@ -1652,8 +1666,10 @@ prepare_tile_setup(struct fd_batch *batch) batch->tile_loads = fd_submit_new_ringbuffer(batch->submit, 0x1000, FD_RINGBUFFER_STREAMING); - set_blit_scissor(batch, batch->tile_loads); - 
emit_restore_blits(batch, batch->tile_loads); + fd_cs cs(batch->tile_loads); + + set_blit_scissor(batch, cs); + emit_restore_blits(batch, cs); } foreach_subpass (subpass, batch) { @@ -1663,8 +1679,10 @@ prepare_tile_setup(struct fd_batch *batch) subpass->subpass_clears = fd_submit_new_ringbuffer(batch->submit, 0x1000, FD_RINGBUFFER_STREAMING); - set_blit_scissor(batch, subpass->subpass_clears); - emit_subpass_clears(batch, subpass); + fd_cs cs(subpass->subpass_clears); + + set_blit_scissor(batch, cs); + emit_subpass_clears(batch, cs, subpass); } } @@ -1681,9 +1699,10 @@ static void fd6_emit_tile_renderprep(struct fd_batch *batch, const struct fd_tile *tile) { if (batch->tile_loads) { - trace_start_tile_loads(&batch->trace, batch->gmem, batch->restore); - emit_conditional_ib(batch, tile, batch->tile_loads); - trace_end_tile_loads(&batch->trace, batch->gmem); + fd_cs cs(batch->gmem); + trace_start_tile_loads(&batch->trace, cs.ring(), batch->restore); + emit_conditional_ib(cs, batch, tile, batch->tile_loads); + trace_end_tile_loads(&batch->trace, cs.ring()); } } @@ -1750,7 +1769,7 @@ fd6_unknown_8c01(enum pipe_format format, unsigned buffers) template static void -emit_resolve_blit(struct fd_batch *batch, struct fd_ringbuffer *ring, +emit_resolve_blit(struct fd_batch *batch, fd_cs &cs, uint32_t base, struct pipe_surface *psurf, unsigned buffer) assert_dt { @@ -1773,7 +1792,7 @@ emit_resolve_blit(struct fd_batch *batch, struct fd_ringbuffer *ring, * !resolve case below, so batch_draw_tracking_for_dirty_bits() has us * just do a restore of the other channel for partial packed z/s writes. */ - fd6_resolve_tile(batch, ring, base, psurf, 0); + fd6_resolve_tile(batch, cs, base, psurf, 0); return; } @@ -1794,10 +1813,12 @@ emit_resolve_blit(struct fd_batch *batch, struct fd_ringbuffer *ring, util_format_is_depth_or_stencil(psurf->format)) info |= A6XX_RB_RESOLVE_OPERATION_SAMPLE_0; - OUT_PKT4(ring, REG_A6XX_RB_RESOLVE_OPERATION, 1); - OUT_RING(ring, info); + with_crb (cs, 11) { + crb.add(A6XX_RB_RESOLVE_OPERATION(.dword = info)); + emit_blit(batch, crb, base, psurf, stencil); + } - emit_blit(batch, ring, base, psurf, stencil); + fd6_emit_blit(batch->ctx, cs); } /* @@ -1811,23 +1832,23 @@ prepare_tile_fini(struct fd_batch *batch) { const struct fd_gmem_stateobj *gmem = batch->gmem_state; struct pipe_framebuffer_state *pfb = &batch->framebuffer; - struct fd_ringbuffer *ring; batch->tile_store = fd_submit_new_ringbuffer(batch->submit, 0x1000, FD_RINGBUFFER_STREAMING); - ring = batch->tile_store; - set_blit_scissor(batch, ring); + fd_cs cs(batch->tile_store); + + set_blit_scissor(batch, cs); if (batch->resolve & (FD_BUFFER_DEPTH | FD_BUFFER_STENCIL)) { struct fd_resource *rsc = fd_resource(pfb->zsbuf.texture); if (!rsc->stencil || (batch->resolve & FD_BUFFER_DEPTH)) { - emit_resolve_blit(batch, ring, gmem->zsbuf_base[0], + emit_resolve_blit(batch, cs, gmem->zsbuf_base[0], &pfb->zsbuf, FD_BUFFER_DEPTH); } if (rsc->stencil && (batch->resolve & FD_BUFFER_STENCIL)) { - emit_resolve_blit(batch, ring, gmem->zsbuf_base[1], + emit_resolve_blit(batch, cs, gmem->zsbuf_base[1], &pfb->zsbuf, FD_BUFFER_STENCIL); } } @@ -1839,7 +1860,7 @@ prepare_tile_fini(struct fd_batch *batch) continue; if (!(batch->resolve & (PIPE_CLEAR_COLOR0 << i))) continue; - emit_resolve_blit(batch, ring, gmem->cbuf_base[i], + emit_resolve_blit(batch, cs, gmem->cbuf_base[i], &pfb->cbufs[i], FD_BUFFER_COLOR); } } @@ -1849,75 +1870,72 @@ template static void fd6_emit_tile(struct fd_batch *batch, const struct fd_tile *tile) { + fd_cs 
cs(batch->gmem); + foreach_subpass (subpass, batch) { if (subpass->subpass_clears) { - trace_start_clears(&batch->trace, batch->gmem, subpass->fast_cleared); - emit_conditional_ib(batch, tile, subpass->subpass_clears); - trace_end_clears(&batch->trace, batch->gmem); + trace_start_clears(&batch->trace, cs.ring(), subpass->fast_cleared); + emit_conditional_ib(cs, batch, tile, subpass->subpass_clears); + trace_end_clears(&batch->trace, cs.ring()); } - emit_lrz(batch, subpass); + emit_lrz(cs, batch, subpass); - fd6_emit_ib(batch->gmem, subpass->draw); + fd6_emit_ib(cs, subpass->draw); } if (batch->tile_epilogue) - fd6_emit_ib(batch->gmem, batch->tile_epilogue); + fd6_emit_ib(cs, batch->tile_epilogue); } static void fd6_emit_tile_gmem2mem(struct fd_batch *batch, const struct fd_tile *tile) { - struct fd_ringbuffer *ring = batch->gmem; + fd_cs cs(batch->gmem); if (batch->epilogue) - fd6_emit_ib(batch->gmem, batch->epilogue); + fd6_emit_ib(cs, batch->epilogue); if (use_hw_binning(batch)) { - OUT_PKT7(ring, CP_SET_MARKER, 1); - OUT_RING(ring, A6XX_CP_SET_MARKER_0_MODE(RM6_BIN_END_OF_DRAWS) | - A6XX_CP_SET_MARKER_0_USES_GMEM); + fd_pkt7(cs, CP_SET_MARKER, 1) + .add(A6XX_CP_SET_MARKER_0(.mode = RM6_BIN_END_OF_DRAWS, .uses_gmem = true)); } - OUT_PKT7(ring, CP_SET_DRAW_STATE, 3); - OUT_RING(ring, CP_SET_DRAW_STATE__0_COUNT(0) | - CP_SET_DRAW_STATE__0_DISABLE_ALL_GROUPS | - CP_SET_DRAW_STATE__0_GROUP_ID(0)); - OUT_RING(ring, CP_SET_DRAW_STATE__1_ADDR_LO(0)); - OUT_RING(ring, CP_SET_DRAW_STATE__2_ADDR_HI(0)); + fd_pkt7(cs, CP_SET_DRAW_STATE, 3) + .add(CP_SET_DRAW_STATE__0(0, .disable_all_groups = true)) + .add(CP_SET_DRAW_STATE__ADDR(0)); - OUT_PKT7(ring, CP_SKIP_IB2_ENABLE_LOCAL, 1); - OUT_RING(ring, 0x0); + fd_pkt7(cs, CP_SKIP_IB2_ENABLE_LOCAL, 1) + .add(0x0); - emit_marker6(ring, 7); - OUT_PKT7(ring, CP_SET_MARKER, 1); - OUT_RING(ring, A6XX_CP_SET_MARKER_0_MODE(RM6_BIN_RESOLVE) | - A6XX_CP_SET_MARKER_0_USES_GMEM); - emit_marker6(ring, 7); + emit_marker6(cs, 7); + fd_pkt7(cs, CP_SET_MARKER, 1) + .add(A6XX_CP_SET_MARKER_0(.mode = RM6_BIN_RESOLVE, .uses_gmem = true)); + emit_marker6(cs, 7); if (batch->tile_store) { - trace_start_tile_stores(&batch->trace, batch->gmem, batch->resolve); - emit_conditional_ib(batch, tile, batch->tile_store); - trace_end_tile_stores(&batch->trace, batch->gmem); + trace_start_tile_stores(&batch->trace, cs.ring(), batch->resolve); + emit_conditional_ib(cs, batch, tile, batch->tile_store); + trace_end_tile_stores(&batch->trace, cs.ring()); } - OUT_PKT7(ring, CP_SET_MARKER, 1); - OUT_RING(ring, A6XX_CP_SET_MARKER_0_MODE(RM6_BIN_RENDER_END)); + fd_pkt7(cs, CP_SET_MARKER, 1) + .add(A6XX_CP_SET_MARKER_0(.mode = RM6_BIN_RENDER_END)); } template static void fd6_emit_tile_fini(struct fd_batch *batch) { - struct fd_ringbuffer *ring = batch->gmem; + fd_cs cs(batch->gmem); - emit_common_fini(batch); + emit_common_fini(cs, batch); - OUT_PKT4(ring, REG_A6XX_GRAS_LRZ_CNTL, 1); - OUT_RING(ring, A6XX_GRAS_LRZ_CNTL_ENABLE); + fd_pkt4(cs, 1) + .add(A6XX_GRAS_LRZ_CNTL(.enable = true)); - fd6_event_write(batch->ctx, ring, FD_LRZ_FLUSH); - fd6_event_write(batch->ctx, ring, FD_CCU_CLEAN_BLIT_CACHE); + fd6_event_write(batch->ctx, cs, FD_LRZ_FLUSH); + fd6_event_write(batch->ctx, cs, FD_CCU_CLEAN_BLIT_CACHE); if (use_hw_binning(batch)) { check_vsc_overflow(batch->ctx); @@ -1926,11 +1944,10 @@ fd6_emit_tile_fini(struct fd_batch *batch) template static void -emit_sysmem_clears(struct fd_batch *batch, struct fd_batch_subpass *subpass) +emit_sysmem_clears(fd_cs &cs, struct fd_batch *batch, struct 
fd_batch_subpass *subpass) assert_dt { struct fd_context *ctx = batch->ctx; - struct fd_ringbuffer *ring = batch->gmem; struct pipe_framebuffer_state *pfb = &batch->framebuffer; uint32_t buffers = subpass->fast_cleared; @@ -1941,7 +1958,7 @@ emit_sysmem_clears(struct fd_batch *batch, struct fd_batch_subpass *subpass) struct pipe_box box2d; u_box_2d(0, 0, pfb->width, pfb->height, &box2d); - trace_start_clears(&batch->trace, ring, buffers); + trace_start_clears(&batch->trace, cs.ring(), buffers); if (buffers & PIPE_CLEAR_COLOR) { for (int i = 0; i < pfb->nr_cbufs; i++) { @@ -1953,7 +1970,7 @@ emit_sysmem_clears(struct fd_batch *batch, struct fd_batch_subpass *subpass) if (!(buffers & (PIPE_CLEAR_COLOR0 << i))) continue; - fd6_clear_surface(ctx, ring, &pfb->cbufs[i], &box2d, &color, 0); + fd6_clear_surface(ctx, cs, &pfb->cbufs[i], &box2d, &color, 0); } } if (buffers & (PIPE_CLEAR_DEPTH | PIPE_CLEAR_STENCIL)) { @@ -1968,7 +1985,7 @@ emit_sysmem_clears(struct fd_batch *batch, struct fd_batch_subpass *subpass) if ((buffers & PIPE_CLEAR_DEPTH) || (!separate_stencil && (buffers & PIPE_CLEAR_STENCIL))) { value.f[0] = subpass->clear_depth; value.ui[1] = subpass->clear_stencil; - fd6_clear_surface(ctx, ring, &pfb->zsbuf, &box2d, + fd6_clear_surface(ctx, cs, &pfb->zsbuf, &box2d, &value, fd6_unknown_8c01(pfb->zsbuf.format, buffers)); } @@ -1979,33 +1996,33 @@ emit_sysmem_clears(struct fd_batch *batch, struct fd_batch_subpass *subpass) stencil_surf.format = PIPE_FORMAT_S8_UINT; stencil_surf.texture = separate_stencil; - fd6_clear_surface(ctx, ring, &stencil_surf, &box2d, &value, 0); + fd6_clear_surface(ctx, cs, &stencil_surf, &box2d, &value, 0); } } - fd6_emit_flushes(ctx, ring, FD6_FLUSH_CCU_COLOR | FD6_INVALIDATE_CCU_COLOR); + fd6_emit_flushes(ctx, cs, FD6_FLUSH_CCU_COLOR | FD6_INVALIDATE_CCU_COLOR); - trace_end_clears(&batch->trace, ring); + trace_end_clears(&batch->trace, cs.ring()); } template static void fd6_emit_sysmem_prep(struct fd_batch *batch) assert_dt { - struct fd_ringbuffer *ring = batch->gmem; + fd_cs cs(batch->gmem); emit_lrz_clears(batch); - fd6_emit_restore(batch, ring); - fd6_event_write(batch->ctx, ring, FD_LRZ_FLUSH); + fd6_emit_restore(cs, batch); + fd6_event_write(batch->ctx, cs, FD_LRZ_FLUSH); if (batch->prologue) { if (!batch->nondraw) { - trace_start_prologue(&batch->trace, ring); + trace_start_prologue(&batch->trace, cs.ring()); } - fd6_emit_ib(ring, batch->prologue); + fd6_emit_ib(cs, batch->prologue); if (!batch->nondraw) { - trace_end_prologue(&batch->trace, ring); + trace_end_prologue(&batch->trace, cs.ring()); } } @@ -2016,50 +2033,61 @@ fd6_emit_sysmem_prep(struct fd_batch *batch) assert_dt struct pipe_framebuffer_state *pfb = &batch->framebuffer; if (pfb->width > 0 && pfb->height > 0) - set_scissor(ring, 0, 0, pfb->width - 1, pfb->height - 1); + set_scissor(cs, 0, 0, pfb->width - 1, pfb->height - 1); else - set_scissor(ring, 0, 0, 0, 0); + set_scissor(cs, 0, 0, 0, 0); - set_tessfactor_bo(ring, batch); - set_window_offset(ring, 0, 0); - - set_bin_size(ring, NULL, { - .render_mode = RENDERING_PASS, - .buffers_location = BUFFERS_IN_SYSMEM, - }); + set_tessfactor_bo(cs, batch); if (CHIP >= A7XX) { - OUT_REG(ring, A7XX_RB_UNKNOWN_8812(0x3ff)); // all buffers in sysmem - OUT_REG(ring, A7XX_RB_CCU_DBG_ECO_CNTL(batch->ctx->screen->info->a6xx.magic.RB_CCU_DBG_ECO_CNTL)); - OUT_REG(ring, A7XX_GRAS_UNKNOWN_8007(0x0)); - OUT_REG(ring, A6XX_GRAS_UNKNOWN_8110(0x2)); - OUT_REG(ring, A7XX_RB_UNKNOWN_8E09(0x4)); + /* Non-context regs: */ + fd_pkt4(cs, 1) + 
.add(A6XX_GRAS_UNKNOWN_8110(0x2)); } - emit_marker6(ring, 7); - OUT_PKT7(ring, CP_SET_MARKER, 1); - OUT_RING(ring, A6XX_CP_SET_MARKER_0_MODE(RM6_DIRECT_RENDER)); - emit_marker6(ring, 7); + with_crb (cs, 12) { + set_window_offset(crb, 0, 0); - OUT_PKT7(ring, CP_SKIP_IB2_ENABLE_GLOBAL, 1); - OUT_RING(ring, 0x0); + set_bin_size(crb, NULL, { + .render_mode = RENDERING_PASS, + .buffers_location = BUFFERS_IN_SYSMEM, + }); + + if (CHIP >= A7XX) { + crb.add(A7XX_RB_UNKNOWN_8812(0x3ff)); // all buffers in sysmem + crb.add(A7XX_RB_CCU_DBG_ECO_CNTL(batch->ctx->screen->info->a6xx.magic.RB_CCU_DBG_ECO_CNTL)); + crb.add(A7XX_GRAS_UNKNOWN_8007(0x0)); + crb.add(A7XX_RB_UNKNOWN_8E09(0x4)); + } + + /* enable stream-out, with sysmem there is only one pass: */ + crb.add(A6XX_VPC_SO_OVERRIDE(false)); + } + + emit_marker6(cs, 7); + fd_pkt7(cs, CP_SET_MARKER, 1) + .add(A6XX_CP_SET_MARKER_0_MODE(RM6_DIRECT_RENDER)); + emit_marker6(cs, 7); + + fd_pkt7(cs, CP_SKIP_IB2_ENABLE_GLOBAL, 1) + .add(0x0); /* blob controls "local" in IB2, but I think that is not required */ - OUT_PKT7(ring, CP_SKIP_IB2_ENABLE_LOCAL, 1); - OUT_RING(ring, 0x1); + fd_pkt7(cs, CP_SKIP_IB2_ENABLE_LOCAL, 1) + .add(0x1); - /* enable stream-out, with sysmem there is only one pass: */ - OUT_REG(ring, A6XX_VPC_SO_OVERRIDE(false)); + fd_pkt7(cs, CP_SET_VISIBILITY_OVERRIDE, 1) + .add(0x1); - OUT_PKT7(ring, CP_SET_VISIBILITY_OVERRIDE, 1); - OUT_RING(ring, 0x1); + with_crb (cs, 150) { + emit_zs(crb, &pfb->zsbuf, NULL); + emit_mrt(crb, pfb, NULL); + emit_msaa(crb, pfb->samples); + } + + emit_common_init(cs, batch); - emit_zs(batch->ctx, ring, &pfb->zsbuf, NULL); - emit_mrt(ring, pfb, NULL); - emit_msaa(ring, pfb->samples); patch_fb_read_sysmem(batch); - - emit_common_init(batch); } template @@ -2067,8 +2095,8 @@ static void fd6_emit_sysmem(struct fd_batch *batch) assert_dt { - struct fd_ringbuffer *ring = batch->gmem; struct fd_screen *screen = batch->ctx->screen; + fd_cs cs(batch->gmem); foreach_subpass (subpass, batch) { if (subpass->fast_cleared) { @@ -2078,18 +2106,18 @@ fd6_emit_sysmem(struct fd_batch *batch) if (subpass->fast_cleared & (FD_BUFFER_DEPTH | FD_BUFFER_STENCIL)) flushes |= FD6_INVALIDATE_CCU_DEPTH; - fd6_emit_flushes(batch->ctx, ring, flushes); - emit_sysmem_clears(batch, subpass); + fd6_emit_flushes(batch->ctx, cs, flushes); + emit_sysmem_clears(cs, batch, subpass); } - fd6_emit_ccu_cntl(ring, screen, false); + fd6_emit_ccu_cntl(cs, screen, false); struct pipe_framebuffer_state *pfb = &batch->framebuffer; - update_render_cntl(batch, pfb, false); + update_render_cntl(cs, screen, pfb, false); - emit_lrz(batch, subpass); + emit_lrz(cs, batch, subpass); - fd6_emit_ib(ring, subpass->draw); + fd6_emit_ib(cs, subpass->draw); } } @@ -2097,22 +2125,22 @@ template static void fd6_emit_sysmem_fini(struct fd_batch *batch) assert_dt { - struct fd_ringbuffer *ring = batch->gmem; + fd_cs cs(batch->gmem); - emit_common_fini(batch); + emit_common_fini(cs, batch); if (batch->tile_epilogue) - fd6_emit_ib(batch->gmem, batch->tile_epilogue); + fd6_emit_ib(cs, batch->tile_epilogue); if (batch->epilogue) - fd6_emit_ib(batch->gmem, batch->epilogue); + fd6_emit_ib(cs, batch->epilogue); - OUT_PKT7(ring, CP_SKIP_IB2_ENABLE_GLOBAL, 1); - OUT_RING(ring, 0x0); + fd_pkt7(cs, CP_SKIP_IB2_ENABLE_GLOBAL, 1) + .add(0x0); - fd6_event_write(batch->ctx, ring, FD_LRZ_FLUSH); + fd6_event_write(batch->ctx, cs, FD_LRZ_FLUSH); - fd6_emit_flushes(batch->ctx, ring, + fd6_emit_flushes(batch->ctx, cs, FD6_FLUSH_CCU_COLOR | FD6_FLUSH_CCU_DEPTH); } diff --git 
a/src/gallium/drivers/freedreno/a6xx/fd6_image.cc b/src/gallium/drivers/freedreno/a6xx/fd6_image.cc index 7630ed495b1..58efec2c7a2 100644 --- a/src/gallium/drivers/freedreno/a6xx/fd6_image.cc +++ b/src/gallium/drivers/freedreno/a6xx/fd6_image.cc @@ -191,8 +191,7 @@ fd6_build_bindless_state(struct fd_context *ctx, mesa_shader_stage shader, struct fd_shaderimg_stateobj *imgso = &ctx->shaderimg[shader]; struct fd6_descriptor_set *set = descriptor_set(ctx, shader); - struct fd_ringbuffer *ring = fd_submit_new_ringbuffer( - ctx->batch->submit, 16 * 4, FD_RINGBUFFER_STREAMING); + fd_cs cs(ctx->batch->submit, 19 * 4); /* Don't re-use a previous descriptor set if appending the * fb-read descriptor, as that can change across batches. @@ -267,110 +266,104 @@ fd6_build_bindless_state(struct fd_context *ctx, mesa_shader_stage shader, unsigned idx = ir3_shader_descriptor_set(shader); - fd_ringbuffer_attach_bo(ring, set->bo); + cs.attach_bo(set->bo); if (shader == MESA_SHADER_COMPUTE) { - OUT_REG(ring, - SP_UPDATE_CNTL( - CHIP, + with_crb (cs, 5) { + crb.add(SP_UPDATE_CNTL(CHIP, .cs_bindless = CHIP == A6XX ? 0x1f : 0xff, - ) - ); - OUT_REG(ring, SP_CS_BINDLESS_BASE_DESCRIPTOR(CHIP, - idx, .desc_size = BINDLESS_DESCRIPTOR_64B, .bo = set->bo, - )); - - if (CHIP == A6XX) { - OUT_REG(ring, A6XX_HLSQ_CS_BINDLESS_BASE_DESCRIPTOR( - idx, .desc_size = BINDLESS_DESCRIPTOR_64B, .bo = set->bo, )); + crb.add(SP_CS_BINDLESS_BASE_DESCRIPTOR(CHIP, + idx, .desc_size = BINDLESS_DESCRIPTOR_64B, .bo = set->bo, + )); + + if (CHIP == A6XX) { + crb.add(A6XX_HLSQ_CS_BINDLESS_BASE_DESCRIPTOR( + idx, .desc_size = BINDLESS_DESCRIPTOR_64B, .bo = set->bo, + )); + } } if (bufso->enabled_mask) { - OUT_PKT(ring, CP_LOAD_STATE6_FRAG, - CP_LOAD_STATE6_0( - .dst_off = IR3_BINDLESS_SSBO_OFFSET, - .state_type = ST6_UAV, - .state_src = SS6_BINDLESS, - .state_block = SB6_CS_SHADER, - .num_unit = util_last_bit(bufso->enabled_mask), - ), - CP_LOAD_STATE6_EXT_SRC_ADDR( - /* This isn't actually an address: */ - .qword = (idx << 28) | - IR3_BINDLESS_SSBO_OFFSET * FDL6_TEX_CONST_DWORDS, - ), - ); + fd_pkt7(cs, CP_LOAD_STATE6_FRAG, 3) + .add(CP_LOAD_STATE6_0( + .dst_off = IR3_BINDLESS_SSBO_OFFSET, + .state_type = ST6_UAV, + .state_src = SS6_BINDLESS, + .state_block = SB6_CS_SHADER, + .num_unit = util_last_bit(bufso->enabled_mask), + )) + .add(CP_LOAD_STATE6_EXT_SRC_ADDR( + /* This isn't actually an address: */ + .qword = (idx << 28) | + IR3_BINDLESS_SSBO_OFFSET * FDL6_TEX_CONST_DWORDS, + )); } if (imgso->enabled_mask) { - OUT_PKT(ring, CP_LOAD_STATE6_FRAG, - CP_LOAD_STATE6_0( - .dst_off = IR3_BINDLESS_IMAGE_OFFSET, - .state_type = ST6_UAV, - .state_src = SS6_BINDLESS, - .state_block = SB6_CS_SHADER, - .num_unit = util_last_bit(imgso->enabled_mask), - ), - CP_LOAD_STATE6_EXT_SRC_ADDR( - /* This isn't actually an address: */ - .qword = (idx << 28) | - IR3_BINDLESS_IMAGE_OFFSET * FDL6_TEX_CONST_DWORDS, - ), - ); + fd_pkt7(cs, CP_LOAD_STATE6_FRAG, 3) + .add(CP_LOAD_STATE6_0( + .dst_off = IR3_BINDLESS_IMAGE_OFFSET, + .state_type = ST6_UAV, + .state_src = SS6_BINDLESS, + .state_block = SB6_CS_SHADER, + .num_unit = util_last_bit(imgso->enabled_mask), + )) + .add(CP_LOAD_STATE6_EXT_SRC_ADDR( + /* This isn't actually an address: */ + .qword = (idx << 28) | + IR3_BINDLESS_IMAGE_OFFSET * FDL6_TEX_CONST_DWORDS, + )); } } else { - OUT_REG(ring, - SP_UPDATE_CNTL( - CHIP, + with_crb (cs, 5) { + crb.add(SP_UPDATE_CNTL(CHIP, .gfx_bindless = CHIP == A6XX ? 
0x1f : 0xff, - ) - ); - OUT_REG(ring, SP_GFX_BINDLESS_BASE_DESCRIPTOR(CHIP, - idx, .desc_size = BINDLESS_DESCRIPTOR_64B, .bo = set->bo, - )); - if (CHIP == A6XX) { - OUT_REG(ring, A6XX_HLSQ_BINDLESS_BASE_DESCRIPTOR( - idx, .desc_size = BINDLESS_DESCRIPTOR_64B, .bo = set->bo, )); + crb.add(SP_GFX_BINDLESS_BASE_DESCRIPTOR(CHIP, + idx, .desc_size = BINDLESS_DESCRIPTOR_64B, .bo = set->bo, + )); + if (CHIP == A6XX) { + crb.add(A6XX_HLSQ_BINDLESS_BASE_DESCRIPTOR( + idx, .desc_size = BINDLESS_DESCRIPTOR_64B, .bo = set->bo, + )); + } } if (bufso->enabled_mask) { - OUT_PKT(ring, CP_LOAD_STATE6, - CP_LOAD_STATE6_0( - .dst_off = IR3_BINDLESS_SSBO_OFFSET, - .state_type = ST6_SHADER, - .state_src = SS6_BINDLESS, - .state_block = SB6_UAV, - .num_unit = util_last_bit(bufso->enabled_mask), - ), - CP_LOAD_STATE6_EXT_SRC_ADDR( - /* This isn't actually an address: */ - .qword = (idx << 28) | - IR3_BINDLESS_SSBO_OFFSET * FDL6_TEX_CONST_DWORDS, - ), - ); + fd_pkt7(cs, CP_LOAD_STATE6, 3) + .add(CP_LOAD_STATE6_0( + .dst_off = IR3_BINDLESS_SSBO_OFFSET, + .state_type = ST6_SHADER, + .state_src = SS6_BINDLESS, + .state_block = SB6_UAV, + .num_unit = util_last_bit(bufso->enabled_mask), + )) + .add(CP_LOAD_STATE6_EXT_SRC_ADDR( + /* This isn't actually an address: */ + .qword = (idx << 28) | + IR3_BINDLESS_SSBO_OFFSET * FDL6_TEX_CONST_DWORDS, + )); } if (imgso->enabled_mask) { - OUT_PKT(ring, CP_LOAD_STATE6, - CP_LOAD_STATE6_0( - .dst_off = IR3_BINDLESS_IMAGE_OFFSET, - .state_type = ST6_SHADER, - .state_src = SS6_BINDLESS, - .state_block = SB6_UAV, - .num_unit = util_last_bit(imgso->enabled_mask), - ), - CP_LOAD_STATE6_EXT_SRC_ADDR( - /* This isn't actually an address: */ - .qword = (idx << 28) | - IR3_BINDLESS_IMAGE_OFFSET * FDL6_TEX_CONST_DWORDS, - ), - ); + fd_pkt7(cs, CP_LOAD_STATE6, 3) + .add(CP_LOAD_STATE6_0( + .dst_off = IR3_BINDLESS_IMAGE_OFFSET, + .state_type = ST6_SHADER, + .state_src = SS6_BINDLESS, + .state_block = SB6_UAV, + .num_unit = util_last_bit(imgso->enabled_mask), + )) + .add(CP_LOAD_STATE6_EXT_SRC_ADDR( + /* This isn't actually an address: */ + .qword = (idx << 28) | + IR3_BINDLESS_IMAGE_OFFSET * FDL6_TEX_CONST_DWORDS, + )); } } - return ring; + return cs.ring(); } FD_GENX(fd6_build_bindless_state); diff --git a/src/gallium/drivers/freedreno/a6xx/fd6_program.cc b/src/gallium/drivers/freedreno/a6xx/fd6_program.cc index 7ae68c07af1..ea06b3b47b6 100644 --- a/src/gallium/drivers/freedreno/a6xx/fd6_program.cc +++ b/src/gallium/drivers/freedreno/a6xx/fd6_program.cc @@ -43,64 +43,181 @@ struct program_builder { }; template -struct xs_config { - uint16_t reg_sp_xs_instrlen; - uint16_t reg_hlsq_xs_ctrl; - uint16_t reg_sp_xs_first_exec_offset; - uint16_t reg_sp_xs_pvt_mem_hw_stack_offset; - uint16_t reg_sp_xs_vgpr_config; -}; +static void +emit_shader_regs(struct fd_context *ctx, fd_cs &cs, const struct ir3_shader_variant *so) +{ + fd_crb crb(cs, 11); -template -static const struct xs_config xs_configs[] = { - [MESA_SHADER_VERTEX] = { - REG_A6XX_SP_VS_INSTR_SIZE, - CHIP == A6XX ? REG_A6XX_SP_VS_CONST_CONFIG : REG_A7XX_SP_VS_CONST_CONFIG, - REG_A6XX_SP_VS_PROGRAM_COUNTER_OFFSET, - REG_A6XX_SP_VS_PVT_MEM_STACK_OFFSET, - REG_A7XX_SP_VS_VGS_CNTL, - }, - [MESA_SHADER_TESS_CTRL] = { - REG_A6XX_SP_HS_INSTR_SIZE, - CHIP == A6XX ? REG_A6XX_SP_HS_CONST_CONFIG : REG_A7XX_SP_HS_CONST_CONFIG, - REG_A6XX_SP_HS_PROGRAM_COUNTER_OFFSET, - REG_A6XX_SP_HS_PVT_MEM_STACK_OFFSET, - REG_A7XX_SP_HS_VGS_CNTL, - }, - [MESA_SHADER_TESS_EVAL] = { - REG_A6XX_SP_DS_INSTR_SIZE, - CHIP == A6XX ? 
REG_A6XX_SP_DS_CONST_CONFIG : REG_A7XX_SP_DS_CONST_CONFIG, - REG_A6XX_SP_DS_PROGRAM_COUNTER_OFFSET, - REG_A6XX_SP_DS_PVT_MEM_STACK_OFFSET, - REG_A7XX_SP_DS_VGS_CNTL, - }, - [MESA_SHADER_GEOMETRY] = { - REG_A6XX_SP_GS_INSTR_SIZE, - CHIP == A6XX ? REG_A6XX_SP_GS_CONST_CONFIG : REG_A7XX_SP_GS_CONST_CONFIG, - REG_A6XX_SP_GS_PROGRAM_COUNTER_OFFSET, - REG_A6XX_SP_GS_PVT_MEM_STACK_OFFSET, - REG_A7XX_SP_GS_VGS_CNTL, - }, - [MESA_SHADER_FRAGMENT] = { - REG_A6XX_SP_PS_INSTR_SIZE, - CHIP == A6XX ? REG_A6XX_SP_PS_CONST_CONFIG : REG_A7XX_SP_PS_CONST_CONFIG, - REG_A6XX_SP_PS_PROGRAM_COUNTER_OFFSET, - REG_A6XX_SP_PS_PVT_MEM_STACK_OFFSET, - REG_A7XX_SP_PS_VGS_CNTL, - }, - [MESA_SHADER_COMPUTE] = { - REG_A6XX_SP_CS_INSTR_SIZE, - CHIP == A6XX ? REG_A6XX_SP_CS_CONST_CONFIG : REG_A7XX_SP_CS_CONST_CONFIG, - REG_A6XX_SP_CS_PROGRAM_COUNTER_OFFSET, - REG_A6XX_SP_CS_PVT_MEM_STACK_OFFSET, - REG_A7XX_SP_CS_VGS_CNTL, - }, -}; + mesa_shader_stage type = so->type; + if (type == MESA_SHADER_KERNEL) + type = MESA_SHADER_COMPUTE; + + enum a6xx_threadsize thrsz = + so->info.double_threadsize ? THREAD128 : THREAD64; + + ir3_get_private_mem(ctx, so); + + uint32_t per_sp_size = ctx->pvtmem[so->pvtmem_per_wave].per_sp_size; + struct fd_bo *pvtmem_bo = NULL; + + if (so->pvtmem_size > 0) { /* SP_xS_PVT_MEM_ADDR */ + pvtmem_bo = ctx->pvtmem[so->pvtmem_per_wave].bo; + crb.attach_bo(pvtmem_bo); + } + + crb.attach_bo(so->bo); + + switch (type) { + case MESA_SHADER_VERTEX: + crb.add(A6XX_SP_VS_CNTL_0( + .halfregfootprint = so->info.max_half_reg + 1, + .fullregfootprint = so->info.max_reg + 1, + .branchstack = ir3_shader_branchstack_hw(so), + .mergedregs = so->mergedregs, + .earlypreamble = so->early_preamble, + )); + crb.add(A6XX_SP_VS_INSTR_SIZE(so->instrlen)); + crb.add(A6XX_SP_VS_PROGRAM_COUNTER_OFFSET()); + crb.add(A6XX_SP_VS_BASE(so->bo)); + crb.add(A6XX_SP_VS_PVT_MEM_PARAM( + .memsizeperitem = ctx->pvtmem[so->pvtmem_per_wave].per_fiber_size, + )); + crb.add(A6XX_SP_VS_PVT_MEM_BASE(pvtmem_bo)); + crb.add(A6XX_SP_VS_PVT_MEM_SIZE( + .totalpvtmemsize = per_sp_size, + .perwavememlayout = so->pvtmem_per_wave, + )); + crb.add(A6XX_SP_VS_PVT_MEM_STACK_OFFSET(.offset = per_sp_size)); + if (CHIP >= A7XX) + crb.add(A7XX_SP_VS_VGS_CNTL()); + break; + case MESA_SHADER_TESS_CTRL: + crb.add(A6XX_SP_HS_CNTL_0( + .halfregfootprint = so->info.max_half_reg + 1, + .fullregfootprint = so->info.max_reg + 1, + .branchstack = ir3_shader_branchstack_hw(so), + .earlypreamble = so->early_preamble, + )); + crb.add(A6XX_SP_HS_INSTR_SIZE(so->instrlen)); + crb.add(A6XX_SP_HS_PROGRAM_COUNTER_OFFSET()); + crb.add(A6XX_SP_HS_BASE(so->bo)); + crb.add(A6XX_SP_HS_PVT_MEM_PARAM( + .memsizeperitem = ctx->pvtmem[so->pvtmem_per_wave].per_fiber_size, + )); + crb.add(A6XX_SP_HS_PVT_MEM_BASE(pvtmem_bo)); + crb.add(A6XX_SP_HS_PVT_MEM_SIZE( + .totalpvtmemsize = per_sp_size, + .perwavememlayout = so->pvtmem_per_wave, + )); + crb.add(A6XX_SP_HS_PVT_MEM_STACK_OFFSET(.offset = per_sp_size)); + if (CHIP >= A7XX) + crb.add(A7XX_SP_HS_VGS_CNTL()); + break; + case MESA_SHADER_TESS_EVAL: + crb.add(A6XX_SP_DS_CNTL_0( + .halfregfootprint = so->info.max_half_reg + 1, + .fullregfootprint = so->info.max_reg + 1, + .branchstack = ir3_shader_branchstack_hw(so), + .earlypreamble = so->early_preamble, + )); + crb.add(A6XX_SP_DS_INSTR_SIZE(so->instrlen)); + crb.add(A6XX_SP_DS_PROGRAM_COUNTER_OFFSET()); + crb.add(A6XX_SP_DS_BASE(so->bo)); + crb.add(A6XX_SP_DS_PVT_MEM_PARAM( + .memsizeperitem = ctx->pvtmem[so->pvtmem_per_wave].per_fiber_size, + )); + 
crb.add(A6XX_SP_DS_PVT_MEM_BASE(pvtmem_bo)); + crb.add(A6XX_SP_DS_PVT_MEM_SIZE( + .totalpvtmemsize = per_sp_size, + .perwavememlayout = so->pvtmem_per_wave, + )); + crb.add(A6XX_SP_DS_PVT_MEM_STACK_OFFSET(.offset = per_sp_size)); + if (CHIP >= A7XX) + crb.add(A7XX_SP_DS_VGS_CNTL()); + break; + case MESA_SHADER_GEOMETRY: + crb.add(A6XX_SP_GS_CNTL_0( + .halfregfootprint = so->info.max_half_reg + 1, + .fullregfootprint = so->info.max_reg + 1, + .branchstack = ir3_shader_branchstack_hw(so), + .earlypreamble = so->early_preamble, + )); + crb.add(A6XX_SP_GS_INSTR_SIZE(so->instrlen)); + crb.add(A6XX_SP_GS_PROGRAM_COUNTER_OFFSET()); + crb.add(A6XX_SP_GS_BASE(so->bo)); + crb.add(A6XX_SP_GS_PVT_MEM_PARAM( + .memsizeperitem = ctx->pvtmem[so->pvtmem_per_wave].per_fiber_size, + )); + crb.add(A6XX_SP_GS_PVT_MEM_BASE(pvtmem_bo)); + crb.add(A6XX_SP_GS_PVT_MEM_SIZE( + .totalpvtmemsize = per_sp_size, + .perwavememlayout = so->pvtmem_per_wave, + )); + crb.add(A6XX_SP_GS_PVT_MEM_STACK_OFFSET(.offset = per_sp_size)); + if (CHIP >= A7XX) + crb.add(A7XX_SP_GS_VGS_CNTL()); + break; + case MESA_SHADER_FRAGMENT: + crb.add(A6XX_SP_PS_CNTL_0( + .halfregfootprint = so->info.max_half_reg + 1, + .fullregfootprint = so->info.max_reg + 1, + .branchstack = ir3_shader_branchstack_hw(so), + .threadsize = thrsz, + .varying = so->total_in != 0, + .lodpixmask = so->need_full_quad, + .inoutregoverlap = true, + .pixlodenable = so->need_pixlod, + .earlypreamble = so->early_preamble, + .mergedregs = so->mergedregs, + )); + crb.add(A6XX_SP_PS_INSTR_SIZE(so->instrlen)); + crb.add(A6XX_SP_PS_PROGRAM_COUNTER_OFFSET()); + crb.add(A6XX_SP_PS_BASE(so->bo)); + crb.add(A6XX_SP_PS_PVT_MEM_PARAM( + .memsizeperitem = ctx->pvtmem[so->pvtmem_per_wave].per_fiber_size, + )); + crb.add(A6XX_SP_PS_PVT_MEM_BASE(pvtmem_bo)); + crb.add(A6XX_SP_PS_PVT_MEM_SIZE( + .totalpvtmemsize = per_sp_size, + .perwavememlayout = so->pvtmem_per_wave, + )); + crb.add(A6XX_SP_PS_PVT_MEM_STACK_OFFSET(.offset = per_sp_size)); + if (CHIP >= A7XX) + crb.add(A7XX_SP_PS_VGS_CNTL()); + break; + case MESA_SHADER_COMPUTE: + thrsz = ctx->screen->info->a6xx.supports_double_threadsize ? 
thrsz : THREAD128; + crb.add(A6XX_SP_CS_CNTL_0( + .halfregfootprint = so->info.max_half_reg + 1, + .fullregfootprint = so->info.max_reg + 1, + .branchstack = ir3_shader_branchstack_hw(so), + .threadsize = thrsz, + .earlypreamble = so->early_preamble, + .mergedregs = so->mergedregs, + )); + crb.add(A6XX_SP_CS_INSTR_SIZE(so->instrlen)); + crb.add(A6XX_SP_CS_PROGRAM_COUNTER_OFFSET()); + crb.add(A6XX_SP_CS_BASE(so->bo)); + crb.add(A6XX_SP_CS_PVT_MEM_PARAM( + .memsizeperitem = ctx->pvtmem[so->pvtmem_per_wave].per_fiber_size, + )); + crb.add(A6XX_SP_CS_PVT_MEM_BASE(pvtmem_bo)); + crb.add(A6XX_SP_CS_PVT_MEM_SIZE( + .totalpvtmemsize = per_sp_size, + .perwavememlayout = so->pvtmem_per_wave, + )); + crb.add(A6XX_SP_CS_PVT_MEM_STACK_OFFSET(.offset = per_sp_size)); + if (CHIP >= A7XX) + crb.add(A7XX_SP_CS_VGS_CNTL()); + break; + default: + UNREACHABLE("bad shader stage"); + } +} template void -fd6_emit_shader(struct fd_context *ctx, struct fd_ringbuffer *ring, - const struct ir3_shader_variant *so) +fd6_emit_shader(struct fd_context *ctx, fd_cs &cs, const struct ir3_shader_variant *so) { if (!so) { /* shader stage disabled */ @@ -111,131 +228,26 @@ fd6_emit_shader(struct fd_context *ctx, struct fd_ringbuffer *ring, /* Name should generally match what you get with MESA_SHADER_CAPTURE_PATH: */ const char *name = so->name; if (name) - fd_emit_string5(ring, name, strlen(name)); + fd_emit_string5(cs.ring(), name, strlen(name)); #endif - mesa_shader_stage type = so->type; - if (type == MESA_SHADER_KERNEL) - type = MESA_SHADER_COMPUTE; - - enum a6xx_threadsize thrsz = - so->info.double_threadsize ? THREAD128 : THREAD64; - - switch (type) { - case MESA_SHADER_VERTEX: - OUT_REG(ring, A6XX_SP_VS_CNTL_0( - .halfregfootprint = so->info.max_half_reg + 1, - .fullregfootprint = so->info.max_reg + 1, - .branchstack = ir3_shader_branchstack_hw(so), - .mergedregs = so->mergedregs, - .earlypreamble = so->early_preamble, - )); - break; - case MESA_SHADER_TESS_CTRL: - OUT_REG(ring, A6XX_SP_HS_CNTL_0( - .halfregfootprint = so->info.max_half_reg + 1, - .fullregfootprint = so->info.max_reg + 1, - .branchstack = ir3_shader_branchstack_hw(so), - .earlypreamble = so->early_preamble, - )); - break; - case MESA_SHADER_TESS_EVAL: - OUT_REG(ring, A6XX_SP_DS_CNTL_0( - .halfregfootprint = so->info.max_half_reg + 1, - .fullregfootprint = so->info.max_reg + 1, - .branchstack = ir3_shader_branchstack_hw(so), - .earlypreamble = so->early_preamble, - )); - break; - case MESA_SHADER_GEOMETRY: - OUT_REG(ring, A6XX_SP_GS_CNTL_0( - .halfregfootprint = so->info.max_half_reg + 1, - .fullregfootprint = so->info.max_reg + 1, - .branchstack = ir3_shader_branchstack_hw(so), - .earlypreamble = so->early_preamble, - )); - break; - case MESA_SHADER_FRAGMENT: - OUT_REG(ring, A6XX_SP_PS_CNTL_0( - .halfregfootprint = so->info.max_half_reg + 1, - .fullregfootprint = so->info.max_reg + 1, - .branchstack = ir3_shader_branchstack_hw(so), - .threadsize = thrsz, - .varying = so->total_in != 0, - .lodpixmask = so->need_full_quad, - .inoutregoverlap = true, - .pixlodenable = so->need_pixlod, - .earlypreamble = so->early_preamble, - .mergedregs = so->mergedregs, - )); - break; - case MESA_SHADER_COMPUTE: - thrsz = ctx->screen->info->a6xx.supports_double_threadsize ? 
thrsz : THREAD128; - OUT_REG(ring, A6XX_SP_CS_CNTL_0( - .halfregfootprint = so->info.max_half_reg + 1, - .fullregfootprint = so->info.max_reg + 1, - .branchstack = ir3_shader_branchstack_hw(so), - .threadsize = thrsz, - .earlypreamble = so->early_preamble, - .mergedregs = so->mergedregs, - )); - break; - default: - UNREACHABLE("bad shader stage"); - } - - const struct xs_config *cfg = &xs_configs[type]; - - OUT_PKT4(ring, cfg->reg_sp_xs_instrlen, 1); - OUT_RING(ring, so->instrlen); - - /* emit program binary & private memory layout - */ - - ir3_get_private_mem(ctx, so); - - uint32_t per_sp_size = ctx->pvtmem[so->pvtmem_per_wave].per_sp_size; - - fd_ringbuffer_attach_bo(ring, so->bo); - - OUT_PKT4(ring, cfg->reg_sp_xs_first_exec_offset, 7); - OUT_RING(ring, 0); /* SP_xS_OBJ_FIRST_EXEC_OFFSET */ - OUT_RELOC(ring, so->bo, 0, 0, 0); /* SP_xS_OBJ_START_LO */ - OUT_RING(ring, A6XX_SP_VS_PVT_MEM_PARAM_MEMSIZEPERITEM(ctx->pvtmem[so->pvtmem_per_wave].per_fiber_size)); - if (so->pvtmem_size > 0) { /* SP_xS_PVT_MEM_ADDR */ - fd_ringbuffer_attach_bo(ring, ctx->pvtmem[so->pvtmem_per_wave].bo); - OUT_RELOC(ring, ctx->pvtmem[so->pvtmem_per_wave].bo, 0, 0, 0); - } else { - OUT_RING(ring, 0); - OUT_RING(ring, 0); - } - OUT_RING(ring, A6XX_SP_VS_PVT_MEM_SIZE_TOTALPVTMEMSIZE(per_sp_size) | - COND(so->pvtmem_per_wave, - A6XX_SP_VS_PVT_MEM_SIZE_PERWAVEMEMLAYOUT)); - - OUT_PKT4(ring, cfg->reg_sp_xs_pvt_mem_hw_stack_offset, 1); - OUT_RING(ring, A6XX_SP_VS_PVT_MEM_STACK_OFFSET_OFFSET(per_sp_size)); - - if (CHIP >= A7XX) { - OUT_PKT4(ring, cfg->reg_sp_xs_vgpr_config, 1); - OUT_RING(ring, 0); - } + emit_shader_regs(ctx, cs, so); if (CHIP == A6XX) { uint32_t shader_preload_size = MIN2(so->instrlen, ctx->screen->info->a6xx.instr_cache_size); - enum a6xx_state_block sb = fd6_stage2shadersb(so->type); - OUT_PKT7(ring, fd6_stage2opcode(so->type), 3); - OUT_RING(ring, CP_LOAD_STATE6_0_DST_OFF(0) | - CP_LOAD_STATE6_0_STATE_TYPE(ST6_SHADER) | - CP_LOAD_STATE6_0_STATE_SRC(SS6_INDIRECT) | - CP_LOAD_STATE6_0_STATE_BLOCK(sb) | - CP_LOAD_STATE6_0_NUM_UNIT(shader_preload_size)); - OUT_RELOC(ring, so->bo, 0, 0, 0); + fd_pkt7(cs, fd6_stage2opcode(so->type), 3) + .add(CP_LOAD_STATE6_0( + .state_type = ST6_SHADER, + .state_src = SS6_INDIRECT, + .state_block = fd6_stage2shadersb(so->type), + .num_unit = shader_preload_size, + )) + .add(CP_LOAD_STATE6_EXT_SRC_ADDR(.bo = so->bo)); } - fd6_emit_immediates(so, ring); + fd6_emit_immediates(so, cs); } FD_GENX(fd6_emit_shader); @@ -246,26 +258,21 @@ FD_GENX(fd6_emit_shader); static void setup_stream_out_disable(struct fd_context *ctx) { - unsigned sizedw = 4; + unsigned nreg = 2; if (ctx->screen->info->a6xx.tess_use_shared) - sizedw += 2; + nreg++; - struct fd_ringbuffer *ring = - fd_ringbuffer_new_object(ctx->pipe, (1 + sizedw) * 4); + fd_crb crb(ctx->pipe, nreg); - OUT_PKT7(ring, CP_CONTEXT_REG_BUNCH, sizedw); - OUT_RING(ring, REG_A6XX_VPC_SO_MAPPING_WPTR); - OUT_RING(ring, 0); - OUT_RING(ring, REG_A6XX_VPC_SO_CNTL); - OUT_RING(ring, 0); + crb.add(A6XX_VPC_SO_MAPPING_WPTR()); + crb.add(A6XX_VPC_SO_CNTL()); if (ctx->screen->info->a6xx.tess_use_shared) { - OUT_RING(ring, REG_A6XX_PC_DGEN_SO_CNTL); - OUT_RING(ring, 0); + crb.add(A6XX_PC_DGEN_SO_CNTL()); } - fd6_context(ctx)->streamout_disable_stateobj = ring; + fd6_context(ctx)->streamout_disable_stateobj = crb.ring(); } static void @@ -328,43 +335,29 @@ setup_stream_out(struct fd_context *ctx, struct fd6_program_state *state, ctx->screen->info->a6xx.tess_use_shared && v->type == MESA_SHADER_TESS_EVAL; - unsigned sizedw = 10 + (2 * 
prog_count); + unsigned nreg = 5 + prog_count; if (emit_pc_so_stream_cntl) - sizedw += 2; + nreg++; - struct fd_ringbuffer *ring = - fd_ringbuffer_new_object(ctx->pipe, (1 + sizedw) * 4); + fd_crb crb(ctx->pipe, nreg); - OUT_PKT7(ring, CP_CONTEXT_REG_BUNCH, sizedw); - OUT_RING(ring, REG_A6XX_VPC_SO_CNTL); - OUT_RING(ring, - A6XX_VPC_SO_CNTL_STREAM_ENABLE(strmout->streams_written) | - COND(strmout->stride[0] > 0, - A6XX_VPC_SO_CNTL_BUF0_STREAM(1 + strmout->output[0].stream)) | - COND(strmout->stride[1] > 0, - A6XX_VPC_SO_CNTL_BUF1_STREAM(1 + strmout->output[1].stream)) | - COND(strmout->stride[2] > 0, - A6XX_VPC_SO_CNTL_BUF2_STREAM(1 + strmout->output[2].stream)) | - COND(strmout->stride[3] > 0, - A6XX_VPC_SO_CNTL_BUF3_STREAM(1 + strmout->output[3].stream))); - OUT_RING(ring, REG_A6XX_VPC_SO_BUFFER_STRIDE(0)); - OUT_RING(ring, strmout->stride[0]); - OUT_RING(ring, REG_A6XX_VPC_SO_BUFFER_STRIDE(1)); - OUT_RING(ring, strmout->stride[1]); - OUT_RING(ring, REG_A6XX_VPC_SO_BUFFER_STRIDE(2)); - OUT_RING(ring, strmout->stride[2]); - OUT_RING(ring, REG_A6XX_VPC_SO_BUFFER_STRIDE(3)); - OUT_RING(ring, strmout->stride[3]); + crb.add(A6XX_VPC_SO_CNTL( + .buf0_stream = 1 + strmout->output[0].stream, + .buf1_stream = 1 + strmout->output[1].stream, + .buf2_stream = 1 + strmout->output[2].stream, + .buf3_stream = 1 + strmout->output[3].stream, + .stream_enable = strmout->streams_written, + )); + + for (unsigned i = 0; i < 4; i++) + crb.add(A6XX_VPC_SO_BUFFER_STRIDE(i, strmout->stride[i])); bool first = true; BITSET_FOREACH_RANGE (start, end, valid_dwords, A6XX_SO_PROG_DWORDS * IR3_MAX_SO_STREAMS) { - OUT_RING(ring, REG_A6XX_VPC_SO_MAPPING_WPTR); - OUT_RING(ring, COND(first, A6XX_VPC_SO_MAPPING_WPTR_RESET) | - A6XX_VPC_SO_MAPPING_WPTR_ADDR(start)); + crb.add(A6XX_VPC_SO_MAPPING_WPTR(.addr = start, .reset = first)); for (unsigned i = start; i < end; i++) { - OUT_RING(ring, REG_A6XX_VPC_SO_MAPPING_PORT); - OUT_RING(ring, prog[i]); + crb.add(A6XX_VPC_SO_MAPPING_PORT(.dword = prog[i])); } first = false; } @@ -373,11 +366,10 @@ setup_stream_out(struct fd_context *ctx, struct fd6_program_state *state, /* Possibly not tess_use_shared related, but the combination of * tess + xfb fails some tests if we don't emit this. 
*/ - OUT_RING(ring, REG_A6XX_PC_DGEN_SO_CNTL); - OUT_RING(ring, A6XX_PC_DGEN_SO_CNTL_STREAM_ENABLE(0x1)); + crb.add(A6XX_PC_DGEN_SO_CNTL(.stream_enable = true)); } - state->streamout_stateobj = ring; + state->streamout_stateobj = crb.ring(); } static uint32_t @@ -400,60 +392,47 @@ template static void setup_config_stateobj(struct fd_context *ctx, struct fd6_program_state *state) { - struct fd_ringbuffer *ring = fd_ringbuffer_new_object(ctx->pipe, 100 * 4); + fd_crb crb(ctx->pipe, 12); - OUT_REG(ring, SP_UPDATE_CNTL(CHIP, .vs_state = true, .hs_state = true, - .ds_state = true, .gs_state = true, - .fs_state = true, .cs_state = true, - .cs_uav = true, .gfx_uav = true, )); + crb.add(SP_UPDATE_CNTL(CHIP, + .vs_state = true, .hs_state = true, + .ds_state = true, .gs_state = true, + .fs_state = true, .cs_state = true, + .cs_uav = true, .gfx_uav = true, + )); assert(state->vs->constlen >= state->bs->constlen); - OUT_REG(ring, SP_VS_CONST_CONFIG( - CHIP, + crb.add(SP_VS_CONST_CONFIG(CHIP, .constlen = state->vs->constlen, .enabled = true, )); - OUT_REG(ring, SP_HS_CONST_CONFIG( - CHIP, + crb.add(SP_HS_CONST_CONFIG(CHIP, .constlen = COND(state->hs, state->hs->constlen), .enabled = COND(state->hs, true), )); - OUT_REG(ring, SP_DS_CONST_CONFIG( - CHIP, + crb.add(SP_DS_CONST_CONFIG(CHIP, .constlen = COND(state->ds, state->ds->constlen), .enabled = COND(state->ds, true), )); - OUT_REG(ring, SP_GS_CONST_CONFIG( - CHIP, + crb.add(SP_GS_CONST_CONFIG(CHIP, .constlen = COND(state->gs, state->gs->constlen), .enabled = COND(state->gs, true), )); - OUT_REG(ring, SP_PS_CONST_CONFIG( - CHIP, + crb.add(SP_PS_CONST_CONFIG(CHIP, .constlen = state->fs->constlen, .enabled = true, )); - OUT_PKT4(ring, REG_A6XX_SP_VS_CONFIG, 1); - OUT_RING(ring, sp_xs_config(state->vs)); + crb.add(A6XX_SP_VS_CONFIG(.dword = sp_xs_config(state->vs))); + crb.add(A6XX_SP_HS_CONFIG(.dword = sp_xs_config(state->hs))); + crb.add(A6XX_SP_DS_CONFIG(.dword = sp_xs_config(state->ds))); + crb.add(A6XX_SP_GS_CONFIG(.dword = sp_xs_config(state->gs))); + crb.add(A6XX_SP_PS_CONFIG(.dword = sp_xs_config(state->fs))); - OUT_PKT4(ring, REG_A6XX_SP_HS_CONFIG, 1); - OUT_RING(ring, sp_xs_config(state->hs)); + crb.add(A6XX_SP_GFX_USIZE(ir3_shader_num_uavs(state->fs))); - OUT_PKT4(ring, REG_A6XX_SP_DS_CONFIG, 1); - OUT_RING(ring, sp_xs_config(state->ds)); - - OUT_PKT4(ring, REG_A6XX_SP_GS_CONFIG, 1); - OUT_RING(ring, sp_xs_config(state->gs)); - - OUT_PKT4(ring, REG_A6XX_SP_PS_CONFIG, 1); - OUT_RING(ring, sp_xs_config(state->fs)); - - OUT_PKT4(ring, REG_A6XX_SP_GFX_USIZE, 1); - OUT_RING(ring, ir3_shader_num_uavs(state->fs)); - - state->config_stateobj = ring; + state->config_stateobj = crb.ring(); } static inline uint32_t @@ -483,7 +462,7 @@ primitive_to_tess(enum mesa_prim primitive) #define MAX_VERTEX_ATTRIBS 32 static void -emit_vfd_dest(struct fd_ringbuffer *ring, const struct ir3_shader_variant *vs) +emit_vfd_dest(fd_crb &crb, const struct ir3_shader_variant *vs) { uint32_t attr_count = 0; @@ -491,24 +470,23 @@ emit_vfd_dest(struct fd_ringbuffer *ring, const struct ir3_shader_variant *vs) if (!vs->inputs[i].sysval) attr_count++; - OUT_REG(ring, A6XX_VFD_CNTL_0( - .fetch_cnt = attr_count, /* decode_cnt for binning pass ? */ - .decode_cnt = attr_count)); - - if (attr_count) - OUT_PKT4(ring, REG_A6XX_VFD_DEST_CNTL_INSTR(0), attr_count); + crb.add(A6XX_VFD_CNTL_0( + .fetch_cnt = attr_count, /* decode_cnt for binning pass ? 
*/ + .decode_cnt = attr_count + )); for (uint32_t i = 0; i < attr_count; i++) { assert(!vs->inputs[i].sysval); - OUT_RING(ring, - A6XX_VFD_DEST_CNTL_INSTR_WRITEMASK(vs->inputs[i].compmask) | - A6XX_VFD_DEST_CNTL_INSTR_REGID(vs->inputs[i].regid)); + crb.add(A6XX_VFD_DEST_CNTL_INSTR(i, + .writemask = vs->inputs[i].compmask, + .regid = vs->inputs[i].regid, + )); } } +/* nregs: 6 */ static void -emit_vs_system_values(struct fd_ringbuffer *ring, - const struct program_builder *b) +emit_vs_system_values(fd_crb &crb, const struct program_builder *b) { const uint32_t vertexid_regid = ir3_find_sysval_regid(b->vs, SYSTEM_VALUE_VERTEX_ID); @@ -537,26 +515,51 @@ emit_vs_system_values(struct fd_ringbuffer *ring, */ const uint32_t viewid_regid = INVALID_REG; - OUT_PKT4(ring, REG_A6XX_VFD_CNTL_1, 6); - OUT_RING(ring, A6XX_VFD_CNTL_1_REGID4VTX(vertexid_regid) | - A6XX_VFD_CNTL_1_REGID4INST(instanceid_regid) | - A6XX_VFD_CNTL_1_REGID4PRIMID(vs_primitiveid_regid) | - A6XX_VFD_CNTL_1_REGID4VIEWID(viewid_regid)); - OUT_RING(ring, A6XX_VFD_CNTL_2_REGID_HSRELPATCHID(hs_rel_patch_regid) | - A6XX_VFD_CNTL_2_REGID_INVOCATIONID(hs_invocation_regid)); - OUT_RING(ring, A6XX_VFD_CNTL_3_REGID_DSRELPATCHID(ds_rel_patch_regid) | - A6XX_VFD_CNTL_3_REGID_TESSX(tess_coord_x_regid) | - A6XX_VFD_CNTL_3_REGID_TESSY(tess_coord_y_regid) | - A6XX_VFD_CNTL_3_REGID_DSPRIMID(ds_primitiveid_regid)); - OUT_RING(ring, 0x000000fc); /* VFD_CNTL_4 */ - OUT_RING(ring, A6XX_VFD_CNTL_5_REGID_GSHEADER(gsheader_regid) | - 0xfc00); /* VFD_CNTL_5 */ - OUT_RING(ring, COND(b->fs->reads_primid, A6XX_VFD_CNTL_6_PRIMID4PSEN)); /* VFD_CNTL_6 */ + crb.add(A6XX_VFD_CNTL_1( + .regid4vtx = vertexid_regid, + .regid4inst = instanceid_regid, + .regid4primid = vs_primitiveid_regid, + .regid4viewid = viewid_regid, + )); + crb.add(A6XX_VFD_CNTL_2( + .regid_hsrelpatchid = hs_rel_patch_regid, + .regid_invocationid = hs_invocation_regid, + )); + crb.add(A6XX_VFD_CNTL_3( + .regid_dsprimid = ds_primitiveid_regid, + .regid_dsrelpatchid = ds_rel_patch_regid, + .regid_tessx = tess_coord_x_regid, + .regid_tessy = tess_coord_y_regid, + )); + crb.add(A6XX_VFD_CNTL_4(.unk0 = INVALID_REG)); + crb.add(A6XX_VFD_CNTL_5( + .regid_gsheader = gsheader_regid, + .unk8 = INVALID_REG, + )); + crb.add(A6XX_VFD_CNTL_6(.primid4psen = b->fs->reads_primid)); } template static void -emit_vpc(struct fd_ringbuffer *ring, const struct program_builder *b) +emit_linkmap(fd_cs &cs, const struct program_builder *b) +{ + if (b->hs) { + fd6_emit_link_map(b->ctx, cs, b->vs, b->hs); + fd6_emit_link_map(b->ctx, cs, b->hs, b->ds); + } + + if (b->gs) { + if (b->hs) { + fd6_emit_link_map(b->ctx, cs, b->ds, b->gs); + } else { + fd6_emit_link_map(b->ctx, cs, b->vs, b->gs); + } + } +} + +template +static void +emit_vpc(fd_crb &crb, const struct program_builder *b) { const struct ir3_shader_variant *last_shader = b->last_shader; @@ -650,13 +653,10 @@ emit_vpc(struct fd_ringbuffer *ring, const struct program_builder *b) if (do_streamout) ir3_link_stream_out(&linkage, b->last_shader); - emit_vs_system_values(ring, b); + emit_vs_system_values(crb, b); - OUT_PKT4(ring, REG_A6XX_VPC_VARYING_LM_TRANSFER_CNTL_DISABLE(0), 4); - OUT_RING(ring, ~linkage.varmask[0]); - OUT_RING(ring, ~linkage.varmask[1]); - OUT_RING(ring, ~linkage.varmask[2]); - OUT_RING(ring, ~linkage.varmask[3]); + for (unsigned i = 0; i < 4; i++) + crb.add(A6XX_VPC_VARYING_LM_TRANSFER_CNTL_DISABLE(i, ~linkage.varmask[i])); /* a6xx finds position/pointsize at the end */ const uint32_t position_regid = @@ -676,7 +676,6 @@ emit_vpc(struct fd_ringbuffer 
*ring, const struct program_builder *b) uint32_t pointsize_loc = 0xff, position_loc = 0xff, layer_loc = 0xff, view_loc = 0xff; -// XXX replace regid(63,0) with INVALID_REG if (layer_regid != INVALID_REG) { layer_loc = linkage.max_loc; ir3_link_add(&linkage, VARYING_SLOT_LAYER, layer_regid, 0x1, linkage.max_loc); @@ -761,30 +760,36 @@ emit_vpc(struct fd_ringbuffer *ring, const struct program_builder *b) A6XX_SP_VS_VPC_DEST_REG_OUTLOC0(linkage.var[i].loc); } - OUT_PKT4(ring, cfg->reg_sp_xs_out_reg, sp_out_count); - OUT_BUF(ring, sp_out, sp_out_count); + uint32_t *regs = (uint32_t *)sp_out; + for (unsigned i = 0; i < sp_out_count; i++) + crb.add({ cfg->reg_sp_xs_out_reg + i, regs[i] }); - OUT_PKT4(ring, cfg->reg_sp_xs_vpc_dst_reg, sp_vpc_dst_count); - OUT_BUF(ring, sp_vpc_dst, sp_vpc_dst_count); + regs = (uint32_t *)sp_vpc_dst; + for (unsigned i = 0; i < sp_vpc_dst_count; i++) + crb.add({ cfg->reg_sp_xs_vpc_dst_reg + i, regs[i] }); - OUT_PKT4(ring, cfg->reg_vpc_xs_pack, 1); - OUT_RING(ring, A6XX_VPC_VS_CNTL_POSITIONLOC(position_loc) | - A6XX_VPC_VS_CNTL_PSIZELOC(pointsize_loc) | - A6XX_VPC_VS_CNTL_STRIDE_IN_VPC(linkage.max_loc)); + crb.add({ cfg->reg_vpc_xs_pack, + A6XX_VPC_VS_CNTL_POSITIONLOC(position_loc) | + A6XX_VPC_VS_CNTL_PSIZELOC(pointsize_loc) | + A6XX_VPC_VS_CNTL_STRIDE_IN_VPC(linkage.max_loc) + }); - OUT_PKT4(ring, cfg->reg_vpc_xs_clip_cntl, 1); - OUT_RING(ring, A6XX_VPC_VS_CLIP_CULL_CNTL_CLIP_MASK(clip_cull_mask) | - A6XX_VPC_VS_CLIP_CULL_CNTL_CLIP_DIST_03_LOC(clip0_loc) | - A6XX_VPC_VS_CLIP_CULL_CNTL_CLIP_DIST_47_LOC(clip1_loc)); + crb.add({ cfg->reg_vpc_xs_clip_cntl, + A6XX_VPC_VS_CLIP_CULL_CNTL_CLIP_MASK(clip_cull_mask) | + A6XX_VPC_VS_CLIP_CULL_CNTL_CLIP_DIST_03_LOC(clip0_loc) | + A6XX_VPC_VS_CLIP_CULL_CNTL_CLIP_DIST_47_LOC(clip1_loc) + }); - OUT_PKT4(ring, cfg->reg_vpc_xs_clip_cntl_v2, 1); - OUT_RING(ring, A6XX_VPC_VS_CLIP_CULL_CNTL_CLIP_MASK(clip_cull_mask) | - A6XX_VPC_VS_CLIP_CULL_CNTL_CLIP_DIST_03_LOC(clip0_loc) | - A6XX_VPC_VS_CLIP_CULL_CNTL_CLIP_DIST_47_LOC(clip1_loc)); + crb.add({ cfg->reg_vpc_xs_clip_cntl_v2, + A6XX_VPC_VS_CLIP_CULL_CNTL_CLIP_MASK(clip_cull_mask) | + A6XX_VPC_VS_CLIP_CULL_CNTL_CLIP_DIST_03_LOC(clip0_loc) | + A6XX_VPC_VS_CLIP_CULL_CNTL_CLIP_DIST_47_LOC(clip1_loc) + }); - OUT_PKT4(ring, cfg->reg_gras_xs_cl_cntl, 1); - OUT_RING(ring, A6XX_GRAS_CL_VS_CLIP_CULL_DISTANCE_CLIP_MASK(clip_mask) | - A6XX_GRAS_CL_VS_CLIP_CULL_DISTANCE_CULL_MASK(cull_mask)); + crb.add({ cfg->reg_gras_xs_cl_cntl, + A6XX_GRAS_CL_VS_CLIP_CULL_DISTANCE_CLIP_MASK(clip_mask) | + A6XX_GRAS_CL_VS_CLIP_CULL_DISTANCE_CULL_MASK(cull_mask) + }); const struct ir3_shader_variant *geom_stages[] = { b->vs, b->hs, b->ds, b->gs }; @@ -796,60 +801,59 @@ emit_vpc(struct fd_ringbuffer *ring, const struct program_builder *b) bool primid = shader->type != MESA_SHADER_VERTEX && VALIDREG(ir3_find_sysval_regid(shader, SYSTEM_VALUE_PRIMITIVE_ID)); - OUT_PKT4(ring, reg_config[shader->type].reg_pc_xs_out_cntl, 1); + uint32_t val = COND(primid, A6XX_PC_VS_CNTL_PRIMITIVE_ID); if (shader == last_shader) { - OUT_RING(ring, A6XX_PC_VS_CNTL_STRIDE_IN_VPC(linkage.max_loc) | - CONDREG(pointsize_regid, A6XX_PC_VS_CNTL_PSIZE) | - CONDREG(layer_regid, A6XX_PC_VS_CNTL_LAYER) | - CONDREG(view_regid, A6XX_PC_VS_CNTL_VIEW) | - COND(primid, A6XX_PC_VS_CNTL_PRIMITIVE_ID) | - COND(primid, A6XX_PC_GS_CNTL_PRIMITIVE_ID) | - A6XX_PC_VS_CNTL_CLIP_MASK(clip_cull_mask)); - } else { - OUT_RING(ring, COND(primid, A6XX_PC_VS_CNTL_PRIMITIVE_ID)); + val |= A6XX_PC_VS_CNTL_STRIDE_IN_VPC(linkage.max_loc) | + CONDREG(pointsize_regid, 
A6XX_PC_VS_CNTL_PSIZE) | + CONDREG(layer_regid, A6XX_PC_VS_CNTL_LAYER) | + CONDREG(view_regid, A6XX_PC_VS_CNTL_VIEW) | + COND(primid, A6XX_PC_GS_CNTL_PRIMITIVE_ID) | + A6XX_PC_VS_CNTL_CLIP_MASK(clip_cull_mask); } + crb.add({ reg_config[shader->type].reg_pc_xs_out_cntl, val }); } /* if vertex_flags somehow gets optimized out, you're gonna have a bad time: */ assert(flags_regid != INVALID_REG); - OUT_PKT4(ring, cfg->reg_sp_xs_primitive_cntl, 1); - OUT_RING(ring, A6XX_SP_VS_OUTPUT_CNTL_OUT(linkage.cnt) | - A6XX_SP_GS_OUTPUT_CNTL_FLAGS_REGID(flags_regid)); + crb.add({ cfg->reg_sp_xs_primitive_cntl, + A6XX_SP_VS_OUTPUT_CNTL_OUT(linkage.cnt) | + A6XX_SP_GS_OUTPUT_CNTL_FLAGS_REGID(flags_regid) + }); - OUT_PKT4(ring, cfg->reg_vpc_xs_layer_cntl, 1); - OUT_RING(ring, A6XX_VPC_VS_SIV_CNTL_LAYERLOC(layer_loc) | - A6XX_VPC_VS_SIV_CNTL_VIEWLOC(view_loc) | - A6XX_VPC_VS_SIV_CNTL_SHADINGRATELOC(0xff)); + crb.add({ cfg->reg_vpc_xs_layer_cntl, + A6XX_VPC_VS_SIV_CNTL_LAYERLOC(layer_loc) | + A6XX_VPC_VS_SIV_CNTL_VIEWLOC(view_loc) | + A6XX_VPC_VS_SIV_CNTL_SHADINGRATELOC(0xff) + }); - OUT_PKT4(ring, cfg->reg_vpc_xs_layer_cntl_v2, 1); - OUT_RING(ring, A6XX_VPC_VS_SIV_CNTL_LAYERLOC(layer_loc) | - A6XX_VPC_VS_SIV_CNTL_VIEWLOC(view_loc) | - A6XX_VPC_VS_SIV_CNTL_SHADINGRATELOC(0xff)); + crb.add({ cfg->reg_vpc_xs_layer_cntl_v2, + A6XX_VPC_VS_SIV_CNTL_LAYERLOC(layer_loc) | + A6XX_VPC_VS_SIV_CNTL_VIEWLOC(view_loc) | + A6XX_VPC_VS_SIV_CNTL_SHADINGRATELOC(0xff) + }); - OUT_PKT4(ring, cfg->reg_gras_xs_layer_cntl, 1); - OUT_RING(ring, CONDREG(layer_regid, A6XX_GRAS_SU_VS_SIV_CNTL_WRITES_LAYER) | - CONDREG(view_regid, A6XX_GRAS_SU_VS_SIV_CNTL_WRITES_VIEW)); + crb.add({ cfg->reg_gras_xs_layer_cntl, + CONDREG(layer_regid, A6XX_GRAS_SU_VS_SIV_CNTL_WRITES_LAYER) | + CONDREG(view_regid, A6XX_GRAS_SU_VS_SIV_CNTL_WRITES_VIEW) + }); - OUT_REG(ring, A6XX_PC_PS_CNTL(b->fs->reads_primid)); + crb.add(A6XX_PC_PS_CNTL(b->fs->reads_primid)); if (CHIP >= A7XX) { - OUT_REG(ring, A6XX_GRAS_UNKNOWN_8110(0x2)); - OUT_REG(ring, A7XX_SP_RENDER_CNTL(.fs_disable = false)); + crb.add(A6XX_GRAS_UNKNOWN_8110(0x2)); + crb.add(A7XX_SP_RENDER_CNTL(.fs_disable = false)); } - OUT_PKT4(ring, REG_A6XX_VPC_PS_CNTL, 1); - OUT_RING(ring, A6XX_VPC_PS_CNTL_NUMNONPOSVAR(b->fs->total_in) | - COND(b->fs->total_in, A6XX_VPC_PS_CNTL_VARYING) | - A6XX_VPC_PS_CNTL_PRIMIDLOC(linkage.primid_loc) | - A6XX_VPC_PS_CNTL_VIEWIDLOC(linkage.viewid_loc)); + crb.add(A6XX_VPC_PS_CNTL( + .numnonposvar = b->fs->total_in, + .primidloc = linkage.primid_loc, + .varying = !!b->fs->total_in, + .viewidloc = linkage.viewid_loc, + )); if (b->hs) { - OUT_PKT4(ring, REG_A6XX_PC_HS_PARAM_0, 1); - OUT_RING(ring, b->hs->tess.tcs_vertices_out); - - fd6_emit_link_map(b->ctx, b->vs, b->hs, ring); - fd6_emit_link_map(b->ctx, b->hs, b->ds, ring); + crb.add(A6XX_PC_HS_PARAM_0(b->hs->tess.tcs_vertices_out)); } if (b->gs) { @@ -857,12 +861,6 @@ emit_vpc(struct fd_ringbuffer *ring, const struct program_builder *b) uint32_t prev_stage_output_size = b->ds ? 
b->ds->output_size : b->vs->output_size; - if (b->hs) { - fd6_emit_link_map(b->ctx, b->ds, b->gs, ring); - } else { - fd6_emit_link_map(b->ctx, b->vs, b->gs, ring); - } - vertices_out = MAX2(1, b->gs->gs.vertices_out) - 1; enum a6xx_tess_output output = primitive_to_tess((enum mesa_prim)b->gs->gs.output_primitive); @@ -871,28 +869,24 @@ emit_vpc(struct fd_ringbuffer *ring, const struct program_builder *b) vec4_size = b->gs->gs.vertices_in * DIV_ROUND_UP(prev_stage_output_size, 4); - OUT_PKT4(ring, REG_A6XX_PC_GS_PARAM_0, 1); - OUT_RING(ring, - A6XX_PC_GS_PARAM_0_GS_VERTICES_OUT(vertices_out) | - A6XX_PC_GS_PARAM_0_GS_OUTPUT(output) | - A6XX_PC_GS_PARAM_0_GS_INVOCATIONS(invocations)); + crb.add(A6XX_PC_GS_PARAM_0( + .gs_vertices_out = vertices_out, + .gs_invocations = invocations, + .gs_output = output, + )); if (CHIP >= A7XX) { - OUT_REG(ring, - A7XX_VPC_GS_PARAM_0( - .gs_vertices_out = vertices_out, - .gs_invocations = invocations, - .gs_output = output, - ) - ); + crb.add(A7XX_VPC_GS_PARAM_0( + .gs_vertices_out = vertices_out, + .gs_invocations = invocations, + .gs_output = output, + )); } else { - OUT_PKT4(ring, REG_A6XX_VPC_GS_PARAM, 1); - OUT_RING(ring, 0xff); + crb.add(A6XX_VPC_GS_PARAM(0xff)); } if (CHIP == A6XX) { - OUT_PKT4(ring, REG_A6XX_PC_PRIMITIVE_CNTL_6, 1); - OUT_RING(ring, A6XX_PC_PRIMITIVE_CNTL_6_STRIDE_IN_VPC(vec4_size)); + crb.add(A6XX_PC_PRIMITIVE_CNTL_6(vec4_size)); } uint32_t prim_size = prev_stage_output_size; @@ -901,8 +895,7 @@ emit_vpc(struct fd_ringbuffer *ring, const struct program_builder *b) else if (prim_size == 64) prim_size = 63; - OUT_PKT4(ring, REG_A6XX_SP_GS_CNTL_1, 1); - OUT_RING(ring, prim_size); + crb.add(A6XX_SP_GS_CNTL_1(prim_size)); } } @@ -919,7 +912,7 @@ tex_opc_to_prefetch_cmd(opc_t tex_opc) template static void -emit_fs_inputs(struct fd_ringbuffer *ring, const struct program_builder *b) +emit_fs_inputs(fd_crb &crb, const struct program_builder *b) { const struct ir3_shader_variant *fs = b->fs; uint32_t face_regid, coord_regid, zwcoord_regid, samp_id_regid; @@ -943,76 +936,65 @@ emit_fs_inputs(struct fd_ringbuffer *ring, const struct program_builder *b) ij_regid[fs->prefetch_bary_type] == regid(0, 0)); } - OUT_PKT4(ring, REG_A6XX_SP_PS_INITIAL_TEX_LOAD_CNTL, 1 + fs->num_sampler_prefetch); - OUT_RING(ring, A6XX_SP_PS_INITIAL_TEX_LOAD_CNTL_COUNT(fs->num_sampler_prefetch) | - COND(CHIP >= A7XX, A6XX_SP_PS_INITIAL_TEX_LOAD_CNTL_CONSTSLOTID(0x1ff)) | - COND(CHIP >= A7XX, A6XX_SP_PS_INITIAL_TEX_LOAD_CNTL_CONSTSLOTID4COORD(0x1ff)) | - COND(!VALIDREG(ij_regid[IJ_PERSP_PIXEL]), - A6XX_SP_PS_INITIAL_TEX_LOAD_CNTL_IJ_WRITE_DISABLE) | - COND(fs->prefetch_end_of_quad, - A6XX_SP_PS_INITIAL_TEX_LOAD_CNTL_ENDOFQUAD)); + crb.add(A6XX_SP_PS_INITIAL_TEX_LOAD_CNTL( + .count = fs->num_sampler_prefetch, + .ij_write_disable = !VALIDREG(ij_regid[IJ_PERSP_PIXEL]), + .endofquad = fs->prefetch_end_of_quad, + .constslotid = COND(CHIP >= A7XX, 0x1ff), + .constslotid4coord = COND(CHIP >= A7XX, 0x1ff), + )); + for (int i = 0; i < fs->num_sampler_prefetch; i++) { const struct ir3_sampler_prefetch *prefetch = &fs->sampler_prefetch[i]; - OUT_RING(ring, SP_PS_INITIAL_TEX_LOAD_CMD( - CHIP, i, - .src = prefetch->src, - /* For a7xx, samp_id/tex_id is always in SP_PS_INITIAL_TEX_INDEX_CMD[n] - * even in the non-bindless case (which probably makes the reg name - * wrong) - */ - .samp_id = (CHIP == A6XX) ? prefetch->samp_id : 0, - .tex_id = (CHIP == A6XX) ? 
prefetch->tex_id : 0, - .dst = prefetch->dst, - .wrmask = prefetch->wrmask, - .half = prefetch->half_precision, - .bindless = prefetch->bindless, - .cmd = tex_opc_to_prefetch_cmd(prefetch->tex_opc), - ).value - ); + crb.add(SP_PS_INITIAL_TEX_LOAD_CMD(CHIP, i, + .src = prefetch->src, + /* For a7xx, samp_id/tex_id is always in SP_PS_INITIAL_TEX_INDEX_CMD[n] + * even in the non-bindless case (which probably makes the reg name + * wrong) + */ + .samp_id = (CHIP == A6XX) ? prefetch->samp_id : 0, + .tex_id = (CHIP == A6XX) ? prefetch->tex_id : 0, + .dst = prefetch->dst, + .wrmask = prefetch->wrmask, + .half = prefetch->half_precision, + .bindless = prefetch->bindless, + .cmd = tex_opc_to_prefetch_cmd(prefetch->tex_opc), + )); } if (CHIP == A7XX) { for (int i = 0; i < fs->num_sampler_prefetch; i++) { const struct ir3_sampler_prefetch *prefetch = &fs->sampler_prefetch[i]; - OUT_REG(ring, - A6XX_SP_PS_INITIAL_TEX_INDEX_CMD(i, - .samp_id = prefetch->samp_id, - .tex_id = prefetch->tex_id, - ) - ); + crb.add(A6XX_SP_PS_INITIAL_TEX_INDEX_CMD(i, + .samp_id = prefetch->samp_id, + .tex_id = prefetch->tex_id, + )); } } - OUT_REG(ring, - SP_LB_PARAM_LIMIT(CHIP, - b->ctx->screen->info->a6xx.prim_alloc_threshold), - SP_REG_PROG_ID_0( - CHIP, - .faceregid = face_regid, - .sampleid = samp_id_regid, - .samplemask = smask_in_regid, - .centerrhw = ij_regid[IJ_PERSP_CENTER_RHW], - ), - SP_REG_PROG_ID_1( - CHIP, - .ij_persp_pixel = ij_regid[IJ_PERSP_PIXEL], - .ij_linear_pixel = ij_regid[IJ_LINEAR_PIXEL], - .ij_persp_centroid = ij_regid[IJ_PERSP_CENTROID], - .ij_linear_centroid = ij_regid[IJ_LINEAR_CENTROID], - ), - SP_REG_PROG_ID_2( - CHIP, - .ij_persp_sample = ij_regid[IJ_PERSP_SAMPLE], - .ij_linear_sample = ij_regid[IJ_LINEAR_SAMPLE], - .xycoordregid = coord_regid, - .zwcoordregid = zwcoord_regid, - ), - SP_REG_PROG_ID_3( - CHIP, - .linelengthregid = INVALID_REG, - .foveationqualityregid = INVALID_REG, - ), - ); + crb.add(SP_LB_PARAM_LIMIT(CHIP, b->ctx->screen->info->a6xx.prim_alloc_threshold)); + crb.add(SP_REG_PROG_ID_0(CHIP, + .faceregid = face_regid, + .sampleid = samp_id_regid, + .samplemask = smask_in_regid, + .centerrhw = ij_regid[IJ_PERSP_CENTER_RHW], + )); + crb.add(SP_REG_PROG_ID_1(CHIP, + .ij_persp_pixel = ij_regid[IJ_PERSP_PIXEL], + .ij_linear_pixel = ij_regid[IJ_LINEAR_PIXEL], + .ij_persp_centroid = ij_regid[IJ_PERSP_CENTROID], + .ij_linear_centroid = ij_regid[IJ_LINEAR_CENTROID], + )); + crb.add(SP_REG_PROG_ID_2(CHIP, + .ij_persp_sample = ij_regid[IJ_PERSP_SAMPLE], + .ij_linear_sample = ij_regid[IJ_LINEAR_SAMPLE], + .xycoordregid = coord_regid, + .zwcoordregid = zwcoord_regid, + )); + crb.add(SP_REG_PROG_ID_3(CHIP, + .linelengthregid = INVALID_REG, + .foveationqualityregid = INVALID_REG, + )); if (CHIP >= A7XX) { uint32_t sysval_regs = 0; @@ -1035,23 +1017,18 @@ emit_fs_inputs(struct fd_ringbuffer *ring, const struct program_builder *b) sysval_regs += 2; } - OUT_REG(ring, - A7XX_SP_PS_CNTL_1( - .sysval_regs_count = sysval_regs, - .unk8 = 1, - .unk9 = 1, - ) - ); + crb.add(A7XX_SP_PS_CNTL_1( + .sysval_regs_count = sysval_regs, + .unk8 = 1, + .unk9 = 1, + )); } enum a6xx_threadsize thrsz = fs->info.double_threadsize ? 
THREAD128 : THREAD64; - OUT_REG(ring, - SP_PS_WAVE_CNTL( - CHIP, - .threadsize = thrsz, - .varyings = enable_varyings, - ), - ); + crb.add(SP_PS_WAVE_CNTL(CHIP, + .threadsize = thrsz, + .varyings = enable_varyings, + )); bool need_size = fs->frag_face || fs->fragcoord_compmask != 0; bool need_size_persamp = false; @@ -1062,57 +1039,44 @@ emit_fs_inputs(struct fd_ringbuffer *ring, const struct program_builder *b) need_size = true; } - OUT_PKT4(ring, REG_A6XX_GRAS_CL_INTERP_CNTL, 1); - OUT_RING(ring, - CONDREG(ij_regid[IJ_PERSP_PIXEL], A6XX_GRAS_CL_INTERP_CNTL_IJ_PERSP_PIXEL) | - CONDREG(ij_regid[IJ_PERSP_CENTROID], A6XX_GRAS_CL_INTERP_CNTL_IJ_PERSP_CENTROID) | - CONDREG(ij_regid[IJ_PERSP_SAMPLE], A6XX_GRAS_CL_INTERP_CNTL_IJ_PERSP_SAMPLE) | - CONDREG(ij_regid[IJ_LINEAR_PIXEL], A6XX_GRAS_CL_INTERP_CNTL_IJ_LINEAR_PIXEL) | - CONDREG(ij_regid[IJ_LINEAR_CENTROID], A6XX_GRAS_CL_INTERP_CNTL_IJ_LINEAR_CENTROID) | - CONDREG(ij_regid[IJ_LINEAR_SAMPLE], A6XX_GRAS_CL_INTERP_CNTL_IJ_LINEAR_SAMPLE) | - COND(need_size, A6XX_GRAS_CL_INTERP_CNTL_IJ_LINEAR_PIXEL) | - COND(need_size_persamp, A6XX_GRAS_CL_INTERP_CNTL_IJ_LINEAR_SAMPLE) | - COND(fs->fragcoord_compmask != 0, - A6XX_GRAS_CL_INTERP_CNTL_COORD_MASK(fs->fragcoord_compmask))); - - OUT_PKT4(ring, REG_A6XX_RB_INTERP_CNTL, 2); - OUT_RING(ring, - CONDREG(ij_regid[IJ_PERSP_PIXEL], A6XX_RB_INTERP_CNTL_IJ_PERSP_PIXEL) | - CONDREG(ij_regid[IJ_PERSP_CENTROID], A6XX_RB_INTERP_CNTL_IJ_PERSP_CENTROID) | - CONDREG(ij_regid[IJ_PERSP_SAMPLE], A6XX_RB_INTERP_CNTL_IJ_PERSP_SAMPLE) | - CONDREG(ij_regid[IJ_LINEAR_PIXEL], A6XX_RB_INTERP_CNTL_IJ_LINEAR_PIXEL) | - CONDREG(ij_regid[IJ_LINEAR_CENTROID], A6XX_RB_INTERP_CNTL_IJ_LINEAR_CENTROID) | - CONDREG(ij_regid[IJ_LINEAR_SAMPLE], A6XX_RB_INTERP_CNTL_IJ_LINEAR_SAMPLE) | - COND(need_size, A6XX_RB_INTERP_CNTL_IJ_LINEAR_PIXEL) | - COND(enable_varyings, A6XX_RB_INTERP_CNTL_UNK10) | - COND(need_size_persamp, A6XX_RB_INTERP_CNTL_IJ_LINEAR_SAMPLE) | - COND(fs->fragcoord_compmask != 0, - A6XX_RB_INTERP_CNTL_COORD_MASK(fs->fragcoord_compmask))); - OUT_RING(ring, - A6XX_RB_PS_INPUT_CNTL_FRAGCOORDSAMPLEMODE( - sample_shading ? FRAGCOORD_SAMPLE : FRAGCOORD_CENTER) | - CONDREG(smask_in_regid, A6XX_RB_PS_INPUT_CNTL_SAMPLEMASK) | - CONDREG(samp_id_regid, A6XX_RB_PS_INPUT_CNTL_SAMPLEID) | - CONDREG(ij_regid[IJ_PERSP_CENTER_RHW], A6XX_RB_PS_INPUT_CNTL_CENTERRHW) | - COND(fs->post_depth_coverage, A6XX_RB_PS_INPUT_CNTL_POSTDEPTHCOVERAGE) | - COND(fs->frag_face, A6XX_RB_PS_INPUT_CNTL_FACENESS)); - - OUT_PKT4(ring, REG_A6XX_RB_PS_SAMPLEFREQ_CNTL, 1); - OUT_RING(ring, COND(sample_shading, A6XX_RB_PS_SAMPLEFREQ_CNTL_PER_SAMP_MODE)); - - OUT_PKT4(ring, REG_A6XX_GRAS_LRZ_PS_INPUT_CNTL, 1); - OUT_RING(ring, - CONDREG(samp_id_regid, A6XX_GRAS_LRZ_PS_INPUT_CNTL_SAMPLEID) | - A6XX_GRAS_LRZ_PS_INPUT_CNTL_FRAGCOORDSAMPLEMODE( - sample_shading ? 
FRAGCOORD_SAMPLE : FRAGCOORD_CENTER)); - - OUT_PKT4(ring, REG_A6XX_GRAS_LRZ_PS_SAMPLEFREQ_CNTL, 1); - OUT_RING(ring, COND(sample_shading, A6XX_GRAS_LRZ_PS_SAMPLEFREQ_CNTL_PER_SAMP_MODE)); + crb.add(A6XX_GRAS_CL_INTERP_CNTL( + .ij_persp_pixel = VALIDREG(ij_regid[IJ_PERSP_PIXEL]), + .ij_persp_centroid = VALIDREG(ij_regid[IJ_PERSP_CENTROID]), + .ij_persp_sample = VALIDREG(ij_regid[IJ_PERSP_SAMPLE]), + .ij_linear_pixel = VALIDREG(ij_regid[IJ_LINEAR_PIXEL]) || need_size, + .ij_linear_centroid = VALIDREG(ij_regid[IJ_LINEAR_CENTROID]), + .ij_linear_sample = VALIDREG(ij_regid[IJ_LINEAR_SAMPLE]) || need_size_persamp, + .coord_mask = fs->fragcoord_compmask, + )); + crb.add(A6XX_RB_INTERP_CNTL( + .ij_persp_pixel = VALIDREG(ij_regid[IJ_PERSP_PIXEL]), + .ij_persp_centroid = VALIDREG(ij_regid[IJ_PERSP_CENTROID]), + .ij_persp_sample = VALIDREG(ij_regid[IJ_PERSP_SAMPLE]), + .ij_linear_pixel = VALIDREG(ij_regid[IJ_LINEAR_PIXEL]) || need_size, + .ij_linear_centroid = VALIDREG(ij_regid[IJ_LINEAR_CENTROID]), + .ij_linear_sample = VALIDREG(ij_regid[IJ_LINEAR_SAMPLE]) || need_size_persamp, + .coord_mask = fs->fragcoord_compmask, + .unk10 = enable_varyings, + )); + crb.add(A6XX_RB_PS_INPUT_CNTL( + .samplemask = VALIDREG(smask_in_regid), + .postdepthcoverage = fs->post_depth_coverage, + .faceness = fs->frag_face, + .sampleid = VALIDREG(samp_id_regid), + .fragcoordsamplemode = sample_shading ? FRAGCOORD_SAMPLE : FRAGCOORD_CENTER, + .centerrhw = VALIDREG(ij_regid[IJ_PERSP_CENTER_RHW]) + )); + crb.add(A6XX_RB_PS_SAMPLEFREQ_CNTL(sample_shading)); + crb.add(A6XX_GRAS_LRZ_PS_INPUT_CNTL( + .sampleid = VALIDREG(samp_id_regid), + .fragcoordsamplemode = sample_shading ? FRAGCOORD_SAMPLE : FRAGCOORD_CENTER, + )); + crb.add(A6XX_GRAS_LRZ_PS_SAMPLEFREQ_CNTL(sample_shading)); } template static void -emit_fs_outputs(struct fd_ringbuffer *ring, const struct program_builder *b) +emit_fs_outputs(fd_crb &crb, const struct program_builder *b) { const struct ir3_shader_variant *fs = b->fs; uint32_t smask_regid, posz_regid, stencilref_regid; @@ -1153,17 +1117,18 @@ emit_fs_outputs(struct fd_ringbuffer *ring, const struct program_builder *b) } } - OUT_PKT4(ring, REG_A6XX_SP_PS_OUTPUT_CNTL, 1); - OUT_RING(ring, A6XX_SP_PS_OUTPUT_CNTL_DEPTH_REGID(posz_regid) | - A6XX_SP_PS_OUTPUT_CNTL_SAMPMASK_REGID(smask_regid) | - A6XX_SP_PS_OUTPUT_CNTL_STENCILREF_REGID(stencilref_regid) | - COND(fs->dual_src_blend, A6XX_SP_PS_OUTPUT_CNTL_DUAL_COLOR_IN_ENABLE)); + crb.add(A6XX_SP_PS_OUTPUT_CNTL( + .dual_color_in_enable = fs->dual_src_blend, + .depth_regid = posz_regid, + .sampmask_regid = smask_regid, + .stencilref_regid = stencilref_regid, + )); - OUT_PKT4(ring, REG_A6XX_SP_PS_OUTPUT_REG(0), output_reg_count); for (uint32_t i = 0; i < output_reg_count; i++) { - OUT_RING(ring, A6XX_SP_PS_OUTPUT_REG_REGID(fragdata_regid[i]) | - COND(fragdata_regid[i] & HALF_REG_ID, - A6XX_SP_PS_OUTPUT_REG_HALF_PRECISION)); + crb.add(A6XX_SP_PS_OUTPUT_REG(i, + .regid = fragdata_regid[i] & ~HALF_REG_ID, + .half_precision = fragdata_regid[i] & HALF_REG_ID, + )); if (VALIDREG(fragdata_regid[i]) || (fragdata_aliased_components & (0xf << (i * 4)))) { @@ -1172,11 +1137,8 @@ emit_fs_outputs(struct fd_ringbuffer *ring, const struct program_builder *b) } if (CHIP >= A7XX) { - OUT_REG( - ring, - A7XX_SP_PS_OUTPUT_CONST_CNTL( - .enabled = fragdata_aliased_components != 0), - A7XX_SP_PS_OUTPUT_CONST_MASK(.dword = fragdata_aliased_components)); + crb.add(A7XX_SP_PS_OUTPUT_CONST_CNTL(.enabled = fragdata_aliased_components != 0)); + crb.add(A7XX_SP_PS_OUTPUT_CONST_MASK(.dword = 
fragdata_aliased_components)); } else { assert(fragdata_aliased_components == 0); } @@ -1184,25 +1146,27 @@ emit_fs_outputs(struct fd_ringbuffer *ring, const struct program_builder *b) template static void -setup_stateobj(struct fd_ringbuffer *ring, const struct program_builder *b) +setup_stateobj(fd_cs &cs, const struct program_builder *b) assert_dt { - fd6_emit_shader(b->ctx, ring, b->vs); - fd6_emit_shader(b->ctx, ring, b->hs); - fd6_emit_shader(b->ctx, ring, b->ds); - fd6_emit_shader(b->ctx, ring, b->gs); + fd6_emit_shader(b->ctx, cs, b->vs); + fd6_emit_shader(b->ctx, cs, b->hs); + fd6_emit_shader(b->ctx, cs, b->ds); + fd6_emit_shader(b->ctx, cs, b->gs); if (!b->binning_pass) - fd6_emit_shader(b->ctx, ring, b->fs); + fd6_emit_shader(b->ctx, cs, b->fs); - OUT_PKT4(ring, REG_A6XX_PC_STEREO_RENDERING_CNTL, 1); - OUT_RING(ring, 0); + emit_linkmap(cs, b); - emit_vfd_dest(ring, b->vs); + fd_crb crb(cs, 100); - emit_vpc(ring, b); + crb.add(A6XX_PC_STEREO_RENDERING_CNTL()); - emit_fs_inputs(ring, b); - emit_fs_outputs(ring, b); + emit_vfd_dest(crb, b->vs); + emit_vpc(crb, b); + + emit_fs_inputs(crb, b); + emit_fs_outputs(crb, b); if (b->hs) { uint32_t patch_control_points = b->key->patch_vertices; @@ -1211,8 +1175,7 @@ setup_stateobj(struct fd_ringbuffer *ring, const struct program_builder *b) patch_control_points * b->vs->output_size / 4; /* Total attribute slots in HS incoming patch. */ - OUT_PKT4(ring, REG_A6XX_PC_HS_PARAM_1, 1); - OUT_RING(ring, patch_local_mem_size_16b); + crb.add(A6XX_PC_HS_PARAM_1(patch_local_mem_size_16b)); const uint32_t wavesize = 64; const uint32_t vs_hs_local_mem_size = 16384; @@ -1239,8 +1202,7 @@ setup_stateobj(struct fd_ringbuffer *ring, const struct program_builder *b) uint32_t wave_input_size = DIV_ROUND_UP( patches_per_wave * patch_local_mem_size_16b * 16, 256); - OUT_PKT4(ring, REG_A6XX_SP_HS_CNTL_1, 1); - OUT_RING(ring, wave_input_size); + crb.add(A6XX_SP_HS_CNTL_1(wave_input_size)); enum a6xx_tess_output output; if (b->ds->tess.point_mode) @@ -1252,27 +1214,25 @@ setup_stateobj(struct fd_ringbuffer *ring, const struct program_builder *b) else output = TESS_CW_TRIS; - OUT_PKT4(ring, REG_A6XX_PC_DS_PARAM, 1); - OUT_RING(ring, A6XX_PC_DS_PARAM_SPACING( - fd6_gl2spacing(b->ds->tess.spacing)) | - A6XX_PC_DS_PARAM_OUTPUT(output)); + crb.add(A6XX_PC_DS_PARAM( + .spacing = fd6_gl2spacing(b->ds->tess.spacing), + .output = output, + )); } } -static void emit_interp_state(struct fd_ringbuffer *ring, - const struct fd6_program_state *state, - bool rasterflat, - bool sprite_coord_mode, +static void emit_interp_state(fd_crb &crb, const struct fd6_program_state *state, + bool rasterflat, bool sprite_coord_mode, uint32_t sprite_coord_enable); static struct fd_ringbuffer * create_interp_stateobj(struct fd_context *ctx, struct fd6_program_state *state) { - struct fd_ringbuffer *ring = fd_ringbuffer_new_object(ctx->pipe, 18 * 4); + fd_crb crb(ctx->pipe, 16); - emit_interp_state(ring, state, false, false, 0); + emit_interp_state(crb, state, false, false, 0); - return ring; + return crb.ring(); } /* build the program streaming state which is not part of the pre- @@ -1288,18 +1248,17 @@ fd6_program_interp_state(struct fd6_emit *emit) /* fastpath: */ return fd_ringbuffer_ref(state->interp_stateobj); } else { - struct fd_ringbuffer *ring = fd_submit_new_ringbuffer( - emit->ctx->batch->submit, 18 * 4, FD_RINGBUFFER_STREAMING); + fd_crb crb(emit->ctx->batch->submit, 16); - emit_interp_state(ring, state, emit->rasterflat, + emit_interp_state(crb, state, emit->rasterflat, 
emit->sprite_coord_mode, emit->sprite_coord_enable); - return ring; + return crb.ring(); } } static void -emit_interp_state(struct fd_ringbuffer *ring, const struct fd6_program_state *state, +emit_interp_state(fd_crb &crb, const struct fd6_program_state *state, bool rasterflat, bool sprite_coord_mode, uint32_t sprite_coord_enable) { @@ -1372,13 +1331,11 @@ emit_interp_state(struct fd_ringbuffer *ring, const struct fd6_program_state *st } } - OUT_PKT4(ring, REG_A6XX_VPC_VARYING_INTERP_MODE_MODE(0), 8); for (int i = 0; i < 8; i++) - OUT_RING(ring, vinterp[i]); /* VPC_VARYING_INTERP_MODE[i].MODE */ + crb.add(A6XX_VPC_VARYING_INTERP_MODE_MODE(i, vinterp[i])); - OUT_PKT4(ring, REG_A6XX_VPC_VARYING_REPLACE_MODE_MODE(0), 8); for (int i = 0; i < 8; i++) - OUT_RING(ring, vpsrepl[i]); /* VPC_VARYING_REPLACE_MODE[i] */ + crb.add(A6XX_VPC_VARYING_REPLACE_MODE_MODE(i, vpsrepl[i])); } template @@ -1465,7 +1422,8 @@ fd6_program_create(void *data, const struct ir3_shader_variant *bs, last_shader : state->bs; b.binning_pass = true; - setup_stateobj(state->binning_stateobj, &b); + fd_cs binning_cs(state->binning_stateobj); + setup_stateobj(binning_cs, &b); /* * Setup draw pass program state: @@ -1475,7 +1433,8 @@ fd6_program_create(void *data, const struct ir3_shader_variant *bs, b.last_shader = last_shader; b.binning_pass = false; - setup_stateobj(state->stateobj, &b); + fd_cs cs(state->stateobj); + setup_stateobj(cs, &b); state->interp_stateobj = create_interp_stateobj(ctx, state); diff --git a/src/gallium/drivers/freedreno/a6xx/fd6_program.h b/src/gallium/drivers/freedreno/a6xx/fd6_program.h index 2dc7f3217df..820d1cd68ab 100644 --- a/src/gallium/drivers/freedreno/a6xx/fd6_program.h +++ b/src/gallium/drivers/freedreno/a6xx/fd6_program.h @@ -16,6 +16,8 @@ #include "ir3/ir3_shader.h" #include "ir3_cache.h" +class fd_cs; +class fd_crb; struct fd6_emit; struct fd6_program_state { @@ -87,7 +89,7 @@ fd6_last_shader(const struct fd6_program_state *state) } template -void fd6_emit_shader(struct fd_context *ctx, struct fd_ringbuffer *ring, +void fd6_emit_shader(struct fd_context *ctx, fd_cs &cs, const struct ir3_shader_variant *so) assert_dt; struct fd_ringbuffer *fd6_program_interp_state(struct fd6_emit *emit) assert_dt; diff --git a/src/gallium/drivers/freedreno/a6xx/fd6_query.cc b/src/gallium/drivers/freedreno/a6xx/fd6_query.cc index 0b19d1e492c..92802ff12cd 100644 --- a/src/gallium/drivers/freedreno/a6xx/fd6_query.cc +++ b/src/gallium/drivers/freedreno/a6xx/fd6_query.cc @@ -59,40 +59,39 @@ static void occlusion_resume(struct fd_acc_query *aq, struct fd_batch *batch) { struct fd_context *ctx = batch->ctx; - struct fd_ringbuffer *ring = batch->draw; + fd_cs cs(batch->draw); ASSERT_ALIGNED(struct fd6_query_sample, start, 16); - OUT_PKT4(ring, REG_A6XX_RB_SAMPLE_COUNTER_CNTL, 1); - OUT_RING(ring, A6XX_RB_SAMPLE_COUNTER_CNTL_COPY); + fd_pkt4(cs, 1) + .add(A6XX_RB_SAMPLE_COUNTER_CNTL(.copy = true)); if (!ctx->screen->info->a7xx.has_event_write_sample_count) { - OUT_PKT4(ring, REG_A6XX_RB_SAMPLE_COUNTER_BASE, 2); - OUT_RELOC(ring, query_sample(aq, start)); + fd_pkt4(cs, 2) + .add(A6XX_RB_SAMPLE_COUNTER_BASE(query_sample(aq, start))); - fd6_event_write(ctx, ring, FD_ZPASS_DONE); + fd6_event_write(ctx, cs, FD_ZPASS_DONE); /* Copied from blob's cmdstream, not sure why it is done. 
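 * Perhaps it flushes the just-written ZPASS sample counts out of the depth
 * CCU so the CP sees coherent values, but that is only a guess.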
*/ if (CHIP == A7XX) { - fd6_event_write(ctx, ring, FD_CCU_CLEAN_DEPTH); + fd6_event_write(ctx, cs, FD_CCU_CLEAN_DEPTH); } } else { - OUT_PKT(ring, CP_EVENT_WRITE7, - CP_EVENT_WRITE7_0( + fd_pkt7(cs, CP_EVENT_WRITE7, 3) + .add(CP_EVENT_WRITE7_0( .event = ZPASS_DONE, .write_sample_count = true, - ), - EV_DST_RAM_CP_EVENT_WRITE7_1(query_sample(aq, start)), - ); + )) + .add(EV_DST_RAM_CP_EVENT_WRITE7_1(query_sample(aq, start))); + + fd_pkt7(cs, CP_EVENT_WRITE7, 3) + .add(CP_EVENT_WRITE7_0( .event = ZPASS_DONE, .write_sample_count = true, .sample_count_end_offset = true, .write_accum_sample_count_diff = true, - ), - EV_DST_RAM_CP_EVENT_WRITE7_1(query_sample(aq, start)), - ); + )) + .add(EV_DST_RAM_CP_EVENT_WRITE7_1(query_sample(aq, start))); } ctx->occlusion_queries_active++; @@ -108,63 +107,62 @@ static void occlusion_pause(struct fd_acc_query *aq, struct fd_batch *batch) assert_dt { struct fd_context *ctx = batch->ctx; - struct fd_ringbuffer *ring = batch->draw; + fd_cs cs(batch->draw); if (!ctx->screen->info->a7xx.has_event_write_sample_count) { - OUT_PKT7(ring, CP_MEM_WRITE, 4); - OUT_RELOC(ring, query_sample(aq, stop)); - OUT_RING(ring, 0xffffffff); - OUT_RING(ring, 0xffffffff); + fd_pkt7(cs, CP_MEM_WRITE, 4) + .add(CP_MEM_WRITE_ADDR(query_sample(aq, stop))) + .add(0xffffffff) + .add(0xffffffff); - OUT_PKT7(ring, CP_WAIT_MEM_WRITES, 0); + fd_pkt7(cs, CP_WAIT_MEM_WRITES, 0); } - OUT_PKT4(ring, REG_A6XX_RB_SAMPLE_COUNTER_CNTL, 1); - OUT_RING(ring, A6XX_RB_SAMPLE_COUNTER_CNTL_COPY); + fd_pkt4(cs, 1) + .add(A6XX_RB_SAMPLE_COUNTER_CNTL(.copy = true)); ASSERT_ALIGNED(struct fd6_query_sample, stop, 16); if (!ctx->screen->info->a7xx.has_event_write_sample_count) { - OUT_PKT4(ring, REG_A6XX_RB_SAMPLE_COUNTER_BASE, 2); - OUT_RELOC(ring, query_sample(aq, stop)); + fd_pkt4(cs, 2) + .add(A6XX_RB_SAMPLE_COUNTER_BASE(query_sample(aq, stop))); - fd6_event_write(batch->ctx, ring, FD_ZPASS_DONE); /* To avoid stalling in the draw buffer, emit the code to compute the * counter delta in the epilogue ring. 
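 * The stop field was pre-seeded with 0xffffffff by the CP_MEM_WRITE above,
 * so the epilogue can poll until the deferred ZPASS_DONE write lands and
 * then fold in the delta. A rough sketch of what the packets below do:
 *
 *    while (*stop == 0xffffffff) { }   // CP_WAIT_REG_MEM, WRITE_NE
 *    *result += *stop - *start;        // CP_MEM_TO_MEM, DOUBLE | NEG_C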
*/ - struct fd_ringbuffer *epilogue = fd_batch_get_tile_epilogue(batch); + fd_cs epilogue(fd_batch_get_tile_epilogue(batch)); - OUT_PKT7(epilogue, CP_WAIT_REG_MEM, 6); - OUT_RING(epilogue, CP_WAIT_REG_MEM_0_FUNCTION(WRITE_NE) | - CP_WAIT_REG_MEM_0_POLL(POLL_MEMORY)); - OUT_RELOC(epilogue, query_sample(aq, stop)); - OUT_RING(epilogue, CP_WAIT_REG_MEM_3_REF(0xffffffff)); - OUT_RING(epilogue, CP_WAIT_REG_MEM_4_MASK(0xffffffff)); - OUT_RING(epilogue, CP_WAIT_REG_MEM_5_DELAY_LOOP_CYCLES(16)); + fd_pkt7(epilogue, CP_WAIT_REG_MEM, 6) + .add(CP_WAIT_REG_MEM_0(.function = WRITE_NE, .poll = POLL_MEMORY)) + .add(CP_WAIT_REG_MEM_POLL_ADDR(query_sample(aq, stop))) + .add(CP_WAIT_REG_MEM_3(.ref = 0xffffffff)) + .add(CP_WAIT_REG_MEM_4(.mask = 0xffffffff)) + .add(CP_WAIT_REG_MEM_5(.delay_loop_cycles = 16)); /* result += stop - start: */ - OUT_PKT7(epilogue, CP_MEM_TO_MEM, 9); - OUT_RING(epilogue, CP_MEM_TO_MEM_0_DOUBLE | CP_MEM_TO_MEM_0_NEG_C); - OUT_RELOC(epilogue, query_sample(aq, result)); /* dst */ - OUT_RELOC(epilogue, query_sample(aq, result)); /* srcA */ - OUT_RELOC(epilogue, query_sample(aq, stop)); /* srcB */ - OUT_RELOC(epilogue, query_sample(aq, start)); /* srcC */ + fd_pkt7(epilogue, CP_MEM_TO_MEM, 9) + .add(CP_MEM_TO_MEM_0(.neg_c = true, ._double = true)) + .add(CP_MEM_TO_MEM_DST(query_sample(aq, result))) + .add(CP_MEM_TO_MEM_SRC_A(query_sample(aq, result))) + .add(CP_MEM_TO_MEM_SRC_B(query_sample(aq, stop))) + .add(CP_MEM_TO_MEM_SRC_C(query_sample(aq, start))); } else { - OUT_PKT(ring, CP_EVENT_WRITE7, - CP_EVENT_WRITE7_0( + fd_pkt7(cs, CP_EVENT_WRITE7, 3) + .add(CP_EVENT_WRITE7_0( .event = ZPASS_DONE, .write_sample_count = true, - ), - EV_DST_RAM_CP_EVENT_WRITE7_1(query_sample(aq, stop)), - ); + )) + .add(EV_DST_RAM_CP_EVENT_WRITE7_1(query_sample(aq, stop))); + + fd_pkt7(cs, CP_EVENT_WRITE7, 3) + .add(CP_EVENT_WRITE7_0( .event = ZPASS_DONE, .write_sample_count = true, .sample_count_end_offset = true, .write_accum_sample_count_diff = true, - ), + )) /* Note: SQE is adding offsets to the iova, SAMPLE_COUNT_END_OFFSET causes * the result to be written to iova+16, and WRITE_ACCUM_SAMP_COUNT_DIFF * does *(iova + 8) += *(iova + 16) - *iova @@ -172,8 +170,7 @@ occlusion_pause(struct fd_acc_query *aq, struct fd_batch *batch) assert_dt * It just so happens this is the layout we already use for start/result/stop * So we just give the start address in all cases. */ - EV_DST_RAM_CP_EVENT_WRITE7_1(query_sample(aq, start)), - ); + .add(EV_DST_RAM_CP_EVENT_WRITE7_1(query_sample(aq, start))); } assert(ctx->occlusion_queries_active > 0); @@ -219,23 +216,27 @@ occlusion_predicate_result_resource(struct fd_acc_query *aq, struct fd_ringbuffe int index, struct fd_resource *dst, unsigned offset) { + fd_cs cs(ring); + /* This is a bit annoying but we need to turn the result into a one or * zero.. to do this use a CP_COND_WRITE to overwrite the result with * a one if it is non-zero. This doesn't change the results if the * query is also read on the CPU (i.e. occlusion_predicate_result()). 
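 * In effect the CP_COND_WRITE5 below does:
 *
 *    if (*result != 0)     // WRITE_NE against ref=0, mask=~0
 *       *result = 1;       // 64b write, lo=1 / hi=0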
*/ - OUT_PKT7(ring, CP_COND_WRITE5, 9); - OUT_RING(ring, CP_COND_WRITE5_0_FUNCTION(WRITE_NE) | - CP_WAIT_REG_MEM_0_POLL(POLL_MEMORY) | - CP_COND_WRITE5_0_WRITE_MEMORY); - OUT_RELOC(ring, query_sample(aq, result)); /* POLL_ADDR_LO/HI */ - OUT_RING(ring, CP_COND_WRITE5_3_REF(0)); - OUT_RING(ring, CP_COND_WRITE5_4_MASK(~0)); - OUT_RELOC(ring, query_sample(aq, result)); /* WRITE_ADDR_LO/HI */ - OUT_RING(ring, 1); - OUT_RING(ring, 0); + fd_pkt7(cs, CP_COND_WRITE5, 9) + .add(CP_COND_WRITE5_0( + .function = WRITE_NE, + .poll = POLL_MEMORY, + .write_memory = true + )) + .add(CP_COND_WRITE5_POLL_ADDR(query_sample(aq, result))) + .add(CP_COND_WRITE5_3(.ref = 0)) + .add(CP_COND_WRITE5_4(.mask = ~0)) + .add(CP_COND_WRITE5_WRITE_ADDR(query_sample(aq, result))) + .add(1) + .add(0); - copy_result(ring, result_type, dst, offset, fd_resource(aq->prsc), + copy_result(cs.ring(), result_type, dst, offset, fd_resource(aq->prsc), offsetof(struct fd6_query_sample, result)); } @@ -277,28 +278,28 @@ template static void timestamp_resume(struct fd_acc_query *aq, struct fd_batch *batch) { - struct fd_ringbuffer *ring = batch->draw; + fd_cs cs(batch->draw); - fd6_record_ts(ring, query_sample(aq, start)); + fd6_record_ts(cs, query_sample(aq, start)); } template static void time_elapsed_pause(struct fd_acc_query *aq, struct fd_batch *batch) assert_dt { - struct fd_ringbuffer *ring = batch->draw; + fd_cs cs(batch->draw); - fd6_record_ts(ring, query_sample(aq, stop)); + fd6_record_ts(cs, query_sample(aq, stop)); - OUT_WFI5(ring); + fd_pkt7(cs, CP_WAIT_FOR_IDLE, 0); /* result += stop - start: */ - OUT_PKT7(ring, CP_MEM_TO_MEM, 9); - OUT_RING(ring, CP_MEM_TO_MEM_0_DOUBLE | CP_MEM_TO_MEM_0_NEG_C); - OUT_RELOC(ring, query_sample(aq, result)); /* dst */ - OUT_RELOC(ring, query_sample(aq, result)); /* srcA */ - OUT_RELOC(ring, query_sample(aq, stop)); /* srcB */ - OUT_RELOC(ring, query_sample(aq, start)); /* srcC */ + fd_pkt7(cs, CP_MEM_TO_MEM, 9) + .add(CP_MEM_TO_MEM_0(.neg_c = true, ._double = true)) + .add(CP_MEM_TO_MEM_DST(query_sample(aq, result))) + .add(CP_MEM_TO_MEM_SRC_A(query_sample(aq, result))) + .add(CP_MEM_TO_MEM_SRC_B(query_sample(aq, stop))) + .add(CP_MEM_TO_MEM_SRC_C(query_sample(aq, start))); } static void @@ -312,8 +313,10 @@ template static void record_timestamp(struct fd_ringbuffer *ring, struct fd_bo *bo, unsigned offset) { - fd_ringbuffer_attach_bo(ring, bo); - fd6_record_ts(ring, bo, offset); + fd_cs cs(ring); + + cs.attach_bo(bo); + fd6_record_ts(cs, bo, offset); } static void @@ -392,9 +395,8 @@ struct PACKED fd6_pipeline_stats_sample { }; FD_DEFINE_CAST(fd_acc_query_sample, fd6_pipeline_stats_sample); -#define stats_reloc(ring, aq, field) \ - OUT_RELOC(ring, fd_resource((aq)->prsc)->bo, \ - offsetof(struct fd6_pipeline_stats_sample, field), 0, 0); +#define stats_sample(aq, field) \ + fd_resource((aq)->prsc)->bo, offsetof(struct fd6_pipeline_stats_sample, field) /* Mapping of counters to pipeline stats: * @@ -493,23 +495,22 @@ static void pipeline_stats_resume(struct fd_acc_query *aq, struct fd_batch *batch) assert_dt { - struct fd_ringbuffer *ring = batch->draw; enum stats_type type = get_stats_type(aq); unsigned idx = stats_counter_index(aq); unsigned reg = REG_A6XX_RBBM_PIPESTAT_IAVERTICES + (2 * idx); + fd_cs cs(batch->draw); - OUT_WFI5(ring); + fd_pkt7(cs, CP_WAIT_FOR_IDLE, 0); - OUT_PKT7(ring, CP_REG_TO_MEM, 3); - OUT_RING(ring, CP_REG_TO_MEM_0_64B | - CP_REG_TO_MEM_0_CNT(2) | - CP_REG_TO_MEM_0_REG(reg)); - stats_reloc(ring, aq, start); + /* snapshot the start value: */ + fd_pkt7(cs, 
CP_REG_TO_MEM, 3) + .add(CP_REG_TO_MEM_0(.reg = reg, .cnt = 2, ._64b = true)) + .add(CP_REG_TO_MEM_DEST(stats_sample(aq, start))); assert(type < ARRAY_SIZE(batch->pipeline_stats_queries_active)); if (!batch->pipeline_stats_queries_active[type]) - fd6_event_write(batch->ctx, ring, stats_counter_events[type].start); + fd6_event_write(batch->ctx, cs, stats_counter_events[type].start); batch->pipeline_stats_queries_active[type]++; } @@ -518,34 +519,36 @@ static void pipeline_stats_pause(struct fd_acc_query *aq, struct fd_batch *batch) assert_dt { - struct fd_ringbuffer *ring = batch->draw; enum stats_type type = get_stats_type(aq); unsigned idx = stats_counter_index(aq); unsigned reg = REG_A6XX_RBBM_PIPESTAT_IAVERTICES + (2 * idx); + fd_cs cs(batch->draw); - OUT_WFI5(ring); + fd_pkt7(cs, CP_WAIT_FOR_IDLE, 0); /* snapshot the end values: */ - OUT_PKT7(ring, CP_REG_TO_MEM, 3); - OUT_RING(ring, CP_REG_TO_MEM_0_64B | - CP_REG_TO_MEM_0_CNT(2) | - CP_REG_TO_MEM_0_REG(reg)); - stats_reloc(ring, aq, stop); + fd_pkt7(cs, CP_REG_TO_MEM, 3) + .add(CP_REG_TO_MEM_0(.reg = reg, .cnt = 2, ._64b = true)) + .add(CP_REG_TO_MEM_DEST(stats_sample(aq, stop))); assert(type < ARRAY_SIZE(batch->pipeline_stats_queries_active)); assert(batch->pipeline_stats_queries_active[type] > 0); batch->pipeline_stats_queries_active[type]--; if (batch->pipeline_stats_queries_active[type]) - fd6_event_write(batch->ctx, ring, stats_counter_events[type].stop); + fd6_event_write(batch->ctx, cs, stats_counter_events[type].stop); /* result += stop - start: */ - OUT_PKT7(ring, CP_MEM_TO_MEM, 9); - OUT_RING(ring, CP_MEM_TO_MEM_0_DOUBLE | CP_MEM_TO_MEM_0_NEG_C | 0x40000000); - stats_reloc(ring, aq, result); - stats_reloc(ring, aq, result); - stats_reloc(ring, aq, stop) - stats_reloc(ring, aq, start); + fd_pkt7(cs, CP_MEM_TO_MEM, 9) + .add(CP_MEM_TO_MEM_0( + .neg_c = true, + ._double = true, + .wait_for_mem_writes = true + )) + .add(CP_MEM_TO_MEM_DST(stats_sample(aq, result))) + .add(CP_MEM_TO_MEM_SRC_A(stats_sample(aq, result))) + .add(CP_MEM_TO_MEM_SRC_B(stats_sample(aq, stop))) + .add(CP_MEM_TO_MEM_SRC_C(stats_sample(aq, start))); } static void @@ -603,9 +606,8 @@ struct PACKED fd6_primitives_sample { }; FD_DEFINE_CAST(fd_acc_query_sample, fd6_primitives_sample); -#define primitives_reloc(ring, aq, field) \ - OUT_RELOC(ring, fd_resource((aq)->prsc)->bo, \ - __offsetof(struct fd6_primitives_sample, field), 0, 0); +#define primitives_sample(aq, field) \ + fd_resource((aq)->prsc)->bo, __offsetof(struct fd6_primitives_sample, field) static void log_primitives_sample(struct fd6_primitives_sample *ps) @@ -633,44 +635,40 @@ static void primitives_emitted_resume(struct fd_acc_query *aq, struct fd_batch *batch) assert_dt { - struct fd_ringbuffer *ring = batch->draw; + fd_cs cs(batch->draw); - OUT_WFI5(ring); + fd_pkt7(cs, CP_WAIT_FOR_IDLE, 0); ASSERT_ALIGNED(struct fd6_primitives_sample, start[0], 32); - OUT_PKT4(ring, REG_A6XX_VPC_SO_QUERY_BASE, 2); - primitives_reloc(ring, aq, start[0]); + fd_pkt4(cs, 2) + .add(A6XX_VPC_SO_QUERY_BASE(primitives_sample(aq, start[0]))); - fd6_event_write(batch->ctx, ring, FD_WRITE_PRIMITIVE_COUNTS); + fd6_event_write(batch->ctx, cs, FD_WRITE_PRIMITIVE_COUNTS); } static void -accumultate_primitives_emitted(struct fd_acc_query *aq, - struct fd_ringbuffer *ring, - int idx) +accumultate_primitives_emitted(struct fd_acc_query *aq, fd_cs &cs, int idx) { /* result += stop - start: */ - OUT_PKT7(ring, CP_MEM_TO_MEM, 9); - OUT_RING(ring, CP_MEM_TO_MEM_0_DOUBLE | CP_MEM_TO_MEM_0_NEG_C | 0x80000000); - primitives_reloc(ring, 
aq, result.emitted); - primitives_reloc(ring, aq, result.emitted); - primitives_reloc(ring, aq, stop[idx].emitted); - primitives_reloc(ring, aq, start[idx].emitted); + fd_pkt7(cs, CP_MEM_TO_MEM, 9) + .add(CP_MEM_TO_MEM_0(.neg_c = true, ._double = true, .unk31 = true)) + .add(CP_MEM_TO_MEM_DST(primitives_sample(aq, result.emitted))) + .add(CP_MEM_TO_MEM_SRC_A(primitives_sample(aq, result.emitted))) + .add(CP_MEM_TO_MEM_SRC_B(primitives_sample(aq, stop[idx].emitted))) + .add(CP_MEM_TO_MEM_SRC_C(primitives_sample(aq, start[idx].emitted))); } static void -accumultate_primitives_generated(struct fd_acc_query *aq, - struct fd_ringbuffer *ring, - int idx) +accumultate_primitives_generated(struct fd_acc_query *aq, fd_cs &cs, int idx) { /* result += stop - start: */ - OUT_PKT7(ring, CP_MEM_TO_MEM, 9); - OUT_RING(ring, CP_MEM_TO_MEM_0_DOUBLE | CP_MEM_TO_MEM_0_NEG_C | 0x80000000); - primitives_reloc(ring, aq, result.generated); - primitives_reloc(ring, aq, result.generated); - primitives_reloc(ring, aq, stop[idx].generated); - primitives_reloc(ring, aq, start[idx].generated); + fd_pkt7(cs, CP_MEM_TO_MEM, 9) + .add(CP_MEM_TO_MEM_0(.neg_c = true, ._double = true, .unk31 = true)) + .add(CP_MEM_TO_MEM_DST(primitives_sample(aq, result.generated))) + .add(CP_MEM_TO_MEM_SRC_A(primitives_sample(aq, result.generated))) + .add(CP_MEM_TO_MEM_SRC_B(primitives_sample(aq, stop[idx].generated))) + .add(CP_MEM_TO_MEM_SRC_C(primitives_sample(aq, start[idx].generated))); } template @@ -678,29 +676,29 @@ static void primitives_emitted_pause(struct fd_acc_query *aq, struct fd_batch *batch) assert_dt { - struct fd_ringbuffer *ring = batch->draw; + fd_cs cs(batch->draw); - OUT_WFI5(ring); + fd_pkt7(cs, CP_WAIT_FOR_IDLE, 0); ASSERT_ALIGNED(struct fd6_primitives_sample, stop[0], 32); - OUT_PKT4(ring, REG_A6XX_VPC_SO_QUERY_BASE, 2); - primitives_reloc(ring, aq, stop[0]); + fd_pkt4(cs, 2) + .add(A6XX_VPC_SO_QUERY_BASE(primitives_sample(aq, stop[0]))); - fd6_event_write(batch->ctx, ring, FD_WRITE_PRIMITIVE_COUNTS); - fd6_event_write(batch->ctx, ring, FD_CACHE_CLEAN); + fd6_event_write(batch->ctx, cs, FD_WRITE_PRIMITIVE_COUNTS); + fd6_event_write(batch->ctx, cs, FD_CACHE_CLEAN); if (aq->provider->query_type == PIPE_QUERY_SO_OVERFLOW_ANY_PREDICATE) { /* Need results from all channels: */ for (int i = 0; i < PIPE_MAX_SO_BUFFERS; i++) { - accumultate_primitives_emitted(aq, ring, i); - accumultate_primitives_generated(aq, ring, i); + accumultate_primitives_emitted(aq, cs, i); + accumultate_primitives_generated(aq, cs, i); } } else { - accumultate_primitives_emitted(aq, ring, aq->base.index); + accumultate_primitives_emitted(aq, cs, aq->base.index); /* Only need primitives generated counts for the overflow queries: */ if (aq->provider->query_type == PIPE_QUERY_SO_OVERFLOW_PREDICATE) - accumultate_primitives_generated(aq, ring, aq->base.index); + accumultate_primitives_generated(aq, cs, aq->base.index); } } @@ -746,30 +744,36 @@ so_overflow_predicate_result_resource(struct fd_acc_query *aq, int index, struct fd_resource *dst, unsigned offset) { - fd_ringbuffer_attach_bo(ring, dst->bo); - fd_ringbuffer_attach_bo(ring, fd_resource(aq->prsc)->bo); + fd_cs cs(ring); + + cs.attach_bo(dst->bo); + cs.attach_bo(fd_resource(aq->prsc)->bo); /* result = generated - emitted: */ - OUT_PKT7(ring, CP_MEM_TO_MEM, 7); - OUT_RING(ring, CP_MEM_TO_MEM_0_NEG_B | - COND(result_type >= PIPE_QUERY_TYPE_I64, CP_MEM_TO_MEM_0_DOUBLE)); - OUT_RELOC(ring, dst->bo, offset, 0, 0); - primitives_reloc(ring, aq, result.generated); - primitives_reloc(ring, aq, 
result.emitted); + fd_pkt7(cs, CP_MEM_TO_MEM, 7) + .add(CP_MEM_TO_MEM_0( + .neg_b = true, + ._double = result_type >= PIPE_QUERY_TYPE_I64, + )) + .add(CP_MEM_TO_MEM_DST(dst->bo, offset)) + .add(CP_MEM_TO_MEM_SRC_A(primitives_sample(aq, result.generated))) + .add(CP_MEM_TO_MEM_SRC_B(primitives_sample(aq, result.emitted))); /* This is a bit awkward, but glcts expects the result to be 1 or 0 * rather than non-zero vs zero: */ - OUT_PKT7(ring, CP_COND_WRITE5, 9); - OUT_RING(ring, CP_COND_WRITE5_0_FUNCTION(WRITE_NE) | - CP_COND_WRITE5_0_POLL(POLL_MEMORY) | - CP_COND_WRITE5_0_WRITE_MEMORY); - OUT_RELOC(ring, dst->bo, offset, 0, 0); /* POLL_ADDR_LO/HI */ - OUT_RING(ring, CP_COND_WRITE5_3_REF(0)); - OUT_RING(ring, CP_COND_WRITE5_4_MASK(~0)); - OUT_RELOC(ring, dst->bo, offset, 0, 0); /* WRITE_ADDR_LO/HI */ - OUT_RING(ring, 1); - OUT_RING(ring, 0); + fd_pkt7(cs, CP_COND_WRITE5, 9) + .add(CP_COND_WRITE5_0( + .function = WRITE_NE, + .poll = POLL_MEMORY, + .write_memory = true + )) + .add(CP_COND_WRITE5_POLL_ADDR(dst->bo, offset)) + .add(CP_COND_WRITE5_3(.ref = 0)) + .add(CP_COND_WRITE5_4(.mask = ~0)) + .add(CP_COND_WRITE5_WRITE_ADDR(dst->bo, offset)) + .add(1) + .add(0); } template @@ -827,12 +831,12 @@ perfcntr_resume(struct fd_acc_query *aq, struct fd_batch *batch) assert_dt { struct fd_batch_query_data *data = (struct fd_batch_query_data *)aq->query_data; struct fd_screen *screen = data->screen; - struct fd_ringbuffer *ring = batch->draw; + fd_cs cs(batch->draw); unsigned counters_per_group[screen->num_perfcntr_groups]; memset(counters_per_group, 0, sizeof(counters_per_group)); - OUT_WFI5(ring); + fd_pkt7(cs, CP_WAIT_FOR_IDLE, 0); /* configure performance counters for the requested queries: */ for (unsigned i = 0; i < data->num_query_entries; i++) { @@ -842,8 +846,10 @@ perfcntr_resume(struct fd_acc_query *aq, struct fd_batch *batch) assert_dt assert(counter_idx < g->num_counters); - OUT_PKT4(ring, g->counters[counter_idx].select_reg, 1); - OUT_RING(ring, g->countables[entry->cid].selector); + fd_pkt4(cs, 1).add((fd_reg_pair){ + .reg = g->counters[counter_idx].select_reg, + .value = g->countables[entry->cid].selector, + }); } memset(counters_per_group, 0, sizeof(counters_per_group)); @@ -855,10 +861,9 @@ perfcntr_resume(struct fd_acc_query *aq, struct fd_batch *batch) assert_dt unsigned counter_idx = counters_per_group[entry->gid]++; const struct fd_perfcntr_counter *counter = &g->counters[counter_idx]; - OUT_PKT7(ring, CP_REG_TO_MEM, 3); - OUT_RING(ring, CP_REG_TO_MEM_0_64B | - CP_REG_TO_MEM_0_REG(counter->counter_reg_lo)); - OUT_RELOC(ring, query_sample_idx(aq, i, start)); + fd_pkt7(cs, CP_REG_TO_MEM, 3) + .add(CP_REG_TO_MEM_0(.reg = counter->counter_reg_lo, ._64b = true)) + .add(CP_REG_TO_MEM_DEST(query_sample_idx(aq, i, start))); } } @@ -867,12 +872,12 @@ perfcntr_pause(struct fd_acc_query *aq, struct fd_batch *batch) assert_dt { struct fd_batch_query_data *data = (struct fd_batch_query_data *)aq->query_data; struct fd_screen *screen = data->screen; - struct fd_ringbuffer *ring = batch->draw; + fd_cs cs(batch->draw); unsigned counters_per_group[screen->num_perfcntr_groups]; memset(counters_per_group, 0, sizeof(counters_per_group)); - OUT_WFI5(ring); + fd_pkt7(cs, CP_WAIT_FOR_IDLE, 0); /* TODO do we need to bother to turn anything off? 
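 * (Probably not critical: the counters free-run, but perfcntr_resume()
 * re-snapshots the start values before accumulating, so counting between
 * queries should not leak into the results.)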
*/ @@ -883,21 +888,20 @@ perfcntr_pause(struct fd_acc_query *aq, struct fd_batch *batch) assert_dt unsigned counter_idx = counters_per_group[entry->gid]++; const struct fd_perfcntr_counter *counter = &g->counters[counter_idx]; - OUT_PKT7(ring, CP_REG_TO_MEM, 3); - OUT_RING(ring, CP_REG_TO_MEM_0_64B | - CP_REG_TO_MEM_0_REG(counter->counter_reg_lo)); - OUT_RELOC(ring, query_sample_idx(aq, i, stop)); + fd_pkt7(cs, CP_REG_TO_MEM, 3) + .add(CP_REG_TO_MEM_0(.reg = counter->counter_reg_lo, ._64b = true)) + .add(CP_REG_TO_MEM_DEST(query_sample_idx(aq, i, stop))); } /* and compute the result: */ for (unsigned i = 0; i < data->num_query_entries; i++) { /* result += stop - start: */ - OUT_PKT7(ring, CP_MEM_TO_MEM, 9); - OUT_RING(ring, CP_MEM_TO_MEM_0_DOUBLE | CP_MEM_TO_MEM_0_NEG_C); - OUT_RELOC(ring, query_sample_idx(aq, i, result)); /* dst */ - OUT_RELOC(ring, query_sample_idx(aq, i, result)); /* srcA */ - OUT_RELOC(ring, query_sample_idx(aq, i, stop)); /* srcB */ - OUT_RELOC(ring, query_sample_idx(aq, i, start)); /* srcC */ + fd_pkt7(cs, CP_MEM_TO_MEM, 9) + .add(CP_MEM_TO_MEM_0(.neg_c = true, ._double = true)) + .add(CP_MEM_TO_MEM_DST(query_sample_idx(aq, i, result))) + .add(CP_MEM_TO_MEM_SRC_A(query_sample_idx(aq, i, result))) + .add(CP_MEM_TO_MEM_SRC_B(query_sample_idx(aq, i, stop))) + .add(CP_MEM_TO_MEM_SRC_C(query_sample_idx(aq, i, start))); } } diff --git a/src/gallium/drivers/freedreno/a6xx/fd6_rasterizer.cc b/src/gallium/drivers/freedreno/a6xx/fd6_rasterizer.cc index 4421cc61c27..7a414ccb1dd 100644 --- a/src/gallium/drivers/freedreno/a6xx/fd6_rasterizer.cc +++ b/src/gallium/drivers/freedreno/a6xx/fd6_rasterizer.cc @@ -25,8 +25,6 @@ __fd6_setup_rasterizer_stateobj(struct fd_context *ctx, const struct pipe_rasterizer_state *cso, bool primitive_restart) { - unsigned ndwords = (CHIP >= A7XX) ? 66 : 26; - struct fd_ringbuffer *ring = fd_ringbuffer_new_object(ctx->pipe, ndwords * 4); float psize_min, psize_max; if (cso->point_size_per_vertex) { @@ -38,48 +36,45 @@ __fd6_setup_rasterizer_stateobj(struct fd_context *ctx, psize_max = cso->point_size; } - OUT_REG(ring, - A6XX_GRAS_CL_CNTL( + unsigned nreg = (CHIP >= A7XX) ? 46 : 15; + fd_crb crb(ctx->pipe, nreg); + + crb.add(A6XX_GRAS_CL_CNTL( .znear_clip_disable = !cso->depth_clip_near, .zfar_clip_disable = !cso->depth_clip_far, .z_clamp_enable = cso->depth_clamp || CHIP >= A7XX, .zero_gb_scale_z = cso->clip_halfz, .vp_clip_code_ignore = 1, - ), + ) ); - OUT_REG(ring, - A6XX_GRAS_SU_CNTL( + crb.add(A6XX_GRAS_SU_CNTL( .cull_front = cso->cull_face & PIPE_FACE_FRONT, .cull_back = cso->cull_face & PIPE_FACE_BACK, .front_cw = !cso->front_ccw, .linehalfwidth = cso->line_width / 2.0f, .poly_offset = cso->offset_tri, .line_mode = cso->multisample ? 
RECTANGULAR : BRESENHAM, - ), + ) ); - OUT_REG(ring, - A6XX_GRAS_SU_POINT_MINMAX(.min = psize_min, .max = psize_max, ), - A6XX_GRAS_SU_POINT_SIZE(cso->point_size)); + crb.add(A6XX_GRAS_SU_POINT_MINMAX(.min = psize_min, .max = psize_max, )); + crb.add(A6XX_GRAS_SU_POINT_SIZE(cso->point_size)); + crb.add(A6XX_GRAS_SU_POLY_OFFSET_SCALE(cso->offset_scale)); + crb.add(A6XX_GRAS_SU_POLY_OFFSET_OFFSET(cso->offset_units)); + crb.add(A6XX_GRAS_SU_POLY_OFFSET_OFFSET_CLAMP(cso->offset_clamp)); - OUT_REG(ring, A6XX_GRAS_SU_POLY_OFFSET_SCALE(cso->offset_scale), - A6XX_GRAS_SU_POLY_OFFSET_OFFSET(cso->offset_units), - A6XX_GRAS_SU_POLY_OFFSET_OFFSET_CLAMP(cso->offset_clamp)); - - OUT_REG(ring, - A6XX_PC_CNTL( + crb.add(A6XX_PC_CNTL( .primitive_restart = primitive_restart, .provoking_vtx_last = !cso->flatshade_first, - ), + ) ); if (CHIP >= A7XX) { - OUT_REG(ring, - A7XX_VPC_PC_CNTL( + crb.add(A7XX_VPC_PC_CNTL( .primitive_restart = primitive_restart, .provoking_vtx_last = !cso->flatshade_first, - ), + ) ); } @@ -96,12 +91,12 @@ __fd6_setup_rasterizer_stateobj(struct fd_context *ctx, break; } - OUT_REG(ring, A6XX_VPC_RAST_CNTL(mode)); - OUT_REG(ring, PC_DGEN_RAST_CNTL(CHIP, mode)); + crb.add(A6XX_VPC_RAST_CNTL(mode)); + crb.add(PC_DGEN_RAST_CNTL(CHIP, mode)); if (CHIP == A7XX || (CHIP == A6XX && ctx->screen->info->a6xx.is_a702)) { - OUT_REG(ring, A6XX_VPC_PS_RAST_CNTL(mode)); + crb.add(A6XX_VPC_PS_RAST_CNTL(mode)); } /* With a7xx the hw doesn't do the clamping for us. When depth clamp @@ -115,26 +110,23 @@ __fd6_setup_rasterizer_stateobj(struct fd_context *ctx, /* We must assume the max: */ const unsigned num_viewports = 16; - OUT_PKT4(ring, REG_A6XX_GRAS_CL_VIEWPORT_ZCLAMP(0), num_viewports * 2); for (unsigned i = 0; i < num_viewports; i++) { - OUT_RING(ring, fui(0.0f)); - OUT_RING(ring, fui(1.0f)); + crb.add(A6XX_GRAS_CL_VIEWPORT_ZCLAMP_MIN(i, 0.0f)); + crb.add(A6XX_GRAS_CL_VIEWPORT_ZCLAMP_MAX(i, 1.0f)); } - OUT_REG(ring, - A6XX_RB_VIEWPORT_ZCLAMP_MIN(0.0f), - A6XX_RB_VIEWPORT_ZCLAMP_MAX(1.0), - ); + crb.add(A6XX_RB_VIEWPORT_ZCLAMP_MIN(0.0f)); + crb.add(A6XX_RB_VIEWPORT_ZCLAMP_MAX(1.0f)); } if (CHIP == A6XX && ctx->screen->info->a6xx.has_legacy_pipeline_shading_rate) { - OUT_REG(ring, A6XX_RB_UNKNOWN_8A00()); - OUT_REG(ring, A6XX_RB_UNKNOWN_8A10()); - OUT_REG(ring, A6XX_RB_UNKNOWN_8A20()); - OUT_REG(ring, A6XX_RB_UNKNOWN_8A30()); + crb.add(A6XX_RB_UNKNOWN_8A00()); + crb.add(A6XX_RB_UNKNOWN_8A10()); + crb.add(A6XX_RB_UNKNOWN_8A20()); + crb.add(A6XX_RB_UNKNOWN_8A30()); } - return ring; + return crb.ring(); } FD_GENX(__fd6_setup_rasterizer_stateobj); diff --git a/src/gallium/drivers/freedreno/a6xx/fd6_resource.h b/src/gallium/drivers/freedreno/a6xx/fd6_resource.h index c79215bfaab..77089430edb 100644 --- a/src/gallium/drivers/freedreno/a6xx/fd6_resource.h +++ b/src/gallium/drivers/freedreno/a6xx/fd6_resource.h @@ -29,8 +29,6 @@ fd6_assert_valid_format(struct fd_resource *rsc, enum pipe_format format) assert(fd6_check_valid_format(rsc, format) == FORMAT_OK); } -void fd6_emit_flag_reference(struct fd_ringbuffer *ring, - struct fd_resource *rsc, int level, int layer); template void fd6_resource_screen_init(struct pipe_screen *pscreen); diff --git a/src/gallium/drivers/freedreno/a6xx/fd6_texture.cc b/src/gallium/drivers/freedreno/a6xx/fd6_texture.cc index 90bce23696d..f721b13336b 100644 --- a/src/gallium/drivers/freedreno/a6xx/fd6_texture.cc +++ b/src/gallium/drivers/freedreno/a6xx/fd6_texture.cc @@ -582,62 +582,28 @@ tex_key_equals(const void *_a, const void *_b) return memcmp(a, b, sizeof(struct 
fd6_texture_key)) == 0; } +static enum a6xx_state_block +stage2sb(mesa_shader_stage type) +{ + switch (type) { + case MESA_SHADER_VERTEX: return SB6_VS_TEX; + case MESA_SHADER_TESS_CTRL: return SB6_HS_TEX; + case MESA_SHADER_TESS_EVAL: return SB6_DS_TEX; + case MESA_SHADER_GEOMETRY: return SB6_GS_TEX; + case MESA_SHADER_FRAGMENT: return SB6_FS_TEX; + case MESA_SHADER_COMPUTE: return SB6_CS_TEX; + default: + UNREACHABLE("bad state block"); + } +} + static struct fd_ringbuffer * build_texture_state(struct fd_context *ctx, mesa_shader_stage type, struct fd_texture_stateobj *tex) assert_dt { - struct fd_ringbuffer *ring = fd_ringbuffer_new_object(ctx->pipe, 32 * 4); - unsigned opcode, tex_samp_reg, tex_const_reg, tex_count_reg; struct fd_bo *tex_desc = NULL, *samp_desc = NULL; - enum a6xx_state_block sb; - - switch (type) { - case MESA_SHADER_VERTEX: - sb = SB6_VS_TEX; - opcode = CP_LOAD_STATE6_GEOM; - tex_samp_reg = REG_A6XX_SP_VS_SAMPLER_BASE; - tex_const_reg = REG_A6XX_SP_VS_TEXMEMOBJ_BASE; - tex_count_reg = REG_A6XX_SP_VS_TSIZE; - break; - case MESA_SHADER_TESS_CTRL: - sb = SB6_HS_TEX; - opcode = CP_LOAD_STATE6_GEOM; - tex_samp_reg = REG_A6XX_SP_HS_SAMPLER_BASE; - tex_const_reg = REG_A6XX_SP_HS_TEXMEMOBJ_BASE; - tex_count_reg = REG_A6XX_SP_HS_TSIZE; - break; - case MESA_SHADER_TESS_EVAL: - sb = SB6_DS_TEX; - opcode = CP_LOAD_STATE6_GEOM; - tex_samp_reg = REG_A6XX_SP_DS_SAMPLER_BASE; - tex_const_reg = REG_A6XX_SP_DS_TEXMEMOBJ_BASE; - tex_count_reg = REG_A6XX_SP_DS_TSIZE; - break; - case MESA_SHADER_GEOMETRY: - sb = SB6_GS_TEX; - opcode = CP_LOAD_STATE6_GEOM; - tex_samp_reg = REG_A6XX_SP_GS_SAMPLER_BASE; - tex_const_reg = REG_A6XX_SP_GS_TEXMEMOBJ_BASE; - tex_count_reg = REG_A6XX_SP_GS_TSIZE; - break; - case MESA_SHADER_FRAGMENT: - sb = SB6_FS_TEX; - opcode = CP_LOAD_STATE6_FRAG; - tex_samp_reg = REG_A6XX_SP_PS_SAMPLER_BASE; - tex_const_reg = REG_A6XX_SP_PS_TEXMEMOBJ_BASE; - tex_count_reg = REG_A6XX_SP_PS_TSIZE; - break; - case MESA_SHADER_COMPUTE: - sb = SB6_CS_TEX; - opcode = CP_LOAD_STATE6_FRAG; - tex_samp_reg = REG_A6XX_SP_CS_SAMPLER_BASE; - tex_const_reg = REG_A6XX_SP_CS_TEXMEMOBJ_BASE; - tex_count_reg = REG_A6XX_SP_CS_TSIZE; - break; - default: - UNREACHABLE("bad state block"); - } + fd_cs cs(ctx->pipe, 32 * 4); if (tex->num_samplers > 0) { samp_desc = fd_bo_new(ctx->dev, tex->num_samplers * 4 * 4, @@ -654,21 +620,7 @@ build_texture_state(struct fd_context *ctx, mesa_shader_stage type, buf += 4; } - fd_ringbuffer_attach_bo(ring, samp_desc); - - /* output sampler state: */ - OUT_PKT7(ring, opcode, 3); - OUT_RING(ring, CP_LOAD_STATE6_0_DST_OFF(0) | - CP_LOAD_STATE6_0_STATE_TYPE(ST6_SHADER) | - CP_LOAD_STATE6_0_STATE_SRC(SS6_INDIRECT) | - CP_LOAD_STATE6_0_STATE_BLOCK(sb) | - CP_LOAD_STATE6_0_NUM_UNIT(tex->num_samplers)); - OUT_RELOC(ring, samp_desc, 0); /* SRC_ADDR_LO/HI */ - - OUT_PKT4(ring, tex_samp_reg, 2); - OUT_RELOC(ring, samp_desc, 0); /* SRC_ADDR_LO/HI */ - - fd_bo_del(samp_desc); + cs.attach_bo(samp_desc); } if (tex->num_textures > 0) { @@ -694,27 +646,73 @@ build_texture_state(struct fd_context *ctx, mesa_shader_stage type, buf += 16; } - fd_ringbuffer_attach_bo(ring, tex_desc); - - /* emit texture state: */ - OUT_PKT7(ring, opcode, 3); - OUT_RING(ring, CP_LOAD_STATE6_0_DST_OFF(0) | - CP_LOAD_STATE6_0_STATE_TYPE(ST6_CONSTANTS) | - CP_LOAD_STATE6_0_STATE_SRC(SS6_INDIRECT) | - CP_LOAD_STATE6_0_STATE_BLOCK(sb) | - CP_LOAD_STATE6_0_NUM_UNIT(tex->num_textures)); - OUT_RELOC(ring, tex_desc, 0); /* SRC_ADDR_LO/HI */ - - OUT_PKT4(ring, tex_const_reg, 2); - OUT_RELOC(ring, tex_desc, 0); 
/* SRC_ADDR_LO/HI */ - - fd_bo_del(tex_desc); + cs.attach_bo(tex_desc); } - OUT_PKT4(ring, tex_count_reg, 1); - OUT_RING(ring, tex->num_textures); + with_crb (cs, 5) { + switch (type) { + case MESA_SHADER_VERTEX: + crb.add(A6XX_SP_VS_SAMPLER_BASE(samp_desc)); + crb.add(A6XX_SP_VS_TEXMEMOBJ_BASE(tex_desc)); + crb.add(A6XX_SP_VS_TSIZE(tex->num_textures)); + break; + case MESA_SHADER_TESS_CTRL: + crb.add(A6XX_SP_HS_SAMPLER_BASE(samp_desc)); + crb.add(A6XX_SP_HS_TEXMEMOBJ_BASE(tex_desc)); + crb.add(A6XX_SP_HS_TSIZE(tex->num_textures)); + break; + case MESA_SHADER_TESS_EVAL: + crb.add(A6XX_SP_DS_SAMPLER_BASE(samp_desc)); + crb.add(A6XX_SP_DS_TEXMEMOBJ_BASE(tex_desc)); + crb.add(A6XX_SP_DS_TSIZE(tex->num_textures)); + break; + case MESA_SHADER_GEOMETRY: + crb.add(A6XX_SP_GS_SAMPLER_BASE(samp_desc)); + crb.add(A6XX_SP_GS_TEXMEMOBJ_BASE(tex_desc)); + crb.add(A6XX_SP_GS_TSIZE(tex->num_textures)); + break; + case MESA_SHADER_FRAGMENT: + crb.add(A6XX_SP_PS_SAMPLER_BASE(samp_desc)); + crb.add(A6XX_SP_PS_TEXMEMOBJ_BASE(tex_desc)); + crb.add(A6XX_SP_PS_TSIZE(tex->num_textures)); + break; + case MESA_SHADER_COMPUTE: + crb.add(A6XX_SP_CS_SAMPLER_BASE(samp_desc)); + crb.add(A6XX_SP_CS_TEXMEMOBJ_BASE(tex_desc)); + crb.add(A6XX_SP_CS_TSIZE(tex->num_textures)); + break; + default: + UNREACHABLE("bad state block"); + } + } - return ring; + if (samp_desc) { + fd_pkt7(cs, fd6_stage2opcode(type), 3) + .add(CP_LOAD_STATE6_0( + .state_type = ST6_SHADER, + .state_src = SS6_INDIRECT, + .state_block = stage2sb(type), + .num_unit = tex->num_samplers, + )) + .add(CP_LOAD_STATE6_EXT_SRC_ADDR(samp_desc)); + + fd_bo_del(samp_desc); + } + + if (tex_desc) { + fd_pkt7(cs, fd6_stage2opcode(type), 3) + .add(CP_LOAD_STATE6_0( + .state_type = ST6_CONSTANTS, + .state_src = SS6_INDIRECT, + .state_block = stage2sb(type), + .num_unit = tex->num_textures, + )) + .add(CP_LOAD_STATE6_EXT_SRC_ADDR(tex_desc)); + + fd_bo_del(tex_desc); + } + + return cs.ring(); } /** diff --git a/src/gallium/drivers/freedreno/a6xx/fd6_zsa.cc b/src/gallium/drivers/freedreno/a6xx/fd6_zsa.cc index 26d19f653d0..85967e0ae27 100644 --- a/src/gallium/drivers/freedreno/a6xx/fd6_zsa.cc +++ b/src/gallium/drivers/freedreno/a6xx/fd6_zsa.cc @@ -91,6 +91,7 @@ fd6_zsa_state_create(struct pipe_context *pctx, enum adreno_compare_func depth_func = (enum adreno_compare_func)cso->depth_func; /* maps 1:1 */ + bool force_z_test_enable = false; /* On some GPUs it is necessary to enable z test for depth bounds test * when UBWC is enabled. Otherwise, the GPU would hang. 
FUNC_ALWAYS is @@ -100,16 +101,11 @@ fd6_zsa_state_create(struct pipe_context *pctx, */ if (cso->depth_bounds_test && !cso->depth_enabled && ctx->screen->info->a6xx.depth_bounds_require_depth_test_quirk) { - so->rb_depth_cntl |= A6XX_RB_DEPTH_CNTL_Z_TEST_ENABLE; + force_z_test_enable = true; depth_func = FUNC_ALWAYS; } - so->rb_depth_cntl |= A6XX_RB_DEPTH_CNTL_ZFUNC(depth_func); - if (cso->depth_enabled) { - so->rb_depth_cntl |= - A6XX_RB_DEPTH_CNTL_Z_TEST_ENABLE | A6XX_RB_DEPTH_CNTL_Z_READ_ENABLE; - so->lrz.test = true; if (cso->depth_writemask) { @@ -155,9 +151,6 @@ fd6_zsa_state_create(struct pipe_context *pctx, } } - if (cso->depth_writemask) - so->rb_depth_cntl |= A6XX_RB_DEPTH_CNTL_Z_WRITE_ENABLE; - if (cso->stencil[0].enabled) { const struct pipe_stencil_state *s = &cso->stencil[0]; @@ -167,97 +160,79 @@ fd6_zsa_state_create(struct pipe_context *pctx, */ update_lrz_stencil(so, (enum pipe_compare_func)s->func, util_writes_stencil(s)); - so->rb_stencil_control |= - A6XX_RB_STENCIL_CNTL_STENCIL_READ | - A6XX_RB_STENCIL_CNTL_STENCIL_ENABLE | - A6XX_RB_STENCIL_CNTL_FUNC((enum adreno_compare_func)s->func) | /* maps 1:1 */ - A6XX_RB_STENCIL_CNTL_FAIL(fd_stencil_op(s->fail_op)) | - A6XX_RB_STENCIL_CNTL_ZPASS(fd_stencil_op(s->zpass_op)) | - A6XX_RB_STENCIL_CNTL_ZFAIL(fd_stencil_op(s->zfail_op)); - - so->rb_stencilmask = A6XX_RB_STENCIL_MASK_MASK(s->valuemask); - so->rb_stencilwrmask = A6XX_RB_STENCIL_WRITE_MASK_WRMASK(s->writemask); - if (cso->stencil[1].enabled) { const struct pipe_stencil_state *bs = &cso->stencil[1]; update_lrz_stencil(so, (enum pipe_compare_func)bs->func, util_writes_stencil(bs)); - - so->rb_stencil_control |= - A6XX_RB_STENCIL_CNTL_STENCIL_ENABLE_BF | - A6XX_RB_STENCIL_CNTL_FUNC_BF((enum adreno_compare_func)bs->func) | /* maps 1:1 */ - A6XX_RB_STENCIL_CNTL_FAIL_BF(fd_stencil_op(bs->fail_op)) | - A6XX_RB_STENCIL_CNTL_ZPASS_BF(fd_stencil_op(bs->zpass_op)) | - A6XX_RB_STENCIL_CNTL_ZFAIL_BF(fd_stencil_op(bs->zfail_op)); - - so->rb_stencilmask |= A6XX_RB_STENCIL_MASK_BFMASK(bs->valuemask); - so->rb_stencilwrmask |= A6XX_RB_STENCIL_WRITE_MASK_BFWRMASK(bs->writemask); } } - if (cso->alpha_enabled) { - /* Alpha test is functionally a conditional discard, so we can't - * write LRZ before seeing if we end up discarding or not - */ - if (cso->alpha_func != PIPE_FUNC_ALWAYS) { - so->lrz.write = false; - so->alpha_test = true; - } - - uint32_t ref = cso->alpha_ref_value * 255.0f; - so->rb_alpha_control = - A6XX_RB_ALPHA_TEST_CNTL_ALPHA_TEST | - A6XX_RB_ALPHA_TEST_CNTL_ALPHA_REF(ref) | - A6XX_RB_ALPHA_TEST_CNTL_ALPHA_TEST_FUNC( - (enum adreno_compare_func)cso->alpha_func); + /* Alpha test is functionally a conditional discard, so we can't + * write LRZ before seeing if we end up discarding or not + */ + if (cso->alpha_enabled && (cso->alpha_func != PIPE_FUNC_ALWAYS)) { + so->lrz.write = false; + so->alpha_test = true; } if (cso->depth_bounds_test) { - so->rb_depth_cntl |= A6XX_RB_DEPTH_CNTL_Z_BOUNDS_ENABLE | - A6XX_RB_DEPTH_CNTL_Z_READ_ENABLE; so->lrz.z_bounds_enable = true; } + const struct pipe_stencil_state *fs = &cso->stencil[0]; + const struct pipe_stencil_state *bs = &cso->stencil[1]; + /* Build the four state permutations (with/without alpha/depth-clamp)*/ for (int i = 0; i < 4; i++) { - struct fd_ringbuffer *ring = fd_ringbuffer_new_object(ctx->pipe, 16 * 4); bool depth_clamp_enable = (i & FD6_ZSA_DEPTH_CLAMP); + bool no_alpha = (i & FD6_ZSA_NO_ALPHA); - OUT_PKT4(ring, REG_A6XX_RB_ALPHA_TEST_CNTL, 1); - OUT_RING(ring, - (i & FD6_ZSA_NO_ALPHA) - ? 
so->rb_alpha_control & ~A6XX_RB_ALPHA_TEST_CNTL_ALPHA_TEST - : so->rb_alpha_control); + fd_crb crb(ctx->pipe, 9); - OUT_PKT4(ring, REG_A6XX_RB_STENCIL_CNTL, 1); - OUT_RING(ring, so->rb_stencil_control); + crb.add(A6XX_RB_ALPHA_TEST_CNTL( + .alpha_ref = (uint32_t)(cso->alpha_ref_value * 255.0f) & 0xff, + .alpha_test = cso->alpha_enabled && !no_alpha, + .alpha_test_func = (enum adreno_compare_func)cso->alpha_func, + )); - OUT_REG(ring, A6XX_GRAS_SU_STENCIL_CNTL(cso->stencil[0].enabled)); + crb.add(A6XX_RB_STENCIL_CNTL( + .stencil_enable = fs->enabled, + .stencil_enable_bf = bs->enabled, + .stencil_read = fs->enabled, + .func = (enum adreno_compare_func)fs->func, /* maps 1:1 */ + .fail = fd_stencil_op(fs->fail_op), + .zpass = fd_stencil_op(fs->zpass_op), + .zfail = fd_stencil_op(fs->zfail_op), + .func_bf = (enum adreno_compare_func)bs->func, /* maps 1:1 */ + .fail_bf = fd_stencil_op(bs->fail_op), + .zpass_bf = fd_stencil_op(bs->zpass_op), + .zfail_bf = fd_stencil_op(bs->zfail_op), + )); - OUT_PKT4(ring, REG_A6XX_RB_DEPTH_CNTL, 1); - OUT_RING(ring, - so->rb_depth_cntl | COND(depth_clamp_enable || CHIP >= A7XX, - A6XX_RB_DEPTH_CNTL_Z_CLAMP_ENABLE)); + crb.add(A6XX_GRAS_SU_STENCIL_CNTL(cso->stencil[0].enabled)); + crb.add(A6XX_RB_STENCIL_MASK(.mask = fs->valuemask, .bfmask = bs->valuemask)); + crb.add(A6XX_RB_STENCIL_WRITE_MASK(.wrmask = fs->writemask, .bfwrmask = bs->writemask)); - OUT_REG(ring, A6XX_GRAS_SU_DEPTH_CNTL(cso->depth_enabled)); + crb.add(A6XX_RB_DEPTH_CNTL( + .z_test_enable = cso->depth_enabled || force_z_test_enable, + .z_write_enable = cso->depth_writemask, + .zfunc = depth_func, + .z_clamp_enable = depth_clamp_enable || CHIP >= A7XX, + .z_read_enable = cso->depth_enabled || cso->depth_bounds_test, + .z_bounds_enable = cso->depth_bounds_test, + )); - OUT_PKT4(ring, REG_A6XX_RB_STENCIL_MASK, 2); - OUT_RING(ring, so->rb_stencilmask); - OUT_RING(ring, so->rb_stencilwrmask); + crb.add(A6XX_GRAS_SU_DEPTH_CNTL(cso->depth_enabled)); if (CHIP >= A7XX && !depth_clamp_enable) { - OUT_REG(ring, - A6XX_RB_DEPTH_BOUND_MIN(0.0f), - A6XX_RB_DEPTH_BOUND_MAX(1.0f), - ); + crb.add(A6XX_RB_DEPTH_BOUND_MIN(0.0f)); + crb.add(A6XX_RB_DEPTH_BOUND_MAX(1.0f)); } else { - OUT_REG(ring, - A6XX_RB_DEPTH_BOUND_MIN(cso->depth_bounds_min), - A6XX_RB_DEPTH_BOUND_MAX(cso->depth_bounds_max), - ); + crb.add(A6XX_RB_DEPTH_BOUND_MIN(cso->depth_bounds_min)); + crb.add(A6XX_RB_DEPTH_BOUND_MAX(cso->depth_bounds_max)); } - so->stateobj[i] = ring; + so->stateobj[i] = crb.ring(); } return so; diff --git a/src/gallium/drivers/freedreno/a6xx/fd6_zsa.h b/src/gallium/drivers/freedreno/a6xx/fd6_zsa.h index 01500e79e84..732b0fdd210 100644 --- a/src/gallium/drivers/freedreno/a6xx/fd6_zsa.h +++ b/src/gallium/drivers/freedreno/a6xx/fd6_zsa.h @@ -23,12 +23,6 @@ struct fd6_zsa_stateobj { struct pipe_depth_stencil_alpha_state base; - uint32_t rb_alpha_control; - uint32_t rb_depth_cntl; - uint32_t rb_stencil_control; - uint32_t rb_stencilmask; - uint32_t rb_stencilwrmask; - struct fd6_lrz_state lrz; bool writes_zs : 1; /* writes depth and/or stencil */ bool writes_z : 1; /* writes depth */