diff --git a/src/freedreno/computerator/a6xx.cc b/src/freedreno/computerator/a6xx.cc
index 50b2c8622d1..c3369df4653 100644
--- a/src/freedreno/computerator/a6xx.cc
+++ b/src/freedreno/computerator/a6xx.cc
@@ -76,7 +76,7 @@ struct fd6_control {
 };
 
 #define control_ptr(a6xx_backend, member) \
-   (a6xx_backend)->control_mem, offsetof(struct fd6_control, member), 0, 0
+   (a6xx_backend)->control_mem, offsetof(struct fd6_control, member)
 
 struct PACKED fd6_query_sample {
    uint64_t start;
@@ -88,8 +88,7 @@ struct PACKED fd6_query_sample {
 #define query_sample_idx(a6xx_backend, idx, field) \
    (a6xx_backend)->query_mem, \
       (idx * sizeof(struct fd6_query_sample)) + \
-         offsetof(struct fd6_query_sample, field), \
-      0, 0
+         offsetof(struct fd6_query_sample, field)
 
 /*
  * Backend implementation:
 */
@@ -112,27 +111,25 @@ a6xx_disassemble(struct kernel *kernel, FILE *out)
 
 template <chip CHIP>
 static void
-cs_restore_emit(struct fd_ringbuffer *ring, struct a6xx_backend *a6xx_backend)
+cs_restore_emit(fd_cs &cs, struct a6xx_backend *a6xx_backend)
 {
-   OUT_PKT4(ring, REG_A6XX_SP_PERFCTR_SHADER_MASK, 1);
-   OUT_RING(ring, A6XX_SP_PERFCTR_SHADER_MASK_CS);
+   fd_ncrb ncrb(cs, 2 + ARRAY_SIZE(a6xx_backend->info->a6xx.magic_raw));
 
-   OUT_PKT4(ring, REG_A6XX_SP_NC_MODE_CNTL_2, 1);
-   OUT_RING(ring, 0);
+   ncrb.add(A6XX_SP_PERFCTR_SHADER_MASK(.cs = true));
+   ncrb.add(A6XX_SP_NC_MODE_CNTL_2());
 
    for (size_t i = 0; i < ARRAY_SIZE(a6xx_backend->info->a6xx.magic_raw); i++) {
       auto magic_reg = a6xx_backend->info->a6xx.magic_raw[i];
       if (!magic_reg.reg)
         break;
 
-      OUT_PKT4(ring, magic_reg.reg, 1);
-      OUT_RING(ring, magic_reg.value);
+      ncrb.add({magic_reg.reg, magic_reg.value});
    }
 }
 
 template <chip CHIP>
 static void
-cs_program_emit(struct fd_ringbuffer *ring, struct kernel *kernel)
+cs_program_emit_regs(fd_cs &cs, struct kernel *kernel)
 {
    struct ir3_kernel *ir3_kernel = to_ir3_kernel(kernel);
    struct a6xx_backend *a6xx_backend = to_a6xx_backend(ir3_kernel->backend);
@@ -140,12 +137,15 @@ cs_program_emit(struct fd_ringbuffer *ring, struct kernel *kernel)
    const unsigned *local_size = kernel->local_size;
    const struct ir3_info *i = &v->info;
    enum a6xx_threadsize thrsz = i->double_threadsize ? THREAD128 : THREAD64;
+   fd_crb crb(cs, 25);
 
-   OUT_REG(ring, A6XX_SP_MODE_CNTL(.constant_demotion_enable = true,
-                                   .isammode = ISAMMODE_GL,
-                                   .shared_consts_enable = false));
+   crb.add(A6XX_SP_MODE_CNTL(
+      .constant_demotion_enable = true,
+      .isammode = ISAMMODE_GL,
+      .shared_consts_enable = false,
+   ));
 
-   OUT_REG(ring, SP_UPDATE_CNTL(CHIP,
+   crb.add(SP_UPDATE_CNTL(CHIP,
       .vs_state = true,
       .hs_state = true,
       .ds_state = true,
@@ -156,30 +156,32 @@ cs_program_emit(struct fd_ringbuffer *ring, struct kernel *kernel)
    ));
 
    unsigned constlen = align(v->constlen, 4);
-   OUT_REG(ring, SP_CS_CONST_CONFIG(CHIP, .constlen = constlen, .enabled = true, ));
+   crb.add(SP_CS_CONST_CONFIG(CHIP, .constlen = constlen, .enabled = true, ));
 
-   OUT_PKT4(ring, REG_A6XX_SP_CS_CONFIG, 2);
-   OUT_RING(ring, A6XX_SP_CS_CONFIG_ENABLED |
-                     A6XX_SP_CS_CONFIG_NUAV(kernel->num_bufs) |
-                     A6XX_SP_CS_CONFIG_NTEX(v->num_samp) |
-                     A6XX_SP_CS_CONFIG_NSAMP(v->num_samp)); /* SP_VS_CONFIG */
-   OUT_RING(ring, v->instrlen); /* SP_VS_INSTRLEN */
+   crb.add(A6XX_SP_CS_CONFIG(
+      .enabled = true,
+      .ntex = v->num_samp,
+      .nsamp = v->num_samp,
+      .nuav = kernel->num_bufs,
+   ));
+   crb.add(A6XX_SP_CS_INSTR_SIZE(v->instrlen));
+
+   crb.add(A6XX_SP_CS_CNTL_0(
+      .halfregfootprint = i->max_half_reg + 1,
+      .fullregfootprint = i->max_reg + 1,
+      .branchstack = ir3_shader_branchstack_hw(v),
+      .threadsize = thrsz,
+      .earlypreamble = v->early_preamble,
+      .mergedregs = v->mergedregs,
+   ));
 
-   OUT_PKT4(ring, REG_A6XX_SP_CS_CNTL_0, 1);
-   OUT_RING(ring,
-            A6XX_SP_CS_CNTL_0_THREADSIZE(thrsz) |
-               A6XX_SP_CS_CNTL_0_FULLREGFOOTPRINT(i->max_reg + 1) |
-               A6XX_SP_CS_CNTL_0_HALFREGFOOTPRINT(i->max_half_reg + 1) |
-               COND(v->mergedregs, A6XX_SP_CS_CNTL_0_MERGEDREGS) |
-               COND(v->early_preamble, A6XX_SP_CS_CNTL_0_EARLYPREAMBLE) |
-               A6XX_SP_CS_CNTL_0_BRANCHSTACK(ir3_shader_branchstack_hw(v)));
    if (CHIP == A7XX) {
-      OUT_REG(ring, SP_PS_WAVE_CNTL(CHIP, .threadsize = THREAD64));
+      crb.add(SP_PS_WAVE_CNTL(CHIP, .threadsize = THREAD64));
 
-      OUT_REG(ring, SP_REG_PROG_ID_0(CHIP, .dword = 0xfcfcfcfc),
-              SP_REG_PROG_ID_1(CHIP, .dword = 0xfcfcfcfc),
-              SP_REG_PROG_ID_2(CHIP, .dword = 0xfcfcfcfc),
-              SP_REG_PROG_ID_3(CHIP, .dword = 0x0000fc00), );
+      crb.add(SP_REG_PROG_ID_0(CHIP, .dword = 0xfcfcfcfc));
+      crb.add(SP_REG_PROG_ID_1(CHIP, .dword = 0xfcfcfcfc));
+      crb.add(SP_REG_PROG_ID_2(CHIP, .dword = 0xfcfcfcfc));
+      crb.add(SP_REG_PROG_ID_3(CHIP, .dword = 0x0000fc00));
    }
 
    uint32_t shared_size = MAX2(((int)v->shared_size - 1) / 1024, 1);
@@ -187,14 +189,10 @@ cs_program_emit(struct fd_ringbuffer *ring, struct kernel *kernel)
       v->constlen > 256 ? CONSTLEN_512 :
       (v->constlen > 192 ? CONSTLEN_256 :
       (v->constlen > 128 ? CONSTLEN_192 : CONSTLEN_128));
-   OUT_PKT4(ring, REG_A6XX_SP_CS_CNTL_1, 1);
-   OUT_RING(ring, A6XX_SP_CS_CNTL_1_SHARED_SIZE(shared_size) |
-                     A6XX_SP_CS_CNTL_1_CONSTANTRAMMODE(mode));
+   crb.add(A6XX_SP_CS_CNTL_1(.shared_size = shared_size, .constantrammode = mode));
 
    if (CHIP == A6XX && a6xx_backend->info->a6xx.has_lpac) {
-      OUT_PKT4(ring, REG_A6XX_HLSQ_CS_CTRL_REG1, 1);
-      OUT_RING(ring, A6XX_HLSQ_CS_CTRL_REG1_SHARED_SIZE(1) |
-                        A6XX_HLSQ_CS_CTRL_REG1_CONSTANTRAMMODE(mode));
+      crb.add(A6XX_HLSQ_CS_CTRL_REG1(.shared_size = 1, .constantrammode = mode));
    }
 
    uint32_t local_invocation_id, work_group_id;
@@ -203,67 +201,58 @@ cs_program_emit(struct fd_ringbuffer *ring, struct kernel *kernel)
    work_group_id = ir3_find_sysval_regid(v, SYSTEM_VALUE_WORKGROUP_ID);
 
    if (CHIP == A6XX) {
-      OUT_PKT4(ring, REG_A6XX_SP_CS_CONST_CONFIG_0, 2);
-      OUT_RING(ring, A6XX_SP_CS_CONST_CONFIG_0_WGIDCONSTID(work_group_id) |
-                        A6XX_SP_CS_CONST_CONFIG_0_WGSIZECONSTID(regid(63, 0)) |
-                        A6XX_SP_CS_CONST_CONFIG_0_WGOFFSETCONSTID(regid(63, 0)) |
-                        A6XX_SP_CS_CONST_CONFIG_0_LOCALIDREGID(local_invocation_id));
-      OUT_RING(ring, A6XX_SP_CS_WGE_CNTL_LINEARLOCALIDREGID(regid(63, 0)) |
-                        A6XX_SP_CS_WGE_CNTL_THREADSIZE(thrsz));
+      crb.add(A6XX_SP_CS_CONST_CONFIG_0(
+         .wgidconstid = work_group_id,
+         .wgsizeconstid = INVALID_REG,
+         .wgoffsetconstid = INVALID_REG,
+         .localidregid = local_invocation_id,
+      ));
+      crb.add(SP_CS_WGE_CNTL(CHIP,
+         .linearlocalidregid = INVALID_REG,
+         .threadsize = thrsz,
+      ));
    } else {
       unsigned tile_height = (local_size[1] % 8 == 0)   ? 3
                              : (local_size[1] % 4 == 0) ? 5
                              : (local_size[1] % 2 == 0) ? 9
                                                         : 17;
-      OUT_REG(ring,
-              SP_CS_WGE_CNTL(CHIP,
-                 .linearlocalidregid = regid(63, 0),
-                 .threadsize = thrsz,
-                 .workgrouprastorderzfirsten = true,
-                 .wgtilewidth = 4,
-                 .wgtileheight = tile_height,
-              )
-      );
+      crb.add(SP_CS_WGE_CNTL(CHIP,
+         .linearlocalidregid = INVALID_REG,
+         .threadsize = thrsz,
+         .workgrouprastorderzfirsten = true,
+         .wgtilewidth = 4,
+         .wgtileheight = tile_height,
+      ));
    }
 
    if (CHIP == A7XX || a6xx_backend->info->a6xx.has_lpac) {
-      OUT_PKT4(ring, REG_A6XX_SP_CS_WIE_CNTL_0, 1);
-      OUT_RING(ring, A6XX_SP_CS_WIE_CNTL_0_WGIDCONSTID(work_group_id) |
-                        A6XX_SP_CS_WIE_CNTL_0_WGSIZECONSTID(regid(63, 0)) |
-                        A6XX_SP_CS_WIE_CNTL_0_WGOFFSETCONSTID(regid(63, 0)) |
-                        A6XX_SP_CS_WIE_CNTL_0_LOCALIDREGID(local_invocation_id));
+      crb.add(A6XX_SP_CS_WIE_CNTL_0(
+         .wgidconstid = work_group_id,
+         .wgsizeconstid = INVALID_REG,
+         .wgoffsetconstid = INVALID_REG,
+         .localidregid = local_invocation_id,
+      ));
+
       if (CHIP == A7XX) {
          /* TODO allow the shader to control the tiling */
-         OUT_REG(ring,
-                 SP_CS_WIE_CNTL_1(A7XX, .linearlocalidregid = regid(63, 0),
-                    .threadsize = thrsz,
-                    .workitemrastorder = WORKITEMRASTORDER_LINEAR));
+         crb.add(SP_CS_WIE_CNTL_1(CHIP,
+            .linearlocalidregid = INVALID_REG,
+            .threadsize = thrsz,
+            .workitemrastorder = WORKITEMRASTORDER_LINEAR,
+         ));
       } else {
-         OUT_REG(ring,
-                 SP_CS_WIE_CNTL_1(CHIP, .linearlocalidregid = regid(63, 0),
-                    .threadsize = thrsz));
+         crb.add(SP_CS_WIE_CNTL_1(CHIP,
+            .linearlocalidregid = INVALID_REG,
+            .threadsize = thrsz,
+         ));
       }
    }
 
-   OUT_PKT4(ring, REG_A6XX_SP_CS_BASE, 2);
-   OUT_RELOC(ring, v->bo, 0, 0, 0); /* SP_CS_BASE_LO/HI */
+   crb.attach_bo(v->bo);
 
-   OUT_PKT4(ring, REG_A6XX_SP_CS_INSTR_SIZE, 1);
-   OUT_RING(ring, v->instrlen);
-
-   OUT_PKT4(ring, REG_A6XX_SP_CS_BASE, 2);
-   OUT_RELOC(ring, v->bo, 0, 0, 0);
-
-   uint32_t shader_preload_size =
-      MIN2(v->instrlen, a6xx_backend->info->a6xx.instr_cache_size);
-   OUT_PKT7(ring, CP_LOAD_STATE6_FRAG, 3);
-   OUT_RING(ring, CP_LOAD_STATE6_0_DST_OFF(0) |
-                     CP_LOAD_STATE6_0_STATE_TYPE(ST6_SHADER) |
-                     CP_LOAD_STATE6_0_STATE_SRC(SS6_INDIRECT) |
-                     CP_LOAD_STATE6_0_STATE_BLOCK(SB6_CS_SHADER) |
-                     CP_LOAD_STATE6_0_NUM_UNIT(shader_preload_size));
-   OUT_RELOC(ring, v->bo, 0, 0, 0);
+   crb.add(A6XX_SP_CS_BASE(v->bo));
+   crb.add(A6XX_SP_CS_INSTR_SIZE(v->instrlen));
 
    if (v->pvtmem_size > 0) {
       uint32_t per_fiber_size = v->pvtmem_size;
@@ -272,52 +261,68 @@ cs_program_emit(struct fd_ringbuffer *ring, struct kernel *kernel)
       uint32_t total_size = per_sp_size * a6xx_backend->info->num_sp_cores;
       struct fd_bo *pvtmem = fd_bo_new(a6xx_backend->dev, total_size, 0, "pvtmem");
 
-      OUT_PKT4(ring, REG_A6XX_SP_CS_PVT_MEM_PARAM, 4);
-      OUT_RING(ring, A6XX_SP_CS_PVT_MEM_PARAM_MEMSIZEPERITEM(per_fiber_size));
-      OUT_RELOC(ring, pvtmem, 0, 0, 0);
-      OUT_RING(ring, A6XX_SP_CS_PVT_MEM_SIZE_TOTALPVTMEMSIZE(per_sp_size) |
-                        COND(v->pvtmem_per_wave,
-                             A6XX_SP_CS_PVT_MEM_SIZE_PERWAVEMEMLAYOUT));
+      crb.add(A6XX_SP_CS_PVT_MEM_PARAM(.memsizeperitem = per_fiber_size));
+      crb.add(A6XX_SP_CS_PVT_MEM_BASE(pvtmem));
+      crb.add(A6XX_SP_CS_PVT_MEM_SIZE(
+         .totalpvtmemsize = per_sp_size,
+         .perwavememlayout = v->pvtmem_per_wave,
+      ));
 
-      OUT_PKT4(ring, REG_A6XX_SP_CS_PVT_MEM_STACK_OFFSET, 1);
-      OUT_RING(ring, A6XX_SP_CS_PVT_MEM_STACK_OFFSET_OFFSET(per_sp_size));
+      crb.add(A6XX_SP_CS_PVT_MEM_STACK_OFFSET(.offset = per_sp_size));
    }
 }
 
 template <chip CHIP>
 static void
-emit_const(struct fd_ringbuffer *ring, uint32_t regid, uint32_t sizedwords,
-           const uint32_t *dwords)
+cs_program_emit(fd_cs &cs, struct kernel *kernel)
 {
+   struct ir3_kernel *ir3_kernel = to_ir3_kernel(kernel);
+   struct a6xx_backend *a6xx_backend = to_a6xx_backend(ir3_kernel->backend);
+   struct ir3_shader_variant *v = ir3_kernel->v;
+
+   cs_program_emit_regs<CHIP>(cs, kernel);
+
+   uint32_t shader_preload_size =
+      MIN2(v->instrlen, a6xx_backend->info->a6xx.instr_cache_size);
+
+   fd_pkt7(cs, CP_LOAD_STATE6_FRAG, 3)
+      .add(CP_LOAD_STATE6_0(
+         .state_type = ST6_SHADER,
+         .state_src = SS6_INDIRECT,
+         .state_block = SB6_CS_SHADER,
+         .num_unit = shader_preload_size,
+      ))
+      .add(CP_LOAD_STATE6_EXT_SRC_ADDR(v->bo));
+}
+
+template <chip CHIP>
+static void
+emit_const(fd_cs &cs, uint32_t regid, uint32_t sizedwords, const uint32_t *dwords)
+{
+   uint32_t zero[4] = {};
    uint32_t align_sz;
 
    assert((regid % 4) == 0);
 
    align_sz = align(sizedwords, 4);
 
-   OUT_PKT7(ring, CP_LOAD_STATE6_FRAG, 3 + align_sz);
-   OUT_RING(ring, CP_LOAD_STATE6_0_DST_OFF(regid / 4) |
-                     CP_LOAD_STATE6_0_STATE_TYPE(ST6_CONSTANTS) |
-                     CP_LOAD_STATE6_0_STATE_SRC(SS6_DIRECT) |
-                     CP_LOAD_STATE6_0_STATE_BLOCK(SB6_CS_SHADER) |
-                     CP_LOAD_STATE6_0_NUM_UNIT(DIV_ROUND_UP(sizedwords, 4)));
-   OUT_RING(ring, CP_LOAD_STATE6_1_EXT_SRC_ADDR(0));
-   OUT_RING(ring, CP_LOAD_STATE6_2_EXT_SRC_ADDR_HI(0));
-
-   for (uint32_t i = 0; i < sizedwords; i++) {
-      OUT_RING(ring, dwords[i]);
-   }
-
-   /* Zero-pad to multiple of 4 dwords */
-   for (uint32_t i = sizedwords; i < align_sz; i++) {
-      OUT_RING(ring, 0);
-   }
+   fd_pkt7(cs, CP_LOAD_STATE6_FRAG, 3 + align_sz)
+      .add(CP_LOAD_STATE6_0(
+         .dst_off = regid / 4,
+         .state_type = ST6_CONSTANTS,
+         .state_src = SS6_DIRECT,
+         .state_block = SB6_CS_SHADER,
+         .num_unit = DIV_ROUND_UP(sizedwords, 4)
+      ))
+      .add(CP_LOAD_STATE6_EXT_SRC_ADDR())
+      .add(dwords, sizedwords)
+      /* Zero-pad to multiple of 4 dwords */
+      .add(zero, align_sz - sizedwords);
 }
 
 template <chip CHIP>
 static void
-cs_const_emit(struct fd_ringbuffer *ring, struct kernel *kernel,
-              uint32_t grid[3])
+cs_const_emit(fd_cs &cs, struct kernel *kernel, uint32_t grid[3])
 {
    struct ir3_kernel *ir3_kernel = to_ir3_kernel(kernel);
    struct ir3_shader_variant *v = ir3_kernel->v;
@@ -357,7 +362,7 @@ cs_const_emit(struct fd_ringbuffer *ring, struct kernel *kernel,
    size *= 4;
 
    if (size > 0) {
-      emit_const<CHIP>(ring, base, size, imm_state->values);
+      emit_const<CHIP>(cs, base, size, imm_state->values);
    }
 }
 
@@ -377,8 +382,7 @@ kernel_num_bufs(struct kernel *kernel, enum kernel_buf_type buf_type)
 
 template <chip CHIP>
 static void
-cs_uav_emit(struct fd_ringbuffer *ring, struct fd_device *dev,
-            struct kernel *kernel)
+cs_uav_emit(fd_cs &cs, struct fd_device *dev, struct kernel *kernel)
 {
    unsigned num_bufs = kernel_num_bufs(kernel, KERNEL_BUF_UAV);
 
@@ -390,7 +394,7 @@ cs_uav_emit(struct fd_ringbuffer *ring, struct fd_device *dev,
                 FD_BO_GPUREADONLY | FD_BO_HINT_COMMAND,
                 "tex_desc");
 
-   fd_ringbuffer_attach_bo(ring, state);
+   cs.attach_bo(state);
 
    uint32_t *buf = (uint32_t *)fd_bo_map(state);
 
@@ -399,7 +403,7 @@ cs_uav_emit(struct fd_ringbuffer *ring, struct fd_device *dev,
          continue;
       }
 
-      fd_ringbuffer_attach_bo(ring, kernel->bufs[i]);
+      cs.attach_bo(kernel->bufs[i]);
 
       /* size is encoded with low 15b in WIDTH and high bits in HEIGHT,
        * in units of elements:
@@ -424,29 +428,25 @@ cs_uav_emit(struct fd_ringbuffer *ring, struct fd_device *dev,
       buf += 16;
    }
 
-   OUT_PKT7(ring, CP_LOAD_STATE6_FRAG, 3);
-   OUT_RING(ring, CP_LOAD_STATE6_0_DST_OFF(0) |
-                     CP_LOAD_STATE6_0_STATE_TYPE(ST6_UAV) |
-                     CP_LOAD_STATE6_0_STATE_SRC(SS6_INDIRECT) |
-                     CP_LOAD_STATE6_0_STATE_BLOCK(SB6_CS_SHADER) |
-                     CP_LOAD_STATE6_0_NUM_UNIT(num_bufs));
-   OUT_RELOC(ring, state, 0);
+   fd_pkt7(cs, CP_LOAD_STATE6_FRAG, 3)
+      .add(CP_LOAD_STATE6_0(
+         .state_type = ST6_UAV,
+         .state_src = SS6_INDIRECT,
+         .state_block = SB6_CS_SHADER,
+         .num_unit = num_bufs,
+      ))
+      .add(CP_LOAD_STATE6_EXT_SRC_ADDR(state));
 
-   if (CHIP == A6XX) {
-      OUT_PKT4(ring, REG_A6XX_SP_CS_UAV_BASE, 2);
-   } else {
-      OUT_PKT4(ring, REG_A7XX_SP_CS_UAV_BASE, 2);
-   }
-   OUT_RELOC(ring, state, 0);
+   fd_crb crb(cs, 3);
 
-   OUT_PKT4(ring, REG_A6XX_SP_CS_USIZE, 1);
-   OUT_RING(ring, num_bufs);
+   crb.add(SP_CS_UAV_BASE(CHIP, state));
+   crb.add(A6XX_SP_CS_USIZE(num_bufs));
 
    fd_bo_del(state);
 }
 
 static void
-cs_ubo_emit(struct fd_ringbuffer *ring, struct kernel *kernel)
+cs_ubo_emit(fd_cs &cs, struct kernel *kernel)
 {
    unsigned num_bufs = kernel_num_bufs(kernel, KERNEL_BUF_UBO);
 
@@ -459,17 +459,20 @@ cs_ubo_emit(struct fd_ringbuffer *ring, struct kernel *kernel)
          continue;
       }
 
-      OUT_PKT7(ring, CP_LOAD_STATE6_FRAG, 5);
-      OUT_RING(ring, CP_LOAD_STATE6_0_DST_OFF(offset) |
-                        CP_LOAD_STATE6_0_STATE_TYPE(ST6_UBO) |
-                        CP_LOAD_STATE6_0_STATE_SRC(SS6_DIRECT) |
-                        CP_LOAD_STATE6_0_STATE_BLOCK(SB6_CS_SHADER) |
-                        CP_LOAD_STATE6_0_NUM_UNIT(1));
-      OUT_RING(ring, CP_LOAD_STATE6_1_EXT_SRC_ADDR(0));
-      OUT_RING(ring, CP_LOAD_STATE6_2_EXT_SRC_ADDR_HI(0));
+      cs.attach_bo(kernel->bufs[i]);
 
       unsigned size_vec4s = DIV_ROUND_UP(kernel->buf_sizes[i], 4);
-      OUT_RELOC(ring, kernel->bufs[i], 0,
-                (uint64_t)A6XX_UBO_1_SIZE(size_vec4s) << 32, 0);
+
+      fd_pkt7(cs, CP_LOAD_STATE6_FRAG, 5)
+         .add(CP_LOAD_STATE6_0(
+            .dst_off = offset,
+            .state_type = ST6_UBO,
+            .state_src = SS6_DIRECT,
+            .state_block = SB6_CS_SHADER,
+            .num_unit = 1,
+         ))
+         .add(CP_LOAD_STATE6_EXT_SRC_ADDR())
+         .add(A6XX_UBO_DESC(0, kernel->bufs[i], 0, size_vec4s));
 
       offset++;
    }
@@ -477,28 +480,28 @@ cs_ubo_emit(struct fd_ringbuffer *ring, struct kernel *kernel)
 
 template <chip CHIP>
 static inline unsigned
-event_write(struct fd_ringbuffer *ring, struct kernel *kernel,
-            enum vgt_event_type evt, bool timestamp)
+event_write(fd_cs &cs, struct kernel *kernel, enum vgt_event_type evt, bool timestamp)
 {
    unsigned seqno = 0;
+   unsigned len = timestamp ? 4 : 1;
+
+   fd_pkt7 pkt(cs, CP_EVENT_WRITE, len);
 
    if (CHIP == A6XX) {
-      OUT_PKT7(ring, CP_EVENT_WRITE, timestamp ? 4 : 1);
-      OUT_RING(ring, CP_EVENT_WRITE_0_EVENT(evt));
+      pkt.add(CP_EVENT_WRITE_0_EVENT(evt) |
+              COND(timestamp, CP_EVENT_WRITE_0_TIMESTAMP));
    } else {
-      OUT_PKT7(ring, CP_EVENT_WRITE7, timestamp ? 4 : 1);
-      OUT_RING(ring,
-               CP_EVENT_WRITE7_0_EVENT(evt) |
-                  COND(timestamp, CP_EVENT_WRITE7_0_WRITE_ENABLED |
-                                     CP_EVENT_WRITE7_0_WRITE_SRC(EV_WRITE_USER_32B)));
+      pkt.add(CP_EVENT_WRITE7_0_EVENT(evt) |
+              CP_EVENT_WRITE7_0_WRITE_SRC(EV_WRITE_USER_32B) |
+              COND(timestamp, CP_EVENT_WRITE7_0_WRITE_ENABLED));
    }
 
    if (timestamp) {
      struct ir3_kernel *ir3_kernel = to_ir3_kernel(kernel);
      struct a6xx_backend *a6xx_backend = to_a6xx_backend(ir3_kernel->backend);
      seqno = ++a6xx_backend->seqno;
 
-      OUT_RELOC(ring, control_ptr(a6xx_backend, seqno)); /* ADDR_LO/HI */
-      OUT_RING(ring, seqno);
+      pkt.add(CP_EVENT_WRITE_ADDR(control_ptr(a6xx_backend, seqno)));
+      pkt.add(seqno);
    }
 
    return seqno;
@@ -506,31 +509,30 @@ event_write(struct fd_ringbuffer *ring, struct kernel *kernel,
 
 template <chip CHIP>
 static inline void
-cache_flush(struct fd_ringbuffer *ring, struct kernel *kernel)
+cache_flush(fd_cs &cs, struct kernel *kernel)
 {
    struct ir3_kernel *ir3_kernel = to_ir3_kernel(kernel);
    struct a6xx_backend *a6xx_backend = to_a6xx_backend(ir3_kernel->backend);
    unsigned seqno;
 
-   seqno = event_write<CHIP>(ring, kernel, RB_DONE_TS, true);
+   seqno = event_write<CHIP>(cs, kernel, RB_DONE_TS, true);
 
-   OUT_PKT7(ring, CP_WAIT_REG_MEM, 6);
-   OUT_RING(ring, CP_WAIT_REG_MEM_0_FUNCTION(WRITE_EQ) |
-                     CP_WAIT_REG_MEM_0_POLL(POLL_MEMORY));
-   OUT_RELOC(ring, control_ptr(a6xx_backend, seqno));
-   OUT_RING(ring, CP_WAIT_REG_MEM_3_REF(seqno));
-   OUT_RING(ring, CP_WAIT_REG_MEM_4_MASK(~0));
-   OUT_RING(ring, CP_WAIT_REG_MEM_5_DELAY_LOOP_CYCLES(16));
+   fd_pkt7(cs, CP_WAIT_REG_MEM, 6)
+      .add(CP_WAIT_REG_MEM_0(.function = WRITE_EQ, .poll = POLL_MEMORY))
+      .add(CP_WAIT_REG_MEM_POLL_ADDR(control_ptr(a6xx_backend, seqno)))
+      .add(CP_WAIT_REG_MEM_3(.ref = seqno))
+      .add(CP_WAIT_REG_MEM_4(.mask = ~0))
+      .add(CP_WAIT_REG_MEM_5(.delay_loop_cycles = 16));
 
    if (CHIP == A6XX) {
-      seqno = event_write<CHIP>(ring, kernel, CACHE_FLUSH_TS, true);
+      seqno = event_write<CHIP>(cs, kernel, CACHE_FLUSH_TS, true);
 
-      OUT_PKT7(ring, CP_WAIT_MEM_GTE, 4);
-      OUT_RING(ring, CP_WAIT_MEM_GTE_0_RESERVED(0));
-      OUT_RELOC(ring, control_ptr(a6xx_backend, seqno));
-      OUT_RING(ring, CP_WAIT_MEM_GTE_3_REF(seqno));
+      fd_pkt7(cs, CP_WAIT_MEM_GTE, 4)
+         .add(CP_WAIT_MEM_GTE_0())
+         .add(CP_WAIT_MEM_GTE_POLL_ADDR(control_ptr(a6xx_backend, seqno)))
+         .add(CP_WAIT_MEM_GTE_3(.ref = seqno));
   } else {
-      event_write<CHIP>(ring, kernel, CACHE_FLUSH7, false);
+      event_write<CHIP>(cs, kernel, CACHE_FLUSH7, false);
   }
 }
 
@@ -541,18 +543,19 @@ a6xx_emit_grid(struct kernel *kernel, uint32_t grid[3],
 {
    struct ir3_kernel *ir3_kernel = to_ir3_kernel(kernel);
    struct a6xx_backend *a6xx_backend = to_a6xx_backend(ir3_kernel->backend);
-   struct fd_ringbuffer *ring = fd_submit_new_ringbuffer(
-      submit, 0,
-      (enum fd_ringbuffer_flags)(FD_RINGBUFFER_PRIMARY | FD_RINGBUFFER_GROWABLE));
+   fd_cs cs(fd_submit_new_ringbuffer(submit, 0,
+      (enum fd_ringbuffer_flags)(FD_RINGBUFFER_PRIMARY | FD_RINGBUFFER_GROWABLE)));
 
-   cs_restore_emit<CHIP>(ring, a6xx_backend);
-   cs_program_emit<CHIP>(ring, kernel);
-   cs_const_emit<CHIP>(ring, kernel, grid);
-   cs_uav_emit<CHIP>(ring, a6xx_backend->dev, kernel);
-   cs_ubo_emit(ring, kernel);
+   cs.attach_bo(a6xx_backend->control_mem);
 
-   OUT_PKT7(ring, CP_SET_MARKER, 1);
-   OUT_RING(ring, A6XX_CP_SET_MARKER_0_MODE(RM6_COMPUTE));
+   cs_restore_emit<CHIP>(cs, a6xx_backend);
+   cs_program_emit<CHIP>(cs, kernel);
+   cs_const_emit<CHIP>(cs, kernel, grid);
+   cs_uav_emit<CHIP>(cs, a6xx_backend->dev, kernel);
+   cs_ubo_emit(cs, kernel);
+
+   fd_pkt7(cs, CP_SET_MARKER, 1)
+      .add(A6XX_CP_SET_MARKER_0(.mode = RM6_COMPUTE));
 
    const unsigned *local_size = kernel->local_size;
    const unsigned *num_groups = grid;
@@ -564,35 +567,40 @@ a6xx_emit_grid(struct kernel *kernel, uint32_t grid[3],
       work_dim++;
    }
 
-   OUT_REG(ring, SP_CS_NDRANGE_0(CHIP,
-      .kerneldim = work_dim,
-      .localsizex = local_size[0] - 1,
-      .localsizey = local_size[1] - 1,
-      .localsizez = local_size[2] - 1,
-   ));
-
-   if (CHIP == A7XX) {
-      OUT_REG(ring, A7XX_SP_CS_NDRANGE_7(.localsizex = local_size[0] - 1,
-                                         .localsizey = local_size[1] - 1,
-                                         .localsizez = local_size[2] - 1, ));
+   with_crb (cs, 11) {
+      crb.add(SP_CS_NDRANGE_0(CHIP,
+         .kerneldim = work_dim,
+         .localsizex = local_size[0] - 1,
+         .localsizey = local_size[1] - 1,
+         .localsizez = local_size[2] - 1,
+      ));
+
+      if (CHIP == A7XX) {
+         crb.add(A7XX_SP_CS_NDRANGE_7(
+            .localsizex = local_size[0] - 1,
+            .localsizey = local_size[1] - 1,
+            .localsizez = local_size[2] - 1,
+         ));
+      }
+
+      crb.add(SP_CS_NDRANGE_1(CHIP,
+         .globalsize_x = local_size[0] * num_groups[0],
+      ));
+      crb.add(SP_CS_NDRANGE_2(CHIP, 0));
+      crb.add(SP_CS_NDRANGE_3(CHIP,
+         .globalsize_y = local_size[1] * num_groups[1],
+      ));
+      crb.add(SP_CS_NDRANGE_4(CHIP, 0));
+      crb.add(SP_CS_NDRANGE_5(CHIP,
+         .globalsize_z = local_size[2] * num_groups[2],
+      ));
+      crb.add(SP_CS_NDRANGE_6(CHIP, 0));
+
+      crb.add(SP_CS_KERNEL_GROUP_X(CHIP, 1));
+      crb.add(SP_CS_KERNEL_GROUP_Y(CHIP, 1));
+      crb.add(SP_CS_KERNEL_GROUP_Z(CHIP, 1));
    }
 
-   OUT_REG(ring, SP_CS_NDRANGE_1(CHIP,
-      .globalsize_x = local_size[0] * num_groups[0],
-   ));
-   OUT_REG(ring, SP_CS_NDRANGE_2(CHIP, 0));
-   OUT_REG(ring, SP_CS_NDRANGE_3(CHIP,
-      .globalsize_y = local_size[1] * num_groups[1],
-   ));
-   OUT_REG(ring, SP_CS_NDRANGE_4(CHIP, 0));
-   OUT_REG(ring, SP_CS_NDRANGE_5(CHIP,
-      .globalsize_z = local_size[2] * num_groups[2],
-   ));
-   OUT_REG(ring, SP_CS_NDRANGE_6(CHIP, 0));
-
-   OUT_REG(ring, SP_CS_KERNEL_GROUP_X(CHIP, 1));
-   OUT_REG(ring, SP_CS_KERNEL_GROUP_Y(CHIP, 1));
-   OUT_REG(ring, SP_CS_KERNEL_GROUP_Z(CHIP, 1));
-
    if (a6xx_backend->num_perfcntrs > 0) {
       a6xx_backend->query_mem = fd_bo_new(
          a6xx_backend->dev,
@@ -604,55 +612,55 @@ a6xx_emit_grid(struct kernel *kernel, uint32_t grid[3],
 
       for (unsigned i = 0; i < a6xx_backend->num_perfcntrs; i++) {
          const struct perfcntr *counter = &a6xx_backend->perfcntrs[i];
 
-         OUT_PKT4(ring, counter->select_reg, 1);
-         OUT_RING(ring, counter->selector);
+         fd_pkt4(cs, 1).add({
+            .reg = counter->select_reg,
+            .value = counter->selector,
+         });
      }
 
-      OUT_PKT7(ring, CP_WAIT_FOR_IDLE, 0);
+      fd_pkt7(cs, CP_WAIT_FOR_IDLE, 0);
 
      /* and snapshot the start values: */
      for (unsigned i = 0; i < a6xx_backend->num_perfcntrs; i++) {
         const struct perfcntr *counter = &a6xx_backend->perfcntrs[i];
 
-         OUT_PKT7(ring, CP_REG_TO_MEM, 3);
-         OUT_RING(ring, CP_REG_TO_MEM_0_64B |
-                           CP_REG_TO_MEM_0_REG(counter->counter_reg_lo));
-         OUT_RELOC(ring, query_sample_idx(a6xx_backend, i, start));
+         fd_pkt7(cs, CP_REG_TO_MEM, 3)
+            .add(CP_REG_TO_MEM_0(.reg = counter->counter_reg_lo, ._64b = true))
+            .add(CP_REG_TO_MEM_DEST(query_sample_idx(a6xx_backend, i, start)));
     }
   }
 
-   OUT_PKT7(ring, CP_EXEC_CS, 4);
-   OUT_RING(ring, 0x00000000);
-   OUT_RING(ring, CP_EXEC_CS_1_NGROUPS_X(grid[0]));
-   OUT_RING(ring, CP_EXEC_CS_2_NGROUPS_Y(grid[1]));
-   OUT_RING(ring, CP_EXEC_CS_3_NGROUPS_Z(grid[2]));
+   fd_pkt7(cs, CP_EXEC_CS, 4)
+      .add(CP_EXEC_CS_0())
+      .add(CP_EXEC_CS_1(.ngroups_x = grid[0]))
+      .add(CP_EXEC_CS_2(.ngroups_y = grid[1]))
+      .add(CP_EXEC_CS_3(.ngroups_z = grid[2]));
 
-   OUT_PKT7(ring, CP_WAIT_FOR_IDLE, 0);
+   fd_pkt7(cs, CP_WAIT_FOR_IDLE, 0);
 
    if (a6xx_backend->num_perfcntrs > 0) {
      /* snapshot the end values: */
      for (unsigned i = 0; i < a6xx_backend->num_perfcntrs; i++) {
         const struct perfcntr *counter = &a6xx_backend->perfcntrs[i];
 
-         OUT_PKT7(ring, CP_REG_TO_MEM, 3);
-         OUT_RING(ring, CP_REG_TO_MEM_0_64B |
-                           CP_REG_TO_MEM_0_REG(counter->counter_reg_lo));
-         OUT_RELOC(ring, query_sample_idx(a6xx_backend, i, stop));
+         fd_pkt7(cs, CP_REG_TO_MEM, 3)
+            .add(CP_REG_TO_MEM_0(.reg = counter->counter_reg_lo, ._64b = true))
+            .add(CP_REG_TO_MEM_DEST(query_sample_idx(a6xx_backend, i, stop)));
      }
 
     /* and compute the result: */
     for (unsigned i = 0; i < a6xx_backend->num_perfcntrs; i++) {
        /* result += stop - start: */
-         OUT_PKT7(ring, CP_MEM_TO_MEM, 9);
-         OUT_RING(ring, CP_MEM_TO_MEM_0_DOUBLE | CP_MEM_TO_MEM_0_NEG_C);
-         OUT_RELOC(ring, query_sample_idx(a6xx_backend, i, result)); /* dst */
-         OUT_RELOC(ring, query_sample_idx(a6xx_backend, i, result)); /* srcA */
-         OUT_RELOC(ring, query_sample_idx(a6xx_backend, i, stop)); /* srcB */
-         OUT_RELOC(ring, query_sample_idx(a6xx_backend, i, start)); /* srcC */
+         fd_pkt7(cs, CP_MEM_TO_MEM, 9)
+            .add(CP_MEM_TO_MEM_0(.neg_c = true, ._double = true))
+            .add(CP_MEM_TO_MEM_DST(query_sample_idx(a6xx_backend, i, result)))
+            .add(CP_MEM_TO_MEM_SRC_A(query_sample_idx(a6xx_backend, i, result)))
+            .add(CP_MEM_TO_MEM_SRC_B(query_sample_idx(a6xx_backend, i, stop)))
+            .add(CP_MEM_TO_MEM_SRC_C(query_sample_idx(a6xx_backend, i, start)));
     }
   }
 
-   cache_flush<CHIP>(ring, kernel);
+   cache_flush<CHIP>(cs, kernel);
 }
 
 static void
diff --git a/src/freedreno/computerator/meson.build b/src/freedreno/computerator/meson.build
index e3fec825bfc..55d9fed5fc9 100644
--- a/src/freedreno/computerator/meson.build
+++ b/src/freedreno/computerator/meson.build
@@ -12,6 +12,8 @@ computerator_files = [
 
 computerator_cpp_args = cpp.get_supported_arguments([
   '-Wno-array-bounds',
+  '-Wno-c++11-narrowing',
+  '-Wno-narrowing',
   '-Wno-sign-compare',
   ])
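
Illustrative note (not part of the patch): the conversion above replaces the OUT_PKT4()/OUT_RING()/OUT_PKT7()/OUT_RELOC() macros with the C++ helpers fd_cs, fd_crb/fd_ncrb, fd_pkt4 and fd_pkt7. The sketch below shows the emission pattern the new code follows, assuming only the helper behavior implied by the hunks (a register block collects packed register writes, and a PKT7 packet takes its payload dwords through chained add() calls); it relies on the same headers as a6xx.cc, and example_emit() is a hypothetical function used purely for illustration.

   /* Sketch only -- assumes the fd_cs/fd_crb/fd_pkt7 semantics implied by the
    * hunks above; example_emit() is hypothetical and not part of this change.
    */
   template <chip CHIP>
   static void
   example_emit(fd_cs &cs, struct ir3_shader_variant *v)
   {
      /* Old style (removed above): reserve a PKT4 and write raw dwords:
       *
       *    OUT_PKT4(ring, REG_A6XX_SP_CS_INSTR_SIZE, 1);
       *    OUT_RING(ring, v->instrlen);
       */

      /* New style: a register block batches packed register writes; the
       * constructor takes the command stream and a count of register writes
       * that follow, mirroring fd_crb crb(cs, 25) in cs_program_emit_regs().
       */
      fd_crb crb(cs, 1);
      crb.add(A6XX_SP_CS_INSTR_SIZE(v->instrlen));

      /* PKT7 packets are built the same way, with payload dwords appended
       * via add(); a zero-length packet needs no payload at all.
       */
      fd_pkt7(cs, CP_WAIT_FOR_IDLE, 0);
   }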