diff --git a/src/gallium/drivers/radeonsi/si_build_pm4.h b/src/gallium/drivers/radeonsi/si_build_pm4.h index 3ccf3529d56..e08ffe2f305 100644 --- a/src/gallium/drivers/radeonsi/si_build_pm4.h +++ b/src/gallium/drivers/radeonsi/si_build_pm4.h @@ -39,249 +39,251 @@ #define SI_CHECK_SHADOWED_REGS(reg_offset, count) #endif -static inline void radeon_set_config_reg_seq(struct radeon_cmdbuf *cs, unsigned reg, unsigned num) -{ - SI_CHECK_SHADOWED_REGS(reg, num); - assert(reg < SI_CONTEXT_REG_OFFSET); - assert(cs->current.cdw + 2 + num <= cs->current.max_dw); - radeon_emit(cs, PKT3(PKT3_SET_CONFIG_REG, num, 0)); - radeon_emit(cs, (reg - SI_CONFIG_REG_OFFSET) >> 2); -} +#define radeon_begin(cs) struct radeon_cmdbuf *__cs = (cs); \ + unsigned __cs_num = __cs->current.cdw; \ + UNUSED unsigned __cs_num_initial = __cs_num; \ + uint32_t *__cs_buf = __cs->current.buf -static inline void radeon_set_config_reg(struct radeon_cmdbuf *cs, unsigned reg, unsigned value) -{ - radeon_set_config_reg_seq(cs, reg, 1); - radeon_emit(cs, value); -} +#define radeon_begin_again(cs) do { \ + assert(__cs == NULL); \ + __cs = (cs); \ + __cs_num = __cs->current.cdw; \ + __cs_num_initial = __cs_num; \ + __cs_buf = __cs->current.buf; \ +} while (0) -static inline void radeon_set_context_reg_seq(struct radeon_cmdbuf *cs, unsigned reg, unsigned num) -{ - SI_CHECK_SHADOWED_REGS(reg, num); - assert(reg >= SI_CONTEXT_REG_OFFSET); - assert(cs->current.cdw + 2 + num <= cs->current.max_dw); - radeon_emit(cs, PKT3(PKT3_SET_CONTEXT_REG, num, 0)); - radeon_emit(cs, (reg - SI_CONTEXT_REG_OFFSET) >> 2); -} +#define radeon_end() do { \ + __cs->current.cdw = __cs_num; \ + assert(__cs->current.cdw <= __cs->current.max_dw); \ + __cs = NULL; \ +} while (0) -static inline void radeon_set_context_reg(struct radeon_cmdbuf *cs, unsigned reg, unsigned value) -{ - radeon_set_context_reg_seq(cs, reg, 1); - radeon_emit(cs, value); -} +#define radeon_emit(cs, value) __cs_buf[__cs_num++] = (value) +#define radeon_packets_added() (__cs_num != __cs_num_initial) -static inline void radeon_set_context_reg_seq_array(struct radeon_cmdbuf *cs, unsigned reg, - unsigned num, const uint32_t *values) -{ - radeon_set_context_reg_seq(cs, reg, num); - radeon_emit_array(cs, values, num); -} +#define radeon_end_update_context_roll(sctx) do { \ + radeon_end(); \ + if (radeon_packets_added()) \ + (sctx)->context_roll = true; \ +} while (0) -static inline void radeon_set_context_reg_idx(struct radeon_cmdbuf *cs, unsigned reg, unsigned idx, - unsigned value) -{ - SI_CHECK_SHADOWED_REGS(reg, 1); - assert(reg >= SI_CONTEXT_REG_OFFSET); - assert(cs->current.cdw + 3 <= cs->current.max_dw); - radeon_emit(cs, PKT3(PKT3_SET_CONTEXT_REG, 1, 0)); - radeon_emit(cs, (reg - SI_CONTEXT_REG_OFFSET) >> 2 | (idx << 28)); - radeon_emit(cs, value); -} +#define radeon_emit_array(cs, values, num) do { \ + unsigned __n = (num); \ + memcpy(__cs_buf + __cs_num, (values), __n * 4); \ + __cs_num += __n; \ +} while (0) -static inline void radeon_set_sh_reg_seq(struct radeon_cmdbuf *cs, unsigned reg, unsigned num) -{ - SI_CHECK_SHADOWED_REGS(reg, num); - assert(reg >= SI_SH_REG_OFFSET && reg < SI_SH_REG_END); - assert(cs->current.cdw + 2 + num <= cs->current.max_dw); - radeon_emit(cs, PKT3(PKT3_SET_SH_REG, num, 0)); - radeon_emit(cs, (reg - SI_SH_REG_OFFSET) >> 2); -} +#define radeon_set_config_reg_seq(cs, reg, num) do { \ + SI_CHECK_SHADOWED_REGS(reg, num); \ + assert((reg) < SI_CONTEXT_REG_OFFSET); \ + radeon_emit(cs, PKT3(PKT3_SET_CONFIG_REG, num, 0)); \ + radeon_emit(cs, ((reg) - 
SI_CONFIG_REG_OFFSET) >> 2); \ +} while (0) -static inline void radeon_set_sh_reg(struct radeon_cmdbuf *cs, unsigned reg, unsigned value) -{ - radeon_set_sh_reg_seq(cs, reg, 1); - radeon_emit(cs, value); -} +#define radeon_set_config_reg(cs, reg, value) do { \ + radeon_set_config_reg_seq(cs, reg, 1); \ + radeon_emit(cs, value); \ +} while (0) -static inline void radeon_set_uconfig_reg_seq(struct radeon_cmdbuf *cs, unsigned reg, unsigned num, bool perfctr) -{ - SI_CHECK_SHADOWED_REGS(reg, num); - assert(reg >= CIK_UCONFIG_REG_OFFSET && reg < CIK_UCONFIG_REG_END); - assert(cs->current.cdw + 2 + num <= cs->current.max_dw); - radeon_emit(cs, PKT3(PKT3_SET_UCONFIG_REG, num, perfctr)); - radeon_emit(cs, (reg - CIK_UCONFIG_REG_OFFSET) >> 2); -} +#define radeon_set_context_reg_seq(cs, reg, num) do { \ + SI_CHECK_SHADOWED_REGS(reg, num); \ + assert((reg) >= SI_CONTEXT_REG_OFFSET); \ + radeon_emit(cs, PKT3(PKT3_SET_CONTEXT_REG, num, 0)); \ + radeon_emit(cs, ((reg) - SI_CONTEXT_REG_OFFSET) >> 2); \ +} while (0) -static inline void radeon_set_uconfig_reg(struct radeon_cmdbuf *cs, unsigned reg, unsigned value) -{ - radeon_set_uconfig_reg_seq(cs, reg, 1, false); - radeon_emit(cs, value); -} +#define radeon_set_context_reg(cs, reg, value) do { \ + radeon_set_context_reg_seq(cs, reg, 1); \ + radeon_emit(cs, value); \ +} while (0) -static inline void radeon_set_uconfig_reg_perfctr(struct radeon_cmdbuf *cs, unsigned reg, unsigned value) -{ - radeon_set_uconfig_reg_seq(cs, reg, 1, true); - radeon_emit(cs, value); -} +#define radeon_set_context_reg_seq_array(cs, reg, num, values) do { \ + radeon_set_context_reg_seq(cs, reg, num); \ + radeon_emit_array(cs, values, num); \ +} while (0) -static inline void radeon_set_uconfig_reg_idx(struct radeon_cmdbuf *cs, struct si_screen *screen, - unsigned reg, unsigned idx, unsigned value) -{ - SI_CHECK_SHADOWED_REGS(reg, 1); - assert(reg >= CIK_UCONFIG_REG_OFFSET && reg < CIK_UCONFIG_REG_END); - assert(cs->current.cdw + 3 <= cs->current.max_dw); - assert(idx != 0); - unsigned opcode = PKT3_SET_UCONFIG_REG_INDEX; - if (screen->info.chip_class < GFX9 || - (screen->info.chip_class == GFX9 && screen->info.me_fw_version < 26)) - opcode = PKT3_SET_UCONFIG_REG; - radeon_emit(cs, PKT3(opcode, 1, 0)); - radeon_emit(cs, (reg - CIK_UCONFIG_REG_OFFSET) >> 2 | (idx << 28)); - radeon_emit(cs, value); -} +#define radeon_set_context_reg_idx(cs, reg, idx, value) do { \ + SI_CHECK_SHADOWED_REGS(reg, 1); \ + assert((reg) >= SI_CONTEXT_REG_OFFSET); \ + radeon_emit(cs, PKT3(PKT3_SET_CONTEXT_REG, 1, 0)); \ + radeon_emit(cs, ((reg) - SI_CONTEXT_REG_OFFSET) >> 2 | ((idx) << 28)); \ + radeon_emit(cs, value); \ +} while (0) -static inline void radeon_set_context_reg_rmw(struct radeon_cmdbuf *cs, unsigned reg, - unsigned value, unsigned mask) -{ - SI_CHECK_SHADOWED_REGS(reg, 1); - assert(reg >= SI_CONTEXT_REG_OFFSET); - assert(cs->current.cdw + 4 <= cs->current.max_dw); - radeon_emit(cs, PKT3(PKT3_CONTEXT_REG_RMW, 2, 0)); - radeon_emit(cs, (reg - SI_CONTEXT_REG_OFFSET) >> 2); - radeon_emit(cs, mask); - radeon_emit(cs, value); -} +#define radeon_set_sh_reg_seq(cs, reg, num) do { \ + SI_CHECK_SHADOWED_REGS(reg, num); \ + assert((reg) >= SI_SH_REG_OFFSET && (reg) < SI_SH_REG_END); \ + radeon_emit(cs, PKT3(PKT3_SET_SH_REG, num, 0)); \ + radeon_emit(cs, ((reg) - SI_SH_REG_OFFSET) >> 2); \ +} while (0) + +#define radeon_set_sh_reg(cs, reg, value) do { \ + radeon_set_sh_reg_seq(cs, reg, 1); \ + radeon_emit(cs, value); \ +} while (0) + +#define radeon_set_uconfig_reg_seq(cs, reg, num, perfctr) do { \ + 
SI_CHECK_SHADOWED_REGS(reg, num); \ + assert((reg) >= CIK_UCONFIG_REG_OFFSET && (reg) < CIK_UCONFIG_REG_END); \ + radeon_emit(cs, PKT3(PKT3_SET_UCONFIG_REG, num, perfctr)); \ + radeon_emit(cs, ((reg) - CIK_UCONFIG_REG_OFFSET) >> 2); \ +} while (0) + +#define radeon_set_uconfig_reg(cs, reg, value) do { \ + radeon_set_uconfig_reg_seq(cs, reg, 1, false); \ + radeon_emit(cs, value); \ +} while (0) + +#define radeon_set_uconfig_reg_perfctr(cs, reg, value) do { \ + radeon_set_uconfig_reg_seq(cs, reg, 1, true); \ + radeon_emit(cs, value); \ +} while (0) + +#define radeon_set_uconfig_reg_idx(cs, screen, chip_class, reg, idx, value) do { \ + SI_CHECK_SHADOWED_REGS(reg, 1); \ + assert((reg) >= CIK_UCONFIG_REG_OFFSET && (reg) < CIK_UCONFIG_REG_END); \ + assert((idx) != 0); \ + unsigned __opcode = PKT3_SET_UCONFIG_REG_INDEX; \ + if ((chip_class) < GFX9 || \ + ((chip_class) == GFX9 && (screen)->info.me_fw_version < 26)) \ + __opcode = PKT3_SET_UCONFIG_REG; \ + radeon_emit(cs, PKT3(__opcode, 1, 0)); \ + radeon_emit(cs, ((reg) - CIK_UCONFIG_REG_OFFSET) >> 2 | ((idx) << 28)); \ + radeon_emit(cs, value); \ +} while (0) + +#define radeon_set_context_reg_rmw(cs, reg, value, mask) do { \ + SI_CHECK_SHADOWED_REGS(reg, 1); \ + assert((reg) >= SI_CONTEXT_REG_OFFSET); \ + radeon_emit(cs, PKT3(PKT3_CONTEXT_REG_RMW, 2, 0)); \ + radeon_emit(cs, ((reg) - SI_CONTEXT_REG_OFFSET) >> 2); \ + radeon_emit(cs, mask); \ + radeon_emit(cs, value); \ +} while (0) /* Emit PKT3_CONTEXT_REG_RMW if the register value is different. */ -static inline void radeon_opt_set_context_reg_rmw(struct si_context *sctx, unsigned offset, - enum si_tracked_reg reg, unsigned value, - unsigned mask) -{ - struct radeon_cmdbuf *cs = &sctx->gfx_cs; - - assert((value & ~mask) == 0); - value &= mask; - - if (((sctx->tracked_regs.reg_saved >> reg) & 0x1) != 0x1 || - sctx->tracked_regs.reg_value[reg] != value) { - radeon_set_context_reg_rmw(cs, offset, value, mask); - - sctx->tracked_regs.reg_saved |= 0x1ull << reg; - sctx->tracked_regs.reg_value[reg] = value; - } -} +#define radeon_opt_set_context_reg_rmw(sctx, offset, reg, val, mask) do { \ + unsigned __value = (val); \ + assert((__value & ~mask) == 0); \ + __value &= mask; \ + if (((sctx->tracked_regs.reg_saved >> (reg)) & 0x1) != 0x1 || \ + sctx->tracked_regs.reg_value[reg] != __value) { \ + radeon_set_context_reg_rmw(&sctx->gfx_cs, offset, __value, mask); \ + sctx->tracked_regs.reg_saved |= 0x1ull << (reg); \ + sctx->tracked_regs.reg_value[reg] = __value; \ + } \ +} while (0) /* Emit PKT3_SET_CONTEXT_REG if the register value is different. */ -static inline void radeon_opt_set_context_reg(struct si_context *sctx, unsigned offset, - enum si_tracked_reg reg, unsigned value) -{ - struct radeon_cmdbuf *cs = &sctx->gfx_cs; - - if (((sctx->tracked_regs.reg_saved >> reg) & 0x1) != 0x1 || - sctx->tracked_regs.reg_value[reg] != value) { - radeon_set_context_reg(cs, offset, value); - - sctx->tracked_regs.reg_saved |= 0x1ull << reg; - sctx->tracked_regs.reg_value[reg] = value; - } -} +#define radeon_opt_set_context_reg(sctx, offset, reg, val) do { \ + unsigned __value = val; \ + if (((sctx->tracked_regs.reg_saved >> (reg)) & 0x1) != 0x1 || \ + sctx->tracked_regs.reg_value[reg] != __value) { \ + radeon_set_context_reg(&sctx->gfx_cs, offset, __value); \ + sctx->tracked_regs.reg_saved |= 0x1ull << (reg); \ + sctx->tracked_regs.reg_value[reg] = __value; \ + } \ +} while (0) /** * Set 2 consecutive registers if any registers value is different. 
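 *
 * A usage sketch (hedged; it mirrors the si_emit_db_render_state() hunk
 * later in this patch, with db_render_control/db_count_control standing
 * in for caller-computed values):
 *
 *    radeon_begin(&sctx->gfx_cs);
 *    radeon_opt_set_context_reg2(sctx, R_028000_DB_RENDER_CONTROL,
 *                                SI_TRACKED_DB_RENDER_CONTROL,
 *                                db_render_control, db_count_control);
 *    radeon_end_update_context_roll(sctx);
 *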
* @param offset starting register offset - * @param value1 is written to first register - * @param value2 is written to second register + * @param val1 is written to first register + * @param val2 is written to second register */ -static inline void radeon_opt_set_context_reg2(struct si_context *sctx, unsigned offset, - enum si_tracked_reg reg, unsigned value1, - unsigned value2) -{ - struct radeon_cmdbuf *cs = &sctx->gfx_cs; - - if (((sctx->tracked_regs.reg_saved >> reg) & 0x3) != 0x3 || - sctx->tracked_regs.reg_value[reg] != value1 || - sctx->tracked_regs.reg_value[reg + 1] != value2) { - radeon_set_context_reg_seq(cs, offset, 2); - radeon_emit(cs, value1); - radeon_emit(cs, value2); - - sctx->tracked_regs.reg_value[reg] = value1; - sctx->tracked_regs.reg_value[reg + 1] = value2; - sctx->tracked_regs.reg_saved |= 0x3ull << reg; - } -} +#define radeon_opt_set_context_reg2(sctx, offset, reg, val1, val2) do { \ + unsigned __value1 = (val1), __value2 = (val2); \ + if (((sctx->tracked_regs.reg_saved >> (reg)) & 0x3) != 0x3 || \ + sctx->tracked_regs.reg_value[reg] != __value1 || \ + sctx->tracked_regs.reg_value[(reg) + 1] != __value2) { \ + radeon_set_context_reg_seq(&sctx->gfx_cs, offset, 2); \ + radeon_emit(cs, __value1); \ + radeon_emit(cs, __value2); \ + sctx->tracked_regs.reg_value[reg] = __value1; \ + sctx->tracked_regs.reg_value[(reg) + 1] = __value2; \ + sctx->tracked_regs.reg_saved |= 0x3ull << (reg); \ + } \ +} while (0) /** * Set 3 consecutive registers if any registers value is different. */ -static inline void radeon_opt_set_context_reg3(struct si_context *sctx, unsigned offset, - enum si_tracked_reg reg, unsigned value1, - unsigned value2, unsigned value3) -{ - struct radeon_cmdbuf *cs = &sctx->gfx_cs; - - if (((sctx->tracked_regs.reg_saved >> reg) & 0x7) != 0x7 || - sctx->tracked_regs.reg_value[reg] != value1 || - sctx->tracked_regs.reg_value[reg + 1] != value2 || - sctx->tracked_regs.reg_value[reg + 2] != value3) { - radeon_set_context_reg_seq(cs, offset, 3); - radeon_emit(cs, value1); - radeon_emit(cs, value2); - radeon_emit(cs, value3); - - sctx->tracked_regs.reg_value[reg] = value1; - sctx->tracked_regs.reg_value[reg + 1] = value2; - sctx->tracked_regs.reg_value[reg + 2] = value3; - sctx->tracked_regs.reg_saved |= 0x7ull << reg; - } -} +#define radeon_opt_set_context_reg3(sctx, offset, reg, val1, val2, val3) do { \ + unsigned __value1 = (val1), __value2 = (val2), __value3 = (val3); \ + if (((sctx->tracked_regs.reg_saved >> (reg)) & 0x7) != 0x7 || \ + sctx->tracked_regs.reg_value[reg] != __value1 || \ + sctx->tracked_regs.reg_value[(reg) + 1] != __value2 || \ + sctx->tracked_regs.reg_value[(reg) + 2] != __value3) { \ + radeon_set_context_reg_seq(&sctx->gfx_cs, offset, 3); \ + radeon_emit(cs, __value1); \ + radeon_emit(cs, __value2); \ + radeon_emit(cs, __value3); \ + sctx->tracked_regs.reg_value[reg] = __value1; \ + sctx->tracked_regs.reg_value[(reg) + 1] = __value2; \ + sctx->tracked_regs.reg_value[(reg) + 2] = __value3; \ + sctx->tracked_regs.reg_saved |= 0x7ull << (reg); \ + } \ +} while (0) /** * Set 4 consecutive registers if any registers value is different. 
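 *
 * As in the 2- and 3-register variants, the low bits of
 * tracked_regs.reg_saved act as a validity mask: the packet is skipped
 * only when all four shadow slots are valid and already equal.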
*/ -static inline void radeon_opt_set_context_reg4(struct si_context *sctx, unsigned offset, - enum si_tracked_reg reg, unsigned value1, - unsigned value2, unsigned value3, unsigned value4) -{ - struct radeon_cmdbuf *cs = &sctx->gfx_cs; - - if (((sctx->tracked_regs.reg_saved >> reg) & 0xf) != 0xf || - sctx->tracked_regs.reg_value[reg] != value1 || - sctx->tracked_regs.reg_value[reg + 1] != value2 || - sctx->tracked_regs.reg_value[reg + 2] != value3 || - sctx->tracked_regs.reg_value[reg + 3] != value4) { - radeon_set_context_reg_seq(cs, offset, 4); - radeon_emit(cs, value1); - radeon_emit(cs, value2); - radeon_emit(cs, value3); - radeon_emit(cs, value4); - - sctx->tracked_regs.reg_value[reg] = value1; - sctx->tracked_regs.reg_value[reg + 1] = value2; - sctx->tracked_regs.reg_value[reg + 2] = value3; - sctx->tracked_regs.reg_value[reg + 3] = value4; - sctx->tracked_regs.reg_saved |= 0xfull << reg; - } -} +#define radeon_opt_set_context_reg4(sctx, offset, reg, val1, val2, val3, val4) do { \ + unsigned __value1 = (val1), __value2 = (val2), __value3 = (val3), __value4 = (val4); \ + if (((sctx->tracked_regs.reg_saved >> (reg)) & 0xf) != 0xf || \ + sctx->tracked_regs.reg_value[reg] != __value1 || \ + sctx->tracked_regs.reg_value[(reg) + 1] != __value2 || \ + sctx->tracked_regs.reg_value[(reg) + 2] != __value3 || \ + sctx->tracked_regs.reg_value[(reg) + 3] != __value4) { \ + radeon_set_context_reg_seq(&sctx->gfx_cs, offset, 4); \ + radeon_emit(cs, __value1); \ + radeon_emit(cs, __value2); \ + radeon_emit(cs, __value3); \ + radeon_emit(cs, __value4); \ + sctx->tracked_regs.reg_value[reg] = __value1; \ + sctx->tracked_regs.reg_value[(reg) + 1] = __value2; \ + sctx->tracked_regs.reg_value[(reg) + 2] = __value3; \ + sctx->tracked_regs.reg_value[(reg) + 3] = __value4; \ + sctx->tracked_regs.reg_saved |= 0xfull << (reg); \ + } \ +} while (0) /** * Set consecutive registers if any registers value is different. 
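 *
 * Unlike the fixed-count variants above, this one compares against a
 * caller-provided saved_val array, re-emits the whole range on the
 * first mismatch, then copies the new values back with memcpy.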
*/ -static inline void radeon_opt_set_context_regn(struct si_context *sctx, unsigned offset, - unsigned *value, unsigned *saved_val, unsigned num) -{ - struct radeon_cmdbuf *cs = &sctx->gfx_cs; +#define radeon_opt_set_context_regn(sctx, offset, value, saved_val, num) do { \ + for (unsigned i = 0; i < (num); i++) { \ + if ((saved_val)[i] != (value)[i]) { \ + radeon_set_context_reg_seq(&(sctx)->gfx_cs, offset, num); \ + for (unsigned j = 0; j < (num); j++) \ + radeon_emit(cs, value[j]); \ + memcpy(saved_val, value, sizeof(uint32_t) * (num)); \ + break; \ + } \ + } \ +} while (0) - for (unsigned i = 0; i < num; i++) { - if (saved_val[i] != value[i]) { - radeon_set_context_reg_seq(cs, offset, num); - for (unsigned j = 0; j < num; j++) - radeon_emit(cs, value[j]); +#define radeon_set_privileged_config_reg(cs, reg, value) do { \ + assert((reg) < CIK_UCONFIG_REG_OFFSET); \ + radeon_emit(cs, PKT3(PKT3_COPY_DATA, 4, 0)); \ + radeon_emit(cs, COPY_DATA_SRC_SEL(COPY_DATA_IMM) | \ + COPY_DATA_DST_SEL(COPY_DATA_PERF)); \ + radeon_emit(cs, value); \ + radeon_emit(cs, 0); /* unused */ \ + radeon_emit(cs, (reg) >> 2); \ + radeon_emit(cs, 0); /* unused */ \ +} while (0) - memcpy(saved_val, value, sizeof(uint32_t) * num); - break; - } - } -} +#define radeon_emit_32bit_pointer(sscreen, cs, va) do { \ + radeon_emit(cs, va); \ + assert((va) == 0 || ((va) >> 32) == sscreen->info.address32_hi); \ +} while (0) + +#define radeon_emit_one_32bit_pointer(sctx, desc, sh_base) do { \ + unsigned sh_offset = (sh_base) + (desc)->shader_userdata_offset; \ + radeon_set_sh_reg_seq(&sctx->gfx_cs, sh_offset, 1); \ + radeon_emit_32bit_pointer(sctx->screen, cs, (desc)->gpu_address); \ +} while (0) /* This should be evaluated at compile time if all parameters are constants. */ static ALWAYS_INLINE unsigned diff --git a/src/gallium/drivers/radeonsi/si_compute.c b/src/gallium/drivers/radeonsi/si_compute.c index c2b0c24887f..008972e27f3 100644 --- a/src/gallium/drivers/radeonsi/si_compute.c +++ b/src/gallium/drivers/radeonsi/si_compute.c @@ -349,6 +349,7 @@ void si_emit_initial_compute_regs(struct si_context *sctx, struct radeon_cmdbuf { uint64_t bc_va = sctx->border_color_buffer->gpu_address; + radeon_begin(cs); radeon_set_sh_reg_seq(cs, R_00B858_COMPUTE_STATIC_THREAD_MGMT_SE0, 2); /* R_00B858_COMPUTE_STATIC_THREAD_MGMT_SE0 / SE1, * renamed COMPUTE_DESTINATION_EN_SEn on gfx10. 
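 *
 * (radeon_begin(cs) above caches cs->current.cdw in a local; every
 * radeon_set_*_reg call in this function appends through that local
 * until radeon_end() writes the final count back.)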
*/ @@ -404,6 +405,7 @@ void si_emit_initial_compute_regs(struct si_context *sctx, struct radeon_cmdbuf radeon_set_sh_reg(cs, R_00B8A0_COMPUTE_PGM_RSRC3, 0); radeon_set_sh_reg(cs, R_00B9F4_COMPUTE_DISPATCH_TUNNEL, 0); } + radeon_end(); } static bool si_setup_compute_scratch_buffer(struct si_context *sctx, struct si_shader *shader, @@ -505,6 +507,7 @@ static bool si_switch_compute_shader(struct si_context *sctx, struct si_compute radeon_add_to_buffer_list(sctx, &sctx->gfx_cs, shader->bo, RADEON_USAGE_READ, RADEON_PRIO_SHADER_BINARY); + radeon_begin(cs); radeon_set_sh_reg_seq(cs, R_00B830_COMPUTE_PGM_LO, 2); radeon_emit(cs, shader_va >> 8); radeon_emit(cs, S_00B834_DATA(shader_va >> 40)); @@ -524,6 +527,7 @@ static bool si_switch_compute_shader(struct si_context *sctx, struct si_compute radeon_set_sh_reg(cs, R_00B860_COMPUTE_TMPRING_SIZE, S_00B860_WAVES(sctx->scratch_waves) | S_00B860_WAVESIZE(sctx->max_seen_compute_scratch_bytes_per_wave >> 10)); + radeon_end(); sctx->cs_shader_state.emitted_program = program; sctx->cs_shader_state.offset = offset; @@ -562,11 +566,13 @@ static void setup_scratch_rsrc_user_sgprs(struct si_context *sctx, } } + radeon_begin(cs); radeon_set_sh_reg_seq(cs, R_00B900_COMPUTE_USER_DATA_0 + (user_sgpr * 4), 4); radeon_emit(cs, scratch_dword0); radeon_emit(cs, scratch_dword1); radeon_emit(cs, scratch_dword2); radeon_emit(cs, scratch_dword3); + radeon_end(); } static void si_setup_user_sgprs_co_v2(struct si_context *sctx, const amd_kernel_code_t *code_object, @@ -589,6 +595,8 @@ static void si_setup_user_sgprs_co_v2(struct si_context *sctx, const amd_kernel_ user_sgpr += 4; } + radeon_begin(cs); + if (AMD_HSA_BITS_GET(code_object->code_properties, AMD_CODE_PROPERTY_ENABLE_SGPR_DISPATCH_PTR)) { struct dispatch_packet dispatch; unsigned dispatch_offset; @@ -646,6 +654,7 @@ static void si_setup_user_sgprs_co_v2(struct si_context *sctx, const amd_kernel_ user_sgpr += 1; } } + radeon_end(); } static bool si_upload_compute_input(struct si_context *sctx, const amd_kernel_code_t *code_object, @@ -693,13 +702,18 @@ static void si_setup_nir_user_data(struct si_context *sctx, const struct pipe_gr 12 * sel->info.uses_grid_size; unsigned cs_user_data_reg = block_size_reg + 12 * program->sel.info.uses_variable_block_size; + radeon_begin(cs); + if (sel->info.uses_grid_size) { if (info->indirect) { + radeon_end(); + for (unsigned i = 0; i < 3; ++i) { si_cp_copy_data(sctx, &sctx->gfx_cs, COPY_DATA_REG, NULL, (grid_size_reg >> 2) + i, COPY_DATA_SRC_MEM, si_resource(info->indirect), info->indirect_offset + 4 * i); } + radeon_begin_again(cs); } else { radeon_set_sh_reg_seq(cs, grid_size_reg, 3); radeon_emit(cs, info->grid[0]); @@ -719,6 +733,7 @@ static void si_setup_nir_user_data(struct si_context *sctx, const struct pipe_gr radeon_set_sh_reg_seq(cs, cs_user_data_reg, sel->info.base.cs.user_data_components_amd); radeon_emit_array(cs, sctx->cs_user_data, sel->info.base.cs.user_data_components_amd); } + radeon_end(); } static void si_emit_dispatch_packets(struct si_context *sctx, const struct pipe_grid_info *info) @@ -734,6 +749,7 @@ static void si_emit_dispatch_packets(struct si_context *sctx, const struct pipe_ if (sctx->chip_class >= GFX10 && waves_per_threadgroup == 1) threadgroups_per_cu = 2; + radeon_begin(cs); radeon_set_sh_reg( cs, R_00B854_COMPUTE_RESOURCE_LIMITS, ac_get_compute_resource_limits(&sscreen->info, waves_per_threadgroup, @@ -795,9 +811,10 @@ static void si_emit_dispatch_packets(struct si_context *sctx, const struct pipe_ } if (unlikely(sctx->thread_trace_enabled && 
sctx->chip_class >= GFX9)) { - radeon_emit(&sctx->gfx_cs, PKT3(PKT3_EVENT_WRITE, 0, 0)); - radeon_emit(&sctx->gfx_cs, EVENT_TYPE(V_028A90_THREAD_TRACE_MARKER) | EVENT_INDEX(0)); + radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 0, 0)); + radeon_emit(cs, EVENT_TYPE(V_028A90_THREAD_TRACE_MARKER) | EVENT_INDEX(0)); } + radeon_end(); } static void si_launch_grid(struct pipe_context *ctx, const struct pipe_grid_info *info) diff --git a/src/gallium/drivers/radeonsi/si_compute_prim_discard.c b/src/gallium/drivers/radeonsi/si_compute_prim_discard.c index 4c94f2c53e3..bad93320496 100644 --- a/src/gallium/drivers/radeonsi/si_compute_prim_discard.c +++ b/src/gallium/drivers/radeonsi/si_compute_prim_discard.c @@ -1084,8 +1084,10 @@ si_prepare_prim_discard_or_split_draw(struct si_context *sctx, const struct pipe */ if (!radeon_emitted(gfx_cs, sctx->initial_gfx_cs_size) && gfx_cs->current.cdw + need_gfx_dw > gfx_cs->current.max_dw) { + radeon_begin(gfx_cs); radeon_emit(gfx_cs, PKT3(PKT3_NOP, 0, 0)); radeon_emit(gfx_cs, 0); + radeon_end(); } si_flush_gfx_cs(sctx, RADEON_FLUSH_ASYNC_START_NEXT_GFX_IB_NOW, NULL); @@ -1184,6 +1186,7 @@ void si_dispatch_prim_discard_cs_and_draw(struct si_context *sctx, * TTM buffer moves in the kernel. */ if (sctx->chip_class >= GFX10) { + radeon_begin(cs); radeon_emit(cs, PKT3(PKT3_ACQUIRE_MEM, 6, 0)); radeon_emit(cs, 0); /* CP_COHER_CNTL */ radeon_emit(cs, 0xffffffff); /* CP_COHER_SIZE */ @@ -1195,6 +1198,7 @@ void si_dispatch_prim_discard_cs_and_draw(struct si_context *sctx, S_586_GLI_INV(V_586_GLI_ALL) | S_586_GLK_INV(1) | S_586_GLV_INV(1) | S_586_GL1_INV(1) | S_586_GL2_INV(1) | S_586_GL2_WB(1) | S_586_GLM_INV(1) | S_586_GLM_WB(1) | S_586_SEQ(V_586_SEQ_FORWARD)); + radeon_end(); } else { si_emit_surface_sync(sctx, cs, S_0085F0_TC_ACTION_ENA(1) | S_0085F0_TCL1_ACTION_ENA(1) | @@ -1211,6 +1215,7 @@ void si_dispatch_prim_discard_cs_and_draw(struct si_context *sctx, si_emit_initial_compute_regs(sctx, cs); + radeon_begin(cs); radeon_set_sh_reg( cs, R_00B860_COMPUTE_TMPRING_SIZE, S_00B860_WAVES(sctx->scratch_waves) | S_00B860_WAVESIZE(0)); /* no scratch */ @@ -1231,6 +1236,7 @@ void si_dispatch_prim_discard_cs_and_draw(struct si_context *sctx, radeon_emit(cs, 0); radeon_emit(cs, S_03107C_ENABLE(0)); } + radeon_end(); if (sctx->last_ib_barrier_buf) { assert(!sctx->last_ib_barrier_fence); @@ -1349,6 +1355,7 @@ void si_dispatch_prim_discard_cs_and_draw(struct si_context *sctx, * in parallel with compute shaders. 
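 *
 * (The variable-length WRITE_DATA burst below is bracketed by
 * radeon_begin()/radeon_end(), so its payload, which scales with
 * gds_size, is counted through the cached __cs_num.)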
*/ if (first_dispatch) { + radeon_begin(cs); radeon_emit(cs, PKT3(PKT3_WRITE_DATA, 2 + gds_size / 4, 0)); radeon_emit(cs, S_370_DST_SEL(V_370_GDS) | S_370_WR_CONFIRM(1)); radeon_emit(cs, gds_offset); @@ -1356,6 +1363,7 @@ void si_dispatch_prim_discard_cs_and_draw(struct si_context *sctx, radeon_emit(cs, 0); /* value to write */ if (gds_size == 8) radeon_emit(cs, 0); + radeon_end(); } } @@ -1370,6 +1378,7 @@ void si_dispatch_prim_discard_cs_and_draw(struct si_context *sctx, assert(shader->config.scratch_bytes_per_wave == 0); assert(shader->config.num_vgprs * WAVES_PER_TG <= 256 * 4); + radeon_begin(cs); radeon_set_sh_reg_seq(cs, R_00B830_COMPUTE_PGM_LO, 2); radeon_emit(cs, shader_va >> 8); radeon_emit(cs, S_00B834_DATA(shader_va >> 40)); @@ -1390,6 +1399,7 @@ void si_dispatch_prim_discard_cs_and_draw(struct si_context *sctx, radeon_set_sh_reg(cs, R_00B854_COMPUTE_RESOURCE_LIMITS, ac_get_compute_resource_limits(&sctx->screen->info, WAVES_PER_TG, MAX_WAVES_PER_SH, THREADGROUPS_PER_CU)); + radeon_end(); sctx->compute_ib_last_shader = shader; } @@ -1417,8 +1427,10 @@ void si_dispatch_prim_discard_cs_and_draw(struct si_context *sctx, sctx->compute_rewind_va = gfx_cs->gpu_address + (gfx_cs->current.cdw + 1) * 4; if (sctx->chip_class <= GFX7 || FORCE_REWIND_EMULATION) { + radeon_begin(gfx_cs); radeon_emit(gfx_cs, PKT3(PKT3_NOP, 0, 0)); radeon_emit(gfx_cs, 0); + radeon_end(); si_cp_wait_mem( sctx, gfx_cs, @@ -1430,8 +1442,10 @@ void si_dispatch_prim_discard_cs_and_draw(struct si_context *sctx, */ sctx->ws->cs_check_space(gfx_cs, 0, true); } else { + radeon_begin(gfx_cs); radeon_emit(gfx_cs, PKT3(PKT3_REWIND, 0, 0)); radeon_emit(gfx_cs, 0); + radeon_end(); } } @@ -1441,12 +1455,16 @@ void si_dispatch_prim_discard_cs_and_draw(struct si_context *sctx, uint64_t index_va = out_indexbuf_va + start_prim * 12; /* Emit the draw packet into the gfx IB. */ + radeon_begin(gfx_cs); radeon_emit(gfx_cs, PKT3(PKT3_DRAW_INDEX_2, 4, 0)); radeon_emit(gfx_cs, num_prims * vertices_per_prim); radeon_emit(gfx_cs, index_va); radeon_emit(gfx_cs, index_va >> 32); radeon_emit(gfx_cs, 0); radeon_emit(gfx_cs, V_0287F0_DI_SRC_SEL_DMA); + radeon_end(); + + radeon_begin_again(cs); /* Continue with the compute IB. */ if (start_prim == 0) { @@ -1503,6 +1521,7 @@ void si_dispatch_prim_discard_cs_and_draw(struct si_context *sctx, radeon_emit(cs, S_00B800_COMPUTE_SHADER_EN(1) | S_00B800_PARTIAL_TG_EN(!!partial_block_size) | S_00B800_ORDERED_APPEND_ENBL(VERTEX_COUNTER_GDS_MODE == 2) | S_00B800_ORDER_MODE(0 /* launch in order */)); + radeon_end(); /* This is only for unordered append. Ordered append writes this from * the shader. diff --git a/src/gallium/drivers/radeonsi/si_cp_dma.c b/src/gallium/drivers/radeonsi/si_cp_dma.c index 7945143f916..5cd30e50b60 100644 --- a/src/gallium/drivers/radeonsi/si_cp_dma.c +++ b/src/gallium/drivers/radeonsi/si_cp_dma.c @@ -24,6 +24,7 @@ #include "si_pipe.h" #include "sid.h" +#include "si_build_pm4.h" /* Set this if you want the ME to wait until CP DMA is done. * It should be set on the last CP DMA packet. 
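 *
 * (Unrelated to this flag: si_build_pm4.h is now included at the top of
 * this file because radeon_emit() and friends are macros defined there.)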
*/ @@ -102,6 +103,8 @@ static void si_emit_cp_dma(struct si_context *sctx, struct radeon_cmdbuf *cs, ui S_411_SRC_SEL(V_411_SRC_ADDR_TC_L2) | S_500_SRC_CACHE_POLICY(cache_policy == L2_STREAM); } + radeon_begin(cs); + if (sctx->chip_class >= GFX7) { radeon_emit(cs, PKT3(PKT3_DMA_DATA, 5, 0)); radeon_emit(cs, header); @@ -130,6 +133,7 @@ static void si_emit_cp_dma(struct si_context *sctx, struct radeon_cmdbuf *cs, ui radeon_emit(cs, PKT3(PKT3_PFP_SYNC_ME, 0, 0)); radeon_emit(cs, 0); } + radeon_end(); } void si_cp_dma_wait_for_idle(struct si_context *sctx, struct radeon_cmdbuf *cs) @@ -428,6 +432,7 @@ void si_cp_dma_prefetch(struct si_context *sctx, struct pipe_resource *buf, } struct radeon_cmdbuf *cs = &sctx->gfx_cs; + radeon_begin(cs); radeon_emit(cs, PKT3(PKT3_DMA_DATA, 5, 0)); radeon_emit(cs, header); radeon_emit(cs, address); /* SRC_ADDR_LO [31:0] */ @@ -435,6 +440,7 @@ void si_cp_dma_prefetch(struct si_context *sctx, struct pipe_resource *buf, radeon_emit(cs, address); /* DST_ADDR_LO [31:0] */ radeon_emit(cs, address >> 32); /* DST_ADDR_HI [31:0] */ radeon_emit(cs, command); + radeon_end(); } void si_test_gds(struct si_context *sctx) @@ -495,11 +501,13 @@ void si_cp_write_data(struct si_context *sctx, struct si_resource *buf, unsigned radeon_add_to_buffer_list(sctx, cs, buf, RADEON_USAGE_WRITE, RADEON_PRIO_CP_DMA); uint64_t va = buf->gpu_address + offset; + radeon_begin(cs); radeon_emit(cs, PKT3(PKT3_WRITE_DATA, 2 + size / 4, 0)); radeon_emit(cs, S_370_DST_SEL(dst_sel) | S_370_WR_CONFIRM(1) | S_370_ENGINE_SEL(engine)); radeon_emit(cs, va); radeon_emit(cs, va >> 32); radeon_emit_array(cs, (const uint32_t *)data, size / 4); + radeon_end(); } void si_cp_copy_data(struct si_context *sctx, struct radeon_cmdbuf *cs, unsigned dst_sel, @@ -517,10 +525,12 @@ void si_cp_copy_data(struct si_context *sctx, struct radeon_cmdbuf *cs, unsigned uint64_t dst_va = (dst ? dst->gpu_address : 0ull) + dst_offset; uint64_t src_va = (src ? src->gpu_address : 0ull) + src_offset; + radeon_begin(cs); radeon_emit(cs, PKT3(PKT3_COPY_DATA, 4, 0)); radeon_emit(cs, COPY_DATA_SRC_SEL(src_sel) | COPY_DATA_DST_SEL(dst_sel) | COPY_DATA_WR_CONFIRM); radeon_emit(cs, src_va); radeon_emit(cs, src_va >> 32); radeon_emit(cs, dst_va); radeon_emit(cs, dst_va >> 32); + radeon_end(); } diff --git a/src/gallium/drivers/radeonsi/si_cp_reg_shadowing.c b/src/gallium/drivers/radeonsi/si_cp_reg_shadowing.c index d48fb14278b..ad9341a83bb 100644 --- a/src/gallium/drivers/radeonsi/si_cp_reg_shadowing.c +++ b/src/gallium/drivers/radeonsi/si_cp_reg_shadowing.c @@ -144,6 +144,15 @@ si_create_shadowing_ib_preamble(struct si_context *sctx) return pm4; } +static void si_set_context_reg_array(struct radeon_cmdbuf *cs, unsigned reg, unsigned num, + const uint32_t *values) +{ + radeon_begin(cs); + radeon_set_context_reg_seq(cs, reg, num); + radeon_emit_array(cs, values, num); + radeon_end(); +} + void si_init_cp_reg_shadowing(struct si_context *sctx) { if (sctx->screen->info.mid_command_buffer_preemption_enabled || @@ -174,8 +183,7 @@ void si_init_cp_reg_shadowing(struct si_context *sctx) radeon_add_to_buffer_list(sctx, &sctx->gfx_cs, sctx->shadowed_regs, RADEON_USAGE_READWRITE, RADEON_PRIO_DESCRIPTORS); si_pm4_emit(sctx, shadowing_preamble); - ac_emulate_clear_state(&sctx->screen->info, &sctx->gfx_cs, - radeon_set_context_reg_seq_array); + ac_emulate_clear_state(&sctx->screen->info, &sctx->gfx_cs, si_set_context_reg_array); si_pm4_emit(sctx, sctx->cs_preamble_state); /* The register values are shadowed, so we won't need to set them again. 
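 *
 * (The si_set_context_reg_array() wrapper added above exists because
 * ac_emulate_clear_state() takes a function pointer, which the new
 * radeon_set_context_reg_seq macro can no longer provide.)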
*/ diff --git a/src/gallium/drivers/radeonsi/si_descriptors.c b/src/gallium/drivers/radeonsi/si_descriptors.c index 75cfc1c8662..fdb0333c3a1 100644 --- a/src/gallium/drivers/radeonsi/si_descriptors.c +++ b/src/gallium/drivers/radeonsi/si_descriptors.c @@ -1930,82 +1930,59 @@ void si_shader_change_notify(struct si_context *sctx) PIPE_SHADER_TESS_EVAL)); } -static void si_emit_shader_pointer_head(struct radeon_cmdbuf *cs, unsigned sh_offset, - unsigned pointer_count) -{ - SI_CHECK_SHADOWED_REGS(sh_offset, pointer_count); - radeon_emit(cs, PKT3(PKT3_SET_SH_REG, pointer_count, 0)); - radeon_emit(cs, (sh_offset - SI_SH_REG_OFFSET) >> 2); -} - -static void si_emit_shader_pointer_body(struct si_screen *sscreen, struct radeon_cmdbuf *cs, - uint64_t va) -{ - radeon_emit(cs, va); - - assert(va == 0 || (va >> 32) == sscreen->info.address32_hi); -} - -static void si_emit_shader_pointer(struct si_context *sctx, struct si_descriptors *desc, - unsigned sh_base) -{ - struct radeon_cmdbuf *cs = &sctx->gfx_cs; - unsigned sh_offset = sh_base + desc->shader_userdata_offset; - - si_emit_shader_pointer_head(cs, sh_offset, 1); - si_emit_shader_pointer_body(sctx->screen, cs, desc->gpu_address); -} - -static void si_emit_consecutive_shader_pointers(struct si_context *sctx, unsigned pointer_mask, - unsigned sh_base) -{ - if (!sh_base) - return; - - struct radeon_cmdbuf *cs = &sctx->gfx_cs; - unsigned mask = sctx->shader_pointers_dirty & pointer_mask; - - while (mask) { - int start, count; - u_bit_scan_consecutive_range(&mask, &start, &count); - - struct si_descriptors *descs = &sctx->descriptors[start]; - unsigned sh_offset = sh_base + descs->shader_userdata_offset; - - si_emit_shader_pointer_head(cs, sh_offset, count); - for (int i = 0; i < count; i++) - si_emit_shader_pointer_body(sctx->screen, cs, descs[i].gpu_address); - } -} +#define si_emit_consecutive_shader_pointers(sctx, pointer_mask, sh_base) do { \ + unsigned sh_reg_base = (sh_base); \ + if (sh_reg_base) { \ + unsigned mask = sctx->shader_pointers_dirty & (pointer_mask); \ + \ + while (mask) { \ + int start, count; \ + u_bit_scan_consecutive_range(&mask, &start, &count); \ + \ + struct si_descriptors *descs = &sctx->descriptors[start]; \ + unsigned sh_offset = sh_reg_base + descs->shader_userdata_offset; \ + \ + radeon_set_sh_reg_seq(&sctx->gfx_cs, sh_offset, count); \ + for (int i = 0; i < count; i++) \ + radeon_emit_32bit_pointer(sctx->screen, cs, descs[i].gpu_address); \ + } \ + } \ +} while (0) static void si_emit_global_shader_pointers(struct si_context *sctx, struct si_descriptors *descs) { + radeon_begin(&sctx->gfx_cs); + if (sctx->chip_class >= GFX10) { - si_emit_shader_pointer(sctx, descs, R_00B030_SPI_SHADER_USER_DATA_PS_0); + radeon_emit_one_32bit_pointer(sctx, descs, R_00B030_SPI_SHADER_USER_DATA_PS_0); /* HW VS stage only used in non-NGG mode. */ - si_emit_shader_pointer(sctx, descs, R_00B130_SPI_SHADER_USER_DATA_VS_0); - si_emit_shader_pointer(sctx, descs, R_00B230_SPI_SHADER_USER_DATA_GS_0); - si_emit_shader_pointer(sctx, descs, R_00B430_SPI_SHADER_USER_DATA_HS_0); + radeon_emit_one_32bit_pointer(sctx, descs, R_00B130_SPI_SHADER_USER_DATA_VS_0); + radeon_emit_one_32bit_pointer(sctx, descs, R_00B230_SPI_SHADER_USER_DATA_GS_0); + radeon_emit_one_32bit_pointer(sctx, descs, R_00B430_SPI_SHADER_USER_DATA_HS_0); + radeon_end(); return; } else if (sctx->chip_class == GFX9 && sctx->shadowed_regs) { /* We can't use the COMMON registers with register shadowing. 
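 *
 * (Each early-return branch of this function now closes the emit
 * window with its own radeon_end() before returning.)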
*/ - si_emit_shader_pointer(sctx, descs, R_00B030_SPI_SHADER_USER_DATA_PS_0); - si_emit_shader_pointer(sctx, descs, R_00B130_SPI_SHADER_USER_DATA_VS_0); - si_emit_shader_pointer(sctx, descs, R_00B330_SPI_SHADER_USER_DATA_ES_0); - si_emit_shader_pointer(sctx, descs, R_00B430_SPI_SHADER_USER_DATA_LS_0); + radeon_emit_one_32bit_pointer(sctx, descs, R_00B030_SPI_SHADER_USER_DATA_PS_0); + radeon_emit_one_32bit_pointer(sctx, descs, R_00B130_SPI_SHADER_USER_DATA_VS_0); + radeon_emit_one_32bit_pointer(sctx, descs, R_00B330_SPI_SHADER_USER_DATA_ES_0); + radeon_emit_one_32bit_pointer(sctx, descs, R_00B430_SPI_SHADER_USER_DATA_LS_0); + radeon_end(); return; } else if (sctx->chip_class == GFX9) { /* Broadcast it to all shader stages. */ - si_emit_shader_pointer(sctx, descs, R_00B530_SPI_SHADER_USER_DATA_COMMON_0); + radeon_emit_one_32bit_pointer(sctx, descs, R_00B530_SPI_SHADER_USER_DATA_COMMON_0); + radeon_end(); return; } - si_emit_shader_pointer(sctx, descs, R_00B030_SPI_SHADER_USER_DATA_PS_0); - si_emit_shader_pointer(sctx, descs, R_00B130_SPI_SHADER_USER_DATA_VS_0); - si_emit_shader_pointer(sctx, descs, R_00B330_SPI_SHADER_USER_DATA_ES_0); - si_emit_shader_pointer(sctx, descs, R_00B230_SPI_SHADER_USER_DATA_GS_0); - si_emit_shader_pointer(sctx, descs, R_00B430_SPI_SHADER_USER_DATA_HS_0); - si_emit_shader_pointer(sctx, descs, R_00B530_SPI_SHADER_USER_DATA_LS_0); + radeon_emit_one_32bit_pointer(sctx, descs, R_00B030_SPI_SHADER_USER_DATA_PS_0); + radeon_emit_one_32bit_pointer(sctx, descs, R_00B130_SPI_SHADER_USER_DATA_VS_0); + radeon_emit_one_32bit_pointer(sctx, descs, R_00B330_SPI_SHADER_USER_DATA_ES_0); + radeon_emit_one_32bit_pointer(sctx, descs, R_00B230_SPI_SHADER_USER_DATA_GS_0); + radeon_emit_one_32bit_pointer(sctx, descs, R_00B430_SPI_SHADER_USER_DATA_HS_0); + radeon_emit_one_32bit_pointer(sctx, descs, R_00B530_SPI_SHADER_USER_DATA_LS_0); + radeon_end(); } void si_emit_graphics_shader_pointers(struct si_context *sctx) @@ -2016,6 +1993,7 @@ void si_emit_graphics_shader_pointers(struct si_context *sctx) si_emit_global_shader_pointers(sctx, &sctx->descriptors[SI_DESCS_RW_BUFFERS]); } + radeon_begin(&sctx->gfx_cs); si_emit_consecutive_shader_pointers(sctx, SI_DESCS_SHADER_MASK(VERTEX), sh_base[PIPE_SHADER_VERTEX]); si_emit_consecutive_shader_pointers(sctx, SI_DESCS_SHADER_MASK(TESS_EVAL), @@ -2030,8 +2008,6 @@ void si_emit_graphics_shader_pointers(struct si_context *sctx) sctx->shader_pointers_dirty &= ~u_bit_consecutive(SI_DESCS_RW_BUFFERS, SI_DESCS_FIRST_COMPUTE); if (sctx->vertex_buffer_pointer_dirty && sctx->num_vertex_elements) { - struct radeon_cmdbuf *cs = &sctx->gfx_cs; - /* Find the location of the VB descriptor pointer. 
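 *
 * (The cs argument still passed to radeon_set_sh_reg_seq() and
 * radeon_emit_32bit_pointer() below is discarded by those macros; the
 * writes go through the __cs_buf captured by radeon_begin() above,
 * which is why the local cs declaration could be dropped.)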
*/ unsigned sh_dw_offset = SI_VS_NUM_USER_SGPR; if (sctx->chip_class >= GFX9) { @@ -2042,22 +2018,22 @@ void si_emit_graphics_shader_pointers(struct si_context *sctx) } unsigned sh_offset = sh_base[PIPE_SHADER_VERTEX] + sh_dw_offset * 4; - si_emit_shader_pointer_head(cs, sh_offset, 1); - si_emit_shader_pointer_body( + radeon_set_sh_reg_seq(cs, sh_offset, 1); + radeon_emit_32bit_pointer( sctx->screen, cs, sctx->vb_descriptors_buffer->gpu_address + sctx->vb_descriptors_offset); sctx->vertex_buffer_pointer_dirty = false; } if (sctx->vertex_buffer_user_sgprs_dirty && sctx->num_vertex_elements && sctx->screen->num_vbos_in_user_sgprs) { - struct radeon_cmdbuf *cs = &sctx->gfx_cs; unsigned num_desc = MIN2(sctx->num_vertex_elements, sctx->screen->num_vbos_in_user_sgprs); unsigned sh_offset = sh_base[PIPE_SHADER_VERTEX] + SI_SGPR_VS_VB_DESCRIPTOR_FIRST * 4; - si_emit_shader_pointer_head(cs, sh_offset, num_desc * 4); + radeon_set_sh_reg_seq(cs, sh_offset, num_desc * 4); radeon_emit_array(cs, sctx->vb_descriptor_user_sgprs, num_desc * 4); sctx->vertex_buffer_user_sgprs_dirty = false; } + radeon_end(); if (sctx->graphics_bindless_pointer_dirty) { si_emit_global_shader_pointers(sctx, &sctx->bindless_descriptors); @@ -2071,12 +2047,13 @@ void si_emit_compute_shader_pointers(struct si_context *sctx) struct si_shader_selector *shader = &sctx->cs_shader_state.program->sel; unsigned base = R_00B900_COMPUTE_USER_DATA_0; + radeon_begin(cs); si_emit_consecutive_shader_pointers(sctx, SI_DESCS_SHADER_MASK(COMPUTE), R_00B900_COMPUTE_USER_DATA_0); sctx->shader_pointers_dirty &= ~SI_DESCS_SHADER_MASK(COMPUTE); if (sctx->compute_bindless_pointer_dirty) { - si_emit_shader_pointer(sctx, &sctx->bindless_descriptors, base); + radeon_emit_one_32bit_pointer(sctx, &sctx->bindless_descriptors, base); sctx->compute_bindless_pointer_dirty = false; } @@ -2085,9 +2062,9 @@ void si_emit_compute_shader_pointers(struct si_context *sctx) if (num_shaderbufs && sctx->compute_shaderbuf_sgprs_dirty) { struct si_descriptors *desc = si_const_and_shader_buffer_descriptors(sctx, PIPE_SHADER_COMPUTE); - si_emit_shader_pointer_head(cs, R_00B900_COMPUTE_USER_DATA_0 + - shader->cs_shaderbufs_sgpr_index * 4, - num_shaderbufs * 4); + radeon_set_sh_reg_seq(cs, R_00B900_COMPUTE_USER_DATA_0 + + shader->cs_shaderbufs_sgpr_index * 4, + num_shaderbufs * 4); for (unsigned i = 0; i < num_shaderbufs; i++) radeon_emit_array(cs, &desc->list[si_get_shaderbuf_slot(i) * 4], 4); @@ -2100,9 +2077,9 @@ void si_emit_compute_shader_pointers(struct si_context *sctx) if (num_images && sctx->compute_image_sgprs_dirty) { struct si_descriptors *desc = si_sampler_and_image_descriptors(sctx, PIPE_SHADER_COMPUTE); - si_emit_shader_pointer_head(cs, R_00B900_COMPUTE_USER_DATA_0 + - shader->cs_images_sgpr_index * 4, - shader->cs_images_num_sgprs); + radeon_set_sh_reg_seq(cs, R_00B900_COMPUTE_USER_DATA_0 + + shader->cs_images_sgpr_index * 4, + shader->cs_images_num_sgprs); for (unsigned i = 0; i < num_images; i++) { unsigned desc_offset = si_get_image_slot(i) * 8; @@ -2119,6 +2096,7 @@ void si_emit_compute_shader_pointers(struct si_context *sctx) sctx->compute_image_sgprs_dirty = false; } + radeon_end(); } /* BINDLESS */ diff --git a/src/gallium/drivers/radeonsi/si_fence.c b/src/gallium/drivers/radeonsi/si_fence.c index ab48a71729e..4159364265d 100644 --- a/src/gallium/drivers/radeonsi/si_fence.c +++ b/src/gallium/drivers/radeonsi/si_fence.c @@ -75,6 +75,8 @@ void si_cp_release_mem(struct si_context *ctx, struct radeon_cmdbuf *cs, unsigne unsigned sel = EOP_DST_SEL(dst_sel) | 
EOP_INT_SEL(int_sel) | EOP_DATA_SEL(data_sel); bool compute_ib = !ctx->has_graphics || cs == &ctx->prim_discard_compute_cs; + radeon_begin(cs); + if (ctx->chip_class >= GFX9 || (compute_ib && ctx->chip_class >= GFX7)) { /* A ZPASS_DONE or PIXEL_STAT_DUMP_EVENT (of the DB occlusion * counters) must immediately precede every timestamp event to @@ -136,6 +138,8 @@ void si_cp_release_mem(struct si_context *ctx, struct radeon_cmdbuf *cs, unsigne radeon_emit(cs, 0); /* unused */ } + radeon_end(); + if (buf) { radeon_add_to_buffer_list(ctx, &ctx->gfx_cs, buf, RADEON_USAGE_WRITE, RADEON_PRIO_QUERY); } @@ -154,6 +158,7 @@ unsigned si_cp_write_fence_dwords(struct si_screen *screen) void si_cp_wait_mem(struct si_context *ctx, struct radeon_cmdbuf *cs, uint64_t va, uint32_t ref, uint32_t mask, unsigned flags) { + radeon_begin(cs); radeon_emit(cs, PKT3(PKT3_WAIT_REG_MEM, 5, 0)); radeon_emit(cs, WAIT_REG_MEM_MEM_SPACE(1) | flags); radeon_emit(cs, va); @@ -161,6 +166,7 @@ void si_cp_wait_mem(struct si_context *ctx, struct radeon_cmdbuf *cs, uint64_t v radeon_emit(cs, ref); /* reference value */ radeon_emit(cs, mask); /* mask */ radeon_emit(cs, 4); /* poll interval */ + radeon_end(); } static void si_add_fence_dependency(struct si_context *sctx, struct pipe_fence_handle *fence) diff --git a/src/gallium/drivers/radeonsi/si_gfx_cs.c b/src/gallium/drivers/radeonsi/si_gfx_cs.c index 16b6a10986c..6d3abb7557c 100644 --- a/src/gallium/drivers/radeonsi/si_gfx_cs.c +++ b/src/gallium/drivers/radeonsi/si_gfx_cs.c @@ -110,8 +110,10 @@ void si_flush_gfx_cs(struct si_context *ctx, unsigned flags, struct pipe_fence_h /* Make sure compute shaders are idle before leaving the IB, so that * the next IB doesn't overwrite GDS that might be in use. */ + radeon_begin(compute_cs); radeon_emit(compute_cs, PKT3(PKT3_EVENT_WRITE, 0, 0)); radeon_emit(compute_cs, EVENT_TYPE(V_028A90_CS_PARTIAL_FLUSH) | EVENT_INDEX(4)); + radeon_end(); /* Save the GDS prim restart counter if needed. */ if (ctx->preserve_prim_restart_gds_at_flush) { @@ -559,6 +561,8 @@ void si_emit_surface_sync(struct si_context *sctx, struct radeon_cmdbuf *cs, uns assert(sctx->chip_class <= GFX9); + radeon_begin(cs); + if (sctx->chip_class == GFX9 || compute_ib) { /* Flush caches and wait for the caches to assert idle. */ radeon_emit(cs, PKT3(PKT3_ACQUIRE_MEM, 5, 0)); @@ -576,6 +580,7 @@ void si_emit_surface_sync(struct si_context *sctx, struct radeon_cmdbuf *cs, uns radeon_emit(cs, 0); /* CP_COHER_BASE */ radeon_emit(cs, 0x0000000A); /* POLL_INTERVAL */ } + radeon_end(); /* ACQUIRE_MEM has an implicit context roll if the current context * is busy. */ @@ -599,6 +604,8 @@ void gfx10_emit_cache_flush(struct si_context *ctx, struct radeon_cmdbuf *cs) /* We don't need these. */ assert(!(flags & (SI_CONTEXT_VGT_STREAMOUT_SYNC | SI_CONTEXT_FLUSH_AND_INV_DB_META))); + radeon_begin(cs); + if (flags & SI_CONTEXT_VGT_FLUSH) { radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 0, 0)); radeon_emit(cs, EVENT_TYPE(V_028A90_VGT_FLUSH) | EVENT_INDEX(0)); @@ -686,6 +693,7 @@ void gfx10_emit_cache_flush(struct si_context *ctx, struct radeon_cmdbuf *cs) ctx->num_cs_flushes++; ctx->compute_is_busy = false; } + radeon_end(); if (cb_db_event) { struct si_resource* wait_mem_scratch = unlikely(ctx->ws->cs_is_secure(cs)) ? @@ -729,6 +737,8 @@ void gfx10_emit_cache_flush(struct si_context *ctx, struct radeon_cmdbuf *cs) si_cp_wait_mem(ctx, cs, va, ctx->wait_mem_number, 0xffffffff, WAIT_REG_MEM_EQUAL); } + radeon_begin_again(cs); + /* Ignore fields that only modify the behavior of other fields. 
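 *
 * (This block runs after radeon_begin_again(cs): the window was closed
 * with radeon_end() before the si_cp_release_mem()/si_cp_wait_mem()
 * calls, which bracket their own emission, and is reopened here.)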
*/ if (gcr_cntl & C_586_GL1_RANGE & C_586_GL2_RANGE & C_586_SEQ) { /* Flush caches and wait for the caches to assert idle. @@ -757,6 +767,7 @@ void gfx10_emit_cache_flush(struct si_context *ctx, struct radeon_cmdbuf *cs) radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 0, 0)); radeon_emit(cs, EVENT_TYPE(V_028A90_PIPELINESTAT_STOP) | EVENT_INDEX(0)); } + radeon_end(); ctx->flags = 0; } @@ -820,6 +831,8 @@ void si_emit_cache_flush(struct si_context *sctx, struct radeon_cmdbuf *cs) cp_coher_cntl |= S_0085F0_DB_ACTION_ENA(1) | S_0085F0_DB_DEST_BASE_ENA(1); } + radeon_begin(cs); + if (flags & SI_CONTEXT_FLUSH_AND_INV_CB) { /* Flush CMASK/FMASK/DCC. SURFACE_SYNC will wait for idle. */ radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 0, 0)); @@ -868,6 +881,8 @@ void si_emit_cache_flush(struct si_context *sctx, struct radeon_cmdbuf *cs) radeon_emit(cs, EVENT_TYPE(V_028A90_VGT_STREAMOUT_SYNC) | EVENT_INDEX(0)); } + radeon_end(); + /* GFX9: Wait for idle if we're flushing CB or DB. ACQUIRE_MEM doesn't * wait for idle on GFX9. We have to use a TS event. */ @@ -934,8 +949,10 @@ void si_emit_cache_flush(struct si_context *sctx, struct radeon_cmdbuf *cs) if (sctx->has_graphics && (cp_coher_cntl || (flags & (SI_CONTEXT_CS_PARTIAL_FLUSH | SI_CONTEXT_INV_VCACHE | SI_CONTEXT_INV_L2 | SI_CONTEXT_WB_L2)))) { + radeon_begin(cs); radeon_emit(cs, PKT3(PKT3_PFP_SYNC_ME, 0, 0)); radeon_emit(cs, 0); + radeon_end(); } /* GFX6-GFX8 only: @@ -988,11 +1005,15 @@ void si_emit_cache_flush(struct si_context *sctx, struct radeon_cmdbuf *cs) si_prim_discard_signal_next_compute_ib_start(sctx); if (flags & SI_CONTEXT_START_PIPELINE_STATS) { + radeon_begin(cs); radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 0, 0)); radeon_emit(cs, EVENT_TYPE(V_028A90_PIPELINESTAT_START) | EVENT_INDEX(0)); + radeon_end(); } else if (flags & SI_CONTEXT_STOP_PIPELINE_STATS) { + radeon_begin(cs); radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 0, 0)); radeon_emit(cs, EVENT_TYPE(V_028A90_PIPELINESTAT_STOP) | EVENT_INDEX(0)); + radeon_end(); } sctx->flags = 0; diff --git a/src/gallium/drivers/radeonsi/si_perfcounter.c b/src/gallium/drivers/radeonsi/si_perfcounter.c index 6363368c5a3..6d2868509f6 100644 --- a/src/gallium/drivers/radeonsi/si_perfcounter.c +++ b/src/gallium/drivers/radeonsi/si_perfcounter.c @@ -723,16 +723,20 @@ static void si_pc_emit_instance(struct si_context *sctx, int se, int instance) value |= S_030800_INSTANCE_BROADCAST_WRITES(1); } + radeon_begin(cs); radeon_set_uconfig_reg(cs, R_030800_GRBM_GFX_INDEX, value); + radeon_end(); } static void si_pc_emit_shaders(struct si_context *sctx, unsigned shaders) { struct radeon_cmdbuf *cs = &sctx->gfx_cs; + radeon_begin(cs); radeon_set_uconfig_reg_seq(cs, R_036780_SQ_PERFCOUNTER_CTRL, 2, false); radeon_emit(cs, shaders & 0x7f); radeon_emit(cs, 0xffffffff); + radeon_end(); } static void si_pc_emit_select(struct si_context *sctx, struct si_pc_block *block, unsigned count, @@ -749,6 +753,8 @@ static void si_pc_emit_select(struct si_context *sctx, struct si_pc_block *block if (regs->layout & SI_PC_FAKE) return; + radeon_begin(cs); + if (layout_multi == SI_PC_MULTI_BLOCK) { assert(!(regs->layout & SI_PC_REG_REVERSE)); @@ -826,6 +832,7 @@ static void si_pc_emit_select(struct si_context *sctx, struct si_pc_block *block radeon_emit(cs, 0); } } + radeon_end(); } static void si_pc_emit_start(struct si_context *sctx, struct si_resource *buffer, uint64_t va) @@ -835,12 +842,14 @@ static void si_pc_emit_start(struct si_context *sctx, struct si_resource *buffer si_cp_copy_data(sctx, &sctx->gfx_cs, COPY_DATA_DST_MEM, buffer, va - 
buffer->gpu_address, COPY_DATA_IMM, NULL, 1); + radeon_begin(cs); radeon_set_uconfig_reg(cs, R_036020_CP_PERFMON_CNTL, S_036020_PERFMON_STATE(V_036020_CP_PERFMON_STATE_DISABLE_AND_RESET)); radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 0, 0)); radeon_emit(cs, EVENT_TYPE(V_028A90_PERFCOUNTER_START) | EVENT_INDEX(0)); radeon_set_uconfig_reg(cs, R_036020_CP_PERFMON_CNTL, S_036020_PERFMON_STATE(V_036020_CP_PERFMON_STATE_START_COUNTING)); + radeon_end(); } /* Note: The buffer was already added in si_pc_emit_start, so we don't have to @@ -853,6 +862,7 @@ static void si_pc_emit_stop(struct si_context *sctx, struct si_resource *buffer, EOP_DATA_SEL_VALUE_32BIT, buffer, va, 0, SI_NOT_QUERY); si_cp_wait_mem(sctx, cs, va, 0, 0xffffffff, WAIT_REG_MEM_EQUAL); + radeon_begin(cs); radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 0, 0)); radeon_emit(cs, EVENT_TYPE(V_028A90_PERFCOUNTER_SAMPLE) | EVENT_INDEX(0)); radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 0, 0)); @@ -860,6 +870,7 @@ static void si_pc_emit_stop(struct si_context *sctx, struct si_resource *buffer, radeon_set_uconfig_reg( cs, R_036020_CP_PERFMON_CNTL, S_036020_PERFMON_STATE(V_036020_CP_PERFMON_STATE_STOP_COUNTING) | S_036020_PERFMON_SAMPLE_ENABLE(1)); + radeon_end(); } static void si_pc_emit_read(struct si_context *sctx, struct si_pc_block *block, unsigned count, @@ -871,6 +882,8 @@ static void si_pc_emit_read(struct si_context *sctx, struct si_pc_block *block, unsigned reg = regs->counter0_lo; unsigned reg_delta = 8; + radeon_begin(cs); + if (!(regs->layout & SI_PC_FAKE)) { if (regs->layout & SI_PC_REG_REVERSE) reg_delta = -reg_delta; @@ -901,6 +914,7 @@ static void si_pc_emit_read(struct si_context *sctx, struct si_pc_block *block, va += sizeof(uint64_t); } } + radeon_end(); } static void si_pc_query_destroy(struct si_context *sctx, struct si_query *squery) @@ -921,6 +935,8 @@ static void si_pc_query_destroy(struct si_context *sctx, struct si_query *squery void si_inhibit_clockgating(struct si_context *sctx, struct radeon_cmdbuf *cs, bool inhibit) { + radeon_begin(&sctx->gfx_cs); + if (sctx->chip_class >= GFX10) { radeon_set_uconfig_reg(cs, R_037390_RLC_PERFMON_CLK_CNTL, S_037390_PERFMON_CLOCK_STATE(inhibit)); @@ -928,6 +944,7 @@ void si_inhibit_clockgating(struct si_context *sctx, struct radeon_cmdbuf *cs, b radeon_set_uconfig_reg(cs, R_0372FC_RLC_PERFMON_CLK_CNTL, S_0372FC_PERFMON_CLOCK_STATE(inhibit)); } + radeon_end(); } static void si_pc_query_resume(struct si_context *sctx, struct si_query *squery) diff --git a/src/gallium/drivers/radeonsi/si_pm4.c b/src/gallium/drivers/radeonsi/si_pm4.c index 2f63fc02105..6918ae5a11f 100644 --- a/src/gallium/drivers/radeonsi/si_pm4.c +++ b/src/gallium/drivers/radeonsi/si_pm4.c @@ -116,7 +116,9 @@ void si_pm4_emit(struct si_context *sctx, struct si_pm4_state *state) RADEON_USAGE_READ, RADEON_PRIO_SHADER_BINARY); } + radeon_begin(cs); radeon_emit_array(cs, state->pm4, state->ndw); + radeon_end(); if (state->atom.emit) state->atom.emit(sctx); diff --git a/src/gallium/drivers/radeonsi/si_query.c b/src/gallium/drivers/radeonsi/si_query.c index a109501e179..3a3beaba473 100644 --- a/src/gallium/drivers/radeonsi/si_query.c +++ b/src/gallium/drivers/radeonsi/si_query.c @@ -25,6 +25,7 @@ */ #include "si_query.h" +#include "si_build_pm4.h" #include "amd/common/sid.h" #include "si_pipe.h" @@ -771,10 +772,12 @@ static unsigned event_type_for_stream(unsigned stream) static void emit_sample_streamout(struct radeon_cmdbuf *cs, uint64_t va, unsigned stream) { + radeon_begin(cs); radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 2, 0)); 
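   /* 3-dword payload: event type, then the 64-bit sample address. */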
radeon_emit(cs, EVENT_TYPE(event_type_for_stream(stream)) | EVENT_INDEX(3)); radeon_emit(cs, va); radeon_emit(cs, va >> 32); + radeon_end(); } static void si_query_hw_do_emit_start(struct si_context *sctx, struct si_query_hw *query, @@ -785,12 +788,15 @@ static void si_query_hw_do_emit_start(struct si_context *sctx, struct si_query_h switch (query->b.type) { case PIPE_QUERY_OCCLUSION_COUNTER: case PIPE_QUERY_OCCLUSION_PREDICATE: - case PIPE_QUERY_OCCLUSION_PREDICATE_CONSERVATIVE: + case PIPE_QUERY_OCCLUSION_PREDICATE_CONSERVATIVE: { + radeon_begin(cs); radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 2, 0)); radeon_emit(cs, EVENT_TYPE(V_028A90_ZPASS_DONE) | EVENT_INDEX(1)); radeon_emit(cs, va); radeon_emit(cs, va >> 32); + radeon_end(); break; + } case PIPE_QUERY_PRIMITIVES_EMITTED: case PIPE_QUERY_PRIMITIVES_GENERATED: case PIPE_QUERY_SO_STATISTICS: @@ -805,12 +811,15 @@ static void si_query_hw_do_emit_start(struct si_context *sctx, struct si_query_h si_cp_release_mem(sctx, cs, V_028A90_BOTTOM_OF_PIPE_TS, 0, EOP_DST_SEL_MEM, EOP_INT_SEL_NONE, EOP_DATA_SEL_TIMESTAMP, NULL, va, 0, query->b.type); break; - case PIPE_QUERY_PIPELINE_STATISTICS: + case PIPE_QUERY_PIPELINE_STATISTICS: { + radeon_begin(cs); radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 2, 0)); radeon_emit(cs, EVENT_TYPE(V_028A90_SAMPLE_PIPELINESTAT) | EVENT_INDEX(2)); radeon_emit(cs, va); radeon_emit(cs, va >> 32); + radeon_end(); break; + } default: assert(0); } @@ -846,15 +855,18 @@ static void si_query_hw_do_emit_stop(struct si_context *sctx, struct si_query_hw switch (query->b.type) { case PIPE_QUERY_OCCLUSION_COUNTER: case PIPE_QUERY_OCCLUSION_PREDICATE: - case PIPE_QUERY_OCCLUSION_PREDICATE_CONSERVATIVE: + case PIPE_QUERY_OCCLUSION_PREDICATE_CONSERVATIVE: { va += 8; + radeon_begin(cs); radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 2, 0)); radeon_emit(cs, EVENT_TYPE(V_028A90_ZPASS_DONE) | EVENT_INDEX(1)); radeon_emit(cs, va); radeon_emit(cs, va >> 32); + radeon_end(); fence_va = va + sctx->screen->info.max_render_backends * 16 - 8; break; + } case PIPE_QUERY_PRIMITIVES_EMITTED: case PIPE_QUERY_PRIMITIVES_GENERATED: case PIPE_QUERY_SO_STATISTICS: @@ -879,10 +891,12 @@ static void si_query_hw_do_emit_stop(struct si_context *sctx, struct si_query_hw unsigned sample_size = (query->result_size - 8) / 2; va += sample_size; + radeon_begin(cs); radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 2, 0)); radeon_emit(cs, EVENT_TYPE(V_028A90_SAMPLE_PIPELINESTAT) | EVENT_INDEX(2)); radeon_emit(cs, va); radeon_emit(cs, va >> 32); + radeon_end(); fence_va = va + sample_size; break; @@ -934,6 +948,8 @@ static void emit_set_predicate(struct si_context *ctx, struct si_resource *buf, { struct radeon_cmdbuf *cs = &ctx->gfx_cs; + radeon_begin(cs); + if (ctx->chip_class >= GFX9) { radeon_emit(cs, PKT3(PKT3_SET_PREDICATION, 2, 0)); radeon_emit(cs, op); @@ -944,6 +960,8 @@ static void emit_set_predicate(struct si_context *ctx, struct si_resource *buf, radeon_emit(cs, va); radeon_emit(cs, op | ((va >> 32) & 0xFF)); } + radeon_end(); + radeon_add_to_buffer_list(ctx, &ctx->gfx_cs, buf, RADEON_USAGE_READ, RADEON_PRIO_QUERY); } diff --git a/src/gallium/drivers/radeonsi/si_sqtt.c b/src/gallium/drivers/radeonsi/si_sqtt.c index 1366430cff8..f5263d5e9fa 100644 --- a/src/gallium/drivers/radeonsi/si_sqtt.c +++ b/src/gallium/drivers/radeonsi/si_sqtt.c @@ -35,22 +35,6 @@ static void si_emit_spi_config_cntl(struct si_context* sctx, struct radeon_cmdbuf *cs, bool enable); -static inline void -radeon_set_privileged_config_reg(struct radeon_cmdbuf *cs, - unsigned reg, - unsigned value) -{ - 
assert(reg < CIK_UCONFIG_REG_OFFSET); - - radeon_emit(cs, PKT3(PKT3_COPY_DATA, 4, 0)); - radeon_emit(cs, COPY_DATA_SRC_SEL(COPY_DATA_IMM) | - COPY_DATA_DST_SEL(COPY_DATA_PERF)); - radeon_emit(cs, value); - radeon_emit(cs, 0); /* unused */ - radeon_emit(cs, reg >> 2); - radeon_emit(cs, 0); /* unused */ -} - static bool si_thread_trace_init_bo(struct si_context *sctx) { @@ -89,6 +73,8 @@ si_emit_thread_trace_start(struct si_context* sctx, uint32_t shifted_size = sctx->thread_trace->buffer_size >> SQTT_BUFFER_ALIGN_SHIFT; unsigned max_se = sscreen->info.max_se; + radeon_begin(cs); + for (unsigned se = 0; se < max_se; se++) { uint64_t va = sctx->ws->buffer_get_virtual_address(sctx->thread_trace->bo); uint64_t data_va = ac_thread_trace_get_data_va(sctx->thread_trace, va, se); @@ -220,6 +206,7 @@ si_emit_thread_trace_start(struct si_context* sctx, radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 0, 0)); radeon_emit(cs, EVENT_TYPE(V_028A90_THREAD_TRACE_START) | EVENT_INDEX(0)); } + radeon_end(); } static const uint32_t gfx9_thread_trace_info_regs[] = @@ -258,6 +245,8 @@ si_copy_thread_trace_info_regs(struct si_context* sctx, uint64_t va = sctx->ws->buffer_get_virtual_address(sctx->thread_trace->bo); uint64_t info_va = ac_thread_trace_get_info_va(va, se_index); + radeon_begin(cs); + /* Copy back the info struct one DWORD at a time. */ for (unsigned i = 0; i < 3; i++) { radeon_emit(cs, PKT3(PKT3_COPY_DATA, 4, 0)); @@ -269,6 +258,7 @@ si_copy_thread_trace_info_regs(struct si_context* sctx, radeon_emit(cs, (info_va + i * 4)); radeon_emit(cs, (info_va + i * 4) >> 32); } + radeon_end(); } @@ -280,6 +270,8 @@ si_emit_thread_trace_stop(struct si_context *sctx, { unsigned max_se = sctx->screen->info.max_se; + radeon_begin(cs); + /* Stop the thread trace with a different event based on the queue. */ if (queue_family_index == RING_COMPUTE) { radeon_set_sh_reg(cs, R_00B878_COMPUTE_THREAD_TRACE_ENABLE, @@ -291,8 +283,11 @@ si_emit_thread_trace_stop(struct si_context *sctx, radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 0, 0)); radeon_emit(cs, EVENT_TYPE(V_028A90_THREAD_TRACE_FINISH) | EVENT_INDEX(0)); + radeon_end(); for (unsigned se = 0; se < max_se; se++) { + radeon_begin(cs); + /* Target SEi and SH0. */ radeon_set_uconfig_reg(cs, R_030800_GRBM_GFX_INDEX, S_030800_SE_INDEX(se) | @@ -335,15 +330,18 @@ si_emit_thread_trace_stop(struct si_context *sctx, radeon_emit(cs, S_030CE8_BUSY(1)); /* mask */ radeon_emit(cs, 4); /* poll interval */ } + radeon_end(); si_copy_thread_trace_info_regs(sctx, cs, se); } /* Restore global broadcasting. 
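 *
 * (radeon_begin_again() below reopens the emit window; the per-SE loop
 * above closed it before each si_copy_thread_trace_info_regs() call,
 * which brackets its own emission.)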
*/ + radeon_begin_again(cs); radeon_set_uconfig_reg(cs, R_030800_GRBM_GFX_INDEX, S_030800_SE_BROADCAST_WRITES(1) | S_030800_SH_BROADCAST_WRITES(1) | S_030800_INSTANCE_BROADCAST_WRITES(1)); + radeon_end(); } static void @@ -351,6 +349,8 @@ si_thread_trace_start(struct si_context *sctx, int family, struct radeon_cmdbuf { struct radeon_winsys *ws = sctx->ws; + radeon_begin(cs); + switch (family) { case RING_GFX: radeon_emit(cs, PKT3(PKT3_CONTEXT_CONTROL, 1, 0)); @@ -361,7 +361,8 @@ si_thread_trace_start(struct si_context *sctx, int family, struct radeon_cmdbuf radeon_emit(cs, PKT3(PKT3_NOP, 0, 0)); radeon_emit(cs, 0); break; - } + } + radeon_end(); ws->cs_add_buffer(cs, sctx->thread_trace->bo, @@ -390,6 +391,9 @@ static void si_thread_trace_stop(struct si_context *sctx, int family, struct radeon_cmdbuf *cs) { struct radeon_winsys *ws = sctx->ws; + + radeon_begin(cs); + switch (family) { case RING_GFX: radeon_emit(sctx->thread_trace->stop_cs[family], PKT3(PKT3_CONTEXT_CONTROL, 1, 0)); @@ -401,6 +405,8 @@ si_thread_trace_stop(struct si_context *sctx, int family, struct radeon_cmdbuf * radeon_emit(sctx->thread_trace->stop_cs[family], 0); break; } + radeon_end(); + ws->cs_add_buffer(cs, sctx->thread_trace->bo, RADEON_USAGE_READWRITE, @@ -643,6 +649,8 @@ si_emit_thread_trace_userdata(struct si_context* sctx, { const uint32_t *dwords = (uint32_t *)data; + radeon_begin(cs); + while (num_dwords > 0) { uint32_t count = MIN2(num_dwords, 2); @@ -655,12 +663,15 @@ si_emit_thread_trace_userdata(struct si_context* sctx, dwords += count; num_dwords -= count; } + radeon_end(); } static void si_emit_spi_config_cntl(struct si_context* sctx, struct radeon_cmdbuf *cs, bool enable) { + radeon_begin(cs); + if (sctx->chip_class >= GFX9) { uint32_t spi_config_cntl = S_031100_GPR_WRITE_PRIORITY(0x2c688) | S_031100_EXP_PRIORITY_ORDER(3) | @@ -677,6 +688,7 @@ si_emit_spi_config_cntl(struct si_context* sctx, S_009100_ENABLE_SQG_TOP_EVENTS(enable) | S_009100_ENABLE_SQG_BOP_EVENTS(enable)); } + radeon_end(); } void diff --git a/src/gallium/drivers/radeonsi/si_state.c b/src/gallium/drivers/radeonsi/si_state.c index 278c9a733f0..2a8e852a5e6 100644 --- a/src/gallium/drivers/radeonsi/si_state.c +++ b/src/gallium/drivers/radeonsi/si_state.c @@ -91,11 +91,13 @@ static void si_emit_cb_render_state(struct si_context *sctx) if (sctx->screen->dpbb_allowed && sctx->last_cb_target_mask != cb_target_mask) { sctx->last_cb_target_mask = cb_target_mask; + radeon_begin(cs); radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 0, 0)); radeon_emit(cs, EVENT_TYPE(V_028A90_BREAK_BATCH) | EVENT_INDEX(0)); + radeon_end(); } - unsigned initial_cdw = cs->current.cdw; + radeon_begin(cs); radeon_opt_set_context_reg(sctx, R_028238_CB_TARGET_MASK, SI_TRACKED_CB_TARGET_MASK, cb_target_mask); @@ -256,8 +258,7 @@ static void si_emit_cb_render_state(struct si_context *sctx) radeon_opt_set_context_reg3(sctx, R_028754_SX_PS_DOWNCONVERT, SI_TRACKED_SX_PS_DOWNCONVERT, sx_ps_downconvert, sx_blend_opt_epsilon, sx_blend_opt_control); } - if (initial_cdw != cs->current.cdw) - sctx->context_roll = true; + radeon_end_update_context_roll(sctx); } /* @@ -689,8 +690,10 @@ static void si_emit_blend_color(struct si_context *sctx) { struct radeon_cmdbuf *cs = &sctx->gfx_cs; + radeon_begin(cs); radeon_set_context_reg_seq(cs, R_028414_CB_BLEND_RED, 4); radeon_emit_array(cs, (uint32_t *)sctx->blend_color.state.color, 4); + radeon_end(); } /* @@ -721,8 +724,10 @@ static void si_emit_clip_state(struct si_context *sctx) { struct radeon_cmdbuf *cs = &sctx->gfx_cs; + radeon_begin(cs); 
radeon_set_context_reg_seq(cs, R_0285BC_PA_CL_UCP_0_X, 6 * 4); radeon_emit_array(cs, (uint32_t *)sctx->clip_state.state.ucp, 6 * 4); + radeon_end(); } static void si_emit_clip_regs(struct si_context *sctx) @@ -747,7 +752,6 @@ static void si_emit_clip_regs(struct si_context *sctx) clipdist_mask &= rs->clip_plane_enable; culldist_mask |= clipdist_mask; - unsigned initial_cdw = sctx->gfx_cs.current.cdw; unsigned pa_cl_cntl = S_02881C_VS_OUT_CCDIST0_VEC_ENA((vs_out_mask & 0x0F) != 0) | S_02881C_VS_OUT_CCDIST1_VEC_ENA((vs_out_mask & 0xF0) != 0) | S_02881C_BYPASS_VTX_RATE_COMBINER(sctx->chip_class >= GFX10_3 && @@ -755,6 +759,8 @@ static void si_emit_clip_regs(struct si_context *sctx) S_02881C_BYPASS_PRIM_RATE_COMBINER(sctx->chip_class >= GFX10_3) | clipdist_mask | (culldist_mask << 8); + radeon_begin(&sctx->gfx_cs); + if (sctx->chip_class >= GFX10) { radeon_opt_set_context_reg_rmw(sctx, R_02881C_PA_CL_VS_OUT_CNTL, SI_TRACKED_PA_CL_VS_OUT_CNTL__CL, pa_cl_cntl, @@ -765,9 +771,7 @@ static void si_emit_clip_regs(struct si_context *sctx) } radeon_opt_set_context_reg(sctx, R_028810_PA_CL_CLIP_CNTL, SI_TRACKED_PA_CL_CLIP_CNTL, rs->pa_cl_clip_cntl | ucp_mask | S_028810_CLIP_DISABLE(window_space)); - - if (initial_cdw != sctx->gfx_cs.current.cdw) - sctx->context_roll = true; + radeon_end_update_context_roll(sctx); } /* @@ -1048,6 +1052,7 @@ static void si_emit_stencil_ref(struct si_context *sctx) struct pipe_stencil_ref *ref = &sctx->stencil_ref.state; struct si_dsa_stencil_ref_part *dsa = &sctx->stencil_ref.dsa_part; + radeon_begin(cs); radeon_set_context_reg_seq(cs, R_028430_DB_STENCILREFMASK, 2); radeon_emit(cs, S_028430_STENCILTESTVAL(ref->ref_value[0]) | S_028430_STENCILMASK(dsa->valuemask[0]) | @@ -1056,6 +1061,7 @@ static void si_emit_stencil_ref(struct si_context *sctx) S_028434_STENCILMASK_BF(dsa->valuemask[1]) | S_028434_STENCILWRITEMASK_BF(dsa->writemask[1]) | S_028434_STENCILOPVAL_BF(1)); + radeon_end(); } static void si_set_stencil_ref(struct pipe_context *ctx, const struct pipe_stencil_ref state) @@ -1334,7 +1340,6 @@ static void si_emit_db_render_state(struct si_context *sctx) { struct si_state_rasterizer *rs = sctx->queued.named.rasterizer; unsigned db_shader_control, db_render_control, db_count_control; - unsigned initial_cdw = sctx->gfx_cs.current.cdw; /* DB_RENDER_CONTROL */ if (sctx->dbcb_depth_copy_enabled || sctx->dbcb_stencil_copy_enabled) { @@ -1374,6 +1379,7 @@ static void si_emit_db_render_state(struct si_context *sctx) } } + radeon_begin(&sctx->gfx_cs); radeon_opt_set_context_reg2(sctx, R_028000_DB_RENDER_CONTROL, SI_TRACKED_DB_RENDER_CONTROL, db_render_control, db_count_control); @@ -1427,9 +1433,7 @@ static void si_emit_db_render_state(struct si_context *sctx) S_028064_VRS_OVERRIDE_RATE_Y(0)); } } - - if (initial_cdw != sctx->gfx_cs.current.cdw) - sctx->context_roll = true; + radeon_end_update_context_roll(sctx); } /* @@ -2909,6 +2913,8 @@ static void si_emit_framebuffer_state(struct si_context *sctx) struct si_surface *cb = NULL; unsigned cb_color_info = 0; + radeon_begin(cs); + /* Colorbuffers. 
*/ for (i = 0; i < nr_cbufs; i++) { uint64_t cb_color_base, cb_color_fmask, cb_color_cmask, cb_dcc_base; @@ -3260,6 +3266,7 @@ static void si_emit_framebuffer_state(struct si_context *sctx) radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 0, 0)); radeon_emit(cs, EVENT_TYPE(V_028A90_BREAK_BATCH) | EVENT_INDEX(0)); } + radeon_end(); si_update_display_dcc_dirty(sctx); @@ -3292,6 +3299,8 @@ static void si_emit_msaa_sample_locs(struct si_context *sctx) si_emit_sample_locations(cs, nr_samples); } + radeon_begin(cs); + if (sctx->family >= CHIP_POLARIS10) { unsigned small_prim_filter_cntl = S_028830_SMALL_PRIM_FILTER_ENABLE(1) | @@ -3323,6 +3332,7 @@ static void si_emit_msaa_sample_locs(struct si_context *sctx) radeon_opt_set_context_reg( sctx, R_02882C_PA_SU_PRIM_FILTER_CNTL, SI_TRACKED_PA_SU_PRIM_FILTER_CNTL, S_02882C_XMAX_RIGHT_EXCLUSION(exclusion) | S_02882C_YMAX_BOTTOM_EXCLUSION(exclusion)); + radeon_end(); } static bool si_out_of_order_rasterization(struct si_context *sctx) @@ -3501,7 +3511,7 @@ static void si_emit_msaa_config(struct si_context *sctx) } } - unsigned initial_cdw = cs->current.cdw; + radeon_begin(cs); /* R_028BDC_PA_SC_LINE_CNTL, R_028BE0_PA_SC_AA_CONFIG */ radeon_opt_set_context_reg2(sctx, R_028BDC_PA_SC_LINE_CNTL, SI_TRACKED_PA_SC_LINE_CNTL, @@ -3512,7 +3522,7 @@ static void si_emit_msaa_config(struct si_context *sctx) radeon_opt_set_context_reg(sctx, R_028A4C_PA_SC_MODE_CNTL_1, SI_TRACKED_PA_SC_MODE_CNTL_1, sc_mode_cntl_1); - if (initial_cdw != cs->current.cdw) { + if (radeon_packets_added()) { sctx->context_roll = true; /* GFX9: Flush DFSM when the AA mode changes. */ @@ -3521,6 +3531,7 @@ static void si_emit_msaa_config(struct si_context *sctx) radeon_emit(cs, EVENT_TYPE(V_028A90_FLUSH_DFSM) | EVENT_INDEX(0)); } } + radeon_end(); } void si_update_ps_iter_samples(struct si_context *sctx) @@ -4509,9 +4520,11 @@ static void si_emit_sample_mask(struct si_context *sctx) assert(mask == 0xffff || sctx->framebuffer.nr_samples > 1 || (mask & 1 && sctx->blitter->running)); + radeon_begin(cs); radeon_set_context_reg_seq(cs, R_028C38_PA_SC_AA_MASK_X0Y0_X1Y0, 2); radeon_emit(cs, mask | (mask << 16)); radeon_emit(cs, mask | (mask << 16)); + radeon_end(); } static void si_delete_sampler_state(struct pipe_context *ctx, void *state) diff --git a/src/gallium/drivers/radeonsi/si_state_binning.c b/src/gallium/drivers/radeonsi/si_state_binning.c index d3425e68449..f9e4b273317 100644 --- a/src/gallium/drivers/radeonsi/si_state_binning.c +++ b/src/gallium/drivers/radeonsi/si_state_binning.c @@ -404,7 +404,7 @@ static void gfx10_get_bin_sizes(struct si_context *sctx, unsigned cb_target_enab static void si_emit_dpbb_disable(struct si_context *sctx) { - unsigned initial_cdw = sctx->gfx_cs.current.cdw; + radeon_begin(&sctx->gfx_cs); if (sctx->chip_class >= GFX10) { struct uvec2 bin_size = {}; @@ -441,8 +441,7 @@ static void si_emit_dpbb_disable(struct si_context *sctx) radeon_opt_set_context_reg( sctx, db_dfsm_control, SI_TRACKED_DB_DFSM_CONTROL, S_028060_PUNCHOUT_MODE(V_028060_FORCE_OFF) | S_028060_POPS_DRAIN_PS_ON_OVERLAP(1)); - if (initial_cdw != sctx->gfx_cs.current.cdw) - sctx->context_roll = true; + radeon_end_update_context_roll(sctx); sctx->last_binning_enabled = false; } @@ -526,7 +525,7 @@ void si_emit_dpbb_state(struct si_context *sctx) if (bin_size.y >= 32) bin_size_extend.y = util_logbase2(bin_size.y) - 5; - unsigned initial_cdw = sctx->gfx_cs.current.cdw; + radeon_begin(&sctx->gfx_cs); radeon_opt_set_context_reg( sctx, R_028C44_PA_SC_BINNER_CNTL_0, SI_TRACKED_PA_SC_BINNER_CNTL_0, 
S_028C44_BINNING_MODE(V_028C44_BINNING_ALLOWED) | S_028C44_BIN_SIZE_X(bin_size.x == 16) | @@ -546,8 +545,7 @@ void si_emit_dpbb_state(struct si_context *sctx) radeon_opt_set_context_reg( sctx, db_dfsm_control, SI_TRACKED_DB_DFSM_CONTROL, S_028060_PUNCHOUT_MODE(punchout_mode) | S_028060_POPS_DRAIN_PS_ON_OVERLAP(1)); - if (initial_cdw != sctx->gfx_cs.current.cdw) - sctx->context_roll = true; + radeon_end_update_context_roll(sctx); sctx->last_binning_enabled = true; } diff --git a/src/gallium/drivers/radeonsi/si_state_draw.cpp b/src/gallium/drivers/radeonsi/si_state_draw.cpp index e2beac6f7fa..d17ab2e7a37 100644 --- a/src/gallium/drivers/radeonsi/si_state_draw.cpp +++ b/src/gallium/drivers/radeonsi/si_state_draw.cpp @@ -399,6 +399,7 @@ static void si_emit_derived_tess_state(struct si_context *sctx, assert(ls_current->config.lds_size == 0); struct radeon_cmdbuf *cs = &sctx->gfx_cs; + radeon_begin(cs); if (sctx->chip_class >= GFX9) { unsigned hs_rsrc2 = ls_current->config.rsrc2; @@ -443,6 +444,7 @@ static void si_emit_derived_tess_state(struct si_context *sctx, radeon_set_sh_reg_seq(cs, tes_sh_base + SI_SGPR_TES_OFFCHIP_LAYOUT * 4, 2); radeon_emit(cs, offchip_layout); radeon_emit(cs, ring_va); + radeon_end(); unsigned ls_hs_config = S_028B58_NUM_PATCHES(*num_patches) | @@ -450,13 +452,14 @@ static void si_emit_derived_tess_state(struct si_context *sctx, S_028B58_HS_NUM_OUTPUT_CP(num_tcs_output_cp); if (sctx->last_ls_hs_config != ls_hs_config) { + radeon_begin(cs); if (sctx->chip_class >= GFX7) { radeon_set_context_reg_idx(cs, R_028B58_VGT_LS_HS_CONFIG, 2, ls_hs_config); } else { radeon_set_context_reg(cs, R_028B58_VGT_LS_HS_CONFIG, ls_hs_config); } + radeon_end_update_context_roll(sctx); sctx->last_ls_hs_config = ls_hs_config; - sctx->context_roll = true; } } @@ -734,7 +737,8 @@ static void si_emit_rasterizer_prim_state(struct si_context *sctx) struct radeon_cmdbuf *cs = &sctx->gfx_cs; enum pipe_prim_type rast_prim = sctx->current_rast_prim; struct si_state_rasterizer *rs = sctx->queued.named.rasterizer; - unsigned initial_cdw = cs->current.cdw; + + radeon_begin(cs); if (unlikely(si_is_line_stipple_enabled(sctx))) { /* For lines, reset the stipple pattern at each primitive. Otherwise, @@ -756,8 +760,10 @@ static void si_emit_rasterizer_prim_state(struct si_context *sctx) sctx->last_gs_out_prim = gs_out_prim; } - if (GFX_VERSION == GFX9 && initial_cdw != cs->current.cdw) - sctx->context_roll = true; + if (GFX_VERSION == GFX9) + radeon_end_update_context_roll(sctx); + else + radeon_end(); if (NGG) { struct si_shader *hw_vs = si_get_vs_inline(sctx, HAS_TESS, HAS_GS)->current; @@ -797,6 +803,7 @@ static void si_emit_vs_state(struct si_context *sctx, unsigned index_size) /* For the API vertex shader (VS_STATE_INDEXED, LS_OUT_*). */ unsigned vs_base = si_get_user_data_base(GFX_VERSION, HAS_TESS, HAS_GS, NGG, PIPE_SHADER_VERTEX); + radeon_begin(cs); radeon_set_sh_reg(cs, vs_base + SI_SGPR_VS_STATE_BITS * 4, sctx->current_vs_state); @@ -815,6 +822,7 @@ static void si_emit_vs_state(struct si_context *sctx, unsigned index_size) radeon_set_sh_reg(cs, R_00B230_SPI_SHADER_USER_DATA_GS_0 + SI_SGPR_VS_STATE_BITS * 4, sctx->current_vs_state); } + radeon_end(); sctx->last_vs_state = sctx->current_vs_state; } @@ -845,14 +853,18 @@ static void si_emit_ia_multi_vgt_param(struct si_context *sctx, /* Draw state. 
*/ if (ia_multi_vgt_param != sctx->last_multi_vgt_param) { + radeon_begin(cs); + if (GFX_VERSION == GFX9) - radeon_set_uconfig_reg_idx(cs, sctx->screen, R_030960_IA_MULTI_VGT_PARAM, 4, - ia_multi_vgt_param); + radeon_set_uconfig_reg_idx(cs, sctx->screen, GFX_VERSION, + R_030960_IA_MULTI_VGT_PARAM, 4, ia_multi_vgt_param); else if (GFX_VERSION >= GFX7) radeon_set_context_reg_idx(cs, R_028AA8_IA_MULTI_VGT_PARAM, 1, ia_multi_vgt_param); else radeon_set_context_reg(cs, R_028AA8_IA_MULTI_VGT_PARAM, ia_multi_vgt_param); + radeon_end(); + sctx->last_multi_vgt_param = ia_multi_vgt_param; } } @@ -897,7 +909,11 @@ static void gfx10_emit_ge_cntl(struct si_context *sctx, unsigned num_patches) ge_cntl |= S_03096C_PACKET_TO_ONE_PA(si_is_line_stipple_enabled(sctx)); if (ge_cntl != sctx->last_multi_vgt_param) { - radeon_set_uconfig_reg(&sctx->gfx_cs, R_03096C_GE_CNTL, ge_cntl); + struct radeon_cmdbuf *cs = &sctx->gfx_cs; + + radeon_begin(cs); + radeon_set_uconfig_reg(cs, R_03096C_GE_CNTL, ge_cntl); + radeon_end(); sctx->last_multi_vgt_param = ge_cntl; } } @@ -919,13 +935,15 @@ static void si_emit_draw_registers(struct si_context *sctx, (sctx, indirect, prim, num_patches, instance_count, primitive_restart, min_vertex_count, vertices_per_patch); + radeon_begin(cs); + if (prim != sctx->last_prim) { unsigned vgt_prim = si_conv_pipe_prim(prim); if (GFX_VERSION >= GFX10) radeon_set_uconfig_reg(cs, R_030908_VGT_PRIMITIVE_TYPE, vgt_prim); else if (GFX_VERSION >= GFX7) - radeon_set_uconfig_reg_idx(cs, sctx->screen, R_030908_VGT_PRIMITIVE_TYPE, 1, vgt_prim); + radeon_set_uconfig_reg_idx(cs, sctx->screen, GFX_VERSION, R_030908_VGT_PRIMITIVE_TYPE, 1, vgt_prim); else radeon_set_config_reg(cs, R_008958_VGT_PRIMITIVE_TYPE, vgt_prim); @@ -947,14 +965,17 @@ static void si_emit_draw_registers(struct si_context *sctx, if (GFX_VERSION == GFX9) sctx->context_roll = true; } + radeon_end(); } #define EMIT_SQTT_END_DRAW do { \ if (GFX_VERSION >= GFX9 && unlikely(sctx->thread_trace_enabled)) { \ + radeon_begin(&sctx->gfx_cs); \ radeon_emit(&sctx->gfx_cs, PKT3(PKT3_EVENT_WRITE, 0, 0)); \ radeon_emit(&sctx->gfx_cs, \ EVENT_TYPE(V_028A90_THREAD_TRACE_MARKER) | \ EVENT_INDEX(0)); \ + radeon_end(); \ } \ } while (0) @@ -979,7 +1000,10 @@ static void si_emit_draw_packets(struct si_context *sctx, const struct pipe_draw if (indirect && indirect->count_from_stream_output) { struct si_streamout_target *t = (struct si_streamout_target *)indirect->count_from_stream_output; + radeon_begin(cs); radeon_set_context_reg(cs, R_028B30_VGT_STRMOUT_DRAW_OPAQUE_VERTEX_STRIDE, t->stride_in_dw); + radeon_end(); + si_cp_copy_data(sctx, &sctx->gfx_cs, COPY_DATA_REG, NULL, R_028B2C_VGT_STRMOUT_DRAW_OPAQUE_BUFFER_FILLED_SIZE >> 2, COPY_DATA_SRC_MEM, t->buf_filled_size, t->buf_filled_size_offset); @@ -990,6 +1014,8 @@ static void si_emit_draw_packets(struct si_context *sctx, const struct pipe_draw uint32_t index_max_size = 0; uint64_t index_va = 0; + radeon_begin(cs); + /* draw packet */ if (index_size) { /* Register shadowing doesn't shadow INDEX_TYPE. 
*/ @@ -1017,7 +1043,8 @@ static void si_emit_draw_packets(struct si_context *sctx, const struct pipe_draw } if (GFX_VERSION >= GFX9) { - radeon_set_uconfig_reg_idx(cs, sctx->screen, R_03090C_VGT_INDEX_TYPE, 2, index_type); + radeon_set_uconfig_reg_idx(cs, sctx->screen, GFX_VERSION, + R_03090C_VGT_INDEX_TYPE, 2, index_type); } else { radeon_emit(cs, PKT3(PKT3_INDEX_TYPE, 0, 0)); radeon_emit(cs, index_type); @@ -1032,8 +1059,10 @@ static void si_emit_draw_packets(struct si_context *sctx, const struct pipe_draw /* Skip draw calls with 0-sized index buffers. * They cause a hang on some chips, like Navi10-14. */ - if (!index_max_size) + if (!index_max_size) { + radeon_end(); return; + } index_va = si_resource(indexbuf)->gpu_address + index_offset; @@ -1173,6 +1202,8 @@ static void si_emit_draw_packets(struct si_context *sctx, const struct pipe_draw if (index_size) { if (ALLOW_PRIM_DISCARD_CS && dispatch_prim_discard_cs) { + radeon_end(); + for (unsigned i = 0; i < num_draws; i++) { uint64_t va = index_va + draws[0].start * original_index_size; @@ -1238,6 +1269,8 @@ static void si_emit_draw_packets(struct si_context *sctx, const struct pipe_draw radeon_emit(cs, draws[i].count); radeon_emit(cs, V_0287F0_DI_SRC_SEL_AUTO_INDEX); } + radeon_end(); + EMIT_SQTT_END_DRAW; return; } @@ -1265,6 +1298,7 @@ static void si_emit_draw_packets(struct si_context *sctx, const struct pipe_draw sctx->last_base_vertex = draws[num_draws - 1].start; } } + radeon_end(); EMIT_SQTT_END_DRAW; } @@ -2181,8 +2215,10 @@ void si_trace_emit(struct si_context *sctx) si_cp_write_data(sctx, sctx->current_saved_cs->trace_buf, 0, 4, V_370_MEM, V_370_ME, &trace_id); + radeon_begin(cs); radeon_emit(cs, PKT3(PKT3_NOP, 0, 0)); radeon_emit(cs, AC_ENCODE_TRACE_POINT(trace_id)); + radeon_end(); if (sctx->log) u_log_flush(sctx->log); diff --git a/src/gallium/drivers/radeonsi/si_state_msaa.c b/src/gallium/drivers/radeonsi/si_state_msaa.c index 9ebb1e5dcb4..5412a87f0a1 100644 --- a/src/gallium/drivers/radeonsi/si_state_msaa.c +++ b/src/gallium/drivers/radeonsi/si_state_msaa.c @@ -150,6 +150,7 @@ static void si_get_sample_position(struct pipe_context *ctx, unsigned sample_cou static void si_emit_max_4_sample_locs(struct radeon_cmdbuf *cs, uint64_t centroid_priority, uint32_t sample_locs) { + radeon_begin(cs); radeon_set_context_reg_seq(cs, R_028BD4_PA_SC_CENTROID_PRIORITY_0, 2); radeon_emit(cs, centroid_priority); radeon_emit(cs, centroid_priority >> 32); @@ -157,11 +158,13 @@ static void si_emit_max_4_sample_locs(struct radeon_cmdbuf *cs, uint64_t centroi radeon_set_context_reg(cs, R_028C08_PA_SC_AA_SAMPLE_LOCS_PIXEL_X1Y0_0, sample_locs); radeon_set_context_reg(cs, R_028C18_PA_SC_AA_SAMPLE_LOCS_PIXEL_X0Y1_0, sample_locs); radeon_set_context_reg(cs, R_028C28_PA_SC_AA_SAMPLE_LOCS_PIXEL_X1Y1_0, sample_locs); + radeon_end(); } static void si_emit_max_16_sample_locs(struct radeon_cmdbuf *cs, uint64_t centroid_priority, const uint32_t *sample_locs, unsigned num_samples) { + radeon_begin(cs); radeon_set_context_reg_seq(cs, R_028BD4_PA_SC_CENTROID_PRIORITY_0, 2); radeon_emit(cs, centroid_priority); radeon_emit(cs, centroid_priority >> 32); @@ -171,6 +174,7 @@ static void si_emit_max_16_sample_locs(struct radeon_cmdbuf *cs, uint64_t centro radeon_emit_array(cs, sample_locs, 4); radeon_emit_array(cs, sample_locs, 4); radeon_emit_array(cs, sample_locs, num_samples == 8 ? 
2 : 4); + radeon_end(); } void si_emit_sample_locations(struct radeon_cmdbuf *cs, int nr_samples) diff --git a/src/gallium/drivers/radeonsi/si_state_shaders.c b/src/gallium/drivers/radeonsi/si_state_shaders.c index 3326ad934fe..b2c7cf0e49e 100644 --- a/src/gallium/drivers/radeonsi/si_state_shaders.c +++ b/src/gallium/drivers/radeonsi/si_state_shaders.c @@ -566,11 +566,10 @@ static void si_shader_hs(struct si_screen *sscreen, struct si_shader *shader) static void si_emit_shader_es(struct si_context *sctx) { struct si_shader *shader = sctx->queued.named.es->shader; - unsigned initial_cdw = sctx->gfx_cs.current.cdw; - if (!shader) return; + radeon_begin(&sctx->gfx_cs); radeon_opt_set_context_reg(sctx, R_028AAC_VGT_ESGS_RING_ITEMSIZE, SI_TRACKED_VGT_ESGS_RING_ITEMSIZE, shader->selector->esgs_itemsize / 4); @@ -583,9 +582,7 @@ static void si_emit_shader_es(struct si_context *sctx) radeon_opt_set_context_reg(sctx, R_028C58_VGT_VERTEX_REUSE_BLOCK_CNTL, SI_TRACKED_VGT_VERTEX_REUSE_BLOCK_CNTL, shader->vgt_vertex_reuse_block_cntl); - - if (initial_cdw != sctx->gfx_cs.current.cdw) - sctx->context_roll = true; + radeon_end_update_context_roll(sctx); } static void si_shader_es(struct si_screen *sscreen, struct si_shader *shader) @@ -729,11 +726,11 @@ void gfx9_get_gs_info(struct si_shader_selector *es, struct si_shader_selector * static void si_emit_shader_gs(struct si_context *sctx) { struct si_shader *shader = sctx->queued.named.gs->shader; - unsigned initial_cdw = sctx->gfx_cs.current.cdw; - if (!shader) return; + radeon_begin(&sctx->gfx_cs); + /* R_028A60_VGT_GSVS_RING_OFFSET_1, R_028A64_VGT_GSVS_RING_OFFSET_2 * R_028A68_VGT_GSVS_RING_OFFSET_3 */ radeon_opt_set_context_reg3( @@ -782,9 +779,7 @@ static void si_emit_shader_gs(struct si_context *sctx) SI_TRACKED_VGT_VERTEX_REUSE_BLOCK_CNTL, shader->vgt_vertex_reuse_block_cntl); } - - if (initial_cdw != sctx->gfx_cs.current.cdw) - sctx->context_roll = true; + radeon_end_update_context_roll(sctx); } static void si_shader_gs(struct si_screen *sscreen, struct si_shader *shader) @@ -931,6 +926,8 @@ static void gfx10_emit_ge_pc_alloc(struct si_context *sctx, unsigned value) sctx->tracked_regs.reg_value[reg] != value) { struct radeon_cmdbuf *cs = &sctx->gfx_cs; + radeon_begin(cs); + if (sctx->chip_class == GFX10) { /* SQ_NON_EVENT must be emitted before GE_PC_ALLOC is written. */ radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 0, 0)); @@ -938,6 +935,7 @@ static void gfx10_emit_ge_pc_alloc(struct si_context *sctx, unsigned value) } radeon_set_uconfig_reg(cs, R_030980_GE_PC_ALLOC, value); + radeon_end(); sctx->tracked_regs.reg_saved |= 0x1ull << reg; sctx->tracked_regs.reg_value[reg] = value; @@ -945,9 +943,9 @@ static void gfx10_emit_ge_pc_alloc(struct si_context *sctx, unsigned value) } /* Common tail code for NGG primitive shaders. 
*/ -static void gfx10_emit_shader_ngg_tail(struct si_context *sctx, struct si_shader *shader, - unsigned initial_cdw) +static void gfx10_emit_shader_ngg_tail(struct si_context *sctx, struct si_shader *shader) { + radeon_begin(&sctx->gfx_cs); radeon_opt_set_context_reg(sctx, R_0287FC_GE_MAX_OUTPUT_PER_SUBGROUP, SI_TRACKED_GE_MAX_OUTPUT_PER_SUBGROUP, shader->ctx_reg.ngg.ge_max_output_per_subgroup); @@ -975,9 +973,7 @@ static void gfx10_emit_shader_ngg_tail(struct si_context *sctx, struct si_shader radeon_opt_set_context_reg_rmw(sctx, R_02881C_PA_CL_VS_OUT_CNTL, SI_TRACKED_PA_CL_VS_OUT_CNTL__VS, shader->pa_cl_vs_out_cntl, SI_TRACKED_PA_CL_VS_OUT_CNTL__VS_MASK); - - if (initial_cdw != sctx->gfx_cs.current.cdw) - sctx->context_roll = true; + radeon_end_update_context_roll(sctx); /* GE_PC_ALLOC is not a context register, so it doesn't cause a context roll. */ gfx10_emit_ge_pc_alloc(sctx, shader->ctx_reg.ngg.ge_pc_alloc); @@ -986,56 +982,55 @@ static void gfx10_emit_shader_ngg_tail(struct si_context *sctx, struct si_shader static void gfx10_emit_shader_ngg_notess_nogs(struct si_context *sctx) { struct si_shader *shader = sctx->queued.named.gs->shader; - unsigned initial_cdw = sctx->gfx_cs.current.cdw; - if (!shader) return; - gfx10_emit_shader_ngg_tail(sctx, shader, initial_cdw); + gfx10_emit_shader_ngg_tail(sctx, shader); } static void gfx10_emit_shader_ngg_tess_nogs(struct si_context *sctx) { struct si_shader *shader = sctx->queued.named.gs->shader; - unsigned initial_cdw = sctx->gfx_cs.current.cdw; - if (!shader) return; + radeon_begin(&sctx->gfx_cs); radeon_opt_set_context_reg(sctx, R_028B6C_VGT_TF_PARAM, SI_TRACKED_VGT_TF_PARAM, shader->vgt_tf_param); + radeon_end_update_context_roll(sctx); - gfx10_emit_shader_ngg_tail(sctx, shader, initial_cdw); + gfx10_emit_shader_ngg_tail(sctx, shader); } static void gfx10_emit_shader_ngg_notess_gs(struct si_context *sctx) { struct si_shader *shader = sctx->queued.named.gs->shader; - unsigned initial_cdw = sctx->gfx_cs.current.cdw; - if (!shader) return; + radeon_begin(&sctx->gfx_cs); radeon_opt_set_context_reg(sctx, R_028B38_VGT_GS_MAX_VERT_OUT, SI_TRACKED_VGT_GS_MAX_VERT_OUT, shader->ctx_reg.ngg.vgt_gs_max_vert_out); + radeon_end_update_context_roll(sctx); - gfx10_emit_shader_ngg_tail(sctx, shader, initial_cdw); + gfx10_emit_shader_ngg_tail(sctx, shader); } static void gfx10_emit_shader_ngg_tess_gs(struct si_context *sctx) { struct si_shader *shader = sctx->queued.named.gs->shader; - unsigned initial_cdw = sctx->gfx_cs.current.cdw; if (!shader) return; + radeon_begin(&sctx->gfx_cs); radeon_opt_set_context_reg(sctx, R_028B38_VGT_GS_MAX_VERT_OUT, SI_TRACKED_VGT_GS_MAX_VERT_OUT, shader->ctx_reg.ngg.vgt_gs_max_vert_out); radeon_opt_set_context_reg(sctx, R_028B6C_VGT_TF_PARAM, SI_TRACKED_VGT_TF_PARAM, shader->vgt_tf_param); + radeon_end_update_context_roll(sctx); - gfx10_emit_shader_ngg_tail(sctx, shader, initial_cdw); + gfx10_emit_shader_ngg_tail(sctx, shader); } unsigned si_get_input_prim(const struct si_shader_selector *gs) @@ -1308,11 +1303,10 @@ static void gfx10_shader_ngg(struct si_screen *sscreen, struct si_shader *shader static void si_emit_shader_vs(struct si_context *sctx) { struct si_shader *shader = sctx->queued.named.vs->shader; - unsigned initial_cdw = sctx->gfx_cs.current.cdw; - if (!shader) return; + radeon_begin(&sctx->gfx_cs); radeon_opt_set_context_reg(sctx, R_028A40_VGT_GS_MODE, SI_TRACKED_VGT_GS_MODE, shader->ctx_reg.vs.vgt_gs_mode); radeon_opt_set_context_reg(sctx, R_028A84_VGT_PRIMITIVEID_EN, SI_TRACKED_VGT_PRIMITIVEID_EN, @@ -1356,9 
+1350,7 @@ static void si_emit_shader_vs(struct si_context *sctx) SI_TRACKED_PA_CL_VS_OUT_CNTL__VS, shader->pa_cl_vs_out_cntl, SI_TRACKED_PA_CL_VS_OUT_CNTL__VS_MASK); } - - if (initial_cdw != sctx->gfx_cs.current.cdw) - sctx->context_roll = true; + radeon_end_update_context_roll(sctx); /* GE_PC_ALLOC is not a context register, so it doesn't cause a context roll. */ if (sctx->chip_class >= GFX10) @@ -1536,11 +1528,10 @@ static unsigned si_get_spi_shader_col_format(struct si_shader *shader) static void si_emit_shader_ps(struct si_context *sctx) { struct si_shader *shader = sctx->queued.named.ps->shader; - unsigned initial_cdw = sctx->gfx_cs.current.cdw; - if (!shader) return; + radeon_begin(&sctx->gfx_cs); /* R_0286CC_SPI_PS_INPUT_ENA, R_0286D0_SPI_PS_INPUT_ADDR*/ radeon_opt_set_context_reg2(sctx, R_0286CC_SPI_PS_INPUT_ENA, SI_TRACKED_SPI_PS_INPUT_ENA, shader->ctx_reg.ps.spi_ps_input_ena, @@ -1558,9 +1549,7 @@ static void si_emit_shader_ps(struct si_context *sctx) radeon_opt_set_context_reg(sctx, R_02823C_CB_SHADER_MASK, SI_TRACKED_CB_SHADER_MASK, shader->ctx_reg.ps.cb_shader_mask); - - if (initial_cdw != sctx->gfx_cs.current.cdw) - sctx->context_roll = true; + radeon_end_update_context_roll(sctx); } static void si_shader_ps(struct si_screen *sscreen, struct si_shader *shader) @@ -3371,12 +3360,10 @@ static void si_emit_spi_map(struct si_context *sctx) /* R_028644_SPI_PS_INPUT_CNTL_0 */ /* Dota 2: Only ~16% of SPI map updates set different values. */ /* Talos: Only ~9% of SPI map updates set different values. */ - unsigned initial_cdw = sctx->gfx_cs.current.cdw; + radeon_begin(&sctx->gfx_cs); radeon_opt_set_context_regn(sctx, R_028644_SPI_PS_INPUT_CNTL_0, spi_ps_input_cntl, sctx->tracked_regs.spi_ps_input_cntl, num_interp); - - if (initial_cdw != sctx->gfx_cs.current.cdw) - sctx->context_roll = true; + radeon_end_update_context_roll(sctx); } /** @@ -3405,6 +3392,8 @@ static void si_cs_preamble_add_vgt_flush(struct si_context *sctx) */ static void si_emit_vgt_flush(struct radeon_cmdbuf *cs) { + radeon_begin(cs); + /* This is required before VGT_FLUSH. */ radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 0, 0)); radeon_emit(cs, EVENT_TYPE(V_028A90_VS_PARTIAL_FLUSH) | EVENT_INDEX(4)); @@ -3412,6 +3401,7 @@ static void si_emit_vgt_flush(struct radeon_cmdbuf *cs) /* VGT_FLUSH is required even if VGT is idle. It resets VGT pointers. */ radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 0, 0)); radeon_emit(cs, EVENT_TYPE(V_028A90_VGT_FLUSH) | EVENT_INDEX(0)); + radeon_end(); } /* Initialize state related to ESGS / GSVS ring buffers */ @@ -3505,6 +3495,8 @@ static bool si_update_gs_ring_buffers(struct si_context *sctx) si_emit_vgt_flush(cs); + radeon_begin(cs); + /* Set the GS registers. */ if (sctx->esgs_ring) { assert(sctx->chip_class <= GFX8); @@ -3515,6 +3507,7 @@ static bool si_update_gs_ring_buffers(struct si_context *sctx) radeon_set_uconfig_reg(cs, R_030904_VGT_GSVS_RING_SIZE, sctx->gsvs_ring->width0 / 256); } + radeon_end(); return true; } @@ -3789,6 +3782,7 @@ static void si_init_tess_factor_ring(struct si_context *sctx) si_emit_vgt_flush(cs); /* Set tessellation registers. 
*/ + radeon_begin(cs); radeon_set_uconfig_reg(cs, R_030938_VGT_TF_RING_SIZE, S_030938_SIZE(sctx->screen->tess_factor_ring_size / 4)); radeon_set_uconfig_reg(cs, R_030940_VGT_TF_MEMORY_BASE, factor_va >> 8); @@ -3801,6 +3795,7 @@ static void si_init_tess_factor_ring(struct si_context *sctx) } radeon_set_uconfig_reg(cs, R_03093C_VGT_HS_OFFCHIP_PARAM, sctx->screen->vgt_hs_offchip_param); + radeon_end(); return; } @@ -4153,7 +4148,9 @@ static void si_emit_scratch_state(struct si_context *sctx) { struct radeon_cmdbuf *cs = &sctx->gfx_cs; + radeon_begin(cs); radeon_set_context_reg(cs, R_0286E8_SPI_TMPRING_SIZE, sctx->spi_tmpring_size); + radeon_end(); if (sctx->scratch_buffer) { radeon_add_to_buffer_list(sctx, &sctx->gfx_cs, sctx->scratch_buffer, RADEON_USAGE_READWRITE, diff --git a/src/gallium/drivers/radeonsi/si_state_streamout.c b/src/gallium/drivers/radeonsi/si_state_streamout.c index 4c38746ed16..9ba4f73517d 100644 --- a/src/gallium/drivers/radeonsi/si_state_streamout.c +++ b/src/gallium/drivers/radeonsi/si_state_streamout.c @@ -221,6 +221,8 @@ static void gfx10_emit_streamout_begin(struct si_context *sctx) last_target = i; } + radeon_begin(cs); + for (unsigned i = 0; i < sctx->streamout.num_targets; i++) { if (!t[i]) continue; @@ -246,6 +248,7 @@ static void gfx10_emit_streamout_begin(struct si_context *sctx) radeon_emit(cs, 0); radeon_emit(cs, S_414_BYTE_COUNT_GFX9(4) | S_414_DISABLE_WR_CONFIRM_GFX9(i != last_target)); } + radeon_end(); sctx->streamout.begin_emitted = true; } @@ -275,6 +278,8 @@ static void si_flush_vgt_streamout(struct si_context *sctx) struct radeon_cmdbuf *cs = &sctx->gfx_cs; unsigned reg_strmout_cntl; + radeon_begin(cs); + /* The register is at different places on different ASICs. */ if (sctx->chip_class >= GFX7) { reg_strmout_cntl = R_0300FC_CP_STRMOUT_CNTL; @@ -295,6 +300,7 @@ static void si_flush_vgt_streamout(struct si_context *sctx) radeon_emit(cs, S_0084FC_OFFSET_UPDATE_DONE(1)); /* reference value */ radeon_emit(cs, S_0084FC_OFFSET_UPDATE_DONE(1)); /* mask */ radeon_emit(cs, 4); /* poll interval */ + radeon_end(); } static void si_emit_streamout_begin(struct si_context *sctx) @@ -306,6 +312,8 @@ static void si_emit_streamout_begin(struct si_context *sctx) si_flush_vgt_streamout(sctx); + radeon_begin(cs); + for (i = 0; i < sctx->streamout.num_targets; i++) { if (!t[i]) continue; @@ -344,6 +352,7 @@ static void si_emit_streamout_begin(struct si_context *sctx) radeon_emit(cs, 0); /* unused */ } } + radeon_end(); sctx->streamout.begin_emitted = true; } @@ -362,6 +371,8 @@ void si_emit_streamout_end(struct si_context *sctx) si_flush_vgt_streamout(sctx); + radeon_begin(cs); + for (i = 0; i < sctx->streamout.num_targets; i++) { if (!t[i]) continue; @@ -383,10 +394,10 @@ void si_emit_streamout_end(struct si_context *sctx) * buffer bound. This ensures that the primitives-emitted query * won't increment. 
*/ radeon_set_context_reg(cs, R_028AD0_VGT_STRMOUT_BUFFER_SIZE_0 + 16 * i, 0); - sctx->context_roll = true; t[i]->buf_filled_size_valid = true; } + radeon_end_update_context_roll(sctx); sctx->streamout.begin_emitted = false; } @@ -402,6 +413,7 @@ static void si_emit_streamout_enable(struct si_context *sctx) { assert(!sctx->screen->use_ngg_streamout); + radeon_begin(&sctx->gfx_cs); radeon_set_context_reg_seq(&sctx->gfx_cs, R_028B94_VGT_STRMOUT_CONFIG, 2); radeon_emit(&sctx->gfx_cs, S_028B94_STREAMOUT_0_EN(si_get_strmout_en(sctx)) | S_028B94_RAST_STREAM(0) | @@ -410,6 +422,7 @@ static void si_emit_streamout_enable(struct si_context *sctx) S_028B94_STREAMOUT_3_EN(si_get_strmout_en(sctx))); radeon_emit(&sctx->gfx_cs, sctx->streamout.hw_enabled_mask & sctx->streamout.enabled_stream_buffers_mask); + radeon_end(); } static void si_set_streamout_enable(struct si_context *sctx, bool enable) diff --git a/src/gallium/drivers/radeonsi/si_state_viewport.c b/src/gallium/drivers/radeonsi/si_state_viewport.c index 41432755c64..0327d2f5d15 100644 --- a/src/gallium/drivers/radeonsi/si_state_viewport.c +++ b/src/gallium/drivers/radeonsi/si_state_viewport.c @@ -103,8 +103,10 @@ static void si_emit_cull_state(struct si_context *sctx) /* This will end up in SGPR6 as (value << 8), shifted by the hw. */ radeon_add_to_buffer_list(sctx, &sctx->gfx_cs, sctx->small_prim_cull_info_buf, RADEON_USAGE_READ, RADEON_PRIO_CONST_BUFFER); + radeon_begin(&sctx->gfx_cs); radeon_set_sh_reg(&sctx->gfx_cs, R_00B220_SPI_SHADER_PGM_LO_GS, sctx->small_prim_cull_info_address >> 8); + radeon_end(); /* Set VS_STATE.SMALL_PRIM_PRECISION for NGG culling. * @@ -213,18 +215,22 @@ static void si_emit_one_scissor(struct si_context *ctx, struct radeon_cmdbuf *cs if (scissor) si_clip_scissor(&final, scissor); + radeon_begin(cs); + /* Workaround for a hw bug on GFX6 that occurs when PA_SU_HARDWARE_- * SCREEN_OFFSET != 0 and any_scissor.BR_X/Y <= 0. 
*/ if (ctx->chip_class == GFX6 && (final.maxx == 0 || final.maxy == 0)) { radeon_emit(cs, S_028250_TL_X(1) | S_028250_TL_Y(1) | S_028250_WINDOW_OFFSET_DISABLE(1)); radeon_emit(cs, S_028254_BR_X(1) | S_028254_BR_Y(1)); + radeon_end(); return; } radeon_emit(cs, S_028250_TL_X(final.minx) | S_028250_TL_Y(final.miny) | S_028250_WINDOW_OFFSET_DISABLE(1)); radeon_emit(cs, S_028254_BR_X(final.maxx) | S_028254_BR_Y(final.maxy)); + radeon_end(); } #define MAX_PA_SU_HARDWARE_SCREEN_OFFSET 8176 @@ -350,7 +356,7 @@ static void si_emit_guardband(struct si_context *ctx) * R_028BE8_PA_CL_GB_VERT_CLIP_ADJ, R_028BEC_PA_CL_GB_VERT_DISC_ADJ * R_028BF0_PA_CL_GB_HORZ_CLIP_ADJ, R_028BF4_PA_CL_GB_HORZ_DISC_ADJ */ - unsigned initial_cdw = ctx->gfx_cs.current.cdw; + radeon_begin(&ctx->gfx_cs); radeon_opt_set_context_reg4(ctx, R_028BE8_PA_CL_GB_VERT_CLIP_ADJ, SI_TRACKED_PA_CL_GB_VERT_CLIP_ADJ, fui(guardband_y), fui(discard_y), fui(guardband_x), fui(discard_x)); @@ -362,8 +368,7 @@ static void si_emit_guardband(struct si_context *ctx) ctx, R_028BE4_PA_SU_VTX_CNTL, SI_TRACKED_PA_SU_VTX_CNTL, S_028BE4_PIX_CENTER(rs->half_pixel_center) | S_028BE4_QUANT_MODE(V_028BE4_X_16_8_FIXED_POINT_1_256TH + vp_as_scissor.quant_mode)); - if (initial_cdw != ctx->gfx_cs.current.cdw) - ctx->context_roll = true; + radeon_end_update_context_roll(ctx); } static void si_emit_scissors(struct si_context *ctx) @@ -376,7 +381,10 @@ static void si_emit_scissors(struct si_context *ctx) if (!ctx->vs_writes_viewport_index) { struct si_signed_scissor *vp = &ctx->viewports.as_scissor[0]; + radeon_begin(cs); radeon_set_context_reg_seq(cs, R_028250_PA_SC_VPORT_SCISSOR_0_TL, 2); + radeon_end(); + si_emit_one_scissor(ctx, cs, vp, scissor_enabled ? &states[0] : NULL); return; } @@ -384,7 +392,10 @@ static void si_emit_scissors(struct si_context *ctx) /* All registers in the array need to be updated if any of them is changed. * This is a hardware requirement. */ + radeon_begin(cs); radeon_set_context_reg_seq(cs, R_028250_PA_SC_VPORT_SCISSOR_0_TL, SI_MAX_VIEWPORTS * 2); + radeon_end(); + for (unsigned i = 0; i < SI_MAX_VIEWPORTS; i++) { si_emit_one_scissor(ctx, cs, &ctx->viewports.as_scissor[i], scissor_enabled ? &states[i] : NULL); @@ -477,12 +488,14 @@ static void si_emit_one_viewport(struct si_context *ctx, struct pipe_viewport_st { struct radeon_cmdbuf *cs = &ctx->gfx_cs; + radeon_begin(cs); radeon_emit(cs, fui(state->scale[0])); radeon_emit(cs, fui(state->translate[0])); radeon_emit(cs, fui(state->scale[1])); radeon_emit(cs, fui(state->translate[1])); radeon_emit(cs, fui(state->scale[2])); radeon_emit(cs, fui(state->translate[2])); + radeon_end(); } static void si_emit_viewports(struct si_context *ctx) @@ -492,7 +505,10 @@ static void si_emit_viewports(struct si_context *ctx) /* The simple case: Only 1 viewport is active. */ if (!ctx->vs_writes_viewport_index) { + radeon_begin(cs); radeon_set_context_reg_seq(cs, R_02843C_PA_CL_VPORT_XSCALE, 6); + radeon_end(); + si_emit_one_viewport(ctx, &states[0]); return; } @@ -500,7 +516,10 @@ static void si_emit_viewports(struct si_context *ctx) /* All registers in the array need to be updated if any of them is changed. * This is a hardware requirement. 
*/ + radeon_begin(cs); radeon_set_context_reg_seq(cs, R_02843C_PA_CL_VPORT_XSCALE + 0, SI_MAX_VIEWPORTS * 6); + radeon_end(); + for (unsigned i = 0; i < SI_MAX_VIEWPORTS; i++) si_emit_one_viewport(ctx, &states[i]); } @@ -528,21 +547,25 @@ static void si_emit_depth_ranges(struct si_context *ctx) if (!ctx->vs_writes_viewport_index) { si_viewport_zmin_zmax(&states[0], clip_halfz, window_space, &zmin, &zmax); + radeon_begin(cs); radeon_set_context_reg_seq(cs, R_0282D0_PA_SC_VPORT_ZMIN_0, 2); radeon_emit(cs, fui(zmin)); radeon_emit(cs, fui(zmax)); + radeon_end(); return; } /* All registers in the array need to be updated if any of them is changed. * This is a hardware requirement. */ + radeon_begin(cs); radeon_set_context_reg_seq(cs, R_0282D0_PA_SC_VPORT_ZMIN_0, SI_MAX_VIEWPORTS * 2); for (unsigned i = 0; i < SI_MAX_VIEWPORTS; i++) { si_viewport_zmin_zmax(&states[i], clip_halfz, window_space, &zmin, &zmax); radeon_emit(cs, fui(zmin)); radeon_emit(cs, fui(zmax)); } + radeon_end(); } static void si_emit_viewport_states(struct si_context *ctx) @@ -631,16 +654,20 @@ static void si_emit_window_rectangles(struct si_context *sctx) else rule = outside[num_rectangles - 1]; + radeon_begin(cs); radeon_opt_set_context_reg(sctx, R_02820C_PA_SC_CLIPRECT_RULE, SI_TRACKED_PA_SC_CLIPRECT_RULE, rule); - if (num_rectangles == 0) + if (num_rectangles == 0) { + radeon_end(); return; + } radeon_set_context_reg_seq(cs, R_028210_PA_SC_CLIPRECT_0_TL, num_rectangles * 2); for (unsigned i = 0; i < num_rectangles; i++) { radeon_emit(cs, S_028210_TL_X(rects[i].minx) | S_028210_TL_Y(rects[i].miny)); radeon_emit(cs, S_028214_BR_X(rects[i].maxx) | S_028214_BR_Y(rects[i].maxy)); } + radeon_end(); } static void si_set_window_rectangles(struct pipe_context *ctx, bool include,
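
---

Note for reviewers (illustrative, not part of the patch): every conversion above follows the same pattern, so a single sketch may help when reading the hunks. It assumes only the macros this series introduces in si_build_pm4.h (radeon_begin, radeon_emit, radeon_set_context_reg, radeon_packets_added, radeon_end, radeon_end_update_context_roll); si_emit_example_state is a hypothetical emit callback, and its register write mirrors si_emit_scratch_state above.

/* Hypothetical example -- not committed anywhere in this patch. */
static void si_emit_example_state(struct si_context *sctx)
{
   struct radeon_cmdbuf *cs = &sctx->gfx_cs;

   /* radeon_begin() snapshots cs->current.cdw and the packet buffer into
    * locals; every radeon_emit()/radeon_set_*_reg() between begin and end
    * writes through those locals instead of dereferencing cs each time. */
   radeon_begin(cs);

   radeon_set_context_reg(cs, R_0286E8_SPI_TMPRING_SIZE, sctx->spi_tmpring_size);

   /* radeon_end() stores the dword count back to cs->current.cdw and checks
    * max_dw once for the whole sequence. Callbacks that emit only tracked
    * ("opt") context registers call radeon_end_update_context_roll(sctx)
    * instead, which additionally sets sctx->context_roll when
    * radeon_packets_added() reports new packets -- this is what replaces
    * the removed initial_cdw comparisons throughout the patch. */
   radeon_end();
}

Functions that may return early between begin and end (see si_emit_draw_packets and si_emit_window_rectangles above) must call radeon_end() on every exit path, and a second begin in the same scope uses radeon_begin_again() because radeon_begin() declares the locals.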