mirror of
https://gitlab.freedesktop.org/mesa/mesa.git
synced 2026-01-02 18:10:17 +01:00
radv: add GFX9 cache flushing support.
GFX9 needs to write event EOP to a fence buffer, allocate some space for this, and just write an ever increasing number to it, this isn't exactly what radeonsi does, but it seems to work. Reviewed-by: Bas Nieuwenhuizen <bas@basnieuwenhuizen.nl> Signed-off-by: Dave Airlie <airlied@redhat.com>
This commit is contained in:
parent
b11c4a5546
commit
c2fbeb7ca0
4 changed files with 144 additions and 50 deletions
|
|
@ -234,6 +234,14 @@ static void radv_reset_cmd_buffer(struct radv_cmd_buffer *cmd_buffer)
|
|||
cmd_buffer->record_fail = false;
|
||||
|
||||
cmd_buffer->ring_offsets_idx = -1;
|
||||
|
||||
if (cmd_buffer->device->physical_device->rad_info.chip_class >= GFX9) {
|
||||
void *fence_ptr;
|
||||
radv_cmd_buffer_upload_alloc(cmd_buffer, 8, 0,
|
||||
&cmd_buffer->gfx9_fence_offset,
|
||||
&fence_ptr);
|
||||
cmd_buffer->gfx9_fence_bo = cmd_buffer->upload.upload_bo;
|
||||
}
|
||||
}
|
||||
|
||||
static bool
|
||||
|
|
|
|||
|
|
@ -1103,6 +1103,7 @@ VkResult radv_CreateDevice(
|
|||
case RADV_QUEUE_COMPUTE:
|
||||
si_cs_emit_cache_flush(device->flush_cs[family],
|
||||
device->physical_device->rad_info.chip_class,
|
||||
NULL, 0,
|
||||
family == RADV_QUEUE_COMPUTE && device->physical_device->rad_info.chip_class >= CIK,
|
||||
RADV_CMD_FLAG_INV_ICACHE |
|
||||
RADV_CMD_FLAG_INV_SMEM_L1 |
|
||||
|
|
@ -1118,6 +1119,7 @@ VkResult radv_CreateDevice(
|
|||
case RADV_QUEUE_COMPUTE:
|
||||
si_cs_emit_cache_flush(device->flush_shader_cs[family],
|
||||
device->physical_device->rad_info.chip_class,
|
||||
NULL, 0,
|
||||
family == RADV_QUEUE_COMPUTE && device->physical_device->rad_info.chip_class >= CIK,
|
||||
family == RADV_QUEUE_COMPUTE ? RADV_CMD_FLAG_CS_PARTIAL_FLUSH : (RADV_CMD_FLAG_CS_PARTIAL_FLUSH | RADV_CMD_FLAG_PS_PARTIAL_FLUSH) |
|
||||
RADV_CMD_FLAG_INV_ICACHE |
|
||||
|
|
@ -1763,6 +1765,7 @@ radv_get_preamble_cs(struct radv_queue *queue,
|
|||
if (!i) {
|
||||
si_cs_emit_cache_flush(cs,
|
||||
queue->device->physical_device->rad_info.chip_class,
|
||||
NULL, 0,
|
||||
queue->queue_family_index == RING_COMPUTE &&
|
||||
queue->device->physical_device->rad_info.chip_class >= CIK,
|
||||
RADV_CMD_FLAG_INV_ICACHE |
|
||||
|
|
|
|||
|
|
@ -822,6 +822,9 @@ struct radv_cmd_buffer {
|
|||
bool record_fail;
|
||||
|
||||
int ring_offsets_idx; /* just used for verification */
|
||||
uint32_t gfx9_fence_offset;
|
||||
struct radeon_winsys_bo *gfx9_fence_bo;
|
||||
uint32_t gfx9_fence_idx;
|
||||
};
|
||||
|
||||
struct radv_image;
|
||||
|
|
@ -854,9 +857,10 @@ void si_emit_wait_fence(struct radeon_winsys_cs *cs,
|
|||
uint64_t va, uint32_t ref,
|
||||
uint32_t mask);
|
||||
void si_cs_emit_cache_flush(struct radeon_winsys_cs *cs,
|
||||
enum chip_class chip_class,
|
||||
bool is_mec,
|
||||
enum radv_cmd_flush_bits flush_bits);
|
||||
enum chip_class chip_class,
|
||||
uint32_t *fence_ptr, uint64_t va,
|
||||
bool is_mec,
|
||||
enum radv_cmd_flush_bits flush_bits);
|
||||
void si_emit_cache_flush(struct radv_cmd_buffer *cmd_buffer);
|
||||
void si_cp_dma_buffer_copy(struct radv_cmd_buffer *cmd_buffer,
|
||||
uint64_t src_va, uint64_t dest_va,
|
||||
|
|
|
|||
|
|
@ -823,15 +823,18 @@ void si_cs_emit_write_event_eop(struct radeon_winsys_cs *cs,
|
|||
unsigned op = EVENT_TYPE(event) |
|
||||
EVENT_INDEX(5) |
|
||||
event_flags;
|
||||
unsigned is_gfx8_mec = is_mec && chip_class < GFX9;
|
||||
|
||||
if (is_mec) {
|
||||
radeon_emit(cs, PKT3(PKT3_RELEASE_MEM, 5, 0));
|
||||
if (chip_class >= GFX9 || is_gfx8_mec) {
|
||||
radeon_emit(cs, PKT3(PKT3_RELEASE_MEM, is_gfx8_mec ? 5 : 6, 0));
|
||||
radeon_emit(cs, op);
|
||||
radeon_emit(cs, EOP_DATA_SEL(data_sel));
|
||||
radeon_emit(cs, va); /* address lo */
|
||||
radeon_emit(cs, va >> 32); /* address hi */
|
||||
radeon_emit(cs, new_fence); /* immediate data lo */
|
||||
radeon_emit(cs, 0); /* immediate data hi */
|
||||
if (!is_gfx8_mec)
|
||||
radeon_emit(cs, 0); /* unused */
|
||||
} else {
|
||||
if (chip_class == CIK ||
|
||||
chip_class == VI) {
|
||||
|
|
@ -872,15 +875,16 @@ si_emit_wait_fence(struct radeon_winsys_cs *cs,
|
|||
|
||||
static void
|
||||
si_emit_acquire_mem(struct radeon_winsys_cs *cs,
|
||||
bool is_mec,
|
||||
bool is_mec, bool is_gfx9,
|
||||
unsigned cp_coher_cntl)
|
||||
{
|
||||
if (is_mec) {
|
||||
if (is_mec || is_gfx9) {
|
||||
uint32_t hi_val = is_gfx9 ? 0xffffff : 0xff;
|
||||
radeon_emit(cs, PKT3(PKT3_ACQUIRE_MEM, 5, 0) |
|
||||
PKT3_SHADER_TYPE_S(1));
|
||||
PKT3_SHADER_TYPE_S(is_mec));
|
||||
radeon_emit(cs, cp_coher_cntl); /* CP_COHER_CNTL */
|
||||
radeon_emit(cs, 0xffffffff); /* CP_COHER_SIZE */
|
||||
radeon_emit(cs, 0xff); /* CP_COHER_SIZE_HI */
|
||||
radeon_emit(cs, hi_val); /* CP_COHER_SIZE_HI */
|
||||
radeon_emit(cs, 0); /* CP_COHER_BASE */
|
||||
radeon_emit(cs, 0); /* CP_COHER_BASE_HI */
|
||||
radeon_emit(cs, 0x0000000A); /* POLL_INTERVAL */
|
||||
|
|
@ -897,40 +901,45 @@ si_emit_acquire_mem(struct radeon_winsys_cs *cs,
|
|||
void
|
||||
si_cs_emit_cache_flush(struct radeon_winsys_cs *cs,
|
||||
enum chip_class chip_class,
|
||||
uint32_t *flush_cnt,
|
||||
uint64_t flush_va,
|
||||
bool is_mec,
|
||||
enum radv_cmd_flush_bits flush_bits)
|
||||
{
|
||||
unsigned cp_coher_cntl = 0;
|
||||
|
||||
uint32_t flush_cb_db = flush_bits & (RADV_CMD_FLAG_FLUSH_AND_INV_CB |
|
||||
RADV_CMD_FLAG_FLUSH_AND_INV_DB);
|
||||
|
||||
if (flush_bits & RADV_CMD_FLAG_INV_ICACHE)
|
||||
cp_coher_cntl |= S_0085F0_SH_ICACHE_ACTION_ENA(1);
|
||||
if (flush_bits & RADV_CMD_FLAG_INV_SMEM_L1)
|
||||
cp_coher_cntl |= S_0085F0_SH_KCACHE_ACTION_ENA(1);
|
||||
|
||||
if (flush_bits & RADV_CMD_FLAG_FLUSH_AND_INV_CB) {
|
||||
cp_coher_cntl |= S_0085F0_CB_ACTION_ENA(1) |
|
||||
S_0085F0_CB0_DEST_BASE_ENA(1) |
|
||||
S_0085F0_CB1_DEST_BASE_ENA(1) |
|
||||
S_0085F0_CB2_DEST_BASE_ENA(1) |
|
||||
S_0085F0_CB3_DEST_BASE_ENA(1) |
|
||||
S_0085F0_CB4_DEST_BASE_ENA(1) |
|
||||
S_0085F0_CB5_DEST_BASE_ENA(1) |
|
||||
S_0085F0_CB6_DEST_BASE_ENA(1) |
|
||||
S_0085F0_CB7_DEST_BASE_ENA(1);
|
||||
if (chip_class <= VI) {
|
||||
if (flush_bits & RADV_CMD_FLAG_FLUSH_AND_INV_CB) {
|
||||
cp_coher_cntl |= S_0085F0_CB_ACTION_ENA(1) |
|
||||
S_0085F0_CB0_DEST_BASE_ENA(1) |
|
||||
S_0085F0_CB1_DEST_BASE_ENA(1) |
|
||||
S_0085F0_CB2_DEST_BASE_ENA(1) |
|
||||
S_0085F0_CB3_DEST_BASE_ENA(1) |
|
||||
S_0085F0_CB4_DEST_BASE_ENA(1) |
|
||||
S_0085F0_CB5_DEST_BASE_ENA(1) |
|
||||
S_0085F0_CB6_DEST_BASE_ENA(1) |
|
||||
S_0085F0_CB7_DEST_BASE_ENA(1);
|
||||
|
||||
/* Necessary for DCC */
|
||||
if (chip_class >= VI) {
|
||||
si_cs_emit_write_event_eop(cs,
|
||||
chip_class,
|
||||
is_mec,
|
||||
V_028A90_FLUSH_AND_INV_CB_DATA_TS,
|
||||
0, 0, 0, 0, 0);
|
||||
/* Necessary for DCC */
|
||||
if (chip_class >= VI) {
|
||||
si_cs_emit_write_event_eop(cs,
|
||||
chip_class,
|
||||
is_mec,
|
||||
V_028A90_FLUSH_AND_INV_CB_DATA_TS,
|
||||
0, 0, 0, 0, 0);
|
||||
}
|
||||
}
|
||||
if (flush_bits & RADV_CMD_FLAG_FLUSH_AND_INV_DB) {
|
||||
cp_coher_cntl |= S_0085F0_DB_ACTION_ENA(1) |
|
||||
S_0085F0_DB_DEST_BASE_ENA(1);
|
||||
}
|
||||
}
|
||||
|
||||
if (flush_bits & RADV_CMD_FLAG_FLUSH_AND_INV_DB) {
|
||||
cp_coher_cntl |= S_0085F0_DB_ACTION_ENA(1) |
|
||||
S_0085F0_DB_DEST_BASE_ENA(1);
|
||||
}
|
||||
|
||||
if (flush_bits & RADV_CMD_FLAG_FLUSH_AND_INV_CB_META) {
|
||||
|
|
@ -943,8 +952,7 @@ si_cs_emit_cache_flush(struct radeon_winsys_cs *cs,
|
|||
radeon_emit(cs, EVENT_TYPE(V_028A90_FLUSH_AND_INV_DB_META) | EVENT_INDEX(0));
|
||||
}
|
||||
|
||||
if (!(flush_bits & (RADV_CMD_FLAG_FLUSH_AND_INV_CB |
|
||||
RADV_CMD_FLAG_FLUSH_AND_INV_DB))) {
|
||||
if (!flush_cb_db) {
|
||||
if (flush_bits & RADV_CMD_FLAG_PS_PARTIAL_FLUSH) {
|
||||
radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 0, 0));
|
||||
radeon_emit(cs, EVENT_TYPE(V_028A90_PS_PARTIAL_FLUSH) | EVENT_INDEX(4));
|
||||
|
|
@ -959,6 +967,54 @@ si_cs_emit_cache_flush(struct radeon_winsys_cs *cs,
|
|||
radeon_emit(cs, EVENT_TYPE(V_028A90_CS_PARTIAL_FLUSH) | EVENT_INDEX(4));
|
||||
}
|
||||
|
||||
if (chip_class >= GFX9 && flush_cb_db) {
|
||||
unsigned cb_db_event, tc_flags;
|
||||
|
||||
/* Set the CB/DB flush event. */
|
||||
switch (flush_cb_db) {
|
||||
case RADV_CMD_FLAG_FLUSH_AND_INV_CB:
|
||||
cb_db_event = V_028A90_FLUSH_AND_INV_CB_DATA_TS;
|
||||
break;
|
||||
case RADV_CMD_FLAG_FLUSH_AND_INV_DB:
|
||||
cb_db_event = V_028A90_FLUSH_AND_INV_DB_DATA_TS;
|
||||
break;
|
||||
default:
|
||||
/* both CB & DB */
|
||||
cb_db_event = V_028A90_CACHE_FLUSH_AND_INV_TS_EVENT;
|
||||
}
|
||||
|
||||
/* TC | TC_WB = invalidate L2 data
|
||||
* TC_MD | TC_WB = invalidate L2 metadata
|
||||
* TC | TC_WB | TC_MD = invalidate L2 data & metadata
|
||||
*
|
||||
* The metadata cache must always be invalidated for coherency
|
||||
* between CB/DB and shaders. (metadata = HTILE, CMASK, DCC)
|
||||
*
|
||||
* TC must be invalidated on GFX9 only if the CB/DB surface is
|
||||
* not pipe-aligned. If the surface is RB-aligned, it might not
|
||||
* strictly be pipe-aligned since RB alignment takes precendence.
|
||||
*/
|
||||
tc_flags = EVENT_TC_WB_ACTION_ENA |
|
||||
EVENT_TC_MD_ACTION_ENA;
|
||||
|
||||
/* Ideally flush TC together with CB/DB. */
|
||||
if (flush_bits & RADV_CMD_FLAG_INV_GLOBAL_L2) {
|
||||
tc_flags |= EVENT_TC_ACTION_ENA |
|
||||
EVENT_TCL1_ACTION_ENA;
|
||||
|
||||
/* Clear the flags. */
|
||||
flush_bits &= ~(RADV_CMD_FLAG_INV_GLOBAL_L2 |
|
||||
RADV_CMD_FLAG_WRITEBACK_GLOBAL_L2 |
|
||||
RADV_CMD_FLAG_INV_VMEM_L1);
|
||||
}
|
||||
assert(flush_cnt);
|
||||
uint32_t old_fence = (*flush_cnt)++;
|
||||
|
||||
si_cs_emit_write_event_eop(cs, chip_class, false, cb_db_event, tc_flags, 1,
|
||||
flush_va, old_fence, *flush_cnt);
|
||||
si_emit_wait_fence(cs, flush_va, *flush_cnt, 0xffffffff);
|
||||
}
|
||||
|
||||
/* VGT state sync */
|
||||
if (flush_bits & RADV_CMD_FLAG_VGT_FLUSH) {
|
||||
radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 0, 0));
|
||||
|
|
@ -968,7 +1024,11 @@ si_cs_emit_cache_flush(struct radeon_winsys_cs *cs,
|
|||
/* Make sure ME is idle (it executes most packets) before continuing.
|
||||
* This prevents read-after-write hazards between PFP and ME.
|
||||
*/
|
||||
if ((cp_coher_cntl || (flush_bits & RADV_CMD_FLAG_CS_PARTIAL_FLUSH)) &&
|
||||
if ((cp_coher_cntl ||
|
||||
(flush_bits & (RADV_CMD_FLAG_CS_PARTIAL_FLUSH |
|
||||
RADV_CMD_FLAG_INV_VMEM_L1 |
|
||||
RADV_CMD_FLAG_INV_GLOBAL_L2 |
|
||||
RADV_CMD_FLAG_WRITEBACK_GLOBAL_L2))) &&
|
||||
!is_mec) {
|
||||
radeon_emit(cs, PKT3(PKT3_PFP_SYNC_ME, 0, 0));
|
||||
radeon_emit(cs, 0);
|
||||
|
|
@ -976,34 +1036,46 @@ si_cs_emit_cache_flush(struct radeon_winsys_cs *cs,
|
|||
|
||||
if ((flush_bits & RADV_CMD_FLAG_INV_GLOBAL_L2) ||
|
||||
(chip_class <= CIK && (flush_bits & RADV_CMD_FLAG_WRITEBACK_GLOBAL_L2))) {
|
||||
cp_coher_cntl |= S_0085F0_TC_ACTION_ENA(1);
|
||||
if (chip_class >= VI)
|
||||
cp_coher_cntl |= S_0301F0_TC_WB_ACTION_ENA(1);
|
||||
} else if(flush_bits & RADV_CMD_FLAG_WRITEBACK_GLOBAL_L2) {
|
||||
cp_coher_cntl |= S_0301F0_TC_WB_ACTION_ENA(1) |
|
||||
S_0301F0_TC_NC_ACTION_ENA(1);
|
||||
|
||||
/* L2 writeback doesn't combine with L1 invalidate */
|
||||
si_emit_acquire_mem(cs, is_mec, cp_coher_cntl);
|
||||
|
||||
si_emit_acquire_mem(cs, is_mec, chip_class >= GFX9,
|
||||
cp_coher_cntl |
|
||||
S_0085F0_TC_ACTION_ENA(1) |
|
||||
S_0085F0_TCL1_ACTION_ENA(1) |
|
||||
S_0301F0_TC_WB_ACTION_ENA(chip_class >= VI));
|
||||
cp_coher_cntl = 0;
|
||||
} else {
|
||||
if(flush_bits & RADV_CMD_FLAG_WRITEBACK_GLOBAL_L2) {
|
||||
/* WB = write-back
|
||||
* NC = apply to non-coherent MTYPEs
|
||||
* (i.e. MTYPE <= 1, which is what we use everywhere)
|
||||
*
|
||||
* WB doesn't work without NC.
|
||||
*/
|
||||
si_emit_acquire_mem(cs, is_mec, chip_class >= GFX9,
|
||||
cp_coher_cntl |
|
||||
S_0301F0_TC_WB_ACTION_ENA(1) |
|
||||
S_0301F0_TC_NC_ACTION_ENA(1));
|
||||
cp_coher_cntl = 0;
|
||||
}
|
||||
if (flush_bits & RADV_CMD_FLAG_INV_VMEM_L1) {
|
||||
si_emit_acquire_mem(cs, is_mec, chip_class >= GFX9,
|
||||
cp_coher_cntl |
|
||||
S_0085F0_TCL1_ACTION_ENA(1));
|
||||
cp_coher_cntl = 0;
|
||||
}
|
||||
}
|
||||
|
||||
if (flush_bits & RADV_CMD_FLAG_INV_VMEM_L1)
|
||||
cp_coher_cntl |= S_0085F0_TCL1_ACTION_ENA(1);
|
||||
|
||||
/* When one of the DEST_BASE flags is set, SURFACE_SYNC waits for idle.
|
||||
* Therefore, it should be last. Done in PFP.
|
||||
*/
|
||||
if (cp_coher_cntl)
|
||||
si_emit_acquire_mem(cs, is_mec, cp_coher_cntl);
|
||||
si_emit_acquire_mem(cs, is_mec, chip_class >= GFX9, cp_coher_cntl);
|
||||
}
|
||||
|
||||
void
|
||||
si_emit_cache_flush(struct radv_cmd_buffer *cmd_buffer)
|
||||
{
|
||||
bool is_compute = cmd_buffer->queue_family_index == RADV_QUEUE_COMPUTE;
|
||||
|
||||
enum chip_class chip_class = cmd_buffer->device->physical_device->rad_info.chip_class;
|
||||
if (is_compute)
|
||||
cmd_buffer->state.flush_bits &= ~(RADV_CMD_FLAG_FLUSH_AND_INV_CB |
|
||||
RADV_CMD_FLAG_FLUSH_AND_INV_CB_META |
|
||||
|
|
@ -1015,8 +1087,15 @@ si_emit_cache_flush(struct radv_cmd_buffer *cmd_buffer)
|
|||
|
||||
radeon_check_space(cmd_buffer->device->ws, cmd_buffer->cs, 128);
|
||||
|
||||
uint32_t *ptr = NULL;
|
||||
uint64_t va = 0;
|
||||
if (chip_class == GFX9) {
|
||||
va = cmd_buffer->device->ws->buffer_get_va(cmd_buffer->gfx9_fence_bo) + cmd_buffer->gfx9_fence_offset;
|
||||
ptr = &cmd_buffer->gfx9_fence_idx;
|
||||
}
|
||||
si_cs_emit_cache_flush(cmd_buffer->cs,
|
||||
cmd_buffer->device->physical_device->rad_info.chip_class,
|
||||
ptr, va,
|
||||
radv_cmd_buffer_uses_mec(cmd_buffer),
|
||||
cmd_buffer->state.flush_bits);
|
||||
|
||||
|
|
|
|||
Loading…
Add table
Reference in a new issue