mirror of
https://gitlab.freedesktop.org/mesa/mesa.git
synced 2026-05-07 13:38:06 +02:00
radv: fix conditional rendering on compute queue on GFX6
PKT3_SET_PREDICATION is GFX only, even on GFX6.
This fixes the recent
dEQP-VK.conditional_rendering.dispatch.*_compute_queue tests.
Cc: mesa-stable
Signed-off-by: Samuel Pitoiset <samuel.pitoiset@gmail.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/27746>
(cherry picked from commit cdf7e35618)
This commit is contained in:
parent
8ea0390197
commit
b14d23014e
2 changed files with 93 additions and 73 deletions
|
|
@ -2154,7 +2154,7 @@
|
|||
"description": "radv: fix conditional rendering on compute queue on GFX6",
|
||||
"nominated": true,
|
||||
"nomination_type": 0,
|
||||
"resolution": 0,
|
||||
"resolution": 1,
|
||||
"main_sha": null,
|
||||
"because_sha": null,
|
||||
"notes": null
|
||||
|
|
|
|||
|
|
@ -5758,7 +5758,7 @@ radv_BeginCommandBuffer(VkCommandBuffer commandBuffer, const VkCommandBufferBegi
|
|||
cmd_buffer->state.dirty |= RADV_CMD_DIRTY_DYNAMIC_ALL | RADV_CMD_DIRTY_GUARDBAND | RADV_CMD_DIRTY_OCCLUSION_QUERY |
|
||||
RADV_CMD_DIRTY_DB_SHADER_CONTROL;
|
||||
|
||||
if (cmd_buffer->device->physical_device->rad_info.gfx_level >= GFX7) {
|
||||
if (cmd_buffer->qf == RADV_QUEUE_COMPUTE || cmd_buffer->device->vk.enabled_features.taskShader) {
|
||||
uint32_t pred_value = 0;
|
||||
uint32_t pred_offset;
|
||||
if (!radv_cmd_buffer_upload_data(cmd_buffer, 4, &pred_value, &pred_offset))
|
||||
|
|
@ -7937,8 +7937,8 @@ radv_emit_view_index(struct radv_cmd_buffer *cmd_buffer, unsigned index)
|
|||
* space in the upload BO and emit some packets to invert the condition.
|
||||
*/
|
||||
static void
|
||||
radv_cs_emit_compute_predication(struct radv_cmd_state *state, struct radeon_cmdbuf *cs, uint64_t inv_va,
|
||||
bool *inv_emitted, unsigned dwords)
|
||||
radv_cs_emit_compute_predication(const struct radv_device *device, struct radv_cmd_state *state,
|
||||
struct radeon_cmdbuf *cs, uint64_t inv_va, bool *inv_emitted, unsigned dwords)
|
||||
{
|
||||
if (!state->predicating)
|
||||
return;
|
||||
|
|
@ -7948,28 +7948,37 @@ radv_cs_emit_compute_predication(struct radv_cmd_state *state, struct radeon_cmd
|
|||
if (!state->predication_type) {
|
||||
/* Invert the condition the first time it is needed. */
|
||||
if (!*inv_emitted) {
|
||||
const enum amd_gfx_level gfx_level = device->physical_device->rad_info.gfx_level;
|
||||
|
||||
*inv_emitted = true;
|
||||
|
||||
/* Write 1 to the inverted predication VA. */
|
||||
radeon_emit(cs, PKT3(PKT3_COPY_DATA, 4, 0));
|
||||
radeon_emit(cs,
|
||||
COPY_DATA_SRC_SEL(COPY_DATA_IMM) | COPY_DATA_DST_SEL(COPY_DATA_DST_MEM) | COPY_DATA_WR_CONFIRM);
|
||||
radeon_emit(cs, COPY_DATA_SRC_SEL(COPY_DATA_IMM) | COPY_DATA_DST_SEL(COPY_DATA_DST_MEM) |
|
||||
COPY_DATA_WR_CONFIRM | (gfx_level == GFX6 ? COPY_DATA_ENGINE_PFP : 0));
|
||||
radeon_emit(cs, 1);
|
||||
radeon_emit(cs, 0);
|
||||
radeon_emit(cs, inv_va);
|
||||
radeon_emit(cs, inv_va >> 32);
|
||||
|
||||
/* If the API predication VA == 0, skip next command. */
|
||||
radeon_emit(cs, PKT3(PKT3_COND_EXEC, 3, 0));
|
||||
radeon_emit(cs, va);
|
||||
radeon_emit(cs, va >> 32);
|
||||
radeon_emit(cs, 0);
|
||||
radeon_emit(cs, 6); /* 1x COPY_DATA size */
|
||||
if (device->physical_device->rad_info.gfx_level >= GFX7) {
|
||||
radeon_emit(cs, PKT3(PKT3_COND_EXEC, 3, 0));
|
||||
radeon_emit(cs, va);
|
||||
radeon_emit(cs, va >> 32);
|
||||
radeon_emit(cs, 0);
|
||||
radeon_emit(cs, 6); /* 1x COPY_DATA size */
|
||||
} else {
|
||||
radeon_emit(cs, PKT3(PKT3_COND_EXEC, 2, 0));
|
||||
radeon_emit(cs, va);
|
||||
radeon_emit(cs, va >> 32);
|
||||
radeon_emit(cs, 6); /* 1x COPY_DATA size */
|
||||
}
|
||||
|
||||
/* Write 0 to the new predication VA (when the API condition != 0) */
|
||||
radeon_emit(cs, PKT3(PKT3_COPY_DATA, 4, 0));
|
||||
radeon_emit(cs,
|
||||
COPY_DATA_SRC_SEL(COPY_DATA_IMM) | COPY_DATA_DST_SEL(COPY_DATA_DST_MEM) | COPY_DATA_WR_CONFIRM);
|
||||
radeon_emit(cs, COPY_DATA_SRC_SEL(COPY_DATA_IMM) | COPY_DATA_DST_SEL(COPY_DATA_DST_MEM) |
|
||||
COPY_DATA_WR_CONFIRM | (gfx_level == GFX6 ? COPY_DATA_ENGINE_PFP : 0));
|
||||
radeon_emit(cs, 0);
|
||||
radeon_emit(cs, 0);
|
||||
radeon_emit(cs, inv_va);
|
||||
|
|
@ -7979,11 +7988,18 @@ radv_cs_emit_compute_predication(struct radv_cmd_state *state, struct radeon_cmd
|
|||
va = inv_va;
|
||||
}
|
||||
|
||||
radeon_emit(cs, PKT3(PKT3_COND_EXEC, 3, 0));
|
||||
radeon_emit(cs, va);
|
||||
radeon_emit(cs, va >> 32);
|
||||
radeon_emit(cs, 0); /* Cache policy */
|
||||
radeon_emit(cs, dwords); /* Size of the predicated packet(s) in DWORDs. */
|
||||
if (device->physical_device->rad_info.gfx_level >= GFX7) {
|
||||
radeon_emit(cs, PKT3(PKT3_COND_EXEC, 3, 0));
|
||||
radeon_emit(cs, va);
|
||||
radeon_emit(cs, va >> 32);
|
||||
radeon_emit(cs, 0); /* Cache policy */
|
||||
radeon_emit(cs, dwords); /* Size of the predicated packet(s) in DWORDs. */
|
||||
} else {
|
||||
radeon_emit(cs, PKT3(PKT3_COND_EXEC, 2, 0));
|
||||
radeon_emit(cs, va);
|
||||
radeon_emit(cs, va >> 32);
|
||||
radeon_emit(cs, dwords); /* Size of the predicated packet(s) in DWORDs. */
|
||||
}
|
||||
}
|
||||
|
||||
static void
|
||||
|
|
@ -8561,8 +8577,9 @@ radv_emit_direct_taskmesh_draw_packets(struct radv_cmd_buffer *cmd_buffer, uint3
|
|||
ace_predication_size += num_views * 3; /* SET_SH_REG size (view index SGPR) */
|
||||
|
||||
radv_emit_userdata_task(cmd_buffer, x, y, z, 0);
|
||||
radv_cs_emit_compute_predication(&cmd_buffer->state, cmd_buffer->gang.cs, cmd_buffer->mec_inv_pred_va,
|
||||
&cmd_buffer->mec_inv_pred_emitted, ace_predication_size);
|
||||
radv_cs_emit_compute_predication(cmd_buffer->device, &cmd_buffer->state, cmd_buffer->gang.cs,
|
||||
cmd_buffer->mec_inv_pred_va, &cmd_buffer->mec_inv_pred_emitted,
|
||||
ace_predication_size);
|
||||
|
||||
if (!view_mask) {
|
||||
radv_cs_emit_dispatch_taskmesh_direct_ace_packet(cmd_buffer, x, y, z);
|
||||
|
|
@ -8631,8 +8648,9 @@ radv_emit_indirect_taskmesh_draw_packets(struct radv_cmd_buffer *cmd_buffer, con
|
|||
}
|
||||
|
||||
radv_cs_add_buffer(ws, cmd_buffer->gang.cs, info->indirect->bo);
|
||||
radv_cs_emit_compute_predication(&cmd_buffer->state, cmd_buffer->gang.cs, cmd_buffer->mec_inv_pred_va,
|
||||
&cmd_buffer->mec_inv_pred_emitted, ace_predication_size);
|
||||
radv_cs_emit_compute_predication(cmd_buffer->device, &cmd_buffer->state, cmd_buffer->gang.cs,
|
||||
cmd_buffer->mec_inv_pred_va, &cmd_buffer->mec_inv_pred_emitted,
|
||||
ace_predication_size);
|
||||
|
||||
if (workaround_cond_va) {
|
||||
radeon_emit(ace_cs, PKT3(PKT3_COND_EXEC, 3, 0));
|
||||
|
|
@ -9674,7 +9692,7 @@ radv_emit_dispatch_packets(struct radv_cmd_buffer *cmd_buffer, const struct radv
|
|||
const unsigned ace_predication_size =
|
||||
4 /* DISPATCH_INDIRECT */ + (needs_align32_workaround ? 6 * 3 /* 3x COPY_DATA */ : 0);
|
||||
|
||||
radv_cs_emit_compute_predication(&cmd_buffer->state, cs, cmd_buffer->mec_inv_pred_va,
|
||||
radv_cs_emit_compute_predication(cmd_buffer->device, &cmd_buffer->state, cs, cmd_buffer->mec_inv_pred_va,
|
||||
&cmd_buffer->mec_inv_pred_emitted, ace_predication_size);
|
||||
|
||||
if (needs_align32_workaround) {
|
||||
|
|
@ -9779,8 +9797,8 @@ radv_emit_dispatch_packets(struct radv_cmd_buffer *cmd_buffer, const struct radv
|
|||
dispatch_initiator |= S_00B800_FORCE_START_AT_000(1);
|
||||
}
|
||||
|
||||
if (radv_cmd_buffer_uses_mec(cmd_buffer)) {
|
||||
radv_cs_emit_compute_predication(&cmd_buffer->state, cs, cmd_buffer->mec_inv_pred_va,
|
||||
if (cmd_buffer->qf == RADV_QUEUE_COMPUTE) {
|
||||
radv_cs_emit_compute_predication(cmd_buffer->device, &cmd_buffer->state, cs, cmd_buffer->mec_inv_pred_va,
|
||||
&cmd_buffer->mec_inv_pred_emitted, 5 /* DISPATCH_DIRECT size */);
|
||||
predicating = false;
|
||||
}
|
||||
|
|
@ -10790,69 +10808,71 @@ radv_begin_conditional_rendering(struct radv_cmd_buffer *cmd_buffer, uint64_t va
|
|||
|
||||
radv_emit_cache_flush(cmd_buffer);
|
||||
|
||||
if (cmd_buffer->qf == RADV_QUEUE_GENERAL && !cmd_buffer->device->physical_device->rad_info.has_32bit_predication) {
|
||||
uint64_t pred_value = 0, pred_va;
|
||||
unsigned pred_offset;
|
||||
if (cmd_buffer->qf == RADV_QUEUE_GENERAL) {
|
||||
if (!cmd_buffer->device->physical_device->rad_info.has_32bit_predication) {
|
||||
uint64_t pred_value = 0, pred_va;
|
||||
unsigned pred_offset;
|
||||
|
||||
/* From the Vulkan spec 1.1.107:
|
||||
*
|
||||
* "If the 32-bit value at offset in buffer memory is zero,
|
||||
* then the rendering commands are discarded, otherwise they
|
||||
* are executed as normal. If the value of the predicate in
|
||||
* buffer memory changes while conditional rendering is
|
||||
* active, the rendering commands may be discarded in an
|
||||
* implementation-dependent way. Some implementations may
|
||||
* latch the value of the predicate upon beginning conditional
|
||||
* rendering while others may read it before every rendering
|
||||
* command."
|
||||
*
|
||||
* But, the AMD hardware treats the predicate as a 64-bit
|
||||
* value which means we need a workaround in the driver.
|
||||
* Luckily, it's not required to support if the value changes
|
||||
* when predication is active.
|
||||
*
|
||||
* The workaround is as follows:
|
||||
* 1) allocate a 64-bit value in the upload BO and initialize it
|
||||
* to 0
|
||||
* 2) copy the 32-bit predicate value to the upload BO
|
||||
* 3) use the new allocated VA address for predication
|
||||
*
|
||||
* Based on the conditionalrender demo, it's faster to do the
|
||||
* COPY_DATA in ME (+ sync PFP) instead of PFP.
|
||||
*/
|
||||
radv_cmd_buffer_upload_data(cmd_buffer, 8, &pred_value, &pred_offset);
|
||||
/* From the Vulkan spec 1.1.107:
|
||||
*
|
||||
* "If the 32-bit value at offset in buffer memory is zero,
|
||||
* then the rendering commands are discarded, otherwise they
|
||||
* are executed as normal. If the value of the predicate in
|
||||
* buffer memory changes while conditional rendering is
|
||||
* active, the rendering commands may be discarded in an
|
||||
* implementation-dependent way. Some implementations may
|
||||
* latch the value of the predicate upon beginning conditional
|
||||
* rendering while others may read it before every rendering
|
||||
* command."
|
||||
*
|
||||
* But, the AMD hardware treats the predicate as a 64-bit
|
||||
* value which means we need a workaround in the driver.
|
||||
* Luckily, it's not required to support if the value changes
|
||||
* when predication is active.
|
||||
*
|
||||
* The workaround is as follows:
|
||||
* 1) allocate a 64-bit value in the upload BO and initialize it
|
||||
* to 0
|
||||
* 2) copy the 32-bit predicate value to the upload BO
|
||||
* 3) use the new allocated VA address for predication
|
||||
*
|
||||
* Based on the conditionalrender demo, it's faster to do the
|
||||
* COPY_DATA in ME (+ sync PFP) instead of PFP.
|
||||
*/
|
||||
radv_cmd_buffer_upload_data(cmd_buffer, 8, &pred_value, &pred_offset);
|
||||
|
||||
pred_va = radv_buffer_get_va(cmd_buffer->upload.upload_bo) + pred_offset;
|
||||
pred_va = radv_buffer_get_va(cmd_buffer->upload.upload_bo) + pred_offset;
|
||||
|
||||
radeon_check_space(cmd_buffer->device->ws, cmd_buffer->cs, 8);
|
||||
radeon_check_space(cmd_buffer->device->ws, cmd_buffer->cs, 8);
|
||||
|
||||
radeon_emit(cs, PKT3(PKT3_COPY_DATA, 4, 0));
|
||||
radeon_emit(cs,
|
||||
COPY_DATA_SRC_SEL(COPY_DATA_SRC_MEM) | COPY_DATA_DST_SEL(COPY_DATA_DST_MEM) | COPY_DATA_WR_CONFIRM);
|
||||
radeon_emit(cs, va);
|
||||
radeon_emit(cs, va >> 32);
|
||||
radeon_emit(cs, pred_va);
|
||||
radeon_emit(cs, pred_va >> 32);
|
||||
radeon_emit(cs, PKT3(PKT3_COPY_DATA, 4, 0));
|
||||
radeon_emit(
|
||||
cs, COPY_DATA_SRC_SEL(COPY_DATA_SRC_MEM) | COPY_DATA_DST_SEL(COPY_DATA_DST_MEM) | COPY_DATA_WR_CONFIRM);
|
||||
radeon_emit(cs, va);
|
||||
radeon_emit(cs, va >> 32);
|
||||
radeon_emit(cs, pred_va);
|
||||
radeon_emit(cs, pred_va >> 32);
|
||||
|
||||
radeon_emit(cs, PKT3(PKT3_PFP_SYNC_ME, 0, 0));
|
||||
radeon_emit(cs, 0);
|
||||
radeon_emit(cs, PKT3(PKT3_PFP_SYNC_ME, 0, 0));
|
||||
radeon_emit(cs, 0);
|
||||
|
||||
va = pred_va;
|
||||
pred_op = PREDICATION_OP_BOOL64;
|
||||
}
|
||||
va = pred_va;
|
||||
pred_op = PREDICATION_OP_BOOL64;
|
||||
}
|
||||
|
||||
/* MEC doesn't support predication, we emulate it elsewhere. */
|
||||
if (!radv_cmd_buffer_uses_mec(cmd_buffer)) {
|
||||
radv_emit_set_predication_state(cmd_buffer, draw_visible, pred_op, va);
|
||||
} else {
|
||||
/* Compute queue doesn't support predication and it's emulated elsewhere. */
|
||||
}
|
||||
}
|
||||
|
||||
void
|
||||
radv_end_conditional_rendering(struct radv_cmd_buffer *cmd_buffer)
|
||||
{
|
||||
/* MEC doesn't support predication, no need to emit anything here. */
|
||||
if (!radv_cmd_buffer_uses_mec(cmd_buffer)) {
|
||||
if (cmd_buffer->qf == RADV_QUEUE_GENERAL) {
|
||||
radv_emit_set_predication_state(cmd_buffer, false, 0, 0);
|
||||
} else {
|
||||
/* Compute queue doesn't support predication, no need to emit anything here. */
|
||||
}
|
||||
}
|
||||
|
||||
|
|
|
|||
Loading…
Add table
Reference in a new issue