diff --git a/.pick_status.json b/.pick_status.json index ce99889dd66..1e3f2b20d2c 100644 --- a/.pick_status.json +++ b/.pick_status.json @@ -2154,7 +2154,7 @@ "description": "radv: fix conditional rendering on compute queue on GFX6", "nominated": true, "nomination_type": 0, - "resolution": 0, + "resolution": 1, "main_sha": null, "because_sha": null, "notes": null diff --git a/src/amd/vulkan/radv_cmd_buffer.c b/src/amd/vulkan/radv_cmd_buffer.c index 547ea072f75..9bd9b048b26 100644 --- a/src/amd/vulkan/radv_cmd_buffer.c +++ b/src/amd/vulkan/radv_cmd_buffer.c @@ -5758,7 +5758,7 @@ radv_BeginCommandBuffer(VkCommandBuffer commandBuffer, const VkCommandBufferBegi cmd_buffer->state.dirty |= RADV_CMD_DIRTY_DYNAMIC_ALL | RADV_CMD_DIRTY_GUARDBAND | RADV_CMD_DIRTY_OCCLUSION_QUERY | RADV_CMD_DIRTY_DB_SHADER_CONTROL; - if (cmd_buffer->device->physical_device->rad_info.gfx_level >= GFX7) { + if (cmd_buffer->qf == RADV_QUEUE_COMPUTE || cmd_buffer->device->vk.enabled_features.taskShader) { uint32_t pred_value = 0; uint32_t pred_offset; if (!radv_cmd_buffer_upload_data(cmd_buffer, 4, &pred_value, &pred_offset)) @@ -7937,8 +7937,8 @@ radv_emit_view_index(struct radv_cmd_buffer *cmd_buffer, unsigned index) * space in the upload BO and emit some packets to invert the condition. */ static void -radv_cs_emit_compute_predication(struct radv_cmd_state *state, struct radeon_cmdbuf *cs, uint64_t inv_va, - bool *inv_emitted, unsigned dwords) +radv_cs_emit_compute_predication(const struct radv_device *device, struct radv_cmd_state *state, + struct radeon_cmdbuf *cs, uint64_t inv_va, bool *inv_emitted, unsigned dwords) { if (!state->predicating) return; @@ -7948,28 +7948,37 @@ radv_cs_emit_compute_predication(struct radv_cmd_state *state, struct radeon_cmd if (!state->predication_type) { /* Invert the condition the first time it is needed. */ if (!*inv_emitted) { + const enum amd_gfx_level gfx_level = device->physical_device->rad_info.gfx_level; + *inv_emitted = true; /* Write 1 to the inverted predication VA. */ radeon_emit(cs, PKT3(PKT3_COPY_DATA, 4, 0)); - radeon_emit(cs, - COPY_DATA_SRC_SEL(COPY_DATA_IMM) | COPY_DATA_DST_SEL(COPY_DATA_DST_MEM) | COPY_DATA_WR_CONFIRM); + radeon_emit(cs, COPY_DATA_SRC_SEL(COPY_DATA_IMM) | COPY_DATA_DST_SEL(COPY_DATA_DST_MEM) | + COPY_DATA_WR_CONFIRM | (gfx_level == GFX6 ? COPY_DATA_ENGINE_PFP : 0)); radeon_emit(cs, 1); radeon_emit(cs, 0); radeon_emit(cs, inv_va); radeon_emit(cs, inv_va >> 32); /* If the API predication VA == 0, skip next command. */ - radeon_emit(cs, PKT3(PKT3_COND_EXEC, 3, 0)); - radeon_emit(cs, va); - radeon_emit(cs, va >> 32); - radeon_emit(cs, 0); - radeon_emit(cs, 6); /* 1x COPY_DATA size */ + if (device->physical_device->rad_info.gfx_level >= GFX7) { + radeon_emit(cs, PKT3(PKT3_COND_EXEC, 3, 0)); + radeon_emit(cs, va); + radeon_emit(cs, va >> 32); + radeon_emit(cs, 0); + radeon_emit(cs, 6); /* 1x COPY_DATA size */ + } else { + radeon_emit(cs, PKT3(PKT3_COND_EXEC, 2, 0)); + radeon_emit(cs, va); + radeon_emit(cs, va >> 32); + radeon_emit(cs, 6); /* 1x COPY_DATA size */ + } /* Write 0 to the new predication VA (when the API condition != 0) */ radeon_emit(cs, PKT3(PKT3_COPY_DATA, 4, 0)); - radeon_emit(cs, - COPY_DATA_SRC_SEL(COPY_DATA_IMM) | COPY_DATA_DST_SEL(COPY_DATA_DST_MEM) | COPY_DATA_WR_CONFIRM); + radeon_emit(cs, COPY_DATA_SRC_SEL(COPY_DATA_IMM) | COPY_DATA_DST_SEL(COPY_DATA_DST_MEM) | + COPY_DATA_WR_CONFIRM | (gfx_level == GFX6 ? COPY_DATA_ENGINE_PFP : 0)); radeon_emit(cs, 0); radeon_emit(cs, 0); radeon_emit(cs, inv_va); @@ -7979,11 +7988,18 @@ radv_cs_emit_compute_predication(struct radv_cmd_state *state, struct radeon_cmd va = inv_va; } - radeon_emit(cs, PKT3(PKT3_COND_EXEC, 3, 0)); - radeon_emit(cs, va); - radeon_emit(cs, va >> 32); - radeon_emit(cs, 0); /* Cache policy */ - radeon_emit(cs, dwords); /* Size of the predicated packet(s) in DWORDs. */ + if (device->physical_device->rad_info.gfx_level >= GFX7) { + radeon_emit(cs, PKT3(PKT3_COND_EXEC, 3, 0)); + radeon_emit(cs, va); + radeon_emit(cs, va >> 32); + radeon_emit(cs, 0); /* Cache policy */ + radeon_emit(cs, dwords); /* Size of the predicated packet(s) in DWORDs. */ + } else { + radeon_emit(cs, PKT3(PKT3_COND_EXEC, 2, 0)); + radeon_emit(cs, va); + radeon_emit(cs, va >> 32); + radeon_emit(cs, dwords); /* Size of the predicated packet(s) in DWORDs. */ + } } static void @@ -8561,8 +8577,9 @@ radv_emit_direct_taskmesh_draw_packets(struct radv_cmd_buffer *cmd_buffer, uint3 ace_predication_size += num_views * 3; /* SET_SH_REG size (view index SGPR) */ radv_emit_userdata_task(cmd_buffer, x, y, z, 0); - radv_cs_emit_compute_predication(&cmd_buffer->state, cmd_buffer->gang.cs, cmd_buffer->mec_inv_pred_va, - &cmd_buffer->mec_inv_pred_emitted, ace_predication_size); + radv_cs_emit_compute_predication(cmd_buffer->device, &cmd_buffer->state, cmd_buffer->gang.cs, + cmd_buffer->mec_inv_pred_va, &cmd_buffer->mec_inv_pred_emitted, + ace_predication_size); if (!view_mask) { radv_cs_emit_dispatch_taskmesh_direct_ace_packet(cmd_buffer, x, y, z); @@ -8631,8 +8648,9 @@ radv_emit_indirect_taskmesh_draw_packets(struct radv_cmd_buffer *cmd_buffer, con } radv_cs_add_buffer(ws, cmd_buffer->gang.cs, info->indirect->bo); - radv_cs_emit_compute_predication(&cmd_buffer->state, cmd_buffer->gang.cs, cmd_buffer->mec_inv_pred_va, - &cmd_buffer->mec_inv_pred_emitted, ace_predication_size); + radv_cs_emit_compute_predication(cmd_buffer->device, &cmd_buffer->state, cmd_buffer->gang.cs, + cmd_buffer->mec_inv_pred_va, &cmd_buffer->mec_inv_pred_emitted, + ace_predication_size); if (workaround_cond_va) { radeon_emit(ace_cs, PKT3(PKT3_COND_EXEC, 3, 0)); @@ -9674,7 +9692,7 @@ radv_emit_dispatch_packets(struct radv_cmd_buffer *cmd_buffer, const struct radv const unsigned ace_predication_size = 4 /* DISPATCH_INDIRECT */ + (needs_align32_workaround ? 6 * 3 /* 3x COPY_DATA */ : 0); - radv_cs_emit_compute_predication(&cmd_buffer->state, cs, cmd_buffer->mec_inv_pred_va, + radv_cs_emit_compute_predication(cmd_buffer->device, &cmd_buffer->state, cs, cmd_buffer->mec_inv_pred_va, &cmd_buffer->mec_inv_pred_emitted, ace_predication_size); if (needs_align32_workaround) { @@ -9779,8 +9797,8 @@ radv_emit_dispatch_packets(struct radv_cmd_buffer *cmd_buffer, const struct radv dispatch_initiator |= S_00B800_FORCE_START_AT_000(1); } - if (radv_cmd_buffer_uses_mec(cmd_buffer)) { - radv_cs_emit_compute_predication(&cmd_buffer->state, cs, cmd_buffer->mec_inv_pred_va, + if (cmd_buffer->qf == RADV_QUEUE_COMPUTE) { + radv_cs_emit_compute_predication(cmd_buffer->device, &cmd_buffer->state, cs, cmd_buffer->mec_inv_pred_va, &cmd_buffer->mec_inv_pred_emitted, 5 /* DISPATCH_DIRECT size */); predicating = false; } @@ -10790,69 +10808,71 @@ radv_begin_conditional_rendering(struct radv_cmd_buffer *cmd_buffer, uint64_t va radv_emit_cache_flush(cmd_buffer); - if (cmd_buffer->qf == RADV_QUEUE_GENERAL && !cmd_buffer->device->physical_device->rad_info.has_32bit_predication) { - uint64_t pred_value = 0, pred_va; - unsigned pred_offset; + if (cmd_buffer->qf == RADV_QUEUE_GENERAL) { + if (!cmd_buffer->device->physical_device->rad_info.has_32bit_predication) { + uint64_t pred_value = 0, pred_va; + unsigned pred_offset; - /* From the Vulkan spec 1.1.107: - * - * "If the 32-bit value at offset in buffer memory is zero, - * then the rendering commands are discarded, otherwise they - * are executed as normal. If the value of the predicate in - * buffer memory changes while conditional rendering is - * active, the rendering commands may be discarded in an - * implementation-dependent way. Some implementations may - * latch the value of the predicate upon beginning conditional - * rendering while others may read it before every rendering - * command." - * - * But, the AMD hardware treats the predicate as a 64-bit - * value which means we need a workaround in the driver. - * Luckily, it's not required to support if the value changes - * when predication is active. - * - * The workaround is as follows: - * 1) allocate a 64-value in the upload BO and initialize it - * to 0 - * 2) copy the 32-bit predicate value to the upload BO - * 3) use the new allocated VA address for predication - * - * Based on the conditionalrender demo, it's faster to do the - * COPY_DATA in ME (+ sync PFP) instead of PFP. - */ - radv_cmd_buffer_upload_data(cmd_buffer, 8, &pred_value, &pred_offset); + /* From the Vulkan spec 1.1.107: + * + * "If the 32-bit value at offset in buffer memory is zero, + * then the rendering commands are discarded, otherwise they + * are executed as normal. If the value of the predicate in + * buffer memory changes while conditional rendering is + * active, the rendering commands may be discarded in an + * implementation-dependent way. Some implementations may + * latch the value of the predicate upon beginning conditional + * rendering while others may read it before every rendering + * command." + * + * But, the AMD hardware treats the predicate as a 64-bit + * value which means we need a workaround in the driver. + * Luckily, it's not required to support if the value changes + * when predication is active. + * + * The workaround is as follows: + * 1) allocate a 64-value in the upload BO and initialize it + * to 0 + * 2) copy the 32-bit predicate value to the upload BO + * 3) use the new allocated VA address for predication + * + * Based on the conditionalrender demo, it's faster to do the + * COPY_DATA in ME (+ sync PFP) instead of PFP. + */ + radv_cmd_buffer_upload_data(cmd_buffer, 8, &pred_value, &pred_offset); - pred_va = radv_buffer_get_va(cmd_buffer->upload.upload_bo) + pred_offset; + pred_va = radv_buffer_get_va(cmd_buffer->upload.upload_bo) + pred_offset; - radeon_check_space(cmd_buffer->device->ws, cmd_buffer->cs, 8); + radeon_check_space(cmd_buffer->device->ws, cmd_buffer->cs, 8); - radeon_emit(cs, PKT3(PKT3_COPY_DATA, 4, 0)); - radeon_emit(cs, - COPY_DATA_SRC_SEL(COPY_DATA_SRC_MEM) | COPY_DATA_DST_SEL(COPY_DATA_DST_MEM) | COPY_DATA_WR_CONFIRM); - radeon_emit(cs, va); - radeon_emit(cs, va >> 32); - radeon_emit(cs, pred_va); - radeon_emit(cs, pred_va >> 32); + radeon_emit(cs, PKT3(PKT3_COPY_DATA, 4, 0)); + radeon_emit( + cs, COPY_DATA_SRC_SEL(COPY_DATA_SRC_MEM) | COPY_DATA_DST_SEL(COPY_DATA_DST_MEM) | COPY_DATA_WR_CONFIRM); + radeon_emit(cs, va); + radeon_emit(cs, va >> 32); + radeon_emit(cs, pred_va); + radeon_emit(cs, pred_va >> 32); - radeon_emit(cs, PKT3(PKT3_PFP_SYNC_ME, 0, 0)); - radeon_emit(cs, 0); + radeon_emit(cs, PKT3(PKT3_PFP_SYNC_ME, 0, 0)); + radeon_emit(cs, 0); - va = pred_va; - pred_op = PREDICATION_OP_BOOL64; - } + va = pred_va; + pred_op = PREDICATION_OP_BOOL64; + } - /* MEC doesn't support predication, we emulate it elsewhere. */ - if (!radv_cmd_buffer_uses_mec(cmd_buffer)) { radv_emit_set_predication_state(cmd_buffer, draw_visible, pred_op, va); + } else { + /* Compute queue doesn't support predication and it's emulated elsewhere. */ } } void radv_end_conditional_rendering(struct radv_cmd_buffer *cmd_buffer) { - /* MEC doesn't support predication, no need to emit anything here. */ - if (!radv_cmd_buffer_uses_mec(cmd_buffer)) { + if (cmd_buffer->qf == RADV_QUEUE_GENERAL) { radv_emit_set_predication_state(cmd_buffer, false, 0, 0); + } else { + /* Compute queue doesn't support predication, no need to emit anything here. */ } }