diff --git a/src/freedreno/vulkan/tu_cmd_buffer.cc b/src/freedreno/vulkan/tu_cmd_buffer.cc
index be60b5633f3..a870beecc57 100644
--- a/src/freedreno/vulkan/tu_cmd_buffer.cc
+++ b/src/freedreno/vulkan/tu_cmd_buffer.cc
@@ -1430,6 +1430,76 @@ tu6_init_static_regs(struct tu_device *dev, struct tu_cs *cs)
    }
 }
 
+/* Set always-identical a7xx GMEM registers (tile begin and bin preamble) */
+static void
+tu7_emit_tile_render_begin_regs(struct tu_cs *cs)
+{
+   tu_cs_emit_regs(cs,
+                   A7XX_RB_UNKNOWN_8812(0x0));
+   tu_cs_emit_regs(cs,
+                   A7XX_RB_UNKNOWN_8E06(0x0));
+
+   tu_cs_emit_regs(cs, A7XX_GRAS_UNKNOWN_8007(0x0));
+
+   tu_cs_emit_regs(cs, A6XX_GRAS_UNKNOWN_8110(0x2));
+   tu_cs_emit_regs(cs, A7XX_RB_UNKNOWN_8E09(0x4));
+
+   tu_cs_emit_regs(cs, A7XX_RB_BLIT_CLEAR_MODE(.clear_mode = CLEAR_MODE_GMEM));
+}
+
+/* Emit the bin restore preamble, which runs in between bins when L1
+ * preemption with skipsaverestore happens and we switch back to this context.
+ * It re-emits the static registers normally programmed at cmdbuf start, the
+ * CCU state normally programmed before rendering the bins (not saved by the
+ * CP since it is the same for all GMEM render passes), and reloads VSC_STATE.
+ * NOTE(review): "template" below has no parameter list although CHIP is used;
+ * it looks lost in extraction -- confirm against the upstream source. */
+template
+static void
+tu_emit_bin_preamble(struct tu_device *dev, struct tu_cs *cs)
+{
+   struct tu_physical_device *phys_dev = dev->physical_device;
+
+   tu6_init_static_regs(dev, cs);
+   emit_rb_ccu_cntl(cs, dev, true);
+
+   if (CHIP == A6XX) {
+      tu_cs_emit_regs(cs,
+                      A6XX_PC_POWER_CNTL(phys_dev->info->a6xx.magic.PC_POWER_CNTL));
+      /* NOTE(review): VFD reuses the PC_POWER_CNTL magic -- confirm intended. */
+      tu_cs_emit_regs(cs,
+                      A6XX_VFD_POWER_CNTL(phys_dev->info->a6xx.magic.PC_POWER_CNTL));
+   }
+
+   if (CHIP == A7XX) {
+      tu7_emit_tile_render_begin_regs(cs);
+   }
+
+   /* TODO use CP_MEM_TO_SCRATCH_MEM on a7xx. The VSC scratch mem should be
+    * automatically saved, unlike GPU registers, so we wouldn't have to
+    * manually restore this state.
+    */
+   tu_cs_emit_pkt7(cs, CP_MEM_TO_REG, 3);
+   tu_cs_emit(cs, CP_MEM_TO_REG_0_REG(REG_A6XX_VSC_STATE(0)) |
+                  CP_MEM_TO_REG_0_CNT(32));
+   tu_cs_emit_qw(cs, dev->global_bo->iova + gb_offset(vsc_state));
+}
+/* Record the bin preamble at device creation; kept in bin_preamble_entry. */
+VkResult
+tu_init_bin_preamble(struct tu_device *device)
+{
+   struct tu_cs preamble_cs;
+   VkResult result = tu_cs_begin_sub_stream(&device->sub_cs, 256, &preamble_cs);
+   if (result != VK_SUCCESS)
+      return vk_startup_errorf(device->instance, result, "bin restore");
+   /* TU_CALLX presumably selects the CHIP instantiation for this device. */
+   TU_CALLX(device, tu_emit_bin_preamble)(device, &preamble_cs);
+
+   device->bin_preamble_entry = tu_cs_end_sub_stream(&device->sub_cs, &preamble_cs);
+
+   return VK_SUCCESS;
+}
+
 template
 static void
 tu6_init_hw(struct tu_cmd_buffer *cmd, struct tu_cs *cs)
@@ -1490,6 +1560,21 @@ tu6_init_hw(struct tu_cmd_buffer *cmd, struct tu_cs *cs)
       tu_cs_emit_ib(cs, &dev->cmdbuf_start_a725_quirk_entry);
    }
 
+   tu_cs_emit_pkt7(cs, CP_SET_AMBLE, 3);
+   tu_cs_emit_qw(cs, cmd->device->bin_preamble_entry.bo->iova +
+                     cmd->device->bin_preamble_entry.offset);
+   tu_cs_emit(cs, CP_SET_AMBLE_2_DWORDS(cmd->device->bin_preamble_entry.size /
+                                        sizeof(uint32_t)) |
+                  CP_SET_AMBLE_2_TYPE(BIN_PREAMBLE_AMBLE_TYPE));
+   /* The other amble types get address 0 and no size (presumably unused). */
+   tu_cs_emit_pkt7(cs, CP_SET_AMBLE, 3);
+   tu_cs_emit_qw(cs, 0);
+   tu_cs_emit(cs, CP_SET_AMBLE_2_TYPE(PREAMBLE_AMBLE_TYPE));
+
+   tu_cs_emit_pkt7(cs, CP_SET_AMBLE, 3);
+   tu_cs_emit_qw(cs, 0);
+   tu_cs_emit(cs, CP_SET_AMBLE_2_TYPE(POSTAMBLE_AMBLE_TYPE));
+
    tu_cs_sanity_check(cs);
 }
 
@@ -2013,17 +2098,7 @@ tu6_tile_render_begin(struct tu_cmd_buffer *cmd, struct tu_cs *cs,
    tu_cs_emit(cs, 0x0);
 
    if (CHIP >= A7XX) {
-      tu_cs_emit_regs(cs,
-                      A7XX_RB_UNKNOWN_8812(0x0));
-      tu_cs_emit_regs(cs,
-                      A7XX_RB_UNKNOWN_8E06(0x0));
-
-      tu_cs_emit_regs(cs, A7XX_GRAS_UNKNOWN_8007(0x0));
-
-      tu_cs_emit_regs(cs, A6XX_GRAS_UNKNOWN_8110(0x2));
-      tu_cs_emit_regs(cs, A7XX_RB_UNKNOWN_8E09(0x4));
-
-      tu_cs_emit_regs(cs, A7XX_RB_BLIT_CLEAR_MODE(.clear_mode = CLEAR_MODE_GMEM));
+      tu7_emit_tile_render_begin_regs(cs);
    }
 
    tu_emit_cache_flush_ccu(cmd, cs, TU_CMD_CCU_GMEM);
@@ -2071,6 +2146,16 @@ tu6_tile_render_begin(struct tu_cmd_buffer *cmd, struct tu_cs *cs,
       }
    }
 
+   if (tiling->binning_possible) {
+      /* Save the 32 VSC_STATE registers to the global vsc_state buffer so
+       * the bin preamble can restore them on skipsaverestore preemption.
+       */
+      tu_cs_emit_pkt7(cs, CP_REG_TO_MEM, 3);
+      tu_cs_emit(cs, CP_REG_TO_MEM_0_REG(REG_A6XX_VSC_STATE_REG(0)) |
+                     CP_REG_TO_MEM_0_CNT(32));
+      tu_cs_emit_qw(cs, global_iova(cmd, vsc_state));
+   }
+
    tu_autotune_begin_renderpass(cmd, cs, autotune_result);
 
    tu_cs_sanity_check(cs);
diff --git a/src/freedreno/vulkan/tu_cmd_buffer.h b/src/freedreno/vulkan/tu_cmd_buffer.h
index 1c35d077ba7..8d5b894d82a 100644
--- a/src/freedreno/vulkan/tu_cmd_buffer.h
+++ b/src/freedreno/vulkan/tu_cmd_buffer.h
@@ -771,4 +771,6 @@ _tu_create_fdm_bin_patchpoint(struct tu_cmd_buffer *cmd,
 #define tu_create_fdm_bin_patchpoint(cmd, cs, size, apply, state) \
    _tu_create_fdm_bin_patchpoint(cmd, cs, size, apply, &state, sizeof(state))
 
+VkResult tu_init_bin_preamble(struct tu_device *device);
+
 #endif /* TU_CMD_BUFFER_H */
diff --git a/src/freedreno/vulkan/tu_device.cc b/src/freedreno/vulkan/tu_device.cc
index 713977e5815..4a4dbbee420 100644
--- a/src/freedreno/vulkan/tu_device.cc
+++ b/src/freedreno/vulkan/tu_device.cc
@@ -2503,6 +2503,10 @@ tu_CreateDevice(VkPhysicalDevice physicalDevice,
       }
    }
 
+   result = tu_init_bin_preamble(device);
+   if (result != VK_SUCCESS)
+      goto fail_bin_preamble;
+
    if (physical_device->info->a7xx.cmdbuf_start_a725_quirk) {
       result = tu_init_cmdbuf_start_a725_quirk(device);
       if (result != VK_SUCCESS)
@@ -2596,6 +2600,7 @@ tu_CreateDevice(VkPhysicalDevice physicalDevice,
 
 fail_timeline_cond:
 fail_a725_workaround:
+fail_bin_preamble:
 fail_prepare_perfcntrs_pass_cs:
    free(device->perfcntrs_pass_cs_entries);
 fail_perfcntrs_pass_entries_alloc:
diff --git a/src/freedreno/vulkan/tu_device.h b/src/freedreno/vulkan/tu_device.h
index 16540cdb6f1..afdf1f3576f 100644
--- a/src/freedreno/vulkan/tu_device.h
+++ b/src/freedreno/vulkan/tu_device.h
@@ -235,6 +235,8 @@ struct tu6_global
 
    alignas(16) uint32_t cs_indirect_xyz[12];
 
+   uint32_t vsc_state[32]; /* VSC_STATE copy reloaded by the bin preamble */
+
    volatile uint32_t vtx_stats_query_not_running;
 
    /* To know when renderpass stats for autotune are valid */
@@ -391,6 +393,8 @@ struct tu_device
 
    struct tu_cs_entry cmdbuf_start_a725_quirk_entry;
 
+   struct tu_cs_entry bin_preamble_entry; /* set by tu_init_bin_preamble() */
+
    struct util_dynarray dynamic_rendering_pending;
    VkCommandPool dynamic_rendering_pool;
    uint32_t dynamic_rendering_fence;