diff --git a/src/intel/vulkan/anv_astc_emu.c b/src/intel/vulkan/anv_astc_emu.c
index e447db303c0..83c6cc660e7 100644
--- a/src/intel/vulkan/anv_astc_emu.c
+++ b/src/intel/vulkan/anv_astc_emu.c
@@ -5,6 +5,8 @@
 
 #include "anv_private.h"
 
+#include "vk_common_entrypoints.h"
+
 #include "compiler/nir/nir_builder.h"
 
 static void
@@ -293,8 +295,9 @@ astc_emu_flush_denorm_slice(struct anv_cmd_buffer *cmd_buffer,
                                           set_writes);
    VkDescriptorSet set = anv_descriptor_set_to_handle(&push_set.set);
 
-   anv_CmdBindPipeline(cmd_buffer_, VK_PIPELINE_BIND_POINT_COMPUTE,
-                       astc_emu->pipeline);
+   vk_common_CmdBindPipeline(cmd_buffer_,
+                             VK_PIPELINE_BIND_POINT_COMPUTE,
+                             astc_emu->pipeline);
 
    VkPushConstantsInfoKHR push_info = {
       .sType = VK_STRUCTURE_TYPE_PUSH_CONSTANTS_INFO_KHR,
@@ -351,7 +354,9 @@ astc_emu_decompress_slice(struct anv_cmd_buffer *cmd_buffer,
       return;
    }
 
-   anv_CmdBindPipeline(cmd_buffer_, VK_PIPELINE_BIND_POINT_COMPUTE, pipeline);
+   vk_common_CmdBindPipeline(cmd_buffer_,
+                             VK_PIPELINE_BIND_POINT_COMPUTE,
+                             pipeline);
 
    struct vk_texcompress_astc_write_descriptor_set writes;
    vk_texcompress_astc_fill_write_descriptor_sets(astc_emu->texcompress,
diff --git a/src/intel/vulkan/anv_cmd_buffer.c b/src/intel/vulkan/anv_cmd_buffer.c
index 945ac7686cd..e22612c35a5 100644
--- a/src/intel/vulkan/anv_cmd_buffer.c
+++ b/src/intel/vulkan/anv_cmd_buffer.c
@@ -30,6 +30,7 @@
 
 #include "anv_private.h"
 #include "anv_measure.h"
+#include "vk_common_entrypoints.h"
 #include "vk_util.h"
 
 /** \file anv_cmd_buffer.c
@@ -435,17 +436,16 @@ set_dirty_for_bind_map(struct anv_cmd_buffer *cmd_buffer,
 }
 
 static void
-anv_cmd_buffer_set_ray_query_buffer(struct anv_cmd_buffer *cmd_buffer,
-                                    struct anv_cmd_pipeline_state *pipeline_state,
-                                    struct anv_pipeline *pipeline,
-                                    VkShaderStageFlags stages)
+anv_cmd_buffer_set_rt_query_buffer(struct anv_cmd_buffer *cmd_buffer,
+                                   struct anv_cmd_pipeline_state *pipeline_state,
+                                   uint32_t ray_queries,
+                                   VkShaderStageFlags stages)
 {
    struct anv_device *device = cmd_buffer->device;
    uint8_t idx = anv_get_ray_query_bo_index(cmd_buffer);
 
    uint64_t ray_shadow_size =
-      align64(brw_rt_ray_queries_shadow_stacks_size(device->info,
-                                                    pipeline->ray_queries),
+      align64(brw_rt_ray_queries_shadow_stacks_size(device->info, ray_queries),
               4096);
    if (ray_shadow_size > 0 &&
        (!cmd_buffer->state.ray_query_shadow_bo ||
@@ -497,112 +497,6 @@ anv_cmd_buffer_set_ray_query_buffer(struct anv_cmd_buffer *cmd_buffer,
    pipeline_state->push_constants_data_dirty = true;
 }
 
-/**
- * This function compute changes between 2 pipelines and flags the dirty HW
- * state appropriately.
- */
-static void
-anv_cmd_buffer_flush_pipeline_hw_state(struct anv_cmd_buffer *cmd_buffer,
-                                       struct anv_graphics_pipeline *old_pipeline,
-                                       struct anv_graphics_pipeline *new_pipeline)
-{
-   struct anv_cmd_graphics_state *gfx = &cmd_buffer->state.gfx;
-   struct anv_gfx_dynamic_state *hw_state = &gfx->dyn_state;
-
-#define diff_fix_state(bit, name)                                       \
-   do {                                                                 \
-      /* Fixed states should always have matching sizes */              \
-      assert(old_pipeline == NULL ||                                    \
-             old_pipeline->name.len == new_pipeline->name.len);         \
-      /* Don't bother memcmp if the state is already dirty */           \
-      if (!BITSET_TEST(hw_state->pack_dirty, ANV_GFX_STATE_##bit) &&    \
-          (old_pipeline == NULL ||                                      \
-           memcmp(&old_pipeline->batch_data[old_pipeline->name.offset], \
-                  &new_pipeline->batch_data[new_pipeline->name.offset], \
-                  4 * new_pipeline->name.len) != 0))                    \
-         BITSET_SET(hw_state->pack_dirty, ANV_GFX_STATE_##bit);         \
-   } while (0)
-#define diff_var_state(bit, name)                                       \
-   do {                                                                 \
-      /* Don't bother memcmp if the state is already dirty */           \
-      /* Also if the new state is empty, avoid marking dirty */         \
-      if (!BITSET_TEST(hw_state->pack_dirty, ANV_GFX_STATE_##bit) &&    \
-          new_pipeline->name.len != 0 &&                                \
-          (old_pipeline == NULL ||                                      \
-           old_pipeline->name.len != new_pipeline->name.len ||          \
-           memcmp(&old_pipeline->batch_data[old_pipeline->name.offset], \
-                  &new_pipeline->batch_data[new_pipeline->name.offset], \
-                  4 * new_pipeline->name.len) != 0))                    \
-         BITSET_SET(hw_state->pack_dirty, ANV_GFX_STATE_##bit);         \
-   } while (0)
-#define assert_identical(bit, name)                                     \
-   do {                                                                 \
-      /* Fixed states should always have matching sizes */              \
-      assert(old_pipeline == NULL ||                                    \
-             old_pipeline->name.len == new_pipeline->name.len);         \
-      assert(old_pipeline == NULL ||                                    \
-             memcmp(&old_pipeline->batch_data[old_pipeline->name.offset], \
-                    &new_pipeline->batch_data[new_pipeline->name.offset], \
-                    4 * new_pipeline->name.len) == 0);                  \
-   } while (0)
-#define assert_empty(name) assert(new_pipeline->name.len == 0)
-
-   /* Compare all states, including partial packed ones, the dynamic part is
-    * left at 0 but the static part could still change.
-    *
-    * We avoid comparing protected packets as all the fields but the scratch
-    * surface are identical. we just need to select the right one at emission.
-    */
-   diff_fix_state(VF_SGVS, final.vf_sgvs);
-   if (cmd_buffer->device->info->ver >= 11)
-      diff_fix_state(VF_SGVS_2, final.vf_sgvs_2);
-   diff_fix_state(VF_COMPONENT_PACKING, final.vf_component_packing);
-   diff_fix_state(VS, final.vs);
-   diff_fix_state(HS, final.hs);
-   diff_fix_state(DS, final.ds);
-
-   diff_fix_state(WM, partial.wm);
-   diff_fix_state(STREAMOUT, partial.so);
-   diff_fix_state(GS, partial.gs);
-   diff_fix_state(TE, partial.te);
-   diff_fix_state(PS, partial.ps);
-   diff_fix_state(PS_EXTRA, partial.ps_extra);
-
-   if (cmd_buffer->device->vk.enabled_extensions.EXT_mesh_shader) {
-      diff_fix_state(TASK_CONTROL, final.task_control);
-      diff_fix_state(TASK_SHADER, final.task_shader);
-      diff_fix_state(TASK_REDISTRIB, final.task_redistrib);
-      diff_fix_state(MESH_CONTROL, final.mesh_control);
-      diff_fix_state(MESH_SHADER, final.mesh_shader);
-      diff_fix_state(MESH_DISTRIB, final.mesh_distrib);
-      diff_fix_state(CLIP_MESH, final.clip_mesh);
-   } else {
-      assert_empty(final.task_control);
-      assert_empty(final.task_shader);
-      assert_empty(final.task_redistrib);
-      assert_empty(final.mesh_control);
-      assert_empty(final.mesh_shader);
-      assert_empty(final.mesh_distrib);
-      assert_empty(final.clip_mesh);
-   }
-
-   /* States that can vary in length */
-   diff_var_state(VF_SGVS_INSTANCING, final.vf_sgvs_instancing);
-   diff_var_state(SO_DECL_LIST, final.so_decl_list);
-
-#undef diff_fix_state
-#undef diff_var_state
-#undef assert_identical
-#undef assert_empty
-
-   /* We're not diffing the following :
-    * - anv_graphics_pipeline::vertex_input_data
-    * - anv_graphics_pipeline::final::vf_instancing
-    *
-    * since they are tracked by the runtime.
-    */
-}
-
 static enum anv_cmd_dirty_bits
 get_pipeline_dirty_stages(struct anv_device *device,
                           struct anv_graphics_pipeline *old_pipeline,
@@ -636,7 +530,7 @@ get_pipeline_dirty_stages(struct anv_device *device,
 
 static void
 update_push_descriptor_flags(struct anv_cmd_pipeline_state *state,
-                             struct anv_shader_bin **shaders,
+                             struct anv_shader ** const shaders,
                              uint32_t shader_count)
 {
    state->push_buffer_stages = 0;
@@ -646,7 +540,7 @@ update_push_descriptor_flags(struct anv_cmd_pipeline_state *state,
       if (shaders[i] == NULL)
          continue;
 
-      VkShaderStageFlags stage = mesa_to_vk_shader_stage(shaders[i]->stage);
+      VkShaderStageFlags stage = mesa_to_vk_shader_stage(shaders[i]->vk.stage);
 
       if (shaders[i]->push_desc_info.used_descriptors)
          state->push_descriptor_stages |= stage;
@@ -656,145 +550,6 @@ update_push_descriptor_flags(struct anv_cmd_pipeline_state *state,
    }
 }
 
-void anv_CmdBindPipeline(
-    VkCommandBuffer                             commandBuffer,
-    VkPipelineBindPoint                         pipelineBindPoint,
-    VkPipeline                                  _pipeline)
-{
-   ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
-   ANV_FROM_HANDLE(anv_pipeline, pipeline, _pipeline);
-   struct anv_cmd_pipeline_state *state;
-   VkShaderStageFlags stages = 0;
-
-   switch (pipelineBindPoint) {
-   case VK_PIPELINE_BIND_POINT_COMPUTE: {
-      if (cmd_buffer->state.compute.base.pipeline == pipeline)
-         return;
-
-      struct anv_compute_pipeline *compute_pipeline =
-         anv_pipeline_to_compute(pipeline);
-
-      cmd_buffer->state.compute.shader = compute_pipeline->cs;
-      cmd_buffer->state.compute.pipeline_dirty = true;
-
-      set_dirty_for_bind_map(cmd_buffer, MESA_SHADER_COMPUTE,
-                             &compute_pipeline->cs->bind_map);
-
-      state = &cmd_buffer->state.compute.base;
-      stages = VK_SHADER_STAGE_COMPUTE_BIT;
-
-      update_push_descriptor_flags(state, &compute_pipeline->cs, 1);
-      break;
-   }
-
-   case VK_PIPELINE_BIND_POINT_GRAPHICS: {
-      struct anv_graphics_pipeline *new_pipeline =
-         anv_pipeline_to_graphics(pipeline);
-
-      /* Apply the non dynamic state from the pipeline */
-      vk_cmd_set_dynamic_graphics_state(&cmd_buffer->vk,
-                                        &new_pipeline->dynamic_state);
-
-      if (cmd_buffer->state.gfx.base.pipeline == pipeline)
-         return;
-
-      struct anv_graphics_pipeline *old_pipeline =
-         cmd_buffer->state.gfx.base.pipeline == NULL ? NULL :
-         anv_pipeline_to_graphics(cmd_buffer->state.gfx.base.pipeline);
-
-      cmd_buffer->state.gfx.dirty |=
-         get_pipeline_dirty_stages(cmd_buffer->device,
-                                   old_pipeline, new_pipeline);
-
-      STATIC_ASSERT(sizeof(cmd_buffer->state.gfx.shaders) ==
-                    sizeof(new_pipeline->base.shaders));
-      memcpy(cmd_buffer->state.gfx.shaders,
-             new_pipeline->base.shaders,
-             sizeof(cmd_buffer->state.gfx.shaders));
-      cmd_buffer->state.gfx.active_stages = pipeline->active_stages;
-
-      anv_foreach_stage(stage, new_pipeline->base.base.active_stages) {
-         set_dirty_for_bind_map(cmd_buffer, stage,
-                                &new_pipeline->base.shaders[stage]->bind_map);
-      }
-
-      state = &cmd_buffer->state.gfx.base;
-      stages = new_pipeline->base.base.active_stages;
-
-      update_push_descriptor_flags(state,
-                                   new_pipeline->base.shaders,
-                                   ARRAY_SIZE(new_pipeline->base.shaders));
-
-      /* When the pipeline is using independent states and dynamic buffers,
-       * this will trigger an update of anv_push_constants::dynamic_base_index
-       * & anv_push_constants::dynamic_offsets.
-       */
-      struct anv_push_constants *push =
-         &cmd_buffer->state.gfx.base.push_constants;
-      struct anv_pipeline_sets_layout *layout = &new_pipeline->base.base.layout;
-      if (layout->independent_sets && layout->num_dynamic_buffers > 0) {
-         bool modified = false;
-         for (uint32_t s = 0; s < layout->num_sets; s++) {
-            if (layout->set_layouts[s] == NULL)
-               continue;
-
-            assert(layout->dynamic_offset_start[s] < MAX_DYNAMIC_BUFFERS);
-            if (layout->set_layouts[s]->vk.dynamic_descriptor_count > 0 &&
-                (push->desc_surface_offsets[s] & ANV_DESCRIPTOR_SET_DYNAMIC_INDEX_MASK) !=
-                layout->dynamic_offset_start[s]) {
-               push->desc_surface_offsets[s] &= ~ANV_DESCRIPTOR_SET_DYNAMIC_INDEX_MASK;
-               push->desc_surface_offsets[s] |= (layout->dynamic_offset_start[s] &
-                                                 ANV_DESCRIPTOR_SET_DYNAMIC_INDEX_MASK);
-               modified = true;
-            }
-         }
-         if (modified) {
-            cmd_buffer->state.push_constants_dirty |= stages;
-            state->push_constants_data_dirty = true;
-         }
-      }
-
-      cmd_buffer->state.gfx.vs_source_hash = new_pipeline->vs_source_hash;
-      cmd_buffer->state.gfx.fs_source_hash = new_pipeline->fs_source_hash;
-
-      cmd_buffer->state.gfx.instance_multiplier = new_pipeline->instance_multiplier;
-
-      anv_cmd_buffer_flush_pipeline_hw_state(cmd_buffer, old_pipeline, new_pipeline);
-      break;
-   }
-
-   case VK_PIPELINE_BIND_POINT_RAY_TRACING_KHR: {
-      if (cmd_buffer->state.rt.base.pipeline == pipeline)
-         return;
-
-      cmd_buffer->state.rt.pipeline_dirty = true;
-
-      struct anv_ray_tracing_pipeline *rt_pipeline =
-         anv_pipeline_to_ray_tracing(pipeline);
-      if (rt_pipeline->stack_size > 0) {
-         anv_CmdSetRayTracingPipelineStackSizeKHR(commandBuffer,
-                                                  rt_pipeline->stack_size);
-      }
-
-      state = &cmd_buffer->state.rt.base;
-
-      state->push_buffer_stages = pipeline->use_push_descriptor_buffer;
-      state->push_descriptor_stages = pipeline->use_push_descriptor_buffer;
-      state->push_descriptor_index = pipeline->layout.push_descriptor_set_index;
-      break;
-   }
-
-   default:
-      UNREACHABLE("invalid bind point");
-      break;
-   }
-
-   state->pipeline = pipeline;
-
-   if (pipeline->ray_queries > 0)
-      anv_cmd_buffer_set_ray_query_buffer(cmd_buffer, state, pipeline, stages);
-}
-
 static struct anv_cmd_pipeline_state *
 anv_cmd_buffer_get_pipeline_layout_state(struct anv_cmd_buffer *cmd_buffer,
                                          VkPipelineBindPoint bind_point,
@@ -1519,20 +1274,37 @@ void anv_CmdPushDescriptorSetWithTemplate2KHR(
                                       NULL, NULL);
 }
 
-void anv_CmdSetRayTracingPipelineStackSizeKHR(
-    VkCommandBuffer                             commandBuffer,
-    uint32_t                                    pipelineStackSize)
+void
+anv_cmd_buffer_set_rt_state(struct vk_command_buffer *vk_cmd_buffer,
+                            VkDeviceSize scratch_size,
+                            uint32_t ray_queries)
 {
-   ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
+   struct anv_cmd_buffer *cmd_buffer =
+      container_of(vk_cmd_buffer, struct anv_cmd_buffer, vk);
    struct anv_cmd_ray_tracing_state *rt = &cmd_buffer->state.rt;
+
+   rt->scratch_size = MAX2(rt->scratch_size, scratch_size);
+   if (ray_queries > 0) {
+      anv_cmd_buffer_set_rt_query_buffer(cmd_buffer, &rt->base, ray_queries,
+                                         ANV_RT_STAGE_BITS);
+   }
+}
+
+void
+anv_cmd_buffer_set_stack_size(struct vk_command_buffer *vk_cmd_buffer,
+                              VkDeviceSize stack_size)
+{
+   struct anv_cmd_buffer *cmd_buffer =
+      container_of(vk_cmd_buffer, struct anv_cmd_buffer, vk);
    struct anv_device *device = cmd_buffer->device;
+   struct anv_cmd_ray_tracing_state *rt = &cmd_buffer->state.rt;
 
    if (anv_batch_has_error(&cmd_buffer->batch))
      return;
 
    uint32_t stack_ids_per_dss = 2048; /* TODO */
 
-   unsigned stack_size_log2 = util_logbase2_ceil(pipelineStackSize);
+   unsigned stack_size_log2 = util_logbase2_ceil(stack_size);
    if (stack_size_log2 < 10)
       stack_size_log2 = 10;
 
@@ -1585,7 +1357,7 @@ anv_cmd_buffer_save_state(struct anv_cmd_buffer *cmd_buffer,
       &cmd_buffer->state.compute.base;
 
    if (state->flags & ANV_CMD_SAVED_STATE_COMPUTE_PIPELINE)
-      state->pipeline = pipe_state->pipeline;
+      state->shader = &cmd_buffer->state.compute.shader->vk;
 
    if (state->flags & ANV_CMD_SAVED_STATE_DESCRIPTOR_SET_0)
       state->descriptor_set[0] = pipe_state->descriptors[0];
@@ -1614,11 +1386,11 @@ anv_cmd_buffer_restore_state(struct anv_cmd_buffer *cmd_buffer,
    struct anv_cmd_pipeline_state *pipe_state =
       &cmd_buffer->state.compute.base;
 
    if (state->flags & ANV_CMD_SAVED_STATE_COMPUTE_PIPELINE) {
-      if (state->pipeline) {
-         anv_CmdBindPipeline(cmd_buffer_, bind_point,
-                             anv_pipeline_to_handle(state->pipeline));
+      if (state->shader) {
+         mesa_shader_stage stage = MESA_SHADER_COMPUTE;
+         anv_cmd_buffer_bind_shaders(&cmd_buffer->vk, 1, &stage, &state->shader);
       } else {
-         pipe_state->pipeline = NULL;
+         cmd_buffer->state.compute.shader = NULL;
       }
    }
 
@@ -1693,3 +1465,285 @@ anv_cmd_dispatch_unaligned(VkCommandBuffer commandBuffer,
    anv_genX(cmd_buffer->device->info, cmd_dispatch_unaligned)
       (commandBuffer, invocations_x, invocations_y, invocations_z);
 }
+
+static void
+bind_compute_shader(struct anv_cmd_buffer *cmd_buffer,
+                    struct anv_shader *shader)
+{
+   struct anv_cmd_compute_state *comp_state = &cmd_buffer->state.compute;
+
+   cmd_buffer->state.compute.shader = shader;
+   if (shader == NULL)
+      return;
+
+   cmd_buffer->state.compute.pipeline_dirty = true;
+   set_dirty_for_bind_map(cmd_buffer, MESA_SHADER_COMPUTE, &shader->bind_map);
+
+   update_push_descriptor_flags(&comp_state->base,
+                                &cmd_buffer->state.compute.shader, 1);
+
+   if (shader->vk.ray_queries > 0) {
+      assert(cmd_buffer->device->info->verx10 >= 125);
+      anv_cmd_buffer_set_rt_query_buffer(cmd_buffer, &comp_state->base,
+                                         shader->vk.ray_queries,
+                                         VK_SHADER_STAGE_COMPUTE_BIT);
+   }
+}
+
+static void
+bind_graphics_shaders(struct anv_cmd_buffer *cmd_buffer,
+                      struct anv_shader *new_shaders[ANV_GRAPHICS_SHADER_STAGE_COUNT])
+{
+   struct anv_cmd_graphics_state *gfx = &cmd_buffer->state.gfx;
+   struct anv_gfx_dynamic_state *hw_state =
+      &gfx->dyn_state;
+   uint32_t ray_queries = 0;
+
+   static const enum anv_cmd_dirty_bits mesa_stage_to_dirty_bit[] = {
+      [MESA_SHADER_VERTEX] = ANV_CMD_DIRTY_VS,
+      [MESA_SHADER_TESS_CTRL] = ANV_CMD_DIRTY_HS,
+      [MESA_SHADER_TESS_EVAL] = ANV_CMD_DIRTY_DS,
+      [MESA_SHADER_GEOMETRY] = ANV_CMD_DIRTY_GS,
+      [MESA_SHADER_TASK] = ANV_CMD_DIRTY_TASK,
+      [MESA_SHADER_MESH] = ANV_CMD_DIRTY_MESH,
+      [MESA_SHADER_FRAGMENT] = ANV_CMD_DIRTY_PS,
+   };
+
+   gfx->active_stages = 0;
+   gfx->instance_multiplier = 0;
+
+   mesa_shader_stage new_streamout_stage = -1;
+   /* Find the last pre-rasterization stage */
+   for (uint32_t i = 0; i < ANV_GRAPHICS_SHADER_STAGE_COUNT; i++) {
+      mesa_shader_stage s = ANV_GRAPHICS_SHADER_STAGE_COUNT - i - 1;
+      if (new_shaders[s] == NULL)
+         continue;
+
+      assert(gfx->instance_multiplier == 0 ||
+             gfx->instance_multiplier == new_shaders[s]->instance_multiplier);
+      gfx->active_stages |= mesa_to_vk_shader_stage(s);
+      gfx->instance_multiplier = new_shaders[s]->instance_multiplier;
+
+      if (s == MESA_SHADER_FRAGMENT ||
+          s == MESA_SHADER_TASK ||
+          s == MESA_SHADER_TESS_CTRL)
+         continue;
+
+      new_streamout_stage = MAX2(new_streamout_stage, s);
+   }
+
+   for (uint32_t s = 0; s < ANV_GRAPHICS_SHADER_STAGE_COUNT; s++) {
+      struct anv_shader *shader = new_shaders[s];
+
+      if (shader != NULL) {
+         gfx->active_stages |= mesa_to_vk_shader_stage(s);
+
+         ray_queries = MAX2(ray_queries, shader->vk.ray_queries);
+         if (gfx->shaders[s] != shader)
+            set_dirty_for_bind_map(cmd_buffer, s, &shader->bind_map);
+      }
+
+      if (gfx->shaders[s] != shader)
+         gfx->dirty |= mesa_stage_to_dirty_bit[s];
+      else
+         continue;
+
+#define diff_fix_state(bit, name)                                       \
+   do {                                                                 \
+      /* Fixed states should always have matching sizes */              \
+      assert(gfx->shaders[s] == NULL ||                                 \
+             gfx->shaders[s]->name.len == shader->name.len);            \
+      /* Don't bother memcmp if the state is already dirty */           \
+      if (!BITSET_TEST(hw_state->pack_dirty,                            \
+                       ANV_GFX_STATE_##bit) &&                          \
+          (gfx->shaders[s] == NULL ||                                   \
+           memcmp(&gfx->shaders[s]->cmd_data[                           \
+                     gfx->shaders[s]->name.offset],                     \
+                  &shader->cmd_data[                                    \
+                     shader->name.offset],                              \
+                  4 * shader->name.len) != 0))                          \
+         BITSET_SET(hw_state->pack_dirty, ANV_GFX_STATE_##bit);         \
+   } while (0)
+#define diff_var_state(bit, name)                                       \
+   do {                                                                 \
+      /* Don't bother memcmp if the state is already dirty */           \
+      /* Also if the new state is empty, avoid marking dirty */         \
+      if (!BITSET_TEST(hw_state->pack_dirty,                            \
+                       ANV_GFX_STATE_##bit) &&                          \
+          shader->name.len != 0 &&                                      \
+          (gfx->shaders[s] == NULL ||                                   \
+           gfx->shaders[s]->name.len != shader->name.len ||             \
+           memcmp(&gfx->shaders[s]->cmd_data[                           \
+                     gfx->shaders[s]->name.offset],                     \
+                  &shader->cmd_data[shader->name.offset],               \
+                  4 * shader->name.len) != 0))                          \
+         BITSET_SET(hw_state->pack_dirty, ANV_GFX_STATE_##bit);         \
+   } while (0)
+#define diff_fix_state_stage(bit, name, old_stage)                      \
+   do {                                                                 \
+      /* Fixed states should always have matching sizes */              \
+      assert(old_stage == MESA_SHADER_NONE ||                           \
+             gfx->shaders[old_stage] == NULL ||                         \
+             gfx->shaders[old_stage]->name.len == shader->name.len);    \
+      /* Don't bother memcmp if the state is already dirty */           \
+      if (!BITSET_TEST(hw_state->pack_dirty,                            \
+                       ANV_GFX_STATE_##bit) &&                          \
+          (old_stage == MESA_SHADER_NONE ||                             \
+           gfx->shaders[old_stage] == NULL ||                           \
+           memcmp(&gfx->shaders[old_stage]->cmd_data[                   \
+                     gfx->shaders[old_stage]->name.offset],             \
+                  &shader->cmd_data[                                    \
+                     shader->name.offset],                              \
+                  4 * shader->name.len) != 0))                          \
+         BITSET_SET(hw_state->pack_dirty, ANV_GFX_STATE_##bit);         \
+   } while (0)
+#define diff_var_state_stage(bit, name, old_stage)                      \
+   do {                                                                 \
+      /* Don't bother memcmp if the state is already dirty */           \
+      /* Also if the new state is empty, avoid marking dirty */         \
+      if (!BITSET_TEST(hw_state->pack_dirty,                            \
+                       ANV_GFX_STATE_##bit) &&                          \
+          shader->name.len != 0 &&                                      \
+          (gfx->shaders[old_stage] == NULL ||                           \
+           gfx->shaders[old_stage]->name.len != shader->name.len ||     \
+           memcmp(&gfx->shaders[old_stage]->cmd_data[                   \
+                     gfx->shaders[old_stage]->name.offset],             \
+                  &shader->cmd_data[shader->name.offset],               \
+                  4 * shader->name.len) != 0))                          \
+         BITSET_SET(hw_state->pack_dirty, ANV_GFX_STATE_##bit);         \
+   } while (0)
+
+      switch (s) {
+      case MESA_SHADER_VERTEX:
+         if (shader != NULL) {
+            diff_fix_state(VS, vs.vs);
+            diff_fix_state(VF_SGVS, vs.vf_sgvs);
+            if (cmd_buffer->device->info->ver >= 11)
+               diff_fix_state(VF_SGVS_2, vs.vf_sgvs_2);
+            diff_fix_state(VF_COMPONENT_PACKING, vs.vf_component_packing);
+            diff_var_state(VF_SGVS_INSTANCING, vs.vf_sgvs_instancing);
+            gfx->vs_source_hash = shader->prog_data->source_hash;
+         } else {
+            BITSET_SET(hw_state->pack_dirty, ANV_GFX_STATE_VS);
+         }
+         break;
+
+      case MESA_SHADER_TESS_CTRL:
+         if (shader != NULL)
+            diff_fix_state(HS, hs.hs);
+         else
+            BITSET_SET(hw_state->pack_dirty, ANV_GFX_STATE_HS);
+         break;
+
+      case MESA_SHADER_TESS_EVAL:
+         if (shader != NULL) {
+            diff_fix_state(DS, ds.ds);
+            diff_fix_state(TE, ds.te);
+         } else {
+            BITSET_SET(hw_state->pack_dirty, ANV_GFX_STATE_DS);
+            BITSET_SET(hw_state->pack_dirty, ANV_GFX_STATE_TE);
+         }
+         break;
+
+      case MESA_SHADER_GEOMETRY:
+         if (shader != NULL)
+            diff_fix_state(GS, gs.gs);
+         else
+            BITSET_SET(hw_state->pack_dirty, ANV_GFX_STATE_GS);
+         break;
+
+      case MESA_SHADER_MESH:
+         if (shader != NULL) {
+            diff_fix_state(MESH_CONTROL, ms.control);
+            diff_fix_state(MESH_SHADER, ms.shader);
+            diff_fix_state(MESH_DISTRIB, ms.distrib);
+            diff_fix_state(CLIP_MESH, ms.clip);
+         } else {
+            BITSET_SET(hw_state->pack_dirty, ANV_GFX_STATE_MESH_CONTROL);
+         }
+         break;
+
+      case MESA_SHADER_TASK:
+         if (shader != NULL) {
+            diff_fix_state(TASK_CONTROL, ts.control);
+            diff_fix_state(TASK_SHADER, ts.shader);
+            diff_fix_state(TASK_REDISTRIB, ts.redistrib);
+         } else {
+            BITSET_SET(hw_state->pack_dirty, ANV_GFX_STATE_TASK_CONTROL);
+         }
+         break;
+
+      case MESA_SHADER_FRAGMENT:
+         if (shader != NULL) {
+            diff_fix_state(WM, ps.wm);
+            diff_fix_state(PS, ps.ps);
+            diff_fix_state(PS_EXTRA, ps.ps_extra);
+            gfx->fs_source_hash = shader->prog_data->source_hash;
+         } else {
+            BITSET_SET(hw_state->pack_dirty, ANV_GFX_STATE_PS_EXTRA);
+         }
+         break;
+
+      default:
+         UNREACHABLE("Invalid shader stage");
+      }
+
+      /* Only diff those fields on the streamout stage */
+      if (s == new_streamout_stage) {
+         diff_fix_state_stage(STREAMOUT, so, gfx->streamout_stage);
+         diff_var_state_stage(SO_DECL_LIST, so_decl_list, gfx->streamout_stage);
+      }
+
+      gfx->shaders[s] = shader;
+   }
+
+   gfx->streamout_stage = new_streamout_stage;
+
+#undef diff_fix_state
+#undef diff_var_state
+#undef diff_fix_state_stage
+#undef diff_var_state_stage
+
+   update_push_descriptor_flags(&gfx->base,
+                                cmd_buffer->state.gfx.shaders,
+                                ARRAY_SIZE(cmd_buffer->state.gfx.shaders));
+
+   if (ray_queries > 0) {
+      assert(cmd_buffer->device->info->verx10 >= 125);
+      anv_cmd_buffer_set_rt_query_buffer(cmd_buffer, &gfx->base, ray_queries,
+                                         cmd_buffer->state.gfx.active_stages);
+   }
+}
+
+void
+anv_cmd_buffer_bind_shaders(struct vk_command_buffer *vk_cmd_buffer,
+                            uint32_t stage_count,
+                            const mesa_shader_stage *stages,
+                            struct vk_shader ** const vk_shaders)
+{
+   struct anv_shader ** const shaders = (struct anv_shader ** const)vk_shaders;
+   struct anv_cmd_buffer *cmd_buffer =
+      container_of(vk_cmd_buffer, struct anv_cmd_buffer, vk);
+
+   /* Append any scratch surface used by the shaders */
+   for (uint32_t i = 0; i < stage_count; i++) {
+      if (shaders[i] != NULL) {
+         anv_reloc_list_append(cmd_buffer->batch.relocs,
+                               &shaders[i]->relocs);
+      }
+   }
+
+   struct anv_shader *cs_shader = cmd_buffer->state.compute.shader;
+   struct anv_shader *gfx_shaders[ANV_GRAPHICS_SHADER_STAGE_COUNT];
+   memcpy(gfx_shaders, cmd_buffer->state.gfx.shaders, sizeof(gfx_shaders));
+   for (uint32_t i = 0; i < stage_count; i++) {
+      if (mesa_shader_stage_is_compute(stages[i]))
+         cs_shader = shaders[i];
+      else
+         gfx_shaders[stages[i]] = shaders[i];
+   }
+
+   if (cs_shader != cmd_buffer->state.compute.shader)
+      bind_compute_shader(cmd_buffer, cs_shader);
+   if (memcmp(gfx_shaders, cmd_buffer->state.gfx.shaders, sizeof(gfx_shaders)))
+      bind_graphics_shaders(cmd_buffer, gfx_shaders);
+}
diff --git a/src/intel/vulkan/anv_device.c b/src/intel/vulkan/anv_device.c
index 6b21b498e3a..00885daec5a 100644
--- a/src/intel/vulkan/anv_device.c
+++ b/src/intel/vulkan/anv_device.c
@@ -31,6 +31,7 @@
 
 #include "anv_private.h"
 #include "anv_measure.h"
+#include "anv_shader.h"
 #include "anv_slab_bo.h"
 #include "util/u_debug.h"
 #include "util/os_file.h"
@@ -380,6 +381,8 @@ VkResult anv_CreateDevice(
    if (result != VK_SUCCESS)
       goto fail_alloc;
 
+   device->vk.shader_ops = &anv_device_shader_ops;
+
    if (INTEL_DEBUG(DEBUG_BATCH) || INTEL_DEBUG(DEBUG_BATCH_STATS)) {
       for (unsigned i = 0; i < physical_device->queue.family_count; i++) {
          struct intel_batch_decode_ctx *decoder = &device->decoder[i];
diff --git a/src/intel/vulkan/anv_genX.h b/src/intel/vulkan/anv_genX.h
index ed079f59cec..342e00d31d9 100644
--- a/src/intel/vulkan/anv_genX.h
+++ b/src/intel/vulkan/anv_genX.h
@@ -223,7 +223,7 @@ uint32_t
 genX(cmd_buffer_flush_descriptor_sets)(struct anv_cmd_buffer *cmd_buffer,
                                        struct anv_cmd_pipeline_state *pipe_state,
                                        const VkShaderStageFlags dirty,
-                                       const struct anv_shader_bin **shaders,
+                                       const struct anv_shader **shaders,
                                        uint32_t num_shaders);
 
 void genX(cmd_buffer_flush_gfx_hw_state)(struct anv_cmd_buffer *cmd_buffer);
diff --git a/src/intel/vulkan/anv_pipeline.c b/src/intel/vulkan/anv_pipeline.c
index 9364eb4b909..ead66968608 100644
--- a/src/intel/vulkan/anv_pipeline.c
+++ b/src/intel/vulkan/anv_pipeline.c
@@ -173,17 +173,29 @@ anv_pipeline_finish(struct anv_pipeline *pipeline,
    vk_object_base_finish(&pipeline->vk.base);
 }
 
+VKAPI_ATTR void VKAPI_CALL
+vk_common_DestroyPipeline(VkDevice _device,
+                          VkPipeline _pipeline,
+                          const VkAllocationCallbacks *pAllocator);
+
 void anv_DestroyPipeline(
     VkDevice                                    _device,
     VkPipeline                                  _pipeline,
     const VkAllocationCallbacks*                pAllocator)
 {
    ANV_FROM_HANDLE(anv_device, device, _device);
-   ANV_FROM_HANDLE(anv_pipeline, pipeline, _pipeline);
+   VK_FROM_HANDLE(vk_pipeline, vk_pipeline, _pipeline);
 
-   if (!pipeline)
+   if (!vk_pipeline)
       return;
 
+   if (vk_pipeline->ops != NULL) {
+      vk_common_DestroyPipeline(_device, _pipeline, pAllocator);
+      return;
+   }
+
+   ANV_FROM_HANDLE(anv_pipeline, pipeline, _pipeline);
+
    ANV_RMV(resource_destroy, device, pipeline);
 
    switch (pipeline->type) {
@@ -2851,6 +2863,7 @@ anv_compute_pipeline_create(struct anv_device *device,
    return pipeline->base.batch.status;
 }
 
+#if 0
 VkResult anv_CreateComputePipelines(
     VkDevice                                    _device,
     VkPipelineCache                             pipelineCache,
@@ -2885,6 +2898,7 @@ VkResult anv_CreateComputePipelines(
 
    return result;
 }
+#endif
 
 static uint32_t
 get_vs_input_elements(const struct brw_vs_prog_data *vs_prog_data)
@@ -3343,6 +3357,7 @@ anv_graphics_pipeline_create(struct anv_device *device,
    return pipeline->base.base.batch.status;
 }
 
+#if 0
 VkResult anv_CreateGraphicsPipelines(
     VkDevice                                    _device,
     VkPipelineCache                             pipelineCache,
@@ -3388,6 +3403,7 @@ VkResult anv_CreateGraphicsPipelines(
 
    return result;
 }
+#endif
 
 static bool
 should_remat_cb(nir_instr *instr, void *data)
@@ -4083,6 +4099,7 @@ anv_ray_tracing_pipeline_create(
    return pipeline->base.batch.status;
 }
 
+#if 0
 VkResult
 anv_CreateRayTracingPipelinesKHR(
     VkDevice                                    _device,
@@ -4491,3 +4508,4 @@ anv_GetRayTracingShaderGroupStackSizeKHR(
 
    return brw_bs_prog_data_const(bin->prog_data)->max_stack_size;
 }
+#endif
diff --git a/src/intel/vulkan/anv_private.h b/src/intel/vulkan/anv_private.h
index de85f360815..00e48e319e7 100644
--- a/src/intel/vulkan/anv_private.h
+++ b/src/intel/vulkan/anv_private.h
@@ -1224,7 +1224,6 @@ struct anv_shader {
    struct anv_state kernel;
 
    const struct brw_stage_prog_data *prog_data;
-   uint32_t prog_data_size;
 
    struct brw_compile_stats stats[3];
    uint32_t num_stats;
@@ -2186,6 +2185,11 @@ struct anv_gfx_dynamic_state {
       uint32_t PrimitiveTopologyType;
    } vft;
 
+   /* 3DSTATE_VS */
+   struct {
+      bool VertexCacheDisable;
+   } vs;
+
    /* 3DSTATE_VIEWPORT_STATE_POINTERS_CC */
    struct {
      uint32_t count;
@@ -4422,7 +4426,7 @@ struct anv_cmd_graphics_state {
    struct anv_cmd_pipeline_state base;
 
    /* Shaders bound */
-   struct anv_shader_bin *shaders[ANV_GRAPHICS_SHADER_STAGE_COUNT];
+   struct anv_shader *shaders[ANV_GRAPHICS_SHADER_STAGE_COUNT];
 
    /* Bitfield of valid entries in the shaders array */
    VkShaderStageFlags active_stages;
@@ -4436,6 +4440,9 @@ struct anv_cmd_graphics_state {
    bool kill_pixel;
    bool uses_xfb;
 
+   /* Shader stage in base.shaders[] responsible for streamout */
+   mesa_shader_stage streamout_stage;
+
    /* Render pass information */
    VkRenderingFlags rendering_flags;
    VkRect2D render_area;
@@ -4530,7 +4537,7 @@ struct anv_cmd_graphics_state {
 struct anv_cmd_compute_state {
    struct anv_cmd_pipeline_state base;
 
-   struct anv_shader_bin *shader;
+   struct anv_shader *shader;
 
    bool pipeline_dirty;
 
@@ -4551,6 +4558,8 @@ struct anv_cmd_ray_tracing_state {
       struct brw_rt_scratch_layout layout;
    } scratch;
 
+   VkDeviceSize scratch_size;
+
    uint32_t debug_marker_count;
 
    uint32_t num_tlas;
    uint32_t num_blas;
@@ -5022,6 +5031,12 @@ void
 anv_cmd_buffer_update_pending_query_bits(struct anv_cmd_buffer *cmd_buffer,
                                          enum anv_pipe_bits flushed_bits);
 
+void
+anv_cmd_buffer_bind_shaders(struct vk_command_buffer *cmd_buffer,
+                            uint32_t stage_count,
+                            const mesa_shader_stage *stages,
+                            struct vk_shader ** const shaders);
+
 /**
  * A allocation tied to a command buffer.
  *
@@ -5083,7 +5098,7 @@ enum anv_cmd_saved_state_flags {
 
 struct anv_cmd_saved_state {
    uint32_t flags;
 
-   struct anv_pipeline *pipeline;
+   struct vk_shader *shader;
    struct anv_descriptor_set *descriptor_set[MAX_SETS];
    uint8_t push_constants[MAX_PUSH_CONSTANTS_SIZE];
 };
@@ -5444,7 +5459,6 @@ struct anv_graphics_pipeline {
                  4 * _cmd_state->len);                                  \
    } while (0)
 
-
 struct anv_compute_pipeline {
    struct anv_pipeline base;
 
@@ -6484,6 +6498,15 @@ anv_cmd_flush_buffer_write_cp(VkCommandBuffer cmd_buffer);
 VkResult
 anv_cmd_buffer_ensure_rcs_companion(struct anv_cmd_buffer *cmd_buffer);
 
+void
+anv_cmd_buffer_set_rt_state(struct vk_command_buffer *vk_cmd_buffer,
+                            VkDeviceSize scratch_size,
+                            uint32_t ray_queries);
+
+void
+anv_cmd_buffer_set_stack_size(struct vk_command_buffer *vk_cmd_buffer,
+                              VkDeviceSize stack_size);
+
 bool
 anv_can_hiz_clear_image(struct anv_cmd_buffer *cmd_buffer,
                         const struct anv_image *image,
diff --git a/src/intel/vulkan/anv_shader_compile.c b/src/intel/vulkan/anv_shader_compile.c
index 14eb0d2adf9..666082b4f4b 100644
--- a/src/intel/vulkan/anv_shader_compile.c
+++ b/src/intel/vulkan/anv_shader_compile.c
@@ -1886,5 +1886,8 @@ struct vk_device_shader_ops anv_device_shader_ops = {
    .deserialize = anv_shader_deserialize,
    .write_rt_shader_group = anv_write_rt_shader_group,
    .write_rt_shader_group_replay_handle = anv_write_rt_shader_group_replay_handle,
+   .cmd_bind_shaders = anv_cmd_buffer_bind_shaders,
    .cmd_set_dynamic_graphics_state = vk_cmd_set_dynamic_graphics_state,
+   .cmd_set_rt_state = anv_cmd_buffer_set_rt_state,
+   .cmd_set_stack_size = anv_cmd_buffer_set_stack_size,
 };
diff --git a/src/intel/vulkan/genX_cmd_buffer.c b/src/intel/vulkan/genX_cmd_buffer.c
index 33325da7197..46d84535a01 100644
--- a/src/intel/vulkan/genX_cmd_buffer.c
+++ b/src/intel/vulkan/genX_cmd_buffer.c
@@ -2121,7 +2121,7 @@ emit_direct_descriptor_binding_table_entry(struct anv_cmd_buffer *cmd_buffer,
 static VkResult
 emit_binding_table(struct anv_cmd_buffer *cmd_buffer,
                    struct anv_cmd_pipeline_state *pipe_state,
-                   const struct anv_shader_bin *shader,
+                   const struct anv_shader *shader,
                    struct anv_state *bt_state)
 {
    uint32_t state_offset;
@@ -2153,7 +2153,7 @@ emit_binding_table(struct anv_cmd_buffer *cmd_buffer,
 
       case ANV_DESCRIPTOR_SET_COLOR_ATTACHMENTS:
          /* Color attachment binding */
-         assert(shader->stage == MESA_SHADER_FRAGMENT);
+         assert(shader->vk.stage == MESA_SHADER_FRAGMENT);
 
         uint32_t index = binding->index < MAX_RTS ?
                          cmd_buffer->state.gfx.color_output_mapping[binding->index] :
                          binding->index;
@@ -2268,7 +2268,7 @@ emit_binding_table(struct anv_cmd_buffer *cmd_buffer,
 
 static VkResult
 emit_samplers(struct anv_cmd_buffer *cmd_buffer,
               struct anv_cmd_pipeline_state *pipe_state,
-              const struct anv_shader_bin *shader,
+              const struct anv_shader *shader,
               struct anv_state *state)
 {
    const struct anv_pipeline_bind_map *map = &shader->bind_map;
@@ -2312,7 +2312,7 @@ uint32_t
 genX(cmd_buffer_flush_descriptor_sets)(struct anv_cmd_buffer *cmd_buffer,
                                        struct anv_cmd_pipeline_state *pipe_state,
                                        const VkShaderStageFlags dirty,
-                                       const struct anv_shader_bin **shaders,
+                                       const struct anv_shader **shaders,
                                        uint32_t num_shaders)
 {
    VkShaderStageFlags flushed = 0;
@@ -2322,7 +2322,7 @@ genX(cmd_buffer_flush_descriptor_sets)(struct anv_cmd_buffer *cmd_buffer,
       if (!shaders[i])
         continue;
 
-      mesa_shader_stage stage = shaders[i]->stage;
+      mesa_shader_stage stage = shaders[i]->vk.stage;
       VkShaderStageFlags vk_stage = mesa_to_vk_shader_stage(stage);
       if ((vk_stage & dirty) == 0)
         continue;
@@ -2361,7 +2361,7 @@ genX(cmd_buffer_flush_descriptor_sets)(struct anv_cmd_buffer *cmd_buffer,
       if (!shaders[i])
         continue;
 
-      mesa_shader_stage stage = shaders[i]->stage;
+      mesa_shader_stage stage = shaders[i]->vk.stage;
 
       result = emit_samplers(cmd_buffer, pipe_state, shaders[i],
                              &cmd_buffer->state.samplers[stage]);
diff --git a/src/intel/vulkan/genX_cmd_compute.c b/src/intel/vulkan/genX_cmd_compute.c
index 2eb8fceaa83..2fcbda1fef3 100644
--- a/src/intel/vulkan/genX_cmd_compute.c
+++ b/src/intel/vulkan/genX_cmd_compute.c
@@ -105,13 +105,11 @@ cmd_buffer_flush_compute_state(struct anv_cmd_buffer *cmd_buffer)
    struct anv_device *device = cmd_buffer->device;
    struct anv_cmd_compute_state *comp_state = &cmd_buffer->state.compute;
    const UNUSED struct intel_device_info *devinfo = cmd_buffer->device->info;
-   struct anv_compute_pipeline *pipeline =
-      anv_pipeline_to_compute(comp_state->base.pipeline);
 
    assert(comp_state->shader);
 
    genX(cmd_buffer_config_l3)(cmd_buffer,
-                              pipeline->cs->prog_data->total_shared > 0 ?
+                              comp_state->shader->prog_data->total_shared > 0 ?
                               device->l3_slm_config : device->l3_config);
 
    genX(cmd_buffer_update_color_aux_op(cmd_buffer, ISL_AUX_OP_NONE));
@@ -127,7 +125,7 @@ cmd_buffer_flush_compute_state(struct anv_cmd_buffer *cmd_buffer)
     */
    genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer);
 
-   if (cmd_buffer->state.compute.pipeline_dirty) {
+   if (comp_state->pipeline_dirty) {
 #if GFX_VERx10 < 125
       /* From the Sky Lake PRM Vol 2a, MEDIA_VFE_STATE:
       *
       *    "A stalling PIPE_CONTROL is required before MEDIA_VFE_STATE unless
       *    the only bits that are changed are scoreboard related: Scoreboard
       *    Enable, Scoreboard Type, Scoreboard Mask, Scoreboard * Delta. For
       *    these scoreboard related states, a MEDIA_STATE_FLUSH is
       *    sufficient."
       */
      genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer);
 #endif
 
-      anv_batch_emit_batch(&cmd_buffer->batch, &pipeline->base.batch);
+#define anv_batch_emit_cs(batch, cmd, field) ({                         \
+         void *__dst = anv_batch_emit_dwords(                           \
+            batch, __anv_cmd_length(cmd));                              \
+         memcpy(__dst,                                                  \
+                &comp_state->shader->cmd_data[                          \
+                   comp_state->shader->field.offset],                   \
+                4 * __anv_cmd_length(cmd));                             \
+         VG(VALGRIND_CHECK_MEM_IS_DEFINED(                              \
+               __dst, __anv_cmd_length(cmd) * 4));                      \
+         __dst;                                                         \
+      })
+
 #if GFX_VERx10 >= 125
       const struct brw_cs_prog_data *prog_data = get_cs_prog_data(comp_state);
       genX(cmd_buffer_ensure_cfe_state)(cmd_buffer, prog_data->base.total_scratch);
+#else
+      anv_batch_emit_cs(&cmd_buffer->batch, GENX(MEDIA_VFE_STATE), cs.gfx9.vfe);
 #endif
 
+#undef anv_batch_emit_cs
+
       /* Changing the pipeline affects the push constants layout (different
        * amount of cross/per thread allocations).
The allocation is also
+       * bounded to just the amount consumed by the pipeline (see
@@ -179,7 +192,7 @@ cmd_buffer_flush_compute_state(struct anv_cmd_buffer *cmd_buffer)
          cmd_buffer,
          &cmd_buffer->state.compute.base,
          VK_SHADER_STAGE_COMPUTE_BIT,
-         (const struct anv_shader_bin **)&comp_state->shader, 1);
+         (const struct anv_shader **)&comp_state->shader, 1);
       cmd_buffer->state.descriptors_dirty &= ~VK_SHADER_STAGE_COMPUTE_BIT;
 
 #if GFX_VERx10 < 125
@@ -194,7 +207,7 @@ cmd_buffer_flush_compute_state(struct anv_cmd_buffer *cmd_buffer)
 
       struct anv_state state =
          anv_cmd_buffer_merge_dynamic(cmd_buffer, iface_desc_data_dw,
-                                      pipeline->gfx9.interface_descriptor_data,
+                                      comp_state->shader->cs.gfx9.idd,
                                       GENX(INTERFACE_DESCRIPTOR_DATA_length),
                                       64);
@@ -439,7 +452,7 @@ emit_indirect_compute_walker(struct anv_cmd_buffer *cmd_buffer,
       &cmd_buffer->batch,
       GENX(EXECUTE_INDIRECT_DISPATCH_length),
       GENX(EXECUTE_INDIRECT_DISPATCH_body_start) / 32,
-      anv_pipeline_to_compute(comp_state->base.pipeline)->gfx125.compute_walker_body,
+      comp_state->shader->cs.gfx125.compute_walker_body,
      GENX(EXECUTE_INDIRECT_DISPATCH),
      .PredicateEnable = predicate,
      .MaxCount = 1,
@@ -520,7 +533,7 @@ emit_compute_walker(struct anv_cmd_buffer *cmd_buffer,
       &cmd_buffer->batch,
       GENX(COMPUTE_WALKER_length),
       GENX(COMPUTE_WALKER_body_start) / 32,
-      anv_pipeline_to_compute(comp_state->base.pipeline)->gfx125.compute_walker_body,
+      comp_state->shader->cs.gfx125.compute_walker_body,
      GENX(COMPUTE_WALKER),
      .IndirectParameterEnable = !anv_address_is_null(indirect_addr),
      .PredicateEnable = predicate,
@@ -1051,8 +1064,6 @@ cmd_buffer_trace_rays(struct anv_cmd_buffer *cmd_buffer,
 {
    struct anv_device *device = cmd_buffer->device;
    struct anv_cmd_ray_tracing_state *rt = &cmd_buffer->state.rt;
-   struct anv_ray_tracing_pipeline *pipeline =
-      anv_pipeline_to_ray_tracing(rt->base.pipeline);
 
    if (INTEL_DEBUG(DEBUG_RT_NO_TRACE))
      return;
@@ -1211,18 +1222,18 @@ cmd_buffer_trace_rays(struct anv_cmd_buffer *cmd_buffer,
        */
      btd.PerDSSMemoryBackedBufferSize = 6;
      btd.MemoryBackedBufferBasePointer = (struct anv_address) { .bo = device->btd_fifo_bo };
-      if (pipeline->base.scratch_size > 0) {
+      if (rt->scratch_size > 0) {
          struct anv_bo *scratch_bo =
            anv_scratch_pool_alloc(device,
                                   &device->scratch_pool,
                                   MESA_SHADER_COMPUTE,
-                                  pipeline->base.scratch_size);
+                                  rt->scratch_size);
          anv_reloc_list_add_bo(cmd_buffer->batch.relocs,
                                scratch_bo);
          uint32_t scratch_surf =
            anv_scratch_pool_get_surf(cmd_buffer->device,
                                      &device->scratch_pool,
-                                      pipeline->base.scratch_size);
+                                      rt->scratch_size);
          btd.ScratchSpaceBuffer = scratch_surf >> ANV_SCRATCH_SPACE_SHIFT(GFX_VER);
      }
 #if INTEL_NEEDS_WA_14017794102 || INTEL_NEEDS_WA_14023061436
@@ -1234,7 +1245,7 @@ cmd_buffer_trace_rays(struct anv_cmd_buffer *cmd_buffer,
 #endif
    }
 
-   genX(cmd_buffer_ensure_cfe_state)(cmd_buffer, pipeline->base.scratch_size);
+   genX(cmd_buffer_ensure_cfe_state)(cmd_buffer, rt->scratch_size);
 
    const struct brw_cs_prog_data *cs_prog_data =
      brw_cs_prog_data_const(device->rt_trampoline->prog_data);
@@ -1273,7 +1284,7 @@ cmd_buffer_trace_rays(struct anv_cmd_buffer *cmd_buffer,
      .ThreadGroupIDZDimension = global_size[2],
      .ExecutionMask = 0xff,
      .EmitInlineParameter = true,
-      .PostSync.MOCS = anv_mocs(pipeline->base.device, NULL, 0),
+      .PostSync.MOCS = anv_mocs(cmd_buffer->device, NULL, 0),
 
 #if GFX_VER >= 30
      /* HSD 14016252163 */
      .DispatchWalkOrder = cs_prog_data->uses_sampler ?
                           MortonWalk : LinearWalk,
diff --git a/src/intel/vulkan/genX_cmd_draw.c b/src/intel/vulkan/genX_cmd_draw.c
index f7f61b38471..743012011ec 100644
--- a/src/intel/vulkan/genX_cmd_draw.c
+++ b/src/intel/vulkan/genX_cmd_draw.c
@@ -162,7 +162,7 @@ cmd_buffer_emit_descriptor_pointers(struct anv_cmd_buffer *cmd_buffer,
 
 static struct anv_address
 get_push_range_address(struct anv_cmd_buffer *cmd_buffer,
-                       const struct anv_shader_bin *shader,
+                       const struct anv_shader *shader,
                        const struct anv_push_range *range)
 {
    struct anv_cmd_graphics_state *gfx_state = &cmd_buffer->state.gfx;
@@ -242,10 +242,10 @@ get_push_range_address(struct anv_cmd_buffer *cmd_buffer,
  */
 static uint32_t
 get_push_range_bound_size(struct anv_cmd_buffer *cmd_buffer,
-                          const struct anv_shader_bin *shader,
+                          const struct anv_shader *shader,
                          const struct anv_push_range *range)
 {
-   assert(shader->stage != MESA_SHADER_COMPUTE);
+   assert(shader->vk.stage != MESA_SHADER_COMPUTE);
    const struct anv_cmd_graphics_state *gfx_state = &cmd_buffer->state.gfx;
    switch (range->set) {
    case ANV_DESCRIPTOR_SET_DESCRIPTORS: {
@@ -443,7 +443,7 @@ cmd_buffer_flush_gfx_push_constants(struct anv_cmd_buffer *cmd_buffer,
      if (!anv_gfx_has_stage(gfx, stage))
         continue;
 
-      const struct anv_shader_bin *shader = gfx->shaders[stage];
+      const struct anv_shader *shader = gfx->shaders[stage];
 
      if (shader->prog_data->robust_ubo_ranges) {
         const struct anv_pipeline_bind_map *bind_map = &shader->bind_map;
         struct anv_push_constants *push = &gfx->base.push_constants;
@@ -509,7 +509,7 @@ cmd_buffer_flush_gfx_push_constants(struct anv_cmd_buffer *cmd_buffer,
      struct anv_address buffers[4] = {};
      if (anv_gfx_has_stage(gfx, stage)) {
-         const struct anv_shader_bin *shader = gfx->shaders[stage];
+         const struct anv_shader *shader = gfx->shaders[stage];
         const struct anv_pipeline_bind_map *bind_map = &shader->bind_map;
 
         /* We have to gather buffer addresses as a second step because the
@@ -593,7 +593,7 @@ get_mesh_task_push_addr64(struct anv_cmd_buffer *cmd_buffer,
                           struct anv_cmd_graphics_state *gfx,
                           mesa_shader_stage stage)
 {
-   const struct anv_shader_bin *shader = gfx->shaders[stage];
+   const struct anv_shader *shader = gfx->shaders[stage];
    const struct anv_pipeline_bind_map *bind_map = &shader->bind_map;
    if (bind_map->push_ranges[0].length == 0)
      return 0;
@@ -645,31 +645,50 @@ cmd_buffer_flush_mesh_inline_data(struct anv_cmd_buffer *cmd_buffer,
 
 ALWAYS_INLINE static void
 cmd_buffer_maybe_flush_rt_writes(struct anv_cmd_buffer *cmd_buffer,
-                                 const struct anv_graphics_pipeline *pipeline)
+                                 struct anv_cmd_graphics_state *gfx,
+                                 const struct vk_dynamic_graphics_state *dyn)
 {
-   if (!anv_pipeline_has_stage(pipeline, MESA_SHADER_FRAGMENT))
+   if (!anv_gfx_has_stage(gfx, MESA_SHADER_FRAGMENT))
      return;
 
-   UNUSED bool need_rt_flush = false;
-   for (uint32_t rt = 0; rt < pipeline->num_color_outputs; rt++) {
-      /* No writes going to this render target so it won't affect the RT cache
-       */
-      if (pipeline->color_output_mapping[rt] == ANV_COLOR_OUTPUT_UNUSED)
-         continue;
+   /* Count the number of color attachments in the binding table */
+   const struct anv_pipeline_bind_map *bind_map =
+      &gfx->shaders[MESA_SHADER_FRAGMENT]->bind_map;
 
-      /* No change */
-      if (cmd_buffer->state.gfx.color_output_mapping[rt] ==
-          pipeline->color_output_mapping[rt])
-         continue;
-
-      cmd_buffer->state.gfx.color_output_mapping[rt] =
-         pipeline->color_output_mapping[rt];
-      need_rt_flush = true;
-      cmd_buffer->state.descriptors_dirty |= VK_SHADER_STAGE_FRAGMENT_BIT;
+   /* Build a map of fragment color output to attachment */
+   uint8_t rt_to_att[MAX_RTS];
+   memset(rt_to_att, ANV_COLOR_OUTPUT_DISABLED, MAX_RTS);
+   for (uint32_t i = 0; i < MAX_RTS; i++) {
+      if (dyn->cal.color_map[i] != MESA_VK_ATTACHMENT_UNUSED)
+         rt_to_att[dyn->cal.color_map[i]] = i;
+   }
+
+   /* For each fragment shader output, if not unused, apply the remapping to
+    * gfx->color_output_mapping.
+    */
+   UNUSED bool need_rt_flush = false;
+   for (unsigned rt = 0; rt < MIN2(bind_map->surface_count, MAX_RTS); rt++) {
+      if (bind_map->surface_to_descriptor[rt].set !=
+          ANV_DESCRIPTOR_SET_COLOR_ATTACHMENTS)
+         break;
+
+      uint32_t index = bind_map->surface_to_descriptor[rt].index;
+      if (index == ANV_COLOR_OUTPUT_UNUSED)
+         continue;
+
+      if (index == ANV_COLOR_OUTPUT_DISABLED &&
+          gfx->color_output_mapping[rt] != index) {
+         gfx->color_output_mapping[rt] = index;
+         need_rt_flush = true;
+      } else if (gfx->color_output_mapping[rt] != rt_to_att[rt]) {
+         gfx->color_output_mapping[rt] = rt_to_att[rt];
+         need_rt_flush = true;
+      }
    }
 
-#if GFX_VER >= 11
    if (need_rt_flush) {
+      cmd_buffer->state.descriptors_dirty |= VK_SHADER_STAGE_FRAGMENT_BIT;
+#if GFX_VER >= 11
      /* The PIPE_CONTROL command description says:
       *
       *    "Whenever a Binding Table Index (BTI) used by a Render Target Message
@@ -689,8 +708,8 @@ cmd_buffer_maybe_flush_rt_writes(struct anv_cmd_buffer *cmd_buffer,
                                ANV_PIPE_RENDER_TARGET_CACHE_FLUSH_BIT |
                                ANV_PIPE_STALL_AT_SCOREBOARD_BIT,
                                "change RT due to shader outputs");
-   }
 #endif
+   }
 }
 
 ALWAYS_INLINE static void
@@ -750,8 +769,6 @@ cmd_buffer_flush_gfx_state(struct anv_cmd_buffer *cmd_buffer)
 {
    struct anv_device *device = cmd_buffer->device;
    struct anv_cmd_graphics_state *gfx = &cmd_buffer->state.gfx;
-   struct anv_graphics_pipeline *pipeline =
-      anv_pipeline_to_graphics(gfx->base.pipeline);
    const struct vk_dynamic_graphics_state *dyn =
      &cmd_buffer->vk.dynamic_graphics_state;
@@ -772,16 +789,16 @@ cmd_buffer_flush_gfx_state(struct anv_cmd_buffer *cmd_buffer)
       *
       * Apply task URB workaround when switching from task to primitive.
       */
-      if (anv_pipeline_is_primitive(pipeline)) {
+      if (!anv_gfx_has_stage(gfx, MESA_SHADER_MESH)) {
         genX(apply_task_urb_workaround)(cmd_buffer);
-      } else if (anv_pipeline_has_stage(pipeline, MESA_SHADER_TASK)) {
+      } else if (anv_gfx_has_stage(gfx, MESA_SHADER_TASK)) {
         cmd_buffer->state.gfx.used_task_shader = true;
      }
    }
 
    if (BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_COLOR_ATTACHMENT_MAP) ||
        (cmd_buffer->state.gfx.dirty & ANV_CMD_DIRTY_PS))
-      cmd_buffer_maybe_flush_rt_writes(cmd_buffer, pipeline);
+      cmd_buffer_maybe_flush_rt_writes(cmd_buffer, gfx, dyn);
 
    /* Apply any pending pipeline flushes we may have.  We want to apply them
    * now because, if any of those flushes are for things like push constants,
@@ -887,17 +904,29 @@ cmd_buffer_flush_gfx_state(struct anv_cmd_buffer *cmd_buffer)
    /* If the pipeline changed, we may need to re-allocate push constant space
    * in the URB.
    */
-   if (cmd_buffer->state.gfx.dirty & ANV_CMD_DIRTY_PUSH_CONSTANT_SHADERS) {
+   if (cmd_buffer->state.gfx.dirty & ANV_CMD_DIRTY_PUSH_CONSTANT_SHADERS)
      cmd_buffer_alloc_gfx_push_constants(cmd_buffer);
 
-      /* Also add the relocations (scratch buffers) */
-      VkResult result = anv_reloc_list_append(cmd_buffer->batch.relocs,
-                                              pipeline->base.base.batch.relocs);
-      if (result != VK_SUCCESS) {
-         anv_batch_set_error(&cmd_buffer->batch, result);
-         return;
+#if GFX_VERx10 < 125
+   if (cmd_buffer->state.gfx.dirty & (ANV_CMD_DIRTY_VS |
+                                      ANV_CMD_DIRTY_HS |
+                                      ANV_CMD_DIRTY_DS |
+                                      ANV_CMD_DIRTY_GS |
+                                      ANV_CMD_DIRTY_PS)) {
+      for (unsigned s = 0; s <= MESA_SHADER_FRAGMENT; s++) {
+         if (gfx->shaders[s] == NULL)
+            continue;
+
+         /* Also add the relocations (scratch buffers) */
+         VkResult result = anv_reloc_list_append(cmd_buffer->batch.relocs,
+                                                 &gfx->shaders[s]->relocs);
+         if (result != VK_SUCCESS) {
+            anv_batch_set_error(&cmd_buffer->batch, result);
+            return;
+         }
      }
    }
+#endif
 
    /* Render targets live in the same binding table as fragment descriptors */
    if (cmd_buffer->state.gfx.dirty & ANV_CMD_DIRTY_RENDER_TARGETS)
@@ -916,7 +945,7 @@ cmd_buffer_flush_gfx_state(struct anv_cmd_buffer *cmd_buffer)
         cmd_buffer,
         &cmd_buffer->state.gfx.base,
         descriptors_dirty,
-         (const struct anv_shader_bin **)gfx->shaders,
+         (const struct anv_shader **)gfx->shaders,
         ARRAY_SIZE(gfx->shaders));
      cmd_buffer->state.descriptors_dirty &= ~dirty;
    }
@@ -989,23 +1018,13 @@ anv_use_generated_draws(const struct anv_cmd_buffer *cmd_buffer, uint32_t count)
 ALWAYS_INLINE static void
 cmd_buffer_pre_draw_wa(struct anv_cmd_buffer *cmd_buffer)
 {
+   UNUSED const struct anv_device *device = cmd_buffer->device;
+   UNUSED const struct anv_instance *instance =
+      device->physical->instance;
    UNUSED const bool protected = cmd_buffer->vk.pool->flags &
                                  VK_COMMAND_POOL_CREATE_PROTECTED_BIT;
-   UNUSED struct anv_graphics_pipeline *pipeline =
-      anv_pipeline_to_graphics(cmd_buffer->state.gfx.base.pipeline);
-   UNUSED struct anv_device *device = cmd_buffer->device;
-   UNUSED struct anv_instance *instance = device->physical->instance;
-
-#define DEBUG_SHADER_HASH(stage) do {                                   \
-      if (unlikely(                                                     \
-             (instance->debug & ANV_DEBUG_SHADER_HASH) &&               \
-             anv_pipeline_has_stage(pipeline, stage))) {                \
-         mi_store(&b,                                                   \
-                  mi_mem32(device->workaround_address),                 \
-                  mi_imm(pipeline->base.shaders[stage]->                \
-                         prog_data->source_hash));                      \
-      }                                                                 \
-   } while (0)
+   UNUSED struct anv_cmd_graphics_state *gfx = &cmd_buffer->state.gfx;
+   UNUSED struct anv_gfx_dynamic_state *hw_state = &gfx->dyn_state;
 
    struct mi_builder b;
    if (unlikely(instance->debug & ANV_DEBUG_SHADER_HASH)) {
@@ -1013,18 +1032,35 @@ cmd_buffer_pre_draw_wa(struct anv_cmd_buffer *cmd_buffer)
      mi_builder_set_mocs(&b, isl_mocs(&device->isl_dev, 0, false));
    }
 
+#define DEBUG_SHADER_HASH(stage) do {                                   \
+      if (unlikely(instance->debug & ANV_DEBUG_SHADER_HASH)) {          \
+         mi_store(&b,                                                   \
+                  mi_mem32(device->workaround_address),                 \
+                  mi_imm(gfx->shaders[stage]->prog_data->source_hash)); \
+      }                                                                 \
+   } while (0)
+
+#define anv_batch_emit_gfx(batch, cmd, name) ({                         \
+      void *__dst = anv_batch_emit_dwords(                              \
+         batch, __anv_cmd_length(cmd));                                 \
+      memcpy(__dst, hw_state->packed.name,                              \
+             4 * __anv_cmd_length(cmd));                                \
+      VG(VALGRIND_CHECK_MEM_IS_DEFINED(                                 \
+            __dst, __anv_cmd_length(cmd) * 4));                         \
+      __dst;                                                            \
+   })
+
 #if INTEL_WA_16011107343_GFX_VER
    if (intel_needs_workaround(cmd_buffer->device->info, 16011107343) &&
-       anv_pipeline_has_stage(pipeline, MESA_SHADER_TESS_CTRL)) {
+       anv_gfx_has_stage(gfx, MESA_SHADER_TESS_CTRL)) {
      DEBUG_SHADER_HASH(MESA_SHADER_TESS_CTRL);
-      anv_batch_emit_pipeline_state_protected(&cmd_buffer->batch, pipeline,
-                                              final.hs, protected);
+      anv_batch_emit_gfx(&cmd_buffer->batch, GENX(3DSTATE_HS), hs);
    }
 #endif
 
 #if INTEL_WA_22018402687_GFX_VER
    if (intel_needs_workaround(cmd_buffer->device->info, 22018402687) &&
-       anv_pipeline_has_stage(pipeline, MESA_SHADER_TESS_EVAL)) {
+       anv_gfx_has_stage(gfx, MESA_SHADER_TESS_EVAL)) {
      DEBUG_SHADER_HASH(MESA_SHADER_TESS_EVAL);
      /* Wa_22018402687:
       *   In any 3D enabled context, just before any Tessellation enabled
       *   draw call (3D Primitive), re-send the last programmed 3DSTATE_DS
       *   now again programmed with the pipeline values.
       *
       * said switch, as it matters at the HW level, and can be triggered even
       * across processes, so we apply the Wa at all times.
       */
-      anv_batch_emit_pipeline_state_protected(&cmd_buffer->batch, pipeline,
-                                              final.ds, protected);
+      anv_batch_emit_gfx(&cmd_buffer->batch, GENX(3DSTATE_DS), ds);
    }
 #endif
 
    genX(emit_breakpoint)(&cmd_buffer->batch, cmd_buffer->device, true);
 
+#undef anv_batch_emit_gfx
 #undef DEBUG_SHADER_HASH
 }
diff --git a/src/intel/vulkan/genX_cmd_draw_generated_indirect.h b/src/intel/vulkan/genX_cmd_draw_generated_indirect.h
index f5fd7c71d37..90584136ec6 100644
--- a/src/intel/vulkan/genX_cmd_draw_generated_indirect.h
+++ b/src/intel/vulkan/genX_cmd_draw_generated_indirect.h
@@ -96,18 +96,10 @@ genX(cmd_buffer_emit_generate_draws)(struct anv_cmd_buffer *cmd_buffer,
      ANV_STATE_NULL;
    UNUSED uint32_t wa_insts_offset = 0;
 
-#if INTEL_WA_16011107343_GFX_VER || INTEL_WA_22018402687_GFX_VER
-   struct anv_graphics_pipeline *pipeline =
-      anv_pipeline_to_graphics(gfx->base.pipeline);
-#endif
-
 #if INTEL_WA_16011107343_GFX_VER
    if (wa_16011107343) {
      memcpy(wa_insts_state.map + wa_insts_offset,
-             &pipeline->batch_data[
-                protected ?
-                pipeline->final.hs_protected.offset :
-                pipeline->final.hs.offset],
+             gfx->dyn_state.packed.hs,
             GENX(3DSTATE_HS_length) * 4);
      wa_insts_offset += GENX(3DSTATE_HS_length) * 4;
    }
@@ -116,10 +108,7 @@ genX(cmd_buffer_emit_generate_draws)(struct anv_cmd_buffer *cmd_buffer,
 #if INTEL_WA_22018402687_GFX_VER
    if (wa_22018402687) {
      memcpy(wa_insts_state.map + wa_insts_offset,
-             &pipeline->batch_data[
-                protected ?
-                pipeline->final.ds_protected.offset :
-                pipeline->final.ds.offset],
+             gfx->dyn_state.packed.ds,
             GENX(3DSTATE_DS_length) * 4);
      wa_insts_offset += GENX(3DSTATE_DS_length) * 4;
    }
diff --git a/src/intel/vulkan/genX_gfx_state.c b/src/intel/vulkan/genX_gfx_state.c
index 9892780b3b3..547cef56242 100644
--- a/src/intel/vulkan/genX_gfx_state.c
+++ b/src/intel/vulkan/genX_gfx_state.c
@@ -209,7 +209,7 @@ genX(streamout_prologue)(struct anv_cmd_buffer *cmd_buffer,
    if (!intel_needs_workaround(cmd_buffer->device->info, 16013994831))
      return;
 
-   if (gfx->uses_xfb) {
+   if (gfx->shaders[gfx->streamout_stage]->xfb_info != NULL) {
      genX(cmd_buffer_set_preemption)(cmd_buffer, false);
      return;
    }
@@ -417,10 +417,10 @@ want_stencil_pma_fix(const struct vk_dynamic_graphics_state *dyn,
    *     3DSTATE_WM_CHROMAKEY::ChromaKeyKillEnable) ||
    *    (3DSTATE_PS_EXTRA::Pixel Shader Computed Depth mode != PSCDEPTH_OFF)
    */
-   struct anv_shader_bin *fs_bin = gfx->shaders[MESA_SHADER_FRAGMENT];
+   struct anv_shader *fs = gfx->shaders[MESA_SHADER_FRAGMENT];
 
    return kill_pixel(wm_prog_data, dyn) ||
-          has_ds_feedback_loop(&fs_bin->bind_map, dyn) ||
+          has_ds_feedback_loop(&fs->bind_map, dyn) ||
          wm_prog_data->computed_depth_mode != PSCDEPTH_OFF;
 }
 
@@ -1012,21 +1012,21 @@ update_ps(struct anv_gfx_dynamic_state *hw_state,
      return;
    }
 
-   const struct anv_shader_bin *fs_bin = gfx->shaders[MESA_SHADER_FRAGMENT];
+   const struct anv_shader *fs = gfx->shaders[MESA_SHADER_FRAGMENT];
    struct GENX(3DSTATE_PS) ps = {};
    intel_set_ps_dispatch_state(&ps, device->info, wm_prog_data,
                                MAX2(dyn->ms.rasterization_samples, 1),
                                hw_state->fs_msaa_flags);
 
    SET(PS, ps.KernelStartPointer0,
-       fs_bin->kernel.offset +
+       fs->kernel.offset +
       brw_wm_prog_data_prog_offset(wm_prog_data, ps, 0));
    SET(PS, ps.KernelStartPointer1,
-       fs_bin->kernel.offset +
+       fs->kernel.offset +
       brw_wm_prog_data_prog_offset(wm_prog_data, ps, 1));
 #if GFX_VER < 20
    SET(PS, ps.KernelStartPointer2,
-       fs_bin->kernel.offset +
+       fs->kernel.offset +
       brw_wm_prog_data_prog_offset(wm_prog_data, ps, 2));
 #endif
 
@@ -1124,12 +1124,12 @@ update_ps_extra_kills_pixel(struct anv_gfx_dynamic_state *hw_state,
                             const struct vk_dynamic_graphics_state *dyn,
                             const struct anv_cmd_graphics_state *gfx)
 {
-   struct anv_shader_bin *fs_bin = gfx->shaders[MESA_SHADER_FRAGMENT];
+   struct anv_shader *fs = gfx->shaders[MESA_SHADER_FRAGMENT];
    const struct brw_wm_prog_data *wm_prog_data = get_gfx_wm_prog_data(gfx);
 
    SET_STAGE(PS_EXTRA, ps_extra.PixelShaderKillsPixel,
             wm_prog_data &&
-             (has_ds_feedback_loop(&fs_bin->bind_map, dyn) ||
+             (has_ds_feedback_loop(&fs->bind_map, dyn) ||
              wm_prog_data->uses_kill),
             FRAGMENT);
 }
@@ -2174,6 +2174,35 @@ update_tbimr_info(struct anv_gfx_dynamic_state *hw_state,
 }
 #endif
 
+#if GFX_VERx10 == 90
+ALWAYS_INLINE static void
+update_vs(struct anv_gfx_dynamic_state *hw_state,
+          const struct anv_cmd_graphics_state *gfx,
+          const struct anv_device *device)
+{
+   if (device->info->gt < 4)
+      return;
+
+   /* On Sky Lake GT4, we have experienced some hangs related to the VS cache
+    * and tessellation.  It is unknown exactly what is happening but the
+    * Haswell docs for the "VS Reference Count Full Force Miss Enable" field
+    * of the "Thread Mode" register refer to a HSW bug in which the VUE handle
+    * reference count would overflow resulting in internal reference counting
+    * bugs.  My (Faith's) best guess is that this bug cropped back up on SKL
+    * GT4 when we suddenly had more threads in play than any previous gfx9
+    * hardware.
+    *
+    * What we do know for sure is that setting this bit when tessellation
+    * shaders are in use fixes a GPU hang in Batman: Arkham City when playing
+    * with DXVK (https://bugs.freedesktop.org/107280).  Disabling the vertex
+    * cache with tessellation shaders should only have a minor performance
+    * impact as the tessellation shaders are likely generating and processing
+    * far more geometry than the vertex stage.
+    */
+   SET(VS, vs.VertexCacheDisable, anv_gfx_has_stage(gfx, MESA_SHADER_TESS_EVAL));
+}
+#endif
+
 #if INTEL_WA_18019110168_GFX_VER
 static inline unsigned
 compute_mesh_provoking_vertex(const struct brw_mesh_prog_data *mesh_prog_data,
@@ -2215,11 +2244,13 @@ cmd_buffer_flush_gfx_runtime_state(struct anv_gfx_dynamic_state *hw_state,
                                    const struct anv_device *device,
                                    const struct vk_dynamic_graphics_state *dyn,
                                    struct anv_cmd_graphics_state *gfx,
-                                   const struct anv_graphics_pipeline *pipeline,
                                    VkCommandBufferLevel cmd_buffer_level)
 {
    UNUSED bool fs_msaa_changed = false;
 
+   assert(gfx->shaders[gfx->streamout_stage] != NULL);
+   assert(gfx->instance_multiplier != 0);
+
    /* Do this before update_fs_msaa_flags() for primitive_id_index */
    if (gfx->dirty & ANV_CMD_DIRTY_ALL_SHADERS(device))
      update_sbe(hw_state, gfx, device);
@@ -2234,6 +2265,11 @@ cmd_buffer_flush_gfx_runtime_state(struct anv_gfx_dynamic_state *hw_state,
    if (gfx->dirty & ANV_CMD_DIRTY_PRERASTER_SHADERS)
      update_urb_config(hw_state, gfx, device);
 
+#if GFX_VERx10 == 90
+   if (gfx->dirty & ANV_CMD_DIRTY_PRERASTER_SHADERS)
+      update_vs(hw_state, gfx, device);
+#endif
+
    if ((gfx->dirty & ANV_CMD_DIRTY_PS) ||
        BITSET_TEST(hw_state->pack_dirty, ANV_GFX_STATE_FS_MSAA_FLAGS)) {
      update_ps(hw_state, device, dyn, gfx);
@@ -2482,8 +2518,7 @@ cmd_buffer_flush_gfx_runtime_state(struct anv_gfx_dynamic_state *hw_state,
 static void
 cmd_buffer_repack_gfx_state(struct anv_gfx_dynamic_state *hw_state,
                             struct anv_cmd_buffer *cmd_buffer,
-                            const struct anv_cmd_graphics_state *gfx,
-                            const struct anv_graphics_pipeline *pipeline)
+                            const struct anv_cmd_graphics_state *gfx)
 {
    struct anv_device *device = cmd_buffer->device;
    struct anv_instance *instance = device->physical->instance;
@@ -2502,73 +2537,107 @@ cmd_buffer_repack_gfx_state(struct anv_gfx_dynamic_state *hw_state,
    } while (0)
 
 #define IS_DIRTY(name) BITSET_TEST(hw_state->pack_dirty, ANV_GFX_STATE_##name)
-#define anv_gfx_copy(field, cmd, source) ({                             \
-      assert(sizeof(hw_state->packed.field) >=                          \
-             4 * __anv_cmd_length(cmd));                                \
-      assert((source).len == __anv_cmd_length(cmd));                    \
-      memcpy(&hw_state->packed.field,                                   \
-             &pipeline->batch_data[(source).offset],                    \
-             4 * __anv_cmd_length(cmd));                                \
+#define anv_gfx_copy(field, cmd, stage, source) ({                      \
+      if (gfx->shaders[stage] != NULL) {                                \
+         assert(sizeof(hw_state->packed.field) >=                       \
+                4 * __anv_cmd_length(cmd));                             \
+         assert((gfx->shaders[stage]->source).len ==                    \
+                __anv_cmd_length(cmd));                                 \
+         memcpy(&hw_state->packed.field,                                \
+                &gfx->shaders[stage]->cmd_data[                         \
+                   (gfx->shaders[stage]->source).offset],               \
+                4 * __anv_cmd_length(cmd));                             \
+      } else {                                                          \
+         anv_gfx_pack(field, cmd, __unused_name);                       \
+      }                                                                 \
    })
-#define anv_gfx_copy_variable(field, source) ({                         \
-      assert(sizeof(hw_state->packed.field) >=                          \
-             4 * (source).len);                                         \
-      memcpy(&hw_state->packed.field,                                   \
-             &pipeline->batch_data[(source).offset],                    \
-             4 * (source).len);                                         \
-      hw_state->packed.field##_len = (source).len;                      \
+#define anv_gfx_copy_variable(field, stage, source) ({                  \
+      if (gfx->shaders[stage] != NULL) {                                \
+         assert(sizeof(hw_state->packed.field) >=                       \
+                4 * gfx->shaders[stage]->source.len);                   \
+         memcpy(&hw_state->packed.field,                                \
+                &gfx->shaders[stage]->cmd_data[                         \
+                   (gfx->shaders[stage]->source).offset],               \
+                4 * gfx->shaders[stage]->source.len);                   \
+         hw_state->packed.field##_len =                                 \
+            gfx->shaders[stage]->source.len;                            \
+      }                                                                 \
    })
-#define anv_gfx_copy_protected(field, cmd, source) ({                   \
+#define anv_gfx_copy_protected(field, cmd, stage, source) ({            \
       const bool __protected = (cmd_buffer->vk.pool->flags &            \
                                 VK_COMMAND_POOL_CREATE_PROTECTED_BIT);  \
       assert(sizeof(hw_state->packed.field) >=                          \
              4 * __anv_cmd_length(cmd));                                \
-      assert((source).len == __anv_cmd_length(cmd));                    \
-      memcpy(&hw_state->packed.field,                                   \
-             &pipeline->batch_data[                                     \
-                __protected ?                                           \
-                (source##_protected).offset :                           \
-                (source).offset],                                       \
-             4 * __anv_cmd_length(cmd));                                \
+      if (gfx->shaders[stage] != NULL) {                                \
+         assert((gfx->shaders[stage]->source).len ==                    \
                __anv_cmd_length(cmd));                                  \
+         memcpy(&hw_state->packed.field,                                \
+                &gfx->shaders[stage]->cmd_data[                         \
+                   __protected ?                                        \
+                   gfx->shaders[stage]->source##_protected.offset :     \
+                   gfx->shaders[stage]->source.offset],                 \
+                4 * __anv_cmd_length(cmd));                             \
+      } else {                                                          \
+         memcpy(&hw_state->packed.field,                                \
+                device->physical->gfx_default.field,                    \
+                4 * __anv_cmd_length(cmd));                             \
+      }                                                                 \
    })
-#define anv_gfx_pack_merge(field, cmd, prepacked, name)                 \
-   for (struct cmd name = { 0 },                                        \
+#define anv_gfx_pack_merge(field, cmd, stage, source, name)             \
+   for (struct cmd name = (struct cmd) { 0 },                           \
         *_dst = (struct cmd *)hw_state->packed.field;                   \
         __builtin_expect(_dst != NULL, 1);                              \
-        ({ const struct anv_gfx_state_ptr *_cmd_state = &prepacked;     \
+        ({                                                              \
           uint32_t _partial[__anv_cmd_length(cmd)];                     \
-           assert(_cmd_state->len == __anv_cmd_length(cmd));            \
           assert(sizeof(hw_state->packed.field) >=                      \
                  4 * __anv_cmd_length(cmd));                            \
           __anv_cmd_pack(cmd)(NULL, _partial, &name);                   \
-           for (uint32_t i = 0; i < __anv_cmd_length(cmd); i++) {       \
-              assert((_partial[i] &                                     \
-                      (pipeline)->batch_data[                           \
-                         (prepacked).offset + i]) == 0);                \
-              ((uint32_t *)_dst)[i] = _partial[i] |                     \
-                 (pipeline)->batch_data[_cmd_state->offset + i];        \
+           if (gfx->shaders[stage] != NULL) {                           \
+              const struct anv_gfx_state_ptr *_cmd_state =              \
+                 &gfx->shaders[stage]->source;                          \
+              assert(_cmd_state->len == __anv_cmd_length(cmd));         \
+              for (uint32_t i = 0; i < __anv_cmd_length(cmd); i++) {    \
+                 assert((_partial[i] &                                  \
+                         gfx->shaders[stage]->cmd_data[                 \
+                            _cmd_state->offset + i]) == 0);             \
+                 ((uint32_t *)_dst)[i] = _partial[i] |                  \
+                    gfx->shaders[stage]->cmd_data[_cmd_state->offset + i]; \
+              }                                                         \
+           } else {                                                     \
+              for (uint32_t i = 0; i < __anv_cmd_length(cmd); i++) {    \
+                 assert((_partial[i] &                                  \
+                         device->physical->gfx_default.field[i]) == 0); \
+                 ((uint32_t *)_dst)[i] = _partial[i] |                  \
+                    device->physical->gfx_default.field[i];             \
+              }                                                         \
           }                                                             \
           _dst = NULL;                                                  \
-        }))
-#define anv_gfx_pack_merge_protected(field, cmd, prepacked, name)       \
-   for (struct cmd name = { 0 },                                        \
+        }))
+#define anv_gfx_pack_merge_protected(field, cmd, stage, source, name)   \
+   for (struct cmd name = (struct cmd) { 0 },                           \
         *_dst = (struct cmd *)hw_state->packed.field;                   \
         __builtin_expect(_dst != NULL, 1);                              \
-        ({ const struct anv_gfx_state_ptr *_cmd_state =                 \
-              (cmd_buffer->vk.pool->flags &                             \
-               VK_COMMAND_POOL_CREATE_PROTECTED_BIT) ?                  \
-              &prepacked##_protected : &prepacked;                      \
+        ({                                                              \
           uint32_t _partial[__anv_cmd_length(cmd)];                     \
-           assert(_cmd_state->len == __anv_cmd_length(cmd));            \
           assert(sizeof(hw_state->packed.field) >=                      \
                  4 * __anv_cmd_length(cmd));                            \
           __anv_cmd_pack(cmd)(NULL, _partial, &name);                   \
+           const struct anv_gfx_state_ptr *_cmd_state =                 \
+              gfx->shaders[stage] != NULL ?                             \
+                 ((cmd_buffer->vk.pool->flags &                         \
+                   VK_COMMAND_POOL_CREATE_PROTECTED_BIT) ?              \
+                  &gfx->shaders[stage]->source##_protected :            \
+                  &gfx->shaders[stage]->source) :                       \
+              NULL;                                                     \
+           assert(_cmd_state == NULL ||                                 \
+                  _cmd_state->len == __anv_cmd_length(cmd));            \
+           const uint32_t *_inst_data =                                 \
+              gfx->shaders[stage] != NULL ?                             \
+              &gfx->shaders[stage]->cmd_data[_cmd_state->offset] :      \
+              device->physical->gfx_default.field;                      \
           for (uint32_t i = 0; i < __anv_cmd_length(cmd); i++) {        \
-              assert((_partial[i] &                                     \
-                      (pipeline)->batch_data[                           \
-                         (prepacked).offset + i]) == 0);                \
-              ((uint32_t *)_dst)[i] = _partial[i] |                     \
-                 (pipeline)->batch_data[_cmd_state->offset + i];        \
+              assert((_partial[i] & _inst_data[i]) == 0);               \
+              ((uint32_t *)_dst)[i] = _partial[i] | _inst_data[i];      \
           }                                                             \
           _dst = NULL;                                                  \
        }))
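
Note: every anv_gfx_pack_merge*() expansion above relies on the same invariant: the DWords prepacked against the shader (static fields) and the DWords packed at record time (dynamic fields) must occupy disjoint bits, so the merge is a plain per-DWord OR. A freestanding sketch of that step, with bare arrays standing in for cmd_data and the repacked partial state:

#include <assert.h>
#include <stdint.h>

/* Merge dynamic fields (packed with every static field left zero) into
 * the DWords prepacked for the shader.  The assert is the disjointness
 * check the macros above perform for each DWord. */
static void
merge_packed_dwords(uint32_t *dst,
                    const uint32_t *dyn_partial,
                    const uint32_t *prepacked_static,
                    uint32_t len)
{
   for (uint32_t i = 0; i < len; i++) {
      assert((dyn_partial[i] & prepacked_static[i]) == 0);
      dst[i] = dyn_partial[i] | prepacked_static[i];
   }
}

When no shader is bound to the stage, device->physical->gfx_default.field plays the role of prepacked_static here, so unbound stages merge against device-wide defaults instead of pipeline data.
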
@@ -2624,19 +2693,19 @@ cmd_buffer_repack_gfx_state(struct anv_gfx_dynamic_state *hw_state,
 #endif
 
    if (IS_DIRTY(VF_SGVS))
-      anv_gfx_copy(vf_sgvs, GENX(3DSTATE_VF_SGVS), pipeline->final.vf_sgvs);
+      anv_gfx_copy(vf_sgvs, GENX(3DSTATE_VF_SGVS), MESA_SHADER_VERTEX, vs.vf_sgvs);
 
 #if GFX_VER >= 11
    if (IS_DIRTY(VF_SGVS_2))
-      anv_gfx_copy(vf_sgvs_2, GENX(3DSTATE_VF_SGVS_2), pipeline->final.vf_sgvs_2);
+      anv_gfx_copy(vf_sgvs_2, GENX(3DSTATE_VF_SGVS_2), MESA_SHADER_VERTEX, vs.vf_sgvs_2);
 #endif
 
    if (IS_DIRTY(VF_SGVS_INSTANCING))
-      anv_gfx_copy_variable(vf_sgvs_instancing, pipeline->final.vf_sgvs_instancing);
+      anv_gfx_copy_variable(vf_sgvs_instancing, MESA_SHADER_VERTEX, vs.vf_sgvs_instancing);
 
    if (instance->vf_component_packing && IS_DIRTY(VF_COMPONENT_PACKING)) {
       anv_gfx_copy(vf_component_packing, GENX(3DSTATE_VF_COMPONENT_PACKING),
-                   pipeline->final.vf_component_packing);
+                   MESA_SHADER_VERTEX, vs.vf_component_packing);
    }
 
    if (IS_DIRTY(INDEX_BUFFER)) {
@@ -2655,7 +2724,7 @@ cmd_buffer_repack_gfx_state(struct anv_gfx_dynamic_state *hw_state,
 
    if (IS_DIRTY(STREAMOUT)) {
       anv_gfx_pack_merge(so, GENX(3DSTATE_STREAMOUT),
-                         pipeline->partial.so, so) {
+                         gfx->streamout_stage, so, so) {
          SET(so, so, RenderingDisable);
          SET(so, so, RenderStreamSelect);
          SET(so, so, ReorderMode);
@@ -2664,7 +2733,7 @@ cmd_buffer_repack_gfx_state(struct anv_gfx_dynamic_state *hw_state,
    }
 
    if (IS_DIRTY(SO_DECL_LIST))
-      anv_gfx_copy_variable(so_decl_list, pipeline->final.so_decl_list);
+      anv_gfx_copy_variable(so_decl_list, gfx->streamout_stage, so_decl_list);
 
    if (IS_DIRTY(CLIP)) {
       anv_gfx_pack(clip, GENX(3DSTATE_CLIP), clip) {
@@ -2886,7 +2955,8 @@ cmd_buffer_repack_gfx_state(struct anv_gfx_dynamic_state *hw_state,
 
    if (IS_DIRTY(TE)) {
       if (anv_gfx_has_stage(gfx, MESA_SHADER_TESS_EVAL)) {
-         anv_gfx_pack_merge(te, GENX(3DSTATE_TE), pipeline->partial.te, te) {
+         anv_gfx_pack_merge(te, GENX(3DSTATE_TE),
+                            MESA_SHADER_TESS_EVAL, ds.te, te) {
            SET(te, te, OutputTopology);
 #if GFX_VERx10 >= 125
            SET(te, te, TessellationDistributionMode);
@@ -2986,7 +3056,8 @@ cmd_buffer_repack_gfx_state(struct anv_gfx_dynamic_state *hw_state,
    }
 
    if (IS_DIRTY(WM)) {
-      anv_gfx_pack_merge(wm, GENX(3DSTATE_WM), pipeline->partial.wm, wm) {
+      anv_gfx_pack_merge(wm, GENX(3DSTATE_WM),
+                         MESA_SHADER_FRAGMENT, ps.wm, wm) {
          SET(wm, wm, LineStippleEnable);
          SET(wm, wm, BarycentricInterpolationMode);
       }
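
Note: for the *_protected() copies used below, the protected and unprotected prepacked variants of an instruction differ only in which scratch surface they reference, so the choice can be deferred to copy time and keyed off the command pool's flags. A reduced sketch of that selection, using a hypothetical two-variant holder in place of the source/source##_protected pair:

#include <stdbool.h>
#include <stdint.h>

/* Hypothetical holder: both prepacked variants of one instruction. */
struct prepacked_pair {
   const uint32_t *plain;      /* regular scratch surface */
   const uint32_t *protected_; /* protected scratch surface */
};

/* Mirrors the VK_COMMAND_POOL_CREATE_PROTECTED_BIT test performed by
 * anv_gfx_copy_protected() / anv_gfx_pack_merge_protected(). */
static const uint32_t *
select_prepacked(const struct prepacked_pair *p, bool pool_is_protected)
{
   return pool_is_protected ? p->protected_ : p->plain;
}
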
@@ -3079,12 +3150,12 @@ cmd_buffer_repack_gfx_state(struct anv_gfx_dynamic_state *hw_state,
    }
 
 #if GFX_VERx10 >= 125
-   if (device->vk.enabled_features.meshShader) {
+   if (device->vk.enabled_extensions.EXT_mesh_shader) {
       if (IS_DIRTY(MESH_CONTROL)) {
          if (anv_gfx_has_stage(gfx, MESA_SHADER_MESH)) {
            anv_gfx_copy_protected(mesh_control, GENX(3DSTATE_MESH_CONTROL),
-                                   pipeline->final.mesh_control);
+                                   MESA_SHADER_MESH, ms.control);
          } else {
            anv_gfx_pack(mesh_control, GENX(3DSTATE_MESH_CONTROL), mc);
          }
@@ -3092,8 +3163,9 @@ cmd_buffer_repack_gfx_state(struct anv_gfx_dynamic_state *hw_state,
 
       if (IS_DIRTY(TASK_CONTROL)) {
          if (anv_gfx_has_stage(gfx, MESA_SHADER_TASK)) {
-            anv_gfx_copy_protected(task_control, GENX(3DSTATE_TASK_CONTROL),
-                                   pipeline->final.task_control);
+            anv_gfx_copy_protected(task_control,
+                                   GENX(3DSTATE_TASK_CONTROL),
+                                   MESA_SHADER_TASK, ts.control);
          } else {
            anv_gfx_pack(task_control, GENX(3DSTATE_TASK_CONTROL), tc);
          }
@@ -3101,101 +3173,86 @@ cmd_buffer_repack_gfx_state(struct anv_gfx_dynamic_state *hw_state,
 
       if (IS_DIRTY(MESH_SHADER)) {
          anv_gfx_copy(mesh_shader, GENX(3DSTATE_MESH_SHADER),
-                      pipeline->final.mesh_shader);
+                      MESA_SHADER_MESH, ms.shader);
       }
 
       if (IS_DIRTY(MESH_DISTRIB)) {
          anv_gfx_copy(mesh_distrib, GENX(3DSTATE_MESH_DISTRIB),
-                      pipeline->final.mesh_distrib);
+                      MESA_SHADER_MESH, ms.distrib);
       }
 
       if (IS_DIRTY(CLIP_MESH)) {
          anv_gfx_copy(clip_mesh, GENX(3DSTATE_CLIP_MESH),
-                      pipeline->final.clip_mesh);
+                      MESA_SHADER_MESH, ms.clip);
       }
 
       if (IS_DIRTY(TASK_SHADER)) {
          anv_gfx_copy(task_shader, GENX(3DSTATE_TASK_SHADER),
-                      pipeline->final.task_shader);
+                      MESA_SHADER_TASK, ts.shader);
       }
 
       if (IS_DIRTY(TASK_REDISTRIB)) {
          anv_gfx_copy(task_redistrib, GENX(3DSTATE_TASK_REDISTRIB),
-                      pipeline->final.task_redistrib);
+                      MESA_SHADER_TASK, ts.redistrib);
       }
    }
 #endif /* GFX_VERx10 >= 125 */
 
    if (IS_DIRTY(VS)) {
-      if (anv_gfx_has_stage(gfx, MESA_SHADER_VERTEX)) {
-         anv_gfx_copy_protected(vs, GENX(3DSTATE_VS), pipeline->final.vs);
-      } else {
-         anv_gfx_pack(vs, GENX(3DSTATE_VS), vs);
+#if GFX_VERx10 == 90
+      anv_gfx_pack_merge_protected(vs, GENX(3DSTATE_VS),
+                                   MESA_SHADER_VERTEX, vs.vs, vs) {
+         SET(vs, vs, VertexCacheDisable);
       }
+#else
+      anv_gfx_copy_protected(vs, GENX(3DSTATE_VS), MESA_SHADER_VERTEX, vs.vs);
+#endif
    }
 
-   if (IS_DIRTY(HS)) {
-      if (anv_gfx_has_stage(gfx, MESA_SHADER_TESS_CTRL)) {
-         anv_gfx_copy_protected(hs, GENX(3DSTATE_HS), pipeline->final.hs);
-      } else {
-         anv_gfx_pack(hs, GENX(3DSTATE_HS), hs);
-      }
-   }
+   if (IS_DIRTY(HS))
+      anv_gfx_copy_protected(hs, GENX(3DSTATE_HS), MESA_SHADER_TESS_CTRL, hs.hs);
 
-   if (IS_DIRTY(DS)) {
-      if (anv_gfx_has_stage(gfx, MESA_SHADER_TESS_EVAL)) {
-         anv_gfx_copy_protected(ds, GENX(3DSTATE_DS), pipeline->final.ds);
-      } else {
-         anv_gfx_pack(ds, GENX(3DSTATE_DS), ds);
-      }
-   }
+   if (IS_DIRTY(DS))
+      anv_gfx_copy_protected(ds, GENX(3DSTATE_DS), MESA_SHADER_TESS_EVAL, ds.ds);
 
    if (IS_DIRTY(GS)) {
-      if (anv_gfx_has_stage(gfx, MESA_SHADER_GEOMETRY)) {
-         anv_gfx_pack_merge_protected(gs, GENX(3DSTATE_GS),
-                                      pipeline->partial.gs, gs) {
-            SET(gs, gs, ReorderMode);
-         }
-      } else {
-         anv_gfx_pack(gs, GENX(3DSTATE_GS), gs);
+      anv_gfx_pack_merge_protected(gs, GENX(3DSTATE_GS),
+                                   MESA_SHADER_GEOMETRY, gs.gs, gs) {
+         SET(gs, gs, ReorderMode);
       }
    }
 
    if (IS_DIRTY(PS)) {
-      if (anv_gfx_has_stage(gfx, MESA_SHADER_FRAGMENT)) {
-         anv_gfx_pack_merge_protected(ps, GENX(3DSTATE_PS),
-                                      pipeline->partial.ps, ps) {
-            SET(ps, ps, KernelStartPointer0);
-            SET(ps, ps, KernelStartPointer1);
-            SET(ps, ps, DispatchGRFStartRegisterForConstantSetupData0);
-            SET(ps, ps, DispatchGRFStartRegisterForConstantSetupData1);
+      anv_gfx_pack_merge_protected(ps, GENX(3DSTATE_PS),
+                                   MESA_SHADER_FRAGMENT, ps.ps, ps) {
+         SET(ps, ps, KernelStartPointer0);
+         SET(ps, ps, KernelStartPointer1);
+         SET(ps, ps, DispatchGRFStartRegisterForConstantSetupData0);
+         SET(ps, ps, DispatchGRFStartRegisterForConstantSetupData1);
 #if GFX_VER < 20
-            SET(ps, ps, KernelStartPointer2);
-            SET(ps, ps, DispatchGRFStartRegisterForConstantSetupData2);
+         SET(ps, ps, KernelStartPointer2);
+         SET(ps, ps, DispatchGRFStartRegisterForConstantSetupData2);
 
-            SET(ps, ps, _8PixelDispatchEnable);
-            SET(ps, ps, _16PixelDispatchEnable);
-            SET(ps, ps, _32PixelDispatchEnable);
+         SET(ps, ps, _8PixelDispatchEnable);
+         SET(ps, ps, _16PixelDispatchEnable);
+         SET(ps, ps, _32PixelDispatchEnable);
 #else
-            SET(ps, ps, Kernel0Enable);
-            SET(ps, ps, Kernel1Enable);
-            SET(ps, ps, Kernel0SIMDWidth);
-            SET(ps, ps, Kernel1SIMDWidth);
-            SET(ps, ps, Kernel0PolyPackingPolicy);
-            SET(ps, ps, Kernel0MaximumPolysperThread);
+         SET(ps, ps, Kernel0Enable);
+         SET(ps, ps, Kernel1Enable);
+         SET(ps, ps, Kernel0SIMDWidth);
+         SET(ps, ps, Kernel1SIMDWidth);
+         SET(ps, ps, Kernel0PolyPackingPolicy);
+         SET(ps, ps, Kernel0MaximumPolysperThread);
 #endif
-            SET(ps, ps, PositionXYOffsetSelect);
-         }
-      } else {
-         anv_gfx_pack(ps, GENX(3DSTATE_PS), ps);
+         SET(ps, ps, PositionXYOffsetSelect);
       }
    }
 
    if (IS_DIRTY(PS_EXTRA)) {
       if (anv_gfx_has_stage(gfx, MESA_SHADER_FRAGMENT)) {
          anv_gfx_pack_merge(ps_extra, GENX(3DSTATE_PS_EXTRA),
-                            pipeline->partial.ps_extra, pse) {
+                            MESA_SHADER_FRAGMENT, ps.ps_extra, pse) {
            SET(pse, ps_extra, PixelShaderHasUAV);
            SET(pse, ps_extra, PixelShaderIsPerSample);
 #if GFX_VER >= 11
@@ -3213,7 +3270,7 @@ cmd_buffer_repack_gfx_state(struct anv_gfx_dynamic_state *hw_state,
        * change through pre-rasterization shader) or if we notice a change.
        */
       anv_gfx_pack_merge(ps_extra_dep, GENX(3DSTATE_PS_EXTRA),
-                         pipeline->partial.ps_extra, pse) {
+                         MESA_SHADER_FRAGMENT, ps.ps_extra, pse) {
          SET(pse, ps_extra, PixelShaderHasUAV);
          SET(pse, ps_extra, PixelShaderIsPerSample);
 #if GFX_VER >= 11
@@ -3269,15 +3326,13 @@ genX(cmd_buffer_flush_gfx_runtime_state)(struct anv_cmd_buffer *cmd_buffer)
                                       cmd_buffer->device,
                                       &cmd_buffer->vk.dynamic_graphics_state,
                                       &cmd_buffer->state.gfx,
-                                      anv_pipeline_to_graphics(cmd_buffer->state.gfx.base.pipeline),
                                       cmd_buffer->vk.level);
 
    vk_dynamic_graphics_state_clear_dirty(&cmd_buffer->vk.dynamic_graphics_state);
 
    cmd_buffer_repack_gfx_state(&cmd_buffer->state.gfx.dyn_state,
                                cmd_buffer,
-                               &cmd_buffer->state.gfx,
-                               anv_pipeline_to_graphics(cmd_buffer->state.gfx.base.pipeline));
+                               &cmd_buffer->state.gfx);
 }
 
 static void
@@ -3431,8 +3486,6 @@ cmd_buffer_gfx_state_emission(struct anv_cmd_buffer *cmd_buffer)
    struct anv_device *device = cmd_buffer->device;
    struct anv_instance *instance = device->physical->instance;
    struct anv_cmd_graphics_state *gfx = &cmd_buffer->state.gfx;
-   struct anv_graphics_pipeline *pipeline =
-      anv_pipeline_to_graphics(gfx->base.pipeline);
    const struct vk_dynamic_graphics_state *dyn =
       &cmd_buffer->vk.dynamic_graphics_state;
    struct anv_push_constants *push_consts =
@@ -3493,7 +3546,7 @@ cmd_buffer_gfx_state_emission(struct anv_cmd_buffer *cmd_buffer)
      const struct brw_mesh_prog_data *mesh_prog_data = get_gfx_mesh_prog_data(gfx);
      if (mesh_prog_data) {
         push_consts->gfx.fs_per_prim_remap_offset =
-            pipeline->base.shaders[MESA_SHADER_MESH]->kernel.offset +
+            gfx->shaders[MESA_SHADER_MESH]->kernel.offset +
            mesh_prog_data->wa_18019110168_mapping_offset;
      }
 
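
Note: with no pipeline object to carry a precomputed uses_xfb flag, the Wa_16011773973 paths below ask the bound streamout stage directly whether it recorded transform-feedback outputs. A sketch of that predicate, assuming a pared-down shader struct that only exposes the xfb_info pointer (the real code asserts earlier that the streamout shader is non-NULL, so it dereferences unconditionally):

#include <stdbool.h>
#include <stddef.h>

struct xfb_shader {        /* stand-in for struct anv_shader */
   const void *xfb_info;   /* non-NULL when the stage writes XFB */
};

/* The condition that replaces pipeline->uses_xfb in this patch. */
static bool
streamout_stage_uses_xfb(struct xfb_shader *const *shaders,
                         unsigned streamout_stage)
{
   const struct xfb_shader *s = shaders[streamout_stage];
   return s != NULL && s->xfb_info != NULL;
}
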
@@ -3576,7 +3629,7 @@ cmd_buffer_gfx_state_emission(struct anv_cmd_buffer *cmd_buffer)
     *   3. Send 3D State SOL with SOL Enabled
     */
    if (intel_needs_workaround(device->info, 16011773973) &&
-       pipeline->uses_xfb)
+       gfx->shaders[gfx->streamout_stage]->xfb_info != NULL)
       anv_batch_emit(batch, GENX(3DSTATE_STREAMOUT), so);
 
    anv_batch_emit_gfx_variable(batch, so_decl_list);
@@ -3597,7 +3650,7 @@ cmd_buffer_gfx_state_emission(struct anv_cmd_buffer *cmd_buffer)
    }
 
 #if GFX_VERx10 >= 125
-   if (device->vk.enabled_features.meshShader) {
+   if (device->vk.enabled_extensions.EXT_mesh_shader) {
       if (IS_DIRTY(MESH_CONTROL))
         anv_batch_emit_gfx(batch, GENX(3DSTATE_MESH_CONTROL), mesh_control);
 
@@ -3670,8 +3723,8 @@ cmd_buffer_gfx_state_emission(struct anv_cmd_buffer *cmd_buffer)
       anv_batch_emit_gfx(batch, GENX(3DSTATE_VF_TOPOLOGY), vft);
 
    if (IS_DIRTY(VERTEX_INPUT)) {
-      genX(batch_emit_pipeline_vertex_input)(batch, device,
-                                             pipeline, dyn->vi);
+      genX(batch_emit_vertex_input)(batch, device,
+                                    gfx->shaders[MESA_SHADER_VERTEX], dyn->vi);
    }
 
    if (IS_DIRTY(TE))
@@ -3823,8 +3876,6 @@ genX(cmd_buffer_flush_gfx_hw_state)(struct anv_cmd_buffer *cmd_buffer)
 {
    struct anv_device *device = cmd_buffer->device;
    struct anv_cmd_graphics_state *gfx = &cmd_buffer->state.gfx;
-   struct anv_graphics_pipeline *pipeline =
-      anv_pipeline_to_graphics(cmd_buffer->state.gfx.base.pipeline);
    struct anv_gfx_dynamic_state *hw_state = &gfx->dyn_state;
 
    if (INTEL_DEBUG(DEBUG_REEMIT)) {
@@ -3863,7 +3914,7 @@ genX(cmd_buffer_flush_gfx_hw_state)(struct anv_cmd_buffer *cmd_buffer)
    * it after.
    */
    if (intel_needs_workaround(device->info, 16011773973) &&
-       pipeline->uses_xfb &&
+       gfx->shaders[gfx->streamout_stage]->xfb_info != NULL &&
        BITSET_TEST(hw_state->emit_dirty, ANV_GFX_STATE_SO_DECL_LIST)) {
       BITSET_SET(hw_state->emit_dirty, ANV_GFX_STATE_STREAMOUT);
    }
diff --git a/src/intel/vulkan/genX_shader.c b/src/intel/vulkan/genX_shader.c
index 3dbe549d244..0f13df1c744 100644
--- a/src/intel/vulkan/genX_shader.c
+++ b/src/intel/vulkan/genX_shader.c
@@ -569,31 +569,6 @@ emit_vs_shader(struct anv_batch *batch,
       vs.SoftwareExceptionEnable = false;
       vs.MaximumNumberofThreads = devinfo->max_vs_threads - 1;
 
-#if 0
-      /* TODO: move to shader binding */
-      if (GFX_VER == 9 && devinfo->gt == 4 &&
-          anv_pipeline_has_stage(pipeline, MESA_SHADER_TESS_EVAL)) {
-         /* On Sky Lake GT4, we have experienced some hangs related to the VS
-          * cache and tessellation. It is unknown exactly what is happening
-          * but the Haswell docs for the "VS Reference Count Full Force Miss
-          * Enable" field of the "Thread Mode" register refer to a HSW bug in
-          * which the VUE handle reference count would overflow resulting in
-          * internal reference counting bugs. My (Faith's) best guess is that
-          * this bug cropped back up on SKL GT4 when we suddenly had more
-          * threads in play than any previous gfx9 hardware.
-          *
-          * What we do know for sure is that setting this bit when
-          * tessellation shaders are in use fixes a GPU hang in Batman: Arkham
-          * City when playing with DXVK (https://bugs.freedesktop.org/107280).
-          * Disabling the vertex cache with tessellation shaders should only
-          * have a minor performance impact as the tessellation shaders are
-          * likely generating and processing far more geometry than the vertex
-          * stage.
-          */
-         vs.VertexCacheDisable = true;
-      }
-#endif
-
       vs.VertexURBEntryReadLength = vs_prog_data->base.urb_read_length;
       vs.VertexURBEntryReadOffset = 0;
       vs.DispatchGRFStartRegisterForURBData =
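
Note: the recurring pattern in this patch is that per-stage HW state now hangs off gfx->shaders[stage] rather than a monolithic pipeline, with device->physical->gfx_default supplying packed defaults for unbound stages. A minimal sketch of that lookup pattern, assuming anv_gfx_has_stage() is simply a NULL test on the stage's shader slot (its definition is not shown in this patch):

#include <stdbool.h>
#include <stddef.h>
#include <stdint.h>

#define SKETCH_NUM_STAGES 8   /* stand-in for the MESA_SHADER_* count */

struct sketch_shader {
   const uint32_t *cmd_data;  /* prepacked instruction DWords */
};

struct sketch_gfx_state {
   const struct sketch_shader *shaders[SKETCH_NUM_STAGES];
};

/* Presumed shape of anv_gfx_has_stage(): a stage is active iff a
 * shader object is bound to its slot. */
static bool
sketch_has_stage(const struct sketch_gfx_state *gfx, unsigned stage)
{
   return gfx->shaders[stage] != NULL;
}

/* Shader-provided DWords when the stage is bound, device defaults
 * otherwise (the gfx_default fallback used by the repack macros). */
static const uint32_t *
prepacked_or_default(const struct sketch_gfx_state *gfx, unsigned stage,
                     const uint32_t *default_dwords)
{
   return sketch_has_stage(gfx, stage) ?
          gfx->shaders[stage]->cmd_data : default_dwords;
}
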