diff --git a/docs/drivers/anv.rst b/docs/drivers/anv.rst
index 05a5534a26b..edc3ec8e8c4 100644
--- a/docs/drivers/anv.rst
+++ b/docs/drivers/anv.rst
@@ -7,6 +7,9 @@ Debugging
 Here are a few debug environment variables
 specific to ANV:
 
+:envvar:`ANV_ENABLE_GENERATED_INDIRECT_DRAWS`
+   If defined to ``0`` or ``false``, this will disable the generated
+   indirect draw optimization in Anv. This only affects Gfx11+.
 :envvar:`ANV_ENABLE_PIPELINE_CACHE`
    If defined to ``0`` or ``false``, this will disable pipeline
    caching, forcing ANV to reparse and recompile any VkShaderModule
@@ -272,3 +275,34 @@ checking for ``ANV_CMD_DIRTY_PIPELINE``. It should only do so if it
 requires to know some value that is coming from the
 ``anv_graphics_pipeline`` object that is not available from
 ``anv_dynamic_state``.
+
+
+Generated indirect draws optimization
+-------------------------------------
+
+Indirect draws have traditionally been implemented on Intel HW by
+loading the indirect parameters from memory into HW registers using
+the command streamer's ``MI_LOAD_REGISTER_MEM`` instruction before
+dispatching a draw call to the 3D pipeline.
+
+On recent products, the command streamer has been found to be a
+performance bottleneck, because it cannot dispatch draw calls fast
+enough to keep the 3D pipeline busy.
+
+The solution to this problem is to change the way we deal with
+indirect draws. Instead of loading HW registers with values using the
+command streamer, we generate an entire set of ``3DPRIMITIVE``
+instructions using a shader. The generated instructions contain all
+of the draw call parameters. This way the command streamer executes
+only ``3DPRIMITIVE`` instructions and does not load any data from
+memory or touch HW registers, feeding the 3D pipeline as fast as it
+can.
+
+In Anv this is implemented using a side batch buffer. When Anv
+encounters the first indirect draw, it emits a jump into the side
+batch; for each indirect draw, the side batch contains a draw call
+running a generation shader. We keep adding more generation draws to
+the side batch until we have to stop at the end of the command
+buffer, at a secondary command buffer call, or at a barrier with the
+access flag ``VK_ACCESS_INDIRECT_COMMAND_READ_BIT``. The side batch
+buffer then jumps back right after the instruction that called it.
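For illustration only (not part of the patch), here is a small host-side C sketch of what each generation shader invocation effectively does: it packs the same 10-dword 3DPRIMITIVE layout that generated_draws.glsl (further down in this patch) writes for the non-indexed case. The struct and function names are made up for the example, and the instance_multiplier handling from the shader is omitted.

#include <stdint.h>
#include <stdio.h>

struct draw_indirect {                /* mirrors VkDrawIndirectCommand */
   uint32_t vertex_count;
   uint32_t instance_count;
   uint32_t first_vertex;
   uint32_t first_instance;
};

static void
pack_3dprimitive_extended(uint32_t cmd[10], const struct draw_indirect *draw,
                          uint32_t draw_id, uint32_t is_predicated)
{
   cmd[0] = (3u << 29 |               /* Command Type */
             3u << 27 |               /* Command SubType */
             3u << 24 |               /* 3D Command Opcode */
             1u << 11 |               /* Extended Parameter Enable */
             is_predicated << 8 |
             8u << 0);                /* DWord Length */
   cmd[1] = 0;                        /* sequential (non-indexed) access */
   cmd[2] = draw->vertex_count;       /* Vertex Count Per Instance */
   cmd[3] = draw->first_vertex;       /* Start Vertex Location */
   cmd[4] = draw->instance_count;     /* Instance Count */
   cmd[5] = draw->first_instance;     /* Start Instance Location */
   cmd[6] = 0;                        /* Base Vertex Location */
   cmd[7] = draw->first_vertex;       /* extended parameter: gl_BaseVertex */
   cmd[8] = draw->first_instance;     /* extended parameter: gl_BaseInstance */
   cmd[9] = draw_id;                  /* extended parameter: gl_DrawID */
}

int main(void)
{
   struct draw_indirect draw = { .vertex_count = 3, .instance_count = 1 };
   uint32_t cmd[10];
   pack_3dprimitive_extended(cmd, &draw, 0 /* draw_id */, 0 /* not predicated */);
   for (int i = 0; i < 10; i++)
      printf("DW%d: 0x%08x\n", i, (unsigned)cmd[i]);
   return 0;
}

The DWord 0 bit layout and the per-dword assignments are copied from the shader; in the driver, the shader writes these dwords through an A64 store into the pre-allocated 3DPRIMITIVE slots of the main batch.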
diff --git a/src/intel/ds/intel_tracepoints.py b/src/intel/ds/intel_tracepoints.py index 2ad7a37e4a7..d43cf2ec28e 100644 --- a/src/intel/ds/intel_tracepoints.py +++ b/src/intel/ds/intel_tracepoints.py @@ -84,6 +84,8 @@ def define_tracepoints(args): Arg(type='enum isl_format', name='src_fmt', var='src_fmt', c_format='%s', to_prim_type='isl_format_get_short_name({})'), ]) + begin_end_tp('generate_draws') + begin_end_tp('draw', tp_args=[Arg(type='uint32_t', var='count', c_format='%u')]) begin_end_tp('draw_multi', diff --git a/src/intel/vulkan/anv_batch_chain.c b/src/intel/vulkan/anv_batch_chain.c index fd64598eb47..8bf46b9ea64 100644 --- a/src/intel/vulkan/anv_batch_chain.c +++ b/src/intel/vulkan/anv_batch_chain.c @@ -193,6 +193,14 @@ anv_batch_emit_ensure_space(struct anv_batch *batch, uint32_t size) return VK_SUCCESS; } +void +anv_batch_advance(struct anv_batch *batch, uint32_t size) +{ + assert(batch->next + size <= batch->end); + + batch->next += size; +} + struct anv_address anv_batch_address(struct anv_batch *batch, void *batch_location) { diff --git a/src/intel/vulkan/anv_device.c b/src/intel/vulkan/anv_device.c index 68b0bdfa7b2..fa75222efad 100644 --- a/src/intel/vulkan/anv_device.c +++ b/src/intel/vulkan/anv_device.c @@ -72,6 +72,7 @@ static const driOptionDescription anv_dri_options[] = { DRI_CONF_ANV_ASSUME_FULL_SUBGROUPS(false) DRI_CONF_ANV_SAMPLE_MASK_OUT_OPENGL_BEHAVIOUR(false) DRI_CONF_ANV_FP64_WORKAROUND_ENABLED(false) + DRI_CONF_ANV_GENERATED_INDIRECT_THRESHOLD(4) DRI_CONF_SECTION_END DRI_CONF_SECTION_DEBUG @@ -923,6 +924,12 @@ anv_physical_device_try_create(struct vk_instance *vk_instance, if (debug_get_bool_option("ANV_QUEUE_THREAD_DISABLE", false)) device->has_exec_timeline = false; + + device->generated_indirect_draws = + device->info.ver >= 11 && + debug_get_bool_option("ANV_ENABLE_GENERATED_INDIRECT_DRAWS", + true); + unsigned st_idx = 0; device->sync_syncobj_type = vk_drm_syncobj_get_type(fd); @@ -1104,6 +1111,8 @@ anv_init_dri_options(struct anv_instance *instance) driQueryOptionf(&instance->dri_options, "lower_depth_range_rate"); instance->fp64_workaround_enabled = driQueryOptionb(&instance->dri_options, "fp64_workaround_enabled"); + instance->generated_indirect_threshold = + driQueryOptioni(&instance->dri_options, "generated_indirect_threshold"); } VkResult anv_CreateInstance( @@ -3660,6 +3669,8 @@ VkResult anv_CreateDevice( anv_device_init_border_colors(device); + anv_device_init_generated_indirect_draws(device); + anv_device_perf_init(device); anv_device_utrace_init(device); @@ -3747,6 +3758,8 @@ void anv_DestroyDevice( anv_device_finish_rt_shaders(device); + anv_device_finish_generated_indirect_draws(device); + vk_pipeline_cache_destroy(device->internal_cache, NULL); vk_pipeline_cache_destroy(device->default_pipeline_cache, NULL); diff --git a/src/intel/vulkan/anv_generated_indirect_draws.c b/src/intel/vulkan/anv_generated_indirect_draws.c new file mode 100644 index 00000000000..10d8cfbb659 --- /dev/null +++ b/src/intel/vulkan/anv_generated_indirect_draws.c @@ -0,0 +1,341 @@ +/* + * Copyright © 2022 Intel Corporation + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following 
conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+#include "anv_private.h"
+
+#include "compiler/brw_compiler.h"
+#include "compiler/brw_nir.h"
+#include "compiler/spirv/nir_spirv.h"
+#include "dev/intel_debug.h"
+#include "util/macros.h"
+
+#include "anv_generated_indirect_draws.h"
+
+#include "shaders/generated_draws_spv.h"
+#include "shaders/generated_draws_count_spv.h"
+
+/* This pass takes vulkan descriptor bindings 0 & 1 and turns them into global
+ * 64bit addresses. Binding 2 is left as a UBO that would normally be accessed
+ * through the binding table, but it is fully promoted to push constants.
+ *
+ * As a result we're not using the binding table at all, which is nice because
+ * the side command buffer we use for the generating shader does not
+ * interact with the binding table allocation.
+ */
+static bool
+lower_vulkan_descriptors_instr(nir_builder *b, nir_instr *instr, void *cb_data)
+{
+   if (instr->type != nir_instr_type_intrinsic)
+      return false;
+
+   nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr);
+   if (intrin->intrinsic != nir_intrinsic_load_vulkan_descriptor)
+      return false;
+
+   nir_instr *res_index_instr = intrin->src[0].ssa->parent_instr;
+   assert(res_index_instr->type == nir_instr_type_intrinsic);
+   nir_intrinsic_instr *res_index_intrin =
+      nir_instr_as_intrinsic(res_index_instr);
+   assert(res_index_intrin->intrinsic == nir_intrinsic_vulkan_resource_index);
+
+   b->cursor = nir_after_instr(instr);
+
+   nir_ssa_def *desc_value = NULL;
+   switch (nir_intrinsic_binding(res_index_intrin)) {
+   case 0: {
+      desc_value =
+         nir_load_ubo(b, 1, 64,
+                      nir_imm_int(b, 2),
+                      nir_imm_int(b,
+                                  offsetof(struct anv_generate_indirect_params,
+                                           indirect_data_addr)),
+                      .align_mul = 8,
+                      .align_offset = 0,
+                      .range_base = 0,
+                      .range = ~0);
+      desc_value =
+         nir_vec4(b,
+                  nir_unpack_64_2x32_split_x(b, desc_value),
+                  nir_unpack_64_2x32_split_y(b, desc_value),
+                  nir_imm_int(b, 0),
+                  nir_imm_int(b, 0));
+      break;
+   }
+
+   case 1: {
+      desc_value =
+         nir_load_ubo(b, 1, 64,
+                      nir_imm_int(b, 2),
+                      nir_imm_int(b,
+                                  offsetof(struct anv_generate_indirect_params,
+                                           generated_cmds_addr)),
+                      .align_mul = 8,
+                      .align_offset = 0,
+                      .range_base = 0,
+                      .range = ~0);
+      desc_value =
+         nir_vec4(b,
+                  nir_unpack_64_2x32_split_x(b, desc_value),
+                  nir_unpack_64_2x32_split_y(b, desc_value),
+                  nir_imm_int(b, 0),
+                  nir_imm_int(b, 0));
+      break;
+   }
+
+   case 2:
+      desc_value =
+         nir_vec2(b,
+                  nir_imm_int(b, 2),
+                  nir_imm_int(b, 0));
+      break;
+   }
+
+   nir_ssa_def_rewrite_uses(&intrin->dest.ssa, desc_value);
+
+   return true;
+}
+
+static bool
+lower_vulkan_descriptors(nir_shader *shader)
+{
+   return nir_shader_instructions_pass(shader,
+                                       lower_vulkan_descriptors_instr,
+                                       nir_metadata_block_index |
+                                       nir_metadata_dominance,
+                                       NULL);
+}
+
+static struct anv_shader_bin *
+compile_upload_spirv(struct anv_device *device,
+                     const void *key,
+                     uint32_t key_size,
+                     const uint32_t *spirv_source,
+
uint32_t spirv_source_size, + uint32_t sends_count_expectation) +{ + struct spirv_to_nir_options spirv_options = { + .caps = { + }, + .ubo_addr_format = nir_address_format_32bit_index_offset, + .ssbo_addr_format = nir_address_format_64bit_global_32bit_offset, + .environment = NIR_SPIRV_VULKAN, + .create_library = false, + }; + const nir_shader_compiler_options *nir_options = + device->physical->compiler->nir_options[MESA_SHADER_FRAGMENT]; + + nir_shader* nir = + spirv_to_nir(spirv_source, spirv_source_size, + NULL, 0, MESA_SHADER_FRAGMENT, "main", + &spirv_options, nir_options); + + assert(nir != NULL); + + nir->info.internal = true; + + nir_validate_shader(nir, "after spirv_to_nir"); + nir_validate_ssa_dominance(nir, "after spirv_to_nir"); + + NIR_PASS_V(nir, nir_lower_variable_initializers, nir_var_function_temp); + NIR_PASS_V(nir, nir_lower_returns); + NIR_PASS_V(nir, nir_inline_functions); + NIR_PASS_V(nir, nir_opt_deref); + + NIR_PASS_V(nir, nir_lower_vars_to_ssa); + NIR_PASS_V(nir, nir_copy_prop); + NIR_PASS_V(nir, nir_opt_dce); + NIR_PASS_V(nir, nir_opt_cse); + NIR_PASS_V(nir, nir_opt_gcm, true); + NIR_PASS_V(nir, nir_opt_peephole_select, 1, false, false); + NIR_PASS_V(nir, nir_opt_dce); + + NIR_PASS_V(nir, nir_lower_variable_initializers, ~0); + + NIR_PASS_V(nir, nir_split_var_copies); + NIR_PASS_V(nir, nir_split_per_member_structs); + + struct brw_compiler *compiler = device->physical->compiler; + struct brw_nir_compiler_opts opts = {}; + brw_preprocess_nir(compiler, nir, &opts); + + NIR_PASS_V(nir, nir_propagate_invariant, false); + + NIR_PASS_V(nir, nir_lower_input_attachments, + &(nir_input_attachment_options) { + .use_fragcoord_sysval = true, + .use_layer_id_sysval = true, + }); + + nir_shader_gather_info(nir, nir_shader_get_entrypoint(nir)); + + /* Do vectorizing here. For some reason when trying to do it in the back + * this just isn't working. 
+ */
+   nir_load_store_vectorize_options options = {
+      .modes = nir_var_mem_ubo | nir_var_mem_ssbo,
+      .callback = brw_nir_should_vectorize_mem,
+      .robust_modes = (nir_variable_mode)0,
+   };
+   NIR_PASS_V(nir, nir_opt_load_store_vectorize, &options);
+
+   NIR_PASS_V(nir, lower_vulkan_descriptors);
+   NIR_PASS_V(nir, nir_opt_dce);
+
+   NIR_PASS_V(nir, nir_lower_explicit_io, nir_var_mem_ubo,
+              nir_address_format_32bit_index_offset);
+   NIR_PASS_V(nir, nir_lower_explicit_io, nir_var_mem_ssbo,
+              nir_address_format_64bit_global_32bit_offset);
+
+   NIR_PASS_V(nir, nir_copy_prop);
+   NIR_PASS_V(nir, nir_opt_constant_folding);
+   NIR_PASS_V(nir, nir_opt_dce);
+
+   struct brw_wm_prog_key wm_key;
+   memset(&wm_key, 0, sizeof(wm_key));
+
+   struct brw_wm_prog_data wm_prog_data = {
+      .base.nr_params = nir->num_uniforms / 4,
+   };
+
+   brw_nir_analyze_ubo_ranges(compiler, nir, NULL, wm_prog_data.base.ubo_ranges);
+
+   struct brw_compile_stats stats[3];
+   struct brw_compile_fs_params params = {
+      .nir = nir,
+      .key = &wm_key,
+      .prog_data = &wm_prog_data,
+      .stats = stats,
+      .log_data = device,
+      .debug_flag = DEBUG_WM,
+   };
+   const unsigned *program = brw_compile_fs(compiler, nir, &params);
+
+   if (wm_prog_data.dispatch_8) {
+      assert(stats[0].spills == 0);
+      assert(stats[0].fills == 0);
+      assert(stats[0].sends == sends_count_expectation);
+   }
+   if (wm_prog_data.dispatch_16) {
+      assert(stats[1].spills == 0);
+      assert(stats[1].fills == 0);
+      assert(stats[1].sends == sends_count_expectation);
+   }
+   if (wm_prog_data.dispatch_32) {
+      assert(stats[2].spills == 0);
+      assert(stats[2].fills == 0);
+      assert(stats[2].sends == sends_count_expectation);
+   }
+
+   struct anv_pipeline_bind_map bind_map;
+   memset(&bind_map, 0, sizeof(bind_map));
+
+   struct anv_push_descriptor_info push_desc_info = {};
+
+   struct anv_shader_bin *kernel =
+      anv_device_upload_kernel(device,
+                               device->internal_cache,
+                               nir->info.stage,
+                               key, key_size, program,
+                               wm_prog_data.base.program_size,
+                               &wm_prog_data.base, sizeof(wm_prog_data),
+                               NULL, 0, NULL, &bind_map,
+                               &push_desc_info);
+
+   ralloc_free(nir);
+
+   return kernel;
+}
+
+VkResult
+anv_device_init_generated_indirect_draws(struct anv_device *device)
+{
+   if (device->info->ver < 11)
+      return VK_SUCCESS;
+
+   const struct intel_l3_weights w =
+      intel_get_default_l3_weights(device->info,
+                                   true /* wants_dc_cache */,
+                                   false /* needs_slm */);
+   device->generated_draw_l3_config = intel_get_l3_config(device->info, w);
+
+   struct {
+      char name[40];
+   } indirect_draws_key = {
+      .name = "anv-generated-indirect-draws",
+   }, indirect_draws_count_key = {
+      .name = "anv-generated-indirect-draws-count",
+   };
+
+   device->generated_draw_kernel =
+      anv_device_search_for_kernel(device,
+                                   device->internal_cache,
+                                   &indirect_draws_key,
+                                   sizeof(indirect_draws_key),
+                                   NULL);
+   if (device->generated_draw_kernel == NULL) {
+      device->generated_draw_kernel =
+         compile_upload_spirv(device,
+                              &indirect_draws_key,
+                              sizeof(indirect_draws_key),
+                              generated_draws_spv_source,
+                              ARRAY_SIZE(generated_draws_spv_source),
+                              10 /* 2 * (2 loads + 3 stores) */);
+   }
+   if (device->generated_draw_kernel == NULL)
+      return vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY);
+
+   /* The cache already has a reference and it's not going anywhere so there
+    * is no need to hold a second reference.
+ */ + anv_shader_bin_unref(device, device->generated_draw_kernel); + + device->generated_draw_count_kernel = + anv_device_search_for_kernel(device, + device->internal_cache, + &indirect_draws_count_key, + sizeof(indirect_draws_count_key), + NULL); + if (device->generated_draw_count_kernel == NULL) { + device->generated_draw_count_kernel = + compile_upload_spirv(device, + &indirect_draws_count_key, + sizeof(indirect_draws_count_key), + generated_draws_count_spv_source, + ARRAY_SIZE(generated_draws_count_spv_source), + 11 /* 2 * (3 loads + 3 stores) */); + } + if (device->generated_draw_count_kernel == NULL) + return vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY); + + /* The cache already has a reference and it's not going anywhere so there + * is no need to hold a second reference. + */ + anv_shader_bin_unref(device, device->generated_draw_count_kernel); + + return VK_SUCCESS; +} + +void +anv_device_finish_generated_indirect_draws(struct anv_device *device) +{ +} diff --git a/src/intel/vulkan/anv_generated_indirect_draws.h b/src/intel/vulkan/anv_generated_indirect_draws.h new file mode 100644 index 00000000000..7cc97ccbe8b --- /dev/null +++ b/src/intel/vulkan/anv_generated_indirect_draws.h @@ -0,0 +1,71 @@ +/* + * Copyright © 2022 Intel Corporation + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. 
+ */ + +#ifndef ANV_GENERATED_INDIRECT_DRAWS_H +#define ANV_GENERATED_INDIRECT_DRAWS_H + +#include + +/* This needs to match generated_draws.glsl : + * + * layout(set = 0, binding = 2) uniform block + */ +struct anv_generated_indirect_draw_params { + uint32_t is_indexed; + uint32_t is_predicated; + uint32_t draw_base; + uint32_t draw_count; + uint32_t instance_multiplier; + uint32_t indirect_data_stride; +}; + +/* This needs to match generated_draws_count.glsl : + * + * layout(set = 0, binding = 2) uniform block + */ +struct anv_generated_indirect_draw_count_params { + uint32_t is_indexed; + uint32_t is_predicated; + uint32_t draw_base; + uint32_t item_count; + uint32_t draw_count; + uint32_t instance_multiplier; + uint32_t indirect_data_stride; + uint32_t end_addr_ldw; + uint32_t end_addr_udw; +}; + +struct anv_generate_indirect_params { + union { + struct anv_generated_indirect_draw_params draw; + struct anv_generated_indirect_draw_count_params draw_count; + }; + + /* Global address of binding 0 */ + uint64_t indirect_data_addr; + + /* Global address of binding 1 */ + uint64_t generated_cmds_addr; +}; + +#endif /* ANV_GENERATED_INDIRECT_DRAWS_H */ diff --git a/src/intel/vulkan/anv_pipeline_cache.c b/src/intel/vulkan/anv_pipeline_cache.c index e3894df6a42..d50a84e2a58 100644 --- a/src/intel/vulkan/anv_pipeline_cache.c +++ b/src/intel/vulkan/anv_pipeline_cache.c @@ -31,7 +31,7 @@ #include "nir/nir_xfb_info.h" #include "vulkan/util/vk_util.h" #include "compiler/spirv/nir_spirv.h" -#include "float64_spv.h" +#include "shaders/float64_spv.h" static bool anv_shader_bin_serialize(struct vk_pipeline_cache_object *object, diff --git a/src/intel/vulkan/anv_private.h b/src/intel/vulkan/anv_private.h index 99afceed172..10785e5c41a 100644 --- a/src/intel/vulkan/anv_private.h +++ b/src/intel/vulkan/anv_private.h @@ -1009,6 +1009,15 @@ struct anv_physical_device { bool always_flush_cache; + /** + * True if the generated indirect draw optimization is turned on. + * + * This optimization is currently only available on Gfx11+ to avoid + * dealing with the annoying Gfx8/9 tracking of vertex buffer for the VF + * cache workaround. + */ + bool generated_indirect_draws; + struct { uint32_t family_count; struct anv_queue_family families[ANV_MAX_QUEUE_FAMILIES]; @@ -1075,6 +1084,7 @@ struct anv_instance { bool sample_mask_out_opengl_behaviour; bool fp64_workaround_enabled; float lower_depth_range_rate; + unsigned generated_indirect_threshold; }; VkResult anv_init_wsi(struct anv_physical_device *physical_device); @@ -1241,6 +1251,15 @@ struct anv_device { enum anv_rt_bvh_build_method bvh_build_method; + /** Draw generation shader + * + * Generates direct draw calls out of indirect parameters. Used to + * workaround slowness with indirect draw calls. 
+ */ + struct anv_shader_bin *generated_draw_kernel; + struct anv_shader_bin *generated_draw_count_kernel; + const struct intel_l3_config *generated_draw_l3_config; + pthread_mutex_t mutex; pthread_cond_t queue_submit; @@ -1462,6 +1481,7 @@ struct anv_batch { void *anv_batch_emit_dwords(struct anv_batch *batch, int num_dwords); VkResult anv_batch_emit_ensure_space(struct anv_batch *batch, uint32_t size); +void anv_batch_advance(struct anv_batch *batch, uint32_t size); void anv_batch_emit_batch(struct anv_batch *batch, struct anv_batch *other); struct anv_address anv_batch_address(struct anv_batch *batch, void *batch_location); @@ -2887,6 +2907,13 @@ void anv_cmd_buffer_dump(struct anv_cmd_buffer *cmd_buffer); void anv_cmd_emit_conditional_render_predicate(struct anv_cmd_buffer *cmd_buffer); +static inline unsigned +anv_cmd_buffer_get_view_count(struct anv_cmd_buffer *cmd_buffer) +{ + struct anv_cmd_graphics_state *gfx = &cmd_buffer->state.gfx; + return MAX2(1, util_bitcount(gfx->view_mask)); +} + enum anv_bo_sync_state { /** Indicates that this is a new (or newly reset fence) */ ANV_BO_SYNC_STATE_RESET, @@ -4139,6 +4166,18 @@ struct anv_memcpy_state { struct anv_vb_cache_range vb_dirty; }; +VkResult +anv_device_init_generated_indirect_draws(struct anv_device *device); +void +anv_device_finish_generated_indirect_draws(struct anv_device *device); + +static inline bool anv_use_generated_draws(const struct anv_device *device, + uint32_t count) +{ + return device->physical->generated_indirect_draws && + count >= device->physical->instance->generated_indirect_threshold; +} + struct anv_utrace_flush_copy { /* Needs to be the first field */ struct intel_ds_flush_data ds; diff --git a/src/intel/vulkan/genX_cmd_buffer.c b/src/intel/vulkan/genX_cmd_buffer.c index c5fede5d295..853334f7a0c 100644 --- a/src/intel/vulkan/genX_cmd_buffer.c +++ b/src/intel/vulkan/genX_cmd_buffer.c @@ -3413,6 +3413,11 @@ genX(cmd_buffer_flush_gfx_state)(struct anv_cmd_buffer *cmd_buffer) genX(cmd_buffer_flush_dynamic_state)(cmd_buffer); } +#define GFX_HAS_GENERATED_CMDS GFX_VER >= 11 +#if GFX_VER >= 11 +#include "genX_cmd_draw_generated_indirect.h" +#endif + VkResult genX(BeginCommandBuffer)( VkCommandBuffer commandBuffer, @@ -3618,6 +3623,10 @@ genX(EndCommandBuffer)( anv_measure_endcommandbuffer(cmd_buffer); +#if GFX_HAS_GENERATED_CMDS + genX(cmd_buffer_flush_generated_draws)(cmd_buffer); +#endif + /* We want every command buffer to start with the PMA fix in a known state, * so we disable it at the end of the command buffer. 
*/ @@ -3657,6 +3666,10 @@ genX(CmdExecuteCommands)( */ genX(cmd_buffer_apply_pipe_flushes)(primary); +#if GFX_HAS_GENERATED_CMDS + genX(cmd_buffer_flush_generated_draws)(primary); +#endif + for (uint32_t i = 0; i < commandBufferCount; i++) { ANV_FROM_HANDLE(anv_cmd_buffer, secondary, pCmdBuffers[i]); @@ -3819,6 +3832,11 @@ cmd_buffer_barrier(struct anv_cmd_buffer *cmd_buffer, anv_pipe_flush_bits_for_access_flags(cmd_buffer->device, src_flags) | anv_pipe_invalidate_bits_for_access_flags(cmd_buffer->device, dst_flags); +#if GFX_HAS_GENERATED_CMDS + if (dst_flags & VK_ACCESS_INDIRECT_COMMAND_READ_BIT) + genX(cmd_buffer_flush_generated_draws)(cmd_buffer); +#endif + anv_add_pending_pipe_bits(cmd_buffer, bits, reason); } @@ -4442,9 +4460,24 @@ void genX(CmdDrawIndirect)( drawCount); trace_intel_begin_draw_indirect(&cmd_buffer->trace); +#if GFX_HAS_GENERATED_CMDS + if (anv_use_generated_draws(cmd_buffer->device, drawCount)) { + genX(cmd_buffer_emit_indirect_generated_draws)( + cmd_buffer, + anv_address_add(buffer->address, offset), + MAX2(stride, sizeof(VkDrawIndirectCommand)), + drawCount, + false /* indexed */); + } else { + emit_indirect_draws(cmd_buffer, + anv_address_add(buffer->address, offset), + stride, drawCount, false /* indexed */); + } +#else emit_indirect_draws(cmd_buffer, anv_address_add(buffer->address, offset), stride, drawCount, false /* indexed */); +#endif trace_intel_end_draw_indirect(&cmd_buffer->trace, drawCount); } @@ -4468,9 +4501,24 @@ void genX(CmdDrawIndexedIndirect)( drawCount); trace_intel_begin_draw_indexed_indirect(&cmd_buffer->trace); +#if GFX_HAS_GENERATED_CMDS + if (anv_use_generated_draws(cmd_buffer->device, drawCount)) { + genX(cmd_buffer_emit_indirect_generated_draws)( + cmd_buffer, + anv_address_add(buffer->address, offset), + MAX2(stride, sizeof(VkDrawIndexedIndirectCommand)), + drawCount, + true /* indexed */); + } else { + emit_indirect_draws(cmd_buffer, + anv_address_add(buffer->address, offset), + stride, drawCount, true /* indexed */); + } +#else emit_indirect_draws(cmd_buffer, anv_address_add(buffer->address, offset), stride, drawCount, true /* indexed */); +#endif trace_intel_end_draw_indexed_indirect(&cmd_buffer->trace, drawCount); } @@ -4643,12 +4691,37 @@ void genX(CmdDrawIndirectCount)( 0); trace_intel_begin_draw_indirect_count(&cmd_buffer->trace); + struct anv_address indirect_data_address = + anv_address_add(buffer->address, offset); + struct anv_address count_address = + anv_address_add(count_buffer->address, countBufferOffset); + stride = MAX2(stride, sizeof(VkDrawIndirectCommand)); + +#if GFX_HAS_GENERATED_CMDS + if (anv_use_generated_draws(cmd_buffer->device, maxDrawCount)) { + genX(cmd_buffer_emit_indirect_generated_draws_count)( + cmd_buffer, + indirect_data_address, + stride, + count_address, + maxDrawCount, + false /* indexed */); + } else { + emit_indirect_count_draws(cmd_buffer, + indirect_data_address, + stride, + count_address, + maxDrawCount, + false /* indexed */); + } +#else emit_indirect_count_draws(cmd_buffer, - anv_address_add(buffer->address, offset), - MAX2(stride, sizeof(VkDrawIndirectCommand)), - anv_address_add(count_buffer->address, countBufferOffset), + indirect_data_address, + stride, + count_address, maxDrawCount, false /* indexed */); +#endif trace_intel_end_draw_indirect_count(&cmd_buffer->trace, maxDrawCount); } @@ -4675,12 +4748,37 @@ void genX(CmdDrawIndexedIndirectCount)( 0); trace_intel_begin_draw_indexed_indirect_count(&cmd_buffer->trace); + struct anv_address indirect_data_address = + 
anv_address_add(buffer->address, offset); + struct anv_address count_address = + anv_address_add(count_buffer->address, countBufferOffset); + stride = MAX2(stride, sizeof(VkDrawIndexedIndirectCommand)); + +#if GFX_HAS_GENERATED_CMDS + if (anv_use_generated_draws(cmd_buffer->device, maxDrawCount)) { + genX(cmd_buffer_emit_indirect_generated_draws_count)( + cmd_buffer, + indirect_data_address, + stride, + count_address, + maxDrawCount, + true /* indexed */); + } else { + emit_indirect_count_draws(cmd_buffer, + indirect_data_address, + stride, + count_address, + maxDrawCount, + true /* indexed */); + } +#else emit_indirect_count_draws(cmd_buffer, - anv_address_add(buffer->address, offset), - MAX2(stride, sizeof(VkDrawIndirectCommand)), - anv_address_add(count_buffer->address, countBufferOffset), + indirect_data_address, + stride, + count_address, maxDrawCount, true /* indexed */); +#endif trace_intel_end_draw_indexed_indirect_count(&cmd_buffer->trace, maxDrawCount); diff --git a/src/intel/vulkan/genX_cmd_draw_generated_indirect.h b/src/intel/vulkan/genX_cmd_draw_generated_indirect.h new file mode 100644 index 00000000000..256037ed07b --- /dev/null +++ b/src/intel/vulkan/genX_cmd_draw_generated_indirect.h @@ -0,0 +1,704 @@ +/* + * Copyright © 2022 Intel Corporation + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + */ + +#ifndef GENX_CMD_GENERATED_INDIRECT_DRAW_H +#define GENX_CMD_GENERATED_INDIRECT_DRAW_H + +#include +#include + +#include "util/macros.h" + +#include "anv_private.h" +#include "anv_generated_indirect_draws.h" + +#if GFX_VER < 11 +#error "Generated draws optimization not supported prior to Gfx11" +#endif + +/* This is a maximum number of items a fragment shader can generate due to the + * viewport size. + */ +#define MAX_GENERATED_DRAW_COUNT (8192 * 8192) + +static void +genX(cmd_buffer_emit_generate_draws_pipeline)(struct anv_cmd_buffer *cmd_buffer) +{ + struct anv_batch *batch = &cmd_buffer->generation_batch; + struct anv_device *device = cmd_buffer->device; + const struct anv_shader_bin *draw_kernel = device->generated_draw_kernel; + const struct brw_wm_prog_data *prog_data = + brw_wm_prog_data_const(draw_kernel->prog_data); + + uint32_t *dw = anv_batch_emitn(batch, + 1 + 2 * GENX(VERTEX_ELEMENT_STATE_length), + GENX(3DSTATE_VERTEX_ELEMENTS)); + /* You might think there is some shady stuff going here and you would be + * right. 
We're setting up 2 VERTEX_ELEMENT_STATE yet we're only providing + * 1 (positions) VERTEX_BUFFER_STATE later. + * + * Find more about how to set up a 3D pipeline with a fragment shader but + * without a vertex shader in blorp_emit_vertex_elements() in + * blorp_genX_exec.h. + */ + GENX(VERTEX_ELEMENT_STATE_pack)( + batch, dw + 1, &(struct GENX(VERTEX_ELEMENT_STATE)) { + .VertexBufferIndex = 1, + .Valid = true, + .SourceElementFormat = ISL_FORMAT_R32G32B32A32_FLOAT, + .SourceElementOffset = 0, + .Component0Control = VFCOMP_STORE_SRC, + .Component1Control = VFCOMP_STORE_0, + .Component2Control = VFCOMP_STORE_0, + .Component3Control = VFCOMP_STORE_0, + }); + GENX(VERTEX_ELEMENT_STATE_pack)( + batch, dw + 3, &(struct GENX(VERTEX_ELEMENT_STATE)) { + .VertexBufferIndex = 0, + .Valid = true, + .SourceElementFormat = ISL_FORMAT_R32G32B32_FLOAT, + .SourceElementOffset = 0, + .Component0Control = VFCOMP_STORE_SRC, + .Component1Control = VFCOMP_STORE_SRC, + .Component2Control = VFCOMP_STORE_SRC, + .Component3Control = VFCOMP_STORE_1_FP, + }); + + anv_batch_emit(batch, GENX(3DSTATE_VF_STATISTICS), vf); + anv_batch_emit(batch, GENX(3DSTATE_VF_SGVS), sgvs) { + sgvs.InstanceIDEnable = true; + sgvs.InstanceIDComponentNumber = COMP_1; + sgvs.InstanceIDElementOffset = 0; + } + anv_batch_emit(batch, GENX(3DSTATE_VF_SGVS_2), sgvs); + anv_batch_emit(batch, GENX(3DSTATE_VF_INSTANCING), vfi) { + vfi.InstancingEnable = false; + vfi.VertexElementIndex = 0; + } + anv_batch_emit(batch, GENX(3DSTATE_VF_INSTANCING), vfi) { + vfi.InstancingEnable = false; + vfi.VertexElementIndex = 1; + } + + anv_batch_emit(batch, GENX(3DSTATE_VF_TOPOLOGY), topo) { + topo.PrimitiveTopologyType = _3DPRIM_RECTLIST; + } + + /* Emit URB setup. We tell it that the VS is active because we want it to + * allocate space for the VS. Even though one isn't run, we need VUEs to + * store the data that VF is going to pass to SOL. 
+ */ + const unsigned entry_size[4] = { DIV_ROUND_UP(32, 64), 1, 1, 1 }; + + genX(emit_l3_config)(batch, device, device->generated_draw_l3_config); + + cmd_buffer->state.current_l3_config = device->generated_draw_l3_config; + + genX(emit_urb_setup)(device, batch, device->generated_draw_l3_config, + VK_SHADER_STAGE_VERTEX_BIT | VK_SHADER_STAGE_FRAGMENT_BIT, + entry_size, NULL); + + anv_batch_emit(batch, GENX(3DSTATE_PS_BLEND), ps_blend) { + ps_blend.HasWriteableRT = true; + } + + anv_batch_emit(batch, GENX(3DSTATE_WM_DEPTH_STENCIL), wm); + +#if GFX_VER >= 12 + anv_batch_emit(batch, GENX(3DSTATE_DEPTH_BOUNDS), db) { + db.DepthBoundsTestEnable = false; + db.DepthBoundsTestMinValue = 0.0; + db.DepthBoundsTestMaxValue = 1.0; + } +#endif + + anv_batch_emit(batch, GENX(3DSTATE_MULTISAMPLE), ms); + anv_batch_emit(batch, GENX(3DSTATE_SAMPLE_MASK), sm) { + sm.SampleMask = 0x1; + } + + anv_batch_emit(batch, GENX(3DSTATE_VS), vs); + anv_batch_emit(batch, GENX(3DSTATE_HS), hs); + anv_batch_emit(batch, GENX(3DSTATE_TE), te); + anv_batch_emit(batch, GENX(3DSTATE_DS), DS); + + anv_batch_emit(batch, GENX(3DSTATE_STREAMOUT), so); + + anv_batch_emit(batch, GENX(3DSTATE_GS), gs); + + anv_batch_emit(batch, GENX(3DSTATE_CLIP), clip) { + clip.PerspectiveDivideDisable = true; + } + + anv_batch_emit(batch, GENX(3DSTATE_SF), sf) { +#if GFX_VER >= 12 + sf.DerefBlockSize = INTEL_URB_DEREF_BLOCK_SIZE_32; // TODO +#endif + } + + anv_batch_emit(batch, GENX(3DSTATE_RASTER), raster) { + raster.CullMode = CULLMODE_NONE; + } + + anv_batch_emit(batch, GENX(3DSTATE_SBE), sbe) { + sbe.VertexURBEntryReadOffset = 1; + sbe.NumberofSFOutputAttributes = prog_data->num_varying_inputs; + sbe.VertexURBEntryReadLength = MAX2((prog_data->num_varying_inputs + 1) / 2, 1); + sbe.ConstantInterpolationEnable = prog_data->flat_inputs; + sbe.ForceVertexURBEntryReadLength = true; + sbe.ForceVertexURBEntryReadOffset = true; + for (unsigned i = 0; i < 32; i++) + sbe.AttributeActiveComponentFormat[i] = ACF_XYZW; + } + + anv_batch_emit(batch, GENX(3DSTATE_WM), wm) { + //wm.ForceThreadDispatchEnable = ForceON; + } + + anv_batch_emit(batch, GENX(3DSTATE_PS_EXTRA), psx) { + psx.PixelShaderValid = true; + psx.AttributeEnable = prog_data->num_varying_inputs > 0; + psx.PixelShaderIsPerSample = prog_data->persample_dispatch; + psx.PixelShaderComputedDepthMode = prog_data->computed_depth_mode; + psx.PixelShaderComputesStencil = prog_data->computed_stencil; + } + + anv_batch_emit(batch, GENX(3DSTATE_VIEWPORT_STATE_POINTERS_CC), cc) { + struct anv_state cc_state = + anv_cmd_buffer_alloc_dynamic_state(cmd_buffer, 4 * GENX(CC_VIEWPORT_length), 32); + struct GENX(CC_VIEWPORT) cc_viewport = { + .MinimumDepth = 0.0f, + .MaximumDepth = 1.0f, + }; + GENX(CC_VIEWPORT_pack)(NULL, cc_state.map, &cc_viewport); + cc.CCViewportPointer = cc_state.offset; + } + +#if GFX_VER >= 12 + /* Disable Primitive Replication. 
*/ + anv_batch_emit(batch, GENX(3DSTATE_PRIMITIVE_REPLICATION), pr); +#endif + + anv_batch_emit(batch, GENX(3DSTATE_PUSH_CONSTANT_ALLOC_VS), alloc); + anv_batch_emit(batch, GENX(3DSTATE_PUSH_CONSTANT_ALLOC_HS), alloc); + anv_batch_emit(batch, GENX(3DSTATE_PUSH_CONSTANT_ALLOC_DS), alloc); + anv_batch_emit(batch, GENX(3DSTATE_PUSH_CONSTANT_ALLOC_GS), alloc); + anv_batch_emit(batch, GENX(3DSTATE_PUSH_CONSTANT_ALLOC_PS), alloc) { + alloc.ConstantBufferOffset = 0; + alloc.ConstantBufferSize = cmd_buffer->device->info->max_constant_urb_size_kb; + } + +#if GFX_VERx10 == 125 + /* DG2: Wa_22011440098 + * MTL: Wa_18022330953 + * + * In 3D mode, after programming push constant alloc command immediately + * program push constant command(ZERO length) without any commit between + * them. + */ + anv_batch_emit(&cmd_buffer->batch, GENX(3DSTATE_CONSTANT_ALL), c) { + /* Update empty push constants for all stages (bitmask = 11111b) */ + c.ShaderUpdateEnable = 0x1f; + c.MOCS = anv_mocs(cmd_buffer->device, NULL, 0); + } +#endif + + cmd_buffer->state.gfx.vb_dirty = BITFIELD_BIT(0) | BITFIELD_BIT(1); + cmd_buffer->state.gfx.dirty |= ~(ANV_CMD_DIRTY_INDEX_BUFFER | + ANV_CMD_DIRTY_XFB_ENABLE); + cmd_buffer->state.push_constants_dirty |= VK_SHADER_STAGE_ALL_GRAPHICS; + cmd_buffer->state.gfx.push_constant_stages = VK_SHADER_STAGE_FRAGMENT_BIT; + vk_dynamic_graphics_state_dirty_all(&cmd_buffer->vk.dynamic_graphics_state); + + anv_add_pending_pipe_bits(cmd_buffer, + ANV_PIPE_RENDER_TARGET_CACHE_FLUSH_BIT | + ANV_PIPE_STALL_AT_SCOREBOARD_BIT, + "after generation batch BTI change"); +} + +static void +genX(cmd_buffer_emit_generate_draws_vertex)(struct anv_cmd_buffer *cmd_buffer, + uint32_t draw_count) +{ + struct anv_batch *batch = &cmd_buffer->generation_batch; + struct anv_state vs_data_state = + anv_cmd_buffer_alloc_dynamic_state( + cmd_buffer, 9 * sizeof(uint32_t), 32); + + float x0 = 0.0f, x1 = MIN2(draw_count, 8192); + float y0 = 0.0f, y1 = DIV_ROUND_UP(draw_count, 8192); + float z = 0.0f; + + float *vertices = vs_data_state.map; + vertices[0] = x1; vertices[1] = y1; vertices[2] = z; /* v0 */ + vertices[3] = x0; vertices[4] = y1; vertices[5] = z; /* v1 */ + vertices[6] = x0; vertices[7] = y0; vertices[8] = z; /* v2 */ + + uint32_t *dw = anv_batch_emitn(batch, + 1 + GENX(VERTEX_BUFFER_STATE_length), + GENX(3DSTATE_VERTEX_BUFFERS)); + GENX(VERTEX_BUFFER_STATE_pack)(batch, dw + 1, + &(struct GENX(VERTEX_BUFFER_STATE)) { + .VertexBufferIndex = 0, + .AddressModifyEnable = true, + .BufferStartingAddress = (struct anv_address) { + .bo = cmd_buffer->device->dynamic_state_pool.block_pool.bo, + .offset = vs_data_state.offset, + }, + .BufferPitch = 3 * sizeof(float), + .BufferSize = 9 * sizeof(float), + .MOCS = anv_mocs(cmd_buffer->device, NULL, 0), +#if GFX_VER >= 12 + .L3BypassDisable = true, +#endif + }); +} + +static struct anv_state +genX(cmd_buffer_alloc_generated_push_data)(struct anv_cmd_buffer *cmd_buffer) +{ + return anv_cmd_buffer_alloc_dynamic_state(cmd_buffer, + sizeof(struct anv_generate_indirect_params), + ANV_UBO_ALIGNMENT); +} + + +static struct anv_state +genX(cmd_buffer_emit_generated_push_data)(struct anv_cmd_buffer *cmd_buffer, + struct anv_state push_data_state) +{ + struct anv_batch *batch = &cmd_buffer->generation_batch; + struct anv_address push_data_addr = anv_state_pool_state_address( + &cmd_buffer->device->dynamic_state_pool, push_data_state); + +#if GFX_VER >= 12 + const uint32_t num_dwords = GENX(3DSTATE_CONSTANT_ALL_length) + + GENX(3DSTATE_CONSTANT_ALL_DATA_length); + uint32_t *dw = + 
anv_batch_emitn(batch, num_dwords, + GENX(3DSTATE_CONSTANT_ALL), + .ShaderUpdateEnable = BITFIELD_BIT(MESA_SHADER_FRAGMENT), + .PointerBufferMask = 0x1, + .MOCS = anv_mocs(cmd_buffer->device, NULL, 0)); + + GENX(3DSTATE_CONSTANT_ALL_DATA_pack)( + batch, dw + GENX(3DSTATE_CONSTANT_ALL_length), + &(struct GENX(3DSTATE_CONSTANT_ALL_DATA)) { + .PointerToConstantBuffer = push_data_addr, + .ConstantBufferReadLength = DIV_ROUND_UP(push_data_state.alloc_size, 32), + }); +#else + anv_batch_emit(batch, GENX(3DSTATE_CONSTANT_PS), c) { + c.MOCS = anv_mocs(cmd_buffer->device, NULL, 0); + c.ConstantBody.ReadLength[0] = DIV_ROUND_UP(push_data_state.alloc_size, 32); + c.ConstantBody.Buffer[0] = push_data_addr; + } +#endif + + return push_data_state; +} + +static void +genX(cmd_buffer_emit_generate_draws)(struct anv_cmd_buffer *cmd_buffer, + struct anv_address generated_cmds_addr, + uint32_t generated_cmds_size, + struct anv_address indirect_data_addr, + uint32_t indirect_data_stride, + uint32_t item_base, + uint32_t item_count, + bool indexed) +{ + struct anv_device *device = cmd_buffer->device; + struct anv_batch *batch = &cmd_buffer->generation_batch; + const struct anv_shader_bin *draw_kernel = device->generated_draw_kernel; + const struct brw_wm_prog_data *prog_data = + brw_wm_prog_data_const(draw_kernel->prog_data); + + anv_batch_emit(batch, GENX(3DSTATE_PS), ps) { + ps.BindingTableEntryCount = 2; + ps.PushConstantEnable = prog_data->base.nr_params > 0 || + prog_data->base.ubo_ranges[0].length; + + ps._8PixelDispatchEnable = prog_data->dispatch_8; + ps._16PixelDispatchEnable = prog_data->dispatch_16; + ps._32PixelDispatchEnable = prog_data->dispatch_32; + + ps.DispatchGRFStartRegisterForConstantSetupData0 = + brw_wm_prog_data_dispatch_grf_start_reg(prog_data, ps, 0); + ps.DispatchGRFStartRegisterForConstantSetupData1 = + brw_wm_prog_data_dispatch_grf_start_reg(prog_data, ps, 1); + ps.DispatchGRFStartRegisterForConstantSetupData2 = + brw_wm_prog_data_dispatch_grf_start_reg(prog_data, ps, 2); + + ps.KernelStartPointer0 = draw_kernel->kernel.offset + + brw_wm_prog_data_prog_offset(prog_data, ps, 0); + ps.KernelStartPointer1 = draw_kernel->kernel.offset + + brw_wm_prog_data_prog_offset(prog_data, ps, 1); + ps.KernelStartPointer2 = draw_kernel->kernel.offset + + brw_wm_prog_data_prog_offset(prog_data, ps, 2); + + ps.MaximumNumberofThreadsPerPSD = device->info->max_threads_per_psd - 1; + } + + genX(cmd_buffer_emit_generate_draws_vertex)(cmd_buffer, item_count); + + struct anv_state push_data_state = + genX(cmd_buffer_alloc_generated_push_data)(cmd_buffer); + + struct anv_graphics_pipeline *pipeline = cmd_buffer->state.gfx.pipeline; + + struct anv_generate_indirect_params *push_data = push_data_state.map; + *push_data = (struct anv_generate_indirect_params) { + .draw = { + .is_indexed = indexed, + .is_predicated = cmd_buffer->state.conditional_render_enabled, + .draw_base = item_base, + .draw_count = item_count, + .instance_multiplier = pipeline->instance_multiplier, + .indirect_data_stride = indirect_data_stride, + }, + .indirect_data_addr = anv_address_physical(indirect_data_addr), + .generated_cmds_addr = anv_address_physical(generated_cmds_addr), + }; + + genX(cmd_buffer_emit_generated_push_data)(cmd_buffer, push_data_state); + + anv_batch_emit(batch, GENX(3DPRIMITIVE), prim) { + prim.VertexAccessType = SEQUENTIAL; + prim.PrimitiveTopologyType = _3DPRIM_RECTLIST; + prim.VertexCountPerInstance = 3; + prim.InstanceCount = 1; + } +} + +static void +genX(cmd_buffer_emit_indirect_generated_draws_init)(struct 
anv_cmd_buffer *cmd_buffer) +{ +#if GFX_VER >= 12 + anv_batch_emit(&cmd_buffer->batch, GENX(MI_ARB_CHECK), arb) { + arb.PreParserDisableMask = true; + arb.PreParserDisable = true; + } +#endif + + anv_batch_emit_ensure_space(&cmd_buffer->generation_batch, 4); + + trace_intel_begin_generate_draws(&cmd_buffer->trace); + + anv_batch_emit(&cmd_buffer->batch, GENX(MI_BATCH_BUFFER_START), bbs) { + bbs.AddressSpaceIndicator = ASI_PPGTT; + bbs.BatchBufferStartAddress = + anv_batch_current_address(&cmd_buffer->generation_batch); + } + + cmd_buffer->generation_return_addr = anv_batch_current_address(&cmd_buffer->batch); + + trace_intel_end_generate_draws(&cmd_buffer->trace); + + genX(cmd_buffer_emit_generate_draws_pipeline)(cmd_buffer); +} + +static void +genX(cmd_buffer_emit_indirect_generated_draws)(struct anv_cmd_buffer *cmd_buffer, + struct anv_address indirect_data_addr, + uint32_t indirect_data_stride, + uint32_t draw_count, + bool indexed) +{ + genX(flush_pipeline_select_3d)(cmd_buffer); + + /* Apply the pipeline flush here so the indirect data is available for the + * generation shader. + */ + genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer); + + if (anv_address_is_null(cmd_buffer->generation_return_addr)) + genX(cmd_buffer_emit_indirect_generated_draws_init)(cmd_buffer); + + /* In order to have the vertex fetch gather the data we need to have a non + * 0 stride. It's possible to have a 0 stride given by the application when + * draw_count is 1, but we need a correct value for the + * VERTEX_BUFFER_STATE::BufferPitch, so ensure the caller set this + * correctly : + * + * Vulkan spec, vkCmdDrawIndirect: + * + * "If drawCount is less than or equal to one, stride is ignored." + */ + assert(indirect_data_stride > 0); + + if (cmd_buffer->state.conditional_render_enabled) + genX(cmd_emit_conditional_render_predicate)(cmd_buffer); + + /* Emit the 3D state in the main batch. */ + genX(cmd_buffer_flush_gfx_state)(cmd_buffer); + + const uint32_t draw_cmd_stride = 4 * GENX(3DPRIMITIVE_EXTENDED_length); + + uint32_t item_base = 0; + while (item_base < draw_count) { + const uint32_t item_count = MIN2(draw_count - item_base, + MAX_GENERATED_DRAW_COUNT); + const uint32_t draw_cmd_size = item_count * draw_cmd_stride; + + /* Ensure we have enough contiguous space for all the draws so that the + * compute shader can edit all the 3DPRIMITIVEs from a single base + * address. + * + * TODO: we might have to split that if the amount of space is to large (at + * 1Mb?). 
+ */ + VkResult result = anv_batch_emit_ensure_space(&cmd_buffer->batch, + draw_cmd_size); + if (result != VK_SUCCESS) + return; + + genX(cmd_buffer_emit_generate_draws)( + cmd_buffer, + anv_batch_current_address(&cmd_buffer->batch), + draw_cmd_size, + indirect_data_addr, + indirect_data_stride, + item_base, + item_count, + indexed); + + anv_batch_advance(&cmd_buffer->batch, draw_cmd_size); + + item_base += item_count; + } +} + +static void +genX(cmd_buffer_emit_generate_draws_count)(struct anv_cmd_buffer *cmd_buffer, + struct anv_address generated_cmds_addr, + uint32_t generated_cmds_size, + struct anv_address indirect_data_addr, + uint32_t indirect_data_stride, + uint32_t item_base, + uint32_t item_count, + struct anv_address count_addr, + bool indexed) +{ + struct anv_device *device = cmd_buffer->device; + struct anv_batch *batch = &cmd_buffer->generation_batch; + const struct anv_shader_bin *draw_kernel = + device->generated_draw_count_kernel; + const struct brw_wm_prog_data *prog_data = + brw_wm_prog_data_const(draw_kernel->prog_data); + + anv_batch_emit(batch, GENX(3DSTATE_PS), ps) { + ps.BindingTableEntryCount = 2; + ps.PushConstantEnable = prog_data->base.nr_params > 0 || + prog_data->base.ubo_ranges[0].length; + + ps._8PixelDispatchEnable = prog_data->dispatch_8; + ps._16PixelDispatchEnable = prog_data->dispatch_16; + ps._32PixelDispatchEnable = prog_data->dispatch_32; + + ps.DispatchGRFStartRegisterForConstantSetupData0 = + brw_wm_prog_data_dispatch_grf_start_reg(prog_data, ps, 0); + ps.DispatchGRFStartRegisterForConstantSetupData1 = + brw_wm_prog_data_dispatch_grf_start_reg(prog_data, ps, 1); + ps.DispatchGRFStartRegisterForConstantSetupData2 = + brw_wm_prog_data_dispatch_grf_start_reg(prog_data, ps, 2); + + ps.KernelStartPointer0 = draw_kernel->kernel.offset + + brw_wm_prog_data_prog_offset(prog_data, ps, 0); + ps.KernelStartPointer1 = draw_kernel->kernel.offset + + brw_wm_prog_data_prog_offset(prog_data, ps, 1); + ps.KernelStartPointer2 = draw_kernel->kernel.offset + + brw_wm_prog_data_prog_offset(prog_data, ps, 2); + + ps.MaximumNumberofThreadsPerPSD = device->info->max_threads_per_psd - 1; + } + + genX(cmd_buffer_emit_generate_draws_vertex)(cmd_buffer, item_count); + + struct anv_state push_data_state = + genX(cmd_buffer_alloc_generated_push_data)(cmd_buffer); + + struct anv_graphics_pipeline *pipeline = cmd_buffer->state.gfx.pipeline; + uint64_t end_cmd_addr = + anv_address_physical( + anv_address_add(generated_cmds_addr, generated_cmds_size)); + + struct anv_generate_indirect_params *push_data = push_data_state.map; + *push_data = (struct anv_generate_indirect_params) { + .draw_count = { + .is_indexed = indexed, + .is_predicated = cmd_buffer->state.conditional_render_enabled, + .draw_base = item_base, + .item_count = item_count, + .draw_count = 0, // Edit this through a the command streamer + .instance_multiplier = pipeline->instance_multiplier, + .indirect_data_stride = indirect_data_stride, + .end_addr_ldw = end_cmd_addr & 0xffffffff, + .end_addr_udw = end_cmd_addr >> 32, + }, + .indirect_data_addr = anv_address_physical(indirect_data_addr), + .generated_cmds_addr = anv_address_physical(generated_cmds_addr), + }; + + /* Copy the draw count into the push constants so that the generation gets + * the value straight away and doesn't even need to access memory. 
+ */ + struct mi_builder b; + mi_builder_init(&b, cmd_buffer->device->info, batch); + mi_memcpy(&b, + anv_address_add((struct anv_address) { + .bo = cmd_buffer->device->dynamic_state_pool.block_pool.bo, + .offset = push_data_state.offset, + }, + offsetof(struct anv_generate_indirect_params, draw_count.draw_count)), + count_addr, 4); + + /* Only emit the data after the memcpy above. */ + genX(cmd_buffer_emit_generated_push_data)(cmd_buffer, push_data_state); + + anv_batch_emit(batch, GENX(3DPRIMITIVE), prim) { + prim.VertexAccessType = SEQUENTIAL; + prim.PrimitiveTopologyType = _3DPRIM_RECTLIST; + prim.VertexCountPerInstance = 3; + prim.InstanceCount = 1; + } +} + +static void +genX(cmd_buffer_emit_indirect_generated_draws_count)(struct anv_cmd_buffer *cmd_buffer, + struct anv_address indirect_data_addr, + uint32_t indirect_data_stride, + struct anv_address count_addr, + uint32_t max_draw_count, + bool indexed) +{ + genX(flush_pipeline_select_3d)(cmd_buffer); + + /* Apply the pipeline flush here so the indirect data is available for the + * generation shader. + */ + genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer); + + if (anv_address_is_null(cmd_buffer->generation_return_addr)) + genX(cmd_buffer_emit_indirect_generated_draws_init)(cmd_buffer); + + /* In order to have the vertex fetch gather the data we need to have a non + * 0 stride. It's possible to have a 0 stride given by the application when + * draw_count is 1, but we need a correct value for the + * VERTEX_BUFFER_STATE::BufferPitch, so ensure the caller set this + * correctly : + * + * Vulkan spec, vkCmdDrawIndirect: + * + * "If drawCount is less than or equal to one, stride is ignored." + */ + assert(indirect_data_stride > 0); + + if (cmd_buffer->state.conditional_render_enabled) + genX(cmd_emit_conditional_render_predicate)(cmd_buffer); + + /* Emit the 3D state in the main batch. */ + genX(cmd_buffer_flush_gfx_state)(cmd_buffer); + + const uint32_t draw_cmd_stride = 4 * GENX(3DPRIMITIVE_EXTENDED_length); + + uint32_t item_base = 0; + while (item_base < max_draw_count) { + const uint32_t item_count = MIN2(max_draw_count - item_base, + MAX_GENERATED_DRAW_COUNT); + const uint32_t draw_cmd_size = item_count * draw_cmd_stride; + + /* Ensure we have enough contiguous space for all the draws so that the + * compute shader can edit all the 3DPRIMITIVEs from a single base + * address. + * + * TODO: we might have to split that if the amount of space is to large (at + * 1Mb?). + */ + VkResult result = anv_batch_emit_ensure_space(&cmd_buffer->batch, + draw_cmd_size); + if (result != VK_SUCCESS) + return; + + genX(cmd_buffer_emit_generate_draws_count)( + cmd_buffer, + anv_batch_current_address(&cmd_buffer->batch), + draw_cmd_size, + anv_address_add(indirect_data_addr, + item_base * indirect_data_stride), + indirect_data_stride, + item_base, + item_count, + count_addr, + indexed); + + anv_batch_advance(&cmd_buffer->batch, draw_cmd_size); + + item_base += item_count; + } +} + +static void +genX(cmd_buffer_flush_generated_draws)(struct anv_cmd_buffer *cmd_buffer) +{ + /* No return address setup means we don't have to do anything */ + if (anv_address_is_null(cmd_buffer->generation_return_addr)) + return; + + struct anv_batch *batch = &cmd_buffer->generation_batch; + + /* Wait for all the generation vertex shader to generate the commands. 
*/ + genX(emit_apply_pipe_flushes)(batch, + cmd_buffer->device, + _3D, + ANV_PIPE_DATA_CACHE_FLUSH_BIT | + ANV_PIPE_CS_STALL_BIT); + +#if GFX_VER >= 12 + anv_batch_emit(batch, GENX(MI_ARB_CHECK), arb) { + arb.PreParserDisableMask = true; + arb.PreParserDisable = false; + } +#endif + +#if GFX_VER < 12 + /* Prior to Gfx12 we cannot disable the CS prefetch, so we have to emit a + * bunch of NOOPs to ensure we do not have generated commands loaded into + * the CS cache prior to them having been generated. + */ + const struct intel_device_info *devinfo = cmd_buffer->device->info; + const enum intel_engine_class engine_class = cmd_buffer->queue_family->engine_class; + for (uint32_t i = 0; i < devinfo->engine_class_prefetch[engine_class] / 4; i++) + anv_batch_emit(batch, GENX(MI_NOOP), noop); +#endif + + /* Return to the main batch. */ + anv_batch_emit(batch, GENX(MI_BATCH_BUFFER_START), bbs) { + bbs.AddressSpaceIndicator = ASI_PPGTT; + bbs.BatchBufferStartAddress = cmd_buffer->generation_return_addr; + } + + cmd_buffer->generation_return_addr = ANV_NULL_ADDRESS; +} + +#endif /* GENX_CMD_GENERATED_INDIRECT_DRAW_H */ diff --git a/src/intel/vulkan/meson.build b/src/intel/vulkan/meson.build index 4ef7cc7bd43..2a1f46dfb34 100644 --- a/src/intel/vulkan/meson.build +++ b/src/intel/vulkan/meson.build @@ -18,6 +18,8 @@ # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE # SOFTWARE. +subdir('shaders') + inc_anv = include_directories('.') anv_flags = [ @@ -43,20 +45,6 @@ anv_entrypoints = custom_target( depend_files : vk_entrypoints_gen_depend_files, ) -float64_spv_h = custom_target( - 'float64_spv.h', - input : [glsl2spirv, float64_glsl_file], - output : 'float64_spv.h', - command : [ - prog_python, '@INPUT@', '@OUTPUT@', - prog_glslang, - '--create-entry', 'main', - '--vn', 'float64_spv_source', - '--glsl-version', '450', - '-Olib', - ] -) - idep_anv_headers = declare_dependency( sources : [anv_entrypoints[0]], include_directories : inc_anv, @@ -126,7 +114,8 @@ foreach g : [['90', ['gfx8_cmd_buffer.c']], _gfx_ver = g[0] libanv_per_hw_ver_libs += static_library( 'anv_per_hw_ver@0@'.format(_gfx_ver), - [anv_per_hw_ver_files, g[1], anv_entrypoints[0]], + [anv_per_hw_ver_files, g[1], anv_entrypoints[0], + generated_draws_spv_h, generated_draws_count_spv_h], include_directories : [ inc_include, inc_src, inc_mapi, inc_mesa, inc_gallium, inc_compiler, inc_intel, ], @@ -152,6 +141,7 @@ libanv_files = files( 'anv_descriptor_set.c', 'anv_device.c', 'anv_formats.c', + 'anv_generated_indirect_draws.c', 'anv_genX.h', 'anv_image.c', 'anv_measure.c', @@ -216,7 +206,7 @@ libanv_common = static_library( c_args : anv_flags, cpp_args : anv_cpp_flags, gnu_symbol_visibility : 'hidden', - dependencies : anv_deps, + dependencies : anv_deps ) libvulkan_intel = shared_library( diff --git a/src/intel/vulkan/shaders/generated_draws.glsl b/src/intel/vulkan/shaders/generated_draws.glsl new file mode 100644 index 00000000000..434b4d6a0ef --- /dev/null +++ b/src/intel/vulkan/shaders/generated_draws.glsl @@ -0,0 +1,101 @@ +/* + * Copyright © 2022 Intel Corporation + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following 
conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + */ + +#version 450 + +/* These 2 bindings will be accessed through A64 messages */ +layout(set = 0, binding = 0, std430) buffer Storage0 { + uint indirect_data[]; +}; + +layout(set = 0, binding = 1, std430) buffer Storage1 { + uint commands[]; +}; + +/* This data will be provided through push constants. */ +layout(set = 0, binding = 2) uniform block { + uint is_indexed; + uint is_predicated; + uint draw_base; + uint draw_count; + uint instance_multiplier; + uint indirect_data_stride; +}; + +void main() +{ + uint item_idx = uint(gl_FragCoord.y) * 8192 + uint(gl_FragCoord.x); + uint indirect_data_offset = item_idx * indirect_data_stride / 4; + uint _3dprim_dw_size = 10; + uint cmd_idx = uint(item_idx) * _3dprim_dw_size; + uint draw_id = draw_base + item_idx; + + if (draw_id < draw_count) { + if (is_indexed != 0) { + /* Loading a VkDrawIndexedIndirectCommand */ + uint index_count = indirect_data[indirect_data_offset + 0]; + uint instance_count = indirect_data[indirect_data_offset + 1] * instance_multiplier; + uint first_index = indirect_data[indirect_data_offset + 2]; + uint vertex_offset = indirect_data[indirect_data_offset + 3]; + uint first_instance = indirect_data[indirect_data_offset + 4]; + + commands[cmd_idx + 0] = (3 << 29 | /* Command Type */ + 3 << 27 | /* Command SubType */ + 3 << 24 | /* 3D Command Opcode */ + 1 << 11 | /* Extended Parameter Enable */ + is_predicated << 8 | + 8 << 0); /* DWord Length */ + commands[cmd_idx + 1] = 1 << 8; /* Indexed */ + commands[cmd_idx + 2] = index_count; /* Vertex Count Per Instance */ + commands[cmd_idx + 3] = first_index; /* Start Vertex Location */ + commands[cmd_idx + 4] = instance_count; /* Instance Count */ + commands[cmd_idx + 5] = first_instance; /* Start Instance Location */ + commands[cmd_idx + 6] = vertex_offset; /* Base Vertex Location */ + commands[cmd_idx + 7] = vertex_offset; /* gl_BaseVertex */ + commands[cmd_idx + 8] = first_instance; /* gl_BaseInstance */ + commands[cmd_idx + 9] = draw_id; /* gl_DrawID */ + } else { + /* Loading a VkDrawIndirectCommand structure */ + uint vertex_count = indirect_data[indirect_data_offset + 0]; + uint instance_count = indirect_data[indirect_data_offset + 1] * instance_multiplier; + uint first_vertex = indirect_data[indirect_data_offset + 2]; + uint first_instance = indirect_data[indirect_data_offset + 3]; + + commands[cmd_idx + 0] = (3 << 29 | /* Command Type */ + 3 << 27 | /* Command SubType */ + 3 << 24 | /* 3D Command Opcode */ + 1 << 11 | /* Extended Parameter Enable */ + is_predicated << 8 | + 8 << 0); /* DWord Length */ + commands[cmd_idx + 1] = 0; + commands[cmd_idx + 2] = vertex_count; /* Vertex Count Per Instance */ + commands[cmd_idx + 3] = first_vertex; /* Start Vertex Location */ + commands[cmd_idx + 4] = instance_count; /* Instance Count */ + commands[cmd_idx + 5] = first_instance; /* Start Instance Location */ + 
commands[cmd_idx + 6] = 0; /* Base Vertex Location */ + commands[cmd_idx + 7] = first_vertex; /* gl_BaseVertex */ + commands[cmd_idx + 8] = first_instance; /* gl_BaseInstance */ + commands[cmd_idx + 9] = draw_id; /* gl_DrawID */ + } + } +} diff --git a/src/intel/vulkan/shaders/generated_draws_count.glsl b/src/intel/vulkan/shaders/generated_draws_count.glsl new file mode 100644 index 00000000000..0cff06753ff --- /dev/null +++ b/src/intel/vulkan/shaders/generated_draws_count.glsl @@ -0,0 +1,118 @@ +/* + * Copyright © 2022 Intel Corporation + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + */ + +#version 450 + +/* These 2 bindings will be accessed through A64 messages */ +layout(set = 0, binding = 0, std430) buffer Storage0 { + uint indirect_data[]; +}; + +layout(set = 0, binding = 1, std430) buffer Storage1 { + uint commands[]; +}; + +/* This data will be provided through push constants. 
*/ +layout(set = 0, binding = 2) uniform block { + uint is_indexed; + uint is_predicated; + uint draw_base; + uint item_count; + uint draw_count; + uint instance_multiplier; + uint indirect_data_stride; + uint end_addr_ldw; + uint end_addr_udw; +}; + +void main() +{ + uint item_idx = uint(gl_FragCoord.y) * 8192 + uint(gl_FragCoord.x); + uint indirect_data_offset = item_idx * indirect_data_stride / 4; + uint _3dprim_dw_size = 10; + uint cmd_idx = item_idx * _3dprim_dw_size; + uint draw_id = draw_base + item_idx; + + if (draw_id < draw_count) { + if (is_indexed != 0) { + /* Loading a VkDrawIndexedIndirectCommand */ + uint index_count = indirect_data[indirect_data_offset + 0]; + uint instance_count = indirect_data[indirect_data_offset + 1] * instance_multiplier; + uint first_index = indirect_data[indirect_data_offset + 2]; + uint vertex_offset = indirect_data[indirect_data_offset + 3]; + uint first_instance = indirect_data[indirect_data_offset + 4]; + + commands[cmd_idx + 0] = (3 << 29 | /* Command Type */ + 3 << 27 | /* Command SubType */ + 3 << 24 | /* 3D Command Opcode */ + 1 << 11 | /* Extended Parameter Enable */ + is_predicated << 8 | + 8 << 0); /* DWord Length */ + commands[cmd_idx + 1] = 1 << 8; /* Indexed */ + commands[cmd_idx + 2] = index_count; /* Vertex Count Per Instance */ + commands[cmd_idx + 3] = first_index; /* Start Vertex Location */ + commands[cmd_idx + 4] = instance_count; /* Instance Count */ + commands[cmd_idx + 5] = first_instance; /* Start Instance Location */ + commands[cmd_idx + 6] = vertex_offset; /* Base Vertex Location */ + commands[cmd_idx + 7] = vertex_offset; /* gl_BaseVertex */ + commands[cmd_idx + 8] = first_instance; /* gl_BaseInstance */ + commands[cmd_idx + 9] = draw_id; /* gl_DrawID */ + } else { + /* Loading a VkDrawIndirectCommand structure */ + uint vertex_count = indirect_data[indirect_data_offset + 0]; + uint instance_count = indirect_data[indirect_data_offset + 1] * instance_multiplier; + uint first_vertex = indirect_data[indirect_data_offset + 2]; + uint first_instance = indirect_data[indirect_data_offset + 3]; + + commands[cmd_idx + 0] = (3 << 29 | /* Command Type */ + 3 << 27 | /* Command SubType */ + 3 << 24 | /* 3D Command Opcode */ + 1 << 11 | /* Extended Parameter Enable */ + is_predicated << 8 | + 8 << 0); /* DWord Length */ + commands[cmd_idx + 1] = 0; + commands[cmd_idx + 2] = vertex_count; /* Vertex Count Per Instance */ + commands[cmd_idx + 3] = first_vertex; /* Start Vertex Location */ + commands[cmd_idx + 4] = instance_count; /* Instance Count */ + commands[cmd_idx + 5] = first_instance; /* Start Instance Location */ + commands[cmd_idx + 6] = 0; /* Base Vertex Location */ + commands[cmd_idx + 7] = first_vertex; /* gl_BaseVertex */ + commands[cmd_idx + 8] = first_instance; /* gl_BaseInstance */ + commands[cmd_idx + 9] = draw_id; /* gl_DrawID */ + } + } else if (draw_id == draw_count) { + commands[cmd_idx + 0] = (0 << 29 | /* Command Type */ + 49 << 23 | /* MI Command Opcode */ + 1 << 8 | /* Address Space Indicator (PPGTT) */ + 1 << 0); /* DWord Length */ + commands[cmd_idx + 1] = end_addr_ldw; + commands[cmd_idx + 2] = end_addr_udw; + } +} diff --git
a/src/intel/vulkan/shaders/meson.build b/src/intel/vulkan/shaders/meson.build new file mode 100644 index 00000000000..9ed504c70d2 --- /dev/null +++ b/src/intel/vulkan/shaders/meson.build @@ -0,0 +1,59 @@ +# Copyright © 2022 Intel Corporation + +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: + +# The above copyright notice and this permission notice shall be included in +# all copies or substantial portions of the Software. + +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. + +float64_spv_h = custom_target( + 'float64_spv.h', + input : [glsl2spirv, float64_glsl_file], + output : 'float64_spv.h', + command : [ + prog_python, '@INPUT@', '@OUTPUT@', + prog_glslang, + '--create-entry', 'main', + '--vn', 'float64_spv_source', + '--glsl-version', '450', + '-Olib', + ] +) + +generated_draws_spv_h = custom_target( + 'generated_draws_spv.h', + input : [glsl2spirv, 'generated_draws.glsl'], + output : 'generated_draws_spv.h', + command : [ + prog_python, '@INPUT@', '@OUTPUT@', + prog_glslang, + '--vn', 'generated_draws_spv_source', + '--glsl-version', '450', + '--stage', 'frag', + ] +) + +generated_draws_count_spv_h = custom_target( + 'generated_draws_count_spv.h', + input : [glsl2spirv, 'generated_draws_count.glsl'], + output : 'generated_draws_count_spv.h', + command : [ + prog_python, '@INPUT@', '@OUTPUT@', + prog_glslang, + '--vn', 'generated_draws_count_spv_source', + '--glsl-version', '450', + '--stage', 'frag', + ] +) diff --git a/src/util/driconf.h b/src/util/driconf.h index 1df74c70260..956169e1e03 100644 --- a/src/util/driconf.h +++ b/src/util/driconf.h @@ -615,4 +615,8 @@ DRI_CONF_OPT_B(fp64_workaround_enabled, def, \ "Use softpf64 when the shader uses float64, but the device doesn't support that type") +#define DRI_CONF_ANV_GENERATED_INDIRECT_THRESHOLD(def) \ + DRI_CONF_OPT_I(generated_indirect_threshold, def, 0, INT32_MAX, \ + "Indirect threshold count above which we start generating commands") + #endif
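The hunk above only declares the `generated_indirect_threshold` driconf option (instantiated with a default of 4 through `DRI_CONF_ANV_GENERATED_INDIRECT_THRESHOLD(4)` and read with `driQueryOptioni()` in anv_device.c). The sketch below is not code from this patch; it is a hypothetical illustration of what the option's help string describes. The helper name and its parameters are invented for the example, and the exact comparison used by the real driver (`>` vs `>=`) may differ.

    #include <stdbool.h>
    #include <stdint.h>

    /* Hypothetical sketch: an indirect draw would be routed to the
     * generation-shader path only when the feature is available (Gfx11+ and
     * not disabled through ANV_ENABLE_GENERATED_INDIRECT_DRAWS) and the draw
     * count exceeds the "generated_indirect_threshold" driconf value.
     */
    static bool
    should_use_generated_draws(bool generated_draws_supported,
                               uint32_t indirect_draw_count,
                               uint32_t generated_indirect_threshold)
    {
       return generated_draws_supported &&
              indirect_draw_count > generated_indirect_threshold;
    }

Like any other driconf option, the threshold can be overridden per application or per driver section in a drirc file, so the default of 4 can be tuned without rebuilding the driver.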