From f123030dcd736e6378af31ff167d29df51d4df30 Mon Sep 17 00:00:00 2001 From: Lionel Landwerlin Date: Thu, 27 Feb 2025 08:05:26 +0200 Subject: [PATCH] anv: implement VK_KHR_device_address_commands Signed-off-by: Lionel Landwerlin Acked-by: Alyssa Rosenzweig Part-of: --- src/intel/vulkan/anv_blorp.c | 217 +++++++++++ src/intel/vulkan/anv_cmd_buffer.c | 89 +++++ src/intel/vulkan/anv_physical_device.c | 4 + src/intel/vulkan/anv_private.h | 22 ++ src/intel/vulkan/genX_cmd_buffer.c | 170 +++++++++ src/intel/vulkan/genX_cmd_compute.c | 10 + src/intel/vulkan/genX_cmd_draw.c | 489 +++++++++++++++++++++++++ src/intel/vulkan/genX_query.c | 34 ++ 8 files changed, 1035 insertions(+) diff --git a/src/intel/vulkan/anv_blorp.c b/src/intel/vulkan/anv_blorp.c index 61aca743509..29f4f3ea3d4 100644 --- a/src/intel/vulkan/anv_blorp.c +++ b/src/intel/vulkan/anv_blorp.c @@ -700,6 +700,41 @@ anv_blorp_execute_on_companion(struct anv_cmd_buffer *cmd_buffer, return false; } +static bool +anv_blorp_blitter_execute_on_companion2(struct anv_cmd_buffer *cmd_buffer, + struct anv_image *image, + uint32_t region_count, + const VkDeviceMemoryImageCopyKHR* regions) +{ + if (!anv_cmd_buffer_is_blitter_queue(cmd_buffer)) + return false; + + bool blorp_execute_on_companion = false; + + for (unsigned r = 0; r < region_count && !blorp_execute_on_companion; r++) { + VkImageAspectFlags aspect_mask = regions[r].imageSubresource.aspectMask; + + enum isl_format linear_format = + anv_get_isl_format(cmd_buffer->device->physical, image->vk.format, + aspect_mask, VK_IMAGE_TILING_LINEAR); + const struct isl_format_layout *linear_fmtl = + isl_format_get_layout(linear_format); + + switch (linear_fmtl->bpb) { + case 96: + /* We can only support linear mode for 96bpp on blitter engine. */ + blorp_execute_on_companion |= + image->vk.tiling != VK_IMAGE_TILING_LINEAR; + break; + default: + blorp_execute_on_companion |= linear_fmtl->bpb % 3 == 0; + break; + } + } + + return blorp_execute_on_companion; +} + void anv_CmdCopyImage2( VkCommandBuffer commandBuffer, const VkCopyImageInfo2* pCopyImageInfo) @@ -915,6 +950,77 @@ void anv_CmdCopyBufferToImage2( } } +void anv_CmdCopyMemoryToImageKHR( + VkCommandBuffer commandBuffer, + const VkCopyDeviceMemoryImageInfoKHR* pCopyMemoryInfo) +{ + ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer); + ANV_FROM_HANDLE(anv_image, dst_image, pCopyMemoryInfo->image); + + bool blorp_execute_on_companion = + anv_blorp_execute_on_companion(cmd_buffer, NULL, dst_image); + + /* Check if any one of the aspects is incompatible with the blitter engine, + * if true, use the companion RCS command buffer for blit operation since 3 + * component formats are not supported natively except 96bpb on the blitter. 
+ */ + blorp_execute_on_companion |= + anv_blorp_blitter_execute_on_companion2(cmd_buffer, dst_image, + pCopyMemoryInfo->regionCount, + pCopyMemoryInfo->pRegions); + + anv_cmd_require_rcs(cmd_buffer, blorp_execute_on_companion) { + struct blorp_batch batch; + anv_blorp_batch_init(cmd_buffer, &batch, BLORP_BATCH_SRC_UNPADDED); + + for (unsigned r = 0; r < pCopyMemoryInfo->regionCount; r++) { + const VkDeviceMemoryImageCopyKHR *region = &pCopyMemoryInfo->pRegions[r]; + const struct vk_image_buffer_layout buffer_layout = + vk_image_memory_copy_layout(&dst_image->vk, region); + + copy_buffer_to_image(cmd_buffer, &batch, + anv_address_from_range_flags( + region->addressRange, + region->addressFlags), + &buffer_layout, + dst_image, region->imageLayout, + region->imageSubresource, + region->imageOffset, region->imageExtent, + true); + } + + anv_blorp_batch_finish(&batch); + + if (dst_image->emu_plane_format != VK_FORMAT_UNDEFINED) { + assert(!anv_cmd_buffer_is_blitter_queue(cmd_buffer)); + const enum anv_pipe_bits pipe_bits = + anv_cmd_buffer_is_compute_queue(cmd_buffer) ? + ANV_PIPE_HDC_PIPELINE_FLUSH_BIT : + ANV_PIPE_RENDER_TARGET_CACHE_FLUSH_BIT; + anv_add_pending_pipe_bits(cmd_buffer, + (batch.flags & BLORP_BATCH_USE_COMPUTE) ? + VK_PIPELINE_STAGE_2_COMPUTE_SHADER_BIT : + VK_PIPELINE_STAGE_2_COLOR_ATTACHMENT_OUTPUT_BIT, + VK_PIPELINE_STAGE_2_TOP_OF_PIPE_BIT, + pipe_bits, + "Copy flush before astc emu"); + + for (unsigned r = 0; r < pCopyMemoryInfo->regionCount; r++) { + const VkDeviceMemoryImageCopyKHR *region = + &pCopyMemoryInfo->pRegions[r]; + const VkOffset3D block_offset = vk_image_offset_to_elements( + &dst_image->vk, region->imageOffset); + const VkExtent3D block_extent = vk_image_extent_to_elements( + &dst_image->vk, region->imageExtent); + anv_astc_emu_process(cmd_buffer, dst_image, + region->imageLayout, + ®ion->imageSubresource, + block_offset, block_extent); + } + } + } +} + static void anv_add_buffer_write_pending_bits(struct anv_cmd_buffer *cmd_buffer, const char *reason) @@ -976,6 +1082,50 @@ void anv_CmdCopyImageToBuffer2( } } +void anv_CmdCopyImageToMemoryKHR( + VkCommandBuffer commandBuffer, + const VkCopyDeviceMemoryImageInfoKHR* pCopyMemoryInfo) +{ + ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer); + ANV_FROM_HANDLE(anv_image, src_image, pCopyMemoryInfo->image); + + bool blorp_execute_on_companion = + anv_blorp_execute_on_companion(cmd_buffer, src_image, NULL); + + /* Check if any one of the aspects is incompatible with the blitter engine, + * if true, use the companion RCS command buffer for blit operation since 3 + * component formats are not supported natively except 96bpb on the blitter. 
+ */ + blorp_execute_on_companion |= + anv_blorp_blitter_execute_on_companion2(cmd_buffer, src_image, + pCopyMemoryInfo->regionCount, + pCopyMemoryInfo->pRegions); + + anv_cmd_require_rcs(cmd_buffer, blorp_execute_on_companion) { + struct blorp_batch batch; + anv_blorp_batch_init(cmd_buffer, &batch, 0); + + for (unsigned r = 0; r < pCopyMemoryInfo->regionCount; r++) { + const VkDeviceMemoryImageCopyKHR *region = &pCopyMemoryInfo->pRegions[r]; + const struct vk_image_buffer_layout memory_layout = + vk_image_memory_copy_layout(&src_image->vk, region); + + copy_buffer_to_image(cmd_buffer, &batch, + anv_address_from_range_flags(region->addressRange, + region->addressFlags), + &memory_layout, + src_image, region->imageLayout, + region->imageSubresource, + region->imageOffset, region->imageExtent, + false); + } + + anv_add_buffer_write_pending_bits(cmd_buffer, "after copy image to buffer"); + + anv_blorp_batch_finish(&batch); + } +} + static bool flip_coords(unsigned *src0, unsigned *src1, unsigned *dst0, unsigned *dst1) { @@ -1235,6 +1385,34 @@ void anv_CmdCopyBuffer2( anv_blorp_batch_finish(&batch); } +void anv_CmdCopyMemoryKHR( + VkCommandBuffer commandBuffer, + const VkCopyDeviceMemoryInfoKHR* pCopyMemoryInfo) +{ + ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer); + + struct blorp_batch batch; + anv_blorp_batch_init(cmd_buffer, &batch, + BLORP_BATCH_SRC_UNPADDED | + (cmd_buffer->state.current_pipeline == + cmd_buffer->device->physical->gpgpu_pipeline_value ? + BLORP_BATCH_USE_COMPUTE : 0)); + + for (unsigned r = 0; r < pCopyMemoryInfo->regionCount; r++) { + const VkDeviceMemoryCopyKHR *region = &pCopyMemoryInfo->pRegions[r]; + copy_memory(cmd_buffer->device, &batch, + anv_address_from_range_flags(region->srcRange, + region->srcFlags), + anv_address_from_range_flags(region->dstRange, + region->dstFlags), + region->srcRange.size); + } + + anv_add_buffer_write_pending_bits(cmd_buffer, "after copy buffer"); + + anv_blorp_batch_finish(&batch); +} + void anv_cmd_buffer_update_addr( struct anv_cmd_buffer* cmd_buffer, @@ -1323,6 +1501,20 @@ void anv_CmdUpdateBuffer( dataSize, pData); } +void anv_CmdUpdateMemoryKHR( + VkCommandBuffer commandBuffer, + const VkDeviceAddressRangeKHR* pDstRange, + VkAddressCommandFlagsKHR dstFlags, + VkDeviceSize dataSize, + const void* pData) +{ + ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer); + + anv_cmd_buffer_update_addr(cmd_buffer, + anv_address_from_range_flags(*pDstRange, dstFlags), + pDstRange->size, pData); +} + void anv_cmd_buffer_fill_area(struct anv_cmd_buffer *cmd_buffer, struct anv_address address, @@ -1413,6 +1605,31 @@ void anv_CmdFillBuffer( anv_add_buffer_write_pending_bits(cmd_buffer, "after fill buffer"); } +void anv_CmdFillMemoryKHR( + VkCommandBuffer commandBuffer, + const VkDeviceAddressRangeKHR* pDstRange, + VkAddressCommandFlagsKHR dstFlags, + uint32_t data) +{ + ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer); + + /* From the Vulkan spec: + * + * "size is the number of bytes to fill, and must be either a multiple + * of 4, or VK_WHOLE_SIZE to fill the range from offset to the end of + * the buffer. If VK_WHOLE_SIZE is used and the remaining size of the + * buffer is not a multiple of 4, then the nearest smaller multiple is + * used." 
+ */ + const VkDeviceSize size = pDstRange->size & ~3ull; + + anv_cmd_buffer_fill_area(cmd_buffer, + anv_address_from_range_flags(*pDstRange, dstFlags), + size, data); + + anv_add_buffer_write_pending_bits(cmd_buffer, "after fill buffer"); +} + static void exec_ccs_op(struct anv_cmd_buffer *cmd_buffer, struct blorp_batch *batch, diff --git a/src/intel/vulkan/anv_cmd_buffer.c b/src/intel/vulkan/anv_cmd_buffer.c index b6270447e3d..d6850e76325 100644 --- a/src/intel/vulkan/anv_cmd_buffer.c +++ b/src/intel/vulkan/anv_cmd_buffer.c @@ -1007,6 +1007,39 @@ void anv_CmdBindVertexBuffers2( } } +void anv_CmdBindVertexBuffers3KHR( + VkCommandBuffer commandBuffer, + uint32_t firstBinding, + uint32_t bindingCount, + const VkBindVertexBuffer3InfoKHR* pBindingInfos) +{ + ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer); + struct anv_vertex_binding *vb = cmd_buffer->state.vertex_bindings; + + /* We have to defer setting up vertex buffer since we need the buffer + * stride from the pipeline. */ + + assert(firstBinding + bindingCount <= get_max_vbs(cmd_buffer->device->info)); + for (uint32_t i = 0; i < bindingCount; i++) { + if (vb[firstBinding + i].addr != pBindingInfos[i].addressRange.address || + vb[firstBinding + i].size != pBindingInfos[i].addressRange.size) { + vb[firstBinding + i] = (struct anv_vertex_binding) { + .addr = pBindingInfos[i].addressRange.address, + .size = pBindingInfos[i].addressRange.size, + .mocs = anv_mocs(cmd_buffer->device, NULL, + ((pBindingInfos[i].addressFlags & + VK_ADDRESS_COMMAND_PROTECTED_BIT_KHR) ? + ISL_SURF_USAGE_PROTECTED_BIT : 0) | + ISL_SURF_USAGE_VERTEX_BUFFER_BIT), + }; + cmd_buffer->state.gfx.vb_dirty |= 1 << (firstBinding + i); + } + } + + vk_cmd_set_vertex_binding_strides2(&cmd_buffer->vk, firstBinding, + bindingCount, pBindingInfos); +} + void anv_CmdBindIndexBuffer2( VkCommandBuffer commandBuffer, VkBuffer _buffer, @@ -1038,6 +1071,32 @@ void anv_CmdBindIndexBuffer2( } } +void anv_CmdBindIndexBuffer3KHR( + VkCommandBuffer commandBuffer, + const VkBindIndexBuffer3InfoKHR* pInfo) +{ + ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer); + + if (cmd_buffer->state.gfx.index_type != pInfo->indexType) { + cmd_buffer->state.gfx.index_type = pInfo->indexType; + cmd_buffer->state.gfx.dirty |= ANV_CMD_DIRTY_INDEX_TYPE; + } + + vk_cmd_set_index_buffer_type(&cmd_buffer->vk, pInfo->indexType); + + if (cmd_buffer->state.gfx.index_addr != pInfo->addressRange.address || + cmd_buffer->state.gfx.index_size != pInfo->addressRange.size) { + cmd_buffer->state.gfx.index_addr = pInfo->addressRange.address; + cmd_buffer->state.gfx.index_size = pInfo->addressRange.size; + cmd_buffer->state.gfx.index_mocs = + anv_mocs(cmd_buffer->device, NULL, + ((pInfo->addressFlags & + VK_ADDRESS_COMMAND_PROTECTED_BIT_KHR) ? + ISL_SURF_USAGE_PROTECTED_BIT : 0) | + ISL_SURF_USAGE_INDEX_BUFFER_BIT); + cmd_buffer->state.gfx.dirty |= ANV_CMD_DIRTY_INDEX_BUFFER; + } +} void anv_CmdBindTransformFeedbackBuffersEXT( VkCommandBuffer commandBuffer, @@ -1071,6 +1130,36 @@ void anv_CmdBindTransformFeedbackBuffersEXT( } } +void anv_CmdBindTransformFeedbackBuffers2EXT( + VkCommandBuffer commandBuffer, + uint32_t firstBinding, + uint32_t bindingCount, + const VkBindTransformFeedbackBuffer2InfoEXT* pBindingInfos) +{ + ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer); + struct anv_xfb_binding *xfb = cmd_buffer->state.xfb_bindings; + + /* We have to defer setting up vertex buffer since we need the buffer + * stride from the pipeline. 
*/ + + assert(firstBinding + bindingCount <= MAX_XFB_BUFFERS); + for (uint32_t i = 0; i < bindingCount; i++) { + if (pBindingInfos[i].addressRange.size == 0) { + xfb[firstBinding + i] = (struct anv_xfb_binding) { 0 }; + } else { + xfb[firstBinding + i] = (struct anv_xfb_binding) { + .addr = pBindingInfos[i].addressRange.address, + .size = pBindingInfos[i].addressRange.size, + .mocs = anv_mocs(cmd_buffer->device, NULL, + ((pBindingInfos[i].addressFlags & + VK_ADDRESS_COMMAND_PROTECTED_BIT_KHR) ? + ISL_SURF_USAGE_PROTECTED_BIT : 0) | + ISL_SURF_USAGE_STREAM_OUT_BIT), + }; + } + } +} + enum isl_format anv_isl_format_for_descriptor_type(const struct anv_device *device, VkDescriptorType type) diff --git a/src/intel/vulkan/anv_physical_device.c b/src/intel/vulkan/anv_physical_device.c index 2ca07d9bc3e..5b723093e86 100644 --- a/src/intel/vulkan/anv_physical_device.c +++ b/src/intel/vulkan/anv_physical_device.c @@ -159,6 +159,7 @@ get_device_extensions(const struct anv_physical_device *device, .KHR_depth_stencil_resolve = true, .KHR_descriptor_update_template = true, .KHR_device_group = true, + .KHR_device_address_commands = true, .KHR_draw_indirect_count = true, .KHR_driver_properties = true, .KHR_dynamic_rendering = true, @@ -1051,6 +1052,9 @@ get_features(const struct anv_physical_device *pdevice, /* VK_KHR_maintenance11 */ .maintenance11 = true, + + /* VK_KHR_device_address_commands */ + .deviceAddressCommands = true, }; /* The new DOOM and Wolfenstein games require depthBounds without diff --git a/src/intel/vulkan/anv_private.h b/src/intel/vulkan/anv_private.h index 3ad2abfd9a2..5bbc06bbdf8 100644 --- a/src/intel/vulkan/anv_private.h +++ b/src/intel/vulkan/anv_private.h @@ -530,6 +530,28 @@ anv_address_from_u64(uint64_t addr_u64) }; } +static inline struct anv_address +anv_address_from_range_flags(VkDeviceAddressRangeKHR range, + VkAddressCommandFlagsKHR flags) +{ + return (struct anv_address) { + .bo = NULL, + .offset = range.address, + .protected = (flags & VK_ADDRESS_COMMAND_PROTECTED_BIT_KHR) != 0, + }; +} + +static inline struct anv_address +anv_address_from_strided_range_flags(VkStridedDeviceAddressRangeKHR range, + VkAddressCommandFlagsKHR flags) +{ + return (struct anv_address) { + .bo = NULL, + .offset = range.address, + .protected = (flags & VK_ADDRESS_COMMAND_PROTECTED_BIT_KHR) != 0, + }; +} + static inline bool anv_address_is_null(struct anv_address addr) { diff --git a/src/intel/vulkan/genX_cmd_buffer.c b/src/intel/vulkan/genX_cmd_buffer.c index c1a54aaec56..17c1b31d42e 100644 --- a/src/intel/vulkan/genX_cmd_buffer.c +++ b/src/intel/vulkan/genX_cmd_buffer.c @@ -4936,6 +4936,27 @@ cmd_buffer_barrier_video(struct anv_cmd_buffer *cmd_buffer, break; } + const VkMemoryRangeBarriersInfoKHR *mem_range_barriers = + vk_find_struct_const(dep_infos->pNext, MEMORY_RANGE_BARRIERS_INFO_KHR); + for (uint32_t i = 0; mem_range_barriers && i < mem_range_barriers->memoryRangeBarrierCount; i++) { + const VkMemoryRangeBarrierKHR *mem_barrier = + &mem_range_barriers->pMemoryRangeBarriers[i]; + const VkMemoryBarrierAccessFlags3KHR *barrier3 = + vk_find_struct_const(mem_barrier->pNext, + MEMORY_BARRIER_ACCESS_FLAGS_3_KHR); + + /* Flush the cache if something is written by the video operations and + * used by any other stages except video encode/decode stage. + */ + if (stage_is_video(mem_barrier->srcStageMask) && + mask_is_write(mem_barrier->srcAccessMask, + barrier3 ? 
barrier3->srcAccessMask3 : 0) && + !stage_is_video(mem_barrier->dstStageMask)) { + flush_llc = true; + break; + } + } + if (flush_ccs || flush_llc || !anv_address_is_null(signal_addr)) { anv_batch_emit(&cmd_buffer->batch, GENX(MI_FLUSH_DW), fd) { #if GFX_VERx10 >= 125 @@ -5051,6 +5072,26 @@ cmd_buffer_barrier_blitter(struct anv_cmd_buffer *cmd_buffer, } } + const VkMemoryRangeBarriersInfoKHR *mem_range_barriers = + vk_find_struct_const(dep_info->pNext, MEMORY_RANGE_BARRIERS_INFO_KHR); + for (uint32_t i = 0; mem_range_barriers && i < mem_range_barriers->memoryRangeBarrierCount; i++) { + const VkMemoryRangeBarrierKHR *mem_barrier = + &mem_range_barriers->pMemoryRangeBarriers[i]; + const VkMemoryBarrierAccessFlags3KHR *barrier3 = + vk_find_struct_const(mem_barrier->pNext, + MEMORY_BARRIER_ACCESS_FLAGS_3_KHR); + + /* Flush the cache if something is written by the transfer command + * and used by any other stages except transfer stage. + */ + if (stage_is_transfer(mem_barrier->srcStageMask) && + mask_is_write(mem_barrier->srcAccessMask, + barrier3 ? barrier3->srcAccessMask3 : 0)) { + flush_llc = true; + break; + } + } + /* We cannot gather more information than that. */ if (flush_ccs && flush_llc) break; @@ -5366,6 +5407,53 @@ cmd_buffer_accumulate_barrier_bits(struct anv_cmd_buffer *cmd_buffer, if (anv_image_is_sparse(image) && mask_is_write(src_flags, barrier3 ? barrier3->srcAccessMask3 : 0)) apply_sparse_flushes = true; +#endif + } + + const VkMemoryRangeBarriersInfoKHR *mem_range_barriers = + vk_find_struct_const(dep_info->pNext, MEMORY_RANGE_BARRIERS_INFO_KHR); + for (uint32_t i = 0; mem_range_barriers && i < mem_range_barriers->memoryRangeBarrierCount; i++) { + const VkMemoryRangeBarrierKHR *mem_barrier = + &mem_range_barriers->pMemoryRangeBarriers[i]; + const VkMemoryBarrierAccessFlags3KHR *barrier3 = + vk_find_struct_const(mem_barrier->pNext, + MEMORY_BARRIER_ACCESS_FLAGS_3_KHR); + + if (barrier3) { + src_flags3 |= barrier3->srcAccessMask3; + dst_flags3 |= barrier3->dstAccessMask3; + } + + src_flags |= mem_barrier->srcAccessMask; + dst_flags |= mem_barrier->dstAccessMask; + + src_stages |= mem_barrier->srcStageMask; + dst_stages |= mem_barrier->dstStageMask; + + /* Shader writes to buffers that could then be written by a transfer + * command (including queries). + */ + if (stage_is_shader(mem_barrier->srcStageMask) && + mask_is_shader_write(mem_barrier->srcAccessMask, + barrier3 ? barrier3->srcAccessMask3 : 0) && + stage_is_transfer(mem_barrier->dstStageMask)) { + cmd_buffer->state.queries.buffer_write_bits |= + ANV_QUERY_COMPUTE_WRITES_PENDING_BITS; + } + + if (stage_is_transfer(mem_barrier->srcStageMask) && + mask_is_transfer_write(mem_barrier->srcAccessMask) && + cmd_buffer_has_pending_copy_query(cmd_buffer)) + flush_query_copies = true; + +#if GFX_VER < 20 + /* There's no way of knowing if this memory barrier is related to + * sparse buffers! This is pretty horrible. + */ + if (mask_is_write(src_flags, + barrier3 ? 
barrier3->srcAccessMask3 : 0) && + p_atomic_read(&device->num_sparse_resources) > 0) + apply_sparse_flushes = true; #endif } } @@ -6958,6 +7046,49 @@ void genX(CmdBeginConditionalRenderingEXT)( mi_ult(&b, mi_imm(0), value)); } +void genX(CmdBeginConditionalRendering2EXT)( + VkCommandBuffer commandBuffer, + const VkConditionalRenderingBeginInfo2EXT* pConditionalRenderingBegin) +{ + ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer); + struct anv_cmd_state *cmd_state = &cmd_buffer->state; + struct anv_address value_address = + anv_address_from_u64(pConditionalRenderingBegin->addressRange.address); + + const bool isInverted = pConditionalRenderingBegin->flags & + VK_CONDITIONAL_RENDERING_INVERTED_BIT_EXT; + + cmd_state->conditional_render_enabled = true; + + genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer); + + struct mi_builder b; + mi_builder_init(&b, cmd_buffer->device->info, &cmd_buffer->batch); + const uint32_t mocs = anv_mocs_for_address(cmd_buffer->device, &value_address); + mi_builder_set_mocs(&b, mocs); + + /* Section 19.4 of the Vulkan 1.1.85 spec says: + * + * If the value of the predicate in buffer memory changes + * while conditional rendering is active, the rendering commands + * may be discarded in an implementation-dependent way. + * Some implementations may latch the value of the predicate + * upon beginning conditional rendering while others + * may read it before every rendering command. + * + * So it's perfectly fine to read a value from the buffer once. + */ + struct mi_value value = mi_mem32(value_address); + + /* Precompute predicate result, it is necessary to support secondary + * command buffers since it is unknown if conditional rendering is + * inverted when populating them. + */ + mi_store(&b, mi_reg64(ANV_PREDICATE_RESULT_REG), + isInverted ? mi_uge(&b, mi_imm(0), value) : + mi_ult(&b, mi_imm(0), value)); +} + void genX(CmdEndConditionalRenderingEXT)( VkCommandBuffer commandBuffer) { @@ -7673,6 +7804,45 @@ genX(CmdWriteBufferMarker2AMD)(VkCommandBuffer commandBuffer, trace_intel_end_write_buffer_marker(&cmd_buffer->trace); } +void genX(CmdWriteMarkerToMemoryAMD)( + VkCommandBuffer commandBuffer, + const VkMemoryMarkerInfoAMD* pInfo) +{ + ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer); + + /* The barriers inserted by the application to make dstBuffer writable + * should already have the L1/L2 cache flushes. On platforms where the + * command streamer is not coherent with L3, we need an additional set of + * cache flushes. + */ + enum anv_pipe_bits bits = + (ANV_DEVINFO_HAS_COHERENT_L3_CS(cmd_buffer->device->info) ? 0 : + (ANV_PIPE_DATA_CACHE_FLUSH_BIT | ANV_PIPE_TILE_CACHE_FLUSH_BIT)) | + ANV_PIPE_END_OF_PIPE_SYNC_BIT; + + trace_intel_begin_write_buffer_marker(&cmd_buffer->trace); + + anv_add_pending_pipe_bits(cmd_buffer, pInfo->stage, + VK_PIPELINE_STAGE_2_TOP_OF_PIPE_BIT, + bits, "write buffer marker"); + genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer); + + struct mi_builder b; + mi_builder_init(&b, cmd_buffer->device->info, &cmd_buffer->batch); + + /* Emitting a PIPE_CONTROL with Post-Sync Op = Write Immediate Data + * would be the logical way to implement this extension, as it could + * do a pipelined marker write. Unfortunately, it requires writing + * whole 64-bit QWords, and VK_AMD_buffer_marker requires writing a + * 32-bit value. MI_STORE_DATA_IMM is the only good way to do that, + * and unfortunately it requires stalling. 
+ */ + mi_store(&b, mi_mem32(anv_address_from_u64(pInfo->dstRange.address)), + mi_imm(pInfo->marker)); + + trace_intel_end_write_buffer_marker(&cmd_buffer->trace); +} + void genX(cmd_write_buffer_cp)(struct anv_cmd_buffer *cmd_buffer, VkDeviceAddress dstAddr, diff --git a/src/intel/vulkan/genX_cmd_compute.c b/src/intel/vulkan/genX_cmd_compute.c index 63478c35701..082ea05d786 100644 --- a/src/intel/vulkan/genX_cmd_compute.c +++ b/src/intel/vulkan/genX_cmd_compute.c @@ -886,6 +886,16 @@ void genX(CmdDispatchIndirect)( genX(cmd_buffer_dispatch_indirect)(cmd_buffer, addr, false); } +void genX(CmdDispatchIndirect2KHR)( + VkCommandBuffer commandBuffer, + const VkDispatchIndirect2InfoKHR* pInfo) +{ + ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer); + + genX(cmd_buffer_dispatch_indirect)( + cmd_buffer, anv_address_from_u64(pInfo->addressRange.address), false); +} + struct anv_address genX(cmd_buffer_ray_query_globals)(struct anv_cmd_buffer *cmd_buffer) { diff --git a/src/intel/vulkan/genX_cmd_draw.c b/src/intel/vulkan/genX_cmd_draw.c index 377ea808562..5ffe6124a60 100644 --- a/src/intel/vulkan/genX_cmd_draw.c +++ b/src/intel/vulkan/genX_cmd_draw.c @@ -1774,6 +1774,97 @@ void genX(CmdDrawIndirectByteCountEXT)( gfx->fs_source_hash); } +void genX(CmdDrawIndirectByteCount2EXT)( + VkCommandBuffer commandBuffer, + uint32_t instanceCount, + uint32_t firstInstance, + const VkBindTransformFeedbackBuffer2InfoEXT* pCounterInfo, + uint32_t counterOffset, + uint32_t vertexStride) +{ + ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer); + const struct anv_cmd_graphics_state *gfx = &cmd_buffer->state.gfx; + + /* firstVertex is always zero for this draw function */ + const uint32_t firstVertex = 0; + + if (anv_batch_has_error(&cmd_buffer->batch)) + return; + + anv_measure_snapshot(cmd_buffer, + INTEL_SNAPSHOT_DRAW, + "draw indirect byte count", + instanceCount * gfx->instance_multiplier); + trace_intel_begin_draw_indirect_byte_count(&cmd_buffer->trace); + + /* Select pipeline here to allow + * cmd_buffer_emit_vertex_constants_and_flush() without flushing before + * emit_base_vertex_instance() & emit_draw_index(). 
+ */ + genX(flush_pipeline_select_3d)(cmd_buffer); + +#if GFX_VER < 11 + const struct brw_vs_prog_data *vs_prog_data = get_gfx_vs_prog_data(gfx); + if (vs_prog_data->uses_firstvertex || + vs_prog_data->uses_baseinstance) + emit_base_vertex_instance(cmd_buffer, firstVertex, firstInstance); + if (vs_prog_data->uses_drawid) + emit_draw_index(cmd_buffer, 0); +#endif + + cmd_buffer_flush_gfx(cmd_buffer); + + if (cmd_buffer->state.conditional_render_enabled) + genX(cmd_emit_conditional_render_predicate)(cmd_buffer); + + struct anv_address counter_addr = + anv_address_from_u64(pCounterInfo->addressRange.address); + + struct mi_builder b; + mi_builder_init(&b, cmd_buffer->device->info, &cmd_buffer->batch); + const uint32_t mocs = anv_mocs_for_address(cmd_buffer->device, &counter_addr); + mi_builder_set_mocs(&b, mocs); + struct mi_value count = mi_mem32(counter_addr); + if (counterOffset) + count = mi_isub(&b, count, mi_imm(counterOffset)); + count = mi_udiv32_imm(&b, count, vertexStride); + mi_store(&b, mi_reg32(GFX7_3DPRIM_VERTEX_COUNT), count); + + mi_store(&b, mi_reg32(GFX7_3DPRIM_START_VERTEX), mi_imm(firstVertex)); + mi_store(&b, mi_reg32(GFX7_3DPRIM_INSTANCE_COUNT), + mi_imm(instanceCount * gfx->instance_multiplier)); + mi_store(&b, mi_reg32(GFX7_3DPRIM_START_INSTANCE), mi_imm(firstInstance)); + mi_store(&b, mi_reg32(GFX7_3DPRIM_BASE_VERTEX), mi_imm(0)); + +#if GFX_VER >= 11 + mi_store(&b, mi_reg32(GEN11_3DPRIM_XP_BASE_VERTEX), + mi_imm(firstVertex)); + /* GEN11_3DPRIM_XP_BASE_INSTANCE is implicit */ + mi_store(&b, mi_reg32(GEN11_3DPRIM_XP_DRAW_ID), mi_imm(0)); +#endif + + cmd_buffer_pre_draw_wa(cmd_buffer); + + anv_batch_emit(&cmd_buffer->batch, _3DPRIMITIVE_DIRECT, prim) { +#if GFX_VERx10 >= 125 + prim.TBIMREnable = cmd_buffer->state.gfx.dyn_state.use_tbimr; +#endif + prim.IndirectParameterEnable = true; + prim.PredicateEnable = cmd_buffer->state.conditional_render_enabled; + prim.VertexAccessType = SEQUENTIAL; +#if GFX_VER >= 11 + prim.ExtendedParametersPresent = true; +#endif + } + + cmd_buffer_post_draw_wa(cmd_buffer, 1, SEQUENTIAL); + + trace_intel_end_draw_indirect_byte_count(&cmd_buffer->trace, + instanceCount * gfx->instance_multiplier, + gfx->vs_source_hash, + gfx->fs_source_hash); +} + static void load_indirect_parameters(struct anv_cmd_buffer *cmd_buffer, struct anv_address addr, @@ -2106,6 +2197,48 @@ void genX(CmdDrawIndirect)( gfx->fs_source_hash); } +void genX(CmdDrawIndirect2KHR)( + VkCommandBuffer commandBuffer, + const VkDrawIndirect2InfoKHR* pInfo) +{ + ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer); + const struct anv_cmd_graphics_state *gfx = &cmd_buffer->state.gfx; + + if (anv_batch_has_error(&cmd_buffer->batch)) + return; + + anv_measure_snapshot(cmd_buffer, + INTEL_SNAPSHOT_DRAW, + "draw indirect", + pInfo->drawCount); + trace_intel_begin_draw_indirect(&cmd_buffer->trace); + + struct anv_address indirect_data_addr = + anv_address_from_u64(pInfo->addressRange.address); + uint64_t stride = + MAX2(pInfo->addressRange.stride, sizeof(VkDrawIndirectCommand)); + + if (execute_indirect_draw_supported(cmd_buffer)) { + genX(cmd_buffer_emit_execute_indirect_draws)( + cmd_buffer, indirect_data_addr, stride, + ANV_NULL_ADDRESS /* count_addr */, pInfo->drawCount, + VK_CMD_DRAW_INDIRECT); + } else if (anv_use_generated_draws(cmd_buffer, pInfo->drawCount)) { + genX(cmd_buffer_emit_indirect_generated_draws)( + cmd_buffer,indirect_data_addr, stride, + ANV_NULL_ADDRESS /* count_addr */, pInfo->drawCount, + false /* indexed */); + } else { + emit_indirect_draws(cmd_buffer, + 
indirect_data_addr, stride, + pInfo->drawCount, false /* indexed */); + } + + trace_intel_end_draw_indirect(&cmd_buffer->trace, pInfo->drawCount, + gfx->vs_source_hash, + gfx->fs_source_hash); +} + void genX(CmdDrawIndexedIndirect)( VkCommandBuffer commandBuffer, VkBuffer _buffer, @@ -2158,6 +2291,47 @@ void genX(CmdDrawIndexedIndirect)( gfx->fs_source_hash); } +void genX(CmdDrawIndexedIndirect2KHR)( + VkCommandBuffer commandBuffer, + const VkDrawIndirect2InfoKHR* pInfo) +{ + ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer); + const struct anv_cmd_graphics_state *gfx = &cmd_buffer->state.gfx; + + if (anv_batch_has_error(&cmd_buffer->batch)) + return; + + anv_measure_snapshot(cmd_buffer, + INTEL_SNAPSHOT_DRAW, + "draw indexed indirect", + pInfo->drawCount); + trace_intel_begin_draw_indexed_indirect(&cmd_buffer->trace); + + struct anv_address indirect_data_addr = + anv_address_from_u64(pInfo->addressRange.address); + uint64_t stride = + MAX2(pInfo->addressRange.stride, sizeof(VkDrawIndexedIndirectCommand)); + + if (execute_indirect_draw_supported(cmd_buffer)) { + genX(cmd_buffer_emit_execute_indirect_draws)( + cmd_buffer, indirect_data_addr, stride, + ANV_NULL_ADDRESS /* count_addr */, pInfo->drawCount, + VK_CMD_DRAW_INDEXED_INDIRECT); + } else if (anv_use_generated_draws(cmd_buffer, pInfo->drawCount)) { + genX(cmd_buffer_emit_indirect_generated_draws)( + cmd_buffer, indirect_data_addr, stride, + ANV_NULL_ADDRESS /* count_addr */, pInfo->drawCount, + true /* indexed */); + } else { + emit_indirect_draws(cmd_buffer, indirect_data_addr, stride, + pInfo->drawCount, true /* indexed */); + } + + trace_intel_end_draw_indexed_indirect(&cmd_buffer->trace, pInfo->drawCount, + gfx->vs_source_hash, + gfx->fs_source_hash); +} + #define MI_PREDICATE_SRC0 0x2400 #define MI_PREDICATE_SRC1 0x2408 #define MI_PREDICATE_RESULT 0x2418 @@ -2365,6 +2539,50 @@ void genX(CmdDrawIndirectCount)( gfx->fs_source_hash); } +void genX(CmdDrawIndirectCount2KHR)( + VkCommandBuffer commandBuffer, + const VkDrawIndirectCount2InfoKHR* pInfo) +{ + ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer); + const struct anv_cmd_graphics_state *gfx = &cmd_buffer->state.gfx; + + if (anv_batch_has_error(&cmd_buffer->batch)) + return; + + anv_measure_snapshot(cmd_buffer, + INTEL_SNAPSHOT_DRAW, + "draw indirect count", + 0); + trace_intel_begin_draw_indirect_count(&cmd_buffer->trace); + + struct anv_address indirect_data_address = + anv_address_from_u64(pInfo->addressRange.address); + uint64_t stride = + MAX2(pInfo->addressRange.stride, sizeof(VkDrawIndirectCommand)); + struct anv_address count_address = + anv_address_from_u64(pInfo->countAddressRange.address); + + if (execute_indirect_draw_supported(cmd_buffer)) { + genX(cmd_buffer_emit_execute_indirect_draws)( + cmd_buffer, indirect_data_address, stride, + count_address, pInfo->maxDrawCount, + VK_CMD_DRAW_INDIRECT_COUNT); + } else if (anv_use_generated_draws(cmd_buffer, pInfo->maxDrawCount)) { + genX(cmd_buffer_emit_indirect_generated_draws)( + cmd_buffer, indirect_data_address, stride, + count_address, pInfo->maxDrawCount, false /* indexed */); + } else { + emit_indirect_count_draws( + cmd_buffer, indirect_data_address, stride, + count_address, pInfo->maxDrawCount, false /* indexed */); + } + + trace_intel_end_draw_indirect_count(&cmd_buffer->trace, + anv_address_utrace(count_address), + gfx->vs_source_hash, + gfx->fs_source_hash); +} + void genX(CmdDrawIndexedIndirectCount)( VkCommandBuffer commandBuffer, VkBuffer _buffer, @@ -2425,6 +2643,51 @@ void 
genX(CmdDrawIndexedIndirectCount)( gfx->fs_source_hash); } +void genX(CmdDrawIndexedIndirectCount2KHR)( + VkCommandBuffer commandBuffer, + const VkDrawIndirectCount2InfoKHR* pInfo) +{ + ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer); + const struct anv_cmd_graphics_state *gfx = &cmd_buffer->state.gfx; + + if (anv_batch_has_error(&cmd_buffer->batch)) + return; + + anv_measure_snapshot(cmd_buffer, + INTEL_SNAPSHOT_DRAW, + "draw indexed indirect count", + 0); + trace_intel_begin_draw_indexed_indirect_count(&cmd_buffer->trace); + + struct anv_address indirect_data_address = + anv_address_from_u64(pInfo->addressRange.address); + uint64_t stride = + MAX2(pInfo->addressRange.stride, sizeof(VkDrawIndexedIndirectCommand)); + struct anv_address count_address = + anv_address_from_u64(pInfo->countAddressRange.address); + + if (execute_indirect_draw_supported(cmd_buffer)) { + genX(cmd_buffer_emit_execute_indirect_draws)( + cmd_buffer, indirect_data_address, stride, + count_address, pInfo->maxDrawCount, + VK_CMD_DRAW_INDEXED_INDIRECT_COUNT); + } else if (anv_use_generated_draws(cmd_buffer, pInfo->maxDrawCount)) { + genX(cmd_buffer_emit_indirect_generated_draws)( + cmd_buffer, indirect_data_address, stride, + count_address, pInfo->maxDrawCount, + true /* indexed */); + } else { + emit_indirect_count_draws( + cmd_buffer, indirect_data_address, stride, + count_address, pInfo->maxDrawCount, true /* indexed */); + } + + trace_intel_end_draw_indexed_indirect_count(&cmd_buffer->trace, + anv_address_utrace(count_address), + gfx->vs_source_hash, + gfx->fs_source_hash); +} + void genX(CmdBeginTransformFeedbackEXT)( VkCommandBuffer commandBuffer, uint32_t firstCounterBuffer, @@ -2481,6 +2744,59 @@ void genX(CmdBeginTransformFeedbackEXT)( cmd_buffer->state.gfx.dirty |= ANV_CMD_DIRTY_XFB_ENABLE; } +void genX(CmdBeginTransformFeedback2EXT)( + VkCommandBuffer commandBuffer, + uint32_t firstCounterRange, + uint32_t counterRangeCount, + const VkBindTransformFeedbackBuffer2InfoEXT* pCounterInfos) +{ + ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer); + + assert(firstCounterRange < MAX_XFB_BUFFERS); + assert(counterRangeCount <= MAX_XFB_BUFFERS); + assert(firstCounterRange + counterRangeCount <= MAX_XFB_BUFFERS); + + trace_intel_begin_xfb(&cmd_buffer->trace); + + /* From the SKL PRM Vol. 2c, SO_WRITE_OFFSET: + * + * "Ssoftware must ensure that no HW stream output operations can be in + * process or otherwise pending at the point that the MI_LOAD/STORE + * commands are processed. This will likely require a pipeline flush." + */ + anv_add_pending_pipe_bits(cmd_buffer, + VK_PIPELINE_STAGE_2_PRE_RASTERIZATION_SHADERS_BIT, + VK_PIPELINE_STAGE_2_TOP_OF_PIPE_BIT, + ANV_PIPE_CS_STALL_BIT, + "begin transform feedback"); + genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer); + + struct mi_builder b; + mi_builder_init(&b, cmd_buffer->device->info, &cmd_buffer->batch); + + for (uint32_t idx = 0; idx < MAX_XFB_BUFFERS; idx++) { + uint32_t cb_idx = idx - firstCounterRange; + /* If we have a counter buffer, this is a resume so we need to load the + * value into the streamout offset register. Otherwise, this is a begin + * and we need to reset it to zero. 
+ */ + if (pCounterInfos && + idx >= firstCounterRange && + idx - firstCounterRange < counterRangeCount && + pCounterInfos[cb_idx].addressRange.size != 0) { + mi_store(&b, mi_reg32(GENX(SO_WRITE_OFFSET0_num) + idx * 4), + mi_mem32(anv_address_from_u64( + pCounterInfos[cb_idx].addressRange.address))); + } else { + mi_store(&b, mi_reg32(GENX(SO_WRITE_OFFSET0_num) + idx * 4), + mi_imm(0)); + } + } + + cmd_buffer->state.xfb_enabled = true; + cmd_buffer->state.gfx.dirty |= ANV_CMD_DIRTY_XFB_ENABLE; +} + void genX(CmdEndTransformFeedbackEXT)( VkCommandBuffer commandBuffer, uint32_t firstCounterBuffer, @@ -2535,6 +2851,54 @@ void genX(CmdEndTransformFeedbackEXT)( cmd_buffer->state.gfx.dirty |= ANV_CMD_DIRTY_XFB_ENABLE; } +void genX(CmdEndTransformFeedback2EXT)( + VkCommandBuffer commandBuffer, + uint32_t firstCounterRange, + uint32_t counterRangeCount, + const VkBindTransformFeedbackBuffer2InfoEXT* pCounterInfos) +{ + ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer); + + assert(firstCounterRange < MAX_XFB_BUFFERS); + assert(counterRangeCount <= MAX_XFB_BUFFERS); + assert(firstCounterRange + counterRangeCount <= MAX_XFB_BUFFERS); + + /* From the SKL PRM Vol. 2c, SO_WRITE_OFFSET: + * + * "Ssoftware must ensure that no HW stream output operations can be in + * process or otherwise pending at the point that the MI_LOAD/STORE + * commands are processed. This will likely require a pipeline flush." + */ + anv_add_pending_pipe_bits(cmd_buffer, + VK_PIPELINE_STAGE_2_PRE_RASTERIZATION_SHADERS_BIT, + VK_PIPELINE_STAGE_2_TOP_OF_PIPE_BIT, + ANV_PIPE_CS_STALL_BIT, + "end transform feedback"); + genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer); + + for (uint32_t cb_idx = 0; cb_idx < counterRangeCount; cb_idx++) { + unsigned idx = firstCounterRange + cb_idx; + + /* If we have a counter buffer, this is a resume so we need to load the + * value into the streamout offset register. Otherwise, this is a begin + * and we need to reset it to zero. 
+ */ + if (cb_idx < counterRangeCount && + pCounterInfos[cb_idx].addressRange.size != 0) { + anv_batch_emit(&cmd_buffer->batch, GENX(MI_STORE_REGISTER_MEM), srm) { + srm.MemoryAddress = anv_address_from_u64( + pCounterInfos[cb_idx].addressRange.address); + srm.RegisterAddress = GENX(SO_WRITE_OFFSET0_num) + idx * 4; + } + } + } + + trace_intel_end_xfb(&cmd_buffer->trace); + + cmd_buffer->state.xfb_enabled = false; + cmd_buffer->state.gfx.dirty |= ANV_CMD_DIRTY_XFB_ENABLE; +} + #if GFX_VERx10 >= 125 void @@ -2673,6 +3037,65 @@ genX(CmdDrawMeshTasksIndirectEXT)( trace_intel_end_draw_mesh_indirect(&cmd_buffer->trace, drawCount); } +void genX(CmdDrawMeshTasksIndirect2EXT)( + VkCommandBuffer commandBuffer, + const VkDrawIndirect2InfoKHR* pInfo) +{ + ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer); + struct anv_cmd_graphics_state *gfx = &cmd_buffer->state.gfx; + const struct brw_task_prog_data *task_prog_data = get_gfx_task_prog_data(gfx); + const struct brw_mesh_prog_data *mesh_prog_data = get_gfx_mesh_prog_data(gfx); + struct anv_cmd_state *cmd_state = &cmd_buffer->state; + + if (anv_batch_has_error(&cmd_buffer->batch)) + return; + + anv_measure_snapshot(cmd_buffer, + INTEL_SNAPSHOT_DRAW, + "draw mesh indirect", pInfo->drawCount); + + struct anv_address indirect_data_addr = + anv_address_from_u64(pInfo->addressRange.address); + uint64_t stride = + MAX2(pInfo->addressRange.stride, sizeof(VkDrawMeshTasksIndirectCommandEXT)); + + trace_intel_begin_draw_mesh_indirect(&cmd_buffer->trace); + + if (execute_indirect_draw_supported(cmd_buffer)) { + genX(cmd_buffer_emit_execute_indirect_draws)( + cmd_buffer, indirect_data_addr, stride, + ANV_NULL_ADDRESS /* count_addr */, pInfo->drawCount, + VK_CMD_DRAW_MESH_TASKS_INDIRECT_EXT); + + trace_intel_end_draw_mesh_indirect(&cmd_buffer->trace, pInfo->drawCount); + return; + } + + cmd_buffer_flush_gfx(cmd_buffer); + + if (cmd_state->conditional_render_enabled) + genX(cmd_emit_conditional_render_predicate)(cmd_buffer); + + bool uses_drawid = (task_prog_data && task_prog_data->uses_drawid) || + mesh_prog_data->uses_drawid; + struct mi_builder b; + mi_builder_init(&b, cmd_buffer->device->info, &cmd_buffer->batch); + + uint64_t offset = 0; + for (uint32_t i = 0; i < pInfo->drawCount; i++) { + struct anv_address draw = anv_address_add(indirect_data_addr, offset); + + mesh_load_indirect_parameters_3dmesh_3d(cmd_buffer, &b, draw, uses_drawid, i); + + emit_indirect_3dmesh_3d(&cmd_buffer->batch, + cmd_state->conditional_render_enabled, uses_drawid); + + offset += stride; + } + + trace_intel_end_draw_mesh_indirect(&cmd_buffer->trace, pInfo->drawCount); +} + void genX(CmdDrawMeshTasksIndirectCountEXT)( VkCommandBuffer commandBuffer, @@ -2746,4 +3169,70 @@ genX(CmdDrawMeshTasksIndirectCountEXT)( anv_address_utrace(count_addr)); } +void genX(CmdDrawMeshTasksIndirectCount2EXT)( + VkCommandBuffer commandBuffer, + const VkDrawIndirectCount2InfoKHR* pInfo) +{ + ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer); + struct anv_cmd_graphics_state *gfx = &cmd_buffer->state.gfx; + const struct brw_task_prog_data *task_prog_data = get_gfx_task_prog_data(gfx); + const struct brw_mesh_prog_data *mesh_prog_data = get_gfx_mesh_prog_data(gfx); + + if (anv_batch_has_error(&cmd_buffer->batch)) + return; + + anv_measure_snapshot(cmd_buffer, + INTEL_SNAPSHOT_DRAW, + "draw mesh indirect count", 0); + + trace_intel_begin_draw_mesh_indirect_count(&cmd_buffer->trace); + + struct anv_address indirect_data_addr = + anv_address_from_u64(pInfo->addressRange.address); + uint64_t 
stride = + MAX2(pInfo->addressRange.stride, sizeof(VkDrawMeshTasksIndirectCommandEXT)); + struct anv_address count_addr = + anv_address_from_u64(pInfo->countAddressRange.address); + + + if (execute_indirect_draw_supported(cmd_buffer)) { + genX(cmd_buffer_emit_execute_indirect_draws)( + cmd_buffer, indirect_data_addr, stride, + count_addr, pInfo->maxDrawCount, + VK_CMD_DRAW_MESH_TASKS_INDIRECT_COUNT_EXT); + + trace_intel_end_draw_mesh_indirect(&cmd_buffer->trace, pInfo->maxDrawCount); + return; + } + + cmd_buffer_flush_gfx(cmd_buffer); + + bool uses_drawid = (task_prog_data && task_prog_data->uses_drawid) || + mesh_prog_data->uses_drawid; + + struct mi_builder b; + mi_builder_init(&b, cmd_buffer->device->info, &cmd_buffer->batch); + const uint32_t mocs = anv_mocs_for_address(cmd_buffer->device, &count_addr); + mi_builder_set_mocs(&b, mocs); + + struct mi_value max = + prepare_for_draw_count_predicate(cmd_buffer, &b, count_addr); + + uint64_t offset = 0; + for (uint32_t i = 0; i < pInfo->maxDrawCount; i++) { + struct anv_address draw = anv_address_add(indirect_data_addr, offset); + + emit_draw_count_predicate_cond(cmd_buffer, &b, i, max); + + mesh_load_indirect_parameters_3dmesh_3d(cmd_buffer, &b, draw, uses_drawid, i); + + emit_indirect_3dmesh_3d(&cmd_buffer->batch, true, uses_drawid); + + offset += stride; + } + + trace_intel_end_draw_mesh_indirect_count(&cmd_buffer->trace, + anv_address_utrace(count_addr)); +} + #endif /* GFX_VERx10 >= 125 */ diff --git a/src/intel/vulkan/genX_query.c b/src/intel/vulkan/genX_query.c index aca692e8881..c39a2f8b305 100644 --- a/src/intel/vulkan/genX_query.c +++ b/src/intel/vulkan/genX_query.c @@ -2062,6 +2062,40 @@ void genX(CmdCopyQueryPoolResults)( } } +void genX(CmdCopyQueryPoolResultsToMemoryKHR)( + VkCommandBuffer commandBuffer, + VkQueryPool queryPool, + uint32_t firstQuery, + uint32_t queryCount, + const VkStridedDeviceAddressRangeKHR* pDstRange, + VkAddressCommandFlagsKHR dstFlags, + VkQueryResultFlags queryResultFlags) +{ + ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer); + ANV_FROM_HANDLE(anv_query_pool, pool, queryPool); + struct anv_device *device = cmd_buffer->device; + struct anv_physical_device *pdevice = device->physical; + + struct anv_address dst_addr = + anv_address_from_strided_range_flags(*pDstRange, dstFlags); + + if (queryCount > pdevice->instance->query_copy_with_shader_threshold) { + copy_query_results_with_shader(cmd_buffer, pool, + dst_addr, + pDstRange->stride, + firstQuery, + queryCount, + queryResultFlags); + } else { + copy_query_results_with_cs(cmd_buffer, pool, + dst_addr, + pDstRange->stride, + firstQuery, + queryCount, + queryResultFlags); + } +} + #if GFX_VERx10 >= 125 && ANV_SUPPORT_RT #include "bvh/anv_bvh.h"
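
Note (illustration, not part of the patch): a minimal usage sketch of the new address-based transfer entry points wired up above, inferred solely from the signatures in this diff — VkDeviceAddressRangeKHR as an address/size pair, VkAddressCommandFlagsKHR carrying VK_ADDRESS_COMMAND_PROTECTED_BIT_KHR, and vkCmdFillMemoryKHR / vkCmdUpdateMemoryKHR mirroring the anv_CmdFillMemoryKHR / anv_CmdUpdateMemoryKHR implementations. The provisional VK_KHR_device_address_commands headers may differ, so treat every struct field and prototype below as an assumption.

#include <vulkan/vulkan.h>   /* assumes a header exposing the provisional KHR types */

/* Record a fill and an inline update against a buffer identified only by its
 * device address; no VkBuffer handle is needed at record time. */
static void record_addr_commands(VkDevice device, VkCommandBuffer cmd, VkBuffer buf)
{
   const VkBufferDeviceAddressInfo addr_info = {
      .sType = VK_STRUCTURE_TYPE_BUFFER_DEVICE_ADDRESS_INFO,
      .buffer = buf,   /* created with VK_BUFFER_USAGE_SHADER_DEVICE_ADDRESS_BIT */
   };

   /* Field layout of VkDeviceAddressRangeKHR (address + size) is assumed. */
   const VkDeviceAddressRangeKHR range = {
      .address = vkGetBufferDeviceAddress(device, &addr_info),
      .size    = 256,   /* fill size must be a multiple of 4 */
   };

   /* Passing 0 flags; VK_ADDRESS_COMMAND_PROTECTED_BIT_KHR would select the
    * protected MOCS in the paths added by this patch. */
   vkCmdFillMemoryKHR(cmd, &range, 0, 0xdeadbeef);

   const uint32_t payload[4] = { 1, 2, 3, 4 };
   vkCmdUpdateMemoryKHR(cmd, &range, 0, sizeof(payload), payload);
}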