diff --git a/docs/features.txt b/docs/features.txt index 1b5dc753009..b9976c09a2b 100644 --- a/docs/features.txt +++ b/docs/features.txt @@ -483,7 +483,7 @@ Vulkan 1.3 -- all DONE: anv, radv, lvp VK_EXT_4444_formats DONE (anv, lvp, radv, tu, v3dv) VK_EXT_extended_dynamic_state DONE (anv, lvp, radv, tu) VK_EXT_extended_dynamic_state2 DONE (anv, lvp, radv, tu) - VK_EXT_inline_uniform_block DONE (anv, radv) + VK_EXT_inline_uniform_block DONE (anv, radv, v3dv) VK_EXT_pipeline_creation_cache_control DONE (anv, radv, v3dv) VK_EXT_pipeline_creation_feedback DONE (anv, radv, v3dv) VK_EXT_private_data DONE (anv, lvp, radv, tu, v3dv) diff --git a/src/broadcom/common/v3d_limits.h b/src/broadcom/common/v3d_limits.h index 465802c5123..38993fb60bf 100644 --- a/src/broadcom/common/v3d_limits.h +++ b/src/broadcom/common/v3d_limits.h @@ -67,4 +67,7 @@ /* Sub-pixel precission bits in the rasterizer */ #define V3D_COORD_SHIFT 6 +/* Size of a cache line */ +#define V3D_NON_COHERENT_ATOM_SIZE 256 + #endif /* V3D_LIMITS_H */ diff --git a/src/broadcom/compiler/nir_to_vir.c b/src/broadcom/compiler/nir_to_vir.c index 1afe0209bde..af8201ea8b8 100644 --- a/src/broadcom/compiler/nir_to_vir.c +++ b/src/broadcom/compiler/nir_to_vir.c @@ -2638,41 +2638,54 @@ vir_emit_tlb_color_read(struct v3d_compile *c, nir_intrinsic_instr *instr) vir_MOV(c, color_reads_for_sample[component])); } +static bool +try_emit_uniform(struct v3d_compile *c, + int offset, + int num_components, + nir_dest *dest, + enum quniform_contents contents) +{ + /* Even though ldunif is strictly 32-bit we can still use it + * to load scalar 8-bit/16-bit uniforms so long as their offset + * is 32-bit aligned. In this case, ldunif would still load + * 32-bit into the destination with the 8-bit/16-bit uniform + * data in the LSB and garbage in the MSB, but that is fine + * because we should only be accessing the valid bits of the + * destination. 
+ * + * FIXME: if in the future we improve our register allocator to + * pack 2 16-bit variables in the MSB and LSB of the same + * register then this optimization would not be valid as is, + * since the load clobbers the MSB. + */ + if (offset % 4 != 0) + return false; + + /* We need dwords */ + offset = offset / 4; + + for (int i = 0; i < num_components; i++) { + ntq_store_dest(c, dest, i, + vir_uniform(c, contents, offset + i)); + } + + return true; +} + static void ntq_emit_load_uniform(struct v3d_compile *c, nir_intrinsic_instr *instr) { + /* We scalarize general TMU access for anything that is not 32-bit. */ + assert(nir_dest_bit_size(instr->dest) == 32 || + instr->num_components == 1); + + /* Try to emit ldunif if possible, otherwise fallback to general TMU */ if (nir_src_is_const(instr->src[0])) { int offset = (nir_intrinsic_base(instr) + nir_src_as_uint(instr->src[0])); - /* Even though ldunif is strictly 32-bit we can still use it - * to load scalar 8-bit/16-bit uniforms so long as their offset - * is * 32-bit aligned. In this case, ldunif would still load - * 32-bit into the destination with the 8-bit/16-bit uniform - * data in the LSB and garbage in the MSB, but that is fine - * because we should only be accessing the valid bits of the - * destination. - * - * FIXME: if in the future we improve our register allocator to - * pack 2 16-bit variables in the MSB and LSB of the same - * register then this optimization would not be valid as is, - * since the load clobbers the MSB. - */ - if (offset % 4 == 0) { - /* We need dwords */ - offset = offset / 4; - - /* We scalarize general TMU access for anything that - * is not 32-bit. 
- */ - assert(nir_dest_bit_size(instr->dest) == 32 || - instr->num_components == 1); - - for (int i = 0; i < instr->num_components; i++) { - ntq_store_dest(c, &instr->dest, i, - vir_uniform(c, QUNIFORM_UNIFORM, - offset + i)); - } + if (try_emit_uniform(c, offset, instr->num_components, + &instr->dest, QUNIFORM_UNIFORM)) { return; } } @@ -2680,6 +2693,41 @@ ntq_emit_load_uniform(struct v3d_compile *c, nir_intrinsic_instr *instr) ntq_emit_tmu_general(c, instr, false); } +static bool +ntq_emit_inline_ubo_load(struct v3d_compile *c, nir_intrinsic_instr *instr) +{ + if (c->compiler->max_inline_uniform_buffers <= 0) + return false; + + /* On Vulkan we use indices 1..MAX_INLINE_UNIFORM_BUFFERS for inline + * uniform buffers which we want to handle more like push constants + * than regular UBO. OpenGL doesn't implement this feature. + */ + assert(c->key->environment == V3D_ENVIRONMENT_VULKAN); + uint32_t index = nir_src_as_uint(instr->src[0]); + if (index == 0 || index > c->compiler->max_inline_uniform_buffers) + return false; + + /* We scalarize general TMU access for anything that is not 32-bit */ + assert(nir_dest_bit_size(instr->dest) == 32 || + instr->num_components == 1); + + if (nir_src_is_const(instr->src[1])) { + /* Index 0 is reserved for push constants */ + assert(index > 0); + uint32_t inline_index = index - 1; + int offset = nir_src_as_uint(instr->src[1]); + if (try_emit_uniform(c, offset, instr->num_components, + &instr->dest, + QUNIFORM_INLINE_UBO_0 + inline_index)) { + return true; + } + } + + /* Fallback to regular UBO load */ + return false; +} + static void ntq_emit_load_input(struct v3d_compile *c, nir_intrinsic_instr *instr) { @@ -3199,6 +3247,9 @@ ntq_emit_intrinsic(struct v3d_compile *c, nir_intrinsic_instr *instr) break; case nir_intrinsic_load_ubo: + if (ntq_emit_inline_ubo_load(c, instr)) + break; + FALLTHROUGH; case nir_intrinsic_load_ssbo: if (!ntq_emit_load_unifa(c, instr)) { ntq_emit_tmu_general(c, instr, false); diff --git 
a/src/broadcom/compiler/v3d_compiler.h b/src/broadcom/compiler/v3d_compiler.h index c978995237c..db4a4c41f8b 100644 --- a/src/broadcom/compiler/v3d_compiler.h +++ b/src/broadcom/compiler/v3d_compiler.h @@ -338,6 +338,14 @@ enum quniform_contents { * Current value of gl_ViewIndex for Multiview rendering. */ QUNIFORM_VIEW_INDEX, + + /** + * Inline uniform buffers + */ + QUNIFORM_INLINE_UBO_0, + QUNIFORM_INLINE_UBO_1, + QUNIFORM_INLINE_UBO_2, + QUNIFORM_INLINE_UBO_3, }; static inline uint32_t v3d_unit_data_create(uint32_t unit, uint32_t value) @@ -574,6 +582,7 @@ enum v3d_compilation_result { */ struct v3d_compiler { const struct v3d_device_info *devinfo; + uint32_t max_inline_uniform_buffers; struct ra_regs *regs; struct ra_class *reg_class_any[3]; struct ra_class *reg_class_r5[3]; @@ -1045,7 +1054,8 @@ vir_has_uniform(struct qinst *inst) return inst->uniform != ~0; } -const struct v3d_compiler *v3d_compiler_init(const struct v3d_device_info *devinfo); +const struct v3d_compiler *v3d_compiler_init(const struct v3d_device_info *devinfo, + uint32_t max_inline_uniform_buffers); void v3d_compiler_free(const struct v3d_compiler *compiler); void v3d_optimize_nir(struct v3d_compile *c, struct nir_shader *s); diff --git a/src/broadcom/compiler/vir.c b/src/broadcom/compiler/vir.c index 4992a7f1509..9d4fc587920 100644 --- a/src/broadcom/compiler/vir.c +++ b/src/broadcom/compiler/vir.c @@ -517,13 +517,15 @@ vir_link_blocks(struct qblock *predecessor, struct qblock *successor) } const struct v3d_compiler * -v3d_compiler_init(const struct v3d_device_info *devinfo) +v3d_compiler_init(const struct v3d_device_info *devinfo, + uint32_t max_inline_uniform_buffers) { struct v3d_compiler *compiler = rzalloc(NULL, struct v3d_compiler); if (!compiler) return NULL; compiler->devinfo = devinfo; + compiler->max_inline_uniform_buffers = max_inline_uniform_buffers; if (!vir_init_reg_sets(compiler)) { ralloc_free(compiler); diff --git a/src/broadcom/vulkan/v3dv_descriptor_set.c 
b/src/broadcom/vulkan/v3dv_descriptor_set.c index 1c5b78559cc..72e7eb810dd 100644 --- a/src/broadcom/vulkan/v3dv_descriptor_set.c +++ b/src/broadcom/vulkan/v3dv_descriptor_set.c @@ -31,16 +31,23 @@ * binding layout, and array_index, it returns the map region assigned to it * from the descriptor pool bo. */ -static void* +static void * descriptor_bo_map(struct v3dv_device *device, struct v3dv_descriptor_set *set, const struct v3dv_descriptor_set_binding_layout *binding_layout, uint32_t array_index) { - assert(v3dv_X(device, descriptor_bo_size)(binding_layout->type) > 0); + /* Inline uniform blocks use BO memory to store UBO contents, not + * descriptor data, so their descriptor BO size is 0 even though they + * do use BO memory. + */ + uint32_t bo_size = v3dv_X(device, descriptor_bo_size)(binding_layout->type); + assert(bo_size > 0 || + binding_layout->type == VK_DESCRIPTOR_TYPE_INLINE_UNIFORM_BLOCK_EXT); + return set->pool->bo->map + set->base_offset + binding_layout->descriptor_offset + - array_index * v3dv_X(device, descriptor_bo_size)(binding_layout->type); + array_index * bo_size; } static bool @@ -102,7 +109,7 @@ v3dv_descriptor_map_get_descriptor(struct v3dv_descriptor_state *descriptor_stat * It also returns the descriptor type, so the caller could do extra * validation or adding extra offsets if the bo contains more that one field. 
*/ -static struct v3dv_cl_reloc +struct v3dv_cl_reloc v3dv_descriptor_map_get_descriptor_bo(struct v3dv_device *device, struct v3dv_descriptor_state *descriptor_state, struct v3dv_descriptor_map *map, @@ -125,8 +132,10 @@ v3dv_descriptor_map_get_descriptor_bo(struct v3dv_device *device, const struct v3dv_descriptor_set_binding_layout *binding_layout = &set->layout->binding[binding_number]; - assert(v3dv_X(device, descriptor_bo_size)(binding_layout->type) > 0); - *out_type = binding_layout->type; + assert(binding_layout->type == VK_DESCRIPTOR_TYPE_INLINE_UNIFORM_BLOCK_EXT || + v3dv_X(device, descriptor_bo_size)(binding_layout->type) > 0); + if (out_type) + *out_type = binding_layout->type; uint32_t array_index = map->array_index[index]; assert(array_index < binding_layout->array_size); @@ -364,6 +373,10 @@ v3dv_CreateDescriptorPool(VkDevice _device, uint32_t bo_size = 0; uint32_t descriptor_count = 0; + const VkDescriptorPoolInlineUniformBlockCreateInfo *inline_info = + vk_find_struct_const(pCreateInfo->pNext, + DESCRIPTOR_POOL_INLINE_UNIFORM_BLOCK_CREATE_INFO); + assert(pCreateInfo->poolSizeCount > 0); for (unsigned i = 0; i < pCreateInfo->poolSizeCount; ++i) { /* Verify supported descriptor type */ @@ -379,6 +392,7 @@ v3dv_CreateDescriptorPool(VkDevice _device, case VK_DESCRIPTOR_TYPE_STORAGE_IMAGE: case VK_DESCRIPTOR_TYPE_UNIFORM_TEXEL_BUFFER: case VK_DESCRIPTOR_TYPE_STORAGE_TEXEL_BUFFER: + case VK_DESCRIPTOR_TYPE_INLINE_UNIFORM_BLOCK_EXT: break; default: unreachable("Unimplemented descriptor type"); @@ -386,9 +400,28 @@ v3dv_CreateDescriptorPool(VkDevice _device, } assert(pCreateInfo->pPoolSizes[i].descriptorCount > 0); - descriptor_count += pCreateInfo->pPoolSizes[i].descriptorCount; - bo_size += v3dv_X(device, descriptor_bo_size)(pCreateInfo->pPoolSizes[i].type) * - pCreateInfo->pPoolSizes[i].descriptorCount; + if (pCreateInfo->pPoolSizes[i].type == + VK_DESCRIPTOR_TYPE_INLINE_UNIFORM_BLOCK_EXT) { + /* Inline uniform blocks are specified to use the descriptor 
array + * size as the size in bytes of the block. + */ + assert(inline_info); + descriptor_count++; + bo_size += pCreateInfo->pPoolSizes[i].descriptorCount; + } else { + descriptor_count += pCreateInfo->pPoolSizes[i].descriptorCount; + bo_size += v3dv_X(device, descriptor_bo_size)(pCreateInfo->pPoolSizes[i].type) * + pCreateInfo->pPoolSizes[i].descriptorCount; + } + } + + /* We align all our buffers to V3D_NON_COHERENT_ATOM_SIZE, make sure we + * allocate enough memory to honor that requirement for all our inline + * buffers too. + */ + if (inline_info) { + bo_size += V3D_NON_COHERENT_ATOM_SIZE * + inline_info->maxInlineUniformBlockBindings; } if (!(pCreateInfo->flags & VK_DESCRIPTOR_POOL_CREATE_FREE_DESCRIPTOR_SET_BIT)) { @@ -599,6 +632,7 @@ v3dv_CreateDescriptorSetLayout(VkDevice _device, case VK_DESCRIPTOR_TYPE_STORAGE_IMAGE: case VK_DESCRIPTOR_TYPE_UNIFORM_TEXEL_BUFFER: case VK_DESCRIPTOR_TYPE_STORAGE_TEXEL_BUFFER: + case VK_DESCRIPTOR_TYPE_INLINE_UNIFORM_BLOCK_EXT: /* Nothing here, just to keep the descriptor type filtering below */ break; default: @@ -624,16 +658,36 @@ v3dv_CreateDescriptorSetLayout(VkDevice _device, samplers_offset += sizeof(struct v3dv_sampler) * binding->descriptorCount; } - descriptor_count += binding->descriptorCount; - dynamic_offset_count += binding->descriptorCount * - set_layout->binding[binding_number].dynamic_offset_count; - set_layout->shader_stages |= binding->stageFlags; - set_layout->binding[binding_number].descriptor_offset = set_layout->bo_size; - set_layout->bo_size += - v3dv_X(device, descriptor_bo_size)(set_layout->binding[binding_number].type) * - binding->descriptorCount; + if (binding->descriptorType != VK_DESCRIPTOR_TYPE_INLINE_UNIFORM_BLOCK_EXT) { + dynamic_offset_count += binding->descriptorCount * + set_layout->binding[binding_number].dynamic_offset_count; + + descriptor_count += binding->descriptorCount; + + set_layout->binding[binding_number].descriptor_offset = + set_layout->bo_size; + set_layout->bo_size += + 
v3dv_X(device, descriptor_bo_size)(set_layout->binding[binding_number].type) * + binding->descriptorCount; + } else { + /* We align all our buffers, inline buffers too. We made sure to take + * this into account when calculating total BO size requirements at pool + * creation time. + */ + set_layout->bo_size = align(set_layout->bo_size, + V3D_NON_COHERENT_ATOM_SIZE); + + set_layout->binding[binding_number].descriptor_offset = + set_layout->bo_size; + + /* Inline uniform blocks are not arrayed, instead descriptorCount + * specifies the size of the buffer in bytes. + */ + set_layout->bo_size += binding->descriptorCount; + descriptor_count++; + } } free(bindings); @@ -931,6 +985,31 @@ write_buffer_view_descriptor(struct v3dv_device *device, sizeof(bview->texture_shader_state)); } +static void +write_inline_uniform_descriptor(struct v3dv_device *device, + struct v3dv_descriptor *descriptor, + struct v3dv_descriptor_set *set, + const struct v3dv_descriptor_set_binding_layout *binding_layout, + const void *data, + size_t offset, + size_t size) +{ + assert(binding_layout->type == VK_DESCRIPTOR_TYPE_INLINE_UNIFORM_BLOCK_EXT); + descriptor->type = VK_DESCRIPTOR_TYPE_INLINE_UNIFORM_BLOCK_EXT; + descriptor->buffer = NULL; + + void *desc_map = descriptor_bo_map(device, set, binding_layout, 0); + memcpy(desc_map + offset, data, size); + + /* Inline uniform buffers allocate BO space in the pool for all inline + * buffers it may allocate and then this space is assigned to individual + * descriptors when they are written, so we define the range of an inline + * buffer as the largest range of data that the client has written to it. 
+ */ + descriptor->offset = 0; + descriptor->range = MAX2(descriptor->range, offset + size); +} + VKAPI_ATTR void VKAPI_CALL v3dv_UpdateDescriptorSets(VkDevice _device, uint32_t descriptorWriteCount, @@ -949,9 +1028,20 @@ v3dv_UpdateDescriptorSets(VkDevice _device, struct v3dv_descriptor *descriptor = set->descriptors; descriptor += binding_layout->descriptor_index; - descriptor += writeset->dstArrayElement; - for (uint32_t j = 0; j < writeset->descriptorCount; ++j) { + /* Inline uniform blocks are not arrayed, instead they use dstArrayElement + * to specify the byte offset of the uniform update and descriptorCount + * to specify the size (in bytes) of the update. + */ + uint32_t descriptor_count; + if (writeset->descriptorType != VK_DESCRIPTOR_TYPE_INLINE_UNIFORM_BLOCK_EXT) { + descriptor += writeset->dstArrayElement; + descriptor_count = writeset->descriptorCount; + } else { + descriptor_count = 1; + } + + for (uint32_t j = 0; j < descriptor_count; ++j) { switch(writeset->descriptorType) { case VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER_DYNAMIC: @@ -1006,6 +1096,18 @@ v3dv_UpdateDescriptorSets(VkDevice _device, writeset->dstArrayElement + j); break; } + case VK_DESCRIPTOR_TYPE_INLINE_UNIFORM_BLOCK_EXT: { + const VkWriteDescriptorSetInlineUniformBlock *inline_write = + vk_find_struct_const(writeset->pNext, + WRITE_DESCRIPTOR_SET_INLINE_UNIFORM_BLOCK); + assert(inline_write->dataSize == writeset->descriptorCount); + write_inline_uniform_descriptor(device, descriptor, set, + binding_layout, + inline_write->pData, + writeset->dstArrayElement, /* offset */ + inline_write->dataSize); + break; + } default: unreachable("unimplemented descriptor type"); break; @@ -1032,9 +1134,25 @@ v3dv_UpdateDescriptorSets(VkDevice _device, struct v3dv_descriptor *dst_descriptor = dst_set->descriptors; src_descriptor += src_binding_layout->descriptor_index; - src_descriptor += copyset->srcArrayElement; - dst_descriptor += dst_binding_layout->descriptor_index; + + if (src_binding_layout->type == 
VK_DESCRIPTOR_TYPE_INLINE_UNIFORM_BLOCK) { + /* {src,dst}ArrayElement specifies src/dst start offset and + * descriptorCount specifies size (in bytes) to copy. + */ + const void *src_data = src_set->pool->bo->map + + src_set->base_offset + + src_binding_layout->descriptor_offset + + copyset->srcArrayElement; + write_inline_uniform_descriptor(device, dst_descriptor, dst_set, + dst_binding_layout, + src_data, + copyset->dstArrayElement, + copyset->descriptorCount); + continue; + } + + src_descriptor += copyset->srcArrayElement; dst_descriptor += copyset->dstArrayElement; for (uint32_t j = 0; j < copyset->descriptorCount; j++) { @@ -1179,8 +1297,7 @@ v3dv_UpdateDescriptorSetWithTemplate( struct v3dv_descriptor *descriptor = set->descriptors + - binding_layout->descriptor_index + - entry->array_element; + binding_layout->descriptor_index; switch (entry->type) { case VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER: @@ -1190,7 +1307,8 @@ v3dv_UpdateDescriptorSetWithTemplate( for (uint32_t j = 0; j < entry->array_count; j++) { const VkDescriptorBufferInfo *info = pData + entry->offset + j * entry->stride; - write_buffer_descriptor(descriptor + j, entry->type, info); + write_buffer_descriptor(descriptor + entry->array_element + j, + entry->type, info); } break; @@ -1204,9 +1322,9 @@ v3dv_UpdateDescriptorSetWithTemplate( pData + entry->offset + j * entry->stride; V3DV_FROM_HANDLE(v3dv_image_view, iview, info->imageView); V3DV_FROM_HANDLE(v3dv_sampler, sampler, info->sampler); - write_image_descriptor(device, descriptor + j, entry->type, - set, binding_layout, iview, sampler, - entry->array_element + j); + write_image_descriptor(device, descriptor + entry->array_element + j, + entry->type, set, binding_layout, iview, + sampler, entry->array_element + j); } break; @@ -1216,12 +1334,22 @@ v3dv_UpdateDescriptorSetWithTemplate( const VkBufferView *_bview = pData + entry->offset + j * entry->stride; V3DV_FROM_HANDLE(v3dv_buffer_view, bview, *_bview); - write_buffer_view_descriptor(device, 
descriptor + j, entry->type, - set, binding_layout, bview, + write_buffer_view_descriptor(device, + descriptor + entry->array_element + j, + entry->type, set, binding_layout, bview, entry->array_element + j); } break; + case VK_DESCRIPTOR_TYPE_INLINE_UNIFORM_BLOCK_EXT: { + write_inline_uniform_descriptor(device, descriptor, set, + binding_layout, + pData + entry->offset, + entry->array_element, /* offset */ + entry->array_count); /* size */ + break; + } + default: unreachable("Unsupported descriptor type"); } diff --git a/src/broadcom/vulkan/v3dv_device.c b/src/broadcom/vulkan/v3dv_device.c index b62703bc650..4205b718d3c 100644 --- a/src/broadcom/vulkan/v3dv_device.c +++ b/src/broadcom/vulkan/v3dv_device.c @@ -153,6 +153,7 @@ get_device_extensions(const struct v3dv_physical_device *device, .EXT_4444_formats = true, .EXT_color_write_enable = true, .EXT_custom_border_color = true, + .EXT_inline_uniform_block = true, .EXT_external_memory_dma_buf = true, .EXT_host_query_reset = true, .EXT_image_drm_format_modifier = true, @@ -812,7 +813,8 @@ physical_device_init(struct v3dv_physical_device *device, if (result != VK_SUCCESS) goto fail; - device->compiler = v3d_compiler_init(&device->devinfo); + device->compiler = v3d_compiler_init(&device->devinfo, + MAX_INLINE_UNIFORM_BUFFERS); device->next_program_id = 0; ASSERTED int len = @@ -1089,6 +1091,20 @@ v3dv_GetPhysicalDeviceFeatures2(VkPhysicalDevice physicalDevice, { v3dv_GetPhysicalDeviceFeatures(physicalDevice, &pFeatures->features); + VkPhysicalDeviceVulkan13Features vk13 = { + .inlineUniformBlock = true, + /* Inline buffers work like push constants, so after they are bound + * some of their contents may be copied into the uniform stream as soon + * as the next draw/dispatch is recorded in the command buffer. 
This means + * that if the client updates the buffer contents after binding it to + * a command buffer, the next queue submit of that command buffer may + * not use the latest update to the buffer contents, but the data that + * was present in the buffer at the time it was bound to the command + * buffer. + */ + .descriptorBindingInlineUniformBlockUpdateAfterBind = false, + }; + VkPhysicalDeviceVulkan12Features vk12 = { .hostQueryReset = true, .uniformAndStorageBuffer8BitAccess = true, @@ -1173,6 +1189,15 @@ v3dv_GetPhysicalDeviceFeatures2(VkPhysicalDevice physicalDevice, break; } + case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_INLINE_UNIFORM_BLOCK_FEATURES_EXT: { + VkPhysicalDeviceInlineUniformBlockFeaturesEXT *features = + (VkPhysicalDeviceInlineUniformBlockFeaturesEXT *)ext; + features->inlineUniformBlock = vk13.inlineUniformBlock; + features->descriptorBindingInlineUniformBlockUpdateAfterBind = + vk13.descriptorBindingInlineUniformBlockUpdateAfterBind; + break; + } + case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_COLOR_WRITE_ENABLE_FEATURES_EXT: { VkPhysicalDeviceColorWriteEnableFeaturesEXT *features = (void *) ext; features->colorWriteEnable = true; @@ -1385,7 +1410,7 @@ v3dv_GetPhysicalDeviceProperties(VkPhysicalDevice physicalDevice, .maxPushConstantsSize = MAX_PUSH_CONSTANTS_SIZE, .maxMemoryAllocationCount = mem_size / page_size, .maxSamplerAllocationCount = 64 * 1024, - .bufferImageGranularity = 256, /* A cache line */ + .bufferImageGranularity = V3D_NON_COHERENT_ATOM_SIZE, .sparseAddressSpaceSize = 0, .maxBoundDescriptorSets = MAX_SETS, .maxPerStageDescriptorSamplers = V3D_MAX_TEXTURE_SAMPLERS, @@ -1499,7 +1524,7 @@ v3dv_GetPhysicalDeviceProperties(VkPhysicalDevice physicalDevice, .standardSampleLocations = false, .optimalBufferCopyOffsetAlignment = 32, .optimalBufferCopyRowPitchAlignment = 32, - .nonCoherentAtomSize = 256, + .nonCoherentAtomSize = V3D_NON_COHERENT_ATOM_SIZE, }; *pProperties = (VkPhysicalDeviceProperties) { @@ -1575,6 +1600,18 @@ 
v3dv_GetPhysicalDeviceProperties2(VkPhysicalDevice physicalDevice, }; break; } + case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_INLINE_UNIFORM_BLOCK_PROPERTIES_EXT: { + VkPhysicalDeviceInlineUniformBlockProperties *props = + (VkPhysicalDeviceInlineUniformBlockProperties *)ext; + props->maxInlineUniformBlockSize = 4096; + props->maxPerStageDescriptorInlineUniformBlocks = + MAX_INLINE_UNIFORM_BUFFERS; + props->maxDescriptorSetInlineUniformBlocks = + MAX_INLINE_UNIFORM_BUFFERS; + props->maxPerStageDescriptorUpdateAfterBindInlineUniformBlocks = 0; + props->maxDescriptorSetUpdateAfterBindInlineUniformBlocks = 0; + break; + } case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_PROVOKING_VERTEX_PROPERTIES_EXT: { VkPhysicalDeviceProvokingVertexPropertiesEXT *props = (VkPhysicalDeviceProvokingVertexPropertiesEXT *)ext; @@ -2516,7 +2553,7 @@ v3dv_CreateBuffer(VkDevice _device, buffer->size = pCreateInfo->size; buffer->usage = pCreateInfo->usage; - buffer->alignment = 256; /* nonCoherentAtomSize */ + buffer->alignment = V3D_NON_COHERENT_ATOM_SIZE; /* Limit allocations to 32-bit */ const VkDeviceSize aligned_size = align64(buffer->size, buffer->alignment); diff --git a/src/broadcom/vulkan/v3dv_limits.h b/src/broadcom/vulkan/v3dv_limits.h index 7e67d124b7b..52bf2dd5ee1 100644 --- a/src/broadcom/vulkan/v3dv_limits.h +++ b/src/broadcom/vulkan/v3dv_limits.h @@ -44,6 +44,7 @@ #define MAX_INPUT_ATTACHMENTS 4 #define MAX_UNIFORM_BUFFERS 12 +#define MAX_INLINE_UNIFORM_BUFFERS 4 #define MAX_STORAGE_BUFFERS 8 #define MAX_DYNAMIC_UNIFORM_BUFFERS 8 diff --git a/src/broadcom/vulkan/v3dv_pipeline.c b/src/broadcom/vulkan/v3dv_pipeline.c index acfc696dbfd..17c8e332384 100644 --- a/src/broadcom/vulkan/v3dv_pipeline.c +++ b/src/broadcom/vulkan/v3dv_pipeline.c @@ -465,17 +465,19 @@ descriptor_map_add(struct v3dv_descriptor_map *map, int binding, int array_index, int array_size, + int start_index, uint8_t return_size) { assert(array_index < array_size); assert(return_size == 16 || return_size == 32); - unsigned index 
= 0; - for (unsigned i = 0; i < map->num_desc; i++) { - if (set == map->set[i] && - binding == map->binding[i] && - array_index == map->array_index[i]) { - assert(array_size == map->array_size[i]); + unsigned index = start_index; + for (; index < map->num_desc; index++) { + if (map->used[index] && + set == map->set[index] && + binding == map->binding[index] && + array_index == map->array_index[index]) { + assert(array_size == map->array_size[index]); if (return_size != map->return_size[index]) { /* It the return_size is different it means that the same sampler * was used for operations with different precision @@ -485,18 +487,21 @@ descriptor_map_add(struct v3dv_descriptor_map *map, map->return_size[index] = 32; } return index; + } else if (!map->used[index]) { + break; } - index++; } - assert(index == map->num_desc); + assert(index < DESCRIPTOR_MAP_SIZE); + assert(!map->used[index]); - map->set[map->num_desc] = set; - map->binding[map->num_desc] = binding; - map->array_index[map->num_desc] = array_index; - map->array_size[map->num_desc] = array_size; - map->return_size[map->num_desc] = return_size; - map->num_desc++; + map->used[index] = true; + map->set[index] = set; + map->binding[index] = binding; + map->array_index[index] = array_index; + map->array_size[index] = array_size; + map->return_size[index] = return_size; + map->num_desc = MAX2(map->num_desc, index + 1); return index; } @@ -536,8 +541,11 @@ pipeline_get_descriptor_map(struct v3dv_pipeline *pipeline, &pipeline->shared_data->maps[broadcom_stage]->sampler_map : &pipeline->shared_data->maps[broadcom_stage]->texture_map; case VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER: + case VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER_DYNAMIC: + case VK_DESCRIPTOR_TYPE_INLINE_UNIFORM_BLOCK_EXT: return &pipeline->shared_data->maps[broadcom_stage]->ubo_map; case VK_DESCRIPTOR_TYPE_STORAGE_BUFFER: + case VK_DESCRIPTOR_TYPE_STORAGE_BUFFER_DYNAMIC: return &pipeline->shared_data->maps[broadcom_stage]->ssbo_map; default: unreachable("Descriptor 
type unknown or not having a descriptor map"); @@ -563,31 +571,53 @@ lower_vulkan_resource_index(nir_builder *b, struct v3dv_descriptor_set_binding_layout *binding_layout = &set_layout->binding[binding]; unsigned index = 0; - const VkDescriptorType desc_type = nir_intrinsic_desc_type(instr); - switch (desc_type) { + switch (binding_layout->type) { case VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER: - case VK_DESCRIPTOR_TYPE_STORAGE_BUFFER: { + case VK_DESCRIPTOR_TYPE_STORAGE_BUFFER: + case VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER_DYNAMIC: + case VK_DESCRIPTOR_TYPE_STORAGE_BUFFER_DYNAMIC: + case VK_DESCRIPTOR_TYPE_INLINE_UNIFORM_BLOCK_EXT: { struct v3dv_descriptor_map *descriptor_map = - pipeline_get_descriptor_map(pipeline, desc_type, shader->info.stage, false); + pipeline_get_descriptor_map(pipeline, binding_layout->type, + shader->info.stage, false); if (!const_val) unreachable("non-constant vulkan_resource_index array index"); + /* At compile-time we will need to know if we are processing a UBO load + * for an inline or a regular UBO so we can handle inline loads like + * push constants. At the NIR level, however, the inline + * information is gone, so we rely on the index to make this distinction. + * Particularly, we reserve indices 1..MAX_INLINE_UNIFORM_BUFFERS for + * inline buffers. This means that at the descriptor map level + * we store inline buffers at slots 0..MAX_INLINE_UNIFORM_BUFFERS - 1, + * and regular UBOs at indices starting from MAX_INLINE_UNIFORM_BUFFERS. 
+ */ + uint32_t start_index = 0; + if (binding_layout->type == VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER || + binding_layout->type == VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER_DYNAMIC) { + start_index = MAX_INLINE_UNIFORM_BUFFERS; + } + index = descriptor_map_add(descriptor_map, set, binding, const_val->u32, binding_layout->array_size, + start_index, 32 /* return_size: doesn't really apply for this case */); - if (desc_type == VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER) { - /* skip index 0 which is used for push constants */ + /* We always reserve index 0 for push constants */ + if (binding_layout->type == VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER || + binding_layout->type == VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER_DYNAMIC || + binding_layout->type == VK_DESCRIPTOR_TYPE_INLINE_UNIFORM_BLOCK_EXT) { index++; } + break; } default: - unreachable("unsupported desc_type for vulkan_resource_index"); + unreachable("unsupported descriptor type for vulkan_resource_index"); break; } @@ -698,6 +728,7 @@ lower_tex_src_to_offset(nir_builder *b, nir_tex_instr *instr, unsigned src_idx, deref->var->data.binding, array_index, binding_layout->array_size, + 0, return_size); if (is_sampler) @@ -807,6 +838,7 @@ lower_image_deref(nir_builder *b, deref->var->data.binding, array_index, binding_layout->array_size, + 0, 32 /* return_size: doesn't apply for textures */); /* Note: we don't need to do anything here in relation to the precision and @@ -1752,12 +1784,12 @@ pipeline_lower_nir(struct v3dv_pipeline *pipeline, */ UNUSED unsigned index = descriptor_map_add(&pipeline->shared_data->maps[p_stage->stage]->sampler_map, - -1, -1, -1, 0, 16); + -1, -1, -1, 0, 0, 16); assert(index == V3DV_NO_SAMPLER_16BIT_IDX); index = descriptor_map_add(&pipeline->shared_data->maps[p_stage->stage]->sampler_map, - -2, -2, -2, 0, 32); + -2, -2, -2, 0, 0, 32); assert(index == V3DV_NO_SAMPLER_32BIT_IDX); /* Apply the actual pipeline layout to UBOs, SSBOs, and textures */ diff --git a/src/broadcom/vulkan/v3dv_private.h 
b/src/broadcom/vulkan/v3dv_private.h index 3002d21fe80..ff94f1da24a 100644 --- a/src/broadcom/vulkan/v3dv_private.h +++ b/src/broadcom/vulkan/v3dv_private.h @@ -1353,8 +1353,8 @@ struct v3dv_descriptor { struct { struct v3dv_buffer *buffer; - uint32_t offset; - uint32_t range; + size_t offset; + size_t range; }; struct v3dv_buffer_view *buffer_view; @@ -1727,8 +1727,8 @@ struct v3dv_pipeline_layout { * FIXME: one alternative would be to allocate the map as big as you need for * each descriptor type. That would means more individual allocations. */ -#define DESCRIPTOR_MAP_SIZE MAX3(V3D_MAX_TEXTURE_SAMPLERS, \ - MAX_UNIFORM_BUFFERS, \ +#define DESCRIPTOR_MAP_SIZE MAX3(V3D_MAX_TEXTURE_SAMPLERS, \ + MAX_UNIFORM_BUFFERS + MAX_INLINE_UNIFORM_BUFFERS, \ MAX_STORAGE_BUFFERS) @@ -1739,6 +1739,7 @@ struct v3dv_descriptor_map { int binding[DESCRIPTOR_MAP_SIZE]; int array_index[DESCRIPTOR_MAP_SIZE]; int array_size[DESCRIPTOR_MAP_SIZE]; + bool used[DESCRIPTOR_MAP_SIZE]; /* NOTE: the following is only for sampler, but this is the easier place to * put it. 
@@ -2073,6 +2074,14 @@ v3dv_descriptor_map_get_descriptor(struct v3dv_descriptor_state *descriptor_stat uint32_t index, uint32_t *dynamic_offset); +struct v3dv_cl_reloc +v3dv_descriptor_map_get_descriptor_bo(struct v3dv_device *device, + struct v3dv_descriptor_state *descriptor_state, + struct v3dv_descriptor_map *map, + struct v3dv_pipeline_layout *pipeline_layout, + uint32_t index, + VkDescriptorType *out_type); + const struct v3dv_sampler * v3dv_descriptor_map_get_sampler(struct v3dv_descriptor_state *descriptor_state, struct v3dv_descriptor_map *map, diff --git a/src/broadcom/vulkan/v3dv_uniforms.c b/src/broadcom/vulkan/v3dv_uniforms.c index 863f7d07e92..8f14d0195a7 100644 --- a/src/broadcom/vulkan/v3dv_uniforms.c +++ b/src/broadcom/vulkan/v3dv_uniforms.c @@ -56,7 +56,8 @@ struct state_bo_list { struct v3dv_bo *states[MAX_TOTAL_STATES]; }; -#define MAX_TOTAL_UNIFORM_BUFFERS (1 + MAX_UNIFORM_BUFFERS * MAX_STAGES) +#define MAX_TOTAL_UNIFORM_BUFFERS (1 + (MAX_UNIFORM_BUFFERS + \ + MAX_INLINE_UNIFORM_BUFFERS) * MAX_STAGES) #define MAX_TOTAL_STORAGE_BUFFERS (MAX_STORAGE_BUFFERS * MAX_STAGES) struct buffer_bo_list { struct v3dv_bo *ubo[MAX_TOTAL_UNIFORM_BUFFERS]; @@ -247,10 +248,12 @@ write_ubo_ssbo_uniforms(struct v3dv_cmd_buffer *cmd_buffer, uint32_t dynamic_offset = 0; - /* For ubos, index is shifted, as 0 is reserved for push constants. + /* For ubos, index is shifted, as 0 is reserved for push constants + * and 1..MAX_INLINE_UNIFORM_BUFFERS are reserved for inline uniform + * buffers. */ - if (content == QUNIFORM_UBO_ADDR && - v3d_unit_data_get_unit(data) == 0) { + uint32_t index = v3d_unit_data_get_unit(data); + if (content == QUNIFORM_UBO_ADDR && index == 0) { /* This calls is to ensure that the push_constant_ubo is * updated. 
It already take into account it is should do the * update or not @@ -266,40 +269,97 @@ write_ubo_ssbo_uniforms(struct v3dv_cmd_buffer *cmd_buffer, offset + dynamic_offset); buffer_bos->ubo[0] = resource->bo; } else { - uint32_t index = - content == QUNIFORM_UBO_ADDR ? - v3d_unit_data_get_unit(data) - 1 : - data; + if (content == QUNIFORM_UBO_ADDR) { + /* We reserve index 0 for push constants and artificially increase our + * indices by one for that reason, fix that now before accessing the + * descriptor map. + */ + assert(index > 0); + index--; + } else { + index = data; + } struct v3dv_descriptor *descriptor = v3dv_descriptor_map_get_descriptor(descriptor_state, map, pipeline->layout, index, &dynamic_offset); + + /* Inline UBO descriptors store UBO data in descriptor pool memory, + * instead of an external buffer. + */ assert(descriptor); - assert(descriptor->buffer); - assert(descriptor->buffer->mem); - assert(descriptor->buffer->mem->bo); if (content == QUNIFORM_GET_SSBO_SIZE || content == QUNIFORM_GET_UBO_SIZE) { cl_aligned_u32(uniforms, descriptor->range); } else { - cl_aligned_u32(uniforms, descriptor->buffer->mem->bo->offset + - descriptor->buffer->mem_offset + - descriptor->offset + - offset + dynamic_offset); + /* Inline uniform buffers store their contents in pool memory instead + * of an external buffer. 
+ */ + struct v3dv_bo *bo; + uint32_t addr; + if (descriptor->type == VK_DESCRIPTOR_TYPE_INLINE_UNIFORM_BLOCK_EXT) { + assert(dynamic_offset == 0); + struct v3dv_cl_reloc reloc = + v3dv_descriptor_map_get_descriptor_bo(cmd_buffer->device, + descriptor_state, map, + pipeline->layout, index, + NULL); + bo = reloc.bo; + addr = reloc.bo->offset + reloc.offset + offset; + } else { + assert(descriptor->buffer); + assert(descriptor->buffer->mem); + assert(descriptor->buffer->mem->bo); + + bo = descriptor->buffer->mem->bo; + addr = bo->offset + + descriptor->buffer->mem_offset + + descriptor->offset + + offset + dynamic_offset; + } + + cl_aligned_u32(uniforms, addr); if (content == QUNIFORM_UBO_ADDR) { - assert(index + 1 < MAX_TOTAL_UNIFORM_BUFFERS); - buffer_bos->ubo[index + 1] = descriptor->buffer->mem->bo; + assert(index < MAX_TOTAL_UNIFORM_BUFFERS); + buffer_bos->ubo[index] = bo; } else { assert(index < MAX_TOTAL_STORAGE_BUFFERS); - buffer_bos->ssbo[index] = descriptor->buffer->mem->bo; + buffer_bos->ssbo[index] = bo; } } } } +static void +write_inline_uniform(struct v3dv_cl_out **uniforms, + uint32_t index, + uint32_t offset, + struct v3dv_cmd_buffer *cmd_buffer, + struct v3dv_pipeline *pipeline, + enum broadcom_shader_stage stage) +{ + assert(index < MAX_INLINE_UNIFORM_BUFFERS); + + struct v3dv_descriptor_state *descriptor_state = + v3dv_cmd_buffer_get_descriptor_state(cmd_buffer, pipeline); + + struct v3dv_descriptor_map *map = + &pipeline->shared_data->maps[stage]->ubo_map; + + struct v3dv_cl_reloc reloc = + v3dv_descriptor_map_get_descriptor_bo(cmd_buffer->device, + descriptor_state, map, + pipeline->layout, index, + NULL); + + /* Offset comes in 32-bit units */ + uint32_t *addr = reloc.bo->map + reloc.offset + 4 * offset; + cl_aligned_u32(uniforms, *addr); +} + static uint32_t get_texture_size_from_image_view(struct v3dv_image_view *image_view, enum quniform_contents contents, @@ -432,6 +492,15 @@ v3dv_write_uniforms_wg_offsets(struct v3dv_cmd_buffer *cmd_buffer, 
cl_aligned_u32(&uniforms, cmd_buffer->push_constants_data[data]); break; + case QUNIFORM_INLINE_UBO_0: + case QUNIFORM_INLINE_UBO_1: + case QUNIFORM_INLINE_UBO_2: + case QUNIFORM_INLINE_UBO_3: + write_inline_uniform(&uniforms, + uinfo->contents[i] - QUNIFORM_INLINE_UBO_0, data, + cmd_buffer, pipeline, variant->stage); + break; + case QUNIFORM_VIEWPORT_X_SCALE: cl_aligned_f(&uniforms, dynamic->viewport.scale[0][0] * 256.0f); break; diff --git a/src/gallium/drivers/v3d/v3d_screen.c b/src/gallium/drivers/v3d/v3d_screen.c index f3f0828fe69..92af23fec85 100644 --- a/src/gallium/drivers/v3d/v3d_screen.c +++ b/src/gallium/drivers/v3d/v3d_screen.c @@ -184,7 +184,7 @@ v3d_screen_get_param(struct pipe_screen *pscreen, enum pipe_cap param) return screen->devinfo.ver >= 40; case PIPE_CAP_CONSTANT_BUFFER_OFFSET_ALIGNMENT: - return 256; + return V3D_NON_COHERENT_ATOM_SIZE; case PIPE_CAP_MAX_TEXTURE_GATHER_COMPONENTS: if (screen->devinfo.ver < 40) @@ -872,7 +872,7 @@ v3d_screen_create(int fd, const struct pipe_screen_config *config, v3d_resource_screen_init(pscreen); - screen->compiler = v3d_compiler_init(&screen->devinfo); + screen->compiler = v3d_compiler_init(&screen->devinfo, 0); #ifdef ENABLE_SHADER_CACHE v3d_disk_cache_init(screen);