Merge branch 'copy-mem-indirect-blorp' into 'main'

anv: implement VK_KHR_copy_memory_indirect

See merge request mesa/mesa!39338
Paulo Zanoni 2026-05-07 17:21:51 -07:00
commit 889ee7456d
15 changed files with 1270 additions and 18 deletions


@ -542,7 +542,7 @@ Khronos extensions that are not part of any Vulkan version:
VK_KHR_calibrated_timestamps DONE (anv, hk, kk, nvk, panvk/v10+, radv, tu/a750+, vn)
VK_KHR_compute_shader_derivatives DONE (anv, lvp, nvk, radv, tu, vn)
VK_KHR_cooperative_matrix DONE (anv, nvk/Turing+, radv/gfx11+, vn)
VK_KHR_copy_memory_indirect DONE (nvk, radv/gfx8+)
VK_KHR_copy_memory_indirect DONE (anv, nvk, radv/gfx8+)
VK_KHR_deferred_host_operations DONE (anv, hasvk, lvp, radv, tu, vn)
VK_KHR_depth_clamp_zero_one DONE (anv, nvk, panvk, radv, tu, vn)
VK_KHR_device_address_commands DONE (radv)


@ -11,6 +11,7 @@ typedef uint32_t VkBool32;
typedef uint64_t VkDeviceAddress;
typedef uint64_t VkDeviceSize;
typedef uint32_t VkFlags;
typedef VkFlags VkImageAspectFlags;
typedef enum VkQueryType {
VK_QUERY_TYPE_OCCLUSION = 0,
@ -126,6 +127,25 @@ typedef enum VkDepthBiasRepresentationEXT {
VK_DEPTH_BIAS_REPRESENTATION_MAX_ENUM_EXT = 0x7FFFFFFF
} VkDepthBiasRepresentationEXT;
typedef struct VkOffset3D {
int32_t x;
int32_t y;
int32_t z;
} VkOffset3D __attribute__((aligned(4)));
typedef struct VkExtent3D {
uint32_t width;
uint32_t height;
uint32_t depth;
} VkExtent3D __attribute__((aligned(4)));
typedef struct VkImageSubresourceLayers {
VkImageAspectFlags aspectMask;
uint32_t mipLevel;
uint32_t baseArrayLayer;
uint32_t layerCount;
} VkImageSubresourceLayers __attribute__((aligned(4)));
typedef struct VkDispatchIndirectCommand {
uint32_t x;
uint32_t y;
@ -186,4 +206,13 @@ typedef struct VkCopyMemoryIndirectCommandKHR {
VkDeviceAddress srcAddress;
VkDeviceAddress dstAddress;
VkDeviceSize size;
} VkCopyMemoryIndirectCommandKHR;
} VkCopyMemoryIndirectCommandKHR __attribute__((aligned(4)));
typedef struct VkCopyMemoryToImageIndirectCommandKHR {
VkDeviceAddress srcAddress;
uint32_t bufferRowLength;
uint32_t bufferImageHeight;
VkImageSubresourceLayers imageSubresource;
VkOffset3D imageOffset;
VkExtent3D imageExtent;
} VkCopyMemoryToImageIndirectCommandKHR __attribute__((aligned(4)));
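These stub typedefs mirror the application-visible structures; the layouts above are exactly what the copy shaders added later in this series read back out of the application's indirect buffer. As a host-side reference only (not part of this change; the helper name and the sType constant are assumptions, and loading the extension entry point via vkGetDeviceProcAddr is omitted), recording a single indirect copy looks roughly like this:

#include <vulkan/vulkan.h>

/* Illustrative sketch only: assumes VK_KHR_copy_memory_indirect is enabled,
 * 'cmds' is a host mapping of a buffer whose device address is
 * 'indirect_addr', and 'src'/'dst' are valid device addresses.
 */
static void
record_one_indirect_copy(VkCommandBuffer cmd,
                         VkCopyMemoryIndirectCommandKHR *cmds,
                         VkDeviceAddress indirect_addr,
                         VkDeviceAddress src, VkDeviceAddress dst,
                         VkDeviceSize size)
{
   /* One packed record; the stride passed below must match this layout. */
   cmds[0] = (VkCopyMemoryIndirectCommandKHR) {
      .srcAddress = src,
      .dstAddress = dst,
      .size = size,
   };

   const VkCopyMemoryIndirectInfoKHR info = {
      .sType = VK_STRUCTURE_TYPE_COPY_MEMORY_INDIRECT_INFO_KHR, /* assumed name */
      .copyAddressRange = {
         .address = indirect_addr,
         .stride = sizeof(VkCopyMemoryIndirectCommandKHR),
         .size = sizeof(VkCopyMemoryIndirectCommandKHR),
      },
      .copyCount = 1,
   };
   vkCmdCopyMemoryIndirectKHR(cmd, &info);
}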


@ -534,3 +534,10 @@ blorp_emit_post_draw(struct blorp_batch *blorp_batch, const struct blorp_params
genX(maybe_emit_breakpoint)(batch, false);
blorp_measure_end(blorp_batch, params);
}
static bool *
blorp_get_write_fencing_status(struct blorp_batch *blorp_batch)
{
struct iris_batch *batch = blorp_batch->driver_batch;
return &batch->write_fence_status;
}


@ -41,6 +41,8 @@ typedef struct nir_shader nir_shader;
enum blorp_op {
BLORP_OP_BLIT,
BLORP_OP_COPY,
BLORP_OP_COPY_INDIRECT,
BLORP_OP_COPY_IMAGE_INDIRECT,
BLORP_OP_CCS_AMBIGUATE,
BLORP_OP_CCS_COLOR_CLEAR,
BLORP_OP_CCS_PARTIAL_RESOLVE,
@ -299,6 +301,22 @@ blorp_buffer_copy(struct blorp_batch *batch,
struct blorp_address dst,
uint64_t size);
void
blorp_copy_memory_indirect(struct blorp_batch *batch,
uint64_t indirect_buf_addr,
uint32_t copy_count,
uint64_t stride);
void
blorp_copy_memory_to_image_indirect(struct blorp_batch *batch,
const struct blorp_surf *img_blorp_surf,
uint64_t indirect_buf_addr,
uint64_t indirect_buf_stride,
uint32_t first_copy_idx,
uint32_t img_mip_level,
int layer_count,
int forced_layer_or_z);
void
blorp_fast_clear(struct blorp_batch *batch,
const struct blorp_surf *surf,


@ -148,6 +148,9 @@ brw_blorp_get_urb_length(const struct brw_fs_prog_data *prog_data)
return MAX2((prog_data->num_varying_inputs + 1) / 2, 1);
}
static bool *
blorp_get_write_fencing_status(struct blorp_batch *batch);
/***** BEGIN blorp_exec implementation ******/
static uint64_t
@ -164,9 +167,25 @@ _blorp_combine_address(struct blorp_batch *batch, void *location,
#define __gen_address_type struct blorp_address
#define __gen_user_data struct blorp_batch
#define __gen_combine_address _blorp_combine_address
#define __gen_get_write_fencing_status(b) blorp_get_write_fencing_status(b)
#define __gen_get_batch_dwords(b, d) blorp_emit_dwords((b), (d))
static inline struct blorp_address
__gen_address_offset(struct blorp_address addr, uint64_t offset)
{
addr.offset += offset;
return addr;
}
static inline struct blorp_address
__gen_get_batch_address(struct blorp_batch *batch, void *location)
{
UNREACHABLE("Not supported by blorp");
}
#include "genxml/genX_pack.h"
#include "common/intel_genX_state_brw.h"
#include "common/mi_builder.h"
#define _blorp_cmd_length(cmd) cmd ## _length
#define _blorp_cmd_length_bias(cmd) cmd ## _length_bias
@ -1361,6 +1380,10 @@ blorp_setup_binding_table(struct blorp_batch *batch,
uint32_t surface_offsets[BLORP_NUM_BT_ENTRIES], bind_offset = 0;
void *surface_maps[BLORP_NUM_BT_ENTRIES];
/* There's nothing to bind here. */
if (params->op == BLORP_OP_COPY_INDIRECT)
return 0;
if (params->use_pre_baked_binding_table) {
bind_offset = params->pre_baked_binding_table_offset;
} else {
@ -1765,6 +1788,170 @@ blorp_get_compute_push_const(struct blorp_batch *batch,
*state_size = push_const_size;
}
static void
blorp_indirect_buffer_get_dispatch_size(struct blorp_batch *batch,
const struct blorp_params *params,
struct mi_builder *b,
struct mi_value *size_x,
struct mi_value *size_y,
struct mi_value *size_z)
{
struct blorp_context *blorp = batch->blorp;
const struct brw_cs_prog_data *cs_prog_data = params->cs_prog_data;
uint64_t indirect_buf_addr = params->wm_inputs.indirect.indirect_buf_addr;
uint64_t stride = params->wm_inputs.indirect.indirect_buf_stride;
uint32_t copy_count = params->wm_inputs.indirect.copy_count;
size_t size_offset = 16; /* offsetof(VkCopyMemoryIndirectCommandKHR, size) */
struct mi_value biggest_copy_size;
for (int c = 0; c < copy_count; c++) {
struct blorp_address copy_size_addr = {
.buffer = NULL,
.offset = indirect_buf_addr + c * stride + size_offset,
.reloc_flags = 0,
.mocs = isl_mocs(blorp->isl_dev, ISL_SURF_USAGE_STORAGE_BIT, false),
.local_hint = false, /* We don't have a way to know this. */
};
struct mi_value this_copy_size = mi_mem32(copy_size_addr);
if (c == 0)
biggest_copy_size = this_copy_size;
else
biggest_copy_size = mi_umax2(b, this_copy_size, biggest_copy_size);
}
/* Each shader invocation writes a uint32_t. */
int divisor = cs_prog_data->local_size[0] * sizeof(uint32_t);
*size_x = mi_udiv32_imm(b, mi_iadd_imm(b, biggest_copy_size, divisor - 1),
divisor);
assert(cs_prog_data->local_size[1] == 1);
assert(cs_prog_data->local_size[2] == 1);
*size_y = mi_imm(1);
*size_z = mi_imm(1);
}
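For reference, the host-side arithmetic that the MI commands above reproduce on the GPU would look like the sketch below (a model only, with a hypothetical helper name; the actual per-copy sizes live in the indirect buffer and are never visible to the CPU, which is why the dispatch width has to be computed with MI_MATH):

#include <stdint.h>

/* Host-side model of the dispatch sizing above: one workgroup covers
 * local_size_x invocations, and each invocation copies one uint32_t.
 */
static uint32_t
model_dispatch_width(const uint64_t *copy_sizes, uint32_t copy_count,
                     uint32_t local_size_x)
{
   uint64_t biggest = 0;
   for (uint32_t c = 0; c < copy_count; c++)
      biggest = copy_sizes[c] > biggest ? copy_sizes[c] : biggest;

   const uint64_t divisor = local_size_x * sizeof(uint32_t);
   return (uint32_t)((biggest + divisor - 1) / divisor); /* DIV_ROUND_UP */
}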
static void
blorp_indirect_buf2img_get_dispatch_size(struct blorp_batch *batch,
const struct blorp_params *params,
struct mi_builder *b,
struct mi_value *size_x,
struct mi_value *size_y,
struct mi_value *size_z)
{
struct blorp_context *blorp = batch->blorp;
const struct brw_cs_prog_data *cs_prog_data = params->cs_prog_data;
uint64_t indirect_buf_addr = params->wm_inputs.indirect.indirect_buf_addr;
uint64_t stride = params->wm_inputs.indirect.indirect_buf_stride;
uint32_t copy_idx = params->wm_inputs.indirect.copy_idx;
bool is_forced_layer = params->wm_inputs.indirect.forced_layer_or_z != -1;
bool is_3d = params->wm_inputs.indirect.dimensions == 3;
/* These are all offsetof(VkCopyMemoryToImageIndirectCommandKHR, x). */
const size_t img_extent_x_offset = 44;
const size_t img_extent_y_offset = 48;
const size_t img_extent_z_offset = 52;
struct blorp_address x_extent_addr = {
.buffer = NULL,
.offset = indirect_buf_addr + copy_idx * stride + img_extent_x_offset,
.reloc_flags = 0,
.mocs = isl_mocs(blorp->isl_dev, ISL_SURF_USAGE_STORAGE_BIT, false),
.local_hint = false, /* We don't have a way to know this. */
};
struct blorp_address y_extent_addr = {
.buffer = NULL,
.offset = indirect_buf_addr + copy_idx * stride + img_extent_y_offset,
.reloc_flags = 0,
.mocs = isl_mocs(blorp->isl_dev, ISL_SURF_USAGE_STORAGE_BIT, false),
.local_hint = false, /* We don't have a way to know this. */
};
struct blorp_address z_extent_addr = {
.buffer = NULL,
.offset = indirect_buf_addr + copy_idx * stride + img_extent_z_offset,
.reloc_flags = 0,
.mocs = isl_mocs(blorp->isl_dev, ISL_SURF_USAGE_STORAGE_BIT, false),
.local_hint = false, /* We don't have a way to know this. */
};
/* Notes on the 'params->num_layers' usage below:
 *
 * - Please see how we handle this in
 *   blorp_copy_memory_to_image_indirect().
 *
 * - If we're using forced layers (see
 *   params.wm_inputs.indirect.forced_layer_or_z), then we can just set
 *   size_z to 1 and not even look at the indirect buffer: each shader
 *   call operates on a single layer.
 *
 * - The information on the number of layers for each copy is not truly
 *   indirect: it has to be passed in during command creation, so we have
 *   already processed it. See 'max_layer_count' in
 *   blorp_copy_memory_to_image_indirect(). We don't want to be looking
 *   at the indirect buffer if we don't need to.
 *
 * - For 3D images, applications can use the Z axis either as an actual
 *   axis (offset.z + extent.z) or pretend the Z axis is layers
 *   (base_layer + layer_count), so we would have to check both places.
 *   Fortunately, base_layer + layer_count was already checked (see
 *   above) and recorded in params->num_layers. If params->num_layers is
 *   greater than 1, then we don't even bother looking at extent.z.
 */
struct mi_value x_extent = mi_mem32(x_extent_addr);
struct mi_value y_extent = mi_mem32(y_extent_addr);
struct mi_value z_extent;
if (is_forced_layer) {
z_extent = mi_imm(1);
} else {
if (is_3d && params->num_layers == 1)
z_extent = mi_mem32(z_extent_addr);
else
z_extent = mi_imm(params->num_layers);
}
int divisor_x = cs_prog_data->local_size[0];
int divisor_y = cs_prog_data->local_size[1];
int divisor_z = cs_prog_data->local_size[2];
assert(divisor_x != 1 && divisor_y != 1 && divisor_z == 1);
*size_x = mi_udiv32_imm(b, mi_iadd_imm(b, x_extent, divisor_x - 1),
divisor_x);
*size_y = mi_udiv32_imm(b, mi_iadd_imm(b, y_extent, divisor_y - 1),
divisor_y);
*size_z = z_extent;
}
static void
blorp_indirect_write_gpgpu_dispatch_regs(struct blorp_batch *batch,
const struct blorp_params *params)
{
struct blorp_context *blorp = batch->blorp;
const struct intel_device_info *devinfo = blorp->compiler->brw->devinfo;
struct mi_builder b;
mi_builder_init(&b, devinfo, batch);
mi_builder_set_mocs(&b, isl_mocs(blorp->isl_dev, 0, false));
struct mi_value size_x, size_y, size_z;
if (params->op == BLORP_OP_COPY_INDIRECT) {
blorp_indirect_buffer_get_dispatch_size(batch, params, &b, &size_x,
&size_y, &size_z);
} else {
assert(params->op == BLORP_OP_COPY_IMAGE_INDIRECT);
blorp_indirect_buf2img_get_dispatch_size(batch, params, &b, &size_x,
&size_y, &size_z);
}
mi_store(&b, mi_reg32(GENX(GPGPU_DISPATCHDIMX_num)), size_x);
mi_store(&b, mi_reg32(GENX(GPGPU_DISPATCHDIMY_num)), size_y);
mi_store(&b, mi_reg32(GENX(GPGPU_DISPATCHDIMZ_num)), size_z);
}
static void
blorp_exec_compute(struct blorp_batch *batch, const struct blorp_params *params)
{
@ -1779,6 +1966,11 @@ blorp_exec_compute(struct blorp_batch *batch, const struct blorp_params *params)
const struct intel_cs_dispatch_info dispatch =
brw_cs_get_dispatch_info(devinfo, cs_prog_data, NULL);
bool use_indirect = blorp_op_type_is_indirect(params->op);
if (use_indirect)
blorp_indirect_write_gpgpu_dispatch_regs(batch, params);
uint32_t group_x0 = params->x0 / cs_prog_data->local_size[0];
uint32_t group_y0 = params->y0 / cs_prog_data->local_size[1];
uint32_t group_z0 = params->dst.z_offset;
@ -1892,6 +2084,7 @@ blorp_exec_compute(struct blorp_batch *batch, const struct blorp_params *params)
assert(cs_prog_data->push.per_thread.regs == 0);
blorp_emit(batch, GENX(COMPUTE_WALKER), cw) {
cw.IndirectParameterEnable = use_indirect;
cw.body = body;
}
#else /* GFX_VERx10 >= 125 */
@ -1961,6 +2154,7 @@ blorp_exec_compute(struct blorp_batch *batch, const struct blorp_params *params)
}
blorp_emit(batch, GENX(GPGPU_WALKER), ggw) {
ggw.IndirectParameterEnable = use_indirect;
ggw.SIMDSize = dispatch.simd_size / 16;
ggw.ThreadDepthCounterMaximum = 0;
ggw.ThreadHeightCounterMaximum = 0;


@ -0,0 +1,355 @@
/* Copyright © 2025 Intel Corporation
* SPDX-License-Identifier: MIT
*/
#include "blorp_nir_builder.h"
#include "compiler/nir/nir_format_convert.h"
#include "blorp_priv.h"
#include "dev/intel_debug.h"
#include "dev/intel_device_info.h"
#include "blorp_shaders.h"
/* Refer to struct blorp_wm_inputs_indirect. */
struct blorp_indirect_vars {
nir_variable *indirect_buf_addr;
nir_variable *stride;
nir_variable *copy_count;
nir_variable *copy_idx;
nir_variable *max_layer;
nir_variable *x_offset;
nir_variable *y_offset;
};
static enum isl_format
get_format_for_copy(int format_bpb)
{
switch (format_bpb) {
case 8: return ISL_FORMAT_R8_UINT;
case 16: return ISL_FORMAT_R16_UINT;
case 24: return ISL_FORMAT_R8G8B8_UINT;
case 32: return ISL_FORMAT_R32_UINT;
case 48: return ISL_FORMAT_R16G16B16_UINT;
case 64: return ISL_FORMAT_R32G32_UINT;
case 96: return ISL_FORMAT_R32G32B32_UINT;
case 128: return ISL_FORMAT_R32G32B32A32_UINT;
default:
mesa_loge("unexpected format bpb: %d", format_bpb);
assert(false);
return ISL_FORMAT_UNSUPPORTED;
}
}
static void
blorp_indirect_vars_init(nir_builder *b, struct blorp_indirect_vars *v)
{
v->copy_count =
BLORP_CREATE_NIR_INPUT(b->shader, indirect.copy_count,
glsl_uint_type());
v->indirect_buf_addr =
BLORP_CREATE_NIR_INPUT(b->shader, indirect.indirect_buf_addr,
glsl_uint64_t_type());
v->stride =
BLORP_CREATE_NIR_INPUT(b->shader, indirect.indirect_buf_stride,
glsl_uint64_t_type());
v->copy_idx =
BLORP_CREATE_NIR_INPUT(b->shader, indirect.copy_idx,
glsl_uint_type());
v->max_layer =
BLORP_CREATE_NIR_INPUT(b->shader, indirect.max_layer,
glsl_uint_type());
v->x_offset =
BLORP_CREATE_NIR_INPUT(b->shader, indirect.x_offset,
glsl_uint_type());
v->y_offset =
BLORP_CREATE_NIR_INPUT(b->shader, indirect.y_offset,
glsl_uint_type());
}
static nir_shader *
blorp_build_copy_mem_indirect_shader(struct blorp_batch *batch,
void *mem_ctx)
{
struct blorp_context *blorp = batch->blorp;
mesa_shader_stage stage = MESA_SHADER_COMPUTE;
const nir_shader_compiler_options *nir_options =
blorp->compiler->nir_options(blorp, stage);
nir_builder b = nir_builder_init_simple_shader(stage, nir_options,
"copy mem indirect");
ralloc_steal(mem_ctx, b.shader);
b.shader->info.workgroup_size[0] = 32;
b.shader->info.workgroup_size[1] = 1;
b.shader->info.workgroup_size[2] = 1;
struct blorp_indirect_vars v;
blorp_indirect_vars_init(&b, &v);
/* The indirect buffer is an array containing 'copy_count'
* VkCopyMemoryIndirectCommandKHR structures, separated by 'stride' bytes.
*/
nir_def *copy_count = nir_load_var(&b, v.copy_count);
nir_def *indirect_buf_addr = nir_load_var(&b, v.indirect_buf_addr);
nir_def *stride = nir_load_var(&b, v.stride);
nir_def *global_id =
nir_channel(&b, nir_load_global_invocation_id(&b, 32), 0);
blorp_copy_memory_indirect_shader(&b, indirect_buf_addr, copy_count,
stride, global_id);
return b.shader;
}
static nir_shader *
blorp_build_copy_mem2img_indirect_shader(struct blorp_batch *batch,
void *mem_ctx,
struct blorp_indirect_copy_mem2img_key *key)
{
struct blorp_context *blorp = batch->blorp;
mesa_shader_stage stage = MESA_SHADER_COMPUTE;
const nir_shader_compiler_options *nir_options =
blorp->compiler->nir_options(blorp, stage);
nir_builder b = nir_builder_init_simple_shader(stage, nir_options,
"copy mem2img indirect");
ralloc_steal(mem_ctx, b.shader);
b.shader->info.workgroup_size[0] = 4;
b.shader->info.workgroup_size[1] = 8;
b.shader->info.workgroup_size[2] = 1;
struct blorp_indirect_vars v;
blorp_indirect_vars_init(&b, &v);
/* The indirect buffer is an array containing 'copy_count'
* VkCopyMemoryToImageIndirectCommandKHR structures, separated by 'stride'
* bytes.
*/
nir_def *indirect_buf_addr = nir_load_var(&b, v.indirect_buf_addr);
nir_def *stride = nir_load_var(&b, v.stride);
nir_def *copy_idx = nir_load_var(&b, v.copy_idx);
nir_def *max_layer = nir_load_var(&b, v.max_layer);
nir_def *dest_coord_offsets_arr[2] = {
nir_load_var(&b, v.x_offset),
nir_load_var(&b, v.y_offset),
};
nir_def *dest_coord_offsets = nir_vec(&b, dest_coord_offsets_arr, 2);
nir_def *global_id = nir_load_global_invocation_id(&b, 32);
/* Shader keys (constants). */
nir_def *dimensions = nir_imm_int(&b, key->dimensions);
nir_def *forced_layer_or_z = nir_imm_int(&b, key->forced_layer_or_z);
nir_def *format_Bpb = nir_imm_intN_t(&b, key->format_Bpb, 16);
nir_def *format_block_size_arr[3] = {
nir_imm_int(&b, key->format_bw),
nir_imm_int(&b, key->format_bh),
nir_imm_int(&b, key->format_bd),
};
nir_def *format_block_size = nir_vec(&b, format_block_size_arr, 3);
/* Constants derived from the shader keys. */
nir_def *is_block_compressed =
nir_imm_bool(&b, key->format_bw > 1 || key->format_bh > 1 ||
key->format_bd > 1);
blorp_copy_memory_to_image_indirect_shader(&b, indirect_buf_addr,
stride,
copy_idx,
max_layer,
dest_coord_offsets,
global_id,
dimensions,
forced_layer_or_z,
format_Bpb,
format_block_size,
is_block_compressed);
return b.shader;
}
static bool
blorp_get_copy_mem_indirect_kernel_cs(struct blorp_batch *batch,
struct blorp_params *params)
{
struct blorp_context *blorp = batch->blorp;
const char *key = "copy_mem_indirect_kernel_cs";
uint32_t key_size = strlen(key);
if (blorp->lookup_shader(batch, key, key_size, &params->cs_prog_kernel,
&params->cs_prog_data))
return true;
void *mem_ctx = ralloc_context(NULL);
nir_shader *nir =
blorp_build_copy_mem_indirect_shader(batch, mem_ctx);
const struct blorp_program prog =
blorp_compile_cs(blorp, mem_ctx, nir, key, key_size);
bool result = blorp->upload_shader(batch, MESA_SHADER_COMPUTE,
key, key_size,
prog.kernel, prog.kernel_size,
prog.prog_data, prog.prog_data_size,
&params->cs_prog_kernel,
&params->cs_prog_data);
ralloc_free(mem_ctx);
return result;
}
static bool
blorp_get_copy_mem2img_indirect_kernel_cs(struct blorp_batch *batch,
struct blorp_params *params,
struct blorp_indirect_copy_mem2img_key *key)
{
struct blorp_context *blorp = batch->blorp;
if (blorp->lookup_shader(batch, key, sizeof(*key), &params->cs_prog_kernel,
&params->cs_prog_data))
return true;
void *mem_ctx = ralloc_context(NULL);
nir_shader *nir =
blorp_build_copy_mem2img_indirect_shader(batch, mem_ctx, key);
const struct blorp_program prog =
blorp_compile_cs(blorp, mem_ctx, nir, key, sizeof(*key));
bool result = blorp->upload_shader(batch, MESA_SHADER_COMPUTE,
key, sizeof(*key),
prog.kernel, prog.kernel_size,
prog.prog_data, prog.prog_data_size,
&params->cs_prog_kernel,
&params->cs_prog_data);
ralloc_free(mem_ctx);
return result;
}
void
blorp_copy_memory_indirect(struct blorp_batch *batch,
uint64_t indirect_buf_addr,
uint32_t copy_count,
uint64_t stride)
{
struct blorp_params params;
blorp_params_init(&params);
params.op = BLORP_OP_COPY_INDIRECT;
params.shader_type = BLORP_SHADER_TYPE_COPY_INDIRECT;
params.shader_pipeline = BLORP_SHADER_PIPELINE_COMPUTE;
params.x0 = 0;
params.y0 = 0;
params.x1 = 1;
params.y1 = 1;
params.wm_inputs.indirect.indirect_buf_addr = indirect_buf_addr;
params.wm_inputs.indirect.indirect_buf_stride = stride;
params.wm_inputs.indirect.copy_count = copy_count;
if (!blorp_get_copy_mem_indirect_kernel_cs(batch, &params)) {
mesa_loge("failed to get copy_memory_indirect CS kernel");
assert(false);
return;
}
batch->blorp->exec(batch, &params);
}
void
blorp_copy_memory_to_image_indirect(struct blorp_batch *batch,
const struct blorp_surf *img_blorp_surf,
uint64_t indirect_buf_addr,
uint64_t indirect_buf_stride,
uint32_t copy_idx,
uint32_t img_mip_level,
int layer_count,
int forced_layer_or_z)
{
enum isl_format original_format = img_blorp_surf->surf->format;
const struct isl_format_layout *fmtl =
isl_format_get_layout(original_format);
enum isl_format copy_format = get_format_for_copy(fmtl->bpb);
int dimensions = img_blorp_surf->surf->dim + 1;
struct blorp_indirect_copy_mem2img_key key = {
.dimensions = dimensions,
.forced_layer_or_z = forced_layer_or_z,
.format_Bpb = fmtl->bpb / 8,
.format_bw = fmtl->bw,
.format_bh = fmtl->bh,
.format_bd = fmtl->bd,
};
struct blorp_params params;
blorp_params_init(&params);
params.op = BLORP_OP_COPY_IMAGE_INDIRECT;
params.shader_type = BLORP_SHADER_TYPE_COPY_INDIRECT;
params.shader_pipeline = BLORP_SHADER_PIPELINE_COMPUTE;
params.wm_inputs.indirect.indirect_buf_addr = indirect_buf_addr;
params.wm_inputs.indirect.indirect_buf_stride = indirect_buf_stride;
params.wm_inputs.indirect.copy_idx = copy_idx;
params.wm_inputs.indirect.dimensions = dimensions;
params.wm_inputs.indirect.max_layer =
img_blorp_surf->surf->logical_level0_px.array_len - 1;
params.wm_inputs.indirect.forced_layer_or_z = forced_layer_or_z;
/* params.dst is our image. */
blorp_surface_info_init(batch, &params.dst, img_blorp_surf,
img_mip_level,
forced_layer_or_z == -1 ? 0 : forced_layer_or_z,
copy_format,
true /* is_dest */);
struct isl_extent3d mip_dimensions = {
.width = MAX2(params.dst.surf.logical_level0_px.w >> img_mip_level, 1),
.height = MAX2(params.dst.surf.logical_level0_px.h >> img_mip_level, 1),
.depth = MAX2(params.dst.surf.logical_level0_px.d >> img_mip_level, 1),
};
if (fmtl->bw > 1 || fmtl->bh > 1 || fmtl->bd > 1) {
blorp_surf_convert_to_uncompressed(batch->blorp->isl_dev,
&params.dst, NULL, NULL, NULL, NULL);
params.wm_inputs.indirect.x_offset = params.dst.tile_x_sa;
params.wm_inputs.indirect.y_offset = params.dst.tile_y_sa;
mip_dimensions.width = params.dst.surf.logical_level0_px.w;
mip_dimensions.height = params.dst.surf.logical_level0_px.h;
mip_dimensions.depth = params.dst.surf.logical_level0_px.d;
}
/* These settings control the number of workgroups in the shader, see
 * blorp_exec_compute(). We don't need to divide by the local sizes here;
 * that will be done later.
 */
params.x0 = 0;
params.y0 = 0;
params.x1 = mip_dimensions.width;
params.y1 = mip_dimensions.height;
if (forced_layer_or_z == -1) {
/* We set this here so blorp_indirect_buf2img_get_dispatch_size() can
* read it while figuring out how many shader instances we'll need.
*/
params.num_layers = layer_count;
} else {
params.num_layers = 1;
}
if (!blorp_get_copy_mem2img_indirect_kernel_cs(batch, &params, &key)) {
mesa_loge("failed to get copy_memory_to_image_indirect CS kernel");
assert(false);
return;
}
batch->blorp->exec(batch, &params);
}


@ -205,17 +205,55 @@ struct blorp_wm_inputs_clear {
struct blorp_bounds_rect bounds_rect;
};
/* Parameters used in blorp_indirect_copy.c */
struct blorp_wm_inputs_indirect {
/* The address of the indirect buffer containing the information about the
* indirect copy.
*/
uint64_t indirect_buf_addr;
/* How far apart, in bytes, the information about each copy is inside the
 * indirect buffer.
 */
uint64_t indirect_buf_stride;
/* How many copies we have to do. */
uint32_t copy_count;
/* For memory to image copies, we do a single copy per dispatch. This
 * represents the index of the copy to be done.
 */
uint32_t copy_idx;
/* How many dimensions does our image have? 1, 2 or 3. */
uint32_t dimensions;
/* The maximum array layer of the image. */
uint32_t max_layer;
/* When compressed formats are used, we pretend they are a non-compressed
 * format of the same bpb. Since we can't maintain the exact same layout
 * of mipmap and layer offsets, we're forced to make adjustments to where X
 * and Y actually start, and are also forced to copy only one layer (or Z
 * axis position) per shader invocation.
 */
uint32_t x_offset;
uint32_t y_offset;
int forced_layer_or_z;
};
struct blorp_wm_inputs
{
union {
struct blorp_wm_inputs_blit blit;
struct blorp_wm_inputs_clear clear;
struct blorp_wm_inputs_indirect indirect;
};
/* Note: Pad out to an integral number of registers when extending, but
* make sure subgroup_id is the last 32-bit item.
*/
uint32_t pad[2];
uint32_t pad[1];
uint32_t subgroup_id;
};
@ -257,6 +295,7 @@ enum blorp_shader_type {
BLORP_SHADER_TYPE_MCS_PARTIAL_RESOLVE,
BLORP_SHADER_TYPE_LAYER_OFFSET_VS,
BLORP_SHADER_TYPE_GFX4_SF,
BLORP_SHADER_TYPE_COPY_INDIRECT,
};
enum blorp_shader_pipeline {
@ -467,6 +506,25 @@ struct blorp_blit_prog_key
uint8_t local_y;
};
struct blorp_indirect_copy_mem2img_key {
/* How many dimensions does our image have? 1, 2 or 3. */
uint32_t dimensions;
/* When compressed formats are used, we pretend they are a non-compressed
 * format of the same bpb. Since we can't maintain the exact same layout
 * of mipmap and layer offsets, we're forced to make adjustments to where X
 * and Y actually start, and are also forced to copy only one layer (or Z
 * axis position) per shader invocation.
 */
int forced_layer_or_z;
/* Info taken from isl_format_layout. */
uint16_t format_Bpb;
uint16_t format_bw;
uint16_t format_bh;
uint16_t format_bd;
};
/**
* \name BLORP internals
* \{
@ -597,6 +655,19 @@ blorp_op_type_is_clear(enum blorp_op op)
}
}
/* This means: params->wm_inputs.indirect should be used. */
static inline bool
blorp_op_type_is_indirect(enum blorp_op op)
{
switch (op) {
case BLORP_OP_COPY_INDIRECT:
case BLORP_OP_COPY_IMAGE_INDIRECT:
return true;
default:
return false;
}
}
/* Asserts unless the surface is a buffer to image copy */
#define blorp_assert_is_buffer(surf, view) \
do { \


@ -3,6 +3,7 @@
*/
#include "compiler/libcl/libcl.h"
#include "compiler/libcl/libcl_vk.h"
#include "compiler/nir/nir_defines.h"
#include "compiler/shader_enums.h"
@ -15,3 +16,279 @@ blorp_check_in_bounds(uint4 bounds_rect, uint2 pos)
return pos.x >= x0 && pos.x < x1 &&
pos.y >= y0 && pos.y < y1;
}
void nir_image_store(uint handle, int4 coords, uint sample_index,
uint4 colors, uint lod, uint image_dim,
uint image_array, uint format, uint access,
uint range_base, uint src_type);
/* Used by vkCmdCopyMemoryIndirectKHR. */
void
blorp_copy_memory_indirect_shader(
global uint *indirect_buf,
uint copy_count,
ulong stride_bytes,
uint global_id)
{
for (uint c = 0; c < copy_count; c++) {
uint idx = (uint)(c * (stride_bytes / 4));
/* The spec says the minimum alignment is 4 instead of 8, so we index the
 * buffer in 4-byte units and cast, instead of indexing an array of the
 * struct type.
 */
VkCopyMemoryIndirectCommandKHR cmd =
(*(global VkCopyMemoryIndirectCommandKHR*)&indirect_buf[idx]);
uint copy_size_ints = cmd.size / 4;
global int *src = (global int*)cmd.srcAddress;
global int *dst = (global int*)cmd.dstAddress;
if (global_id < copy_size_ints)
dst[global_id] = src[global_id];
}
}
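In other words, the grid is sized (via the MI_MATH code in genX_blorp_exec) for the largest copy, and every invocation walks all copy records, writing at most one dword per record; invocations beyond a given copy's size simply fail the bounds check. A hypothetical CPU model of one invocation, assuming a tightly packed buffer (stride equal to sizeof(VkCopyMemoryIndirectCommandKHR)) and the struct definition from the stub header above being in scope:

#include <stdint.h>

/* Hypothetical host-side model of one shader invocation of the loop above. */
static void
model_copy_invocation(const VkCopyMemoryIndirectCommandKHR *cmds,
                      uint32_t copy_count, uint32_t global_id)
{
   for (uint32_t c = 0; c < copy_count; c++) {
      uint32_t copy_size_dwords = (uint32_t)(cmds[c].size / 4);
      if (global_id < copy_size_dwords) {
         const uint32_t *src = (const uint32_t *)(uintptr_t)cmds[c].srcAddress;
         uint32_t *dst = (uint32_t *)(uintptr_t)cmds[c].dstAddress;
         dst[global_id] = src[global_id];
      }
   }
}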
struct img_copy_params {
ulong src_address;
uint2 row_size_px;
uint base_layer;
uint layer_count;
uint3 offset;
uint3 extent;
};
void
read_img_params(
global uint *indirect_buf,
uint copy_idx,
ulong stride_bytes,
uint dimensions,
uint max_layer,
uint3 format_block_size,
bool is_block_compressed,
struct img_copy_params *p)
{
uint idx = copy_idx * (stride_bytes / 4);
VkCopyMemoryToImageIndirectCommandKHR cmd =
(*(global VkCopyMemoryToImageIndirectCommandKHR*)&indirect_buf[idx]);
p->src_address = cmd.srcAddress;
p->row_size_px = (uint2)(cmd.bufferRowLength, cmd.bufferImageHeight);
/* We don't use imageSubresource.aspectMask and imageSubresource.mipLevel,
* those are dealt with when the application calls
* vkCmdCopyMemoryToImageIndirectKHR().
*/
p->base_layer = cmd.imageSubresource.baseArrayLayer;
p->layer_count = cmd.imageSubresource.layerCount;
p->offset = (uint3)(cmd.imageOffset.x, cmd.imageOffset.y,
cmd.imageOffset.z);
p->extent = (uint3)(cmd.imageExtent.width, cmd.imageExtent.height,
cmd.imageExtent.depth);
if (p->row_size_px.x == 0)
p->row_size_px.x = p->extent.x;
if (p->row_size_px.y == 0)
p->row_size_px.y = p->extent.y;
/* Our code deals with blocks, not pixels. */
if (is_block_compressed) {
p->offset /= format_block_size;
p->extent = DIV_ROUND_UP(p->extent, format_block_size);
}
/* Users can pass 3D images with the Z axis as an array layer. */
if (dimensions == 3 && (p->base_layer != 0 || p->layer_count != 1)) {
p->offset.z = p->base_layer;
p->extent.z = p->layer_count;
p->base_layer = 0;
p->layer_count = 1;
}
/* This handles VK_REMAINING_ARRAY_LAYERS and bugs. */
if (p->base_layer + p->layer_count > max_layer + 1)
p->layer_count = max_layer - p->base_layer + 1;
}
uint4
get_pixel(
ulong src_address,
uint format_Bpb)
{
switch (format_Bpb) {
case 1: {
global uchar *src_buf = (global uchar *)src_address;
return (uint4)(src_buf[0], 0, 0, 0);
}
case 2: {
global ushort *src_buf = (global ushort *)src_address;
return (uint4)(src_buf[0], 0, 0, 0);
}
case 4: {
global uint *src_buf = (global uint *)src_address;
return (uint4)(src_buf[0], 0, 0, 0);
}
case 8: {
global uint *src_buf = (global uint *)src_address;
return (uint4)(vload2(0, src_buf), 0, 0);
}
case 16: {
global uint *src_buf = (global uint *)src_address;
return (uint4)(vload4(0, src_buf));
}
default:
/* TODO: support 3, 6, 12. */
return (uint4)(0xFF, 0, 0, 0xFF);
}
}
int4
get_coords(
uint3 pos,
uint layer,
uint2 dest_coord_offsets,
int dimensions,
int forced_layer_or_z)
{
int4 ret;
pos.xy += dest_coord_offsets;
switch (dimensions) {
case 1:
ret = (int4)(pos.x, layer, 0, 0);
break;
case 2:
ret = (int4)(pos.x, pos.y, layer, 0);
break;
case 3:
default:
ret = (int4)(pos.x, pos.y, pos.z, 0);
break;
}
if (forced_layer_or_z != -1) {
ret.z = 0;
ret.w = 0;
}
return ret;
}
void
write_pixel(
const int4 coords,
const uint4 colors)
{
/* We don't seem to need to set image_array to true if we set sampler_dim
 * to 3D.
 * Setting mip_level does not do anything on Intel; we set mip levels
 * through the bindings.
 */
nir_image_store(0 /* The image handle. */,
coords, /* See get_coords(). */
0, /* Sample index for multi-sampling. */
colors, /* The RGBA pixels, in src_type. */
0 /* mip_level */,
GLSL_SAMPLER_DIM_3D,
false, /* image_array */
0, /* format */
ACCESS_NON_READABLE, /* access */
0, /* range_base */
nir_type_int32 /* src_type */);
}
/* Used by vkCmdCopyMemoryToImageIndirectKHR. */
void
blorp_copy_memory_to_image_indirect_shader(
/* Actual parameters. */
global uint *indirect_buf,
ulong stride_bytes,
uint copy_idx,
uint max_layer,
uint2 dest_coord_offsets,
uint3 global_id,
/* These are shader keys, they are NIR immediates. */
uint dimensions,
int forced_layer_or_z,
ushort format_Bpb,
uint3 format_block_size,
bool is_block_compressed)
{
/* We have one invocation per texel of a given mip level. This means that
 * for the pixels outside the copy area, we'll hit one of the early
 * returns below.
 */
uint3 src_pos = global_id;
uint layer;
/* 'forced_layer_or_z' means that whatever slice or layer we're
* trying to work with is set as part of our binding (view) as depth
* 0 or layer 0, depending on the dimensionality. This can happen
* when we're trying to pretend a format is something that it's not
* (e.g., we're treating a block compressed 4x4 64bpp format as an
* r32g32 format).
*/
if (forced_layer_or_z != -1) {
if (dimensions == 3) {
src_pos.z = forced_layer_or_z;
layer = 0;
} else {
src_pos.z = 0;
layer = forced_layer_or_z;
}
} else {
if (dimensions == 3) {
src_pos.z = global_id.z;
layer = 0;
} else {
src_pos.z = 0;
layer = global_id.z;
}
}
struct img_copy_params p;
read_img_params(indirect_buf, copy_idx, stride_bytes, dimensions,
max_layer, format_block_size, is_block_compressed, &p);
if (any(src_pos >= p.extent))
return;
uint3 dst_pos = src_pos + p.offset;
if (forced_layer_or_z == -1 && dimensions != 3)
layer = global_id.z + p.base_layer;
if (layer > p.base_layer + p.layer_count - 1)
return;
uint2 row_size_blocks = p.row_size_px;
if (is_block_compressed) {
row_size_blocks = DIV_ROUND_UP(row_size_blocks,
format_block_size.xy);
}
uint row_length_bytes = row_size_blocks.x * format_Bpb;
uint row_height_bytes = row_size_blocks.y * row_length_bytes;
uint buf_z_offset_bytes = src_pos.z * row_height_bytes;
uint buf_y_offset_bytes = src_pos.y * row_length_bytes;
uint buf_x_offset_bytes = src_pos.x * format_Bpb;
uint layer_offset_bytes = (layer - p.base_layer) * row_height_bytes;
uint buf_offset_bytes = layer_offset_bytes +
buf_z_offset_bytes +
buf_y_offset_bytes +
buf_x_offset_bytes;
uint4 colors = get_pixel(p.src_address + buf_offset_bytes,
format_Bpb);
int4 coords = get_coords(dst_pos, layer, dest_coord_offsets, dimensions,
forced_layer_or_z);
write_pixel(coords, colors);
}
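The source addressing above is the standard Vulkan buffer-to-image addressing, expressed in blocks: bufferRowLength/bufferImageHeight (defaulting to the copy extent when zero) give the row and slice pitches, and array layers reuse the slice pitch. A host-side model of the byte-offset computation, for illustration only, with a hypothetical helper name and all quantities assumed to already be in block units:

#include <stdint.h>

/* Hypothetical CPU model of the buf_offset_bytes computation above. */
static uint64_t
model_src_offset_bytes(uint32_t x, uint32_t y, uint32_t z, uint32_t layer_idx,
                       uint32_t row_blocks, uint32_t rows_per_slice,
                       uint32_t bytes_per_block)
{
   const uint64_t row_pitch   = (uint64_t)row_blocks * bytes_per_block;
   const uint64_t slice_pitch = (uint64_t)rows_per_slice * row_pitch;

   return layer_idx * slice_pitch +   /* (layer - base_layer) */
          z * slice_pitch +
          y * row_pitch +
          x * bytes_per_block;
}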


@ -37,6 +37,7 @@ files_libblorp = files(
'blorp.h',
'blorp_blit.c',
'blorp_clear.c',
'blorp_indirect_copy.c',
'blorp_nir_builder.h',
'blorp_priv.h',
)


@ -1066,6 +1066,17 @@ mi_ior(struct mi_builder *b,
MI_ALU_STORE, MI_ALU_ACCU);
}
static inline struct mi_value
mi_ixor(struct mi_builder *b,
struct mi_value src0, struct mi_value src1)
{
if (src0.type == MI_VALUE_TYPE_IMM && src1.type == MI_VALUE_TYPE_IMM)
return mi_imm(mi_value_to_u64(src0) ^ mi_value_to_u64(src1));
return mi_math_binop(b, MI_ALU_XOR, src0, src1,
MI_ALU_STORE, MI_ALU_ACCU);
}
#if GFX_VERx10 >= 125
static inline struct mi_value
mi_ishl(struct mi_builder *b, struct mi_value src0, struct mi_value src1)
@ -1290,6 +1301,40 @@ mi_udiv32_imm(struct mi_builder *b, struct mi_value N, uint32_t D)
}
}
/* Returns the maximum of the two specified unsigned numbers. */
static inline struct mi_value
mi_umax2(struct mi_builder *b, struct mi_value val1, struct mi_value val2)
{
/* The idea of the algorithm here is that the value of 'mask' will be
 * either 0 or ~0 depending on which number is bigger. Then we use AND
 * operations to ensure the smaller value becomes zero and the bigger value
 * is preserved, and finally OR the two results into the destination (the
 * bigger value and zero).
 *
 * In other words:
 *   mask = val1 < val2 ? 0xFFFFFFFF : 0x0;
 *   biggest = (val1 & ~mask) | (val2 & mask);
 */
/* If 'val1' is smaller, 'mask' is ~0, otherwise it's 0. */
struct mi_value mask = mi_ult(b, mi_value_ref(b, val1),
mi_value_ref(b, val2));
struct mi_value notmask = mi_ixor(b, mi_value_ref(b, mask),
mi_imm(UINT64_MAX));
/* If 'val1' is smaller, 'notmask' is 0, so we zero it, otherwise we
* preserve the value by ANDing it with ~0.
*/
struct mi_value val1_or_zero = mi_iand(b, val1, notmask);
/* If 'val2' is smaller, mask is 0, so we zero it, otherwise we preserve
* the value.
*/
struct mi_value val2_or_zero = mi_iand(b, val2, mask);
/* The smaller value was zeroed, the other was preserved, so just OR
* them now.
*/
return mi_ior(b, val1_or_zero, val2_or_zero);
}
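The same trick is easy to sanity-check on the CPU; a plain C rendering of it (for illustration only, not MI builder code):

#include <stdint.h>

/* Branchless unsigned max: 'mask' is all-ones when a < b (what mi_ult()
 * produces here), so exactly one of the two AND terms survives the OR.
 */
static uint64_t
umax_branchless(uint64_t a, uint64_t b)
{
   uint64_t mask = (a < b) ? UINT64_MAX : 0;
   return (a & ~mask) | (b & mask);
}

This is the behavior the umax2_32/umax2_64 tests below exercise against a fixed comparison value.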
#endif /* MI_MATH section */
/* This assumes addresses of strictly more than 32bits (aka. Gfx8+). */


@ -1252,6 +1252,58 @@ TEST_F(mi_builder_test, udiv32_imm)
}
}
TEST_F(mi_builder_test, umax2_32)
{
const uint32_t values[] = {
0x01234567,
0x42424242,
0xffffffff,
0xf0000000,
0x00000001,
0x00000000,
0x000f0000,
0x000f0001,
};
memcpy(input, values, sizeof(values));
uint32_t cmp_val = 0x000f0001;
for (unsigned i = 0; i < ARRAY_SIZE(values); i++)
mi_store(&b, out_mem64(i * 4), mi_umax2(&b, in_mem32(i * 4), mi_imm(cmp_val)));
submit_batch();
for (unsigned i = 0; i < ARRAY_SIZE(values); i++) {
EXPECT_EQ(*(uint32_t *)(output + i * 4),
values[i] >= cmp_val ? values[i] : cmp_val);
}
}
TEST_F(mi_builder_test, umax2_64)
{
const uint64_t values[] = {
0x0123456789abcdef,
0x4242424242424242,
0xffffffffffffffff,
0xf000000000000000,
0x0000000000000001,
0x0000000000000000,
0x00000000f0000000,
0x00000000f0000001,
};
memcpy(input, values, sizeof(values));
uint64_t cmp_val = 0x00000000f0000001;
for (unsigned i = 0; i < ARRAY_SIZE(values); i++)
mi_store(&b, out_mem64(i * 8), mi_umax2(&b, in_mem64(i * 8), mi_imm(cmp_val)));
submit_batch();
for (unsigned i = 0; i < ARRAY_SIZE(values); i++) {
EXPECT_EQ(*(uint64_t *)(output + i * 8),
values[i] >= cmp_val ? values[i] : cmp_val);
}
}
TEST_F(mi_builder_test, store_if)
{
uint64_t u64 = 0xb453b411deadc0deull;


@ -2811,3 +2811,135 @@ anv_image_ccs_op(struct anv_cmd_buffer *cmd_buffer,
anv_blorp_batch_finish(&batch);
}
void
anv_CmdCopyMemoryIndirectKHR(
VkCommandBuffer commandBuffer,
const VkCopyMemoryIndirectInfoKHR* pCopyMemoryIndirectInfo)
{
ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
if (pCopyMemoryIndirectInfo->copyCount == 0)
return;
const uint64_t indirect_buf_addr =
pCopyMemoryIndirectInfo->copyAddressRange.address;
const uint32_t copy_count = pCopyMemoryIndirectInfo->copyCount;
const uint64_t stride = pCopyMemoryIndirectInfo->copyAddressRange.stride;
/* These are all restrictions imposed by the spec. */
assert((pCopyMemoryIndirectInfo->srcCopyFlags &
VK_ADDRESS_COPY_PROTECTED_BIT_KHR) == 0);
assert((pCopyMemoryIndirectInfo->dstCopyFlags &
VK_ADDRESS_COPY_PROTECTED_BIT_KHR) == 0);
assert(pCopyMemoryIndirectInfo->copyAddressRange.size >=
pCopyMemoryIndirectInfo->copyCount *
pCopyMemoryIndirectInfo->copyAddressRange.stride);
assert(!anv_cmd_buffer_is_blitter_queue(cmd_buffer));
enum blorp_batch_flags blorp_flags = BLORP_BATCH_USE_COMPUTE;
struct blorp_batch batch;
anv_blorp_batch_init(cmd_buffer, &batch, blorp_flags);
blorp_copy_memory_indirect(&batch, indirect_buf_addr, copy_count, stride);
anv_blorp_batch_finish(&batch);
}
void
anv_CmdCopyMemoryToImageIndirectKHR(
VkCommandBuffer commandBuffer,
const VkCopyMemoryToImageIndirectInfoKHR* pCopyMemoryToImageIndirectInfo)
{
ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
ANV_FROM_HANDLE(anv_image, anv_image,
pCopyMemoryToImageIndirectInfo->dstImage);
if (pCopyMemoryToImageIndirectInfo->copyCount == 0)
return;
const uint64_t indirect_buf_addr =
pCopyMemoryToImageIndirectInfo->copyAddressRange.address;
const uint32_t copy_count = pCopyMemoryToImageIndirectInfo->copyCount;
const uint64_t stride =
pCopyMemoryToImageIndirectInfo->copyAddressRange.stride;
const VkImageLayout img_layout =
pCopyMemoryToImageIndirectInfo->dstImageLayout;
assert((pCopyMemoryToImageIndirectInfo->srcCopyFlags &
VK_ADDRESS_COPY_PROTECTED_BIT_KHR) == 0);
assert(pCopyMemoryToImageIndirectInfo->copyAddressRange.size >=
pCopyMemoryToImageIndirectInfo->copyCount *
pCopyMemoryToImageIndirectInfo->copyAddressRange.stride);
assert(!anv_cmd_buffer_is_blitter_queue(cmd_buffer));
enum blorp_batch_flags blorp_flags = BLORP_BATCH_USE_COMPUTE;
struct blorp_batch batch;
anv_blorp_batch_init(cmd_buffer, &batch, blorp_flags);
for (int c = 0; c < copy_count; c++) {
const VkImageSubresourceLayers *img_subresource =
&pCopyMemoryToImageIndirectInfo->pImageSubresources[c];
VkImageAspectFlags aspect_mask = img_subresource->aspectMask;
uint32_t mip_level = img_subresource->mipLevel;
uint32_t base_layer = img_subresource->baseArrayLayer;
uint32_t layer_count = img_subresource->layerCount;
assert(mip_level != VK_REMAINING_MIP_LEVELS);
if (layer_count == VK_REMAINING_ARRAY_LAYERS)
layer_count = anv_image->vk.array_layers - base_layer;
const unsigned plane =
anv_image_aspect_to_plane(anv_image, aspect_mask);
struct isl_surf *img_isl_surf =
&anv_image->planes[plane].primary_surface.isl;
enum isl_format format = img_isl_surf->format;
bool format_is_compressed = isl_format_is_compressed(format);
struct blorp_surf img_blorp_surf;
get_blorp_surf_for_anv_image(cmd_buffer, anv_image,
aspect_mask, VK_IMAGE_USAGE_STORAGE_BIT,
img_layout,
anv_image->planes[plane].aux_usage,
format, false /* cross_aspect */,
&img_blorp_surf);
anv_cmd_buffer_mark_image_written(cmd_buffer, anv_image,
img_subresource->aspectMask,
img_blorp_surf.aux_usage, mip_level,
base_layer, layer_count);
/* If the format is compressed, we will pretend the format is one where
 * each pixel is the size of the compressed block, so image stores can
 * work. Unfortunately, that translation only works if we do it for one
 * mip level of a specific layer, since the complete layouts of, say,
 * BC4 and R32G32 formats are not compatible, so we loop through all
 * layers here.
 */
if (format_is_compressed) {
int min_l_or_z, top_l_or_z;
if (img_blorp_surf.surf->dim == ISL_SURF_DIM_3D) {
min_l_or_z = 0;
top_l_or_z = img_isl_surf->logical_level0_px.depth >> mip_level;
} else {
min_l_or_z = base_layer;
top_l_or_z = base_layer + layer_count;
}
for (int l_or_z = min_l_or_z; l_or_z < top_l_or_z; l_or_z++) {
blorp_copy_memory_to_image_indirect(&batch, &img_blorp_surf,
indirect_buf_addr,
stride, c, mip_level,
layer_count, l_or_z);
}
} else {
blorp_copy_memory_to_image_indirect(&batch, &img_blorp_surf,
indirect_buf_addr, stride, c,
mip_level, layer_count, -1);
}
}
anv_blorp_batch_finish(&batch);
}


@ -629,6 +629,39 @@ anv_get_format_aspect(const struct anv_physical_device *device,
return anv_get_format_plane(device, vk_format, plane, tiling);
}
static bool
anv_format_supports_indirect_copies(const struct anv_physical_device *pdevice,
const struct anv_format *anv_format)
{
const struct isl_format_layout *fmtl =
isl_format_get_layout(anv_format->planes[0].isl_format);
/* CTS insists on it even when we say we don't support it. */
if (!pdevice->vk.supported_features.indirectMemoryToImageCopy)
return false;
/* TODO: implement support for this in the copy shader. */
if (!util_is_power_of_two_or_zero(fmtl->bpb))
return false;
/* TODO: we use compute for indirect copies, and compute cannot write HiZ.
 * We could try to support that if we see that applications want it.
 */
if (vk_format_is_depth_or_stencil(anv_format->vk_format))
return false;
/* Let's leave YCbCr and multi-planar formats out until we have proper
* tests to verify they work.
*/
if (isl_format_is_yuv(anv_format->planes[0].isl_format))
return false;
if (anv_format->n_planes > 1)
return false;
return true;
}
// Format capabilities
static bool
@ -715,28 +748,41 @@ anv_color_format_supports_drm_modifier_tiling(const struct anv_physical_device *
}
static VkFormatFeatureFlags2
anv_get_compressed_emulated_format_features(const struct anv_format *anv_format,
anv_get_compressed_emulated_format_features(const struct anv_physical_device *pdevice,
const struct anv_format *anv_format,
const VkImageTiling vk_tiling)
{
VkFormatFeatureFlags2 flags = 0;
assert(isl_format_is_compressed(anv_format->planes[0].isl_format));
/* Require optimal tiling so that we can decompress on upload */
if (vk_tiling == VK_IMAGE_TILING_OPTIMAL) {
switch (vk_tiling) {
case VK_IMAGE_TILING_OPTIMAL:
/* Required features for compressed formats */
return VK_FORMAT_FEATURE_2_SAMPLED_IMAGE_BIT |
VK_FORMAT_FEATURE_2_BLIT_SRC_BIT |
VK_FORMAT_FEATURE_2_SAMPLED_IMAGE_FILTER_LINEAR_BIT |
VK_FORMAT_FEATURE_2_TRANSFER_SRC_BIT |
VK_FORMAT_FEATURE_2_TRANSFER_DST_BIT |
VK_FORMAT_FEATURE_2_HOST_IMAGE_TRANSFER_BIT_EXT;
} else if (vk_tiling == VK_IMAGE_TILING_LINEAR) {
flags |= VK_FORMAT_FEATURE_2_SAMPLED_IMAGE_BIT |
VK_FORMAT_FEATURE_2_BLIT_SRC_BIT |
VK_FORMAT_FEATURE_2_SAMPLED_IMAGE_FILTER_LINEAR_BIT |
VK_FORMAT_FEATURE_2_TRANSFER_SRC_BIT |
VK_FORMAT_FEATURE_2_TRANSFER_DST_BIT |
VK_FORMAT_FEATURE_2_HOST_IMAGE_TRANSFER_BIT_EXT;
break;
case VK_IMAGE_TILING_LINEAR:
/* Images used for transfers */
return VK_FORMAT_FEATURE_2_TRANSFER_SRC_BIT |
VK_FORMAT_FEATURE_2_TRANSFER_DST_BIT |
VK_FORMAT_FEATURE_2_HOST_IMAGE_TRANSFER_BIT_EXT;
flags |= VK_FORMAT_FEATURE_2_TRANSFER_SRC_BIT |
VK_FORMAT_FEATURE_2_TRANSFER_DST_BIT |
VK_FORMAT_FEATURE_2_HOST_IMAGE_TRANSFER_BIT_EXT;
break;
default:
return 0;
}
return 0;
if (anv_format_supports_indirect_copies(pdevice, anv_format))
flags |= VK_FORMAT_FEATURE_2_COPY_IMAGE_INDIRECT_DST_BIT_KHR;
return flags;
}
static VkFormatFeatureFlags2
@ -852,7 +898,8 @@ anv_get_color_format_features(const struct anv_physical_device *physical_device,
if (vk_tiling == VK_IMAGE_TILING_LINEAR &&
isl_format_get_layout(plane_format.isl_format)->txc == ISL_TXC_ASTC)
return VK_FORMAT_FEATURE_2_TRANSFER_SRC_BIT |
VK_FORMAT_FEATURE_2_TRANSFER_DST_BIT;
VK_FORMAT_FEATURE_2_TRANSFER_DST_BIT |
VK_FORMAT_FEATURE_2_COPY_IMAGE_INDIRECT_DST_BIT_KHR;
flags |= VK_FORMAT_FEATURE_2_SAMPLED_IMAGE_BIT |
VK_FORMAT_FEATURE_2_SAMPLED_IMAGE_FILTER_MINMAX_BIT |
@ -935,6 +982,10 @@ anv_get_color_format_features(const struct anv_physical_device *physical_device,
VK_FORMAT_FEATURE_2_TRANSFER_DST_BIT;
}
if ((flags & VK_FORMAT_FEATURE_2_TRANSFER_DST_BIT) &&
anv_format_supports_indirect_copies(physical_device, anv_format))
flags |= VK_FORMAT_FEATURE_2_COPY_IMAGE_INDIRECT_DST_BIT_KHR;
/* XXX: We handle 3-channel formats by switching them out for RGBX or
* RGBA formats behind-the-scenes. This works fine for textures
* because the upload process will fill in the extra channel.
@ -1125,7 +1176,8 @@ anv_get_image_format_features2(const struct anv_physical_device *physical_device
}
if (anv_is_compressed_format_emulated(physical_device, vk_format)) {
return anv_get_compressed_emulated_format_features(anv_format,
return anv_get_compressed_emulated_format_features(physical_device,
anv_format,
vk_tiling);
}


@ -152,6 +152,7 @@ get_device_extensions(const struct anv_physical_device *device,
.KHR_cooperative_matrix = device->has_cooperative_matrix,
.NV_cooperative_matrix2 = device->has_cooperative_matrix,
.KHR_copy_commands2 = true,
.KHR_copy_memory_indirect = true,
.KHR_create_renderpass2 = true,
.KHR_dedicated_allocation = true,
.KHR_deferred_host_operations = true,
@ -1023,6 +1024,10 @@ get_features(const struct anv_physical_device *pdevice,
/* VK_KHR_pipeline_binary */
.pipelineBinaries = true,
/* VK_KHR_copy_memory_indirect */
.indirectMemoryCopy = true,
.indirectMemoryToImageCopy = pdevice->info.ver >= 12,
#ifdef ANV_USE_WSI_PLATFORM
/* VK_EXT_present_timing */
.presentTiming = true,
@ -2205,6 +2210,11 @@ get_properties(const struct anv_physical_device *pdevice,
*/
props->shaderBinaryVersion = 0;
}
/* VK_KHR_copy_memory_indirect */
{
props->supportedQueues = VK_QUEUE_GRAPHICS_BIT | VK_QUEUE_COMPUTE_BIT;
}
}
/* This function restricts the maximum size of system memory heap. The


@ -534,6 +534,8 @@ get_color_aux_op(const struct blorp_params *params)
case BLORP_OP_SLOW_COLOR_CLEAR:
case BLORP_OP_BLIT:
case BLORP_OP_COPY:
case BLORP_OP_COPY_INDIRECT:
case BLORP_OP_COPY_IMAGE_INDIRECT:
assert(params->fast_clear_op == ISL_AUX_OP_NONE);
return ANV_COLOR_AUX_OP_CLASS_NONE;
}
@ -596,3 +598,10 @@ genX(blorp_init_dynamic_states)(struct blorp_context *context)
{
blorp_init_dynamic_states(context);
}
static bool *
blorp_get_write_fencing_status(struct blorp_batch *blorp_batch)
{
struct anv_cmd_buffer *cmd_buffer = blorp_batch->driver_batch;
return &cmd_buffer->batch.write_fence_status;
}