From 6b15e459082c35125f2066e0316d7fe7bda99c73 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Marek=20Ol=C5=A1=C3=A1k?= Date: Mon, 22 Apr 2024 03:48:53 -0400 Subject: [PATCH] ac/nir: import the universal compute clear/blit shader Acked-by: Pierre-Eric Pelloux-Prayer Part-of: --- src/amd/common/ac_nir.c | 19 + src/amd/common/ac_nir_helpers.h | 3 + src/amd/common/ac_nir_meta.h | 54 ++ src/amd/common/ac_nir_meta_cs_blit.c | 439 +++++++++++++++++ src/amd/common/meson.build | 1 + .../drivers/radeonsi/si_compute_blit.c | 93 ++-- src/gallium/drivers/radeonsi/si_pipe.h | 43 -- .../drivers/radeonsi/si_shaderlib_nir.c | 462 +----------------- 8 files changed, 574 insertions(+), 540 deletions(-) create mode 100644 src/amd/common/ac_nir_meta_cs_blit.c diff --git a/src/amd/common/ac_nir.c b/src/amd/common/ac_nir.c index aa17c2f8224..070148cadcc 100644 --- a/src/amd/common/ac_nir.c +++ b/src/amd/common/ac_nir.c @@ -1557,3 +1557,22 @@ ac_optimization_barrier_vgpr_array(const struct radeon_info *info, nir_builder * } } } + +nir_def * +ac_get_global_ids(nir_builder *b, unsigned num_components, unsigned bit_size) +{ + unsigned mask = BITFIELD_MASK(num_components); + + nir_def *local_ids = nir_channels(b, nir_load_local_invocation_id(b), mask); + nir_def *block_ids = nir_channels(b, nir_load_workgroup_id(b), mask); + nir_def *block_size = nir_channels(b, nir_load_workgroup_size(b), mask); + + assert(bit_size == 32 || bit_size == 16); + if (bit_size == 16) { + local_ids = nir_i2iN(b, local_ids, bit_size); + block_ids = nir_i2iN(b, block_ids, bit_size); + block_size = nir_i2iN(b, block_size, bit_size); + } + + return nir_iadd(b, nir_imul(b, block_ids, block_size), local_ids); +} diff --git a/src/amd/common/ac_nir_helpers.h b/src/amd/common/ac_nir_helpers.h index 53ce943e09a..9f84b3cfdcd 100644 --- a/src/amd/common/ac_nir_helpers.h +++ b/src/amd/common/ac_nir_helpers.h @@ -144,6 +144,9 @@ ac_optimization_barrier_vgpr_array(const struct radeon_info *info, nir_builder * nir_def **array, unsigned num_elements, unsigned num_components); +nir_def * +ac_get_global_ids(nir_builder *b, unsigned num_components, unsigned bit_size); + #ifdef __cplusplus } #endif diff --git a/src/amd/common/ac_nir_meta.h b/src/amd/common/ac_nir_meta.h index be231f24754..f0fccbc38dc 100644 --- a/src/amd/common/ac_nir_meta.h +++ b/src/amd/common/ac_nir_meta.h @@ -38,4 +38,58 @@ nir_shader * ac_create_resolve_ps(const struct ac_ps_resolve_options *options, const union ac_ps_resolve_key *key); +/* Universal optimized compute shader for image blits and clears. */ +#define SI_MAX_COMPUTE_BLIT_LANE_SIZE 16 +#define SI_MAX_COMPUTE_BLIT_SAMPLES 8 + +/* This describes all possible variants of the compute blit shader. */ +union ac_cs_blit_key { + struct { + bool use_aco:1; + /* Workgroup settings. */ + uint8_t wg_dim:2; /* 1, 2, or 3 */ + bool has_start_xyz:1; + /* The size of a block of pixels that a single thread will process. */ + uint8_t log_lane_width:3; + uint8_t log_lane_height:2; + uint8_t log_lane_depth:2; + /* Declaration modifiers. */ + bool is_clear:1; + bool src_is_1d:1; + bool dst_is_1d:1; + bool src_is_msaa:1; + bool dst_is_msaa:1; + bool src_has_z:1; + bool dst_has_z:1; + bool a16:1; + bool d16:1; + uint8_t log_samples:2; + bool sample0_only:1; /* src is MSAA, dst is not MSAA, log2_samples is ignored */ + /* Source coordinate modifiers. */ + bool x_clamp_to_edge:1; + bool y_clamp_to_edge:1; + bool flip_x:1; + bool flip_y:1; + /* Output modifiers. */ + bool sint_to_uint:1; + bool uint_to_sint:1; + bool dst_is_srgb:1; + bool use_integer_one:1; + uint8_t last_src_channel:2; /* this shouldn't be greater than last_dst_channel */ + uint8_t last_dst_channel:2; + }; + uint64_t key; +}; + +struct ac_cs_blit_options { + const nir_shader_compiler_options *nir_options; + const struct radeon_info *info; + bool use_aco; /* global driver setting */ + bool no_fmask; /* FMASK disabled by a debug option, ignored on GFX11+ */ + bool print_key; /* print ac_ps_resolve_key into stderr */ +}; + +nir_shader * +ac_create_blit_cs(const struct ac_cs_blit_options *options, const union ac_cs_blit_key *key); + #endif diff --git a/src/amd/common/ac_nir_meta_cs_blit.c b/src/amd/common/ac_nir_meta_cs_blit.c new file mode 100644 index 00000000000..3b7f78a9894 --- /dev/null +++ b/src/amd/common/ac_nir_meta_cs_blit.c @@ -0,0 +1,439 @@ +/* + * Copyright 2024 Advanced Micro Devices, Inc. + * + * SPDX-License-Identifier: MIT + */ + +#include "ac_nir_meta.h" +#include "ac_nir_helpers.h" +#include "nir_format_convert.h" +#include "compiler/aco_interface.h" + +static nir_def * +deref_ssa(nir_builder *b, nir_variable *var) +{ + return &nir_build_deref_var(b, var)->def; +} + +/* unpack_2x16_signed(src, x, y): x = (int32_t)((uint16_t)src); y = src >> 16; */ +static void +unpack_2x16_signed(nir_builder *b, unsigned bit_size, nir_def *src, nir_def **x, nir_def **y) +{ + assert(bit_size == 32 || bit_size == 16); + *x = nir_unpack_32_2x16_split_x(b, src); + *y = nir_unpack_32_2x16_split_y(b, src); + + if (bit_size == 32) { + *x = nir_i2i32(b, *x); + *y = nir_i2i32(b, *y); + } +} + +static nir_def * +convert_linear_to_srgb(nir_builder *b, nir_def *input) +{ + /* There are small precision differences compared to CB, so the gfx blit will return slightly + * different results. + */ + for (unsigned i = 0; i < MIN2(3, input->num_components); i++) { + input = nir_vector_insert_imm(b, input, + nir_format_linear_to_srgb(b, nir_channel(b, input, i)), i); + } + + return input; +} + +static nir_def * +apply_blit_output_modifiers(nir_builder *b, nir_def *color, + const union ac_cs_blit_key *key) +{ + unsigned bit_size = color->bit_size; + nir_def *zero = nir_imm_intN_t(b, 0, bit_size); + + if (key->sint_to_uint) + color = nir_imax(b, color, zero); + + if (key->uint_to_sint) { + color = nir_umin(b, color, + nir_imm_intN_t(b, bit_size == 16 ? INT16_MAX : INT32_MAX, + bit_size)); + } + + if (key->dst_is_srgb) + color = convert_linear_to_srgb(b, color); + + nir_def *one = key->use_integer_one ? nir_imm_intN_t(b, 1, bit_size) : + nir_imm_floatN_t(b, 1, bit_size); + + if (key->is_clear) { + if (key->last_dst_channel < 3) + color = nir_trim_vector(b, color, key->last_dst_channel + 1); + } else { + assert(key->last_src_channel <= key->last_dst_channel); + assert(color->num_components == key->last_src_channel + 1); + + /* Set channels not present in src to 0 or 1. */ + if (key->last_src_channel < key->last_dst_channel) { + color = nir_pad_vector(b, color, key->last_dst_channel + 1); + + for (unsigned chan = key->last_src_channel + 1; chan <= key->last_dst_channel; chan++) + color = nir_vector_insert_imm(b, color, chan == 3 ? one : zero, chan); + } + + /* Discard channels not present in dst. The hardware fills unstored channels with 0. */ + if (key->last_dst_channel < key->last_src_channel) + color = nir_trim_vector(b, color, key->last_dst_channel + 1); + } + + /* Discard channels not present in dst. The hardware fills unstored channels with 0. */ + if (key->last_dst_channel < 3) + color = nir_trim_vector(b, color, key->last_dst_channel + 1); + + return color; +} + +/* The compute blit shader. + * + * Implementation details: + * - Out-of-bounds dst coordinates are not clamped at all. The hw drops + * out-of-bounds stores for us. + * - Out-of-bounds src coordinates are clamped by emulating CLAMP_TO_EDGE using + * the image_size NIR intrinsic. + * - X/Y flipping just does this in the shader: -threadIDs - 1, assuming the starting coordinates + * are 1 pixel after the bottom-right corner, e.g. x + width, matching the gallium behavior. + * - This list doesn't do it justice. + */ +nir_shader * +ac_create_blit_cs(const struct ac_cs_blit_options *options, const union ac_cs_blit_key *key) +{ + if (options->print_key) { + fprintf(stderr, "Internal shader: compute_blit\n"); + fprintf(stderr, " key.use_aco = %u\n", key->use_aco); + fprintf(stderr, " key.wg_dim = %u\n", key->wg_dim); + fprintf(stderr, " key.has_start_xyz = %u\n", key->has_start_xyz); + fprintf(stderr, " key.log_lane_width = %u\n", key->log_lane_width); + fprintf(stderr, " key.log_lane_height = %u\n", key->log_lane_height); + fprintf(stderr, " key.log_lane_depth = %u\n", key->log_lane_depth); + fprintf(stderr, " key.is_clear = %u\n", key->is_clear); + fprintf(stderr, " key.src_is_1d = %u\n", key->src_is_1d); + fprintf(stderr, " key.dst_is_1d = %u\n", key->dst_is_1d); + fprintf(stderr, " key.src_is_msaa = %u\n", key->src_is_msaa); + fprintf(stderr, " key.dst_is_msaa = %u\n", key->dst_is_msaa); + fprintf(stderr, " key.src_has_z = %u\n", key->src_has_z); + fprintf(stderr, " key.dst_has_z = %u\n", key->dst_has_z); + fprintf(stderr, " key.a16 = %u\n", key->a16); + fprintf(stderr, " key.d16 = %u\n", key->d16); + fprintf(stderr, " key.log_samples = %u\n", key->log_samples); + fprintf(stderr, " key.sample0_only = %u\n", key->sample0_only); + fprintf(stderr, " key.x_clamp_to_edge = %u\n", key->x_clamp_to_edge); + fprintf(stderr, " key.y_clamp_to_edge = %u\n", key->y_clamp_to_edge); + fprintf(stderr, " key.flip_x = %u\n", key->flip_x); + fprintf(stderr, " key.flip_y = %u\n", key->flip_y); + fprintf(stderr, " key.sint_to_uint = %u\n", key->sint_to_uint); + fprintf(stderr, " key.uint_to_sint = %u\n", key->uint_to_sint); + fprintf(stderr, " key.dst_is_srgb = %u\n", key->dst_is_srgb); + fprintf(stderr, " key.use_integer_one = %u\n", key->use_integer_one); + fprintf(stderr, " key.last_src_channel = %u\n", key->last_src_channel); + fprintf(stderr, " key.last_dst_channel = %u\n", key->last_dst_channel); + fprintf(stderr, "\n"); + } + + nir_builder b = nir_builder_init_simple_shader(MESA_SHADER_COMPUTE, options->nir_options, + "blit_non_scaled_cs"); + b.shader->info.use_aco_amd = options->use_aco || + (key->use_aco && aco_is_gpu_supported(options->info)); + b.shader->info.num_images = key->is_clear ? 1 : 2; + unsigned image_dst_index = b.shader->info.num_images - 1; + if (!key->is_clear && key->src_is_msaa) + BITSET_SET(b.shader->info.msaa_images, 0); + if (key->dst_is_msaa) + BITSET_SET(b.shader->info.msaa_images, image_dst_index); + /* The workgroup size varies depending on the tiling layout and blit dimensions. */ + b.shader->info.workgroup_size_variable = true; + b.shader->info.cs.user_data_components_amd = + key->is_clear ? (key->d16 ? 6 : 8) : key->has_start_xyz ? 4 : 3; + + const struct glsl_type *img_type[2] = { + glsl_image_type(key->src_is_1d ? GLSL_SAMPLER_DIM_1D : + key->src_is_msaa ? GLSL_SAMPLER_DIM_MS : GLSL_SAMPLER_DIM_2D, + key->src_has_z, GLSL_TYPE_FLOAT), + glsl_image_type(key->dst_is_1d ? GLSL_SAMPLER_DIM_1D : + key->dst_is_msaa ? GLSL_SAMPLER_DIM_MS : GLSL_SAMPLER_DIM_2D, + key->dst_has_z, GLSL_TYPE_FLOAT), + }; + + nir_variable *img_src = NULL; + if (!key->is_clear) { + img_src = nir_variable_create(b.shader, nir_var_uniform, img_type[0], "img0"); + img_src->data.binding = 0; + } + + nir_variable *img_dst = nir_variable_create(b.shader, nir_var_uniform, img_type[1], "img1"); + img_dst->data.binding = image_dst_index; + + unsigned lane_width = 1 << key->log_lane_width; + unsigned lane_height = 1 << key->log_lane_height; + unsigned lane_depth = 1 << key->log_lane_depth; + unsigned lane_size = lane_width * lane_height * lane_depth; + assert(lane_size <= SI_MAX_COMPUTE_BLIT_LANE_SIZE); + + nir_def *zero_lod = nir_imm_intN_t(&b, 0, key->a16 ? 16 : 32); + + /* Instructions. */ + /* Let's work with 0-based src and dst coordinates (thread IDs) first. */ + unsigned coord_bit_size = key->a16 ? 16 : 32; + nir_def *dst_xyz = ac_get_global_ids(&b, key->wg_dim, coord_bit_size); + dst_xyz = nir_pad_vector_imm_int(&b, dst_xyz, 0, 3); + + /* If the blit area is unaligned, we launched extra threads to make it aligned. + * Skip those threads here. + */ + nir_if *if_positive = NULL; + if (key->has_start_xyz) { + nir_def *start_xyz = nir_channel(&b, nir_load_user_data_amd(&b), 3); + start_xyz = nir_u2uN(&b, nir_unpack_32_4x8(&b, start_xyz), coord_bit_size); + start_xyz = nir_trim_vector(&b, start_xyz, 3); + + dst_xyz = nir_isub(&b, dst_xyz, start_xyz); + nir_def *is_positive_xyz = nir_ige_imm(&b, dst_xyz, 0); + nir_def *is_positive = nir_iand(&b, nir_channel(&b, is_positive_xyz, 0), + nir_iand(&b, nir_channel(&b, is_positive_xyz, 1), + nir_channel(&b, is_positive_xyz, 2))); + if_positive = nir_push_if(&b, is_positive); + } + + dst_xyz = nir_imul(&b, dst_xyz, nir_imm_ivec3_intN(&b, lane_width, lane_height, lane_depth, + coord_bit_size)); + nir_def *src_xyz = dst_xyz; + + /* Flip src coordinates. */ + for (unsigned i = 0; i < 2; i++) { + if (i ? key->flip_y : key->flip_x) { + /* A normal blit loads from (box.x + tid.x) where tid.x = 0..(width - 1). + * + * A flipped blit sets box.x = width, so we should make tid.x negative to load from + * (width - 1)..0. + * + * Therefore do: x = -x - 1, which becomes (width - 1) to 0 after we add box.x = width. + */ + nir_def *comp = nir_channel(&b, src_xyz, i); + comp = nir_iadd_imm(&b, nir_ineg(&b, comp), -(int)(i ? lane_height : lane_width)); + src_xyz = nir_vector_insert_imm(&b, src_xyz, comp, i); + } + } + + /* Add box.xyz. */ + nir_def *base_coord_src = NULL, *base_coord_dst = NULL; + unpack_2x16_signed(&b, coord_bit_size, nir_trim_vector(&b, nir_load_user_data_amd(&b), 3), + &base_coord_src, &base_coord_dst); + base_coord_dst = nir_iadd(&b, base_coord_dst, dst_xyz); + base_coord_src = nir_iadd(&b, base_coord_src, src_xyz); + + /* Coordinates must have 4 channels in NIR. */ + base_coord_src = nir_pad_vector(&b, base_coord_src, 4); + base_coord_dst = nir_pad_vector(&b, base_coord_dst, 4); + +/* Iterate over all pixels in the lane. num_samples is the only input. + * (sample, x, y, z) are generated coordinates, while "i" is the coordinates converted to + * an absolute index. + */ +#define foreach_pixel_in_lane(num_samples, sample, x, y, z, i) \ + for (unsigned z = 0; z < lane_depth; z++) \ + for (unsigned y = 0; y < lane_height; y++) \ + for (unsigned x = 0; x < lane_width; x++) \ + for (unsigned i = ((z * lane_height + y) * lane_width + x) * (num_samples), sample = 0; \ + sample < (num_samples); sample++, i++) \ + + /* Swizzle coordinates for 1D_ARRAY. */ + static const unsigned swizzle_xz[] = {0, 2, 0, 0}; + + /* Execute image loads and stores. */ + unsigned num_src_coords = (key->src_is_1d ? 1 : 2) + key->src_has_z + key->src_is_msaa; + unsigned num_dst_coords = (key->dst_is_1d ? 1 : 2) + key->dst_has_z + key->dst_is_msaa; + unsigned bit_size = key->d16 ? 16 : 32; + unsigned num_samples = 1 << key->log_samples; + unsigned src_samples = key->src_is_msaa && !key->sample0_only && + !key->is_clear ? num_samples : 1; + unsigned dst_samples = key->dst_is_msaa ? num_samples : 1; + nir_def *color[SI_MAX_COMPUTE_BLIT_LANE_SIZE * SI_MAX_COMPUTE_BLIT_SAMPLES] = {0}; + nir_def *coord_dst[SI_MAX_COMPUTE_BLIT_LANE_SIZE * SI_MAX_COMPUTE_BLIT_SAMPLES] = {0}; + nir_def *src_resinfo = NULL; + + if (key->is_clear) { + /* The clear color starts at component 4 of user data. */ + color[0] = nir_channels(&b, nir_load_user_data_amd(&b), + BITFIELD_RANGE(4, key->d16 ? 2 : 4)); + if (key->d16) + color[0] = nir_unpack_64_4x16(&b, nir_pack_64_2x32(&b, color[0])); + + foreach_pixel_in_lane(1, sample, x, y, z, i) { + color[i] = color[0]; + } + } else { + nir_def *coord_src[SI_MAX_COMPUTE_BLIT_LANE_SIZE * SI_MAX_COMPUTE_BLIT_SAMPLES] = {0}; + + /* Initialize src coordinates, one vector per pixel. */ + foreach_pixel_in_lane(src_samples, sample, x, y, z, i) { + unsigned tmp_x = x; + unsigned tmp_y = y; + + /* Change the order from 0..N to N..0 for flipped blits. */ + if (key->flip_x) + tmp_x = lane_width - 1 - x; + if (key->flip_y) + tmp_y = lane_height - 1 - y; + + coord_src[i] = nir_iadd(&b, base_coord_src, + nir_imm_ivec4_intN(&b, tmp_x, tmp_y, z, 0, coord_bit_size)); + if (key->src_is_1d) + coord_src[i] = nir_swizzle(&b, coord_src[i], swizzle_xz, 4); + if (key->src_is_msaa) { + coord_src[i] = nir_vector_insert_imm(&b, coord_src[i], + nir_imm_intN_t(&b, sample, coord_bit_size), + num_src_coords - 1); + } + + /* Clamp to edge for src, only X and Y because Z can't be out of bounds. */ + for (unsigned chan = 0; chan < 2; chan++) { + if (chan ? key->y_clamp_to_edge : key->x_clamp_to_edge) { + assert(!key->src_is_1d || chan == 0); + + if (!src_resinfo) { + /* Always use the 32-bit return type because the image dimensions can be + * > INT16_MAX even if the blit box fits within sint16. + */ + src_resinfo = nir_image_deref_size(&b, 4, 32, deref_ssa(&b, img_src), + zero_lod); + if (coord_bit_size == 16) { + src_resinfo = nir_umin_imm(&b, src_resinfo, INT16_MAX); + src_resinfo = nir_i2i16(&b, src_resinfo); + } + } + + nir_def *tmp = nir_channel(&b, coord_src[i], chan); + tmp = nir_imax_imm(&b, tmp, 0); + tmp = nir_imin(&b, tmp, nir_iadd_imm(&b, nir_channel(&b, src_resinfo, chan), -1)); + coord_src[i] = nir_vector_insert_imm(&b, coord_src[i], tmp, chan); + } + } + } + + /* We don't want the computation of src coordinates to be interleaved with loads. */ + if (lane_size > 1 || src_samples > 1) { + ac_optimization_barrier_vgpr_array(options->info, &b, coord_src, + lane_size * src_samples, num_src_coords); + } + + /* Use "samples_identical" for MSAA resolving if it's supported. */ + bool is_resolve = src_samples > 1 && dst_samples == 1; + bool uses_samples_identical = options->info->gfx_level < GFX11 && !options->no_fmask && is_resolve; + nir_def *samples_identical = NULL, *sample0[SI_MAX_COMPUTE_BLIT_LANE_SIZE] = {0}; + nir_if *if_identical = NULL; + + if (uses_samples_identical) { + samples_identical = nir_imm_true(&b); + + /* If we are resolving multiple pixels per lane, AND all results of "samples_identical". */ + foreach_pixel_in_lane(1, sample, x, y, z, i) { + nir_def *iden = nir_image_deref_samples_identical(&b, 1, deref_ssa(&b, img_src), + coord_src[i * src_samples], + .image_dim = GLSL_SAMPLER_DIM_MS); + samples_identical = nir_iand(&b, samples_identical, iden); + } + + /* If all samples are identical, load only sample 0. */ + if_identical = nir_push_if(&b, samples_identical); + foreach_pixel_in_lane(1, sample, x, y, z, i) { + sample0[i] = nir_image_deref_load(&b, key->last_src_channel + 1, bit_size, + deref_ssa(&b, img_src), coord_src[i * src_samples], + nir_channel(&b, coord_src[i * src_samples], + num_src_coords - 1), zero_lod, + .image_dim = img_src->type->sampler_dimensionality, + .image_array = img_src->type->sampler_array); + } + nir_push_else(&b, if_identical); + } + + /* Load src pixels, one per sample. */ + foreach_pixel_in_lane(src_samples, sample, x, y, z, i) { + color[i] = nir_image_deref_load(&b, key->last_src_channel + 1, bit_size, + deref_ssa(&b, img_src), coord_src[i], + nir_channel(&b, coord_src[i], num_src_coords - 1), zero_lod, + .image_dim = img_src->type->sampler_dimensionality, + .image_array = img_src->type->sampler_array); + } + + /* Resolve MSAA if necessary. */ + if (is_resolve) { + /* We don't want the averaging of samples to be interleaved with image loads. */ + ac_optimization_barrier_vgpr_array(options->info, &b, color, lane_size * src_samples, + key->last_src_channel + 1); + + /* This reduces the "color" array from "src_samples * lane_size" elements to only + * "lane_size" elements. + */ + foreach_pixel_in_lane(1, sample, x, y, z, i) { + color[i] = ac_average_samples(&b, &color[i * src_samples], src_samples); + } + src_samples = 1; + } + + if (uses_samples_identical) { + nir_pop_if(&b, if_identical); + foreach_pixel_in_lane(1, sample, x, y, z, i) { + color[i] = nir_if_phi(&b, sample0[i], color[i]); + } + } + } + + /* We need to load the descriptor here, otherwise the load would be after optimization + * barriers waiting for image loads, i.e. after s_waitcnt vmcnt(0). + */ + nir_def *img_dst_desc = nir_image_deref_descriptor_amd(&b, 8, 32, deref_ssa(&b, img_dst)); + if (lane_size > 1 && !b.shader->info.use_aco_amd) + img_dst_desc = nir_optimization_barrier_sgpr_amd(&b, 32, img_dst_desc); + + /* Apply the blit output modifiers, once per sample. */ + foreach_pixel_in_lane(src_samples, sample, x, y, z, i) { + color[i] = apply_blit_output_modifiers(&b, color[i], key); + } + + /* Initialize dst coordinates, one vector per pixel. */ + foreach_pixel_in_lane(dst_samples, sample, x, y, z, i) { + coord_dst[i] = nir_iadd(&b, base_coord_dst, + nir_imm_ivec4_intN(&b, x, y, z, 0, coord_bit_size)); + if (key->dst_is_1d) + coord_dst[i] = nir_swizzle(&b, coord_dst[i], swizzle_xz, 4); + if (key->dst_is_msaa) { + coord_dst[i] = nir_vector_insert_imm(&b, coord_dst[i], + nir_imm_intN_t(&b, sample, coord_bit_size), + num_dst_coords - 1); + } + } + + /* We don't want the computation of dst coordinates to be interleaved with stores. */ + if (lane_size > 1 || dst_samples > 1) { + ac_optimization_barrier_vgpr_array(options->info, &b, coord_dst, lane_size * dst_samples, + num_dst_coords); + } + + /* We don't want the application of blit output modifiers to be interleaved with stores. */ + if (!key->is_clear && (lane_size > 1 || MIN2(src_samples, dst_samples) > 1)) { + ac_optimization_barrier_vgpr_array(options->info, &b, color, lane_size * src_samples, + key->last_dst_channel + 1); + } + + /* Store the pixels, one per sample. */ + foreach_pixel_in_lane(dst_samples, sample, x, y, z, i) { + nir_bindless_image_store(&b, img_dst_desc, coord_dst[i], + nir_channel(&b, coord_dst[i], num_dst_coords - 1), + src_samples > 1 ? color[i] : color[i / dst_samples], zero_lod, + .image_dim = glsl_get_sampler_dim(img_type[1]), + .image_array = glsl_sampler_type_is_array(img_type[1])); + } + + if (key->has_start_xyz) + nir_pop_if(&b, if_positive); + + return b.shader; +} diff --git a/src/amd/common/meson.build b/src/amd/common/meson.build index 96ed5b3d24a..8c0d552eb45 100644 --- a/src/amd/common/meson.build +++ b/src/amd/common/meson.build @@ -114,6 +114,7 @@ amd_common_files = files( 'ac_nir_lower_ngg.c', 'ac_nir_lower_ps.c', 'ac_nir_meta.h', + 'ac_nir_meta_cs_blit.c', 'ac_nir_meta_ps_resolve.c', 'amd_family.c', 'ac_parse_ib.c', diff --git a/src/gallium/drivers/radeonsi/si_compute_blit.c b/src/gallium/drivers/radeonsi/si_compute_blit.c index 817f9c95de9..89de0b223c1 100644 --- a/src/gallium/drivers/radeonsi/si_compute_blit.c +++ b/src/gallium/drivers/radeonsi/si_compute_blit.c @@ -10,6 +10,7 @@ #include "util/u_helpers.h" #include "util/hash_table.h" #include "util/u_pack_color.h" +#include "ac_nir_meta.h" /* Determine the cache policy. */ static enum si_cache_policy get_cache_policy(struct si_context *sctx, enum si_coherency coher, @@ -1165,7 +1166,7 @@ bool si_compute_blit(struct si_context *sctx, const struct pipe_blit_info *info, } /* Check that the lane size fits into the shader key. */ - static const union si_compute_blit_shader_key max_lane_size = { + static const union ac_cs_blit_key max_lane_size = { .log_lane_width = ~0, .log_lane_height = ~0, .log_lane_depth = ~0, @@ -1487,80 +1488,88 @@ bool si_compute_blit(struct si_context *sctx, const struct pipe_blit_info *info, unsigned wg_dim = set_work_size(&grid, block_x, block_y, block_z, width, height, depth); /* Get the shader key. */ - union si_compute_blit_shader_key options; - options.key = 0; + union ac_cs_blit_key key; + key.key = 0; /* Only ACO can form VMEM clauses for image stores, which is a requirement for performance. */ - options.use_aco = true; - options.is_clear = is_clear; - options.wg_dim = wg_dim; - options.has_start_xyz = start_x || start_y || start_z; - options.log_lane_width = util_logbase2(lane_size.x); - options.log_lane_height = util_logbase2(lane_size.y); - options.log_lane_depth = util_logbase2(lane_size.z); - options.dst_is_1d = info->dst.resource->target == PIPE_TEXTURE_1D || + key.use_aco = true; + key.is_clear = is_clear; + key.wg_dim = wg_dim; + key.has_start_xyz = start_x || start_y || start_z; + key.log_lane_width = util_logbase2(lane_size.x); + key.log_lane_height = util_logbase2(lane_size.y); + key.log_lane_depth = util_logbase2(lane_size.z); + key.dst_is_1d = info->dst.resource->target == PIPE_TEXTURE_1D || info->dst.resource->target == PIPE_TEXTURE_1D_ARRAY; - options.dst_is_msaa = dst_samples > 1; - options.dst_has_z = info->dst.resource->target == PIPE_TEXTURE_3D || + key.dst_is_msaa = dst_samples > 1; + key.dst_has_z = info->dst.resource->target == PIPE_TEXTURE_3D || info->dst.resource->target == PIPE_TEXTURE_CUBE || info->dst.resource->target == PIPE_TEXTURE_1D_ARRAY || info->dst.resource->target == PIPE_TEXTURE_2D_ARRAY || info->dst.resource->target == PIPE_TEXTURE_CUBE_ARRAY; - options.last_dst_channel = util_format_get_last_component(info->dst.format); + key.last_dst_channel = util_format_get_last_component(info->dst.format); /* ACO doesn't support D16 on GFX8 */ - bool has_d16 = sctx->gfx_level >= (options.use_aco || sctx->screen->use_aco ? GFX9 : GFX8); + bool has_d16 = sctx->gfx_level >= (key.use_aco || sctx->screen->use_aco ? GFX9 : GFX8); if (is_clear) { assert(dst_samples <= 8); - options.log_samples = util_logbase2(dst_samples); - options.a16 = sctx->gfx_level >= GFX9 && util_is_box_sint16(&info->dst.box); - options.d16 = has_d16 && + key.log_samples = util_logbase2(dst_samples); + key.a16 = sctx->gfx_level >= GFX9 && util_is_box_sint16(&info->dst.box); + key.d16 = has_d16 && max_dst_chan_size <= (util_format_is_float(info->dst.format) || util_format_is_pure_integer(info->dst.format) ? 16 : 11); } else { - options.src_is_1d = info->src.resource->target == PIPE_TEXTURE_1D || + key.src_is_1d = info->src.resource->target == PIPE_TEXTURE_1D || info->src.resource->target == PIPE_TEXTURE_1D_ARRAY; - options.src_is_msaa = src_samples > 1; - options.src_has_z = info->src.resource->target == PIPE_TEXTURE_3D || + key.src_is_msaa = src_samples > 1; + key.src_has_z = info->src.resource->target == PIPE_TEXTURE_3D || info->src.resource->target == PIPE_TEXTURE_CUBE || info->src.resource->target == PIPE_TEXTURE_1D_ARRAY || info->src.resource->target == PIPE_TEXTURE_2D_ARRAY || info->src.resource->target == PIPE_TEXTURE_CUBE_ARRAY; /* Resolving integer formats only copies sample 0. log_samples is then unused. */ - options.sample0_only = sample0_only; + key.sample0_only = sample0_only; unsigned num_samples = MAX2(src_samples, dst_samples); assert(num_samples <= 8); - options.log_samples = sample0_only ? 0 : util_logbase2(num_samples); - options.x_clamp_to_edge = si_should_blit_clamp_to_edge(info, BITFIELD_BIT(0)); - options.y_clamp_to_edge = si_should_blit_clamp_to_edge(info, BITFIELD_BIT(1)); - options.flip_x = info->src.box.width < 0; - options.flip_y = info->src.box.height < 0; - options.sint_to_uint = util_format_is_pure_sint(info->src.format) && + key.log_samples = sample0_only ? 0 : util_logbase2(num_samples); + key.x_clamp_to_edge = si_should_blit_clamp_to_edge(info, BITFIELD_BIT(0)); + key.y_clamp_to_edge = si_should_blit_clamp_to_edge(info, BITFIELD_BIT(1)); + key.flip_x = info->src.box.width < 0; + key.flip_y = info->src.box.height < 0; + key.sint_to_uint = util_format_is_pure_sint(info->src.format) && util_format_is_pure_uint(info->dst.format); - options.uint_to_sint = util_format_is_pure_uint(info->src.format) && + key.uint_to_sint = util_format_is_pure_uint(info->src.format) && util_format_is_pure_sint(info->dst.format); - options.dst_is_srgb = util_format_is_srgb(info->dst.format); - options.last_src_channel = MIN2(util_format_get_last_component(info->src.format), - options.last_dst_channel); - options.use_integer_one = util_format_is_pure_integer(info->dst.format) && - options.last_src_channel < options.last_dst_channel && - options.last_dst_channel == 3; - options.a16 = sctx->gfx_level >= GFX9 && util_is_box_sint16(&info->dst.box) && + key.dst_is_srgb = util_format_is_srgb(info->dst.format); + key.last_src_channel = MIN2(util_format_get_last_component(info->src.format), + key.last_dst_channel); + key.use_integer_one = util_format_is_pure_integer(info->dst.format) && + key.last_src_channel < key.last_dst_channel && + key.last_dst_channel == 3; + key.a16 = sctx->gfx_level >= GFX9 && util_is_box_sint16(&info->dst.box) && util_is_box_sint16(&info->src.box); - options.d16 = has_d16 && + key.d16 = has_d16 && /* Blitting FP16 using D16 has precision issues. Resolving has precision * issues all the way down to R11G11B10_FLOAT. */ MIN2(max_dst_chan_size, max_src_chan_size) <= (util_format_is_pure_integer(info->dst.format) ? - (options.sint_to_uint || options.uint_to_sint ? 10 : 16) : + (key.sint_to_uint || key.uint_to_sint ? 10 : 16) : (is_resolve ? 10 : 11)); } - void *shader = _mesa_hash_table_u64_search(sctx->cs_blit_shaders, options.key); + void *shader = _mesa_hash_table_u64_search(sctx->cs_blit_shaders, key.key); if (!shader) { - shader = si_create_blit_cs(sctx, &options); - _mesa_hash_table_u64_insert(sctx->cs_blit_shaders, options.key, shader); + struct ac_cs_blit_options options = { + .nir_options = sctx->b.screen->get_compiler_options(sctx->b.screen, PIPE_SHADER_IR_NIR, + PIPE_SHADER_COMPUTE), + .info = &sctx->screen->info, + .use_aco = sctx->screen->use_aco, + .no_fmask = sctx->screen->debug_flags & DBG(NO_FMASK), + .print_key = si_can_dump_shader(sctx->screen, MESA_SHADER_COMPUTE, SI_DUMP_SHADER_KEY), + }; + shader = si_create_shader_state(sctx, ac_create_blit_cs(&options, &key)); + _mesa_hash_table_u64_insert(sctx->cs_blit_shaders, key.key, shader); } sctx->cs_user_data[0] = (info->src.box.x & 0xffff) | ((info->dst.box.x & 0xffff) << 16); @@ -1578,7 +1587,7 @@ bool si_compute_blit(struct si_context *sctx, const struct pipe_blit_info *info, final_value.f[i] = util_format_linear_to_srgb_float(final_value.f[i]); } - if (options.d16) { + if (key.d16) { enum pipe_format data_format; if (util_format_is_pure_uint(info->dst.format)) diff --git a/src/gallium/drivers/radeonsi/si_pipe.h b/src/gallium/drivers/radeonsi/si_pipe.h index 9ff7cedc006..6b258943224 100644 --- a/src/gallium/drivers/radeonsi/si_pipe.h +++ b/src/gallium/drivers/radeonsi/si_pipe.h @@ -1639,49 +1639,6 @@ void *si_create_dcc_retile_cs(struct si_context *sctx, struct radeon_surf *surf) void *gfx9_create_clear_dcc_msaa_cs(struct si_context *sctx, struct si_texture *tex); void *si_create_passthrough_tcs(struct si_context *sctx); void *si_clear_image_dcc_single_shader(struct si_context *sctx, bool is_msaa, unsigned wg_dim); - -#define SI_MAX_COMPUTE_BLIT_LANE_SIZE 16 -#define SI_MAX_COMPUTE_BLIT_SAMPLES 8 - -union si_compute_blit_shader_key { - struct { - bool use_aco:1; - /* Workgroup settings. */ - uint8_t wg_dim:2; /* 1, 2, or 3 */ - bool has_start_xyz:1; - /* The size of a block of pixels that a single thread will process. */ - uint8_t log_lane_width:3; - uint8_t log_lane_height:2; - uint8_t log_lane_depth:2; - /* Declaration modifiers. */ - bool is_clear:1; - bool src_is_1d:1; - bool dst_is_1d:1; - bool src_is_msaa:1; - bool dst_is_msaa:1; - bool src_has_z:1; - bool dst_has_z:1; - bool a16:1; - bool d16:1; - uint8_t log_samples:2; - bool sample0_only:1; /* src is MSAA, dst is not MSAA, log2_samples is ignored */ - /* Source coordinate modifiers. */ - bool x_clamp_to_edge:1; - bool y_clamp_to_edge:1; - bool flip_x:1; - bool flip_y:1; - /* Output modifiers. */ - bool sint_to_uint:1; - bool uint_to_sint:1; - bool dst_is_srgb:1; - bool use_integer_one:1; - uint8_t last_src_channel:2; /* this shouldn't be greater than last_dst_channel */ - uint8_t last_dst_channel:2; - }; - uint64_t key; -}; - -void *si_create_blit_cs(struct si_context *sctx, const union si_compute_blit_shader_key *options); void *si_get_blitter_vs(struct si_context *sctx, enum blitter_attrib_type type, unsigned num_layers); void *si_create_dma_compute_shader(struct si_context *sctx, unsigned num_dwords_per_thread, diff --git a/src/gallium/drivers/radeonsi/si_shaderlib_nir.c b/src/gallium/drivers/radeonsi/si_shaderlib_nir.c index 820bf569f8e..52f0a81119d 100644 --- a/src/gallium/drivers/radeonsi/si_shaderlib_nir.c +++ b/src/gallium/drivers/radeonsi/si_shaderlib_nir.c @@ -19,24 +19,6 @@ void *si_create_shader_state(struct si_context *sctx, nir_shader *nir) return pipe_shader_from_nir(&sctx->b, nir); } -static nir_def *get_global_ids(nir_builder *b, unsigned num_components, unsigned bit_size) -{ - unsigned mask = BITFIELD_MASK(num_components); - - nir_def *local_ids = nir_channels(b, nir_load_local_invocation_id(b), mask); - nir_def *block_ids = nir_channels(b, nir_load_workgroup_id(b), mask); - nir_def *block_size = nir_channels(b, nir_load_workgroup_size(b), mask); - - assert(bit_size == 32 || bit_size == 16); - if (bit_size == 16) { - local_ids = nir_i2iN(b, local_ids, bit_size); - block_ids = nir_i2iN(b, block_ids, bit_size); - block_size = nir_i2iN(b, block_size, bit_size); - } - - return nir_iadd(b, nir_imul(b, block_ids, block_size), local_ids); -} - /* unpack_2x16(src, x, y): x = src & 0xffff; y = src >> 16; */ static void unpack_2x16(nir_builder *b, nir_def *src, nir_def **x, nir_def **y) { @@ -44,26 +26,6 @@ static void unpack_2x16(nir_builder *b, nir_def *src, nir_def **x, nir_def **y) *y = nir_ushr_imm(b, src, 16); } -/* unpack_2x16_signed(src, x, y): x = (int32_t)((uint16_t)src); y = src >> 16; */ -static void unpack_2x16_signed(nir_builder *b, unsigned bit_size, nir_def *src, nir_def **x, - nir_def **y) -{ - assert(bit_size == 32 || bit_size == 16); - *x = nir_unpack_32_2x16_split_x(b, src); - *y = nir_unpack_32_2x16_split_y(b, src); - - if (bit_size == 32) { - *x = nir_i2i32(b, *x); - *y = nir_i2i32(b, *y); - } -} - -static nir_def * -deref_ssa(nir_builder *b, nir_variable *var) -{ - return &nir_build_deref_var(b, var)->def; -} - void *si_create_dcc_retile_cs(struct si_context *sctx, struct radeon_surf *surf) { nir_builder b = nir_builder_init_simple_shader(MESA_SHADER_COMPUTE, sctx->screen->nir_options, @@ -85,7 +47,7 @@ void *si_create_dcc_retile_cs(struct si_context *sctx, struct radeon_surf *surf) unpack_2x16(&b, nir_channel(&b, user_sgprs, 2), &dst_dcc_pitch, &dst_dcc_height); /* Get the 2D coordinates. */ - nir_def *coord = get_global_ids(&b, 2, 32); + nir_def *coord = ac_get_global_ids(&b, 2, 32); nir_def *zero = nir_imm_int(&b, 0); /* Multiply the coordinates by the DCC block size (they are DCC block coordinates). */ @@ -128,7 +90,7 @@ void *gfx9_create_clear_dcc_msaa_cs(struct si_context *sctx, struct si_texture * clear_value = nir_u2u16(&b, clear_value); /* Get the 2D coordinates. */ - nir_def *coord = get_global_ids(&b, 3, 32); + nir_def *coord = ac_get_global_ids(&b, 3, 32); nir_def *zero = nir_imm_int(&b, 0); /* Multiply the coordinates by the DCC block size (they are DCC block coordinates). */ @@ -166,7 +128,7 @@ void *si_create_clear_buffer_rmw_cs(struct si_context *sctx) b.shader->info.num_ssbos = 1; /* address = blockID * 64 + threadID; */ - nir_def *address = get_global_ids(&b, 1, 32); + nir_def *address = ac_get_global_ids(&b, 1, 32); /* address = address * 16; (byte offset, loading one vec4 per thread) */ address = nir_ishl_imm(&b, address, 4); @@ -206,416 +168,6 @@ void *si_create_passthrough_tcs(struct si_context *sctx) return si_create_shader_state(sctx, tcs); } -static nir_def *convert_linear_to_srgb(nir_builder *b, nir_def *input) -{ - /* There are small precision differences compared to CB, so the gfx blit will return slightly - * different results. - */ - for (unsigned i = 0; i < MIN2(3, input->num_components); i++) { - input = nir_vector_insert_imm(b, input, - nir_format_linear_to_srgb(b, nir_channel(b, input, i)), i); - } - - return input; -} - -static nir_def *apply_blit_output_modifiers(nir_builder *b, nir_def *color, - const union si_compute_blit_shader_key *options) -{ - unsigned bit_size = color->bit_size; - nir_def *zero = nir_imm_intN_t(b, 0, bit_size); - - if (options->sint_to_uint) - color = nir_imax(b, color, zero); - - if (options->uint_to_sint) { - color = nir_umin(b, color, - nir_imm_intN_t(b, bit_size == 16 ? INT16_MAX : INT32_MAX, - bit_size)); - } - - if (options->dst_is_srgb) - color = convert_linear_to_srgb(b, color); - - nir_def *one = options->use_integer_one ? nir_imm_intN_t(b, 1, bit_size) : - nir_imm_floatN_t(b, 1, bit_size); - - if (options->is_clear) { - if (options->last_dst_channel < 3) - color = nir_trim_vector(b, color, options->last_dst_channel + 1); - } else { - assert(options->last_src_channel <= options->last_dst_channel); - assert(color->num_components == options->last_src_channel + 1); - - /* Set channels not present in src to 0 or 1. */ - if (options->last_src_channel < options->last_dst_channel) { - color = nir_pad_vector(b, color, options->last_dst_channel + 1); - - for (unsigned chan = options->last_src_channel + 1; chan <= options->last_dst_channel; chan++) - color = nir_vector_insert_imm(b, color, chan == 3 ? one : zero, chan); - } - - /* Discard channels not present in dst. The hardware fills unstored channels with 0. */ - if (options->last_dst_channel < options->last_src_channel) - color = nir_trim_vector(b, color, options->last_dst_channel + 1); - } - - /* Discard channels not present in dst. The hardware fills unstored channels with 0. */ - if (options->last_dst_channel < 3) - color = nir_trim_vector(b, color, options->last_dst_channel + 1); - - return color; -} - -/* The compute blit shader. - * - * Implementation details: - * - Out-of-bounds dst coordinates are not clamped at all. The hw drops - * out-of-bounds stores for us. - * - Out-of-bounds src coordinates are clamped by emulating CLAMP_TO_EDGE using - * the image_size NIR intrinsic. - * - X/Y flipping just does this in the shader: -threadIDs - 1, assuming the starting coordinates - * are 1 pixel after the bottom-right corner, e.g. x + width, matching the gallium behavior. - * - This list doesn't do it justice. - */ -void *si_create_blit_cs(struct si_context *sctx, const union si_compute_blit_shader_key *options) -{ - if (si_can_dump_shader(sctx->screen, MESA_SHADER_COMPUTE, SI_DUMP_SHADER_KEY)) { - fprintf(stderr, "Internal shader: compute_blit\n"); - fprintf(stderr, " options.use_aco = %u\n", options->use_aco); - fprintf(stderr, " options.wg_dim = %u\n", options->wg_dim); - fprintf(stderr, " options.has_start_xyz = %u\n", options->has_start_xyz); - fprintf(stderr, " options.log_lane_width = %u\n", options->log_lane_width); - fprintf(stderr, " options.log_lane_height = %u\n", options->log_lane_height); - fprintf(stderr, " options.log_lane_depth = %u\n", options->log_lane_depth); - fprintf(stderr, " options.is_clear = %u\n", options->is_clear); - fprintf(stderr, " options.src_is_1d = %u\n", options->src_is_1d); - fprintf(stderr, " options.dst_is_1d = %u\n", options->dst_is_1d); - fprintf(stderr, " options.src_is_msaa = %u\n", options->src_is_msaa); - fprintf(stderr, " options.dst_is_msaa = %u\n", options->dst_is_msaa); - fprintf(stderr, " options.src_has_z = %u\n", options->src_has_z); - fprintf(stderr, " options.dst_has_z = %u\n", options->dst_has_z); - fprintf(stderr, " options.a16 = %u\n", options->a16); - fprintf(stderr, " options.d16 = %u\n", options->d16); - fprintf(stderr, " options.log_samples = %u\n", options->log_samples); - fprintf(stderr, " options.sample0_only = %u\n", options->sample0_only); - fprintf(stderr, " options.x_clamp_to_edge = %u\n", options->x_clamp_to_edge); - fprintf(stderr, " options.y_clamp_to_edge = %u\n", options->y_clamp_to_edge); - fprintf(stderr, " options.flip_x = %u\n", options->flip_x); - fprintf(stderr, " options.flip_y = %u\n", options->flip_y); - fprintf(stderr, " options.sint_to_uint = %u\n", options->sint_to_uint); - fprintf(stderr, " options.uint_to_sint = %u\n", options->uint_to_sint); - fprintf(stderr, " options.dst_is_srgb = %u\n", options->dst_is_srgb); - fprintf(stderr, " options.use_integer_one = %u\n", options->use_integer_one); - fprintf(stderr, " options.last_src_channel = %u\n", options->last_src_channel); - fprintf(stderr, " options.last_dst_channel = %u\n", options->last_dst_channel); - fprintf(stderr, "\n"); - } - - const nir_shader_compiler_options *nir_options = - sctx->b.screen->get_compiler_options(sctx->b.screen, PIPE_SHADER_IR_NIR, PIPE_SHADER_COMPUTE); - - nir_builder b = nir_builder_init_simple_shader(MESA_SHADER_COMPUTE, nir_options, - "blit_non_scaled_cs"); - b.shader->info.use_aco_amd = sctx->screen->use_aco || - (options->use_aco && aco_is_gpu_supported(&sctx->screen->info)); - b.shader->info.num_images = options->is_clear ? 1 : 2; - unsigned image_dst_index = b.shader->info.num_images - 1; - if (!options->is_clear && options->src_is_msaa) - BITSET_SET(b.shader->info.msaa_images, 0); - if (options->dst_is_msaa) - BITSET_SET(b.shader->info.msaa_images, image_dst_index); - /* The workgroup size varies depending on the tiling layout and blit dimensions. */ - b.shader->info.workgroup_size_variable = true; - b.shader->info.cs.user_data_components_amd = - options->is_clear ? (options->d16 ? 6 : 8) : options->has_start_xyz ? 4 : 3; - - const struct glsl_type *img_type[2] = { - glsl_image_type(options->src_is_1d ? GLSL_SAMPLER_DIM_1D : - options->src_is_msaa ? GLSL_SAMPLER_DIM_MS : GLSL_SAMPLER_DIM_2D, - options->src_has_z, GLSL_TYPE_FLOAT), - glsl_image_type(options->dst_is_1d ? GLSL_SAMPLER_DIM_1D : - options->dst_is_msaa ? GLSL_SAMPLER_DIM_MS : GLSL_SAMPLER_DIM_2D, - options->dst_has_z, GLSL_TYPE_FLOAT), - }; - - nir_variable *img_src = NULL; - if (!options->is_clear) { - img_src = nir_variable_create(b.shader, nir_var_uniform, img_type[0], "img0"); - img_src->data.binding = 0; - } - - nir_variable *img_dst = nir_variable_create(b.shader, nir_var_uniform, img_type[1], "img1"); - img_dst->data.binding = image_dst_index; - - unsigned lane_width = 1 << options->log_lane_width; - unsigned lane_height = 1 << options->log_lane_height; - unsigned lane_depth = 1 << options->log_lane_depth; - unsigned lane_size = lane_width * lane_height * lane_depth; - assert(lane_size <= SI_MAX_COMPUTE_BLIT_LANE_SIZE); - - nir_def *zero_lod = nir_imm_intN_t(&b, 0, options->a16 ? 16 : 32); - - /* Instructions. */ - /* Let's work with 0-based src and dst coordinates (thread IDs) first. */ - unsigned coord_bit_size = options->a16 ? 16 : 32; - nir_def *dst_xyz = get_global_ids(&b, options->wg_dim, coord_bit_size); - dst_xyz = nir_pad_vector_imm_int(&b, dst_xyz, 0, 3); - - /* If the blit area is unaligned, we launched extra threads to make it aligned. - * Skip those threads here. - */ - nir_if *if_positive = NULL; - if (options->has_start_xyz) { - nir_def *start_xyz = nir_channel(&b, nir_load_user_data_amd(&b), 3); - start_xyz = nir_u2uN(&b, nir_unpack_32_4x8(&b, start_xyz), coord_bit_size); - start_xyz = nir_trim_vector(&b, start_xyz, 3); - - dst_xyz = nir_isub(&b, dst_xyz, start_xyz); - nir_def *is_positive_xyz = nir_ige_imm(&b, dst_xyz, 0); - nir_def *is_positive = nir_iand(&b, nir_channel(&b, is_positive_xyz, 0), - nir_iand(&b, nir_channel(&b, is_positive_xyz, 1), - nir_channel(&b, is_positive_xyz, 2))); - if_positive = nir_push_if(&b, is_positive); - } - - dst_xyz = nir_imul(&b, dst_xyz, nir_imm_ivec3_intN(&b, lane_width, lane_height, lane_depth, - coord_bit_size)); - nir_def *src_xyz = dst_xyz; - - /* Flip src coordinates. */ - for (unsigned i = 0; i < 2; i++) { - if (i ? options->flip_y : options->flip_x) { - /* A normal blit loads from (box.x + tid.x) where tid.x = 0..(width - 1). - * - * A flipped blit sets box.x = width, so we should make tid.x negative to load from - * (width - 1)..0. - * - * Therefore do: x = -x - 1, which becomes (width - 1) to 0 after we add box.x = width. - */ - nir_def *comp = nir_channel(&b, src_xyz, i); - comp = nir_iadd_imm(&b, nir_ineg(&b, comp), -(int)(i ? lane_height : lane_width)); - src_xyz = nir_vector_insert_imm(&b, src_xyz, comp, i); - } - } - - /* Add box.xyz. */ - nir_def *base_coord_src = NULL, *base_coord_dst = NULL; - unpack_2x16_signed(&b, coord_bit_size, nir_trim_vector(&b, nir_load_user_data_amd(&b), 3), - &base_coord_src, &base_coord_dst); - base_coord_dst = nir_iadd(&b, base_coord_dst, dst_xyz); - base_coord_src = nir_iadd(&b, base_coord_src, src_xyz); - - /* Coordinates must have 4 channels in NIR. */ - base_coord_src = nir_pad_vector(&b, base_coord_src, 4); - base_coord_dst = nir_pad_vector(&b, base_coord_dst, 4); - -/* Iterate over all pixels in the lane. num_samples is the only input. - * (sample, x, y, z) are generated coordinates, while "i" is the coordinates converted to - * an absolute index. - */ -#define foreach_pixel_in_lane(num_samples, sample, x, y, z, i) \ - for (unsigned z = 0; z < lane_depth; z++) \ - for (unsigned y = 0; y < lane_height; y++) \ - for (unsigned x = 0; x < lane_width; x++) \ - for (unsigned i = ((z * lane_height + y) * lane_width + x) * (num_samples), sample = 0; \ - sample < (num_samples); sample++, i++) \ - - /* Swizzle coordinates for 1D_ARRAY. */ - static const unsigned swizzle_xz[] = {0, 2, 0, 0}; - - /* Execute image loads and stores. */ - unsigned num_src_coords = (options->src_is_1d ? 1 : 2) + options->src_has_z + options->src_is_msaa; - unsigned num_dst_coords = (options->dst_is_1d ? 1 : 2) + options->dst_has_z + options->dst_is_msaa; - unsigned bit_size = options->d16 ? 16 : 32; - unsigned num_samples = 1 << options->log_samples; - unsigned src_samples = options->src_is_msaa && !options->sample0_only && - !options->is_clear ? num_samples : 1; - unsigned dst_samples = options->dst_is_msaa ? num_samples : 1; - nir_def *color[SI_MAX_COMPUTE_BLIT_LANE_SIZE * SI_MAX_COMPUTE_BLIT_SAMPLES] = {0}; - nir_def *coord_dst[SI_MAX_COMPUTE_BLIT_LANE_SIZE * SI_MAX_COMPUTE_BLIT_SAMPLES] = {0}; - nir_def *src_resinfo = NULL; - - if (options->is_clear) { - /* The clear color starts at component 4 of user data. */ - color[0] = nir_channels(&b, nir_load_user_data_amd(&b), - BITFIELD_RANGE(4, options->d16 ? 2 : 4)); - if (options->d16) - color[0] = nir_unpack_64_4x16(&b, nir_pack_64_2x32(&b, color[0])); - - foreach_pixel_in_lane(1, sample, x, y, z, i) { - color[i] = color[0]; - } - } else { - nir_def *coord_src[SI_MAX_COMPUTE_BLIT_LANE_SIZE * SI_MAX_COMPUTE_BLIT_SAMPLES] = {0}; - - /* Initialize src coordinates, one vector per pixel. */ - foreach_pixel_in_lane(src_samples, sample, x, y, z, i) { - unsigned tmp_x = x; - unsigned tmp_y = y; - - /* Change the order from 0..N to N..0 for flipped blits. */ - if (options->flip_x) - tmp_x = lane_width - 1 - x; - if (options->flip_y) - tmp_y = lane_height - 1 - y; - - coord_src[i] = nir_iadd(&b, base_coord_src, - nir_imm_ivec4_intN(&b, tmp_x, tmp_y, z, 0, coord_bit_size)); - if (options->src_is_1d) - coord_src[i] = nir_swizzle(&b, coord_src[i], swizzle_xz, 4); - if (options->src_is_msaa) { - coord_src[i] = nir_vector_insert_imm(&b, coord_src[i], - nir_imm_intN_t(&b, sample, coord_bit_size), - num_src_coords - 1); - } - - /* Clamp to edge for src, only X and Y because Z can't be out of bounds. */ - for (unsigned chan = 0; chan < 2; chan++) { - if (chan ? options->y_clamp_to_edge : options->x_clamp_to_edge) { - assert(!options->src_is_1d || chan == 0); - - if (!src_resinfo) { - /* Always use the 32-bit return type because the image dimensions can be - * > INT16_MAX even if the blit box fits within sint16. - */ - src_resinfo = nir_image_deref_size(&b, 4, 32, deref_ssa(&b, img_src), - zero_lod); - if (coord_bit_size == 16) { - src_resinfo = nir_umin_imm(&b, src_resinfo, INT16_MAX); - src_resinfo = nir_i2i16(&b, src_resinfo); - } - } - - nir_def *tmp = nir_channel(&b, coord_src[i], chan); - tmp = nir_imax_imm(&b, tmp, 0); - tmp = nir_imin(&b, tmp, nir_iadd_imm(&b, nir_channel(&b, src_resinfo, chan), -1)); - coord_src[i] = nir_vector_insert_imm(&b, coord_src[i], tmp, chan); - } - } - } - - /* We don't want the computation of src coordinates to be interleaved with loads. */ - if (lane_size > 1 || src_samples > 1) { - ac_optimization_barrier_vgpr_array(&sctx->screen->info, &b, coord_src, - lane_size * src_samples, num_src_coords); - } - - /* Use "samples_identical" for MSAA resolving if it's supported. */ - bool is_resolve = src_samples > 1 && dst_samples == 1; - bool uses_samples_identical = sctx->gfx_level < GFX11 && - !(sctx->screen->debug_flags & DBG(NO_FMASK)) && is_resolve; - nir_def *samples_identical = NULL, *sample0[SI_MAX_COMPUTE_BLIT_LANE_SIZE] = {0}; - nir_if *if_identical = NULL; - - if (uses_samples_identical) { - samples_identical = nir_imm_true(&b); - - /* If we are resolving multiple pixels per lane, AND all results of "samples_identical". */ - foreach_pixel_in_lane(1, sample, x, y, z, i) { - nir_def *iden = nir_image_deref_samples_identical(&b, 1, deref_ssa(&b, img_src), - coord_src[i * src_samples], - .image_dim = GLSL_SAMPLER_DIM_MS); - samples_identical = nir_iand(&b, samples_identical, iden); - } - - /* If all samples are identical, load only sample 0. */ - if_identical = nir_push_if(&b, samples_identical); - foreach_pixel_in_lane(1, sample, x, y, z, i) { - sample0[i] = nir_image_deref_load(&b, options->last_src_channel + 1, bit_size, - deref_ssa(&b, img_src), coord_src[i * src_samples], - nir_channel(&b, coord_src[i * src_samples], - num_src_coords - 1), zero_lod, - .image_dim = img_src->type->sampler_dimensionality, - .image_array = img_src->type->sampler_array); - } - nir_push_else(&b, if_identical); - } - - /* Load src pixels, one per sample. */ - foreach_pixel_in_lane(src_samples, sample, x, y, z, i) { - color[i] = nir_image_deref_load(&b, options->last_src_channel + 1, bit_size, - deref_ssa(&b, img_src), coord_src[i], - nir_channel(&b, coord_src[i], num_src_coords - 1), zero_lod, - .image_dim = img_src->type->sampler_dimensionality, - .image_array = img_src->type->sampler_array); - } - - /* Resolve MSAA if necessary. */ - if (is_resolve) { - /* We don't want the averaging of samples to be interleaved with image loads. */ - ac_optimization_barrier_vgpr_array(&sctx->screen->info, &b, color, lane_size * src_samples, - options->last_src_channel + 1); - - /* This reduces the "color" array from "src_samples * lane_size" elements to only - * "lane_size" elements. - */ - foreach_pixel_in_lane(1, sample, x, y, z, i) { - color[i] = ac_average_samples(&b, &color[i * src_samples], src_samples); - } - src_samples = 1; - } - - if (uses_samples_identical) { - nir_pop_if(&b, if_identical); - foreach_pixel_in_lane(1, sample, x, y, z, i) { - color[i] = nir_if_phi(&b, sample0[i], color[i]); - } - } - } - - /* We need to load the descriptor here, otherwise the load would be after optimization - * barriers waiting for image loads, i.e. after s_waitcnt vmcnt(0). - */ - nir_def *img_dst_desc = nir_image_deref_descriptor_amd(&b, 8, 32, deref_ssa(&b, img_dst)); - if (lane_size > 1 && !b.shader->info.use_aco_amd) - img_dst_desc = nir_optimization_barrier_sgpr_amd(&b, 32, img_dst_desc); - - /* Apply the blit output modifiers, once per sample. */ - foreach_pixel_in_lane(src_samples, sample, x, y, z, i) { - color[i] = apply_blit_output_modifiers(&b, color[i], options); - } - - /* Initialize dst coordinates, one vector per pixel. */ - foreach_pixel_in_lane(dst_samples, sample, x, y, z, i) { - coord_dst[i] = nir_iadd(&b, base_coord_dst, - nir_imm_ivec4_intN(&b, x, y, z, 0, coord_bit_size)); - if (options->dst_is_1d) - coord_dst[i] = nir_swizzle(&b, coord_dst[i], swizzle_xz, 4); - if (options->dst_is_msaa) { - coord_dst[i] = nir_vector_insert_imm(&b, coord_dst[i], - nir_imm_intN_t(&b, sample, coord_bit_size), - num_dst_coords - 1); - } - } - - /* We don't want the computation of dst coordinates to be interleaved with stores. */ - if (lane_size > 1 || dst_samples > 1) { - ac_optimization_barrier_vgpr_array(&sctx->screen->info, &b, coord_dst, lane_size * dst_samples, - num_dst_coords); - } - - /* We don't want the application of blit output modifiers to be interleaved with stores. */ - if (!options->is_clear && (lane_size > 1 || MIN2(src_samples, dst_samples) > 1)) { - ac_optimization_barrier_vgpr_array(&sctx->screen->info, &b, color, lane_size * src_samples, - options->last_dst_channel + 1); - } - - /* Store the pixels, one per sample. */ - foreach_pixel_in_lane(dst_samples, sample, x, y, z, i) { - nir_bindless_image_store(&b, img_dst_desc, coord_dst[i], - nir_channel(&b, coord_dst[i], num_dst_coords - 1), - src_samples > 1 ? color[i] : color[i / dst_samples], zero_lod, - .image_dim = glsl_get_sampler_dim(img_type[1]), - .image_array = glsl_sampler_type_is_array(img_type[1])); - } - - if (options->has_start_xyz) - nir_pop_if(&b, if_positive); - - return si_create_shader_state(sctx, b.shader); -} - /* Store the clear color at the beginning of every 256B block. This is required when we clear DCC * to GFX11_DCC_CLEAR_SINGLE. */ @@ -635,7 +187,7 @@ void *si_clear_image_dcc_single_shader(struct si_context *sctx, bool is_msaa, un nir_variable *output_img = nir_variable_create(b.shader, nir_var_image, img_type, "out_img"); output_img->data.binding = 0; - nir_def *global_id = nir_pad_vector_imm_int(&b, get_global_ids(&b, wg_dim, 32), 0, 3); + nir_def *global_id = nir_pad_vector_imm_int(&b, ac_get_global_ids(&b, wg_dim, 32), 0, 3); nir_def *clear_color = nir_trim_vector(&b, nir_load_user_data_amd(&b), 4); nir_def *dcc_block_width, *dcc_block_height; @@ -664,7 +216,7 @@ void *si_create_ubyte_to_ushort_compute_shader(struct si_context *sctx) b.shader->info.workgroup_size[2] = 1; b.shader->info.num_ssbos = 2; - nir_def *load_address = get_global_ids(&b, 1, 32); + nir_def *load_address = ac_get_global_ids(&b, 1, 32); nir_def *store_address = nir_imul_imm(&b, load_address, 2); nir_def *ubyte_value = nir_load_ssbo(&b, 1, 8, nir_imm_int(&b, 1), @@ -689,7 +241,7 @@ void *si_create_dma_compute_shader(struct si_context *sctx, unsigned num_dwords_ b.shader->info.num_ssbos = is_clear ? 1 : 2; b.shader->info.cs.user_data_components_amd = is_clear ? num_dwords_per_thread : 0; - nir_def *thread_id = get_global_ids(&b, 1, 32); + nir_def *thread_id = ac_get_global_ids(&b, 1, 32); /* Convert the global thread ID into bytes. */ nir_def *offset = nir_imul_imm(&b, thread_id, 4 * num_dwords_per_thread); nir_def *value; @@ -736,7 +288,7 @@ void *si_create_fmask_expand_cs(struct si_context *sctx, unsigned num_samples, b } nir_def *zero = nir_imm_int(&b, 0); - nir_def *address = get_global_ids(&b, 2, 32); + nir_def *address = ac_get_global_ids(&b, 2, 32); nir_def *sample[8], *addresses[8]; assert(num_samples <= ARRAY_SIZE(sample));