diff --git a/src/gallium/drivers/radeonsi/meson.build b/src/gallium/drivers/radeonsi/meson.build
index 2d37d56a7c0..c2b44fc9dc8 100644
--- a/src/gallium/drivers/radeonsi/meson.build
+++ b/src/gallium/drivers/radeonsi/meson.build
@@ -47,6 +47,7 @@ files_libradeonsi = files(
   'si_query.h',
   'si_nir_lower_abi.c',
   'si_nir_lower_resource.c',
+  'si_nir_lower_vs_inputs.c',
   'si_nir_optim.c',
   'si_sdma_copy_image.c',
   'si_shader.c',
diff --git a/src/gallium/drivers/radeonsi/si_nir_lower_vs_inputs.c b/src/gallium/drivers/radeonsi/si_nir_lower_vs_inputs.c
new file mode 100644
index 00000000000..ce7f86249bf
--- /dev/null
+++ b/src/gallium/drivers/radeonsi/si_nir_lower_vs_inputs.c
@@ -0,0 +1,643 @@
+/*
+ * Copyright 2023 Advanced Micro Devices, Inc.
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * on the rights to use, copy, modify, merge, publish, distribute, sub
+ * license, and/or sell copies of the Software, and to permit persons to whom
+ * the Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
+ * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
+ * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
+ * USE OR OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+#include "nir_builder.h"
+
+#include "ac_nir.h"
+#include "si_shader_internal.h"
+#include "si_state.h"
+#include "si_pipe.h"
+
+struct lower_vs_inputs_state {
+   struct si_shader *shader;
+   struct si_shader_args *args;
+
+   nir_ssa_def *instance_divisor_constbuf;
+   nir_ssa_def *vertex_index[16];
+};
+
+/* See fast_idiv_by_const.h. */
+/* If num != UINT_MAX, this more efficient version can be used. */
+/* Set: increment = util_fast_udiv_info::increment; */
+static nir_ssa_def *
+fast_udiv_nuw(nir_builder *b, nir_ssa_def *num, nir_ssa_def *divisor)
+{
+   nir_ssa_def *multiplier = nir_channel(b, divisor, 0);
+   nir_ssa_def *pre_shift = nir_channel(b, divisor, 1);
+   nir_ssa_def *post_shift = nir_channel(b, divisor, 2);
+   nir_ssa_def *increment = nir_channel(b, divisor, 3);
+
+   num = nir_ushr(b, num, pre_shift);
+   num = nir_iadd_nuw(b, num, increment);
+   num = nir_imul(b, nir_u2u64(b, num), nir_u2u64(b, multiplier));
+   num = nir_unpack_64_2x32_split_y(b, num);
+   return nir_ushr(b, num, post_shift);
+}
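+
+/* For reference, a scalar sketch of the same computation, using the
+ * (multiplier, pre_shift, post_shift, increment) tuple that the state
+ * tracker packs into the divisor constant buffer (field names as in
+ * util/fast_idiv_by_const.h); illustrative only, not part of the pass:
+ *
+ *    static uint32_t fast_udiv_nuw_ref(uint32_t num,
+ *                                      struct util_fast_udiv_info d)
+ *    {
+ *       num >>= d.pre_shift;
+ *       num += d.increment;   // must not wrap (the NUW assumption)
+ *       num = (uint32_t)(((uint64_t)num * d.multiplier) >> 32);
+ *       return num >> d.post_shift;
+ *    }
+ */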
+
+static nir_ssa_def *
+get_vertex_index_for_mono_shader(nir_builder *b, int input_index,
+                                 struct lower_vs_inputs_state *s)
+{
+   const union si_shader_key *key = &s->shader->key;
+
+   bool divisor_is_one =
+      key->ge.part.vs.prolog.instance_divisor_is_one & (1u << input_index);
+   bool divisor_is_fetched =
+      key->ge.part.vs.prolog.instance_divisor_is_fetched & (1u << input_index);
+
+   if (divisor_is_one || divisor_is_fetched) {
+      nir_ssa_def *instance_id = nir_load_instance_id(b);
+
+      /* This is used to determine the VS VGPR count in si_get_vs_vgpr_comp_cnt(). */
+      s->shader->info.uses_instanceid = true;
+
+      nir_ssa_def *index = NULL;
+      if (divisor_is_one) {
+         index = instance_id;
+      } else {
+         nir_ssa_def *offset = nir_imm_int(b, input_index * 16);
+         nir_ssa_def *divisor =
+            nir_load_smem_buffer_amd(b, 4, s->instance_divisor_constbuf, offset);
+
+         /* The faster NUW version doesn't work when InstanceID == UINT_MAX,
+          * but reaching such an InstanceID would take an unreasonable amount
+          * of time in practice.
+          */
+         index = fast_udiv_nuw(b, instance_id, divisor);
+      }
+
+      nir_ssa_def *start_instance = nir_load_base_instance(b);
+      return nir_iadd(b, index, start_instance);
+   } else {
+      nir_ssa_def *vertex_id = nir_load_vertex_id_zero_base(b);
+      nir_ssa_def *base_vertex = nir_load_first_vertex(b);
+
+      return nir_iadd(b, vertex_id, base_vertex);
+   }
+}
+
+static nir_ssa_def *
+get_vertex_index_for_part_shader(nir_builder *b, int input_index,
+                                 struct lower_vs_inputs_state *s)
+{
+   return ac_nir_load_arg_at_offset(b, &s->args->ac, s->args->vertex_index0, input_index);
+}
+
+static void
+get_vertex_index_for_all_inputs(nir_shader *nir, struct lower_vs_inputs_state *s)
+{
+   nir_function_impl *impl = nir_shader_get_entrypoint(nir);
+
+   nir_builder builder;
+   nir_builder_init(&builder, impl);
+   nir_builder *b = &builder;
+
+   b->cursor = nir_before_cf_list(&impl->body);
+
+   const struct si_shader_selector *sel = s->shader->selector;
+   const union si_shader_key *key = &s->shader->key;
+
+   if (key->ge.part.vs.prolog.instance_divisor_is_fetched) {
+      s->instance_divisor_constbuf =
+         si_nir_load_internal_binding(b, s->args, SI_VS_CONST_INSTANCE_DIVISORS, 4);
+   }
+
+   for (int i = 0; i < sel->info.num_inputs; i++) {
+      s->vertex_index[i] = s->shader->is_monolithic ?
+         get_vertex_index_for_mono_shader(b, i, s) :
+         get_vertex_index_for_part_shader(b, i, s);
+   }
+}
+
+static void
+load_vs_input_from_blit_sgpr(nir_builder *b, unsigned input_index,
+                             struct lower_vs_inputs_state *s,
+                             nir_ssa_def *out[4])
+{
+   nir_ssa_def *vertex_id = nir_load_vertex_id_zero_base(b);
+   nir_ssa_def *sel_x1 = nir_uge(b, nir_imm_int(b, 1), vertex_id);
+   /* Use nir_ine, because we have 3 vertices and only
+    * the middle one should use y2.
+    */
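+   /* I.e. the corners selected per vertex are:
+    *    vertex 0 -> (x1, y1),  vertex 1 -> (x1, y2),  vertex 2 -> (x2, y1)
+    * so sel_x1 holds for vertices 0 and 1, and sel_y1 for vertices 0 and 2.
+    */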
+ */ + nir_ssa_def *sel_y1 = nir_ine_imm(b, vertex_id, 1); + + if (input_index == 0) { + /* Position: */ + nir_ssa_def *x1y1 = ac_nir_load_arg_at_offset(b, &s->args->ac, s->args->vs_blit_inputs, 0); + nir_ssa_def *x2y2 = ac_nir_load_arg_at_offset(b, &s->args->ac, s->args->vs_blit_inputs, 1); + + x1y1 = nir_i2i32(b, nir_unpack_32_2x16(b, x1y1)); + x2y2 = nir_i2i32(b, nir_unpack_32_2x16(b, x2y2)); + + nir_ssa_def *x1 = nir_channel(b, x1y1, 0); + nir_ssa_def *y1 = nir_channel(b, x1y1, 1); + nir_ssa_def *x2 = nir_channel(b, x2y2, 0); + nir_ssa_def *y2 = nir_channel(b, x2y2, 1); + + out[0] = nir_i2f32(b, nir_bcsel(b, sel_x1, x1, x2)); + out[1] = nir_i2f32(b, nir_bcsel(b, sel_y1, y1, y2)); + out[2] = ac_nir_load_arg_at_offset(b, &s->args->ac, s->args->vs_blit_inputs, 2); + out[3] = nir_imm_float(b, 1); + } else { + /* Color or texture coordinates: */ + assert(input_index == 1); + + unsigned vs_blit_property = s->shader->selector->info.base.vs.blit_sgprs_amd; + if (vs_blit_property == SI_VS_BLIT_SGPRS_POS_COLOR) { + for (int i = 0; i < 4; i++) + out[i] = ac_nir_load_arg_at_offset(b, &s->args->ac, s->args->vs_blit_inputs, 3 + i); + } else { + assert(vs_blit_property == SI_VS_BLIT_SGPRS_POS_TEXCOORD); + + nir_ssa_def *x1 = ac_nir_load_arg_at_offset(b, &s->args->ac, s->args->vs_blit_inputs, 3); + nir_ssa_def *y1 = ac_nir_load_arg_at_offset(b, &s->args->ac, s->args->vs_blit_inputs, 4); + nir_ssa_def *x2 = ac_nir_load_arg_at_offset(b, &s->args->ac, s->args->vs_blit_inputs, 5); + nir_ssa_def *y2 = ac_nir_load_arg_at_offset(b, &s->args->ac, s->args->vs_blit_inputs, 6); + + out[0] = nir_bcsel(b, sel_x1, x1, x2); + out[1] = nir_bcsel(b, sel_y1, y1, y2); + out[2] = ac_nir_load_arg_at_offset(b, &s->args->ac, s->args->vs_blit_inputs, 7); + out[3] = ac_nir_load_arg_at_offset(b, &s->args->ac, s->args->vs_blit_inputs, 8); + } + } +} + +/** + * Convert an 11- or 10-bit unsigned floating point number to an f32. + * + * The input exponent is expected to be biased analogous to IEEE-754, i.e. by + * 2^(exp_bits-1) - 1 (as defined in OpenGL and other graphics APIs). + */ +static nir_ssa_def * +ufN_to_float(nir_builder *b, nir_ssa_def *src, unsigned exp_bits, unsigned mant_bits) +{ + assert(src->bit_size == 32); + + nir_ssa_def *mantissa = nir_iand_imm(b, src, (1 << mant_bits) - 1); + + /* Converting normal numbers is just a shift + correcting the exponent bias */ + unsigned normal_shift = 23 - mant_bits; + unsigned bias_shift = 127 - ((1 << (exp_bits - 1)) - 1); + + nir_ssa_def *shifted = nir_ishl_imm(b, src, normal_shift); + nir_ssa_def *normal = nir_iadd_imm(b, shifted, bias_shift << 23); + + /* Converting nan/inf numbers is the same, but with a different exponent update */ + nir_ssa_def *naninf = nir_ior_imm(b, normal, 0xff << 23); + + /* Converting denormals is the complex case: determine the leading zeros of the + * mantissa to obtain the correct shift for the mantissa and exponent correction. + */ + nir_ssa_def *ctlz = nir_uclz(b, mantissa); + /* Shift such that the leading 1 ends up as the LSB of the exponent field. */ + nir_ssa_def *denormal = nir_ishl(b, mantissa, nir_iadd_imm(b, ctlz, -8)); + + unsigned denormal_exp = bias_shift + (32 - mant_bits) - 1; + nir_ssa_def *tmp = nir_isub_imm(b, denormal_exp, ctlz); + denormal = nir_iadd(b, denormal, nir_ishl_imm(b, tmp, 23)); + + /* Select the final result. 
+
+/**
+ * Convert an 11- or 10-bit unsigned floating point number to an f32.
+ *
+ * The input exponent is expected to be biased analogous to IEEE-754, i.e. by
+ * 2^(exp_bits-1) - 1 (as defined in OpenGL and other graphics APIs).
+ */
+static nir_ssa_def *
+ufN_to_float(nir_builder *b, nir_ssa_def *src, unsigned exp_bits, unsigned mant_bits)
+{
+   assert(src->bit_size == 32);
+
+   nir_ssa_def *mantissa = nir_iand_imm(b, src, (1 << mant_bits) - 1);
+
+   /* Converting normal numbers is just a shift + correcting the exponent bias */
+   unsigned normal_shift = 23 - mant_bits;
+   unsigned bias_shift = 127 - ((1 << (exp_bits - 1)) - 1);
+
+   nir_ssa_def *shifted = nir_ishl_imm(b, src, normal_shift);
+   nir_ssa_def *normal = nir_iadd_imm(b, shifted, bias_shift << 23);
+
+   /* Converting nan/inf numbers is the same, but with a different exponent update */
+   nir_ssa_def *naninf = nir_ior_imm(b, normal, 0xff << 23);
+
+   /* Converting denormals is the complex case: determine the leading zeros of the
+    * mantissa to obtain the correct shift for the mantissa and exponent correction.
+    */
+   nir_ssa_def *ctlz = nir_uclz(b, mantissa);
+   /* Shift such that the leading 1 ends up as the LSB of the exponent field. */
+   nir_ssa_def *denormal = nir_ishl(b, mantissa, nir_iadd_imm(b, ctlz, -8));
+
+   unsigned denormal_exp = bias_shift + (32 - mant_bits) - 1;
+   nir_ssa_def *tmp = nir_isub_imm(b, denormal_exp, ctlz);
+   denormal = nir_iadd(b, denormal, nir_ishl_imm(b, tmp, 23));
+
+   /* Select the final result. */
+   nir_ssa_def *cond = nir_uge(b, src, nir_imm_int(b, ((1ULL << exp_bits) - 1) << mant_bits));
+   nir_ssa_def *result = nir_bcsel(b, cond, naninf, normal);
+
+   cond = nir_uge(b, src, nir_imm_int(b, 1ULL << mant_bits));
+   result = nir_bcsel(b, cond, result, denormal);
+
+   cond = nir_ine_imm(b, src, 0);
+   result = nir_bcsel(b, cond, result, nir_imm_int(b, 0));
+
+   return result;
+}
+
+/**
+ * Generate a fully general open-coded buffer format fetch with all required
+ * fixups suitable for vertex fetch, using non-format buffer loads.
+ *
+ * Some combinations of argument values have special interpretations:
+ * - size = 8 bytes, format = fixed indicates PIPE_FORMAT_R11G11B10_FLOAT
+ * - size = 8 bytes, format != {float,fixed} indicates a 2_10_10_10 data format
+ */
+static void
+opencoded_load_format(nir_builder *b, nir_ssa_def *rsrc, nir_ssa_def *vindex,
+                      union si_vs_fix_fetch fix_fetch, bool known_aligned,
+                      enum amd_gfx_level gfx_level, nir_ssa_def *out[4])
+{
+   unsigned log_size = fix_fetch.u.log_size;
+   unsigned num_channels = fix_fetch.u.num_channels_m1 + 1;
+   unsigned format = fix_fetch.u.format;
+   bool reverse = fix_fetch.u.reverse;
+
+   unsigned load_log_size = log_size;
+   unsigned load_num_channels = num_channels;
+   if (log_size == 3) {
+      load_log_size = 2;
+      if (format == AC_FETCH_FORMAT_FLOAT) {
+         load_num_channels = 2 * num_channels;
+      } else {
+         load_num_channels = 1; /* 10_11_11 or 2_10_10_10 */
+      }
+   }
+
+   int log_recombine = 0;
+   if ((gfx_level == GFX6 || gfx_level >= GFX10) && !known_aligned) {
+      /* Avoid alignment restrictions by loading one byte at a time. */
+      load_num_channels <<= load_log_size;
+      log_recombine = load_log_size;
+      load_log_size = 0;
+   } else if (load_num_channels == 2 || load_num_channels == 4) {
+      log_recombine = -util_logbase2(load_num_channels);
+      load_num_channels = 1;
+      load_log_size += -log_recombine;
+   }
+
+   nir_ssa_def *loads[32]; /* up to 32 bytes */
+   for (unsigned i = 0; i < load_num_channels; ++i) {
+      nir_ssa_def *soffset = nir_imm_int(b, i << load_log_size);
+      unsigned num_channels = 1 << (MAX2(load_log_size, 2) - 2);
+      unsigned bit_size = 8 << MIN2(load_log_size, 2);
+      nir_ssa_def *zero = nir_imm_int(b, 0);
+
+      loads[i] = nir_load_buffer_amd(b, num_channels, bit_size, rsrc, zero, soffset, vindex);
+   }
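+
+   /* For example, a dword-sized attribute with unknown alignment on GFX6 is
+    * loaded above as four single bytes (log_recombine == 2), which the code
+    * below packs back into one 32-bit value: b0 | b1 << 8 | b2 << 16 | b3 << 24.
+    */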
+   if (log_recombine > 0) {
+      /* Recombine bytes if necessary (GFX6 only) */
+      unsigned dst_bitsize = log_recombine == 2 ? 32 : 16;
+
+      for (unsigned src = 0, dst = 0; src < load_num_channels; ++dst) {
+         nir_ssa_def *accum = NULL;
+         for (unsigned i = 0; i < (1 << log_recombine); ++i, ++src) {
+            nir_ssa_def *tmp = nir_u2uN(b, loads[src], dst_bitsize);
+            if (i == 0) {
+               accum = tmp;
+            } else {
+               tmp = nir_ishl_imm(b, tmp, 8 * i);
+               accum = nir_ior(b, accum, tmp);
+            }
+         }
+         loads[dst] = accum;
+      }
+   } else if (log_recombine < 0) {
+      /* Split vectors of dwords */
+      if (load_log_size > 2) {
+         assert(load_num_channels == 1);
+         nir_ssa_def *loaded = loads[0];
+         unsigned log_split = load_log_size - 2;
+         log_recombine += log_split;
+         load_num_channels = 1 << log_split;
+         load_log_size = 2;
+         for (unsigned i = 0; i < load_num_channels; ++i)
+            loads[i] = nir_channel(b, loaded, i);
+      }
+
+      /* Further split dwords and shorts if required */
+      if (log_recombine < 0) {
+         for (unsigned src = load_num_channels, dst = load_num_channels << -log_recombine;
+              src > 0; --src) {
+            unsigned dst_bits = 1 << (3 + load_log_size + log_recombine);
+            nir_ssa_def *loaded = loads[src - 1];
+            for (unsigned i = 1 << -log_recombine; i > 0; --i, --dst) {
+               nir_ssa_def *tmp = nir_ushr_imm(b, loaded, dst_bits * (i - 1));
+               loads[dst - 1] = nir_u2uN(b, tmp, dst_bits);
+            }
+         }
+      }
+   }
+
+   if (log_size == 3) {
+      switch (format) {
+      case AC_FETCH_FORMAT_FLOAT: {
+         for (unsigned i = 0; i < num_channels; ++i)
+            loads[i] = nir_pack_64_2x32_split(b, loads[2 * i], loads[2 * i + 1]);
+         break;
+      }
+      case AC_FETCH_FORMAT_FIXED: {
+         /* 10_11_11_FLOAT */
+         nir_ssa_def *data = loads[0];
+         nir_ssa_def *red = nir_iand_imm(b, data, 2047);
+         nir_ssa_def *green = nir_iand_imm(b, nir_ushr_imm(b, data, 11), 2047);
+         nir_ssa_def *blue = nir_ushr_imm(b, data, 22);
+
+         loads[0] = ufN_to_float(b, red, 5, 6);
+         loads[1] = ufN_to_float(b, green, 5, 6);
+         loads[2] = ufN_to_float(b, blue, 5, 5);
+
+         num_channels = 3;
+         log_size = 2;
+         format = AC_FETCH_FORMAT_FLOAT;
+         break;
+      }
+      case AC_FETCH_FORMAT_UINT:
+      case AC_FETCH_FORMAT_UNORM:
+      case AC_FETCH_FORMAT_USCALED: {
+         /* 2_10_10_10 data formats */
+         nir_ssa_def *data = loads[0];
+
+         loads[0] = nir_ubfe_imm(b, data, 0, 10);
+         loads[1] = nir_ubfe_imm(b, data, 10, 10);
+         loads[2] = nir_ubfe_imm(b, data, 20, 10);
+         loads[3] = nir_ubfe_imm(b, data, 30, 2);
+
+         num_channels = 4;
+         break;
+      }
+      case AC_FETCH_FORMAT_SINT:
+      case AC_FETCH_FORMAT_SNORM:
+      case AC_FETCH_FORMAT_SSCALED: {
+         /* 2_10_10_10 data formats */
+         nir_ssa_def *data = loads[0];
+
+         loads[0] = nir_ibfe_imm(b, data, 0, 10);
+         loads[1] = nir_ibfe_imm(b, data, 10, 10);
+         loads[2] = nir_ibfe_imm(b, data, 20, 10);
+         loads[3] = nir_ibfe_imm(b, data, 30, 2);
+
+         num_channels = 4;
+         break;
+      }
+      default:
+         unreachable("invalid fetch format");
+         break;
+      }
+   }
+
+   switch (format) {
+   case AC_FETCH_FORMAT_FLOAT:
+      if (log_size != 2) {
+         for (unsigned chan = 0; chan < num_channels; ++chan)
+            loads[chan] = nir_f2f32(b, loads[chan]);
+      }
+      break;
+   case AC_FETCH_FORMAT_UINT:
+      if (log_size != 2) {
+         for (unsigned chan = 0; chan < num_channels; ++chan)
+            loads[chan] = nir_u2u32(b, loads[chan]);
+      }
+      break;
+   case AC_FETCH_FORMAT_SINT:
+      if (log_size != 2) {
+         for (unsigned chan = 0; chan < num_channels; ++chan)
+            loads[chan] = nir_i2i32(b, loads[chan]);
+      }
+      break;
+   case AC_FETCH_FORMAT_USCALED:
+      for (unsigned chan = 0; chan < num_channels; ++chan)
+         loads[chan] = nir_u2f32(b, loads[chan]);
+      break;
+   case AC_FETCH_FORMAT_SSCALED:
+      for (unsigned chan = 0; chan < num_channels; ++chan)
+         loads[chan] = nir_i2f32(b, loads[chan]);
+      break;
+   case AC_FETCH_FORMAT_FIXED:
+      for (unsigned chan = 0; chan < num_channels; ++chan) {
+         nir_ssa_def *tmp = nir_i2f32(b, loads[chan]);
+         loads[chan] = nir_fmul_imm(b, tmp, 1.0 / 0x10000);
+      }
+      break;
+   case AC_FETCH_FORMAT_UNORM:
+      for (unsigned chan = 0; chan < num_channels; ++chan) {
+         /* 2_10_10_10 data formats */
+         unsigned bits = log_size == 3 ? (chan == 3 ? 2 : 10) : (8 << log_size);
+         nir_ssa_def *tmp = nir_u2f32(b, loads[chan]);
+         loads[chan] = nir_fmul_imm(b, tmp, 1.0 / BITFIELD64_MASK(bits));
+      }
+      break;
+   case AC_FETCH_FORMAT_SNORM:
+      for (unsigned chan = 0; chan < num_channels; ++chan) {
+         /* 2_10_10_10 data formats */
+         unsigned bits = log_size == 3 ? (chan == 3 ? 2 : 10) : (8 << log_size);
+         nir_ssa_def *tmp = nir_i2f32(b, loads[chan]);
+         tmp = nir_fmul_imm(b, tmp, 1.0 / BITFIELD64_MASK(bits - 1));
+         /* Clamp to [-1, 1] */
+         tmp = nir_fmax(b, tmp, nir_imm_float(b, -1));
+         loads[chan] = nir_fmin(b, tmp, nir_imm_float(b, 1));
+      }
+      break;
+   default:
+      unreachable("invalid fetch format");
+      break;
+   }
+
+   while (num_channels < 4) {
+      unsigned pad_value = num_channels == 3 ? 1 : 0;
+      loads[num_channels] =
+         format == AC_FETCH_FORMAT_UINT || format == AC_FETCH_FORMAT_SINT ?
+         nir_imm_int(b, pad_value) : nir_imm_float(b, pad_value);
+      num_channels++;
+   }
+
+   if (reverse) {
+      nir_ssa_def *tmp = loads[0];
+      loads[0] = loads[2];
+      loads[2] = tmp;
+   }
+
+   memcpy(out, loads, 4 * sizeof(out[0]));
+}
+
+static void
+load_vs_input_from_vertex_buffer(nir_builder *b, unsigned input_index,
+                                 struct lower_vs_inputs_state *s,
+                                 unsigned bit_size, nir_ssa_def *out[4])
+{
+   const struct si_shader_selector *sel = s->shader->selector;
+   const union si_shader_key *key = &s->shader->key;
+
+   nir_ssa_def *vb_desc;
+   if (input_index < sel->info.num_vbos_in_user_sgprs) {
+      vb_desc = ac_nir_load_arg(b, &s->args->ac, s->args->vb_descriptors[input_index]);
+   } else {
+      unsigned index = input_index - sel->info.num_vbos_in_user_sgprs;
+      nir_ssa_def *addr = ac_nir_load_arg(b, &s->args->ac, s->args->ac.vertex_buffers);
+      vb_desc = nir_load_smem_amd(b, 4, addr, nir_imm_int(b, index * 16));
+   }
+
+   nir_ssa_def *vertex_index = s->vertex_index[input_index];
+
+   /* Use the open-coded implementation for all loads of doubles and
+    * of dword-sized data that needs fixups. We need to insert conversion
+    * code anyway.
+    */
+   bool opencode = key->ge.mono.vs_fetch_opencode & (1 << input_index);
+   union si_vs_fix_fetch fix_fetch = key->ge.mono.vs_fix_fetch[input_index];
+   if (opencode ||
+       (fix_fetch.u.log_size == 3 && fix_fetch.u.format == AC_FETCH_FORMAT_FLOAT) ||
+       fix_fetch.u.log_size == 2) {
+      opencoded_load_format(b, vb_desc, vertex_index, fix_fetch, !opencode,
+                            sel->screen->info.gfx_level, out);
+
+      if (bit_size == 16) {
+         if (fix_fetch.u.format == AC_FETCH_FORMAT_UINT ||
+             fix_fetch.u.format == AC_FETCH_FORMAT_SINT) {
+            for (unsigned i = 0; i < 4; i++)
+               out[i] = nir_u2u16(b, out[i]);
+         } else {
+            for (unsigned i = 0; i < 4; i++)
+               out[i] = nir_f2f16(b, out[i]);
+         }
+      }
+      return;
+   }
+
+   unsigned required_channels = util_last_bit(sel->info.input[input_index].usage_mask);
+   if (required_channels == 0) {
+      for (unsigned i = 0; i < 4; ++i)
+         out[i] = nir_ssa_undef(b, 1, bit_size);
+      return;
+   }
+
+   /* Do multiple loads for special formats. */
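+   /* E.g. three-channel 8- or 16-bit formats (log_size <= 1, three channels)
+    * have no matching hardware buffer format, so each channel is fetched
+    * separately below, with a per-fetch byte stride of 1 << log_size.
+    */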
+   nir_ssa_def *fetches[4];
+   unsigned num_fetches;
+   unsigned fetch_stride;
+   unsigned channels_per_fetch;
+
+   if (fix_fetch.u.log_size <= 1 && fix_fetch.u.num_channels_m1 == 2) {
+      num_fetches = MIN2(required_channels, 3);
+      fetch_stride = 1 << fix_fetch.u.log_size;
+      channels_per_fetch = 1;
+   } else {
+      num_fetches = 1;
+      fetch_stride = 0;
+      channels_per_fetch = required_channels;
+   }
+
+   for (unsigned i = 0; i < num_fetches; ++i) {
+      nir_ssa_def *zero = nir_imm_int(b, 0);
+      fetches[i] = nir_load_buffer_amd(b, channels_per_fetch, bit_size, vb_desc,
+                                       zero, zero, vertex_index,
+                                       .base = fetch_stride * i,
+                                       .access = ACCESS_USES_FORMAT_AMD);
+   }
+
+   if (num_fetches == 1 && channels_per_fetch > 1) {
+      nir_ssa_def *fetch = fetches[0];
+      for (unsigned i = 0; i < channels_per_fetch; ++i)
+         fetches[i] = nir_channel(b, fetch, i);
+
+      num_fetches = channels_per_fetch;
+      channels_per_fetch = 1;
+   }
+
+   for (unsigned i = num_fetches; i < 4; ++i)
+      fetches[i] = nir_ssa_undef(b, 1, bit_size);
+
+   if (fix_fetch.u.log_size <= 1 && fix_fetch.u.num_channels_m1 == 2 && required_channels == 4) {
+      if (fix_fetch.u.format == AC_FETCH_FORMAT_UINT || fix_fetch.u.format == AC_FETCH_FORMAT_SINT)
+         fetches[3] = nir_imm_intN_t(b, 1, bit_size);
+      else
+         fetches[3] = nir_imm_floatN_t(b, 1, bit_size);
+   } else if (fix_fetch.u.log_size == 3 &&
+              (fix_fetch.u.format == AC_FETCH_FORMAT_SNORM ||
+               fix_fetch.u.format == AC_FETCH_FORMAT_SSCALED ||
+               fix_fetch.u.format == AC_FETCH_FORMAT_SINT) &&
+              required_channels == 4) {
+
+      /* For 2_10_10_10, the hardware returns an unsigned value;
+       * convert it to a signed one.
+       */
+      nir_ssa_def *tmp = fetches[3];
+
+      /* First, recover the sign-extended signed integer value. */
+      if (fix_fetch.u.format == AC_FETCH_FORMAT_SSCALED)
+         tmp = nir_f2uN(b, tmp, bit_size);
+
+      /* For the integer-like cases, do a natural sign extension.
+       *
+       * For the SNORM case, the values are 0.0, 0.333, 0.666, 1.0
+       * and happen to contain 0, 1, 2, 3 as the two LSBs of the
+       * exponent.
+       */
+      tmp = nir_ishl_imm(b, tmp, fix_fetch.u.format == AC_FETCH_FORMAT_SNORM ? 7 : 30);
+      tmp = nir_ishr_imm(b, tmp, 30);
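+
+      /* Worked example of the SNORM trick above: the hardware returns 0.0,
+       * 1/3, 2/3 or 1.0, i.e. 0x00000000, 0x3EAAAAAB, 0x3F2AAAAB or
+       * 0x3F800000. Bits 24:23 (the exponent LSBs) hold 0, 1, 2, 3; shifting
+       * left by 7 moves them to bits 31:30, and the arithmetic shift right
+       * by 30 yields the sign-extended 2-bit values 0, 1, -2, -1.
+       */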
+
+      /* Convert back to the right type. */
+      if (fix_fetch.u.format == AC_FETCH_FORMAT_SNORM) {
+         tmp = nir_i2fN(b, tmp, bit_size);
+         /* Clamp to [-1, 1] */
+         tmp = nir_fmax(b, tmp, nir_imm_float(b, -1));
+         tmp = nir_fmin(b, tmp, nir_imm_float(b, 1));
+      } else if (fix_fetch.u.format == AC_FETCH_FORMAT_SSCALED) {
+         tmp = nir_i2fN(b, tmp, bit_size);
+      }
+
+      fetches[3] = tmp;
+   }
+
+   memcpy(out, fetches, 4 * sizeof(out[0]));
+}
+
+static bool
+lower_vs_input_instr(nir_builder *b, nir_instr *instr, void *state)
+{
+   if (instr->type != nir_instr_type_intrinsic)
+      return false;
+
+   nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr);
+   if (intrin->intrinsic != nir_intrinsic_load_input)
+      return false;
+
+   struct lower_vs_inputs_state *s = (struct lower_vs_inputs_state *)state;
+
+   b->cursor = nir_before_instr(instr);
+
+   unsigned input_index = nir_intrinsic_base(intrin);
+   unsigned component = nir_intrinsic_component(intrin);
+   unsigned num_components = intrin->dest.ssa.num_components;
+
+   nir_ssa_def *comp[4];
+   if (s->shader->selector->info.base.vs.blit_sgprs_amd)
+      load_vs_input_from_blit_sgpr(b, input_index, s, comp);
+   else
+      load_vs_input_from_vertex_buffer(b, input_index, s, intrin->dest.ssa.bit_size, comp);
+
+   nir_ssa_def *replacement = nir_vec(b, &comp[component], num_components);
+
+   nir_ssa_def_rewrite_uses(&intrin->dest.ssa, replacement);
+   nir_instr_remove(instr);
+   nir_instr_free(instr);
+
+   return true;
+}
+
+bool
+si_nir_lower_vs_inputs(nir_shader *nir, struct si_shader *shader, struct si_shader_args *args)
+{
+   const struct si_shader_selector *sel = shader->selector;
+
+   /* no inputs to lower */
+   if (!sel->info.num_inputs)
+      return false;
+
+   struct lower_vs_inputs_state state = {
+      .shader = shader,
+      .args = args,
+   };
+
+   if (!sel->info.base.vs.blit_sgprs_amd)
+      get_vertex_index_for_all_inputs(nir, &state);
+
+   return nir_shader_instructions_pass(nir, lower_vs_input_instr,
+                                       nir_metadata_dominance | nir_metadata_block_index,
+                                       &state);
+}
diff --git a/src/gallium/drivers/radeonsi/si_shader_internal.h b/src/gallium/drivers/radeonsi/si_shader_internal.h
index b667ed21da1..00c0a931e02 100644
--- a/src/gallium/drivers/radeonsi/si_shader_internal.h
+++ b/src/gallium/drivers/radeonsi/si_shader_internal.h
@@ -171,6 +171,10 @@ bool si_nir_lower_abi(nir_shader *nir, struct si_shader *shader, struct si_shade
 bool si_nir_lower_resource(nir_shader *nir, struct si_shader *shader,
                            struct si_shader_args *args);
 
+/* si_nir_lower_vs_inputs.c */
+bool si_nir_lower_vs_inputs(nir_shader *nir, struct si_shader *shader,
+                            struct si_shader_args *args);
+
 /* si_shader_llvm.c */
 bool si_compile_llvm(struct si_screen *sscreen, struct si_shader_binary *binary,
                      struct ac_shader_config *conf, struct ac_llvm_compiler *compiler,