diff --git a/src/amd/vulkan/radv_pipeline.c b/src/amd/vulkan/radv_pipeline.c
index 6ecb579586b..17a554d8a98 100644
--- a/src/amd/vulkan/radv_pipeline.c
+++ b/src/amd/vulkan/radv_pipeline.c
@@ -4888,15 +4888,14 @@ radv_create_shaders(struct radv_pipeline *pipeline, struct radv_pipeline_layout
       }

       if (((stages[i].nir->info.bit_sizes_int | stages[i].nir->info.bit_sizes_float) & 16) &&
           device->physical_device->rad_info.gfx_level >= GFX9) {
-         uint32_t sampler_dims = UINT32_MAX;
-         /* Skip because AMD doesn't support 16-bit types with these. */
-         sampler_dims &= ~BITFIELD_BIT(GLSL_SAMPLER_DIM_CUBE);
          // TODO: also optimize the tex srcs. see radeonSI for reference */
-         /* Skip if there are potentially conflicting rounding modes */
-         if (!nir_has_any_rounding_mode_enabled(stages[i].nir->info.float_controls_execution_mode))
-            NIR_PASS(_, stages[i].nir, nir_fold_16bit_sampler_conversions, 0, sampler_dims);
+         struct nir_fold_16bit_tex_image_options fold_16bit_options = {
+            .rounding_mode = nir_rounding_mode_rtne,
+            .fold_tex_dest = true,
+            .fold_image_load_store_data = true,
+         };
+         NIR_PASS(_, stages[i].nir, nir_fold_16bit_tex_image, &fold_16bit_options);

-         NIR_PASS(_, stages[i].nir, nir_fold_16bit_image_load_store_conversions);
          NIR_PASS(_, stages[i].nir, nir_opt_vectorize, opt_vectorize_callback, device);
       }
diff --git a/src/compiler/nir/nir.h b/src/compiler/nir/nir.h
index 1b8f68fe16b..35d4ff5db3c 100644
--- a/src/compiler/nir/nir.h
+++ b/src/compiler/nir/nir.h
@@ -5339,9 +5339,22 @@ bool nir_lower_mediump_io(nir_shader *nir, nir_variable_mode modes,
 bool nir_force_mediump_io(nir_shader *nir, nir_variable_mode modes,
                           nir_alu_type types);
 bool nir_unpack_16bit_varying_slots(nir_shader *nir, nir_variable_mode modes);
-bool nir_fold_16bit_sampler_conversions(nir_shader *nir,
-                                        unsigned tex_src_types, uint32_t sampler_dims);
-bool nir_fold_16bit_image_load_store_conversions(nir_shader *nir);
+
+struct nir_fold_tex_srcs_options {
+   unsigned sampler_dims;
+   unsigned src_types;
+};
+
+struct nir_fold_16bit_tex_image_options {
+   nir_rounding_mode rounding_mode;
+   bool fold_tex_dest;
+   bool fold_image_load_store_data;
+   unsigned fold_srcs_options_count;
+   struct nir_fold_tex_srcs_options *fold_srcs_options;
+};
+
+bool nir_fold_16bit_tex_image(nir_shader *nir,
+                              struct nir_fold_16bit_tex_image_options *options);

 typedef struct {
    bool legalize_type; /* whether this src should be legalized */
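[Editor's note: the driver hunks further down are the in-tree adopters of this new interface; as a quick orientation, a caller now fills in the two option structs declared above and runs the pass once. A minimal sketch, assuming a hypothetical driver that wants RTZ-rounded 16-bit destinations and 16-bit coordinates/LOD for everything but cube maps ("shader" and "progress" are illustrative names, and the option values are one plausible choice, not a recommendation from this patch):

   /* Sketch only -- not part of this patch. */
   struct nir_fold_tex_srcs_options srcs_options = {
      .sampler_dims = ~BITFIELD_BIT(GLSL_SAMPLER_DIM_CUBE), /* skip cube maps */
      .src_types = (1 << nir_tex_src_coord) | (1 << nir_tex_src_lod),
   };
   struct nir_fold_16bit_tex_image_options options = {
      .rounding_mode = nir_rounding_mode_rtz,   /* rounding the HW's 16-bit dest implies */
      .fold_tex_dest = true,                    /* fold f2f16/i2i16 of tex results */
      .fold_image_load_store_data = true,       /* same for image load/store data */
      .fold_srcs_options_count = 1,
      .fold_srcs_options = &srcs_options,
   };
   bool progress = false;
   NIR_PASS(progress, shader, nir_fold_16bit_tex_image, &options);

Unlike the old nir_fold_16bit_sampler_conversions/nir_fold_16bit_image_load_store_conversions pair, destination folding, source folding and image-data folding are all selected through this single options struct.]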
diff --git a/src/compiler/nir/nir_lower_mediump.c b/src/compiler/nir/nir_lower_mediump.c
index 5c6f84ec9be..54ad83b7cb0 100644
--- a/src/compiler/nir/nir_lower_mediump.c
+++ b/src/compiler/nir/nir_lower_mediump.c
@@ -376,7 +376,6 @@ static bool
 is_f32_to_f16_conversion(nir_instr *instr)
 {
    return is_n_to_m_conversion(instr, 32, nir_op_f2f16) ||
-          is_n_to_m_conversion(instr, 32, nir_op_f2f16_rtne) ||
           is_n_to_m_conversion(instr, 32, nir_op_f2fmp);
 }

@@ -396,156 +395,8 @@ static bool
 is_i32_to_i16_conversion(nir_instr *instr)
 {
    return is_n_to_m_conversion(instr, 32, nir_op_i2i16) ||
-          is_n_to_m_conversion(instr, 32, nir_op_u2u16);
-}
-
-static void
-replace_with_mov(nir_builder *b, nir_instr *instr, nir_src *src,
-                 nir_alu_instr *alu)
-{
-   nir_ssa_def *mov = nir_mov_alu(b, alu->src[0],
-                                  nir_dest_num_components(alu->dest.dest));
-   assert(!alu->dest.saturate);
-   nir_instr_rewrite_src_ssa(instr, src, mov);
-}
-
-/**
- * If texture source operands use f16->f32 conversions or return values are
- * followed by f16->f32 or f32->f16, remove those conversions. This benefits
- * drivers that have texture opcodes that can accept and return 16-bit types.
- *
- * "tex_src_types" is a mask of nir_tex_src_* operands that should be handled.
- * It's always done for the destination.
- *
- * This should be run after late algebraic optimizations.
- * Copy propagation and DCE should be run after this.
- */
-bool
-nir_fold_16bit_sampler_conversions(nir_shader *nir,
-                                   unsigned tex_src_types,
-                                   uint32_t sampler_dims)
-{
-   bool changed = false;
-   nir_function_impl *impl = nir_shader_get_entrypoint(nir);
-   assert(impl);
-
-   nir_builder b;
-   nir_builder_init(&b, impl);
-
-   nir_foreach_block_safe (block, impl) {
-      nir_foreach_instr_safe (instr, block) {
-         if (instr->type != nir_instr_type_tex)
-            continue;
-
-         nir_tex_instr *tex = nir_instr_as_tex(instr);
-         nir_instr *src;
-         nir_alu_instr *src_alu;
-
-         /* Skip sparse residency */
-         if (tex->is_sparse)
-            continue;
-
-         if ((tex->op == nir_texop_txs ||
-              tex->op == nir_texop_query_levels) ||
-             !(sampler_dims & BITFIELD_BIT(tex->sampler_dim)))
-            continue;
-
-         /* Optimize source operands. */
-         for (unsigned i = 0; i < tex->num_srcs; i++) {
-            /* Filter out sources that should be ignored. */
-            if (!(BITFIELD_BIT(tex->src[i].src_type) & tex_src_types))
-               continue;
-
-            src = tex->src[i].src.ssa->parent_instr;
-            if (src->type != nir_instr_type_alu)
-               continue;
-
-            src_alu = nir_instr_as_alu(src);
-            b.cursor = nir_before_instr(src);
-
-            nir_alu_type src_type = nir_tex_instr_src_type(tex, i);
-
-            /* Handle vector sources that are made of scalar instructions. */
-            if (nir_op_is_vec(src_alu->op)) {
-               /* See if the vector is made of f16->f32 opcodes. */
-               unsigned num = nir_dest_num_components(src_alu->dest.dest);
-               bool is_f16_to_f32 = src_type == nir_type_float;
-               bool is_u16_to_u32 = src_type & (nir_type_int | nir_type_uint);
-
-               for (unsigned comp = 0; comp < num; comp++) {
-                  nir_instr *instr = src_alu->src[comp].src.ssa->parent_instr;
-                  is_f16_to_f32 &= is_f16_to_f32_conversion(instr);
-                  /* Zero-extension (u16) and sign-extension (i16) have
-                   * the same behavior here - txf returns 0 if bit 15 is set
-                   * because it's out of bounds and the higher bits don't
-                   * matter.
-                   */
-                  is_u16_to_u32 &= is_u16_to_u32_conversion(instr) ||
-                                   is_i16_to_i32_conversion(instr);
-               }
-
-               if (!is_f16_to_f32 && !is_u16_to_u32)
-                  continue;
-
-               nir_alu_instr *new_vec = nir_alu_instr_clone(nir, src_alu);
-               nir_instr_insert_after(&src_alu->instr, &new_vec->instr);
-
-               /* Replace conversions with mov. */
-               for (unsigned comp = 0; comp < num; comp++) {
-                  nir_instr *instr = new_vec->src[comp].src.ssa->parent_instr;
-                  replace_with_mov(&b, &new_vec->instr,
-                                   &new_vec->src[comp].src,
-                                   nir_instr_as_alu(instr));
-               }
-
-               new_vec->dest.dest.ssa.bit_size =
-                  new_vec->src[0].src.ssa->bit_size;
-               nir_instr_rewrite_src_ssa(&tex->instr, &tex->src[i].src,
-                                         &new_vec->dest.dest.ssa);
-               changed = true;
-            } else if ((is_f16_to_f32_conversion(&src_alu->instr) &&
-                        src_type == nir_type_float) ||
-                       ((is_u16_to_u32_conversion(&src_alu->instr) ||
-                         is_i16_to_i32_conversion(&src_alu->instr)) &&
-                        src_type & (nir_type_int | nir_type_uint))) {
-               /* Handle scalar sources. */
-               replace_with_mov(&b, &tex->instr, &tex->src[i].src, src_alu);
-               changed = true;
-            }
-         }
-
-         /* Optimize the destination. */
-         bool is_f32_to_f16 = tex->dest_type & nir_type_float;
-         /* same behavior for int and uint */
-         bool is_i32_to_i16 = tex->dest_type & (nir_type_int | nir_type_uint);
-
-         nir_foreach_use(use, &tex->dest.ssa) {
-            is_f32_to_f16 &= is_f32_to_f16_conversion(use->parent_instr);
-            is_i32_to_i16 &= is_i32_to_i16_conversion(use->parent_instr);
-         }
-
-         if (is_f32_to_f16 || is_i32_to_i16) {
-            /* All uses are the same conversions. Replace them with mov. */
-            nir_foreach_use(use, &tex->dest.ssa) {
-               nir_alu_instr *conv = nir_instr_as_alu(use->parent_instr);
-               conv->op = nir_op_mov;
-               tex->dest.ssa.bit_size = conv->dest.dest.ssa.bit_size;
-               tex->dest_type = (tex->dest_type & (~16 & ~32 & ~64)) |
-                                conv->dest.dest.ssa.bit_size;
-            }
-            changed = true;
-         }
-      }
-   }
-
-   if (changed) {
-      nir_metadata_preserve(impl, nir_metadata_dominance |
-                                  nir_metadata_block_index);
-   } else {
-      nir_metadata_preserve(impl, nir_metadata_all);
-   }
-
-   return changed;
+          is_n_to_m_conversion(instr, 32, nir_op_u2u16) ||
+          is_n_to_m_conversion(instr, 32, nir_op_i2imp);
 }

 /**
@@ -662,6 +513,73 @@ const_is_i16(nir_ssa_scalar scalar)
    return value == (int16_t) value;
 }

+static bool
+can_fold_16bit_src(nir_ssa_def *ssa, nir_alu_type src_type, bool sext_matters)
+{
+   bool fold_f16 = src_type == nir_type_float32;
+   bool fold_u16 = src_type == nir_type_uint32 && sext_matters;
+   bool fold_i16 = src_type == nir_type_int32 && sext_matters;
+   bool fold_i16_u16 = (src_type == nir_type_uint32 || src_type == nir_type_int32) && !sext_matters;
+
+   bool can_fold = fold_f16 || fold_u16 || fold_i16 || fold_i16_u16;
+   for (unsigned i = 0; can_fold && i < ssa->num_components; i++) {
+      nir_ssa_scalar comp = nir_ssa_scalar_resolved(ssa, i);
+      if (comp.def->parent_instr->type == nir_instr_type_ssa_undef)
+         continue;
+      else if (nir_ssa_scalar_is_const(comp)) {
+         if (fold_f16)
+            can_fold &= const_is_f16(comp);
+         else if (fold_u16)
+            can_fold &= const_is_u16(comp);
+         else if (fold_i16)
+            can_fold &= const_is_i16(comp);
+         else if (fold_i16_u16)
+            can_fold &= (const_is_u16(comp) || const_is_i16(comp));
+      } else {
+         if (fold_f16)
+            can_fold &= is_f16_to_f32_conversion(comp.def->parent_instr);
+         else if (fold_u16)
+            can_fold &= is_u16_to_u32_conversion(comp.def->parent_instr);
+         else if (fold_i16)
+            can_fold &= is_i16_to_i32_conversion(comp.def->parent_instr);
+         else if (fold_i16_u16)
+            can_fold &= (is_i16_to_i32_conversion(comp.def->parent_instr) ||
+                         is_u16_to_u32_conversion(comp.def->parent_instr));
+      }
+   }
+
+   return can_fold;
+}
+
+static void
+fold_16bit_src(nir_builder *b, nir_instr *instr, nir_src *src, nir_alu_type src_type)
+{
+   b->cursor = nir_before_instr(instr);
+
+   nir_ssa_scalar new_comps[NIR_MAX_VEC_COMPONENTS];
+   for (unsigned i = 0; i < src->ssa->num_components; i++) {
+      nir_ssa_scalar comp = nir_ssa_scalar_resolved(src->ssa, i);
+
+      if (comp.def->parent_instr->type == nir_instr_type_ssa_undef)
+         new_comps[i] = nir_get_ssa_scalar(nir_ssa_undef(b, 1, 16), 0);
+      else if (nir_ssa_scalar_is_const(comp)) {
+         nir_ssa_def *constant;
+         if (src_type == nir_type_float32)
+            constant = nir_imm_float16(b, nir_ssa_scalar_as_float(comp));
+         else
+            constant = nir_imm_intN_t(b, nir_ssa_scalar_as_uint(comp), 16);
+         new_comps[i] = nir_get_ssa_scalar(constant, 0);
+      } else {
+         /* conversion instruction */
+         new_comps[i] = nir_ssa_scalar_chase_alu_src(comp, 0);
+      }
+   }
+
+   nir_ssa_def *new_vec = nir_vec_scalars(b, new_comps, src->ssa->num_components);
+
+   nir_instr_rewrite_src_ssa(instr, src, new_vec);
+}
+
 static bool
 fold_16bit_store_data(nir_builder *b, nir_intrinsic_instr *instr)
 {
@@ -670,49 +588,10 @@ fold_16bit_store_data(nir_builder *b, nir_intrinsic_instr *instr)

    b->cursor = nir_before_instr(&instr->instr);

-   bool fold_f16 = src_type == nir_type_float32;
-   bool fold_u16 = src_type == nir_type_uint32;
-   bool fold_i16 = src_type == nir_type_int32;
-
-   nir_ssa_scalar comps[NIR_MAX_VEC_COMPONENTS];
-   for (unsigned i = 0; i < instr->num_components; i++) {
-      comps[i] = nir_ssa_scalar_resolved(data_src->ssa, i);
-      if (comps[i].def->parent_instr->type == nir_instr_type_ssa_undef)
-         continue;
-      else if (nir_ssa_scalar_is_const(comps[i])) {
-         fold_f16 &= const_is_f16(comps[i]);
-         fold_u16 &= const_is_u16(comps[i]);
-         fold_i16 &= const_is_i16(comps[i]);
-      } else {
-         fold_f16 &= is_f16_to_f32_conversion(comps[i].def->parent_instr);
-         fold_u16 &= is_u16_to_u32_conversion(comps[i].def->parent_instr);
-         fold_i16 &= is_i16_to_i32_conversion(comps[i].def->parent_instr);
-      }
-   }
-
-   if (!fold_f16 && !fold_u16 && !fold_i16)
+   if (!can_fold_16bit_src(data_src->ssa, src_type, true))
       return false;

-   nir_ssa_scalar new_comps[NIR_MAX_VEC_COMPONENTS];
-   for (unsigned i = 0; i < instr->num_components; i++) {
-      if (comps[i].def->parent_instr->type == nir_instr_type_ssa_undef)
-         new_comps[i] = nir_get_ssa_scalar(nir_ssa_undef(b, 1, 16), 0);
-      else if (nir_ssa_scalar_is_const(comps[i])) {
-         nir_ssa_def *constant;
-         if (src_type == nir_type_float32)
-            constant = nir_imm_float16(b, nir_ssa_scalar_as_float(comps[i]));
-         else
-            constant = nir_imm_intN_t(b, nir_ssa_scalar_as_uint(comps[i]), 16);
-         new_comps[i] = nir_get_ssa_scalar(constant, 0);
-      } else {
-         /* conversion instruction */
-         new_comps[i] = nir_ssa_scalar_chase_alu_src(comps[i], 0);
-      }
-   }
-
-   nir_ssa_def *new_vec = nir_vec_scalars(b, new_comps, instr->num_components);
-
-   nir_instr_rewrite_src_ssa(&instr->instr, data_src, new_vec);
+   fold_16bit_src(b, &instr->instr, data_src, src_type);

    nir_intrinsic_set_src_type(instr, (src_type & ~32) | 16);

@@ -720,70 +599,169 @@ fold_16bit_store_data(nir_builder *b, nir_intrinsic_instr *instr)
 }

 static bool
-fold_16bit_load_data(nir_builder *b, nir_intrinsic_instr *instr)
+fold_16bit_destination(nir_ssa_def *ssa, nir_alu_type dest_type,
+                       unsigned exec_mode, nir_rounding_mode rdm)
 {
-   nir_alu_type dest_type = nir_intrinsic_dest_type(instr);
-
-   if (dest_type == nir_type_float32 &&
-       nir_has_any_rounding_mode_enabled(b->shader->info.float_controls_execution_mode))
-      return false;
-
    bool is_f32_to_f16 = dest_type == nir_type_float32;
    bool is_i32_to_i16 = dest_type == nir_type_int32 || dest_type == nir_type_uint32;

-   nir_foreach_use(use, &instr->dest.ssa) {
-      is_f32_to_f16 &= is_f32_to_f16_conversion(use->parent_instr);
-      is_i32_to_i16 &= is_i32_to_i16_conversion(use->parent_instr);
+   nir_rounding_mode src_rdm =
+      nir_get_rounding_mode_from_float_controls(exec_mode, nir_type_float16);
+   bool allow_standard = (src_rdm == rdm || src_rdm == nir_rounding_mode_undef);
+   bool allow_rtz = rdm == nir_rounding_mode_rtz;
+   bool allow_rtne = rdm == nir_rounding_mode_rtne;
+
+   nir_foreach_use(use, ssa) {
+      nir_instr *instr = use->parent_instr;
+      is_f32_to_f16 &= (allow_standard && is_f32_to_f16_conversion(instr)) ||
+                       (allow_rtz && is_n_to_m_conversion(instr, 32, nir_op_f2f16_rtz)) ||
+                       (allow_rtne && is_n_to_m_conversion(instr, 32, nir_op_f2f16_rtne));
+      is_i32_to_i16 &= is_i32_to_i16_conversion(instr);
    }

    if (!is_f32_to_f16 && !is_i32_to_i16)
       return false;

    /* All uses are the same conversions. Replace them with mov. */
-   nir_foreach_use(use, &instr->dest.ssa) {
+   nir_foreach_use(use, ssa) {
       nir_alu_instr *conv = nir_instr_as_alu(use->parent_instr);
       conv->op = nir_op_mov;
    }

-   instr->dest.ssa.bit_size = 16;
+   ssa->bit_size = 16;
+   return true;
+}
+
+static bool
+fold_16bit_load_data(nir_builder *b, nir_intrinsic_instr *instr,
+                     unsigned exec_mode, nir_rounding_mode rdm)
+{
+   nir_alu_type dest_type = nir_intrinsic_dest_type(instr);
+
+   if (!fold_16bit_destination(&instr->dest.ssa, dest_type, exec_mode, rdm))
+      return false;
+
    nir_intrinsic_set_dest_type(instr, (dest_type & ~32) | 16);

    return true;
 }

 static bool
-fold_16bit_image_load_store(nir_builder *b, nir_instr *instr, UNUSED void *unused)
+fold_16bit_tex_dest(nir_tex_instr *tex, unsigned exec_mode,
+                    nir_rounding_mode rdm)
 {
-   if (instr->type != nir_instr_type_intrinsic)
+   /* Skip sparse residency */
+   if (tex->is_sparse)
       return false;

-   nir_intrinsic_instr *intrinsic = nir_instr_as_intrinsic(instr);
+   if (tex->op != nir_texop_tex &&
+       tex->op != nir_texop_txb &&
+       tex->op != nir_texop_txd &&
+       tex->op != nir_texop_txl &&
+       tex->op != nir_texop_txf &&
+       tex->op != nir_texop_txf_ms &&
+       tex->op != nir_texop_tg4 &&
+       tex->op != nir_texop_tex_prefetch &&
+       tex->op != nir_texop_fragment_fetch_amd)
+      return false;
+   if (!fold_16bit_destination(&tex->dest.ssa, tex->dest_type, exec_mode, rdm))
+      return false;
+
+   tex->dest_type = (tex->dest_type & ~32) | 16;
+   return true;
+}
+
+
+static bool
+fold_16bit_tex_srcs(nir_builder *b, nir_tex_instr *tex,
+                    struct nir_fold_tex_srcs_options *options)
+{
+   if (tex->op != nir_texop_tex &&
+       tex->op != nir_texop_txb &&
+       tex->op != nir_texop_txd &&
+       tex->op != nir_texop_txl &&
+       tex->op != nir_texop_txf &&
+       tex->op != nir_texop_txf_ms &&
+       tex->op != nir_texop_tg4 &&
+       tex->op != nir_texop_tex_prefetch &&
+       tex->op != nir_texop_fragment_fetch_amd &&
+       tex->op != nir_texop_fragment_mask_fetch_amd)
+      return false;
+
+   if (!(options->sampler_dims & BITFIELD_BIT(tex->sampler_dim)))
+      return false;
+
+   bool changed = false;
+   for (unsigned i = 0; i < tex->num_srcs; i++) {
+      /* Filter out sources that should be ignored. */
+      if (!(BITFIELD_BIT(tex->src[i].src_type) & options->src_types))
+         continue;
+
+      nir_src *src = &tex->src[i].src;
+
+      nir_alu_type src_type = nir_tex_instr_src_type(tex, i) | src->ssa->bit_size;
+
+      /* Zero-extension (u16) and sign-extension (i16) have
+       * the same behavior here - txf returns 0 if bit 15 is set
+       * because it's out of bounds and the higher bits don't
+       * matter.
+       */
+      if (!can_fold_16bit_src(src->ssa, src_type, false))
+         continue;
+
+      fold_16bit_src(b, &tex->instr, src, src_type);
+      changed = true;
+   }
+
+   return changed;
+}
+
+static bool
+fold_16bit_tex_image(nir_builder *b, nir_instr *instr, void *params)
+{
+   struct nir_fold_16bit_tex_image_options *options = params;
+   unsigned exec_mode = b->shader->info.float_controls_execution_mode;
    bool progress = false;

-   switch (intrinsic->intrinsic) {
-   case nir_intrinsic_bindless_image_store:
-   case nir_intrinsic_image_deref_store:
-   case nir_intrinsic_image_store:
-      progress |= fold_16bit_store_data(b, intrinsic);
-      break;
-   case nir_intrinsic_bindless_image_load:
-   case nir_intrinsic_image_deref_load:
-   case nir_intrinsic_image_load:
-      progress |= fold_16bit_load_data(b, intrinsic);
-      break;
-   default:
-      break;
+   if (instr->type == nir_instr_type_intrinsic) {
+      nir_intrinsic_instr *intrinsic = nir_instr_as_intrinsic(instr);
+
+      switch (intrinsic->intrinsic) {
+      case nir_intrinsic_bindless_image_store:
+      case nir_intrinsic_image_deref_store:
+      case nir_intrinsic_image_store:
+         if (options->fold_image_load_store_data)
+            progress |= fold_16bit_store_data(b, intrinsic);
+         break;
+      case nir_intrinsic_bindless_image_load:
+      case nir_intrinsic_image_deref_load:
+      case nir_intrinsic_image_load:
+         if (options->fold_image_load_store_data)
+            progress |= fold_16bit_load_data(b, intrinsic, exec_mode, options->rounding_mode);
+         break;
+      default:
+         break;
+      }
+   } else if (instr->type == nir_instr_type_tex) {
+      nir_tex_instr *tex = nir_instr_as_tex(instr);
+
+      if (options->fold_tex_dest)
+         progress |= fold_16bit_tex_dest(tex, exec_mode, options->rounding_mode);
+
+      for (unsigned i = 0; i < options->fold_srcs_options_count; i++) {
+         progress |= fold_16bit_tex_srcs(b, tex, &options->fold_srcs_options[i]);
+      }
    }

    return progress;
 }

-bool
-nir_fold_16bit_image_load_store_conversions(nir_shader *nir)
+bool nir_fold_16bit_tex_image(nir_shader *nir,
+                              struct nir_fold_16bit_tex_image_options *options)
 {
    return nir_shader_instructions_pass(nir,
-                                       fold_16bit_image_load_store,
+                                       fold_16bit_tex_image,
                                        nir_metadata_block_index | nir_metadata_dominance,
-                                       NULL);
+                                       options);
 }
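[Editor's note: the rewritten destination handling above replaces the old blanket bail-out on nir_has_any_rounding_mode_enabled() with a per-use check against the pass's rounding_mode option: an f2f16/f2fmp that inherits the shader's float controls only folds when those controls agree with (or don't constrain) the hardware mode, explicit f2f16_rtz/f2f16_rtne only fold when they match it exactly, and integer narrowing folds unconditionally since truncation has no rounding mode. A standalone restatement of that gating, for review purposes only (conversion_foldable() is a hypothetical helper, not something this patch adds):

   #include "nir.h"

   /* Mirrors the allow_standard/allow_rtz/allow_rtne logic in
    * fold_16bit_destination() above. "rdm" is the rounding mode the caller
    * passed in nir_fold_16bit_tex_image_options::rounding_mode. */
   static bool
   conversion_foldable(nir_op conv_op, unsigned exec_mode, nir_rounding_mode rdm)
   {
      nir_rounding_mode shader_rdm =
         nir_get_rounding_mode_from_float_controls(exec_mode, nir_type_float16);

      switch (conv_op) {
      case nir_op_f2f16:
      case nir_op_f2fmp:
         /* "Standard" conversions take the shader's fp16 float controls. */
         return shader_rdm == rdm || shader_rdm == nir_rounding_mode_undef;
      case nir_op_f2f16_rtz:
         return rdm == nir_rounding_mode_rtz;
      case nir_op_f2f16_rtne:
         return rdm == nir_rounding_mode_rtne;
      default:
         return false;
      }
   }
]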
diff --git a/src/freedreno/ir3/ir3_nir.c b/src/freedreno/ir3/ir3_nir.c
index 93c43f488a7..9cc82cb2824 100644
--- a/src/freedreno/ir3/ir3_nir.c
+++ b/src/freedreno/ir3/ir3_nir.c
@@ -771,20 +771,27 @@ ir3_nir_lower_variant(struct ir3_shader_variant *so, nir_shader *s)
    * coordinates that had been upconverted to 32-bits just for the
    * sampler to just be 16-bit texture sources.
    */
-   OPT(s, nir_fold_16bit_sampler_conversions,
-       (1 << nir_tex_src_coord) |
-       (1 << nir_tex_src_lod) |
-       (1 << nir_tex_src_bias) |
-       (1 << nir_tex_src_comparator) |
-       (1 << nir_tex_src_min_lod) |
-       (1 << nir_tex_src_ms_index) |
-       (1 << nir_tex_src_ddx) |
-       (1 << nir_tex_src_ddy),
-       ~0);
+   struct nir_fold_tex_srcs_options fold_srcs_options = {
+      .sampler_dims = ~0,
+      .src_types = (1 << nir_tex_src_coord) |
+                   (1 << nir_tex_src_lod) |
+                   (1 << nir_tex_src_bias) |
+                   (1 << nir_tex_src_comparator) |
+                   (1 << nir_tex_src_min_lod) |
+                   (1 << nir_tex_src_ms_index) |
+                   (1 << nir_tex_src_ddx) |
+                   (1 << nir_tex_src_ddy),
+   };
+   struct nir_fold_16bit_tex_image_options fold_16bit_options = {
+      .rounding_mode = nir_rounding_mode_rtz,
+      .fold_tex_dest = true,
+      /* blob dumps have no half regs on pixel 2's ldib or stib, so only enable for a6xx+. */
+      .fold_image_load_store_data = so->compiler->gen >= 6,
+      .fold_srcs_options_count = 1,
+      .fold_srcs_options = &fold_srcs_options,
+   };
+   OPT(s, nir_fold_16bit_tex_image, &fold_16bit_options);

-   /* blob dumps have no half regs on pixel 2's ldib or stib, so only enable for a6xx+. */
-   if (so->compiler->gen >= 6)
-      OPT(s, nir_fold_16bit_image_load_store_conversions);

    /* Now that we stripped off the 16-bit conversions, legalize so that we
     * don't have a mix of 16- and 32-bit args that will need to be
diff --git a/src/gallium/drivers/radeonsi/si_shader_nir.c b/src/gallium/drivers/radeonsi/si_shader_nir.c
index 0c64689ccef..24631ea3ae9 100644
--- a/src/gallium/drivers/radeonsi/si_shader_nir.c
+++ b/src/gallium/drivers/radeonsi/si_shader_nir.c
@@ -186,15 +186,21 @@ static void si_late_optimize_16bit_samplers(struct si_screen *sscreen, nir_shade
    };

    bool changed = false;
-   uint32_t sampler_dims = UINT32_MAX;
-   /* Skip because AMD doesn't support 16-bit types with these. */
-   sampler_dims &= ~BITFIELD_BIT(GLSL_SAMPLER_DIM_CUBE);
-   NIR_PASS(changed, nir, nir_fold_16bit_sampler_conversions,
-            (1 << nir_tex_src_coord) |
-            (has_g16 ? 1 << nir_tex_src_ddx : 0),
-            sampler_dims);
+   struct nir_fold_tex_srcs_options fold_srcs_options = {
+      .sampler_dims = ~BITFIELD_BIT(GLSL_SAMPLER_DIM_CUBE),
+      .src_types = (1 << nir_tex_src_coord) |
+                   (has_g16 ? 1 << nir_tex_src_ddx : 0),
+   };
+   struct nir_fold_16bit_tex_image_options fold_16bit_options = {
+      .rounding_mode = nir_rounding_mode_rtne,
+      .fold_tex_dest = true,
+      .fold_image_load_store_data = true,
+      .fold_srcs_options_count = 1,
+      .fold_srcs_options = &fold_srcs_options,
+   };
+   NIR_PASS(changed, nir, nir_fold_16bit_tex_image, &fold_16bit_options);
+
    NIR_PASS(changed, nir, nir_legalize_16bit_sampler_srcs, tex_constraints);
-   NIR_PASS(changed, nir, nir_fold_16bit_image_load_store_conversions);

    if (changed) {
       si_nir_opts(sscreen, nir, false);
diff --git a/src/gallium/frontends/lavapipe/lvp_pipeline.c b/src/gallium/frontends/lavapipe/lvp_pipeline.c
index f9e5c9e6dd9..64f7d238b66 100644
--- a/src/gallium/frontends/lavapipe/lvp_pipeline.c
+++ b/src/gallium/frontends/lavapipe/lvp_pipeline.c
@@ -1078,8 +1078,11 @@ lvp_shader_compile_to_ir(struct lvp_pipeline *pipeline,

    // TODO: also optimize the tex srcs. see radeonSI for reference */
    /* Skip if there are potentially conflicting rounding modes */
-   if (!nir_has_any_rounding_mode_enabled(nir->info.float_controls_execution_mode))
-      NIR_PASS_V(nir, nir_fold_16bit_sampler_conversions, 0, UINT32_MAX);
+   struct nir_fold_16bit_tex_image_options fold_16bit_options = {
+      .rounding_mode = nir_rounding_mode_undef,
+      .fold_tex_dest = true,
+   };
+   NIR_PASS_V(nir, nir_fold_16bit_tex_image, &fold_16bit_options);

    lvp_shader_optimize(nir);
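[Editor's note: fold_srcs_options is a count-plus-array rather than a single struct, so one invocation can apply different source-type masks to different sampler dimensions; every entry is tried independently for each tex instruction. None of the drivers above need more than one entry yet, so the sketch below is purely hypothetical and its masks are made up for illustration:

   /* Fold coords/LOD/derivatives for non-cube samplers, but only coords for
    * cube maps. Sketch only -- not part of this patch. */
   struct nir_fold_tex_srcs_options srcs_options[] = {
      {
         .sampler_dims = ~BITFIELD_BIT(GLSL_SAMPLER_DIM_CUBE),
         .src_types = (1 << nir_tex_src_coord) | (1 << nir_tex_src_lod) |
                      (1 << nir_tex_src_ddx) | (1 << nir_tex_src_ddy),
      },
      {
         .sampler_dims = BITFIELD_BIT(GLSL_SAMPLER_DIM_CUBE),
         .src_types = (1 << nir_tex_src_coord),
      },
   };
   struct nir_fold_16bit_tex_image_options options = {
      .rounding_mode = nir_rounding_mode_rtne,
      .fold_tex_dest = true,
      .fold_srcs_options_count = ARRAY_SIZE(srcs_options),
      .fold_srcs_options = srcs_options,
   };
   NIR_PASS_V(nir, nir_fold_16bit_tex_image, &options);
]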