diff --git a/src/amd/vulkan/radv_pipeline_graphics.c b/src/amd/vulkan/radv_pipeline_graphics.c
index bff36b7d5d4..8c99d3adcf0 100644
--- a/src/amd/vulkan/radv_pipeline_graphics.c
+++ b/src/amd/vulkan/radv_pipeline_graphics.c
@@ -1665,8 +1665,8 @@ radv_graphics_shaders_link_varyings(struct radv_shader_stage *stages)
       if (next != MESA_SHADER_NONE && stages[next].nir && next != MESA_SHADER_FRAGMENT &&
           !stages[s].key.optimisations_disabled && !stages[next].key.optimisations_disabled) {
          nir_shader *consumer = stages[next].nir;
-         NIR_PASS(_, producer, nir_opt_vectorize_io, nir_var_shader_out);
-         NIR_PASS(_, consumer, nir_opt_vectorize_io, nir_var_shader_in);
+         NIR_PASS(_, producer, nir_opt_vectorize_io, nir_var_shader_out, false);
+         NIR_PASS(_, consumer, nir_opt_vectorize_io, nir_var_shader_in, false);
       }
 
       /* Gather shader info; at least the I/O info likely changed
diff --git a/src/compiler/glsl/gl_nir_linker.c b/src/compiler/glsl/gl_nir_linker.c
index 770955502c9..0d2acfdb524 100644
--- a/src/compiler/glsl/gl_nir_linker.c
+++ b/src/compiler/glsl/gl_nir_linker.c
@@ -1505,7 +1505,7 @@ gl_nir_lower_optimize_varyings(const struct gl_constants *consts,
        */
       NIR_PASS(_, nir, nir_lower_io_to_scalar, get_varying_nir_var_mask(nir),
               NULL, NULL);
-      NIR_PASS(_, nir, nir_opt_vectorize_io, get_varying_nir_var_mask(nir));
+      NIR_PASS(_, nir, nir_opt_vectorize_io, get_varying_nir_var_mask(nir), false);
       return;
    }
 
@@ -1569,7 +1569,7 @@ gl_nir_lower_optimize_varyings(const struct gl_constants *consts,
       nir_shader *nir = shaders[i];
 
       /* Re-vectorize IO. */
-      NIR_PASS(_, nir, nir_opt_vectorize_io, get_varying_nir_var_mask(nir));
+      NIR_PASS(_, nir, nir_opt_vectorize_io, get_varying_nir_var_mask(nir), false);
 
      /* Recompute intrinsic bases, which are totally random after
       * optimizations and compaction. Do that for all inputs and outputs,
diff --git a/src/compiler/nir/nir.h b/src/compiler/nir/nir.h
index 3b156f99257..b9a68969163 100644
--- a/src/compiler/nir/nir.h
+++ b/src/compiler/nir/nir.h
@@ -6264,7 +6264,8 @@ bool nir_opt_uniform_subgroup(nir_shader *shader,
 bool nir_opt_vectorize(nir_shader *shader, nir_vectorize_cb filter,
                        void *data);
 
-bool nir_opt_vectorize_io(nir_shader *shader, nir_variable_mode modes);
+bool nir_opt_vectorize_io(nir_shader *shader, nir_variable_mode modes,
+                          bool allow_holes);
 
 bool nir_opt_move_discards_to_top(nir_shader *shader);
 
diff --git a/src/compiler/nir/nir_opt_vectorize_io.c b/src/compiler/nir/nir_opt_vectorize_io.c
index fd21f09bbb8..3493c7b53db 100644
--- a/src/compiler/nir/nir_opt_vectorize_io.c
+++ b/src/compiler/nir/nir_opt_vectorize_io.c
@@ -148,6 +148,9 @@ vectorize_load(nir_intrinsic_instr *chan[8], unsigned start, unsigned count,
     * inserted.
     */
    for (unsigned i = start; i < start + count; i++) {
+      if (!chan[i])
+         continue;
+
       first = !first || chan[i]->instr.index < first->instr.index ? chan[i] : first;
       if (step == merge_low_high_16_to_32) {
          first = !first || chan[4 + i]->instr.index < first->instr.index ? chan[4 + i] : first;
@@ -205,7 +208,8 @@ vectorize_load(nir_intrinsic_instr *chan[8], unsigned start, unsigned count,
       }
    } else {
       for (unsigned i = start; i < start + count; i++) {
-         nir_def_replace(&chan[i]->def, nir_channel(&b, def, i - start));
+         if (chan[i])
+            nir_def_replace(&chan[i]->def, nir_channel(&b, def, i - start));
       }
    }
 }
@@ -360,9 +364,11 @@ vectorize_store(nir_intrinsic_instr *chan[8], unsigned start, unsigned count,
  * (the last 4 are the high 16-bit channels)
  */
 static bool
-vectorize_slot(nir_intrinsic_instr *chan[8], unsigned mask)
+vectorize_slot(nir_intrinsic_instr *chan[8], unsigned mask, bool allow_holes)
 {
    bool progress = false;
+   assert(mask);
+   bool is_load = nir_intrinsic_infos[chan[ffs(mask) - 1]->intrinsic].has_dest;
    /* First, merge low and high 16-bit halves into 32 bits separately when
     * possible. Then vectorize what's left.
     */
@@ -407,8 +413,18 @@ vectorize_slot(nir_intrinsic_instr *chan[8], unsigned mask)
       } else if (step == vectorize_high_16_separately) {
          scan_mask = mask & BITFIELD_RANGE(4, 4);
          mask &= ~scan_mask;
+
+         if (is_load && allow_holes) {
+            unsigned num = util_last_bit(scan_mask);
+            scan_mask = BITFIELD_RANGE(4, num - 4);
+         }
       } else {
          scan_mask = mask;
+
+         if (is_load && allow_holes) {
+            unsigned num = util_last_bit(scan_mask);
+            scan_mask = BITFIELD_MASK(num);
+         }
       }
 
       while (scan_mask) {
@@ -419,8 +435,6 @@ vectorize_slot(nir_intrinsic_instr *chan[8], unsigned mask)
         if (count == 1 && step != merge_low_high_16_to_32)
            continue; /* There is nothing to vectorize. */
 
-        bool is_load = nir_intrinsic_infos[chan[start]->intrinsic].has_dest;
-
        if (is_load)
           vectorize_load(chan, start, count, step);
        else
@@ -434,7 +448,7 @@ vectorize_slot(nir_intrinsic_instr *chan[8], unsigned mask)
 }
 
 static bool
-vectorize_batch(struct util_dynarray *io_instructions)
+vectorize_batch(struct util_dynarray *io_instructions, bool allow_holes)
 {
    unsigned num_instr = util_dynarray_num_elements(io_instructions, void *);
 
@@ -473,7 +487,7 @@ vectorize_batch(struct util_dynarray *io_instructions)
       if (prev && compare_is_not_vectorizable(prev, *intr)) {
          /* We need at least 2 instructions to have something to do. */
         if (util_bitcount(chan_mask) > 1)
-            progress |= vectorize_slot(chan, chan_mask);
+            progress |= vectorize_slot(chan, chan_mask, allow_holes);
 
         prev = NULL;
         memset(chan, 0, sizeof(chan));
@@ -497,15 +511,28 @@ vectorize_batch(struct util_dynarray *io_instructions)
 
    /* Vectorize the last group. */
    if (prev && util_bitcount(chan_mask) > 1)
-      progress |= vectorize_slot(chan, chan_mask);
+      progress |= vectorize_slot(chan, chan_mask, allow_holes);
 
    /* Clear the array. The next block will reuse it. */
    util_dynarray_clear(io_instructions);
    return progress;
 }
 
+/* Vectorize lowered IO (load_input/store_output/...).
+ *
+ * modes specifies whether to vectorize inputs and/or outputs.
+ *
+ * allow_holes enables vectorization of loads with holes, e.g.:
+ *    load X; load W; ==> load XYZW;
+ *
+ * This is useful for VS input loads where it might not be possible to skip
+ * loading unused components, e.g. with AMD where loading W also loads XYZ,
+ * so if we also load X separately again, it's wasteful. It's better to get
+ * X from the vector that loads (XYZ)W.
+ */
 bool
-nir_opt_vectorize_io(nir_shader *shader, nir_variable_mode modes)
+nir_opt_vectorize_io(nir_shader *shader, nir_variable_mode modes,
+                     bool allow_holes)
 {
    assert(!(modes & ~(nir_var_shader_in | nir_var_shader_out)));
 
@@ -520,8 +547,10 @@ nir_opt_vectorize_io(nir_shader *shader, nir_variable_mode modes)
        * but that is only done when outputs are ignored, so vectorize them
        * separately.
        */
-      bool progress_in = nir_opt_vectorize_io(shader, nir_var_shader_in);
-      bool progress_out = nir_opt_vectorize_io(shader, nir_var_shader_out);
+      bool progress_in = nir_opt_vectorize_io(shader, nir_var_shader_in,
+                                              allow_holes);
+      bool progress_out = nir_opt_vectorize_io(shader, nir_var_shader_out,
+                                               allow_holes);
       return progress_in || progress_out;
    }
 
@@ -584,7 +613,7 @@ nir_opt_vectorize_io(nir_shader *shader, nir_variable_mode modes)
             */
            if (BITSET_TEST(is_load ? has_output_stores : has_output_loads,
                            index)) {
-               progress |= vectorize_batch(&io_instructions);
+               progress |= vectorize_batch(&io_instructions, allow_holes);
               BITSET_ZERO(has_output_loads);
               BITSET_ZERO(has_output_stores);
            }
@@ -595,7 +624,7 @@ nir_opt_vectorize_io(nir_shader *shader, nir_variable_mode modes)
            /* Don't vectorize across TCS barriers. */
            if (modes & nir_var_shader_out &&
                nir_intrinsic_memory_modes(intr) & nir_var_shader_out) {
-               progress |= vectorize_batch(&io_instructions);
+               progress |= vectorize_batch(&io_instructions, allow_holes);
               BITSET_ZERO(has_output_loads);
               BITSET_ZERO(has_output_stores);
            }
@@ -603,7 +632,7 @@ nir_opt_vectorize_io(nir_shader *shader, nir_variable_mode modes)
 
         case nir_intrinsic_emit_vertex:
            /* Don't vectorize across GS emits. */
-            progress |= vectorize_batch(&io_instructions);
+            progress |= vectorize_batch(&io_instructions, allow_holes);
            BITSET_ZERO(has_output_loads);
            BITSET_ZERO(has_output_stores);
            continue;
@@ -622,7 +651,7 @@ nir_opt_vectorize_io(nir_shader *shader, nir_variable_mode modes)
          BITSET_SET(is_load ? has_output_loads : has_output_stores, index);
       }
 
-      progress |= vectorize_batch(&io_instructions);
+      progress |= vectorize_batch(&io_instructions, allow_holes);
    }
 
    nir_progress(progress, impl,
diff --git a/src/imagination/pco/pco_nir.c b/src/imagination/pco/pco_nir.c
index 840c03fb50f..1d8558206b1 100644
--- a/src/imagination/pco/pco_nir.c
+++ b/src/imagination/pco/pco_nir.c
@@ -287,7 +287,7 @@ void pco_lower_nir(pco_ctx *ctx, nir_shader *nir, pco_data *data)
    if (nir->info.stage != MESA_SHADER_FRAGMENT)
       vec_modes |= nir_var_shader_out;
 
-   NIR_PASS(_, nir, nir_opt_vectorize_io, vec_modes);
+   NIR_PASS(_, nir, nir_opt_vectorize_io, vec_modes, false);
 
    /* Special case for frag coords:
    * - x,y come from (non-consecutive) special regs - always scalar.
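
Every call site updated above passes allow_holes=false, so the patch is behaviorally neutral on its own; a backend that cannot skip unused vertex-input components (per the new comment on nir_opt_vectorize_io) would pass true instead. The snippet below is a minimal, self-contained sketch, not part of the patch, of the scan-mask widening the allow_holes branches in vectorize_slot() perform. last_bit(), MASK() and RANGE() are local stand-ins for Mesa's util_last_bit(), BITFIELD_MASK() and BITFIELD_RANGE(), and the channel masks are made-up example values.

/* Standalone sketch of the allow_holes scan-mask widening. */
#include <stdio.h>

static unsigned
last_bit(unsigned v)                 /* stand-in for util_last_bit() */
{
   return v ? 32 - __builtin_clz(v) : 0;
}

#define MASK(n)     ((n) >= 32 ? ~0u : (1u << (n)) - 1)  /* BITFIELD_MASK */
#define RANGE(b, n) (MASK(n) << (b))                     /* BITFIELD_RANGE */

int
main(void)
{
   /* A 32-bit slot where only channels X and W are loaded: bits 0 and 3. */
   unsigned mask = RANGE(0, 1) | RANGE(3, 1);            /* 0b1001 */

   /* Without allow_holes, only contiguous channel runs are merged, so
    * "load X; load W" stays as two scalar loads.
    */
   unsigned scan_mask = mask;

   /* With allow_holes, the scan mask is widened up to the last used
    * channel, so one "load XYZW" is emitted; the Y/Z results are simply
    * never read (vectorize_load() skips the NULL chan[] entries).
    */
   unsigned widened = MASK(last_bit(mask));              /* 0b1111 */

   /* The high 16-bit half (channels 4..7) gets the same treatment,
    * offset by 4, matching BITFIELD_RANGE(4, num - 4) in the patch.
    */
   unsigned high_mask = RANGE(4, 1) | RANGE(7, 1);            /* 0x90 */
   unsigned high_widened = RANGE(4, last_bit(high_mask) - 4); /* 0xf0 */

   printf("low:  %#x -> %#x\n", scan_mask, widened);
   printf("high: %#x -> %#x\n", high_mask, high_widened);
   return 0;
}

The widening is gated on is_load: a store widened across a hole would have to write channels the shader never produced, whereas a load can leave the missing channels untouched, which is exactly what the new !chan[i] checks in vectorize_load() handle.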