diff --git a/src/intel/compiler/brw_nir.c b/src/intel/compiler/brw_nir.c
index c5f2486b27d..c2143d4aa47 100644
--- a/src/intel/compiler/brw_nir.c
+++ b/src/intel/compiler/brw_nir.c
@@ -1665,9 +1665,26 @@ brw_vectorize_lower_mem_access(nir_shader *nir,
     * - reduced register pressure
     */
    nir_divergence_analysis(nir);
 
-   if (OPT(intel_nir_blockify_uniform_loads, compiler->devinfo))
+   if (OPT(intel_nir_blockify_uniform_loads, compiler->devinfo)) {
       OPT(nir_opt_load_store_vectorize, &options);
+      OPT(nir_opt_constant_folding);
+      OPT(nir_copy_prop);
+
+      if (OPT(brw_nir_rebase_const_offset_ubo_loads)) {
+         OPT(nir_opt_cse);
+         OPT(nir_copy_prop);
+
+         nir_load_store_vectorize_options ubo_options = {
+            .modes = nir_var_mem_ubo,
+            .callback = brw_nir_should_vectorize_mem,
+            .robust_modes = options.robust_modes & nir_var_mem_ubo,
+         };
+
+         OPT(nir_opt_load_store_vectorize, &ubo_options);
+      }
+   }
+
    nir_lower_mem_access_bit_sizes_options mem_access_options = {
       .modes = nir_var_mem_ssbo |
               nir_var_mem_constant |
diff --git a/src/intel/compiler/intel_nir.h b/src/intel/compiler/intel_nir.h
index fcb7262eedd..54a123a73f4 100644
--- a/src/intel/compiler/intel_nir.h
+++ b/src/intel/compiler/intel_nir.h
@@ -14,6 +14,7 @@ extern "C" {
 struct intel_device_info;
 
 void intel_nir_apply_tcs_quads_workaround(nir_shader *nir);
+bool brw_nir_rebase_const_offset_ubo_loads(nir_shader *shader);
 bool intel_nir_blockify_uniform_loads(nir_shader *shader,
                                       const struct intel_device_info *devinfo);
 bool intel_nir_clamp_image_1d_2d_array_sizes(nir_shader *shader);
diff --git a/src/intel/compiler/intel_nir_blockify_uniform_loads.c b/src/intel/compiler/intel_nir_blockify_uniform_loads.c
index c2f25bff260..2595075da19 100644
--- a/src/intel/compiler/intel_nir_blockify_uniform_loads.c
+++ b/src/intel/compiler/intel_nir_blockify_uniform_loads.c
@@ -26,6 +26,128 @@
 #include "isl/isl.h"
 #include "nir_builder.h"
 
+static bool
+rebase_const_offset_ubo_loads_instr(nir_builder *b,
+                                    nir_instr *instr,
+                                    void *cb_data)
+{
+   if (instr->type != nir_instr_type_intrinsic)
+      return false;
+
+   nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr);
+   if (intrin->intrinsic != nir_intrinsic_load_ubo_uniform_block_intel)
+      return false;
+
+   if (!nir_src_is_const(intrin->src[1]))
+      return false;
+
+   const unsigned type_bytes = intrin->def.bit_size / 8;
+   const unsigned cacheline_bytes = 64;
+   const unsigned block_components =
+      MIN2(cacheline_bytes / type_bytes, NIR_MAX_VEC_COMPONENTS);
+
+   const unsigned orig_offset = nir_src_as_uint(intrin->src[1]);
+   const unsigned new_offset = ROUND_DOWN_TO(orig_offset, cacheline_bytes);
+
+   const unsigned orig_def_components = intrin->def.num_components;
+   const unsigned orig_read_components =
+      nir_def_last_component_read(&intrin->def) + 1;
+   const unsigned pad_components = (orig_offset - new_offset) / type_bytes;
+
+   /* Don't round down if we'd have to split a single load into two loads */
+   if (orig_read_components + pad_components > block_components)
+      return false;
+
+   /* Always read a full block so we can CSE reads of different sizes.
+    * The backend will skip reading unused trailing components anyway.
+    */
+   intrin->def.num_components = block_components;
+   intrin->num_components = block_components;
+   nir_intrinsic_set_range_base(intrin, new_offset);
+   nir_intrinsic_set_range(intrin, block_components * type_bytes);
+   nir_intrinsic_set_align_offset(intrin, 0);
+
+   if (pad_components) {
+      /* Change the base of the load to the new lower offset, and emit
+       * moves to read from the now higher vector component locations.
+       */
+      b->cursor = nir_before_instr(instr);
+      nir_src_rewrite(&intrin->src[1], nir_imm_int(b, new_offset));
+   }
+
+   b->cursor = nir_after_instr(instr);
+
+   nir_scalar components[NIR_MAX_VEC_COMPONENTS];
+   nir_scalar undef = nir_get_scalar(nir_undef(b, 1, type_bytes * 8), 0);
+   unsigned i = 0;
+   for (; i < orig_read_components; i++)
+      components[i] = nir_get_scalar(&intrin->def, pad_components + i);
+   for (; i < orig_def_components; i++)
+      components[i] = undef;
+
+   nir_def *rebase = nir_vec_scalars(b, components, orig_def_components);
+   rebase->divergent = false;
+
+   nir_def_rewrite_uses_after(&intrin->def, rebase, rebase->parent_instr);
+
+   return true;
+}
+
+/**
+ * Shaders commonly contain small UBO loads with a constant offset scattered
+ * throughout the program. Ideally, we want to vectorize those into larger
+ * block loads so we can load whole cachelines at a time, or at least fill
+ * whole 32B registers rather than having empty space.
+ *
+ * nir_opt_load_store_vectorize() is terrific for combining small loads into
+ * nice large block loads. Unfortunately, it only vectorizes within a single
+ * basic block, and there's a lot of opportunity for optimizing globally.
+ *
+ * In the past, our backend loaded whole 64B cachelines at a time (on pre-Xe2,
+ * two registers) and rounded down constant UBO load offsets to the nearest
+ * multiple of 64B. This meant multiple loads within the same 64B would be
+ * CSE'd into the same load, and we could even take advantage of global CSE.
+ * However, we didn't have a method for shrinking loads from 64B back to 32B
+ * again, and also didn't have a lot of flexibility in how this interacted
+ * with the NIR load/store vectorization.
+ *
+ * This pass takes a similar approach, but in NIR. The idea is to:
+ *
+ * 1. Run load/store vectorization to combine access within a basic block
+ *
+ * 2. Find load_ubo_uniform_block_intel intrinsics with constant offsets.
+ *    Round their base down to the nearest multiple of 64B, and also increase
+ *    their returned vector to be a vec16 (64B for 32-bit values). However,
+ *    only do this if a single vec16 load would cover this additional "pad"
+ *    space at the front, and all used components of the existing load. That
+ *    way, we don't blindly turn a single load into two loads.
+ *
+ * If we made any progress, then...
+ *
+ * 3. Run global CSE. This will coalesce any accesses to the same 64B
+ *    region across subtrees of the CFG.
+ *
+ * 4. Run the load/store vectorizer again for UBOs. This will clean up
+ *    any overlapping memory access within a block.
+ *
+ * 5. Have the backend only issue loads for components of the vec16 which
+ *    are actually read. We could also shrink this in NIR, but doing it in
+ *    the backend is pretty straightforward.
+ *
+ * We could probably do better with a fancier sliding-window type pass
+ * which looked across blocks to produce optimal loads. However, this
+ * simple hack using existing passes does a fairly good job for now.
+ */
+bool
+brw_nir_rebase_const_offset_ubo_loads(nir_shader *shader)
+{
+   return nir_shader_instructions_pass(shader,
+                                       rebase_const_offset_ubo_loads_instr,
+                                       nir_metadata_control_flow |
+                                       nir_metadata_live_defs,
+                                       NULL);
+}
+
 static bool
 intel_nir_blockify_uniform_loads_instr(nir_builder *b,
                                        nir_instr *instr,
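
For reference, here is a minimal standalone sketch of the offset-rebase arithmetic that rebase_const_offset_ubo_loads_instr() performs, assuming a 64B cacheline and the vec16 component limit. The rebase_plan struct, plan_rebase() helper, and EXAMPLE_* constants are illustrative stand-ins invented for this sketch; they are not part of the patch or of NIR.

/* Illustrative only: mirrors the rounding and the "don't split one load
 * into two" check from the pass above, without any NIR data structures.
 */
#include <stdbool.h>
#include <stdio.h>

#define EXAMPLE_MAX_VEC_COMPONENTS 16
#define EXAMPLE_CACHELINE_BYTES    64

struct rebase_plan {
   unsigned new_offset;       /* load base rounded down to a 64B boundary */
   unsigned pad_components;   /* components skipped at the front of the block */
   unsigned block_components; /* new vector width of the block load */
};

static bool
plan_rebase(unsigned orig_offset, unsigned read_components,
            unsigned bit_size, struct rebase_plan *plan)
{
   const unsigned type_bytes = bit_size / 8;
   const unsigned max_block = EXAMPLE_CACHELINE_BYTES / type_bytes;
   const unsigned block_components =
      max_block < EXAMPLE_MAX_VEC_COMPONENTS ? max_block
                                             : EXAMPLE_MAX_VEC_COMPONENTS;

   /* Round the constant offset down to the start of its cacheline. */
   const unsigned new_offset =
      orig_offset - (orig_offset % EXAMPLE_CACHELINE_BYTES);
   const unsigned pad_components = (orig_offset - new_offset) / type_bytes;

   /* Bail out rather than turning a single load into two loads. */
   if (read_components + pad_components > block_components)
      return false;

   plan->new_offset = new_offset;
   plan->pad_components = pad_components;
   plan->block_components = block_components;
   return true;
}

int
main(void)
{
   /* A vec2 of 32-bit values loaded from constant offset 76. */
   struct rebase_plan plan;
   if (plan_rebase(76, 2, 32, &plan)) {
      printf("offset %u -> %u, pad %u, vec%u\n",
             76u, plan.new_offset, plan.pad_components, plan.block_components);
   }
   return 0;
}

With a vec2 of 32-bit values at constant offset 76, this prints "offset 76 -> 64, pad 3, vec16": the load is widened to a full cacheline at offset 64 and the original data ends up in components 3..4 of the block load, which is the case the pad_components/block_components logic in the pass handles before the later CSE and re-vectorization steps clean up overlapping accesses.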