diff --git a/src/intel/compiler/brw/brw_from_nir.cpp b/src/intel/compiler/brw/brw_from_nir.cpp index 40f18a0517b..f1aa0d5ac1a 100644 --- a/src/intel/compiler/brw/brw_from_nir.cpp +++ b/src/intel/compiler/brw/brw_from_nir.cpp @@ -4573,10 +4573,10 @@ get_nir_buffer_intrinsic_index(nir_to_brw_state &ntb, const brw_builder &bld, static unsigned choose_block_size_dwords(const intel_device_info *devinfo, unsigned dwords) { - const unsigned min_block = 8; + const unsigned min_block = devinfo->has_lsc ? 1 : 4; const unsigned max_block = devinfo->has_lsc ? 64 : 32; - const unsigned block = 1 << util_logbase2(dwords); + const unsigned block = dwords > 4 ? 1 << util_logbase2(dwords) : dwords; return CLAMP(block, min_block, max_block); } @@ -6188,23 +6188,32 @@ brw_from_nir_emit_memory_access(nir_to_brw_state &ntb, unsigned first_read_component = 0; if (convergent_block_load) { - /* If the address is a constant and alignment permits, skip unread - * leading and trailing components. (It's probably not worth the + /* If the address is a constant and alignment permits, skip as many + * unread leading and trailing components as we can without splitting + * the load into more smaller blocks. (It's probably not worth the * extra address math for non-constant addresses.) * * Note that SLM block loads on HDC platforms need to be 16B aligned. 
*/ if (srcs[MEMORY_LOGICAL_ADDRESS].file == IMM && - alignment >= data_bit_size / 8 && - (devinfo->has_lsc || mode != MEMORY_MODE_SHARED_LOCAL)) { + alignment >= nir_bit_size / 8) { first_read_component = nir_def_first_component_read(&instr->def); - unsigned last_component = nir_def_last_component_read(&instr->def); + unsigned last_component = nir_def_last_component_read(&instr->def) + 1; + if (!devinfo->has_lsc && mode == MEMORY_MODE_SHARED_LOCAL) { + first_read_component = ROUND_DOWN_TO(first_read_component, 4); + last_component = align(last_component, 4); + } + total = last_component - first_read_component; + total = brw_uniform_block_size(devinfo, total); + first_read_component = + total >= last_component ? 0 : last_component - total; + components = MIN2(components, last_component) - first_read_component; srcs[MEMORY_LOGICAL_ADDRESS].u64 += - first_read_component * (data_bit_size / 8); - components = last_component - first_read_component + 1; + first_read_component * (nir_bit_size / 8); + } else { + total = brw_uniform_block_size(devinfo, components); } - total = align(components, REG_SIZE * reg_unit(devinfo) / 4); dest = ubld.vgrf(BRW_TYPE_UD, total); } else { total = components * bld.dispatch_width(); @@ -6218,6 +6227,11 @@ brw_from_nir_emit_memory_access(nir_to_brw_state &ntb, unsigned block_comps = choose_block_size_dwords(devinfo, total - done); const unsigned block_bytes = block_comps * (nir_bit_size / 8); + /* Our current choice of block sizes and 32-bit data type will + * always give us a GRF-aligned offset into dest + */ + assert(done % (REG_SIZE / 4 * reg_unit(devinfo)) == 0); + brw_reg dst_offset = is_store ? 
brw_reg() : retype(byte_offset(dest, done * 4), BRW_TYPE_UD); if (is_store) { @@ -6228,7 +6242,7 @@ brw_from_nir_emit_memory_access(nir_to_brw_state &ntb, mem = ubld.emit(opcode, dst_offset, srcs, MEMORY_LOGICAL_NUM_SRCS)->as_mem(); mem->has_no_mask_send_params = no_mask_handle; if (is_load) - mem->size_written = block_bytes; + mem->size_written = align(block_bytes, REG_SIZE * reg_unit(devinfo)); mem->lsc_op = op; mem->mode = *mode; mem->binding_type = *binding_type; diff --git a/src/intel/compiler/brw/brw_nir.h b/src/intel/compiler/brw/brw_nir.h index 6cefd4be113..62eb3a5db9f 100644 --- a/src/intel/compiler/brw/brw_nir.h +++ b/src/intel/compiler/brw/brw_nir.h @@ -326,6 +326,24 @@ bool brw_nir_should_vectorize_mem(unsigned align_mul, unsigned align_offset, nir_intrinsic_instr *high, void *data); +/** + * Gets the size of a nir_load_*_uniform_block_intel after it's lowered + * by the backend to a block load message; note that page faults can + * happen if this is not accounted for when using these intrinsics. + */ +static inline unsigned +brw_uniform_block_size(const struct intel_device_info *devinfo, + unsigned num_components) +{ + /* Round up to a supported block size, or to the nearest multiple of + * 16 components if it's any larger. + */ + return num_components > 8 ? align(num_components, 16) + : num_components > 4 ? 8 + : !devinfo->has_lsc ? 4 + : num_components; +} + void brw_nir_optimize(struct brw_pass_tracker *pt); #define BRW_NIR_FRAG_OUTPUT_INDEX_SHIFT 0