diff --git a/src/intel/compiler/brw/brw_eu_defines.h b/src/intel/compiler/brw/brw_eu_defines.h index 29801caa6de..4b3554b71e8 100644 --- a/src/intel/compiler/brw/brw_eu_defines.h +++ b/src/intel/compiler/brw/brw_eu_defines.h @@ -669,6 +669,10 @@ enum memory_flags { * fusion (Gfx12.x only). */ MEMORY_FLAG_FUSED_EU_DISABLE = 1 << 4, + /** Whether this memory load can be arbitrarily reordered or CSE'd + * with other loads. Derived from nir_intrinsic_can_reorder() when the access is emitted from NIR. + */ + MEMORY_FLAG_CAN_REORDER = 1 << 5, }; enum rt_logical_srcs { diff --git a/src/intel/compiler/brw/brw_from_nir.cpp b/src/intel/compiler/brw/brw_from_nir.cpp index 13902d4ad67..ffd023e0a58 100644 --- a/src/intel/compiler/brw/brw_from_nir.cpp +++ b/src/intel/compiler/brw/brw_from_nir.cpp @@ -5930,13 +5930,15 @@ brw_from_nir_emit_memory_access(nir_to_brw_state &ntb, (nir_intrinsic_access(instr) & ACCESS_COHERENT); const bool fused_eu_disable = nir_intrinsic_has_access(instr) && (nir_intrinsic_access(instr) & ACCESS_FUSED_EU_DISABLE_INTEL); + const bool can_reorder = nir_intrinsic_can_reorder(instr); /* folded into flags below as MEMORY_FLAG_CAN_REORDER */ const unsigned alignment = nir_intrinsic_has_align(instr) ? nir_intrinsic_align(instr) : 0; uint8_t flags = (include_helpers ? MEMORY_FLAG_INCLUDE_HELPERS : 0) | (volatile_access ? MEMORY_FLAG_VOLATILE_ACCESS : 0) | (coherent_access ? MEMORY_FLAG_COHERENT_ACCESS : 0) | - (fused_eu_disable ? MEMORY_FLAG_FUSED_EU_DISABLE : 0); + (fused_eu_disable ? MEMORY_FLAG_FUSED_EU_DISABLE : 0) | + (can_reorder ? 
MEMORY_FLAG_CAN_REORDER : 0); bool no_mask_handle = false; uint8_t coord_components = 1; diff --git a/src/intel/compiler/brw/brw_lower_logical_sends.cpp b/src/intel/compiler/brw/brw_lower_logical_sends.cpp index 7fbe16f85ea..f141f0b7c55 100644 --- a/src/intel/compiler/brw/brw_lower_logical_sends.cpp +++ b/src/intel/compiler/brw/brw_lower_logical_sends.cpp @@ -1192,6 +1192,7 @@ lower_lsc_memory_logical_send(const brw_builder &bld, brw_mem_inst *mem) const bool coherent_access = mem->flags & MEMORY_FLAG_COHERENT_ACCESS; const bool has_side_effects = mem->has_side_effects(); const bool fused_eu_disable = mem->flags & MEMORY_FLAG_FUSED_EU_DISABLE; + const bool can_reorder = mem->flags & MEMORY_FLAG_CAN_REORDER; const uint32_t data_size_B = lsc_data_size_bytes(data_size); const enum brw_reg_type data_type = @@ -1344,7 +1345,7 @@ lower_lsc_memory_logical_send(const brw_builder &bld, brw_mem_inst *mem) send->ex_mlen = ex_mlen; send->header_size = 0; send->has_side_effects = has_side_effects; - send->is_volatile = !has_side_effects || volatile_access; + send->is_volatile = (!has_side_effects && !can_reorder) || volatile_access; /* volatile when explicitly requested, or when side-effect free but not flagged reorderable */ send->fused_eu_disable = fused_eu_disable; /* Finally, the payload */ @@ -1405,6 +1406,7 @@ lower_hdc_memory_logical_send(const brw_builder &bld, brw_mem_inst *mem) const bool volatile_access = mem->flags & MEMORY_FLAG_VOLATILE_ACCESS; const bool fused_eu_disable = mem->flags & MEMORY_FLAG_FUSED_EU_DISABLE; const bool has_side_effects = mem->has_side_effects(); + const bool can_reorder = mem->flags & MEMORY_FLAG_CAN_REORDER; const bool has_dest = mem->dst.file != BAD_FILE && !mem->dst.is_null(); assert(mem->address_offset == 0); @@ -1610,7 +1612,7 @@ lower_hdc_memory_logical_send(const brw_builder &bld, brw_mem_inst *mem) send->ex_mlen = ex_mlen; send->header_size = header.file != BAD_FILE ? 
1 : 0; send->has_side_effects = has_side_effects; - send->is_volatile = !has_side_effects || volatile_access; + send->is_volatile = (!has_side_effects && !can_reorder) || volatile_access; /* same rule as the LSC path: reorderable side-effect-free accesses are volatile only on explicit request */ send->fused_eu_disable = fused_eu_disable; if (block) { diff --git a/src/intel/compiler/brw/brw_opt_cse.cpp b/src/intel/compiler/brw/brw_opt_cse.cpp index 3aac87b4ce6..b7e5bf37118 100644 --- a/src/intel/compiler/brw/brw_opt_cse.cpp +++ b/src/intel/compiler/brw/brw_opt_cse.cpp @@ -94,7 +94,7 @@ is_expression(const brw_shader *v, const brw_inst *const inst) case FS_OPCODE_INTERPOLATE_AT_PER_SLOT_OFFSET: return true; case SHADER_OPCODE_MEMORY_LOAD_LOGICAL: - return inst->as_mem()->mode == MEMORY_MODE_CONSTANT; + return inst->as_mem()->flags & MEMORY_FLAG_CAN_REORDER; /* a logical load counts as an expression only when flagged reorderable. NOTE(review): confirm MEMORY_MODE_CONSTANT loads always carry this flag, otherwise they stop qualifying */ case SHADER_OPCODE_LOAD_PAYLOAD: return !is_coalescing_payload(*v, inst); case SHADER_OPCODE_SEND: diff --git a/src/intel/compiler/brw/brw_schedule_instructions.cpp b/src/intel/compiler/brw/brw_schedule_instructions.cpp index a2a837eaea5..f856eaa0995 100644 --- a/src/intel/compiler/brw/brw_schedule_instructions.cpp +++ b/src/intel/compiler/brw/brw_schedule_instructions.cpp @@ -571,6 +571,7 @@ public: int block_count, bool post_reg_alloc, bool need_latencies); void add_barrier_deps(schedule_node *n); + void add_memory_deps(schedule_node *n); void add_cross_lane_deps(schedule_node *n); void add_dep(schedule_node *before, schedule_node *after, int latency); void add_dep(schedule_node *before, schedule_node *after); @@ -1080,8 +1081,27 @@ static bool is_scheduling_barrier(const brw_inst *inst) { return inst->opcode == SHADER_OPCODE_HALT_TARGET || + inst->opcode == SHADER_OPCODE_RND_MODE || + inst->opcode == SHADER_OPCODE_FLOAT_CONTROL_MODE || (inst->is_control_flow() && inst->opcode != BRW_OPCODE_HALT) || - inst->has_side_effects(); + inst->eot; +} + +static bool /* true for SYNC, BARRIER, SCHEDULING_FENCE, and for sends whose has_side_effects is set */ +has_memory_side_effects(const brw_inst *inst) +{ + assert(inst->opcode != SHADER_OPCODE_LSC_SPILL); /* NOTE(review): presumably spills never reach the scheduler under this opcode - confirm */ + return inst->opcode == BRW_OPCODE_SYNC || + inst->opcode == 
SHADER_OPCODE_BARRIER || + inst->opcode == FS_OPCODE_SCHEDULING_FENCE || + (inst->is_send() && inst->as_send()->has_side_effects); +} + +static bool /* true for anything with memory side effects, plus sends whose is_volatile is set */ +is_memory_volatile(const brw_inst *inst) +{ + return has_memory_side_effects(inst) || + (inst->is_send() && inst->as_send()->is_volatile); } static bool @@ -1186,6 +1206,30 @@ brw_instruction_scheduler::add_barrier_deps(schedule_node *n) } } +void /* adds zero-latency dependencies between n and the memory-volatile instructions around it, scanning each direction up to and including the first instruction with memory side effects */ +brw_instruction_scheduler::add_memory_deps(schedule_node *n) +{ + for (schedule_node *prev = n - 1; prev >= current.start; prev--) { /* backward scan: prev must stay before n */ + if (has_memory_side_effects(prev->inst)) { + add_dep(prev, n, 0); + break; /* nothing older is linked to n directly */ + } + if (is_memory_volatile(prev->inst)) { + add_dep(prev, n, 0); + } + } + + for (schedule_node *next = n + 1; next < current.end; next++) { /* forward scan: n must stay before next */ + if (has_memory_side_effects(next->inst)) { + add_dep(n, next, 0); + break; /* nothing later is linked to n directly */ + } + if (is_memory_volatile(next->inst)) { + add_dep(n, next, 0); + } + } +} + /** * Because some instructions like HALT can disable lanes, scheduling prior to * a cross lane access should not be allowed, otherwise we could end up with @@ -1338,6 +1382,9 @@ brw_instruction_scheduler::calculate_deps() if (is_scheduling_barrier(inst)) add_barrier_deps(n); + if (has_memory_side_effects(inst)) + add_memory_deps(n); + if (inst->opcode == BRW_OPCODE_HALT || inst->opcode == SHADER_OPCODE_HALT_TARGET) add_cross_lane_deps(n);