From dd70375ee28ec838bd57b4a5b60de4092c1fe9c3 Mon Sep 17 00:00:00 2001 From: Danylo Piliaiev Date: Thu, 23 Jul 2020 15:15:34 +0300 Subject: [PATCH] intel/fs: Disable sample mask predication for scratch stores Scratch stores are being lowered to the instructions with side-effects, however they should be enabled in fs helper invocations, since they are produced from operations which don't imply side-effects. To fix this - we move the decision of whether the sample mask predication is enable to the point where logical brw instructions are created. GLSL example of the issue: int tmp[1024]; ... do { // changes to tmp } while (some_condition(tmp)) If `tmp` is lowered to scrach memory, `some_condition` would be undefined if scratch write is predicated on sample mask, making possible for the while loop to become infinite and hang the GPU. Closes: https://gitlab.freedesktop.org/mesa/mesa/-/issues/3256 Fixes: 53bfcdeecf4c9632e09ee641d2ca02dd9ec25e34 Signed-off-by: Danylo Piliaiev Reviewed-by: Matt Turner Acked-by: Jason Ekstrand Part-of: (cherry picked from commit 77486db867bd39aa9b76e549c946b0a165fcb21a) --- .pick_status.json | 2 +- src/intel/compiler/brw_eu_defines.h | 5 +++++ src/intel/compiler/brw_fs.cpp | 8 ++++++-- src/intel/compiler/brw_fs_nir.cpp | 23 +++++++++++++++++++++++ 4 files changed, 35 insertions(+), 3 deletions(-) diff --git a/.pick_status.json b/.pick_status.json index 6fe6e37c410..c43d44290a9 100644 --- a/.pick_status.json +++ b/.pick_status.json @@ -6250,7 +6250,7 @@ "description": "intel/fs: Disable sample mask predication for scratch stores", "nominated": true, "nomination_type": 1, - "resolution": 0, + "resolution": 1, "master_sha": null, "because_sha": "53bfcdeecf4c9632e09ee641d2ca02dd9ec25e34" }, diff --git a/src/intel/compiler/brw_eu_defines.h b/src/intel/compiler/brw_eu_defines.h index d63360222ec..33c6887f889 100644 --- a/src/intel/compiler/brw_eu_defines.h +++ b/src/intel/compiler/brw_eu_defines.h @@ -901,6 +901,11 @@ enum surface_logical_srcs { SURFACE_LOGICAL_SRC_IMM_DIMS, /** Per-opcode immediate argument. For atomics, this is the atomic opcode */ SURFACE_LOGICAL_SRC_IMM_ARG, + /** + * Some instructions with side-effects should not be predicated on + * sample mask, e.g. lowered stores to scratch. + */ + SURFACE_LOGICAL_SRC_ALLOW_SAMPLE_MASK, SURFACE_LOGICAL_NUM_SRCS }; diff --git a/src/intel/compiler/brw_fs.cpp b/src/intel/compiler/brw_fs.cpp index a89c53ea5a5..a11c00dea0b 100644 --- a/src/intel/compiler/brw_fs.cpp +++ b/src/intel/compiler/brw_fs.cpp @@ -5366,7 +5366,10 @@ lower_surface_logical_send(const fs_builder &bld, fs_inst *inst) const fs_reg &surface_handle = inst->src[SURFACE_LOGICAL_SRC_SURFACE_HANDLE]; const UNUSED fs_reg &dims = inst->src[SURFACE_LOGICAL_SRC_IMM_DIMS]; const fs_reg &arg = inst->src[SURFACE_LOGICAL_SRC_IMM_ARG]; + const fs_reg &allow_sample_mask = + inst->src[SURFACE_LOGICAL_SRC_ALLOW_SAMPLE_MASK]; assert(arg.file == IMM); + assert(allow_sample_mask.file == IMM); /* We must have exactly one of surface and surface_handle */ assert((surface.file == BAD_FILE) != (surface_handle.file == BAD_FILE)); @@ -5390,8 +5393,9 @@ lower_surface_logical_send(const fs_builder &bld, fs_inst *inst) surface.ud == GEN8_BTI_STATELESS_NON_COHERENT); const bool has_side_effects = inst->has_side_effects(); - fs_reg sample_mask = has_side_effects ? sample_mask_reg(bld) : - fs_reg(brw_imm_d(0xffff)); + + fs_reg sample_mask = allow_sample_mask.ud ? sample_mask_reg(bld) : + fs_reg(brw_imm_d(0xffff)); /* From the BDW PRM Volume 7, page 147: * diff --git a/src/intel/compiler/brw_fs_nir.cpp b/src/intel/compiler/brw_fs_nir.cpp index afbb436b95a..5135b0c3880 100644 --- a/src/intel/compiler/brw_fs_nir.cpp +++ b/src/intel/compiler/brw_fs_nir.cpp @@ -3762,6 +3762,7 @@ fs_visitor::nir_emit_cs_intrinsic(const fs_builder &bld, srcs[SURFACE_LOGICAL_SRC_SURFACE] = brw_imm_ud(surface); srcs[SURFACE_LOGICAL_SRC_IMM_DIMS] = brw_imm_ud(1); srcs[SURFACE_LOGICAL_SRC_IMM_ARG] = brw_imm_ud(1); /* num components */ + srcs[SURFACE_LOGICAL_SRC_ALLOW_SAMPLE_MASK] = brw_imm_ud(0); /* Read the 3 GLuint components of gl_NumWorkGroups */ for (unsigned i = 0; i < 3; i++) { @@ -3799,6 +3800,7 @@ fs_visitor::nir_emit_cs_intrinsic(const fs_builder &bld, srcs[SURFACE_LOGICAL_SRC_SURFACE] = brw_imm_ud(GEN7_BTI_SLM); srcs[SURFACE_LOGICAL_SRC_ADDRESS] = get_nir_src(instr->src[0]); srcs[SURFACE_LOGICAL_SRC_IMM_DIMS] = brw_imm_ud(1); + srcs[SURFACE_LOGICAL_SRC_ALLOW_SAMPLE_MASK] = brw_imm_ud(0); /* Make dest unsigned because that's what the temporary will be */ dest.type = brw_reg_type_from_bit_size(bit_size, BRW_REGISTER_TYPE_UD); @@ -3835,6 +3837,7 @@ fs_visitor::nir_emit_cs_intrinsic(const fs_builder &bld, srcs[SURFACE_LOGICAL_SRC_SURFACE] = brw_imm_ud(GEN7_BTI_SLM); srcs[SURFACE_LOGICAL_SRC_ADDRESS] = get_nir_src(instr->src[1]); srcs[SURFACE_LOGICAL_SRC_IMM_DIMS] = brw_imm_ud(1); + srcs[SURFACE_LOGICAL_SRC_ALLOW_SAMPLE_MASK] = brw_imm_ud(1); fs_reg data = get_nir_src(instr->src[0]); data.type = brw_reg_type_from_bit_size(bit_size, BRW_REGISTER_TYPE_UD); @@ -4116,6 +4119,7 @@ fs_visitor::nir_emit_intrinsic(const fs_builder &bld, nir_intrinsic_instr *instr if (instr->intrinsic == nir_intrinsic_image_load || instr->intrinsic == nir_intrinsic_bindless_image_load) { srcs[SURFACE_LOGICAL_SRC_IMM_ARG] = brw_imm_ud(instr->num_components); + srcs[SURFACE_LOGICAL_SRC_ALLOW_SAMPLE_MASK] = brw_imm_ud(0); fs_inst *inst = bld.emit(SHADER_OPCODE_TYPED_SURFACE_READ_LOGICAL, dest, srcs, SURFACE_LOGICAL_NUM_SRCS); @@ -4124,6 +4128,7 @@ fs_visitor::nir_emit_intrinsic(const fs_builder &bld, nir_intrinsic_instr *instr instr->intrinsic == nir_intrinsic_bindless_image_store) { srcs[SURFACE_LOGICAL_SRC_IMM_ARG] = brw_imm_ud(instr->num_components); srcs[SURFACE_LOGICAL_SRC_DATA] = get_nir_src(instr->src[3]); + srcs[SURFACE_LOGICAL_SRC_ALLOW_SAMPLE_MASK] = brw_imm_ud(1); bld.emit(SHADER_OPCODE_TYPED_SURFACE_WRITE_LOGICAL, fs_reg(), srcs, SURFACE_LOGICAL_NUM_SRCS); } else { @@ -4146,6 +4151,7 @@ fs_visitor::nir_emit_intrinsic(const fs_builder &bld, nir_intrinsic_instr *instr data = tmp; } srcs[SURFACE_LOGICAL_SRC_DATA] = data; + srcs[SURFACE_LOGICAL_SRC_ALLOW_SAMPLE_MASK] = brw_imm_ud(1); bld.emit(SHADER_OPCODE_TYPED_ATOMIC_LOGICAL, dest, srcs, SURFACE_LOGICAL_NUM_SRCS); @@ -4203,6 +4209,7 @@ fs_visitor::nir_emit_intrinsic(const fs_builder &bld, nir_intrinsic_instr *instr srcs[SURFACE_LOGICAL_SRC_ADDRESS] = get_nir_src(instr->src[1]); srcs[SURFACE_LOGICAL_SRC_IMM_DIMS] = brw_imm_ud(1); srcs[SURFACE_LOGICAL_SRC_IMM_ARG] = brw_imm_ud(instr->num_components); + srcs[SURFACE_LOGICAL_SRC_ALLOW_SAMPLE_MASK] = brw_imm_ud(0); fs_inst *inst = bld.emit(SHADER_OPCODE_UNTYPED_SURFACE_READ_LOGICAL, @@ -4219,6 +4226,7 @@ fs_visitor::nir_emit_intrinsic(const fs_builder &bld, nir_intrinsic_instr *instr srcs[SURFACE_LOGICAL_SRC_DATA] = get_nir_src(instr->src[2]); srcs[SURFACE_LOGICAL_SRC_IMM_DIMS] = brw_imm_ud(1); srcs[SURFACE_LOGICAL_SRC_IMM_ARG] = brw_imm_ud(instr->num_components); + srcs[SURFACE_LOGICAL_SRC_ALLOW_SAMPLE_MASK] = brw_imm_ud(1); bld.emit(SHADER_OPCODE_UNTYPED_SURFACE_WRITE_LOGICAL, fs_reg(), srcs, SURFACE_LOGICAL_NUM_SRCS); @@ -4632,6 +4640,7 @@ fs_visitor::nir_emit_intrinsic(const fs_builder &bld, nir_intrinsic_instr *instr get_nir_ssbo_intrinsic_index(bld, instr); srcs[SURFACE_LOGICAL_SRC_ADDRESS] = get_nir_src(instr->src[1]); srcs[SURFACE_LOGICAL_SRC_IMM_DIMS] = brw_imm_ud(1); + srcs[SURFACE_LOGICAL_SRC_ALLOW_SAMPLE_MASK] = brw_imm_ud(0); /* Make dest unsigned because that's what the temporary will be */ dest.type = brw_reg_type_from_bit_size(bit_size, BRW_REGISTER_TYPE_UD); @@ -4668,6 +4677,7 @@ fs_visitor::nir_emit_intrinsic(const fs_builder &bld, nir_intrinsic_instr *instr get_nir_ssbo_intrinsic_index(bld, instr); srcs[SURFACE_LOGICAL_SRC_ADDRESS] = get_nir_src(instr->src[2]); srcs[SURFACE_LOGICAL_SRC_IMM_DIMS] = brw_imm_ud(1); + srcs[SURFACE_LOGICAL_SRC_ALLOW_SAMPLE_MASK] = brw_imm_ud(1); fs_reg data = get_nir_src(instr->src[0]); data.type = brw_reg_type_from_bit_size(bit_size, BRW_REGISTER_TYPE_UD); @@ -4806,6 +4816,7 @@ fs_visitor::nir_emit_intrinsic(const fs_builder &bld, nir_intrinsic_instr *instr srcs[SURFACE_LOGICAL_SRC_IMM_DIMS] = brw_imm_ud(1); srcs[SURFACE_LOGICAL_SRC_IMM_ARG] = brw_imm_ud(bit_size); + srcs[SURFACE_LOGICAL_SRC_ALLOW_SAMPLE_MASK] = brw_imm_ud(0); const fs_reg nir_addr = get_nir_src(instr->src[0]); /* Make dest unsigned because that's what the temporary will be */ @@ -4851,6 +4862,14 @@ fs_visitor::nir_emit_intrinsic(const fs_builder &bld, nir_intrinsic_instr *instr srcs[SURFACE_LOGICAL_SRC_IMM_DIMS] = brw_imm_ud(1); srcs[SURFACE_LOGICAL_SRC_IMM_ARG] = brw_imm_ud(bit_size); + /** + * While this instruction has side-effects, it should not be predicated + * on sample mask, because otherwise fs helper invocations would + * load undefined values from scratch memory. And scratch memory + * load-stores are produced from operations without side-effects, thus + * they should not have different behaviour in the helper invocations. + */ + srcs[SURFACE_LOGICAL_SRC_ALLOW_SAMPLE_MASK] = brw_imm_ud(0); const fs_reg nir_addr = get_nir_src(instr->src[1]); fs_reg data = get_nir_src(instr->src[0]); @@ -5299,6 +5318,7 @@ fs_visitor::nir_emit_ssbo_atomic(const fs_builder &bld, srcs[SURFACE_LOGICAL_SRC_ADDRESS] = get_nir_src(instr->src[1]); srcs[SURFACE_LOGICAL_SRC_IMM_DIMS] = brw_imm_ud(1); srcs[SURFACE_LOGICAL_SRC_IMM_ARG] = brw_imm_ud(op); + srcs[SURFACE_LOGICAL_SRC_ALLOW_SAMPLE_MASK] = brw_imm_ud(1); fs_reg data; if (op != BRW_AOP_INC && op != BRW_AOP_DEC && op != BRW_AOP_PREDEC) @@ -5331,6 +5351,7 @@ fs_visitor::nir_emit_ssbo_atomic_float(const fs_builder &bld, srcs[SURFACE_LOGICAL_SRC_ADDRESS] = get_nir_src(instr->src[1]); srcs[SURFACE_LOGICAL_SRC_IMM_DIMS] = brw_imm_ud(1); srcs[SURFACE_LOGICAL_SRC_IMM_ARG] = brw_imm_ud(op); + srcs[SURFACE_LOGICAL_SRC_ALLOW_SAMPLE_MASK] = brw_imm_ud(1); fs_reg data = get_nir_src(instr->src[2]); if (op == BRW_AOP_FCMPWR) { @@ -5359,6 +5380,7 @@ fs_visitor::nir_emit_shared_atomic(const fs_builder &bld, srcs[SURFACE_LOGICAL_SRC_SURFACE] = brw_imm_ud(GEN7_BTI_SLM); srcs[SURFACE_LOGICAL_SRC_IMM_DIMS] = brw_imm_ud(1); srcs[SURFACE_LOGICAL_SRC_IMM_ARG] = brw_imm_ud(op); + srcs[SURFACE_LOGICAL_SRC_ALLOW_SAMPLE_MASK] = brw_imm_ud(1); fs_reg data; if (op != BRW_AOP_INC && op != BRW_AOP_DEC && op != BRW_AOP_PREDEC) @@ -5400,6 +5422,7 @@ fs_visitor::nir_emit_shared_atomic_float(const fs_builder &bld, srcs[SURFACE_LOGICAL_SRC_SURFACE] = brw_imm_ud(GEN7_BTI_SLM); srcs[SURFACE_LOGICAL_SRC_IMM_DIMS] = brw_imm_ud(1); srcs[SURFACE_LOGICAL_SRC_IMM_ARG] = brw_imm_ud(op); + srcs[SURFACE_LOGICAL_SRC_ALLOW_SAMPLE_MASK] = brw_imm_ud(1); fs_reg data = get_nir_src(instr->src[1]); if (op == BRW_AOP_FCMPWR) {