intel/fs: Disable sample mask predication for scratch stores

Scratch stores are being lowered to the instructions with side-effects,
however they should be enabled in fs helper invocations, since they
are produced from operations which don't imply side-effects.

To fix this - we move the decision of whether the sample mask predication
is enable to the point where logical brw instructions are created.

GLSL example of the issue:

 int tmp[1024];
 ...
 do {
   // changes to tmp
 } while (some_condition(tmp))

If `tmp` is lowered to scrach memory, `some_condition` would be
undefined if scratch write is predicated on sample mask, making
possible for the while loop to become infinite and hang the GPU.

Closes: https://gitlab.freedesktop.org/mesa/mesa/-/issues/3256
Fixes: 53bfcdeecf
Signed-off-by: Danylo Piliaiev <danylo.piliaiev@globallogic.com>
Reviewed-by: Matt Turner <mattst88@gmail.com>
Acked-by: Jason Ekstrand <jason@jlekstrand.net>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/6056>
(cherry picked from commit 77486db867)
This commit is contained in:
Danylo Piliaiev 2020-07-23 15:15:34 +03:00 committed by Dylan Baker
parent 365419d18a
commit dd70375ee2
4 changed files with 35 additions and 3 deletions

View file

@ -6250,7 +6250,7 @@
"description": "intel/fs: Disable sample mask predication for scratch stores",
"nominated": true,
"nomination_type": 1,
"resolution": 0,
"resolution": 1,
"master_sha": null,
"because_sha": "53bfcdeecf4c9632e09ee641d2ca02dd9ec25e34"
},

View file

@ -901,6 +901,11 @@ enum surface_logical_srcs {
SURFACE_LOGICAL_SRC_IMM_DIMS,
/** Per-opcode immediate argument. For atomics, this is the atomic opcode */
SURFACE_LOGICAL_SRC_IMM_ARG,
/**
* Some instructions with side-effects should not be predicated on
* sample mask, e.g. lowered stores to scratch.
*/
SURFACE_LOGICAL_SRC_ALLOW_SAMPLE_MASK,
SURFACE_LOGICAL_NUM_SRCS
};

View file

@ -5366,7 +5366,10 @@ lower_surface_logical_send(const fs_builder &bld, fs_inst *inst)
const fs_reg &surface_handle = inst->src[SURFACE_LOGICAL_SRC_SURFACE_HANDLE];
const UNUSED fs_reg &dims = inst->src[SURFACE_LOGICAL_SRC_IMM_DIMS];
const fs_reg &arg = inst->src[SURFACE_LOGICAL_SRC_IMM_ARG];
const fs_reg &allow_sample_mask =
inst->src[SURFACE_LOGICAL_SRC_ALLOW_SAMPLE_MASK];
assert(arg.file == IMM);
assert(allow_sample_mask.file == IMM);
/* We must have exactly one of surface and surface_handle */
assert((surface.file == BAD_FILE) != (surface_handle.file == BAD_FILE));
@ -5390,8 +5393,9 @@ lower_surface_logical_send(const fs_builder &bld, fs_inst *inst)
surface.ud == GEN8_BTI_STATELESS_NON_COHERENT);
const bool has_side_effects = inst->has_side_effects();
fs_reg sample_mask = has_side_effects ? sample_mask_reg(bld) :
fs_reg(brw_imm_d(0xffff));
fs_reg sample_mask = allow_sample_mask.ud ? sample_mask_reg(bld) :
fs_reg(brw_imm_d(0xffff));
/* From the BDW PRM Volume 7, page 147:
*

View file

@ -3762,6 +3762,7 @@ fs_visitor::nir_emit_cs_intrinsic(const fs_builder &bld,
srcs[SURFACE_LOGICAL_SRC_SURFACE] = brw_imm_ud(surface);
srcs[SURFACE_LOGICAL_SRC_IMM_DIMS] = brw_imm_ud(1);
srcs[SURFACE_LOGICAL_SRC_IMM_ARG] = brw_imm_ud(1); /* num components */
srcs[SURFACE_LOGICAL_SRC_ALLOW_SAMPLE_MASK] = brw_imm_ud(0);
/* Read the 3 GLuint components of gl_NumWorkGroups */
for (unsigned i = 0; i < 3; i++) {
@ -3799,6 +3800,7 @@ fs_visitor::nir_emit_cs_intrinsic(const fs_builder &bld,
srcs[SURFACE_LOGICAL_SRC_SURFACE] = brw_imm_ud(GEN7_BTI_SLM);
srcs[SURFACE_LOGICAL_SRC_ADDRESS] = get_nir_src(instr->src[0]);
srcs[SURFACE_LOGICAL_SRC_IMM_DIMS] = brw_imm_ud(1);
srcs[SURFACE_LOGICAL_SRC_ALLOW_SAMPLE_MASK] = brw_imm_ud(0);
/* Make dest unsigned because that's what the temporary will be */
dest.type = brw_reg_type_from_bit_size(bit_size, BRW_REGISTER_TYPE_UD);
@ -3835,6 +3837,7 @@ fs_visitor::nir_emit_cs_intrinsic(const fs_builder &bld,
srcs[SURFACE_LOGICAL_SRC_SURFACE] = brw_imm_ud(GEN7_BTI_SLM);
srcs[SURFACE_LOGICAL_SRC_ADDRESS] = get_nir_src(instr->src[1]);
srcs[SURFACE_LOGICAL_SRC_IMM_DIMS] = brw_imm_ud(1);
srcs[SURFACE_LOGICAL_SRC_ALLOW_SAMPLE_MASK] = brw_imm_ud(1);
fs_reg data = get_nir_src(instr->src[0]);
data.type = brw_reg_type_from_bit_size(bit_size, BRW_REGISTER_TYPE_UD);
@ -4116,6 +4119,7 @@ fs_visitor::nir_emit_intrinsic(const fs_builder &bld, nir_intrinsic_instr *instr
if (instr->intrinsic == nir_intrinsic_image_load ||
instr->intrinsic == nir_intrinsic_bindless_image_load) {
srcs[SURFACE_LOGICAL_SRC_IMM_ARG] = brw_imm_ud(instr->num_components);
srcs[SURFACE_LOGICAL_SRC_ALLOW_SAMPLE_MASK] = brw_imm_ud(0);
fs_inst *inst =
bld.emit(SHADER_OPCODE_TYPED_SURFACE_READ_LOGICAL,
dest, srcs, SURFACE_LOGICAL_NUM_SRCS);
@ -4124,6 +4128,7 @@ fs_visitor::nir_emit_intrinsic(const fs_builder &bld, nir_intrinsic_instr *instr
instr->intrinsic == nir_intrinsic_bindless_image_store) {
srcs[SURFACE_LOGICAL_SRC_IMM_ARG] = brw_imm_ud(instr->num_components);
srcs[SURFACE_LOGICAL_SRC_DATA] = get_nir_src(instr->src[3]);
srcs[SURFACE_LOGICAL_SRC_ALLOW_SAMPLE_MASK] = brw_imm_ud(1);
bld.emit(SHADER_OPCODE_TYPED_SURFACE_WRITE_LOGICAL,
fs_reg(), srcs, SURFACE_LOGICAL_NUM_SRCS);
} else {
@ -4146,6 +4151,7 @@ fs_visitor::nir_emit_intrinsic(const fs_builder &bld, nir_intrinsic_instr *instr
data = tmp;
}
srcs[SURFACE_LOGICAL_SRC_DATA] = data;
srcs[SURFACE_LOGICAL_SRC_ALLOW_SAMPLE_MASK] = brw_imm_ud(1);
bld.emit(SHADER_OPCODE_TYPED_ATOMIC_LOGICAL,
dest, srcs, SURFACE_LOGICAL_NUM_SRCS);
@ -4203,6 +4209,7 @@ fs_visitor::nir_emit_intrinsic(const fs_builder &bld, nir_intrinsic_instr *instr
srcs[SURFACE_LOGICAL_SRC_ADDRESS] = get_nir_src(instr->src[1]);
srcs[SURFACE_LOGICAL_SRC_IMM_DIMS] = brw_imm_ud(1);
srcs[SURFACE_LOGICAL_SRC_IMM_ARG] = brw_imm_ud(instr->num_components);
srcs[SURFACE_LOGICAL_SRC_ALLOW_SAMPLE_MASK] = brw_imm_ud(0);
fs_inst *inst =
bld.emit(SHADER_OPCODE_UNTYPED_SURFACE_READ_LOGICAL,
@ -4219,6 +4226,7 @@ fs_visitor::nir_emit_intrinsic(const fs_builder &bld, nir_intrinsic_instr *instr
srcs[SURFACE_LOGICAL_SRC_DATA] = get_nir_src(instr->src[2]);
srcs[SURFACE_LOGICAL_SRC_IMM_DIMS] = brw_imm_ud(1);
srcs[SURFACE_LOGICAL_SRC_IMM_ARG] = brw_imm_ud(instr->num_components);
srcs[SURFACE_LOGICAL_SRC_ALLOW_SAMPLE_MASK] = brw_imm_ud(1);
bld.emit(SHADER_OPCODE_UNTYPED_SURFACE_WRITE_LOGICAL,
fs_reg(), srcs, SURFACE_LOGICAL_NUM_SRCS);
@ -4632,6 +4640,7 @@ fs_visitor::nir_emit_intrinsic(const fs_builder &bld, nir_intrinsic_instr *instr
get_nir_ssbo_intrinsic_index(bld, instr);
srcs[SURFACE_LOGICAL_SRC_ADDRESS] = get_nir_src(instr->src[1]);
srcs[SURFACE_LOGICAL_SRC_IMM_DIMS] = brw_imm_ud(1);
srcs[SURFACE_LOGICAL_SRC_ALLOW_SAMPLE_MASK] = brw_imm_ud(0);
/* Make dest unsigned because that's what the temporary will be */
dest.type = brw_reg_type_from_bit_size(bit_size, BRW_REGISTER_TYPE_UD);
@ -4668,6 +4677,7 @@ fs_visitor::nir_emit_intrinsic(const fs_builder &bld, nir_intrinsic_instr *instr
get_nir_ssbo_intrinsic_index(bld, instr);
srcs[SURFACE_LOGICAL_SRC_ADDRESS] = get_nir_src(instr->src[2]);
srcs[SURFACE_LOGICAL_SRC_IMM_DIMS] = brw_imm_ud(1);
srcs[SURFACE_LOGICAL_SRC_ALLOW_SAMPLE_MASK] = brw_imm_ud(1);
fs_reg data = get_nir_src(instr->src[0]);
data.type = brw_reg_type_from_bit_size(bit_size, BRW_REGISTER_TYPE_UD);
@ -4806,6 +4816,7 @@ fs_visitor::nir_emit_intrinsic(const fs_builder &bld, nir_intrinsic_instr *instr
srcs[SURFACE_LOGICAL_SRC_IMM_DIMS] = brw_imm_ud(1);
srcs[SURFACE_LOGICAL_SRC_IMM_ARG] = brw_imm_ud(bit_size);
srcs[SURFACE_LOGICAL_SRC_ALLOW_SAMPLE_MASK] = brw_imm_ud(0);
const fs_reg nir_addr = get_nir_src(instr->src[0]);
/* Make dest unsigned because that's what the temporary will be */
@ -4851,6 +4862,14 @@ fs_visitor::nir_emit_intrinsic(const fs_builder &bld, nir_intrinsic_instr *instr
srcs[SURFACE_LOGICAL_SRC_IMM_DIMS] = brw_imm_ud(1);
srcs[SURFACE_LOGICAL_SRC_IMM_ARG] = brw_imm_ud(bit_size);
/**
* While this instruction has side-effects, it should not be predicated
* on sample mask, because otherwise fs helper invocations would
* load undefined values from scratch memory. And scratch memory
* load-stores are produced from operations without side-effects, thus
* they should not have different behaviour in the helper invocations.
*/
srcs[SURFACE_LOGICAL_SRC_ALLOW_SAMPLE_MASK] = brw_imm_ud(0);
const fs_reg nir_addr = get_nir_src(instr->src[1]);
fs_reg data = get_nir_src(instr->src[0]);
@ -5299,6 +5318,7 @@ fs_visitor::nir_emit_ssbo_atomic(const fs_builder &bld,
srcs[SURFACE_LOGICAL_SRC_ADDRESS] = get_nir_src(instr->src[1]);
srcs[SURFACE_LOGICAL_SRC_IMM_DIMS] = brw_imm_ud(1);
srcs[SURFACE_LOGICAL_SRC_IMM_ARG] = brw_imm_ud(op);
srcs[SURFACE_LOGICAL_SRC_ALLOW_SAMPLE_MASK] = brw_imm_ud(1);
fs_reg data;
if (op != BRW_AOP_INC && op != BRW_AOP_DEC && op != BRW_AOP_PREDEC)
@ -5331,6 +5351,7 @@ fs_visitor::nir_emit_ssbo_atomic_float(const fs_builder &bld,
srcs[SURFACE_LOGICAL_SRC_ADDRESS] = get_nir_src(instr->src[1]);
srcs[SURFACE_LOGICAL_SRC_IMM_DIMS] = brw_imm_ud(1);
srcs[SURFACE_LOGICAL_SRC_IMM_ARG] = brw_imm_ud(op);
srcs[SURFACE_LOGICAL_SRC_ALLOW_SAMPLE_MASK] = brw_imm_ud(1);
fs_reg data = get_nir_src(instr->src[2]);
if (op == BRW_AOP_FCMPWR) {
@ -5359,6 +5380,7 @@ fs_visitor::nir_emit_shared_atomic(const fs_builder &bld,
srcs[SURFACE_LOGICAL_SRC_SURFACE] = brw_imm_ud(GEN7_BTI_SLM);
srcs[SURFACE_LOGICAL_SRC_IMM_DIMS] = brw_imm_ud(1);
srcs[SURFACE_LOGICAL_SRC_IMM_ARG] = brw_imm_ud(op);
srcs[SURFACE_LOGICAL_SRC_ALLOW_SAMPLE_MASK] = brw_imm_ud(1);
fs_reg data;
if (op != BRW_AOP_INC && op != BRW_AOP_DEC && op != BRW_AOP_PREDEC)
@ -5400,6 +5422,7 @@ fs_visitor::nir_emit_shared_atomic_float(const fs_builder &bld,
srcs[SURFACE_LOGICAL_SRC_SURFACE] = brw_imm_ud(GEN7_BTI_SLM);
srcs[SURFACE_LOGICAL_SRC_IMM_DIMS] = brw_imm_ud(1);
srcs[SURFACE_LOGICAL_SRC_IMM_ARG] = brw_imm_ud(op);
srcs[SURFACE_LOGICAL_SRC_ALLOW_SAMPLE_MASK] = brw_imm_ud(1);
fs_reg data = get_nir_src(instr->src[1]);
if (op == BRW_AOP_FCMPWR) {