From 68b8b9e9e13b358ae43f967e84e4e3c1eef5f48d Mon Sep 17 00:00:00 2001
From: Connor Abbott
Date: Mon, 31 May 2021 14:21:04 +0200
Subject: [PATCH] tu, ir3: Plumb through support for CS subgroup size/id

The way that the blob obtains the subgroup id on compute shaders is by
just and'ing gl_LocalInvocationIndex with 63, since it advertises a
subgroupSize of 64. In order to support VK_EXT_subgroup_size_control and
expose a subgroupSize of 128, we'll have to do something a little more
flexible.

Sometimes we have to fall back to a subgroup size of 64 due to various
constraints, and in that case we have to fake a subgroup size of 128
while actually using 64 under the hood, by just pretending that the
upper 64 invocations are all disabled. However, when computing the
subgroup id we need to use the "real" subgroup size. For this purpose we
plumb through a driver param which exposes the real subgroup size.

If the user forces a particular subgroup size then we lower
load_subgroup_size in nir_lower_subgroups, otherwise we let it through,
and we assume when translating to ir3 that load_subgroup_size means
"give me the *actual* subgroup size that you decided in RA" and give you
the driver param.

Part-of:
---
 src/compiler/nir/nir_intrinsics.py   |  3 ++
 src/freedreno/ir3/ir3_compiler_nir.c |  6 +++
 src/freedreno/ir3/ir3_nir.c          | 60 ++++++++++++++++++++++++++++
 src/freedreno/ir3/ir3_shader.h       |  2 +
 src/freedreno/vulkan/tu_cmd_buffer.c | 27 +++++++++----
 src/freedreno/vulkan/tu_pipeline.c   |  2 +
 src/freedreno/vulkan/tu_private.h    |  1 +
 7 files changed, 94 insertions(+), 7 deletions(-)

diff --git a/src/compiler/nir/nir_intrinsics.py b/src/compiler/nir/nir_intrinsics.py
index c6cfe4cd237..ab5a5fac57b 100644
--- a/src/compiler/nir/nir_intrinsics.py
+++ b/src/compiler/nir/nir_intrinsics.py
@@ -1018,6 +1018,9 @@ system_value("tess_factor_base_ir3", 2)
 system_value("tess_param_base_ir3", 2)
 system_value("tcs_header_ir3", 1)
 
+# System values for freedreno compute shaders.
+system_value("subgroup_id_shift_ir3", 1)
+
 # IR3-specific intrinsics for tessellation control shaders.  cond_end_ir3 end
 # the shader when src0 is false and is used to narrow down the TCS shader to
 # just thread 0 before writing out tessellation levels.
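The arithmetic that the new subgroup_id_shift_ir3 system value enables is
worth spelling out. A standalone C sketch (not part of the patch; the names
are hypothetical) of what the lowered code computes per invocation, where
'shift' is log2 of the real subgroup size carried by the new driver param:

    #include <assert.h>
    #include <stdint.h>

    /* 'shift' is 6 for a real subgroup size of 64 and 7 for 128; the
     * blob's hardcoded "& 63" is the shift == 6 special case.
     */
    static void
    subgroup_ids(uint32_t local_invocation_index, uint32_t shift,
                 uint32_t *subgroup_id, uint32_t *subgroup_invocation)
    {
       uint32_t subgroup_size = 1u << shift;
       *subgroup_invocation = local_invocation_index & (subgroup_size - 1);
       *subgroup_id = local_invocation_index >> shift;
       /* The pair round-trips back to the flat index. */
       assert(*subgroup_id * subgroup_size + *subgroup_invocation ==
              local_invocation_index);
    }

    int
    main(void)
    {
       uint32_t id, inv;
       subgroup_ids(130, 7, &id, &inv); /* size 128: id 1, invocation 2 */
       assert(id == 1 && inv == 2);
       subgroup_ids(130, 6, &id, &inv); /* size 64: id 2, invocation 2 */
       assert(id == 2 && inv == 2);
       return 0;
    }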
diff --git a/src/freedreno/ir3/ir3_compiler_nir.c b/src/freedreno/ir3/ir3_compiler_nir.c
index 3134dda6a42..b43c521eda1 100644
--- a/src/freedreno/ir3/ir3_compiler_nir.c
+++ b/src/freedreno/ir3/ir3_compiler_nir.c
@@ -2005,6 +2005,12 @@ emit_intrinsic(struct ir3_context *ctx, nir_intrinsic_instr *intr)
          dst[i] = create_driver_param(ctx, IR3_DP_LOCAL_GROUP_SIZE_X + i);
       }
       break;
+   case nir_intrinsic_load_subgroup_size:
+      dst[0] = create_driver_param(ctx, IR3_DP_SUBGROUP_SIZE);
+      break;
+   case nir_intrinsic_load_subgroup_id_shift_ir3:
+      dst[0] = create_driver_param(ctx, IR3_DP_SUBGROUP_ID_SHIFT);
+      break;
    case nir_intrinsic_discard_if:
    case nir_intrinsic_discard:
    case nir_intrinsic_demote:
diff --git a/src/freedreno/ir3/ir3_nir.c b/src/freedreno/ir3/ir3_nir.c
index 0f13f5b25ee..51bda704a1f 100644
--- a/src/freedreno/ir3/ir3_nir.c
+++ b/src/freedreno/ir3/ir3_nir.c
@@ -409,6 +409,58 @@ ir3_finalize_nir(struct ir3_compiler *compiler, nir_shader *s)
    nir_sweep(s);
 }
 
+static bool
+lower_subgroup_id_filter(const nir_instr *instr, const void *unused)
+{
+   (void)unused;
+
+   if (instr->type != nir_instr_type_intrinsic)
+      return false;
+
+   nir_intrinsic_instr *intr = nir_instr_as_intrinsic(instr);
+   return intr->intrinsic == nir_intrinsic_load_subgroup_invocation ||
+          intr->intrinsic == nir_intrinsic_load_subgroup_id ||
+          intr->intrinsic == nir_intrinsic_load_num_subgroups;
+}
+
+static nir_ssa_def *
+lower_subgroup_id(nir_builder *b, nir_instr *instr, void *unused)
+{
+   (void)instr;
+   (void)unused;
+
+   nir_intrinsic_instr *intr = nir_instr_as_intrinsic(instr);
+   if (intr->intrinsic == nir_intrinsic_load_subgroup_invocation) {
+      return nir_iand(b, nir_load_local_invocation_index(b),
+                      nir_isub(b, nir_load_subgroup_size(b), nir_imm_int(b, 1)));
+   } else if (intr->intrinsic == nir_intrinsic_load_subgroup_id) {
+      return nir_ishr(b, nir_load_local_invocation_index(b),
+                      nir_load_subgroup_id_shift_ir3(b));
+   } else {
+      assert(intr->intrinsic == nir_intrinsic_load_num_subgroups);
+      /* If the workgroup size is constant,
+       * nir_lower_compute_system_values() will replace local_size with a
+       * constant so this can mostly be constant folded away.
+       */
+      nir_ssa_def *local_size = nir_load_workgroup_size(b);
+      nir_ssa_def *size =
+         nir_imul24(b, nir_channel(b, local_size, 0),
+                    nir_imul24(b, nir_channel(b, local_size, 1),
+                               nir_channel(b, local_size, 2)));
+      nir_ssa_def *one = nir_imm_int(b, 1);
+      return nir_iadd(b, one,
+                      nir_ishr(b, nir_isub(b, size, one),
+                               nir_load_subgroup_id_shift_ir3(b)));
+   }
+}
+
+static bool
+ir3_nir_lower_subgroup_id_cs(nir_shader *shader)
+{
+   return nir_shader_lower_instructions(shader, lower_subgroup_id_filter,
+                                        lower_subgroup_id, NULL);
+}
+
 /**
  * Late passes that need to be done after pscreen->finalize_nir()
  */
@@ -706,6 +758,14 @@ ir3_nir_scan_driver_consts(nir_shader *shader,
          layout->num_driver_params =
            MAX2(layout->num_driver_params, IR3_DP_BASE_GROUP_Z + 1);
          break;
+      case nir_intrinsic_load_subgroup_size:
+         layout->num_driver_params =
+            MAX2(layout->num_driver_params, IR3_DP_SUBGROUP_SIZE + 1);
+         break;
+      case nir_intrinsic_load_subgroup_id_shift_ir3:
+         layout->num_driver_params =
+            MAX2(layout->num_driver_params, IR3_DP_SUBGROUP_ID_SHIFT + 1);
+         break;
       default:
          break;
       }
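The load_num_subgroups lowering above avoids an integer divide by using a
shift-based ceiling division. A standalone C check (not part of the patch)
of the identity it relies on, 1 + ((n - 1) >> shift) == ceil(n / 2^shift)
for n >= 1:

    #include <assert.h>
    #include <stdint.h>

    int
    main(void)
    {
       for (uint32_t shift = 6; shift <= 7; shift++) {
          uint32_t size = 1u << shift;
          /* n stands in for the flattened workgroup size computed with
           * nir_imul24() above; e.g. n == 96 with shift == 6 (subgroup
           * size 64) gives 2 subgroups.
           */
          for (uint32_t n = 1; n <= 1024; n++) {
             uint32_t ceil_div = (n + size - 1) / size;
             assert(1 + ((n - 1) >> shift) == ceil_div);
          }
       }
       return 0;
    }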
diff --git a/src/freedreno/ir3/ir3_shader.h b/src/freedreno/ir3/ir3_shader.h
index 0708b41ad7e..33ecb63ba3e 100644
--- a/src/freedreno/ir3/ir3_shader.h
+++ b/src/freedreno/ir3/ir3_shader.h
@@ -48,9 +48,11 @@ enum ir3_driver_param {
    IR3_DP_BASE_GROUP_X = 4,
    IR3_DP_BASE_GROUP_Y = 5,
    IR3_DP_BASE_GROUP_Z = 6,
+   IR3_DP_SUBGROUP_SIZE = 7,
    IR3_DP_LOCAL_GROUP_SIZE_X = 8,
    IR3_DP_LOCAL_GROUP_SIZE_Y = 9,
    IR3_DP_LOCAL_GROUP_SIZE_Z = 10,
+   IR3_DP_SUBGROUP_ID_SHIFT = 11,
    /* NOTE: gl_NumWorkGroups should be vec4 aligned because
    * glDispatchComputeIndirect() needs to load these from
    * the info->indirect buffer. Keep that in mind when/if
diff --git a/src/freedreno/vulkan/tu_cmd_buffer.c b/src/freedreno/vulkan/tu_cmd_buffer.c
index cedca454f63..7a3e7207091 100644
--- a/src/freedreno/vulkan/tu_cmd_buffer.c
+++ b/src/freedreno/vulkan/tu_cmd_buffer.c
@@ -4171,6 +4171,8 @@ tu_emit_compute_driver_params(struct tu_cmd_buffer *cmd,
       &pipeline->program.link[type];
    const struct ir3_const_state *const_state = &link->const_state;
    uint32_t offset = const_state->offsets.driver_param;
+   unsigned subgroup_size = pipeline->compute.subgroup_size;
+   unsigned subgroup_shift = util_logbase2(subgroup_size);
 
    if (link->constlen <= offset)
       return;
@@ -4179,13 +4181,15 @@ tu_emit_compute_driver_params(struct tu_cmd_buffer *cmd,
                               (link->constlen - offset) * 4);
 
    if (!info->indirect) {
-      uint32_t driver_params[8] = {
+      uint32_t driver_params[12] = {
         [IR3_DP_NUM_WORK_GROUPS_X] = info->blocks[0],
         [IR3_DP_NUM_WORK_GROUPS_Y] = info->blocks[1],
         [IR3_DP_NUM_WORK_GROUPS_Z] = info->blocks[2],
         [IR3_DP_BASE_GROUP_X] = info->offsets[0],
         [IR3_DP_BASE_GROUP_Y] = info->offsets[1],
         [IR3_DP_BASE_GROUP_Z] = info->offsets[2],
+        [IR3_DP_SUBGROUP_SIZE] = subgroup_size,
+        [IR3_DP_SUBGROUP_ID_SHIFT] = subgroup_shift,
      };
 
      assert(num_consts <= ARRAY_SIZE(driver_params));
@@ -4236,19 +4240,28 @@ tu_emit_compute_driver_params(struct tu_cmd_buffer *cmd,
      tu_cs_emit_qw(cs, global_iova(cmd, cs_indirect_xyz[0]));
    }
 
-   /* Zeroing of IR3_DP_BASE_GROUP_X/Y/Z for indirect dispatch */
+   /* Fill out IR3_DP_SUBGROUP_SIZE and IR3_DP_SUBGROUP_ID_SHIFT for indirect
+    * dispatch.
+    */
    if (info->indirect && num_consts > IR3_DP_BASE_GROUP_X) {
-      assert(num_consts == align(IR3_DP_BASE_GROUP_Z, 4));
-
       tu_cs_emit_pkt7(cs, tu6_stage2opcode(type), 7);
       tu_cs_emit(cs, CP_LOAD_STATE6_0_DST_OFF(offset + (IR3_DP_BASE_GROUP_X / 4)) |
                  CP_LOAD_STATE6_0_STATE_TYPE(ST6_CONSTANTS) |
                  CP_LOAD_STATE6_0_STATE_SRC(SS6_DIRECT) |
                  CP_LOAD_STATE6_0_STATE_BLOCK(tu6_stage2shadersb(type)) |
-                 CP_LOAD_STATE6_0_NUM_UNIT(1));
+                 CP_LOAD_STATE6_0_NUM_UNIT((num_consts - IR3_DP_BASE_GROUP_X) / 4));
       tu_cs_emit_qw(cs, 0);
-      for (uint32_t i = 0; i < 4; i++)
-         tu_cs_emit(cs, 0);
+      tu_cs_emit(cs, 0); /* BASE_GROUP_X */
+      tu_cs_emit(cs, 0); /* BASE_GROUP_Y */
+      tu_cs_emit(cs, 0); /* BASE_GROUP_Z */
+      tu_cs_emit(cs, subgroup_size);
+      if (num_consts > IR3_DP_LOCAL_GROUP_SIZE_X) {
+         assert(num_consts == align(IR3_DP_SUBGROUP_ID_SHIFT, 4));
+         tu_cs_emit(cs, 0); /* LOCAL_GROUP_SIZE_X */
+         tu_cs_emit(cs, 0); /* LOCAL_GROUP_SIZE_Y */
+         tu_cs_emit(cs, 0); /* LOCAL_GROUP_SIZE_Z */
+         tu_cs_emit(cs, subgroup_shift);
+      }
    }
 }
 
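CP_LOAD_STATE6 counts constants in vec4 units, which is why num_consts is
asserted to be vec4-aligned and why the indirect path above uploads either
one or two vec4s. A standalone sketch (not part of the patch; the enum
mirrors ir3_shader.h and align4() stands in for Mesa's align()):

    #include <assert.h>
    #include <stdint.h>

    enum { DP_BASE_GROUP_X = 4, DP_BASE_GROUP_Z = 6,
           DP_SUBGROUP_ID_SHIFT = 11 }; /* mirrors ir3_shader.h */

    static uint32_t align4(uint32_t v) { return (v + 3) & ~3u; }

    int
    main(void)
    {
       /* Shader reads nothing past IR3_DP_SUBGROUP_SIZE: one vec4
        * (BASE_GROUP_X/Y/Z + SUBGROUP_SIZE), matching the old assert. */
       uint32_t num_consts = align4(DP_BASE_GROUP_Z);       /* 8 */
       assert((num_consts - DP_BASE_GROUP_X) / 4 == 1);     /* NUM_UNIT */

       /* Shader also reads the local-size/shift vec4: two vec4s, i.e.
        * the four extra tu_cs_emit() dwords in the new branch. */
       num_consts = align4(DP_SUBGROUP_ID_SHIFT);           /* 12 */
       assert((num_consts - DP_BASE_GROUP_X) / 4 == 2);
       return 0;
    }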
diff --git a/src/freedreno/vulkan/tu_pipeline.c b/src/freedreno/vulkan/tu_pipeline.c
index 0d5b035bb0c..7089d8fc6d5 100644
--- a/src/freedreno/vulkan/tu_pipeline.c
+++ b/src/freedreno/vulkan/tu_pipeline.c
@@ -3143,6 +3143,8 @@ tu_compute_pipeline_create(VkDevice device,
    for (int i = 0; i < 3; i++)
       pipeline->compute.local_size[i] = v->local_size[i];
 
+   pipeline->compute.subgroup_size = v->info.double_threadsize ? 128 : 64;
+
    struct tu_cs prog_cs;
    tu_cs_begin_sub_stream(&pipeline->cs, 512, &prog_cs);
    tu6_emit_cs_config(&prog_cs, shader, v, &pvtmem, shader_iova);
diff --git a/src/freedreno/vulkan/tu_private.h b/src/freedreno/vulkan/tu_private.h
index c0324be48eb..6517166a935 100644
--- a/src/freedreno/vulkan/tu_private.h
+++ b/src/freedreno/vulkan/tu_private.h
@@ -1189,6 +1189,7 @@ struct tu_pipeline
    struct
    {
       uint32_t local_size[3];
+      uint32_t subgroup_size;
    } compute;
 
    bool provoking_vertex_last;
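The only two wave sizes here are 64 and 128, so the pipeline just records
which one the compiler's register allocation produced. A standalone sketch
(not part of the patch) of how that recorded size relates to the shift
emitted in tu_emit_compute_driver_params(), using a GCC/Clang builtin
where the driver uses util_logbase2():

    #include <assert.h>
    #include <stdbool.h>
    #include <stdint.h>

    /* pipeline->compute.subgroup_size can only take these two values. */
    static uint32_t
    cs_subgroup_size(bool double_threadsize)
    {
       return double_threadsize ? 128 : 64;
    }

    /* log2 of a power-of-two size: the IR3_DP_SUBGROUP_ID_SHIFT value. */
    static uint32_t
    cs_subgroup_shift(uint32_t subgroup_size)
    {
       return 31 - (uint32_t)__builtin_clz(subgroup_size);
    }

    int
    main(void)
    {
       assert(cs_subgroup_shift(cs_subgroup_size(false)) == 6);
       assert(cs_subgroup_shift(cs_subgroup_size(true)) == 7);
       return 0;
    }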