diff --git a/src/compiler/nir/nir_intrinsics.py b/src/compiler/nir/nir_intrinsics.py
index c6cfe4cd237..ab5a5fac57b 100644
--- a/src/compiler/nir/nir_intrinsics.py
+++ b/src/compiler/nir/nir_intrinsics.py
@@ -1018,6 +1018,9 @@ system_value("tess_factor_base_ir3", 2)
 system_value("tess_param_base_ir3", 2)
 system_value("tcs_header_ir3", 1)
 
+# System values for freedreno compute shaders.
+system_value("subgroup_id_shift_ir3", 1)
+
 # IR3-specific intrinsics for tessellation control shaders. cond_end_ir3 ends
 # the shader when src0 is false and is used to narrow down the TCS shader to
 # just thread 0 before writing out tessellation levels.
diff --git a/src/freedreno/ir3/ir3_compiler_nir.c b/src/freedreno/ir3/ir3_compiler_nir.c
index 3134dda6a42..b43c521eda1 100644
--- a/src/freedreno/ir3/ir3_compiler_nir.c
+++ b/src/freedreno/ir3/ir3_compiler_nir.c
@@ -2005,6 +2005,12 @@ emit_intrinsic(struct ir3_context *ctx, nir_intrinsic_instr *intr)
          dst[i] = create_driver_param(ctx, IR3_DP_LOCAL_GROUP_SIZE_X + i);
       }
       break;
+   case nir_intrinsic_load_subgroup_size:
+      dst[0] = create_driver_param(ctx, IR3_DP_SUBGROUP_SIZE);
+      break;
+   case nir_intrinsic_load_subgroup_id_shift_ir3:
+      dst[0] = create_driver_param(ctx, IR3_DP_SUBGROUP_ID_SHIFT);
+      break;
    case nir_intrinsic_discard_if:
    case nir_intrinsic_discard:
    case nir_intrinsic_demote:
diff --git a/src/freedreno/ir3/ir3_nir.c b/src/freedreno/ir3/ir3_nir.c
index 0f13f5b25ee..51bda704a1f 100644
--- a/src/freedreno/ir3/ir3_nir.c
+++ b/src/freedreno/ir3/ir3_nir.c
@@ -409,6 +409,58 @@ ir3_finalize_nir(struct ir3_compiler *compiler, nir_shader *s)
    nir_sweep(s);
 }
 
+static bool
+lower_subgroup_id_filter(const nir_instr *instr, const void *unused)
+{
+   (void)unused;
+
+   if (instr->type != nir_instr_type_intrinsic)
+      return false;
+
+   nir_intrinsic_instr *intr = nir_instr_as_intrinsic(instr);
+   return intr->intrinsic == nir_intrinsic_load_subgroup_invocation ||
+          intr->intrinsic == nir_intrinsic_load_subgroup_id ||
+          intr->intrinsic == nir_intrinsic_load_num_subgroups;
+}
+
+static nir_ssa_def *
+lower_subgroup_id(nir_builder *b, nir_instr *instr, void *unused)
+{
+   (void)instr;
+   (void)unused;
+
+   nir_intrinsic_instr *intr = nir_instr_as_intrinsic(instr);
+   if (intr->intrinsic == nir_intrinsic_load_subgroup_invocation) {
+      return nir_iand(b, nir_load_local_invocation_index(b),
+                      nir_isub(b, nir_load_subgroup_size(b), nir_imm_int(b, 1)));
+   } else if (intr->intrinsic == nir_intrinsic_load_subgroup_id) {
+      return nir_ishr(b, nir_load_local_invocation_index(b),
+                      nir_load_subgroup_id_shift_ir3(b));
+   } else {
+      assert(intr->intrinsic == nir_intrinsic_load_num_subgroups);
+      /* If the workgroup size is constant,
+       * nir_lower_compute_system_values() will replace local_size with a
+       * constant so this can mostly be constant folded away.
+       */
+      nir_ssa_def *local_size = nir_load_workgroup_size(b);
+      nir_ssa_def *size =
+         nir_imul24(b, nir_channel(b, local_size, 0),
+                    nir_imul24(b, nir_channel(b, local_size, 1),
+                               nir_channel(b, local_size, 2)));
+      nir_ssa_def *one = nir_imm_int(b, 1);
+      return nir_iadd(b, one,
+                      nir_ishr(b, nir_isub(b, size, one),
+                               nir_load_subgroup_id_shift_ir3(b)));
+   }
+}
+
+static bool
+ir3_nir_lower_subgroup_id_cs(nir_shader *shader)
+{
+   return nir_shader_lower_instructions(shader, lower_subgroup_id_filter,
+                                        lower_subgroup_id, NULL);
+}
+
 /**
  * Late passes that need to be done after pscreen->finalize_nir()
  */
@@ -706,6 +758,14 @@ ir3_nir_scan_driver_consts(nir_shader *shader,
             layout->num_driver_params = MAX2(layout->num_driver_params,
                                              IR3_DP_BASE_GROUP_Z + 1);
             break;
+         case nir_intrinsic_load_subgroup_size:
+            layout->num_driver_params =
+               MAX2(layout->num_driver_params, IR3_DP_SUBGROUP_SIZE + 1);
+            break;
+         case nir_intrinsic_load_subgroup_id_shift_ir3:
+            layout->num_driver_params =
+               MAX2(layout->num_driver_params, IR3_DP_SUBGROUP_ID_SHIFT + 1);
+            break;
          default:
             break;
          }
diff --git a/src/freedreno/ir3/ir3_shader.h b/src/freedreno/ir3/ir3_shader.h
index 0708b41ad7e..33ecb63ba3e 100644
--- a/src/freedreno/ir3/ir3_shader.h
+++ b/src/freedreno/ir3/ir3_shader.h
@@ -48,9 +48,11 @@ enum ir3_driver_param {
    IR3_DP_BASE_GROUP_X = 4,
    IR3_DP_BASE_GROUP_Y = 5,
    IR3_DP_BASE_GROUP_Z = 6,
+   IR3_DP_SUBGROUP_SIZE = 7,
    IR3_DP_LOCAL_GROUP_SIZE_X = 8,
    IR3_DP_LOCAL_GROUP_SIZE_Y = 9,
    IR3_DP_LOCAL_GROUP_SIZE_Z = 10,
+   IR3_DP_SUBGROUP_ID_SHIFT = 11,
    /* NOTE: gl_NumWorkGroups should be vec4 aligned because
     * glDispatchComputeIndirect() needs to load these from
     * the info->indirect buffer. Keep that in mind when/if
diff --git a/src/freedreno/vulkan/tu_cmd_buffer.c b/src/freedreno/vulkan/tu_cmd_buffer.c
index cedca454f63..7a3e7207091 100644
--- a/src/freedreno/vulkan/tu_cmd_buffer.c
+++ b/src/freedreno/vulkan/tu_cmd_buffer.c
@@ -4171,6 +4171,8 @@ tu_emit_compute_driver_params(struct tu_cmd_buffer *cmd,
       &pipeline->program.link[type];
    const struct ir3_const_state *const_state = &link->const_state;
    uint32_t offset = const_state->offsets.driver_param;
+   unsigned subgroup_size = pipeline->compute.subgroup_size;
+   unsigned subgroup_shift = util_logbase2(subgroup_size);
 
    if (link->constlen <= offset)
       return;
@@ -4179,13 +4181,15 @@ tu_emit_compute_driver_params(struct tu_cmd_buffer *cmd,
                       (link->constlen - offset) * 4);
 
    if (!info->indirect) {
-      uint32_t driver_params[8] = {
+      uint32_t driver_params[12] = {
         [IR3_DP_NUM_WORK_GROUPS_X] = info->blocks[0],
         [IR3_DP_NUM_WORK_GROUPS_Y] = info->blocks[1],
         [IR3_DP_NUM_WORK_GROUPS_Z] = info->blocks[2],
         [IR3_DP_BASE_GROUP_X] = info->offsets[0],
         [IR3_DP_BASE_GROUP_Y] = info->offsets[1],
         [IR3_DP_BASE_GROUP_Z] = info->offsets[2],
+        [IR3_DP_SUBGROUP_SIZE] = subgroup_size,
+        [IR3_DP_SUBGROUP_ID_SHIFT] = subgroup_shift,
      };
 
      assert(num_consts <= ARRAY_SIZE(driver_params));
@@ -4236,19 +4240,28 @@ tu_emit_compute_driver_params(struct tu_cmd_buffer *cmd,
       tu_cs_emit_qw(cs, global_iova(cmd, cs_indirect_xyz[0]));
    }
 
-   /* Zeroing of IR3_DP_BASE_GROUP_X/Y/Z for indirect dispatch */
+   /* Fill out IR3_DP_SUBGROUP_SIZE and IR3_DP_SUBGROUP_ID_SHIFT for indirect
+    * dispatch.
+    */
    if (info->indirect && num_consts > IR3_DP_BASE_GROUP_X) {
-      assert(num_consts == align(IR3_DP_BASE_GROUP_Z, 4));
-
       tu_cs_emit_pkt7(cs, tu6_stage2opcode(type), 7);
       tu_cs_emit(cs,
                  CP_LOAD_STATE6_0_DST_OFF(offset + (IR3_DP_BASE_GROUP_X / 4)) |
                  CP_LOAD_STATE6_0_STATE_TYPE(ST6_CONSTANTS) |
                  CP_LOAD_STATE6_0_STATE_SRC(SS6_DIRECT) |
                  CP_LOAD_STATE6_0_STATE_BLOCK(tu6_stage2shadersb(type)) |
-                 CP_LOAD_STATE6_0_NUM_UNIT(1));
+                 CP_LOAD_STATE6_0_NUM_UNIT((num_consts - IR3_DP_BASE_GROUP_X) / 4));
       tu_cs_emit_qw(cs, 0);
-      for (uint32_t i = 0; i < 4; i++)
-         tu_cs_emit(cs, 0);
+      tu_cs_emit(cs, 0); /* BASE_GROUP_X */
+      tu_cs_emit(cs, 0); /* BASE_GROUP_Y */
+      tu_cs_emit(cs, 0); /* BASE_GROUP_Z */
+      tu_cs_emit(cs, subgroup_size);
+      if (num_consts > IR3_DP_LOCAL_GROUP_SIZE_X) {
+         assert(num_consts == align(IR3_DP_SUBGROUP_ID_SHIFT, 4));
+         tu_cs_emit(cs, 0); /* LOCAL_GROUP_SIZE_X */
+         tu_cs_emit(cs, 0); /* LOCAL_GROUP_SIZE_Y */
+         tu_cs_emit(cs, 0); /* LOCAL_GROUP_SIZE_Z */
+         tu_cs_emit(cs, subgroup_shift);
+      }
    }
 }
diff --git a/src/freedreno/vulkan/tu_pipeline.c b/src/freedreno/vulkan/tu_pipeline.c
index 0d5b035bb0c..7089d8fc6d5 100644
--- a/src/freedreno/vulkan/tu_pipeline.c
+++ b/src/freedreno/vulkan/tu_pipeline.c
@@ -3143,6 +3143,8 @@ tu_compute_pipeline_create(VkDevice device,
    for (int i = 0; i < 3; i++)
       pipeline->compute.local_size[i] = v->local_size[i];
 
+   pipeline->compute.subgroup_size = v->info.double_threadsize ? 128 : 64;
+
    struct tu_cs prog_cs;
    tu_cs_begin_sub_stream(&pipeline->cs, 512, &prog_cs);
    tu6_emit_cs_config(&prog_cs, shader, v, &pvtmem, shader_iova);
diff --git a/src/freedreno/vulkan/tu_private.h b/src/freedreno/vulkan/tu_private.h
index c0324be48eb..6517166a935 100644
--- a/src/freedreno/vulkan/tu_private.h
+++ b/src/freedreno/vulkan/tu_private.h
@@ -1189,6 +1189,7 @@ struct tu_pipeline
    struct
    {
       uint32_t local_size[3];
+      uint32_t subgroup_size;
    } compute;
 
    bool provoking_vertex_last;