diff --git a/src/compiler/shader_info.h b/src/compiler/shader_info.h
index be3a6a542e8..13da17fa264 100644
--- a/src/compiler/shader_info.h
+++ b/src/compiler/shader_info.h
@@ -298,6 +298,7 @@ typedef struct shader_info {
 
       struct {
          uint16_t local_size[3];
+         uint16_t max_variable_local_size;
 
          bool local_size_variable:1;
          uint8_t user_data_components_amd:3;
diff --git a/src/intel/compiler/brw_compiler.h b/src/intel/compiler/brw_compiler.h
index 08999e95071..2e34b16dd44 100644
--- a/src/intel/compiler/brw_compiler.h
+++ b/src/intel/compiler/brw_compiler.h
@@ -615,6 +615,9 @@ enum brw_param_builtin {
    BRW_PARAM_BUILTIN_BASE_WORK_GROUP_ID_Y,
    BRW_PARAM_BUILTIN_BASE_WORK_GROUP_ID_Z,
    BRW_PARAM_BUILTIN_SUBGROUP_ID,
+   BRW_PARAM_BUILTIN_WORK_GROUP_SIZE_X,
+   BRW_PARAM_BUILTIN_WORK_GROUP_SIZE_Y,
+   BRW_PARAM_BUILTIN_WORK_GROUP_SIZE_Z,
 };
 
 #define BRW_PARAM_BUILTIN_CLIP_PLANE(idx, comp) \
@@ -901,11 +904,13 @@ struct brw_cs_prog_data {
    struct brw_stage_prog_data base;
 
    unsigned local_size[3];
+   unsigned max_variable_local_size;
    unsigned simd_size;
    unsigned threads;
    unsigned slm_size;
    bool uses_barrier;
    bool uses_num_work_groups;
+   bool uses_variable_group_size;
 
    struct {
       struct brw_push_const_block cross_thread;
diff --git a/src/intel/compiler/brw_fs.cpp b/src/intel/compiler/brw_fs.cpp
index 96fdb6b0992..323fdb56ff5 100644
--- a/src/intel/compiler/brw_fs.cpp
+++ b/src/intel/compiler/brw_fs.cpp
@@ -1190,6 +1190,8 @@ fs_visitor::import_uniforms(fs_visitor *v)
    this->pull_constant_loc = v->pull_constant_loc;
    this->uniforms = v->uniforms;
    this->subgroup_id = v->subgroup_id;
+   for (unsigned i = 0; i < ARRAY_SIZE(this->group_size); i++)
+      this->group_size[i] = v->group_size[i];
 }
 
 void
@@ -8866,9 +8868,16 @@ static void
 cs_set_simd_size(struct brw_cs_prog_data *cs_prog_data, unsigned size)
 {
    cs_prog_data->simd_size = size;
-   unsigned group_size = cs_prog_data->local_size[0] *
-      cs_prog_data->local_size[1] * cs_prog_data->local_size[2];
-   cs_prog_data->threads = (group_size + size - 1) / size;
+
+   unsigned group_size;
+   if (cs_prog_data->uses_variable_group_size) {
+      group_size = cs_prog_data->max_variable_local_size;
+   } else {
+      group_size = cs_prog_data->local_size[0] *
+                   cs_prog_data->local_size[1] *
+                   cs_prog_data->local_size[2];
+   }
+   cs_prog_data->threads = DIV_ROUND_UP(group_size, size);
 }
 
 static nir_shader *
@@ -8903,13 +8912,20 @@ brw_compile_cs(const struct brw_compiler *compiler, void *log_data,
                char **error_str)
 {
    prog_data->base.total_shared = src_shader->info.cs.shared_size;
-   prog_data->local_size[0] = src_shader->info.cs.local_size[0];
-   prog_data->local_size[1] = src_shader->info.cs.local_size[1];
-   prog_data->local_size[2] = src_shader->info.cs.local_size[2];
    prog_data->slm_size = src_shader->num_shared;
-   unsigned local_workgroup_size =
-      src_shader->info.cs.local_size[0] * src_shader->info.cs.local_size[1] *
-      src_shader->info.cs.local_size[2];
+
+   unsigned local_workgroup_size;
+   if (prog_data->uses_variable_group_size) {
+      prog_data->max_variable_local_size =
+         src_shader->info.cs.max_variable_local_size;
+      local_workgroup_size = src_shader->info.cs.max_variable_local_size;
+   } else {
+      prog_data->local_size[0] = src_shader->info.cs.local_size[0];
+      prog_data->local_size[1] = src_shader->info.cs.local_size[1];
+      prog_data->local_size[2] = src_shader->info.cs.local_size[2];
+      local_workgroup_size = src_shader->info.cs.local_size[0] *
+         src_shader->info.cs.local_size[1] * src_shader->info.cs.local_size[2];
+   }
 
    /* Limit max_threads to 64 for the GPGPU_WALKER command */
    const uint32_t max_threads = MIN2(64, compiler->devinfo->max_cs_threads);
diff --git a/src/intel/compiler/brw_fs.h b/src/intel/compiler/brw_fs.h
index c09c4eb8759..f2612968f25 100644
--- a/src/intel/compiler/brw_fs.h
+++ b/src/intel/compiler/brw_fs.h
@@ -370,6 +370,7 @@ public:
    int *push_constant_loc;
 
    fs_reg subgroup_id;
+   fs_reg group_size[3];
    fs_reg scratch_base;
    fs_reg frag_depth;
    fs_reg frag_stencil;
diff --git a/src/intel/compiler/brw_fs_nir.cpp b/src/intel/compiler/brw_fs_nir.cpp
index f1d17a322e9..a038db72daa 100644
--- a/src/intel/compiler/brw_fs_nir.cpp
+++ b/src/intel/compiler/brw_fs_nir.cpp
@@ -101,11 +101,23 @@ fs_visitor::nir_setup_uniforms()
    uniforms = nir->num_uniforms / 4;
 
    if (stage == MESA_SHADER_COMPUTE) {
-      /* Add a uniform for the thread local id. It must be the last uniform
-       * on the list.
-       */
+      /* Add uniforms for builtins after regular NIR uniforms. */
       assert(uniforms == prog_data->nr_params);
-      uint32_t *param = brw_stage_prog_data_add_params(prog_data, 1);
+
+      uint32_t *param;
+      if (brw_cs_prog_data(prog_data)->uses_variable_group_size) {
+         param = brw_stage_prog_data_add_params(prog_data, 3);
+         for (unsigned i = 0; i < 3; i++) {
+            param[i] = (BRW_PARAM_BUILTIN_WORK_GROUP_SIZE_X + i);
+            group_size[i] = fs_reg(UNIFORM, uniforms++, BRW_REGISTER_TYPE_UD);
+         }
+      }
+
+      /* Subgroup ID must be the last uniform on the list. This will make it
+       * easier later to split between cross thread and per thread
+       * uniforms.
+       */
+      param = brw_stage_prog_data_add_params(prog_data, 1);
       *param = BRW_PARAM_BUILTIN_SUBGROUP_ID;
       subgroup_id = fs_reg(UNIFORM, uniforms++, BRW_REGISTER_TYPE_UD);
    }
@@ -3814,7 +3826,8 @@ fs_visitor::nir_emit_cs_intrinsic(const fs_builder &bld,
        * invocations are already executed lock-step. Instead of an actual
        * barrier just emit a scheduling fence, that will generate no code.
        */
-      if (workgroup_size() <= dispatch_width) {
+      if (!cs_prog_data->uses_variable_group_size &&
+          workgroup_size() <= dispatch_width) {
          bld.exec_all().group(1, 0).emit(FS_OPCODE_SCHEDULING_FENCE);
          break;
       }
@@ -3949,6 +3962,14 @@ fs_visitor::nir_emit_cs_intrinsic(const fs_builder &bld,
       break;
    }
+   case nir_intrinsic_load_local_group_size: {
+      for (unsigned i = 0; i < 3; i++) {
+         bld.MOV(retype(offset(dest, bld, i), BRW_REGISTER_TYPE_UD),
+                 group_size[i]);
+      }
+      break;
+   }
+
    default:
       nir_emit_intrinsic(bld, instr);
       break;
    }
@@ -4337,7 +4358,8 @@ fs_visitor::nir_emit_intrinsic(const fs_builder &bld, nir_intrinsic_instr *instr
        *
        * TODO: Check if applies for many HW threads sharing same Data Port.
        */
-      if (slm_fence && workgroup_size() <= dispatch_width)
+      if (!brw_cs_prog_data(prog_data)->uses_variable_group_size &&
+          slm_fence && workgroup_size() <= dispatch_width)
          slm_fence = false;
 
       /* Prior to Gen11, there's only L3 fence, so emit that instead. */
diff --git a/src/intel/compiler/brw_nir_lower_cs_intrinsics.c b/src/intel/compiler/brw_nir_lower_cs_intrinsics.c
index 434ad005281..2393011312c 100644
--- a/src/intel/compiler/brw_nir_lower_cs_intrinsics.c
+++ b/src/intel/compiler/brw_nir_lower_cs_intrinsics.c
@@ -72,8 +72,16 @@ lower_cs_intrinsics_convert_block(struct lower_intrinsics_state *state,
          nir_ssa_def *channel = nir_load_subgroup_invocation(b);
          nir_ssa_def *linear = nir_iadd(b, channel, thread_local_id);
 
-         nir_ssa_def *size_x = nir_imm_int(b, nir->info.cs.local_size[0]);
-         nir_ssa_def *size_y = nir_imm_int(b, nir->info.cs.local_size[1]);
+         nir_ssa_def *size_x;
+         nir_ssa_def *size_y;
+         if (state->nir->info.cs.local_size_variable) {
+            nir_ssa_def *size_xyz = nir_load_local_group_size(b);
+            size_x = nir_channel(b, size_xyz, 0);
+            size_y = nir_channel(b, size_xyz, 1);
+         } else {
+            size_x = nir_imm_int(b, nir->info.cs.local_size[0]);
+            size_y = nir_imm_int(b, nir->info.cs.local_size[1]);
+         }
 
          /* The local invocation index and ID must respect the following
          *
@@ -152,12 +160,26 @@ lower_cs_intrinsics_convert_block(struct lower_intrinsics_state *state,
         break;
 
      case nir_intrinsic_load_num_subgroups: {
-        unsigned local_workgroup_size =
-           nir->info.cs.local_size[0] * nir->info.cs.local_size[1] *
-           nir->info.cs.local_size[2];
-        unsigned num_subgroups =
-           DIV_ROUND_UP(local_workgroup_size, state->dispatch_width);
-        sysval = nir_imm_int(b, num_subgroups);
+        if (state->nir->info.cs.local_size_variable) {
+           nir_ssa_def *size_xyz = nir_load_local_group_size(b);
+           nir_ssa_def *size_x = nir_channel(b, size_xyz, 0);
+           nir_ssa_def *size_y = nir_channel(b, size_xyz, 1);
+           nir_ssa_def *size_z = nir_channel(b, size_xyz, 2);
+           nir_ssa_def *size = nir_imul(b, nir_imul(b, size_x, size_y), size_z);
+
+           /* Calculate the equivalent of DIV_ROUND_UP. */
+           sysval = nir_idiv(b,
+                             nir_iadd_imm(b,
+                                nir_iadd_imm(b, size, state->dispatch_width), -1),
+                             nir_imm_int(b, state->dispatch_width));
+        } else {
+           unsigned local_workgroup_size =
+              nir->info.cs.local_size[0] * nir->info.cs.local_size[1] *
+              nir->info.cs.local_size[2];
+           unsigned num_subgroups =
+              DIV_ROUND_UP(local_workgroup_size, state->dispatch_width);
+           sysval = nir_imm_int(b, num_subgroups);
+        }
         break;
      }
 
@@ -198,16 +220,21 @@ brw_nir_lower_cs_intrinsics(nir_shader *nir,
       .dispatch_width = dispatch_width,
    };
 
-   assert(!nir->info.cs.local_size_variable);
-   state.local_workgroup_size = nir->info.cs.local_size[0] *
-                                nir->info.cs.local_size[1] *
-                                nir->info.cs.local_size[2];
+   if (!nir->info.cs.local_size_variable) {
+      state.local_workgroup_size = nir->info.cs.local_size[0] *
+                                   nir->info.cs.local_size[1] *
+                                   nir->info.cs.local_size[2];
+   } else {
+      state.local_workgroup_size = nir->info.cs.max_variable_local_size;
+   }
 
    /* Constraints from NV_compute_shader_derivatives. */
-   if (nir->info.cs.derivative_group == DERIVATIVE_GROUP_QUADS) {
+   if (nir->info.cs.derivative_group == DERIVATIVE_GROUP_QUADS &&
+       !nir->info.cs.local_size_variable) {
       assert(nir->info.cs.local_size[0] % 2 == 0);
       assert(nir->info.cs.local_size[1] % 2 == 0);
-   } else if (nir->info.cs.derivative_group == DERIVATIVE_GROUP_LINEAR) {
+   } else if (nir->info.cs.derivative_group == DERIVATIVE_GROUP_LINEAR &&
+              !nir->info.cs.local_size_variable) {
      assert(state.local_workgroup_size % 4 == 0);
    }
 
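
Note on the thread-count arithmetic: with a variable group size, cs_set_simd_size() sizes the thread count from max_variable_local_size, and brw_nir_lower_cs_intrinsics.c rebuilds the same round-up division in NIR at shader run time, since the real group size is only known at dispatch. The following is a minimal standalone sketch, not the Mesa code itself; cs_prog_data_sketch, set_simd_size() and the local DIV_ROUND_UP macro are simplified stand-ins for brw_cs_prog_data and Mesa's macro, shown only to illustrate that DIV_ROUND_UP(group_size, size) equals the (group_size + size - 1) / size expression the patch replaces.

/* Standalone sketch; struct and macro are stand-ins, not Mesa definitions. */
#include <stdbool.h>
#include <stdio.h>

#define DIV_ROUND_UP(n, d) (((n) + (d) - 1) / (d))

struct cs_prog_data_sketch {
   unsigned local_size[3];           /* fixed local size (unused if variable) */
   unsigned max_variable_local_size; /* upper bound when the size is variable */
   bool uses_variable_group_size;
   unsigned simd_size;
   unsigned threads;
};

static void
set_simd_size(struct cs_prog_data_sketch *d, unsigned size)
{
   d->simd_size = size;

   /* With a variable group size the actual extent is unknown at compile
    * time, so the thread count is sized for the declared maximum.
    */
   unsigned group_size;
   if (d->uses_variable_group_size)
      group_size = d->max_variable_local_size;
   else
      group_size = d->local_size[0] * d->local_size[1] * d->local_size[2];

   /* DIV_ROUND_UP(group_size, size) == (group_size + size - 1) / size */
   d->threads = DIV_ROUND_UP(group_size, size);
}

int
main(void)
{
   struct cs_prog_data_sketch fixed = {
      .local_size = { 8, 8, 1 },
      .uses_variable_group_size = false,
   };
   set_simd_size(&fixed, 16);
   printf("fixed 8x8x1 at SIMD16 -> %u threads\n", fixed.threads); /* 4 */

   struct cs_prog_data_sketch variable = {
      .max_variable_local_size = 1024,
      .uses_variable_group_size = true,
   };
   set_simd_size(&variable, 32);
   printf("variable max 1024 at SIMD32 -> %u threads\n", variable.threads); /* 32 */

   return 0;
}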