diff --git a/src/gallium/frontends/rusticl/core/device.rs b/src/gallium/frontends/rusticl/core/device.rs
index 8cd48ed0db8..e4662c25d14 100644
--- a/src/gallium/frontends/rusticl/core/device.rs
+++ b/src/gallium/frontends/rusticl/core/device.rs
@@ -892,6 +892,11 @@ impl Device {
         v.into_iter().map(|v| v as usize).collect()
     }
 
+    pub fn max_grid_size(&self) -> Vec<u64> {
+        self.screen
+            .compute_param(pipe_compute_cap::PIPE_COMPUTE_CAP_MAX_GRID_SIZE)
+    }
+
     pub fn max_clock_freq(&self) -> cl_uint {
         self.screen
             .compute_param(pipe_compute_cap::PIPE_COMPUTE_CAP_MAX_CLOCK_FREQUENCY)
diff --git a/src/gallium/frontends/rusticl/core/kernel.rs b/src/gallium/frontends/rusticl/core/kernel.rs
index 6865ebec658..edd4060479f 100644
--- a/src/gallium/frontends/rusticl/core/kernel.rs
+++ b/src/gallium/frontends/rusticl/core/kernel.rs
@@ -2,6 +2,7 @@ use crate::api::icd::*;
 use crate::core::device::*;
 use crate::core::event::*;
 use crate::core::memory::*;
+use crate::core::platform::*;
 use crate::core::program::*;
 use crate::core::queue::*;
 use crate::impl_cl_type_trait;
@@ -61,6 +62,8 @@ pub enum InternalKernelArgType {
     FormatArray,
     OrderArray,
     WorkDim,
+    WorkGroupOffsets,
+    NumWorkgroups,
 }
 
 #[derive(Hash, PartialEq, Eq, Clone)]
@@ -221,6 +224,8 @@ impl InternalKernelArg {
             InternalKernelArgType::FormatArray => bin.push(4),
             InternalKernelArgType::OrderArray => bin.push(5),
             InternalKernelArgType::WorkDim => bin.push(6),
+            InternalKernelArgType::WorkGroupOffsets => bin.push(7),
+            InternalKernelArgType::NumWorkgroups => bin.push(8),
         }
 
         bin
@@ -243,6 +248,8 @@
             4 => InternalKernelArgType::FormatArray,
             5 => InternalKernelArgType::OrderArray,
             6 => InternalKernelArgType::WorkDim,
+            7 => InternalKernelArgType::WorkGroupOffsets,
+            8 => InternalKernelArgType::NumWorkgroups,
             _ => return None,
         };
 
@@ -531,6 +538,7 @@ fn lower_and_optimize_nir(
     nir_pass!(nir, nir_lower_system_values);
     let mut compute_options = nir_lower_compute_system_values_options::default();
     compute_options.set_has_base_global_invocation_id(true);
+    compute_options.set_has_base_workgroup_id(true);
     nir_pass!(nir, nir_lower_compute_system_values, &compute_options);
     nir.gather_info();
 
@@ -549,6 +557,37 @@ fn lower_and_optimize_nir(
         );
     }
 
+    if nir.reads_sysval(gl_system_value::SYSTEM_VALUE_BASE_WORKGROUP_ID) {
+        internal_args.push(InternalKernelArg {
+            kind: InternalKernelArgType::WorkGroupOffsets,
+            offset: 0,
+            size: 3 * size_of::<usize>(),
+        });
+        lower_state.base_workgroup_id_loc = args.len() + internal_args.len() - 1;
+        nir.add_var(
+            nir_variable_mode::nir_var_uniform,
+            unsafe { glsl_vector_type(host_bits_base_type, 3) },
+            lower_state.base_workgroup_id_loc,
+            "base_workgroup_id",
+        );
+    }
+
+    if nir.reads_sysval(gl_system_value::SYSTEM_VALUE_NUM_WORKGROUPS) {
+        internal_args.push(InternalKernelArg {
+            kind: InternalKernelArgType::NumWorkgroups,
+            offset: 0,
+            size: 12,
+        });
+
+        lower_state.num_workgroups_loc = args.len() + internal_args.len() - 1;
+        nir.add_var(
+            nir_variable_mode::nir_var_uniform,
+            unsafe { glsl_vector_type(glsl_base_type::GLSL_TYPE_UINT, 3) },
+            lower_state.num_workgroups_loc,
+            "num_workgroups",
+        );
+    }
+
     if nir.has_constant() {
         internal_args.push(InternalKernelArg {
             kind: InternalKernelArgType::ConstantBuffer,
@@ -906,6 +945,7 @@ impl Kernel {
         let mut block = create_kernel_arr::<u32>(block, 1)?;
         let mut grid = create_kernel_arr::<usize>(grid, 1)?;
         let offsets = create_kernel_arr::<usize>(offsets, 0)?;
+        let mut workgroup_id_offset_loc = None;
         let mut input: Vec<u8> = Vec::new();
         let mut resource_info = Vec::new();
         // Set it once so we get the alignment padding right
@@ -919,10 +959,12 @@ impl Kernel {
         let mut tex_orders: Vec<u16> = Vec::new();
         let mut img_formats: Vec<u16> = Vec::new();
         let mut img_orders: Vec<u16> = Vec::new();
-        let null_ptr: &[u8] = if q.device.address_bits() == 64 {
-            &[0; 8]
+
+        let host_null_v3 = &[0u8; 3 * size_of::<usize>()];
+        let null_ptr = if q.device.address_bits() == 64 {
+            [0u8; 8].as_slice()
         } else {
-            &[0; 4]
+            [0u8; 4].as_slice()
         };
 
         self.optimize_local_size(q.device, &mut grid, &mut block);
@@ -1043,6 +1085,10 @@ impl Kernel {
                 InternalKernelArgType::GlobalWorkOffsets => {
                     input.extend_from_slice(unsafe { as_byte_slice(&offsets) });
                 }
+                InternalKernelArgType::WorkGroupOffsets => {
+                    workgroup_id_offset_loc = Some(input.len());
+                    input.extend_from_slice(host_null_v3);
+                }
                 InternalKernelArgType::PrintfBuffer => {
                     let buf = Arc::new(
                         q.device
@@ -1074,6 +1120,11 @@ impl Kernel {
                 InternalKernelArgType::WorkDim => {
                     input.extend_from_slice(&[work_dim as u8; 1]);
                 }
+                InternalKernelArgType::NumWorkgroups => {
+                    input.extend_from_slice(unsafe {
+                        as_byte_slice(&[grid[0] as u32, grid[1] as u32, grid[2] as u32])
+                    });
+                }
             }
         }
 
@@ -1123,13 +1174,42 @@ impl Kernel {
         ctx.set_global_binding(resources.as_slice(), &mut globals);
         ctx.update_cb0(&input);
 
-        let grid = [
-            grid[0].try_into().ok().ok_or(CL_OUT_OF_HOST_MEMORY)?,
-            grid[1].try_into().ok().ok_or(CL_OUT_OF_HOST_MEMORY)?,
-            grid[2].try_into().ok().ok_or(CL_OUT_OF_HOST_MEMORY)?,
-        ];
+        let hw_max_grid: Vec<usize> = q
+            .device
+            .max_grid_size()
+            .into_iter()
+            .map(|val| val.try_into().unwrap_or(usize::MAX))
+            // clamped as pipe_launch_grid::grid is only u32
+            .map(|val| cmp::min(val, u32::MAX as usize))
+            .collect();
 
-        ctx.launch_grid(work_dim, block, grid, variable_local_size as u32);
+        for z in 0..div_round_up(grid[2], hw_max_grid[2]) {
+            for y in 0..div_round_up(grid[1], hw_max_grid[1]) {
+                for x in 0..div_round_up(grid[0], hw_max_grid[0]) {
+                    if let Some(workgroup_id_offset_loc) = workgroup_id_offset_loc {
+                        let this_offsets =
+                            [x * hw_max_grid[0], y * hw_max_grid[1], z * hw_max_grid[2]];
+
+                        input[workgroup_id_offset_loc
+                            ..workgroup_id_offset_loc + (size_of::<usize>() * 3)]
+                            .copy_from_slice(unsafe { as_byte_slice(&this_offsets) });
+                    }
+
+                    let this_grid = [
+                        cmp::min(hw_max_grid[0], grid[0] - hw_max_grid[0] * x) as u32,
+                        cmp::min(hw_max_grid[1], grid[1] - hw_max_grid[1] * y) as u32,
+                        cmp::min(hw_max_grid[2], grid[2] - hw_max_grid[2] * z) as u32,
+                    ];
+
+                    ctx.update_cb0(&input);
+                    ctx.launch_grid(work_dim, block, this_grid, variable_local_size as u32);
+
+                    if Platform::dbg().sync_every_event {
+                        ctx.flush().wait();
+                    }
+                }
+            }
+        }
 
         ctx.clear_global_binding(globals.len() as u32);
         ctx.clear_shader_images(iviews.len() as u32);
diff --git a/src/gallium/frontends/rusticl/rusticl_nir.c b/src/gallium/frontends/rusticl/rusticl_nir.c
index 083540fad25..17a341cf3e0 100644
--- a/src/gallium/frontends/rusticl/rusticl_nir.c
+++ b/src/gallium/frontends/rusticl/rusticl_nir.c
@@ -65,6 +65,10 @@ rusticl_lower_intrinsics_instr(
       return NULL;
    case nir_intrinsic_load_base_global_invocation_id:
      return nir_load_var(b, nir_find_variable_with_location(b->shader, nir_var_uniform, state->base_global_invoc_id_loc));
+   case nir_intrinsic_load_base_workgroup_id:
+     return nir_load_var(b, nir_find_variable_with_location(b->shader, nir_var_uniform, state->base_workgroup_id_loc));
+   case nir_intrinsic_load_num_workgroups:
+     return nir_load_var(b, nir_find_variable_with_location(b->shader, nir_var_uniform, state->num_workgroups_loc));
    case nir_intrinsic_load_constant_base_ptr:
      return nir_load_var(b, nir_find_variable_with_location(b->shader, nir_var_uniform, state->const_buf_loc));
    case nir_intrinsic_load_printf_buffer_address:
diff --git a/src/gallium/frontends/rusticl/rusticl_nir.h b/src/gallium/frontends/rusticl/rusticl_nir.h
index 44778027e9a..48707e4f18c 100644
--- a/src/gallium/frontends/rusticl/rusticl_nir.h
+++ b/src/gallium/frontends/rusticl/rusticl_nir.h
@@ -2,11 +2,13 @@
 struct rusticl_lower_state {
    size_t base_global_invoc_id_loc;
+   size_t base_workgroup_id_loc;
    size_t const_buf_loc;
    size_t printf_buf_loc;
    size_t format_arr_loc;
    size_t order_arr_loc;
    size_t work_dim_loc;
+   size_t num_workgroups_loc;
 };
 
 bool rusticl_lower_intrinsics(nir_shader *nir,
                               struct rusticl_lower_state *state);