rusticl: lower huge grids

Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/27666>
Karol Herbst, 2023-12-20 02:05:06 +01:00 (committed by Marge Bot)
parent 204c287327
commit 91552bb4ec
4 changed files with 100 additions and 9 deletions
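
The gist of the change: when an enqueued grid needs more workgroups in some dimension than the driver reports via PIPE_COMPUTE_CAP_MAX_GRID_SIZE, the launch is now tiled into several smaller launches. Each partial launch gets its workgroup offset passed in through a new internal kernel argument (consumed by the lowered load_base_workgroup_id), while get_num_groups() keeps seeing the full grid via a separate NumWorkgroups argument. Below is a minimal, self-contained sketch of the tiling idea only; div_round_up, launch_tiled and the launch callback are illustrative names, not the rusticl API.

// Illustrative sketch (hypothetical helpers, not rusticl code):
// split a large grid into launches that each fit the device limit.
fn div_round_up(a: usize, b: usize) -> usize {
    (a + b - 1) / b
}

fn launch_tiled(grid: [usize; 3], hw_max: [usize; 3], mut launch: impl FnMut([usize; 3], [u32; 3])) {
    for z in 0..div_round_up(grid[2], hw_max[2]) {
        for y in 0..div_round_up(grid[1], hw_max[1]) {
            for x in 0..div_round_up(grid[0], hw_max[0]) {
                // base workgroup id for this partial launch
                let offset = [x * hw_max[0], y * hw_max[1], z * hw_max[2]];
                // per-launch size: the remaining workgroups, clamped to the limit
                let size = [
                    (grid[0] - offset[0]).min(hw_max[0]) as u32,
                    (grid[1] - offset[1]).min(hw_max[1]) as u32,
                    (grid[2] - offset[2]).min(hw_max[2]) as u32,
                ];
                launch(offset, size);
            }
        }
    }
}

fn main() {
    // e.g. 100000 workgroups in x on hardware capped at 65535 → two launches
    launch_tiled([100_000, 1, 1], [65_535, 65_535, 65_535], |offset, size| {
        println!("launch at {:?}, size {:?}", offset, size);
    });
}

In the real code below, the offset is written into the kernel's input buffer right before each partial launch and the buffer is re-uploaded with update_cb0, so the shader sees the correct base workgroup id.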

View file

@@ -892,6 +892,11 @@ impl Device {
v.into_iter().map(|v| v as usize).collect()
}
pub fn max_grid_size(&self) -> Vec<u64> {
self.screen
.compute_param(pipe_compute_cap::PIPE_COMPUTE_CAP_MAX_GRID_SIZE)
}
pub fn max_clock_freq(&self) -> cl_uint {
self.screen
.compute_param(pipe_compute_cap::PIPE_COMPUTE_CAP_MAX_CLOCK_FREQUENCY)

View file

@@ -2,6 +2,7 @@ use crate::api::icd::*;
use crate::core::device::*;
use crate::core::event::*;
use crate::core::memory::*;
use crate::core::platform::*;
use crate::core::program::*;
use crate::core::queue::*;
use crate::impl_cl_type_trait;
@@ -61,6 +62,8 @@ pub enum InternalKernelArgType {
FormatArray,
OrderArray,
WorkDim,
WorkGroupOffsets,
NumWorkgroups,
}
#[derive(Hash, PartialEq, Eq, Clone)]
@@ -221,6 +224,8 @@ impl InternalKernelArg {
InternalKernelArgType::FormatArray => bin.push(4),
InternalKernelArgType::OrderArray => bin.push(5),
InternalKernelArgType::WorkDim => bin.push(6),
InternalKernelArgType::WorkGroupOffsets => bin.push(7),
InternalKernelArgType::NumWorkgroups => bin.push(8),
}
bin
@@ -243,6 +248,8 @@ impl InternalKernelArg {
4 => InternalKernelArgType::FormatArray,
5 => InternalKernelArgType::OrderArray,
6 => InternalKernelArgType::WorkDim,
7 => InternalKernelArgType::WorkGroupOffsets,
8 => InternalKernelArgType::NumWorkgroups,
_ => return None,
};
@@ -531,6 +538,7 @@ fn lower_and_optimize_nir(
nir_pass!(nir, nir_lower_system_values);
let mut compute_options = nir_lower_compute_system_values_options::default();
compute_options.set_has_base_global_invocation_id(true);
compute_options.set_has_base_workgroup_id(true);
nir_pass!(nir, nir_lower_compute_system_values, &compute_options);
nir.gather_info();
@@ -549,6 +557,37 @@ fn lower_and_optimize_nir(
);
}
if nir.reads_sysval(gl_system_value::SYSTEM_VALUE_BASE_WORKGROUP_ID) {
internal_args.push(InternalKernelArg {
kind: InternalKernelArgType::WorkGroupOffsets,
offset: 0,
size: 3 * size_of::<usize>(),
});
lower_state.base_workgroup_id_loc = args.len() + internal_args.len() - 1;
nir.add_var(
nir_variable_mode::nir_var_uniform,
unsafe { glsl_vector_type(host_bits_base_type, 3) },
lower_state.base_workgroup_id_loc,
"base_workgroup_id",
);
}
if nir.reads_sysval(gl_system_value::SYSTEM_VALUE_NUM_WORKGROUPS) {
internal_args.push(InternalKernelArg {
kind: InternalKernelArgType::NumWorkgroups,
offset: 0,
size: 12,
});
lower_state.num_workgroups_loc = args.len() + internal_args.len() - 1;
nir.add_var(
nir_variable_mode::nir_var_uniform,
unsafe { glsl_vector_type(glsl_base_type::GLSL_TYPE_UINT, 3) },
lower_state.num_workgroups_loc,
"num_workgroups",
);
}
if nir.has_constant() {
internal_args.push(InternalKernelArg {
kind: InternalKernelArgType::ConstantBuffer,
@@ -906,6 +945,7 @@ impl Kernel {
let mut block = create_kernel_arr::<u32>(block, 1)?;
let mut grid = create_kernel_arr::<usize>(grid, 1)?;
let offsets = create_kernel_arr::<usize>(offsets, 0)?;
let mut workgroup_id_offset_loc = None;
let mut input: Vec<u8> = Vec::new();
let mut resource_info = Vec::new();
// Set it once so we get the alignment padding right
@@ -919,10 +959,12 @@ impl Kernel {
let mut tex_orders: Vec<u16> = Vec::new();
let mut img_formats: Vec<u16> = Vec::new();
let mut img_orders: Vec<u16> = Vec::new();
- let null_ptr: &[u8] = if q.device.address_bits() == 64 {
- &[0; 8]
let host_null_v3 = &[0u8; 3 * size_of::<usize>()];
let null_ptr = if q.device.address_bits() == 64 {
[0u8; 8].as_slice()
} else {
- &[0; 4]
[0u8; 4].as_slice()
};
self.optimize_local_size(q.device, &mut grid, &mut block);
@@ -1043,6 +1085,10 @@ impl Kernel {
InternalKernelArgType::GlobalWorkOffsets => {
input.extend_from_slice(unsafe { as_byte_slice(&offsets) });
}
InternalKernelArgType::WorkGroupOffsets => {
workgroup_id_offset_loc = Some(input.len());
input.extend_from_slice(host_null_v3);
}
InternalKernelArgType::PrintfBuffer => {
let buf = Arc::new(
q.device
@@ -1074,6 +1120,11 @@ impl Kernel {
InternalKernelArgType::WorkDim => {
input.extend_from_slice(&[work_dim as u8; 1]);
}
InternalKernelArgType::NumWorkgroups => {
input.extend_from_slice(unsafe {
as_byte_slice(&[grid[0] as u32, grid[1] as u32, grid[2] as u32])
});
}
}
}
@@ -1123,13 +1174,42 @@ impl Kernel {
ctx.set_global_binding(resources.as_slice(), &mut globals);
ctx.update_cb0(&input);
- let grid = [
- grid[0].try_into().ok().ok_or(CL_OUT_OF_HOST_MEMORY)?,
- grid[1].try_into().ok().ok_or(CL_OUT_OF_HOST_MEMORY)?,
- grid[2].try_into().ok().ok_or(CL_OUT_OF_HOST_MEMORY)?,
- ];
let hw_max_grid: Vec<usize> = q
.device
.max_grid_size()
.into_iter()
.map(|val| val.try_into().unwrap_or(usize::MAX))
// clamped as pipe_launch_grid::grid is only u32
.map(|val| cmp::min(val, u32::MAX as usize))
.collect();
- ctx.launch_grid(work_dim, block, grid, variable_local_size as u32);
for z in 0..div_round_up(grid[2], hw_max_grid[2]) {
for y in 0..div_round_up(grid[1], hw_max_grid[1]) {
for x in 0..div_round_up(grid[0], hw_max_grid[0]) {
if let Some(workgroup_id_offset_loc) = workgroup_id_offset_loc {
let this_offsets =
[x * hw_max_grid[0], y * hw_max_grid[1], z * hw_max_grid[2]];
input[workgroup_id_offset_loc
..workgroup_id_offset_loc + (size_of::<usize>() * 3)]
.copy_from_slice(unsafe { as_byte_slice(&this_offsets) });
}
let this_grid = [
cmp::min(hw_max_grid[0], grid[0] - hw_max_grid[0] * x) as u32,
cmp::min(hw_max_grid[1], grid[1] - hw_max_grid[1] * y) as u32,
cmp::min(hw_max_grid[2], grid[2] - hw_max_grid[2] * z) as u32,
];
ctx.update_cb0(&input);
ctx.launch_grid(work_dim, block, this_grid, variable_local_size as u32);
if Platform::dbg().sync_every_event {
ctx.flush().wait();
}
}
}
}
ctx.clear_global_binding(globals.len() as u32);
ctx.clear_shader_images(iviews.len() as u32);
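
Two details worth noting in the loop above: hw_max_grid is clamped to u32::MAX per dimension because, as the comment says, the grid passed to launch_grid is only u32, and since each this_grid entry is the minimum of hw_max_grid[i] and the remaining workgroups, it can never exceed u32::MAX, so the casts cannot truncate. When the sync_every_event debug option is set, every partial launch is additionally flushed and waited on before the next one is submitted.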

View file

@@ -65,6 +65,10 @@ rusticl_lower_intrinsics_instr(
return NULL;
case nir_intrinsic_load_base_global_invocation_id:
return nir_load_var(b, nir_find_variable_with_location(b->shader, nir_var_uniform, state->base_global_invoc_id_loc));
case nir_intrinsic_load_base_workgroup_id:
return nir_load_var(b, nir_find_variable_with_location(b->shader, nir_var_uniform, state->base_workgroup_id_loc));
case nir_intrinsic_load_num_workgroups:
return nir_load_var(b, nir_find_variable_with_location(b->shader, nir_var_uniform, state->num_workgroups_loc));
case nir_intrinsic_load_constant_base_ptr:
return nir_load_var(b, nir_find_variable_with_location(b->shader, nir_var_uniform, state->const_buf_loc));
case nir_intrinsic_load_printf_buffer_address:
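
On the NIR side, rusticl_lower_intrinsics now also rewrites load_base_workgroup_id and load_num_workgroups into loads from the uniform variables recorded in rusticl_lower_state (base_workgroup_id_loc and num_workgroups_loc). Those are the variables lower_and_optimize_nir added above, so the per-launch workgroup offset and the full workgroup count reach the shader through the same input buffer (cb0) as the other internal kernel arguments.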

View file

@@ -2,11 +2,13 @@
struct rusticl_lower_state {
size_t base_global_invoc_id_loc;
size_t base_workgroup_id_loc;
size_t const_buf_loc;
size_t printf_buf_loc;
size_t format_arr_loc;
size_t order_arr_loc;
size_t work_dim_loc;
size_t num_workgroups_loc;
};
bool rusticl_lower_intrinsics(nir_shader *nir, struct rusticl_lower_state *state);