rusticl: lower huge grids

Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/27666>
Karol Herbst, 2023-12-20 02:05:06 +01:00 (committed by Marge Bot)
parent 204c287327
commit 91552bb4ec
4 changed files with 100 additions and 9 deletions
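
The gist of the change: when an enqueued grid needs more workgroups in some dimension than the driver reports via PIPE_COMPUTE_CAP_MAX_GRID_SIZE, the launch is now tiled into several smaller launches. Each partial launch gets its workgroup offset passed in through a new internal kernel argument (consumed by the lowered load_base_workgroup_id), while get_num_groups() keeps seeing the full grid via a separate NumWorkgroups argument. Below is a minimal, self-contained sketch of the tiling idea only; div_round_up, launch_tiled and the launch callback are illustrative names, not the rusticl API.

// Illustrative sketch (hypothetical helpers, not rusticl code):
// split a large grid into launches that each fit the device limit.
fn div_round_up(a: usize, b: usize) -> usize {
    (a + b - 1) / b
}

fn launch_tiled(grid: [usize; 3], hw_max: [usize; 3], mut launch: impl FnMut([usize; 3], [u32; 3])) {
    for z in 0..div_round_up(grid[2], hw_max[2]) {
        for y in 0..div_round_up(grid[1], hw_max[1]) {
            for x in 0..div_round_up(grid[0], hw_max[0]) {
                // base workgroup id for this partial launch
                let offset = [x * hw_max[0], y * hw_max[1], z * hw_max[2]];
                // per-launch size: the remaining workgroups, clamped to the limit
                let size = [
                    (grid[0] - offset[0]).min(hw_max[0]) as u32,
                    (grid[1] - offset[1]).min(hw_max[1]) as u32,
                    (grid[2] - offset[2]).min(hw_max[2]) as u32,
                ];
                launch(offset, size);
            }
        }
    }
}

fn main() {
    // e.g. 100000 workgroups in x on hardware capped at 65535 → two launches
    launch_tiled([100_000, 1, 1], [65_535, 65_535, 65_535], |offset, size| {
        println!("launch at {:?}, size {:?}", offset, size);
    });
}

In the real code below, the offset is written into the kernel's input buffer right before each partial launch and the buffer is re-uploaded with update_cb0, so the shader sees the correct base workgroup id.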

View file

@@ -892,6 +892,11 @@ impl Device {
v.into_iter().map(|v| v as usize).collect()
}
pub fn max_grid_size(&self) -> Vec<u64> {
self.screen
.compute_param(pipe_compute_cap::PIPE_COMPUTE_CAP_MAX_GRID_SIZE)
}
pub fn max_clock_freq(&self) -> cl_uint {
self.screen
.compute_param(pipe_compute_cap::PIPE_COMPUTE_CAP_MAX_CLOCK_FREQUENCY)

View file

@@ -2,6 +2,7 @@ use crate::api::icd::*;
use crate::core::device::*;
use crate::core::event::*;
use crate::core::memory::*;
use crate::core::platform::*;
use crate::core::program::*;
use crate::core::queue::*;
use crate::impl_cl_type_trait;
@@ -61,6 +62,8 @@ pub enum InternalKernelArgType {
FormatArray,
OrderArray,
WorkDim,
WorkGroupOffsets,
NumWorkgroups,
}
#[derive(Hash, PartialEq, Eq, Clone)]
@@ -221,6 +224,8 @@ impl InternalKernelArg {
InternalKernelArgType::FormatArray => bin.push(4),
InternalKernelArgType::OrderArray => bin.push(5),
InternalKernelArgType::WorkDim => bin.push(6),
InternalKernelArgType::WorkGroupOffsets => bin.push(7),
InternalKernelArgType::NumWorkgroups => bin.push(8),
}
bin
@@ -243,6 +248,8 @@ impl InternalKernelArg {
4 => InternalKernelArgType::FormatArray,
5 => InternalKernelArgType::OrderArray,
6 => InternalKernelArgType::WorkDim,
7 => InternalKernelArgType::WorkGroupOffsets,
8 => InternalKernelArgType::NumWorkgroups,
_ => return None,
};
@@ -531,6 +538,7 @@ fn lower_and_optimize_nir(
nir_pass!(nir, nir_lower_system_values);
let mut compute_options = nir_lower_compute_system_values_options::default();
compute_options.set_has_base_global_invocation_id(true);
compute_options.set_has_base_workgroup_id(true);
nir_pass!(nir, nir_lower_compute_system_values, &compute_options);
nir.gather_info();
@@ -549,6 +557,37 @@ fn lower_and_optimize_nir(
);
}
if nir.reads_sysval(gl_system_value::SYSTEM_VALUE_BASE_WORKGROUP_ID) {
internal_args.push(InternalKernelArg {
kind: InternalKernelArgType::WorkGroupOffsets,
offset: 0,
size: 3 * size_of::<usize>(),
});
lower_state.base_workgroup_id_loc = args.len() + internal_args.len() - 1;
nir.add_var(
nir_variable_mode::nir_var_uniform,
unsafe { glsl_vector_type(host_bits_base_type, 3) },
lower_state.base_workgroup_id_loc,
"base_workgroup_id",
);
}
if nir.reads_sysval(gl_system_value::SYSTEM_VALUE_NUM_WORKGROUPS) {
internal_args.push(InternalKernelArg {
kind: InternalKernelArgType::NumWorkgroups,
offset: 0,
size: 12,
});
lower_state.num_workgroups_loc = args.len() + internal_args.len() - 1;
nir.add_var(
nir_variable_mode::nir_var_uniform,
unsafe { glsl_vector_type(glsl_base_type::GLSL_TYPE_UINT, 3) },
lower_state.num_workgroups_loc,
"num_workgroups",
);
}
if nir.has_constant() {
internal_args.push(InternalKernelArg {
kind: InternalKernelArgType::ConstantBuffer,
@@ -906,6 +945,7 @@ impl Kernel {
let mut block = create_kernel_arr::<u32>(block, 1)?;
let mut grid = create_kernel_arr::<usize>(grid, 1)?;
let offsets = create_kernel_arr::<usize>(offsets, 0)?;
let mut workgroup_id_offset_loc = None;
let mut input: Vec<u8> = Vec::new();
let mut resource_info = Vec::new();
// Set it once so we get the alignment padding right
@@ -919,10 +959,12 @@ impl Kernel {
let mut tex_orders: Vec<u16> = Vec::new();
let mut img_formats: Vec<u16> = Vec::new();
let mut img_orders: Vec<u16> = Vec::new();
- let null_ptr: &[u8] = if q.device.address_bits() == 64 {
- &[0; 8]
let host_null_v3 = &[0u8; 3 * size_of::<usize>()];
let null_ptr = if q.device.address_bits() == 64 {
[0u8; 8].as_slice()
} else {
- &[0; 4]
[0u8; 4].as_slice()
};
self.optimize_local_size(q.device, &mut grid, &mut block);
@@ -1043,6 +1085,10 @@ impl Kernel {
InternalKernelArgType::GlobalWorkOffsets => {
input.extend_from_slice(unsafe { as_byte_slice(&offsets) });
}
InternalKernelArgType::WorkGroupOffsets => {
workgroup_id_offset_loc = Some(input.len());
input.extend_from_slice(host_null_v3);
}
InternalKernelArgType::PrintfBuffer => {
let buf = Arc::new(
q.device
@@ -1074,6 +1120,11 @@ impl Kernel {
InternalKernelArgType::WorkDim => {
input.extend_from_slice(&[work_dim as u8; 1]);
}
InternalKernelArgType::NumWorkgroups => {
input.extend_from_slice(unsafe {
as_byte_slice(&[grid[0] as u32, grid[1] as u32, grid[2] as u32])
});
}
}
}
@@ -1123,13 +1174,42 @@ impl Kernel {
ctx.set_global_binding(resources.as_slice(), &mut globals);
ctx.update_cb0(&input);
- let grid = [
- grid[0].try_into().ok().ok_or(CL_OUT_OF_HOST_MEMORY)?,
- grid[1].try_into().ok().ok_or(CL_OUT_OF_HOST_MEMORY)?,
- grid[2].try_into().ok().ok_or(CL_OUT_OF_HOST_MEMORY)?,
- ];
let hw_max_grid: Vec<usize> = q
.device
.max_grid_size()
.into_iter()
.map(|val| val.try_into().unwrap_or(usize::MAX))
// clamped as pipe_launch_grid::grid is only u32
.map(|val| cmp::min(val, u32::MAX as usize))
.collect();
- ctx.launch_grid(work_dim, block, grid, variable_local_size as u32);
for z in 0..div_round_up(grid[2], hw_max_grid[2]) {
for y in 0..div_round_up(grid[1], hw_max_grid[1]) {
for x in 0..div_round_up(grid[0], hw_max_grid[0]) {
if let Some(workgroup_id_offset_loc) = workgroup_id_offset_loc {
let this_offsets =
[x * hw_max_grid[0], y * hw_max_grid[1], z * hw_max_grid[2]];
input[workgroup_id_offset_loc
..workgroup_id_offset_loc + (size_of::<usize>() * 3)]
.copy_from_slice(unsafe { as_byte_slice(&this_offsets) });
}
let this_grid = [
cmp::min(hw_max_grid[0], grid[0] - hw_max_grid[0] * x) as u32,
cmp::min(hw_max_grid[1], grid[1] - hw_max_grid[1] * y) as u32,
cmp::min(hw_max_grid[2], grid[2] - hw_max_grid[2] * z) as u32,
];
ctx.update_cb0(&input);
ctx.launch_grid(work_dim, block, this_grid, variable_local_size as u32);
if Platform::dbg().sync_every_event {
ctx.flush().wait();
}
}
}
}
ctx.clear_global_binding(globals.len() as u32);
ctx.clear_shader_images(iviews.len() as u32);
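
Two details worth noting in the loop above: hw_max_grid is clamped to u32::MAX per dimension because, as the comment says, the grid passed to launch_grid is only u32, and since each this_grid entry is the minimum of hw_max_grid[i] and the remaining workgroups, it can never exceed u32::MAX, so the casts cannot truncate. When the sync_every_event debug option is set, every partial launch is additionally flushed and waited on before the next one is submitted.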

View file

@@ -65,6 +65,10 @@ rusticl_lower_intrinsics_instr(
return NULL;
case nir_intrinsic_load_base_global_invocation_id:
return nir_load_var(b, nir_find_variable_with_location(b->shader, nir_var_uniform, state->base_global_invoc_id_loc));
case nir_intrinsic_load_base_workgroup_id:
return nir_load_var(b, nir_find_variable_with_location(b->shader, nir_var_uniform, state->base_workgroup_id_loc));
case nir_intrinsic_load_num_workgroups:
return nir_load_var(b, nir_find_variable_with_location(b->shader, nir_var_uniform, state->num_workgroups_loc));
case nir_intrinsic_load_constant_base_ptr:
return nir_load_var(b, nir_find_variable_with_location(b->shader, nir_var_uniform, state->const_buf_loc));
case nir_intrinsic_load_printf_buffer_address:
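
On the NIR side, rusticl_lower_intrinsics now also rewrites load_base_workgroup_id and load_num_workgroups into loads from the uniform variables recorded in rusticl_lower_state (base_workgroup_id_loc and num_workgroups_loc). Those are the variables lower_and_optimize_nir added above, so the per-launch workgroup offset and the full workgroup count reach the shader through the same input buffer (cb0) as the other internal kernel arguments.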

View file

@@ -2,11 +2,13 @@
struct rusticl_lower_state {
size_t base_global_invoc_id_loc;
size_t base_workgroup_id_loc;
size_t const_buf_loc;
size_t printf_buf_loc;
size_t format_arr_loc;
size_t order_arr_loc;
size_t work_dim_loc;
size_t num_workgroups_loc;
};
bool rusticl_lower_intrinsics(nir_shader *nir, struct rusticl_lower_state *state);