diff --git a/src/gallium/frontends/rusticl/core/device.rs b/src/gallium/frontends/rusticl/core/device.rs
index 8cd48ed0db8..e4662c25d14 100644
--- a/src/gallium/frontends/rusticl/core/device.rs
+++ b/src/gallium/frontends/rusticl/core/device.rs
@@ -892,6 +892,11 @@ impl Device {
         v.into_iter().map(|v| v as usize).collect()
     }
 
+    pub fn max_grid_size(&self) -> Vec<u64> {
+        self.screen
+            .compute_param(pipe_compute_cap::PIPE_COMPUTE_CAP_MAX_GRID_SIZE)
+    }
+
     pub fn max_clock_freq(&self) -> cl_uint {
         self.screen
             .compute_param(pipe_compute_cap::PIPE_COMPUTE_CAP_MAX_CLOCK_FREQUENCY)
diff --git a/src/gallium/frontends/rusticl/core/kernel.rs b/src/gallium/frontends/rusticl/core/kernel.rs
index 6865ebec658..edd4060479f 100644
--- a/src/gallium/frontends/rusticl/core/kernel.rs
+++ b/src/gallium/frontends/rusticl/core/kernel.rs
@@ -2,6 +2,7 @@ use crate::api::icd::*;
 use crate::core::device::*;
 use crate::core::event::*;
 use crate::core::memory::*;
+use crate::core::platform::*;
 use crate::core::program::*;
 use crate::core::queue::*;
 use crate::impl_cl_type_trait;
@@ -61,6 +62,8 @@ pub enum InternalKernelArgType {
     FormatArray,
     OrderArray,
     WorkDim,
+    WorkGroupOffsets,
+    NumWorkgroups,
 }
 
 #[derive(Hash, PartialEq, Eq, Clone)]
@@ -221,6 +224,8 @@ impl InternalKernelArg {
             InternalKernelArgType::FormatArray => bin.push(4),
             InternalKernelArgType::OrderArray => bin.push(5),
             InternalKernelArgType::WorkDim => bin.push(6),
+            InternalKernelArgType::WorkGroupOffsets => bin.push(7),
+            InternalKernelArgType::NumWorkgroups => bin.push(8),
         }
 
         bin
@@ -243,6 +248,8 @@
             4 => InternalKernelArgType::FormatArray,
             5 => InternalKernelArgType::OrderArray,
             6 => InternalKernelArgType::WorkDim,
+            7 => InternalKernelArgType::WorkGroupOffsets,
+            8 => InternalKernelArgType::NumWorkgroups,
             _ => return None,
         };
 
@@ -531,6 +538,7 @@ fn lower_and_optimize_nir(
     nir_pass!(nir, nir_lower_system_values);
     let mut compute_options = nir_lower_compute_system_values_options::default();
     compute_options.set_has_base_global_invocation_id(true);
+    compute_options.set_has_base_workgroup_id(true);
     nir_pass!(nir, nir_lower_compute_system_values, &compute_options);
     nir.gather_info();
 
@@ -549,6 +557,37 @@ fn lower_and_optimize_nir(
         );
     }
 
+    if nir.reads_sysval(gl_system_value::SYSTEM_VALUE_BASE_WORKGROUP_ID) {
+        internal_args.push(InternalKernelArg {
+            kind: InternalKernelArgType::WorkGroupOffsets,
+            offset: 0,
+            size: 3 * size_of::<usize>(),
+        });
+        lower_state.base_workgroup_id_loc = args.len() + internal_args.len() - 1;
+        nir.add_var(
+            nir_variable_mode::nir_var_uniform,
+            unsafe { glsl_vector_type(host_bits_base_type, 3) },
+            lower_state.base_workgroup_id_loc,
+            "base_workgroup_id",
+        );
+    }
+
+    if nir.reads_sysval(gl_system_value::SYSTEM_VALUE_NUM_WORKGROUPS) {
+        internal_args.push(InternalKernelArg {
+            kind: InternalKernelArgType::NumWorkgroups,
+            offset: 0,
+            size: 12,
+        });
+
+        lower_state.num_workgroups_loc = args.len() + internal_args.len() - 1;
+        nir.add_var(
+            nir_variable_mode::nir_var_uniform,
+            unsafe { glsl_vector_type(glsl_base_type::GLSL_TYPE_UINT, 3) },
+            lower_state.num_workgroups_loc,
+            "num_workgroups",
+        );
+    }
+
     if nir.has_constant() {
         internal_args.push(InternalKernelArg {
             kind: InternalKernelArgType::ConstantBuffer,
@@ -906,6 +945,7 @@ impl Kernel {
         let mut block = create_kernel_arr::<u32>(block, 1)?;
         let mut grid = create_kernel_arr::<usize>(grid, 1)?;
         let offsets = create_kernel_arr::<usize>(offsets, 0)?;
+        let mut workgroup_id_offset_loc = None;
         let mut input: Vec<u8> = Vec::new();
         let mut resource_info = Vec::new();
         // Set it once so we get the alignment padding right
@@ -919,10 +959,12 @@ impl Kernel {
         let mut tex_orders: Vec<u16> = Vec::new();
         let mut img_formats: Vec<u16> = Vec::new();
         let mut img_orders: Vec<u16> = Vec::new();
-        let null_ptr: &[u8] = if q.device.address_bits() == 64 {
-            &[0; 8]
+
+        let host_null_v3 = &[0u8; 3 * size_of::<usize>()];
+        let null_ptr = if q.device.address_bits() == 64 {
+            [0u8; 8].as_slice()
         } else {
-            &[0; 4]
+            [0u8; 4].as_slice()
         };
 
         self.optimize_local_size(q.device, &mut grid, &mut block);
@@ -1043,6 +1085,10 @@ impl Kernel {
                 InternalKernelArgType::GlobalWorkOffsets => {
                     input.extend_from_slice(unsafe { as_byte_slice(&offsets) });
                 }
+                InternalKernelArgType::WorkGroupOffsets => {
+                    workgroup_id_offset_loc = Some(input.len());
+                    input.extend_from_slice(host_null_v3);
+                }
                 InternalKernelArgType::PrintfBuffer => {
                     let buf = Arc::new(
                         q.device
@@ -1074,6 +1120,11 @@ impl Kernel {
                 InternalKernelArgType::WorkDim => {
                     input.extend_from_slice(&[work_dim as u8; 1]);
                 }
+                InternalKernelArgType::NumWorkgroups => {
+                    input.extend_from_slice(unsafe {
+                        as_byte_slice(&[grid[0] as u32, grid[1] as u32, grid[2] as u32])
+                    });
+                }
             }
         }
 
@@ -1123,13 +1174,42 @@ impl Kernel {
         ctx.set_global_binding(resources.as_slice(), &mut globals);
         ctx.update_cb0(&input);
 
-        let grid = [
-            grid[0].try_into().ok().ok_or(CL_OUT_OF_HOST_MEMORY)?,
-            grid[1].try_into().ok().ok_or(CL_OUT_OF_HOST_MEMORY)?,
-            grid[2].try_into().ok().ok_or(CL_OUT_OF_HOST_MEMORY)?,
-        ];
+        let hw_max_grid: Vec<usize> = q
+            .device
+            .max_grid_size()
+            .into_iter()
+            .map(|val| val.try_into().unwrap_or(usize::MAX))
+            // clamped as pipe_launch_grid::grid is only u32
+            .map(|val| cmp::min(val, u32::MAX as usize))
+            .collect();
 
-        ctx.launch_grid(work_dim, block, grid, variable_local_size as u32);
+        for z in 0..div_round_up(grid[2], hw_max_grid[2]) {
+            for y in 0..div_round_up(grid[1], hw_max_grid[1]) {
+                for x in 0..div_round_up(grid[0], hw_max_grid[0]) {
+                    if let Some(workgroup_id_offset_loc) = workgroup_id_offset_loc {
+                        let this_offsets =
+                            [x * hw_max_grid[0], y * hw_max_grid[1], z * hw_max_grid[2]];
+
+                        input[workgroup_id_offset_loc
+                            ..workgroup_id_offset_loc + (size_of::<usize>() * 3)]
+                            .copy_from_slice(unsafe { as_byte_slice(&this_offsets) });
+                    }
+
+                    let this_grid = [
+                        cmp::min(hw_max_grid[0], grid[0] - hw_max_grid[0] * x) as u32,
+                        cmp::min(hw_max_grid[1], grid[1] - hw_max_grid[1] * y) as u32,
+                        cmp::min(hw_max_grid[2], grid[2] - hw_max_grid[2] * z) as u32,
+                    ];
+
+                    ctx.update_cb0(&input);
+                    ctx.launch_grid(work_dim, block, this_grid, variable_local_size as u32);
+
+                    if Platform::dbg().sync_every_event {
+                        ctx.flush().wait();
+                    }
+                }
+            }
+        }
 
         ctx.clear_global_binding(globals.len() as u32);
         ctx.clear_shader_images(iviews.len() as u32);
diff --git a/src/gallium/frontends/rusticl/rusticl_nir.c b/src/gallium/frontends/rusticl/rusticl_nir.c
index 083540fad25..17a341cf3e0 100644
--- a/src/gallium/frontends/rusticl/rusticl_nir.c
+++ b/src/gallium/frontends/rusticl/rusticl_nir.c
@@ -65,6 +65,10 @@ rusticl_lower_intrinsics_instr(
       return NULL;
    case nir_intrinsic_load_base_global_invocation_id:
      return nir_load_var(b, nir_find_variable_with_location(b->shader, nir_var_uniform, state->base_global_invoc_id_loc));
+   case nir_intrinsic_load_base_workgroup_id:
+     return nir_load_var(b, nir_find_variable_with_location(b->shader, nir_var_uniform, state->base_workgroup_id_loc));
+   case nir_intrinsic_load_num_workgroups:
+     return nir_load_var(b, nir_find_variable_with_location(b->shader, nir_var_uniform, state->num_workgroups_loc));
    case nir_intrinsic_load_constant_base_ptr:
      return nir_load_var(b, nir_find_variable_with_location(b->shader, nir_var_uniform, state->const_buf_loc));
    case nir_intrinsic_load_printf_buffer_address:
diff --git a/src/gallium/frontends/rusticl/rusticl_nir.h b/src/gallium/frontends/rusticl/rusticl_nir.h
index 44778027e9a..48707e4f18c 100644
--- a/src/gallium/frontends/rusticl/rusticl_nir.h
+++ b/src/gallium/frontends/rusticl/rusticl_nir.h
@@ -2,11 +2,13 @@
 struct rusticl_lower_state {
    size_t base_global_invoc_id_loc;
+   size_t base_workgroup_id_loc;
    size_t const_buf_loc;
    size_t printf_buf_loc;
    size_t format_arr_loc;
    size_t order_arr_loc;
    size_t work_dim_loc;
+   size_t num_workgroups_loc;
 };
 
 bool rusticl_lower_intrinsics(nir_shader *nir,
                               struct rusticl_lower_state *state);