ir3: Support multiview in GS lowering

With GS+multiview, the VS will loop over each view in the shader while each GS invocation only corresponds to a single view. Varyings for each view will be stored next to each other in local memory. Implement view index calculations when lowering VS outputs/GS inputs. Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/40153>
2026-05-07 11:28:05 +02:00 · 2026-03-24 19:28:52 -04:00 · 2026-03-24 19:28:52 -04:00 · be84cb6211
commit be84cb6211
parent bc72ef2ee9
2 changed files with 50 additions and 3 deletions
--- a/src/freedreno/ir3/ir3_nir_lower_tess.c
+++ b/src/freedreno/ir3/ir3_nir_lower_tess.c
@ -16,6 +16,9 @@ struct state {
      unsigned stride;
   } map;

+   uint32_t view_mask;
+   unsigned view_count;
+
   nir_def *header;

   nir_variable *vertex_count_var;
@ -122,7 +125,8 @@ shader_io_get_unique_index(gl_varying_slot slot)

 static nir_def *
 build_local_offset(nir_builder *b, struct state *state, nir_def *vertex,
-                   uint32_t location, uint32_t comp, nir_def *offset)
+                   nir_def *view, uint32_t location, uint32_t comp,
+                   nir_def *offset)
 {
   nir_def *primitive_stride = nir_load_vs_primitive_stride_ir3(b);
   nir_def *primitive_offset =
@ -147,6 +151,9 @@ build_local_offset(nir_builder *b, struct state *state, nir_def *vertex,
      UNREACHABLE("bad shader stage");
   }

+   if (state->view_count > 1)
+      vertex = nir_iadd(b, nir_imul_imm(b, vertex, state->view_count), view);
+
   nir_def *vertex_offset = nir_imul24(b, vertex, vertex_stride);

   return nir_iadd(
@ -249,10 +256,17 @@ lower_block_to_explicit_output(nir_block *block, nir_builder *b,
         continue;

      nir_intrinsic_instr *intr = nir_instr_as_intrinsic(instr);
+      nir_def *view = NULL;

      switch (intr->intrinsic) {
+      case nir_intrinsic_store_per_view_output:
+         view = intr->src[1].ssa;
+         FALLTHROUGH;
      case nir_intrinsic_store_output: {
         // src[] = { value, offset }.
+         nir_def *intr_offset = intr->intrinsic ==
+            nir_intrinsic_store_per_view_output ? intr->src[2].ssa :
+            intr->src[1].ssa;

         /* nir_lower_io_vars_to_temporaries replaces all access to output
          * variables with temp variables and then emits a nir_copy_var at
@ -266,8 +280,9 @@ lower_block_to_explicit_output(nir_block *block, nir_builder *b,

         nir_def *vertex_id = build_vertex_id(b, state);
         nir_def *offset = build_local_offset(
-            b, state, vertex_id, nir_intrinsic_io_semantics(intr).location,
-            nir_intrinsic_component(intr), intr->src[1].ssa);
+            b, state, vertex_id, view,
+            nir_intrinsic_io_semantics(intr).location,
+            nir_intrinsic_component(intr), intr_offset);

         nir_store_shared_ir3(b, intr->src[0].ssa, offset);
         progress = true;
@ -295,6 +310,9 @@ ir3_nir_lower_to_explicit_output(nir_shader *shader,
 {
   struct state state = {};

+   state.view_mask = shader->info.view_mask;
+   state.view_count = MAX2(1, util_bitcount(shader->info.view_mask));
+
   build_primitive_map(shader, &state.map);
   memcpy(v->output_loc, state.map.loc, sizeof(v->output_loc));

@ -314,6 +332,7 @@ ir3_nir_lower_to_explicit_output(nir_shader *shader,
      progress |= lower_block_to_explicit_output(block, &b, &state);

   v->output_size = state.map.stride;
+   v->view_count = state.view_count;
   return nir_progress(progress, impl, nir_metadata_control_flow);
 }

@ -335,9 +354,29 @@ lower_block_to_explicit_input(nir_block *block, nir_builder *b,

         b->cursor = nir_before_instr(&intr->instr);

+         nir_def *view = NULL;
+         if (state->view_count > 1) {
+            view = nir_load_view_index(b);
+            /* nir_lower_multiview tightly packs the outputs, skipping over
+             * inactive views. This means we need to compute the tightly packed
+             * index from the original view_index unless the view mask is a
+             * contiguous run from bit 0 (i.e. a power of two minus one):
+             *
+             * mask = (1u << view) - 1
+             * packed_view = bitcount(mask & view_mask)
+             */
+            if (!util_is_power_of_two_or_zero(state->view_mask + 1)) {
+               nir_def *mask =
+                  nir_iadd_imm(b, nir_ishl(b, nir_imm_int(b, 1), view), -1);
+               view =
+                  nir_bit_count(b, nir_iand_imm(b, mask, state->view_mask));
+            }
+         }
+
         nir_def *offset = build_local_offset(
            b, state,
            intr->src[0].ssa, // this is typically gl_InvocationID
+            view,
            nir_intrinsic_io_semantics(intr).location,
            nir_intrinsic_component(intr), intr->src[1].ssa);

@ -370,6 +409,9 @@ ir3_nir_lower_to_explicit_input(nir_shader *shader,
 {
   struct state state = {};

+   state.view_mask = shader->info.view_mask;
+   state.view_count = MAX2(1, util_bitcount(shader->info.view_mask));
+
   /* when using stl/ldl (instead of stlw/ldlw) for linking VS and HS,
    * HS uses a different primitive id, which starts at bit 16 in the header
    */
--- a/src/freedreno/ir3/ir3_shader.h
+++ b/src/freedreno/ir3/ir3_shader.h
@ -771,6 +771,11 @@ struct ir3_shader_variant {
   /* Size in dwords of all outputs for VS, size of entire patch for HS. */
   uint32_t output_size;

+   /* For stages with output_size, the number of views. Outputs are replicated
+    * per view.
+    */
+   uint32_t view_count;
+
   /* Expected size of incoming output_loc for HS, DS, and GS */
   uint32_t input_size;