From 448b5e0225f92c19d6d2721589c5c87e65bdf003 Mon Sep 17 00:00:00 2001
From: Benjamin Lee <benjamin.lee@collabora.com>
Date: Wed, 16 Oct 2024 22:27:25 -0700
Subject: [PATCH] panvk: implement multiview support

In Valhall multiview, position/varying shaders are invoked once per
draw. Each invocation writes separate outputs for all views. Fragment
processing is handled by the existing multilayer support. Note that
because the hardware only supports up to 8 views, we don't have to care
about the case where there are too many layers to fit in one tiler
descriptor when multiview is enabled.

Signed-off-by: Benjamin Lee <benjamin.lee@collabora.com>
Reviewed-by: Boris Brezillon <boris.brezillon@collabora.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/31704>
---
 src/panfrost/compiler/bifrost_compile.c       | 30 ++++++++++++++++---
 src/panfrost/util/pan_ir.h                    |  1 +
 src/panfrost/util/pan_lower_store_component.c |  8 ++++-
 src/panfrost/vulkan/csf/panvk_vX_cmd_draw.c   | 14 +++++++--
 src/panfrost/vulkan/panvk_cmd_draw.h          |  1 +
 src/panfrost/vulkan/panvk_vX_cmd_draw.c       |  7 +++--
 src/panfrost/vulkan/panvk_vX_shader.c         | 21 +++++++++++++
 7 files changed, 72 insertions(+), 10 deletions(-)

diff --git a/src/panfrost/compiler/bifrost_compile.c b/src/panfrost/compiler/bifrost_compile.c
index 34c52a67fb8..7252daf63be 100644
--- a/src/panfrost/compiler/bifrost_compile.c
+++ b/src/panfrost/compiler/bifrost_compile.c
@@ -1047,7 +1047,8 @@ bifrost_nir_specialize_idvs(nir_builder *b, nir_instr *instr, void *data)
 
    nir_intrinsic_instr *intr = nir_instr_as_intrinsic(instr);
 
-   if (intr->intrinsic != nir_intrinsic_store_output)
+   if (intr->intrinsic != nir_intrinsic_store_output &&
+       intr->intrinsic != nir_intrinsic_store_per_view_output)
       return false;
 
    if (bi_should_remove_store(intr, *idvs)) {
@@ -1127,11 +1128,12 @@ bi_emit_store_vary(bi_builder *b, nir_intrinsic_instr *instr)
                 bi_imm_u32(format), regfmt, nr - 1);
    } else if (b->shader->arch >= 9 && b->shader->idvs != BI_IDVS_NONE) {
       bi_index index = bi_preload(b, 59);
+      unsigned index_offset = 0;
       unsigned pos_attr_offset = 0;
       unsigned src_bit_sz = nir_src_bit_size(instr->src[0]);
 
       if (psiz || layer)
-         index = bi_iadd_imm_i32(b, index, 4);
+         index_offset += 4;
 
       if (layer) {
          assert(nr == 1 && src_bit_sz == 32);
@@ -1143,11 +1145,29 @@ bi_emit_store_vary(bi_builder *b, nir_intrinsic_instr *instr)
       if (psiz)
          assert(T_size == 16 && "should've been lowered");
 
+      bool varying = (b->shader->idvs == BI_IDVS_VARYING);
+
+      if (instr->intrinsic == nir_intrinsic_store_per_view_output) {
+         unsigned view_index = nir_src_as_uint(instr->src[1]);
+
+         if (varying) {
+            index_offset += view_index * 4;
+         } else {
+            /* We don't patch these offsets in the no_psiz variant, so if
+             * multiview is enabled we can't switch to the basic format by
+             * using no_psiz */
+            bool extended_position_fifo = b->shader->nir->info.outputs_written &
+               (VARYING_BIT_LAYER | VARYING_BIT_PSIZ);
+            unsigned position_fifo_stride = extended_position_fifo ? 8 : 4;
+            index_offset += view_index * position_fifo_stride;
+         }
+      }
+
+      if (index_offset != 0)
+         index = bi_iadd_imm_i32(b, index, index_offset);
       bi_index address = bi_lea_buf_imm(b, index);
       bi_emit_split_i32(b, a, address, 2);
 
-      bool varying = (b->shader->idvs == BI_IDVS_VARYING);
-
       bi_store(b, nr * src_bit_sz, data, a[0], a[1],
                varying ? BI_SEG_VARY : BI_SEG_POS,
                varying ? bi_varying_offset(b->shader, instr) : pos_attr_offset);
@@ -1739,6 +1759,7 @@ bi_emit_intrinsic(bi_builder *b, nir_intrinsic_instr *instr)
       break;
 
    case nir_intrinsic_store_output:
+   case nir_intrinsic_store_per_view_output:
       if (stage == MESA_SHADER_FRAGMENT)
          bi_emit_fragment_out(b, instr);
       else if (stage == MESA_SHADER_VERTEX)
@@ -1978,6 +1999,7 @@ bi_emit_intrinsic(bi_builder *b, nir_intrinsic_instr *instr)
       bi_emit_derivative(b, dst, instr, 2, true);
       break;
 
+   case nir_intrinsic_load_view_index:
    case nir_intrinsic_load_layer_id:
       assert(b->shader->arch >= 9);
       bi_mov_i32_to(b, dst, bi_u8_to_u32(b, bi_byte(bi_preload(b, 62), 0)));
diff --git a/src/panfrost/util/pan_ir.h b/src/panfrost/util/pan_ir.h
index 5b0d9a1cc0c..8168b550039 100644
--- a/src/panfrost/util/pan_ir.h
+++ b/src/panfrost/util/pan_ir.h
@@ -105,6 +105,7 @@ struct panfrost_compile_inputs {
    } blend;
    bool no_idvs;
    bool no_ubo_to_push;
+   uint32_t view_mask;
 
    /* Used on Valhall.
     *
diff --git a/src/panfrost/util/pan_lower_store_component.c b/src/panfrost/util/pan_lower_store_component.c
index 717250138a3..8ca139a3a8a 100644
--- a/src/panfrost/util/pan_lower_store_component.c
+++ b/src/panfrost/util/pan_lower_store_component.c
@@ -36,7 +36,8 @@
 static bool
 lower_store_component(nir_builder *b, nir_intrinsic_instr *intr, void *data)
 {
-   if (intr->intrinsic != nir_intrinsic_store_output)
+   if (intr->intrinsic != nir_intrinsic_store_output &&
+       intr->intrinsic != nir_intrinsic_store_per_view_output)
       return false;
 
    struct hash_table_u64 *slots = data;
@@ -44,6 +45,11 @@ lower_store_component(nir_builder *b, nir_intrinsic_instr *intr, void *data)
    nir_src *slot_src = nir_get_io_offset_src(intr);
    uint64_t slot = nir_src_as_uint(*slot_src) + nir_intrinsic_base(intr);
 
+   if (intr->intrinsic == nir_intrinsic_store_per_view_output) {
+      uint64_t view_index = nir_src_as_uint(intr->src[1]);
+      slot |= view_index << 32;
+   }
+
    nir_intrinsic_instr *prev = _mesa_hash_table_u64_search(slots, slot);
    unsigned mask = (prev ? nir_intrinsic_write_mask(prev) : 0);
 
diff --git a/src/panfrost/vulkan/csf/panvk_vX_cmd_draw.c b/src/panfrost/vulkan/csf/panvk_vX_cmd_draw.c
index 953823568c1..8ad3bebf60f 100644
--- a/src/panfrost/vulkan/csf/panvk_vX_cmd_draw.c
+++ b/src/panfrost/vulkan/csf/panvk_vX_cmd_draw.c
@@ -1571,6 +1571,7 @@ set_tiler_idvs_flags(struct cs_builder *b, struct panvk_cmd_buffer *cmdbuf,
 
          cfg.secondary_shader = vs->info.vs.secondary_enable && fs != NULL;
          cfg.primitive_restart = ia->primitive_restart_enable;
+         cfg.view_mask = cmdbuf->state.gfx.render.view_mask;
       }
 
       cs_move32_to(b, cs_sr_reg32(b, 56), tiler_idvs_flags.opaque[0]);
@@ -1857,8 +1858,12 @@ panvk_cmd_draw_indirect(struct panvk_cmd_buffer *cmdbuf,
     * we decide to support layared+indirect, we'll need to pass the
     * layer_count info through the tiler descriptor, for instance by
     * re-using one of the word that's flagged 'ignored' in the descriptor
-    * (word 14:23). */
-   assert(cmdbuf->state.gfx.render.layer_count <= 1);
+    * (word 14:23).
+    *
+    * Multiview is limited to 8 layers, and so will always fit in one TD.
+    * Therefore layered rendering is allowed with multiview. */
+   assert(cmdbuf->state.gfx.render.layer_count <= 1 ||
+          cmdbuf->state.gfx.render.view_mask);
 
    /* MultiDrawIndirect (.maxDrawIndirectCount) needs additional changes. */
    assert(draw->indirect.draw_count == 1);
@@ -1978,7 +1983,10 @@ panvk_per_arch(cmd_inherit_render_state)(
           sizeof(cmdbuf->state.gfx.render.s_attachment));
    cmdbuf->state.gfx.render.bound_attachments = 0;
 
-   cmdbuf->state.gfx.render.layer_count = 0;
+   cmdbuf->state.gfx.render.view_mask = inheritance_info->viewMask;
+   cmdbuf->state.gfx.render.layer_count = inheritance_info->viewMask ?
+      util_last_bit(inheritance_info->viewMask) :
+      0;
    *fbinfo = (struct pan_fb_info){
       .tile_buf_budget = panfrost_query_optimal_tib_size(phys_dev->model),
       .nr_samples = inheritance_info->rasterizationSamples,
diff --git a/src/panfrost/vulkan/panvk_cmd_draw.h b/src/panfrost/vulkan/panvk_cmd_draw.h
index 73f751f34da..2312f0e79e6 100644
--- a/src/panfrost/vulkan/panvk_cmd_draw.h
+++ b/src/panfrost/vulkan/panvk_cmd_draw.h
@@ -40,6 +40,7 @@ struct panvk_resolve_attachment {
 struct panvk_rendering_state {
    VkRenderingFlags flags;
    uint32_t layer_count;
+   uint32_t view_mask;
 
    enum vk_rp_attachment_flags bound_attachments;
    struct {
diff --git a/src/panfrost/vulkan/panvk_vX_cmd_draw.c b/src/panfrost/vulkan/panvk_vX_cmd_draw.c
index f184f8c468b..769004d9687 100644
--- a/src/panfrost/vulkan/panvk_vX_cmd_draw.c
+++ b/src/panfrost/vulkan/panvk_vX_cmd_draw.c
@@ -227,7 +227,10 @@ panvk_per_arch(cmd_init_render_state)(struct panvk_cmd_buffer *cmdbuf,
    memset(&state->render.s_attachment, 0, sizeof(state->render.s_attachment));
    state->render.bound_attachments = 0;
 
-   state->render.layer_count = pRenderingInfo->layerCount;
+   cmdbuf->state.gfx.render.layer_count = pRenderingInfo->viewMask ?
+      util_last_bit(pRenderingInfo->viewMask) :
+      pRenderingInfo->layerCount;
+   cmdbuf->state.gfx.render.view_mask = pRenderingInfo->viewMask;
    *fbinfo = (struct pan_fb_info){
       .tile_buf_budget = panfrost_query_optimal_tib_size(phys_dev->model),
       .nr_samples = 1,
@@ -390,7 +393,7 @@ panvk_per_arch(cmd_resolve_attachments)(struct panvk_cmd_buffer *cmdbuf)
             .extent.height = fbinfo->extent.maxy - fbinfo->extent.miny + 1,
          },
       .layerCount = cmdbuf->state.gfx.render.layer_count,
-      .viewMask = 0,
+      .viewMask = cmdbuf->state.gfx.render.view_mask,
       .colorAttachmentCount = color_att_count,
       .pColorAttachments = color_atts,
       .pDepthAttachment = &z_att,
diff --git a/src/panfrost/vulkan/panvk_vX_shader.c b/src/panfrost/vulkan/panvk_vX_shader.c
index ced0ea76802..fc2b43e6891 100644
--- a/src/panfrost/vulkan/panvk_vX_shader.c
+++ b/src/panfrost/vulkan/panvk_vX_shader.c
@@ -372,6 +372,9 @@ panvk_hash_graphics_state(struct vk_physical_device *device,
    _mesa_blake3_update(&blake3_ctx, &sample_shading_enable,
                        sizeof(sample_shading_enable));
 
+   _mesa_blake3_update(&blake3_ctx, &state->rp->view_mask,
+                       sizeof(state->rp->view_mask));
+
    _mesa_blake3_final(&blake3_ctx, blake3_out);
 }
 
@@ -458,6 +461,23 @@ panvk_lower_nir(struct panvk_device *dev, nir_shader *nir,
       to_panvk_instance(dev->vk.physical->instance);
    gl_shader_stage stage = nir->info.stage;
 
+#if PAN_ARCH >= 10
+   if (stage == MESA_SHADER_VERTEX && compile_input->view_mask) {
+      nir_lower_multiview_options options = {
+         .view_mask = compile_input->view_mask,
+         .allowed_per_view_outputs = ~0
+      };
+      /* The only case where this should fail is with memory/image writes,
+       * which we don't support in vertex shaders */
+      assert(nir_can_lower_multiview(nir, options));
+      NIR_PASS(_, nir, nir_lower_multiview, options);
+      /* Pull output writes out of the loop and give them constant offsets for
+       * pan_lower_store_components */
+      NIR_PASS(_, nir, nir_lower_io_to_temporaries,
+               nir_shader_get_entrypoint(nir), true, false);
+   }
+#endif
+
    NIR_PASS(_, nir, panvk_per_arch(nir_lower_descriptors), dev, rs,
             set_layout_count, set_layouts, shader);
 
@@ -835,6 +855,7 @@ panvk_compile_shader(struct panvk_device *dev,
    struct panfrost_compile_inputs inputs = {
       .gpu_id = phys_dev->kmod.props.gpu_prod_id,
       .no_ubo_to_push = true,
+      .view_mask = (state && state->rp) ? state->rp->view_mask : 0,
    };
 
    panvk_lower_nir(dev, nir, info->set_layout_count, info->set_layouts,