From abb7f69520ee12845f77d19cf2ba0df03a97d97c Mon Sep 17 00:00:00 2001
From: Faith Ekstrand <faith.ekstrand@collabora.com>
Date: Mon, 30 Jan 2023 20:11:52 -0600
Subject: [PATCH] nvk/shader: Populate headers for vertex and fragment shaders

Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/24326>
---
 src/nouveau/vulkan/nvk_shader.c | 415 +++++++++++++++++++++++++++++++-
 src/nouveau/vulkan/nvk_shader.h |  30 ++-
 2 files changed, 440 insertions(+), 5 deletions(-)

diff --git a/src/nouveau/vulkan/nvk_shader.c b/src/nouveau/vulkan/nvk_shader.c
index 21dbc05c2f1..1af19d82b54 100644
--- a/src/nouveau/vulkan/nvk_shader.c
+++ b/src/nouveau/vulkan/nvk_shader.c
@@ -6,6 +6,7 @@
 #include "nvk_nir.h"
 
 #include "nouveau_bo.h"
+#include "nouveau_context.h"
 #include "vk_shader_module.h"
 
 #include "nir.h"
@@ -157,6 +158,379 @@ nvk_lower_nir(struct nvk_device *device, nir_shader *nir,
    nir_shader_gather_info(nir, nir_shader_get_entrypoint(nir));
 }
 
+#ifndef NDEBUG
+/* Debug helper: dump the header words (non-compute only; the offset is a
+ * size_t, hence %zx) and the code as 32-bit words, eight per line. */
+static void
+nvk_shader_dump(struct nvk_shader *shader)
+{
+   if (shader->stage != MESA_SHADER_COMPUTE) {
+      _debug_printf("dumping HDR for %s shader\n",
+                    _mesa_shader_stage_to_string(shader->stage));
+      for (unsigned pos = 0; pos < ARRAY_SIZE(shader->hdr); ++pos)
+         _debug_printf("HDR[%02zx] = 0x%08x\n",
+                      pos * sizeof(shader->hdr[0]), shader->hdr[pos]);
+   }
+   _debug_printf("shader binary code (0x%x bytes):", shader->code_size);
+   for (unsigned pos = 0; pos < shader->code_size / 4; ++pos) {
+      if ((pos % 8) == 0)
+         _debug_printf("\n");
+      _debug_printf("%08x ", ((uint32_t *)shader->code_ptr)[pos]);
+   }
+   _debug_printf("\n");
+}
+#endif
+
+#include "tgsi/tgsi_ureg.h"
+
+/* Map a TGSI input semantic (sn) and its index (si) to an address; callers
+ * divide by 4 to get a slot. Unknown semantics assert and return ~0. NOTE:
+ * a[0x270] in FP may error even with fewer than 124 scalar varying values. */
+static uint32_t
+nvc0_shader_input_address(unsigned sn, unsigned si)
+{
+   switch (sn) {
+   case TGSI_SEMANTIC_TESSOUTER:    return 0x000 + si * 0x4;
+   case TGSI_SEMANTIC_TESSINNER:    return 0x010 + si * 0x4;
+   case TGSI_SEMANTIC_PATCH:        return 0x020 + si * 0x10;
+   case TGSI_SEMANTIC_PRIMID:       return 0x060;
+   case TGSI_SEMANTIC_LAYER:        return 0x064;
+   case TGSI_SEMANTIC_VIEWPORT_INDEX:return 0x068;
+   case TGSI_SEMANTIC_PSIZE:        return 0x06c;
+   case TGSI_SEMANTIC_POSITION:     return 0x070;
+   case TGSI_SEMANTIC_GENERIC:      return 0x080 + si * 0x10;
+   case TGSI_SEMANTIC_FOG:          return 0x2e8;
+   case TGSI_SEMANTIC_COLOR:        return 0x280 + si * 0x10;
+   case TGSI_SEMANTIC_BCOLOR:       return 0x2a0 + si * 0x10;
+   case TGSI_SEMANTIC_CLIPDIST:     return 0x2c0 + si * 0x10;
+   case TGSI_SEMANTIC_CLIPVERTEX:   return 0x270;
+   case TGSI_SEMANTIC_PCOORD:       return 0x2e0;
+   case TGSI_SEMANTIC_TESSCOORD:    return 0x2f0;
+   case TGSI_SEMANTIC_INSTANCEID:   return 0x2f8;
+   case TGSI_SEMANTIC_VERTEXID:     return 0x2fc;
+   case TGSI_SEMANTIC_TEXCOORD:     return 0x300 + si * 0x10;
+   default:
+      assert(!"invalid TGSI input semantic");
+      return ~0;
+   }
+}
+/* Output-side address map; EDGEFLAG yields ~0 without tripping the assert. */
+static uint32_t
+nvc0_shader_output_address(unsigned sn, unsigned si)
+{
+   switch (sn) {
+   case TGSI_SEMANTIC_TESSOUTER:     return 0x000 + si * 0x4;
+   case TGSI_SEMANTIC_TESSINNER:     return 0x010 + si * 0x4;
+   case TGSI_SEMANTIC_PATCH:         return 0x020 + si * 0x10;
+   case TGSI_SEMANTIC_PRIMID:        return 0x060;
+   case TGSI_SEMANTIC_LAYER:         return 0x064;
+   case TGSI_SEMANTIC_VIEWPORT_INDEX:return 0x068;
+   case TGSI_SEMANTIC_PSIZE:         return 0x06c;
+   case TGSI_SEMANTIC_POSITION:      return 0x070;
+   case TGSI_SEMANTIC_GENERIC:       return 0x080 + si * 0x10;
+   case TGSI_SEMANTIC_FOG:           return 0x2e8;
+   case TGSI_SEMANTIC_COLOR:         return 0x280 + si * 0x10;
+   case TGSI_SEMANTIC_BCOLOR:        return 0x2a0 + si * 0x10;
+   case TGSI_SEMANTIC_CLIPDIST:      return 0x2c0 + si * 0x10;
+   case TGSI_SEMANTIC_CLIPVERTEX:    return 0x270;
+   case TGSI_SEMANTIC_TEXCOORD:      return 0x300 + si * 0x10;
+   case TGSI_SEMANTIC_VIEWPORT_MASK: return 0x3a0;
+   case TGSI_SEMANTIC_EDGEFLAG:      return ~0;
+   default:
+      assert(!"invalid TGSI output semantic");
+      return ~0;
+   }
+}
+
+/* Vertex inputs: instance/vertex ID get their fixed slot and mask 0x1; all
+ * other inputs are packed in order starting at 0x80. Always returns 0. */
+static int
+nvc0_vp_assign_input_slots(struct nv50_ir_prog_info_out *info)
+{
+   unsigned next_generic = 0;
+
+   for (unsigned idx = 0; idx < info->numInputs; ++idx) {
+      const unsigned sn = info->in[idx].sn;
+
+      /* for SM4 only, in TGSI they're SVs */
+      if (sn == TGSI_SEMANTIC_INSTANCEID || sn == TGSI_SEMANTIC_VERTEXID) {
+         info->in[idx].mask = 0x1;
+         info->in[idx].slot[0] = nvc0_shader_input_address(sn, 0) / 4;
+         continue;
+      }
+      const unsigned base = 0x80 + next_generic++ * 0x10;
+      for (unsigned comp = 0; comp < 4; ++comp)
+         info->in[idx].slot[comp] = (base + comp * 0x4) / 4;
+   }
+
+   return 0;
+}
+
+/* Non-vertex inputs: each component's slot is derived from the semantic's
+ * address in nvc0_shader_input_address(). Always returns 0. */
+static int
+nvc0_sp_assign_input_slots(struct nv50_ir_prog_info_out *info)
+{
+   for (unsigned idx = 0; idx < info->numInputs; ++idx) {
+      const unsigned addr =
+         nvc0_shader_input_address(info->in[idx].sn, info->in[idx].si);
+
+      for (unsigned comp = 0; comp < 4; ++comp)
+         info->in[idx].slot[comp] = (addr + comp * 0x4) / 4;
+   }
+
+   return 0;
+}
+
+/* Fragment outputs: colours get 4-register groups in MRT order, then come
+ * the sample mask (if written) and depth registers. Always returns 0. */
+static int
+nvc0_fp_assign_output_slots(struct nv50_ir_prog_info_out *info)
+{
+   unsigned next_reg = info->prop.fp.numColourResults * 4;
+
+   /* Compute the relative position of each color output, since skipped MRT
+    * positions will not have registers allocated to them.
+    */
+   unsigned rel_pos[8] = {0};
+   for (unsigned o = 0; o < info->numOutputs; ++o)
+      if (info->out[o].sn == TGSI_SEMANTIC_COLOR)
+         rel_pos[info->out[o].si] = 1;
+   for (unsigned rt = 0, used = 0; rt < 8; rt++)
+      if (rel_pos[rt])
+         rel_pos[rt] = used++;
+   for (unsigned o = 0; o < info->numOutputs; ++o)
+      if (info->out[o].sn == TGSI_SEMANTIC_COLOR)
+         for (unsigned comp = 0; comp < 4; ++comp)
+            info->out[o].slot[comp] = rel_pos[info->out[o].si] * 4 + comp;
+
+   if (info->io.sampleMask < NV50_CODEGEN_MAX_VARYINGS)
+      info->out[info->io.sampleMask].slot[0] = next_reg++;
+   else if (info->target >= 0xe0)
+      next_reg++; /* on Kepler, depth is always last colour reg + 2 */
+
+   if (info->io.fragDepth < NV50_CODEGEN_MAX_VARYINGS)
+      info->out[info->io.fragDepth].slot[2] = next_reg;
+
+   return 0;
+}
+
+/* Non-fragment outputs: each component's slot is derived from the semantic's
+ * address in nvc0_shader_output_address(). Always returns 0. */
+static int
+nvc0_sp_assign_output_slots(struct nv50_ir_prog_info_out *info)
+{
+   for (unsigned idx = 0; idx < info->numOutputs; ++idx) {
+      const unsigned addr =
+         nvc0_shader_output_address(info->out[idx].sn, info->out[idx].si);
+
+      for (unsigned comp = 0; comp < 4; ++comp)
+         info->out[idx].slot[comp] = (addr + comp * 0x4) / 4;
+   }
+
+   return 0;
+}
+
+/* Slot-assignment entry point: vertex shaders use the packed input layout
+ * and fragment shaders the register-based output layout; everything else
+ * uses the address tables. Returns the first non-zero helper result. */
+static int
+nvc0_program_assign_varying_slots(struct nv50_ir_prog_info_out *info)
+{
+   const bool is_vs = info->type == PIPE_SHADER_VERTEX;
+   const bool is_fs = info->type == PIPE_SHADER_FRAGMENT;
+
+   const int in_ret = is_vs ? nvc0_vp_assign_input_slots(info)
+                            : nvc0_sp_assign_input_slots(info);
+   if (in_ret)
+      return in_ret;
+
+   /* Inputs succeeded, so the output helper's result is the final one. */
+   return is_fs ? nvc0_fp_assign_output_slots(info)
+                : nvc0_sp_assign_output_slots(info);
+}
+
+/* Grow the output-read slot range kept in hdr[4] (min in bits 12..19, max in
+ * bits 24..31) to include slot. Shifts are done on uint32_t: a uint8_t
+ * promotes to int, and max << 24 overflows int once max >= 0x80. */
+static inline void
+nvk_vtgs_hdr_update_oread(struct nvk_shader *vs, uint8_t slot)
+{
+   const uint32_t min = MIN2((vs->hdr[4] >> 12) & 0xff, (uint32_t)slot);
+   const uint32_t max = MAX2(vs->hdr[4] >> 24, (uint32_t)slot);
+
+   vs->hdr[4] = (max << 24) | (min << 12);
+}
+
+/* Set input (hdr[5..]), output (hdr[13..]) and sysval bits plus clip/cull
+ * state from info. Masks use 1u as 1 << 31 overflows int. Returns 0. */
+static int
+nvk_vtgp_gen_header(struct nvk_shader *vs, struct nv50_ir_prog_info_out *info)
+{
+   for (unsigned i = 0; i < info->numInputs; ++i) {
+      if (info->in[i].patch)
+         continue;
+      for (unsigned c = 0; c < 4; ++c) {
+         const unsigned a = info->in[i].slot[c];
+         if (info->in[i].mask & (1 << c))
+            vs->hdr[5 + a / 32] |= 1u << (a % 32);
+      }
+   }
+
+   for (unsigned i = 0; i < info->numOutputs; ++i) {
+      if (info->out[i].patch)
+         continue;
+      for (unsigned c = 0; c < 4; ++c) {
+         if (!(info->out[i].mask & (1 << c)))
+            continue;
+         assert(info->out[i].slot[c] >= 0x40 / 4);
+         const unsigned a = info->out[i].slot[c] - 0x40 / 4;
+         vs->hdr[13 + a / 32] |= 1u << (a % 32);
+         if (info->out[i].oread)
+            nvk_vtgs_hdr_update_oread(vs, info->out[i].slot[c]);
+      }
+   }
+
+   for (unsigned i = 0; i < info->numSysVals; ++i) {
+      switch (info->sv[i].sn) {
+      case TGSI_SEMANTIC_PRIMID:
+         vs->hdr[5] |= 1u << 24;
+         break;
+      case TGSI_SEMANTIC_INSTANCEID:
+         vs->hdr[10] |= 1u << 30;
+         break;
+      case TGSI_SEMANTIC_VERTEXID:
+         vs->hdr[10] |= 1u << 31;
+         break;
+      case TGSI_SEMANTIC_TESSCOORD:
+         /* We don't have the mask, nor the slots populated. While this could
+          * be achieved, the vast majority of the time if either of the coords
+          * are read, then both will be read.
+          */
+         nvk_vtgs_hdr_update_oread(vs, 0x2f0 / 4);
+         nvk_vtgs_hdr_update_oread(vs, 0x2f4 / 4);
+         break;
+      default:
+         break;
+      }
+   }
+
+   vs->vs.clip_enable = (1 << info->io.clipDistances) - 1;
+   vs->vs.cull_enable =
+      ((1 << info->io.cullDistances) - 1) << info->io.clipDistances;
+   for (unsigned i = 0; i < info->io.cullDistances; ++i)
+      vs->vs.clip_mode |= 1u << ((info->io.clipDistances + i) * 4);
+
+   if (info->io.genUserClip < 0)
+      vs->vs.num_ucps = 8 + 1; /* prevent rebuilding */
+
+   vs->vs.layer_viewport_relative = info->io.layer_viewport_relative;
+
+   return 0;
+}
+
+static int
+nvk_vs_gen_header(struct nvk_shader *vs, struct nv50_ir_prog_info_out *info)
+{
+   vs->hdr[0] = 0x20061 | (1 << 10);
+   vs->hdr[4] = 0xff000; /* empty oread range: min = 0xff, max = 0 */
+   /* Input/output/sysval bits are added by nvk_vtgp_gen_header(). */
+   return nvk_vtgp_gen_header(vs, info);
+}
+
+#define NVC0_INTERP_FLAT          (1 << 0)
+#define NVC0_INTERP_PERSPECTIVE   (2 << 0)
+#define NVC0_INTERP_LINEAR        (3 << 0)
+#define NVC0_INTERP_CENTROID      (1 << 2)
+
+/* Pick the header interpolation code for a varying: linear takes priority
+ * over flat; anything else interpolates with perspective. */
+static uint8_t
+nvk_hdr_interp_mode(const struct nv50_ir_varying *var)
+{
+   return var->linear ? NVC0_INTERP_LINEAR :
+          var->flat   ? NVC0_INTERP_FLAT :
+                        NVC0_INTERP_PERSPECTIVE;
+}
+
+
+/* Build the fragment header: interpolation modes for read inputs, colour
+ * output masks and fs state. Bit masks are unsigned to reach bit 31 safely. */
+static int
+nvk_fs_gen_header(struct nvk_shader *fs, struct nv50_ir_prog_info_out *info)
+{
+   /* just 00062 on Kepler */
+   fs->hdr[0] = 0x20062 | (5 << 10);
+   fs->hdr[5] = 0x80000000; /* getting a trap if FRAG_COORD_UMASK.w = 0 */
+
+   if (info->prop.fp.usesDiscard)
+      fs->hdr[0] |= 0x8000;
+   if (!info->prop.fp.separateFragData)
+      fs->hdr[0] |= 0x4000;
+   if (info->io.sampleMask < 80 /* PIPE_MAX_SHADER_OUTPUTS */)
+      fs->hdr[19] |= 0x1;
+   if (info->prop.fp.writesDepth) {
+      fs->hdr[19] |= 0x2;
+      fs->flags[0] = 0x11; /* deactivate ZCULL */
+   }
+
+   for (unsigned i = 0; i < info->numInputs; ++i) {
+      const unsigned m = nvk_hdr_interp_mode(&info->in[i]);
+      if (info->in[i].sn == TGSI_SEMANTIC_COLOR) {
+         fs->fs.colors |= 1 << info->in[i].si;
+         if (info->in[i].sc)
+            fs->fs.color_interp[info->in[i].si] = m | (info->in[i].mask << 4);
+      }
+      for (unsigned c = 0; c < 4; ++c) {
+         if (!(info->in[i].mask & (1 << c)))
+            continue;
+         unsigned a = info->in[i].slot[c];
+         if (info->in[i].slot[0] >= (0x060 / 4) &&
+             info->in[i].slot[0] <= (0x07c / 4)) {
+            fs->hdr[5] |= 1u << (24 + (a - 0x060 / 4));
+         } else
+         if (info->in[i].slot[0] >= (0x2c0 / 4) &&
+             info->in[i].slot[0] <= (0x2fc / 4)) {
+            fs->hdr[14] |= (1u << (a - 0x280 / 4)) & 0x07ff0000;
+         } else {
+            if (info->in[i].slot[c] < (0x040 / 4) ||
+                info->in[i].slot[c] > (0x380 / 4))
+               continue;
+            a *= 2;
+            if (info->in[i].slot[0] >= (0x300 / 4))
+               a -= 32;
+            fs->hdr[4 + a / 32] |= m << (a % 32);
+         }
+      }
+   }
+   /* GM20x+ needs TGSI_SEMANTIC_POSITION to access sample locations */
+   if (info->prop.fp.readsSampleLocations && info->target >= NVISA_GM200_CHIPSET)
+      fs->hdr[5] |= 0x30000000;
+
+   for (unsigned i = 0; i < info->numOutputs; ++i) {
+      if (info->out[i].sn == TGSI_SEMANTIC_COLOR)
+         fs->hdr[18] |= 0xfu << (4 * info->out[i].si);
+   }
+
+   /* There are no "regular" attachments, but the shader still needs to be
+    * executed. It seems like it wants to think that it has some color
+    * outputs in order to actually run.
+    */
+   if (info->prop.fp.numColourResults == 0 && !info->prop.fp.writesDepth)
+      fs->hdr[18] |= 0xf;
+
+   fs->fs.early_z = info->prop.fp.earlyFragTests;
+   fs->fs.sample_mask_in = info->prop.fp.usesSampleMaskIn;
+   fs->fs.reads_framebuffer = info->prop.fp.readsFramebuffer;
+   fs->fs.post_depth_coverage = info->prop.fp.postDepthCoverage;
+
+   /* Mark position xy and layer as read */
+   if (fs->fs.reads_framebuffer)
+      fs->hdr[5] |= 0x32000000;
+
+   return 0;
+}
+
 VkResult
 nvk_compile_nir(struct nvk_physical_device *device, nir_shader *nir,
                 struct nvk_shader *shader)
@@ -184,11 +558,14 @@ nvk_compile_nir(struct nvk_physical_device *device, nir_shader *nir,
    if (nir->info.stage == MESA_SHADER_COMPUTE) {
       info->io.auxCBSlot = 1;
       info->prop.cp.gridInfoBase = 0;
+   } else {
+      info->assignSlots = nvc0_program_assign_varying_slots;
    }
    ret = nv50_ir_generate_code(info, &info_out);
    if (ret)
       return VK_ERROR_UNKNOWN;
 
+   shader->stage = nir->info.stage;
    shader->code_ptr = (uint8_t *)info_out.bin.code;
    shader->code_size = info_out.bin.codeSize;
 
@@ -199,6 +576,21 @@ nvk_compile_nir(struct nvk_physical_device *device, nir_shader *nir,
    shader->cp.smem_size = info_out.bin.smemSize;
    shader->num_barriers = info_out.numBarriers;
 
+   switch (info->type) {
+   case PIPE_SHADER_VERTEX:
+      ret = nvk_vs_gen_header(shader, &info_out);
+      break;
+   case PIPE_SHADER_FRAGMENT:
+      ret = nvk_fs_gen_header(shader, &info_out);
+      break;
+   case PIPE_SHADER_COMPUTE:
+      break;
+   default:
+      unreachable("Invalid shader stage");
+      break;
+   }
+   assert(ret == 0);
+
    if (info_out.bin.tlsSpace) {
       assert(info_out.bin.tlsSpace < (1 << 24));
       shader->hdr[0] |= 1 << 26;
@@ -219,15 +611,30 @@ nvk_compile_nir(struct nvk_physical_device *device, nir_shader *nir,
 void
 nvk_shader_upload(struct nvk_device *dev, struct nvk_shader *shader)
 {
-   void *ptr;
+   uint32_t hdr_size = 0; /* stays 0 for compute: no header is uploaded */
+   if (shader->stage != MESA_SHADER_COMPUTE) {
+      if (dev->ctx->eng3d.cls >= 0xc597)
+         hdr_size = TU102_SHADER_HEADER_SIZE;
+      else
+         hdr_size = GF100_SHADER_HEADER_SIZE;
+   }
+
    /* TODO: The I-cache pre-fetches and we don't really know by how much.  So
     * throw on a bunch just to be sure.
     */
+   uint32_t total_size = shader->code_size + hdr_size;
    shader->bo = nouveau_ws_bo_new(nvk_device_physical(dev)->dev,
-                                  shader->code_size + 4096, 256,
+                                  total_size + 4096, 256,
                                   NOUVEAU_WS_BO_LOCAL | NOUVEAU_WS_BO_MAP);
 
-   ptr = nouveau_ws_bo_map(shader->bo, NOUVEAU_WS_BO_WR);
+   void *ptr = nouveau_ws_bo_map(shader->bo, NOUVEAU_WS_BO_WR);
 
-   memcpy(ptr, shader->code_ptr, shader->code_size);
+   assert(hdr_size <= sizeof(shader->hdr));
+   memcpy(ptr, shader->hdr, hdr_size); /* header first, code right after */
+   memcpy((uint8_t *)ptr + hdr_size, shader->code_ptr, shader->code_size);
+
+#ifndef NDEBUG
+   if (debug_get_bool_option("NV50_PROG_DEBUG", false))
+      nvk_shader_dump(shader);
+#endif
 }
diff --git a/src/nouveau/vulkan/nvk_shader.h b/src/nouveau/vulkan/nvk_shader.h
index c1ee0d94e7f..023e592a69d 100644
--- a/src/nouveau/vulkan/nvk_shader.h
+++ b/src/nouveau/vulkan/nvk_shader.h
@@ -17,13 +17,41 @@ struct nvk_physical_device;
 #define NVC0_MAX_SHADER_HEADER_SIZE TU102_SHADER_HEADER_SIZE
 
 struct nvk_shader {
+   gl_shader_stage stage;
+
    uint8_t *code_ptr;
    uint32_t code_size;
 
-   uint32_t hdr[NVC0_MAX_SHADER_HEADER_SIZE/4];
    bool need_tls;
    uint8_t num_gprs;
    uint8_t num_barriers;
+
+   uint32_t hdr[NVC0_MAX_SHADER_HEADER_SIZE/4];
+   uint32_t flags[2];
+
+   struct {
+      uint32_t clip_mode; /* clip/cull selection */
+      uint8_t clip_enable; /* mask of defined clip planes */
+      uint8_t cull_enable; /* mask of defined cull distances */
+      uint8_t num_ucps; /* also set to max if ClipDistance is used */
+      uint8_t edgeflag; /* attribute index of edgeflag input */
+      bool need_vertex_id;
+      bool need_draw_parameters;
+      bool layer_viewport_relative; /* also applies to gp and tp */
+   } vs;
+
+   struct {
+      uint8_t early_z;
+      uint8_t colors;
+      uint8_t color_interp[2];
+      bool sample_mask_in;
+      bool force_persample_interp;
+      bool flatshade;
+      bool reads_framebuffer;
+      bool post_depth_coverage;
+      bool msaa;
+   } fs;
+
    struct {
       uint32_t lmem_size; /* local memory (TGSI PRIVATE resource) size */
       uint32_t smem_size; /* shared memory (TGSI LOCAL resource) size */