swr: Support simd16 vertex shaders

Supporting simd16 vertex shaders requires packing the output of the fetch shader appropriately. In particular, the vertexID buffers needed by the VS have to be formatted into a single simd16 register.

As part of this support, the second JitManager had to be removed, since it was not accounting for vector width correctly.

USE_SIMD16_SHADERS is also split into two defines: the additional one (USE_SIMD16_VS) controls the width of the vertex shader (VS), while the original one (USE_SIMD16_SHADERS) controls the overall front-end width.

Reviewed-by: Bruce Cherniak <bruce.cherniak@intel.com>
2025-12-23 15:30:14 +01:00 · 2018-01-19 15:47:12 -06:00 · 2018-01-19 15:47:12 -06:00 · 8c83d2d371
commit 8c83d2d371
parent 1874d95a8e
3 changed files with 30 additions and 21 deletions
--- a/src/gallium/drivers/swr/swr_screen.cpp
+++ b/src/gallium/drivers/swr/swr_screen.cpp
@@ -1064,9 +1064,6 @@ swr_destroy_screen(struct pipe_screen *p_screen)
   swr_fence_reference(p_screen, &screen->flush_fence, NULL);
   JitDestroyContext(screen->hJitMgr);
-#if USE_SIMD16_SHADERS
-   JitDestroyContext(screen->hJitMgr16);
-#endif
   if (winsys->destroy)
      winsys->destroy(winsys);
@@ -1150,9 +1147,6 @@ swr_create_screen_internal(struct sw_winsys *winsys)
   // Pass in "" for architecture for run-time determination
   screen->hJitMgr = JitCreateContext(KNOB_SIMD_WIDTH, "", "swr");
-#if USE_SIMD16_SHADERS
-   screen->hJitMgr16 = JitCreateContext(16, "", "swr");
-#endif
   swr_fence_init(&screen->base);
--- a/src/gallium/drivers/swr/swr_screen.h
+++ b/src/gallium/drivers/swr/swr_screen.h
@@ -49,9 +49,6 @@ struct swr_screen {
   uint32_t client_copy_limit;
   HANDLE hJitMgr;
-#if USE_SIMD16_SHADERS
-   HANDLE hJitMgr16;
-#endif
   PFNSwrGetInterface pfnSwrGetInterface;
--- a/src/gallium/drivers/swr/swr_shader.cpp
+++ b/src/gallium/drivers/swr/swr_shader.cpp
@@ -724,7 +724,7 @@ swr_compile_gs(struct swr_context *ctx, swr_jit_gs_key &key)
 void
 BuilderSWR::WriteVS(Value *pVal, Value *pVsContext, Value *pVtxOutput, unsigned slot, unsigned channel)
 {
-#if USE_SIMD16_FRONTEND && !USE_SIMD16_SHADERS
+#if USE_SIMD16_FRONTEND && !USE_SIMD16_VS
   // interleave the simdvertex components into the dest simd16vertex
   //   slot16offset = slot8offset * 2
   //   comp16offset = comp8offset * 2 + alternateOffset
@@ -787,7 +787,7 @@ BuilderSWR::CompileVS(struct swr_context *ctx, swr_jit_vs_key &key)
   const_sizes_ptr->setName("num_vs_constants");
   Value *vtxInput = LOAD(pVsCtx, {0, SWR_VS_CONTEXT_pVin});
-#if USE_SIMD16_SHADERS
+#if USE_SIMD16_VS
   vtxInput = BITCAST(vtxInput, PointerType::get(Gen_simd16vertex(JM()), 0));
 #endif
@@ -807,11 +807,22 @@ BuilderSWR::CompileVS(struct swr_context *ctx, swr_jit_vs_key &key)
   struct lp_bld_tgsi_system_values system_values;
   memset(&system_values, 0, sizeof(system_values));
   system_values.instance_id = wrap(LOAD(pVsCtx, {0, SWR_VS_CONTEXT_InstanceID}));
 #if USE_SIMD16_VS
   system_values.vertex_id = wrap(LOAD(pVsCtx, {0, SWR_VS_CONTEXT_VertexID16}));
 #else
   system_values.vertex_id = wrap(LOAD(pVsCtx, {0, SWR_VS_CONTEXT_VertexID}));
 #endif
 #if USE_SIMD16_VS
   uint32_t vectorWidth = mVWidth16;
 #else
   uint32_t vectorWidth = mVWidth;
 #endif
   lp_build_tgsi_soa(gallivm,
                     swr_vs->pipe.tokens,
-                     lp_type_float_vec(32, 32 * mVWidth),
+                     lp_type_float_vec(32, 32 * vectorWidth),
                     NULL, // mask
                     wrap(consts_ptr),
                     wrap(const_sizes_ptr),
@@ -829,7 +840,7 @@ BuilderSWR::CompileVS(struct swr_context *ctx, swr_jit_vs_key &key)
   IRB()->SetInsertPoint(unwrap(LLVMGetInsertBlock(gallivm->builder)));
   Value *vtxOutput = LOAD(pVsCtx, {0, SWR_VS_CONTEXT_pVout});
-#if USE_SIMD16_SHADERS
+#if USE_SIMD16_VS
   vtxOutput = BITCAST(vtxOutput, PointerType::get(Gen_simd16vertex(JM()), 0));
 #endif
@@ -905,10 +916,21 @@ BuilderSWR::CompileVS(struct swr_context *ctx, swr_jit_vs_key &key)
         Value *py = LOAD(GEP(hPrivateData, {0, swr_draw_context_userClipPlanes, val, 1}));
         Value *pz = LOAD(GEP(hPrivateData, {0, swr_draw_context_userClipPlanes, val, 2}));
         Value *pw = LOAD(GEP(hPrivateData, {0, swr_draw_context_userClipPlanes, val, 3}));
-         Value *dist = FADD(FMUL(unwrap(cx), VBROADCAST(px)),
+#if USE_SIMD16_VS
-                            FADD(FMUL(unwrap(cy), VBROADCAST(py)),
+         Value *bpx = VBROADCAST_16(px);
-                                 FADD(FMUL(unwrap(cz), VBROADCAST(pz)),
+         Value *bpy = VBROADCAST_16(py);
-                                      FMUL(unwrap(cw), VBROADCAST(pw)))));
+         Value *bpz = VBROADCAST_16(pz);
+         Value *bpw = VBROADCAST_16(pw);
+#else
+         Value *bpx = VBROADCAST(px);
+         Value *bpy = VBROADCAST(py);
+         Value *bpz = VBROADCAST(pz);
+         Value *bpw = VBROADCAST(pw);
+#endif
+         Value *dist = FADD(FMUL(unwrap(cx), bpx),
+                            FADD(FMUL(unwrap(cy), bpy),
+                                 FADD(FMUL(unwrap(cz), bpz),
+                                      FMUL(unwrap(cw), bpw))));
         if (val < 4)
            WriteVS(dist, pVsCtx, vtxOutput, VERTEX_CLIPCULL_DIST_LO_SLOT, val);
@@ -942,11 +964,7 @@ swr_compile_vs(struct swr_context *ctx, swr_jit_vs_key &key)
      return NULL;
   BuilderSWR builder(
-#if USE_SIMD16_SHADERS
-      reinterpret_cast<JitManager *>(swr_screen(ctx->pipe.screen)->hJitMgr16),
-#else
      reinterpret_cast<JitManager *>(swr_screen(ctx->pipe.screen)->hJitMgr),
-#endif
      "VS");
   PFN_VERTEX_FUNC func = builder.CompileVS(ctx, key);