llvmpipe: disable denorms in compute shaders on x86/sse

Flush denorms to zero in compute shaders for consistency with other shader stages (this is required by d3d; neither GL nor Vulkan really cares). This is a bit awkward since we don't want to disable denorms for things like rusticl, which we should be able to distinguish by shader type. Note that disabling denorms in general is not sufficient to satisfy the d3d requirements: d3d requires denorms to be disabled for single precision opcodes but enabled for double precision ones, and x86 can't switch the two individually (hence this will need per-instruction tracking and switching inside the shader). Reviewed-by: Brian Paul <brian.paul@broadcom.com> Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/40787>
2026-05-07 00:38:48 +02:00 · 2026-04-03 21:25:47 +02:00 · 2026-04-03 21:25:47 +02:00 · 76cdbcb96a
commit 76cdbcb96a
parent c20106f1e4
4 changed files with 55 additions and 18 deletions
--- a/src/gallium/drivers/llvmpipe/lp_cs_tpool.c
+++ b/src/gallium/drivers/llvmpipe/lp_cs_tpool.c
@ -30,7 +30,9 @@

 #include "util/u_thread.h"
 #include "util/u_memory.h"
+#include "util/u_math.h"
 #include "lp_cs_tpool.h"
+#include "compiler/shader_enums.h"

 static int
 lp_cs_tpool_worker(void *data)
@ -41,6 +43,9 @@ lp_cs_tpool_worker(void *data)
   memset(&lmem, 0, sizeof(lmem));
   mtx_lock(&pool->m);

+   unsigned fpstate = util_fpstate_get();
+   bool flush_denorms = false;
+
   while (!pool->shutdown) {
      struct lp_cs_tpool_task *task;
      unsigned iter_per_thread;
@ -70,6 +75,17 @@ lp_cs_tpool_worker(void *data)
         list_del(&task->list);

      mtx_unlock(&pool->m);
+
+      struct lp_cs_job_info *job_info = task->data;
+      if ((job_info->current->variant->stage == MESA_SHADER_KERNEL) == flush_denorms) {
+         if (flush_denorms) {
+            util_fpstate_set(fpstate);
+            flush_denorms = false;
+         } else {
+            util_fpstate_set_denorms_to_zero(fpstate);
+            flush_denorms = true;
+         }
+      }
      for (unsigned i = 0; i < iter_per_thread; i++)
         task->work(task->data, this_iter + i, &lmem);

@ -135,11 +151,25 @@ lp_cs_tpool_queue_task(struct lp_cs_tpool *pool,
   if (pool->num_threads == 0) {
      struct lp_cs_local_mem lmem;

+      struct lp_cs_job_info *job_info = data;
+      
+      unsigned fpstate = 0;
+      bool flush_denorms = false;
+      if (job_info->current->variant->stage != MESA_SHADER_KERNEL) {
+         fpstate = util_fpstate_get();
+         util_fpstate_set_denorms_to_zero(fpstate);
+         flush_denorms = true;
+      }
+
      memset(&lmem, 0, sizeof(lmem));
      for (unsigned t = 0; t < num_iters; t++) {
         work(data, t, &lmem);
      }
      FREE(lmem.local_mem_ptr);
+
+      if (flush_denorms) {
+         util_fpstate_set(fpstate);
+      }
      return NULL;
   }
   task = CALLOC_STRUCT(lp_cs_tpool_task);
--- a/src/gallium/drivers/llvmpipe/lp_cs_tpool.h
+++ b/src/gallium/drivers/llvmpipe/lp_cs_tpool.h
@ -38,6 +38,7 @@

 #include "util/u_thread.h"
 #include "util/list.h"
+#include "lp_state_cs.h"

 #include "lp_limits.h"

--- a/src/gallium/drivers/llvmpipe/lp_state_cs.c
+++ b/src/gallium/drivers/llvmpipe/lp_state_cs.c
@ -61,23 +61,6 @@ static unsigned cs_no = 0;
 static unsigned task_no = 0;
 static unsigned mesh_no = 0;

-struct lp_cs_job_info {
-   unsigned grid_size[3];
-   unsigned iter_size[3];
-   unsigned grid_base[3];
-   unsigned block_size[3];
-   unsigned req_local_mem;
-   unsigned work_dim;
-   unsigned draw_id;
-   bool zero_initialize_shared_memory;
-   bool use_iters;
-   struct lp_cs_exec *current;
-   struct vertex_header *io;
-   size_t io_stride;
-   void *payload;
-   size_t payload_stride;
-};
-
 enum {
   CS_ARG_CONTEXT,
   CS_ARG_RESOURCES,
@ -1322,8 +1305,10 @@ generate_variant(struct llvmpipe_context *lp,

   lp_jit_init_cs_types(variant);

+   struct nir_shader *nir = shader->base.ir.nir;
+   variant->stage = nir->info.stage;
+
   if (sh_type == MESA_SHADER_MESH) {
-      struct nir_shader *nir = shader->base.ir.nir;
      int per_prim_count = util_bitcount64(nir->info.per_primitive_outputs);
      int out_count = util_bitcount64(nir->info.outputs_written);
      int per_vert_count = out_count - per_prim_count;
--- a/src/gallium/drivers/llvmpipe/lp_state_cs.h
+++ b/src/gallium/drivers/llvmpipe/lp_state_cs.h
@ -36,6 +36,24 @@

 struct lp_compute_shader_variant;

+struct lp_cs_job_info { /* Job description for one compute dispatch; lp_cs_tpool.c casts its opaque task data pointer to this type. */
+   unsigned grid_size[3]; /* presumably the dispatch grid dimensions (x, y, z) -- TODO confirm in lp_state_cs.c */
+   unsigned iter_size[3];
+   unsigned grid_base[3];
+   unsigned block_size[3]; /* presumably the workgroup dimensions (x, y, z) -- TODO confirm */
+   unsigned req_local_mem;
+   unsigned work_dim;
+   unsigned draw_id;
+   bool zero_initialize_shared_memory;
+   bool use_iters;
+   struct lp_cs_exec *current; /* the thread pool reads current->variant->stage to decide whether to flush denorms */
+   struct vertex_header *io;
+   size_t io_stride;
+   void *payload;
+   size_t payload_stride;
+};
+
+
 struct lp_compute_shader_variant_key
 {
   unsigned nr_samplers:8;
@ -101,6 +119,9 @@ struct lp_compute_shader_variant

   struct lp_compute_shader *shader;

+   /* shader stage as declared in the shader itself (i.e. may be MESA_SHADER_KERNEL) */
+   mesa_shader_stage stage;
+
   /* For debugging/profiling purposes */
   unsigned no;