llvmpipe/linear: refactor linear samplers into templated code.

Before adding new copies of all of these samplers for swapping, start by refactoring them into macro-templated code. I avoided using inline functions because I want to test with optimizations turned down, and inline functions would kill performance in that configuration. Reviewed-by: Brian Paul <brianp@vmware.com> Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/24066>
2026-02-18 21:20:29 +01:00 · 2023-07-11 13:56:23 +10:00 · 2023-07-11 13:56:23 +10:00 · e43804ba65
commit e43804ba65
parent 0230179f8b
3 changed files with 229 additions and 291 deletions
--- a/src/gallium/drivers/llvmpipe/lp_linear_sampler.c
+++ b/src/gallium/drivers/llvmpipe/lp_linear_sampler.c
@ -79,12 +79,27 @@ fixed16_approx(int x, int y, int tol)
   return y - tol <= x && x <= y + tol;
 }

+/* set alpha channel of rgba value to 0xff. */
+static inline uint32_t
+rgbx(uint32_t src_val)
+{
+   return src_val | 0xff000000;
+}
+
+/* set alpha channel of 128-bit 4xrgba values to 0xff. */
+static inline __m128i
+rgbx_128(const __m128i src_val)
+{
+   const __m128i mask = _mm_set1_epi32(0xff000000);
+   __m128i bgrx = _mm_or_si128(src_val, mask);
+   return bgrx;
+}

 /*
 * Unstretched blit of a bgra texture.
 */
 static const uint32_t *
-fetch_bgra_memcpy(struct lp_linear_elem *elem)
+fetch_memcpy_bgra(struct lp_linear_elem *elem)
 {
   struct lp_linear_sampler *samp = (struct lp_linear_sampler *)elem;
   const struct lp_jit_texture *texture = samp->texture;
@ -109,223 +124,6 @@ fetch_bgra_memcpy(struct lp_linear_elem *elem)
   return row;
 }

-
-/*
- * Unstretched blit of a bgrx texture.
- */
-static const uint32_t *
-fetch_bgrx_memcpy(struct lp_linear_elem *elem)
-{
-   struct lp_linear_sampler *samp = (struct lp_linear_sampler *)elem;
-   const struct lp_jit_texture *texture = samp->texture;
-   const uint32_t *src_row =
-      (const uint32_t *)((const uint8_t *)texture->base +
-                         (samp->t >> FIXED16_SHIFT) * texture->row_stride[0]);
-   const int s     = samp->s;
-   const int width = samp->width;
-   uint32_t *row   = samp->row;
-
-   src_row = &src_row[s >> FIXED16_SHIFT];
-
-   for (int i = 0; i < width; i++) {
-      row[i] = src_row[i] | 0xff000000;
-   }
-
-   samp->t += samp->dtdy;
-   return row;
-}
-
-
-/*
- * Perform nearest filtered lookup of a row of texels.  Texture lookup
- * is assumed to be axis aligned but with arbitrary scaling.
- *
- * Texture coordinate interpolation is performed in 16.16 fixed point,
- * not to be confused with the 1.15 format used by the interpolants.
- *
- * After 64 pixels (ie. in the next tile), the starting point will be
- * recalculated with floating point arithmetic.
- */
-static const uint32_t *
-fetch_bgra_axis_aligned(struct lp_linear_elem *elem)
-{
-   struct lp_linear_sampler *samp = (struct lp_linear_sampler *)elem;
-   const struct lp_jit_texture *texture = samp->texture;
-   const uint32_t *src_row =
-      (const uint32_t *)((const uint8_t *)texture->base +
-                         (samp->t >> FIXED16_SHIFT) * texture->row_stride[0]);
-   const int dsdx  = samp->dsdx;
-   const int width = samp->width;
-   uint32_t *row   = samp->row;
-   int s = samp->s;
-
-   for (int i = 0; i < width; i++) {
-      row[i] = src_row[s>>FIXED16_SHIFT];
-      s += dsdx;
-   }
-
-   samp->t += samp->dtdy;
-   return row;
-}
-
-
-static const uint32_t *
-fetch_bgrx_axis_aligned(struct lp_linear_elem *elem)
-{
-   struct lp_linear_sampler *samp = (struct lp_linear_sampler *)elem;
-   const struct lp_jit_texture *texture = samp->texture;
-   const uint32_t *src_row =
-      (const uint32_t *)((const uint8_t *)texture->base +
-                         (samp->t >> FIXED16_SHIFT) * texture->row_stride[0]);
-   const int dsdx  = samp->dsdx;
-   const int width = samp->width;
-   uint32_t *row   = samp->row;
-   int s = samp->s;
-
-   for (int i = 0; i < width; i++) {
-      row[i] = src_row[s>>FIXED16_SHIFT] | 0xff000000;
-      s += dsdx;
-   }
-
-   samp->t += samp->dtdy;
-   return row;
-}
-
-
-/* Non-axis aligned, but no clamping or wrapping required
- */
-static const uint32_t *
-fetch_bgra(struct lp_linear_elem *elem)
-{
-   struct lp_linear_sampler *samp = (struct lp_linear_sampler *)elem;
-   const struct lp_jit_texture *texture = samp->texture;
-   const uint8_t *src = texture->base;
-   const int stride = texture->row_stride[0];
-   const int dsdx  = samp->dsdx;
-   const int dtdx  = samp->dtdx;
-   const int width = samp->width;
-   uint32_t *row   = samp->row;
-   int s = samp->s;
-   int t = samp->t;
-
-   for (int i = 0; i < width; i++) {
-      const uint8_t *texel = (src +
-                              (t>>FIXED16_SHIFT) * stride +
-                              (s>>FIXED16_SHIFT) * 4);
-
-      row[i] = *(const uint32_t *)texel;
-
-      s += dsdx;
-      t += dtdx;
-   }
-
-   samp->s += samp->dsdy;
-   samp->t += samp->dtdy;
-   return row;
-}
-
-
-static const uint32_t *
-fetch_bgrx(struct lp_linear_elem *elem)
-{
-   struct lp_linear_sampler *samp = (struct lp_linear_sampler *)elem;
-   const struct lp_jit_texture *texture = samp->texture;
-   const uint8_t *src = texture->base;
-   const int stride = texture->row_stride[0];
-   const int dsdx  = samp->dsdx;
-   const int dtdx  = samp->dtdx;
-   const int width = samp->width;
-   uint32_t *row   = samp->row;
-   int s = samp->s;
-   int t = samp->t;
-
-   for (int i = 0; i < width; i++) {
-      const uint8_t *texel = (src +
-                              (t>>FIXED16_SHIFT) * stride +
-                              (s>>FIXED16_SHIFT) * 4);
-
-      row[i] = (*(const uint32_t *)texel) | 0xff000000;
-
-      s += dsdx;
-      t += dtdx;
-   }
-
-   samp->s += samp->dsdy;
-   samp->t += samp->dtdy;
-   return row;
-}
-
-
-/* Non-axis aligned, clamped.
- */
-static const uint32_t *
-fetch_bgra_clamp(struct lp_linear_elem *elem)
-{
-   struct lp_linear_sampler *samp = (struct lp_linear_sampler *)elem;
-   const struct lp_jit_texture *texture = samp->texture;
-   const uint8_t *src   = texture->base;
-   const int stride     = texture->row_stride[0];
-   const int tex_height = texture->height - 1;
-   const int tex_width  = texture->width - 1;
-   const int dsdx  = samp->dsdx;
-   const int dtdx  = samp->dtdx;
-   const int width = samp->width;
-   uint32_t *row   = samp->row;
-   int s = samp->s;
-   int t = samp->t;
-
-   for (int i = 0; i < width; i++) {
-      int ct = CLAMP(t>>FIXED16_SHIFT, 0, tex_height);
-      int cs = CLAMP(s>>FIXED16_SHIFT, 0, tex_width);
-
-      const uint8_t *texel = src + ct * stride + cs * 4;
-
-      row[i] = *(const uint32_t *)texel;
-
-      s += dsdx;
-      t += dtdx;
-   }
-
-   samp->s += samp->dsdy;
-   samp->t += samp->dtdy;
-   return row;
-}
-
-
-static const uint32_t *
-fetch_bgrx_clamp(struct lp_linear_elem *elem)
-{
-   struct lp_linear_sampler *samp = (struct lp_linear_sampler *)elem;
-   const struct lp_jit_texture *texture = samp->texture;
-   const uint8_t *src   = texture->base;
-   const int stride     = texture->row_stride[0];
-   const int tex_height = texture->height - 1;
-   const int tex_width  = texture->width - 1;
-   const int dsdx  = samp->dsdx;
-   const int dtdx  = samp->dtdx;
-   const int width = samp->width;
-   uint32_t *row   = samp->row;
-   int s = samp->s;
-   int t = samp->t;
-
-   for (int i = 0; i < width; i++) {
-      int ct = CLAMP(t>>FIXED16_SHIFT, 0, tex_height);
-      int cs = CLAMP(s>>FIXED16_SHIFT, 0, tex_width);
-
-      const uint8_t *texel = src + ct * stride + cs * 4;
-
-      row[i] = (*(const uint32_t *)texel) | 0xff000000;
-
-      s += dsdx;
-      t += dtdx;
-   }
-
-   samp->s += samp->dsdy;
-   samp->t += samp->dtdy;
-   return row;
-}
-
-
 /**
 * Fetch and stretch one row.
 */
@ -394,7 +192,7 @@ fetch_and_stretch_bgra_row(struct lp_linear_sampler *samp,
 * temporary or fetch sparsely.
 */
 static const uint32_t *
-fetch_bgra_axis_aligned_linear(struct lp_linear_elem *elem)
+fetch_axis_aligned_linear_bgra(struct lp_linear_elem *elem)
 {
   struct lp_linear_sampler *samp = (struct lp_linear_sampler *)elem;
   const int width = samp->width;
@ -431,7 +229,7 @@ fetch_bgra_axis_aligned_linear(struct lp_linear_elem *elem)
 * maximize.
 */
 static const uint32_t *
-fetch_bgra_linear(struct lp_linear_elem *elem)
+fetch_linear_bgra(struct lp_linear_elem *elem)
 {
   struct lp_linear_sampler *samp = (struct lp_linear_sampler *)elem;
   const struct lp_jit_texture *texture = samp->texture;
@ -485,7 +283,7 @@ fetch_bgra_linear(struct lp_linear_elem *elem)
 * maximize.
 */
 static const uint32_t *
-fetch_bgra_clamp_linear(struct lp_linear_elem *elem)
+fetch_clamp_linear_bgra(struct lp_linear_elem *elem)
 {
   struct lp_linear_sampler *samp = (struct lp_linear_sampler *)elem;
   const struct lp_jit_texture *texture = samp->texture;
@ -596,65 +394,16 @@ fetch_bgra_clamp_linear(struct lp_linear_elem *elem)
   return row;
 }

+/* Don't generate the bgra 128-bit (linear) or memcpy variants from the template; bgra has its own dedicated functions for those above. */
+#define FETCH_TYPE bgra
+#define OP
+#define NO_MEMCPY
+#include "lp_linear_sampler_tmp.h"

-static const uint32_t *
-fetch_bgrx_axis_aligned_linear(struct lp_linear_elem *elem)
-{
-   struct lp_linear_sampler *samp = (struct lp_linear_sampler *)elem;
-   const __m128i mask = _mm_set1_epi32(0xff000000);
-   uint32_t *dst_row = samp->row;
-   const uint32_t *src_row = fetch_bgra_axis_aligned_linear(&samp->base);
-   const int width = samp->width;
-
-   for (int i = 0; i < width; i += 4) {
-      __m128i bgra = *(__m128i *)&src_row[i];
-      __m128i bgrx = _mm_or_si128(bgra, mask);
-      *(__m128i *)&dst_row[i] = bgrx;
-   }
-
-   return dst_row;
-}
-
-
-static const uint32_t *
-fetch_bgrx_clamp_linear(struct lp_linear_elem *elem)
-{
-   struct lp_linear_sampler *samp = (struct lp_linear_sampler *)elem;
-   const __m128i mask = _mm_set1_epi32(0xff000000);
-   uint32_t *row = samp->row;
-   const int width = samp->width;
-
-   fetch_bgra_clamp_linear(&samp->base);
-
-   for (int i = 0; i < width; i += 4) {
-      __m128i bgra = *(__m128i *)&row[i];
-      __m128i bgrx = _mm_or_si128(bgra, mask);
-      *(__m128i *)&row[i] = bgrx;
-   }
-
-   return row;
-}
-
-
-static const uint32_t *
-fetch_bgrx_linear(struct lp_linear_elem *elem)
-{
-   struct lp_linear_sampler *samp = (struct lp_linear_sampler *)elem;
-   const __m128i mask = _mm_set1_epi32(0xff000000);
-   uint32_t *row = samp->row;
-   const int width = samp->width;
-
-   fetch_bgra_linear(&samp->base);
-
-   for (int i = 0; i < width; i += 4) {
-      __m128i bgra = *(__m128i *)&row[i];
-      __m128i bgrx = _mm_or_si128(bgra, mask);
-      *(__m128i *)&row[i] = bgrx;
-   }
-
-   return row;
-}
-
+#define FETCH_TYPE bgrx
+#define OP rgbx
+#define OP128 rgbx_128
+#include "lp_linear_sampler_tmp.h"

 static bool
 sampler_is_nearest(const struct lp_linear_sampler *samp,
@ -875,23 +624,23 @@ lp_linear_init_sampler(struct lp_linear_sampler *samp,
      switch (sampler_state->texture_state.format) {
      case PIPE_FORMAT_B8G8R8A8_UNORM:
         if (need_wrap)
-            samp->base.fetch = fetch_bgra_clamp;
+            samp->base.fetch = fetch_clamp_bgra;
         else if (!samp->axis_aligned)
            samp->base.fetch = fetch_bgra;
         else if (samp->dsdx != FIXED16_ONE) // TODO: could be relaxed
-            samp->base.fetch = fetch_bgra_axis_aligned;
+            samp->base.fetch = fetch_axis_aligned_bgra;
         else
-            samp->base.fetch = fetch_bgra_memcpy;
+            samp->base.fetch = fetch_memcpy_bgra;
         return true;
      case PIPE_FORMAT_B8G8R8X8_UNORM:
         if (need_wrap)
-            samp->base.fetch = fetch_bgrx_clamp;
+            samp->base.fetch = fetch_clamp_bgrx;
         else if (!samp->axis_aligned)
            samp->base.fetch = fetch_bgrx;
         else if (samp->dsdx != FIXED16_ONE) // TODO: could be relaxed
-            samp->base.fetch = fetch_bgrx_axis_aligned;
+            samp->base.fetch = fetch_axis_aligned_bgrx;
         else
-            samp->base.fetch = fetch_bgrx_memcpy;
+            samp->base.fetch = fetch_memcpy_bgrx;
         return true;
      default:
         break;
@ -906,19 +655,19 @@ lp_linear_init_sampler(struct lp_linear_sampler *samp,
      switch (sampler_state->texture_state.format) {
      case PIPE_FORMAT_B8G8R8A8_UNORM:
         if (need_wrap)
-            samp->base.fetch = fetch_bgra_clamp_linear;
+            samp->base.fetch = fetch_clamp_linear_bgra;
         else if (!samp->axis_aligned)
-            samp->base.fetch = fetch_bgra_linear;
+            samp->base.fetch = fetch_linear_bgra;
         else
-            samp->base.fetch = fetch_bgra_axis_aligned_linear;
+            samp->base.fetch = fetch_axis_aligned_linear_bgra;
         return true;
      case PIPE_FORMAT_B8G8R8X8_UNORM:
         if (need_wrap)
-            samp->base.fetch = fetch_bgrx_clamp_linear;
+            samp->base.fetch = fetch_clamp_linear_bgrx;
         else if (!samp->axis_aligned)
-            samp->base.fetch = fetch_bgrx_linear;
+            samp->base.fetch = fetch_linear_bgrx;
         else
-            samp->base.fetch = fetch_bgrx_axis_aligned_linear;
+            samp->base.fetch = fetch_axis_aligned_linear_bgrx;
         return true;
      default:
         break;
--- a/src/gallium/drivers/llvmpipe/lp_linear_sampler_tmp.h
+++ b/src/gallium/drivers/llvmpipe/lp_linear_sampler_tmp.h
@ -0,0 +1,188 @@
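+/* Sampler template functions: define FETCH_TYPE and OP (and optionally OP128 / NO_MEMCPY) before including; all four are #undef'd at the end. */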
+
+#ifndef NO_MEMCPY
+/*
+ * Unstretched blit of a FETCH_TYPE texture, applying OP to each texel.
+ */
+static const uint32_t *
+CONCAT2(fetch_memcpy_, FETCH_TYPE)(struct lp_linear_elem *elem)
+{
+   struct lp_linear_sampler *samp = (struct lp_linear_sampler *)elem;
+   const struct lp_jit_texture *texture = samp->texture;
+   const uint32_t *src_row =
+      (const uint32_t *)((const uint8_t *)texture->base +
+                         (samp->t >> FIXED16_SHIFT) * texture->row_stride[0]);
+   const int s     = samp->s;
+   const int width = samp->width;
+   uint32_t *row   = samp->row;
+
+   src_row = &src_row[s >> FIXED16_SHIFT];
+
+   for (int i = 0; i < width; i++) {
+      row[i] = OP(src_row[i]);
+   }
+
+   samp->t += samp->dtdy;
+   return row;
+}
+#endif
+
+/*
+ * Perform nearest filtered lookup of a row of texels.  Texture lookup
+ * is assumed to be axis aligned but with arbitrary scaling.
+ *
+ * Texture coordinate interpolation is performed in 16.16 fixed point,
+ * not to be confused with the 1.15 format used by the interpolants.
+ *
+ * After 64 pixels (ie. in the next tile), the starting point will be
+ * recalculated with floating point arithmetic.
+ */
+static const uint32_t *
+CONCAT2(fetch_axis_aligned_, FETCH_TYPE)(struct lp_linear_elem *elem)
+{
+   struct lp_linear_sampler *samp = (struct lp_linear_sampler *)elem;
+   const struct lp_jit_texture *texture = samp->texture;
+   const uint32_t *src_row =
+      (const uint32_t *)((const uint8_t *)texture->base +
+                         (samp->t >> FIXED16_SHIFT) * texture->row_stride[0]);
+   const int dsdx  = samp->dsdx;
+   const int width = samp->width;
+   uint32_t *row   = samp->row;
+   int s = samp->s;
+
+   for (int i = 0; i < width; i++) {
+      row[i] = OP(src_row[s>>FIXED16_SHIFT]);
+      s += dsdx;
+   }
+
+   samp->t += samp->dtdy;
+   return row;
+}
+
+/* Non-axis aligned, but no clamping or wrapping required
+ */
+static const uint32_t *
+CONCAT2(fetch_, FETCH_TYPE)(struct lp_linear_elem *elem)
+{
+   struct lp_linear_sampler *samp = (struct lp_linear_sampler *)elem;
+   const struct lp_jit_texture *texture = samp->texture;
+   const uint8_t *src = texture->base;
+   const int stride = texture->row_stride[0];
+   const int dsdx  = samp->dsdx;
+   const int dtdx  = samp->dtdx;
+   const int width = samp->width;
+   uint32_t *row   = samp->row;
+   int s = samp->s;
+   int t = samp->t;
+
+   for (int i = 0; i < width; i++) {
+      const uint8_t *texel = (src +
+                              (t>>FIXED16_SHIFT) * stride +
+                              (s>>FIXED16_SHIFT) * 4);
+
+      row[i] = OP(*(const uint32_t *)texel);
+
+      s += dsdx;
+      t += dtdx;
+   }
+
+   samp->s += samp->dsdy;
+   samp->t += samp->dtdy;
+   return row;
+}
+
+/* Non-axis aligned, clamped.
+ */
+static const uint32_t *
+CONCAT2(fetch_clamp_, FETCH_TYPE)(struct lp_linear_elem *elem)
+{
+   struct lp_linear_sampler *samp = (struct lp_linear_sampler *)elem;
+   const struct lp_jit_texture *texture = samp->texture;
+   const uint8_t *src   = texture->base;
+   const int stride     = texture->row_stride[0];
+   const int tex_height = texture->height - 1;
+   const int tex_width  = texture->width - 1;
+   const int dsdx  = samp->dsdx;
+   const int dtdx  = samp->dtdx;
+   const int width = samp->width;
+   uint32_t *row   = samp->row;
+   int s = samp->s;
+   int t = samp->t;
+
+   for (int i = 0; i < width; i++) {
+      int ct = CLAMP(t>>FIXED16_SHIFT, 0, tex_height);
+      int cs = CLAMP(s>>FIXED16_SHIFT, 0, tex_width);
+
+      const uint8_t *texel = src + ct * stride + cs * 4;
+
+      row[i] = OP(*(const uint32_t *)texel);
+
+      s += dsdx;
+      t += dtdx;
+   }
+
+   samp->s += samp->dsdy;
+   samp->t += samp->dtdy;
+   return row;
+}
+
+#ifdef OP128
+static const uint32_t *
+CONCAT2(fetch_axis_aligned_linear_, FETCH_TYPE)(struct lp_linear_elem *elem)
+{
+   struct lp_linear_sampler *samp = (struct lp_linear_sampler *)elem;
+
+   uint32_t *dst_row = samp->row;
+   const uint32_t *src_row = fetch_axis_aligned_linear_bgra(&samp->base);
+   const int width = samp->width;
+
+   for (int i = 0; i < width; i += 4) {
+      __m128i bgra = *(__m128i *)&src_row[i];
+      __m128i rgba = OP128(bgra);
+      *(__m128i *)&dst_row[i] = rgba;
+   }
+
+   return dst_row;
+}
+
+static const uint32_t *
+CONCAT2(fetch_clamp_linear_, FETCH_TYPE)(struct lp_linear_elem *elem)
+{
+   struct lp_linear_sampler *samp = (struct lp_linear_sampler *)elem;
+   uint32_t *row = samp->row;
+   const int width = samp->width;
+
+   fetch_clamp_linear_bgra(&samp->base);
+
+   for (int i = 0; i < width; i += 4) {
+      __m128i bgra = *(__m128i *)&row[i];
+      __m128i rgba = OP128(bgra);
+      *(__m128i *)&row[i] = rgba;
+   }
+
+   return row;
+}
+
+static const uint32_t *
+CONCAT2(fetch_linear_, FETCH_TYPE)(struct lp_linear_elem *elem)
+{
+   struct lp_linear_sampler *samp = (struct lp_linear_sampler *)elem;
+   uint32_t *row = samp->row;
+   const int width = samp->width;
+
+   fetch_linear_bgra(&samp->base);
+
+   for (int i = 0; i < width; i += 4) {
+      __m128i bgra = *(__m128i *)&row[i];
+      __m128i rgba = OP128(bgra);
+      *(__m128i *)&row[i] = rgba;
+   }
+
+   return row;
+}
+#endif
+
+#undef OP
+#undef OP128
+#undef FETCH_TYPE
+#undef NO_MEMCPY
--- a/src/gallium/drivers/llvmpipe/meson.build
+++ b/src/gallium/drivers/llvmpipe/meson.build
@ -48,6 +48,7 @@ files_llvmpipe = files(
  'lp_linear_fastpath.c',
  'lp_linear_interp.c',
  'lp_linear_sampler.c',
+  'lp_linear_sampler_tmp.h',
  'lp_memory.c',
  'lp_memory.h',
  'lp_perf.c',