pan/lib: Rewrite npot divisor algorithm

It was based on reverse engineering and inaccurate. This also get ride of fp64 usage and will allow us to use it in OpenCL C shaders for indirect/indexed draw. On the full 32-bit range, this fixes ~10 millions inaccurate results. Signed-off-by: Mary Guillemard <mary.guillemard@collabora.com> Fixes: 467ae0d39d ("panfrost: Factor out panfrost_compute_magic_divisor") Acked-by: Boris Brezillon <boris.brezillon@collabora.com> Reviewed-by: Olivia Lee <olivia.lee@collabora.com> Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/35490>
2026-05-04 22:49:13 +02:00 · 2025-06-12 10:50:46 +02:00 · 2025-06-12 10:50:46 +02:00 · b53d98eb41
commit b53d98eb41
parent 01fae0c5c2
2 changed files with 14 additions and 37 deletions
--- a/src/panfrost/lib/pan_attributes.c
+++ b/src/panfrost/lib/pan_attributes.c
@ -87,44 +87,21 @@ pan_padded_vertex_count(unsigned vertex_count)
      return pan_large_padded_vertex_count(vertex_count);
 }

-/* The much, much more irritating case -- instancing is enabled. See
- * pan_job.h for notes on how this works */
-
 unsigned
-pan_compute_magic_divisor(unsigned hw_divisor, unsigned *o_shift,
-                          unsigned *extra_flags)
+pan_compute_magic_divisor(unsigned hw_divisor, unsigned *divisor_r,
+                          unsigned *divisor_e)
 {
-   /* We have a NPOT divisor. Here's the fun one (multipling by
-    * the inverse and shifting) */
+   unsigned r = util_logbase2(hw_divisor);

-   /* floor(log2(d)) */
-   unsigned shift = util_logbase2(hw_divisor);
+   uint64_t shift_hi = 32 + r;
+   uint64_t t = (uint64_t)1 << shift_hi;
+   uint64_t f0 = t + hw_divisor / 2;
+   uint64_t fi = f0 / hw_divisor;
+   uint64_t ff = f0 - fi * hw_divisor;

-   /* m = ceil(2^(32 + shift) / d) */
-   uint64_t shift_hi = 32 + shift;
-   uint64_t t = 1ll << shift_hi;
-   double t_f = t;
-   double hw_divisor_d = hw_divisor;
-   double m_f = ceil(t_f / hw_divisor_d);
-   unsigned m = m_f;
+   uint64_t d = fi - (1ul << 31);
+   *divisor_r = r;
+   *divisor_e = ff > hw_divisor / 2 ? 1 : 0;

-   /* Default case */
-   uint32_t magic_divisor = m;
-
-   /* e = 2^(shift + 32) % d */
-   uint64_t e = t % hw_divisor;
-
-   /* Apply round-down algorithm? e <= 2^shift?. XXX: The blob
-    * seems to use a different condition */
-   if (e <= (1ll << shift)) {
-      magic_divisor = m - 1;
-      *extra_flags = 1;
-   }
-
-   /* Top flag implicitly set */
-   assert(magic_divisor & (1u << 31));
-   magic_divisor &= ~(1u << 31);
-   *o_shift = shift;
-
-   return magic_divisor;
+   return d;
 }
--- a/src/panfrost/lib/pan_encoder.h
+++ b/src/panfrost/lib/pan_encoder.h
@ -78,8 +78,8 @@ unsigned pan_get_total_stack_size(unsigned thread_size,

 unsigned pan_padded_vertex_count(unsigned vertex_count);

-unsigned pan_compute_magic_divisor(unsigned hw_divisor, unsigned *o_shift,
-                                   unsigned *extra_flags);
+unsigned pan_compute_magic_divisor(unsigned hw_divisor, unsigned *divisor_r,
+                                   unsigned *divisor_e);

 #ifdef PAN_ARCH
 /* Records for gl_VertexID and gl_InstanceID use special encodings on Midgard */