vc4: Compile the LT image helper per cpp we might load/store.

For the partial load/store support I'm about to add, we want the memcpy to
be compiled down to a single load/store.  Passing cpp as a constant in each
case should also eliminate the calls to vc4_utile_width/height().

Improves x11perf -putimage100 performance by 3.76344% +/- 1.16978% (n=15)
This commit is contained in:
Eric Anholt 2018-08-07 17:53:24 -07:00
parent d6a174669f
commit 3e06b918aa

View file

@ -289,12 +289,40 @@ vc4_lt_image_helper(void *gpu, uint32_t gpu_stride,
}
}
/**
 * Forwards to vc4_lt_image_helper() with cpp spelled as a literal constant.
 *
 * Every branch makes the same call and differs only in the constant passed
 * for cpp.  The intent is that, once the helper is inlined, the compiler
 * sees a known cpp at each call site and can specialize the copy for that
 * size instead of handling a runtime value.
 *
 * gpu, gpu_stride, cpu, cpu_stride, box and to_cpu are passed through
 * unchanged.  cpp is presumably the pixel size in bytes (TODO confirm);
 * only 1, 2, 4 and 8 are handled, anything else hits unreachable().
 */
static inline void
vc4_lt_image_cpp_helper(void *gpu, uint32_t gpu_stride,
                        void *cpu, uint32_t cpu_stride,
                        int cpp, const struct pipe_box *box, bool to_cpu)
{
        /* Do not collapse these branches into one call taking "cpp": the
         * literal argument is the whole point of this wrapper.
         */
        if (cpp == 1) {
                vc4_lt_image_helper(gpu, gpu_stride, cpu, cpu_stride,
                                    1, box, to_cpu);
        } else if (cpp == 2) {
                vc4_lt_image_helper(gpu, gpu_stride, cpu, cpu_stride,
                                    2, box, to_cpu);
        } else if (cpp == 4) {
                vc4_lt_image_helper(gpu, gpu_stride, cpu, cpu_stride,
                                    4, box, to_cpu);
        } else if (cpp == 8) {
                vc4_lt_image_helper(gpu, gpu_stride, cpu, cpu_stride,
                                    8, box, to_cpu);
        } else {
                unreachable("bad cpp");
        }
}
/**
 * Load entry point for LT images.
 *
 * Hands the work to vc4_lt_image_cpp_helper() with to_cpu = true: src is
 * passed as the helper's gpu pointer/stride and dst as its cpu
 * pointer/stride.  cpp and box are forwarded unchanged; the helper only
 * accepts cpp values of 1, 2, 4 or 8.
 *
 * The copy is issued exactly once, through the per-cpp dispatcher, so the
 * constant-cpp variants are the ones that run.
 */
void
NEON_TAG(vc4_load_lt_image)(void *dst, uint32_t dst_stride,
                            void *src, uint32_t src_stride,
                            int cpp, const struct pipe_box *box)
{
        vc4_lt_image_cpp_helper(src, src_stride, dst, dst_stride, cpp, box,
                                true);
}
void
@ -302,5 +330,6 @@ NEON_TAG(vc4_store_lt_image)(void *dst, uint32_t dst_stride,
void *src, uint32_t src_stride,
int cpp, const struct pipe_box *box)
{
vc4_lt_image_helper(dst, dst_stride, src, src_stride, cpp, box, false);
vc4_lt_image_cpp_helper(dst, dst_stride, src, src_stride, cpp, box,
false);
}