mirror of
https://gitlab.freedesktop.org/mesa/mesa.git
synced 2026-03-26 00:10:34 +01:00
vc4: Use named parameters for the NEON inline asm.
This makes the asm code more intelligible and clarifies the functional
change in the next commit.
(commit message and commit squashing by anholt)
(cherry picked from commit 522f688471)
[Emil: apply the patch to vc4_tiling_lt.c instead of v3d_cpu_tiling.h]
Signed-off-by: Emil Velikov <emil.velikov@collabora.com>
Conflicts:
src/broadcom/common/v3d_cpu_tiling.h
This commit is contained in:
parent
3b9e9e4723
commit
b280cdb59e
1 changed file with 100 additions and 80 deletions
|
|
@ -73,20 +73,22 @@ vc4_load_utile(void *cpu, void *gpu, uint32_t cpu_stride, uint32_t cpp)
|
|||
/* Load from the GPU in one shot, no interleave, to
|
||||
* d0-d7.
|
||||
*/
|
||||
"vldm %0, {q0, q1, q2, q3}\n"
|
||||
"vldm %[gpu], {q0, q1, q2, q3}\n"
|
||||
/* Store each 8-byte line to cpu-side destination,
|
||||
* incrementing it by the stride each time.
|
||||
*/
|
||||
"vst1.8 d0, [%1], %2\n"
|
||||
"vst1.8 d1, [%1], %2\n"
|
||||
"vst1.8 d2, [%1], %2\n"
|
||||
"vst1.8 d3, [%1], %2\n"
|
||||
"vst1.8 d4, [%1], %2\n"
|
||||
"vst1.8 d5, [%1], %2\n"
|
||||
"vst1.8 d6, [%1], %2\n"
|
||||
"vst1.8 d7, [%1]\n"
|
||||
"vst1.8 d0, [%[cpu]], %[cpu_stride]\n"
|
||||
"vst1.8 d1, [%[cpu]], %[cpu_stride]\n"
|
||||
"vst1.8 d2, [%[cpu]], %[cpu_stride]\n"
|
||||
"vst1.8 d3, [%[cpu]], %[cpu_stride]\n"
|
||||
"vst1.8 d4, [%[cpu]], %[cpu_stride]\n"
|
||||
"vst1.8 d5, [%[cpu]], %[cpu_stride]\n"
|
||||
"vst1.8 d6, [%[cpu]], %[cpu_stride]\n"
|
||||
"vst1.8 d7, [%[cpu]]\n"
|
||||
:
|
||||
: "r"(gpu), "r"(cpu), "r"(cpu_stride)
|
||||
: [gpu] "r"(gpu),
|
||||
[cpu] "r"(cpu),
|
||||
[cpu_stride] "r"(cpu_stride)
|
||||
: "q0", "q1", "q2", "q3");
|
||||
} else {
|
||||
assert(gpu_stride == 16);
|
||||
|
|
@ -94,21 +96,24 @@ vc4_load_utile(void *cpu, void *gpu, uint32_t cpu_stride, uint32_t cpp)
|
|||
/* Load from the GPU in one shot, no interleave, to
|
||||
* d0-d7.
|
||||
*/
|
||||
"vldm %0, {q0, q1, q2, q3};\n"
|
||||
"vldm %[gpu], {q0, q1, q2, q3};\n"
|
||||
/* Store each 16-byte line in 2 parts to the cpu-side
|
||||
* destination. (vld1 can only store one d-register
|
||||
* at a time).
|
||||
*/
|
||||
"vst1.8 d0, [%1], %3\n"
|
||||
"vst1.8 d1, [%2], %3\n"
|
||||
"vst1.8 d2, [%1], %3\n"
|
||||
"vst1.8 d3, [%2], %3\n"
|
||||
"vst1.8 d4, [%1], %3\n"
|
||||
"vst1.8 d5, [%2], %3\n"
|
||||
"vst1.8 d6, [%1]\n"
|
||||
"vst1.8 d7, [%2]\n"
|
||||
"vst1.8 d0, [%[cpu]], %[cpu_stride]\n"
|
||||
"vst1.8 d1, [%[cpu2]],%[cpu_stride]\n"
|
||||
"vst1.8 d2, [%[cpu]], %[cpu_stride]\n"
|
||||
"vst1.8 d3, [%[cpu2]],%[cpu_stride]\n"
|
||||
"vst1.8 d4, [%[cpu]], %[cpu_stride]\n"
|
||||
"vst1.8 d5, [%[cpu2]],%[cpu_stride]\n"
|
||||
"vst1.8 d6, [%[cpu]]\n"
|
||||
"vst1.8 d7, [%[cpu2]]\n"
|
||||
:
|
||||
: "r"(gpu), "r"(cpu), "r"(cpu + 8), "r"(cpu_stride)
|
||||
: [gpu] "r"(gpu),
|
||||
[cpu] "r"(cpu),
|
||||
[cpu2] "r"(cpu + 8),
|
||||
[cpu_stride] "r"(cpu_stride)
|
||||
: "q0", "q1", "q2", "q3");
|
||||
}
|
||||
#elif defined (PIPE_ARCH_AARCH64)
|
||||
|
|
@ -117,20 +122,22 @@ vc4_load_utile(void *cpu, void *gpu, uint32_t cpu_stride, uint32_t cpp)
|
|||
/* Load from the GPU in one shot, no interleave, to
|
||||
* d0-d7.
|
||||
*/
|
||||
"ld1 {v0.2d, v1.2d, v2.2d, v3.2d}, [%0]\n"
|
||||
"ld1 {v0.2d, v1.2d, v2.2d, v3.2d}, [%[gpu]]\n"
|
||||
/* Store each 8-byte line to cpu-side destination,
|
||||
* incrementing it by the stride each time.
|
||||
*/
|
||||
"st1 {v0.D}[0], [%1], %2\n"
|
||||
"st1 {v0.D}[1], [%1], %2\n"
|
||||
"st1 {v1.D}[0], [%1], %2\n"
|
||||
"st1 {v1.D}[1], [%1], %2\n"
|
||||
"st1 {v2.D}[0], [%1], %2\n"
|
||||
"st1 {v2.D}[1], [%1], %2\n"
|
||||
"st1 {v3.D}[0], [%1], %2\n"
|
||||
"st1 {v3.D}[1], [%1]\n"
|
||||
"st1 {v0.D}[0], [%[cpu]], %[cpu_stride]\n"
|
||||
"st1 {v0.D}[1], [%[cpu]], %[cpu_stride]\n"
|
||||
"st1 {v1.D}[0], [%[cpu]], %[cpu_stride]\n"
|
||||
"st1 {v1.D}[1], [%[cpu]], %[cpu_stride]\n"
|
||||
"st1 {v2.D}[0], [%[cpu]], %[cpu_stride]\n"
|
||||
"st1 {v2.D}[1], [%[cpu]], %[cpu_stride]\n"
|
||||
"st1 {v3.D}[0], [%[cpu]], %[cpu_stride]\n"
|
||||
"st1 {v3.D}[1], [%[cpu]]\n"
|
||||
:
|
||||
: "r"(gpu), "r"(cpu), "r"(cpu_stride)
|
||||
: [gpu] "r"(gpu),
|
||||
[cpu] "r"(cpu),
|
||||
[cpu_stride] "r"(cpu_stride)
|
||||
: "v0", "v1", "v2", "v3");
|
||||
} else {
|
||||
assert(gpu_stride == 16);
|
||||
|
|
@ -138,21 +145,24 @@ vc4_load_utile(void *cpu, void *gpu, uint32_t cpu_stride, uint32_t cpp)
|
|||
/* Load from the GPU in one shot, no interleave, to
|
||||
* d0-d7.
|
||||
*/
|
||||
"ld1 {v0.2d, v1.2d, v2.2d, v3.2d}, [%0]\n"
|
||||
"ld1 {v0.2d, v1.2d, v2.2d, v3.2d}, [%[gpu]]\n"
|
||||
/* Store each 16-byte line in 2 parts to the cpu-side
|
||||
* destination. (vld1 can only store one d-register
|
||||
* at a time).
|
||||
*/
|
||||
"st1 {v0.D}[0], [%1], %3\n"
|
||||
"st1 {v0.D}[1], [%2], %3\n"
|
||||
"st1 {v1.D}[0], [%1], %3\n"
|
||||
"st1 {v1.D}[1], [%2], %3\n"
|
||||
"st1 {v2.D}[0], [%1], %3\n"
|
||||
"st1 {v2.D}[1], [%2], %3\n"
|
||||
"st1 {v3.D}[0], [%1]\n"
|
||||
"st1 {v3.D}[1], [%2]\n"
|
||||
"st1 {v0.D}[0], [%[cpu]], %[cpu_stride]\n"
|
||||
"st1 {v0.D}[1], [%[cpu2]],%[cpu_stride]\n"
|
||||
"st1 {v1.D}[0], [%[cpu]], %[cpu_stride]\n"
|
||||
"st1 {v1.D}[1], [%[cpu2]],%[cpu_stride]\n"
|
||||
"st1 {v2.D}[0], [%[cpu]], %[cpu_stride]\n"
|
||||
"st1 {v2.D}[1], [%[cpu2]],%[cpu_stride]\n"
|
||||
"st1 {v3.D}[0], [%[cpu]]\n"
|
||||
"st1 {v3.D}[1], [%[cpu2]]\n"
|
||||
:
|
||||
: "r"(gpu), "r"(cpu), "r"(cpu + 8), "r"(cpu_stride)
|
||||
: [gpu] "r"(gpu),
|
||||
[cpu] "r"(cpu),
|
||||
[cpu2] "r"(cpu + 8),
|
||||
[cpu_stride] "r"(cpu_stride)
|
||||
: "v0", "v1", "v2", "v3");
|
||||
}
|
||||
#else
|
||||
|
|
@ -174,20 +184,22 @@ vc4_store_utile(void *gpu, void *cpu, uint32_t cpu_stride, uint32_t cpp)
|
|||
/* Load each 8-byte line from cpu-side source,
|
||||
* incrementing it by the stride each time.
|
||||
*/
|
||||
"vld1.8 d0, [%1], %2\n"
|
||||
"vld1.8 d1, [%1], %2\n"
|
||||
"vld1.8 d2, [%1], %2\n"
|
||||
"vld1.8 d3, [%1], %2\n"
|
||||
"vld1.8 d4, [%1], %2\n"
|
||||
"vld1.8 d5, [%1], %2\n"
|
||||
"vld1.8 d6, [%1], %2\n"
|
||||
"vld1.8 d7, [%1]\n"
|
||||
"vld1.8 d0, [%[cpu]], %[cpu_stride]\n"
|
||||
"vld1.8 d1, [%[cpu]], %[cpu_stride]\n"
|
||||
"vld1.8 d2, [%[cpu]], %[cpu_stride]\n"
|
||||
"vld1.8 d3, [%[cpu]], %[cpu_stride]\n"
|
||||
"vld1.8 d4, [%[cpu]], %[cpu_stride]\n"
|
||||
"vld1.8 d5, [%[cpu]], %[cpu_stride]\n"
|
||||
"vld1.8 d6, [%[cpu]], %[cpu_stride]\n"
|
||||
"vld1.8 d7, [%[cpu]]\n"
|
||||
/* Load from the GPU in one shot, no interleave, to
|
||||
* d0-d7.
|
||||
*/
|
||||
"vstm %0, {q0, q1, q2, q3}\n"
|
||||
"vstm %[gpu], {q0, q1, q2, q3}\n"
|
||||
:
|
||||
: "r"(gpu), "r"(cpu), "r"(cpu_stride)
|
||||
: [gpu] "r"(gpu),
|
||||
[cpu] "r"(cpu),
|
||||
[cpu_stride] "r"(cpu_stride)
|
||||
: "q0", "q1", "q2", "q3");
|
||||
} else {
|
||||
assert(gpu_stride == 16);
|
||||
|
|
@ -196,18 +208,21 @@ vc4_store_utile(void *gpu, void *cpu, uint32_t cpu_stride, uint32_t cpp)
|
|||
* destination. (vld1 can only store one d-register
|
||||
* at a time).
|
||||
*/
|
||||
"vld1.8 d0, [%1], %3\n"
|
||||
"vld1.8 d1, [%2], %3\n"
|
||||
"vld1.8 d2, [%1], %3\n"
|
||||
"vld1.8 d3, [%2], %3\n"
|
||||
"vld1.8 d4, [%1], %3\n"
|
||||
"vld1.8 d5, [%2], %3\n"
|
||||
"vld1.8 d6, [%1]\n"
|
||||
"vld1.8 d7, [%2]\n"
|
||||
"vld1.8 d0, [%[cpu]], %[cpu_stride]\n"
|
||||
"vld1.8 d1, [%[cpu2]],%[cpu_stride]\n"
|
||||
"vld1.8 d2, [%[cpu]], %[cpu_stride]\n"
|
||||
"vld1.8 d3, [%[cpu2]],%[cpu_stride]\n"
|
||||
"vld1.8 d4, [%[cpu]], %[cpu_stride]\n"
|
||||
"vld1.8 d5, [%[cpu2]],%[cpu_stride]\n"
|
||||
"vld1.8 d6, [%[cpu]]\n"
|
||||
"vld1.8 d7, [%[cpu2]]\n"
|
||||
/* Store to the GPU in one shot, no interleave. */
|
||||
"vstm %0, {q0, q1, q2, q3}\n"
|
||||
"vstm %[gpu], {q0, q1, q2, q3}\n"
|
||||
:
|
||||
: "r"(gpu), "r"(cpu), "r"(cpu + 8), "r"(cpu_stride)
|
||||
: [gpu] "r"(gpu),
|
||||
[cpu] "r"(cpu),
|
||||
[cpu2] "r"(cpu + 8),
|
||||
[cpu_stride] "r"(cpu_stride)
|
||||
: "q0", "q1", "q2", "q3");
|
||||
}
|
||||
#elif defined (PIPE_ARCH_AARCH64)
|
||||
|
|
@ -216,18 +231,20 @@ vc4_store_utile(void *gpu, void *cpu, uint32_t cpu_stride, uint32_t cpp)
|
|||
/* Load each 8-byte line from cpu-side source,
|
||||
* incrementing it by the stride each time.
|
||||
*/
|
||||
"ld1 {v0.D}[0], [%1], %2\n"
|
||||
"ld1 {v0.D}[1], [%1], %2\n"
|
||||
"ld1 {v1.D}[0], [%1], %2\n"
|
||||
"ld1 {v1.D}[1], [%1], %2\n"
|
||||
"ld1 {v2.D}[0], [%1], %2\n"
|
||||
"ld1 {v2.D}[1], [%1], %2\n"
|
||||
"ld1 {v3.D}[0], [%1], %2\n"
|
||||
"ld1 {v3.D}[1], [%1]\n"
|
||||
"ld1 {v0.D}[0], [%[cpu]], %[cpu_stride]\n"
|
||||
"ld1 {v0.D}[1], [%[cpu]], %[cpu_stride]\n"
|
||||
"ld1 {v1.D}[0], [%[cpu]], %[cpu_stride]\n"
|
||||
"ld1 {v1.D}[1], [%[cpu]], %[cpu_stride]\n"
|
||||
"ld1 {v2.D}[0], [%[cpu]], %[cpu_stride]\n"
|
||||
"ld1 {v2.D}[1], [%[cpu]], %[cpu_stride]\n"
|
||||
"ld1 {v3.D}[0], [%[cpu]], %[cpu_stride]\n"
|
||||
"ld1 {v3.D}[1], [%[cpu]]\n"
|
||||
/* Store to the GPU in one shot, no interleave. */
|
||||
"st1 {v0.2d, v1.2d, v2.2d, v3.2d}, [%0]\n"
|
||||
"st1 {v0.2d, v1.2d, v2.2d, v3.2d}, [%[gpu]]\n"
|
||||
:
|
||||
: "r"(gpu), "r"(cpu), "r"(cpu_stride)
|
||||
: [gpu] "r"(gpu),
|
||||
[cpu] "r"(cpu),
|
||||
[cpu_stride] "r"(cpu_stride)
|
||||
: "v0", "v1", "v2", "v3");
|
||||
} else {
|
||||
assert(gpu_stride == 16);
|
||||
|
|
@ -236,18 +253,21 @@ vc4_store_utile(void *gpu, void *cpu, uint32_t cpu_stride, uint32_t cpp)
|
|||
* destination. (vld1 can only store one d-register
|
||||
* at a time).
|
||||
*/
|
||||
"ld1 {v0.D}[0], [%1], %3\n"
|
||||
"ld1 {v0.D}[1], [%2], %3\n"
|
||||
"ld1 {v1.D}[0], [%1], %3\n"
|
||||
"ld1 {v1.D}[1], [%2], %3\n"
|
||||
"ld1 {v2.D}[0], [%1], %3\n"
|
||||
"ld1 {v2.D}[1], [%2], %3\n"
|
||||
"ld1 {v3.D}[0], [%1]\n"
|
||||
"ld1 {v3.D}[1], [%2]\n"
|
||||
"ld1 {v0.D}[0], [%[cpu]], %[cpu_stride]\n"
|
||||
"ld1 {v0.D}[1], [%[cpu2]],%[cpu_stride]\n"
|
||||
"ld1 {v1.D}[0], [%[cpu]], %[cpu_stride]\n"
|
||||
"ld1 {v1.D}[1], [%[cpu2]],%[cpu_stride]\n"
|
||||
"ld1 {v2.D}[0], [%[cpu]], %[cpu_stride]\n"
|
||||
"ld1 {v2.D}[1], [%[cpu2]],%[cpu_stride]\n"
|
||||
"ld1 {v3.D}[0], [%[cpu]]\n"
|
||||
"ld1 {v3.D}[1], [%[cpu2]]\n"
|
||||
/* Store to the GPU in one shot, no interleave. */
|
||||
"st1 {v0.2d, v1.2d, v2.2d, v3.2d}, [%0]\n"
|
||||
"st1 {v0.2d, v1.2d, v2.2d, v3.2d}, [%[gpu]]\n"
|
||||
:
|
||||
: "r"(gpu), "r"(cpu), "r"(cpu + 8), "r"(cpu_stride)
|
||||
: [gpu] "r"(gpu),
|
||||
[cpu] "r"(cpu),
|
||||
[cpu2] "r"(cpu + 8),
|
||||
[cpu_stride] "r"(cpu_stride)
|
||||
: "v0", "v1", "v2", "v3");
|
||||
}
|
||||
#else
|
||||
|
|
|
|||
Loading…
Add table
Reference in a new issue