freedreno/a6xx: Fix SP_HS_UNKNOWN_A831 value and document it

It appears that storage for varyings in a wave has an upper
limit of wavesize * max_a831 where max_a831 is 64.
Exceeding the limit seems to force the GPU to reduce the number of
primitives processed per wave; at least the calculations make sense
with such an interpretation.

With the blob driver, SP_HS_UNKNOWN_A831 never exceeds 64, and setting
it to 65 in freedreno leads to a hang.

On A630, tests (patch_size=3 + gl_Position + array of vec4)
have shown the following relation:

| Num of vec4 | A831 | PC_HS_INPUT_SIZE |
|-------------|------|------------------|
| 1           | 0x10 | 0xc              |
| 2           | 0x14 | 0xf              |
| 3           | 0x18 | 0x12             |
| 4           | 0x1c | 0x15             |
| 5           | 0x20 | 0x18             |
| 6           | 0x24 | 0x1b             |
| 7           | 0x28 | 0x1e             |
| 8           | 0x2c | 0x21             |
| 9           | 0x30 | 0x24             |
| 10          | 0x34 | 0x27             |
| 11          | 0x38 | 0x2a             |
| 12          | 0x3c | 0x2d             |
| 13          | 0x3f | 0x30             |
| 14          | 0x40 | 0x33             |
| 15          | 0x3d | 0x36             |
| 16          | 0x3d | 0x39             |
| 17          | 0x40 | 0x3c             |
| 18          | 0x3f | 0x3f             |
| 19          | 0x3e | 0x42             |
| 20          | 0x3d | 0x45             |
| 21          | 0x3f | 0x48             |
| 22          | 0x3d | 0x4b             |
| 23          | 0x40 | 0x4e             |
| 24          | 0x3d | 0x51             |
| 25          | 0x3f | 0x54             |
| 26          | 0x3c | 0x57             |
| 27          | 0x3e | 0x5a             |
| 28          | 0x40 | 0x5d             |
| 29          | 0x3c | 0x60             |
| 30          | 0x3e | 0x63             |
| 31          | 0x40 | 0x66             |
|-------------|------|------------------|

Brief tests with high patch sizes also confirm that the formula
matches the blob's behaviour.

A831 is not a limit on the storage available to a single thread, so
naming it SP_HS_WAVE_INPUT_SIZE makes more sense.

Fixes: 47e2c195 "freedreno/a6xx: Program state for tessellation stages"

Signed-off-by: Danylo Piliaiev <dpiliaiev@igalia.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/7917>
This commit is contained in:
Danylo Piliaiev 2020-12-14 18:42:59 +02:00
parent 22180137e9
commit e5499ca2bf
6 changed files with 39 additions and 17 deletions

View file

@ -7973,7 +7973,7 @@ clusters:
00000100 SP_VS_CONFIG: { ENABLED | NTEX = 0 | NSAMP = 0 | NIBO = 0 }
00000000 SP_VS_INSTRLEN: 0
00000000 SP_HS_CTRL_REG0: { HALFREGFOOTPRINT = 0 | FULLREGFOOTPRINT = 0 | BRANCHSTACK = 0 | THREADSIZE = TWO_QUADS }
00000000 SP_HS_UNKNOWN_A831: 0
00000000 SP_HS_WAVE_INPUT_SIZE: 0
00000000 0xa832: 00000000
00000000 SP_HS_OBJ_FIRST_EXEC_OFFSET: 0
780a8ca5 SP_HS_OBJ_START_LO: 0x780a8ca5
@ -8119,7 +8119,7 @@ clusters:
00000100 SP_VS_CONFIG: { ENABLED | NTEX = 0 | NSAMP = 0 | NIBO = 0 }
00000000 SP_VS_INSTRLEN: 0
00000000 SP_HS_CTRL_REG0: { HALFREGFOOTPRINT = 0 | FULLREGFOOTPRINT = 0 | BRANCHSTACK = 0 | THREADSIZE = TWO_QUADS }
00000000 SP_HS_UNKNOWN_A831: 0
00000000 SP_HS_WAVE_INPUT_SIZE: 0
00000000 0xa832: 00000000
00000000 SP_HS_OBJ_FIRST_EXEC_OFFSET: 0
780a8ca5 SP_HS_OBJ_START_LO: 0x780a8ca5

View file

@ -948,8 +948,8 @@ t4 write SP_CS_CONFIG (a9bb)
t4 write HLSQ_CS_CNTL (b987)
HLSQ_CS_CNTL: { CONSTLEN = 0 }
0000000001054250: 0000: 40b98701 00000000
t4 write SP_HS_UNKNOWN_A831 (a831)
SP_HS_UNKNOWN_A831: 0
t4 write SP_HS_WAVE_INPUT_SIZE (a831)
SP_HS_WAVE_INPUT_SIZE: 0
0000000001054258: 0000: 48a83101 00000000
t4 write VFD_CONTROL_1 (a001)
VFD_CONTROL_1: { REGID4VTX = r2.y | REGID4INST = r63.x | REGID4PRIMID = r63.x | REGID4VIEWID = r63.x }
@ -1511,7 +1511,7 @@ t7 opcode: CP_DRAW_INDIRECT_MULTI (2a) (12 dwords)
- shaderdb: 0 (ss), 0 (sy)
!+ 00000100 SP_VS_CONFIG: { ENABLED | NTEX = 0 | NSAMP = 0 | NIBO = 0 }
!+ 00000001 SP_VS_INSTRLEN: 1
+ 00000000 SP_HS_UNKNOWN_A831: 0
+ 00000000 SP_HS_WAVE_INPUT_SIZE: 0
+ 00000000 SP_HS_CONFIG: { NTEX = 0 | NSAMP = 0 | NIBO = 0 }
+ 00000000 SP_DS_CONFIG: { NTEX = 0 | NSAMP = 0 | NIBO = 0 }
+ 00000000 SP_GS_CONFIG: { NTEX = 0 | NSAMP = 0 | NIBO = 0 }

View file

@ -681,8 +681,8 @@ t4 write SP_VS_OUT[0].REG (a803)
t4 write SP_VS_VPC_DST[0].REG (a813)
SP_VS_VPC_DST[0].REG: { OUTLOC0 = 0 | OUTLOC1 = 0 | OUTLOC2 = 0 | OUTLOC3 = 0 }
0000000001121070: 0000: 48a81301 00000000
t4 write SP_HS_UNKNOWN_A831 (a831)
SP_HS_UNKNOWN_A831: 0
t4 write SP_HS_WAVE_INPUT_SIZE (a831)
SP_HS_WAVE_INPUT_SIZE: 0
0000000001121078: 0000: 48a83101 00000000
t4 write SP_VS_PRIMITIVE_CNTL (a802)
SP_VS_PRIMITIVE_CNTL: { OUT = 1 }
@ -1118,7 +1118,7 @@ t7 opcode: CP_DRAW_INDX_OFFSET (38) (4 dwords)
- shaderdb: 0 (ss), 0 (sy)
!+ 00000100 SP_VS_CONFIG: { ENABLED | NTEX = 0 | NSAMP = 0 | NIBO = 0 }
!+ 00000001 SP_VS_INSTRLEN: 1
+ 00000000 SP_HS_UNKNOWN_A831: 0
+ 00000000 SP_HS_WAVE_INPUT_SIZE: 0
+ 00000000 SP_HS_OBJ_FIRST_EXEC_OFFSET: 0
+ 00000000 SP_HS_CONFIG: { NTEX = 0 | NSAMP = 0 | NIBO = 0 }
+ 00000000 SP_DS_CONFIG: { NTEX = 0 | NSAMP = 0 | NIBO = 0 }
@ -1996,8 +1996,8 @@ t4 write SP_VS_OUT[0].REG (a803)
t4 write SP_VS_VPC_DST[0].REG (a813)
SP_VS_VPC_DST[0].REG: { OUTLOC0 = 0 | OUTLOC1 = 0 | OUTLOC2 = 0 | OUTLOC3 = 0 }
0000000001120070: 0000: 48a81301 00000000
t4 write SP_HS_UNKNOWN_A831 (a831)
SP_HS_UNKNOWN_A831: 0
t4 write SP_HS_WAVE_INPUT_SIZE (a831)
SP_HS_WAVE_INPUT_SIZE: 0
0000000001120078: 0000: 48a83101 00000000
t4 write SP_VS_PRIMITIVE_CNTL (a802)
SP_VS_PRIMITIVE_CNTL: { OUT = 1 }
@ -5343,7 +5343,7 @@ t7 opcode: CP_DRAW_INDX_OFFSET (38) (4 dwords)
- shaderdb: 0 (ss), 0 (sy)
+ 00000100 SP_VS_CONFIG: { ENABLED | NTEX = 0 | NSAMP = 0 | NIBO = 0 }
+ 00000001 SP_VS_INSTRLEN: 1
+ 00000000 SP_HS_UNKNOWN_A831: 0
+ 00000000 SP_HS_WAVE_INPUT_SIZE: 0
+ 00000000 SP_HS_OBJ_FIRST_EXEC_OFFSET: 0
+ 00000000 SP_HS_CONFIG: { NTEX = 0 | NSAMP = 0 | NIBO = 0 }
+ 00000000 SP_DS_CONFIG: { NTEX = 0 | NSAMP = 0 | NIBO = 0 }

View file

@ -3220,7 +3220,13 @@ to upconvert to 32b float internally?
<reg32 offset="0xa825" name="SP_VS_PVT_MEM_HW_STACK_OFFSET" low="0" high="18" shr="11"/>
<reg32 offset="0xa830" name="SP_HS_CTRL_REG0" type="a6xx_sp_xs_ctrl_reg0"/>
<reg32 offset="0xa831" name="SP_HS_UNKNOWN_A831"/>
<!--
Total size of local storage in dwords divided by the wave size.
The maximum value is 64. With the wave size being always 64 for HS,
the maximum size of local storage should be:
64 (wavesize) * 64 (SP_HS_WAVE_INPUT_SIZE) * 4 = 16k bytes
-->
<reg32 offset="0xa831" name="SP_HS_WAVE_INPUT_SIZE" low="0" high="7" type="uint"/>
<reg32 offset="0xa833" name="SP_HS_OBJ_FIRST_EXEC_OFFSET" type="uint"/>
<reg32 offset="0xa834" name="SP_HS_OBJ_START_LO"/>

View file

@ -1058,7 +1058,7 @@ tu6_emit_vpc(struct tu_cs *cs,
unknown_a831 = DIV_ROUND_UP(total_size, wavesize);
}
tu_cs_emit_pkt4(cs, REG_A6XX_SP_HS_UNKNOWN_A831, 1);
tu_cs_emit_pkt4(cs, REG_A6XX_SP_HS_WAVE_INPUT_SIZE, 1);
tu_cs_emit(cs, unknown_a831);
/* In SPIR-V generated from GLSL, the tessellation primitive params are
@ -1563,7 +1563,7 @@ tu6_emit_program(struct tu_cs *cs,
tu_cs_emit(cs, builder->multiview_mask);
}
tu_cs_emit_pkt4(cs, REG_A6XX_SP_HS_UNKNOWN_A831, 1);
tu_cs_emit_pkt4(cs, REG_A6XX_SP_HS_WAVE_INPUT_SIZE, 1);
tu_cs_emit(cs, 0);
tu6_emit_vpc(cs, vs, hs, ds, gs, fs, cps_per_patch,

View file

@ -654,8 +654,24 @@ setup_stateobj(struct fd_ringbuffer *ring, struct fd_context *ctx,
OUT_PKT4(ring, REG_A6XX_PC_HS_INPUT_SIZE, 1);
OUT_RING(ring, hs_info->tess.tcs_vertices_out * vs->output_size / 4);
OUT_PKT4(ring, REG_A6XX_SP_HS_UNKNOWN_A831, 1);
OUT_RING(ring, vs->output_size);
const uint32_t wavesize = 64;
const uint32_t max_wave_input_size = 64;
const uint32_t patch_control_points = hs_info->tess.tcs_vertices_out;
/* note: if HS is really just the VS extended, then this
* should be by MAX2(patch_control_points, hs_info->tess.tcs_vertices_out)
* however that doesn't match the blob, and fails some dEQP tests.
*/
uint32_t prims_per_wave = wavesize / hs_info->tess.tcs_vertices_out;
uint32_t max_prims_per_wave =
max_wave_input_size * wavesize / (vs->output_size * patch_control_points);
prims_per_wave = MIN2(prims_per_wave, max_prims_per_wave);
uint32_t total_size = vs->output_size * patch_control_points * prims_per_wave;
uint32_t wave_input_size = DIV_ROUND_UP(total_size, wavesize);
OUT_PKT4(ring, REG_A6XX_SP_HS_WAVE_INPUT_SIZE, 1);
OUT_RING(ring, wave_input_size);
shader_info *ds_info = &ds->shader->nir->info;
OUT_PKT4(ring, REG_A6XX_PC_TESS_CNTL, 1);
@ -706,7 +722,7 @@ setup_stateobj(struct fd_ringbuffer *ring, struct fd_context *ctx,
A6XX_PC_DS_OUT_CNTL_CLIP_MASK(clip_cull_mask));
} else {
OUT_PKT4(ring, REG_A6XX_SP_HS_UNKNOWN_A831, 1);
OUT_PKT4(ring, REG_A6XX_SP_HS_WAVE_INPUT_SIZE, 1);
OUT_RING(ring, 0);
}