diff --git a/src/broadcom/vulkan/meson.build b/src/broadcom/vulkan/meson.build
index 3f04a4162dc..8e2600115dd 100644
--- a/src/broadcom/vulkan/meson.build
+++ b/src/broadcom/vulkan/meson.build
@@ -34,7 +34,6 @@ v3dv_entrypoints = custom_target(
 
 libv3dv_files = files(
   'v3dv_bo.c',
-  'v3dv_cl.c',
   'v3dv_cmd_buffer.c',
   'v3dv_debug.c',
   'v3dv_debug.h',
@@ -57,6 +56,7 @@ libv3dv_files = files(
 ) + [v3d_xml_pack]
 
 files_per_version = files(
+  'v3dvx_cl.c',
   'v3dvx_cmd_buffer.c',
   'v3dvx_descriptor_set.c',
   'v3dvx_device.c',
diff --git a/src/broadcom/vulkan/v3dv_cl.h b/src/broadcom/vulkan/v3dv_cl.h
index 7e17ac395c4..96721530f77 100644
--- a/src/broadcom/vulkan/v3dv_cl.h
+++ b/src/broadcom/vulkan/v3dv_cl.h
@@ -182,6 +182,7 @@ void v3dv_cl_ensure_space_with_branch(struct v3dv_cl *cl, uint32_t space);
           cl_packet_pack(packet)(cl, (uint8_t *)cl_out, &name); \
           cl_advance_and_end(cl, cl_packet_length(packet));     \
           _loop_terminate = NULL;                               \
+          assert(v3dv_cl_offset(cl) <= (cl)->size);             \
        }))                                                      \
 
 #define cl_emit_with_prepacked(cl, packet, prepacked, name) \
@@ -215,9 +216,10 @@ cl_pack_emit_reloc(struct v3dv_cl *cl, const struct v3dv_cl_reloc *reloc)
       v3dv_job_add_bo(cl->job, reloc->bo);
 }
 
-#define cl_emit_prepacked_sized(cl, packet, size) do {  \
-   memcpy((cl)->next, packet, size);                    \
-   cl_advance(&(cl)->next, size);                       \
+#define cl_emit_prepacked_sized(cl, packet, psize) do { \
+   memcpy((cl)->next, packet, psize);                   \
+   cl_advance(&(cl)->next, psize);                      \
+   assert(v3dv_cl_offset(cl) <= (cl)->size);            \
 } while (0)
 
 #define cl_emit_prepacked(cl, packet) \
diff --git a/src/broadcom/vulkan/v3dv_cl.c b/src/broadcom/vulkan/v3dvx_cl.c
similarity index 55%
rename from src/broadcom/vulkan/v3dv_cl.c
rename to src/broadcom/vulkan/v3dvx_cl.c
index 7d414999e9b..8f5ea94a745 100644
--- a/src/broadcom/vulkan/v3dv_cl.c
+++ b/src/broadcom/vulkan/v3dvx_cl.c
@@ -22,15 +22,21 @@
  */
 
 #include "v3dv_private.h"
-
-/* We don't expect that the packets we use in this file change across hw
- * versions, so we just explicitly set the V3D_VERSION and include v3dx_pack
- * here
- */
-#define V3D_VERSION 42
 #include "broadcom/common/v3d_macros.h"
 #include "broadcom/cle/v3dx_pack.h"
 
+/* The Control List Executor (CLE) pre-fetches V3D_CLE_READAHEAD bytes from
+ * the Control List buffer. These last bytes must be left unused, otherwise
+ * the CLE would pre-fetch data past the end of the CL buffer and the kernel
+ * would report "MMU error from client CLE".
+ */
+#if V3D_VERSION == 42
+#define V3D_CLE_READAHEAD 256
+#endif
+#if V3D_VERSION >= 71
+#define V3D_CLE_READAHEAD 1024
+#endif
+
 void
 v3dv_cl_init(struct v3dv_job *job, struct v3dv_cl *cl)
 {
@@ -55,14 +61,40 @@ v3dv_cl_destroy(struct v3dv_cl *cl)
    v3dv_cl_init(NULL, cl);
 }
 
+enum v3dv_cl_chain_type {
+   V3D_CL_BO_CHAIN_NONE = 0,
+   V3D_CL_BO_CHAIN_WITH_BRANCH,
+   V3D_CL_BO_CHAIN_WITH_RETURN_FROM_SUB_LIST,
+};
+
 static bool
-cl_alloc_bo(struct v3dv_cl *cl, uint32_t space, bool use_branch)
+cl_alloc_bo(struct v3dv_cl *cl, uint32_t space,
+            enum v3dv_cl_chain_type chain_type)
 {
+   /* The last bytes of a CL buffer are unusable because of the CLE readahead,
+    * so we need to take this into account when allocating a new BO for the
+    * CL. We also reserve space for the BRANCH/RETURN_FROM_SUB_LIST packet so
+    * we can always emit these last packets to the BO when needed. We will
+    * need to increase cl->size by the packet length before calling cl_submit
+    * to use this reserved space.
+    */
+   uint32_t unusable_space = 0;
+   switch (chain_type) {
+   case V3D_CL_BO_CHAIN_WITH_BRANCH:
+      unusable_space = V3D_CLE_READAHEAD + cl_packet_length(BRANCH);
+      break;
+   case V3D_CL_BO_CHAIN_WITH_RETURN_FROM_SUB_LIST:
+      unusable_space = V3D_CLE_READAHEAD + cl_packet_length(RETURN_FROM_SUB_LIST);
+      break;
+   case V3D_CL_BO_CHAIN_NONE:
+      break;
+   }
+
    /* If we are growing, double the BO allocation size to reduce the number
    * of allocations with large command buffers. This has a very significant
    * impact on the number of draw calls per second reported by vkoverhead.
    */
-   space = align(space, 4096);
+   space = align(space + unusable_space, 4096);
    if (cl->bo)
       space = MAX2(cl->bo->size * 2, space);
 
@@ -83,10 +115,29 @@ cl_alloc_bo(struct v3dv_cl *cl, uint32_t space, bool use_branch)
    }
 
    /* Chain to the new BO from the old one if requested */
-   if (use_branch && cl->bo) {
-      cl->bo->cl_branch_offset = v3dv_cl_offset(cl);
-      cl_emit(cl, BRANCH, branch) {
-         branch.address = v3dv_cl_address(bo, 0);
+   if (cl->bo) {
+      switch (chain_type) {
+      case V3D_CL_BO_CHAIN_WITH_BRANCH:
+         cl->bo->cl_branch_offset = v3dv_cl_offset(cl);
+         cl->size += cl_packet_length(BRANCH);
+         assert(cl->size + V3D_CLE_READAHEAD <= cl->bo->size);
+         cl_emit(cl, BRANCH, branch) {
+            branch.address = v3dv_cl_address(bo, 0);
+         }
+         break;
+      case V3D_CL_BO_CHAIN_WITH_RETURN_FROM_SUB_LIST:
+         /* We do not want to emit branches from secondary command lists;
+          * instead, we will branch to them when we execute them in a primary
+          * using 'branch to sub list' commands, expecting each linked
+          * secondary to end with a 'return from sub list' command.
+          */
+         cl->size += cl_packet_length(RETURN_FROM_SUB_LIST);
+         assert(cl->size + V3D_CLE_READAHEAD <= cl->bo->size);
+         cl_emit(cl, RETURN_FROM_SUB_LIST, ret);
+         FALLTHROUGH;
+      case V3D_CL_BO_CHAIN_NONE:
+         v3dv_job_add_bo_unchecked(cl->job, bo);
+         break;
       }
    } else {
       v3dv_job_add_bo_unchecked(cl->job, bo);
@@ -94,7 +145,11 @@ cl_alloc_bo(struct v3dv_cl *cl, uint32_t space, bool use_branch)
 
    cl->bo = bo;
    cl->base = cl->bo->map;
-   cl->size = cl->bo->size;
+   /* Only expose the usable size of the BO so that we never write into
+    * its last bytes, which must stay unused because of the readahead
+    * of the CLE.
+    */
+   cl->size = cl->bo->size - unusable_space;
    cl->next = cl->base;
 
    return true;
@@ -110,22 +165,14 @@ v3dv_cl_ensure_space(struct v3dv_cl *cl, uint32_t space, uint32_t alignment)
       return offset;
    }
 
-   cl_alloc_bo(cl, space, false);
+   cl_alloc_bo(cl, space, V3D_CL_BO_CHAIN_NONE);
+
    return 0;
 }
 
 void
 v3dv_cl_ensure_space_with_branch(struct v3dv_cl *cl, uint32_t space)
 {
-   /* We do not want to emit branches from secondary command lists, instead,
-    * we will branch to them when we execute them in a primary using
-    * 'branch to sub list' commands, expecting each linked secondary to
-    * end with a 'return from sub list' command.
-    */
-   bool needs_return_from_sub_list = false;
-   if (cl->job->type == V3DV_JOB_TYPE_GPU_CL_INCOMPLETE && cl->size > 0)
-      needs_return_from_sub_list = true;
-
    /*
    * The CLE processor in the simulator tries to read V3D_CL_MAX_INSTR_SIZE
    * bytes form the CL for each new instruction. If the last instruction in our
@@ -139,8 +186,9 @@ v3dv_cl_ensure_space_with_branch(struct v3dv_cl *cl, uint32_t space)
    if (v3dv_cl_offset(cl) + space <= cl->size)
       return;
 
-   if (needs_return_from_sub_list)
-      cl_emit(cl, RETURN_FROM_SUB_LIST, ret);
+   enum v3dv_cl_chain_type chain_type = V3D_CL_BO_CHAIN_WITH_BRANCH;
+   if (cl->job->type == V3DV_JOB_TYPE_GPU_CL_INCOMPLETE)
+      chain_type = V3D_CL_BO_CHAIN_WITH_RETURN_FROM_SUB_LIST;
 
-   cl_alloc_bo(cl, space, !needs_return_from_sub_list);
+   cl_alloc_bo(cl, space, chain_type);
 }
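
A minimal standalone sketch of the sizing arithmetic the patch introduces, assuming
illustrative values for the requested space and for cl_packet_length(BRANCH) (the real
values come from V3D_CLE_READAHEAD and the v3dx packet definitions); it only mirrors
the reserve-then-grow idea, it is not driver code:

/* Sketch: reserve the CLE readahead plus the chaining packet when sizing the
 * BO, expose only the usable part as cl->size, and later grow cl->size by the
 * packet length so the BRANCH still fits clear of the readahead window.
 */
#include <assert.h>
#include <stdint.h>
#include <stdio.h>

#define CLE_READAHEAD     256u   /* example: V3D_CLE_READAHEAD on v3d 4.2 */
#define BRANCH_PACKET_LEN 5u     /* hypothetical cl_packet_length(BRANCH) */

static uint32_t
align_up(uint32_t v, uint32_t a)
{
   return (v + a - 1) & ~(a - 1);
}

int
main(void)
{
   uint32_t requested = 20000;   /* space the caller asked for */

   /* Allocation: pad the request with the unusable tail, then page-align,
    * like cl_alloc_bo() does with 'space + unusable_space'.
    */
   uint32_t unusable = CLE_READAHEAD + BRANCH_PACKET_LEN;
   uint32_t bo_size  = align_up(requested + unusable, 4096);

   /* The CL only advertises the usable part of the BO. */
   uint32_t cl_size = bo_size - unusable;
   assert(cl_size >= requested);

   /* When the CL runs out of space, cl->size is bumped by the packet length
    * so the BRANCH lands in the reserved tail while still staying out of the
    * CLE readahead window.
    */
   uint32_t cl_size_with_branch = cl_size + BRANCH_PACKET_LEN;
   assert(cl_size_with_branch + CLE_READAHEAD <= bo_size);

   printf("bo_size=%u usable=%u usable+branch=%u\n",
          bo_size, cl_size, cl_size_with_branch);
   return 0;
}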