llvmpipe/cs: rework coroutine context handling (v2)

Get comfy.

llvmpipe coroutines have a stack frame. This is created by hooking
in malloc and coro.alloc and coro.size intrinsics.

LLVM has a CoroElide pass that is meant to allow that stack frame
to be done as an alloca in the caller instead of using the malloc path.

The CoroElide pass relies on the coroutine being inlined (fixed that).

The CoroElide pass relies on there being a direct connection between
coro.destroy(i8 *arg) and arg = coro.begin(id). However due to the
way the compute shaders are launched, there is no way to ensure that
link. Fixing the CoroElide pass seems quite difficult, I considered
having a force CoroElide always flag to make it dtrt, however I'm not
sure how ugly that would end up.

My first attempt tried to preallocate the stacks at a fixed size,
this turned out to be naive as the stack frame size was not sized
like I expected. Instead the first coro to run allocs enough for
everyone, to avoid a massive number of small allocations.

This removes coro malloc from a lot of profiles and shaves another 30s
or so from OpenCL ./conversions/test_conversions uchar_uin
(from 4.40m to just under 4m on my ryzen 7 1800x)

Reviewed-by: Roland Scheidegger <sroland@vmware.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/12432>
This commit is contained in:
Dave Airlie 2021-08-18 04:38:58 +10:00
parent 8d3e97344c
commit 259e26e5e3
3 changed files with 52 additions and 8 deletions

View file

@ -196,6 +196,29 @@ LLVMValueRef lp_build_coro_begin_alloc_mem(struct gallivm_state *gallivm, LLVMVa
return coro_hdl;
}
/**
 * Emit IR that allocates, on first use, one buffer sized for a whole group
 * of coroutines, and return this coroutine's offset into that buffer.
 *
 * \param gallivm        state providing the context, builder and malloc hook
 * \param coro_hdl_ptr   pointer to the slot holding the buffer pointer; the
 *                       slot is loaded and, when it is NULL, filled in here
 * \param coro_idx       index of the current coroutine
 * \param coro_num_hdls  number of coroutines the buffer must hold
 *
 * \return the value lp_build_coro_size() * coro_idx. The buffer pointer
 *         itself is not returned; callers reload it from coro_hdl_ptr.
 *
 * The builder calls below are kept in a fixed order because each one appends
 * an instruction at the builder's current position.
 */
LLVMValueRef lp_build_coro_alloc_mem_array(struct gallivm_state *gallivm,
                                           LLVMValueRef coro_hdl_ptr, LLVMValueRef coro_idx,
                                           LLVMValueRef coro_num_hdls)
{
   LLVMTypeRef i8_ptr_type = LLVMPointerType(LLVMInt8TypeInContext(gallivm->context), 0);
   struct lp_build_if_state need_alloc;

   /* Read the slot; a NULL value means no buffer has been allocated yet. */
   LLVMValueRef cur_mem = LLVMBuildLoad(gallivm->builder, coro_hdl_ptr, "");
   LLVMValueRef null_mem = LLVMConstNull(i8_ptr_type);
   LLVMValueRef is_unset = LLVMBuildICmp(gallivm->builder, LLVMIntEQ, cur_mem, null_mem, "");

   /* Emitted ahead of the branch: needed both inside it and for the result. */
   LLVMValueRef per_coro_size = lp_build_coro_size(gallivm);

   lp_build_if(&need_alloc, gallivm, is_unset);
   {
      /* Total size = number of coroutines * size of one coroutine. */
      LLVMValueRef malloc_args[1];
      malloc_args[0] = LLVMBuildMul(gallivm->builder, coro_num_hdls, per_coro_size, "");

      assert(gallivm->coro_malloc_hook);
      LLVMValueRef new_mem = LLVMBuildCall(gallivm->builder, gallivm->coro_malloc_hook,
                                           malloc_args, 1, "");

      /* Publish the buffer so later loads of the slot see it as allocated. */
      LLVMBuildStore(gallivm->builder, new_mem, coro_hdl_ptr);
   }
   lp_build_endif(&need_alloc);

   return LLVMBuildMul(gallivm->builder, per_coro_size, coro_idx, "");
}
void lp_build_coro_free_mem(struct gallivm_state *gallivm, LLVMValueRef coro_id, LLVMValueRef coro_hdl)
{
LLVMValueRef alloc_mem = lp_build_coro_free(gallivm, coro_id, coro_hdl);

View file

@ -55,6 +55,10 @@ LLVMValueRef lp_build_coro_suspend(struct gallivm_state *gallivm, bool last);
LLVMValueRef lp_build_coro_alloc(struct gallivm_state *gallivm, LLVMValueRef id);
LLVMValueRef lp_build_coro_begin_alloc_mem(struct gallivm_state *gallivm, LLVMValueRef coro_id);
/*
 * Emits IR that loads the pointer stored at coro_hdl_ptr and, if it is NULL,
 * calls the coro malloc hook for (coro_num_hdls * lp_build_coro_size()) and
 * stores the result back through coro_hdl_ptr.
 *
 * Returns lp_build_coro_size() * coro_idx; generate_compute uses this as a
 * GEP index into the i8 buffer loaded from coro_hdl_ptr.
 */
LLVMValueRef lp_build_coro_alloc_mem_array(struct gallivm_state *gallivm,
LLVMValueRef coro_hdl_ptr, LLVMValueRef coro_idx,
LLVMValueRef coro_num_hdls);
void lp_build_coro_free_mem(struct gallivm_state *gallivm, LLVMValueRef coro_id, LLVMValueRef coro_hdl);
struct lp_build_coro_suspend_info {

View file

@ -70,7 +70,7 @@ generate_compute(struct llvmpipe_context *lp,
struct gallivm_state *gallivm = variant->gallivm;
const struct lp_compute_shader_variant_key *key = &variant->key;
char func_name[64], func_name_coro[64];
LLVMTypeRef arg_types[18];
LLVMTypeRef arg_types[19];
LLVMTypeRef func_type, coro_func_type;
LLVMTypeRef int32_type = LLVMInt32TypeInContext(gallivm->context);
LLVMValueRef context_ptr;
@ -121,8 +121,9 @@ generate_compute(struct llvmpipe_context *lp,
arg_types[15] = int32_type; /* coro block_y_size */
arg_types[16] = int32_type; /* coro block_z_size */
arg_types[17] = int32_type; /* coro idx */
arg_types[18] = LLVMPointerType(LLVMPointerType(LLVMInt8TypeInContext(gallivm->context), 0), 0);
func_type = LLVMFunctionType(LLVMVoidTypeInContext(gallivm->context),
arg_types, ARRAY_SIZE(arg_types) - 6, 0);
arg_types, ARRAY_SIZE(arg_types) - 7, 0);
coro_func_type = LLVMFunctionType(LLVMPointerType(LLVMInt8TypeInContext(gallivm->context), 0),
arg_types, ARRAY_SIZE(arg_types), 0);
@ -138,7 +139,8 @@ generate_compute(struct llvmpipe_context *lp,
for(i = 0; i < ARRAY_SIZE(arg_types); ++i) {
if(LLVMGetTypeKind(arg_types[i]) == LLVMPointerTypeKind) {
lp_add_function_attr(coro, i + 1, LP_FUNC_ATTR_NOALIAS);
lp_add_function_attr(function, i + 1, LP_FUNC_ATTR_NOALIAS);
if (i < ARRAY_SIZE(arg_types) - 7)
lp_add_function_attr(function, i + 1, LP_FUNC_ATTR_NOALIAS);
}
}
@ -191,7 +193,11 @@ generate_compute(struct llvmpipe_context *lp,
LLVMValueRef coro_num_hdls = LLVMBuildMul(gallivm->builder, num_x_loop, y_size_arg, "");
coro_num_hdls = LLVMBuildMul(gallivm->builder, coro_num_hdls, z_size_arg, "");
/* build a ptr in memory to store all the frames in later. */
LLVMTypeRef hdl_ptr_type = LLVMPointerType(LLVMInt8TypeInContext(gallivm->context), 0);
LLVMValueRef coro_mem = LLVMBuildAlloca(gallivm->builder, hdl_ptr_type, "coro_mem");
LLVMBuildStore(builder, LLVMConstNull(hdl_ptr_type), coro_mem);
LLVMValueRef coro_hdls = LLVMBuildArrayAlloca(gallivm->builder, hdl_ptr_type, coro_num_hdls, "coro_hdls");
unsigned end_coroutine = INT_MAX;
@ -211,7 +217,7 @@ generate_compute(struct llvmpipe_context *lp,
lp_build_loop_begin(&loop_state[0], gallivm,
lp_build_const_int32(gallivm, 0)); /* x loop */
{
LLVMValueRef args[18];
LLVMValueRef args[19];
args[0] = context_ptr;
args[1] = loop_state[0].counter;
args[2] = loop_state[1].counter;
@ -240,6 +246,8 @@ generate_compute(struct llvmpipe_context *lp,
loop_state[0].counter, "");
args[17] = coro_hdl_idx;
args[18] = coro_mem;
LLVMValueRef coro_entry = LLVMBuildGEP(gallivm->builder, coro_hdls, &coro_hdl_idx, 1, "");
LLVMValueRef coro_hdl = LLVMBuildLoad(gallivm->builder, coro_entry, "coro_hdl");
@ -249,7 +257,7 @@ generate_compute(struct llvmpipe_context *lp,
lp_build_const_int32(gallivm, 0), "");
/* first time here - call the coroutine function entry point */
lp_build_if(&ifstate, gallivm, cmp);
LLVMValueRef coro_ret = LLVMBuildCall(gallivm->builder, coro, args, 18, "");
LLVMValueRef coro_ret = LLVMBuildCall(gallivm->builder, coro, args, 19, "");
LLVMBuildStore(gallivm->builder, coro_ret, coro_entry);
lp_build_else(&ifstate);
/* subsequent calls for this invocation - check if done. */
@ -278,6 +286,10 @@ generate_compute(struct llvmpipe_context *lp,
lp_build_loop_end_cond(&loop_state[3],
lp_build_const_int32(gallivm, end_coroutine),
NULL, LLVMIntEQ);
LLVMValueRef coro_mem_ptr = LLVMBuildLoad(builder, coro_mem, "");
LLVMBuildCall(gallivm->builder, gallivm->coro_free_hook, &coro_mem_ptr, 1, "");
LLVMBuildRetVoid(builder);
/* This is stage (b) - generate the compute shader code inside the coroutine. */
@ -300,6 +312,7 @@ generate_compute(struct llvmpipe_context *lp,
block_y_size_arg = LLVMGetParam(coro, 15);
block_z_size_arg = LLVMGetParam(coro, 16);
LLVMValueRef coro_idx = LLVMGetParam(coro, 17);
coro_mem = LLVMGetParam(coro, 18);
block = LLVMAppendBasicBlockInContext(gallivm->context, coro, "entry");
LLVMPositionBuilderAtEnd(builder, block);
{
@ -319,10 +332,16 @@ generate_compute(struct llvmpipe_context *lp,
shared_ptr = lp_jit_cs_thread_data_shared(gallivm, thread_data_ptr);
LLVMValueRef coro_num_hdls = LLVMBuildMul(gallivm->builder, num_x_loop, block_y_size_arg, "");
coro_num_hdls = LLVMBuildMul(gallivm->builder, coro_num_hdls, block_z_size_arg, "");
/* these are coroutine entrypoint necessities */
LLVMValueRef coro_id = lp_build_coro_id(gallivm);
LLVMValueRef coro_hdl = lp_build_coro_begin_alloc_mem(gallivm, coro_id);
LLVMValueRef coro_entry = lp_build_coro_alloc_mem_array(gallivm, coro_mem, coro_idx, coro_num_hdls);
LLVMValueRef alloced_ptr = LLVMBuildLoad(gallivm->builder, coro_mem, "");
alloced_ptr = LLVMBuildGEP(gallivm->builder, alloced_ptr, &coro_entry, 1, "");
LLVMValueRef coro_hdl = lp_build_coro_begin(gallivm, coro_id, alloced_ptr);
LLVMValueRef has_partials = LLVMBuildICmp(gallivm->builder, LLVMIntNE, partials, lp_build_const_int32(gallivm, 0), "");
LLVMValueRef tid_vals[3];
LLVMValueRef tids_x[LP_MAX_VECTOR_LENGTH], tids_y[LP_MAX_VECTOR_LENGTH], tids_z[LP_MAX_VECTOR_LENGTH];
@ -417,8 +436,6 @@ generate_compute(struct llvmpipe_context *lp,
lp_build_coro_suspend_switch(gallivm, &coro_info, NULL, true);
LLVMPositionBuilderAtEnd(builder, clean_block);
lp_build_coro_free_mem(gallivm, coro_id, coro_hdl);
LLVMBuildBr(builder, sus_block);
LLVMPositionBuilderAtEnd(builder, sus_block);