i965/fs: Add initial support for 16-wide dispatch on gen6.

At this point it doesn't do uniforms, which would have to be laid out the
same between the 8-wide and 16-wide programs.  Other than that, it
supports everything but flow control, which was the thing that forced us
to choose 8-wide for general GLSL support.

Reviewed-by: Kenneth Graunke <kenneth@whitecape.org>
This commit is contained in:
Eric Anholt 2011-03-11 19:19:01 -08:00
parent 76b7a0c1af
commit 662f1b48bd
6 changed files with 233 additions and 104 deletions

View file

@ -204,13 +204,16 @@ struct brw_wm_prog_data {
GLuint urb_read_length;
GLuint first_curbe_grf;
GLuint first_curbe_grf_16;
GLuint total_grf;
GLuint total_grf_16;
GLuint total_scratch;
GLuint nr_params; /**< number of float params/constants */
GLuint nr_pull_params;
GLboolean error;
int dispatch_width;
uint32_t prog_offset_16;
/* Pointer to tracked values (only valid once
* _mesa_load_state_parameters has been called at runtime).

View file

@ -194,6 +194,32 @@ fs_visitor::fail(const char *format, ...)
}
}
/* Enter a region where emitted instructions must run 8-wide: while the
 * stack is non-empty, emit() marks each new instruction
 * force_uncompressed, and generate_code() then selects
 * BRW_COMPRESSION_NONE for it.
 */
void
fs_visitor::push_force_uncompressed()
{
   ++force_uncompressed_stack;
}
/* Leave a region opened by push_force_uncompressed().  Pops must pair
 * with pushes; the counter may never go negative.
 */
void
fs_visitor::pop_force_uncompressed()
{
   const int remaining = --force_uncompressed_stack;
   assert(remaining >= 0);
}
/* Enter a region where emitted instructions operate on the second half
 * of a 16-wide execution: while the stack is non-empty, emit() marks
 * each new instruction force_sechalf, and generate_code() then selects
 * BRW_COMPRESSION_2NDHALF for it.
 */
void
fs_visitor::push_force_sechalf()
{
   ++force_sechalf_stack;
}
/* Leave a region opened by push_force_sechalf().  Pops must pair with
 * pushes; the counter may never go negative.
 */
void
fs_visitor::pop_force_sechalf()
{
   const int remaining = --force_sechalf_stack;
   assert(remaining >= 0);
}
/**
* Returns how many MRFs an FS opcode will write over.
*
@ -1738,6 +1764,10 @@ fs_visitor::visit(ir_if *ir)
{
fs_inst *inst;
if (c->dispatch_width == 16) {
fail("Can't support (non-uniform) control flow on 16-wide\n");
}
/* Don't point the annotation at the if statement, because then it plus
* the then and else blocks get printed.
*/
@ -1778,6 +1808,10 @@ fs_visitor::visit(ir_loop *ir)
{
fs_reg counter = reg_undef;
if (c->dispatch_width == 16) {
fail("Can't support (non-uniform) control flow on 16-wide\n");
}
if (ir->counter) {
this->base_ir = ir->counter;
ir->counter->accept(this);
@ -1881,6 +1915,11 @@ fs_visitor::emit(fs_inst inst)
fs_inst *list_inst = new(mem_ctx) fs_inst;
*list_inst = inst;
if (force_uncompressed_stack > 0)
list_inst->force_uncompressed = true;
else if (force_sechalf_stack > 0)
list_inst->force_sechalf = true;
list_inst->annotation = this->current_annotation;
list_inst->ir = this->base_ir;
@ -2006,6 +2045,7 @@ fs_visitor::emit_fb_writes()
this->current_annotation = "FB write header";
GLboolean header_present = GL_TRUE;
int nr = 0;
int reg_width = c->dispatch_width / 8;
if (intel->gen >= 6 &&
!this->kill_emitted &&
@ -2019,31 +2059,44 @@ fs_visitor::emit_fb_writes()
}
if (c->aa_dest_stencil_reg) {
push_force_uncompressed();
emit(BRW_OPCODE_MOV, fs_reg(MRF, nr++),
fs_reg(brw_vec8_grf(c->aa_dest_stencil_reg, 0)));
pop_force_uncompressed();
}
/* Reserve space for color. It'll be filled in per MRT below. */
int color_mrf = nr;
nr += 4;
nr += 4 * reg_width;
if (c->source_depth_to_render_target) {
if (intel->gen == 6 && c->dispatch_width == 16) {
/* For outputting oDepth on gen6, SIMD8 writes have to be
* used. This would require 8-wide moves of each half to
* message regs, kind of like pre-gen5 SIMD16 FB writes.
* Just bail on doing so for now.
*/
fail("Missing support for simd16 depth writes on gen6\n");
}
if (c->computes_depth) {
/* Hand over gl_FragDepth. */
assert(this->frag_depth);
fs_reg depth = *(variable_storage(this->frag_depth));
emit(BRW_OPCODE_MOV, fs_reg(MRF, nr++), depth);
emit(BRW_OPCODE_MOV, fs_reg(MRF, nr), depth);
} else {
/* Pass through the payload depth. */
emit(BRW_OPCODE_MOV, fs_reg(MRF, nr++),
emit(BRW_OPCODE_MOV, fs_reg(MRF, nr),
fs_reg(brw_vec8_grf(c->source_depth_reg, 0)));
}
nr += reg_width;
}
if (c->dest_depth_reg) {
emit(BRW_OPCODE_MOV, fs_reg(MRF, nr++),
emit(BRW_OPCODE_MOV, fs_reg(MRF, nr),
fs_reg(brw_vec8_grf(c->dest_depth_reg, 0)));
nr += reg_width;
}
fs_reg color = reg_undef;
@ -2060,7 +2113,7 @@ fs_visitor::emit_fb_writes()
target);
if (this->frag_color || this->frag_data) {
for (int i = 0; i < 4; i++) {
emit(BRW_OPCODE_MOV, fs_reg(MRF, color_mrf + i), color);
emit(BRW_OPCODE_MOV, fs_reg(MRF, color_mrf + i * reg_width), color);
color.reg_offset++;
}
}
@ -2144,7 +2197,7 @@ fs_visitor::generate_fb_write(fs_inst *inst)
brw_pop_insn_state(p);
brw_fb_WRITE(p,
8, /* dispatch_width */
c->dispatch_width,
inst->base_mrf,
implied_header,
inst->target,
@ -2608,8 +2661,12 @@ fs_visitor::setup_paramvalues_refs()
void
fs_visitor::assign_curb_setup()
{
c->prog_data.first_curbe_grf = c->nr_payload_regs;
c->prog_data.curb_read_length = ALIGN(c->prog_data.nr_params, 8) / 8;
if (c->dispatch_width == 8) {
c->prog_data.first_curbe_grf = c->nr_payload_regs;
} else {
c->prog_data.first_curbe_grf_16 = c->nr_payload_regs;
}
/* Map the offsets in the UNIFORM file to fixed HW regs. */
foreach_iter(exec_list_iterator, iter, this->instructions) {
@ -2618,7 +2675,7 @@ fs_visitor::assign_curb_setup()
for (unsigned int i = 0; i < 3; i++) {
if (inst->src[i].file == UNIFORM) {
int constant_nr = inst->src[i].hw_reg + inst->src[i].reg_offset;
struct brw_reg brw_reg = brw_vec1_grf(c->prog_data.first_curbe_grf +
struct brw_reg brw_reg = brw_vec1_grf(c->nr_payload_regs +
constant_nr / 8,
constant_nr % 8);
@ -2670,7 +2727,7 @@ fs_visitor::calculate_urb_setup()
void
fs_visitor::assign_urb_setup()
{
int urb_start = c->prog_data.first_curbe_grf + c->prog_data.curb_read_length;
int urb_start = c->nr_payload_regs + c->prog_data.curb_read_length;
/* Offset all the urb_setup[] index by the actual position of the
* setup regs, now that the location of the constants has been chosen.
@ -3516,7 +3573,7 @@ static struct brw_reg brw_reg_from_fs_reg(fs_reg *reg)
void
fs_visitor::generate_code()
{
int last_native_inst = 0;
int last_native_inst = p->nr_insn;
const char *last_annotation_string = NULL;
ir_instruction *last_annotation_ir = NULL;
@ -3532,8 +3589,8 @@ fs_visitor::generate_code()
if (unlikely(INTEL_DEBUG & DEBUG_WM)) {
printf("Native code for fragment shader %d:\n",
ctx->Shader.CurrentFragmentProgram->Name);
printf("Native code for fragment shader %d (%d-wide dispatch):\n",
ctx->Shader.CurrentFragmentProgram->Name, c->dispatch_width);
}
foreach_iter(exec_list_iterator, iter, this->instructions) {
@ -3566,6 +3623,14 @@ fs_visitor::generate_code()
brw_set_predicate_inverse(p, inst->predicate_inverse);
brw_set_saturate(p, inst->saturate);
if (inst->force_uncompressed || c->dispatch_width == 8) {
brw_set_compression_control(p, BRW_COMPRESSION_NONE);
} else if (inst->force_sechalf) {
brw_set_compression_control(p, BRW_COMPRESSION_2NDHALF);
} else {
brw_set_compression_control(p, BRW_COMPRESSION_COMPRESSED);
}
switch (inst->opcode) {
case BRW_OPCODE_MOV:
brw_MOV(p, dst, src[0]);
@ -3804,7 +3869,111 @@ fs_visitor::generate_code()
}
}
/* NOTE(review): the two return-type lines below are a diff-view
 * artifact — both the removed (GLboolean) and added (bool) signature
 * lines survive in this paste.  Only one belongs in the real file;
 * confirm against the upstream commit before using this text as code.
 */
GLboolean
bool
/* Top-level compile driver for one dispatch width (c->dispatch_width is
 * 8 or 16): builds FS IR from the GLSL IR, optimizes, allocates
 * registers, and generates native code.  Returns false on failure.
 */
fs_visitor::run()
{
/* Byte offset of the 16-wide program within the shared instruction
 * store; stays 0 for the 8-wide compile.
 */
uint32_t prog_offset_16 = 0;
brw_wm_payload_setup(brw, c);
if (c->dispatch_width == 16) {
if (c->prog_data.curb_read_length) {
/* Haven't hooked in support for uniforms through the 16-wide
* version yet.
*/
return GL_FALSE;
}
/* align to 64 byte boundary. */
while ((c->func.nr_insn * sizeof(struct brw_instruction)) % 64) {
brw_NOP(p);
}
/* Save off the start of this 16-wide program in case we succeed. */
prog_offset_16 = c->func.nr_insn * sizeof(struct brw_instruction);
brw_set_compression_control(p, BRW_COMPRESSION_COMPRESSED);
}
/* "if (0)" debug hook: flip to emit a trivial shader instead of
 * compiling the real program.
 */
if (0) {
emit_dummy_fs();
} else {
calculate_urb_setup();
if (intel->gen < 6)
emit_interpolation_setup_gen4();
else
emit_interpolation_setup_gen6();
/* Generate FS IR for main(). (the visitor only descends into
* functions called "main").
*/
foreach_iter(exec_list_iterator, iter, *shader->ir) {
ir_instruction *ir = (ir_instruction *)iter.get();
base_ir = ir;
ir->accept(this);
}
emit_fb_writes();
split_virtual_grfs();
setup_paramvalues_refs();
setup_pull_constants();
/* Run the peephole optimizations to a fixed point: keep iterating
 * while any pass reports progress.
 */
bool progress;
do {
progress = false;
progress = remove_duplicate_mrf_writes() || progress;
progress = propagate_constants() || progress;
progress = register_coalesce() || progress;
progress = compute_to_mrf() || progress;
progress = dead_code_eliminate() || progress;
} while (progress);
schedule_instructions();
assign_curb_setup();
assign_urb_setup();
if (0) {
/* Debug of register spilling: Go spill everything. */
int virtual_grf_count = virtual_grf_next;
for (int i = 1; i < virtual_grf_count; i++) {
spill_reg(i);
}
}
if (0)
assign_regs_trivial();
else {
/* Retry allocation until it succeeds (assign_regs presumably
 * spills on failure — confirm) or the compile is marked failed.
 */
while (!assign_regs()) {
if (failed)
break;
}
}
}
/* Every push_force_* must have been matched by a pop by now. */
assert(force_uncompressed_stack == 0);
assert(force_sechalf_stack == 0);
if (!failed)
generate_code();
if (failed)
return GL_FALSE;
/* Record per-dispatch-width results: register count for 8-wide, plus
 * the program start offset for the 16-wide variant.
 */
if (c->dispatch_width == 8) {
c->prog_data.total_grf = grf_used;
} else {
c->prog_data.total_grf_16 = grf_used;
c->prog_data.prog_offset_16 = prog_offset_16;
}
return !failed;
}
bool
brw_wm_fs_emit(struct brw_context *brw, struct brw_wm_compile *c)
{
struct intel_context *intel = &brw->intel;
@ -3812,20 +3981,12 @@ brw_wm_fs_emit(struct brw_context *brw, struct brw_wm_compile *c)
struct gl_shader_program *prog = ctx->Shader.CurrentFragmentProgram;
if (!prog)
return GL_FALSE;
return false;
struct brw_shader *shader =
(brw_shader *) prog->_LinkedShaders[MESA_SHADER_FRAGMENT];
if (!shader)
return GL_FALSE;
/* We always use 8-wide mode, at least for now. For one, flow
* control only works in 8-wide. Also, when we're fragment shader
* bound, we're almost always under register pressure as well, so
* 8-wide would save us from the performance cliff of spilling
* regs.
*/
c->dispatch_width = 8;
return false;
if (unlikely(INTEL_DEBUG & DEBUG_WM)) {
printf("GLSL IR for native fragment shader %d:\n", prog->Name);
@ -3835,77 +3996,22 @@ brw_wm_fs_emit(struct brw_context *brw, struct brw_wm_compile *c)
/* Now the main event: Visit the shader IR and generate our FS IR for it.
*/
c->dispatch_width = 8;
fs_visitor v(c, shader);
if (0) {
v.emit_dummy_fs();
} else {
v.calculate_urb_setup();
if (intel->gen < 6)
v.emit_interpolation_setup_gen4();
else
v.emit_interpolation_setup_gen6();
/* Generate FS IR for main(). (the visitor only descends into
* functions called "main").
*/
foreach_iter(exec_list_iterator, iter, *shader->ir) {
ir_instruction *ir = (ir_instruction *)iter.get();
v.base_ir = ir;
ir->accept(&v);
}
v.emit_fb_writes();
v.split_virtual_grfs();
v.setup_paramvalues_refs();
v.setup_pull_constants();
bool progress;
do {
progress = false;
progress = v.remove_duplicate_mrf_writes() || progress;
progress = v.propagate_constants() || progress;
progress = v.register_coalesce() || progress;
progress = v.compute_to_mrf() || progress;
progress = v.dead_code_eliminate() || progress;
} while (progress);
v.schedule_instructions();
v.assign_curb_setup();
v.assign_urb_setup();
if (0) {
/* Debug of register spilling: Go spill everything. */
int virtual_grf_count = v.virtual_grf_next;
for (int i = 1; i < virtual_grf_count; i++) {
v.spill_reg(i);
}
}
if (0)
v.assign_regs_trivial();
else {
while (!v.assign_regs()) {
if (v.failed)
break;
}
}
if (!v.run()) {
/* FINISHME: Cleanly fail, test at link time, etc. */
assert(!"not reached");
return false;
}
if (!v.failed)
v.generate_code();
if (intel->gen >= 6) {
c->dispatch_width = 16;
fs_visitor v2(c, shader);
v2.run();
}
assert(!v.failed); /* FINISHME: Cleanly fail, tested at link time, etc. */
c->prog_data.dispatch_width = 8;
if (v.failed)
return GL_FALSE;
c->prog_data.total_grf = v.grf_used;
return GL_TRUE;
return true;
}

View file

@ -343,6 +343,8 @@ public:
bool eot;
bool header_present;
bool shadow_compare;
bool force_uncompressed;
bool force_sechalf;
uint32_t offset; /* spill/unspill offset */
/** @{
@ -405,6 +407,8 @@ public:
this->live_intervals_valid = false;
this->kill_emitted = false;
this->force_uncompressed_stack = 0;
this->force_sechalf_stack = 0;
}
~fs_visitor()
@ -461,6 +465,7 @@ public:
return emit(fs_inst(opcode, dst, src0, src1, src2));
}
bool run();
void setup_paramvalues_refs();
void assign_curb_setup();
void calculate_urb_setup();
@ -481,6 +486,11 @@ public:
void schedule_instructions();
void fail(const char *msg, ...);
void push_force_uncompressed();
void pop_force_uncompressed();
void push_force_sechalf();
void pop_force_sechalf();
void generate_code();
void generate_fb_write(fs_inst *inst);
void generate_pixel_xy(struct brw_reg dst, bool is_x);
@ -568,6 +578,9 @@ public:
fs_reg reg_null_cmp;
int grf_used;
int force_uncompressed_stack;
int force_sechalf_stack;
};
GLboolean brw_do_channel_expressions(struct exec_list *instructions);

View file

@ -120,7 +120,7 @@ brw_wm_non_glsl_emit(struct brw_context *brw, struct brw_wm_compile *c)
brw_wm_emit(c);
}
static void
void
brw_wm_payload_setup(struct brw_context *brw,
struct brw_wm_compile *c)
{
@ -225,18 +225,13 @@ static void do_wm_prog( struct brw_context *brw,
brw_init_compile(brw, &c->func);
brw_wm_payload_setup(brw, c);
if (!brw_wm_fs_emit(brw, c)) {
/*
* Shader which use GLSL features such as flow control are handled
* differently from "simple" shaders.
*/
/* Fallback for fixed function and ARB_fp shaders. */
c->dispatch_width = 16;
brw_wm_payload_setup(brw, c);
brw_wm_non_glsl_emit(brw, c);
c->prog_data.dispatch_width = 16;
}
c->prog_data.dispatch_width = c->dispatch_width;
/* Scratch space is used for register spilling */
if (c->last_scratch) {
@ -467,7 +462,7 @@ static void brw_prepare_wm_prog(struct brw_context *brw)
struct brw_wm_prog_key key;
struct brw_fragment_program *fp = (struct brw_fragment_program *)
brw->fragment_program;
brw_wm_populate_key(brw, &key);
/* Make an early check for the key.

View file

@ -314,7 +314,7 @@ void brw_wm_print_program( struct brw_wm_compile *c,
void brw_wm_lookup_iz(struct intel_context *intel,
struct brw_wm_compile *c);
GLboolean brw_wm_fs_emit(struct brw_context *brw, struct brw_wm_compile *c);
bool brw_wm_fs_emit(struct brw_context *brw, struct brw_wm_compile *c);
/* brw_wm_emit.c */
void emit_alu1(struct brw_compile *p,
@ -474,5 +474,7 @@ struct gl_shader_program *brw_new_shader_program(struct gl_context *ctx, GLuint
bool brw_color_buffer_write_enabled(struct brw_context *brw);
bool brw_render_target_supported(gl_format format);
void brw_wm_payload_setup(struct brw_context *brw,
struct brw_wm_compile *c);
#endif

View file

@ -143,14 +143,19 @@ upload_wm_state(struct brw_context *brw)
dw2 |= (ALIGN(brw->wm.sampler_count, 4) / 4) << GEN6_WM_SAMPLER_COUNT_SHIFT;
dw4 |= (brw->wm.prog_data->first_curbe_grf <<
GEN6_WM_DISPATCH_START_GRF_SHIFT_0);
dw4 |= (brw->wm.prog_data->first_curbe_grf_16 <<
GEN6_WM_DISPATCH_START_GRF_SHIFT_2);
dw5 |= (brw->wm_max_threads - 1) << GEN6_WM_MAX_THREADS_SHIFT;
/* CACHE_NEW_WM_PROG */
if (brw->wm.prog_data->dispatch_width == 8)
if (brw->wm.prog_data->dispatch_width == 8) {
dw5 |= GEN6_WM_8_DISPATCH_ENABLE;
else
if (brw->wm.prog_data->prog_offset_16)
dw5 |= GEN6_WM_16_DISPATCH_ENABLE;
} else {
dw5 |= GEN6_WM_16_DISPATCH_ENABLE;
}
/* _NEW_LINE */
if (ctx->Line.StippleFlag)
@ -194,7 +199,12 @@ upload_wm_state(struct brw_context *brw)
OUT_BATCH(dw5);
OUT_BATCH(dw6);
OUT_BATCH(0); /* kernel 1 pointer */
OUT_BATCH(0); /* kernel 2 pointer */
if (brw->wm.prog_data->prog_offset_16) {
OUT_RELOC(brw->wm.prog_bo, I915_GEM_DOMAIN_INSTRUCTION, 0,
brw->wm.prog_data->prog_offset_16);
} else {
OUT_BATCH(0); /* kernel 2 pointer */
}
ADVANCE_BATCH();
}