i965: Add fs_visitor::run_vs() to generate scalar vertex shader code

This patch uses the previous refactoring to add a new run_vs() method
that generates vertex shader code using the scalar visitor and
optimizer.

Signed-off-by: Kristian Høgsberg <krh@bitplanet.net>
Reviewed-by: Kenneth Graunke <kenneth@whitecape.org>
This commit is contained in:
Kristian Høgsberg 2014-10-27 22:42:50 -07:00
parent bf23079379
commit 8b6a797d74
3 changed files with 436 additions and 13 deletions

View file

@ -1808,6 +1808,61 @@ fs_visitor::assign_urb_setup()
urb_start + prog_data->num_varying_inputs * 2;
}
/**
 * Rewrite ATTR-file register references to the hardware GRFs where the
 * vertex attributes are delivered, and fill in the VS URB entry layout
 * (entry size and read length) in the prog_data.
 */
void
fs_visitor::assign_vs_urb_setup()
{
   assert(stage == MESA_SHADER_VERTEX);

   brw_vs_prog_data *vs_prog_data = (brw_vs_prog_data *) prog_data;

   /* One URB slot per enabled vertex attribute, plus one trailing slot
    * when the shader reads gl_VertexID/gl_InstanceID.
    */
   int num_attributes = _mesa_bitcount_64(vs_prog_data->inputs_read);
   if (vs_prog_data->uses_vertexid || vs_prog_data->uses_instanceid)
      num_attributes++;

   /* Each attribute is 4 regs. */
   this->first_non_payload_grf =
      payload.num_regs + prog_data->curb_read_length + num_attributes * 4;

   unsigned vue_entries =
      MAX2(num_attributes, vs_prog_data->base.vue_map.num_slots);

   vs_prog_data->base.urb_entry_size = ALIGN(vue_entries, 4) / 4;
   vs_prog_data->base.urb_read_length = (num_attributes + 1) / 2;

   assert(vs_prog_data->base.urb_read_length <= 15);

   /* Rewrite all ATTR file references to the hw grf that they land in. */
   foreach_block_and_inst(block, fs_inst, inst, cfg) {
      for (int i = 0; i < inst->sources; i++) {
         if (inst->src[i].file != ATTR)
            continue;

         int slot;
         if (inst->src[i].reg == VERT_ATTRIB_MAX) {
            /* System values (VertexID/InstanceID) live in the last slot. */
            slot = num_attributes - 1;
         } else {
            /* Attributes come in a contiguous block, ordered by their
             * gl_vert_attrib value.  That means we can compute the slot
             * number for an attribute by masking out the enabled
             * attributes before it and counting the bits.
             */
            int attr = inst->src[i].reg + inst->src[i].reg_offset / 4;
            slot = _mesa_bitcount_64(vs_prog_data->inputs_read &
                                     BITFIELD64_MASK(attr));
         }

         int channel = inst->src[i].reg_offset & 3;
         int grf = payload.num_regs +
                   prog_data->curb_read_length +
                   slot * 4 + channel;

         inst->src[i].file = HW_REG;
         inst->src[i].fixed_hw_reg =
            retype(brw_vec8_grf(grf, 0), inst->src[i].type);
      }
   }
}
/**
* Split large virtual GRFs into separate components if we can.
*
@ -3395,6 +3450,13 @@ fs_visitor::setup_payload_gen6()
}
}
/**
 * Record the fixed thread-payload size for a scalar vertex shader.
 *
 * The VS payload is two registers; everything after them is vertex
 * attribute data (see assign_vs_urb_setup()).
 */
void
fs_visitor::setup_vs_payload()
{
/* R0: thread header, R1: urb handles */
payload.num_regs = 2;
}
void
fs_visitor::assign_binding_table_offsets()
{
@ -3433,6 +3495,8 @@ fs_visitor::calculate_register_pressure()
void
fs_visitor::optimize()
{
const char *stage_name = stage == MESA_SHADER_VERTEX ? "vs" : "fs";
calculate_cfg();
split_virtual_grfs();
@ -3447,8 +3511,8 @@ fs_visitor::optimize()
\
if (unlikely(INTEL_DEBUG & DEBUG_OPTIMIZER) && this_progress) { \
char filename[64]; \
snprintf(filename, 64, "fs%d-%04d-%02d-%02d-" #pass, \
dispatch_width, shader_prog ? shader_prog->Name : 0, iteration, pass_num); \
snprintf(filename, 64, "%s%d-%04d-%02d-%02d-" #pass, \
stage_name, dispatch_width, shader_prog ? shader_prog->Name : 0, iteration, pass_num); \
\
backend_visitor::dump_instructions(filename); \
} \
@ -3458,8 +3522,8 @@ fs_visitor::optimize()
if (unlikely(INTEL_DEBUG & DEBUG_OPTIMIZER)) {
char filename[64];
snprintf(filename, 64, "fs%d-%04d-00-start",
dispatch_width, shader_prog ? shader_prog->Name : 0);
snprintf(filename, 64, "%s%d-%04d-00-start",
stage_name, dispatch_width, shader_prog ? shader_prog->Name : 0);
backend_visitor::dump_instructions(filename);
}
@ -3527,6 +3591,9 @@ fs_visitor::allocate_registers()
}
if (!allocated_without_spills) {
const char *stage_name = stage == MESA_SHADER_VERTEX ?
"Vertex" : "Fragment";
/* We assume that any spilling is worse than just dropping back to
* SIMD8. There's probably actually some intermediate point where
* SIMD16 with a couple of spills is still better.
@ -3535,9 +3602,9 @@ fs_visitor::allocate_registers()
fail("Failure to register allocate. Reduce number of "
"live scalar values to avoid this.");
} else {
perf_debug("Fragment shader triggered register spilling. "
perf_debug("%s shader triggered register spilling. "
"Try reducing the number of live scalar values to "
"improve performance.\n");
"improve performance.\n", stage_name);
}
/* Since we're out of heuristics, just go spill registers until we
@ -3565,6 +3632,38 @@ fs_visitor::allocate_registers()
prog_data->total_scratch = brw_get_scratch_size(last_scratch);
}
/**
 * Generate scalar (SIMD8) vertex shader code.
 *
 * Visits the shader IR with the scalar visitor, emits the URB writes that
 * deliver the VUE outputs, then runs the shared optimization and
 * register-allocation pipeline.
 *
 * Returns false if code generation failed at any point.
 */
bool
fs_visitor::run_vs()
{
assert(stage == MESA_SHADER_VERTEX);
/* The VS only needs the common (texture/UBO/etc.) binding table slots. */
assign_common_binding_table_offsets(0);
setup_vs_payload();
if (INTEL_DEBUG & DEBUG_SHADER_TIME)
emit_shader_time_begin();
/* Walk the GLSL IR, generating scalar backend IR. */
foreach_in_list(ir_instruction, ir, shader->base.ir) {
base_ir = ir;
this->result = reg_undef;
ir->accept(this);
}
base_ir = NULL;
if (failed)
return false;
/* Write the VUE outputs to the URB; this also terminates the thread. */
emit_urb_writes();
optimize();
/* assign_vs_urb_setup() computes attribute GRF numbers from
 * curb_read_length, so curb setup must run first.
 */
assign_curb_setup();
assign_vs_urb_setup();
allocate_registers();
return !failed;
}
bool
fs_visitor::run()
{

View file

@ -308,12 +308,23 @@ public:
struct gl_shader_program *shader_prog,
struct gl_fragment_program *fp,
unsigned dispatch_width);
fs_visitor(struct brw_context *brw,
void *mem_ctx,
const struct brw_vs_prog_key *key,
struct brw_vs_prog_data *prog_data,
struct gl_shader_program *shader_prog,
struct gl_vertex_program *cp,
unsigned dispatch_width);
~fs_visitor();
void init();
fs_reg *variable_storage(ir_variable *var);
int virtual_grf_alloc(int size);
void import_uniforms(fs_visitor *v);
void setup_uniform_clipplane_values();
void compute_clip_distance();
void visit(ir_variable *ir);
void visit(ir_assignment *ir);
@ -404,14 +415,17 @@ public:
uint32_t const_offset);
bool run();
bool run_vs();
void optimize();
void allocate_registers();
void assign_binding_table_offsets();
void setup_payload_gen4();
void setup_payload_gen6();
void setup_vs_payload();
void assign_curb_setup();
void calculate_urb_setup();
void assign_urb_setup();
void assign_vs_urb_setup();
bool assign_regs(bool allow_spilling);
void assign_regs_trivial();
void get_used_mrfs(bool *mrf_used);
@ -465,6 +479,7 @@ public:
fs_reg *emit_samplepos_setup();
fs_reg *emit_sampleid_setup();
fs_reg *emit_general_interpolation(ir_variable *ir);
fs_reg *emit_vs_system_value(enum brw_reg_type type, int location);
void emit_interpolation_setup_gen4();
void emit_interpolation_setup_gen6();
void compute_sample_position(fs_reg dst, fs_reg int_sample_pos);
@ -552,6 +567,7 @@ public:
fs_inst *emit_single_fb_write(fs_reg color1, fs_reg color2,
fs_reg src0_alpha, unsigned components);
void emit_fb_writes();
void emit_urb_writes();
void emit_shader_time_begin();
void emit_shader_time_end();
@ -627,8 +643,8 @@ public:
struct hash_table *variable_ht;
fs_reg frag_depth;
fs_reg sample_mask;
fs_reg outputs[BRW_MAX_DRAW_BUFFERS];
unsigned output_components[BRW_MAX_DRAW_BUFFERS];
fs_reg outputs[VARYING_SLOT_MAX];
unsigned output_components[VARYING_SLOT_MAX];
fs_reg dual_src_output;
bool do_dual_src;
int first_non_payload_grf;
@ -675,6 +691,7 @@ public:
fs_reg delta_x[BRW_WM_BARYCENTRIC_INTERP_MODE_COUNT];
fs_reg delta_y[BRW_WM_BARYCENTRIC_INTERP_MODE_COUNT];
fs_reg shader_start_time;
fs_reg userplane[MAX_CLIP_PLANES];
int grf_used;
bool spilled_any_registers;

View file

@ -43,11 +43,40 @@ extern "C" {
#include "brw_eu.h"
#include "brw_wm.h"
}
#include "brw_vec4.h"
#include "brw_fs.h"
#include "main/uniforms.h"
#include "glsl/glsl_types.h"
#include "glsl/ir_optimization.h"
/**
 * Create a register reference for a VS system value (gl_VertexID,
 * gl_InstanceID, gl_BaseVertex).
 *
 * System values are delivered in the attribute slot just past the real
 * vertex attributes (VERT_ATTRIB_MAX); reg_offset picks the component.
 * Also flags the corresponding uses_* bit in the prog_data so the payload
 * slot is actually allocated.
 */
fs_reg *
fs_visitor::emit_vs_system_value(enum brw_reg_type type, int location)
{
   brw_vs_prog_data *vs_prog_data = (brw_vs_prog_data *) prog_data;

   fs_reg *value = new(this->mem_ctx)
      fs_reg(ATTR, VERT_ATTRIB_MAX, type);

   if (location == SYSTEM_VALUE_BASE_VERTEX) {
      value->reg_offset = 0;
      vs_prog_data->uses_vertexid = true;
   } else if (location == SYSTEM_VALUE_VERTEX_ID ||
              location == SYSTEM_VALUE_VERTEX_ID_ZERO_BASE) {
      value->reg_offset = 2;
      vs_prog_data->uses_vertexid = true;
   } else if (location == SYSTEM_VALUE_INSTANCE_ID) {
      value->reg_offset = 3;
      vs_prog_data->uses_instanceid = true;
   } else {
      unreachable("not reached");
   }

   return value;
}
void
fs_visitor::visit(ir_variable *ir)
{
@ -58,7 +87,11 @@ fs_visitor::visit(ir_variable *ir)
if (ir->data.mode == ir_var_shader_in) {
assert(ir->data.location != -1);
if (!strcmp(ir->name, "gl_FragCoord")) {
if (stage == MESA_SHADER_VERTEX) {
reg = new(this->mem_ctx)
fs_reg(ATTR, ir->data.location,
brw_type_for_base_type(ir->type->get_scalar_type()));
} else if (!strcmp(ir->name, "gl_FragCoord")) {
reg = emit_fragcoord_interpolation(ir);
} else if (!strcmp(ir->name, "gl_FrontFacing")) {
reg = emit_frontfacing_interpolation();
@ -71,7 +104,19 @@ fs_visitor::visit(ir_variable *ir)
} else if (ir->data.mode == ir_var_shader_out) {
reg = new(this->mem_ctx) fs_reg(this, ir->type);
if (ir->data.index > 0) {
if (stage == MESA_SHADER_VERTEX) {
int vector_elements =
ir->type->is_array() ? ir->type->fields.array->vector_elements
: ir->type->vector_elements;
for (int i = 0; i < (type_size(ir->type) + 3) / 4; i++) {
int output = ir->data.location + i;
this->outputs[output] = *reg;
this->outputs[output].reg_offset = i * 4;
this->output_components[output] = vector_elements;
}
} else if (ir->data.index > 0) {
assert(ir->data.location == FRAG_RESULT_DATA0);
assert(ir->data.index == 1);
this->dual_src_output = *reg;
@ -135,15 +180,26 @@ fs_visitor::visit(ir_variable *ir)
reg->type = brw_type_for_base_type(ir->type);
} else if (ir->data.mode == ir_var_system_value) {
if (ir->data.location == SYSTEM_VALUE_SAMPLE_POS) {
switch (ir->data.location) {
case SYSTEM_VALUE_BASE_VERTEX:
case SYSTEM_VALUE_VERTEX_ID:
case SYSTEM_VALUE_VERTEX_ID_ZERO_BASE:
case SYSTEM_VALUE_INSTANCE_ID:
reg = emit_vs_system_value(brw_type_for_base_type(ir->type),
ir->data.location);
break;
case SYSTEM_VALUE_SAMPLE_POS:
reg = emit_samplepos_setup();
} else if (ir->data.location == SYSTEM_VALUE_SAMPLE_ID) {
break;
case SYSTEM_VALUE_SAMPLE_ID:
reg = emit_sampleid_setup();
} else if (ir->data.location == SYSTEM_VALUE_SAMPLE_MASK_IN) {
break;
case SYSTEM_VALUE_SAMPLE_MASK_IN:
assert(brw->gen >= 7);
reg = new(mem_ctx)
fs_reg(retype(brw_vec8_grf(payload.sample_mask_in_reg, 0),
BRW_REGISTER_TYPE_D));
break;
}
}
@ -1770,6 +1826,8 @@ get_tex(gl_shader_stage stage, const void *key)
switch (stage) {
case MESA_SHADER_FRAGMENT:
return &((brw_wm_prog_key*) key)->tex;
case MESA_SHADER_VERTEX:
return &((brw_vue_prog_key*) key)->tex;
default:
unreachable("unhandled shader stage");
}
@ -3448,6 +3506,236 @@ fs_visitor::emit_fb_writes()
this->current_annotation = NULL;
}
/**
 * Load the active user clip planes as push-constant uniforms and record
 * where each plane's vec4 starts in this->userplane[].
 */
void
fs_visitor::setup_uniform_clipplane_values()
{
   gl_clip_plane *clip_planes = brw_select_clip_planes(ctx);
   const struct brw_vue_prog_key *key =
      (const struct brw_vue_prog_key *) this->key;
   const int nr_planes = key->nr_userclip_plane_consts;

   for (int plane = 0; plane < nr_planes; plane++) {
      /* Each plane occupies four consecutive uniform slots (one vec4). */
      this->userplane[plane] = fs_reg(UNIFORM, uniforms);
      for (int comp = 0; comp < 4; ++comp) {
         stage_prog_data->param[uniforms + comp] =
            (gl_constant_value *) &clip_planes[plane][comp];
      }
      uniforms += 4;
   }
}
/**
 * Synthesize gl_ClipDistance values for legacy user clip planes.
 *
 * Computes dot(clip_vertex, plane) for each enabled user clip plane and
 * stores the results into the CLIP_DIST0/CLIP_DIST1 output slots.
 */
void fs_visitor::compute_clip_distance()
{
struct brw_vue_prog_data *vue_prog_data =
(struct brw_vue_prog_data *) prog_data;
const struct brw_vue_prog_key *key =
(const struct brw_vue_prog_key *) this->key;
/* From the GLSL 1.30 spec, section 7.1 (Vertex Shader Special Variables):
*
*     "If a linked set of shaders forming the vertex stage contains no
*     static write to gl_ClipVertex or gl_ClipDistance, but the
*     application has requested clipping against user clip planes through
*     the API, then the coordinate written to gl_Position is used for
*     comparison against the user clip planes."
*
* This function is only called if the shader didn't write to
* gl_ClipDistance.  Accordingly, we use gl_ClipVertex to perform clipping
* if the user wrote to it; otherwise we use gl_Position.
*/
gl_varying_slot clip_vertex = VARYING_SLOT_CLIP_VERTEX;
if (!(vue_prog_data->vue_map.slots_valid & VARYING_BIT_CLIP_VERTEX))
clip_vertex = VARYING_SLOT_POS;
/* If the clip vertex isn't written, skip this.  Typically this means
* the GS will set up clipping. */
if (outputs[clip_vertex].file == BAD_FILE)
return;
setup_uniform_clipplane_values();
current_annotation = "user clip distances";
this->outputs[VARYING_SLOT_CLIP_DIST0] = fs_reg(this, glsl_type::vec4_type);
this->outputs[VARYING_SLOT_CLIP_DIST1] = fs_reg(this, glsl_type::vec4_type);
/* For each plane, accumulate the 4-component dot product as one MUL
 * followed by three MADs.  Distances 0-3 go in CLIP_DIST0, 4-7 in
 * CLIP_DIST1; reg_offset selects the component within the vec4.
 */
for (int i = 0; i < key->nr_userclip_plane_consts; i++) {
fs_reg u = userplane[i];
fs_reg output = outputs[VARYING_SLOT_CLIP_DIST0 + i / 4];
output.reg_offset = i & 3;
emit(MUL(output, outputs[clip_vertex], u));
for (int j = 1; j < 4; j++) {
u.reg = userplane[i].reg + j;
emit(MAD(output, output, offset(outputs[clip_vertex], j), u));
}
}
}
/**
 * Emit the URB write messages that deliver the VUE (vertex) outputs.
 *
 * Walks the VUE map slot by slot, batching up to 8 registers (2 VUE
 * slots) of payload per SHADER_OPCODE_URB_WRITE_SIMD8 send.  The final
 * send carries EOT to terminate the thread.  Unwritten slots are skipped
 * by advancing urb_offset (or by flushing a partial payload first).
 */
void
fs_visitor::emit_urb_writes()
{
int slot, urb_offset, length;
struct brw_vs_prog_data *vs_prog_data =
(struct brw_vs_prog_data *) prog_data;
const struct brw_vs_prog_key *key =
(const struct brw_vs_prog_key *) this->key;
const GLbitfield64 psiz_mask =
VARYING_BIT_LAYER | VARYING_BIT_VIEWPORT | VARYING_BIT_PSIZ;
const struct brw_vue_map *vue_map = &vs_prog_data->base.vue_map;
bool flush;
fs_reg sources[8];
/* Lower legacy ff and ClipVertex clipping to clip distances */
if (key->base.userclip_active && !prog->UsesClipDistanceOut)
compute_clip_distance();
/* If we don't have any valid slots to write, just do a minimal urb write
* send to terminate the shader. */
if (vue_map->slots_valid == 0) {
fs_reg payload = fs_reg(GRF, virtual_grf_alloc(1), BRW_REGISTER_TYPE_UD);
/* The message header (URB handles from g1) must be written with
 * force_writemask_all so all channels get valid handles.
 */
fs_inst *inst = emit(MOV(payload, fs_reg(retype(brw_vec8_grf(1, 0),
BRW_REGISTER_TYPE_UD))));
inst->force_writemask_all = true;
inst = emit(SHADER_OPCODE_URB_WRITE_SIMD8, reg_undef, payload);
inst->eot = true;
inst->mlen = 1;
inst->offset = 1;
return;
}
length = 0;
urb_offset = 0;
flush = false;
for (slot = 0; slot < vue_map->num_slots; slot++) {
fs_reg reg, src, zero;
int varying = vue_map->slot_to_varying[slot];
switch (varying) {
case VARYING_SLOT_PSIZ:
/* The point size varying slot is the vue header and is always in the
* vue map.  But often none of the special varyings that live there
* are written and in that case we can skip writing to the vue
* header, provided the corresponding state properly clamps the
* values further down the pipeline. */
if ((vue_map->slots_valid & psiz_mask) == 0) {
assert(length == 0);
urb_offset++;
break;
}
/* Build the VUE header: <zero, layer, viewport, point size>, with
 * zero substituted for any component the shader didn't write.
 */
zero = fs_reg(GRF, virtual_grf_alloc(1), BRW_REGISTER_TYPE_UD);
emit(MOV(zero, fs_reg(0u)));
sources[length++] = zero;
if (vue_map->slots_valid & VARYING_BIT_LAYER)
sources[length++] = this->outputs[VARYING_SLOT_LAYER];
else
sources[length++] = zero;
if (vue_map->slots_valid & VARYING_BIT_VIEWPORT)
sources[length++] = this->outputs[VARYING_SLOT_VIEWPORT];
else
sources[length++] = zero;
if (vue_map->slots_valid & VARYING_BIT_PSIZ)
sources[length++] = this->outputs[VARYING_SLOT_PSIZ];
else
sources[length++] = zero;
break;
case BRW_VARYING_SLOT_NDC:
case VARYING_SLOT_EDGE:
/* These are only produced by the vec4/fixed-function paths. */
unreachable("unexpected scalar vs output");
break;
case BRW_VARYING_SLOT_PAD:
break;
default:
/* gl_Position is always in the vue map, but isn't always written by
* the shader.  Other varyings (clip distances) get added to the vue
* map but don't always get written.  In those cases, the
* corresponding this->output[] slot will be invalid and we can skip
* the urb write for the varying.  If we've already queued up a vue
* slot for writing we flush a mlen 5 urb write, otherwise we just
* advance the urb_offset.
*/
if (this->outputs[varying].file == BAD_FILE) {
if (length > 0)
flush = true;
else
urb_offset++;
break;
}
if ((varying == VARYING_SLOT_COL0 ||
varying == VARYING_SLOT_COL1 ||
varying == VARYING_SLOT_BFC0 ||
varying == VARYING_SLOT_BFC1) &&
key->clamp_vertex_color) {
/* We need to clamp these guys, so do a saturating MOV into a
* temp register and use that for the payload.
*/
for (int i = 0; i < 4; i++) {
reg = fs_reg(GRF, virtual_grf_alloc(1), outputs[varying].type);
src = offset(this->outputs[varying], i);
fs_inst *inst = emit(MOV(reg, src));
inst->saturate = true;
sources[length++] = reg;
}
} else {
for (int i = 0; i < 4; i++)
sources[length++] = offset(this->outputs[varying], i);
}
break;
}
current_annotation = "URB write";
/* If we've queued up 8 registers of payload (2 VUE slots), if this is
* the last slot or if we need to flush (see BAD_FILE varying case
* above), emit a URB write send now to flush out the data.
*/
int last = slot == vue_map->num_slots - 1;
if (length == 8 || last)
flush = true;
if (flush) {
if (last && (INTEL_DEBUG & DEBUG_SHADER_TIME))
emit_shader_time_end();
fs_reg *payload_sources = ralloc_array(mem_ctx, fs_reg, length + 1);
fs_reg payload = fs_reg(GRF, virtual_grf_alloc(length + 1),
BRW_REGISTER_TYPE_F);
/* We need WE_all on the MOV for the message header (the URB handles)
* so do a MOV to a dummy register and set force_writemask_all on the
* MOV.  LOAD_PAYLOAD will preserve that.
*/
fs_reg dummy = fs_reg(GRF, virtual_grf_alloc(1),
BRW_REGISTER_TYPE_UD);
fs_inst *inst = emit(MOV(dummy, fs_reg(retype(brw_vec8_grf(1, 0),
BRW_REGISTER_TYPE_UD))));
inst->force_writemask_all = true;
payload_sources[0] = dummy;
memcpy(&payload_sources[1], sources, length * sizeof sources[0]);
emit(LOAD_PAYLOAD(payload, payload_sources, length + 1));
inst = emit(SHADER_OPCODE_URB_WRITE_SIMD8, reg_undef, payload);
/* Only the very last send terminates the thread (EOT). */
inst->eot = last;
inst->mlen = length + 1;
inst->offset = urb_offset;
/* The next send's offset starts right after the slot just written. */
urb_offset = slot + 1;
length = 0;
flush = false;
}
}
}
void
fs_visitor::resolve_ud_negate(fs_reg *reg)
{
@ -3500,6 +3788,25 @@ fs_visitor::fs_visitor(struct brw_context *brw,
init();
}
/**
 * fs_visitor constructor for scalar vertex shaders.
 *
 * Mirrors the fragment-shader constructor but takes the VS program key,
 * prog_data and gl_vertex_program, and registers itself as a
 * MESA_SHADER_VERTEX backend visitor.  The shared state lives in the
 * embedded base structs (&prog_data->base.base).
 */
fs_visitor::fs_visitor(struct brw_context *brw,
void *mem_ctx,
const struct brw_vs_prog_key *key,
struct brw_vs_prog_data *prog_data,
struct gl_shader_program *shader_prog,
struct gl_vertex_program *cp,
unsigned dispatch_width)
: backend_visitor(brw, shader_prog, &cp->Base, &prog_data->base.base,
MESA_SHADER_VERTEX),
reg_null_f(retype(brw_null_vec(dispatch_width), BRW_REGISTER_TYPE_F)),
reg_null_d(retype(brw_null_vec(dispatch_width), BRW_REGISTER_TYPE_D)),
reg_null_ud(retype(brw_null_vec(dispatch_width), BRW_REGISTER_TYPE_UD)),
key(key), prog_data(&prog_data->base.base),
dispatch_width(dispatch_width)
{
this->mem_ctx = mem_ctx;
init();
}
void
fs_visitor::init()
{