freedreno/ir3: "soft" depth scheduling for SFU instructions

First try with a "soft" depth, to try to schedule sfu instructions further from their consumers, but fall back to hard depth (which might result in stalling) if nothing else is available to schedule. Previously the consumer of a sfu instruction could end up scheduled immediately after (since "hard" depth from sfu to consumer would be 0). This works because legalize pass would insert a (ss) sync bit, but it is sub-optimal since it would cause a stall. Instead prioritize other instructions for 4 cycles if they would not cause a nop to be inserted. This minimizes the stalling. There is a slight penalty in general to overall # of instructions in shader (since we could end up needing nop's later due to scheduling the "deeper" sfu consumer later), but ends up being a wash on register pressure. Overall this seems to be worth a 10+% gain in fps. Increasing the "soft" depth of sfu consumer beyond 4 helps a bit in some cases, but 4 seems to be a good trade-off between getting 99% of the gain and not increasing instruction count of shaders too much. It's possible a similar approach could help for tex/mem instructions, but the (sy) sync bit seems to trigger a switch to a different thread-group to hide memory latency (possibly with some limits depending on number of registers used?). Signed-off-by: Rob Clark <robdclark@gmail.com>
author: Rob Clark <robdclark@gmail.com> 2018-01-11 16:08:47 -0500
committer: Rob Clark <robdclark@gmail.com> 2018-01-14 16:14:19 -0500
commit: f10bd0a0e1f7cba65a4b462016d3869351b20106 (patch)
tree: 04b9623456c9a7da0d456b948f0554786e61ad08
parent: 50f9a9aa960b6340b84aae2fa0e86e14c0e40fa8 (diff)
1 files changed, 21 insertions, 9 deletions
diff --git a/src/gallium/drivers/freedreno/ir3/ir3_sched.c b/src/gallium/drivers/freedreno/ir3/ir3_sched.c
index 5d0fcc481a..cbb213d773 100644
--- a/src/gallium/drivers/freedreno/ir3/ir3_sched.c
+++ b/src/gallium/drivers/freedreno/ir3/ir3_sched.c
@@ -157,7 +157,8 @@ distance(struct ir3_sched_ctx *ctx, struct ir3_instruction *instr,
 static unsigned
 delay_calc_srcn(struct ir3_sched_ctx *ctx,
 		struct ir3_instruction *assigner,
-		struct ir3_instruction *consumer, unsigned srcn)
+		struct ir3_instruction *consumer,
+		unsigned srcn, bool soft)
 {
 	unsigned delay = 0;
 
@@ -167,11 +168,19 @@ delay_calc_srcn(struct ir3_sched_ctx *ctx,
 			unsigned d;
 			if (src->block != assigner->block)
 				break;
-			d = delay_calc_srcn(ctx, src, consumer, srcn);
+			d = delay_calc_srcn(ctx, src, consumer, srcn, soft);
 			delay = MAX2(delay, d);
 		}
 	} else {
-		delay = ir3_delayslots(assigner, consumer, srcn);
+		if (soft) {
+			if (is_sfu(assigner)) {
+				delay = 4;
+			} else {
+				delay = ir3_delayslots(assigner, consumer, srcn);
+			}
+		} else {
+			delay = ir3_delayslots(assigner, consumer, srcn);
+		}
 		delay -= distance(ctx, assigner, delay);
 	}
 
@@ -180,7 +189,7 @@ delay_calc_srcn(struct ir3_sched_ctx *ctx,
 
 /* calculate delay for instruction (maximum of delay for all srcs): */
 static unsigned
-delay_calc(struct ir3_sched_ctx *ctx, struct ir3_instruction *instr)
+delay_calc(struct ir3_sched_ctx *ctx, struct ir3_instruction *instr, bool soft)
 {
 	unsigned delay = 0;
 	struct ir3_instruction *src;
@@ -192,7 +201,7 @@ delay_calc(struct ir3_sched_ctx *ctx, struct ir3_instruction *instr)
 			continue;
 		if (src->block != instr->block)
 			continue;
-		d = delay_calc_srcn(ctx, src, instr, i);
+		d = delay_calc_srcn(ctx, src, instr, i, soft);
 		delay = MAX2(delay, d);
 	}
 
@@ -367,7 +376,8 @@ find_instr_recursive(struct ir3_sched_ctx *ctx, struct ir3_sched_notes *notes,
 
 /* find instruction to schedule: */
 static struct ir3_instruction *
-find_eligible_instr(struct ir3_sched_ctx *ctx, struct ir3_sched_notes *notes)
+find_eligible_instr(struct ir3_sched_ctx *ctx, struct ir3_sched_notes *notes,
+		bool soft)
 {
 	struct ir3_instruction *best_instr = NULL;
 	unsigned min_delay = ~0;
@@ -386,7 +396,7 @@ find_eligible_instr(struct ir3_sched_ctx *ctx, struct ir3_sched_notes *notes)
 		if (!candidate)
 			continue;
 
-		delay = delay_calc(ctx, candidate);
+		delay = delay_calc(ctx, candidate, soft);
 		if (delay < min_delay) {
 			best_instr = candidate;
 			min_delay = delay;
@@ -522,10 +532,12 @@ sched_block(struct ir3_sched_ctx *ctx, struct ir3_block *block)
 		struct ir3_sched_notes notes = {0};
 		struct ir3_instruction *instr;
 
-		instr = find_eligible_instr(ctx, &notes);
+		instr = find_eligible_instr(ctx, &notes, true);
+		if (!instr)
+			instr = find_eligible_instr(ctx, &notes, false);
 
 		if (instr) {
-			unsigned delay = delay_calc(ctx, instr);
+			unsigned delay = delay_calc(ctx, instr, false);
 
 			/* and if we run out of instructions that can be scheduled,
 			 * then it is time for nop's:
author	Rob Clark <robdclark@gmail.com>	2018-01-11 16:08:47 -0500
committer	Rob Clark <robdclark@gmail.com>	2018-01-14 16:14:19 -0500
commit	f10bd0a0e1f7cba65a4b462016d3869351b20106 (patch)
tree	04b9623456c9a7da0d456b948f0554786e61ad08
parent	50f9a9aa960b6340b84aae2fa0e86e14c0e40fa8 (diff)