author     Vadim Girlin <vadimgirlin@gmail.com>   2013-07-17 18:29:56 +0400
committer  Vadim Girlin <vadimgirlin@gmail.com>   2013-07-17 18:29:56 +0400
commit     07baf9cfd16b38872be952382ae5a705057cbec2 (patch)
tree       984159e130f9228e113b48253d4941505135e090
parent     ba7fa4c4c93e67fec798d837005a3041adda3d5b (diff)
r600g/sb: improve alu packing on cayman
The scheduler/register allocator in r600-sb was developed and optimized on
evergreen (VLIW-5) hardware, so it's currently not optimal for VLIW-4 chips.
This patch should improve performance on cayman gpus due to better alu
packing, but it also tends to increase register usage, so the overall effect
on performance still has to be proven with real benchmarks.

Some results with the bfgminer kernel on cayman:

    source bytecode:       60 gprs, 3905 alu groups
    sbcl before the patch: 45 gprs, 4088 alu groups
    sbcl with this patch:  55 gprs, 3474 alu groups

Signed-off-by: Vadim Girlin <vadimgirlin@gmail.com>
-rw-r--r--  src/gallium/drivers/r600/sb/sb_pass.h       26
-rw-r--r--  src/gallium/drivers/r600/sb/sb_ra_init.cpp  78
2 files changed, 89 insertions, 15 deletions
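
To see why uniform channel distribution helps on VLIW-4, note the restriction the patch's own comment refers to: on cayman every ALU slot is tied to the destination register's write channel, so independent scalar instructions that all write channel x can never share a group. The toy model below is purely illustrative; it ignores dependencies, bank swizzle and every other packing rule the real scheduler handles, and only shows how the busiest channel bounds the group count.

// Toy model only: counts the minimum number of VLIW-4 ALU groups needed for
// independent scalar instructions when each instruction must occupy the slot
// matching its destination channel (x, y, z or w).
#include <algorithm>
#include <cstdio>
#include <vector>

static unsigned groups_needed(const std::vector<unsigned> &dest_chans) {
	unsigned per_chan[4] = {0, 0, 0, 0};
	for (unsigned c : dest_chans)
		per_chan[c & 3]++;
	// each group offers one slot per channel, so the busiest channel decides
	return *std::max_element(per_chan, per_chan + 4);
}

int main() {
	std::vector<unsigned> skewed(16, 0);   // 16 values, all allocated to chan x
	std::vector<unsigned> spread;          // 16 values, spread over x..w
	for (unsigned i = 0; i < 16; ++i)
		spread.push_back(i & 3);

	printf("skewed allocation: %u groups\n", groups_needed(skewed)); // 16
	printf("spread allocation: %u groups\n", groups_needed(spread)); //  4
}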
diff --git a/src/gallium/drivers/r600/sb/sb_pass.h b/src/gallium/drivers/r600/sb/sb_pass.h
index c3ea8734de..95d2a203a6 100644
--- a/src/gallium/drivers/r600/sb/sb_pass.h
+++ b/src/gallium/drivers/r600/sb/sb_pass.h
@@ -507,12 +507,36 @@ class ra_init : public pass {
public:
- ra_init(shader &sh) : pass(sh) {}
+ ra_init(shader &sh) : pass(sh), prev_chans() {
+
+ // The parameter below affects register channel distribution.
+ // For cayman (VLIW-4) we're trying to distribute the channels
+ // uniformly, which means significantly better alu slot utilization
+ // at the expense of higher gpr usage. Hopefully this will improve
+ // performance, though that has yet to be proven with real benchmarks.
+ // For VLIW-5 this method could also slightly improve slot
+ // utilization, but the increased register pressure seems more
+ // significant and the overall performance effect is negative according
+ // to some benchmarks, so it's not used currently. Basically, VLIW-5
+ // doesn't really need it because the trans slot (unrestricted by
+ // register write channel) can absorb most deviations from a uniform
+ // channel distribution.
+ // Value 3 means that for a new allocation we'll use a channel that
+ // differs from the last 3 used channels. 0 for VLIW-5 effectively turns this off.
+
+ ra_tune = sh.get_ctx().is_cayman() ? 3 : 0;
+ }
virtual int run();
private:
+ unsigned prev_chans;
+ unsigned ra_tune;
+
+ void add_prev_chan(unsigned chan);
+ unsigned get_preferable_chan_mask();
+
void ra_node(container_node *c);
void process_op(node *n);
diff --git a/src/gallium/drivers/r600/sb/sb_ra_init.cpp b/src/gallium/drivers/r600/sb/sb_ra_init.cpp
index 24b24a0bde..0b332a9847 100644
--- a/src/gallium/drivers/r600/sb/sb_ra_init.cpp
+++ b/src/gallium/drivers/r600/sb/sb_ra_init.cpp
@@ -72,6 +72,7 @@ public:
sel_chan find_free_bit();
sel_chan find_free_chans(unsigned mask);
+ sel_chan find_free_chan_by_mask(unsigned mask);
sel_chan find_free_array(unsigned size, unsigned mask);
void dump();
@@ -86,7 +87,7 @@ void regbits::dump() {
sblog << "\n";
if (!(i & 3)) {
- sblog.print_wl(i / 4, 7);
+ sblog.print_w(i / 4, 7);
sblog << " ";
}
@@ -186,34 +187,64 @@ sel_chan regbits::find_free_chans(unsigned mask) {
unsigned elt = 0;
unsigned bit = 0;
- basetype cd = dta[elt] >> bit;
+ assert (!(mask & ~0xF));
+ basetype cd = dta[elt];
do {
-
if (!cd) {
- if (++elt < size)
+ if (++elt < size) {
cd = dta[elt];
- else
+ bit = 0;
+ continue;
+ } else
return 0;
-
- bit = 0;
}
unsigned p = __builtin_ctz(cd) & ~(basetype)3u;
- if (p > bt_bits - bit) {
- if (++elt < size)
+ assert (p <= bt_bits - bit);
+ bit += p;
+ cd >>= p;
+
+ if ((cd & mask) == mask) {
+ return ((elt << bt_index_shift) | bit) + 1;
+ }
+
+ bit += 4;
+ cd >>= 4;
+
+ } while (1);
+
+ return 0;
+}
+
+sel_chan regbits::find_free_chan_by_mask(unsigned mask) {
+ unsigned elt = 0;
+ unsigned bit = 0;
+
+ assert (!(mask & ~0xF));
+ basetype cd = dta[elt];
+
+ do {
+ if (!cd) {
+ if (++elt < size) {
cd = dta[elt];
- else
+ bit = 0;
+ continue;
+ } else
return 0;
- bit = 0;
}
+ unsigned p = __builtin_ctz(cd) & ~(basetype)3u;
+
+ assert (p <= bt_bits - bit);
bit += p;
cd >>= p;
- if ((cd & mask) == mask) {
- return ((elt << bt_index_shift) | bit) + 1;
+ if (cd & mask) {
+ unsigned nb = __builtin_ctz(cd & mask);
+ unsigned ofs = ((elt << bt_index_shift) | bit);
+ return nb + ofs + 1;
}
bit += 4;
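
The new regbits::find_free_chan_by_mask() differs from find_free_chans() above in one key way: find_free_chans() requires every channel in the mask to be free within the same GPR, while the new helper accepts any free channel from the preferred mask and returns that exact channel. A simplified standalone model of that scan is sketched below; it assumes the same layout the pass uses (4 bits per GPR, a set bit meaning the channel is free), and the function and its encoding are illustrative rather than the driver's regbits class.

// Simplified model: one 32-bit word covers 8 GPRs, 4 bits (x,y,z,w) per GPR,
// a set bit means "free".  Returns (gpr * 4 + chan) + 1, or 0 if no channel
// from the preferred mask is free anywhere, mirroring the 1-based encoding.
#include <cstdint>
#include <cstdio>

static unsigned find_free_chan_by_mask(const uint32_t *words, unsigned nwords,
                                       unsigned mask /* 4-bit channel mask */) {
	for (unsigned w = 0; w < nwords; ++w) {
		uint32_t cd = words[w];
		unsigned bit = 0;
		while (cd) {
			// jump to the next GPR group that has any free bit
			unsigned p = __builtin_ctz(cd) & ~3u;
			bit += p;
			cd >>= p;
			if (cd & mask) {
				// lowest preferred channel that is free in this group
				unsigned chan = __builtin_ctz(cd & mask);
				return (w * 32 + bit) + chan + 1;
			}
			bit += 4;
			cd >>= 4;
		}
	}
	return 0;
}

int main() {
	// GPR0 has only x free, GPR1 has y and w free, the rest are full.
	uint32_t words[1] = { 0x1u | (0xAu << 4) };
	printf("%u\n", find_free_chan_by_mask(words, 1, 0x8)); // prefers w -> GPR1.w
	printf("%u\n", find_free_chan_by_mask(words, 1, 0x6)); // prefers y/z -> GPR1.y
}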
@@ -476,7 +507,9 @@ void ra_init::color(value* v) {
unsigned mask = 1 << v->pin_gpr.chan();
c = rb.find_free_chans(mask) + v->pin_gpr.chan();
} else {
- c = rb.find_free_bit();
+ unsigned cm = get_preferable_chan_mask();
+ RA_DUMP( sblog << "pref chan mask: " << cm << "\n"; );
+ c = rb.find_free_chan_by_mask(cm);
}
assert(c && c.sel() < 128 - ctx.alu_temp_gprs && "color failed");
@@ -484,6 +517,7 @@ void ra_init::color(value* v) {
}
void ra_init::assign_color(value* v, sel_chan c) {
+ add_prev_chan(c.chan());
v->gpr = c;
RA_DUMP(
sblog << "colored ";
@@ -790,4 +824,20 @@ void ra_split::split_vector_inst(node* n) {
}
}
+void ra_init::add_prev_chan(unsigned chan) {
+ prev_chans = (prev_chans << 4) | (1 << chan);
+}
+
+unsigned ra_init::get_preferable_chan_mask() {
+ unsigned i, used_chans = 0;
+ unsigned chans = prev_chans;
+
+ for (i = 0; i < ra_tune; ++i) {
+ used_chans |= chans;
+ chans >>= 4;
+ }
+
+ return (~used_chans) & 0xF;
+}
+
} // namespace r600_sb
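
For reference, here is how the two helpers implemented above behave over a run of scalar allocations with the cayman setting (ra_tune = 3). This is a standalone mirror of add_prev_chan() and get_preferable_chan_mask() written for illustration; it does not call into the pass, and it assumes the allocator always gets its first preferred channel.

#include <cstdio>

// Standalone mirror of the helpers above, for illustration only.
struct chan_history {
	unsigned prev_chans = 0;   // one nibble per allocation, one-hot channel bit
	unsigned tune = 3;         // cayman setting; 0 disables the preference

	void add_prev_chan(unsigned chan) {
		prev_chans = (prev_chans << 4) | (1u << chan);
	}

	unsigned get_preferable_chan_mask() const {
		unsigned used = 0, chans = prev_chans;
		for (unsigned i = 0; i < tune; ++i) {
			used |= chans;
			chans >>= 4;
		}
		return ~used & 0xF;   // channels not used by the last 'tune' allocations
	}
};

int main() {
	chan_history h;
	for (int i = 0; i < 8; ++i) {
		unsigned mask = h.get_preferable_chan_mask();
		unsigned chan = __builtin_ctz(mask);  // take the first preferred channel
		printf("preferred mask 0x%X -> chan %u\n", mask, chan);
		h.add_prev_chan(chan);
	}
	// Prints a repeating x, y, z, w pattern: with ra_tune = 3 the preferred
	// mask always excludes the three most recently used channels.
}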