From 6b3462c784df961f22eea0c39490b38093086b83 Mon Sep 17 00:00:00 2001
From: David Chase
Date: Sat, 27 Feb 2016 11:54:52 -0500
Subject: [PATCH] [dev.ssa] cmd/compile: adjust branch likeliness for
 calls/loops

Static branch predictions (which guide block ordering) are adjusted
based on:

  loop/not-loop (favor looping)
  abnormal-exit/not (avoid panic)
  call/not-call (avoid call)
  ret/default (treat returns as rare)

This appears to make no difference in performance of real code,
meaning the compiler itself. The earlier version of this has been
stripped down to help make the cost of this only-aesthetic-on-Intel
phase be as cheap as possible (we probably want information about
inner loops for improving register allocation, but because register
allocation follows close behind this pass, conceivably the
information could be reused -- so we might do this anyway just to
normalize output).

For a ./make.bash that takes 200 user seconds, about .75 second is
reported in likelyadjust (summing nanoseconds reported with
-d=ssa/likelyadjust/time).

Upstream predictions are respected. Includes test, limited to build
on amd64 only.

Did several iterations on the debugging output to allow some rough
checks on behavior. Debug=1 logging notes agree/disagree with
earlier passes, allowing analysis like the following:

Run on make.bash:
  GO_GCFLAGS=-d=ssa/likelyadjust/debug \
     ./make.bash >& lkly5.log

  grep 'ranch prediction' lkly5.log | wc -l
     78242  // 78k predictions

  grep 'ranch predi' lkly5.log | egrep -v 'agrees with' | wc -l
     29633  // 29k NEW predictions

  grep 'disagrees' lkly5.log | wc -l
       444  // contradicted 444 times

  grep '< exit' lkly5.log | wc -l
     10212  // 10k exit predictions

  grep '< exit' lkly5.log | egrep 'disagrees' | wc -l
         5  // 5 contradicted by previous prediction

  grep '< exit' lkly5.log | egrep -v 'agrees' | wc -l
       702  // 702-5 redundant with previous prediction

  grep '< call' lkly5.log | egrep -v 'agrees' | wc -l
     16699  // 16k new call predictions

  grep 'stay in loop' lkly5.log | egrep -v 'agrees' | wc -l
      3951  // 4k new "remain in loop" predictions

Fixes #11451.

Change-Id: Iafb0504f7030d304ef4b6dc1aba9a5789151a593
Reviewed-on: https://go-review.googlesource.com/19995
Run-TryBot: David Chase
Reviewed-by: Keith Randall
---
 src/cmd/compile/internal/ssa/TODO            |   3 +-
 src/cmd/compile/internal/ssa/compile.go      |   3 +-
 src/cmd/compile/internal/ssa/likelyadjust.go | 300 +++++++++++++++++++
 test/opt_branchlikely.go                     |  85 ++++++
 4 files changed, 388 insertions(+), 3 deletions(-)
 create mode 100755 src/cmd/compile/internal/ssa/likelyadjust.go
 create mode 100644 test/opt_branchlikely.go

diff --git a/src/cmd/compile/internal/ssa/TODO b/src/cmd/compile/internal/ssa/TODO
index 4e39d1e9c3..a457e67101 100644
--- a/src/cmd/compile/internal/ssa/TODO
+++ b/src/cmd/compile/internal/ssa/TODO
@@ -24,7 +24,7 @@ Optimizations (better compiled code)
 - Figure out how to make PARAMOUT variables ssa-able.
   They need to get spilled automatically at end-of-function somehow.
 - If strings are being passed around without being interpreted (ptr
-  and len feilds being accessed) pass them in xmm registers?
+  and len fields being accessed) pass them in xmm registers?
   Same for interfaces?
 - OpArrayIndex should take its index in AuxInt, not a full value.
 - remove FLAGS from REP instruction clobbers
@@ -32,7 +32,6 @@ Optimizations (better compiled code)
   Note that this is challenging for ops that generate flags
   because flagalloc wants to move those instructions around for
   flag regeneration.
-- In forms like if ... { call } else { no call }, mark the call branch as unlikely.
 - Non-constant rotate detection.
 - Do 0 <= x && x < n with one unsigned compare
 - nil-check removal in indexed load/store case:
diff --git a/src/cmd/compile/internal/ssa/compile.go b/src/cmd/compile/internal/ssa/compile.go
index 2780e5bcfc..f68819c3c2 100644
--- a/src/cmd/compile/internal/ssa/compile.go
+++ b/src/cmd/compile/internal/ssa/compile.go
@@ -178,7 +178,8 @@ var passes = [...]pass{
 	{name: "late phielim", fn: phielim},
 	{name: "late copyelim", fn: copyelim},
 	{name: "late deadcode", fn: deadcode},
-	{name: "critical", fn: critical, required: true},   // remove critical edges
+	{name: "critical", fn: critical, required: true}, // remove critical edges
+	{name: "likelyadjust", fn: likelyadjust},
 	{name: "layout", fn: layout, required: true},       // schedule blocks
 	{name: "schedule", fn: schedule, required: true},   // schedule values
 	{name: "flagalloc", fn: flagalloc, required: true}, // allocate flags register
diff --git a/src/cmd/compile/internal/ssa/likelyadjust.go b/src/cmd/compile/internal/ssa/likelyadjust.go
new file mode 100755
index 0000000000..6ce8705272
--- /dev/null
+++ b/src/cmd/compile/internal/ssa/likelyadjust.go
@@ -0,0 +1,300 @@
+// Copyright 2016 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package ssa
+
+import (
+	"fmt"
+)
+
+type loop struct {
+	header *Block // The header node of this (reducible) loop
+	outer  *loop  // loop containing this loop
+	// Next two fields not currently used, but cheap to maintain,
+	// and aid in computation of inner-ness and list of blocks.
+	nBlocks int32 // Number of blocks in this loop but not within inner loops
+	isInner bool  // True if never discovered to contain a loop
+}
+
+// outerinner records that outer contains inner
+func (sdom sparseTree) outerinner(outer, inner *loop) {
+	oldouter := inner.outer
+	if oldouter == nil || sdom.isAncestorEq(oldouter.header, outer.header) {
+		inner.outer = outer
+		outer.isInner = false
+	}
+}
+
+type loopnest struct {
+	f     *Func
+	b2l   []*loop
+	po    []*Block
+	sdom  sparseTree
+	loops []*loop
+}
+
+func min8(a, b int8) int8 {
+	if a < b {
+		return a
+	}
+	return b
+}
+
+func max8(a, b int8) int8 {
+	if a > b {
+		return a
+	}
+	return b
+}
+
+const (
+	blDEFAULT = 0
+	blMin     = blDEFAULT
+	blCALL    = 1
+	blRET     = 2
+	blEXIT    = 3
+)
+
+var bllikelies [4]string = [4]string{"default", "call", "ret", "exit"}
+
+func describePredictionAgrees(b *Block, prediction BranchPrediction) string {
+	s := ""
+	if prediction == b.Likely {
+		s = " (agrees with previous)"
+	} else if b.Likely != BranchUnknown {
+		s = " (disagrees with previous, ignored)"
+	}
+	return s
+}
+
+func describeBranchPrediction(f *Func, b *Block, likely, not int8, prediction BranchPrediction) {
+	f.Config.Warnl(int(b.Line), "Branch prediction rule %s < %s%s",
+		bllikelies[likely-blMin], bllikelies[not-blMin], describePredictionAgrees(b, prediction))
+}
+
+func likelyadjust(f *Func) {
+	// The values assigned to certain and local only matter
+	// in their rank order. 0 is default, more positive
+	// is less likely. It's possible to assign a negative
+	// unlikeliness (though not currently the case).
+	certain := make([]int8, f.NumBlocks()) // In the long run, all outcomes are at least this bad. Mainly for Exit
+	local := make([]int8, f.NumBlocks())   // for our immediate predecessors.
+
+	nest := loopnestfor(f)
+	po := nest.po
+	b2l := nest.b2l
+
+	for _, b := range po {
+		switch b.Kind {
+		case BlockExit:
+			// Very unlikely.
+			local[b.ID] = blEXIT
+			certain[b.ID] = blEXIT
+
+			// Ret, it depends.
+		case BlockRet, BlockRetJmp:
+			local[b.ID] = blRET
+			certain[b.ID] = blRET
+
+			// Calls. TODO not all calls are equal, names give useful clues.
+			// Any name-based heuristics are only relative to other calls,
+			// and less influential than inferences from loop structure.
+		case BlockCall:
+			local[b.ID] = blCALL
+			certain[b.ID] = max8(blCALL, certain[b.Succs[0].ID])
+
+		default:
+			if len(b.Succs) == 1 {
+				certain[b.ID] = certain[b.Succs[0].ID]
+			} else if len(b.Succs) == 2 {
+				// If successor is an unvisited backedge, it's in loop and we don't care.
+				// Its default unlikely is also zero which is consistent with favoring loop edges.
+				// Notice that this can act like a "reset" on unlikeliness at loops; the
+				// default "everything returns" unlikeliness is erased by min with the
+				// backedge likeliness; however a loop with calls on every path will be
+				// tagged with call cost. Net effect is that loop entry is favored.
+				b0 := b.Succs[0].ID
+				b1 := b.Succs[1].ID
+				certain[b.ID] = min8(certain[b0], certain[b1])
+
+				l := b2l[b.ID]
+				l0 := b2l[b0]
+				l1 := b2l[b1]
+
+				prediction := b.Likely
+				// Weak loop heuristic -- both source and at least one dest are in loops,
+				// and there is a difference in the destinations.
+				// TODO what is best arrangement for nested loops?
+				if l != nil && l0 != l1 {
+					noprediction := false
+					switch {
+					// prefer not to exit loops
+					case l1 == nil:
+						prediction = BranchLikely
+					case l0 == nil:
+						prediction = BranchUnlikely
+
+					// prefer to stay in loop, not exit to outer.
+					case l == l0:
+						prediction = BranchLikely
+					case l == l1:
+						prediction = BranchUnlikely
+					default:
+						noprediction = true
+					}
+					if f.pass.debug > 0 && !noprediction {
+						f.Config.Warnl(int(b.Line), "Branch prediction rule stay in loop%s",
+							describePredictionAgrees(b, prediction))
+					}
+
+				} else {
+					// Lacking loop structure, fall back on heuristics.
+					if certain[b1] > certain[b0] {
+						prediction = BranchLikely
+						if f.pass.debug > 0 {
+							describeBranchPrediction(f, b, certain[b0], certain[b1], prediction)
+						}
+					} else if certain[b0] > certain[b1] {
+						prediction = BranchUnlikely
+						if f.pass.debug > 0 {
+							describeBranchPrediction(f, b, certain[b1], certain[b0], prediction)
+						}
+					} else if local[b1] > local[b0] {
+						prediction = BranchLikely
+						if f.pass.debug > 0 {
+							describeBranchPrediction(f, b, local[b0], local[b1], prediction)
+						}
+					} else if local[b0] > local[b1] {
+						prediction = BranchUnlikely
+						if f.pass.debug > 0 {
+							describeBranchPrediction(f, b, local[b1], local[b0], prediction)
+						}
+					}
+				}
+				if b.Likely != prediction {
+					if b.Likely == BranchUnknown {
+						b.Likely = prediction
+					}
+				}
+			}
+		}
+		if f.pass.debug > 2 {
+			f.Config.Warnl(int(b.Line), "BP: Block %s, local=%s, certain=%s", b, bllikelies[local[b.ID]-blMin], bllikelies[certain[b.ID]-blMin])
+		}
+
+	}
+}
+
+func (l *loop) String() string {
+	return fmt.Sprintf("hdr:%s", l.header)
+}
+
+func (l *loop) LongString() string {
+	i := ""
+	o := ""
+	if l.isInner {
+		i = ", INNER"
+	}
+	if l.outer != nil {
+		o = ", o=" + l.outer.header.String()
+	}
+	return fmt.Sprintf("hdr:%s%s%s", l.header, i, o)
+}
+
+// nearestOuterLoop returns the outer loop of loop most nearly
+// containing block b; the header must dominate b. loop itself
+// is assumed to not be that loop. For acceptable performance,
+// we're relying on loop nests to not be terribly deep.
+func (l *loop) nearestOuterLoop(sdom sparseTree, b *Block) *loop {
+	var o *loop
+	for o = l.outer; o != nil && !sdom.isAncestorEq(o.header, b); o = o.outer {
+	}
+	return o
+}
+
+func loopnestfor(f *Func) *loopnest {
+	po := postorder(f)
+	dom := dominators(f)
+	sdom := newSparseTree(f, dom)
+	b2l := make([]*loop, f.NumBlocks())
+	loops := make([]*loop, 0)
+
+	// Reducible-loop-nest-finding.
+	for _, b := range po {
+		if f.pass.debug > 3 {
+			fmt.Printf("loop finding (0) at %s\n", b)
+		}
+
+		var innermost *loop // innermost header reachable from this block
+
+		// IF any successor s of b is in a loop headed by h
+		// AND h dominates b
+		// THEN b is in the loop headed by h.
+		//
+		// Choose the first/innermost such h.
+		//
+		// IF s itself dominates b, then s is a loop header;
+		// and there may be more than one such s.
+		// Since there's at most 2 successors, the inner/outer ordering
+		// between them can be established with simple comparisons.
+		for _, bb := range b.Succs {
+			l := b2l[bb.ID]
+
+			if sdom.isAncestorEq(bb, b) { // Found a loop header
+				if l == nil {
+					l = &loop{header: bb, isInner: true}
+					loops = append(loops, l)
+					b2l[bb.ID] = l
+				}
+			} else { // Perhaps a loop header is inherited.
+				// is there any loop containing our successor whose
+				// header dominates b?
+				if l != nil && !sdom.isAncestorEq(l.header, b) {
+					l = l.nearestOuterLoop(sdom, b)
+				}
+			}
+
+			if l == nil || innermost == l {
+				continue
+			}
+
+			if innermost == nil {
+				innermost = l
+				continue
+			}
+
+			if sdom.isAncestor(innermost.header, l.header) {
+				sdom.outerinner(innermost, l)
+				innermost = l
+			} else if sdom.isAncestor(l.header, innermost.header) {
+				sdom.outerinner(l, innermost)
+			}
+		}
+
+		if innermost != nil {
+			b2l[b.ID] = innermost
+			innermost.nBlocks++
+		}
+	}
+	if f.pass.debug > 1 && len(loops) > 0 {
+		fmt.Printf("Loops in %s:\n", f.Name)
+		for _, l := range loops {
+			fmt.Printf("%s, b=", l.LongString())
+			for _, b := range f.Blocks {
+				if b2l[b.ID] == l {
+					fmt.Printf(" %s", b)
+				}
+			}
+			fmt.Print("\n")
+		}
+		fmt.Printf("Nonloop blocks in %s:", f.Name)
+		for _, b := range f.Blocks {
+			if b2l[b.ID] == nil {
+				fmt.Printf(" %s", b)
+			}
+		}
+		fmt.Print("\n")
+	}
+	return &loopnest{f, b2l, po, sdom, loops}
+}
diff --git a/test/opt_branchlikely.go b/test/opt_branchlikely.go
new file mode 100644
index 0000000000..99e914654f
--- /dev/null
+++ b/test/opt_branchlikely.go
@@ -0,0 +1,85 @@
+// +build amd64
+// errorcheck -0 -d=ssa/likelyadjust/debug=1
+
+// Copyright 2016 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// Test that branches have some prediction properties.
+package foo
+
+func f(x, y, z int) int {
+	a := 0
+	for i := 0; i < x; i++ { // ERROR "Branch prediction rule stay in loop"
+		for j := 0; j < y; j++ { // ERROR "Branch prediction rule stay in loop"
+			a += j
+		}
+		for k := 0; k < z; k++ { // ERROR "Branch prediction rule stay in loop"
+			a -= x + y + z
+		}
+	}
+	return a
+}
+
+func g(x, y, z int) int {
+	a := 0
+	if y == 0 { // ERROR "Branch prediction rule default < call"
+		y = g(y, z, x)
+	} else {
+		y++
+	}
+	if y == x { // ERROR "Branch prediction rule default < call"
+		y = g(y, z, x)
+	} else {
+	}
+	if y == 2 { // ERROR "Branch prediction rule default < call"
+		z++
+	} else {
+		y = g(z, x, y)
+	}
+	if y+z == 3 { // ERROR "Branch prediction rule call < exit"
+		println("ha ha")
+	} else {
+		panic("help help help")
+	}
+	if x != 0 { // ERROR "Branch prediction rule default < ret"
+		for i := 0; i < x; i++ { // ERROR "Branch prediction rule stay in loop"
+			if x == 4 { // ERROR "Branch prediction rule stay in loop"
+				return a
+			}
+			for j := 0; j < y; j++ { // ERROR "Branch prediction rule stay in loop"
+				for k := 0; k < z; k++ { // ERROR "Branch prediction rule stay in loop"
+					a -= j * i
+				}
+				a += j
+			}
+		}
+	}
+	return a
+}
+
+func h(x, y, z int) int {
+	a := 0
+	for i := 0; i < x; i++ { // ERROR "Branch prediction rule stay in loop"
+		for j := 0; j < y; j++ { // ERROR "Branch prediction rule stay in loop"
+			a += j
+			if i == j { // ERROR "Branch prediction rule stay in loop"
+				break
+			}
+			a *= j
+		}
+		for k := 0; k < z; k++ { // ERROR "Branch prediction rule stay in loop"
+			a -= k
+			if i == k {
+				continue
+			}
+			a *= k
+		}
+	}
+	if a > 0 { // ERROR "Branch prediction rule default < call"
+		a = g(x, y, z)
+	} else {
+		a = -a
+	}
+	return a
+}
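
The heuristic described in the commit message and implemented by likelyadjust above
can be illustrated outside the compiler. The following is a minimal standalone sketch,
not part of the patch: the badness ranking (default < call < ret < exit), the backward
propagation of worst-case badness, and the "predict the less bad successor" rule mirror
the certain/local arrays and min8/max8 helpers in the pass, but the types and names
(badness, toyBlock, rank, predict) are invented here for illustration.

package main

import "fmt"

// badness mirrors the bl* ranking in the patch: larger values mark
// outcomes we would rather not fall through to.
type badness int8

const (
	blDefault badness = iota // ordinary code
	blCall                   // block contains a call
	blRet                    // block returns
	blExit                   // block panics or otherwise aborts
)

// toyBlock is a stand-in for an SSA basic block with at most two successors.
type toyBlock struct {
	name  string
	kind  badness
	succs []*toyBlock
	worst badness // worst outcome reachable from here, like the "certain" array
}

func minB(a, b badness) badness {
	if a < b {
		return a
	}
	return b
}

func maxB(a, b badness) badness {
	if a > b {
		return a
	}
	return b
}

// rank fills in worst for a block whose successors have already been
// ranked, mimicking the postorder sweep in likelyadjust.
func rank(b *toyBlock) {
	b.worst = b.kind
	switch len(b.succs) {
	case 1:
		b.worst = maxB(b.kind, b.succs[0].worst)
	case 2:
		// A two-way branch is only as bad as its better side.
		b.worst = maxB(b.kind, minB(b.succs[0].worst, b.succs[1].worst))
	}
}

// predict returns the successor of a two-way branch that should be laid
// out as the likely path: the one with the smaller worst-case badness.
func predict(b *toyBlock) *toyBlock {
	if len(b.succs) != 2 {
		return nil
	}
	if b.succs[1].worst < b.succs[0].worst {
		return b.succs[1]
	}
	return b.succs[0]
}

func main() {
	// if ok { println(...) } else { panic(...) }:
	// the panic side ranks as exit, the println side as call, so the
	// call side is predicted likely -- the "call < exit" rule in the test.
	okSide := &toyBlock{name: "ok", kind: blCall}
	panicSide := &toyBlock{name: "panic", kind: blExit}
	cond := &toyBlock{name: "cond", kind: blDefault, succs: []*toyBlock{okSide, panicSide}}

	rank(okSide)
	rank(panicSide)
	rank(cond)
	fmt.Println("likely successor:", predict(cond).name) // likely successor: ok
}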