From 9e0149da3c25c3a2399e0f578549f252c186491c Mon Sep 17 00:00:00 2001
From: David Chase <drchase@google.com>
Date: Tue, 27 Sep 2022 15:47:20 -0400
Subject: [PATCH] cmd/compile: fuse largest possible runs of plain blocks

This is predicted to reduce allocation, hence GC time.
(And it does.)

Change-Id: I30a46805b81e5ecd3fd7a6737f60ec26ef0498b1
Reviewed-on: https://go-review.googlesource.com/c/go/+/434796
Reviewed-by: Keith Randall <khr@golang.org>
Reviewed-by: Keith Randall <khr@google.com>
---
 src/cmd/compile/internal/ssa/fuse.go | 157 ++++++++++++++++++++-------
 1 file changed, 117 insertions(+), 40 deletions(-)

diff --git a/src/cmd/compile/internal/ssa/fuse.go b/src/cmd/compile/internal/ssa/fuse.go
index 2b176dfa7b..6d3fb70780 100644
--- a/src/cmd/compile/internal/ssa/fuse.go
+++ b/src/cmd/compile/internal/ssa/fuse.go
@@ -6,6 +6,7 @@ package ssa
 
 import (
 	"cmd/internal/src"
+	"fmt"
 )
 
 // fuseEarly runs fuse(f, fuseTypePlain|fuseTypeIntInRange).
@@ -28,7 +29,9 @@ const (
 func fuse(f *Func, typ fuseType) {
 	for changed := true; changed; {
 		changed = false
-		// Fuse from end to beginning, to avoid quadratic behavior in fuseBlockPlain. See issue 13554.
+		// Be sure to avoid quadratic behavior in fuseBlockPlain. See issue 13554.
+		// Previously this was dealt with using backwards iteration, now fuseBlockPlain
+		// handles large runs of blocks.
 		for i := len(f.Blocks) - 1; i >= 0; i-- {
 			b := f.Blocks[i]
 			if typ&fuseTypeIf != 0 {
@@ -44,6 +47,7 @@ func fuse(f *Func, typ fuseType) {
 				changed = shortcircuitBlock(b) || changed
 			}
 		}
+
 		if typ&fuseTypeBranchRedirect != 0 {
 			changed = fuseBranchRedirect(f) || changed
 		}
@@ -172,65 +176,134 @@ func isEmpty(b *Block) bool {
 	return true
 }
 
+// fuseBlockPlain handles a run of blocks with length >= 2,
+// whose interior has single predecessors and successors,
+// b must be BlockPlain, allowing it to be any node except the
+// last (multiple successors means not BlockPlain).
+// Cycles are handled and merged into b's successor.
 func fuseBlockPlain(b *Block) bool {
 	if b.Kind != BlockPlain {
 		return false
 	}
 
 	c := b.Succs[0].b
-	if len(c.Preds) != 1 {
+	if len(c.Preds) != 1 || c == b { // At least 2 distinct blocks.
 		return false
 	}
 
-	// If a block happened to end in a statement marker,
-	// try to preserve it.
-	if b.Pos.IsStmt() == src.PosIsStmt {
-		l := b.Pos.Line()
-		for _, v := range c.Values {
-			if v.Pos.IsStmt() == src.PosNotStmt {
-				continue
+	// find earliest block in run.  Avoid simple cycles.
+	for len(b.Preds) == 1 && b.Preds[0].b != c && b.Preds[0].b.Kind == BlockPlain {
+		b = b.Preds[0].b
+	}
+
+	// find latest block in run.  Still beware of simple cycles.
+	for {
+		if c.Kind != BlockPlain {
+			break
+		} // Has exactly 1 successor
+		cNext := c.Succs[0].b
+		if cNext == b {
+			break
+		} // not a cycle
+		if len(cNext.Preds) != 1 {
+			break
+		} // no other incoming edge
+		c = cNext
+	}
+
+	// Try to preserve any statement marks on the ends of blocks; move values to C
+	var b_next *Block
+	for bx := b; bx != c; bx = b_next {
+		// For each bx with an end-of-block statement marker,
+		// try to move it to a value in the next block,
+		// or to the next block's end, if possible.
+		b_next = bx.Succs[0].b
+		if bx.Pos.IsStmt() == src.PosIsStmt {
+			l := bx.Pos.Line() // looking for another place to mark for line l
+			outOfOrder := false
+			for _, v := range b_next.Values {
+				if v.Pos.IsStmt() == src.PosNotStmt {
+					continue
+				}
+				if l == v.Pos.Line() { // Found a Value with same line, therefore done.
+					v.Pos = v.Pos.WithIsStmt()
+					l = 0
+					break
+				}
+				if l < v.Pos.Line() {
+					// The order of values in a block is not specified so OOO in a block is not interesting,
+					// but they do all come before the end of the block, so this disqualifies attaching to end of b_next.
+					outOfOrder = true
+				}
 			}
-			if l == v.Pos.Line() {
-				v.Pos = v.Pos.WithIsStmt()
-				l = 0
-				break
+			if l != 0 && !outOfOrder && (b_next.Pos.Line() == l || b_next.Pos.IsStmt() != src.PosIsStmt) {
+				b_next.Pos = bx.Pos.WithIsStmt()
 			}
 		}
-		if l != 0 && c.Pos.Line() == l {
-			c.Pos = c.Pos.WithIsStmt()
+		// move all of bx's values to c (note containing loop excludes c)
+		for _, v := range bx.Values {
+			v.Block = c
 		}
 	}
 
-	// move all of b's values to c.
-	for _, v := range b.Values {
-		v.Block = c
+	// Compute the total number of values and find the largest value slice in the run, to maximize chance of storage reuse.
+	total := 0
+	totalBeforeMax := 0 // number of elements preceding the maximum block (i.e. its position in the result).
+	max_b := b          // block with maximum capacity
+
+	for bx := b; ; bx = bx.Succs[0].b {
+		if cap(bx.Values) > cap(max_b.Values) {
+			totalBeforeMax = total
+			max_b = bx
+		}
+		total += len(bx.Values)
+		if bx == c {
+			break
+		}
 	}
-	// Use whichever value slice is larger, in the hopes of avoiding growth.
-	// However, take care to avoid c.Values pointing to b.valstorage.
+
+	// Use c's storage if fused blocks will fit, else use the max if that will fit, else allocate new storage.
+
+	// Take care to avoid c.Values pointing to b.valstorage.
 	// See golang.org/issue/18602.
+
 	// It's important to keep the elements in the same order; maintenance of
 	// debugging information depends on the order of *Values in Blocks.
 	// This can also cause changes in the order (which may affect other
 	// optimizations and possibly compiler output) for 32-vs-64 bit compilation
 	// platforms (word size affects allocation bucket size affects slice capacity).
-	if cap(c.Values) >= cap(b.Values) || len(b.Values) <= len(b.valstorage) {
-		bl := len(b.Values)
-		cl := len(c.Values)
-		var t []*Value // construct t = b.Values followed-by c.Values, but with attention to allocation.
-		if cap(c.Values) < bl+cl {
-			// reallocate
-			t = make([]*Value, bl+cl)
-		} else {
-			// in place.
-			t = c.Values[0 : bl+cl]
-		}
-		copy(t[bl:], c.Values) // possibly in-place
-		c.Values = t
-		copy(c.Values, b.Values)
+
+	// figure out what slice will hold the values,
+	// preposition the destination elements if not allocating new storage
+	var t []*Value
+	if total <= len(c.valstorage) {
+		t = c.valstorage[:total]
+		max_b = c
+		totalBeforeMax = total - len(c.Values)
+		copy(t[totalBeforeMax:], c.Values)
+	} else if total <= cap(max_b.Values) { // in place, somewhere
+		t = max_b.Values[0:total]
+		copy(t[totalBeforeMax:], max_b.Values)
 	} else {
-		c.Values = append(b.Values, c.Values...)
+		t = make([]*Value, total)
+		max_b = nil
 	}
 
+	// copy the values
+	copyTo := 0
+	for bx := b; ; bx = bx.Succs[0].b {
+		if bx != max_b {
+			copy(t[copyTo:], bx.Values)
+		} else if copyTo != totalBeforeMax { // trust but verify.
+			panic(fmt.Errorf("totalBeforeMax (%d) != copyTo (%d), max_b=%v, b=%v, c=%v", totalBeforeMax, copyTo, max_b, b, c))
+		}
+		if bx == c {
+			break
+		}
+		copyTo += len(bx.Values)
+	}
+	c.Values = t
+
 	// replace b->c edge with preds(b) -> c
 	c.predstorage[0] = Edge{}
 	if len(b.Preds) > len(b.predstorage) {
@@ -247,10 +320,14 @@ func fuseBlockPlain(b *Block) bool {
 		f.Entry = c
 	}
 
-	// trash b, just in case
-	b.Kind = BlockInvalid
-	b.Values = nil
-	b.Preds = nil
-	b.Succs = nil
+	// trash b's fields, just in case
+	for bx := b; bx != c; bx = b_next {
+		b_next = bx.Succs[0].b
+
+		bx.Kind = BlockInvalid
+		bx.Values = nil
+		bx.Preds = nil
+		bx.Succs = nil
+	}
 	return true
 }