From 12befc3ce3f44d500174d2b4a0aa524feb74e16b Mon Sep 17 00:00:00 2001 From: Keith Randall Date: Tue, 17 Nov 2020 15:32:45 -0800 Subject: [PATCH] cmd/compile: improve scheduling pass Convert the scheduling pass from scheduling backwards to scheduling forwards. Forward scheduling makes it easier to prioritize scheduling values as soon as they are ready, which is important for things like nil checks, select ops, etc. Forward scheduling is also quite a bit clearer. It was originally backwards because computing uses is tricky, but I found a way to do it simply and with n lg n complexity. The new scheme also makes it easy to add new scheduling edges if needed. Fixes #42673 Update #56568 Change-Id: Ibbb38c52d191f50ce7a94f8c1cbd3cd9b614ea8b Reviewed-on: https://go-review.googlesource.com/c/go/+/270940 TryBot-Result: Gopher Robot Reviewed-by: Keith Randall Run-TryBot: Keith Randall Reviewed-by: David Chase --- .../compile/internal/ssa/_gen/RISCV64Ops.go | 2 +- src/cmd/compile/internal/ssa/_gen/S390XOps.go | 4 +- .../compile/internal/ssa/debug_lines_test.go | 2 +- src/cmd/compile/internal/ssa/opGen.go | 6 +- src/cmd/compile/internal/ssa/schedule.go | 379 ++++++------------ src/cmd/compile/internal/ssagen/ssa.go | 3 +- test/nilptr3.go | 7 + 7 files changed, 142 insertions(+), 261 deletions(-) diff --git a/src/cmd/compile/internal/ssa/_gen/RISCV64Ops.go b/src/cmd/compile/internal/ssa/_gen/RISCV64Ops.go index 82d21b48f0..bc47e1b441 100644 --- a/src/cmd/compile/internal/ssa/_gen/RISCV64Ops.go +++ b/src/cmd/compile/internal/ssa/_gen/RISCV64Ops.go @@ -169,7 +169,7 @@ func init() { {name: "REMW", argLength: 2, reg: gp21, asm: "REMW", typ: "Int32"}, {name: "REMUW", argLength: 2, reg: gp21, asm: "REMUW", typ: "UInt32"}, - {name: "MOVaddr", argLength: 1, reg: gp11sb, asm: "MOV", aux: "SymOff", rematerializeable: true, symEffect: "RdWr"}, // arg0 + auxint + offset encoded in aux + {name: "MOVaddr", argLength: 1, reg: gp11sb, asm: "MOV", aux: "SymOff", rematerializeable: true, symEffect: "Addr"}, // arg0 + auxint + offset encoded in aux // auxint+aux == add auxint and the offset of the symbol in aux (if any) to the effective address {name: "MOVDconst", reg: gp01, asm: "MOV", typ: "UInt64", aux: "Int64", rematerializeable: true}, // auxint diff --git a/src/cmd/compile/internal/ssa/_gen/S390XOps.go b/src/cmd/compile/internal/ssa/_gen/S390XOps.go index f2184ad11f..636893a3c5 100644 --- a/src/cmd/compile/internal/ssa/_gen/S390XOps.go +++ b/src/cmd/compile/internal/ssa/_gen/S390XOps.go @@ -418,8 +418,8 @@ func init() { {name: "LEDBR", argLength: 1, reg: fp11, asm: "LEDBR"}, // convert float64 to float32 {name: "LDEBR", argLength: 1, reg: fp11, asm: "LDEBR"}, // convert float32 to float64 - {name: "MOVDaddr", argLength: 1, reg: addr, aux: "SymOff", rematerializeable: true, symEffect: "Read"}, // arg0 + auxint + offset encoded in aux - {name: "MOVDaddridx", argLength: 2, reg: addridx, aux: "SymOff", symEffect: "Read"}, // arg0 + arg1 + auxint + aux + {name: "MOVDaddr", argLength: 1, reg: addr, aux: "SymOff", rematerializeable: true, symEffect: "Addr"}, // arg0 + auxint + offset encoded in aux + {name: "MOVDaddridx", argLength: 2, reg: addridx, aux: "SymOff", symEffect: "Addr"}, // arg0 + arg1 + auxint + aux // auxint+aux == add auxint and the offset of the symbol in aux (if any) to the effective address {name: "MOVBZload", argLength: 2, reg: gpload, asm: "MOVBZ", aux: "SymOff", typ: "UInt8", faultOnNilArg0: true, symEffect: "Read"}, // load byte from arg0+auxint+aux. arg1=mem. Zero extend. 
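The approach described in the commit message, forward scheduling driven by a priority queue over explicit dependency edges, with the edges sorted by source ID so that each value's outgoing edges can be found by binary search, can be illustrated in isolation. Below is a minimal sketch of that idea, not the compiler's own code (the real implementation is in the schedule.go hunk further down); the node, nodeHeap, and schedule names here are invented for the illustration.

// Minimal sketch of forward list scheduling with a priority queue.
package main

import (
	"container/heap"
	"fmt"
	"sort"
)

type node struct {
	id    int
	score int // lower scores are scheduled earlier
}

type nodeHeap []*node

func (h nodeHeap) Len() int      { return len(h) }
func (h nodeHeap) Swap(i, j int) { h[i], h[j] = h[j], h[i] }
func (h nodeHeap) Less(i, j int) bool {
	if h[i].score != h[j].score {
		return h[i].score < h[j].score
	}
	return h[i].id < h[j].id // deterministic tie-break
}
func (h *nodeHeap) Push(x any) { *h = append(*h, x.(*node)) }
func (h *nodeHeap) Pop() any {
	old := *h
	n := old[len(old)-1]
	*h = old[:len(old)-1]
	return n
}

// schedule returns the nodes in an order that respects every edge x->y
// (x must come before y), preferring lower scores among the ready set.
func schedule(nodes []*node, edges [][2]*node) []*node {
	// Sort edges by source ID so the outgoing edges of a node form a
	// contiguous run locatable by binary search; together with the heap
	// this gives the n lg n behavior mentioned in the commit message.
	sort.Slice(edges, func(i, j int) bool { return edges[i][0].id < edges[j][0].id })

	inEdges := map[int]int{} // incoming edges from not-yet-scheduled nodes
	for _, e := range edges {
		inEdges[e[1].id]++
	}

	pq := &nodeHeap{}
	for _, n := range nodes {
		if inEdges[n.id] == 0 {
			heap.Push(pq, n)
		}
	}

	var order []*node
	for pq.Len() > 0 {
		v := heap.Pop(pq).(*node)
		order = append(order, v)
		// The outgoing edges of v are the contiguous run edges[i:j].
		i := sort.Search(len(edges), func(k int) bool { return edges[k][0].id >= v.id })
		j := sort.Search(len(edges), func(k int) bool { return edges[k][0].id > v.id })
		for _, e := range edges[i:j] {
			inEdges[e[1].id]--
			if inEdges[e[1].id] == 0 {
				heap.Push(pq, e[1])
			}
		}
	}
	return order
}

func main() {
	a, b, c := &node{1, 2}, &node{2, 0}, &node{3, 1}
	// a must come before b and c; among ready nodes, lower score wins.
	for _, n := range schedule([]*node{a, b, c}, [][2]*node{{a, b}, {a, c}}) {
		fmt.Println(n.id)
	}
}

Running the sketch prints 1, 2, 3: a is the only node that starts with no incoming edges, and once it is scheduled its successors become ready and are emitted in score order.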
diff --git a/src/cmd/compile/internal/ssa/debug_lines_test.go b/src/cmd/compile/internal/ssa/debug_lines_test.go index ff651f6862..a9d33b6b0a 100644 --- a/src/cmd/compile/internal/ssa/debug_lines_test.go +++ b/src/cmd/compile/internal/ssa/debug_lines_test.go @@ -115,7 +115,7 @@ func TestInlineLines(t *testing.T) { t.Skip("only runs for amd64 unless -arch explicitly supplied") } - want := [][]int{{3}, {4, 10}, {4, 10, 16}, {4, 10}, {4, 11, 16}, {4, 11}, {4}, {5, 10}, {5, 10, 16}, {5, 10}, {5, 11, 16}, {5, 11}, {5}} + want := [][]int{{3}, {3}, {4, 10}, {4, 10, 16}, {4, 10}, {4, 11, 16}, {4, 11}, {4}, {5, 10}, {5, 10, 16}, {5, 10}, {5, 11, 16}, {5, 11}, {5}} testInlineStack(t, "inline-dump.go", "f", want) } diff --git a/src/cmd/compile/internal/ssa/opGen.go b/src/cmd/compile/internal/ssa/opGen.go index 597dc9c72e..9db2aec462 100644 --- a/src/cmd/compile/internal/ssa/opGen.go +++ b/src/cmd/compile/internal/ssa/opGen.go @@ -31047,7 +31047,7 @@ var opcodeTable = [...]opInfo{ auxType: auxSymOff, argLen: 1, rematerializeable: true, - symEffect: SymRdWr, + symEffect: SymAddr, asm: riscv.AMOV, reg: regInfo{ inputs: []inputInfo{ @@ -34771,7 +34771,7 @@ var opcodeTable = [...]opInfo{ auxType: auxSymOff, argLen: 1, rematerializeable: true, - symEffect: SymRead, + symEffect: SymAddr, reg: regInfo{ inputs: []inputInfo{ {0, 4295000064}, // SP SB @@ -34785,7 +34785,7 @@ var opcodeTable = [...]opInfo{ name: "MOVDaddridx", auxType: auxSymOff, argLen: 2, - symEffect: SymRead, + symEffect: SymAddr, reg: regInfo{ inputs: []inputInfo{ {0, 4295000064}, // SP SB diff --git a/src/cmd/compile/internal/ssa/schedule.go b/src/cmd/compile/internal/ssa/schedule.go index d88c33f304..a69e406df2 100644 --- a/src/cmd/compile/internal/ssa/schedule.go +++ b/src/cmd/compile/internal/ssa/schedule.go @@ -7,17 +7,16 @@ package ssa import ( "cmd/compile/internal/base" "cmd/compile/internal/types" + "cmd/internal/src" "container/heap" "sort" ) const ( - ScorePhi = iota // towards top of block - ScoreArg + ScorePhi = iota // towards top of block + ScoreArg // must occur at the top of the entry block + ScoreReadTuple // must occur immediately after tuple-generating insn (or call) ScoreNilCheck - ScoreReadTuple - ScoreVarDef - ScoreCarryChainTail ScoreMemory ScoreReadFlags ScoreDefault @@ -52,29 +51,36 @@ func (h ValHeap) Less(i, j int) bool { sx := h.score[x.ID] sy := h.score[y.ID] if c := sx - sy; c != 0 { - return c > 0 // higher score comes later. + return c < 0 // lower scores come earlier. } + // Note: only scores are required for correct scheduling. + // Everything else is just heuristics. + if x.Pos != y.Pos { // Favor in-order line stepping - return x.Pos.After(y.Pos) + if x.Block == x.Block.Func.Entry && x.Pos.IsStmt() != y.Pos.IsStmt() { + // In the entry block, put statement-marked instructions earlier. + return x.Pos.IsStmt() == src.PosIsStmt && y.Pos.IsStmt() != src.PosIsStmt + } + return x.Pos.Before(y.Pos) } if x.Op != OpPhi { if c := len(x.Args) - len(y.Args); c != 0 { - return c < 0 // smaller args comes later + return c > 0 // smaller args come later } } if c := x.Uses - y.Uses; c != 0 { - return c < 0 // smaller uses come later + return c > 0 // smaller uses come later } // These comparisons are fairly arbitrary. // The goal here is stability in the face // of unrelated changes elsewhere in the compiler. 
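	// With forward scheduling, Less(x, y) == true means x is popped from
	// the heap, and therefore placed in the block, before y. That is why
	// the sense of every comparison in this function is flipped relative
	// to the old backwards scheduler, which built the order in reverse
	// and then reversed it once the block was fully scheduled.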
if c := x.AuxInt - y.AuxInt; c != 0 { - return c > 0 + return c < 0 } if cmp := x.Type.Compare(y.Type); cmp != types.CMPeq { - return cmp == types.CMPgt + return cmp == types.CMPlt } - return x.ID > y.ID + return x.ID < y.ID } func (op Op) isLoweredGetClosurePtr() bool { @@ -93,11 +99,6 @@ func (op Op) isLoweredGetClosurePtr() bool { // reasonable valid schedule using a priority queue. TODO(khr): // schedule smarter. func schedule(f *Func) { - // For each value, the number of times it is used in the block - // by values that have not been scheduled yet. - uses := f.Cache.allocInt32Slice(f.NumValues()) - defer f.Cache.freeInt32Slice(uses) - // reusable priority queue priq := new(ValHeap) @@ -105,16 +106,9 @@ func schedule(f *Func) { score := f.Cache.allocInt8Slice(f.NumValues()) defer f.Cache.freeInt8Slice(score) - // scheduling order. We queue values in this list in reverse order. - // A constant bound allows this to be stack-allocated. 64 is - // enough to cover almost every schedule call. - order := make([]*Value, 0, 64) - // maps mem values to the next live memory value nextMem := f.Cache.allocValueSlice(f.NumValues()) defer f.Cache.freeValueSlice(nextMem) - // additional pretend arguments for each Value. Used to enforce load/store ordering. - additionalArgs := make([][]*Value, f.NumValues()) for _, b := range f.Blocks { // Compute score. Larger numbers are scheduled closer to the end of the block. @@ -129,68 +123,40 @@ func schedule(f *Func) { f.Fatalf("LoweredGetClosurePtr appeared outside of entry block, b=%s", b.String()) } score[v.ID] = ScorePhi - case v.Op == OpAMD64LoweredNilCheck || v.Op == OpPPC64LoweredNilCheck || - v.Op == OpARMLoweredNilCheck || v.Op == OpARM64LoweredNilCheck || - v.Op == Op386LoweredNilCheck || v.Op == OpMIPS64LoweredNilCheck || - v.Op == OpS390XLoweredNilCheck || v.Op == OpMIPSLoweredNilCheck || - v.Op == OpRISCV64LoweredNilCheck || v.Op == OpWasmLoweredNilCheck || - v.Op == OpLOONG64LoweredNilCheck: + case opcodeTable[v.Op].nilCheck: // Nil checks must come before loads from the same address. score[v.ID] = ScoreNilCheck case v.Op == OpPhi: // We want all the phis first. score[v.ID] = ScorePhi - case v.Op == OpVarDef: - // We want all the vardefs next. - score[v.ID] = ScoreVarDef case v.Op == OpArgIntReg || v.Op == OpArgFloatReg: // In-register args must be scheduled as early as possible to ensure that the // context register is not stomped. They should only appear in the entry block. if b != f.Entry { f.Fatalf("%s appeared outside of entry block, b=%s", v.Op, b.String()) } - score[v.ID] = ScorePhi + score[v.ID] = ScoreArg case v.Op == OpArg: // We want all the args as early as possible, for better debugging. score[v.ID] = ScoreArg case v.Type.IsMemory(): // Schedule stores as early as possible. This tends to - // reduce register pressure. It also helps make sure - // VARDEF ops are scheduled before the corresponding LEA. + // reduce register pressure. score[v.ID] = ScoreMemory case v.Op == OpSelect0 || v.Op == OpSelect1 || v.Op == OpSelectN: - if (v.Op == OpSelect1 || v.Op == OpSelect0) && (v.Args[0].isCarry() || v.Type.IsFlags()) { - // When the Select pseudo op is being used for a carry or flag from - // a tuple then score it as ScoreFlags so it happens later. This - // prevents the bit from being clobbered before it is used. - score[v.ID] = ScoreFlags - } else { - score[v.ID] = ScoreReadTuple - } - case v.isCarry(): - if w := v.getCarryInput(); w != nil && w.Block == b { - // The producing op is not the final user of the carry bit. 
Its - // current score is one of unscored, Flags, or CarryChainTail. - // These occur if the producer has not been scored, another user - // of the producers carry flag was scored (there are >1 users of - // the carry out flag), or it was visited earlier and already - // scored CarryChainTail (and prove w is not a tail). - score[w.ID] = ScoreFlags - } - // Verify v has not been scored. If v has not been visited, v may be - // the final (tail) operation in a carry chain. If v is not, v will be - // rescored above when v's carry-using op is scored. When scoring is done, - // only tail operations will retain the CarryChainTail score. - if score[v.ID] != ScoreFlags { - // Score the tail of carry chain operations to a lower (earlier in the - // block) priority. This creates a priority inversion which allows only - // one chain to be scheduled, if possible. - score[v.ID] = ScoreCarryChainTail - } + // Tuple selectors need to appear immediately after the instruction + // that generates the tuple. + score[v.ID] = ScoreReadTuple + case v.hasFlagInput(): + // Schedule flag-reading ops earlier, to minimize the lifetime + // of flag values. + score[v.ID] = ScoreReadFlags case v.isFlagOp(): // Schedule flag register generation as late as possible. // This makes sure that we only have one live flags // value at a time. + // Note that this case is afer the case above, so values + // which both read and generate flags are given ScoreReadFlags. score[v.ID] = ScoreFlags default: score[v.ID] = ScoreDefault @@ -202,23 +168,35 @@ func schedule(f *Func) { } } } + for _, c := range b.ControlValues() { + // Force the control values to be scheduled at the end, + // unless they have other special priority. + if c.Block != b || score[c.ID] < ScoreReadTuple { + continue + } + if score[c.ID] == ScoreReadTuple { + score[c.Args[0].ID] = ScoreControl + continue + } + score[c.ID] = ScoreControl + } } + priq.score = score + + // An edge represents a scheduling constraint that x must appear before y in the schedule. + type edge struct { + x, y *Value + } + edges := make([]edge, 0, 64) + + // inEdges is the number of scheduling edges incoming from values that haven't been scheduled yet. + // i.e. inEdges[y.ID] = |e in edges where e.y == y and e.x is not in the schedule yet|. + inEdges := f.Cache.allocInt32Slice(f.NumValues()) + defer f.Cache.freeInt32Slice(inEdges) for _, b := range f.Blocks { - // Find store chain for block. - // Store chains for different blocks overwrite each other, so - // the calculated store chain is good only for this block. - for _, v := range b.Values { - if v.Op != OpPhi && v.Type.IsMemory() { - for _, w := range v.Args { - if w.Type.IsMemory() { - nextMem[w.ID] = v - } - } - } - } - - // Compute uses. + edges = edges[:0] + // Standard edges: from the argument of a value to that value. for _, v := range b.Values { if v.Op == OpPhi { // If a value is used by a phi, it does not induce @@ -226,144 +204,81 @@ func schedule(f *Func) { // previous iteration. continue } - for _, w := range v.Args { - if w.Block == b { - uses[w.ID]++ - } - // Any load must come before the following store. - if !v.Type.IsMemory() && w.Type.IsMemory() { - // v is a load. 
- s := nextMem[w.ID] - if s == nil || s.Block != b { - continue - } - additionalArgs[s.ID] = append(additionalArgs[s.ID], v) - uses[v.ID]++ + for _, a := range v.Args { + if a.Block == b { + edges = append(edges, edge{a, v}) } } } - for _, c := range b.ControlValues() { - // Force the control values to be scheduled at the end, - // unless they are phi values (which must be first). - // OpArg also goes first -- if it is stack it register allocates - // to a LoadReg, if it is register it is from the beginning anyway. - if score[c.ID] == ScorePhi || score[c.ID] == ScoreArg { + // Find store chain for block. + // Store chains for different blocks overwrite each other, so + // the calculated store chain is good only for this block. + for _, v := range b.Values { + if v.Op != OpPhi && v.Op != OpInitMem && v.Type.IsMemory() { + nextMem[v.MemoryArg().ID] = v + } + } + + // Add edges to enforce that any load must come before the following store. + for _, v := range b.Values { + if v.Op == OpPhi || v.Type.IsMemory() { continue } - score[c.ID] = ScoreControl - - // Schedule values dependent on the control values at the end. - // This reduces the number of register spills. We don't find - // all values that depend on the controls, just values with a - // direct dependency. This is cheaper and in testing there - // was no difference in the number of spills. - for _, v := range b.Values { - if v.Op != OpPhi { - for _, a := range v.Args { - if a == c { - score[v.ID] = ScoreControl - } - } - } + w := v.MemoryArg() + if w == nil { + continue + } + if s := nextMem[w.ID]; s != nil && s.Block == b { + edges = append(edges, edge{v, s}) } } - // To put things into a priority queue - // The values that should come last are least. - priq.score = score - priq.a = priq.a[:0] + // Sort all the edges by source Value ID. + sort.Slice(edges, func(i, j int) bool { + return edges[i].x.ID < edges[j].x.ID + }) + // Compute inEdges for values in this block. + for _, e := range edges { + inEdges[e.y.ID]++ + } // Initialize priority queue with schedulable values. + priq.a = priq.a[:0] for _, v := range b.Values { - if uses[v.ID] == 0 { + if inEdges[v.ID] == 0 { heap.Push(priq, v) } } - // Schedule highest priority value, update use counts, repeat. - order = order[:0] - tuples := make(map[ID][]*Value) + // Produce the schedule. Pick the highest priority scheduleable value, + // add it to the schedule, add any of its uses that are now scheduleable + // to the queue, and repeat. + nv := len(b.Values) + b.Values = b.Values[:0] for priq.Len() > 0 { - // Find highest priority schedulable value. - // Note that schedule is assembled backwards. - + // Schedule the next schedulable value in priority order. v := heap.Pop(priq).(*Value) + b.Values = append(b.Values, v) - if f.pass.debug > 1 && score[v.ID] == ScoreCarryChainTail && v.isCarry() { - // Add some debugging noise if the chain of carrying ops will not - // likely be scheduled without potential carry flag clobbers. - if !isCarryChainReady(v, uses) { - f.Warnl(v.Pos, "carry chain ending with %v not ready", v) - } - } - - // Add it to the schedule. - // Do not emit tuple-reading ops until we're ready to emit the tuple-generating op. 
- //TODO: maybe remove ReadTuple score above, if it does not help on performance - switch { - case v.Op == OpSelect0: - if tuples[v.Args[0].ID] == nil { - tuples[v.Args[0].ID] = make([]*Value, 2) - } - tuples[v.Args[0].ID][0] = v - case v.Op == OpSelect1: - if tuples[v.Args[0].ID] == nil { - tuples[v.Args[0].ID] = make([]*Value, 2) - } - tuples[v.Args[0].ID][1] = v - case v.Op == OpSelectN: - if tuples[v.Args[0].ID] == nil { - tuples[v.Args[0].ID] = make([]*Value, v.Args[0].Type.NumFields()) - } - tuples[v.Args[0].ID][v.AuxInt] = v - case v.Type.IsResults() && tuples[v.ID] != nil: - tup := tuples[v.ID] - for i := len(tup) - 1; i >= 0; i-- { - if tup[i] != nil { - order = append(order, tup[i]) - } - } - delete(tuples, v.ID) - order = append(order, v) - case v.Type.IsTuple() && tuples[v.ID] != nil: - if tuples[v.ID][1] != nil { - order = append(order, tuples[v.ID][1]) - } - if tuples[v.ID][0] != nil { - order = append(order, tuples[v.ID][0]) - } - delete(tuples, v.ID) - fallthrough - default: - order = append(order, v) - } - - // Update use counts of arguments. - for _, w := range v.Args { - if w.Block != b { - continue - } - uses[w.ID]-- - if uses[w.ID] == 0 { - // All uses scheduled, w is now schedulable. - heap.Push(priq, w) - } - } - for _, w := range additionalArgs[v.ID] { - uses[w.ID]-- - if uses[w.ID] == 0 { - // All uses scheduled, w is now schedulable. - heap.Push(priq, w) + // Find all the scheduling edges out from this value. + i := sort.Search(len(edges), func(i int) bool { + return edges[i].x.ID >= v.ID + }) + j := sort.Search(len(edges), func(i int) bool { + return edges[i].x.ID > v.ID + }) + // Decrement inEdges for each target of edges from v. + for _, e := range edges[i:j] { + inEdges[e.y.ID]-- + if inEdges[e.y.ID] == 0 { + heap.Push(priq, e.y) } } } - if len(order) != len(b.Values) { + if len(b.Values) != nv { f.Fatalf("schedule does not include all values in block %s", b) } - for i := 0; i < len(b.Values); i++ { - b.Values[i] = order[len(b.Values)-1-i] - } } // Remove SPanchored now that we've scheduled. @@ -584,74 +499,32 @@ func storeOrder(values []*Value, sset *sparseSet, storeNumber []int32) []*Value // isFlagOp reports if v is an OP with the flag type. func (v *Value) isFlagOp() bool { - return v.Type.IsFlags() || v.Type.IsTuple() && v.Type.FieldType(1).IsFlags() -} - -// isCarryChainReady reports whether all dependent carry ops can be scheduled after this. -func isCarryChainReady(v *Value, uses []int32) bool { - // A chain can be scheduled in it's entirety if - // the use count of each dependent op is 1. If none, - // schedule the first. - j := 1 // The first op uses[k.ID] == 0. Dependent ops are always >= 1. - for k := v; k != nil; k = k.getCarryInput() { - j += int(uses[k.ID]) - 1 + if v.Type.IsFlags() || v.Type.IsTuple() && v.Type.FieldType(1).IsFlags() { + return true } - return j == 0 + // PPC64 carry generators put their carry in a non-flag-typed register + // in their output. + switch v.Op { + case OpPPC64SUBC, OpPPC64ADDC, OpPPC64SUBCconst, OpPPC64ADDCconst: + return true + } + return false } -// isCarryInput reports whether v accepts a carry value as input. -func (v *Value) isCarryInput() bool { - return v.getCarryInput() != nil -} - -// isCarryOutput reports whether v generates a carry as output. 
-func (v *Value) isCarryOutput() bool { - // special cases for PPC64 which put their carry values in XER instead of flags - switch v.Block.Func.Config.arch { - case "ppc64", "ppc64le": - switch v.Op { - case OpPPC64SUBC, OpPPC64ADDC, OpPPC64SUBCconst, OpPPC64ADDCconst: +// hasFlagInput reports whether v has a flag value as any of its inputs. +func (v *Value) hasFlagInput() bool { + for _, a := range v.Args { + if a.isFlagOp() { return true } - return false } - return v.isFlagOp() && v.Op != OpSelect1 -} - -// isCarryCreator reports whether op is an operation which produces a carry bit value, -// but does not consume it. -func (v *Value) isCarryCreator() bool { - return v.isCarryOutput() && !v.isCarryInput() -} - -// isCarry reports whether op consumes or creates a carry a bit value. -func (v *Value) isCarry() bool { - return v.isCarryOutput() || v.isCarryInput() -} - -// getCarryInput returns the producing *Value of the carry bit of this op, or nil if none. -func (v *Value) getCarryInput() *Value { - // special cases for PPC64 which put their carry values in XER instead of flags - switch v.Block.Func.Config.arch { - case "ppc64", "ppc64le": - switch v.Op { - case OpPPC64SUBE, OpPPC64ADDE, OpPPC64SUBZEzero, OpPPC64ADDZEzero: - // PPC64 carry dependencies are conveyed through their final argument. - // Likewise, there is always an OpSelect1 between them. - return v.Args[len(v.Args)-1].Args[0] - } - return nil + // PPC64 carry dependencies are conveyed through their final argument, + // so we treat those operations as taking flags as well. + switch v.Op { + case OpPPC64SUBE, OpPPC64ADDE, OpPPC64SUBZEzero, OpPPC64ADDZEzero: + return true } - for _, a := range v.Args { - if !a.isFlagOp() { - continue - } - if a.Op == OpSelect1 { - a = a.Args[0] - } - return a - } - return nil + return false } type bySourcePos []*Value diff --git a/src/cmd/compile/internal/ssagen/ssa.go b/src/cmd/compile/internal/ssagen/ssa.go index 9bad115def..4b6b28fad1 100644 --- a/src/cmd/compile/internal/ssagen/ssa.go +++ b/src/cmd/compile/internal/ssagen/ssa.go @@ -6928,7 +6928,7 @@ func genssa(f *ssa.Func, pp *objw.Progs) { // debuggers may attribute it to previous function in program. firstPos := src.NoXPos for _, v := range f.Entry.Values { - if v.Pos.IsStmt() == src.PosIsStmt { + if v.Pos.IsStmt() == src.PosIsStmt && v.Op != ssa.OpArg && v.Op != ssa.OpArgIntReg && v.Op != ssa.OpArgFloatReg && v.Op != ssa.OpLoadReg && v.Op != ssa.OpStoreReg { firstPos = v.Pos v.Pos = firstPos.WithDefaultStmt() break @@ -7009,6 +7009,7 @@ func genssa(f *ssa.Func, pp *objw.Progs) { inlMarkList = append(inlMarkList, p) pos := v.Pos.AtColumn1() inlMarksByPos[pos] = append(inlMarksByPos[pos], p) + firstPos = src.NoXPos default: // Special case for first line in function; move it to the start (which cannot be a register-valued instruction) diff --git a/test/nilptr3.go b/test/nilptr3.go index 3345cfa5ab..0e818ebf66 100644 --- a/test/nilptr3.go +++ b/test/nilptr3.go @@ -241,3 +241,10 @@ func f9() []int { y := x[:] // ERROR "removed nil check" return y } + +// See issue 42673. +func f10(p **int) int { + return * // ERROR "removed nil check" + /* */ + *p // ERROR "removed nil check" +}
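The new f10 case above is the pattern from issue #42673: with forward scheduling, each pointer's nil check is emitted right before the load that uses it, so the explicit check becomes redundant with the faulting load and can be removed. As a standalone illustration (not part of the patch, and the file and function names are made up), a program like the following, built with something like go build -gcflags=-d=nil, should report the same "removed nil check" diagnostics that the errorcheck test matches, assuming a platform such as amd64 where a faulting load subsumes the explicit check.

// deref.go: standalone variant of the f10 test case.
package main

//go:noinline
func deref(p **int) int {
	// Two dereferences, and therefore two nil checks that the compiler
	// should now be able to eliminate.
	return **p
}

func main() {
	x := 42
	px := &x
	println(deref(&px))
}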