cmd/compile: simplify shift lowering on s390x

Use conditional moves instead of subtractions with borrow to handle saturation cases. This allows us to delete the SUBE/SUBEW ops and associated rules from the SSA backend. Using conditional moves also means we can detect when shift values are masked so I've added some new rules to constant fold the relevant comparisons and masking ops. Also use the new shiftIsBounded() function to avoid generating code to handle saturation cases where possible. Updates #25167 for s390x. Change-Id: Ief9991c91267c9151ce4c5ec07642abb4dcc1c0d Reviewed-on: https://go-review.googlesource.com/110070 Run-TryBot: Michael Munday <mike.munday@ibm.com> TryBot-Result: Gobot Gobot <gobot@golang.org> Reviewed-by: Brad Fitzpatrick <bradfitz@golang.org>
2024-09-05 16:04:50 +00:00 · 2018-04-30 13:27:50 +01:00 · 2018-04-30 13:27:50 +01:00 · 8af0c77df3
parent 704893b16b
commit 8af0c77df3
6 changed files with 3156 additions and 727 deletions
--- a/src/cmd/compile/internal/s390x/ssa.go
+++ b/src/cmd/compile/internal/s390x/ssa.go
@ -305,13 +305,6 @@ func ssaGenValue(s *gc.SSAGenState, v *ssa.Value) {
 		}
 		p.To.Type = obj.TYPE_REG
 		p.To.Reg = r
-	case ssa.OpS390XSUBEcarrymask, ssa.OpS390XSUBEWcarrymask:
-		r := v.Reg()
-		p := s.Prog(v.Op.Asm())
-		p.From.Type = obj.TYPE_REG
-		p.From.Reg = r
-		p.To.Type = obj.TYPE_REG
-		p.To.Reg = r
 	case ssa.OpS390XMOVDaddridx:
 		r := v.Args[0].Reg()
 		i := v.Args[1].Reg()
--- a/src/cmd/compile/internal/ssa/gen/S390X.rules
+++ b/src/cmd/compile/internal/ssa/gen/S390X.rules
@ -160,69 +160,50 @@
 (Round(32|64)F x) -> (LoweredRound(32|64)F x)

 // Lowering shifts
+
+// Lower bounded shifts first. No need to check shift value.
+(Lsh64x(64|32|16|8)  x y) && shiftIsBounded(v) -> (SLD x y)
+(Lsh32x(64|32|16|8)  x y) && shiftIsBounded(v) -> (SLW x y)
+(Lsh16x(64|32|16|8)  x y) && shiftIsBounded(v) -> (SLW x y)
+(Lsh8x(64|32|16|8)   x y) && shiftIsBounded(v) -> (SLW x y)
+(Rsh64Ux(64|32|16|8) x y) && shiftIsBounded(v) -> (SRD x y)
+(Rsh32Ux(64|32|16|8) x y) && shiftIsBounded(v) -> (SRW x y)
+(Rsh16Ux(64|32|16|8) x y) && shiftIsBounded(v) -> (SRW (MOVHZreg x) y)
+(Rsh8Ux(64|32|16|8)  x y) && shiftIsBounded(v) -> (SRW (MOVBZreg x) y)
+(Rsh64x(64|32|16|8)  x y) && shiftIsBounded(v) -> (SRAD x y)
+(Rsh32x(64|32|16|8)  x y) && shiftIsBounded(v) -> (SRAW x y)
+(Rsh16x(64|32|16|8)  x y) && shiftIsBounded(v) -> (SRAW (MOVHreg x) y)
+(Rsh8x(64|32|16|8)   x y) && shiftIsBounded(v) -> (SRAW (MOVBreg x) y)
+
 // Unsigned shifts need to return 0 if shift amount is >= width of shifted value.
-//   result = (arg << shift) & (shift >= argbits ? 0 : 0xffffffffffffffff)
-(Lsh64x64 <t> x y) -> (AND (SLD <t> x y) (SUBEcarrymask <t> (CMPUconst y [63])))
-(Lsh64x32 <t> x y) -> (AND (SLD <t> x y) (SUBEcarrymask <t> (CMPWUconst y [63])))
-(Lsh64x16 <t> x y) -> (AND (SLD <t> x y) (SUBEcarrymask <t> (CMPWUconst (MOVHZreg y) [63])))
-(Lsh64x8  <t> x y) -> (AND (SLD <t> x y) (SUBEcarrymask <t> (CMPWUconst (MOVBZreg y) [63])))
+//   result = shift >= 64 ? 0 : arg << shift
+(Lsh(64|32|16|8)x64 <t> x y) -> (MOVDGE <t> (SL(D|W|W|W) <t> x y) (MOVDconst [0]) (CMPUconst y [64]))
+(Lsh(64|32|16|8)x32 <t> x y) -> (MOVDGE <t> (SL(D|W|W|W) <t> x y) (MOVDconst [0]) (CMPWUconst y [64]))
+(Lsh(64|32|16|8)x16 <t> x y) -> (MOVDGE <t> (SL(D|W|W|W) <t> x y) (MOVDconst [0]) (CMPWUconst (MOVHZreg y) [64]))
+(Lsh(64|32|16|8)x8  <t> x y) -> (MOVDGE <t> (SL(D|W|W|W) <t> x y) (MOVDconst [0]) (CMPWUconst (MOVBZreg y) [64]))

-(Lsh32x64 <t> x y) -> (ANDW (SLW <t> x y) (SUBEWcarrymask <t> (CMPUconst y [31])))
-(Lsh32x32 <t> x y) -> (ANDW (SLW <t> x y) (SUBEWcarrymask <t> (CMPWUconst y [31])))
-(Lsh32x16 <t> x y) -> (ANDW (SLW <t> x y) (SUBEWcarrymask <t> (CMPWUconst (MOVHZreg y) [31])))
-(Lsh32x8  <t> x y) -> (ANDW (SLW <t> x y) (SUBEWcarrymask <t> (CMPWUconst (MOVBZreg y) [31])))
+(Rsh(64|32)Ux64 <t> x y) -> (MOVDGE <t> (SR(D|W) <t> x y) (MOVDconst [0]) (CMPUconst y [64]))
+(Rsh(64|32)Ux32 <t> x y) -> (MOVDGE <t> (SR(D|W) <t> x y) (MOVDconst [0]) (CMPWUconst y [64]))
+(Rsh(64|32)Ux16 <t> x y) -> (MOVDGE <t> (SR(D|W) <t> x y) (MOVDconst [0]) (CMPWUconst (MOVHZreg y) [64]))
+(Rsh(64|32)Ux8  <t> x y) -> (MOVDGE <t> (SR(D|W) <t> x y) (MOVDconst [0]) (CMPWUconst (MOVBZreg y) [64]))

-(Lsh16x64 <t> x y) -> (ANDW (SLW <t> x y) (SUBEWcarrymask <t> (CMPUconst y [31])))
-(Lsh16x32 <t> x y) -> (ANDW (SLW <t> x y) (SUBEWcarrymask <t> (CMPWUconst y [31])))
-(Lsh16x16 <t> x y) -> (ANDW (SLW <t> x y) (SUBEWcarrymask <t> (CMPWUconst (MOVHZreg y) [31])))
-(Lsh16x8  <t> x y) -> (ANDW (SLW <t> x y) (SUBEWcarrymask <t> (CMPWUconst (MOVBZreg y) [31])))
-
-(Lsh8x64 <t> x y)  -> (ANDW (SLW <t> x y) (SUBEWcarrymask <t> (CMPUconst y [31])))
-(Lsh8x32 <t> x y)  -> (ANDW (SLW <t> x y) (SUBEWcarrymask <t> (CMPWUconst y [31])))
-(Lsh8x16 <t> x y)  -> (ANDW (SLW <t> x y) (SUBEWcarrymask <t> (CMPWUconst (MOVHZreg y) [31])))
-(Lsh8x8  <t> x y)  -> (ANDW (SLW <t> x y) (SUBEWcarrymask <t> (CMPWUconst (MOVBZreg y) [31])))
-
-(Rsh64Ux64 <t> x y) -> (AND (SRD <t> x y) (SUBEcarrymask <t> (CMPUconst y [63])))
-(Rsh64Ux32 <t> x y) -> (AND (SRD <t> x y) (SUBEcarrymask <t> (CMPWUconst y [63])))
-(Rsh64Ux16 <t> x y) -> (AND (SRD <t> x y) (SUBEcarrymask <t> (CMPWUconst (MOVHZreg y) [63])))
-(Rsh64Ux8  <t> x y) -> (AND (SRD <t> x y) (SUBEcarrymask <t> (CMPWUconst (MOVBZreg y) [63])))
-
-(Rsh32Ux64 <t> x y) -> (ANDW (SRW <t> x y) (SUBEWcarrymask <t> (CMPUconst y [31])))
-(Rsh32Ux32 <t> x y) -> (ANDW (SRW <t> x y) (SUBEWcarrymask <t> (CMPWUconst y [31])))
-(Rsh32Ux16 <t> x y) -> (ANDW (SRW <t> x y) (SUBEWcarrymask <t> (CMPWUconst (MOVHZreg y) [31])))
-(Rsh32Ux8  <t> x y) -> (ANDW (SRW <t> x y) (SUBEWcarrymask <t> (CMPWUconst (MOVBZreg y) [31])))
-
-(Rsh16Ux64 <t> x y) -> (ANDW (SRW <t> (MOVHZreg x) y) (SUBEWcarrymask <t> (CMPUconst y [15])))
-(Rsh16Ux32 <t> x y) -> (ANDW (SRW <t> (MOVHZreg x) y) (SUBEWcarrymask <t> (CMPWUconst y [15])))
-(Rsh16Ux16 <t> x y) -> (ANDW (SRW <t> (MOVHZreg x) y) (SUBEWcarrymask <t> (CMPWUconst (MOVHZreg y) [15])))
-(Rsh16Ux8  <t> x y) -> (ANDW (SRW <t> (MOVHZreg x) y) (SUBEWcarrymask <t> (CMPWUconst (MOVBZreg y) [15])))
-
-(Rsh8Ux64 <t> x y)  -> (ANDW (SRW <t> (MOVBZreg x) y) (SUBEWcarrymask <t> (CMPUconst y [7])))
-(Rsh8Ux32 <t> x y)  -> (ANDW (SRW <t> (MOVBZreg x) y) (SUBEWcarrymask <t> (CMPWUconst y [7])))
-(Rsh8Ux16 <t> x y)  -> (ANDW (SRW <t> (MOVBZreg x) y) (SUBEWcarrymask <t> (CMPWUconst (MOVHZreg y) [7])))
-(Rsh8Ux8  <t> x y)  -> (ANDW (SRW <t> (MOVBZreg x) y) (SUBEWcarrymask <t> (CMPWUconst (MOVBZreg y) [7])))
+(Rsh(16|8)Ux64 <t> x y) -> (MOVDGE <t> (SRW <t> (MOV(H|B)Zreg x) y) (MOVDconst [0]) (CMPUconst y [64]))
+(Rsh(16|8)Ux32 <t> x y) -> (MOVDGE <t> (SRW <t> (MOV(H|B)Zreg x) y) (MOVDconst [0]) (CMPWUconst y [64]))
+(Rsh(16|8)Ux16 <t> x y) -> (MOVDGE <t> (SRW <t> (MOV(H|B)Zreg x) y) (MOVDconst [0]) (CMPWUconst (MOVHZreg y) [64]))
+(Rsh(16|8)Ux8  <t> x y) -> (MOVDGE <t> (SRW <t> (MOV(H|B)Zreg x) y) (MOVDconst [0]) (CMPWUconst (MOVBZreg y) [64]))

 // Signed right shift needs to return 0/-1 if shift amount is >= width of shifted value.
-// We implement this by setting the shift value to -1 (all ones) if the shift value is >= width.
-(Rsh64x64 <t> x y) -> (SRAD <t> x (OR <y.Type> y (NOT <y.Type> (SUBEcarrymask <y.Type> (CMPUconst y [63])))))
-(Rsh64x32 <t> x y) -> (SRAD <t> x (ORW <y.Type> y (NOTW <y.Type> (SUBEWcarrymask <y.Type> (CMPWUconst y [63])))))
-(Rsh64x16 <t> x y) -> (SRAD <t> x (ORW <y.Type> y (NOTW <y.Type> (SUBEWcarrymask <y.Type> (CMPWUconst (MOVHZreg y) [63])))))
-(Rsh64x8  <t> x y) -> (SRAD <t> x (ORW <y.Type> y (NOTW <y.Type> (SUBEWcarrymask <y.Type> (CMPWUconst (MOVBZreg y) [63])))))
+// We implement this by setting the shift value to 63 (all ones) if the shift value is more than 63.
+//   result = arg >> (shift >= 64 ? 63 : shift)
+(Rsh(64|32)x64 x y) -> (SRA(D|W) x (MOVDGE <y.Type> y (MOVDconst <y.Type> [63]) (CMPUconst  y [64])))
+(Rsh(64|32)x32 x y) -> (SRA(D|W) x (MOVDGE <y.Type> y (MOVDconst <y.Type> [63]) (CMPWUconst y [64])))
+(Rsh(64|32)x16 x y) -> (SRA(D|W) x (MOVDGE <y.Type> y (MOVDconst <y.Type> [63]) (CMPWUconst (MOVHZreg y) [64])))
+(Rsh(64|32)x8  x y) -> (SRA(D|W) x (MOVDGE <y.Type> y (MOVDconst <y.Type> [63]) (CMPWUconst (MOVBZreg y) [64])))

-(Rsh32x64 <t> x y) -> (SRAW <t> x (OR <y.Type> y (NOT <y.Type> (SUBEcarrymask <y.Type> (CMPUconst y [31])))))
-(Rsh32x32 <t> x y) -> (SRAW <t> x (ORW <y.Type> y (NOTW <y.Type> (SUBEWcarrymask <y.Type> (CMPWUconst y [31])))))
-(Rsh32x16 <t> x y) -> (SRAW <t> x (ORW <y.Type> y (NOTW <y.Type> (SUBEWcarrymask <y.Type> (CMPWUconst (MOVHZreg y) [31])))))
-(Rsh32x8  <t> x y) -> (SRAW <t> x (ORW <y.Type> y (NOTW <y.Type> (SUBEWcarrymask <y.Type> (CMPWUconst (MOVBZreg y) [31])))))
-
-(Rsh16x64 <t> x y) -> (SRAW <t> (MOVHreg x) (OR <y.Type> y (NOT <y.Type> (SUBEcarrymask <y.Type> (CMPUconst y [15])))))
-(Rsh16x32 <t> x y) -> (SRAW <t> (MOVHreg x) (ORW <y.Type> y (NOTW <y.Type> (SUBEWcarrymask <y.Type> (CMPWUconst y [15])))))
-(Rsh16x16 <t> x y) -> (SRAW <t> (MOVHreg x) (ORW <y.Type> y (NOTW <y.Type> (SUBEWcarrymask <y.Type> (CMPWUconst (MOVHZreg y) [15])))))
-(Rsh16x8  <t> x y) -> (SRAW <t> (MOVHreg x) (ORW <y.Type> y (NOTW <y.Type> (SUBEWcarrymask <y.Type> (CMPWUconst (MOVBZreg y) [15])))))
-
-(Rsh8x64 <t> x y)  -> (SRAW <t> (MOVBreg x) (OR <y.Type> y (NOT <y.Type> (SUBEcarrymask <y.Type> (CMPUconst y [7])))))
-(Rsh8x32 <t> x y)  -> (SRAW <t> (MOVBreg x) (ORW <y.Type> y (NOTW <y.Type> (SUBEWcarrymask <y.Type> (CMPWUconst y [7])))))
-(Rsh8x16 <t> x y)  -> (SRAW <t> (MOVBreg x) (ORW <y.Type> y (NOTW <y.Type> (SUBEWcarrymask <y.Type> (CMPWUconst (MOVHZreg y) [7])))))
-(Rsh8x8  <t> x y)  -> (SRAW <t> (MOVBreg x) (ORW <y.Type> y (NOTW <y.Type> (SUBEWcarrymask <y.Type> (CMPWUconst (MOVBZreg y) [7])))))
+(Rsh(16|8)x64 x y) -> (SRAW (MOV(H|B)reg x) (MOVDGE <y.Type> y (MOVDconst <y.Type> [63]) (CMPUconst  y [64])))
+(Rsh(16|8)x32 x y) -> (SRAW (MOV(H|B)reg x) (MOVDGE <y.Type> y (MOVDconst <y.Type> [63]) (CMPWUconst y [64])))
+(Rsh(16|8)x16 x y) -> (SRAW (MOV(H|B)reg x) (MOVDGE <y.Type> y (MOVDconst <y.Type> [63]) (CMPWUconst (MOVHZreg y) [64])))
+(Rsh(16|8)x8  x y) -> (SRAW (MOV(H|B)reg x) (MOVDGE <y.Type> y (MOVDconst <y.Type> [63]) (CMPWUconst (MOVBZreg y) [64])))

 // Lowering comparisons
 (Less64      x y) -> (MOVDLT (MOVDconst [0]) (MOVDconst [1]) (CMP x y))
@ -494,6 +475,7 @@
 // TODO(mundaym): modify the assembler to accept 64-bit values
 // and use isU32Bit(^c).
 (AND x (MOVDconst [c])) && is32Bit(c) && c < 0 -> (ANDconst [c] x)
+(AND x (MOVDconst [c])) && is32Bit(c) && c >= 0 -> (MOVWZreg (ANDWconst <typ.UInt32> [int64(int32(c))] x))
 (ANDW x (MOVDconst [c])) -> (ANDWconst [int64(int32(c))] x)

 (ANDWconst [c] (ANDWconst [d] x)) -> (ANDWconst [c & d] x)
@ -505,19 +487,21 @@
 (XOR x (MOVDconst [c])) && isU32Bit(c) -> (XORconst [c] x)
 (XORW x (MOVDconst [c])) -> (XORWconst [int64(int32(c))] x)

-(SLD x (MOVDconst [c])) -> (SLDconst [c&63] x)
-(SLW x (MOVDconst [c])) -> (SLWconst [c&63] x)
-(SRD x (MOVDconst [c])) -> (SRDconst [c&63] x)
-(SRW x (MOVDconst [c])) -> (SRWconst [c&63] x)
-(SRAD x (MOVDconst [c])) -> (SRADconst [c&63] x)
-(SRAW x (MOVDconst [c])) -> (SRAWconst [c&63] x)
+// Constant shifts.
+(S(LD|RD|RAD|LW|RW|RAW) x (MOVDconst [c]))
+	-> (S(LD|RD|RAD|LW|RW|RAW)const x [c&63])

-(SRAW x (ANDWconst [63] y)) -> (SRAW x y)
-(SRAD x (ANDconst [63] y)) -> (SRAD x y)
-(SLW x (ANDWconst [63] y)) -> (SLW x y)
-(SLD x (ANDconst [63] y)) -> (SLD x y)
-(SRW x (ANDWconst [63] y)) -> (SRW x y)
-(SRD x (ANDconst [63] y)) -> (SRD x y)
+// Shifts only use the rightmost 6 bits of the shift value.
+(S(LD|RD|RAD|LW|RW|RAW) x (AND (MOVDconst [c]) y))
+	-> (S(LD|RD|RAD|LW|RW|RAW) x (ANDWconst <typ.UInt32> [c&63] y))
+(S(LD|RD|RAD|LW|RW|RAW) x (ANDWconst [c] y)) && c&63 == 63
+	-> (S(LD|RD|RAD|LW|RW|RAW) x y)
+(SLD  x (MOV(D|W|H|B|WZ|HZ|BZ)reg y)) -> (SLD  x y)
+(SRD  x (MOV(D|W|H|B|WZ|HZ|BZ)reg y)) -> (SRD  x y)
+(SRAD x (MOV(D|W|H|B|WZ|HZ|BZ)reg y)) -> (SRAD x y)
+(SLW  x (MOV(D|W|H|B|WZ|HZ|BZ)reg y)) -> (SLW  x y)
+(SRW  x (MOV(D|W|H|B|WZ|HZ|BZ)reg y)) -> (SRW  x y)
+(SRAW x (MOV(D|W|H|B|WZ|HZ|BZ)reg y)) -> (SRAW x y)

 // Rotate generation
 (ADD (SLDconst x [c]) (SRDconst x [d])) && d == 64-c -> (RLLGconst [c] x)
@ -953,22 +937,33 @@
 (CMPWUconst (MOVDconst [x]) [y]) && uint32(x)<uint32(y) -> (FlagLT)
 (CMPWUconst (MOVDconst [x]) [y]) && uint32(x)>uint32(y) -> (FlagGT)

-// Other known comparisons.
-(CMPconst (MOVBZreg _) [c]) && 0xFF < c -> (FlagLT)
-(CMPconst (MOVHZreg _) [c]) && 0xFFFF < c -> (FlagLT)
-(CMPconst (MOVWZreg _) [c]) && 0xFFFFFFFF < c -> (FlagLT)
-(CMPWconst (SRWconst _ [c]) [n]) && 0 <= n && 0 < c && c <= 32 && (1<<uint64(32-c)) <= uint64(n) -> (FlagLT)
-(CMPconst (SRDconst _ [c]) [n]) && 0 <= n && 0 < c && c <= 64 && (1<<uint64(64-c)) <= uint64(n) -> (FlagLT)
-(CMPconst (ANDconst _ [m]) [n]) && 0 <= m && m < n -> (FlagLT)
-(CMPWconst (ANDWconst _ [m]) [n]) && 0 <= int32(m) && int32(m) < int32(n) -> (FlagLT)
+(CMP(W|WU)const (MOVBZreg _) [c]) &&   0xff < c -> (FlagLT)
+(CMP(W|WU)const (MOVHZreg _) [c]) && 0xffff < c -> (FlagLT)

-// Absorb flag constants into SBB ops.
-(SUBEcarrymask (FlagEQ)) -> (MOVDconst [-1])
-(SUBEcarrymask (FlagLT)) -> (MOVDconst [-1])
-(SUBEcarrymask (FlagGT)) -> (MOVDconst [0])
-(SUBEWcarrymask (FlagEQ)) -> (MOVDconst [-1])
-(SUBEWcarrymask (FlagLT)) -> (MOVDconst [-1])
-(SUBEWcarrymask (FlagGT)) -> (MOVDconst [0])
+(CMPconst  (SRDconst _ [c]) [n]) && c > 0 && n < 0 -> (FlagGT)
+(CMPWconst (SRWconst _ [c]) [n]) && c > 0 && n < 0 -> (FlagGT)
+
+(CMPUconst  (SRDconst _ [c]) [n]) && c > 0 && c < 64 && (1<<uint(64-c)) <= uint64(n) -> (FlagLT)
+(CMPWUconst (SRWconst _ [c]) [n]) && c > 0 && c < 32 && (1<<uint(32-c)) <= uint32(n) -> (FlagLT)
+
+(CMPWconst  (ANDWconst _ [m]) [n]) && int32(m) >= 0 &&  int32(m) <  int32(n) -> (FlagLT)
+(CMPWUconst (ANDWconst _ [m]) [n]) && uint32(m) < uint32(n) -> (FlagLT)
+
+// Convert 64-bit comparisons to 32-bit comparisons and signed comparisons
+// to unsigned comparisons.
+// Helps simplify constant comparison detection.
+(CM(P|PU)const (MOV(W|WZ)reg x) [c]) -> (CMP(W|WU)const x [c])
+(CM(P|P|PU|PU)const x:(MOV(H|HZ|H|HZ)reg _) [c]) -> (CMP(W|W|WU|WU)const x [c])
+(CM(P|P|PU|PU)const x:(MOV(B|BZ|B|BZ)reg _) [c]) -> (CMP(W|W|WU|WU)const x [c])
+(CMPconst  (MOV(WZ|W)reg x:(ANDWconst [m] _)) [c]) && int32(m) >= 0 && c >= 0 -> (CMPWUconst x [c])
+(CMPUconst (MOV(WZ|W)reg x:(ANDWconst [m] _)) [c]) && int32(m) >= 0           -> (CMPWUconst x [c])
+(CMPconst  x:(SRDconst _ [c]) [n]) && c > 0 && n >= 0 -> (CMPUconst  x [n])
+(CMPWconst x:(SRWconst _ [c]) [n]) && c > 0 && n >= 0 -> (CMPWUconst x [n])
+
+// Absorb sign and zero extensions into 32-bit comparisons.
+(CMP(W|W|WU|WU)      x (MOV(W|WZ|W|WZ)reg y))   -> (CMP(W|W|WU|WU) x y)
+(CMP(W|W|WU|WU)      (MOV(W|WZ|W|WZ)reg x) y)   -> (CMP(W|W|WU|WU) x y)
+(CMP(W|W|WU|WU)const (MOV(W|WZ|W|WZ)reg x) [c]) -> (CMP(W|W|WU|WU)const x [c])

 // Absorb flag constants into branches.
 (EQ (FlagEQ) yes no) -> (First nil yes no)
@ -1079,6 +1074,10 @@
 (XOR x x) -> (MOVDconst [0])
 (XORW x x) -> (MOVDconst [0])
 (NEG (ADDconst [c] (NEG x))) && c != -(1<<31) -> (ADDconst [-c] x)
+(MOVBZreg (ANDWconst [m] x)) -> (MOVWZreg (ANDWconst <typ.UInt32> [int64( uint8(m))] x))
+(MOVHZreg (ANDWconst [m] x)) -> (MOVWZreg (ANDWconst <typ.UInt32> [int64(uint16(m))] x))
+(MOVBreg  (ANDWconst [m] x)) &&  int8(m) >= 0 -> (MOVWZreg (ANDWconst <typ.UInt32> [int64( uint8(m))] x))
+(MOVHreg  (ANDWconst [m] x)) && int16(m) >= 0 -> (MOVWZreg (ANDWconst <typ.UInt32> [int64(uint16(m))] x))

 // fused multiply-add
 (FADD (FMUL y z) x) -> (FMADD x y z)
--- a/src/cmd/compile/internal/ssa/gen/S390XOps.go
+++ b/src/cmd/compile/internal/ssa/gen/S390XOps.go
@ -152,7 +152,6 @@ func init() {

 		gp2flags  = regInfo{inputs: []regMask{gpsp, gpsp}}
 		gp1flags  = regInfo{inputs: []regMask{gpsp}}
-		flagsgp   = regInfo{outputs: gponly}
 		gp2flags1 = regInfo{inputs: []regMask{gp, gp}, outputs: gponly}

 		gpload       = regInfo{inputs: []regMask{ptrspsb, 0}, outputs: gponly}
@ -334,10 +333,6 @@ func init() {

 		{name: "FSQRT", argLength: 1, reg: fp11, asm: "FSQRT"}, // sqrt(arg0)

-		{name: "SUBEcarrymask", argLength: 1, reg: flagsgp, asm: "SUBE"},  // (int64)(-1) if carry is set, 0 if carry is clear.
-		{name: "SUBEWcarrymask", argLength: 1, reg: flagsgp, asm: "SUBE"}, // (int32)(-1) if carry is set, 0 if carry is clear.
-		// Note: 32-bits subtraction is not implemented in S390X. Temporarily use SUBE (64-bits).
-
 		{name: "MOVDEQ", argLength: 3, reg: gp2flags1, resultInArg0: true, asm: "MOVDEQ"}, // extract == condition from arg0
 		{name: "MOVDNE", argLength: 3, reg: gp2flags1, resultInArg0: true, asm: "MOVDNE"}, // extract != condition from arg0
 		{name: "MOVDLT", argLength: 3, reg: gp2flags1, resultInArg0: true, asm: "MOVDLT"}, // extract signed < condition from arg0
--- a/src/cmd/compile/internal/ssa/opGen.go
+++ b/src/cmd/compile/internal/ssa/opGen.go
@ -1760,8 +1760,6 @@ const (
 	OpS390XNOT
 	OpS390XNOTW
 	OpS390XFSQRT
-	OpS390XSUBEcarrymask
-	OpS390XSUBEWcarrymask
 	OpS390XMOVDEQ
 	OpS390XMOVDNE
 	OpS390XMOVDLT
@ -23409,26 +23407,6 @@ var opcodeTable = [...]opInfo{
 			},
 		},
 	},
-	{
-		name:   "SUBEcarrymask",
-		argLen: 1,
-		asm:    s390x.ASUBE,
-		reg: regInfo{
-			outputs: []outputInfo{
-				{0, 23551}, // R0 R1 R2 R3 R4 R5 R6 R7 R8 R9 R11 R12 R14
-			},
-		},
-	},
-	{
-		name:   "SUBEWcarrymask",
-		argLen: 1,
-		asm:    s390x.ASUBE,
-		reg: regInfo{
-			outputs: []outputInfo{
-				{0, 23551}, // R0 R1 R2 R3 R4 R5 R6 R7 R8 R9 R11 R12 R14
-			},
-		},
-	},
 	{
 		name:         "MOVDEQ",
 		argLen:       3,
--- a/src/cmd/compile/internal/ssa/rewriteS390X.go
+++ b/src/cmd/compile/internal/ssa/rewriteS390X.go
--- a/test/codegen/shift.go
+++ b/test/codegen/shift.go
@ -0,0 +1,99 @@
+// asmcheck
+
+// Copyright 2018 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package codegen
+
+// ------------------ //
+//   masked shifts    //
+// ------------------ //
+
+func lshMask64x64(v int64, s uint64) int64 {
+	// s390x:-".*AND",-".*MOVDGE"
+	return v << (s&63)
+}
+
+func rshMask64Ux64(v uint64, s uint64) uint64 {
+	// s390x:-".*AND",-".*MOVDGE"
+	return v >> (s&63)
+}
+
+func rshMask64x64(v int64, s uint64) int64 {
+	// s390x:-".*AND",-".*MOVDGE"
+	return v >> (s&63)
+}
+
+func lshMask32x64(v int32, s uint64) int32 {
+	// s390x:-".*AND",-".*MOVDGE"
+	return v << (s&63)
+}
+
+func rshMask32Ux64(v uint32, s uint64) uint32 {
+	// s390x:-".*AND",-".*MOVDGE"
+	return v >> (s&63)
+}
+
+func rshMask32x64(v int32, s uint64) int32 {
+	// s390x:-".*AND",-".*MOVDGE"
+	return v >> (s&63)
+}
+
+func lshMask64x32(v int64, s uint32) int64 {
+	// s390x:-".*AND",-".*MOVDGE"
+	return v << (s&63)
+}
+
+func rshMask64Ux32(v uint64, s uint32) uint64 {
+	// s390x:-".*AND",-".*MOVDGE"
+	return v >> (s&63)
+}
+
+func rshMask64x32(v int64, s uint32) int64 {
+	// s390x:-".*AND",-".*MOVDGE"
+	return v >> (s&63)
+}
+
+func lshMask64x32Ext(v int64, s int32) int64 {
+	// s390x:-".*AND",-".*MOVDGE"
+	return v << uint(s&63)
+}
+
+func rshMask64Ux32Ext(v uint64, s int32) uint64 {
+	// s390x:-".*AND",-".*MOVDGE"
+	return v >> uint(s&63)
+}
+
+func rshMask64x32Ext(v int64, s int32) int64 {
+	// s390x:-".*AND",-".*MOVDGE"
+	return v >> uint(s&63)
+}
+
+// ------------------ //
+//   bounded shifts   //
+// ------------------ //
+
+func lshGuarded64(v int64, s uint) int64 {
+	if s < 64 {
+		// s390x:-".*AND",-".*MOVDGE"
+		return v >> s
+	}
+	panic("shift too large")
+}
+
+func rshGuarded64U(v uint64, s uint) uint64 {
+	if s < 64 {
+		// s390x:-".*AND",-".*MOVDGE"
+		return v >> s
+	}
+	panic("shift too large")
+}
+
+func rshGuarded64(v int64, s uint) int64 {
+	if s < 64 {
+		// s390x:-".*AND",-".*MOVDGE"
+		return v << s
+	}
+	panic("shift too large")
+}