[VM] Pick more compact instruction encodings on x64

Several tricks here (a byte-count sketch follows the list):
* When zeroing registers we can use xorl instead of xorq, because writing the
  32-bit ('l') form zero-extends into the top 32 bits anyway.
* test and 'and' instructions with immediate arguments can use the 8-bit and
  32-bit variants more often.
* mov reg, immediate can use more compact encodings when sign-extension is not
  needed.
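A rough byte-count sketch of the wins, using the Assembler entry points this
change touches (counts follow the Intel SDM; the wrapper function and the
register choices are ours, for illustration only):

// Sketch: each call now picks the compact encoding noted in its comment.
static void EncodingSizeSketch(Assembler* assembler) {
  assembler->xorl(RAX, RAX);                    // 2 bytes; xorq needs REX.W, 3 bytes.
  assembler->movq(RCX, Immediate(0x12345678));  // B9 imm32, 5 bytes; was REX.W C7 imm32, 7 bytes.
  assembler->testq(RSI, Immediate(0x10));       // REX F6 imm8, 4 bytes; was REX.W F7 imm32, 7 bytes.
  assembler->andq(RDX, Immediate(0xFFFF));      // 81 /4 imm32, 6 bytes; was REX.W 81 /4 imm32, 7 bytes.
}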

Performance improves by more than 1% when measured on Dart2JS.

R=vegorov@google.com

The Intel optimization manual says: "Assembly/Compiler Coding Rule 64. (H impact, M
generality) Use the 32-bit versions of instructions in 64-bit mode to reduce
code size unless the 64-bit version is necessary to access 64-bit data or
additional registers."
Bug:
Change-Id: I2a989315c45f8d8ebab719653fbfa2b18ebb77c9
Reviewed-on: https://dart-review.googlesource.com/23400
Commit-Queue: Erik Corry <erikcorry@google.com>
Reviewed-by: Vyacheslav Egorov <vegorov@google.com>
Erik Corry 2017-11-24 10:53:17 +00:00 committed by commit-bot@chromium.org
parent 062e5d6db3
commit 1d9ff70a08
8 changed files with 462 additions and 370 deletions


@@ -328,16 +328,25 @@ void Assembler::movw(const Address& dst, const Immediate& imm) {
void Assembler::movq(Register dst, const Immediate& imm) {
AssemblerBuffer::EnsureCapacity ensured(&buffer_);
if (imm.is_int32()) {
if (imm.is_uint32()) {
// Pick single byte B8 encoding if possible. If dst < 8 then we also omit
// the Rex byte.
EmitRegisterREX(dst, REX_NONE);
EmitUint8(0xB8 | (dst & 7));
EmitUInt32(imm.value());
} else if (imm.is_int32()) {
// Sign extended C7 Cx encoding if we have a negative input.
Operand operand(dst);
EmitOperandREX(0, operand, REX_W);
EmitUint8(0xC7);
EmitOperand(0, operand);
EmitImmediate(imm);
} else {
// Full 64 bit immediate encoding.
EmitRegisterREX(dst, REX_W);
EmitUint8(0xB8 | (dst & 7));
EmitImmediate(imm);
}
EmitImmediate(imm);
}
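A sketch (ours, not part of the commit) of which branch the rewritten movq now
takes; registers below R8 are assumed so no REX.B prefix appears:

static void MovqImmediateCases(Assembler* assembler) {
  assembler->movq(RAX, Immediate(0x80000000));   // uint32 but not int32: B8 imm32, zero-extended.
  assembler->movq(RAX, Immediate(-1));           // negative int32: REX.W C7 imm32, sign-extended.
  assembler->movq(RAX, Immediate(0x100000000));  // needs all 64 bits: REX.W B8 imm64.
}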
// Use 0x89 encoding (instead of 0x8B encoding), which is expected by gdb64
@@ -1297,37 +1306,6 @@ void Assembler::testl(Register reg1, Register reg2) {
EmitOperand(reg1 & 7, operand);
}
void Assembler::testl(Register reg, const Immediate& imm) {
// TODO(kasperl): Deal with registers r8-r15 using the short
// encoding form of the immediate?
// We are using RBP for the exception marker. See testl(Label*).
ASSERT(reg != RBP);
AssemblerBuffer::EnsureCapacity ensured(&buffer_);
// For registers that have a byte variant (RAX, RBX, RCX, and RDX)
// we only test the byte register to keep the encoding short.
if (imm.is_uint8() && reg < 4) {
// Use zero-extended 8-bit immediate.
if (reg == RAX) {
EmitUint8(0xA8);
} else {
EmitUint8(0xF6);
EmitUint8(0xC0 + reg);
}
EmitUint8(imm.value() & 0xFF);
} else {
ASSERT(imm.is_int32());
if (reg == RAX) {
EmitUint8(0xA9);
} else {
EmitRegisterREX(reg, REX_NONE);
EmitUint8(0xF7);
EmitUint8(0xC0 | (reg & 7));
}
EmitImmediate(imm);
}
}
void Assembler::testb(const Address& address, const Immediate& imm) {
AssemblerBuffer::EnsureCapacity ensured(&buffer_);
EmitOperandREX(0, address, REX_NONE);
@@ -1346,27 +1324,37 @@ void Assembler::testq(Register reg1, Register reg2) {
}
void Assembler::testq(Register reg, const Immediate& imm) {
// TODO(kasperl): Deal with registers r8-r15 using the short
// encoding form of the immediate?
AssemblerBuffer::EnsureCapacity ensured(&buffer_);
// For registers that have a byte variant (RAX, RBX, RCX, and RDX)
// we only test the byte register to keep the encoding short.
if (imm.is_uint8() && reg < 4) {
if (imm.is_uint8()) {
// Use zero-extended 8-bit immediate.
if (reg >= 4) {
// We need the Rex byte to give access to the SIL and DIL registers (the
// low bytes of RSI and RDI).
EmitRegisterREX(reg, REX_NONE, /* force = */ true);
}
if (reg == RAX) {
EmitUint8(0xA8);
} else {
EmitUint8(0xF6);
EmitUint8(0xC0 + reg);
EmitUint8(0xC0 + (reg & 7));
}
EmitUint8(imm.value() & 0xFF);
} else {
ASSERT(imm.is_int32());
} else if (imm.is_uint32()) {
if (reg == RAX) {
EmitUint8(0xA9 | REX_W);
EmitUint8(0xA9);
} else {
EmitRegisterREX(reg, REX_NONE);
EmitUint8(0xF7);
EmitUint8(0xC0 | (reg & 7));
}
EmitUInt32(imm.value());
} else {
// Sign extended version of 32 bit test.
ASSERT(imm.is_int32());
EmitRegisterREX(reg, REX_W);
if (reg == RAX) {
EmitUint8(0xA9);
} else {
EmitRegisterREX(reg, REX_W);
EmitUint8(0xF7);
EmitUint8(0xC0 | (reg & 7));
}
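A sketch (ours) of the four shapes testq(reg, imm) can now emit; the register
and immediate choices are only illustrative:

static void TestqImmediateCases(Assembler* assembler) {
  assembler->testq(RCX, Immediate(0x10));    // uint8, legacy byte register CL: F6 /0 imm8.
  assembler->testq(RSI, Immediate(0x10));    // uint8, but a forced 0x40 REX byte selects SIL, not DH.
  assembler->testq(RCX, Immediate(0xF0F0));  // uint32: F7 /0 imm32 without REX.W.
  assembler->testq(RCX, Immediate(-0x10));   // negative int32: REX.W F7 /0 imm32, sign-extended.
}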
@@ -1375,7 +1363,7 @@ void Assembler::testq(Register reg, const Immediate& imm) {
}
void Assembler::TestImmediate(Register dst, const Immediate& imm) {
if (imm.is_int32()) {
if (imm.is_int32() || imm.is_uint32()) {
testq(dst, imm);
} else {
ASSERT(dst != TMP);
@@ -1444,7 +1432,27 @@ void Assembler::AluQ(uint8_t modrm_opcode,
uint8_t opcode,
Register dst,
const Immediate& imm) {
if (imm.is_int32()) {
if (modrm_opcode == 4 && imm.is_uint32()) {
// We can use andl for andq.
AssemblerBuffer::EnsureCapacity ensured(&buffer_);
EmitRegisterREX(dst, REX_NONE);
// Would like to use EmitComplex here, but it doesn't like uint32
// immediates.
if (imm.is_int8()) {
// Use sign-extended 8-bit immediate.
EmitUint8(0x83);
EmitOperand(modrm_opcode, Operand(dst));
EmitUint8(imm.value() & 0xFF);
} else {
if (dst == RAX) {
EmitUint8(0x25);
} else {
EmitUint8(0x81);
EmitOperand(modrm_opcode, Operand(dst));
}
EmitUInt32(imm.value());
}
} else if (imm.is_int32()) {
AssemblerBuffer::EnsureCapacity ensured(&buffer_);
EmitRegisterREX(dst, REX_W);
EmitComplex(modrm_opcode, Operand(dst), imm);
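Dropping REX.W in the new branch is sound because the 32-bit AND zero-extends
its result, which matches ANDing with a mask whose top 32 bits are already
zero. A sketch (ours) of the sub-cases it selects:

static void AndqImmediateCases(Assembler* assembler) {
  assembler->andq(RCX, Immediate(0x7F));    // int8: 83 /4 imm8, 3 bytes.
  assembler->andq(RAX, Immediate(0xFFFF));  // RAX shortcut: 25 imm32, 5 bytes.
  assembler->andq(RCX, Immediate(0xFFFF));  // general uint32: 81 /4 imm32, 6 bytes.
  assembler->andq(RCX, Immediate(-16));     // not uint32: still the REX.W EmitComplex path.
}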
@@ -1470,7 +1478,7 @@ void Assembler::AluQ(uint8_t modrm_opcode,
}
void Assembler::AndImmediate(Register dst, const Immediate& imm) {
if (imm.is_int32()) {
if (imm.is_int32() || imm.is_uint32()) {
andq(dst, imm);
} else {
ASSERT(dst != TMP);
@@ -2260,7 +2268,9 @@ intptr_t Assembler::FindImmediate(int64_t imm) {
}
void Assembler::LoadImmediate(Register reg, const Immediate& imm) {
if (imm.is_int32() || !constant_pool_allowed()) {
if (imm.value() == 0) {
xorl(reg, reg);
} else if (imm.is_int32() || !constant_pool_allowed()) {
movq(reg, imm);
} else {
int32_t offset = ObjectPool::element_offset(FindImmediate(imm.value()));
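A sketch (ours) of how zero and small-constant loads land after this change;
the header change below notes that, unlike movq, LoadImmediate may clobber the
flags:

static void LoadImmediateCases(Assembler* assembler) {
  assembler->LoadImmediate(RDI, Immediate(0));   // xorl RDI, RDI: 2 bytes, but sets flags.
  assembler->LoadImmediate(RDI, Immediate(16));  // falls through to movq: B8-form imm32, 5 bytes, flags untouched.
}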


@@ -33,6 +33,7 @@ class Immediate : public ValueObject {
bool is_uint8() const { return Utils::IsUint(8, value_); }
bool is_uint16() const { return Utils::IsUint(16, value_); }
bool is_int32() const { return Utils::IsInt(32, value_); }
bool is_uint32() const { return Utils::IsUint(32, value_); }
private:
const int64_t value_;
@@ -511,7 +512,7 @@ class Assembler : public ValueObject {
void CompareImmediate(const Address& address, const Immediate& imm);
void testl(Register reg1, Register reg2);
void testl(Register reg, const Immediate& imm);
void testl(Register reg, const Immediate& imm) { testq(reg, imm); }
void testb(const Address& address, const Immediate& imm);
void testq(Register reg1, Register reg2);
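testl(reg, imm) can now simply forward to testq(reg, imm): since testq drops
REX.W whenever the immediate fits in 32 unsigned bits, the forwarded call still
produces a plain 32-bit test. A sketch (ours, illustrative values):

static void TestlForwardingSketch(Assembler* assembler) {
  assembler->testl(RCX, Immediate(0xFF));    // F6 C1 FF -- same bytes as testq(RCX, Immediate(0xFF)).
  assembler->testl(RCX, Immediate(0xF0F0));  // F7 C1 imm32, no REX.W.
}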
@@ -712,7 +713,9 @@ class Assembler : public ValueObject {
bool constant_pool_allowed() const { return constant_pool_allowed_; }
void set_constant_pool_allowed(bool b) { constant_pool_allowed_ = b; }
// Unlike movq this can affect the flags or use the constant pool.
void LoadImmediate(Register reg, const Immediate& imm);
void LoadIsolate(Register dst);
void LoadObject(Register dst, const Object& obj);
void LoadUniqueObject(Register dst, const Object& obj);
@@ -1023,9 +1026,12 @@ class Assembler : public ValueObject {
inline void EmitUint8(uint8_t value);
inline void EmitInt32(int32_t value);
inline void EmitUInt32(uint32_t value);
inline void EmitInt64(int64_t value);
inline void EmitRegisterREX(Register reg, uint8_t rex);
inline void EmitRegisterREX(Register reg,
uint8_t rex,
bool force_emit = false);
inline void EmitOperandREX(int rm, const Operand& operand, uint8_t rex);
inline void EmitXmmRegisterOperand(int rm, XmmRegister reg);
inline void EmitFixup(AssemblerFixup* fixup);
@@ -1077,14 +1083,18 @@ inline void Assembler::EmitInt32(int32_t value) {
buffer_.Emit<int32_t>(value);
}
inline void Assembler::EmitUInt32(uint32_t value) {
buffer_.Emit<uint32_t>(value);
}
inline void Assembler::EmitInt64(int64_t value) {
buffer_.Emit<int64_t>(value);
}
inline void Assembler::EmitRegisterREX(Register reg, uint8_t rex) {
inline void Assembler::EmitRegisterREX(Register reg, uint8_t rex, bool force) {
ASSERT(reg != kNoRegister);
rex |= (reg > 7 ? REX_B : REX_NONE);
if (rex != REX_NONE) EmitUint8(REX_PREFIX | rex);
if (rex != REX_NONE || force) EmitUint8(REX_PREFIX | rex);
}
inline void Assembler::EmitOperandREX(int rm,

File diff suppressed because it is too large.


@@ -1156,6 +1156,8 @@ bool DisassemblerX64::DecodeInstructionType(uint8_t** data) {
// TODO(srdjan): Should we enable printing of REX.W?
// if (rex_w()) Print("REX.W ");
Print("%s%s", idesc.mnem, operand_size_code());
} else if (current == 0xC3 || current == 0xCC) {
Print("%s", idesc.mnem); // ret and int3 don't need a size specifier.
} else {
Print("%s%s", idesc.mnem, operand_size_code());
}
@@ -1215,7 +1217,7 @@ bool DisassemblerX64::DecodeInstructionType(uint8_t** data) {
}
case SHORT_IMMEDIATE_INSTR: {
Print("%s %s, ", idesc.mnem, Rax());
Print("%s%s %s,", idesc.mnem, operand_size_code(), Rax());
PrintImmediate(*data + 1, DOUBLEWORD_SIZE);
(*data) += 5;
break;
@@ -1901,27 +1903,10 @@ int DisassemblerX64::InstructionDecode(uword pc) {
break;
case 0xA9: {
int64_t value = 0;
bool check_for_stop = false;
switch (operand_size()) {
case WORD_SIZE:
value = *reinterpret_cast<uint16_t*>(data + 1);
data += 3;
break;
case DOUBLEWORD_SIZE:
value = *reinterpret_cast<uint32_t*>(data + 1);
data += 5;
check_for_stop = true;
break;
case QUADWORD_SIZE:
value = *reinterpret_cast<int32_t*>(data + 1);
data += 5;
break;
default:
UNREACHABLE();
}
data++;
bool check_for_stop = operand_size() == DOUBLEWORD_SIZE;
Print("test%s %s,", operand_size_code(), Rax());
PrintImmediateValue(value);
data += PrintImmediate(data, operand_size());
if (check_for_stop) {
CheckPrintStop(data);
}


@@ -1342,7 +1342,7 @@ void FlowGraphCompiler::EmitOptimizedStaticCall(
(isolate()->reify_generic_functions() && function.IsGeneric())) {
__ LoadObject(R10, arguments_descriptor);
} else {
__ xorq(R10, R10); // GC safe smi zero because of stub.
__ xorl(R10, R10); // GC safe smi zero because of stub.
}
// Do not use the code from the function, but let the code be patched so that
// we can record the outgoing edges to other code.
@@ -1532,7 +1532,7 @@ void ParallelMoveResolver::EmitMove(int index) {
const Object& constant = source.constant();
if (destination.IsRegister()) {
if (constant.IsSmi() && (Smi::Cast(constant).Value() == 0)) {
__ xorq(destination.reg(), destination.reg());
__ xorl(destination.reg(), destination.reg());
} else if (constant.IsSmi() &&
(source.constant_instruction()->representation() ==
kUnboxedInt32)) {


@@ -199,7 +199,7 @@ void Intrinsifier::GrowableArray_add(Assembler* assembler) {
__ jmp(&done, Assembler::kNearJump); \
\
__ Bind(&size_tag_overflow); \
__ movq(RDI, Immediate(0)); \
__ LoadImmediate(RDI, Immediate(0)); \
__ Bind(&done); \
\
/* Get the class index and insert it into the tags. */ \
@@ -1768,7 +1768,7 @@ void GenerateSubstringMatchesSpecialization(Assembler* assembler,
__ SmiUntag(RBX); // start
__ SmiUntag(R9); // other.length
__ movq(R11, Immediate(0)); // i = 0
__ LoadImmediate(R11, Immediate(0)); // i = 0
// do
Label loop;


@@ -4141,8 +4141,8 @@ class Instructions : public Object {
static const intptr_t kCheckedEntryOffset = 0;
static const intptr_t kUncheckedEntryOffset = 0;
#elif defined(TARGET_ARCH_X64)
static const intptr_t kCheckedEntryOffset = 16;
static const intptr_t kUncheckedEntryOffset = 38;
static const intptr_t kCheckedEntryOffset = 15;
static const intptr_t kUncheckedEntryOffset = 34;
#elif defined(TARGET_ARCH_ARM)
static const intptr_t kCheckedEntryOffset = 8;
static const intptr_t kUncheckedEntryOffset = 32;


@@ -671,7 +671,7 @@ void StubCode::GenerateAllocateArrayStub(Assembler* assembler) {
__ jmp(&done, Assembler::kNearJump);
__ Bind(&size_tag_overflow);
__ movq(RDI, Immediate(0));
__ LoadImmediate(RDI, Immediate(0));
__ Bind(&done);
// Get the class index and insert it into the tags.
@@ -820,7 +820,7 @@ void StubCode::GenerateInvokeDartCodeStub(Assembler* assembler) {
Label push_arguments;
Label done_push_arguments;
__ j(ZERO, &done_push_arguments, Assembler::kNearJump);
__ movq(RAX, Immediate(0));
__ LoadImmediate(RAX, Immediate(0));
__ Bind(&push_arguments);
__ pushq(Address(RDX, RAX, TIMES_8, 0));
__ incq(RAX);
@@ -925,7 +925,7 @@ void StubCode::GenerateAllocateContextStub(Assembler* assembler) {
__ Bind(&size_tag_overflow);
// Set overflow size tag value.
__ movq(R13, Immediate(0));
__ LoadImmediate(R13, Immediate(0));
__ Bind(&done);
// RAX: new object.