[VM] Pick more compact instruction encodings on x64

Several tricks here (a byte-count sketch follows the list):
* When zeroing registers we can use xorl instead of xorq, because writing the
  32-bit ('l') form zero-extends into the top 32 bits anyway.
* test and 'and' instructions with immediate arguments can use the 8-bit and
  32-bit variants more often.
* mov reg, immediate can use more compact encodings when sign-extension is not
  needed.
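A rough byte-count sketch of the wins, using the Assembler entry points this
change touches (counts follow the Intel SDM; the wrapper function and the
register choices are ours, for illustration only):

// Sketch: each call now picks the compact encoding noted in its comment.
static void EncodingSizeSketch(Assembler* assembler) {
  assembler->xorl(RAX, RAX);                    // 2 bytes; xorq needs REX.W, 3 bytes.
  assembler->movq(RCX, Immediate(0x12345678));  // B9 imm32, 5 bytes; was REX.W C7 imm32, 7 bytes.
  assembler->testq(RSI, Immediate(0x10));       // REX F6 imm8, 4 bytes; was REX.W F7 imm32, 7 bytes.
  assembler->andq(RDX, Immediate(0xFFFF));      // 81 /4 imm32, 6 bytes; was REX.W 81 /4 imm32, 7 bytes.
}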

Performance improves by more than 1% when measured on Dart2JS.

R=vegorov@google.com

The Intel optimization manual says: "Assembly/Compiler Coding Rule 64. (H impact, M
generality) Use the 32-bit versions of instructions in 64-bit mode to reduce
code size unless the 64-bit version is necessary to access 64-bit data or
additional registers."
Bug:
Change-Id: I2a989315c45f8d8ebab719653fbfa2b18ebb77c9
Reviewed-on: https://dart-review.googlesource.com/23400
Commit-Queue: Erik Corry <erikcorry@google.com>
Reviewed-by: Vyacheslav Egorov <vegorov@google.com>
Erik Corry 2017-11-24 10:53:17 +00:00 committed by commit-bot@chromium.org
parent 062e5d6db3
commit 1d9ff70a08
8 changed files with 462 additions and 370 deletions


@@ -328,16 +328,25 @@ void Assembler::movw(const Address& dst, const Immediate& imm) {
void Assembler::movq(Register dst, const Immediate& imm) {
AssemblerBuffer::EnsureCapacity ensured(&buffer_);
if (imm.is_int32()) {
if (imm.is_uint32()) {
// Pick single byte B8 encoding if possible. If dst < 8 then we also omit
// the Rex byte.
EmitRegisterREX(dst, REX_NONE);
EmitUint8(0xB8 | (dst & 7));
EmitUInt32(imm.value());
} else if (imm.is_int32()) {
// Sign extended C7 Cx encoding if we have a negative input.
Operand operand(dst);
EmitOperandREX(0, operand, REX_W);
EmitUint8(0xC7);
EmitOperand(0, operand);
EmitImmediate(imm);
} else {
// Full 64 bit immediate encoding.
EmitRegisterREX(dst, REX_W);
EmitUint8(0xB8 | (dst & 7));
EmitImmediate(imm);
}
EmitImmediate(imm);
}
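A sketch (ours, not part of the commit) of which branch the rewritten movq now
takes; registers below R8 are assumed so no REX.B prefix appears:

static void MovqImmediateCases(Assembler* assembler) {
  assembler->movq(RAX, Immediate(0x80000000));   // uint32 but not int32: B8 imm32, zero-extended.
  assembler->movq(RAX, Immediate(-1));           // negative int32: REX.W C7 imm32, sign-extended.
  assembler->movq(RAX, Immediate(0x100000000));  // needs all 64 bits: REX.W B8 imm64.
}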
// Use 0x89 encoding (instead of 0x8B encoding), which is expected by gdb64
@@ -1297,37 +1306,6 @@ void Assembler::testl(Register reg1, Register reg2) {
EmitOperand(reg1 & 7, operand);
}
void Assembler::testl(Register reg, const Immediate& imm) {
// TODO(kasperl): Deal with registers r8-r15 using the short
// encoding form of the immediate?
// We are using RBP for the exception marker. See testl(Label*).
ASSERT(reg != RBP);
AssemblerBuffer::EnsureCapacity ensured(&buffer_);
// For registers that have a byte variant (RAX, RBX, RCX, and RDX)
// we only test the byte register to keep the encoding short.
if (imm.is_uint8() && reg < 4) {
// Use zero-extended 8-bit immediate.
if (reg == RAX) {
EmitUint8(0xA8);
} else {
EmitUint8(0xF6);
EmitUint8(0xC0 + reg);
}
EmitUint8(imm.value() & 0xFF);
} else {
ASSERT(imm.is_int32());
if (reg == RAX) {
EmitUint8(0xA9);
} else {
EmitRegisterREX(reg, REX_NONE);
EmitUint8(0xF7);
EmitUint8(0xC0 | (reg & 7));
}
EmitImmediate(imm);
}
}
void Assembler::testb(const Address& address, const Immediate& imm) {
AssemblerBuffer::EnsureCapacity ensured(&buffer_);
EmitOperandREX(0, address, REX_NONE);
@@ -1346,27 +1324,37 @@ void Assembler::testq(Register reg1, Register reg2) {
}
void Assembler::testq(Register reg, const Immediate& imm) {
// TODO(kasperl): Deal with registers r8-r15 using the short
// encoding form of the immediate?
AssemblerBuffer::EnsureCapacity ensured(&buffer_);
// For registers that have a byte variant (RAX, RBX, RCX, and RDX)
// we only test the byte register to keep the encoding short.
if (imm.is_uint8() && reg < 4) {
if (imm.is_uint8()) {
// Use zero-extended 8-bit immediate.
if (reg >= 4) {
// We need the Rex byte to give access to the SIL and DIL registers (the
// low bytes of RSI and RDI).
EmitRegisterREX(reg, REX_NONE, /* force = */ true);
}
if (reg == RAX) {
EmitUint8(0xA8);
} else {
EmitUint8(0xF6);
EmitUint8(0xC0 + reg);
EmitUint8(0xC0 + (reg & 7));
}
EmitUint8(imm.value() & 0xFF);
} else {
ASSERT(imm.is_int32());
} else if (imm.is_uint32()) {
if (reg == RAX) {
EmitUint8(0xA9 | REX_W);
EmitUint8(0xA9);
} else {
EmitRegisterREX(reg, REX_NONE);
EmitUint8(0xF7);
EmitUint8(0xC0 | (reg & 7));
}
EmitUInt32(imm.value());
} else {
// Sign extended version of 32 bit test.
ASSERT(imm.is_int32());
EmitRegisterREX(reg, REX_W);
if (reg == RAX) {
EmitUint8(0xA9);
} else {
EmitRegisterREX(reg, REX_W);
EmitUint8(0xF7);
EmitUint8(0xC0 | (reg & 7));
}
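A sketch (ours) of the four shapes testq(reg, imm) can now emit; the register
and immediate choices are only illustrative:

static void TestqImmediateCases(Assembler* assembler) {
  assembler->testq(RCX, Immediate(0x10));    // uint8, legacy byte register CL: F6 /0 imm8.
  assembler->testq(RSI, Immediate(0x10));    // uint8, but a forced 0x40 REX byte selects SIL, not DH.
  assembler->testq(RCX, Immediate(0xF0F0));  // uint32: F7 /0 imm32 without REX.W.
  assembler->testq(RCX, Immediate(-0x10));   // negative int32: REX.W F7 /0 imm32, sign-extended.
}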
@@ -1375,7 +1363,7 @@ void Assembler::testq(Register reg, const Immediate& imm) {
}
void Assembler::TestImmediate(Register dst, const Immediate& imm) {
if (imm.is_int32()) {
if (imm.is_int32() || imm.is_uint32()) {
testq(dst, imm);
} else {
ASSERT(dst != TMP);
@@ -1444,7 +1432,27 @@ void Assembler::AluQ(uint8_t modrm_opcode,
uint8_t opcode,
Register dst,
const Immediate& imm) {
if (imm.is_int32()) {
if (modrm_opcode == 4 && imm.is_uint32()) {
// We can use andl for andq.
AssemblerBuffer::EnsureCapacity ensured(&buffer_);
EmitRegisterREX(dst, REX_NONE);
// Would like to use EmitComplex here, but it doesn't like uint32
// immediates.
if (imm.is_int8()) {
// Use sign-extended 8-bit immediate.
EmitUint8(0x83);
EmitOperand(modrm_opcode, Operand(dst));
EmitUint8(imm.value() & 0xFF);
} else {
if (dst == RAX) {
EmitUint8(0x25);
} else {
EmitUint8(0x81);
EmitOperand(modrm_opcode, Operand(dst));
}
EmitUInt32(imm.value());
}
} else if (imm.is_int32()) {
AssemblerBuffer::EnsureCapacity ensured(&buffer_);
EmitRegisterREX(dst, REX_W);
EmitComplex(modrm_opcode, Operand(dst), imm);
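Dropping REX.W in the new branch is sound because the 32-bit AND zero-extends
its result, which matches ANDing with a mask whose top 32 bits are already
zero. A sketch (ours) of the sub-cases it selects:

static void AndqImmediateCases(Assembler* assembler) {
  assembler->andq(RCX, Immediate(0x7F));    // int8: 83 /4 imm8, 3 bytes.
  assembler->andq(RAX, Immediate(0xFFFF));  // RAX shortcut: 25 imm32, 5 bytes.
  assembler->andq(RCX, Immediate(0xFFFF));  // general uint32: 81 /4 imm32, 6 bytes.
  assembler->andq(RCX, Immediate(-16));     // not uint32: still the REX.W EmitComplex path.
}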
@@ -1470,7 +1478,7 @@ void Assembler::AluQ(uint8_t modrm_opcode,
}
void Assembler::AndImmediate(Register dst, const Immediate& imm) {
if (imm.is_int32()) {
if (imm.is_int32() || imm.is_uint32()) {
andq(dst, imm);
} else {
ASSERT(dst != TMP);
@@ -2260,7 +2268,9 @@ intptr_t Assembler::FindImmediate(int64_t imm) {
}
void Assembler::LoadImmediate(Register reg, const Immediate& imm) {
if (imm.is_int32() || !constant_pool_allowed()) {
if (imm.value() == 0) {
xorl(reg, reg);
} else if (imm.is_int32() || !constant_pool_allowed()) {
movq(reg, imm);
} else {
int32_t offset = ObjectPool::element_offset(FindImmediate(imm.value()));
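A sketch (ours) of how zero and small-constant loads land after this change;
the header change below notes that, unlike movq, LoadImmediate may clobber the
flags:

static void LoadImmediateCases(Assembler* assembler) {
  assembler->LoadImmediate(RDI, Immediate(0));   // xorl RDI, RDI: 2 bytes, but sets flags.
  assembler->LoadImmediate(RDI, Immediate(16));  // falls through to movq: B8-form imm32, 5 bytes, flags untouched.
}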


@@ -33,6 +33,7 @@ class Immediate : public ValueObject {
bool is_uint8() const { return Utils::IsUint(8, value_); }
bool is_uint16() const { return Utils::IsUint(16, value_); }
bool is_int32() const { return Utils::IsInt(32, value_); }
bool is_uint32() const { return Utils::IsUint(32, value_); }
private:
const int64_t value_;
@@ -511,7 +512,7 @@ class Assembler : public ValueObject {
void CompareImmediate(const Address& address, const Immediate& imm);
void testl(Register reg1, Register reg2);
void testl(Register reg, const Immediate& imm);
void testl(Register reg, const Immediate& imm) { testq(reg, imm); }
void testb(const Address& address, const Immediate& imm);
void testq(Register reg1, Register reg2);
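testl(reg, imm) can now simply forward to testq(reg, imm): since testq drops
REX.W whenever the immediate fits in 32 unsigned bits, the forwarded call still
produces a plain 32-bit test. A sketch (ours, illustrative values):

static void TestlForwardingSketch(Assembler* assembler) {
  assembler->testl(RCX, Immediate(0xFF));    // F6 C1 FF -- same bytes as testq(RCX, Immediate(0xFF)).
  assembler->testl(RCX, Immediate(0xF0F0));  // F7 C1 imm32, no REX.W.
}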
@@ -712,7 +713,9 @@ class Assembler : public ValueObject {
bool constant_pool_allowed() const { return constant_pool_allowed_; }
void set_constant_pool_allowed(bool b) { constant_pool_allowed_ = b; }
// Unlike movq this can affect the flags or use the constant pool.
void LoadImmediate(Register reg, const Immediate& imm);
void LoadIsolate(Register dst);
void LoadObject(Register dst, const Object& obj);
void LoadUniqueObject(Register dst, const Object& obj);
@@ -1023,9 +1026,12 @@ class Assembler : public ValueObject {
inline void EmitUint8(uint8_t value);
inline void EmitInt32(int32_t value);
inline void EmitUInt32(uint32_t value);
inline void EmitInt64(int64_t value);
inline void EmitRegisterREX(Register reg, uint8_t rex);
inline void EmitRegisterREX(Register reg,
uint8_t rex,
bool force_emit = false);
inline void EmitOperandREX(int rm, const Operand& operand, uint8_t rex);
inline void EmitXmmRegisterOperand(int rm, XmmRegister reg);
inline void EmitFixup(AssemblerFixup* fixup);
@@ -1077,14 +1083,18 @@ inline void Assembler::EmitInt32(int32_t value) {
buffer_.Emit<int32_t>(value);
}
inline void Assembler::EmitUInt32(uint32_t value) {
buffer_.Emit<uint32_t>(value);
}
inline void Assembler::EmitInt64(int64_t value) {
buffer_.Emit<int64_t>(value);
}
inline void Assembler::EmitRegisterREX(Register reg, uint8_t rex) {
inline void Assembler::EmitRegisterREX(Register reg, uint8_t rex, bool force) {
ASSERT(reg != kNoRegister);
rex |= (reg > 7 ? REX_B : REX_NONE);
if (rex != REX_NONE) EmitUint8(REX_PREFIX | rex);
if (rex != REX_NONE || force) EmitUint8(REX_PREFIX | rex);
}
inline void Assembler::EmitOperandREX(int rm,

File diff suppressed because it is too large.


@@ -1156,6 +1156,8 @@ bool DisassemblerX64::DecodeInstructionType(uint8_t** data) {
// TODO(srdjan): Should we enable printing of REX.W?
// if (rex_w()) Print("REX.W ");
Print("%s%s", idesc.mnem, operand_size_code());
} else if (current == 0xC3 || current == 0xCC) {
Print("%s", idesc.mnem); // ret and int3 don't need a size specifier.
} else {
Print("%s%s", idesc.mnem, operand_size_code());
}
@@ -1215,7 +1217,7 @@ bool DisassemblerX64::DecodeInstructionType(uint8_t** data) {
}
case SHORT_IMMEDIATE_INSTR: {
Print("%s %s, ", idesc.mnem, Rax());
Print("%s%s %s,", idesc.mnem, operand_size_code(), Rax());
PrintImmediate(*data + 1, DOUBLEWORD_SIZE);
(*data) += 5;
break;
@@ -1901,27 +1903,10 @@ int DisassemblerX64::InstructionDecode(uword pc) {
break;
case 0xA9: {
int64_t value = 0;
bool check_for_stop = false;
switch (operand_size()) {
case WORD_SIZE:
value = *reinterpret_cast<uint16_t*>(data + 1);
data += 3;
break;
case DOUBLEWORD_SIZE:
value = *reinterpret_cast<uint32_t*>(data + 1);
data += 5;
check_for_stop = true;
break;
case QUADWORD_SIZE:
value = *reinterpret_cast<int32_t*>(data + 1);
data += 5;
break;
default:
UNREACHABLE();
}
data++;
bool check_for_stop = operand_size() == DOUBLEWORD_SIZE;
Print("test%s %s,", operand_size_code(), Rax());
PrintImmediateValue(value);
data += PrintImmediate(data, operand_size());
if (check_for_stop) {
CheckPrintStop(data);
}


@@ -1342,7 +1342,7 @@ void FlowGraphCompiler::EmitOptimizedStaticCall(
(isolate()->reify_generic_functions() && function.IsGeneric())) {
__ LoadObject(R10, arguments_descriptor);
} else {
__ xorq(R10, R10); // GC safe smi zero because of stub.
__ xorl(R10, R10); // GC safe smi zero because of stub.
}
// Do not use the code from the function, but let the code be patched so that
// we can record the outgoing edges to other code.
@@ -1532,7 +1532,7 @@ void ParallelMoveResolver::EmitMove(int index) {
const Object& constant = source.constant();
if (destination.IsRegister()) {
if (constant.IsSmi() && (Smi::Cast(constant).Value() == 0)) {
__ xorq(destination.reg(), destination.reg());
__ xorl(destination.reg(), destination.reg());
} else if (constant.IsSmi() &&
(source.constant_instruction()->representation() ==
kUnboxedInt32)) {


@@ -199,7 +199,7 @@ void Intrinsifier::GrowableArray_add(Assembler* assembler) {
__ jmp(&done, Assembler::kNearJump); \
\
__ Bind(&size_tag_overflow); \
__ movq(RDI, Immediate(0)); \
__ LoadImmediate(RDI, Immediate(0)); \
__ Bind(&done); \
\
/* Get the class index and insert it into the tags. */ \
@@ -1768,7 +1768,7 @@ void GenerateSubstringMatchesSpecialization(Assembler* assembler,
__ SmiUntag(RBX); // start
__ SmiUntag(R9); // other.length
__ movq(R11, Immediate(0)); // i = 0
__ LoadImmediate(R11, Immediate(0)); // i = 0
// do
Label loop;


@@ -4141,8 +4141,8 @@ class Instructions : public Object {
static const intptr_t kCheckedEntryOffset = 0;
static const intptr_t kUncheckedEntryOffset = 0;
#elif defined(TARGET_ARCH_X64)
static const intptr_t kCheckedEntryOffset = 16;
static const intptr_t kUncheckedEntryOffset = 38;
static const intptr_t kCheckedEntryOffset = 15;
static const intptr_t kUncheckedEntryOffset = 34;
#elif defined(TARGET_ARCH_ARM)
static const intptr_t kCheckedEntryOffset = 8;
static const intptr_t kUncheckedEntryOffset = 32;


@@ -671,7 +671,7 @@ void StubCode::GenerateAllocateArrayStub(Assembler* assembler) {
__ jmp(&done, Assembler::kNearJump);
__ Bind(&size_tag_overflow);
__ movq(RDI, Immediate(0));
__ LoadImmediate(RDI, Immediate(0));
__ Bind(&done);
// Get the class index and insert it into the tags.
@@ -820,7 +820,7 @@ void StubCode::GenerateInvokeDartCodeStub(Assembler* assembler) {
Label push_arguments;
Label done_push_arguments;
__ j(ZERO, &done_push_arguments, Assembler::kNearJump);
__ movq(RAX, Immediate(0));
__ LoadImmediate(RAX, Immediate(0));
__ Bind(&push_arguments);
__ pushq(Address(RDX, RAX, TIMES_8, 0));
__ incq(RAX);
@@ -925,7 +925,7 @@ void StubCode::GenerateAllocateContextStub(Assembler* assembler) {
__ Bind(&size_tag_overflow);
// Set overflow size tag value.
__ movq(R13, Immediate(0));
__ LoadImmediate(R13, Immediate(0));
__ Bind(&done);
// RAX: new object.