[vm, compiler] Unroll object initialization to the allocation unit.

1/2 or 1/4 of the branches executed per object initialization, for uncompressed and compressed pointers respectively.

Note ARM32 already has double-wide initialization.
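
In effect, the stub initialization loops now store one allocation unit (kObjectAlignment bytes) per iteration instead of one word, and rely on the new allocation red zone to overshoot the object end safely. A minimal sketch of the idea in plain C++ (illustrative names only, not the actual stub code):

  #include <cstdint>

  // Old shape: one compare-and-branch per word stored.
  void FillPerWord(uintptr_t* cursor, uintptr_t* end, uintptr_t null_word) {
    while (cursor < end) {
      *cursor++ = null_word;
    }
  }

  // New shape: one compare-and-branch per allocation unit (two words here).
  // The loop may overshoot `end` by up to one unit; the caller must provide a
  // red zone past `end` (kAllocationRedZoneSize) so the overshoot is harmless.
  void FillPerAllocationUnit(uintptr_t* cursor, uintptr_t* end,
                             uintptr_t null_word) {
    do {
      cursor[0] = null_word;  // emitted as a single stp in the ARM64 stubs
      cursor[1] = null_word;
      cursor += 2;
    } while (cursor < end);
  }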

TEST=ci
Change-Id: Iacc61cac39f74191bb2e5e06cd48b90c006b8585
Reviewed-on: https://dart-review.googlesource.com/c/sdk/+/241622
Reviewed-by: Alexander Markov <alexmarkov@google.com>
Commit-Queue: Ryan Macnak <rmacnak@google.com>
Ryan Macnak 2022-04-26 21:49:34 +00:00 committed by Commit Bot
parent 63a2c57903
commit 19a59df399
9 changed files with 250 additions and 127 deletions
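
With compressed pointers, each 64-bit store still has to cover two 32-bit object slots, so the ARM64 stubs below first broadcast the 32-bit compressed null into both halves of a scratch register (the kWordOfNulls sequence). A rough C++ equivalent of that broadcast, with illustrative names:

  #include <cstdint>

  // Mirrors: andi(kWordOfNulls, NULL_REG, Immediate(0xFFFFFFFF));
  //          orr(kWordOfNulls, kWordOfNulls, Operand(kWordOfNulls, LSL, 32));
  uint64_t WordOfNulls(uint64_t null_reg) {
    uint64_t low = null_reg & 0xFFFFFFFFu;  // keep only the compressed null
    return low | (low << 32);               // duplicate it into the high half
  }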


@@ -2717,7 +2717,7 @@ class Assembler : public AssemblerBase {
opc = B31;
break;
case kFourBytes:
opc = B30;
opc = op == LDP ? B30 : 0;
break;
case kUnsignedFourBytes:
opc = 0;


@@ -751,6 +751,92 @@ ASSEMBLER_TEST_RUN(LoadStorePairOffset, test) {
"ret\n");
}
ASSEMBLER_TEST_GENERATE(LoadStorePairUnsigned32, assembler) {
__ SetupDartSP();
__ LoadImmediate(R2, 0xAABBCCDDEEFF9988);
__ LoadImmediate(R3, 0xBBCCDDEEFF998877);
__ sub(SP, SP, Operand(4 * target::kWordSize));
__ andi(CSP, SP, Immediate(~15)); // Must not access beyond CSP.
__ stp(R2, R3,
Address(SP, 2 * sizeof(uint32_t), Address::PairOffset,
compiler::kUnsignedFourBytes),
kUnsignedFourBytes);
__ ldp(R0, R1,
Address(SP, 2 * sizeof(uint32_t), Address::PairOffset,
kUnsignedFourBytes),
kUnsignedFourBytes);
__ add(SP, SP, Operand(4 * target::kWordSize));
__ sub(R0, R0, Operand(R1));
__ RestoreCSP();
__ ret();
}
ASSEMBLER_TEST_RUN(LoadStorePairUnsigned32, test) {
typedef int64_t (*Int64Return)() DART_UNUSED;
EXPECT_EQ(-278523631, EXECUTE_TEST_CODE_INT64(Int64Return, test->entry()));
EXPECT_DISASSEMBLY(
"mov sp, csp\n"
"sub csp, csp, #0x1000\n"
"movz r2, #0x9988\n"
"movk r2, #0xeeff lsl 16\n"
"movk r2, #0xccdd lsl 32\n"
"movk r2, #0xaabb lsl 48\n"
"movz r3, #0x8877\n"
"movk r3, #0xff99 lsl 16\n"
"movk r3, #0xddee lsl 32\n"
"movk r3, #0xbbcc lsl 48\n"
"sub sp, sp, #0x20\n"
"and csp, sp, 0xfffffffffffffff0\n"
"stpw r2, r3, [sp, #8]\n"
"ldpw r0, r1, [sp, #8]\n"
"add sp, sp, #0x20\n"
"sub r0, r0, r1\n"
"mov csp, sp\n"
"ret\n");
}
ASSEMBLER_TEST_GENERATE(LoadStorePairSigned32, assembler) {
__ SetupDartSP();
__ LoadImmediate(R2, 0xAABBCCDDEEFF9988);
__ LoadImmediate(R3, 0xBBCCDDEEFF998877);
__ sub(SP, SP, Operand(4 * target::kWordSize));
__ andi(CSP, SP, Immediate(~15)); // Must not access beyond CSP.
__ stp(R2, R3,
Address(SP, 2 * sizeof(int32_t), Address::PairOffset, kFourBytes),
kFourBytes);
__ ldp(R0, R1,
Address(SP, 2 * sizeof(int32_t), Address::PairOffset, kFourBytes),
kFourBytes);
__ add(SP, SP, Operand(4 * target::kWordSize));
__ sub(R0, R0, Operand(R1));
__ RestoreCSP();
__ ret();
}
ASSEMBLER_TEST_RUN(LoadStorePairSigned32, test) {
typedef int64_t (*Int64Return)() DART_UNUSED;
EXPECT_EQ(-278523631, EXECUTE_TEST_CODE_INT64(Int64Return, test->entry()));
EXPECT_DISASSEMBLY(
"mov sp, csp\n"
"sub csp, csp, #0x1000\n"
"movz r2, #0x9988\n"
"movk r2, #0xeeff lsl 16\n"
"movk r2, #0xccdd lsl 32\n"
"movk r2, #0xaabb lsl 48\n"
"movz r3, #0x8877\n"
"movk r3, #0xff99 lsl 16\n"
"movk r3, #0xddee lsl 32\n"
"movk r3, #0xbbcc lsl 48\n"
"sub sp, sp, #0x20\n"
"and csp, sp, 0xfffffffffffffff0\n"
"stpw r2, r3, [sp, #8]\n"
"ldpsw r0, r1, [sp, #8]\n"
"add sp, sp, #0x20\n"
"sub r0, r0, r1\n"
"mov csp, sp\n"
"ret\n");
}
ASSEMBLER_TEST_GENERATE(PushRegisterPair, assembler) {
__ SetupDartSP();
__ LoadImmediate(R2, 12);


@@ -552,7 +552,11 @@ int ARM64Decoder::FormatOption(Instr* instr, const char* format) {
ASSERT(STRING_STARTS_WITH(format, "opc"));
if (instr->Bit(26) == 0) {
if (instr->Bit(31) == 0) {
Print("w");
if (instr->Bit(30) == 1) {
Print("sw");
} else {
Print("w");
}
} else {
// 64-bit width is most commonly used, no need to print "x".
}


@@ -63,7 +63,7 @@ namespace compiler {
// constants and introduce compilation errors when used.
//
// target::kWordSize and target::ObjectAlignment give access to
// word size and object aligment offsets for the target.
// word size and object alignment offsets for the target.
//
// Similarly kHostWordSize gives access to the host word size.
class InvalidClass {};


@@ -1376,15 +1376,22 @@ void StubCodeCompiler::GenerateAllocateArrayStub(Assembler* assembler) {
target::Array::data_offset() - kHeapObjectTag);
// R3: iterator which initially points to the start of the variable
// data area to be initialized.
Label loop, done;
#if defined(DART_COMPRESSED_POINTERS)
const Register kWordOfNulls = TMP;
__ andi(kWordOfNulls, NULL_REG, Immediate(0xFFFFFFFF));
__ orr(kWordOfNulls, kWordOfNulls, Operand(kWordOfNulls, LSL, 32));
#else
const Register kWordOfNulls = NULL_REG;
#endif
Label loop;
__ Bind(&loop);
// TODO(cshapiro): StoreIntoObjectNoBarrier
ASSERT(target::kObjectAlignment == 2 * target::kWordSize);
__ stp(kWordOfNulls, kWordOfNulls,
Address(R3, 2 * target::kWordSize, Address::PairPostIndex));
// Safe to only check every kObjectAlignment bytes instead of each word.
ASSERT(kAllocationRedZoneSize >= target::kObjectAlignment);
__ CompareRegisters(R3, R7);
__ b(&done, CS);
__ str(NULL_REG, Address(R3), kObjectBytes); // Store if unsigned lower.
__ AddImmediate(R3, target::kCompressedWordSize);
__ b(&loop); // Loop until R3 == R7.
__ Bind(&done);
__ b(&loop, UNSIGNED_LESS);
// Done allocating and initializing the array.
// AllocateArrayABI::kResultReg: new object.
@@ -1697,17 +1704,25 @@ void StubCodeCompiler::GenerateAllocateContextStub(Assembler* assembler) {
// Initialize the context variables.
// R0: new object.
// R1: number of context variables.
{
Label loop, done;
__ AddImmediate(R3, R0,
target::Context::variable_offset(0) - kHeapObjectTag);
__ Bind(&loop);
__ subs(R1, R1, Operand(1));
__ b(&done, MI);
__ str(NULL_REG, Address(R3, R1, UXTX, Address::Scaled), kObjectBytes);
__ b(&loop, NE); // Loop if R1 not zero.
__ Bind(&done);
}
__ AddImmediate(R3, R0,
target::Context::variable_offset(0) - kHeapObjectTag);
#if defined(DART_COMPRESSED_POINTERS)
const Register kWordOfNulls = TMP;
__ andi(kWordOfNulls, NULL_REG, Immediate(0xFFFFFFFF));
__ orr(kWordOfNulls, kWordOfNulls, Operand(kWordOfNulls, LSL, 32));
#else
const Register kWordOfNulls = NULL_REG;
#endif
Label loop;
__ Bind(&loop);
ASSERT(target::kObjectAlignment == 2 * target::kWordSize);
__ stp(kWordOfNulls, kWordOfNulls,
Address(R3, 2 * target::kWordSize, Address::PairPostIndex));
// Safe to only check every kObjectAlignment bytes instead of each word.
ASSERT(kAllocationRedZoneSize >= target::kObjectAlignment);
__ subs(R1, R1,
Operand(target::kObjectAlignment / target::kCompressedWordSize));
__ b(&loop, HI);
// Done allocating and initializing the context.
// R0: new object.
@@ -2047,20 +2062,24 @@ static void GenerateAllocateObjectHelper(Assembler* assembler,
// Initialize the remaining words of the object.
{
const Register kFieldReg = R4;
__ AddImmediate(kFieldReg, AllocateObjectABI::kResultReg,
target::Instance::first_field_offset());
Label done, init_loop;
__ Bind(&init_loop);
#if defined(DART_COMPRESSED_POINTERS)
const Register kWordOfNulls = TMP;
__ andi(kWordOfNulls, NULL_REG, Immediate(0xFFFFFFFF));
__ orr(kWordOfNulls, kWordOfNulls, Operand(kWordOfNulls, LSL, 32));
#else
const Register kWordOfNulls = NULL_REG;
#endif
Label loop;
__ Bind(&loop);
ASSERT(target::kObjectAlignment == 2 * target::kWordSize);
__ stp(kWordOfNulls, kWordOfNulls,
Address(kFieldReg, 2 * target::kWordSize, Address::PairPostIndex));
// Safe to only check every kObjectAlignment bytes instead of each word.
ASSERT(kAllocationRedZoneSize >= target::kObjectAlignment);
__ CompareRegisters(kFieldReg, kNewTopReg);
__ b(&done, UNSIGNED_GREATER_EQUAL);
__ str(
NULL_REG,
Address(kFieldReg, target::kCompressedWordSize, Address::PostIndex),
kObjectBytes);
__ b(&init_loop);
__ Bind(&done);
__ b(&loop, UNSIGNED_LESS);
} // kFieldReg = R4
if (is_cls_parameterized) {
@@ -3832,14 +3851,12 @@ void StubCodeCompiler::GenerateAllocateTypedDataArrayStub(Assembler* assembler,
__ AddImmediate(R2, R0, target::TypedData::HeaderSize() - 1);
__ StoreInternalPointer(
R0, FieldAddress(R0, target::PointerBase::data_offset()), R2);
Label init_loop, done;
__ Bind(&init_loop);
Label loop;
__ Bind(&loop);
ASSERT(target::kObjectAlignment == 2 * target::kWordSize);
__ stp(ZR, ZR, Address(R2, 2 * target::kWordSize, Address::PairPostIndex));
__ cmp(R2, Operand(R1));
__ b(&done, CS);
__ str(ZR, Address(R2, 0));
__ add(R2, R2, Operand(target::kWordSize));
__ b(&init_loop);
__ Bind(&done);
__ b(&loop, UNSIGNED_LESS);
__ Ret();


@@ -928,17 +928,19 @@ void StubCodeCompiler::GenerateAllocateArrayStub(Assembler* assembler) {
__ leal(EBX, FieldAddress(AllocateArrayABI::kResultReg, EBX, TIMES_1, 0));
__ leal(EDI, FieldAddress(AllocateArrayABI::kResultReg,
target::Array::header_size()));
Label done;
Label init_loop;
__ Bind(&init_loop);
Label loop;
__ Bind(&loop);
for (intptr_t offset = 0; offset < target::kObjectAlignment;
offset += target::kWordSize) {
// No generational barrier needed, since we are storing null.
__ StoreIntoObjectNoBarrier(AllocateArrayABI::kResultReg,
Address(EDI, offset), NullObject());
}
// Safe to only check every kObjectAlignment bytes instead of each word.
ASSERT(kAllocationRedZoneSize >= target::kObjectAlignment);
__ addl(EDI, Immediate(target::kObjectAlignment));
__ cmpl(EDI, EBX);
__ j(ABOVE_EQUAL, &done, Assembler::kNearJump);
// No generational barrier needed, since we are storing null.
__ StoreIntoObjectNoBarrier(AllocateArrayABI::kResultReg, Address(EDI, 0),
NullObject());
__ addl(EDI, Immediate(target::kWordSize));
__ jmp(&init_loop, Assembler::kNearJump);
__ Bind(&done);
__ j(UNSIGNED_LESS, &loop);
__ ret();
// Unable to allocate the array using the fast inline code, just call
@@ -1570,16 +1572,18 @@ void StubCodeCompiler::GenerateAllocationStubForClass(
// ECX: next word to be initialized.
// AllocateObjectABI::kTypeArgumentsReg: new object type arguments
// (if is_cls_parameterized).
Label init_loop;
Label done;
__ Bind(&init_loop);
Label loop;
__ Bind(&loop);
for (intptr_t offset = 0; offset < target::kObjectAlignment;
offset += target::kWordSize) {
__ StoreIntoObjectNoBarrier(AllocateObjectABI::kResultReg,
Address(ECX, offset), NullObject());
}
// Safe to only check every kObjectAlignment bytes instead of each word.
ASSERT(kAllocationRedZoneSize >= target::kObjectAlignment);
__ addl(ECX, Immediate(target::kObjectAlignment));
__ cmpl(ECX, EBX);
__ j(ABOVE_EQUAL, &done, Assembler::kNearJump);
__ StoreIntoObjectNoBarrier(AllocateObjectABI::kResultReg,
Address(ECX, 0), NullObject());
__ addl(ECX, Immediate(target::kWordSize));
__ jmp(&init_loop, Assembler::kNearJump);
__ Bind(&done);
__ j(UNSIGNED_LESS, &loop);
}
if (is_cls_parameterized) {
// AllocateObjectABI::kResultReg: new object (tagged).
@@ -3066,14 +3070,17 @@ void StubCodeCompiler::GenerateAllocateTypedDataArrayStub(Assembler* assembler,
__ leal(EDI, FieldAddress(EAX, target::TypedData::HeaderSize()));
__ StoreInternalPointer(
EAX, FieldAddress(EAX, target::PointerBase::data_offset()), EDI);
Label done, init_loop;
__ Bind(&init_loop);
Label loop;
__ Bind(&loop);
for (intptr_t offset = 0; offset < target::kObjectAlignment;
offset += target::kWordSize) {
__ movl(Address(EDI, offset), ECX);
}
// Safe to only check every kObjectAlignment bytes instead of each word.
ASSERT(kAllocationRedZoneSize >= target::kObjectAlignment);
__ addl(EDI, Immediate(target::kObjectAlignment));
__ cmpl(EDI, EBX);
__ j(ABOVE_EQUAL, &done, Assembler::kNearJump);
__ movl(Address(EDI, 0), ECX);
__ addl(EDI, Immediate(target::kWordSize));
__ jmp(&init_loop, Assembler::kNearJump);
__ Bind(&done);
__ j(UNSIGNED_LESS, &loop);
__ ret();


@@ -1189,15 +1189,17 @@ void StubCodeCompiler::GenerateAllocateArrayStub(Assembler* assembler) {
target::Array::data_offset() - kHeapObjectTag);
// R3: iterator which initially points to the start of the variable
// data area to be initialized.
Label loop, done;
Label loop;
__ Bind(&loop);
// TODO(cshapiro): StoreIntoObjectNoBarrier
__ bgeu(T3, T4, &done);
__ sx(NULL_REG, Address(T3, 0));
__ sx(NULL_REG, Address(T3, target::kCompressedWordSize));
__ AddImmediate(T3, 2 * target::kCompressedWordSize);
__ j(&loop); // Loop until T3 == T4.
__ Bind(&done);
for (intptr_t offset = 0; offset < target::kObjectAlignment;
offset += target::kCompressedWordSize) {
__ StoreCompressedIntoObjectNoBarrier(AllocateArrayABI::kResultReg,
Address(T3, offset), NULL_REG);
}
// Safe to only check every kObjectAlignment bytes instead of each word.
ASSERT(kAllocationRedZoneSize >= target::kObjectAlignment);
__ addi(T3, T3, target::kObjectAlignment);
__ bltu(T3, T4, &loop);
// Done allocating and initializing the array.
// AllocateArrayABI::kResultReg: new object.
@@ -1873,15 +1875,18 @@ static void GenerateAllocateObjectHelper(Assembler* assembler,
__ AddImmediate(kFieldReg, AllocateObjectABI::kResultReg,
target::Instance::first_field_offset());
Label done, init_loop;
__ Bind(&init_loop);
__ CompareRegisters(kFieldReg, kNewTopReg);
__ BranchIf(UNSIGNED_GREATER_EQUAL, &done);
__ sx(NULL_REG, Address(kFieldReg, 0));
__ addi(kFieldReg, kFieldReg, target::kCompressedWordSize);
__ j(&init_loop);
__ Bind(&done);
Label loop;
__ Bind(&loop);
for (intptr_t offset = 0; offset < target::kObjectAlignment;
offset += target::kCompressedWordSize) {
__ StoreCompressedIntoObjectNoBarrier(AllocateObjectABI::kResultReg,
Address(kFieldReg, offset),
NULL_REG);
}
// Safe to only check every kObjectAlignment bytes instead of each word.
ASSERT(kAllocationRedZoneSize >= target::kObjectAlignment);
__ addi(kFieldReg, kFieldReg, target::kObjectAlignment);
__ bltu(kFieldReg, kNewTopReg, &loop);
} // kFieldReg = T4
if (is_cls_parameterized) {
@@ -3627,13 +3632,16 @@ void StubCodeCompiler::GenerateAllocateTypedDataArrayStub(Assembler* assembler,
__ AddImmediate(T3, A0, target::TypedData::HeaderSize() - 1);
__ StoreInternalPointer(
A0, FieldAddress(A0, target::PointerBase::data_offset()), T3);
Label init_loop, done;
__ Bind(&init_loop);
__ bgeu(T3, T4, &done);
__ sx(ZR, Address(T3, 0));
__ addi(T3, T3, target::kWordSize);
__ j(&init_loop);
__ Bind(&done);
Label loop;
__ Bind(&loop);
for (intptr_t offset = 0; offset < target::kObjectAlignment;
offset += target::kWordSize) {
__ sx(ZR, Address(T3, offset));
}
// Safe to only check every kObjectAlignment bytes instead of each word.
ASSERT(kAllocationRedZoneSize >= target::kObjectAlignment);
__ addi(T3, T3, target::kObjectAlignment);
__ bltu(T3, T4, &loop);
__ Ret();


@@ -1285,22 +1285,19 @@ void StubCodeCompiler::GenerateAllocateArrayStub(Assembler* assembler) {
__ LoadObject(R12, NullObject());
__ leaq(RDI, FieldAddress(AllocateArrayABI::kResultReg,
target::Array::header_size()));
Label done;
Label init_loop;
__ Bind(&init_loop);
Label loop;
__ Bind(&loop);
for (intptr_t offset = 0; offset < target::kObjectAlignment;
offset += target::kCompressedWordSize) {
// No generational barrier needed, since we are storing null.
__ StoreCompressedIntoObjectNoBarrier(AllocateArrayABI::kResultReg,
Address(RDI, offset), R12);
}
// Safe to only check every kObjectAlignment bytes instead of each word.
ASSERT(kAllocationRedZoneSize >= target::kObjectAlignment);
__ addq(RDI, Immediate(target::kObjectAlignment));
__ cmpq(RDI, RCX);
#if defined(DEBUG)
static auto const kJumpLength = Assembler::kFarJump;
#else
static auto const kJumpLength = Assembler::kNearJump;
#endif // DEBUG
__ j(ABOVE_EQUAL, &done, kJumpLength);
// No generational barrier needed, since we are storing null.
__ StoreCompressedIntoObjectNoBarrier(AllocateArrayABI::kResultReg,
Address(RDI, 0), R12);
__ addq(RDI, Immediate(target::kCompressedWordSize));
__ jmp(&init_loop, kJumpLength);
__ Bind(&done);
__ j(UNSIGNED_LESS, &loop);
__ ret();
// Unable to allocate the array using the fast inline code, just call
@@ -1977,21 +1974,19 @@ static void GenerateAllocateObjectHelper(Assembler* assembler,
__ LoadObject(kNullReg, NullObject());
// Loop until the whole object is initialized.
Label init_loop;
Label done;
__ Bind(&init_loop);
Label loop;
__ Bind(&loop);
for (intptr_t offset = 0; offset < target::kObjectAlignment;
offset += target::kCompressedWordSize) {
__ StoreCompressedIntoObjectNoBarrier(AllocateObjectABI::kResultReg,
Address(kNextFieldReg, offset),
kNullReg);
}
// Safe to only check every kObjectAlignment bytes instead of each word.
ASSERT(kAllocationRedZoneSize >= target::kObjectAlignment);
__ addq(kNextFieldReg, Immediate(target::kObjectAlignment));
__ cmpq(kNextFieldReg, kNewTopReg);
#if defined(DEBUG)
static auto const kJumpLength = Assembler::kFarJump;
#else
static auto const kJumpLength = Assembler::kNearJump;
#endif // DEBUG
__ j(ABOVE_EQUAL, &done, kJumpLength);
__ StoreCompressedIntoObjectNoBarrier(
AllocateObjectABI::kResultReg, Address(kNextFieldReg, 0), kNullReg);
__ addq(kNextFieldReg, Immediate(target::kCompressedWordSize));
__ jmp(&init_loop, Assembler::kNearJump);
__ Bind(&done);
__ j(UNSIGNED_LESS, &loop);
} // kNextFieldReg = RDI, kNullReg = R10
if (is_cls_parameterized) {
@@ -3774,18 +3769,19 @@ void StubCodeCompiler::GenerateAllocateTypedDataArrayStub(Assembler* assembler,
/* RDI: iterator which initially points to the start of the variable */
/* RBX: scratch register. */
/* data area to be initialized. */
__ xorq(RBX, RBX); /* Zero. */
__ pxor(XMM0, XMM0); /* Zero. */
__ leaq(RDI, FieldAddress(RAX, target::TypedData::HeaderSize()));
__ StoreInternalPointer(
RAX, FieldAddress(RAX, target::PointerBase::data_offset()), RDI);
Label done, init_loop;
__ Bind(&init_loop);
Label loop;
__ Bind(&loop);
ASSERT(target::kObjectAlignment == kFpuRegisterSize);
__ movups(Address(RDI, 0), XMM0);
// Safe to only check every kObjectAlignment bytes instead of each word.
ASSERT(kAllocationRedZoneSize >= target::kObjectAlignment);
__ addq(RDI, Immediate(target::kObjectAlignment));
__ cmpq(RDI, RCX);
__ j(ABOVE_EQUAL, &done, Assembler::kNearJump);
__ movq(Address(RDI, 0), RBX);
__ addq(RDI, Immediate(target::kWordSize));
__ jmp(&init_loop, Assembler::kNearJump);
__ Bind(&done);
__ j(UNSIGNED_LESS, &loop, Assembler::kNearJump);
__ ret();


@@ -33,6 +33,11 @@ static constexpr intptr_t kNewPageSize = 512 * KB;
static constexpr intptr_t kNewPageSizeInWords = kNewPageSize / kWordSize;
static constexpr intptr_t kNewPageMask = ~(kNewPageSize - 1);
// Simplify initialization in allocation stubs by ensuring it is safe
// to overshoot the object end by up to kAllocationRedZoneSize. (Just as the
// stack red zone allows one to overshoot the stack pointer.)
static constexpr intptr_t kAllocationRedZoneSize = kObjectAlignment;
// A page containing new generation objects.
class NewPage {
public:
@@ -40,7 +45,7 @@ class NewPage {
void Deallocate();
uword start() const { return memory_->start(); }
uword end() const { return memory_->end(); }
uword end() const { return memory_->end() - kAllocationRedZoneSize; }
bool Contains(uword addr) const { return memory_->Contains(addr); }
void WriteProtect(bool read_only) {
memory_->Protect(read_only ? VirtualMemory::kReadOnly