[VM] Improve performance of allocators on ARM in non-product mode

This CL improves the performance of the allocation statistics counters
on ARM by removing duplicated loads and by increasing the distance
between dependent loads. These counters are part of the allocator fast
path in non-product mode.
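
In rough outline, the change replaces the first pattern below with the
second (a self-contained C++ sketch of the idea only; Stats, kTraceMask
and the helper names are illustrative stand-ins, not the VM's
ClassHeapStats or assembler macros):

  #include <cstdint>

  struct Stats { uint32_t state, new_count, new_size; };  // stand-in
  constexpr uint32_t kTraceMask = 1u;

  // Before: the trace check loads the stats address and immediately
  // dereferences it (two dependent loads back to back), and the counter
  // update later loads the same address a second time.
  inline bool TryUpdateStatsOld(Stats* const* table, int cid, uint32_t size) {
    if (table[cid]->state & kTraceMask) return false;  // addr load + dependent load
    // ... heap-top load, bump, range check ...
    Stats* stats = table[cid];                         // duplicated address load
    stats->new_count += 1;
    stats->new_size += size;
    return true;
  }

  // After: the address is loaded once, early; the dependent load is issued
  // later, with the rest of the allocation fast path in between to hide the
  // load-to-use latency.
  inline bool TryUpdateStatsNew(Stats* const* table, int cid, uint32_t size) {
    Stats* stats = table[cid];                         // single address load
    // ... heap-top load, bump, range check ...
    if (stats->state & kTraceMask) return false;       // dependent load, far away
    stats->new_count += 1;
    stats->new_size += size;
    return true;                                       // false -> take slow path
  }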

This change improves the performance of the gestures/velocity_tracker_bench
Flutter micro-benchmark in 'flutter run --profile' mode:

Before: 3352 µs
After:  3156 µs (-5.8%)
(minimum of 5 runs)
Change-Id: Ic7998318d9ca3e7997379d0054faaf5b0b569bb6
Reviewed-on: https://dart-review.googlesource.com/15640
Reviewed-by: Zach Anderson <zra@google.com>
Reviewed-by: Siva Annamalai <asiva@google.com>
Commit-Queue: Alexander Markov <alexmarkov@google.com>
Author: Alexander Markov
Date: 2017-10-20 18:01:49 +00:00
Committed by: commit-bot@chromium.org
Parent: d0de522142
Commit: 841b1ad814
4 changed files with 31 additions and 27 deletions


@@ -3093,13 +3093,12 @@ void Assembler::MonomorphicCheckedEntry() {
 }
 
 #ifndef PRODUCT
-void Assembler::MaybeTraceAllocation(intptr_t cid,
-                                     Register temp_reg,
-                                     Label* trace) {
-  LoadAllocationStatsAddress(temp_reg, cid);
+void Assembler::MaybeTraceAllocation(Register stats_addr_reg, Label* trace) {
+  ASSERT(stats_addr_reg != kNoRegister);
+  ASSERT(stats_addr_reg != TMP);
   const uword state_offset = ClassHeapStats::state_offset();
-  ldr(temp_reg, Address(temp_reg, state_offset));
-  tst(temp_reg, Operand(ClassHeapStats::TraceAllocationMask()));
+  ldr(TMP, Address(stats_addr_reg, state_offset));
+  tst(TMP, Operand(ClassHeapStats::TraceAllocationMask()));
   b(trace, NE);
 }
 
@@ -3165,10 +3164,7 @@ void Assembler::TryAllocate(const Class& cls,
     ASSERT(instance_reg != temp_reg);
     ASSERT(temp_reg != IP);
     ASSERT(instance_size != 0);
-    // If this allocation is traced, program will jump to failure path
-    // (i.e. the allocation stub) which will allocate the object and trace the
-    // allocation call site.
-    NOT_IN_PRODUCT(MaybeTraceAllocation(cls.id(), temp_reg, failure));
+    NOT_IN_PRODUCT(LoadAllocationStatsAddress(temp_reg, cls.id()));
     NOT_IN_PRODUCT(Heap::Space space = Heap::kNew);
     ldr(instance_reg, Address(THR, Thread::top_offset()));
     // TODO(koda): Protect against unsigned overflow here.
@@ -3180,10 +3176,14 @@ void Assembler::TryAllocate(const Class& cls,
     // fail if heap end unsigned less than or equal to instance_reg.
     b(failure, LS);
 
+    // If this allocation is traced, program will jump to failure path
+    // (i.e. the allocation stub) which will allocate the object and trace the
+    // allocation call site.
+    NOT_IN_PRODUCT(MaybeTraceAllocation(temp_reg, failure));
+
     // Successfully allocated the object, now update top to point to
     // next object start and store the class in the class field of object.
     str(instance_reg, Address(THR, Thread::top_offset()));
-    NOT_IN_PRODUCT(LoadAllocationStatsAddress(temp_reg, cls.id()));
 
     ASSERT(instance_size >= kHeapObjectTag);
     AddImmediate(instance_reg, -instance_size + kHeapObjectTag);
@@ -3209,10 +3209,7 @@ void Assembler::TryAllocateArray(intptr_t cid,
                                  Register temp1,
                                  Register temp2) {
   if (FLAG_inline_alloc && Heap::IsAllocatableInNewSpace(instance_size)) {
-    // If this allocation is traced, program will jump to failure path
-    // (i.e. the allocation stub) which will allocate the object and trace the
-    // allocation call site.
-    NOT_IN_PRODUCT(MaybeTraceAllocation(cid, temp1, failure));
+    NOT_IN_PRODUCT(LoadAllocationStatsAddress(temp1, cid));
     NOT_IN_PRODUCT(Heap::Space space = Heap::kNew);
     // Potential new object start.
     ldr(instance, Address(THR, Thread::top_offset()));
@@ -3226,7 +3223,10 @@ void Assembler::TryAllocateArray(intptr_t cid,
     cmp(end_address, Operand(temp2));
     b(failure, CS);
 
-    NOT_IN_PRODUCT(LoadAllocationStatsAddress(temp2, cid));
+    // If this allocation is traced, program will jump to failure path
+    // (i.e. the allocation stub) which will allocate the object and trace the
+    // allocation call site.
+    NOT_IN_PRODUCT(MaybeTraceAllocation(temp1, failure));
 
     // Successfully allocated the object(s), now update top to point to
     // next object start and initialize the object.
@@ -3238,11 +3238,11 @@ void Assembler::TryAllocateArray(intptr_t cid,
     uint32_t tags = 0;
     tags = RawObject::ClassIdTag::update(cid, tags);
     tags = RawObject::SizeTag::update(instance_size, tags);
-    LoadImmediate(temp1, tags);
-    str(temp1, FieldAddress(instance, Array::tags_offset()));  // Store tags.
+    LoadImmediate(temp2, tags);
+    str(temp2, FieldAddress(instance, Array::tags_offset()));  // Store tags.
 
-    LoadImmediate(temp1, instance_size);
-    NOT_IN_PRODUCT(IncrementAllocationStatsWithSize(temp2, temp1, space));
+    LoadImmediate(temp2, instance_size);
+    NOT_IN_PRODUCT(IncrementAllocationStatsWithSize(temp1, temp2, space));
   } else {
     b(failure);
   }


@@ -1020,7 +1020,7 @@ class Assembler : public ValueObject {
   void MonomorphicCheckedEntry();
 
   // The register into which the allocation stats table is loaded with
-  // LoadAllocationStatsAddress should be passed to
+  // LoadAllocationStatsAddress should be passed to MaybeTraceAllocation and
   // IncrementAllocationStats(WithSize) as stats_addr_reg to update the
   // allocation stats. These are separate assembler macros so we can
   // avoid a dependent load too nearby the load of the table address.
@@ -1069,9 +1069,9 @@
   void LoadWordUnaligned(Register dst, Register addr, Register tmp);
   void StoreWordUnaligned(Register src, Register addr, Register tmp);
 
-  // If allocation tracing for |cid| is enabled, will jump to |trace| label,
+  // If allocation tracing is enabled, will jump to |trace| label,
   // which will allocate in the runtime where tracing occurs.
-  void MaybeTraceAllocation(intptr_t cid, Register temp_reg, Label* trace);
+  void MaybeTraceAllocation(Register stats_addr_reg, Label* trace);
 
   // Inlined allocation of an instance of class 'cls', code has no runtime
   // calls. Jump to 'failure' if the instance cannot be allocated here.
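
For reference, the calling order this comment describes — load the stats
table address first, do the unrelated allocation work, then the dependent
trace check and counter update — can be modelled by the small
self-contained C++ sketch below. Stats, AllocateFastPath and the table
layout are stand-ins, not the VM's ClassHeapStats or the ARM assembler
macros:

  #include <cstdint>
  #include <cstdio>

  struct Stats { uint32_t state, new_count, new_size; };  // stand-in

  // Models the non-product statistics handling on an allocation fast path.
  bool AllocateFastPath(Stats* stats_table, int cid, uint32_t size) {
    Stats* stats = &stats_table[cid];  // LoadAllocationStatsAddress: address only
    // ... load heap top, compute the end address, range-check against heap end ...
    if (stats->state & 1u) {           // MaybeTraceAllocation: dependent load,
      return false;                    // issued long after the address load
    }
    // ... store the new top, write the object header/tags ...
    stats->new_count += 1;             // IncrementAllocationStatsWithSize
    stats->new_size += size;
    return true;                       // false means: go to the allocation stub
  }

  int main() {
    Stats table[10] = {};
    if (AllocateFastPath(table, 3, 64)) {
      std::printf("new_count=%u new_size=%u\n",
                  static_cast<unsigned>(table[3].new_count),
                  static_cast<unsigned>(table[3].new_size));
    }
    return 0;
  }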


@@ -154,7 +154,8 @@ void Intrinsifier::GrowableArray_add(Assembler* assembler) {
 #define TYPED_ARRAY_ALLOCATION(type_name, cid, max_len, scale_shift) \
   Label fall_through; \
   const intptr_t kArrayLengthStackOffset = 0 * kWordSize; \
-  NOT_IN_PRODUCT(__ MaybeTraceAllocation(cid, R2, &fall_through)); \
+  NOT_IN_PRODUCT(__ LoadAllocationStatsAddress(R2, cid)); \
+  NOT_IN_PRODUCT(__ MaybeTraceAllocation(R2, &fall_through)); \
   __ ldr(R2, Address(SP, kArrayLengthStackOffset)); /* Array length. */ \
   /* Check that length is a positive Smi. */ \
   /* R2: requested array length argument. */ \
@@ -1922,7 +1923,8 @@ static void TryAllocateOnebyteString(Assembler* assembler,
                                      Label* failure) {
   const Register length_reg = R2;
   Label fail;
-  NOT_IN_PRODUCT(__ MaybeTraceAllocation(kOneByteStringCid, R0, failure));
+  NOT_IN_PRODUCT(__ LoadAllocationStatsAddress(R0, kOneByteStringCid));
+  NOT_IN_PRODUCT(__ MaybeTraceAllocation(R0, failure));
   __ mov(R8, Operand(length_reg));  // Save the length register.
   // TODO(koda): Protect against negative length and overflow here.
   __ SmiUntag(length_reg);


@@ -669,7 +669,8 @@ void StubCode::GenerateAllocateArrayStub(Assembler* assembler) {
   __ b(&slow_case, GT);
 
   const intptr_t cid = kArrayCid;
-  NOT_IN_PRODUCT(__ MaybeTraceAllocation(cid, R4, &slow_case));
+  NOT_IN_PRODUCT(__ LoadAllocationStatsAddress(R4, cid));
+  NOT_IN_PRODUCT(__ MaybeTraceAllocation(R4, &slow_case));
 
   const intptr_t fixed_size_plus_alignment_padding =
       sizeof(RawArray) + kObjectAlignment - 1;
@@ -898,7 +899,8 @@ void StubCode::GenerateAllocateContextStub(Assembler* assembler) {
     ASSERT(kSmiTagShift == 1);
     __ bic(R2, R2, Operand(kObjectAlignment - 1));
 
-    NOT_IN_PRODUCT(__ MaybeTraceAllocation(kContextCid, R8, &slow_case));
+    NOT_IN_PRODUCT(__ LoadAllocationStatsAddress(R8, kContextCid));
+    NOT_IN_PRODUCT(__ MaybeTraceAllocation(R8, &slow_case));
     // Now allocate the object.
     // R1: number of context variables.
     // R2: object size.