[vm, compiler] Use more compressed instructions on RISC-V.

dart2js.aot.rv64 25290024 -> 25042040 (-0.98%)
dart2js.aot.rv32 24466620 -> 24000012 (-1.91%)

TEST=ci
Change-Id: I0b60d0c3bd8df036426898e00cf650b398abb397
Reviewed-on: https://dart-review.googlesource.com/c/sdk/+/242065
Commit-Queue: Ryan Macnak <rmacnak@google.com>
Reviewed-by: Alexander Markov <alexmarkov@google.com>
This commit is contained in:
Ryan Macnak 2022-04-27 17:11:35 +00:00 committed by Commit Bot
parent 5748add35c
commit ad4126bba4
13 changed files with 90 additions and 32 deletions

View file

@ -3704,8 +3704,33 @@ void Assembler::LeaveDartFrame() {
lx(FP, Address(SP, 0 * target::kWordSize));
lx(RA, Address(SP, 1 * target::kWordSize));
addi(SP, SP, 2 * target::kWordSize);
}
// TODO(riscv): When we know the stack depth, we can avoid updating SP twice.
void Assembler::LeaveDartFrame(intptr_t fp_sp_dist) {
// Tears down a Dart frame when the FP-to-SP distance is known at compile
// time. fp_sp_dist is the byte offset of SP below FP (non-positive), which
// lets the saved PP/FP/RA slots be loaded SP-relative and SP be restored
// with a single addi, instead of the two SP updates in LeaveDartFrame().
intptr_t pp_offset =
target::frame_layout.saved_caller_pp_from_fp * target::kWordSize -
fp_sp_dist;
intptr_t fp_offset =
target::frame_layout.saved_caller_fp_from_fp * target::kWordSize -
fp_sp_dist;
intptr_t ra_offset =
target::frame_layout.saved_caller_pc_from_fp * target::kWordSize -
fp_sp_dist;
// All three offsets must fit a RISC-V I-type (12-bit signed) immediate;
// otherwise fall back to the two-step epilogue.
if (!IsITypeImm(pp_offset) || !IsITypeImm(fp_offset) ||
!IsITypeImm(ra_offset)) {
// Shorter to update SP twice than generate large immediates.
LeaveDartFrame();
return;
}
if (!FLAG_precompiled_mode) {
// Restore the caller's pool pointer and strip the heap-object tag that
// the saved value carries.
lx(PP, Address(SP, pp_offset));
subi(PP, PP, kHeapObjectTag);
}
set_constant_pool_allowed(false);
lx(FP, Address(SP, fp_offset));
lx(RA, Address(SP, ra_offset));
// Pop the entire frame in one step (fp_sp_dist <= 0, so this adds).
addi(SP, SP, -fp_sp_dist);
}
void Assembler::CallRuntime(const RuntimeEntry& entry,

View file

@ -1233,6 +1233,7 @@ class Assembler : public MicroAssembler {
void EnterDartFrame(intptr_t frame_size, Register new_pp = kNoRegister);
void EnterOsrFrame(intptr_t extra_size, Register new_pp = kNoRegister);
void LeaveDartFrame();
void LeaveDartFrame(intptr_t fp_sp_dist);
// For non-leaf runtime calls. For leaf runtime calls, use LeafRuntimeScope.
void CallRuntime(const RuntimeEntry& entry, intptr_t argument_count);

View file

@ -1584,7 +1584,8 @@ bool FlowGraphCompiler::NeedsEdgeCounter(BlockEntryInstr* block) {
// Allocate a register that is not explicitly blocked.
static Register AllocateFreeRegister(bool* blocked_registers) {
for (intptr_t regno = 0; regno < kNumberOfCpuRegisters; regno++) {
for (intptr_t i = 0; i < kNumberOfCpuRegisters; i++) {
intptr_t regno = (i + kRegisterAllocationBias) % kNumberOfCpuRegisters;
if (!blocked_registers[regno]) {
blocked_registers[regno] = true;
return static_cast<Register>(regno);
@ -1615,22 +1616,6 @@ void FlowGraphCompiler::AllocateRegistersLocally(Instruction* instr) {
bool blocked_registers[kNumberOfCpuRegisters];
bool blocked_fpu_registers[kNumberOfFpuRegisters];
// Connect input with peephole output for some special cases. All other
// cases are handled by simply allocating registers and generating code.
if (top_of_stack_ != nullptr) {
const intptr_t p = locs->input_count() - 1;
Location peephole = top_of_stack_->locs()->out(0);
if ((instr->RequiredInputRepresentation(p) == kTagged) &&
(locs->in(p).IsUnallocated() || locs->in(p).IsConstant())) {
// If input is unallocated, match with an output register, if set. Also,
// if input is a direct constant, but the peephole output is a register,
// use that register to avoid wasting the already generated code.
if (peephole.IsRegister()) {
locs->set_in(p, Location::RegisterLocation(peephole.reg()));
}
}
}
// Block all registers globally reserved by the assembler, etc and mark
// the rest as free.
for (intptr_t i = 0; i < kNumberOfCpuRegisters; i++) {
@ -1671,6 +1656,23 @@ void FlowGraphCompiler::AllocateRegistersLocally(Instruction* instr) {
}
}
// Connect input with peephole output for some special cases. All other
// cases are handled by simply allocating registers and generating code.
if (top_of_stack_ != nullptr) {
const intptr_t p = locs->input_count() - 1;
Location peephole = top_of_stack_->locs()->out(0);
if ((instr->RequiredInputRepresentation(p) == kTagged) &&
(locs->in(p).IsUnallocated() || locs->in(p).IsConstant())) {
// If input is unallocated, match with an output register, if set. Also,
// if input is a direct constant, but the peephole output is a register,
// use that register to avoid wasting the already generated code.
if (peephole.IsRegister() && !blocked_registers[peephole.reg()]) {
locs->set_in(p, Location::RegisterLocation(peephole.reg()));
blocked_registers[peephole.reg()] = true;
}
}
}
if (locs->out(0).IsRegister()) {
// Fixed output registers are allowed to overlap with
// temps and inputs.

View file

@ -346,14 +346,16 @@ void FlowGraphCompiler::EmitPrologue() {
}
__ Comment("Initialize spill slots");
const intptr_t fp_to_sp_delta =
num_locals + compiler::target::frame_layout.dart_fixed_frame_size;
for (intptr_t i = 0; i < num_locals; ++i) {
const intptr_t slot_index =
compiler::target::frame_layout.FrameSlotForVariableIndex(-i);
Register value_reg =
slot_index == args_desc_slot ? ARGS_DESC_REG : NULL_REG;
__ StoreToOffset(value_reg, FP, slot_index * kWordSize);
// TODO(riscv): Using an SP-relative address instead of an FP-relative
// address would allow for compressed instructions.
// SP-relative addresses allow for compressed instructions.
__ StoreToOffset(value_reg, SP,
(slot_index + fp_to_sp_delta) * kWordSize);
}
}

View file

@ -469,14 +469,14 @@ void ReturnInstr::EmitNativeCode(FlowGraphCompiler* compiler) {
return;
}
#if defined(DEBUG)
compiler::Label stack_ok;
__ Comment("Stack Check");
const intptr_t fp_sp_dist =
(compiler::target::frame_layout.first_local_from_fp + 1 -
compiler->StackSize()) *
kWordSize;
ASSERT(fp_sp_dist <= 0);
#if defined(DEBUG)
compiler::Label stack_ok;
__ Comment("Stack Check");
__ sub(TMP, SP, FP);
__ CompareImmediate(TMP, fp_sp_dist);
__ BranchIf(EQ, &stack_ok, compiler::Assembler::kNearJump);
@ -487,7 +487,7 @@ void ReturnInstr::EmitNativeCode(FlowGraphCompiler* compiler) {
if (yield_index() != UntaggedPcDescriptors::kInvalidYieldIndex) {
compiler->EmitYieldPositionMetadata(source(), yield_index());
}
__ LeaveDartFrame(); // Disallows constant pool use.
__ LeaveDartFrame(fp_sp_dist); // Disallows constant pool use.
__ ret();
// This ReturnInstr may be emitted out of order by the optimizer. The next
// block may be a target expecting a properly set constant pool pointer.

View file

@ -2185,7 +2185,8 @@ bool FlowGraphAllocator::AllocateFreeRegister(LiveRange* unallocated) {
TRACE_ALLOC(THR_Print("found hint %s for v%" Pd ": free until %" Pd "\n",
hint.Name(), unallocated->vreg(), free_until));
} else {
for (intptr_t reg = 0; reg < NumberOfRegisters(); ++reg) {
for (intptr_t i = 0; i < NumberOfRegisters(); ++i) {
intptr_t reg = (i + kRegisterAllocationBias) % NumberOfRegisters();
if (!blocked_registers_[reg] && (registers_[reg]->length() == 0)) {
candidate = reg;
free_until = kMaxPosition;
@ -2196,7 +2197,8 @@ bool FlowGraphAllocator::AllocateFreeRegister(LiveRange* unallocated) {
ASSERT(0 <= kMaxPosition);
if (free_until != kMaxPosition) {
for (intptr_t reg = 0; reg < NumberOfRegisters(); ++reg) {
for (intptr_t i = 0; i < NumberOfRegisters(); ++i) {
intptr_t reg = (i + kRegisterAllocationBias) % NumberOfRegisters();
if (blocked_registers_[reg] || (reg == candidate)) continue;
const intptr_t intersection =
FirstIntersectionWithAllocated(reg, unallocated);
@ -2258,7 +2260,8 @@ bool FlowGraphAllocator::AllocateFreeRegister(LiveRange* unallocated) {
unallocated->vreg(),
extra_loop_info_[loop_info->id()]->start,
extra_loop_info_[loop_info->id()]->end));
for (intptr_t reg = 0; reg < NumberOfRegisters(); ++reg) {
for (intptr_t i = 0; i < NumberOfRegisters(); ++i) {
intptr_t reg = (i + kRegisterAllocationBias) % NumberOfRegisters();
if (blocked_registers_[reg] || (reg == candidate) ||
used_on_backedge[reg]) {
continue;
@ -2371,7 +2374,8 @@ void FlowGraphAllocator::AllocateAnyRegister(LiveRange* unallocated) {
intptr_t free_until = 0;
intptr_t blocked_at = kMaxPosition;
for (int reg = 0; reg < NumberOfRegisters(); ++reg) {
for (int i = 0; i < NumberOfRegisters(); ++i) {
int reg = (i + kRegisterAllocationBias) % NumberOfRegisters();
if (blocked_registers_[reg]) continue;
if (UpdateFreeUntil(reg, unallocated, &free_until, &blocked_at)) {
candidate = reg;
@ -2581,7 +2585,8 @@ void FlowGraphAllocator::ConvertAllUses(LiveRange* range) {
}
void FlowGraphAllocator::AdvanceActiveIntervals(const intptr_t start) {
for (intptr_t reg = 0; reg < NumberOfRegisters(); reg++) {
for (intptr_t i = 0; i < NumberOfRegisters(); ++i) {
intptr_t reg = (i + kRegisterAllocationBias) % NumberOfRegisters();
if (registers_[reg]->is_empty()) continue;
intptr_t first_evicted = -1;
@ -2704,7 +2709,8 @@ void FlowGraphAllocator::PrepareForAllocation(
ASSERT(unallocated_.is_empty());
unallocated_.AddArray(unallocated);
for (intptr_t reg = 0; reg < number_of_registers; reg++) {
for (intptr_t i = 0; i < NumberOfRegisters(); ++i) {
intptr_t reg = (i + kRegisterAllocationBias) % NumberOfRegisters();
blocked_registers_[reg] = blocked_registers[reg];
ASSERT(registers_[reg]->is_empty());

View file

@ -574,6 +574,8 @@ constexpr RegList kDartAvailableCpuRegs =
kAllCpuRegistersList & ~kReservedCpuRegisters;
constexpr int kNumberOfDartAvailableCpuRegs =
kNumberOfCpuRegisters - kNumberOfReservedCpuRegisters;
// No reason to prefer certain registers on ARM.
constexpr int kRegisterAllocationBias = 0;
const intptr_t kStoreBufferWrapperSize = 24;
// Registers available to Dart that are not preserved by runtime calls.
const RegList kDartVolatileCpuRegs =

View file

@ -431,6 +431,8 @@ const RegList kDartAvailableCpuRegs =
kAllCpuRegistersList & ~kReservedCpuRegisters;
constexpr int kNumberOfDartAvailableCpuRegs =
kNumberOfCpuRegisters - kNumberOfReservedCpuRegisters;
// No reason to prefer certain registers on ARM64.
constexpr int kRegisterAllocationBias = 0;
// Registers available to Dart that are not preserved by runtime calls.
const RegList kDartVolatileCpuRegs =
kDartAvailableCpuRegs & ~kAbiPreservedCpuRegs;

View file

@ -278,6 +278,8 @@ const intptr_t kReservedCpuRegisters = (1 << SPREG) | (1 << FPREG) | (1 << THR);
// CPU registers available to Dart allocator.
const RegList kDartAvailableCpuRegs =
kAllCpuRegistersList & ~kReservedCpuRegisters;
// No reason to prefer certain registers on IA32.
constexpr int kRegisterAllocationBias = 0;
const RegList kAbiPreservedCpuRegs = (1 << EDI) | (1 << ESI) | (1 << EBX);

View file

@ -411,6 +411,8 @@ constexpr intptr_t kNumberOfReservedCpuRegisters = 14;
constexpr RegList kDartAvailableCpuRegs =
kAllCpuRegistersList & ~kReservedCpuRegisters;
constexpr int kNumberOfDartAvailableCpuRegs = 18;
// Registers X8-15 (S0-1,A0-5) have more compressed instructions available.
constexpr int kRegisterAllocationBias = 8;
// Registers available to Dart that are not preserved by runtime calls.
constexpr RegList kDartVolatileCpuRegs =
kDartAvailableCpuRegs & ~kAbiPreservedCpuRegs;

View file

@ -355,6 +355,8 @@ const RegList kDartAvailableCpuRegs =
kAllCpuRegistersList & ~kReservedCpuRegisters;
constexpr int kNumberOfDartAvailableCpuRegs =
kNumberOfCpuRegisters - kNumberOfReservedCpuRegisters;
// Low numbered registers sometimes require fewer prefixes.
constexpr int kRegisterAllocationBias = 0;
constexpr int kStoreBufferWrapperSize = 13;
#if defined(DART_TARGET_OS_WINDOWS)

View file

@ -42,6 +42,12 @@ struct UntaggedFrame {
// The offset (in words) from FP to the saved pool (if applicable).
int saved_caller_pp_from_fp;
// The offset (in words) from FP to the saved FP.
int saved_caller_fp_from_fp;
// The offset (in words) from FP to the saved return address.
int saved_caller_pc_from_fp;
// The offset (in words) from FP to the code object (if applicable).
int code_from_fp;

View file

@ -34,6 +34,8 @@ const UntaggedFrame invalid_frame_layout = {
/*.first_local_from_fp = */ -1,
/*.dart_fixed_frame_size = */ -1,
/*.saved_caller_pp_from_fp = */ -1,
/*.saved_caller_fp_from_fp = */ -1,
/*.saved_caller_pc_from_fp = */ -1,
/*.code_from_fp = */ -1,
/*.exit_link_slot_from_entry_fp = */ -1,
};
@ -46,6 +48,8 @@ const UntaggedFrame default_frame_layout = {
/*.first_local_from_fp = */ kFirstLocalSlotFromFp,
/*.dart_fixed_frame_size = */ kDartFrameFixedSize,
/*.saved_caller_pp_from_fp = */ kSavedCallerPpSlotFromFp,
/*.saved_caller_fp_from_fp = */ kSavedCallerFpSlotFromFp,
/*.saved_caller_pc_from_fp = */ kSavedCallerPcSlotFromFp,
/*.code_from_fp = */ kPcMarkerSlotFromFp,
/*.exit_link_slot_from_entry_fp = */ kExitLinkSlotFromEntryFp,
};
@ -60,7 +64,9 @@ const UntaggedFrame bare_instructions_frame_layout = {
/*.dart_fixed_frame_size =*/kDartFrameFixedSize -
2, // No saved CODE, PP slots.
/*.saved_caller_pp_from_fp = */ 0, // No saved PP slot.
/*.code_from_fp = */ 0, // No saved CODE
/*.saved_caller_fp_from_fp = */ kSavedCallerFpSlotFromFp,
/*.saved_caller_pc_from_fp = */ kSavedCallerPcSlotFromFp,
/*.code_from_fp = */ 0, // No saved CODE
/*.exit_link_slot_from_entry_fp = */ kExitLinkSlotFromEntryFp,
};