[vm] Use codegen block order in regalloc in AOT.

The codegen block order keeps loop blocks together, and reflecting that
order in live ranges lets the register allocator make better decisions.

Consider for example the loop:

    v = def();
    while (cond) {
      if (smth) { // (*)
        use(v);
        return true;
      }
    }

If block (*) is interspersed with the loop blocks, the register allocator
might decide to allocate `v` to a register in some of the loop blocks. If
the same block is "sunk" away from the loop blocks, the allocator can
clearly see that `v` does not have to stay in a register for the whole
loop.
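
To make the effect concrete, here is a minimal standalone sketch (toy
types only; `Blk` and `Report` are invented for this illustration and are
not VM data structures). It checks whether the only use of `v` falls
inside or outside the loop's linear range under the two block orders;
"outside" is what lets the allocator spill `v` across the loop:

    #include <cstdio>
    #include <vector>

    // Toy model: a value that is merely live across a loop, with no use
    // positions inside the loop's linear range, does not have to occupy a
    // register for the whole loop.
    struct Blk {
      const char* name;
      bool has_use_of_v;
      bool in_loop;
    };

    static void Report(const char* label, const std::vector<Blk>& order) {
      int loop_start = -1, loop_end = -1, use_pos = -1;
      for (int i = 0; i < static_cast<int>(order.size()); ++i) {
        if (order[i].in_loop) {
          if (loop_start < 0) loop_start = i;
          loop_end = i;
        }
        if (order[i].has_use_of_v) use_pos = i;
      }
      const bool use_inside_loop =
          loop_start <= use_pos && use_pos <= loop_end;
      std::printf("%s: use of v is %s the loop's linear range\n", label,
                  use_inside_loop ? "inside" : "outside");
    }

    int main() {
      // Blocks from the example above: entry defines v, header/latch form
      // the loop, (*) uses v and returns.
      Blk entry{"entry", /*has_use_of_v=*/false, /*in_loop=*/false};
      Blk header{"header", false, true};
      Blk star{"(*)", true, false};
      Blk latch{"latch", false, true};

      // Reverse postorder intersperses (*) with the loop blocks.
      Report("reverse postorder", {entry, header, star, latch});
      // Codegen order sinks (*) past the loop blocks.
      Report("codegen order", {entry, header, latch, star});
      return 0;
    }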

In JIT the codegen block order is not topologically sorted, and as such
it is unsuitable for our linear scan allocator, which expects definitions
to be numbered before their uses.
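
For reference, a small self-contained sketch (toy integer graph, not the
VM's `BlockEntryInstr` representation) of the property linear scan needs
from a block order: every block must appear after all of its forward
predecessors, so that definitions are numbered before their uses:

    #include <cstdio>
    #include <vector>

    // An order is usable for linear scan only if every block appears after
    // all of its forward (non-back-edge) predecessors; back edges are the
    // one place where a predecessor may legitimately come later.
    static bool IsTopologicallySorted(
        const std::vector<int>& order,
        const std::vector<std::vector<int>>& forward_preds) {
      std::vector<int> position(forward_preds.size(), -1);
      for (int i = 0; i < static_cast<int>(order.size()); ++i) {
        position[order[i]] = i;
      }
      for (int block : order) {
        for (int pred : forward_preds[block]) {
          if (position[pred] > position[block]) return false;
        }
      }
      return true;
    }

    int main() {
      // Forward edges: 0 -> 1, 1 -> 2, 1 -> 3 (a back edge 2 -> 1 is ignored).
      std::vector<std::vector<int>> forward_preds = {{}, {0}, {1}, {1}};
      std::printf("0 1 2 3 sorted: %d\n",
                  IsTopologicallySorted({0, 1, 2, 3}, forward_preds));
      std::printf("0 2 1 3 sorted: %d\n",
                  IsTopologicallySorted({0, 2, 1, 3}, forward_preds));
      return 0;
    }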

TEST=ci

Cq-Include-Trybots: luci.dart.try:vm-aot-linux-debug-x64-try,vm-aot-mac-release-arm64-try,vm-aot-linux-release-x64-try
Change-Id: I0726815db998b559267949e157cd2158f5dd55f7
Reviewed-on: https://dart-review.googlesource.com/c/sdk/+/358448
Commit-Queue: Slava Egorov <vegorov@google.com>
Reviewed-by: Alexander Markov <alexmarkov@google.com>

@ -225,8 +225,10 @@ void BlockScheduler::ReorderBlocksJIT(FlowGraph* flow_graph) {
flow_graph->CodegenBlockOrder()->Add(checked_entry);
}
// Build a new block order. Emit each chain when its first block occurs
// in the original reverse postorder ordering (which gives a topological
// sort of the blocks).
// in the original reverse postorder ordering.
// Note: the resulting order is not topologically sorted and can't be
// used as a replacement for reverse_postorder in algorithms that expect
// topological sort.
for (intptr_t i = block_count - 1; i >= 0; --i) {
if (chains[i]->first->block == flow_graph->postorder()[i]) {
for (Link* link = chains[i]->first; link != nullptr; link = link->next) {

@ -210,6 +210,8 @@ class FlowGraph : public ZoneAllocated {
const GrowableArray<BlockEntryInstr*>& optimized_block_order() const {
return optimized_block_order_;
}
// In AOT these are guaranteed to be topologically sorted, but not in JIT.
GrowableArray<BlockEntryInstr*>* CodegenBlockOrder();
const GrowableArray<BlockEntryInstr*>* CodegenBlockOrder() const;

@ -77,12 +77,20 @@ static ExtraLoopInfo* ComputeExtraLoopInfo(Zone* zone, LoopInfo* loop_info) {
return new (zone) ExtraLoopInfo(start, end);
}
static const GrowableArray<BlockEntryInstr*>& BlockOrderForAllocation(
const FlowGraph& flow_graph) {
// Currently CodegenBlockOrder is not topologically sorted in JIT and can't
// be used for register allocation.
return CompilerState::Current().is_aot() ? *flow_graph.CodegenBlockOrder()
: flow_graph.reverse_postorder();
}
FlowGraphAllocator::FlowGraphAllocator(const FlowGraph& flow_graph,
bool intrinsic_mode)
: flow_graph_(flow_graph),
reaching_defs_(flow_graph),
value_representations_(flow_graph.max_vreg()),
block_order_(flow_graph.reverse_postorder()),
block_order_(BlockOrderForAllocation(flow_graph)),
postorder_(flow_graph.postorder()),
instructions_(),
block_entries_(),
@ -582,19 +590,21 @@ static bool HasOnlyUnconstrainedUses(LiveRange* range) {
}
void FlowGraphAllocator::BuildLiveRanges() {
const intptr_t block_count = postorder_.length();
ASSERT(postorder_.Last()->IsGraphEntry());
const intptr_t block_count = block_order_.length();
ASSERT(block_order_[0]->IsGraphEntry());
BitVector* current_interference_set = nullptr;
Zone* zone = flow_graph_.zone();
for (intptr_t i = 0; i < (block_count - 1); i++) {
BlockEntryInstr* block = postorder_[i];
for (intptr_t x = block_count - 1; x > 0; --x) {
BlockEntryInstr* block = block_order_[x];
ASSERT(BlockEntryAt(block->start_pos()) == block);
// For every SSA value that is live out of this block, create an interval
// that covers the whole block. It will be shortened if we encounter a
// definition of this value in this block.
for (BitVector::Iterator it(liveness_.GetLiveOutSetAt(i)); !it.Done();
it.Advance()) {
for (BitVector::Iterator it(
liveness_.GetLiveOutSetAt(block->postorder_number()));
!it.Done(); it.Advance()) {
LiveRange* range = GetLiveRange(it.Current());
range->AddUseInterval(block->start_pos(), block->end_pos());
}
@ -637,8 +647,9 @@ void FlowGraphAllocator::BuildLiveRanges() {
if (block->IsLoopHeader()) {
ASSERT(loop_info != nullptr);
current_interference_set = nullptr;
for (BitVector::Iterator it(liveness_.GetLiveInSetAt(i)); !it.Done();
it.Advance()) {
for (BitVector::Iterator it(
liveness_.GetLiveInSetAt(block->postorder_number()));
!it.Done(); it.Advance()) {
LiveRange* range = GetLiveRange(it.Current());
intptr_t loop_end = extra_loop_info_[loop_info->id()]->end;
if (HasOnlyUnconstrainedUsesInLoop(range, loop_end)) {
@ -1681,11 +1692,7 @@ static ParallelMoveInstr* CreateParallelMoveAfter(Instruction* instr,
void FlowGraphAllocator::NumberInstructions() {
intptr_t pos = 0;
// The basic block order is reverse postorder.
const intptr_t block_count = postorder_.length();
for (intptr_t i = block_count - 1; i >= 0; i--) {
BlockEntryInstr* block = postorder_[i];
for (auto block : block_order_) {
instructions_.Add(block);
block_entries_.Add(block);
block->set_start_pos(pos);
@ -1706,9 +1713,7 @@ void FlowGraphAllocator::NumberInstructions() {
// Create parallel moves in join predecessors. This must be done after
// all instructions are numbered.
for (intptr_t i = block_count - 1; i >= 0; i--) {
BlockEntryInstr* block = postorder_[i];
for (auto block : block_order_) {
// For join entry predecessors create phi resolution moves if
// necessary. They will be populated by the register allocator.
JoinEntryInstr* join = block->AsJoinEntry();
@ -3180,10 +3185,7 @@ void FlowGraphAllocator::CollectRepresentations() {
}
}
for (BlockIterator it = flow_graph_.reverse_postorder_iterator(); !it.Done();
it.Advance()) {
BlockEntryInstr* block = it.Current();
for (auto block : block_order_) {
if (auto entry = block->AsBlockEntryWithInitialDefs()) {
initial_definitions = entry->initial_definitions();
for (intptr_t i = 0; i < initial_definitions->length(); ++i) {
@ -3209,9 +3211,8 @@ void FlowGraphAllocator::CollectRepresentations() {
}
// Normal instructions.
for (ForwardInstructionIterator instr_it(block); !instr_it.Done();
instr_it.Advance()) {
Definition* def = instr_it.Current()->AsDefinition();
for (auto instr : block->instructions()) {
Definition* def = instr->AsDefinition();
if ((def != nullptr) && (def->vreg(0) >= 0)) {
const intptr_t vreg = def->vreg(0);
value_representations_[vreg] =
@ -3257,7 +3258,7 @@ void FlowGraphAllocator::RemoveFrameIfNotNeeded() {
#if defined(TARGET_ARCH_ARM64) || defined(TARGET_ARCH_ARM)
bool has_write_barrier_call = false;
#endif
for (auto block : flow_graph_.reverse_postorder()) {
for (auto block : block_order_) {
for (auto instruction : block->instructions()) {
if (instruction->HasLocs() && instruction->locs()->can_call()) {
// Function contains a call and thus needs a frame.
@ -3359,7 +3360,7 @@ void FlowGraphAllocator::AllocateOutgoingArguments() {
const intptr_t total_spill_slot_count =
flow_graph_.graph_entry()->spill_slot_count();
for (auto block : flow_graph_.reverse_postorder()) {
for (auto block : block_order_) {
for (auto instr : block->instructions()) {
if (auto move_arg = instr->AsMoveArgument()) {
// Register calling conventions are not used in JIT.
@ -3383,7 +3384,7 @@ void FlowGraphAllocator::AllocateOutgoingArguments() {
void FlowGraphAllocator::ScheduleParallelMoves() {
ParallelMoveResolver resolver;
for (auto block : flow_graph_.reverse_postorder()) {
for (auto block : block_order_) {
if (block->HasParallelMove()) {
resolver.Resolve(block->parallel_move());
}

@ -330,8 +330,9 @@ FlowGraph* CompilerPass::RunForceOptimizedPipeline(
INVOKE_PASS_AOT(DelayAllocations);
INVOKE_PASS(EliminateWriteBarriers);
INVOKE_PASS(FinalizeGraph);
INVOKE_PASS(AllocateRegisters);
INVOKE_PASS(ReorderBlocks);
INVOKE_PASS(AllocateRegisters);
INVOKE_PASS(TestILSerialization); // Must be last.
return pass_state->flow_graph();
}
@ -398,8 +399,9 @@ FlowGraph* CompilerPass::RunPipeline(PipelineMode mode,
INVOKE_PASS(EliminateWriteBarriers);
INVOKE_PASS(FinalizeGraph);
INVOKE_PASS(Canonicalize);
INVOKE_PASS(AllocateRegisters);
INVOKE_PASS(ReorderBlocks);
INVOKE_PASS(AllocateRegisters);
INVOKE_PASS(TestILSerialization); // Must be last.
return pass_state->flow_graph();
}
@ -571,22 +573,6 @@ COMPILER_PASS(AllocateRegistersForGraphIntrinsic, {
COMPILER_PASS(ReorderBlocks, {
BlockScheduler::ReorderBlocks(flow_graph);
// This is the last compiler pass.
// Test that round-trip IL serialization works before generating code.
if (FLAG_test_il_serialization && CompilerState::Current().is_aot()) {
Zone* zone = flow_graph->zone();
auto* detached_defs = new (zone) ZoneGrowableArray<Definition*>(zone, 0);
flow_graph->CompactSSA(detached_defs);
ZoneWriteStream write_stream(flow_graph->zone(), 1024);
FlowGraphSerializer serializer(&write_stream);
serializer.WriteFlowGraph(*flow_graph, *detached_defs);
ReadStream read_stream(write_stream.buffer(), write_stream.bytes_written());
FlowGraphDeserializer deserializer(flow_graph->parsed_function(),
&read_stream);
state->set_flow_graph(deserializer.ReadFlowGraph());
}
});
COMPILER_PASS(EliminateWriteBarriers, { EliminateWriteBarriers(flow_graph); });
@ -606,6 +592,24 @@ COMPILER_PASS(FinalizeGraph, {
flow_graph->RemoveRedefinitions();
});
COMPILER_PASS(TestILSerialization, {
// This is the last compiler pass.
// Test that round-trip IL serialization works before generating code.
if (FLAG_test_il_serialization && CompilerState::Current().is_aot()) {
Zone* zone = flow_graph->zone();
auto* detached_defs = new (zone) ZoneGrowableArray<Definition*>(zone, 0);
flow_graph->CompactSSA(detached_defs);
ZoneWriteStream write_stream(flow_graph->zone(), 1024);
FlowGraphSerializer serializer(&write_stream);
serializer.WriteFlowGraph(*flow_graph, *detached_defs);
ReadStream read_stream(write_stream.buffer(), write_stream.bytes_written());
FlowGraphDeserializer deserializer(flow_graph->parsed_function(),
&read_stream);
state->set_flow_graph(deserializer.ReadFlowGraph());
}
});
COMPILER_PASS(GenerateCode, { state->graph_compiler->CompileGraph(); });
} // namespace dart

@ -54,6 +54,7 @@ namespace dart {
V(UseTableDispatch) \
V(WidenSmiToInt32) \
V(EliminateWriteBarriers) \
V(TestILSerialization) \
V(GenerateCode)
class AllocationSinking;