[vm/compiler] Split ParallelMove codegen into scheduling and emission

This CL does not change any behaviour; it simply moves
ParallelMoveResolver to a separate file. Additionally, instead of
immediately generating code, we now produce a move schedule, which is
attached to the ParallelMoveInstr and later converted to native code.

This refactoring prepares the code for subsequent improvements. For
example, we want to rework how the temporaries used by move resolution
are allocated: instead of pushing/popping them around every move that
needs them, we will allocate space for them in the spill area.

Having ParallelMove scheduling separated from code emission also
allows it to be unit tested.
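
In rough strokes (condensed from the diff below, with the VM plumbing
elided), the new flow is:

// Phase 1, during register allocation (flow_graph_allocator.cc):
// turn each ParallelMoveInstr's move graph into a linear schedule of
// move/swap operations and attach it to the instruction.
ParallelMoveResolver resolver;
resolver.Resolve(parallel_move);  // calls parallel_move->set_move_schedule(...)

// Phase 2, during code emission (il.cc): replay the precomputed
// schedule; no graph walking happens at emission time.
void ParallelMoveInstr::EmitNativeCode(FlowGraphCompiler* compiler) {
  ParallelMoveEmitter(compiler, this).EmitNativeCode();
}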

TEST=ci

Cq-Include-Trybots: luci.dart.try:vm-aot-linux-debug-x64c-try,vm-aot-linux-product-x64-try,vm-aot-android-release-arm64c-try,vm-aot-android-release-arm_x64-try,vm-aot-asan-linux-release-x64-try
Change-Id: Ifeb17940f4cfb3c0cc004cb3f74895f0d3c2b7bf
Reviewed-on: https://dart-review.googlesource.com/c/sdk/+/285840
Commit-Queue: Slava Egorov <vegorov@google.com>
Reviewed-by: Alexander Markov <alexmarkov@google.com>
Vyacheslav Egorov 2023-02-28 12:21:41 +00:00 committed by Commit Queue
parent 8f95f78524
commit 27230ae008
23 changed files with 769 additions and 602 deletions

View file

@ -166,7 +166,6 @@ FlowGraphCompiler::FlowGraphCompiler(
Class::ZoneHandle(isolate_group()->object_store()->int32x4_class())),
list_class_(Class::ZoneHandle(Library::Handle(Library::CoreLibrary())
.LookupClass(Symbols::List()))),
parallel_move_resolver_(this),
pending_deoptimization_env_(NULL),
deopt_id_to_ic_data_(deopt_id_to_ic_data),
edge_counters_array_(Array::ZoneHandle()) {
@ -737,25 +736,22 @@ void FlowGraphCompiler::VisitBlocks() {
}
EmitComment(instr);
}
if (instr->IsParallelMove()) {
parallel_move_resolver_.EmitNativeCode(instr->AsParallelMove());
BeginCodeSourceRange(instr->source());
EmitInstructionPrologue(instr);
ASSERT(pending_deoptimization_env_ == NULL);
pending_deoptimization_env_ = instr->env();
DEBUG_ONLY(current_instruction_ = instr);
instr->EmitNativeCode(this);
DEBUG_ONLY(current_instruction_ = nullptr);
pending_deoptimization_env_ = NULL;
if (IsPeephole(instr)) {
ASSERT(top_of_stack_ == nullptr);
top_of_stack_ = instr->AsDefinition();
} else {
BeginCodeSourceRange(instr->source());
EmitInstructionPrologue(instr);
ASSERT(pending_deoptimization_env_ == NULL);
pending_deoptimization_env_ = instr->env();
DEBUG_ONLY(current_instruction_ = instr);
instr->EmitNativeCode(this);
DEBUG_ONLY(current_instruction_ = nullptr);
pending_deoptimization_env_ = NULL;
if (IsPeephole(instr)) {
ASSERT(top_of_stack_ == nullptr);
top_of_stack_ = instr->AsDefinition();
} else {
EmitInstructionEpilogue(instr);
}
EndCodeSourceRange(instr->source());
EmitInstructionEpilogue(instr);
}
EndCodeSourceRange(instr->source());
#if defined(DEBUG)
if (!is_optimizing()) {
@ -1854,270 +1850,6 @@ void FlowGraphCompiler::AllocateRegistersLocally(Instruction* instr) {
}
}
static uword RegMaskBit(Register reg) {
return ((reg) != kNoRegister) ? (1 << (reg)) : 0;
}
ParallelMoveResolver::ParallelMoveResolver(FlowGraphCompiler* compiler)
: compiler_(compiler), moves_(32) {}
void ParallelMoveResolver::EmitNativeCode(ParallelMoveInstr* parallel_move) {
ASSERT(moves_.is_empty());
// Build up a worklist of moves.
BuildInitialMoveList(parallel_move);
const InstructionSource& move_source = InstructionSource(
TokenPosition::kParallelMove, parallel_move->inlining_id());
for (int i = 0; i < moves_.length(); ++i) {
const MoveOperands& move = *moves_[i];
// Skip constants to perform them last. They don't block other moves
// and skipping such moves with register destinations keeps those
// registers free for the whole algorithm.
if (!move.IsEliminated() && !move.src().IsConstant()) {
PerformMove(move_source, i);
}
}
// Perform the moves with constant sources.
for (int i = 0; i < moves_.length(); ++i) {
const MoveOperands& move = *moves_[i];
if (!move.IsEliminated()) {
ASSERT(move.src().IsConstant());
compiler_->BeginCodeSourceRange(move_source);
EmitMove(i);
compiler_->EndCodeSourceRange(move_source);
}
}
moves_.Clear();
}
void ParallelMoveResolver::BuildInitialMoveList(
ParallelMoveInstr* parallel_move) {
// Perform a linear sweep of the moves to add them to the initial list of
// moves to perform, ignoring any move that is redundant (the source is
// the same as the destination, the destination is ignored and
// unallocated, or the move was already eliminated).
for (int i = 0; i < parallel_move->NumMoves(); i++) {
MoveOperands* move = parallel_move->MoveOperandsAt(i);
if (!move->IsRedundant()) moves_.Add(move);
}
}
void ParallelMoveResolver::PerformMove(const InstructionSource& source,
int index) {
// Each call to this function performs a move and deletes it from the move
// graph. We first recursively perform any move blocking this one. We
// mark a move as "pending" on entry to PerformMove in order to detect
// cycles in the move graph. We use operand swaps to resolve cycles,
// which means that a call to PerformMove could change any source operand
// in the move graph.
ASSERT(!moves_[index]->IsPending());
ASSERT(!moves_[index]->IsRedundant());
// Clear this move's destination to indicate a pending move. The actual
// destination is saved in a stack-allocated local. Recursion may allow
// multiple moves to be pending.
ASSERT(!moves_[index]->src().IsInvalid());
Location destination = moves_[index]->MarkPending();
// Perform a depth-first traversal of the move graph to resolve
// dependencies. Any unperformed, unpending move with a source the same
// as this one's destination blocks this one so recursively perform all
// such moves.
for (int i = 0; i < moves_.length(); ++i) {
const MoveOperands& other_move = *moves_[i];
if (other_move.Blocks(destination) && !other_move.IsPending()) {
// Though PerformMove can change any source operand in the move graph,
// this call cannot create a blocking move via a swap (this loop does
// not miss any). Assume there is a non-blocking move with source A
// and this move is blocked on source B and there is a swap of A and
// B. Then A and B must be involved in the same cycle (or they would
// not be swapped). Since this move's destination is B and there is
// only a single incoming edge to an operand, this move must also be
// involved in the same cycle. In that case, the blocking move will
// be created but will be "pending" when we return from PerformMove.
PerformMove(source, i);
}
}
// We are about to resolve this move and don't need it marked as
// pending, so restore its destination.
moves_[index]->ClearPending(destination);
// This move's source may have changed due to swaps to resolve cycles and
// so it may now be the last move in the cycle. If so remove it.
if (moves_[index]->src().Equals(destination)) {
moves_[index]->Eliminate();
return;
}
// The move may be blocked on a (at most one) pending move, in which case
// we have a cycle. Search for such a blocking move and perform a swap to
// resolve it.
for (int i = 0; i < moves_.length(); ++i) {
const MoveOperands& other_move = *moves_[i];
if (other_move.Blocks(destination)) {
ASSERT(other_move.IsPending());
compiler_->BeginCodeSourceRange(source);
EmitSwap(index);
compiler_->EndCodeSourceRange(source);
return;
}
}
// This move is not blocked.
compiler_->BeginCodeSourceRange(source);
EmitMove(index);
compiler_->EndCodeSourceRange(source);
}
void ParallelMoveResolver::EmitMove(int index) {
MoveOperands* const move = moves_[index];
const Location dst = move->dest();
if (dst.IsStackSlot() || dst.IsDoubleStackSlot()) {
ASSERT((dst.base_reg() != FPREG) ||
((-compiler::target::frame_layout.VariableIndexForFrameSlot(
dst.stack_index())) < compiler_->StackSize()));
}
const Location src = move->src();
ParallelMoveResolver::TemporaryAllocator temp(this, /*blocked=*/kNoRegister);
compiler_->EmitMove(dst, src, &temp);
#if defined(DEBUG)
// Allocating a scratch register here may cause stack spilling. Neither the
// source nor destination register should be SP-relative in that case.
for (const Location& loc : {dst, src}) {
ASSERT(!temp.DidAllocateTemporary() || !loc.HasStackIndex() ||
loc.base_reg() != SPREG);
}
#endif
move->Eliminate();
}
bool ParallelMoveResolver::IsScratchLocation(Location loc) {
for (int i = 0; i < moves_.length(); ++i) {
if (moves_[i]->Blocks(loc)) {
return false;
}
}
for (int i = 0; i < moves_.length(); ++i) {
if (moves_[i]->dest().Equals(loc)) {
return true;
}
}
return false;
}
intptr_t ParallelMoveResolver::AllocateScratchRegister(
Location::Kind kind,
uword blocked_mask,
intptr_t first_free_register,
intptr_t last_free_register,
bool* spilled) {
COMPILE_ASSERT(static_cast<intptr_t>(sizeof(blocked_mask)) * kBitsPerByte >=
kNumberOfFpuRegisters);
COMPILE_ASSERT(static_cast<intptr_t>(sizeof(blocked_mask)) * kBitsPerByte >=
kNumberOfCpuRegisters);
intptr_t scratch = -1;
for (intptr_t reg = first_free_register; reg <= last_free_register; reg++) {
if ((((1 << reg) & blocked_mask) == 0) &&
IsScratchLocation(Location::MachineRegisterLocation(kind, reg))) {
scratch = reg;
break;
}
}
if (scratch == -1) {
*spilled = true;
for (intptr_t reg = first_free_register; reg <= last_free_register; reg++) {
if (((1 << reg) & blocked_mask) == 0) {
scratch = reg;
break;
}
}
} else {
*spilled = false;
}
return scratch;
}
ParallelMoveResolver::ScratchFpuRegisterScope::ScratchFpuRegisterScope(
ParallelMoveResolver* resolver,
FpuRegister blocked)
: resolver_(resolver), reg_(kNoFpuRegister), spilled_(false) {
COMPILE_ASSERT(FpuTMP != kNoFpuRegister);
uword blocked_mask =
((blocked != kNoFpuRegister) ? 1 << blocked : 0) | 1 << FpuTMP;
reg_ = static_cast<FpuRegister>(resolver_->AllocateScratchRegister(
Location::kFpuRegister, blocked_mask, 0, kNumberOfFpuRegisters - 1,
&spilled_));
if (spilled_) {
resolver->SpillFpuScratch(reg_);
}
}
ParallelMoveResolver::ScratchFpuRegisterScope::~ScratchFpuRegisterScope() {
if (spilled_) {
resolver_->RestoreFpuScratch(reg_);
}
}
ParallelMoveResolver::TemporaryAllocator::TemporaryAllocator(
ParallelMoveResolver* resolver,
Register blocked)
: resolver_(resolver),
blocked_(blocked),
reg_(kNoRegister),
spilled_(false) {}
Register ParallelMoveResolver::TemporaryAllocator::AllocateTemporary() {
ASSERT(reg_ == kNoRegister);
uword blocked_mask = RegMaskBit(blocked_) | kReservedCpuRegisters;
if (resolver_->compiler_->intrinsic_mode()) {
// Block additional registers that must be preserved for intrinsics.
blocked_mask |= RegMaskBit(ARGS_DESC_REG);
#if !defined(TARGET_ARCH_IA32)
// Need to preserve CODE_REG to be able to store the PC marker
// and load the pool pointer.
blocked_mask |= RegMaskBit(CODE_REG);
#endif
}
reg_ = static_cast<Register>(
resolver_->AllocateScratchRegister(Location::kRegister, blocked_mask, 0,
kNumberOfCpuRegisters - 1, &spilled_));
if (spilled_) {
resolver_->SpillScratch(reg_);
}
DEBUG_ONLY(allocated_ = true;)
return reg_;
}
void ParallelMoveResolver::TemporaryAllocator::ReleaseTemporary() {
if (spilled_) {
resolver_->RestoreScratch(reg_);
}
reg_ = kNoRegister;
}
ParallelMoveResolver::ScratchRegisterScope::ScratchRegisterScope(
ParallelMoveResolver* resolver,
Register blocked)
: allocator_(resolver, blocked) {
reg_ = allocator_.AllocateTemporary();
}
ParallelMoveResolver::ScratchRegisterScope::~ScratchRegisterScope() {
allocator_.ReleaseTemporary();
}
const ICData* FlowGraphCompiler::GetOrAddInstanceCallICData(
intptr_t deopt_id,

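The PerformMove routine deleted above (and reintroduced unchanged in the
new parallel_move_resolver.cc later in this diff) is the standard
depth-first parallel-move algorithm: recursively perform the moves that
block the current one, and when the recursion runs back into a pending
move a cycle has been found, which is broken with a swap. A minimal
standalone sketch of the same idea on plain integer registers (toy code,
not VM code; constant-source deferral and scratch-register handling are
omitted):

#include <cstddef>
#include <cstdio>
#include <vector>

struct Move { int src; int dst; bool done; bool pending; };
struct Op { enum Kind { kMove, kSwap } kind; int src; int dst; };

static void Perform(std::vector<Move>& moves, std::size_t index,
                    std::vector<Op>& schedule) {
  Move& move = moves[index];
  move.pending = true;
  // First perform every unperformed, non-pending move that reads this
  // move's destination, so its value is not clobbered while still needed.
  for (std::size_t i = 0; i < moves.size(); ++i) {
    if (!moves[i].done && !moves[i].pending && moves[i].src == move.dst) {
      Perform(moves, i, schedule);
    }
  }
  move.pending = false;
  // Swaps performed while recursing may have made this move redundant.
  if (move.src == move.dst) {
    move.done = true;
    return;
  }
  // If a pending move still reads our destination we have closed a cycle:
  // break it with a swap and patch the sources of the remaining moves.
  for (const Move& other : moves) {
    if (!other.done && other.pending && other.src == move.dst) {
      schedule.push_back({Op::kSwap, move.src, move.dst});
      move.done = true;
      for (Move& m : moves) {
        if (m.done) continue;
        if (m.src == move.src) {
          m.src = move.dst;
        } else if (m.src == move.dst) {
          m.src = move.src;
        }
      }
      return;
    }
  }
  // Not blocked by anything: emit a plain move.
  schedule.push_back({Op::kMove, move.src, move.dst});
  move.done = true;
}

int main() {
  // r0 -> r1 and r1 -> r0 form a cycle; r2 -> r3 is independent.
  std::vector<Move> moves = {
      {0, 1, false, false}, {1, 0, false, false}, {2, 3, false, false}};
  std::vector<Op> schedule;
  for (std::size_t i = 0; i < moves.size(); ++i) {
    if (!moves[i].done) Perform(moves, i, schedule);
  }
  for (const Op& op : schedule) {
    if (op.kind == Op::kSwap) {
      std::printf("swap r%d, r%d\n", op.src, op.dst);
    } else {
      std::printf("move r%d -> r%d\n", op.src, op.dst);
    }
  }
  // Prints "swap r1, r0" followed by "move r2 -> r3".
  return 0;
}
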
View file

@ -63,107 +63,6 @@ class NoTemporaryAllocator : public TemporaryRegisterAllocator {
void ReleaseTemporary() override { UNREACHABLE(); }
};
class ParallelMoveResolver : public ValueObject {
public:
explicit ParallelMoveResolver(FlowGraphCompiler* compiler);
// Resolve a set of parallel moves, emitting assembler instructions.
void EmitNativeCode(ParallelMoveInstr* parallel_move);
private:
class ScratchFpuRegisterScope : public ValueObject {
public:
ScratchFpuRegisterScope(ParallelMoveResolver* resolver,
FpuRegister blocked);
~ScratchFpuRegisterScope();
FpuRegister reg() const { return reg_; }
private:
ParallelMoveResolver* resolver_;
FpuRegister reg_;
bool spilled_;
};
class TemporaryAllocator : public TemporaryRegisterAllocator {
public:
TemporaryAllocator(ParallelMoveResolver* resolver, Register blocked);
Register AllocateTemporary() override;
void ReleaseTemporary() override;
DEBUG_ONLY(bool DidAllocateTemporary() { return allocated_; })
virtual ~TemporaryAllocator() { ASSERT(reg_ == kNoRegister); }
private:
ParallelMoveResolver* const resolver_;
const Register blocked_;
Register reg_;
bool spilled_;
DEBUG_ONLY(bool allocated_ = false);
};
class ScratchRegisterScope : public ValueObject {
public:
ScratchRegisterScope(ParallelMoveResolver* resolver, Register blocked);
~ScratchRegisterScope();
Register reg() const { return reg_; }
private:
TemporaryAllocator allocator_;
Register reg_;
};
bool IsScratchLocation(Location loc);
intptr_t AllocateScratchRegister(Location::Kind kind,
uword blocked_mask,
intptr_t first_free_register,
intptr_t last_free_register,
bool* spilled);
void SpillScratch(Register reg);
void RestoreScratch(Register reg);
void SpillFpuScratch(FpuRegister reg);
void RestoreFpuScratch(FpuRegister reg);
// friend class ScratchXmmRegisterScope;
// Build the initial list of moves.
void BuildInitialMoveList(ParallelMoveInstr* parallel_move);
// Perform the move at the moves_ index in question (possibly requiring
// other moves to satisfy dependencies).
void PerformMove(const InstructionSource& source, int index);
// Emit a move and remove it from the move graph.
void EmitMove(int index);
// Execute a move by emitting a swap of two operands. The move from
// source to destination is removed from the move graph.
void EmitSwap(int index);
// Verify the move list before performing moves.
void Verify();
// Helpers for non-trivial source-destination combinations that cannot
// be handled by a single instruction.
void MoveMemoryToMemory(const compiler::Address& dst,
const compiler::Address& src);
void Exchange(Register reg, const compiler::Address& mem);
void Exchange(const compiler::Address& mem1, const compiler::Address& mem2);
void Exchange(Register reg, Register base_reg, intptr_t stack_offset);
void Exchange(Register base_reg1,
intptr_t stack_offset1,
Register base_reg2,
intptr_t stack_offset2);
FlowGraphCompiler* compiler_;
// List of moves not yet resolved.
GrowableArray<MoveOperands*> moves_;
};
// Used for describing a deoptimization point after call (lazy deoptimization).
// For deoptimization before instruction use class CompilerDeoptInfoWithStub.
class CompilerDeoptInfo : public ZoneAllocated {
@ -545,9 +444,6 @@ class FlowGraphCompiler : public ValueObject {
bool ForceSlowPathForStackOverflow() const;
const GrowableArray<BlockInfo*>& block_info() const { return block_info_; }
ParallelMoveResolver* parallel_move_resolver() {
return &parallel_move_resolver_;
}
void StatsBegin(Instruction* instr) {
if (stats_ != NULL) stats_->Begin(instr);
@ -1290,8 +1186,6 @@ class FlowGraphCompiler : public ValueObject {
const Class& int32x4_class_;
const Class& list_class_;
ParallelMoveResolver parallel_move_resolver_;
// Currently instructions generate deopt stubs internally by
// calling AddDeoptStub. To communicate deoptimization environment
// that should be used when deoptimizing we store it in this variable.

View file

@ -10,6 +10,7 @@
#include "vm/compiler/api/type_check_mode.h"
#include "vm/compiler/backend/il_printer.h"
#include "vm/compiler/backend/locations.h"
#include "vm/compiler/backend/parallel_move_resolver.h"
#include "vm/compiler/jit/compiler.h"
#include "vm/cpu.h"
#include "vm/dart_entry.h"
@ -1101,10 +1102,9 @@ void FlowGraphCompiler::LoadBSSEntry(BSS::Relocation relocation,
#undef __
#define __ compiler_->assembler()->
void ParallelMoveResolver::EmitSwap(int index) {
MoveOperands* move = moves_[index];
const Location source = move->src();
const Location destination = move->dest();
void ParallelMoveEmitter::EmitSwap(const MoveOperands& move) {
const Location source = move.src();
const Location destination = move.dest();
if (source.IsRegister() && destination.IsRegister()) {
ASSERT(source.reg() != IP);
@ -1183,56 +1183,39 @@ void ParallelMoveResolver::EmitSwap(int index) {
} else {
UNREACHABLE();
}
// The swap of source and destination has executed a move from source to
// destination.
move->Eliminate();
// Any unperformed (including pending) move with a source of either
// this move's source or destination needs to have their source
// changed to reflect the state of affairs after the swap.
for (int i = 0; i < moves_.length(); ++i) {
const MoveOperands& other_move = *moves_[i];
if (other_move.Blocks(source)) {
moves_[i]->set_src(destination);
} else if (other_move.Blocks(destination)) {
moves_[i]->set_src(source);
}
}
}
void ParallelMoveResolver::MoveMemoryToMemory(const compiler::Address& dst,
const compiler::Address& src) {
void ParallelMoveEmitter::MoveMemoryToMemory(const compiler::Address& dst,
const compiler::Address& src) {
UNREACHABLE();
}
// Do not call or implement this function. Instead, use the form below that
// uses an offset from the frame pointer instead of an Address.
void ParallelMoveResolver::Exchange(Register reg,
const compiler::Address& mem) {
void ParallelMoveEmitter::Exchange(Register reg, const compiler::Address& mem) {
UNREACHABLE();
}
// Do not call or implement this function. Instead, use the form below that
// uses offsets from the frame pointer instead of Addresses.
void ParallelMoveResolver::Exchange(const compiler::Address& mem1,
const compiler::Address& mem2) {
void ParallelMoveEmitter::Exchange(const compiler::Address& mem1,
const compiler::Address& mem2) {
UNREACHABLE();
}
void ParallelMoveResolver::Exchange(Register reg,
Register base_reg,
intptr_t stack_offset) {
void ParallelMoveEmitter::Exchange(Register reg,
Register base_reg,
intptr_t stack_offset) {
ScratchRegisterScope tmp(this, reg);
__ mov(tmp.reg(), compiler::Operand(reg));
__ LoadFromOffset(reg, base_reg, stack_offset);
__ StoreToOffset(tmp.reg(), base_reg, stack_offset);
}
void ParallelMoveResolver::Exchange(Register base_reg1,
intptr_t stack_offset1,
Register base_reg2,
intptr_t stack_offset2) {
void ParallelMoveEmitter::Exchange(Register base_reg1,
intptr_t stack_offset1,
Register base_reg2,
intptr_t stack_offset2) {
ScratchRegisterScope tmp1(this, kNoRegister);
ScratchRegisterScope tmp2(this, tmp1.reg());
__ LoadFromOffset(tmp1.reg(), base_reg1, stack_offset1);
@ -1241,19 +1224,19 @@ void ParallelMoveResolver::Exchange(Register base_reg1,
__ StoreToOffset(tmp2.reg(), base_reg1, stack_offset1);
}
void ParallelMoveResolver::SpillScratch(Register reg) {
void ParallelMoveEmitter::SpillScratch(Register reg) {
__ Push(reg);
}
void ParallelMoveResolver::RestoreScratch(Register reg) {
void ParallelMoveEmitter::RestoreScratch(Register reg) {
__ Pop(reg);
}
void ParallelMoveResolver::SpillFpuScratch(FpuRegister reg) {
void ParallelMoveEmitter::SpillFpuScratch(FpuRegister reg) {
__ PushQuad(reg);
}
void ParallelMoveResolver::RestoreFpuScratch(FpuRegister reg) {
void ParallelMoveEmitter::RestoreFpuScratch(FpuRegister reg) {
__ PopQuad(reg);
}

View file

@ -10,6 +10,7 @@
#include "vm/compiler/api/type_check_mode.h"
#include "vm/compiler/backend/il_printer.h"
#include "vm/compiler/backend/locations.h"
#include "vm/compiler/backend/parallel_move_resolver.h"
#include "vm/compiler/jit/compiler.h"
#include "vm/cpu.h"
#include "vm/dart_entry.h"
@ -1076,10 +1077,9 @@ void FlowGraphCompiler::LoadBSSEntry(BSS::Relocation relocation,
#undef __
#define __ compiler_->assembler()->
void ParallelMoveResolver::EmitSwap(int index) {
MoveOperands* move = moves_[index];
const Location source = move->src();
const Location destination = move->dest();
void ParallelMoveEmitter::EmitSwap(const MoveOperands& move) {
const Location source = move.src();
const Location destination = move.dest();
if (source.IsRegister() && destination.IsRegister()) {
ASSERT(source.reg() != TMP);
@ -1146,56 +1146,39 @@ void ParallelMoveResolver::EmitSwap(int index) {
} else {
UNREACHABLE();
}
// The swap of source and destination has executed a move from source to
// destination.
move->Eliminate();
// Any unperformed (including pending) move with a source of either
// this move's source or destination needs to have their source
// changed to reflect the state of affairs after the swap.
for (int i = 0; i < moves_.length(); ++i) {
const MoveOperands& other_move = *moves_[i];
if (other_move.Blocks(source)) {
moves_[i]->set_src(destination);
} else if (other_move.Blocks(destination)) {
moves_[i]->set_src(source);
}
}
}
void ParallelMoveResolver::MoveMemoryToMemory(const compiler::Address& dst,
const compiler::Address& src) {
void ParallelMoveEmitter::MoveMemoryToMemory(const compiler::Address& dst,
const compiler::Address& src) {
UNREACHABLE();
}
// Do not call or implement this function. Instead, use the form below that
// uses an offset from the frame pointer instead of an Address.
void ParallelMoveResolver::Exchange(Register reg,
const compiler::Address& mem) {
void ParallelMoveEmitter::Exchange(Register reg, const compiler::Address& mem) {
UNREACHABLE();
}
// Do not call or implement this function. Instead, use the form below that
// uses offsets from the frame pointer instead of Addresses.
void ParallelMoveResolver::Exchange(const compiler::Address& mem1,
const compiler::Address& mem2) {
void ParallelMoveEmitter::Exchange(const compiler::Address& mem1,
const compiler::Address& mem2) {
UNREACHABLE();
}
void ParallelMoveResolver::Exchange(Register reg,
Register base_reg,
intptr_t stack_offset) {
void ParallelMoveEmitter::Exchange(Register reg,
Register base_reg,
intptr_t stack_offset) {
ScratchRegisterScope tmp(this, reg);
__ mov(tmp.reg(), reg);
__ LoadFromOffset(reg, base_reg, stack_offset);
__ StoreToOffset(tmp.reg(), base_reg, stack_offset);
}
void ParallelMoveResolver::Exchange(Register base_reg1,
intptr_t stack_offset1,
Register base_reg2,
intptr_t stack_offset2) {
void ParallelMoveEmitter::Exchange(Register base_reg1,
intptr_t stack_offset1,
Register base_reg2,
intptr_t stack_offset2) {
ScratchRegisterScope tmp1(this, kNoRegister);
ScratchRegisterScope tmp2(this, tmp1.reg());
__ LoadFromOffset(tmp1.reg(), base_reg1, stack_offset1);
@ -1204,19 +1187,19 @@ void ParallelMoveResolver::Exchange(Register base_reg1,
__ StoreToOffset(tmp2.reg(), base_reg1, stack_offset1);
}
void ParallelMoveResolver::SpillScratch(Register reg) {
void ParallelMoveEmitter::SpillScratch(Register reg) {
__ Push(reg);
}
void ParallelMoveResolver::RestoreScratch(Register reg) {
void ParallelMoveEmitter::RestoreScratch(Register reg) {
__ Pop(reg);
}
void ParallelMoveResolver::SpillFpuScratch(FpuRegister reg) {
void ParallelMoveEmitter::SpillFpuScratch(FpuRegister reg) {
__ PushQuad(reg);
}
void ParallelMoveResolver::RestoreFpuScratch(FpuRegister reg) {
void ParallelMoveEmitter::RestoreFpuScratch(FpuRegister reg) {
__ PopQuad(reg);
}

View file

@ -11,6 +11,7 @@
#include "vm/compiler/api/type_check_mode.h"
#include "vm/compiler/backend/il_printer.h"
#include "vm/compiler/backend/locations.h"
#include "vm/compiler/backend/parallel_move_resolver.h"
#include "vm/compiler/frontend/flow_graph_builder.h"
#include "vm/compiler/jit/compiler.h"
#include "vm/cpu.h"
@ -1030,10 +1031,9 @@ void FlowGraphCompiler::EmitNativeMoveArchitecture(
#undef __
#define __ compiler_->assembler()->
void ParallelMoveResolver::EmitSwap(int index) {
MoveOperands* move = moves_[index];
const Location source = move->src();
const Location destination = move->dest();
void ParallelMoveEmitter::EmitSwap(const MoveOperands& move) {
const Location source = move.src();
const Location destination = move.dest();
if (source.IsRegister() && destination.IsRegister()) {
__ xchgl(destination.reg(), source.reg());
@ -1092,40 +1092,23 @@ void ParallelMoveResolver::EmitSwap(int index) {
} else {
UNREACHABLE();
}
// The swap of source and destination has executed a move from source to
// destination.
move->Eliminate();
// Any unperformed (including pending) move with a source of either
// this move's source or destination needs to have their source
// changed to reflect the state of affairs after the swap.
for (int i = 0; i < moves_.length(); ++i) {
const MoveOperands& other_move = *moves_[i];
if (other_move.Blocks(source)) {
moves_[i]->set_src(destination);
} else if (other_move.Blocks(destination)) {
moves_[i]->set_src(source);
}
}
}
void ParallelMoveResolver::MoveMemoryToMemory(const compiler::Address& dst,
const compiler::Address& src) {
void ParallelMoveEmitter::MoveMemoryToMemory(const compiler::Address& dst,
const compiler::Address& src) {
ScratchRegisterScope ensure_scratch(this, kNoRegister);
__ MoveMemoryToMemory(dst, src, ensure_scratch.reg());
}
void ParallelMoveResolver::Exchange(Register reg,
const compiler::Address& mem) {
void ParallelMoveEmitter::Exchange(Register reg, const compiler::Address& mem) {
ScratchRegisterScope ensure_scratch(this, reg);
__ movl(ensure_scratch.reg(), mem);
__ movl(mem, reg);
__ movl(reg, ensure_scratch.reg());
}
void ParallelMoveResolver::Exchange(const compiler::Address& mem1,
const compiler::Address& mem2) {
void ParallelMoveEmitter::Exchange(const compiler::Address& mem1,
const compiler::Address& mem2) {
ScratchRegisterScope ensure_scratch1(this, kNoRegister);
ScratchRegisterScope ensure_scratch2(this, ensure_scratch1.reg());
__ movl(ensure_scratch1.reg(), mem1);
@ -1134,33 +1117,33 @@ void ParallelMoveResolver::Exchange(const compiler::Address& mem1,
__ movl(mem1, ensure_scratch2.reg());
}
void ParallelMoveResolver::Exchange(Register reg,
Register base_reg,
intptr_t stack_offset) {
void ParallelMoveEmitter::Exchange(Register reg,
Register base_reg,
intptr_t stack_offset) {
UNREACHABLE();
}
void ParallelMoveResolver::Exchange(Register base_reg1,
intptr_t stack_offset1,
Register base_reg2,
intptr_t stack_offset2) {
void ParallelMoveEmitter::Exchange(Register base_reg1,
intptr_t stack_offset1,
Register base_reg2,
intptr_t stack_offset2) {
UNREACHABLE();
}
void ParallelMoveResolver::SpillScratch(Register reg) {
void ParallelMoveEmitter::SpillScratch(Register reg) {
__ pushl(reg);
}
void ParallelMoveResolver::RestoreScratch(Register reg) {
void ParallelMoveEmitter::RestoreScratch(Register reg) {
__ popl(reg);
}
void ParallelMoveResolver::SpillFpuScratch(FpuRegister reg) {
void ParallelMoveEmitter::SpillFpuScratch(FpuRegister reg) {
__ subl(ESP, compiler::Immediate(kFpuRegisterSize));
__ movups(compiler::Address(ESP, 0), reg);
}
void ParallelMoveResolver::RestoreFpuScratch(FpuRegister reg) {
void ParallelMoveEmitter::RestoreFpuScratch(FpuRegister reg) {
__ movups(reg, compiler::Address(ESP, 0));
__ addl(ESP, compiler::Immediate(kFpuRegisterSize));
}

View file

@ -10,6 +10,7 @@
#include "vm/compiler/api/type_check_mode.h"
#include "vm/compiler/backend/il_printer.h"
#include "vm/compiler/backend/locations.h"
#include "vm/compiler/backend/parallel_move_resolver.h"
#include "vm/compiler/jit/compiler.h"
#include "vm/cpu.h"
#include "vm/dart_entry.h"
@ -1080,10 +1081,9 @@ void FlowGraphCompiler::LoadBSSEntry(BSS::Relocation relocation,
#undef __
#define __ compiler_->assembler()->
void ParallelMoveResolver::EmitSwap(int index) {
MoveOperands* move = moves_[index];
const Location source = move->src();
const Location destination = move->dest();
void ParallelMoveEmitter::EmitSwap(const MoveOperands& move) {
const Location source = move.src();
const Location destination = move.dest();
if (source.IsRegister() && destination.IsRegister()) {
ASSERT(source.reg() != TMP);
@ -1122,55 +1122,38 @@ void ParallelMoveResolver::EmitSwap(int index) {
} else {
UNREACHABLE();
}
// The swap of source and destination has executed a move from source to
// destination.
move->Eliminate();
// Any unperformed (including pending) move with a source of either
// this move's source or destination needs to have their source
// changed to reflect the state of affairs after the swap.
for (int i = 0; i < moves_.length(); ++i) {
const MoveOperands& other_move = *moves_[i];
if (other_move.Blocks(source)) {
moves_[i]->set_src(destination);
} else if (other_move.Blocks(destination)) {
moves_[i]->set_src(source);
}
}
}
void ParallelMoveResolver::MoveMemoryToMemory(const compiler::Address& dst,
const compiler::Address& src) {
void ParallelMoveEmitter::MoveMemoryToMemory(const compiler::Address& dst,
const compiler::Address& src) {
UNREACHABLE();
}
// Do not call or implement this function. Instead, use the form below that
// uses an offset from the frame pointer instead of an Address.
void ParallelMoveResolver::Exchange(Register reg,
const compiler::Address& mem) {
void ParallelMoveEmitter::Exchange(Register reg, const compiler::Address& mem) {
UNREACHABLE();
}
// Do not call or implement this function. Instead, use the form below that
// uses offsets from the frame pointer instead of Addresses.
void ParallelMoveResolver::Exchange(const compiler::Address& mem1,
const compiler::Address& mem2) {
void ParallelMoveEmitter::Exchange(const compiler::Address& mem1,
const compiler::Address& mem2) {
UNREACHABLE();
}
void ParallelMoveResolver::Exchange(Register reg,
Register base_reg,
intptr_t stack_offset) {
void ParallelMoveEmitter::Exchange(Register reg,
Register base_reg,
intptr_t stack_offset) {
__ mv(TMP, reg);
__ LoadFromOffset(reg, base_reg, stack_offset);
__ StoreToOffset(TMP, base_reg, stack_offset);
}
void ParallelMoveResolver::Exchange(Register base_reg1,
intptr_t stack_offset1,
Register base_reg2,
intptr_t stack_offset2) {
void ParallelMoveEmitter::Exchange(Register base_reg1,
intptr_t stack_offset1,
Register base_reg2,
intptr_t stack_offset2) {
ScratchRegisterScope tmp1(this, kNoRegister);
ScratchRegisterScope tmp2(this, tmp1.reg());
__ LoadFromOffset(tmp1.reg(), base_reg1, stack_offset1);
@ -1179,20 +1162,20 @@ void ParallelMoveResolver::Exchange(Register base_reg1,
__ StoreToOffset(tmp2.reg(), base_reg1, stack_offset1);
}
void ParallelMoveResolver::SpillScratch(Register reg) {
void ParallelMoveEmitter::SpillScratch(Register reg) {
__ PushRegister(reg);
}
void ParallelMoveResolver::RestoreScratch(Register reg) {
void ParallelMoveEmitter::RestoreScratch(Register reg) {
__ PopRegister(reg);
}
void ParallelMoveResolver::SpillFpuScratch(FpuRegister reg) {
void ParallelMoveEmitter::SpillFpuScratch(FpuRegister reg) {
__ subi(SP, SP, sizeof(double));
__ fsd(reg, compiler::Address(SP, 0));
}
void ParallelMoveResolver::RestoreFpuScratch(FpuRegister reg) {
void ParallelMoveEmitter::RestoreFpuScratch(FpuRegister reg) {
__ fld(reg, compiler::Address(SP, 0));
__ addi(SP, SP, sizeof(double));
}

View file

@ -10,6 +10,7 @@
#include "vm/compiler/api/type_check_mode.h"
#include "vm/compiler/backend/il_printer.h"
#include "vm/compiler/backend/locations.h"
#include "vm/compiler/backend/parallel_move_resolver.h"
#include "vm/compiler/ffi/native_location.h"
#include "vm/compiler/jit/compiler.h"
#include "vm/dart_entry.h"
@ -1068,10 +1069,9 @@ void FlowGraphCompiler::LoadBSSEntry(BSS::Relocation relocation,
#undef __
#define __ compiler_->assembler()->
void ParallelMoveResolver::EmitSwap(int index) {
MoveOperands* move = moves_[index];
const Location source = move->src();
const Location destination = move->dest();
void ParallelMoveEmitter::EmitSwap(const MoveOperands& move) {
const Location source = move.src();
const Location destination = move.dest();
if (source.IsRegister() && destination.IsRegister()) {
__ xchgq(destination.reg(), source.reg());
@ -1130,66 +1130,49 @@ void ParallelMoveResolver::EmitSwap(int index) {
} else {
UNREACHABLE();
}
// The swap of source and destination has executed a move from source to
// destination.
move->Eliminate();
// Any unperformed (including pending) move with a source of either
// this move's source or destination needs to have their source
// changed to reflect the state of affairs after the swap.
for (int i = 0; i < moves_.length(); ++i) {
const MoveOperands& other_move = *moves_[i];
if (other_move.Blocks(source)) {
moves_[i]->set_src(destination);
} else if (other_move.Blocks(destination)) {
moves_[i]->set_src(source);
}
}
}
void ParallelMoveResolver::MoveMemoryToMemory(const compiler::Address& dst,
const compiler::Address& src) {
void ParallelMoveEmitter::MoveMemoryToMemory(const compiler::Address& dst,
const compiler::Address& src) {
__ MoveMemoryToMemory(dst, src);
}
void ParallelMoveResolver::Exchange(Register reg,
const compiler::Address& mem) {
void ParallelMoveEmitter::Exchange(Register reg, const compiler::Address& mem) {
__ Exchange(reg, mem);
}
void ParallelMoveResolver::Exchange(const compiler::Address& mem1,
const compiler::Address& mem2) {
void ParallelMoveEmitter::Exchange(const compiler::Address& mem1,
const compiler::Address& mem2) {
__ Exchange(mem1, mem2);
}
void ParallelMoveResolver::Exchange(Register reg,
Register base_reg,
intptr_t stack_offset) {
void ParallelMoveEmitter::Exchange(Register reg,
Register base_reg,
intptr_t stack_offset) {
UNREACHABLE();
}
void ParallelMoveResolver::Exchange(Register base_reg1,
intptr_t stack_offset1,
Register base_reg2,
intptr_t stack_offset2) {
void ParallelMoveEmitter::Exchange(Register base_reg1,
intptr_t stack_offset1,
Register base_reg2,
intptr_t stack_offset2) {
UNREACHABLE();
}
void ParallelMoveResolver::SpillScratch(Register reg) {
void ParallelMoveEmitter::SpillScratch(Register reg) {
__ pushq(reg);
}
void ParallelMoveResolver::RestoreScratch(Register reg) {
void ParallelMoveEmitter::RestoreScratch(Register reg) {
__ popq(reg);
}
void ParallelMoveResolver::SpillFpuScratch(FpuRegister reg) {
void ParallelMoveEmitter::SpillFpuScratch(FpuRegister reg) {
__ AddImmediate(RSP, compiler::Immediate(-kFpuRegisterSize));
__ movups(compiler::Address(RSP, 0), reg);
}
void ParallelMoveResolver::RestoreFpuScratch(FpuRegister reg) {
void ParallelMoveEmitter::RestoreFpuScratch(FpuRegister reg) {
__ movups(reg, compiler::Address(RSP, 0));
__ AddImmediate(RSP, compiler::Immediate(kFpuRegisterSize));
}

View file

@ -16,6 +16,7 @@
#include "vm/compiler/backend/locations.h"
#include "vm/compiler/backend/locations_helpers.h"
#include "vm/compiler/backend/loops.h"
#include "vm/compiler/backend/parallel_move_resolver.h"
#include "vm/compiler/backend/range_analysis.h"
#include "vm/compiler/ffi/frame_rebase.h"
#include "vm/compiler/ffi/marshaller.h"
@ -4035,7 +4036,7 @@ void JoinEntryInstr::EmitNativeCode(FlowGraphCompiler* compiler) {
InstructionSource());
}
if (HasParallelMove()) {
compiler->parallel_move_resolver()->EmitNativeCode(parallel_move());
parallel_move()->EmitNativeCode(compiler);
}
}
@ -4065,7 +4066,7 @@ void TargetEntryInstr::EmitNativeCode(FlowGraphCompiler* compiler) {
if (compiler::Assembler::EmittingComments()) {
compiler->EmitComment(parallel_move());
}
compiler->parallel_move_resolver()->EmitNativeCode(parallel_move());
parallel_move()->EmitNativeCode(compiler);
}
}
@ -4139,7 +4140,7 @@ void FunctionEntryInstr::EmitNativeCode(FlowGraphCompiler* compiler) {
if (compiler::Assembler::EmittingComments()) {
compiler->EmitComment(parallel_move());
}
compiler->parallel_move_resolver()->EmitNativeCode(parallel_move());
parallel_move()->EmitNativeCode(compiler);
}
}
@ -4235,7 +4236,7 @@ void OsrEntryInstr::EmitNativeCode(FlowGraphCompiler* compiler) {
if (compiler::Assembler::EmittingComments()) {
compiler->EmitComment(parallel_move());
}
compiler->parallel_move_resolver()->EmitNativeCode(parallel_move());
parallel_move()->EmitNativeCode(compiler);
}
}
@ -4680,7 +4681,7 @@ LocationSummary* ParallelMoveInstr::MakeLocationSummary(Zone* zone,
}
void ParallelMoveInstr::EmitNativeCode(FlowGraphCompiler* compiler) {
UNREACHABLE();
ParallelMoveEmitter(compiler, this).EmitNativeCode();
}
LocationSummary* ConstraintInstr::MakeLocationSummary(Zone* zone,

View file

@ -59,6 +59,7 @@ class Instruction;
class InstructionVisitor;
class LocalVariable;
class LoopInfo;
class MoveSchedule;
class ParsedFunction;
class Range;
class RangeAnalysis;
@ -1479,6 +1480,8 @@ class TemplateInstruction
class MoveOperands : public ZoneAllocated {
public:
MoveOperands(Location dest, Location src) : dest_(dest), src_(src) {}
MoveOperands(const MoveOperands& other)
: dest_(other.dest_), src_(other.src_) {}
MoveOperands& operator=(const MoveOperands& other) {
dest_ = other.dest_;
@ -1568,12 +1571,22 @@ class ParallelMoveInstr : public TemplateInstruction<0, NoThrow> {
return TokenPosition::kParallelMove;
}
const MoveSchedule& move_schedule() const {
ASSERT(move_schedule_ != nullptr);
return *move_schedule_;
}
void set_move_schedule(const MoveSchedule& schedule) {
move_schedule_ = &schedule;
}
PRINT_TO_SUPPORT
DECLARE_EMPTY_SERIALIZATION(ParallelMoveInstr, TemplateInstruction)
DECLARE_EXTRA_SERIALIZATION
private:
GrowableArray<MoveOperands*> moves_; // Elements cannot be null.
const MoveSchedule* move_schedule_ = nullptr;
DISALLOW_COPY_AND_ASSIGN(ParallelMoveInstr);
};

View file

@ -3187,7 +3187,7 @@ void CatchBlockEntryInstr::EmitNativeCode(FlowGraphCompiler* compiler) {
}
}
if (HasParallelMove()) {
compiler->parallel_move_resolver()->EmitNativeCode(parallel_move());
parallel_move()->EmitNativeCode(compiler);
}
// Restore SP from FP as we are coming from a throw and the code for
@ -7126,7 +7126,7 @@ void GotoInstr::EmitNativeCode(FlowGraphCompiler* compiler) {
InstructionSource());
}
if (HasParallelMove()) {
compiler->parallel_move_resolver()->EmitNativeCode(parallel_move());
parallel_move()->EmitNativeCode(compiler);
}
// We can fall through if the successor is the next block in the list.

View file

@ -2847,7 +2847,7 @@ void CatchBlockEntryInstr::EmitNativeCode(FlowGraphCompiler* compiler) {
}
}
if (HasParallelMove()) {
compiler->parallel_move_resolver()->EmitNativeCode(parallel_move());
parallel_move()->EmitNativeCode(compiler);
}
// Restore SP from FP as we are coming from a throw and the code for
@ -6218,7 +6218,7 @@ void GotoInstr::EmitNativeCode(FlowGraphCompiler* compiler) {
InstructionSource());
}
if (HasParallelMove()) {
compiler->parallel_move_resolver()->EmitNativeCode(parallel_move());
parallel_move()->EmitNativeCode(compiler);
}
// We can fall through if the successor is the next block in the list.

View file

@ -2474,7 +2474,7 @@ void CatchBlockEntryInstr::EmitNativeCode(FlowGraphCompiler* compiler) {
}
}
if (HasParallelMove()) {
compiler->parallel_move_resolver()->EmitNativeCode(parallel_move());
parallel_move()->EmitNativeCode(compiler);
}
// Restore ESP from EBP as we are coming from a throw and the code for
@ -6264,7 +6264,7 @@ void GotoInstr::EmitNativeCode(FlowGraphCompiler* compiler) {
InstructionSource());
}
if (HasParallelMove()) {
compiler->parallel_move_resolver()->EmitNativeCode(parallel_move());
parallel_move()->EmitNativeCode(compiler);
}
// We can fall through if the successor is the next block in the list.

View file

@ -3128,7 +3128,7 @@ void CatchBlockEntryInstr::EmitNativeCode(FlowGraphCompiler* compiler) {
}
}
if (HasParallelMove()) {
compiler->parallel_move_resolver()->EmitNativeCode(parallel_move());
parallel_move()->EmitNativeCode(compiler);
}
// Restore SP from FP as we are coming from a throw and the code for
@ -7243,7 +7243,7 @@ void GotoInstr::EmitNativeCode(FlowGraphCompiler* compiler) {
InstructionSource());
}
if (HasParallelMove()) {
compiler->parallel_move_resolver()->EmitNativeCode(parallel_move());
parallel_move()->EmitNativeCode(compiler);
}
// We can fall through if the successor is the next block in the list.

View file

@ -1369,17 +1369,30 @@ template <>
void FlowGraphSerializer::WriteTrait<MoveOperands*>::Write(
FlowGraphSerializer* s,
MoveOperands* x) {
s->Write<const MoveOperands*>(x);
}
template <>
void FlowGraphSerializer::WriteTrait<const MoveOperands*>::Write(
FlowGraphSerializer* s,
const MoveOperands* x) {
ASSERT(x != nullptr);
x->src().Write(s);
x->dest().Write(s);
}
template <>
MoveOperands* FlowGraphDeserializer::ReadTrait<MoveOperands*>::Read(
MoveOperands FlowGraphDeserializer::ReadTrait<MoveOperands>::Read(
FlowGraphDeserializer* d) {
Location src = Location::Read(d);
Location dest = Location::Read(d);
return new (d->zone()) MoveOperands(dest, src);
return {dest, src};
}
template <>
MoveOperands* FlowGraphDeserializer::ReadTrait<MoveOperands*>::Read(
FlowGraphDeserializer* d) {
return new (d->zone()) MoveOperands(d->Read<MoveOperands>());
}
template <>
@ -2083,11 +2096,13 @@ OsrEntryInstr::OsrEntryInstr(FlowGraphDeserializer* d)
void ParallelMoveInstr::WriteExtra(FlowGraphSerializer* s) {
Instruction::WriteExtra(s);
s->Write<GrowableArray<MoveOperands*>>(moves_);
s->Write<const MoveSchedule*>(move_schedule_);
}
void ParallelMoveInstr::ReadExtra(FlowGraphDeserializer* d) {
Instruction::ReadExtra(d);
moves_ = d->Read<GrowableArray<MoveOperands*>>();
move_schedule_ = d->Read<const MoveSchedule*>();
}
void PhiInstr::WriteTo(FlowGraphSerializer* s) {

View file

@ -38,6 +38,7 @@ class JoinEntryInstr;
class LocalVariable;
class LocationSummary;
class MoveOperands;
class MoveSchedule;
class NonStreamingWriteStream;
class OsrEntryInstr;
class ParsedFunction;
@ -100,6 +101,7 @@ class NativeCallingConvention;
V(const LocalVariable&) \
V(LocationSummary*) \
V(MoveOperands*) \
V(const MoveSchedule*) \
V(const compiler::ffi::NativeCallingConvention&) \
V(const Object&) \
V(ParallelMoveInstr*) \
@ -253,6 +255,7 @@ class FlowGraphSerializer : public ValueObject {
static void Write(FlowGraphSerializer* s, type x); \
};
IL_SERIALIZABLE_TYPE_LIST(DECLARE_WRITE_TRAIT)
DECLARE_WRITE_TRAIT(const MoveOperands*)
#undef DECLARE_WRITE_TRAIT
template <typename T>
@ -472,6 +475,7 @@ class FlowGraphDeserializer : public ValueObject {
static type Read(FlowGraphDeserializer* d); \
};
IL_SERIALIZABLE_TYPE_LIST(DECLARE_READ_TRAIT)
DECLARE_READ_TRAIT(MoveOperands)
#undef DECLARE_READ_TRAIT
template <typename T>

View file

@ -2894,7 +2894,7 @@ void CatchBlockEntryInstr::EmitNativeCode(FlowGraphCompiler* compiler) {
}
}
if (HasParallelMove()) {
compiler->parallel_move_resolver()->EmitNativeCode(parallel_move());
parallel_move()->EmitNativeCode(compiler);
}
// Restore RSP from RBP as we are coming from a throw and the code for
@ -6583,7 +6583,7 @@ void GotoInstr::EmitNativeCode(FlowGraphCompiler* compiler) {
InstructionSource());
}
if (HasParallelMove()) {
compiler->parallel_move_resolver()->EmitNativeCode(parallel_move());
parallel_move()->EmitNativeCode(compiler);
}
// We can fall through if the successor is the next block in the list.

View file

@ -10,6 +10,7 @@
#include "vm/compiler/backend/il.h"
#include "vm/compiler/backend/il_printer.h"
#include "vm/compiler/backend/loops.h"
#include "vm/compiler/backend/parallel_move_resolver.h"
#include "vm/log.h"
#include "vm/parser.h"
#include "vm/stack_frame.h"
@ -3311,6 +3312,26 @@ void FlowGraphAllocator::AllocateOutgoingArguments() {
}
}
void FlowGraphAllocator::ScheduleParallelMoves() {
ParallelMoveResolver resolver;
for (auto block : flow_graph_.reverse_postorder()) {
if (block->HasParallelMove()) {
resolver.Resolve(block->parallel_move());
}
for (auto instruction : block->instructions()) {
if (auto move = instruction->AsParallelMove()) {
resolver.Resolve(move);
}
}
if (auto goto_instr = block->last_instruction()->AsGoto()) {
if (goto_instr->HasParallelMove()) {
resolver.Resolve(goto_instr->parallel_move());
}
}
}
}
void FlowGraphAllocator::AllocateRegisters() {
CollectRepresentations();
@ -3375,6 +3396,8 @@ void FlowGraphAllocator::AllocateRegisters() {
ResolveControlFlow();
ScheduleParallelMoves();
if (FLAG_print_ssa_liveranges && CompilerState::ShouldTrace()) {
const Function& function = flow_graph_.function();

View file

@ -181,6 +181,8 @@ class FlowGraphAllocator : public ValueObject {
// Connect split siblings over non-linear control flow edges.
void ResolveControlFlow();
void ScheduleParallelMoves();
// Returns true if the target location is the spill slot for the given range.
bool TargetLocationIsSpillSlot(LiveRange* range, Location target);

View file

@ -0,0 +1,414 @@
// Copyright (c) 2023, the Dart project authors. Please see the AUTHORS file
// for details. All rights reserved. Use of this source code is governed by a
// BSD-style license that can be found in the LICENSE file.
#include "vm/compiler/backend/parallel_move_resolver.h"
namespace dart {
// Simple dynamically allocated array of fixed length.
template <typename Subclass, typename Element>
class FixedArray {
public:
static Subclass& Allocate(intptr_t length) {
static_assert(Utils::IsAligned(alignof(Subclass), alignof(Element)));
auto result =
reinterpret_cast<void*>(Thread::Current()->zone()->AllocUnsafe(
sizeof(Subclass) + length * sizeof(Element)));
return *new (result) Subclass(length);
}
intptr_t length() const { return length_; }
Element& operator[](intptr_t i) {
ASSERT(0 <= i && i < length_);
return data()[i];
}
const Element& operator[](intptr_t i) const {
ASSERT(0 <= i && i < length_);
return data()[i];
}
Element* data() { OPEN_ARRAY_START(Element, Element); }
const Element* data() const { OPEN_ARRAY_START(Element, Element); }
Element* begin() { return data(); }
const Element* begin() const { return data(); }
Element* end() { return data() + length_; }
const Element* end() const { return data() + length_; }
protected:
explicit FixedArray(intptr_t length) : length_(length) {}
private:
intptr_t length_;
DISALLOW_COPY_AND_ASSIGN(FixedArray);
};
class MoveSchedule : public FixedArray<MoveSchedule, ParallelMoveResolver::Op> {
public:
// Converts the given list of |ParallelMoveResolver::Op| operations
// into a |MoveSchedule| and filters out all |kNop| operations.
static const MoveSchedule& From(
const GrowableArray<ParallelMoveResolver::Op>& ops) {
intptr_t count = 0;
for (const auto& op : ops) {
if (op.kind != ParallelMoveResolver::OpKind::kNop) count++;
}
auto& result = FixedArray::Allocate(count);
intptr_t i = 0;
for (const auto& op : ops) {
if (op.kind != ParallelMoveResolver::OpKind::kNop) {
result[i++] = op;
}
}
return result;
}
private:
friend class FixedArray;
explicit MoveSchedule(intptr_t length) : FixedArray(length) {}
DISALLOW_COPY_AND_ASSIGN(MoveSchedule);
};
static uword RegMaskBit(Register reg) {
return ((reg) != kNoRegister) ? (1 << (reg)) : 0;
}
ParallelMoveResolver::ParallelMoveResolver() : moves_(32) {}
void ParallelMoveResolver::Resolve(ParallelMoveInstr* parallel_move) {
ASSERT(moves_.is_empty());
// Build up a worklist of moves.
BuildInitialMoveList(parallel_move);
const InstructionSource& move_source = InstructionSource(
TokenPosition::kParallelMove, parallel_move->inlining_id());
for (intptr_t i = 0; i < moves_.length(); ++i) {
const MoveOperands& move = moves_[i];
// Skip constants to perform them last. They don't block other moves
// and skipping such moves with register destinations keeps those
// registers free for the whole algorithm.
if (!move.IsEliminated() && !move.src().IsConstant()) {
PerformMove(move_source, i);
}
}
// Perform the moves with constant sources.
for (const auto& move : moves_) {
if (!move.IsEliminated()) {
ASSERT(move.src().IsConstant());
scheduled_ops_.Add({OpKind::kMove, move});
}
}
moves_.Clear();
// Schedule is ready. Update parallel move itself.
parallel_move->set_move_schedule(MoveSchedule::From(scheduled_ops_));
scheduled_ops_.Clear();
}
void ParallelMoveResolver::BuildInitialMoveList(
ParallelMoveInstr* parallel_move) {
// Perform a linear sweep of the moves to add them to the initial list of
// moves to perform, ignoring any move that is redundant (the source is
// the same as the destination, the destination is ignored and
// unallocated, or the move was already eliminated).
for (int i = 0; i < parallel_move->NumMoves(); i++) {
MoveOperands* move = parallel_move->MoveOperandsAt(i);
if (!move->IsRedundant()) moves_.Add(*move);
}
}
void ParallelMoveResolver::PerformMove(const InstructionSource& source,
int index) {
// Each call to this function performs a move and deletes it from the move
// graph. We first recursively perform any move blocking this one. We
// mark a move as "pending" on entry to PerformMove in order to detect
// cycles in the move graph. We use operand swaps to resolve cycles,
// which means that a call to PerformMove could change any source operand
// in the move graph.
ASSERT(!moves_[index].IsPending());
ASSERT(!moves_[index].IsRedundant());
// Clear this move's destination to indicate a pending move. The actual
// destination is saved in a stack-allocated local. Recursion may allow
// multiple moves to be pending.
ASSERT(!moves_[index].src().IsInvalid());
Location destination = moves_[index].MarkPending();
// Perform a depth-first traversal of the move graph to resolve
// dependencies. Any unperformed, unpending move with a source the same
// as this one's destination blocks this one so recursively perform all
// such moves.
for (int i = 0; i < moves_.length(); ++i) {
const MoveOperands& other_move = moves_[i];
if (other_move.Blocks(destination) && !other_move.IsPending()) {
// Though PerformMove can change any source operand in the move graph,
// this call cannot create a blocking move via a swap (this loop does
// not miss any). Assume there is a non-blocking move with source A
// and this move is blocked on source B and there is a swap of A and
// B. Then A and B must be involved in the same cycle (or they would
// not be swapped). Since this move's destination is B and there is
// only a single incoming edge to an operand, this move must also be
// involved in the same cycle. In that case, the blocking move will
// be created but will be "pending" when we return from PerformMove.
PerformMove(source, i);
}
}
// We are about to resolve this move and don't need it marked as
// pending, so restore its destination.
moves_[index].ClearPending(destination);
// This move's source may have changed due to swaps to resolve cycles and
// so it may now be the last move in the cycle. If so remove it.
if (moves_[index].src().Equals(destination)) {
moves_[index].Eliminate();
return;
}
// The move may be blocked on a (at most one) pending move, in which case
// we have a cycle. Search for such a blocking move and perform a swap to
// resolve it.
for (auto& other_move : moves_) {
if (other_move.Blocks(destination)) {
ASSERT(other_move.IsPending());
AddSwapToSchedule(index);
return;
}
}
// This move is not blocked.
AddMoveToSchedule(index);
}
void ParallelMoveResolver::AddMoveToSchedule(int index) {
auto& move = moves_[index];
scheduled_ops_.Add({OpKind::kMove, move});
move.Eliminate();
}
void ParallelMoveResolver::AddSwapToSchedule(int index) {
auto& move = moves_[index];
const auto source = move.src();
const auto destination = move.dest();
scheduled_ops_.Add({OpKind::kSwap, move});
// The swap of source and destination has executed a move from source to
// destination.
move.Eliminate();
// Any unperformed (including pending) move with a source of either
// this move's source or destination needs to have their source
// changed to reflect the state of affairs after the swap.
for (auto& other_move : moves_) {
if (other_move.Blocks(source)) {
other_move.set_src(destination);
} else if (other_move.Blocks(destination)) {
other_move.set_src(source);
}
}
}
void ParallelMoveEmitter::EmitNativeCode() {
const auto& move_schedule = parallel_move_->move_schedule();
for (intptr_t i = 0; i < move_schedule.length(); i++) {
current_move_ = i;
const auto& op = move_schedule[i];
switch (op.kind) {
case ParallelMoveResolver::OpKind::kNop:
// |MoveSchedule::From| is expected to filter nops.
UNREACHABLE();
break;
case ParallelMoveResolver::OpKind::kMove:
EmitMove(op.operands);
break;
case ParallelMoveResolver::OpKind::kSwap:
EmitSwap(op.operands);
break;
}
}
}
void ParallelMoveEmitter::EmitMove(const MoveOperands& move) {
const Location src = move.src();
const Location dst = move.dest();
ParallelMoveEmitter::TemporaryAllocator temp(this, /*blocked=*/kNoRegister);
compiler_->EmitMove(dst, src, &temp);
#if defined(DEBUG)
// Allocating a scratch register here may cause stack spilling. Neither the
// source nor destination register should be SP-relative in that case.
for (const Location& loc : {dst, src}) {
ASSERT(!temp.DidAllocateTemporary() || !loc.HasStackIndex() ||
loc.base_reg() != SPREG);
}
#endif
}
bool ParallelMoveEmitter::IsScratchLocation(Location loc) {
const auto& move_schedule = parallel_move_->move_schedule();
for (intptr_t i = current_move_; i < move_schedule.length(); i++) {
const auto& op = move_schedule[i];
if (op.operands.src().Equals(loc) ||
(op.kind == ParallelMoveResolver::OpKind::kSwap &&
op.operands.dest().Equals(loc))) {
return false;
}
}
for (intptr_t i = current_move_ + 1; i < move_schedule.length(); i++) {
const auto& op = move_schedule[i];
if (op.kind == ParallelMoveResolver::OpKind::kMove &&
op.operands.dest().Equals(loc)) {
return true;
}
}
return false;
}
intptr_t ParallelMoveEmitter::AllocateScratchRegister(
Location::Kind kind,
uword blocked_mask,
intptr_t first_free_register,
intptr_t last_free_register,
bool* spilled) {
COMPILE_ASSERT(static_cast<intptr_t>(sizeof(blocked_mask)) * kBitsPerByte >=
kNumberOfFpuRegisters);
COMPILE_ASSERT(static_cast<intptr_t>(sizeof(blocked_mask)) * kBitsPerByte >=
kNumberOfCpuRegisters);
intptr_t scratch = -1;
for (intptr_t reg = first_free_register; reg <= last_free_register; reg++) {
if ((((1 << reg) & blocked_mask) == 0) &&
IsScratchLocation(Location::MachineRegisterLocation(kind, reg))) {
scratch = reg;
break;
}
}
if (scratch == -1) {
*spilled = true;
for (intptr_t reg = first_free_register; reg <= last_free_register; reg++) {
if (((1 << reg) & blocked_mask) == 0) {
scratch = reg;
break;
}
}
} else {
*spilled = false;
}
return scratch;
}
ParallelMoveEmitter::ScratchFpuRegisterScope::ScratchFpuRegisterScope(
ParallelMoveEmitter* emitter,
FpuRegister blocked)
: emitter_(emitter), reg_(kNoFpuRegister), spilled_(false) {
COMPILE_ASSERT(FpuTMP != kNoFpuRegister);
uword blocked_mask =
((blocked != kNoFpuRegister) ? 1 << blocked : 0) | 1 << FpuTMP;
reg_ = static_cast<FpuRegister>(
emitter_->AllocateScratchRegister(Location::kFpuRegister, blocked_mask, 0,
kNumberOfFpuRegisters - 1, &spilled_));
if (spilled_) {
emitter->SpillFpuScratch(reg_);
}
}
ParallelMoveEmitter::ScratchFpuRegisterScope::~ScratchFpuRegisterScope() {
if (spilled_) {
emitter_->RestoreFpuScratch(reg_);
}
}
ParallelMoveEmitter::TemporaryAllocator::TemporaryAllocator(
ParallelMoveEmitter* emitter,
Register blocked)
: emitter_(emitter),
blocked_(blocked),
reg_(kNoRegister),
spilled_(false) {}
Register ParallelMoveEmitter::TemporaryAllocator::AllocateTemporary() {
ASSERT(reg_ == kNoRegister);
uword blocked_mask = RegMaskBit(blocked_) | kReservedCpuRegisters;
if (emitter_->compiler_->intrinsic_mode()) {
// Block additional registers that must be preserved for intrinsics.
blocked_mask |= RegMaskBit(ARGS_DESC_REG);
#if !defined(TARGET_ARCH_IA32)
// Need to preserve CODE_REG to be able to store the PC marker
// and load the pool pointer.
blocked_mask |= RegMaskBit(CODE_REG);
#endif
}
reg_ = static_cast<Register>(
emitter_->AllocateScratchRegister(Location::kRegister, blocked_mask, 0,
kNumberOfCpuRegisters - 1, &spilled_));
if (spilled_) {
emitter_->SpillScratch(reg_);
}
DEBUG_ONLY(allocated_ = true;)
return reg_;
}
void ParallelMoveEmitter::TemporaryAllocator::ReleaseTemporary() {
if (spilled_) {
emitter_->RestoreScratch(reg_);
}
reg_ = kNoRegister;
}
ParallelMoveEmitter::ScratchRegisterScope::ScratchRegisterScope(
ParallelMoveEmitter* emitter,
Register blocked)
: allocator_(emitter, blocked) {
reg_ = allocator_.AllocateTemporary();
}
ParallelMoveEmitter::ScratchRegisterScope::~ScratchRegisterScope() {
allocator_.ReleaseTemporary();
}
template <>
void FlowGraphSerializer::WriteTrait<const MoveSchedule*>::Write(
FlowGraphSerializer* s,
const MoveSchedule* schedule) {
ASSERT(schedule != nullptr);
const intptr_t len = schedule->length();
s->Write<intptr_t>(len);
for (intptr_t i = 0; i < len; ++i) {
const auto& op = (*schedule)[i];
s->Write<uint8_t>(static_cast<uint8_t>(op.kind));
s->Write<const MoveOperands*>(&op.operands);
}
}
template <>
const MoveSchedule* FlowGraphDeserializer::ReadTrait<const MoveSchedule*>::Read(
FlowGraphDeserializer* d) {
const intptr_t len = d->Read<intptr_t>();
MoveSchedule& schedule = MoveSchedule::Allocate(len);
for (intptr_t i = 0; i < len; ++i) {
schedule[i].kind =
static_cast<ParallelMoveResolver::OpKind>(d->Read<uint8_t>());
schedule[i].operands = d->Read<MoveOperands>();
}
return &schedule;
}
} // namespace dart

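The new file also introduces a small FixedArray helper: the MoveSchedule
header and its trailing element storage come from a single zone
allocation, with the elements placed directly after the header via
OPEN_ARRAY_START. A standalone sketch of that trailing-storage pattern
using plain malloc (toy code; the real class is zone-allocated and freed
together with the zone):

#include <cassert>
#include <cstddef>
#include <cstdint>
#include <cstdio>
#include <cstdlib>
#include <new>

class IntArray {
 public:
  static IntArray& Allocate(std::size_t length) {
    // One allocation holds the header followed by `length` elements.
    static_assert(alignof(IntArray) >= alignof(std::int64_t),
                  "element storage must be suitably aligned");
    void* memory = std::malloc(sizeof(IntArray) + length * sizeof(std::int64_t));
    return *new (memory) IntArray(length);
  }

  std::size_t length() const { return length_; }

  std::int64_t& operator[](std::size_t i) {
    assert(i < length_);
    return data()[i];
  }

 private:
  explicit IntArray(std::size_t length) : length_(length) {}

  // Elements live immediately after the header, as with OPEN_ARRAY_START.
  std::int64_t* data() {
    return reinterpret_cast<std::int64_t*>(
        reinterpret_cast<char*>(this) + sizeof(IntArray));
  }

  std::size_t length_;
};

int main() {
  IntArray& a = IntArray::Allocate(3);
  for (std::size_t i = 0; i < a.length(); ++i) {
    a[i] = static_cast<std::int64_t>(i * i);
  }
  for (std::size_t i = 0; i < a.length(); ++i) {
    std::printf("%lld ", static_cast<long long>(a[i]));
  }
  std::printf("\n");  // Prints: 0 1 4
  std::free(&a);      // In the VM the memory is reclaimed with the zone.
  return 0;
}
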
View file

@ -0,0 +1,158 @@
// Copyright (c) 2023, the Dart project authors. Please see the AUTHORS file
// for details. All rights reserved. Use of this source code is governed by a
// BSD-style license that can be found in the LICENSE file.
#ifndef RUNTIME_VM_COMPILER_BACKEND_PARALLEL_MOVE_RESOLVER_H_
#define RUNTIME_VM_COMPILER_BACKEND_PARALLEL_MOVE_RESOLVER_H_
#if defined(DART_PRECOMPILED_RUNTIME)
#error "AOT runtime should not use compiler sources (including header files)"
#endif // defined(DART_PRECOMPILED_RUNTIME)
#include "vm/allocation.h"
#include "vm/compiler/backend/flow_graph_compiler.h"
#include "vm/compiler/backend/locations.h"
#include "vm/constants.h"
namespace dart {
class MoveOperands;
class ParallelMoveResolver : public ValueObject {
public:
ParallelMoveResolver();
// Schedule moves specified by the given parallel move and store the
// schedule on the parallel move itself.
void Resolve(ParallelMoveInstr* parallel_move);
private:
// Build the initial list of moves.
void BuildInitialMoveList(ParallelMoveInstr* parallel_move);
// Perform the move at the moves_ index in question (possibly requiring
// other moves to satisfy dependencies).
void PerformMove(const InstructionSource& source, int index);
// Schedule a move and remove it from the move graph.
void AddMoveToSchedule(int index);
// Schedule a swap of two operands. The move from
// source to destination is removed from the move graph.
void AddSwapToSchedule(int index);
FlowGraphCompiler* compiler_;
// List of moves not yet resolved.
GrowableArray<MoveOperands> moves_;
enum class OpKind : uint8_t {
kNop,
kMove,
kSwap,
};
struct Op {
OpKind kind;
MoveOperands operands;
};
GrowableArray<Op> scheduled_ops_;
friend class MoveSchedule;
friend class ParallelMoveEmitter;
friend class FlowGraphDeserializer;
};
class ParallelMoveEmitter : public ValueObject {
public:
ParallelMoveEmitter(FlowGraphCompiler* compiler,
ParallelMoveInstr* parallel_move)
: compiler_(compiler), parallel_move_(parallel_move) {}
void EmitNativeCode();
private:
class ScratchFpuRegisterScope : public ValueObject {
public:
ScratchFpuRegisterScope(ParallelMoveEmitter* emitter, FpuRegister blocked);
~ScratchFpuRegisterScope();
FpuRegister reg() const { return reg_; }
private:
ParallelMoveEmitter* const emitter_;
FpuRegister reg_;
bool spilled_;
};
class TemporaryAllocator : public TemporaryRegisterAllocator {
public:
TemporaryAllocator(ParallelMoveEmitter* emitter, Register blocked);
Register AllocateTemporary() override;
void ReleaseTemporary() override;
DEBUG_ONLY(bool DidAllocateTemporary() { return allocated_; })
virtual ~TemporaryAllocator() { ASSERT(reg_ == kNoRegister); }
private:
ParallelMoveEmitter* const emitter_;
const Register blocked_;
Register reg_;
bool spilled_;
DEBUG_ONLY(bool allocated_ = false);
};
class ScratchRegisterScope : public ValueObject {
public:
ScratchRegisterScope(ParallelMoveEmitter* emitter, Register blocked);
~ScratchRegisterScope();
Register reg() const { return reg_; }
private:
TemporaryAllocator allocator_;
Register reg_;
};
bool IsScratchLocation(Location loc);
intptr_t AllocateScratchRegister(Location::Kind kind,
uword blocked_mask,
intptr_t first_free_register,
intptr_t last_free_register,
bool* spilled);
void SpillScratch(Register reg);
void RestoreScratch(Register reg);
void SpillFpuScratch(FpuRegister reg);
void RestoreFpuScratch(FpuRegister reg);
// Generate the code for a move from source to destination.
void EmitMove(const MoveOperands& move);
void EmitSwap(const MoveOperands& swap);
// Verify the move list before performing moves.
void Verify();
// Helpers for non-trivial source-destination combinations that cannot
// be handled by a single instruction.
void MoveMemoryToMemory(const compiler::Address& dst,
const compiler::Address& src);
void Exchange(Register reg, const compiler::Address& mem);
void Exchange(const compiler::Address& mem1, const compiler::Address& mem2);
void Exchange(Register reg, Register base_reg, intptr_t stack_offset);
void Exchange(Register base_reg1,
intptr_t stack_offset1,
Register base_reg2,
intptr_t stack_offset2);
FlowGraphCompiler* const compiler_;
ParallelMoveInstr* parallel_move_;
intptr_t current_move_;
};
} // namespace dart
#endif // RUNTIME_VM_COMPILER_BACKEND_PARALLEL_MOVE_RESOLVER_H_

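The emitter still finds scratch registers on the fly and, when every
usable register is live, spills one with a push/pop around the move
(this is the mechanism the commit message proposes to replace with
dedicated spill-area slots). A standalone toy model of the selection
logic in AllocateScratchRegister, where clobberable() is a simplified
stand-in for IsScratchLocation():

#include <cstdint>
#include <cstdio>

struct Scratch {
  int reg;       // chosen register index
  bool spilled;  // true if the caller must spill/restore it around its use
};

// blocked_mask: registers the caller forbids (reserved registers and the
// operands of the current move). clobberable(reg): true when the remaining
// move schedule no longer needs the current value of reg.
template <typename Predicate>
Scratch AllocateScratch(std::uint32_t blocked_mask, int num_regs,
                        Predicate clobberable) {
  // Prefer a register that is neither blocked nor holding a live value.
  for (int reg = 0; reg < num_regs; ++reg) {
    if (((blocked_mask >> reg) & 1u) == 0 && clobberable(reg)) {
      return {reg, /*spilled=*/false};
    }
  }
  // Otherwise take any unblocked register; it must be spilled around use.
  for (int reg = 0; reg < num_regs; ++reg) {
    if (((blocked_mask >> reg) & 1u) == 0) {
      return {reg, /*spilled=*/true};
    }
  }
  return {-1, false};  // No register left (does not happen in practice).
}

int main() {
  // r0 and r1 are blocked; r2 still holds a value the schedule needs.
  const std::uint32_t blocked = 0b011u;
  auto clobberable = [](int reg) { return reg != 2; };
  const Scratch s = AllocateScratch(blocked, /*num_regs=*/4, clobberable);
  std::printf("picked r%d, spilled=%d\n", s.reg, s.spilled ? 1 : 0);
  // Prints: picked r3, spilled=0
  return 0;
}
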
View file

@ -78,6 +78,8 @@ compiler_sources = [
"backend/locations_helpers_arm.h",
"backend/loops.cc",
"backend/loops.h",
"backend/parallel_move_resolver.cc",
"backend/parallel_move_resolver.h",
"backend/range_analysis.cc",
"backend/range_analysis.h",
"backend/redundancy_elimination.cc",

View file

@ -52,22 +52,16 @@ static void EmitCodeFor(FlowGraphCompiler* compiler, FlowGraph* graph) {
if (block->IsGraphEntry()) continue; // No code for graph entry needed.
if (block->HasParallelMove()) {
compiler->parallel_move_resolver()->EmitNativeCode(
block->parallel_move());
block->parallel_move()->EmitNativeCode(compiler);
}
for (ForwardInstructionIterator it(block); !it.Done(); it.Advance()) {
Instruction* instr = it.Current();
if (FLAG_code_comments) compiler->EmitComment(instr);
if (instr->IsParallelMove()) {
compiler->parallel_move_resolver()->EmitNativeCode(
instr->AsParallelMove());
} else {
ASSERT(instr->locs() != NULL);
// Calls are not supported in intrinsics code.
ASSERT(!instr->locs()->always_calls());
instr->EmitNativeCode(compiler);
}
// Calls are not supported in intrinsics code.
ASSERT(instr->IsParallelMove() ||
(instr->locs() != nullptr && !instr->locs()->always_calls()));
instr->EmitNativeCode(compiler);
}
}
compiler->assembler()->Comment("Graph intrinsic end");

View file

@ -154,7 +154,7 @@ const intptr_t kOffsetOfPtr = 32;
#define OPEN_ARRAY_START(type, align) \
do { \
const uword result = reinterpret_cast<uword>(this) + sizeof(*this); \
ASSERT(Utils::IsAligned(result, sizeof(align))); \
ASSERT(Utils::IsAligned(result, alignof(align))); \
return reinterpret_cast<type*>(result); \
} while (0)
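
The final hunk changes OPEN_ARRAY_START to assert alignment against
alignof(align) instead of sizeof(align). The distinction matters for the
element type introduced above: ParallelMoveResolver::Op packs a one-byte
kind together with a MoveOperands, so on a typical 64-bit target its
size is 24 while its alignment is 8, and an otherwise correctly aligned
address is rarely a multiple of 24. A standalone illustration with
stand-in types (exact values are platform-dependent):

#include <cstdint>
#include <cstdio>

struct Operands {  // stand-in for MoveOperands: two word-sized locations
  std::uintptr_t src;
  std::uintptr_t dst;
};

struct Op {  // stand-in for ParallelMoveResolver::Op
  std::uint8_t kind;
  Operands operands;
};

int main() {
  std::printf("sizeof(Op) = %zu, alignof(Op) = %zu\n", sizeof(Op), alignof(Op));
  // Typical 64-bit output: sizeof(Op) = 24, alignof(Op) = 8. An 8-byte
  // aligned address right after the FixedArray header is correctly aligned
  // for Op, but it is usually not a multiple of 24, so the old
  // ASSERT(Utils::IsAligned(result, sizeof(align))) would fire even though
  // the layout is fine; checking against alignof(align) is the right test.
  return 0;
}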