[vm/compiler] Create leaf runtime entry for memmove.

Instead of making a StaticCall to _TypedListBase._nativeSetRange
inside _memMoveN, make a CCall to the memmove leaf runtime entry.
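
The leaf entry is just libc memmove behind a C function pointer;
generated code loads its address from the Thread object and invokes it
with CCall. As a rough standalone sketch (only the MemMoveCFunction
typedef mirrors the typedef added below; main and buf are
illustrative):

  #include <stdio.h>
  #include <string.h>

  // Same shape as the VM's MemMoveCFunction typedef.
  typedef void* (*MemMoveCFunction)(void* dest, const void* src, size_t n);

  int main() {
    MemMoveCFunction move = &memmove;
    char buf[] = "abcdef";
    // memmove, unlike memcpy, is defined for overlapping ranges, so it
    // is safe for same-buffer copies like this one.
    move(buf + 2, buf, 4);
    printf("%s\n", buf);  // prints "ababcd"
    return 0;
  }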

Rename _TypedListBase._nativeSetRange to _setClampedRange, since
it's now only used when per-element clamping is necessary.
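
The remaining clamped path cannot be a plain memmove because each
element must saturate. A rough sketch of what it boils down to,
assuming the matching one-byte elements the callers guarantee
(SetClampedRange here is a hypothetical stand-in, not the actual
native entry):

  #include <stddef.h>
  #include <stdint.h>

  // Copy signed bytes into a clamped destination: negative values
  // saturate to 0; everything else already fits in 0..255.
  void SetClampedRange(uint8_t* dst, const int8_t* src, size_t count) {
    for (size_t i = 0; i < count; ++i) {
      dst[i] = src[i] < 0 ? 0 : static_cast<uint8_t>(src[i]);
    }
  }

  int main() {
    const int8_t src[] = {-1, 7, -128, 127};
    uint8_t dst[4];
    SetClampedRange(dst, src, 4);
    // dst is now {0, 7, 0, 127}.
    return 0;
  }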

Fix the load optimizer so that loads of unboxed fields from freshly
allocated objects do not have the tagged null value forwarded
as their initial post-allocation value.
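
In other words, tagged null is only a valid default for boxed fields;
an unboxed slot such as PointerBase.data is written by the allocation
stub itself, so the optimizer has nothing it can forward. A toy model
of the rule (names and types here are hypothetical, not the VM's):

  #include <cassert>
  #include <cstdint>
  #include <optional>

  struct SlotInfo {
    bool has_explicit_input;  // allocation takes this field as an input
    bool is_unboxed;          // field holds a raw, untagged machine value
  };

  constexpr uintptr_t kTaggedNull = 0x1;  // stand-in for the null object

  // Value the load optimizer may forward for a field of a fresh
  // allocation, or nothing if the stub initializes it out of band.
  std::optional<uintptr_t> InitialValueToForward(SlotInfo slot,
                                                 uintptr_t explicit_input) {
    if (slot.has_explicit_input) return explicit_input;
    if (slot.is_unboxed) return std::nullopt;  // e.g. PointerBase.data
    return kTaggedNull;  // boxed fields start out null
  }

  int main() {
    assert((InitialValueToForward({true, false}, 42) == 42u));
    assert((InitialValueToForward({false, true}, 0) == std::nullopt));
    assert((InitialValueToForward({false, false}, 0) == kTaggedNull));
    return 0;
  }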

TEST=co19{,_2}/LibTest/typed_data lib{,_2}/typed_data
     corelib{,_2}/list_test
     vm/cc/LoadOptimizer_LoadDataFieldOfNewTypedData

Issue: https://github.com/dart-lang/sdk/issues/42072
Change-Id: Ib82e24a5b3287fa53099fffd3b563a27d777507e
Cq-Include-Trybots: luci.dart.try:vm-aot-linux-debug-simarm_x64-try,vm-aot-linux-debug-x64-try,vm-aot-linux-debug-x64c-try,vm-kernel-linux-debug-x64-try,vm-kernel-precomp-linux-debug-x64-try,vm-linux-debug-x64-try,vm-linux-debug-x64c-try,vm-mac-debug-arm64-try,vm-aot-linux-release-simarm_x64-try,vm-aot-linux-release-x64-try,vm-aot-mac-release-arm64-try,vm-aot-msan-linux-release-x64-try,vm-msan-linux-release-x64-try,vm-aot-tsan-linux-release-x64-try,vm-tsan-linux-release-x64-try,vm-linux-release-x64-try,vm-mac-release-arm64-try,vm-kernel-precomp-linux-release-x64-try
Reviewed-on: https://dart-review.googlesource.com/c/sdk/+/324080
Reviewed-by: Alexander Markov <alexmarkov@google.com>
Commit-Queue: Tess Strickland <sstrickl@google.com>
Tess Strickland 2023-09-11 21:25:09 +00:00 committed by Commit Queue
parent 8a91749e42
commit 3f53d22d43
10 changed files with 1172 additions and 1061 deletions


@@ -88,7 +88,7 @@ static bool IsUint8(intptr_t cid) {
cid <= kUnmodifiableTypedDataUint8ClampedArrayViewCid;
}
DEFINE_NATIVE_ENTRY(TypedDataBase_setRange, 0, 5) {
DEFINE_NATIVE_ENTRY(TypedDataBase_setClampedRange, 0, 5) {
// This is called after bounds checking, so the numeric inputs are
// guaranteed to be Smis, and the length is guaranteed to be non-zero.
const TypedDataBase& dst =
@@ -128,19 +128,9 @@ DEFINE_NATIVE_ENTRY(TypedDataBase_setRange, 0, 5) {
ASSERT(length_in_bytes <= src_length_in_bytes - src_start_in_bytes);
#endif
if (!IsClamped(dst.ptr()->GetClassId()) || IsUint8(src.ptr()->GetClassId())) {
// TODO(dartbug.com/42072): We do this when the copy length gets large
// enough that a native call to invoke memmove is faster than the generated
// code from MemoryCopy. Replace the static call to _nativeSetRange with
// a CCall() to a memmove leaf runtime entry and remove the possibility of
// calling _nativeSetRange except in the clamping case.
NoSafepointScope no_safepoint;
memmove(dst.DataAddr(dst_start_in_bytes), src.DataAddr(src_start_in_bytes),
length_in_bytes);
return Object::null();
}
ASSERT_EQUAL(element_size_in_bytes, 1);
ASSERT(IsClamped(dst.ptr()->GetClassId()));
ASSERT(!IsUint8(src.ptr()->GetClassId()));
NoSafepointScope no_safepoint;
uint8_t* dst_data =


@@ -173,7 +173,7 @@ namespace dart {
V(TypedData_Int32x4Array_new, 2) \
V(TypedData_Float64x2Array_new, 2) \
V(TypedDataBase_length, 1) \
V(TypedDataBase_setRange, 5) \
V(TypedDataBase_setClampedRange, 5) \
V(TypedData_GetInt8, 2) \
V(TypedData_SetInt8, 3) \
V(TypedData_GetUint8, 2) \


@@ -2244,6 +2244,11 @@ class LoadOptimizer : public ValueObject {
const intptr_t pos = alloc->InputForSlot(*slot);
if (pos != -1) {
forward_def = alloc->InputAt(pos)->definition();
} else if (slot->is_unboxed()) {
// Unboxed fields that are not provided as an input should not
// have a tagged null value forwarded for them, similar to
// payloads of typed data arrays.
continue;
} else {
// Fields not provided as an input to the instruction are
// initialized to null during allocation.


@@ -665,6 +665,104 @@ ISOLATE_UNIT_TEST_CASE(LoadOptimizer_AliasingViaTypedDataAndUntaggedTypedData) {
}
}
// This test ensures that a LoadNativeField of the PointerBase data field for
// a newly allocated TypedData object does not have tagged null forwarded to it,
// as that's wrong for two reasons: it's an unboxed field, and it is initialized
// during the allocation stub.
ISOLATE_UNIT_TEST_CASE(LoadOptimizer_LoadDataFieldOfNewTypedData) {
using compiler::BlockBuilder;
CompilerState S(thread, /*is_aot=*/false, /*is_optimizing=*/true);
FlowGraphBuilderHelper H;
auto zone = H.flow_graph()->zone();
// We are going to build the following graph:
//
// B0[graph_entry] {
// vc42 <- Constant(42)
// }
//
// B1[function_entry] {
// array <- AllocateTypedData(kTypedDataUint8ArrayCid, vc42)
// view <- AllocateObject(kTypedDataUint8ArrayViewCid)
// v1 <- LoadNativeField(array, Slot::PointerBase_data())
// StoreNativeField(Slot::PointerBase_data(), view, v1, kNoStoreBarrier,
// kInitializing)
// return view
// }
const auto& lib = Library::Handle(zone, Library::TypedDataLibrary());
EXPECT(!lib.IsNull());
const Class& view_cls = Class::ZoneHandle(
zone, lib.LookupClassAllowPrivate(Symbols::_Uint8ArrayView()));
EXPECT(!view_cls.IsNull());
const Error& err = Error::Handle(zone, view_cls.EnsureIsFinalized(thread));
EXPECT(err.IsNull());
auto vc42 = H.flow_graph()->GetConstant(Integer::Handle(Integer::New(42)));
auto b1 = H.flow_graph()->graph_entry()->normal_entry();
AllocateTypedDataInstr* array;
AllocateObjectInstr* view;
LoadFieldInstr* v1;
StoreFieldInstr* store;
ReturnInstr* ret;
{
BlockBuilder builder(H.flow_graph(), b1);
// array <- AllocateTypedData(kTypedDataUint8ArrayCid, vc42)
array = builder.AddDefinition(
new AllocateTypedDataInstr(InstructionSource(), kTypedDataUint8ArrayCid,
new (zone) Value(vc42), DeoptId::kNone));
// view <- AllocateObject(kTypedDataUint8ArrayViewCid)
view = builder.AddDefinition(
new AllocateObjectInstr(InstructionSource(), view_cls, DeoptId::kNone));
// v1 <- LoadNativeField(array, Slot::PointerBase_data())
v1 = builder.AddDefinition(new LoadFieldInstr(new (zone) Value(array),
Slot::PointerBase_data(),
InstructionSource()));
// StoreNativeField(Slot::PointerBase_data(), view, v1, kNoStoreBarrier,
// kInitializing)
store = builder.AddInstruction(new StoreFieldInstr(
Slot::PointerBase_data(), new (zone) Value(view), new (zone) Value(v1),
kNoStoreBarrier, InstructionSource(),
StoreFieldInstr::Kind::kInitializing));
// return view
ret = builder.AddInstruction(new ReturnInstr(
InstructionSource(), new Value(view), S.GetNextDeoptId()));
}
H.FinishGraph();
DominatorBasedCSE::Optimize(H.flow_graph());
{
Instruction* alloc_array = nullptr;
Instruction* alloc_view = nullptr;
Instruction* lf = nullptr;
Instruction* sf = nullptr;
Instruction* r = nullptr;
ILMatcher cursor(H.flow_graph(), b1, true);
RELEASE_ASSERT(cursor.TryMatch({
kMatchAndMoveFunctionEntry,
{kMatchAndMoveAllocateTypedData, &alloc_array},
{kMatchAndMoveAllocateObject, &alloc_view},
{kMatchAndMoveLoadField, &lf},
{kMatchAndMoveStoreField, &sf},
{kMatchReturn, &r},
}));
EXPECT(array == alloc_array);
EXPECT(view == alloc_view);
EXPECT(v1 == lf);
EXPECT(store == sf);
EXPECT(ret == r);
}
}
// This test verifies that we correctly alias load/stores into typed array
// which use different element sizes. This is a regression test for
// a fix in 836c04f.


@@ -111,7 +111,8 @@ Slot* SlotCache::CreateNativeSlot(Slot::Kind kind) {
case Slot::Kind::k##ClassName##_##FieldName: \
return new (zone_) \
Slot(Slot::Kind::k##ClassName##_##FieldName, \
Slot::IsImmutableBit::encode(FIELD_##mutability), \
Slot::IsImmutableBit::encode(FIELD_##mutability) | \
Slot::IsUnboxedBit::encode(true), \
compiler::target::ClassName::FieldName##_offset(), \
#ClassName "." #FieldName, \
CompileType::FromUnboxedRepresentation(kUnboxed##representation), \


@@ -1763,28 +1763,27 @@ Fragment FlowGraphBuilder::BuildTypedDataMemMove(const Function& function,
LocalVariable* arg_from_start = parsed_function_->RawParameterVariable(4);
Fragment body;
// If we're copying at least this many elements, calling _nativeSetRange,
// which calls memmove via a native call, is faster than using the code
// currently emitted by the MemoryCopy instruction.
// If we're copying at least this many elements, calling memmove via CCall
// is faster than using the code currently emitted by MemoryCopy.
#if defined(TARGET_ARCH_X64) || defined(TARGET_ARCH_IA32)
// On X86, the breakpoint for using a native call instead of generating a
// loop via MemoryCopy() is around the same as the largest benchmark
// (1048576 elements) on the machines we use.
const intptr_t kCopyLengthForNativeCall = 1024 * 1024;
// On X86, the breakpoint for using CCall instead of generating a loop via
// MemoryCopy() is around the same as the largest benchmark (1048576 elements)
// on the machines we use.
const intptr_t kCopyLengthForCCall = 1024 * 1024;
#else
// On other architectures, when the element size is less than a word,
// we copy in word-sized chunks when possible to get back some speed without
// increasing the number of emitted instructions for MemoryCopy too much, but
// memmove is even more aggressive, copying in 64-byte chunks when possible.
// Thus, the breakpoint for a native call being faster is much lower for our
// benchmarks than for X86.
const intptr_t kCopyLengthForNativeCall = 1024;
// Thus, the breakpoint for a call to memmove being faster is much lower for
// our benchmarks than for X86.
const intptr_t kCopyLengthForCCall = 1024;
#endif
JoinEntryInstr* done = BuildJoinEntry();
TargetEntryInstr *is_small_enough, *is_too_large;
body += LoadLocal(arg_count);
body += IntConstant(kCopyLengthForNativeCall);
body += IntConstant(kCopyLengthForCCall);
body += SmiRelationalOp(Token::kLT);
body += BranchIfTrue(&is_small_enough, &is_too_large);
@@ -1798,30 +1797,38 @@ Fragment FlowGraphBuilder::BuildTypedDataMemMove(const Function& function,
/*unboxed_inputs=*/false, /*can_overlap=*/true);
use_instruction += Goto(done);
// TODO(dartbug.com/42072): Instead of doing a static call to a native
// method, make a leaf runtime entry for memmove and use CCall.
const Library& lib = Library::Handle(Z, Library::TypedDataLibrary());
ASSERT(!lib.IsNull());
const Class& typed_list_base =
Class::Handle(Z, lib.LookupClassAllowPrivate(Symbols::_TypedListBase()));
ASSERT(!typed_list_base.IsNull());
const auto& error = typed_list_base.EnsureIsFinalized(H.thread());
ASSERT(error == Error::null());
const Function& native_set_range = Function::ZoneHandle(
Z,
typed_list_base.LookupFunctionAllowPrivate(Symbols::_nativeSetRange()));
ASSERT(!native_set_range.IsNull());
Fragment call_native(is_too_large);
call_native += LoadLocal(arg_to);
call_native += LoadLocal(arg_to_start);
call_native += LoadLocal(arg_count);
call_native += LoadLocal(arg_from);
call_native += LoadLocal(arg_from_start);
call_native += StaticCall(TokenPosition::kNoSource, native_set_range, 5,
ICData::kNoRebind);
call_native += Drop();
call_native += Goto(done);
const intptr_t element_size = Instance::ElementSizeFor(cid);
Fragment call_memmove(is_too_large);
call_memmove += LoadLocal(arg_to);
call_memmove += LoadUntagged(compiler::target::PointerBase::data_offset());
call_memmove += ConvertUntaggedToUnboxed(kUnboxedIntPtr);
call_memmove += LoadLocal(arg_to_start);
call_memmove += IntConstant(element_size);
call_memmove += SmiBinaryOp(Token::kMUL, /*is_truncating=*/true);
call_memmove += UnboxTruncate(kUnboxedIntPtr);
call_memmove +=
BinaryIntegerOp(Token::kADD, kUnboxedIntPtr, /*is_truncating=*/true);
call_memmove += LoadLocal(arg_from);
call_memmove += LoadUntagged(compiler::target::PointerBase::data_offset());
call_memmove += ConvertUntaggedToUnboxed(kUnboxedIntPtr);
call_memmove += LoadLocal(arg_from_start);
call_memmove += IntConstant(element_size);
call_memmove += SmiBinaryOp(Token::kMUL, /*is_truncating=*/true);
call_memmove += UnboxTruncate(kUnboxedIntPtr);
call_memmove +=
BinaryIntegerOp(Token::kADD, kUnboxedIntPtr, /*is_truncating=*/true);
call_memmove += LoadLocal(arg_count);
call_memmove += IntConstant(element_size);
call_memmove += SmiBinaryOp(Token::kMUL, /*is_truncating=*/true);
call_memmove += UnboxTruncate(kUnboxedIntPtr);
call_memmove += LoadThread();
call_memmove += LoadUntagged(
compiler::target::Thread::OffsetFromThread(&kMemoryMoveRuntimeEntry));
call_memmove +=
ConvertUntaggedToUnboxed(kUnboxedFfiIntPtr); // function address.
call_memmove += CCall(3);
call_memmove += Drop();
call_memmove += Goto(done);
body.current = done;
body += NullConstant();

File diff suppressed because it is too large.


@@ -3845,6 +3845,7 @@ DEFINE_RUNTIME_ENTRY(FfiAsyncCallbackSend, 1) {
// Use expected function signatures to help MSVC compiler resolve overloading.
typedef double (*UnaryMathCFunction)(double x);
typedef double (*BinaryMathCFunction)(double x, double y);
typedef void* (*MemMoveCFunction)(void* dest, const void* src, size_t n);
DEFINE_RAW_LEAF_RUNTIME_ENTRY(
LibcPow,
@@ -3938,6 +3939,12 @@ DEFINE_RAW_LEAF_RUNTIME_ENTRY(
true /* is_float */,
reinterpret_cast<RuntimeFunction>(static_cast<UnaryMathCFunction>(&log)));
DEFINE_RAW_LEAF_RUNTIME_ENTRY(
MemoryMove,
3,
false /* is_float */,
reinterpret_cast<RuntimeFunction>(static_cast<MemMoveCFunction>(&memmove)));
extern "C" void DFLRT_EnterSafepoint(NativeArguments __unusable_) {
CHECK_STACK_ALIGNMENT;
TRACE_RUNTIME_CALL("%s", "EnterSafepoint");


@@ -114,7 +114,8 @@ namespace dart {
V(void, MsanUnpoisonParam, size_t) \
V(void, TsanLoadAcquire, void*) \
V(void, TsanStoreRelease, void*) \
V(bool, TryDoubleAsInteger, Thread*)
V(bool, TryDoubleAsInteger, Thread*) \
V(void*, MemoryMove, void*, const void*, size_t)
} // namespace dart


@@ -147,10 +147,12 @@ abstract final class _TypedListBase {
//
// Primarily called by Dart code to handle clamping.
//
// Element sizes of [this] and [from] must match (test at caller).
@pragma("vm:external-name", "TypedDataBase_setRange")
// Element sizes of [this] and [from] must match. [this] must be a clamped
// typed data object, and the values in [from] must require possible clamping
// (tests at caller).
@pragma("vm:external-name", "TypedDataBase_setClampedRange")
@pragma("vm:entry-point")
external void _nativeSetRange(
external void _setClampedRange(
int start, int count, _TypedListBase from, int skipOffset);
// Performs a copy of the [count] elements starting at [skipCount] in [from]
@@ -2350,7 +2352,7 @@ final class _Uint8ClampedList extends _TypedList
int start, int count, _TypedListBase from, int skipCount) =>
from._containsUnsignedBytes
? _memMove1(start, count, from, skipCount)
: _nativeSetRange(start, count, from, skipCount);
: _setClampedRange(start, count, from, skipCount);
}
@patch
@@ -3252,7 +3254,7 @@ final class _ExternalUint8ClampedArray extends _TypedList
int start, int count, _TypedListBase from, int skipCount) =>
from._containsUnsignedBytes
? _memMove1(start, count, from, skipCount)
: _nativeSetRange(start, count, from, skipCount);
: _setClampedRange(start, count, from, skipCount);
}
@pragma("vm:entry-point")
@@ -4496,7 +4498,7 @@ final class _Uint8ClampedArrayView extends _TypedListView
int start, int count, _TypedListBase from, int skipCount) =>
from._containsUnsignedBytes
? _memMove1(start, count, from, skipCount)
: _nativeSetRange(start, count, from, skipCount);
: _setClampedRange(start, count, from, skipCount);
}
@pragma("vm:entry-point")