[vm/compiler] Create leaf runtime entry for memmove.
Instead of making a StaticCall to _TypedListBase._nativeSetRange inside
_memMoveN, make a CCall to the memmove leaf runtime entry.

Rename _TypedListBase._nativeSetRange to _setClampedRange, since it's now
only used when per-element clamping is necessary.

Fix the load optimizer so that loads of unboxed fields from freshly
allocated objects do not have the tagged null value forwarded as their
initial post-allocation value.

TEST=co19{,_2}/LibTest/typed_data lib{,_2}/typed_data
corelib{,_2}/list_test vm/cc/LoadOptimizer_LoadDataFieldOfNewTypedData

Issue: https://github.com/dart-lang/sdk/issues/42072
Change-Id: Ib82e24a5b3287fa53099fffd3b563a27d777507e
Cq-Include-Trybots: luci.dart.try:vm-aot-linux-debug-simarm_x64-try,vm-aot-linux-debug-x64-try,vm-aot-linux-debug-x64c-try,vm-kernel-linux-debug-x64-try,vm-kernel-precomp-linux-debug-x64-try,vm-linux-debug-x64-try,vm-linux-debug-x64c-try,vm-mac-debug-arm64-try,vm-aot-linux-release-simarm_x64-try,vm-aot-linux-release-x64-try,vm-aot-mac-release-arm64-try,vm-aot-msan-linux-release-x64-try,vm-msan-linux-release-x64-try,vm-aot-tsan-linux-release-x64-try,vm-tsan-linux-release-x64-try,vm-linux-release-x64-try,vm-mac-release-arm64-try,vm-kernel-precomp-linux-release-x64-try
Reviewed-on: https://dart-review.googlesource.com/c/sdk/+/324080
Reviewed-by: Alexander Markov <alexmarkov@google.com>
Commit-Queue: Tess Strickland <sstrickl@google.com>
This commit is contained in:
parent 8a91749e42
commit 3f53d22d43
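Context for the diff below: the VM's MemoryCopy path is built with /*can_overlap=*/true, so the new leaf entry has to wrap memmove rather than memcpy, since only memmove is defined for overlapping source and destination ranges. A minimal standalone C++ sketch of that behavior (illustrative only, not SDK code):

#include <cstdio>
#include <cstring>

int main() {
  // Overlapping shift-by-two within the same buffer: memmove must behave
  // as if the source bytes were first copied to a temporary buffer.
  char buf[] = "abcdef";
  std::memmove(buf + 2, buf, 4);  // copy "abcd" over "cdef"
  std::printf("%s\n", buf);       // prints "ababcd"
  return 0;
}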
@@ -88,7 +88,7 @@ static bool IsUint8(intptr_t cid) {
          cid <= kUnmodifiableTypedDataUint8ClampedArrayViewCid;
 }
 
-DEFINE_NATIVE_ENTRY(TypedDataBase_setRange, 0, 5) {
+DEFINE_NATIVE_ENTRY(TypedDataBase_setClampedRange, 0, 5) {
   // This is called after bounds checking, so the numeric inputs are
   // guaranteed to be Smis, and the length is guaranteed to be non-zero.
   const TypedDataBase& dst =
@@ -128,19 +128,9 @@ DEFINE_NATIVE_ENTRY(TypedDataBase_setRange, 0, 5) {
   ASSERT(length_in_bytes <= src_length_in_bytes - src_start_in_bytes);
 #endif
 
-  if (!IsClamped(dst.ptr()->GetClassId()) || IsUint8(src.ptr()->GetClassId())) {
-    // TODO(dartbug.com/42072): We do this when the copy length gets large
-    // enough that a native call to invoke memmove is faster than the generated
-    // code from MemoryCopy. Replace the static call to _nativeSetRange with
-    // a CCall() to a memmove leaf runtime entry and remove the possibility of
-    // calling _nativeSetRange except in the clamping case.
-    NoSafepointScope no_safepoint;
-    memmove(dst.DataAddr(dst_start_in_bytes), src.DataAddr(src_start_in_bytes),
-            length_in_bytes);
-    return Object::null();
-  }
-
+  ASSERT_EQUAL(element_size_in_bytes, 1);
+  ASSERT(IsClamped(dst.ptr()->GetClassId()));
+  ASSERT(!IsUint8(src.ptr()->GetClassId()));
 
   NoSafepointScope no_safepoint;
   uint8_t* dst_data =
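With the plain-copy cases now routed to the memmove CCall in the flow graph builder, the native entry above only handles the remaining per-element job: clamping each source byte into the 0..255 range of a Uint8ClampedList. A rough standalone sketch of that loop (hypothetical helper name; the asserts above guarantee single-byte elements from a signed, non-uint8 source):

#include <cstddef>
#include <cstdint>

// Hypothetical sketch: with single-byte elements, the only source type that
// can need clamping is a signed 8-bit list, so negative values clamp to 0
// and everything else is already in [0, 255].
void SetClampedRange(uint8_t* dst, const int8_t* src, size_t count) {
  for (size_t i = 0; i < count; i++) {
    dst[i] = src[i] < 0 ? 0 : static_cast<uint8_t>(src[i]);
  }
}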
@@ -173,7 +173,7 @@ namespace dart {
   V(TypedData_Int32x4Array_new, 2) \
   V(TypedData_Float64x2Array_new, 2) \
   V(TypedDataBase_length, 1) \
-  V(TypedDataBase_setRange, 5) \
+  V(TypedDataBase_setClampedRange, 5) \
   V(TypedData_GetInt8, 2) \
   V(TypedData_SetInt8, 3) \
   V(TypedData_GetUint8, 2) \
@@ -2244,6 +2244,11 @@ class LoadOptimizer : public ValueObject {
         const intptr_t pos = alloc->InputForSlot(*slot);
         if (pos != -1) {
           forward_def = alloc->InputAt(pos)->definition();
+        } else if (slot->is_unboxed()) {
+          // Unboxed fields that are not provided as an input should not
+          // have a tagged null value forwarded for them, similar to
+          // payloads of typed data arrays.
+          continue;
         } else {
           // Fields not provided as an input to the instruction are
           // initialized to null during allocation.
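The new else-if branch matters because an unboxed slot holds raw bits rather than a tagged object pointer, so the tagged null value is not a representable initial value for it. A condensed, hypothetical decision table for what the optimizer may forward for a load from a freshly allocated object:

// Hypothetical, condensed decision logic mirroring the hunk above.
enum class Forward { kInput, kSkip, kNull };

Forward ForwardedValueFor(bool has_input, bool is_unboxed) {
  if (has_input) return Forward::kInput;  // value supplied at allocation
  if (is_unboxed) return Forward::kSkip;  // raw bits: tagged null is wrong
  return Forward::kNull;                  // tagged fields start out null
}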
@@ -665,6 +665,104 @@ ISOLATE_UNIT_TEST_CASE(LoadOptimizer_AliasingViaTypedDataAndUntaggedTypedData) {
   }
 }
 
+// This test ensures that a LoadNativeField of the PointerBase data field for
+// a newly allocated TypedData object does not have tagged null forwarded to it,
+// as that's wrong for two reasons: it's an unboxed field, and it is initialized
+// during the allocation stub.
+ISOLATE_UNIT_TEST_CASE(LoadOptimizer_LoadDataFieldOfNewTypedData) {
+  using compiler::BlockBuilder;
+  CompilerState S(thread, /*is_aot=*/false, /*is_optimizing=*/true);
+  FlowGraphBuilderHelper H;
+
+  auto zone = H.flow_graph()->zone();
+
+  // We are going to build the following graph:
+  //
+  // B0[graph_entry] {
+  //   vc42 <- Constant(42)
+  // }
+  //
+  // B1[function_entry] {
+  // }
+  //   array <- AllocateTypedData(kTypedDataUint8ArrayCid, vc42)
+  //   view <- AllocateObject(kTypedDataUint8ArrayViewCid)
+  //   v1 <- LoadNativeField(array, Slot::PointerBase_data())
+  //   StoreNativeField(Slot::PointerBase_data(), view, v1, kNoStoreBarrier,
+  //                    kInitializing)
+  //   return view
+  // }
+
+  const auto& lib = Library::Handle(zone, Library::TypedDataLibrary());
+  EXPECT(!lib.IsNull());
+  const Class& view_cls = Class::ZoneHandle(
+      zone, lib.LookupClassAllowPrivate(Symbols::_Uint8ArrayView()));
+  EXPECT(!view_cls.IsNull());
+  const Error& err = Error::Handle(zone, view_cls.EnsureIsFinalized(thread));
+  EXPECT(err.IsNull());
+
+  auto vc42 = H.flow_graph()->GetConstant(Integer::Handle(Integer::New(42)));
+  auto b1 = H.flow_graph()->graph_entry()->normal_entry();
+
+  AllocateTypedDataInstr* array;
+  AllocateObjectInstr* view;
+  LoadFieldInstr* v1;
+  StoreFieldInstr* store;
+  ReturnInstr* ret;
+
+  {
+    BlockBuilder builder(H.flow_graph(), b1);
+
+    // array <- AllocateTypedData(kTypedDataUint8ArrayCid, vc42)
+    array = builder.AddDefinition(
+        new AllocateTypedDataInstr(InstructionSource(), kTypedDataUint8ArrayCid,
+                                   new (zone) Value(vc42), DeoptId::kNone));
+
+    // view <- AllocateObject(kTypedDataUint8ArrayViewCid)
+    view = builder.AddDefinition(
+        new AllocateObjectInstr(InstructionSource(), view_cls, DeoptId::kNone));
+
+    // v1 <- LoadNativeField(array, Slot::PointerBase_data())
+    v1 = builder.AddDefinition(new LoadFieldInstr(new (zone) Value(array),
+                                                  Slot::PointerBase_data(),
+                                                  InstructionSource()));
+
+    // StoreNativeField(Slot::PointerBase_data(), view, v1, kNoStoreBarrier,
+    //                  kInitializing)
+    store = builder.AddInstruction(new StoreFieldInstr(
+        Slot::PointerBase_data(), new (zone) Value(view), new (zone) Value(v1),
+        kNoStoreBarrier, InstructionSource(),
+        StoreFieldInstr::Kind::kInitializing));
+
+    // return view
+    ret = builder.AddInstruction(new ReturnInstr(
+        InstructionSource(), new Value(view), S.GetNextDeoptId()));
+  }
+  H.FinishGraph();
+
+  DominatorBasedCSE::Optimize(H.flow_graph());
+  {
+    Instruction* alloc_array = nullptr;
+    Instruction* alloc_view = nullptr;
+    Instruction* lf = nullptr;
+    Instruction* sf = nullptr;
+    Instruction* r = nullptr;
+    ILMatcher cursor(H.flow_graph(), b1, true);
+    RELEASE_ASSERT(cursor.TryMatch({
+        kMatchAndMoveFunctionEntry,
+        {kMatchAndMoveAllocateTypedData, &alloc_array},
+        {kMatchAndMoveAllocateObject, &alloc_view},
+        {kMatchAndMoveLoadField, &lf},
+        {kMatchAndMoveStoreField, &sf},
+        {kMatchReturn, &r},
+    }));
+    EXPECT(array == alloc_array);
+    EXPECT(view == alloc_view);
+    EXPECT(v1 == lf);
+    EXPECT(store == sf);
+    EXPECT(ret == r);
+  }
+}
+
 // This test verifies that we correctly alias load/stores into typed array
 // which use different element sizes. This is a regression test for
 // a fix in 836c04f.
@@ -111,7 +111,8 @@ Slot* SlotCache::CreateNativeSlot(Slot::Kind kind) {
   case Slot::Kind::k##ClassName##_##FieldName: \
     return new (zone_) \
         Slot(Slot::Kind::k##ClassName##_##FieldName, \
-             Slot::IsImmutableBit::encode(FIELD_##mutability), \
+             Slot::IsImmutableBit::encode(FIELD_##mutability) | \
+                 Slot::IsUnboxedBit::encode(true), \
              compiler::target::ClassName::FieldName##_offset(), \
              #ClassName "." #FieldName, \
              CompileType::FromUnboxedRepresentation(kUnboxed##representation), \
@@ -1763,28 +1763,27 @@ Fragment FlowGraphBuilder::BuildTypedDataMemMove(const Function& function,
   LocalVariable* arg_from_start = parsed_function_->RawParameterVariable(4);
 
   Fragment body;
-  // If we're copying at least this many elements, calling _nativeSetRange,
-  // which calls memmove via a native call, is faster than using the code
-  // currently emitted by the MemoryCopy instruction.
+  // If we're copying at least this many elements, calling memmove via CCall
+  // is faster than using the code currently emitted by MemoryCopy.
 #if defined(TARGET_ARCH_X64) || defined(TARGET_ARCH_IA32)
-  // On X86, the breakpoint for using a native call instead of generating a
-  // loop via MemoryCopy() is around the same as the largest benchmark
-  // (1048576 elements) on the machines we use.
-  const intptr_t kCopyLengthForNativeCall = 1024 * 1024;
+  // On X86, the breakpoint for using CCall instead of generating a loop via
+  // MemoryCopy() is around the same as the largest benchmark (1048576 elements)
+  // on the machines we use.
+  const intptr_t kCopyLengthForCCall = 1024 * 1024;
 #else
   // On other architectures, when the element size is less than a word,
   // we copy in word-sized chunks when possible to get back some speed without
   // increasing the number of emitted instructions for MemoryCopy too much, but
   // memmove is even more aggressive, copying in 64-byte chunks when possible.
-  // Thus, the breakpoint for a native call being faster is much lower for our
-  // benchmarks than for X86.
-  const intptr_t kCopyLengthForNativeCall = 1024;
+  // Thus, the breakpoint for a call to memmove being faster is much lower for
+  // our benchmarks than for X86.
+  const intptr_t kCopyLengthForCCall = 1024;
 #endif
 
   JoinEntryInstr* done = BuildJoinEntry();
   TargetEntryInstr *is_small_enough, *is_too_large;
   body += LoadLocal(arg_count);
-  body += IntConstant(kCopyLengthForNativeCall);
+  body += IntConstant(kCopyLengthForCCall);
   body += SmiRelationalOp(Token::kLT);
   body += BranchIfTrue(&is_small_enough, &is_too_large);
 
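The kCopyLengthForCCall constants above encode a measured crossover point. A small harness along the following lines can estimate a comparable crossover on a given machine (illustrative only: it times a plain byte loop against memmove, not the VM's generated MemoryCopy code, and an optimizing compiler may vectorize or replace the loop):

#include <chrono>
#include <cstdio>
#include <cstring>
#include <vector>

// Times a naive byte-by-byte copy against memmove for a given length.
static void CompareCopy(size_t n) {
  std::vector<unsigned char> src(n, 1), dst(n);
  auto t0 = std::chrono::steady_clock::now();
  for (size_t i = 0; i < n; i++) dst[i] = src[i];
  auto t1 = std::chrono::steady_clock::now();
  std::memmove(dst.data(), src.data(), n);
  auto t2 = std::chrono::steady_clock::now();
  using ns = std::chrono::nanoseconds;
  std::printf("n=%zu loop=%lldns memmove=%lldns\n", n,
              (long long)std::chrono::duration_cast<ns>(t1 - t0).count(),
              (long long)std::chrono::duration_cast<ns>(t2 - t1).count());
}

int main() {
  for (size_t n = 1 << 8; n <= (1 << 20); n <<= 2) CompareCopy(n);
  return 0;
}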
@@ -1798,30 +1797,38 @@ Fragment FlowGraphBuilder::BuildTypedDataMemMove(const Function& function,
                            /*unboxed_inputs=*/false, /*can_overlap=*/true);
   use_instruction += Goto(done);
 
-  // TODO(dartbug.com/42072): Instead of doing a static call to a native
-  // method, make a leaf runtime entry for memmove and use CCall.
-  const Library& lib = Library::Handle(Z, Library::TypedDataLibrary());
-  ASSERT(!lib.IsNull());
-  const Class& typed_list_base =
-      Class::Handle(Z, lib.LookupClassAllowPrivate(Symbols::_TypedListBase()));
-  ASSERT(!typed_list_base.IsNull());
-  const auto& error = typed_list_base.EnsureIsFinalized(H.thread());
-  ASSERT(error == Error::null());
-  const Function& native_set_range = Function::ZoneHandle(
-      Z,
-      typed_list_base.LookupFunctionAllowPrivate(Symbols::_nativeSetRange()));
-  ASSERT(!native_set_range.IsNull());
-
-  Fragment call_native(is_too_large);
-  call_native += LoadLocal(arg_to);
-  call_native += LoadLocal(arg_to_start);
-  call_native += LoadLocal(arg_count);
-  call_native += LoadLocal(arg_from);
-  call_native += LoadLocal(arg_from_start);
-  call_native += StaticCall(TokenPosition::kNoSource, native_set_range, 5,
-                            ICData::kNoRebind);
-  call_native += Drop();
-  call_native += Goto(done);
+  const intptr_t element_size = Instance::ElementSizeFor(cid);
+  Fragment call_memmove(is_too_large);
+  call_memmove += LoadLocal(arg_to);
+  call_memmove += LoadUntagged(compiler::target::PointerBase::data_offset());
+  call_memmove += ConvertUntaggedToUnboxed(kUnboxedIntPtr);
+  call_memmove += LoadLocal(arg_to_start);
+  call_memmove += IntConstant(element_size);
+  call_memmove += SmiBinaryOp(Token::kMUL, /*is_truncating=*/true);
+  call_memmove += UnboxTruncate(kUnboxedIntPtr);
+  call_memmove +=
+      BinaryIntegerOp(Token::kADD, kUnboxedIntPtr, /*is_truncating=*/true);
+  call_memmove += LoadLocal(arg_from);
+  call_memmove += LoadUntagged(compiler::target::PointerBase::data_offset());
+  call_memmove += ConvertUntaggedToUnboxed(kUnboxedIntPtr);
+  call_memmove += LoadLocal(arg_from_start);
+  call_memmove += IntConstant(element_size);
+  call_memmove += SmiBinaryOp(Token::kMUL, /*is_truncating=*/true);
+  call_memmove += UnboxTruncate(kUnboxedIntPtr);
+  call_memmove +=
+      BinaryIntegerOp(Token::kADD, kUnboxedIntPtr, /*is_truncating=*/true);
+  call_memmove += LoadLocal(arg_count);
+  call_memmove += IntConstant(element_size);
+  call_memmove += SmiBinaryOp(Token::kMUL, /*is_truncating=*/true);
+  call_memmove += UnboxTruncate(kUnboxedIntPtr);
+  call_memmove += LoadThread();
+  call_memmove += LoadUntagged(
+      compiler::target::Thread::OffsetFromThread(&kMemoryMoveRuntimeEntry));
+  call_memmove +=
+      ConvertUntaggedToUnboxed(kUnboxedFfiIntPtr);  // function address.
+  call_memmove += CCall(3);
+  call_memmove += Drop();
+  call_memmove += Goto(done);
 
   body.current = done;
   body += NullConstant();
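The long Fragment above is mostly address arithmetic. In plain C++, the three arguments handed to the memmove entry via CCall(3) amount to the following (a sketch assuming pointers to the untagged payloads and element-indexed start/count values):

#include <cstddef>
#include <cstdint>
#include <cstring>

// Sketch of the argument computation done in IL before CCall(3):
//   arg0 = dst.data + to_start   * element_size
//   arg1 = src.data + from_start * element_size
//   arg2 = count * element_size
void MemMoveElements(uint8_t* dst_data, intptr_t to_start,
                     const uint8_t* src_data, intptr_t from_start,
                     intptr_t count, intptr_t element_size) {
  std::memmove(dst_data + to_start * element_size,
               src_data + from_start * element_size,
               count * element_size);
}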
File diff suppressed because it is too large
@@ -3845,6 +3845,7 @@ DEFINE_RUNTIME_ENTRY(FfiAsyncCallbackSend, 1) {
 // Use expected function signatures to help MSVC compiler resolve overloading.
 typedef double (*UnaryMathCFunction)(double x);
 typedef double (*BinaryMathCFunction)(double x, double y);
+typedef void* (*MemMoveCFunction)(void* dest, const void* src, size_t n);
 
 DEFINE_RAW_LEAF_RUNTIME_ENTRY(
     LibcPow,
@@ -3938,6 +3939,12 @@ DEFINE_RAW_LEAF_RUNTIME_ENTRY(
     true /* is_float */,
     reinterpret_cast<RuntimeFunction>(static_cast<UnaryMathCFunction>(&log)));
 
+DEFINE_RAW_LEAF_RUNTIME_ENTRY(
+    MemoryMove,
+    3,
+    false /* is_float */,
+    reinterpret_cast<RuntimeFunction>(static_cast<MemMoveCFunction>(&memmove)));
+
 extern "C" void DFLRT_EnterSafepoint(NativeArguments __unusable_) {
   CHECK_STACK_ALIGNMENT;
   TRACE_RUNTIME_CALL("%s", "EnterSafepoint");
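As with the math entries earlier in this file, the cast goes through the exact MemMoveCFunction signature before being reinterpreted as the VM's generic runtime-entry pointer type; per the comment above the typedefs, naming the expected signature helps MSVC resolve overloading. A condensed sketch of the pattern (RuntimeFunction here is a stand-in for the VM's type):

#include <cstddef>
#include <cstring>

// Stand-in for the VM's generic runtime-entry function pointer type.
typedef void (*RuntimeFunction)(void);

typedef void* (*MemMoveCFunction)(void* dest, const void* src, size_t n);

// static_cast pins down the exact signature first; reinterpret_cast then
// converts the now-unambiguous pointer to the generic type.
RuntimeFunction entry =
    reinterpret_cast<RuntimeFunction>(static_cast<MemMoveCFunction>(&memmove));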
@@ -114,7 +114,8 @@ namespace dart {
   V(void, MsanUnpoisonParam, size_t) \
   V(void, TsanLoadAcquire, void*) \
   V(void, TsanStoreRelease, void*) \
-  V(bool, TryDoubleAsInteger, Thread*)
+  V(bool, TryDoubleAsInteger, Thread*) \
+  V(void*, MemoryMove, void*, const void*, size_t)
 
 }  // namespace dart
@@ -147,10 +147,12 @@ abstract final class _TypedListBase {
   //
   // Primarily called by Dart code to handle clamping.
   //
-  // Element sizes of [this] and [from] must match (test at caller).
-  @pragma("vm:external-name", "TypedDataBase_setRange")
+  // Element sizes of [this] and [from] must match. [this] must be a clamped
+  // typed data object, and the values in [from] must require possible clamping
+  // (tests at caller).
+  @pragma("vm:external-name", "TypedDataBase_setClampedRange")
   @pragma("vm:entry-point")
-  external void _nativeSetRange(
+  external void _setClampedRange(
       int start, int count, _TypedListBase from, int skipOffset);
 
   // Performs a copy of the [count] elements starting at [skipCount] in [from]
@@ -2350,7 +2352,7 @@ final class _Uint8ClampedList extends _TypedList
           int start, int count, _TypedListBase from, int skipCount) =>
       from._containsUnsignedBytes
           ? _memMove1(start, count, from, skipCount)
-          : _nativeSetRange(start, count, from, skipCount);
+          : _setClampedRange(start, count, from, skipCount);
 }
 
 @patch
@@ -3252,7 +3254,7 @@ final class _ExternalUint8ClampedArray extends _TypedList
           int start, int count, _TypedListBase from, int skipCount) =>
       from._containsUnsignedBytes
          ? _memMove1(start, count, from, skipCount)
-          : _nativeSetRange(start, count, from, skipCount);
+          : _setClampedRange(start, count, from, skipCount);
 }
 
 @pragma("vm:entry-point")
@@ -4496,7 +4498,7 @@ final class _Uint8ClampedArrayView extends _TypedListView
           int start, int count, _TypedListBase from, int skipCount) =>
       from._containsUnsignedBytes
           ? _memMove1(start, count, from, skipCount)
-          : _nativeSetRange(start, count, from, skipCount);
+          : _setClampedRange(start, count, from, skipCount);
 }
 
 @pragma("vm:entry-point")