From c6bcacc7c9d89ff87e3a15e4a8919edbfc237800 Mon Sep 17 00:00:00 2001
From: Dimitry Andric <dim@FreeBSD.org>
Date: Thu, 23 May 2024 22:21:31 +0200
Subject: [PATCH] Vendor import of llvm-project branch release/18.x
 llvmorg-18.1.6-0-g1118c2e05e67.

---
 clang/lib/CodeGen/CodeGenModule.cpp           | 14 ++++
 clang/lib/Driver/ToolChains/OpenBSD.cpp       |  3 +-
 clang/lib/Format/UnwrappedLineParser.cpp      |  7 +-
 clang/lib/Format/WhitespaceManager.cpp        |  2 +-
 clang/lib/Interpreter/IncrementalParser.cpp   | 24 +++++--
 clang/lib/Interpreter/IncrementalParser.h     |  5 ++
 clang/lib/Sema/SemaTemplate.cpp               | 25 +++++--
 libcxx/src/atomic.cpp                         | 16 ++++-
 libcxx/src/chrono.cpp                         |  4 +-
 lld/ELF/Relocations.cpp                       |  5 +-
 llvm/include/llvm/CodeGen/MachineFrameInfo.h  |  7 ++
 llvm/lib/Analysis/InstructionSimplify.cpp     |  4 ++
 .../CodeGen/InterleavedLoadCombinePass.cpp    |  3 +
 .../SelectionDAG/SelectionDAGBuilder.cpp      |  3 +-
 .../AArch64/AArch64Arm64ECCallLowering.cpp    | 16 +++--
 .../Target/AArch64/AArch64ISelLowering.cpp    |  3 +-
 .../AArch64/GISel/AArch64GlobalISelUtils.cpp  |  6 ++
 .../AArch64/GISel/AArch64LegalizerInfo.cpp    |  1 +
 .../AArch64/GISel/AArch64RegisterBankInfo.cpp |  5 +-
 llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp   |  2 +-
 llvm/lib/Target/AMDGPU/SIInstrInfo.h          | 11 ++++
 llvm/lib/Target/AMDGPU/SOPInstructions.td     |  2 +-
 .../lib/Target/PowerPC/PPCMergeStringPool.cpp | 57 +++++-----------
 .../RISCV/MCTargetDesc/RISCVELFStreamer.cpp   |  8 +--
 .../RISCV/MCTargetDesc/RISCVELFStreamer.h     |  1 -
 .../Target/RISCV/MCTargetDesc/RISCVMCExpr.cpp |  2 -
 .../MCTargetDesc/RISCVTargetStreamer.cpp      |  5 ++
 .../RISCV/MCTargetDesc/RISCVTargetStreamer.h  |  5 ++
 llvm/lib/Target/RISCV/RISCVAsmPrinter.cpp     | 32 +++++++--
 .../Target/RISCV/RISCVExpandPseudoInsts.cpp   |  5 +-
 llvm/lib/Target/RISCV/RISCVFeatures.td        |  5 ++
 llvm/lib/Target/RISCV/RISCVISelLowering.cpp   |  9 ++-
 llvm/lib/Target/SystemZ/SystemZInstrInfo.cpp  | 65 +++++++++++--------
 llvm/lib/Target/X86/X86ISelLowering.cpp       |  6 +-
 llvm/lib/Target/X86/X86InstrAVX512.td         | 42 ++++++------
 llvm/lib/TargetParser/Host.cpp                |  3 +-
 llvm/lib/Transforms/IPO/FunctionAttrs.cpp     |  7 +-
 llvm/lib/Transforms/IPO/GlobalOpt.cpp         |  3 +
 .../InstCombine/InstCombineSelect.cpp         | 14 +++-
 .../Transforms/Vectorize/SLPVectorizer.cpp    | 21 +----
 openmp/runtime/src/kmp_settings.cpp           |  2 +
 41 files changed, 305 insertions(+), 155 deletions(-)

diff --git a/clang/lib/CodeGen/CodeGenModule.cpp b/clang/lib/CodeGen/CodeGenModule.cpp
index 1280bcd36de9..eb13cd40eb8a 100644
--- a/clang/lib/CodeGen/CodeGenModule.cpp
+++ b/clang/lib/CodeGen/CodeGenModule.cpp
@@ -67,6 +67,7 @@
 #include "llvm/Support/CommandLine.h"
 #include "llvm/Support/ConvertUTF.h"
 #include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/RISCVISAInfo.h"
 #include "llvm/Support/TimeProfiler.h"
 #include "llvm/Support/xxhash.h"
 #include "llvm/TargetParser/Triple.h"
@@ -1059,6 +1060,19 @@ void CodeGenModule::Release() {
     llvm::LLVMContext &Ctx = TheModule.getContext();
     getModule().addModuleFlag(llvm::Module::Error, "target-abi",
                               llvm::MDString::get(Ctx, ABIStr));
+
+    // Add the canonical ISA string as metadata so the backend can set the ELF
+    // attributes correctly. We use AppendUnique so LTO will keep all of the
+    // unique ISA strings that were linked together.
+    const std::vector<std::string> &Features =
+        getTarget().getTargetOpts().Features;
+    auto ParseResult = llvm::RISCVISAInfo::parseFeatures(
+        Arch == llvm::Triple::riscv64 ? 64 : 32, Features);
+    if (!errorToBool(ParseResult.takeError()))
+      getModule().addModuleFlag(
+          llvm::Module::AppendUnique, "riscv-isa",
+          llvm::MDNode::get(
+              Ctx, llvm::MDString::get(Ctx, (*ParseResult)->toString())));
   }
 
   if (CodeGenOpts.SanitizeCfiCrossDso) {
diff --git a/clang/lib/Driver/ToolChains/OpenBSD.cpp b/clang/lib/Driver/ToolChains/OpenBSD.cpp
index fd6aa4d7e684..00b6c520fcdd 100644
--- a/clang/lib/Driver/ToolChains/OpenBSD.cpp
+++ b/clang/lib/Driver/ToolChains/OpenBSD.cpp
@@ -371,7 +371,8 @@ std::string OpenBSD::getCompilerRT(const ArgList &Args, StringRef Component,
   if (Component == "builtins") {
     SmallString<128> Path(getDriver().SysRoot);
     llvm::sys::path::append(Path, "/usr/lib/libcompiler_rt.a");
-    return std::string(Path);
+    if (getVFS().exists(Path))
+      return std::string(Path);
   }
   SmallString<128> P(getDriver().ResourceDir);
   std::string CRTBasename =
diff --git a/clang/lib/Format/UnwrappedLineParser.cpp b/clang/lib/Format/UnwrappedLineParser.cpp
index a6eb18bb2b32..f70affb732a0 100644
--- a/clang/lib/Format/UnwrappedLineParser.cpp
+++ b/clang/lib/Format/UnwrappedLineParser.cpp
@@ -2510,6 +2510,7 @@ bool UnwrappedLineParser::parseParens(TokenType AmpAmpTokenType) {
   assert(FormatTok->is(tok::l_paren) && "'(' expected.");
   auto *LeftParen = FormatTok;
   bool SeenEqual = false;
+  bool MightBeFoldExpr = false;
   const bool MightBeStmtExpr = Tokens->peekNextToken()->is(tok::l_brace);
   nextToken();
   do {
@@ -2521,7 +2522,7 @@ bool UnwrappedLineParser::parseParens(TokenType AmpAmpTokenType) {
       parseChildBlock();
       break;
     case tok::r_paren:
-      if (!MightBeStmtExpr && !Line->InMacroBody &&
+      if (!MightBeStmtExpr && !MightBeFoldExpr && !Line->InMacroBody &&
           Style.RemoveParentheses > FormatStyle::RPS_Leave) {
         const auto *Prev = LeftParen->Previous;
         const auto *Next = Tokens->peekNextToken();
@@ -2564,6 +2565,10 @@ bool UnwrappedLineParser::parseParens(TokenType AmpAmpTokenType) {
         parseBracedList();
       }
       break;
+    case tok::ellipsis:
+      MightBeFoldExpr = true;
+      nextToken();
+      break;
     case tok::equal:
       SeenEqual = true;
       if (Style.isCSharp() && FormatTok->is(TT_FatArrow))
diff --git a/clang/lib/Format/WhitespaceManager.cpp b/clang/lib/Format/WhitespaceManager.cpp
index df84f97a8e8a..7525e6ee650b 100644
--- a/clang/lib/Format/WhitespaceManager.cpp
+++ b/clang/lib/Format/WhitespaceManager.cpp
@@ -1466,7 +1466,7 @@ WhitespaceManager::CellDescriptions WhitespaceManager::getCells(unsigned Start,
             : Cell);
         // Go to the next non-comment and ensure there is a break in front
         const auto *NextNonComment = C.Tok->getNextNonComment();
-        while (NextNonComment->is(tok::comma))
+        while (NextNonComment && NextNonComment->is(tok::comma))
           NextNonComment = NextNonComment->getNextNonComment();
         auto j = i;
         while (Changes[j].Tok != NextNonComment && j < End)
diff --git a/clang/lib/Interpreter/IncrementalParser.cpp b/clang/lib/Interpreter/IncrementalParser.cpp
index 370bcbfee8b0..f5f32b9f3924 100644
--- a/clang/lib/Interpreter/IncrementalParser.cpp
+++ b/clang/lib/Interpreter/IncrementalParser.cpp
@@ -209,6 +209,10 @@ IncrementalParser::IncrementalParser(Interpreter &Interp,
   if (Err)
     return;
   CI->ExecuteAction(*Act);
+
+  if (getCodeGen())
+    CachedInCodeGenModule = GenModule();
+
   std::unique_ptr<ASTConsumer> IncrConsumer =
       std::make_unique<IncrementalASTConsumer>(Interp, CI->takeASTConsumer());
   CI->setASTConsumer(std::move(IncrConsumer));
@@ -224,11 +228,8 @@ IncrementalParser::IncrementalParser(Interpreter &Interp,
     return; // PTU.takeError();
   }
 
-  if (CodeGenerator *CG = getCodeGen()) {
-    std::unique_ptr<llvm::Module> M(CG->ReleaseModule());
-    CG->StartModule("incr_module_" + std::to_string(PTUs.size()),
-                    M->getContext());
-    PTU->TheModule = std::move(M);
+  if (getCodeGen()) {
+    PTU->TheModule = GenModule();
     assert(PTU->TheModule && "Failed to create initial PTU");
   }
 }
@@ -364,6 +365,19 @@ IncrementalParser::Parse(llvm::StringRef input) {
 std::unique_ptr<llvm::Module> IncrementalParser::GenModule() {
   static unsigned ID = 0;
   if (CodeGenerator *CG = getCodeGen()) {
+    // Clang's CodeGen is designed to work with a single llvm::Module. In many
+    // cases for convenience various CodeGen parts have a reference to the
+    // llvm::Module (TheModule or Module) which does not change when a new
+    // module is pushed. However, the execution engine wants to take ownership
+    // of the module which does not map well to CodeGen's design. To work this
+    // around we created an empty module to make CodeGen happy. We should make
+    // sure it always stays empty.
+    assert((!CachedInCodeGenModule ||
+            (CachedInCodeGenModule->empty() &&
+             CachedInCodeGenModule->global_empty() &&
+             CachedInCodeGenModule->alias_empty() &&
+             CachedInCodeGenModule->ifunc_empty())) &&
+           "CodeGen wrote to a readonly module");
     std::unique_ptr<llvm::Module> M(CG->ReleaseModule());
     CG->StartModule("incr_module_" + std::to_string(ID++), M->getContext());
     return M;
diff --git a/clang/lib/Interpreter/IncrementalParser.h b/clang/lib/Interpreter/IncrementalParser.h
index e13b74c7f659..f63bce50acd3 100644
--- a/clang/lib/Interpreter/IncrementalParser.h
+++ b/clang/lib/Interpreter/IncrementalParser.h
@@ -24,6 +24,7 @@
 #include <list>
 namespace llvm {
 class LLVMContext;
+class Module;
 } // namespace llvm
 
 namespace clang {
@@ -57,6 +58,10 @@ class IncrementalParser {
   /// of code.
   std::list<PartialTranslationUnit> PTUs;
 
+  /// When CodeGen is created the first llvm::Module gets cached in many places
+  /// and we must keep it alive.
+  std::unique_ptr<llvm::Module> CachedInCodeGenModule;
+
  IncrementalParser();
 
 public:
diff --git a/clang/lib/Sema/SemaTemplate.cpp b/clang/lib/Sema/SemaTemplate.cpp
index b619f5d729e8..a12a64939c46 100644
--- a/clang/lib/Sema/SemaTemplate.cpp
+++ b/clang/lib/Sema/SemaTemplate.cpp
@@ -2404,9 +2404,6 @@ struct ConvertConstructorToDeductionGuideTransform {
       Args.addOuterRetainedLevel();
     }
 
-    if (NestedPattern)
-      Args.addOuterRetainedLevels(NestedPattern->getTemplateDepth());
-
     FunctionProtoTypeLoc FPTL = CD->getTypeSourceInfo()->getTypeLoc()
                                     .getAsAdjusted<FunctionProtoTypeLoc>();
     assert(FPTL && "no prototype for constructor declaration");
@@ -2526,11 +2523,27 @@ struct ConvertConstructorToDeductionGuideTransform {
 
     //    -- The types of the function parameters are those of the constructor.
     for (auto *OldParam : TL.getParams()) {
-      ParmVarDecl *NewParam =
-          transformFunctionTypeParam(OldParam, Args, MaterializedTypedefs);
-      if (NestedPattern && NewParam)
+      ParmVarDecl *NewParam = OldParam;
+      // Given
+      //   template <typename T> struct C {
+      //     template <typename U> struct D {
+      //       template <typename V> D(U, V);
+      //     };
+      //   };
+      // First, transform all the references to template parameters that are
+      // defined outside of the surrounding class template. That is T in the
+      // above example.
+      if (NestedPattern) {
         NewParam = transformFunctionTypeParam(NewParam, OuterInstantiationArgs,
                                               MaterializedTypedefs);
+        if (!NewParam)
+          return QualType();
+      }
+      // Then, transform all the references to template parameters that are
+      // defined at the class template and the constructor. In this example,
+      // they're U and V, respectively.
+      NewParam =
+          transformFunctionTypeParam(NewParam, Args, MaterializedTypedefs);
       if (!NewParam)
         return QualType();
       ParamTypes.push_back(NewParam->getType());
diff --git a/libcxx/src/atomic.cpp b/libcxx/src/atomic.cpp
index 2f0389ae6974..6b1f03c21bbc 100644
--- a/libcxx/src/atomic.cpp
+++ b/libcxx/src/atomic.cpp
@@ -25,16 +25,28 @@
 #  if !defined(SYS_futex) && defined(SYS_futex_time64)
 #    define SYS_futex SYS_futex_time64
 #  endif
+#  define _LIBCPP_FUTEX(...) syscall(SYS_futex, __VA_ARGS__)
 
 #elif defined(__FreeBSD__)
 
 #  include <sys/types.h>
 #  include <sys/umtx.h>
 
+#  define _LIBCPP_FUTEX(...) syscall(SYS_futex, __VA_ARGS__)
+
+#elif defined(__OpenBSD__)
+
+#  include <sys/futex.h>
+
+// OpenBSD has no indirect syscalls
+#  define _LIBCPP_FUTEX(...) futex(__VA_ARGS__)
+
 #else // <- Add other operating systems here
 
 // Baseline needs no new headers
 
+#  define _LIBCPP_FUTEX(...) syscall(SYS_futex, __VA_ARGS__)
+
 #endif
 
 _LIBCPP_BEGIN_NAMESPACE_STD
@@ -44,11 +56,11 @@ _LIBCPP_BEGIN_NAMESPACE_STD
 static void
 __libcpp_platform_wait_on_address(__cxx_atomic_contention_t const volatile* __ptr, __cxx_contention_t __val) {
   static constexpr timespec __timeout = {2, 0};
-  syscall(SYS_futex, __ptr, FUTEX_WAIT_PRIVATE, __val, &__timeout, 0, 0);
+  _LIBCPP_FUTEX(__ptr, FUTEX_WAIT_PRIVATE, __val, &__timeout, 0, 0);
 }
 
 static void __libcpp_platform_wake_by_address(__cxx_atomic_contention_t const volatile* __ptr, bool __notify_one) {
-  syscall(SYS_futex, __ptr, FUTEX_WAKE_PRIVATE, __notify_one ? 1 : INT_MAX, 0, 0, 0);
+  _LIBCPP_FUTEX(__ptr, FUTEX_WAKE_PRIVATE, __notify_one ? 1 : INT_MAX, 0, 0, 0);
 }
 
 #elif defined(__APPLE__) && defined(_LIBCPP_USE_ULOCK)
diff --git a/libcxx/src/chrono.cpp b/libcxx/src/chrono.cpp
index c5e827c0cb59..e7d6dfbc2292 100644
--- a/libcxx/src/chrono.cpp
+++ b/libcxx/src/chrono.cpp
@@ -31,7 +31,9 @@
 #  include <sys/time.h> // for gettimeofday and timeval
 #endif
 
-#if defined(__APPLE__) || defined(__gnu_hurd__) || (defined(_POSIX_TIMERS) && _POSIX_TIMERS > 0)
+// OpenBSD does not have a fully conformant suite of POSIX timers, but
+// it does have clock_gettime and CLOCK_MONOTONIC which is all we need.
+#if defined(__APPLE__) || defined(__gnu_hurd__) || defined(__OpenBSD__) || (defined(_POSIX_TIMERS) && _POSIX_TIMERS > 0)
 #  define _LIBCPP_HAS_CLOCK_GETTIME
 #endif
 
diff --git a/lld/ELF/Relocations.cpp b/lld/ELF/Relocations.cpp
index 619fbaf5dc54..92a1b9baaca3 100644
--- a/lld/ELF/Relocations.cpp
+++ b/lld/ELF/Relocations.cpp
@@ -1480,7 +1480,10 @@ template <class RelTy> void RelocationScanner::scanOne(RelTy *&i) {
 
   // Process TLS relocations, including TLS optimizations. Note that
   // R_TPREL and R_TPREL_NEG relocations are resolved in processAux.
-  if (sym.isTls()) {
+  //
+  // Some RISCV TLSDESC relocations reference a local NOTYPE symbol,
+  // but we need to process them in handleTlsRelocation.
+  if (sym.isTls() || oneof<R_TLSDESC_PC, R_TLSDESC_CALL>(expr)) {
     if (unsigned processed =
             handleTlsRelocation(type, sym, *sec, offset, addend, expr)) {
       i += processed - 1;
diff --git a/llvm/include/llvm/CodeGen/MachineFrameInfo.h b/llvm/include/llvm/CodeGen/MachineFrameInfo.h
index 7d11d63d4066..c35faac09c4d 100644
--- a/llvm/include/llvm/CodeGen/MachineFrameInfo.h
+++ b/llvm/include/llvm/CodeGen/MachineFrameInfo.h
@@ -697,6 +697,13 @@ class MachineFrameInfo {
     return Objects[ObjectIdx+NumFixedObjects].isAliased;
   }
 
+  /// Set "maybe pointed to by an LLVM IR value" for an object.
+  void setIsAliasedObjectIndex(int ObjectIdx, bool IsAliased) {
+    assert(unsigned(ObjectIdx+NumFixedObjects) < Objects.size() &&
+           "Invalid Object Idx!");
+    Objects[ObjectIdx+NumFixedObjects].isAliased = IsAliased;
+  }
+
   /// Returns true if the specified index corresponds to an immutable object.
   bool isImmutableObjectIndex(int ObjectIdx) const {
     // Tail calling functions can clobber their function arguments.
diff --git a/llvm/lib/Analysis/InstructionSimplify.cpp b/llvm/lib/Analysis/InstructionSimplify.cpp
index 72b6dfa181e8..8dcffe45c644 100644
--- a/llvm/lib/Analysis/InstructionSimplify.cpp
+++ b/llvm/lib/Analysis/InstructionSimplify.cpp
@@ -4322,6 +4322,10 @@ static Value *simplifyWithOpReplaced(Value *V, Value *Op, Value *RepOp,
   if (match(I, m_Intrinsic<Intrinsic::is_constant>()))
     return nullptr;
 
+  // Don't simplify freeze.
+  if (isa<FreezeInst>(I))
+    return nullptr;
+
   // Replace Op with RepOp in instruction operands.
   SmallVector<Value *, 8> NewOps;
   bool AnyReplaced = false;
diff --git a/llvm/lib/CodeGen/InterleavedLoadCombinePass.cpp b/llvm/lib/CodeGen/InterleavedLoadCombinePass.cpp
index f2d5c3c867c2..bbb0b654dc67 100644
--- a/llvm/lib/CodeGen/InterleavedLoadCombinePass.cpp
+++ b/llvm/lib/CodeGen/InterleavedLoadCombinePass.cpp
@@ -877,6 +877,9 @@ struct VectorInfo {
     if (LI->isAtomic())
       return false;
 
+    if (!DL.typeSizeEqualsStoreSize(Result.VTy->getElementType()))
+      return false;
+
     // Get the base polynomial
     computePolynomialFromPointer(*LI->getPointerOperand(), Offset, BasePtr, DL);
 
diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
index 5ce1013f30fd..7406a8ac1611 100644
--- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
@@ -10888,7 +10888,7 @@ static void tryToElideArgumentCopy(
   }
 
   // Perform the elision. Delete the old stack object and replace its only use
-  // in the variable info map. Mark the stack object as mutable.
+  // in the variable info map. Mark the stack object as mutable and aliased.
   LLVM_DEBUG({
     dbgs() << "Eliding argument copy from " << Arg << " to " << *AI << '\n'
            << "  Replacing frame index " << OldIndex << " with " << FixedIndex
           << '\n';
   });
   MFI.RemoveStackObject(OldIndex);
   MFI.setIsImmutableObjectIndex(FixedIndex, false);
+  MFI.setIsAliasedObjectIndex(FixedIndex, true);
   AllocaIndex = FixedIndex;
   ArgCopyElisionFrameIndexMap.insert({OldIndex, FixedIndex});
   for (SDValue ArgVal : ArgVals)
diff --git a/llvm/lib/Target/AArch64/AArch64Arm64ECCallLowering.cpp b/llvm/lib/Target/AArch64/AArch64Arm64ECCallLowering.cpp
index 55c5bbc66a3f..862aefe46193 100644
--- a/llvm/lib/Target/AArch64/AArch64Arm64ECCallLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64Arm64ECCallLowering.cpp
@@ -181,13 +181,14 @@ void AArch64Arm64ECCallLowering::getThunkArgTypes(
   }
 
   for (unsigned E = FT->getNumParams(); I != E; ++I) {
-    Align ParamAlign = AttrList.getParamAlignment(I).valueOrOne();
 #if 0
     // FIXME: Need more information about argument size; see
     // https://reviews.llvm.org/D132926
     uint64_t ArgSizeBytes = AttrList.getParamArm64ECArgSizeBytes(I);
+    Align ParamAlign = AttrList.getParamAlignment(I).valueOrOne();
 #else
     uint64_t ArgSizeBytes = 0;
+    Align ParamAlign = Align();
 #endif
     Type *Arm64Ty, *X64Ty;
     canonicalizeThunkType(FT->getParamType(I), ParamAlign,
@@ -297,7 +298,7 @@ void AArch64Arm64ECCallLowering::canonicalizeThunkType(
     uint64_t TotalSizeBytes = ElementCnt * ElementSizePerBytes;
     if (ElementTy->isFloatTy() || ElementTy->isDoubleTy()) {
       Out << (ElementTy->isFloatTy() ? "F" : "D") << TotalSizeBytes;
-      if (Alignment.value() >= 8 && !T->isPointerTy())
+      if (Alignment.value() >= 16 && !Ret)
         Out << "a" << Alignment.value();
       Arm64Ty = T;
       if (TotalSizeBytes <= 8) {
@@ -328,7 +329,7 @@ void AArch64Arm64ECCallLowering::canonicalizeThunkType(
     Out << "m";
     if (TypeSize != 4)
       Out << TypeSize;
-    if (Alignment.value() >= 8 && !T->isPointerTy())
+    if (Alignment.value() >= 16 && !Ret)
       Out << "a" << Alignment.value();
     // FIXME: Try to canonicalize Arm64Ty more thoroughly?
     Arm64Ty = T;
@@ -513,7 +514,14 @@ Function *AArch64Arm64ECCallLowering::buildEntryThunk(Function *F) {
   // Call the function passed to the thunk.
   Value *Callee = Thunk->getArg(0);
   Callee = IRB.CreateBitCast(Callee, PtrTy);
-  Value *Call = IRB.CreateCall(Arm64Ty, Callee, Args);
+  CallInst *Call = IRB.CreateCall(Arm64Ty, Callee, Args);
+
+  auto SRetAttr = F->getAttributes().getParamAttr(0, Attribute::StructRet);
+  auto InRegAttr = F->getAttributes().getParamAttr(0, Attribute::InReg);
+  if (SRetAttr.isValid() && !InRegAttr.isValid()) {
+    Thunk->addParamAttr(1, SRetAttr);
+    Call->addParamAttr(0, SRetAttr);
+  }
 
   Value *RetVal = Call;
   if (TransformDirectToSRet) {
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index 95d8ab95b2c0..bcfd0253e73c 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -22122,7 +22122,8 @@ SDValue performCONDCombine(SDNode *N,
   SDNode *SubsNode = N->getOperand(CmpIndex).getNode();
   unsigned CondOpcode = SubsNode->getOpcode();
 
-  if (CondOpcode != AArch64ISD::SUBS || SubsNode->hasAnyUseOfValue(0))
+  if (CondOpcode != AArch64ISD::SUBS || SubsNode->hasAnyUseOfValue(0) ||
+      !SubsNode->hasOneUse())
     return SDValue();
 
   // There is a SUBS feeding this condition.  Is it fed by a mask we can
diff --git a/llvm/lib/Target/AArch64/GISel/AArch64GlobalISelUtils.cpp b/llvm/lib/Target/AArch64/GISel/AArch64GlobalISelUtils.cpp
index 92db89cc0915..80fe4bcb8b58 100644
--- a/llvm/lib/Target/AArch64/GISel/AArch64GlobalISelUtils.cpp
+++ b/llvm/lib/Target/AArch64/GISel/AArch64GlobalISelUtils.cpp
@@ -147,6 +147,12 @@ void AArch64GISelUtils::changeFCMPPredToAArch64CC(
   case CmpInst::FCMP_UNE:
     CondCode = AArch64CC::NE;
     break;
+  case CmpInst::FCMP_TRUE:
+    CondCode = AArch64CC::AL;
+    break;
+  case CmpInst::FCMP_FALSE:
+    CondCode = AArch64CC::NV;
+    break;
   }
 }
 
diff --git a/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp b/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp
index 4b9d549e7911..de3c89e925a2 100644
--- a/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp
+++ b/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp
@@ -877,6 +877,7 @@ AArch64LegalizerInfo::AArch64LegalizerInfo(const AArch64Subtarget &ST)
 
   getActionDefinitionsBuilder(G_INSERT_VECTOR_ELT)
       .legalIf(typeInSet(0, {v16s8, v8s8, v8s16, v4s16, v4s32, v2s32, v2s64}))
+      .moreElementsToNextPow2(0)
      .widenVectorEltsToVectorMinSize(0, 64);
 
   getActionDefinitionsBuilder(G_BUILD_VECTOR)
diff --git a/llvm/lib/Target/AArch64/GISel/AArch64RegisterBankInfo.cpp b/llvm/lib/Target/AArch64/GISel/AArch64RegisterBankInfo.cpp
index b8e5e7bbdaba..06cdd7e4ef48 100644
--- a/llvm/lib/Target/AArch64/GISel/AArch64RegisterBankInfo.cpp
+++ b/llvm/lib/Target/AArch64/GISel/AArch64RegisterBankInfo.cpp
@@ -623,8 +623,11 @@ bool AArch64RegisterBankInfo::isLoadFromFPType(const MachineInstr &MI) const {
       EltTy = GV->getValueType();
       // Look at the first element of the struct to determine the type we are
       // loading
-      while (StructType *StructEltTy = dyn_cast<StructType>(EltTy))
+      while (StructType *StructEltTy = dyn_cast<StructType>(EltTy)) {
+        if (StructEltTy->getNumElements() == 0)
+          break;
         EltTy = StructEltTy->getTypeAtIndex(0U);
+      }
       // Look at the first element of the array to determine its type
       if (isa<ArrayType>(EltTy))
         EltTy = EltTy->getArrayElementType();
diff --git a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
index 6ecb1c8bf6e1..7a3198612f86 100644
--- a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
@@ -1832,7 +1832,7 @@ bool SIInsertWaitcnts::generateWaitcntInstBefore(MachineInstr &MI,
   // not, we need to ensure the subtarget is capable of backing off barrier
   // instructions in case there are any outstanding memory operations that may
   // cause an exception. Otherwise, insert an explicit S_WAITCNT 0 here.
-  if (MI.getOpcode() == AMDGPU::S_BARRIER &&
+  if (TII->isBarrierStart(MI.getOpcode()) &&
       !ST->hasAutoWaitcntBeforeBarrier() && !ST->supportsBackOffBarrier()) {
     Wait = Wait.combined(
         AMDGPU::Waitcnt::allZero(ST->hasExtendedWaitCounts(), ST->hasVscnt()));
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.h b/llvm/lib/Target/AMDGPU/SIInstrInfo.h
index 1c9dacc09f81..626d903c0c69 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.h
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.h
@@ -908,6 +908,17 @@ class SIInstrInfo final : public AMDGPUGenInstrInfo {
     return MI.getDesc().TSFlags & SIInstrFlags::IsNeverUniform;
   }
 
+  // Check to see if opcode is for a barrier start. Pre gfx12 this is just the
+  // S_BARRIER, but after support for S_BARRIER_SIGNAL* / S_BARRIER_WAIT we want
+  // to check for the barrier start (S_BARRIER_SIGNAL*)
+  bool isBarrierStart(unsigned Opcode) const {
+    return Opcode == AMDGPU::S_BARRIER ||
+           Opcode == AMDGPU::S_BARRIER_SIGNAL_M0 ||
+           Opcode == AMDGPU::S_BARRIER_SIGNAL_ISFIRST_M0 ||
+           Opcode == AMDGPU::S_BARRIER_SIGNAL_IMM ||
+           Opcode == AMDGPU::S_BARRIER_SIGNAL_ISFIRST_IMM;
+  }
+
   static bool doesNotReadTiedSource(const MachineInstr &MI) {
     return MI.getDesc().TSFlags & SIInstrFlags::TiedSourceNotRead;
   }
diff --git a/llvm/lib/Target/AMDGPU/SOPInstructions.td b/llvm/lib/Target/AMDGPU/SOPInstructions.td
index ae5ef0541929..5762efde73f0 100644
--- a/llvm/lib/Target/AMDGPU/SOPInstructions.td
+++ b/llvm/lib/Target/AMDGPU/SOPInstructions.td
@@ -1786,7 +1786,7 @@ def : GCNPat<
 let SubtargetPredicate = isNotGFX12Plus in
   def : GCNPat <(int_amdgcn_s_wait_event_export_ready), (S_WAIT_EVENT (i16 0))>;
 let SubtargetPredicate = isGFX12Plus in
-  def : GCNPat <(int_amdgcn_s_wait_event_export_ready), (S_WAIT_EVENT (i16 1))>;
+  def : GCNPat <(int_amdgcn_s_wait_event_export_ready), (S_WAIT_EVENT (i16 2))>;
 
 // The first 10 bits of the mode register are the core FP mode on all
 // subtargets.
diff --git a/llvm/lib/Target/PowerPC/PPCMergeStringPool.cpp b/llvm/lib/Target/PowerPC/PPCMergeStringPool.cpp
index d9465e86d896..ebd876d50c44 100644
--- a/llvm/lib/Target/PowerPC/PPCMergeStringPool.cpp
+++ b/llvm/lib/Target/PowerPC/PPCMergeStringPool.cpp
@@ -23,6 +23,7 @@
 #include "llvm/Analysis/ScalarEvolutionAliasAnalysis.h"
 #include "llvm/IR/Constants.h"
 #include "llvm/IR/Instructions.h"
+#include "llvm/IR/IntrinsicInst.h"
 #include "llvm/IR/Module.h"
 #include "llvm/IR/ValueSymbolTable.h"
 #include "llvm/Pass.h"
@@ -116,9 +117,20 @@ class PPCMergeStringPool : public ModulePass {
 // sure that they can be replaced.
 static bool hasReplaceableUsers(GlobalVariable &GV) {
   for (User *CurrentUser : GV.users()) {
-    // Instruction users are always valid.
-    if (isa<Instruction>(CurrentUser))
+    if (auto *I = dyn_cast<Instruction>(CurrentUser)) {
+      // Do not merge globals in exception pads.
+      if (I->isEHPad())
+        return false;
+
+      if (auto *II = dyn_cast<IntrinsicInst>(I)) {
+        // Some intrinsics require a plain global.
+        if (II->getIntrinsicID() == Intrinsic::eh_typeid_for)
+          return false;
+      }
+
+      // Other instruction users are always valid.
       continue;
+    }
 
     // We cannot replace GlobalValue users because they are not just nodes
     // in IR. To replace a user like this we would need to create a new
@@ -302,14 +314,6 @@ void PPCMergeStringPool::replaceUsesWithGEP(GlobalVariable *GlobalToReplace,
     Users.push_back(CurrentUser);
 
   for (User *CurrentUser : Users) {
-    Instruction *UserInstruction = dyn_cast<Instruction>(CurrentUser);
-    Constant *UserConstant = dyn_cast<Constant>(CurrentUser);
-
-    // At this point we expect that the user is either an instruction or a
-    // constant.
-    assert((UserConstant || UserInstruction) &&
-           "Expected the user to be an instruction or a constant.");
-
     // The user was not found so it must have been replaced earlier.
     if (!userHasOperand(CurrentUser, GlobalToReplace))
       continue;
@@ -318,38 +322,13 @@ void PPCMergeStringPool::replaceUsesWithGEP(GlobalVariable *GlobalToReplace,
     if (isa<GlobalValue>(CurrentUser))
       continue;
 
-    if (!UserInstruction) {
-      // User is a constant type.
-      Constant *ConstGEP = ConstantExpr::getInBoundsGetElementPtr(
-          PooledStructType, GPool, Indices);
-      UserConstant->handleOperandChange(GlobalToReplace, ConstGEP);
-      continue;
-    }
-
-    if (PHINode *UserPHI = dyn_cast<PHINode>(UserInstruction)) {
-      // GEP instructions cannot be added before PHI nodes.
-      // With getInBoundsGetElementPtr we create the GEP and then replace it
-      // inline into the PHI.
-      Constant *ConstGEP = ConstantExpr::getInBoundsGetElementPtr(
-          PooledStructType, GPool, Indices);
-      UserPHI->replaceUsesOfWith(GlobalToReplace, ConstGEP);
-      continue;
-    }
-    // The user is a valid instruction that is not a PHINode.
-    GetElementPtrInst *GEPInst =
-        GetElementPtrInst::Create(PooledStructType, GPool, Indices);
-    GEPInst->insertBefore(UserInstruction);
-
-    LLVM_DEBUG(dbgs() << "Inserting GEP before:\n");
-    LLVM_DEBUG(UserInstruction->dump());
-
+    Constant *ConstGEP = ConstantExpr::getInBoundsGetElementPtr(
+        PooledStructType, GPool, Indices);
     LLVM_DEBUG(dbgs() << "Replacing this global:\n");
     LLVM_DEBUG(GlobalToReplace->dump());
     LLVM_DEBUG(dbgs() << "with this:\n");
-    LLVM_DEBUG(GEPInst->dump());
-
-    // After the GEP is inserted the GV can be replaced.
-    CurrentUser->replaceUsesOfWith(GlobalToReplace, GEPInst);
+    LLVM_DEBUG(ConstGEP->dump());
+    GlobalToReplace->replaceAllUsesWith(ConstGEP);
   }
 }
 
diff --git a/llvm/lib/Target/RISCV/MCTargetDesc/RISCVELFStreamer.cpp b/llvm/lib/Target/RISCV/MCTargetDesc/RISCVELFStreamer.cpp
index 961b8f0afe22..cdf7c048a4bf 100644
--- a/llvm/lib/Target/RISCV/MCTargetDesc/RISCVELFStreamer.cpp
+++ b/llvm/lib/Target/RISCV/MCTargetDesc/RISCVELFStreamer.cpp
@@ -31,12 +31,13 @@ using namespace llvm;
 // This part is for ELF object output.
 RISCVTargetELFStreamer::RISCVTargetELFStreamer(MCStreamer &S,
                                                const MCSubtargetInfo &STI)
-    : RISCVTargetStreamer(S), CurrentVendor("riscv"), STI(STI) {
+    : RISCVTargetStreamer(S), CurrentVendor("riscv") {
   MCAssembler &MCA = getStreamer().getAssembler();
   const FeatureBitset &Features = STI.getFeatureBits();
   auto &MAB = static_cast<RISCVAsmBackend &>(MCA.getBackend());
   setTargetABI(RISCVABI::computeTargetABI(STI.getTargetTriple(), Features,
                                           MAB.getTargetOptions().getABIName()));
+  setFlagsFromFeatures(STI);
   // `j label` in `.option norelax; j label; .option relax; ...; label:` needs a
   // relocation to ensure the jump target is correct after linking. This is due
   // to a limitation that shouldForceRelocation has to make the decision upfront
@@ -87,14 +88,13 @@ void RISCVTargetELFStreamer::finishAttributeSection() {
 void RISCVTargetELFStreamer::finish() {
   RISCVTargetStreamer::finish();
   MCAssembler &MCA = getStreamer().getAssembler();
-  const FeatureBitset &Features = STI.getFeatureBits();
   RISCVABI::ABI ABI = getTargetABI();
 
   unsigned EFlags = MCA.getELFHeaderEFlags();
 
-  if (Features[RISCV::FeatureStdExtC])
+  if (hasRVC())
     EFlags |= ELF::EF_RISCV_RVC;
-  if (Features[RISCV::FeatureStdExtZtso])
+  if (hasTSO())
     EFlags |= ELF::EF_RISCV_TSO;
 
   switch (ABI) {
diff --git a/llvm/lib/Target/RISCV/MCTargetDesc/RISCVELFStreamer.h b/llvm/lib/Target/RISCV/MCTargetDesc/RISCVELFStreamer.h
index a6f54bf67b5d..e8f29cd8449b 100644
--- a/llvm/lib/Target/RISCV/MCTargetDesc/RISCVELFStreamer.h
+++ b/llvm/lib/Target/RISCV/MCTargetDesc/RISCVELFStreamer.h
@@ -46,7 +46,6 @@ class RISCVTargetELFStreamer : public RISCVTargetStreamer {
   StringRef CurrentVendor;
 
   MCSection *AttributeSection = nullptr;
-  const MCSubtargetInfo &STI;
 
   void emitAttribute(unsigned Attribute, unsigned Value) override;
   void emitTextAttribute(unsigned Attribute, StringRef String) override;
diff --git a/llvm/lib/Target/RISCV/MCTargetDesc/RISCVMCExpr.cpp b/llvm/lib/Target/RISCV/MCTargetDesc/RISCVMCExpr.cpp
index 254a9a4bc0ef..b8e0f3a867f4 100644
--- a/llvm/lib/Target/RISCV/MCTargetDesc/RISCVMCExpr.cpp
+++ b/llvm/lib/Target/RISCV/MCTargetDesc/RISCVMCExpr.cpp
@@ -207,8 +207,6 @@ void RISCVMCExpr::fixELFSymbolsInTLSFixups(MCAssembler &Asm) const {
   case VK_RISCV_TLS_GOT_HI:
   case VK_RISCV_TLS_GD_HI:
   case VK_RISCV_TLSDESC_HI:
-  case VK_RISCV_TLSDESC_ADD_LO:
-  case VK_RISCV_TLSDESC_LOAD_LO:
     break;
   }
 
diff --git a/llvm/lib/Target/RISCV/MCTargetDesc/RISCVTargetStreamer.cpp b/llvm/lib/Target/RISCV/MCTargetDesc/RISCVTargetStreamer.cpp
index ac4861bf113e..eee78a8c161f 100644
--- a/llvm/lib/Target/RISCV/MCTargetDesc/RISCVTargetStreamer.cpp
+++ b/llvm/lib/Target/RISCV/MCTargetDesc/RISCVTargetStreamer.cpp
@@ -48,6 +48,11 @@ void RISCVTargetStreamer::setTargetABI(RISCVABI::ABI ABI) {
   TargetABI = ABI;
 }
 
+void RISCVTargetStreamer::setFlagsFromFeatures(const MCSubtargetInfo &STI) {
+  HasRVC = STI.hasFeature(RISCV::FeatureStdExtC);
+  HasTSO = STI.hasFeature(RISCV::FeatureStdExtZtso);
+}
+
 void RISCVTargetStreamer::emitTargetAttributes(const MCSubtargetInfo &STI,
                                                bool EmitStackAlign) {
   if (EmitStackAlign) {
diff --git a/llvm/lib/Target/RISCV/MCTargetDesc/RISCVTargetStreamer.h b/llvm/lib/Target/RISCV/MCTargetDesc/RISCVTargetStreamer.h
index 070e72fb157a..cb8bc21cb635 100644
--- a/llvm/lib/Target/RISCV/MCTargetDesc/RISCVTargetStreamer.h
+++ b/llvm/lib/Target/RISCV/MCTargetDesc/RISCVTargetStreamer.h
@@ -33,6 +33,8 @@ struct RISCVOptionArchArg {
 
 class RISCVTargetStreamer : public MCTargetStreamer {
   RISCVABI::ABI TargetABI = RISCVABI::ABI_Unknown;
+  bool HasRVC = false;
+  bool HasTSO = false;
 
 public:
   RISCVTargetStreamer(MCStreamer &S);
@@ -58,6 +60,9 @@ class RISCVTargetStreamer : public MCTargetStreamer {
   void emitTargetAttributes(const MCSubtargetInfo &STI, bool EmitStackAlign);
   void setTargetABI(RISCVABI::ABI ABI);
   RISCVABI::ABI getTargetABI() const { return TargetABI; }
+  void setFlagsFromFeatures(const MCSubtargetInfo &STI);
+  bool hasRVC() const { return HasRVC; }
+  bool hasTSO() const { return HasTSO; }
 };
 
 // This part is for ascii assembly output
diff --git a/llvm/lib/Target/RISCV/RISCVAsmPrinter.cpp b/llvm/lib/Target/RISCV/RISCVAsmPrinter.cpp
index b2e9cd87373b..87bd9b4048cd 100644
--- a/llvm/lib/Target/RISCV/RISCVAsmPrinter.cpp
+++ b/llvm/lib/Target/RISCV/RISCVAsmPrinter.cpp
@@ -100,7 +100,7 @@ class RISCVAsmPrinter : public AsmPrinter {
   bool emitDirectiveOptionArch();
 
 private:
-  void emitAttributes();
+  void emitAttributes(const MCSubtargetInfo &SubtargetInfo);
 
   void emitNTLHint(const MachineInstr *MI);
 
@@ -385,8 +385,32 @@ void RISCVAsmPrinter::emitStartOfAsmFile(Module &M) {
   if (const MDString *ModuleTargetABI =
          dyn_cast_or_null<MDString>(M.getModuleFlag("target-abi")))
     RTS.setTargetABI(RISCVABI::getTargetABI(ModuleTargetABI->getString()));
+
+  MCSubtargetInfo SubtargetInfo = *TM.getMCSubtargetInfo();
+
+  // Use module flag to update feature bits.
+  if (auto *MD = dyn_cast_or_null<MDNode>(M.getModuleFlag("riscv-isa"))) {
+    for (auto &ISA : MD->operands()) {
+      if (auto *ISAString = dyn_cast_or_null<MDString>(ISA)) {
+        auto ParseResult = llvm::RISCVISAInfo::parseArchString(
+            ISAString->getString(), /*EnableExperimentalExtension=*/true,
+            /*ExperimentalExtensionVersionCheck=*/true);
+        if (!errorToBool(ParseResult.takeError())) {
+          auto &ISAInfo = *ParseResult;
+          for (const auto &Feature : RISCVFeatureKV) {
+            if (ISAInfo->hasExtension(Feature.Key) &&
+                !SubtargetInfo.hasFeature(Feature.Value))
+              SubtargetInfo.ToggleFeature(Feature.Key);
+          }
+        }
+      }
+    }
+
+    RTS.setFlagsFromFeatures(SubtargetInfo);
+  }
+
   if (TM.getTargetTriple().isOSBinFormatELF())
-    emitAttributes();
+    emitAttributes(SubtargetInfo);
 }
 
 void RISCVAsmPrinter::emitEndOfAsmFile(Module &M) {
@@ -398,13 +422,13 @@ void RISCVAsmPrinter::emitEndOfAsmFile(Module &M) {
     EmitHwasanMemaccessSymbols(M);
 }
 
-void RISCVAsmPrinter::emitAttributes() {
+void RISCVAsmPrinter::emitAttributes(const MCSubtargetInfo &SubtargetInfo) {
   RISCVTargetStreamer &RTS =
       static_cast<RISCVTargetStreamer &>(*OutStreamer->getTargetStreamer());
   // Use MCSubtargetInfo from TargetMachine. Individual functions may have
   // attributes that differ from other functions in the module and we have no
   // way to know which function is correct.
-  RTS.emitTargetAttributes(*TM.getMCSubtargetInfo(), /*EmitStackAlign*/ true);
+  RTS.emitTargetAttributes(SubtargetInfo, /*EmitStackAlign*/ true);
 }
 
 void RISCVAsmPrinter::emitFunctionEntryLabel() {
diff --git a/llvm/lib/Target/RISCV/RISCVExpandPseudoInsts.cpp b/llvm/lib/Target/RISCV/RISCVExpandPseudoInsts.cpp
index 0a314fdd41cb..89207640ee54 100644
--- a/llvm/lib/Target/RISCV/RISCVExpandPseudoInsts.cpp
+++ b/llvm/lib/Target/RISCV/RISCVExpandPseudoInsts.cpp
@@ -317,8 +317,9 @@ bool RISCVExpandPseudo::expandRV32ZdinxStore(MachineBasicBlock &MBB,
       .addReg(MBBI->getOperand(1).getReg())
       .add(MBBI->getOperand(2));
   if (MBBI->getOperand(2).isGlobal() || MBBI->getOperand(2).isCPI()) {
-    // FIXME: Zdinx RV32 can not work on unaligned memory.
-    assert(!STI->hasFastUnalignedAccess());
+    // FIXME: Zdinx RV32 can not work on unaligned scalar memory.
+    assert(!STI->hasFastUnalignedAccess() &&
+           !STI->enableUnalignedScalarMem());
     assert(MBBI->getOperand(2).getOffset() % 8 == 0);
     MBBI->getOperand(2).setOffset(MBBI->getOperand(2).getOffset() + 4);
diff --git a/llvm/lib/Target/RISCV/RISCVFeatures.td b/llvm/lib/Target/RISCV/RISCVFeatures.td
index 26451c80f57b..1bb6b6a561f4 100644
--- a/llvm/lib/Target/RISCV/RISCVFeatures.td
+++ b/llvm/lib/Target/RISCV/RISCVFeatures.td
@@ -1025,6 +1025,11 @@ def FeatureFastUnalignedAccess
                       "true", "Has reasonably performant unaligned "
                       "loads and stores (both scalar and vector)">;
 
+def FeatureUnalignedScalarMem
+   : SubtargetFeature<"unaligned-scalar-mem", "EnableUnalignedScalarMem",
+                      "true", "Has reasonably performant unaligned scalar "
+                      "loads and stores">;
+
 def FeaturePostRAScheduler : SubtargetFeature<"use-postra-scheduler",
     "UsePostRAScheduler", "true", "Schedule again after register allocation">;
 
diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
index d46093b9e260..3fe7ddfdd427 100644
--- a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
+++ b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
@@ -1883,7 +1883,8 @@ bool RISCVTargetLowering::shouldConvertConstantLoadToIntImm(const APInt &Imm,
   // replace. If we don't support unaligned scalar mem, prefer the constant
   // pool.
   // TODO: Can the caller pass down the alignment?
-  if (!Subtarget.hasFastUnalignedAccess())
+  if (!Subtarget.hasFastUnalignedAccess() &&
+      !Subtarget.enableUnalignedScalarMem())
     return true;
 
   // Prefer to keep the load if it would require many instructions.
@@ -19772,8 +19773,10 @@ bool RISCVTargetLowering::allowsMisalignedMemoryAccesses(
     unsigned *Fast) const {
   if (!VT.isVector()) {
     if (Fast)
-      *Fast = Subtarget.hasFastUnalignedAccess();
-    return Subtarget.hasFastUnalignedAccess();
+      *Fast = Subtarget.hasFastUnalignedAccess() ||
+              Subtarget.enableUnalignedScalarMem();
+    return Subtarget.hasFastUnalignedAccess() ||
+           Subtarget.enableUnalignedScalarMem();
   }
 
   // All vector implementations must support element alignment
diff --git a/llvm/lib/Target/SystemZ/SystemZInstrInfo.cpp b/llvm/lib/Target/SystemZ/SystemZInstrInfo.cpp
index bf6547cc87ec..2f2dc6b80792 100644
--- a/llvm/lib/Target/SystemZ/SystemZInstrInfo.cpp
+++ b/llvm/lib/Target/SystemZ/SystemZInstrInfo.cpp
@@ -70,49 +70,62 @@ void SystemZInstrInfo::splitMove(MachineBasicBlock::iterator MI,
   MachineBasicBlock *MBB = MI->getParent();
   MachineFunction &MF = *MBB->getParent();
 
-  // Get two load or store instructions.  Use the original instruction for one
-  // of them (arbitrarily the second here) and create a clone for the other.
-  MachineInstr *EarlierMI = MF.CloneMachineInstr(&*MI);
-  MBB->insert(MI, EarlierMI);
+  // Get two load or store instructions.  Use the original instruction for
+  // one of them and create a clone for the other.
+  MachineInstr *HighPartMI = MF.CloneMachineInstr(&*MI);
+  MachineInstr *LowPartMI = &*MI;
+  MBB->insert(LowPartMI, HighPartMI);
 
   // Set up the two 64-bit registers and remember super reg and its flags.
-  MachineOperand &HighRegOp = EarlierMI->getOperand(0);
-  MachineOperand &LowRegOp = MI->getOperand(0);
+  MachineOperand &HighRegOp = HighPartMI->getOperand(0);
+  MachineOperand &LowRegOp = LowPartMI->getOperand(0);
   Register Reg128 = LowRegOp.getReg();
   unsigned Reg128Killed = getKillRegState(LowRegOp.isKill());
   unsigned Reg128Undef = getUndefRegState(LowRegOp.isUndef());
   HighRegOp.setReg(RI.getSubReg(HighRegOp.getReg(), SystemZ::subreg_h64));
   LowRegOp.setReg(RI.getSubReg(LowRegOp.getReg(), SystemZ::subreg_l64));
 
-  if (MI->mayStore()) {
-    // Add implicit uses of the super register in case one of the subregs is
-    // undefined. We could track liveness and skip storing an undefined
-    // subreg, but this is hopefully rare (discovered with llvm-stress).
-    // If Reg128 was killed, set kill flag on MI.
-    unsigned Reg128UndefImpl = (Reg128Undef | RegState::Implicit);
-    MachineInstrBuilder(MF, EarlierMI).addReg(Reg128, Reg128UndefImpl);
-    MachineInstrBuilder(MF, MI).addReg(Reg128, (Reg128UndefImpl | Reg128Killed));
-  }
-
   // The address in the first (high) instruction is already correct.
   // Adjust the offset in the second (low) instruction.
-  MachineOperand &HighOffsetOp = EarlierMI->getOperand(2);
-  MachineOperand &LowOffsetOp = MI->getOperand(2);
+  MachineOperand &HighOffsetOp = HighPartMI->getOperand(2);
+  MachineOperand &LowOffsetOp = LowPartMI->getOperand(2);
   LowOffsetOp.setImm(LowOffsetOp.getImm() + 8);
 
-  // Clear the kill flags on the registers in the first instruction.
-  if (EarlierMI->getOperand(0).isReg() && EarlierMI->getOperand(0).isUse())
-    EarlierMI->getOperand(0).setIsKill(false);
-  EarlierMI->getOperand(1).setIsKill(false);
-  EarlierMI->getOperand(3).setIsKill(false);
-
   // Set the opcodes.
   unsigned HighOpcode = getOpcodeForOffset(NewOpcode, HighOffsetOp.getImm());
   unsigned LowOpcode = getOpcodeForOffset(NewOpcode, LowOffsetOp.getImm());
   assert(HighOpcode && LowOpcode && "Both offsets should be in range");
+  HighPartMI->setDesc(get(HighOpcode));
+  LowPartMI->setDesc(get(LowOpcode));
 
-  EarlierMI->setDesc(get(HighOpcode));
-  MI->setDesc(get(LowOpcode));
+  MachineInstr *FirstMI = HighPartMI;
+  if (MI->mayStore()) {
+    FirstMI->getOperand(0).setIsKill(false);
+    // Add implicit uses of the super register in case one of the subregs is
+    // undefined. We could track liveness and skip storing an undefined
+    // subreg, but this is hopefully rare (discovered with llvm-stress).
+    // If Reg128 was killed, set kill flag on MI.
+    unsigned Reg128UndefImpl = (Reg128Undef | RegState::Implicit);
+    MachineInstrBuilder(MF, HighPartMI).addReg(Reg128, Reg128UndefImpl);
+    MachineInstrBuilder(MF, LowPartMI).addReg(Reg128, (Reg128UndefImpl | Reg128Killed));
+  } else {
+    // If HighPartMI clobbers any of the address registers, it needs to come
+    // after LowPartMI.
+    auto overlapsAddressReg = [&](Register Reg) -> bool {
+      return RI.regsOverlap(Reg, MI->getOperand(1).getReg()) ||
+             RI.regsOverlap(Reg, MI->getOperand(3).getReg());
+    };
+    if (overlapsAddressReg(HighRegOp.getReg())) {
+      assert(!overlapsAddressReg(LowRegOp.getReg()) &&
+             "Both loads clobber address!");
+      MBB->splice(HighPartMI, MBB, LowPartMI);
+      FirstMI = LowPartMI;
+    }
+  }
+
+  // Clear the kill flags on the address registers in the first instruction.
+  FirstMI->getOperand(1).setIsKill(false);
+  FirstMI->getOperand(3).setIsKill(false);
 }
 
 // Split ADJDYNALLOC instruction MI.
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index 71fc6b5047ea..3e4ecab8443a 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -7295,7 +7295,7 @@ static SDValue lowerBuildVectorAsBroadcast(BuildVectorSDNode *BVOp,
     // With pattern matching, the VBROADCAST node may become a VMOVDDUP.
     if (ScalarSize == 32 ||
         (ScalarSize == 64 && (IsGE256 || Subtarget.hasVLX())) ||
-        CVT == MVT::f16 ||
+        (CVT == MVT::f16 && Subtarget.hasAVX2()) ||
         (OptForSize && (ScalarSize == 64 || Subtarget.hasAVX2()))) {
       const Constant *C = nullptr;
       if (ConstantSDNode *CI = dyn_cast<ConstantSDNode>(Ld))
@@ -29841,7 +29841,9 @@ static SDValue LowerRotate(SDValue Op, const X86Subtarget &Subtarget,
     return R;
 
   // AVX512 implicitly uses modulo rotation amounts.
-  if (Subtarget.hasAVX512() && 32 <= EltSizeInBits) {
+  if ((Subtarget.hasVLX() ||
+       (Subtarget.hasAVX512() && Subtarget.hasEVEX512())) &&
+      32 <= EltSizeInBits) {
     // Attempt to rotate by immediate.
     if (IsCstSplat) {
       unsigned RotOpc = IsROTL ? X86ISD::VROTLI : X86ISD::VROTRI;
diff --git a/llvm/lib/Target/X86/X86InstrAVX512.td b/llvm/lib/Target/X86/X86InstrAVX512.td
index bb5e22c71427..0564f2167d8e 100644
--- a/llvm/lib/Target/X86/X86InstrAVX512.td
+++ b/llvm/lib/Target/X86/X86InstrAVX512.td
@@ -814,7 +814,7 @@ defm : vextract_for_size_lowering<"VEXTRACTF64x4Z", v32f16_info, v16f16x_info,
 
 // A 128-bit extract from bits [255:128] of a 512-bit vector should use a
 // smaller extract to enable EVEX->VEX.
-let Predicates = [NoVLX] in {
+let Predicates = [NoVLX, HasEVEX512] in {
   def : Pat<(v2i64 (extract_subvector (v8i64 VR512:$src), (iPTR 2))),
             (v2i64 (VEXTRACTI128rr
                     (v4i64 (EXTRACT_SUBREG (v8i64 VR512:$src), sub_ymm)),
@@ -3068,7 +3068,7 @@ def : Pat<(Narrow.KVT (and Narrow.KRC:$mask,
                       addr:$src2, (X86cmpm_imm_commute timm:$cc)), Narrow.KRC)>;
 }
 
-let Predicates = [HasAVX512, NoVLX] in {
+let Predicates = [HasAVX512, NoVLX, HasEVEX512] in {
  defm : axv512_icmp_packed_cc_no_vlx_lowering;
  defm : axv512_icmp_packed_cc_no_vlx_lowering;
@@ -3099,7 +3099,7 @@ let Predicates = [HasAVX512, NoVLX] in {
   defm : axv512_cmp_packed_cc_no_vlx_lowering<"VCMPPD", v2f64x_info, v8f64_info>;
 }
 
-let Predicates = [HasBWI, NoVLX] in {
+let Predicates = [HasBWI, NoVLX, HasEVEX512] in {
  defm : axv512_icmp_packed_cc_no_vlx_lowering;
  defm : axv512_icmp_packed_cc_no_vlx_lowering;
@@ -3493,7 +3493,7 @@ multiclass mask_move_lowering<string InstrStr, X86VectorVTInfo Narrow,
             sub_ymm)>;
 }
 
-let Predicates = [HasAVX512, NoVLX] in {
+let Predicates = [HasAVX512, NoVLX, HasEVEX512] in {
   defm : mask_move_lowering<"VMOVDQA32Z", v4i32x_info, v16i32_info>;
   defm : mask_move_lowering<"VMOVAPSZ", v8f32x_info, v16f32_info>;
@@ -3505,7 +3505,7 @@ let Predicates = [HasAVX512, NoVLX] in {
   defm : mask_move_lowering<"VMOVDQA64Z", v4i64x_info, v8i64_info>;
 }
 
-let Predicates = [HasBWI, NoVLX] in {
+let Predicates = [HasBWI, NoVLX, HasEVEX512] in {
   defm : mask_move_lowering<"VMOVDQU8Z", v16i8x_info, v64i8_info>;
   defm : mask_move_lowering<"VMOVDQU8Z", v32i8x_info, v64i8_info>;
@@ -4998,8 +4998,8 @@ defm VPMINUD : avx512_binop_rm_vl_d<0x3B, "vpminud", umin,
 defm VPMINUQ : avx512_binop_rm_vl_q<0x3B, "vpminuq", umin,
                                     SchedWriteVecALU, HasAVX512, 1>, T8;
 
-// PMULLQ: Use 512bit version to implement 128/256 bit in case NoVLX.
-let Predicates = [HasDQI, NoVLX] in {
+// PMULLQ: Use 512bit version to implement 128/256 bit in case NoVLX, HasEVEX512.
+let Predicates = [HasDQI, NoVLX, HasEVEX512] in {
   def : Pat<(v4i64 (mul (v4i64 VR256X:$src1),
                         (v4i64 VR256X:$src2))),
             (EXTRACT_SUBREG
                 (VPMULLQZrr
@@ -5055,7 +5055,7 @@ multiclass avx512_min_max_lowering<string Instr, SDNode OpNode> {
             sub_xmm)>;
 }
 
-let Predicates = [HasAVX512, NoVLX] in {
+let Predicates = [HasAVX512, NoVLX, HasEVEX512] in {
   defm : avx512_min_max_lowering<"VPMAXUQZ", umax>;
   defm : avx512_min_max_lowering<"VPMINUQZ", umin>;
   defm : avx512_min_max_lowering<"VPMAXSQZ", smax>;
@@ -6032,7 +6032,7 @@ defm VPSRL : avx512_shift_types<0xD2, 0xD3, 0xD1, "vpsrl", X86vsrl,
                                 SchedWriteVecShift>;
 
 // Use 512bit VPSRA/VPSRAI version to implement v2i64/v4i64 in case NoVLX.
-let Predicates = [HasAVX512, NoVLX] in {
+let Predicates = [HasAVX512, NoVLX, HasEVEX512] in {
   def : Pat<(v4i64 (X86vsra (v4i64 VR256X:$src1), (v2i64 VR128X:$src2))),
             (EXTRACT_SUBREG (v8i64
               (VPSRAQZrr
@@ -6161,14 +6161,14 @@ defm VPSRLV : avx512_var_shift_types<0x45, "vpsrlv", X86vsrlv, SchedWriteVarVecShift>;
 defm VPRORV : avx512_var_shift_types<0x14, "vprorv", rotr, SchedWriteVarVecShift>;
 defm VPROLV : avx512_var_shift_types<0x15, "vprolv", rotl, SchedWriteVarVecShift>;
 
-defm : avx512_var_shift_lowering;
-defm : avx512_var_shift_lowering;
-defm : avx512_var_shift_lowering;
-defm : avx512_var_shift_lowering;
+defm : avx512_var_shift_lowering;
+defm : avx512_var_shift_lowering;
+defm : avx512_var_shift_lowering;
+defm : avx512_var_shift_lowering;
 
 // Use 512bit VPROL/VPROLI version to implement v2i64/v4i64 + v4i32/v8i32 in case NoVLX.
-let Predicates = [HasAVX512, NoVLX] in {
+let Predicates = [HasAVX512, NoVLX, HasEVEX512] in {
   def : Pat<(v2i64 (rotl (v2i64 VR128X:$src1), (v2i64 VR128X:$src2))),
             (EXTRACT_SUBREG (v8i64
               (VPROLVQZrr
@@ -6219,7 +6219,7 @@ let Predicates = [HasAVX512, NoVLX] in {
 }
 
 // Use 512bit VPROR/VPRORI version to implement v2i64/v4i64 + v4i32/v8i32 in case NoVLX.
-let Predicates = [HasAVX512, NoVLX] in {
+let Predicates = [HasAVX512, NoVLX, HasEVEX512] in {
   def : Pat<(v2i64 (rotr (v2i64 VR128X:$src1), (v2i64 VR128X:$src2))),
             (EXTRACT_SUBREG (v8i64
               (VPRORVQZrr
@@ -9816,7 +9816,7 @@ defm VPMOVUSWB : avx512_trunc_wb<0x10, "vpmovuswb", X86vtruncus,
                                 truncstore_us_vi8, masked_truncstore_us_vi8,
                                 X86vtruncus, X86vmtruncus>;
 
-let Predicates = [HasAVX512, NoVLX] in {
+let Predicates = [HasAVX512, NoVLX, HasEVEX512] in {
 def: Pat<(v8i16 (trunc (v8i32 VR256X:$src))),
          (v8i16 (EXTRACT_SUBREG
                  (v16i16 (VPMOVDWZrr (v16i32 (INSERT_SUBREG (IMPLICIT_DEF),
@@ -9827,7 +9827,7 @@ def: Pat<(v4i32 (trunc (v4i64 VR256X:$src))),
                  VR256X:$src, sub_ymm)))), sub_xmm))>;
 }
 
-let Predicates = [HasBWI, NoVLX] in {
+let Predicates = [HasBWI, NoVLX, HasEVEX512] in {
 def: Pat<(v16i8 (trunc (v16i16 VR256X:$src))),
          (v16i8 (EXTRACT_SUBREG (VPMOVWBZrr (v32i16 (INSERT_SUBREG (IMPLICIT_DEF),
                  VR256X:$src, sub_ymm))), sub_xmm))>;
@@ -10370,7 +10370,7 @@ multiclass avx512_convert_vector_to_mask<bits<8> opc, string OpcodeStr,
     defm Z128 : convert_vector_to_mask_common, EVEX_V128;
   }
 
-  let Predicates = [prd, NoVLX] in {
+  let Predicates = [prd, NoVLX, HasEVEX512] in {
    defm Z256_Alt : convert_vector_to_mask_lowering;
    defm Z128_Alt : convert_vector_to_mask_lowering;
   }
@@ -11157,7 +11157,7 @@ defm VPABS : avx512_unary_rm_vl_all<0x1C, 0x1D, 0x1E, 0x1F, "vpabs", abs,
                                     SchedWriteVecALU>;
 
 // VPABS: Use 512bit version to implement 128/256 bit in case NoVLX.
-let Predicates = [HasAVX512, NoVLX] in {
+let Predicates = [HasAVX512, NoVLX, HasEVEX512] in {
   def : Pat<(v4i64 (abs VR256X:$src)),
             (EXTRACT_SUBREG
                 (VPABSQZrr
@@ -11173,7 +11173,7 @@ let Predicates = [HasAVX512, NoVLX] in {
 
 // Use 512bit version to implement 128/256 bit.
 multiclass avx512_unary_lowering<string InstrStr, SDNode OpNode,
                                  AVX512VLVectorVTInfo _, Predicate prd> {
-  let Predicates = [prd, NoVLX] in {
+  let Predicates = [prd, NoVLX, HasEVEX512] in {
    def : Pat<(_.info256.VT (OpNode (_.info256.VT _.info256.RC:$src1))),
              (EXTRACT_SUBREG
                (!cast<Instruction>(InstrStr # "Zrr")
@@ -11792,7 +11792,7 @@ let Predicates = [HasAVX512] in {
             (VPTERNLOGQZrri VR512:$src, VR512:$src, VR512:$src, (i8 15))>;
 }
 
-let Predicates = [HasAVX512, NoVLX] in {
+let Predicates = [HasAVX512, NoVLX, HasEVEX512] in {
   def : Pat<(v16i8 (vnot VR128X:$src)),
             (EXTRACT_SUBREG
                 (VPTERNLOGQZrri
diff --git a/llvm/lib/TargetParser/Host.cpp b/llvm/lib/TargetParser/Host.cpp
index 1adef15771fa..848b531dd8dd 100644
--- a/llvm/lib/TargetParser/Host.cpp
+++ b/llvm/lib/TargetParser/Host.cpp
@@ -1774,7 +1774,8 @@ bool sys::getHostCPUFeatures(StringMap<bool> &Features) {
   Features["rtm"] = HasLeaf7 && ((EBX >> 11) & 1);
   // AVX512 is only supported if the OS supports the context save for it.
   Features["avx512f"] = HasLeaf7 && ((EBX >> 16) & 1) && HasAVX512Save;
-  Features["evex512"] = Features["avx512f"];
+  if (Features["avx512f"])
+    Features["evex512"] = true;
   Features["avx512dq"] = HasLeaf7 && ((EBX >> 17) & 1) && HasAVX512Save;
   Features["rdseed"] = HasLeaf7 && ((EBX >> 18) & 1);
   Features["adx"] = HasLeaf7 && ((EBX >> 19) & 1);
diff --git a/llvm/lib/Transforms/IPO/FunctionAttrs.cpp b/llvm/lib/Transforms/IPO/FunctionAttrs.cpp
index 7ebf265e17ba..27c411250d53 100644
--- a/llvm/lib/Transforms/IPO/FunctionAttrs.cpp
+++ b/llvm/lib/Transforms/IPO/FunctionAttrs.cpp
@@ -1186,10 +1186,15 @@ static bool isReturnNonNull(Function *F, const SCCNodeSet &SCCNodes,
     switch (RVI->getOpcode()) {
     // Extend the analysis by looking upwards.
     case Instruction::BitCast:
-    case Instruction::GetElementPtr:
     case Instruction::AddrSpaceCast:
       FlowsToReturn.insert(RVI->getOperand(0));
       continue;
+    case Instruction::GetElementPtr:
+      if (cast<GEPOperator>(RVI)->isInBounds()) {
+        FlowsToReturn.insert(RVI->getOperand(0));
+        continue;
+      }
+      return false;
     case Instruction::Select: {
       SelectInst *SI = cast<SelectInst>(RVI);
       FlowsToReturn.insert(SI->getTrueValue());
diff --git a/llvm/lib/Transforms/IPO/GlobalOpt.cpp b/llvm/lib/Transforms/IPO/GlobalOpt.cpp
index 951372adcfa9..619b3f612f25 100644
--- a/llvm/lib/Transforms/IPO/GlobalOpt.cpp
+++ b/llvm/lib/Transforms/IPO/GlobalOpt.cpp
@@ -2212,6 +2212,9 @@ static bool mayHaveOtherReferences(GlobalValue &GV, const LLVMUsed &U) {
 
 static bool hasUsesToReplace(GlobalAlias &GA, const LLVMUsed &U,
                              bool &RenameTarget) {
+  if (GA.isWeakForLinker())
+    return false;
+
   RenameTarget = false;
   bool Ret = false;
   if (hasUseOtherThanLLVMUsed(GA, U))
diff --git a/llvm/lib/Transforms/InstCombine/InstCombineSelect.cpp b/llvm/lib/Transforms/InstCombine/InstCombineSelect.cpp
index 8cc7901cbac7..86a39cf2ee93 100644
--- a/llvm/lib/Transforms/InstCombine/InstCombineSelect.cpp
+++ b/llvm/lib/Transforms/InstCombine/InstCombineSelect.cpp
@@ -3201,7 +3201,8 @@ Instruction *InstCombinerImpl::foldSelectOfBools(SelectInst &SI) {
 // pattern.
 static bool isSafeToRemoveBitCeilSelect(ICmpInst::Predicate Pred, Value *Cond0,
                                         const APInt *Cond1, Value *CtlzOp,
-                                        unsigned BitWidth) {
+                                        unsigned BitWidth,
+                                        bool &ShouldDropNUW) {
   // The challenge in recognizing std::bit_ceil(X) is that the operand is used
   // for the CTLZ proper and select condition, each possibly with some
   // operation like add and sub.
@@ -3224,6 +3225,8 @@ static bool isSafeToRemoveBitCeilSelect(ICmpInst::Predicate Pred, Value *Cond0,
   ConstantRange CR = ConstantRange::makeExactICmpRegion(
       CmpInst::getInversePredicate(Pred), *Cond1);
 
+  ShouldDropNUW = false;
+
   // Match the operation that's used to compute CtlzOp from CommonAncestor. If
   // CtlzOp == CommonAncestor, return true as no operation is needed. If a
   // match is found, execute the operation on CR, update CR, and return true.
@@ -3237,6 +3240,7 @@ static bool isSafeToRemoveBitCeilSelect(ICmpInst::Predicate Pred, Value *Cond0,
     return true;
   }
   if (match(CtlzOp, m_Sub(m_APInt(C), m_Specific(CommonAncestor)))) {
+    ShouldDropNUW = true;
     CR = ConstantRange(*C).sub(CR);
     return true;
   }
@@ -3306,14 +3310,20 @@ static Instruction *foldBitCeil(SelectInst &SI, IRBuilderBase &Builder) {
     Pred = CmpInst::getInversePredicate(Pred);
   }
 
+  bool ShouldDropNUW;
+
   if (!match(FalseVal, m_One()) ||
       !match(TrueVal, m_OneUse(m_Shl(m_One(), m_OneUse(m_Sub(m_SpecificInt(BitWidth),
                                                              m_Value(Ctlz)))))) ||
       !match(Ctlz, m_Intrinsic<Intrinsic::ctlz>(m_Value(CtlzOp), m_Zero())) ||
-      !isSafeToRemoveBitCeilSelect(Pred, Cond0, Cond1, CtlzOp, BitWidth))
+      !isSafeToRemoveBitCeilSelect(Pred, Cond0, Cond1, CtlzOp, BitWidth,
+                                   ShouldDropNUW))
     return nullptr;
 
+  if (ShouldDropNUW)
+    cast<Instruction>(CtlzOp)->setHasNoUnsignedWrap(false);
+
   // Build 1 << (-CTLZ & (BitWidth-1)). The negation likely corresponds to a
   // single hardware instruction as opposed to BitWidth - CTLZ, where BitWidth
   // is an integer constant. Masking with BitWidth-1 comes free on some
diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
index 1fbd69e38eae..0a9e2c7f49f5 100644
--- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -11653,12 +11653,12 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E, bool PostponedPHIs) {
       if (UseIntrinsic && isVectorIntrinsicWithOverloadTypeAtArg(ID, -1))
         TysForDecl.push_back(
             FixedVectorType::get(CI->getType(), E->Scalars.size()));
-      auto *CEI = cast<CallInst>(VL0);
       for (unsigned I : seq<unsigned>(0, CI->arg_size())) {
         ValueList OpVL;
         // Some intrinsics have scalar arguments. This argument should not be
         // vectorized.
         if (UseIntrinsic && isVectorIntrinsicWithScalarOpAtArg(ID, I)) {
+          CallInst *CEI = cast<CallInst>(VL0);
           ScalarArg = CEI->getArgOperand(I);
           OpVecs.push_back(CEI->getArgOperand(I));
           if (isVectorIntrinsicWithOverloadTypeAtArg(ID, I))
@@ -11671,25 +11671,6 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E, bool PostponedPHIs) {
           LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n");
           return E->VectorizedValue;
         }
-        auto GetOperandSignedness = [&](unsigned Idx) {
-          const TreeEntry *OpE = getOperandEntry(E, Idx);
-          bool IsSigned = false;
-          auto It = MinBWs.find(OpE);
-          if (It != MinBWs.end())
-            IsSigned = It->second.second;
-          else
-            IsSigned = any_of(OpE->Scalars, [&](Value *R) {
-              return !isKnownNonNegative(R, SimplifyQuery(*DL));
-            });
-          return IsSigned;
-        };
-        ScalarArg = CEI->getArgOperand(I);
-        if (cast<VectorType>(OpVec->getType())->getElementType() !=
-            ScalarArg->getType()) {
-          auto *CastTy = FixedVectorType::get(ScalarArg->getType(),
-                                              VecTy->getNumElements());
-          OpVec = Builder.CreateIntCast(OpVec, CastTy, GetOperandSignedness(I));
-        }
         LLVM_DEBUG(dbgs() << "SLP: OpVec[" << I << "]: " << *OpVec << "\n");
         OpVecs.push_back(OpVec);
         if (UseIntrinsic && isVectorIntrinsicWithOverloadTypeAtArg(ID, I))
diff --git a/openmp/runtime/src/kmp_settings.cpp b/openmp/runtime/src/kmp_settings.cpp
index ec86ee07472c..58f19ea5b8ab 100644
--- a/openmp/runtime/src/kmp_settings.cpp
+++ b/openmp/runtime/src/kmp_settings.cpp
@@ -6426,6 +6426,8 @@ void __kmp_env_initialize(char const *string) {
   }
   if ((__kmp_nested_proc_bind.bind_types[0] != proc_bind_intel) &&
       (__kmp_nested_proc_bind.bind_types[0] != proc_bind_default)) {
+    if (__kmp_nested_proc_bind.bind_types[0] == proc_bind_false)
+      __kmp_affinity.type = affinity_none;
     if (__kmp_affinity.type == affinity_default) {
       __kmp_affinity.type = affinity_compact;
      __kmp_affinity.flags.dups = FALSE;