From 12f3ca4cdb95b193af905a00e722a4dcb40b3de3 Mon Sep 17 00:00:00 2001 From: Dimitry Andric Date: Apr 26 2017 19:45:00 +0000 Subject: Vendor import of llvm trunk r301441: https://llvm.org/svn/llvm-project/llvm/trunk@301441 --- diff --git a/cmake/modules/HandleLLVMOptions.cmake b/cmake/modules/HandleLLVMOptions.cmake index 099d2eb..882d68e 100644 --- a/cmake/modules/HandleLLVMOptions.cmake +++ b/cmake/modules/HandleLLVMOptions.cmake @@ -222,6 +222,13 @@ if( CMAKE_SIZEOF_VOID_P EQUAL 8 AND NOT WIN32 ) endif( LLVM_BUILD_32_BITS ) endif( CMAKE_SIZEOF_VOID_P EQUAL 8 AND NOT WIN32 ) +# If building on a GNU specific 32-bit system, make sure off_t is 64 bits +# so that off_t can stored offset > 2GB +if( CMAKE_SIZEOF_VOID_P EQUAL 4 ) + add_definitions( -D_LARGEFILE_SOURCE ) + add_definitions( -D_FILE_OFFSET_BITS=64 ) +endif() + if( XCODE ) # For Xcode enable several build settings that correspond to # many warnings that are on by default in Clang but are diff --git a/docs/AMDGPUUsage.rst b/docs/AMDGPUUsage.rst index 5ff0f20..9749705 100644 --- a/docs/AMDGPUUsage.rst +++ b/docs/AMDGPUUsage.rst @@ -82,9 +82,8 @@ handler as follows: =============== ============= =============================================== Usage Code Sequence Description =============== ============= =============================================== - llvm.trap s_endpgm Causes wavefront to be terminated. - llvm.debugtrap s_nop No operation. Compiler warning generated that - there is no trap handler installed. + llvm.trap s_endpgm Causes wavefront to be terminated. + llvm.debugtrap Nothing. Compiler warning generated that there is no trap handler installed. =============== ============= =============================================== Assembler diff --git a/docs/GettingStarted.rst b/docs/GettingStarted.rst index a888603..d5c8ba4 100644 --- a/docs/GettingStarted.rst +++ b/docs/GettingStarted.rst @@ -171,6 +171,8 @@ Linux PowerPC GCC, Clang Solaris V9 (Ultrasparc) GCC FreeBSD x86\ :sup:`1` GCC, Clang FreeBSD amd64 GCC, Clang +NetBSD x86\ :sup:`1` GCC, Clang +NetBSD amd64 GCC, Clang MacOS X\ :sup:`2` PowerPC GCC MacOS X x86 GCC, Clang Cygwin/Win32 x86\ :sup:`1, 3` GCC diff --git a/docs/HowToAddABuilder.rst b/docs/HowToAddABuilder.rst index fcc2293..08cbecd 100644 --- a/docs/HowToAddABuilder.rst +++ b/docs/HowToAddABuilder.rst @@ -83,6 +83,8 @@ Here are the steps you can follow to do so: * slaves are added to ``buildbot/osuosl/master/config/slaves.py`` * builders are added to ``buildbot/osuosl/master/config/builders.py`` + Please make sure your builder name and its builddir are unique through the file. + It is possible to whitelist email addresses to unconditionally receive notifications on build failure; for this you'll need to add an ``InformativeMailNotifier`` to ``buildbot/osuosl/master/config/status.py``. This is particularly useful for the diff --git a/docs/LibFuzzer.rst b/docs/LibFuzzer.rst index 0b785a3..a11baa7 100644 --- a/docs/LibFuzzer.rst +++ b/docs/LibFuzzer.rst @@ -87,10 +87,16 @@ Some important things to remember about fuzz targets: * Usually, the narrower the target the better. E.g. if your target can parse several data formats, split it into several targets, one per format. -Building --------- +Fuzzer Usage +------------ + +Very recent versions of Clang (> April 20 2017) include libFuzzer, +and no installation is necessary. 
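A fuzz target itself is a single entry point with the signature shown in this
minimal sketch (``DoSomethingInterestingWithMyAPI`` is only a placeholder for
the API being tested):

.. code-block:: c++

  // fuzz_target.cc -- minimal libFuzzer entry point.
  #include <stddef.h>
  #include <stdint.h>

  extern "C" int LLVMFuzzerTestOneInput(const uint8_t *Data, size_t Size) {
    DoSomethingInterestingWithMyAPI(Data, Size);  // code under test goes here
    return 0;  // non-zero return values are reserved for future use
  }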
+In order to fuzz your binary, use the `-fsanitize=fuzzer` flag during the compilation:: -Next, build the libFuzzer library as a static archive, without any sanitizer + clang -fsanitize=fuzzer,address mytarget.c + +Otherwise, build the libFuzzer library as a static archive, without any sanitizer options. Note that the libFuzzer library contains the ``main()`` function: .. code-block:: console @@ -728,6 +734,7 @@ to crash on invalid inputs. Examples: regular expression matchers, text or binary format parsers, compression, network, crypto. + Trophies ======== * GLIBC: https://sourceware.org/glibc/wiki/FuzzingLibc diff --git a/include/llvm/ADT/APFloat.h b/include/llvm/ADT/APFloat.h index e7e5036..bef6efd 100644 --- a/include/llvm/ADT/APFloat.h +++ b/include/llvm/ADT/APFloat.h @@ -397,6 +397,12 @@ public: /// consider inserting before falling back to scientific /// notation. 0 means to always use scientific notation. /// + /// \param TruncateZero Indicate whether to remove the trailing zero in + /// fraction part or not. Also setting this parameter to false forcing + /// producing of output more similar to default printf behavior. + /// Specifically the lower e is used as exponent delimiter and exponent + /// always contains no less than two digits. + /// /// Number Precision MaxPadding Result /// ------ --------- ---------- ------ /// 1.01E+4 5 2 10100 @@ -406,7 +412,7 @@ public: /// 1.01E-2 4 2 0.0101 /// 1.01E-2 4 1 1.01E-2 void toString(SmallVectorImpl &Str, unsigned FormatPrecision = 0, - unsigned FormatMaxPadding = 3) const; + unsigned FormatMaxPadding = 3, bool TruncateZero = true) const; /// If this value has an exact multiplicative inverse, store it in inv and /// return true. @@ -649,7 +655,7 @@ public: bool isInteger() const; void toString(SmallVectorImpl &Str, unsigned FormatPrecision, - unsigned FormatMaxPadding) const; + unsigned FormatMaxPadding, bool TruncateZero = true) const; bool getExactInverse(APFloat *inv) const; @@ -1144,9 +1150,9 @@ public: APFloat &operator=(APFloat &&RHS) = default; void toString(SmallVectorImpl &Str, unsigned FormatPrecision = 0, - unsigned FormatMaxPadding = 3) const { + unsigned FormatMaxPadding = 3, bool TruncateZero = true) const { APFLOAT_DISPATCH_ON_SEMANTICS( - toString(Str, FormatPrecision, FormatMaxPadding)); + toString(Str, FormatPrecision, FormatMaxPadding, TruncateZero)); } void print(raw_ostream &) const; diff --git a/include/llvm/ADT/APInt.h b/include/llvm/ADT/APInt.h index ceb623d..d0104c3 100644 --- a/include/llvm/ADT/APInt.h +++ b/include/llvm/ADT/APInt.h @@ -78,6 +78,8 @@ public: APINT_BITS_PER_WORD = APINT_WORD_SIZE * CHAR_BIT }; + static const WordType WORD_MAX = ~WordType(0); + private: /// This union is used to store the integer value. When the /// integer bit-width <= 64, it uses VAL, otherwise it uses pVal. @@ -90,6 +92,8 @@ private: friend struct DenseMapAPIntKeyInfo; + friend class APSInt; + /// \brief Fast internal constructor /// /// This constructor is used only internally for speed of construction of @@ -134,15 +138,10 @@ private: /// zero'd out. APInt &clearUnusedBits() { // Compute how many bits are used in the final word - unsigned wordBits = BitWidth % APINT_BITS_PER_WORD; - if (wordBits == 0) - // If all bits are used, we want to leave the value alone. This also - // avoids the undefined behavior of >> when the shift is the same size as - // the word size (64). - return *this; + unsigned WordBits = ((BitWidth-1) % APINT_BITS_PER_WORD) + 1; // Mask out the high bits. 
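    // Worked example (a sketch, assuming 64-bit words): for BitWidth == 20,
    // WordBits == ((20 - 1) % 64) + 1 == 20, so the mask below keeps only the
    // low 20 bits (WORD_MAX >> 44).  For BitWidth == 64, WordBits == 64 and the
    // shift amount is 0, which is why the old "all bits used" early return is
    // no longer needed.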
- uint64_t mask = UINT64_MAX >> (APINT_BITS_PER_WORD - wordBits); + uint64_t mask = WORD_MAX >> (APINT_BITS_PER_WORD - WordBits); if (isSingleWord()) VAL &= mask; else @@ -194,6 +193,9 @@ private: /// out-of-line slow case for lshr. void lshrSlowCase(unsigned ShiftAmt); + /// out-of-line slow case for ashr. + void ashrSlowCase(unsigned ShiftAmt); + /// out-of-line slow case for operator= void AssignSlowCase(const APInt &RHS); @@ -230,6 +232,14 @@ private: /// out-of-line slow case for operator^=. void XorAssignSlowCase(const APInt& RHS); + /// Unsigned comparison. Returns -1, 0, or 1 if this APInt is less than, equal + /// to, or greater than RHS. + int compare(const APInt &RHS) const LLVM_READONLY; + + /// Signed comparison. Returns -1, 0, or 1 if this APInt is less than, equal + /// to, or greater than RHS. + int compareSigned(const APInt &RHS) const LLVM_READONLY; + public: /// \name Constructors /// @{ @@ -363,7 +373,7 @@ public: /// This checks to see if the value has all bits of the APInt are set or not. bool isAllOnesValue() const { if (isSingleWord()) - return VAL == UINT64_MAX >> (APINT_BITS_PER_WORD - BitWidth); + return VAL == WORD_MAX >> (APINT_BITS_PER_WORD - BitWidth); return countPopulationSlowCase() == BitWidth; } @@ -445,7 +455,7 @@ public: assert(numBits != 0 && "numBits must be non-zero"); assert(numBits <= BitWidth && "numBits out of range"); if (isSingleWord()) - return VAL == (UINT64_MAX >> (APINT_BITS_PER_WORD - numBits)); + return VAL == (WORD_MAX >> (APINT_BITS_PER_WORD - numBits)); unsigned Ones = countTrailingOnesSlowCase(); return (numBits == Ones) && ((Ones + countLeadingZerosSlowCase()) == BitWidth); @@ -509,7 +519,7 @@ public: /// /// \returns the all-ones value for an APInt of the specified bit-width. static APInt getAllOnesValue(unsigned numBits) { - return APInt(numBits, UINT64_MAX, true); + return APInt(numBits, WORD_MAX, true); } /// \brief Get the '0' value. @@ -886,7 +896,26 @@ public: /// \brief Arithmetic right-shift function. /// /// Arithmetic right-shift this APInt by shiftAmt. - APInt ashr(unsigned shiftAmt) const; + APInt ashr(unsigned ShiftAmt) const { + APInt R(*this); + R.ashrInPlace(ShiftAmt); + return R; + } + + /// Arithmetic right-shift this APInt by ShiftAmt in place. + void ashrInPlace(unsigned ShiftAmt) { + assert(ShiftAmt <= BitWidth && "Invalid shift amount"); + if (isSingleWord()) { + int64_t SExtVAL = SignExtend64(VAL, BitWidth); + if (ShiftAmt == BitWidth) + VAL = SExtVAL >> (APINT_BITS_PER_WORD - 1); // Fill with sign bit. + else + VAL = SExtVAL >> ShiftAmt; + clearUnusedBits(); + return; + } + ashrSlowCase(ShiftAmt); + } /// \brief Logical right-shift function. /// @@ -928,7 +957,14 @@ public: /// \brief Arithmetic right-shift function. /// /// Arithmetic right-shift this APInt by shiftAmt. - APInt ashr(const APInt &shiftAmt) const; + APInt ashr(const APInt &ShiftAmt) const { + APInt R(*this); + R.ashrInPlace(ShiftAmt); + return R; + } + + /// Arithmetic right-shift this APInt by shiftAmt in place. + void ashrInPlace(const APInt &shiftAmt); /// \brief Logical right-shift function. /// @@ -1079,7 +1115,7 @@ public: /// the validity of the less-than relationship. /// /// \returns true if *this < RHS when both are considered unsigned. - bool ult(const APInt &RHS) const LLVM_READONLY; + bool ult(const APInt &RHS) const { return compare(RHS) < 0; } /// \brief Unsigned less than comparison /// @@ -1098,7 +1134,7 @@ public: /// validity of the less-than relationship. /// /// \returns true if *this < RHS when both are considered signed. 
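  // Usage sketch for the in-place arithmetic shift introduced above
  // (illustrative values only):
  //
  //   APInt X(32, 0x80000000ULL);   // sign bit set
  //   X.ashrInPlace(4);             // X == 0xF8000000, sign bit replicated
  //   APInt Y = X.ashr(4);          // copying form, leaves X untouched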
- bool slt(const APInt &RHS) const LLVM_READONLY; + bool slt(const APInt &RHS) const { return compareSigned(RHS) < 0; } /// \brief Signed less than comparison /// @@ -1117,7 +1153,7 @@ public: /// validity of the less-or-equal relationship. /// /// \returns true if *this <= RHS when both are considered unsigned. - bool ule(const APInt &RHS) const { return ult(RHS) || eq(RHS); } + bool ule(const APInt &RHS) const { return compare(RHS) <= 0; } /// \brief Unsigned less or equal comparison /// @@ -1133,7 +1169,7 @@ public: /// validity of the less-or-equal relationship. /// /// \returns true if *this <= RHS when both are considered signed. - bool sle(const APInt &RHS) const { return slt(RHS) || eq(RHS); } + bool sle(const APInt &RHS) const { return compareSigned(RHS) <= 0; } /// \brief Signed less or equal comparison /// @@ -1149,7 +1185,7 @@ public: /// the validity of the greater-than relationship. /// /// \returns true if *this > RHS when both are considered unsigned. - bool ugt(const APInt &RHS) const { return !ult(RHS) && !eq(RHS); } + bool ugt(const APInt &RHS) const { return !ule(RHS); } /// \brief Unsigned greater than comparison /// @@ -1168,7 +1204,7 @@ public: /// validity of the greater-than relationship. /// /// \returns true if *this > RHS when both are considered signed. - bool sgt(const APInt &RHS) const { return !slt(RHS) && !eq(RHS); } + bool sgt(const APInt &RHS) const { return !sle(RHS); } /// \brief Signed greater than comparison /// @@ -1286,7 +1322,7 @@ public: /// \brief Set every bit to 1. void setAllBits() { if (isSingleWord()) - VAL = UINT64_MAX; + VAL = WORD_MAX; else // Set all the bits in all the words. memset(pVal, -1, getNumWords() * APINT_WORD_SIZE); @@ -1316,7 +1352,7 @@ public: return; } if (loBit < APINT_BITS_PER_WORD && hiBit <= APINT_BITS_PER_WORD) { - uint64_t mask = UINT64_MAX >> (APINT_BITS_PER_WORD - (hiBit - loBit)); + uint64_t mask = WORD_MAX >> (APINT_BITS_PER_WORD - (hiBit - loBit)); mask <<= loBit; if (isSingleWord()) VAL |= mask; @@ -1358,7 +1394,7 @@ public: /// \brief Toggle every bit to its opposite value. void flipAllBits() { if (isSingleWord()) { - VAL ^= UINT64_MAX; + VAL ^= WORD_MAX; clearUnusedBits(); } else { flipAllBitsSlowCase(); @@ -1653,7 +1689,7 @@ public: /// referencing 2 in a space where 2 does no exist. unsigned nearestLogBase2() const { // Special case when we have a bitwidth of 1. If VAL is 1, then we - // get 0. If VAL is 0, we get UINT64_MAX which gets truncated to + // get 0. If VAL is 0, we get WORD_MAX which gets truncated to // UINT32_MAX. if (BitWidth == 1) return VAL - 1; diff --git a/include/llvm/ADT/APSInt.h b/include/llvm/ADT/APSInt.h index 5b6dfa4..dabbf33 100644 --- a/include/llvm/ADT/APSInt.h +++ b/include/llvm/ADT/APSInt.h @@ -125,7 +125,10 @@ public: return IsUnsigned ? APSInt(lshr(Amt), true) : APSInt(ashr(Amt), false); } APSInt& operator>>=(unsigned Amt) { - *this = *this >> Amt; + if (IsUnsigned) + lshrInPlace(Amt); + else + ashrInPlace(Amt); return *this; } @@ -179,7 +182,7 @@ public: return APSInt(static_cast(*this) << Bits, IsUnsigned); } APSInt& operator<<=(unsigned Amt) { - *this = *this << Amt; + static_cast(*this) <<= Amt; return *this; } @@ -285,12 +288,12 @@ public: /// \brief Compare underlying values of two numbers. static int compareValues(const APSInt &I1, const APSInt &I2) { if (I1.getBitWidth() == I2.getBitWidth() && I1.isSigned() == I2.isSigned()) - return I1 == I2 ? 0 : I1 > I2 ? 1 : -1; + return I1.IsUnsigned ? I1.compare(I2) : I1.compareSigned(I2); // Check for a bit-width mismatch. 
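    // Illustrative case for the width handling below: comparing an unsigned
    // 8-bit 200 with an unsigned 16-bit 300 first extends the narrower value
    // to 16 bits, so the final compare() sees 200 < 300 and returns -1.  Only
    // when widths and signedness both match can compare()/compareSigned() be
    // used directly, as in the fast path above.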
if (I1.getBitWidth() > I2.getBitWidth()) return compareValues(I1, I2.extend(I1.getBitWidth())); - else if (I2.getBitWidth() > I1.getBitWidth()) + if (I2.getBitWidth() > I1.getBitWidth()) return compareValues(I1.extend(I2.getBitWidth()), I2); // We have a signedness mismatch. Check for negative values and do an @@ -305,7 +308,7 @@ public: return 1; } - return I1.eq(I2) ? 0 : I1.ugt(I2) ? 1 : -1; + return I1.compare(I2); } static APSInt get(int64_t X) { return APSInt(APInt(64, X), false); } diff --git a/include/llvm/ADT/BitVector.h b/include/llvm/ADT/BitVector.h index e48c023..5aa1015 100644 --- a/include/llvm/ADT/BitVector.h +++ b/include/llvm/ADT/BitVector.h @@ -15,7 +15,6 @@ #define LLVM_ADT_BITVECTOR_H #include "llvm/ADT/ArrayRef.h" -#include "llvm/ADT/STLExtras.h" #include "llvm/Support/MathExtras.h" #include #include @@ -35,9 +34,8 @@ class BitVector { static_assert(BITWORD_SIZE == 64 || BITWORD_SIZE == 32, "Unsupported word size"); - BitWord *Bits; // Actual bits. - unsigned Size; // Size of bitvector in bits. - unsigned Capacity; // Number of BitWords allocated in the Bits array. + MutableArrayRef Bits; // Actual bits. + unsigned Size; // Size of bitvector in bits. public: typedef unsigned size_type; @@ -77,16 +75,14 @@ public: /// BitVector default ctor - Creates an empty bitvector. - BitVector() : Size(0), Capacity(0) { - Bits = nullptr; - } + BitVector() : Size(0) {} /// BitVector ctor - Creates a bitvector of specified number of bits. All /// bits are initialized to the specified value. explicit BitVector(unsigned s, bool t = false) : Size(s) { - Capacity = NumBitWords(s); - Bits = (BitWord *)std::malloc(Capacity * sizeof(BitWord)); - init_words(Bits, Capacity, t); + size_t Capacity = NumBitWords(s); + Bits = allocate(Capacity); + init_words(Bits, t); if (t) clear_unused_bits(); } @@ -94,25 +90,21 @@ public: /// BitVector copy ctor. BitVector(const BitVector &RHS) : Size(RHS.size()) { if (Size == 0) { - Bits = nullptr; - Capacity = 0; + Bits = MutableArrayRef(); return; } - Capacity = NumBitWords(RHS.size()); - Bits = (BitWord *)std::malloc(Capacity * sizeof(BitWord)); - std::memcpy(Bits, RHS.Bits, Capacity * sizeof(BitWord)); + size_t Capacity = NumBitWords(RHS.size()); + Bits = allocate(Capacity); + std::memcpy(Bits.data(), RHS.Bits.data(), Capacity * sizeof(BitWord)); } - BitVector(BitVector &&RHS) - : Bits(RHS.Bits), Size(RHS.Size), Capacity(RHS.Capacity) { - RHS.Bits = nullptr; - RHS.Size = RHS.Capacity = 0; + BitVector(BitVector &&RHS) : Bits(RHS.Bits), Size(RHS.Size) { + RHS.Bits = MutableArrayRef(); + RHS.Size = 0; } - ~BitVector() { - std::free(Bits); - } + ~BitVector() { std::free(Bits.data()); } /// empty - Tests whether there are no bits in this bitvector. bool empty() const { return Size == 0; } @@ -163,6 +155,22 @@ public: return -1; } + /// find_last - Returns the index of the last set bit, -1 if none of the bits + /// are set. + int find_last() const { + if (Size == 0) + return -1; + + unsigned N = NumBitWords(size()); + assert(N > 0); + + unsigned i = N - 1; + while (i > 0 && Bits[i] == BitWord(0)) + --i; + + return int((i + 1) * BITWORD_SIZE - countLeadingZeros(Bits[i])) - 1; + } + /// find_first_unset - Returns the index of the first unset bit, -1 if all /// of the bits are set. int find_first_unset() const { @@ -174,6 +182,30 @@ public: return -1; } + /// find_last_unset - Returns the index of the last unset bit, -1 if all of + /// the bits are set. 
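  // Usage sketch for the new reverse queries (illustrative values):
  //
  //   BitVector BV(100);
  //   BV.set(5);
  //   BV.set(73);
  //   BV.find_first();       // 5
  //   BV.find_last();        // 73
  //   BV.find_last_unset();  // 99, the highest clear bit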
+ int find_last_unset() const { + if (Size == 0) + return -1; + + const unsigned N = NumBitWords(size()); + assert(N > 0); + + unsigned i = N - 1; + BitWord W = Bits[i]; + + // The last word in the BitVector has some unused bits, so we need to set + // them all to 1 first. Set them all to 1 so they don't get treated as + // valid unset bits. + unsigned UnusedCount = BITWORD_SIZE - size() % BITWORD_SIZE; + W |= maskLeadingOnes(UnusedCount); + + while (W == ~BitWord(0) && --i > 0) + W = Bits[i]; + + return int((i + 1) * BITWORD_SIZE - countLeadingOnes(W)) - 1; + } + /// find_next - Returns the index of the next set bit following the /// "Prev" bit. Returns -1 if the next set bit is not found. int find_next(unsigned Prev) const { @@ -228,10 +260,10 @@ public: /// resize - Grow or shrink the bitvector. void resize(unsigned N, bool t = false) { - if (N > Capacity * BITWORD_SIZE) { - unsigned OldCapacity = Capacity; + if (N > getBitCapacity()) { + unsigned OldCapacity = Bits.size(); grow(N); - init_words(&Bits[OldCapacity], (Capacity-OldCapacity), t); + init_words(Bits.drop_front(OldCapacity), t); } // Set any old unused bits that are now included in the BitVector. This @@ -248,19 +280,19 @@ public: } void reserve(unsigned N) { - if (N > Capacity * BITWORD_SIZE) + if (N > getBitCapacity()) grow(N); } // Set, reset, flip BitVector &set() { - init_words(Bits, Capacity, true); + init_words(Bits, true); clear_unused_bits(); return *this; } BitVector &set(unsigned Idx) { - assert(Bits && "Bits never allocated"); + assert(Bits.data() && "Bits never allocated"); Bits[Idx / BITWORD_SIZE] |= BitWord(1) << (Idx % BITWORD_SIZE); return *this; } @@ -295,7 +327,7 @@ public: } BitVector &reset() { - init_words(Bits, Capacity, false); + init_words(Bits, false); return *this; } @@ -562,21 +594,21 @@ public: Size = RHS.size(); unsigned RHSWords = NumBitWords(Size); - if (Size <= Capacity * BITWORD_SIZE) { + if (Size <= getBitCapacity()) { if (Size) - std::memcpy(Bits, RHS.Bits, RHSWords * sizeof(BitWord)); + std::memcpy(Bits.data(), RHS.Bits.data(), RHSWords * sizeof(BitWord)); clear_unused_bits(); return *this; } // Grow the bitvector to have enough elements. - Capacity = RHSWords; - assert(Capacity > 0 && "negative capacity?"); - BitWord *NewBits = (BitWord *)std::malloc(Capacity * sizeof(BitWord)); - std::memcpy(NewBits, RHS.Bits, Capacity * sizeof(BitWord)); + unsigned NewCapacity = RHSWords; + assert(NewCapacity > 0 && "negative capacity?"); + auto NewBits = allocate(NewCapacity); + std::memcpy(NewBits.data(), RHS.Bits.data(), NewCapacity * sizeof(BitWord)); // Destroy the old bits. 
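    // Worked example for the unused-bit masking in find_last_unset() above
    // (assuming 64-bit words): with size() == 70 the last word only holds bits
    // 64..69, so UnusedCount == 64 - 70 % 64 == 58 and the maskLeadingOnes call
    // forces those 58 unused high bits to 1, preventing them from being
    // reported as "unset".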
- std::free(Bits); + std::free(Bits.data()); Bits = NewBits; return *this; @@ -585,13 +617,12 @@ public: const BitVector &operator=(BitVector &&RHS) { if (this == &RHS) return *this; - std::free(Bits); + std::free(Bits.data()); Bits = RHS.Bits; Size = RHS.Size; - Capacity = RHS.Capacity; - RHS.Bits = nullptr; - RHS.Size = RHS.Capacity = 0; + RHS.Bits = MutableArrayRef(); + RHS.Size = 0; return *this; } @@ -599,7 +630,6 @@ public: void swap(BitVector &RHS) { std::swap(Bits, RHS.Bits); std::swap(Size, RHS.Size); - std::swap(Capacity, RHS.Capacity); } //===--------------------------------------------------------------------===// @@ -659,14 +689,14 @@ private: uint32_t NumWords = NumBitWords(Size); - auto Src = ArrayRef(Bits, NumWords).drop_back(Count); - auto Dest = MutableArrayRef(Bits, NumWords).drop_front(Count); + auto Src = Bits.take_front(NumWords).drop_back(Count); + auto Dest = Bits.take_front(NumWords).drop_front(Count); // Since we always move Word-sized chunks of data with src and dest both // aligned to a word-boundary, we don't need to worry about endianness // here. std::memmove(Dest.begin(), Src.begin(), Dest.size() * sizeof(BitWord)); - std::memset(Bits, 0, Count * sizeof(BitWord)); + std::memset(Bits.data(), 0, Count * sizeof(BitWord)); clear_unused_bits(); } @@ -679,14 +709,19 @@ private: uint32_t NumWords = NumBitWords(Size); - auto Src = ArrayRef(Bits, NumWords).drop_front(Count); - auto Dest = MutableArrayRef(Bits, NumWords).drop_back(Count); + auto Src = Bits.take_front(NumWords).drop_front(Count); + auto Dest = Bits.take_front(NumWords).drop_back(Count); assert(Dest.size() == Src.size()); std::memmove(Dest.begin(), Src.begin(), Dest.size() * sizeof(BitWord)); std::memset(Dest.end(), 0, Count * sizeof(BitWord)); } + MutableArrayRef allocate(size_t NumWords) { + BitWord *RawBits = (BitWord *)std::malloc(NumWords * sizeof(BitWord)); + return MutableArrayRef(RawBits, NumWords); + } + int next_unset_in_word(int WordIndex, BitWord Word) const { unsigned Result = WordIndex * BITWORD_SIZE + countTrailingOnes(Word); return Result < size() ? Result : -1; @@ -700,8 +735,8 @@ private: void set_unused_bits(bool t = true) { // Set high words first. unsigned UsedWords = NumBitWords(Size); - if (Capacity > UsedWords) - init_words(&Bits[UsedWords], (Capacity-UsedWords), t); + if (Bits.size() > UsedWords) + init_words(Bits.drop_front(UsedWords), t); // Then set any stray high bits of the last used word. unsigned ExtraBits = Size % BITWORD_SIZE; @@ -720,16 +755,17 @@ private: } void grow(unsigned NewSize) { - Capacity = std::max(NumBitWords(NewSize), Capacity * 2); - assert(Capacity > 0 && "realloc-ing zero space"); - Bits = (BitWord *)std::realloc(Bits, Capacity * sizeof(BitWord)); - + size_t NewCapacity = std::max(NumBitWords(NewSize), Bits.size() * 2); + assert(NewCapacity > 0 && "realloc-ing zero space"); + BitWord *NewBits = + (BitWord *)std::realloc(Bits.data(), NewCapacity * sizeof(BitWord)); + Bits = MutableArrayRef(NewBits, NewCapacity); clear_unused_bits(); } - void init_words(BitWord *B, unsigned NumWords, bool t) { - if (NumWords > 0) - memset(B, 0 - (int)t, NumWords*sizeof(BitWord)); + void init_words(MutableArrayRef B, bool t) { + if (B.size() > 0) + memset(B.data(), 0 - (int)t, B.size() * sizeof(BitWord)); } template @@ -761,7 +797,8 @@ private: public: /// Return the size (in bytes) of the bit vector. 
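  /// For example (a sketch, assuming 64-bit words): after reserve(100) at
  /// least two words are allocated, so getMemorySize() reports at least 16
  /// bytes and getBitCapacity() at least 128 bits, independent of size().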
- size_t getMemorySize() const { return Capacity * sizeof(BitWord); } + size_t getMemorySize() const { return Bits.size() * sizeof(BitWord); } + size_t getBitCapacity() const { return Bits.size() * BITWORD_SIZE; } }; static inline size_t capacity_in_bytes(const BitVector &X) { diff --git a/include/llvm/ADT/SmallBitVector.h b/include/llvm/ADT/SmallBitVector.h index 607e040..bf16af5 100644 --- a/include/llvm/ADT/SmallBitVector.h +++ b/include/llvm/ADT/SmallBitVector.h @@ -117,9 +117,7 @@ private: } // Return the size. - size_t getSmallSize() const { - return getSmallRawBits() >> SmallNumDataBits; - } + size_t getSmallSize() const { return getSmallRawBits() >> SmallNumDataBits; } void setSmallSize(size_t Size) { setSmallRawBits(getSmallBits() | (Size << SmallNumDataBits)); @@ -216,6 +214,16 @@ public: return getPointer()->find_first(); } + int find_last() const { + if (isSmall()) { + uintptr_t Bits = getSmallBits(); + if (Bits == 0) + return -1; + return NumBaseBits - countLeadingZeros(Bits); + } + return getPointer()->find_last(); + } + /// Returns the index of the first unset bit, -1 if all of the bits are set. int find_first_unset() const { if (isSmall()) { @@ -228,6 +236,17 @@ public: return getPointer()->find_first_unset(); } + int find_last_unset() const { + if (isSmall()) { + if (count() == getSmallSize()) + return -1; + + uintptr_t Bits = getSmallBits(); + return NumBaseBits - countLeadingOnes(Bits); + } + return getPointer()->find_last_unset(); + } + /// Returns the index of the next set bit following the "Prev" bit. /// Returns -1 if the next set bit is not found. int find_next(unsigned Prev) const { diff --git a/include/llvm/ADT/StringExtras.h b/include/llvm/ADT/StringExtras.h index 8214782..26f1192 100644 --- a/include/llvm/ADT/StringExtras.h +++ b/include/llvm/ADT/StringExtras.h @@ -76,6 +76,36 @@ static inline std::string toHex(StringRef Input) { return Output; } +static inline uint8_t hexFromNibbles(char MSB, char LSB) { + unsigned U1 = hexDigitValue(MSB); + unsigned U2 = hexDigitValue(LSB); + assert(U1 != -1U && U2 != -1U); + + return static_cast((U1 << 4) | U2); +} + +/// Convert hexadecimal string \p Input to its binary representation. +/// The return string is half the size of \p Input. 
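/// For example (illustrative values): fromHex("2a4c") yields the two bytes
/// {0x2A, 0x4C}, an odd-length input such as "f00" is handled as "0f00" and
/// yields {0x0F, 0x00}, and hexFromNibbles('2', 'a') alone returns 0x2A.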
+static inline std::string fromHex(StringRef Input) { + if (Input.empty()) + return std::string(); + + std::string Output; + Output.reserve((Input.size() + 1) / 2); + if (Input.size() % 2 == 1) { + Output.push_back(hexFromNibbles('0', Input.front())); + Input = Input.drop_front(); + } + + assert(Input.size() % 2 == 0); + while (!Input.empty()) { + uint8_t Hex = hexFromNibbles(Input[0], Input[1]); + Output.push_back(Hex); + Input = Input.drop_front(2); + } + return Output; +} + static inline std::string utostr(uint64_t X, bool isNeg = false) { char Buffer[21]; char *BufPtr = std::end(Buffer); diff --git a/include/llvm/ADT/Triple.h b/include/llvm/ADT/Triple.h index e271075..e3a8a31 100644 --- a/include/llvm/ADT/Triple.h +++ b/include/llvm/ADT/Triple.h @@ -140,7 +140,8 @@ public: Myriad, AMD, Mesa, - LastVendorType = Mesa + SUSE, + LastVendorType = SUSE }; enum OSType { UnknownOS, diff --git a/include/llvm/Analysis/DemandedBits.h b/include/llvm/Analysis/DemandedBits.h index c603274..e5fd8a0 100644 --- a/include/llvm/Analysis/DemandedBits.h +++ b/include/llvm/Analysis/DemandedBits.h @@ -35,6 +35,7 @@ class Function; class Instruction; class DominatorTree; class AssumptionCache; +struct KnownBits; class DemandedBits { public: @@ -58,8 +59,7 @@ private: void determineLiveOperandBits(const Instruction *UserI, const Instruction *I, unsigned OperandNo, const APInt &AOut, APInt &AB, - APInt &KnownZero, APInt &KnownOne, - APInt &KnownZero2, APInt &KnownOne2); + KnownBits &Known, KnownBits &Known2); bool Analyzed; diff --git a/include/llvm/Analysis/InstructionSimplify.h b/include/llvm/Analysis/InstructionSimplify.h index b829e99..25240da 100644 --- a/include/llvm/Analysis/InstructionSimplify.h +++ b/include/llvm/Analysis/InstructionSimplify.h @@ -47,8 +47,33 @@ namespace llvm { class Type; class Value; + struct SimplifyQuery { + const DataLayout &DL; + const TargetLibraryInfo *TLI = nullptr; + const DominatorTree *DT = nullptr; + AssumptionCache *AC = nullptr; + const Instruction *CxtI = nullptr; + SimplifyQuery(const DataLayout &DL) : DL(DL) {} + + SimplifyQuery(const DataLayout &DL, const TargetLibraryInfo *TLI, + const DominatorTree *DT, AssumptionCache *AC = nullptr, + const Instruction *CXTI = nullptr) + : DL(DL), TLI(TLI), DT(DT), AC(AC), CxtI(CXTI) {} + SimplifyQuery getWithInstruction(Instruction *I) const { + SimplifyQuery Copy(*this); + Copy.CxtI = I; + return Copy; + } + }; + + // NOTE: the explicit multiple argument versions of these functions are + // deprecated. + // Please use the SimplifyQuery versions in new code. + /// Given operands for an Add, fold the result or return null. Value *SimplifyAddInst(Value *LHS, Value *RHS, bool isNSW, bool isNUW, + const SimplifyQuery &Q); + Value *SimplifyAddInst(Value *LHS, Value *RHS, bool isNSW, bool isNUW, const DataLayout &DL, const TargetLibraryInfo *TLI = nullptr, const DominatorTree *DT = nullptr, @@ -57,6 +82,8 @@ namespace llvm { /// Given operands for a Sub, fold the result or return null. Value *SimplifySubInst(Value *LHS, Value *RHS, bool isNSW, bool isNUW, + const SimplifyQuery &Q); + Value *SimplifySubInst(Value *LHS, Value *RHS, bool isNSW, bool isNUW, const DataLayout &DL, const TargetLibraryInfo *TLI = nullptr, const DominatorTree *DT = nullptr, @@ -65,6 +92,8 @@ namespace llvm { /// Given operands for an FAdd, fold the result or return null. 
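  // Usage sketch for the SimplifyQuery-based overloads (names such as Op0,
  // Op1, TLI, DT and AC are placeholders for whatever the caller has at hand):
  //
  //   const DataLayout &DL = I->getModule()->getDataLayout();
  //   SimplifyQuery Q(DL, TLI, DT, AC);
  //   if (Value *V = SimplifyAddInst(Op0, Op1, /*isNSW=*/false, /*isNUW=*/false,
  //                                  Q.getWithInstruction(I)))
  //     // ... reuse V instead of materializing a new add.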
Value *SimplifyFAddInst(Value *LHS, Value *RHS, FastMathFlags FMF, + const SimplifyQuery &Q); + Value *SimplifyFAddInst(Value *LHS, Value *RHS, FastMathFlags FMF, const DataLayout &DL, const TargetLibraryInfo *TLI = nullptr, const DominatorTree *DT = nullptr, @@ -73,6 +102,8 @@ namespace llvm { /// Given operands for an FSub, fold the result or return null. Value *SimplifyFSubInst(Value *LHS, Value *RHS, FastMathFlags FMF, + const SimplifyQuery &Q); + Value *SimplifyFSubInst(Value *LHS, Value *RHS, FastMathFlags FMF, const DataLayout &DL, const TargetLibraryInfo *TLI = nullptr, const DominatorTree *DT = nullptr, @@ -81,6 +112,8 @@ namespace llvm { /// Given operands for an FMul, fold the result or return null. Value *SimplifyFMulInst(Value *LHS, Value *RHS, FastMathFlags FMF, + const SimplifyQuery &Q); + Value *SimplifyFMulInst(Value *LHS, Value *RHS, FastMathFlags FMF, const DataLayout &DL, const TargetLibraryInfo *TLI = nullptr, const DominatorTree *DT = nullptr, @@ -88,6 +121,7 @@ namespace llvm { const Instruction *CxtI = nullptr); /// Given operands for a Mul, fold the result or return null. + Value *SimplifyMulInst(Value *LHS, Value *RHS, const SimplifyQuery &Q); Value *SimplifyMulInst(Value *LHS, Value *RHS, const DataLayout &DL, const TargetLibraryInfo *TLI = nullptr, const DominatorTree *DT = nullptr, @@ -95,6 +129,7 @@ namespace llvm { const Instruction *CxtI = nullptr); /// Given operands for an SDiv, fold the result or return null. + Value *SimplifySDivInst(Value *LHS, Value *RHS, const SimplifyQuery &Q); Value *SimplifySDivInst(Value *LHS, Value *RHS, const DataLayout &DL, const TargetLibraryInfo *TLI = nullptr, const DominatorTree *DT = nullptr, @@ -102,6 +137,7 @@ namespace llvm { const Instruction *CxtI = nullptr); /// Given operands for a UDiv, fold the result or return null. + Value *SimplifyUDivInst(Value *LHS, Value *RHS, const SimplifyQuery &Q); Value *SimplifyUDivInst(Value *LHS, Value *RHS, const DataLayout &DL, const TargetLibraryInfo *TLI = nullptr, const DominatorTree *DT = nullptr, @@ -110,6 +146,8 @@ namespace llvm { /// Given operands for an FDiv, fold the result or return null. Value *SimplifyFDivInst(Value *LHS, Value *RHS, FastMathFlags FMF, + const SimplifyQuery &Q); + Value *SimplifyFDivInst(Value *LHS, Value *RHS, FastMathFlags FMF, const DataLayout &DL, const TargetLibraryInfo *TLI = nullptr, const DominatorTree *DT = nullptr, @@ -117,6 +155,7 @@ namespace llvm { const Instruction *CxtI = nullptr); /// Given operands for an SRem, fold the result or return null. + Value *SimplifySRemInst(Value *LHS, Value *RHS, const SimplifyQuery &Q); Value *SimplifySRemInst(Value *LHS, Value *RHS, const DataLayout &DL, const TargetLibraryInfo *TLI = nullptr, const DominatorTree *DT = nullptr, @@ -124,6 +163,7 @@ namespace llvm { const Instruction *CxtI = nullptr); /// Given operands for a URem, fold the result or return null. + Value *SimplifyURemInst(Value *LHS, Value *RHS, const SimplifyQuery &Q); Value *SimplifyURemInst(Value *LHS, Value *RHS, const DataLayout &DL, const TargetLibraryInfo *TLI = nullptr, const DominatorTree *DT = nullptr, @@ -132,6 +172,8 @@ namespace llvm { /// Given operands for an FRem, fold the result or return null. 
Value *SimplifyFRemInst(Value *LHS, Value *RHS, FastMathFlags FMF, + const SimplifyQuery &Q); + Value *SimplifyFRemInst(Value *LHS, Value *RHS, FastMathFlags FMF, const DataLayout &DL, const TargetLibraryInfo *TLI = nullptr, const DominatorTree *DT = nullptr, @@ -140,6 +182,8 @@ namespace llvm { /// Given operands for a Shl, fold the result or return null. Value *SimplifyShlInst(Value *Op0, Value *Op1, bool isNSW, bool isNUW, + const SimplifyQuery &Q); + Value *SimplifyShlInst(Value *Op0, Value *Op1, bool isNSW, bool isNUW, const DataLayout &DL, const TargetLibraryInfo *TLI = nullptr, const DominatorTree *DT = nullptr, @@ -148,6 +192,8 @@ namespace llvm { /// Given operands for a LShr, fold the result or return null. Value *SimplifyLShrInst(Value *Op0, Value *Op1, bool isExact, + const SimplifyQuery &Q); + Value *SimplifyLShrInst(Value *Op0, Value *Op1, bool isExact, const DataLayout &DL, const TargetLibraryInfo *TLI = nullptr, const DominatorTree *DT = nullptr, @@ -156,6 +202,8 @@ namespace llvm { /// Given operands for a AShr, fold the result or return nulll. Value *SimplifyAShrInst(Value *Op0, Value *Op1, bool isExact, + const SimplifyQuery &Q); + Value *SimplifyAShrInst(Value *Op0, Value *Op1, bool isExact, const DataLayout &DL, const TargetLibraryInfo *TLI = nullptr, const DominatorTree *DT = nullptr, @@ -163,6 +211,7 @@ namespace llvm { const Instruction *CxtI = nullptr); /// Given operands for an And, fold the result or return null. + Value *SimplifyAndInst(Value *LHS, Value *RHS, const SimplifyQuery &Q); Value *SimplifyAndInst(Value *LHS, Value *RHS, const DataLayout &DL, const TargetLibraryInfo *TLI = nullptr, const DominatorTree *DT = nullptr, @@ -170,6 +219,7 @@ namespace llvm { const Instruction *CxtI = nullptr); /// Given operands for an Or, fold the result or return null. + Value *SimplifyOrInst(Value *LHS, Value *RHS, const SimplifyQuery &Q); Value *SimplifyOrInst(Value *LHS, Value *RHS, const DataLayout &DL, const TargetLibraryInfo *TLI = nullptr, const DominatorTree *DT = nullptr, @@ -177,6 +227,7 @@ namespace llvm { const Instruction *CxtI = nullptr); /// Given operands for an Xor, fold the result or return null. + Value *SimplifyXorInst(Value *LHS, Value *RHS, const SimplifyQuery &Q); Value *SimplifyXorInst(Value *LHS, Value *RHS, const DataLayout &DL, const TargetLibraryInfo *TLI = nullptr, const DominatorTree *DT = nullptr, @@ -185,6 +236,8 @@ namespace llvm { /// Given operands for an ICmpInst, fold the result or return null. Value *SimplifyICmpInst(unsigned Predicate, Value *LHS, Value *RHS, + const SimplifyQuery &Q); + Value *SimplifyICmpInst(unsigned Predicate, Value *LHS, Value *RHS, const DataLayout &DL, const TargetLibraryInfo *TLI = nullptr, const DominatorTree *DT = nullptr, @@ -193,6 +246,8 @@ namespace llvm { /// Given operands for an FCmpInst, fold the result or return null. Value *SimplifyFCmpInst(unsigned Predicate, Value *LHS, Value *RHS, + FastMathFlags FMF, const SimplifyQuery &Q); + Value *SimplifyFCmpInst(unsigned Predicate, Value *LHS, Value *RHS, FastMathFlags FMF, const DataLayout &DL, const TargetLibraryInfo *TLI = nullptr, const DominatorTree *DT = nullptr, @@ -201,13 +256,17 @@ namespace llvm { /// Given operands for a SelectInst, fold the result or return null. 
Value *SimplifySelectInst(Value *Cond, Value *TrueVal, Value *FalseVal, + const SimplifyQuery &Q); + Value *SimplifySelectInst(Value *Cond, Value *TrueVal, Value *FalseVal, const DataLayout &DL, const TargetLibraryInfo *TLI = nullptr, const DominatorTree *DT = nullptr, AssumptionCache *AC = nullptr, const Instruction *CxtI = nullptr); - /// Given operands for a GetElementPtrInst, fold the result or return null. + /// Given operands for a GetElementPtrInst, fold the result or return null. + Value *SimplifyGEPInst(Type *SrcTy, ArrayRef Ops, + const SimplifyQuery &Q); Value *SimplifyGEPInst(Type *SrcTy, ArrayRef Ops, const DataLayout &DL, const TargetLibraryInfo *TLI = nullptr, @@ -217,6 +276,9 @@ namespace llvm { /// Given operands for an InsertValueInst, fold the result or return null. Value *SimplifyInsertValueInst(Value *Agg, Value *Val, + ArrayRef Idxs, + const SimplifyQuery &Q); + Value *SimplifyInsertValueInst(Value *Agg, Value *Val, ArrayRef Idxs, const DataLayout &DL, const TargetLibraryInfo *TLI = nullptr, const DominatorTree *DT = nullptr, @@ -225,6 +287,8 @@ namespace llvm { /// Given operands for an ExtractValueInst, fold the result or return null. Value *SimplifyExtractValueInst(Value *Agg, ArrayRef Idxs, + const SimplifyQuery &Q); + Value *SimplifyExtractValueInst(Value *Agg, ArrayRef Idxs, const DataLayout &DL, const TargetLibraryInfo *TLI = nullptr, const DominatorTree *DT = nullptr, @@ -233,6 +297,8 @@ namespace llvm { /// Given operands for an ExtractElementInst, fold the result or return null. Value *SimplifyExtractElementInst(Value *Vec, Value *Idx, + const SimplifyQuery &Q); + Value *SimplifyExtractElementInst(Value *Vec, Value *Idx, const DataLayout &DL, const TargetLibraryInfo *TLI = nullptr, const DominatorTree *DT = nullptr, @@ -241,6 +307,8 @@ namespace llvm { /// Given operands for a CastInst, fold the result or return null. Value *SimplifyCastInst(unsigned CastOpc, Value *Op, Type *Ty, + const SimplifyQuery &Q); + Value *SimplifyCastInst(unsigned CastOpc, Value *Op, Type *Ty, const DataLayout &DL, const TargetLibraryInfo *TLI = nullptr, const DominatorTree *DT = nullptr, @@ -249,6 +317,8 @@ namespace llvm { /// Given operands for a ShuffleVectorInst, fold the result or return null. Value *SimplifyShuffleVectorInst(Value *Op0, Value *Op1, Constant *Mask, + Type *RetTy, const SimplifyQuery &Q); + Value *SimplifyShuffleVectorInst(Value *Op0, Value *Op1, Constant *Mask, Type *RetTy, const DataLayout &DL, const TargetLibraryInfo *TLI = nullptr, const DominatorTree *DT = nullptr, @@ -260,6 +330,8 @@ namespace llvm { /// Given operands for a CmpInst, fold the result or return null. Value *SimplifyCmpInst(unsigned Predicate, Value *LHS, Value *RHS, + const SimplifyQuery &Q); + Value *SimplifyCmpInst(unsigned Predicate, Value *LHS, Value *RHS, const DataLayout &DL, const TargetLibraryInfo *TLI = nullptr, const DominatorTree *DT = nullptr, @@ -268,6 +340,8 @@ namespace llvm { /// Given operands for a BinaryOperator, fold the result or return null. Value *SimplifyBinOp(unsigned Opcode, Value *LHS, Value *RHS, + const SimplifyQuery &Q); + Value *SimplifyBinOp(unsigned Opcode, Value *LHS, Value *RHS, const DataLayout &DL, const TargetLibraryInfo *TLI = nullptr, const DominatorTree *DT = nullptr, @@ -278,7 +352,9 @@ namespace llvm { /// In contrast to SimplifyBinOp, try to use FastMathFlag when folding the /// result. In case we don't need FastMathFlags, simply fall to SimplifyBinOp. 
Value *SimplifyFPBinOp(unsigned Opcode, Value *LHS, Value *RHS, - const FastMathFlags &FMF, const DataLayout &DL, + FastMathFlags FMF, const SimplifyQuery &Q); + Value *SimplifyFPBinOp(unsigned Opcode, Value *LHS, Value *RHS, + FastMathFlags FMF, const DataLayout &DL, const TargetLibraryInfo *TLI = nullptr, const DominatorTree *DT = nullptr, AssumptionCache *AC = nullptr, @@ -287,6 +363,8 @@ namespace llvm { /// Given a function and iterators over arguments, fold the result or return /// null. Value *SimplifyCall(Value *V, User::op_iterator ArgBegin, + User::op_iterator ArgEnd, const SimplifyQuery &Q); + Value *SimplifyCall(Value *V, User::op_iterator ArgBegin, User::op_iterator ArgEnd, const DataLayout &DL, const TargetLibraryInfo *TLI = nullptr, const DominatorTree *DT = nullptr, @@ -294,6 +372,7 @@ namespace llvm { const Instruction *CxtI = nullptr); /// Given a function and set of arguments, fold the result or return null. + Value *SimplifyCall(Value *V, ArrayRef Args, const SimplifyQuery &Q); Value *SimplifyCall(Value *V, ArrayRef Args, const DataLayout &DL, const TargetLibraryInfo *TLI = nullptr, const DominatorTree *DT = nullptr, @@ -302,6 +381,8 @@ namespace llvm { /// See if we can compute a simplified version of this instruction. If not, /// return null. + Value *SimplifyInstruction(Instruction *I, const SimplifyQuery &Q, + OptimizationRemarkEmitter *ORE = nullptr); Value *SimplifyInstruction(Instruction *I, const DataLayout &DL, const TargetLibraryInfo *TLI = nullptr, const DominatorTree *DT = nullptr, diff --git a/include/llvm/Analysis/LoopInfo.h b/include/llvm/Analysis/LoopInfo.h index 2fad173..096df1e 100644 --- a/include/llvm/Analysis/LoopInfo.h +++ b/include/llvm/Analysis/LoopInfo.h @@ -158,7 +158,7 @@ public: /// True if terminator in the block can branch to another block that is /// outside of the current loop. bool isLoopExiting(const BlockT *BB) const { - for (const auto Succ : children(BB)) { + for (const auto &Succ : children(BB)) { if (!contains(Succ)) return true; } diff --git a/include/llvm/Analysis/LoopInfoImpl.h b/include/llvm/Analysis/LoopInfoImpl.h index 6dc0422..66c9f68 100644 --- a/include/llvm/Analysis/LoopInfoImpl.h +++ b/include/llvm/Analysis/LoopInfoImpl.h @@ -35,7 +35,7 @@ template void LoopBase:: getExitingBlocks(SmallVectorImpl &ExitingBlocks) const { for (const auto BB : blocks()) - for (const auto Succ : children(BB)) + for (const auto &Succ : children(BB)) if (!contains(Succ)) { // Not in current loop? It must be an exit block. ExitingBlocks.push_back(BB); @@ -61,7 +61,7 @@ template void LoopBase:: getExitBlocks(SmallVectorImpl &ExitBlocks) const { for (const auto BB : blocks()) - for (const auto Succ : children(BB)) + for (const auto &Succ : children(BB)) if (!contains(Succ)) // Not in current loop? It must be an exit block. ExitBlocks.push_back(Succ); @@ -83,7 +83,7 @@ template void LoopBase:: getExitEdges(SmallVectorImpl &ExitEdges) const { for (const auto BB : blocks()) - for (const auto Succ : children(BB)) + for (const auto &Succ : children(BB)) if (!contains(Succ)) // Not in current loop? It must be an exit block. ExitEdges.emplace_back(BB, Succ); diff --git a/include/llvm/Analysis/RegionInfo.h b/include/llvm/Analysis/RegionInfo.h index caeb21d..16ee07f 100644 --- a/include/llvm/Analysis/RegionInfo.h +++ b/include/llvm/Analysis/RegionInfo.h @@ -708,10 +708,24 @@ class RegionInfoBase { /// The top level region. RegionT *TopLevelRegion; -private: /// Map every BB to the smallest region, that contains BB. 
BBtoRegionMap BBtoRegion; +protected: + /// \brief Update refences to a RegionInfoT held by the RegionT managed here + /// + /// This is a post-move helper. Regions hold references to the owning + /// RegionInfo object. After a move these need to be fixed. + template + void updateRegionTree(RegionInfoT &RI, TheRegionT *R) { + if (!R) + return; + R->RI = &RI; + for (auto &SubR : *R) + updateRegionTree(RI, SubR.get()); + } + +private: /// \brief Wipe this region tree's state without releasing any resources. /// /// This is essentially a post-move helper only. It leaves the object in an @@ -879,10 +893,12 @@ public: ~RegionInfo() override; - RegionInfo(RegionInfo &&Arg) - : Base(std::move(static_cast(Arg))) {} + RegionInfo(RegionInfo &&Arg) : Base(std::move(static_cast(Arg))) { + updateRegionTree(*this, TopLevelRegion); + } RegionInfo &operator=(RegionInfo &&RHS) { Base::operator=(std::move(static_cast(RHS))); + updateRegionTree(*this, TopLevelRegion); return *this; } diff --git a/include/llvm/Analysis/ScalarEvolution.h b/include/llvm/Analysis/ScalarEvolution.h index 91aeae0..54bc4dc 100644 --- a/include/llvm/Analysis/ScalarEvolution.h +++ b/include/llvm/Analysis/ScalarEvolution.h @@ -877,6 +877,47 @@ private: bool ControlsExit, bool AllowPredicates = false); + // Helper functions for computeExitLimitFromCond to avoid exponential time + // complexity. + + class ExitLimitCache { + // It may look like we need key on the whole (L, TBB, FBB, ControlsExit, + // AllowPredicates) tuple, but recursive calls to + // computeExitLimitFromCondCached from computeExitLimitFromCondImpl only + // vary the in \c ExitCond and \c ControlsExit parameters. We remember the + // initial values of the other values to assert our assumption. + SmallDenseMap, ExitLimit> TripCountMap; + + const Loop *L; + BasicBlock *TBB; + BasicBlock *FBB; + bool AllowPredicates; + + public: + ExitLimitCache(const Loop *L, BasicBlock *TBB, BasicBlock *FBB, + bool AllowPredicates) + : L(L), TBB(TBB), FBB(FBB), AllowPredicates(AllowPredicates) {} + + Optional find(const Loop *L, Value *ExitCond, BasicBlock *TBB, + BasicBlock *FBB, bool ControlsExit, + bool AllowPredicates); + + void insert(const Loop *L, Value *ExitCond, BasicBlock *TBB, + BasicBlock *FBB, bool ControlsExit, bool AllowPredicates, + const ExitLimit &EL); + }; + + typedef ExitLimitCache ExitLimitCacheTy; + ExitLimit computeExitLimitFromCondCached(ExitLimitCacheTy &Cache, + const Loop *L, Value *ExitCond, + BasicBlock *TBB, BasicBlock *FBB, + bool ControlsExit, + bool AllowPredicates); + ExitLimit computeExitLimitFromCondImpl(ExitLimitCacheTy &Cache, const Loop *L, + Value *ExitCond, BasicBlock *TBB, + BasicBlock *FBB, bool ControlsExit, + bool AllowPredicates); + /// Compute the number of times the backedge of the specified loop will /// execute if its exit condition were a conditional branch of the ICmpInst /// ExitCond, TBB, and FBB. 
If AllowPredicates is set, this call will try diff --git a/include/llvm/Analysis/ValueTracking.h b/include/llvm/Analysis/ValueTracking.h index e3c2f3b..764308d 100644 --- a/include/llvm/Analysis/ValueTracking.h +++ b/include/llvm/Analysis/ValueTracking.h @@ -29,6 +29,7 @@ template class ArrayRef; class DominatorTree; class GEPOperator; class Instruction; + struct KnownBits; class Loop; class LoopInfo; class OptimizationRemarkEmitter; @@ -49,7 +50,7 @@ template class ArrayRef; /// where V is a vector, the known zero and known one values are the /// same width as the vector element, and the bit is set only if it is true /// for all of the elements in the vector. - void computeKnownBits(const Value *V, APInt &KnownZero, APInt &KnownOne, + void computeKnownBits(const Value *V, KnownBits &Known, const DataLayout &DL, unsigned Depth = 0, AssumptionCache *AC = nullptr, const Instruction *CxtI = nullptr, diff --git a/include/llvm/CodeGen/DIE.h b/include/llvm/CodeGen/DIE.h index 95c4b42..a401473 100644 --- a/include/llvm/CodeGen/DIE.h +++ b/include/llvm/CodeGen/DIE.h @@ -793,6 +793,9 @@ class DIEUnit { uint32_t Length; /// The length in bytes of all of the DIEs in this unit. const uint16_t Version; /// The Dwarf version number for this unit. const uint8_t AddrSize; /// The size in bytes of an address for this unit. +protected: + ~DIEUnit() = default; + public: DIEUnit(uint16_t Version, uint8_t AddrSize, dwarf::Tag UnitTag); DIEUnit(const DIEUnit &RHS) = delete; @@ -808,6 +811,10 @@ public: this->Section = Section; } + virtual const MCSymbol *getCrossSectionRelativeBaseAddress() const { + return nullptr; + } + /// Return the section that this DIEUnit will be emitted into. /// /// \returns Section pointer which can be NULL. @@ -822,7 +829,11 @@ public: const DIE &getUnitDie() const { return Die; } }; - +struct BasicDIEUnit final : DIEUnit { + BasicDIEUnit(uint16_t Version, uint8_t AddrSize, dwarf::Tag UnitTag) + : DIEUnit(Version, AddrSize, UnitTag) {} +}; + //===--------------------------------------------------------------------===// /// DIELoc - Represents an expression location. // diff --git a/include/llvm/CodeGen/GlobalISel/InstructionSelector.h b/include/llvm/CodeGen/GlobalISel/InstructionSelector.h index 911e875..899563a 100644 --- a/include/llvm/CodeGen/GlobalISel/InstructionSelector.h +++ b/include/llvm/CodeGen/GlobalISel/InstructionSelector.h @@ -18,20 +18,52 @@ #include "llvm/ADT/Optional.h" #include +#include +#include namespace llvm { class MachineInstr; +class MachineInstrBuilder; +class MachineFunction; class MachineOperand; class MachineRegisterInfo; class RegisterBankInfo; class TargetInstrInfo; class TargetRegisterInfo; +/// Container class for CodeGen predicate results. +/// This is convenient because std::bitset does not have a constructor +/// with an initializer list of set bits. +/// +/// Each InstructionSelector subclass should define a PredicateBitset class with: +/// const unsigned MAX_SUBTARGET_PREDICATES = 192; +/// using PredicateBitset = PredicateBitsetImpl; +/// and updating the constant to suit the target. Tablegen provides a suitable +/// definition for the predicates in use in GenGlobalISel.inc when +/// GET_GLOBALISEL_PREDICATE_BITSET is defined. +template +class PredicateBitsetImpl : public std::bitset { +public: + // Cannot inherit constructors because it's not supported by VC++.. 
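  // Illustrative use of the initializer-list constructor defined below
  // (the 192-bit width follows the suggestion in the class comment above):
  //
  //   using PredicateBitset = PredicateBitsetImpl<192>;
  //   PredicateBitset Features = {3, 7, 42};  // sets exactly bits 3, 7 and 42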
+ PredicateBitsetImpl() = default; + + PredicateBitsetImpl(const std::bitset &B) + : std::bitset(B) {} + + PredicateBitsetImpl(std::initializer_list Init) { + for (auto I : Init) + std::bitset::set(I); + } +}; + /// Provides the logic to select generic machine instructions. class InstructionSelector { public: virtual ~InstructionSelector() {} + /// This is executed before selecting a function. + virtual void beginFunction(const MachineFunction &MF) {} + /// Select the (possibly generic) instruction \p I to only use target-specific /// opcodes. It is OK to insert multiple instructions, but they cannot be /// generic pre-isel instructions. @@ -46,6 +78,8 @@ public: virtual bool select(MachineInstr &I) const = 0; protected: + typedef std::function ComplexRendererFn; + InstructionSelector(); /// Mutate the newly-selected instruction \p I to constrain its (possibly diff --git a/include/llvm/CodeGen/MachineOperand.h b/include/llvm/CodeGen/MachineOperand.h index 81b4312..e163540 100644 --- a/include/llvm/CodeGen/MachineOperand.h +++ b/include/llvm/CodeGen/MachineOperand.h @@ -65,7 +65,6 @@ public: MO_CFIIndex, ///< MCCFIInstruction index. MO_IntrinsicID, ///< Intrinsic ID for ISel MO_Predicate, ///< Generic predicate for ISel - MO_Placeholder, ///< Placeholder for GlobalISel ComplexPattern result. }; private: @@ -768,11 +767,6 @@ public: return Op; } - static MachineOperand CreatePlaceholder() { - MachineOperand Op(MachineOperand::MO_Placeholder); - return Op; - } - friend class MachineInstr; friend class MachineRegisterInfo; private: diff --git a/include/llvm/CodeGen/SelectionDAG.h b/include/llvm/CodeGen/SelectionDAG.h index 6f05095..4bb6588 100644 --- a/include/llvm/CodeGen/SelectionDAG.h +++ b/include/llvm/CodeGen/SelectionDAG.h @@ -654,6 +654,15 @@ public: return getNode(ISD::BUILD_VECTOR, DL, VT, Ops); } + /// Return an ISD::BUILD_VECTOR node. The number of elements in VT, + /// which must be a vector type, must match the number of operands in Ops. + /// The operands must have the same type as (or, for integers, a type wider + /// than) VT's element type. + SDValue getBuildVector(EVT VT, const SDLoc &DL, ArrayRef Ops) { + // VerifySDNode (via InsertNode) checks BUILD_VECTOR later. + return getNode(ISD::BUILD_VECTOR, DL, VT, Ops); + } + /// Return a splat ISD::BUILD_VECTOR node, consisting of Op splatted to all /// elements. VT must be a vector type. Op's type must be the same as (or, /// for integers, a type wider than) VT's element type. @@ -968,7 +977,7 @@ public: bool IsExpanding = false); SDValue getMaskedStore(SDValue Chain, const SDLoc &dl, SDValue Val, SDValue Ptr, SDValue Mask, EVT MemVT, - MachineMemOperand *MMO, bool IsTruncating = false, + MachineMemOperand *MMO, bool IsTruncating = false, bool IsCompressing = false); SDValue getMaskedGather(SDVTList VTs, EVT VT, const SDLoc &dl, ArrayRef Ops, MachineMemOperand *MMO); diff --git a/include/llvm/DebugInfo/CodeView/CodeView.h b/include/llvm/DebugInfo/CodeView/CodeView.h index 2791c9d..e599f8a 100644 --- a/include/llvm/DebugInfo/CodeView/CodeView.h +++ b/include/llvm/DebugInfo/CodeView/CodeView.h @@ -546,7 +546,7 @@ enum class TrampolineType : uint16_t { TrampIncremental, BranchIsland }; // These values correspond to the CV_SourceChksum_t enumeration. 
enum class FileChecksumKind : uint8_t { None, MD5, SHA1, SHA256 }; -enum LineFlags : uint32_t { +enum LineFlags : uint16_t { HaveColumns = 1, // CV_LINES_HAVE_COLUMNS }; } diff --git a/include/llvm/DebugInfo/CodeView/ModuleSubstreamVisitor.h b/include/llvm/DebugInfo/CodeView/ModuleSubstreamVisitor.h index 1a40654..31344a9 100644 --- a/include/llvm/DebugInfo/CodeView/ModuleSubstreamVisitor.h +++ b/include/llvm/DebugInfo/CodeView/ModuleSubstreamVisitor.h @@ -81,7 +81,7 @@ public: BinaryStreamReader Reader(Stream); if (auto EC = Reader.readObject(BlockHeader)) return EC; - bool HasColumn = Header->Flags & LineFlags::HaveColumns; + bool HasColumn = Header->Flags & uint32_t(LineFlags::HaveColumns); uint32_t LineInfoSize = BlockHeader->NumLines * (sizeof(LineNumberEntry) + (HasColumn ? sizeof(ColumnNumberEntry) : 0)); diff --git a/include/llvm/DebugInfo/CodeView/TypeDumperBase.h b/include/llvm/DebugInfo/CodeView/TypeDumperBase.h deleted file mode 100644 index e69de29..0000000 --- a/include/llvm/DebugInfo/CodeView/TypeDumperBase.h +++ /dev/null diff --git a/include/llvm/DebugInfo/DWARF/DWARFCompileUnit.h b/include/llvm/DebugInfo/DWARF/DWARFCompileUnit.h index b2a4d24..a46d46a 100644 --- a/include/llvm/DebugInfo/DWARF/DWARFCompileUnit.h +++ b/include/llvm/DebugInfo/DWARF/DWARFCompileUnit.h @@ -18,9 +18,9 @@ namespace llvm { class DWARFCompileUnit : public DWARFUnit { public: DWARFCompileUnit(DWARFContext &Context, const DWARFSection &Section, - const DWARFDebugAbbrev *DA, StringRef RS, StringRef SS, - StringRef SOS, StringRef AOS, StringRef LS, bool LE, - bool IsDWO, const DWARFUnitSectionBase &UnitSection, + const DWARFDebugAbbrev *DA, const DWARFSection *RS, + StringRef SS, StringRef SOS, StringRef AOS, StringRef LS, + bool LE, bool IsDWO, const DWARFUnitSectionBase &UnitSection, const DWARFUnitIndex::Entry *Entry) : DWARFUnit(Context, Section, DA, RS, SS, SOS, AOS, LS, LE, IsDWO, UnitSection, Entry) {} diff --git a/include/llvm/DebugInfo/DWARF/DWARFContext.h b/include/llvm/DebugInfo/DWARF/DWARFContext.h index f941cdd..d89e2c6 100644 --- a/include/llvm/DebugInfo/DWARF/DWARFContext.h +++ b/include/llvm/DebugInfo/DWARF/DWARFContext.h @@ -50,6 +50,11 @@ class raw_ostream; // entire size of the debug info sections. typedef DenseMap> RelocAddrMap; +/// Reads a value from data extractor and applies a relocation to the result if +/// one exists for the given offset. +uint64_t getRelocatedValue(const DataExtractor &Data, uint32_t Size, + uint32_t *Off, const RelocAddrMap *Relocs); + /// DWARFContext /// This data structure is the top level entity that deals with dwarf debug /// information parsing. 
The actual data is supplied through pure virtual @@ -216,7 +221,7 @@ public: virtual StringRef getEHFrameSection() = 0; virtual const DWARFSection &getLineSection() = 0; virtual StringRef getStringSection() = 0; - virtual StringRef getRangeSection() = 0; + virtual const DWARFSection& getRangeSection() = 0; virtual StringRef getMacinfoSection() = 0; virtual StringRef getPubNamesSection() = 0; virtual StringRef getPubTypesSection() = 0; @@ -231,7 +236,7 @@ public: virtual const DWARFSection &getLocDWOSection() = 0; virtual StringRef getStringDWOSection() = 0; virtual StringRef getStringOffsetDWOSection() = 0; - virtual StringRef getRangeDWOSection() = 0; + virtual const DWARFSection &getRangeDWOSection() = 0; virtual StringRef getAddrSection() = 0; virtual const DWARFSection& getAppleNamesSection() = 0; virtual const DWARFSection& getAppleTypesSection() = 0; @@ -271,7 +276,7 @@ class DWARFContextInMemory : public DWARFContext { StringRef EHFrameSection; DWARFSection LineSection; StringRef StringSection; - StringRef RangeSection; + DWARFSection RangeSection; StringRef MacinfoSection; StringRef PubNamesSection; StringRef PubTypesSection; @@ -286,7 +291,7 @@ class DWARFContextInMemory : public DWARFContext { DWARFSection LocDWOSection; StringRef StringDWOSection; StringRef StringOffsetDWOSection; - StringRef RangeDWOSection; + DWARFSection RangeDWOSection; StringRef AddrSection; DWARFSection AppleNamesSection; DWARFSection AppleTypesSection; @@ -319,7 +324,7 @@ public: StringRef getEHFrameSection() override { return EHFrameSection; } const DWARFSection &getLineSection() override { return LineSection; } StringRef getStringSection() override { return StringSection; } - StringRef getRangeSection() override { return RangeSection; } + const DWARFSection &getRangeSection() override { return RangeSection; } StringRef getMacinfoSection() override { return MacinfoSection; } StringRef getPubNamesSection() override { return PubNamesSection; } StringRef getPubTypesSection() override { return PubTypesSection; } @@ -346,7 +351,7 @@ public: return StringOffsetDWOSection; } - StringRef getRangeDWOSection() override { return RangeDWOSection; } + const DWARFSection &getRangeDWOSection() override { return RangeDWOSection; } StringRef getAddrSection() override { return AddrSection; diff --git a/include/llvm/DebugInfo/DWARF/DWARFDebugRangeList.h b/include/llvm/DebugInfo/DWARF/DWARFDebugRangeList.h index 018a049..9172df5 100644 --- a/include/llvm/DebugInfo/DWARF/DWARFDebugRangeList.h +++ b/include/llvm/DebugInfo/DWARF/DWARFDebugRangeList.h @@ -11,6 +11,8 @@ #define LLVM_DEBUGINFO_DWARF_DWARFDEBUGRANGELIST_H #include "llvm/Support/DataExtractor.h" +#include "llvm/DebugInfo/DWARF/DWARFRelocMap.h" + #include #include #include @@ -71,7 +73,7 @@ public: void clear(); void dump(raw_ostream &OS) const; - bool extract(DataExtractor data, uint32_t *offset_ptr); + bool extract(DataExtractor data, uint32_t *offset_ptr, const RelocAddrMap& Relocs); const std::vector &getEntries() { return Entries; } /// getAbsoluteRanges - Returns absolute address ranges defined by this range diff --git a/include/llvm/DebugInfo/DWARF/DWARFTypeUnit.h b/include/llvm/DebugInfo/DWARF/DWARFTypeUnit.h index 7033160..c9da2c9 100644 --- a/include/llvm/DebugInfo/DWARF/DWARFTypeUnit.h +++ b/include/llvm/DebugInfo/DWARF/DWARFTypeUnit.h @@ -30,9 +30,9 @@ private: public: DWARFTypeUnit(DWARFContext &Context, const DWARFSection &Section, - const DWARFDebugAbbrev *DA, StringRef RS, StringRef SS, - StringRef SOS, StringRef AOS, StringRef LS, bool LE, bool 
IsDWO, - const DWARFUnitSectionBase &UnitSection, + const DWARFDebugAbbrev *DA, const DWARFSection *RS, + StringRef SS, StringRef SOS, StringRef AOS, StringRef LS, + bool LE, bool IsDWO, const DWARFUnitSectionBase &UnitSection, const DWARFUnitIndex::Entry *Entry) : DWARFUnit(Context, Section, DA, RS, SS, SOS, AOS, LS, LE, IsDWO, UnitSection, Entry) {} diff --git a/include/llvm/DebugInfo/DWARF/DWARFUnit.h b/include/llvm/DebugInfo/DWARF/DWARFUnit.h index 023a0f7..e29ba52 100644 --- a/include/llvm/DebugInfo/DWARF/DWARFUnit.h +++ b/include/llvm/DebugInfo/DWARF/DWARFUnit.h @@ -56,9 +56,9 @@ protected: ~DWARFUnitSectionBase() = default; virtual void parseImpl(DWARFContext &Context, const DWARFSection &Section, - const DWARFDebugAbbrev *DA, StringRef RS, StringRef SS, - StringRef SOS, StringRef AOS, StringRef LS, - bool isLittleEndian, bool isDWO) = 0; + const DWARFDebugAbbrev *DA, const DWARFSection *RS, + StringRef SS, StringRef SOS, StringRef AOS, + StringRef LS, bool isLittleEndian, bool isDWO) = 0; }; const DWARFUnitIndex &getDWARFUnitIndex(DWARFContext &Context, @@ -88,9 +88,9 @@ public: private: void parseImpl(DWARFContext &Context, const DWARFSection &Section, - const DWARFDebugAbbrev *DA, StringRef RS, StringRef SS, - StringRef SOS, StringRef AOS, StringRef LS, bool LE, - bool IsDWO) override { + const DWARFDebugAbbrev *DA, const DWARFSection *RS, + StringRef SS, StringRef SOS, StringRef AOS, StringRef LS, + bool LE, bool IsDWO) override { if (Parsed) return; const auto &Index = getDWARFUnitIndex(Context, UnitType::Section); @@ -115,7 +115,7 @@ class DWARFUnit { const DWARFSection &InfoSection; const DWARFDebugAbbrev *Abbrev; - StringRef RangeSection; + const DWARFSection *RangeSection; uint32_t RangeSectionBase; StringRef LineSection; StringRef StringSection; @@ -171,7 +171,7 @@ protected: public: DWARFUnit(DWARFContext &Context, const DWARFSection &Section, - const DWARFDebugAbbrev *DA, StringRef RS, StringRef SS, + const DWARFDebugAbbrev *DA, const DWARFSection *RS, StringRef SS, StringRef SOS, StringRef AOS, StringRef LS, bool LE, bool IsDWO, const DWARFUnitSectionBase &UnitSection, const DWARFUnitIndex::Entry *IndexEntry = nullptr); @@ -192,7 +192,7 @@ public: // Recursively update address to Die map. 
void updateAddressDieMap(DWARFDie Die); - void setRangesSection(StringRef RS, uint32_t Base) { + void setRangesSection(const DWARFSection *RS, uint32_t Base) { RangeSection = RS; RangeSectionBase = Base; } diff --git a/include/llvm/DebugInfo/PDB/DIA/DIARawSymbol.h b/include/llvm/DebugInfo/PDB/DIA/DIARawSymbol.h index c0633cb..3710eb2 100644 --- a/include/llvm/DebugInfo/PDB/DIA/DIARawSymbol.h +++ b/include/llvm/DebugInfo/PDB/DIA/DIARawSymbol.h @@ -102,7 +102,8 @@ public: uint32_t getVirtualBaseDispIndex() const override; uint32_t getVirtualBaseOffset() const override; uint32_t getVirtualTableShapeId() const override; - std::unique_ptr getVirtualBaseTableType() const override; + std::unique_ptr + getVirtualBaseTableType() const override; PDB_DataKind getDataKind() const override; PDB_SymType getSymTag() const override; PDB_UniqueId getGuid() const override; diff --git a/include/llvm/DebugInfo/PDB/IPDBRawSymbol.h b/include/llvm/DebugInfo/PDB/IPDBRawSymbol.h index 4c28e19..fab086c 100644 --- a/include/llvm/DebugInfo/PDB/IPDBRawSymbol.h +++ b/include/llvm/DebugInfo/PDB/IPDBRawSymbol.h @@ -113,7 +113,7 @@ public: virtual Variant getValue() const = 0; virtual uint32_t getVirtualBaseDispIndex() const = 0; virtual uint32_t getVirtualBaseOffset() const = 0; - virtual std::unique_ptr + virtual std::unique_ptr getVirtualBaseTableType() const = 0; virtual uint32_t getVirtualTableShapeId() const = 0; virtual PDB_DataKind getDataKind() const = 0; diff --git a/include/llvm/DebugInfo/PDB/Native/ModStream.h b/include/llvm/DebugInfo/PDB/Native/ModStream.h index d65e195..b12d4ff 100644 --- a/include/llvm/DebugInfo/PDB/Native/ModStream.h +++ b/include/llvm/DebugInfo/PDB/Native/ModStream.h @@ -40,6 +40,8 @@ public: iterator_range lines(bool *HadError) const; + bool hasLineInfo() const; + Error commit(); private: diff --git a/include/llvm/DebugInfo/PDB/Native/NativeRawSymbol.h b/include/llvm/DebugInfo/PDB/Native/NativeRawSymbol.h index cffb5d0..e1e7803 100644 --- a/include/llvm/DebugInfo/PDB/Native/NativeRawSymbol.h +++ b/include/llvm/DebugInfo/PDB/Native/NativeRawSymbol.h @@ -101,7 +101,8 @@ public: uint32_t getVirtualBaseDispIndex() const override; uint32_t getVirtualBaseOffset() const override; uint32_t getVirtualTableShapeId() const override; - std::unique_ptr getVirtualBaseTableType() const override; + std::unique_ptr + getVirtualBaseTableType() const override; PDB_DataKind getDataKind() const override; PDB_SymType getSymTag() const override; PDB_UniqueId getGuid() const override; diff --git a/include/llvm/DebugInfo/PDB/UDTLayout.h b/include/llvm/DebugInfo/PDB/UDTLayout.h index e3dcba5..6bc3660 100644 --- a/include/llvm/DebugInfo/PDB/UDTLayout.h +++ b/include/llvm/DebugInfo/PDB/UDTLayout.h @@ -15,6 +15,7 @@ #include "llvm/ADT/ArrayRef.h" #include "llvm/ADT/BitVector.h" +#include "llvm/ADT/DenseMap.h" #include #include @@ -32,40 +33,63 @@ class PDBSymbolTypeVTable; class ClassLayout; class BaseClassLayout; -class StorageItemBase; +class LayoutItemBase; class UDTLayoutBase; -class StorageItemBase { +class LayoutItemBase { public: - StorageItemBase(const UDTLayoutBase &Parent, const PDBSymbol &Symbol, - const std::string &Name, uint32_t OffsetInParent, - uint32_t Size); - virtual ~StorageItemBase() {} + LayoutItemBase(const UDTLayoutBase *Parent, const PDBSymbol *Symbol, + const std::string &Name, uint32_t OffsetInParent, + uint32_t Size, bool IsElided); + virtual ~LayoutItemBase() {} - virtual uint32_t deepPaddingSize() const; + uint32_t deepPaddingSize() const; + virtual uint32_t immediatePadding() const { 
return 0; } + virtual uint32_t tailPadding() const; - const UDTLayoutBase &getParent() const { return Parent; } + const UDTLayoutBase *getParent() const { return Parent; } StringRef getName() const { return Name; } uint32_t getOffsetInParent() const { return OffsetInParent; } uint32_t getSize() const { return SizeOf; } - const PDBSymbol &getSymbol() const { return Symbol; } + uint32_t getLayoutSize() const { return LayoutSize; } + const PDBSymbol *getSymbol() const { return Symbol; } + const BitVector &usedBytes() const { return UsedBytes; } + bool isElided() const { return IsElided; } + virtual bool isVBPtr() const { return false; } + + uint32_t containsOffset(uint32_t Off) const { + uint32_t Begin = getOffsetInParent(); + uint32_t End = Begin + getSize(); + return (Off >= Begin && Off < End); + } protected: - const UDTLayoutBase &Parent; - const PDBSymbol &Symbol; + const PDBSymbol *Symbol = nullptr; + const UDTLayoutBase *Parent = nullptr; BitVector UsedBytes; std::string Name; uint32_t OffsetInParent = 0; uint32_t SizeOf = 0; + uint32_t LayoutSize = 0; + bool IsElided = false; +}; + +class VBPtrLayoutItem : public LayoutItemBase { +public: + VBPtrLayoutItem(const UDTLayoutBase &Parent, + std::unique_ptr Sym, uint32_t Offset, + uint32_t Size); + virtual bool isVBPtr() const { return true; } + +private: + std::unique_ptr Type; }; -class DataMemberLayoutItem : public StorageItemBase { +class DataMemberLayoutItem : public LayoutItemBase { public: DataMemberLayoutItem(const UDTLayoutBase &Parent, std::unique_ptr DataMember); - virtual uint32_t deepPaddingSize() const; - const PDBSymbolData &getDataMember(); bool hasUDTLayout() const; const ClassLayout &getUDTLayout() const; @@ -75,77 +99,73 @@ private: std::unique_ptr UdtLayout; }; -class VTableLayoutItem : public StorageItemBase { +class VTableLayoutItem : public LayoutItemBase { public: VTableLayoutItem(const UDTLayoutBase &Parent, std::unique_ptr VTable); - ArrayRef funcs() const { return VTableFuncs; } uint32_t getElementSize() const { return ElementSize; } - void setFunction(uint32_t Index, PDBSymbolFunc &Func) { - VTableFuncs[Index] = &Func; - } - private: uint32_t ElementSize = 0; - std::unique_ptr Shape; std::unique_ptr VTable; - std::vector VTableFuncs; }; -class UDTLayoutBase { +class UDTLayoutBase : public LayoutItemBase { template using UniquePtrVector = std::vector>; public: - UDTLayoutBase(const PDBSymbol &Symbol, const std::string &Name, - uint32_t Size); - - uint32_t shallowPaddingSize() const; - uint32_t deepPaddingSize() const; - - const BitVector &usedBytes() const { return UsedBytes; } + UDTLayoutBase(const UDTLayoutBase *Parent, const PDBSymbol &Sym, + const std::string &Name, uint32_t OffsetInParent, uint32_t Size, + bool IsElided); - uint32_t getClassSize() const { return SizeOf; } + uint32_t tailPadding() const override; - ArrayRef> layout_items() const { - return ChildStorage; - } - - VTableLayoutItem *findVTableAtOffset(uint32_t RelativeOffset); + ArrayRef layout_items() const { return LayoutItems; } - StringRef getUDTName() const { return Name; } + ArrayRef bases() const { return AllBases; } + ArrayRef regular_bases() const { return NonVirtualBases; } + ArrayRef virtual_bases() const { return VirtualBases; } - ArrayRef bases() const { return BaseClasses; } - ArrayRef> vbases() const { - return VirtualBases; - } + uint32_t directVirtualBaseCount() const { return DirectVBaseCount; } ArrayRef> funcs() const { return Funcs; } ArrayRef> other_items() const { return Other; } - const PDBSymbol &getSymbolBase() const { 
return SymbolBase; } - protected: + bool hasVBPtrAtOffset(uint32_t Off) const; void initializeChildren(const PDBSymbol &Sym); - void addChildToLayout(std::unique_ptr Child); - void addVirtualOverride(PDBSymbolFunc &Func); - void addVirtualIntro(PDBSymbolFunc &Func); + void addChildToLayout(std::unique_ptr Child); - const PDBSymbol &SymbolBase; - std::string Name; - uint32_t SizeOf = 0; + uint32_t DirectVBaseCount = 0; - BitVector UsedBytes; UniquePtrVector Other; UniquePtrVector Funcs; - UniquePtrVector VirtualBases; - UniquePtrVector ChildStorage; - std::vector> ChildrenPerByte; - std::vector BaseClasses; + UniquePtrVector ChildStorage; + std::vector LayoutItems; + + std::vector AllBases; + ArrayRef NonVirtualBases; + ArrayRef VirtualBases; + VTableLayoutItem *VTable = nullptr; + VBPtrLayoutItem *VBPtr = nullptr; +}; + +class BaseClassLayout : public UDTLayoutBase { +public: + BaseClassLayout(const UDTLayoutBase &Parent, uint32_t OffsetInParent, + bool Elide, std::unique_ptr Base); + + const PDBSymbolTypeBaseClass &getBase() const { return *Base; } + bool isVirtualBase() const { return IsVirtualBase; } + bool isEmptyBase() { return SizeOf == 1 && LayoutSize == 0; } + +private: + std::unique_ptr Base; + bool IsVirtualBase; }; class ClassLayout : public UDTLayoutBase { @@ -156,24 +176,13 @@ public: ClassLayout(ClassLayout &&Other) = default; const PDBSymbolTypeUDT &getClass() const { return UDT; } + uint32_t immediatePadding() const override; private: + BitVector ImmediateUsedBytes; std::unique_ptr OwnedStorage; const PDBSymbolTypeUDT &UDT; }; - -class BaseClassLayout : public UDTLayoutBase, public StorageItemBase { -public: - BaseClassLayout(const UDTLayoutBase &Parent, - std::unique_ptr Base); - - const PDBSymbolTypeBaseClass &getBase() const { return *Base; } - bool isVirtualBase() const { return IsVirtualBase; } - -private: - std::unique_ptr Base; - bool IsVirtualBase; -}; } } // namespace llvm diff --git a/include/llvm/ExecutionEngine/Orc/RPCSerialization.h b/include/llvm/ExecutionEngine/Orc/RPCSerialization.h index 84a037b..a3be242 100644 --- a/include/llvm/ExecutionEngine/Orc/RPCSerialization.h +++ b/include/llvm/ExecutionEngine/Orc/RPCSerialization.h @@ -348,7 +348,7 @@ public: // key of the deserializers map to save us from duplicating the string in // the serializer. This should be changed to use a stringpool if we switch // to a map type that may move keys in memory. - std::lock_guard Lock(DeserializersMutex); + std::lock_guard Lock(DeserializersMutex); auto I = Deserializers.insert(Deserializers.begin(), std::make_pair(std::move(Name), @@ -358,7 +358,7 @@ public: { assert(KeyName != nullptr && "No keyname pointer"); - std::lock_guard Lock(SerializersMutex); + std::lock_guard Lock(SerializersMutex); // FIXME: Move capture Serialize once we have C++14. 
Serializers[ErrorInfoT::classID()] = [KeyName, Serialize](ChannelT &C, const ErrorInfoBase &EIB) -> Error { @@ -372,7 +372,8 @@ public: } static Error serialize(ChannelT &C, Error &&Err) { - std::lock_guard Lock(SerializersMutex); + std::lock_guard Lock(SerializersMutex); + if (!Err) return serializeSeq(C, std::string()); @@ -386,7 +387,7 @@ public: } static Error deserialize(ChannelT &C, Error &Err) { - std::lock_guard Lock(DeserializersMutex); + std::lock_guard Lock(DeserializersMutex); std::string Key; if (auto Err = deserializeSeq(C, Key)) @@ -406,8 +407,6 @@ public: private: static Error serializeAsStringError(ChannelT &C, const ErrorInfoBase &EIB) { - assert(EIB.dynamicClassID() != StringError::classID() && - "StringError serialization not registered"); std::string ErrMsg; { raw_string_ostream ErrMsgStream(ErrMsg); @@ -417,17 +416,17 @@ private: inconvertibleErrorCode())); } - static std::mutex SerializersMutex; - static std::mutex DeserializersMutex; + static std::recursive_mutex SerializersMutex; + static std::recursive_mutex DeserializersMutex; static std::map Serializers; static std::map Deserializers; }; template -std::mutex SerializationTraits::SerializersMutex; +std::recursive_mutex SerializationTraits::SerializersMutex; template -std::mutex SerializationTraits::DeserializersMutex; +std::recursive_mutex SerializationTraits::DeserializersMutex; template std::map::WrappedErrorDeserializer> SerializationTraits::Deserializers; +/// Registers a serializer and deserializer for the given error type on the +/// given channel type. +template +void registerErrorSerialization(std::string Name, SerializeFtor &&Serialize, + DeserializeFtor &&Deserialize) { + SerializationTraits::template registerErrorType( + std::move(Name), + std::forward(Serialize), + std::forward(Deserialize)); +} + +/// Registers serialization/deserialization for StringError. template void registerStringError() { static bool AlreadyRegistered = false; if (!AlreadyRegistered) { - SerializationTraits:: - template registerErrorType( - "StringError", - [](ChannelT &C, const StringError &SE) { - return serializeSeq(C, SE.getMessage()); - }, - [](ChannelT &C, Error &Err) { - ErrorAsOutParameter EAO(&Err); - std::string Msg; - if (auto E2 = deserializeSeq(C, Msg)) - return E2; - Err = - make_error(std::move(Msg), - orcError( - OrcErrorCode::UnknownErrorCodeFromRemote)); - return Error::success(); - }); + registerErrorSerialization( + "StringError", + [](ChannelT &C, const StringError &SE) { + return serializeSeq(C, SE.getMessage()); + }, + [](ChannelT &C, Error &Err) -> Error { + ErrorAsOutParameter EAO(&Err); + std::string Msg; + if (auto E2 = deserializeSeq(C, Msg)) + return E2; + Err = + make_error(std::move(Msg), + orcError( + OrcErrorCode::UnknownErrorCodeFromRemote)); + return Error::success(); + }); AlreadyRegistered = true; } } diff --git a/include/llvm/IR/Attributes.h b/include/llvm/IR/Attributes.h index b13f197..e2cd4c2 100644 --- a/include/llvm/IR/Attributes.h +++ b/include/llvm/IR/Attributes.h @@ -509,7 +509,7 @@ public: unsigned getSlotIndex(unsigned Slot) const; /// \brief Return the attributes at the given slot. - AttributeList getSlotAttributes(unsigned Slot) const; + AttributeSet getSlotAttributes(unsigned Slot) const; void dump() const; }; diff --git a/include/llvm/IR/ConstantRange.h b/include/llvm/IR/ConstantRange.h index 47004e8..fd7f96a 100644 --- a/include/llvm/IR/ConstantRange.h +++ b/include/llvm/IR/ConstantRange.h @@ -93,7 +93,7 @@ public: /// /// NB! 
The returned set does *not* contain **all** possible values of X for /// which "X BinOpC Y" does not wrap -- some viable values of X may be - /// missing, so you cannot use this to contrain X's range. E.g. in the last + /// missing, so you cannot use this to constrain X's range. E.g. in the last /// example, "(-2) + 1" is both nsw and nuw (so the "X" could be -2), but (-2) /// is not in the set returned. /// diff --git a/include/llvm/IR/Dominators.h b/include/llvm/IR/Dominators.h index cae03d3..8f6c85f 100644 --- a/include/llvm/IR/Dominators.h +++ b/include/llvm/IR/Dominators.h @@ -157,6 +157,10 @@ public: /// This should only be used for debugging as it aborts the program if the /// verification fails. void verifyDomTree() const; + + // Pop up a GraphViz/gv window with the Dominator Tree rendered using `dot`. + void viewGraph(const Twine &Name, const Twine &Title); + void viewGraph(); }; //===------------------------------------- diff --git a/include/llvm/IR/IntrinsicsAMDGPU.td b/include/llvm/IR/IntrinsicsAMDGPU.td index 5415c6b..21d8a15 100644 --- a/include/llvm/IR/IntrinsicsAMDGPU.td +++ b/include/llvm/IR/IntrinsicsAMDGPU.td @@ -629,6 +629,8 @@ def int_amdgcn_readfirstlane : GCCBuiltin<"__builtin_amdgcn_readfirstlane">, Intrinsic<[llvm_i32_ty], [llvm_i32_ty], [IntrNoMem, IntrConvergent]>; +// The lane argument must be uniform across the currently active threads of the +// current wave. Otherwise, the result is undefined. def int_amdgcn_readlane : GCCBuiltin<"__builtin_amdgcn_readlane">, Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty], [IntrNoMem, IntrConvergent]>; diff --git a/include/llvm/IR/Module.h b/include/llvm/IR/Module.h index 70c57cf..67c35cd 100644 --- a/include/llvm/IR/Module.h +++ b/include/llvm/IR/Module.h @@ -319,7 +319,7 @@ public: /// exist, add a prototype for the function and return it. This function /// guarantees to return a constant of pointer to the specified function type /// or a ConstantExpr BitCast of that type if the named function has a - /// different type. This version of the method takes a null terminated list of + /// different type. This version of the method takes a list of /// function arguments, which makes it easier for clients to use. template Constant *getOrInsertFunction(StringRef Name, diff --git a/include/llvm/IR/Value.h b/include/llvm/IR/Value.h index a4b48d7..00f8213 100644 --- a/include/llvm/IR/Value.h +++ b/include/llvm/IR/Value.h @@ -482,6 +482,17 @@ public: static_cast(this)->stripPointerCasts()); } + /// \brief Strip off pointer casts, all-zero GEPs, aliases and barriers. + /// + /// Returns the original uncasted value. If this is called on a non-pointer + /// value, it returns 'this'. This function should be used only in + /// Alias analysis. + const Value *stripPointerCastsAndBarriers() const; + Value *stripPointerCastsAndBarriers() { + return const_cast( + static_cast(this)->stripPointerCastsAndBarriers()); + } + /// \brief Strip off pointer casts and all-zero GEPs. /// /// Returns the original uncasted value. If this is called on a non-pointer diff --git a/include/llvm/MC/MCTargetOptions.h b/include/llvm/MC/MCTargetOptions.h index 06f58d4..ab027ab 100644 --- a/include/llvm/MC/MCTargetOptions.h +++ b/include/llvm/MC/MCTargetOptions.h @@ -54,6 +54,7 @@ public: int DwarfVersion = 0; std::string ABIName; + std::string SplitDwarfFile; /// Additional paths to search for `.include` directives when using the /// integrated assembler. 
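To illustrate the Value::stripPointerCastsAndBarriers() addition above: the intent is that alias analysis can compare underlying objects while also looking through barrier operations that stripPointerCasts() leaves in place. The helper below is a hedged sketch with an invented name, not an LLVM API.

    #include "llvm/IR/Value.h"
    using namespace llvm;

    // Illustrative only: compare the stripped form of two pointers the way an
    // alias-analysis style query might.
    static bool isSameStrippedObject(const Value *A, const Value *B) {
      return A->stripPointerCastsAndBarriers() ==
             B->stripPointerCastsAndBarriers();
    }
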
diff --git a/include/llvm/Object/ELF.h b/include/llvm/Object/ELF.h index 7a3155b..9c72bd4 100644 --- a/include/llvm/Object/ELF.h +++ b/include/llvm/Object/ELF.h @@ -14,9 +14,19 @@ #ifndef LLVM_OBJECT_ELF_H #define LLVM_OBJECT_ELF_H +#include "llvm/ADT/ArrayRef.h" #include "llvm/ADT/SmallVector.h" +#include "llvm/ADT/StringRef.h" #include "llvm/Object/ELFTypes.h" -#include "llvm/Support/MemoryBuffer.h" +#include "llvm/Object/Error.h" +#include "llvm/Support/ELF.h" +#include "llvm/Support/Endian.h" +#include "llvm/Support/Error.h" +#include +#include +#include +#include +#include namespace llvm { namespace object { @@ -41,27 +51,27 @@ template class ELFFile { public: LLVM_ELF_IMPORT_TYPES_ELFT(ELFT) - typedef typename ELFT::uint uintX_t; - typedef typename ELFT::Ehdr Elf_Ehdr; - typedef typename ELFT::Shdr Elf_Shdr; - typedef typename ELFT::Sym Elf_Sym; - typedef typename ELFT::Dyn Elf_Dyn; - typedef typename ELFT::Phdr Elf_Phdr; - typedef typename ELFT::Rel Elf_Rel; - typedef typename ELFT::Rela Elf_Rela; - typedef typename ELFT::Verdef Elf_Verdef; - typedef typename ELFT::Verdaux Elf_Verdaux; - typedef typename ELFT::Verneed Elf_Verneed; - typedef typename ELFT::Vernaux Elf_Vernaux; - typedef typename ELFT::Versym Elf_Versym; - typedef typename ELFT::Hash Elf_Hash; - typedef typename ELFT::GnuHash Elf_GnuHash; - typedef typename ELFT::DynRange Elf_Dyn_Range; - typedef typename ELFT::ShdrRange Elf_Shdr_Range; - typedef typename ELFT::SymRange Elf_Sym_Range; - typedef typename ELFT::RelRange Elf_Rel_Range; - typedef typename ELFT::RelaRange Elf_Rela_Range; - typedef typename ELFT::PhdrRange Elf_Phdr_Range; + using uintX_t = typename ELFT::uint; + using Elf_Ehdr = typename ELFT::Ehdr; + using Elf_Shdr = typename ELFT::Shdr; + using Elf_Sym = typename ELFT::Sym; + using Elf_Dyn = typename ELFT::Dyn; + using Elf_Phdr = typename ELFT::Phdr; + using Elf_Rel = typename ELFT::Rel; + using Elf_Rela = typename ELFT::Rela; + using Elf_Verdef = typename ELFT::Verdef; + using Elf_Verdaux = typename ELFT::Verdaux; + using Elf_Verneed = typename ELFT::Verneed; + using Elf_Vernaux = typename ELFT::Vernaux; + using Elf_Versym = typename ELFT::Versym; + using Elf_Hash = typename ELFT::Hash; + using Elf_GnuHash = typename ELFT::GnuHash; + using Elf_Dyn_Range = typename ELFT::DynRange; + using Elf_Shdr_Range = typename ELFT::ShdrRange; + using Elf_Sym_Range = typename ELFT::SymRange; + using Elf_Rel_Range = typename ELFT::RelRange; + using Elf_Rela_Range = typename ELFT::RelaRange; + using Elf_Phdr_Range = typename ELFT::PhdrRange; const uint8_t *base() const { return reinterpret_cast(Buf.data()); @@ -70,7 +80,6 @@ public: size_t getBufSize() const { return Buf.size(); } private: - StringRef Buf; public: @@ -161,10 +170,10 @@ public: Expected> getSectionContents(const Elf_Shdr *Sec) const; }; -typedef ELFFile> ELF32LEFile; -typedef ELFFile> ELF64LEFile; -typedef ELFFile> ELF32BEFile; -typedef ELFFile> ELF64BEFile; +using ELF32LEFile = ELFFile>; +using ELF64LEFile = ELFFile>; +using ELF32BEFile = ELFFile>; +using ELF64BEFile = ELFFile>; template inline Expected @@ -194,7 +203,7 @@ ELFFile::getSectionIndex(const Elf_Sym *Sym, Elf_Sym_Range Syms, ArrayRef ShndxTable) const { uint32_t Index = Sym->st_shndx; if (Index == ELF::SHN_XINDEX) { - auto ErrorOrIndex = object::getExtendedSymbolTableIndex( + auto ErrorOrIndex = getExtendedSymbolTableIndex( Sym, Syms.begin(), ShndxTable); if (!ErrorOrIndex) return ErrorOrIndex.takeError(); @@ -519,7 +528,8 @@ inline unsigned hashSysV(StringRef SymbolName) { } return h; } + } 
// end namespace object } // end namespace llvm -#endif +#endif // LLVM_OBJECT_ELF_H diff --git a/include/llvm/Object/ELFObjectFile.h b/include/llvm/Object/ELFObjectFile.h index 9e95f29..d8b58b8 100644 --- a/include/llvm/Object/ELFObjectFile.h +++ b/include/llvm/Object/ELFObjectFile.h @@ -27,6 +27,7 @@ #include "llvm/Object/ObjectFile.h" #include "llvm/Object/SymbolicFile.h" #include "llvm/Support/ARMAttributeParser.h" +#include "llvm/Support/ARMBuildAttributes.h" #include "llvm/Support/Casting.h" #include "llvm/Support/ELF.h" #include "llvm/Support/Endian.h" @@ -42,13 +43,11 @@ namespace llvm { namespace object { class elf_symbol_iterator; -class ELFSymbolRef; -class ELFRelocationRef; class ELFObjectFileBase : public ObjectFile { - friend class ELFSymbolRef; - friend class ELFSectionRef; friend class ELFRelocationRef; + friend class ELFSectionRef; + friend class ELFSymbolRef; protected: ELFObjectFileBase(unsigned int Type, MemoryBufferRef Source); @@ -65,7 +64,8 @@ protected: virtual ErrorOr getRelocationAddend(DataRefImpl Rel) const = 0; public: - typedef iterator_range elf_symbol_iterator_range; + using elf_symbol_iterator_range = iterator_range; + virtual elf_symbol_iterator_range getDynamicSymbolIterators() const = 0; elf_symbol_iterator_range symbols() const; @@ -201,14 +201,14 @@ template class ELFObjectFile : public ELFObjectFileBase { public: LLVM_ELF_IMPORT_TYPES_ELFT(ELFT) - typedef typename ELFFile::uintX_t uintX_t; + using uintX_t = typename ELFFile::uintX_t; - typedef typename ELFFile::Elf_Sym Elf_Sym; - typedef typename ELFFile::Elf_Shdr Elf_Shdr; - typedef typename ELFFile::Elf_Ehdr Elf_Ehdr; - typedef typename ELFFile::Elf_Rel Elf_Rel; - typedef typename ELFFile::Elf_Rela Elf_Rela; - typedef typename ELFFile::Elf_Dyn Elf_Dyn; + using Elf_Sym = typename ELFFile::Elf_Sym; + using Elf_Shdr = typename ELFFile::Elf_Shdr; + using Elf_Ehdr = typename ELFFile::Elf_Ehdr; + using Elf_Rel = typename ELFFile::Elf_Rel; + using Elf_Rela = typename ELFFile::Elf_Rela; + using Elf_Dyn = typename ELFFile::Elf_Dyn; protected: ELFFile EF; @@ -398,10 +398,10 @@ public: bool isRelocatableObject() const override; }; -typedef ELFObjectFile> ELF32LEObjectFile; -typedef ELFObjectFile> ELF64LEObjectFile; -typedef ELFObjectFile> ELF32BEObjectFile; -typedef ELFObjectFile> ELF64BEObjectFile; +using ELF32LEObjectFile = ELFObjectFile>; +using ELF64LEObjectFile = ELFObjectFile>; +using ELF32BEObjectFile = ELFObjectFile>; +using ELF64BEObjectFile = ELFObjectFile>; template void ELFObjectFile::moveSymbolNext(DataRefImpl &Sym) const { diff --git a/include/llvm/Object/ELFTypes.h b/include/llvm/Object/ELFTypes.h index 3e03fd8..99346fe 100644 --- a/include/llvm/Object/ELFTypes.h +++ b/include/llvm/Object/ELFTypes.h @@ -11,10 +11,15 @@ #define LLVM_OBJECT_ELFTYPES_H #include "llvm/ADT/ArrayRef.h" +#include "llvm/ADT/StringRef.h" #include "llvm/Object/Error.h" #include "llvm/Support/ELF.h" #include "llvm/Support/Endian.h" -#include "llvm/Support/ErrorOr.h" +#include "llvm/Support/Error.h" +#include +#include +#include +#include namespace llvm { namespace object { @@ -45,58 +50,58 @@ public: static const endianness TargetEndianness = E; static const bool Is64Bits = Is64; - typedef typename std::conditional::type uint; - typedef Elf_Ehdr_Impl> Ehdr; - typedef Elf_Shdr_Impl> Shdr; - typedef Elf_Sym_Impl> Sym; - typedef Elf_Dyn_Impl> Dyn; - typedef Elf_Phdr_Impl> Phdr; - typedef Elf_Rel_Impl, false> Rel; - typedef Elf_Rel_Impl, true> Rela; - typedef Elf_Verdef_Impl> Verdef; - typedef Elf_Verdaux_Impl> Verdaux; - 
typedef Elf_Verneed_Impl> Verneed; - typedef Elf_Vernaux_Impl> Vernaux; - typedef Elf_Versym_Impl> Versym; - typedef Elf_Hash_Impl> Hash; - typedef Elf_GnuHash_Impl> GnuHash; - typedef Elf_Chdr_Impl> Chdr; - typedef ArrayRef DynRange; - typedef ArrayRef ShdrRange; - typedef ArrayRef SymRange; - typedef ArrayRef RelRange; - typedef ArrayRef RelaRange; - typedef ArrayRef PhdrRange; - - typedef packed Half; - typedef packed Word; - typedef packed Sword; - typedef packed Xword; - typedef packed Sxword; - typedef packed Addr; - typedef packed Off; -}; - -typedef ELFType ELF32LE; -typedef ELFType ELF32BE; -typedef ELFType ELF64LE; -typedef ELFType ELF64BE; + using uint = typename std::conditional::type; + using Ehdr = Elf_Ehdr_Impl>; + using Shdr = Elf_Shdr_Impl>; + using Sym = Elf_Sym_Impl>; + using Dyn = Elf_Dyn_Impl>; + using Phdr = Elf_Phdr_Impl>; + using Rel = Elf_Rel_Impl, false>; + using Rela = Elf_Rel_Impl, true>; + using Verdef = Elf_Verdef_Impl>; + using Verdaux = Elf_Verdaux_Impl>; + using Verneed = Elf_Verneed_Impl>; + using Vernaux = Elf_Vernaux_Impl>; + using Versym = Elf_Versym_Impl>; + using Hash = Elf_Hash_Impl>; + using GnuHash = Elf_GnuHash_Impl>; + using Chdr = Elf_Chdr_Impl>; + using DynRange = ArrayRef; + using ShdrRange = ArrayRef; + using SymRange = ArrayRef; + using RelRange = ArrayRef; + using RelaRange = ArrayRef; + using PhdrRange = ArrayRef; + + using Half = packed; + using Word = packed; + using Sword = packed; + using Xword = packed; + using Sxword = packed; + using Addr = packed; + using Off = packed; +}; + +using ELF32LE = ELFType; +using ELF32BE = ELFType; +using ELF64LE = ELFType; +using ELF64BE = ELFType; // Use an alignment of 2 for the typedefs since that is the worst case for // ELF files in archives. // Templates to choose Elf_Addr and Elf_Off depending on is64Bits. 
template struct ELFDataTypeTypedefHelperCommon { - typedef support::detail::packed_endian_specific_integral< - uint16_t, target_endianness, 2> Elf_Half; - typedef support::detail::packed_endian_specific_integral< - uint32_t, target_endianness, 2> Elf_Word; - typedef support::detail::packed_endian_specific_integral< - int32_t, target_endianness, 2> Elf_Sword; - typedef support::detail::packed_endian_specific_integral< - uint64_t, target_endianness, 2> Elf_Xword; - typedef support::detail::packed_endian_specific_integral< - int64_t, target_endianness, 2> Elf_Sxword; + using Elf_Half = support::detail::packed_endian_specific_integral< + uint16_t, target_endianness, 2>; + using Elf_Word = support::detail::packed_endian_specific_integral< + uint32_t, target_endianness, 2>; + using Elf_Sword = support::detail::packed_endian_specific_integral< + int32_t, target_endianness, 2>; + using Elf_Xword = support::detail::packed_endian_specific_integral< + uint64_t, target_endianness, 2>; + using Elf_Sxword = support::detail::packed_endian_specific_integral< + int64_t, target_endianness, 2>; }; template struct ELFDataTypeTypedefHelper; @@ -105,34 +110,34 @@ template struct ELFDataTypeTypedefHelper; template struct ELFDataTypeTypedefHelper> : ELFDataTypeTypedefHelperCommon { - typedef uint32_t value_type; - typedef support::detail::packed_endian_specific_integral< - value_type, TargetEndianness, 2> Elf_Addr; - typedef support::detail::packed_endian_specific_integral< - value_type, TargetEndianness, 2> Elf_Off; + using value_type = uint32_t; + using Elf_Addr = support::detail::packed_endian_specific_integral< + value_type, TargetEndianness, 2>; + using Elf_Off = support::detail::packed_endian_specific_integral< + value_type, TargetEndianness, 2>; }; /// ELF 64bit types. template struct ELFDataTypeTypedefHelper> : ELFDataTypeTypedefHelperCommon { - typedef uint64_t value_type; - typedef support::detail::packed_endian_specific_integral< - value_type, TargetEndianness, 2> Elf_Addr; - typedef support::detail::packed_endian_specific_integral< - value_type, TargetEndianness, 2> Elf_Off; + using value_type = uint64_t; + using Elf_Addr = support::detail::packed_endian_specific_integral< + value_type, TargetEndianness, 2>; + using Elf_Off = support::detail::packed_endian_specific_integral< + value_type, TargetEndianness, 2>; }; // I really don't like doing this, but the alternative is copypasta. 
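The switch from typedef to alias declarations running through ELF.h and ELFTypes.h above is purely syntactic; the same types are produced either way. A minimal standalone illustration, not taken from the patch:

    #include <cstdint>
    #include <vector>

    typedef std::vector<uint8_t> ByteVecTypedef;  // pre-C++11 spelling
    using ByteVecUsing = std::vector<uint8_t>;    // equivalent alias declaration

    // Alias templates can only be written with 'using', which is one reason the
    // ELF type helpers read more directly after the conversion.
    template <typename T> using RangeOf = std::vector<T>;
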
#define LLVM_ELF_IMPORT_TYPES_ELFT(ELFT) \ - typedef typename ELFT::Addr Elf_Addr; \ - typedef typename ELFT::Off Elf_Off; \ - typedef typename ELFT::Half Elf_Half; \ - typedef typename ELFT::Word Elf_Word; \ - typedef typename ELFT::Sword Elf_Sword; \ - typedef typename ELFT::Xword Elf_Xword; \ - typedef typename ELFT::Sxword Elf_Sxword; + using Elf_Addr = typename ELFT::Addr; \ + using Elf_Off = typename ELFT::Off; \ + using Elf_Half = typename ELFT::Half; \ + using Elf_Word = typename ELFT::Word; \ + using Elf_Sword = typename ELFT::Sword; \ + using Elf_Xword = typename ELFT::Xword; \ + using Elf_Sxword = typename ELFT::Sxword; #define LLD_ELF_COMMA , #define LLVM_ELF_IMPORT_TYPES(E, W) \ @@ -222,6 +227,7 @@ struct Elf_Sym_Impl : Elf_Sym_Base { uint64_t getValue() const { return st_value; } void setBinding(unsigned char b) { setBindingAndType(b, getType()); } void setType(unsigned char t) { setBindingAndType(getBinding(), t); } + void setBindingAndType(unsigned char b, unsigned char t) { st_info = (b << 4) + (t & 0x0f); } @@ -238,22 +244,29 @@ struct Elf_Sym_Impl : Elf_Sym_Base { } bool isAbsolute() const { return st_shndx == ELF::SHN_ABS; } + bool isCommon() const { return getType() == ELF::STT_COMMON || st_shndx == ELF::SHN_COMMON; } + bool isDefined() const { return !isUndefined(); } + bool isProcessorSpecific() const { return st_shndx >= ELF::SHN_LOPROC && st_shndx <= ELF::SHN_HIPROC; } + bool isOSSpecific() const { return st_shndx >= ELF::SHN_LOOS && st_shndx <= ELF::SHN_HIOS; } + bool isReserved() const { // ELF::SHN_HIRESERVE is 0xffff so st_shndx <= ELF::SHN_HIRESERVE is always // true and some compilers warn about it. return st_shndx >= ELF::SHN_LORESERVE; } + bool isUndefined() const { return st_shndx == ELF::SHN_UNDEF; } + bool isExternal() const { return getBinding() != ELF::STB_LOCAL; } @@ -277,14 +290,12 @@ struct Elf_Versym_Impl { Elf_Half vs_index; // Version index with flags (e.g. VERSYM_HIDDEN) }; -template struct Elf_Verdaux_Impl; - /// Elf_Verdef: This is the structure of entries in the SHT_GNU_verdef section /// (.gnu.version_d). This structure is identical for ELF32 and ELF64. template struct Elf_Verdef_Impl { LLVM_ELF_IMPORT_TYPES_ELFT(ELFT) - typedef Elf_Verdaux_Impl Elf_Verdaux; + using Elf_Verdaux = Elf_Verdaux_Impl; Elf_Half vd_version; // Version of this structure (e.g. 
VER_DEF_CURRENT) Elf_Half vd_flags; // Bitwise flags (VER_DEF_*) Elf_Half vd_ndx; // Version index, used in .gnu.version entries @@ -361,10 +372,10 @@ template struct Elf_Dyn_Impl : Elf_Dyn_Base { using Elf_Dyn_Base::d_tag; using Elf_Dyn_Base::d_un; - typedef typename std::conditional::type intX_t; - typedef typename std::conditional::type uintX_t; + using intX_t = typename std::conditional::type; + using uintX_t = typename std::conditional::type; intX_t getTag() const { return d_tag; } uintX_t getVal() const { return d_un.d_val; } uintX_t getPtr() const { return d_un.d_ptr; } @@ -430,6 +441,7 @@ struct Elf_Rel_Impl, false> { return (t << 32) | ((t >> 8) & 0xff000000) | ((t >> 24) & 0x00ff0000) | ((t >> 40) & 0x0000ff00) | ((t >> 56) & 0x000000ff); } + void setRInfo(uint64_t R, bool IsMips64EL) { if (IsMips64EL) r_info = (R >> 32) | ((R & 0xff000000) << 8) | ((R & 0x00ff0000) << 24) | @@ -483,15 +495,15 @@ struct Elf_Ehdr_Impl { Elf_Half e_shnum; // Number of entries in the section header table Elf_Half e_shstrndx; // Section header table index of section name // string table + bool checkMagic() const { return (memcmp(e_ident, ELF::ElfMagic, strlen(ELF::ElfMagic))) == 0; } + unsigned char getFileClass() const { return e_ident[ELF::EI_CLASS]; } unsigned char getDataEncoding() const { return e_ident[ELF::EI_DATA]; } }; -template struct Elf_Phdr_Impl; - template struct Elf_Phdr_Impl> { LLVM_ELF_IMPORT_TYPES(TargetEndianness, false) @@ -582,7 +594,7 @@ struct Elf_Chdr_Impl> { template struct Elf_Mips_RegInfo; -template +template struct Elf_Mips_RegInfo> { LLVM_ELF_IMPORT_TYPES(TargetEndianness, false) Elf_Word ri_gprmask; // bit-mask of used general registers @@ -590,7 +602,7 @@ struct Elf_Mips_RegInfo> { Elf_Addr ri_gp_value; // gp register value }; -template +template struct Elf_Mips_RegInfo> { LLVM_ELF_IMPORT_TYPES(TargetEndianness, true) Elf_Word ri_gprmask; // bit-mask of used general registers @@ -609,7 +621,7 @@ template struct Elf_Mips_Options { Elf_Word info; // Kind-specific information Elf_Mips_RegInfo &getRegInfo() { - assert(kind == llvm::ELF::ODK_REGINFO); + assert(kind == ELF::ODK_REGINFO); return *reinterpret_cast *>( (uint8_t *)this + sizeof(Elf_Mips_Options)); } @@ -637,4 +649,4 @@ template struct Elf_Mips_ABIFlags { } // end namespace object. } // end namespace llvm. -#endif +#endif // LLVM_OBJECT_ELFTYPES_H diff --git a/include/llvm/Object/IRSymtab.h b/include/llvm/Object/IRSymtab.h index be0f02a..b425543 100644 --- a/include/llvm/Object/IRSymtab.h +++ b/include/llvm/Object/IRSymtab.h @@ -25,23 +25,31 @@ #define LLVM_OBJECT_IRSYMTAB_H #include "llvm/ADT/ArrayRef.h" +#include "llvm/ADT/iterator_range.h" +#include "llvm/ADT/StringRef.h" #include "llvm/IR/GlobalValue.h" #include "llvm/Object/SymbolicFile.h" #include "llvm/Support/Endian.h" +#include "llvm/Support/Error.h" +#include +#include +#include namespace llvm { namespace irsymtab { + namespace storage { // The data structures in this namespace define the low-level serialization // format. Clients that just want to read a symbol table should use the // irsymtab::Reader class. -typedef support::ulittle32_t Word; +using Word = support::ulittle32_t; /// A reference to a string in the string table. struct Str { Word Offset, Size; + StringRef get(StringRef Strtab) const { return {Strtab.data() + Offset, Size}; } @@ -50,6 +58,7 @@ struct Str { /// A reference to a range of objects in the symbol table. 
template struct Range { Word Offset, Size; + ArrayRef get(StringRef Symtab) const { return {reinterpret_cast(Symtab.data() + Offset), Size}; } @@ -122,7 +131,7 @@ struct Header { Str COFFLinkerOpts; }; -} +} // end namespace storage /// Fills in Symtab and Strtab with a valid symbol and string table for Mods. Error build(ArrayRef Mods, SmallVector &Symtab, @@ -152,18 +161,22 @@ struct Symbol { int getComdatIndex() const { return ComdatIndex; } using S = storage::Symbol; + GlobalValue::VisibilityTypes getVisibility() const { return GlobalValue::VisibilityTypes((Flags >> S::FB_visibility) & 3); } + bool isUndefined() const { return (Flags >> S::FB_undefined) & 1; } bool isWeak() const { return (Flags >> S::FB_weak) & 1; } bool isCommon() const { return (Flags >> S::FB_common) & 1; } bool isIndirect() const { return (Flags >> S::FB_indirect) & 1; } bool isUsed() const { return (Flags >> S::FB_used) & 1; } bool isTLS() const { return (Flags >> S::FB_tls) & 1; } + bool canBeOmittedFromSymbolTable() const { return (Flags >> S::FB_may_omit) & 1; } + bool isGlobal() const { return (Flags >> S::FB_global) & 1; } bool isFormatSpecific() const { return (Flags >> S::FB_format_specific) & 1; } bool isUnnamedAddr() const { return (Flags >> S::FB_unnamed_addr) & 1; } @@ -173,6 +186,7 @@ struct Symbol { assert(isCommon()); return CommonSize; } + uint32_t getCommonAlignment() const { assert(isCommon()); return CommonAlign; @@ -197,9 +211,11 @@ class Reader { ArrayRef Uncommons; StringRef str(storage::Str S) const { return S.get(Strtab); } + template ArrayRef range(storage::Range R) const { return R.get(Symtab); } + const storage::Header &header() const { return *reinterpret_cast(Symtab.data()); } @@ -215,7 +231,7 @@ public: Uncommons = range(header().Uncommons); } - typedef iterator_range> symbol_range; + using symbol_range = iterator_range>; /// Returns the symbol table for the entire bitcode file. /// The symbols enumerated by this method are ephemeral, but they can be @@ -298,8 +314,7 @@ inline Reader::symbol_range Reader::module_symbols(unsigned I) const { SymbolRef(MEnd, MEnd, nullptr, this)}; } -} - -} +} // end namespace irsymtab +} // end namespace llvm -#endif +#endif // LLVM_OBJECT_IRSYMTAB_H diff --git a/include/llvm/Object/MachO.h b/include/llvm/Object/MachO.h index 1ee571c..2955355 100644 --- a/include/llvm/Object/MachO.h +++ b/include/llvm/Object/MachO.h @@ -16,10 +16,25 @@ #define LLVM_OBJECT_MACHO_H #include "llvm/ADT/ArrayRef.h" +#include "llvm/ADT/iterator_range.h" +#include "llvm/ADT/SmallString.h" #include "llvm/ADT/SmallVector.h" +#include "llvm/ADT/StringExtras.h" +#include "llvm/ADT/StringRef.h" #include "llvm/ADT/Triple.h" +#include "llvm/MC/SubtargetFeature.h" +#include "llvm/Object/Binary.h" #include "llvm/Object/ObjectFile.h" +#include "llvm/Object/SymbolicFile.h" +#include "llvm/Support/Error.h" +#include "llvm/Support/Format.h" #include "llvm/Support/MachO.h" +#include "llvm/Support/MemoryBuffer.h" +#include "llvm/Support/raw_ostream.h" +#include +#include +#include +#include namespace llvm { namespace object { @@ -28,11 +43,10 @@ namespace object { /// data in code entry in the table in a Mach-O object file. 
class DiceRef { DataRefImpl DicePimpl; - const ObjectFile *OwningObject; + const ObjectFile *OwningObject = nullptr; public: - DiceRef() : OwningObject(nullptr) { } - + DiceRef() = default; DiceRef(DataRefImpl DiceP, const ObjectFile *Owner); bool operator==(const DiceRef &Other) const; @@ -47,7 +61,7 @@ public: DataRefImpl getRawDataRefImpl() const; const ObjectFile *getObjectFile() const; }; -typedef content_iterator dice_iterator; +using dice_iterator = content_iterator; /// ExportEntry encapsulates the current-state-of-the-walk used when doing a /// non-recursive walk of the trie data structure. This allows you to iterate @@ -71,6 +85,7 @@ public: private: friend class MachOObjectFile; + void moveToFirst(); void moveToEnd(); uint64_t readULEB128(const uint8_t *&p); @@ -80,25 +95,26 @@ private: // Represents a node in the mach-o exports trie. struct NodeState { NodeState(const uint8_t *Ptr); + const uint8_t *Start; const uint8_t *Current; - uint64_t Flags; - uint64_t Address; - uint64_t Other; - const char *ImportName; - unsigned ChildCount; - unsigned NextChildIndex; - unsigned ParentStringLength; - bool IsExportNode; + uint64_t Flags = 0; + uint64_t Address = 0; + uint64_t Other = 0; + const char *ImportName = nullptr; + unsigned ChildCount = 0; + unsigned NextChildIndex = 0; + unsigned ParentStringLength = 0; + bool IsExportNode = false; }; ArrayRef Trie; SmallString<256> CumulativeString; SmallVector Stack; - bool Malformed; - bool Done; + bool Malformed = false; + bool Done = false; }; -typedef content_iterator export_iterator; +using export_iterator = content_iterator; // Segment info so SegIndex/SegOffset pairs in a Mach-O Bind or Rebase entry // can be checked and translated. Only the SegIndex/SegOffset pairs from @@ -106,7 +122,7 @@ typedef content_iterator export_iterator; // address() methods below. class BindRebaseSegInfo { public: - BindRebaseSegInfo(const object::MachOObjectFile *Obj); + BindRebaseSegInfo(const MachOObjectFile *Obj); // Used to check a Mach-O Bind or Rebase entry for errors when iterating. const char *checkSegAndOffset(int32_t SegIndex, uint64_t SegOffset, @@ -130,6 +146,7 @@ private: int32_t SegmentIndex; }; const SectionInfo &findSection(int32_t SegIndex, uint64_t SegOffset); + SmallVector Sections; int32_t MaxSegIndex; }; @@ -159,6 +176,7 @@ public: private: friend class MachOObjectFile; + void moveToFirst(); void moveToEnd(); uint64_t readULEB128(const char **error); @@ -167,15 +185,15 @@ private: const MachOObjectFile *O; ArrayRef Opcodes; const uint8_t *Ptr; - uint64_t SegmentOffset; - int32_t SegmentIndex; - uint64_t RemainingLoopCount; - uint64_t AdvanceAmount; - uint8_t RebaseType; + uint64_t SegmentOffset = 0; + int32_t SegmentIndex = -1; + uint64_t RemainingLoopCount = 0; + uint64_t AdvanceAmount = 0; + uint8_t RebaseType = 0; uint8_t PointerSize; - bool Done; + bool Done = false; }; -typedef content_iterator rebase_iterator; +using rebase_iterator = content_iterator; /// MachOBindEntry encapsulates the current state in the decompression of /// binding opcodes. 
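Many of the MachO.h changes above replace constructor initializer lists with default member initializers; the effect is simply that members start out well defined even when no constructor is written. A small self-contained sketch in which the struct and its fields are made up for illustration:

    #include <cstdint>

    struct ExampleEntry {
      const char *ImportName = nullptr; // previously set in a constructor
      uint64_t SegmentOffset = 0;
      int32_t SegmentIndex = -1;
      bool Done = false;
    };
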
This allows you to iterate through the compressed table of @@ -209,6 +227,7 @@ public: private: friend class MachOObjectFile; + void moveToFirst(); void moveToEnd(); uint64_t readULEB128(const char **error); @@ -218,21 +237,21 @@ private: const MachOObjectFile *O; ArrayRef Opcodes; const uint8_t *Ptr; - uint64_t SegmentOffset; - int32_t SegmentIndex; + uint64_t SegmentOffset = 0; + int32_t SegmentIndex = -1; StringRef SymbolName; - bool LibraryOrdinalSet; - int Ordinal; - uint32_t Flags; - int64_t Addend; - uint64_t RemainingLoopCount; - uint64_t AdvanceAmount; - uint8_t BindType; + bool LibraryOrdinalSet = false; + int Ordinal = 0; + uint32_t Flags = 0; + int64_t Addend = 0; + uint64_t RemainingLoopCount = 0; + uint64_t AdvanceAmount = 0; + uint8_t BindType = 0; uint8_t PointerSize; Kind TableKind; - bool Done; + bool Done = false; }; -typedef content_iterator bind_iterator; +using bind_iterator = content_iterator; class MachOObjectFile : public ObjectFile { public: @@ -240,8 +259,8 @@ public: const char *Ptr; // Where in memory the load command is. MachO::load_command C; // The command itself. }; - typedef SmallVector LoadCommandList; - typedef LoadCommandList::const_iterator load_command_iterator; + using LoadCommandList = SmallVector; + using load_command_iterator = LoadCommandList::const_iterator; static Expected> create(MemoryBufferRef Object, bool IsLittleEndian, bool Is64Bits, @@ -563,7 +582,7 @@ public: case MachO::PLATFORM_BRIDGEOS: return "bridgeos"; default: std::string ret; - llvm::raw_string_ostream ss(ret); + raw_string_ostream ss(ret); ss << format_hex(platform, 8, true); return ss.str(); } @@ -576,7 +595,7 @@ public: case MachO::TOOL_LD: return "ld"; default: std::string ret; - llvm::raw_string_ostream ss(ret); + raw_string_ostream ss(ret); ss << format_hex(tools, 8, true); return ss.str(); } @@ -595,7 +614,6 @@ public: } private: - MachOObjectFile(MemoryBufferRef Object, bool IsLittleEndian, bool Is64Bits, Error &Err, uint32_t UniversalCputype = 0, uint32_t UniversalIndex = 0); @@ -606,23 +624,23 @@ private: MachO::mach_header_64 Header64; MachO::mach_header Header; }; - typedef SmallVector SectionList; + using SectionList = SmallVector; SectionList Sections; - typedef SmallVector LibraryList; + using LibraryList = SmallVector; LibraryList Libraries; LoadCommandList LoadCommands; - typedef SmallVector LibraryShortName; + using LibraryShortName = SmallVector; using BuildToolList = SmallVector; BuildToolList BuildTools; mutable LibraryShortName LibrariesShortNames; std::unique_ptr BindRebaseSectionTable; - const char *SymtabLoadCmd; - const char *DysymtabLoadCmd; - const char *DataInCodeLoadCmd; - const char *LinkOptHintsLoadCmd; - const char *DyldInfoLoadCmd; - const char *UuidLoadCmd; - bool HasPageZeroSegment; + const char *SymtabLoadCmd = nullptr; + const char *DysymtabLoadCmd = nullptr; + const char *DataInCodeLoadCmd = nullptr; + const char *LinkOptHintsLoadCmd = nullptr; + const char *DyldInfoLoadCmd = nullptr; + const char *UuidLoadCmd = nullptr; + bool HasPageZeroSegment = false; }; /// DiceRef @@ -679,7 +697,7 @@ inline const ObjectFile *DiceRef::getObjectFile() const { return OwningObject; } -} -} +} // end namespace object +} // end namespace llvm -#endif +#endif // LLVM_OBJECT_MACHO_H diff --git a/include/llvm/Object/ModuleSummaryIndexObjectFile.h b/include/llvm/Object/ModuleSummaryIndexObjectFile.h index 7130222..f733f86 100644 --- a/include/llvm/Object/ModuleSummaryIndexObjectFile.h +++ b/include/llvm/Object/ModuleSummaryIndexObjectFile.h @@ -1,4 +1,4 @@ 
-//===- ModuleSummaryIndexObjectFile.h - Summary index file implementation -=// +//===- ModuleSummaryIndexObjectFile.h - Summary index file implementation -===// // // The LLVM Compiler Infrastructure // @@ -14,14 +14,22 @@ #ifndef LLVM_OBJECT_MODULESUMMARYINDEXOBJECTFILE_H #define LLVM_OBJECT_MODULESUMMARYINDEXOBJECTFILE_H -#include "llvm/IR/DiagnosticInfo.h" +#include "llvm/ADT/StringRef.h" +#include "llvm/Object/Binary.h" #include "llvm/Object/SymbolicFile.h" +#include "llvm/Support/Error.h" +#include "llvm/Support/ErrorHandling.h" +#include "llvm/Support/ErrorOr.h" +#include "llvm/Support/MemoryBuffer.h" +#include +#include namespace llvm { + class ModuleSummaryIndex; -class Module; namespace object { + class ObjectFile; /// This class is used to read just the module summary index related @@ -41,15 +49,18 @@ public: void moveSymbolNext(DataRefImpl &Symb) const override { llvm_unreachable("not implemented"); } + std::error_code printSymbolName(raw_ostream &OS, DataRefImpl Symb) const override { llvm_unreachable("not implemented"); return std::error_code(); } + uint32_t getSymbolFlags(DataRefImpl Symb) const override { llvm_unreachable("not implemented"); return 0; } + basic_symbol_iterator symbol_begin() const override { llvm_unreachable("not implemented"); return basic_symbol_iterator(BasicSymbolRef()); @@ -85,7 +96,8 @@ public: static Expected> create(MemoryBufferRef Object); }; -} + +} // end namespace object /// Parse the module summary index out of an IR file and return the module /// summary index object if found, or nullptr if not. If Identifier is @@ -94,6 +106,7 @@ public: /// containing minimized bitcode just for the thin link. Expected> getModuleSummaryIndexForFile(StringRef Path, StringRef Identifier = ""); -} -#endif +} // end namespace llvm + +#endif // LLVM_OBJECT_MODULESUMMARYINDEXOBJECTFILE_H diff --git a/include/llvm/Object/ModuleSymbolTable.h b/include/llvm/Object/ModuleSymbolTable.h index 333301d..9e93228 100644 --- a/include/llvm/Object/ModuleSymbolTable.h +++ b/include/llvm/Object/ModuleSymbolTable.h @@ -1,4 +1,4 @@ -//===- ModuleSymbolTable.h - symbol table for in-memory IR ----------------===// +//===- ModuleSymbolTable.h - symbol table for in-memory IR ------*- C++ -*-===// // // The LLVM Compiler Infrastructure // @@ -16,22 +16,24 @@ #ifndef LLVM_OBJECT_MODULESYMBOLTABLE_H #define LLVM_OBJECT_MODULESYMBOLTABLE_H +#include "llvm/ADT/ArrayRef.h" #include "llvm/ADT/PointerUnion.h" -#include "llvm/ADT/Triple.h" #include "llvm/IR/Mangler.h" #include "llvm/Object/SymbolicFile.h" +#include "llvm/Support/Allocator.h" +#include #include #include +#include namespace llvm { class GlobalValue; -class RecordStreamer; class ModuleSymbolTable { public: - typedef std::pair AsmSymbol; - typedef PointerUnion Symbol; + using AsmSymbol = std::pair; + using Symbol = PointerUnion; private: Module *FirstMod = nullptr; @@ -57,6 +59,6 @@ public: function_ref AsmSymbol); }; -} +} // end namespace llvm -#endif +#endif // LLVM_OBJECT_MODULESYMBOLTABLE_H diff --git a/include/llvm/Object/RelocVisitor.h b/include/llvm/Object/RelocVisitor.h index 3a0a62d..73c7ce3 100644 --- a/include/llvm/Object/RelocVisitor.h +++ b/include/llvm/Object/RelocVisitor.h @@ -1,4 +1,4 @@ -//===-- RelocVisitor.h - Visitor for object file relocations -*- C++ -*-===// +//===- RelocVisitor.h - Visitor for object file relocations -----*- C++ -*-===// // // The LLVM Compiler Infrastructure // @@ -16,34 +16,38 @@ #ifndef LLVM_OBJECT_RELOCVISITOR_H #define LLVM_OBJECT_RELOCVISITOR_H +#include "llvm/ADT/Triple.h" #include 
"llvm/Object/COFF.h" #include "llvm/Object/ELFObjectFile.h" #include "llvm/Object/MachO.h" #include "llvm/Object/ObjectFile.h" -#include "llvm/Support/Debug.h" +#include "llvm/Support/Casting.h" #include "llvm/Support/ELF.h" +#include "llvm/Support/ErrorHandling.h" +#include "llvm/Support/ErrorOr.h" #include "llvm/Support/MachO.h" -#include "llvm/Support/raw_ostream.h" +#include +#include namespace llvm { namespace object { struct RelocToApply { // The computed value after applying the relevant relocations. - int64_t Value; + int64_t Value = 0; // The width of the value; how many bytes to touch when applying the // relocation. - char Width; + char Width = 0; + + RelocToApply() = default; RelocToApply(int64_t Value, char Width) : Value(Value), Width(Width) {} - RelocToApply() : Value(0), Width(0) {} }; /// @brief Base class for object file relocation visitors. class RelocVisitor { public: - explicit RelocVisitor(const ObjectFile &Obj) - : ObjToVisit(Obj), HasError(false) {} + explicit RelocVisitor(const ObjectFile &Obj) : ObjToVisit(Obj) {} // TODO: Should handle multiple applied relocations via either passing in the // previously computed value or just count paired relocations as a single @@ -64,22 +68,22 @@ public: private: const ObjectFile &ObjToVisit; - bool HasError; + bool HasError = false; RelocToApply visitELF(uint32_t RelocType, RelocationRef R, uint64_t Value) { if (ObjToVisit.getBytesInAddress() == 8) { // 64-bit object file switch (ObjToVisit.getArch()) { case Triple::x86_64: switch (RelocType) { - case llvm::ELF::R_X86_64_NONE: + case ELF::R_X86_64_NONE: return visitELF_X86_64_NONE(R); - case llvm::ELF::R_X86_64_64: + case ELF::R_X86_64_64: return visitELF_X86_64_64(R, Value); - case llvm::ELF::R_X86_64_PC32: + case ELF::R_X86_64_PC32: return visitELF_X86_64_PC32(R, Value); - case llvm::ELF::R_X86_64_32: + case ELF::R_X86_64_32: return visitELF_X86_64_32(R, Value); - case llvm::ELF::R_X86_64_32S: + case ELF::R_X86_64_32S: return visitELF_X86_64_32S(R, Value); default: HasError = true; @@ -88,9 +92,9 @@ private: case Triple::aarch64: case Triple::aarch64_be: switch (RelocType) { - case llvm::ELF::R_AARCH64_ABS32: + case ELF::R_AARCH64_ABS32: return visitELF_AARCH64_ABS32(R, Value); - case llvm::ELF::R_AARCH64_ABS64: + case ELF::R_AARCH64_ABS64: return visitELF_AARCH64_ABS64(R, Value); default: HasError = true; @@ -99,9 +103,9 @@ private: case Triple::bpfel: case Triple::bpfeb: switch (RelocType) { - case llvm::ELF::R_BPF_64_64: + case ELF::R_BPF_64_64: return visitELF_BPF_64_64(R, Value); - case llvm::ELF::R_BPF_64_32: + case ELF::R_BPF_64_32: return visitELF_BPF_64_32(R, Value); default: HasError = true; @@ -110,9 +114,9 @@ private: case Triple::mips64el: case Triple::mips64: switch (RelocType) { - case llvm::ELF::R_MIPS_32: + case ELF::R_MIPS_32: return visitELF_MIPS64_32(R, Value); - case llvm::ELF::R_MIPS_64: + case ELF::R_MIPS_64: return visitELF_MIPS64_64(R, Value); default: HasError = true; @@ -121,9 +125,9 @@ private: case Triple::ppc64le: case Triple::ppc64: switch (RelocType) { - case llvm::ELF::R_PPC64_ADDR32: + case ELF::R_PPC64_ADDR32: return visitELF_PPC64_ADDR32(R, Value); - case llvm::ELF::R_PPC64_ADDR64: + case ELF::R_PPC64_ADDR64: return visitELF_PPC64_ADDR64(R, Value); default: HasError = true; @@ -131,9 +135,9 @@ private: } case Triple::systemz: switch (RelocType) { - case llvm::ELF::R_390_32: + case ELF::R_390_32: return visitELF_390_32(R, Value); - case llvm::ELF::R_390_64: + case ELF::R_390_64: return visitELF_390_64(R, Value); default: HasError = true; @@ 
-141,11 +145,11 @@ private: } case Triple::sparcv9: switch (RelocType) { - case llvm::ELF::R_SPARC_32: - case llvm::ELF::R_SPARC_UA32: + case ELF::R_SPARC_32: + case ELF::R_SPARC_UA32: return visitELF_SPARCV9_32(R, Value); - case llvm::ELF::R_SPARC_64: - case llvm::ELF::R_SPARC_UA64: + case ELF::R_SPARC_64: + case ELF::R_SPARC_UA64: return visitELF_SPARCV9_64(R, Value); default: HasError = true; @@ -153,9 +157,9 @@ private: } case Triple::amdgcn: switch (RelocType) { - case llvm::ELF::R_AMDGPU_ABS32: + case ELF::R_AMDGPU_ABS32: return visitELF_AMDGPU_ABS32(R, Value); - case llvm::ELF::R_AMDGPU_ABS64: + case ELF::R_AMDGPU_ABS64: return visitELF_AMDGPU_ABS64(R, Value); default: HasError = true; @@ -169,11 +173,11 @@ private: switch (ObjToVisit.getArch()) { case Triple::x86: switch (RelocType) { - case llvm::ELF::R_386_NONE: + case ELF::R_386_NONE: return visitELF_386_NONE(R); - case llvm::ELF::R_386_32: + case ELF::R_386_32: return visitELF_386_32(R, Value); - case llvm::ELF::R_386_PC32: + case ELF::R_386_PC32: return visitELF_386_PC32(R, Value); default: HasError = true; @@ -181,7 +185,7 @@ private: } case Triple::ppc: switch (RelocType) { - case llvm::ELF::R_PPC_ADDR32: + case ELF::R_PPC_ADDR32: return visitELF_PPC_ADDR32(R, Value); default: HasError = true; @@ -193,12 +197,12 @@ private: default: HasError = true; return RelocToApply(); - case llvm::ELF::R_ARM_ABS32: + case ELF::R_ARM_ABS32: return visitELF_ARM_ABS32(R, Value); } case Triple::lanai: switch (RelocType) { - case llvm::ELF::R_LANAI_32: + case ELF::R_LANAI_32: return visitELF_Lanai_32(R, Value); default: HasError = true; @@ -207,7 +211,7 @@ private: case Triple::mipsel: case Triple::mips: switch (RelocType) { - case llvm::ELF::R_MIPS_32: + case ELF::R_MIPS_32: return visitELF_MIPS_32(R, Value); default: HasError = true; @@ -215,8 +219,8 @@ private: } case Triple::sparc: switch (RelocType) { - case llvm::ELF::R_SPARC_32: - case llvm::ELF::R_SPARC_UA32: + case ELF::R_SPARC_32: + case ELF::R_SPARC_UA32: return visitELF_SPARC_32(R, Value); default: HasError = true; @@ -224,7 +228,7 @@ private: } case Triple::hexagon: switch (RelocType) { - case llvm::ELF::R_HEX_32: + case ELF::R_HEX_32: return visitELF_HEX_32(R, Value); default: HasError = true; @@ -483,6 +487,7 @@ private: } }; -} -} -#endif +} // end namespace object +} // end namespace llvm + +#endif // LLVM_OBJECT_RELOCVISITOR_H diff --git a/include/llvm/Object/StackMapParser.h b/include/llvm/Object/StackMapParser.h index efea62b..0c5e1e3 100644 --- a/include/llvm/Object/StackMapParser.h +++ b/include/llvm/Object/StackMapParser.h @@ -1,4 +1,4 @@ -//===-------- StackMapParser.h - StackMap Parsing Support -------*- C++ -*-===// +//===- StackMapParser.h - StackMap Parsing Support --------------*- C++ -*-===// // // The LLVM Compiler Infrastructure // @@ -11,7 +11,11 @@ #define LLVM_CODEGEN_STACKMAPPARSER_H #include "llvm/ADT/ArrayRef.h" +#include "llvm/ADT/iterator_range.h" #include "llvm/Support/Endian.h" +#include +#include +#include #include namespace llvm { @@ -19,12 +23,11 @@ namespace llvm { template class StackMapV2Parser { public: - template class AccessorIterator { public: - AccessorIterator(AccessorT A) : A(A) {} + AccessorIterator& operator++() { A = A.next(); return *this; } AccessorIterator operator++(int) { auto tmp = *this; @@ -48,8 +51,8 @@ public: /// Accessor for function records. class FunctionAccessor { friend class StackMapV2Parser; - public: + public: /// Get the function address. 
uint64_t getFunctionAddress() const { return read(P); @@ -80,13 +83,12 @@ public: /// Accessor for constants. class ConstantAccessor { friend class StackMapV2Parser; - public: + public: /// Return the value of this constant. uint64_t getValue() const { return read(P); } private: - ConstantAccessor(const uint8_t *P) : P(P) {} const static int ConstantAccessorSize = sizeof(uint64_t); @@ -98,20 +100,16 @@ public: const uint8_t *P; }; - // Forward-declare RecordAccessor so we can friend it below. - class RecordAccessor; - enum class LocationKind : uint8_t { Register = 1, Direct = 2, Indirect = 3, Constant = 4, ConstantIndex = 5 }; - /// Accessor for location records. class LocationAccessor { friend class StackMapV2Parser; friend class RecordAccessor; - public: + public: /// Get the Kind for this location. LocationKind getKind() const { return LocationKind(P[KindOffset]); @@ -144,7 +142,6 @@ public: } private: - LocationAccessor(const uint8_t *P) : P(P) {} LocationAccessor next() const { @@ -163,8 +160,8 @@ public: class LiveOutAccessor { friend class StackMapV2Parser; friend class RecordAccessor; - public: + public: /// Get the Dwarf register number for this live-out. uint16_t getDwarfRegNum() const { return read(P + DwarfRegNumOffset); @@ -176,7 +173,6 @@ public: } private: - LiveOutAccessor(const uint8_t *P) : P(P) {} LiveOutAccessor next() const { @@ -194,10 +190,10 @@ public: /// Accessor for stackmap records. class RecordAccessor { friend class StackMapV2Parser; - public: - typedef AccessorIterator location_iterator; - typedef AccessorIterator liveout_iterator; + public: + using location_iterator = AccessorIterator; + using liveout_iterator = AccessorIterator; /// Get the patchpoint/stackmap ID for this record. uint64_t getID() const { @@ -254,7 +250,6 @@ public: return liveout_iterator(getLiveOut(0)); } - /// End iterator for live-outs. liveout_iterator liveouts_end() const { return liveout_iterator(getLiveOut(getNumLiveOuts())); @@ -266,7 +261,6 @@ public: } private: - RecordAccessor(const uint8_t *P) : P(P) {} unsigned getNumLiveOutsOffset() const { @@ -316,9 +310,9 @@ public: } } - typedef AccessorIterator function_iterator; - typedef AccessorIterator constant_iterator; - typedef AccessorIterator record_iterator; + using function_iterator = AccessorIterator; + using constant_iterator = AccessorIterator; + using record_iterator = AccessorIterator; /// Get the version number of this stackmap. (Always returns 2). 
unsigned getVersion() const { return 2; } @@ -413,7 +407,6 @@ public: } private: - template static T read(const uint8_t *P) { return support::endian::read(P); @@ -441,6 +434,6 @@ private: std::vector StackMapRecordOffsets; }; -} +} // end namespace llvm -#endif +#endif // LLVM_CODEGEN_STACKMAPPARSER_H diff --git a/include/llvm/Object/Wasm.h b/include/llvm/Object/Wasm.h index 43ad62b..6b6bbe2 100644 --- a/include/llvm/Object/Wasm.h +++ b/include/llvm/Object/Wasm.h @@ -17,6 +17,8 @@ #ifndef LLVM_OBJECT_WASM_H #define LLVM_OBJECT_WASM_H +#include "llvm/ADT/ArrayRef.h" +#include "llvm/ADT/StringRef.h" #include "llvm/Object/Binary.h" #include "llvm/Object/ObjectFile.h" #include "llvm/Support/Error.h" @@ -47,10 +49,10 @@ public: class WasmSection { public: - WasmSection() : Type(0), Offset(0) {} + WasmSection() = default; - uint32_t Type; // Section type (See below) - uint32_t Offset; // Offset with in the file + uint32_t Type = 0; // Section type (See below) + uint32_t Offset = 0; // Offset with in the file StringRef Name; // Section name (User-defined sections only) ArrayRef Content; // Section content std::vector Relocations; // Relocations for this section @@ -74,12 +76,15 @@ public: const std::vector& memories() const { return Memories; } const std::vector& globals() const { return Globals; } const std::vector& exports() const { return Exports; } + const std::vector& elements() const { return ElemSegments; } + const std::vector& dataSegments() const { return DataSegments; } + const std::vector& functions() const { return Functions; } const ArrayRef& code() const { return CodeSection; } uint32_t startFunction() const { return StartFunction; } @@ -178,7 +183,7 @@ private: std::vector Symbols; std::vector Functions; ArrayRef CodeSection; - uint32_t StartFunction; + uint32_t StartFunction = -1; }; } // end namespace object diff --git a/include/llvm/ObjectYAML/WasmYAML.h b/include/llvm/ObjectYAML/WasmYAML.h index b1af8bb..dfeeb85 100644 --- a/include/llvm/ObjectYAML/WasmYAML.h +++ b/include/llvm/ObjectYAML/WasmYAML.h @@ -88,7 +88,7 @@ struct Relocation { RelocType Type; uint32_t Index; yaml::Hex32 Offset; - yaml::Hex32 Addend; + int32_t Addend; }; struct DataSegment { diff --git a/include/llvm/ProfileData/InstrProf.h b/include/llvm/ProfileData/InstrProf.h index c982885..1b07c33 100644 --- a/include/llvm/ProfileData/InstrProf.h +++ b/include/llvm/ProfileData/InstrProf.h @@ -79,14 +79,6 @@ inline StringRef getInstrProfValueRangeProfFuncName() { return INSTR_PROF_VALUE_RANGE_PROF_FUNC_STR; } -/// Return the name of the section containing function coverage mapping -/// data. -std::string getInstrProfCoverageSectionName(const Module *M = nullptr); -/// Similar to the above, but used by host tool (e.g, coverage) which has -/// object format information. The section name returned is not prefixed -/// with segment name. -std::string getInstrProfCoverageSectionNameInObject(bool isCoff); - /// Return the name prefix of variables containing instrumented function names. inline StringRef getInstrProfNameVarPrefix() { return "__profn_"; } diff --git a/include/llvm/Support/BranchProbability.h b/include/llvm/Support/BranchProbability.h index e8eb50d..b403d7f 100644 --- a/include/llvm/Support/BranchProbability.h +++ b/include/llvm/Support/BranchProbability.h @@ -112,6 +112,13 @@ public: return *this; } + BranchProbability &operator*=(uint32_t RHS) { + assert(N != UnknownN && + "Unknown probability cannot participate in arithmetics."); + N = (uint64_t(N) * RHS > D) ? 
D : N * RHS; + return *this; + } + BranchProbability &operator/=(uint32_t RHS) { assert(N != UnknownN && "Unknown probability cannot participate in arithmetics."); @@ -135,6 +142,11 @@ public: return Prob *= RHS; } + BranchProbability operator*(uint32_t RHS) const { + BranchProbability Prob(*this); + return Prob *= RHS; + } + BranchProbability operator/(uint32_t RHS) const { BranchProbability Prob(*this); return Prob /= RHS; diff --git a/include/llvm/Support/FileSystem.h b/include/llvm/Support/FileSystem.h index 29515c2..e3c5de7 100644 --- a/include/llvm/Support/FileSystem.h +++ b/include/llvm/Support/FileSystem.h @@ -116,7 +116,9 @@ inline perms &operator&=(perms &l, perms r) { return l; } inline perms operator~(perms x) { - return static_cast(~static_cast(x)); + // Avoid UB by explicitly truncating the (unsigned) ~ result. + return static_cast( + static_cast(~static_cast(x))); } class UniqueID { diff --git a/include/llvm/Support/GenericDomTree.h b/include/llvm/Support/GenericDomTree.h index eb7c27d..851ff7d 100644 --- a/include/llvm/Support/GenericDomTree.h +++ b/include/llvm/Support/GenericDomTree.h @@ -286,13 +286,13 @@ protected: NodeRef NewBBSucc = *GraphT::child_begin(NewBB); std::vector PredBlocks; - for (const auto Pred : children>(NewBB)) + for (const auto &Pred : children>(NewBB)) PredBlocks.push_back(Pred); assert(!PredBlocks.empty() && "No predblocks?"); bool NewBBDominatesNewBBSucc = true; - for (const auto Pred : children>(NewBBSucc)) { + for (const auto &Pred : children>(NewBBSucc)) { if (Pred != NewBB && !dominates(NewBBSucc, Pred) && isReachableFromEntry(Pred)) { NewBBDominatesNewBBSucc = false; diff --git a/include/llvm/Support/KnownBits.h b/include/llvm/Support/KnownBits.h new file mode 100644 index 0000000..08d4ded --- /dev/null +++ b/include/llvm/Support/KnownBits.h @@ -0,0 +1,43 @@ +//===- llvm/Support/KnownBits.h - Stores known zeros/ones -------*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file contains a class for representing known zeros and ones used by +// computeKnownBits. +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_SUPPORT_KNOWNBITS_H +#define LLVM_SUPPORT_KNOWNBITS_H + +#include "llvm/ADT/APInt.h" + +namespace llvm { + +// For now this is a simple wrapper around two APInts. +struct KnownBits { + APInt Zero; + APInt One; + + // Default construct Zero and One. + KnownBits() {} + + /// Create a known bits object of BitWidth bits initialized to unknown. + KnownBits(unsigned BitWidth) : Zero(BitWidth, 0), One(BitWidth, 0) {} + + /// Get the bit width of this value. 
+ unsigned getBitWidth() const { + assert(Zero.getBitWidth() == One.getBitWidth() && + "Zero and One should have the same width!"); + return Zero.getBitWidth(); + } +}; + +} // end namespace llvm + +#endif diff --git a/include/llvm/Support/YAMLTraits.h b/include/llvm/Support/YAMLTraits.h index 6d02e4a..ffea679 100644 --- a/include/llvm/Support/YAMLTraits.h +++ b/include/llvm/Support/YAMLTraits.h @@ -606,7 +606,7 @@ public: template void bitSetCase(T &Val, const char* Str, const T ConstVal) { if ( bitSetMatch(Str, outputting() && (Val & ConstVal) == ConstVal) ) { - Val = Val | ConstVal; + Val = static_cast(Val | ConstVal); } } @@ -614,7 +614,7 @@ public: template void bitSetCase(T &Val, const char* Str, const uint32_t ConstVal) { if ( bitSetMatch(Str, outputting() && (Val & ConstVal) == ConstVal) ) { - Val = Val | ConstVal; + Val = static_cast(Val | ConstVal); } } diff --git a/include/llvm/Target/GlobalISel/Target.td b/include/llvm/Target/GlobalISel/Target.td index fa1a424..fd2ebca 100644 --- a/include/llvm/Target/GlobalISel/Target.td +++ b/include/llvm/Target/GlobalISel/Target.td @@ -30,21 +30,13 @@ def s64 : LLT; // Definitions that inherit from this may also inherit from // GIComplexPatternEquiv to enable the import of SelectionDAG patterns involving // those ComplexPatterns. -class GIComplexOperandMatcher { +class GIComplexOperandMatcher { // The expected type of the root of the match. // // TODO: We should probably support, any-type, any-scalar, and multiple types // in the future. LLT Type = type; - // The operands that result from a successful match - // Should be of the form '(ops ty1, ty2, ...)' where ty1/ty2 are definitions - // that inherit from Operand. - // - // FIXME: Which definition is used for ty1/ty2 doesn't actually matter at the - // moment. Only the number of operands is used. - dag Operands = operands; - // The function that determines whether the operand matches. It should be of // the form: // bool select(const MatchOperand &Root, MatchOperand &Result1) diff --git a/include/llvm/Target/TargetInstrInfo.h b/include/llvm/Target/TargetInstrInfo.h index 0dc9cf7..82a682c 100644 --- a/include/llvm/Target/TargetInstrInfo.h +++ b/include/llvm/Target/TargetInstrInfo.h @@ -1108,7 +1108,7 @@ public: /// Return the noop instruction to use for a noop. - virtual void getNoopForMachoTarget(MCInst &NopInst) const; + virtual void getNoop(MCInst &NopInst) const; /// Return true for post-incremented instructions. virtual bool isPostIncrement(const MachineInstr &MI) const { diff --git a/include/llvm/Target/TargetLowering.h b/include/llvm/Target/TargetLowering.h index 24039ea..51f11e1 100644 --- a/include/llvm/Target/TargetLowering.h +++ b/include/llvm/Target/TargetLowering.h @@ -236,6 +236,12 @@ public: return getPointerTy(DL, DL.getAllocaAddrSpace()); } + /// Return the type for operands of fence. + /// TODO: Let fence operands be of i32 type and remove this. + virtual MVT getFenceOperandTy(const DataLayout &DL) const { + return getPointerTy(DL); + } + /// EVT is not used in-tree, but is used by out-of-tree target. /// A documentation for this function would be nice... virtual MVT getScalarShiftAmountTy(const DataLayout &, EVT) const; @@ -2268,7 +2274,8 @@ protected: /// Return true if the value types that can be represented by the specified /// register class are all legal. 
- bool isLegalRC(const TargetRegisterClass *RC) const; + bool isLegalRC(const TargetRegisterInfo &TRI, + const TargetRegisterClass &RC) const; /// Replace/modify any TargetFrameIndex operands with a targte-dependent /// sequence of memory operands that is recognized by PrologEpilogInserter. @@ -2388,30 +2395,39 @@ public: New = N; return true; } - - /// Check to see if the specified operand of the specified instruction is a - /// constant integer. If so, check to see if there are any bits set in the - /// constant that are not demanded. If so, shrink the constant and return - /// true. - bool ShrinkDemandedConstant(SDValue Op, const APInt &Demanded); - - /// Convert x+y to (VT)((SmallVT)x+(SmallVT)y) if the casts are free. This - /// uses isZExtFree and ZERO_EXTEND for the widening cast, but it could be - /// generalized for targets with other types of implicit widening casts. - bool ShrinkDemandedOp(SDValue Op, unsigned BitWidth, const APInt &Demanded, - const SDLoc &dl); - - /// Helper for SimplifyDemandedBits that can simplify an operation with - /// multiple uses. This function uses TLI.SimplifyDemandedBits to - /// simplify Operand \p OpIdx of \p User and then updated \p User with - /// the simplified version. No other uses of \p OpIdx are updated. - /// If \p User is the only user of \p OpIdx, this function behaves exactly - /// like TLI.SimplifyDemandedBits except that it also updates the DAG by - /// calling DCI.CommitTargetLoweringOpt. - bool SimplifyDemandedBits(SDNode *User, unsigned OpIdx, - const APInt &Demanded, DAGCombinerInfo &DCI); }; + /// Check to see if the specified operand of the specified instruction is a + /// constant integer. If so, check to see if there are any bits set in the + /// constant that are not demanded. If so, shrink the constant and return + /// true. + bool ShrinkDemandedConstant(SDValue Op, const APInt &Demanded, + TargetLoweringOpt &TLO) const; + + // Target hook to do target-specific const optimization, which is called by + // ShrinkDemandedConstant. This function should return true if the target + // doesn't want ShrinkDemandedConstant to further optimize the constant. + virtual bool targetShrinkDemandedConstant(SDValue Op, const APInt &Demanded, + TargetLoweringOpt &TLO) const { + return false; + } + + /// Convert x+y to (VT)((SmallVT)x+(SmallVT)y) if the casts are free. This + /// uses isZExtFree and ZERO_EXTEND for the widening cast, but it could be + /// generalized for targets with other types of implicit widening casts. + bool ShrinkDemandedOp(SDValue Op, unsigned BitWidth, const APInt &Demanded, + TargetLoweringOpt &TLO) const; + + /// Helper for SimplifyDemandedBits that can simplify an operation with + /// multiple uses. This function simplifies operand \p OpIdx of \p User and + /// then updates \p User with the simplified version. No other uses of + /// \p OpIdx are updated. If \p User is the only user of \p OpIdx, this + /// function behaves exactly like function SimplifyDemandedBits declared + /// below except that it also updates the DAG by calling + /// DCI.CommitTargetLoweringOpt. + bool SimplifyDemandedBits(SDNode *User, unsigned OpIdx, const APInt &Demanded, + DAGCombinerInfo &DCI, TargetLoweringOpt &TLO) const; + /// Look at Op. At this point, we know that only the DemandedMask bits of the /// result of Op are ever used downstream. 
If we can use this information to /// simplify Op, create a new simplified DAG node and return true, returning diff --git a/include/llvm/Target/TargetRegisterInfo.h b/include/llvm/Target/TargetRegisterInfo.h index 3f5daea..4ce6d2f 100644 --- a/include/llvm/Target/TargetRegisterInfo.h +++ b/include/llvm/Target/TargetRegisterInfo.h @@ -40,13 +40,12 @@ class TargetRegisterClass { public: typedef const MCPhysReg* iterator; typedef const MCPhysReg* const_iterator; - typedef const MVT::SimpleValueType* vt_iterator; typedef const TargetRegisterClass* const * sc_iterator; // Instance variables filled by tablegen, do not use! const MCRegisterClass *MC; const uint16_t SpillSize, SpillAlignment; - const vt_iterator VTs; + const MVT::SimpleValueType *VTs; const uint32_t *SubClassMask; const uint16_t *SuperRegIndices; const LaneBitmask LaneMask; @@ -93,13 +92,6 @@ public: return MC->contains(Reg1, Reg2); } - /// Return the size of the register in bytes, which is also the size - /// of a stack slot allocated to hold a spilled copy of this register. - unsigned getSize() const { return SpillSize; } - - /// Return the minimum required alignment for a register of this class. - unsigned getAlignment() const { return SpillAlignment; } - /// Return the cost of copying a value between two registers in this class. /// A negative number means the register class is very expensive /// to copy e.g. status flag register classes. @@ -109,26 +101,6 @@ public: /// registers. bool isAllocatable() const { return MC->isAllocatable(); } - /// Return true if this TargetRegisterClass has the ValueType vt. - bool hasType(MVT vt) const { - for(int i = 0; VTs[i] != MVT::Other; ++i) - if (MVT(VTs[i]) == vt) - return true; - return false; - } - - /// vt_begin / vt_end - Loop over all of the value types that can be - /// represented by values in this register class. - vt_iterator vt_begin() const { - return VTs; - } - - vt_iterator vt_end() const { - vt_iterator I = VTs; - while (*I != MVT::Other) ++I; - return I; - } - /// Return true if the specified TargetRegisterClass /// is a proper sub-class of this TargetRegisterClass. bool hasSubClass(const TargetRegisterClass *RC) const { @@ -246,6 +218,7 @@ struct RegClassWeight { class TargetRegisterInfo : public MCRegisterInfo { public: typedef const TargetRegisterClass * const * regclass_iterator; + typedef const MVT::SimpleValueType* vt_iterator; private: const TargetRegisterInfoDesc *InfoDesc; // Extra desc array for codegen const char *const *SubRegIndexNames; // Names of subreg indexes. @@ -327,6 +300,44 @@ public: return Index | (1u << 31); } + /// Return the size in bits of a register from class RC. + unsigned getRegSizeInBits(const TargetRegisterClass &RC) const { + return RC.SpillSize * 8; + } + + /// Return the size in bytes of the stack slot allocated to hold a spilled + /// copy of a register from class RC. + unsigned getSpillSize(const TargetRegisterClass &RC) const { + return RC.SpillSize; + } + + /// Return the minimum required alignment for a spill slot for a register + /// of this class. + unsigned getSpillAlignment(const TargetRegisterClass &RC) const { + return RC.SpillAlignment; + } + + /// Return true if the given TargetRegisterClass has the ValueType T. + bool isTypeLegalForClass(const TargetRegisterClass &RC, MVT T) const { + for (int i = 0; RC.VTs[i] != MVT::Other; ++i) + if (MVT(RC.VTs[i]) == T) + return true; + return false; + } + + /// Loop over all of the value types that can be represented by values + // in the given register class. 
+ vt_iterator legalclasstypes_begin(const TargetRegisterClass &RC) const { + return RC.VTs; + } + + vt_iterator legalclasstypes_end(const TargetRegisterClass &RC) const { + vt_iterator I = RC.VTs; + while (*I != MVT::Other) + ++I; + return I; + } + /// Returns the Register Class of a physical register of the given type, /// picking the most sub register class of the right type that contains this /// physreg. diff --git a/include/llvm/Transforms/Instrumentation.h b/include/llvm/Transforms/Instrumentation.h index 01a3975..db6723d 100644 --- a/include/llvm/Transforms/Instrumentation.h +++ b/include/llvm/Transforms/Instrumentation.h @@ -131,7 +131,8 @@ FunctionPass *createAddressSanitizerFunctionPass(bool CompileKernel = false, bool Recover = false, bool UseAfterScope = false); ModulePass *createAddressSanitizerModulePass(bool CompileKernel = false, - bool Recover = false); + bool Recover = false, + bool UseGlobalsGC = true); // Insert MemorySanitizer instrumentation (detection of uninitialized reads) FunctionPass *createMemorySanitizerPass(int TrackOrigins = 0, diff --git a/include/llvm/Transforms/Scalar/ConstantHoisting.h b/include/llvm/Transforms/Scalar/ConstantHoisting.h index 3e2b332..edc91ad 100644 --- a/include/llvm/Transforms/Scalar/ConstantHoisting.h +++ b/include/llvm/Transforms/Scalar/ConstantHoisting.h @@ -36,6 +36,7 @@ #ifndef LLVM_TRANSFORMS_SCALAR_CONSTANTHOISTING_H #define LLVM_TRANSFORMS_SCALAR_CONSTANTHOISTING_H +#include "llvm/Analysis/BlockFrequencyInfo.h" #include "llvm/Analysis/TargetTransformInfo.h" #include "llvm/IR/Dominators.h" #include "llvm/IR/PassManager.h" @@ -98,7 +99,7 @@ public: // Glue for old PM. bool runImpl(Function &F, TargetTransformInfo &TTI, DominatorTree &DT, - BasicBlock &Entry); + BlockFrequencyInfo *BFI, BasicBlock &Entry); void releaseMemory() { ConstantVec.clear(); @@ -112,6 +113,7 @@ private: const TargetTransformInfo *TTI; DominatorTree *DT; + BlockFrequencyInfo *BFI; BasicBlock *Entry; /// Keeps track of constant candidates found in the function. @@ -124,8 +126,8 @@ private: SmallVector ConstantVec; Instruction *findMatInsertPt(Instruction *Inst, unsigned Idx = ~0U) const; - Instruction *findConstantInsertionPoint( - const consthoist::ConstantInfo &ConstInfo) const; + SmallPtrSet + findConstantInsertionPoint(const consthoist::ConstantInfo &ConstInfo) const; void collectConstantCandidates(ConstCandMapType &ConstCandMap, Instruction *Inst, unsigned Idx, ConstantInt *ConstInt); diff --git a/lib/Analysis/BasicAliasAnalysis.cpp b/lib/Analysis/BasicAliasAnalysis.cpp index 3db041c..5378230 100644 --- a/lib/Analysis/BasicAliasAnalysis.cpp +++ b/lib/Analysis/BasicAliasAnalysis.cpp @@ -924,8 +924,8 @@ static AliasResult aliasSameBasePointerGEPs(const GEPOperator *GEP1, uint64_t V2Size, const DataLayout &DL) { - assert(GEP1->getPointerOperand()->stripPointerCasts() == - GEP2->getPointerOperand()->stripPointerCasts() && + assert(GEP1->getPointerOperand()->stripPointerCastsAndBarriers() == + GEP2->getPointerOperand()->stripPointerCastsAndBarriers() && GEP1->getPointerOperandType() == GEP2->getPointerOperandType() && "Expected GEPs with the same pointer operand"); @@ -1184,8 +1184,8 @@ AliasResult BasicAAResult::aliasGEP(const GEPOperator *GEP1, uint64_t V1Size, // If we know the two GEPs are based off of the exact same pointer (and not // just the same underlying object), see if that tells us anything about // the resulting pointers. 
- if (GEP1->getPointerOperand()->stripPointerCasts() == - GEP2->getPointerOperand()->stripPointerCasts() && + if (GEP1->getPointerOperand()->stripPointerCastsAndBarriers() == + GEP2->getPointerOperand()->stripPointerCastsAndBarriers() && GEP1->getPointerOperandType() == GEP2->getPointerOperandType()) { AliasResult R = aliasSameBasePointerGEPs(GEP1, V1Size, GEP2, V2Size, DL); // If we couldn't find anything interesting, don't abandon just yet. @@ -1500,8 +1500,8 @@ AliasResult BasicAAResult::aliasCheck(const Value *V1, uint64_t V1Size, return NoAlias; // Strip off any casts if they exist. - V1 = V1->stripPointerCasts(); - V2 = V2->stripPointerCasts(); + V1 = V1->stripPointerCastsAndBarriers(); + V2 = V2->stripPointerCastsAndBarriers(); // If V1 or V2 is undef, the result is NoAlias because we can always pick a // value for undef that aliases nothing in the program. diff --git a/lib/Analysis/ConstantFolding.cpp b/lib/Analysis/ConstantFolding.cpp index 14176da..863fbdb 100644 --- a/lib/Analysis/ConstantFolding.cpp +++ b/lib/Analysis/ConstantFolding.cpp @@ -42,6 +42,7 @@ #include "llvm/IR/Value.h" #include "llvm/Support/Casting.h" #include "llvm/Support/ErrorHandling.h" +#include "llvm/Support/KnownBits.h" #include "llvm/Support/MathExtras.h" #include #include @@ -687,21 +688,21 @@ Constant *SymbolicallyEvaluateBinop(unsigned Opc, Constant *Op0, Constant *Op1, if (Opc == Instruction::And) { unsigned BitWidth = DL.getTypeSizeInBits(Op0->getType()->getScalarType()); - APInt KnownZero0(BitWidth, 0), KnownOne0(BitWidth, 0); - APInt KnownZero1(BitWidth, 0), KnownOne1(BitWidth, 0); - computeKnownBits(Op0, KnownZero0, KnownOne0, DL); - computeKnownBits(Op1, KnownZero1, KnownOne1, DL); - if ((KnownOne1 | KnownZero0).isAllOnesValue()) { + KnownBits Known0(BitWidth); + KnownBits Known1(BitWidth); + computeKnownBits(Op0, Known0, DL); + computeKnownBits(Op1, Known1, DL); + if ((Known1.One | Known0.Zero).isAllOnesValue()) { // All the bits of Op0 that the 'and' could be masking are already zero. return Op0; } - if ((KnownOne0 | KnownZero1).isAllOnesValue()) { + if ((Known0.One | Known1.Zero).isAllOnesValue()) { // All the bits of Op1 that the 'and' could be masking are already zero. 
return Op1; } - APInt KnownZero = KnownZero0 | KnownZero1; - APInt KnownOne = KnownOne0 & KnownOne1; + APInt KnownZero = Known0.Zero | Known1.Zero; + APInt KnownOne = Known0.One & Known1.One; if ((KnownZero | KnownOne).isAllOnesValue()) { return ConstantInt::get(Op0->getType(), KnownOne); } diff --git a/lib/Analysis/DemandedBits.cpp b/lib/Analysis/DemandedBits.cpp index 151c0b0..285339d 100644 --- a/lib/Analysis/DemandedBits.cpp +++ b/lib/Analysis/DemandedBits.cpp @@ -37,6 +37,7 @@ #include "llvm/IR/Operator.h" #include "llvm/Pass.h" #include "llvm/Support/Debug.h" +#include "llvm/Support/KnownBits.h" #include "llvm/Support/raw_ostream.h" using namespace llvm; @@ -72,8 +73,7 @@ static bool isAlwaysLive(Instruction *I) { void DemandedBits::determineLiveOperandBits( const Instruction *UserI, const Instruction *I, unsigned OperandNo, - const APInt &AOut, APInt &AB, APInt &KnownZero, APInt &KnownOne, - APInt &KnownZero2, APInt &KnownOne2) { + const APInt &AOut, APInt &AB, KnownBits &Known, KnownBits &Known2) { unsigned BitWidth = AB.getBitWidth(); // We're called once per operand, but for some instructions, we need to @@ -85,15 +85,13 @@ void DemandedBits::determineLiveOperandBits( auto ComputeKnownBits = [&](unsigned BitWidth, const Value *V1, const Value *V2) { const DataLayout &DL = I->getModule()->getDataLayout(); - KnownZero = APInt(BitWidth, 0); - KnownOne = APInt(BitWidth, 0); - computeKnownBits(const_cast(V1), KnownZero, KnownOne, DL, 0, + Known = KnownBits(BitWidth); + computeKnownBits(const_cast(V1), Known, DL, 0, &AC, UserI, &DT); if (V2) { - KnownZero2 = APInt(BitWidth, 0); - KnownOne2 = APInt(BitWidth, 0); - computeKnownBits(const_cast(V2), KnownZero2, KnownOne2, DL, + Known2 = KnownBits(BitWidth); + computeKnownBits(const_cast(V2), Known2, DL, 0, &AC, UserI, &DT); } }; @@ -120,7 +118,7 @@ void DemandedBits::determineLiveOperandBits( // known to be one. ComputeKnownBits(BitWidth, I, nullptr); AB = APInt::getHighBitsSet(BitWidth, - std::min(BitWidth, KnownOne.countLeadingZeros()+1)); + std::min(BitWidth, Known.One.countLeadingZeros()+1)); } break; case Intrinsic::cttz: @@ -130,7 +128,7 @@ void DemandedBits::determineLiveOperandBits( // known to be one. ComputeKnownBits(BitWidth, I, nullptr); AB = APInt::getLowBitsSet(BitWidth, - std::min(BitWidth, KnownOne.countTrailingZeros()+1)); + std::min(BitWidth, Known.One.countTrailingZeros()+1)); } break; } @@ -200,11 +198,11 @@ void DemandedBits::determineLiveOperandBits( // dead). if (OperandNo == 0) { ComputeKnownBits(BitWidth, I, UserI->getOperand(1)); - AB &= ~KnownZero2; + AB &= ~Known2.Zero; } else { if (!isa(UserI->getOperand(0))) ComputeKnownBits(BitWidth, UserI->getOperand(0), I); - AB &= ~(KnownZero & ~KnownZero2); + AB &= ~(Known.Zero & ~Known2.Zero); } break; case Instruction::Or: @@ -216,11 +214,11 @@ void DemandedBits::determineLiveOperandBits( // dead). if (OperandNo == 0) { ComputeKnownBits(BitWidth, I, UserI->getOperand(1)); - AB &= ~KnownOne2; + AB &= ~Known2.One; } else { if (!isa(UserI->getOperand(0))) ComputeKnownBits(BitWidth, UserI->getOperand(0), I); - AB &= ~(KnownOne & ~KnownOne2); + AB &= ~(Known.One & ~Known2.One); } break; case Instruction::Xor: @@ -318,7 +316,7 @@ void DemandedBits::performAnalysis() { if (!UserI->getType()->isIntegerTy()) Visited.insert(UserI); - APInt KnownZero, KnownOne, KnownZero2, KnownOne2; + KnownBits Known, Known2; // Compute the set of alive bits for each operand. 
These are anded into the // existing set, if any, and if that changes the set of alive bits, the // operand is added to the work-list. @@ -335,8 +333,7 @@ void DemandedBits::performAnalysis() { // Bits of each operand that are used to compute alive bits of the // output are alive, all others are dead. determineLiveOperandBits(UserI, I, OI.getOperandNo(), AOut, AB, - KnownZero, KnownOne, - KnownZero2, KnownOne2); + Known, Known2); } // If we've added to the set of alive bits (or the operand has not diff --git a/lib/Analysis/DomPrinter.cpp b/lib/Analysis/DomPrinter.cpp index 7acfb41..8abc0e7 100644 --- a/lib/Analysis/DomPrinter.cpp +++ b/lib/Analysis/DomPrinter.cpp @@ -80,6 +80,22 @@ struct DOTGraphTraits }; } +void DominatorTree::viewGraph(const Twine &Name, const Twine &Title) { +#ifndef NDEBUG + ViewGraph(this, Name, false, Title); +#else + errs() << "DomTree dump not available, build with DEBUG\n"; +#endif // NDEBUG +} + +void DominatorTree::viewGraph() { +#ifndef NDEBUG + this->viewGraph("domtree", "Dominator Tree for function"); +#else + errs() << "DomTree dump not available, build with DEBUG\n"; +#endif // NDEBUG +} + namespace { struct DominatorTreeWrapperPassAnalysisGraphTraits { static DominatorTree *getGraph(DominatorTreeWrapperPass *DTWP) { diff --git a/lib/Analysis/IVUsers.cpp b/lib/Analysis/IVUsers.cpp index fde805a..c30feb9 100644 --- a/lib/Analysis/IVUsers.cpp +++ b/lib/Analysis/IVUsers.cpp @@ -253,18 +253,8 @@ bool IVUsers::AddUsersImpl(Instruction *I, const SCEV *OriginalISE = ISE; auto NormalizePred = [&](const SCEVAddRecExpr *AR) { - // We only allow affine AddRecs to be normalized, otherwise we would not - // be able to correctly denormalize. - // e.g. {1,+,3,+,2} == {-2,+,1,+,2} + {3,+,2} - // Normalized form: {-2,+,1,+,2} - // Denormalized form: {1,+,3,+,2} - // - // However, denormalization would use a different step expression than - // normalization (see getPostIncExpr), generating the wrong final - // expression: {-2,+,1,+,2} + {1,+,2} => {-1,+,3,+,2} auto *L = AR->getLoop(); - bool Result = - AR->isAffine() && IVUseShouldUsePostIncValue(User, I, L, DT); + bool Result = IVUseShouldUsePostIncValue(User, I, L, DT); if (Result) NewUse.PostIncLoops.insert(L); return Result; diff --git a/lib/Analysis/InlineCost.cpp b/lib/Analysis/InlineCost.cpp index 1f8dec2..788f908 100644 --- a/lib/Analysis/InlineCost.cpp +++ b/lib/Analysis/InlineCost.cpp @@ -1557,7 +1557,6 @@ InlineParams llvm::getInlineParams(int Threshold) { Params.ColdCallSiteThreshold = ColdCallSiteThreshold; // Set the OptMinSizeThreshold and OptSizeThreshold params only if the - // Set the OptMinSizeThreshold and OptSizeThreshold params only if the // -inlinehint-threshold commandline option is not explicitly given. If that // option is present, then its value applies even for callees with size and // minsize attributes. 
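The hunks above in lib/Analysis/ConstantFolding.cpp and lib/Analysis/DemandedBits.cpp, and the lib/Analysis/InstructionSimplify.cpp hunks that follow, all replace paired APInt KnownZero/KnownOne out-parameters with the KnownBits struct added in include/llvm/Support/KnownBits.h. The standalone C++ sketch below is not part of the patch; it models the same Zero/One bookkeeping with a plain uint64_t stand-in (MiniKnownBits and andLeavesLhsUnchanged are invented names, not LLVM API) to show the 'and' fold that ConstantFolding.cpp now writes as (Known1.One | Known0.Zero).isAllOnesValue().

// Standalone sketch, not LLVM code: the Zero/One pair that llvm::KnownBits
// bundles, modeled with uint64_t instead of APInt.
#include <cassert>
#include <cstdint>
#include <iostream>

struct MiniKnownBits {   // illustrative stand-in for llvm::KnownBits
  uint64_t Zero = 0;     // bits known to be 0
  uint64_t One = 0;      // bits known to be 1
};

// Same idea as the ConstantFolding.cpp fold above: for Op0 & Op1, if every
// bit that Op1 might clear (~Known1.One) is already known zero in Op0, the
// 'and' cannot change Op0. The patch expresses this as
// (Known1.One | Known0.Zero).isAllOnesValue().
bool andLeavesLhsUnchanged(const MiniKnownBits &Known0,
                           const MiniKnownBits &Known1) {
  return (Known1.One | Known0.Zero) == ~UINT64_C(0);
}

int main() {
  MiniKnownBits KnownX;            // x: low byte unknown, upper bits known 0
  KnownX.Zero = ~UINT64_C(0xFF);
  MiniKnownBits KnownMask;         // mask: known to be exactly 0xFF
  KnownMask.One = 0xFF;
  KnownMask.Zero = ~UINT64_C(0xFF);
  // Every bit the mask could clear in x is already zero, so x & mask == x.
  assert(andLeavesLhsUnchanged(KnownX, KnownMask));
  std::cout << std::boolalpha << andLeavesLhsUnchanged(KnownX, KnownMask)
            << '\n';               // prints: true
  return 0;
}

In the patch itself the same subset check is done on arbitrary-width APInt values, and KnownBits::getBitWidth() asserts that Zero and One always have matching widths.
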
diff --git a/lib/Analysis/InstructionSimplify.cpp b/lib/Analysis/InstructionSimplify.cpp index 2259fba..e720e3e 100644 --- a/lib/Analysis/InstructionSimplify.cpp +++ b/lib/Analysis/InstructionSimplify.cpp @@ -35,6 +35,7 @@ #include "llvm/IR/Operator.h" #include "llvm/IR/PatternMatch.h" #include "llvm/IR/ValueHandle.h" +#include "llvm/Support/KnownBits.h" #include using namespace llvm; using namespace llvm::PatternMatch; @@ -46,34 +47,19 @@ enum { RecursionLimit = 3 }; STATISTIC(NumExpand, "Number of expansions"); STATISTIC(NumReassoc, "Number of reassociations"); -namespace { -struct Query { - const DataLayout &DL; - const TargetLibraryInfo *TLI; - const DominatorTree *DT; - AssumptionCache *AC; - const Instruction *CxtI; - - Query(const DataLayout &DL, const TargetLibraryInfo *tli, - const DominatorTree *dt, AssumptionCache *ac = nullptr, - const Instruction *cxti = nullptr) - : DL(DL), TLI(tli), DT(dt), AC(ac), CxtI(cxti) {} -}; -} // end anonymous namespace - -static Value *SimplifyAndInst(Value *, Value *, const Query &, unsigned); -static Value *SimplifyBinOp(unsigned, Value *, Value *, const Query &, +static Value *SimplifyAndInst(Value *, Value *, const SimplifyQuery &, unsigned); +static Value *SimplifyBinOp(unsigned, Value *, Value *, const SimplifyQuery &, unsigned); static Value *SimplifyFPBinOp(unsigned, Value *, Value *, const FastMathFlags &, - const Query &, unsigned); -static Value *SimplifyCmpInst(unsigned, Value *, Value *, const Query &, + const SimplifyQuery &, unsigned); +static Value *SimplifyCmpInst(unsigned, Value *, Value *, const SimplifyQuery &, unsigned); static Value *SimplifyICmpInst(unsigned Predicate, Value *LHS, Value *RHS, - const Query &Q, unsigned MaxRecurse); -static Value *SimplifyOrInst(Value *, Value *, const Query &, unsigned); -static Value *SimplifyXorInst(Value *, Value *, const Query &, unsigned); + const SimplifyQuery &Q, unsigned MaxRecurse); +static Value *SimplifyOrInst(Value *, Value *, const SimplifyQuery &, unsigned); +static Value *SimplifyXorInst(Value *, Value *, const SimplifyQuery &, unsigned); static Value *SimplifyCastInst(unsigned, Value *, Type *, - const Query &, unsigned); + const SimplifyQuery &, unsigned); /// For a boolean type or a vector of boolean type, return false or a vector /// with every element false. @@ -138,7 +124,7 @@ static bool ValueDominatesPHI(Value *V, PHINode *P, const DominatorTree *DT) { /// Also performs the transform "(A op' B) op C" -> "(A op C) op' (B op C)". /// Returns the simplified value, or null if no simplification was performed. static Value *ExpandBinOp(Instruction::BinaryOps Opcode, Value *LHS, Value *RHS, - Instruction::BinaryOps OpcodeToExpand, const Query &Q, + Instruction::BinaryOps OpcodeToExpand, const SimplifyQuery &Q, unsigned MaxRecurse) { // Recursion is always used, so bail out at once if we already hit the limit. if (!MaxRecurse--) @@ -196,7 +182,7 @@ static Value *ExpandBinOp(Instruction::BinaryOps Opcode, Value *LHS, Value *RHS, /// Generic simplifications for associative binary operations. /// Returns the simpler value, or null if none was found. static Value *SimplifyAssociativeBinOp(Instruction::BinaryOps Opcode, - Value *LHS, Value *RHS, const Query &Q, + Value *LHS, Value *RHS, const SimplifyQuery &Q, unsigned MaxRecurse) { assert(Instruction::isAssociative(Opcode) && "Not an associative operation!"); @@ -295,7 +281,7 @@ static Value *SimplifyAssociativeBinOp(Instruction::BinaryOps Opcode, /// of the select results in the same value. 
Returns the common value if so, /// otherwise returns null. static Value *ThreadBinOpOverSelect(Instruction::BinaryOps Opcode, Value *LHS, - Value *RHS, const Query &Q, + Value *RHS, const SimplifyQuery &Q, unsigned MaxRecurse) { // Recursion is always used, so bail out at once if we already hit the limit. if (!MaxRecurse--) @@ -367,7 +353,7 @@ static Value *ThreadBinOpOverSelect(Instruction::BinaryOps Opcode, Value *LHS, /// comparison by seeing whether both branches of the select result in the same /// value. Returns the common value if so, otherwise returns null. static Value *ThreadCmpOverSelect(CmpInst::Predicate Pred, Value *LHS, - Value *RHS, const Query &Q, + Value *RHS, const SimplifyQuery &Q, unsigned MaxRecurse) { // Recursion is always used, so bail out at once if we already hit the limit. if (!MaxRecurse--) @@ -449,7 +435,7 @@ static Value *ThreadCmpOverSelect(CmpInst::Predicate Pred, Value *LHS, /// phi values yields the same result for every value. If so returns the common /// value, otherwise returns null. static Value *ThreadBinOpOverPHI(Instruction::BinaryOps Opcode, Value *LHS, - Value *RHS, const Query &Q, + Value *RHS, const SimplifyQuery &Q, unsigned MaxRecurse) { // Recursion is always used, so bail out at once if we already hit the limit. if (!MaxRecurse--) @@ -492,7 +478,7 @@ static Value *ThreadBinOpOverPHI(Instruction::BinaryOps Opcode, Value *LHS, /// yields the same result every time. If so returns the common result, /// otherwise returns null. static Value *ThreadCmpOverPHI(CmpInst::Predicate Pred, Value *LHS, Value *RHS, - const Query &Q, unsigned MaxRecurse) { + const SimplifyQuery &Q, unsigned MaxRecurse) { // Recursion is always used, so bail out at once if we already hit the limit. if (!MaxRecurse--) return nullptr; @@ -527,7 +513,7 @@ static Value *ThreadCmpOverPHI(CmpInst::Predicate Pred, Value *LHS, Value *RHS, static Constant *foldOrCommuteConstant(Instruction::BinaryOps Opcode, Value *&Op0, Value *&Op1, - const Query &Q) { + const SimplifyQuery &Q) { if (auto *CLHS = dyn_cast(Op0)) { if (auto *CRHS = dyn_cast(Op1)) return ConstantFoldBinaryOpOperands(Opcode, CLHS, CRHS, Q.DL); @@ -542,7 +528,7 @@ static Constant *foldOrCommuteConstant(Instruction::BinaryOps Opcode, /// Given operands for an Add, see if we can fold the result. /// If not, this returns null. static Value *SimplifyAddInst(Value *Op0, Value *Op1, bool isNSW, bool isNUW, - const Query &Q, unsigned MaxRecurse) { + const SimplifyQuery &Q, unsigned MaxRecurse) { if (Constant *C = foldOrCommuteConstant(Instruction::Add, Op0, Op1, Q)) return C; @@ -601,10 +587,15 @@ Value *llvm::SimplifyAddInst(Value *Op0, Value *Op1, bool isNSW, bool isNUW, const DataLayout &DL, const TargetLibraryInfo *TLI, const DominatorTree *DT, AssumptionCache *AC, const Instruction *CxtI) { - return ::SimplifyAddInst(Op0, Op1, isNSW, isNUW, Query(DL, TLI, DT, AC, CxtI), + return ::SimplifyAddInst(Op0, Op1, isNSW, isNUW, {DL, TLI, DT, AC, CxtI}, RecursionLimit); } +Value *llvm::SimplifyAddInst(Value *Op0, Value *Op1, bool isNSW, bool isNUW, + const SimplifyQuery &Query) { + return ::SimplifyAddInst(Op0, Op1, isNSW, isNUW, Query, RecursionLimit); +} + /// \brief Compute the base pointer and cumulative constant offsets for V. /// /// This strips all constant offsets off of V, leaving it the base pointer, and @@ -679,7 +670,7 @@ static Constant *computePointerDifference(const DataLayout &DL, Value *LHS, /// Given operands for a Sub, see if we can fold the result. /// If not, this returns null. 
static Value *SimplifySubInst(Value *Op0, Value *Op1, bool isNSW, bool isNUW, - const Query &Q, unsigned MaxRecurse) { + const SimplifyQuery &Q, unsigned MaxRecurse) { if (Constant *C = foldOrCommuteConstant(Instruction::Sub, Op0, Op1, Q)) return C; @@ -703,10 +694,9 @@ static Value *SimplifySubInst(Value *Op0, Value *Op1, bool isNSW, bool isNUW, return Op0; unsigned BitWidth = Op1->getType()->getScalarSizeInBits(); - APInt KnownZero(BitWidth, 0); - APInt KnownOne(BitWidth, 0); - computeKnownBits(Op1, KnownZero, KnownOne, Q.DL, 0, Q.AC, Q.CxtI, Q.DT); - if (KnownZero.isMaxSignedValue()) { + KnownBits Known(BitWidth); + computeKnownBits(Op1, Known, Q.DL, 0, Q.AC, Q.CxtI, Q.DT); + if (Known.Zero.isMaxSignedValue()) { // Op1 is either 0 or the minimum signed value. If the sub is NSW, then // Op1 must be 0 because negating the minimum signed value is undefined. if (isNSW) @@ -813,14 +803,19 @@ Value *llvm::SimplifySubInst(Value *Op0, Value *Op1, bool isNSW, bool isNUW, const DataLayout &DL, const TargetLibraryInfo *TLI, const DominatorTree *DT, AssumptionCache *AC, const Instruction *CxtI) { - return ::SimplifySubInst(Op0, Op1, isNSW, isNUW, Query(DL, TLI, DT, AC, CxtI), + return ::SimplifySubInst(Op0, Op1, isNSW, isNUW, {DL, TLI, DT, AC, CxtI}, RecursionLimit); } +Value *llvm::SimplifySubInst(Value *Op0, Value *Op1, bool isNSW, bool isNUW, + const SimplifyQuery &Q) { + return ::SimplifySubInst(Op0, Op1, isNSW, isNUW, Q, RecursionLimit); +} + /// Given operands for an FAdd, see if we can fold the result. If not, this /// returns null. static Value *SimplifyFAddInst(Value *Op0, Value *Op1, FastMathFlags FMF, - const Query &Q, unsigned MaxRecurse) { + const SimplifyQuery &Q, unsigned MaxRecurse) { if (Constant *C = foldOrCommuteConstant(Instruction::FAdd, Op0, Op1, Q)) return C; @@ -854,7 +849,7 @@ static Value *SimplifyFAddInst(Value *Op0, Value *Op1, FastMathFlags FMF, /// Given operands for an FSub, see if we can fold the result. If not, this /// returns null. static Value *SimplifyFSubInst(Value *Op0, Value *Op1, FastMathFlags FMF, - const Query &Q, unsigned MaxRecurse) { + const SimplifyQuery &Q, unsigned MaxRecurse) { if (Constant *C = foldOrCommuteConstant(Instruction::FSub, Op0, Op1, Q)) return C; @@ -886,7 +881,7 @@ static Value *SimplifyFSubInst(Value *Op0, Value *Op1, FastMathFlags FMF, /// Given the operands for an FMul, see if we can fold the result static Value *SimplifyFMulInst(Value *Op0, Value *Op1, FastMathFlags FMF, - const Query &Q, unsigned MaxRecurse) { + const SimplifyQuery &Q, unsigned MaxRecurse) { if (Constant *C = foldOrCommuteConstant(Instruction::FMul, Op0, Op1, Q)) return C; @@ -903,7 +898,7 @@ static Value *SimplifyFMulInst(Value *Op0, Value *Op1, FastMathFlags FMF, /// Given operands for a Mul, see if we can fold the result. /// If not, this returns null. 
-static Value *SimplifyMulInst(Value *Op0, Value *Op1, const Query &Q, +static Value *SimplifyMulInst(Value *Op0, Value *Op1, const SimplifyQuery &Q, unsigned MaxRecurse) { if (Constant *C = foldOrCommuteConstant(Instruction::Mul, Op0, Op1, Q)) return C; @@ -963,34 +958,52 @@ Value *llvm::SimplifyFAddInst(Value *Op0, Value *Op1, FastMathFlags FMF, const TargetLibraryInfo *TLI, const DominatorTree *DT, AssumptionCache *AC, const Instruction *CxtI) { - return ::SimplifyFAddInst(Op0, Op1, FMF, Query(DL, TLI, DT, AC, CxtI), + return ::SimplifyFAddInst(Op0, Op1, FMF, {DL, TLI, DT, AC, CxtI}, RecursionLimit); } +Value *llvm::SimplifyFAddInst(Value *Op0, Value *Op1, FastMathFlags FMF, + const SimplifyQuery &Q) { + return ::SimplifyFAddInst(Op0, Op1, FMF, Q, RecursionLimit); +} + Value *llvm::SimplifyFSubInst(Value *Op0, Value *Op1, FastMathFlags FMF, const DataLayout &DL, const TargetLibraryInfo *TLI, const DominatorTree *DT, AssumptionCache *AC, const Instruction *CxtI) { - return ::SimplifyFSubInst(Op0, Op1, FMF, Query(DL, TLI, DT, AC, CxtI), + return ::SimplifyFSubInst(Op0, Op1, FMF, {DL, TLI, DT, AC, CxtI}, RecursionLimit); } +Value *llvm::SimplifyFSubInst(Value *Op0, Value *Op1, FastMathFlags FMF, + const SimplifyQuery &Q) { + return ::SimplifyFSubInst(Op0, Op1, FMF, Q, RecursionLimit); +} + Value *llvm::SimplifyFMulInst(Value *Op0, Value *Op1, FastMathFlags FMF, const DataLayout &DL, const TargetLibraryInfo *TLI, const DominatorTree *DT, AssumptionCache *AC, const Instruction *CxtI) { - return ::SimplifyFMulInst(Op0, Op1, FMF, Query(DL, TLI, DT, AC, CxtI), + return ::SimplifyFMulInst(Op0, Op1, FMF, {DL, TLI, DT, AC, CxtI}, RecursionLimit); } +Value *llvm::SimplifyFMulInst(Value *Op0, Value *Op1, FastMathFlags FMF, + const SimplifyQuery &Q) { + return ::SimplifyFMulInst(Op0, Op1, FMF, Q, RecursionLimit); +} + Value *llvm::SimplifyMulInst(Value *Op0, Value *Op1, const DataLayout &DL, const TargetLibraryInfo *TLI, const DominatorTree *DT, AssumptionCache *AC, const Instruction *CxtI) { - return ::SimplifyMulInst(Op0, Op1, Query(DL, TLI, DT, AC, CxtI), - RecursionLimit); + return ::SimplifyMulInst(Op0, Op1, {DL, TLI, DT, AC, CxtI}, RecursionLimit); +} + +Value *llvm::SimplifyMulInst(Value *Op0, Value *Op1, const SimplifyQuery &Q) { + return ::SimplifyMulInst(Op0, Op1, Q, RecursionLimit); } /// Check for common or similar folds of integer division or integer remainder. @@ -1047,7 +1060,7 @@ static Value *simplifyDivRem(Value *Op0, Value *Op1, bool IsDiv) { /// Given operands for an SDiv or UDiv, see if we can fold the result. /// If not, this returns null. static Value *SimplifyDiv(Instruction::BinaryOps Opcode, Value *Op0, Value *Op1, - const Query &Q, unsigned MaxRecurse) { + const SimplifyQuery &Q, unsigned MaxRecurse) { if (Constant *C = foldOrCommuteConstant(Opcode, Op0, Op1, Q)) return C; @@ -1103,7 +1116,7 @@ static Value *SimplifyDiv(Instruction::BinaryOps Opcode, Value *Op0, Value *Op1, /// Given operands for an SDiv, see if we can fold the result. /// If not, this returns null. 
-static Value *SimplifySDivInst(Value *Op0, Value *Op1, const Query &Q, +static Value *SimplifySDivInst(Value *Op0, Value *Op1, const SimplifyQuery &Q, unsigned MaxRecurse) { if (Value *V = SimplifyDiv(Instruction::SDiv, Op0, Op1, Q, MaxRecurse)) return V; @@ -1115,13 +1128,16 @@ Value *llvm::SimplifySDivInst(Value *Op0, Value *Op1, const DataLayout &DL, const TargetLibraryInfo *TLI, const DominatorTree *DT, AssumptionCache *AC, const Instruction *CxtI) { - return ::SimplifySDivInst(Op0, Op1, Query(DL, TLI, DT, AC, CxtI), - RecursionLimit); + return ::SimplifySDivInst(Op0, Op1, {DL, TLI, DT, AC, CxtI}, RecursionLimit); +} + +Value *llvm::SimplifySDivInst(Value *Op0, Value *Op1, const SimplifyQuery &Q) { + return ::SimplifySDivInst(Op0, Op1, Q, RecursionLimit); } /// Given operands for a UDiv, see if we can fold the result. /// If not, this returns null. -static Value *SimplifyUDivInst(Value *Op0, Value *Op1, const Query &Q, +static Value *SimplifyUDivInst(Value *Op0, Value *Op1, const SimplifyQuery &Q, unsigned MaxRecurse) { if (Value *V = SimplifyDiv(Instruction::UDiv, Op0, Op1, Q, MaxRecurse)) return V; @@ -1143,12 +1159,15 @@ Value *llvm::SimplifyUDivInst(Value *Op0, Value *Op1, const DataLayout &DL, const TargetLibraryInfo *TLI, const DominatorTree *DT, AssumptionCache *AC, const Instruction *CxtI) { - return ::SimplifyUDivInst(Op0, Op1, Query(DL, TLI, DT, AC, CxtI), - RecursionLimit); + return ::SimplifyUDivInst(Op0, Op1, {DL, TLI, DT, AC, CxtI}, RecursionLimit); +} + +Value *llvm::SimplifyUDivInst(Value *Op0, Value *Op1, const SimplifyQuery &Q) { + return ::SimplifyUDivInst(Op0, Op1, Q, RecursionLimit); } static Value *SimplifyFDivInst(Value *Op0, Value *Op1, FastMathFlags FMF, - const Query &Q, unsigned) { + const SimplifyQuery &Q, unsigned) { if (Constant *C = foldOrCommuteConstant(Instruction::FDiv, Op0, Op1, Q)) return C; @@ -1193,14 +1212,19 @@ Value *llvm::SimplifyFDivInst(Value *Op0, Value *Op1, FastMathFlags FMF, const TargetLibraryInfo *TLI, const DominatorTree *DT, AssumptionCache *AC, const Instruction *CxtI) { - return ::SimplifyFDivInst(Op0, Op1, FMF, Query(DL, TLI, DT, AC, CxtI), + return ::SimplifyFDivInst(Op0, Op1, FMF, {DL, TLI, DT, AC, CxtI}, RecursionLimit); } +Value *llvm::SimplifyFDivInst(Value *Op0, Value *Op1, FastMathFlags FMF, + const SimplifyQuery &Q) { + return ::SimplifyFDivInst(Op0, Op1, FMF, Q, RecursionLimit); +} + /// Given operands for an SRem or URem, see if we can fold the result. /// If not, this returns null. static Value *SimplifyRem(Instruction::BinaryOps Opcode, Value *Op0, Value *Op1, - const Query &Q, unsigned MaxRecurse) { + const SimplifyQuery &Q, unsigned MaxRecurse) { if (Constant *C = foldOrCommuteConstant(Opcode, Op0, Op1, Q)) return C; @@ -1231,7 +1255,7 @@ static Value *SimplifyRem(Instruction::BinaryOps Opcode, Value *Op0, Value *Op1, /// Given operands for an SRem, see if we can fold the result. /// If not, this returns null. 
-static Value *SimplifySRemInst(Value *Op0, Value *Op1, const Query &Q, +static Value *SimplifySRemInst(Value *Op0, Value *Op1, const SimplifyQuery &Q, unsigned MaxRecurse) { if (Value *V = SimplifyRem(Instruction::SRem, Op0, Op1, Q, MaxRecurse)) return V; @@ -1243,13 +1267,16 @@ Value *llvm::SimplifySRemInst(Value *Op0, Value *Op1, const DataLayout &DL, const TargetLibraryInfo *TLI, const DominatorTree *DT, AssumptionCache *AC, const Instruction *CxtI) { - return ::SimplifySRemInst(Op0, Op1, Query(DL, TLI, DT, AC, CxtI), - RecursionLimit); + return ::SimplifySRemInst(Op0, Op1, {DL, TLI, DT, AC, CxtI}, RecursionLimit); +} + +Value *llvm::SimplifySRemInst(Value *Op0, Value *Op1, const SimplifyQuery &Q) { + return ::SimplifySRemInst(Op0, Op1, Q, RecursionLimit); } /// Given operands for a URem, see if we can fold the result. /// If not, this returns null. -static Value *SimplifyURemInst(Value *Op0, Value *Op1, const Query &Q, +static Value *SimplifyURemInst(Value *Op0, Value *Op1, const SimplifyQuery &Q, unsigned MaxRecurse) { if (Value *V = SimplifyRem(Instruction::URem, Op0, Op1, Q, MaxRecurse)) return V; @@ -1271,12 +1298,15 @@ Value *llvm::SimplifyURemInst(Value *Op0, Value *Op1, const DataLayout &DL, const TargetLibraryInfo *TLI, const DominatorTree *DT, AssumptionCache *AC, const Instruction *CxtI) { - return ::SimplifyURemInst(Op0, Op1, Query(DL, TLI, DT, AC, CxtI), - RecursionLimit); + return ::SimplifyURemInst(Op0, Op1, {DL, TLI, DT, AC, CxtI}, RecursionLimit); +} + +Value *llvm::SimplifyURemInst(Value *Op0, Value *Op1, const SimplifyQuery &Q) { + return ::SimplifyURemInst(Op0, Op1, Q, RecursionLimit); } static Value *SimplifyFRemInst(Value *Op0, Value *Op1, FastMathFlags FMF, - const Query &Q, unsigned) { + const SimplifyQuery &Q, unsigned) { if (Constant *C = foldOrCommuteConstant(Instruction::FRem, Op0, Op1, Q)) return C; @@ -1302,10 +1332,15 @@ Value *llvm::SimplifyFRemInst(Value *Op0, Value *Op1, FastMathFlags FMF, const TargetLibraryInfo *TLI, const DominatorTree *DT, AssumptionCache *AC, const Instruction *CxtI) { - return ::SimplifyFRemInst(Op0, Op1, FMF, Query(DL, TLI, DT, AC, CxtI), + return ::SimplifyFRemInst(Op0, Op1, FMF, {DL, TLI, DT, AC, CxtI}, RecursionLimit); } +Value *llvm::SimplifyFRemInst(Value *Op0, Value *Op1, FastMathFlags FMF, + const SimplifyQuery &Q) { + return ::SimplifyFRemInst(Op0, Op1, FMF, Q, RecursionLimit); +} + /// Returns true if a shift by \c Amount always yields undef. static bool isUndefShift(Value *Amount) { Constant *C = dyn_cast(Amount); @@ -1336,7 +1371,7 @@ static bool isUndefShift(Value *Amount) { /// Given operands for an Shl, LShr or AShr, see if we can fold the result. /// If not, this returns null. static Value *SimplifyShift(Instruction::BinaryOps Opcode, Value *Op0, - Value *Op1, const Query &Q, unsigned MaxRecurse) { + Value *Op1, const SimplifyQuery &Q, unsigned MaxRecurse) { if (Constant *C = foldOrCommuteConstant(Opcode, Op0, Op1, Q)) return C; @@ -1367,17 +1402,15 @@ static Value *SimplifyShift(Instruction::BinaryOps Opcode, Value *Op0, // If any bits in the shift amount make that value greater than or equal to // the number of bits in the type, the shift is undefined. 
unsigned BitWidth = Op1->getType()->getScalarSizeInBits(); - APInt KnownZero(BitWidth, 0); - APInt KnownOne(BitWidth, 0); - computeKnownBits(Op1, KnownZero, KnownOne, Q.DL, 0, Q.AC, Q.CxtI, Q.DT); - if (KnownOne.getLimitedValue() >= BitWidth) + KnownBits Known(BitWidth); + computeKnownBits(Op1, Known, Q.DL, 0, Q.AC, Q.CxtI, Q.DT); + if (Known.One.getLimitedValue() >= BitWidth) return UndefValue::get(Op0->getType()); // If all valid bits in the shift amount are known zero, the first operand is // unchanged. unsigned NumValidShiftBits = Log2_32_Ceil(BitWidth); - APInt ShiftAmountMask = APInt::getLowBitsSet(BitWidth, NumValidShiftBits); - if ((KnownZero & ShiftAmountMask) == ShiftAmountMask) + if (Known.Zero.countTrailingOnes() >= NumValidShiftBits) return Op0; return nullptr; @@ -1386,7 +1419,7 @@ static Value *SimplifyShift(Instruction::BinaryOps Opcode, Value *Op0, /// \brief Given operands for an Shl, LShr or AShr, see if we can /// fold the result. If not, this returns null. static Value *SimplifyRightShift(Instruction::BinaryOps Opcode, Value *Op0, - Value *Op1, bool isExact, const Query &Q, + Value *Op1, bool isExact, const SimplifyQuery &Q, unsigned MaxRecurse) { if (Value *V = SimplifyShift(Opcode, Op0, Op1, Q, MaxRecurse)) return V; @@ -1403,11 +1436,9 @@ static Value *SimplifyRightShift(Instruction::BinaryOps Opcode, Value *Op0, // The low bit cannot be shifted out of an exact shift if it is set. if (isExact) { unsigned BitWidth = Op0->getType()->getScalarSizeInBits(); - APInt Op0KnownZero(BitWidth, 0); - APInt Op0KnownOne(BitWidth, 0); - computeKnownBits(Op0, Op0KnownZero, Op0KnownOne, Q.DL, /*Depth=*/0, Q.AC, - Q.CxtI, Q.DT); - if (Op0KnownOne[0]) + KnownBits Op0Known(BitWidth); + computeKnownBits(Op0, Op0Known, Q.DL, /*Depth=*/0, Q.AC, Q.CxtI, Q.DT); + if (Op0Known.One[0]) return Op0; } @@ -1417,7 +1448,7 @@ static Value *SimplifyRightShift(Instruction::BinaryOps Opcode, Value *Op0, /// Given operands for an Shl, see if we can fold the result. /// If not, this returns null. static Value *SimplifyShlInst(Value *Op0, Value *Op1, bool isNSW, bool isNUW, - const Query &Q, unsigned MaxRecurse) { + const SimplifyQuery &Q, unsigned MaxRecurse) { if (Value *V = SimplifyShift(Instruction::Shl, Op0, Op1, Q, MaxRecurse)) return V; @@ -1437,14 +1468,19 @@ Value *llvm::SimplifyShlInst(Value *Op0, Value *Op1, bool isNSW, bool isNUW, const DataLayout &DL, const TargetLibraryInfo *TLI, const DominatorTree *DT, AssumptionCache *AC, const Instruction *CxtI) { - return ::SimplifyShlInst(Op0, Op1, isNSW, isNUW, Query(DL, TLI, DT, AC, CxtI), + return ::SimplifyShlInst(Op0, Op1, isNSW, isNUW, {DL, TLI, DT, AC, CxtI}, RecursionLimit); } +Value *llvm::SimplifyShlInst(Value *Op0, Value *Op1, bool isNSW, bool isNUW, + const SimplifyQuery &Q) { + return ::SimplifyShlInst(Op0, Op1, isNSW, isNUW, Q, RecursionLimit); +} + /// Given operands for an LShr, see if we can fold the result. /// If not, this returns null. 
static Value *SimplifyLShrInst(Value *Op0, Value *Op1, bool isExact, - const Query &Q, unsigned MaxRecurse) { + const SimplifyQuery &Q, unsigned MaxRecurse) { if (Value *V = SimplifyRightShift(Instruction::LShr, Op0, Op1, isExact, Q, MaxRecurse)) return V; @@ -1462,14 +1498,19 @@ Value *llvm::SimplifyLShrInst(Value *Op0, Value *Op1, bool isExact, const TargetLibraryInfo *TLI, const DominatorTree *DT, AssumptionCache *AC, const Instruction *CxtI) { - return ::SimplifyLShrInst(Op0, Op1, isExact, Query(DL, TLI, DT, AC, CxtI), + return ::SimplifyLShrInst(Op0, Op1, isExact, {DL, TLI, DT, AC, CxtI}, RecursionLimit); } +Value *llvm::SimplifyLShrInst(Value *Op0, Value *Op1, bool isExact, + const SimplifyQuery &Q) { + return ::SimplifyLShrInst(Op0, Op1, isExact, Q, RecursionLimit); +} + /// Given operands for an AShr, see if we can fold the result. /// If not, this returns null. static Value *SimplifyAShrInst(Value *Op0, Value *Op1, bool isExact, - const Query &Q, unsigned MaxRecurse) { + const SimplifyQuery &Q, unsigned MaxRecurse) { if (Value *V = SimplifyRightShift(Instruction::AShr, Op0, Op1, isExact, Q, MaxRecurse)) return V; @@ -1496,10 +1537,15 @@ Value *llvm::SimplifyAShrInst(Value *Op0, Value *Op1, bool isExact, const TargetLibraryInfo *TLI, const DominatorTree *DT, AssumptionCache *AC, const Instruction *CxtI) { - return ::SimplifyAShrInst(Op0, Op1, isExact, Query(DL, TLI, DT, AC, CxtI), + return ::SimplifyAShrInst(Op0, Op1, isExact, {DL, TLI, DT, AC, CxtI}, RecursionLimit); } +Value *llvm::SimplifyAShrInst(Value *Op0, Value *Op1, bool isExact, + const SimplifyQuery &Q) { + return ::SimplifyAShrInst(Op0, Op1, isExact, Q, RecursionLimit); +} + static Value *simplifyUnsignedRangeCheck(ICmpInst *ZeroICmp, ICmpInst *UnsignedICmp, bool IsAnd) { Value *X, *Y; @@ -1575,6 +1621,7 @@ static Value *SimplifyAndOfICmps(ICmpInst *Op0, ICmpInst *Op1) { if (Value *X = simplifyAndOfICmpsWithSameOperands(Op0, Op1)) return X; + // FIXME: This should be shared with or-of-icmps. // Look for this pattern: (icmp V, C0) & (icmp V, C1)). Type *ITy = Op0->getType(); ICmpInst::Predicate Pred0, Pred1; @@ -1584,10 +1631,16 @@ static Value *SimplifyAndOfICmps(ICmpInst *Op0, ICmpInst *Op1) { match(Op1, m_ICmp(Pred1, m_Specific(V), m_APInt(C1)))) { // Make a constant range that's the intersection of the two icmp ranges. // If the intersection is empty, we know that the result is false. - auto Range0 = ConstantRange::makeAllowedICmpRegion(Pred0, *C0); - auto Range1 = ConstantRange::makeAllowedICmpRegion(Pred1, *C1); + auto Range0 = ConstantRange::makeExactICmpRegion(Pred0, *C0); + auto Range1 = ConstantRange::makeExactICmpRegion(Pred1, *C1); if (Range0.intersectWith(Range1).isEmptySet()) return getFalse(ITy); + + // If a range is a superset of the other, the smaller set is all we need. + if (Range0.contains(Range1)) + return Op1; + if (Range1.contains(Range0)) + return Op0; } // (icmp (add V, C0), C1) & (icmp V, C0) @@ -1633,7 +1686,7 @@ static Value *SimplifyAndOfICmps(ICmpInst *Op0, ICmpInst *Op1) { /// Given operands for an And, see if we can fold the result. /// If not, this returns null. 
-static Value *SimplifyAndInst(Value *Op0, Value *Op1, const Query &Q, +static Value *SimplifyAndInst(Value *Op0, Value *Op1, const SimplifyQuery &Q, unsigned MaxRecurse) { if (Constant *C = foldOrCommuteConstant(Instruction::And, Op0, Op1, Q)) return C; @@ -1744,8 +1797,11 @@ Value *llvm::SimplifyAndInst(Value *Op0, Value *Op1, const DataLayout &DL, const TargetLibraryInfo *TLI, const DominatorTree *DT, AssumptionCache *AC, const Instruction *CxtI) { - return ::SimplifyAndInst(Op0, Op1, Query(DL, TLI, DT, AC, CxtI), - RecursionLimit); + return ::SimplifyAndInst(Op0, Op1, {DL, TLI, DT, AC, CxtI}, RecursionLimit); +} + +Value *llvm::SimplifyAndInst(Value *Op0, Value *Op1, const SimplifyQuery &Q) { + return ::SimplifyAndInst(Op0, Op1, Q, RecursionLimit); } /// Commuted variants are assumed to be handled by calling this function again @@ -1830,7 +1886,7 @@ static Value *SimplifyOrOfICmps(ICmpInst *Op0, ICmpInst *Op1) { /// Given operands for an Or, see if we can fold the result. /// If not, this returns null. -static Value *SimplifyOrInst(Value *Op0, Value *Op1, const Query &Q, +static Value *SimplifyOrInst(Value *Op0, Value *Op1, const SimplifyQuery &Q, unsigned MaxRecurse) { if (Constant *C = foldOrCommuteConstant(Instruction::Or, Op0, Op1, Q)) return C; @@ -1877,6 +1933,25 @@ static Value *SimplifyOrInst(Value *Op0, Value *Op1, const Query &Q, (A == Op0 || B == Op0)) return Constant::getAllOnesValue(Op0->getType()); + // (A & ~B) | (A ^ B) -> (A ^ B) + // (~B & A) | (A ^ B) -> (A ^ B) + // (A & ~B) | (B ^ A) -> (B ^ A) + // (~B & A) | (B ^ A) -> (B ^ A) + if (match(Op1, m_Xor(m_Value(A), m_Value(B))) && + (match(Op0, m_c_And(m_Specific(A), m_Not(m_Specific(B)))) || + match(Op0, m_c_And(m_Not(m_Specific(A)), m_Specific(B))))) + return Op1; + + // Commute the 'or' operands. + // (A ^ B) | (A & ~B) -> (A ^ B) + // (A ^ B) | (~B & A) -> (A ^ B) + // (B ^ A) | (A & ~B) -> (B ^ A) + // (B ^ A) | (~B & A) -> (B ^ A) + if (match(Op0, m_Xor(m_Value(A), m_Value(B))) && + (match(Op1, m_c_And(m_Specific(A), m_Not(m_Specific(B)))) || + match(Op1, m_c_And(m_Not(m_Specific(A)), m_Specific(B))))) + return Op0; + if (auto *ICILHS = dyn_cast(Op0)) { if (auto *ICIRHS = dyn_cast(Op1)) { if (Value *V = SimplifyOrOfICmps(ICILHS, ICIRHS)) @@ -1952,13 +2027,16 @@ Value *llvm::SimplifyOrInst(Value *Op0, Value *Op1, const DataLayout &DL, const TargetLibraryInfo *TLI, const DominatorTree *DT, AssumptionCache *AC, const Instruction *CxtI) { - return ::SimplifyOrInst(Op0, Op1, Query(DL, TLI, DT, AC, CxtI), - RecursionLimit); + return ::SimplifyOrInst(Op0, Op1, {DL, TLI, DT, AC, CxtI}, RecursionLimit); +} + +Value *llvm::SimplifyOrInst(Value *Op0, Value *Op1, const SimplifyQuery &Q) { + return ::SimplifyOrInst(Op0, Op1, Q, RecursionLimit); } /// Given operands for a Xor, see if we can fold the result. /// If not, this returns null. 
-static Value *SimplifyXorInst(Value *Op0, Value *Op1, const Query &Q, +static Value *SimplifyXorInst(Value *Op0, Value *Op1, const SimplifyQuery &Q, unsigned MaxRecurse) { if (Constant *C = foldOrCommuteConstant(Instruction::Xor, Op0, Op1, Q)) return C; @@ -2001,10 +2079,14 @@ Value *llvm::SimplifyXorInst(Value *Op0, Value *Op1, const DataLayout &DL, const TargetLibraryInfo *TLI, const DominatorTree *DT, AssumptionCache *AC, const Instruction *CxtI) { - return ::SimplifyXorInst(Op0, Op1, Query(DL, TLI, DT, AC, CxtI), - RecursionLimit); + return ::SimplifyXorInst(Op0, Op1, {DL, TLI, DT, AC, CxtI}, RecursionLimit); +} + +Value *llvm::SimplifyXorInst(Value *Op0, Value *Op1, const SimplifyQuery &Q) { + return ::SimplifyXorInst(Op0, Op1, Q, RecursionLimit); } + static Type *GetCompareTy(Value *Op) { return CmpInst::makeCmpResultType(Op->getType()); } @@ -2238,7 +2320,7 @@ computePointerICmp(const DataLayout &DL, const TargetLibraryInfo *TLI, /// Fold an icmp when its operands have i1 scalar type. static Value *simplifyICmpOfBools(CmpInst::Predicate Pred, Value *LHS, - Value *RHS, const Query &Q) { + Value *RHS, const SimplifyQuery &Q) { Type *ITy = GetCompareTy(LHS); // The return type. Type *OpTy = LHS->getType(); // The operand type. if (!OpTy->getScalarType()->isIntegerTy(1)) @@ -2301,7 +2383,7 @@ static Value *simplifyICmpOfBools(CmpInst::Predicate Pred, Value *LHS, /// Try hard to fold icmp with zero RHS because this is a common case. static Value *simplifyICmpWithZero(CmpInst::Predicate Pred, Value *LHS, - Value *RHS, const Query &Q) { + Value *RHS, const SimplifyQuery &Q) { if (!match(RHS, m_Zero())) return nullptr; @@ -2556,7 +2638,7 @@ static Value *simplifyICmpWithConstant(CmpInst::Predicate Pred, Value *LHS, } static Value *simplifyICmpWithBinOp(CmpInst::Predicate Pred, Value *LHS, - Value *RHS, const Query &Q, + Value *RHS, const SimplifyQuery &Q, unsigned MaxRecurse) { Type *ITy = GetCompareTy(LHS); // The return type. @@ -2866,7 +2948,7 @@ static Value *simplifyICmpWithBinOp(CmpInst::Predicate Pred, Value *LHS, /// Simplify integer comparisons where at least one operand of the compare /// matches an integer min/max idiom. static Value *simplifyICmpWithMinMax(CmpInst::Predicate Pred, Value *LHS, - Value *RHS, const Query &Q, + Value *RHS, const SimplifyQuery &Q, unsigned MaxRecurse) { Type *ITy = GetCompareTy(LHS); // The return type. Value *A, *B; @@ -3070,7 +3152,7 @@ static Value *simplifyICmpWithMinMax(CmpInst::Predicate Pred, Value *LHS, /// Given operands for an ICmpInst, see if we can fold the result. /// If not, this returns null. static Value *SimplifyICmpInst(unsigned Predicate, Value *LHS, Value *RHS, - const Query &Q, unsigned MaxRecurse) { + const SimplifyQuery &Q, unsigned MaxRecurse) { CmpInst::Predicate Pred = (CmpInst::Predicate)Predicate; assert(CmpInst::isIntPredicate(Pred) && "Not an integer compare!"); @@ -3342,11 +3424,10 @@ static Value *SimplifyICmpInst(unsigned Predicate, Value *LHS, Value *RHS, const APInt *RHSVal; if (match(RHS, m_APInt(RHSVal))) { unsigned BitWidth = RHSVal->getBitWidth(); - APInt LHSKnownZero(BitWidth, 0); - APInt LHSKnownOne(BitWidth, 0); - computeKnownBits(LHS, LHSKnownZero, LHSKnownOne, Q.DL, /*Depth=*/0, Q.AC, - Q.CxtI, Q.DT); - if (((LHSKnownZero & *RHSVal) != 0) || ((LHSKnownOne & ~(*RHSVal)) != 0)) + KnownBits LHSKnown(BitWidth); + computeKnownBits(LHS, LHSKnown, Q.DL, /*Depth=*/0, Q.AC, Q.CxtI, Q.DT); + if (LHSKnown.Zero.intersects(*RHSVal) || + !LHSKnown.One.isSubsetOf(*RHSVal)) return Pred == ICmpInst::ICMP_EQ ? 
ConstantInt::getFalse(ITy) : ConstantInt::getTrue(ITy); } @@ -3372,14 +3453,19 @@ Value *llvm::SimplifyICmpInst(unsigned Predicate, Value *LHS, Value *RHS, const TargetLibraryInfo *TLI, const DominatorTree *DT, AssumptionCache *AC, const Instruction *CxtI) { - return ::SimplifyICmpInst(Predicate, LHS, RHS, Query(DL, TLI, DT, AC, CxtI), + return ::SimplifyICmpInst(Predicate, LHS, RHS, {DL, TLI, DT, AC, CxtI}, RecursionLimit); } +Value *llvm::SimplifyICmpInst(unsigned Predicate, Value *LHS, Value *RHS, + const SimplifyQuery &Q) { + return ::SimplifyICmpInst(Predicate, LHS, RHS, Q, RecursionLimit); +} + /// Given operands for an FCmpInst, see if we can fold the result. /// If not, this returns null. static Value *SimplifyFCmpInst(unsigned Predicate, Value *LHS, Value *RHS, - FastMathFlags FMF, const Query &Q, + FastMathFlags FMF, const SimplifyQuery &Q, unsigned MaxRecurse) { CmpInst::Predicate Pred = (CmpInst::Predicate)Predicate; assert(CmpInst::isFPPredicate(Pred) && "Not an FP compare!"); @@ -3505,13 +3591,18 @@ Value *llvm::SimplifyFCmpInst(unsigned Predicate, Value *LHS, Value *RHS, const TargetLibraryInfo *TLI, const DominatorTree *DT, AssumptionCache *AC, const Instruction *CxtI) { - return ::SimplifyFCmpInst(Predicate, LHS, RHS, FMF, - Query(DL, TLI, DT, AC, CxtI), RecursionLimit); + return ::SimplifyFCmpInst(Predicate, LHS, RHS, FMF, {DL, TLI, DT, AC, CxtI}, + RecursionLimit); +} + +Value *llvm::SimplifyFCmpInst(unsigned Predicate, Value *LHS, Value *RHS, + FastMathFlags FMF, const SimplifyQuery &Q) { + return ::SimplifyFCmpInst(Predicate, LHS, RHS, FMF, Q, RecursionLimit); } /// See if V simplifies when its operand Op is replaced with RepOp. static const Value *SimplifyWithOpReplaced(Value *V, Value *Op, Value *RepOp, - const Query &Q, + const SimplifyQuery &Q, unsigned MaxRecurse) { // Trivial replacement. if (V == Op) @@ -3659,7 +3750,7 @@ static Value *simplifySelectWithFakeICmpEq(Value *CmpLHS, Value *TrueVal, /// Try to simplify a select instruction when its condition operand is an /// integer comparison. static Value *simplifySelectWithICmpCond(Value *CondVal, Value *TrueVal, - Value *FalseVal, const Query &Q, + Value *FalseVal, const SimplifyQuery &Q, unsigned MaxRecurse) { ICmpInst::Predicate Pred; Value *CmpLHS, *CmpRHS; @@ -3738,7 +3829,7 @@ static Value *simplifySelectWithICmpCond(Value *CondVal, Value *TrueVal, /// Given operands for a SelectInst, see if we can fold the result. /// If not, this returns null. static Value *SimplifySelectInst(Value *CondVal, Value *TrueVal, - Value *FalseVal, const Query &Q, + Value *FalseVal, const SimplifyQuery &Q, unsigned MaxRecurse) { // select true, X, Y -> X // select false, X, Y -> Y @@ -3775,14 +3866,19 @@ Value *llvm::SimplifySelectInst(Value *Cond, Value *TrueVal, Value *FalseVal, const TargetLibraryInfo *TLI, const DominatorTree *DT, AssumptionCache *AC, const Instruction *CxtI) { - return ::SimplifySelectInst(Cond, TrueVal, FalseVal, - Query(DL, TLI, DT, AC, CxtI), RecursionLimit); + return ::SimplifySelectInst(Cond, TrueVal, FalseVal, {DL, TLI, DT, AC, CxtI}, + RecursionLimit); +} + +Value *llvm::SimplifySelectInst(Value *Cond, Value *TrueVal, Value *FalseVal, + const SimplifyQuery &Q) { + return ::SimplifySelectInst(Cond, TrueVal, FalseVal, Q, RecursionLimit); } /// Given operands for an GetElementPtrInst, see if we can fold the result. /// If not, this returns null. 
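The pattern repeated throughout this hunk is mechanical: each public simplify entry point gains an overload taking a single query object, and the legacy overload merely packs its loose analysis pointers into that object before delegating to the internal implementation with the recursion limit. A condensed sketch of the shape of that refactoring (illustrative names only, not the exact LLVM declarations) is:

  // Sketch of the "bundle the analyses into one query struct" refactoring.
  // Names are placeholders; the real code uses llvm::SimplifyQuery.
  struct Query {
    const void *DL;   // DataLayout
    const void *TLI;  // TargetLibraryInfo
    const void *DT;   // DominatorTree
    void *AC;         // AssumptionCache
    const void *CxtI; // context instruction
  };

  static const unsigned RecursionLimit = 3;

  // Single internal implementation, parameterized on the query.
  static int simplifyFooImpl(int LHS, int RHS, const Query &Q,
                             unsigned MaxRecurse) {
    (void)Q;
    return MaxRecurse ? LHS + RHS : 0; // placeholder body
  }

  // New-style entry point: the caller already holds a query object.
  int simplifyFoo(int LHS, int RHS, const Query &Q) {
    return simplifyFooImpl(LHS, RHS, Q, RecursionLimit);
  }

  // Legacy entry point: pack the loose analysis pointers and delegate.
  int simplifyFoo(int LHS, int RHS, const void *DL, const void *TLI,
                  const void *DT, void *AC, const void *CxtI) {
    return simplifyFooImpl(LHS, RHS, {DL, TLI, DT, AC, CxtI}, RecursionLimit);
  }
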
static Value *SimplifyGEPInst(Type *SrcTy, ArrayRef Ops, - const Query &Q, unsigned) { + const SimplifyQuery &Q, unsigned) { // The type of the GEP pointer operand. unsigned AS = cast(Ops[0]->getType()->getScalarType())->getAddressSpace(); @@ -3896,14 +3992,18 @@ Value *llvm::SimplifyGEPInst(Type *SrcTy, ArrayRef Ops, const TargetLibraryInfo *TLI, const DominatorTree *DT, AssumptionCache *AC, const Instruction *CxtI) { - return ::SimplifyGEPInst(SrcTy, Ops, - Query(DL, TLI, DT, AC, CxtI), RecursionLimit); + return ::SimplifyGEPInst(SrcTy, Ops, {DL, TLI, DT, AC, CxtI}, RecursionLimit); +} + +Value *llvm::SimplifyGEPInst(Type *SrcTy, ArrayRef Ops, + const SimplifyQuery &Q) { + return ::SimplifyGEPInst(SrcTy, Ops, Q, RecursionLimit); } /// Given operands for an InsertValueInst, see if we can fold the result. /// If not, this returns null. static Value *SimplifyInsertValueInst(Value *Agg, Value *Val, - ArrayRef Idxs, const Query &Q, + ArrayRef Idxs, const SimplifyQuery &Q, unsigned) { if (Constant *CAgg = dyn_cast(Agg)) if (Constant *CVal = dyn_cast(Val)) @@ -3933,14 +4033,20 @@ Value *llvm::SimplifyInsertValueInst( Value *Agg, Value *Val, ArrayRef Idxs, const DataLayout &DL, const TargetLibraryInfo *TLI, const DominatorTree *DT, AssumptionCache *AC, const Instruction *CxtI) { - return ::SimplifyInsertValueInst(Agg, Val, Idxs, Query(DL, TLI, DT, AC, CxtI), + return ::SimplifyInsertValueInst(Agg, Val, Idxs, {DL, TLI, DT, AC, CxtI}, RecursionLimit); } +Value *llvm::SimplifyInsertValueInst(Value *Agg, Value *Val, + ArrayRef Idxs, + const SimplifyQuery &Q) { + return ::SimplifyInsertValueInst(Agg, Val, Idxs, Q, RecursionLimit); +} + /// Given operands for an ExtractValueInst, see if we can fold the result. /// If not, this returns null. static Value *SimplifyExtractValueInst(Value *Agg, ArrayRef Idxs, - const Query &, unsigned) { + const SimplifyQuery &, unsigned) { if (auto *CAgg = dyn_cast(Agg)) return ConstantFoldExtractValueInstruction(CAgg, Idxs); @@ -3968,13 +4074,18 @@ Value *llvm::SimplifyExtractValueInst(Value *Agg, ArrayRef Idxs, const DominatorTree *DT, AssumptionCache *AC, const Instruction *CxtI) { - return ::SimplifyExtractValueInst(Agg, Idxs, Query(DL, TLI, DT, AC, CxtI), + return ::SimplifyExtractValueInst(Agg, Idxs, {DL, TLI, DT, AC, CxtI}, RecursionLimit); } +Value *llvm::SimplifyExtractValueInst(Value *Agg, ArrayRef Idxs, + const SimplifyQuery &Q) { + return ::SimplifyExtractValueInst(Agg, Idxs, Q, RecursionLimit); +} + /// Given operands for an ExtractElementInst, see if we can fold the result. /// If not, this returns null. -static Value *SimplifyExtractElementInst(Value *Vec, Value *Idx, const Query &, +static Value *SimplifyExtractElementInst(Value *Vec, Value *Idx, const SimplifyQuery &, unsigned) { if (auto *CVec = dyn_cast(Vec)) { if (auto *CIdx = dyn_cast(Idx)) @@ -4000,12 +4111,17 @@ static Value *SimplifyExtractElementInst(Value *Vec, Value *Idx, const Query &, Value *llvm::SimplifyExtractElementInst( Value *Vec, Value *Idx, const DataLayout &DL, const TargetLibraryInfo *TLI, const DominatorTree *DT, AssumptionCache *AC, const Instruction *CxtI) { - return ::SimplifyExtractElementInst(Vec, Idx, Query(DL, TLI, DT, AC, CxtI), + return ::SimplifyExtractElementInst(Vec, Idx, {DL, TLI, DT, AC, CxtI}, RecursionLimit); } +Value *llvm::SimplifyExtractElementInst(Value *Vec, Value *Idx, + const SimplifyQuery &Q) { + return ::SimplifyExtractElementInst(Vec, Idx, Q, RecursionLimit); +} + /// See if we can fold the given phi. If not, returns null. 
-static Value *SimplifyPHINode(PHINode *PN, const Query &Q) { +static Value *SimplifyPHINode(PHINode *PN, const SimplifyQuery &Q) { // If all of the PHI's incoming values are the same then replace the PHI node // with the common value. Value *CommonValue = nullptr; @@ -4038,7 +4154,7 @@ static Value *SimplifyPHINode(PHINode *PN, const Query &Q) { } static Value *SimplifyCastInst(unsigned CastOpc, Value *Op, - Type *Ty, const Query &Q, unsigned MaxRecurse) { + Type *Ty, const SimplifyQuery &Q, unsigned MaxRecurse) { if (auto *C = dyn_cast(Op)) return ConstantFoldCastOperand(CastOpc, C, Ty, Q.DL); @@ -4076,10 +4192,15 @@ Value *llvm::SimplifyCastInst(unsigned CastOpc, Value *Op, Type *Ty, const TargetLibraryInfo *TLI, const DominatorTree *DT, AssumptionCache *AC, const Instruction *CxtI) { - return ::SimplifyCastInst(CastOpc, Op, Ty, Query(DL, TLI, DT, AC, CxtI), + return ::SimplifyCastInst(CastOpc, Op, Ty, {DL, TLI, DT, AC, CxtI}, RecursionLimit); } +Value *llvm::SimplifyCastInst(unsigned CastOpc, Value *Op, Type *Ty, + const SimplifyQuery &Q) { + return ::SimplifyCastInst(CastOpc, Op, Ty, Q, RecursionLimit); +} + /// For the given destination element of a shuffle, peek through shuffles to /// match a root vector source operand that contains that element in the same /// vector lane (ie, the same mask index), so we can eliminate the shuffle(s). @@ -4135,7 +4256,7 @@ static Value *foldIdentityShuffles(int DestElt, Value *Op0, Value *Op1, } static Value *SimplifyShuffleVectorInst(Value *Op0, Value *Op1, Constant *Mask, - Type *RetTy, const Query &Q, + Type *RetTy, const SimplifyQuery &Q, unsigned MaxRecurse) { Type *InVecTy = Op0->getType(); unsigned MaskNumElts = Mask->getType()->getVectorNumElements(); @@ -4207,8 +4328,13 @@ Value *llvm::SimplifyShuffleVectorInst( Value *Op0, Value *Op1, Constant *Mask, Type *RetTy, const DataLayout &DL, const TargetLibraryInfo *TLI, const DominatorTree *DT, AssumptionCache *AC, const Instruction *CxtI) { - return ::SimplifyShuffleVectorInst( - Op0, Op1, Mask, RetTy, Query(DL, TLI, DT, AC, CxtI), RecursionLimit); + return ::SimplifyShuffleVectorInst(Op0, Op1, Mask, RetTy, + {DL, TLI, DT, AC, CxtI}, RecursionLimit); +} + +Value *llvm::SimplifyShuffleVectorInst(Value *Op0, Value *Op1, Constant *Mask, + Type *RetTy, const SimplifyQuery &Q) { + return ::SimplifyShuffleVectorInst(Op0, Op1, Mask, RetTy, Q, RecursionLimit); } //=== Helper functions for higher up the class hierarchy. @@ -4216,7 +4342,7 @@ Value *llvm::SimplifyShuffleVectorInst( /// Given operands for a BinaryOperator, see if we can fold the result. /// If not, this returns null. static Value *SimplifyBinOp(unsigned Opcode, Value *LHS, Value *RHS, - const Query &Q, unsigned MaxRecurse) { + const SimplifyQuery &Q, unsigned MaxRecurse) { switch (Opcode) { case Instruction::Add: return SimplifyAddInst(LHS, RHS, false, false, Q, MaxRecurse); @@ -4264,7 +4390,7 @@ static Value *SimplifyBinOp(unsigned Opcode, Value *LHS, Value *RHS, /// In contrast to SimplifyBinOp, try to use FastMathFlag when folding the /// result. In case we don't need FastMathFlags, simply fall to SimplifyBinOp. 
static Value *SimplifyFPBinOp(unsigned Opcode, Value *LHS, Value *RHS, - const FastMathFlags &FMF, const Query &Q, + const FastMathFlags &FMF, const SimplifyQuery &Q, unsigned MaxRecurse) { switch (Opcode) { case Instruction::FAdd: @@ -4284,22 +4410,32 @@ Value *llvm::SimplifyBinOp(unsigned Opcode, Value *LHS, Value *RHS, const DataLayout &DL, const TargetLibraryInfo *TLI, const DominatorTree *DT, AssumptionCache *AC, const Instruction *CxtI) { - return ::SimplifyBinOp(Opcode, LHS, RHS, Query(DL, TLI, DT, AC, CxtI), + return ::SimplifyBinOp(Opcode, LHS, RHS, {DL, TLI, DT, AC, CxtI}, RecursionLimit); } +Value *llvm::SimplifyBinOp(unsigned Opcode, Value *LHS, Value *RHS, + const SimplifyQuery &Q) { + return ::SimplifyBinOp(Opcode, LHS, RHS, Q, RecursionLimit); +} + Value *llvm::SimplifyFPBinOp(unsigned Opcode, Value *LHS, Value *RHS, - const FastMathFlags &FMF, const DataLayout &DL, + FastMathFlags FMF, const DataLayout &DL, const TargetLibraryInfo *TLI, const DominatorTree *DT, AssumptionCache *AC, const Instruction *CxtI) { - return ::SimplifyFPBinOp(Opcode, LHS, RHS, FMF, Query(DL, TLI, DT, AC, CxtI), + return ::SimplifyFPBinOp(Opcode, LHS, RHS, FMF, {DL, TLI, DT, AC, CxtI}, RecursionLimit); } +Value *llvm::SimplifyFPBinOp(unsigned Opcode, Value *LHS, Value *RHS, + FastMathFlags FMF, const SimplifyQuery &Q) { + return ::SimplifyFPBinOp(Opcode, LHS, RHS, FMF, Q, RecursionLimit); +} + /// Given operands for a CmpInst, see if we can fold the result. static Value *SimplifyCmpInst(unsigned Predicate, Value *LHS, Value *RHS, - const Query &Q, unsigned MaxRecurse) { + const SimplifyQuery &Q, unsigned MaxRecurse) { if (CmpInst::isIntPredicate((CmpInst::Predicate)Predicate)) return SimplifyICmpInst(Predicate, LHS, RHS, Q, MaxRecurse); return SimplifyFCmpInst(Predicate, LHS, RHS, FastMathFlags(), Q, MaxRecurse); @@ -4309,10 +4445,15 @@ Value *llvm::SimplifyCmpInst(unsigned Predicate, Value *LHS, Value *RHS, const DataLayout &DL, const TargetLibraryInfo *TLI, const DominatorTree *DT, AssumptionCache *AC, const Instruction *CxtI) { - return ::SimplifyCmpInst(Predicate, LHS, RHS, Query(DL, TLI, DT, AC, CxtI), + return ::SimplifyCmpInst(Predicate, LHS, RHS, {DL, TLI, DT, AC, CxtI}, RecursionLimit); } +Value *llvm::SimplifyCmpInst(unsigned Predicate, Value *LHS, Value *RHS, + const SimplifyQuery &Q) { + return ::SimplifyCmpInst(Predicate, LHS, RHS, Q, RecursionLimit); +} + static bool IsIdempotent(Intrinsic::ID ID) { switch (ID) { default: return false; @@ -4403,7 +4544,7 @@ static bool maskIsAllZeroOrUndef(Value *Mask) { template static Value *SimplifyIntrinsic(Function *F, IterTy ArgBegin, IterTy ArgEnd, - const Query &Q, unsigned MaxRecurse) { + const SimplifyQuery &Q, unsigned MaxRecurse) { Intrinsic::ID IID = F->getIntrinsicID(); unsigned NumOperands = std::distance(ArgBegin, ArgEnd); @@ -4497,7 +4638,7 @@ static Value *SimplifyIntrinsic(Function *F, IterTy ArgBegin, IterTy ArgEnd, template static Value *SimplifyCall(Value *V, IterTy ArgBegin, IterTy ArgEnd, - const Query &Q, unsigned MaxRecurse) { + const SimplifyQuery &Q, unsigned MaxRecurse) { Type *Ty = V->getType(); if (PointerType *PTy = dyn_cast(Ty)) Ty = PTy->getElementType(); @@ -4535,16 +4676,26 @@ Value *llvm::SimplifyCall(Value *V, User::op_iterator ArgBegin, User::op_iterator ArgEnd, const DataLayout &DL, const TargetLibraryInfo *TLI, const DominatorTree *DT, AssumptionCache *AC, const Instruction *CxtI) { - return ::SimplifyCall(V, ArgBegin, ArgEnd, Query(DL, TLI, DT, AC, CxtI), + return ::SimplifyCall(V, ArgBegin, ArgEnd, {DL, TLI, 
DT, AC, CxtI}, RecursionLimit); } +Value *llvm::SimplifyCall(Value *V, User::op_iterator ArgBegin, + User::op_iterator ArgEnd, const SimplifyQuery &Q) { + return ::SimplifyCall(V, ArgBegin, ArgEnd, Q, RecursionLimit); +} + Value *llvm::SimplifyCall(Value *V, ArrayRef Args, const DataLayout &DL, const TargetLibraryInfo *TLI, const DominatorTree *DT, AssumptionCache *AC, const Instruction *CxtI) { - return ::SimplifyCall(V, Args.begin(), Args.end(), - Query(DL, TLI, DT, AC, CxtI), RecursionLimit); + return ::SimplifyCall(V, Args.begin(), Args.end(), {DL, TLI, DT, AC, CxtI}, + RecursionLimit); +} + +Value *llvm::SimplifyCall(Value *V, ArrayRef Args, + const SimplifyQuery &Q) { + return ::SimplifyCall(V, Args.begin(), Args.end(), Q, RecursionLimit); } /// See if we can compute a simplified version of this instruction. @@ -4553,152 +4704,141 @@ Value *llvm::SimplifyInstruction(Instruction *I, const DataLayout &DL, const TargetLibraryInfo *TLI, const DominatorTree *DT, AssumptionCache *AC, OptimizationRemarkEmitter *ORE) { + return SimplifyInstruction(I, {DL, TLI, DT, AC, I}, ORE); +} + +Value *llvm::SimplifyInstruction(Instruction *I, const SimplifyQuery &Q, + OptimizationRemarkEmitter *ORE) { Value *Result; switch (I->getOpcode()) { default: - Result = ConstantFoldInstruction(I, DL, TLI); + Result = ConstantFoldInstruction(I, Q.DL, Q.TLI); break; case Instruction::FAdd: Result = SimplifyFAddInst(I->getOperand(0), I->getOperand(1), - I->getFastMathFlags(), DL, TLI, DT, AC, I); + I->getFastMathFlags(), Q); break; case Instruction::Add: Result = SimplifyAddInst(I->getOperand(0), I->getOperand(1), cast(I)->hasNoSignedWrap(), - cast(I)->hasNoUnsignedWrap(), DL, - TLI, DT, AC, I); + cast(I)->hasNoUnsignedWrap(), Q); break; case Instruction::FSub: Result = SimplifyFSubInst(I->getOperand(0), I->getOperand(1), - I->getFastMathFlags(), DL, TLI, DT, AC, I); + I->getFastMathFlags(), Q); break; case Instruction::Sub: Result = SimplifySubInst(I->getOperand(0), I->getOperand(1), cast(I)->hasNoSignedWrap(), - cast(I)->hasNoUnsignedWrap(), DL, - TLI, DT, AC, I); + cast(I)->hasNoUnsignedWrap(), Q); break; case Instruction::FMul: Result = SimplifyFMulInst(I->getOperand(0), I->getOperand(1), - I->getFastMathFlags(), DL, TLI, DT, AC, I); + I->getFastMathFlags(), Q); break; case Instruction::Mul: - Result = - SimplifyMulInst(I->getOperand(0), I->getOperand(1), DL, TLI, DT, AC, I); + Result = SimplifyMulInst(I->getOperand(0), I->getOperand(1), Q); break; case Instruction::SDiv: - Result = SimplifySDivInst(I->getOperand(0), I->getOperand(1), DL, TLI, DT, - AC, I); + Result = SimplifySDivInst(I->getOperand(0), I->getOperand(1), Q); break; case Instruction::UDiv: - Result = SimplifyUDivInst(I->getOperand(0), I->getOperand(1), DL, TLI, DT, - AC, I); + Result = SimplifyUDivInst(I->getOperand(0), I->getOperand(1), Q); break; case Instruction::FDiv: Result = SimplifyFDivInst(I->getOperand(0), I->getOperand(1), - I->getFastMathFlags(), DL, TLI, DT, AC, I); + I->getFastMathFlags(), Q); break; case Instruction::SRem: - Result = SimplifySRemInst(I->getOperand(0), I->getOperand(1), DL, TLI, DT, - AC, I); + Result = SimplifySRemInst(I->getOperand(0), I->getOperand(1), Q); break; case Instruction::URem: - Result = SimplifyURemInst(I->getOperand(0), I->getOperand(1), DL, TLI, DT, - AC, I); + Result = SimplifyURemInst(I->getOperand(0), I->getOperand(1), Q); break; case Instruction::FRem: Result = SimplifyFRemInst(I->getOperand(0), I->getOperand(1), - I->getFastMathFlags(), DL, TLI, DT, AC, I); + I->getFastMathFlags(), Q); break; 
case Instruction::Shl: Result = SimplifyShlInst(I->getOperand(0), I->getOperand(1), cast(I)->hasNoSignedWrap(), - cast(I)->hasNoUnsignedWrap(), DL, - TLI, DT, AC, I); + cast(I)->hasNoUnsignedWrap(), Q); break; case Instruction::LShr: Result = SimplifyLShrInst(I->getOperand(0), I->getOperand(1), - cast(I)->isExact(), DL, TLI, DT, - AC, I); + cast(I)->isExact(), Q); break; case Instruction::AShr: Result = SimplifyAShrInst(I->getOperand(0), I->getOperand(1), - cast(I)->isExact(), DL, TLI, DT, - AC, I); + cast(I)->isExact(), Q); break; case Instruction::And: - Result = - SimplifyAndInst(I->getOperand(0), I->getOperand(1), DL, TLI, DT, AC, I); + Result = SimplifyAndInst(I->getOperand(0), I->getOperand(1), Q); break; case Instruction::Or: - Result = - SimplifyOrInst(I->getOperand(0), I->getOperand(1), DL, TLI, DT, AC, I); + Result = SimplifyOrInst(I->getOperand(0), I->getOperand(1), Q); break; case Instruction::Xor: - Result = - SimplifyXorInst(I->getOperand(0), I->getOperand(1), DL, TLI, DT, AC, I); + Result = SimplifyXorInst(I->getOperand(0), I->getOperand(1), Q); break; case Instruction::ICmp: - Result = - SimplifyICmpInst(cast(I)->getPredicate(), I->getOperand(0), - I->getOperand(1), DL, TLI, DT, AC, I); + Result = SimplifyICmpInst(cast(I)->getPredicate(), + I->getOperand(0), I->getOperand(1), Q); break; case Instruction::FCmp: - Result = SimplifyFCmpInst(cast(I)->getPredicate(), - I->getOperand(0), I->getOperand(1), - I->getFastMathFlags(), DL, TLI, DT, AC, I); + Result = + SimplifyFCmpInst(cast(I)->getPredicate(), I->getOperand(0), + I->getOperand(1), I->getFastMathFlags(), Q); break; case Instruction::Select: Result = SimplifySelectInst(I->getOperand(0), I->getOperand(1), - I->getOperand(2), DL, TLI, DT, AC, I); + I->getOperand(2), Q); break; case Instruction::GetElementPtr: { - SmallVector Ops(I->op_begin(), I->op_end()); + SmallVector Ops(I->op_begin(), I->op_end()); Result = SimplifyGEPInst(cast(I)->getSourceElementType(), - Ops, DL, TLI, DT, AC, I); + Ops, Q); break; } case Instruction::InsertValue: { InsertValueInst *IV = cast(I); Result = SimplifyInsertValueInst(IV->getAggregateOperand(), IV->getInsertedValueOperand(), - IV->getIndices(), DL, TLI, DT, AC, I); + IV->getIndices(), Q); break; } case Instruction::ExtractValue: { auto *EVI = cast(I); Result = SimplifyExtractValueInst(EVI->getAggregateOperand(), - EVI->getIndices(), DL, TLI, DT, AC, I); + EVI->getIndices(), Q); break; } case Instruction::ExtractElement: { auto *EEI = cast(I); - Result = SimplifyExtractElementInst( - EEI->getVectorOperand(), EEI->getIndexOperand(), DL, TLI, DT, AC, I); + Result = SimplifyExtractElementInst(EEI->getVectorOperand(), + EEI->getIndexOperand(), Q); break; } case Instruction::ShuffleVector: { auto *SVI = cast(I); Result = SimplifyShuffleVectorInst(SVI->getOperand(0), SVI->getOperand(1), - SVI->getMask(), SVI->getType(), DL, TLI, - DT, AC, I); + SVI->getMask(), SVI->getType(), Q); break; } case Instruction::PHI: - Result = SimplifyPHINode(cast(I), Query(DL, TLI, DT, AC, I)); + Result = SimplifyPHINode(cast(I), Q); break; case Instruction::Call: { CallSite CS(cast(I)); - Result = SimplifyCall(CS.getCalledValue(), CS.arg_begin(), CS.arg_end(), DL, - TLI, DT, AC, I); + Result = SimplifyCall(CS.getCalledValue(), CS.arg_begin(), CS.arg_end(), Q); break; } #define HANDLE_CAST_INST(num, opc, clas) case Instruction::opc: #include "llvm/IR/Instruction.def" #undef HANDLE_CAST_INST - Result = SimplifyCastInst(I->getOpcode(), I->getOperand(0), I->getType(), - DL, TLI, DT, AC, I); + Result = + 
SimplifyCastInst(I->getOpcode(), I->getOperand(0), I->getType(), Q); break; case Instruction::Alloca: // No simplifications for Alloca and it can't be constant folded. @@ -4710,11 +4850,10 @@ Value *llvm::SimplifyInstruction(Instruction *I, const DataLayout &DL, // value even when the operands are not all constants. if (!Result && I->getType()->isIntOrIntVectorTy()) { unsigned BitWidth = I->getType()->getScalarSizeInBits(); - APInt KnownZero(BitWidth, 0); - APInt KnownOne(BitWidth, 0); - computeKnownBits(I, KnownZero, KnownOne, DL, /*Depth*/0, AC, I, DT, ORE); - if ((KnownZero | KnownOne).isAllOnesValue()) - Result = ConstantInt::get(I->getType(), KnownOne); + KnownBits Known(BitWidth); + computeKnownBits(I, Known, Q.DL, /*Depth*/ 0, Q.AC, I, Q.DT, ORE); + if ((Known.Zero | Known.One).isAllOnesValue()) + Result = ConstantInt::get(I->getType(), Known.One); } /// If called on unreachable code, the above logic may report that the diff --git a/lib/Analysis/Lint.cpp b/lib/Analysis/Lint.cpp index 2ca46b1..0f04af5 100644 --- a/lib/Analysis/Lint.cpp +++ b/lib/Analysis/Lint.cpp @@ -70,6 +70,7 @@ #include "llvm/Pass.h" #include "llvm/Support/Casting.h" #include "llvm/Support/Debug.h" +#include "llvm/Support/KnownBits.h" #include "llvm/Support/MathExtras.h" #include "llvm/Support/raw_ostream.h" #include @@ -534,10 +535,9 @@ static bool isZero(Value *V, const DataLayout &DL, DominatorTree *DT, VectorType *VecTy = dyn_cast(V->getType()); if (!VecTy) { unsigned BitWidth = V->getType()->getIntegerBitWidth(); - APInt KnownZero(BitWidth, 0), KnownOne(BitWidth, 0); - computeKnownBits(V, KnownZero, KnownOne, DL, 0, AC, - dyn_cast(V), DT); - return KnownZero.isAllOnesValue(); + KnownBits Known(BitWidth); + computeKnownBits(V, Known, DL, 0, AC, dyn_cast(V), DT); + return Known.Zero.isAllOnesValue(); } // Per-component check doesn't work with zeroinitializer @@ -556,9 +556,9 @@ static bool isZero(Value *V, const DataLayout &DL, DominatorTree *DT, if (isa(Elem)) return true; - APInt KnownZero(BitWidth, 0), KnownOne(BitWidth, 0); - computeKnownBits(Elem, KnownZero, KnownOne, DL); - if (KnownZero.isAllOnesValue()) + KnownBits Known(BitWidth); + computeKnownBits(Elem, Known, DL); + if (Known.Zero.isAllOnesValue()) return true; } diff --git a/lib/Analysis/MemorySSAUpdater.cpp b/lib/Analysis/MemorySSAUpdater.cpp index c63677f..da5c79a 100644 --- a/lib/Analysis/MemorySSAUpdater.cpp +++ b/lib/Analysis/MemorySSAUpdater.cpp @@ -29,7 +29,7 @@ #define DEBUG_TYPE "memoryssa" using namespace llvm; -namespace llvm { + // This is the marker algorithm from "Simple and Efficient Construction of // Static Single Assignment Form" // The simple, non-marker algorithm places phi nodes at any join @@ -211,8 +211,8 @@ void MemorySSAUpdater::insertUse(MemoryUse *MU) { } // Set every incoming edge {BB, MP->getBlock()} of MemoryPhi MP to NewDef. -void setMemoryPhiValueForBlock(MemoryPhi *MP, const BasicBlock *BB, - MemoryAccess *NewDef) { +static void setMemoryPhiValueForBlock(MemoryPhi *MP, const BasicBlock *BB, + MemoryAccess *NewDef) { // Replace any operand with us an incoming block with the new defining // access. 
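The fallback above turns a value into a constant whenever every bit is accounted for: if the union of the known-zero and known-one masks covers the whole width, the value is fully determined and equals the known-one mask. A standalone illustration with plain 8-bit masks (independent of the LLVM KnownBits class) is:

  #include <cassert>
  #include <cstdint>
  #include <optional>

  // If every bit of an 8-bit value is either known-zero or known-one,
  // the value is fully determined and equals the known-one mask.
  static std::optional<uint8_t> foldToConstant(uint8_t KnownZero,
                                               uint8_t KnownOne) {
    if (static_cast<uint8_t>(KnownZero | KnownOne) == 0xFF)
      return KnownOne;
    return std::nullopt;
  }

  int main() {
    // All bits known: the value can only be 0b00000101.
    assert(foldToConstant(0xFA, 0x05).value() == 0x05);
    // Bit 7 unknown: no constant can be produced.
    assert(!foldToConstant(0x7A, 0x05).has_value());
    return 0;
  }
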
int i = MP->getBasicBlockIndex(BB); @@ -415,6 +415,7 @@ static MemoryAccess *onlySingleValue(MemoryPhi *MP) { } return MA; } + void MemorySSAUpdater::removeMemoryAccess(MemoryAccess *MA) { assert(!MSSA->isLiveOnEntryDef(MA) && "Trying to remove the live on entry def"); @@ -490,5 +491,3 @@ MemoryUseOrDef *MemorySSAUpdater::createMemoryAccessAfter( ++InsertPt->getIterator()); return NewAccess; } - -} // namespace llvm diff --git a/lib/Analysis/ScalarEvolution.cpp b/lib/Analysis/ScalarEvolution.cpp index 700c383..3ac4bf1 100644 --- a/lib/Analysis/ScalarEvolution.cpp +++ b/lib/Analysis/ScalarEvolution.cpp @@ -89,6 +89,7 @@ #include "llvm/Support/CommandLine.h" #include "llvm/Support/Debug.h" #include "llvm/Support/ErrorHandling.h" +#include "llvm/Support/KnownBits.h" #include "llvm/Support/MathExtras.h" #include "llvm/Support/raw_ostream.h" #include "llvm/Support/SaveAndRestore.h" @@ -4575,10 +4576,10 @@ uint32_t ScalarEvolution::GetMinTrailingZerosImpl(const SCEV *S) { if (const SCEVUnknown *U = dyn_cast(S)) { // For a SCEVUnknown, ask ValueTracking. unsigned BitWidth = getTypeSizeInBits(U->getType()); - APInt Zeros(BitWidth, 0), Ones(BitWidth, 0); - computeKnownBits(U->getValue(), Zeros, Ones, getDataLayout(), 0, &AC, + KnownBits Known(BitWidth); + computeKnownBits(U->getValue(), Known, getDataLayout(), 0, &AC, nullptr, &DT); - return Zeros.countTrailingOnes(); + return Known.Zero.countTrailingOnes(); } // SCEVUDivExpr @@ -4757,11 +4758,12 @@ ScalarEvolution::getRange(const SCEV *S, const DataLayout &DL = getDataLayout(); if (SignHint == ScalarEvolution::HINT_RANGE_UNSIGNED) { // For a SCEVUnknown, ask ValueTracking. - APInt Zeros(BitWidth, 0), Ones(BitWidth, 0); - computeKnownBits(U->getValue(), Zeros, Ones, DL, 0, &AC, nullptr, &DT); - if (Ones != ~Zeros + 1) + KnownBits Known(BitWidth); + computeKnownBits(U->getValue(), Known, DL, 0, &AC, nullptr, &DT); + if (Known.One != ~Known.Zero + 1) ConservativeResult = - ConservativeResult.intersectWith(ConstantRange(Ones, ~Zeros + 1)); + ConservativeResult.intersectWith(ConstantRange(Known.One, + ~Known.Zero + 1)); } else { assert(SignHint == ScalarEvolution::HINT_RANGE_SIGNED && "generalize as needed!"); @@ -5292,13 +5294,13 @@ const SCEV *ScalarEvolution::createSCEV(Value *V) { unsigned LZ = A.countLeadingZeros(); unsigned TZ = A.countTrailingZeros(); unsigned BitWidth = A.getBitWidth(); - APInt KnownZero(BitWidth, 0), KnownOne(BitWidth, 0); - computeKnownBits(BO->LHS, KnownZero, KnownOne, getDataLayout(), + KnownBits Known(BitWidth); + computeKnownBits(BO->LHS, Known, getDataLayout(), 0, &AC, nullptr, &DT); APInt EffectiveMask = APInt::getLowBitsSet(BitWidth, BitWidth - LZ - TZ).shl(TZ); - if ((LZ != 0 || TZ != 0) && !((~A & ~KnownZero) & EffectiveMask)) { + if ((LZ != 0 || TZ != 0) && !((~A & ~Known.Zero) & EffectiveMask)) { const SCEV *MulCount = getConstant(APInt::getOneBitSet(BitWidth, TZ)); const SCEV *LHS = getSCEV(BO->LHS); const SCEV *ShiftedLHS = nullptr; @@ -5328,12 +5330,28 @@ const SCEV *ScalarEvolution::createSCEV(Value *V) { break; case Instruction::Or: - // Use ValueTracking to check whether this is actually an add. - if (haveNoCommonBitsSet(BO->LHS, BO->RHS, getDataLayout(), &AC, - nullptr, &DT)) { - // There aren't any common bits set, so the add can't wrap. - auto Flags = SCEV::NoWrapFlags(SCEV::FlagNUW | SCEV::FlagNSW); - return getAddExpr(getSCEV(BO->LHS), getSCEV(BO->RHS), Flags); + // If the RHS of the Or is a constant, we may have something like: + // X*4+1 which got turned into X*4|1. 
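The ConstantRange built from Known above encodes a simple fact: with the unknown bits free to take any value, the smallest unsigned value compatible with the known bits is Known.One (all unknown bits cleared) and the largest is ~Known.Zero (all unknown bits set), giving the half-open range [Known.One, ~Known.Zero + 1). A brute-force check of those bounds on 8-bit values, using plain masks rather than the LLVM classes, is:

  #include <cassert>
  #include <cstdint>

  int main() {
    // Example known bits: bit 0 known one, bit 7 known zero, rest unknown.
    const uint8_t KnownZero = 0x80;
    const uint8_t KnownOne = 0x01;

    uint8_t Min = 0xFF, Max = 0x00;
    for (unsigned V = 0; V < 256; ++V) {
      uint8_t X = static_cast<uint8_t>(V);
      // Keep only values consistent with the known bits.
      if ((X & KnownZero) != 0 || (X & KnownOne) != KnownOne)
        continue;
      if (X < Min) Min = X;
      if (X > Max) Max = X;
    }

    // Smallest consistent value: unknown bits all 0 -> KnownOne.
    assert(Min == KnownOne);
    // Largest consistent value: unknown bits all 1 -> ~KnownZero.
    assert(Max == static_cast<uint8_t>(~KnownZero));
    return 0;
  }
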
Handle this as an Add so loop + // optimizations will transparently handle this case. + // + // In order for this transformation to be safe, the LHS must be of the + // form X*(2^n) and the Or constant must be less than 2^n. + if (ConstantInt *CI = dyn_cast(BO->RHS)) { + const SCEV *LHS = getSCEV(BO->LHS); + const APInt &CIVal = CI->getValue(); + if (GetMinTrailingZeros(LHS) >= + (CIVal.getBitWidth() - CIVal.countLeadingZeros())) { + // Build a plain add SCEV. + const SCEV *S = getAddExpr(LHS, getSCEV(CI)); + // If the LHS of the add was an addrec and it has no-wrap flags, + // transfer the no-wrap flags, since an or won't introduce a wrap. + if (const SCEVAddRecExpr *NewAR = dyn_cast(S)) { + const SCEVAddRecExpr *OldAR = cast(LHS); + const_cast(NewAR)->setNoWrapFlags( + OldAR->getNoWrapFlags()); + } + return S; + } } break; @@ -6063,24 +6081,74 @@ ScalarEvolution::computeExitLimit(const Loop *L, BasicBlock *ExitingBlock, return getCouldNotCompute(); } -ScalarEvolution::ExitLimit -ScalarEvolution::computeExitLimitFromCond(const Loop *L, - Value *ExitCond, - BasicBlock *TBB, - BasicBlock *FBB, - bool ControlsExit, - bool AllowPredicates) { +ScalarEvolution::ExitLimit ScalarEvolution::computeExitLimitFromCond( + const Loop *L, Value *ExitCond, BasicBlock *TBB, BasicBlock *FBB, + bool ControlsExit, bool AllowPredicates) { + ScalarEvolution::ExitLimitCacheTy Cache(L, TBB, FBB, AllowPredicates); + return computeExitLimitFromCondCached(Cache, L, ExitCond, TBB, FBB, + ControlsExit, AllowPredicates); +} + +Optional +ScalarEvolution::ExitLimitCache::find(const Loop *L, Value *ExitCond, + BasicBlock *TBB, BasicBlock *FBB, + bool ControlsExit, bool AllowPredicates) { + (void)this->L; + (void)this->TBB; + (void)this->FBB; + (void)this->AllowPredicates; + + assert(this->L == L && this->TBB == TBB && this->FBB == FBB && + this->AllowPredicates == AllowPredicates && + "Variance in assumed invariant key components!"); + auto Itr = TripCountMap.find({ExitCond, ControlsExit}); + if (Itr == TripCountMap.end()) + return None; + return Itr->second; +} + +void ScalarEvolution::ExitLimitCache::insert(const Loop *L, Value *ExitCond, + BasicBlock *TBB, BasicBlock *FBB, + bool ControlsExit, + bool AllowPredicates, + const ExitLimit &EL) { + assert(this->L == L && this->TBB == TBB && this->FBB == FBB && + this->AllowPredicates == AllowPredicates && + "Variance in assumed invariant key components!"); + + auto InsertResult = TripCountMap.insert({{ExitCond, ControlsExit}, EL}); + assert(InsertResult.second && "Expected successful insertion!"); + (void)InsertResult; +} + +ScalarEvolution::ExitLimit ScalarEvolution::computeExitLimitFromCondCached( + ExitLimitCacheTy &Cache, const Loop *L, Value *ExitCond, BasicBlock *TBB, + BasicBlock *FBB, bool ControlsExit, bool AllowPredicates) { + + if (auto MaybeEL = + Cache.find(L, ExitCond, TBB, FBB, ControlsExit, AllowPredicates)) + return *MaybeEL; + + ExitLimit EL = computeExitLimitFromCondImpl(Cache, L, ExitCond, TBB, FBB, + ControlsExit, AllowPredicates); + Cache.insert(L, ExitCond, TBB, FBB, ControlsExit, AllowPredicates, EL); + return EL; +} + +ScalarEvolution::ExitLimit ScalarEvolution::computeExitLimitFromCondImpl( + ExitLimitCacheTy &Cache, const Loop *L, Value *ExitCond, BasicBlock *TBB, + BasicBlock *FBB, bool ControlsExit, bool AllowPredicates) { // Check if the controlling expression for this loop is an And or Or. if (BinaryOperator *BO = dyn_cast(ExitCond)) { if (BO->getOpcode() == Instruction::And) { // Recurse on the operands of the and. 
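The guard above is the familiar "or is add when the bits cannot overlap" condition: if the left operand is known to have at least n trailing zero bits and the constant fits in those n bits, then X | C and X + C agree, so the or can be modeled as an add. A small standalone check of that equivalence (no SCEV involved, just unsigned arithmetic) is:

  #include <cassert>
  #include <cstdint>

  int main() {
    const unsigned N = 3;                 // LHS known to be a multiple of 2^N
    for (uint32_t X = 0; X < 1024; X += (1u << N)) {
      for (uint32_t C = 0; C < (1u << N); ++C) {
        // The constant occupies only the low N bits, which are zero in X,
        // so no carry can be generated and or == add.
        assert((X | C) == (X + C));
      }
    }
    return 0;
  }
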
bool EitherMayExit = L->contains(TBB); - ExitLimit EL0 = computeExitLimitFromCond(L, BO->getOperand(0), TBB, FBB, - ControlsExit && !EitherMayExit, - AllowPredicates); - ExitLimit EL1 = computeExitLimitFromCond(L, BO->getOperand(1), TBB, FBB, - ControlsExit && !EitherMayExit, - AllowPredicates); + ExitLimit EL0 = computeExitLimitFromCondCached( + Cache, L, BO->getOperand(0), TBB, FBB, ControlsExit && !EitherMayExit, + AllowPredicates); + ExitLimit EL1 = computeExitLimitFromCondCached( + Cache, L, BO->getOperand(1), TBB, FBB, ControlsExit && !EitherMayExit, + AllowPredicates); const SCEV *BECount = getCouldNotCompute(); const SCEV *MaxBECount = getCouldNotCompute(); if (EitherMayExit) { @@ -6124,12 +6192,12 @@ ScalarEvolution::computeExitLimitFromCond(const Loop *L, if (BO->getOpcode() == Instruction::Or) { // Recurse on the operands of the or. bool EitherMayExit = L->contains(FBB); - ExitLimit EL0 = computeExitLimitFromCond(L, BO->getOperand(0), TBB, FBB, - ControlsExit && !EitherMayExit, - AllowPredicates); - ExitLimit EL1 = computeExitLimitFromCond(L, BO->getOperand(1), TBB, FBB, - ControlsExit && !EitherMayExit, - AllowPredicates); + ExitLimit EL0 = computeExitLimitFromCondCached( + Cache, L, BO->getOperand(0), TBB, FBB, ControlsExit && !EitherMayExit, + AllowPredicates); + ExitLimit EL1 = computeExitLimitFromCondCached( + Cache, L, BO->getOperand(1), TBB, FBB, ControlsExit && !EitherMayExit, + AllowPredicates); const SCEV *BECount = getCouldNotCompute(); const SCEV *MaxBECount = getCouldNotCompute(); if (EitherMayExit) { @@ -10221,84 +10289,75 @@ void ScalarEvolution::forgetMemoizedResults(const SCEV *S) { RemoveSCEVFromBackedgeMap(PredicatedBackedgeTakenCounts); } -typedef DenseMap VerifyMap; +void ScalarEvolution::verify() const { + ScalarEvolution &SE = *const_cast(this); + ScalarEvolution SE2(F, TLI, AC, DT, LI); + + SmallVector LoopStack(LI.begin(), LI.end()); -/// replaceSubString - Replaces all occurrences of From in Str with To. -static void replaceSubString(std::string &Str, StringRef From, StringRef To) { - size_t Pos = 0; - while ((Pos = Str.find(From, Pos)) != std::string::npos) { - Str.replace(Pos, From.size(), To.data(), To.size()); - Pos += To.size(); - } -} + // Map's SCEV expressions from one ScalarEvolution "universe" to another. + struct SCEVMapper : public SCEVRewriteVisitor { + const SCEV *visitConstant(const SCEVConstant *Constant) { + return SE.getConstant(Constant->getAPInt()); + } + const SCEV *visitUnknown(const SCEVUnknown *Expr) { + return SE.getUnknown(Expr->getValue()); + } -/// getLoopBackedgeTakenCounts - Helper method for verifyAnalysis. -static void -getLoopBackedgeTakenCounts(Loop *L, VerifyMap &Map, ScalarEvolution &SE) { - std::string &S = Map[L]; - if (S.empty()) { - raw_string_ostream OS(S); - SE.getBackedgeTakenCount(L)->print(OS); + const SCEV *visitCouldNotCompute(const SCEVCouldNotCompute *Expr) { + return SE.getCouldNotCompute(); + } + SCEVMapper(ScalarEvolution &SE) : SCEVRewriteVisitor(SE) {} + }; - // false and 0 are semantically equivalent. This can happen in dead loops. - replaceSubString(OS.str(), "false", "0"); - // Remove wrap flags, their use in SCEV is highly fragile. - // FIXME: Remove this when SCEV gets smarter about them. - replaceSubString(OS.str(), "", ""); - replaceSubString(OS.str(), "", ""); - replaceSubString(OS.str(), "", ""); - } + SCEVMapper SCM(SE2); - for (auto *R : reverse(*L)) - getLoopBackedgeTakenCounts(R, Map, SE); // recurse. 
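The caching layer threaded through these recursions is a plain memoization keyed on the components that actually vary, (ExitCond, ControlsExit), with the invariant components carried in the cache object and asserted. A generic sketch of that shape, using std::map in place of the LLVM DenseMap and a toy recursion in place of the exit-limit logic, is:

  #include <cassert>
  #include <map>
  #include <utility>

  // Toy memoized recursion: the cache is keyed on the varying inputs only;
  // anything held constant across the whole walk lives in the cache object.
  struct ToyCache {
    int Invariant;                       // asserted, never part of the key
    std::map<std::pair<int, bool>, long> Results;
  };

  static long computeImpl(ToyCache &Cache, int Cond, bool Controls);

  static long computeCached(ToyCache &Cache, int Cond, bool Controls,
                            int Invariant) {
    assert(Cache.Invariant == Invariant && "key component changed mid-walk");
    auto It = Cache.Results.find({Cond, Controls});
    if (It != Cache.Results.end())
      return It->second;
    long R = computeImpl(Cache, Cond, Controls);
    Cache.Results.insert({{Cond, Controls}, R});
    return R;
  }

  static long computeImpl(ToyCache &Cache, int Cond, bool Controls) {
    if (Cond <= 1)
      return Controls ? 1 : 0;
    // Recurse through the cached entry point, mirroring how the And/Or
    // cases above re-enter the cached exit-limit computation.
    return computeCached(Cache, Cond - 1, Controls, Cache.Invariant) +
           computeCached(Cache, Cond - 2, Controls, Cache.Invariant);
  }

  int main() {
    ToyCache Cache{42, {}};
    long A = computeCached(Cache, 20, true, 42);
    long B = computeCached(Cache, 20, true, 42); // served from the cache
    assert(A == B);
    return 0;
  }
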
-} + while (!LoopStack.empty()) { + auto *L = LoopStack.pop_back_val(); + LoopStack.insert(LoopStack.end(), L->begin(), L->end()); -void ScalarEvolution::verify() const { - ScalarEvolution &SE = *const_cast(this); + auto *CurBECount = SCM.visit( + const_cast(this)->getBackedgeTakenCount(L)); + auto *NewBECount = SE2.getBackedgeTakenCount(L); - // Gather stringified backedge taken counts for all loops using SCEV's caches. - // FIXME: It would be much better to store actual values instead of strings, - // but SCEV pointers will change if we drop the caches. - VerifyMap BackedgeDumpsOld, BackedgeDumpsNew; - for (LoopInfo::reverse_iterator I = LI.rbegin(), E = LI.rend(); I != E; ++I) - getLoopBackedgeTakenCounts(*I, BackedgeDumpsOld, SE); + if (CurBECount == SE2.getCouldNotCompute() || + NewBECount == SE2.getCouldNotCompute()) { + // NB! This situation is legal, but is very suspicious -- whatever pass + // change the loop to make a trip count go from could not compute to + // computable or vice-versa *should have* invalidated SCEV. However, we + // choose not to assert here (for now) since we don't want false + // positives. + continue; + } - // Gather stringified backedge taken counts for all loops using a fresh - // ScalarEvolution object. - ScalarEvolution SE2(F, TLI, AC, DT, LI); - for (LoopInfo::reverse_iterator I = LI.rbegin(), E = LI.rend(); I != E; ++I) - getLoopBackedgeTakenCounts(*I, BackedgeDumpsNew, SE2); - - // Now compare whether they're the same with and without caches. This allows - // verifying that no pass changed the cache. - assert(BackedgeDumpsOld.size() == BackedgeDumpsNew.size() && - "New loops suddenly appeared!"); - - for (VerifyMap::iterator OldI = BackedgeDumpsOld.begin(), - OldE = BackedgeDumpsOld.end(), - NewI = BackedgeDumpsNew.begin(); - OldI != OldE; ++OldI, ++NewI) { - assert(OldI->first == NewI->first && "Loop order changed!"); - - // Compare the stringified SCEVs. We don't care if undef backedgetaken count - // changes. - // FIXME: We currently ignore SCEV changes from/to CouldNotCompute. This - // means that a pass is buggy or SCEV has to learn a new pattern but is - // usually not harmful. - if (OldI->second != NewI->second && - OldI->second.find("undef") == std::string::npos && - NewI->second.find("undef") == std::string::npos && - OldI->second != "***COULDNOTCOMPUTE***" && - NewI->second != "***COULDNOTCOMPUTE***") { - dbgs() << "SCEVValidator: SCEV for loop '" - << OldI->first->getHeader()->getName() - << "' changed from '" << OldI->second - << "' to '" << NewI->second << "'!\n"; + if (containsUndefs(CurBECount) || containsUndefs(NewBECount)) { + // SCEV treats "undef" as an unknown but consistent value (i.e. it does + // not propagate undef aggressively). This means we can (and do) fail + // verification in cases where a transform makes the trip count of a loop + // go from "undef" to "undef+1" (say). The transform is fine, since in + // both cases the loop iterates "undef" times, but SCEV thinks we + // increased the trip count of the loop by 1 incorrectly. 
+ continue; + } + + if (SE.getTypeSizeInBits(CurBECount->getType()) > + SE.getTypeSizeInBits(NewBECount->getType())) + NewBECount = SE2.getZeroExtendExpr(NewBECount, CurBECount->getType()); + else if (SE.getTypeSizeInBits(CurBECount->getType()) < + SE.getTypeSizeInBits(NewBECount->getType())) + CurBECount = SE2.getZeroExtendExpr(CurBECount, NewBECount->getType()); + + auto *ConstantDelta = + dyn_cast(SE2.getMinusSCEV(CurBECount, NewBECount)); + + if (ConstantDelta && ConstantDelta->getAPInt() != 0) { + dbgs() << "Trip Count Changed!\n"; + dbgs() << "Old: " << *CurBECount << "\n"; + dbgs() << "New: " << *NewBECount << "\n"; + dbgs() << "Delta: " << *ConstantDelta << "\n"; std::abort(); } } - - // TODO: Verify more things. } bool ScalarEvolution::invalidate( diff --git a/lib/Analysis/ScalarEvolutionNormalization.cpp b/lib/Analysis/ScalarEvolutionNormalization.cpp index 2aaa4c1..54c44c8 100644 --- a/lib/Analysis/ScalarEvolutionNormalization.cpp +++ b/lib/Analysis/ScalarEvolutionNormalization.cpp @@ -51,40 +51,47 @@ NormalizeDenormalizeRewriter::visitAddRecExpr(const SCEVAddRecExpr *AR) { transform(AR->operands(), std::back_inserter(Operands), [&](const SCEV *Op) { return visit(Op); }); - // Conservatively use AnyWrap until/unless we need FlagNW. - const SCEV *Result = - SE.getAddRecExpr(Operands, AR->getLoop(), SCEV::FlagAnyWrap); - switch (Kind) { - case Normalize: - // We want to normalize step expression, because otherwise we might not be - // able to denormalize to the original expression. + if (!Pred(AR)) + return SE.getAddRecExpr(Operands, AR->getLoop(), SCEV::FlagAnyWrap); + + // Normalization and denormalization are fancy names for decrementing and + // incrementing a SCEV expression with respect to a set of loops. Since + // Pred(AR) has returned true, we know we need to normalize or denormalize AR + // with respect to its loop. + + if (Kind == Denormalize) { + // Denormalization / "partial increment" is essentially the same as \c + // SCEVAddRecExpr::getPostIncExpr. Here we use an explicit loop to make the + // symmetry with Normalization clear. + for (int i = 0, e = Operands.size() - 1; i < e; i++) + Operands[i] = SE.getAddExpr(Operands[i], Operands[i + 1]); + } else { + assert(Kind == Normalize && "Only two possibilities!"); + + // Normalization / "partial decrement" is a bit more subtle. Since + // incrementing a SCEV expression (in general) changes the step of the SCEV + // expression as well, we cannot use the step of the current expression. + // Instead, we have to use the step of the very expression we're trying to + // compute! + // + // We solve the issue by recursively building up the result, starting from + // the "least significant" operand in the add recurrence: // - // Here is an example what will happen if we don't normalize step: - // ORIGINAL ISE: - // {(100 /u {1,+,1}<%bb16>),+,(100 /u {1,+,1}<%bb16>)}<%bb25> - // NORMALIZED ISE: - // {((-1 * (100 /u {1,+,1}<%bb16>)) + (100 /u {0,+,1}<%bb16>)),+, - // (100 /u {0,+,1}<%bb16>)}<%bb25> - // DENORMALIZED BACK ISE: - // {((2 * (100 /u {1,+,1}<%bb16>)) + (-1 * (100 /u {2,+,1}<%bb16>))),+, - // (100 /u {1,+,1}<%bb16>)}<%bb25> - // Note that the initial value changes after normalization + - // denormalization, which isn't correct. - if (Pred(AR)) { - const SCEV *TransformedStep = visit(AR->getStepRecurrence(SE)); - Result = SE.getMinusSCEV(Result, TransformedStep); - } - break; - case Denormalize: - // Here we want to normalize step expressions for the same reasons, as - // stated above. 
- if (Pred(AR)) { - const SCEV *TransformedStep = visit(AR->getStepRecurrence(SE)); - Result = SE.getAddExpr(Result, TransformedStep); - } - break; + // Base case: + // Single operand add recurrence. It's its own normalization. + // + // N-operand case: + // {S_{N-1},+,S_{N-2},+,...,+,S_0} = S + // + // Since the step recurrence of S is {S_{N-2},+,...,+,S_0}, we know its + // normalization by induction. We subtract the normalized step + // recurrence from S_{N-1} to get the normalization of S. + + for (int i = Operands.size() - 2; i >= 0; i--) + Operands[i] = SE.getMinusSCEV(Operands[i], Operands[i + 1]); } - return Result; + + return SE.getAddRecExpr(Operands, AR->getLoop(), SCEV::FlagAnyWrap); } const SCEV *llvm::normalizeForPostIncUse(const SCEV *S, diff --git a/lib/Analysis/ValueTracking.cpp b/lib/Analysis/ValueTracking.cpp index 900a236..af964b6 100644 --- a/lib/Analysis/ValueTracking.cpp +++ b/lib/Analysis/ValueTracking.cpp @@ -38,6 +38,7 @@ #include "llvm/IR/PatternMatch.h" #include "llvm/IR/Statepoint.h" #include "llvm/Support/Debug.h" +#include "llvm/Support/KnownBits.h" #include "llvm/Support/MathExtras.h" #include #include @@ -130,15 +131,15 @@ static const Instruction *safeCxtI(const Value *V, const Instruction *CxtI) { return nullptr; } -static void computeKnownBits(const Value *V, APInt &KnownZero, APInt &KnownOne, +static void computeKnownBits(const Value *V, KnownBits &Known, unsigned Depth, const Query &Q); -void llvm::computeKnownBits(const Value *V, APInt &KnownZero, APInt &KnownOne, +void llvm::computeKnownBits(const Value *V, KnownBits &Known, const DataLayout &DL, unsigned Depth, AssumptionCache *AC, const Instruction *CxtI, const DominatorTree *DT, OptimizationRemarkEmitter *ORE) { - ::computeKnownBits(V, KnownZero, KnownOne, Depth, + ::computeKnownBits(V, Known, Depth, Query(DL, AC, safeCxtI(V, CxtI), DT, ORE)); } @@ -151,11 +152,11 @@ bool llvm::haveNoCommonBitsSet(const Value *LHS, const Value *RHS, assert(LHS->getType()->isIntOrIntVectorTy() && "LHS and RHS should be integers"); IntegerType *IT = cast(LHS->getType()->getScalarType()); - APInt LHSKnownZero(IT->getBitWidth(), 0), LHSKnownOne(IT->getBitWidth(), 0); - APInt RHSKnownZero(IT->getBitWidth(), 0), RHSKnownOne(IT->getBitWidth(), 0); - computeKnownBits(LHS, LHSKnownZero, LHSKnownOne, DL, 0, AC, CxtI, DT); - computeKnownBits(RHS, RHSKnownZero, RHSKnownOne, DL, 0, AC, CxtI, DT); - return (LHSKnownZero | RHSKnownZero).isAllOnesValue(); + KnownBits LHSKnown(IT->getBitWidth()); + KnownBits RHSKnown(IT->getBitWidth()); + computeKnownBits(LHS, LHSKnown, DL, 0, AC, CxtI, DT); + computeKnownBits(RHS, RHSKnown, DL, 0, AC, CxtI, DT); + return (LHSKnown.Zero | RHSKnown.Zero).isAllOnesValue(); } static void ComputeSignBit(const Value *V, bool &KnownZero, bool &KnownOne, @@ -252,67 +253,65 @@ unsigned llvm::ComputeNumSignBits(const Value *V, const DataLayout &DL, static void computeKnownBitsAddSub(bool Add, const Value *Op0, const Value *Op1, bool NSW, - APInt &KnownZero, APInt &KnownOne, - APInt &KnownZero2, APInt &KnownOne2, + KnownBits &KnownOut, KnownBits &Known2, unsigned Depth, const Query &Q) { - unsigned BitWidth = KnownZero.getBitWidth(); + unsigned BitWidth = KnownOut.getBitWidth(); // If an initial sequence of bits in the result is not needed, the // corresponding bits in the operands are not needed. 
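The two loops introduced here replace the old step-based rewrite with a direct recurrence on the operand list: denormalization folds each operand's successor into it from left to right, while normalization peels it back off from right to left, so every subtraction uses an already-normalized successor, exactly as the comment describes. A toy model over plain integer coefficient vectors (no SCEV objects, just the same two loops) shows the round trip is the identity in this simplified setting:

  #include <cassert>
  #include <vector>

  // "Denormalize": Ops[i] += Ops[i + 1], left to right (post-increment form).
  static void denormalize(std::vector<long> &Ops) {
    for (int i = 0, e = static_cast<int>(Ops.size()) - 1; i < e; ++i)
      Ops[i] = Ops[i] + Ops[i + 1];
  }

  // "Normalize": Ops[i] -= Ops[i + 1], right to left, so each subtraction
  // uses the already-normalized successor, mirroring the loop above.
  static void normalize(std::vector<long> &Ops) {
    for (int i = static_cast<int>(Ops.size()) - 2; i >= 0; --i)
      Ops[i] = Ops[i] - Ops[i + 1];
  }

  int main() {
    const std::vector<long> Original = {7, -3, 11, 5};
    std::vector<long> Ops = Original;

    denormalize(Ops);
    normalize(Ops);
    // The backward pass undoes the forward pass on plain coefficients.
    assert(Ops == Original);

    normalize(Ops);
    denormalize(Ops);
    assert(Ops == Original);
    return 0;
  }
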
- APInt LHSKnownZero(BitWidth, 0), LHSKnownOne(BitWidth, 0); - computeKnownBits(Op0, LHSKnownZero, LHSKnownOne, Depth + 1, Q); - computeKnownBits(Op1, KnownZero2, KnownOne2, Depth + 1, Q); + KnownBits LHSKnown(BitWidth); + computeKnownBits(Op0, LHSKnown, Depth + 1, Q); + computeKnownBits(Op1, Known2, Depth + 1, Q); // Carry in a 1 for a subtract, rather than a 0. uint64_t CarryIn = 0; if (!Add) { // Sum = LHS + ~RHS + 1 - std::swap(KnownZero2, KnownOne2); + std::swap(Known2.Zero, Known2.One); CarryIn = 1; } - APInt PossibleSumZero = ~LHSKnownZero + ~KnownZero2 + CarryIn; - APInt PossibleSumOne = LHSKnownOne + KnownOne2 + CarryIn; + APInt PossibleSumZero = ~LHSKnown.Zero + ~Known2.Zero + CarryIn; + APInt PossibleSumOne = LHSKnown.One + Known2.One + CarryIn; // Compute known bits of the carry. - APInt CarryKnownZero = ~(PossibleSumZero ^ LHSKnownZero ^ KnownZero2); - APInt CarryKnownOne = PossibleSumOne ^ LHSKnownOne ^ KnownOne2; + APInt CarryKnownZero = ~(PossibleSumZero ^ LHSKnown.Zero ^ Known2.Zero); + APInt CarryKnownOne = PossibleSumOne ^ LHSKnown.One ^ Known2.One; // Compute set of known bits (where all three relevant bits are known). - APInt LHSKnown = LHSKnownZero | LHSKnownOne; - APInt RHSKnown = KnownZero2 | KnownOne2; - APInt CarryKnown = CarryKnownZero | CarryKnownOne; - APInt Known = LHSKnown & RHSKnown & CarryKnown; + APInt LHSKnownUnion = LHSKnown.Zero | LHSKnown.One; + APInt RHSKnownUnion = Known2.Zero | Known2.One; + APInt CarryKnownUnion = CarryKnownZero | CarryKnownOne; + APInt Known = LHSKnownUnion & RHSKnownUnion & CarryKnownUnion; assert((PossibleSumZero & Known) == (PossibleSumOne & Known) && "known bits of sum differ"); // Compute known bits of the result. - KnownZero = ~PossibleSumOne & Known; - KnownOne = PossibleSumOne & Known; + KnownOut.Zero = ~PossibleSumOne & Known; + KnownOut.One = PossibleSumOne & Known; // Are we still trying to solve for the sign bit? if (!Known.isSignBitSet()) { if (NSW) { // Adding two non-negative numbers, or subtracting a negative number from // a non-negative one, can't wrap into negative. - if (LHSKnownZero.isSignBitSet() && KnownZero2.isSignBitSet()) - KnownZero.setSignBit(); + if (LHSKnown.Zero.isSignBitSet() && Known2.Zero.isSignBitSet()) + KnownOut.Zero.setSignBit(); // Adding two negative numbers, or subtracting a non-negative number from // a negative one, can't wrap into non-negative. - else if (LHSKnownOne.isSignBitSet() && KnownOne2.isSignBitSet()) - KnownOne.setSignBit(); + else if (LHSKnown.One.isSignBitSet() && Known2.One.isSignBitSet()) + KnownOut.One.setSignBit(); } } } static void computeKnownBitsMul(const Value *Op0, const Value *Op1, bool NSW, - APInt &KnownZero, APInt &KnownOne, - APInt &KnownZero2, APInt &KnownOne2, + KnownBits &Known, KnownBits &Known2, unsigned Depth, const Query &Q) { - unsigned BitWidth = KnownZero.getBitWidth(); - computeKnownBits(Op1, KnownZero, KnownOne, Depth + 1, Q); - computeKnownBits(Op0, KnownZero2, KnownOne2, Depth + 1, Q); + unsigned BitWidth = Known.getBitWidth(); + computeKnownBits(Op1, Known, Depth + 1, Q); + computeKnownBits(Op0, Known2, Depth + 1, Q); bool isKnownNegative = false; bool isKnownNonNegative = false; @@ -322,10 +321,10 @@ static void computeKnownBitsMul(const Value *Op0, const Value *Op1, bool NSW, // The product of a number with itself is non-negative. 
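The add/sub propagation above works bitwise: it forms the two extreme sums (all unknown bits favouring zero, then all favouring one), recovers the known carry bits from them, and keeps a result bit only where both operand bits and the incoming carry are known. A compact standalone version of that recipe for addition on 8-bit masks, checked by brute force against every pair of concrete values the masks allow, is:

  #include <cassert>
  #include <cstdint>

  struct Toy {
    uint8_t Zero = 0; // bits proven 0
    uint8_t One = 0;  // bits proven 1
  };

  // Known bits of L + R, following the PossibleSum/carry construction above.
  static Toy knownBitsOfAdd(Toy L, Toy R) {
    uint8_t PossibleSumZero = static_cast<uint8_t>(~L.Zero + ~R.Zero);
    uint8_t PossibleSumOne = static_cast<uint8_t>(L.One + R.One);

    uint8_t CarryKnownZero =
        static_cast<uint8_t>(~(PossibleSumZero ^ L.Zero ^ R.Zero));
    uint8_t CarryKnownOne =
        static_cast<uint8_t>(PossibleSumOne ^ L.One ^ R.One);

    // A result bit is known only where both operand bits and the carry
    // into that position are known.
    uint8_t Known = static_cast<uint8_t>((L.Zero | L.One) & (R.Zero | R.One) &
                                         (CarryKnownZero | CarryKnownOne));
    return {static_cast<uint8_t>(~PossibleSumOne & Known),
            static_cast<uint8_t>(PossibleSumOne & Known)};
  }

  int main() {
    // L: low two bits are 01; R: low two bits are 11; upper bits unknown.
    Toy L{0x02, 0x01}, R{0x00, 0x03};
    Toy Sum = knownBitsOfAdd(L, R);

    // Brute force: every concrete pair consistent with L and R must agree
    // with the bits the algorithm claims to know about the sum.
    for (unsigned A = 0; A < 256; ++A)
      for (unsigned B = 0; B < 256; ++B) {
        uint8_t a = static_cast<uint8_t>(A), b = static_cast<uint8_t>(B);
        if ((a & L.Zero) || (a & L.One) != L.One) continue;
        if ((b & R.Zero) || (b & R.One) != R.One) continue;
        uint8_t S = static_cast<uint8_t>(a + b);
        assert((S & Sum.Zero) == 0);
        assert((S & Sum.One) == Sum.One);
      }
    return 0;
  }
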
isKnownNonNegative = true; } else { - bool isKnownNonNegativeOp1 = KnownZero.isSignBitSet(); - bool isKnownNonNegativeOp0 = KnownZero2.isSignBitSet(); - bool isKnownNegativeOp1 = KnownOne.isSignBitSet(); - bool isKnownNegativeOp0 = KnownOne2.isSignBitSet(); + bool isKnownNonNegativeOp1 = Known.Zero.isSignBitSet(); + bool isKnownNonNegativeOp0 = Known2.Zero.isSignBitSet(); + bool isKnownNegativeOp1 = Known.One.isSignBitSet(); + bool isKnownNegativeOp0 = Known2.One.isSignBitSet(); // The product of two numbers with the same sign is non-negative. isKnownNonNegative = (isKnownNegativeOp1 && isKnownNegativeOp0) || (isKnownNonNegativeOp1 && isKnownNonNegativeOp0); @@ -343,28 +342,28 @@ static void computeKnownBitsMul(const Value *Op0, const Value *Op1, bool NSW, // Also compute a conservative estimate for high known-0 bits. // More trickiness is possible, but this is sufficient for the // interesting case of alignment computation. - KnownOne.clearAllBits(); - unsigned TrailZ = KnownZero.countTrailingOnes() + - KnownZero2.countTrailingOnes(); - unsigned LeadZ = std::max(KnownZero.countLeadingOnes() + - KnownZero2.countLeadingOnes(), + Known.One.clearAllBits(); + unsigned TrailZ = Known.Zero.countTrailingOnes() + + Known2.Zero.countTrailingOnes(); + unsigned LeadZ = std::max(Known.Zero.countLeadingOnes() + + Known2.Zero.countLeadingOnes(), BitWidth) - BitWidth; TrailZ = std::min(TrailZ, BitWidth); LeadZ = std::min(LeadZ, BitWidth); - KnownZero.clearAllBits(); - KnownZero.setLowBits(TrailZ); - KnownZero.setHighBits(LeadZ); + Known.Zero.clearAllBits(); + Known.Zero.setLowBits(TrailZ); + Known.Zero.setHighBits(LeadZ); // Only make use of no-wrap flags if we failed to compute the sign bit // directly. This matters if the multiplication always overflows, in // which case we prefer to follow the result of the direct computation, // though as the program is invoking undefined behaviour we can choose // whatever we like here. - if (isKnownNonNegative && !KnownOne.isSignBitSet()) - KnownZero.setSignBit(); - else if (isKnownNegative && !KnownZero.isSignBitSet()) - KnownOne.setSignBit(); + if (isKnownNonNegative && !Known.One.isSignBitSet()) + Known.Zero.setSignBit(); + else if (isKnownNegative && !Known.Zero.isSignBitSet()) + Known.One.setSignBit(); } void llvm::computeKnownBitsFromRangeMetadata(const MDNode &Ranges, @@ -499,15 +498,14 @@ bool llvm::isValidAssumeForContext(const Instruction *Inv, return !isEphemeralValueOf(Inv, CxtI); } -static void computeKnownBitsFromAssume(const Value *V, APInt &KnownZero, - APInt &KnownOne, unsigned Depth, - const Query &Q) { +static void computeKnownBitsFromAssume(const Value *V, KnownBits &Known, + unsigned Depth, const Query &Q) { // Use of assumptions is context-sensitive. If we don't have a context, we // cannot use them! if (!Q.AC || !Q.CxtI) return; - unsigned BitWidth = KnownZero.getBitWidth(); + unsigned BitWidth = Known.getBitWidth(); // Note that the patterns below need to be kept in sync with the code // in AssumptionCache::updateAffectedValues. 
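The conservative multiply estimate above keys off trailing and leading known-zero bits: if one factor has a trailing zero bits and the other has b, their product has at least a + b trailing zeros, which is the piece alignment computations care about. A small unsigned-arithmetic check of that trailing-zero rule is:

  #include <cassert>
  #include <cstdint>

  int main() {
    const unsigned A = 2, B = 3; // factors are multiples of 2^A and 2^B
    for (uint32_t X = 0; X < 256; X += (1u << A))
      for (uint32_t Y = 0; Y < 256; Y += (1u << B)) {
        uint32_t P = X * Y;
        // The product is a multiple of 2^(A+B): its low A+B bits are zero.
        assert((P & ((1u << (A + B)) - 1)) == 0);
      }
    return 0;
  }
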
@@ -532,15 +530,15 @@ static void computeKnownBitsFromAssume(const Value *V, APInt &KnownZero, if (Arg == V && isValidAssumeForContext(I, Q.CxtI, Q.DT)) { assert(BitWidth == 1 && "assume operand is not i1?"); - KnownZero.clearAllBits(); - KnownOne.setAllBits(); + Known.Zero.clearAllBits(); + Known.One.setAllBits(); return; } if (match(Arg, m_Not(m_Specific(V))) && isValidAssumeForContext(I, Q.CxtI, Q.DT)) { assert(BitWidth == 1 && "assume operand is not i1?"); - KnownZero.setAllBits(); - KnownOne.clearAllBits(); + Known.Zero.setAllBits(); + Known.One.clearAllBits(); return; } @@ -558,126 +556,126 @@ static void computeKnownBitsFromAssume(const Value *V, APInt &KnownZero, // assume(v = a) if (match(Arg, m_c_ICmp(Pred, m_V, m_Value(A))) && Pred == ICmpInst::ICMP_EQ && isValidAssumeForContext(I, Q.CxtI, Q.DT)) { - APInt RHSKnownZero(BitWidth, 0), RHSKnownOne(BitWidth, 0); - computeKnownBits(A, RHSKnownZero, RHSKnownOne, Depth+1, Query(Q, I)); - KnownZero |= RHSKnownZero; - KnownOne |= RHSKnownOne; + KnownBits RHSKnown(BitWidth); + computeKnownBits(A, RHSKnown, Depth+1, Query(Q, I)); + Known.Zero |= RHSKnown.Zero; + Known.One |= RHSKnown.One; // assume(v & b = a) } else if (match(Arg, m_c_ICmp(Pred, m_c_And(m_V, m_Value(B)), m_Value(A))) && Pred == ICmpInst::ICMP_EQ && isValidAssumeForContext(I, Q.CxtI, Q.DT)) { - APInt RHSKnownZero(BitWidth, 0), RHSKnownOne(BitWidth, 0); - computeKnownBits(A, RHSKnownZero, RHSKnownOne, Depth+1, Query(Q, I)); - APInt MaskKnownZero(BitWidth, 0), MaskKnownOne(BitWidth, 0); - computeKnownBits(B, MaskKnownZero, MaskKnownOne, Depth+1, Query(Q, I)); + KnownBits RHSKnown(BitWidth); + computeKnownBits(A, RHSKnown, Depth+1, Query(Q, I)); + KnownBits MaskKnown(BitWidth); + computeKnownBits(B, MaskKnown, Depth+1, Query(Q, I)); // For those bits in the mask that are known to be one, we can propagate // known bits from the RHS to V. - KnownZero |= RHSKnownZero & MaskKnownOne; - KnownOne |= RHSKnownOne & MaskKnownOne; + Known.Zero |= RHSKnown.Zero & MaskKnown.One; + Known.One |= RHSKnown.One & MaskKnown.One; // assume(~(v & b) = a) } else if (match(Arg, m_c_ICmp(Pred, m_Not(m_c_And(m_V, m_Value(B))), m_Value(A))) && Pred == ICmpInst::ICMP_EQ && isValidAssumeForContext(I, Q.CxtI, Q.DT)) { - APInt RHSKnownZero(BitWidth, 0), RHSKnownOne(BitWidth, 0); - computeKnownBits(A, RHSKnownZero, RHSKnownOne, Depth+1, Query(Q, I)); - APInt MaskKnownZero(BitWidth, 0), MaskKnownOne(BitWidth, 0); - computeKnownBits(B, MaskKnownZero, MaskKnownOne, Depth+1, Query(Q, I)); + KnownBits RHSKnown(BitWidth); + computeKnownBits(A, RHSKnown, Depth+1, Query(Q, I)); + KnownBits MaskKnown(BitWidth); + computeKnownBits(B, MaskKnown, Depth+1, Query(Q, I)); // For those bits in the mask that are known to be one, we can propagate // inverted known bits from the RHS to V. 
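The masked-assume case below propagates facts only through bit positions where the mask is known to be one: if v & b == a, then wherever b has a one bit, v must agree with a, so known bits of a at those positions may be copied into v. A brute-force check of that rule on byte values is:

  #include <cassert>
  #include <cstdint>

  int main() {
    for (unsigned V = 0; V < 256; ++V)
      for (unsigned B = 0; B < 256; ++B) {
        uint8_t v = static_cast<uint8_t>(V), b = static_cast<uint8_t>(B);
        uint8_t a = v & b; // the assumed equality: (v & b) == a
        // Wherever the mask b is one, v and a carry the same bit, so bits
        // of a at those positions are also bits of v.
        assert(((v ^ a) & b) == 0);
      }
    return 0;
  }
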
- KnownZero |= RHSKnownOne & MaskKnownOne; - KnownOne |= RHSKnownZero & MaskKnownOne; + Known.Zero |= RHSKnown.One & MaskKnown.One; + Known.One |= RHSKnown.Zero & MaskKnown.One; // assume(v | b = a) } else if (match(Arg, m_c_ICmp(Pred, m_c_Or(m_V, m_Value(B)), m_Value(A))) && Pred == ICmpInst::ICMP_EQ && isValidAssumeForContext(I, Q.CxtI, Q.DT)) { - APInt RHSKnownZero(BitWidth, 0), RHSKnownOne(BitWidth, 0); - computeKnownBits(A, RHSKnownZero, RHSKnownOne, Depth+1, Query(Q, I)); - APInt BKnownZero(BitWidth, 0), BKnownOne(BitWidth, 0); - computeKnownBits(B, BKnownZero, BKnownOne, Depth+1, Query(Q, I)); + KnownBits RHSKnown(BitWidth); + computeKnownBits(A, RHSKnown, Depth+1, Query(Q, I)); + KnownBits BKnown(BitWidth); + computeKnownBits(B, BKnown, Depth+1, Query(Q, I)); // For those bits in B that are known to be zero, we can propagate known // bits from the RHS to V. - KnownZero |= RHSKnownZero & BKnownZero; - KnownOne |= RHSKnownOne & BKnownZero; + Known.Zero |= RHSKnown.Zero & BKnown.Zero; + Known.One |= RHSKnown.One & BKnown.Zero; // assume(~(v | b) = a) } else if (match(Arg, m_c_ICmp(Pred, m_Not(m_c_Or(m_V, m_Value(B))), m_Value(A))) && Pred == ICmpInst::ICMP_EQ && isValidAssumeForContext(I, Q.CxtI, Q.DT)) { - APInt RHSKnownZero(BitWidth, 0), RHSKnownOne(BitWidth, 0); - computeKnownBits(A, RHSKnownZero, RHSKnownOne, Depth+1, Query(Q, I)); - APInt BKnownZero(BitWidth, 0), BKnownOne(BitWidth, 0); - computeKnownBits(B, BKnownZero, BKnownOne, Depth+1, Query(Q, I)); + KnownBits RHSKnown(BitWidth); + computeKnownBits(A, RHSKnown, Depth+1, Query(Q, I)); + KnownBits BKnown(BitWidth); + computeKnownBits(B, BKnown, Depth+1, Query(Q, I)); // For those bits in B that are known to be zero, we can propagate // inverted known bits from the RHS to V. - KnownZero |= RHSKnownOne & BKnownZero; - KnownOne |= RHSKnownZero & BKnownZero; + Known.Zero |= RHSKnown.One & BKnown.Zero; + Known.One |= RHSKnown.Zero & BKnown.Zero; // assume(v ^ b = a) } else if (match(Arg, m_c_ICmp(Pred, m_c_Xor(m_V, m_Value(B)), m_Value(A))) && Pred == ICmpInst::ICMP_EQ && isValidAssumeForContext(I, Q.CxtI, Q.DT)) { - APInt RHSKnownZero(BitWidth, 0), RHSKnownOne(BitWidth, 0); - computeKnownBits(A, RHSKnownZero, RHSKnownOne, Depth+1, Query(Q, I)); - APInt BKnownZero(BitWidth, 0), BKnownOne(BitWidth, 0); - computeKnownBits(B, BKnownZero, BKnownOne, Depth+1, Query(Q, I)); + KnownBits RHSKnown(BitWidth); + computeKnownBits(A, RHSKnown, Depth+1, Query(Q, I)); + KnownBits BKnown(BitWidth); + computeKnownBits(B, BKnown, Depth+1, Query(Q, I)); // For those bits in B that are known to be zero, we can propagate known // bits from the RHS to V. For those bits in B that are known to be one, // we can propagate inverted known bits from the RHS to V. 
- KnownZero |= RHSKnownZero & BKnownZero; - KnownOne |= RHSKnownOne & BKnownZero; - KnownZero |= RHSKnownOne & BKnownOne; - KnownOne |= RHSKnownZero & BKnownOne; + Known.Zero |= RHSKnown.Zero & BKnown.Zero; + Known.One |= RHSKnown.One & BKnown.Zero; + Known.Zero |= RHSKnown.One & BKnown.One; + Known.One |= RHSKnown.Zero & BKnown.One; // assume(~(v ^ b) = a) } else if (match(Arg, m_c_ICmp(Pred, m_Not(m_c_Xor(m_V, m_Value(B))), m_Value(A))) && Pred == ICmpInst::ICMP_EQ && isValidAssumeForContext(I, Q.CxtI, Q.DT)) { - APInt RHSKnownZero(BitWidth, 0), RHSKnownOne(BitWidth, 0); - computeKnownBits(A, RHSKnownZero, RHSKnownOne, Depth+1, Query(Q, I)); - APInt BKnownZero(BitWidth, 0), BKnownOne(BitWidth, 0); - computeKnownBits(B, BKnownZero, BKnownOne, Depth+1, Query(Q, I)); + KnownBits RHSKnown(BitWidth); + computeKnownBits(A, RHSKnown, Depth+1, Query(Q, I)); + KnownBits BKnown(BitWidth); + computeKnownBits(B, BKnown, Depth+1, Query(Q, I)); // For those bits in B that are known to be zero, we can propagate // inverted known bits from the RHS to V. For those bits in B that are // known to be one, we can propagate known bits from the RHS to V. - KnownZero |= RHSKnownOne & BKnownZero; - KnownOne |= RHSKnownZero & BKnownZero; - KnownZero |= RHSKnownZero & BKnownOne; - KnownOne |= RHSKnownOne & BKnownOne; + Known.Zero |= RHSKnown.One & BKnown.Zero; + Known.One |= RHSKnown.Zero & BKnown.Zero; + Known.Zero |= RHSKnown.Zero & BKnown.One; + Known.One |= RHSKnown.One & BKnown.One; // assume(v << c = a) } else if (match(Arg, m_c_ICmp(Pred, m_Shl(m_V, m_ConstantInt(C)), m_Value(A))) && Pred == ICmpInst::ICMP_EQ && isValidAssumeForContext(I, Q.CxtI, Q.DT)) { - APInt RHSKnownZero(BitWidth, 0), RHSKnownOne(BitWidth, 0); - computeKnownBits(A, RHSKnownZero, RHSKnownOne, Depth+1, Query(Q, I)); + KnownBits RHSKnown(BitWidth); + computeKnownBits(A, RHSKnown, Depth+1, Query(Q, I)); // For those bits in RHS that are known, we can propagate them to known // bits in V shifted to the right by C. - RHSKnownZero.lshrInPlace(C->getZExtValue()); - KnownZero |= RHSKnownZero; - RHSKnownOne.lshrInPlace(C->getZExtValue()); - KnownOne |= RHSKnownOne; + RHSKnown.Zero.lshrInPlace(C->getZExtValue()); + Known.Zero |= RHSKnown.Zero; + RHSKnown.One.lshrInPlace(C->getZExtValue()); + Known.One |= RHSKnown.One; // assume(~(v << c) = a) } else if (match(Arg, m_c_ICmp(Pred, m_Not(m_Shl(m_V, m_ConstantInt(C))), m_Value(A))) && Pred == ICmpInst::ICMP_EQ && isValidAssumeForContext(I, Q.CxtI, Q.DT)) { - APInt RHSKnownZero(BitWidth, 0), RHSKnownOne(BitWidth, 0); - computeKnownBits(A, RHSKnownZero, RHSKnownOne, Depth+1, Query(Q, I)); + KnownBits RHSKnown(BitWidth); + computeKnownBits(A, RHSKnown, Depth+1, Query(Q, I)); // For those bits in RHS that are known, we can propagate them inverted // to known bits in V shifted to the right by C. 
- RHSKnownOne.lshrInPlace(C->getZExtValue()); - KnownZero |= RHSKnownOne; - RHSKnownZero.lshrInPlace(C->getZExtValue()); - KnownOne |= RHSKnownZero; + RHSKnown.One.lshrInPlace(C->getZExtValue()); + Known.Zero |= RHSKnown.One; + RHSKnown.Zero.lshrInPlace(C->getZExtValue()); + Known.One |= RHSKnown.Zero; // assume(v >> c = a) } else if (match(Arg, m_c_ICmp(Pred, m_CombineOr(m_LShr(m_V, m_ConstantInt(C)), @@ -685,12 +683,12 @@ static void computeKnownBitsFromAssume(const Value *V, APInt &KnownZero, m_Value(A))) && Pred == ICmpInst::ICMP_EQ && isValidAssumeForContext(I, Q.CxtI, Q.DT)) { - APInt RHSKnownZero(BitWidth, 0), RHSKnownOne(BitWidth, 0); - computeKnownBits(A, RHSKnownZero, RHSKnownOne, Depth+1, Query(Q, I)); + KnownBits RHSKnown(BitWidth); + computeKnownBits(A, RHSKnown, Depth+1, Query(Q, I)); // For those bits in RHS that are known, we can propagate them to known // bits in V shifted to the right by C. - KnownZero |= RHSKnownZero << C->getZExtValue(); - KnownOne |= RHSKnownOne << C->getZExtValue(); + Known.Zero |= RHSKnown.Zero << C->getZExtValue(); + Known.One |= RHSKnown.One << C->getZExtValue(); // assume(~(v >> c) = a) } else if (match(Arg, m_c_ICmp(Pred, m_Not(m_CombineOr( m_LShr(m_V, m_ConstantInt(C)), @@ -698,78 +696,78 @@ static void computeKnownBitsFromAssume(const Value *V, APInt &KnownZero, m_Value(A))) && Pred == ICmpInst::ICMP_EQ && isValidAssumeForContext(I, Q.CxtI, Q.DT)) { - APInt RHSKnownZero(BitWidth, 0), RHSKnownOne(BitWidth, 0); - computeKnownBits(A, RHSKnownZero, RHSKnownOne, Depth+1, Query(Q, I)); + KnownBits RHSKnown(BitWidth); + computeKnownBits(A, RHSKnown, Depth+1, Query(Q, I)); // For those bits in RHS that are known, we can propagate them inverted // to known bits in V shifted to the right by C. - KnownZero |= RHSKnownOne << C->getZExtValue(); - KnownOne |= RHSKnownZero << C->getZExtValue(); + Known.Zero |= RHSKnown.One << C->getZExtValue(); + Known.One |= RHSKnown.Zero << C->getZExtValue(); // assume(v >=_s c) where c is non-negative } else if (match(Arg, m_ICmp(Pred, m_V, m_Value(A))) && Pred == ICmpInst::ICMP_SGE && isValidAssumeForContext(I, Q.CxtI, Q.DT)) { - APInt RHSKnownZero(BitWidth, 0), RHSKnownOne(BitWidth, 0); - computeKnownBits(A, RHSKnownZero, RHSKnownOne, Depth+1, Query(Q, I)); + KnownBits RHSKnown(BitWidth); + computeKnownBits(A, RHSKnown, Depth+1, Query(Q, I)); - if (RHSKnownZero.isSignBitSet()) { + if (RHSKnown.Zero.isSignBitSet()) { // We know that the sign bit is zero. - KnownZero.setSignBit(); + Known.Zero.setSignBit(); } // assume(v >_s c) where c is at least -1. } else if (match(Arg, m_ICmp(Pred, m_V, m_Value(A))) && Pred == ICmpInst::ICMP_SGT && isValidAssumeForContext(I, Q.CxtI, Q.DT)) { - APInt RHSKnownZero(BitWidth, 0), RHSKnownOne(BitWidth, 0); - computeKnownBits(A, RHSKnownZero, RHSKnownOne, Depth+1, Query(Q, I)); + KnownBits RHSKnown(BitWidth); + computeKnownBits(A, RHSKnown, Depth+1, Query(Q, I)); - if (RHSKnownOne.isAllOnesValue() || RHSKnownZero.isSignBitSet()) { + if (RHSKnown.One.isAllOnesValue() || RHSKnown.Zero.isSignBitSet()) { // We know that the sign bit is zero. 
- KnownZero.setSignBit(); + Known.Zero.setSignBit(); } // assume(v <=_s c) where c is negative } else if (match(Arg, m_ICmp(Pred, m_V, m_Value(A))) && Pred == ICmpInst::ICMP_SLE && isValidAssumeForContext(I, Q.CxtI, Q.DT)) { - APInt RHSKnownZero(BitWidth, 0), RHSKnownOne(BitWidth, 0); - computeKnownBits(A, RHSKnownZero, RHSKnownOne, Depth+1, Query(Q, I)); + KnownBits RHSKnown(BitWidth); + computeKnownBits(A, RHSKnown, Depth+1, Query(Q, I)); - if (RHSKnownOne.isSignBitSet()) { + if (RHSKnown.One.isSignBitSet()) { // We know that the sign bit is one. - KnownOne.setSignBit(); + Known.One.setSignBit(); } // assume(v <_s c) where c is non-positive } else if (match(Arg, m_ICmp(Pred, m_V, m_Value(A))) && Pred == ICmpInst::ICMP_SLT && isValidAssumeForContext(I, Q.CxtI, Q.DT)) { - APInt RHSKnownZero(BitWidth, 0), RHSKnownOne(BitWidth, 0); - computeKnownBits(A, RHSKnownZero, RHSKnownOne, Depth+1, Query(Q, I)); + KnownBits RHSKnown(BitWidth); + computeKnownBits(A, RHSKnown, Depth+1, Query(Q, I)); - if (RHSKnownZero.isAllOnesValue() || RHSKnownOne.isSignBitSet()) { + if (RHSKnown.Zero.isAllOnesValue() || RHSKnown.One.isSignBitSet()) { // We know that the sign bit is one. - KnownOne.setSignBit(); + Known.One.setSignBit(); } // assume(v <=_u c) } else if (match(Arg, m_ICmp(Pred, m_V, m_Value(A))) && Pred == ICmpInst::ICMP_ULE && isValidAssumeForContext(I, Q.CxtI, Q.DT)) { - APInt RHSKnownZero(BitWidth, 0), RHSKnownOne(BitWidth, 0); - computeKnownBits(A, RHSKnownZero, RHSKnownOne, Depth+1, Query(Q, I)); + KnownBits RHSKnown(BitWidth); + computeKnownBits(A, RHSKnown, Depth+1, Query(Q, I)); // Whatever high bits in c are zero are known to be zero. - KnownZero.setHighBits(RHSKnownZero.countLeadingOnes()); + Known.Zero.setHighBits(RHSKnown.Zero.countLeadingOnes()); // assume(v <_u c) } else if (match(Arg, m_ICmp(Pred, m_V, m_Value(A))) && Pred == ICmpInst::ICMP_ULT && isValidAssumeForContext(I, Q.CxtI, Q.DT)) { - APInt RHSKnownZero(BitWidth, 0), RHSKnownOne(BitWidth, 0); - computeKnownBits(A, RHSKnownZero, RHSKnownOne, Depth+1, Query(Q, I)); + KnownBits RHSKnown(BitWidth); + computeKnownBits(A, RHSKnown, Depth+1, Query(Q, I)); // Whatever high bits in c are zero are known to be zero (if c is a power // of 2, then one more). if (isKnownToBeAPowerOfTwo(A, false, Depth + 1, Query(Q, I))) - KnownZero.setHighBits(RHSKnownZero.countLeadingOnes()+1); + Known.Zero.setHighBits(RHSKnown.Zero.countLeadingOnes()+1); else - KnownZero.setHighBits(RHSKnownZero.countLeadingOnes()); + Known.Zero.setHighBits(RHSKnown.Zero.countLeadingOnes()); } } @@ -778,9 +776,9 @@ static void computeKnownBitsFromAssume(const Value *V, APInt &KnownZero, // so this isn't a real bug. On the other hand, the program may have undefined // behavior, or we might have a bug in the compiler. We can't assert/crash, so // clear out the known bits, try to warn the user, and hope for the best. - if ((KnownZero & KnownOne) != 0) { - KnownZero.clearAllBits(); - KnownOne.clearAllBits(); + if (Known.Zero.intersects(Known.One)) { + Known.Zero.clearAllBits(); + Known.One.clearAllBits(); if (Q.ORE) { auto *CxtI = const_cast(Q.CxtI); @@ -793,57 +791,57 @@ static void computeKnownBitsFromAssume(const Value *V, APInt &KnownZero, } // Compute known bits from a shift operator, including those with a -// non-constant shift amount. KnownZero and KnownOne are the outputs of this -// function. KnownZero2 and KnownOne2 are pre-allocated temporaries with the -// same bit width as KnownZero and KnownOne. 
KZF and KOF are operator-specific -// functors that, given the known-zero or known-one bits respectively, and a -// shift amount, compute the implied known-zero or known-one bits of the shift -// operator's result respectively for that shift amount. The results from calling -// KZF and KOF are conservatively combined for all permitted shift amounts. +// non-constant shift amount. Known is the outputs of this function. Known2 is a +// pre-allocated temporary with the/ same bit width as Known. KZF and KOF are +// operator-specific functors that, given the known-zero or known-one bits +// respectively, and a shift amount, compute the implied known-zero or known-one +// bits of the shift operator's result respectively for that shift amount. The +// results from calling KZF and KOF are conservatively combined for all +// permitted shift amounts. static void computeKnownBitsFromShiftOperator( - const Operator *I, APInt &KnownZero, APInt &KnownOne, APInt &KnownZero2, - APInt &KnownOne2, unsigned Depth, const Query &Q, + const Operator *I, KnownBits &Known, KnownBits &Known2, + unsigned Depth, const Query &Q, function_ref KZF, function_ref KOF) { - unsigned BitWidth = KnownZero.getBitWidth(); + unsigned BitWidth = Known.getBitWidth(); if (auto *SA = dyn_cast(I->getOperand(1))) { unsigned ShiftAmt = SA->getLimitedValue(BitWidth-1); - computeKnownBits(I->getOperand(0), KnownZero, KnownOne, Depth + 1, Q); - KnownZero = KZF(KnownZero, ShiftAmt); - KnownOne = KOF(KnownOne, ShiftAmt); - // If there is conflict between KnownZero and KnownOne, this must be an - // overflowing left shift, so the shift result is undefined. Clear KnownZero - // and KnownOne bits so that other code could propagate this undef. - if ((KnownZero & KnownOne) != 0) { - KnownZero.clearAllBits(); - KnownOne.clearAllBits(); + computeKnownBits(I->getOperand(0), Known, Depth + 1, Q); + Known.Zero = KZF(Known.Zero, ShiftAmt); + Known.One = KOF(Known.One, ShiftAmt); + // If there is conflict between Known.Zero and Known.One, this must be an + // overflowing left shift, so the shift result is undefined. Clear Known + // bits so that other code could propagate this undef. + if ((Known.Zero & Known.One) != 0) { + Known.Zero.clearAllBits(); + Known.One.clearAllBits(); } return; } - computeKnownBits(I->getOperand(1), KnownZero, KnownOne, Depth + 1, Q); + computeKnownBits(I->getOperand(1), Known, Depth + 1, Q); // If the shift amount could be greater than or equal to the bit-width of the LHS, the // value could be undef, so we don't know anything about it. - if ((~KnownZero).uge(BitWidth)) { - KnownZero.clearAllBits(); - KnownOne.clearAllBits(); + if ((~Known.Zero).uge(BitWidth)) { + Known.Zero.clearAllBits(); + Known.One.clearAllBits(); return; } - // Note: We cannot use KnownZero.getLimitedValue() here, because if + // Note: We cannot use Known.Zero.getLimitedValue() here, because if // BitWidth > 64 and any upper bits are known, we'll end up returning the // limit value (which implies all bits are known). - uint64_t ShiftAmtKZ = KnownZero.zextOrTrunc(64).getZExtValue(); - uint64_t ShiftAmtKO = KnownOne.zextOrTrunc(64).getZExtValue(); + uint64_t ShiftAmtKZ = Known.Zero.zextOrTrunc(64).getZExtValue(); + uint64_t ShiftAmtKO = Known.One.zextOrTrunc(64).getZExtValue(); // It would be more-clearly correct to use the two temporaries for this // calculation. Reusing the APInts here to prevent unnecessary allocations. 
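computeKnownBitsFromShiftOperator, rewritten above to take KnownBits, still follows the same plan when the shift amount is not a constant: enumerate every shift amount compatible with the amount's known bits and intersect the per-amount results. A small self-contained model of that plan for an 8-bit left shift -- plain integers and invented names, not the LLVM implementation:

    #include <cassert>
    #include <cstdint>

    struct Bits { uint64_t Zero, One; };

    // Conservative known bits of (x << amt) in an 8-bit domain when amt is
    // only partially known: intersect over every amount compatible with amt's
    // known bits, mirroring the loop structure of the function above.
    Bits knownShl8(const Bits &X, const Bits &Amt) {
      const uint64_t Mask = 0xFF;
      Bits Out{~0ull, ~0ull};                 // start "all known", intersect
      for (uint64_t S = 0; S < 8; ++S) {
        if ((S & Amt.Zero) || (~S & Amt.One)) // S contradicts known bits of amt
          continue;
        Bits Sh{0, 0};
        Sh.Zero = ((X.Zero << S) | ((1u << S) - 1)) & Mask; // vacated bits are 0
        Sh.One  = (X.One << S) & Mask;
        Out.Zero &= Sh.Zero;
        Out.One  &= Sh.One;
      }
      if (Out.Zero & Out.One)                 // no compatible amount at all
        Out = Bits{0, 0};
      return Out;
    }

    int main() {
      Bits X{0xFE, 0x01};   // x is exactly 1
      Bits Amt{0xFD, 0x00}; // amt: every bit but bit 1 known zero => amt is 0 or 2
      Bits R = knownShl8(X, Amt);
      assert(R.One == 0);     // result is either 1 or 4: no common one bit
      assert(R.Zero == 0xFA); // but bits 1 and 3..7 are zero either way
      return 0;
    }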
- KnownZero.clearAllBits(); - KnownOne.clearAllBits(); + Known.Zero.clearAllBits(); + Known.One.clearAllBits(); // If we know the shifter operand is nonzero, we can sometimes infer more // known bits. However this is expensive to compute, so be lazy about it and @@ -858,9 +856,10 @@ static void computeKnownBitsFromShiftOperator( return; } - computeKnownBits(I->getOperand(0), KnownZero2, KnownOne2, Depth + 1, Q); + computeKnownBits(I->getOperand(0), Known2, Depth + 1, Q); - KnownZero = KnownOne = APInt::getAllOnesValue(BitWidth); + Known.Zero.setAllBits(); + Known.One.setAllBits(); for (unsigned ShiftAmt = 0; ShiftAmt < BitWidth; ++ShiftAmt) { // Combine the shifted known input bits only for those shift amounts // compatible with its known constraints. @@ -879,8 +878,8 @@ static void computeKnownBitsFromShiftOperator( continue; } - KnownZero &= KZF(KnownZero2, ShiftAmt); - KnownOne &= KOF(KnownOne2, ShiftAmt); + Known.Zero &= KZF(Known2.Zero, ShiftAmt); + Known.One &= KOF(Known2.One, ShiftAmt); } // If there are no compatible shift amounts, then we've proven that the shift @@ -888,33 +887,32 @@ static void computeKnownBitsFromShiftOperator( // return anything we'd like, but we need to make sure the sets of known bits // stay disjoint (it should be better for some other code to actually // propagate the undef than to pick a value here using known bits). - if ((KnownZero & KnownOne) != 0) { - KnownZero.clearAllBits(); - KnownOne.clearAllBits(); + if (Known.Zero.intersects(Known.One)) { + Known.Zero.clearAllBits(); + Known.One.clearAllBits(); } } -static void computeKnownBitsFromOperator(const Operator *I, APInt &KnownZero, - APInt &KnownOne, unsigned Depth, - const Query &Q) { - unsigned BitWidth = KnownZero.getBitWidth(); +static void computeKnownBitsFromOperator(const Operator *I, KnownBits &Known, + unsigned Depth, const Query &Q) { + unsigned BitWidth = Known.getBitWidth(); - APInt KnownZero2(KnownZero), KnownOne2(KnownOne); + KnownBits Known2(Known); switch (I->getOpcode()) { default: break; case Instruction::Load: if (MDNode *MD = cast(I)->getMetadata(LLVMContext::MD_range)) - computeKnownBitsFromRangeMetadata(*MD, KnownZero, KnownOne); + computeKnownBitsFromRangeMetadata(*MD, Known.Zero, Known.One); break; case Instruction::And: { // If either the LHS or the RHS are Zero, the result is zero. - computeKnownBits(I->getOperand(1), KnownZero, KnownOne, Depth + 1, Q); - computeKnownBits(I->getOperand(0), KnownZero2, KnownOne2, Depth + 1, Q); + computeKnownBits(I->getOperand(1), Known, Depth + 1, Q); + computeKnownBits(I->getOperand(0), Known2, Depth + 1, Q); // Output known-1 bits are only known if set in both the LHS & RHS. - KnownOne &= KnownOne2; + Known.One &= Known2.One; // Output known-0 are known to be clear if zero in either the LHS | RHS. - KnownZero |= KnownZero2; + Known.Zero |= Known2.Zero; // and(x, add (x, -1)) is a common idiom that always clears the low bit; // here we handle the more general case of adding any odd number by @@ -922,115 +920,115 @@ static void computeKnownBitsFromOperator(const Operator *I, APInt &KnownZero, // TODO: This could be generalized to clearing any bit set in y where the // following bit is known to be unset in y. 
Value *Y = nullptr; - if (!KnownZero[0] && !KnownOne[0] && + if (!Known.Zero[0] && !Known.One[0] && (match(I->getOperand(0), m_Add(m_Specific(I->getOperand(1)), m_Value(Y))) || match(I->getOperand(1), m_Add(m_Specific(I->getOperand(0)), m_Value(Y))))) { - KnownZero2.clearAllBits(); KnownOne2.clearAllBits(); - computeKnownBits(Y, KnownZero2, KnownOne2, Depth + 1, Q); - if (KnownOne2.countTrailingOnes() > 0) - KnownZero.setBit(0); + Known2.Zero.clearAllBits(); Known2.One.clearAllBits(); + computeKnownBits(Y, Known2, Depth + 1, Q); + if (Known2.One.countTrailingOnes() > 0) + Known.Zero.setBit(0); } break; } case Instruction::Or: { - computeKnownBits(I->getOperand(1), KnownZero, KnownOne, Depth + 1, Q); - computeKnownBits(I->getOperand(0), KnownZero2, KnownOne2, Depth + 1, Q); + computeKnownBits(I->getOperand(1), Known, Depth + 1, Q); + computeKnownBits(I->getOperand(0), Known2, Depth + 1, Q); // Output known-0 bits are only known if clear in both the LHS & RHS. - KnownZero &= KnownZero2; + Known.Zero &= Known2.Zero; // Output known-1 are known to be set if set in either the LHS | RHS. - KnownOne |= KnownOne2; + Known.One |= Known2.One; break; } case Instruction::Xor: { - computeKnownBits(I->getOperand(1), KnownZero, KnownOne, Depth + 1, Q); - computeKnownBits(I->getOperand(0), KnownZero2, KnownOne2, Depth + 1, Q); + computeKnownBits(I->getOperand(1), Known, Depth + 1, Q); + computeKnownBits(I->getOperand(0), Known2, Depth + 1, Q); // Output known-0 bits are known if clear or set in both the LHS & RHS. - APInt KnownZeroOut = (KnownZero & KnownZero2) | (KnownOne & KnownOne2); + APInt KnownZeroOut = (Known.Zero & Known2.Zero) | (Known.One & Known2.One); // Output known-1 are known to be set if set in only one of the LHS, RHS. - KnownOne = (KnownZero & KnownOne2) | (KnownOne & KnownZero2); - KnownZero = std::move(KnownZeroOut); + Known.One = (Known.Zero & Known2.One) | (Known.One & Known2.Zero); + Known.Zero = std::move(KnownZeroOut); break; } case Instruction::Mul: { bool NSW = cast(I)->hasNoSignedWrap(); - computeKnownBitsMul(I->getOperand(0), I->getOperand(1), NSW, KnownZero, - KnownOne, KnownZero2, KnownOne2, Depth, Q); + computeKnownBitsMul(I->getOperand(0), I->getOperand(1), NSW, Known, + Known2, Depth, Q); break; } case Instruction::UDiv: { // For the purposes of computing leading zeros we can conservatively // treat a udiv as a logical right shift by the power of 2 known to // be less than the denominator. 
- computeKnownBits(I->getOperand(0), KnownZero2, KnownOne2, Depth + 1, Q); - unsigned LeadZ = KnownZero2.countLeadingOnes(); + computeKnownBits(I->getOperand(0), Known2, Depth + 1, Q); + unsigned LeadZ = Known2.Zero.countLeadingOnes(); - KnownOne2.clearAllBits(); - KnownZero2.clearAllBits(); - computeKnownBits(I->getOperand(1), KnownZero2, KnownOne2, Depth + 1, Q); - unsigned RHSUnknownLeadingOnes = KnownOne2.countLeadingZeros(); + Known2.One.clearAllBits(); + Known2.Zero.clearAllBits(); + computeKnownBits(I->getOperand(1), Known2, Depth + 1, Q); + unsigned RHSUnknownLeadingOnes = Known2.One.countLeadingZeros(); if (RHSUnknownLeadingOnes != BitWidth) LeadZ = std::min(BitWidth, LeadZ + BitWidth - RHSUnknownLeadingOnes - 1); - KnownZero.setHighBits(LeadZ); + Known.Zero.setHighBits(LeadZ); break; } case Instruction::Select: { const Value *LHS, *RHS; SelectPatternFlavor SPF = matchSelectPattern(I, LHS, RHS).Flavor; if (SelectPatternResult::isMinOrMax(SPF)) { - computeKnownBits(RHS, KnownZero, KnownOne, Depth + 1, Q); - computeKnownBits(LHS, KnownZero2, KnownOne2, Depth + 1, Q); + computeKnownBits(RHS, Known, Depth + 1, Q); + computeKnownBits(LHS, Known2, Depth + 1, Q); } else { - computeKnownBits(I->getOperand(2), KnownZero, KnownOne, Depth + 1, Q); - computeKnownBits(I->getOperand(1), KnownZero2, KnownOne2, Depth + 1, Q); + computeKnownBits(I->getOperand(2), Known, Depth + 1, Q); + computeKnownBits(I->getOperand(1), Known2, Depth + 1, Q); } unsigned MaxHighOnes = 0; unsigned MaxHighZeros = 0; if (SPF == SPF_SMAX) { // If both sides are negative, the result is negative. - if (KnownOne.isSignBitSet() && KnownOne2.isSignBitSet()) + if (Known.One.isSignBitSet() && Known2.One.isSignBitSet()) // We can derive a lower bound on the result by taking the max of the // leading one bits. - MaxHighOnes = - std::max(KnownOne.countLeadingOnes(), KnownOne2.countLeadingOnes()); + MaxHighOnes = std::max(Known.One.countLeadingOnes(), + Known2.One.countLeadingOnes()); // If either side is non-negative, the result is non-negative. - else if (KnownZero.isSignBitSet() || KnownZero2.isSignBitSet()) + else if (Known.Zero.isSignBitSet() || Known2.Zero.isSignBitSet()) MaxHighZeros = 1; } else if (SPF == SPF_SMIN) { // If both sides are non-negative, the result is non-negative. - if (KnownZero.isSignBitSet() && KnownZero2.isSignBitSet()) + if (Known.Zero.isSignBitSet() && Known2.Zero.isSignBitSet()) // We can derive an upper bound on the result by taking the max of the // leading zero bits. - MaxHighZeros = std::max(KnownZero.countLeadingOnes(), - KnownZero2.countLeadingOnes()); + MaxHighZeros = std::max(Known.Zero.countLeadingOnes(), + Known2.Zero.countLeadingOnes()); // If either side is negative, the result is negative. - else if (KnownOne.isSignBitSet() || KnownOne2.isSignBitSet()) + else if (Known.One.isSignBitSet() || Known2.One.isSignBitSet()) MaxHighOnes = 1; } else if (SPF == SPF_UMAX) { // We can derive a lower bound on the result by taking the max of the // leading one bits. MaxHighOnes = - std::max(KnownOne.countLeadingOnes(), KnownOne2.countLeadingOnes()); + std::max(Known.One.countLeadingOnes(), Known2.One.countLeadingOnes()); } else if (SPF == SPF_UMIN) { // We can derive an upper bound on the result by taking the max of the // leading zero bits. MaxHighZeros = - std::max(KnownZero.countLeadingOnes(), KnownZero2.countLeadingOnes()); + std::max(Known.Zero.countLeadingOnes(), Known2.Zero.countLeadingOnes()); } // Only known if known in both the LHS and RHS. 
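The Select hunks above bound recognized min/max patterns by the leading known bits of the two arms: umin inherits the larger known leading-zero count, umax the larger known leading-one count, and the smax/smin cases reason the same way about the sign bit. A quick numeric spot check of the umin bound in plain C++ rather than APInt (__builtin_clz assumes GCC/Clang):

    #include <algorithm>
    #include <cassert>
    #include <cstdint>

    static unsigned clz32(uint32_t V) { return V ? __builtin_clz(V) : 32; }

    int main() {
      // a is known to fit in 20 bits, b in 12 bits; umin(a, b) <= b, so the
      // result has at least max(32 - 20, 32 - 12) = 20 leading zeros.
      const unsigned Bound = std::max(32u - 20u, 32u - 12u);
      for (uint32_t A = 0; A < (1u << 20); A += 99991)
        for (uint32_t B = 0; B < (1u << 12); B += 911)
          assert(clz32(std::min(A, B)) >= Bound);
      return 0;
    }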
- KnownOne &= KnownOne2; - KnownZero &= KnownZero2; + Known.One &= Known2.One; + Known.Zero &= Known2.Zero; if (MaxHighOnes > 0) - KnownOne.setHighBits(MaxHighOnes); + Known.One.setHighBits(MaxHighOnes); if (MaxHighZeros > 0) - KnownZero.setHighBits(MaxHighZeros); + Known.Zero.setHighBits(MaxHighZeros); break; } case Instruction::FPTrunc: @@ -1054,14 +1052,14 @@ static void computeKnownBitsFromOperator(const Operator *I, APInt &KnownZero, SrcBitWidth = Q.DL.getTypeSizeInBits(SrcTy->getScalarType()); assert(SrcBitWidth && "SrcBitWidth can't be zero"); - KnownZero = KnownZero.zextOrTrunc(SrcBitWidth); - KnownOne = KnownOne.zextOrTrunc(SrcBitWidth); - computeKnownBits(I->getOperand(0), KnownZero, KnownOne, Depth + 1, Q); - KnownZero = KnownZero.zextOrTrunc(BitWidth); - KnownOne = KnownOne.zextOrTrunc(BitWidth); + Known.Zero = Known.Zero.zextOrTrunc(SrcBitWidth); + Known.One = Known.One.zextOrTrunc(SrcBitWidth); + computeKnownBits(I->getOperand(0), Known, Depth + 1, Q); + Known.Zero = Known.Zero.zextOrTrunc(BitWidth); + Known.One = Known.One.zextOrTrunc(BitWidth); // Any top bits are known to be zero. if (BitWidth > SrcBitWidth) - KnownZero.setBitsFrom(SrcBitWidth); + Known.Zero.setBitsFrom(SrcBitWidth); break; } case Instruction::BitCast: { @@ -1070,7 +1068,7 @@ static void computeKnownBitsFromOperator(const Operator *I, APInt &KnownZero, // TODO: For now, not handling conversions like: // (bitcast i64 %x to <2 x i32>) !I->getType()->isVectorTy()) { - computeKnownBits(I->getOperand(0), KnownZero, KnownOne, Depth + 1, Q); + computeKnownBits(I->getOperand(0), Known, Depth + 1, Q); break; } break; @@ -1079,13 +1077,13 @@ static void computeKnownBitsFromOperator(const Operator *I, APInt &KnownZero, // Compute the bits in the result that are not present in the input. unsigned SrcBitWidth = I->getOperand(0)->getType()->getScalarSizeInBits(); - KnownZero = KnownZero.trunc(SrcBitWidth); - KnownOne = KnownOne.trunc(SrcBitWidth); - computeKnownBits(I->getOperand(0), KnownZero, KnownOne, Depth + 1, Q); + Known.Zero = Known.Zero.trunc(SrcBitWidth); + Known.One = Known.One.trunc(SrcBitWidth); + computeKnownBits(I->getOperand(0), Known, Depth + 1, Q); // If the sign bit of the input is known set or clear, then we know the // top bits of the result. 
- KnownZero = KnownZero.sext(BitWidth); - KnownOne = KnownOne.sext(BitWidth); + Known.Zero = Known.Zero.sext(BitWidth); + Known.One = Known.One.sext(BitWidth); break; } case Instruction::Shl: { @@ -1108,9 +1106,7 @@ static void computeKnownBitsFromOperator(const Operator *I, APInt &KnownZero, return KOResult; }; - computeKnownBitsFromShiftOperator(I, KnownZero, KnownOne, - KnownZero2, KnownOne2, Depth, Q, KZF, - KOF); + computeKnownBitsFromShiftOperator(I, Known, Known2, Depth, Q, KZF, KOF); break; } case Instruction::LShr: { @@ -1126,9 +1122,7 @@ static void computeKnownBitsFromOperator(const Operator *I, APInt &KnownZero, return KnownOne.lshr(ShiftAmt); }; - computeKnownBitsFromShiftOperator(I, KnownZero, KnownOne, - KnownZero2, KnownOne2, Depth, Q, KZF, - KOF); + computeKnownBitsFromShiftOperator(I, Known, Known2, Depth, Q, KZF, KOF); break; } case Instruction::AShr: { @@ -1141,23 +1135,19 @@ static void computeKnownBitsFromOperator(const Operator *I, APInt &KnownZero, return KnownOne.ashr(ShiftAmt); }; - computeKnownBitsFromShiftOperator(I, KnownZero, KnownOne, - KnownZero2, KnownOne2, Depth, Q, KZF, - KOF); + computeKnownBitsFromShiftOperator(I, Known, Known2, Depth, Q, KZF, KOF); break; } case Instruction::Sub: { bool NSW = cast(I)->hasNoSignedWrap(); computeKnownBitsAddSub(false, I->getOperand(0), I->getOperand(1), NSW, - KnownZero, KnownOne, KnownZero2, KnownOne2, Depth, - Q); + Known, Known2, Depth, Q); break; } case Instruction::Add: { bool NSW = cast(I)->hasNoSignedWrap(); computeKnownBitsAddSub(true, I->getOperand(0), I->getOperand(1), NSW, - KnownZero, KnownOne, KnownZero2, KnownOne2, Depth, - Q); + Known, Known2, Depth, Q); break; } case Instruction::SRem: @@ -1165,34 +1155,33 @@ static void computeKnownBitsFromOperator(const Operator *I, APInt &KnownZero, APInt RA = Rem->getValue().abs(); if (RA.isPowerOf2()) { APInt LowBits = RA - 1; - computeKnownBits(I->getOperand(0), KnownZero2, KnownOne2, Depth + 1, - Q); + computeKnownBits(I->getOperand(0), Known2, Depth + 1, Q); // The low bits of the first operand are unchanged by the srem. - KnownZero = KnownZero2 & LowBits; - KnownOne = KnownOne2 & LowBits; + Known.Zero = Known2.Zero & LowBits; + Known.One = Known2.One & LowBits; // If the first operand is non-negative or has all low bits zero, then // the upper bits are all zero. - if (KnownZero2.isSignBitSet() || ((KnownZero2 & LowBits) == LowBits)) - KnownZero |= ~LowBits; + if (Known2.Zero.isSignBitSet() || ((Known2.Zero & LowBits) == LowBits)) + Known.Zero |= ~LowBits; // If the first operand is negative and not all low bits are zero, then // the upper bits are all one. - if (KnownOne2.isSignBitSet() && ((KnownOne2 & LowBits) != 0)) - KnownOne |= ~LowBits; + if (Known2.One.isSignBitSet() && ((Known2.One & LowBits) != 0)) + Known.One |= ~LowBits; - assert((KnownZero & KnownOne) == 0 && "Bits known to be one AND zero?"); + assert((Known.Zero & Known.One) == 0 && "Bits known to be one AND zero?"); break; } } // The sign bit is the LHS's sign bit, except when the result of the // remainder is zero. - computeKnownBits(I->getOperand(0), KnownZero2, KnownOne2, Depth + 1, Q); + computeKnownBits(I->getOperand(0), Known2, Depth + 1, Q); // If it's known zero, our sign bit is also zero. 
- if (KnownZero2.isSignBitSet()) - KnownZero.setSignBit(); + if (Known2.Zero.isSignBitSet()) + Known.Zero.setSignBit(); break; case Instruction::URem: { @@ -1200,23 +1189,23 @@ static void computeKnownBitsFromOperator(const Operator *I, APInt &KnownZero, const APInt &RA = Rem->getValue(); if (RA.isPowerOf2()) { APInt LowBits = (RA - 1); - computeKnownBits(I->getOperand(0), KnownZero, KnownOne, Depth + 1, Q); - KnownZero |= ~LowBits; - KnownOne &= LowBits; + computeKnownBits(I->getOperand(0), Known, Depth + 1, Q); + Known.Zero |= ~LowBits; + Known.One &= LowBits; break; } } // Since the result is less than or equal to either operand, any leading // zero bits in either operand must also exist in the result. - computeKnownBits(I->getOperand(0), KnownZero, KnownOne, Depth + 1, Q); - computeKnownBits(I->getOperand(1), KnownZero2, KnownOne2, Depth + 1, Q); - - unsigned Leaders = std::max(KnownZero.countLeadingOnes(), - KnownZero2.countLeadingOnes()); - KnownOne.clearAllBits(); - KnownZero.clearAllBits(); - KnownZero.setHighBits(Leaders); + computeKnownBits(I->getOperand(0), Known, Depth + 1, Q); + computeKnownBits(I->getOperand(1), Known2, Depth + 1, Q); + + unsigned Leaders = std::max(Known.Zero.countLeadingOnes(), + Known2.Zero.countLeadingOnes()); + Known.One.clearAllBits(); + Known.Zero.clearAllBits(); + Known.Zero.setHighBits(Leaders); break; } @@ -1227,16 +1216,15 @@ static void computeKnownBitsFromOperator(const Operator *I, APInt &KnownZero, Align = Q.DL.getABITypeAlignment(AI->getAllocatedType()); if (Align > 0) - KnownZero.setLowBits(countTrailingZeros(Align)); + Known.Zero.setLowBits(countTrailingZeros(Align)); break; } case Instruction::GetElementPtr: { // Analyze all of the subscripts of this getelementptr instruction // to determine if we can prove known low zero bits. - APInt LocalKnownZero(BitWidth, 0), LocalKnownOne(BitWidth, 0); - computeKnownBits(I->getOperand(0), LocalKnownZero, LocalKnownOne, Depth + 1, - Q); - unsigned TrailZ = LocalKnownZero.countTrailingOnes(); + KnownBits LocalKnown(BitWidth); + computeKnownBits(I->getOperand(0), LocalKnown, Depth + 1, Q); + unsigned TrailZ = LocalKnown.Zero.countTrailingOnes(); gep_type_iterator GTI = gep_type_begin(I); for (unsigned i = 1, e = I->getNumOperands(); i != e; ++i, ++GTI) { @@ -1266,15 +1254,15 @@ static void computeKnownBitsFromOperator(const Operator *I, APInt &KnownZero, } unsigned GEPOpiBits = Index->getType()->getScalarSizeInBits(); uint64_t TypeSize = Q.DL.getTypeAllocSize(IndexedTy); - LocalKnownZero = LocalKnownOne = APInt(GEPOpiBits, 0); - computeKnownBits(Index, LocalKnownZero, LocalKnownOne, Depth + 1, Q); + LocalKnown.Zero = LocalKnown.One = APInt(GEPOpiBits, 0); + computeKnownBits(Index, LocalKnown, Depth + 1, Q); TrailZ = std::min(TrailZ, unsigned(countTrailingZeros(TypeSize) + - LocalKnownZero.countTrailingOnes())); + LocalKnown.Zero.countTrailingOnes())); } } - KnownZero.setLowBits(TrailZ); + Known.Zero.setLowBits(TrailZ); break; } case Instruction::PHI: { @@ -1309,14 +1297,14 @@ static void computeKnownBitsFromOperator(const Operator *I, APInt &KnownZero, break; // Ok, we have a PHI of the form L op= R. Check for low // zero bits. 
- computeKnownBits(R, KnownZero2, KnownOne2, Depth + 1, Q); + computeKnownBits(R, Known2, Depth + 1, Q); // We need to take the minimum number of known bits - APInt KnownZero3(KnownZero), KnownOne3(KnownOne); - computeKnownBits(L, KnownZero3, KnownOne3, Depth + 1, Q); + KnownBits Known3(Known); + computeKnownBits(L, Known3, Depth + 1, Q); - KnownZero.setLowBits(std::min(KnownZero2.countTrailingOnes(), - KnownZero3.countTrailingOnes())); + Known.Zero.setLowBits(std::min(Known2.Zero.countTrailingOnes(), + Known3.Zero.countTrailingOnes())); if (DontImproveNonNegativePhiBits) break; @@ -1333,25 +1321,25 @@ static void computeKnownBitsFromOperator(const Operator *I, APInt &KnownZero, // (add non-negative, non-negative) --> non-negative // (add negative, negative) --> negative if (Opcode == Instruction::Add) { - if (KnownZero2.isSignBitSet() && KnownZero3.isSignBitSet()) - KnownZero.setSignBit(); - else if (KnownOne2.isSignBitSet() && KnownOne3.isSignBitSet()) - KnownOne.setSignBit(); + if (Known2.Zero.isSignBitSet() && Known3.Zero.isSignBitSet()) + Known.Zero.setSignBit(); + else if (Known2.One.isSignBitSet() && Known3.One.isSignBitSet()) + Known.One.setSignBit(); } // (sub nsw non-negative, negative) --> non-negative // (sub nsw negative, non-negative) --> negative else if (Opcode == Instruction::Sub && LL == I) { - if (KnownZero2.isSignBitSet() && KnownOne3.isSignBitSet()) - KnownZero.setSignBit(); - else if (KnownOne2.isSignBitSet() && KnownZero3.isSignBitSet()) - KnownOne.setSignBit(); + if (Known2.Zero.isSignBitSet() && Known3.One.isSignBitSet()) + Known.Zero.setSignBit(); + else if (Known2.One.isSignBitSet() && Known3.Zero.isSignBitSet()) + Known.One.setSignBit(); } // (mul nsw non-negative, non-negative) --> non-negative - else if (Opcode == Instruction::Mul && KnownZero2.isSignBitSet() && - KnownZero3.isSignBitSet()) - KnownZero.setSignBit(); + else if (Opcode == Instruction::Mul && Known2.Zero.isSignBitSet() && + Known3.Zero.isSignBitSet()) + Known.Zero.setSignBit(); } break; @@ -1365,27 +1353,26 @@ static void computeKnownBitsFromOperator(const Operator *I, APInt &KnownZero, // Otherwise take the unions of the known bit sets of the operands, // taking conservative care to avoid excessive recursion. - if (Depth < MaxDepth - 1 && !KnownZero && !KnownOne) { + if (Depth < MaxDepth - 1 && !Known.Zero && !Known.One) { // Skip if every incoming value references to ourself. if (dyn_cast_or_null(P->hasConstantValue())) break; - KnownZero.setAllBits(); - KnownOne.setAllBits(); + Known.Zero.setAllBits(); + Known.One.setAllBits(); for (Value *IncValue : P->incoming_values()) { // Skip direct self references. if (IncValue == P) continue; - KnownZero2 = APInt(BitWidth, 0); - KnownOne2 = APInt(BitWidth, 0); + Known2 = KnownBits(BitWidth); // Recurse, but cap the recursion to one level, because we don't // want to waste time spinning around in loops. - computeKnownBits(IncValue, KnownZero2, KnownOne2, MaxDepth - 1, Q); - KnownZero &= KnownZero2; - KnownOne &= KnownOne2; + computeKnownBits(IncValue, Known2, MaxDepth - 1, Q); + Known.Zero &= Known2.Zero; + Known.One &= Known2.One; // If all bits have been ruled out, there's no need to check // more operands. - if (!KnownZero && !KnownOne) + if (!Known.Zero && !Known.One) break; } } @@ -1397,24 +1384,24 @@ static void computeKnownBitsFromOperator(const Operator *I, APInt &KnownZero, // and then intersect with known bits based on other properties of the // function. 
if (MDNode *MD = cast(I)->getMetadata(LLVMContext::MD_range)) - computeKnownBitsFromRangeMetadata(*MD, KnownZero, KnownOne); + computeKnownBitsFromRangeMetadata(*MD, Known.Zero, Known.One); if (const Value *RV = ImmutableCallSite(I).getReturnedArgOperand()) { - computeKnownBits(RV, KnownZero2, KnownOne2, Depth + 1, Q); - KnownZero |= KnownZero2; - KnownOne |= KnownOne2; + computeKnownBits(RV, Known2, Depth + 1, Q); + Known.Zero |= Known2.Zero; + Known.One |= Known2.One; } if (const IntrinsicInst *II = dyn_cast(I)) { switch (II->getIntrinsicID()) { default: break; case Intrinsic::bitreverse: - computeKnownBits(I->getOperand(0), KnownZero2, KnownOne2, Depth + 1, Q); - KnownZero |= KnownZero2.reverseBits(); - KnownOne |= KnownOne2.reverseBits(); + computeKnownBits(I->getOperand(0), Known2, Depth + 1, Q); + Known.Zero |= Known2.Zero.reverseBits(); + Known.One |= Known2.One.reverseBits(); break; case Intrinsic::bswap: - computeKnownBits(I->getOperand(0), KnownZero2, KnownOne2, Depth + 1, Q); - KnownZero |= KnownZero2.byteSwap(); - KnownOne |= KnownOne2.byteSwap(); + computeKnownBits(I->getOperand(0), Known2, Depth + 1, Q); + Known.Zero |= Known2.Zero.byteSwap(); + Known.One |= Known2.One.byteSwap(); break; case Intrinsic::ctlz: case Intrinsic::cttz: { @@ -1422,22 +1409,22 @@ static void computeKnownBitsFromOperator(const Operator *I, APInt &KnownZero, // If this call is undefined for 0, the result will be less than 2^n. if (II->getArgOperand(1) == ConstantInt::getTrue(II->getContext())) LowBits -= 1; - KnownZero.setBitsFrom(LowBits); + Known.Zero.setBitsFrom(LowBits); break; } case Intrinsic::ctpop: { - computeKnownBits(I->getOperand(0), KnownZero2, KnownOne2, Depth + 1, Q); + computeKnownBits(I->getOperand(0), Known2, Depth + 1, Q); // We can bound the space the count needs. Also, bits known to be zero // can't contribute to the population. - unsigned BitsPossiblySet = BitWidth - KnownZero2.countPopulation(); + unsigned BitsPossiblySet = BitWidth - Known2.Zero.countPopulation(); unsigned LowBits = Log2_32(BitsPossiblySet)+1; - KnownZero.setBitsFrom(LowBits); + Known.Zero.setBitsFrom(LowBits); // TODO: we could bound KnownOne using the lower bound on the number // of bits which might be set provided by popcnt KnownOne2. break; } case Intrinsic::x86_sse42_crc32_64_64: - KnownZero.setBitsFrom(32); + Known.Zero.setBitsFrom(32); break; } } @@ -1447,7 +1434,7 @@ static void computeKnownBitsFromOperator(const Operator *I, APInt &KnownZero, // tracking the specific element. But at least we might find information // valid for all elements of the vector (for example if vector is sign // extended, shifted, etc). 
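For ctpop, the hunk above bounds the result: if only K operand bits can possibly be set, the population count is at most K, so it fits in floor(log2(K)) + 1 bits and every higher result bit is known zero. A quick check with a concrete mask (plain C++; the __builtin_* intrinsics assume GCC/Clang):

    #include <cassert>
    #include <cstdint>

    int main() {
      const uint32_t PossiblySet = 0x00F0F0FF;               // 16 bits may be set
      const unsigned K = __builtin_popcount(PossiblySet);    // 16
      const unsigned LowBits = (31 - __builtin_clz(K)) + 1;  // 5 bits suffice
      for (uint32_t X = 0; X < (1u << 20); X += 13)
        assert((unsigned)__builtin_popcount(X & PossiblySet) < (1u << LowBits));
      return 0;
    }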
- computeKnownBits(I->getOperand(0), KnownZero, KnownOne, Depth + 1, Q); + computeKnownBits(I->getOperand(0), Known, Depth + 1, Q); break; case Instruction::ExtractValue: if (IntrinsicInst *II = dyn_cast(I->getOperand(0))) { @@ -1459,20 +1446,19 @@ static void computeKnownBitsFromOperator(const Operator *I, APInt &KnownZero, case Intrinsic::uadd_with_overflow: case Intrinsic::sadd_with_overflow: computeKnownBitsAddSub(true, II->getArgOperand(0), - II->getArgOperand(1), false, KnownZero, - KnownOne, KnownZero2, KnownOne2, Depth, Q); + II->getArgOperand(1), false, Known, Known2, + Depth, Q); break; case Intrinsic::usub_with_overflow: case Intrinsic::ssub_with_overflow: computeKnownBitsAddSub(false, II->getArgOperand(0), - II->getArgOperand(1), false, KnownZero, - KnownOne, KnownZero2, KnownOne2, Depth, Q); + II->getArgOperand(1), false, Known, Known2, + Depth, Q); break; case Intrinsic::umul_with_overflow: case Intrinsic::smul_with_overflow: computeKnownBitsMul(II->getArgOperand(0), II->getArgOperand(1), false, - KnownZero, KnownOne, KnownZero2, KnownOne2, Depth, - Q); + Known, Known2, Depth, Q); break; } } @@ -1481,7 +1467,7 @@ static void computeKnownBitsFromOperator(const Operator *I, APInt &KnownZero, } /// Determine which bits of V are known to be either zero or one and return -/// them in the KnownZero/KnownOne bit sets. +/// them in the Known bit set. /// /// NOTE: we cannot consider 'undef' to be "IsZero" here. The problem is that /// we cannot optimize based on the assumption that it is zero without changing @@ -1495,11 +1481,11 @@ static void computeKnownBitsFromOperator(const Operator *I, APInt &KnownZero, /// where V is a vector, known zero, and known one values are the /// same width as the vector element, and the bit is set only if it is true /// for all of the elements in the vector. -void computeKnownBits(const Value *V, APInt &KnownZero, APInt &KnownOne, - unsigned Depth, const Query &Q) { +void computeKnownBits(const Value *V, KnownBits &Known, unsigned Depth, + const Query &Q) { assert(V && "No Value?"); assert(Depth <= MaxDepth && "Limit Search Depth"); - unsigned BitWidth = KnownZero.getBitWidth(); + unsigned BitWidth = Known.getBitWidth(); assert((V->getType()->isIntOrIntVectorTy() || V->getType()->getScalarType()->isPointerTy()) && @@ -1507,22 +1493,20 @@ void computeKnownBits(const Value *V, APInt &KnownZero, APInt &KnownOne, assert((Q.DL.getTypeSizeInBits(V->getType()->getScalarType()) == BitWidth) && (!V->getType()->isIntOrIntVectorTy() || V->getType()->getScalarSizeInBits() == BitWidth) && - KnownZero.getBitWidth() == BitWidth && - KnownOne.getBitWidth() == BitWidth && - "V, KnownOne and KnownZero should have same BitWidth"); + "V and Known should have same BitWidth"); (void)BitWidth; const APInt *C; if (match(V, m_APInt(C))) { // We know all of the bits for a scalar constant or a splat vector constant! - KnownOne = *C; - KnownZero = ~KnownOne; + Known.One = *C; + Known.Zero = ~Known.One; return; } // Null and aggregate-zero are all-zeros. if (isa(V) || isa(V)) { - KnownOne.clearAllBits(); - KnownZero.setAllBits(); + Known.One.clearAllBits(); + Known.Zero.setAllBits(); return; } // Handle a constant vector by taking the intersection of the known bits of @@ -1530,12 +1514,12 @@ void computeKnownBits(const Value *V, APInt &KnownZero, APInt &KnownOne, if (const ConstantDataSequential *CDS = dyn_cast(V)) { // We know that CDS must be a vector of integers. Take the intersection of // each element. 
- KnownZero.setAllBits(); KnownOne.setAllBits(); - APInt Elt(KnownZero.getBitWidth(), 0); + Known.Zero.setAllBits(); Known.One.setAllBits(); + APInt Elt(BitWidth, 0); for (unsigned i = 0, e = CDS->getNumElements(); i != e; ++i) { Elt = CDS->getElementAsInteger(i); - KnownZero &= ~Elt; - KnownOne &= Elt; + Known.Zero &= ~Elt; + Known.One &= Elt; } return; } @@ -1543,25 +1527,25 @@ void computeKnownBits(const Value *V, APInt &KnownZero, APInt &KnownOne, if (const auto *CV = dyn_cast(V)) { // We know that CV must be a vector of integers. Take the intersection of // each element. - KnownZero.setAllBits(); KnownOne.setAllBits(); - APInt Elt(KnownZero.getBitWidth(), 0); + Known.Zero.setAllBits(); Known.One.setAllBits(); + APInt Elt(BitWidth, 0); for (unsigned i = 0, e = CV->getNumOperands(); i != e; ++i) { Constant *Element = CV->getAggregateElement(i); auto *ElementCI = dyn_cast_or_null(Element); if (!ElementCI) { - KnownZero.clearAllBits(); - KnownOne.clearAllBits(); + Known.Zero.clearAllBits(); + Known.One.clearAllBits(); return; } Elt = ElementCI->getValue(); - KnownZero &= ~Elt; - KnownOne &= Elt; + Known.Zero &= ~Elt; + Known.One &= Elt; } return; } // Start out not knowing anything. - KnownZero.clearAllBits(); KnownOne.clearAllBits(); + Known.Zero.clearAllBits(); Known.One.clearAllBits(); // We can't imply anything about undefs. if (isa(V)) @@ -1580,27 +1564,27 @@ void computeKnownBits(const Value *V, APInt &KnownZero, APInt &KnownOne, // the bits of its aliasee. if (const GlobalAlias *GA = dyn_cast(V)) { if (!GA->isInterposable()) - computeKnownBits(GA->getAliasee(), KnownZero, KnownOne, Depth + 1, Q); + computeKnownBits(GA->getAliasee(), Known, Depth + 1, Q); return; } if (const Operator *I = dyn_cast(V)) - computeKnownBitsFromOperator(I, KnownZero, KnownOne, Depth, Q); + computeKnownBitsFromOperator(I, Known, Depth, Q); - // Aligned pointers have trailing zeros - refine KnownZero set + // Aligned pointers have trailing zeros - refine Known.Zero set if (V->getType()->isPointerTy()) { unsigned Align = V->getPointerAlignment(Q.DL); if (Align) - KnownZero.setLowBits(countTrailingZeros(Align)); + Known.Zero.setLowBits(countTrailingZeros(Align)); } - // computeKnownBitsFromAssume strictly refines KnownZero and - // KnownOne. Therefore, we run them after computeKnownBitsFromOperator. + // computeKnownBitsFromAssume strictly refines Known. + // Therefore, we run them after computeKnownBitsFromOperator. // Check whether a nearby assume intrinsic can determine some known bits. - computeKnownBitsFromAssume(V, KnownZero, KnownOne, Depth, Q); + computeKnownBitsFromAssume(V, Known, Depth, Q); - assert((KnownZero & KnownOne) == 0 && "Bits known to be one AND zero?"); + assert((Known.Zero & Known.One) == 0 && "Bits known to be one AND zero?"); } /// Determine whether the sign bit is known to be zero or one. 
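In the rewritten computeKnownBits entry point above, a constant vector contributes the intersection of its elements' bits: a bit is known one (zero) only if it is one (zero) in every element. A minimal model of that intersection loop, with made-up types rather than LLVM's ConstantDataSequential handling:

    #include <cassert>
    #include <cstdint>

    struct Bits { uint64_t Zero, One; };

    Bits knownBitsOfElements(const uint64_t *Elts, unsigned N) {
      Bits K{~0ull, ~0ull};              // start all-known, then intersect
      for (unsigned I = 0; I < N; ++I) {
        K.Zero &= ~Elts[I];
        K.One  &=  Elts[I];
      }
      return K;
    }

    int main() {
      // Low two bits are 0b10 in every element and all elements are < 0x80.
      const uint64_t Elts[] = {0x12, 0x32, 0x46};
      Bits K = knownBitsOfElements(Elts, 3);
      assert((K.One & 0xFF) == 0x02);    // bit 1 set in all three
      assert(K.Zero & 0x01);             // bit 0 clear in all three
      assert((K.Zero >> 7) & 1);         // bit 7 clear in all three
      return 0;
    }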
@@ -1613,11 +1597,10 @@ void ComputeSignBit(const Value *V, bool &KnownZero, bool &KnownOne, KnownOne = false; return; } - APInt ZeroBits(BitWidth, 0); - APInt OneBits(BitWidth, 0); - computeKnownBits(V, ZeroBits, OneBits, Depth, Q); - KnownOne = OneBits.isSignBitSet(); - KnownZero = ZeroBits.isSignBitSet(); + KnownBits Bits(BitWidth); + computeKnownBits(V, Bits, Depth, Q); + KnownOne = Bits.One.isSignBitSet(); + KnownZero = Bits.Zero.isSignBitSet(); } /// Return true if the given value is known to have exactly one @@ -1689,18 +1672,18 @@ bool isKnownToBeAPowerOfTwo(const Value *V, bool OrZero, unsigned Depth, return true; unsigned BitWidth = V->getType()->getScalarSizeInBits(); - APInt LHSZeroBits(BitWidth, 0), LHSOneBits(BitWidth, 0); - computeKnownBits(X, LHSZeroBits, LHSOneBits, Depth, Q); + KnownBits LHSBits(BitWidth); + computeKnownBits(X, LHSBits, Depth, Q); - APInt RHSZeroBits(BitWidth, 0), RHSOneBits(BitWidth, 0); - computeKnownBits(Y, RHSZeroBits, RHSOneBits, Depth, Q); + KnownBits RHSBits(BitWidth); + computeKnownBits(Y, RHSBits, Depth, Q); // If i8 V is a power of two or zero: // ZeroBits: 1 1 1 0 1 1 1 1 // ~ZeroBits: 0 0 0 1 0 0 0 0 - if ((~(LHSZeroBits & RHSZeroBits)).isPowerOf2()) + if ((~(LHSBits.Zero & RHSBits.Zero)).isPowerOf2()) // If OrZero isn't set, we cannot give back a zero result. // Make sure either the LHS or RHS has a bit set. - if (OrZero || RHSOneBits.getBoolValue() || LHSOneBits.getBoolValue()) + if (OrZero || RHSBits.One.getBoolValue() || LHSBits.One.getBoolValue()) return true; } } @@ -1871,10 +1854,9 @@ bool isKnownNonZero(const Value *V, unsigned Depth, const Query &Q) { if (BO->hasNoUnsignedWrap()) return isKnownNonZero(X, Depth, Q); - APInt KnownZero(BitWidth, 0); - APInt KnownOne(BitWidth, 0); - computeKnownBits(X, KnownZero, KnownOne, Depth, Q); - if (KnownOne[0]) + KnownBits Known(BitWidth); + computeKnownBits(X, Known, Depth, Q); + if (Known.One[0]) return true; } // shr X, Y != 0 if X is negative. Note that the value of the shift is not @@ -1894,16 +1876,15 @@ bool isKnownNonZero(const Value *V, unsigned Depth, const Query &Q) { // out are known to be zero, and X is known non-zero then at least one // non-zero bit must remain. if (ConstantInt *Shift = dyn_cast(Y)) { - APInt KnownZero(BitWidth, 0); - APInt KnownOne(BitWidth, 0); - computeKnownBits(X, KnownZero, KnownOne, Depth, Q); + KnownBits Known(BitWidth); + computeKnownBits(X, Known, Depth, Q); auto ShiftVal = Shift->getLimitedValue(BitWidth - 1); // Is there a known one in the portion not shifted out? - if (KnownOne.countLeadingZeros() < BitWidth - ShiftVal) + if (Known.One.countLeadingZeros() < BitWidth - ShiftVal) return true; // Are all the bits to be shifted out known zero? - if (KnownZero.countTrailingOnes() >= ShiftVal) + if (Known.Zero.countTrailingOnes() >= ShiftVal) return isKnownNonZero(X, Depth, Q); } } @@ -1927,18 +1908,17 @@ bool isKnownNonZero(const Value *V, unsigned Depth, const Query &Q) { // If X and Y are both negative (as signed values) then their sum is not // zero unless both X and Y equal INT_MIN. if (BitWidth && XKnownNegative && YKnownNegative) { - APInt KnownZero(BitWidth, 0); - APInt KnownOne(BitWidth, 0); + KnownBits Known(BitWidth); APInt Mask = APInt::getSignedMaxValue(BitWidth); // The sign bit of X is set. If some other bit is set then X is not equal // to INT_MIN. - computeKnownBits(X, KnownZero, KnownOne, Depth, Q); - if ((KnownOne & Mask) != 0) + computeKnownBits(X, Known, Depth, Q); + if (Known.One.intersects(Mask)) return true; // The sign bit of Y is set. 
If some other bit is set then Y is not equal // to INT_MIN. - computeKnownBits(Y, KnownZero, KnownOne, Depth, Q); - if ((KnownOne & Mask) != 0) + computeKnownBits(Y, Known, Depth, Q); + if (Known.One.intersects(Mask)) return true; } @@ -1993,10 +1973,9 @@ bool isKnownNonZero(const Value *V, unsigned Depth, const Query &Q) { } if (!BitWidth) return false; - APInt KnownZero(BitWidth, 0); - APInt KnownOne(BitWidth, 0); - computeKnownBits(V, KnownZero, KnownOne, Depth, Q); - return KnownOne != 0; + KnownBits Known(BitWidth); + computeKnownBits(V, Known, Depth, Q); + return Known.One != 0; } /// Return true if V2 == V1 + X, where X is known non-zero. @@ -2028,14 +2007,13 @@ static bool isKnownNonEqual(const Value *V1, const Value *V2, const Query &Q) { // Are any known bits in V1 contradictory to known bits in V2? If V1 // has a known zero where V2 has a known one, they must not be equal. auto BitWidth = Ty->getBitWidth(); - APInt KnownZero1(BitWidth, 0); - APInt KnownOne1(BitWidth, 0); - computeKnownBits(V1, KnownZero1, KnownOne1, 0, Q); - APInt KnownZero2(BitWidth, 0); - APInt KnownOne2(BitWidth, 0); - computeKnownBits(V2, KnownZero2, KnownOne2, 0, Q); - - auto OppositeBits = (KnownZero1 & KnownOne2) | (KnownZero2 & KnownOne1); + KnownBits Known1(BitWidth); + computeKnownBits(V1, Known1, 0, Q); + KnownBits Known2(BitWidth); + computeKnownBits(V2, Known2, 0, Q); + + APInt OppositeBits = (Known1.Zero & Known2.One) | + (Known2.Zero & Known1.One); if (OppositeBits.getBoolValue()) return true; } @@ -2053,9 +2031,9 @@ static bool isKnownNonEqual(const Value *V1, const Value *V2, const Query &Q) { /// for all of the elements in the vector. bool MaskedValueIsZero(const Value *V, const APInt &Mask, unsigned Depth, const Query &Q) { - APInt KnownZero(Mask.getBitWidth(), 0), KnownOne(Mask.getBitWidth(), 0); - computeKnownBits(V, KnownZero, KnownOne, Depth, Q); - return (KnownZero & Mask) == Mask; + KnownBits Known(Mask.getBitWidth()); + computeKnownBits(V, Known, Depth, Q); + return Mask.isSubsetOf(Known.Zero); } /// For vector constants, loop over the elements and find the constant with the @@ -2233,17 +2211,17 @@ static unsigned ComputeNumSignBitsImpl(const Value *V, unsigned Depth, // Special case decrementing a value (ADD X, -1): if (const auto *CRHS = dyn_cast(U->getOperand(1))) if (CRHS->isAllOnesValue()) { - APInt KnownZero(TyBits, 0), KnownOne(TyBits, 0); - computeKnownBits(U->getOperand(0), KnownZero, KnownOne, Depth + 1, Q); + KnownBits Known(TyBits); + computeKnownBits(U->getOperand(0), Known, Depth + 1, Q); // If the input is known to be 0 or 1, the output is 0/-1, which is all // sign bits set. - if ((KnownZero | APInt(TyBits, 1)).isAllOnesValue()) + if ((Known.Zero | 1).isAllOnesValue()) return TyBits; // If we are subtracting one from a positive number, there is no carry // out of the result. - if (KnownZero.isSignBitSet()) + if (Known.Zero.isSignBitSet()) return Tmp; } @@ -2258,16 +2236,16 @@ static unsigned ComputeNumSignBitsImpl(const Value *V, unsigned Depth, // Handle NEG. if (const auto *CLHS = dyn_cast(U->getOperand(0))) if (CLHS->isNullValue()) { - APInt KnownZero(TyBits, 0), KnownOne(TyBits, 0); - computeKnownBits(U->getOperand(1), KnownZero, KnownOne, Depth + 1, Q); + KnownBits Known(TyBits); + computeKnownBits(U->getOperand(1), Known, Depth + 1, Q); // If the input is known to be 0 or 1, the output is 0/-1, which is all // sign bits set. 
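The isKnownNonEqual hunk above reduces to a single observation: if one value has a known zero in a position where the other has a known one, the two values cannot be equal. In miniature, with an illustrative struct rather than LLVM's KnownBits:

    #include <cassert>
    #include <cstdint>

    struct Bits { uint64_t Zero, One; };

    bool knownNonEqual(const Bits &A, const Bits &B) {
      return ((A.Zero & B.One) | (B.Zero & A.One)) != 0;
    }

    int main() {
      Bits A{0x1, 0x0};               // bit 0 known zero
      Bits B{0x0, 0x1};               // bit 0 known one
      Bits C{0x0, 0x0};               // nothing known
      assert(knownNonEqual(A, B));    // they differ in bit 0
      assert(!knownNonEqual(A, C));   // no contradiction, no conclusion
      return 0;
    }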
- if ((KnownZero | APInt(TyBits, 1)).isAllOnesValue()) + if ((Known.Zero | 1).isAllOnesValue()) return TyBits; // If the input is known to be positive (the sign bit is known clear), // the output of the NEG has the same number of sign bits as the input. - if (KnownZero.isSignBitSet()) + if (Known.Zero.isSignBitSet()) return Tmp2; // Otherwise, we treat this like a SUB. @@ -2319,16 +2297,16 @@ static unsigned ComputeNumSignBitsImpl(const Value *V, unsigned Depth, if (unsigned VecSignBits = computeNumSignBitsVectorConstant(V, TyBits)) return VecSignBits; - APInt KnownZero(TyBits, 0), KnownOne(TyBits, 0); - computeKnownBits(V, KnownZero, KnownOne, Depth, Q); + KnownBits Known(TyBits); + computeKnownBits(V, Known, Depth, Q); // If we know that the sign bit is either zero or one, determine the number of // identical bits in the top of the input value. - if (KnownZero.isSignBitSet()) - return std::max(FirstAnswer, KnownZero.countLeadingOnes()); + if (Known.Zero.isSignBitSet()) + return std::max(FirstAnswer, Known.Zero.countLeadingOnes()); - if (KnownOne.isSignBitSet()) - return std::max(FirstAnswer, KnownOne.countLeadingOnes()); + if (Known.One.isSignBitSet()) + return std::max(FirstAnswer, Known.One.countLeadingOnes()); // computeKnownBits gave us no extra information about the top bits. return FirstAnswer; @@ -3534,26 +3512,22 @@ OverflowResult llvm::computeOverflowForUnsignedMul(const Value *LHS, // we can guarantee that the result does not overflow. // Ref: "Hacker's Delight" by Henry Warren unsigned BitWidth = LHS->getType()->getScalarSizeInBits(); - APInt LHSKnownZero(BitWidth, 0); - APInt LHSKnownOne(BitWidth, 0); - APInt RHSKnownZero(BitWidth, 0); - APInt RHSKnownOne(BitWidth, 0); - computeKnownBits(LHS, LHSKnownZero, LHSKnownOne, DL, /*Depth=*/0, AC, CxtI, - DT); - computeKnownBits(RHS, RHSKnownZero, RHSKnownOne, DL, /*Depth=*/0, AC, CxtI, - DT); + KnownBits LHSKnown(BitWidth); + KnownBits RHSKnown(BitWidth); + computeKnownBits(LHS, LHSKnown, DL, /*Depth=*/0, AC, CxtI, DT); + computeKnownBits(RHS, RHSKnown, DL, /*Depth=*/0, AC, CxtI, DT); // Note that underestimating the number of zero bits gives a more // conservative answer. - unsigned ZeroBits = LHSKnownZero.countLeadingOnes() + - RHSKnownZero.countLeadingOnes(); + unsigned ZeroBits = LHSKnown.Zero.countLeadingOnes() + + RHSKnown.Zero.countLeadingOnes(); // First handle the easy case: if we have enough zero bits there's // definitely no overflow. if (ZeroBits >= BitWidth) return OverflowResult::NeverOverflows; // Get the largest possible values for each operand. - APInt LHSMax = ~LHSKnownZero; - APInt RHSMax = ~RHSKnownZero; + APInt LHSMax = ~LHSKnown.Zero; + APInt RHSMax = ~RHSKnown.Zero; // We know the multiply operation doesn't overflow if the maximum values for // each operand will not overflow after we multiply them together. @@ -3565,7 +3539,7 @@ OverflowResult llvm::computeOverflowForUnsignedMul(const Value *LHS, // We know it always overflows if multiplying the smallest possible values for // the operands also results in overflow. 
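computeOverflowForUnsignedMul above keeps the Hacker's Delight shortcut: when the operands' known leading zeros add up to at least the bit width, the full product fits and the multiply can never overflow (underestimating the zero count only makes the check more conservative). A 32-bit spot check of that claim (__builtin_clz assumes GCC/Clang):

    #include <cassert>
    #include <cstdint>

    static unsigned clz32(uint32_t V) { return V ? __builtin_clz(V) : 32; }

    int main() {
      for (uint32_t A = 1; A < (1u << 14); A += 1234)
        for (uint32_t B = 1; B < (1u << 18); B += 4321)
          if (clz32(A) + clz32(B) >= 32) {
            uint64_t Wide = (uint64_t)A * B;
            assert(Wide <= UINT32_MAX);   // 32x32 product still fits in 32 bits
          }
      return 0;
    }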
bool MinOverflow; - (void)LHSKnownOne.umul_ov(RHSKnownOne, MinOverflow); + (void)LHSKnown.One.umul_ov(RHSKnown.One, MinOverflow); if (MinOverflow) return OverflowResult::AlwaysOverflows; @@ -4284,11 +4258,10 @@ static bool isTruePredicate(CmpInst::Predicate Pred, // If X & C == 0 then (X | C) == X +_{nuw} C if (match(A, m_Or(m_Value(X), m_APInt(CA))) && match(B, m_Or(m_Specific(X), m_APInt(CB)))) { - unsigned BitWidth = CA->getBitWidth(); - APInt KnownZero(BitWidth, 0), KnownOne(BitWidth, 0); - computeKnownBits(X, KnownZero, KnownOne, DL, Depth + 1, AC, CxtI, DT); + KnownBits Known(CA->getBitWidth()); + computeKnownBits(X, Known, DL, Depth + 1, AC, CxtI, DT); - if ((KnownZero & *CA) == *CA && (KnownZero & *CB) == *CB) + if (CA->isSubsetOf(Known.Zero) && CB->isSubsetOf(Known.Zero)) return true; } diff --git a/lib/Bitcode/Writer/BitcodeWriter.cpp b/lib/Bitcode/Writer/BitcodeWriter.cpp index 1d3cde2..e5aba03 100644 --- a/lib/Bitcode/Writer/BitcodeWriter.cpp +++ b/lib/Bitcode/Writer/BitcodeWriter.cpp @@ -726,54 +726,50 @@ static uint64_t getAttrKindEncoding(Attribute::AttrKind Kind) { } void ModuleBitcodeWriter::writeAttributeGroupTable() { - const std::vector &AttrGrps = VE.getAttributeGroups(); + const std::vector &AttrGrps = + VE.getAttributeGroups(); if (AttrGrps.empty()) return; Stream.EnterSubblock(bitc::PARAMATTR_GROUP_BLOCK_ID, 3); SmallVector Record; - for (unsigned i = 0, e = AttrGrps.size(); i != e; ++i) { - AttributeList AS = AttrGrps[i]; - for (unsigned i = 0, e = AS.getNumSlots(); i != e; ++i) { - AttributeList A = AS.getSlotAttributes(i); - - Record.push_back(VE.getAttributeGroupID(A)); - Record.push_back(AS.getSlotIndex(i)); - - for (AttributeList::iterator I = AS.begin(0), E = AS.end(0); I != E; - ++I) { - Attribute Attr = *I; - if (Attr.isEnumAttribute()) { - Record.push_back(0); - Record.push_back(getAttrKindEncoding(Attr.getKindAsEnum())); - } else if (Attr.isIntAttribute()) { - Record.push_back(1); - Record.push_back(getAttrKindEncoding(Attr.getKindAsEnum())); - Record.push_back(Attr.getValueAsInt()); - } else { - StringRef Kind = Attr.getKindAsString(); - StringRef Val = Attr.getValueAsString(); - - Record.push_back(Val.empty() ? 3 : 4); - Record.append(Kind.begin(), Kind.end()); + for (ValueEnumerator::IndexAndAttrSet Pair : AttrGrps) { + unsigned AttrListIndex = Pair.first; + AttributeSet AS = Pair.second; + Record.push_back(VE.getAttributeGroupID(Pair)); + Record.push_back(AttrListIndex); + + for (Attribute Attr : AS) { + if (Attr.isEnumAttribute()) { + Record.push_back(0); + Record.push_back(getAttrKindEncoding(Attr.getKindAsEnum())); + } else if (Attr.isIntAttribute()) { + Record.push_back(1); + Record.push_back(getAttrKindEncoding(Attr.getKindAsEnum())); + Record.push_back(Attr.getValueAsInt()); + } else { + StringRef Kind = Attr.getKindAsString(); + StringRef Val = Attr.getValueAsString(); + + Record.push_back(Val.empty() ? 
3 : 4); + Record.append(Kind.begin(), Kind.end()); + Record.push_back(0); + if (!Val.empty()) { + Record.append(Val.begin(), Val.end()); Record.push_back(0); - if (!Val.empty()) { - Record.append(Val.begin(), Val.end()); - Record.push_back(0); - } } } - - Stream.EmitRecord(bitc::PARAMATTR_GRP_CODE_ENTRY, Record); - Record.clear(); } + + Stream.EmitRecord(bitc::PARAMATTR_GRP_CODE_ENTRY, Record); + Record.clear(); } Stream.ExitBlock(); } void ModuleBitcodeWriter::writeAttributeTable() { - const std::vector &Attrs = VE.getAttributes(); + const std::vector &Attrs = VE.getAttributeLists(); if (Attrs.empty()) return; Stream.EnterSubblock(bitc::PARAMATTR_BLOCK_ID, 3); @@ -782,7 +778,8 @@ void ModuleBitcodeWriter::writeAttributeTable() { for (unsigned i = 0, e = Attrs.size(); i != e; ++i) { const AttributeList &A = Attrs[i]; for (unsigned i = 0, e = A.getNumSlots(); i != e; ++i) - Record.push_back(VE.getAttributeGroupID(A.getSlotAttributes(i))); + Record.push_back( + VE.getAttributeGroupID({A.getSlotIndex(i), A.getSlotAttributes(i)})); Stream.EmitRecord(bitc::PARAMATTR_CODE_ENTRY, Record); Record.clear(); @@ -1270,7 +1267,7 @@ void ModuleBitcodeWriter::writeModuleInfo() { Vals.push_back(F.getCallingConv()); Vals.push_back(F.isDeclaration()); Vals.push_back(getEncodedLinkage(F)); - Vals.push_back(VE.getAttributeID(F.getAttributes())); + Vals.push_back(VE.getAttributeListID(F.getAttributes())); Vals.push_back(Log2_32(F.getAlignment())+1); Vals.push_back(F.hasSection() ? SectionMap[F.getSection()] : 0); Vals.push_back(getEncodedVisibility(F)); @@ -2616,7 +2613,7 @@ void ModuleBitcodeWriter::writeInstruction(const Instruction &I, Code = bitc::FUNC_CODE_INST_INVOKE; - Vals.push_back(VE.getAttributeID(II->getAttributes())); + Vals.push_back(VE.getAttributeListID(II->getAttributes())); Vals.push_back(II->getCallingConv() | 1 << 13); Vals.push_back(VE.getValueID(II->getNormalDest())); Vals.push_back(VE.getValueID(II->getUnwindDest())); @@ -2808,7 +2805,7 @@ void ModuleBitcodeWriter::writeInstruction(const Instruction &I, Code = bitc::FUNC_CODE_INST_CALL; - Vals.push_back(VE.getAttributeID(CI.getAttributes())); + Vals.push_back(VE.getAttributeListID(CI.getAttributes())); unsigned Flags = getOptimizationFlags(&I); Vals.push_back(CI.getCallingConv() << bitc::CALL_CCONV | diff --git a/lib/Bitcode/Writer/ValueEnumerator.cpp b/lib/Bitcode/Writer/ValueEnumerator.cpp index 3800d9a..8611507 100644 --- a/lib/Bitcode/Writer/ValueEnumerator.cpp +++ b/lib/Bitcode/Writer/ValueEnumerator.cpp @@ -891,19 +891,19 @@ void ValueEnumerator::EnumerateAttributes(AttributeList PAL) { if (PAL.isEmpty()) return; // null is always 0. // Do a lookup. - unsigned &Entry = AttributeMap[PAL]; + unsigned &Entry = AttributeListMap[PAL]; if (Entry == 0) { // Never saw this before, add it. - Attribute.push_back(PAL); - Entry = Attribute.size(); + AttributeLists.push_back(PAL); + Entry = AttributeLists.size(); } // Do lookups for all attribute groups. 
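The ValueEnumerator hunks above and below switch the attribute-group table from AttributeList keys to (slot index, AttributeSet) pairs, because the bitcode group record encodes the index alongside the attributes. A minimal model of the "assign an ID on first sight of a pair key" pattern used there, with std::map and strings standing in for LLVM's DenseMap and AttributeSet (all names here are illustrative):

    #include <cassert>
    #include <map>
    #include <string>
    #include <utility>
    #include <vector>

    using IndexAndAttrs = std::pair<unsigned, std::string>;

    struct GroupTable {
      std::map<IndexAndAttrs, unsigned> IDs;  // 0 is reserved for "no attributes"
      std::vector<IndexAndAttrs> Groups;

      unsigned enumerate(const IndexAndAttrs &Key) {
        unsigned &Entry = IDs[Key];
        if (Entry == 0) {                     // never seen: append and number it
          Groups.push_back(Key);
          Entry = Groups.size();
        }
        return Entry;
      }
    };

    int main() {
      GroupTable T;
      unsigned A = T.enumerate({~0u, "nounwind readonly"});   // function slot
      unsigned B = T.enumerate({1u, "nonnull"});              // first parameter
      assert(A == 1 && B == 2);
      assert(T.enumerate({~0u, "nounwind readonly"}) == A);   // same pair, same ID
      assert(T.enumerate({2u, "nonnull"}) == 3);              // same attrs, new index
      return 0;
    }

The same attribute set on a different slot index gets a distinct ID, which is the whole point of keying the map on the pair rather than on the attributes alone.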
for (unsigned i = 0, e = PAL.getNumSlots(); i != e; ++i) { - AttributeList AS = PAL.getSlotAttributes(i); - unsigned &Entry = AttributeGroupMap[AS]; + IndexAndAttrSet Pair = {PAL.getSlotIndex(i), PAL.getSlotAttributes(i)}; + unsigned &Entry = AttributeGroupMap[Pair]; if (Entry == 0) { - AttributeGroups.push_back(AS); + AttributeGroups.push_back(Pair); Entry = AttributeGroups.size(); } } diff --git a/lib/Bitcode/Writer/ValueEnumerator.h b/lib/Bitcode/Writer/ValueEnumerator.h index 8a82aab..e7ccc8d 100644 --- a/lib/Bitcode/Writer/ValueEnumerator.h +++ b/lib/Bitcode/Writer/ValueEnumerator.h @@ -48,6 +48,10 @@ public: // For each value, we remember its Value* and occurrence frequency. typedef std::vector > ValueList; + /// Attribute groups as encoded in bitcode are almost AttributeSets, but they + /// include the AttributeList index, so we have to track that in our map. + typedef std::pair IndexAndAttrSet; + UseListOrderStack UseListOrders; private: @@ -102,13 +106,13 @@ private: bool ShouldPreserveUseListOrder; - typedef DenseMap AttributeGroupMapType; + typedef DenseMap AttributeGroupMapType; AttributeGroupMapType AttributeGroupMap; - std::vector AttributeGroups; + std::vector AttributeGroups; - typedef DenseMap AttributeMapType; - AttributeMapType AttributeMap; - std::vector Attribute; + typedef DenseMap AttributeListMapType; + AttributeListMapType AttributeListMap; + std::vector AttributeLists; /// GlobalBasicBlockIDs - This map memoizes the basic block ID's referenced by /// the "getGlobalBasicBlockID" method. @@ -166,16 +170,17 @@ public: unsigned getInstructionID(const Instruction *I) const; void setInstructionID(const Instruction *I); - unsigned getAttributeID(AttributeList PAL) const { + unsigned getAttributeListID(AttributeList PAL) const { if (PAL.isEmpty()) return 0; // Null maps to zero. - AttributeMapType::const_iterator I = AttributeMap.find(PAL); - assert(I != AttributeMap.end() && "Attribute not in ValueEnumerator!"); + AttributeListMapType::const_iterator I = AttributeListMap.find(PAL); + assert(I != AttributeListMap.end() && "Attribute not in ValueEnumerator!"); return I->second; } - unsigned getAttributeGroupID(AttributeList PAL) const { - if (PAL.isEmpty()) return 0; // Null maps to zero. - AttributeGroupMapType::const_iterator I = AttributeGroupMap.find(PAL); + unsigned getAttributeGroupID(IndexAndAttrSet Group) const { + if (!Group.second.hasAttributes()) + return 0; // Null maps to zero. + AttributeGroupMapType::const_iterator I = AttributeGroupMap.find(Group); assert(I != AttributeGroupMap.end() && "Attribute not in ValueEnumerator!"); return I->second; } @@ -206,8 +211,8 @@ public: const std::vector &getBasicBlocks() const { return BasicBlocks; } - const std::vector &getAttributes() const { return Attribute; } - const std::vector &getAttributeGroups() const { + const std::vector &getAttributeLists() const { return AttributeLists; } + const std::vector &getAttributeGroups() const { return AttributeGroups; } diff --git a/lib/CodeGen/AggressiveAntiDepBreaker.cpp b/lib/CodeGen/AggressiveAntiDepBreaker.cpp index 955524c..3a57772 100644 --- a/lib/CodeGen/AggressiveAntiDepBreaker.cpp +++ b/lib/CodeGen/AggressiveAntiDepBreaker.cpp @@ -964,10 +964,8 @@ unsigned AggressiveAntiDepBreaker::BreakAntiDependencies( // sure to update that as well. 
const SUnit *SU = MISUnitMap[Q.second.Operand->getParent()]; if (!SU) continue; - for (DbgValueVector::iterator DVI = DbgValues.begin(), - DVE = DbgValues.end(); DVI != DVE; ++DVI) - if (DVI->second == Q.second.Operand->getParent()) - UpdateDbgValue(*DVI->first, AntiDepReg, NewReg); + UpdateDbgValues(DbgValues, Q.second.Operand->getParent(), + AntiDepReg, NewReg); } // We just went back in time and modified history; the diff --git a/lib/CodeGen/AntiDepBreaker.h b/lib/CodeGen/AntiDepBreaker.h index 04f7f41..d14d931 100644 --- a/lib/CodeGen/AntiDepBreaker.h +++ b/lib/CodeGen/AntiDepBreaker.h @@ -60,6 +60,25 @@ public: if (MI.getOperand(0).isReg() && MI.getOperand(0).getReg() == OldReg) MI.getOperand(0).setReg(NewReg); } + + /// Update all DBG_VALUE instructions that may be affected by the dependency + /// breaker's update of ParentMI to use NewReg. + void UpdateDbgValues(const DbgValueVector &DbgValues, MachineInstr *ParentMI, + unsigned OldReg, unsigned NewReg) { + // The following code is dependent on the order in which the DbgValues are + // constructed in ScheduleDAGInstrs::buildSchedGraph. + MachineInstr *PrevDbgMI = nullptr; + for (const auto &DV : make_range(DbgValues.crbegin(), DbgValues.crend())) { + MachineInstr *PrevMI = DV.second; + if ((PrevMI == ParentMI) || (PrevMI == PrevDbgMI)) { + MachineInstr *DbgMI = DV.first; + UpdateDbgValue(*DbgMI, OldReg, NewReg); + PrevDbgMI = DbgMI; + } else if (PrevDbgMI) { + break; // If no match and already found a DBG_VALUE, we're done. + } + } + } }; } diff --git a/lib/CodeGen/AsmPrinter/AsmPrinter.cpp b/lib/CodeGen/AsmPrinter/AsmPrinter.cpp index 028c79f..d99065b 100644 --- a/lib/CodeGen/AsmPrinter/AsmPrinter.cpp +++ b/lib/CodeGen/AsmPrinter/AsmPrinter.cpp @@ -825,41 +825,25 @@ static bool emitDebugValueComment(const MachineInstr *MI, AsmPrinter &AP) { OS << Name << ":"; } OS << V->getName(); - - const DIExpression *Expr = MI->getDebugExpression(); - auto Fragment = Expr->getFragmentInfo(); - if (Fragment) - OS << " [fragment offset=" << Fragment->OffsetInBits - << " size=" << Fragment->SizeInBits << "]"; OS << " <- "; // The second operand is only an offset if it's an immediate. - bool Deref = false; bool MemLoc = MI->getOperand(0).isReg() && MI->getOperand(1).isImm(); int64_t Offset = MemLoc ? MI->getOperand(1).getImm() : 0; - for (unsigned i = 0; i < Expr->getNumElements(); ++i) { - uint64_t Op = Expr->getElement(i); - if (Op == dwarf::DW_OP_LLVM_fragment) { - // There can't be any operands after this in a valid expression - break; - } else if (Deref) { - // We currently don't support extra Offsets or derefs after the first - // one. Bail out early instead of emitting an incorrect comment. - OS << " [complex expression]"; - AP.OutStreamer->emitRawComment(OS.str()); - return true; - } else if (Op == dwarf::DW_OP_deref) { - Deref = true; - continue; - } - - uint64_t ExtraOffset = Expr->getElement(i++); - if (Op == dwarf::DW_OP_plus) - Offset += ExtraOffset; - else { - assert(Op == dwarf::DW_OP_minus); - Offset -= ExtraOffset; + const DIExpression *Expr = MI->getDebugExpression(); + if (Expr->getNumElements()) { + OS << '['; + bool NeedSep = false; + for (auto Op : Expr->expr_ops()) { + if (NeedSep) + OS << ", "; + else + NeedSep = true; + OS << dwarf::OperationEncodingString(Op.getOp()); + for (unsigned I = 0; I < Op.getNumArgs(); ++I) + OS << ' ' << Op.getArg(I); } + OS << "] "; } // Register or immediate value. Register 0 means undef. 
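// Illustration of the generic expression printing above: a DBG_VALUE whose
// expression is !DIExpression(DW_OP_plus, 8, DW_OP_deref) is now annotated
// roughly as "<var> <- [DW_OP_plus 8, DW_OP_deref] <loc>" instead of bailing
// out with "[complex expression]". A minimal stand-alone sketch of the same
// traversal; the helper name and headers are assumptions for illustration,
// not part of this change:
#include "llvm/IR/DebugInfoMetadata.h"
#include "llvm/Support/Dwarf.h"
#include "llvm/Support/raw_ostream.h"

static void printExprOps(const llvm::DIExpression *Expr,
                         llvm::raw_ostream &OS) {
  bool NeedSep = false;
  for (auto Op : Expr->expr_ops()) {   // walk the DW_OP_* operators in order
    if (NeedSep)
      OS << ", ";
    NeedSep = true;
    OS << llvm::dwarf::OperationEncodingString(Op.getOp());
    for (unsigned I = 0, E = Op.getNumArgs(); I != E; ++I)
      OS << ' ' << Op.getArg(I);       // literal operands, e.g. the offset 8
  }
}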
@@ -890,7 +874,7 @@ static bool emitDebugValueComment(const MachineInstr *MI, AsmPrinter &AP) { const TargetFrameLowering *TFI = AP.MF->getSubtarget().getFrameLowering(); Offset += TFI->getFrameIndexReference(*AP.MF, MI->getOperand(0).getIndex(), Reg); - Deref = true; + MemLoc = true; } if (Reg == 0) { // Suppress offset, it is not meaningful here. @@ -899,12 +883,12 @@ static bool emitDebugValueComment(const MachineInstr *MI, AsmPrinter &AP) { AP.OutStreamer->emitRawComment(OS.str()); return true; } - if (MemLoc || Deref) + if (MemLoc) OS << '['; OS << PrintReg(Reg, AP.MF->getSubtarget().getRegisterInfo()); } - if (MemLoc || Deref) + if (MemLoc) OS << '+' << Offset << ']'; // NOTE: Want this comment at start of line, don't emit with AddComment. @@ -936,6 +920,16 @@ void AsmPrinter::emitCFIInstruction(const MachineInstr &MI) { if (needsCFIMoves() == CFI_M_None) return; + // If there is no "real" instruction following this CFI instruction, skip + // emitting it; it would be beyond the end of the function's FDE range. + auto *MBB = MI.getParent(); + auto I = std::next(MI.getIterator()); + while (I != MBB->end() && I->isTransient()) + ++I; + if (I == MBB->instr_end() && + MBB->getReverseIterator() == MBB->getParent()->rbegin()) + return; + const std::vector &Instrs = MF->getFrameInstructions(); unsigned CFIIndex = MI.getOperand(0).getCFIIndex(); const MCCFIInstruction &CFI = Instrs[CFIIndex]; @@ -1046,15 +1040,23 @@ void AsmPrinter::EmitFunctionBody() { // If the function is empty and the object file uses .subsections_via_symbols, // then we need to emit *something* to the function body to prevent the // labels from collapsing together. Just emit a noop. - if ((MAI->hasSubsectionsViaSymbols() && !HasAnyRealCode)) { + // Similarly, don't emit empty functions on Windows either. It can lead to + // duplicate entries (two functions with the same RVA) in the Guard CF Table + // after linking, causing the kernel not to load the binary: + // https://developercommunity.visualstudio.com/content/problem/45366/vc-linker-creates-invalid-dll-with-clang-cl.html + // FIXME: Hide this behind some API in e.g. MCAsmInfo or MCTargetStreamer. + const Triple &TT = TM.getTargetTriple(); + if (!HasAnyRealCode && (MAI->hasSubsectionsViaSymbols() || + (TT.isOSWindows() && TT.isOSBinFormatCOFF()))) { MCInst Noop; - MF->getSubtarget().getInstrInfo()->getNoopForMachoTarget(Noop); - OutStreamer->AddComment("avoids zero-length function"); + MF->getSubtarget().getInstrInfo()->getNoop(Noop); // Targets can opt-out of emitting the noop here by leaving the opcode // unspecified. 
- if (Noop.getOpcode()) + if (Noop.getOpcode()) { + OutStreamer->AddComment("avoids zero-length function"); OutStreamer->EmitInstruction(Noop, getSubtargetInfo()); + } } const Function *F = MF->getFunction(); diff --git a/lib/CodeGen/AsmPrinter/AsmPrinterInlineAsm.cpp b/lib/CodeGen/AsmPrinter/AsmPrinterInlineAsm.cpp index 683e622..a0bf163 100644 --- a/lib/CodeGen/AsmPrinter/AsmPrinterInlineAsm.cpp +++ b/lib/CodeGen/AsmPrinter/AsmPrinterInlineAsm.cpp @@ -144,6 +144,9 @@ void AsmPrinter::EmitInlineAsm(StringRef Str, const MCSubtargetInfo &STI, " we don't have an asm parser for this target\n"); Parser->setAssemblerDialect(Dialect); Parser->setTargetParser(*TAP.get()); + if (Dialect == InlineAsm::AD_Intel) + // We need this flag to be able to parse numbers like "0bH" + Parser->setParsingInlineAsm(true); if (MF) { const TargetRegisterInfo *TRI = MF->getSubtarget().getRegisterInfo(); TAP->SetFrameRegister(TRI->getFrameRegister(*MF)); diff --git a/lib/CodeGen/AsmPrinter/DIE.cpp b/lib/CodeGen/AsmPrinter/DIE.cpp index 31c2b3b..30bfd7c 100644 --- a/lib/CodeGen/AsmPrinter/DIE.cpp +++ b/lib/CodeGen/AsmPrinter/DIE.cpp @@ -655,20 +655,12 @@ void DIEEntry::EmitValue(const AsmPrinter *AP, dwarf::Form Form) const { case dwarf::DW_FORM_ref_addr: { // Get the absolute offset for this DIE within the debug info/types section. unsigned Addr = Entry->getDebugSectionOffset(); - if (AP->MAI->doesDwarfUseRelocationsAcrossSections()) { - const DwarfDebug *DD = AP->getDwarfDebug(); - if (DD) - assert(!DD->useSplitDwarf() && - "TODO: dwo files can't have relocations."); - const DIEUnit *Unit = Entry->getUnit(); - assert(Unit && "CUDie should belong to a CU."); - MCSection *Section = Unit->getSection(); - if (Section) { - const MCSymbol *SectionSym = Section->getBeginSymbol(); - AP->EmitLabelPlusOffset(SectionSym, Addr, SizeOf(AP, Form), true); - return; - } + if (const MCSymbol *SectionSym = + Entry->getUnit()->getCrossSectionRelativeBaseAddress()) { + AP->EmitLabelPlusOffset(SectionSym, Addr, SizeOf(AP, Form), true); + return; } + AP->OutStreamer->EmitIntValue(Addr, SizeOf(AP, Form)); return; } diff --git a/lib/CodeGen/AsmPrinter/DwarfCompileUnit.h b/lib/CodeGen/AsmPrinter/DwarfCompileUnit.h index 9a64b4b..20a4151 100644 --- a/lib/CodeGen/AsmPrinter/DwarfCompileUnit.h +++ b/lib/CodeGen/AsmPrinter/DwarfCompileUnit.h @@ -28,7 +28,7 @@ class DwarfFile; class MCSymbol; class LexicalScope; -class DwarfCompileUnit : public DwarfUnit { +class DwarfCompileUnit final : public DwarfUnit { /// A numeric ID unique among all CUs in the module unsigned UniqueID; diff --git a/lib/CodeGen/AsmPrinter/DwarfDebug.cpp b/lib/CodeGen/AsmPrinter/DwarfDebug.cpp index d72656b..6f442f5 100644 --- a/lib/CodeGen/AsmPrinter/DwarfDebug.cpp +++ b/lib/CodeGen/AsmPrinter/DwarfDebug.cpp @@ -91,14 +91,6 @@ DwarfAccelTables("dwarf-accel-tables", cl::Hidden, cl::init(Default)); static cl::opt -SplitDwarf("split-dwarf", cl::Hidden, - cl::desc("Output DWARF5 split debug info."), - cl::values(clEnumVal(Default, "Default for platform"), - clEnumVal(Enable, "Enabled"), - clEnumVal(Disable, "Disabled")), - cl::init(Default)); - -static cl::opt DwarfPubSections("generate-dwarf-pub-sections", cl::Hidden, cl::desc("Generate DWARF pubnames and pubtypes sections"), cl::values(clEnumVal(Default, "Default for platform"), @@ -253,11 +245,8 @@ DwarfDebug::DwarfDebug(AsmPrinter *A, Module *M) HasAppleExtensionAttributes = tuneForLLDB(); - // Handle split DWARF. Off by default for now. 
- if (SplitDwarf == Default) - HasSplitDwarf = false; - else - HasSplitDwarf = SplitDwarf == Enable; + // Handle split DWARF. + HasSplitDwarf = !Asm->TM.Options.MCOptions.SplitDwarfFile.empty(); // Pubnames/pubtypes on by default for GDB. if (DwarfPubSections == Default) @@ -412,7 +401,7 @@ DwarfDebug::constructDwarfCompileUnit(const DICompileUnit *DIUnit) { if (useSplitDwarf()) { NewCU.setSkeleton(constructSkeletonCU(NewCU)); NewCU.addString(Die, dwarf::DW_AT_GNU_dwo_name, - DIUnit->getSplitDebugFilename()); + Asm->TM.Options.MCOptions.SplitDwarfFile); } // LTO with assembly output shares a single line table amongst multiple CUs. @@ -1885,7 +1874,7 @@ void DwarfDebug::emitDebugMacinfo() { void DwarfDebug::initSkeletonUnit(const DwarfUnit &U, DIE &Die, std::unique_ptr NewU) { NewU->addString(Die, dwarf::DW_AT_GNU_dwo_name, - U.getCUNode()->getSplitDebugFilename()); + Asm->TM.Options.MCOptions.SplitDwarfFile); if (!CompilationDir.empty()) NewU->addString(Die, dwarf::DW_AT_comp_dir, CompilationDir); diff --git a/lib/CodeGen/AsmPrinter/DwarfExpression.cpp b/lib/CodeGen/AsmPrinter/DwarfExpression.cpp index f65dc15..ccd3269 100644 --- a/lib/CodeGen/AsmPrinter/DwarfExpression.cpp +++ b/lib/CodeGen/AsmPrinter/DwarfExpression.cpp @@ -117,8 +117,9 @@ bool DwarfExpression::addMachineReg(const TargetRegisterInfo &TRI, // Otherwise, attempt to find a covering set of sub-register numbers. // For example, Q0 on ARM is a composition of D0+D1. unsigned CurPos = 0; - // The size of the register in bits, assuming 8 bits per byte. - unsigned RegSize = TRI.getMinimalPhysRegClass(MachineReg)->getSize() * 8; + // The size of the register in bits. + const TargetRegisterClass *RC = TRI.getMinimalPhysRegClass(MachineReg); + unsigned RegSize = TRI.getRegSizeInBits(*RC); // Keep track of the bits in the register we already emitted, so we // can avoid emitting redundant aliasing subregs. SmallBitVector Coverage(RegSize, false); @@ -198,8 +199,10 @@ bool DwarfExpression::addMachineRegExpression(const TargetRegisterInfo &TRI, unsigned MachineReg, unsigned FragmentOffsetInBits) { auto Fragment = ExprCursor.getFragmentInfo(); - if (!addMachineReg(TRI, MachineReg, Fragment ? Fragment->SizeInBits : ~1U)) + if (!addMachineReg(TRI, MachineReg, Fragment ? Fragment->SizeInBits : ~1U)) { + LocationKind = Unknown; return false; + } bool HasComplexExpression = false; auto Op = ExprCursor.peek(); @@ -212,6 +215,7 @@ bool DwarfExpression::addMachineRegExpression(const TargetRegisterInfo &TRI, // operation to multiple DW_OP_pieces. 
if (HasComplexExpression && DwarfRegs.size() > 1) { DwarfRegs.clear(); + LocationKind = Unknown; return false; } @@ -233,6 +237,7 @@ bool DwarfExpression::addMachineRegExpression(const TargetRegisterInfo &TRI, return Op.getOp() == dwarf::DW_OP_stack_value; })) { DwarfRegs.clear(); + LocationKind = Unknown; return false; } @@ -343,7 +348,6 @@ void DwarfExpression::addExpression(DIExpressionCursor &&ExprCursor, emitUnsigned(Op->getArg(0)); break; case dwarf::DW_OP_stack_value: - assert(LocationKind == Unknown || LocationKind == Implicit); LocationKind = Implicit; break; case dwarf::DW_OP_swap: diff --git a/lib/CodeGen/AsmPrinter/DwarfUnit.cpp b/lib/CodeGen/AsmPrinter/DwarfUnit.cpp index bac0c20..16fb20d 100644 --- a/lib/CodeGen/AsmPrinter/DwarfUnit.cpp +++ b/lib/CodeGen/AsmPrinter/DwarfUnit.cpp @@ -1595,3 +1595,11 @@ void DwarfTypeUnit::addGlobalType(const DIType *Ty, const DIE &Die, const DIScope *Context) { getCU().addGlobalTypeUnitType(Ty, Context); } + +const MCSymbol *DwarfUnit::getCrossSectionRelativeBaseAddress() const { + if (!Asm->MAI->doesDwarfUseRelocationsAcrossSections()) + return nullptr; + if (isDwoUnit()) + return nullptr; + return getSection()->getBeginSymbol(); +} diff --git a/lib/CodeGen/AsmPrinter/DwarfUnit.h b/lib/CodeGen/AsmPrinter/DwarfUnit.h index d626ef9..e84df46 100644 --- a/lib/CodeGen/AsmPrinter/DwarfUnit.h +++ b/lib/CodeGen/AsmPrinter/DwarfUnit.h @@ -104,8 +104,6 @@ protected: bool applySubprogramDefinitionAttributes(const DISubprogram *SP, DIE &SPDie); public: - virtual ~DwarfUnit(); - // Accessors. AsmPrinter* getAsmPrinter() const { return Asm; } uint16_t getLanguage() const { return CUNode->getSourceLanguage(); } @@ -289,6 +287,8 @@ public: void constructTypeDIE(DIE &Buffer, const DICompositeType *CTy); protected: + ~DwarfUnit(); + /// Create new static data member DIE. DIE *getOrCreateStaticMemberDIE(const DIDerivedType *DT); @@ -335,9 +335,10 @@ private: void setIndexTyDie(DIE *D) { IndexTyDie = D; } virtual bool isDwoUnit() const = 0; + const MCSymbol *getCrossSectionRelativeBaseAddress() const override; }; -class DwarfTypeUnit : public DwarfUnit { +class DwarfTypeUnit final : public DwarfUnit { uint64_t TypeSignature; const DIE *Ty; DwarfCompileUnit &CU; diff --git a/lib/CodeGen/CriticalAntiDepBreaker.cpp b/lib/CodeGen/CriticalAntiDepBreaker.cpp index e1eeddf..b2d6652 100644 --- a/lib/CodeGen/CriticalAntiDepBreaker.cpp +++ b/lib/CodeGen/CriticalAntiDepBreaker.cpp @@ -648,10 +648,8 @@ BreakAntiDependencies(const std::vector& SUnits, // as well. const SUnit *SU = MISUnitMap[Q->second->getParent()]; if (!SU) continue; - for (DbgValueVector::iterator DVI = DbgValues.begin(), - DVE = DbgValues.end(); DVI != DVE; ++DVI) - if (DVI->second == Q->second->getParent()) - UpdateDbgValue(*DVI->first, AntiDepReg, NewReg); + UpdateDbgValues(DbgValues, Q->second->getParent(), + AntiDepReg, NewReg); } // We just went back in time and modified history; the diff --git a/lib/CodeGen/GlobalISel/InstructionSelect.cpp b/lib/CodeGen/GlobalISel/InstructionSelect.cpp index 26454c1..cf97c63 100644 --- a/lib/CodeGen/GlobalISel/InstructionSelect.cpp +++ b/lib/CodeGen/GlobalISel/InstructionSelect.cpp @@ -145,6 +145,8 @@ bool InstructionSelect::runOnMachineFunction(MachineFunction &MF) { } } + const TargetRegisterInfo &TRI = *MF.getSubtarget().getRegisterInfo(); + // Now that selection is complete, there are no more generic vregs. Verify // that the size of the now-constrained vreg is unchanged and that it has a // register class. 
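// The query pattern the surrounding hunks migrate to: size and spill
// properties of a register class now come from TargetRegisterInfo rather than
// from TargetRegisterClass itself. A hedged sketch with an assumed helper
// name, for illustration only:
#include "llvm/CodeGen/MachineFrameInfo.h"
#include "llvm/Target/TargetRegisterInfo.h"

static int createSpillSlotForClass(llvm::MachineFrameInfo &MFI,
                                   const llvm::TargetRegisterInfo &TRI,
                                   const llvm::TargetRegisterClass &RC) {
  // TRI.getRegSizeInBits(RC) likewise replaces RC.getSize() * 8 for register
  // width queries such as the one in the hunk below.
  unsigned SpillSize  = TRI.getSpillSize(RC);      // was RC.getSize()
  unsigned SpillAlign = TRI.getSpillAlignment(RC); // was RC.getAlignment()
  return MFI.CreateSpillStackObject(SpillSize, SpillAlign);
}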
@@ -165,7 +167,7 @@ bool InstructionSelect::runOnMachineFunction(MachineFunction &MF) { continue; if (VRegToType.second.isValid() && - VRegToType.second.getSizeInBits() > (RC->getSize() * 8)) { + VRegToType.second.getSizeInBits() > TRI.getRegSizeInBits(*RC)) { reportGISelFailure(MF, TPC, MORE, "gisel-select", "VReg has explicit size different from class size", *MI); diff --git a/lib/CodeGen/GlobalISel/LegalizerHelper.cpp b/lib/CodeGen/GlobalISel/LegalizerHelper.cpp index 5877807..ef5818d 100644 --- a/lib/CodeGen/GlobalISel/LegalizerHelper.cpp +++ b/lib/CodeGen/GlobalISel/LegalizerHelper.cpp @@ -76,6 +76,12 @@ void LegalizerHelper::extractParts(unsigned Reg, LLT Ty, int NumParts, static RTLIB::Libcall getRTLibDesc(unsigned Opcode, unsigned Size) { switch (Opcode) { + case TargetOpcode::G_SDIV: + assert(Size == 32 && "Unsupported size"); + return RTLIB::SDIV_I32; + case TargetOpcode::G_UDIV: + assert(Size == 32 && "Unsupported size"); + return RTLIB::UDIV_I32; case TargetOpcode::G_FADD: assert((Size == 32 || Size == 64) && "Unsupported size"); return Size == 64 ? RTLIB::ADD_F64 : RTLIB::ADD_F32; @@ -87,31 +93,43 @@ static RTLIB::Libcall getRTLibDesc(unsigned Opcode, unsigned Size) { llvm_unreachable("Unknown libcall function"); } +static LegalizerHelper::LegalizeResult +simpleLibcall(MachineInstr &MI, MachineIRBuilder &MIRBuilder, unsigned Size, + Type *OpType) { + auto &CLI = *MIRBuilder.getMF().getSubtarget().getCallLowering(); + auto &TLI = *MIRBuilder.getMF().getSubtarget().getTargetLowering(); + auto Libcall = getRTLibDesc(MI.getOpcode(), Size); + const char *Name = TLI.getLibcallName(Libcall); + MIRBuilder.getMF().getFrameInfo().setHasCalls(true); + CLI.lowerCall(MIRBuilder, TLI.getLibcallCallingConv(Libcall), + MachineOperand::CreateES(Name), + {MI.getOperand(0).getReg(), OpType}, + {{MI.getOperand(1).getReg(), OpType}, + {MI.getOperand(2).getReg(), OpType}}); + MI.eraseFromParent(); + return LegalizerHelper::Legalized; +} + LegalizerHelper::LegalizeResult LegalizerHelper::libcall(MachineInstr &MI) { LLT Ty = MRI.getType(MI.getOperand(0).getReg()); unsigned Size = Ty.getSizeInBits(); + auto &Ctx = MIRBuilder.getMF().getFunction()->getContext(); MIRBuilder.setInstr(MI); switch (MI.getOpcode()) { default: return UnableToLegalize; + case TargetOpcode::G_SDIV: + case TargetOpcode::G_UDIV: { + Type *Ty = Type::getInt32Ty(Ctx); + return simpleLibcall(MI, MIRBuilder, Size, Ty); + } case TargetOpcode::G_FADD: case TargetOpcode::G_FPOW: case TargetOpcode::G_FREM: { - auto &Ctx = MIRBuilder.getMF().getFunction()->getContext(); Type *Ty = Size == 64 ? 
Type::getDoubleTy(Ctx) : Type::getFloatTy(Ctx); - auto &CLI = *MIRBuilder.getMF().getSubtarget().getCallLowering(); - auto &TLI = *MIRBuilder.getMF().getSubtarget().getTargetLowering(); - auto Libcall = getRTLibDesc(MI.getOpcode(), Size); - const char *Name = TLI.getLibcallName(Libcall); - MIRBuilder.getMF().getFrameInfo().setHasCalls(true); - CLI.lowerCall( - MIRBuilder, TLI.getLibcallCallingConv(Libcall), - MachineOperand::CreateES(Name), {MI.getOperand(0).getReg(), Ty}, - {{MI.getOperand(1).getReg(), Ty}, {MI.getOperand(2).getReg(), Ty}}); - MI.eraseFromParent(); - return Legalized; + return simpleLibcall(MI, MIRBuilder, Size, Ty); } } } diff --git a/lib/CodeGen/GlobalISel/RegisterBank.cpp b/lib/CodeGen/GlobalISel/RegisterBank.cpp index 940957d..83b21e6 100644 --- a/lib/CodeGen/GlobalISel/RegisterBank.cpp +++ b/lib/CodeGen/GlobalISel/RegisterBank.cpp @@ -48,7 +48,7 @@ bool RegisterBank::verify(const TargetRegisterInfo &TRI) const { // Verify that the Size of the register bank is big enough to cover // all the register classes it covers. - assert((getSize() >= SubRC.getSize() * 8) && + assert(getSize() >= TRI.getRegSizeInBits(SubRC) && "Size is not big enough for all the subclasses!"); assert(covers(SubRC) && "Not all subclasses are covered"); } diff --git a/lib/CodeGen/GlobalISel/RegisterBankInfo.cpp b/lib/CodeGen/GlobalISel/RegisterBankInfo.cpp index b2df2f1..d5ae9a6 100644 --- a/lib/CodeGen/GlobalISel/RegisterBankInfo.cpp +++ b/lib/CodeGen/GlobalISel/RegisterBankInfo.cpp @@ -421,7 +421,7 @@ unsigned RegisterBankInfo::getSizeInBits(unsigned Reg, RC = MRI.getRegClass(Reg); } assert(RC && "Unable to deduce the register class"); - return RC->getSize() * 8; + return TRI.getRegSizeInBits(*RC); } //------------------------------------------------------------------------------ diff --git a/lib/CodeGen/MIRPrinter.cpp b/lib/CodeGen/MIRPrinter.cpp index 6da174a..b6624b8 100644 --- a/lib/CodeGen/MIRPrinter.cpp +++ b/lib/CodeGen/MIRPrinter.cpp @@ -925,9 +925,6 @@ void MIPrinter::print(const MachineOperand &Op, const TargetRegisterInfo *TRI, << CmpInst::getPredicateName(Pred) << ')'; break; } - case MachineOperand::MO_Placeholder: - OS << ""; - break; } } diff --git a/lib/CodeGen/MachineInstr.cpp b/lib/CodeGen/MachineInstr.cpp index 4bd5fbf..1faf629 100644 --- a/lib/CodeGen/MachineInstr.cpp +++ b/lib/CodeGen/MachineInstr.cpp @@ -287,8 +287,6 @@ bool MachineOperand::isIdenticalTo(const MachineOperand &Other) const { return getIntrinsicID() == Other.getIntrinsicID(); case MachineOperand::MO_Predicate: return getPredicate() == Other.getPredicate(); - case MachineOperand::MO_Placeholder: - return true; } llvm_unreachable("Invalid machine operand type"); } @@ -337,8 +335,6 @@ hash_code llvm::hash_value(const MachineOperand &MO) { return hash_combine(MO.getType(), MO.getTargetFlags(), MO.getIntrinsicID()); case MachineOperand::MO_Predicate: return hash_combine(MO.getType(), MO.getTargetFlags(), MO.getPredicate()); - case MachineOperand::MO_Placeholder: - return hash_combine(); } llvm_unreachable("Invalid machine operand type"); } @@ -515,9 +511,6 @@ void MachineOperand::print(raw_ostream &OS, ModuleSlotTracker &MST, << CmpInst::getPredicateName(Pred) << '>'; break; } - case MachineOperand::MO_Placeholder: - OS << ""; - break; } if (unsigned TF = getTargetFlags()) OS << "[TF=" << TF << ']'; diff --git a/lib/CodeGen/MachineLICM.cpp b/lib/CodeGen/MachineLICM.cpp index b3d1843..7eb9917 100644 --- a/lib/CodeGen/MachineLICM.cpp +++ b/lib/CodeGen/MachineLICM.cpp @@ -330,7 +330,7 @@ bool 
MachineLICM::runOnMachineFunction(MachineFunction &MF) { /// Return true if instruction stores to the specified frame. static bool InstructionStoresToFI(const MachineInstr *MI, int FI) { // If we lost memory operands, conservatively assume that the instruction - // writes to all slots. + // writes to all slots. if (MI->memoperands_empty()) return true; for (const MachineMemOperand *MemOp : MI->memoperands()) { @@ -708,7 +708,7 @@ void MachineLICM::SinkIntoLoop() { for (MachineBasicBlock::instr_iterator I = Preheader->instr_begin(); I != Preheader->instr_end(); ++I) { // We need to ensure that we can safely move this instruction into the loop. - // As such, it must not have side-effects, e.g. such as a call has. + // As such, it must not have side-effects, e.g. such as a call has. if (IsLoopInvariantInst(*I) && !HasLoopPHIUse(&*I)) Candidates.push_back(&*I); } @@ -837,9 +837,9 @@ MachineLICM::calcRegisterCost(const MachineInstr *MI, bool ConsiderSeen, /// constant pool. static bool mayLoadFromGOTOrConstantPool(MachineInstr &MI) { assert (MI.mayLoad() && "Expected MI that loads!"); - + // If we lost memory operands, conservatively assume that the instruction - // reads from everything.. + // reads from everything.. if (MI.memoperands_empty()) return true; @@ -1337,7 +1337,7 @@ bool MachineLICM::Hoist(MachineInstr *MI, MachineBasicBlock *Preheader) { Preheader->splice(Preheader->getFirstTerminator(),MI->getParent(),MI); // Since we are moving the instruction out of its basic block, we do not - // retain its debug location. Doing so would degrade the debugging + // retain its debug location. Doing so would degrade the debugging // experience and adversely affect the accuracy of profiling information. MI->setDebugLoc(DebugLoc()); diff --git a/lib/CodeGen/PrologEpilogInserter.cpp b/lib/CodeGen/PrologEpilogInserter.cpp index 1354009..570a0cd 100644 --- a/lib/CodeGen/PrologEpilogInserter.cpp +++ b/lib/CodeGen/PrologEpilogInserter.cpp @@ -373,22 +373,22 @@ static void assignCalleeSavedSpillSlots(MachineFunction &F, FixedSlot->Reg != Reg) ++FixedSlot; + unsigned Size = RegInfo->getSpillSize(*RC); if (FixedSlot == FixedSpillSlots + NumFixedSpillSlots) { // Nope, just spill it anywhere convenient. - unsigned Align = RC->getAlignment(); + unsigned Align = RegInfo->getSpillAlignment(*RC); unsigned StackAlign = TFI->getStackAlignment(); // We may not be able to satisfy the desired alignment specification of // the TargetRegisterClass if the stack alignment is smaller. Use the // min. Align = std::min(Align, StackAlign); - FrameIdx = MFI.CreateStackObject(RC->getSize(), Align, true); + FrameIdx = MFI.CreateStackObject(Size, Align, true); if ((unsigned)FrameIdx < MinCSFrameIndex) MinCSFrameIndex = FrameIdx; if ((unsigned)FrameIdx > MaxCSFrameIndex) MaxCSFrameIndex = FrameIdx; } else { // Spill it to the stack where we must. - FrameIdx = - MFI.CreateFixedSpillStackObject(RC->getSize(), FixedSlot->Offset); + FrameIdx = MFI.CreateFixedSpillStackObject(Size, FixedSlot->Offset); } CS.setFrameIdx(FrameIdx); diff --git a/lib/CodeGen/RegAllocFast.cpp b/lib/CodeGen/RegAllocFast.cpp index 283d846..c606b7b 100644 --- a/lib/CodeGen/RegAllocFast.cpp +++ b/lib/CodeGen/RegAllocFast.cpp @@ -212,8 +212,9 @@ int RAFast::getStackSpaceFor(unsigned VirtReg, const TargetRegisterClass *RC) { return SS; // Already has space allocated? // Allocate a new stack object for this spill location... 
- int FrameIdx = MF->getFrameInfo().CreateSpillStackObject(RC->getSize(), - RC->getAlignment()); + unsigned Size = TRI->getSpillSize(*RC); + unsigned Align = TRI->getSpillAlignment(*RC); + int FrameIdx = MF->getFrameInfo().CreateSpillStackObject(Size, Align); // Assign the slot. StackSlotForVirtReg[VirtReg] = FrameIdx; diff --git a/lib/CodeGen/RegisterScavenging.cpp b/lib/CodeGen/RegisterScavenging.cpp index 6392136..35db30f 100644 --- a/lib/CodeGen/RegisterScavenging.cpp +++ b/lib/CodeGen/RegisterScavenging.cpp @@ -395,8 +395,8 @@ unsigned RegScavenger::scavengeRegister(const TargetRegisterClass *RC, // Find an available scavenging slot with size and alignment matching // the requirements of the class RC. const MachineFrameInfo &MFI = MF.getFrameInfo(); - unsigned NeedSize = RC->getSize(); - unsigned NeedAlign = RC->getAlignment(); + unsigned NeedSize = TRI->getSpillSize(*RC); + unsigned NeedAlign = TRI->getSpillAlignment(*RC); unsigned SI = Scavenged.size(), Diff = std::numeric_limits::max(); int FIB = MFI.getObjectIndexBegin(), FIE = MFI.getObjectIndexEnd(); diff --git a/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/lib/CodeGen/SelectionDAG/DAGCombiner.cpp index 4702d63..1251ae6 100644 --- a/lib/CodeGen/SelectionDAG/DAGCombiner.cpp +++ b/lib/CodeGen/SelectionDAG/DAGCombiner.cpp @@ -3878,27 +3878,29 @@ static bool isBSwapHWordElement(SDValue N, MutableArrayRef Parts) { if (Opc != ISD::AND && Opc != ISD::SHL && Opc != ISD::SRL) return false; + SDValue N0 = N.getOperand(0); + unsigned Opc0 = N0.getOpcode(); + ConstantSDNode *N1C = dyn_cast(N.getOperand(1)); if (!N1C) return false; - unsigned Num; + unsigned MaskByteOffset; switch (N1C->getZExtValue()) { default: return false; - case 0xFF: Num = 0; break; - case 0xFF00: Num = 1; break; - case 0xFF0000: Num = 2; break; - case 0xFF000000: Num = 3; break; + case 0xFF: MaskByteOffset = 0; break; + case 0xFF00: MaskByteOffset = 1; break; + case 0xFF0000: MaskByteOffset = 2; break; + case 0xFF000000: MaskByteOffset = 3; break; } // Look for (x & 0xff) << 8 as well as ((x << 8) & 0xff00). 
- SDValue N0 = N.getOperand(0); if (Opc == ISD::AND) { - if (Num == 0 || Num == 2) { + if (MaskByteOffset == 0 || MaskByteOffset == 2) { // (x >> 8) & 0xff // (x >> 8) & 0xff0000 - if (N0.getOpcode() != ISD::SRL) + if (Opc0 != ISD::SRL) return false; ConstantSDNode *C = dyn_cast(N0.getOperand(1)); if (!C || C->getZExtValue() != 8) @@ -3906,7 +3908,7 @@ static bool isBSwapHWordElement(SDValue N, MutableArrayRef Parts) { } else { // (x << 8) & 0xff00 // (x << 8) & 0xff000000 - if (N0.getOpcode() != ISD::SHL) + if (Opc0 != ISD::SHL) return false; ConstantSDNode *C = dyn_cast(N0.getOperand(1)); if (!C || C->getZExtValue() != 8) @@ -3915,7 +3917,7 @@ static bool isBSwapHWordElement(SDValue N, MutableArrayRef Parts) { } else if (Opc == ISD::SHL) { // (x & 0xff) << 8 // (x & 0xff0000) << 8 - if (Num != 0 && Num != 2) + if (MaskByteOffset != 0 && MaskByteOffset != 2) return false; ConstantSDNode *C = dyn_cast(N.getOperand(1)); if (!C || C->getZExtValue() != 8) @@ -3923,17 +3925,17 @@ static bool isBSwapHWordElement(SDValue N, MutableArrayRef Parts) { } else { // Opc == ISD::SRL // (x & 0xff00) >> 8 // (x & 0xff000000) >> 8 - if (Num != 1 && Num != 3) + if (MaskByteOffset != 1 && MaskByteOffset != 3) return false; ConstantSDNode *C = dyn_cast(N.getOperand(1)); if (!C || C->getZExtValue() != 8) return false; } - if (Parts[Num]) + if (Parts[MaskByteOffset]) return false; - Parts[Num] = N0.getOperand(0).getNode(); + Parts[MaskByteOffset] = N0.getOperand(0).getNode(); return true; } @@ -4198,20 +4200,22 @@ SDValue DAGCombiner::visitOR(SDNode *N) { // reassociate or if (SDValue ROR = ReassociateOps(ISD::OR, SDLoc(N), N0, N1)) return ROR; + // Canonicalize (or (and X, c1), c2) -> (and (or X, c2), c1|c2) // iff (c1 & c2) != 0. - if (N1C && N0.getOpcode() == ISD::AND && N0.getNode()->hasOneUse() && - isa(N0.getOperand(1))) { - ConstantSDNode *C1 = cast(N0.getOperand(1)); - if ((C1->getAPIntValue() & N1C->getAPIntValue()) != 0) { - if (SDValue COR = DAG.FoldConstantArithmetic(ISD::OR, SDLoc(N1), VT, - N1C, C1)) - return DAG.getNode( - ISD::AND, SDLoc(N), VT, - DAG.getNode(ISD::OR, SDLoc(N0), VT, N0.getOperand(0), N1), COR); - return SDValue(); + if (N1C && N0.getOpcode() == ISD::AND && N0.getNode()->hasOneUse()) { + if (ConstantSDNode *C1 = dyn_cast(N0.getOperand(1))) { + if (C1->getAPIntValue().intersects(N1C->getAPIntValue())) { + if (SDValue COR = + DAG.FoldConstantArithmetic(ISD::OR, SDLoc(N1), VT, N1C, C1)) + return DAG.getNode( + ISD::AND, SDLoc(N), VT, + DAG.getNode(ISD::OR, SDLoc(N0), VT, N0.getOperand(0), N1), COR); + return SDValue(); + } } } + // Simplify: (or (op x...), (op y...)) -> (op (or x, y)) if (N0.getOpcode() == N1.getOpcode()) if (SDValue Tmp = SimplifyBinOpWithSameOpcodeHands(N)) @@ -5611,24 +5615,24 @@ SDValue DAGCombiner::visitSRL(SDNode *N) { // fold (srl (trunc (srl x, c1)), c2) -> 0 or (trunc (srl x, (add c1, c2))) if (N1C && N0.getOpcode() == ISD::TRUNCATE && - N0.getOperand(0).getOpcode() == ISD::SRL && - isa(N0.getOperand(0)->getOperand(1))) { - uint64_t c1 = - cast(N0.getOperand(0)->getOperand(1))->getZExtValue(); - uint64_t c2 = N1C->getZExtValue(); - EVT InnerShiftVT = N0.getOperand(0).getValueType(); - EVT ShiftCountVT = N0.getOperand(0)->getOperand(1).getValueType(); - uint64_t InnerShiftSize = InnerShiftVT.getScalarSizeInBits(); - // This is only valid if the OpSizeInBits + c1 = size of inner shift. 
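// Worked example for the fold above (values chosen for illustration): with
// x : i64, c1 = 32 and a truncation to i32, OpSizeInBits (32) + c1 (32)
// equals the inner shift width (64), so
//   (srl (trunc (srl x, 32)), 8) --> (trunc (srl x, 40))
// and once c1 + c2 reaches 64 or more the whole expression folds to the
// constant 0.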
- if (c1 + OpSizeInBits == InnerShiftSize) { - SDLoc DL(N0); - if (c1 + c2 >= InnerShiftSize) - return DAG.getConstant(0, DL, VT); - return DAG.getNode(ISD::TRUNCATE, DL, VT, - DAG.getNode(ISD::SRL, DL, InnerShiftVT, - N0.getOperand(0)->getOperand(0), - DAG.getConstant(c1 + c2, DL, - ShiftCountVT))); + N0.getOperand(0).getOpcode() == ISD::SRL) { + if (auto N001C = isConstOrConstSplat(N0.getOperand(0).getOperand(1))) { + uint64_t c1 = N001C->getZExtValue(); + uint64_t c2 = N1C->getZExtValue(); + EVT InnerShiftVT = N0.getOperand(0).getValueType(); + EVT ShiftCountVT = N0.getOperand(0).getOperand(1).getValueType(); + uint64_t InnerShiftSize = InnerShiftVT.getScalarSizeInBits(); + // This is only valid if the OpSizeInBits + c1 = size of inner shift. + if (c1 + OpSizeInBits == InnerShiftSize) { + SDLoc DL(N0); + if (c1 + c2 >= InnerShiftSize) + return DAG.getConstant(0, DL, VT); + return DAG.getNode(ISD::TRUNCATE, DL, VT, + DAG.getNode(ISD::SRL, DL, InnerShiftVT, + N0.getOperand(0).getOperand(0), + DAG.getConstant(c1 + c2, DL, + ShiftCountVT))); + } } } @@ -11641,7 +11645,7 @@ bool DAGCombiner::SliceUpLoad(SDNode *N) { // Check if this is a trunc(lshr). if (User->getOpcode() == ISD::SRL && User->hasOneUse() && isa(User->getOperand(1))) { - Shift = cast(User->getOperand(1))->getZExtValue(); + Shift = User->getConstantOperandVal(1); User = *User->use_begin(); } @@ -13120,8 +13124,7 @@ SDValue DAGCombiner::visitINSERT_VECTOR_ELT(SDNode *N) { // do this only if indices are both constants and Idx1 < Idx0. if (InVec.getOpcode() == ISD::INSERT_VECTOR_ELT && InVec.hasOneUse() && isa(InVec.getOperand(2))) { - unsigned OtherElt = - cast(InVec.getOperand(2))->getZExtValue(); + unsigned OtherElt = InVec.getConstantOperandVal(2); if (Elt < OtherElt) { // Swap nodes. SDValue NewOp = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, VT, @@ -14065,7 +14068,7 @@ static SDValue combineConcatVectorOfExtracts(SDNode *N, SelectionDAG &DAG) { if (!isa(Op.getOperand(1))) return SDValue(); - int ExtIdx = cast(Op.getOperand(1))->getZExtValue(); + int ExtIdx = Op.getConstantOperandVal(1); // Ensure that we are extracting a subvector from a vector the same // size as the result. @@ -15049,7 +15052,7 @@ SDValue DAGCombiner::visitINSERT_SUBVECTOR(SDNode *N) { if (N0.getOpcode() == ISD::INSERT_SUBVECTOR && N0.hasOneUse() && N1.getValueType() == N0.getOperand(1).getValueType() && isa(N0.getOperand(2))) { - unsigned OtherIdx = cast(N0.getOperand(2))->getZExtValue(); + unsigned OtherIdx = N0.getConstantOperandVal(2); if (InsIdx < OtherIdx) { // Swap nodes. SDValue NewOp = DAG.getNode(ISD::INSERT_SUBVECTOR, SDLoc(N), VT, @@ -16088,6 +16091,19 @@ bool DAGCombiner::isAlias(LSBaseSDNode *Op0, LSBaseSDNode *Op1) const { if (Op1->isInvariant() && Op0->writeMem()) return false; + unsigned NumBytes0 = Op0->getMemoryVT().getSizeInBits() >> 3; + unsigned NumBytes1 = Op1->getMemoryVT().getSizeInBits() >> 3; + + // Check for BaseIndexOffset matching. + BaseIndexOffset BasePtr0 = BaseIndexOffset::match(Op0->getBasePtr(), DAG); + BaseIndexOffset BasePtr1 = BaseIndexOffset::match(Op1->getBasePtr(), DAG); + if (BasePtr0.equalBaseIndex(BasePtr1)) + return !((BasePtr0.Offset + NumBytes0 <= BasePtr1.Offset) || + (BasePtr1.Offset + NumBytes1 <= BasePtr0.Offset)); + + // FIXME: findBaseOffset and ConstantValue/GlobalValue/FrameIndex analysis + // modified to use BaseIndexOffset. + // Gather base node and offset information. 
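// The BaseIndexOffset check above reduces aliasing to an interval test once
// both pointers share the same base and index: accesses covering
// [Off0, Off0 + NumBytes0) and [Off1, Off1 + NumBytes1) are disjoint exactly
// when one ends at or before the start of the other, i.e.
//   bool Overlap = !(Off0 + NumBytes0 <= Off1 || Off1 + NumBytes1 <= Off0);
// For example, two 4-byte accesses at offsets 0 and 4 do not overlap, while
// accesses at offsets 0 and 3 do.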
SDValue Base0, Base1; int64_t Offset0, Offset1; @@ -16099,8 +16115,6 @@ bool DAGCombiner::isAlias(LSBaseSDNode *Op0, LSBaseSDNode *Op1) const { Base1, Offset1, GV1, CV1); // If they have the same base address, then check to see if they overlap. - unsigned NumBytes0 = Op0->getMemoryVT().getSizeInBits() >> 3; - unsigned NumBytes1 = Op1->getMemoryVT().getSizeInBits() >> 3; if (Base0 == Base1 || (GV0 && (GV0 == GV1)) || (CV0 && (CV0 == CV1))) return !((Offset0 + NumBytes0) <= Offset1 || (Offset1 + NumBytes1) <= Offset0); diff --git a/lib/CodeGen/SelectionDAG/InstrEmitter.cpp b/lib/CodeGen/SelectionDAG/InstrEmitter.cpp index e85d195..b235e19 100644 --- a/lib/CodeGen/SelectionDAG/InstrEmitter.cpp +++ b/lib/CodeGen/SelectionDAG/InstrEmitter.cpp @@ -161,7 +161,8 @@ EmitCopyFromReg(SDNode *Node, unsigned ResNo, bool IsClone, bool IsCloned, if (VRBase) { DstRC = MRI->getRegClass(VRBase); } else if (UseRC) { - assert(UseRC->hasType(VT) && "Incompatible phys register def and uses!"); + assert(TRI->isTypeLegalForClass(*UseRC, VT) && + "Incompatible phys register def and uses!"); DstRC = UseRC; } else { DstRC = TLI->getRegClassFor(VT); diff --git a/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp b/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp index 3bae3bf..fdebb8b 100644 --- a/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp +++ b/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp @@ -3497,11 +3497,11 @@ bool SelectionDAGLegalize::ExpandNode(SDNode *Node) { // part. unsigned LoSize = VT.getSizeInBits(); SDValue HiLHS = - DAG.getNode(ISD::SRA, dl, VT, RHS, + DAG.getNode(ISD::SRA, dl, VT, LHS, DAG.getConstant(LoSize - 1, dl, TLI.getPointerTy(DAG.getDataLayout()))); SDValue HiRHS = - DAG.getNode(ISD::SRA, dl, VT, LHS, + DAG.getNode(ISD::SRA, dl, VT, RHS, DAG.getConstant(LoSize - 1, dl, TLI.getPointerTy(DAG.getDataLayout()))); diff --git a/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp b/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp index 85068e8..9ed70c9 100644 --- a/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp +++ b/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp @@ -3251,7 +3251,7 @@ SDValue DAGTypeLegalizer::PromoteIntRes_EXTRACT_SUBVECTOR(SDNode *N) { Ops.push_back(Op); } - return DAG.getNode(ISD::BUILD_VECTOR, dl, NOutVT, Ops); + return DAG.getBuildVector(NOutVT, dl, Ops); } @@ -3294,7 +3294,7 @@ SDValue DAGTypeLegalizer::PromoteIntRes_BUILD_VECTOR(SDNode *N) { Ops.push_back(Op); } - return DAG.getNode(ISD::BUILD_VECTOR, dl, NOutVT, Ops); + return DAG.getBuildVector(NOutVT, dl, Ops); } SDValue DAGTypeLegalizer::PromoteIntRes_SCALAR_TO_VECTOR(SDNode *N) { @@ -3342,7 +3342,7 @@ SDValue DAGTypeLegalizer::PromoteIntRes_CONCAT_VECTORS(SDNode *N) { } } - return DAG.getNode(ISD::BUILD_VECTOR, dl, NOutVT, Ops); + return DAG.getBuildVector(NOutVT, dl, Ops); } SDValue DAGTypeLegalizer::PromoteIntRes_EXTEND_VECTOR_INREG(SDNode *N) { @@ -3445,5 +3445,5 @@ SDValue DAGTypeLegalizer::PromoteIntOp_CONCAT_VECTORS(SDNode *N) { } } - return DAG.getNode(ISD::BUILD_VECTOR, dl, N->getValueType(0), NewOps); + return DAG.getBuildVector(N->getValueType(0), dl, NewOps); } diff --git a/lib/CodeGen/SelectionDAG/LegalizeTypesGeneric.cpp b/lib/CodeGen/SelectionDAG/LegalizeTypesGeneric.cpp index c02b896..aa69e0e 100644 --- a/lib/CodeGen/SelectionDAG/LegalizeTypesGeneric.cpp +++ b/lib/CodeGen/SelectionDAG/LegalizeTypesGeneric.cpp @@ -362,8 +362,8 @@ SDValue DAGTypeLegalizer::ExpandOp_BITCAST(SDNode *N) { SmallVector Ops; IntegerToVector(N->getOperand(0), NumElts, Ops, NVT.getVectorElementType()); - SDValue Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, 
NVT, - makeArrayRef(Ops.data(), NumElts)); + SDValue Vec = + DAG.getBuildVector(NVT, dl, makeArrayRef(Ops.data(), NumElts)); return DAG.getNode(ISD::BITCAST, dl, N->getValueType(0), Vec); } @@ -396,10 +396,8 @@ SDValue DAGTypeLegalizer::ExpandOp_BUILD_VECTOR(SDNode *N) { NewElts.push_back(Hi); } - SDValue NewVec = DAG.getNode(ISD::BUILD_VECTOR, dl, - EVT::getVectorVT(*DAG.getContext(), - NewVT, NewElts.size()), - NewElts); + EVT NewVecVT = EVT::getVectorVT(*DAG.getContext(), NewVT, NewElts.size()); + SDValue NewVec = DAG.getBuildVector(NewVecVT, dl, NewElts); // Convert the new vector to the old vector type. return DAG.getNode(ISD::BITCAST, dl, VecVT, NewVec); @@ -458,7 +456,7 @@ SDValue DAGTypeLegalizer::ExpandOp_SCALAR_TO_VECTOR(SDNode *N) { SDValue UndefVal = DAG.getUNDEF(Ops[0].getValueType()); for (unsigned i = 1; i < NumElts; ++i) Ops[i] = UndefVal; - return DAG.getNode(ISD::BUILD_VECTOR, dl, VT, Ops); + return DAG.getBuildVector(VT, dl, Ops); } SDValue DAGTypeLegalizer::ExpandOp_NormalStore(SDNode *N, unsigned OpNo) { diff --git a/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp b/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp index 1a7d7b7..4a31602 100644 --- a/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp +++ b/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp @@ -512,7 +512,7 @@ SDValue DAGTypeLegalizer::ScalarizeVecOp_UnaryOp(SDNode *N) { N->getValueType(0).getScalarType(), Elt); // Revectorize the result so the types line up with what the uses of this // expression expect. - return DAG.getNode(ISD::BUILD_VECTOR, SDLoc(N), N->getValueType(0), Op); + return DAG.getBuildVector(N->getValueType(0), SDLoc(N), Op); } /// The vectors to concatenate have length one - use a BUILD_VECTOR instead. @@ -523,16 +523,16 @@ SDValue DAGTypeLegalizer::ScalarizeVecOp_CONCAT_VECTORS(SDNode *N) { return DAG.getBuildVector(N->getValueType(0), SDLoc(N), Ops); } -/// If the input is a vector that needs to be scalarized, it must be <1 x ty>, -/// so just return the element, ignoring the index. -SDValue DAGTypeLegalizer::ScalarizeVecOp_EXTRACT_VECTOR_ELT(SDNode *N) { - SDValue Res = GetScalarizedVector(N->getOperand(0)); - if (Res.getValueType() != N->getValueType(0)) - Res = DAG.getNode(ISD::ANY_EXTEND, SDLoc(N), N->getValueType(0), - Res); - return Res; -} - +/// If the input is a vector that needs to be scalarized, it must be <1 x ty>, +/// so just return the element, ignoring the index. 
+SDValue DAGTypeLegalizer::ScalarizeVecOp_EXTRACT_VECTOR_ELT(SDNode *N) { + EVT VT = N->getValueType(0); + SDValue Res = GetScalarizedVector(N->getOperand(0)); + if (Res.getValueType() != VT) + Res = DAG.getNode(ISD::ANY_EXTEND, SDLoc(N), VT, Res); + return Res; +} + /// If the input condition is a vector that needs to be scalarized, it must be /// <1 x i1>, so just convert to a normal ISD::SELECT @@ -2631,7 +2631,7 @@ SDValue DAGTypeLegalizer::WidenVecRes_BITCAST(SDNode *N) { if (InVT.isVector()) NewVec = DAG.getNode(ISD::CONCAT_VECTORS, dl, NewInVT, Ops); else - NewVec = DAG.getNode(ISD::BUILD_VECTOR, dl, NewInVT, Ops); + NewVec = DAG.getBuildVector(NewInVT, dl, Ops); return DAG.getNode(ISD::BITCAST, dl, WidenVT, NewVec); } } diff --git a/lib/CodeGen/SelectionDAG/ScheduleDAGRRList.cpp b/lib/CodeGen/SelectionDAG/ScheduleDAGRRList.cpp index e923e30..69b76fb 100644 --- a/lib/CodeGen/SelectionDAG/ScheduleDAGRRList.cpp +++ b/lib/CodeGen/SelectionDAG/ScheduleDAGRRList.cpp @@ -1320,6 +1320,18 @@ DelayForLiveRegsBottomUp(SUnit *SU, SmallVectorImpl &LRegs) { RegAdded, LRegs); const MCInstrDesc &MCID = TII->get(Node->getMachineOpcode()); + if (MCID.hasOptionalDef()) { + // Most ARM instructions have an OptionalDef for CPSR, to model the S-bit. + // This operand can be either a def of CPSR, if the S bit is set; or a use + // of %noreg. When the OptionalDef is set to a valid register, we need to + // handle it in the same way as an ImplicitDef. + for (unsigned i = 0; i < MCID.getNumDefs(); ++i) + if (MCID.OpInfo[i].isOptionalDef()) { + const SDValue &OptionalDef = Node->getOperand(i - Node->getNumValues()); + unsigned Reg = cast(OptionalDef)->getReg(); + CheckForLiveRegDef(SU, Reg, LiveRegDefs.get(), RegAdded, LRegs, TRI); + } + } if (!MCID.ImplicitDefs) continue; for (const MCPhysReg *Reg = MCID.getImplicitDefs(); *Reg; ++Reg) diff --git a/lib/CodeGen/SelectionDAG/SelectionDAG.cpp b/lib/CodeGen/SelectionDAG/SelectionDAG.cpp index 523f409..439f67f 100644 --- a/lib/CodeGen/SelectionDAG/SelectionDAG.cpp +++ b/lib/CodeGen/SelectionDAG/SelectionDAG.cpp @@ -36,6 +36,7 @@ #include "llvm/IR/Intrinsics.h" #include "llvm/Support/Debug.h" #include "llvm/Support/ErrorHandling.h" +#include "llvm/Support/KnownBits.h" #include "llvm/Support/ManagedStatic.h" #include "llvm/Support/MathExtras.h" #include "llvm/Support/Mutex.h" @@ -2868,7 +2869,7 @@ bool SelectionDAG::isKnownToBeAPowerOfTwo(SDValue Val) const { // A left-shift of a constant one will have exactly one bit set because // shifting the bit off the end is undefined. if (Val.getOpcode() == ISD::SHL) { - auto *C = dyn_cast(Val.getOperand(0)); + auto *C = isConstOrConstSplat(Val.getOperand(0)); if (C && C->getAPIntValue() == 1) return true; } @@ -2876,7 +2877,7 @@ bool SelectionDAG::isKnownToBeAPowerOfTwo(SDValue Val) const { // Similarly, a logical right-shift of a constant sign-bit will have exactly // one bit set. 
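// For example, with an i8 sign mask the node (srl 0x80, x) has exactly the
// single bit (0x80 >> x) set for any in-range shift amount, so it is a power
// of two. Switching to isConstOrConstSplat below lets the same reasoning
// apply when the shifted value is a splat BUILD_VECTOR of the sign mask
// rather than a scalar constant.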
if (Val.getOpcode() == ISD::SRL) { - auto *C = dyn_cast(Val.getOperand(0)); + auto *C = isConstOrConstSplat(Val.getOperand(0)); if (C && C->getAPIntValue().isSignMask()) return true; } @@ -7539,10 +7540,10 @@ unsigned SelectionDAG::InferPtrAlignment(SDValue Ptr) const { int64_t GVOffset = 0; if (TLI->isGAPlusOffset(Ptr.getNode(), GV, GVOffset)) { unsigned PtrWidth = getDataLayout().getPointerTypeSizeInBits(GV->getType()); - APInt KnownZero(PtrWidth, 0), KnownOne(PtrWidth, 0); - llvm::computeKnownBits(const_cast(GV), KnownZero, KnownOne, + KnownBits Known(PtrWidth); + llvm::computeKnownBits(const_cast(GV), Known, getDataLayout()); - unsigned AlignBits = KnownZero.countTrailingOnes(); + unsigned AlignBits = Known.Zero.countTrailingOnes(); unsigned Align = AlignBits ? 1 << std::min(31U, AlignBits) : 0; if (Align) return MinAlign(Align, GVOffset); @@ -7629,52 +7630,52 @@ Type *ConstantPoolSDNode::getType() const { return Val.ConstVal->getType(); } -bool BuildVectorSDNode::isConstantSplat(APInt &SplatValue, - APInt &SplatUndef, +bool BuildVectorSDNode::isConstantSplat(APInt &SplatValue, APInt &SplatUndef, unsigned &SplatBitSize, bool &HasAnyUndefs, unsigned MinSplatBits, - bool isBigEndian) const { + bool IsBigEndian) const { EVT VT = getValueType(0); assert(VT.isVector() && "Expected a vector type"); - unsigned sz = VT.getSizeInBits(); - if (MinSplatBits > sz) + unsigned VecWidth = VT.getSizeInBits(); + if (MinSplatBits > VecWidth) return false; - SplatValue = APInt(sz, 0); - SplatUndef = APInt(sz, 0); + // FIXME: The widths are based on this node's type, but build vectors can + // truncate their operands. + SplatValue = APInt(VecWidth, 0); + SplatUndef = APInt(VecWidth, 0); - // Get the bits. Bits with undefined values (when the corresponding element + // Get the bits. Bits with undefined values (when the corresponding element // of the vector is an ISD::UNDEF value) are set in SplatUndef and cleared - // in SplatValue. If any of the values are not constant, give up and return + // in SplatValue. If any of the values are not constant, give up and return // false. - unsigned int nOps = getNumOperands(); - assert(nOps > 0 && "isConstantSplat has 0-size build vector"); - unsigned EltBitSize = VT.getScalarSizeInBits(); + unsigned int NumOps = getNumOperands(); + assert(NumOps > 0 && "isConstantSplat has 0-size build vector"); + unsigned EltWidth = VT.getScalarSizeInBits(); - for (unsigned j = 0; j < nOps; ++j) { - unsigned i = isBigEndian ? nOps-1-j : j; + for (unsigned j = 0; j < NumOps; ++j) { + unsigned i = IsBigEndian ? NumOps - 1 - j : j; SDValue OpVal = getOperand(i); - unsigned BitPos = j * EltBitSize; + unsigned BitPos = j * EltWidth; if (OpVal.isUndef()) - SplatUndef.setBits(BitPos, BitPos + EltBitSize); - else if (ConstantSDNode *CN = dyn_cast(OpVal)) - SplatValue.insertBits(CN->getAPIntValue().zextOrTrunc(EltBitSize), - BitPos); - else if (ConstantFPSDNode *CN = dyn_cast(OpVal)) + SplatUndef.setBits(BitPos, BitPos + EltWidth); + else if (auto *CN = dyn_cast(OpVal)) + SplatValue.insertBits(CN->getAPIntValue().zextOrTrunc(EltWidth), BitPos); + else if (auto *CN = dyn_cast(OpVal)) SplatValue.insertBits(CN->getValueAPF().bitcastToAPInt(), BitPos); else return false; } - // The build_vector is all constants or undefs. Find the smallest element + // The build_vector is all constants or undefs. Find the smallest element // size that splats the vector. 
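// Illustration of the halving loop that follows: for a v4i8 build_vector of
// the constant 0x01, the gathered 32-bit value is 0x01010101; its 16-bit
// halves (0x0101 and 0x0101) match, their 8-bit halves (0x01) match again,
// and the loop stops at a width of 8, reporting SplatBitSize = 8 with
// SplatValue = 0x01.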
- HasAnyUndefs = (SplatUndef != 0); - while (sz > 8) { - unsigned HalfSize = sz / 2; + // FIXME: This does not work for vectors with elements less than 8 bits. + while (VecWidth > 8) { + unsigned HalfSize = VecWidth / 2; APInt HighValue = SplatValue.lshr(HalfSize).trunc(HalfSize); APInt LowValue = SplatValue.trunc(HalfSize); APInt HighUndef = SplatUndef.lshr(HalfSize).trunc(HalfSize); @@ -7688,10 +7689,10 @@ bool BuildVectorSDNode::isConstantSplat(APInt &SplatValue, SplatValue = HighValue | LowValue; SplatUndef = HighUndef & LowUndef; - sz = HalfSize; + VecWidth = HalfSize; } - SplatBitSize = sz; + SplatBitSize = VecWidth; return true; } diff --git a/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp b/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp index 2c58953..6a737ed 100644 --- a/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp +++ b/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp @@ -362,11 +362,11 @@ static SDValue getCopyFromPartsVector(SelectionDAG &DAG, const SDLoc &DL, return DAG.getUNDEF(ValueVT); } - if (ValueVT.getVectorNumElements() == 1 && - ValueVT.getVectorElementType() != PartEVT) - Val = DAG.getAnyExtOrTrunc(Val, DL, ValueVT.getScalarType()); + EVT ValueSVT = ValueVT.getVectorElementType(); + if (ValueVT.getVectorNumElements() == 1 && ValueSVT != PartEVT) + Val = DAG.getAnyExtOrTrunc(Val, DL, ValueSVT); - return DAG.getNode(ISD::BUILD_VECTOR, DL, ValueVT, Val); + return DAG.getBuildVector(ValueVT, DL, Val); } static void getCopyToPartsVector(SelectionDAG &DAG, const SDLoc &dl, @@ -537,7 +537,7 @@ static void getCopyToPartsVector(SelectionDAG &DAG, const SDLoc &DL, e = PartVT.getVectorNumElements(); i != e; ++i) Ops.push_back(DAG.getUNDEF(ElementVT)); - Val = DAG.getNode(ISD::BUILD_VECTOR, DL, PartVT, Ops); + Val = DAG.getBuildVector(PartVT, DL, Ops); // FIXME: Use CONCAT for 2x -> 4x. @@ -1088,8 +1088,7 @@ SDValue SelectionDAGBuilder::getValueImpl(const Value *V) { if (isa(CDS->getType())) return DAG.getMergeValues(Ops, getCurSDLoc()); - return NodeMap[V] = DAG.getNode(ISD::BUILD_VECTOR, getCurSDLoc(), - VT, Ops); + return NodeMap[V] = DAG.getBuildVector(VT, getCurSDLoc(), Ops); } if (C->getType()->isStructTy() || C->getType()->isArrayTy()) { @@ -1141,7 +1140,7 @@ SDValue SelectionDAGBuilder::getValueImpl(const Value *V) { } // Create a BUILD_VECTOR node. 
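// The recurring rewrite in the hunks around here swaps the long-hand node
// creation for the equivalent convenience helper; note that the VT and DL
// arguments trade places between the two spellings. Sketch of the
// equivalence, for illustration:
//   SDValue V1 = DAG.getNode(ISD::BUILD_VECTOR, DL, VT, Ops);
//   SDValue V2 = DAG.getBuildVector(VT, DL, Ops); // same node, shorter form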
- return NodeMap[V] = DAG.getNode(ISD::BUILD_VECTOR, getCurSDLoc(), VT, Ops); + return NodeMap[V] = DAG.getBuildVector(VT, getCurSDLoc(), Ops); } // If this is a static alloca, generate it as the frameindex instead of @@ -3147,7 +3146,7 @@ void SelectionDAGBuilder::visitShuffleVector(const User &I) { Ops.push_back(Res); } - setValue(&I, DAG.getNode(ISD::BUILD_VECTOR, DL, VT, Ops)); + setValue(&I, DAG.getBuildVector(VT, DL, Ops)); } void SelectionDAGBuilder::visitInsertValue(const InsertValueInst &I) { @@ -3969,9 +3968,9 @@ void SelectionDAGBuilder::visitFence(const FenceInst &I) { SDValue Ops[3]; Ops[0] = getRoot(); Ops[1] = DAG.getConstant((unsigned)I.getOrdering(), dl, - TLI.getPointerTy(DAG.getDataLayout())); + TLI.getFenceOperandTy(DAG.getDataLayout())); Ops[2] = DAG.getConstant(I.getSynchScope(), dl, - TLI.getPointerTy(DAG.getDataLayout())); + TLI.getFenceOperandTy(DAG.getDataLayout())); DAG.setRoot(DAG.getNode(ISD::ATOMIC_FENCE, dl, MVT::Other, Ops)); } @@ -4896,11 +4895,11 @@ SelectionDAGBuilder::visitIntrinsicCall(const CallInst &I, unsigned Intrinsic) { Entry.Node = Src; Args.push_back(Entry); - + Entry.Ty = I.getArgOperand(2)->getType(); Entry.Node = NumElements; Args.push_back(Entry); - + Entry.Ty = Type::getInt32Ty(*DAG.getContext()); Entry.Node = ElementSize; Args.push_back(Entry); @@ -5183,7 +5182,7 @@ SelectionDAGBuilder::visitIntrinsicCall(const CallInst &I, unsigned Intrinsic) { SDValue ShOps[2]; ShOps[0] = ShAmt; ShOps[1] = DAG.getConstant(0, sdl, MVT::i32); - ShAmt = DAG.getNode(ISD::BUILD_VECTOR, sdl, ShAmtVT, ShOps); + ShAmt = DAG.getBuildVector(ShAmtVT, sdl, ShOps); EVT DestVT = TLI.getValueType(DAG.getDataLayout(), I.getType()); ShAmt = DAG.getNode(ISD::BITCAST, sdl, DestVT, ShAmt); Res = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, sdl, DestVT, @@ -5743,7 +5742,7 @@ void SelectionDAGBuilder::visitConstrainedFPIntrinsic(const CallInst &I, unsigned Opcode; switch (Intrinsic) { default: llvm_unreachable("Impossible intrinsic"); // Can't reach here. - case Intrinsic::experimental_constrained_fadd: + case Intrinsic::experimental_constrained_fadd: Opcode = ISD::STRICT_FADD; break; case Intrinsic::experimental_constrained_fsub: @@ -6653,12 +6652,12 @@ static void GetRegistersForValue(SelectionDAG &DAG, const TargetLowering &TLI, MachineFunction &MF = DAG.getMachineFunction(); SmallVector Regs; + const TargetRegisterInfo &TRI = *MF.getSubtarget().getRegisterInfo(); // If this is a constraint for a single physreg, or a constraint for a // register class, find it. std::pair PhysReg = - TLI.getRegForInlineAsmConstraint(MF.getSubtarget().getRegisterInfo(), - OpInfo.ConstraintCode, + TLI.getRegForInlineAsmConstraint(&TRI, OpInfo.ConstraintCode, OpInfo.ConstraintVT); unsigned NumRegs = 1; @@ -6666,12 +6665,12 @@ static void GetRegistersForValue(SelectionDAG &DAG, const TargetLowering &TLI, // If this is a FP input in an integer register (or visa versa) insert a bit // cast of the input value. More generally, handle any case where the input // value disagrees with the register class we plan to stick this in. - if (OpInfo.Type == InlineAsm::isInput && - PhysReg.second && !PhysReg.second->hasType(OpInfo.ConstraintVT)) { + if (OpInfo.Type == InlineAsm::isInput && PhysReg.second && + !TRI.isTypeLegalForClass(*PhysReg.second, OpInfo.ConstraintVT)) { // Try to convert to the first EVT that the reg class contains. If the // types are identical size, use a bitcast to convert (e.g. two differing // vector types). 
- MVT RegVT = *PhysReg.second->vt_begin(); + MVT RegVT = *TRI.legalclasstypes_begin(*PhysReg.second); if (RegVT.getSizeInBits() == OpInfo.CallOperand.getValueSizeInBits()) { OpInfo.CallOperand = DAG.getNode(ISD::BITCAST, DL, RegVT, OpInfo.CallOperand); @@ -6699,12 +6698,12 @@ static void GetRegistersForValue(SelectionDAG &DAG, const TargetLowering &TLI, if (unsigned AssignedReg = PhysReg.first) { const TargetRegisterClass *RC = PhysReg.second; if (OpInfo.ConstraintVT == MVT::Other) - ValueVT = *RC->vt_begin(); + ValueVT = *TRI.legalclasstypes_begin(*RC); // Get the actual register value type. This is important, because the user // may have asked for (e.g.) the AX register in i32 type. We need to // remember that AX is actually i16 to get the right extension. - RegVT = *RC->vt_begin(); + RegVT = *TRI.legalclasstypes_begin(*RC); // This is a explicit reference to a physical register. Regs.push_back(AssignedReg); @@ -6730,7 +6729,7 @@ static void GetRegistersForValue(SelectionDAG &DAG, const TargetLowering &TLI, // Otherwise, if this was a reference to an LLVM register class, create vregs // for this reference. if (const TargetRegisterClass *RC = PhysReg.second) { - RegVT = *RC->vt_begin(); + RegVT = *TRI.legalclasstypes_begin(*RC); if (OpInfo.ConstraintVT == MVT::Other) ValueVT = RegVT; diff --git a/lib/CodeGen/SelectionDAG/TargetLowering.cpp b/lib/CodeGen/SelectionDAG/TargetLowering.cpp index 93c6738..136dec8 100644 --- a/lib/CodeGen/SelectionDAG/TargetLowering.cpp +++ b/lib/CodeGen/SelectionDAG/TargetLowering.cpp @@ -342,11 +342,16 @@ TargetLowering::isOffsetFoldingLegal(const GlobalAddressSDNode *GA) const { /// If the specified instruction has a constant integer operand and there are /// bits set in that constant that are not demanded, then clear those bits and /// return true. -bool TargetLowering::TargetLoweringOpt::ShrinkDemandedConstant( - SDValue Op, const APInt &Demanded) { +bool TargetLowering::ShrinkDemandedConstant(SDValue Op, const APInt &Demanded, + TargetLoweringOpt &TLO) const { + SelectionDAG &DAG = TLO.DAG; SDLoc DL(Op); unsigned Opcode = Op.getOpcode(); + // Do target-specific constant optimization. + if (targetShrinkDemandedConstant(Op, Demanded, TLO)) + return TLO.New.getNode(); + // FIXME: ISD::SELECT, ISD::SELECT_CC switch (Opcode) { default: @@ -367,7 +372,7 @@ bool TargetLowering::TargetLoweringOpt::ShrinkDemandedConstant( EVT VT = Op.getValueType(); SDValue NewC = DAG.getConstant(Demanded & C, DL, VT); SDValue NewOp = DAG.getNode(Opcode, DL, VT, Op.getOperand(0), NewC); - return CombineTo(Op, NewOp); + return TLO.CombineTo(Op, NewOp); } break; @@ -380,15 +385,17 @@ bool TargetLowering::TargetLoweringOpt::ShrinkDemandedConstant( /// Convert x+y to (VT)((SmallVT)x+(SmallVT)y) if the casts are free. /// This uses isZExtFree and ZERO_EXTEND for the widening cast, but it could be /// generalized for targets with other types of implicit widening casts. -bool TargetLowering::TargetLoweringOpt::ShrinkDemandedOp(SDValue Op, - unsigned BitWidth, - const APInt &Demanded, - const SDLoc &dl) { +bool TargetLowering::ShrinkDemandedOp(SDValue Op, unsigned BitWidth, + const APInt &Demanded, + TargetLoweringOpt &TLO) const { assert(Op.getNumOperands() == 2 && "ShrinkDemandedOp only supports binary operators!"); assert(Op.getNode()->getNumValues() == 1 && "ShrinkDemandedOp only supports nodes with one result!"); + SelectionDAG &DAG = TLO.DAG; + SDLoc dl(Op); + // Early return, as this function cannot handle vector types. 
if (Op.getValueType().isVector()) return false; @@ -418,23 +425,22 @@ bool TargetLowering::TargetLoweringOpt::ShrinkDemandedOp(SDValue Op, bool NeedZext = DemandedSize > SmallVTBits; SDValue Z = DAG.getNode(NeedZext ? ISD::ZERO_EXTEND : ISD::ANY_EXTEND, dl, Op.getValueType(), X); - return CombineTo(Op, Z); + return TLO.CombineTo(Op, Z); } } return false; } bool -TargetLowering::TargetLoweringOpt::SimplifyDemandedBits(SDNode *User, - unsigned OpIdx, - const APInt &Demanded, - DAGCombinerInfo &DCI) { - const TargetLowering &TLI = DAG.getTargetLoweringInfo(); +TargetLowering::SimplifyDemandedBits(SDNode *User, unsigned OpIdx, + const APInt &Demanded, + DAGCombinerInfo &DCI, + TargetLoweringOpt &TLO) const { SDValue Op = User->getOperand(OpIdx); APInt KnownZero, KnownOne; - if (!TLI.SimplifyDemandedBits(Op, Demanded, KnownZero, KnownOne, - *this, 0, true)) + if (!SimplifyDemandedBits(Op, Demanded, KnownZero, KnownOne, + TLO, 0, true)) return false; @@ -446,9 +452,9 @@ TargetLowering::TargetLoweringOpt::SimplifyDemandedBits(SDNode *User, // with the value 'x', which will give us: // Old = i32 and x, 0xffffff // New = x - if (Old.hasOneUse()) { + if (TLO.Old.hasOneUse()) { // For the one use case, we just commit the change. - DCI.CommitTargetLoweringOpt(*this); + DCI.CommitTargetLoweringOpt(TLO); return true; } @@ -456,17 +462,17 @@ TargetLowering::TargetLoweringOpt::SimplifyDemandedBits(SDNode *User, // AssumeSingleUse flag is not propogated to recursive calls of // SimplifyDemanded bits, so the only node with multiple use that // it will attempt to combine will be opt. - assert(Old == Op); + assert(TLO.Old == Op); SmallVector NewOps; for (unsigned i = 0, e = User->getNumOperands(); i != e; ++i) { if (i == OpIdx) { - NewOps.push_back(New); + NewOps.push_back(TLO.New); continue; } NewOps.push_back(User->getOperand(i)); } - DAG.UpdateNodeOperands(User, NewOps); + TLO.DAG.UpdateNodeOperands(User, NewOps); // Op has less users now, so we may be able to perform additional combines // with it. DCI.AddToWorklist(Op.getNode()); @@ -585,7 +591,7 @@ bool TargetLowering::SimplifyDemandedBits(SDValue Op, // If any of the set bits in the RHS are known zero on the LHS, shrink // the constant. - if (TLO.ShrinkDemandedConstant(Op, ~LHSZero & NewMask)) + if (ShrinkDemandedConstant(Op, ~LHSZero & NewMask, TLO)) return true; // Bitwise-not (xor X, -1) is a special case: we don't usually shrink its @@ -620,10 +626,10 @@ bool TargetLowering::SimplifyDemandedBits(SDValue Op, if ((NewMask & (KnownZero|KnownZero2)) == NewMask) return TLO.CombineTo(Op, TLO.DAG.getConstant(0, dl, Op.getValueType())); // If the RHS is a constant, see if we can simplify it. - if (TLO.ShrinkDemandedConstant(Op, ~KnownZero2 & NewMask)) + if (ShrinkDemandedConstant(Op, ~KnownZero2 & NewMask, TLO)) return true; // If the operation can be done in a smaller type, do so. - if (TLO.ShrinkDemandedOp(Op, BitWidth, NewMask, dl)) + if (ShrinkDemandedOp(Op, BitWidth, NewMask, TLO)) return true; // Output known-1 bits are only known if set in both the LHS & RHS. @@ -654,10 +660,10 @@ bool TargetLowering::SimplifyDemandedBits(SDValue Op, if ((NewMask & ~KnownZero2 & KnownOne) == (~KnownZero2 & NewMask)) return TLO.CombineTo(Op, Op.getOperand(1)); // If the RHS is a constant, see if we can simplify it. - if (TLO.ShrinkDemandedConstant(Op, NewMask)) + if (ShrinkDemandedConstant(Op, NewMask, TLO)) return true; // If the operation can be done in a smaller type, do so. 
- if (TLO.ShrinkDemandedOp(Op, BitWidth, NewMask, dl)) + if (ShrinkDemandedOp(Op, BitWidth, NewMask, TLO)) return true; // Output known-0 bits are only known if clear in both the LHS & RHS. @@ -682,7 +688,7 @@ bool TargetLowering::SimplifyDemandedBits(SDValue Op, if ((KnownZero2 & NewMask) == NewMask) return TLO.CombineTo(Op, Op.getOperand(1)); // If the operation can be done in a smaller type, do so. - if (TLO.ShrinkDemandedOp(Op, BitWidth, NewMask, dl)) + if (ShrinkDemandedOp(Op, BitWidth, NewMask, TLO)) return true; // If all of the unknown bits are known to be zero on one side or the other @@ -727,7 +733,7 @@ bool TargetLowering::SimplifyDemandedBits(SDValue Op, } // If it already has all the bits set, nothing to change // but don't shrink either! - } else if (TLO.ShrinkDemandedConstant(Op, NewMask)) { + } else if (ShrinkDemandedConstant(Op, NewMask, TLO)) { return true; } } @@ -746,7 +752,7 @@ bool TargetLowering::SimplifyDemandedBits(SDValue Op, assert((KnownZero2 & KnownOne2) == 0 && "Bits known to be one AND zero?"); // If the operands are constants, see if we can simplify them. - if (TLO.ShrinkDemandedConstant(Op, NewMask)) + if (ShrinkDemandedConstant(Op, NewMask, TLO)) return true; // Only known if known in both the LHS and RHS. @@ -764,7 +770,7 @@ bool TargetLowering::SimplifyDemandedBits(SDValue Op, assert((KnownZero2 & KnownOne2) == 0 && "Bits known to be one AND zero?"); // If the operands are constants, see if we can simplify them. - if (TLO.ShrinkDemandedConstant(Op, NewMask)) + if (ShrinkDemandedConstant(Op, NewMask, TLO)) return true; // Only known if known in both the LHS and RHS. @@ -1284,7 +1290,7 @@ bool TargetLowering::SimplifyDemandedBits(SDValue Op, SimplifyDemandedBits(Op.getOperand(1), LoMask, KnownZero2, KnownOne2, TLO, Depth+1) || // See if the operation should be performed at a smaller bit width. - TLO.ShrinkDemandedOp(Op, BitWidth, NewMask, dl)) { + ShrinkDemandedOp(Op, BitWidth, NewMask, TLO)) { const SDNodeFlags *Flags = Op.getNode()->getFlags(); if (Flags->hasNoSignedWrap() || Flags->hasNoUnsignedWrap()) { // Disable the nsw and nuw flags. We can no longer guarantee that we @@ -1358,31 +1364,38 @@ unsigned TargetLowering::ComputeNumSignBitsForTargetNode(SDValue Op, return 1; } +// FIXME: Ideally, this would use ISD::isConstantSplatVector(), but that must +// work with truncating build vectors and vectors with elements of less than +// 8 bits. bool TargetLowering::isConstTrueVal(const SDNode *N) const { if (!N) return false; - const ConstantSDNode *CN = dyn_cast(N); - if (!CN) { - const BuildVectorSDNode *BV = dyn_cast(N); - if (!BV) - return false; - - // Only interested in constant splats, we don't care about undef - // elements in identifying boolean constants and getConstantSplatNode - // returns NULL if all ops are undef; - CN = BV->getConstantSplatNode(); + APInt CVal; + if (auto *CN = dyn_cast(N)) { + CVal = CN->getAPIntValue(); + } else if (auto *BV = dyn_cast(N)) { + auto *CN = BV->getConstantSplatNode(); if (!CN) return false; + + // If this is a truncating build vector, truncate the splat value. + // Otherwise, we may fail to match the expected values below. 
+ unsigned BVEltWidth = BV->getValueType(0).getScalarSizeInBits(); + CVal = CN->getAPIntValue(); + if (BVEltWidth < CVal.getBitWidth()) + CVal = CVal.trunc(BVEltWidth); + } else { + return false; } switch (getBooleanContents(N->getValueType(0))) { case UndefinedBooleanContent: - return CN->getAPIntValue()[0]; + return CVal[0]; case ZeroOrOneBooleanContent: - return CN->isOne(); + return CVal == 1; case ZeroOrNegativeOneBooleanContent: - return CN->isAllOnesValue(); + return CVal.isAllOnesValue(); } llvm_unreachable("Invalid boolean contents"); @@ -2535,7 +2548,7 @@ TargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *RI, for (const TargetRegisterClass *RC : RI->regclasses()) { // If none of the value types for this register class are valid, we // can't use it. For example, 64-bit reg classes on 32-bit targets. - if (!isLegalRC(RC)) + if (!isLegalRC(*RI, *RC)) continue; for (TargetRegisterClass::iterator I = RC->begin(), E = RC->end(); @@ -2547,9 +2560,9 @@ TargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *RI, // If this register class has the requested value type, return it, // otherwise keep searching and return the first class found // if no other is found which explicitly has the requested type. - if (RC->hasType(VT)) + if (RI->isTypeLegalForClass(*RC, VT)) return S; - else if (!R.second) + if (!R.second) R = S; } } diff --git a/lib/CodeGen/StackMaps.cpp b/lib/CodeGen/StackMaps.cpp index 1a8ec5b..315b059 100644 --- a/lib/CodeGen/StackMaps.cpp +++ b/lib/CodeGen/StackMaps.cpp @@ -161,7 +161,8 @@ StackMaps::parseOperand(MachineInstr::const_mop_iterator MOI, if (SubRegIdx) Offset = TRI->getSubRegIdxOffset(SubRegIdx); - Locs.emplace_back(Location::Register, RC->getSize(), DwarfRegNum, Offset); + Locs.emplace_back(Location::Register, TRI->getSpillSize(*RC), + DwarfRegNum, Offset); return ++MOI; } @@ -245,7 +246,7 @@ void StackMaps::print(raw_ostream &OS) { StackMaps::LiveOutReg StackMaps::createLiveOutReg(unsigned Reg, const TargetRegisterInfo *TRI) const { unsigned DwarfRegNum = getDwarfRegNum(Reg, TRI); - unsigned Size = TRI->getMinimalPhysRegClass(Reg)->getSize(); + unsigned Size = TRI->getSpillSize(*TRI->getMinimalPhysRegClass(Reg)); return LiveOutReg(Reg, DwarfRegNum, Size); } diff --git a/lib/CodeGen/TargetInstrInfo.cpp b/lib/CodeGen/TargetInstrInfo.cpp index 711144a..14c5adc 100644 --- a/lib/CodeGen/TargetInstrInfo.cpp +++ b/lib/CodeGen/TargetInstrInfo.cpp @@ -345,12 +345,12 @@ bool TargetInstrInfo::getStackSlotRange(const TargetRegisterClass *RC, unsigned SubIdx, unsigned &Size, unsigned &Offset, const MachineFunction &MF) const { + const TargetRegisterInfo *TRI = MF.getSubtarget().getRegisterInfo(); if (!SubIdx) { - Size = RC->getSize(); + Size = TRI->getSpillSize(*RC); Offset = 0; return true; } - const TargetRegisterInfo *TRI = MF.getSubtarget().getRegisterInfo(); unsigned BitSize = TRI->getSubRegIdxSize(SubIdx); // Convert bit size to byte size to be consistent with // MCRegisterClass::getSize(). 
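For downstream code that still calls the removed TargetRegisterClass queries (RC->getSize(), RC->vt_begin(), RC->hasType(VT)), a minimal sketch of the equivalent TargetRegisterInfo-based calls used throughout the hunks above; the helper names below are illustrative and not from the tree:

    // Illustrative helpers only; assumes the header locations at the time of
    // this import and the TRI-based queries introduced by these hunks.
    #include "llvm/CodeGen/MachineValueType.h"
    #include "llvm/Target/TargetRegisterInfo.h"

    // Spill size in bytes: was RC.getSize().
    unsigned spillSizeInBytes(const llvm::TargetRegisterInfo &TRI,
                              const llvm::TargetRegisterClass &RC) {
      return TRI.getSpillSize(RC);
    }

    // First legal value type of a register class: was *RC.vt_begin().
    llvm::MVT firstLegalType(const llvm::TargetRegisterInfo &TRI,
                             const llvm::TargetRegisterClass &RC) {
      return *TRI.legalclasstypes_begin(RC);
    }

    // Type legality check for a register class: was RC.hasType(VT).
    bool classHasType(const llvm::TargetRegisterInfo &TRI,
                      const llvm::TargetRegisterClass &RC, llvm::MVT VT) {
      return TRI.isTypeLegalForClass(RC, VT);
    }
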
@@ -364,10 +364,10 @@ bool TargetInstrInfo::getStackSlotRange(const TargetRegisterClass *RC, Size = BitSize /= 8; Offset = (unsigned)BitOffset / 8; - assert(RC->getSize() >= (Offset + Size) && "bad subregister range"); + assert(TRI->getSpillSize(*RC) >= (Offset + Size) && "bad subregister range"); if (!MF.getDataLayout().isLittleEndian()) { - Offset = RC->getSize() - (Offset + Size); + Offset = TRI->getSpillSize(*RC) - (Offset + Size); } return true; } @@ -428,8 +428,8 @@ static const TargetRegisterClass *canFoldCopy(const MachineInstr &MI, return nullptr; } -void TargetInstrInfo::getNoopForMachoTarget(MCInst &NopInst) const { - llvm_unreachable("Not a MachO target"); +void TargetInstrInfo::getNoop(MCInst &NopInst) const { + llvm_unreachable("Not implemented"); } static MachineInstr *foldPatchpoint(MachineFunction &MF, MachineInstr &MI, diff --git a/lib/CodeGen/TargetLoweringBase.cpp b/lib/CodeGen/TargetLoweringBase.cpp index 27630a3..e579922 100644 --- a/lib/CodeGen/TargetLoweringBase.cpp +++ b/lib/CodeGen/TargetLoweringBase.cpp @@ -1184,12 +1184,11 @@ static unsigned getVectorTypeBreakdownMVT(MVT VT, MVT &IntermediateVT, /// isLegalRC - Return true if the value types that can be represented by the /// specified register class are all legal. -bool TargetLoweringBase::isLegalRC(const TargetRegisterClass *RC) const { - for (TargetRegisterClass::vt_iterator I = RC->vt_begin(), E = RC->vt_end(); - I != E; ++I) { +bool TargetLoweringBase::isLegalRC(const TargetRegisterInfo &TRI, + const TargetRegisterClass &RC) const { + for (auto I = TRI.legalclasstypes_begin(RC); *I != MVT::Other; ++I) if (isTypeLegal(*I)) return true; - } return false; } @@ -1299,9 +1298,9 @@ TargetLoweringBase::findRepresentativeClass(const TargetRegisterInfo *TRI, for (int i = SuperRegRC.find_first(); i >= 0; i = SuperRegRC.find_next(i)) { const TargetRegisterClass *SuperRC = TRI->getRegClass(i); // We want the largest possible spill size. - if (SuperRC->getSize() <= BestRC->getSize()) + if (TRI->getSpillSize(*SuperRC) <= TRI->getSpillSize(*BestRC)) continue; - if (!isLegalRC(SuperRC)) + if (!isLegalRC(*TRI, *SuperRC)) continue; BestRC = SuperRC; } diff --git a/lib/CodeGen/TargetRegisterInfo.cpp b/lib/CodeGen/TargetRegisterInfo.cpp index 66cdad2..f6e4c17 100644 --- a/lib/CodeGen/TargetRegisterInfo.cpp +++ b/lib/CodeGen/TargetRegisterInfo.cpp @@ -156,8 +156,8 @@ TargetRegisterInfo::getMinimalPhysRegClass(unsigned reg, MVT VT) const { // this physreg. 
const TargetRegisterClass* BestRC = nullptr; for (const TargetRegisterClass* RC : regclasses()) { - if ((VT == MVT::Other || RC->hasType(VT)) && RC->contains(reg) && - (!BestRC || BestRC->hasSubClass(RC))) + if ((VT == MVT::Other || isTypeLegalForClass(*RC, VT)) && + RC->contains(reg) && (!BestRC || BestRC->hasSubClass(RC))) BestRC = RC; } @@ -207,7 +207,7 @@ const TargetRegisterClass *firstCommonClass(const uint32_t *A, if (unsigned Common = *A++ & *B++) { const TargetRegisterClass *RC = TRI->getRegClass(I + countTrailingZeros(Common)); - if (SVT == MVT::SimpleValueType::Any || RC->hasType(VT)) + if (SVT == MVT::SimpleValueType::Any || TRI->isTypeLegalForClass(*RC, VT)) return RC; } return nullptr; @@ -265,7 +265,7 @@ getCommonSuperRegClass(const TargetRegisterClass *RCA, unsigned SubA, const TargetRegisterClass *BestRC = nullptr; unsigned *BestPreA = &PreA; unsigned *BestPreB = &PreB; - if (RCA->getSize() < RCB->getSize()) { + if (getRegSizeInBits(*RCA) < getRegSizeInBits(*RCB)) { std::swap(RCA, RCB); std::swap(SubA, SubB); std::swap(BestPreA, BestPreB); @@ -273,7 +273,7 @@ getCommonSuperRegClass(const TargetRegisterClass *RCA, unsigned SubA, // Also terminate the search one we have found a register class as small as // RCA. - unsigned MinSize = RCA->getSize(); + unsigned MinSize = getRegSizeInBits(*RCA); for (SuperRegClassIterator IA(RCA, this, true); IA.isValid(); ++IA) { unsigned FinalA = composeSubRegIndices(IA.getSubReg(), SubA); @@ -281,7 +281,7 @@ getCommonSuperRegClass(const TargetRegisterClass *RCA, unsigned SubA, // Check if a common super-register class exists for this index pair. const TargetRegisterClass *RC = firstCommonClass(IA.getMask(), IB.getMask(), this); - if (!RC || RC->getSize() < MinSize) + if (!RC || getRegSizeInBits(*RC) < MinSize) continue; // The indexes must compose identically: PreA+SubA == PreB+SubB. @@ -290,7 +290,7 @@ getCommonSuperRegClass(const TargetRegisterClass *RCA, unsigned SubA, continue; // Is RC a better candidate than BestRC? - if (BestRC && RC->getSize() >= BestRC->getSize()) + if (BestRC && getRegSizeInBits(*RC) >= getRegSizeInBits(*BestRC)) continue; // Yes, RC is the smallest super-register seen so far. @@ -299,7 +299,7 @@ getCommonSuperRegClass(const TargetRegisterClass *RCA, unsigned SubA, *BestPreB = IB.getSubReg(); // Bail early if we reached MinSize. We won't find a better candidate. 
- if (BestRC->getSize() == MinSize) + if (getRegSizeInBits(*BestRC) == MinSize) return BestRC; } } diff --git a/lib/CodeGen/VirtRegMap.cpp b/lib/CodeGen/VirtRegMap.cpp index c894601..d10ca1a 100644 --- a/lib/CodeGen/VirtRegMap.cpp +++ b/lib/CodeGen/VirtRegMap.cpp @@ -73,8 +73,9 @@ void VirtRegMap::grow() { } unsigned VirtRegMap::createSpillSlot(const TargetRegisterClass *RC) { - int SS = MF->getFrameInfo().CreateSpillStackObject(RC->getSize(), - RC->getAlignment()); + unsigned Size = TRI->getSpillSize(*RC); + unsigned Align = TRI->getSpillAlignment(*RC); + int SS = MF->getFrameInfo().CreateSpillStackObject(Size, Align); ++NumSpillSlots; return SS; } diff --git a/lib/DebugInfo/DWARF/DWARFAcceleratorTable.cpp b/lib/DebugInfo/DWARF/DWARFAcceleratorTable.cpp index 85e1eae..a12f8ad 100644 --- a/lib/DebugInfo/DWARF/DWARFAcceleratorTable.cpp +++ b/lib/DebugInfo/DWARF/DWARFAcceleratorTable.cpp @@ -9,6 +9,7 @@ #include "llvm/ADT/SmallVector.h" #include "llvm/DebugInfo/DWARF/DWARFAcceleratorTable.h" +#include "llvm/DebugInfo/DWARF/DWARFContext.h" #include "llvm/DebugInfo/DWARF/DWARFFormValue.h" #include "llvm/DebugInfo/DWARF/DWARFRelocMap.h" #include "llvm/Support/Dwarf.h" @@ -112,10 +113,8 @@ LLVM_DUMP_METHOD void DWARFAcceleratorTable::dump(raw_ostream &OS) const { continue; } while (AccelSection.isValidOffsetForDataOfSize(DataOffset, 4)) { - unsigned StringOffset = AccelSection.getU32(&DataOffset); - RelocAddrMap::const_iterator Reloc = Relocs.find(DataOffset-4); - if (Reloc != Relocs.end()) - StringOffset += Reloc->second.second; + unsigned StringOffset = + getRelocatedValue(AccelSection, 4, &DataOffset, &Relocs); if (!StringOffset) break; OS << format(" Name: %08x \"%s\"\n", StringOffset, diff --git a/lib/DebugInfo/DWARF/DWARFContext.cpp b/lib/DebugInfo/DWARF/DWARFContext.cpp index bbb19b5..7e8d046 100644 --- a/lib/DebugInfo/DWARF/DWARFContext.cpp +++ b/lib/DebugInfo/DWARF/DWARFContext.cpp @@ -56,6 +56,16 @@ typedef DWARFDebugLine::LineTable DWARFLineTable; typedef DILineInfoSpecifier::FileLineInfoKind FileLineInfoKind; typedef DILineInfoSpecifier::FunctionNameKind FunctionNameKind; +uint64_t llvm::getRelocatedValue(const DataExtractor &Data, uint32_t Size, + uint32_t *Off, const RelocAddrMap *Relocs) { + if (!Relocs) + return Data.getUnsigned(Off, Size); + RelocAddrMap::const_iterator AI = Relocs->find(*Off); + if (AI == Relocs->end()) + return Data.getUnsigned(Off, Size); + return Data.getUnsigned(Off, Size) + AI->second.second; +} + static void dumpAccelSection(raw_ostream &OS, StringRef Name, const DWARFSection& Section, StringRef StringSection, bool LittleEndian) { @@ -212,11 +222,11 @@ void DWARFContext::dump(raw_ostream &OS, DIDumpType DumpType, bool DumpEH, // sizes, but for simplicity we just use the address byte size of the last // compile unit (there is no easy and fast way to associate address range // list and the compile unit it describes). - DataExtractor rangesData(getRangeSection(), isLittleEndian(), + DataExtractor rangesData(getRangeSection().Data, isLittleEndian(), savedAddressByteSize); offset = 0; DWARFDebugRangeList rangeList; - while (rangeList.extract(rangesData, &offset)) + while (rangeList.extract(rangesData, &offset, getRangeSection().Relocs)) rangeList.dump(OS); } @@ -722,7 +732,7 @@ DWARFContextInMemory::DWARFContextInMemory(const object::ObjectFile &Obj, *SectionData = data; if (name == "debug_ranges") { // FIXME: Use the other dwo range section when we emit it. 
- RangeDWOSection = data; + RangeDWOSection.Data = data; } } else if (name == "debug_types") { // Find debug_types data by section rather than name as there are @@ -763,6 +773,7 @@ DWARFContextInMemory::DWARFContextInMemory(const object::ObjectFile &Obj, .Case("debug_loc", &LocSection.Relocs) .Case("debug_info.dwo", &InfoDWOSection.Relocs) .Case("debug_line", &LineSection.Relocs) + .Case("debug_ranges", &RangeSection.Relocs) .Case("apple_names", &AppleNamesSection.Relocs) .Case("apple_types", &AppleTypesSection.Relocs) .Case("apple_namespaces", &AppleNamespacesSection.Relocs) @@ -845,7 +856,7 @@ StringRef *DWARFContextInMemory::MapSectionToMember(StringRef Name) { .Case("debug_frame", &DebugFrameSection) .Case("eh_frame", &EHFrameSection) .Case("debug_str", &StringSection) - .Case("debug_ranges", &RangeSection) + .Case("debug_ranges", &RangeSection.Data) .Case("debug_macinfo", &MacinfoSection) .Case("debug_pubnames", &PubNamesSection) .Case("debug_pubtypes", &PubTypesSection) diff --git a/lib/DebugInfo/DWARF/DWARFDebugLine.cpp b/lib/DebugInfo/DWARF/DWARFDebugLine.cpp index e467051..ff6ed9c 100644 --- a/lib/DebugInfo/DWARF/DWARFDebugLine.cpp +++ b/lib/DebugInfo/DWARF/DWARFDebugLine.cpp @@ -8,6 +8,7 @@ //===----------------------------------------------------------------------===// #include "llvm/ADT/SmallString.h" +#include "llvm/DebugInfo/DWARF/DWARFContext.h" #include "llvm/DebugInfo/DWARF/DWARFDebugLine.h" #include "llvm/DebugInfo/DWARF/DWARFRelocMap.h" #include "llvm/Support/Dwarf.h" @@ -302,16 +303,9 @@ bool DWARFDebugLine::LineTable::parse(DataExtractor debug_line_data, // relocatable address. All of the other statement program opcodes // that affect the address register add a delta to it. This instruction // stores a relocatable value into it instead. - { - // If this address is in our relocation map, apply the relocation. - RelocAddrMap::const_iterator AI = RMap->find(*offset_ptr); - if (AI != RMap->end()) { - const std::pair &R = AI->second; - State.Row.Address = - debug_line_data.getAddress(offset_ptr) + R.second; - } else - State.Row.Address = debug_line_data.getAddress(offset_ptr); - } + State.Row.Address = + getRelocatedValue(debug_line_data, debug_line_data.getAddressSize(), + offset_ptr, RMap); break; case DW_LNE_define_file: diff --git a/lib/DebugInfo/DWARF/DWARFDebugLoc.cpp b/lib/DebugInfo/DWARF/DWARFDebugLoc.cpp index e2799ab..d5c3421 100644 --- a/lib/DebugInfo/DWARF/DWARFDebugLoc.cpp +++ b/lib/DebugInfo/DWARF/DWARFDebugLoc.cpp @@ -8,6 +8,7 @@ //===----------------------------------------------------------------------===// #include "llvm/ADT/StringRef.h" +#include "llvm/DebugInfo/DWARF/DWARFContext.h" #include "llvm/DebugInfo/DWARF/DWARFDebugLoc.h" #include "llvm/DebugInfo/DWARF/DWARFRelocMap.h" #include "llvm/Support/Dwarf.h" @@ -48,18 +49,10 @@ void DWARFDebugLoc::parse(DataExtractor data, unsigned AddressSize) { // 2.6.2 Location Lists // A location list entry consists of: while (true) { + // A beginning and ending address offsets. Entry E; - RelocAddrMap::const_iterator AI = RelocMap.find(Offset); - // 1. A beginning address offset. ... - E.Begin = data.getUnsigned(&Offset, AddressSize); - if (AI != RelocMap.end()) - E.Begin += AI->second.second; - - AI = RelocMap.find(Offset); - // 2. An ending address offset. ... 
- E.End = data.getUnsigned(&Offset, AddressSize); - if (AI != RelocMap.end()) - E.End += AI->second.second; + E.Begin = getRelocatedValue(data, AddressSize, &Offset, &RelocMap); + E.End = getRelocatedValue(data, AddressSize, &Offset, &RelocMap); // The end of any given location list is marked by an end of list entry, // which consists of a 0 for the beginning address offset and a 0 for the diff --git a/lib/DebugInfo/DWARF/DWARFDebugRangeList.cpp b/lib/DebugInfo/DWARF/DWARFDebugRangeList.cpp index f1d82fd..9380fe8 100644 --- a/lib/DebugInfo/DWARF/DWARFDebugRangeList.cpp +++ b/lib/DebugInfo/DWARF/DWARFDebugRangeList.cpp @@ -7,6 +7,7 @@ // //===----------------------------------------------------------------------===// +#include "llvm/DebugInfo/DWARF/DWARFContext.h" #include "llvm/DebugInfo/DWARF/DWARFDebugRangeList.h" #include "llvm/Support/Format.h" #include "llvm/Support/raw_ostream.h" @@ -22,7 +23,8 @@ void DWARFDebugRangeList::clear() { Entries.clear(); } -bool DWARFDebugRangeList::extract(DataExtractor data, uint32_t *offset_ptr) { +bool DWARFDebugRangeList::extract(DataExtractor data, uint32_t *offset_ptr, + const RelocAddrMap &Relocs) { clear(); if (!data.isValidOffset(*offset_ptr)) return false; @@ -33,8 +35,11 @@ bool DWARFDebugRangeList::extract(DataExtractor data, uint32_t *offset_ptr) { while (true) { RangeListEntry entry; uint32_t prev_offset = *offset_ptr; - entry.StartAddress = data.getAddress(offset_ptr); - entry.EndAddress = data.getAddress(offset_ptr); + entry.StartAddress = + getRelocatedValue(data, AddressSize, offset_ptr, &Relocs); + entry.EndAddress = + getRelocatedValue(data, AddressSize, offset_ptr, &Relocs); + // Check that both values were extracted correctly. if (*offset_ptr != prev_offset + 2 * AddressSize) { clear(); diff --git a/lib/DebugInfo/DWARF/DWARFFormValue.cpp b/lib/DebugInfo/DWARF/DWARFFormValue.cpp index 6de57b9..28592e4 100644 --- a/lib/DebugInfo/DWARF/DWARFFormValue.cpp +++ b/lib/DebugInfo/DWARF/DWARFFormValue.cpp @@ -334,11 +334,8 @@ bool DWARFFormValue::extractValue(const DataExtractor &data, (Form == DW_FORM_addr) ? U->getAddressByteSize() : U->getRefAddrByteSize(); - RelocAddrMap::const_iterator AI = U->getRelocMap()->find(*offset_ptr); - if (AI != U->getRelocMap()->end()) { - Value.uval = data.getUnsigned(offset_ptr, AddrSize) + AI->second.second; - } else - Value.uval = data.getUnsigned(offset_ptr, AddrSize); + Value.uval = + getRelocatedValue(data, AddrSize, offset_ptr, U->getRelocMap()); break; } case DW_FORM_exprloc: @@ -376,12 +373,8 @@ bool DWARFFormValue::extractValue(const DataExtractor &data, case DW_FORM_ref_sup4: case DW_FORM_strx4: case DW_FORM_addrx4: { - Value.uval = data.getU32(offset_ptr); - if (!U) - break; - RelocAddrMap::const_iterator AI = U->getRelocMap()->find(*offset_ptr-4); - if (AI != U->getRelocMap()->end()) - Value.uval += AI->second.second; + const RelocAddrMap* RelocMap = U ? 
U->getRelocMap() : nullptr; + Value.uval = getRelocatedValue(data, 4, offset_ptr, RelocMap); break; } case DW_FORM_data8: @@ -411,11 +404,8 @@ bool DWARFFormValue::extractValue(const DataExtractor &data, case DW_FORM_strp_sup: { if (!U) return false; - RelocAddrMap::const_iterator AI = U->getRelocMap()->find(*offset_ptr); - uint8_t Size = U->getDwarfOffsetByteSize(); - Value.uval = data.getUnsigned(offset_ptr, Size); - if (AI != U->getRelocMap()->end()) - Value.uval += AI->second.second; + Value.uval = getRelocatedValue(data, U->getDwarfOffsetByteSize(), + offset_ptr, U->getRelocMap()); break; } case DW_FORM_flag_present: diff --git a/lib/DebugInfo/DWARF/DWARFUnit.cpp b/lib/DebugInfo/DWARF/DWARFUnit.cpp index c3f4677..f50487f 100644 --- a/lib/DebugInfo/DWARF/DWARFUnit.cpp +++ b/lib/DebugInfo/DWARF/DWARFUnit.cpp @@ -32,7 +32,7 @@ using namespace llvm; using namespace dwarf; void DWARFUnitSectionBase::parse(DWARFContext &C, const DWARFSection &Section) { - parseImpl(C, Section, C.getDebugAbbrev(), C.getRangeSection(), + parseImpl(C, Section, C.getDebugAbbrev(), &C.getRangeSection(), C.getStringSection(), StringRef(), C.getAddrSection(), C.getLineSection().Data, C.isLittleEndian(), false); } @@ -40,16 +40,17 @@ void DWARFUnitSectionBase::parse(DWARFContext &C, const DWARFSection &Section) { void DWARFUnitSectionBase::parseDWO(DWARFContext &C, const DWARFSection &DWOSection, DWARFUnitIndex *Index) { - parseImpl(C, DWOSection, C.getDebugAbbrevDWO(), C.getRangeDWOSection(), + parseImpl(C, DWOSection, C.getDebugAbbrevDWO(), &C.getRangeDWOSection(), C.getStringDWOSection(), C.getStringOffsetDWOSection(), C.getAddrSection(), C.getLineDWOSection().Data, C.isLittleEndian(), true); } DWARFUnit::DWARFUnit(DWARFContext &DC, const DWARFSection &Section, - const DWARFDebugAbbrev *DA, StringRef RS, StringRef SS, - StringRef SOS, StringRef AOS, StringRef LS, bool LE, - bool IsDWO, const DWARFUnitSectionBase &UnitSection, + const DWARFDebugAbbrev *DA, const DWARFSection *RS, + StringRef SS, StringRef SOS, StringRef AOS, StringRef LS, + bool LE, bool IsDWO, + const DWARFUnitSectionBase &UnitSection, const DWARFUnitIndex::Entry *IndexEntry) : Context(DC), InfoSection(Section), Abbrev(DA), RangeSection(RS), LineSection(LS), StringSection(SS), StringOffsetSection([&]() { @@ -142,9 +143,10 @@ bool DWARFUnit::extractRangeList(uint32_t RangeListOffset, DWARFDebugRangeList &RangeList) const { // Require that compile unit is extracted. 
assert(!DieArray.empty()); - DataExtractor RangesData(RangeSection, isLittleEndian, AddrSize); + DataExtractor RangesData(RangeSection->Data, isLittleEndian, AddrSize); uint32_t ActualRangeListOffset = RangeSectionBase + RangeListOffset; - return RangeList.extract(RangesData, &ActualRangeListOffset); + return RangeList.extract(RangesData, &ActualRangeListOffset, + RangeSection->Relocs); } void DWARFUnit::clear() { diff --git a/lib/DebugInfo/PDB/DIA/DIARawSymbol.cpp b/lib/DebugInfo/PDB/DIA/DIARawSymbol.cpp index 5e8c0bd..4e2474c 100644 --- a/lib/DebugInfo/PDB/DIA/DIARawSymbol.cpp +++ b/lib/DebugInfo/PDB/DIA/DIARawSymbol.cpp @@ -14,6 +14,7 @@ #include "llvm/DebugInfo/PDB/DIA/DIAEnumSymbols.h" #include "llvm/DebugInfo/PDB/DIA/DIASession.h" #include "llvm/DebugInfo/PDB/PDBExtras.h" +#include "llvm/DebugInfo/PDB/PDBSymbolTypeBuiltin.h" #include "llvm/DebugInfo/PDB/PDBSymbolTypePointer.h" #include "llvm/DebugInfo/PDB/PDBSymbolTypeVTable.h" #include "llvm/DebugInfo/PDB/PDBSymbolTypeVTableShape.h" @@ -720,7 +721,7 @@ uint32_t DIARawSymbol::getVirtualTableShapeId() const { return PrivateGetDIAValue(Symbol, &IDiaSymbol::get_virtualTableShapeId); } -std::unique_ptr +std::unique_ptr DIARawSymbol::getVirtualBaseTableType() const { CComPtr TableType; if (FAILED(Symbol->get_virtualBaseTableType(&TableType)) || !TableType) @@ -729,7 +730,7 @@ DIARawSymbol::getVirtualBaseTableType() const { auto RawVT = llvm::make_unique(Session, TableType); auto Pointer = llvm::make_unique(Session, std::move(RawVT)); - return unique_dyn_cast(Pointer->getPointeeType()); + return unique_dyn_cast(Pointer->getPointeeType()); } PDB_DataKind DIARawSymbol::getDataKind() const { diff --git a/lib/DebugInfo/PDB/DIA/DIASession.cpp b/lib/DebugInfo/PDB/DIA/DIASession.cpp index 6ecf335..ef47b92 100644 --- a/lib/DebugInfo/PDB/DIA/DIASession.cpp +++ b/lib/DebugInfo/PDB/DIA/DIASession.cpp @@ -21,12 +21,22 @@ #include "llvm/DebugInfo/PDB/PDBSymbolExe.h" #include "llvm/Support/ConvertUTF.h" #include "llvm/Support/Format.h" +#include "llvm/Support/FormatVariadic.h" #include "llvm/Support/raw_ostream.h" using namespace llvm; using namespace llvm::pdb; -static Error ErrorFromHResult(HRESULT Result, StringRef Context) { +template +static Error ErrorFromHResult(HRESULT Result, const char *Str, Ts &&... 
Args) { + SmallString<64> MessageStorage; + StringRef Context; + if (sizeof...(Args) > 0) { + MessageStorage = formatv(Str, std::forward(Args)...).str(); + Context = MessageStorage; + } else + Context = Str; + switch (Result) { case E_PDB_NOT_FOUND: return make_error(generic_error_code::invalid_path, Context); @@ -95,8 +105,9 @@ Error DIASession::createFromPdb(StringRef Path, const wchar_t *Path16Str = reinterpret_cast(Path16.data()); HRESULT HR; - if (FAILED(HR = DiaDataSource->loadDataFromPdb(Path16Str))) - return ErrorFromHResult(HR, "Calling loadDataFromPdb"); + if (FAILED(HR = DiaDataSource->loadDataFromPdb(Path16Str))) { + return ErrorFromHResult(HR, "Calling loadDataFromPdb {0}", Path); + } if (FAILED(HR = DiaDataSource->openSession(&DiaSession))) return ErrorFromHResult(HR, "Calling openSession"); diff --git a/lib/DebugInfo/PDB/Native/ModStream.cpp b/lib/DebugInfo/PDB/Native/ModStream.cpp index 08798cf..e87e2c4 100644 --- a/lib/DebugInfo/PDB/Native/ModStream.cpp +++ b/lib/DebugInfo/PDB/Native/ModStream.cpp @@ -82,4 +82,8 @@ ModStream::lines(bool *HadError) const { return make_range(LineInfo.begin(HadError), LineInfo.end()); } +bool ModStream::hasLineInfo() const { + return C13LinesSubstream.getLength() > 0 || LinesSubstream.getLength() > 0; +} + Error ModStream::commit() { return Error::success(); } diff --git a/lib/DebugInfo/PDB/Native/NativeRawSymbol.cpp b/lib/DebugInfo/PDB/Native/NativeRawSymbol.cpp index 3aba35a..70968d4 100644 --- a/lib/DebugInfo/PDB/Native/NativeRawSymbol.cpp +++ b/lib/DebugInfo/PDB/Native/NativeRawSymbol.cpp @@ -13,6 +13,7 @@ #include "llvm/DebugInfo/PDB/IPDBEnumChildren.h" #include "llvm/DebugInfo/PDB/Native/NativeSession.h" #include "llvm/DebugInfo/PDB/PDBExtras.h" +#include "llvm/DebugInfo/PDB/PDBSymbolTypeBuiltin.h" #include "llvm/DebugInfo/PDB/PDBSymbolTypeVTable.h" #include "llvm/DebugInfo/PDB/PDBSymbolTypeVTableShape.h" #include "llvm/Support/ConvertUTF.h" @@ -320,7 +321,7 @@ uint32_t NativeRawSymbol::getVirtualTableShapeId() const { return 0; } -std::unique_ptr +std::unique_ptr NativeRawSymbol::getVirtualBaseTableType() const { return nullptr; } diff --git a/lib/DebugInfo/PDB/UDTLayout.cpp b/lib/DebugInfo/PDB/UDTLayout.cpp index 61cef09..aacefae 100644 --- a/lib/DebugInfo/PDB/UDTLayout.cpp +++ b/lib/DebugInfo/PDB/UDTLayout.cpp @@ -16,6 +16,7 @@ #include "llvm/DebugInfo/PDB/PDBSymbolExe.h" #include "llvm/DebugInfo/PDB/PDBSymbolFunc.h" #include "llvm/DebugInfo/PDB/PDBSymbolTypeBaseClass.h" +#include "llvm/DebugInfo/PDB/PDBSymbolTypeBuiltin.h" #include "llvm/DebugInfo/PDB/PDBSymbolTypePointer.h" #include "llvm/DebugInfo/PDB/PDBSymbolTypeUDT.h" #include "llvm/DebugInfo/PDB/PDBSymbolTypeVTable.h" @@ -39,36 +40,47 @@ static uint32_t getTypeLength(const PDBSymbol &Symbol) { return RawType.getLength(); } -StorageItemBase::StorageItemBase(const UDTLayoutBase &Parent, - const PDBSymbol &Symbol, - const std::string &Name, - uint32_t OffsetInParent, uint32_t Size) - : Parent(Parent), Symbol(Symbol), Name(Name), - OffsetInParent(OffsetInParent), SizeOf(Size) { +LayoutItemBase::LayoutItemBase(const UDTLayoutBase *Parent, + const PDBSymbol *Symbol, const std::string &Name, + uint32_t OffsetInParent, uint32_t Size, + bool IsElided) + : Symbol(Symbol), Parent(Parent), Name(Name), + OffsetInParent(OffsetInParent), SizeOf(Size), LayoutSize(Size), + IsElided(IsElided) { UsedBytes.resize(SizeOf, true); } -uint32_t StorageItemBase::deepPaddingSize() const { - // sizeof(Field) - sizeof(typeof(Field)) is trailing padding. 
- return SizeOf - getTypeLength(Symbol); +uint32_t LayoutItemBase::deepPaddingSize() const { + return UsedBytes.size() - UsedBytes.count(); +} + +uint32_t LayoutItemBase::tailPadding() const { + int Last = UsedBytes.find_last(); + + return UsedBytes.size() - (Last + 1); } DataMemberLayoutItem::DataMemberLayoutItem( - const UDTLayoutBase &Parent, std::unique_ptr DataMember) - : StorageItemBase(Parent, *DataMember, DataMember->getName(), - DataMember->getOffset(), getTypeLength(*DataMember)), - DataMember(std::move(DataMember)) { - auto Type = this->DataMember->getType(); + const UDTLayoutBase &Parent, std::unique_ptr Member) + : LayoutItemBase(&Parent, Member.get(), Member->getName(), + Member->getOffset(), getTypeLength(*Member), false), + DataMember(std::move(Member)) { + auto Type = DataMember->getType(); if (auto UDT = unique_dyn_cast(Type)) { - // UDT data members might have padding in between fields, but otherwise - // a member should occupy its entire storage. - UsedBytes.resize(SizeOf, false); UdtLayout = llvm::make_unique(std::move(UDT)); + UsedBytes = UdtLayout->usedBytes(); } } +VBPtrLayoutItem::VBPtrLayoutItem(const UDTLayoutBase &Parent, + std::unique_ptr Sym, + uint32_t Offset, uint32_t Size) + : LayoutItemBase(&Parent, Sym.get(), "", Offset, Size, false), + Type(std::move(Sym)) { +} + const PDBSymbolData &DataMemberLayoutItem::getDataMember() { - return *dyn_cast(&Symbol); + return *dyn_cast(Symbol); } bool DataMemberLayoutItem::hasUDTLayout() const { return UdtLayout != nullptr; } @@ -77,60 +89,73 @@ const ClassLayout &DataMemberLayoutItem::getUDTLayout() const { return *UdtLayout; } -uint32_t DataMemberLayoutItem::deepPaddingSize() const { - uint32_t Result = StorageItemBase::deepPaddingSize(); - if (UdtLayout) - Result += UdtLayout->deepPaddingSize(); - return Result; -} - VTableLayoutItem::VTableLayoutItem(const UDTLayoutBase &Parent, - std::unique_ptr VTable) - : StorageItemBase(Parent, *VTable, "", 0, getTypeLength(*VTable)), - VTable(std::move(VTable)) { - auto VTableType = cast(this->VTable->getType()); + std::unique_ptr VT) + : LayoutItemBase(&Parent, VT.get(), "", 0, getTypeLength(*VT), false), + VTable(std::move(VT)) { + auto VTableType = cast(VTable->getType()); ElementSize = VTableType->getLength(); +} - Shape = - unique_dyn_cast(VTableType->getPointeeType()); - if (Shape) - VTableFuncs.resize(Shape->getCount()); +UDTLayoutBase::UDTLayoutBase(const UDTLayoutBase *Parent, const PDBSymbol &Sym, + const std::string &Name, uint32_t OffsetInParent, + uint32_t Size, bool IsElided) + : LayoutItemBase(Parent, &Sym, Name, OffsetInParent, Size, IsElided) { + // UDT storage comes from a union of all the children's storage, so start out + // uninitialized. 
+ UsedBytes.reset(0, Size); + + initializeChildren(Sym); + if (LayoutSize < Size) + UsedBytes.resize(LayoutSize); } -UDTLayoutBase::UDTLayoutBase(const PDBSymbol &Symbol, const std::string &Name, - uint32_t Size) - : SymbolBase(Symbol), Name(Name), SizeOf(Size) { - UsedBytes.resize(Size); - ChildrenPerByte.resize(Size); - initializeChildren(Symbol); +uint32_t UDTLayoutBase::tailPadding() const { + uint32_t Abs = LayoutItemBase::tailPadding(); + if (!LayoutItems.empty()) { + const LayoutItemBase *Back = LayoutItems.back(); + uint32_t ChildPadding = Back->LayoutItemBase::tailPadding(); + if (Abs < ChildPadding) + Abs = 0; + else + Abs -= ChildPadding; + } + return Abs; } ClassLayout::ClassLayout(const PDBSymbolTypeUDT &UDT) - : UDTLayoutBase(UDT, UDT.getName(), UDT.getLength()), UDT(UDT) {} + : UDTLayoutBase(nullptr, UDT, UDT.getName(), 0, UDT.getLength(), false), + UDT(UDT) { + ImmediateUsedBytes.resize(SizeOf, false); + for (auto &LI : LayoutItems) { + uint32_t Begin = LI->getOffsetInParent(); + uint32_t End = Begin + LI->getLayoutSize(); + End = std::min(SizeOf, End); + ImmediateUsedBytes.set(Begin, End); + } +} ClassLayout::ClassLayout(std::unique_ptr UDT) : ClassLayout(*UDT) { OwnedStorage = std::move(UDT); } -BaseClassLayout::BaseClassLayout(const UDTLayoutBase &Parent, - std::unique_ptr Base) - : UDTLayoutBase(*Base, Base->getName(), Base->getLength()), - StorageItemBase(Parent, *Base, Base->getName(), Base->getOffset(), - Base->getLength()), - Base(std::move(Base)) { - IsVirtualBase = this->Base->isVirtualBaseClass(); -} - -uint32_t UDTLayoutBase::shallowPaddingSize() const { - return UsedBytes.size() - UsedBytes.count(); +uint32_t ClassLayout::immediatePadding() const { + return SizeOf - ImmediateUsedBytes.count(); } -uint32_t UDTLayoutBase::deepPaddingSize() const { - uint32_t Result = shallowPaddingSize(); - for (auto &Child : ChildStorage) - Result += Child->deepPaddingSize(); - return Result; +BaseClassLayout::BaseClassLayout(const UDTLayoutBase &Parent, + uint32_t OffsetInParent, bool Elide, + std::unique_ptr B) + : UDTLayoutBase(&Parent, *B, B->getName(), OffsetInParent, B->getLength(), + Elide), + Base(std::move(B)) { + if (isEmptyBase()) { + // Special case an empty base so that it doesn't get treated as padding. + UsedBytes.resize(1); + UsedBytes.set(0); + } + IsVirtualBase = Base->isVirtualBaseClass(); } void UDTLayoutBase::initializeChildren(const PDBSymbol &Sym) { @@ -138,15 +163,16 @@ void UDTLayoutBase::initializeChildren(const PDBSymbol &Sym) { // followed by functions, followed by other. This ordering is necessary // so that bases and vtables get initialized before any functions which // may override them. - UniquePtrVector Bases; UniquePtrVector VTables; UniquePtrVector Members; + UniquePtrVector VirtualBaseSyms; + auto Children = Sym.findAllChildren(); while (auto Child = Children->getNext()) { if (auto Base = unique_dyn_cast(Child)) { if (Base->isVirtualBaseClass()) - VirtualBases.push_back(std::move(Base)); + VirtualBaseSyms.push_back(std::move(Base)); else Bases.push_back(std::move(Base)); } @@ -164,20 +190,33 @@ void UDTLayoutBase::initializeChildren(const PDBSymbol &Sym) { Other.push_back(std::move(Child)); } + // We don't want to have any re-allocations in the list of bases, so make + // sure to reserve enough space so that our ArrayRefs don't get invalidated. + AllBases.reserve(Bases.size() + VirtualBaseSyms.size()); + + // Only add non-virtual bases to the class first. 
Only at the end of the + // class, after all non-virtual bases and data members have been added do we + // add virtual bases. This way the offsets are correctly aligned when we go + // to lay out virtual bases. for (auto &Base : Bases) { - auto BL = llvm::make_unique(*this, std::move(Base)); - BaseClasses.push_back(BL.get()); + uint32_t Offset = Base->getOffset(); + // Non-virtual bases never get elided. + auto BL = llvm::make_unique(*this, Offset, false, + std::move(Base)); + AllBases.push_back(BL.get()); addChildToLayout(std::move(BL)); } + NonVirtualBases = AllBases; - for (auto &VT : VTables) { - auto VTLayout = llvm::make_unique(*this, std::move(VT)); + assert(VTables.size() <= 1); + if (!VTables.empty()) { + auto VTLayout = + llvm::make_unique(*this, std::move(VTables[0])); VTable = VTLayout.get(); addChildToLayout(std::move(VTLayout)); - continue; } for (auto &Data : Members) { @@ -186,150 +225,74 @@ void UDTLayoutBase::initializeChildren(const PDBSymbol &Sym) { addChildToLayout(std::move(DM)); } - for (auto &Func : Funcs) { - if (!Func->isVirtual()) - continue; + // Make sure add virtual bases before adding functions, since functions may be + // overrides of virtual functions declared in a virtual base, so the VTables + // and virtual intros need to be correctly initialized. + for (auto &VB : VirtualBaseSyms) { + int VBPO = VB->getVirtualBasePointerOffset(); + if (!hasVBPtrAtOffset(VBPO)) { + if (auto VBP = VB->getRawSymbol().getVirtualBaseTableType()) { + auto VBPL = llvm::make_unique(*this, std::move(VBP), + VBPO, VBP->getLength()); + VBPtr = VBPL.get(); + addChildToLayout(std::move(VBPL)); + } + } - if (Func->isIntroVirtualFunction()) - addVirtualIntro(*Func); - else - addVirtualOverride(*Func); + // Virtual bases always go at the end. So just look for the last place we + // ended when writing something, and put our virtual base there. + // Note that virtual bases get elided unless this is a top-most derived + // class. + uint32_t Offset = UsedBytes.find_last() + 1; + bool Elide = (Parent != nullptr); + auto BL = + llvm::make_unique(*this, Offset, Elide, std::move(VB)); + AllBases.push_back(BL.get()); + + // Only lay this virtual base out directly inside of *this* class if this + // is a top-most derived class. Keep track of it regardless, but only + // physically lay it out if it's a topmost derived class. + addChildToLayout(std::move(BL)); } + VirtualBases = makeArrayRef(AllBases).drop_front(NonVirtualBases.size()); + + if (Parent != nullptr) + LayoutSize = UsedBytes.find_last() + 1; } -void UDTLayoutBase::addVirtualIntro(PDBSymbolFunc &Func) { - // Kind of a hack, but we prefer the more common destructor name that people - // are familiar with, e.g. ~ClassName. It seems there are always both and - // the vector deleting destructor overwrites the nice destructor, so just - // ignore the vector deleting destructor. - if (Func.getName() == "__vecDelDtor") - return; - - if (!VTable) { - // FIXME: Handle this. What's most likely happening is we have an intro - // virtual in a derived class where the base also has an intro virtual. - // In this case the vtable lives in the base. What we really need is - // for each UDTLayoutBase to contain a list of all its vtables, and - // then propagate this list up the hierarchy so that derived classes have - // direct access to their bases' vtables. 
- return; +bool UDTLayoutBase::hasVBPtrAtOffset(uint32_t Off) const { + if (VBPtr && VBPtr->getOffsetInParent() == Off) + return true; + for (BaseClassLayout *BL : AllBases) { + if (BL->hasVBPtrAtOffset(Off - BL->getOffsetInParent())) + return true; } - - uint32_t Stride = VTable->getElementSize(); - - uint32_t Index = Func.getVirtualBaseOffset(); - assert(Index % Stride == 0); - Index /= Stride; - - VTable->setFunction(Index, Func); + return false; } -VTableLayoutItem *UDTLayoutBase::findVTableAtOffset(uint32_t RelativeOffset) { - if (VTable && VTable->getOffsetInParent() == RelativeOffset) - return VTable; - for (auto Base : BaseClasses) { - uint32_t Begin = Base->getOffsetInParent(); - uint32_t End = Begin + Base->getSize(); - if (RelativeOffset < Begin || RelativeOffset >= End) - continue; - - return Base->findVTableAtOffset(RelativeOffset - Begin); - } +void UDTLayoutBase::addChildToLayout(std::unique_ptr Child) { + uint32_t Begin = Child->getOffsetInParent(); - return nullptr; -} + if (!Child->isElided()) { + BitVector ChildBytes = Child->usedBytes(); -void UDTLayoutBase::addVirtualOverride(PDBSymbolFunc &Func) { - auto Signature = Func.getSignature(); - auto ThisAdjust = Signature->getThisAdjust(); - // ThisAdjust tells us which VTable we're looking for. Specifically, it's - // the offset into the current class of the VTable we're looking for. So - // look through the base hierarchy until we find one such that - // AbsoluteOffset(VT) == ThisAdjust - VTableLayoutItem *VT = findVTableAtOffset(ThisAdjust); - if (!VT) { - // FIXME: There really should be a vtable here. If there's not it probably - // means that the vtable is in a virtual base, which we don't yet support. - assert(!VirtualBases.empty()); - return; - } - int32_t OverrideIndex = -1; - // Now we've found the VTable. Func will not have a virtual base offset set, - // so instead we need to compare names and signatures. We iterate each item - // in the VTable. All items should already have non null entries because they - // were initialized by the intro virtual, which was guaranteed to come before. - for (auto ItemAndIndex : enumerate(VT->funcs())) { - auto Item = ItemAndIndex.value(); - assert(Item); - // If the name doesn't match, this isn't an override. Note that it's ok - // for the return type to not match (e.g. co-variant return). - if (Item->getName() != Func.getName()) { - if (Item->isDestructor() && Func.isDestructor()) { - OverrideIndex = ItemAndIndex.index(); - break; - } - continue; - } - // Now make sure it's the right overload. Get the signature of the existing - // vtable method and make sure it has the same arglist and the same cv-ness. - auto ExistingSig = Item->getSignature(); - if (ExistingSig->isConstType() != Signature->isConstType()) - continue; - if (ExistingSig->isVolatileType() != Signature->isVolatileType()) - continue; - - // Now compare arguments. Using the raw bytes of the PDB this would be - // trivial - // because there is an ArgListId and they should be identical. But DIA - // doesn't - // expose this, so the best we can do is iterate each argument and confirm - // that - // each one is identical. 
- if (ExistingSig->getCount() != Signature->getCount()) - continue; - bool IsMatch = true; - auto ExistingEnumerator = ExistingSig->getArguments(); - auto NewEnumerator = Signature->getArguments(); - for (uint32_t I = 0; I < ExistingEnumerator->getChildCount(); ++I) { - auto ExistingArg = ExistingEnumerator->getNext(); - auto NewArg = NewEnumerator->getNext(); - if (ExistingArg->getSymIndexId() != NewArg->getSymIndexId()) { - IsMatch = false; - break; - } - } - if (!IsMatch) - continue; + // Suppose the child occupies 4 bytes starting at offset 12 in a 32 byte + // class. When we call ChildBytes.resize(32), the Child's storage will + // still begin at offset 0, so we need to shift it left by offset bytes + // to get it into the right position. + ChildBytes.resize(UsedBytes.size()); + ChildBytes <<= Child->getOffsetInParent(); + UsedBytes |= ChildBytes; - // It's a match! Stick the new function into the VTable. - OverrideIndex = ItemAndIndex.index(); - break; - } - if (OverrideIndex == -1) { - // FIXME: This is probably due to one of the other FIXMEs in this file. - return; - } - VT->setFunction(OverrideIndex, Func); -} + if (ChildBytes.count() > 0) { + auto Loc = std::upper_bound(LayoutItems.begin(), LayoutItems.end(), Begin, + [](uint32_t Off, const LayoutItemBase *Item) { + return (Off < Item->getOffsetInParent()); + }); -void UDTLayoutBase::addChildToLayout(std::unique_ptr Child) { - uint32_t Begin = Child->getOffsetInParent(); - uint32_t End = Begin + Child->getSize(); - // Due to the empty base optimization, End might point outside the bounds of - // the parent class. If that happens, just clamp the value. - End = std::min(End, getClassSize()); - - UsedBytes.set(Begin, End); - while (Begin != End) { - ChildrenPerByte[Begin].push_back(Child.get()); - ++Begin; + LayoutItems.insert(Loc, Child.get()); + } } - auto Loc = std::upper_bound( - ChildStorage.begin(), ChildStorage.end(), Begin, - [](uint32_t Off, const std::unique_ptr &Item) { - return Off < Item->getOffsetInParent(); - }); - - ChildStorage.insert(Loc, std::move(Child)); + ChildStorage.push_back(std::move(Child)); } \ No newline at end of file diff --git a/lib/Fuzzer/CMakeLists.txt b/lib/Fuzzer/CMakeLists.txt index 59cef04..b886021 100644 --- a/lib/Fuzzer/CMakeLists.txt +++ b/lib/Fuzzer/CMakeLists.txt @@ -1,6 +1,18 @@ -set(LIBFUZZER_FLAGS_BASE "${CMAKE_CXX_FLAGS}") -# Disable the coverage and sanitizer instrumentation for the fuzzer itself. -set(CMAKE_CXX_FLAGS "${LIBFUZZER_FLAGS_BASE} -fno-sanitize-coverage=trace-pc-guard,edge,trace-cmp,indirect-calls,8bit-counters -Werror") +include(CheckCXXSourceCompiles) + +if( APPLE ) + CHECK_CXX_SOURCE_COMPILES(" + static thread_local int blah; + int main() { + return 0; + } + " HAS_THREAD_LOCAL) + + if( NOT HAS_THREAD_LOCAL ) + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Dthread_local=__thread") + endif() +endif() + if( LLVM_USE_SANITIZE_COVERAGE ) if(NOT "${LLVM_USE_SANITIZER}" STREQUAL "Address") message(FATAL_ERROR @@ -8,41 +20,50 @@ if( LLVM_USE_SANITIZE_COVERAGE ) "LLVM_USE_SANITIZE_COVERAGE=YES to be set." ) endif() + set(LIBFUZZER_FLAGS_BASE "${CMAKE_CXX_FLAGS}") + + # Disable the coverage and sanitizer instrumentation for the fuzzer itself. + set(CMAKE_CXX_FLAGS "${LIBFUZZER_FLAGS_BASE} -fno-sanitize-coverage=trace-pc-guard,edge,trace-cmp,indirect-calls,8bit-counters -Werror") +endif() + +# Compile libFuzzer if the compilation is specifically requested, OR +# if the platform is known to be working. 
+if ( LLVM_USE_SANITIZE_COVERAGE OR CMAKE_SYSTEM_NAME MATCHES "Darwin|Linux" ) add_library(LLVMFuzzerNoMainObjects OBJECT - FuzzerCrossOver.cpp - FuzzerDriver.cpp - FuzzerExtFunctionsDlsym.cpp - FuzzerExtFunctionsDlsymWin.cpp - FuzzerExtFunctionsWeak.cpp - FuzzerExtraCounters.cpp - FuzzerIO.cpp - FuzzerIOPosix.cpp - FuzzerIOWindows.cpp - FuzzerLoop.cpp - FuzzerMerge.cpp - FuzzerMutate.cpp - FuzzerSHA1.cpp - FuzzerShmemPosix.cpp - FuzzerShmemWindows.cpp - FuzzerTracePC.cpp - FuzzerTraceState.cpp - FuzzerUtil.cpp - FuzzerUtilDarwin.cpp - FuzzerUtilLinux.cpp - FuzzerUtilPosix.cpp - FuzzerUtilWindows.cpp - ) + FuzzerCrossOver.cpp + FuzzerDriver.cpp + FuzzerExtFunctionsDlsym.cpp + FuzzerExtFunctionsDlsymWin.cpp + FuzzerExtFunctionsWeak.cpp + FuzzerExtraCounters.cpp + FuzzerIO.cpp + FuzzerIOPosix.cpp + FuzzerIOWindows.cpp + FuzzerLoop.cpp + FuzzerMerge.cpp + FuzzerMutate.cpp + FuzzerSHA1.cpp + FuzzerShmemPosix.cpp + FuzzerShmemWindows.cpp + FuzzerTracePC.cpp + FuzzerTraceState.cpp + FuzzerUtil.cpp + FuzzerUtilDarwin.cpp + FuzzerUtilLinux.cpp + FuzzerUtilPosix.cpp + FuzzerUtilWindows.cpp + ) add_library(LLVMFuzzerNoMain STATIC - $ - ) + $ + ) target_link_libraries(LLVMFuzzerNoMain ${LLVM_PTHREAD_LIB}) add_library(LLVMFuzzer STATIC - FuzzerMain.cpp - $ - ) + FuzzerMain.cpp + $ + ) target_link_libraries(LLVMFuzzer ${LLVM_PTHREAD_LIB}) +endif() - if( LLVM_INCLUDE_TESTS ) - add_subdirectory(test) - endif() +if( LLVM_USE_SANITIZE_COVERAGE AND LLVM_INCLUDE_TESTS ) + add_subdirectory(test) endif() diff --git a/lib/Fuzzer/FuzzerDefs.h b/lib/Fuzzer/FuzzerDefs.h index bd18275..27f5719 100644 --- a/lib/Fuzzer/FuzzerDefs.h +++ b/lib/Fuzzer/FuzzerDefs.h @@ -36,17 +36,29 @@ #error "Support for your platform has not been implemented" #endif +#ifndef __has_attribute +# define __has_attribute(x) 0 +#endif + #define LIBFUZZER_POSIX LIBFUZZER_APPLE || LIBFUZZER_LINUX #ifdef __x86_64 -#define ATTRIBUTE_TARGET_POPCNT __attribute__((target("popcnt"))) +# if __has_attribute(target) +# define ATTRIBUTE_TARGET_POPCNT __attribute__((target("popcnt"))) +# else +# define ATTRIBUTE_TARGET_POPCNT +# endif #else -#define ATTRIBUTE_TARGET_POPCNT +# define ATTRIBUTE_TARGET_POPCNT #endif #ifdef __clang__ // avoid gcc warning. 
-# define ATTRIBUTE_NO_SANITIZE_MEMORY __attribute__((no_sanitize("memory"))) +# if __has_attribute(no_sanitize) +# define ATTRIBUTE_NO_SANITIZE_MEMORY __attribute__((no_sanitize("memory"))) +# else +# define ATTRIBUTE_NO_SANITIZE_MEMORY +# endif # define ALWAYS_INLINE __attribute__((always_inline)) #else # define ATTRIBUTE_NO_SANITIZE_MEMORY diff --git a/lib/Fuzzer/FuzzerMerge.h b/lib/Fuzzer/FuzzerMerge.h index cf4a086..dd4c37b 100644 --- a/lib/Fuzzer/FuzzerMerge.h +++ b/lib/Fuzzer/FuzzerMerge.h @@ -69,7 +69,7 @@ struct Merger { size_t Merge(const std::set &InitialFeatures, std::vector *NewFiles); size_t Merge(std::vector *NewFiles) { - return Merge({}, NewFiles); + return Merge(std::set{}, NewFiles); } size_t ApproximateMemoryConsumption() const; std::set AllFeatures() const; diff --git a/lib/IR/AsmWriter.cpp b/lib/IR/AsmWriter.cpp index d0b77e7..b7de071 100644 --- a/lib/IR/AsmWriter.cpp +++ b/lib/IR/AsmWriter.cpp @@ -1103,35 +1103,34 @@ static void WriteConstantInternal(raw_ostream &Out, const Constant *CV, } if (const ConstantFP *CFP = dyn_cast(CV)) { - if (&CFP->getValueAPF().getSemantics() == &APFloat::IEEEsingle() || - &CFP->getValueAPF().getSemantics() == &APFloat::IEEEdouble()) { + const APFloat &APF = CFP->getValueAPF(); + if (&APF.getSemantics() == &APFloat::IEEEsingle() || + &APF.getSemantics() == &APFloat::IEEEdouble()) { // We would like to output the FP constant value in exponential notation, // but we cannot do this if doing so will lose precision. Check here to // make sure that we only output it in exponential format if we can parse // the value back and get the same value. // bool ignored; - bool isDouble = &CFP->getValueAPF().getSemantics()==&APFloat::IEEEdouble(); - bool isInf = CFP->getValueAPF().isInfinity(); - bool isNaN = CFP->getValueAPF().isNaN(); + bool isDouble = &APF.getSemantics() == &APFloat::IEEEdouble(); + bool isInf = APF.isInfinity(); + bool isNaN = APF.isNaN(); if (!isInf && !isNaN) { - double Val = isDouble ? CFP->getValueAPF().convertToDouble() : - CFP->getValueAPF().convertToFloat(); + double Val = isDouble ? APF.convertToDouble() : APF.convertToFloat(); SmallString<128> StrVal; - raw_svector_ostream(StrVal) << Val; - + APF.toString(StrVal, 6, 0, false); // Check to make sure that the stringized number is not some string like // "Inf" or NaN, that atof will accept, but the lexer will not. Check // that the string matches the "[-+]?[0-9]" regex. // - if ((StrVal[0] >= '0' && StrVal[0] <= '9') || - ((StrVal[0] == '-' || StrVal[0] == '+') && - (StrVal[1] >= '0' && StrVal[1] <= '9'))) { - // Reparse stringized version! - if (APFloat(APFloat::IEEEdouble(), StrVal).convertToDouble() == Val) { - Out << StrVal; - return; - } + assert(((StrVal[0] >= '0' && StrVal[0] <= '9') || + ((StrVal[0] == '-' || StrVal[0] == '+') && + (StrVal[1] >= '0' && StrVal[1] <= '9'))) && + "[-+]?[0-9] regex does not match!"); + // Reparse stringized version! + if (APFloat(APFloat::IEEEdouble(), StrVal).convertToDouble() == Val) { + Out << StrVal; + return; } } // Otherwise we could not reparse it to exactly the same value, so we must @@ -1140,7 +1139,7 @@ static void WriteConstantInternal(raw_ostream &Out, const Constant *CV, // x86, so we must not use these types. static_assert(sizeof(double) == sizeof(uint64_t), "assuming that double is 64 bits!"); - APFloat apf = CFP->getValueAPF(); + APFloat apf = APF; // Floats are represented in ASCII IR as double, convert. 
if (!isDouble) apf.convert(APFloat::IEEEdouble(), APFloat::rmNearestTiesToEven, @@ -1153,27 +1152,27 @@ static void WriteConstantInternal(raw_ostream &Out, const Constant *CV, // These appear as a magic letter identifying the type, then a // fixed number of hex digits. Out << "0x"; - APInt API = CFP->getValueAPF().bitcastToAPInt(); - if (&CFP->getValueAPF().getSemantics() == &APFloat::x87DoubleExtended()) { + APInt API = APF.bitcastToAPInt(); + if (&APF.getSemantics() == &APFloat::x87DoubleExtended()) { Out << 'K'; Out << format_hex_no_prefix(API.getHiBits(16).getZExtValue(), 4, /*Upper=*/true); Out << format_hex_no_prefix(API.getLoBits(64).getZExtValue(), 16, /*Upper=*/true); return; - } else if (&CFP->getValueAPF().getSemantics() == &APFloat::IEEEquad()) { + } else if (&APF.getSemantics() == &APFloat::IEEEquad()) { Out << 'L'; Out << format_hex_no_prefix(API.getLoBits(64).getZExtValue(), 16, /*Upper=*/true); Out << format_hex_no_prefix(API.getHiBits(64).getZExtValue(), 16, /*Upper=*/true); - } else if (&CFP->getValueAPF().getSemantics() == &APFloat::PPCDoubleDouble()) { + } else if (&APF.getSemantics() == &APFloat::PPCDoubleDouble()) { Out << 'M'; Out << format_hex_no_prefix(API.getLoBits(64).getZExtValue(), 16, /*Upper=*/true); Out << format_hex_no_prefix(API.getHiBits(64).getZExtValue(), 16, /*Upper=*/true); - } else if (&CFP->getValueAPF().getSemantics() == &APFloat::IEEEhalf()) { + } else if (&APF.getSemantics() == &APFloat::IEEEhalf()) { Out << 'H'; Out << format_hex_no_prefix(API.getZExtValue(), 4, /*Upper=*/true); diff --git a/lib/IR/AttributeImpl.h b/lib/IR/AttributeImpl.h index 09f0373..cf29252 100644 --- a/lib/IR/AttributeImpl.h +++ b/lib/IR/AttributeImpl.h @@ -255,17 +255,10 @@ public: /// \brief Retrieve the attribute set node for the given "slot" in the /// AttrNode list. - AttributeSet getSlotNode(unsigned Slot) const { + AttributeSet getSlotAttributes(unsigned Slot) const { return getSlotPair(Slot)->second; } - /// \brief Retrieve the attributes for the given "slot" in the AttrNode list. - /// \p Slot is an index into the AttrNodes list, not the index of the return / - /// parameter/ function which the attributes apply to. - AttributeList getSlotAttributes(unsigned Slot) const { - return AttributeList::get(Context, *getSlotPair(Slot)); - } - /// \brief Return true if the AttributeSet or the FunctionIndex has an /// enum attribute of the given kind. bool hasFnAttribute(Attribute::AttrKind Kind) const { @@ -273,8 +266,10 @@ public: } typedef AttributeSet::iterator iterator; - iterator begin(unsigned Slot) const { return getSlotNode(Slot).begin(); } - iterator end(unsigned Slot) const { return getSlotNode(Slot).end(); } + iterator begin(unsigned Slot) const { + return getSlotAttributes(Slot).begin(); + } + iterator end(unsigned Slot) const { return getSlotAttributes(Slot).end(); } void Profile(FoldingSetNodeID &ID) const; static void Profile(FoldingSetNodeID &ID, diff --git a/lib/IR/Attributes.cpp b/lib/IR/Attributes.cpp index d690111..e304145 100644 --- a/lib/IR/Attributes.cpp +++ b/lib/IR/Attributes.cpp @@ -955,13 +955,13 @@ AttributeList AttributeList::addAttribute(LLVMContext &C, for (unsigned Index : Indices) { // Add all attribute slots before the current index. for (; I < E && getSlotIndex(I) < Index; ++I) - AttrVec.emplace_back(getSlotIndex(I), pImpl->getSlotNode(I)); + AttrVec.emplace_back(getSlotIndex(I), pImpl->getSlotAttributes(I)); // Add the attribute at this index. If we already have attributes at this // index, merge them into a new set. 
AttrBuilder B; if (I < E && getSlotIndex(I) == Index) { - B.merge(AttrBuilder(pImpl->getSlotNode(I))); + B.merge(AttrBuilder(pImpl->getSlotAttributes(I))); ++I; } B.addAttribute(A); @@ -970,7 +970,7 @@ AttributeList AttributeList::addAttribute(LLVMContext &C, // Add remaining attributes. for (; I < E; ++I) - AttrVec.emplace_back(getSlotIndex(I), pImpl->getSlotNode(I)); + AttrVec.emplace_back(getSlotIndex(I), pImpl->getSlotAttributes(I)); return get(C, AttrVec); } @@ -1008,13 +1008,13 @@ AttributeList AttributeList::addAttributes(LLVMContext &C, unsigned Index, for (I = 0; I < NumAttrs; ++I) { if (getSlotIndex(I) >= Index) break; - AttrVec.emplace_back(getSlotIndex(I), pImpl->getSlotNode(I)); + AttrVec.emplace_back(getSlotIndex(I), pImpl->getSlotAttributes(I)); } AttrBuilder NewAttrs; if (I < NumAttrs && getSlotIndex(I) == Index) { // We need to merge the attribute sets. - NewAttrs.merge(pImpl->getSlotNode(I)); + NewAttrs.merge(pImpl->getSlotAttributes(I)); ++I; } NewAttrs.merge(B); @@ -1024,7 +1024,7 @@ AttributeList AttributeList::addAttributes(LLVMContext &C, unsigned Index, // Add the remaining entries. for (; I < NumAttrs; ++I) - AttrVec.emplace_back(getSlotIndex(I), pImpl->getSlotNode(I)); + AttrVec.emplace_back(getSlotIndex(I), pImpl->getSlotAttributes(I)); return get(C, AttrVec); } @@ -1063,11 +1063,11 @@ AttributeList AttributeList::removeAttributes(LLVMContext &C, unsigned Index, for (unsigned I = 0, E = NumAttrs; I != E; ++I) { if (getSlotIndex(I) >= Index) { if (getSlotIndex(I) == Index) - B = AttrBuilder(pImpl->getSlotNode(LastIndex++)); + B = AttrBuilder(getSlotAttributes(LastIndex++)); break; } LastIndex = I + 1; - AttrSets.push_back({getSlotIndex(I), pImpl->getSlotNode(I)}); + AttrSets.push_back({getSlotIndex(I), getSlotAttributes(I)}); } // Remove the attributes from the existing set and add them. @@ -1077,7 +1077,7 @@ AttributeList AttributeList::removeAttributes(LLVMContext &C, unsigned Index, // Add the remaining attribute slots. for (unsigned I = LastIndex, E = NumAttrs; I < E; ++I) - AttrSets.push_back({getSlotIndex(I), pImpl->getSlotNode(I)}); + AttrSets.push_back({getSlotIndex(I), getSlotAttributes(I)}); return get(C, AttrSets); } @@ -1091,7 +1091,7 @@ AttributeList AttributeList::removeAttributes(LLVMContext &C, for (unsigned I = 0, E = pImpl->getNumSlots(); I != E; ++I) { unsigned Index = getSlotIndex(I); if (Index != WithoutIndex) - AttrSet.push_back({Index, pImpl->getSlotNode(I)}); + AttrSet.push_back({Index, pImpl->getSlotAttributes(I)}); } return get(C, AttrSet); } @@ -1220,7 +1220,7 @@ AttributeSet AttributeList::getAttributes(unsigned Index) const { // Loop through to find the attribute node we want. for (unsigned I = 0, E = pImpl->getNumSlots(); I != E; ++I) if (pImpl->getSlotIndex(I) == Index) - return pImpl->getSlotNode(I); + return pImpl->getSlotAttributes(I); return AttributeSet(); } @@ -1251,7 +1251,7 @@ unsigned AttributeList::getSlotIndex(unsigned Slot) const { return pImpl->getSlotIndex(Slot); } -AttributeList AttributeList::getSlotAttributes(unsigned Slot) const { +AttributeSet AttributeList::getSlotAttributes(unsigned Slot) const { assert(pImpl && Slot < pImpl->getNumSlots() && "Slot # out of range!"); return pImpl->getSlotAttributes(Slot); diff --git a/lib/IR/GCOV.cpp b/lib/IR/GCOV.cpp index ba92a91..d4b4552 100644 --- a/lib/IR/GCOV.cpp +++ b/lib/IR/GCOV.cpp @@ -589,8 +589,12 @@ FileInfo::openCoveragePath(StringRef CoveragePath) { /// print - Print source files with collected line count information. 
void FileInfo::print(raw_ostream &InfoOS, StringRef MainFilename, StringRef GCNOFile, StringRef GCDAFile) { - for (const auto &LI : LineInfo) { - StringRef Filename = LI.first(); + SmallVector Filenames; + for (const auto &LI : LineInfo) + Filenames.push_back(LI.first()); + std::sort(Filenames.begin(), Filenames.end()); + + for (StringRef Filename : Filenames) { auto AllLines = LineConsumer(Filename); std::string CoveragePath = getCoveragePath(Filename, MainFilename); @@ -603,7 +607,7 @@ void FileInfo::print(raw_ostream &InfoOS, StringRef MainFilename, CovOS << " -: 0:Runs:" << RunCount << "\n"; CovOS << " -: 0:Programs:" << ProgramCount << "\n"; - const LineData &Line = LI.second; + const LineData &Line = LineInfo[Filename]; GCOVCoverage FileCoverage(Filename); for (uint32_t LineIndex = 0; LineIndex < Line.LastLine || !AllLines.empty(); ++LineIndex) { diff --git a/lib/IR/Value.cpp b/lib/IR/Value.cpp index b07c576..d83bdf2 100644 --- a/lib/IR/Value.cpp +++ b/lib/IR/Value.cpp @@ -432,6 +432,7 @@ namespace { enum PointerStripKind { PSK_ZeroIndices, PSK_ZeroIndicesAndAliases, + PSK_ZeroIndicesAndAliasesAndBarriers, PSK_InBoundsConstantIndices, PSK_InBounds }; @@ -450,6 +451,7 @@ static const Value *stripPointerCastsAndOffsets(const Value *V) { if (auto *GEP = dyn_cast(V)) { switch (StripKind) { case PSK_ZeroIndicesAndAliases: + case PSK_ZeroIndicesAndAliasesAndBarriers: case PSK_ZeroIndices: if (!GEP->hasAllZeroIndices()) return V; @@ -472,12 +474,20 @@ static const Value *stripPointerCastsAndOffsets(const Value *V) { return V; V = GA->getAliasee(); } else { - if (auto CS = ImmutableCallSite(V)) + if (auto CS = ImmutableCallSite(V)) { if (const Value *RV = CS.getReturnedArgOperand()) { V = RV; continue; } - + // The result of invariant.group.barrier must alias it's argument, + // but it can't be marked with returned attribute, that's why it needs + // special case. + if (StripKind == PSK_ZeroIndicesAndAliasesAndBarriers && + CS.getIntrinsicID() == Intrinsic::invariant_group_barrier) { + V = CS.getArgOperand(0); + continue; + } + } return V; } assert(V->getType()->isPointerTy() && "Unexpected operand type!"); @@ -499,6 +509,11 @@ const Value *Value::stripInBoundsConstantOffsets() const { return stripPointerCastsAndOffsets(this); } +const Value *Value::stripPointerCastsAndBarriers() const { + return stripPointerCastsAndOffsets( + this); +} + const Value * Value::stripAndAccumulateInBoundsConstantOffsets(const DataLayout &DL, APInt &Offset) const { diff --git a/lib/LTO/LTO.cpp b/lib/LTO/LTO.cpp index 9782c89..1bc0d73 100644 --- a/lib/LTO/LTO.cpp +++ b/lib/LTO/LTO.cpp @@ -415,7 +415,8 @@ void LTO::addSymbolToGlobalRes(const InputFile::Symbol &Sym, // Flag as visible outside of ThinLTO if visible from a regular object or // if this is a reference in the regular LTO partition. 
GlobalRes.VisibleOutsideThinLTO |= - (Res.VisibleToRegularObj || (Partition == GlobalResolution::RegularLTO)); + (Res.VisibleToRegularObj || Sym.isUsed() || + Partition == GlobalResolution::RegularLTO); } static void writeToResolutionFile(raw_ostream &OS, InputFile *Input, diff --git a/lib/LTO/LTOBackend.cpp b/lib/LTO/LTOBackend.cpp index 4bd251f..30447c5 100644 --- a/lib/LTO/LTOBackend.cpp +++ b/lib/LTO/LTOBackend.cpp @@ -25,7 +25,6 @@ #include "llvm/IR/PassManager.h" #include "llvm/IR/Verifier.h" #include "llvm/LTO/LTO.h" -#include "llvm/LTO/legacy/UpdateCompilerUsed.h" #include "llvm/MC/SubtargetFeature.h" #include "llvm/Object/ModuleSymbolTable.h" #include "llvm/Passes/PassBuilder.h" @@ -353,19 +352,6 @@ finalizeOptimizationRemarks(std::unique_ptr DiagOutputFile) { DiagOutputFile->os().flush(); } -static void handleAsmUndefinedRefs(Module &Mod, TargetMachine &TM) { - // Collect the list of undefined symbols used in asm and update - // llvm.compiler.used to prevent optimization to drop these from the output. - StringSet<> AsmUndefinedRefs; - ModuleSymbolTable::CollectAsmSymbols( - Mod, - [&AsmUndefinedRefs](StringRef Name, object::BasicSymbolRef::Flags Flags) { - if (Flags & object::BasicSymbolRef::SF_Undefined) - AsmUndefinedRefs.insert(Name); - }); - updateCompilerUsed(Mod, TM, AsmUndefinedRefs); -} - Error lto::backend(Config &C, AddStreamFn AddStream, unsigned ParallelCodeGenParallelismLevel, std::unique_ptr Mod, @@ -377,8 +363,6 @@ Error lto::backend(Config &C, AddStreamFn AddStream, std::unique_ptr TM = createTargetMachine(C, Mod->getTargetTriple(), *TOrErr); - handleAsmUndefinedRefs(*Mod, *TM); - // Setup optimization remarks. auto DiagFileOrErr = lto::setupOptimizationRemarks( Mod->getContext(), C.RemarksFilename, C.RemarksWithHotness); @@ -416,8 +400,6 @@ Error lto::thinBackend(Config &Conf, unsigned Task, AddStreamFn AddStream, std::unique_ptr TM = createTargetMachine(Conf, Mod.getTargetTriple(), *TOrErr); - handleAsmUndefinedRefs(Mod, *TM); - if (Conf.CodeGenOnly) { codegen(Conf, TM.get(), AddStream, Task, Mod); return Error::success(); diff --git a/lib/MC/MCParser/AsmParser.cpp b/lib/MC/MCParser/AsmParser.cpp index 42e8ad3..2fa9c03 100644 --- a/lib/MC/MCParser/AsmParser.cpp +++ b/lib/MC/MCParser/AsmParser.cpp @@ -134,7 +134,7 @@ struct ParseStatementInfo { SmallVectorImpl *AsmRewrites = nullptr; - ParseStatementInfo() = default; + ParseStatementInfo() = delete; ParseStatementInfo(SmallVectorImpl *rewrites) : AsmRewrites(rewrites) {} }; @@ -737,6 +737,7 @@ bool AsmParser::Run(bool NoInitialTextSection, bool NoFinalize) { HadError = false; AsmCond StartingCondState = TheCondState; + SmallVector AsmStrRewrites; // If we are generating dwarf for assembly source files save the initial text // section and generate a .file directive. @@ -756,7 +757,7 @@ bool AsmParser::Run(bool NoInitialTextSection, bool NoFinalize) { // While we have input, parse each statement. while (Lexer.isNot(AsmToken::Eof)) { - ParseStatementInfo Info; + ParseStatementInfo Info(&AsmStrRewrites); if (!parseStatement(Info, nullptr)) continue; @@ -1650,7 +1651,7 @@ bool AsmParser::parseStatement(ParseStatementInfo &Info, } // Emit the label. - if (!ParsingInlineAsm) + if (!getTargetParser().isParsingInlineAsm()) Out.EmitLabel(Sym, IDLoc); // If we are generating dwarf for assembly source files then gather the @@ -2057,9 +2058,9 @@ bool AsmParser::parseStatement(ParseStatementInfo &Info, // If parsing succeeded, match the instruction. 
if (!ParseHadError) { uint64_t ErrorInfo; - if (getTargetParser().MatchAndEmitInstruction(IDLoc, Info.Opcode, - Info.ParsedOperands, Out, - ErrorInfo, ParsingInlineAsm)) + if (getTargetParser().MatchAndEmitInstruction( + IDLoc, Info.Opcode, Info.ParsedOperands, Out, ErrorInfo, + getTargetParser().isParsingInlineAsm())) return true; } return false; diff --git a/lib/MC/WasmObjectWriter.cpp b/lib/MC/WasmObjectWriter.cpp index 159cc3b..6444046 100644 --- a/lib/MC/WasmObjectWriter.cpp +++ b/lib/MC/WasmObjectWriter.cpp @@ -1105,7 +1105,7 @@ void WasmObjectWriter::writeObject(MCAssembler &Asm, encodeULEB128(wasm::WASM_SEC_CODE, getStream()); - encodeULEB128(CodeRelocations.size(), getStream()); + encodeULEB128(CodeRelocations.size() + TypeIndexFixups.size(), getStream()); WriteRelocations(CodeRelocations, getStream(), SymbolIndices); WriteTypeRelocations(TypeIndexFixups, TypeIndexFixupTypes, getStream()); diff --git a/lib/Object/ELF.cpp b/lib/Object/ELF.cpp index 23682e1..e89a4a3 100644 --- a/lib/Object/ELF.cpp +++ b/lib/Object/ELF.cpp @@ -1,4 +1,4 @@ -//===- ELF.cpp - ELF object file implementation -----------------*- C++ -*-===// +//===- ELF.cpp - ELF object file implementation ---------------------------===// // // The LLVM Compiler Infrastructure // @@ -8,15 +8,17 @@ //===----------------------------------------------------------------------===// #include "llvm/Object/ELF.h" +#include "llvm/Support/ELF.h" -namespace llvm { -namespace object { +using namespace llvm; +using namespace object; #define ELF_RELOC(name, value) \ case ELF::name: \ return #name; \ -StringRef getELFRelocationTypeName(uint32_t Machine, uint32_t Type) { +StringRef llvm::object::getELFRelocationTypeName(uint32_t Machine, + uint32_t Type) { switch (Machine) { case ELF::EM_X86_64: switch (Type) { @@ -139,6 +141,3 @@ StringRef getELFRelocationTypeName(uint32_t Machine, uint32_t Type) { } #undef ELF_RELOC - -} // end namespace object -} // end namespace llvm diff --git a/lib/Object/ELFObjectFile.cpp b/lib/Object/ELFObjectFile.cpp index 3f8c81c..86f033b 100644 --- a/lib/Object/ELFObjectFile.cpp +++ b/lib/Object/ELFObjectFile.cpp @@ -1,4 +1,4 @@ -//===- ELFObjectFile.cpp - ELF object file implementation -------*- C++ -*-===// +//===- ELFObjectFile.cpp - ELF object file implementation -----------------===// // // The LLVM Compiler Infrastructure // @@ -11,12 +11,27 @@ // //===----------------------------------------------------------------------===// +#include "llvm/ADT/Triple.h" +#include "llvm/MC/SubtargetFeature.h" +#include "llvm/Object/ELF.h" #include "llvm/Object/ELFObjectFile.h" +#include "llvm/Object/ELFTypes.h" +#include "llvm/Object/Error.h" #include "llvm/Support/ARMBuildAttributes.h" #include "llvm/Support/ARMAttributeParser.h" +#include "llvm/Support/ELF.h" +#include "llvm/Support/Endian.h" +#include "llvm/Support/ErrorHandling.h" #include "llvm/Support/MathExtras.h" +#include +#include +#include +#include +#include +#include +#include -namespace llvm { +using namespace llvm; using namespace object; ELFObjectFileBase::ELFObjectFileBase(unsigned int Type, MemoryBufferRef Source) @@ -299,5 +314,3 @@ void ELFObjectFileBase::setARMSubArch(Triple &TheTriple) const { TheTriple.setArchName(Triple); } - -} // end namespace llvm diff --git a/lib/Object/IRSymtab.cpp b/lib/Object/IRSymtab.cpp index bb3d1b2..5f08378 100644 --- a/lib/Object/IRSymtab.cpp +++ b/lib/Object/IRSymtab.cpp @@ -1,4 +1,4 @@ -//===- IRSymtab.cpp - implementation of IR symbol tables --------*- C++ -*-===// +//===- IRSymtab.cpp - implementation of IR symbol 
tables ------------------===// // // The LLVM Compiler Infrastructure // @@ -7,14 +7,34 @@ // //===----------------------------------------------------------------------===// -#include "llvm/Object/IRSymtab.h" +#include "llvm/ADT/ArrayRef.h" +#include "llvm/ADT/DenseMap.h" +#include "llvm/ADT/SmallPtrSet.h" +#include "llvm/ADT/SmallString.h" +#include "llvm/ADT/SmallVector.h" +#include "llvm/ADT/StringRef.h" +#include "llvm/ADT/Triple.h" #include "llvm/Analysis/ObjectUtils.h" +#include "llvm/IR/Comdat.h" +#include "llvm/IR/DataLayout.h" +#include "llvm/IR/GlobalAlias.h" +#include "llvm/IR/GlobalObject.h" #include "llvm/IR/Mangler.h" +#include "llvm/IR/Metadata.h" #include "llvm/IR/Module.h" #include "llvm/MC/StringTableBuilder.h" +#include "llvm/Object/IRSymtab.h" #include "llvm/Object/ModuleSymbolTable.h" +#include "llvm/Object/SymbolicFile.h" #include "llvm/Support/Allocator.h" +#include "llvm/Support/Casting.h" +#include "llvm/Support/Error.h" +#include "llvm/Support/raw_ostream.h" #include "llvm/Support/StringSaver.h" +#include +#include +#include +#include using namespace llvm; using namespace irsymtab; @@ -25,6 +45,7 @@ namespace { struct Builder { SmallVector &Symtab; SmallVector &Strtab; + Builder(SmallVector &Symtab, SmallVector &Strtab) : Symtab(Symtab), Strtab(Strtab) {} @@ -49,6 +70,7 @@ struct Builder { S.Offset = StrtabBuilder.add(Value); S.Size = Value.size(); } + template void writeRange(storage::Range &R, const std::vector &Objs) { R.Offset = Symtab.size(); @@ -141,6 +163,9 @@ Error Builder::addSymbol(const ModuleSymbolTable &Msymtab, Sym.ComdatIndex = -1; auto *GV = Msym.dyn_cast(); if (!GV) { + // Undefined module asm symbols act as GC roots and are implicitly used. + if (Flags & object::BasicSymbolRef::SF_Undefined) + Sym.Flags |= 1 << storage::Symbol::FB_used; setStr(Sym.IRName, ""); return Error::success(); } @@ -228,7 +253,7 @@ Error Builder::build(ArrayRef IRMods) { return Error::success(); } -} // anonymous namespace +} // end anonymous namespace Error irsymtab::build(ArrayRef Mods, SmallVector &Symtab, SmallVector &Strtab) { diff --git a/lib/Object/MachOObjectFile.cpp b/lib/Object/MachOObjectFile.cpp index 1753d2b..3d3fa07 100644 --- a/lib/Object/MachOObjectFile.cpp +++ b/lib/Object/MachOObjectFile.cpp @@ -1,4 +1,4 @@ -//===- MachOObjectFile.cpp - Mach-O object file binding ---------*- C++ -*-===// +//===- MachOObjectFile.cpp - Mach-O object file binding -------------------===// // // The LLVM Compiler Infrastructure // @@ -12,32 +12,52 @@ // //===----------------------------------------------------------------------===// -#include "llvm/Object/MachO.h" +#include "llvm/ADT/ArrayRef.h" +#include "llvm/ADT/None.h" +#include "llvm/ADT/SmallVector.h" #include "llvm/ADT/STLExtras.h" +#include "llvm/ADT/StringExtras.h" +#include "llvm/ADT/StringRef.h" #include "llvm/ADT/StringSwitch.h" #include "llvm/ADT/Triple.h" +#include "llvm/ADT/Twine.h" +#include "llvm/Object/Error.h" +#include "llvm/Object/MachO.h" +#include "llvm/Object/ObjectFile.h" +#include "llvm/Object/SymbolicFile.h" #include "llvm/Support/DataExtractor.h" #include "llvm/Support/Debug.h" +#include "llvm/Support/Error.h" +#include "llvm/Support/ErrorHandling.h" #include "llvm/Support/Format.h" #include "llvm/Support/Host.h" #include "llvm/Support/LEB128.h" #include "llvm/Support/MachO.h" #include "llvm/Support/MemoryBuffer.h" #include "llvm/Support/raw_ostream.h" -#include +#include "llvm/Support/SwapByteOrder.h" +#include +#include +#include +#include #include #include #include +#include +#include +#include 
using namespace llvm; using namespace object; namespace { + struct section_base { char sectname[16]; char segname[16]; }; -} + +} // end anonymous namespace static Error malformedError(Twine Msg) { @@ -1144,11 +1164,7 @@ MachOObjectFile::MachOObjectFile(MemoryBufferRef Object, bool IsLittleEndian, bool Is64bits, Error &Err, uint32_t UniversalCputype, uint32_t UniversalIndex) - : ObjectFile(getMachOType(IsLittleEndian, Is64bits), Object), - SymtabLoadCmd(nullptr), DysymtabLoadCmd(nullptr), - DataInCodeLoadCmd(nullptr), LinkOptHintsLoadCmd(nullptr), - DyldInfoLoadCmd(nullptr), UuidLoadCmd(nullptr), - HasPageZeroSegment(false) { + : ObjectFile(getMachOType(IsLittleEndian, Is64bits), Object) { ErrorAsOutParameter ErrAsOutParam(&Err); uint64_t SizeOfHeaders; uint32_t cputype; @@ -2343,11 +2359,11 @@ StringRef MachOObjectFile::getFileFormatName() const { unsigned CPUType = getCPUType(*this); if (!is64Bit()) { switch (CPUType) { - case llvm::MachO::CPU_TYPE_I386: + case MachO::CPU_TYPE_I386: return "Mach-O 32-bit i386"; - case llvm::MachO::CPU_TYPE_ARM: + case MachO::CPU_TYPE_ARM: return "Mach-O arm"; - case llvm::MachO::CPU_TYPE_POWERPC: + case MachO::CPU_TYPE_POWERPC: return "Mach-O 32-bit ppc"; default: return "Mach-O 32-bit unknown"; @@ -2355,11 +2371,11 @@ StringRef MachOObjectFile::getFileFormatName() const { } switch (CPUType) { - case llvm::MachO::CPU_TYPE_X86_64: + case MachO::CPU_TYPE_X86_64: return "Mach-O 64-bit x86-64"; - case llvm::MachO::CPU_TYPE_ARM64: + case MachO::CPU_TYPE_ARM64: return "Mach-O arm64"; - case llvm::MachO::CPU_TYPE_POWERPC64: + case MachO::CPU_TYPE_POWERPC64: return "Mach-O 64-bit ppc64"; default: return "Mach-O 64-bit unknown"; @@ -2368,17 +2384,17 @@ StringRef MachOObjectFile::getFileFormatName() const { Triple::ArchType MachOObjectFile::getArch(uint32_t CPUType) { switch (CPUType) { - case llvm::MachO::CPU_TYPE_I386: + case MachO::CPU_TYPE_I386: return Triple::x86; - case llvm::MachO::CPU_TYPE_X86_64: + case MachO::CPU_TYPE_X86_64: return Triple::x86_64; - case llvm::MachO::CPU_TYPE_ARM: + case MachO::CPU_TYPE_ARM: return Triple::arm; - case llvm::MachO::CPU_TYPE_ARM64: + case MachO::CPU_TYPE_ARM64: return Triple::aarch64; - case llvm::MachO::CPU_TYPE_POWERPC: + case MachO::CPU_TYPE_POWERPC: return Triple::ppc; - case llvm::MachO::CPU_TYPE_POWERPC64: + case MachO::CPU_TYPE_POWERPC64: return Triple::ppc64; default: return Triple::UnknownArch; @@ -2571,8 +2587,7 @@ dice_iterator MachOObjectFile::end_dices() const { return dice_iterator(DiceRef(DRI, this)); } -ExportEntry::ExportEntry(ArrayRef T) - : Trie(T), Malformed(false), Done(false) {} +ExportEntry::ExportEntry(ArrayRef T) : Trie(T) {} void ExportEntry::moveToFirst() { pushNode(0); @@ -2641,9 +2656,7 @@ uint32_t ExportEntry::nodeOffset() const { } ExportEntry::NodeState::NodeState(const uint8_t *Ptr) - : Start(Ptr), Current(Ptr), Flags(0), Address(0), Other(0), - ImportName(nullptr), ChildCount(0), NextChildIndex(0), - ParentStringLength(0), IsExportNode(false) {} + : Start(Ptr), Current(Ptr) {} void ExportEntry::pushNode(uint64_t offset) { const uint8_t *Ptr = Trie.begin() + offset; @@ -2733,7 +2746,7 @@ void ExportEntry::moveNext() { iterator_range MachOObjectFile::exports(ArrayRef Trie) { ExportEntry Start(Trie); - if (Trie.size() == 0) + if (Trie.empty()) Start.moveToEnd(); else Start.moveToFirst(); @@ -2750,9 +2763,8 @@ iterator_range MachOObjectFile::exports() const { MachORebaseEntry::MachORebaseEntry(Error *E, const MachOObjectFile *O, ArrayRef Bytes, bool is64Bit) - : E(E), O(O), Opcodes(Bytes), 
Ptr(Bytes.begin()), SegmentOffset(0), - SegmentIndex(-1), RemainingLoopCount(0), AdvanceAmount(0), RebaseType(0), - PointerSize(is64Bit ? 8 : 4), Done(false) {} + : E(E), O(O), Opcodes(Bytes), Ptr(Bytes.begin()), + PointerSize(is64Bit ? 8 : 4) {} void MachORebaseEntry::moveToFirst() { Ptr = Opcodes.begin(); @@ -2794,7 +2806,7 @@ void MachORebaseEntry::moveNext() { More = false; Done = true; moveToEnd(); - DEBUG_WITH_TYPE("mach-o-rebase", llvm::dbgs() << "REBASE_OPCODE_DONE\n"); + DEBUG_WITH_TYPE("mach-o-rebase", dbgs() << "REBASE_OPCODE_DONE\n"); break; case MachO::REBASE_OPCODE_SET_TYPE_IMM: RebaseType = ImmValue; @@ -2807,8 +2819,8 @@ void MachORebaseEntry::moveNext() { } DEBUG_WITH_TYPE( "mach-o-rebase", - llvm::dbgs() << "REBASE_OPCODE_SET_TYPE_IMM: " - << "RebaseType=" << (int) RebaseType << "\n"); + dbgs() << "REBASE_OPCODE_SET_TYPE_IMM: " + << "RebaseType=" << (int) RebaseType << "\n"); break; case MachO::REBASE_OPCODE_SET_SEGMENT_AND_OFFSET_ULEB: SegmentIndex = ImmValue; @@ -2831,10 +2843,10 @@ void MachORebaseEntry::moveNext() { } DEBUG_WITH_TYPE( "mach-o-rebase", - llvm::dbgs() << "REBASE_OPCODE_SET_SEGMENT_AND_OFFSET_ULEB: " - << "SegmentIndex=" << SegmentIndex << ", " - << format("SegmentOffset=0x%06X", SegmentOffset) - << "\n"); + dbgs() << "REBASE_OPCODE_SET_SEGMENT_AND_OFFSET_ULEB: " + << "SegmentIndex=" << SegmentIndex << ", " + << format("SegmentOffset=0x%06X", SegmentOffset) + << "\n"); break; case MachO::REBASE_OPCODE_ADD_ADDR_ULEB: SegmentOffset += readULEB128(&error); @@ -2855,9 +2867,9 @@ void MachORebaseEntry::moveNext() { return; } DEBUG_WITH_TYPE("mach-o-rebase", - llvm::dbgs() << "REBASE_OPCODE_ADD_ADDR_ULEB: " - << format("SegmentOffset=0x%06X", - SegmentOffset) << "\n"); + dbgs() << "REBASE_OPCODE_ADD_ADDR_ULEB: " + << format("SegmentOffset=0x%06X", + SegmentOffset) << "\n"); break; case MachO::REBASE_OPCODE_ADD_ADDR_IMM_SCALED: error = O->RebaseEntryCheckSegAndOffset(SegmentIndex, SegmentOffset, @@ -2881,9 +2893,9 @@ void MachORebaseEntry::moveNext() { return; } DEBUG_WITH_TYPE("mach-o-rebase", - llvm::dbgs() << "REBASE_OPCODE_ADD_ADDR_IMM_SCALED: " - << format("SegmentOffset=0x%06X", - SegmentOffset) << "\n"); + dbgs() << "REBASE_OPCODE_ADD_ADDR_IMM_SCALED: " + << format("SegmentOffset=0x%06X", + SegmentOffset) << "\n"); break; case MachO::REBASE_OPCODE_DO_REBASE_IMM_TIMES: error = O->RebaseEntryCheckSegAndOffset(SegmentIndex, SegmentOffset, @@ -2913,11 +2925,11 @@ void MachORebaseEntry::moveNext() { } DEBUG_WITH_TYPE( "mach-o-rebase", - llvm::dbgs() << "REBASE_OPCODE_DO_REBASE_IMM_TIMES: " - << format("SegmentOffset=0x%06X", SegmentOffset) - << ", AdvanceAmount=" << AdvanceAmount - << ", RemainingLoopCount=" << RemainingLoopCount - << "\n"); + dbgs() << "REBASE_OPCODE_DO_REBASE_IMM_TIMES: " + << format("SegmentOffset=0x%06X", SegmentOffset) + << ", AdvanceAmount=" << AdvanceAmount + << ", RemainingLoopCount=" << RemainingLoopCount + << "\n"); return; case MachO::REBASE_OPCODE_DO_REBASE_ULEB_TIMES: error = O->RebaseEntryCheckSegAndOffset(SegmentIndex, SegmentOffset, @@ -2954,11 +2966,11 @@ void MachORebaseEntry::moveNext() { } DEBUG_WITH_TYPE( "mach-o-rebase", - llvm::dbgs() << "REBASE_OPCODE_DO_REBASE_ULEB_TIMES: " - << format("SegmentOffset=0x%06X", SegmentOffset) - << ", AdvanceAmount=" << AdvanceAmount - << ", RemainingLoopCount=" << RemainingLoopCount - << "\n"); + dbgs() << "REBASE_OPCODE_DO_REBASE_ULEB_TIMES: " + << format("SegmentOffset=0x%06X", SegmentOffset) + << ", AdvanceAmount=" << AdvanceAmount + << ", RemainingLoopCount=" << RemainingLoopCount + 
<< "\n"); return; case MachO::REBASE_OPCODE_DO_REBASE_ADD_ADDR_ULEB: error = O->RebaseEntryCheckSegAndOffset(SegmentIndex, SegmentOffset, @@ -2992,11 +3004,11 @@ void MachORebaseEntry::moveNext() { } DEBUG_WITH_TYPE( "mach-o-rebase", - llvm::dbgs() << "REBASE_OPCODE_DO_REBASE_ADD_ADDR_ULEB: " - << format("SegmentOffset=0x%06X", SegmentOffset) - << ", AdvanceAmount=" << AdvanceAmount - << ", RemainingLoopCount=" << RemainingLoopCount - << "\n"); + dbgs() << "REBASE_OPCODE_DO_REBASE_ADD_ADDR_ULEB: " + << format("SegmentOffset=0x%06X", SegmentOffset) + << ", AdvanceAmount=" << AdvanceAmount + << ", RemainingLoopCount=" << RemainingLoopCount + << "\n"); return; case MachO::REBASE_OPCODE_DO_REBASE_ULEB_TIMES_SKIPPING_ULEB: error = O->RebaseEntryCheckSegAndOffset(SegmentIndex, SegmentOffset, @@ -3041,11 +3053,11 @@ void MachORebaseEntry::moveNext() { } DEBUG_WITH_TYPE( "mach-o-rebase", - llvm::dbgs() << "REBASE_OPCODE_DO_REBASE_ULEB_TIMES_SKIPPING_ULEB: " - << format("SegmentOffset=0x%06X", SegmentOffset) - << ", AdvanceAmount=" << AdvanceAmount - << ", RemainingLoopCount=" << RemainingLoopCount - << "\n"); + dbgs() << "REBASE_OPCODE_DO_REBASE_ULEB_TIMES_SKIPPING_ULEB: " + << format("SegmentOffset=0x%06X", SegmentOffset) + << ", AdvanceAmount=" << AdvanceAmount + << ", RemainingLoopCount=" << RemainingLoopCount + << "\n"); return; default: *E = malformedError("bad rebase info (bad opcode value 0x" + @@ -3131,10 +3143,8 @@ iterator_range MachOObjectFile::rebaseTable(Error &Err) { MachOBindEntry::MachOBindEntry(Error *E, const MachOObjectFile *O, ArrayRef Bytes, bool is64Bit, Kind BK) - : E(E), O(O), Opcodes(Bytes), Ptr(Bytes.begin()), SegmentOffset(0), - SegmentIndex(-1), LibraryOrdinalSet(false), Ordinal(0), Flags(0), - Addend(0), RemainingLoopCount(0), AdvanceAmount(0), BindType(0), - PointerSize(is64Bit ? 8 : 4), TableKind(BK), Done(false) {} + : E(E), O(O), Opcodes(Bytes), Ptr(Bytes.begin()), + PointerSize(is64Bit ? 
8 : 4), TableKind(BK) {} void MachOBindEntry::moveToFirst() { Ptr = Opcodes.begin(); @@ -3189,7 +3199,7 @@ void MachOBindEntry::moveNext() { } More = false; moveToEnd(); - DEBUG_WITH_TYPE("mach-o-bind", llvm::dbgs() << "BIND_OPCODE_DONE\n"); + DEBUG_WITH_TYPE("mach-o-bind", dbgs() << "BIND_OPCODE_DONE\n"); break; case MachO::BIND_OPCODE_SET_DYLIB_ORDINAL_IMM: if (TableKind == Kind::Weak) { @@ -3211,8 +3221,8 @@ void MachOBindEntry::moveNext() { } DEBUG_WITH_TYPE( "mach-o-bind", - llvm::dbgs() << "BIND_OPCODE_SET_DYLIB_ORDINAL_IMM: " - << "Ordinal=" << Ordinal << "\n"); + dbgs() << "BIND_OPCODE_SET_DYLIB_ORDINAL_IMM: " + << "Ordinal=" << Ordinal << "\n"); break; case MachO::BIND_OPCODE_SET_DYLIB_ORDINAL_ULEB: if (TableKind == Kind::Weak) { @@ -3241,8 +3251,8 @@ void MachOBindEntry::moveNext() { } DEBUG_WITH_TYPE( "mach-o-bind", - llvm::dbgs() << "BIND_OPCODE_SET_DYLIB_ORDINAL_ULEB: " - << "Ordinal=" << Ordinal << "\n"); + dbgs() << "BIND_OPCODE_SET_DYLIB_ORDINAL_ULEB: " + << "Ordinal=" << Ordinal << "\n"); break; case MachO::BIND_OPCODE_SET_DYLIB_SPECIAL_IMM: if (TableKind == Kind::Weak) { @@ -3267,8 +3277,8 @@ void MachOBindEntry::moveNext() { Ordinal = 0; DEBUG_WITH_TYPE( "mach-o-bind", - llvm::dbgs() << "BIND_OPCODE_SET_DYLIB_SPECIAL_IMM: " - << "Ordinal=" << Ordinal << "\n"); + dbgs() << "BIND_OPCODE_SET_DYLIB_SPECIAL_IMM: " + << "Ordinal=" << Ordinal << "\n"); break; case MachO::BIND_OPCODE_SET_SYMBOL_TRAILING_FLAGS_IMM: Flags = ImmValue; @@ -3288,8 +3298,8 @@ void MachOBindEntry::moveNext() { ++Ptr; DEBUG_WITH_TYPE( "mach-o-bind", - llvm::dbgs() << "BIND_OPCODE_SET_SYMBOL_TRAILING_FLAGS_IMM: " - << "SymbolName=" << SymbolName << "\n"); + dbgs() << "BIND_OPCODE_SET_SYMBOL_TRAILING_FLAGS_IMM: " + << "SymbolName=" << SymbolName << "\n"); if (TableKind == Kind::Weak) { if (ImmValue & MachO::BIND_SYMBOL_FLAGS_NON_WEAK_DEFINITION) return; @@ -3306,8 +3316,8 @@ void MachOBindEntry::moveNext() { } DEBUG_WITH_TYPE( "mach-o-bind", - llvm::dbgs() << "BIND_OPCODE_SET_TYPE_IMM: " - << "BindType=" << (int)BindType << "\n"); + dbgs() << "BIND_OPCODE_SET_TYPE_IMM: " + << "BindType=" << (int)BindType << "\n"); break; case MachO::BIND_OPCODE_SET_ADDEND_SLEB: Addend = readSLEB128(&error); @@ -3320,8 +3330,8 @@ void MachOBindEntry::moveNext() { } DEBUG_WITH_TYPE( "mach-o-bind", - llvm::dbgs() << "BIND_OPCODE_SET_ADDEND_SLEB: " - << "Addend=" << Addend << "\n"); + dbgs() << "BIND_OPCODE_SET_ADDEND_SLEB: " + << "Addend=" << Addend << "\n"); break; case MachO::BIND_OPCODE_SET_SEGMENT_AND_OFFSET_ULEB: SegmentIndex = ImmValue; @@ -3343,10 +3353,10 @@ void MachOBindEntry::moveNext() { } DEBUG_WITH_TYPE( "mach-o-bind", - llvm::dbgs() << "BIND_OPCODE_SET_SEGMENT_AND_OFFSET_ULEB: " - << "SegmentIndex=" << SegmentIndex << ", " - << format("SegmentOffset=0x%06X", SegmentOffset) - << "\n"); + dbgs() << "BIND_OPCODE_SET_SEGMENT_AND_OFFSET_ULEB: " + << "SegmentIndex=" << SegmentIndex << ", " + << format("SegmentOffset=0x%06X", SegmentOffset) + << "\n"); break; case MachO::BIND_OPCODE_ADD_ADDR_ULEB: SegmentOffset += readULEB128(&error); @@ -3366,9 +3376,9 @@ void MachOBindEntry::moveNext() { return; } DEBUG_WITH_TYPE("mach-o-bind", - llvm::dbgs() << "BIND_OPCODE_ADD_ADDR_ULEB: " - << format("SegmentOffset=0x%06X", - SegmentOffset) << "\n"); + dbgs() << "BIND_OPCODE_ADD_ADDR_ULEB: " + << format("SegmentOffset=0x%06X", + SegmentOffset) << "\n"); break; case MachO::BIND_OPCODE_DO_BIND: AdvanceAmount = PointerSize; @@ -3395,9 +3405,9 @@ void MachOBindEntry::moveNext() { return; } DEBUG_WITH_TYPE("mach-o-bind", - 
llvm::dbgs() << "BIND_OPCODE_DO_BIND: " - << format("SegmentOffset=0x%06X", - SegmentOffset) << "\n"); + dbgs() << "BIND_OPCODE_DO_BIND: " + << format("SegmentOffset=0x%06X", + SegmentOffset) << "\n"); return; case MachO::BIND_OPCODE_DO_BIND_ADD_ADDR_ULEB: if (TableKind == Kind::Lazy) { @@ -3452,11 +3462,11 @@ void MachOBindEntry::moveNext() { RemainingLoopCount = 0; DEBUG_WITH_TYPE( "mach-o-bind", - llvm::dbgs() << "BIND_OPCODE_DO_BIND_ADD_ADDR_ULEB: " - << format("SegmentOffset=0x%06X", SegmentOffset) - << ", AdvanceAmount=" << AdvanceAmount - << ", RemainingLoopCount=" << RemainingLoopCount - << "\n"); + dbgs() << "BIND_OPCODE_DO_BIND_ADD_ADDR_ULEB: " + << format("SegmentOffset=0x%06X", SegmentOffset) + << ", AdvanceAmount=" << AdvanceAmount + << ", RemainingLoopCount=" << RemainingLoopCount + << "\n"); return; case MachO::BIND_OPCODE_DO_BIND_ADD_ADDR_IMM_SCALED: if (TableKind == Kind::Lazy) { @@ -3501,10 +3511,9 @@ void MachOBindEntry::moveNext() { return; } DEBUG_WITH_TYPE("mach-o-bind", - llvm::dbgs() + dbgs() << "BIND_OPCODE_DO_BIND_ADD_ADDR_IMM_SCALED: " - << format("SegmentOffset=0x%06X", - SegmentOffset) << "\n"); + << format("SegmentOffset=0x%06X", SegmentOffset) << "\n"); return; case MachO::BIND_OPCODE_DO_BIND_ULEB_TIMES_SKIPPING_ULEB: if (TableKind == Kind::Lazy) { @@ -3568,11 +3577,11 @@ void MachOBindEntry::moveNext() { } DEBUG_WITH_TYPE( "mach-o-bind", - llvm::dbgs() << "BIND_OPCODE_DO_BIND_ULEB_TIMES_SKIPPING_ULEB: " - << format("SegmentOffset=0x%06X", SegmentOffset) - << ", AdvanceAmount=" << AdvanceAmount - << ", RemainingLoopCount=" << RemainingLoopCount - << "\n"); + dbgs() << "BIND_OPCODE_DO_BIND_ULEB_TIMES_SKIPPING_ULEB: " + << format("SegmentOffset=0x%06X", SegmentOffset) + << ", AdvanceAmount=" << AdvanceAmount + << ", RemainingLoopCount=" << RemainingLoopCount + << "\n"); return; default: *E = malformedError("bad bind info (bad opcode value 0x" + diff --git a/lib/Object/ModuleSummaryIndexObjectFile.cpp b/lib/Object/ModuleSummaryIndexObjectFile.cpp index de1ddab..91f93a4 100644 --- a/lib/Object/ModuleSummaryIndexObjectFile.cpp +++ b/lib/Object/ModuleSummaryIndexObjectFile.cpp @@ -1,4 +1,4 @@ -//===- ModuleSummaryIndexObjectFile.cpp - Summary index file implementation ==// +//==- ModuleSummaryIndexObjectFile.cpp - Summary index file implementation -==// // // The LLVM Compiler Infrastructure // @@ -11,29 +11,38 @@ // //===----------------------------------------------------------------------===// -#include "llvm/Object/ModuleSummaryIndexObjectFile.h" #include "llvm/ADT/STLExtras.h" +#include "llvm/ADT/StringRef.h" #include "llvm/Bitcode/BitcodeReader.h" #include "llvm/IR/ModuleSummaryIndex.h" -#include "llvm/MC/MCStreamer.h" +#include "llvm/Object/Binary.h" +#include "llvm/Object/Error.h" +#include "llvm/Object/ModuleSummaryIndexObjectFile.h" #include "llvm/Object/ObjectFile.h" +#include "llvm/Support/CommandLine.h" +#include "llvm/Support/Error.h" +#include "llvm/Support/ErrorOr.h" +#include "llvm/Support/FileSystem.h" #include "llvm/Support/MemoryBuffer.h" -#include "llvm/Support/raw_ostream.h" +#include +#include +#include + using namespace llvm; using namespace object; -static llvm::cl::opt IgnoreEmptyThinLTOIndexFile( - "ignore-empty-index-file", llvm::cl::ZeroOrMore, - llvm::cl::desc( +static cl::opt IgnoreEmptyThinLTOIndexFile( + "ignore-empty-index-file", cl::ZeroOrMore, + cl::desc( "Ignore an empty index file and perform non-ThinLTO compilation"), - llvm::cl::init(false)); + cl::init(false)); ModuleSummaryIndexObjectFile::ModuleSummaryIndexObjectFile( 
MemoryBufferRef Object, std::unique_ptr I) : SymbolicFile(Binary::ID_ModuleSummaryIndex, Object), Index(std::move(I)) { } -ModuleSummaryIndexObjectFile::~ModuleSummaryIndexObjectFile() {} +ModuleSummaryIndexObjectFile::~ModuleSummaryIndexObjectFile() = default; std::unique_ptr ModuleSummaryIndexObjectFile::takeIndex() { return std::move(Index); diff --git a/lib/Object/ModuleSymbolTable.cpp b/lib/Object/ModuleSymbolTable.cpp index 9a935d8..a5b4272 100644 --- a/lib/Object/ModuleSymbolTable.cpp +++ b/lib/Object/ModuleSymbolTable.cpp @@ -1,4 +1,4 @@ -//===- ModuleSymbolTable.cpp - symbol table for in-memory IR ----*- C++ -*-===// +//===- ModuleSymbolTable.cpp - symbol table for in-memory IR --------------===// // // The LLVM Compiler Infrastructure // @@ -13,27 +13,45 @@ // //===----------------------------------------------------------------------===// -#include "llvm/Object/IRObjectFile.h" #include "RecordStreamer.h" +#include "llvm/ADT/SmallString.h" #include "llvm/ADT/STLExtras.h" -#include "llvm/Bitcode/BitcodeReader.h" -#include "llvm/IR/GVMaterializer.h" -#include "llvm/IR/LLVMContext.h" +#include "llvm/ADT/StringMap.h" +#include "llvm/ADT/StringRef.h" +#include "llvm/ADT/Triple.h" +#include "llvm/IR/Function.h" +#include "llvm/IR/GlobalAlias.h" +#include "llvm/IR/GlobalValue.h" +#include "llvm/IR/GlobalVariable.h" #include "llvm/IR/Mangler.h" #include "llvm/IR/Module.h" #include "llvm/MC/MCAsmInfo.h" #include "llvm/MC/MCContext.h" +#include "llvm/MC/MCDirectives.h" #include "llvm/MC/MCInstrInfo.h" #include "llvm/MC/MCObjectFileInfo.h" #include "llvm/MC/MCParser/MCAsmParser.h" #include "llvm/MC/MCParser/MCTargetAsmParser.h" #include "llvm/MC/MCRegisterInfo.h" #include "llvm/MC/MCSubtargetInfo.h" -#include "llvm/Object/ObjectFile.h" +#include "llvm/MC/MCSymbol.h" +#include "llvm/MC/MCTargetOptions.h" +#include "llvm/Object/ModuleSymbolTable.h" +#include "llvm/Object/SymbolicFile.h" +#include "llvm/Support/Casting.h" +#include "llvm/Support/CodeGen.h" +#include "llvm/Support/ErrorHandling.h" #include "llvm/Support/MemoryBuffer.h" +#include "llvm/Support/raw_ostream.h" +#include "llvm/Support/SMLoc.h" #include "llvm/Support/SourceMgr.h" #include "llvm/Support/TargetRegistry.h" -#include "llvm/Support/raw_ostream.h" +#include +#include +#include +#include +#include + using namespace llvm; using namespace object; diff --git a/lib/Object/RecordStreamer.cpp b/lib/Object/RecordStreamer.cpp index c9c2745..e94e9cf 100644 --- a/lib/Object/RecordStreamer.cpp +++ b/lib/Object/RecordStreamer.cpp @@ -9,6 +9,7 @@ #include "RecordStreamer.h" #include "llvm/MC/MCSymbol.h" + using namespace llvm; void RecordStreamer::markDefined(const MCSymbol &Symbol) { @@ -69,14 +70,14 @@ void RecordStreamer::markUsed(const MCSymbol &Symbol) { void RecordStreamer::visitUsedSymbol(const MCSymbol &Sym) { markUsed(Sym); } +RecordStreamer::RecordStreamer(MCContext &Context) : MCStreamer(Context) {} + RecordStreamer::const_iterator RecordStreamer::begin() { return Symbols.begin(); } RecordStreamer::const_iterator RecordStreamer::end() { return Symbols.end(); } -RecordStreamer::RecordStreamer(MCContext &Context) : MCStreamer(Context) {} - void RecordStreamer::EmitInstruction(const MCInst &Inst, const MCSubtargetInfo &STI, bool) { MCStreamer::EmitInstruction(Inst, STI); diff --git a/lib/Object/RecordStreamer.h b/lib/Object/RecordStreamer.h index a845ecd..4d11909 100644 --- a/lib/Object/RecordStreamer.h +++ b/lib/Object/RecordStreamer.h @@ -1,4 +1,4 @@ -//===-- RecordStreamer.h - Record asm defined and used symbols ---*- C++ 
-*===// +//===- RecordStreamer.h - Record asm defined and used symbols ---*- C++ -*-===// // // The LLVM Compiler Infrastructure // @@ -10,9 +10,16 @@ #ifndef LLVM_LIB_OBJECT_RECORDSTREAMER_H #define LLVM_LIB_OBJECT_RECORDSTREAMER_H +#include "llvm/ADT/DenseMap.h" +#include "llvm/ADT/StringMap.h" +#include "llvm/MC/MCDirectives.h" #include "llvm/MC/MCStreamer.h" +#include "llvm/MC/MCSymbol.h" +#include "llvm/Support/SMLoc.h" +#include namespace llvm { + class RecordStreamer : public MCStreamer { public: enum State { NeverSeen, Global, Defined, DefinedGlobal, DefinedWeak, Used, @@ -24,16 +31,19 @@ private: // their symbol binding after parsing complete. This maps from each // aliasee to its list of aliases. DenseMap> SymverAliasMap; + void markDefined(const MCSymbol &Symbol); void markGlobal(const MCSymbol &Symbol, MCSymbolAttr Attribute); void markUsed(const MCSymbol &Symbol); void visitUsedSymbol(const MCSymbol &Sym) override; public: - typedef StringMap::const_iterator const_iterator; + RecordStreamer(MCContext &Context); + + using const_iterator = StringMap::const_iterator; + const_iterator begin(); const_iterator end(); - RecordStreamer(MCContext &Context); void EmitInstruction(const MCInst &Inst, const MCSubtargetInfo &STI, bool) override; void EmitLabel(MCSymbol *Symbol, SMLoc Loc = SMLoc()) override; @@ -50,6 +60,7 @@ public: DenseMap> &symverAliases() { return SymverAliasMap; } + /// Get the state recorded for the given symbol. State getSymbolState(const MCSymbol *Sym) { auto SI = Symbols.find(Sym->getName()); @@ -58,5 +69,7 @@ public: return SI->second; } }; -} -#endif + +} // end namespace llvm + +#endif // LLVM_LIB_OBJECT_RECORDSTREAMER_H diff --git a/lib/Object/WasmObjectFile.cpp b/lib/Object/WasmObjectFile.cpp index fc1dca3..9f3486e 100644 --- a/lib/Object/WasmObjectFile.cpp +++ b/lib/Object/WasmObjectFile.cpp @@ -11,6 +11,7 @@ #include "llvm/ADT/STLExtras.h" #include "llvm/ADT/StringRef.h" #include "llvm/ADT/Triple.h" +#include "llvm/MC/SubtargetFeature.h" #include "llvm/Object/Binary.h" #include "llvm/Object/Error.h" #include "llvm/Object/ObjectFile.h" @@ -22,7 +23,9 @@ #include "llvm/Support/LEB128.h" #include "llvm/Support/Wasm.h" #include +#include #include +#include #include using namespace llvm; @@ -141,7 +144,7 @@ static Error readInitExpr(wasm::WasmInitExpr &Expr, const uint8_t *&Ptr) { Expr.Value.Float64 = readFloat64(Ptr); break; case wasm::WASM_OPCODE_GET_GLOBAL: - Expr.Value.Global = readUint32(Ptr); + Expr.Value.Global = readULEB128(Ptr); break; default: return make_error("Invalid opcode in init_expr", @@ -180,7 +183,7 @@ static Error readSection(WasmSection &Section, const uint8_t *&Ptr, } WasmObjectFile::WasmObjectFile(MemoryBufferRef Buffer, Error &Err) - : ObjectFile(Binary::ID_Wasm, Buffer), StartFunction(-1) { + : ObjectFile(Binary::ID_Wasm, Buffer) { ErrorAsOutParameter ErrAsOutParam(&Err); Header.Magic = getData().substr(0, 4); if (Header.Magic != StringRef("\0asm", 4)) { @@ -252,7 +255,7 @@ Error WasmObjectFile::parseNameSection(const uint8_t *Ptr, const uint8_t *End) { while (Count--) { /*uint32_t Index =*/readVaruint32(Ptr); StringRef Name = readString(Ptr); - if (Name.size()) + if (!Name.empty()) Symbols.emplace_back(Name, WasmSymbol::SymbolType::DEBUG_FUNCTION_NAME); } @@ -313,11 +316,12 @@ Error WasmObjectFile::parseRelocSection(StringRef Name, const uint8_t *Ptr, case wasm::R_WEBASSEMBLY_FUNCTION_INDEX_LEB: case wasm::R_WEBASSEMBLY_TABLE_INDEX_SLEB: case wasm::R_WEBASSEMBLY_TABLE_INDEX_I32: + case wasm::R_WEBASSEMBLY_TYPE_INDEX_LEB: break; case 
wasm::R_WEBASSEMBLY_GLOBAL_ADDR_LEB: case wasm::R_WEBASSEMBLY_GLOBAL_ADDR_SLEB: case wasm::R_WEBASSEMBLY_GLOBAL_ADDR_I32: - Reloc.Addend = readVaruint32(Ptr); + Reloc.Addend = readVarint32(Ptr); break; default: return make_error("Bad relocation type", diff --git a/lib/ObjectYAML/WasmYAML.cpp b/lib/ObjectYAML/WasmYAML.cpp index 3e1bed1..9b1ff7e 100644 --- a/lib/ObjectYAML/WasmYAML.cpp +++ b/lib/ObjectYAML/WasmYAML.cpp @@ -223,7 +223,7 @@ void MappingTraits::mapping( IO.mapRequired("Type", Relocation.Type); IO.mapRequired("Index", Relocation.Index); IO.mapRequired("Offset", Relocation.Offset); - IO.mapRequired("Addend", Relocation.Addend); + IO.mapOptional("Addend", Relocation.Addend, 0); } void MappingTraits::mapping( @@ -294,6 +294,9 @@ void MappingTraits::mapping(IO &IO, case wasm::WASM_OPCODE_F64_CONST: IO.mapRequired("Value", Expr.Value.Float64); break; + case wasm::WASM_OPCODE_GET_GLOBAL: + IO.mapRequired("Index", Expr.Value.Global); + break; } } diff --git a/lib/Passes/PassBuilder.cpp b/lib/Passes/PassBuilder.cpp index 0421946..55ac254 100644 --- a/lib/Passes/PassBuilder.cpp +++ b/lib/Passes/PassBuilder.cpp @@ -624,6 +624,10 @@ PassBuilder::buildPerModuleDefaultPipeline(OptimizationLevel Level, // And finally clean up LCSSA form before generating code. OptimizePM.addPass(InstSimplifierPass()); + // LoopSink (and other loop passes since the last simplifyCFG) might have + // resulted in single-entry-single-exit or empty blocks. Clean up the CFG. + OptimizePM.addPass(SimplifyCFGPass()); + // Add the core optimizing pipeline. MPM.addPass(createModuleToFunctionPassAdaptor(std::move(OptimizePM))); diff --git a/lib/Support/APFloat.cpp b/lib/Support/APFloat.cpp index c4c892f..e1e2c22 100644 --- a/lib/Support/APFloat.cpp +++ b/lib/Support/APFloat.cpp @@ -3393,7 +3393,7 @@ namespace { } void IEEEFloat::toString(SmallVectorImpl &Str, unsigned FormatPrecision, - unsigned FormatMaxPadding) const { + unsigned FormatMaxPadding, bool TruncateZero) const { switch (category) { case fcInfinity: if (isNegative()) @@ -3407,9 +3407,16 @@ void IEEEFloat::toString(SmallVectorImpl &Str, unsigned FormatPrecision, if (isNegative()) Str.push_back('-'); - if (!FormatMaxPadding) - append(Str, "0.0E+0"); - else + if (!FormatMaxPadding) { + if (TruncateZero) + append(Str, "0.0E+0"); + else { + append(Str, "0.0"); + if (FormatPrecision > 1) + Str.append(FormatPrecision - 1, '0'); + append(Str, "e+00"); + } + } else Str.push_back('0'); return; @@ -3543,12 +3550,16 @@ void IEEEFloat::toString(SmallVectorImpl &Str, unsigned FormatPrecision, Str.push_back(buffer[NDigits-1]); Str.push_back('.'); - if (NDigits == 1) + if (NDigits == 1 && TruncateZero) Str.push_back('0'); else for (unsigned I = 1; I != NDigits; ++I) Str.push_back(buffer[NDigits-1-I]); - Str.push_back('E'); + // Fill with zeros up to FormatPrecision. + if (!TruncateZero && FormatPrecision > NDigits - 1) + Str.append(FormatPrecision - NDigits + 1, '0'); + // For !TruncateZero we use lower 'e'. + Str.push_back(TruncateZero ? 'E' : 'e'); Str.push_back(exp >= 0 ? '+' : '-'); if (exp < 0) exp = -exp; @@ -3557,6 +3568,9 @@ void IEEEFloat::toString(SmallVectorImpl &Str, unsigned FormatPrecision, expbuf.push_back((char) ('0' + (exp % 10))); exp /= 10; } while (exp); + // Exponent always at least two digits if we do not truncate zeros. 
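// Editor's note (illustration only, not part of the patch): with the default
// TruncateZero=true an exponent of 5 is still printed as "E+5", while
// TruncateZero=false produces the printf-%e-like "e+05", and the zero fill
// above pads the fraction out to exactly FormatPrecision digits.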
+ if (!TruncateZero && expbuf.size() < 2) + expbuf.push_back('0'); for (unsigned I = 0, E = expbuf.size(); I != E; ++I) Str.push_back(expbuf[E-1-I]); return; @@ -4362,10 +4376,11 @@ bool DoubleAPFloat::isInteger() const { void DoubleAPFloat::toString(SmallVectorImpl &Str, unsigned FormatPrecision, - unsigned FormatMaxPadding) const { + unsigned FormatMaxPadding, + bool TruncateZero) const { assert(Semantics == &semPPCDoubleDouble && "Unexpected Semantics"); APFloat(semPPCDoubleDoubleLegacy, bitcastToAPInt()) - .toString(Str, FormatPrecision, FormatMaxPadding); + .toString(Str, FormatPrecision, FormatMaxPadding, TruncateZero); } bool DoubleAPFloat::getExactInverse(APFloat *inv) const { diff --git a/lib/Support/APInt.cpp b/lib/Support/APInt.cpp index 2d049a1..1227d75 100644 --- a/lib/Support/APInt.cpp +++ b/lib/Support/APInt.cpp @@ -81,7 +81,7 @@ void APInt::initSlowCase(uint64_t val, bool isSigned) { pVal[0] = val; if (isSigned && int64_t(val) < 0) for (unsigned i = 1; i < getNumWords(); ++i) - pVal[i] = -1ULL; + pVal[i] = WORD_MAX; clearUnusedBits(); } @@ -364,44 +364,20 @@ bool APInt::EqualSlowCase(const APInt& RHS) const { return std::equal(pVal, pVal + getNumWords(), RHS.pVal); } -bool APInt::ult(const APInt& RHS) const { +int APInt::compare(const APInt& RHS) const { assert(BitWidth == RHS.BitWidth && "Bit widths must be same for comparison"); if (isSingleWord()) - return VAL < RHS.VAL; + return VAL < RHS.VAL ? -1 : VAL > RHS.VAL; - // Get active bit length of both operands - unsigned n1 = getActiveBits(); - unsigned n2 = RHS.getActiveBits(); - - // If magnitude of LHS is less than RHS, return true. - if (n1 < n2) - return true; - - // If magnitude of RHS is greater than LHS, return false. - if (n2 < n1) - return false; - - // If they both fit in a word, just compare the low order word - if (n1 <= APINT_BITS_PER_WORD && n2 <= APINT_BITS_PER_WORD) - return pVal[0] < RHS.pVal[0]; - - // Otherwise, compare all words - unsigned topWord = whichWord(std::max(n1,n2)-1); - for (int i = topWord; i >= 0; --i) { - if (pVal[i] > RHS.pVal[i]) - return false; - if (pVal[i] < RHS.pVal[i]) - return true; - } - return false; + return tcCompare(pVal, RHS.pVal, getNumWords()); } -bool APInt::slt(const APInt& RHS) const { +int APInt::compareSigned(const APInt& RHS) const { assert(BitWidth == RHS.BitWidth && "Bit widths must be same for comparison"); if (isSingleWord()) { int64_t lhsSext = SignExtend64(VAL, BitWidth); int64_t rhsSext = SignExtend64(RHS.VAL, BitWidth); - return lhsSext < rhsSext; + return lhsSext < rhsSext ? -1 : lhsSext > rhsSext; } bool lhsNeg = isNegative(); @@ -409,11 +385,11 @@ bool APInt::slt(const APInt& RHS) const { // If the sign bits don't match, then (LHS < RHS) if LHS is negative if (lhsNeg != rhsNeg) - return lhsNeg; + return lhsNeg ? -1 : 1; // Otherwise we can just use an unsigned comparison, because even negative // numbers compare correctly this way if both have the same signed-ness. - return ult(RHS); + return tcCompare(pVal, RHS.pVal, getNumWords()); } void APInt::setBit(unsigned bitPosition) { @@ -428,13 +404,13 @@ void APInt::setBitsSlowCase(unsigned loBit, unsigned hiBit) { unsigned hiWord = whichWord(hiBit); // Create an initial mask for the low word with zeros below loBit. - uint64_t loMask = UINT64_MAX << whichBit(loBit); + uint64_t loMask = WORD_MAX << whichBit(loBit); // If hiBit is not aligned, we need a high mask. unsigned hiShiftAmt = whichBit(hiBit); if (hiShiftAmt != 0) { // Create a high mask with zeros above hiBit. 
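// Editor's illustration (hypothetical values, not part of the patch): for
// loBit = 4, hiBit = 12 inside a single 64-bit word, loMask becomes
// 0xFFFFFFFFFFFFFFF0 and hiMask becomes 0xFFF; ANDing them, as done just
// below for the one-word case, leaves 0xFF0 and so sets exactly bits [4, 12).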
- uint64_t hiMask = UINT64_MAX >> (APINT_BITS_PER_WORD - hiShiftAmt); + uint64_t hiMask = WORD_MAX >> (APINT_BITS_PER_WORD - hiShiftAmt); // If loWord and hiWord are equal, then we combine the masks. Otherwise, // set the bits in hiWord. if (hiWord == loWord) @@ -447,7 +423,7 @@ void APInt::setBitsSlowCase(unsigned loBit, unsigned hiBit) { // Fill any words between loWord and hiWord with all ones. for (unsigned word = loWord + 1; word < hiWord; ++word) - pVal[word] = UINT64_MAX; + pVal[word] = WORD_MAX; } /// Set the given bit to 0 whose position is given as "bitPosition". @@ -487,7 +463,7 @@ void APInt::insertBits(const APInt &subBits, unsigned bitPosition) { // Single word result can be done as a direct bitmask. if (isSingleWord()) { - uint64_t mask = UINT64_MAX >> (APINT_BITS_PER_WORD - subBitWidth); + uint64_t mask = WORD_MAX >> (APINT_BITS_PER_WORD - subBitWidth); VAL &= ~(mask << bitPosition); VAL |= (subBits.VAL << bitPosition); return; @@ -499,7 +475,7 @@ void APInt::insertBits(const APInt &subBits, unsigned bitPosition) { // Insertion within a single word can be done as a direct bitmask. if (loWord == hi1Word) { - uint64_t mask = UINT64_MAX >> (APINT_BITS_PER_WORD - subBitWidth); + uint64_t mask = WORD_MAX >> (APINT_BITS_PER_WORD - subBitWidth); pVal[loWord] &= ~(mask << loBit); pVal[loWord] |= (subBits.VAL << loBit); return; @@ -515,7 +491,7 @@ void APInt::insertBits(const APInt &subBits, unsigned bitPosition) { // Mask+insert remaining bits. unsigned remainingBits = subBitWidth % APINT_BITS_PER_WORD; if (remainingBits != 0) { - uint64_t mask = UINT64_MAX >> (APINT_BITS_PER_WORD - remainingBits); + uint64_t mask = WORD_MAX >> (APINT_BITS_PER_WORD - remainingBits); pVal[hi1Word] &= ~mask; pVal[hi1Word] |= subBits.getWord(subBitWidth - 1); } @@ -682,7 +658,7 @@ unsigned APInt::countLeadingOnes() const { unsigned Count = llvm::countLeadingOnes(pVal[i] << shift); if (Count == highWordBits) { for (i--; i >= 0; --i) { - if (pVal[i] == -1ULL) + if (pVal[i] == WORD_MAX) Count += APINT_BITS_PER_WORD; else { Count += llvm::countLeadingOnes(pVal[i]); @@ -708,11 +684,12 @@ unsigned APInt::countTrailingZeros() const { unsigned APInt::countTrailingOnesSlowCase() const { unsigned Count = 0; unsigned i = 0; - for (; i < getNumWords() && pVal[i] == -1ULL; ++i) + for (; i < getNumWords() && pVal[i] == WORD_MAX; ++i) Count += APINT_BITS_PER_WORD; if (i < getNumWords()) Count += llvm::countTrailingOnes(pVal[i]); - return std::min(Count, BitWidth); + assert(Count <= BitWidth); + return Count; } unsigned APInt::countPopulationSlowCase() const { @@ -962,43 +939,26 @@ APInt APInt::trunc(unsigned width) const { } // Sign extend to a new width. -APInt APInt::sext(unsigned width) const { - assert(width > BitWidth && "Invalid APInt SignExtend request"); +APInt APInt::sext(unsigned Width) const { + assert(Width > BitWidth && "Invalid APInt SignExtend request"); - if (width <= APINT_BITS_PER_WORD) { - uint64_t val = VAL << (APINT_BITS_PER_WORD - BitWidth); - val = (int64_t)val >> (width - BitWidth); - return APInt(width, val >> (APINT_BITS_PER_WORD - width)); - } - - APInt Result(getMemory(getNumWords(width)), width); - - // Copy full words. - unsigned i; - uint64_t word = 0; - for (i = 0; i != BitWidth / APINT_BITS_PER_WORD; i++) { - word = getRawData()[i]; - Result.pVal[i] = word; - } + if (Width <= APINT_BITS_PER_WORD) + return APInt(Width, SignExtend64(VAL, BitWidth)); - // Read and sign-extend any partial word. 
- unsigned bits = (0 - BitWidth) % APINT_BITS_PER_WORD; - if (bits != 0) - word = (int64_t)getRawData()[i] << bits >> bits; - else - word = (int64_t)word >> (APINT_BITS_PER_WORD - 1); + APInt Result(getMemory(getNumWords(Width)), Width); - // Write remaining full words. - for (; i != width / APINT_BITS_PER_WORD; i++) { - Result.pVal[i] = word; - word = (int64_t)word >> (APINT_BITS_PER_WORD - 1); - } + // Copy words. + std::memcpy(Result.pVal, getRawData(), getNumWords() * APINT_WORD_SIZE); - // Write any partial word. - bits = (0 - width) % APINT_BITS_PER_WORD; - if (bits != 0) - Result.pVal[i] = word << bits >> bits; + // Sign extend the last word since there may be unused bits in the input. + Result.pVal[getNumWords() - 1] = + SignExtend64(Result.pVal[getNumWords() - 1], + ((BitWidth - 1) % APINT_BITS_PER_WORD) + 1); + // Fill with sign bits. + std::memset(Result.pVal + getNumWords(), isNegative() ? -1 : 0, + (Result.getNumWords() - getNumWords()) * APINT_WORD_SIZE); + Result.clearUnusedBits(); return Result; } @@ -1012,12 +972,11 @@ APInt APInt::zext(unsigned width) const { APInt Result(getMemory(getNumWords(width)), width); // Copy words. - unsigned i; - for (i = 0; i != getNumWords(); i++) - Result.pVal[i] = getRawData()[i]; + std::memcpy(Result.pVal, getRawData(), getNumWords() * APINT_WORD_SIZE); // Zero remaining words. - memset(&Result.pVal[i], 0, (Result.getNumWords() - i) * APINT_WORD_SIZE); + std::memset(Result.pVal + getNumWords(), 0, + (Result.getNumWords() - getNumWords()) * APINT_WORD_SIZE); return Result; } @@ -1052,89 +1011,51 @@ APInt APInt::sextOrSelf(unsigned width) const { /// Arithmetic right-shift this APInt by shiftAmt. /// @brief Arithmetic right-shift function. -APInt APInt::ashr(const APInt &shiftAmt) const { - return ashr((unsigned)shiftAmt.getLimitedValue(BitWidth)); +void APInt::ashrInPlace(const APInt &shiftAmt) { + ashrInPlace((unsigned)shiftAmt.getLimitedValue(BitWidth)); } /// Arithmetic right-shift this APInt by shiftAmt. /// @brief Arithmetic right-shift function. -APInt APInt::ashr(unsigned shiftAmt) const { - assert(shiftAmt <= BitWidth && "Invalid shift amount"); - // Handle a degenerate case - if (shiftAmt == 0) - return *this; - - // Handle single word shifts with built-in ashr - if (isSingleWord()) { - if (shiftAmt == BitWidth) - return APInt(BitWidth, 0); // undefined - return APInt(BitWidth, SignExtend64(VAL, BitWidth) >> shiftAmt); - } +void APInt::ashrSlowCase(unsigned ShiftAmt) { + // Don't bother performing a no-op shift. + if (!ShiftAmt) + return; - // If all the bits were shifted out, the result is, technically, undefined. - // We return -1 if it was negative, 0 otherwise. We check this early to avoid - // issues in the algorithm below. - if (shiftAmt == BitWidth) { - if (isNegative()) - return APInt(BitWidth, -1ULL, true); - else - return APInt(BitWidth, 0); - } - - // Create some space for the result. - uint64_t * val = new uint64_t[getNumWords()]; - - // Compute some values needed by the following shift algorithms - unsigned wordShift = shiftAmt % APINT_BITS_PER_WORD; // bits to shift per word - unsigned offset = shiftAmt / APINT_BITS_PER_WORD; // word offset for shift - unsigned breakWord = getNumWords() - 1 - offset; // last word affected - unsigned bitsInWord = whichBit(BitWidth); // how many bits in last word? 
- if (bitsInWord == 0) - bitsInWord = APINT_BITS_PER_WORD; - - // If we are shifting whole words, just move whole words - if (wordShift == 0) { - // Move the words containing significant bits - for (unsigned i = 0; i <= breakWord; ++i) - val[i] = pVal[i+offset]; // move whole word - - // Adjust the top significant word for sign bit fill, if negative - if (isNegative()) - if (bitsInWord < APINT_BITS_PER_WORD) - val[breakWord] |= ~0ULL << bitsInWord; // set high bits - } else { - // Shift the low order words - for (unsigned i = 0; i < breakWord; ++i) { - // This combines the shifted corresponding word with the low bits from - // the next word (shifted into this word's high bits). - val[i] = (pVal[i+offset] >> wordShift) | - (pVal[i+offset+1] << (APINT_BITS_PER_WORD - wordShift)); - } + // Save the original sign bit for later. + bool Negative = isNegative(); - // Shift the break word. In this case there are no bits from the next word - // to include in this word. - val[breakWord] = pVal[breakWord+offset] >> wordShift; - - // Deal with sign extension in the break word, and possibly the word before - // it. - if (isNegative()) { - if (wordShift > bitsInWord) { - if (breakWord > 0) - val[breakWord-1] |= - ~0ULL << (APINT_BITS_PER_WORD - (wordShift - bitsInWord)); - val[breakWord] |= ~0ULL; - } else - val[breakWord] |= (~0ULL << (bitsInWord - wordShift)); + // WordShift is the inter-part shift; BitShift is is intra-part shift. + unsigned WordShift = ShiftAmt / APINT_BITS_PER_WORD; + unsigned BitShift = ShiftAmt % APINT_BITS_PER_WORD; + + unsigned WordsToMove = getNumWords() - WordShift; + if (WordsToMove != 0) { + // Sign extend the last word to fill in the unused bits. + pVal[getNumWords() - 1] = SignExtend64( + pVal[getNumWords() - 1], ((BitWidth - 1) % APINT_BITS_PER_WORD) + 1); + + // Fastpath for moving by whole words. + if (BitShift == 0) { + std::memmove(pVal, pVal + WordShift, WordsToMove * APINT_WORD_SIZE); + } else { + // Move the words containing significant bits. + for (unsigned i = 0; i != WordsToMove - 1; ++i) + pVal[i] = (pVal[i + WordShift] >> BitShift) | + (pVal[i + WordShift + 1] << (APINT_BITS_PER_WORD - BitShift)); + + // Handle the last word which has no high bits to copy. + pVal[WordsToMove - 1] = pVal[WordShift + WordsToMove - 1] >> BitShift; + // Sign extend one more time. + pVal[WordsToMove - 1] = + SignExtend64(pVal[WordsToMove - 1], APINT_BITS_PER_WORD - BitShift); } } - // Remaining words are 0 or -1, just assign them. - uint64_t fillValue = (isNegative() ? -1ULL : 0); - for (unsigned i = breakWord+1; i < getNumWords(); ++i) - val[i] = fillValue; - APInt Result(val, BitWidth); - Result.clearUnusedBits(); - return Result; + // Fill in the remainder based on the original sign. + std::memset(pVal + WordsToMove, Negative ? -1 : 0, + WordShift * APINT_WORD_SIZE); + clearUnusedBits(); } /// Logical right-shift this APInt by shiftAmt. @@ -2608,7 +2529,7 @@ void APInt::tcShiftLeft(WordType *Dst, unsigned Words, unsigned Count) { if (!Count) return; - /* WordShift is the inter-part shift; BitShift is is intra-part shift. */ + // WordShift is the inter-part shift; BitShift is the intra-part shift. unsigned WordShift = std::min(Count / APINT_BITS_PER_WORD, Words); unsigned BitShift = Count % APINT_BITS_PER_WORD; @@ -2635,7 +2556,7 @@ void APInt::tcShiftRight(WordType *Dst, unsigned Words, unsigned Count) { if (!Count) return; - // WordShift is the inter-part shift; BitShift is is intra-part shift. + // WordShift is the inter-part shift; BitShift is the intra-part shift. 
unsigned WordShift = std::min(Count / APINT_BITS_PER_WORD, Words); unsigned BitShift = Count % APINT_BITS_PER_WORD; @@ -2684,10 +2605,8 @@ int APInt::tcCompare(const WordType *lhs, const WordType *rhs, unsigned parts) { while (parts) { parts--; - if (lhs[parts] == rhs[parts]) - continue; - - return (lhs[parts] > rhs[parts]) ? 1 : -1; + if (lhs[parts] != rhs[parts]) + return (lhs[parts] > rhs[parts]) ? 1 : -1; } return 0; diff --git a/lib/Support/DynamicLibrary.cpp b/lib/Support/DynamicLibrary.cpp index 92ce618..22fb3f2 100644 --- a/lib/Support/DynamicLibrary.cpp +++ b/lib/Support/DynamicLibrary.cpp @@ -193,4 +193,3 @@ void *LLVMSearchForAddressOfSymbol(const char *symbolName) { void LLVMAddSymbol(const char *symbolName, void *symbolValue) { return llvm::sys::DynamicLibrary::AddSymbol(symbolName, symbolValue); } - diff --git a/lib/Support/Triple.cpp b/lib/Support/Triple.cpp index 64d5977..f3a654d 100644 --- a/lib/Support/Triple.cpp +++ b/lib/Support/Triple.cpp @@ -161,6 +161,7 @@ StringRef Triple::getVendorTypeName(VendorType Kind) { case Myriad: return "myriad"; case AMD: return "amd"; case Mesa: return "mesa"; + case SUSE: return "suse"; } llvm_unreachable("Invalid VendorType!"); @@ -443,6 +444,7 @@ static Triple::VendorType parseVendor(StringRef VendorName) { .Case("myriad", Triple::Myriad) .Case("amd", Triple::AMD) .Case("mesa", Triple::Mesa) + .Case("suse", Triple::SUSE) .Default(Triple::UnknownVendor); } diff --git a/lib/Target/AArch64/AArch64ExpandPseudoInsts.cpp b/lib/Target/AArch64/AArch64ExpandPseudoInsts.cpp index d0c0956..629ad5c 100644 --- a/lib/Target/AArch64/AArch64ExpandPseudoInsts.cpp +++ b/lib/Target/AArch64/AArch64ExpandPseudoInsts.cpp @@ -942,6 +942,7 @@ bool AArch64ExpandPseudo::expandMI(MachineBasicBlock &MBB, AArch64::XZR, NextMBBI); case AArch64::CMP_SWAP_128: return expandCMP_SWAP_128(MBB, MBBI, NextMBBI); + } return false; } diff --git a/lib/Target/AArch64/AArch64FrameLowering.cpp b/lib/Target/AArch64/AArch64FrameLowering.cpp index 550174b..dc916c0 100644 --- a/lib/Target/AArch64/AArch64FrameLowering.cpp +++ b/lib/Target/AArch64/AArch64FrameLowering.cpp @@ -1125,7 +1125,7 @@ void AArch64FrameLowering::determineCalleeSaves(MachineFunction &MF, if (RegInfo->hasBasePointer(MF)) BasePointerReg = RegInfo->getBaseRegister(); - bool ExtraCSSpill = false; + unsigned ExtraCSSpill = 0; const MCPhysReg *CSRegs = RegInfo->getCalleeSavedRegs(&MF); // Figure out which callee-saved registers to save/restore. for (unsigned i = 0; CSRegs[i]; ++i) { @@ -1153,7 +1153,7 @@ void AArch64FrameLowering::determineCalleeSaves(MachineFunction &MF, SavedRegs.set(PairedReg); if (AArch64::GPR64RegClass.contains(PairedReg) && !RegInfo->isReservedReg(MF, PairedReg)) - ExtraCSSpill = true; + ExtraCSSpill = PairedReg; } } @@ -1186,8 +1186,8 @@ void AArch64FrameLowering::determineCalleeSaves(MachineFunction &MF, // register scavenging. If we already spilled an extra callee-saved register // above to keep the number of spills even, we don't need to do anything else // here. - if (BigStack && !ExtraCSSpill) { - if (UnspilledCSGPR != AArch64::NoRegister) { + if (BigStack) { + if (!ExtraCSSpill && UnspilledCSGPR != AArch64::NoRegister) { DEBUG(dbgs() << "Spilling " << PrintReg(UnspilledCSGPR, RegInfo) << " to get a scratch register.\n"); SavedRegs.set(UnspilledCSGPR); @@ -1196,15 +1196,18 @@ void AArch64FrameLowering::determineCalleeSaves(MachineFunction &MF, // store the pair. 
if (produceCompactUnwindFrame(MF)) SavedRegs.set(UnspilledCSGPRPaired); - ExtraCSSpill = true; + ExtraCSSpill = UnspilledCSGPRPaired; NumRegsSpilled = SavedRegs.count(); } // If we didn't find an extra callee-saved register to spill, create // an emergency spill slot. - if (!ExtraCSSpill) { - const TargetRegisterClass *RC = &AArch64::GPR64RegClass; - int FI = MFI.CreateStackObject(RC->getSize(), RC->getAlignment(), false); + if (!ExtraCSSpill || MF.getRegInfo().isPhysRegUsed(ExtraCSSpill)) { + const TargetRegisterInfo *TRI = MF.getSubtarget().getRegisterInfo(); + const TargetRegisterClass &RC = AArch64::GPR64RegClass; + unsigned Size = TRI->getSpillSize(RC); + unsigned Align = TRI->getSpillAlignment(RC); + int FI = MFI.CreateStackObject(Size, Align, false); RS->addScavengingFrameIndex(FI); DEBUG(dbgs() << "No available CS registers, allocated fi#" << FI << " as the emergency spill slot.\n"); diff --git a/lib/Target/AArch64/AArch64ISelLowering.cpp b/lib/Target/AArch64/AArch64ISelLowering.cpp index 4ddc951..a7c98fb 100644 --- a/lib/Target/AArch64/AArch64ISelLowering.cpp +++ b/lib/Target/AArch64/AArch64ISelLowering.cpp @@ -91,6 +91,7 @@ using namespace llvm; STATISTIC(NumTailCalls, "Number of tail calls"); STATISTIC(NumShiftInserts, "Number of vector shift inserts"); +STATISTIC(NumOptimizedImms, "Number of times immediates were optimized"); static cl::opt EnableAArch64SlrGeneration("aarch64-shift-insert-generation", cl::Hidden, @@ -105,6 +106,12 @@ cl::opt EnableAArch64ELFLocalDynamicTLSGeneration( cl::desc("Allow AArch64 Local Dynamic TLS code generation"), cl::init(false)); +static cl::opt +EnableOptimizeLogicalImm("aarch64-enable-logical-imm", cl::Hidden, + cl::desc("Enable AArch64 logical imm instruction " + "optimization"), + cl::init(true)); + /// Value type used for condition codes. static const MVT MVT_CC = MVT::i32; @@ -787,6 +794,140 @@ EVT AArch64TargetLowering::getSetCCResultType(const DataLayout &, LLVMContext &, return VT.changeVectorElementTypeToInteger(); } +static bool optimizeLogicalImm(SDValue Op, unsigned Size, uint64_t Imm, + const APInt &Demanded, + TargetLowering::TargetLoweringOpt &TLO, + unsigned NewOpc) { + uint64_t OldImm = Imm, NewImm, Enc; + uint64_t Mask = ((uint64_t)(-1LL) >> (64 - Size)), OrigMask = Mask; + + // Return if the immediate is already all zeros, all ones, a bimm32 or a + // bimm64. + if (Imm == 0 || Imm == Mask || + AArch64_AM::isLogicalImmediate(Imm & Mask, Size)) + return false; + + unsigned EltSize = Size; + uint64_t DemandedBits = Demanded.getZExtValue(); + + // Clear bits that are not demanded. + Imm &= DemandedBits; + + while (true) { + // The goal here is to set the non-demanded bits in a way that minimizes + // the number of switching between 0 and 1. In order to achieve this goal, + // we set the non-demanded bits to the value of the preceding demanded bits. + // For example, if we have an immediate 0bx10xx0x1 ('x' indicates a + // non-demanded bit), we copy bit0 (1) to the least significant 'x', + // bit2 (0) to 'xx', and bit6 (1) to the most significant 'x'. + // The final result is 0b11000011. 
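// Illustrative worked example, not part of the upstream patch (function name
// hypothetical): a naive reference implementation of the goal stated in the
// comment above, specialized to 8 bits. Each non-demanded bit is copied from
// the nearest demanded bit below it, wrapping around at bit 0; the code that
// follows achieves the same effect with a rotate-and-add trick.
#include <cstdint>

static uint8_t fillNonDemandedBits(uint8_t Imm, uint8_t DemandedBits) {
  uint8_t Result = Imm & DemandedBits;
  for (unsigned i = 0; i != 8; ++i) {
    if (DemandedBits & (1u << i))
      continue;                              // demanded bits stay as-is
    for (unsigned j = 1; j != 8; ++j) {
      unsigned k = (i + 8 - j) % 8;          // scan downwards, with wrap
      if (DemandedBits & (1u << k)) {
        if (Result & (1u << k))
          Result |= (uint8_t)(1u << i);      // copy the preceding demanded bit
        break;
      }
    }
  }
  return Result;
}
// With Imm = 0b01000001 and DemandedBits = 0b01100101 (the 0bx10xx0x1 case
// above), this returns 0b11000011, matching the final result in the comment.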
+ uint64_t NonDemandedBits = ~DemandedBits; + uint64_t InvertedImm = ~Imm & DemandedBits; + uint64_t RotatedImm = + ((InvertedImm << 1) | (InvertedImm >> (EltSize - 1) & 1)) & + NonDemandedBits; + uint64_t Sum = RotatedImm + NonDemandedBits; + bool Carry = NonDemandedBits & ~Sum & (1ULL << (EltSize - 1)); + uint64_t Ones = (Sum + Carry) & NonDemandedBits; + NewImm = (Imm | Ones) & Mask; + + // If NewImm or its bitwise NOT is a shifted mask, it is a bitmask immediate + // or all-ones or all-zeros, in which case we can stop searching. Otherwise, + // we halve the element size and continue the search. + if (isShiftedMask_64(NewImm) || isShiftedMask_64(~(NewImm | ~Mask))) + break; + + // We cannot shrink the element size any further if it is 2-bits. + if (EltSize == 2) + return false; + + EltSize /= 2; + Mask >>= EltSize; + uint64_t Hi = Imm >> EltSize, DemandedBitsHi = DemandedBits >> EltSize; + + // Return if there is mismatch in any of the demanded bits of Imm and Hi. + if (((Imm ^ Hi) & (DemandedBits & DemandedBitsHi) & Mask) != 0) + return false; + + // Merge the upper and lower halves of Imm and DemandedBits. + Imm |= Hi; + DemandedBits |= DemandedBitsHi; + } + + ++NumOptimizedImms; + + // Replicate the element across the register width. + while (EltSize < Size) { + NewImm |= NewImm << EltSize; + EltSize *= 2; + } + + (void)OldImm; + assert(((OldImm ^ NewImm) & Demanded.getZExtValue()) == 0 && + "demanded bits should never be altered"); + assert(OldImm != NewImm && "the new imm shouldn't be equal to the old imm"); + + // Create the new constant immediate node. + EVT VT = Op.getValueType(); + SDLoc DL(Op); + + // If the new constant immediate is all-zeros or all-ones, let the target + // independent DAG combine optimize this node. + if (NewImm == 0 || NewImm == OrigMask) + return TLO.CombineTo(Op.getOperand(1), TLO.DAG.getConstant(NewImm, DL, VT)); + + // Otherwise, create a machine node so that target independent DAG combine + // doesn't undo this optimization. + Enc = AArch64_AM::encodeLogicalImmediate(NewImm, Size); + SDValue EncConst = TLO.DAG.getTargetConstant(Enc, DL, VT); + SDValue New( + TLO.DAG.getMachineNode(NewOpc, DL, VT, Op.getOperand(0), EncConst), 0); + + return TLO.CombineTo(Op, New); +} + +bool AArch64TargetLowering::targetShrinkDemandedConstant( + SDValue Op, const APInt &Demanded, TargetLoweringOpt &TLO) const { + // Delay this optimization to as late as possible. + if (!TLO.LegalOps) + return false; + + if (!EnableOptimizeLogicalImm) + return false; + + EVT VT = Op.getValueType(); + if (VT.isVector()) + return false; + + unsigned Size = VT.getSizeInBits(); + assert((Size == 32 || Size == 64) && + "i32 or i64 is expected after legalization."); + + // Exit early if we demand all bits. + if (Demanded.countPopulation() == Size) + return false; + + unsigned NewOpc; + switch (Op.getOpcode()) { + default: + return false; + case ISD::AND: + NewOpc = Size == 32 ? AArch64::ANDWri : AArch64::ANDXri; + break; + case ISD::OR: + NewOpc = Size == 32 ? AArch64::ORRWri : AArch64::ORRXri; + break; + case ISD::XOR: + NewOpc = Size == 32 ? AArch64::EORWri : AArch64::EORXri; + break; + } + ConstantSDNode *C = dyn_cast(Op.getOperand(1)); + if (!C) + return false; + uint64_t Imm = C->getZExtValue(); + return optimizeLogicalImm(Op, Size, Imm, Demanded, TLO, NewOpc); +} + /// computeKnownBitsForTargetNode - Determine which of the bits specified in /// Mask are known to be either zero or one and return them in the /// KnownZero/KnownOne bitsets. 
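// Illustrative sketch, not part of the upstream patch, assuming a fixed
// 64-bit register width: the replication loop in the hunk above doubles an
// EltSize-bit pattern until it fills the whole register.
#include <cstdint>

static uint64_t replicateElement(uint64_t Pattern, unsigned EltSize) {
  while (EltSize < 64) {
    Pattern |= Pattern << EltSize;
    EltSize *= 2;
  }
  return Pattern;
}
// replicateElement(0xC3, 8) == 0xC3C3C3C3C3C3C3C3; as in the code above,
// EltSize is a power of two and Pattern fits in the low EltSize bits.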
@@ -3418,11 +3559,75 @@ AArch64TargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv, // Other Lowering Code //===----------------------------------------------------------------------===// +SDValue AArch64TargetLowering::getTargetNode(GlobalAddressSDNode *N, EVT Ty, + SelectionDAG &DAG, + unsigned Flag) const { + return DAG.getTargetGlobalAddress(N->getGlobal(), SDLoc(N), Ty, 0, Flag); +} + +SDValue AArch64TargetLowering::getTargetNode(JumpTableSDNode *N, EVT Ty, + SelectionDAG &DAG, + unsigned Flag) const { + return DAG.getTargetJumpTable(N->getIndex(), Ty, Flag); +} + +SDValue AArch64TargetLowering::getTargetNode(ConstantPoolSDNode *N, EVT Ty, + SelectionDAG &DAG, + unsigned Flag) const { + return DAG.getTargetConstantPool(N->getConstVal(), Ty, N->getAlignment(), + N->getOffset(), Flag); +} + +SDValue AArch64TargetLowering::getTargetNode(BlockAddressSDNode* N, EVT Ty, + SelectionDAG &DAG, + unsigned Flag) const { + return DAG.getTargetBlockAddress(N->getBlockAddress(), Ty, 0, Flag); +} + +// (loadGOT sym) +template +SDValue AArch64TargetLowering::getGOT(NodeTy *N, SelectionDAG &DAG) const { + DEBUG(dbgs() << "AArch64TargetLowering::getGOT\n"); + SDLoc DL(N); + EVT Ty = getPointerTy(DAG.getDataLayout()); + SDValue GotAddr = getTargetNode(N, Ty, DAG, AArch64II::MO_GOT); + // FIXME: Once remat is capable of dealing with instructions with register + // operands, expand this into two nodes instead of using a wrapper node. + return DAG.getNode(AArch64ISD::LOADgot, DL, Ty, GotAddr); +} + +// (wrapper %highest(sym), %higher(sym), %hi(sym), %lo(sym)) +template +SDValue AArch64TargetLowering::getAddrLarge(NodeTy *N, SelectionDAG &DAG) + const { + DEBUG(dbgs() << "AArch64TargetLowering::getAddrLarge\n"); + SDLoc DL(N); + EVT Ty = getPointerTy(DAG.getDataLayout()); + const unsigned char MO_NC = AArch64II::MO_NC; + return DAG.getNode( + AArch64ISD::WrapperLarge, DL, Ty, + getTargetNode(N, Ty, DAG, AArch64II::MO_G3), + getTargetNode(N, Ty, DAG, AArch64II::MO_G2 | MO_NC), + getTargetNode(N, Ty, DAG, AArch64II::MO_G1 | MO_NC), + getTargetNode(N, Ty, DAG, AArch64II::MO_G0 | MO_NC)); +} + +// (addlow (adrp %hi(sym)) %lo(sym)) +template +SDValue AArch64TargetLowering::getAddr(NodeTy *N, SelectionDAG &DAG) const { + DEBUG(dbgs() << "AArch64TargetLowering::getAddr\n"); + SDLoc DL(N); + EVT Ty = getPointerTy(DAG.getDataLayout()); + SDValue Hi = getTargetNode(N, Ty, DAG, AArch64II::MO_PAGE); + SDValue Lo = getTargetNode(N, Ty, DAG, + AArch64II::MO_PAGEOFF | AArch64II::MO_NC); + SDValue ADRP = DAG.getNode(AArch64ISD::ADRP, DL, Ty, Hi); + return DAG.getNode(AArch64ISD::ADDlow, DL, Ty, ADRP, Lo); +} + SDValue AArch64TargetLowering::LowerGlobalAddress(SDValue Op, SelectionDAG &DAG) const { - EVT PtrVT = getPointerTy(DAG.getDataLayout()); - SDLoc DL(Op); - const GlobalAddressSDNode *GN = cast(Op); + GlobalAddressSDNode *GN = cast(Op); const GlobalValue *GV = GN->getGlobal(); unsigned char OpFlags = Subtarget->ClassifyGlobalReference(GV, getTargetMachine()); @@ -3430,32 +3635,15 @@ SDValue AArch64TargetLowering::LowerGlobalAddress(SDValue Op, assert(cast(Op)->getOffset() == 0 && "unexpected offset in global node"); - // This also catched the large code model case for Darwin. + // This also catches the large code model case for Darwin. if ((OpFlags & AArch64II::MO_GOT) != 0) { - SDValue GotAddr = DAG.getTargetGlobalAddress(GV, DL, PtrVT, 0, OpFlags); - // FIXME: Once remat is capable of dealing with instructions with register - // operands, expand this into two nodes instead of using a wrapper node. 
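// Illustrative sketch, not part of the upstream patch (toy types, not LLVM
// API): the pattern behind the getGOT/getAddrLarge/getAddr templates above.
// The address-materialization logic is written once as a template; ordinary
// overload resolution then picks the matching getTargetNode() for each
// concrete node type at compile time.
struct ToyGlobalNode { int Value; };
struct ToyJumpTableNode { int Index; };

static int getToyTargetNode(const ToyGlobalNode *N, unsigned Flag) {
  return N->Value + (int)Flag;
}
static int getToyTargetNode(const ToyJumpTableNode *N, unsigned Flag) {
  return N->Index + (int)Flag;
}

template <class NodeTy> static int getToyAddr(const NodeTy *N) {
  // Identical body for every node kind; only the overload chosen differs.
  return getToyTargetNode(N, /*Flag=*/0);
}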
- return DAG.getNode(AArch64ISD::LOADgot, DL, PtrVT, GotAddr); + return getGOT(GN, DAG); } if (getTargetMachine().getCodeModel() == CodeModel::Large) { - const unsigned char MO_NC = AArch64II::MO_NC; - return DAG.getNode( - AArch64ISD::WrapperLarge, DL, PtrVT, - DAG.getTargetGlobalAddress(GV, DL, PtrVT, 0, AArch64II::MO_G3), - DAG.getTargetGlobalAddress(GV, DL, PtrVT, 0, AArch64II::MO_G2 | MO_NC), - DAG.getTargetGlobalAddress(GV, DL, PtrVT, 0, AArch64II::MO_G1 | MO_NC), - DAG.getTargetGlobalAddress(GV, DL, PtrVT, 0, AArch64II::MO_G0 | MO_NC)); + return getAddrLarge(GN, DAG); } else { - // Use ADRP/ADD or ADRP/LDR for everything else: the small model on ELF and - // the only correct model on Darwin. - SDValue Hi = DAG.getTargetGlobalAddress(GV, DL, PtrVT, 0, - OpFlags | AArch64II::MO_PAGE); - unsigned char LoFlags = OpFlags | AArch64II::MO_PAGEOFF | AArch64II::MO_NC; - SDValue Lo = DAG.getTargetGlobalAddress(GV, DL, PtrVT, 0, LoFlags); - - SDValue ADRP = DAG.getNode(AArch64ISD::ADRP, DL, PtrVT, Hi); - return DAG.getNode(AArch64ISD::ADDlow, DL, PtrVT, ADRP, Lo); + return getAddr(GN, DAG); } } @@ -4232,90 +4420,37 @@ SDValue AArch64TargetLowering::LowerJumpTable(SDValue Op, // Jump table entries as PC relative offsets. No additional tweaking // is necessary here. Just get the address of the jump table. JumpTableSDNode *JT = cast(Op); - EVT PtrVT = getPointerTy(DAG.getDataLayout()); - SDLoc DL(Op); if (getTargetMachine().getCodeModel() == CodeModel::Large && !Subtarget->isTargetMachO()) { - const unsigned char MO_NC = AArch64II::MO_NC; - return DAG.getNode( - AArch64ISD::WrapperLarge, DL, PtrVT, - DAG.getTargetJumpTable(JT->getIndex(), PtrVT, AArch64II::MO_G3), - DAG.getTargetJumpTable(JT->getIndex(), PtrVT, AArch64II::MO_G2 | MO_NC), - DAG.getTargetJumpTable(JT->getIndex(), PtrVT, AArch64II::MO_G1 | MO_NC), - DAG.getTargetJumpTable(JT->getIndex(), PtrVT, - AArch64II::MO_G0 | MO_NC)); + return getAddrLarge(JT, DAG); } - - SDValue Hi = - DAG.getTargetJumpTable(JT->getIndex(), PtrVT, AArch64II::MO_PAGE); - SDValue Lo = DAG.getTargetJumpTable(JT->getIndex(), PtrVT, - AArch64II::MO_PAGEOFF | AArch64II::MO_NC); - SDValue ADRP = DAG.getNode(AArch64ISD::ADRP, DL, PtrVT, Hi); - return DAG.getNode(AArch64ISD::ADDlow, DL, PtrVT, ADRP, Lo); + return getAddr(JT, DAG); } SDValue AArch64TargetLowering::LowerConstantPool(SDValue Op, SelectionDAG &DAG) const { ConstantPoolSDNode *CP = cast(Op); - EVT PtrVT = getPointerTy(DAG.getDataLayout()); - SDLoc DL(Op); if (getTargetMachine().getCodeModel() == CodeModel::Large) { // Use the GOT for the large code model on iOS. 
if (Subtarget->isTargetMachO()) { - SDValue GotAddr = DAG.getTargetConstantPool( - CP->getConstVal(), PtrVT, CP->getAlignment(), CP->getOffset(), - AArch64II::MO_GOT); - return DAG.getNode(AArch64ISD::LOADgot, DL, PtrVT, GotAddr); + return getGOT(CP, DAG); } - - const unsigned char MO_NC = AArch64II::MO_NC; - return DAG.getNode( - AArch64ISD::WrapperLarge, DL, PtrVT, - DAG.getTargetConstantPool(CP->getConstVal(), PtrVT, CP->getAlignment(), - CP->getOffset(), AArch64II::MO_G3), - DAG.getTargetConstantPool(CP->getConstVal(), PtrVT, CP->getAlignment(), - CP->getOffset(), AArch64II::MO_G2 | MO_NC), - DAG.getTargetConstantPool(CP->getConstVal(), PtrVT, CP->getAlignment(), - CP->getOffset(), AArch64II::MO_G1 | MO_NC), - DAG.getTargetConstantPool(CP->getConstVal(), PtrVT, CP->getAlignment(), - CP->getOffset(), AArch64II::MO_G0 | MO_NC)); + return getAddrLarge(CP, DAG); } else { - // Use ADRP/ADD or ADRP/LDR for everything else: the small memory model on - // ELF, the only valid one on Darwin. - SDValue Hi = - DAG.getTargetConstantPool(CP->getConstVal(), PtrVT, CP->getAlignment(), - CP->getOffset(), AArch64II::MO_PAGE); - SDValue Lo = DAG.getTargetConstantPool( - CP->getConstVal(), PtrVT, CP->getAlignment(), CP->getOffset(), - AArch64II::MO_PAGEOFF | AArch64II::MO_NC); - - SDValue ADRP = DAG.getNode(AArch64ISD::ADRP, DL, PtrVT, Hi); - return DAG.getNode(AArch64ISD::ADDlow, DL, PtrVT, ADRP, Lo); + return getAddr(CP, DAG); } } SDValue AArch64TargetLowering::LowerBlockAddress(SDValue Op, SelectionDAG &DAG) const { - const BlockAddress *BA = cast(Op)->getBlockAddress(); - EVT PtrVT = getPointerTy(DAG.getDataLayout()); - SDLoc DL(Op); + BlockAddressSDNode *BA = cast(Op); if (getTargetMachine().getCodeModel() == CodeModel::Large && !Subtarget->isTargetMachO()) { - const unsigned char MO_NC = AArch64II::MO_NC; - return DAG.getNode( - AArch64ISD::WrapperLarge, DL, PtrVT, - DAG.getTargetBlockAddress(BA, PtrVT, 0, AArch64II::MO_G3), - DAG.getTargetBlockAddress(BA, PtrVT, 0, AArch64II::MO_G2 | MO_NC), - DAG.getTargetBlockAddress(BA, PtrVT, 0, AArch64II::MO_G1 | MO_NC), - DAG.getTargetBlockAddress(BA, PtrVT, 0, AArch64II::MO_G0 | MO_NC)); + return getAddrLarge(BA, DAG); } else { - SDValue Hi = DAG.getTargetBlockAddress(BA, PtrVT, 0, AArch64II::MO_PAGE); - SDValue Lo = DAG.getTargetBlockAddress(BA, PtrVT, 0, AArch64II::MO_PAGEOFF | - AArch64II::MO_NC); - SDValue ADRP = DAG.getNode(AArch64ISD::ADRP, DL, PtrVT, Hi); - return DAG.getNode(AArch64ISD::ADDlow, DL, PtrVT, ADRP, Lo); + return getAddr(BA, DAG); } } diff --git a/lib/Target/AArch64/AArch64ISelLowering.h b/lib/Target/AArch64/AArch64ISelLowering.h index a023b43..6081b07 100644 --- a/lib/Target/AArch64/AArch64ISelLowering.h +++ b/lib/Target/AArch64/AArch64ISelLowering.h @@ -255,6 +255,9 @@ public: const SelectionDAG &DAG, unsigned Depth = 0) const override; + bool targetShrinkDemandedConstant(SDValue Op, const APInt &Demanded, + TargetLoweringOpt &TLO) const override; + MVT getScalarShiftAmountTy(const DataLayout &DL, EVT) const override; /// Returns true if the target allows unaligned memory accesses of the @@ -508,6 +511,18 @@ private: const SmallVectorImpl &OutVals, const SDLoc &DL, SelectionDAG &DAG) const override; + SDValue getTargetNode(GlobalAddressSDNode *N, EVT Ty, SelectionDAG &DAG, + unsigned Flag) const; + SDValue getTargetNode(JumpTableSDNode *N, EVT Ty, SelectionDAG &DAG, + unsigned Flag) const; + SDValue getTargetNode(ConstantPoolSDNode *N, EVT Ty, SelectionDAG &DAG, + unsigned Flag) const; + SDValue getTargetNode(BlockAddressSDNode *N, EVT 
Ty, SelectionDAG &DAG, + unsigned Flag) const; + template SDValue getGOT(NodeTy *N, SelectionDAG &DAG) const; + template + SDValue getAddrLarge(NodeTy *N, SelectionDAG &DAG) const; + template SDValue getAddr(NodeTy *N, SelectionDAG &DAG) const; SDValue LowerGlobalAddress(SDValue Op, SelectionDAG &DAG) const; SDValue LowerGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) const; SDValue LowerDarwinGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) const; diff --git a/lib/Target/AArch64/AArch64InstrAtomics.td b/lib/Target/AArch64/AArch64InstrAtomics.td index 867074c..71826be 100644 --- a/lib/Target/AArch64/AArch64InstrAtomics.td +++ b/lib/Target/AArch64/AArch64InstrAtomics.td @@ -14,6 +14,9 @@ //===---------------------------------- // Atomic fences //===---------------------------------- +let AddedComplexity = 15, Size = 0 in +def CompilerBarrier : Pseudo<(outs), (ins i32imm:$ordering), + [(atomic_fence imm:$ordering, 0)]>, Sched<[]>; def : Pat<(atomic_fence (i64 4), (imm)), (DMB (i32 0x9))>; def : Pat<(atomic_fence (imm), (imm)), (DMB (i32 0xb))>; diff --git a/lib/Target/AArch64/AArch64InstrFormats.td b/lib/Target/AArch64/AArch64InstrFormats.td index 16be443..c44daf3 100644 --- a/lib/Target/AArch64/AArch64InstrFormats.td +++ b/lib/Target/AArch64/AArch64InstrFormats.td @@ -693,11 +693,11 @@ def addsub_shifted_imm32_neg : addsub_shifted_imm_neg; def addsub_shifted_imm64_neg : addsub_shifted_imm_neg; def gi_addsub_shifted_imm32 : - GIComplexOperandMatcher, + GIComplexOperandMatcher, GIComplexPatternEquiv; def gi_addsub_shifted_imm64 : - GIComplexOperandMatcher, + GIComplexOperandMatcher, GIComplexPatternEquiv; class neg_addsub_shifted_imm diff --git a/lib/Target/AArch64/AArch64InstrInfo.cpp b/lib/Target/AArch64/AArch64InstrInfo.cpp index 41fc8ec..cb26882 100644 --- a/lib/Target/AArch64/AArch64InstrInfo.cpp +++ b/lib/Target/AArch64/AArch64InstrInfo.cpp @@ -2320,7 +2320,7 @@ void AArch64InstrInfo::storeRegToStackSlot( PtrInfo, MachineMemOperand::MOStore, MFI.getObjectSize(FI), Align); unsigned Opc = 0; bool Offset = true; - switch (RC->getSize()) { + switch (TRI->getSpillSize(*RC)) { case 1: if (AArch64::FPR8RegClass.hasSubClassEq(RC)) Opc = AArch64::STRBui; @@ -2424,7 +2424,7 @@ void AArch64InstrInfo::loadRegFromStackSlot( unsigned Opc = 0; bool Offset = true; - switch (RC->getSize()) { + switch (TRI->getSpillSize(*RC)) { case 1: if (AArch64::FPR8RegClass.hasSubClassEq(RC)) Opc = AArch64::LDRBui; @@ -2649,7 +2649,8 @@ MachineInstr *AArch64InstrInfo::foldMemoryOperandImpl( }; if (DstMO.getSubReg() == 0 && SrcMO.getSubReg() == 0) { - assert(getRegClass(DstReg)->getSize() == getRegClass(SrcReg)->getSize() && + assert(TRI.getRegSizeInBits(*getRegClass(DstReg)) == + TRI.getRegSizeInBits(*getRegClass(SrcReg)) && "Mismatched register size in non subreg COPY"); if (IsSpill) storeRegToStackSlot(MBB, InsertPt, SrcReg, SrcMO.isKill(), FrameIndex, @@ -2735,7 +2736,8 @@ MachineInstr *AArch64InstrInfo::foldMemoryOperandImpl( } if (FillRC) { - assert(getRegClass(SrcReg)->getSize() == FillRC->getSize() && + assert(TRI.getRegSizeInBits(*getRegClass(SrcReg)) == + TRI.getRegSizeInBits(*FillRC) && "Mismatched regclass size on folded subreg COPY"); loadRegFromStackSlot(MBB, InsertPt, DstReg, FrameIndex, FillRC, &TRI); MachineInstr &LoadMI = *--InsertPt; @@ -3025,7 +3027,7 @@ bool llvm::rewriteAArch64FrameIndex(MachineInstr &MI, unsigned FrameRegIdx, return false; } -void AArch64InstrInfo::getNoopForMachoTarget(MCInst &NopInst) const { +void AArch64InstrInfo::getNoop(MCInst &NopInst) const { 
NopInst.setOpcode(AArch64::HINT); NopInst.addOperand(MCOperand::createImm(0)); } diff --git a/lib/Target/AArch64/AArch64InstrInfo.h b/lib/Target/AArch64/AArch64InstrInfo.h index bacce44..4cd14db 100644 --- a/lib/Target/AArch64/AArch64InstrInfo.h +++ b/lib/Target/AArch64/AArch64InstrInfo.h @@ -205,7 +205,7 @@ public: const DebugLoc &DL, unsigned DstReg, ArrayRef Cond, unsigned TrueReg, unsigned FalseReg) const override; - void getNoopForMachoTarget(MCInst &NopInst) const override; + void getNoop(MCInst &NopInst) const override; /// analyzeCompare - For a comparison instruction, return the source registers /// in SrcReg and SrcReg2, and the value it compares against in CmpValue. diff --git a/lib/Target/AArch64/AArch64InstructionSelector.cpp b/lib/Target/AArch64/AArch64InstructionSelector.cpp index 5e01b6c..b0e0e3e 100644 --- a/lib/Target/AArch64/AArch64InstructionSelector.cpp +++ b/lib/Target/AArch64/AArch64InstructionSelector.cpp @@ -41,12 +41,17 @@ using namespace llvm; namespace { +#define GET_GLOBALISEL_PREDICATE_BITSET +#include "AArch64GenGlobalISel.inc" +#undef GET_GLOBALISEL_PREDICATE_BITSET + class AArch64InstructionSelector : public InstructionSelector { public: AArch64InstructionSelector(const AArch64TargetMachine &TM, const AArch64Subtarget &STI, const AArch64RegisterBankInfo &RBI); + void beginFunction(const MachineFunction &MF) override; bool select(MachineInstr &I) const override; private: @@ -62,14 +67,19 @@ private: bool selectCompareBranch(MachineInstr &I, MachineFunction &MF, MachineRegisterInfo &MRI) const; - bool selectArithImmed(MachineOperand &Root, MachineOperand &Result1, - MachineOperand &Result2) const; + ComplexRendererFn selectArithImmed(MachineOperand &Root) const; const AArch64TargetMachine &TM; const AArch64Subtarget &STI; const AArch64InstrInfo &TII; const AArch64RegisterInfo &TRI; const AArch64RegisterBankInfo &RBI; + bool ForCodeSize; + + PredicateBitset AvailableFeatures; + PredicateBitset + computeAvailableFeatures(const MachineFunction *MF, + const AArch64Subtarget *Subtarget) const; // We declare the temporaries used by selectImpl() in the class to minimize the // cost of constructing placeholder values. @@ -88,7 +98,7 @@ AArch64InstructionSelector::AArch64InstructionSelector( const AArch64TargetMachine &TM, const AArch64Subtarget &STI, const AArch64RegisterBankInfo &RBI) : InstructionSelector(), TM(TM), STI(STI), TII(*STI.getInstrInfo()), - TRI(*STI.getRegisterInfo()), RBI(RBI) + TRI(*STI.getRegisterInfo()), RBI(RBI), ForCodeSize(), AvailableFeatures() #define GET_GLOBALISEL_TEMPORARIES_INIT #include "AArch64GenGlobalISel.inc" #undef GET_GLOBALISEL_TEMPORARIES_INIT @@ -567,6 +577,12 @@ bool AArch64InstructionSelector::selectVaStartDarwin( return true; } +void AArch64InstructionSelector::beginFunction( + const MachineFunction &MF) { + ForCodeSize = MF.getFunction()->optForSize(); + AvailableFeatures = computeAvailableFeatures(&MF, &STI); +} + bool AArch64InstructionSelector::select(MachineInstr &I) const { assert(I.getParent() && "Instruction should be in a basic block!"); assert(I.getParent()->getParent() && "Instruction should be in a function!"); @@ -1312,9 +1328,8 @@ bool AArch64InstructionSelector::select(MachineInstr &I) const { /// SelectArithImmed - Select an immediate value that can be represented as /// a 12-bit value shifted left by either 0 or 12. If so, return true with /// Val set to the 12-bit value and Shift set to the shifter operand. 
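// Illustrative sketch of the check described in the comment above, not part
// of the upstream patch (name and signature hypothetical, not the actual
// selector): an AArch64 arithmetic immediate must fit in 12 bits, optionally
// shifted left by 12.
#include <cstdint>

static bool matchArithImmed12(uint64_t Immed, uint64_t &Val,
                              unsigned &ShiftAmt) {
  if (Immed >> 12 == 0) {
    Val = Immed;
    ShiftAmt = 0;                      // LSL #0
    return true;
  }
  if ((Immed & 0xfffULL) == 0 && Immed >> 24 == 0) {
    Val = Immed >> 12;
    ShiftAmt = 12;                     // LSL #12
    return true;
  }
  return false;                        // not encodable as an ADD/SUB immediate
}
// e.g. 0x123 -> (0x123, LSL #0); 0x123000 -> (0x123, LSL #12); 0x123456 -> false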
-bool AArch64InstructionSelector::selectArithImmed( - MachineOperand &Root, MachineOperand &Result1, - MachineOperand &Result2) const { +InstructionSelector::ComplexRendererFn +AArch64InstructionSelector::selectArithImmed(MachineOperand &Root) const { MachineInstr &MI = *Root.getParent(); MachineBasicBlock &MBB = *MI.getParent(); MachineFunction &MF = *MBB.getParent(); @@ -1333,13 +1348,13 @@ bool AArch64InstructionSelector::selectArithImmed( else if (Root.isReg()) { MachineInstr *Def = MRI.getVRegDef(Root.getReg()); if (Def->getOpcode() != TargetOpcode::G_CONSTANT) - return false; + return nullptr; MachineOperand &Op1 = Def->getOperand(1); if (!Op1.isCImm() || Op1.getCImm()->getBitWidth() > 64) - return false; + return nullptr; Immed = Op1.getCImm()->getZExtValue(); } else - return false; + return nullptr; unsigned ShiftAmt; @@ -1349,14 +1364,10 @@ bool AArch64InstructionSelector::selectArithImmed( ShiftAmt = 12; Immed = Immed >> 12; } else - return false; + return nullptr; unsigned ShVal = AArch64_AM::getShifterImm(AArch64_AM::LSL, ShiftAmt); - Result1.ChangeToImmediate(Immed); - Result1.clearParent(); - Result2.ChangeToImmediate(ShVal); - Result2.clearParent(); - return true; + return [=](MachineInstrBuilder &MIB) { MIB.addImm(Immed).addImm(ShVal); }; } namespace llvm { diff --git a/lib/Target/AArch64/AArch64SchedFalkor.td b/lib/Target/AArch64/AArch64SchedFalkor.td index eec0890..cf1c0b6 100644 --- a/lib/Target/AArch64/AArch64SchedFalkor.td +++ b/lib/Target/AArch64/AArch64SchedFalkor.td @@ -79,14 +79,14 @@ def : WriteRes { let Latency = 5; } def : WriteRes { let Latency = 1; } def : WriteRes { let Latency = 1; } def : WriteRes { let Latency = 3; } -def : WriteRes - { let Latency = 3; let NumMicroOps = 3; } +def : WriteRes + { let Latency = 0; let NumMicroOps = 2; } def : WriteRes { let Latency = 0; let NumMicroOps = 2; } -def : WriteRes { let Latency = 5; } +def : WriteRes { let Latency = 1; } def : WriteRes { let Latency = 5; } -def : WriteRes - { let Latency = 4; let NumMicroOps = 3; } +def : WriteRes + { let Latency = 0; let NumMicroOps = 2; } def : WriteRes { let Latency = 3; let NumMicroOps = 2; } def : WriteRes { let Latency = 2; } diff --git a/lib/Target/AArch64/AArch64SchedFalkorDetails.td b/lib/Target/AArch64/AArch64SchedFalkorDetails.td index 4bd77d3..8f8eeef 100644 --- a/lib/Target/AArch64/AArch64SchedFalkorDetails.td +++ b/lib/Target/AArch64/AArch64SchedFalkorDetails.td @@ -326,6 +326,10 @@ def : InstRW<[FalkorWr_5VXVY_7cyc], (instregex "^TBX(v8i8Four|v16i8Four)$")>; // SIMD Store Instructions // ----------------------------------------------------------------------------- +def : InstRW<[WriteVST], (instregex "^STP(D|S)(i)$")>; +def : InstRW<[WriteVST, WriteAdr], (instregex "^STP(D|S)(post|pre)$")>; +def : InstRW<[FalkorWr_2XYZ_2ST_2VSD_0cyc], (instregex "^STRQro(W|X)$")>; + def : InstRW<[WriteVST], (instregex "^ST1(One(v8b|v4h|v2s|v1d)(_POST)?|(i8|i16|i32|i64)(_POST)?|One(v16b|v8h|v4s|v2d)|Two(v8b|v4h|v2s|v1d))$")>; def : InstRW<[WriteVST], (instregex "^ST2(Two(v8b|v4h|v2s|v1d)|(i8|i16|i32|i64))$")>; def : InstRW<[WriteVST, WriteAdr], (instregex "^ST1(One(v16b|v8h|v4s|v2d)|Two(v8b|v4h|v2s|v1d))_POST$")>; @@ -421,6 +425,7 @@ def : InstRW<[FalkorWr_1VX_1VY_2cyc], (instregex "^FSQRT(H|S|D)r$")>; def : InstRW<[FalkorWr_1VXVY_5cyc, FalkorReadFMA],(instregex "^F(N)?M(ADD|SUB)(H|S)rrr$")>; def : InstRW<[FalkorWr_1VXVY_6cyc, FalkorReadFMA],(instregex "^F(N)?M(ADD|SUB)Drrr$")>; + // FP Miscellaneous Instructions // 
----------------------------------------------------------------------------- def : InstRW<[FalkorWr_FMOV], (instregex "^FMOV(HW|HX|SW|DX|DXHigh)r$")>; @@ -433,7 +438,6 @@ def : InstRW<[FalkorWr_1VXVY_4cyc], (instregex "^(S|U)CVTF(v1i16|v1i32|v2i32|v def : InstRW<[FalkorWr_2VXVY_4cyc], (instregex "^(S|U)CVTF(v2i64|v4i32|v8i16|v2f64|v4f32|v8f16)(_shift)?")>; - // Load Instructions // ----------------------------------------------------------------------------- def : InstRW<[FalkorWr_1ST_0cyc], (instrs PRFMui, PRFMl)>; @@ -461,6 +465,7 @@ def : InstRW<[FalkorWr_1LD_4cyc, WriteAdr],(instregex "^LDRS(BW|BX|HW|HX|W)(post def : InstRW<[WriteLD, WriteLDHi, WriteAdr],(instregex "^LDP(W|X)(post|pre)$")>; def : InstRW<[FalkorWr_1LD_4cyc, WriteLDHi],(instrs LDPSWi)>; def : InstRW<[FalkorWr_1LD_4cyc, WriteLDHi, WriteAdr],(instregex "^LDPSW(post|pre)$")>; + // Miscellaneous Data-Processing Instructions // ----------------------------------------------------------------------------- def : InstRW<[FalkorWr_1XYZ_1cyc], (instregex "^(S|U)?BFM(W|X)ri$")>; @@ -502,28 +507,30 @@ def : InstRW<[FalkorWr_1LD_1Z_3cyc], (instrs DRPS)>; def : InstRW<[FalkorWr_1SD_1ST_0cyc], (instrs MSR)>; def : InstRW<[WriteVST], (instrs STNPDi, STNPSi)>; -def : InstRW<[WriteSTP], (instrs STNPWi, STNPXi)>; +def : InstRW<[WriteSTP], (instrs STNPWi, STNPXi)>; def : InstRW<[FalkorWr_2LD_1Z_3cyc], (instrs ERET)>; -def : InstRW<[WriteST], (instregex "^LDC.*$")>; -def : InstRW<[WriteST], (instregex "^STLR(B|H|W|X)$")>; -def : InstRW<[WriteST], (instregex "^STXP(W|X)$")>; -def : InstRW<[WriteST], (instregex "^STXR(B|H|W|X)$")>; +def : InstRW<[FalkorWr_1ST_1SD_1LD_3cyc], (instregex "^LDC.*$")>; +def : InstRW<[FalkorWr_1ST_1SD_1LD_0cyc], (instregex "^STLR(B|H|W|X)$")>; +def : InstRW<[FalkorWr_1ST_1SD_1LD_0cyc], (instregex "^STXP(W|X)$")>; +def : InstRW<[FalkorWr_1ST_1SD_1LD_0cyc], (instregex "^STXR(B|H|W|X)$")>; -def : InstRW<[WriteSTX], (instregex "^STLXP(W|X)$")>; -def : InstRW<[WriteSTX], (instregex "^STLXR(B|H|W|X)$")>; +def : InstRW<[FalkorWr_2LD_1ST_1SD_3cyc], (instregex "^STLXP(W|X)$")>; +def : InstRW<[FalkorWr_2LD_1ST_1SD_3cyc], (instregex "^STLXR(B|H|W|X)$")>; def : InstRW<[WriteVST, WriteVST], (instrs STNPQi)>; // Store Instructions // ----------------------------------------------------------------------------- -def : InstRW<[WriteVST], (instregex "^STP(D|S)(i|post|pre)$")>; -def : InstRW<[WriteST], (instregex "^STP(W|X)(i|post|pre)$")>; +def : InstRW<[WriteST], (instregex "^STP(W|X)i$")>; +def : InstRW<[WriteST, WriteAdr], (instregex "^STP(W|X)(post|pre)$")>; def : InstRW<[WriteST], (instregex "^STR(Q|D|S|BB|HH)ui$")>; def : InstRW<[WriteST], (instregex "^STUR(Q|D|S|BB|HH)i$")>; -def : InstRW<[WriteST], (instregex "^STR(B|H|W|X)(post|pre|ui)$")>; +def : InstRW<[WriteST], (instregex "^STR(B|H|W|X)ui$")>; +def : InstRW<[WriteST, WriteAdr], (instregex "^STR(B|H|W|X)(post|pre)$")>; def : InstRW<[WriteST], (instregex "^STTR(B|H|W|X)i$")>; def : InstRW<[WriteST], (instregex "^STUR(B|H|W|X)i$")>; def : InstRW<[WriteST, WriteAdr], (instregex "^STR(B|H|W|X)ro(W|X)$")>; -def : InstRW<[WriteVST, WriteVST], (instregex "^STPQ(i|post|pre)$")>; +def : InstRW<[WriteVST, WriteVST], (instregex "^STPQi$")>; +def : InstRW<[WriteVST, WriteVST, WriteAdr], (instregex "^STPQ(post|pre)$")>; diff --git a/lib/Target/AArch64/AArch64SchedFalkorWriteRes.td b/lib/Target/AArch64/AArch64SchedFalkorWriteRes.td index 9cdb4be..e64b2c4 100644 --- a/lib/Target/AArch64/AArch64SchedFalkorWriteRes.td +++ b/lib/Target/AArch64/AArch64SchedFalkorWriteRes.td @@ -28,7 +28,6 
@@ //===----------------------------------------------------------------------===// // Define 1 micro-op types - def FalkorWr_1X_2cyc : SchedWriteRes<[FalkorUnitX]> { let Latency = 2; } def FalkorWr_1X_4cyc : SchedWriteRes<[FalkorUnitX]> { let Latency = 4; } def FalkorWr_1X_5cyc : SchedWriteRes<[FalkorUnitX]> { let Latency = 5; } @@ -175,18 +174,33 @@ def FalkorWr_1SD_1ST_0cyc: SchedWriteRes<[FalkorUnitSD, FalkorUnitST]> { //===----------------------------------------------------------------------===// // Define 3 micro-op types +def FalkorWr_1ST_1SD_1LD_0cyc : SchedWriteRes<[FalkorUnitST, FalkorUnitSD, + FalkorUnitLD]> { + let Latency = 0; + let NumMicroOps = 3; +} + +def FalkorWr_1ST_1SD_1LD_3cyc : SchedWriteRes<[FalkorUnitST, FalkorUnitSD, + FalkorUnitLD]> { + let Latency = 3; + let NumMicroOps = 3; +} + def FalkorWr_3VXVY_3cyc : SchedWriteRes<[FalkorUnitVXVY, FalkorUnitVXVY]> { let Latency = 3; let NumMicroOps = 3; } + def FalkorWr_3VXVY_4cyc : SchedWriteRes<[FalkorUnitVXVY, FalkorUnitVXVY]> { let Latency = 4; let NumMicroOps = 3; } + def FalkorWr_3VXVY_5cyc : SchedWriteRes<[FalkorUnitVXVY, FalkorUnitVXVY]> { let Latency = 5; let NumMicroOps = 3; } + def FalkorWr_3VXVY_6cyc : SchedWriteRes<[FalkorUnitVXVY, FalkorUnitVXVY]> { let Latency = 6; let NumMicroOps = 3; @@ -196,10 +210,12 @@ def FalkorWr_1LD_2VXVY_4cyc : SchedWriteRes<[FalkorUnitLD, FalkorUnitVXVY]> { let Latency = 4; let NumMicroOps = 3; } + def FalkorWr_2LD_1none_3cyc : SchedWriteRes<[FalkorUnitLD, FalkorUnitLD]> { let Latency = 3; let NumMicroOps = 3; } + def FalkorWr_3LD_3cyc : SchedWriteRes<[FalkorUnitLD, FalkorUnitLD, FalkorUnitLD]> { let Latency = 3; @@ -259,6 +275,12 @@ def FalkorWr_2LD_2none_3cyc: SchedWriteRes<[FalkorUnitLD, FalkorUnitLD]> { let NumMicroOps = 4; } +def FalkorWr_2LD_1ST_1SD_3cyc: SchedWriteRes<[FalkorUnitLD, FalkorUnitST, + FalkorUnitSD, FalkorUnitLD]> { + let Latency = 3; + let NumMicroOps = 4; +} + //===----------------------------------------------------------------------===// // Define 5 micro-op types @@ -289,6 +311,13 @@ def FalkorWr_2LD_2VXVY_2none_4cyc: SchedWriteRes<[FalkorUnitLD, FalkorUnitLD, let NumMicroOps = 6; } +def FalkorWr_2XYZ_2ST_2VSD_0cyc: SchedWriteRes<[FalkorUnitXYZ, FalkorUnitST, + FalkorUnitVSD, FalkorUnitXYZ, + FalkorUnitST, FalkorUnitVSD]> { + let Latency = 0; + let NumMicroOps = 6; +} + //===----------------------------------------------------------------------===// // Define 8 micro-op types diff --git a/lib/Target/AArch64/AsmParser/AArch64AsmParser.cpp b/lib/Target/AArch64/AsmParser/AArch64AsmParser.cpp index d7bbc2b..4dbcc95 100644 --- a/lib/Target/AArch64/AsmParser/AArch64AsmParser.cpp +++ b/lib/Target/AArch64/AsmParser/AArch64AsmParser.cpp @@ -2473,16 +2473,14 @@ AArch64AsmParser::tryParseBarrierOperand(OperandVector &Operands) { return MatchOperand_ParseFail; } - auto DB = AArch64DB::lookupDBByName(Tok.getString()); - if (!DB) { - TokError("invalid barrier option name"); - return MatchOperand_ParseFail; - } - // The only valid named option for ISB is 'sy' - if (Mnemonic == "isb" && DB->Encoding != AArch64DB::sy) { + auto DB = AArch64DB::lookupDBByName(Tok.getString()); + if (Mnemonic == "isb" && (!DB || DB->Encoding != AArch64DB::sy)) { TokError("'sy' or #imm operand expected"); return MatchOperand_ParseFail; + } else if (!DB) { + TokError("invalid barrier option name"); + return MatchOperand_ParseFail; } Operands.push_back(AArch64Operand::CreateBarrier( diff --git a/lib/Target/AArch64/InstPrinter/AArch64InstPrinter.cpp 
b/lib/Target/AArch64/InstPrinter/AArch64InstPrinter.cpp index 41ae70f..fc89657 100644 --- a/lib/Target/AArch64/InstPrinter/AArch64InstPrinter.cpp +++ b/lib/Target/AArch64/InstPrinter/AArch64InstPrinter.cpp @@ -17,6 +17,7 @@ #include "llvm/ADT/STLExtras.h" #include "llvm/ADT/StringExtras.h" #include "llvm/ADT/StringRef.h" +#include "llvm/MC/MCAsmInfo.h" #include "llvm/MC/MCExpr.h" #include "llvm/MC/MCInst.h" #include "llvm/MC/MCRegisterInfo.h" @@ -275,6 +276,12 @@ void AArch64InstPrinter::printInst(const MCInst *MI, raw_ostream &O, } } + if (Opcode == AArch64::CompilerBarrier) { + O << '\t' << MAI.getCommentString() << " COMPILER BARRIER"; + printAnnotation(O, Annot); + return; + } + if (!printAliasInstr(MI, STI, O)) printInstruction(MI, STI, O); diff --git a/lib/Target/AArch64/MCTargetDesc/AArch64MCCodeEmitter.cpp b/lib/Target/AArch64/MCTargetDesc/AArch64MCCodeEmitter.cpp index 62dfa59..33698d2 100644 --- a/lib/Target/AArch64/MCTargetDesc/AArch64MCCodeEmitter.cpp +++ b/lib/Target/AArch64/MCTargetDesc/AArch64MCCodeEmitter.cpp @@ -565,6 +565,9 @@ void AArch64MCCodeEmitter::encodeInstruction(const MCInst &MI, raw_ostream &OS, MCFixupKind Fixup = MCFixupKind(AArch64::fixup_aarch64_tlsdesc_call); Fixups.push_back(MCFixup::create(0, MI.getOperand(0).getExpr(), Fixup)); return; + } else if (MI.getOpcode() == AArch64::CompilerBarrier) { + // This just prevents the compiler from reordering accesses, no actual code. + return; } uint64_t Binary = getBinaryCodeForInstr(MI, Fixups, STI); diff --git a/lib/Target/AMDGPU/AMDGPU.td b/lib/Target/AMDGPU/AMDGPU.td index 2c7a2d8..0f33148 100644 --- a/lib/Target/AMDGPU/AMDGPU.td +++ b/lib/Target/AMDGPU/AMDGPU.td @@ -406,7 +406,8 @@ def FeatureGFX9 : SubtargetFeatureGeneration<"GFX9", FeatureWavefrontSize64, FeatureFlatAddressSpace, FeatureGCN, FeatureGCN3Encoding, FeatureCIInsts, Feature16BitInsts, FeatureSMemRealTime, FeatureScalarStores, FeatureInv2PiInlineImm, - FeatureApertureRegs, FeatureGFX9Insts, FeatureVOP3P, FeatureVGPRIndexMode + FeatureApertureRegs, FeatureGFX9Insts, FeatureVOP3P, FeatureVGPRIndexMode, + FeatureFastFMAF32 ] >; diff --git a/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp b/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp index 318de7f..f511085 100644 --- a/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp +++ b/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp @@ -116,8 +116,11 @@ private: bool SelectMUBUFAddr64(SDValue Addr, SDValue &SRsrc, SDValue &VAddr, SDValue &SOffset, SDValue &Offset, SDValue &SLC) const; - bool SelectMUBUFScratch(SDValue Addr, SDValue &RSrc, SDValue &VAddr, - SDValue &SOffset, SDValue &ImmOffset) const; + bool SelectMUBUFScratchOffen(SDValue Addr, SDValue &RSrc, SDValue &VAddr, + SDValue &SOffset, SDValue &ImmOffset) const; + bool SelectMUBUFScratchOffset(SDValue Addr, SDValue &SRsrc, SDValue &Soffset, + SDValue &Offset) const; + bool SelectMUBUFOffset(SDValue Addr, SDValue &SRsrc, SDValue &SOffset, SDValue &Offset, SDValue &GLC, SDValue &SLC, SDValue &TFE) const; @@ -150,14 +153,12 @@ private: bool SelectVOP3Mods_NNaN(SDValue In, SDValue &Src, SDValue &SrcMods) const; bool SelectVOP3Mods(SDValue In, SDValue &Src, SDValue &SrcMods) const; - bool SelectVOP3NoMods(SDValue In, SDValue &Src, SDValue &SrcMods) const; + bool SelectVOP3NoMods(SDValue In, SDValue &Src) const; bool SelectVOP3Mods0(SDValue In, SDValue &Src, SDValue &SrcMods, SDValue &Clamp, SDValue &Omod) const; bool SelectVOP3NoMods0(SDValue In, SDValue &Src, SDValue &SrcMods, SDValue &Clamp, SDValue &Omod) const; - bool SelectVOP3Mods0Clamp(SDValue In, SDValue &Src, SDValue &SrcMods, 
- SDValue &Omod) const; bool SelectVOP3Mods0Clamp0OMod(SDValue In, SDValue &Src, SDValue &SrcMods, SDValue &Clamp, SDValue &Omod) const; @@ -953,8 +954,12 @@ bool AMDGPUDAGToDAGISel::SelectDS64Bit4ByteAligned(SDValue Addr, SDValue &Base, return true; } +static bool isLegalMUBUFImmOffset(unsigned Imm) { + return isUInt<12>(Imm); +} + static bool isLegalMUBUFImmOffset(const ConstantSDNode *Imm) { - return isUInt<12>(Imm->getZExtValue()); + return isLegalMUBUFImmOffset(Imm->getZExtValue()); } bool AMDGPUDAGToDAGISel::SelectMUBUF(SDValue Addr, SDValue &Ptr, @@ -1076,9 +1081,9 @@ SDValue AMDGPUDAGToDAGISel::foldFrameIndex(SDValue N) const { return N; } -bool AMDGPUDAGToDAGISel::SelectMUBUFScratch(SDValue Addr, SDValue &Rsrc, - SDValue &VAddr, SDValue &SOffset, - SDValue &ImmOffset) const { +bool AMDGPUDAGToDAGISel::SelectMUBUFScratchOffen(SDValue Addr, SDValue &Rsrc, + SDValue &VAddr, SDValue &SOffset, + SDValue &ImmOffset) const { SDLoc DL(Addr); MachineFunction &MF = CurDAG->getMachineFunction(); @@ -1087,8 +1092,22 @@ bool AMDGPUDAGToDAGISel::SelectMUBUFScratch(SDValue Addr, SDValue &Rsrc, Rsrc = CurDAG->getRegister(Info->getScratchRSrcReg(), MVT::v4i32); SOffset = CurDAG->getRegister(Info->getScratchWaveOffsetReg(), MVT::i32); - // (add n0, c1) + if (ConstantSDNode *CAddr = dyn_cast(Addr)) { + unsigned Imm = CAddr->getZExtValue(); + assert(!isLegalMUBUFImmOffset(Imm) && + "should have been selected by other pattern"); + + SDValue HighBits = CurDAG->getTargetConstant(Imm & ~4095, DL, MVT::i32); + MachineSDNode *MovHighBits = CurDAG->getMachineNode(AMDGPU::V_MOV_B32_e32, + DL, MVT::i32, HighBits); + VAddr = SDValue(MovHighBits, 0); + ImmOffset = CurDAG->getTargetConstant(Imm & 4095, DL, MVT::i16); + return true; + } + if (CurDAG->isBaseWithConstantOffset(Addr)) { + // (add n0, c1) + SDValue N0 = Addr.getOperand(0); SDValue N1 = Addr.getOperand(1); @@ -1107,6 +1126,24 @@ bool AMDGPUDAGToDAGISel::SelectMUBUFScratch(SDValue Addr, SDValue &Rsrc, return true; } +bool AMDGPUDAGToDAGISel::SelectMUBUFScratchOffset(SDValue Addr, + SDValue &SRsrc, + SDValue &SOffset, + SDValue &Offset) const { + ConstantSDNode *CAddr = dyn_cast(Addr); + if (!CAddr || !isLegalMUBUFImmOffset(CAddr)) + return false; + + SDLoc DL(Addr); + MachineFunction &MF = CurDAG->getMachineFunction(); + const SIMachineFunctionInfo *Info = MF.getInfo(); + + SRsrc = CurDAG->getRegister(Info->getScratchRSrcReg(), MVT::v4i32); + SOffset = CurDAG->getRegister(Info->getScratchWaveOffsetReg(), MVT::i32); + Offset = CurDAG->getTargetConstant(CAddr->getZExtValue(), DL, MVT::i16); + return true; +} + bool AMDGPUDAGToDAGISel::SelectMUBUFOffset(SDValue Addr, SDValue &SRsrc, SDValue &SOffset, SDValue &Offset, SDValue &GLC, SDValue &SLC, @@ -1628,38 +1665,20 @@ bool AMDGPUDAGToDAGISel::SelectVOP3Mods_NNaN(SDValue In, SDValue &Src, return isNoNanSrc(Src); } -bool AMDGPUDAGToDAGISel::SelectVOP3NoMods(SDValue In, SDValue &Src, - SDValue &SrcMods) const { - bool Res = SelectVOP3Mods(In, Src, SrcMods); - return Res && cast(SrcMods)->isNullValue(); +bool AMDGPUDAGToDAGISel::SelectVOP3NoMods(SDValue In, SDValue &Src) const { + if (In.getOpcode() == ISD::FABS || In.getOpcode() == ISD::FNEG) + return false; + + Src = In; + return true; } bool AMDGPUDAGToDAGISel::SelectVOP3Mods0(SDValue In, SDValue &Src, SDValue &SrcMods, SDValue &Clamp, SDValue &Omod) const { SDLoc DL(In); - // FIXME: Handle Clamp and Omod - Clamp = CurDAG->getTargetConstant(0, DL, MVT::i32); - Omod = CurDAG->getTargetConstant(0, DL, MVT::i32); - - return SelectVOP3Mods(In, Src, SrcMods); 
-} - -bool AMDGPUDAGToDAGISel::SelectVOP3NoMods0(SDValue In, SDValue &Src, - SDValue &SrcMods, SDValue &Clamp, - SDValue &Omod) const { - bool Res = SelectVOP3Mods0(In, Src, SrcMods, Clamp, Omod); - - return Res && cast(SrcMods)->isNullValue() && - cast(Clamp)->isNullValue() && - cast(Omod)->isNullValue(); -} - -bool AMDGPUDAGToDAGISel::SelectVOP3Mods0Clamp(SDValue In, SDValue &Src, - SDValue &SrcMods, - SDValue &Omod) const { - // FIXME: Handle Omod - Omod = CurDAG->getTargetConstant(0, SDLoc(In), MVT::i32); + Clamp = CurDAG->getTargetConstant(0, DL, MVT::i1); + Omod = CurDAG->getTargetConstant(0, DL, MVT::i1); return SelectVOP3Mods(In, Src, SrcMods); } @@ -1677,9 +1696,8 @@ bool AMDGPUDAGToDAGISel::SelectVOP3OMods(SDValue In, SDValue &Src, Src = In; SDLoc DL(In); - // FIXME: Handle Clamp and Omod - Clamp = CurDAG->getTargetConstant(0, DL, MVT::i32); - Omod = CurDAG->getTargetConstant(0, DL, MVT::i32); + Clamp = CurDAG->getTargetConstant(0, DL, MVT::i1); + Omod = CurDAG->getTargetConstant(0, DL, MVT::i1); return true; } diff --git a/lib/Target/AMDGPU/AMDGPUISelLowering.cpp b/lib/Target/AMDGPU/AMDGPUISelLowering.cpp index c0f336e..e21775e 100644 --- a/lib/Target/AMDGPU/AMDGPUISelLowering.cpp +++ b/lib/Target/AMDGPU/AMDGPUISelLowering.cpp @@ -2315,12 +2315,13 @@ static bool simplifyI24(SDNode *Node24, unsigned OpIdx, SelectionDAG &DAG = DCI.DAG; SDValue Op = Node24->getOperand(OpIdx); + const TargetLowering &TLI = DAG.getTargetLoweringInfo(); EVT VT = Op.getValueType(); APInt Demanded = APInt::getLowBitsSet(VT.getSizeInBits(), 24); APInt KnownZero, KnownOne; TargetLowering::TargetLoweringOpt TLO(DAG, true, true); - if (TLO.SimplifyDemandedBits(Node24, OpIdx, Demanded, DCI)) + if (TLI.SimplifyDemandedBits(Node24, OpIdx, Demanded, DCI, TLO)) return true; return false; @@ -3361,7 +3362,7 @@ SDValue AMDGPUTargetLowering::PerformDAGCombine(SDNode *N, TargetLowering::TargetLoweringOpt TLO(DAG, !DCI.isBeforeLegalize(), !DCI.isBeforeLegalizeOps()); const TargetLowering &TLI = DAG.getTargetLoweringInfo(); - if (TLO.ShrinkDemandedConstant(BitsFrom, Demanded) || + if (TLI.ShrinkDemandedConstant(BitsFrom, Demanded, TLO) || TLI.SimplifyDemandedBits(BitsFrom, Demanded, KnownZero, KnownOne, TLO)) { DCI.CommitTargetLoweringOpt(TLO); @@ -3436,6 +3437,7 @@ const char* AMDGPUTargetLowering::getTargetNodeName(unsigned Opcode) const { NODE_NAME_CASE(ELSE) NODE_NAME_CASE(LOOP) NODE_NAME_CASE(CALL) + NODE_NAME_CASE(TRAP) NODE_NAME_CASE(RET_FLAG) NODE_NAME_CASE(RETURN_TO_EPILOG) NODE_NAME_CASE(ENDPGM) diff --git a/lib/Target/AMDGPU/AMDGPUISelLowering.h b/lib/Target/AMDGPU/AMDGPUISelLowering.h index d6aa0ba..13cbfe2 100644 --- a/lib/Target/AMDGPU/AMDGPUISelLowering.h +++ b/lib/Target/AMDGPU/AMDGPUISelLowering.h @@ -231,6 +231,10 @@ public: AMDGPUAS getAMDGPUAS() const { return AMDGPUASI; } + + MVT getFenceOperandTy(const DataLayout &DL) const override { + return MVT::i32; + } }; namespace AMDGPUISD { @@ -244,6 +248,7 @@ enum NodeType : unsigned { // Function call. CALL, + TRAP, // Masked control flow nodes. 
IF, diff --git a/lib/Target/AMDGPU/AMDGPUInstrInfo.td b/lib/Target/AMDGPU/AMDGPUInstrInfo.td index 56f0609..c1706d1 100644 --- a/lib/Target/AMDGPU/AMDGPUInstrInfo.td +++ b/lib/Target/AMDGPU/AMDGPUInstrInfo.td @@ -78,6 +78,11 @@ def AMDGPUif : SDNode<"AMDGPUISD::IF", AMDGPUIfOp, [SDNPHasChain]>; def AMDGPUelse : SDNode<"AMDGPUISD::ELSE", AMDGPUElseOp, [SDNPHasChain]>; def AMDGPUloop : SDNode<"AMDGPUISD::LOOP", AMDGPULoopOp, [SDNPHasChain]>; +def AMDGPUtrap : SDNode<"AMDGPUISD::TRAP", + SDTypeProfile<0, -1, [SDTCisVT<0, i16>]>, + [SDNPHasChain, SDNPVariadic, SDNPSideEffect, SDNPInGlue] +>; + def AMDGPUconstdata_ptr : SDNode< "AMDGPUISD::CONST_DATA_PTR", SDTypeProfile <1, 1, [SDTCisVT<0, iPTR>, SDTCisVT<0, iPTR>]> diff --git a/lib/Target/AMDGPU/AMDGPUInstructions.td b/lib/Target/AMDGPU/AMDGPUInstructions.td index b8d6812..4e688ab 100644 --- a/lib/Target/AMDGPU/AMDGPUInstructions.td +++ b/lib/Target/AMDGPU/AMDGPUInstructions.td @@ -50,6 +50,16 @@ def UnsafeFPMath : Predicate<"TM.Options.UnsafeFPMath">; def InstFlag : OperandWithDefaultOps ; def ADDRIndirect : ComplexPattern; +def u16ImmTarget : AsmOperandClass { + let Name = "U16Imm"; + let RenderMethod = "addImmOperands"; +} + +def s16ImmTarget : AsmOperandClass { + let Name = "S16Imm"; + let RenderMethod = "addImmOperands"; +} + let OperandType = "OPERAND_IMMEDIATE" in { def u32imm : Operand { @@ -58,6 +68,12 @@ def u32imm : Operand { def u16imm : Operand { let PrintMethod = "printU16ImmOperand"; + let ParserMatchClass = u16ImmTarget; +} + +def s16imm : Operand { + let PrintMethod = "printU16ImmOperand"; + let ParserMatchClass = s16ImmTarget; } def u8imm : Operand { diff --git a/lib/Target/AMDGPU/AMDGPUMCInstLower.cpp b/lib/Target/AMDGPU/AMDGPUMCInstLower.cpp index 14ee1c8..da247fe 100644 --- a/lib/Target/AMDGPU/AMDGPUMCInstLower.cpp +++ b/lib/Target/AMDGPU/AMDGPUMCInstLower.cpp @@ -225,6 +225,12 @@ void AMDGPUAsmPrinter::EmitInstruction(const MachineInstr *MI) { return; } + if (MI->getOpcode() == AMDGPU::SI_MASKED_UNREACHABLE) { + if (isVerbose()) + OutStreamer->emitRawComment(" divergent unreachable"); + return; + } + MCInst TmpInst; MCInstLowering.lower(MI, TmpInst); EmitToStreamer(*OutStreamer, TmpInst); diff --git a/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp b/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp index 961f718..70c848f 100644 --- a/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp +++ b/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp @@ -479,6 +479,8 @@ public: bool isSMRDLiteralOffset() const; bool isDPPCtrl() const; bool isGPRIdxMode() const; + bool isS16Imm() const; + bool isU16Imm() const; StringRef getExpressionAsToken() const { assert(isExpr()); @@ -2836,6 +2838,28 @@ void AMDGPUAsmParser::cvtExp(MCInst &Inst, const OperandVector &Operands) { // s_waitcnt //===----------------------------------------------------------------------===// +static bool +encodeCnt( + const AMDGPU::IsaInfo::IsaVersion ISA, + int64_t &IntVal, + int64_t CntVal, + bool Saturate, + unsigned (*encode)(const IsaInfo::IsaVersion &Version, unsigned, unsigned), + unsigned (*decode)(const IsaInfo::IsaVersion &Version, unsigned)) +{ + bool Failed = false; + + IntVal = encode(ISA, IntVal, CntVal); + if (CntVal != decode(ISA, IntVal)) { + if (Saturate) { + IntVal = encode(ISA, IntVal, -1); + } else { + Failed = true; + } + } + return Failed; +} + bool AMDGPUAsmParser::parseCnt(int64_t &IntVal) { StringRef CntName = Parser.getTok().getString(); int64_t CntVal; @@ -2851,25 +2875,35 @@ bool AMDGPUAsmParser::parseCnt(int64_t &IntVal) { if 
(getParser().parseAbsoluteExpression(CntVal)) return true; - if (getLexer().isNot(AsmToken::RParen)) - return true; - - Parser.Lex(); - if (getLexer().is(AsmToken::Amp) || getLexer().is(AsmToken::Comma)) - Parser.Lex(); - AMDGPU::IsaInfo::IsaVersion ISA = AMDGPU::IsaInfo::getIsaVersion(getFeatureBits()); - if (CntName == "vmcnt") - IntVal = encodeVmcnt(ISA, IntVal, CntVal); - else if (CntName == "expcnt") - IntVal = encodeExpcnt(ISA, IntVal, CntVal); - else if (CntName == "lgkmcnt") - IntVal = encodeLgkmcnt(ISA, IntVal, CntVal); - else - return true; - return false; + bool Failed = true; + bool Sat = CntName.endswith("_sat"); + + if (CntName == "vmcnt" || CntName == "vmcnt_sat") { + Failed = encodeCnt(ISA, IntVal, CntVal, Sat, encodeVmcnt, decodeVmcnt); + } else if (CntName == "expcnt" || CntName == "expcnt_sat") { + Failed = encodeCnt(ISA, IntVal, CntVal, Sat, encodeExpcnt, decodeExpcnt); + } else if (CntName == "lgkmcnt" || CntName == "lgkmcnt_sat") { + Failed = encodeCnt(ISA, IntVal, CntVal, Sat, encodeLgkmcnt, decodeLgkmcnt); + } + + // To improve diagnostics, do not skip delimiters on errors + if (!Failed) { + if (getLexer().isNot(AsmToken::RParen)) { + return true; + } + Parser.Lex(); + if (getLexer().is(AsmToken::Amp) || getLexer().is(AsmToken::Comma)) { + const AsmToken NextToken = getLexer().peekTok(); + if (NextToken.is(AsmToken::Identifier)) { + Parser.Lex(); + } + } + } + + return Failed; } OperandMatchResultTy @@ -3858,6 +3892,14 @@ bool AMDGPUOperand::isGPRIdxMode() const { return isImm() && isUInt<4>(getImm()); } +bool AMDGPUOperand::isS16Imm() const { + return isImm() && (isInt<16>(getImm()) || isUInt<16>(getImm())); +} + +bool AMDGPUOperand::isU16Imm() const { + return isImm() && isUInt<16>(getImm()); +} + OperandMatchResultTy AMDGPUAsmParser::parseDPPCtrl(OperandVector &Operands) { SMLoc S = Parser.getTok().getLoc(); diff --git a/lib/Target/AMDGPU/BUFInstructions.td b/lib/Target/AMDGPU/BUFInstructions.td index a6609f0..89eddb9 100644 --- a/lib/Target/AMDGPU/BUFInstructions.td +++ b/lib/Target/AMDGPU/BUFInstructions.td @@ -11,7 +11,9 @@ def MUBUFAddr32 : ComplexPattern; def MUBUFAddr64 : ComplexPattern; def MUBUFAddr64Atomic : ComplexPattern; -def MUBUFScratch : ComplexPattern; +def MUBUFScratchOffen : ComplexPattern; +def MUBUFScratchOffset : ComplexPattern; + def MUBUFOffset : ComplexPattern; def MUBUFOffsetNoGLC : ComplexPattern; def MUBUFOffsetAtomic : ComplexPattern; @@ -958,21 +960,30 @@ defm : MUBUFLoad_Pattern ; } // End Predicates = [Has16BitInsts] -class MUBUFScratchLoadPat : Pat < - (vt (ld (MUBUFScratch v4i32:$srsrc, i32:$vaddr, - i32:$soffset, u16imm:$offset))), - (Instr $vaddr, $srsrc, $soffset, $offset, 0, 0, 0) ->; +multiclass MUBUFScratchLoadPat { + def : Pat < + (vt (ld (MUBUFScratchOffen v4i32:$srsrc, i32:$vaddr, + i32:$soffset, u16imm:$offset))), + (InstrOffen $vaddr, $srsrc, $soffset, $offset, 0, 0, 0) + >; + + def : Pat < + (vt (ld (MUBUFScratchOffset v4i32:$srsrc, i32:$soffset, u16imm:$offset))), + (InstrOffset $srsrc, $soffset, $offset, 0, 0, 0) + >; +} -def : MUBUFScratchLoadPat ; -def : MUBUFScratchLoadPat ; -def : MUBUFScratchLoadPat ; -def : MUBUFScratchLoadPat ; -def : MUBUFScratchLoadPat ; -def : MUBUFScratchLoadPat ; -def : MUBUFScratchLoadPat ; -def : MUBUFScratchLoadPat ; -def : MUBUFScratchLoadPat ; +defm : MUBUFScratchLoadPat ; +defm : MUBUFScratchLoadPat ; +defm : MUBUFScratchLoadPat ; +defm : MUBUFScratchLoadPat ; +defm : MUBUFScratchLoadPat ; +defm : MUBUFScratchLoadPat ; +defm : MUBUFScratchLoadPat ; +defm : MUBUFScratchLoadPat ; 
+defm : MUBUFScratchLoadPat ; // BUFFER_LOAD_DWORD*, addr64=0 multiclass MUBUF_Load_Dword ; defm : MUBUFStore_Pattern ; -class MUBUFScratchStorePat : Pat < - (st vt:$value, (MUBUFScratch v4i32:$srsrc, i32:$vaddr, i32:$soffset, - u16imm:$offset)), - (Instr $value, $vaddr, $srsrc, $soffset, $offset, 0, 0, 0) ->; +multiclass MUBUFScratchStorePat { + def : Pat < + (st vt:$value, (MUBUFScratchOffen v4i32:$srsrc, i32:$vaddr, + i32:$soffset, u16imm:$offset)), + (InstrOffen $value, $vaddr, $srsrc, $soffset, $offset, 0, 0, 0) + >; + + def : Pat < + (st vt:$value, (MUBUFScratchOffset v4i32:$srsrc, i32:$soffset, + u16imm:$offset)), + (InstrOffset $value, $srsrc, $soffset, $offset, 0, 0, 0) + >; +} -def : MUBUFScratchStorePat ; -def : MUBUFScratchStorePat ; -def : MUBUFScratchStorePat ; -def : MUBUFScratchStorePat ; -def : MUBUFScratchStorePat ; -def : MUBUFScratchStorePat ; -def : MUBUFScratchStorePat ; +defm : MUBUFScratchStorePat ; +defm : MUBUFScratchStorePat ; +defm : MUBUFScratchStorePat ; +defm : MUBUFScratchStorePat ; +defm : MUBUFScratchStorePat ; +defm : MUBUFScratchStorePat ; +defm : MUBUFScratchStorePat ; //===----------------------------------------------------------------------===// // MTBUF Patterns diff --git a/lib/Target/AMDGPU/GCNRegPressure.cpp b/lib/Target/AMDGPU/GCNRegPressure.cpp index 4ecfa11..bf16a82 100644 --- a/lib/Target/AMDGPU/GCNRegPressure.cpp +++ b/lib/Target/AMDGPU/GCNRegPressure.cpp @@ -83,8 +83,8 @@ unsigned GCNRegPressure::getRegKind(unsigned Reg, const auto RC = MRI.getRegClass(Reg); auto STI = static_cast(MRI.getTargetRegisterInfo()); return STI->isSGPRClass(RC) ? - (RC->getSize() == 4 ? SGPR32 : SGPR_TUPLE) : - (RC->getSize() == 4 ? VGPR32 : VGPR_TUPLE); + (STI->getRegSizeInBits(*RC) == 32 ? SGPR32 : SGPR_TUPLE) : + (STI->getRegSizeInBits(*RC) == 32 ? VGPR32 : VGPR_TUPLE); } void GCNRegPressure::inc(unsigned Reg, diff --git a/lib/Target/AMDGPU/MCTargetDesc/AMDGPUCodeObjectMetadataStreamer.cpp b/lib/Target/AMDGPU/MCTargetDesc/AMDGPUCodeObjectMetadataStreamer.cpp index 29a6ab9..647017d 100644 --- a/lib/Target/AMDGPU/MCTargetDesc/AMDGPUCodeObjectMetadataStreamer.cpp +++ b/lib/Target/AMDGPU/MCTargetDesc/AMDGPUCodeObjectMetadataStreamer.cpp @@ -286,20 +286,20 @@ ValueKind MetadataStreamer::getValueKind(Type *Ty, StringRef TypeQual, return ValueKind::Pipe; return StringSwitch(BaseTypeName) + .Case("image1d_t", ValueKind::Image) + .Case("image1d_array_t", ValueKind::Image) + .Case("image1d_buffer_t", ValueKind::Image) + .Case("image2d_t", ValueKind::Image) + .Case("image2d_array_t", ValueKind::Image) + .Case("image2d_array_depth_t", ValueKind::Image) + .Case("image2d_array_msaa_t", ValueKind::Image) + .Case("image2d_array_msaa_depth_t", ValueKind::Image) + .Case("image2d_depth_t", ValueKind::Image) + .Case("image2d_msaa_t", ValueKind::Image) + .Case("image2d_msaa_depth_t", ValueKind::Image) + .Case("image3d_t", ValueKind::Image) .Case("sampler_t", ValueKind::Sampler) .Case("queue_t", ValueKind::Queue) - .Cases("image1d_t", - "image1d_array_t", - "image1d_buffer_t", - "image2d_t" , - "image2d_array_t", - "image2d_array_depth_t", - "image2d_array_msaa_t" - "image2d_array_msaa_depth_t" - "image2d_depth_t", - "image2d_msaa_t", - "image2d_msaa_depth_t", - "image3d_t", ValueKind::Image) .Default(isa(Ty) ? (Ty->getPointerAddressSpace() == AMDGPUASI.LOCAL_ADDRESS ? 
diff --git a/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCAsmInfo.cpp b/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCAsmInfo.cpp index 6c61fb1..2364e7b 100644 --- a/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCAsmInfo.cpp +++ b/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCAsmInfo.cpp @@ -15,6 +15,7 @@ using namespace llvm; AMDGPUMCAsmInfo::AMDGPUMCAsmInfo(const Triple &TT) : MCAsmInfoELF() { CodePointerSize = (TT.getArch() == Triple::amdgcn) ? 8 : 4; + StackGrowsUp = true; HasSingleParameterDotFile = false; //===------------------------------------------------------------------===// MinInstAlignment = 4; diff --git a/lib/Target/AMDGPU/SIFixSGPRCopies.cpp b/lib/Target/AMDGPU/SIFixSGPRCopies.cpp index f9d258f..b0f0bf0 100644 --- a/lib/Target/AMDGPU/SIFixSGPRCopies.cpp +++ b/lib/Target/AMDGPU/SIFixSGPRCopies.cpp @@ -81,6 +81,11 @@ using namespace llvm; #define DEBUG_TYPE "si-fix-sgpr-copies" +static cl::opt EnableM0Merge( + "amdgpu-enable-merge-m0", + cl::desc("Merge and hoist M0 initializations"), + cl::init(false)); + namespace { class SIFixSGPRCopies : public MachineFunctionPass { @@ -108,7 +113,7 @@ public: INITIALIZE_PASS_BEGIN(SIFixSGPRCopies, DEBUG_TYPE, "SI Fix SGPR copies", false, false) -INITIALIZE_PASS_DEPENDENCY(MachinePostDominatorTree) +INITIALIZE_PASS_DEPENDENCY(MachineDominatorTree) INITIALIZE_PASS_END(SIFixSGPRCopies, DEBUG_TYPE, "SI Fix SGPR copies", false, false) @@ -332,27 +337,186 @@ static bool isSafeToFoldImmIntoCopy(const MachineInstr *Copy, return true; } -static bool predsHasDivergentTerminator(MachineBasicBlock *MBB, - const TargetRegisterInfo *TRI) { - DenseSet Visited; +template +bool searchPredecessors(const MachineBasicBlock *MBB, + const MachineBasicBlock *CutOff, + UnaryPredicate Predicate) { + + if (MBB == CutOff) + return false; + + DenseSet Visited; SmallVector Worklist(MBB->pred_begin(), MBB->pred_end()); while (!Worklist.empty()) { - MachineBasicBlock *mbb = Worklist.back(); - Worklist.pop_back(); + MachineBasicBlock *MBB = Worklist.pop_back_val(); - if (!Visited.insert(mbb).second) + if (!Visited.insert(MBB).second) continue; - if (hasTerminatorThatModifiesExec(*mbb, *TRI)) + if (MBB == CutOff) + continue; + if (Predicate(MBB)) return true; - Worklist.insert(Worklist.end(), mbb->pred_begin(), mbb->pred_end()); + Worklist.append(MBB->pred_begin(), MBB->pred_end()); } return false; } +static bool predsHasDivergentTerminator(MachineBasicBlock *MBB, + const TargetRegisterInfo *TRI) { + return searchPredecessors(MBB, nullptr, [TRI](MachineBasicBlock *MBB) { + return hasTerminatorThatModifiesExec(*MBB, *TRI); }); +} + +// Checks if there is potential path From instruction To instruction. +// If CutOff is specified and it sits in between of that path we ignore +// a higher portion of the path and report it is not reachable. +static bool isReachable(const MachineInstr *From, + const MachineInstr *To, + const MachineBasicBlock *CutOff, + MachineDominatorTree &MDT) { + // If either From block dominates To block or instructions are in the same + // block and From is higher. + if (MDT.dominates(From, To)) + return true; + + const MachineBasicBlock *MBBFrom = From->getParent(); + const MachineBasicBlock *MBBTo = To->getParent(); + if (MBBFrom == MBBTo) + return false; + + // Instructions are in different blocks, do predecessor search. + // We should almost never get here since we do not usually produce M0 stores + // other than -1. 
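// Illustrative sketch, not part of the upstream patch (plain STL, hypothetical
// graph encoding): the same worklist-over-predecessors search used by
// searchPredecessors() above, with a visited set and an optional cut-off node
// that truncates the walk. Preds[n] lists the predecessors of node n; pass an
// id that never occurs in the graph (e.g. -1) when no cut-off is wanted.
#include <set>
#include <vector>

static bool reachableViaPreds(const std::vector<std::vector<int>> &Preds,
                              int From, int To, int CutOff) {
  if (To == CutOff)
    return false;
  std::set<int> Visited;
  std::vector<int> Worklist(Preds[To].begin(), Preds[To].end());
  while (!Worklist.empty()) {
    int N = Worklist.back();
    Worklist.pop_back();
    if (!Visited.insert(N).second || N == CutOff)
      continue;                       // already seen, or walk cut off here
    if (N == From)
      return true;                    // found a path From -> ... -> To
    Worklist.insert(Worklist.end(), Preds[N].begin(), Preds[N].end());
  }
  return false;
}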
+ return searchPredecessors(MBBTo, CutOff, [MBBFrom] + (const MachineBasicBlock *MBB) { return MBB == MBBFrom; }); +} + +// Hoist and merge identical SGPR initializations into a common predecessor. +// This is intended to combine M0 initializations, but can work with any +// SGPR. A VGPR cannot be processed since we cannot guarantee vector +// executioon. +static bool hoistAndMergeSGPRInits(unsigned Reg, + const MachineRegisterInfo &MRI, + MachineDominatorTree &MDT) { + // List of inits by immediate value. + typedef std::map> InitListMap; + InitListMap Inits; + // List of clobbering instructions. + SmallVector Clobbers; + bool Changed = false; + + for (auto &MI : MRI.def_instructions(Reg)) { + MachineOperand *Imm = nullptr; + for (auto &MO: MI.operands()) { + if ((MO.isReg() && ((MO.isDef() && MO.getReg() != Reg) || !MO.isDef())) || + (!MO.isImm() && !MO.isReg()) || (MO.isImm() && Imm)) { + Imm = nullptr; + break; + } else if (MO.isImm()) + Imm = &MO; + } + if (Imm) + Inits[Imm->getImm()].push_front(&MI); + else + Clobbers.push_back(&MI); + } + + for (auto &Init : Inits) { + auto &Defs = Init.second; + + for (auto I1 = Defs.begin(), E = Defs.end(); I1 != E; ) { + MachineInstr *MI1 = *I1; + + for (auto I2 = std::next(I1); I2 != E; ) { + MachineInstr *MI2 = *I2; + + // Check any possible interference + auto intereferes = [&](MachineBasicBlock::iterator From, + MachineBasicBlock::iterator To) -> bool { + + assert(MDT.dominates(&*To, &*From)); + + auto interferes = [&MDT, From, To](MachineInstr* &Clobber) -> bool { + const MachineBasicBlock *MBBFrom = From->getParent(); + const MachineBasicBlock *MBBTo = To->getParent(); + bool MayClobberFrom = isReachable(Clobber, &*From, MBBTo, MDT); + bool MayClobberTo = isReachable(Clobber, &*To, MBBTo, MDT); + if (!MayClobberFrom && !MayClobberTo) + return false; + if ((MayClobberFrom && !MayClobberTo) || + (!MayClobberFrom && MayClobberTo)) + return true; + // Both can clobber, this is not an interference only if both are + // dominated by Clobber and belong to the same block or if Clobber + // properly dominates To, given that To >> From, so it dominates + // both and located in a common dominator. 
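[Editor's sketch, not part of the patch] The searchPredecessors helper and the isReachable/hoistAndMergeSGPRInits changes above all reduce to a backward worklist walk over block predecessors, with an optional cut-off block acting as a barrier. A minimal standalone version of that walk, written against a hypothetical Block type rather than LLVM's MachineBasicBlock, could look like this:

    #include <unordered_set>
    #include <vector>

    struct Block {
      std::vector<Block *> Preds; // hypothetical predecessor list
    };

    // Returns true if any transitive predecessor of Start satisfies Pred.
    // Blocks only reachable through CutOff are never explored, mirroring the
    // cut-off behaviour described in the hunk above.
    template <typename UnaryPredicate>
    bool anyPredecessorSatisfies(const Block *Start, const Block *CutOff,
                                 UnaryPredicate Pred) {
      if (Start == CutOff)
        return false;
      std::unordered_set<const Block *> Visited;
      std::vector<const Block *> Worklist(Start->Preds.begin(),
                                          Start->Preds.end());
      while (!Worklist.empty()) {
        const Block *B = Worklist.back();
        Worklist.pop_back();
        if (!Visited.insert(B).second || B == CutOff)
          continue;
        if (Pred(B))
          return true;
        Worklist.insert(Worklist.end(), B->Preds.begin(), B->Preds.end());
      }
      return false;
    }

The visited set bounds the walk on cyclic control flow; the cut-off check sits before the predicate so the barrier block itself is never reported.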
+ return !((MBBFrom == MBBTo && + MDT.dominates(Clobber, &*From) && + MDT.dominates(Clobber, &*To)) || + MDT.properlyDominates(Clobber->getParent(), MBBTo)); + }; + + return (any_of(Clobbers, interferes)) || + (any_of(Inits, [&](InitListMap::value_type &C) { + return C.first != Init.first && any_of(C.second, interferes); + })); + }; + + if (MDT.dominates(MI1, MI2)) { + if (!intereferes(MI2, MI1)) { + DEBUG(dbgs() << "Erasing from BB#" << MI2->getParent()->getNumber() + << " " << *MI2); + MI2->eraseFromParent(); + Defs.erase(I2++); + Changed = true; + continue; + } + } else if (MDT.dominates(MI2, MI1)) { + if (!intereferes(MI1, MI2)) { + DEBUG(dbgs() << "Erasing from BB#" << MI1->getParent()->getNumber() + << " " << *MI1); + MI1->eraseFromParent(); + Defs.erase(I1++); + Changed = true; + break; + } + } else { + auto *MBB = MDT.findNearestCommonDominator(MI1->getParent(), + MI2->getParent()); + if (!MBB) { + ++I2; + continue; + } + + MachineBasicBlock::iterator I = MBB->getFirstNonPHI(); + if (!intereferes(MI1, I) && !intereferes(MI2, I)) { + DEBUG(dbgs() << "Erasing from BB#" << MI1->getParent()->getNumber() + << " " << *MI1 << "and moving from BB#" + << MI2->getParent()->getNumber() << " to BB#" + << I->getParent()->getNumber() << " " << *MI2); + I->getParent()->splice(I, MI2->getParent(), MI2); + MI1->eraseFromParent(); + Defs.erase(I1++); + Changed = true; + break; + } + } + ++I2; + } + ++I1; + } + } + + if (Changed) + MRI.clearKillFlags(Reg); + + return Changed; +} + bool SIFixSGPRCopies::runOnMachineFunction(MachineFunction &MF) { const SISubtarget &ST = MF.getSubtarget(); MachineRegisterInfo &MRI = MF.getRegInfo(); @@ -485,5 +649,8 @@ bool SIFixSGPRCopies::runOnMachineFunction(MachineFunction &MF) { } } + if (MF.getTarget().getOptLevel() > CodeGenOpt::None && EnableM0Merge) + hoistAndMergeSGPRInits(AMDGPU::M0, MRI, *MDT); + return true; } diff --git a/lib/Target/AMDGPU/SIFrameLowering.cpp b/lib/Target/AMDGPU/SIFrameLowering.cpp index abe6af9..86e3b37 100644 --- a/lib/Target/AMDGPU/SIFrameLowering.cpp +++ b/lib/Target/AMDGPU/SIFrameLowering.cpp @@ -101,10 +101,12 @@ unsigned SIFrameLowering::getReservedPrivateSegmentBufferReg( const SIRegisterInfo *TRI, SIMachineFunctionInfo *MFI, MachineFunction &MF) const { + MachineRegisterInfo &MRI = MF.getRegInfo(); // We need to insert initialization of the scratch resource descriptor. unsigned ScratchRsrcReg = MFI->getScratchRSrcReg(); - if (ScratchRsrcReg == AMDGPU::NoRegister) + if (ScratchRsrcReg == AMDGPU::NoRegister || + !MRI.isPhysRegUsed(ScratchRsrcReg)) return AMDGPU::NoRegister; if (ST.hasSGPRInitBug() || @@ -122,8 +124,6 @@ unsigned SIFrameLowering::getReservedPrivateSegmentBufferReg( // We find the resource first because it has an alignment requirement. - MachineRegisterInfo &MRI = MF.getRegInfo(); - unsigned NumPreloaded = (MFI->getNumPreloadedSGPRs() + 3) / 4; ArrayRef AllSGPR128s = getAllSGPR128(ST, MF); AllSGPR128s = AllSGPR128s.slice(std::min(static_cast(AllSGPR128s.size()), NumPreloaded)); @@ -143,24 +143,34 @@ unsigned SIFrameLowering::getReservedPrivateSegmentBufferReg( return ScratchRsrcReg; } -unsigned SIFrameLowering::getReservedPrivateSegmentWaveByteOffsetReg( +// Shift down registers reserved for the scratch wave offset and stack pointer +// SGPRs. 
+std::pair +SIFrameLowering::getReservedPrivateSegmentWaveByteOffsetReg( const SISubtarget &ST, const SIInstrInfo *TII, const SIRegisterInfo *TRI, SIMachineFunctionInfo *MFI, MachineFunction &MF) const { + MachineRegisterInfo &MRI = MF.getRegInfo(); unsigned ScratchWaveOffsetReg = MFI->getScratchWaveOffsetReg(); - if (ST.hasSGPRInitBug() || - ScratchWaveOffsetReg != TRI->reservedPrivateSegmentWaveByteOffsetReg(MF)) - return ScratchWaveOffsetReg; - unsigned ScratchRsrcReg = MFI->getScratchRSrcReg(); - MachineRegisterInfo &MRI = MF.getRegInfo(); + // No replacement necessary. + if (ScratchWaveOffsetReg == AMDGPU::NoRegister || + !MRI.isPhysRegUsed(ScratchWaveOffsetReg)) { + assert(MFI->getStackPtrOffsetReg() == AMDGPU::NoRegister); + return std::make_pair(AMDGPU::NoRegister, AMDGPU::NoRegister); + } + + unsigned SPReg = MFI->getStackPtrOffsetReg(); + if (ST.hasSGPRInitBug()) + return std::make_pair(ScratchWaveOffsetReg, SPReg); + unsigned NumPreloaded = MFI->getNumPreloadedSGPRs(); ArrayRef AllSGPRs = getAllSGPRs(ST, MF); if (NumPreloaded > AllSGPRs.size()) - return ScratchWaveOffsetReg; + return std::make_pair(ScratchWaveOffsetReg, SPReg); AllSGPRs = AllSGPRs.slice(NumPreloaded); @@ -175,26 +185,42 @@ unsigned SIFrameLowering::getReservedPrivateSegmentWaveByteOffsetReg( // register from the list to consider, it means that when this // register is being used for the scratch wave offset and there // are no other free SGPRs, then the value will stay in this register. + // + 1 if stack pointer is used. // ---- - // 13 - if (AllSGPRs.size() < 13) - return ScratchWaveOffsetReg; + // 13 (+1) + unsigned ReservedRegCount = 13; + if (SPReg != AMDGPU::NoRegister) + ++ReservedRegCount; - for (MCPhysReg Reg : AllSGPRs.drop_back(13)) { + if (AllSGPRs.size() < ReservedRegCount) + return std::make_pair(ScratchWaveOffsetReg, SPReg); + + bool HandledScratchWaveOffsetReg = + ScratchWaveOffsetReg != TRI->reservedPrivateSegmentWaveByteOffsetReg(MF); + + for (MCPhysReg Reg : AllSGPRs.drop_back(ReservedRegCount)) { // Pick the first unallocated SGPR. Be careful not to pick an alias of the // scratch descriptor, since we haven’t added its uses yet. 
- if (!MRI.isPhysRegUsed(Reg)) { - if (!MRI.isAllocatable(Reg) || - TRI->isSubRegisterEq(ScratchRsrcReg, Reg)) - continue; + if (!MRI.isPhysRegUsed(Reg) && MRI.isAllocatable(Reg)) { + if (!HandledScratchWaveOffsetReg) { + HandledScratchWaveOffsetReg = true; - MRI.replaceRegWith(ScratchWaveOffsetReg, Reg); - MFI->setScratchWaveOffsetReg(Reg); - return Reg; + MRI.replaceRegWith(ScratchWaveOffsetReg, Reg); + MFI->setScratchWaveOffsetReg(Reg); + ScratchWaveOffsetReg = Reg; + } else { + if (SPReg == AMDGPU::NoRegister) + break; + + MRI.replaceRegWith(SPReg, Reg); + MFI->setStackPtrOffsetReg(Reg); + SPReg = Reg; + break; + } } } - return ScratchWaveOffsetReg; + return std::make_pair(ScratchWaveOffsetReg, SPReg); } void SIFrameLowering::emitPrologue(MachineFunction &MF, @@ -220,18 +246,6 @@ void SIFrameLowering::emitPrologue(MachineFunction &MF, const SIRegisterInfo *TRI = &TII->getRegisterInfo(); MachineRegisterInfo &MRI = MF.getRegInfo(); - unsigned ScratchRsrcReg - = getReservedPrivateSegmentBufferReg(ST, TII, TRI, MFI, MF); - unsigned ScratchWaveOffsetReg - = getReservedPrivateSegmentWaveByteOffsetReg(ST, TII, TRI, MFI, MF); - - if (ScratchRsrcReg == AMDGPU::NoRegister) { - assert(ScratchWaveOffsetReg == AMDGPU::NoRegister); - return; - } - - assert(!TRI->isSubRegister(ScratchRsrcReg, ScratchWaveOffsetReg)); - // We need to do the replacement of the private segment buffer and wave offset // register even if there are no stack objects. There could be stores to undef // or a constant without an associated object. @@ -244,19 +258,49 @@ void SIFrameLowering::emitPrologue(MachineFunction &MF, if (MF.getFrameInfo().hasStackObjects() && MFI->hasFlatScratchInit()) emitFlatScratchInit(ST, MF, MBB); + unsigned SPReg = MFI->getStackPtrOffsetReg(); + if (SPReg != AMDGPU::NoRegister) { + DebugLoc DL; + int64_t StackSize = MF.getFrameInfo().getStackSize(); + + if (StackSize == 0) { + BuildMI(MBB, MBB.begin(), DL, TII->get(AMDGPU::COPY), SPReg) + .addReg(MFI->getScratchWaveOffsetReg()); + } else { + BuildMI(MBB, MBB.begin(), DL, TII->get(AMDGPU::S_ADD_U32), SPReg) + .addReg(MFI->getScratchWaveOffsetReg()) + .addImm(StackSize * ST.getWavefrontSize()); + } + } + + unsigned ScratchRsrcReg + = getReservedPrivateSegmentBufferReg(ST, TII, TRI, MFI, MF); + + unsigned ScratchWaveOffsetReg; + std::tie(ScratchWaveOffsetReg, SPReg) + = getReservedPrivateSegmentWaveByteOffsetReg(ST, TII, TRI, MFI, MF); + + // It's possible to have uses of only ScratchWaveOffsetReg without + // ScratchRsrcReg if it's only used for the initialization of flat_scratch, + // but the inverse is not true. + if (ScratchWaveOffsetReg == AMDGPU::NoRegister) { + assert(ScratchRsrcReg == AMDGPU::NoRegister); + return; + } + // We need to insert initialization of the scratch resource descriptor. unsigned PreloadedScratchWaveOffsetReg = TRI->getPreloadedValue( MF, SIRegisterInfo::PRIVATE_SEGMENT_WAVE_BYTE_OFFSET); - unsigned PreloadedPrivateBufferReg = AMDGPU::NoRegister; if (ST.isAmdCodeObjectV2(MF) || ST.isMesaGfxShader(MF)) { PreloadedPrivateBufferReg = TRI->getPreloadedValue( MF, SIRegisterInfo::PRIVATE_SEGMENT_BUFFER); } - bool OffsetRegUsed = !MRI.use_empty(ScratchWaveOffsetReg); - bool ResourceRegUsed = !MRI.use_empty(ScratchRsrcReg); + bool OffsetRegUsed = MRI.isPhysRegUsed(ScratchWaveOffsetReg); + bool ResourceRegUsed = ScratchRsrcReg != AMDGPU::NoRegister && + MRI.isPhysRegUsed(ScratchRsrcReg); // We added live-ins during argument lowering, but since they were not used // they were deleted. We're adding the uses now, so add them back. 
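[Editor's sketch, not part of the patch] The prologue hunk above seeds the new stack pointer SGPR from the scratch wave offset and, for a non-empty frame, bumps it by the frame size scaled by the wavefront size. Reading that multiplication as "per-lane frame size scaled up to the whole wave" is an inference, and the 64-lane default below is an assumption; the real value comes from the subtarget:

    #include <cstdint>

    // Plain-arithmetic analogue of the SP initialization in the prologue hunk.
    uint64_t initialStackPointer(uint64_t ScratchWaveOffset,
                                 uint64_t FrameSizePerLane,
                                 unsigned WavefrontSize = 64) {
      if (FrameSizePerLane == 0)
        return ScratchWaveOffset;          // SP is a plain copy of the offset
      return ScratchWaveOffset + FrameSizePerLane * WavefrontSize;
    }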
@@ -469,7 +513,7 @@ void SIFrameLowering::processFunctionBeforeFrameFinalized( // this also ensures we shouldn't need a register for the offset when // emergency scavenging. int ScavengeFI = MFI.CreateFixedObject( - AMDGPU::SGPR_32RegClass.getSize(), 0, false); + TRI.getSpillSize(AMDGPU::SGPR_32RegClass), 0, false); RS->addScavengingFrameIndex(ScavengeFI); } } diff --git a/lib/Target/AMDGPU/SIFrameLowering.h b/lib/Target/AMDGPU/SIFrameLowering.h index 1bfc080..7ccd02b 100644 --- a/lib/Target/AMDGPU/SIFrameLowering.h +++ b/lib/Target/AMDGPU/SIFrameLowering.h @@ -49,7 +49,7 @@ private: SIMachineFunctionInfo *MFI, MachineFunction &MF) const; - unsigned getReservedPrivateSegmentWaveByteOffsetReg( + std::pair getReservedPrivateSegmentWaveByteOffsetReg( const SISubtarget &ST, const SIInstrInfo *TII, const SIRegisterInfo *TRI, diff --git a/lib/Target/AMDGPU/SIISelLowering.cpp b/lib/Target/AMDGPU/SIISelLowering.cpp index dd867b1..ce74a7c 100644 --- a/lib/Target/AMDGPU/SIISelLowering.cpp +++ b/lib/Target/AMDGPU/SIISelLowering.cpp @@ -287,8 +287,8 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM, // On SI this is s_memtime and s_memrealtime on VI. setOperationAction(ISD::READCYCLECOUNTER, MVT::i64, Legal); - setOperationAction(ISD::TRAP, MVT::Other, Legal); - setOperationAction(ISD::DEBUGTRAP, MVT::Other, Legal); + setOperationAction(ISD::TRAP, MVT::Other, Custom); + setOperationAction(ISD::DEBUGTRAP, MVT::Other, Custom); setOperationAction(ISD::FMINNUM, MVT::f64, Legal); setOperationAction(ISD::FMAXNUM, MVT::f64, Legal); @@ -1644,7 +1644,7 @@ computeIndirectRegAndOffset(const SIRegisterInfo &TRI, const TargetRegisterClass *SuperRC, unsigned VecReg, int Offset) { - int NumElts = SuperRC->getSize() / 4; + int NumElts = TRI.getRegSizeInBits(*SuperRC) / 32; // Skip out of bounds offsets, or else we would end up using an undefined // register. 
@@ -1793,17 +1793,18 @@ static MachineBasicBlock *emitIndirectSrc(MachineInstr &MI, return LoopBB; } -static unsigned getMOVRELDPseudo(const TargetRegisterClass *VecRC) { - switch (VecRC->getSize()) { - case 4: +static unsigned getMOVRELDPseudo(const SIRegisterInfo &TRI, + const TargetRegisterClass *VecRC) { + switch (TRI.getRegSizeInBits(*VecRC)) { + case 32: // 4 bytes return AMDGPU::V_MOVRELD_B32_V1; - case 8: + case 64: // 8 bytes return AMDGPU::V_MOVRELD_B32_V2; - case 16: + case 128: // 16 bytes return AMDGPU::V_MOVRELD_B32_V4; - case 32: + case 256: // 32 bytes return AMDGPU::V_MOVRELD_B32_V8; - case 64: + case 512: // 64 bytes return AMDGPU::V_MOVRELD_B32_V16; default: llvm_unreachable("unsupported size for MOVRELD pseudos"); @@ -1863,7 +1864,7 @@ static MachineBasicBlock *emitIndirectDst(MachineInstr &MI, BuildMI(MBB, I, DL, TII->get(AMDGPU::S_SET_GPR_IDX_OFF)); } else { - const MCInstrDesc &MovRelDesc = TII->get(getMOVRELDPseudo(VecRC)); + const MCInstrDesc &MovRelDesc = TII->get(getMOVRELDPseudo(TRI, VecRC)); BuildMI(MBB, I, DL, MovRelDesc) .addReg(Dst, RegState::Define) @@ -1907,7 +1908,7 @@ static MachineBasicBlock *emitIndirectDst(MachineInstr &MI, .addReg(PhiReg, RegState::Implicit) .addReg(AMDGPU::M0, RegState::Implicit); } else { - const MCInstrDesc &MovRelDesc = TII->get(getMOVRELDPseudo(VecRC)); + const MCInstrDesc &MovRelDesc = TII->get(getMOVRELDPseudo(TRI, VecRC)); BuildMI(*LoopBB, InsPt, DL, MovRelDesc) .addReg(Dst, RegState::Define) @@ -1948,50 +1949,6 @@ MachineBasicBlock *SITargetLowering::EmitInstrWithCustomInserter( } switch (MI.getOpcode()) { - case AMDGPU::S_TRAP_PSEUDO: { - const DebugLoc &DL = MI.getDebugLoc(); - const int TrapType = MI.getOperand(0).getImm(); - - if (Subtarget->getTrapHandlerAbi() == SISubtarget::TrapHandlerAbiHsa && - Subtarget->isTrapHandlerEnabled()) { - - MachineFunction *MF = BB->getParent(); - SIMachineFunctionInfo *Info = MF->getInfo(); - unsigned UserSGPR = Info->getQueuePtrUserSGPR(); - assert(UserSGPR != AMDGPU::NoRegister); - - if (!BB->isLiveIn(UserSGPR)) - BB->addLiveIn(UserSGPR); - - BuildMI(*BB, MI, DL, TII->get(AMDGPU::COPY), AMDGPU::SGPR0_SGPR1) - .addReg(UserSGPR); - BuildMI(*BB, MI, DL, TII->get(AMDGPU::S_TRAP)) - .addImm(TrapType) - .addReg(AMDGPU::SGPR0_SGPR1, RegState::Implicit); - } else { - switch (TrapType) { - case SISubtarget::TrapIDLLVMTrap: - BuildMI(*BB, MI, DL, TII->get(AMDGPU::S_ENDPGM)); - break; - case SISubtarget::TrapIDLLVMDebugTrap: { - DiagnosticInfoUnsupported NoTrap(*MF->getFunction(), - "debugtrap handler not supported", - DL, - DS_Warning); - LLVMContext &C = MF->getFunction()->getContext(); - C.diagnose(NoTrap); - BuildMI(*BB, MI, DL, TII->get(AMDGPU::S_NOP)) - .addImm(0); - break; - } - default: - llvm_unreachable("unsupported trap handler type!"); - } - } - - MI.eraseFromParent(); - return BB; - } case AMDGPU::SI_INIT_M0: BuildMI(*BB, MI.getIterator(), MI.getDebugLoc(), TII->get(AMDGPU::S_MOV_B32), AMDGPU::M0) @@ -2163,6 +2120,10 @@ SDValue SITargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const { return lowerEXTRACT_VECTOR_ELT(Op, DAG); case ISD::FP_ROUND: return lowerFP_ROUND(Op, DAG); + + case ISD::TRAP: + case ISD::DEBUGTRAP: + return lowerTRAP(Op, DAG); } return SDValue(); } @@ -2431,6 +2392,57 @@ SDValue SITargetLowering::lowerFP_ROUND(SDValue Op, SelectionDAG &DAG) const { return DAG.getNode(ISD::BITCAST, DL, MVT::f16, Trunc);; } +SDValue SITargetLowering::lowerTRAP(SDValue Op, SelectionDAG &DAG) const { + SDLoc SL(Op); + MachineFunction &MF = DAG.getMachineFunction(); + 
SDValue Chain = Op.getOperand(0); + + unsigned TrapID = Op.getOpcode() == ISD::DEBUGTRAP ? + SISubtarget::TrapIDLLVMDebugTrap : SISubtarget::TrapIDLLVMTrap; + + if (Subtarget->getTrapHandlerAbi() == SISubtarget::TrapHandlerAbiHsa && + Subtarget->isTrapHandlerEnabled()) { + SIMachineFunctionInfo *Info = MF.getInfo(); + unsigned UserSGPR = Info->getQueuePtrUserSGPR(); + assert(UserSGPR != AMDGPU::NoRegister); + + SDValue QueuePtr = CreateLiveInRegister( + DAG, &AMDGPU::SReg_64RegClass, UserSGPR, MVT::i64); + + SDValue SGPR01 = DAG.getRegister(AMDGPU::SGPR0_SGPR1, MVT::i64); + + SDValue ToReg = DAG.getCopyToReg(Chain, SL, SGPR01, + QueuePtr, SDValue()); + + SDValue Ops[] = { + ToReg, + DAG.getTargetConstant(TrapID, SL, MVT::i16), + SGPR01, + ToReg.getValue(1) + }; + + return DAG.getNode(AMDGPUISD::TRAP, SL, MVT::Other, Ops); + } + + switch (TrapID) { + case SISubtarget::TrapIDLLVMTrap: + return DAG.getNode(AMDGPUISD::ENDPGM, SL, MVT::Other, Chain); + case SISubtarget::TrapIDLLVMDebugTrap: { + DiagnosticInfoUnsupported NoTrap(*MF.getFunction(), + "debugtrap handler not supported", + Op.getDebugLoc(), + DS_Warning); + LLVMContext &Ctx = MF.getFunction()->getContext(); + Ctx.diagnose(NoTrap); + return Chain; + } + default: + llvm_unreachable("unsupported trap handler type!"); + } + + return Chain; +} + SDValue SITargetLowering::getSegmentAperture(unsigned AS, const SDLoc &DL, SelectionDAG &DAG) const { // FIXME: Use inline constants (src_{shared, private}_base) instead. @@ -3410,9 +3422,11 @@ SDValue SITargetLowering::lowerFastUnsafeFDIV(SDValue Op, EVT VT = Op.getValueType(); bool Unsafe = DAG.getTarget().Options.UnsafeFPMath; + if (!Unsafe && VT == MVT::f32 && Subtarget->hasFP32Denormals()) + return SDValue(); + if (const ConstantFPSDNode *CLHS = dyn_cast(LHS)) { - if (Unsafe || (VT == MVT::f32 && !Subtarget->hasFP32Denormals()) || - VT == MVT::f16) { + if (Unsafe || VT == MVT::f32 || VT == MVT::f16) { if (CLHS->isExactlyValue(1.0)) { // v_rcp_f32 and v_rsq_f32 do not support denormals, and according to // the CI documentation has a worst case error of 1 ulp. @@ -4696,7 +4710,7 @@ SDValue SITargetLowering::performCvtF32UByteNCombine(SDNode *N, TargetLowering::TargetLoweringOpt TLO(DAG, !DCI.isBeforeLegalize(), !DCI.isBeforeLegalizeOps()); const TargetLowering &TLI = DAG.getTargetLoweringInfo(); - if (TLO.ShrinkDemandedConstant(Src, Demanded) || + if (TLI.ShrinkDemandedConstant(Src, Demanded, TLO) || TLI.SimplifyDemandedBits(Src, Demanded, KnownZero, KnownOne, TLO)) { DCI.CommitTargetLoweringOpt(TLO); } diff --git a/lib/Target/AMDGPU/SIInsertWaitcnts.cpp b/lib/Target/AMDGPU/SIInsertWaitcnts.cpp index c2a3e62..9122cd7 100644 --- a/lib/Target/AMDGPU/SIInsertWaitcnts.cpp +++ b/lib/Target/AMDGPU/SIInsertWaitcnts.cpp @@ -428,8 +428,8 @@ RegInterval BlockWaitcntBrackets::getRegInterval(const MachineInstr *MI, const MachineInstr &MIA = *MI; const TargetRegisterClass *RC = TII->getOpRegClass(MIA, OpNo); - unsigned Size = RC->getSize(); - Result.second = Result.first + (Size / 4); + unsigned Size = TRI->getRegSizeInBits(*RC); + Result.second = Result.first + (Size / 32); return Result; } diff --git a/lib/Target/AMDGPU/SIInsertWaits.cpp b/lib/Target/AMDGPU/SIInsertWaits.cpp index 47257ce..9f32ecf 100644 --- a/lib/Target/AMDGPU/SIInsertWaits.cpp +++ b/lib/Target/AMDGPU/SIInsertWaits.cpp @@ -216,8 +216,8 @@ Counters SIInsertWaits::getHwCounts(MachineInstr &MI) { // XXX - What if this is a write into a super register? 
const TargetRegisterClass *RC = TII->getOpRegClass(MI, 0); - unsigned Size = RC->getSize(); - Result.Named.LGKM = Size > 4 ? 2 : 1; + unsigned Size = TRI->getRegSizeInBits(*RC); + Result.Named.LGKM = Size > 32 ? 2 : 1; } else { // s_dcache_inv etc. do not have a a destination register. Assume we // want a wait on these. @@ -289,12 +289,12 @@ bool SIInsertWaits::isOpRelevant(MachineOperand &Op) { RegInterval SIInsertWaits::getRegInterval(const TargetRegisterClass *RC, const MachineOperand &Reg) const { - unsigned Size = RC->getSize(); - assert(Size >= 4); + unsigned Size = TRI->getRegSizeInBits(*RC); + assert(Size >= 32); RegInterval Result; Result.first = TRI->getEncodingValue(Reg.getReg()); - Result.second = Result.first + Size / 4; + Result.second = Result.first + Size / 32; return Result; } diff --git a/lib/Target/AMDGPU/SIInstrInfo.cpp b/lib/Target/AMDGPU/SIInstrInfo.cpp index 05ac67d..92e452a 100644 --- a/lib/Target/AMDGPU/SIInstrInfo.cpp +++ b/lib/Target/AMDGPU/SIInstrInfo.cpp @@ -138,6 +138,11 @@ bool SIInstrInfo::areLoadsFromSameBasePtr(SDNode *Load0, SDNode *Load1, } if (isSMRD(Opc0) && isSMRD(Opc1)) { + // Skip time and cache invalidation instructions. + if (AMDGPU::getNamedOperandIdx(Opc0, AMDGPU::OpName::sbase) == -1 || + AMDGPU::getNamedOperandIdx(Opc1, AMDGPU::OpName::sbase) == -1) + return false; + assert(getNumOperandsNoGlue(Load0) == getNumOperandsNoGlue(Load1)); // Check base reg. @@ -245,11 +250,11 @@ bool SIInstrInfo::getMemOpBaseRegImmOfs(MachineInstr &LdSt, unsigned &BaseReg, unsigned EltSize; if (LdSt.mayLoad()) - EltSize = getOpRegClass(LdSt, 0)->getSize() / 2; + EltSize = TRI->getRegSizeInBits(*getOpRegClass(LdSt, 0)) / 16; else { assert(LdSt.mayStore()); int Data0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::data0); - EltSize = getOpRegClass(LdSt, Data0Idx)->getSize(); + EltSize = TRI->getRegSizeInBits(*getOpRegClass(LdSt, Data0Idx)) / 8; } if (isStride64(Opc)) @@ -345,7 +350,7 @@ bool SIInstrInfo::shouldClusterMemOps(MachineInstr &FirstLdSt, FirstLdSt.getParent()->getParent()->getRegInfo(); const TargetRegisterClass *DstRC = MRI.getRegClass(FirstDst->getReg()); - return (NumLoads * DstRC->getSize()) <= LoadClusterThreshold; + return (NumLoads * (RI.getRegSizeInBits(*DstRC) / 8)) <= LoadClusterThreshold; } static void reportIllegalCopy(const SIInstrInfo *TII, MachineBasicBlock &MBB, @@ -433,7 +438,7 @@ void SIInstrInfo::copyPhysReg(MachineBasicBlock &MBB, unsigned EltSize = 4; unsigned Opcode = AMDGPU::V_MOV_B32_e32; if (RI.isSGPRClass(RC)) { - if (RC->getSize() > 4) { + if (RI.getRegSizeInBits(*RC) > 32) { Opcode = AMDGPU::S_MOV_B64; EltSize = 8; } else { @@ -493,11 +498,11 @@ int SIInstrInfo::commuteOpcode(unsigned Opcode) const { unsigned SIInstrInfo::getMovOpcode(const TargetRegisterClass *DstRC) const { - if (DstRC->getSize() == 4) { + if (RI.getRegSizeInBits(*DstRC) == 32) { return RI.isSGPRClass(DstRC) ? 
AMDGPU::S_MOV_B32 : AMDGPU::V_MOV_B32_e32; - } else if (DstRC->getSize() == 8 && RI.isSGPRClass(DstRC)) { + } else if (RI.getRegSizeInBits(*DstRC) == 64 && RI.isSGPRClass(DstRC)) { return AMDGPU::S_MOV_B64; - } else if (DstRC->getSize() == 8 && !RI.isSGPRClass(DstRC)) { + } else if (RI.getRegSizeInBits(*DstRC) == 64 && !RI.isSGPRClass(DstRC)) { return AMDGPU::V_MOV_B64_PSEUDO; } return AMDGPU::COPY; @@ -557,17 +562,18 @@ void SIInstrInfo::storeRegToStackSlot(MachineBasicBlock &MBB, MachineMemOperand *MMO = MF->getMachineMemOperand(PtrInfo, MachineMemOperand::MOStore, Size, Align); + unsigned SpillSize = TRI->getSpillSize(*RC); if (RI.isSGPRClass(RC)) { MFI->setHasSpilledSGPRs(); // We are only allowed to create one new instruction when spilling // registers, so we need to use pseudo instruction for spilling SGPRs. - const MCInstrDesc &OpDesc = get(getSGPRSpillSaveOpcode(RC->getSize())); + const MCInstrDesc &OpDesc = get(getSGPRSpillSaveOpcode(SpillSize)); // The SGPR spill/restore instructions only work on number sgprs, so we need // to make sure we are using the correct register class. - if (TargetRegisterInfo::isVirtualRegister(SrcReg) && RC->getSize() == 4) { + if (TargetRegisterInfo::isVirtualRegister(SrcReg) && SpillSize == 4) { MachineRegisterInfo &MRI = MF->getRegInfo(); MRI.constrainRegClass(SrcReg, &AMDGPU::SReg_32_XM0RegClass); } @@ -602,7 +608,7 @@ void SIInstrInfo::storeRegToStackSlot(MachineBasicBlock &MBB, assert(RI.hasVGPRs(RC) && "Only VGPR spilling expected"); - unsigned Opcode = getVGPRSpillSaveOpcode(RC->getSize()); + unsigned Opcode = getVGPRSpillSaveOpcode(SpillSize); MFI->setHasSpilledVGPRs(); BuildMI(MBB, MI, DL, get(Opcode)) .addReg(SrcReg, getKillRegState(isKill)) // data @@ -660,6 +666,7 @@ void SIInstrInfo::loadRegFromStackSlot(MachineBasicBlock &MBB, DebugLoc DL = MBB.findDebugLoc(MI); unsigned Align = FrameInfo.getObjectAlignment(FrameIndex); unsigned Size = FrameInfo.getObjectSize(FrameIndex); + unsigned SpillSize = TRI->getSpillSize(*RC); MachinePointerInfo PtrInfo = MachinePointerInfo::getFixedStack(*MF, FrameIndex); @@ -670,8 +677,8 @@ void SIInstrInfo::loadRegFromStackSlot(MachineBasicBlock &MBB, if (RI.isSGPRClass(RC)) { // FIXME: Maybe this should not include a memoperand because it will be // lowered to non-memory instructions. - const MCInstrDesc &OpDesc = get(getSGPRSpillRestoreOpcode(RC->getSize())); - if (TargetRegisterInfo::isVirtualRegister(DestReg) && RC->getSize() == 4) { + const MCInstrDesc &OpDesc = get(getSGPRSpillRestoreOpcode(SpillSize)); + if (TargetRegisterInfo::isVirtualRegister(DestReg) && SpillSize == 4) { MachineRegisterInfo &MRI = MF->getRegInfo(); MRI.constrainRegClass(DestReg, &AMDGPU::SReg_32_XM0RegClass); } @@ -701,7 +708,7 @@ void SIInstrInfo::loadRegFromStackSlot(MachineBasicBlock &MBB, assert(RI.hasVGPRs(RC) && "Only VGPR spilling expected"); - unsigned Opcode = getVGPRSpillRestoreOpcode(RC->getSize()); + unsigned Opcode = getVGPRSpillRestoreOpcode(SpillSize); BuildMI(MBB, MI, DL, get(Opcode), DestReg) .addFrameIndex(FrameIndex) // vaddr .addReg(MFI->getScratchRSrcReg()) // scratch_rsrc @@ -1440,9 +1447,9 @@ void SIInstrInfo::insertSelect(MachineBasicBlock &MBB, MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo(); const TargetRegisterClass *DstRC = MRI.getRegClass(DstReg); - unsigned DstSize = DstRC->getSize(); + unsigned DstSize = RI.getRegSizeInBits(*DstRC); - if (DstSize == 4) { + if (DstSize == 32) { unsigned SelOp = Pred == SCC_TRUE ? 
AMDGPU::S_CSELECT_B32 : AMDGPU::V_CNDMASK_B32_e32; @@ -1456,7 +1463,7 @@ void SIInstrInfo::insertSelect(MachineBasicBlock &MBB, return; } - if (DstSize == 8 && Pred == SCC_TRUE) { + if (DstSize == 64 && Pred == SCC_TRUE) { MachineInstr *Select = BuildMI(MBB, I, DL, get(AMDGPU::S_CSELECT_B64), DstReg) .addReg(FalseReg) @@ -1483,7 +1490,7 @@ void SIInstrInfo::insertSelect(MachineBasicBlock &MBB, unsigned SelOp = AMDGPU::V_CNDMASK_B32_e32; const TargetRegisterClass *EltRC = &AMDGPU::VGPR_32RegClass; const int16_t *SubIndices = Sub0_15; - int NElts = DstSize / 4; + int NElts = DstSize / 32; // 64-bit select is only avaialble for SALU. if (Pred == SCC_TRUE) { @@ -2635,6 +2642,19 @@ void SIInstrInfo::legalizeOperandsVOP2(MachineRegisterInfo &MRI, if (isLegalRegOperand(MRI, InstrDesc.OpInfo[Src1Idx], Src1)) return; + // Special case: V_READLANE_B32 accepts only immediate or SGPR operands for + // lane select. Fix up using V_READFIRSTLANE, since we assume that the lane + // select is uniform. + if (Opc == AMDGPU::V_READLANE_B32 && Src1.isReg() && + RI.isVGPR(MRI, Src1.getReg())) { + unsigned Reg = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass); + const DebugLoc &DL = MI.getDebugLoc(); + BuildMI(*MI.getParent(), MI, DL, get(AMDGPU::V_READFIRSTLANE_B32), Reg) + .add(Src1); + Src1.ChangeToRegister(Reg, false); + return; + } + // We do not use commuteInstruction here because it is too aggressive and will // commute if it is possible. We only want to commute here if it improves // legality. This can be called a fairly large number of times so don't waste @@ -2729,7 +2749,7 @@ unsigned SIInstrInfo::readlaneVGPRToSGPR(unsigned SrcReg, MachineInstr &UseMI, const TargetRegisterClass *VRC = MRI.getRegClass(SrcReg); const TargetRegisterClass *SRC = RI.getEquivalentSGPRClass(VRC); unsigned DstReg = MRI.createVirtualRegister(SRC); - unsigned SubRegs = VRC->getSize() / 4; + unsigned SubRegs = RI.getRegSizeInBits(*VRC) / 32; SmallVector SRegs; for (unsigned i = 0; i < SubRegs; ++i) { @@ -3595,7 +3615,7 @@ void SIInstrInfo::movePackToVALU(SmallVectorImpl &Worklist, .addImm(16) .add(Src0); BuildMI(*MBB, Inst, DL, get(AMDGPU::V_MOV_B32_e32), ImmReg) - .addImm(0xffff); + .addImm(0xffff0000); BuildMI(*MBB, Inst, DL, get(AMDGPU::V_AND_OR_B32), ResultReg) .add(Src1) .addReg(ImmReg, RegState::Kill) diff --git a/lib/Target/AMDGPU/SIInstrInfo.h b/lib/Target/AMDGPU/SIInstrInfo.h index 659473c..03a5ef7 100644 --- a/lib/Target/AMDGPU/SIInstrInfo.h +++ b/lib/Target/AMDGPU/SIInstrInfo.h @@ -626,13 +626,13 @@ public: return 4; } - return RI.getRegClass(OpInfo.RegClass)->getSize(); + return RI.getRegSizeInBits(*RI.getRegClass(OpInfo.RegClass)) / 8; } /// \brief This form should usually be preferred since it handles operands /// with unknown register classes. 
unsigned getOpSize(const MachineInstr &MI, unsigned OpNo) const { - return getOpRegClass(MI, OpNo)->getSize(); + return RI.getRegSizeInBits(*getOpRegClass(MI, OpNo)) / 8; } /// \returns true if it is legal for the operand at index \p OpNo diff --git a/lib/Target/AMDGPU/SIInstrInfo.td b/lib/Target/AMDGPU/SIInstrInfo.td index c6daf74..7b05284 100644 --- a/lib/Target/AMDGPU/SIInstrInfo.td +++ b/lib/Target/AMDGPU/SIInstrInfo.td @@ -646,11 +646,10 @@ def DS64Bit4ByteAligned : ComplexPattern; def MOVRELOffset : ComplexPattern; def VOP3Mods0 : ComplexPattern; -def VOP3NoMods0 : ComplexPattern; def VOP3Mods0Clamp : ComplexPattern; def VOP3Mods0Clamp0OMod : ComplexPattern; def VOP3Mods : ComplexPattern; -def VOP3NoMods : ComplexPattern; +def VOP3NoMods : ComplexPattern; // VOP3Mods, but the input source is known to never be NaN. def VOP3Mods_nnan : ComplexPattern; diff --git a/lib/Target/AMDGPU/SIInstructions.td b/lib/Target/AMDGPU/SIInstructions.td index 2f89503..3f6ddec 100644 --- a/lib/Target/AMDGPU/SIInstructions.td +++ b/lib/Target/AMDGPU/SIInstructions.td @@ -94,6 +94,12 @@ defm V_INTERP_MOV_F32 : VINTRP_m < //===----------------------------------------------------------------------===// // Pseudo Instructions //===----------------------------------------------------------------------===// +def ATOMIC_FENCE : SPseudoInstSI< + (outs), (ins i32imm:$ordering, i32imm:$scope), + [(atomic_fence (i32 imm:$ordering), (i32 imm:$scope))], + "ATOMIC_FENCE $ordering, $scope"> { + let hasSideEffects = 1; +} let hasSideEffects = 0, mayLoad = 0, mayStore = 0, Uses = [EXEC] in { @@ -111,12 +117,6 @@ def V_MOV_B64_PSEUDO : VPseudoInstSI <(outs VReg_64:$vdst), (ins VSrc_b64:$src0)>; } // End let hasSideEffects = 0, mayLoad = 0, mayStore = 0, Uses = [EXEC] -def S_TRAP_PSEUDO : SPseudoInstSI <(outs), (ins i16imm:$simm16)> { - let hasSideEffects = 1; - let SALU = 1; - let usesCustomInserter = 1; -} - let usesCustomInserter = 1, SALU = 1 in { def GET_GROUPSTATICSIZE : PseudoInstSI <(outs SReg_32:$sdst), (ins), [(set SReg_32:$sdst, (int_amdgcn_groupstaticsize))]>; @@ -400,13 +400,8 @@ def SI_PC_ADD_REL_OFFSET : SPseudoInstSI < let Predicates = [isGCN] in { def : Pat< - (trap), - (S_TRAP_PSEUDO TRAPID.LLVM_TRAP) ->; - -def : Pat< - (debugtrap), - (S_TRAP_PSEUDO TRAPID.LLVM_DEBUG_TRAP) + (AMDGPUtrap timm:$trapid), + (S_TRAP $trapid) >; def : Pat< @@ -477,8 +472,8 @@ def : Pat < // fp_to_fp16 patterns def : Pat < - (i32 (AMDGPUfp_to_f16 (f32 (VOP3Mods0 f32:$src0, i32:$src0_modifiers, i1:$clamp, i32:$omod)))), - (V_CVT_F16_F32_e64 $src0_modifiers, f32:$src0, $clamp, $omod) + (i32 (AMDGPUfp_to_f16 (f32 (VOP3Mods f32:$src0, i32:$src0_modifiers)))), + (V_CVT_F16_F32_e64 $src0_modifiers, f32:$src0, DSTCLAMP.NONE, DSTOMOD.NONE) >; def : Pat < @@ -507,11 +502,11 @@ def : Pat < multiclass FMADPat { def : Pat < - (vt (fmad (VOP3NoMods0 vt:$src0, i32:$src0_modifiers, i1:$clamp, i32:$omod), - (VOP3NoMods vt:$src1, i32:$src1_modifiers), - (VOP3NoMods vt:$src2, i32:$src2_modifiers))), - (inst $src0_modifiers, $src0, $src1_modifiers, $src1, - $src2_modifiers, $src2, $clamp, $omod) + (vt (fmad (VOP3NoMods vt:$src0), + (VOP3NoMods vt:$src1), + (VOP3NoMods vt:$src2))), + (inst SRCMODS.NONE, $src0, SRCMODS.NONE, $src1, + SRCMODS.NONE, $src2, DSTCLAMP.NONE, DSTOMOD.NONE) >; } @@ -681,10 +676,9 @@ def : BitConvert ; // If denormals are not enabled, it only impacts the compare of the // inputs. The output result is not flushed. 
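[Editor's sketch, not part of the patch] A recurring pattern in the AMDGPU hunks above is the switch from byte-based TargetRegisterClass::getSize() to bit-based TargetRegisterInfo::getRegSizeInBits(): divisions by 4 become divisions by 32, and byte counts are recovered with a division by 8. The arithmetic equivalence is just:

    #include <cassert>

    // Old form: sizes in bytes, so dwords = bytes / 4.
    unsigned dwordsFromBytes(unsigned SizeInBytes) { return SizeInBytes / 4; }

    // New form: sizes in bits, so dwords = bits / 32 and bytes = bits / 8.
    unsigned dwordsFromBits(unsigned SizeInBits) {
      assert(SizeInBits % 32 == 0 && "register sizes here are whole dwords");
      return SizeInBits / 32;
    }
    unsigned bytesFromBits(unsigned SizeInBits) { return SizeInBits / 8; }

Both forms yield the same subregister count, e.g. a 128-bit class gives 4 dwords either way, which is why the hunks are mechanical substitutions.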
class ClampPat : Pat < - (vt (AMDGPUclamp - (VOP3Mods0Clamp vt:$src0, i32:$src0_modifiers, i32:$omod))), + (vt (AMDGPUclamp (VOP3Mods vt:$src0, i32:$src0_modifiers))), (inst i32:$src0_modifiers, vt:$src0, - i32:$src0_modifiers, vt:$src0, DSTCLAMP.ENABLE, $omod) + i32:$src0_modifiers, vt:$src0, DSTCLAMP.ENABLE, DSTOMOD.NONE) >; def : ClampPat; diff --git a/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp b/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp index 8e612d2..b6a982a 100644 --- a/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp +++ b/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp @@ -25,6 +25,8 @@ SIMachineFunctionInfo::SIMachineFunctionInfo(const MachineFunction &MF) TIDReg(AMDGPU::NoRegister), ScratchRSrcReg(AMDGPU::NoRegister), ScratchWaveOffsetReg(AMDGPU::NoRegister), + FrameOffsetReg(AMDGPU::NoRegister), + StackPtrOffsetReg(AMDGPU::NoRegister), PrivateSegmentBufferUserSGPR(AMDGPU::NoRegister), DispatchPtrUserSGPR(AMDGPU::NoRegister), QueuePtrUserSGPR(AMDGPU::NoRegister), diff --git a/lib/Target/AMDGPU/SIMachineFunctionInfo.h b/lib/Target/AMDGPU/SIMachineFunctionInfo.h index 810fb05..dc9f509 100644 --- a/lib/Target/AMDGPU/SIMachineFunctionInfo.h +++ b/lib/Target/AMDGPU/SIMachineFunctionInfo.h @@ -88,6 +88,14 @@ class SIMachineFunctionInfo final : public AMDGPUMachineFunction { unsigned ScratchRSrcReg; unsigned ScratchWaveOffsetReg; + // This is the current function's incremented size from the kernel's scratch + // wave offset register. For an entry function, this is exactly the same as + // the ScratchWaveOffsetReg. + unsigned FrameOffsetReg; + + // Top of the stack SGPR offset derived from the ScratchWaveOffsetReg. + unsigned StackPtrOffsetReg; + // Input registers for non-HSA ABI unsigned PrivateMemoryPtrUserSGPR; @@ -364,9 +372,25 @@ public: return ScratchWaveOffsetReg; } + unsigned getFrameOffsetReg() const { + return FrameOffsetReg; + } + + void setStackPtrOffsetReg(unsigned Reg) { + assert(Reg != AMDGPU::NoRegister && "Should never be unset"); + StackPtrOffsetReg = Reg; + } + + unsigned getStackPtrOffsetReg() const { + return StackPtrOffsetReg; + } + void setScratchWaveOffsetReg(unsigned Reg) { assert(Reg != AMDGPU::NoRegister && "Should never be unset"); ScratchWaveOffsetReg = Reg; + + // FIXME: Only for entry functions. + FrameOffsetReg = ScratchWaveOffsetReg; } unsigned getQueuePtrUserSGPR() const { diff --git a/lib/Target/AMDGPU/SIRegisterInfo.cpp b/lib/Target/AMDGPU/SIRegisterInfo.cpp index 098c672..8820e29 100644 --- a/lib/Target/AMDGPU/SIRegisterInfo.cpp +++ b/lib/Target/AMDGPU/SIRegisterInfo.cpp @@ -146,6 +146,9 @@ BitVector SIRegisterInfo::getReservedRegs(const MachineFunction &MF) const { reserveRegisterTuples(Reserved, AMDGPU::EXEC); reserveRegisterTuples(Reserved, AMDGPU::FLAT_SCR); + // M0 has to be reserved so that llvm accepts it as a live-in into a block. + reserveRegisterTuples(Reserved, AMDGPU::M0); + // Reserve the memory aperture registers. reserveRegisterTuples(Reserved, AMDGPU::SRC_SHARED_BASE); reserveRegisterTuples(Reserved, AMDGPU::SRC_SHARED_LIMIT); @@ -615,7 +618,8 @@ bool SIRegisterInfo::spillSGPR(MachineBasicBlock::iterator MI, if (SpillToSMEM && isSGPRClass(RC)) { // XXX - if private_element_size is larger than 4 it might be useful to be // able to spill wider vmem spills. 
- std::tie(EltSize, ScalarStoreOp) = getSpillEltSize(RC->getSize(), true); + std::tie(EltSize, ScalarStoreOp) = + getSpillEltSize(getRegSizeInBits(*RC) / 8, true); } ArrayRef SplitParts = getRegSplitParts(RC, EltSize); @@ -775,7 +779,8 @@ bool SIRegisterInfo::restoreSGPR(MachineBasicBlock::iterator MI, if (SpillToSMEM && isSGPRClass(RC)) { // XXX - if private_element_size is larger than 4 it might be useful to be // able to spill wider vmem spills. - std::tie(EltSize, ScalarLoadOp) = getSpillEltSize(RC->getSize(), false); + std::tie(EltSize, ScalarLoadOp) = + getSpillEltSize(getRegSizeInBits(*RC) / 8, false); } ArrayRef SplitParts = getRegSplitParts(RC, EltSize); @@ -1038,20 +1043,21 @@ const TargetRegisterClass *SIRegisterInfo::getPhysRegClass(unsigned Reg) const { // TODO: It might be helpful to have some target specific flags in // TargetRegisterClass to mark which classes are VGPRs to make this trivial. bool SIRegisterInfo::hasVGPRs(const TargetRegisterClass *RC) const { - switch (RC->getSize()) { - case 0: return false; - case 1: return false; - case 4: + unsigned Size = getRegSizeInBits(*RC); + if (Size < 32) + return false; + switch (Size) { + case 32: return getCommonSubClass(&AMDGPU::VGPR_32RegClass, RC) != nullptr; - case 8: + case 64: return getCommonSubClass(&AMDGPU::VReg_64RegClass, RC) != nullptr; - case 12: + case 96: return getCommonSubClass(&AMDGPU::VReg_96RegClass, RC) != nullptr; - case 16: + case 128: return getCommonSubClass(&AMDGPU::VReg_128RegClass, RC) != nullptr; - case 32: + case 256: return getCommonSubClass(&AMDGPU::VReg_256RegClass, RC) != nullptr; - case 64: + case 512: return getCommonSubClass(&AMDGPU::VReg_512RegClass, RC) != nullptr; default: llvm_unreachable("Invalid register class size"); @@ -1060,18 +1066,18 @@ bool SIRegisterInfo::hasVGPRs(const TargetRegisterClass *RC) const { const TargetRegisterClass *SIRegisterInfo::getEquivalentVGPRClass( const TargetRegisterClass *SRC) const { - switch (SRC->getSize()) { - case 4: + switch (getRegSizeInBits(*SRC)) { + case 32: return &AMDGPU::VGPR_32RegClass; - case 8: + case 64: return &AMDGPU::VReg_64RegClass; - case 12: + case 96: return &AMDGPU::VReg_96RegClass; - case 16: + case 128: return &AMDGPU::VReg_128RegClass; - case 32: + case 256: return &AMDGPU::VReg_256RegClass; - case 64: + case 512: return &AMDGPU::VReg_512RegClass; default: llvm_unreachable("Invalid register class size"); @@ -1080,16 +1086,16 @@ const TargetRegisterClass *SIRegisterInfo::getEquivalentVGPRClass( const TargetRegisterClass *SIRegisterInfo::getEquivalentSGPRClass( const TargetRegisterClass *VRC) const { - switch (VRC->getSize()) { - case 4: + switch (getRegSizeInBits(*VRC)) { + case 32: return &AMDGPU::SGPR_32RegClass; - case 8: + case 64: return &AMDGPU::SReg_64RegClass; - case 16: + case 128: return &AMDGPU::SReg_128RegClass; - case 32: + case 256: return &AMDGPU::SReg_256RegClass; - case 64: + case 512: return &AMDGPU::SReg_512RegClass; default: llvm_unreachable("Invalid register class size"); @@ -1354,15 +1360,15 @@ bool SIRegisterInfo::shouldCoalesce(MachineInstr *MI, const TargetRegisterClass *DstRC, unsigned DstSubReg, const TargetRegisterClass *NewRC) const { - unsigned SrcSize = SrcRC->getSize(); - unsigned DstSize = DstRC->getSize(); - unsigned NewSize = NewRC->getSize(); + unsigned SrcSize = getRegSizeInBits(*SrcRC); + unsigned DstSize = getRegSizeInBits(*DstRC); + unsigned NewSize = getRegSizeInBits(*NewRC); // Do not increase size of registers beyond dword, we would need to allocate // adjacent registers and constraint 
regalloc more than needed. // Always allow dword coalescing. - if (SrcSize <= 4 || DstSize <= 4) + if (SrcSize <= 32 || DstSize <= 32) return true; return NewSize <= DstSize || NewSize <= SrcSize; diff --git a/lib/Target/AMDGPU/SOPInstructions.td b/lib/Target/AMDGPU/SOPInstructions.td index b4adbdd..593439c 100644 --- a/lib/Target/AMDGPU/SOPInstructions.td +++ b/lib/Target/AMDGPU/SOPInstructions.td @@ -530,14 +530,16 @@ class SOPKInstTable { class SOPK_32 pattern=[]> : SOPK_Pseudo < opName, (outs SReg_32:$sdst), - (ins u16imm:$simm16), + (ins s16imm:$simm16), "$sdst, $simm16", pattern>; -class SOPK_SCC : SOPK_Pseudo < +class SOPK_SCC : SOPK_Pseudo < opName, (outs), - (ins SReg_32:$sdst, u16imm:$simm16), + !if(isSignExt, + (ins SReg_32:$sdst, s16imm:$simm16), + (ins SReg_32:$sdst, u16imm:$simm16)), "$sdst, $simm16", []>, SOPKInstTable<1, base_op>{ let Defs = [SCC]; @@ -546,7 +548,7 @@ class SOPK_SCC : SOPK_Pseudo < class SOPK_32TIE pattern=[]> : SOPK_Pseudo < opName, (outs SReg_32:$sdst), - (ins SReg_32:$src0, u16imm:$simm16), + (ins SReg_32:$src0, s16imm:$simm16), "$sdst, $simm16", pattern >; @@ -575,20 +577,20 @@ let isCompare = 1 in { // [(set i1:$dst, (setcc i32:$src0, imm:$src1, SETEQ))] // >; -def S_CMPK_EQ_I32 : SOPK_SCC <"s_cmpk_eq_i32", "s_cmp_eq_i32">; -def S_CMPK_LG_I32 : SOPK_SCC <"s_cmpk_lg_i32", "s_cmp_lg_i32">; -def S_CMPK_GT_I32 : SOPK_SCC <"s_cmpk_gt_i32", "s_cmp_gt_i32">; -def S_CMPK_GE_I32 : SOPK_SCC <"s_cmpk_ge_i32", "s_cmp_ge_i32">; -def S_CMPK_LT_I32 : SOPK_SCC <"s_cmpk_lt_i32", "s_cmp_lt_i32">; -def S_CMPK_LE_I32 : SOPK_SCC <"s_cmpk_le_i32", "s_cmp_le_i32">; +def S_CMPK_EQ_I32 : SOPK_SCC <"s_cmpk_eq_i32", "s_cmp_eq_i32", 1>; +def S_CMPK_LG_I32 : SOPK_SCC <"s_cmpk_lg_i32", "s_cmp_lg_i32", 1>; +def S_CMPK_GT_I32 : SOPK_SCC <"s_cmpk_gt_i32", "s_cmp_gt_i32", 1>; +def S_CMPK_GE_I32 : SOPK_SCC <"s_cmpk_ge_i32", "s_cmp_ge_i32", 1>; +def S_CMPK_LT_I32 : SOPK_SCC <"s_cmpk_lt_i32", "s_cmp_lt_i32", 1>; +def S_CMPK_LE_I32 : SOPK_SCC <"s_cmpk_le_i32", "s_cmp_le_i32", 1>; let SOPKZext = 1 in { -def S_CMPK_EQ_U32 : SOPK_SCC <"s_cmpk_eq_u32", "s_cmp_eq_u32">; -def S_CMPK_LG_U32 : SOPK_SCC <"s_cmpk_lg_u32", "s_cmp_lg_u32">; -def S_CMPK_GT_U32 : SOPK_SCC <"s_cmpk_gt_u32", "s_cmp_gt_u32">; -def S_CMPK_GE_U32 : SOPK_SCC <"s_cmpk_ge_u32", "s_cmp_ge_u32">; -def S_CMPK_LT_U32 : SOPK_SCC <"s_cmpk_lt_u32", "s_cmp_lt_u32">; -def S_CMPK_LE_U32 : SOPK_SCC <"s_cmpk_le_u32", "s_cmp_le_u32">; +def S_CMPK_EQ_U32 : SOPK_SCC <"s_cmpk_eq_u32", "s_cmp_eq_u32", 0>; +def S_CMPK_LG_U32 : SOPK_SCC <"s_cmpk_lg_u32", "s_cmp_lg_u32", 0>; +def S_CMPK_GT_U32 : SOPK_SCC <"s_cmpk_gt_u32", "s_cmp_gt_u32", 0>; +def S_CMPK_GE_U32 : SOPK_SCC <"s_cmpk_ge_u32", "s_cmp_ge_u32", 0>; +def S_CMPK_LT_U32 : SOPK_SCC <"s_cmpk_lt_u32", "s_cmp_lt_u32", 0>; +def S_CMPK_LE_U32 : SOPK_SCC <"s_cmpk_le_u32", "s_cmp_le_u32", 0>; } // End SOPKZext = 1 } // End isCompare = 1 @@ -600,7 +602,7 @@ let Defs = [SCC], isCommutable = 1, DisableEncoding = "$src0", def S_CBRANCH_I_FORK : SOPK_Pseudo < "s_cbranch_i_fork", - (outs), (ins SReg_64:$sdst, u16imm:$simm16), + (outs), (ins SReg_64:$sdst, s16imm:$simm16), "$sdst, $simm16" >; diff --git a/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp b/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp index 86095a8..5a3242b 100644 --- a/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp +++ b/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp @@ -93,6 +93,12 @@ unsigned getVmcntBitWidthHi() { return 2; } } // end namespace anonymous namespace llvm { + +static cl::opt EnablePackedInlinableLiterals( + 
"enable-packed-inlinable-literals", + cl::desc("Enable packed inlinable literals (v2f16, v2i16)"), + cl::init(false)); + namespace AMDGPU { namespace IsaInfo { @@ -703,6 +709,9 @@ bool isInlinableLiteral16(int16_t Literal, bool HasInv2Pi) { bool isInlinableLiteralV216(int32_t Literal, bool HasInv2Pi) { assert(HasInv2Pi); + if (!EnablePackedInlinableLiterals) + return false; + int16_t Lo16 = static_cast(Literal); int16_t Hi16 = static_cast(Literal >> 16); return Lo16 == Hi16 && isInlinableLiteral16(Lo16, HasInv2Pi); diff --git a/lib/Target/ARM/ARMBaseInstrInfo.cpp b/lib/Target/ARM/ARMBaseInstrInfo.cpp index 4f5711c..5c9d589 100644 --- a/lib/Target/ARM/ARMBaseInstrInfo.cpp +++ b/lib/Target/ARM/ARMBaseInstrInfo.cpp @@ -905,7 +905,7 @@ storeRegToStackSlot(MachineBasicBlock &MBB, MachineBasicBlock::iterator I, MachinePointerInfo::getFixedStack(MF, FI), MachineMemOperand::MOStore, MFI.getObjectSize(FI), Align); - switch (RC->getSize()) { + switch (TRI->getSpillSize(*RC)) { case 4: if (ARM::GPRRegClass.hasSubClassEq(RC)) { BuildMI(MBB, I, DL, get(ARM::STRi12)) @@ -1103,7 +1103,7 @@ loadRegFromStackSlot(MachineBasicBlock &MBB, MachineBasicBlock::iterator I, MachinePointerInfo::getFixedStack(MF, FI), MachineMemOperand::MOLoad, MFI.getObjectSize(FI), Align); - switch (RC->getSize()) { + switch (TRI->getSpillSize(*RC)) { case 4: if (ARM::GPRRegClass.hasSubClassEq(RC)) { BuildMI(MBB, I, DL, get(ARM::LDRi12), DestReg) diff --git a/lib/Target/ARM/ARMBaseInstrInfo.h b/lib/Target/ARM/ARMBaseInstrInfo.h index faf1c63..28c407f 100644 --- a/lib/Target/ARM/ARMBaseInstrInfo.h +++ b/lib/Target/ARM/ARMBaseInstrInfo.h @@ -105,10 +105,6 @@ public: // Return whether the target has an explicit NOP encoding. bool hasNOP() const; - virtual void getNoopForElfTarget(MCInst &NopInst) const { - getNoopForMachoTarget(NopInst); - } - // Return the non-pre/post incrementing version of 'Opc'. Return 0 // if there is not such an opcode. virtual unsigned getUnindexedOpcode(unsigned Opc) const = 0; diff --git a/lib/Target/ARM/ARMBaseRegisterInfo.cpp b/lib/Target/ARM/ARMBaseRegisterInfo.cpp index 70a44ea..a208875 100644 --- a/lib/Target/ARM/ARMBaseRegisterInfo.cpp +++ b/lib/Target/ARM/ARMBaseRegisterInfo.cpp @@ -806,7 +806,8 @@ bool ARMBaseRegisterInfo::shouldCoalesce(MachineInstr *MI, if (!DstSubReg) return true; // Small registers don't frequently cause a problem, so we can coalesce them. 
- if (NewRC->getSize() < 32 && DstRC->getSize() < 32 && SrcRC->getSize() < 32) + if (getRegSizeInBits(*NewRC) < 256 && getRegSizeInBits(*DstRC) < 256 && + getRegSizeInBits(*SrcRC) < 256) return true; auto NewRCWeight = diff --git a/lib/Target/ARM/ARMCallLowering.cpp b/lib/Target/ARM/ARMCallLowering.cpp index 94b317a..13fb307 100644 --- a/lib/Target/ARM/ARMCallLowering.cpp +++ b/lib/Target/ARM/ARMCallLowering.cpp @@ -35,7 +35,8 @@ ARMCallLowering::ARMCallLowering(const ARMTargetLowering &TLI) static bool isSupportedType(const DataLayout &DL, const ARMTargetLowering &TLI, Type *T) { EVT VT = TLI.getValueType(DL, T, true); - if (!VT.isSimple() || VT.isVector()) + if (!VT.isSimple() || VT.isVector() || + !(VT.isInteger() || VT.isFloatingPoint())) return false; unsigned VTSize = VT.getSimpleVT().getSizeInBits(); diff --git a/lib/Target/ARM/ARMExpandPseudoInsts.cpp b/lib/Target/ARM/ARMExpandPseudoInsts.cpp index e0aecff..78a9144 100644 --- a/lib/Target/ARM/ARMExpandPseudoInsts.cpp +++ b/lib/Target/ARM/ARMExpandPseudoInsts.cpp @@ -661,7 +661,6 @@ static bool IsAnAddressOperand(const MachineOperand &MO) { return false; case MachineOperand::MO_IntrinsicID: case MachineOperand::MO_Predicate: - case MachineOperand::MO_Placeholder: llvm_unreachable("should not exist post-isel"); } llvm_unreachable("unhandled machine operand type"); diff --git a/lib/Target/ARM/ARMFrameLowering.cpp b/lib/Target/ARM/ARMFrameLowering.cpp index 70dbe1b..4f7a0ab 100644 --- a/lib/Target/ARM/ARMFrameLowering.cpp +++ b/lib/Target/ARM/ARMFrameLowering.cpp @@ -1960,10 +1960,10 @@ void ARMFrameLowering::determineCalleeSaves(MachineFunction &MF, // note: Thumb1 functions spill to R12, not the stack. Reserve a slot // closest to SP or frame pointer. assert(RS && "Register scavenging not provided"); - const TargetRegisterClass *RC = &ARM::GPRRegClass; - RS->addScavengingFrameIndex(MFI.CreateStackObject(RC->getSize(), - RC->getAlignment(), - false)); + const TargetRegisterClass &RC = ARM::GPRRegClass; + unsigned Size = TRI->getSpillSize(RC); + unsigned Align = TRI->getSpillAlignment(RC); + RS->addScavengingFrameIndex(MFI.CreateStackObject(Size, Align, false)); } } } diff --git a/lib/Target/ARM/ARMISelLowering.cpp b/lib/Target/ARM/ARMISelLowering.cpp index 165e9b7..382f881 100644 --- a/lib/Target/ARM/ARMISelLowering.cpp +++ b/lib/Target/ARM/ARMISelLowering.cpp @@ -3358,8 +3358,12 @@ ARMTargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG, static SDValue LowerATOMIC_FENCE(SDValue Op, SelectionDAG &DAG, const ARMSubtarget *Subtarget) { - // FIXME: handle "fence singlethread" more efficiently. SDLoc dl(Op); + ConstantSDNode *ScopeN = cast(Op.getOperand(2)); + auto Scope = static_cast(ScopeN->getZExtValue()); + if (Scope == SynchronizationScope::SingleThread) + return Op; + if (!Subtarget->hasDataBarrier()) { // Some ARMv6 cpus can support data barriers with an mcr instruction. // Thumb1 and pre-v6 ARM mode use a libcall instead and should never get @@ -9476,8 +9480,11 @@ AddCombineBUILD_VECTORToVPADDL(SDNode *N, SDValue N0, SDValue N1, return SDValue(); } - // Don't generate vpaddl+vmovn; we'll match it to vpadd later. - if (Vec.getValueType().getVectorElementType() == VT.getVectorElementType()) + // Don't generate vpaddl+vmovn; we'll match it to vpadd later. Also make sure + // we're using the entire input vector, otherwise there's a size/legality + // mismatch somewhere. 
+ if (nextIndex != Vec.getValueType().getVectorNumElements() || + Vec.getValueType().getVectorElementType() == VT.getVectorElementType()) return SDValue(); // Create VPADDL node. diff --git a/lib/Target/ARM/ARMInstrInfo.cpp b/lib/Target/ARM/ARMInstrInfo.cpp index 3b3606e..a0e2ac4 100644 --- a/lib/Target/ARM/ARMInstrInfo.cpp +++ b/lib/Target/ARM/ARMInstrInfo.cpp @@ -32,8 +32,8 @@ using namespace llvm; ARMInstrInfo::ARMInstrInfo(const ARMSubtarget &STI) : ARMBaseInstrInfo(STI), RI() {} -/// getNoopForMachoTarget - Return the noop instruction to use for a noop. -void ARMInstrInfo::getNoopForMachoTarget(MCInst &NopInst) const { +/// Return the noop instruction to use for a noop. +void ARMInstrInfo::getNoop(MCInst &NopInst) const { if (hasNOP()) { NopInst.setOpcode(ARM::HINT); NopInst.addOperand(MCOperand::createImm(0)); diff --git a/lib/Target/ARM/ARMInstrInfo.h b/lib/Target/ARM/ARMInstrInfo.h index 4b1b709..c87fb97 100644 --- a/lib/Target/ARM/ARMInstrInfo.h +++ b/lib/Target/ARM/ARMInstrInfo.h @@ -25,8 +25,8 @@ class ARMInstrInfo : public ARMBaseInstrInfo { public: explicit ARMInstrInfo(const ARMSubtarget &STI); - /// getNoopForMachoTarget - Return the noop instruction to use for a noop. - void getNoopForMachoTarget(MCInst &NopInst) const override; + /// Return the noop instruction to use for a noop. + void getNoop(MCInst &NopInst) const override; // Return the non-pre/post incrementing version of 'Opc'. Return 0 // if there is not such an opcode. diff --git a/lib/Target/ARM/ARMInstrInfo.td b/lib/Target/ARM/ARMInstrInfo.td index 703e807..9d8ee5c 100644 --- a/lib/Target/ARM/ARMInstrInfo.td +++ b/lib/Target/ARM/ARMInstrInfo.td @@ -5975,3 +5975,10 @@ def CMP_SWAP_64 : PseudoInst<(outs GPRPair:$Rd, GPR:$status), (ins GPR:$addr, GPRPair:$desired, GPRPair:$new), NoItinerary, []>, Sched<[]>; } + +def CompilerBarrier : PseudoInst<(outs), (ins i32imm:$ordering), NoItinerary, + [(atomic_fence imm:$ordering, 0)]> { + let hasSideEffects = 1; + let Size = 0; + let AsmString = "@ COMPILER BARRIER"; +} diff --git a/lib/Target/ARM/ARMInstrThumb.td b/lib/Target/ARM/ARMInstrThumb.td index f2f426e..8048c75 100644 --- a/lib/Target/ARM/ARMInstrThumb.td +++ b/lib/Target/ARM/ARMInstrThumb.td @@ -953,7 +953,7 @@ let isAdd = 1 in { /// These opcodes will be converted to the real non-S opcodes by /// AdjustInstrPostInstrSelection after giving then an optional CPSR operand. let hasPostISelHook = 1, Defs = [CPSR] in { - let isCommutable = 1 in + let isCommutable = 1, Uses = [CPSR] in def tADCS : tPseudoInst<(outs tGPR:$Rdn), (ins tGPR:$Rn, tGPR:$Rm), 2, IIC_iALUr, [(set tGPR:$Rdn, CPSR, (ARMadde tGPR:$Rn, tGPR:$Rm, @@ -1292,6 +1292,7 @@ def tSUBrr : // A8.6.212 /// These opcodes will be converted to the real non-S opcodes by /// AdjustInstrPostInstrSelection after giving then an optional CPSR operand. 
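[Editor's sketch, not part of the patch] The two ARM fence changes above (the early return for single-thread scope in LowerATOMIC_FENCE and the zero-size CompilerBarrier pseudo) draw the usual line between a compiler-only barrier and a hardware barrier. The same distinction expressed with the standard C++ fence primitives, independent of anything in this patch:

    #include <atomic>

    // Compiler-only barrier: restricts compiler reordering, emits no
    // instruction, analogous to the zero-size CompilerBarrier pseudo.
    void compilerOnlyFence() {
      std::atomic_signal_fence(std::memory_order_seq_cst);
    }

    // Cross-thread barrier: on ARM this typically lowers to a dmb instruction.
    void hardwareFence() {
      std::atomic_thread_fence(std::memory_order_seq_cst);
    }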
let hasPostISelHook = 1, Defs = [CPSR] in { + let Uses = [CPSR] in def tSBCS : tPseudoInst<(outs tGPR:$Rdn), (ins tGPR:$Rn, tGPR:$Rm), 2, IIC_iALUr, [(set tGPR:$Rdn, CPSR, (ARMsube tGPR:$Rn, tGPR:$Rm, diff --git a/lib/Target/ARM/ARMInstructionSelector.cpp b/lib/Target/ARM/ARMInstructionSelector.cpp index 816596b..1c13d51 100644 --- a/lib/Target/ARM/ARMInstructionSelector.cpp +++ b/lib/Target/ARM/ARMInstructionSelector.cpp @@ -47,12 +47,9 @@ static bool selectCopy(MachineInstr &I, const TargetInstrInfo &TII, unsigned SrcReg = I.getOperand(1).getReg(); const unsigned SrcSize = RBI.getSizeInBits(SrcReg, MRI, TRI); (void)SrcSize; - assert((DstSize == SrcSize || - // Copies are a means to setup initial types, the number of - // bits may not exactly match. - (TargetRegisterInfo::isPhysicalRegister(SrcReg) && - DstSize <= SrcSize)) && - "Copy with different width?!"); + // We use copies for trunc, so it's ok for the size of the destination to be + // smaller (the higher bits will just be undefined). + assert(DstSize <= SrcSize && "Copy with different width?!"); assert((RegBank->getID() == ARM::GPRRegBankID || RegBank->getID() == ARM::FPRRegBankID) && @@ -294,6 +291,28 @@ bool ARMInstructionSelector::select(MachineInstr &I) const { } break; } + case G_TRUNC: { + // The high bits are undefined, so there's nothing special to do, just + // treat it as a copy. + auto SrcReg = I.getOperand(1).getReg(); + auto DstReg = I.getOperand(0).getReg(); + + const auto &SrcRegBank = *RBI.getRegBank(SrcReg, MRI, TRI); + const auto &DstRegBank = *RBI.getRegBank(DstReg, MRI, TRI); + + if (SrcRegBank.getID() != DstRegBank.getID()) { + DEBUG(dbgs() << "G_TRUNC operands on different register banks\n"); + return false; + } + + if (SrcRegBank.getID() != ARM::GPRRegBankID) { + DEBUG(dbgs() << "G_TRUNC on non-GPR not supported yet\n"); + return false; + } + + I.setDesc(TII.get(COPY)); + return selectCopy(I, TII, MRI, TRI, RBI); + } case G_ADD: case G_GEP: I.setDesc(TII.get(ARM::ADDrr)); @@ -313,6 +332,16 @@ bool ARMInstructionSelector::select(MachineInstr &I) const { } MIB.add(predOps(ARMCC::AL)).add(condCodeOp()); break; + case G_SDIV: + assert(TII.getSubtarget().hasDivideInARMMode() && "Unsupported operation"); + I.setDesc(TII.get(ARM::SDIV)); + MIB.add(predOps(ARMCC::AL)); + break; + case G_UDIV: + assert(TII.getSubtarget().hasDivideInARMMode() && "Unsupported operation"); + I.setDesc(TII.get(ARM::UDIV)); + MIB.add(predOps(ARMCC::AL)); + break; case G_FADD: if (!selectFAdd(MIB, TII, MRI)) return false; @@ -332,6 +361,18 @@ bool ARMInstructionSelector::select(MachineInstr &I) const { "Expected constant to live in a GPR"); I.setDesc(TII.get(ARM::MOVi)); MIB.add(predOps(ARMCC::AL)).add(condCodeOp()); + + auto &Val = I.getOperand(1); + if (Val.isCImm()) { + if (Val.getCImm()->getBitWidth() > 32) + return false; + Val.ChangeToImmediate(Val.getCImm()->getZExtValue()); + } + + if (!Val.isImm()) { + return false; + } + break; } case G_STORE: diff --git a/lib/Target/ARM/ARMLegalizerInfo.cpp b/lib/Target/ARM/ARMLegalizerInfo.cpp index fe96814..9b86030 100644 --- a/lib/Target/ARM/ARMLegalizerInfo.cpp +++ b/lib/Target/ARM/ARMLegalizerInfo.cpp @@ -13,6 +13,8 @@ #include "ARMLegalizerInfo.h" #include "ARMSubtarget.h" +#include "llvm/CodeGen/GlobalISel/LegalizerHelper.h" +#include "llvm/CodeGen/MachineRegisterInfo.h" #include "llvm/CodeGen/ValueTypes.h" #include "llvm/IR/DerivedTypes.h" #include "llvm/IR/Type.h" @@ -47,6 +49,18 @@ ARMLegalizerInfo::ARMLegalizerInfo(const ARMSubtarget &ST) { for (auto Ty : {s1, s8, s16, s32}) 
setAction({Op, Ty}, Legal); + for (unsigned Op : {G_SDIV, G_UDIV}) { + for (auto Ty : {s8, s16}) + // FIXME: We need WidenScalar here, but in the case of targets with + // software division we'll also need Libcall afterwards. Treat as Custom + // until we have better support for chaining legalization actions. + setAction({Op, Ty}, Custom); + if (ST.hasDivideInARMMode()) + setAction({Op, s32}, Legal); + else + setAction({Op, s32}, Libcall); + } + for (unsigned Op : {G_SEXT, G_ZEXT}) { setAction({Op, s32}, Legal); for (auto Ty : {s1, s8, s16}) @@ -75,3 +89,48 @@ ARMLegalizerInfo::ARMLegalizerInfo(const ARMSubtarget &ST) { computeTables(); } + +bool ARMLegalizerInfo::legalizeCustom(MachineInstr &MI, + MachineRegisterInfo &MRI, + MachineIRBuilder &MIRBuilder) const { + using namespace TargetOpcode; + + switch (MI.getOpcode()) { + default: + return false; + case G_SDIV: + case G_UDIV: { + LLT Ty = MRI.getType(MI.getOperand(0).getReg()); + if (Ty != LLT::scalar(16) && Ty != LLT::scalar(8)) + return false; + + // We need to widen to 32 bits and then maybe, if the target requires, + // transform into a libcall. + LegalizerHelper Helper(MIRBuilder.getMF()); + + MachineInstr *NewMI = nullptr; + Helper.MIRBuilder.recordInsertions([&](MachineInstr *MI) { + // Store the new, 32-bit div instruction. + if (MI->getOpcode() == G_SDIV || MI->getOpcode() == G_UDIV) + NewMI = MI; + }); + + auto Result = Helper.widenScalar(MI, 0, LLT::scalar(32)); + Helper.MIRBuilder.stopRecordingInsertions(); + if (Result == LegalizerHelper::UnableToLegalize) { + return false; + } + assert(NewMI && "Couldn't find widened instruction"); + assert((NewMI->getOpcode() == G_SDIV || NewMI->getOpcode() == G_UDIV) && + "Unexpected widened instruction"); + assert(MRI.getType(NewMI->getOperand(0).getReg()).getSizeInBits() == 32 && + "Unexpected type for the widened instruction"); + + Result = Helper.legalizeInstrStep(*NewMI); + if (Result == LegalizerHelper::UnableToLegalize) { + return false; + } + return true; + } + } +} diff --git a/lib/Target/ARM/ARMLegalizerInfo.h b/lib/Target/ARM/ARMLegalizerInfo.h index 0b8a608..a9bdd36 100644 --- a/lib/Target/ARM/ARMLegalizerInfo.h +++ b/lib/Target/ARM/ARMLegalizerInfo.h @@ -24,6 +24,9 @@ class ARMSubtarget; class ARMLegalizerInfo : public LegalizerInfo { public: ARMLegalizerInfo(const ARMSubtarget &ST); + + bool legalizeCustom(MachineInstr &MI, MachineRegisterInfo &MRI, + MachineIRBuilder &MIRBuilder) const override; }; } // End llvm namespace. 
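[Editor's sketch, not part of the patch] The custom legalization above widens an 8- or 16-bit G_SDIV/G_UDIV to 32 bits before either keeping a hardware divide or falling back to a libcall. How the operands are extended (sign-extension for the signed form, zero-extension for the unsigned form) is inferred from the opcodes rather than spelled out in the hunk; the net value computation is:

    #include <cstdint>

    // Signed 16-bit divide carried out in 32 bits, result truncated back.
    int16_t sdiv16Widened(int16_t A, int16_t B) {
      int32_t Wide = static_cast<int32_t>(A) / static_cast<int32_t>(B);
      return static_cast<int16_t>(Wide);
    }

    // Unsigned 16-bit divide carried out in 32 bits, result truncated back.
    uint16_t udiv16Widened(uint16_t A, uint16_t B) {
      uint32_t Wide = static_cast<uint32_t>(A) / static_cast<uint32_t>(B);
      return static_cast<uint16_t>(Wide);
    }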
#endif diff --git a/lib/Target/ARM/ARMMCInstLower.cpp b/lib/Target/ARM/ARMMCInstLower.cpp index 0fd9826..9e9c1ba 100644 --- a/lib/Target/ARM/ARMMCInstLower.cpp +++ b/lib/Target/ARM/ARMMCInstLower.cpp @@ -211,11 +211,9 @@ void ARMAsmPrinter::EmitSled(const MachineInstr &MI, SledKind Kind) .addImm(ARMCC::AL).addReg(0)); MCInst Noop; - Subtarget->getInstrInfo()->getNoopForElfTarget(Noop); + Subtarget->getInstrInfo()->getNoop(Noop); for (int8_t I = 0; I < NoopsInSledCount; I++) - { OutStreamer->EmitInstruction(Noop, getSubtargetInfo()); - } OutStreamer->EmitLabel(Target); recordSled(CurSled, MI, Kind); diff --git a/lib/Target/ARM/ARMRegisterBankInfo.cpp b/lib/Target/ARM/ARMRegisterBankInfo.cpp index e47bd3a..7325817 100644 --- a/lib/Target/ARM/ARMRegisterBankInfo.cpp +++ b/lib/Target/ARM/ARMRegisterBankInfo.cpp @@ -221,8 +221,11 @@ ARMRegisterBankInfo::getInstrMapping(const MachineInstr &MI) const { case G_ADD: case G_SUB: case G_MUL: + case G_SDIV: + case G_UDIV: case G_SEXT: case G_ZEXT: + case G_TRUNC: case G_GEP: // FIXME: We're abusing the fact that everything lives in a GPR for now; in // the real world we would use different mappings. diff --git a/lib/Target/ARM/Thumb1InstrInfo.cpp b/lib/Target/ARM/Thumb1InstrInfo.cpp index 27bff4d..0ebf559 100644 --- a/lib/Target/ARM/Thumb1InstrInfo.cpp +++ b/lib/Target/ARM/Thumb1InstrInfo.cpp @@ -24,8 +24,8 @@ using namespace llvm; Thumb1InstrInfo::Thumb1InstrInfo(const ARMSubtarget &STI) : ARMBaseInstrInfo(STI), RI() {} -/// getNoopForMachoTarget - Return the noop instruction to use for a noop. -void Thumb1InstrInfo::getNoopForMachoTarget(MCInst &NopInst) const { +/// Return the noop instruction to use for a noop. +void Thumb1InstrInfo::getNoop(MCInst &NopInst) const { NopInst.setOpcode(ARM::tMOVr); NopInst.addOperand(MCOperand::createReg(ARM::R8)); NopInst.addOperand(MCOperand::createReg(ARM::R8)); diff --git a/lib/Target/ARM/Thumb1InstrInfo.h b/lib/Target/ARM/Thumb1InstrInfo.h index 931914a..e8d9a9c 100644 --- a/lib/Target/ARM/Thumb1InstrInfo.h +++ b/lib/Target/ARM/Thumb1InstrInfo.h @@ -25,8 +25,8 @@ class Thumb1InstrInfo : public ARMBaseInstrInfo { public: explicit Thumb1InstrInfo(const ARMSubtarget &STI); - /// getNoopForMachoTarget - Return the noop instruction to use for a noop. - void getNoopForMachoTarget(MCInst &NopInst) const override; + /// Return the noop instruction to use for a noop. + void getNoop(MCInst &NopInst) const override; // Return the non-pre/post incrementing version of 'Opc'. Return 0 // if there is not such an opcode. diff --git a/lib/Target/ARM/Thumb2InstrInfo.cpp b/lib/Target/ARM/Thumb2InstrInfo.cpp index 818ba85..2e2dfe0 100644 --- a/lib/Target/ARM/Thumb2InstrInfo.cpp +++ b/lib/Target/ARM/Thumb2InstrInfo.cpp @@ -32,8 +32,8 @@ OldT2IfCvt("old-thumb2-ifcvt", cl::Hidden, Thumb2InstrInfo::Thumb2InstrInfo(const ARMSubtarget &STI) : ARMBaseInstrInfo(STI), RI() {} -/// getNoopForMachoTarget - Return the noop instruction to use for a noop. -void Thumb2InstrInfo::getNoopForMachoTarget(MCInst &NopInst) const { +/// Return the noop instruction to use for a noop. 
+void Thumb2InstrInfo::getNoop(MCInst &NopInst) const { NopInst.setOpcode(ARM::tHINT); NopInst.addOperand(MCOperand::createImm(0)); NopInst.addOperand(MCOperand::createImm(ARMCC::AL)); diff --git a/lib/Target/ARM/Thumb2InstrInfo.h b/lib/Target/ARM/Thumb2InstrInfo.h index 15d6330..c834ba7 100644 --- a/lib/Target/ARM/Thumb2InstrInfo.h +++ b/lib/Target/ARM/Thumb2InstrInfo.h @@ -26,8 +26,8 @@ class Thumb2InstrInfo : public ARMBaseInstrInfo { public: explicit Thumb2InstrInfo(const ARMSubtarget &STI); - /// getNoopForMachoTarget - Return the noop instruction to use for a noop. - void getNoopForMachoTarget(MCInst &NopInst) const override; + /// Return the noop instruction to use for a noop. + void getNoop(MCInst &NopInst) const override; // Return the non-pre/post incrementing version of 'Opc'. Return 0 // if there is not such an opcode. diff --git a/lib/Target/AVR/AVRAsmPrinter.cpp b/lib/Target/AVR/AVRAsmPrinter.cpp index 50bb50b..d6491ce 100644 --- a/lib/Target/AVR/AVRAsmPrinter.cpp +++ b/lib/Target/AVR/AVRAsmPrinter.cpp @@ -112,7 +112,8 @@ bool AVRAsmPrinter::PrintAsmOperand(const MachineInstr *MI, unsigned OpNum, const AVRSubtarget &STI = MF->getSubtarget(); const TargetRegisterInfo &TRI = *STI.getRegisterInfo(); - unsigned BytesPerReg = TRI.getMinimalPhysRegClass(Reg)->getSize(); + const TargetRegisterClass *RC = TRI.getMinimalPhysRegClass(Reg); + unsigned BytesPerReg = TRI.getRegSizeInBits(*RC) / 8; assert(BytesPerReg <= 2 && "Only 8 and 16 bit regs are supported."); unsigned RegIdx = ByteNumber / BytesPerReg; diff --git a/lib/Target/AVR/AVRExpandPseudoInsts.cpp b/lib/Target/AVR/AVRExpandPseudoInsts.cpp index 13080a5..540e05a 100644 --- a/lib/Target/AVR/AVRExpandPseudoInsts.cpp +++ b/lib/Target/AVR/AVRExpandPseudoInsts.cpp @@ -88,6 +88,9 @@ private: unsigned ArithOpcode, Block &MBB, BlockIt MBBI); + + /// Scavenges a free GPR8 register for use. + unsigned scavengeGPR8(MachineInstr &MI); }; char AVRExpandPseudo::ID = 0; @@ -577,24 +580,43 @@ bool AVRExpandPseudo::expand(Block &MBB, BlockIt MBBI) { MachineInstr &MI = *MBBI; unsigned OpLo, OpHi, DstLoReg, DstHiReg; unsigned DstReg = MI.getOperand(0).getReg(); + unsigned TmpReg = 0; // 0 for no temporary register unsigned SrcReg = MI.getOperand(1).getReg(); - bool DstIsDead = MI.getOperand(0).isDead(); bool SrcIsKill = MI.getOperand(1).isKill(); OpLo = AVR::LDRdPtr; OpHi = AVR::LDDRdPtrQ; TRI->splitReg(DstReg, DstLoReg, DstHiReg); - assert(DstReg != SrcReg && "SrcReg and DstReg cannot be the same"); + // Use a temporary register if src and dst registers are the same. + if (DstReg == SrcReg) + TmpReg = scavengeGPR8(MI); + + unsigned CurDstLoReg = (DstReg == SrcReg) ? TmpReg : DstLoReg; + unsigned CurDstHiReg = (DstReg == SrcReg) ? TmpReg : DstHiReg; + // Load low byte. auto MIBLO = buildMI(MBB, MBBI, OpLo) - .addReg(DstLoReg, RegState::Define | getDeadRegState(DstIsDead)) + .addReg(CurDstLoReg, RegState::Define) .addReg(SrcReg); + // Push low byte onto stack if necessary. + if (TmpReg) + buildMI(MBB, MBBI, AVR::PUSHRr).addReg(TmpReg); + + // Load high byte. auto MIBHI = buildMI(MBB, MBBI, OpHi) - .addReg(DstHiReg, RegState::Define | getDeadRegState(DstIsDead)) + .addReg(CurDstHiReg, RegState::Define) .addReg(SrcReg, getKillRegState(SrcIsKill)) .addImm(1); + if (TmpReg) { + // Move the high byte into the final destination. + buildMI(MBB, MBBI, AVR::MOVRdRr).addReg(DstHiReg).addReg(TmpReg); + + // Move the low byte from the scratch space into the final destination. 
+ buildMI(MBB, MBBI, AVR::POPRd).addReg(DstLoReg); + } + MIBLO->setMemRefs(MI.memoperands_begin(), MI.memoperands_end()); MIBHI->setMemRefs(MI.memoperands_begin(), MI.memoperands_end()); @@ -669,9 +691,9 @@ bool AVRExpandPseudo::expand(Block &MBB, BlockIt MBBI) { MachineInstr &MI = *MBBI; unsigned OpLo, OpHi, DstLoReg, DstHiReg; unsigned DstReg = MI.getOperand(0).getReg(); + unsigned TmpReg = 0; // 0 for no temporary register unsigned SrcReg = MI.getOperand(1).getReg(); unsigned Imm = MI.getOperand(2).getImm(); - bool DstIsDead = MI.getOperand(0).isDead(); bool SrcIsKill = MI.getOperand(1).isKill(); OpLo = AVR::LDDRdPtrQ; OpHi = AVR::LDDRdPtrQ; @@ -679,60 +701,35 @@ bool AVRExpandPseudo::expand(Block &MBB, BlockIt MBBI) { assert(Imm <= 63 && "Offset is out of range"); - MachineInstr *MIBLO, *MIBHI; - - // HACK: We shouldn't have instances of this instruction - // where src==dest because the instruction itself is - // marked earlyclobber. We do however get this instruction when - // loading from stack slots where the earlyclobber isn't useful. - // - // In this case, just use a temporary register. - if (DstReg == SrcReg) { - RegScavenger RS; - - RS.enterBasicBlock(MBB); - RS.forward(MBBI); - - BitVector Candidates = - TRI->getAllocatableSet - (*MBB.getParent(), &AVR::GPR8RegClass); - - // Exclude all the registers being used by the instruction. - for (MachineOperand &MO : MI.operands()) { - if (MO.isReg() && MO.getReg() != 0 && !MO.isDef() && - !TargetRegisterInfo::isVirtualRegister(MO.getReg())) - Candidates.reset(MO.getReg()); - } - - BitVector Available = RS.getRegsAvailable(&AVR::GPR8RegClass); - Available &= Candidates; + // Use a temporary register if src and dst registers are the same. + if (DstReg == SrcReg) + TmpReg = scavengeGPR8(MI); - signed TmpReg = Available.find_first(); - assert(TmpReg != -1 && "ran out of registers"); + unsigned CurDstLoReg = (DstReg == SrcReg) ? TmpReg : DstLoReg; + unsigned CurDstHiReg = (DstReg == SrcReg) ? TmpReg : DstHiReg; - MIBLO = buildMI(MBB, MBBI, OpLo) - .addReg(TmpReg, RegState::Define) - .addReg(SrcReg) - .addImm(Imm); + // Load low byte. + auto MIBLO = buildMI(MBB, MBBI, OpLo) + .addReg(CurDstLoReg, RegState::Define) + .addReg(SrcReg) + .addImm(Imm); - buildMI(MBB, MBBI, AVR::MOVRdRr).addReg(DstLoReg).addReg(TmpReg); + // Push low byte onto stack if necessary. + if (TmpReg) + buildMI(MBB, MBBI, AVR::PUSHRr).addReg(TmpReg); - MIBHI = buildMI(MBB, MBBI, OpHi) - .addReg(TmpReg, RegState::Define) - .addReg(SrcReg, getKillRegState(SrcIsKill)) - .addImm(Imm + 1); + // Load high byte. + auto MIBHI = buildMI(MBB, MBBI, OpHi) + .addReg(CurDstHiReg, RegState::Define) + .addReg(SrcReg, getKillRegState(SrcIsKill)) + .addImm(Imm + 1); + if (TmpReg) { + // Move the high byte into the final destination. buildMI(MBB, MBBI, AVR::MOVRdRr).addReg(DstHiReg).addReg(TmpReg); - } else { - MIBLO = buildMI(MBB, MBBI, OpLo) - .addReg(DstLoReg, RegState::Define | getDeadRegState(DstIsDead)) - .addReg(SrcReg) - .addImm(Imm); - MIBHI = buildMI(MBB, MBBI, OpHi) - .addReg(DstHiReg, RegState::Define | getDeadRegState(DstIsDead)) - .addReg(SrcReg, getKillRegState(SrcIsKill)) - .addImm(Imm + 1); + // Move the low byte from the scratch space into the final destination. 
+ buildMI(MBB, MBBI, AVR::POPRd).addReg(DstLoReg); } MIBLO->setMemRefs(MI.memoperands_begin(), MI.memoperands_end()); @@ -819,6 +816,32 @@ bool AVRExpandPseudo::expandAtomicArithmeticOp(unsigned Width, }); } +unsigned AVRExpandPseudo::scavengeGPR8(MachineInstr &MI) { + MachineBasicBlock &MBB = *MI.getParent(); + RegScavenger RS; + + RS.enterBasicBlock(MBB); + RS.forward(MI); + + BitVector Candidates = + TRI->getAllocatableSet + (*MBB.getParent(), &AVR::GPR8RegClass); + + // Exclude all the registers being used by the instruction. + for (MachineOperand &MO : MI.operands()) { + if (MO.isReg() && MO.getReg() != 0 && !MO.isDef() && + !TargetRegisterInfo::isVirtualRegister(MO.getReg())) + Candidates.reset(MO.getReg()); + } + + BitVector Available = RS.getRegsAvailable(&AVR::GPR8RegClass); + Available &= Candidates; + + signed Reg = Available.find_first(); + assert(Reg != -1 && "ran out of registers"); + return Reg; +} + template<> bool AVRExpandPseudo::expand(Block &MBB, BlockIt MBBI) { return expandAtomicBinaryOp(AVR::LDRdPtr, MBB, MBBI); @@ -948,7 +971,6 @@ bool AVRExpandPseudo::expand(Block &MBB, BlockIt MBBI) { unsigned OpLo, OpHi, SrcLoReg, SrcHiReg; unsigned DstReg = MI.getOperand(0).getReg(); unsigned SrcReg = MI.getOperand(1).getReg(); - bool DstIsKill = MI.getOperand(0).isKill(); bool SrcIsKill = MI.getOperand(1).isKill(); OpLo = AVR::STPtrRr; OpHi = AVR::STDPtrQRr; @@ -960,7 +982,7 @@ bool AVRExpandPseudo::expand(Block &MBB, BlockIt MBBI) { .addReg(SrcLoReg, getKillRegState(SrcIsKill)); auto MIBHI = buildMI(MBB, MBBI, OpHi) - .addReg(DstReg, getKillRegState(DstIsKill)) + .addReg(DstReg) .addImm(1) .addReg(SrcHiReg, getKillRegState(SrcIsKill)); diff --git a/lib/Target/AVR/AVRFrameLowering.cpp b/lib/Target/AVR/AVRFrameLowering.cpp index b8cb221..ab42a7a 100644 --- a/lib/Target/AVR/AVRFrameLowering.cpp +++ b/lib/Target/AVR/AVRFrameLowering.cpp @@ -239,7 +239,7 @@ bool AVRFrameLowering::spillCalleeSavedRegisters( unsigned Reg = CSI[i - 1].getReg(); bool IsNotLiveIn = !MBB.isLiveIn(Reg); - assert(TRI->getMinimalPhysRegClass(Reg)->getSize() == 1 && + assert(TRI->getRegSizeInBits(*TRI->getMinimalPhysRegClass(Reg)) == 8 && "Invalid register size"); // Add the callee-saved register as live-in only if it is not already a @@ -277,7 +277,7 @@ bool AVRFrameLowering::restoreCalleeSavedRegisters( for (const CalleeSavedInfo &CCSI : CSI) { unsigned Reg = CCSI.getReg(); - assert(TRI->getMinimalPhysRegClass(Reg)->getSize() == 1 && + assert(TRI->getRegSizeInBits(*TRI->getMinimalPhysRegClass(Reg)) == 8 && "Invalid register size"); BuildMI(MBB, MI, DL, TII.get(AVR::POPRd), Reg); diff --git a/lib/Target/AVR/AVRInstrInfo.cpp b/lib/Target/AVR/AVRInstrInfo.cpp index 88f8892..afba66b 100644 --- a/lib/Target/AVR/AVRInstrInfo.cpp +++ b/lib/Target/AVR/AVRInstrInfo.cpp @@ -142,9 +142,9 @@ void AVRInstrInfo::storeRegToStackSlot(MachineBasicBlock &MBB, MFI.getObjectAlignment(FrameIndex)); unsigned Opcode = 0; - if (RC->hasType(MVT::i8)) { + if (TRI->isTypeLegalForClass(*RC, MVT::i8)) { Opcode = AVR::STDPtrQRr; - } else if (RC->hasType(MVT::i16)) { + } else if (TRI->isTypeLegalForClass(*RC, MVT::i16)) { Opcode = AVR::STDWPtrQRr; } else { llvm_unreachable("Cannot store this register into a stack slot!"); @@ -176,9 +176,9 @@ void AVRInstrInfo::loadRegFromStackSlot(MachineBasicBlock &MBB, MFI.getObjectAlignment(FrameIndex)); unsigned Opcode = 0; - if (RC->hasType(MVT::i8)) { + if (TRI->isTypeLegalForClass(*RC, MVT::i8)) { Opcode = AVR::LDDRdPtrQ; - } else if (RC->hasType(MVT::i16)) { + } else if 
(TRI->isTypeLegalForClass(*RC, MVT::i16)) { // Opcode = AVR::LDDWRdPtrQ; //:FIXME: remove this once PR13375 gets fixed Opcode = AVR::LDDWRdYQ; diff --git a/lib/Target/AVR/AVRRegisterInfo.cpp b/lib/Target/AVR/AVRRegisterInfo.cpp index 48798bd..5cc7eaf 100644 --- a/lib/Target/AVR/AVRRegisterInfo.cpp +++ b/lib/Target/AVR/AVRRegisterInfo.cpp @@ -78,11 +78,12 @@ BitVector AVRRegisterInfo::getReservedRegs(const MachineFunction &MF) const { const TargetRegisterClass * AVRRegisterInfo::getLargestLegalSuperClass(const TargetRegisterClass *RC, const MachineFunction &MF) const { - if (RC->hasType(MVT::i16)) { + const TargetRegisterInfo *TRI = MF.getSubtarget().getRegisterInfo(); + if (TRI->isTypeLegalForClass(*RC, MVT::i16)) { return &AVR::DREGSRegClass; } - if (RC->hasType(MVT::i8)) { + if (TRI->isTypeLegalForClass(*RC, MVT::i8)) { return &AVR::GPR8RegClass; } diff --git a/lib/Target/Hexagon/BitTracker.cpp b/lib/Target/Hexagon/BitTracker.cpp index cb3049b..07767d1 100644 --- a/lib/Target/Hexagon/BitTracker.cpp +++ b/lib/Target/Hexagon/BitTracker.cpp @@ -347,7 +347,7 @@ uint16_t BT::MachineEvaluator::getRegBitWidth(const RegisterRef &RR) const { unsigned PhysS = (RR.Sub == 0) ? PhysR : TRI.getSubReg(PhysR, RR.Sub); const TargetRegisterClass *RC = TRI.getMinimalPhysRegClass(PhysS); - uint16_t BW = RC->getSize()*8; + uint16_t BW = TRI.getRegSizeInBits(*RC); return BW; } diff --git a/lib/Target/Hexagon/HexagonAsmPrinter.cpp b/lib/Target/Hexagon/HexagonAsmPrinter.cpp index fda23f8..c8483f7 100644 --- a/lib/Target/Hexagon/HexagonAsmPrinter.cpp +++ b/lib/Target/Hexagon/HexagonAsmPrinter.cpp @@ -286,9 +286,9 @@ void HexagonAsmPrinter::HexagonProcessInstruction(MCInst &Inst, const MCRegisterInfo *RI = OutStreamer->getContext().getRegisterInfo(); const MachineFunction &MF = *MI.getParent()->getParent(); const auto &HST = MF.getSubtarget(); - unsigned VectorSize = HST.useHVXSglOps() - ? Hexagon::VectorRegsRegClass.getSize() - : Hexagon::VectorRegs128BRegClass.getSize(); + const auto &VecRC = HST.useHVXSglOps() ? 
Hexagon::VectorRegsRegClass + : Hexagon::VectorRegs128BRegClass; + unsigned VectorSize = HST.getRegisterInfo()->getSpillSize(VecRC); switch (Inst.getOpcode()) { default: return; diff --git a/lib/Target/Hexagon/HexagonBitSimplify.cpp b/lib/Target/Hexagon/HexagonBitSimplify.cpp index 61f290c..8502bf2 100644 --- a/lib/Target/Hexagon/HexagonBitSimplify.cpp +++ b/lib/Target/Hexagon/HexagonBitSimplify.cpp @@ -407,7 +407,7 @@ bool HexagonBitSimplify::getSubregMask(const BitTracker::RegisterRef &RR, const TargetRegisterClass *RC = MRI.getRegClass(RR.Reg); if (RR.Sub == 0) { Begin = 0; - Width = RC->getSize()*8; + Width = MRI.getTargetRegisterInfo()->getRegSizeInBits(*RC); return true; } @@ -417,7 +417,7 @@ bool HexagonBitSimplify::getSubregMask(const BitTracker::RegisterRef &RR, case Hexagon::DoubleRegsRegClassID: case Hexagon::VecDblRegsRegClassID: case Hexagon::VecDblRegs128BRegClassID: - Width = RC->getSize()*8 / 2; + Width = MRI.getTargetRegisterInfo()->getRegSizeInBits(*RC) / 2; if (RR.Sub == Hexagon::isub_hi || RR.Sub == Hexagon::vsub_hi) Begin = Width; break; @@ -1054,8 +1054,8 @@ namespace { class RedundantInstrElimination : public Transformation { public: RedundantInstrElimination(BitTracker &bt, const HexagonInstrInfo &hii, - MachineRegisterInfo &mri) - : Transformation(true), HII(hii), MRI(mri), BT(bt) {} + const HexagonRegisterInfo &hri, MachineRegisterInfo &mri) + : Transformation(true), HII(hii), HRI(hri), MRI(mri), BT(bt) {} bool processBlock(MachineBasicBlock &B, const RegisterSet &AVs) override; @@ -1070,6 +1070,7 @@ namespace { bool usedBitsEqual(BitTracker::RegisterRef RD, BitTracker::RegisterRef RS); const HexagonInstrInfo &HII; + const HexagonRegisterInfo &HRI; MachineRegisterInfo &MRI; BitTracker &BT; }; @@ -1262,7 +1263,7 @@ bool RedundantInstrElimination::computeUsedBits(const MachineInstr &MI, assert(MI.getOperand(OpN).isReg()); BitTracker::RegisterRef RR = MI.getOperand(OpN); const TargetRegisterClass *RC = HBS::getFinalVRegClass(RR, MRI); - uint16_t Width = RC->getSize()*8; + uint16_t Width = HRI.getRegSizeInBits(*RC); if (!GotBits) T.set(Begin, Begin+Width); @@ -2173,8 +2174,10 @@ bool BitSimplification::genBitSplit(MachineInstr *MI, const RegisterSet &AVs) { if (!GenBitSplit) return false; - if (CountBitSplit >= MaxBitSplit) - return false; + if (MaxBitSplit.getNumOccurrences()) { + if (CountBitSplit >= MaxBitSplit) + return false; + } unsigned Opc = MI->getOpcode(); switch (Opc) { @@ -2253,7 +2256,8 @@ bool BitSimplification::genBitSplit(MachineInstr *MI, continue; // Generate bitsplit where S is defined. - CountBitSplit++; + if (MaxBitSplit.getNumOccurrences()) + CountBitSplit++; MachineInstr *DefS = MRI.getVRegDef(S); assert(DefS != nullptr); DebugLoc DL = DefS->getDebugLoc(); @@ -2379,9 +2383,11 @@ bool BitSimplification::simplifyExtractLow(MachineInstr *MI, const RegisterSet &AVs) { if (!GenExtract) return false; - if (CountExtract >= MaxExtract) - return false; - CountExtract++; + if (MaxExtract.getNumOccurrences()) { + if (CountExtract >= MaxExtract) + return false; + CountExtract++; + } unsigned W = RC.width(); unsigned RW = W; @@ -2651,7 +2657,7 @@ bool HexagonBitSimplify::runOnMachineFunction(MachineFunction &MF) { Changed |= visitBlock(Entry, ImmG, AIG); RegisterSet ARE; // Available registers for RIE. 
- RedundantInstrElimination RIE(BT, HII, MRI); + RedundantInstrElimination RIE(BT, HII, HRI, MRI); bool Ried = visitBlock(Entry, RIE, ARE); if (Ried) { Changed = true; diff --git a/lib/Target/Hexagon/HexagonExpandCondsets.cpp b/lib/Target/Hexagon/HexagonExpandCondsets.cpp index d8ba5dc..9f8c9de 100644 --- a/lib/Target/Hexagon/HexagonExpandCondsets.cpp +++ b/lib/Target/Hexagon/HexagonExpandCondsets.cpp @@ -559,10 +559,10 @@ unsigned HexagonExpandCondsets::getCondTfrOpcode(const MachineOperand &SO, } unsigned PhysS = (RS.Sub == 0) ? PhysR : TRI->getSubReg(PhysR, RS.Sub); const TargetRegisterClass *RC = TRI->getMinimalPhysRegClass(PhysS); - switch (RC->getSize()) { - case 4: + switch (TRI->getRegSizeInBits(*RC)) { + case 32: return IfTrue ? A2_tfrt : A2_tfrf; - case 8: + case 64: return IfTrue ? A2_tfrpt : A2_tfrpf; } llvm_unreachable("Invalid register operand"); diff --git a/lib/Target/Hexagon/HexagonFrameLowering.cpp b/lib/Target/Hexagon/HexagonFrameLowering.cpp index 0e2380f..a04aca4 100644 --- a/lib/Target/Hexagon/HexagonFrameLowering.cpp +++ b/lib/Target/Hexagon/HexagonFrameLowering.cpp @@ -1425,7 +1425,7 @@ bool HexagonFrameLowering::assignCalleeSavedSpillSlots(MachineFunction &MF, if (!SRegs[S->Reg]) continue; const TargetRegisterClass *RC = TRI->getMinimalPhysRegClass(S->Reg); - int FI = MFI.CreateFixedSpillStackObject(RC->getSize(), S->Offset); + int FI = MFI.CreateFixedSpillStackObject(TRI->getSpillSize(*RC), S->Offset); MinOffset = std::min(MinOffset, S->Offset); CSI.push_back(CalleeSavedInfo(S->Reg, FI)); SRegs[S->Reg] = false; @@ -1437,11 +1437,12 @@ bool HexagonFrameLowering::assignCalleeSavedSpillSlots(MachineFunction &MF, for (int x = SRegs.find_first(); x >= 0; x = SRegs.find_next(x)) { unsigned R = x; const TargetRegisterClass *RC = TRI->getMinimalPhysRegClass(R); - int Off = MinOffset - RC->getSize(); - unsigned Align = std::min(RC->getAlignment(), getStackAlignment()); + unsigned Size = TRI->getSpillSize(*RC); + int Off = MinOffset - Size; + unsigned Align = std::min(TRI->getSpillAlignment(*RC), getStackAlignment()); assert(isPowerOf2_32(Align)); Off &= -Align; - int FI = MFI.CreateFixedSpillStackObject(RC->getSize(), Off); + int FI = MFI.CreateFixedSpillStackObject(Size, Off); MinOffset = std::min(MinOffset, Off); CSI.push_back(CalleeSavedInfo(R, FI)); SRegs[R] = false; @@ -1677,10 +1678,10 @@ bool HexagonFrameLowering::expandStoreVec2(MachineBasicBlock &B, int FI = MI->getOperand(0).getIndex(); bool Is128B = HST.useHVXDblOps(); - auto *RC = !Is128B ? &Hexagon::VectorRegsRegClass - : &Hexagon::VectorRegs128BRegClass; - unsigned Size = RC->getSize(); - unsigned NeedAlign = RC->getAlignment(); + const auto &RC = !Is128B ? Hexagon::VectorRegsRegClass + : Hexagon::VectorRegs128BRegClass; + unsigned Size = HRI.getSpillSize(RC); + unsigned NeedAlign = HRI.getSpillAlignment(RC); unsigned HasAlign = MFI.getObjectAlignment(FI); unsigned StoreOpc; @@ -1734,10 +1735,10 @@ bool HexagonFrameLowering::expandLoadVec2(MachineBasicBlock &B, int FI = MI->getOperand(1).getIndex(); bool Is128B = HST.useHVXDblOps(); - auto *RC = !Is128B ? &Hexagon::VectorRegsRegClass - : &Hexagon::VectorRegs128BRegClass; - unsigned Size = RC->getSize(); - unsigned NeedAlign = RC->getAlignment(); + const auto &RC = !Is128B ? 
Hexagon::VectorRegsRegClass + : Hexagon::VectorRegs128BRegClass; + unsigned Size = HRI.getSpillSize(RC); + unsigned NeedAlign = HRI.getSpillAlignment(RC); unsigned HasAlign = MFI.getObjectAlignment(FI); unsigned LoadOpc; @@ -1777,16 +1778,16 @@ bool HexagonFrameLowering::expandStoreVec(MachineBasicBlock &B, if (!MI->getOperand(0).isFI()) return false; + auto &HRI = *HST.getRegisterInfo(); DebugLoc DL = MI->getDebugLoc(); unsigned SrcR = MI->getOperand(2).getReg(); bool IsKill = MI->getOperand(2).isKill(); int FI = MI->getOperand(0).getIndex(); bool Is128B = HST.useHVXDblOps(); - auto *RC = !Is128B ? &Hexagon::VectorRegsRegClass - : &Hexagon::VectorRegs128BRegClass; - - unsigned NeedAlign = RC->getAlignment(); + const auto &RC = !Is128B ? Hexagon::VectorRegsRegClass + : Hexagon::VectorRegs128BRegClass; + unsigned NeedAlign = HRI.getSpillAlignment(RC); unsigned HasAlign = MFI.getObjectAlignment(FI); unsigned StoreOpc; @@ -1815,15 +1816,15 @@ bool HexagonFrameLowering::expandLoadVec(MachineBasicBlock &B, if (!MI->getOperand(1).isFI()) return false; + auto &HRI = *HST.getRegisterInfo(); DebugLoc DL = MI->getDebugLoc(); unsigned DstR = MI->getOperand(0).getReg(); int FI = MI->getOperand(1).getIndex(); bool Is128B = HST.useHVXDblOps(); - auto *RC = !Is128B ? &Hexagon::VectorRegsRegClass - : &Hexagon::VectorRegs128BRegClass; - - unsigned NeedAlign = RC->getAlignment(); + const auto &RC = !Is128B ? Hexagon::VectorRegsRegClass + : Hexagon::VectorRegs128BRegClass; + unsigned NeedAlign = HRI.getSpillAlignment(RC); unsigned HasAlign = MFI.getObjectAlignment(FI); unsigned LoadOpc; @@ -1932,7 +1933,7 @@ void HexagonFrameLowering::determineCalleeSaves(MachineFunction &MF, if (!needToReserveScavengingSpillSlots(MF, HRI, RC)) continue; unsigned Num = RC == &Hexagon::IntRegsRegClass ? 
NumberScavengerSlots : 1; - unsigned S = RC->getSize(), A = RC->getAlignment(); + unsigned S = HRI.getSpillSize(*RC), A = HRI.getSpillAlignment(*RC); for (unsigned i = 0; i < Num; i++) { int NewFI = MFI.CreateSpillStackObject(S, A); RS->addScavengingFrameIndex(NewFI); diff --git a/lib/Target/Hexagon/HexagonLoopIdiomRecognition.cpp b/lib/Target/Hexagon/HexagonLoopIdiomRecognition.cpp index b594847..1829c5d 100644 --- a/lib/Target/Hexagon/HexagonLoopIdiomRecognition.cpp +++ b/lib/Target/Hexagon/HexagonLoopIdiomRecognition.cpp @@ -26,6 +26,7 @@ #include "llvm/Transforms/Scalar.h" #include "llvm/Transforms/Utils/Local.h" #include "llvm/Support/Debug.h" +#include "llvm/Support/KnownBits.h" #include "llvm/Support/raw_ostream.h" #include @@ -1206,10 +1207,9 @@ bool PolynomialMultiplyRecognize::highBitsAreZero(Value *V, if (!T) return false; - unsigned BW = T->getBitWidth(); - APInt K0(BW, 0), K1(BW, 0); - computeKnownBits(V, K0, K1, DL); - return K0.countLeadingOnes() >= IterCount; + KnownBits Known(T->getBitWidth()); + computeKnownBits(V, Known, DL); + return Known.Zero.countLeadingOnes() >= IterCount; } diff --git a/lib/Target/MSP430/MSP430InstrInfo.td b/lib/Target/MSP430/MSP430InstrInfo.td index c0c29b9..22fc247 100644 --- a/lib/Target/MSP430/MSP430InstrInfo.td +++ b/lib/Target/MSP430/MSP430InstrInfo.td @@ -122,6 +122,7 @@ def ADJCALLSTACKUP : Pseudo<(outs), (ins i16imm:$amt1, i16imm:$amt2), } let usesCustomInserter = 1 in { + let Uses = [SR] in { def Select8 : Pseudo<(outs GR8:$dst), (ins GR8:$src, GR8:$src2, i8imm:$cc), "# Select8 PSEUDO", [(set GR8:$dst, @@ -130,6 +131,7 @@ let usesCustomInserter = 1 in { "# Select16 PSEUDO", [(set GR16:$dst, (MSP430selectcc GR16:$src, GR16:$src2, imm:$cc))]>; + } let Defs = [SR] in { def Shl8 : Pseudo<(outs GR8:$dst), (ins GR8:$src, GR8:$cnt), "# Shl8 PSEUDO", diff --git a/lib/Target/Mips/MipsAsmPrinter.cpp b/lib/Target/Mips/MipsAsmPrinter.cpp index 2a9d962..134f7ac 100644 --- a/lib/Target/Mips/MipsAsmPrinter.cpp +++ b/lib/Target/Mips/MipsAsmPrinter.cpp @@ -273,9 +273,9 @@ void MipsAsmPrinter::printSavedRegsBitmask() { const TargetRegisterInfo *TRI = MF->getSubtarget().getRegisterInfo(); const std::vector &CSI = MFI.getCalleeSavedInfo(); // size of stack area to which FP callee-saved regs are saved. - unsigned CPURegSize = Mips::GPR32RegClass.getSize(); - unsigned FGR32RegSize = Mips::FGR32RegClass.getSize(); - unsigned AFGR64RegSize = Mips::AFGR64RegClass.getSize(); + unsigned CPURegSize = TRI->getRegSizeInBits(Mips::GPR32RegClass) / 8; + unsigned FGR32RegSize = TRI->getRegSizeInBits(Mips::FGR32RegClass) / 8; + unsigned AFGR64RegSize = TRI->getRegSizeInBits(Mips::AFGR64RegClass) / 8; bool HasAFGR64Reg = false; unsigned CSFPRegsSize = 0; diff --git a/lib/Target/Mips/MipsCCState.cpp b/lib/Target/Mips/MipsCCState.cpp index 7af988c..cb9f676 100644 --- a/lib/Target/Mips/MipsCCState.cpp +++ b/lib/Target/Mips/MipsCCState.cpp @@ -38,7 +38,7 @@ static bool isF128SoftLibCall(const char *CallSym) { /// This function returns true if Ty is fp128, {f128} or i128 which was /// originally a fp128. 
-static bool originalTypeIsF128(Type *Ty, const SDNode *CallNode) { +static bool originalTypeIsF128(const Type *Ty, const char *Func) { if (Ty->isFP128Ty()) return true; @@ -46,12 +46,9 @@ static bool originalTypeIsF128(Type *Ty, const SDNode *CallNode) { Ty->getStructElementType(0)->isFP128Ty()) return true; - const ExternalSymbolSDNode *ES = - dyn_cast_or_null(CallNode); - // If the Ty is i128 and the function being called is a long double emulation // routine, then the original type is f128. - return (ES && Ty->isIntegerTy(128) && isF128SoftLibCall(ES->getSymbol())); + return (Func && Ty->isIntegerTy(128) && isF128SoftLibCall(Func)); } MipsCCState::SpecialCallingConvType @@ -73,11 +70,11 @@ MipsCCState::getSpecialCallingConvForCallee(const SDNode *Callee, void MipsCCState::PreAnalyzeCallResultForF128( const SmallVectorImpl &Ins, - const TargetLowering::CallLoweringInfo &CLI) { + const Type *RetTy, const char *Call) { for (unsigned i = 0; i < Ins.size(); ++i) { OriginalArgWasF128.push_back( - originalTypeIsF128(CLI.RetTy, CLI.Callee.getNode())); - OriginalArgWasFloat.push_back(CLI.RetTy->isFloatingPointTy()); + originalTypeIsF128(RetTy, Call)); + OriginalArgWasFloat.push_back(RetTy->isFloatingPointTy()); } } @@ -99,10 +96,10 @@ void MipsCCState::PreAnalyzeReturnForF128( void MipsCCState::PreAnalyzeCallOperands( const SmallVectorImpl &Outs, std::vector &FuncArgs, - const SDNode *CallNode) { + const char *Func) { for (unsigned i = 0; i < Outs.size(); ++i) { OriginalArgWasF128.push_back( - originalTypeIsF128(FuncArgs[Outs[i].OrigArgIndex].Ty, CallNode)); + originalTypeIsF128(FuncArgs[Outs[i].OrigArgIndex].Ty, Func)); OriginalArgWasFloat.push_back( FuncArgs[Outs[i].OrigArgIndex].Ty->isFloatingPointTy()); CallOperandIsFixed.push_back(Outs[i].IsFixed); diff --git a/lib/Target/Mips/MipsCCState.h b/lib/Target/Mips/MipsCCState.h index 081c393..77ecc65 100644 --- a/lib/Target/Mips/MipsCCState.h +++ b/lib/Target/Mips/MipsCCState.h @@ -31,7 +31,7 @@ private: /// Identify lowered values that originated from f128 arguments and record /// this for use by RetCC_MipsN. void PreAnalyzeCallResultForF128(const SmallVectorImpl &Ins, - const TargetLowering::CallLoweringInfo &CLI); + const Type *RetTy, const char * Func); /// Identify lowered values that originated from f128 arguments and record /// this for use by RetCC_MipsN. @@ -42,7 +42,7 @@ private: void PreAnalyzeCallOperands(const SmallVectorImpl &Outs, std::vector &FuncArgs, - const SDNode *CallNode); + const char *Func); /// Identify lowered values that originated from f128 arguments and record /// this. 
@@ -73,8 +73,8 @@ public: AnalyzeCallOperands(const SmallVectorImpl &Outs, CCAssignFn Fn, std::vector &FuncArgs, - const SDNode *CallNode) { - PreAnalyzeCallOperands(Outs, FuncArgs, CallNode); + const char *Func) { + PreAnalyzeCallOperands(Outs, FuncArgs, Func); CCState::AnalyzeCallOperands(Outs, Fn); OriginalArgWasF128.clear(); OriginalArgWasFloat.clear(); @@ -99,9 +99,9 @@ public: } void AnalyzeCallResult(const SmallVectorImpl &Ins, - CCAssignFn Fn, - const TargetLowering::CallLoweringInfo &CLI) { - PreAnalyzeCallResultForF128(Ins, CLI); + CCAssignFn Fn, const Type *RetTy, + const char *Func) { + PreAnalyzeCallResultForF128(Ins, RetTy, Func); CCState::AnalyzeCallResult(Ins, Fn); OriginalArgWasFloat.clear(); OriginalArgWasF128.clear(); diff --git a/lib/Target/Mips/MipsFastISel.cpp b/lib/Target/Mips/MipsFastISel.cpp index c060cf0..a5c7bf7 100644 --- a/lib/Target/Mips/MipsFastISel.cpp +++ b/lib/Target/Mips/MipsFastISel.cpp @@ -1260,8 +1260,10 @@ bool MipsFastISel::finishCall(CallLoweringInfo &CLI, MVT RetVT, emitInst(Mips::ADJCALLSTACKUP).addImm(16).addImm(0); if (RetVT != MVT::isVoid) { SmallVector RVLocs; - CCState CCInfo(CC, false, *FuncInfo.MF, RVLocs, *Context); - CCInfo.AnalyzeCallResult(RetVT, RetCC_Mips); + MipsCCState CCInfo(CC, false, *FuncInfo.MF, RVLocs, *Context); + + CCInfo.AnalyzeCallResult(CLI.Ins, RetCC_Mips, CLI.RetTy, + CLI.Symbol->getName().data()); // Only handle a single return value. if (RVLocs.size() != 1) diff --git a/lib/Target/Mips/MipsFrameLowering.cpp b/lib/Target/Mips/MipsFrameLowering.cpp index b2cf039..ef05166 100644 --- a/lib/Target/Mips/MipsFrameLowering.cpp +++ b/lib/Target/Mips/MipsFrameLowering.cpp @@ -119,7 +119,7 @@ uint64_t MipsFrameLowering::estimateStackSize(const MachineFunction &MF) const { // Conservatively assume all callee-saved registers will be saved. for (const MCPhysReg *R = TRI.getCalleeSavedRegs(&MF); *R; ++R) { - unsigned Size = TRI.getMinimalPhysRegClass(*R)->getSize(); + unsigned Size = TRI.getSpillSize(*TRI.getMinimalPhysRegClass(*R)); Offset = alignTo(Offset + Size, Size); } diff --git a/lib/Target/Mips/MipsISelLowering.cpp b/lib/Target/Mips/MipsISelLowering.cpp index 93c5f49..8f39ebd 100644 --- a/lib/Target/Mips/MipsISelLowering.cpp +++ b/lib/Target/Mips/MipsISelLowering.cpp @@ -2750,7 +2750,10 @@ MipsTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, // caller side but removing it breaks the frame size calculation. CCInfo.AllocateStack(ABI.GetCalleeAllocdArgSizeInBytes(CallConv), 1); - CCInfo.AnalyzeCallOperands(Outs, CC_Mips, CLI.getArgs(), Callee.getNode()); + const ExternalSymbolSDNode *ES = + dyn_cast_or_null(Callee.getNode()); + CCInfo.AnalyzeCallOperands(Outs, CC_Mips, CLI.getArgs(), + ES ? ES->getSymbol() : nullptr); // Get a count of how many bytes are to be pushed on the stack. unsigned NextStackOffset = CCInfo.getNextStackOffset(); @@ -2985,7 +2988,11 @@ SDValue MipsTargetLowering::LowerCallResult( SmallVector RVLocs; MipsCCState CCInfo(CallConv, IsVarArg, DAG.getMachineFunction(), RVLocs, *DAG.getContext()); - CCInfo.AnalyzeCallResult(Ins, RetCC_Mips, CLI); + + const ExternalSymbolSDNode *ES = + dyn_cast_or_null(CLI.Callee.getNode()); + CCInfo.AnalyzeCallResult(Ins, RetCC_Mips, CLI.RetTy, + ES ? ES->getSymbol() : nullptr); // Copy all of the result registers out of their specified physreg. 
for (unsigned i = 0; i != RVLocs.size(); ++i) { diff --git a/lib/Target/Mips/MipsMachineFunction.cpp b/lib/Target/Mips/MipsMachineFunction.cpp index 5bf4c95..63034ec 100644 --- a/lib/Target/Mips/MipsMachineFunction.cpp +++ b/lib/Target/Mips/MipsMachineFunction.cpp @@ -40,11 +40,7 @@ unsigned MipsFunctionInfo::getGlobalBaseReg() { const TargetRegisterClass *RC = STI.inMips16Mode() ? &Mips::CPU16RegsRegClass - : STI.inMicroMipsMode() - ? STI.hasMips64() - ? &Mips::GPRMM16_64RegClass - : &Mips::GPRMM16RegClass - : static_cast(MF.getTarget()) + : static_cast(MF.getTarget()) .getABI() .IsN64() ? &Mips::GPR64RegClass @@ -53,14 +49,15 @@ unsigned MipsFunctionInfo::getGlobalBaseReg() { } void MipsFunctionInfo::createEhDataRegsFI() { + const TargetRegisterInfo &TRI = *MF.getSubtarget().getRegisterInfo(); for (int I = 0; I < 4; ++I) { - const TargetRegisterClass *RC = + const TargetRegisterClass &RC = static_cast(MF.getTarget()).getABI().IsN64() - ? &Mips::GPR64RegClass - : &Mips::GPR32RegClass; + ? Mips::GPR64RegClass + : Mips::GPR32RegClass; - EhDataRegFI[I] = MF.getFrameInfo().CreateStackObject(RC->getSize(), - RC->getAlignment(), false); + EhDataRegFI[I] = MF.getFrameInfo().CreateStackObject(TRI.getSpillSize(RC), + TRI.getSpillAlignment(RC), false); } } @@ -69,11 +66,12 @@ void MipsFunctionInfo::createISRRegFI() { // The current implementation only supports Mips32r2+ not Mips64rX. Status // is always 32 bits, ErrorPC is 32 or 64 bits dependent on architecture, // however Mips32r2+ is the supported architecture. - const TargetRegisterClass *RC = &Mips::GPR32RegClass; + const TargetRegisterClass &RC = Mips::GPR32RegClass; + const TargetRegisterInfo &TRI = *MF.getSubtarget().getRegisterInfo(); for (int I = 0; I < 2; ++I) ISRDataRegFI[I] = MF.getFrameInfo().CreateStackObject( - RC->getSize(), RC->getAlignment(), false); + TRI.getSpillSize(RC), TRI.getSpillAlignment(RC), false); } bool MipsFunctionInfo::isEhDataRegFI(int FI) const { @@ -93,9 +91,10 @@ MachinePointerInfo MipsFunctionInfo::callPtrInfo(const GlobalValue *GV) { } int MipsFunctionInfo::getMoveF64ViaSpillFI(const TargetRegisterClass *RC) { + const TargetRegisterInfo &TRI = *MF.getSubtarget().getRegisterInfo(); if (MoveF64ViaSpillFI == -1) { MoveF64ViaSpillFI = MF.getFrameInfo().CreateStackObject( - RC->getSize(), RC->getAlignment(), false); + TRI.getSpillSize(*RC), TRI.getSpillAlignment(*RC), false); } return MoveF64ViaSpillFI; } diff --git a/lib/Target/Mips/MipsOptimizePICCall.cpp b/lib/Target/Mips/MipsOptimizePICCall.cpp index f33857f..68dcbdf 100644 --- a/lib/Target/Mips/MipsOptimizePICCall.cpp +++ b/lib/Target/Mips/MipsOptimizePICCall.cpp @@ -116,9 +116,10 @@ static MachineOperand *getCallTargetRegOpnd(MachineInstr &MI) { /// Return type of register Reg. 
static MVT::SimpleValueType getRegTy(unsigned Reg, MachineFunction &MF) { + const TargetRegisterInfo &TRI = *MF.getSubtarget().getRegisterInfo(); const TargetRegisterClass *RC = MF.getRegInfo().getRegClass(Reg); - assert(RC->vt_end() - RC->vt_begin() == 1); - return *RC->vt_begin(); + assert(TRI.legalclasstypes_end(*RC) - TRI.legalclasstypes_begin(*RC) == 1); + return *TRI.legalclasstypes_begin(*RC); } /// Do the following transformation: diff --git a/lib/Target/Mips/MipsSEFrameLowering.cpp b/lib/Target/Mips/MipsSEFrameLowering.cpp index ef8d18c..e765b46 100644 --- a/lib/Target/Mips/MipsSEFrameLowering.cpp +++ b/lib/Target/Mips/MipsSEFrameLowering.cpp @@ -260,7 +260,8 @@ bool ExpandPseudo::expandCopyACC(MachineBasicBlock &MBB, Iter I, // copy dst_hi, $vr1 unsigned Dst = I->getOperand(0).getReg(), Src = I->getOperand(1).getReg(); - unsigned VRegSize = RegInfo.getMinimalPhysRegClass(Dst)->getSize() / 2; + const TargetRegisterClass *DstRC = RegInfo.getMinimalPhysRegClass(Dst); + unsigned VRegSize = RegInfo.getRegSizeInBits(*DstRC) / 16; const TargetRegisterClass *RC = RegInfo.intRegClass(VRegSize); unsigned VR0 = MRI.createVirtualRegister(RC); unsigned VR1 = MRI.createVirtualRegister(RC); @@ -858,6 +859,7 @@ void MipsSEFrameLowering::determineCalleeSaves(MachineFunction &MF, BitVector &SavedRegs, RegScavenger *RS) const { TargetFrameLowering::determineCalleeSaves(MF, SavedRegs, RS); + const TargetRegisterInfo *TRI = MF.getSubtarget().getRegisterInfo(); MipsFunctionInfo *MipsFI = MF.getInfo(); MipsABIInfo ABI = STI.getABI(); unsigned FP = ABI.GetFramePtr(); @@ -883,10 +885,11 @@ void MipsSEFrameLowering::determineCalleeSaves(MachineFunction &MF, if (ExpandPseudo(MF).expand()) { // The spill slot should be half the size of the accumulator. If target is // mips64, it should be 64-bit, otherwise it should be 32-bt. - const TargetRegisterClass *RC = STI.hasMips64() ? - &Mips::GPR64RegClass : &Mips::GPR32RegClass; - int FI = MF.getFrameInfo().CreateStackObject(RC->getSize(), - RC->getAlignment(), false); + const TargetRegisterClass &RC = STI.hasMips64() ? + Mips::GPR64RegClass : Mips::GPR32RegClass; + int FI = MF.getFrameInfo().CreateStackObject(TRI->getSpillSize(RC), + TRI->getSpillAlignment(RC), + false); RS->addScavengingFrameIndex(FI); } @@ -897,10 +900,11 @@ void MipsSEFrameLowering::determineCalleeSaves(MachineFunction &MF, if (isInt<16>(MaxSPOffset)) return; - const TargetRegisterClass *RC = - ABI.ArePtrs64bit() ? &Mips::GPR64RegClass : &Mips::GPR32RegClass; - int FI = MF.getFrameInfo().CreateStackObject(RC->getSize(), - RC->getAlignment(), false); + const TargetRegisterClass &RC = + ABI.ArePtrs64bit() ? 
Mips::GPR64RegClass : Mips::GPR32RegClass; + int FI = MF.getFrameInfo().CreateStackObject(TRI->getSpillSize(RC), + TRI->getSpillAlignment(RC), + false); RS->addScavengingFrameIndex(FI); } diff --git a/lib/Target/Mips/MipsSEInstrInfo.cpp b/lib/Target/Mips/MipsSEInstrInfo.cpp index 91e712a..ee07479 100644 --- a/lib/Target/Mips/MipsSEInstrInfo.cpp +++ b/lib/Target/Mips/MipsSEInstrInfo.cpp @@ -207,13 +207,16 @@ storeRegToStack(MachineBasicBlock &MBB, MachineBasicBlock::iterator I, Opc = Mips::SDC1; else if (Mips::FGR64RegClass.hasSubClassEq(RC)) Opc = Mips::SDC164; - else if (RC->hasType(MVT::v16i8)) + else if (TRI->isTypeLegalForClass(*RC, MVT::v16i8)) Opc = Mips::ST_B; - else if (RC->hasType(MVT::v8i16) || RC->hasType(MVT::v8f16)) + else if (TRI->isTypeLegalForClass(*RC, MVT::v8i16) || + TRI->isTypeLegalForClass(*RC, MVT::v8f16)) Opc = Mips::ST_H; - else if (RC->hasType(MVT::v4i32) || RC->hasType(MVT::v4f32)) + else if (TRI->isTypeLegalForClass(*RC, MVT::v4i32) || + TRI->isTypeLegalForClass(*RC, MVT::v4f32)) Opc = Mips::ST_W; - else if (RC->hasType(MVT::v2i64) || RC->hasType(MVT::v2f64)) + else if (TRI->isTypeLegalForClass(*RC, MVT::v2i64) || + TRI->isTypeLegalForClass(*RC, MVT::v2f64)) Opc = Mips::ST_D; else if (Mips::LO32RegClass.hasSubClassEq(RC)) Opc = Mips::SW; @@ -280,13 +283,16 @@ loadRegFromStack(MachineBasicBlock &MBB, MachineBasicBlock::iterator I, Opc = Mips::LDC1; else if (Mips::FGR64RegClass.hasSubClassEq(RC)) Opc = Mips::LDC164; - else if (RC->hasType(MVT::v16i8)) + else if (TRI->isTypeLegalForClass(*RC, MVT::v16i8)) Opc = Mips::LD_B; - else if (RC->hasType(MVT::v8i16) || RC->hasType(MVT::v8f16)) + else if (TRI->isTypeLegalForClass(*RC, MVT::v8i16) || + TRI->isTypeLegalForClass(*RC, MVT::v8f16)) Opc = Mips::LD_H; - else if (RC->hasType(MVT::v4i32) || RC->hasType(MVT::v4f32)) + else if (TRI->isTypeLegalForClass(*RC, MVT::v4i32) || + TRI->isTypeLegalForClass(*RC, MVT::v4f32)) Opc = Mips::LD_W; - else if (RC->hasType(MVT::v2i64) || RC->hasType(MVT::v2f64)) + else if (TRI->isTypeLegalForClass(*RC, MVT::v2i64) || + TRI->isTypeLegalForClass(*RC, MVT::v2f64)) Opc = Mips::LD_D; else if (Mips::HI32RegClass.hasSubClassEq(RC)) Opc = Mips::LW; @@ -567,8 +573,8 @@ MipsSEInstrInfo::compareOpndSize(unsigned Opc, const MCInstrDesc &Desc = get(Opc); assert(Desc.NumOperands == 2 && "Unary instruction expected."); const MipsRegisterInfo *RI = &getRegisterInfo(); - unsigned DstRegSize = getRegClass(Desc, 0, RI, MF)->getSize(); - unsigned SrcRegSize = getRegClass(Desc, 1, RI, MF)->getSize(); + unsigned DstRegSize = RI->getRegSizeInBits(*getRegClass(Desc, 0, RI, MF)); + unsigned SrcRegSize = RI->getRegSizeInBits(*getRegClass(Desc, 1, RI, MF)); return std::make_pair(DstRegSize > SrcRegSize, DstRegSize < SrcRegSize); } diff --git a/lib/Target/Mips/Relocation.txt b/lib/Target/Mips/Relocation.txt new file mode 100644 index 0000000..f1a6fd8 --- /dev/null +++ b/lib/Target/Mips/Relocation.txt @@ -0,0 +1,125 @@ +MIPS Relocation Principles + +In LLVM, there are several elements of the llvm::ISD::NodeType enum +that deal with addresses and/or relocations. These are defined in +include/llvm/Target/TargetSelectionDAG.td, namely: + GlobalAddress, GlobalTLSAddress, JumpTable, ConstantPool, + ExternalSymbol, BlockAddress +The MIPS backend uses several principles to handle these. + +1. Code for lowering addresses references to machine dependent code is +factored into common code for generating different address forms and +is called by the relocation model specific lowering function, using +templated functions. 
For example: + + // lib/Target/Mips/MipsISelLowering.cpp + SDValue MipsTargetLowering:: + lowerJumpTable(SDValue Op, SelectionDAG &DAG) const + +calls + + template // lib/Target/Mips/MipsISelLowering.h + SDValue getAddrLocal(NodeTy *N, const SDLoc &DL, EVT Ty, + SelectionDAG &DAG, bool IsN32OrN64) const + +which calls the overloaded function: + + // lib/Target/Mips/MipsISelLowering.h + SDValue getTargetNode(JumpTableSDNode *N, EVT Ty, SelectionDAG &DAG, + unsigned Flag) const; + +2. Generic address nodes are lowered to some combination of target +independent and machine specific SDNodes (for example: +MipsISD::{Highest, Higher, Hi, Lo}) depending upon relocation model, +ABI, and compilation options. + +The choice of specific instructions that are to be used is delegated +to ISel which in turn relies on TableGen patterns to choose subtarget +specific instructions. For example, in getAddrLocal, the pseudo-code +generated is: + + (add (load (wrapper $gp, %got(sym)), %lo(sym)) + +where "%lo" represents an instance of an SDNode with opcode +"MipsISD::Lo", "wrapper" indicates one with opcode "MipsISD::Wrapper", +and "%got" the global table pointer "getGlobalReg(...)". The "add" is +"ISD::ADD", not a target dependent one. + +3. A TableGen multiclass pattern "MipsHiLoRelocs" is used to define a +template pattern parameterized over the load upper immediate +instruction, add operation, the zero register, and register class. +Here the instantiation of MipsHiLoRelocs in MipsInstrInfo.td is used +to MIPS32 to compute addresses for the static relocation model. + + // lib/Target/Mips/MipsInstrInfo.td + multiclass MipsHiLoRelocs { + def : MipsPat<(MipsHi tglobaladdr:$in), (Lui tglobaladdr:$in)>; + ... + def : MipsPat<(MipsLo tglobaladdr:$in), (Addiu ZeroReg, tglobaladdr:$in)>; + ... + def : MipsPat<(add GPROpnd:$hi, (MipsLo tglobaladdr:$lo)), + (Addiu GPROpnd:$hi, tglobaladdr:$lo)>; + ... + } + defm : MipsHiLoRelocs; + + // lib/Target/Mips/Mips64InstrInfo.td + defm : MipsHiLoRelocs, SYM_32; + +The instantiation in Mips64InstrInfo.td is used for MIPS64 in ILP32 +mode, as guarded by the predicate "SYM_32" and also for a submode of +LP64 where symbols are assumed to be 32 bits wide. A similar +multiclass for MIPS64 in LP64 mode is also defined: + + // lib/Target/Mips/Mips64InstrInfo.td + multiclass MipsHighestHigherHiLoRelocs { + ... + def : MipsPat<(MipsHighest (i64 tglobaladdr:$in)), + (Lui tglobaladdr:$in)>; + ... + def : MipsPat<(MipsHigher (i64 tglobaladdr:$in)), + (Daddiu ZERO_64, tglobaladdr:$in)>; + ... + def : MipsPat<(add GPR64:$hi, (MipsHigher (i64 tglobaladdr:$lo))), + (Daddiu GPR64:$hi, tglobaladdr:$lo)>; + ... + def : MipsPat<(add GPR64:$hi, (MipsHi (i64 tglobaladdr:$lo))), + (Daddiu GPR64:$hi, tglobaladdr:$lo)>; + ... + def : MipsPat<(add GPR64:$hi, (MipsLo (i64 tglobaladdr:$lo))), + (Daddiu GPR64:$hi, tglobaladdr:$lo)>; + } + +and it is instantiated twice: + + // lib/Target/Mips/Mips64InstrInfo.td + defm : MipsHighestHigherHiLoRelocs, SYM_64; + // lib/Target/Mips/MicroMips64r6InstrInfo.td + defm : MipsHighestHigherHiLoRelocs, SYM_64, + ISA_MICROMIPS64R6; + +These patterns are used during instruction selection to match +MipsISD::{Highest, Higher, Hi, Lo} to a specific machine instruction +and operands. + +More details on how multiclasses in TableGen work can be found in the +section "Multiclass definitions and instances" in the document +"TableGen Language Introduction" + +4. Instruction definitions are multiply defined to cover the different +register classes. 
In some cases, such as LW/LW64, this also accounts +for the difference in the results of instruction execution. On MIPS32, +"lw" loads a 32 bit value from memory. On MIPS64, "lw" loads a 32 bit +value from memory and sign extends the value to 64 bits. + + // lib/Target/Mips/MipsInstrInfo.td + def LUi : MMRel, LoadUpper<"lui", GPR32Opnd, uimm16_relaxed>, LUI_FM; + // lib/Target/Mips/Mips64InstrInfo.td + def LUi64 : LoadUpper<"lui", GPR64Opnd, uimm16_64_relaxed>, LUI_FM; + +defines two names "LUi" and "LUi64" with two different register +classes, but with the same encoding---"LUI_FM". These instructions load a +16-bit immediate into bits 31-16 and clear the lower 15 bits. On MIPS64, +the result is sign-extended to 64 bits. diff --git a/lib/Target/NVPTX/NVPTXInstrInfo.cpp b/lib/Target/NVPTX/NVPTXInstrInfo.cpp index 3026f0b..0f6c2e5 100644 --- a/lib/Target/NVPTX/NVPTXInstrInfo.cpp +++ b/lib/Target/NVPTX/NVPTXInstrInfo.cpp @@ -38,7 +38,7 @@ void NVPTXInstrInfo::copyPhysReg(MachineBasicBlock &MBB, const TargetRegisterClass *DestRC = MRI.getRegClass(DestReg); const TargetRegisterClass *SrcRC = MRI.getRegClass(SrcReg); - if (DestRC->getSize() != SrcRC->getSize()) + if (RegInfo.getRegSizeInBits(*DestRC) != RegInfo.getRegSizeInBits(*SrcRC)) report_fatal_error("Copy one register into another with a different width"); unsigned Op; diff --git a/lib/Target/PowerPC/PPCFrameLowering.cpp b/lib/Target/PowerPC/PPCFrameLowering.cpp index 4c9430a..2a402de 100644 --- a/lib/Target/PowerPC/PPCFrameLowering.cpp +++ b/lib/Target/PowerPC/PPCFrameLowering.cpp @@ -1898,12 +1898,13 @@ PPCFrameLowering::addScavengingSpillSlot(MachineFunction &MF, MachineFrameInfo &MFI = MF.getFrameInfo(); if (MFI.hasVarSizedObjects() || spillsCR(MF) || spillsVRSAVE(MF) || hasNonRISpills(MF) || (hasSpills(MF) && !isInt<16>(StackSize))) { - const TargetRegisterClass *GPRC = &PPC::GPRCRegClass; - const TargetRegisterClass *G8RC = &PPC::G8RCRegClass; - const TargetRegisterClass *RC = Subtarget.isPPC64() ? G8RC : GPRC; - RS->addScavengingFrameIndex(MFI.CreateStackObject(RC->getSize(), - RC->getAlignment(), - false)); + const TargetRegisterClass &GPRC = PPC::GPRCRegClass; + const TargetRegisterClass &G8RC = PPC::G8RCRegClass; + const TargetRegisterClass &RC = Subtarget.isPPC64() ? G8RC : GPRC; + const TargetRegisterInfo &TRI = *Subtarget.getRegisterInfo(); + unsigned Size = TRI.getSpillSize(RC); + unsigned Align = TRI.getSpillAlignment(RC); + RS->addScavengingFrameIndex(MFI.CreateStackObject(Size, Align, false)); // Might we have over-aligned allocas? bool HasAlVars = MFI.hasVarSizedObjects() && @@ -1911,9 +1912,7 @@ PPCFrameLowering::addScavengingSpillSlot(MachineFunction &MF, // These kinds of spills might need two registers. 
if (spillsCR(MF) || spillsVRSAVE(MF) || HasAlVars) - RS->addScavengingFrameIndex(MFI.CreateStackObject(RC->getSize(), - RC->getAlignment(), - false)); + RS->addScavengingFrameIndex(MFI.CreateStackObject(Size, Align, false)); } } diff --git a/lib/Target/PowerPC/PPCISelLowering.cpp b/lib/Target/PowerPC/PPCISelLowering.cpp index f7663d8..4659a2e 100644 --- a/lib/Target/PowerPC/PPCISelLowering.cpp +++ b/lib/Target/PowerPC/PPCISelLowering.cpp @@ -9057,6 +9057,7 @@ PPCTargetLowering::emitEHSjLjSetJmp(MachineInstr &MI, MachineBasicBlock *MBB) const { DebugLoc DL = MI.getDebugLoc(); const TargetInstrInfo *TII = Subtarget.getInstrInfo(); + const PPCRegisterInfo *TRI = Subtarget.getRegisterInfo(); MachineFunction *MF = MBB->getParent(); MachineRegisterInfo &MRI = MF->getRegInfo(); @@ -9070,7 +9071,7 @@ PPCTargetLowering::emitEHSjLjSetJmp(MachineInstr &MI, unsigned DstReg = MI.getOperand(0).getReg(); const TargetRegisterClass *RC = MRI.getRegClass(DstReg); - assert(RC->hasType(MVT::i32) && "Invalid destination!"); + assert(TRI->isTypeLegalForClass(*RC, MVT::i32) && "Invalid destination!"); unsigned mainDstReg = MRI.createVirtualRegister(RC); unsigned restoreDstReg = MRI.createVirtualRegister(RC); @@ -9153,7 +9154,6 @@ PPCTargetLowering::emitEHSjLjSetJmp(MachineInstr &MI, // Setup MIB = BuildMI(*thisMBB, MI, DL, TII->get(PPC::BCLalways)).addMBB(mainMBB); - const PPCRegisterInfo *TRI = Subtarget.getRegisterInfo(); MIB.addRegMask(TRI->getNoPreservedMask()); BuildMI(*thisMBB, MI, DL, TII->get(PPC::LI), restoreDstReg).addImm(1); diff --git a/lib/Target/PowerPC/PPCInstrInfo.cpp b/lib/Target/PowerPC/PPCInstrInfo.cpp index 8e159f4..790a890 100644 --- a/lib/Target/PowerPC/PPCInstrInfo.cpp +++ b/lib/Target/PowerPC/PPCInstrInfo.cpp @@ -440,8 +440,8 @@ void PPCInstrInfo::insertNoop(MachineBasicBlock &MBB, BuildMI(MBB, MI, DL, get(Opcode)); } -/// getNoopForMachoTarget - Return the noop instruction to use for a noop. -void PPCInstrInfo::getNoopForMachoTarget(MCInst &NopInst) const { +/// Return the noop instruction to use for a noop. 
+void PPCInstrInfo::getNoop(MCInst &NopInst) const { NopInst.setOpcode(PPC::NOP); } diff --git a/lib/Target/PowerPC/PPCInstrInfo.h b/lib/Target/PowerPC/PPCInstrInfo.h index f11aed8..b30d09e 100644 --- a/lib/Target/PowerPC/PPCInstrInfo.h +++ b/lib/Target/PowerPC/PPCInstrInfo.h @@ -269,7 +269,7 @@ public: /// unsigned getInstSizeInBytes(const MachineInstr &MI) const override; - void getNoopForMachoTarget(MCInst &NopInst) const override; + void getNoop(MCInst &NopInst) const override; std::pair decomposeMachineOperandsTargetFlags(unsigned TF) const override; diff --git a/lib/Target/Sparc/SparcISelLowering.cpp b/lib/Target/Sparc/SparcISelLowering.cpp index 455d1ee..f120a98 100644 --- a/lib/Target/Sparc/SparcISelLowering.cpp +++ b/lib/Target/Sparc/SparcISelLowering.cpp @@ -3234,6 +3234,7 @@ SparcTargetLowering::emitEHSjLjSetJmp(MachineInstr &MI, MachineBasicBlock *MBB) const { DebugLoc DL = MI.getDebugLoc(); const TargetInstrInfo *TII = Subtarget->getInstrInfo(); + const TargetRegisterInfo *TRI = Subtarget->getRegisterInfo(); MachineFunction *MF = MBB->getParent(); MachineRegisterInfo &MRI = MF->getRegInfo(); @@ -3245,7 +3246,8 @@ SparcTargetLowering::emitEHSjLjSetJmp(MachineInstr &MI, unsigned DstReg = MI.getOperand(0).getReg(); const TargetRegisterClass *RC = MRI.getRegClass(DstReg); - assert(RC->hasType(MVT::i32) && "Invalid destination!"); + assert(TRI->isTypeLegalForClass(*RC, MVT::i32) && "Invalid destination!"); + (void)TRI; unsigned mainDstReg = MRI.createVirtualRegister(RC); unsigned restoreDstReg = MRI.createVirtualRegister(RC); diff --git a/lib/Target/SystemZ/SystemZInstrInfo.cpp b/lib/Target/SystemZ/SystemZInstrInfo.cpp index c8ff955..fee008b 100644 --- a/lib/Target/SystemZ/SystemZInstrInfo.cpp +++ b/lib/Target/SystemZ/SystemZInstrInfo.cpp @@ -104,8 +104,9 @@ void SystemZInstrInfo::splitMove(MachineBasicBlock::iterator MI, MachineOperand &LowOffsetOp = MI->getOperand(2); LowOffsetOp.setImm(LowOffsetOp.getImm() + 8); - // Clear the kill flags for the base and index registers in the first - // instruction. + // Clear the kill flags on the registers in the first instruction. 
+ if (EarlierMI->getOperand(0).isReg() && EarlierMI->getOperand(0).isUse()) + EarlierMI->getOperand(0).setIsKill(false); EarlierMI->getOperand(1).setIsKill(false); EarlierMI->getOperand(3).setIsKill(false); @@ -1114,10 +1115,9 @@ MachineInstr *SystemZInstrInfo::foldMemoryOperandImpl( return nullptr; unsigned OpNum = Ops[0]; - assert(Size == - MF.getRegInfo() - .getRegClass(MI.getOperand(OpNum).getReg()) - ->getSize() && + assert(Size * 8 == + TRI->getRegSizeInBits(*MF.getRegInfo() + .getRegClass(MI.getOperand(OpNum).getReg())) && "Invalid size combination"); if ((Opcode == SystemZ::AHI || Opcode == SystemZ::AGHI) && OpNum == 0 && diff --git a/lib/Target/WebAssembly/WebAssemblyAsmPrinter.cpp b/lib/Target/WebAssembly/WebAssemblyAsmPrinter.cpp index d9c2dba..4178ec0 100644 --- a/lib/Target/WebAssembly/WebAssemblyAsmPrinter.cpp +++ b/lib/Target/WebAssembly/WebAssemblyAsmPrinter.cpp @@ -45,10 +45,11 @@ using namespace llvm; //===----------------------------------------------------------------------===// MVT WebAssemblyAsmPrinter::getRegType(unsigned RegNo) const { + const TargetRegisterInfo *TRI = Subtarget->getRegisterInfo(); const TargetRegisterClass *TRC = MRI->getRegClass(RegNo); for (MVT T : {MVT::i32, MVT::i64, MVT::f32, MVT::f64, MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v4f32}) - if (TRC->hasType(T)) + if (TRI->isTypeLegalForClass(*TRC, T)) return T; DEBUG(errs() << "Unknown type for register number: " << RegNo); llvm_unreachable("Unknown register type"); diff --git a/lib/Target/X86/AsmParser/X86AsmParser.cpp b/lib/Target/X86/AsmParser/X86AsmParser.cpp index 324da65..c1cfc82 100644 --- a/lib/Target/X86/AsmParser/X86AsmParser.cpp +++ b/lib/Target/X86/AsmParser/X86AsmParser.cpp @@ -3094,6 +3094,7 @@ bool X86AsmParser::ParseDirective(AsmToken DirectiveID) { else if (IDVal.startswith(".code")) return ParseDirectiveCode(IDVal, DirectiveID.getLoc()); else if (IDVal.startswith(".att_syntax")) { + getParser().setParsingInlineAsm(false); if (getLexer().isNot(AsmToken::EndOfStatement)) { if (Parser.getTok().getString() == "prefix") Parser.Lex(); @@ -3106,6 +3107,7 @@ bool X86AsmParser::ParseDirective(AsmToken DirectiveID) { return false; } else if (IDVal.startswith(".intel_syntax")) { getParser().setAssemblerDialect(1); + getParser().setParsingInlineAsm(true); if (getLexer().isNot(AsmToken::EndOfStatement)) { if (Parser.getTok().getString() == "noprefix") Parser.Lex(); diff --git a/lib/Target/X86/X86.h b/lib/Target/X86/X86.h index fdcc7e1..19c93cf 100644 --- a/lib/Target/X86/X86.h +++ b/lib/Target/X86/X86.h @@ -95,7 +95,8 @@ void initializeFixupBWInstPassPass(PassRegistry &); /// encoding when possible in order to reduce code size. FunctionPass *createX86EvexToVexInsts(); -InstructionSelector *createX86InstructionSelector(X86Subtarget &, +InstructionSelector *createX86InstructionSelector(const X86TargetMachine &TM, + X86Subtarget &, X86RegisterBankInfo &); void initializeEvexToVexInstPassPass(PassRegistry &); diff --git a/lib/Target/X86/X86.td b/lib/Target/X86/X86.td index 8fcc8e3..d2f650c 100644 --- a/lib/Target/X86/X86.td +++ b/lib/Target/X86/X86.td @@ -273,6 +273,16 @@ def FeatureFastSHLDRotate "fast-shld-rotate", "HasFastSHLDRotate", "true", "SHLD can be used as a faster rotate">; +// Ivy Bridge and newer processors have enhanced REP MOVSB and STOSB (aka +// "string operations"). See "REP String Enhancement" in the Intel Software +// Development Manual. 
This feature essentially means that REP MOVSB will copy +// using the largest available size instead of copying bytes one by one, making +// it at least as fast as REPMOVS{W,D,Q}. +def FeatureERMSB + : SubtargetFeature< + "ermsb", "HasERMSB", "true", + "REP MOVS/STOS are fast">; + //===----------------------------------------------------------------------===// // X86 processors supported. //===----------------------------------------------------------------------===// @@ -498,6 +508,7 @@ def HSWFeatures : ProcessorFeatures BitOffsets; SmallVector SplitRegs; EVT PartVT = TLI.getRegisterType(Context, VT); @@ -64,8 +63,10 @@ void X86CallLowering::splitToValueTypes(const ArgInfo &OrigArg, ArgInfo{MRI.createGenericVirtualRegister(getLLTForType(*PartTy, DL)), PartTy, OrigArg.Flags}; SplitArgs.push_back(Info); - PerformArgSplit(Info.Reg, PartVT.getSizeInBits() * i); + SplitRegs.push_back(Info.Reg); } + + PerformArgSplit(SplitRegs); } namespace { @@ -112,10 +113,9 @@ bool X86CallLowering::lowerReturn(MachineIRBuilder &MIRBuilder, setArgFlags(OrigArg, AttributeList::ReturnIndex, DL, F); SmallVector SplitArgs; - splitToValueTypes(OrigArg, SplitArgs, DL, MRI, - [&](unsigned Reg, uint64_t Offset) { - MIRBuilder.buildExtract(Reg, VReg, Offset); - }); + splitToValueTypes( + OrigArg, SplitArgs, DL, MRI, + [&](ArrayRef Regs) { MIRBuilder.buildUnmerge(Regs, VReg); }); FuncReturnHandler Handler(MIRBuilder, MRI, MIB, RetCC_X86); if (!handleAssignments(MIRBuilder, SplitArgs, Handler)) @@ -183,22 +183,10 @@ bool X86CallLowering::lowerFormalArguments(MachineIRBuilder &MIRBuilder, for (auto &Arg : F.args()) { ArgInfo OrigArg(VRegs[Idx], Arg.getType()); setArgFlags(OrigArg, Idx + 1, DL, F); - LLT Ty = MRI.getType(VRegs[Idx]); - unsigned Dst = VRegs[Idx]; - bool Split = false; splitToValueTypes(OrigArg, SplitArgs, DL, MRI, - [&](unsigned Reg, uint64_t Offset) { - if (!Split) { - Split = true; - Dst = MRI.createGenericVirtualRegister(Ty); - MIRBuilder.buildUndef(Dst); - } - unsigned Tmp = MRI.createGenericVirtualRegister(Ty); - MIRBuilder.buildInsert(Tmp, Dst, Reg, Offset); - Dst = Tmp; + [&](ArrayRef Regs) { + MIRBuilder.buildMerge(VRegs[Idx], Regs); }); - if (Dst != VRegs[Idx]) - MIRBuilder.buildCopy(VRegs[Idx], Dst); Idx++; } diff --git a/lib/Target/X86/X86CallLowering.h b/lib/Target/X86/X86CallLowering.h index 204e697..8a8afb5 100644 --- a/lib/Target/X86/X86CallLowering.h +++ b/lib/Target/X86/X86CallLowering.h @@ -34,14 +34,15 @@ public: bool lowerFormalArguments(MachineIRBuilder &MIRBuilder, const Function &F, ArrayRef VRegs) const override; + private: /// A function of this type is used to perform value split action. 
- typedef std::function SplitArgTy; + typedef std::function)> SplitArgTy; void splitToValueTypes(const ArgInfo &OrigArgInfo, SmallVectorImpl &SplitArgs, const DataLayout &DL, MachineRegisterInfo &MRI, SplitArgTy SplitArg) const; }; -} // End of namespace llvm; +} // namespace llvm #endif diff --git a/lib/Target/X86/X86FastISel.cpp b/lib/Target/X86/X86FastISel.cpp index 036f5d2..b847781 100644 --- a/lib/Target/X86/X86FastISel.cpp +++ b/lib/Target/X86/X86FastISel.cpp @@ -2149,7 +2149,8 @@ bool X86FastISel::X86FastEmitCMoveSelect(MVT RetVT, const Instruction *I) { if (!LHSReg || !RHSReg) return false; - unsigned Opc = X86::getCMovFromCond(CC, RC->getSize()); + const TargetRegisterInfo &TRI = *Subtarget->getRegisterInfo(); + unsigned Opc = X86::getCMovFromCond(CC, TRI.getRegSizeInBits(*RC)/8); unsigned ResultReg = fastEmitInst_rr(Opc, RC, RHSReg, RHSIsKill, LHSReg, LHSIsKill); updateValueMap(I, ResultReg); diff --git a/lib/Target/X86/X86FrameLowering.cpp b/lib/Target/X86/X86FrameLowering.cpp index 8678a13..a94045c 100644 --- a/lib/Target/X86/X86FrameLowering.cpp +++ b/lib/Target/X86/X86FrameLowering.cpp @@ -1783,6 +1783,14 @@ int X86FrameLowering::getFrameIndexReference(const MachineFunction &MF, int FI, return Offset + FPDelta; } +int X86FrameLowering::getFrameIndexReferenceSP(const MachineFunction &MF, + int FI, unsigned &FrameReg, + int Adjustment) const { + const MachineFrameInfo &MFI = MF.getFrameInfo(); + FrameReg = TRI->getStackRegister(); + return MFI.getObjectOffset(FI) - getOffsetOfLocalArea() + Adjustment; +} + int X86FrameLowering::getFrameIndexReferencePreferSP(const MachineFunction &MF, int FI, unsigned &FrameReg, @@ -1839,9 +1847,6 @@ X86FrameLowering::getFrameIndexReferencePreferSP(const MachineFunction &MF, assert(MF.getInfo()->getTCReturnAddrDelta() >= 0 && "we don't handle this case!"); - // Fill in FrameReg output argument. - FrameReg = TRI->getStackRegister(); - // This is how the math works out: // // %rsp grows (i.e. gets lower) left to right. 
Each box below is @@ -1866,12 +1871,8 @@ X86FrameLowering::getFrameIndexReferencePreferSP(const MachineFunction &MF, // (C - E) == (C - A) - (B - A) + (B - E) // { Using [1], [2] and [3] above } // == getObjectOffset - LocalAreaOffset + StackSize - // - - // Get the Offset from the StackPointer - int Offset = MFI.getObjectOffset(FI) - getOffsetOfLocalArea(); - return Offset + StackSize; + return getFrameIndexReferenceSP(MF, FI, FrameReg, StackSize); } bool X86FrameLowering::assignCalleeSavedSpillSlots( @@ -1923,14 +1924,15 @@ bool X86FrameLowering::assignCalleeSavedSpillSlots( continue; const TargetRegisterClass *RC = TRI->getMinimalPhysRegClass(Reg); + unsigned Size = TRI->getSpillSize(*RC); + unsigned Align = TRI->getSpillAlignment(*RC); // ensure alignment - SpillSlotOffset -= std::abs(SpillSlotOffset) % RC->getAlignment(); + SpillSlotOffset -= std::abs(SpillSlotOffset) % Align; // spill into slot - SpillSlotOffset -= RC->getSize(); - int SlotIndex = - MFI.CreateFixedSpillStackObject(RC->getSize(), SpillSlotOffset); + SpillSlotOffset -= Size; + int SlotIndex = MFI.CreateFixedSpillStackObject(Size, SpillSlotOffset); CSI[i - 1].setFrameIdx(SlotIndex); - MFI.ensureMaxAlignment(RC->getAlignment()); + MFI.ensureMaxAlignment(Align); } return true; diff --git a/lib/Target/X86/X86FrameLowering.h b/lib/Target/X86/X86FrameLowering.h index 863dc8b..7d214ca 100644 --- a/lib/Target/X86/X86FrameLowering.h +++ b/lib/Target/X86/X86FrameLowering.h @@ -100,6 +100,8 @@ public: int getFrameIndexReference(const MachineFunction &MF, int FI, unsigned &FrameReg) const override; + int getFrameIndexReferenceSP(const MachineFunction &MF, + int FI, unsigned &SPReg, int Adjustment) const; int getFrameIndexReferencePreferSP(const MachineFunction &MF, int FI, unsigned &FrameReg, bool IgnoreSPUpdates) const override; diff --git a/lib/Target/X86/X86ISelLowering.cpp b/lib/Target/X86/X86ISelLowering.cpp index b5f29fb..ada4664 100644 --- a/lib/Target/X86/X86ISelLowering.cpp +++ b/lib/Target/X86/X86ISelLowering.cpp @@ -5060,8 +5060,8 @@ static SDValue extractSubVector(SDValue Vec, unsigned IdxVal, SelectionDAG &DAG, // If the input is a buildvector just emit a smaller one. if (Vec.getOpcode() == ISD::BUILD_VECTOR) - return DAG.getNode(ISD::BUILD_VECTOR, dl, ResultVT, - makeArrayRef(Vec->op_begin() + IdxVal, ElemsPerChunk)); + return DAG.getBuildVector( + ResultVT, dl, makeArrayRef(Vec->op_begin() + IdxVal, ElemsPerChunk)); SDValue VecIdx = DAG.getIntPtrConstant(IdxVal, dl); return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, ResultVT, Vec, VecIdx); @@ -14424,8 +14424,8 @@ static SDValue LowerEXTRACT_SUBVECTOR(SDValue Op, const X86Subtarget &Subtarget, // If the input is a buildvector just emit a smaller one. unsigned ElemsPerChunk = ResVT.getVectorNumElements(); if (In.getOpcode() == ISD::BUILD_VECTOR) - return DAG.getNode(ISD::BUILD_VECTOR, dl, ResVT, - makeArrayRef(In->op_begin() + IdxVal, ElemsPerChunk)); + return DAG.getBuildVector( + ResVT, dl, makeArrayRef(In->op_begin() + IdxVal, ElemsPerChunk)); // Everything else is legal. 
return Op; @@ -25944,6 +25944,7 @@ X86TargetLowering::emitEHSjLjSetJmp(MachineInstr &MI, DebugLoc DL = MI.getDebugLoc(); MachineFunction *MF = MBB->getParent(); const TargetInstrInfo *TII = Subtarget.getInstrInfo(); + const TargetRegisterInfo *TRI = Subtarget.getRegisterInfo(); MachineRegisterInfo &MRI = MF->getRegInfo(); const BasicBlock *BB = MBB->getBasicBlock(); @@ -25960,7 +25961,8 @@ X86TargetLowering::emitEHSjLjSetJmp(MachineInstr &MI, DstReg = MI.getOperand(CurOp++).getReg(); const TargetRegisterClass *RC = MRI.getRegClass(DstReg); - assert(RC->hasType(MVT::i32) && "Invalid destination!"); + assert(TRI->isTypeLegalForClass(*RC, MVT::i32) && "Invalid destination!"); + (void)TRI; unsigned mainDstReg = MRI.createVirtualRegister(RC); unsigned restoreDstReg = MRI.createVirtualRegister(RC); @@ -30207,7 +30209,7 @@ static SDValue combineSelect(SDNode *N, SelectionDAG &DAG, APInt KnownZero, KnownOne; TargetLowering::TargetLoweringOpt TLO(DAG, DCI.isBeforeLegalize(), DCI.isBeforeLegalizeOps()); - if (TLO.ShrinkDemandedConstant(Cond, DemandedMask) || + if (TLI.ShrinkDemandedConstant(Cond, DemandedMask, TLO) || TLI.SimplifyDemandedBits(Cond, DemandedMask, KnownZero, KnownOne, TLO)) { // If we changed the computation somewhere in the DAG, this change will @@ -33777,7 +33779,7 @@ static SDValue combineBT(SDNode *N, SelectionDAG &DAG, TargetLowering::TargetLoweringOpt TLO(DAG, !DCI.isBeforeLegalize(), !DCI.isBeforeLegalizeOps()); const TargetLowering &TLI = DAG.getTargetLoweringInfo(); - if (TLO.ShrinkDemandedConstant(Op1, DemandedMask) || + if (TLI.ShrinkDemandedConstant(Op1, DemandedMask, TLO) || TLI.SimplifyDemandedBits(Op1, DemandedMask, KnownZero, KnownOne, TLO)) DCI.CommitTargetLoweringOpt(TLO); } @@ -35937,7 +35939,7 @@ X86TargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI, // type. For example, we want to map "{ax},i32" -> {eax}, we don't want it to // turn into {ax},{dx}. // MVT::Other is used to specify clobber names. - if (Res.second->hasType(VT) || VT == MVT::Other) + if (TRI->isTypeLegalForClass(*Res.second, VT) || VT == MVT::Other) return Res; // Correct type already, nothing to do. // Get a matching integer of the correct size. i.e. 
"ax" with MVT::32 should @@ -35975,11 +35977,11 @@ X86TargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI, Res.second = &X86::FR32RegClass; else if (VT == MVT::f64 || VT == MVT::i64) Res.second = &X86::FR64RegClass; - else if (X86::VR128RegClass.hasType(VT)) + else if (TRI->isTypeLegalForClass(X86::VR128RegClass, VT)) Res.second = &X86::VR128RegClass; - else if (X86::VR256RegClass.hasType(VT)) + else if (TRI->isTypeLegalForClass(X86::VR256RegClass, VT)) Res.second = &X86::VR256RegClass; - else if (X86::VR512RegClass.hasType(VT)) + else if (TRI->isTypeLegalForClass(X86::VR512RegClass, VT)) Res.second = &X86::VR512RegClass; else { // Type mismatch and not a clobber: Return an error; diff --git a/lib/Target/X86/X86InstrArithmetic.td b/lib/Target/X86/X86InstrArithmetic.td index bfd21c0..6638201 100644 --- a/lib/Target/X86/X86InstrArithmetic.td +++ b/lib/Target/X86/X86InstrArithmetic.td @@ -989,10 +989,12 @@ multiclass ArithBinOp_RF BaseOpc, bits<8> BaseOpc2, bits<8> BaseOpc4, } } // Constraints = "$src1 = $dst" - def NAME#8mr : BinOpMR_RMW; - def NAME#16mr : BinOpMR_RMW; - def NAME#32mr : BinOpMR_RMW; - def NAME#64mr : BinOpMR_RMW; + let mayLoad = 1, mayStore = 1 in { + def NAME#8mr : BinOpMR_RMW; + def NAME#16mr : BinOpMR_RMW; + def NAME#32mr : BinOpMR_RMW; + def NAME#64mr : BinOpMR_RMW; + } // NOTE: These are order specific, we want the mi8 forms to be listed // first so that they are slightly preferred to the mi forms. diff --git a/lib/Target/X86/X86InstrInfo.cpp b/lib/Target/X86/X86InstrInfo.cpp index 7b456fd..26444dd 100644 --- a/lib/Target/X86/X86InstrInfo.cpp +++ b/lib/Target/X86/X86InstrInfo.cpp @@ -6284,9 +6284,11 @@ void X86InstrInfo::insertSelect(MachineBasicBlock &MBB, ArrayRef Cond, unsigned TrueReg, unsigned FalseReg) const { MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo(); + const TargetRegisterInfo &TRI = *MRI.getTargetRegisterInfo(); + const TargetRegisterClass &RC = *MRI.getRegClass(DstReg); assert(Cond.size() == 1 && "Invalid Cond array"); unsigned Opc = getCMovFromCond((X86::CondCode)Cond[0].getImm(), - MRI.getRegClass(DstReg)->getSize(), + TRI.getRegSizeInBits(RC) / 8, false /*HasMemoryOperand*/); BuildMI(MBB, I, DL, get(Opc), DstReg).addReg(FalseReg).addReg(TrueReg); } @@ -6557,7 +6559,7 @@ static unsigned getLoadStoreRegOpcode(unsigned Reg, bool HasAVX512 = STI.hasAVX512(); bool HasVLX = STI.hasVLX(); - switch (RC->getSize()) { + switch (STI.getRegisterInfo()->getSpillSize(*RC)) { default: llvm_unreachable("Unknown spill size"); case 1: @@ -6603,28 +6605,36 @@ static unsigned getLoadStoreRegOpcode(unsigned Reg, assert(X86::RFP80RegClass.hasSubClassEq(RC) && "Unknown 10-byte regclass"); return load ? X86::LD_Fp80m : X86::ST_FpP80m; case 16: { - assert(X86::VR128XRegClass.hasSubClassEq(RC) && "Unknown 16-byte regclass"); - // If stack is realigned we can use aligned stores. - if (isStackAligned) - return load ? - (HasVLX ? X86::VMOVAPSZ128rm : - HasAVX512 ? X86::VMOVAPSZ128rm_NOVLX : - HasAVX ? X86::VMOVAPSrm : - X86::MOVAPSrm): - (HasVLX ? X86::VMOVAPSZ128mr : - HasAVX512 ? X86::VMOVAPSZ128mr_NOVLX : - HasAVX ? X86::VMOVAPSmr : - X86::MOVAPSmr); - else - return load ? - (HasVLX ? X86::VMOVUPSZ128rm : - HasAVX512 ? X86::VMOVUPSZ128rm_NOVLX : - HasAVX ? X86::VMOVUPSrm : - X86::MOVUPSrm): - (HasVLX ? X86::VMOVUPSZ128mr : - HasAVX512 ? X86::VMOVUPSZ128mr_NOVLX : - HasAVX ? X86::VMOVUPSmr : - X86::MOVUPSmr); + if (X86::VR128XRegClass.hasSubClassEq(RC)) { + // If stack is realigned we can use aligned stores. + if (isStackAligned) + return load ? 
+ (HasVLX ? X86::VMOVAPSZ128rm : + HasAVX512 ? X86::VMOVAPSZ128rm_NOVLX : + HasAVX ? X86::VMOVAPSrm : + X86::MOVAPSrm): + (HasVLX ? X86::VMOVAPSZ128mr : + HasAVX512 ? X86::VMOVAPSZ128mr_NOVLX : + HasAVX ? X86::VMOVAPSmr : + X86::MOVAPSmr); + else + return load ? + (HasVLX ? X86::VMOVUPSZ128rm : + HasAVX512 ? X86::VMOVUPSZ128rm_NOVLX : + HasAVX ? X86::VMOVUPSrm : + X86::MOVUPSrm): + (HasVLX ? X86::VMOVUPSZ128mr : + HasAVX512 ? X86::VMOVUPSZ128mr_NOVLX : + HasAVX ? X86::VMOVUPSmr : + X86::MOVUPSmr); + } + if (X86::BNDRRegClass.hasSubClassEq(RC)) { + if (STI.is64Bit()) + return load ? X86::BNDMOVRM64rm : X86::BNDMOVMR64mr; + else + return load ? X86::BNDMOVRM32rm : X86::BNDMOVMR32mr; + } + llvm_unreachable("Unknown 16-byte regclass"); } case 32: assert(X86::VR256XRegClass.hasSubClassEq(RC) && "Unknown 32-byte regclass"); @@ -6709,9 +6719,9 @@ void X86InstrInfo::storeRegToStackSlot(MachineBasicBlock &MBB, const TargetRegisterClass *RC, const TargetRegisterInfo *TRI) const { const MachineFunction &MF = *MBB.getParent(); - assert(MF.getFrameInfo().getObjectSize(FrameIdx) >= RC->getSize() && + assert(MF.getFrameInfo().getObjectSize(FrameIdx) >= TRI->getSpillSize(*RC) && "Stack slot too small for store"); - unsigned Alignment = std::max(RC->getSize(), 16); + unsigned Alignment = std::max(TRI->getSpillSize(*RC), 16); bool isAligned = (Subtarget.getFrameLowering()->getStackAlignment() >= Alignment) || RI.canRealignStack(MF); @@ -6728,7 +6738,8 @@ void X86InstrInfo::storeRegToAddr(MachineFunction &MF, unsigned SrcReg, MachineInstr::mmo_iterator MMOBegin, MachineInstr::mmo_iterator MMOEnd, SmallVectorImpl &NewMIs) const { - unsigned Alignment = std::max(RC->getSize(), 16); + const TargetRegisterInfo &TRI = *MF.getSubtarget().getRegisterInfo(); + unsigned Alignment = std::max(TRI.getSpillSize(*RC), 16); bool isAligned = MMOBegin != MMOEnd && (*MMOBegin)->getAlignment() >= Alignment; unsigned Opc = getStoreRegOpcode(SrcReg, RC, isAligned, Subtarget); @@ -6748,7 +6759,7 @@ void X86InstrInfo::loadRegFromStackSlot(MachineBasicBlock &MBB, const TargetRegisterClass *RC, const TargetRegisterInfo *TRI) const { const MachineFunction &MF = *MBB.getParent(); - unsigned Alignment = std::max(RC->getSize(), 16); + unsigned Alignment = std::max(TRI->getSpillSize(*RC), 16); bool isAligned = (Subtarget.getFrameLowering()->getStackAlignment() >= Alignment) || RI.canRealignStack(MF); @@ -6763,7 +6774,8 @@ void X86InstrInfo::loadRegFromAddr(MachineFunction &MF, unsigned DestReg, MachineInstr::mmo_iterator MMOBegin, MachineInstr::mmo_iterator MMOEnd, SmallVectorImpl &NewMIs) const { - unsigned Alignment = std::max(RC->getSize(), 16); + const TargetRegisterInfo &TRI = *MF.getSubtarget().getRegisterInfo(); + unsigned Alignment = std::max(TRI.getSpillSize(*RC), 16); bool isAligned = MMOBegin != MMOEnd && (*MMOBegin)->getAlignment() >= Alignment; unsigned Opc = getLoadRegOpcode(DestReg, RC, isAligned, Subtarget); @@ -7222,7 +7234,8 @@ bool X86InstrInfo::optimizeCompareInstr(MachineInstr &CmpInstr, unsigned SrcReg, NewOpc = getSETFromCond(NewCC, HasMemoryOperand); else { unsigned DstReg = Instr.getOperand(0).getReg(); - NewOpc = getCMovFromCond(NewCC, MRI->getRegClass(DstReg)->getSize(), + const TargetRegisterClass *DstRC = MRI->getRegClass(DstReg); + NewOpc = getCMovFromCond(NewCC, TRI->getRegSizeInBits(*DstRC)/8, HasMemoryOperand); } @@ -7750,7 +7763,9 @@ MachineInstr *X86InstrInfo::foldMemoryOperandCustom( unsigned DstIdx = (Imm >> 4) & 3; unsigned SrcIdx = (Imm >> 6) & 3; - unsigned RCSize = getRegClass(MI.getDesc(), OpNum, 
&RI, MF)->getSize(); + const TargetRegisterInfo &TRI = *MF.getSubtarget().getRegisterInfo(); + const TargetRegisterClass *RC = getRegClass(MI.getDesc(), OpNum, &RI, MF); + unsigned RCSize = TRI.getRegSizeInBits(*RC) / 8; if (Size <= RCSize && 4 <= Align) { int PtrOffset = SrcIdx * 4; unsigned NewImm = (DstIdx << 4) | ZMask; @@ -7772,7 +7787,9 @@ MachineInstr *X86InstrInfo::foldMemoryOperandCustom( // To fold the load, adjust the pointer to the upper and use (V)MOVLPS. // TODO: In most cases AVX doesn't have a 8-byte alignment requirement. if (OpNum == 2) { - unsigned RCSize = getRegClass(MI.getDesc(), OpNum, &RI, MF)->getSize(); + const TargetRegisterInfo &TRI = *MF.getSubtarget().getRegisterInfo(); + const TargetRegisterClass *RC = getRegClass(MI.getDesc(), OpNum, &RI, MF); + unsigned RCSize = TRI.getRegSizeInBits(*RC) / 8; if (Size <= RCSize && 8 <= Align) { unsigned NewOpCode = (MI.getOpcode() == X86::VMOVHLPSZrr) ? X86::VMOVLPSZ128rm : @@ -7861,7 +7878,10 @@ MachineInstr *X86InstrInfo::foldMemoryOperandImpl( return nullptr; bool NarrowToMOV32rm = false; if (Size) { - unsigned RCSize = getRegClass(MI.getDesc(), OpNum, &RI, MF)->getSize(); + const TargetRegisterInfo &TRI = *MF.getSubtarget().getRegisterInfo(); + const TargetRegisterClass *RC = getRegClass(MI.getDesc(), OpNum, + &RI, MF); + unsigned RCSize = TRI.getRegSizeInBits(*RC) / 8; if (Size < RCSize) { // Check if it's safe to fold the load. If the size of the object is // narrower than the load width, then it's not. @@ -8302,11 +8322,13 @@ static bool isNonFoldablePartialRegisterLoad(const MachineInstr &LoadMI, const MachineFunction &MF) { unsigned Opc = LoadMI.getOpcode(); unsigned UserOpc = UserMI.getOpcode(); - unsigned RegSize = - MF.getRegInfo().getRegClass(LoadMI.getOperand(0).getReg())->getSize(); + const TargetRegisterInfo &TRI = *MF.getSubtarget().getRegisterInfo(); + const TargetRegisterClass *RC = + MF.getRegInfo().getRegClass(LoadMI.getOperand(0).getReg()); + unsigned RegSize = TRI.getRegSizeInBits(*RC); if ((Opc == X86::MOVSSrm || Opc == X86::VMOVSSrm || Opc == X86::VMOVSSZrm) && - RegSize > 4) { + RegSize > 32) { // These instructions only load 32 bits, we can't fold them if the // destination register is wider than 32 bits (4 bytes), and its user // instruction isn't scalar (SS). @@ -8357,7 +8379,7 @@ static bool isNonFoldablePartialRegisterLoad(const MachineInstr &LoadMI, } if ((Opc == X86::MOVSDrm || Opc == X86::VMOVSDrm || Opc == X86::VMOVSDZrm) && - RegSize > 8) { + RegSize > 64) { // These instructions only load 64 bits, we can't fold them if the // destination register is wider than 64 bits (8 bytes), and its user // instruction isn't scalar (SD). @@ -8702,6 +8724,7 @@ X86InstrInfo::unfoldMemoryOperand(SelectionDAG &DAG, SDNode *N, bool FoldedStore = I->second.second & TB_FOLDED_STORE; const MCInstrDesc &MCID = get(Opc); MachineFunction &MF = DAG.getMachineFunction(); + const TargetRegisterInfo &TRI = *MF.getSubtarget().getRegisterInfo(); const TargetRegisterClass *RC = getRegClass(MCID, Index, &RI, MF); unsigned NumDefs = MCID.NumDefs; std::vector AddrOps; @@ -8724,7 +8747,7 @@ X86InstrInfo::unfoldMemoryOperand(SelectionDAG &DAG, SDNode *N, // Emit the load instruction. 
SDNode *Load = nullptr; if (FoldedLoad) { - EVT VT = *RC->vt_begin(); + EVT VT = *TRI.legalclasstypes_begin(*RC); std::pair MMOs = MF.extractLoadMemRefs(cast(N)->memoperands_begin(), @@ -8736,7 +8759,7 @@ X86InstrInfo::unfoldMemoryOperand(SelectionDAG &DAG, SDNode *N, return false; // FIXME: If a VR128 can have size 32, we should be checking if a 32-byte // memory access is slow above. - unsigned Alignment = std::max(RC->getSize(), 16); + unsigned Alignment = std::max(TRI.getSpillSize(*RC), 16); bool isAligned = (*MMOs.first) && (*MMOs.first)->getAlignment() >= Alignment; Load = DAG.getMachineNode(getLoadRegOpcode(0, RC, isAligned, Subtarget), dl, @@ -8752,7 +8775,7 @@ X86InstrInfo::unfoldMemoryOperand(SelectionDAG &DAG, SDNode *N, const TargetRegisterClass *DstRC = nullptr; if (MCID.getNumDefs() > 0) { DstRC = getRegClass(MCID, 0, &RI, MF); - VTs.push_back(*DstRC->vt_begin()); + VTs.push_back(*TRI.legalclasstypes_begin(*DstRC)); } for (unsigned i = 0, e = N->getNumValues(); i != e; ++i) { EVT VT = N->getValueType(i); @@ -8781,7 +8804,7 @@ X86InstrInfo::unfoldMemoryOperand(SelectionDAG &DAG, SDNode *N, return false; // FIXME: If a VR128 can have size 32, we should be checking if a 32-byte // memory access is slow above. - unsigned Alignment = std::max(RC->getSize(), 16); + unsigned Alignment = std::max(TRI.getSpillSize(*RC), 16); bool isAligned = (*MMOs.first) && (*MMOs.first)->getAlignment() >= Alignment; SDNode *Store = @@ -9514,7 +9537,7 @@ void X86InstrInfo::setExecutionDomain(MachineInstr &MI, unsigned Domain) const { } /// Return the noop instruction to use for a noop. -void X86InstrInfo::getNoopForMachoTarget(MCInst &NopInst) const { +void X86InstrInfo::getNoop(MCInst &NopInst) const { NopInst.setOpcode(X86::NOOP); } diff --git a/lib/Target/X86/X86InstrInfo.h b/lib/Target/X86/X86InstrInfo.h index 2fee485..3856783 100644 --- a/lib/Target/X86/X86InstrInfo.h +++ b/lib/Target/X86/X86InstrInfo.h @@ -457,7 +457,7 @@ public: int64_t Offset1, int64_t Offset2, unsigned NumLoads) const override; - void getNoopForMachoTarget(MCInst &NopInst) const override; + void getNoop(MCInst &NopInst) const override; bool reverseBranchCondition(SmallVectorImpl &Cond) const override; diff --git a/lib/Target/X86/X86InstrInfo.td b/lib/Target/X86/X86InstrInfo.td index e31d276..c3def46 100644 --- a/lib/Target/X86/X86InstrInfo.td +++ b/lib/Target/X86/X86InstrInfo.td @@ -897,6 +897,7 @@ def NotSlowIncDec : Predicate<"!Subtarget->slowIncDec()">; def HasFastMem32 : Predicate<"!Subtarget->isUnalignedMem32Slow()">; def HasFastLZCNT : Predicate<"Subtarget->hasFastLZCNT()">; def HasFastSHLDRotate : Predicate<"Subtarget->hasFastSHLDRotate()">; +def HasERMSB : Predicate<"Subtarget->hasERMSB()">; def HasMFence : Predicate<"Subtarget->hasMFence()">; //===----------------------------------------------------------------------===// diff --git a/lib/Target/X86/X86InstrSSE.td b/lib/Target/X86/X86InstrSSE.td index e1bf28c..f22a502 100644 --- a/lib/Target/X86/X86InstrSSE.td +++ b/lib/Target/X86/X86InstrSSE.td @@ -4602,17 +4602,17 @@ def MOVDI2PDIrm : S2I<0x6E, MRMSrcMem, (outs VR128:$dst), (ins i32mem:$src), (v4i32 (scalar_to_vector (loadi32 addr:$src))))], IIC_SSE_MOVDQ>, Sched<[WriteLoad]>; def MOV64toPQIrr : RS2I<0x6E, MRMSrcReg, (outs VR128:$dst), (ins GR64:$src), - "mov{d|q}\t{$src, $dst|$dst, $src}", + "movq\t{$src, $dst|$dst, $src}", [(set VR128:$dst, (v2i64 (scalar_to_vector GR64:$src)))], IIC_SSE_MOVDQ>, Sched<[WriteMove]>; let isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0, mayLoad = 1 in def MOV64toPQIrm : 
RS2I<0x6E, MRMSrcMem, (outs VR128:$dst), (ins i64mem:$src), - "mov{d|q}\t{$src, $dst|$dst, $src}", + "movq\t{$src, $dst|$dst, $src}", [], IIC_SSE_MOVDQ>, Sched<[WriteLoad]>; let isCodeGenOnly = 1 in def MOV64toSDrr : RS2I<0x6E, MRMSrcReg, (outs FR64:$dst), (ins GR64:$src), - "mov{d|q}\t{$src, $dst|$dst, $src}", + "movq\t{$src, $dst|$dst, $src}", [(set FR64:$dst, (bitconvert GR64:$src))], IIC_SSE_MOVDQ>, Sched<[WriteMove]>; } // ExeDomain = SSEPackedInt @@ -4681,7 +4681,7 @@ def VMOVPQIto64rr : VRS2I<0x7E, MRMDestReg, (outs GR64:$dst), (ins VR128:$src), VEX; def MOVPQIto64rr : RS2I<0x7E, MRMDestReg, (outs GR64:$dst), (ins VR128:$src), - "mov{d|q}\t{$src, $dst|$dst, $src}", + "movq\t{$src, $dst|$dst, $src}", [(set GR64:$dst, (extractelt (v2i64 VR128:$src), (iPTR 0)))], IIC_SSE_MOVD_ToGP>; @@ -4694,7 +4694,7 @@ def VMOVPQIto64mr : VRS2I<0x7E, MRMDestMem, (outs), [], IIC_SSE_MOVDQ>, VEX, Sched<[WriteStore]>; let isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0, mayStore = 1 in def MOVPQIto64mr : RS2I<0x7E, MRMDestMem, (outs), (ins i64mem:$dst, VR128:$src), - "mov{d|q}\t{$src, $dst|$dst, $src}", + "movq\t{$src, $dst|$dst, $src}", [], IIC_SSE_MOVDQ>, Sched<[WriteStore]>; } // ExeDomain = SSEPackedInt @@ -4721,7 +4721,7 @@ let ExeDomain = SSEPackedInt, isCodeGenOnly = 1 in { [(set FR64:$dst, (bitconvert (loadi64 addr:$src)))], IIC_SSE_MOVDQ>, Sched<[WriteLoad]>; def MOVSDto64rr : RS2I<0x7E, MRMDestReg, (outs GR64:$dst), (ins FR64:$src), - "mov{d|q}\t{$src, $dst|$dst, $src}", + "movq\t{$src, $dst|$dst, $src}", [(set GR64:$dst, (bitconvert FR64:$src))], IIC_SSE_MOVD_ToGP>, Sched<[WriteMove]>; def MOVSDto64mr : RS2I<0x7E, MRMDestMem, (outs), (ins i64mem:$dst, FR64:$src), @@ -4811,12 +4811,12 @@ let Predicates = [UseSSE2] in { } } -// These are the correct encodings of the instructions so that we know how to -// read correct assembly, even though we continue to emit the wrong ones for -// compatibility with Darwin's buggy assembler. -def : InstAlias<"movq\t{$src, $dst|$dst, $src}", +// Before the MC layer of LLVM existed, clang emitted "movd" assembly instead of +// "movq" due to MacOS parsing limitation. In order to parse old assembly, we add +// these aliases. +def : InstAlias<"movd\t{$src, $dst|$dst, $src}", (MOV64toPQIrr VR128:$dst, GR64:$src), 0>; -def : InstAlias<"movq\t{$src, $dst|$dst, $src}", +def : InstAlias<"movd\t{$src, $dst|$dst, $src}", (MOVPQIto64rr GR64:$dst, VR128:$src), 0>; // Allow "vmovd" but print "vmovq" since we don't need compatibility for AVX. 
def : InstAlias<"vmovd\t{$src, $dst|$dst, $src}", @@ -7144,33 +7144,37 @@ let Predicates = [UseSSE41] in { /// SS42I_binop_rm - Simple SSE 4.2 binary operator multiclass SS42I_binop_rm opc, string OpcodeStr, SDNode OpNode, ValueType OpVT, RegisterClass RC, PatFrag memop_frag, - X86MemOperand x86memop, bit Is2Addr = 1> { + X86MemOperand x86memop, OpndItins itins, + bit Is2Addr = 1> { def rr : SS428I; + [(set RC:$dst, (OpVT (OpNode RC:$src1, RC:$src2)))]>, Sched<[itins.Sched]>; def rm : SS428I; + (OpVT (OpNode RC:$src1, (memop_frag addr:$src2))))]>, + Sched<[itins.Sched.Folded, ReadAfterLd]>; } let Predicates = [HasAVX] in defm VPCMPGTQ : SS42I_binop_rm<0x37, "vpcmpgtq", X86pcmpgt, v2i64, VR128, - loadv2i64, i128mem, 0>, VEX_4V, VEX_WIG; + loadv2i64, i128mem, SSE_INTALU_ITINS_P, 0>, + VEX_4V, VEX_WIG; let Predicates = [HasAVX2] in defm VPCMPGTQY : SS42I_binop_rm<0x37, "vpcmpgtq", X86pcmpgt, v4i64, VR256, - loadv4i64, i256mem, 0>, VEX_4V, VEX_L, VEX_WIG; + loadv4i64, i256mem, SSE_INTALU_ITINS_P, 0>, + VEX_4V, VEX_L, VEX_WIG; let Constraints = "$src1 = $dst" in defm PCMPGTQ : SS42I_binop_rm<0x37, "pcmpgtq", X86pcmpgt, v2i64, VR128, - memopv2i64, i128mem>; + memopv2i64, i128mem, SSE_INTALU_ITINS_P>; //===----------------------------------------------------------------------===// // SSE4.2 - String/text Processing Instructions diff --git a/lib/Target/X86/X86InstructionSelector.cpp b/lib/Target/X86/X86InstructionSelector.cpp index fb93157..d0f1b70 100644 --- a/lib/Target/X86/X86InstructionSelector.cpp +++ b/lib/Target/X86/X86InstructionSelector.cpp @@ -39,11 +39,16 @@ using namespace llvm; namespace { +#define GET_GLOBALISEL_PREDICATE_BITSET +#include "X86GenGlobalISel.inc" +#undef GET_GLOBALISEL_PREDICATE_BITSET + class X86InstructionSelector : public InstructionSelector { public: - X86InstructionSelector(const X86Subtarget &STI, + X86InstructionSelector(const X86TargetMachine &TM, const X86Subtarget &STI, const X86RegisterBankInfo &RBI); + void beginFunction(const MachineFunction &MF) override; bool select(MachineInstr &I) const override; private: @@ -70,10 +75,17 @@ private: bool selectTrunc(MachineInstr &I, MachineRegisterInfo &MRI, MachineFunction &MF) const; + const X86TargetMachine &TM; const X86Subtarget &STI; const X86InstrInfo &TII; const X86RegisterInfo &TRI; const X86RegisterBankInfo &RBI; + bool OptForSize; + bool OptForMinSize; + + PredicateBitset AvailableFeatures; + PredicateBitset computeAvailableFeatures(const MachineFunction *MF, + const X86Subtarget *Subtarget) const; #define GET_GLOBALISEL_TEMPORARIES_DECL #include "X86GenGlobalISel.inc" @@ -86,10 +98,12 @@ private: #include "X86GenGlobalISel.inc" #undef GET_GLOBALISEL_IMPL -X86InstructionSelector::X86InstructionSelector(const X86Subtarget &STI, +X86InstructionSelector::X86InstructionSelector(const X86TargetMachine &TM, + const X86Subtarget &STI, const X86RegisterBankInfo &RBI) - : InstructionSelector(), STI(STI), TII(*STI.getInstrInfo()), - TRI(*STI.getRegisterInfo()), RBI(RBI) + : InstructionSelector(), TM(TM), STI(STI), TII(*STI.getInstrInfo()), + TRI(*STI.getRegisterInfo()), RBI(RBI), OptForSize(false), + OptForMinSize(false), AvailableFeatures() #define GET_GLOBALISEL_TEMPORARIES_INIT #include "X86GenGlobalISel.inc" #undef GET_GLOBALISEL_TEMPORARIES_INIT @@ -181,6 +195,12 @@ static bool selectCopy(MachineInstr &I, const TargetInstrInfo &TII, return true; } +void X86InstructionSelector::beginFunction(const MachineFunction &MF) { + OptForSize = MF.getFunction()->optForSize(); + OptForMinSize = 
MF.getFunction()->optForMinSize(); + AvailableFeatures = computeAvailableFeatures(&MF, &STI); +} + bool X86InstructionSelector::select(MachineInstr &I) const { assert(I.getParent() && "Instruction should be in a basic block!"); assert(I.getParent()->getParent() && "Instruction should be in a function!"); @@ -571,7 +591,8 @@ bool X86InstructionSelector::selectTrunc(MachineInstr &I, } InstructionSelector * -llvm::createX86InstructionSelector(X86Subtarget &Subtarget, +llvm::createX86InstructionSelector(const X86TargetMachine &TM, + X86Subtarget &Subtarget, X86RegisterBankInfo &RBI) { - return new X86InstructionSelector(Subtarget, RBI); + return new X86InstructionSelector(TM, Subtarget, RBI); } diff --git a/lib/Target/X86/X86RegisterInfo.cpp b/lib/Target/X86/X86RegisterInfo.cpp index 9bab9a4..1f16f3c 100644 --- a/lib/Target/X86/X86RegisterInfo.cpp +++ b/lib/Target/X86/X86RegisterInfo.cpp @@ -137,25 +137,29 @@ X86RegisterInfo::getLargestLegalSuperClass(const TargetRegisterClass *RC, case X86::FR32RegClassID: case X86::FR64RegClassID: // If AVX-512 isn't supported we should only inflate to these classes. - if (!Subtarget.hasAVX512() && Super->getSize() == RC->getSize()) + if (!Subtarget.hasAVX512() && + getRegSizeInBits(*Super) == getRegSizeInBits(*RC)) return Super; break; case X86::VR128RegClassID: case X86::VR256RegClassID: // If VLX isn't supported we should only inflate to these classes. - if (!Subtarget.hasVLX() && Super->getSize() == RC->getSize()) + if (!Subtarget.hasVLX() && + getRegSizeInBits(*Super) == getRegSizeInBits(*RC)) return Super; break; case X86::VR128XRegClassID: case X86::VR256XRegClassID: // If VLX isn't support we shouldn't inflate to these classes. - if (Subtarget.hasVLX() && Super->getSize() == RC->getSize()) + if (Subtarget.hasVLX() && + getRegSizeInBits(*Super) == getRegSizeInBits(*RC)) return Super; break; case X86::FR32XRegClassID: case X86::FR64XRegClassID: // If AVX-512 isn't support we shouldn't inflate to these classes. - if (Subtarget.hasAVX512() && Super->getSize() == RC->getSize()) + if (Subtarget.hasAVX512() && + getRegSizeInBits(*Super) == getRegSizeInBits(*RC)) return Super; break; case X86::GR8RegClassID: @@ -168,7 +172,7 @@ X86RegisterInfo::getLargestLegalSuperClass(const TargetRegisterClass *RC, case X86::VR512RegClassID: // Don't return a super-class that would shrink the spill size. // That can happen with the vector and float classes. - if (Super->getSize() == RC->getSize()) + if (getRegSizeInBits(*Super) == getRegSizeInBits(*RC)) return Super; } Super = *I++; @@ -669,32 +673,28 @@ X86RegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II, MachineFunction &MF = *MI.getParent()->getParent(); const X86FrameLowering *TFI = getFrameLowering(MF); int FrameIndex = MI.getOperand(FIOperandNum).getIndex(); - unsigned BasePtr; - unsigned Opc = MI.getOpcode(); - bool AfterFPPop = Opc == X86::TAILJMPm64 || Opc == X86::TAILJMPm || - Opc == X86::TCRETURNmi || Opc == X86::TCRETURNmi64; - - if (hasBasePointer(MF)) - BasePtr = (FrameIndex < 0 ? FramePtr : getBaseRegister()); - else if (needsStackRealignment(MF)) - BasePtr = (FrameIndex < 0 ? FramePtr : StackPtr); - else if (AfterFPPop) - BasePtr = StackPtr; - else - BasePtr = (TFI->hasFP(MF) ? FramePtr : StackPtr); + // Determine base register and offset. 
+ int FIOffset; + unsigned BasePtr; + if (MI.isReturn()) { + assert((!needsStackRealignment(MF) || + MF.getFrameInfo().isFixedObjectIndex(FrameIndex)) && + "Return instruction can only reference SP relative frame objects"); + FIOffset = TFI->getFrameIndexReferenceSP(MF, FrameIndex, BasePtr, 0); + } else { + FIOffset = TFI->getFrameIndexReference(MF, FrameIndex, BasePtr); + } // LOCAL_ESCAPE uses a single offset, with no register. It only works in the // simple FP case, and doesn't work with stack realignment. On 32-bit, the // offset is from the traditional base pointer location. On 64-bit, the // offset is from the SP at the end of the prologue, not the FP location. This // matches the behavior of llvm.frameaddress. - unsigned IgnoredFrameReg; + unsigned Opc = MI.getOpcode(); if (Opc == TargetOpcode::LOCAL_ESCAPE) { MachineOperand &FI = MI.getOperand(FIOperandNum); - int Offset; - Offset = TFI->getFrameIndexReference(MF, FrameIndex, IgnoredFrameReg); - FI.ChangeToImmediate(Offset); + FI.ChangeToImmediate(FIOffset); return; } @@ -710,15 +710,6 @@ X86RegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II, // FrameIndex with base register. Add an offset to the offset. MI.getOperand(FIOperandNum).ChangeToRegister(MachineBasePtr, false); - // Now add the frame object offset to the offset from EBP. - int FIOffset; - if (AfterFPPop) { - // Tail call jmp happens after FP is popped. - const MachineFrameInfo &MFI = MF.getFrameInfo(); - FIOffset = MFI.getObjectOffset(FrameIndex) - TFI->getOffsetOfLocalArea(); - } else - FIOffset = TFI->getFrameIndexReference(MF, FrameIndex, IgnoredFrameReg); - if (BasePtr == StackPtr) FIOffset += SPAdj; diff --git a/lib/Target/X86/X86SelectionDAGInfo.cpp b/lib/Target/X86/X86SelectionDAGInfo.cpp index 9da8a18..1a72a0b 100644 --- a/lib/Target/X86/X86SelectionDAGInfo.cpp +++ b/lib/Target/X86/X86SelectionDAGInfo.cpp @@ -106,7 +106,6 @@ SDValue X86SelectionDAGInfo::EmitTargetCodeForMemset( SDValue Count; ConstantSDNode *ValC = dyn_cast(Src); unsigned BytesLeft = 0; - bool TwoRepStos = false; if (ValC) { unsigned ValReg; uint64_t Val = ValC->getZExtValue() & 255; @@ -163,20 +162,7 @@ SDValue X86SelectionDAGInfo::EmitTargetCodeForMemset( SDValue Ops[] = { Chain, DAG.getValueType(AVT), InFlag }; Chain = DAG.getNode(X86ISD::REP_STOS, dl, Tys, Ops); - if (TwoRepStos) { - InFlag = Chain.getValue(1); - Count = Size; - EVT CVT = Count.getValueType(); - SDValue Left = DAG.getNode(ISD::AND, dl, CVT, Count, - DAG.getConstant((AVT == MVT::i64) ? 7 : 3, dl, - CVT)); - Chain = DAG.getCopyToReg(Chain, dl, (CVT == MVT::i64) ? X86::RCX : X86::ECX, - Left, InFlag); - InFlag = Chain.getValue(1); - Tys = DAG.getVTList(MVT::Other, MVT::Glue); - SDValue Ops[] = { Chain, DAG.getValueType(MVT::i8), InFlag }; - Chain = DAG.getNode(X86ISD::REP_STOS, dl, Tys, Ops); - } else if (BytesLeft) { + if (BytesLeft) { // Handle the last 1 - 7 bytes. unsigned Offset = SizeVal - BytesLeft; EVT AddrVT = Dst.getValueType(); @@ -195,6 +181,24 @@ SDValue X86SelectionDAGInfo::EmitTargetCodeForMemset( return Chain; } +namespace { + +// Represents a cover of a buffer of SizeVal bytes with blocks of size +// AVT, as well as how many bytes remain (BytesLeft is always smaller than +// the block size). 
+struct RepMovsRepeats { + RepMovsRepeats(const uint64_t SizeVal, const MVT& AVT) { + const unsigned UBytes = AVT.getSizeInBits() / 8; + Count = SizeVal / UBytes; + BytesLeft = SizeVal % UBytes; + } + + unsigned Count; + unsigned BytesLeft; +}; + +} // namespace + SDValue X86SelectionDAGInfo::EmitTargetCodeForMemcpy( SelectionDAG &DAG, const SDLoc &dl, SDValue Chain, SDValue Dst, SDValue Src, SDValue Size, unsigned Align, bool isVolatile, bool AlwaysInline, @@ -229,7 +233,12 @@ SDValue X86SelectionDAGInfo::EmitTargetCodeForMemcpy( return SDValue(); MVT AVT; - if (Align & 1) + if (Subtarget.hasERMSB()) + // If the target has enhanced REPMOVSB, then it's at least as fast to use + // REP MOVSB instead of REP MOVS{W,D,Q}, and it avoids having to handle + // BytesLeft. + AVT = MVT::i8; + else if (Align & 1) AVT = MVT::i8; else if (Align & 2) AVT = MVT::i16; @@ -240,14 +249,18 @@ SDValue X86SelectionDAGInfo::EmitTargetCodeForMemcpy( // QWORD aligned AVT = Subtarget.is64Bit() ? MVT::i64 : MVT::i32; - unsigned UBytes = AVT.getSizeInBits() / 8; - unsigned CountVal = SizeVal / UBytes; - SDValue Count = DAG.getIntPtrConstant(CountVal, dl); - unsigned BytesLeft = SizeVal % UBytes; + RepMovsRepeats Repeats(SizeVal, AVT); + if (Repeats.BytesLeft > 0 && + DAG.getMachineFunction().getFunction()->optForMinSize()) { + // When agressively optimizing for size, avoid generating the code to handle + // BytesLeft. + AVT = MVT::i8; + Repeats = RepMovsRepeats(SizeVal, AVT); + } SDValue InFlag; Chain = DAG.getCopyToReg(Chain, dl, Subtarget.is64Bit() ? X86::RCX : X86::ECX, - Count, InFlag); + DAG.getIntPtrConstant(Repeats.Count, dl), InFlag); InFlag = Chain.getValue(1); Chain = DAG.getCopyToReg(Chain, dl, Subtarget.is64Bit() ? X86::RDI : X86::EDI, Dst, InFlag); @@ -262,9 +275,9 @@ SDValue X86SelectionDAGInfo::EmitTargetCodeForMemcpy( SmallVector Results; Results.push_back(RepMovs); - if (BytesLeft) { + if (Repeats.BytesLeft) { // Handle the last 1 - 7 bytes. - unsigned Offset = SizeVal - BytesLeft; + unsigned Offset = SizeVal - Repeats.BytesLeft; EVT DstVT = Dst.getValueType(); EVT SrcVT = Src.getValueType(); EVT SizeVT = Size.getValueType(); @@ -275,7 +288,8 @@ SDValue X86SelectionDAGInfo::EmitTargetCodeForMemcpy( DAG.getNode(ISD::ADD, dl, SrcVT, Src, DAG.getConstant(Offset, dl, SrcVT)), - DAG.getConstant(BytesLeft, dl, SizeVT), + DAG.getConstant(Repeats.BytesLeft, dl, + SizeVT), Align, isVolatile, AlwaysInline, false, DstPtrInfo.getWithOffset(Offset), SrcPtrInfo.getWithOffset(Offset))); diff --git a/lib/Target/X86/X86Subtarget.cpp b/lib/Target/X86/X86Subtarget.cpp index 92a6875..4154530 100644 --- a/lib/Target/X86/X86Subtarget.cpp +++ b/lib/Target/X86/X86Subtarget.cpp @@ -303,6 +303,7 @@ void X86Subtarget::initializeEnvironment() { HasFastVectorFSQRT = false; HasFastLZCNT = false; HasFastSHLDRotate = false; + HasERMSB = false; HasSlowDivide32 = false; HasSlowDivide64 = false; PadShortFunctions = false; diff --git a/lib/Target/X86/X86Subtarget.h b/lib/Target/X86/X86Subtarget.h index d0d88d3..fd057f3 100644 --- a/lib/Target/X86/X86Subtarget.h +++ b/lib/Target/X86/X86Subtarget.h @@ -232,6 +232,9 @@ protected: /// True if SHLD based rotate is fast. bool HasFastSHLDRotate; + /// True if the processor has enhanced REP MOVSB/STOSB. + bool HasERMSB; + /// True if the short functions should be padded to prevent /// a stall when returning too early. 
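The hunk above teaches the memcpy lowering about ERMSB: with enhanced REP MOVSB the byte form is already at least as fast as the wider REP MOVS{W,D,Q} forms, and when optimizing for minimum size the byte form avoids emitting any tail-handling code. A rough standalone sketch of that width/count/remainder decision, using simplified stand-in types rather than the patch's own (the alignment ladder here is abbreviated):

    #include <cstdint>
    #include <cstdio>

    // Hypothetical stand-in for the element widths chosen by the lowering, in bytes.
    enum class ElemWidth : unsigned { I8 = 1, I16 = 2, I32 = 4, I64 = 8 };

    struct RepMovsPlan {
      ElemWidth Width;     // element type used by REP MOVS
      uint64_t Count;      // number of elements copied by REP MOVS
      uint64_t BytesLeft;  // trailing bytes handled separately (always < width)
    };

    // With ERMSB, bytes are always used; otherwise the width follows the
    // alignment. When optimizing for minimum size and a remainder would be
    // left over, fall back to bytes so no tail copy has to be emitted.
    RepMovsPlan planRepMovs(uint64_t SizeVal, unsigned Align, bool HasERMSB,
                            bool Is64Bit, bool OptForMinSize) {
      ElemWidth W;
      if (HasERMSB || (Align & 1))
        W = ElemWidth::I8;
      else if (Align & 2)
        W = ElemWidth::I16;
      else if (Align & 4)
        W = ElemWidth::I32;
      else
        W = Is64Bit ? ElemWidth::I64 : ElemWidth::I32;

      uint64_t UBytes = static_cast<uint64_t>(W);
      if (OptForMinSize && (SizeVal % UBytes) != 0) {
        W = ElemWidth::I8;
        UBytes = 1;
      }
      return {W, SizeVal / UBytes, SizeVal % UBytes};
    }

    int main() {
      // e.g. a 23-byte, 8-byte-aligned copy without ERMSB: 2 x i64 plus a
      // 7-byte tail.
      RepMovsPlan P = planRepMovs(23, 8, /*HasERMSB=*/false, /*Is64Bit=*/true,
                                  /*OptForMinSize=*/false);
      std::printf("width=%u count=%llu left=%llu\n", (unsigned)P.Width,
                  (unsigned long long)P.Count, (unsigned long long)P.BytesLeft);
    }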
bool PadShortFunctions; @@ -472,6 +475,7 @@ public: bool hasFastVectorFSQRT() const { return HasFastVectorFSQRT; } bool hasFastLZCNT() const { return HasFastLZCNT; } bool hasFastSHLDRotate() const { return HasFastSHLDRotate; } + bool hasERMSB() const { return HasERMSB; } bool hasSlowDivide32() const { return HasSlowDivide32; } bool hasSlowDivide64() const { return HasSlowDivide64; } bool padShortFunctions() const { return PadShortFunctions; } diff --git a/lib/Target/X86/X86TargetMachine.cpp b/lib/Target/X86/X86TargetMachine.cpp index 03a1958..623cf38 100644 --- a/lib/Target/X86/X86TargetMachine.cpp +++ b/lib/Target/X86/X86TargetMachine.cpp @@ -286,7 +286,7 @@ X86TargetMachine::getSubtargetImpl(const Function &F) const { auto *RBI = new X86RegisterBankInfo(*I->getRegisterInfo()); GISel->RegBankInfo.reset(RBI); - GISel->InstSelector.reset(createX86InstructionSelector(*I, *RBI)); + GISel->InstSelector.reset(createX86InstructionSelector(*this, *I, *RBI)); #endif I->setGISelAccessor(*GISel); } diff --git a/lib/Target/XCore/XCoreFrameLowering.cpp b/lib/Target/XCore/XCoreFrameLowering.cpp index a752357..7846120 100644 --- a/lib/Target/XCore/XCoreFrameLowering.cpp +++ b/lib/Target/XCore/XCoreFrameLowering.cpp @@ -575,18 +575,17 @@ processFunctionBeforeFrameFinalized(MachineFunction &MF, RegScavenger *RS) const { assert(RS && "requiresRegisterScavenging failed"); MachineFrameInfo &MFI = MF.getFrameInfo(); - const TargetRegisterClass *RC = &XCore::GRRegsRegClass; + const TargetRegisterClass &RC = XCore::GRRegsRegClass; + const TargetRegisterInfo &TRI = *MF.getSubtarget().getRegisterInfo(); XCoreFunctionInfo *XFI = MF.getInfo(); // Reserve slots close to SP or frame pointer for Scavenging spills. // When using SP for small frames, we don't need any scratch registers. // When using SP for large frames, we may need 2 scratch registers. // When using FP, for large or small frames, we may need 1 scratch register. 
+ unsigned Size = TRI.getSpillSize(RC); + unsigned Align = TRI.getSpillAlignment(RC); if (XFI->isLargeFrame(MF) || hasFP(MF)) - RS->addScavengingFrameIndex(MFI.CreateStackObject(RC->getSize(), - RC->getAlignment(), - false)); + RS->addScavengingFrameIndex(MFI.CreateStackObject(Size, Align, false)); if (XFI->isLargeFrame(MF) && !hasFP(MF)) - RS->addScavengingFrameIndex(MFI.CreateStackObject(RC->getSize(), - RC->getAlignment(), - false)); + RS->addScavengingFrameIndex(MFI.CreateStackObject(Size, Align, false)); } diff --git a/lib/Target/XCore/XCoreISelLowering.cpp b/lib/Target/XCore/XCoreISelLowering.cpp index 4543781..2efcd46 100644 --- a/lib/Target/XCore/XCoreISelLowering.cpp +++ b/lib/Target/XCore/XCoreISelLowering.cpp @@ -1605,7 +1605,7 @@ SDValue XCoreTargetLowering::PerformDAGCombine(SDNode *N, TargetLowering::TargetLoweringOpt TLO(DAG, !DCI.isBeforeLegalize(), !DCI.isBeforeLegalizeOps()); const TargetLowering &TLI = DAG.getTargetLoweringInfo(); - if (TLO.ShrinkDemandedConstant(OutVal, DemandedMask) || + if (TLI.ShrinkDemandedConstant(OutVal, DemandedMask, TLO) || TLI.SimplifyDemandedBits(OutVal, DemandedMask, KnownZero, KnownOne, TLO)) DCI.CommitTargetLoweringOpt(TLO); @@ -1622,7 +1622,7 @@ SDValue XCoreTargetLowering::PerformDAGCombine(SDNode *N, TargetLowering::TargetLoweringOpt TLO(DAG, !DCI.isBeforeLegalize(), !DCI.isBeforeLegalizeOps()); const TargetLowering &TLI = DAG.getTargetLoweringInfo(); - if (TLO.ShrinkDemandedConstant(Time, DemandedMask) || + if (TLI.ShrinkDemandedConstant(Time, DemandedMask, TLO) || TLI.SimplifyDemandedBits(Time, DemandedMask, KnownZero, KnownOne, TLO)) DCI.CommitTargetLoweringOpt(TLO); diff --git a/lib/Target/XCore/XCoreMachineFunctionInfo.cpp b/lib/Target/XCore/XCoreMachineFunctionInfo.cpp index e91536c..75af0e9 100644 --- a/lib/Target/XCore/XCoreMachineFunctionInfo.cpp +++ b/lib/Target/XCore/XCoreMachineFunctionInfo.cpp @@ -10,6 +10,7 @@ #include "XCoreMachineFunctionInfo.h" #include "XCoreInstrInfo.h" #include "llvm/IR/Function.h" +#include "llvm/Target/TargetSubtargetInfo.h" using namespace llvm; @@ -35,13 +36,15 @@ int XCoreFunctionInfo::createLRSpillSlot(MachineFunction &MF) { if (LRSpillSlotSet) { return LRSpillSlot; } - const TargetRegisterClass *RC = &XCore::GRRegsRegClass; + const TargetRegisterClass &RC = XCore::GRRegsRegClass; + const TargetRegisterInfo &TRI = *MF.getSubtarget().getRegisterInfo(); MachineFrameInfo &MFI = MF.getFrameInfo(); if (! MF.getFunction()->isVarArg()) { // A fixed offset of 0 allows us to save / restore LR using entsp / retsp. 
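These XCore hunks follow the same mechanical migration seen throughout the X86 changes above: per-register-class spill size and alignment are now queried through TargetRegisterInfo (getSpillSize, getSpillAlignment, getRegSizeInBits) instead of off the TargetRegisterClass itself via getSize/getAlignment. A minimal standalone sketch of the shape of that move, using simplified stand-in types rather than LLVM's:

    #include <cstdio>

    // Simplified stand-ins; the real LLVM classes carry far more state.
    struct TargetRegisterClass {
      unsigned SpillSizeBytes;
      unsigned SpillAlignBytes;
    };

    struct TargetRegisterInfo {
      // Post-change style: size/alignment queries live on TargetRegisterInfo.
      unsigned getSpillSize(const TargetRegisterClass &RC) const {
        return RC.SpillSizeBytes;
      }
      unsigned getSpillAlignment(const TargetRegisterClass &RC) const {
        return RC.SpillAlignBytes;
      }
      unsigned getRegSizeInBits(const TargetRegisterClass &RC) const {
        return RC.SpillSizeBytes * 8;
      }
    };

    // Callers that used to ask the class directly (RC->getSize(),
    // RC->getAlignment()) now route through TRI, as the spill-slot code above does.
    unsigned spillSlotSize(const TargetRegisterInfo &TRI,
                           const TargetRegisterClass &RC) {
      return TRI.getSpillSize(RC);
    }

    int main() {
      TargetRegisterInfo TRI;
      TargetRegisterClass GRRegs{4, 4}; // e.g. a 32-bit general register class
      std::printf("spill size: %u bytes, reg size: %u bits\n",
                  spillSlotSize(TRI, GRRegs), TRI.getRegSizeInBits(GRRegs));
    }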
- LRSpillSlot = MFI.CreateFixedObject(RC->getSize(), 0, true); + LRSpillSlot = MFI.CreateFixedObject(TRI.getSpillSize(RC), 0, true); } else { - LRSpillSlot = MFI.CreateStackObject(RC->getSize(), RC->getAlignment(), true); + LRSpillSlot = MFI.CreateStackObject(TRI.getSpillSize(RC), + TRI.getSpillAlignment(RC), true); } LRSpillSlotSet = true; return LRSpillSlot; @@ -51,9 +54,11 @@ int XCoreFunctionInfo::createFPSpillSlot(MachineFunction &MF) { if (FPSpillSlotSet) { return FPSpillSlot; } - const TargetRegisterClass *RC = &XCore::GRRegsRegClass; + const TargetRegisterClass &RC = XCore::GRRegsRegClass; + const TargetRegisterInfo &TRI = *MF.getSubtarget().getRegisterInfo(); MachineFrameInfo &MFI = MF.getFrameInfo(); - FPSpillSlot = MFI.CreateStackObject(RC->getSize(), RC->getAlignment(), true); + FPSpillSlot = MFI.CreateStackObject(TRI.getSpillSize(RC), + TRI.getSpillAlignment(RC), true); FPSpillSlotSet = true; return FPSpillSlot; } @@ -62,10 +67,13 @@ const int* XCoreFunctionInfo::createEHSpillSlot(MachineFunction &MF) { if (EHSpillSlotSet) { return EHSpillSlot; } - const TargetRegisterClass *RC = &XCore::GRRegsRegClass; + const TargetRegisterClass &RC = XCore::GRRegsRegClass; + const TargetRegisterInfo &TRI = *MF.getSubtarget().getRegisterInfo(); MachineFrameInfo &MFI = MF.getFrameInfo(); - EHSpillSlot[0] = MFI.CreateStackObject(RC->getSize(), RC->getAlignment(), true); - EHSpillSlot[1] = MFI.CreateStackObject(RC->getSize(), RC->getAlignment(), true); + unsigned Size = TRI.getSpillSize(RC); + unsigned Align = TRI.getSpillAlignment(RC); + EHSpillSlot[0] = MFI.CreateStackObject(Size, Align, true); + EHSpillSlot[1] = MFI.CreateStackObject(Size, Align, true); EHSpillSlotSet = true; return EHSpillSlot; } diff --git a/lib/Transforms/IPO/PartialInlining.cpp b/lib/Transforms/IPO/PartialInlining.cpp index a2f6e56..78e71c1 100644 --- a/lib/Transforms/IPO/PartialInlining.cpp +++ b/lib/Transforms/IPO/PartialInlining.cpp @@ -17,7 +17,9 @@ #include "llvm/Analysis/BlockFrequencyInfo.h" #include "llvm/Analysis/BranchProbabilityInfo.h" #include "llvm/Analysis/LoopInfo.h" +#include "llvm/Analysis/OptimizationDiagnosticInfo.h" #include "llvm/IR/CFG.h" +#include "llvm/IR/DiagnosticInfo.h" #include "llvm/IR/Dominators.h" #include "llvm/IR/Instructions.h" #include "llvm/IR/Module.h" @@ -27,10 +29,21 @@ #include "llvm/Transforms/Utils/CodeExtractor.h" using namespace llvm; -#define DEBUG_TYPE "partialinlining" +#define DEBUG_TYPE "partial-inlining" STATISTIC(NumPartialInlined, "Number of functions partially inlined"); +// Command line option to disable partial-inlining. The default is false: +static cl::opt + DisablePartialInlining("disable-partial-inlining", cl::init(false), + cl::Hidden, cl::desc("Disable partial ininling")); + +// Command line option to set the maximum number of partial inlining allowed +// for the module. The default value of -1 means no limit. +static cl::opt MaxNumPartialInlining( + "max-partial-inlining", cl::init(-1), cl::Hidden, cl::ZeroOrMore, + cl::desc("Max number of partial inlining. 
The default is unlimited")); + namespace { struct PartialInlinerImpl { PartialInlinerImpl(InlineFunctionInfo IFI) : IFI(std::move(IFI)) {} @@ -39,6 +52,12 @@ struct PartialInlinerImpl { private: InlineFunctionInfo IFI; + int NumPartialInlining = 0; + + bool IsLimitReached() { + return (MaxNumPartialInlining != -1 && + NumPartialInlining >= MaxNumPartialInlining); + } }; struct PartialInlinerLegacyPass : public ModulePass { static char ID; // Pass identification, replacement for typeid @@ -66,6 +85,9 @@ struct PartialInlinerLegacyPass : public ModulePass { Function *PartialInlinerImpl::unswitchFunction(Function *F) { // First, verify that this function is an unswitching candidate... + if (F->hasAddressTaken()) + return nullptr; + BasicBlock *EntryBlock = &F->front(); BranchInst *BR = dyn_cast(EntryBlock->getTerminator()); if (!BR || BR->isUnconditional()) @@ -149,11 +171,29 @@ Function *PartialInlinerImpl::unswitchFunction(Function *F) { // Inline the top-level if test into all callers. std::vector Users(DuplicateFunction->user_begin(), DuplicateFunction->user_end()); - for (User *User : Users) + + for (User *User : Users) { + CallSite CS; if (CallInst *CI = dyn_cast(User)) - InlineFunction(CI, IFI); + CS = CallSite(CI); else if (InvokeInst *II = dyn_cast(User)) - InlineFunction(II, IFI); + CS = CallSite(II); + else + llvm_unreachable("All uses must be calls"); + + if (IsLimitReached()) + continue; + NumPartialInlining++; + + OptimizationRemarkEmitter ORE(CS.getCaller()); + DebugLoc DLoc = CS.getInstruction()->getDebugLoc(); + BasicBlock *Block = CS.getParent(); + ORE.emit(OptimizationRemark(DEBUG_TYPE, "PartiallyInlined", DLoc, Block) + << ore::NV("Callee", F) << " partially inlined into " + << ore::NV("Caller", CS.getCaller())); + + InlineFunction(CS, IFI); + } // Ditch the duplicate, since we're done with it, and rewrite all remaining // users (function pointers, etc.) back to the original function. @@ -166,6 +206,9 @@ Function *PartialInlinerImpl::unswitchFunction(Function *F) { } bool PartialInlinerImpl::run(Module &M) { + if (DisablePartialInlining) + return false; + std::vector Worklist; Worklist.reserve(M.size()); for (Function &F : M) diff --git a/lib/Transforms/IPO/PassManagerBuilder.cpp b/lib/Transforms/IPO/PassManagerBuilder.cpp index f11b58d..0d5910e 100644 --- a/lib/Transforms/IPO/PassManagerBuilder.cpp +++ b/lib/Transforms/IPO/PassManagerBuilder.cpp @@ -282,6 +282,12 @@ void PassManagerBuilder::addPGOInstrPasses(legacy::PassManagerBase &MPM) { } if (!PGOInstrUse.empty()) MPM.add(createPGOInstrumentationUseLegacyPass(PGOInstrUse)); + // Indirect call promotion that promotes intra-module targets only. + // For ThinLTO this is done earlier due to interactions with globalopt + // for imported functions. We don't run this at -O0. + if (OptLevel > 0) + MPM.add( + createPGOIndirectCallPromotionLegacyPass(false, !PGOSampleUse.empty())); } void PassManagerBuilder::addFunctionSimplificationPasses( legacy::PassManagerBase &MPM) { @@ -414,11 +420,14 @@ void PassManagerBuilder::populateModulePassManager( else if (!GlobalExtensions->empty() || !Extensions.empty()) MPM.add(createBarrierNoopPass()); + addExtensionsToPM(EP_EnabledOnOptLevel0, MPM); + + // Rename anon globals to be able to export them in the summary. + // This has to be done after we add the extensions to the pass manager + // as there could be passes (e.g. Adddress sanitizer) which introduce + // new unnamed globals. if (PrepareForThinLTO) - // Rename anon globals to be able to export them in the summary. 
MPM.add(createNameAnonGlobalPass()); - - addExtensionsToPM(EP_EnabledOnOptLevel0, MPM); return; } @@ -468,16 +477,10 @@ void PassManagerBuilder::populateModulePassManager( // For SamplePGO in ThinLTO compile phase, we do not want to do indirect // call promotion as it will change the CFG too much to make the 2nd // profile annotation in backend more difficult. - if (!PerformThinLTO && !PrepareForThinLTOUsingPGOSampleProfile) { - /// PGO instrumentation is added during the compile phase for ThinLTO, do - /// not run it a second time + // PGO instrumentation is added during the compile phase for ThinLTO, do + // not run it a second time + if (!PerformThinLTO && !PrepareForThinLTOUsingPGOSampleProfile) addPGOInstrPasses(MPM); - // Indirect call promotion that promotes intra-module targets only. - // For ThinLTO this is done earlier due to interactions with globalopt - // for imported functions. - MPM.add( - createPGOIndirectCallPromotionLegacyPass(false, !PGOSampleUse.empty())); - } if (EnableNonLTOGlobalsModRef) // We add a module alias analysis pass here. In part due to bugs in the @@ -677,6 +680,11 @@ void PassManagerBuilder::populateModulePassManager( MPM.add(createLoopSinkPass()); // Get rid of LCSSA nodes. MPM.add(createInstructionSimplifierPass()); + + // LoopSink (and other loop passes since the last simplifyCFG) might have + // resulted in single-entry-single-exit or empty blocks. Clean up the CFG. + MPM.add(createCFGSimplificationPass()); + addExtensionsToPM(EP_OptimizerLast, MPM); } diff --git a/lib/Transforms/InstCombine/InstCombineAddSub.cpp b/lib/Transforms/InstCombine/InstCombineAddSub.cpp index e30a4ba..0304610 100644 --- a/lib/Transforms/InstCombine/InstCombineAddSub.cpp +++ b/lib/Transforms/InstCombine/InstCombineAddSub.cpp @@ -17,6 +17,7 @@ #include "llvm/IR/DataLayout.h" #include "llvm/IR/GetElementPtrTypeIterator.h" #include "llvm/IR/PatternMatch.h" +#include "llvm/Support/KnownBits.h" using namespace llvm; using namespace PatternMatch; @@ -794,6 +795,11 @@ unsigned FAddCombine::calcInstrNumber(const AddendVect &Opnds) { if (Opnd->isConstant()) continue; + // The constant check above is really for a few special constant + // coefficients. + if (isa(Opnd->getSymVal())) + continue; + const FAddendCoef &CE = Opnd->getCoef(); if (CE.isMinusOne() || CE.isMinusTwo()) NegOpndNum++; @@ -894,24 +900,22 @@ bool InstCombiner::WillNotOverflowSignedAdd(Value *LHS, Value *RHS, return true; unsigned BitWidth = LHS->getType()->getScalarSizeInBits(); - APInt LHSKnownZero(BitWidth, 0); - APInt LHSKnownOne(BitWidth, 0); - computeKnownBits(LHS, LHSKnownZero, LHSKnownOne, 0, &CxtI); + KnownBits LHSKnown(BitWidth); + computeKnownBits(LHS, LHSKnown, 0, &CxtI); - APInt RHSKnownZero(BitWidth, 0); - APInt RHSKnownOne(BitWidth, 0); - computeKnownBits(RHS, RHSKnownZero, RHSKnownOne, 0, &CxtI); + KnownBits RHSKnown(BitWidth); + computeKnownBits(RHS, RHSKnown, 0, &CxtI); // Addition of two 2's complement numbers having opposite signs will never // overflow. - if ((LHSKnownOne[BitWidth - 1] && RHSKnownZero[BitWidth - 1]) || - (LHSKnownZero[BitWidth - 1] && RHSKnownOne[BitWidth - 1])) + if ((LHSKnown.One[BitWidth - 1] && RHSKnown.Zero[BitWidth - 1]) || + (LHSKnown.Zero[BitWidth - 1] && RHSKnown.One[BitWidth - 1])) return true; // Check if carry bit of addition will not cause overflow. 
- if (checkRippleForAdd(LHSKnownZero, RHSKnownZero)) + if (checkRippleForAdd(LHSKnown.Zero, RHSKnown.Zero)) return true; - if (checkRippleForAdd(RHSKnownZero, LHSKnownZero)) + if (checkRippleForAdd(RHSKnown.Zero, LHSKnown.Zero)) return true; return false; @@ -931,18 +935,16 @@ bool InstCombiner::WillNotOverflowSignedSub(Value *LHS, Value *RHS, return true; unsigned BitWidth = LHS->getType()->getScalarSizeInBits(); - APInt LHSKnownZero(BitWidth, 0); - APInt LHSKnownOne(BitWidth, 0); - computeKnownBits(LHS, LHSKnownZero, LHSKnownOne, 0, &CxtI); + KnownBits LHSKnown(BitWidth); + computeKnownBits(LHS, LHSKnown, 0, &CxtI); - APInt RHSKnownZero(BitWidth, 0); - APInt RHSKnownOne(BitWidth, 0); - computeKnownBits(RHS, RHSKnownZero, RHSKnownOne, 0, &CxtI); + KnownBits RHSKnown(BitWidth); + computeKnownBits(RHS, RHSKnown, 0, &CxtI); // Subtraction of two 2's complement numbers having identical signs will // never overflow. - if ((LHSKnownOne[BitWidth - 1] && RHSKnownOne[BitWidth - 1]) || - (LHSKnownZero[BitWidth - 1] && RHSKnownZero[BitWidth - 1])) + if ((LHSKnown.One[BitWidth - 1] && RHSKnown.One[BitWidth - 1]) || + (LHSKnown.Zero[BitWidth - 1] && RHSKnown.Zero[BitWidth - 1])) return true; // TODO: implement logic similar to checkRippleForAdd @@ -1113,10 +1115,9 @@ Instruction *InstCombiner::visitAdd(BinaryOperator &I) { // a sub and fuse this add with it. if (LHS->hasOneUse() && (XorRHS->getValue()+1).isPowerOf2()) { IntegerType *IT = cast(I.getType()); - APInt LHSKnownOne(IT->getBitWidth(), 0); - APInt LHSKnownZero(IT->getBitWidth(), 0); - computeKnownBits(XorLHS, LHSKnownZero, LHSKnownOne, 0, &I); - if ((XorRHS->getValue() | LHSKnownZero).isAllOnesValue()) + KnownBits LHSKnown(IT->getBitWidth()); + computeKnownBits(XorLHS, LHSKnown, 0, &I); + if ((XorRHS->getValue() | LHSKnown.Zero).isAllOnesValue()) return BinaryOperator::CreateSub(ConstantExpr::getAdd(XorRHS, CI), XorLHS); } @@ -1385,39 +1386,58 @@ Instruction *InstCombiner::visitFAdd(BinaryOperator &I) { // integer add followed by a promotion. if (SIToFPInst *LHSConv = dyn_cast(LHS)) { Value *LHSIntVal = LHSConv->getOperand(0); + Type *FPType = LHSConv->getType(); + + // TODO: This check is overly conservative. In many cases known bits + // analysis can tell us that the result of the addition has less significant + // bits than the integer type can hold. + auto IsValidPromotion = [](Type *FTy, Type *ITy) { + Type *FScalarTy = FTy->getScalarType(); + Type *IScalarTy = ITy->getScalarType(); + + // Do we have enough bits in the significand to represent the result of + // the integer addition? + unsigned MaxRepresentableBits = + APFloat::semanticsPrecision(FScalarTy->getFltSemantics()); + return IScalarTy->getIntegerBitWidth() <= MaxRepresentableBits; + }; // (fadd double (sitofp x), fpcst) --> (sitofp (add int x, intcst)) // ... if the constant fits in the integer value. This is useful for things // like (double)(x & 1234) + 4.0 -> (double)((X & 1234)+4) which no longer // requires a constant pool load, and generally allows the add to be better // instcombined. - if (ConstantFP *CFP = dyn_cast(RHS)) { - Constant *CI = - ConstantExpr::getFPToSI(CFP, LHSIntVal->getType()); - if (LHSConv->hasOneUse() && - ConstantExpr::getSIToFP(CI, I.getType()) == CFP && - WillNotOverflowSignedAdd(LHSIntVal, CI, I)) { - // Insert the new integer add. 
- Value *NewAdd = Builder->CreateNSWAdd(LHSIntVal, - CI, "addconv"); - return new SIToFPInst(NewAdd, I.getType()); + if (ConstantFP *CFP = dyn_cast(RHS)) + if (IsValidPromotion(FPType, LHSIntVal->getType())) { + Constant *CI = + ConstantExpr::getFPToSI(CFP, LHSIntVal->getType()); + if (LHSConv->hasOneUse() && + ConstantExpr::getSIToFP(CI, I.getType()) == CFP && + WillNotOverflowSignedAdd(LHSIntVal, CI, I)) { + // Insert the new integer add. + Value *NewAdd = Builder->CreateNSWAdd(LHSIntVal, + CI, "addconv"); + return new SIToFPInst(NewAdd, I.getType()); + } } - } // (fadd double (sitofp x), (sitofp y)) --> (sitofp (add int x, y)) if (SIToFPInst *RHSConv = dyn_cast(RHS)) { Value *RHSIntVal = RHSConv->getOperand(0); - - // Only do this if x/y have the same type, if at least one of them has a - // single use (so we don't increase the number of int->fp conversions), - // and if the integer add will not overflow. - if (LHSIntVal->getType() == RHSIntVal->getType() && - (LHSConv->hasOneUse() || RHSConv->hasOneUse()) && - WillNotOverflowSignedAdd(LHSIntVal, RHSIntVal, I)) { - // Insert the new integer add. - Value *NewAdd = Builder->CreateNSWAdd(LHSIntVal, - RHSIntVal, "addconv"); - return new SIToFPInst(NewAdd, I.getType()); + // It's enough to check LHS types only because we require int types to + // be the same for this transform. + if (IsValidPromotion(FPType, LHSIntVal->getType())) { + // Only do this if x/y have the same type, if at least one of them has a + // single use (so we don't increase the number of int->fp conversions), + // and if the integer add will not overflow. + if (LHSIntVal->getType() == RHSIntVal->getType() && + (LHSConv->hasOneUse() || RHSConv->hasOneUse()) && + WillNotOverflowSignedAdd(LHSIntVal, RHSIntVal, I)) { + // Insert the new integer add. + Value *NewAdd = Builder->CreateNSWAdd(LHSIntVal, + RHSIntVal, "addconv"); + return new SIToFPInst(NewAdd, I.getType()); + } } } } @@ -1617,10 +1637,9 @@ Instruction *InstCombiner::visitSub(BinaryOperator &I) { // Turn this into a xor if LHS is 2^n-1 and the remaining bits are known // zero. 
if (Op0C->isMask()) { - APInt RHSKnownZero(BitWidth, 0); - APInt RHSKnownOne(BitWidth, 0); - computeKnownBits(Op1, RHSKnownZero, RHSKnownOne, 0, &I); - if ((*Op0C | RHSKnownZero).isAllOnesValue()) + KnownBits RHSKnown(BitWidth); + computeKnownBits(Op1, RHSKnown, 0, &I); + if ((*Op0C | RHSKnown.Zero).isAllOnesValue()) return BinaryOperator::CreateXor(Op1, Op0); } } diff --git a/lib/Transforms/InstCombine/InstCombineAndOrXor.cpp b/lib/Transforms/InstCombine/InstCombineAndOrXor.cpp index 3a98e89..a97b5a9 100644 --- a/lib/Transforms/InstCombine/InstCombineAndOrXor.cpp +++ b/lib/Transforms/InstCombine/InstCombineAndOrXor.cpp @@ -906,15 +906,6 @@ Value *InstCombiner::FoldAndOfICmps(ICmpInst *LHS, ICmpInst *RHS) { switch (PredL) { default: llvm_unreachable("Unknown integer condition code!"); - case ICmpInst::ICMP_EQ: - switch (PredR) { - default: - llvm_unreachable("Unknown integer condition code!"); - case ICmpInst::ICMP_NE: // (X == 13 & X != 15) -> X == 13 - case ICmpInst::ICMP_ULT: // (X == 13 & X < 15) -> X == 13 - case ICmpInst::ICMP_SLT: // (X == 13 & X < 15) -> X == 13 - return LHS; - } case ICmpInst::ICMP_NE: switch (PredR) { default: @@ -930,43 +921,15 @@ Value *InstCombiner::FoldAndOfICmps(ICmpInst *LHS, ICmpInst *RHS) { if (LHSC == SubOne(RHSC)) // (X != 13 & X s< 14) -> X < 13 return Builder->CreateICmpSLT(LHS0, LHSC); break; // (X != 13 & X s< 15) -> no change - case ICmpInst::ICMP_EQ: // (X != 13 & X == 15) -> X == 15 - case ICmpInst::ICMP_UGT: // (X != 13 & X u> 15) -> X u> 15 - case ICmpInst::ICMP_SGT: // (X != 13 & X s> 15) -> X s> 15 - return RHS; case ICmpInst::ICMP_NE: // Potential folds for this case should already be handled. break; } break; - case ICmpInst::ICMP_ULT: - switch (PredR) { - default: - llvm_unreachable("Unknown integer condition code!"); - case ICmpInst::ICMP_EQ: // (X u< 13 & X == 15) -> false - case ICmpInst::ICMP_UGT: // (X u< 13 & X u> 15) -> false - return ConstantInt::get(CmpInst::makeCmpResultType(LHS->getType()), 0); - case ICmpInst::ICMP_NE: // (X u< 13 & X != 15) -> X u< 13 - case ICmpInst::ICMP_ULT: // (X u< 13 & X u< 15) -> X u< 13 - return LHS; - } - break; - case ICmpInst::ICMP_SLT: - switch (PredR) { - default: - llvm_unreachable("Unknown integer condition code!"); - case ICmpInst::ICMP_NE: // (X s< 13 & X != 15) -> X < 13 - case ICmpInst::ICMP_SLT: // (X s< 13 & X s< 15) -> X < 13 - return LHS; - } - break; case ICmpInst::ICMP_UGT: switch (PredR) { default: llvm_unreachable("Unknown integer condition code!"); - case ICmpInst::ICMP_EQ: // (X u> 13 & X == 15) -> X == 15 - case ICmpInst::ICMP_UGT: // (X u> 13 & X u> 15) -> X u> 15 - return RHS; case ICmpInst::ICMP_NE: if (RHSC == AddOne(LHSC)) // (X u> 13 & X != 14) -> X u> 14 return Builder->CreateICmp(PredL, LHS0, RHSC); @@ -980,9 +943,6 @@ Value *InstCombiner::FoldAndOfICmps(ICmpInst *LHS, ICmpInst *RHS) { switch (PredR) { default: llvm_unreachable("Unknown integer condition code!"); - case ICmpInst::ICMP_EQ: // (X s> 13 & X == 15) -> X == 15 - case ICmpInst::ICMP_SGT: // (X s> 13 & X s> 15) -> X s> 15 - return RHS; case ICmpInst::ICMP_NE: if (RHSC == AddOne(LHSC)) // (X s> 13 & X != 14) -> X s> 14 return Builder->CreateICmp(PredL, LHS0, RHSC); @@ -1234,6 +1194,56 @@ static Instruction *foldBoolSextMaskToSelect(BinaryOperator &I) { return nullptr; } +static Instruction *foldAndToXor(BinaryOperator &I, + InstCombiner::BuilderTy &Builder) { + assert(I.getOpcode() == Instruction::And); + Value *Op0 = I.getOperand(0); + Value *Op1 = I.getOperand(1); + Value *A, *B; + + // Operand complexity 
canonicalization guarantees that the 'or' is Op0. + // (A | B) & ~(A & B) --> A ^ B + // (A | B) & ~(B & A) --> A ^ B + if (match(Op0, m_Or(m_Value(A), m_Value(B))) && + match(Op1, m_Not(m_c_And(m_Specific(A), m_Specific(B))))) + return BinaryOperator::CreateXor(A, B); + + // (A | ~B) & (~A | B) --> ~(A ^ B) + // (A | ~B) & (B | ~A) --> ~(A ^ B) + // (~B | A) & (~A | B) --> ~(A ^ B) + // (~B | A) & (B | ~A) --> ~(A ^ B) + if (match(Op0, m_c_Or(m_Value(A), m_Not(m_Value(B)))) && + match(Op1, m_c_Or(m_Not(m_Specific(A)), m_Value(B)))) + return BinaryOperator::CreateNot(Builder.CreateXor(A, B)); + + return nullptr; +} + +static Instruction *foldOrToXor(BinaryOperator &I, + InstCombiner::BuilderTy &Builder) { + assert(I.getOpcode() == Instruction::Or); + Value *Op0 = I.getOperand(0); + Value *Op1 = I.getOperand(1); + Value *A, *B; + + // Operand complexity canonicalization guarantees that the 'and' is Op0. + // (A & B) | ~(A | B) --> ~(A ^ B) + // (A & B) | ~(B | A) --> ~(A ^ B) + if (match(Op0, m_And(m_Value(A), m_Value(B))) && + match(Op1, m_Not(m_c_Or(m_Specific(A), m_Specific(B))))) + return BinaryOperator::CreateNot(Builder.CreateXor(A, B)); + + // (A & ~B) | (~A & B) --> A ^ B + // (A & ~B) | (B & ~A) --> A ^ B + // (~B & A) | (~A & B) --> A ^ B + // (~B & A) | (B & ~A) --> A ^ B + if (match(Op0, m_c_And(m_Value(A), m_Not(m_Value(B)))) && + match(Op1, m_c_And(m_Not(m_Specific(A)), m_Specific(B)))) + return BinaryOperator::CreateXor(A, B); + + return nullptr; +} + // FIXME: We use commutative matchers (m_c_*) for some, but not all, matches // here. We should standardize that construct where it is needed or choose some // other way to ensure that commutated variants of patterns are not missed. @@ -1247,15 +1257,19 @@ Instruction *InstCombiner::visitAnd(BinaryOperator &I) { if (Value *V = SimplifyAndInst(Op0, Op1, DL, &TLI, &DT, &AC)) return replaceInstUsesWith(I, V); - // (A|B)&(A|C) -> A|(B&C) etc - if (Value *V = SimplifyUsingDistributiveLaws(I)) - return replaceInstUsesWith(I, V); - // See if we can simplify any instructions used by the instruction whose sole // purpose is to compute bits we don't care about. if (SimplifyDemandedInstructionBits(I)) return &I; + // Do this before using distributive laws to catch simple and/or/not patterns. 
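The rewrites performed by foldAndToXor and foldOrToXor above are plain Boolean identities, so they can be sanity-checked independently of LLVM. The following standalone sketch (not part of the upstream patch; plain C++ over 8-bit values rather than IR, which covers every per-bit case) verifies the four comment-documented forms:

// Exhaustively verify the bitwise identities used by foldAndToXor/foldOrToXor.
// Plain C++, no LLVM dependencies.
#include <cassert>
#include <cstdint>

int main() {
  for (unsigned A = 0; A < 256; ++A) {
    for (unsigned B = 0; B < 256; ++B) {
      uint8_t a = static_cast<uint8_t>(A), b = static_cast<uint8_t>(B);
      // (A | B) & ~(A & B) --> A ^ B
      assert(uint8_t((a | b) & ~(a & b)) == uint8_t(a ^ b));
      // (A | ~B) & (~A | B) --> ~(A ^ B)
      assert(uint8_t((a | ~b) & (~a | b)) == uint8_t(~(a ^ b)));
      // (A & B) | ~(A | B) --> ~(A ^ B)
      assert(uint8_t((a & b) | ~(a | b)) == uint8_t(~(a ^ b)));
      // (A & ~B) | (~A & B) --> A ^ B
      assert(uint8_t((a & ~b) | (~a & b)) == uint8_t(a ^ b));
    }
  }
  return 0;
}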
+ if (Instruction *Xor = foldAndToXor(I, *Builder)) + return Xor; + + // (A|B)&(A|C) -> A|(B&C) etc + if (Value *V = SimplifyUsingDistributiveLaws(I)) + return replaceInstUsesWith(I, V); + if (Value *V = SimplifyBSwap(I)) return replaceInstUsesWith(I, V); @@ -1366,19 +1380,7 @@ Instruction *InstCombiner::visitAnd(BinaryOperator &I) { return DeMorgan; { - Value *A = nullptr, *B = nullptr, *C = nullptr, *D = nullptr; - // (A|B) & ~(A&B) -> A^B - if (match(Op0, m_Or(m_Value(A), m_Value(B))) && - match(Op1, m_Not(m_And(m_Value(C), m_Value(D)))) && - ((A == C && B == D) || (A == D && B == C))) - return BinaryOperator::CreateXor(A, B); - - // ~(A&B) & (A|B) -> A^B - if (match(Op1, m_Or(m_Value(A), m_Value(B))) && - match(Op0, m_Not(m_And(m_Value(C), m_Value(D)))) && - ((A == C && B == D) || (A == D && B == C))) - return BinaryOperator::CreateXor(A, B); - + Value *A = nullptr, *B = nullptr, *C = nullptr; // A&(A^B) => A & ~B { Value *tmpOp0 = Op0; @@ -1405,11 +1407,9 @@ Instruction *InstCombiner::visitAnd(BinaryOperator &I) { } // (A&((~A)|B)) -> A&B - if (match(Op0, m_Or(m_Not(m_Specific(Op1)), m_Value(A))) || - match(Op0, m_Or(m_Value(A), m_Not(m_Specific(Op1))))) + if (match(Op0, m_c_Or(m_Not(m_Specific(Op1)), m_Value(A)))) return BinaryOperator::CreateAnd(A, Op1); - if (match(Op1, m_Or(m_Not(m_Specific(Op0)), m_Value(A))) || - match(Op1, m_Or(m_Value(A), m_Not(m_Specific(Op0))))) + if (match(Op1, m_c_Or(m_Not(m_Specific(Op0)), m_Value(A)))) return BinaryOperator::CreateAnd(A, Op0); // (A ^ B) & ((B ^ C) ^ A) -> (A ^ B) & ~C @@ -1425,13 +1425,18 @@ Instruction *InstCombiner::visitAnd(BinaryOperator &I) { return BinaryOperator::CreateAnd(Op1, Builder->CreateNot(C)); // (A | B) & ((~A) ^ B) -> (A & B) - if (match(Op0, m_Or(m_Value(A), m_Value(B))) && - match(Op1, m_Xor(m_Not(m_Specific(A)), m_Specific(B)))) + // (A | B) & (B ^ (~A)) -> (A & B) + // (B | A) & ((~A) ^ B) -> (A & B) + // (B | A) & (B ^ (~A)) -> (A & B) + if (match(Op1, m_c_Xor(m_Not(m_Value(A)), m_Value(B))) && + match(Op0, m_c_Or(m_Specific(A), m_Specific(B)))) return BinaryOperator::CreateAnd(A, B); // ((~A) ^ B) & (A | B) -> (A & B) // ((~A) ^ B) & (B | A) -> (A & B) - if (match(Op0, m_Xor(m_Not(m_Value(A)), m_Value(B))) && + // (B ^ (~A)) & (A | B) -> (A & B) + // (B ^ (~A)) & (B | A) -> (A & B) + if (match(Op0, m_c_Xor(m_Not(m_Value(A)), m_Value(B))) && match(Op1, m_c_Or(m_Specific(A), m_Specific(B)))) return BinaryOperator::CreateAnd(A, B); } @@ -2037,15 +2042,19 @@ Instruction *InstCombiner::visitOr(BinaryOperator &I) { if (Value *V = SimplifyOrInst(Op0, Op1, DL, &TLI, &DT, &AC)) return replaceInstUsesWith(I, V); - // (A&B)|(A&C) -> A&(B|C) etc - if (Value *V = SimplifyUsingDistributiveLaws(I)) - return replaceInstUsesWith(I, V); - // See if we can simplify any instructions used by the instruction whose sole // purpose is to compute bits we don't care about. if (SimplifyDemandedInstructionBits(I)) return &I; + // Do this before using distributive laws to catch simple and/or/not patterns. 
+ if (Instruction *Xor = foldOrToXor(I, *Builder)) + return Xor; + + // (A&B)|(A&C) -> A&(B|C) etc + if (Value *V = SimplifyUsingDistributiveLaws(I)) + return replaceInstUsesWith(I, V); + if (Value *V = SimplifyBSwap(I)) return replaceInstUsesWith(I, V); @@ -2105,19 +2114,6 @@ Instruction *InstCombiner::visitOr(BinaryOperator &I) { match(Op0, m_c_And(m_Specific(A), m_Value(B)))) return BinaryOperator::CreateOr(Op1, B); - // (A & ~B) | (A ^ B) -> (A ^ B) - // (~B & A) | (A ^ B) -> (A ^ B) - if (match(Op0, m_c_And(m_Value(A), m_Not(m_Value(B)))) && - match(Op1, m_Xor(m_Specific(A), m_Specific(B)))) - return BinaryOperator::CreateXor(A, B); - - // Commute the 'or' operands. - // (A ^ B) | (A & ~B) -> (A ^ B) - // (A ^ B) | (~B & A) -> (A ^ B) - if (match(Op1, m_c_And(m_Value(A), m_Not(m_Value(B)))) && - match(Op0, m_Xor(m_Specific(A), m_Specific(B)))) - return BinaryOperator::CreateXor(A, B); - // (A & C)|(B & D) Value *C = nullptr, *D = nullptr; if (match(Op0, m_And(m_Value(A), m_Value(C))) && @@ -2182,23 +2178,6 @@ Instruction *InstCombiner::visitOr(BinaryOperator &I) { return replaceInstUsesWith(I, V); } - // ((A&~B)|(~A&B)) -> A^B - if ((match(C, m_Not(m_Specific(D))) && - match(B, m_Not(m_Specific(A))))) - return BinaryOperator::CreateXor(A, D); - // ((~B&A)|(~A&B)) -> A^B - if ((match(A, m_Not(m_Specific(D))) && - match(B, m_Not(m_Specific(C))))) - return BinaryOperator::CreateXor(C, D); - // ((A&~B)|(B&~A)) -> A^B - if ((match(C, m_Not(m_Specific(B))) && - match(D, m_Not(m_Specific(A))))) - return BinaryOperator::CreateXor(A, B); - // ((~B&A)|(B&~A)) -> A^B - if ((match(A, m_Not(m_Specific(B))) && - match(D, m_Not(m_Specific(C))))) - return BinaryOperator::CreateXor(C, B); - // ((A|B)&1)|(B&-2) -> (A&1) | B if (match(A, m_Or(m_Value(V1), m_Specific(B))) || match(A, m_Or(m_Specific(B), m_Value(V1)))) { @@ -2374,6 +2353,58 @@ Instruction *InstCombiner::visitOr(BinaryOperator &I) { return Changed ? &I : nullptr; } +/// A ^ B can be specified using other logic ops in a variety of patterns. We +/// can fold these early and efficiently by morphing an existing instruction. +static Instruction *foldXorToXor(BinaryOperator &I) { + assert(I.getOpcode() == Instruction::Xor); + Value *Op0 = I.getOperand(0); + Value *Op1 = I.getOperand(1); + Value *A, *B; + + // There are 4 commuted variants for each of the basic patterns. 
+ + // (A & B) ^ (A | B) -> A ^ B + // (A & B) ^ (B | A) -> A ^ B + // (A | B) ^ (A & B) -> A ^ B + // (A | B) ^ (B & A) -> A ^ B + if ((match(Op0, m_And(m_Value(A), m_Value(B))) && + match(Op1, m_c_Or(m_Specific(A), m_Specific(B)))) || + (match(Op0, m_Or(m_Value(A), m_Value(B))) && + match(Op1, m_c_And(m_Specific(A), m_Specific(B))))) { + I.setOperand(0, A); + I.setOperand(1, B); + return &I; + } + + // (A | ~B) ^ (~A | B) -> A ^ B + // (~B | A) ^ (~A | B) -> A ^ B + // (~A | B) ^ (A | ~B) -> A ^ B + // (B | ~A) ^ (A | ~B) -> A ^ B + if ((match(Op0, m_c_Or(m_Value(A), m_Not(m_Value(B)))) && + match(Op1, m_Or(m_Not(m_Specific(A)), m_Specific(B)))) || + (match(Op0, m_c_Or(m_Not(m_Value(A)), m_Value(B))) && + match(Op1, m_Or(m_Specific(A), m_Not(m_Specific(B)))))) { + I.setOperand(0, A); + I.setOperand(1, B); + return &I; + } + + // (A & ~B) ^ (~A & B) -> A ^ B + // (~B & A) ^ (~A & B) -> A ^ B + // (~A & B) ^ (A & ~B) -> A ^ B + // (B & ~A) ^ (A & ~B) -> A ^ B + if ((match(Op0, m_c_And(m_Value(A), m_Not(m_Value(B)))) && + match(Op1, m_And(m_Not(m_Specific(A)), m_Specific(B)))) || + (match(Op0, m_c_And(m_Not(m_Value(A)), m_Value(B))) && + match(Op1, m_And(m_Specific(A), m_Not(m_Specific(B)))))) { + I.setOperand(0, A); + I.setOperand(1, B); + return &I; + } + + return nullptr; +} + // FIXME: We use commutative matchers (m_c_*) for some, but not all, matches // here. We should standardize that construct where it is needed or choose some // other way to ensure that commutated variants of patterns are not missed. @@ -2387,6 +2418,9 @@ Instruction *InstCombiner::visitXor(BinaryOperator &I) { if (Value *V = SimplifyXorInst(Op0, Op1, DL, &TLI, &DT, &AC)) return replaceInstUsesWith(I, V); + if (Instruction *NewXor = foldXorToXor(I)) + return NewXor; + // (A&B)^(A&C) -> A&(B^C) etc if (Value *V = SimplifyUsingDistributiveLaws(I)) return replaceInstUsesWith(I, V); @@ -2399,44 +2433,39 @@ Instruction *InstCombiner::visitXor(BinaryOperator &I) { if (Value *V = SimplifyBSwap(I)) return replaceInstUsesWith(I, V); - // Is this a ~ operation? - if (Value *NotOp = dyn_castNotVal(&I)) { - if (BinaryOperator *Op0I = dyn_cast(NotOp)) { - if (Op0I->getOpcode() == Instruction::And || - Op0I->getOpcode() == Instruction::Or) { - // ~(~X & Y) --> (X | ~Y) - De Morgan's Law - // ~(~X | Y) === (X & ~Y) - De Morgan's Law - if (dyn_castNotVal(Op0I->getOperand(1))) - Op0I->swapOperands(); - if (Value *Op0NotVal = dyn_castNotVal(Op0I->getOperand(0))) { - Value *NotY = - Builder->CreateNot(Op0I->getOperand(1), - Op0I->getOperand(1)->getName()+".not"); - if (Op0I->getOpcode() == Instruction::And) - return BinaryOperator::CreateOr(Op0NotVal, NotY); - return BinaryOperator::CreateAnd(Op0NotVal, NotY); - } - - // ~(X & Y) --> (~X | ~Y) - De Morgan's Law - // ~(X | Y) === (~X & ~Y) - De Morgan's Law - if (IsFreeToInvert(Op0I->getOperand(0), - Op0I->getOperand(0)->hasOneUse()) && - IsFreeToInvert(Op0I->getOperand(1), - Op0I->getOperand(1)->hasOneUse())) { - Value *NotX = - Builder->CreateNot(Op0I->getOperand(0), "notlhs"); - Value *NotY = - Builder->CreateNot(Op0I->getOperand(1), "notrhs"); - if (Op0I->getOpcode() == Instruction::And) - return BinaryOperator::CreateOr(NotX, NotY); - return BinaryOperator::CreateAnd(NotX, NotY); - } + // Is this a 'not' (~) fed by a binary operator? 
+ BinaryOperator *NotOp; + if (match(&I, m_Not(m_BinOp(NotOp)))) { + if (NotOp->getOpcode() == Instruction::And || + NotOp->getOpcode() == Instruction::Or) { + // ~(~X & Y) --> (X | ~Y) - De Morgan's Law + // ~(~X | Y) === (X & ~Y) - De Morgan's Law + if (dyn_castNotVal(NotOp->getOperand(1))) + NotOp->swapOperands(); + if (Value *Op0NotVal = dyn_castNotVal(NotOp->getOperand(0))) { + Value *NotY = Builder->CreateNot( + NotOp->getOperand(1), NotOp->getOperand(1)->getName() + ".not"); + if (NotOp->getOpcode() == Instruction::And) + return BinaryOperator::CreateOr(Op0NotVal, NotY); + return BinaryOperator::CreateAnd(Op0NotVal, NotY); + } - } else if (Op0I->getOpcode() == Instruction::AShr) { - // ~(~X >>s Y) --> (X >>s Y) - if (Value *Op0NotVal = dyn_castNotVal(Op0I->getOperand(0))) - return BinaryOperator::CreateAShr(Op0NotVal, Op0I->getOperand(1)); + // ~(X & Y) --> (~X | ~Y) - De Morgan's Law + // ~(X | Y) === (~X & ~Y) - De Morgan's Law + if (IsFreeToInvert(NotOp->getOperand(0), + NotOp->getOperand(0)->hasOneUse()) && + IsFreeToInvert(NotOp->getOperand(1), + NotOp->getOperand(1)->hasOneUse())) { + Value *NotX = Builder->CreateNot(NotOp->getOperand(0), "notlhs"); + Value *NotY = Builder->CreateNot(NotOp->getOperand(1), "notrhs"); + if (NotOp->getOpcode() == Instruction::And) + return BinaryOperator::CreateOr(NotX, NotY); + return BinaryOperator::CreateAnd(NotX, NotY); } + } else if (NotOp->getOpcode() == Instruction::AShr) { + // ~(~X >>s Y) --> (X >>s Y) + if (Value *Op0NotVal = dyn_castNotVal(NotOp->getOperand(0))) + return BinaryOperator::CreateAShr(Op0NotVal, NotOp->getOperand(1)); } } @@ -2574,40 +2603,6 @@ Instruction *InstCombiner::visitXor(BinaryOperator &I) { { Value *A, *B, *C, *D; - // (A & B)^(A | B) -> A ^ B - if (match(Op0, m_And(m_Value(A), m_Value(B))) && - match(Op1, m_Or(m_Value(C), m_Value(D)))) { - if ((A == C && B == D) || (A == D && B == C)) - return BinaryOperator::CreateXor(A, B); - } - // (A | B)^(A & B) -> A ^ B - if (match(Op0, m_Or(m_Value(A), m_Value(B))) && - match(Op1, m_And(m_Value(C), m_Value(D)))) { - if ((A == C && B == D) || (A == D && B == C)) - return BinaryOperator::CreateXor(A, B); - } - // (A | ~B) ^ (~A | B) -> A ^ B - // (~B | A) ^ (~A | B) -> A ^ B - if (match(Op0, m_c_Or(m_Value(A), m_Not(m_Value(B)))) && - match(Op1, m_Or(m_Not(m_Specific(A)), m_Specific(B)))) - return BinaryOperator::CreateXor(A, B); - - // (~A | B) ^ (A | ~B) -> A ^ B - if (match(Op0, m_Or(m_Not(m_Value(A)), m_Value(B))) && - match(Op1, m_Or(m_Specific(A), m_Not(m_Specific(B))))) { - return BinaryOperator::CreateXor(A, B); - } - // (A & ~B) ^ (~A & B) -> A ^ B - // (~B & A) ^ (~A & B) -> A ^ B - if (match(Op0, m_c_And(m_Value(A), m_Not(m_Value(B)))) && - match(Op1, m_And(m_Not(m_Specific(A)), m_Specific(B)))) - return BinaryOperator::CreateXor(A, B); - - // (~A & B) ^ (A & ~B) -> A ^ B - if (match(Op0, m_And(m_Not(m_Value(A)), m_Value(B))) && - match(Op1, m_And(m_Specific(A), m_Not(m_Specific(B))))) { - return BinaryOperator::CreateXor(A, B); - } // (A ^ C)^(A | B) -> ((~A) & B) ^ C if (match(Op0, m_Xor(m_Value(D), m_Value(C))) && match(Op1, m_Or(m_Value(A), m_Value(B)))) { diff --git a/lib/Transforms/InstCombine/InstCombineCalls.cpp b/lib/Transforms/InstCombine/InstCombineCalls.cpp index e7aa1a4..313ab13 100644 --- a/lib/Transforms/InstCombine/InstCombineCalls.cpp +++ b/lib/Transforms/InstCombine/InstCombineCalls.cpp @@ -44,6 +44,7 @@ #include "llvm/IR/ValueHandle.h" #include "llvm/Support/Casting.h" #include "llvm/Support/Debug.h" +#include "llvm/Support/KnownBits.h" 
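Most of the hunks in this import revolve around the llvm/Support/KnownBits.h header included here: the separate KnownZero/KnownOne APInt pairs become a single KnownBits value with Zero and One members. As a rough standalone approximation of the idea (the real llvm::KnownBits is APInt-based with a richer API; the name KnownBitsSketch below is invented purely for illustration):

// Simplified stand-in for the KnownBits idea used throughout this patch.
// A fixed 64-bit word is enough to show the members the diff relies on
// (Zero, One, getBitWidth()) and the invariant asserted in
// SimplifyDemandedUseBits.
#include <cassert>
#include <cstdint>

struct KnownBitsSketch {
  uint64_t Zero = 0; // bits known to be 0
  uint64_t One = 0;  // bits known to be 1
  unsigned BitWidth;

  explicit KnownBitsSketch(unsigned Bits) : BitWidth(Bits) {}
  unsigned getBitWidth() const { return BitWidth; }

  // "Bits known to be one AND zero?" must never happen.
  bool isValid() const { return (Zero & One) == 0; }
};

int main() {
  KnownBitsSketch Known(16);
  Known.Zero = 0xFF00; // high byte known to be zero
  Known.One = 0x0001;  // lowest bit known to be one
  assert(Known.isValid() && Known.getBitWidth() == 16);
  return 0;
}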
#include "llvm/Support/MathExtras.h" #include "llvm/Transforms/Utils/Local.h" #include "llvm/Transforms/Utils/SimplifyLibCalls.h" @@ -1378,14 +1379,13 @@ static Instruction *foldCttzCtlz(IntrinsicInst &II, InstCombiner &IC) { return nullptr; unsigned BitWidth = IT->getBitWidth(); - APInt KnownZero(BitWidth, 0); - APInt KnownOne(BitWidth, 0); - IC.computeKnownBits(Op0, KnownZero, KnownOne, 0, &II); + KnownBits Known(BitWidth); + IC.computeKnownBits(Op0, Known, 0, &II); // Create a mask for bits above (ctlz) or below (cttz) the first known one. bool IsTZ = II.getIntrinsicID() == Intrinsic::cttz; - unsigned NumMaskBits = IsTZ ? KnownOne.countTrailingZeros() - : KnownOne.countLeadingZeros(); + unsigned NumMaskBits = IsTZ ? Known.One.countTrailingZeros() + : Known.One.countLeadingZeros(); APInt Mask = IsTZ ? APInt::getLowBitsSet(BitWidth, NumMaskBits) : APInt::getHighBitsSet(BitWidth, NumMaskBits); @@ -1393,7 +1393,7 @@ static Instruction *foldCttzCtlz(IntrinsicInst &II, InstCombiner &IC) { // zero, this value is constant. // FIXME: This should be in InstSimplify because we're replacing an // instruction with a constant. - if ((Mask & KnownZero) == Mask) { + if (Mask.isSubsetOf(Known.Zero)) { auto *C = ConstantInt::get(IT, APInt(BitWidth, NumMaskBits)); return IC.replaceInstUsesWith(II, C); } @@ -1401,7 +1401,7 @@ static Instruction *foldCttzCtlz(IntrinsicInst &II, InstCombiner &IC) { // If the input to cttz/ctlz is known to be non-zero, // then change the 'ZeroIsUndef' parameter to 'true' // because we know the zero behavior can't affect the result. - if (KnownOne != 0 || isKnownNonZero(Op0, IC.getDataLayout())) { + if (Known.One != 0 || isKnownNonZero(Op0, IC.getDataLayout())) { if (!match(II.getArgOperand(1), m_One())) { II.setOperand(1, IC.Builder->getTrue()); return &II; @@ -3432,8 +3432,26 @@ Instruction *InstCombiner::visitCallInst(CallInst &CI) { if (auto *CSrc0 = dyn_cast(Src0)) { if (auto *CSrc1 = dyn_cast(Src1)) { Constant *CCmp = ConstantExpr::getCompare(CCVal, CSrc0, CSrc1); - return replaceInstUsesWith(*II, - ConstantExpr::getSExt(CCmp, II->getType())); + if (CCmp->isNullValue()) { + return replaceInstUsesWith( + *II, ConstantExpr::getSExt(CCmp, II->getType())); + } + + // The result of V_ICMP/V_FCMP assembly instructions (which this + // intrinsic exposes) is one bit per thread, masked with the EXEC + // register (which contains the bitmask of live threads). So a + // comparison that always returns true is the same as a read of the + // EXEC register. + Value *NewF = Intrinsic::getDeclaration( + II->getModule(), Intrinsic::read_register, II->getType()); + Metadata *MDArgs[] = {MDString::get(II->getContext(), "exec")}; + MDNode *MD = MDNode::get(II->getContext(), MDArgs); + Value *Args[] = {MetadataAsValue::get(II->getContext(), MD)}; + CallInst *NewCall = Builder->CreateCall(NewF, Args); + NewCall->addAttribute(AttributeList::FunctionIndex, + Attribute::Convergent); + NewCall->takeName(II); + return replaceInstUsesWith(*II, NewCall); } // Canonicalize constants to RHS. @@ -3599,9 +3617,9 @@ Instruction *InstCombiner::visitCallInst(CallInst &CI) { // If there is a dominating assume with the same condition as this one, // then this one is redundant, and should be removed. 
- APInt KnownZero(1, 0), KnownOne(1, 0); - computeKnownBits(IIOperand, KnownZero, KnownOne, 0, II); - if (KnownOne.isAllOnesValue()) + KnownBits Known(1); + computeKnownBits(IIOperand, Known, 0, II); + if (Known.One.isAllOnesValue()) return eraseInstFromFunction(*II); // Update the cache of affected values for this assumption (we might be diff --git a/lib/Transforms/InstCombine/InstCombineCasts.cpp b/lib/Transforms/InstCombine/InstCombineCasts.cpp index 9127ddc..312d9ba 100644 --- a/lib/Transforms/InstCombine/InstCombineCasts.cpp +++ b/lib/Transforms/InstCombine/InstCombineCasts.cpp @@ -14,9 +14,10 @@ #include "InstCombineInternal.h" #include "llvm/ADT/SetVector.h" #include "llvm/Analysis/ConstantFolding.h" +#include "llvm/Analysis/TargetLibraryInfo.h" #include "llvm/IR/DataLayout.h" #include "llvm/IR/PatternMatch.h" -#include "llvm/Analysis/TargetLibraryInfo.h" +#include "llvm/Support/KnownBits.h" using namespace llvm; using namespace PatternMatch; @@ -676,11 +677,10 @@ Instruction *InstCombiner::transformZExtICmp(ICmpInst *ICI, ZExtInst &CI, // This only works for EQ and NE ICI->isEquality()) { // If Op1C some other power of two, convert: - uint32_t BitWidth = Op1C->getType()->getBitWidth(); - APInt KnownZero(BitWidth, 0), KnownOne(BitWidth, 0); - computeKnownBits(ICI->getOperand(0), KnownZero, KnownOne, 0, &CI); + KnownBits Known(Op1C->getType()->getBitWidth()); + computeKnownBits(ICI->getOperand(0), Known, 0, &CI); - APInt KnownZeroMask(~KnownZero); + APInt KnownZeroMask(~Known.Zero); if (KnownZeroMask.isPowerOf2()) { // Exactly 1 possible 1? if (!DoTransform) return ICI; @@ -726,13 +726,13 @@ Instruction *InstCombiner::transformZExtICmp(ICmpInst *ICI, ZExtInst &CI, Value *LHS = ICI->getOperand(0); Value *RHS = ICI->getOperand(1); - APInt KnownZeroLHS(BitWidth, 0), KnownOneLHS(BitWidth, 0); - APInt KnownZeroRHS(BitWidth, 0), KnownOneRHS(BitWidth, 0); - computeKnownBits(LHS, KnownZeroLHS, KnownOneLHS, 0, &CI); - computeKnownBits(RHS, KnownZeroRHS, KnownOneRHS, 0, &CI); + KnownBits KnownLHS(BitWidth); + KnownBits KnownRHS(BitWidth); + computeKnownBits(LHS, KnownLHS, 0, &CI); + computeKnownBits(RHS, KnownRHS, 0, &CI); - if (KnownZeroLHS == KnownZeroRHS && KnownOneLHS == KnownOneRHS) { - APInt KnownBits = KnownZeroLHS | KnownOneLHS; + if (KnownLHS.Zero == KnownRHS.Zero && KnownLHS.One == KnownRHS.One) { + APInt KnownBits = KnownLHS.Zero | KnownLHS.One; APInt UnknownBit = ~KnownBits; if (UnknownBit.countPopulation() == 1) { if (!DoTransform) return ICI; @@ -740,7 +740,7 @@ Instruction *InstCombiner::transformZExtICmp(ICmpInst *ICI, ZExtInst &CI, Value *Result = Builder->CreateXor(LHS, RHS); // Mask off any bits that are set and won't be shifted away. 
- if (KnownOneLHS.uge(UnknownBit)) + if (KnownLHS.One.uge(UnknownBit)) Result = Builder->CreateAnd(Result, ConstantInt::get(ITy, UnknownBit)); @@ -1049,10 +1049,10 @@ Instruction *InstCombiner::transformSExtICmp(ICmpInst *ICI, Instruction &CI) { if (ICI->hasOneUse() && ICI->isEquality() && (Op1C->isZero() || Op1C->getValue().isPowerOf2())){ unsigned BitWidth = Op1C->getType()->getBitWidth(); - APInt KnownZero(BitWidth, 0), KnownOne(BitWidth, 0); - computeKnownBits(Op0, KnownZero, KnownOne, 0, &CI); + KnownBits Known(BitWidth); + computeKnownBits(Op0, Known, 0, &CI); - APInt KnownZeroMask(~KnownZero); + APInt KnownZeroMask(~Known.Zero); if (KnownZeroMask.isPowerOf2()) { Value *In = ICI->getOperand(0); diff --git a/lib/Transforms/InstCombine/InstCombineCompares.cpp b/lib/Transforms/InstCombine/InstCombineCompares.cpp index 003029a..d846a63 100644 --- a/lib/Transforms/InstCombine/InstCombineCompares.cpp +++ b/lib/Transforms/InstCombine/InstCombineCompares.cpp @@ -26,6 +26,7 @@ #include "llvm/IR/IntrinsicInst.h" #include "llvm/IR/PatternMatch.h" #include "llvm/Support/Debug.h" +#include "llvm/Support/KnownBits.h" using namespace llvm; using namespace PatternMatch; @@ -175,19 +176,18 @@ static bool isSignTest(ICmpInst::Predicate &Pred, const APInt &C) { /// Given a signed integer type and a set of known zero and one bits, compute /// the maximum and minimum values that could have the specified known zero and /// known one bits, returning them in Min/Max. -static void computeSignedMinMaxValuesFromKnownBits(const APInt &KnownZero, - const APInt &KnownOne, +/// TODO: Move to method on KnownBits struct? +static void computeSignedMinMaxValuesFromKnownBits(const KnownBits &Known, APInt &Min, APInt &Max) { - assert(KnownZero.getBitWidth() == KnownOne.getBitWidth() && - KnownZero.getBitWidth() == Min.getBitWidth() && - KnownZero.getBitWidth() == Max.getBitWidth() && + assert(Known.getBitWidth() == Min.getBitWidth() && + Known.getBitWidth() == Max.getBitWidth() && "KnownZero, KnownOne and Min, Max must have equal bitwidth."); - APInt UnknownBits = ~(KnownZero|KnownOne); + APInt UnknownBits = ~(Known.Zero|Known.One); // The minimum value is when all unknown bits are zeros, EXCEPT for the sign // bit if it is unknown. - Min = KnownOne; - Max = KnownOne|UnknownBits; + Min = Known.One; + Max = Known.One|UnknownBits; if (UnknownBits.isNegative()) { // Sign bit is unknown Min.setBit(Min.getBitWidth()-1); @@ -198,19 +198,18 @@ static void computeSignedMinMaxValuesFromKnownBits(const APInt &KnownZero, /// Given an unsigned integer type and a set of known zero and one bits, compute /// the maximum and minimum values that could have the specified known zero and /// known one bits, returning them in Min/Max. -static void computeUnsignedMinMaxValuesFromKnownBits(const APInt &KnownZero, - const APInt &KnownOne, +/// TODO: Move to method on KnownBits struct? +static void computeUnsignedMinMaxValuesFromKnownBits(const KnownBits &Known, APInt &Min, APInt &Max) { - assert(KnownZero.getBitWidth() == KnownOne.getBitWidth() && - KnownZero.getBitWidth() == Min.getBitWidth() && - KnownZero.getBitWidth() == Max.getBitWidth() && + assert(Known.getBitWidth() == Min.getBitWidth() && + Known.getBitWidth() == Max.getBitWidth() && "Ty, KnownZero, KnownOne and Min, Max must have equal bitwidth."); - APInt UnknownBits = ~(KnownZero|KnownOne); + APInt UnknownBits = ~(Known.Zero|Known.One); // The minimum value is when the unknown bits are all zeros. 
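As a standalone numeric illustration of the unsigned case (mirroring the logic of computeUnsignedMinMaxValuesFromKnownBits above, but with plain 8-bit masks instead of APInt): treating unknown bits as wildcards, the minimum clears them all and the maximum sets them all.

// Deriving unsigned min/max from known bits, on a concrete 8-bit example.
#include <cassert>
#include <cstdint>

int main() {
  // Suppose an 8-bit value has the form 0b1?0?0?11 (bit 7 known 1, bits 5 and
  // 3 known 0, bits 1..0 known 1, bits 6, 4 and 2 unknown).
  uint8_t One = 0b10000011;
  uint8_t Zero = 0b00101000;
  uint8_t UnknownBits = uint8_t(~(Zero | One)); // 0b01010100

  uint8_t Min = One;                        // unknown bits all 0 -> 0b10000011
  uint8_t Max = uint8_t(One | UnknownBits); // unknown bits all 1 -> 0b11010111

  assert(Min == 0x83 && Max == 0xD7);
  return 0;
}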
- Min = KnownOne; + Min = Known.One; // The maximum value is when the unknown bits are all ones. - Max = KnownOne|UnknownBits; + Max = Known.One|UnknownBits; } /// This is called when we see this pattern: @@ -1479,14 +1478,14 @@ Instruction *InstCombiner::foldICmpTruncConstant(ICmpInst &Cmp, // of the high bits truncated out of x are known. unsigned DstBits = Trunc->getType()->getScalarSizeInBits(), SrcBits = X->getType()->getScalarSizeInBits(); - APInt KnownZero(SrcBits, 0), KnownOne(SrcBits, 0); - computeKnownBits(X, KnownZero, KnownOne, 0, &Cmp); + KnownBits Known(SrcBits); + computeKnownBits(X, Known, 0, &Cmp); // If all the high bits are known, we can do this xform. - if ((KnownZero | KnownOne).countLeadingOnes() >= SrcBits - DstBits) { + if ((Known.Zero | Known.One).countLeadingOnes() >= SrcBits - DstBits) { // Pull in the high bits from known-ones set. APInt NewRHS = C->zext(SrcBits); - NewRHS |= KnownOne & APInt::getHighBitsSet(SrcBits, SrcBits - DstBits); + NewRHS |= Known.One & APInt::getHighBitsSet(SrcBits, SrcBits - DstBits); return new ICmpInst(Pred, X, ConstantInt::get(X->getType(), NewRHS)); } } @@ -4001,16 +4000,16 @@ Instruction *InstCombiner::foldICmpUsingKnownBits(ICmpInst &I) { IsSignBit = isSignBitCheck(Pred, *CmpC, UnusedBit); } - APInt Op0KnownZero(BitWidth, 0), Op0KnownOne(BitWidth, 0); - APInt Op1KnownZero(BitWidth, 0), Op1KnownOne(BitWidth, 0); + KnownBits Op0Known(BitWidth); + KnownBits Op1Known(BitWidth); if (SimplifyDemandedBits(&I, 0, getDemandedBitsLHSMask(I, BitWidth, IsSignBit), - Op0KnownZero, Op0KnownOne, 0)) + Op0Known, 0)) return &I; if (SimplifyDemandedBits(&I, 1, APInt::getAllOnesValue(BitWidth), - Op1KnownZero, Op1KnownOne, 0)) + Op1Known, 0)) return &I; // Given the known and unknown bits, compute a range that the LHS could be @@ -4019,15 +4018,11 @@ Instruction *InstCombiner::foldICmpUsingKnownBits(ICmpInst &I) { APInt Op0Min(BitWidth, 0), Op0Max(BitWidth, 0); APInt Op1Min(BitWidth, 0), Op1Max(BitWidth, 0); if (I.isSigned()) { - computeSignedMinMaxValuesFromKnownBits(Op0KnownZero, Op0KnownOne, Op0Min, - Op0Max); - computeSignedMinMaxValuesFromKnownBits(Op1KnownZero, Op1KnownOne, Op1Min, - Op1Max); + computeSignedMinMaxValuesFromKnownBits(Op0Known, Op0Min, Op0Max); + computeSignedMinMaxValuesFromKnownBits(Op1Known, Op1Min, Op1Max); } else { - computeUnsignedMinMaxValuesFromKnownBits(Op0KnownZero, Op0KnownOne, Op0Min, - Op0Max); - computeUnsignedMinMaxValuesFromKnownBits(Op1KnownZero, Op1KnownOne, Op1Min, - Op1Max); + computeUnsignedMinMaxValuesFromKnownBits(Op0Known, Op0Min, Op0Max); + computeUnsignedMinMaxValuesFromKnownBits(Op1Known, Op1Min, Op1Max); } // If Min and Max are known to be the same, then SimplifyDemandedBits @@ -4054,8 +4049,8 @@ Instruction *InstCombiner::foldICmpUsingKnownBits(ICmpInst &I) { // If all bits are known zero except for one, then we know at most one bit // is set. If the comparison is against zero, then this is a check to see if // *that* bit is set. - APInt Op0KnownZeroInverted = ~Op0KnownZero; - if (~Op1KnownZero == 0) { + APInt Op0KnownZeroInverted = ~Op0Known.Zero; + if (~Op1Known.Zero == 0) { // If the LHS is an AND with the same constant, look through it. Value *LHS = nullptr; const APInt *LHSC; @@ -4193,8 +4188,8 @@ Instruction *InstCombiner::foldICmpUsingKnownBits(ICmpInst &I) { // Turn a signed comparison into an unsigned one if both operands are known to // have the same sign. 
if (I.isSigned() && - ((Op0KnownZero.isNegative() && Op1KnownZero.isNegative()) || - (Op0KnownOne.isNegative() && Op1KnownOne.isNegative()))) + ((Op0Known.Zero.isNegative() && Op1Known.Zero.isNegative()) || + (Op0Known.One.isNegative() && Op1Known.One.isNegative()))) return new ICmpInst(I.getUnsignedPredicate(), Op0, Op1); return nullptr; diff --git a/lib/Transforms/InstCombine/InstCombineInternal.h b/lib/Transforms/InstCombine/InstCombineInternal.h index 7100006..776686d 100644 --- a/lib/Transforms/InstCombine/InstCombineInternal.h +++ b/lib/Transforms/InstCombine/InstCombineInternal.h @@ -489,10 +489,9 @@ public: return nullptr; // Don't do anything with FI } - void computeKnownBits(Value *V, APInt &KnownZero, APInt &KnownOne, + void computeKnownBits(Value *V, KnownBits &Known, unsigned Depth, Instruction *CxtI) const { - return llvm::computeKnownBits(V, KnownZero, KnownOne, DL, Depth, &AC, CxtI, - &DT); + return llvm::computeKnownBits(V, Known, DL, Depth, &AC, CxtI, &DT); } bool MaskedValueIsZero(Value *V, const APInt &Mask, unsigned Depth = 0, @@ -536,24 +535,23 @@ private: /// \brief Attempts to replace V with a simpler value based on the demanded /// bits. - Value *SimplifyDemandedUseBits(Value *V, APInt DemandedMask, APInt &KnownZero, - APInt &KnownOne, unsigned Depth, - Instruction *CxtI); + Value *SimplifyDemandedUseBits(Value *V, APInt DemandedMask, KnownBits &Known, + unsigned Depth, Instruction *CxtI); bool SimplifyDemandedBits(Instruction *I, unsigned Op, - const APInt &DemandedMask, APInt &KnownZero, - APInt &KnownOne, unsigned Depth = 0); + const APInt &DemandedMask, KnownBits &Known, + unsigned Depth = 0); /// Helper routine of SimplifyDemandedUseBits. It computes KnownZero/KnownOne /// bits. It also tries to handle simplifications that can be done based on /// DemandedMask, but without modifying the Instruction. Value *SimplifyMultipleUseDemandedBits(Instruction *I, const APInt &DemandedMask, - APInt &KnownZero, APInt &KnownOne, + KnownBits &Known, unsigned Depth, Instruction *CxtI); /// Helper routine of SimplifyDemandedUseBits. It tries to simplify demanded /// bit for "r1 = shr x, c1; r2 = shl r1, c2" instruction sequence. - Value *SimplifyShrShlDemandedBits(Instruction *Lsr, Instruction *Sftl, - const APInt &DemandedMask, APInt &KnownZero, - APInt &KnownOne); + Value *simplifyShrShlDemandedBits( + Instruction *Shr, const APInt &ShrOp1, Instruction *Shl, + const APInt &ShlOp1, const APInt &DemandedMask, KnownBits &Known); /// \brief Tries to simplify operands to an integer instruction based on its /// demanded bits. diff --git a/lib/Transforms/InstCombine/InstCombineSelect.cpp b/lib/Transforms/InstCombine/InstCombineSelect.cpp index 5d6d899..76829c5 100644 --- a/lib/Transforms/InstCombine/InstCombineSelect.cpp +++ b/lib/Transforms/InstCombine/InstCombineSelect.cpp @@ -17,6 +17,7 @@ #include "llvm/Analysis/ValueTracking.h" #include "llvm/IR/MDBuilder.h" #include "llvm/IR/PatternMatch.h" +#include "llvm/Support/KnownBits.h" using namespace llvm; using namespace PatternMatch; @@ -1476,11 +1477,11 @@ Instruction *InstCombiner::visitSelectInst(SelectInst &SI) { // The motivation for this call into value tracking is to take advantage of // the assumption cache, so make sure that is populated. 
if (!CondVal->getType()->isVectorTy() && !AC.assumptions().empty()) { - APInt KnownOne(1, 0), KnownZero(1, 0); - computeKnownBits(CondVal, KnownZero, KnownOne, 0, &SI); - if (KnownOne == 1) + KnownBits Known(1); + computeKnownBits(CondVal, Known, 0, &SI); + if (Known.One == 1) return replaceInstUsesWith(SI, TrueVal); - if (KnownZero == 1) + if (Known.Zero == 1) return replaceInstUsesWith(SI, FalseVal); } diff --git a/lib/Transforms/InstCombine/InstCombineSimplifyDemanded.cpp b/lib/Transforms/InstCombine/InstCombineSimplifyDemanded.cpp index 2ba052b..8d0ed85 100644 --- a/lib/Transforms/InstCombine/InstCombineSimplifyDemanded.cpp +++ b/lib/Transforms/InstCombine/InstCombineSimplifyDemanded.cpp @@ -16,6 +16,7 @@ #include "llvm/Analysis/ValueTracking.h" #include "llvm/IR/IntrinsicInst.h" #include "llvm/IR/PatternMatch.h" +#include "llvm/Support/KnownBits.h" using namespace llvm; using namespace llvm::PatternMatch; @@ -26,7 +27,7 @@ using namespace llvm::PatternMatch; /// constant integer. If so, check to see if there are any bits set in the /// constant that are not demanded. If so, shrink the constant and return true. static bool ShrinkDemandedConstant(Instruction *I, unsigned OpNo, - APInt Demanded) { + const APInt &Demanded) { assert(I && "No instruction?"); assert(OpNo < I->getNumOperands() && "Operand index too large"); @@ -37,13 +38,11 @@ static bool ShrinkDemandedConstant(Instruction *I, unsigned OpNo, return false; // If there are no bits set that aren't demanded, nothing to do. - Demanded = Demanded.zextOrTrunc(C->getBitWidth()); if (C->isSubsetOf(Demanded)) return false; // This instruction is producing bits that are not demanded. Shrink the RHS. - Demanded &= *C; - I->setOperand(OpNo, ConstantInt::get(Op->getType(), Demanded)); + I->setOperand(OpNo, ConstantInt::get(Op->getType(), *C & Demanded)); return true; } @@ -54,10 +53,10 @@ static bool ShrinkDemandedConstant(Instruction *I, unsigned OpNo, /// the instruction has any properties that allow us to simplify its operands. bool InstCombiner::SimplifyDemandedInstructionBits(Instruction &Inst) { unsigned BitWidth = Inst.getType()->getScalarSizeInBits(); - APInt KnownZero(BitWidth, 0), KnownOne(BitWidth, 0); + KnownBits Known(BitWidth); APInt DemandedMask(APInt::getAllOnesValue(BitWidth)); - Value *V = SimplifyDemandedUseBits(&Inst, DemandedMask, KnownZero, KnownOne, + Value *V = SimplifyDemandedUseBits(&Inst, DemandedMask, Known, 0, &Inst); if (!V) return false; if (V == &Inst) return true; @@ -70,11 +69,11 @@ bool InstCombiner::SimplifyDemandedInstructionBits(Instruction &Inst) { /// change and false otherwise. bool InstCombiner::SimplifyDemandedBits(Instruction *I, unsigned OpNo, const APInt &DemandedMask, - APInt &KnownZero, APInt &KnownOne, + KnownBits &Known, unsigned Depth) { Use &U = I->getOperandUse(OpNo); - Value *NewVal = SimplifyDemandedUseBits(U.get(), DemandedMask, KnownZero, - KnownOne, Depth, I); + Value *NewVal = SimplifyDemandedUseBits(U.get(), DemandedMask, Known, + Depth, I); if (!NewVal) return false; U = NewVal; return true; @@ -88,15 +87,16 @@ bool InstCombiner::SimplifyDemandedBits(Instruction *I, unsigned OpNo, /// with a constant or one of its operands. In such cases, this function does /// the replacement and returns true. In all other cases, it returns false after /// analyzing the expression and setting KnownOne and known to be one in the -/// expression. KnownZero contains all the bits that are known to be zero in the -/// expression. 
These are provided to potentially allow the caller (which might -/// recursively be SimplifyDemandedBits itself) to simplify the expression. -/// KnownOne and KnownZero always follow the invariant that: -/// KnownOne & KnownZero == 0. -/// That is, a bit can't be both 1 and 0. Note that the bits in KnownOne and -/// KnownZero may only be accurate for those bits set in DemandedMask. Note also -/// that the bitwidth of V, DemandedMask, KnownZero and KnownOne must all be the -/// same. +/// expression. Known.Zero contains all the bits that are known to be zero in +/// the expression. These are provided to potentially allow the caller (which +/// might recursively be SimplifyDemandedBits itself) to simplify the +/// expression. +/// Known.One and Known.Zero always follow the invariant that: +/// Known.One & Known.Zero == 0. +/// That is, a bit can't be both 1 and 0. Note that the bits in Known.One and +/// Known.Zero may only be accurate for those bits set in DemandedMask. Note +/// also that the bitwidth of V, DemandedMask, Known.Zero and Known.One must all +/// be the same. /// /// This returns null if it did not change anything and it permits no /// simplification. This returns V itself if it did some simplification of V's @@ -104,8 +104,7 @@ bool InstCombiner::SimplifyDemandedBits(Instruction *I, unsigned OpNo, /// some other non-null value if it found out that V is equal to another value /// in the context where the specified bits are demanded, but not for all users. Value *InstCombiner::SimplifyDemandedUseBits(Value *V, APInt DemandedMask, - APInt &KnownZero, APInt &KnownOne, - unsigned Depth, + KnownBits &Known, unsigned Depth, Instruction *CxtI) { assert(V != nullptr && "Null pointer of Value???"); assert(Depth <= 6 && "Limit Search Depth"); @@ -113,18 +112,16 @@ Value *InstCombiner::SimplifyDemandedUseBits(Value *V, APInt DemandedMask, Type *VTy = V->getType(); assert( (!VTy->isIntOrIntVectorTy() || VTy->getScalarSizeInBits() == BitWidth) && - KnownZero.getBitWidth() == BitWidth && - KnownOne.getBitWidth() == BitWidth && - "Value *V, DemandedMask, KnownZero and KnownOne " - "must have same BitWidth"); + Known.getBitWidth() == BitWidth && + "Value *V, DemandedMask and Known must have same BitWidth"); if (isa(V)) { - computeKnownBits(V, KnownZero, KnownOne, Depth, CxtI); + computeKnownBits(V, Known, Depth, CxtI); return nullptr; } - KnownZero.clearAllBits(); - KnownOne.clearAllBits(); + Known.Zero.clearAllBits(); + Known.One.clearAllBits(); if (DemandedMask == 0) // Not demanding any bits from V. return UndefValue::get(VTy); @@ -133,20 +130,17 @@ Value *InstCombiner::SimplifyDemandedUseBits(Value *V, APInt DemandedMask, Instruction *I = dyn_cast(V); if (!I) { - computeKnownBits(V, KnownZero, KnownOne, Depth, CxtI); + computeKnownBits(V, Known, Depth, CxtI); return nullptr; // Only analyze instructions. } // If there are multiple uses of this value and we aren't at the root, then // we can't do any simplifications of the operands, because DemandedMask // only reflects the bits demanded by *one* of the users. 
- if (Depth != 0 && !I->hasOneUse()) { - return SimplifyMultipleUseDemandedBits(I, DemandedMask, KnownZero, KnownOne, - Depth, CxtI); - } + if (Depth != 0 && !I->hasOneUse()) + return SimplifyMultipleUseDemandedBits(I, DemandedMask, Known, Depth, CxtI); - APInt LHSKnownZero(BitWidth, 0), LHSKnownOne(BitWidth, 0); - APInt RHSKnownZero(BitWidth, 0), RHSKnownOne(BitWidth, 0); + KnownBits LHSKnown(BitWidth), RHSKnown(BitWidth); // If this is the root being simplified, allow it to have multiple uses, // just set the DemandedMask to all bits so that we can try to simplify the @@ -157,22 +151,21 @@ Value *InstCombiner::SimplifyDemandedUseBits(Value *V, APInt DemandedMask, switch (I->getOpcode()) { default: - computeKnownBits(I, KnownZero, KnownOne, Depth, CxtI); + computeKnownBits(I, Known, Depth, CxtI); break; case Instruction::And: { // If either the LHS or the RHS are Zero, the result is zero. - if (SimplifyDemandedBits(I, 1, DemandedMask, RHSKnownZero, RHSKnownOne, - Depth + 1) || - SimplifyDemandedBits(I, 0, DemandedMask & ~RHSKnownZero, LHSKnownZero, - LHSKnownOne, Depth + 1)) + if (SimplifyDemandedBits(I, 1, DemandedMask, RHSKnown, Depth + 1) || + SimplifyDemandedBits(I, 0, DemandedMask & ~RHSKnown.Zero, LHSKnown, + Depth + 1)) return I; - assert(!(RHSKnownZero & RHSKnownOne) && "Bits known to be one AND zero?"); - assert(!(LHSKnownZero & LHSKnownOne) && "Bits known to be one AND zero?"); + assert(!(RHSKnown.Zero & RHSKnown.One) && "Bits known to be one AND zero?"); + assert(!(LHSKnown.Zero & LHSKnown.One) && "Bits known to be one AND zero?"); // Output known-0 are known to be clear if zero in either the LHS | RHS. - APInt IKnownZero = RHSKnownZero | LHSKnownZero; + APInt IKnownZero = RHSKnown.Zero | LHSKnown.Zero; // Output known-1 bits are only known if set in both the LHS & RHS. - APInt IKnownOne = RHSKnownOne & LHSKnownOne; + APInt IKnownOne = RHSKnown.One & LHSKnown.One; // If the client is only demanding bits that we know, return the known // constant. @@ -181,33 +174,32 @@ Value *InstCombiner::SimplifyDemandedUseBits(Value *V, APInt DemandedMask, // If all of the demanded bits are known 1 on one side, return the other. // These bits cannot contribute to the result of the 'and'. - if (DemandedMask.isSubsetOf(LHSKnownZero | RHSKnownOne)) + if (DemandedMask.isSubsetOf(LHSKnown.Zero | RHSKnown.One)) return I->getOperand(0); - if (DemandedMask.isSubsetOf(RHSKnownZero | LHSKnownOne)) + if (DemandedMask.isSubsetOf(RHSKnown.Zero | LHSKnown.One)) return I->getOperand(1); // If the RHS is a constant, see if we can simplify it. - if (ShrinkDemandedConstant(I, 1, DemandedMask & ~LHSKnownZero)) + if (ShrinkDemandedConstant(I, 1, DemandedMask & ~LHSKnown.Zero)) return I; - KnownZero = std::move(IKnownZero); - KnownOne = std::move(IKnownOne); + Known.Zero = std::move(IKnownZero); + Known.One = std::move(IKnownOne); break; } case Instruction::Or: { // If either the LHS or the RHS are One, the result is One. 
-    if (SimplifyDemandedBits(I, 1, DemandedMask, RHSKnownZero, RHSKnownOne,
-                             Depth + 1) ||
-        SimplifyDemandedBits(I, 0, DemandedMask & ~RHSKnownOne, LHSKnownZero,
-                             LHSKnownOne, Depth + 1))
+    if (SimplifyDemandedBits(I, 1, DemandedMask, RHSKnown, Depth + 1) ||
+        SimplifyDemandedBits(I, 0, DemandedMask & ~RHSKnown.One, LHSKnown,
+                             Depth + 1))
       return I;
-    assert(!(RHSKnownZero & RHSKnownOne) && "Bits known to be one AND zero?");
-    assert(!(LHSKnownZero & LHSKnownOne) && "Bits known to be one AND zero?");
+    assert(!(RHSKnown.Zero & RHSKnown.One) && "Bits known to be one AND zero?");
+    assert(!(LHSKnown.Zero & LHSKnown.One) && "Bits known to be one AND zero?");

     // Output known-0 bits are only known if clear in both the LHS & RHS.
-    APInt IKnownZero = RHSKnownZero & LHSKnownZero;
-    // Output known-1 are known to be set if set in either the LHS | RHS.
-    APInt IKnownOne = RHSKnownOne | LHSKnownOne;
+    APInt IKnownZero = RHSKnown.Zero & LHSKnown.Zero;
+    // Output known-1 are known to be set if set in either the LHS | RHS.
+    APInt IKnownOne = RHSKnown.One | LHSKnown.One;

     // If the client is only demanding bits that we know, return the known
     // constant.
@@ -216,34 +208,32 @@ Value *InstCombiner::SimplifyDemandedUseBits(Value *V, APInt DemandedMask,

     // If all of the demanded bits are known zero on one side, return the other.
     // These bits cannot contribute to the result of the 'or'.
-    if (DemandedMask.isSubsetOf(LHSKnownOne | RHSKnownZero))
+    if (DemandedMask.isSubsetOf(LHSKnown.One | RHSKnown.Zero))
       return I->getOperand(0);
-    if (DemandedMask.isSubsetOf(RHSKnownOne | LHSKnownZero))
+    if (DemandedMask.isSubsetOf(RHSKnown.One | LHSKnown.Zero))
       return I->getOperand(1);

     // If the RHS is a constant, see if we can simplify it.
     if (ShrinkDemandedConstant(I, 1, DemandedMask))
       return I;

-    KnownZero = std::move(IKnownZero);
-    KnownOne  = std::move(IKnownOne);
+    Known.Zero = std::move(IKnownZero);
+    Known.One  = std::move(IKnownOne);
     break;
   }
   case Instruction::Xor: {
-    if (SimplifyDemandedBits(I, 1, DemandedMask, RHSKnownZero, RHSKnownOne,
-                             Depth + 1) ||
-        SimplifyDemandedBits(I, 0, DemandedMask, LHSKnownZero, LHSKnownOne,
-                             Depth + 1))
+    if (SimplifyDemandedBits(I, 1, DemandedMask, RHSKnown, Depth + 1) ||
+        SimplifyDemandedBits(I, 0, DemandedMask, LHSKnown, Depth + 1))
       return I;
-    assert(!(RHSKnownZero & RHSKnownOne) && "Bits known to be one AND zero?");
-    assert(!(LHSKnownZero & LHSKnownOne) && "Bits known to be one AND zero?");
+    assert(!(RHSKnown.Zero & RHSKnown.One) && "Bits known to be one AND zero?");
+    assert(!(LHSKnown.Zero & LHSKnown.One) && "Bits known to be one AND zero?");

     // Output known-0 bits are known if clear or set in both the LHS & RHS.
-    APInt IKnownZero = (RHSKnownZero & LHSKnownZero) |
-                       (RHSKnownOne & LHSKnownOne);
+    APInt IKnownZero = (RHSKnown.Zero & LHSKnown.Zero) |
+                       (RHSKnown.One & LHSKnown.One);
     // Output known-1 are known to be set if set in only one of the LHS, RHS.
-    APInt IKnownOne = (RHSKnownZero & LHSKnownOne) |
-                      (RHSKnownOne & LHSKnownZero);
+    APInt IKnownOne = (RHSKnown.Zero & LHSKnown.One) |
+                      (RHSKnown.One & LHSKnown.Zero);

     // If the client is only demanding bits that we know, return the known
     // constant.
@@ -252,15 +242,15 @@ Value *InstCombiner::SimplifyDemandedUseBits(Value *V, APInt DemandedMask,

     // If all of the demanded bits are known zero on one side, return the other.
     // These bits cannot contribute to the result of the 'xor'.
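The And/Or/Xor combination rules computed above can be checked for soundness outside of LLVM. The sketch below (standalone, plain unsigned masks instead of APInt) is exhaustive over 4-bit values and every possible choice of which bits are known:

// Soundness check for the known-bits rules used in the And/Or/Xor cases of
// SimplifyDemandedUseBits: every bit the rules claim to know must match the
// actual result of the operation.
#include <cassert>

int main() {
  const unsigned W = 0xF; // 4-bit world
  for (unsigned a = 0; a <= W; ++a)
    for (unsigned ka = 0; ka <= W; ++ka)       // which bits of 'a' are known
      for (unsigned b = 0; b <= W; ++b)
        for (unsigned kb = 0; kb <= W; ++kb) { // which bits of 'b' are known
          unsigned ZeroA = ~a & ka & W, OneA = a & ka;
          unsigned ZeroB = ~b & kb & W, OneB = b & kb;

          struct { unsigned R, Zero, One; } Cases[] = {
            {a & b, ZeroA | ZeroB, OneA & OneB},              // and
            {a | b, ZeroA & ZeroB, OneA | OneB},              // or
            {a ^ b, (ZeroA & ZeroB) | (OneA & OneB),
                    (ZeroA & OneB) | (OneA & ZeroB)},         // xor
          };
          for (auto &C : Cases) {
            assert((C.R & C.Zero) == 0);     // known-zero bits really are 0
            assert((C.One & ~C.R & W) == 0); // known-one bits really are 1
          }
        }
  return 0;
}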
- if (DemandedMask.isSubsetOf(RHSKnownZero)) + if (DemandedMask.isSubsetOf(RHSKnown.Zero)) return I->getOperand(0); - if (DemandedMask.isSubsetOf(LHSKnownZero)) + if (DemandedMask.isSubsetOf(LHSKnown.Zero)) return I->getOperand(1); // If all of the demanded bits are known to be zero on one side or the // other, turn this into an *inclusive* or. // e.g. (A & C1)^(B & C2) -> (A & C1)|(B & C2) iff C1&C2 == 0 - if (DemandedMask.isSubsetOf(RHSKnownZero | LHSKnownZero)) { + if (DemandedMask.isSubsetOf(RHSKnown.Zero | LHSKnown.Zero)) { Instruction *Or = BinaryOperator::CreateOr(I->getOperand(0), I->getOperand(1), I->getName()); @@ -271,10 +261,10 @@ Value *InstCombiner::SimplifyDemandedUseBits(Value *V, APInt DemandedMask, // bits on that side are also known to be set on the other side, turn this // into an AND, as we know the bits will be cleared. // e.g. (X | C1) ^ C2 --> (X | C1) & ~C2 iff (C1&C2) == C2 - if (DemandedMask.isSubsetOf(RHSKnownZero|RHSKnownOne) && - RHSKnownOne.isSubsetOf(LHSKnownOne)) { + if (DemandedMask.isSubsetOf(RHSKnown.Zero|RHSKnown.One) && + RHSKnown.One.isSubsetOf(LHSKnown.One)) { Constant *AndC = Constant::getIntegerValue(VTy, - ~RHSKnownOne & DemandedMask); + ~RHSKnown.One & DemandedMask); Instruction *And = BinaryOperator::CreateAnd(I->getOperand(0), AndC); return InsertNewInstWith(And, *I); } @@ -292,10 +282,10 @@ Value *InstCombiner::SimplifyDemandedUseBits(Value *V, APInt DemandedMask, if (LHSInst->getOpcode() == Instruction::And && LHSInst->hasOneUse() && isa(I->getOperand(1)) && isa(LHSInst->getOperand(1)) && - (LHSKnownOne & RHSKnownOne & DemandedMask) != 0) { + (LHSKnown.One & RHSKnown.One & DemandedMask) != 0) { ConstantInt *AndRHS = cast(LHSInst->getOperand(1)); ConstantInt *XorRHS = cast(I->getOperand(1)); - APInt NewMask = ~(LHSKnownOne & RHSKnownOne & DemandedMask); + APInt NewMask = ~(LHSKnown.One & RHSKnown.One & DemandedMask); Constant *AndC = ConstantInt::get(I->getType(), NewMask & AndRHS->getValue()); @@ -309,9 +299,9 @@ Value *InstCombiner::SimplifyDemandedUseBits(Value *V, APInt DemandedMask, } // Output known-0 bits are known if clear or set in both the LHS & RHS. - KnownZero = std::move(IKnownZero); + Known.Zero = std::move(IKnownZero); // Output known-1 are known to be set if set in only one of the LHS, RHS. - KnownOne = std::move(IKnownOne); + Known.One = std::move(IKnownOne); break; } case Instruction::Select: @@ -321,13 +311,11 @@ Value *InstCombiner::SimplifyDemandedUseBits(Value *V, APInt DemandedMask, if (matchSelectPattern(I, LHS, RHS).Flavor != SPF_UNKNOWN) return nullptr; - if (SimplifyDemandedBits(I, 2, DemandedMask, RHSKnownZero, RHSKnownOne, - Depth + 1) || - SimplifyDemandedBits(I, 1, DemandedMask, LHSKnownZero, LHSKnownOne, - Depth + 1)) + if (SimplifyDemandedBits(I, 2, DemandedMask, RHSKnown, Depth + 1) || + SimplifyDemandedBits(I, 1, DemandedMask, LHSKnown, Depth + 1)) return I; - assert(!(RHSKnownZero & RHSKnownOne) && "Bits known to be one AND zero?"); - assert(!(LHSKnownZero & LHSKnownOne) && "Bits known to be one AND zero?"); + assert(!(RHSKnown.Zero & RHSKnown.One) && "Bits known to be one AND zero?"); + assert(!(LHSKnown.Zero & LHSKnown.One) && "Bits known to be one AND zero?"); // If the operands are constants, see if we can simplify them. if (ShrinkDemandedConstant(I, 1, DemandedMask) || @@ -335,21 +323,20 @@ Value *InstCombiner::SimplifyDemandedUseBits(Value *V, APInt DemandedMask, return I; // Only known if known in both the LHS and RHS. 
- KnownOne = RHSKnownOne & LHSKnownOne; - KnownZero = RHSKnownZero & LHSKnownZero; + Known.One = RHSKnown.One & LHSKnown.One; + Known.Zero = RHSKnown.Zero & LHSKnown.Zero; break; case Instruction::Trunc: { unsigned truncBf = I->getOperand(0)->getType()->getScalarSizeInBits(); DemandedMask = DemandedMask.zext(truncBf); - KnownZero = KnownZero.zext(truncBf); - KnownOne = KnownOne.zext(truncBf); - if (SimplifyDemandedBits(I, 0, DemandedMask, KnownZero, KnownOne, - Depth + 1)) + Known.Zero = Known.Zero.zext(truncBf); + Known.One = Known.One.zext(truncBf); + if (SimplifyDemandedBits(I, 0, DemandedMask, Known, Depth + 1)) return I; DemandedMask = DemandedMask.trunc(BitWidth); - KnownZero = KnownZero.trunc(BitWidth); - KnownOne = KnownOne.trunc(BitWidth); - assert(!(KnownZero & KnownOne) && "Bits known to be one AND zero?"); + Known.Zero = Known.Zero.trunc(BitWidth); + Known.One = Known.One.trunc(BitWidth); + assert(!(Known.Zero & Known.One) && "Bits known to be one AND zero?"); break; } case Instruction::BitCast: @@ -369,27 +356,25 @@ Value *InstCombiner::SimplifyDemandedUseBits(Value *V, APInt DemandedMask, // Don't touch a vector-to-scalar bitcast. return nullptr; - if (SimplifyDemandedBits(I, 0, DemandedMask, KnownZero, KnownOne, - Depth + 1)) + if (SimplifyDemandedBits(I, 0, DemandedMask, Known, Depth + 1)) return I; - assert(!(KnownZero & KnownOne) && "Bits known to be one AND zero?"); + assert(!(Known.Zero & Known.One) && "Bits known to be one AND zero?"); break; case Instruction::ZExt: { // Compute the bits in the result that are not present in the input. unsigned SrcBitWidth =I->getOperand(0)->getType()->getScalarSizeInBits(); DemandedMask = DemandedMask.trunc(SrcBitWidth); - KnownZero = KnownZero.trunc(SrcBitWidth); - KnownOne = KnownOne.trunc(SrcBitWidth); - if (SimplifyDemandedBits(I, 0, DemandedMask, KnownZero, KnownOne, - Depth + 1)) + Known.Zero = Known.Zero.trunc(SrcBitWidth); + Known.One = Known.One.trunc(SrcBitWidth); + if (SimplifyDemandedBits(I, 0, DemandedMask, Known, Depth + 1)) return I; DemandedMask = DemandedMask.zext(BitWidth); - KnownZero = KnownZero.zext(BitWidth); - KnownOne = KnownOne.zext(BitWidth); - assert(!(KnownZero & KnownOne) && "Bits known to be one AND zero?"); + Known.Zero = Known.Zero.zext(BitWidth); + Known.One = Known.One.zext(BitWidth); + assert(!(Known.Zero & Known.One) && "Bits known to be one AND zero?"); // The top bits are known to be zero. - KnownZero.setBitsFrom(SrcBitWidth); + Known.Zero.setBitsFrom(SrcBitWidth); break; } case Instruction::SExt: { @@ -406,27 +391,26 @@ Value *InstCombiner::SimplifyDemandedUseBits(Value *V, APInt DemandedMask, InputDemandedBits.setBit(SrcBitWidth-1); InputDemandedBits = InputDemandedBits.trunc(SrcBitWidth); - KnownZero = KnownZero.trunc(SrcBitWidth); - KnownOne = KnownOne.trunc(SrcBitWidth); - if (SimplifyDemandedBits(I, 0, InputDemandedBits, KnownZero, KnownOne, - Depth + 1)) + Known.Zero = Known.Zero.trunc(SrcBitWidth); + Known.One = Known.One.trunc(SrcBitWidth); + if (SimplifyDemandedBits(I, 0, InputDemandedBits, Known, Depth + 1)) return I; InputDemandedBits = InputDemandedBits.zext(BitWidth); - KnownZero = KnownZero.zext(BitWidth); - KnownOne = KnownOne.zext(BitWidth); - assert(!(KnownZero & KnownOne) && "Bits known to be one AND zero?"); + Known.Zero = Known.Zero.zext(BitWidth); + Known.One = Known.One.zext(BitWidth); + assert(!(Known.Zero & Known.One) && "Bits known to be one AND zero?"); // If the sign bit of the input is known set or clear, then we know the // top bits of the result. 
// If the input sign bit is known zero, or if the NewBits are not demanded // convert this into a zero extension. - if (KnownZero[SrcBitWidth-1] || (NewBits & ~DemandedMask) == NewBits) { + if (Known.Zero[SrcBitWidth-1] || (NewBits & ~DemandedMask) == NewBits) { // Convert to ZExt cast CastInst *NewCast = new ZExtInst(I->getOperand(0), VTy, I->getName()); return InsertNewInstWith(NewCast, *I); - } else if (KnownOne[SrcBitWidth-1]) { // Input sign bit known set - KnownOne |= NewBits; + } else if (Known.One[SrcBitWidth-1]) { // Input sign bit known set + Known.One |= NewBits; } break; } @@ -440,11 +424,9 @@ Value *InstCombiner::SimplifyDemandedUseBits(Value *V, APInt DemandedMask, // significant bit and all those below it. APInt DemandedFromOps(APInt::getLowBitsSet(BitWidth, BitWidth-NLZ)); if (ShrinkDemandedConstant(I, 0, DemandedFromOps) || - SimplifyDemandedBits(I, 0, DemandedFromOps, LHSKnownZero, LHSKnownOne, - Depth + 1) || + SimplifyDemandedBits(I, 0, DemandedFromOps, LHSKnown, Depth + 1) || ShrinkDemandedConstant(I, 1, DemandedFromOps) || - SimplifyDemandedBits(I, 1, DemandedFromOps, RHSKnownZero, RHSKnownOne, - Depth + 1)) { + SimplifyDemandedBits(I, 1, DemandedFromOps, RHSKnown, Depth + 1)) { // Disable the nsw and nuw flags here: We can no longer guarantee that // we won't wrap after simplification. Removing the nsw/nuw flags is // legal here because the top bit is not demanded. @@ -456,30 +438,28 @@ Value *InstCombiner::SimplifyDemandedUseBits(Value *V, APInt DemandedMask, // If we are known to be adding/subtracting zeros to every bit below // the highest demanded bit, we just return the other side. - if ((DemandedFromOps & RHSKnownZero) == DemandedFromOps) + if (DemandedFromOps.isSubsetOf(RHSKnown.Zero)) return I->getOperand(0); // We can't do this with the LHS for subtraction. if (I->getOpcode() == Instruction::Add && - (DemandedFromOps & LHSKnownZero) == DemandedFromOps) + DemandedFromOps.isSubsetOf(LHSKnown.Zero)) return I->getOperand(1); } // Otherwise just hand the add/sub off to computeKnownBits to fill in // the known zeros and ones. - computeKnownBits(V, KnownZero, KnownOne, Depth, CxtI); + computeKnownBits(V, Known, Depth, CxtI); break; } - case Instruction::Shl: - if (ConstantInt *SA = dyn_cast(I->getOperand(1))) { - { - Value *VarX; ConstantInt *C1; - if (match(I->getOperand(0), m_Shr(m_Value(VarX), m_ConstantInt(C1)))) { - Instruction *Shr = cast(I->getOperand(0)); - Value *R = SimplifyShrShlDemandedBits(Shr, I, DemandedMask, - KnownZero, KnownOne); - if (R) - return R; - } + case Instruction::Shl: { + const APInt *SA; + if (match(I->getOperand(1), m_APInt(SA))) { + const APInt *ShrAmt; + if (match(I->getOperand(0), m_Shr(m_Value(), m_APInt(ShrAmt)))) { + Instruction *Shr = cast(I->getOperand(0)); + if (Value *R = simplifyShrShlDemandedBits( + Shr, *ShrAmt, I, *SA, DemandedMask, Known)) + return R; } uint64_t ShiftAmt = SA->getLimitedValue(BitWidth-1); @@ -492,17 +472,17 @@ Value *InstCombiner::SimplifyDemandedUseBits(Value *V, APInt DemandedMask, else if (IOp->hasNoUnsignedWrap()) DemandedMaskIn.setHighBits(ShiftAmt); - if (SimplifyDemandedBits(I, 0, DemandedMaskIn, KnownZero, KnownOne, - Depth + 1)) + if (SimplifyDemandedBits(I, 0, DemandedMaskIn, Known, Depth + 1)) return I; - assert(!(KnownZero & KnownOne) && "Bits known to be one AND zero?"); - KnownZero <<= ShiftAmt; - KnownOne <<= ShiftAmt; + assert(!(Known.Zero & Known.One) && "Bits known to be one AND zero?"); + Known.Zero <<= ShiftAmt; + Known.One <<= ShiftAmt; // low bits known zero. 
if (ShiftAmt) - KnownZero.setLowBits(ShiftAmt); + Known.Zero.setLowBits(ShiftAmt); } break; + } case Instruction::LShr: { const APInt *SA; if (match(I->getOperand(1), m_APInt(SA))) { @@ -516,14 +496,13 @@ Value *InstCombiner::SimplifyDemandedUseBits(Value *V, APInt DemandedMask, if (cast(I)->isExact()) DemandedMaskIn.setLowBits(ShiftAmt); - if (SimplifyDemandedBits(I, 0, DemandedMaskIn, KnownZero, KnownOne, - Depth + 1)) + if (SimplifyDemandedBits(I, 0, DemandedMaskIn, Known, Depth + 1)) return I; - assert(!(KnownZero & KnownOne) && "Bits known to be one AND zero?"); - KnownZero.lshrInPlace(ShiftAmt); - KnownOne.lshrInPlace(ShiftAmt); + assert(!(Known.Zero & Known.One) && "Bits known to be one AND zero?"); + Known.Zero.lshrInPlace(ShiftAmt); + Known.One.lshrInPlace(ShiftAmt); if (ShiftAmt) - KnownZero.setHighBits(ShiftAmt); // high bits known zero. + Known.Zero.setHighBits(ShiftAmt); // high bits known zero. } break; } @@ -560,15 +539,14 @@ Value *InstCombiner::SimplifyDemandedUseBits(Value *V, APInt DemandedMask, if (cast(I)->isExact()) DemandedMaskIn.setLowBits(ShiftAmt); - if (SimplifyDemandedBits(I, 0, DemandedMaskIn, KnownZero, KnownOne, - Depth + 1)) + if (SimplifyDemandedBits(I, 0, DemandedMaskIn, Known, Depth + 1)) return I; - assert(!(KnownZero & KnownOne) && "Bits known to be one AND zero?"); + assert(!(Known.Zero & Known.One) && "Bits known to be one AND zero?"); // Compute the new bits that are at the top now. APInt HighBits(APInt::getHighBitsSet(BitWidth, ShiftAmt)); - KnownZero.lshrInPlace(ShiftAmt); - KnownOne.lshrInPlace(ShiftAmt); + Known.Zero.lshrInPlace(ShiftAmt); + Known.One.lshrInPlace(ShiftAmt); // Handle the sign bits. APInt SignMask(APInt::getSignMask(BitWidth)); @@ -577,14 +555,14 @@ Value *InstCombiner::SimplifyDemandedUseBits(Value *V, APInt DemandedMask, // If the input sign bit is known to be zero, or if none of the top bits // are demanded, turn this into an unsigned shift right. - if (BitWidth <= ShiftAmt || KnownZero[BitWidth-ShiftAmt-1] || - (HighBits & ~DemandedMask) == HighBits) { + if (BitWidth <= ShiftAmt || Known.Zero[BitWidth-ShiftAmt-1] || + !DemandedMask.intersects(HighBits)) { BinaryOperator *LShr = BinaryOperator::CreateLShr(I->getOperand(0), I->getOperand(1)); LShr->setIsExact(cast(I)->isExact()); return InsertNewInstWith(LShr, *I); - } else if ((KnownOne & SignMask) != 0) { // New bits are known one. - KnownOne |= HighBits; + } else if (Known.One.intersects(SignMask)) { // New bits are known one. + Known.One |= HighBits; } } break; @@ -602,25 +580,24 @@ Value *InstCombiner::SimplifyDemandedUseBits(Value *V, APInt DemandedMask, APInt LowBits = RA - 1; APInt Mask2 = LowBits | APInt::getSignMask(BitWidth); - if (SimplifyDemandedBits(I, 0, Mask2, LHSKnownZero, LHSKnownOne, - Depth + 1)) + if (SimplifyDemandedBits(I, 0, Mask2, LHSKnown, Depth + 1)) return I; // The low bits of LHS are unchanged by the srem. - KnownZero = LHSKnownZero & LowBits; - KnownOne = LHSKnownOne & LowBits; + Known.Zero = LHSKnown.Zero & LowBits; + Known.One = LHSKnown.One & LowBits; // If LHS is non-negative or has all low bits zero, then the upper bits // are all zero. - if (LHSKnownZero.isSignBitSet() || ((LHSKnownZero & LowBits) == LowBits)) - KnownZero |= ~LowBits; + if (LHSKnown.Zero.isSignBitSet() || LowBits.isSubsetOf(LHSKnown.Zero)) + Known.Zero |= ~LowBits; // If LHS is negative and not all low bits are zero, then the upper bits // are all one. 
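The SRem case above encodes three facts about a signed remainder by a power of two. They can be checked directly with C++'s '%' operator, which truncates toward zero like LLVM's srem (the constants below are arbitrary):

.. code-block:: c++

  #include <cassert>
  #include <cstdint>

  int main() {
    const int32_t RA = 8;            // power-of-two divisor
    const uint32_t LowBits = RA - 1; // 0b111

    int32_t Pos = 29, Neg = -29;
    // The low bits of the result always equal the low bits of the LHS.
    assert((uint32_t(Pos % RA) & LowBits) == (uint32_t(Pos) & LowBits));
    assert((uint32_t(Neg % RA) & LowBits) == (uint32_t(Neg) & LowBits));
    // Non-negative LHS: the bits above LowBits are all zero.
    assert((uint32_t(Pos % RA) & ~LowBits) == 0);
    // Negative LHS with a nonzero low part: the bits above LowBits are all one.
    assert((uint32_t(Neg % RA) & ~LowBits) == ~LowBits);
    return 0;
  }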
- if (LHSKnownOne.isSignBitSet() && ((LHSKnownOne & LowBits) != 0)) - KnownOne |= ~LowBits; + if (LHSKnown.One.isSignBitSet() && LowBits.intersects(LHSKnown.One)) + Known.One |= ~LowBits; - assert(!(KnownZero & KnownOne) && "Bits known to be one AND zero?"); + assert(!(Known.Zero & Known.One) && "Bits known to be one AND zero?"); break; } } @@ -628,22 +605,21 @@ Value *InstCombiner::SimplifyDemandedUseBits(Value *V, APInt DemandedMask, // The sign bit is the LHS's sign bit, except when the result of the // remainder is zero. if (DemandedMask.isSignBitSet()) { - computeKnownBits(I->getOperand(0), LHSKnownZero, LHSKnownOne, Depth + 1, - CxtI); + computeKnownBits(I->getOperand(0), LHSKnown, Depth + 1, CxtI); // If it's known zero, our sign bit is also zero. - if (LHSKnownZero.isSignBitSet()) - KnownZero.setSignBit(); + if (LHSKnown.Zero.isSignBitSet()) + Known.Zero.setSignBit(); } break; case Instruction::URem: { - APInt KnownZero2(BitWidth, 0), KnownOne2(BitWidth, 0); + KnownBits Known2(BitWidth); APInt AllOnes = APInt::getAllOnesValue(BitWidth); - if (SimplifyDemandedBits(I, 0, AllOnes, KnownZero2, KnownOne2, Depth + 1) || - SimplifyDemandedBits(I, 1, AllOnes, KnownZero2, KnownOne2, Depth + 1)) + if (SimplifyDemandedBits(I, 0, AllOnes, Known2, Depth + 1) || + SimplifyDemandedBits(I, 1, AllOnes, Known2, Depth + 1)) return I; - unsigned Leaders = KnownZero2.countLeadingOnes(); - KnownZero = APInt::getHighBitsSet(BitWidth, Leaders) & DemandedMask; + unsigned Leaders = Known2.Zero.countLeadingOnes(); + Known.Zero = APInt::getHighBitsSet(BitWidth, Leaders) & DemandedMask; break; } case Instruction::Call: @@ -707,56 +683,54 @@ Value *InstCombiner::SimplifyDemandedUseBits(Value *V, APInt DemandedMask, return ConstantInt::getNullValue(VTy); // We know that the upper bits are set to zero. - KnownZero.setBitsFrom(ArgWidth); + Known.Zero.setBitsFrom(ArgWidth); return nullptr; } case Intrinsic::x86_sse42_crc32_64_64: - KnownZero.setBitsFrom(32); + Known.Zero.setBitsFrom(32); return nullptr; } } - computeKnownBits(V, KnownZero, KnownOne, Depth, CxtI); + computeKnownBits(V, Known, Depth, CxtI); break; } // If the client is only demanding bits that we know, return the known // constant. - if (DemandedMask.isSubsetOf(KnownZero|KnownOne)) - return Constant::getIntegerValue(VTy, KnownOne); + if (DemandedMask.isSubsetOf(Known.Zero|Known.One)) + return Constant::getIntegerValue(VTy, Known.One); return nullptr; } -/// Helper routine of SimplifyDemandedUseBits. It computes KnownZero/KnownOne +/// Helper routine of SimplifyDemandedUseBits. It computes Known /// bits. It also tries to handle simplifications that can be done based on /// DemandedMask, but without modifying the Instruction. Value *InstCombiner::SimplifyMultipleUseDemandedBits(Instruction *I, const APInt &DemandedMask, - APInt &KnownZero, - APInt &KnownOne, + KnownBits &Known, unsigned Depth, Instruction *CxtI) { unsigned BitWidth = DemandedMask.getBitWidth(); Type *ITy = I->getType(); - APInt LHSKnownZero(BitWidth, 0), LHSKnownOne(BitWidth, 0); - APInt RHSKnownZero(BitWidth, 0), RHSKnownOne(BitWidth, 0); + KnownBits LHSKnown(BitWidth); + KnownBits RHSKnown(BitWidth); // Despite the fact that we can't simplify this instruction in all User's - // context, we can at least compute the knownzero/knownone bits, and we can + // context, we can at least compute the known bits, and we can // do simplifications that apply to *just* the one user if we know that // this instruction has a simpler value in that context. 
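SimplifyDemandedUseBits ends with the fold shown above: if every demanded bit is covered by Known.Zero or Known.One, the demanded part of the value is fully determined and Known.One is the constant to return. A standalone sketch of that condition with uint8_t masks (the values are chosen arbitrarily):

.. code-block:: c++

  #include <cassert>
  #include <cstdint>

  int main() {
    uint8_t DemandedMask = 0x0F;               // caller only cares about the low nibble
    uint8_t KnownZero = 0x05, KnownOne = 0x0A; // low nibble fully known: 0b1010
    // DemandedMask.isSubsetOf(Known.Zero | Known.One) in mask form.
    bool AllDemandedKnown = (DemandedMask & ~(KnownZero | KnownOne)) == 0;
    assert(AllDemandedKnown);
    // Any value consistent with the known bits agrees with KnownOne on the
    // demanded bits; 0xCA and 0x3A both have low nibble 0xA.
    assert((0xCA & DemandedMask) == (KnownOne & DemandedMask));
    assert((0x3A & DemandedMask) == (KnownOne & DemandedMask));
    return 0;
  }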
switch (I->getOpcode()) { case Instruction::And: { // If either the LHS or the RHS are Zero, the result is zero. - computeKnownBits(I->getOperand(1), RHSKnownZero, RHSKnownOne, Depth + 1, - CxtI); - computeKnownBits(I->getOperand(0), LHSKnownZero, LHSKnownOne, Depth + 1, + computeKnownBits(I->getOperand(1), RHSKnown, Depth + 1, CxtI); + computeKnownBits(I->getOperand(0), LHSKnown, Depth + 1, CxtI); // Output known-0 are known to be clear if zero in either the LHS | RHS. - APInt IKnownZero = RHSKnownZero | LHSKnownZero; + APInt IKnownZero = RHSKnown.Zero | LHSKnown.Zero; // Output known-1 bits are only known if set in both the LHS & RHS. - APInt IKnownOne = RHSKnownOne & LHSKnownOne; + APInt IKnownOne = RHSKnown.One & LHSKnown.One; // If the client is only demanding bits that we know, return the known // constant. @@ -766,13 +740,13 @@ Value *InstCombiner::SimplifyMultipleUseDemandedBits(Instruction *I, // If all of the demanded bits are known 1 on one side, return the other. // These bits cannot contribute to the result of the 'and' in this // context. - if (DemandedMask.isSubsetOf(LHSKnownZero | RHSKnownOne)) + if (DemandedMask.isSubsetOf(LHSKnown.Zero | RHSKnown.One)) return I->getOperand(0); - if (DemandedMask.isSubsetOf(RHSKnownZero | LHSKnownOne)) + if (DemandedMask.isSubsetOf(RHSKnown.Zero | LHSKnown.One)) return I->getOperand(1); - KnownZero = std::move(IKnownZero); - KnownOne = std::move(IKnownOne); + Known.Zero = std::move(IKnownZero); + Known.One = std::move(IKnownOne); break; } case Instruction::Or: { @@ -780,15 +754,14 @@ Value *InstCombiner::SimplifyMultipleUseDemandedBits(Instruction *I, // only bits from X or Y are demanded. // If either the LHS or the RHS are One, the result is One. - computeKnownBits(I->getOperand(1), RHSKnownZero, RHSKnownOne, Depth + 1, - CxtI); - computeKnownBits(I->getOperand(0), LHSKnownZero, LHSKnownOne, Depth + 1, + computeKnownBits(I->getOperand(1), RHSKnown, Depth + 1, CxtI); + computeKnownBits(I->getOperand(0), LHSKnown, Depth + 1, CxtI); // Output known-0 bits are only known if clear in both the LHS & RHS. - APInt IKnownZero = RHSKnownZero & LHSKnownZero; + APInt IKnownZero = RHSKnown.Zero & LHSKnown.Zero; // Output known-1 are known to be set if set in either the LHS | RHS. - APInt IKnownOne = RHSKnownOne | LHSKnownOne; + APInt IKnownOne = RHSKnown.One | LHSKnown.One; // If the client is only demanding bits that we know, return the known // constant. @@ -798,30 +771,29 @@ Value *InstCombiner::SimplifyMultipleUseDemandedBits(Instruction *I, // If all of the demanded bits are known zero on one side, return the // other. These bits cannot contribute to the result of the 'or' in this // context. - if (DemandedMask.isSubsetOf(LHSKnownOne | RHSKnownZero)) + if (DemandedMask.isSubsetOf(LHSKnown.One | RHSKnown.Zero)) return I->getOperand(0); - if (DemandedMask.isSubsetOf(RHSKnownOne | LHSKnownZero)) + if (DemandedMask.isSubsetOf(RHSKnown.One | LHSKnown.Zero)) return I->getOperand(1); - KnownZero = std::move(IKnownZero); - KnownOne = std::move(IKnownOne); + Known.Zero = std::move(IKnownZero); + Known.One = std::move(IKnownOne); break; } case Instruction::Xor: { // We can simplify (X^Y) -> X or Y in the user's context if we know that // only bits from X or Y are demanded. 
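The And/Or cases above, and the Xor case that follows, each combine the operands' known bits with a fixed rule. A standalone brute-force check of those three rules over all 4-bit values consistent with some made-up partial knowledge (plain structs, not the KnownBits type):

.. code-block:: c++

  #include <cassert>
  #include <cstdint>

  struct Known { uint8_t Zero, One; };

  // A concrete value V agrees with partial knowledge K if every known-one bit
  // is set in V and every known-zero bit is clear in V.
  static bool consistent(uint8_t V, Known K) {
    return (V & K.One) == K.One && (V & K.Zero) == 0;
  }

  int main() {
    Known L{0x1, 0x4}; // bit 0 known zero, bit 2 known one
    Known R{0x8, 0x2}; // bit 3 known zero, bit 1 known one
    Known And{uint8_t(L.Zero | R.Zero), uint8_t(L.One & R.One)};
    Known Or {uint8_t(L.Zero & R.Zero), uint8_t(L.One | R.One)};
    Known Xor{uint8_t((L.Zero & R.Zero) | (L.One & R.One)),
              uint8_t((L.Zero & R.One) | (L.One & R.Zero))};
    for (unsigned A = 0; A < 16; ++A) {
      if (!consistent(A, L)) continue;
      for (unsigned B = 0; B < 16; ++B) {
        if (!consistent(B, R)) continue;
        assert(consistent(A & B, And));
        assert(consistent(A | B, Or));
        assert(consistent(A ^ B, Xor));
      }
    }
    return 0;
  }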
- computeKnownBits(I->getOperand(1), RHSKnownZero, RHSKnownOne, Depth + 1, - CxtI); - computeKnownBits(I->getOperand(0), LHSKnownZero, LHSKnownOne, Depth + 1, + computeKnownBits(I->getOperand(1), RHSKnown, Depth + 1, CxtI); + computeKnownBits(I->getOperand(0), LHSKnown, Depth + 1, CxtI); // Output known-0 bits are known if clear or set in both the LHS & RHS. - APInt IKnownZero = (RHSKnownZero & LHSKnownZero) | - (RHSKnownOne & LHSKnownOne); + APInt IKnownZero = (RHSKnown.Zero & LHSKnown.Zero) | + (RHSKnown.One & LHSKnown.One); // Output known-1 are known to be set if set in only one of the LHS, RHS. - APInt IKnownOne = (RHSKnownZero & LHSKnownOne) | - (RHSKnownOne & LHSKnownZero); + APInt IKnownOne = (RHSKnown.Zero & LHSKnown.One) | + (RHSKnown.One & LHSKnown.Zero); // If the client is only demanding bits that we know, return the known // constant. @@ -830,25 +802,25 @@ Value *InstCombiner::SimplifyMultipleUseDemandedBits(Instruction *I, // If all of the demanded bits are known zero on one side, return the // other. - if (DemandedMask.isSubsetOf(RHSKnownZero)) + if (DemandedMask.isSubsetOf(RHSKnown.Zero)) return I->getOperand(0); - if (DemandedMask.isSubsetOf(LHSKnownZero)) + if (DemandedMask.isSubsetOf(LHSKnown.Zero)) return I->getOperand(1); // Output known-0 bits are known if clear or set in both the LHS & RHS. - KnownZero = std::move(IKnownZero); + Known.Zero = std::move(IKnownZero); // Output known-1 are known to be set if set in only one of the LHS, RHS. - KnownOne = std::move(IKnownOne); + Known.One = std::move(IKnownOne); break; } default: - // Compute the KnownZero/KnownOne bits to simplify things downstream. - computeKnownBits(I, KnownZero, KnownOne, Depth, CxtI); + // Compute the Known bits to simplify things downstream. + computeKnownBits(I, Known, Depth, CxtI); // If this user is only demanding bits that we know, return the known // constant. - if (DemandedMask.isSubsetOf(KnownZero|KnownOne)) - return Constant::getIntegerValue(ITy, KnownOne); + if (DemandedMask.isSubsetOf(Known.Zero|Known.One)) + return Constant::getIntegerValue(ITy, Known.One); break; } @@ -874,29 +846,26 @@ Value *InstCombiner::SimplifyMultipleUseDemandedBits(Instruction *I, /// /// As with SimplifyDemandedUseBits, it returns NULL if the simplification was /// not successful. -Value *InstCombiner::SimplifyShrShlDemandedBits(Instruction *Shr, - Instruction *Shl, - const APInt &DemandedMask, - APInt &KnownZero, - APInt &KnownOne) { - - const APInt &ShlOp1 = cast(Shl->getOperand(1))->getValue(); - const APInt &ShrOp1 = cast(Shr->getOperand(1))->getValue(); +Value * +InstCombiner::simplifyShrShlDemandedBits(Instruction *Shr, const APInt &ShrOp1, + Instruction *Shl, const APInt &ShlOp1, + const APInt &DemandedMask, + KnownBits &Known) { if (!ShlOp1 || !ShrOp1) - return nullptr; // Noop. + return nullptr; // No-op. Value *VarX = Shr->getOperand(0); Type *Ty = VarX->getType(); - unsigned BitWidth = Ty->getIntegerBitWidth(); + unsigned BitWidth = Ty->getScalarSizeInBits(); if (ShlOp1.uge(BitWidth) || ShrOp1.uge(BitWidth)) return nullptr; // Undef. 
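One arithmetic fact this shr-then-shl reasoning leans on is that, for equal shift amounts, shifting an unsigned value right and then left merely clears the low bits. A standalone check over a few arbitrary values (this is only the equal-amount base case, not the full transform):

.. code-block:: c++

  #include <cassert>
  #include <cstdint>

  int main() {
    for (uint32_t X : {0u, 1u, 0x1234u, 0xdeadbeefu, 0xffffffffu}) {
      for (unsigned C = 0; C < 32; ++C) {
        uint32_t ShrShl = (X >> C) << C;
        uint32_t Masked = X & ~((1u << C) - 1u); // clear the low C bits
        assert(ShrShl == Masked);
      }
    }
    return 0;
  }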
unsigned ShlAmt = ShlOp1.getZExtValue(); unsigned ShrAmt = ShrOp1.getZExtValue(); - KnownOne.clearAllBits(); - KnownZero.setLowBits(ShlAmt - 1); - KnownZero &= DemandedMask; + Known.One.clearAllBits(); + Known.Zero.setLowBits(ShlAmt - 1); + Known.Zero &= DemandedMask; APInt BitMask1(APInt::getAllOnesValue(BitWidth)); APInt BitMask2(APInt::getAllOnesValue(BitWidth)); diff --git a/lib/Transforms/InstCombine/InstructionCombining.cpp b/lib/Transforms/InstCombine/InstructionCombining.cpp index 81f2d9f..4729c79 100644 --- a/lib/Transforms/InstCombine/InstructionCombining.cpp +++ b/lib/Transforms/InstCombine/InstructionCombining.cpp @@ -60,6 +60,7 @@ #include "llvm/IR/ValueHandle.h" #include "llvm/Support/CommandLine.h" #include "llvm/Support/Debug.h" +#include "llvm/Support/KnownBits.h" #include "llvm/Support/raw_ostream.h" #include "llvm/Transforms/Scalar.h" #include "llvm/Transforms/Utils/Local.h" @@ -641,14 +642,6 @@ Value *InstCombiner::SimplifyUsingDistributiveLaws(BinaryOperator &I) { if (Value *R = SimplifyBinOp(TopLevelOpcode, B, C, DL)) { // They do! Return "L op' R". ++NumExpand; - // If "L op' R" equals "A op' B" then "L op' R" is just the LHS. - if ((L == A && R == B) || - (Instruction::isCommutative(InnerOpcode) && L == B && R == A)) - return Op0; - // Otherwise return "L op' R" if it simplifies. - if (Value *V = SimplifyBinOp(InnerOpcode, L, R, DL)) - return V; - // Otherwise, create a new instruction. C = Builder->CreateBinOp(InnerOpcode, L, R); C->takeName(&I); return C; @@ -666,14 +659,6 @@ Value *InstCombiner::SimplifyUsingDistributiveLaws(BinaryOperator &I) { if (Value *R = SimplifyBinOp(TopLevelOpcode, A, C, DL)) { // They do! Return "L op' R". ++NumExpand; - // If "L op' R" equals "B op' C" then "L op' R" is just the RHS. - if ((L == B && R == C) || - (Instruction::isCommutative(InnerOpcode) && L == C && R == B)) - return Op1; - // Otherwise return "L op' R" if it simplifies. - if (Value *V = SimplifyBinOp(InnerOpcode, L, R, DL)) - return V; - // Otherwise, create a new instruction. A = Builder->CreateBinOp(InnerOpcode, L, R); A->takeName(&I); return A; @@ -2196,11 +2181,10 @@ Instruction *InstCombiner::visitReturnInst(ReturnInst &RI) { // There might be assume intrinsics dominating this return that completely // determine the value. If so, constant fold it. - unsigned BitWidth = VTy->getPrimitiveSizeInBits(); - APInt KnownZero(BitWidth, 0), KnownOne(BitWidth, 0); - computeKnownBits(ResultOp, KnownZero, KnownOne, 0, &RI); - if ((KnownZero|KnownOne).isAllOnesValue()) - RI.setOperand(0, Constant::getIntegerValue(VTy, KnownOne)); + KnownBits Known(VTy->getPrimitiveSizeInBits()); + computeKnownBits(ResultOp, Known, 0, &RI); + if ((Known.Zero|Known.One).isAllOnesValue()) + RI.setOperand(0, Constant::getIntegerValue(VTy, Known.One)); return nullptr; } @@ -2279,10 +2263,10 @@ Instruction *InstCombiner::visitSwitchInst(SwitchInst &SI) { } unsigned BitWidth = cast(Cond->getType())->getBitWidth(); - APInt KnownZero(BitWidth, 0), KnownOne(BitWidth, 0); - computeKnownBits(Cond, KnownZero, KnownOne, 0, &SI); - unsigned LeadingKnownZeros = KnownZero.countLeadingOnes(); - unsigned LeadingKnownOnes = KnownOne.countLeadingOnes(); + KnownBits Known(BitWidth); + computeKnownBits(Cond, Known, 0, &SI); + unsigned LeadingKnownZeros = Known.Zero.countLeadingOnes(); + unsigned LeadingKnownOnes = Known.One.countLeadingOnes(); // Compute the number of leading bits we can ignore. // TODO: A better way to determine this would use ComputeNumSignBits(). 
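The switch handling above starts by measuring how many leading bits of the condition are already known, since those bits are fixed and cannot help distinguish cases. A standalone sketch of that measurement with 32-bit masks (countLeadingOnes is re-implemented here only for the sketch):

.. code-block:: c++

  #include <algorithm>
  #include <cassert>
  #include <cstdint>

  static unsigned leadingOnes(uint32_t Mask) {
    unsigned N = 0;
    for (int I = 31; I >= 0 && ((Mask >> I) & 1); --I)
      ++N;
    return N;
  }

  int main() {
    // Condition known to look like 0x000000?? (top 24 bits known zero).
    uint32_t KnownZero = 0xFFFFFF00, KnownOne = 0x00000000;
    unsigned LeadingKnownZeros = leadingOnes(KnownZero);
    unsigned LeadingKnownOnes = leadingOnes(KnownOne);
    unsigned KnownLeading = std::max(LeadingKnownZeros, LeadingKnownOnes);
    assert(KnownLeading == 24); // only the low 8 bits can vary
    return 0;
  }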
@@ -2879,11 +2863,10 @@ bool InstCombiner::run() { Type *Ty = I->getType(); if (ExpensiveCombines && !I->use_empty() && Ty->isIntOrIntVectorTy()) { unsigned BitWidth = Ty->getScalarSizeInBits(); - APInt KnownZero(BitWidth, 0); - APInt KnownOne(BitWidth, 0); - computeKnownBits(I, KnownZero, KnownOne, /*Depth*/0, I); - if ((KnownZero | KnownOne).isAllOnesValue()) { - Constant *C = ConstantInt::get(Ty, KnownOne); + KnownBits Known(BitWidth); + computeKnownBits(I, Known, /*Depth*/0, I); + if ((Known.Zero | Known.One).isAllOnesValue()) { + Constant *C = ConstantInt::get(Ty, Known.One); DEBUG(dbgs() << "IC: ConstFold (all bits known) to: " << *C << " from: " << *I << '\n'); diff --git a/lib/Transforms/Instrumentation/AddressSanitizer.cpp b/lib/Transforms/Instrumentation/AddressSanitizer.cpp index 036dd8d..b866958 100644 --- a/lib/Transforms/Instrumentation/AddressSanitizer.cpp +++ b/lib/Transforms/Instrumentation/AddressSanitizer.cpp @@ -265,11 +265,10 @@ static cl::opt cl::Hidden, cl::init(false)); static cl::opt - ClUseMachOGlobalsSection("asan-globals-live-support", - cl::desc("Use linker features to support dead " - "code stripping of globals " - "(Mach-O only)"), - cl::Hidden, cl::init(true)); + ClUseGlobalsGC("asan-globals-live-support", + cl::desc("Use linker features to support dead " + "code stripping of globals"), + cl::Hidden, cl::init(true)); // Debug flags. static cl::opt ClDebug("asan-debug", cl::desc("debug"), cl::Hidden, @@ -594,13 +593,15 @@ struct AddressSanitizer : public FunctionPass { }; class AddressSanitizerModule : public ModulePass { - public: +public: explicit AddressSanitizerModule(bool CompileKernel = false, - bool Recover = false) + bool Recover = false, + bool UseGlobalsGC = true) : ModulePass(ID), CompileKernel(CompileKernel || ClEnableKasan), - Recover(Recover || ClRecover) {} + Recover(Recover || ClRecover), + UseGlobalsGC(UseGlobalsGC && ClUseGlobalsGC) {} bool runOnModule(Module &M) override; - static char ID; // Pass identification, replacement for typeid + static char ID; // Pass identification, replacement for typeid StringRef getPassName() const override { return "AddressSanitizerModule"; } private: @@ -635,6 +636,7 @@ private: GlobalsMetadata GlobalsMD; bool CompileKernel; bool Recover; + bool UseGlobalsGC; Type *IntptrTy; LLVMContext *C; Triple TargetTriple; @@ -913,9 +915,10 @@ INITIALIZE_PASS( "ModulePass", false, false) ModulePass *llvm::createAddressSanitizerModulePass(bool CompileKernel, - bool Recover) { + bool Recover, + bool UseGlobalsGC) { assert(!CompileKernel || Recover); - return new AddressSanitizerModule(CompileKernel, Recover); + return new AddressSanitizerModule(CompileKernel, Recover, UseGlobalsGC); } static size_t TypeSizeToSizeIndex(uint32_t TypeSize) { @@ -1537,9 +1540,6 @@ bool AddressSanitizerModule::ShouldInstrumentGlobal(GlobalVariable *G) { // binary in order to allow the linker to properly dead strip. This is only // supported on recent versions of ld64. 
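The AddressSanitizer change above threads a new UseGlobalsGC knob through the module pass and combines it with the renamed command-line option, so either side can disable globals dead-stripping support. A standalone sketch of that gating pattern (names are stand-ins, not the real pass or cl::opt machinery):

.. code-block:: c++

  #include <cassert>

  static bool ClUseGlobalsGC = true; // stand-in for -asan-globals-live-support

  struct AddressSanitizerModuleSketch {
    bool UseGlobalsGC;
    explicit AddressSanitizerModuleSketch(bool UseGlobalsGC = true)
        : UseGlobalsGC(UseGlobalsGC && ClUseGlobalsGC) {}
  };

  int main() {
    assert(AddressSanitizerModuleSketch(true).UseGlobalsGC);   // default: on
    assert(!AddressSanitizerModuleSketch(false).UseGlobalsGC); // caller opts out
    ClUseGlobalsGC = false;
    assert(!AddressSanitizerModuleSketch(true).UseGlobalsGC);  // flag opts out
    return 0;
  }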
bool AddressSanitizerModule::ShouldUseMachOGlobalsSection() const { - if (!ClUseMachOGlobalsSection) - return false; - if (!TargetTriple.isOSBinFormatMachO()) return false; @@ -1911,9 +1911,9 @@ bool AddressSanitizerModule::InstrumentGlobals(IRBuilder<> &IRB, Module &M) { Initializers[i] = Initializer; } - if (TargetTriple.isOSBinFormatCOFF()) { + if (UseGlobalsGC && TargetTriple.isOSBinFormatCOFF()) { InstrumentGlobalsCOFF(IRB, M, NewGlobals, Initializers); - } else if (ShouldUseMachOGlobalsSection()) { + } else if (UseGlobalsGC && ShouldUseMachOGlobalsSection()) { InstrumentGlobalsMachO(IRB, M, NewGlobals, Initializers); } else { InstrumentGlobalsWithMetadataArray(IRB, M, NewGlobals, Initializers); diff --git a/lib/Transforms/Instrumentation/IndirectCallPromotion.cpp b/lib/Transforms/Instrumentation/IndirectCallPromotion.cpp index 61d6276..d7eb857 100644 --- a/lib/Transforms/Instrumentation/IndirectCallPromotion.cpp +++ b/lib/Transforms/Instrumentation/IndirectCallPromotion.cpp @@ -943,14 +943,16 @@ bool MemOPSizeOpt::perform(MemIntrinsic *MI) { BasicBlock *BB = MI->getParent(); DEBUG(dbgs() << "\n\n== Basic Block Before ==\n"); DEBUG(dbgs() << *BB << "\n"); + auto OrigBBFreq = BFI.getBlockFreq(BB); BasicBlock *DefaultBB = SplitBlock(BB, MI); BasicBlock::iterator It(*MI); ++It; assert(It != DefaultBB->end()); BasicBlock *MergeBB = SplitBlock(DefaultBB, &(*It)); - DefaultBB->setName("MemOP.Default"); MergeBB->setName("MemOP.Merge"); + BFI.setBlockFreq(MergeBB, OrigBBFreq.getFrequency()); + DefaultBB->setName("MemOP.Default"); auto &Ctx = Func.getContext(); IRBuilder<> IRB(BB); diff --git a/lib/Transforms/ObjCARC/PtrState.cpp b/lib/Transforms/ObjCARC/PtrState.cpp index a5afc8a..c1bbc4e 100644 --- a/lib/Transforms/ObjCARC/PtrState.cpp +++ b/lib/Transforms/ObjCARC/PtrState.cpp @@ -351,8 +351,10 @@ bool TopDownPtrState::HandlePotentialAlterRefCount(Instruction *Inst, const Value *Ptr, ProvenanceAnalysis &PA, ARCInstKind Class) { - // Check for possible releases. - if (!CanAlterRefCount(Inst, Ptr, PA, Class)) + // Check for possible releases. Treat clang.arc.use as a releasing instruction + // to prevent sinking a retain past it. + if (!CanAlterRefCount(Inst, Ptr, PA, Class) && + Class != ARCInstKind::IntrinsicUser) return false; DEBUG(dbgs() << " CanAlterRefCount: Seq: " << GetSeq() << "; " << *Ptr diff --git a/lib/Transforms/Scalar/ConstantHoisting.cpp b/lib/Transforms/Scalar/ConstantHoisting.cpp index ee6333e..f62e111 100644 --- a/lib/Transforms/Scalar/ConstantHoisting.cpp +++ b/lib/Transforms/Scalar/ConstantHoisting.cpp @@ -53,6 +53,12 @@ using namespace consthoist; STATISTIC(NumConstantsHoisted, "Number of constants hoisted"); STATISTIC(NumConstantsRebased, "Number of constants rebased"); +static cl::opt ConstHoistWithBlockFrequency( + "consthoist-with-block-frequency", cl::init(false), cl::Hidden, + cl::desc("Enable the use of the block frequency analysis to reduce the " + "chance to execute const materialization more frequently than " + "without hoisting.")); + namespace { /// \brief The constant hoisting pass. 
class ConstantHoistingLegacyPass : public FunctionPass { @@ -68,6 +74,8 @@ public: void getAnalysisUsage(AnalysisUsage &AU) const override { AU.setPreservesCFG(); + if (ConstHoistWithBlockFrequency) + AU.addRequired(); AU.addRequired(); AU.addRequired(); } @@ -82,6 +90,7 @@ private: char ConstantHoistingLegacyPass::ID = 0; INITIALIZE_PASS_BEGIN(ConstantHoistingLegacyPass, "consthoist", "Constant Hoisting", false, false) +INITIALIZE_PASS_DEPENDENCY(BlockFrequencyInfoWrapperPass) INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass) INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass) INITIALIZE_PASS_END(ConstantHoistingLegacyPass, "consthoist", @@ -99,9 +108,13 @@ bool ConstantHoistingLegacyPass::runOnFunction(Function &Fn) { DEBUG(dbgs() << "********** Begin Constant Hoisting **********\n"); DEBUG(dbgs() << "********** Function: " << Fn.getName() << '\n'); - bool MadeChange = Impl.runImpl( - Fn, getAnalysis().getTTI(Fn), - getAnalysis().getDomTree(), Fn.getEntryBlock()); + bool MadeChange = + Impl.runImpl(Fn, getAnalysis().getTTI(Fn), + getAnalysis().getDomTree(), + ConstHoistWithBlockFrequency + ? &getAnalysis().getBFI() + : nullptr, + Fn.getEntryBlock()); if (MadeChange) { DEBUG(dbgs() << "********** Function after Constant Hoisting: " @@ -148,33 +161,142 @@ Instruction *ConstantHoistingPass::findMatInsertPt(Instruction *Inst, return IDom->getBlock()->getTerminator(); } +/// \brief Given \p BBs as input, find another set of BBs which collectively +/// dominates \p BBs and have the minimal sum of frequencies. Return the BB +/// set found in \p BBs. +void findBestInsertionSet(DominatorTree &DT, BlockFrequencyInfo &BFI, + BasicBlock *Entry, + SmallPtrSet &BBs) { + assert(!BBs.count(Entry) && "Assume Entry is not in BBs"); + // Nodes on the current path to the root. + SmallPtrSet Path; + // Candidates includes any block 'BB' in set 'BBs' that is not strictly + // dominated by any other blocks in set 'BBs', and all nodes in the path + // in the dominator tree from Entry to 'BB'. + SmallPtrSet Candidates; + for (auto BB : BBs) { + Path.clear(); + // Walk up the dominator tree until Entry or another BB in BBs + // is reached. Insert the nodes on the way to the Path. + BasicBlock *Node = BB; + // The "Path" is a candidate path to be added into Candidates set. + bool isCandidate = false; + do { + Path.insert(Node); + if (Node == Entry || Candidates.count(Node)) { + isCandidate = true; + break; + } + assert(DT.getNode(Node)->getIDom() && + "Entry doens't dominate current Node"); + Node = DT.getNode(Node)->getIDom()->getBlock(); + } while (!BBs.count(Node)); + + // If isCandidate is false, Node is another Block in BBs dominating + // current 'BB'. Drop the nodes on the Path. + if (!isCandidate) + continue; + + // Add nodes on the Path into Candidates. + Candidates.insert(Path.begin(), Path.end()); + } + + // Sort the nodes in Candidates in top-down order and save the nodes + // in Orders. + unsigned Idx = 0; + SmallVector Orders; + Orders.push_back(Entry); + while (Idx != Orders.size()) { + BasicBlock *Node = Orders[Idx++]; + for (auto ChildDomNode : DT.getNode(Node)->getChildren()) { + if (Candidates.count(ChildDomNode->getBlock())) + Orders.push_back(ChildDomNode->getBlock()); + } + } + + // Visit Orders in bottom-up order. + typedef std::pair, BlockFrequency> + InsertPtsCostPair; + // InsertPtsMap is a map from a BB to the best insertion points for the + // subtree of BB (subtree not including the BB itself). 
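The candidate collection described above can be exercised in isolation: for each block that uses the constant, walk up the immediate-dominator chain, stopping at the entry, at a block already known to be a candidate, or at another use block (in which case the path is dropped). A standalone sketch with a made-up dominator tree encoded as a parent array:

.. code-block:: c++

  #include <cassert>
  #include <set>
  #include <vector>

  int main() {
    // Immediate dominator of each block; block 0 is the entry.
    std::vector<int> IDom = {0, 0, 1, 1, 2};
    std::set<int> BBs = {3, 4}; // blocks that use the constant
    std::set<int> Candidates;

    for (int BB : BBs) {
      std::vector<int> Path;
      int Node = BB;
      bool IsCandidate = false;
      do {
        Path.push_back(Node);
        if (Node == 0 || Candidates.count(Node)) {
          IsCandidate = true;
          break;
        }
        Node = IDom[Node];
      } while (!BBs.count(Node));
      if (!IsCandidate)
        continue; // dominated by another use block: drop the path
      Candidates.insert(Path.begin(), Path.end());
    }
    // Uses 3 and 4 plus their dominators 1, 2 and the entry become candidates.
    assert((Candidates == std::set<int>({0, 1, 2, 3, 4})));
    return 0;
  }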
+ DenseMap InsertPtsMap; + InsertPtsMap.reserve(Orders.size() + 1); + for (auto RIt = Orders.rbegin(); RIt != Orders.rend(); RIt++) { + BasicBlock *Node = *RIt; + bool NodeInBBs = BBs.count(Node); + SmallPtrSet &InsertPts = InsertPtsMap[Node].first; + BlockFrequency &InsertPtsFreq = InsertPtsMap[Node].second; + + // Return the optimal insert points in BBs. + if (Node == Entry) { + BBs.clear(); + if (InsertPtsFreq > BFI.getBlockFreq(Node)) + BBs.insert(Entry); + else + BBs.insert(InsertPts.begin(), InsertPts.end()); + break; + } + + BasicBlock *Parent = DT.getNode(Node)->getIDom()->getBlock(); + // Initially, ParentInsertPts is empty and ParentPtsFreq is 0. Every child + // will update its parent's ParentInsertPts and ParentPtsFreq. + SmallPtrSet &ParentInsertPts = InsertPtsMap[Parent].first; + BlockFrequency &ParentPtsFreq = InsertPtsMap[Parent].second; + // Choose to insert in Node or in subtree of Node. + if (InsertPtsFreq > BFI.getBlockFreq(Node) || NodeInBBs) { + ParentInsertPts.insert(Node); + ParentPtsFreq += BFI.getBlockFreq(Node); + } else { + ParentInsertPts.insert(InsertPts.begin(), InsertPts.end()); + ParentPtsFreq += InsertPtsFreq; + } + } +} + /// \brief Find an insertion point that dominates all uses. -Instruction *ConstantHoistingPass::findConstantInsertionPoint( +SmallPtrSet ConstantHoistingPass::findConstantInsertionPoint( const ConstantInfo &ConstInfo) const { assert(!ConstInfo.RebasedConstants.empty() && "Invalid constant info entry."); // Collect all basic blocks. SmallPtrSet BBs; + SmallPtrSet InsertPts; for (auto const &RCI : ConstInfo.RebasedConstants) for (auto const &U : RCI.Uses) BBs.insert(findMatInsertPt(U.Inst, U.OpndIdx)->getParent()); - if (BBs.count(Entry)) - return &Entry->front(); + if (BBs.count(Entry)) { + InsertPts.insert(&Entry->front()); + return InsertPts; + } + + if (BFI) { + findBestInsertionSet(*DT, *BFI, Entry, BBs); + for (auto BB : BBs) { + BasicBlock::iterator InsertPt = BB->begin(); + for (; isa(InsertPt) || InsertPt->isEHPad(); ++InsertPt) + ; + InsertPts.insert(&*InsertPt); + } + return InsertPts; + } while (BBs.size() >= 2) { BasicBlock *BB, *BB1, *BB2; BB1 = *BBs.begin(); BB2 = *std::next(BBs.begin()); BB = DT->findNearestCommonDominator(BB1, BB2); - if (BB == Entry) - return &Entry->front(); + if (BB == Entry) { + InsertPts.insert(&Entry->front()); + return InsertPts; + } BBs.erase(BB1); BBs.erase(BB2); BBs.insert(BB); } assert((BBs.size() == 1) && "Expected only one element."); Instruction &FirstInst = (*BBs.begin())->front(); - return findMatInsertPt(&FirstInst); + InsertPts.insert(findMatInsertPt(&FirstInst)); + return InsertPts; } @@ -557,29 +679,54 @@ bool ConstantHoistingPass::emitBaseConstants() { bool MadeChange = false; for (auto const &ConstInfo : ConstantVec) { // Hoist and hide the base constant behind a bitcast. 
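After the candidates are ordered top-down, the loop above works bottom-up and, at each node, keeps whichever is cheaper by block frequency: hoisting into the node itself or keeping the insertion points chosen for its subtree. A simplified standalone sketch of that choice (the tree shape and frequencies are invented, and the candidate filtering step is omitted):

.. code-block:: c++

  #include <cstdio>
  #include <vector>

  struct Node {
    unsigned Freq;             // block frequency
    bool IsUse;                // a rebased constant is used here
    std::vector<int> Children; // dominator-tree children
  };

  // Returns the total frequency of the chosen insertion blocks for the
  // subtree rooted at N, appending the chosen blocks to Out.
  static unsigned choose(const std::vector<Node> &Tree, int N,
                         std::vector<int> &Out) {
    std::vector<int> SubtreePts;
    unsigned SubtreeFreq = 0;
    for (int C : Tree[N].Children)
      SubtreeFreq += choose(Tree, C, SubtreePts);
    // Hoist into N if that is cheaper, or if N itself needs the constant.
    if (Tree[N].IsUse || SubtreeFreq > Tree[N].Freq || SubtreePts.empty()) {
      Out.push_back(N);
      return Tree[N].Freq;
    }
    Out.insert(Out.end(), SubtreePts.begin(), SubtreePts.end());
    return SubtreeFreq;
  }

  int main() {
    // 0 = entry (freq 10); a cold block 1 dominating two hot use blocks.
    std::vector<Node> Tree = {
        {10, false, {1}},
        {2, false, {2, 3}},
        {50, true, {}},
        {60, true, {}},
    };
    std::vector<int> Pts;
    unsigned Cost = choose(Tree, 0, Pts);
    for (int B : Pts)
      std::printf("insert in block %d\n", B); // prints block 1
    std::printf("total frequency %u\n", Cost); // prints 2
    return 0;
  }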
- Instruction *IP = findConstantInsertionPoint(ConstInfo); - IntegerType *Ty = ConstInfo.BaseConstant->getType(); - Instruction *Base = - new BitCastInst(ConstInfo.BaseConstant, Ty, "const", IP); - DEBUG(dbgs() << "Hoist constant (" << *ConstInfo.BaseConstant << ") to BB " - << IP->getParent()->getName() << '\n' << *Base << '\n'); - NumConstantsHoisted++; + SmallPtrSet IPSet = findConstantInsertionPoint(ConstInfo); + assert(!IPSet.empty() && "IPSet is empty"); + + unsigned UsesNum = 0; + unsigned ReBasesNum = 0; + for (Instruction *IP : IPSet) { + IntegerType *Ty = ConstInfo.BaseConstant->getType(); + Instruction *Base = + new BitCastInst(ConstInfo.BaseConstant, Ty, "const", IP); + DEBUG(dbgs() << "Hoist constant (" << *ConstInfo.BaseConstant + << ") to BB " << IP->getParent()->getName() << '\n' + << *Base << '\n'); + + // Emit materialization code for all rebased constants. + unsigned Uses = 0; + for (auto const &RCI : ConstInfo.RebasedConstants) { + for (auto const &U : RCI.Uses) { + Uses++; + BasicBlock *OrigMatInsertBB = + findMatInsertPt(U.Inst, U.OpndIdx)->getParent(); + // If Base constant is to be inserted in multiple places, + // generate rebase for U using the Base dominating U. + if (IPSet.size() == 1 || + DT->dominates(Base->getParent(), OrigMatInsertBB)) { + emitBaseConstants(Base, RCI.Offset, U); + ReBasesNum++; + } + } + } + UsesNum = Uses; - // Emit materialization code for all rebased constants. - for (auto const &RCI : ConstInfo.RebasedConstants) { - NumConstantsRebased++; - for (auto const &U : RCI.Uses) - emitBaseConstants(Base, RCI.Offset, U); + // Use the same debug location as the last user of the constant. + assert(!Base->use_empty() && "The use list is empty!?"); + assert(isa(Base->user_back()) && + "All uses should be instructions."); + Base->setDebugLoc(cast(Base->user_back())->getDebugLoc()); } + (void)UsesNum; + (void)ReBasesNum; + // Expect all uses are rebased after rebase is done. + assert(UsesNum == ReBasesNum && "Not all uses are rebased"); + + NumConstantsHoisted++; - // Use the same debug location as the last user of the constant. - assert(!Base->use_empty() && "The use list is empty!?"); - assert(isa(Base->user_back()) && - "All uses should be instructions."); - Base->setDebugLoc(cast(Base->user_back())->getDebugLoc()); + // Base constant is also included in ConstInfo.RebasedConstants, so + // deduct 1 from ConstInfo.RebasedConstants.size(). + NumConstantsRebased = ConstInfo.RebasedConstants.size() - 1; - // Correct for base constant, which we counted above too. - NumConstantsRebased--; MadeChange = true; } return MadeChange; @@ -595,9 +742,11 @@ void ConstantHoistingPass::deleteDeadCastInst() const { /// \brief Optimize expensive integer constants in the given function. bool ConstantHoistingPass::runImpl(Function &Fn, TargetTransformInfo &TTI, - DominatorTree &DT, BasicBlock &Entry) { + DominatorTree &DT, BlockFrequencyInfo *BFI, + BasicBlock &Entry) { this->TTI = &TTI; this->DT = &DT; + this->BFI = BFI; this->Entry = &Entry; // Collect all constant candidates. collectConstantCandidates(Fn); @@ -628,7 +777,10 @@ PreservedAnalyses ConstantHoistingPass::run(Function &F, FunctionAnalysisManager &AM) { auto &DT = AM.getResult(F); auto &TTI = AM.getResult(F); - if (!runImpl(F, TTI, DT, F.getEntryBlock())) + auto BFI = ConstHoistWithBlockFrequency + ? 
&AM.getResult(F) + : nullptr; + if (!runImpl(F, TTI, DT, BFI, F.getEntryBlock())) return PreservedAnalyses::all(); PreservedAnalyses PA; diff --git a/lib/Transforms/Scalar/CorrelatedValuePropagation.cpp b/lib/Transforms/Scalar/CorrelatedValuePropagation.cpp index c843c61..b5a4cc2 100644 --- a/lib/Transforms/Scalar/CorrelatedValuePropagation.cpp +++ b/lib/Transforms/Scalar/CorrelatedValuePropagation.cpp @@ -12,8 +12,8 @@ //===----------------------------------------------------------------------===// #include "llvm/Transforms/Scalar/CorrelatedValuePropagation.h" -#include "llvm/Transforms/Scalar.h" #include "llvm/ADT/Statistic.h" +#include "llvm/Analysis/AssumptionCache.h" #include "llvm/Analysis/GlobalsModRef.h" #include "llvm/Analysis/InstructionSimplify.h" #include "llvm/Analysis/LazyValueInfo.h" @@ -26,6 +26,7 @@ #include "llvm/Pass.h" #include "llvm/Support/Debug.h" #include "llvm/Support/raw_ostream.h" +#include "llvm/Transforms/Scalar.h" #include "llvm/Transforms/Utils/Local.h" using namespace llvm; @@ -95,7 +96,8 @@ static bool processSelect(SelectInst *S, LazyValueInfo *LVI) { return true; } -static bool processPHI(PHINode *P, LazyValueInfo *LVI) { +static bool processPHI(PHINode *P, LazyValueInfo *LVI, + const SimplifyQuery &SQ) { bool Changed = false; BasicBlock *BB = P->getParent(); @@ -149,9 +151,7 @@ static bool processPHI(PHINode *P, LazyValueInfo *LVI) { Changed = true; } - // FIXME: Provide TLI, DT, AT to SimplifyInstruction. - const DataLayout &DL = BB->getModule()->getDataLayout(); - if (Value *V = SimplifyInstruction(P, DL)) { + if (Value *V = SimplifyInstruction(P, SQ.getWithInstruction(P))) { P->replaceAllUsesWith(V); P->eraseFromParent(); Changed = true; @@ -488,9 +488,8 @@ static Constant *getConstantAt(Value *V, Instruction *At, LazyValueInfo *LVI) { ConstantInt::getFalse(C->getContext()); } -static bool runImpl(Function &F, LazyValueInfo *LVI) { +static bool runImpl(Function &F, LazyValueInfo *LVI, const SimplifyQuery &SQ) { bool FnChanged = false; - // Visiting in a pre-order depth-first traversal causes us to simplify early // blocks before querying later blocks (which require us to analyze early // blocks). Eagerly simplifying shallow blocks means there is strictly less @@ -505,7 +504,7 @@ static bool runImpl(Function &F, LazyValueInfo *LVI) { BBChanged |= processSelect(cast(II), LVI); break; case Instruction::PHI: - BBChanged |= processPHI(cast(II), LVI); + BBChanged |= processPHI(cast(II), LVI, SQ); break; case Instruction::ICmp: case Instruction::FCmp: @@ -566,14 +565,25 @@ bool CorrelatedValuePropagation::runOnFunction(Function &F) { return false; LazyValueInfo *LVI = &getAnalysis().getLVI(); - return runImpl(F, LVI); + auto *DTWP = getAnalysisIfAvailable(); + auto *DT = DTWP ? &DTWP->getDomTree() : nullptr; + auto *TLIWP = getAnalysisIfAvailable(); + auto *TLI = TLIWP ? &TLIWP->getTLI() : nullptr; + auto *ACWP = getAnalysisIfAvailable(); + auto *AC = ACWP ? 
&ACWP->getAssumptionCache(F) : nullptr; + const SimplifyQuery SQ(F.getParent()->getDataLayout(), TLI, DT, AC); + return runImpl(F, LVI, SQ); } PreservedAnalyses CorrelatedValuePropagationPass::run(Function &F, FunctionAnalysisManager &AM) { LazyValueInfo *LVI = &AM.getResult(F); - bool Changed = runImpl(F, LVI); + auto *DT = AM.getCachedResult(F); + auto *TLI = AM.getCachedResult(F); + auto *AC = AM.getCachedResult(F); + const SimplifyQuery SQ(F.getParent()->getDataLayout(), TLI, DT, AC); + bool Changed = runImpl(F, LVI, SQ); if (!Changed) return PreservedAnalyses::all(); diff --git a/lib/Transforms/Scalar/GuardWidening.cpp b/lib/Transforms/Scalar/GuardWidening.cpp index 7019287..48eda09 100644 --- a/lib/Transforms/Scalar/GuardWidening.cpp +++ b/lib/Transforms/Scalar/GuardWidening.cpp @@ -51,6 +51,7 @@ #include "llvm/IR/IntrinsicInst.h" #include "llvm/IR/PatternMatch.h" #include "llvm/Support/Debug.h" +#include "llvm/Support/KnownBits.h" #include "llvm/Transforms/Scalar.h" using namespace llvm; @@ -537,9 +538,9 @@ bool GuardWideningImpl::parseRangeChecks( } else if (match(Check.getBase(), m_Or(m_Value(OpLHS), m_ConstantInt(OpRHS)))) { unsigned BitWidth = OpLHS->getType()->getScalarSizeInBits(); - APInt KnownZero(BitWidth, 0), KnownOne(BitWidth, 0); - computeKnownBits(OpLHS, KnownZero, KnownOne, DL); - if ((OpRHS->getValue() & KnownZero) == OpRHS->getValue()) { + KnownBits Known(BitWidth); + computeKnownBits(OpLHS, Known, DL); + if ((OpRHS->getValue() & Known.Zero) == OpRHS->getValue()) { Check.setBase(OpLHS); APInt NewOffset = Check.getOffsetValue() + OpRHS->getValue(); Check.setOffset(ConstantInt::get(Ctx, NewOffset)); diff --git a/lib/Transforms/Scalar/InferAddressSpaces.cpp b/lib/Transforms/Scalar/InferAddressSpaces.cpp index 5d87014..9e25638 100644 --- a/lib/Transforms/Scalar/InferAddressSpaces.cpp +++ b/lib/Transforms/Scalar/InferAddressSpaces.cpp @@ -152,15 +152,15 @@ private: Function *F) const; void appendsFlatAddressExpressionToPostorderStack( - Value *V, std::vector> *PostorderStack, - DenseSet *Visited) const; + Value *V, std::vector> &PostorderStack, + DenseSet &Visited) const; bool rewriteIntrinsicOperands(IntrinsicInst *II, Value *OldV, Value *NewV) const; void collectRewritableIntrinsicOperands( IntrinsicInst *II, - std::vector> *PostorderStack, - DenseSet *Visited) const; + std::vector> &PostorderStack, + DenseSet &Visited) const; std::vector collectFlatAddressExpressions(Function &F) const; @@ -204,7 +204,6 @@ static bool isAddressExpression(const Value &V) { // // Precondition: V is an address expression. static SmallVector getPointerOperands(const Value &V) { - assert(isAddressExpression(V)); const Operator &Op = cast(V); switch (Op.getOpcode()) { case Instruction::PHI: { @@ -254,8 +253,8 @@ bool InferAddressSpaces::rewriteIntrinsicOperands(IntrinsicInst *II, // TODO: Move logic to TTI? void InferAddressSpaces::collectRewritableIntrinsicOperands( - IntrinsicInst *II, std::vector> *PostorderStack, - DenseSet *Visited) const { + IntrinsicInst *II, std::vector> &PostorderStack, + DenseSet &Visited) const { switch (II->getIntrinsicID()) { case Intrinsic::objectsize: case Intrinsic::amdgcn_atomic_inc: @@ -272,13 +271,13 @@ void InferAddressSpaces::collectRewritableIntrinsicOperands( // If V is an unvisited flat address expression, appends V to PostorderStack // and marks it as visited. 
void InferAddressSpaces::appendsFlatAddressExpressionToPostorderStack( - Value *V, std::vector> *PostorderStack, - DenseSet *Visited) const { + Value *V, std::vector> &PostorderStack, + DenseSet &Visited) const { assert(V->getType()->isPointerTy()); if (isAddressExpression(*V) && V->getType()->getPointerAddressSpace() == FlatAddrSpace) { - if (Visited->insert(V).second) - PostorderStack->push_back(std::make_pair(V, false)); + if (Visited.insert(V).second) + PostorderStack.push_back(std::make_pair(V, false)); } } @@ -293,14 +292,18 @@ InferAddressSpaces::collectFlatAddressExpressions(Function &F) const { DenseSet Visited; auto PushPtrOperand = [&](Value *Ptr) { - appendsFlatAddressExpressionToPostorderStack(Ptr, &PostorderStack, - &Visited); + appendsFlatAddressExpressionToPostorderStack(Ptr, PostorderStack, + Visited); }; - // We only explore address expressions that are reachable from loads and - // stores for now because we aim at generating faster loads and stores. + // Look at operations that may be interesting accelerate by moving to a known + // address space. We aim at generating after loads and stores, but pure + // addressing calculations may also be faster. for (Instruction &I : instructions(F)) { - if (auto *LI = dyn_cast(&I)) + if (auto *GEP = dyn_cast(&I)) { + if (!GEP->getType()->isVectorTy()) + PushPtrOperand(GEP->getPointerOperand()); + } else if (auto *LI = dyn_cast(&I)) PushPtrOperand(LI->getPointerOperand()); else if (auto *SI = dyn_cast(&I)) PushPtrOperand(SI->getPointerOperand()); @@ -316,7 +319,7 @@ InferAddressSpaces::collectFlatAddressExpressions(Function &F) const { if (auto *MTI = dyn_cast(MI)) PushPtrOperand(MTI->getRawSource()); } else if (auto *II = dyn_cast(&I)) - collectRewritableIntrinsicOperands(II, &PostorderStack, &Visited); + collectRewritableIntrinsicOperands(II, PostorderStack, Visited); else if (ICmpInst *Cmp = dyn_cast(&I)) { // FIXME: Handle vectors of pointers if (Cmp->getOperand(0)->getType()->isPointerTy()) { @@ -338,8 +341,8 @@ InferAddressSpaces::collectFlatAddressExpressions(Function &F) const { // Otherwise, adds its operands to the stack and explores them. PostorderStack.back().second = true; for (Value *PtrOperand : getPointerOperands(*PostorderStack.back().first)) { - appendsFlatAddressExpressionToPostorderStack(PtrOperand, &PostorderStack, - &Visited); + appendsFlatAddressExpressionToPostorderStack(PtrOperand, PostorderStack, + Visited); } } return Postorder; diff --git a/lib/Transforms/Scalar/JumpThreading.cpp b/lib/Transforms/Scalar/JumpThreading.cpp index 08eb95a..a0da816 100644 --- a/lib/Transforms/Scalar/JumpThreading.cpp +++ b/lib/Transforms/Scalar/JumpThreading.cpp @@ -1289,6 +1289,36 @@ bool JumpThreadingPass::ProcessThreadableEdges(Value *Cond, BasicBlock *BB, if (PredToDestList.empty()) return false; + // If all the predecessors go to a single known successor, we want to fold, + // not thread. By doing so, we do not need to duplicate the current block and + // also miss potential opportunities in case we dont/cant duplicate. + if (OnlyDest && OnlyDest != MultipleDestSentinel) { + if (PredToDestList.size() == + (size_t)std::distance(pred_begin(BB), pred_end(BB))) { + bool SeenFirstBranchToOnlyDest = false; + for (BasicBlock *SuccBB : successors(BB)) { + if (SuccBB == OnlyDest && !SeenFirstBranchToOnlyDest) + SeenFirstBranchToOnlyDest = true; // Don't modify the first branch. + else + SuccBB->removePredecessor(BB, true); // This is unreachable successor. + } + + // Finally update the terminator. 
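The postorder collection in InferAddressSpaces above guards every push with the visited set, relying on insert() reporting whether the element was new. The same idiom in standalone form with standard containers (the value type is just an int here):

.. code-block:: c++

  #include <cassert>
  #include <set>
  #include <utility>
  #include <vector>

  int main() {
    std::set<int> Visited;
    std::vector<std::pair<int, bool>> PostorderStack;
    auto Push = [&](int V) {
      if (Visited.insert(V).second)           // true only the first time
        PostorderStack.push_back({V, false}); // false = operands not yet visited
    };
    Push(1);
    Push(2);
    Push(1); // duplicate push is dropped
    assert(PostorderStack.size() == 2);
    return 0;
  }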
+ TerminatorInst *Term = BB->getTerminator(); + BranchInst::Create(OnlyDest, Term); + Term->eraseFromParent(); + + // If the condition is now dead due to the removal of the old terminator, + // erase it. + auto *CondInst = dyn_cast(Cond); + if (CondInst && CondInst->use_empty()) + CondInst->eraseFromParent(); + // FIXME: in case this instruction is defined in the current BB and it + // resolves to a single value from all predecessors, we can do RAUW. + return true; + } + } + // Determine which is the most common successor. If we have many inputs and // this block is a switch, we want to start by threading the batch that goes // to the most popular destination first. If we only know about one diff --git a/lib/Transforms/Scalar/LoopIdiomRecognize.cpp b/lib/Transforms/Scalar/LoopIdiomRecognize.cpp index 946d85d..5042fc1 100644 --- a/lib/Transforms/Scalar/LoopIdiomRecognize.cpp +++ b/lib/Transforms/Scalar/LoopIdiomRecognize.cpp @@ -345,6 +345,11 @@ bool LoopIdiomRecognize::isLegalStore(StoreInst *SI, bool &ForMemset, if (!SI->isSimple()) return false; + // Don't convert stores of non-integral pointer types to memsets (which stores + // integers). + if (DL->isNonIntegralPointerType(SI->getValueOperand()->getType())) + return false; + // Avoid merging nontemporal stores. if (SI->getMetadata(LLVMContext::MD_nontemporal)) return false; diff --git a/lib/Transforms/Scalar/LoopRotation.cpp b/lib/Transforms/Scalar/LoopRotation.cpp index e568936..8ce96cf 100644 --- a/lib/Transforms/Scalar/LoopRotation.cpp +++ b/lib/Transforms/Scalar/LoopRotation.cpp @@ -58,13 +58,14 @@ class LoopRotate { AssumptionCache *AC; DominatorTree *DT; ScalarEvolution *SE; + const SimplifyQuery &SQ; public: LoopRotate(unsigned MaxHeaderSize, LoopInfo *LI, const TargetTransformInfo *TTI, AssumptionCache *AC, - DominatorTree *DT, ScalarEvolution *SE) - : MaxHeaderSize(MaxHeaderSize), LI(LI), TTI(TTI), AC(AC), DT(DT), SE(SE) { - } + DominatorTree *DT, ScalarEvolution *SE, const SimplifyQuery &SQ) + : MaxHeaderSize(MaxHeaderSize), LI(LI), TTI(TTI), AC(AC), DT(DT), SE(SE), + SQ(SQ) {} bool processLoop(Loop *L); private: @@ -311,8 +312,6 @@ bool LoopRotate::rotateLoop(Loop *L, bool SimplifiedLatch) { for (; PHINode *PN = dyn_cast(I); ++I) ValueMap[PN] = PN->getIncomingValueForBlock(OrigPreheader); - const DataLayout &DL = L->getHeader()->getModule()->getDataLayout(); - // For the rest of the instructions, either hoist to the OrigPreheader if // possible or create a clone in the OldPreHeader if not. TerminatorInst *LoopEntryBranch = OrigPreheader->getTerminator(); @@ -342,8 +341,7 @@ bool LoopRotate::rotateLoop(Loop *L, bool SimplifiedLatch) { // With the operands remapped, see if the instruction constant folds or is // otherwise simplifyable. This commonly occurs because the entry from PHI // nodes allows icmps and other instructions to fold. - // FIXME: Provide TLI, DT, AC to SimplifyInstruction. - Value *V = SimplifyInstruction(C, DL); + Value *V = SimplifyInstruction(C, SQ.getWithInstruction(C)); if (V && LI->replacementPreservesLCSSAForm(C, V)) { // If so, then delete the temporary instruction and stick the folded value // in the map. @@ -671,7 +669,9 @@ PreservedAnalyses LoopRotatePass::run(Loop &L, LoopAnalysisManager &AM, LoopStandardAnalysisResults &AR, LPMUpdater &) { int Threshold = EnableHeaderDuplication ? 
DefaultRotationThreshold : 0; - LoopRotate LR(Threshold, &AR.LI, &AR.TTI, &AR.AC, &AR.DT, &AR.SE); + const DataLayout &DL = L.getHeader()->getModule()->getDataLayout(); + const SimplifyQuery SQ(DL, &AR.TLI, &AR.DT, &AR.AC); + LoopRotate LR(Threshold, &AR.LI, &AR.TTI, &AR.AC, &AR.DT, &AR.SE, SQ); bool Changed = LR.processLoop(&L); if (!Changed) @@ -714,7 +714,11 @@ public: auto *DT = DTWP ? &DTWP->getDomTree() : nullptr; auto *SEWP = getAnalysisIfAvailable(); auto *SE = SEWP ? &SEWP->getSE() : nullptr; - LoopRotate LR(MaxHeaderSize, LI, TTI, AC, DT, SE); + auto *TLIWP = getAnalysisIfAvailable(); + auto *TLI = TLIWP ? &TLIWP->getTLI() : nullptr; + const DataLayout &DL = L->getHeader()->getModule()->getDataLayout(); + const SimplifyQuery SQ(DL, TLI, DT, AC); + LoopRotate LR(MaxHeaderSize, LI, TTI, AC, DT, SE, SQ); return LR.processLoop(L); } }; diff --git a/lib/Transforms/Scalar/LoopUnswitch.cpp b/lib/Transforms/Scalar/LoopUnswitch.cpp index a99c999..8fa806a 100644 --- a/lib/Transforms/Scalar/LoopUnswitch.cpp +++ b/lib/Transforms/Scalar/LoopUnswitch.cpp @@ -8,7 +8,7 @@ //===----------------------------------------------------------------------===// // // This pass transforms loops that contain branches on loop-invariant conditions -// to have multiple loops. For example, it turns the left into the right code: +// to multiple loops. For example, it turns the left into the right code: // // for (...) if (lic) // A for (...) diff --git a/lib/Transforms/Scalar/StructurizeCFG.cpp b/lib/Transforms/Scalar/StructurizeCFG.cpp index 659353e..49ce026 100644 --- a/lib/Transforms/Scalar/StructurizeCFG.cpp +++ b/lib/Transforms/Scalar/StructurizeCFG.cpp @@ -352,20 +352,10 @@ Value *StructurizeCFG::invert(Value *Condition) { if (Instruction *Inst = dyn_cast(Condition)) { // Third: Check all the users for an invert BasicBlock *Parent = Inst->getParent(); - for (User *U : Condition->users()) { - if (Instruction *I = dyn_cast(U)) { + for (User *U : Condition->users()) + if (Instruction *I = dyn_cast(U)) if (I->getParent() == Parent && match(I, m_Not(m_Specific(Condition)))) return I; - } - } - - // Avoid creating a new instruction in the common case of a compare. - if (CmpInst *Cmp = dyn_cast(Inst)) { - if (Cmp->hasOneUse()) { - Cmp->setPredicate(Cmp->getInversePredicate()); - return Cmp; - } - } // Last option: Create a new instruction return BinaryOperator::CreateNot(Condition, "", Parent->getTerminator()); diff --git a/lib/Transforms/Utils/BypassSlowDivision.cpp b/lib/Transforms/Utils/BypassSlowDivision.cpp index 1cfe3bd..7ffdad5 100644 --- a/lib/Transforms/Utils/BypassSlowDivision.cpp +++ b/lib/Transforms/Utils/BypassSlowDivision.cpp @@ -22,6 +22,7 @@ #include "llvm/IR/Function.h" #include "llvm/IR/IRBuilder.h" #include "llvm/IR/Instructions.h" +#include "llvm/Support/KnownBits.h" #include "llvm/Transforms/Utils/Local.h" using namespace llvm; @@ -256,14 +257,14 @@ ValueRange FastDivInsertionTask::getValueRange(Value *V, unsigned HiBits = LongLen - ShortLen; const DataLayout &DL = SlowDivOrRem->getModule()->getDataLayout(); - APInt Zeros(LongLen, 0), Ones(LongLen, 0); + KnownBits Known(LongLen); - computeKnownBits(V, Zeros, Ones, DL); + computeKnownBits(V, Known, DL); - if (Zeros.countLeadingOnes() >= HiBits) + if (Known.Zero.countLeadingOnes() >= HiBits) return VALRNG_KNOWN_SHORT; - if (Ones.countLeadingZeros() < HiBits) + if (Known.One.countLeadingZeros() < HiBits) return VALRNG_LIKELY_LONG; // Long integer divisions are often used in hashtable implementations. 
It's diff --git a/lib/Transforms/Utils/CodeExtractor.cpp b/lib/Transforms/Utils/CodeExtractor.cpp index 8255268..ed72099 100644 --- a/lib/Transforms/Utils/CodeExtractor.cpp +++ b/lib/Transforms/Utils/CodeExtractor.cpp @@ -73,24 +73,26 @@ bool CodeExtractor::isBlockValidForExtraction(const BasicBlock &BB) { } /// \brief Build a set of blocks to extract if the input blocks are viable. -template -static SetVector buildExtractionBlockSet(IteratorT BBBegin, - IteratorT BBEnd) { +static SetVector +buildExtractionBlockSet(ArrayRef BBs, DominatorTree *DT) { + assert(!BBs.empty() && "The set of blocks to extract must be non-empty"); SetVector Result; - assert(BBBegin != BBEnd); - // Loop over the blocks, adding them to our set-vector, and aborting with an // empty set if we encounter invalid blocks. - do { - if (!Result.insert(*BBBegin)) - llvm_unreachable("Repeated basic blocks in extraction input"); + for (BasicBlock *BB : BBs) { - if (!CodeExtractor::isBlockValidForExtraction(**BBBegin)) { + // If this block is dead, don't process it. + if (DT && !DT->isReachableFromEntry(BB)) + continue; + + if (!Result.insert(BB)) + llvm_unreachable("Repeated basic blocks in extraction input"); + if (!CodeExtractor::isBlockValidForExtraction(*BB)) { Result.clear(); return Result; } - } while (++BBBegin != BBEnd); + } #ifndef NDEBUG for (SetVector::iterator I = std::next(Result.begin()), @@ -106,23 +108,17 @@ static SetVector buildExtractionBlockSet(IteratorT BBBegin, return Result; } -/// \brief Helper to call buildExtractionBlockSet with an ArrayRef. -static SetVector -buildExtractionBlockSet(ArrayRef BBs) { - return buildExtractionBlockSet(BBs.begin(), BBs.end()); -} - CodeExtractor::CodeExtractor(ArrayRef BBs, DominatorTree *DT, bool AggregateArgs, BlockFrequencyInfo *BFI, BranchProbabilityInfo *BPI) : DT(DT), AggregateArgs(AggregateArgs || AggregateArgsOpt), BFI(BFI), - BPI(BPI), Blocks(buildExtractionBlockSet(BBs)), NumExitBlocks(~0U) {} + BPI(BPI), Blocks(buildExtractionBlockSet(BBs, DT)), NumExitBlocks(~0U) {} CodeExtractor::CodeExtractor(DominatorTree &DT, Loop &L, bool AggregateArgs, BlockFrequencyInfo *BFI, BranchProbabilityInfo *BPI) : DT(&DT), AggregateArgs(AggregateArgs || AggregateArgsOpt), BFI(BFI), - BPI(BPI), Blocks(buildExtractionBlockSet(L.getBlocks())), + BPI(BPI), Blocks(buildExtractionBlockSet(L.getBlocks(), &DT)), NumExitBlocks(~0U) {} /// definedInRegion - Return true if the specified value is defined in the @@ -194,9 +190,7 @@ void CodeExtractor::severSplitPHINodes(BasicBlock *&Header) { // containing PHI nodes merging values from outside of the region, and a // second that contains all of the code for the block and merges back any // incoming values from inside of the region. - BasicBlock::iterator AfterPHIs = Header->getFirstNonPHI()->getIterator(); - BasicBlock *NewBB = Header->splitBasicBlock(AfterPHIs, - Header->getName()+".ce"); + BasicBlock *NewBB = llvm::SplitBlock(Header, Header->getFirstNonPHI(), DT); // We only want to code extract the second block now, and it becomes the new // header of the region. @@ -205,11 +199,6 @@ void CodeExtractor::severSplitPHINodes(BasicBlock *&Header) { Blocks.insert(NewBB); Header = NewBB; - // Okay, update dominator sets. The blocks that dominate the new one are the - // blocks that dominate TIBB plus the new block itself. - if (DT) - DT->splitBlock(NewBB); - // Okay, now we need to adjust the PHI nodes and any branches from within the // region to go to the new header block instead of the old header block. 
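The BypassSlowDivision change above asks the same question as before, now phrased through KnownBits: if enough leading bits of an operand are known zero, the division fits in the shorter type. A standalone sketch using a constant's plain leading zeros as its trivially known-zero bits (the helper is hand-rolled for the sketch):

.. code-block:: c++

  #include <cassert>
  #include <cstdint>

  int main() {
    uint64_t Dividend = 0x12345678u; // fits in 32 bits
    uint64_t Divisor = 10;
    const unsigned ShortLen = 32, LongLen = 64, HiBits = LongLen - ShortLen;

    auto LeadingZeros = [](uint64_t V) {
      unsigned N = 0;
      for (int I = 63; I >= 0 && !((V >> I) & 1); --I)
        ++N;
      return N;
    };
    bool KnownShort =
        LeadingZeros(Dividend) >= HiBits && LeadingZeros(Divisor) >= HiBits;
    assert(KnownShort);
    // The narrow divide then agrees with the wide one.
    assert(uint64_t(uint32_t(Dividend) / uint32_t(Divisor)) ==
           Dividend / Divisor);
    return 0;
  }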
if (NumPredsFromRegion) { @@ -224,12 +213,14 @@ void CodeExtractor::severSplitPHINodes(BasicBlock *&Header) { // Okay, everything within the region is now branching to the right block, we // just have to update the PHI nodes now, inserting PHI nodes into NewBB. + BasicBlock::iterator AfterPHIs; for (AfterPHIs = OldPred->begin(); isa(AfterPHIs); ++AfterPHIs) { PHINode *PN = cast(AfterPHIs); // Create a new PHI node in the new region, which has an incoming value // from OldPred of PN. PHINode *NewPN = PHINode::Create(PN->getType(), 1 + NumPredsFromRegion, PN->getName() + ".ce", &NewBB->front()); + PN->replaceAllUsesWith(NewPN); NewPN->addIncoming(PN, OldPred); // Loop over all of the incoming value in PN, moving them to NewPN if they diff --git a/lib/Transforms/Utils/Local.cpp b/lib/Transforms/Utils/Local.cpp index 8c54427..d3002c5 100644 --- a/lib/Transforms/Utils/Local.cpp +++ b/lib/Transforms/Utils/Local.cpp @@ -45,6 +45,7 @@ #include "llvm/IR/PatternMatch.h" #include "llvm/IR/ValueHandle.h" #include "llvm/Support/Debug.h" +#include "llvm/Support/KnownBits.h" #include "llvm/Support/MathExtras.h" #include "llvm/Support/raw_ostream.h" using namespace llvm; @@ -1038,9 +1039,9 @@ unsigned llvm::getOrEnforceKnownAlignment(Value *V, unsigned PrefAlign, "getOrEnforceKnownAlignment expects a pointer!"); unsigned BitWidth = DL.getPointerTypeSizeInBits(V->getType()); - APInt KnownZero(BitWidth, 0), KnownOne(BitWidth, 0); - computeKnownBits(V, KnownZero, KnownOne, DL, 0, AC, CxtI, DT); - unsigned TrailZ = KnownZero.countTrailingOnes(); + KnownBits Known(BitWidth); + computeKnownBits(V, Known, DL, 0, AC, CxtI, DT); + unsigned TrailZ = Known.Zero.countTrailingOnes(); // Avoid trouble with ridiculously large TrailZ values, such as // those computed from a null pointer. @@ -1268,21 +1269,37 @@ static void appendOffset(SmallVectorImpl &Ops, int64_t Offset) { } } -/// Prepend \p DIExpr with a deref and offset operation. +enum { WithStackValue = true }; + +/// Prepend \p DIExpr with a deref and offset operation and optionally turn it +/// into a stack value. static DIExpression *prependDIExpr(DIBuilder &Builder, DIExpression *DIExpr, - bool Deref, int64_t Offset) { - if (!Deref && !Offset) + bool Deref, int64_t Offset = 0, + bool StackValue = false) { + if (!Deref && !Offset && !StackValue) return DIExpr; - // Create a copy of the original DIDescriptor for user variable, prepending - // "deref" operation to a list of address elements, as new llvm.dbg.declare - // will take a value storing address of the memory for variable, not - // alloca itself. - SmallVector Ops; + + SmallVector Ops; + appendOffset(Ops, Offset); if (Deref) Ops.push_back(dwarf::DW_OP_deref); - appendOffset(Ops, Offset); if (DIExpr) - Ops.append(DIExpr->elements_begin(), DIExpr->elements_end()); + for (auto Op : DIExpr->expr_ops()) { + // A DW_OP_stack_value comes at the end, but before a DW_OP_LLVM_fragment. 
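getOrEnforceKnownAlignment, updated above, turns trailing known-zero bits of a pointer value into an alignment: countTrailingOnes of the known-zero mask is the number of low bits proven zero, and two to that power is a safe alignment. A standalone sketch with a made-up mask:

.. code-block:: c++

  #include <cassert>
  #include <cstdint>

  int main() {
    // Suppose the low four bits of a pointer value are known to be zero.
    uint64_t KnownZero = 0xF;
    unsigned TrailZ = 0;
    while ((KnownZero >> TrailZ) & 1) // countTrailingOnes of the mask
      ++TrailZ;
    unsigned Align = 1u << TrailZ;
    assert(TrailZ == 4 && Align == 16);
    return 0;
  }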
+ if (StackValue) { + if (Op.getOp() == dwarf::DW_OP_stack_value) + StackValue = false; + else if (Op.getOp() == dwarf::DW_OP_LLVM_fragment) { + Ops.push_back(dwarf::DW_OP_stack_value); + StackValue = false; + } + } + Ops.push_back(Op.getOp()); + for (unsigned I = 0; I < Op.getNumArgs(); ++I) + Ops.push_back(Op.getArg(I)); + } + if (StackValue) + Ops.push_back(dwarf::DW_OP_stack_value); return Builder.createExpression(Ops); } @@ -1374,12 +1391,15 @@ void llvm::salvageDebugInfo(Instruction &I) { unsigned BitWidth = M.getDataLayout().getPointerSizeInBits(GEP->getPointerAddressSpace()); APInt Offset(BitWidth, 0); - // Rewrite a constant GEP into a DIExpression. + // Rewrite a constant GEP into a DIExpression. Since we are performing + // arithmetic to compute the variable's *value* in the DIExpression, we + // need to mark the expression with a DW_OP_stack_value. if (GEP->accumulateConstantOffset(M.getDataLayout(), Offset)) { auto *DIExpr = DVI->getExpression(); DIBuilder DIB(M, /*AllowUnresolved*/ false); - // GEP offsets are i32 and thus alwaus fit into an int64_t. - DIExpr = prependDIExpr(DIB, DIExpr, NoDeref, Offset.getSExtValue()); + // GEP offsets are i32 and thus always fit into an int64_t. + DIExpr = prependDIExpr(DIB, DIExpr, NoDeref, Offset.getSExtValue(), + WithStackValue); DVI->setOperand(0, MDWrap(I.getOperand(0))); DVI->setOperand(3, MetadataAsValue::get(I.getContext(), DIExpr)); DEBUG(dbgs() << "SALVAGE: " << *DVI << '\n'); @@ -1391,7 +1411,7 @@ void llvm::salvageDebugInfo(Instruction &I) { // Rewrite the load into DW_OP_deref. auto *DIExpr = DVI->getExpression(); DIBuilder DIB(M, /*AllowUnresolved*/ false); - DIExpr = prependDIExpr(DIB, DIExpr, WithDeref, 0); + DIExpr = prependDIExpr(DIB, DIExpr, WithDeref); DVI->setOperand(0, MDWrap(I.getOperand(0))); DVI->setOperand(3, MetadataAsValue::get(I.getContext(), DIExpr)); DEBUG(dbgs() << "SALVAGE: " << *DVI << '\n'); diff --git a/lib/Transforms/Utils/LoopUnroll.cpp b/lib/Transforms/Utils/LoopUnroll.cpp index 3c669ce..43ab725 100644 --- a/lib/Transforms/Utils/LoopUnroll.cpp +++ b/lib/Transforms/Utils/LoopUnroll.cpp @@ -318,6 +318,10 @@ bool llvm::UnrollLoop(Loop *L, unsigned Count, unsigned TripCount, bool Force, return false; } + // The current loop unroll pass can only unroll loops with a single latch + // that's a conditional branch exiting the loop. + // FIXME: The implementation can be extended to work with more complicated + // cases, e.g. loops with multiple latches. BasicBlock *Header = L->getHeader(); BranchInst *BI = dyn_cast(LatchBlock->getTerminator()); @@ -328,6 +332,16 @@ bool llvm::UnrollLoop(Loop *L, unsigned Count, unsigned TripCount, bool Force, return false; } + auto CheckSuccessors = [&](unsigned S1, unsigned S2) { + return BI->getSuccessor(S1) == Header && !L->contains(BI->getSuccessor(S2)); + }; + + if (!CheckSuccessors(0, 1) && !CheckSuccessors(1, 0)) { + DEBUG(dbgs() << "Can't unroll; only loops with one conditional latch" + " exiting the loop can be unrolled\n"); + return false; + } + if (Header->hasAddressTaken()) { // The loop-rotate pass can be helpful to avoid this in many cases. DEBUG(dbgs() << diff --git a/lib/Transforms/Utils/LowerSwitch.cpp b/lib/Transforms/Utils/LowerSwitch.cpp index b375d51..8959e77 100644 --- a/lib/Transforms/Utils/LowerSwitch.cpp +++ b/lib/Transforms/Utils/LowerSwitch.cpp @@ -403,6 +403,14 @@ void LowerSwitch::processSwitchInst(SwitchInst *SI, Value *Val = SI->getCondition(); // The value we are switching on... 
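The new latch check in UnrollLoop above accepts only a conditional latch whose two successors are the header and a block outside the loop, in either order. A standalone sketch of that shape test over integer block ids (CheckSuccessors is reproduced on plain data, not the IR types):

.. code-block:: c++

  #include <cassert>
  #include <set>

  struct Branch { int Succ[2]; };

  static bool isUnrollableLatch(const Branch &BI, int Header,
                                const std::set<int> &LoopBlocks) {
    auto Check = [&](unsigned S1, unsigned S2) {
      return BI.Succ[S1] == Header && !LoopBlocks.count(BI.Succ[S2]);
    };
    return Check(0, 1) || Check(1, 0);
  }

  int main() {
    std::set<int> Loop = {1, 2, 3}; // block ids inside the loop; 1 = header
    assert(isUnrollableLatch({{1, 9}}, 1, Loop));  // back-edge plus exit: ok
    assert(!isUnrollableLatch({{2, 9}}, 1, Loop)); // does not branch to header
    assert(!isUnrollableLatch({{1, 3}}, 1, Loop)); // both successors in loop
    return 0;
  }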
BasicBlock* Default = SI->getDefaultDest(); + // Don't handle unreachable blocks. If there are successors with phis, this + // would leave them behind with missing predecessors. + if ((CurBlock != &F->getEntryBlock() && pred_empty(CurBlock)) || + CurBlock->getSinglePredecessor() == CurBlock) { + DeleteList.insert(CurBlock); + return; + } + // If there is only the default destination, just branch. if (!SI->getNumCases()) { BranchInst::Create(Default, CurBlock); diff --git a/lib/Transforms/Utils/SimplifyCFG.cpp b/lib/Transforms/Utils/SimplifyCFG.cpp index 2f575b9..f86e97b 100644 --- a/lib/Transforms/Utils/SimplifyCFG.cpp +++ b/lib/Transforms/Utils/SimplifyCFG.cpp @@ -60,6 +60,7 @@ #include "llvm/Support/CommandLine.h" #include "llvm/Support/Debug.h" #include "llvm/Support/ErrorHandling.h" +#include "llvm/Support/KnownBits.h" #include "llvm/Support/MathExtras.h" #include "llvm/Support/raw_ostream.h" #include "llvm/Transforms/Utils/BasicBlockUtils.h" @@ -3055,6 +3056,15 @@ static bool mergeConditionalStores(BranchInst *PBI, BranchInst *QBI) { BasicBlock *QFB = QBI->getSuccessor(1); BasicBlock *PostBB = QFB->getSingleSuccessor(); + // Make sure we have a good guess for PostBB. If QTB's only successor is + // QFB, then QFB is a better PostBB. + if (QTB->getSingleSuccessor() == QFB) + PostBB = QFB; + + // If we couldn't find a good PostBB, stop. + if (!PostBB) + return false; + bool InvertPCond = false, InvertQCond = false; // Canonicalize fallthroughs to the true branches. if (PFB == QBI->getParent()) { @@ -3079,8 +3089,7 @@ static bool mergeConditionalStores(BranchInst *PBI, BranchInst *QBI) { auto HasOnePredAndOneSucc = [](BasicBlock *BB, BasicBlock *P, BasicBlock *S) { return BB->getSinglePredecessor() == P && BB->getSingleSuccessor() == S; }; - if (!PostBB || - !HasOnePredAndOneSucc(PFB, PBI->getParent(), QBI->getParent()) || + if (!HasOnePredAndOneSucc(PFB, PBI->getParent(), QBI->getParent()) || !HasOnePredAndOneSucc(QFB, QBI->getParent(), PostBB)) return false; if ((PTB && !HasOnePredAndOneSucc(PTB, PBI->getParent(), QBI->getParent())) || @@ -3746,7 +3755,7 @@ bool SimplifyCFGOpt::SimplifyCommonResume(ResumeInst *RI) { if (!isa(I)) return false; - SmallSet TrivialUnwindBlocks; + SmallSetVector TrivialUnwindBlocks; auto *PhiLPInst = cast(RI->getValue()); // Check incoming blocks to see if any of them are trivial. 
@@ -4359,8 +4368,8 @@ static bool EliminateDeadSwitchCases(SwitchInst *SI, AssumptionCache *AC, const DataLayout &DL) { Value *Cond = SI->getCondition(); unsigned Bits = Cond->getType()->getIntegerBitWidth(); - APInt KnownZero(Bits, 0), KnownOne(Bits, 0); - computeKnownBits(Cond, KnownZero, KnownOne, DL, 0, AC, SI); + KnownBits Known(Bits); + computeKnownBits(Cond, Known, DL, 0, AC, SI); // We can also eliminate cases by determining that their values are outside of // the limited range of the condition based on how many significant (non-sign) @@ -4372,7 +4381,7 @@ static bool EliminateDeadSwitchCases(SwitchInst *SI, AssumptionCache *AC, SmallVector DeadCases; for (auto &Case : SI->cases()) { APInt CaseVal = Case.getCaseValue()->getValue(); - if ((CaseVal & KnownZero) != 0 || (CaseVal & KnownOne) != KnownOne || + if (Known.Zero.intersects(CaseVal) || !Known.One.isSubsetOf(CaseVal) || (CaseVal.getMinSignedBits() > MaxSignificantBitsInCond)) { DeadCases.push_back(Case.getCaseValue()); DEBUG(dbgs() << "SimplifyCFG: switch case " << CaseVal << " is dead.\n"); @@ -4386,7 +4395,7 @@ static bool EliminateDeadSwitchCases(SwitchInst *SI, AssumptionCache *AC, bool HasDefault = !isa(SI->getDefaultDest()->getFirstNonPHIOrDbg()); const unsigned NumUnknownBits = - Bits - (KnownZero | KnownOne).countPopulation(); + Bits - (Known.Zero | Known.One).countPopulation(); assert(NumUnknownBits <= Bits); if (HasDefault && DeadCases.empty() && NumUnknownBits < 64 /* avoid overflow */ && diff --git a/lib/Transforms/Utils/SimplifyInstructions.cpp b/lib/Transforms/Utils/SimplifyInstructions.cpp index f607086..2737342 100644 --- a/lib/Transforms/Utils/SimplifyInstructions.cpp +++ b/lib/Transforms/Utils/SimplifyInstructions.cpp @@ -35,10 +35,8 @@ using namespace llvm; STATISTIC(NumSimplified, "Number of redundant instructions removed"); -static bool runImpl(Function &F, const DominatorTree *DT, - const TargetLibraryInfo *TLI, AssumptionCache *AC, +static bool runImpl(Function &F, const SimplifyQuery &SQ, OptimizationRemarkEmitter *ORE) { - const DataLayout &DL = F.getParent()->getDataLayout(); SmallPtrSet S1, S2, *ToSimplify = &S1, *Next = &S2; bool Changed = false; @@ -56,7 +54,8 @@ static bool runImpl(Function &F, const DominatorTree *DT, // Don't waste time simplifying unused instructions. if (!I->use_empty()) { - if (Value *V = SimplifyInstruction(I, DL, TLI, DT, AC, ORE)) { + if (Value *V = + SimplifyInstruction(I, SQ.getWithInstruction(I), ORE)) { // Mark all uses for resimplification next time round the loop. for (User *U : I->users()) Next->insert(cast(U)); @@ -65,7 +64,7 @@ static bool runImpl(Function &F, const DominatorTree *DT, Changed = true; } } - if (RecursivelyDeleteTriviallyDeadInstructions(I, TLI)) { + if (RecursivelyDeleteTriviallyDeadInstructions(I, SQ.TLI)) { // RecursivelyDeleteTriviallyDeadInstruction can remove more than one // instruction, so simply incrementing the iterator does not work. // When instructions get deleted re-iterate instead. 
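A minimal model of the dead-case test in the EliminateDeadSwitchCases hunk of SimplifyCFG.cpp above, with 32-bit masks standing in for KnownBits/APInt; the two conditions mirror Known.Zero.intersects(CaseVal) and !Known.One.isSubsetOf(CaseVal), and the values in main() are only illustrative.

  #include <cassert>
  #include <cstdint>

  // A switch case can never be taken if it sets a bit the condition is known
  // to have clear, or clears a bit the condition is known to have set.
  bool caseIsDead(uint32_t KnownZero, uint32_t KnownOne, uint32_t CaseVal) {
    bool SetsAKnownZeroBit = (CaseVal & KnownZero) != 0;        // Known.Zero.intersects(CaseVal)
    bool MissesAKnownOneBit = (CaseVal & KnownOne) != KnownOne; // !Known.One.isSubsetOf(CaseVal)
    return SetsAKnownZeroBit || MissesAKnownOneBit;
  }

  int main() {
    // Suppose the low two bits of the condition are known to be 0b01.
    uint32_t KnownZero = 0x2, KnownOne = 0x1;
    assert(caseIsDead(KnownZero, KnownOne, 2));  // 0b010 sets a known-zero bit
    assert(caseIsDead(KnownZero, KnownOne, 4));  // 0b100 clears the known-one bit
    assert(!caseIsDead(KnownZero, KnownOne, 5)); // 0b101 is still possible
    return 0;
  }
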
@@ -113,8 +112,9 @@ namespace { &getAnalysis().getAssumptionCache(F); OptimizationRemarkEmitter *ORE = &getAnalysis().getORE(); - - return runImpl(F, DT, TLI, AC, ORE); + const DataLayout &DL = F.getParent()->getDataLayout(); + const SimplifyQuery SQ(DL, TLI, DT, AC); + return runImpl(F, SQ, ORE); } }; } @@ -141,7 +141,9 @@ PreservedAnalyses InstSimplifierPass::run(Function &F, auto &TLI = AM.getResult(F); auto &AC = AM.getResult(F); auto &ORE = AM.getResult(F); - bool Changed = runImpl(F, &DT, &TLI, &AC, &ORE); + const DataLayout &DL = F.getParent()->getDataLayout(); + const SimplifyQuery SQ(DL, &TLI, &DT, &AC); + bool Changed = runImpl(F, SQ, &ORE); if (!Changed) return PreservedAnalyses::all(); diff --git a/lib/Transforms/Utils/SimplifyLibCalls.cpp b/lib/Transforms/Utils/SimplifyLibCalls.cpp index aa71e36..2c1c304 100644 --- a/lib/Transforms/Utils/SimplifyLibCalls.cpp +++ b/lib/Transforms/Utils/SimplifyLibCalls.cpp @@ -30,6 +30,7 @@ #include "llvm/IR/Module.h" #include "llvm/IR/PatternMatch.h" #include "llvm/Support/CommandLine.h" +#include "llvm/Support/KnownBits.h" #include "llvm/Transforms/Utils/BuildLibCalls.h" #include "llvm/Transforms/Utils/Local.h" @@ -37,10 +38,6 @@ using namespace llvm; using namespace PatternMatch; static cl::opt - ColdErrorCalls("error-reporting-is-cold", cl::init(true), cl::Hidden, - cl::desc("Treat error-reporting calls as cold")); - -static cl::opt EnableUnsafeFPShrink("enable-double-float-shrink", cl::Hidden, cl::init(false), cl::desc("Enable unsafe double to float " @@ -459,11 +456,9 @@ Value *LibCallSimplifier::optimizeStrLen(CallInst *CI, IRBuilder<> &B) { Value *Offset = GEP->getOperand(2); unsigned BitWidth = Offset->getType()->getIntegerBitWidth(); - APInt KnownZero(BitWidth, 0); - APInt KnownOne(BitWidth, 0); - computeKnownBits(Offset, KnownZero, KnownOne, DL, 0, nullptr, CI, - nullptr); - KnownZero.flipAllBits(); + KnownBits Known(BitWidth); + computeKnownBits(Offset, Known, DL, 0, nullptr, CI, nullptr); + Known.Zero.flipAllBits(); size_t ArrSize = cast(GEP->getSourceElementType())->getNumElements(); @@ -477,7 +472,7 @@ Value *LibCallSimplifier::optimizeStrLen(CallInst *CI, IRBuilder<> &B) { // optimize if we can prove that the program has undefined behavior when // Offset is outside that range. That is the case when GEP->getOperand(0) // is a pointer to an object whose memory extent is NullTermIdx+1. - if ((KnownZero.isNonNegative() && KnownZero.ule(NullTermIdx)) || + if ((Known.Zero.isNonNegative() && Known.Zero.ule(NullTermIdx)) || (GEP->isInBounds() && isa(GEP->getOperand(0)) && NullTermIdx == ArrSize - 1)) return B.CreateSub(ConstantInt::get(CI->getType(), NullTermIdx), @@ -846,6 +841,9 @@ static Value *foldMallocMemset(CallInst *Memset, IRBuilder<> &B, // Is the inner call really malloc()? Function *InnerCallee = Malloc->getCalledFunction(); + if (!InnerCallee) + return nullptr; + LibFunc Func; if (!TLI.getLibFunc(*InnerCallee, Func) || !TLI.has(Func) || Func != LibFunc_malloc) @@ -930,6 +928,24 @@ static Value *optimizeUnaryDoubleFP(CallInst *CI, IRBuilder<> &B, if (V == nullptr) return nullptr; + // If call isn't an intrinsic, check that it isn't within a function with the + // same name as the float version of this call. + // + // e.g. inline float expf(float val) { return (float) exp((double) val); } + // + // A similar such definition exists in the MinGW-w64 math.h header file which + // when compiled with -O2 -ffast-math causes the generation of infinite loops + // where expf is called. 
+ if (!Callee->isIntrinsic()) { + const Function *F = CI->getFunction(); + StringRef FName = F->getName(); + StringRef CalleeName = Callee->getName(); + if ((FName.size() == (CalleeName.size() + 1)) && + (FName.back() == 'f') && + FName.startswith(CalleeName)) + return nullptr; + } + // Propagate fast-math flags from the existing call to the new call. IRBuilder<>::FastMathFlagGuard Guard(B); B.setFastMathFlags(CI->getFastMathFlags()); @@ -1632,7 +1648,7 @@ Value *LibCallSimplifier::optimizeErrorReporting(CallInst *CI, IRBuilder<> &B, } static bool isReportingError(Function *Callee, CallInst *CI, int StreamArg) { - if (!ColdErrorCalls || !Callee || !Callee->isDeclaration()) + if (!Callee || !Callee->isDeclaration()) return false; if (StreamArg < 0) diff --git a/lib/Transforms/Vectorize/LoadStoreVectorizer.cpp b/lib/Transforms/Vectorize/LoadStoreVectorizer.cpp index 4409d7a..97dcb40 100644 --- a/lib/Transforms/Vectorize/LoadStoreVectorizer.cpp +++ b/lib/Transforms/Vectorize/LoadStoreVectorizer.cpp @@ -30,6 +30,7 @@ #include "llvm/IR/Value.h" #include "llvm/Support/CommandLine.h" #include "llvm/Support/Debug.h" +#include "llvm/Support/KnownBits.h" #include "llvm/Support/raw_ostream.h" #include "llvm/Transforms/Utils/Local.h" #include "llvm/Transforms/Vectorize.h" @@ -65,7 +66,9 @@ public: bool run(); private: - Value *getPointerOperand(Value *I); + Value *getPointerOperand(Value *I) const; + + GetElementPtrInst *getSourceGEP(Value *Src) const; unsigned getPointerAddressSpace(Value *I); @@ -215,7 +218,7 @@ bool Vectorizer::run() { return Changed; } -Value *Vectorizer::getPointerOperand(Value *I) { +Value *Vectorizer::getPointerOperand(Value *I) const { if (LoadInst *LI = dyn_cast(I)) return LI->getPointerOperand(); if (StoreInst *SI = dyn_cast(I)) @@ -231,6 +234,19 @@ unsigned Vectorizer::getPointerAddressSpace(Value *I) { return -1; } +GetElementPtrInst *Vectorizer::getSourceGEP(Value *Src) const { + // First strip pointer bitcasts. Make sure pointee size is the same with + // and without casts. + // TODO: a stride set by the add instruction below can match the difference + // in pointee type size here. Currently it will not be vectorized. + Value *SrcPtr = getPointerOperand(Src); + Value *SrcBase = SrcPtr->stripPointerCasts(); + if (DL.getTypeStoreSize(SrcPtr->getType()->getPointerElementType()) == + DL.getTypeStoreSize(SrcBase->getType()->getPointerElementType())) + SrcPtr = SrcBase; + return dyn_cast(SrcPtr); +} + // FIXME: Merge with llvm::isConsecutiveAccess bool Vectorizer::isConsecutiveAccess(Value *A, Value *B) { Value *PtrA = getPointerOperand(A); @@ -283,8 +299,8 @@ bool Vectorizer::isConsecutiveAccess(Value *A, Value *B) { // Look through GEPs after checking they're the same except for the last // index. - GetElementPtrInst *GEPA = dyn_cast(getPointerOperand(A)); - GetElementPtrInst *GEPB = dyn_cast(getPointerOperand(B)); + GetElementPtrInst *GEPA = getSourceGEP(A); + GetElementPtrInst *GEPB = getSourceGEP(B); if (!GEPA || !GEPB || GEPA->getNumOperands() != GEPB->getNumOperands()) return false; unsigned FinalIndex = GEPA->getNumOperands() - 1; @@ -328,11 +344,9 @@ bool Vectorizer::isConsecutiveAccess(Value *A, Value *B) { // If any bits are known to be zero other than the sign bit in OpA, we can // add 1 to it while guaranteeing no overflow of any sort. 
if (!Safe) { - APInt KnownZero(BitWidth, 0); - APInt KnownOne(BitWidth, 0); - computeKnownBits(OpA, KnownZero, KnownOne, DL, 0, nullptr, OpA, &DT); - KnownZero &= ~APInt::getHighBitsSet(BitWidth, 1); - if (KnownZero != 0) + KnownBits Known(BitWidth); + computeKnownBits(OpA, Known, DL, 0, nullptr, OpA, &DT); + if (Known.Zero.countTrailingZeros() < (BitWidth - 1)) Safe = true; } diff --git a/lib/Transforms/Vectorize/LoopVectorize.cpp b/lib/Transforms/Vectorize/LoopVectorize.cpp index 7eb8fab..87ce019 100644 --- a/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -3586,8 +3586,12 @@ void InnerLoopVectorizer::fixupIVUsers(PHINode *OrigPhi, IRBuilder<> B(MiddleBlock->getTerminator()); Value *CountMinusOne = B.CreateSub( CountRoundDown, ConstantInt::get(CountRoundDown->getType(), 1)); - Value *CMO = B.CreateSExtOrTrunc(CountMinusOne, II.getStep()->getType(), - "cast.cmo"); + Value *CMO = + !II.getStep()->getType()->isIntegerTy() + ? B.CreateCast(Instruction::SIToFP, CountMinusOne, + II.getStep()->getType()) + : B.CreateSExtOrTrunc(CountMinusOne, II.getStep()->getType()); + CMO->setName("cast.cmo"); Value *Escape = II.transform(B, CMO, PSE.getSE(), DL); Escape->setName("ind.escape"); MissingVals[UI] = Escape; @@ -4516,14 +4520,15 @@ void InnerLoopVectorizer::predicateInstructions() { for (auto KV : PredicatedInstructions) { BasicBlock::iterator I(KV.first); BasicBlock *Head = I->getParent(); - auto *BB = SplitBlock(Head, &*std::next(I), DT, LI); auto *T = SplitBlockAndInsertIfThen(KV.second, &*I, /*Unreachable=*/false, /*BranchWeights=*/nullptr, DT, LI); I->moveBefore(T); sinkScalarOperands(&*I); - I->getParent()->setName(Twine("pred.") + I->getOpcodeName() + ".if"); - BB->setName(Twine("pred.") + I->getOpcodeName() + ".continue"); + BasicBlock *PredicatedBlock = I->getParent(); + Twine BBNamePrefix = Twine("pred.") + I->getOpcodeName(); + PredicatedBlock->setName(BBNamePrefix + ".if"); + PredicatedBlock->getSingleSuccessor()->setName(BBNamePrefix + ".continue"); // If the instruction is non-void create a Phi node at reconvergence point. if (!I->getType()->isVoidTy()) { @@ -7324,8 +7329,16 @@ unsigned LoopVectorizationCostModel::getInstructionCost(Instruction *I, return TTI.getShuffleCost(TargetTransformInfo::SK_ExtractSubvector, VectorTy, VF - 1, VectorTy); - // TODO: IF-converted IFs become selects. - return 0; + // Phi nodes in non-header blocks (not inductions, reductions, etc.) are + // converted into select instructions. We require N - 1 selects per phi + // node, where N is the number of incoming values. + if (VF > 1 && Phi->getParent() != TheLoop->getHeader()) + return (Phi->getNumIncomingValues() - 1) * + TTI.getCmpSelInstrCost( + Instruction::Select, ToVectorTy(Phi->getType(), VF), + ToVectorTy(Type::getInt1Ty(Phi->getContext()), VF)); + + return TTI.getCFInstrCost(Instruction::PHI); } case Instruction::UDiv: case Instruction::SDiv: diff --git a/test/Analysis/IVUsers/quadradic-exit-value.ll b/test/Analysis/IVUsers/quadradic-exit-value.ll index 214afcb..6d4f1b0 100644 --- a/test/Analysis/IVUsers/quadradic-exit-value.ll +++ b/test/Analysis/IVUsers/quadradic-exit-value.ll @@ -36,7 +36,7 @@ exit: ; sure they aren't marked as post-inc users. 
; ; CHECK-LABEL: IV Users for loop %test2.loop -; CHECK-NO-LCSSA: %sext.us = {0,+,(16777216 + (-16777216 * %sub.us)),+,33554432}<%test2.loop> in %f = ashr i32 %sext.us, 24 +; CHECK-NO-LCSSA: %sext.us = {0,+,(16777216 + (-16777216 * %sub.us)),+,33554432}<%test2.loop> (post-inc with loop %test2.loop) in %f = ashr i32 %sext.us, 24 define i32 @test2() { entry: br label %test2.loop diff --git a/test/Analysis/ScalarEvolution/exponential-behavior.ll b/test/Analysis/ScalarEvolution/exponential-behavior.ll new file mode 100644 index 0000000..919521a --- /dev/null +++ b/test/Analysis/ScalarEvolution/exponential-behavior.ll @@ -0,0 +1,57 @@ +; RUN: opt -analyze -scalar-evolution < %s | FileCheck %s + +; CHECK: Printing analysis 'Scalar Evolution Analysis' for function 'f': + +; CHECK: Loop %loop: Unpredictable backedge-taken count. +; CHECK: Loop %loop: max backedge-taken count is 0 +; CHECK: Loop %loop: Unpredictable predicated backedge-taken count. + + +define void @f(i32 %n, i32* %ptr) { +entry: + br label %loop + +loop: + %iv = phi i32 [ 0, %entry ], [ %iv.inc, %be ] + %iv.inc = add i32 %iv, 1 + %unswitch_cond_root = icmp ne i32 %iv.inc, 42 + %us.0 = and i1 %unswitch_cond_root, %unswitch_cond_root + %us.1 = and i1 %us.0, %us.0 + %us.2 = and i1 %us.1, %us.1 + %us.3 = and i1 %us.2, %us.2 + %us.4 = and i1 %us.3, %us.3 + %us.5 = and i1 %us.4, %us.4 + %us.6 = and i1 %us.5, %us.5 + %us.7 = and i1 %us.6, %us.6 + %us.8 = and i1 %us.7, %us.7 + %us.9 = and i1 %us.8, %us.8 + %us.10 = and i1 %us.9, %us.9 + %us.11 = and i1 %us.10, %us.10 + %us.12 = and i1 %us.11, %us.11 + %us.13 = and i1 %us.12, %us.12 + %us.14 = and i1 %us.13, %us.13 + %us.15 = and i1 %us.14, %us.14 + %us.16 = and i1 %us.15, %us.15 + %us.17 = and i1 %us.16, %us.16 + %us.18 = and i1 %us.17, %us.17 + %us.19 = and i1 %us.18, %us.18 + %us.20 = and i1 %us.19, %us.19 + %us.21 = and i1 %us.20, %us.20 + %us.22 = and i1 %us.21, %us.21 + %us.23 = and i1 %us.22, %us.22 + %us.24 = and i1 %us.23, %us.23 + %us.25 = and i1 %us.24, %us.24 + %us.26 = and i1 %us.25, %us.25 + %us.27 = and i1 %us.26, %us.26 + %us.28 = and i1 %us.27, %us.27 + %us.29 = and i1 %us.28, %us.28 + br i1 %us.29, label %leave, label %be + +be: + store volatile i32 0, i32* %ptr + %becond = icmp ult i32 %iv.inc, %n + br i1 %becond, label %leave, label %loop + +leave: + ret void +} diff --git a/test/Analysis/ScalarEvolution/or-as-add.ll b/test/Analysis/ScalarEvolution/or-as-add.ll deleted file mode 100644 index ac4e65a..0000000 --- a/test/Analysis/ScalarEvolution/or-as-add.ll +++ /dev/null @@ -1,38 +0,0 @@ -; RUN: opt < %s -analyze -scalar-evolution | FileCheck %s - -declare void @z(i32) -declare void @z2(i64) - -define void @fun(i1 %bool, i32 %x) { -entry: - br label %body -body: - %i = phi i32 [ 0, %entry ], [ %i.next, %body ] - %bottom_zero = mul i32 %i, 2 - %a = or i32 %bottom_zero, 1 - call void @z(i32 %a) - %bool_ext = zext i1 %bool to i32 - %b = or i32 %bool_ext, %bottom_zero - call void @z(i32 %b) - %shifted = lshr i32 %x, 31 - %c = or i32 %shifted, %bottom_zero - call void @z(i32 %c) - %i_ext = zext i32 %i to i64 - %d = or i64 %i_ext, 4294967296 - call void @z2(i64 %d) - %i.next = add i32 %i, 1 - %cond = icmp eq i32 %i.next, 10 - br i1 %cond, label %exit, label %body -exit: - ret void -} - -; CHECK: %a = or i32 %bottom_zero, 1 -; CHECK-NEXT: --> {1,+,2}<%body> -; CHECK: %b = or i32 %bool_ext, %bottom_zero -; CHECK-NEXT: --> {(zext i1 %bool to i32),+,2} -; CHECK: %c = or i32 %shifted, %bottom_zero -; CHECK-NEXT: --> {(%x /u -2147483648),+,2}<%body> -; CHECK: %d = or i64 %i_ext, 
4294967296 -; CHECK-NEXT: --> {4294967296,+,1}<%body> - diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt index c166704..25c340f 100644 --- a/test/CMakeLists.txt +++ b/test/CMakeLists.txt @@ -45,6 +45,7 @@ set(LLVM_TEST_DEPENDS llvm-config llvm-cov llvm-cxxdump + llvm-cvtres llvm-diff llvm-dis llvm-dsymutil diff --git a/test/CodeGen/AArch64/GlobalISel/select-pr32733.mir b/test/CodeGen/AArch64/GlobalISel/select-pr32733.mir new file mode 100644 index 0000000..9643620 --- /dev/null +++ b/test/CodeGen/AArch64/GlobalISel/select-pr32733.mir @@ -0,0 +1,65 @@ +# RUN: llc -mtriple=aarch64-- -run-pass=instruction-select -verify-machineinstrs -global-isel %s -o - | FileCheck %s + +--- | + define i32 @main() { + entry: + ret i32 0 + } + + declare i32 @printf(i8*, ...) +... +--- +# CHECK-LABEL: name: main +name: main +alignment: 2 +exposesReturnsTwice: false +noVRegs: false +legalized: true +regBankSelected: true +selected: false +tracksRegLiveness: true +registers: + - { id: 0, class: gpr } + - { id: 1, class: gpr } + - { id: 2, class: gpr } + - { id: 3, class: gpr } + - { id: 4, class: gpr } + - { id: 5, class: gpr } + - { id: 6, class: gpr } + - { id: 7, class: gpr } + - { id: 8, class: gpr } + - { id: 9, class: gpr } + - { id: 10, class: gpr } + - { id: 11, class: gpr } + - { id: 12, class: gpr } + - { id: 13, class: gpr } + - { id: 14, class: gpr } + - { id: 15, class: gpr } +frameInfo: + isFrameAddressTaken: false + isReturnAddressTaken: false + hasStackMap: false + hasPatchPoint: false + stackSize: 0 + offsetAdjustment: 0 + maxAlignment: 8 + adjustsStack: false + hasCalls: true + maxCallFrameSize: 0 + hasOpaqueSPAdjustment: false + hasVAStart: false + hasMustTailInVarArgFunc: false +# CHECK: body: +# CHECK: %1 = COPY %w0 +# CHECK-NOT: %2 = ORNWrr %wzr, %1 +# CHECK: %4 = EONWrr %1, %3 +body: | + bb.1.entry: + liveins: %w0 + %0(s32) = G_CONSTANT i32 -1 + %3(s32) = G_CONSTANT i32 1 + %1(s32) = COPY %w0 + %2(s32) = G_XOR %1, %0 + %4(s32) = G_XOR %2, %3 + %w0 = COPY %4(s32) +... 
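
The arithmetic behind the EONWrr check in select-pr32733.mir above can be verified in isolation. This sketch only checks the scalar identity (x ^ -1) ^ 1 == x ^ ~1, assuming EON computes a ^ ~b; it says nothing about the instruction selector itself.

  #include <cassert>
  #include <cstdint>

  // The two xors in the test, %2 = x ^ -1 and %4 = %2 ^ 1, combine into a
  // single exclusive-OR-NOT, so no separate ORN of the constant -1 is needed.
  uint32_t twoXors(uint32_t X) { return (X ^ 0xffffffffu) ^ 1u; }
  uint32_t eon(uint32_t A, uint32_t B) { return A ^ ~B; }

  int main() {
    const uint32_t Vals[] = {0u, 1u, 42u, 0xdeadbeefu, 0xffffffffu};
    for (uint32_t X : Vals)
      assert(twoXors(X) == eon(X, 1u));
    return 0;
  }
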
diff --git a/test/CodeGen/AArch64/arm64-vmul.ll b/test/CodeGen/AArch64/arm64-vmul.ll index a5fa78a..a7668ec 100644 --- a/test/CodeGen/AArch64/arm64-vmul.ll +++ b/test/CodeGen/AArch64/arm64-vmul.ll @@ -1201,35 +1201,35 @@ define <2 x i64> @umlsl_lane_2d(<2 x i32>* %A, <2 x i32>* %B, <2 x i64>* %C) nou ; Scalar FMULX define float @fmulxs(float %a, float %b) nounwind { ; CHECK-LABEL: fmulxs: -; CHECKNEXT: fmulx s0, s0, s1 +; CHECK-NEXT: fmulx s0, s0, s1 %fmulx.i = tail call float @llvm.aarch64.neon.fmulx.f32(float %a, float %b) nounwind -; CHECKNEXT: ret +; CHECK-NEXT: ret ret float %fmulx.i } define double @fmulxd(double %a, double %b) nounwind { ; CHECK-LABEL: fmulxd: -; CHECKNEXT: fmulx d0, d0, d1 +; CHECK-NEXT: fmulx d0, d0, d1 %fmulx.i = tail call double @llvm.aarch64.neon.fmulx.f64(double %a, double %b) nounwind -; CHECKNEXT: ret +; CHECK-NEXT: ret ret double %fmulx.i } define float @fmulxs_lane(float %a, <4 x float> %vec) nounwind { ; CHECK-LABEL: fmulxs_lane: -; CHECKNEXT: fmulx.s s0, s0, v1[3] +; CHECK-NEXT: fmulx.s s0, s0, v1[3] %b = extractelement <4 x float> %vec, i32 3 %fmulx.i = tail call float @llvm.aarch64.neon.fmulx.f32(float %a, float %b) nounwind -; CHECKNEXT: ret +; CHECK-NEXT: ret ret float %fmulx.i } define double @fmulxd_lane(double %a, <2 x double> %vec) nounwind { ; CHECK-LABEL: fmulxd_lane: -; CHECKNEXT: fmulx d0, d0, v1[1] +; CHECK-NEXT: fmulx.d d0, d0, v1[1] %b = extractelement <2 x double> %vec, i32 1 %fmulx.i = tail call double @llvm.aarch64.neon.fmulx.f64(double %a, double %b) nounwind -; CHECKNEXT: ret +; CHECK-NEXT: ret ret double %fmulx.i } diff --git a/test/CodeGen/AArch64/fence-singlethread.ll b/test/CodeGen/AArch64/fence-singlethread.ll new file mode 100644 index 0000000..2ed7442 --- /dev/null +++ b/test/CodeGen/AArch64/fence-singlethread.ll @@ -0,0 +1,21 @@ +; RUN: llc -mtriple=aarch64-linux-gnu %s -o - | FileCheck %s --check-prefix=LINUX +; RUN: llc -mtriple=aarch64-apple-ios %s -o - | FileCheck %s --check-prefix=IOS +; RUN: llc -mtriple=aarch64-linux-gnueabihf %s -filetype=obj -o %t +; RUN: llvm-objdump -d %t | FileCheck %s --check-prefix=OBJ + +; OBJ-NOT: dmb + +define void @fence_singlethread() { +; LINUX-LABEL: fence_singlethread: +; LINUX-NOT: dmb +; LINUX: // COMPILER BARRIER +; LINUX-NOT: dmb + +; IOS-LABEL: fence_singlethread: +; IOS-NOT: dmb +; IOS: ; COMPILER BARRIER +; IOS-NOT: dmb + + fence singlethread seq_cst + ret void +} diff --git a/test/CodeGen/AArch64/optimize-imm.ll b/test/CodeGen/AArch64/optimize-imm.ll new file mode 100644 index 0000000..a4725c6 --- /dev/null +++ b/test/CodeGen/AArch64/optimize-imm.ll @@ -0,0 +1,64 @@ +; RUN: llc -o - %s -mtriple=aarch64-- | FileCheck %s + +; CHECK-LABEL: and1: +; CHECK: and {{w[0-9]+}}, w0, #0xfffffffd + +define void @and1(i32 %a, i8* nocapture %p) { +entry: + %and = and i32 %a, 253 + %conv = trunc i32 %and to i8 + store i8 %conv, i8* %p, align 1 + ret void +} + +; (a & 0x3dfd) | 0xffffc000 +; +; CHECK-LABEL: and2: +; CHECK: and {{w[0-9]+}}, w0, #0xfdfdfdfd + +define i32 @and2(i32 %a) { +entry: + %and = and i32 %a, 15869 + %or = or i32 %and, -16384 + ret i32 %or +} + +; (a & 0x19) | 0xffffffc0 +; +; CHECK-LABEL: and3: +; CHECK: and {{w[0-9]+}}, w0, #0x99999999 + +define i32 @and3(i32 %a) { +entry: + %and = and i32 %a, 25 + %or = or i32 %and, -64 + ret i32 %or +} + +; (a & 0xc5600) | 0xfff1f1ff +; +; CHECK-LABEL: and4: +; CHECK: and {{w[0-9]+}}, w0, #0xfffc07ff + +define i32 @and4(i32 %a) { +entry: + %and = and i32 %a, 787968 + %or = or i32 %and, -921089 + ret i32 %or +} + +; Make sure we don't 
shrink or optimize an XOR's immediate operand if the +; immediate is -1. Instruction selection turns (and ((xor $mask, -1), $v0)) into +; a BIC. + +; CHECK-LABEL: xor1: +; CHECK: orr [[R0:w[0-9]+]], wzr, #0x38 +; CHECK: bic {{w[0-9]+}}, [[R0]], w0, lsl #3 + +define i32 @xor1(i32 %a) { +entry: + %shl = shl i32 %a, 3 + %xor = and i32 %shl, 56 + %and = xor i32 %xor, 56 + ret i32 %and +} diff --git a/test/CodeGen/AArch64/swiftself-scavenger.ll b/test/CodeGen/AArch64/swiftself-scavenger.ll new file mode 100644 index 0000000..6d02784 --- /dev/null +++ b/test/CodeGen/AArch64/swiftself-scavenger.ll @@ -0,0 +1,82 @@ +; RUN: llc -o - %s | FileCheck %s +; Check that we reserve an emergency spill slot, even if we added an extra +; CSR spill for the values used by the swiftself parameter. +; CHECK-LABEL: func: +; CHECK: str [[REG:x[0-9]+]], [sp, #8] +; CHECK: add [[REG]], sp, #248 +; CHECK: str xzr, [{{\s*}}[[REG]], #32760] +; CHECK: ldr x30, [sp, #8] +target triple = "arm64-apple-ios" + +@ptr8 = external global i8* +@ptr64 = external global i64 + +define hidden swiftcc void @func(i8* swiftself %arg) #0 { +bb: + %stack0 = alloca i8*, i32 5000, align 8 + %stack1 = alloca i8*, i32 32, align 8 + + %v0 = load volatile i64, i64* @ptr64, align 8 + %v1 = load volatile i64, i64* @ptr64, align 8 + %v2 = load volatile i64, i64* @ptr64, align 8 + %v3 = load volatile i64, i64* @ptr64, align 8 + %v4 = load volatile i64, i64* @ptr64, align 8 + %v5 = load volatile i64, i64* @ptr64, align 8 + %v6 = load volatile i64, i64* @ptr64, align 8 + %v7 = load volatile i64, i64* @ptr64, align 8 + %v8 = load volatile i64, i64* @ptr64, align 8 + %v9 = load volatile i64, i64* @ptr64, align 8 + %v10 = load volatile i64, i64* @ptr64, align 8 + %v11 = load volatile i64, i64* @ptr64, align 8 + %v12 = load volatile i64, i64* @ptr64, align 8 + %v13 = load volatile i64, i64* @ptr64, align 8 + %v14 = load volatile i64, i64* @ptr64, align 8 + %v15 = load volatile i64, i64* @ptr64, align 8 + %v16 = load volatile i64, i64* @ptr64, align 8 + %v17 = load volatile i64, i64* @ptr64, align 8 + %v18 = load volatile i64, i64* @ptr64, align 8 + %v19 = load volatile i64, i64* @ptr64, align 8 + %v20 = load volatile i64, i64* @ptr64, align 8 + %v21 = load volatile i64, i64* @ptr64, align 8 + %v22 = load volatile i64, i64* @ptr64, align 8 + %v23 = load volatile i64, i64* @ptr64, align 8 + %v24 = load volatile i64, i64* @ptr64, align 8 + %v25 = load volatile i64, i64* @ptr64, align 8 + + ; this should exceed stack-relative addressing limits and need an emergency + ; spill slot. 
+ %s = getelementptr inbounds i8*, i8** %stack0, i64 4092 + store volatile i8* null, i8** %s + store volatile i8* null, i8** %stack1 + + store volatile i64 %v0, i64* @ptr64, align 8 + store volatile i64 %v1, i64* @ptr64, align 8 + store volatile i64 %v2, i64* @ptr64, align 8 + store volatile i64 %v3, i64* @ptr64, align 8 + store volatile i64 %v4, i64* @ptr64, align 8 + store volatile i64 %v5, i64* @ptr64, align 8 + store volatile i64 %v6, i64* @ptr64, align 8 + store volatile i64 %v7, i64* @ptr64, align 8 + store volatile i64 %v8, i64* @ptr64, align 8 + store volatile i64 %v9, i64* @ptr64, align 8 + store volatile i64 %v10, i64* @ptr64, align 8 + store volatile i64 %v11, i64* @ptr64, align 8 + store volatile i64 %v12, i64* @ptr64, align 8 + store volatile i64 %v13, i64* @ptr64, align 8 + store volatile i64 %v14, i64* @ptr64, align 8 + store volatile i64 %v15, i64* @ptr64, align 8 + store volatile i64 %v16, i64* @ptr64, align 8 + store volatile i64 %v17, i64* @ptr64, align 8 + store volatile i64 %v18, i64* @ptr64, align 8 + store volatile i64 %v19, i64* @ptr64, align 8 + store volatile i64 %v20, i64* @ptr64, align 8 + store volatile i64 %v21, i64* @ptr64, align 8 + store volatile i64 %v22, i64* @ptr64, align 8 + store volatile i64 %v23, i64* @ptr64, align 8 + store volatile i64 %v24, i64* @ptr64, align 8 + store volatile i64 %v25, i64* @ptr64, align 8 + + ; use swiftself parameter late so it stays alive throughout the function. + store volatile i8* %arg, i8** @ptr8 + ret void +} diff --git a/test/CodeGen/AMDGPU/add.v2i16.ll b/test/CodeGen/AMDGPU/add.v2i16.ll index e137ef4..73e80d5 100644 --- a/test/CodeGen/AMDGPU/add.v2i16.ll +++ b/test/CodeGen/AMDGPU/add.v2i16.ll @@ -1,4 +1,4 @@ -; RUN: llc -march=amdgcn -mcpu=gfx901 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GFX9 -check-prefix=GCN %s +; RUN: llc -march=amdgcn -mcpu=gfx901 -mattr=-flat-for-global -verify-machineinstrs -enable-packed-inlinable-literals < %s | FileCheck -check-prefix=GFX9 -check-prefix=GCN %s ; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s ; FIXME: Need to handle non-uniform case for function below (load without gep). 
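
For the swiftself-scavenger.ll test earlier, the offsets can be sanity-checked by hand, assuming the usual AArch64 STR unsigned-offset form, a 12-bit immediate scaled by the access size, so at most 4095 * 8 = 32760 bytes for an 8-byte store. Anything past that needs the address materialized in a register, which is why a spare (scavenged) register, and hence the emergency spill slot, must be available. The helper below is only an illustration of that limit, not the backend's actual legality check.

  #include <cassert>
  #include <cstdint>

  bool fitsUnsignedOffset(uint64_t ByteOffset, uint64_t AccessSize) {
    return ByteOffset % AccessSize == 0 && ByteOffset / AccessSize <= 4095;
  }

  int main() {
    assert(fitsUnsignedOffset(32760, 8));   // the #32760 seen in the CHECK line
    assert(!fitsUnsignedOffset(32768, 8));  // one element further: needs a register
    // Element 4092 of the i8** array alone is already 4092 * 8 = 32736 bytes in.
    assert(4092 * 8 == 32736);
    return 0;
  }
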
diff --git a/test/CodeGen/AMDGPU/addrspacecast.ll b/test/CodeGen/AMDGPU/addrspacecast.ll index 6ec93c7..b1e7172 100644 --- a/test/CodeGen/AMDGPU/addrspacecast.ll +++ b/test/CodeGen/AMDGPU/addrspacecast.ll @@ -1,5 +1,5 @@ -; RUN: llc -march=amdgcn -mtriple=amdgcn-amd-amdhsa -mattr=-promote-alloca -verify-machineinstrs < %s | FileCheck -check-prefix=HSA -check-prefix=CI %s -; RUN: llc -march=amdgcn -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -mattr=-promote-alloca -verify-machineinstrs < %s | FileCheck -check-prefix=HSA -check-prefix=GFX9 %s +; RUN: llc -march=amdgcn -mtriple=amdgcn-amd-amdhsa -mattr=-promote-alloca -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=HSA -check-prefix=CI %s +; RUN: llc -march=amdgcn -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -mattr=-promote-alloca -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=HSA -check-prefix=GFX9 %s ; HSA-LABEL: {{^}}use_group_to_flat_addrspacecast: ; HSA: enable_sgpr_private_segment_buffer = 1 @@ -223,9 +223,8 @@ define amdgpu_kernel void @cast_0_private_to_flat_addrspacecast() #0 { } ; HSA-LABEL: {{^}}cast_0_flat_to_private_addrspacecast: -; HSA-DAG: v_mov_b32_e32 [[PTR:v[0-9]+]], 0{{$}} -; HSA-DAG: v_mov_b32_e32 [[K:v[0-9]+]], 7{{$}} -; HSA: buffer_store_dword [[K]], [[PTR]], s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} offen +; HSA: v_mov_b32_e32 [[K:v[0-9]+]], 7{{$}} +; HSA: buffer_store_dword [[K]], off, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+$}} define amdgpu_kernel void @cast_0_flat_to_private_addrspacecast() #0 { %cast = addrspacecast i32 addrspace(4)* null to i32 addrspace(0)* store volatile i32 7, i32* %cast diff --git a/test/CodeGen/AMDGPU/ashr.v2i16.ll b/test/CodeGen/AMDGPU/ashr.v2i16.ll index 96a5e3b..7f424ef 100644 --- a/test/CodeGen/AMDGPU/ashr.v2i16.ll +++ b/test/CodeGen/AMDGPU/ashr.v2i16.ll @@ -1,4 +1,4 @@ -; RUN: llc -march=amdgcn -mcpu=gfx901 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=GFX9 %s +; RUN: llc -march=amdgcn -mcpu=gfx901 -mattr=-flat-for-global -verify-machineinstrs -enable-packed-inlinable-literals < %s | FileCheck -check-prefix=GCN -check-prefix=GFX9 %s ; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI -check-prefix=CIVI %s ; RUN: llc -march=amdgcn -mcpu=bonaire -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=CI -check-prefix=CIVI %s diff --git a/test/CodeGen/AMDGPU/code-object-metadata-images.ll b/test/CodeGen/AMDGPU/code-object-metadata-images.ll new file mode 100644 index 0000000..9185604 --- /dev/null +++ b/test/CodeGen/AMDGPU/code-object-metadata-images.ll @@ -0,0 +1,80 @@ +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx700 -filetype=obj -o - < %s | llvm-readobj -amdgpu-code-object-metadata -elf-output-style=GNU -notes | FileCheck --check-prefix=CHECK --check-prefix=GFX700 --check-prefix=NOTES %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx800 -filetype=obj -o - < %s | llvm-readobj -amdgpu-code-object-metadata -elf-output-style=GNU -notes | FileCheck --check-prefix=CHECK --check-prefix=GFX800 --check-prefix=NOTES %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -filetype=obj -o - < %s | llvm-readobj -amdgpu-code-object-metadata -elf-output-style=GNU -notes | FileCheck --check-prefix=CHECK --check-prefix=GFX900 --check-prefix=NOTES %s + +%opencl.image1d_t = type opaque +%opencl.image1d_array_t = type opaque +%opencl.image1d_buffer_t = type opaque +%opencl.image2d_t = type opaque +%opencl.image2d_array_t = type 
opaque +%opencl.image2d_array_depth_t = type opaque +%opencl.image2d_array_msaa_t = type opaque +%opencl.image2d_array_msaa_depth_t = type opaque +%opencl.image2d_depth_t = type opaque +%opencl.image2d_msaa_t = type opaque +%opencl.image2d_msaa_depth_t = type opaque +%opencl.image3d_t = type opaque + +; CHECK: --- +; CHECK: Version: [ 1, 0 ] + +; CHECK: Kernels: +; CHECK: - Name: test +; CHECK: Args: +; CHECK: - Size: 8 +; CHECK: ValueKind: Image +; CHECK: TypeName: image1d_t +; CHECK: - Size: 8 +; CHECK: ValueKind: Image +; CHECK: TypeName: image1d_array_t +; CHECK: - Size: 8 +; CHECK: ValueKind: Image +; CHECK: TypeName: image1d_buffer_t +; CHECK: - Size: 8 +; CHECK: ValueKind: Image +; CHECK: TypeName: image2d_t +; CHECK: - Size: 8 +; CHECK: ValueKind: Image +; CHECK: TypeName: image2d_array_t +; CHECK: - Size: 8 +; CHECK: ValueKind: Image +; CHECK: TypeName: image2d_array_depth_t +; CHECK: - Size: 8 +; CHECK: ValueKind: Image +; CHECK: TypeName: image2d_array_msaa_t +; CHECK: - Size: 8 +; CHECK: ValueKind: Image +; CHECK: TypeName: image2d_array_msaa_depth_t +; CHECK: - Size: 8 +; CHECK: ValueKind: Image +; CHECK: TypeName: image2d_depth_t +; CHECK: - Size: 8 +; CHECK: ValueKind: Image +; CHECK: TypeName: image2d_msaa_t +; CHECK: - Size: 8 +; CHECK: ValueKind: Image +; CHECK: TypeName: image2d_msaa_depth_t +; CHECK: - Size: 8 +; CHECK: ValueKind: Image +; CHECK: TypeName: image3d_t +define amdgpu_kernel void @test(%opencl.image1d_t addrspace(1)* %a, + %opencl.image1d_array_t addrspace(1)* %b, + %opencl.image1d_buffer_t addrspace(1)* %c, + %opencl.image2d_t addrspace(1)* %d, + %opencl.image2d_array_t addrspace(1)* %e, + %opencl.image2d_array_depth_t addrspace(1)* %f, + %opencl.image2d_array_msaa_t addrspace(1)* %g, + %opencl.image2d_array_msaa_depth_t addrspace(1)* %h, + %opencl.image2d_depth_t addrspace(1)* %i, + %opencl.image2d_msaa_t addrspace(1)* %j, + %opencl.image2d_msaa_depth_t addrspace(1)* %k, + %opencl.image3d_t addrspace(1)* %l) + !kernel_arg_type !1 !kernel_arg_base_type !1 { + ret void +} + +!1 = !{!"image1d_t", !"image1d_array_t", !"image1d_buffer_t", + !"image2d_t", !"image2d_array_t", !"image2d_array_depth_t", + !"image2d_array_msaa_t", !"image2d_array_msaa_depth_t", + !"image2d_depth_t", !"image2d_msaa_t", !"image2d_msaa_depth_t", + !"image3d_t"} diff --git a/test/CodeGen/AMDGPU/fcanonicalize.f16.ll b/test/CodeGen/AMDGPU/fcanonicalize.f16.ll index f2686a5..c9787bb 100644 --- a/test/CodeGen/AMDGPU/fcanonicalize.f16.ll +++ b/test/CodeGen/AMDGPU/fcanonicalize.f16.ll @@ -1,5 +1,5 @@ ; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s -; RUN: llc -march=amdgcn -mcpu=gfx901 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=GFX9 %s +; RUN: llc -march=amdgcn -mcpu=gfx901 -mattr=-flat-for-global -verify-machineinstrs -enable-packed-inlinable-literals < %s | FileCheck -check-prefix=GCN -check-prefix=GFX9 %s declare half @llvm.fabs.f16(half) #0 declare half @llvm.canonicalize.f16(half) #0 diff --git a/test/CodeGen/AMDGPU/fdiv.ll b/test/CodeGen/AMDGPU/fdiv.ll index b3a2b66..738a5ad 100644 --- a/test/CodeGen/AMDGPU/fdiv.ll +++ b/test/CodeGen/AMDGPU/fdiv.ll @@ -85,10 +85,20 @@ entry: } ; FUNC-LABEL: {{^}}fdiv_fast_denormals_f32: -; GCN: v_rcp_f32_e32 [[RCP:v[0-9]+]], s{{[0-9]+}} -; GCN: v_mul_f32_e32 [[RESULT:v[0-9]+]], s{{[0-9]+}}, [[RCP]] -; GCN-NOT: [[RESULT]] -; GCN: buffer_store_dword [[RESULT]] +; GCN: v_div_scale_f32 [[NUM_SCALE:v[0-9]+]] +; 
GCN-DAG: v_div_scale_f32 [[DEN_SCALE:v[0-9]+]] +; GCN-DAG: v_rcp_f32_e32 [[NUM_RCP:v[0-9]+]], [[NUM_SCALE]] + +; GCN-NOT: s_setreg +; GCN: v_fma_f32 [[A:v[0-9]+]], -[[NUM_SCALE]], [[NUM_RCP]], 1.0 +; GCN: v_fma_f32 [[B:v[0-9]+]], [[A]], [[NUM_RCP]], [[NUM_RCP]] +; GCN: v_mul_f32_e32 [[C:v[0-9]+]], [[B]], [[DEN_SCALE]] +; GCN: v_fma_f32 [[D:v[0-9]+]], -[[NUM_SCALE]], [[C]], [[DEN_SCALE]] +; GCN: v_fma_f32 [[E:v[0-9]+]], [[D]], [[B]], [[C]] +; GCN: v_fma_f32 [[F:v[0-9]+]], -[[NUM_SCALE]], [[E]], [[DEN_SCALE]] +; GCN-NOT: s_setreg +; GCN: v_div_fmas_f32 [[FMAS:v[0-9]+]], [[F]], [[B]], [[E]] +; GCN: v_div_fixup_f32 v{{[0-9]+}}, [[FMAS]], define amdgpu_kernel void @fdiv_fast_denormals_f32(float addrspace(1)* %out, float %a, float %b) #2 { entry: %fdiv = fdiv fast float %a, %b diff --git a/test/CodeGen/AMDGPU/fence-amdgiz.ll b/test/CodeGen/AMDGPU/fence-amdgiz.ll new file mode 100644 index 0000000..df675c9 --- /dev/null +++ b/test/CodeGen/AMDGPU/fence-amdgiz.ll @@ -0,0 +1,15 @@ +; RUN: llc < %s | FileCheck %s + +target datalayout = "e-p:64:64-p1:64:64-p2:64:64-p3:32:32-p4:32:32-p5:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-A5" +target triple = "amdgcn-amd-amdhsa-amdgizcl" + +; CHECK_LABEL: atomic_fence +; CHECK: BB#0: +; CHECK: ATOMIC_FENCE 4, 1 +; CHECK: s_endpgm + +define amdgpu_kernel void @atomic_fence() { + fence acquire + ret void +} + diff --git a/test/CodeGen/AMDGPU/fmuladd.v2f16.ll b/test/CodeGen/AMDGPU/fmuladd.v2f16.ll index bdd3c04..6246100 100644 --- a/test/CodeGen/AMDGPU/fmuladd.v2f16.ll +++ b/test/CodeGen/AMDGPU/fmuladd.v2f16.ll @@ -1,12 +1,12 @@ -; RUN: llc -march=amdgcn -mcpu=gfx901 -mattr=-fp64-fp16-denormals -fp-contract=on -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GCN-STRICT,GFX9-FLUSH,GFX9 %s -; RUN: llc -march=amdgcn -mcpu=gfx901 -mattr=-fp64-fp16-denormals -fp-contract=on -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GCN-STRICT,GFX9-FLUSH,GFX9 %s -; RUN: llc -march=amdgcn -mcpu=gfx901 -mattr=-fp64-fp16-denormals -fp-contract=fast -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GCN-CONTRACT,GFX9-FLUSH,GFX9 %s -; RUN: llc -march=amdgcn -mcpu=gfx901 -mattr=-fp64-fp16-denormals -fp-contract=fast -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GCN-CONTRACT,GFX9-FLUSH,GFX9 %s - -; RUN: llc -march=amdgcn -mcpu=gfx901 -mattr=+fp64-fp16-denormals -fp-contract=on -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GCN-STRICT,GFX9-DENORM-STRICT,GFX9-DENORM,GFX9 %s -; RUN: llc -march=amdgcn -mcpu=gfx901 -mattr=+fp64-fp16-denormals -fp-contract=on -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GCN-STRICT,GFX9-DENORM-STRICT,GFX9-DENORM,GFX9 %s -; RUN: llc -march=amdgcn -mcpu=gfx901 -mattr=+fp64-fp16-denormals -fp-contract=fast -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GCN-CONTRACT,GFX9-DENORM-CONTRACT,GFX9-DENORM,GFX9 %s -; RUN: llc -march=amdgcn -mcpu=gfx901 -mattr=+fp64-fp16-denormals -fp-contract=fast -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GCN-CONTRACT,GFX9-DENORM-CONTRACT,GFX9-DENORM,GFX9 %s +; RUN: llc -march=amdgcn -mcpu=gfx901 -mattr=-fp64-fp16-denormals -fp-contract=on -verify-machineinstrs -enable-packed-inlinable-literals < %s | FileCheck -check-prefixes=GCN,GCN-STRICT,GFX9-FLUSH,GFX9 %s +; RUN: llc -march=amdgcn -mcpu=gfx901 -mattr=-fp64-fp16-denormals -fp-contract=on -verify-machineinstrs -enable-packed-inlinable-literals < %s | FileCheck -check-prefixes=GCN,GCN-STRICT,GFX9-FLUSH,GFX9 %s +; RUN: llc 
-march=amdgcn -mcpu=gfx901 -mattr=-fp64-fp16-denormals -fp-contract=fast -verify-machineinstrs -enable-packed-inlinable-literals < %s | FileCheck -check-prefixes=GCN,GCN-CONTRACT,GFX9-FLUSH,GFX9 %s +; RUN: llc -march=amdgcn -mcpu=gfx901 -mattr=-fp64-fp16-denormals -fp-contract=fast -verify-machineinstrs -enable-packed-inlinable-literals < %s | FileCheck -check-prefixes=GCN,GCN-CONTRACT,GFX9-FLUSH,GFX9 %s + +; RUN: llc -march=amdgcn -mcpu=gfx901 -mattr=+fp64-fp16-denormals -fp-contract=on -verify-machineinstrs -enable-packed-inlinable-literals < %s | FileCheck -check-prefixes=GCN,GCN-STRICT,GFX9-DENORM-STRICT,GFX9-DENORM,GFX9 %s +; RUN: llc -march=amdgcn -mcpu=gfx901 -mattr=+fp64-fp16-denormals -fp-contract=on -verify-machineinstrs -enable-packed-inlinable-literals < %s | FileCheck -check-prefixes=GCN,GCN-STRICT,GFX9-DENORM-STRICT,GFX9-DENORM,GFX9 %s +; RUN: llc -march=amdgcn -mcpu=gfx901 -mattr=+fp64-fp16-denormals -fp-contract=fast -verify-machineinstrs -enable-packed-inlinable-literals < %s | FileCheck -check-prefixes=GCN,GCN-CONTRACT,GFX9-DENORM-CONTRACT,GFX9-DENORM,GFX9 %s +; RUN: llc -march=amdgcn -mcpu=gfx901 -mattr=+fp64-fp16-denormals -fp-contract=fast -verify-machineinstrs -enable-packed-inlinable-literals < %s | FileCheck -check-prefixes=GCN,GCN-CONTRACT,GFX9-DENORM-CONTRACT,GFX9-DENORM,GFX9 %s declare i32 @llvm.amdgcn.workitem.id.x() #1 declare <2 x half> @llvm.fmuladd.v2f16(<2 x half>, <2 x half>, <2 x half>) #1 diff --git a/test/CodeGen/AMDGPU/fneg-fabs.f16.ll b/test/CodeGen/AMDGPU/fneg-fabs.f16.ll index 555764c..506b2a0 100644 --- a/test/CodeGen/AMDGPU/fneg-fabs.f16.ll +++ b/test/CodeGen/AMDGPU/fneg-fabs.f16.ll @@ -1,6 +1,6 @@ ; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=kaveri -verify-machineinstrs < %s | FileCheck -check-prefix=CI -check-prefix=GCN -check-prefix=CIVI %s ; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=VI -check-prefix=GFX89 -check-prefix=GCN -check-prefix=CIVI %s -; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=gfx901 -verify-machineinstrs < %s | FileCheck -check-prefix=GFX89 -check-prefix=GFX9 -check-prefix=GCN %s +; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=gfx901 -verify-machineinstrs -enable-packed-inlinable-literals < %s | FileCheck -check-prefix=GFX89 -check-prefix=GFX9 -check-prefix=GCN %s ; GCN-LABEL: {{^}}fneg_fabs_fadd_f16: ; CI: v_cvt_f32_f16_e32 diff --git a/test/CodeGen/AMDGPU/immv216.ll b/test/CodeGen/AMDGPU/immv216.ll index 85ad365..c15a30e 100644 --- a/test/CodeGen/AMDGPU/immv216.ll +++ b/test/CodeGen/AMDGPU/immv216.ll @@ -1,4 +1,4 @@ -; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=gfx901 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=GFX9 %s +; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=gfx901 -mattr=-flat-for-global -verify-machineinstrs -enable-packed-inlinable-literals < %s | FileCheck -check-prefix=GCN -check-prefix=GFX9 %s ; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s ; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=kaveri -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=CI %s ; FIXME: Merge into imm.ll diff --git a/test/CodeGen/AMDGPU/insert_vector_elt.v2i16.ll b/test/CodeGen/AMDGPU/insert_vector_elt.v2i16.ll index a3f82b8..89adcff 100644 --- a/test/CodeGen/AMDGPU/insert_vector_elt.v2i16.ll +++ b/test/CodeGen/AMDGPU/insert_vector_elt.v2i16.ll @@ -216,7 +216,7 @@ define amdgpu_kernel void @v_insertelement_v2i16_0(<2 x i16> 
addrspace(1)* %out, ; CIVI-DAG: v_and_b32_e32 [[ELT1:v[0-9]+]], 0xffff0000, [[VEC]] ; CIVI: v_or_b32_e32 [[RES:v[0-9]+]], [[ELT0_SHIFT]], [[ELT1]] -; GFX9-DAG: v_mov_b32_e32 [[MASK:v[0-9]+]], 0xffff{{$}} +; GFX9-DAG: v_mov_b32_e32 [[MASK:v[0-9]+]], 0xffff0000{{$}} ; GFX9-DAG: v_lshrrev_b32_e64 [[ELT0_SHIFT:v[0-9]+]], 16, [[ELT0]] ; GFX9: v_and_or_b32 [[RES:v[0-9]+]], [[VEC]], [[MASK]], [[ELT0_SHIFT]] diff --git a/test/CodeGen/AMDGPU/llvm.amdgcn.readlane.ll b/test/CodeGen/AMDGPU/llvm.amdgcn.readlane.ll index 5e892fa..cbd8f0a 100644 --- a/test/CodeGen/AMDGPU/llvm.amdgcn.readlane.ll +++ b/test/CodeGen/AMDGPU/llvm.amdgcn.readlane.ll @@ -19,6 +19,20 @@ define amdgpu_kernel void @test_readlane_imm_sreg(i32 addrspace(1)* %out, i32 %s ret void } +; CHECK-LABEL: {{^}}test_readlane_vregs: +; CHECK: v_readfirstlane_b32 [[LANE:s[0-9]+]], v{{[0-9]+}} +; CHECK: v_readlane_b32 s{{[0-9]+}}, v{{[0-9]+}}, [[LANE]] +define amdgpu_kernel void @test_readlane_vregs(i32 addrspace(1)* %out, <2 x i32> addrspace(1)* %in) #1 { + %tid = call i32 @llvm.amdgcn.workitem.id.x() + %gep.in = getelementptr <2 x i32>, <2 x i32> addrspace(1)* %in, i32 %tid + %args = load <2 x i32>, <2 x i32> addrspace(1)* %gep.in + %value = extractelement <2 x i32> %args, i32 0 + %lane = extractelement <2 x i32> %args, i32 1 + %readlane = call i32 @llvm.amdgcn.readlane(i32 %value, i32 %lane) + store i32 %readlane, i32 addrspace(1)* %out, align 4 + ret void +} + ; TODO: m0 should be folded. ; CHECK-LABEL: {{^}}test_readlane_m0_sreg: ; CHECK: s_mov_b32 m0, -1 @@ -40,5 +54,8 @@ define amdgpu_kernel void @test_readlane_imm(i32 addrspace(1)* %out, i32 %src0) ret void } +declare i32 @llvm.amdgcn.workitem.id.x() #2 + attributes #0 = { nounwind readnone convergent } attributes #1 = { nounwind } +attributes #2 = { nounwind readnone } diff --git a/test/CodeGen/AMDGPU/llvm.amdgcn.unreachable.ll b/test/CodeGen/AMDGPU/llvm.amdgcn.unreachable.ll new file mode 100644 index 0000000..bafafa3 --- /dev/null +++ b/test/CodeGen/AMDGPU/llvm.amdgcn.unreachable.ll @@ -0,0 +1,9 @@ +; RUN: llc -march amdgcn %s -filetype=obj -o /dev/null +; RUN: llc -march amdgcn <%s | FileCheck %s +define amdgpu_kernel void @f() { + ; CHECK: ; divergent unreachable + call void @llvm.amdgcn.unreachable() + ret void +} + +declare void @llvm.amdgcn.unreachable() diff --git a/test/CodeGen/AMDGPU/loop_break.ll b/test/CodeGen/AMDGPU/loop_break.ll index 84c42e8..b9df2cb 100644 --- a/test/CodeGen/AMDGPU/loop_break.ll +++ b/test/CodeGen/AMDGPU/loop_break.ll @@ -10,7 +10,7 @@ ; OPT: bb4: ; OPT: load volatile -; OPT: %cmp1 = icmp sge i32 %tmp, %load +; OPT: xor i1 %cmp1 ; OPT: call i64 @llvm.amdgcn.if.break( ; OPT: br label %Flow diff --git a/test/CodeGen/AMDGPU/lshr.v2i16.ll b/test/CodeGen/AMDGPU/lshr.v2i16.ll index e21d0d0..6a90a7a 100644 --- a/test/CodeGen/AMDGPU/lshr.v2i16.ll +++ b/test/CodeGen/AMDGPU/lshr.v2i16.ll @@ -1,4 +1,4 @@ -; RUN: llc -march=amdgcn -mcpu=gfx901 -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=GFX9 %s +; RUN: llc -march=amdgcn -mcpu=gfx901 -verify-machineinstrs -enable-packed-inlinable-literals < %s | FileCheck -check-prefix=GCN -check-prefix=GFX9 %s ; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI -check-prefix=CIVI %s ; RUN: llc -march=amdgcn -mcpu=bonaire -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=CI -check-prefix=CIVI %s diff --git a/test/CodeGen/AMDGPU/merge-m0.mir b/test/CodeGen/AMDGPU/merge-m0.mir new file mode 100644 index 0000000..064db49 --- 
/dev/null +++ b/test/CodeGen/AMDGPU/merge-m0.mir @@ -0,0 +1,132 @@ +# RUN: llc -march=amdgcn -amdgpu-enable-merge-m0 -verify-machineinstrs -run-pass si-fix-sgpr-copies %s -o - | FileCheck -check-prefix=GCN %s + +# GCN: bb.0.entry: +# GCN: SI_INIT_M0 -1 +# GCN-NEXT: DS_WRITE_B32 +# GCN-NEXT: DS_WRITE_B32 +# GCN-NEXT: SI_INIT_M0 65536 +# GCN-NEXT: DS_WRITE_B32 +# GCN-NEXT: DS_WRITE_B32 +# GCN-NEXT: SI_INIT_M0 -1 +# GCN-NEXT: DS_WRITE_B32 +# GCN-NEXT: SI_INIT_M0 65536 +# GCN-NEXT: DS_WRITE_B32 + +# GCN: bb.1: +# GCN: SI_INIT_M0 -1 +# GCN-NEXT: DS_WRITE_B32 +# GCN-NEXT: DS_WRITE_B32 + +# GCN: bb.2: +# GCN: SI_INIT_M0 65536 +# GCN-NEXT: DS_WRITE_B32 + +# GCN: bb.3: +# GCN: SI_INIT_M0 3 + +# GCN: bb.4: +# GCN-NOT: SI_INIT_M0 +# GCN: DS_WRITE_B32 +# GCN-NEXT: SI_INIT_M0 4 +# GCN-NEXT: DS_WRITE_B32 + +# GCN: bb.5: +# GCN-NOT: SI_INIT_M0 +# GCN: DS_WRITE_B32 +# GCN-NEXT: SI_INIT_M0 4 +# GCN-NEXT: DS_WRITE_B32 + +# GCN: bb.6: +# GCN: SI_INIT_M0 -1, +# GCN-NEXT: DS_WRITE_B32 +# GCN: SI_INIT_M0 %2 +# GCN-NEXT: DS_WRITE_B32 +# GCN-NEXT: SI_INIT_M0 %2 +# GCN-NEXT: DS_WRITE_B32 +# GCN-NEXT: SI_INIT_M0 -1 +# GCN-NEXT: DS_WRITE_B32 + +--- +name: test +alignment: 0 +exposesReturnsTwice: false +noVRegs: false +legalized: false +regBankSelected: false +selected: false +tracksRegLiveness: true +registers: + - { id: 0, class: vgpr_32 } + - { id: 1, class: vgpr_32 } + - { id: 2, class: sreg_32_xm0 } +body: | + bb.0.entry: + successors: %bb.1, %bb.2 + + %0 = IMPLICIT_DEF + %1 = IMPLICIT_DEF + SI_INIT_M0 -1, implicit-def %m0 + DS_WRITE_B32 %0, %1, 0, 0, implicit %m0, implicit %exec + SI_INIT_M0 -1, implicit-def %m0 + DS_WRITE_B32 %0, %1, 0, 0, implicit %m0, implicit %exec + SI_INIT_M0 65536, implicit-def %m0 + DS_WRITE_B32 %0, %1, 0, 0, implicit %m0, implicit %exec + SI_INIT_M0 65536, implicit-def %m0 + DS_WRITE_B32 %0, %1, 0, 0, implicit %m0, implicit %exec + SI_INIT_M0 -1, implicit-def %m0 + DS_WRITE_B32 %0, %1, 0, 0, implicit %m0, implicit %exec + SI_INIT_M0 65536, implicit-def %m0 + DS_WRITE_B32 %0, %1, 0, 0, implicit %m0, implicit %exec + S_CBRANCH_VCCZ %bb.1, implicit undef %vcc + S_BRANCH %bb.2 + + bb.1: + successors: %bb.2 + SI_INIT_M0 -1, implicit-def %m0 + DS_WRITE_B32 %0, %1, 0, 0, implicit %m0, implicit %exec + SI_INIT_M0 -1, implicit-def %m0 + DS_WRITE_B32 %0, %1, 0, 0, implicit %m0, implicit %exec + S_BRANCH %bb.2 + + bb.2: + successors: %bb.3 + SI_INIT_M0 65536, implicit-def %m0 + DS_WRITE_B32 %0, %1, 0, 0, implicit %m0, implicit %exec + S_BRANCH %bb.3 + + bb.3: + successors: %bb.4, %bb.5 + S_CBRANCH_VCCZ %bb.4, implicit undef %vcc + S_BRANCH %bb.5 + + bb.4: + successors: %bb.6 + SI_INIT_M0 3, implicit-def %m0 + DS_WRITE_B32 %0, %1, 0, 0, implicit %m0, implicit %exec + SI_INIT_M0 4, implicit-def %m0 + DS_WRITE_B32 %0, %1, 0, 0, implicit %m0, implicit %exec + S_BRANCH %bb.6 + + bb.5: + successors: %bb.6 + SI_INIT_M0 3, implicit-def %m0 + DS_WRITE_B32 %0, %1, 0, 0, implicit %m0, implicit %exec + SI_INIT_M0 4, implicit-def %m0 + DS_WRITE_B32 %0, %1, 0, 0, implicit %m0, implicit %exec + S_BRANCH %bb.6 + + bb.6: + successors: %bb.0.entry, %bb.6 + SI_INIT_M0 -1, implicit-def %m0 + DS_WRITE_B32 %0, %1, 0, 0, implicit %m0, implicit %exec + %2 = IMPLICIT_DEF + SI_INIT_M0 %2, implicit-def %m0 + DS_WRITE_B32 %0, %1, 0, 0, implicit %m0, implicit %exec + SI_INIT_M0 %2, implicit-def %m0 + DS_WRITE_B32 %0, %1, 0, 0, implicit %m0, implicit %exec + SI_INIT_M0 -1, implicit-def %m0 + DS_WRITE_B32 %0, %1, 0, 0, implicit %m0, implicit %exec + S_CBRANCH_VCCZ %bb.6, implicit undef %vcc + S_BRANCH %bb.0.entry + +... 
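
A simplified model of the straight-line part of the SI_INIT_M0 merging that merge-m0.mir above checks: an init is dropped when m0 is already known to hold the same immediate, so only value changes survive. The cross-block hoisting exercised by bb.3 through bb.5 is not modeled here.

  #include <cassert>
  #include <optional>
  #include <vector>

  std::vector<int> mergeInits(const std::vector<int> &Inits) {
    std::vector<int> Kept;
    std::optional<int> Current;
    for (int V : Inits) {
      if (Current && *Current == V)
        continue;              // same value already live in m0: redundant
      Kept.push_back(V);
      Current = V;
    }
    return Kept;
  }

  int main() {
    // Mirrors the init sequence of bb.0 in the test.
    std::vector<int> Kept = mergeInits({-1, -1, 65536, 65536, -1, 65536});
    assert((Kept == std::vector<int>{-1, 65536, -1, 65536}));
    return 0;
  }
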
diff --git a/test/CodeGen/AMDGPU/mubuf-offset-private.ll b/test/CodeGen/AMDGPU/mubuf-offset-private.ll new file mode 100644 index 0000000..3a0605f --- /dev/null +++ b/test/CodeGen/AMDGPU/mubuf-offset-private.ll @@ -0,0 +1,136 @@ +; RUN: llc -march=amdgcn -mattr=+max-private-element-size-16 < %s | FileCheck -enable-var-scope -check-prefix=GCN -check-prefix=SI %s +; RUN: llc -march=amdgcn -mcpu=fiji -mattr=+max-private-element-size-16 < %s | FileCheck -enable-var-scope -check-prefix=GCN -check-prefix=VI %s +; RUN: llc -march=amdgcn -mcpu=gfx900 -mattr=+max-private-element-size-16 < %s | FileCheck -enable-var-scope -check-prefix=GCN -check-prefix=VI %s + +; Test addressing modes when the scratch base is not a frame index. + +; GCN-LABEL: {{^}}store_private_offset_i8: +; GCN: buffer_store_byte v{{[0-9]+}}, off, s[4:7], s8 offset:8 +define amdgpu_kernel void @store_private_offset_i8() #0 { + store volatile i8 5, i8* inttoptr (i32 8 to i8*) + ret void +} + +; GCN-LABEL: {{^}}store_private_offset_i16: +; GCN: buffer_store_short v{{[0-9]+}}, off, s[4:7], s8 offset:8 +define amdgpu_kernel void @store_private_offset_i16() #0 { + store volatile i16 5, i16* inttoptr (i32 8 to i16*) + ret void +} + +; GCN-LABEL: {{^}}store_private_offset_i32: +; GCN: buffer_store_dword v{{[0-9]+}}, off, s[4:7], s8 offset:8 +define amdgpu_kernel void @store_private_offset_i32() #0 { + store volatile i32 5, i32* inttoptr (i32 8 to i32*) + ret void +} + +; GCN-LABEL: {{^}}store_private_offset_v2i32: +; GCN: buffer_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, off, s[4:7], s8 offset:8 +define amdgpu_kernel void @store_private_offset_v2i32() #0 { + store volatile <2 x i32> , <2 x i32>* inttoptr (i32 8 to <2 x i32>*) + ret void +} + +; GCN-LABEL: {{^}}store_private_offset_v4i32: +; GCN: buffer_store_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, off, s[4:7], s8 offset:8 +define amdgpu_kernel void @store_private_offset_v4i32() #0 { + store volatile <4 x i32> , <4 x i32>* inttoptr (i32 8 to <4 x i32>*) + ret void +} + +; GCN-LABEL: {{^}}load_private_offset_i8: +; GCN: buffer_load_ubyte v{{[0-9]+}}, off, s[4:7], s8 offset:8 +define amdgpu_kernel void @load_private_offset_i8() #0 { + %load = load volatile i8, i8* inttoptr (i32 8 to i8*) + ret void +} + +; GCN-LABEL: {{^}}sextload_private_offset_i8: +; GCN: buffer_load_sbyte v{{[0-9]+}}, off, s[4:7], s8 offset:8 +define amdgpu_kernel void @sextload_private_offset_i8(i32 addrspace(1)* %out) #0 { + %load = load volatile i8, i8* inttoptr (i32 8 to i8*) + %sextload = sext i8 %load to i32 + store i32 %sextload, i32 addrspace(1)* undef + ret void +} + +; GCN-LABEL: {{^}}zextload_private_offset_i8: +; GCN: buffer_load_ubyte v{{[0-9]+}}, off, s[4:7], s8 offset:8 +define amdgpu_kernel void @zextload_private_offset_i8(i32 addrspace(1)* %out) #0 { + %load = load volatile i8, i8* inttoptr (i32 8 to i8*) + %zextload = zext i8 %load to i32 + store i32 %zextload, i32 addrspace(1)* undef + ret void +} + +; GCN-LABEL: {{^}}load_private_offset_i16: +; GCN: buffer_load_ushort v{{[0-9]+}}, off, s[4:7], s8 offset:8 +define amdgpu_kernel void @load_private_offset_i16() #0 { + %load = load volatile i16, i16* inttoptr (i32 8 to i16*) + ret void +} + +; GCN-LABEL: {{^}}sextload_private_offset_i16: +; GCN: buffer_load_sshort v{{[0-9]+}}, off, s[4:7], s8 offset:8 +define amdgpu_kernel void @sextload_private_offset_i16(i32 addrspace(1)* %out) #0 { + %load = load volatile i16, i16* inttoptr (i32 8 to i16*) + %sextload = sext i16 %load to i32 + store i32 %sextload, i32 addrspace(1)* undef + ret void +} + +; GCN-LABEL: 
{{^}}zextload_private_offset_i16: +; GCN: buffer_load_ushort v{{[0-9]+}}, off, s[4:7], s8 offset:8 +define amdgpu_kernel void @zextload_private_offset_i16(i32 addrspace(1)* %out) #0 { + %load = load volatile i16, i16* inttoptr (i32 8 to i16*) + %zextload = zext i16 %load to i32 + store i32 %zextload, i32 addrspace(1)* undef + ret void +} + +; GCN-LABEL: {{^}}load_private_offset_i32: +; GCN: buffer_load_dword v{{[0-9]+}}, off, s[4:7], s8 offset:8 +define amdgpu_kernel void @load_private_offset_i32() #0 { + %load = load volatile i32, i32* inttoptr (i32 8 to i32*) + ret void +} + +; GCN-LABEL: {{^}}load_private_offset_v2i32: +; GCN: buffer_load_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, off, s[4:7], s8 offset:8 +define amdgpu_kernel void @load_private_offset_v2i32() #0 { + %load = load volatile <2 x i32>, <2 x i32>* inttoptr (i32 8 to <2 x i32>*) + ret void +} + +; GCN-LABEL: {{^}}load_private_offset_v4i32: +; GCN: buffer_load_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, off, s[4:7], s8 offset:8 +define amdgpu_kernel void @load_private_offset_v4i32() #0 { + %load = load volatile <4 x i32>, <4 x i32>* inttoptr (i32 8 to <4 x i32>*) + ret void +} + +; GCN-LABEL: {{^}}store_private_offset_i8_max_offset: +; GCN: buffer_store_byte v{{[0-9]+}}, off, s[4:7], s8 offset:4095 +define amdgpu_kernel void @store_private_offset_i8_max_offset() #0 { + store volatile i8 5, i8* inttoptr (i32 4095 to i8*) + ret void +} + +; GCN-LABEL: {{^}}store_private_offset_i8_max_offset_plus1: +; GCN: v_mov_b32_e32 [[OFFSET:v[0-9]+]], 0x1000 +; GCN: buffer_store_byte v{{[0-9]+}}, [[OFFSET]], s[4:7], s8 offen{{$}} +define amdgpu_kernel void @store_private_offset_i8_max_offset_plus1() #0 { + store volatile i8 5, i8* inttoptr (i32 4096 to i8*) + ret void +} + +; GCN-LABEL: {{^}}store_private_offset_i8_max_offset_plus2: +; GCN: v_mov_b32_e32 [[OFFSET:v[0-9]+]], 0x1000 +; GCN: buffer_store_byte v{{[0-9]+}}, [[OFFSET]], s[4:7], s8 offen offset:1{{$}} +define amdgpu_kernel void @store_private_offset_i8_max_offset_plus2() #0 { + store volatile i8 5, i8* inttoptr (i32 4097 to i8*) + ret void +} + +attributes #0 = { nounwind } diff --git a/test/CodeGen/AMDGPU/multi-divergent-exit-region.ll b/test/CodeGen/AMDGPU/multi-divergent-exit-region.ll index 4bd8bff..9d0b6b3 100644 --- a/test/CodeGen/AMDGPU/multi-divergent-exit-region.ll +++ b/test/CodeGen/AMDGPU/multi-divergent-exit-region.ll @@ -9,19 +9,18 @@ ; StructurizeCFG. 
; IR-LABEL: @multi_divergent_region_exit_ret_ret( -; IR: %Pivot = icmp sge i32 %tmp16, 2 -; IR-NEXT: %0 = call { i1, i64 } @llvm.amdgcn.if(i1 %Pivot) -; IR: %1 = extractvalue { i1, i64 } %0, 0 -; IR: %2 = extractvalue { i1, i64 } %0, 1 -; IR: br i1 %1, label %LeafBlock1, label %Flow +; IR: %1 = call { i1, i64 } @llvm.amdgcn.if(i1 %0) +; IR: %2 = extractvalue { i1, i64 } %1, 0 +; IR: %3 = extractvalue { i1, i64 } %1, 1 +; IR: br i1 %2, label %LeafBlock1, label %Flow ; IR: Flow: -; IR: %3 = phi i1 [ true, %LeafBlock1 ], [ false, %entry ] -; IR: %4 = phi i1 [ %SwitchLeaf2, %LeafBlock1 ], [ false, %entry ] -; IR: %5 = call { i1, i64 } @llvm.amdgcn.else(i64 %2) -; IR: %6 = extractvalue { i1, i64 } %5, 0 -; IR: %7 = extractvalue { i1, i64 } %5, 1 -; IR: br i1 %6, label %LeafBlock, label %Flow1 +; IR: %4 = phi i1 [ true, %LeafBlock1 ], [ false, %entry ] +; IR: %5 = phi i1 [ %10, %LeafBlock1 ], [ false, %entry ] +; IR: %6 = call { i1, i64 } @llvm.amdgcn.else(i64 %3) +; IR: %7 = extractvalue { i1, i64 } %6, 0 +; IR: %8 = extractvalue { i1, i64 } %6, 1 +; IR: br i1 %7, label %LeafBlock, label %Flow1 ; IR: LeafBlock: ; IR: br label %Flow1 @@ -30,32 +29,32 @@ ; IR: br label %Flow{{$}} ; IR: Flow2: -; IR: %8 = phi i1 [ false, %exit1 ], [ %12, %Flow1 ] -; IR: call void @llvm.amdgcn.end.cf(i64 %16) -; IR: [[IF:%[0-9]+]] = call { i1, i64 } @llvm.amdgcn.if(i1 %8) -; IR: %10 = extractvalue { i1, i64 } [[IF]], 0 -; IR: %11 = extractvalue { i1, i64 } [[IF]], 1 -; IR: br i1 %10, label %exit0, label %UnifiedReturnBlock +; IR: %11 = phi i1 [ false, %exit1 ], [ %15, %Flow1 ] +; IR: call void @llvm.amdgcn.end.cf(i64 %19) +; IR: %12 = call { i1, i64 } @llvm.amdgcn.if(i1 %11) +; IR: %13 = extractvalue { i1, i64 } %12, 0 +; IR: %14 = extractvalue { i1, i64 } %12, 1 +; IR: br i1 %13, label %exit0, label %UnifiedReturnBlock ; IR: exit0: ; IR: store volatile i32 9, i32 addrspace(1)* undef ; IR: br label %UnifiedReturnBlock ; IR: Flow1: -; IR: %12 = phi i1 [ %SwitchLeaf, %LeafBlock ], [ %3, %Flow ] -; IR: %13 = phi i1 [ %SwitchLeaf, %LeafBlock ], [ %4, %Flow ] -; IR: call void @llvm.amdgcn.end.cf(i64 %7) -; IR: %14 = call { i1, i64 } @llvm.amdgcn.if(i1 %13) -; IR: %15 = extractvalue { i1, i64 } %14, 0 -; IR: %16 = extractvalue { i1, i64 } %14, 1 -; IR: br i1 %15, label %exit1, label %Flow2 +; IR: %15 = phi i1 [ %SwitchLeaf, %LeafBlock ], [ %4, %Flow ] +; IR: %16 = phi i1 [ %9, %LeafBlock ], [ %5, %Flow ] +; IR: call void @llvm.amdgcn.end.cf(i64 %8) +; IR: %17 = call { i1, i64 } @llvm.amdgcn.if(i1 %16) +; IR: %18 = extractvalue { i1, i64 } %17, 0 +; IR: %19 = extractvalue { i1, i64 } %17, 1 +; IR: br i1 %18, label %exit1, label %Flow2 ; IR: exit1: ; IR: store volatile i32 17, i32 addrspace(3)* undef ; IR: br label %Flow2 ; IR: UnifiedReturnBlock: -; IR: call void @llvm.amdgcn.end.cf(i64 %11) +; IR: call void @llvm.amdgcn.end.cf(i64 %14) ; IR: ret void @@ -65,9 +64,11 @@ ; GCN: s_xor_b64 -; GCN: ; %LeafBlock -; GCN: v_cmp_ne_u32_e32 vcc, 1, [[REG:v[0-9]+]] +; FIXME: Why is this compare essentially repeated? 
+; GCN: v_cmp_eq_u32_e32 vcc, 1, [[REG:v[0-9]+]] +; GCN-NEXT: v_cmp_ne_u32_e64 s{{\[[0-9]+:[0-9]+\]}}, 1, [[REG]] ; GCN: v_cndmask_b32_e64 v{{[0-9]+}}, 0, -1, vcc +; GCN: v_cndmask_b32_e64 v{{[0-9]+}}, 0, -1 ; GCN: ; %Flow1 ; GCN-NEXT: s_or_b64 exec, exec @@ -125,15 +126,14 @@ exit1: ; preds = %LeafBlock, %LeafBlock1 } ; IR-LABEL: @multi_divergent_region_exit_unreachable_unreachable( -; IR: %Pivot = icmp sge i32 %tmp16, 2 -; IR-NEXT: %0 = call { i1, i64 } @llvm.amdgcn.if(i1 %Pivot) +; IR: %1 = call { i1, i64 } @llvm.amdgcn.if(i1 %0) -; IR: %5 = call { i1, i64 } @llvm.amdgcn.else(i64 %2) +; IR: %6 = call { i1, i64 } @llvm.amdgcn.else(i64 %3) -; IR: %8 = phi i1 [ false, %exit1 ], [ %12, %Flow1 ] -; IR: call void @llvm.amdgcn.end.cf(i64 %16) -; IR: %9 = call { i1, i64 } @llvm.amdgcn.if(i1 %8) -; IR: br i1 %10, label %exit0, label %UnifiedUnreachableBlock +; IR: %11 = phi i1 [ false, %exit1 ], [ %15, %Flow1 ] +; IR: call void @llvm.amdgcn.end.cf(i64 %19) +; IR: %12 = call { i1, i64 } @llvm.amdgcn.if(i1 %11) +; IR: br i1 %13, label %exit0, label %UnifiedUnreachableBlock ; IR: UnifiedUnreachableBlock: @@ -181,49 +181,51 @@ exit1: ; preds = %LeafBlock, %LeafBlock1 } ; IR-LABEL: @multi_exit_region_divergent_ret_uniform_ret( -; IR: %divergent.cond0 = icmp sge i32 %tmp16, 2 +; IR: %divergent.cond0 = icmp slt i32 %tmp16, 2 ; IR: llvm.amdgcn.if ; IR: br i1 ; IR: {{^}}Flow: -; IR: %3 = phi i1 [ true, %LeafBlock1 ], [ false, %entry ] -; IR: %4 = phi i1 [ %uniform.cond0, %LeafBlock1 ], [ false, %entry ] -; IR: %5 = call { i1, i64 } @llvm.amdgcn.else(i64 %2) -; IR: br i1 %6, label %LeafBlock, label %Flow1 +; IR: %4 = phi i1 [ true, %LeafBlock1 ], [ false, %entry ] +; IR: %5 = phi i1 [ %10, %LeafBlock1 ], [ false, %entry ] +; IR: %6 = call { i1, i64 } @llvm.amdgcn.else(i64 %3) +; IR: br i1 %7, label %LeafBlock, label %Flow1 ; IR: {{^}}LeafBlock: -; IR: %divergent.cond1 = icmp ne i32 %tmp16, 1 +; IR: %divergent.cond1 = icmp eq i32 %tmp16, 1 +; IR: %9 = xor i1 %divergent.cond1, true ; IR: br label %Flow1 ; IR: LeafBlock1: -; IR: %uniform.cond0 = icmp ne i32 %arg3, 2 +; IR: %uniform.cond0 = icmp eq i32 %arg3, 2 +; IR: %10 = xor i1 %uniform.cond0, true ; IR: br label %Flow ; IR: Flow2: -; IR: %8 = phi i1 [ false, %exit1 ], [ %12, %Flow1 ] -; IR: call void @llvm.amdgcn.end.cf(i64 %16) -; IR: %9 = call { i1, i64 } @llvm.amdgcn.if(i1 %8) -; IR: br i1 %10, label %exit0, label %UnifiedReturnBlock +; IR: %11 = phi i1 [ false, %exit1 ], [ %15, %Flow1 ] +; IR: call void @llvm.amdgcn.end.cf(i64 %19) +; IR: %12 = call { i1, i64 } @llvm.amdgcn.if(i1 %11) +; IR: br i1 %13, label %exit0, label %UnifiedReturnBlock ; IR: exit0: ; IR: store volatile i32 9, i32 addrspace(1)* undef ; IR: br label %UnifiedReturnBlock ; IR: {{^}}Flow1: -; IR: %12 = phi i1 [ %divergent.cond1, %LeafBlock ], [ %3, %Flow ] -; IR: %13 = phi i1 [ %divergent.cond1, %LeafBlock ], [ %4, %Flow ] -; IR: call void @llvm.amdgcn.end.cf(i64 %7) -; IR: %14 = call { i1, i64 } @llvm.amdgcn.if(i1 %13) -; IR: %15 = extractvalue { i1, i64 } %14, 0 -; IR: %16 = extractvalue { i1, i64 } %14, 1 -; IR: br i1 %15, label %exit1, label %Flow2 +; IR: %15 = phi i1 [ %divergent.cond1, %LeafBlock ], [ %4, %Flow ] +; IR: %16 = phi i1 [ %9, %LeafBlock ], [ %5, %Flow ] +; IR: call void @llvm.amdgcn.end.cf(i64 %8) +; IR: %17 = call { i1, i64 } @llvm.amdgcn.if(i1 %16) +; IR: %18 = extractvalue { i1, i64 } %17, 0 +; IR: %19 = extractvalue { i1, i64 } %17, 1 +; IR: br i1 %18, label %exit1, label %Flow2 ; IR: exit1: ; IR: store volatile i32 17, i32 addrspace(3)* undef ; IR: br label 
%Flow2 ; IR: UnifiedReturnBlock: -; IR: call void @llvm.amdgcn.end.cf(i64 %11) +; IR: call void @llvm.amdgcn.end.cf(i64 %14) ; IR: ret void define amdgpu_kernel void @multi_exit_region_divergent_ret_uniform_ret(i32 addrspace(1)* nocapture %arg0, i32 addrspace(1)* nocapture %arg1, i32 addrspace(1)* nocapture %arg2, i32 %arg3) #0 { entry: @@ -262,18 +264,17 @@ exit1: ; preds = %LeafBlock, %LeafBlock1 } ; IR-LABEL: @multi_exit_region_uniform_ret_divergent_ret( -; IR: %Pivot = icmp sge i32 %tmp16, 2 -; IR-NEXT: %0 = call { i1, i64 } @llvm.amdgcn.if(i1 %Pivot) -; IR: br i1 %1, label %LeafBlock1, label %Flow +; IR: %1 = call { i1, i64 } @llvm.amdgcn.if(i1 %0) +; IR: br i1 %2, label %LeafBlock1, label %Flow ; IR: Flow: -; IR: %3 = phi i1 [ true, %LeafBlock1 ], [ false, %entry ] -; IR: %4 = phi i1 [ %SwitchLeaf2, %LeafBlock1 ], [ false, %entry ] -; IR: %5 = call { i1, i64 } @llvm.amdgcn.else(i64 %2) +; IR: %4 = phi i1 [ true, %LeafBlock1 ], [ false, %entry ] +; IR: %5 = phi i1 [ %10, %LeafBlock1 ], [ false, %entry ] +; IR: %6 = call { i1, i64 } @llvm.amdgcn.else(i64 %3) -; IR: %8 = phi i1 [ false, %exit1 ], [ %12, %Flow1 ] -; IR: call void @llvm.amdgcn.end.cf(i64 %16) -; IR: %9 = call { i1, i64 } @llvm.amdgcn.if(i1 %8) +; IR: %11 = phi i1 [ false, %exit1 ], [ %15, %Flow1 ] +; IR: call void @llvm.amdgcn.end.cf(i64 %19) +; IR: %12 = call { i1, i64 } @llvm.amdgcn.if(i1 %11) define amdgpu_kernel void @multi_exit_region_uniform_ret_divergent_ret(i32 addrspace(1)* nocapture %arg0, i32 addrspace(1)* nocapture %arg1, i32 addrspace(1)* nocapture %arg2, i32 %arg3) #0 { entry: @@ -313,13 +314,13 @@ exit1: ; preds = %LeafBlock, %LeafBlock1 ; IR-LABEL: @multi_divergent_region_exit_ret_ret_return_value( ; IR: Flow2: -; IR: %8 = phi float [ 2.000000e+00, %exit1 ], [ undef, %Flow1 ] -; IR: %9 = phi i1 [ false, %exit1 ], [ %13, %Flow1 ] -; IR: call void @llvm.amdgcn.end.cf(i64 %17) +; IR: %11 = phi float [ 2.000000e+00, %exit1 ], [ undef, %Flow1 ] +; IR: %12 = phi i1 [ false, %exit1 ], [ %16, %Flow1 ] +; IR: call void @llvm.amdgcn.end.cf(i64 %20) ; IR: UnifiedReturnBlock: -; IR: %UnifiedRetVal = phi float [ %8, %Flow2 ], [ 1.000000e+00, %exit0 ] -; IR: call void @llvm.amdgcn.end.cf(i64 %12) +; IR: %UnifiedRetVal = phi float [ %11, %Flow2 ], [ 1.000000e+00, %exit0 ] +; IR: call void @llvm.amdgcn.end.cf(i64 %15) ; IR: ret float %UnifiedRetVal define amdgpu_ps float @multi_divergent_region_exit_ret_ret_return_value(i32 %vgpr) #0 { entry: @@ -386,32 +387,31 @@ exit1: ; preds = %LeafBlock, %LeafBlock1 } ; IR-LABEL: @multi_divergent_region_exit_ret_unreachable( -; IR: %Pivot = icmp sge i32 %tmp16, 2 -; IR-NEXT: %0 = call { i1, i64 } @llvm.amdgcn.if(i1 %Pivot) +; IR: %1 = call { i1, i64 } @llvm.amdgcn.if(i1 %0) ; IR: Flow: -; IR: %3 = phi i1 [ true, %LeafBlock1 ], [ false, %entry ] -; IR: %4 = phi i1 [ %SwitchLeaf2, %LeafBlock1 ], [ false, %entry ] -; IR: %5 = call { i1, i64 } @llvm.amdgcn.else(i64 %2) +; IR: %4 = phi i1 [ true, %LeafBlock1 ], [ false, %entry ] +; IR: %5 = phi i1 [ %10, %LeafBlock1 ], [ false, %entry ] +; IR: %6 = call { i1, i64 } @llvm.amdgcn.else(i64 %3) ; IR: Flow2: -; IR: %8 = phi i1 [ false, %exit1 ], [ %12, %Flow1 ] -; IR: call void @llvm.amdgcn.end.cf(i64 %16) -; IR: %9 = call { i1, i64 } @llvm.amdgcn.if(i1 %8) -; IR: br i1 %10, label %exit0, label %UnifiedReturnBlock +; IR: %11 = phi i1 [ false, %exit1 ], [ %15, %Flow1 ] +; IR: call void @llvm.amdgcn.end.cf(i64 %19) +; IR: %12 = call { i1, i64 } @llvm.amdgcn.if(i1 %11) +; IR: br i1 %13, label %exit0, label %UnifiedReturnBlock ; IR: exit0: ; 
IR-NEXT: store volatile i32 17, i32 addrspace(3)* undef ; IR-NEXT: br label %UnifiedReturnBlock ; IR: Flow1: -; IR: %12 = phi i1 [ %SwitchLeaf, %LeafBlock ], [ %3, %Flow ] -; IR: %13 = phi i1 [ %SwitchLeaf, %LeafBlock ], [ %4, %Flow ] -; IR: call void @llvm.amdgcn.end.cf(i64 %7) -; IR: %14 = call { i1, i64 } @llvm.amdgcn.if(i1 %13) -; IR: %15 = extractvalue { i1, i64 } %14, 0 -; IR: %16 = extractvalue { i1, i64 } %14, 1 -; IR: br i1 %15, label %exit1, label %Flow2 +; IR: %15 = phi i1 [ %SwitchLeaf, %LeafBlock ], [ %4, %Flow ] +; IR: %16 = phi i1 [ %9, %LeafBlock ], [ %5, %Flow ] +; IR: call void @llvm.amdgcn.end.cf(i64 %8) +; IR: %17 = call { i1, i64 } @llvm.amdgcn.if(i1 %16) +; IR: %18 = extractvalue { i1, i64 } %17, 0 +; IR: %19 = extractvalue { i1, i64 } %17, 1 +; IR: br i1 %18, label %exit1, label %Flow2 ; IR: exit1: ; IR-NEXT: store volatile i32 9, i32 addrspace(1)* undef @@ -419,7 +419,7 @@ exit1: ; preds = %LeafBlock, %LeafBlock1 ; IR-NEXT: br label %Flow2 ; IR: UnifiedReturnBlock: -; IR-NEXT: call void @llvm.amdgcn.end.cf(i64 %11) +; IR-NEXT: call void @llvm.amdgcn.end.cf(i64 %14) ; IR-NEXT: ret void define amdgpu_kernel void @multi_divergent_region_exit_ret_unreachable(i32 addrspace(1)* nocapture %arg0, i32 addrspace(1)* nocapture %arg1, i32 addrspace(1)* nocapture %arg2) #0 { entry: @@ -475,7 +475,7 @@ exit1: ; preds = %LeafBlock, %LeafBlock1 ; IR-NEXT: br label %Flow2 ; IR: UnifiedReturnBlock: ; preds = %exit0, %Flow2 -; IR-NEXT: call void @llvm.amdgcn.end.cf(i64 %11) +; IR-NEXT: call void @llvm.amdgcn.end.cf(i64 %14) ; IR-NEXT: ret void define amdgpu_kernel void @indirect_multi_divergent_region_exit_ret_unreachable(i32 addrspace(1)* nocapture %arg0, i32 addrspace(1)* nocapture %arg1, i32 addrspace(1)* nocapture %arg2) #0 { entry: @@ -622,15 +622,15 @@ uniform.ret: ; IR-LABEL: @uniform_complex_multi_ret_nest_in_divergent_triangle( ; IR: Flow1: ; preds = %uniform.ret1, %uniform.multi.exit.region -; IR: %6 = phi i1 [ false, %uniform.ret1 ], [ true, %uniform.multi.exit.region ] -; IR: br i1 %6, label %uniform.if, label %Flow2 +; IR: %8 = phi i1 [ false, %uniform.ret1 ], [ true, %uniform.multi.exit.region ] +; IR: br i1 %8, label %uniform.if, label %Flow2 ; IR: Flow: ; preds = %uniform.then, %uniform.if -; IR: %7 = phi i1 [ %uniform.cond2, %uniform.then ], [ %uniform.cond1, %uniform.if ] -; IR: br i1 %7, label %uniform.endif, label %uniform.ret0 +; IR: %11 = phi i1 [ %10, %uniform.then ], [ %9, %uniform.if ] +; IR: br i1 %11, label %uniform.endif, label %uniform.ret0 ; IR: UnifiedReturnBlock: ; preds = %Flow3, %Flow2 -; IR-NEXT: call void @llvm.amdgcn.end.cf(i64 %5) +; IR-NEXT: call void @llvm.amdgcn.end.cf(i64 %6) ; IR-NEXT: ret void define amdgpu_kernel void @uniform_complex_multi_ret_nest_in_divergent_triangle(i32 %arg0) #0 { entry: diff --git a/test/CodeGen/AMDGPU/nested-loop-conditions.ll b/test/CodeGen/AMDGPU/nested-loop-conditions.ll index c0b4eaf..672549c 100644 --- a/test/CodeGen/AMDGPU/nested-loop-conditions.ll +++ b/test/CodeGen/AMDGPU/nested-loop-conditions.ll @@ -133,9 +133,9 @@ bb23: ; preds = %bb10 ; IR: Flow1: ; IR-NEXT: %loop.phi = phi i64 [ %loop.phi9, %Flow6 ], [ %phi.broken, %bb14 ] -; IR-NEXT: %13 = phi <4 x i32> [ %28, %Flow6 ], [ undef, %bb14 ] -; IR-NEXT: %14 = phi i32 [ %29, %Flow6 ], [ undef, %bb14 ] -; IR-NEXT: %15 = phi i1 [ %30, %Flow6 ], [ false, %bb14 ] +; IR-NEXT: %13 = phi <4 x i32> [ %29, %Flow6 ], [ undef, %bb14 ] +; IR-NEXT: %14 = phi i32 [ %30, %Flow6 ], [ undef, %bb14 ] +; IR-NEXT: %15 = phi i1 [ %31, %Flow6 ], [ false, %bb14 ] ; IR-NEXT: %16 = 
phi i1 [ false, %Flow6 ], [ %8, %bb14 ] ; IR-NEXT: %17 = call i64 @llvm.amdgcn.else.break(i64 %11, i64 %loop.phi) ; IR-NEXT: call void @llvm.amdgcn.end.cf(i64 %11) @@ -144,9 +144,9 @@ bb23: ; preds = %bb10 ; IR: Flow2: ; IR-NEXT: %loop.phi10 = phi i64 [ %loop.phi11, %Flow5 ], [ %12, %bb16 ] -; IR-NEXT: %19 = phi <4 x i32> [ %28, %Flow5 ], [ undef, %bb16 ] -; IR-NEXT: %20 = phi i32 [ %29, %Flow5 ], [ undef, %bb16 ] -; IR-NEXT: %21 = phi i1 [ %30, %Flow5 ], [ false, %bb16 ] +; IR-NEXT: %19 = phi <4 x i32> [ %29, %Flow5 ], [ undef, %bb16 ] +; IR-NEXT: %20 = phi i32 [ %30, %Flow5 ], [ undef, %bb16 ] +; IR-NEXT: %21 = phi i1 [ %31, %Flow5 ], [ false, %bb16 ] ; IR-NEXT: %22 = phi i1 [ false, %Flow5 ], [ false, %bb16 ] ; IR-NEXT: %23 = phi i1 [ false, %Flow5 ], [ %8, %bb16 ] ; IR-NEXT: %24 = call { i1, i64 } @llvm.amdgcn.if(i1 %23) @@ -156,15 +156,16 @@ bb23: ; preds = %bb10 ; IR: bb21: ; IR: %tmp12 = icmp slt i32 %tmp11, 9 -; IR-NEXT: %27 = call i64 @llvm.amdgcn.if.break(i1 %tmp12, i64 %phi.broken) +; IR-NEXT: %27 = xor i1 %tmp12, true +; IR-NEXT: %28 = call i64 @llvm.amdgcn.if.break(i1 %27, i64 %phi.broken) ; IR-NEXT: br label %Flow3 ; IR: Flow3: ; IR-NEXT: %loop.phi11 = phi i64 [ %phi.broken, %bb21 ], [ %phi.broken, %Flow2 ] -; IR-NEXT: %loop.phi9 = phi i64 [ %27, %bb21 ], [ %loop.phi10, %Flow2 ] -; IR-NEXT: %28 = phi <4 x i32> [ %tmp9, %bb21 ], [ %19, %Flow2 ] -; IR-NEXT: %29 = phi i32 [ %tmp10, %bb21 ], [ %20, %Flow2 ] -; IR-NEXT: %30 = phi i1 [ %tmp12, %bb21 ], [ %21, %Flow2 ] +; IR-NEXT: %loop.phi9 = phi i64 [ %28, %bb21 ], [ %loop.phi10, %Flow2 ] +; IR-NEXT: %29 = phi <4 x i32> [ %tmp9, %bb21 ], [ %19, %Flow2 ] +; IR-NEXT: %30 = phi i32 [ %tmp10, %bb21 ], [ %20, %Flow2 ] +; IR-NEXT: %31 = phi i1 [ %27, %bb21 ], [ %21, %Flow2 ] ; IR-NEXT: call void @llvm.amdgcn.end.cf(i64 %26) ; IR-NEXT: br i1 %22, label %bb31.loopexit, label %Flow4 diff --git a/test/CodeGen/AMDGPU/private-access-no-objects.ll b/test/CodeGen/AMDGPU/private-access-no-objects.ll index af26835..dcb0890 100644 --- a/test/CodeGen/AMDGPU/private-access-no-objects.ll +++ b/test/CodeGen/AMDGPU/private-access-no-objects.ll @@ -1,7 +1,7 @@ -; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=fiji -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI -check-prefix=OPT %s -; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=hawaii -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=CI -check-prefix=OPT %s -; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=iceland -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI -check-prefix=OPT %s -; RUN: llc -O0 -mtriple=amdgcn--amdhsa -mcpu=fiji -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=OPTNONE %s +; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=fiji -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCN -check-prefix=VI -check-prefix=OPT %s +; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=hawaii -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCN -check-prefix=CI -check-prefix=OPT %s +; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=iceland -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCN -check-prefix=VI -check-prefix=OPT %s +; RUN: llc -O0 -mtriple=amdgcn--amdhsa -mcpu=fiji -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCN -check-prefix=OPTNONE %s ; There are no stack objects, but still a private memory access. 
The ; private access regiters need to be correctly initialized anyway, and @@ -27,9 +27,9 @@ define amdgpu_kernel void @store_to_undef() #0 { ; OPT-DAG: s_mov_b64 s{{\[}}[[RSRC_LO:[0-9]+]]:{{[0-9]+\]}}, s[0:1] ; OPT-DAG: s_mov_b64 s{{\[[0-9]+}}:[[RSRC_HI:[0-9]+]]{{\]}}, s[2:3] ; OPT-DAG: s_mov_b32 [[SOFFSET:s[0-9]+]], s7{{$}} -; OPT: buffer_store_dword v{{[0-9]+}}, v{{[0-9]+}}, s{{\[}}[[RSRC_LO]]:[[RSRC_HI]]{{\]}}, [[SOFFSET]] offen{{$}} +; OPT: buffer_store_dword v{{[0-9]+}}, off, s{{\[}}[[RSRC_LO]]:[[RSRC_HI]]{{\]}}, [[SOFFSET]] offset:124{{$}} define amdgpu_kernel void @store_to_inttoptr() #0 { - store volatile i32 0, i32* inttoptr (i32 123 to i32*) + store volatile i32 0, i32* inttoptr (i32 124 to i32*) ret void } @@ -47,9 +47,9 @@ define amdgpu_kernel void @load_from_undef() #0 { ; OPT-DAG: s_mov_b64 s{{\[}}[[RSRC_LO:[0-9]+]]:{{[0-9]+\]}}, s[0:1] ; OPT-DAG: s_mov_b64 s{{\[[0-9]+}}:[[RSRC_HI:[0-9]+]]{{\]}}, s[2:3] ; OPT-DAG: s_mov_b32 [[SOFFSET:s[0-9]+]], s7{{$}} -; OPT: buffer_load_dword v{{[0-9]+}}, v{{[0-9]+}}, s{{\[}}[[RSRC_LO]]:[[RSRC_HI]]{{\]}}, [[SOFFSET]] offen{{$}} +; OPT: buffer_load_dword v{{[0-9]+}}, off, s{{\[}}[[RSRC_LO]]:[[RSRC_HI]]{{\]}}, [[SOFFSET]] offset:124{{$}} define amdgpu_kernel void @load_from_inttoptr() #0 { - %ld = load volatile i32, i32* inttoptr (i32 123 to i32*) + %ld = load volatile i32, i32* inttoptr (i32 124 to i32*) ret void } diff --git a/test/CodeGen/AMDGPU/readcyclecounter.ll b/test/CodeGen/AMDGPU/readcyclecounter.ll index 5c698c8..d7b353c 100644 --- a/test/CodeGen/AMDGPU/readcyclecounter.ll +++ b/test/CodeGen/AMDGPU/readcyclecounter.ll @@ -22,4 +22,18 @@ define amdgpu_kernel void @test_readcyclecounter(i64 addrspace(1)* %out) #0 { ret void } +; This test used to crash in ScheduleDAG. +; +; GCN-LABEL: {{^}}test_readcyclecounter_smem: +; SI-DAG: s_memtime +; VI-DAG: s_memrealtime +; GCN-DAG: s_load_dword +define amdgpu_cs i32 @test_readcyclecounter_smem(i64 addrspace(2)* inreg %in) #0 { + %cycle0 = call i64 @llvm.readcyclecounter() + %in.v = load i64, i64 addrspace(2)* %in + %r.64 = add i64 %cycle0, %in.v + %r.32 = trunc i64 %r.64 to i32 + ret i32 %r.32 +} + attributes #0 = { nounwind } diff --git a/test/CodeGen/AMDGPU/ret_jump.ll b/test/CodeGen/AMDGPU/ret_jump.ll index 748f98a..f2fbacb 100644 --- a/test/CodeGen/AMDGPU/ret_jump.ll +++ b/test/CodeGen/AMDGPU/ret_jump.ll @@ -56,7 +56,7 @@ ret.bb: ; preds = %else, %main_body } ; GCN-LABEL: {{^}}uniform_br_nontrivial_ret_divergent_br_nontrivial_unreachable: -; GCN: s_cbranch_scc1 [[RET_BB:BB[0-9]+_[0-9]+]] +; GCN: s_cbranch_vccnz [[RET_BB:BB[0-9]+_[0-9]+]] ; GCN: ; BB#{{[0-9]+}}: ; %else ; GCN: s_and_saveexec_b64 [[SAVE_EXEC:s\[[0-9]+:[0-9]+\]]], vcc diff --git a/test/CodeGen/AMDGPU/sext-in-reg.ll b/test/CodeGen/AMDGPU/sext-in-reg.ll index b702e1c..160fb6a 100644 --- a/test/CodeGen/AMDGPU/sext-in-reg.ll +++ b/test/CodeGen/AMDGPU/sext-in-reg.ll @@ -1,6 +1,6 @@ ; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI -check-prefix=FUNC %s ; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI -check-prefix=GFX89 -check-prefix=FUNC %s -; RUN: llc -march=amdgcn -mcpu=gfx901 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=GFX9 -check-prefix=GFX89 -check-prefix=FUNC %s +; RUN: llc -march=amdgcn -mcpu=gfx901 -mattr=-flat-for-global -verify-machineinstrs -enable-packed-inlinable-literals < %s | FileCheck -check-prefix=GCN -check-prefix=GFX9 
-check-prefix=GFX89 -check-prefix=FUNC %s ; RUN: llc -march=r600 -mcpu=cypress < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s ; FIXME: i16 promotion pass ruins the scalar cases when legal. diff --git a/test/CodeGen/AMDGPU/shl.v2i16.ll b/test/CodeGen/AMDGPU/shl.v2i16.ll index eac29ba..115221c 100644 --- a/test/CodeGen/AMDGPU/shl.v2i16.ll +++ b/test/CodeGen/AMDGPU/shl.v2i16.ll @@ -1,4 +1,4 @@ -; RUN: llc -march=amdgcn -mcpu=gfx901 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=GFX9 %s +; RUN: llc -march=amdgcn -mcpu=gfx901 -mattr=-flat-for-global -verify-machineinstrs -enable-packed-inlinable-literals < %s | FileCheck -check-prefix=GCN -check-prefix=GFX9 %s ; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI -check-prefix=CIVI %s ; RUN: llc -march=amdgcn -mcpu=bonaire -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=CI -check-prefix=CIVI %s diff --git a/test/CodeGen/AMDGPU/sminmax.v2i16.ll b/test/CodeGen/AMDGPU/sminmax.v2i16.ll index 4e093cd..16ce86b 100644 --- a/test/CodeGen/AMDGPU/sminmax.v2i16.ll +++ b/test/CodeGen/AMDGPU/sminmax.v2i16.ll @@ -1,4 +1,4 @@ -; RUN: llc -march=amdgcn -mcpu=gfx901 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GFX9 -check-prefix=GCN %s +; RUN: llc -march=amdgcn -mcpu=gfx901 -mattr=-flat-for-global -verify-machineinstrs -enable-packed-inlinable-literals < %s | FileCheck -check-prefix=GFX9 -check-prefix=GCN %s ; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=VI -check-prefix=CIVI -check-prefix=GCN %s ; RUN: llc -march=amdgcn -mcpu=bonaire -verify-machineinstrs < %s | FileCheck -check-prefix=CI -check-prefix=CIVI -check-prefix=GCN %s diff --git a/test/CodeGen/AMDGPU/spill-m0.ll b/test/CodeGen/AMDGPU/spill-m0.ll index 0e715c4..8f1aebf 100644 --- a/test/CodeGen/AMDGPU/spill-m0.ll +++ b/test/CodeGen/AMDGPU/spill-m0.ll @@ -69,19 +69,20 @@ endif: ; TOSMEM-NOT: s_m0 ; TOSMEM: s_add_u32 m0, s7, 0x100 ; TOSMEM-NEXT: s_buffer_store_dword s{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}}, m0 ; 4-byte Folded Spill -; TOSMEM-NOT: m0 +; FIXME: RegScavenger::isRegUsed() always returns true if m0 is reserved, so we have to save and restore it +; FIXME-TOSMEM-NOT: m0 -; TOSMEM-NOT: m0 +; FIXME-TOSMEM-NOT: m0 ; TOSMEM: s_add_u32 m0, s7, 0x200 ; TOSMEM: s_buffer_store_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, m0 ; 8-byte Folded Spill -; TOSMEM-NOT: m0 +; FIXME-TOSMEM-NOT: m0 ; TOSMEM: s_mov_b64 exec, ; TOSMEM: s_cbranch_execz ; TOSMEM: s_branch ; TOSMEM: BB{{[0-9]+_[0-9]+}}: -; TOSMEM-NEXT: s_add_u32 m0, s7, 0x200 +; TOSMEM: s_add_u32 m0, s7, 0x200 ; TOSMEM-NEXT: s_buffer_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, m0 ; 8-byte Folded Reload @@ -130,7 +131,7 @@ endif: ; preds = %else, %if ; TOSMEM: s_branch ; TOSMEM: BB{{[0-9]+_[0-9]+}}: -; TOSMEM-NEXT: s_add_u32 m0, s3, 0x100 +; TOSMEM: s_add_u32 m0, s3, 0x100 ; TOSMEM-NEXT: s_buffer_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, m0 ; 8-byte Folded Reload ; GCN-NOT: v_readlane_b32 m0 @@ -159,13 +160,14 @@ endif: ; GCN-LABEL: {{^}}restore_m0_lds: ; TOSMEM: s_load_dwordx2 [[REG:s\[[0-9]+:[0-9]+\]]] ; TOSMEM: s_cmp_eq_u32 -; TOSMEM-NOT: m0 +; FIXME: RegScavenger::isRegUsed() always returns true if m0 is reserved, so we have to save and restore it +; FIXME-TOSMEM-NOT: m0 ; TOSMEM: s_add_u32 m0, s3, 0x100 ; TOSMEM: s_buffer_store_dwordx2 [[REG]], 
s[88:91], m0 ; 8-byte Folded Spill -; TOSMEM-NOT: m0 +; FIXME-TOSMEM-NOT: m0 ; TOSMEM: s_add_u32 m0, s3, 0x300 ; TOSMEM: s_buffer_store_dword s{{[0-9]+}}, s[88:91], m0 ; 4-byte Folded Spill -; TOSMEM-NOT: m0 +; FIXME-TOSMEM-NOT: m0 ; TOSMEM: s_cbranch_scc1 ; TOSMEM: s_mov_b32 m0, -1 @@ -178,10 +180,10 @@ endif: ; TOSMEM: ds_write_b64 -; TOSMEM-NOT: m0 +; FIXME-TOSMEM-NOT: m0 ; TOSMEM: s_add_u32 m0, s3, 0x300 ; TOSMEM: s_buffer_load_dword s0, s[88:91], m0 ; 4-byte Folded Reload -; TOSMEM-NOT: m0 +; FIXME-TOSMEM-NOT: m0 ; TOSMEM: s_waitcnt lgkmcnt(0) ; TOSMEM-NOT: m0 ; TOSMEM: s_mov_b32 m0, s0 diff --git a/test/CodeGen/AMDGPU/sub.v2i16.ll b/test/CodeGen/AMDGPU/sub.v2i16.ll index 69f0acc..4313446 100644 --- a/test/CodeGen/AMDGPU/sub.v2i16.ll +++ b/test/CodeGen/AMDGPU/sub.v2i16.ll @@ -1,4 +1,4 @@ -; RUN: llc -march=amdgcn -mcpu=gfx901 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GFX9 -check-prefix=GCN %s +; RUN: llc -march=amdgcn -mcpu=gfx901 -mattr=-flat-for-global -verify-machineinstrs -enable-packed-inlinable-literals < %s | FileCheck -check-prefix=GFX9 -check-prefix=GCN %s ; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=VI -check-prefix=GCN %s ; FIXME: Need to handle non-uniform case for function below (load without gep). diff --git a/test/CodeGen/AMDGPU/trap.ll b/test/CodeGen/AMDGPU/trap.ll index 77ad895..51771c9 100644 --- a/test/CodeGen/AMDGPU/trap.ll +++ b/test/CodeGen/AMDGPU/trap.ll @@ -80,4 +80,25 @@ define amdgpu_kernel void @trap() { ret void } +; GCN-LABEL: {{^}}non_entry_trap: +; TRAP-BIT: enable_trap_handler = 1 +; NO-TRAP-BIT: enable_trap_handler = 0 + +; HSA: BB{{[0-9]_[0-9]+]]: ; %trap +; HSA-TRAP: s_mov_b64 s[0:1], s[4:5] +; HSA-TRAP-NEXT: s_trap 2 +define amdgpu_kernel void @non_entry_trap(i32 addrspace(1)* nocapture readonly %arg0) local_unnamed_addr #1 { +entry: + %tmp29 = load volatile i32, i32 addrspace(1)* %arg0 + %cmp = icmp eq i32 %tmp29, -1 + br i1 %cmp, label %ret, label %trap + +trap: + call void @llvm.trap() + unreachable + +ret: + ret void +} + attributes #0 = { nounwind noreturn } diff --git a/test/CodeGen/ARM/GlobalISel/arm-instruction-select.mir b/test/CodeGen/ARM/GlobalISel/arm-instruction-select.mir index 21c7741..83ab265 100644 --- a/test/CodeGen/ARM/GlobalISel/arm-instruction-select.mir +++ b/test/CodeGen/ARM/GlobalISel/arm-instruction-select.mir @@ -5,6 +5,8 @@ define void @test_sext_s8() { ret void } define void @test_zext_s16() { ret void } + define void @test_trunc_s32_16() { ret void } + define void @test_add_s8() { ret void } define void @test_add_s16() { ret void } define void @test_add_s32() { ret void } @@ -21,6 +23,9 @@ define void @test_mul_s32() #1 { ret void } define void @test_mulv5_s32() { ret void } + define void @test_sdiv_s32() #2 { ret void } + define void @test_udiv_s32() #2 { ret void } + define void @test_load_from_stack() { ret void } define void @test_load_f32() #0 { ret void } define void @test_load_f64() #0 { ret void } @@ -28,12 +33,14 @@ define void @test_stores() #0 { ret void } define void @test_gep() { ret void } - define void @test_constants() { ret void } + define void @test_constant_imm() { ret void } + define void @test_constant_cimm() { ret void } define void @test_soft_fp_double() #0 { ret void } attributes #0 = { "target-features"="+vfp2,-neonfp" } attributes #1 = { "target-features"="+v6" } + attributes #2 = { "target-features"="+hwdiv-arm" } ... 
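# The IR functions above mainly exist to give the MIR bodies below matching
# symbols and per-function target features; attribute #2 enables the ARM
# hardware divider so the new G_SDIV/G_UDIV cases can be selected directly.
# The selection they exercise is roughly (a sketch with placeholder vregs,
# not the checked output):
#
#   %2(s32) = G_SDIV %0(s32), %1(s32)   -->   %2 = SDIV %0, %1, 14, _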
--- name: test_zext_s1 @@ -142,6 +149,34 @@ body: | ; CHECK: BX_RET 14, _, implicit %r0 ... --- +name: test_trunc_s32_16 +# CHECK-LABEL: name: test_trunc_s32_16 +legalized: true +regBankSelected: true +selected: false +# CHECK: selected: true +registers: + - { id: 0, class: gprb } + - { id: 1, class: gprb } +# CHECK-DAG: id: 0, class: gpr +# CHECK-DAG: id: 1, class: gpr +body: | + bb.0: + liveins: %r0 + + %0(s32) = COPY %r0 + ; CHECK: [[VREGX:%[0-9]+]] = COPY %r0 + + %1(s16) = G_TRUNC %0(s32) + ; CHECK: [[VREGTRUNC:%[0-9]+]] = COPY [[VREGX]] + + %r0 = COPY %1(s16) + ; CHECK: %r0 = COPY [[VREGTRUNC]] + + BX_RET 14, _, implicit %r0 + ; CHECK: BX_RET 14, _, implicit %r0 +... +--- name: test_add_s8 # CHECK-LABEL: name: test_add_s8 legalized: true @@ -538,6 +573,72 @@ body: | ; CHECK: BX_RET 14, _, implicit %r0 ... --- +name: test_sdiv_s32 +# CHECK-LABEL: name: test_sdiv_s32 +legalized: true +regBankSelected: true +selected: false +# CHECK: selected: true +registers: + - { id: 0, class: gprb } + - { id: 1, class: gprb } + - { id: 2, class: gprb } +# CHECK: id: 0, class: gpr +# CHECK: id: 1, class: gpr +# CHECK: id: 2, class: gpr +body: | + bb.0: + liveins: %r0, %r1 + + %0(s32) = COPY %r0 + ; CHECK: [[VREGX:%[0-9]+]] = COPY %r0 + + %1(s32) = COPY %r1 + ; CHECK: [[VREGY:%[0-9]+]] = COPY %r1 + + %2(s32) = G_SDIV %0, %1 + ; CHECK: [[VREGRES:%[0-9]+]] = SDIV [[VREGX]], [[VREGY]], 14, _ + + %r0 = COPY %2(s32) + ; CHECK: %r0 = COPY [[VREGRES]] + + BX_RET 14, _, implicit %r0 + ; CHECK: BX_RET 14, _, implicit %r0 +... +--- +name: test_udiv_s32 +# CHECK-LABEL: name: test_udiv_s32 +legalized: true +regBankSelected: true +selected: false +# CHECK: selected: true +registers: + - { id: 0, class: gprb } + - { id: 1, class: gprb } + - { id: 2, class: gprb } +# CHECK: id: 0, class: gpr +# CHECK: id: 1, class: gpr +# CHECK: id: 2, class: gpr +body: | + bb.0: + liveins: %r0, %r1 + + %0(s32) = COPY %r0 + ; CHECK: [[VREGX:%[0-9]+]] = COPY %r0 + + %1(s32) = COPY %r1 + ; CHECK: [[VREGY:%[0-9]+]] = COPY %r1 + + %2(s32) = G_UDIV %0, %1 + ; CHECK: [[VREGRES:%[0-9]+]] = UDIV [[VREGX]], [[VREGY]], 14, _ + + %r0 = COPY %2(s32) + ; CHECK: %r0 = COPY [[VREGRES]] + + BX_RET 14, _, implicit %r0 + ; CHECK: BX_RET 14, _, implicit %r0 +... +--- name: test_load_from_stack # CHECK-LABEL: name: test_load_from_stack legalized: true @@ -714,8 +815,8 @@ body: | BX_RET 14, _, implicit %r0 ... --- -name: test_constants -# CHECK-LABEL: name: test_constants +name: test_constant_imm +# CHECK-LABEL: name: test_constant_imm legalized: true regBankSelected: true selected: false @@ -732,6 +833,26 @@ body: | BX_RET 14, _, implicit %r0 ... --- +name: test_constant_cimm +# CHECK-LABEL: name: test_constant_cimm +legalized: true +regBankSelected: true +selected: false +# CHECK: selected: true +registers: + - { id: 0, class: gprb } +# CHECK: id: [[C:[0-9]+]], class: gpr +body: | + bb.0: + ; Adding a type on G_CONSTANT changes its operand from an Imm into a CImm. + ; We still want to see the same thing in the output though. + %0(s32) = G_CONSTANT i32 42 + ; CHECK: %[[C]] = MOVi 42, 14, _, _ + + %r0 = COPY %0(s32) + BX_RET 14, _, implicit %r0 +... +--- name: test_soft_fp_double # CHECK-LABEL: name: test_soft_fp_double legalized: true diff --git a/test/CodeGen/ARM/GlobalISel/arm-isel-divmod.ll b/test/CodeGen/ARM/GlobalISel/arm-isel-divmod.ll new file mode 100644 index 0000000..2881740 --- /dev/null +++ b/test/CodeGen/ARM/GlobalISel/arm-isel-divmod.ll @@ -0,0 +1,68 @@ +; We use V6 ops so we can easily check for the extensions (sxth vs bit tricks). 
+; RUN: llc -mtriple arm-gnueabi -mattr=+v6,+hwdiv-arm -global-isel %s -o - | FileCheck %s -check-prefixes=CHECK,HWDIV +; RUN: llc -mtriple arm-gnueabi -mattr=+v6,-hwdiv-arm -global-isel %s -o - | FileCheck %s -check-prefixes=CHECK,SOFT-AEABI +; RUN: llc -mtriple arm-gnu -mattr=+v6,+hwdiv-arm -global-isel %s -o - | FileCheck %s -check-prefixes=CHECK,HWDIV +; RUN: llc -mtriple arm-gnu -mattr=+v6,-hwdiv-arm -global-isel %s -o - | FileCheck %s -check-prefixes=CHECK,SOFT-DEFAULT + +define arm_aapcscc i32 @test_sdiv_i32(i32 %a, i32 %b) { +; CHECK-LABEL: test_sdiv_i32: +; HWDIV: sdiv r0, r0, r1 +; SOFT-AEABI: blx __aeabi_idiv +; SOFT-DEFAULT: blx __divsi3 + %r = sdiv i32 %a, %b + ret i32 %r +} + +define arm_aapcscc i32 @test_udiv_i32(i32 %a, i32 %b) { +; CHECK-LABEL: test_udiv_i32: +; HWDIV: udiv r0, r0, r1 +; SOFT-AEABI: blx __aeabi_uidiv +; SOFT-DEFAULT: blx __udivsi3 + %r = udiv i32 %a, %b + ret i32 %r +} + +define arm_aapcscc i16 @test_sdiv_i16(i16 %a, i16 %b) { +; CHECK-LABEL: test_sdiv_i16: +; CHECK-DAG: sxth r0, r0 +; CHECK-DAG: sxth r1, r1 +; HWDIV: sdiv r0, r0, r1 +; SOFT-AEABI: blx __aeabi_idiv +; SOFT-DEFAULT: blx __divsi3 + %r = sdiv i16 %a, %b + ret i16 %r +} + +define arm_aapcscc i16 @test_udiv_i16(i16 %a, i16 %b) { +; CHECK-LABEL: test_udiv_i16: +; CHECK-DAG: uxth r0, r0 +; CHECK-DAG: uxth r1, r1 +; HWDIV: udiv r0, r0, r1 +; SOFT-AEABI: blx __aeabi_uidiv +; SOFT-DEFAULT: blx __udivsi3 + %r = udiv i16 %a, %b + ret i16 %r +} + +define arm_aapcscc i8 @test_sdiv_i8(i8 %a, i8 %b) { +; CHECK-LABEL: test_sdiv_i8: +; CHECK-DAG: sxtb r0, r0 +; CHECK-DAG: sxtb r1, r1 +; HWDIV: sdiv r0, r0, r1 +; SOFT-AEABI: blx __aeabi_idiv +; SOFT-DEFAULT: blx __divsi3 + %r = sdiv i8 %a, %b + ret i8 %r +} + +define arm_aapcscc i8 @test_udiv_i8(i8 %a, i8 %b) { +; CHECK-LABEL: test_udiv_i8: +; CHECK-DAG: uxtb r0, r0 +; CHECK-DAG: uxtb r1, r1 +; HWDIV: udiv r0, r0, r1 +; SOFT-AEABI: blx __aeabi_uidiv +; SOFT-DEFAULT: blx __udivsi3 + %r = udiv i8 %a, %b + ret i8 %r +} + diff --git a/test/CodeGen/ARM/GlobalISel/arm-isel.ll b/test/CodeGen/ARM/GlobalISel/arm-isel.ll index f3ca291..da02bfe 100644 --- a/test/CodeGen/ARM/GlobalISel/arm-isel.ll +++ b/test/CodeGen/ARM/GlobalISel/arm-isel.ll @@ -7,6 +7,14 @@ entry: ret void } +define i32 @test_constant_return_i32() { +; CHECK-LABEL: test_constant_return_i32: +; CHECK: mov r0, #42 +; CHECK: bx lr +entry: + ret i32 42 +} + define zeroext i1 @test_zext_i1(i1 %x) { ; CHECK-LABEL: test_zext_i1 ; CHECK: and r0, r0, #1 @@ -40,6 +48,30 @@ entry: ret i16 %x } +define void @test_trunc_i32_i16(i32 %v, i16 *%p) { +; CHECK-LABEL: test_trunc_i32_i16: +; The trunc doesn't result in any instructions, but we +; expect the store to be explicitly 16-bit. +; CHECK: strh r0, [r1] +; CHECK: bx lr +entry: + %v16 = trunc i32 %v to i16 + store i16 %v16, i16 *%p + ret void +} + +define void @test_trunc_i32_i8(i32 %v, i8 *%p) { +; CHECK-LABEL: test_trunc_i32_i8: +; The trunc doesn't result in any instructions, but we +; expect the store to be explicitly 8-bit. 
+; CHECK: strb r0, [r1] +; CHECK: bx lr +entry: + %v8 = trunc i32 %v to i8 + store i8 %v8, i8 *%p + ret void +} + define i8 @test_add_i8(i8 %x, i8 %y) { ; CHECK-LABEL: test_add_i8: ; CHECK: add r0, r0, r1 diff --git a/test/CodeGen/ARM/GlobalISel/arm-legalize-divmod.mir b/test/CodeGen/ARM/GlobalISel/arm-legalize-divmod.mir new file mode 100644 index 0000000..6f3e09d --- /dev/null +++ b/test/CodeGen/ARM/GlobalISel/arm-legalize-divmod.mir @@ -0,0 +1,230 @@ +# RUN: llc -mtriple arm-linux-gnueabi -mattr=+hwdiv-arm -global-isel -run-pass=legalizer %s -o - | FileCheck %s -check-prefixes=CHECK,HWDIV +# RUN: llc -mtriple arm-linux-gnueabi -mattr=-hwdiv-arm -global-isel -run-pass=legalizer %s -o - | FileCheck %s -check-prefixes=CHECK,SOFT,SOFT-AEABI +# RUN: llc -mtriple arm-linux-gnu -mattr=+hwdiv-arm -global-isel -run-pass=legalizer %s -o - | FileCheck %s -check-prefixes=CHECK,HWDIV +# RUN: llc -mtriple arm-linux-gnu -mattr=-hwdiv-arm -global-isel -run-pass=legalizer %s -o - | FileCheck %s -check-prefixes=CHECK,SOFT,SOFT-DEFAULT +--- | + define void @test_sdiv_i32() { ret void } + define void @test_udiv_i32() { ret void } + + define void @test_sdiv_i16() { ret void } + define void @test_udiv_i16() { ret void } + + define void @test_sdiv_i8() { ret void } + define void @test_udiv_i8() { ret void } +... +--- +name: test_sdiv_i32 +# CHECK-LABEL: name: test_sdiv_i32 +legalized: false +# CHECK: legalized: true +regBankSelected: false +selected: false +tracksRegLiveness: true +registers: + - { id: 0, class: _ } + - { id: 1, class: _ } + - { id: 2, class: _ } +body: | + bb.0: + liveins: %r0, %r1 + + ; CHECK-DAG: [[X:%[0-9]+]](s32) = COPY %r0 + ; CHECK-DAG: [[Y:%[0-9]+]](s32) = COPY %r1 + %0(s32) = COPY %r0 + %1(s32) = COPY %r1 + ; HWDIV: [[R:%[0-9]+]](s32) = G_SDIV [[X]], [[Y]] + ; SOFT: ADJCALLSTACKDOWN + ; SOFT-DAG: %r0 = COPY [[X]] + ; SOFT-DAG: %r1 = COPY [[Y]] + ; SOFT-AEABI: BLX $__aeabi_idiv, {{.*}}, implicit %r0, implicit %r1, implicit-def %r0 + ; SOFT-AEABI: [[R:%[0-9]+]](s32) = COPY %r0 + ; SOFT-DEFAULT: BLX $__divsi3, {{.*}}, implicit %r0, implicit %r1, implicit-def %r0 + ; SOFT-DEFAULT: [[R:%[0-9]+]](s32) = COPY %r0 + ; SOFT: ADJCALLSTACKUP + %2(s32) = G_SDIV %0, %1 + ; CHECK: %r0 = COPY [[R]] + %r0 = COPY %2(s32) + BX_RET 14, _, implicit %r0 +... +--- +name: test_udiv_i32 +# CHECK-LABEL: name: test_udiv_i32 +legalized: false +# CHECK: legalized: true +regBankSelected: false +selected: false +tracksRegLiveness: true +registers: + - { id: 0, class: _ } + - { id: 1, class: _ } + - { id: 2, class: _ } +body: | + bb.0: + liveins: %r0, %r1 + + ; CHECK-DAG: [[X:%[0-9]+]](s32) = COPY %r0 + ; CHECK-DAG: [[Y:%[0-9]+]](s32) = COPY %r1 + %0(s32) = COPY %r0 + %1(s32) = COPY %r1 + ; HWDIV: [[R:%[0-9]+]](s32) = G_UDIV [[X]], [[Y]] + ; SOFT: ADJCALLSTACKDOWN + ; SOFT-DAG: %r0 = COPY [[X]] + ; SOFT-DAG: %r1 = COPY [[Y]] + ; SOFT-AEABI: BLX $__aeabi_uidiv, {{.*}}, implicit %r0, implicit %r1, implicit-def %r0 + ; SOFT-AEABI: [[R:%[0-9]+]](s32) = COPY %r0 + ; SOFT-DEFAULT: BLX $__udivsi3, {{.*}}, implicit %r0, implicit %r1, implicit-def %r0 + ; SOFT-DEFAULT: [[R:%[0-9]+]](s32) = COPY %r0 + ; SOFT: ADJCALLSTACKUP + %2(s32) = G_UDIV %0, %1 + ; CHECK: %r0 = COPY [[R]] + %r0 = COPY %2(s32) + BX_RET 14, _, implicit %r0 +... 
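# The remaining documents cover the narrow types: the legalizer widens the
# s8/s16 operands to s32 (G_SEXT for the signed case, G_ZEXT for the unsigned
# one), reuses the s32 division (hardware G_SDIV/G_UDIV or the AEABI/libgcc
# libcall), and truncates the result back. A minimal sketch of the expected
# shape, with placeholder names rather than the checked vregs:
#
#   %x32(s32) = G_SEXT %x(s16)
#   %y32(s32) = G_SEXT %y(s16)
#   %r32(s32) = G_SDIV %x32, %y32
#   %r(s16)   = G_TRUNC %r32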
+--- +name: test_sdiv_i16 +# CHECK-LABEL: name: test_sdiv_i16 +legalized: false +# CHECK: legalized: true +regBankSelected: false +selected: false +tracksRegLiveness: true +registers: + - { id: 0, class: _ } + - { id: 1, class: _ } + - { id: 2, class: _ } +body: | + bb.0: + liveins: %r0, %r1 + + ; CHECK-DAG: [[X:%[0-9]+]](s16) = COPY %r0 + ; CHECK-DAG: [[Y:%[0-9]+]](s16) = COPY %r1 + ; CHECK-DAG: [[X32:%[0-9]+]](s32) = G_SEXT [[X]](s16) + ; CHECK-DAG: [[Y32:%[0-9]+]](s32) = G_SEXT [[Y]](s16) + %0(s16) = COPY %r0 + %1(s16) = COPY %r1 + ; HWDIV: [[R32:%[0-9]+]](s32) = G_SDIV [[X32]], [[Y32]] + ; SOFT: ADJCALLSTACKDOWN + ; SOFT-DAG: %r0 = COPY [[X32]] + ; SOFT-DAG: %r1 = COPY [[Y32]] + ; SOFT-AEABI: BLX $__aeabi_idiv, {{.*}}, implicit %r0, implicit %r1, implicit-def %r0 + ; SOFT-AEABI: [[R32:%[0-9]+]](s32) = COPY %r0 + ; SOFT-DEFAULT: BLX $__divsi3, {{.*}}, implicit %r0, implicit %r1, implicit-def %r0 + ; SOFT-DEFAULT: [[R32:%[0-9]+]](s32) = COPY %r0 + ; SOFT: ADJCALLSTACKUP + ; CHECK: [[R:%[0-9]+]](s16) = G_TRUNC [[R32]] + %2(s16) = G_SDIV %0, %1 + ; CHECK: %r0 = COPY [[R]] + %r0 = COPY %2(s16) + BX_RET 14, _, implicit %r0 +... +--- +name: test_udiv_i16 +# CHECK-LABEL: name: test_udiv_i16 +legalized: false +# CHECK: legalized: true +regBankSelected: false +selected: false +tracksRegLiveness: true +registers: + - { id: 0, class: _ } + - { id: 1, class: _ } + - { id: 2, class: _ } +body: | + bb.0: + liveins: %r0, %r1 + + ; CHECK-DAG: [[X:%[0-9]+]](s16) = COPY %r0 + ; CHECK-DAG: [[Y:%[0-9]+]](s16) = COPY %r1 + ; CHECK-DAG: [[X32:%[0-9]+]](s32) = G_ZEXT [[X]](s16) + ; CHECK-DAG: [[Y32:%[0-9]+]](s32) = G_ZEXT [[Y]](s16) + %0(s16) = COPY %r0 + %1(s16) = COPY %r1 + ; HWDIV: [[R32:%[0-9]+]](s32) = G_UDIV [[X32]], [[Y32]] + ; SOFT: ADJCALLSTACKDOWN + ; SOFT-DAG: %r0 = COPY [[X32]] + ; SOFT-DAG: %r1 = COPY [[Y32]] + ; SOFT-AEABI: BLX $__aeabi_uidiv, {{.*}}, implicit %r0, implicit %r1, implicit-def %r0 + ; SOFT-AEABI: [[R32:%[0-9]+]](s32) = COPY %r0 + ; SOFT-DEFAULT: BLX $__udivsi3, {{.*}}, implicit %r0, implicit %r1, implicit-def %r0 + ; SOFT-DEFAULT: [[R32:%[0-9]+]](s32) = COPY %r0 + ; SOFT: ADJCALLSTACKUP + ; CHECK: [[R:%[0-9]+]](s16) = G_TRUNC [[R32]] + %2(s16) = G_UDIV %0, %1 + ; CHECK: %r0 = COPY [[R]] + %r0 = COPY %2(s16) + BX_RET 14, _, implicit %r0 +... +--- +name: test_sdiv_i8 +# CHECK-LABEL: name: test_sdiv_i8 +legalized: false +# CHECK: legalized: true +regBankSelected: false +selected: false +tracksRegLiveness: true +registers: + - { id: 0, class: _ } + - { id: 1, class: _ } + - { id: 2, class: _ } +body: | + bb.0: + liveins: %r0, %r1 + + ; CHECK-DAG: [[X:%[0-9]+]](s8) = COPY %r0 + ; CHECK-DAG: [[Y:%[0-9]+]](s8) = COPY %r1 + ; CHECK-DAG: [[X32:%[0-9]+]](s32) = G_SEXT [[X]](s8) + ; CHECK-DAG: [[Y32:%[0-9]+]](s32) = G_SEXT [[Y]](s8) + %0(s8) = COPY %r0 + %1(s8) = COPY %r1 + ; HWDIV: [[R32:%[0-9]+]](s32) = G_SDIV [[X32]], [[Y32]] + ; SOFT: ADJCALLSTACKDOWN + ; SOFT-DAG: %r0 = COPY [[X32]] + ; SOFT-DAG: %r1 = COPY [[Y32]] + ; SOFT-AEABI: BLX $__aeabi_idiv, {{.*}}, implicit %r0, implicit %r1, implicit-def %r0 + ; SOFT-AEABI: [[R32:%[0-9]+]](s32) = COPY %r0 + ; SOFT-DEFAULT: BLX $__divsi3, {{.*}}, implicit %r0, implicit %r1, implicit-def %r0 + ; SOFT-DEFAULT: [[R32:%[0-9]+]](s32) = COPY %r0 + ; SOFT: ADJCALLSTACKUP + ; CHECK: [[R:%[0-9]+]](s8) = G_TRUNC [[R32]] + %2(s8) = G_SDIV %0, %1 + ; CHECK: %r0 = COPY [[R]] + %r0 = COPY %2(s8) + BX_RET 14, _, implicit %r0 +... 
+--- +name: test_udiv_i8 +# CHECK-LABEL: name: test_udiv_i8 +legalized: false +# CHECK: legalized: true +regBankSelected: false +selected: false +tracksRegLiveness: true +registers: + - { id: 0, class: _ } + - { id: 1, class: _ } + - { id: 2, class: _ } +body: | + bb.0: + liveins: %r0, %r1 + + ; CHECK-DAG: [[X:%[0-9]+]](s8) = COPY %r0 + ; CHECK-DAG: [[Y:%[0-9]+]](s8) = COPY %r1 + ; CHECK-DAG: [[X32:%[0-9]+]](s32) = G_ZEXT [[X]](s8) + ; CHECK-DAG: [[Y32:%[0-9]+]](s32) = G_ZEXT [[Y]](s8) + %0(s8) = COPY %r0 + %1(s8) = COPY %r1 + ; HWDIV: [[R32:%[0-9]+]](s32) = G_UDIV [[X32]], [[Y32]] + ; SOFT: ADJCALLSTACKDOWN + ; SOFT-DAG: %r0 = COPY [[X32]] + ; SOFT-DAG: %r1 = COPY [[Y32]] + ; SOFT-AEABI: BLX $__aeabi_uidiv, {{.*}}, implicit %r0, implicit %r1, implicit-def %r0 + ; SOFT-AEABI: [[R32:%[0-9]+]](s32) = COPY %r0 + ; SOFT-DEFAULT: BLX $__udivsi3, {{.*}}, implicit %r0, implicit %r1, implicit-def %r0 + ; SOFT-DEFAULT: [[R32:%[0-9]+]](s32) = COPY %r0 + ; SOFT: ADJCALLSTACKUP + ; CHECK: [[R:%[0-9]+]](s8) = G_TRUNC [[R32]] + %2(s8) = G_UDIV %0, %1 + ; CHECK: %r0 = COPY [[R]] + %r0 = COPY %2(s8) + BX_RET 14, _, implicit %r0 +... diff --git a/test/CodeGen/ARM/GlobalISel/arm-regbankselect.mir b/test/CodeGen/ARM/GlobalISel/arm-regbankselect.mir index e793583..4e94fb4 100644 --- a/test/CodeGen/ARM/GlobalISel/arm-regbankselect.mir +++ b/test/CodeGen/ARM/GlobalISel/arm-regbankselect.mir @@ -13,6 +13,9 @@ define void @test_mul_s16() { ret void } define void @test_mul_s8() { ret void } + define void @test_sdiv_s32() #1 { ret void } + define void @test_udiv_s32() #1 { ret void } + define void @test_loads() #0 { ret void } define void @test_stores() #0 { ret void } @@ -22,12 +25,15 @@ define void @test_constants() { ret void } + define void @test_trunc_s32_16() { ret void } + define void @test_fadd_s32() #0 { ret void } define void @test_fadd_s64() #0 { ret void } define void @test_soft_fp_s64() #0 { ret void } attributes #0 = { "target-features"="+vfp2"} + attributes #1 = { "target-features"="+hwdiv-arm" } ... --- name: test_add_s32 @@ -290,6 +296,58 @@ body: | ... --- +name: test_sdiv_s32 +# CHECK-LABEL: name: test_sdiv_s32 +legalized: true +regBankSelected: false +selected: false +# CHECK: registers: +# CHECK: - { id: 0, class: gprb } +# CHECK: - { id: 1, class: gprb } +# CHECK: - { id: 2, class: gprb } + +registers: + - { id: 0, class: _ } + - { id: 1, class: _ } + - { id: 2, class: _ } +body: | + bb.0: + liveins: %r0, %r1 + + %0(s32) = COPY %r0 + %1(s32) = COPY %r1 + %2(s32) = G_SDIV %0, %1 + %r0 = COPY %2(s32) + BX_RET 14, _, implicit %r0 + +... +--- +name: test_udiv_s32 +# CHECK-LABEL: name: test_udiv_s32 +legalized: true +regBankSelected: false +selected: false +# CHECK: registers: +# CHECK: - { id: 0, class: gprb } +# CHECK: - { id: 1, class: gprb } +# CHECK: - { id: 2, class: gprb } + +registers: + - { id: 0, class: _ } + - { id: 1, class: _ } + - { id: 2, class: _ } +body: | + bb.0: + liveins: %r0, %r1 + + %0(s32) = COPY %r0 + %1(s32) = COPY %r1 + %2(s32) = G_UDIV %0, %1 + %r0 = COPY %2(s32) + BX_RET 14, _, implicit %r0 + +... +--- name: test_loads # CHECK-LABEL: name: test_loads legalized: true @@ -442,6 +500,27 @@ body: | BX_RET 14, _, implicit %r0 ... 
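# The added regbankselect case below only pins down the bank assignment for a
# plain s32 -> s16 G_TRUNC: both virtual registers are expected to land on the
# general-purpose bank. Illustrative expectation (not the literal check lines):
#
#   registers:
#     - { id: 0, class: gprb }
#     - { id: 1, class: gprb }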
--- +name: test_trunc_s32_16 +# CHECK-LABEL: name: test_trunc_s32_16 +legalized: true +regBankSelected: false +selected: false +# CHECK: registers: +# CHECK: - { id: 0, class: gprb } +# CHECK: - { id: 1, class: gprb } +registers: + - { id: 0, class: _ } + - { id: 1, class: _ } +body: | + bb.0: + liveins: %r0 + + %0(s32) = COPY %r0 + %1(s16) = G_TRUNC %0(s32) + %r0 = COPY %1(s16) + BX_RET 14, _, implicit %r0 +... +--- name: test_fadd_s32 # CHECK-LABEL: name: test_fadd_s32 legalized: true diff --git a/test/CodeGen/ARM/GlobalISel/arm-unsupported.ll b/test/CodeGen/ARM/GlobalISel/arm-unsupported.ll new file mode 100644 index 0000000..e3680ed --- /dev/null +++ b/test/CodeGen/ARM/GlobalISel/arm-unsupported.ll @@ -0,0 +1,80 @@ +; RUN: llc -mtriple arm-unknown -verify-machineinstrs -global-isel -global-isel-abort=2 -pass-remarks-missed='gisel*' %s -o - 2>&1 | FileCheck %s + +; This file checks that we use the fallback path for things that are known to +; be unsupported on the ARM target. It should progressively shrink in size. + +define <4 x i32> @test_int_vectors(<4 x i32> %a, <4 x i32> %b) { +; CHECK: remark: {{.*}} unable to lower arguments: <4 x i32> (<4 x i32>, <4 x i32>)* +; CHECK-LABEL: warning: Instruction selection used fallback path for test_int_vectors + %res = add <4 x i32> %a, %b + ret <4 x i32> %res +} + +define <4 x float> @test_float_vectors(<4 x float> %a, <4 x float> %b) { +; CHECK: remark: {{.*}} unable to lower arguments: <4 x float> (<4 x float>, <4 x float>)* +; CHECK-LABEL: warning: Instruction selection used fallback path for test_float_vectors + %res = fadd <4 x float> %a, %b + ret <4 x float> %res +} + +define i64 @test_i64(i64 %a, i64 %b) { +; CHECK: remark: {{.*}} unable to lower arguments: i64 (i64, i64)* +; CHECK-LABEL: warning: Instruction selection used fallback path for test_i64 + %res = add i64 %a, %b + ret i64 %res +} + +define i128 @test_i128(i128 %a, i128 %b) { +; CHECK: remark: {{.*}} unable to lower arguments: i128 (i128, i128)* +; CHECK-LABEL: warning: Instruction selection used fallback path for test_i128 + %res = add i128 %a, %b + ret i128 %res +} + +define i17 @test_funny_ints(i17 %a, i17 %b) { +; CHECK: remark: {{.*}} unable to lower arguments: i17 (i17, i17)* +; CHECK-LABEL: warning: Instruction selection used fallback path for test_funny_ints + %res = add i17 %a, %b + ret i17 %res +} + +define half @test_half(half %a, half %b) { +; CHECK: remark: {{.*}} unable to lower arguments: half (half, half)* +; CHECK-LABEL: warning: Instruction selection used fallback path for test_half + %res = fadd half %a, %b + ret half %res +} + +; On ARM, clang lowers structs to arrays. +define void @test_arrays([2 x i32] %this.could.come.from.a.struct) { +; CHECK: remark: {{.*}} unable to lower arguments: void ([2 x i32])* +; CHECK-LABEL: warning: Instruction selection used fallback path for test_arrays + ret void +} + +define void @test_structs({i32, i32} %struct) { +; CHECK: remark: {{.*}} unable to lower arguments: void ({ i32, i32 })* +; CHECK-LABEL: warning: Instruction selection used fallback path for test_structs + ret void +} + +define void @test_vararg_definition(i32 %a, ...) { +; CHECK: remark: {{.*}} unable to lower arguments: void (i32, ...)* +; CHECK-LABEL: warning: Instruction selection used fallback path for test_vararg_definition + ret void +} + +define void @test_vararg_call(i32 %a) { +; CHECK: remark: {{.*}} unable to translate instruction: call +; CHECK-LABEL: warning: Instruction selection used fallback path for test_vararg_call + call void(i32, ...) 
@test_vararg_definition(i32 %a, i32 %a, i32 %a) + ret void +} + +define i32 @test_thumb(i32 %a) #0 { +; CHECK: remark: {{.*}} unable to lower arguments: i32 (i32)* +; CHECK-LABEL: warning: Instruction selection used fallback path for test_thumb + ret i32 %a +} + +attributes #0 = { "target-features"="+thumb-mode" } diff --git a/test/CodeGen/ARM/bool-ext-inc.ll b/test/CodeGen/ARM/bool-ext-inc.ll index fe43f1b..b91b9b2 100644 --- a/test/CodeGen/ARM/bool-ext-inc.ll +++ b/test/CodeGen/ARM/bool-ext-inc.ll @@ -30,3 +30,42 @@ define <4 x i32> @sext_inc_vec(<4 x i1> %x) { ret <4 x i32> %add } +define <4 x i32> @cmpgt_sext_inc_vec(<4 x i32> %x, <4 x i32> %y) { +; CHECK-LABEL: cmpgt_sext_inc_vec: +; CHECK: @ BB#0: +; CHECK-NEXT: mov r12, sp +; CHECK-NEXT: vmov d19, r2, r3 +; CHECK-NEXT: vmov.i32 q10, #0x1 +; CHECK-NEXT: vld1.64 {d16, d17}, [r12] +; CHECK-NEXT: vmov d18, r0, r1 +; CHECK-NEXT: vcgt.s32 q8, q9, q8 +; CHECK-NEXT: vadd.i32 q8, q8, q10 +; CHECK-NEXT: vmov r0, r1, d16 +; CHECK-NEXT: vmov r2, r3, d17 +; CHECK-NEXT: mov pc, lr + %cmp = icmp sgt <4 x i32> %x, %y + %ext = sext <4 x i1> %cmp to <4 x i32> + %add = add <4 x i32> %ext, + ret <4 x i32> %add +} + +define <4 x i32> @cmpne_sext_inc_vec(<4 x i32> %x, <4 x i32> %y) { +; CHECK-LABEL: cmpne_sext_inc_vec: +; CHECK: @ BB#0: +; CHECK-NEXT: mov r12, sp +; CHECK-NEXT: vmov d19, r2, r3 +; CHECK-NEXT: vld1.64 {d16, d17}, [r12] +; CHECK-NEXT: vmov d18, r0, r1 +; CHECK-NEXT: vceq.i32 q8, q9, q8 +; CHECK-NEXT: vmov.i32 q9, #0x1 +; CHECK-NEXT: vmvn q8, q8 +; CHECK-NEXT: vadd.i32 q8, q8, q9 +; CHECK-NEXT: vmov r0, r1, d16 +; CHECK-NEXT: vmov r2, r3, d17 +; CHECK-NEXT: mov pc, lr + %cmp = icmp ne <4 x i32> %x, %y + %ext = sext <4 x i1> %cmp to <4 x i32> + %add = add <4 x i32> %ext, + ret <4 x i32> %add +} + diff --git a/test/CodeGen/ARM/fence-singlethread.ll b/test/CodeGen/ARM/fence-singlethread.ll new file mode 100644 index 0000000..ec032cc --- /dev/null +++ b/test/CodeGen/ARM/fence-singlethread.ll @@ -0,0 +1,16 @@ +; RUN: llc -mtriple=thumbv7-linux-gnueabihf %s -o - | FileCheck %s +; RUN: llc -mtriple=thumbv7-apple-ios %s -o - | FileCheck %s +; RUN: llc -mtriple=thumbv7-linux-gnueabihf %s -filetype=obj -o %t +; RUN: llvm-objdump -d %t | FileCheck %s --check-prefix=OBJ + +; OBJ-NOT: dmb + +define void @fence_singlethread() { +; CHECK-LABEL: fence_singlethread: +; CHECK-NOT: dmb +; CHECK: @ COMPILER BARRIER +; CHECK-NOT: dmb + + fence singlethread seq_cst + ret void +} diff --git a/test/CodeGen/ARM/v6m-smul-with-overflow.ll b/test/CodeGen/ARM/v6m-smul-with-overflow.ll new file mode 100644 index 0000000..6e8a704 --- /dev/null +++ b/test/CodeGen/ARM/v6m-smul-with-overflow.ll @@ -0,0 +1,16 @@ +; RUN: llc < %s -mtriple=thumbv6m-none-eabi | FileCheck %s + +define i1 @signed_multiplication_did_overflow(i32, i32) { +; CHECK-LABEL: signed_multiplication_did_overflow: +entry-block: + %2 = tail call { i32, i1 } @llvm.smul.with.overflow.i32(i32 %0, i32 %1) + %3 = extractvalue { i32, i1 } %2, 1 + ret i1 %3 + +; CHECK: mov r2, r1 +; CHECK: asrs r1, r0, #31 +; CHECK: asrs r3, r2, #31 +; CHECK: bl __aeabi_lmul +} + +declare { i32, i1 } @llvm.smul.with.overflow.i32(i32, i32) diff --git a/test/CodeGen/ARM/vpadd.ll b/test/CodeGen/ARM/vpadd.ll index 1aa2359..3409d37 100644 --- a/test/CodeGen/ARM/vpadd.ll +++ b/test/CodeGen/ARM/vpadd.ll @@ -485,6 +485,26 @@ define <2 x i16> @fromExtendingExtractVectorElt_i16(<4 x i16> %in) { ret <2 x i16> %x } +; And <2 x i8> to <2 x i32> +define <2 x i8> @fromExtendingExtractVectorElt_2i8(<8 x i8> %in) { +; CHECK-LABEL: 
fromExtendingExtractVectorElt_2i8: +; CHECK: vadd.i32 + %tmp1 = shufflevector <8 x i8> %in, <8 x i8> undef, <2 x i32> + %tmp2 = shufflevector <8 x i8> %in, <8 x i8> undef, <2 x i32> + %x = add <2 x i8> %tmp2, %tmp1 + ret <2 x i8> %x +} + +define <2 x i16> @fromExtendingExtractVectorElt_2i16(<8 x i16> %in) { +; CHECK-LABEL: fromExtendingExtractVectorElt_2i16: +; CHECK: vadd.i32 + %tmp1 = shufflevector <8 x i16> %in, <8 x i16> undef, <2 x i32> + %tmp2 = shufflevector <8 x i16> %in, <8 x i16> undef, <2 x i32> + %x = add <2 x i16> %tmp2, %tmp1 + ret <2 x i16> %x +} + + declare <4 x i16> @llvm.arm.neon.vpaddls.v4i16.v8i8(<8 x i8>) nounwind readnone declare <2 x i32> @llvm.arm.neon.vpaddls.v2i32.v4i16(<4 x i16>) nounwind readnone declare <1 x i64> @llvm.arm.neon.vpaddls.v1i64.v2i32(<2 x i32>) nounwind readnone diff --git a/test/CodeGen/AVR/pseudo/LDDWRdPtrQ-same-src-dst.mir b/test/CodeGen/AVR/pseudo/LDDWRdPtrQ-same-src-dst.mir new file mode 100644 index 0000000..b19e44e --- /dev/null +++ b/test/CodeGen/AVR/pseudo/LDDWRdPtrQ-same-src-dst.mir @@ -0,0 +1,35 @@ +# RUN: llc -O0 %s -o - -march=avr | FileCheck %s + +# This test checks the expansion of the 16-bit 'LDDWRdPtrQ' pseudo instruction. +# +# This test ensures that the pseudo expander can correctly handle the case +# where we are expanding a 16-bit LDD instruction where the source and +# destination registers are the same. +# +# The instruction itself is earlyclobber and so ISel will never produce an +# instruction like this, but the stack slot loading can and will. + +--- | + target triple = "avr--" + define void @test_lddwrdptrq() { + entry: + ret void + } +... + +--- +name: test_lddwrdptrq +tracksRegLiveness: true +body: | + bb.0.entry: + + ; CHECK-LABEL: test_lddwrdptrq + + ; CHECK: ldd [[SCRATCH:r[0-9]+]], Z+10 + ; CHECK-NEXT: push [[SCRATCH]] + ; CHECK-NEXT: ldd [[SCRATCH]], Z+11 + ; CHECK-NEXT: mov r31, [[SCRATCH]] + ; CHECK-NEXT: pop r30 + + early-clobber %r31r30 = LDDWRdPtrQ undef %r31r30, 10 +... diff --git a/test/CodeGen/AVR/pseudo/LDWRdPtr-same-src-dst.mir b/test/CodeGen/AVR/pseudo/LDWRdPtr-same-src-dst.mir new file mode 100644 index 0000000..3e7fdcd --- /dev/null +++ b/test/CodeGen/AVR/pseudo/LDWRdPtr-same-src-dst.mir @@ -0,0 +1,29 @@ +# RUN: llc -O0 %s -o - | FileCheck %s + +# This test checks the expansion of the 16-bit LDWRdPtr pseudo instruction. + +--- | + target triple = "avr--" + define void @test_ldwrdptr() { + entry: + ret void + } +... + +--- +name: test_ldwrdptr +tracksRegLiveness: true +body: | + bb.0.entry: + + ; CHECK-LABEL: test_ldwrdptr + + ; CHECK: ld [[SCRATCH:r[0-9]+]], Z + ; CHECK-NEXT: push [[SCRATCH]] + ; CHECK-NEXT: ldd [[SCRATCH]], Z+1 + ; CHECK-NEXT: mov r31, [[SCRATCH]] + ; CHECK-NEXT: pop r30 + + early-clobber %r31r30 = LDWRdPtr undef %r31r30 +... + diff --git a/test/CodeGen/AVR/pseudo/expand-lddw-dst-src-same.mir b/test/CodeGen/AVR/pseudo/expand-lddw-dst-src-same.mir deleted file mode 100644 index 8427a2b..0000000 --- a/test/CodeGen/AVR/pseudo/expand-lddw-dst-src-same.mir +++ /dev/null @@ -1,35 +0,0 @@ -# RUN: llc -O0 %s -o - -march=avr | FileCheck %s - -# This test ensures that the pseudo expander can correctly handle the case -# where we are expanding a 16-bit LDD instruction where the source and -# destination registers are the same. -# -# The instruction itself is earlyclobber and so ISel will never produce an -# instruction like this, but the stack slot loading can and will. - ---- | - target triple = "avr--" - - define void @test_lddw() { - entry: - ret void - } - -... 
---- -name: test_lddw -tracksRegLiveness: true -stack: - - { id: 0, type: spill-slot, offset: -4, size: 1, alignment: 1, callee-saved-register: '%r28' } -body: | - bb.0.entry: - liveins: %r28, %r29 - - ; CHECK-LABEL: test_lddw - - ; CHECK: ldd [[TMPREG:r[0-9]+]], Y+0 - ; CHECK-NEXT: mov r28, [[TMPREG]] - ; CHECK-NEXT: ldd [[TMPREG]], Y+1 - ; CHECK-NEXT: mov r29, [[TMPREG]] - dead early-clobber %r29r28 = LDDWRdYQ killed %r29r28, 0 -... diff --git a/test/CodeGen/MSP430/select-use-sr.ll b/test/CodeGen/MSP430/select-use-sr.ll new file mode 100644 index 0000000..3f67fb8 --- /dev/null +++ b/test/CodeGen/MSP430/select-use-sr.ll @@ -0,0 +1,21 @@ +; RUN: llc < %s -march=msp430 | FileCheck %s +; PR32769 + +target triple = "msp430" + +; Test that CMP instruction is not removed by MachineCSE. +; +; CHECK-LABEL: @f +; CHECK: cmp.w r15, r13 +; CHECK: cmp.w r15, r13 +; CHECK-NEXT: jeq .LBB0_2 +define i16 @f(i16, i16, i16, i16) { +entry: + %4 = icmp ult i16 %1, %3 + %5 = zext i1 %4 to i16 + %6 = icmp ult i16 %0, %2 + %7 = zext i1 %6 to i16 + %8 = icmp eq i16 %1, %3 + %out = select i1 %8, i16 %5, i16 %7 + ret i16 %out +} diff --git a/test/CodeGen/Mips/llvm-ir/mul.ll b/test/CodeGen/Mips/llvm-ir/mul.ll index 1562372..2085307 100644 --- a/test/CodeGen/Mips/llvm-ir/mul.ll +++ b/test/CodeGen/Mips/llvm-ir/mul.ll @@ -268,7 +268,7 @@ entry: ; MM64R6: daddu $2, $[[T1]], $[[T0]] ; MM64R6-DAG: dmul $3, $5, $7 - ; MM32: lw $25, %call16(__multi3)($16) + ; MM32: lw $25, %call16(__multi3)($gp) %r = mul i128 %a, %b ret i128 %r diff --git a/test/CodeGen/Mips/llvm-ir/sdiv.ll b/test/CodeGen/Mips/llvm-ir/sdiv.ll index defd25b..ee2b212 100644 --- a/test/CodeGen/Mips/llvm-ir/sdiv.ll +++ b/test/CodeGen/Mips/llvm-ir/sdiv.ll @@ -172,7 +172,7 @@ entry: ; 64R6: ddiv $2, $4, $5 ; 64R6: teq $5, $zero, 7 - ; MM32: lw $25, %call16(__divdi3)($2) + ; MM32: lw $25, %call16(__divdi3)($gp) ; MM64: ddiv $2, $4, $5 ; MM64: teq $5, $zero, 7 @@ -184,15 +184,7 @@ entry: define signext i128 @sdiv_i128(i128 signext %a, i128 signext %b) { entry: ; ALL-LABEL: sdiv_i128: - - ; GP32: lw $25, %call16(__divti3)($gp) - - ; GP64-NOT-R6: ld $25, %call16(__divti3)($gp) - ; 64R6: ld $25, %call16(__divti3)($gp) - - ; MM32: lw $25, %call16(__divti3)($16) - - ; MM64: ld $25, %call16(__divti3)($2) + ; ALL: l{{w|d}} $25, %call16(__divti3)($gp) %r = sdiv i128 %a, %b ret i128 %r diff --git a/test/CodeGen/Mips/llvm-ir/srem.ll b/test/CodeGen/Mips/llvm-ir/srem.ll index 42664d7..812c105 100644 --- a/test/CodeGen/Mips/llvm-ir/srem.ll +++ b/test/CodeGen/Mips/llvm-ir/srem.ll @@ -164,7 +164,7 @@ entry: ; 64R6: dmod $2, $4, $5 ; 64R6: teq $5, $zero, 7 - ; MM32: lw $25, %call16(__moddi3)($2) + ; MM32: lw $25, %call16(__moddi3)($gp) ; MM64: dmod $2, $4, $5 ; MM64: teq $5, $zero, 7 @@ -177,14 +177,7 @@ define signext i128 @srem_i128(i128 signext %a, i128 signext %b) { entry: ; ALL-LABEL: srem_i128: - ; GP32: lw $25, %call16(__modti3)($gp) - - ; GP64-NOT-R6: ld $25, %call16(__modti3)($gp) - ; 64R6: ld $25, %call16(__modti3)($gp) - - ; MM32: lw $25, %call16(__modti3)($16) - - ; MM64: ld $25, %call16(__modti3)($2) + ; ALL: l{{w|d}} $25, %call16(__modti3)($gp) %r = srem i128 %a, %b ret i128 %r diff --git a/test/CodeGen/Mips/llvm-ir/udiv.ll b/test/CodeGen/Mips/llvm-ir/udiv.ll index 78ab364..6e078fd 100644 --- a/test/CodeGen/Mips/llvm-ir/udiv.ll +++ b/test/CodeGen/Mips/llvm-ir/udiv.ll @@ -134,7 +134,7 @@ entry: ; 64R6: ddivu $2, $4, $5 ; 64R6: teq $5, $zero, 7 - ; MM32: lw $25, %call16(__udivdi3)($2) + ; MM32: lw $25, %call16(__udivdi3)($gp) ; MM64: ddivu $2, $4, $5 ; MM64: teq $5, 
$zero, 7 @@ -147,14 +147,7 @@ define signext i128 @udiv_i128(i128 signext %a, i128 signext %b) { entry: ; ALL-LABEL: udiv_i128: - ; GP32: lw $25, %call16(__udivti3)($gp) - - ; GP64-NOT-R6: ld $25, %call16(__udivti3)($gp) - ; 64-R6: ld $25, %call16(__udivti3)($gp) - - ; MM32: lw $25, %call16(__udivti3)($16) - - ; MM64: ld $25, %call16(__udivti3)($2) + ; ALL: l{{w|d}} $25, %call16(__udivti3)($gp) %r = udiv i128 %a, %b ret i128 %r diff --git a/test/CodeGen/Mips/llvm-ir/urem.ll b/test/CodeGen/Mips/llvm-ir/urem.ll index 160c126..3bc82ce 100644 --- a/test/CodeGen/Mips/llvm-ir/urem.ll +++ b/test/CodeGen/Mips/llvm-ir/urem.ll @@ -190,7 +190,7 @@ entry: ; 64R6: dmodu $2, $4, $5 ; 64R6: teq $5, $zero, 7 - ; MM32: lw $25, %call16(__umoddi3)($2) + ; MM32: lw $25, %call16(__umoddi3)($gp) ; MM64: dmodu $2, $4, $5 ; MM64: teq $5, $zero, 7 @@ -208,9 +208,9 @@ entry: ; GP64-NOT-R6: ld $25, %call16(__umodti3)($gp) ; 64R6: ld $25, %call16(__umodti3)($gp) - ; MM32: lw $25, %call16(__umodti3)($16) + ; MM32: lw $25, %call16(__umodti3)($gp) - ; MM64: ld $25, %call16(__umodti3)($2) + ; MM64: ld $25, %call16(__umodti3)($gp) %r = urem i128 %a, %b ret i128 %r diff --git a/test/CodeGen/Mips/micromips-gp-rc.ll b/test/CodeGen/Mips/micromips-gp-rc.ll index f139f7a..16e55c3 100644 --- a/test/CodeGen/Mips/micromips-gp-rc.ll +++ b/test/CodeGen/Mips/micromips-gp-rc.ll @@ -14,5 +14,5 @@ entry: ; Function Attrs: noreturn declare void @exit(i32 signext) -; CHECK: move $gp, ${{[0-9]+}} +; CHECK: addu $gp, ${{[0-9]+}} diff --git a/test/CodeGen/Mips/mips64fpldst.ll b/test/CodeGen/Mips/mips64fpldst.ll index 564ffdd..6fa5068 100644 --- a/test/CodeGen/Mips/mips64fpldst.ll +++ b/test/CodeGen/Mips/mips64fpldst.ll @@ -1,9 +1,9 @@ -; RUN: llc < %s -march=mips64el -mcpu=mips4 -target-abi n64 -relocation-model=pic | FileCheck %s -check-prefix=CHECK-N64 -; RUN: llc < %s -march=mips64el -mcpu=mips4 -target-abi n32 -relocation-model=pic | FileCheck %s -check-prefix=CHECK-N32 -; RUN: llc < %s -march=mips64el -mcpu=mips64 -target-abi n64 -relocation-model=pic | FileCheck %s -check-prefix=CHECK-N64 -; RUN: llc < %s -march=mips64el -mcpu=mips64 -target-abi n32 -relocation-model=pic | FileCheck %s -check-prefix=CHECK-N32 -; RUN: llc < %s -march=mipsel -mcpu=mips64r6 -mattr=+micromips -target-abi n32 -relocation-model=pic | FileCheck %s -check-prefix=CHECK-N32 -; RUN: llc < %s -march=mipsel -mcpu=mips64r6 -mattr=+micromips -target-abi n64 -relocation-model=pic | FileCheck %s -check-prefix=CHECK-N64 +; RUN: llc < %s -march=mips64el -mcpu=mips4 -target-abi n64 -relocation-model=pic -verify-machineinstrs | FileCheck %s -check-prefix=CHECK-N64 +; RUN: llc < %s -march=mips64el -mcpu=mips4 -target-abi n32 -relocation-model=pic -verify-machineinstrs | FileCheck %s -check-prefix=CHECK-N32 +; RUN: llc < %s -march=mips64el -mcpu=mips64 -target-abi n64 -relocation-model=pic -verify-machineinstrs | FileCheck %s -check-prefix=CHECK-N64 +; RUN: llc < %s -march=mips64el -mcpu=mips64 -target-abi n32 -relocation-model=pic -verify-machineinstrs | FileCheck %s -check-prefix=CHECK-N32 +; RUN: llc < %s -march=mipsel -mcpu=mips64r6 -mattr=+micromips -target-abi n32 -relocation-model=pic -verify-machineinstrs | FileCheck %s -check-prefix=CHECK-N32 +; RUN: llc < %s -march=mipsel -mcpu=mips64r6 -mattr=+micromips -target-abi n64 -relocation-model=pic -verify-machineinstrs | FileCheck %s -check-prefix=CHECK-N64 @f0 = common global float 0.000000e+00, align 4 @d0 = common global double 0.000000e+00, align 8 diff --git a/test/CodeGen/Mips/tailcall/tailcall.ll 
b/test/CodeGen/Mips/tailcall/tailcall.ll index 3f04e1c..01a9b64 100644 --- a/test/CodeGen/Mips/tailcall/tailcall.ll +++ b/test/CodeGen/Mips/tailcall/tailcall.ll @@ -176,7 +176,7 @@ entry: ; ALL-LABEL: caller8_1: ; PIC32: jalr $25 ; PIC32R6: jalr $25 -; PIC32MM: jalr $25 +; PIC32MM: jalr{{.*}} $25 ; STATIC32: jal ; PIC64: jalr $25 ; STATIC64: jal @@ -288,7 +288,7 @@ entry: ; ALL-LABEL: caller13: ; PIC32: jalr $25 ; PIC32R6: jalr $25 -; PIC32MM: jalr $25 +; PIC32MM: jalr{{.*}} $25 ; STATIC32: jal ; STATIC64: jal ; PIC64R6: jalr $25 diff --git a/test/CodeGen/PowerPC/empty-functions.ll b/test/CodeGen/PowerPC/empty-functions.ll index 56db8f3..b8394e1 100644 --- a/test/CodeGen/PowerPC/empty-functions.ll +++ b/test/CodeGen/PowerPC/empty-functions.ll @@ -24,9 +24,7 @@ entry: ; LINUX-NO-FP-NEXT: .size func, .L[[END]]-.L[[BEGIN]] ; LINUX-NO-FP-NEXT: .cfi_endproc -; A cfi directive can point to the end of a function. It (and in fact the -; entire body) could be optimized out because of the unreachable, but we -; don't do it right now. +; A cfi directive cannot point to the end of a function. ; LINUX-FP: func: ; LINUX-FP-NEXT: {{^}}.L[[BEGIN:.*]]:{{$}} ; LINUX-FP-NEXT: .cfi_startproc @@ -38,8 +36,6 @@ entry: ; LINUX-FP-NEXT: {{^}}.L{{.*}}:{{$}} ; LINUX-FP-NEXT: .cfi_offset r31, -4 ; LINUX-FP-NEXT: mr 31, 1 -; LINUX-FP-NEXT:{{^}}.L{{.*}}:{{$}} -; LINUX-FP-NEXT: .cfi_def_cfa_register r31 ; LINUX-FP-NEXT: {{^}}.L[[END:.*]]:{{$}} ; LINUX-FP-NEXT: .size func, .L[[END]]-.L[[BEGIN]] ; LINUX-FP-NEXT: .cfi_endproc diff --git a/test/CodeGen/SPARC/empty-functions.ll b/test/CodeGen/SPARC/empty-functions.ll index 1f8c5e3..974df23 100644 --- a/test/CodeGen/SPARC/empty-functions.ll +++ b/test/CodeGen/SPARC/empty-functions.ll @@ -14,19 +14,11 @@ entry: ; LINUX-NO-FP-NEXT: .size func, .L{{.*}}-func ; LINUX-NO-FP-NEXT: .cfi_endproc -; A cfi directive can point to the end of a function. It (and in fact the -; entire body) could be optimized out because of the unreachable, but we -; don't do it right now. +; A cfi directive cannot point to the end of a function. ; LINUX-FP: func: ; LINUX-FP-NEXT: .cfi_startproc ; LINUX-FP-NEXT: {{^}}! ; LINUX-FP-NEXT: save %sp, -96, %sp ; LINUX-FP-NEXT: {{^}}.L{{.*}}:{{$}} -; LINUX-FP-NEXT: .cfi_def_cfa_register %fp -; LINUX-FP-NEXT: {{^}}.L{{.*}}:{{$}} -; LINUX-FP-NEXT: .cfi_window_save -; LINUX-FP-NEXT: {{^}}.L{{.*}}:{{$}} -; LINUX-FP-NEXT: .cfi_register 15, 31 -; LINUX-FP-NEXT: {{^}}.L{{.*}}:{{$}} ; LINUX-FP-NEXT: .size func, .Lfunc_end0-func ; LINUX-FP-NEXT: .cfi_endproc diff --git a/test/CodeGen/SystemZ/splitMove_undefReg_mverifier_2.ll b/test/CodeGen/SystemZ/splitMove_undefReg_mverifier_2.ll new file mode 100644 index 0000000..fc3b7ef --- /dev/null +++ b/test/CodeGen/SystemZ/splitMove_undefReg_mverifier_2.ll @@ -0,0 +1,229 @@ +; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z13 -verify-machineinstrs -disable-lsr | FileCheck %s +; +; Regression test for a machine verifier complaint discovered with llvm-stress. +; Test that splitting of a 128 bit store does not result in use of undef phys reg. +; This test case involved spilling of 128 bits, where the data operand was killed. 
+ +define void @autogen_SD15107(i8*, i32*, i64*, i32, i64, i8) { +; CHECK: .text +BB: + %A4 = alloca double + %A1 = alloca i32 + %L = load i8, i8* %0 + br label %CF331 + +CF331: ; preds = %CF331, %BB + %Shuff = shufflevector <8 x i8> zeroinitializer, <8 x i8> zeroinitializer, <8 x i32> + %L5 = load i8, i8* %0 + %FC9 = fptosi float 0xC59D259100000000 to i8 + %Shuff13 = shufflevector <8 x i64> zeroinitializer, <8 x i64> zeroinitializer, <8 x i32> + %Tr = trunc <8 x i16> zeroinitializer to <8 x i1> + %Sl16 = select i1 true, i64 448097, i64 253977 + %E18 = extractelement <2 x i1> zeroinitializer, i32 1 + br i1 %E18, label %CF331, label %CF350 + +CF350: ; preds = %CF331 + %Cmp22 = icmp slt i8 %L, -1 + br label %CF + +CF: ; preds = %CF333, %CF364, %CF, %CF350 + %Shuff25 = shufflevector <16 x i1> zeroinitializer, <16 x i1> zeroinitializer, <16 x i32> + %B27 = mul <8 x i8> zeroinitializer, %Shuff + %L31 = load i8, i8* %0 + store i8 %L5, i8* %0 + %E32 = extractelement <8 x i64> %Shuff13, i32 5 + %Sl37 = select i1 %E18, i64* %2, i64* %2 + %E40 = extractelement <8 x i64> %Shuff13, i32 4 + %I42 = insertelement <8 x i64> %Shuff13, i64 0, i32 1 + %Sl44 = select i1 true, double* %A4, double* %A4 + %L46 = load i64, i64* %Sl37 + br i1 undef, label %CF, label %CF335 + +CF335: ; preds = %CF335, %CF + %Shuff48 = shufflevector <8 x i16> zeroinitializer, <8 x i16> zeroinitializer, <8 x i32> + %B50 = sub <8 x i64> undef, zeroinitializer + %Se = sext i1 %Cmp22 to i64 + %Cmp52 = icmp ule i64 %E40, 184653 + br i1 %Cmp52, label %CF335, label %CF364 + +CF364: ; preds = %CF335 + store i64 %E32, i64* %Sl37 + %B57 = udiv <8 x i64> %I42, %B50 + %L61 = load i64, i64* %Sl37 + %Sl65 = select i1 undef, i1 %Cmp52, i1 true + br i1 %Sl65, label %CF, label %CF333 + +CF333: ; preds = %CF364 + %Cmp66 = fcmp uge float 0x474A237E00000000, undef + br i1 %Cmp66, label %CF, label %CF324 + +CF324: ; preds = %CF358, %CF360, %CF333 + %L67 = load i64, i64* %Sl37 + %Sl73 = select i1 %E18, i8 %L, i8 %L31 + %ZE = zext i1 true to i32 + %Cmp81 = icmp ult i64 184653, %L46 + br label %CF346 + +CF346: ; preds = %CF363, %CF346, %CF324 + %L82 = load double, double* %Sl44 + store i64 %Se, i64* %Sl37 + br i1 undef, label %CF346, label %CF363 + +CF363: ; preds = %CF346 + %I85 = insertelement <8 x i64> undef, i64 0, i32 4 + %Se86 = sext i1 %Cmp81 to i64 + %Cmp88 = icmp eq <16 x i1> zeroinitializer, undef + %Shuff91 = shufflevector <8 x i64> %B57, <8 x i64> %I42, <8 x i32> + %Sl95 = select i1 undef, i8 -1, i8 %5 + store i8 %FC9, i8* %0 + %Sl102 = select i1 %Sl65, float 0x3AAFABC380000000, float undef + %L104 = load i64, i64* %Sl37 + store i8 %Sl95, i8* %0 + br i1 undef, label %CF346, label %CF360 + +CF360: ; preds = %CF363 + %I107 = insertelement <16 x i1> undef, i1 %Sl65, i32 3 + %B108 = fdiv float undef, %Sl102 + %FC109 = sitofp <16 x i1> %Shuff25 to <16 x float> + %Cmp111 = icmp slt i8 %Sl73, %Sl95 + br i1 %Cmp111, label %CF324, label %CF344 + +CF344: ; preds = %CF344, %CF360 + store i64 %4, i64* %Sl37 + br i1 undef, label %CF344, label %CF358 + +CF358: ; preds = %CF344 + %B116 = add i8 29, %5 + %Sl118 = select i1 %Cmp81, <8 x i1> undef, <8 x i1> %Tr + %L120 = load i16, i16* undef + store i8 %FC9, i8* %0 + %E121 = extractelement <16 x i1> %Shuff25, i32 3 + br i1 %E121, label %CF324, label %CF325 + +CF325: ; preds = %CF362, %CF358 + %I123 = insertelement <8 x i16> undef, i16 %L120, i32 0 + %Sl125 = select i1 undef, i32 undef, i32 199785 + %Cmp126 = icmp ule <16 x i1> undef, %Cmp88 + br label %CF356 + +CF356: ; preds = %CF356, %CF325 + %FC131 = sitofp 
<8 x i8> %B27 to <8 x double> + store i8 %Sl73, i8* %0 + store i64 396197, i64* %Sl37 + %L150 = load i64, i64* %Sl37 + %Cmp157 = icmp ult i64 %L150, %L61 + br i1 %Cmp157, label %CF356, label %CF359 + +CF359: ; preds = %CF359, %CF356 + %B162 = srem <8 x i64> %I85, %Shuff13 + %Tr163 = trunc i64 %Se to i8 + %Sl164 = select i1 %Cmp52, i32* %A1, i32* %1 + store i64 %E32, i64* undef + %I168 = insertelement <8 x i16> %I123, i16 undef, i32 5 + %Se170 = sext i1 %Cmp81 to i32 + %Cmp172 = icmp uge i8 %Sl73, %Sl73 + br i1 %Cmp172, label %CF359, label %CF362 + +CF362: ; preds = %CF359 + store i16 0, i16* undef + store i64 448097, i64* %Sl37 + %E189 = extractelement <8 x i16> %Shuff48, i32 6 + %Sl194 = select i1 %Cmp111, i8 29, i8 0 + %Cmp195 = icmp eq i32 %ZE, %ZE + br i1 %Cmp195, label %CF325, label %CF326 + +CF326: ; preds = %CF342, %CF362 + store i64 %L104, i64* undef + br label %CF342 + +CF342: ; preds = %CF326 + %Cmp203 = icmp ule i1 %Cmp195, %E18 + br i1 %Cmp203, label %CF326, label %CF337 + +CF337: ; preds = %CF342 + br label %CF327 + +CF327: ; preds = %CF336, %CF355, %CF327, %CF337 + store i64 %Se86, i64* undef + %Tr216 = trunc i64 184653 to i16 + %Sl217 = select i1 %Cmp157, <4 x i1> undef, <4 x i1> undef + %Cmp218 = icmp slt i32 undef, %Se170 + br i1 %Cmp218, label %CF327, label %CF355 + +CF355: ; preds = %CF327 + %E220 = extractelement <16 x i1> %Cmp126, i32 3 + br i1 %E220, label %CF327, label %CF340 + +CF340: ; preds = %CF355 + %Sl224 = select i1 %Sl65, double undef, double 0xBE278346AB25A5C4 + br label %CF334 + +CF334: ; preds = %CF343, %CF334, %CF340 + %L226 = load i64, i64* undef + store i32 %3, i32* %Sl164 + %Cmp233 = icmp uge i16 %Tr216, %L120 + br i1 %Cmp233, label %CF334, label %CF354 + +CF354: ; preds = %CF334 + store i64 %L226, i64* %Sl37 + %Cmp240 = icmp uge i1 %Cmp52, undef + %Shuff243 = shufflevector <16 x i1> %I107, <16 x i1> undef, <16 x i32> + %B245 = fmul <16 x float> %FC109, %FC109 + br label %CF343 + +CF343: ; preds = %CF354 + %Cmp248 = icmp sgt i8 0, %B116 + br i1 %Cmp248, label %CF334, label %CF336 + +CF336: ; preds = %CF343 + store i64 %E32, i64* undef + br i1 undef, label %CF327, label %CF328 + +CF328: ; preds = %CF345, %CF336 + br label %CF345 + +CF345: ; preds = %CF328 + %E257 = extractelement <4 x i1> %Sl217, i32 2 + br i1 %E257, label %CF328, label %CF338 + +CF338: ; preds = %CF345 + %Sl261 = select i1 %E121, <8 x i16> zeroinitializer, <8 x i16> undef + %Cmp262 = icmp sgt i8 undef, %Sl194 + br label %CF329 + +CF329: ; preds = %CF339, %CF348, %CF357, %CF338 + store i64 %L67, i64* %Sl37 + br label %CF357 + +CF357: ; preds = %CF329 + %Cmp275 = icmp ne i1 %Cmp203, %Sl65 + br i1 %Cmp275, label %CF329, label %CF348 + +CF348: ; preds = %CF357 + %Shuff286 = shufflevector <8 x i16> undef, <8 x i16> %Sl261, <8 x i32> + %Cmp291 = icmp ne i32 %Sl125, undef + br i1 %Cmp291, label %CF329, label %CF339 + +CF339: ; preds = %CF348 + %Cmp299 = fcmp ugt double %L82, undef + br i1 %Cmp299, label %CF329, label %CF330 + +CF330: ; preds = %CF361, %CF330, %CF339 + %E301 = extractelement <8 x double> %FC131, i32 3 + store i64 %Sl16, i64* %Sl37 + %Se313 = sext <8 x i1> %Sl118 to <8 x i32> + %Cmp315 = icmp sgt i8 %Tr163, %L + br i1 %Cmp315, label %CF330, label %CF361 + +CF361: ; preds = %CF330 + store i16 %L120, i16* undef + %Shuff318 = shufflevector <8 x i64> %B162, <8 x i64> undef, <8 x i32> + %ZE321 = zext i16 %E189 to i64 + %Sl322 = select i1 %Cmp240, i1 %Cmp262, i1 %Cmp291 + br i1 %Sl322, label %CF330, label %CF351 + +CF351: ; preds = %CF361 + store double %Sl224, double* %Sl44 + store 
i32 %ZE, i32* %Sl164 + ret void +} diff --git a/test/CodeGen/Thumb/long.ll b/test/CodeGen/Thumb/long.ll index c549bd4..13951ef 100644 --- a/test/CodeGen/Thumb/long.ll +++ b/test/CodeGen/Thumb/long.ll @@ -206,3 +206,34 @@ entry: ; CHECK: adds r0, r0, r2 ; CHECK: sbcs r1, r3 } + +declare void @f13(i64 %x) + +define void @f14(i1 %x, i64 %y) #0 { +; CHECK-LABEL: f14: +entry: + %a = add i64 %y, 47 + call void @f13(i64 %a) +; CHECK: bl + br i1 %x, label %if.end, label %if.then + +if.then: + call void @f13(i64 %y) +; CHECK: bl + br label %if.end + +if.end: + %b = add i64 %y, 45 + call void @f13(i64 %b) +; CHECK: adds +; CHECK: adcs +; CHECK: bl + %c = add i64 %y, 47 + call void @f13(i64 %c) +; CHECK: adds +; CHECK-NEXT: adcs +; CHECK: bl + ret void +} + +attributes #0 = { optsize } diff --git a/test/CodeGen/Thumb/optionaldef-scheduling.ll b/test/CodeGen/Thumb/optionaldef-scheduling.ll new file mode 100644 index 0000000..bd091cf --- /dev/null +++ b/test/CodeGen/Thumb/optionaldef-scheduling.ll @@ -0,0 +1,18 @@ +; RUN: llc -mtriple=thumb-eabi %s -verify-machineinstrs -o - | FileCheck %s +; RUN: llc -mtriple=thumbv6-eabi %s -verify-machineinstrs -o - | FileCheck %s + +define i1 @test(i64 %arg) { +entry: + %ispos = icmp sgt i64 %arg, -1 + %neg = sub i64 0, %arg + %sel = select i1 %ispos, i64 %arg, i64 %neg + %cmp2 = icmp eq i64 %sel, %arg + ret i1 %cmp2 +} + +; The scheduler used to ignore OptionalDefs, and could unwittingly insert +; a flag-setting instruction in between an ADDS and the corresponding ADC. + +; CHECK: adds +; CHECK-NOT: eors +; CHECK: adcs diff --git a/test/CodeGen/X86/GlobalISel/callingconv.ll b/test/CodeGen/X86/GlobalISel/callingconv.ll new file mode 100644 index 0000000..ec62ece --- /dev/null +++ b/test/CodeGen/X86/GlobalISel/callingconv.ll @@ -0,0 +1,133 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -mtriple=i386-linux-gnu -mattr=+sse2 -global-isel < %s -o - | FileCheck %s --check-prefix=ALL --check-prefix=X32 --check-prefix=X32_GISEL +; RUN: llc -mtriple=i386-linux-gnu -mattr=+sse2 < %s -o - | FileCheck %s --check-prefix=ALL --check-prefix=X32 --check-prefix=X32_ISEL +; RUN: llc -mtriple=x86_64-linux-gnu -global-isel < %s -o - | FileCheck %s --check-prefix=ALL --check-prefix=X64 --check-prefix=X64_GISEL +; RUN: llc -mtriple=x86_64-linux-gnu < %s -o - | FileCheck %s --check-prefix=ALL --check-prefix=X64 --check-prefix=X64_ISEL + +define i32 @test_ret_i32() { +; X32-LABEL: test_ret_i32: +; X32: # BB#0: +; X32-NEXT: movl $20, %eax +; X32-NEXT: retl +; +; X64-LABEL: test_ret_i32: +; X64: # BB#0: +; X64-NEXT: movl $20, %eax +; X64-NEXT: retq + ret i32 20 +} + +define i64 @test_ret_i64() { +; X32_GISEL-LABEL: test_ret_i64: +; X32_GISEL: # BB#0: +; X32_GISEL-NEXT: movl $4294967295, %eax # imm = 0xFFFFFFFF +; X32_GISEL-NEXT: movl $15, %edx +; X32_GISEL-NEXT: retl +; +; X32_ISEL-LABEL: test_ret_i64: +; X32_ISEL: # BB#0: +; X32_ISEL-NEXT: movl $-1, %eax +; X32_ISEL-NEXT: movl $15, %edx +; X32_ISEL-NEXT: retl +; +; X64-LABEL: test_ret_i64: +; X64: # BB#0: +; X64-NEXT: movabsq $68719476735, %rax # imm = 0xFFFFFFFFF +; X64-NEXT: retq + ret i64 68719476735 +} + +define i32 @test_arg_i32(i32 %a) { +; X32_GISEL-LABEL: test_arg_i32: +; X32_GISEL: # BB#0: +; X32_GISEL-NEXT: leal 4(%esp), %eax +; X32_GISEL-NEXT: movl (%eax), %eax +; X32_GISEL-NEXT: retl +; +; X32_ISEL-LABEL: test_arg_i32: +; X32_ISEL: # BB#0: +; X32_ISEL-NEXT: movl 4(%esp), %eax +; X32_ISEL-NEXT: retl +; +; X64-LABEL: test_arg_i32: +; X64: # BB#0: +; X64-NEXT: movl %edi, %eax +; 
X64-NEXT: retq + ret i32 %a +} + +define i64 @test_arg_i64(i64 %a) { +; X32_GISEL-LABEL: test_arg_i64: +; X32_GISEL: # BB#0: +; X32_GISEL-NEXT: leal 4(%esp), %eax +; X32_GISEL-NEXT: movl (%eax), %eax +; X32_GISEL-NEXT: leal 8(%esp), %ecx +; X32_GISEL-NEXT: movl (%ecx), %edx +; X32_GISEL-NEXT: retl +; +; X32_ISEL-LABEL: test_arg_i64: +; X32_ISEL: # BB#0: +; X32_ISEL-NEXT: movl 4(%esp), %eax +; X32_ISEL-NEXT: movl 8(%esp), %edx +; X32_ISEL-NEXT: retl +; +; X64-LABEL: test_arg_i64: +; X64: # BB#0: +; X64-NEXT: movq %rdi, %rax +; X64-NEXT: retq + ret i64 %a +} + +define i64 @test_i64_args_8(i64 %arg1, i64 %arg2, i64 %arg3, i64 %arg4, i64 %arg5, i64 %arg6, i64 %arg7, i64 %arg8) { +; X32_GISEL-LABEL: test_i64_args_8: +; X32_GISEL: # BB#0: +; X32_GISEL-NEXT: leal 60(%esp), %eax +; X32_GISEL-NEXT: movl (%eax), %eax +; X32_GISEL-NEXT: leal 64(%esp), %ecx +; X32_GISEL-NEXT: movl (%ecx), %edx +; X32_GISEL-NEXT: retl +; +; X32_ISEL-LABEL: test_i64_args_8: +; X32_ISEL: # BB#0: +; X32_ISEL-NEXT: movl 60(%esp), %eax +; X32_ISEL-NEXT: movl 64(%esp), %edx +; X32_ISEL-NEXT: retl +; +; X64_GISEL-LABEL: test_i64_args_8: +; X64_GISEL: # BB#0: +; X64_GISEL-NEXT: leaq 16(%rsp), %rax +; X64_GISEL-NEXT: movq (%rax), %rax +; X64_GISEL-NEXT: retq +; +; X64_ISEL-LABEL: test_i64_args_8: +; X64_ISEL: # BB#0: +; X64_ISEL-NEXT: movq 16(%rsp), %rax +; X64_ISEL-NEXT: retq + + ret i64 %arg8 +} + +define <4 x i32> @test_v4i32_args(<4 x i32> %arg1, <4 x i32> %arg2) { +; X32-LABEL: test_v4i32_args: +; X32: # BB#0: +; X32-NEXT: movaps %xmm1, %xmm0 +; X32-NEXT: retl +; +; X64-LABEL: test_v4i32_args: +; X64: # BB#0: +; X64-NEXT: movaps %xmm1, %xmm0 +; X64-NEXT: retq + ret <4 x i32> %arg2 +} + +define <8 x i32> @test_v8i32_args(<8 x i32> %arg1) { +; X32-LABEL: test_v8i32_args: +; X32: # BB#0: +; X32-NEXT: retl +; +; X64-LABEL: test_v8i32_args: +; X64: # BB#0: +; X64-NEXT: retq + + ret <8 x i32> %arg1 +} diff --git a/test/CodeGen/X86/GlobalISel/irtranslator-callingconv.ll b/test/CodeGen/X86/GlobalISel/irtranslator-callingconv.ll index 616cb70..8ea3e4f 100644 --- a/test/CodeGen/X86/GlobalISel/irtranslator-callingconv.ll +++ b/test/CodeGen/X86/GlobalISel/irtranslator-callingconv.ll @@ -207,24 +207,15 @@ define i64 @test_i64_args_8(i64 %arg1, i64 %arg2, i64 %arg3, i64 %arg4, ; X32-NEXT: [[ARG8H_ADDR:%[0-9]+]](p0) = G_FRAME_INDEX %fixed-stack.[[STACK60]] ; X32-NEXT: [[ARG8H:%[0-9]+]](s32) = G_LOAD [[ARG8H_ADDR]](p0) :: (invariant load 4 from %fixed-stack.[[STACK60]], align 0) -; X32-NEXT: [[UNDEF:%[0-9]+]](s64) = IMPLICIT_DEF -; X32-NEXT: [[ARG1_TMP0:%[0-9]+]](s64) = G_INSERT [[UNDEF]], [[ARG1L]](s32), 0 -; X32-NEXT: [[ARG1_TMP1:%[0-9]+]](s64) = G_INSERT [[ARG1_TMP0]], [[ARG1H]](s32), 32 -; X32-NEXT: [[ARG1:%[0-9]+]](s64) = COPY [[ARG1_TMP1]] - ; ... a bunch more that we don't track ... - ; X32: IMPLICIT_DEF - ; X32: IMPLICIT_DEF - ; X32: IMPLICIT_DEF - ; X32: IMPLICIT_DEF - ; X32: IMPLICIT_DEF -; X32: [[UNDEF:%[0-9]+]](s64) = IMPLICIT_DEF -; X32-NEXT: [[ARG7_TMP0:%[0-9]+]](s64) = G_INSERT [[UNDEF]], [[ARG7L]](s32), 0 -; X32-NEXT: [[ARG7_TMP1:%[0-9]+]](s64) = G_INSERT [[ARG7_TMP0]], [[ARG7H]](s32), 32 -; X32-NEXT: [[ARG7:%[0-9]+]](s64) = COPY [[ARG7_TMP1]] -; X32-NEXT: [[UNDEF:%[0-9]+]](s64) = IMPLICIT_DEF -; X32-NEXT: [[ARG8_TMP0:%[0-9]+]](s64) = G_INSERT [[UNDEF]], [[ARG8L]](s32), 0 -; X32-NEXT: [[ARG8_TMP1:%[0-9]+]](s64) = G_INSERT [[ARG8_TMP0]], [[ARG8H]](s32), 32 -; X32-NEXT: [[ARG8:%[0-9]+]](s64) = COPY [[ARG8_TMP1]] +; X32-NEXT: [[ARG1:%[0-9]+]](s64) = G_MERGE_VALUES [[ARG1L]](s32), [[ARG1H]](s32) +; ... 
a bunch more that we don't track ... +; X32-NEXT: G_MERGE_VALUES +; X32-NEXT: G_MERGE_VALUES +; X32-NEXT: G_MERGE_VALUES +; X32-NEXT: G_MERGE_VALUES +; X32-NEXT: G_MERGE_VALUES +; X32-NEXT: [[ARG7:%[0-9]+]](s64) = G_MERGE_VALUES [[ARG7L]](s32), [[ARG7H]](s32) +; X32-NEXT: [[ARG8:%[0-9]+]](s64) = G_MERGE_VALUES [[ARG8L]](s32), [[ARG8H]](s32) ; ALL-NEXT: [[GADDR_A1:%[0-9]+]](p0) = G_GLOBAL_VALUE @a1_64bit ; ALL-NEXT: [[GADDR_A7:%[0-9]+]](p0) = G_GLOBAL_VALUE @a7_64bit @@ -236,8 +227,7 @@ define i64 @test_i64_args_8(i64 %arg1, i64 %arg2, i64 %arg3, i64 %arg4, ; X64-NEXT: %rax = COPY [[ARG1]](s64) ; X64-NEXT: RET 0, implicit %rax -; X32-NEXT: [[RETL:%[0-9]+]](s32) = G_EXTRACT [[ARG1:%[0-9]+]](s64), 0 -; X32-NEXT: [[RETH:%[0-9]+]](s32) = G_EXTRACT [[ARG1:%[0-9]+]](s64), 32 +; X32-NEXT: [[RETL:%[0-9]+]](s32), [[RETH:%[0-9]+]](s32) = G_UNMERGE_VALUES [[ARG1:%[0-9]+]](s64) ; X32-NEXT: %eax = COPY [[RETL:%[0-9]+]](s32) ; X32-NEXT: %edx = COPY [[RETH:%[0-9]+]](s32) ; X32-NEXT: RET 0, implicit %eax, implicit %edx diff --git a/test/CodeGen/X86/GlobalISel/irtranslator-callingconv_64bit.ll b/test/CodeGen/X86/GlobalISel/irtranslator-callingconv_64bit.ll index e2d9385..90a05f5 100644 --- a/test/CodeGen/X86/GlobalISel/irtranslator-callingconv_64bit.ll +++ b/test/CodeGen/X86/GlobalISel/irtranslator-callingconv_64bit.ll @@ -15,12 +15,8 @@ define <8 x i32> @test_v8i32_args(<8 x i32> %arg1) { ; X64: liveins: %xmm0, %xmm1 ; X64: [[ARG1L:%[0-9]+]](<4 x s32>) = COPY %xmm0 ; X64-NEXT: [[ARG1H:%[0-9]+]](<4 x s32>) = COPY %xmm1 -; X64-NEXT: [[UNDEF:%[0-9]+]](<8 x s32>) = IMPLICIT_DEF -; X64-NEXT: [[ARG1_TMP0:%[0-9]+]](<8 x s32>) = G_INSERT [[UNDEF]], [[ARG1L]](<4 x s32>), 0 -; X64-NEXT: [[ARG1_TMP1:%[0-9]+]](<8 x s32>) = G_INSERT [[ARG1_TMP0]], [[ARG1H]](<4 x s32>), 128 -; X64-NEXT: [[ARG1:%[0-9]+]](<8 x s32>) = COPY [[ARG1_TMP1]] -; X64-NEXT: [[RETL:%[0-9]+]](<4 x s32>) = G_EXTRACT [[ARG1:%[0-9]+]](<8 x s32>), 0 -; X64-NEXT: [[RETH:%[0-9]+]](<4 x s32>) = G_EXTRACT [[ARG1:%[0-9]+]](<8 x s32>), 128 +; X64-NEXT: [[ARG1:%[0-9]+]](<8 x s32>) = G_MERGE_VALUES [[ARG1L]](<4 x s32>), [[ARG1H]](<4 x s32>) +; X64-NEXT: [[RETL:%[0-9]+]](<4 x s32>), [[RETH:%[0-9]+]](<4 x s32>) = G_UNMERGE_VALUES [[ARG1:%[0-9]+]](<8 x s32>) ; X64-NEXT: %xmm0 = COPY [[RETL:%[0-9]+]](<4 x s32>) ; X64-NEXT: %xmm1 = COPY [[RETH:%[0-9]+]](<4 x s32>) ; X64-NEXT: RET 0, implicit %xmm0, implicit %xmm1 diff --git a/test/CodeGen/X86/GlobalISel/memop.ll b/test/CodeGen/X86/GlobalISel/memop.ll index 6fe6643..f793e36 100644 --- a/test/CodeGen/X86/GlobalISel/memop.ll +++ b/test/CodeGen/X86/GlobalISel/memop.ll @@ -65,7 +65,7 @@ define double @test_load_double(double * %p1) { ; SSE-LABEL: test_load_double: ; SSE: # BB#0: ; SSE-NEXT: movq (%rdi), %rax -; SSE-NEXT: movd %rax, %xmm0 +; SSE-NEXT: movq %rax, %xmm0 ; SSE-NEXT: retq ; ; ALL_AVX-LABEL: test_load_double: @@ -160,7 +160,7 @@ define double * @test_store_double(double %val, double * %p1) { ; ; SSE_FAST-LABEL: test_store_double: ; SSE_FAST: # BB#0: -; SSE_FAST-NEXT: movd %xmm0, %rax +; SSE_FAST-NEXT: movq %xmm0, %rax ; SSE_FAST-NEXT: movq %rax, (%rdi) ; SSE_FAST-NEXT: movq %rdi, %rax ; SSE_FAST-NEXT: retq diff --git a/test/CodeGen/X86/asm-reg-type-mismatch.ll b/test/CodeGen/X86/asm-reg-type-mismatch.ll index 47accdb..ced0740 100644 --- a/test/CodeGen/X86/asm-reg-type-mismatch.ll +++ b/test/CodeGen/X86/asm-reg-type-mismatch.ll @@ -27,5 +27,5 @@ entry: ret i64 %0 ; CHECK: test2 ; CHECK: movq {{.*}}, %xmm7 - ; CHECK: movd %xmm7, %rax + ; CHECK: movq %xmm7, %rax } diff --git 
a/test/CodeGen/X86/atomic-non-integer.ll b/test/CodeGen/X86/atomic-non-integer.ll index 17b73ec..1f25c71 100644 --- a/test/CodeGen/X86/atomic-non-integer.ll +++ b/test/CodeGen/X86/atomic-non-integer.ll @@ -26,7 +26,7 @@ define void @store_float(float* %fptr, float %v) { define void @store_double(double* %fptr, double %v) { ; CHECK-LABEL: @store_double -; CHECK: movd %xmm0, %rax +; CHECK: movq %xmm0, %rax ; CHECK: movq %rax, (%rdi) store atomic double %v, double* %fptr unordered, align 8 ret void @@ -59,7 +59,7 @@ define float @load_float(float* %fptr) { define double @load_double(double* %fptr) { ; CHECK-LABEL: @load_double ; CHECK: movq (%rdi), %rax -; CHECK: movd %rax, %xmm0 +; CHECK: movq %rax, %xmm0 %v = load atomic double, double* %fptr unordered, align 8 ret double %v } @@ -85,7 +85,7 @@ define void @store_float_seq_cst(float* %fptr, float %v) { define void @store_double_seq_cst(double* %fptr, double %v) { ; CHECK-LABEL: @store_double_seq_cst -; CHECK: movd %xmm0, %rax +; CHECK: movq %xmm0, %rax ; CHECK: xchgq %rax, (%rdi) store atomic double %v, double* %fptr seq_cst, align 8 ret void @@ -102,7 +102,7 @@ define float @load_float_seq_cst(float* %fptr) { define double @load_double_seq_cst(double* %fptr) { ; CHECK-LABEL: @load_double_seq_cst ; CHECK: movq (%rdi), %rax -; CHECK: movd %rax, %xmm0 +; CHECK: movq %rax, %xmm0 %v = load atomic double, double* %fptr seq_cst, align 8 ret double %v } diff --git a/test/CodeGen/X86/avx-schedule.ll b/test/CodeGen/X86/avx-schedule.ll new file mode 100644 index 0000000..052cacf --- /dev/null +++ b/test/CodeGen/X86/avx-schedule.ll @@ -0,0 +1,2840 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=sandybridge | FileCheck %s --check-prefix=CHECK --check-prefix=SANDY +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=ivybridge | FileCheck %s --check-prefix=CHECK --check-prefix=SANDY +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=haswell | FileCheck %s --check-prefix=CHECK --check-prefix=HASWELL +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=skylake | FileCheck %s --check-prefix=CHECK --check-prefix=HASWELL +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=btver2 | FileCheck %s --check-prefix=CHECK --check-prefix=BTVER2 +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=znver1 | FileCheck %s --check-prefix=CHECK --check-prefix=ZNVER1 + +define <4 x double> @test_addpd(<4 x double> %a0, <4 x double> %a1, <4 x double> *%a2) { +; SANDY-LABEL: test_addpd: +; SANDY: # BB#0: +; SANDY-NEXT: vaddpd %ymm1, %ymm0, %ymm0 # sched: [3:1.00] +; SANDY-NEXT: vaddpd (%rdi), %ymm0, %ymm0 # sched: [7:1.00] +; SANDY-NEXT: retq # sched: [5:1.00] +; +; HASWELL-LABEL: test_addpd: +; HASWELL: # BB#0: +; HASWELL-NEXT: vaddpd %ymm1, %ymm0, %ymm0 # sched: [3:1.00] +; HASWELL-NEXT: vaddpd (%rdi), %ymm0, %ymm0 # sched: [7:1.00] +; HASWELL-NEXT: retq # sched: [1:1.00] +; +; BTVER2-LABEL: test_addpd: +; BTVER2: # BB#0: +; BTVER2-NEXT: vaddpd %ymm1, %ymm0, %ymm0 # sched: [3:1.00] +; BTVER2-NEXT: vaddpd (%rdi), %ymm0, %ymm0 # sched: [8:1.00] +; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_addpd: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vaddpd %ymm1, %ymm0, %ymm0 # sched: [3:1.00] +; ZNVER1-NEXT: vaddpd (%rdi), %ymm0, %ymm0 # sched: [8:1.00] +; ZNVER1-NEXT: retq # sched: [4:1.00] + %1 = fadd <4 x double> %a0, %a1 + %2 = load <4 x double>, <4 x double> *%a2, align 32 + %3 
= fadd <4 x double> %1, %2 + ret <4 x double> %3 +} + +define <8 x float> @test_addps(<8 x float> %a0, <8 x float> %a1, <8 x float> *%a2) { +; SANDY-LABEL: test_addps: +; SANDY: # BB#0: +; SANDY-NEXT: vaddps %ymm1, %ymm0, %ymm0 # sched: [3:1.00] +; SANDY-NEXT: vaddps (%rdi), %ymm0, %ymm0 # sched: [7:1.00] +; SANDY-NEXT: retq # sched: [5:1.00] +; +; HASWELL-LABEL: test_addps: +; HASWELL: # BB#0: +; HASWELL-NEXT: vaddps %ymm1, %ymm0, %ymm0 # sched: [3:1.00] +; HASWELL-NEXT: vaddps (%rdi), %ymm0, %ymm0 # sched: [7:1.00] +; HASWELL-NEXT: retq # sched: [1:1.00] +; +; BTVER2-LABEL: test_addps: +; BTVER2: # BB#0: +; BTVER2-NEXT: vaddps %ymm1, %ymm0, %ymm0 # sched: [3:1.00] +; BTVER2-NEXT: vaddps (%rdi), %ymm0, %ymm0 # sched: [8:1.00] +; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_addps: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vaddps %ymm1, %ymm0, %ymm0 # sched: [3:1.00] +; ZNVER1-NEXT: vaddps (%rdi), %ymm0, %ymm0 # sched: [8:1.00] +; ZNVER1-NEXT: retq # sched: [4:1.00] + %1 = fadd <8 x float> %a0, %a1 + %2 = load <8 x float>, <8 x float> *%a2, align 32 + %3 = fadd <8 x float> %1, %2 + ret <8 x float> %3 +} + +define <4 x double> @test_addsubpd(<4 x double> %a0, <4 x double> %a1, <4 x double> *%a2) { +; SANDY-LABEL: test_addsubpd: +; SANDY: # BB#0: +; SANDY-NEXT: vaddsubpd %ymm1, %ymm0, %ymm0 # sched: [3:1.00] +; SANDY-NEXT: vaddsubpd (%rdi), %ymm0, %ymm0 # sched: [7:1.00] +; SANDY-NEXT: retq # sched: [5:1.00] +; +; HASWELL-LABEL: test_addsubpd: +; HASWELL: # BB#0: +; HASWELL-NEXT: vaddsubpd %ymm1, %ymm0, %ymm0 # sched: [3:1.00] +; HASWELL-NEXT: vaddsubpd (%rdi), %ymm0, %ymm0 # sched: [7:1.00] +; HASWELL-NEXT: retq # sched: [1:1.00] +; +; BTVER2-LABEL: test_addsubpd: +; BTVER2: # BB#0: +; BTVER2-NEXT: vaddsubpd %ymm1, %ymm0, %ymm0 # sched: [3:1.00] +; BTVER2-NEXT: vaddsubpd (%rdi), %ymm0, %ymm0 # sched: [8:1.00] +; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_addsubpd: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vaddsubpd %ymm1, %ymm0, %ymm0 # sched: [3:1.00] +; ZNVER1-NEXT: vaddsubpd (%rdi), %ymm0, %ymm0 # sched: [8:1.00] +; ZNVER1-NEXT: retq # sched: [4:1.00] + %1 = call <4 x double> @llvm.x86.avx.addsub.pd.256(<4 x double> %a0, <4 x double> %a1) + %2 = load <4 x double>, <4 x double> *%a2, align 32 + %3 = call <4 x double> @llvm.x86.avx.addsub.pd.256(<4 x double> %1, <4 x double> %2) + ret <4 x double> %3 +} +declare <4 x double> @llvm.x86.avx.addsub.pd.256(<4 x double>, <4 x double>) nounwind readnone + +define <8 x float> @test_addsubps(<8 x float> %a0, <8 x float> %a1, <8 x float> *%a2) { +; SANDY-LABEL: test_addsubps: +; SANDY: # BB#0: +; SANDY-NEXT: vaddsubps %ymm1, %ymm0, %ymm0 # sched: [3:1.00] +; SANDY-NEXT: vaddsubps (%rdi), %ymm0, %ymm0 # sched: [7:1.00] +; SANDY-NEXT: retq # sched: [5:1.00] +; +; HASWELL-LABEL: test_addsubps: +; HASWELL: # BB#0: +; HASWELL-NEXT: vaddsubps %ymm1, %ymm0, %ymm0 # sched: [3:1.00] +; HASWELL-NEXT: vaddsubps (%rdi), %ymm0, %ymm0 # sched: [7:1.00] +; HASWELL-NEXT: retq # sched: [1:1.00] +; +; BTVER2-LABEL: test_addsubps: +; BTVER2: # BB#0: +; BTVER2-NEXT: vaddsubps %ymm1, %ymm0, %ymm0 # sched: [3:1.00] +; BTVER2-NEXT: vaddsubps (%rdi), %ymm0, %ymm0 # sched: [8:1.00] +; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_addsubps: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vaddsubps %ymm1, %ymm0, %ymm0 # sched: [3:1.00] +; ZNVER1-NEXT: vaddsubps (%rdi), %ymm0, %ymm0 # sched: [8:1.00] +; ZNVER1-NEXT: retq # sched: [4:1.00] + %1 = call <8 x float> @llvm.x86.avx.addsub.ps.256(<8 x float> %a0, <8 x float> %a1) + %2 = load <8 x float>, <8 x 
float> *%a2, align 32 + %3 = call <8 x float> @llvm.x86.avx.addsub.ps.256(<8 x float> %1, <8 x float> %2) + ret <8 x float> %3 +} +declare <8 x float> @llvm.x86.avx.addsub.ps.256(<8 x float>, <8 x float>) nounwind readnone + +define <4 x double> @test_andnotpd(<4 x double> %a0, <4 x double> %a1, <4 x double> *%a2) { +; SANDY-LABEL: test_andnotpd: +; SANDY: # BB#0: +; SANDY-NEXT: vandnpd %ymm1, %ymm0, %ymm0 # sched: [1:0.33] +; SANDY-NEXT: vandnpd (%rdi), %ymm0, %ymm0 # sched: [5:0.50] +; SANDY-NEXT: vaddpd %ymm0, %ymm1, %ymm0 # sched: [3:1.00] +; SANDY-NEXT: retq # sched: [5:1.00] +; +; HASWELL-LABEL: test_andnotpd: +; HASWELL: # BB#0: +; HASWELL-NEXT: vandnpd %ymm1, %ymm0, %ymm0 # sched: [1:1.00] +; HASWELL-NEXT: vandnpd (%rdi), %ymm0, %ymm0 # sched: [5:1.00] +; HASWELL-NEXT: vaddpd %ymm0, %ymm1, %ymm0 # sched: [3:1.00] +; HASWELL-NEXT: retq # sched: [1:1.00] +; +; BTVER2-LABEL: test_andnotpd: +; BTVER2: # BB#0: +; BTVER2-NEXT: vandnpd %ymm1, %ymm0, %ymm0 # sched: [1:0.50] +; BTVER2-NEXT: vandnpd (%rdi), %ymm0, %ymm0 # sched: [6:1.00] +; BTVER2-NEXT: vaddpd %ymm0, %ymm1, %ymm0 # sched: [3:1.00] +; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_andnotpd: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vandnpd %ymm1, %ymm0, %ymm0 # sched: [1:0.50] +; ZNVER1-NEXT: vandnpd (%rdi), %ymm0, %ymm0 # sched: [6:1.00] +; ZNVER1-NEXT: vaddpd %ymm0, %ymm1, %ymm0 # sched: [3:1.00] +; ZNVER1-NEXT: retq # sched: [4:1.00] + %1 = bitcast <4 x double> %a0 to <4 x i64> + %2 = bitcast <4 x double> %a1 to <4 x i64> + %3 = xor <4 x i64> %1, + %4 = and <4 x i64> %3, %2 + %5 = load <4 x double>, <4 x double> *%a2, align 32 + %6 = bitcast <4 x double> %5 to <4 x i64> + %7 = xor <4 x i64> %4, + %8 = and <4 x i64> %6, %7 + %9 = bitcast <4 x i64> %8 to <4 x double> + %10 = fadd <4 x double> %a1, %9 + ret <4 x double> %10 +} + +define <8 x float> @test_andnotps(<8 x float> %a0, <8 x float> %a1, <8 x float> *%a2) { +; SANDY-LABEL: test_andnotps: +; SANDY: # BB#0: +; SANDY-NEXT: vandnps %ymm1, %ymm0, %ymm0 # sched: [1:0.33] +; SANDY-NEXT: vandnps (%rdi), %ymm0, %ymm0 # sched: [5:0.50] +; SANDY-NEXT: vaddps %ymm0, %ymm1, %ymm0 # sched: [3:1.00] +; SANDY-NEXT: retq # sched: [5:1.00] +; +; HASWELL-LABEL: test_andnotps: +; HASWELL: # BB#0: +; HASWELL-NEXT: vandnps %ymm1, %ymm0, %ymm0 # sched: [1:1.00] +; HASWELL-NEXT: vandnps (%rdi), %ymm0, %ymm0 # sched: [5:1.00] +; HASWELL-NEXT: vaddps %ymm0, %ymm1, %ymm0 # sched: [3:1.00] +; HASWELL-NEXT: retq # sched: [1:1.00] +; +; BTVER2-LABEL: test_andnotps: +; BTVER2: # BB#0: +; BTVER2-NEXT: vandnps %ymm1, %ymm0, %ymm0 # sched: [1:0.50] +; BTVER2-NEXT: vandnps (%rdi), %ymm0, %ymm0 # sched: [6:1.00] +; BTVER2-NEXT: vaddps %ymm0, %ymm1, %ymm0 # sched: [3:1.00] +; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_andnotps: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vandnps %ymm1, %ymm0, %ymm0 # sched: [1:0.50] +; ZNVER1-NEXT: vandnps (%rdi), %ymm0, %ymm0 # sched: [6:1.00] +; ZNVER1-NEXT: vaddps %ymm0, %ymm1, %ymm0 # sched: [3:1.00] +; ZNVER1-NEXT: retq # sched: [4:1.00] + %1 = bitcast <8 x float> %a0 to <4 x i64> + %2 = bitcast <8 x float> %a1 to <4 x i64> + %3 = xor <4 x i64> %1, + %4 = and <4 x i64> %3, %2 + %5 = load <8 x float>, <8 x float> *%a2, align 32 + %6 = bitcast <8 x float> %5 to <4 x i64> + %7 = xor <4 x i64> %4, + %8 = and <4 x i64> %6, %7 + %9 = bitcast <4 x i64> %8 to <8 x float> + %10 = fadd <8 x float> %a1, %9 + ret <8 x float> %10 +} + +define <4 x double> @test_andpd(<4 x double> %a0, <4 x double> %a1, <4 x double> *%a2) { +; SANDY-LABEL: test_andpd: +; SANDY: # 
BB#0: +; SANDY-NEXT: vandpd %ymm1, %ymm0, %ymm0 # sched: [1:0.33] +; SANDY-NEXT: vandpd (%rdi), %ymm0, %ymm0 # sched: [5:0.50] +; SANDY-NEXT: vaddpd %ymm0, %ymm1, %ymm0 # sched: [3:1.00] +; SANDY-NEXT: retq # sched: [5:1.00] +; +; HASWELL-LABEL: test_andpd: +; HASWELL: # BB#0: +; HASWELL-NEXT: vandpd %ymm1, %ymm0, %ymm0 # sched: [1:1.00] +; HASWELL-NEXT: vandpd (%rdi), %ymm0, %ymm0 # sched: [5:1.00] +; HASWELL-NEXT: vaddpd %ymm0, %ymm1, %ymm0 # sched: [3:1.00] +; HASWELL-NEXT: retq # sched: [1:1.00] +; +; BTVER2-LABEL: test_andpd: +; BTVER2: # BB#0: +; BTVER2-NEXT: vandpd %ymm1, %ymm0, %ymm0 # sched: [1:0.50] +; BTVER2-NEXT: vandpd (%rdi), %ymm0, %ymm0 # sched: [6:1.00] +; BTVER2-NEXT: vaddpd %ymm0, %ymm1, %ymm0 # sched: [3:1.00] +; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_andpd: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vandpd %ymm1, %ymm0, %ymm0 # sched: [1:0.50] +; ZNVER1-NEXT: vandpd (%rdi), %ymm0, %ymm0 # sched: [6:1.00] +; ZNVER1-NEXT: vaddpd %ymm0, %ymm1, %ymm0 # sched: [3:1.00] +; ZNVER1-NEXT: retq # sched: [4:1.00] + %1 = bitcast <4 x double> %a0 to <4 x i64> + %2 = bitcast <4 x double> %a1 to <4 x i64> + %3 = and <4 x i64> %1, %2 + %4 = load <4 x double>, <4 x double> *%a2, align 32 + %5 = bitcast <4 x double> %4 to <4 x i64> + %6 = and <4 x i64> %3, %5 + %7 = bitcast <4 x i64> %6 to <4 x double> + %8 = fadd <4 x double> %a1, %7 + ret <4 x double> %8 +} + +define <8 x float> @test_andps(<8 x float> %a0, <8 x float> %a1, <8 x float> *%a2) { +; SANDY-LABEL: test_andps: +; SANDY: # BB#0: +; SANDY-NEXT: vandps %ymm1, %ymm0, %ymm0 # sched: [1:0.33] +; SANDY-NEXT: vandps (%rdi), %ymm0, %ymm0 # sched: [5:0.50] +; SANDY-NEXT: vaddps %ymm0, %ymm1, %ymm0 # sched: [3:1.00] +; SANDY-NEXT: retq # sched: [5:1.00] +; +; HASWELL-LABEL: test_andps: +; HASWELL: # BB#0: +; HASWELL-NEXT: vandps %ymm1, %ymm0, %ymm0 # sched: [1:1.00] +; HASWELL-NEXT: vandps (%rdi), %ymm0, %ymm0 # sched: [5:1.00] +; HASWELL-NEXT: vaddps %ymm0, %ymm1, %ymm0 # sched: [3:1.00] +; HASWELL-NEXT: retq # sched: [1:1.00] +; +; BTVER2-LABEL: test_andps: +; BTVER2: # BB#0: +; BTVER2-NEXT: vandps %ymm1, %ymm0, %ymm0 # sched: [1:0.50] +; BTVER2-NEXT: vandps (%rdi), %ymm0, %ymm0 # sched: [6:1.00] +; BTVER2-NEXT: vaddps %ymm0, %ymm1, %ymm0 # sched: [3:1.00] +; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_andps: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vandps %ymm1, %ymm0, %ymm0 # sched: [1:0.50] +; ZNVER1-NEXT: vandps (%rdi), %ymm0, %ymm0 # sched: [6:1.00] +; ZNVER1-NEXT: vaddps %ymm0, %ymm1, %ymm0 # sched: [3:1.00] +; ZNVER1-NEXT: retq # sched: [4:1.00] + %1 = bitcast <8 x float> %a0 to <4 x i64> + %2 = bitcast <8 x float> %a1 to <4 x i64> + %3 = and <4 x i64> %1, %2 + %4 = load <8 x float>, <8 x float> *%a2, align 32 + %5 = bitcast <8 x float> %4 to <4 x i64> + %6 = and <4 x i64> %3, %5 + %7 = bitcast <4 x i64> %6 to <8 x float> + %8 = fadd <8 x float> %a1, %7 + ret <8 x float> %8 +} + +define <4 x double> @test_blendpd(<4 x double> %a0, <4 x double> %a1, <4 x double> *%a2) { +; SANDY-LABEL: test_blendpd: +; SANDY: # BB#0: +; SANDY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0],ymm1[1,2],ymm0[3] sched: [1:0.50] +; SANDY-NEXT: vaddpd %ymm0, %ymm1, %ymm0 # sched: [3:1.00] +; SANDY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0],mem[1,2],ymm0[3] sched: [5:0.50] +; SANDY-NEXT: retq # sched: [5:1.00] +; +; HASWELL-LABEL: test_blendpd: +; HASWELL: # BB#0: +; HASWELL-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0],ymm1[1,2],ymm0[3] sched: [1:0.33] +; HASWELL-NEXT: vaddpd %ymm0, %ymm1, %ymm0 # sched: [3:1.00] +; HASWELL-NEXT: vblendpd 
{{.*#+}} ymm0 = ymm0[0],mem[1,2],ymm0[3] sched: [5:0.50] +; HASWELL-NEXT: retq # sched: [1:1.00] +; +; BTVER2-LABEL: test_blendpd: +; BTVER2: # BB#0: +; BTVER2-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0],ymm1[1,2],ymm0[3] sched: [1:0.50] +; BTVER2-NEXT: vaddpd %ymm0, %ymm1, %ymm0 # sched: [3:1.00] +; BTVER2-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0],mem[1,2],ymm0[3] sched: [6:1.00] +; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_blendpd: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0],ymm1[1,2],ymm0[3] sched: [1:0.50] +; ZNVER1-NEXT: vaddpd %ymm0, %ymm1, %ymm0 # sched: [3:1.00] +; ZNVER1-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0],mem[1,2],ymm0[3] sched: [6:1.00] +; ZNVER1-NEXT: retq # sched: [4:1.00] + %1 = shufflevector <4 x double> %a0, <4 x double> %a1, <4 x i32> + %2 = load <4 x double>, <4 x double> *%a2, align 32 + %3 = fadd <4 x double> %a1, %1 + %4 = shufflevector <4 x double> %3, <4 x double> %2, <4 x i32> + ret <4 x double> %4 +} + +define <8 x float> @test_blendps(<8 x float> %a0, <8 x float> %a1, <8 x float> *%a2) { +; SANDY-LABEL: test_blendps: +; SANDY: # BB#0: +; SANDY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1,2],ymm0[3,4,5,6,7] sched: [1:0.50] +; SANDY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],mem[2],ymm0[3],mem[4,5,6],ymm0[7] sched: [5:0.50] +; SANDY-NEXT: retq # sched: [5:1.00] +; +; HASWELL-LABEL: test_blendps: +; HASWELL: # BB#0: +; HASWELL-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1,2],ymm0[3,4,5,6,7] sched: [1:0.33] +; HASWELL-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],mem[2],ymm0[3],mem[4,5,6],ymm0[7] sched: [5:0.50] +; HASWELL-NEXT: retq # sched: [1:1.00] +; +; BTVER2-LABEL: test_blendps: +; BTVER2: # BB#0: +; BTVER2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1,2],ymm0[3,4,5,6,7] sched: [1:0.50] +; BTVER2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],mem[2],ymm0[3],mem[4,5,6],ymm0[7] sched: [6:1.00] +; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_blendps: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1,2],ymm0[3,4,5,6,7] sched: [1:0.50] +; ZNVER1-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],mem[2],ymm0[3],mem[4,5,6],ymm0[7] sched: [6:1.00] +; ZNVER1-NEXT: retq # sched: [4:1.00] + %1 = shufflevector <8 x float> %a0, <8 x float> %a1, <8 x i32> + %2 = load <8 x float>, <8 x float> *%a2, align 32 + %3 = shufflevector <8 x float> %1, <8 x float> %2, <8 x i32> + ret <8 x float> %3 +} + +define <4 x double> @test_blendvpd(<4 x double> %a0, <4 x double> %a1, <4 x double> %a2, <4 x double> *%a3) { +; SANDY-LABEL: test_blendvpd: +; SANDY: # BB#0: +; SANDY-NEXT: vblendvpd %ymm2, %ymm1, %ymm0, %ymm0 # sched: [2:1.00] +; SANDY-NEXT: vblendvpd %ymm2, (%rdi), %ymm0, %ymm0 # sched: [6:1.00] +; SANDY-NEXT: retq # sched: [5:1.00] +; +; HASWELL-LABEL: test_blendvpd: +; HASWELL: # BB#0: +; HASWELL-NEXT: vblendvpd %ymm2, %ymm1, %ymm0, %ymm0 # sched: [2:2.00] +; HASWELL-NEXT: vblendvpd %ymm2, (%rdi), %ymm0, %ymm0 # sched: [6:2.00] +; HASWELL-NEXT: retq # sched: [1:1.00] +; +; BTVER2-LABEL: test_blendvpd: +; BTVER2: # BB#0: +; BTVER2-NEXT: vblendvpd %ymm2, %ymm1, %ymm0, %ymm0 # sched: [2:1.00] +; BTVER2-NEXT: vblendvpd %ymm2, (%rdi), %ymm0, %ymm0 # sched: [7:1.00] +; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_blendvpd: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vblendvpd %ymm2, %ymm1, %ymm0, %ymm0 # sched: [2:1.00] +; ZNVER1-NEXT: vblendvpd %ymm2, (%rdi), %ymm0, %ymm0 # sched: [7:1.00] +; ZNVER1-NEXT: retq # sched: [4:1.00] + %1 = call <4 x double> @llvm.x86.avx.blendv.pd.256(<4 x double> %a0, <4 x double> %a1, <4 x 
double> %a2) + %2 = load <4 x double>, <4 x double> *%a3, align 32 + %3 = call <4 x double> @llvm.x86.avx.blendv.pd.256(<4 x double> %1, <4 x double> %2, <4 x double> %a2) + ret <4 x double> %3 +} +declare <4 x double> @llvm.x86.avx.blendv.pd.256(<4 x double>, <4 x double>, <4 x double>) nounwind readnone + +define <8 x float> @test_blendvps(<8 x float> %a0, <8 x float> %a1, <8 x float> %a2, <8 x float> *%a3) { +; SANDY-LABEL: test_blendvps: +; SANDY: # BB#0: +; SANDY-NEXT: vblendvps %ymm2, %ymm1, %ymm0, %ymm0 # sched: [2:1.00] +; SANDY-NEXT: vblendvps %ymm2, (%rdi), %ymm0, %ymm0 # sched: [6:1.00] +; SANDY-NEXT: retq # sched: [5:1.00] +; +; HASWELL-LABEL: test_blendvps: +; HASWELL: # BB#0: +; HASWELL-NEXT: vblendvps %ymm2, %ymm1, %ymm0, %ymm0 # sched: [2:2.00] +; HASWELL-NEXT: vblendvps %ymm2, (%rdi), %ymm0, %ymm0 # sched: [6:2.00] +; HASWELL-NEXT: retq # sched: [1:1.00] +; +; BTVER2-LABEL: test_blendvps: +; BTVER2: # BB#0: +; BTVER2-NEXT: vblendvps %ymm2, %ymm1, %ymm0, %ymm0 # sched: [2:1.00] +; BTVER2-NEXT: vblendvps %ymm2, (%rdi), %ymm0, %ymm0 # sched: [7:1.00] +; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_blendvps: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vblendvps %ymm2, %ymm1, %ymm0, %ymm0 # sched: [2:1.00] +; ZNVER1-NEXT: vblendvps %ymm2, (%rdi), %ymm0, %ymm0 # sched: [7:1.00] +; ZNVER1-NEXT: retq # sched: [4:1.00] + %1 = call <8 x float> @llvm.x86.avx.blendv.ps.256(<8 x float> %a0, <8 x float> %a1, <8 x float> %a2) + %2 = load <8 x float>, <8 x float> *%a3, align 32 + %3 = call <8 x float> @llvm.x86.avx.blendv.ps.256(<8 x float> %1, <8 x float> %2, <8 x float> %a2) + ret <8 x float> %3 +} +declare <8 x float> @llvm.x86.avx.blendv.ps.256(<8 x float>, <8 x float>, <8 x float>) nounwind readnone + +define <8 x float> @test_broadcastf128(<4 x float> *%a0) { +; SANDY-LABEL: test_broadcastf128: +; SANDY: # BB#0: +; SANDY-NEXT: vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1] sched: [5:1.00] +; SANDY-NEXT: retq # sched: [5:1.00] +; +; HASWELL-LABEL: test_broadcastf128: +; HASWELL: # BB#0: +; HASWELL-NEXT: vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1] sched: [4:0.50] +; HASWELL-NEXT: retq # sched: [1:1.00] +; +; BTVER2-LABEL: test_broadcastf128: +; BTVER2: # BB#0: +; BTVER2-NEXT: vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1] sched: [6:1.00] +; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_broadcastf128: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1] sched: [6:1.00] +; ZNVER1-NEXT: retq # sched: [4:1.00] + %1 = load <4 x float>, <4 x float> *%a0, align 32 + %2 = shufflevector <4 x float> %1, <4 x float> undef, <8 x i32> + ret <8 x float> %2 +} + +define <4 x double> @test_broadcastsd_ymm(double *%a0) { +; SANDY-LABEL: test_broadcastsd_ymm: +; SANDY: # BB#0: +; SANDY-NEXT: vbroadcastsd (%rdi), %ymm0 # sched: [5:1.00] +; SANDY-NEXT: retq # sched: [5:1.00] +; +; HASWELL-LABEL: test_broadcastsd_ymm: +; HASWELL: # BB#0: +; HASWELL-NEXT: vbroadcastsd (%rdi), %ymm0 # sched: [5:1.00] +; HASWELL-NEXT: retq # sched: [1:1.00] +; +; BTVER2-LABEL: test_broadcastsd_ymm: +; BTVER2: # BB#0: +; BTVER2-NEXT: vbroadcastsd (%rdi), %ymm0 # sched: [6:1.00] +; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_broadcastsd_ymm: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vbroadcastsd (%rdi), %ymm0 # sched: [6:1.00] +; ZNVER1-NEXT: retq # sched: [4:1.00] + %1 = load double, double *%a0, align 8 + %2 = insertelement <4 x double> undef, double %1, i32 0 + %3 = shufflevector <4 x double> %2, <4 x double> undef, <4 x i32> zeroinitializer + ret <4 x double> %3 +} + 
+define <4 x float> @test_broadcastss(float *%a0) { +; SANDY-LABEL: test_broadcastss: +; SANDY: # BB#0: +; SANDY-NEXT: vbroadcastss (%rdi), %xmm0 # sched: [4:0.50] +; SANDY-NEXT: retq # sched: [5:1.00] +; +; HASWELL-LABEL: test_broadcastss: +; HASWELL: # BB#0: +; HASWELL-NEXT: vbroadcastss (%rdi), %xmm0 # sched: [4:0.50] +; HASWELL-NEXT: retq # sched: [1:1.00] +; +; BTVER2-LABEL: test_broadcastss: +; BTVER2: # BB#0: +; BTVER2-NEXT: vbroadcastss (%rdi), %xmm0 # sched: [5:1.00] +; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_broadcastss: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vbroadcastss (%rdi), %xmm0 # sched: [5:1.00] +; ZNVER1-NEXT: retq # sched: [4:1.00] + %1 = load float, float *%a0, align 4 + %2 = insertelement <4 x float> undef, float %1, i32 0 + %3 = shufflevector <4 x float> %2, <4 x float> undef, <4 x i32> zeroinitializer + ret <4 x float> %3 +} + +define <8 x float> @test_broadcastss_ymm(float *%a0) { +; SANDY-LABEL: test_broadcastss_ymm: +; SANDY: # BB#0: +; SANDY-NEXT: vbroadcastss (%rdi), %ymm0 # sched: [5:1.00] +; SANDY-NEXT: retq # sched: [5:1.00] +; +; HASWELL-LABEL: test_broadcastss_ymm: +; HASWELL: # BB#0: +; HASWELL-NEXT: vbroadcastss (%rdi), %ymm0 # sched: [5:1.00] +; HASWELL-NEXT: retq # sched: [1:1.00] +; +; BTVER2-LABEL: test_broadcastss_ymm: +; BTVER2: # BB#0: +; BTVER2-NEXT: vbroadcastss (%rdi), %ymm0 # sched: [6:1.00] +; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_broadcastss_ymm: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vbroadcastss (%rdi), %ymm0 # sched: [6:1.00] +; ZNVER1-NEXT: retq # sched: [4:1.00] + %1 = load float, float *%a0, align 4 + %2 = insertelement <8 x float> undef, float %1, i32 0 + %3 = shufflevector <8 x float> %2, <8 x float> undef, <8 x i32> zeroinitializer + ret <8 x float> %3 +} + +define <4 x double> @test_cmppd(<4 x double> %a0, <4 x double> %a1, <4 x double> *%a2) { +; SANDY-LABEL: test_cmppd: +; SANDY: # BB#0: +; SANDY-NEXT: vcmpeqpd %ymm1, %ymm0, %ymm1 # sched: [3:1.00] +; SANDY-NEXT: vcmpeqpd (%rdi), %ymm0, %ymm0 # sched: [7:1.00] +; SANDY-NEXT: vorpd %ymm0, %ymm1, %ymm0 # sched: [1:0.33] +; SANDY-NEXT: retq # sched: [5:1.00] +; +; HASWELL-LABEL: test_cmppd: +; HASWELL: # BB#0: +; HASWELL-NEXT: vcmpeqpd %ymm1, %ymm0, %ymm1 # sched: [3:1.00] +; HASWELL-NEXT: vcmpeqpd (%rdi), %ymm0, %ymm0 # sched: [7:1.00] +; HASWELL-NEXT: vorpd %ymm0, %ymm1, %ymm0 # sched: [1:1.00] +; HASWELL-NEXT: retq # sched: [1:1.00] +; +; BTVER2-LABEL: test_cmppd: +; BTVER2: # BB#0: +; BTVER2-NEXT: vcmpeqpd %ymm1, %ymm0, %ymm1 # sched: [3:1.00] +; BTVER2-NEXT: vcmpeqpd (%rdi), %ymm0, %ymm0 # sched: [8:1.00] +; BTVER2-NEXT: vorpd %ymm0, %ymm1, %ymm0 # sched: [1:0.50] +; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_cmppd: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vcmpeqpd %ymm1, %ymm0, %ymm1 # sched: [3:1.00] +; ZNVER1-NEXT: vcmpeqpd (%rdi), %ymm0, %ymm0 # sched: [8:1.00] +; ZNVER1-NEXT: vorpd %ymm0, %ymm1, %ymm0 # sched: [1:0.50] +; ZNVER1-NEXT: retq # sched: [4:1.00] + %1 = fcmp oeq <4 x double> %a0, %a1 + %2 = load <4 x double>, <4 x double> *%a2, align 32 + %3 = fcmp oeq <4 x double> %a0, %2 + %4 = sext <4 x i1> %1 to <4 x i64> + %5 = sext <4 x i1> %3 to <4 x i64> + %6 = or <4 x i64> %4, %5 + %7 = bitcast <4 x i64> %6 to <4 x double> + ret <4 x double> %7 +} + +define <8 x float> @test_cmpps(<8 x float> %a0, <8 x float> %a1, <8 x float> *%a2) { +; SANDY-LABEL: test_cmpps: +; SANDY: # BB#0: +; SANDY-NEXT: vcmpeqps %ymm1, %ymm0, %ymm1 # sched: [3:1.00] +; SANDY-NEXT: vcmpeqps (%rdi), %ymm0, %ymm0 # sched: [7:1.00] +; SANDY-NEXT: vorps 
%ymm0, %ymm1, %ymm0 # sched: [1:0.33] +; SANDY-NEXT: retq # sched: [5:1.00] +; +; HASWELL-LABEL: test_cmpps: +; HASWELL: # BB#0: +; HASWELL-NEXT: vcmpeqps %ymm1, %ymm0, %ymm1 # sched: [3:1.00] +; HASWELL-NEXT: vcmpeqps (%rdi), %ymm0, %ymm0 # sched: [7:1.00] +; HASWELL-NEXT: vorps %ymm0, %ymm1, %ymm0 # sched: [1:1.00] +; HASWELL-NEXT: retq # sched: [1:1.00] +; +; BTVER2-LABEL: test_cmpps: +; BTVER2: # BB#0: +; BTVER2-NEXT: vcmpeqps %ymm1, %ymm0, %ymm1 # sched: [3:1.00] +; BTVER2-NEXT: vcmpeqps (%rdi), %ymm0, %ymm0 # sched: [8:1.00] +; BTVER2-NEXT: vorps %ymm0, %ymm1, %ymm0 # sched: [1:0.50] +; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_cmpps: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vcmpeqps %ymm1, %ymm0, %ymm1 # sched: [3:1.00] +; ZNVER1-NEXT: vcmpeqps (%rdi), %ymm0, %ymm0 # sched: [8:1.00] +; ZNVER1-NEXT: vorps %ymm0, %ymm1, %ymm0 # sched: [1:0.50] +; ZNVER1-NEXT: retq # sched: [4:1.00] + %1 = fcmp oeq <8 x float> %a0, %a1 + %2 = load <8 x float>, <8 x float> *%a2, align 32 + %3 = fcmp oeq <8 x float> %a0, %2 + %4 = sext <8 x i1> %1 to <8 x i32> + %5 = sext <8 x i1> %3 to <8 x i32> + %6 = or <8 x i32> %4, %5 + %7 = bitcast <8 x i32> %6 to <8 x float> + ret <8 x float> %7 +} + +define <4 x double> @test_cvtdq2pd(<4 x i32> %a0, <4 x i32> *%a1) { +; SANDY-LABEL: test_cvtdq2pd: +; SANDY: # BB#0: +; SANDY-NEXT: vcvtdq2pd %xmm0, %ymm0 # sched: [4:1.00] +; SANDY-NEXT: vcvtdq2pd (%rdi), %ymm1 # sched: [8:1.00] +; SANDY-NEXT: vaddpd %ymm1, %ymm0, %ymm0 # sched: [3:1.00] +; SANDY-NEXT: retq # sched: [5:1.00] +; +; HASWELL-LABEL: test_cvtdq2pd: +; HASWELL: # BB#0: +; HASWELL-NEXT: vcvtdq2pd %xmm0, %ymm0 # sched: [6:1.00] +; HASWELL-NEXT: vcvtdq2pd (%rdi), %ymm1 # sched: [8:1.00] +; HASWELL-NEXT: vaddpd %ymm1, %ymm0, %ymm0 # sched: [3:1.00] +; HASWELL-NEXT: retq # sched: [1:1.00] +; +; BTVER2-LABEL: test_cvtdq2pd: +; BTVER2: # BB#0: +; BTVER2-NEXT: vcvtdq2pd (%rdi), %ymm1 # sched: [8:1.00] +; BTVER2-NEXT: vcvtdq2pd %xmm0, %ymm0 # sched: [3:1.00] +; BTVER2-NEXT: vaddpd %ymm1, %ymm0, %ymm0 # sched: [3:1.00] +; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_cvtdq2pd: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vcvtdq2pd (%rdi), %ymm1 # sched: [8:1.00] +; ZNVER1-NEXT: vcvtdq2pd %xmm0, %ymm0 # sched: [3:1.00] +; ZNVER1-NEXT: vaddpd %ymm1, %ymm0, %ymm0 # sched: [3:1.00] +; ZNVER1-NEXT: retq # sched: [4:1.00] + %1 = sitofp <4 x i32> %a0 to <4 x double> + %2 = load <4 x i32>, <4 x i32> *%a1, align 16 + %3 = sitofp <4 x i32> %2 to <4 x double> + %4 = fadd <4 x double> %1, %3 + ret <4 x double> %4 +} + +define <8 x float> @test_cvtdq2ps(<8 x i32> %a0, <8 x i32> *%a1) { +; SANDY-LABEL: test_cvtdq2ps: +; SANDY: # BB#0: +; SANDY-NEXT: vcvtdq2ps %ymm0, %ymm0 # sched: [4:1.00] +; SANDY-NEXT: vmovaps (%rdi), %xmm1 # sched: [4:0.50] +; SANDY-NEXT: vinsertf128 $1, 16(%rdi), %ymm1, %ymm1 # sched: [5:1.00] +; SANDY-NEXT: vcvtdq2ps %ymm1, %ymm1 # sched: [4:1.00] +; SANDY-NEXT: vaddps %ymm1, %ymm0, %ymm0 # sched: [3:1.00] +; SANDY-NEXT: retq # sched: [5:1.00] +; +; HASWELL-LABEL: test_cvtdq2ps: +; HASWELL: # BB#0: +; HASWELL-NEXT: vcvtdq2ps %ymm0, %ymm0 # sched: [4:1.00] +; HASWELL-NEXT: vcvtdq2ps (%rdi), %ymm1 # sched: [8:1.00] +; HASWELL-NEXT: vaddps %ymm1, %ymm0, %ymm0 # sched: [3:1.00] +; HASWELL-NEXT: retq # sched: [1:1.00] +; +; BTVER2-LABEL: test_cvtdq2ps: +; BTVER2: # BB#0: +; BTVER2-NEXT: vcvtdq2ps (%rdi), %ymm1 # sched: [8:1.00] +; BTVER2-NEXT: vcvtdq2ps %ymm0, %ymm0 # sched: [3:1.00] +; BTVER2-NEXT: vaddps %ymm1, %ymm0, %ymm0 # sched: [3:1.00] +; BTVER2-NEXT: retq # sched: [4:1.00] +; +; 
ZNVER1-LABEL: test_cvtdq2ps: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vcvtdq2ps (%rdi), %ymm1 # sched: [8:1.00] +; ZNVER1-NEXT: vcvtdq2ps %ymm0, %ymm0 # sched: [3:1.00] +; ZNVER1-NEXT: vaddps %ymm1, %ymm0, %ymm0 # sched: [3:1.00] +; ZNVER1-NEXT: retq # sched: [4:1.00] + %1 = sitofp <8 x i32> %a0 to <8 x float> + %2 = load <8 x i32>, <8 x i32> *%a1, align 16 + %3 = sitofp <8 x i32> %2 to <8 x float> + %4 = fadd <8 x float> %1, %3 + ret <8 x float> %4 +} + +define <8 x i32> @test_cvtpd2dq(<4 x double> %a0, <4 x double> *%a1) { +; SANDY-LABEL: test_cvtpd2dq: +; SANDY: # BB#0: +; SANDY-NEXT: vcvttpd2dq %ymm0, %xmm0 # sched: [3:1.00] +; SANDY-NEXT: vcvttpd2dqy (%rdi), %xmm1 # sched: [7:1.00] +; SANDY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 # sched: [1:1.00] +; SANDY-NEXT: retq # sched: [5:1.00] +; +; HASWELL-LABEL: test_cvtpd2dq: +; HASWELL: # BB#0: +; HASWELL-NEXT: vcvttpd2dq %ymm0, %xmm0 # sched: [6:1.00] +; HASWELL-NEXT: vcvttpd2dqy (%rdi), %xmm1 # sched: [10:1.00] +; HASWELL-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 # sched: [3:1.00] +; HASWELL-NEXT: retq # sched: [1:1.00] +; +; BTVER2-LABEL: test_cvtpd2dq: +; BTVER2: # BB#0: +; BTVER2-NEXT: vcvttpd2dqy (%rdi), %xmm1 # sched: [8:1.00] +; BTVER2-NEXT: vcvttpd2dq %ymm0, %xmm0 # sched: [3:1.00] +; BTVER2-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 # sched: [1:0.50] +; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_cvtpd2dq: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vcvttpd2dqy (%rdi), %xmm1 # sched: [8:1.00] +; ZNVER1-NEXT: vcvttpd2dq %ymm0, %xmm0 # sched: [3:1.00] +; ZNVER1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 # sched: [1:0.50] +; ZNVER1-NEXT: retq # sched: [4:1.00] + %1 = fptosi <4 x double> %a0 to <4 x i32> + %2 = load <4 x double>, <4 x double> *%a1, align 32 + %3 = fptosi <4 x double> %2 to <4 x i32> + %4 = shufflevector <4 x i32> %1, <4 x i32> %3, <8 x i32> + ret <8 x i32> %4 +} + +define <8 x float> @test_cvtpd2ps(<4 x double> %a0, <4 x double> *%a1) { +; SANDY-LABEL: test_cvtpd2ps: +; SANDY: # BB#0: +; SANDY-NEXT: vcvtpd2ps %ymm0, %xmm0 # sched: [3:1.00] +; SANDY-NEXT: vcvtpd2psy (%rdi), %xmm1 # sched: [7:1.00] +; SANDY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 # sched: [1:1.00] +; SANDY-NEXT: retq # sched: [5:1.00] +; +; HASWELL-LABEL: test_cvtpd2ps: +; HASWELL: # BB#0: +; HASWELL-NEXT: vcvtpd2ps %ymm0, %xmm0 # sched: [5:1.00] +; HASWELL-NEXT: vcvtpd2psy (%rdi), %xmm1 # sched: [9:1.00] +; HASWELL-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 # sched: [3:1.00] +; HASWELL-NEXT: retq # sched: [1:1.00] +; +; BTVER2-LABEL: test_cvtpd2ps: +; BTVER2: # BB#0: +; BTVER2-NEXT: vcvtpd2psy (%rdi), %xmm1 # sched: [8:1.00] +; BTVER2-NEXT: vcvtpd2ps %ymm0, %xmm0 # sched: [3:1.00] +; BTVER2-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 # sched: [1:0.50] +; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_cvtpd2ps: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vcvtpd2psy (%rdi), %xmm1 # sched: [8:1.00] +; ZNVER1-NEXT: vcvtpd2ps %ymm0, %xmm0 # sched: [3:1.00] +; ZNVER1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 # sched: [1:0.50] +; ZNVER1-NEXT: retq # sched: [4:1.00] + %1 = fptrunc <4 x double> %a0 to <4 x float> + %2 = load <4 x double>, <4 x double> *%a1, align 32 + %3 = fptrunc <4 x double> %2 to <4 x float> + %4 = shufflevector <4 x float> %1, <4 x float> %3, <8 x i32> + ret <8 x float> %4 +} + +define <8 x i32> @test_cvtps2dq(<8 x float> %a0, <8 x float> *%a1) { +; SANDY-LABEL: test_cvtps2dq: +; SANDY: # BB#0: +; SANDY-NEXT: vcvttps2dq %ymm0, %ymm0 # sched: [3:1.00] +; SANDY-NEXT: vcvttps2dq (%rdi), %ymm1 # sched: [7:1.00] +; SANDY-NEXT: vorps 
%ymm1, %ymm0, %ymm0 # sched: [1:0.33] +; SANDY-NEXT: retq # sched: [5:1.00] +; +; HASWELL-LABEL: test_cvtps2dq: +; HASWELL: # BB#0: +; HASWELL-NEXT: vcvttps2dq %ymm0, %ymm0 # sched: [3:1.00] +; HASWELL-NEXT: vcvttps2dq (%rdi), %ymm1 # sched: [7:1.00] +; HASWELL-NEXT: vorps %ymm1, %ymm0, %ymm0 # sched: [1:1.00] +; HASWELL-NEXT: retq # sched: [1:1.00] +; +; BTVER2-LABEL: test_cvtps2dq: +; BTVER2: # BB#0: +; BTVER2-NEXT: vcvttps2dq (%rdi), %ymm1 # sched: [8:1.00] +; BTVER2-NEXT: vcvttps2dq %ymm0, %ymm0 # sched: [3:1.00] +; BTVER2-NEXT: vorps %ymm1, %ymm0, %ymm0 # sched: [1:0.50] +; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_cvtps2dq: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vcvttps2dq (%rdi), %ymm1 # sched: [8:1.00] +; ZNVER1-NEXT: vcvttps2dq %ymm0, %ymm0 # sched: [3:1.00] +; ZNVER1-NEXT: vorps %ymm1, %ymm0, %ymm0 # sched: [1:0.50] +; ZNVER1-NEXT: retq # sched: [4:1.00] + %1 = fptosi <8 x float> %a0 to <8 x i32> + %2 = load <8 x float>, <8 x float> *%a1, align 32 + %3 = fptosi <8 x float> %2 to <8 x i32> + %4 = or <8 x i32> %1, %3 + ret <8 x i32> %4 +} + +define <4 x double> @test_divpd(<4 x double> %a0, <4 x double> %a1, <4 x double> *%a2) { +; SANDY-LABEL: test_divpd: +; SANDY: # BB#0: +; SANDY-NEXT: vdivpd %ymm1, %ymm0, %ymm0 # sched: [12:1.00] +; SANDY-NEXT: vdivpd (%rdi), %ymm0, %ymm0 # sched: [16:1.00] +; SANDY-NEXT: retq # sched: [5:1.00] +; +; HASWELL-LABEL: test_divpd: +; HASWELL: # BB#0: +; HASWELL-NEXT: vdivpd %ymm1, %ymm0, %ymm0 # sched: [27:2.00] +; HASWELL-NEXT: vdivpd (%rdi), %ymm0, %ymm0 # sched: [31:2.00] +; HASWELL-NEXT: retq # sched: [1:1.00] +; +; BTVER2-LABEL: test_divpd: +; BTVER2: # BB#0: +; BTVER2-NEXT: vdivpd %ymm1, %ymm0, %ymm0 # sched: [19:19.00] +; BTVER2-NEXT: vdivpd (%rdi), %ymm0, %ymm0 # sched: [24:19.00] +; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_divpd: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vdivpd %ymm1, %ymm0, %ymm0 # sched: [19:19.00] +; ZNVER1-NEXT: vdivpd (%rdi), %ymm0, %ymm0 # sched: [24:19.00] +; ZNVER1-NEXT: retq # sched: [4:1.00] + %1 = fdiv <4 x double> %a0, %a1 + %2 = load <4 x double>, <4 x double> *%a2, align 32 + %3 = fdiv <4 x double> %1, %2 + ret <4 x double> %3 +} + +define <8 x float> @test_divps(<8 x float> %a0, <8 x float> %a1, <8 x float> *%a2) { +; SANDY-LABEL: test_divps: +; SANDY: # BB#0: +; SANDY-NEXT: vdivps %ymm1, %ymm0, %ymm0 # sched: [12:1.00] +; SANDY-NEXT: vdivps (%rdi), %ymm0, %ymm0 # sched: [16:1.00] +; SANDY-NEXT: retq # sched: [5:1.00] +; +; HASWELL-LABEL: test_divps: +; HASWELL: # BB#0: +; HASWELL-NEXT: vdivps %ymm1, %ymm0, %ymm0 # sched: [19:2.00] +; HASWELL-NEXT: vdivps (%rdi), %ymm0, %ymm0 # sched: [23:2.00] +; HASWELL-NEXT: retq # sched: [1:1.00] +; +; BTVER2-LABEL: test_divps: +; BTVER2: # BB#0: +; BTVER2-NEXT: vdivps %ymm1, %ymm0, %ymm0 # sched: [19:19.00] +; BTVER2-NEXT: vdivps (%rdi), %ymm0, %ymm0 # sched: [24:19.00] +; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_divps: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vdivps %ymm1, %ymm0, %ymm0 # sched: [19:19.00] +; ZNVER1-NEXT: vdivps (%rdi), %ymm0, %ymm0 # sched: [24:19.00] +; ZNVER1-NEXT: retq # sched: [4:1.00] + %1 = fdiv <8 x float> %a0, %a1 + %2 = load <8 x float>, <8 x float> *%a2, align 32 + %3 = fdiv <8 x float> %1, %2 + ret <8 x float> %3 +} + +define <8 x float> @test_dpps(<8 x float> %a0, <8 x float> %a1, <8 x float> *%a2) { +; SANDY-LABEL: test_dpps: +; SANDY: # BB#0: +; SANDY-NEXT: vdpps $7, %ymm1, %ymm0, %ymm0 # sched: [3:1.00] +; SANDY-NEXT: vdpps $7, (%rdi), %ymm0, %ymm0 # sched: [7:1.00] +; SANDY-NEXT: retq # sched: 
[5:1.00] +; +; HASWELL-LABEL: test_dpps: +; HASWELL: # BB#0: +; HASWELL-NEXT: vdpps $7, %ymm1, %ymm0, %ymm0 # sched: [14:2.00] +; HASWELL-NEXT: vdpps $7, (%rdi), %ymm0, %ymm0 # sched: [18:2.00] +; HASWELL-NEXT: retq # sched: [1:1.00] +; +; BTVER2-LABEL: test_dpps: +; BTVER2: # BB#0: +; BTVER2-NEXT: vdpps $7, %ymm1, %ymm0, %ymm0 # sched: [3:1.00] +; BTVER2-NEXT: vdpps $7, (%rdi), %ymm0, %ymm0 # sched: [8:1.00] +; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_dpps: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vdpps $7, %ymm1, %ymm0, %ymm0 # sched: [3:1.00] +; ZNVER1-NEXT: vdpps $7, (%rdi), %ymm0, %ymm0 # sched: [8:1.00] +; ZNVER1-NEXT: retq # sched: [4:1.00] + %1 = call <8 x float> @llvm.x86.avx.dp.ps.256(<8 x float> %a0, <8 x float> %a1, i8 7) + %2 = load <8 x float>, <8 x float> *%a2, align 32 + %3 = call <8 x float> @llvm.x86.avx.dp.ps.256(<8 x float> %1, <8 x float> %2, i8 7) + ret <8 x float> %3 +} +declare <8 x float> @llvm.x86.avx.dp.ps.256(<8 x float>, <8 x float>, i8) nounwind readnone + +define <4 x float> @test_extractf128(<8 x float> %a0, <8 x float> %a1, <4 x float> *%a2) { +; SANDY-LABEL: test_extractf128: +; SANDY: # BB#0: +; SANDY-NEXT: vextractf128 $1, %ymm0, %xmm0 # sched: [1:1.00] +; SANDY-NEXT: vextractf128 $1, %ymm1, (%rdi) # sched: [1:1.00] +; SANDY-NEXT: vzeroupper # sched: [?:0.000000e+00] +; SANDY-NEXT: retq # sched: [5:1.00] +; +; HASWELL-LABEL: test_extractf128: +; HASWELL: # BB#0: +; HASWELL-NEXT: vextractf128 $1, %ymm0, %xmm0 # sched: [3:1.00] +; HASWELL-NEXT: vextractf128 $1, %ymm1, (%rdi) # sched: [4:1.00] +; HASWELL-NEXT: vzeroupper # sched: [1:0.00] +; HASWELL-NEXT: retq # sched: [1:1.00] +; +; BTVER2-LABEL: test_extractf128: +; BTVER2: # BB#0: +; BTVER2-NEXT: vextractf128 $1, %ymm0, %xmm0 # sched: [1:0.50] +; BTVER2-NEXT: vextractf128 $1, %ymm1, (%rdi) # sched: [1:1.00] +; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_extractf128: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vextractf128 $1, %ymm0, %xmm0 # sched: [1:0.50] +; ZNVER1-NEXT: vextractf128 $1, %ymm1, (%rdi) # sched: [1:1.00] +; ZNVER1-NEXT: vzeroupper # sched: [?:0.000000e+00] +; ZNVER1-NEXT: retq # sched: [4:1.00] + %1 = shufflevector <8 x float> %a0, <8 x float> undef, <4 x i32> + %2 = shufflevector <8 x float> %a1, <8 x float> undef, <4 x i32> + store <4 x float> %2, <4 x float> *%a2 + ret <4 x float> %1 +} + +define <4 x double> @test_haddpd(<4 x double> %a0, <4 x double> %a1, <4 x double> *%a2) { +; SANDY-LABEL: test_haddpd: +; SANDY: # BB#0: +; SANDY-NEXT: vhaddpd %ymm1, %ymm0, %ymm0 # sched: [3:1.00] +; SANDY-NEXT: vhaddpd (%rdi), %ymm0, %ymm0 # sched: [7:1.00] +; SANDY-NEXT: retq # sched: [5:1.00] +; +; HASWELL-LABEL: test_haddpd: +; HASWELL: # BB#0: +; HASWELL-NEXT: vhaddpd %ymm1, %ymm0, %ymm0 # sched: [5:2.00] +; HASWELL-NEXT: vhaddpd (%rdi), %ymm0, %ymm0 # sched: [9:2.00] +; HASWELL-NEXT: retq # sched: [1:1.00] +; +; BTVER2-LABEL: test_haddpd: +; BTVER2: # BB#0: +; BTVER2-NEXT: vhaddpd %ymm1, %ymm0, %ymm0 # sched: [3:1.00] +; BTVER2-NEXT: vhaddpd (%rdi), %ymm0, %ymm0 # sched: [8:1.00] +; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_haddpd: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vhaddpd %ymm1, %ymm0, %ymm0 # sched: [3:1.00] +; ZNVER1-NEXT: vhaddpd (%rdi), %ymm0, %ymm0 # sched: [8:1.00] +; ZNVER1-NEXT: retq # sched: [4:1.00] + %1 = call <4 x double> @llvm.x86.avx.hadd.pd.256(<4 x double> %a0, <4 x double> %a1) + %2 = load <4 x double>, <4 x double> *%a2, align 32 + %3 = call <4 x double> @llvm.x86.avx.hadd.pd.256(<4 x double> %1, <4 x double> %2) + ret <4 x double> 
%3 +} +declare <4 x double> @llvm.x86.avx.hadd.pd.256(<4 x double>, <4 x double>) nounwind readnone + +define <8 x float> @test_haddps(<8 x float> %a0, <8 x float> %a1, <8 x float> *%a2) { +; SANDY-LABEL: test_haddps: +; SANDY: # BB#0: +; SANDY-NEXT: vhaddps %ymm1, %ymm0, %ymm0 # sched: [3:1.00] +; SANDY-NEXT: vhaddps (%rdi), %ymm0, %ymm0 # sched: [7:1.00] +; SANDY-NEXT: retq # sched: [5:1.00] +; +; HASWELL-LABEL: test_haddps: +; HASWELL: # BB#0: +; HASWELL-NEXT: vhaddps %ymm1, %ymm0, %ymm0 # sched: [5:2.00] +; HASWELL-NEXT: vhaddps (%rdi), %ymm0, %ymm0 # sched: [9:2.00] +; HASWELL-NEXT: retq # sched: [1:1.00] +; +; BTVER2-LABEL: test_haddps: +; BTVER2: # BB#0: +; BTVER2-NEXT: vhaddps %ymm1, %ymm0, %ymm0 # sched: [3:1.00] +; BTVER2-NEXT: vhaddps (%rdi), %ymm0, %ymm0 # sched: [8:1.00] +; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_haddps: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vhaddps %ymm1, %ymm0, %ymm0 # sched: [3:1.00] +; ZNVER1-NEXT: vhaddps (%rdi), %ymm0, %ymm0 # sched: [8:1.00] +; ZNVER1-NEXT: retq # sched: [4:1.00] + %1 = call <8 x float> @llvm.x86.avx.hadd.ps.256(<8 x float> %a0, <8 x float> %a1) + %2 = load <8 x float>, <8 x float> *%a2, align 32 + %3 = call <8 x float> @llvm.x86.avx.hadd.ps.256(<8 x float> %1, <8 x float> %2) + ret <8 x float> %3 +} +declare <8 x float> @llvm.x86.avx.hadd.ps.256(<8 x float>, <8 x float>) nounwind readnone + +define <4 x double> @test_hsubpd(<4 x double> %a0, <4 x double> %a1, <4 x double> *%a2) { +; SANDY-LABEL: test_hsubpd: +; SANDY: # BB#0: +; SANDY-NEXT: vhsubpd %ymm1, %ymm0, %ymm0 # sched: [3:1.00] +; SANDY-NEXT: vhsubpd (%rdi), %ymm0, %ymm0 # sched: [7:1.00] +; SANDY-NEXT: retq # sched: [5:1.00] +; +; HASWELL-LABEL: test_hsubpd: +; HASWELL: # BB#0: +; HASWELL-NEXT: vhsubpd %ymm1, %ymm0, %ymm0 # sched: [5:2.00] +; HASWELL-NEXT: vhsubpd (%rdi), %ymm0, %ymm0 # sched: [9:2.00] +; HASWELL-NEXT: retq # sched: [1:1.00] +; +; BTVER2-LABEL: test_hsubpd: +; BTVER2: # BB#0: +; BTVER2-NEXT: vhsubpd %ymm1, %ymm0, %ymm0 # sched: [3:1.00] +; BTVER2-NEXT: vhsubpd (%rdi), %ymm0, %ymm0 # sched: [8:1.00] +; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_hsubpd: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vhsubpd %ymm1, %ymm0, %ymm0 # sched: [3:1.00] +; ZNVER1-NEXT: vhsubpd (%rdi), %ymm0, %ymm0 # sched: [8:1.00] +; ZNVER1-NEXT: retq # sched: [4:1.00] + %1 = call <4 x double> @llvm.x86.avx.hsub.pd.256(<4 x double> %a0, <4 x double> %a1) + %2 = load <4 x double>, <4 x double> *%a2, align 32 + %3 = call <4 x double> @llvm.x86.avx.hsub.pd.256(<4 x double> %1, <4 x double> %2) + ret <4 x double> %3 +} +declare <4 x double> @llvm.x86.avx.hsub.pd.256(<4 x double>, <4 x double>) nounwind readnone + +define <8 x float> @test_hsubps(<8 x float> %a0, <8 x float> %a1, <8 x float> *%a2) { +; SANDY-LABEL: test_hsubps: +; SANDY: # BB#0: +; SANDY-NEXT: vhsubps %ymm1, %ymm0, %ymm0 # sched: [3:1.00] +; SANDY-NEXT: vhsubps (%rdi), %ymm0, %ymm0 # sched: [7:1.00] +; SANDY-NEXT: retq # sched: [5:1.00] +; +; HASWELL-LABEL: test_hsubps: +; HASWELL: # BB#0: +; HASWELL-NEXT: vhsubps %ymm1, %ymm0, %ymm0 # sched: [5:2.00] +; HASWELL-NEXT: vhsubps (%rdi), %ymm0, %ymm0 # sched: [9:2.00] +; HASWELL-NEXT: retq # sched: [1:1.00] +; +; BTVER2-LABEL: test_hsubps: +; BTVER2: # BB#0: +; BTVER2-NEXT: vhsubps %ymm1, %ymm0, %ymm0 # sched: [3:1.00] +; BTVER2-NEXT: vhsubps (%rdi), %ymm0, %ymm0 # sched: [8:1.00] +; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_hsubps: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vhsubps %ymm1, %ymm0, %ymm0 # sched: [3:1.00] +; ZNVER1-NEXT: vhsubps 
(%rdi), %ymm0, %ymm0 # sched: [8:1.00] +; ZNVER1-NEXT: retq # sched: [4:1.00] + %1 = call <8 x float> @llvm.x86.avx.hsub.ps.256(<8 x float> %a0, <8 x float> %a1) + %2 = load <8 x float>, <8 x float> *%a2, align 32 + %3 = call <8 x float> @llvm.x86.avx.hsub.ps.256(<8 x float> %1, <8 x float> %2) + ret <8 x float> %3 +} +declare <8 x float> @llvm.x86.avx.hsub.ps.256(<8 x float>, <8 x float>) nounwind readnone + +define <8 x float> @test_insertf128(<8 x float> %a0, <4 x float> %a1, <4 x float> *%a2) { +; SANDY-LABEL: test_insertf128: +; SANDY: # BB#0: +; SANDY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 # sched: [1:1.00] +; SANDY-NEXT: vinsertf128 $1, (%rdi), %ymm0, %ymm0 # sched: [5:1.00] +; SANDY-NEXT: vaddps %ymm0, %ymm1, %ymm0 # sched: [3:1.00] +; SANDY-NEXT: retq # sched: [5:1.00] +; +; HASWELL-LABEL: test_insertf128: +; HASWELL: # BB#0: +; HASWELL-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 # sched: [3:1.00] +; HASWELL-NEXT: vinsertf128 $1, (%rdi), %ymm0, %ymm0 # sched: [3:1.00] +; HASWELL-NEXT: vaddps %ymm0, %ymm1, %ymm0 # sched: [3:1.00] +; HASWELL-NEXT: retq # sched: [1:1.00] +; +; BTVER2-LABEL: test_insertf128: +; BTVER2: # BB#0: +; BTVER2-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 # sched: [1:0.50] +; BTVER2-NEXT: vinsertf128 $1, (%rdi), %ymm0, %ymm0 # sched: [6:1.00] +; BTVER2-NEXT: vaddps %ymm0, %ymm1, %ymm0 # sched: [3:1.00] +; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_insertf128: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 # sched: [1:0.50] +; ZNVER1-NEXT: vinsertf128 $1, (%rdi), %ymm0, %ymm0 # sched: [6:1.00] +; ZNVER1-NEXT: vaddps %ymm0, %ymm1, %ymm0 # sched: [3:1.00] +; ZNVER1-NEXT: retq # sched: [4:1.00] + %1 = shufflevector <4 x float> %a1, <4 x float> undef, <8 x i32> + %2 = shufflevector <8 x float> %a0, <8 x float> %1, <8 x i32> + %3 = load <4 x float>, <4 x float> *%a2, align 16 + %4 = shufflevector <4 x float> %3, <4 x float> undef, <8 x i32> + %5 = shufflevector <8 x float> %a0, <8 x float> %4, <8 x i32> + %6 = fadd <8 x float> %2, %5 + ret <8 x float> %6 +} + +define <32 x i8> @test_lddqu(i8* %a0) { +; SANDY-LABEL: test_lddqu: +; SANDY: # BB#0: +; SANDY-NEXT: vlddqu (%rdi), %ymm0 # sched: [4:0.50] +; SANDY-NEXT: retq # sched: [5:1.00] +; +; HASWELL-LABEL: test_lddqu: +; HASWELL: # BB#0: +; HASWELL-NEXT: vlddqu (%rdi), %ymm0 # sched: [4:0.50] +; HASWELL-NEXT: retq # sched: [1:1.00] +; +; BTVER2-LABEL: test_lddqu: +; BTVER2: # BB#0: +; BTVER2-NEXT: vlddqu (%rdi), %ymm0 # sched: [5:1.00] +; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_lddqu: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vlddqu (%rdi), %ymm0 # sched: [5:1.00] +; ZNVER1-NEXT: retq # sched: [4:1.00] + %1 = call <32 x i8> @llvm.x86.avx.ldu.dq.256(i8* %a0) + ret <32 x i8> %1 +} +declare <32 x i8> @llvm.x86.avx.ldu.dq.256(i8*) nounwind readonly + +define <2 x double> @test_maskmovpd(i8* %a0, <2 x i64> %a1, <2 x double> %a2) { +; SANDY-LABEL: test_maskmovpd: +; SANDY: # BB#0: +; SANDY-NEXT: vmaskmovpd (%rdi), %xmm0, %xmm2 # sched: [?:0.000000e+00] +; SANDY-NEXT: vmaskmovpd %xmm1, %xmm0, (%rdi) # sched: [?:0.000000e+00] +; SANDY-NEXT: vmovapd %xmm2, %xmm0 # sched: [1:1.00] +; SANDY-NEXT: retq # sched: [5:1.00] +; +; HASWELL-LABEL: test_maskmovpd: +; HASWELL: # BB#0: +; HASWELL-NEXT: vmaskmovpd (%rdi), %xmm0, %xmm2 # sched: [4:2.00] +; HASWELL-NEXT: vmaskmovpd %xmm1, %xmm0, (%rdi) # sched: [13:1.00] +; HASWELL-NEXT: vmovapd %xmm2, %xmm0 # sched: [1:1.00] +; HASWELL-NEXT: retq # sched: [1:1.00] +; +; BTVER2-LABEL: test_maskmovpd: +; BTVER2: # BB#0: +; BTVER2-NEXT: 
vmaskmovpd (%rdi), %xmm0, %xmm2 # sched: [?:0.000000e+00] +; BTVER2-NEXT: vmaskmovpd %xmm1, %xmm0, (%rdi) # sched: [?:0.000000e+00] +; BTVER2-NEXT: vmovapd %xmm2, %xmm0 # sched: [1:0.50] +; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_maskmovpd: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vmaskmovpd (%rdi), %xmm0, %xmm2 # sched: [?:0.000000e+00] +; ZNVER1-NEXT: vmaskmovpd %xmm1, %xmm0, (%rdi) # sched: [?:0.000000e+00] +; ZNVER1-NEXT: vmovapd %xmm2, %xmm0 # sched: [1:0.50] +; ZNVER1-NEXT: retq # sched: [4:1.00] + %1 = call <2 x double> @llvm.x86.avx.maskload.pd(i8* %a0, <2 x i64> %a1) + call void @llvm.x86.avx.maskstore.pd(i8* %a0, <2 x i64> %a1, <2 x double> %a2) + ret <2 x double> %1 +} +declare <2 x double> @llvm.x86.avx.maskload.pd(i8*, <2 x i64>) nounwind readonly +declare void @llvm.x86.avx.maskstore.pd(i8*, <2 x i64>, <2 x double>) nounwind + +define <4 x double> @test_maskmovpd_ymm(i8* %a0, <4 x i64> %a1, <4 x double> %a2) { +; SANDY-LABEL: test_maskmovpd_ymm: +; SANDY: # BB#0: +; SANDY-NEXT: vmaskmovpd (%rdi), %ymm0, %ymm2 # sched: [?:0.000000e+00] +; SANDY-NEXT: vmaskmovpd %ymm1, %ymm0, (%rdi) # sched: [?:0.000000e+00] +; SANDY-NEXT: vmovapd %ymm2, %ymm0 # sched: [1:1.00] +; SANDY-NEXT: retq # sched: [5:1.00] +; +; HASWELL-LABEL: test_maskmovpd_ymm: +; HASWELL: # BB#0: +; HASWELL-NEXT: vmaskmovpd (%rdi), %ymm0, %ymm2 # sched: [4:2.00] +; HASWELL-NEXT: vmaskmovpd %ymm1, %ymm0, (%rdi) # sched: [14:1.00] +; HASWELL-NEXT: vmovapd %ymm2, %ymm0 # sched: [1:1.00] +; HASWELL-NEXT: retq # sched: [1:1.00] +; +; BTVER2-LABEL: test_maskmovpd_ymm: +; BTVER2: # BB#0: +; BTVER2-NEXT: vmaskmovpd (%rdi), %ymm0, %ymm2 # sched: [?:0.000000e+00] +; BTVER2-NEXT: vmaskmovpd %ymm1, %ymm0, (%rdi) # sched: [?:0.000000e+00] +; BTVER2-NEXT: vmovapd %ymm2, %ymm0 # sched: [1:0.50] +; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_maskmovpd_ymm: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vmaskmovpd (%rdi), %ymm0, %ymm2 # sched: [?:0.000000e+00] +; ZNVER1-NEXT: vmaskmovpd %ymm1, %ymm0, (%rdi) # sched: [?:0.000000e+00] +; ZNVER1-NEXT: vmovapd %ymm2, %ymm0 # sched: [1:0.50] +; ZNVER1-NEXT: retq # sched: [4:1.00] + %1 = call <4 x double> @llvm.x86.avx.maskload.pd.256(i8* %a0, <4 x i64> %a1) + call void @llvm.x86.avx.maskstore.pd.256(i8* %a0, <4 x i64> %a1, <4 x double> %a2) + ret <4 x double> %1 +} +declare <4 x double> @llvm.x86.avx.maskload.pd.256(i8*, <4 x i64>) nounwind readonly +declare void @llvm.x86.avx.maskstore.pd.256(i8*, <4 x i64>, <4 x double>) nounwind + +define <4 x float> @test_maskmovps(i8* %a0, <4 x i32> %a1, <4 x float> %a2) { +; SANDY-LABEL: test_maskmovps: +; SANDY: # BB#0: +; SANDY-NEXT: vmaskmovps (%rdi), %xmm0, %xmm2 # sched: [?:0.000000e+00] +; SANDY-NEXT: vmaskmovps %xmm1, %xmm0, (%rdi) # sched: [?:0.000000e+00] +; SANDY-NEXT: vmovaps %xmm2, %xmm0 # sched: [1:1.00] +; SANDY-NEXT: retq # sched: [5:1.00] +; +; HASWELL-LABEL: test_maskmovps: +; HASWELL: # BB#0: +; HASWELL-NEXT: vmaskmovps (%rdi), %xmm0, %xmm2 # sched: [4:2.00] +; HASWELL-NEXT: vmaskmovps %xmm1, %xmm0, (%rdi) # sched: [13:1.00] +; HASWELL-NEXT: vmovaps %xmm2, %xmm0 # sched: [1:1.00] +; HASWELL-NEXT: retq # sched: [1:1.00] +; +; BTVER2-LABEL: test_maskmovps: +; BTVER2: # BB#0: +; BTVER2-NEXT: vmaskmovps (%rdi), %xmm0, %xmm2 # sched: [?:0.000000e+00] +; BTVER2-NEXT: vmaskmovps %xmm1, %xmm0, (%rdi) # sched: [?:0.000000e+00] +; BTVER2-NEXT: vmovaps %xmm2, %xmm0 # sched: [1:0.50] +; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_maskmovps: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vmaskmovps (%rdi), %xmm0, %xmm2 
# sched: [?:0.000000e+00] +; ZNVER1-NEXT: vmaskmovps %xmm1, %xmm0, (%rdi) # sched: [?:0.000000e+00] +; ZNVER1-NEXT: vmovaps %xmm2, %xmm0 # sched: [1:0.50] +; ZNVER1-NEXT: retq # sched: [4:1.00] + %1 = call <4 x float> @llvm.x86.avx.maskload.ps(i8* %a0, <4 x i32> %a1) + call void @llvm.x86.avx.maskstore.ps(i8* %a0, <4 x i32> %a1, <4 x float> %a2) + ret <4 x float> %1 +} +declare <4 x float> @llvm.x86.avx.maskload.ps(i8*, <4 x i32>) nounwind readonly +declare void @llvm.x86.avx.maskstore.ps(i8*, <4 x i32>, <4 x float>) nounwind + +define <8 x float> @test_maskmovps_ymm(i8* %a0, <8 x i32> %a1, <8 x float> %a2) { +; SANDY-LABEL: test_maskmovps_ymm: +; SANDY: # BB#0: +; SANDY-NEXT: vmaskmovps (%rdi), %ymm0, %ymm2 # sched: [?:0.000000e+00] +; SANDY-NEXT: vmaskmovps %ymm1, %ymm0, (%rdi) # sched: [?:0.000000e+00] +; SANDY-NEXT: vmovaps %ymm2, %ymm0 # sched: [1:1.00] +; SANDY-NEXT: retq # sched: [5:1.00] +; +; HASWELL-LABEL: test_maskmovps_ymm: +; HASWELL: # BB#0: +; HASWELL-NEXT: vmaskmovps (%rdi), %ymm0, %ymm2 # sched: [4:2.00] +; HASWELL-NEXT: vmaskmovps %ymm1, %ymm0, (%rdi) # sched: [14:1.00] +; HASWELL-NEXT: vmovaps %ymm2, %ymm0 # sched: [1:1.00] +; HASWELL-NEXT: retq # sched: [1:1.00] +; +; BTVER2-LABEL: test_maskmovps_ymm: +; BTVER2: # BB#0: +; BTVER2-NEXT: vmaskmovps (%rdi), %ymm0, %ymm2 # sched: [?:0.000000e+00] +; BTVER2-NEXT: vmaskmovps %ymm1, %ymm0, (%rdi) # sched: [?:0.000000e+00] +; BTVER2-NEXT: vmovaps %ymm2, %ymm0 # sched: [1:0.50] +; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_maskmovps_ymm: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vmaskmovps (%rdi), %ymm0, %ymm2 # sched: [?:0.000000e+00] +; ZNVER1-NEXT: vmaskmovps %ymm1, %ymm0, (%rdi) # sched: [?:0.000000e+00] +; ZNVER1-NEXT: vmovaps %ymm2, %ymm0 # sched: [1:0.50] +; ZNVER1-NEXT: retq # sched: [4:1.00] + %1 = call <8 x float> @llvm.x86.avx.maskload.ps.256(i8* %a0, <8 x i32> %a1) + call void @llvm.x86.avx.maskstore.ps.256(i8* %a0, <8 x i32> %a1, <8 x float> %a2) + ret <8 x float> %1 +} +declare <8 x float> @llvm.x86.avx.maskload.ps.256(i8*, <8 x i32>) nounwind readonly +declare void @llvm.x86.avx.maskstore.ps.256(i8*, <8 x i32>, <8 x float>) nounwind + +define <4 x double> @test_maxpd(<4 x double> %a0, <4 x double> %a1, <4 x double> *%a2) { +; SANDY-LABEL: test_maxpd: +; SANDY: # BB#0: +; SANDY-NEXT: vmaxpd %ymm1, %ymm0, %ymm0 # sched: [3:1.00] +; SANDY-NEXT: vmaxpd (%rdi), %ymm0, %ymm0 # sched: [7:1.00] +; SANDY-NEXT: retq # sched: [5:1.00] +; +; HASWELL-LABEL: test_maxpd: +; HASWELL: # BB#0: +; HASWELL-NEXT: vmaxpd %ymm1, %ymm0, %ymm0 # sched: [3:1.00] +; HASWELL-NEXT: vmaxpd (%rdi), %ymm0, %ymm0 # sched: [7:1.00] +; HASWELL-NEXT: retq # sched: [1:1.00] +; +; BTVER2-LABEL: test_maxpd: +; BTVER2: # BB#0: +; BTVER2-NEXT: vmaxpd %ymm1, %ymm0, %ymm0 # sched: [3:1.00] +; BTVER2-NEXT: vmaxpd (%rdi), %ymm0, %ymm0 # sched: [8:1.00] +; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_maxpd: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vmaxpd %ymm1, %ymm0, %ymm0 # sched: [3:1.00] +; ZNVER1-NEXT: vmaxpd (%rdi), %ymm0, %ymm0 # sched: [8:1.00] +; ZNVER1-NEXT: retq # sched: [4:1.00] + %1 = call <4 x double> @llvm.x86.avx.max.pd.256(<4 x double> %a0, <4 x double> %a1) + %2 = load <4 x double>, <4 x double> *%a2, align 32 + %3 = call <4 x double> @llvm.x86.avx.max.pd.256(<4 x double> %1, <4 x double> %2) + ret <4 x double> %3 +} +declare <4 x double> @llvm.x86.avx.max.pd.256(<4 x double>, <4 x double>) nounwind readnone + +define <8 x float> @test_maxps(<8 x float> %a0, <8 x float> %a1, <8 x float> *%a2) { +; SANDY-LABEL: 
test_maxps: +; SANDY: # BB#0: +; SANDY-NEXT: vmaxps %ymm1, %ymm0, %ymm0 # sched: [3:1.00] +; SANDY-NEXT: vmaxps (%rdi), %ymm0, %ymm0 # sched: [7:1.00] +; SANDY-NEXT: retq # sched: [5:1.00] +; +; HASWELL-LABEL: test_maxps: +; HASWELL: # BB#0: +; HASWELL-NEXT: vmaxps %ymm1, %ymm0, %ymm0 # sched: [3:1.00] +; HASWELL-NEXT: vmaxps (%rdi), %ymm0, %ymm0 # sched: [7:1.00] +; HASWELL-NEXT: retq # sched: [1:1.00] +; +; BTVER2-LABEL: test_maxps: +; BTVER2: # BB#0: +; BTVER2-NEXT: vmaxps %ymm1, %ymm0, %ymm0 # sched: [3:1.00] +; BTVER2-NEXT: vmaxps (%rdi), %ymm0, %ymm0 # sched: [8:1.00] +; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_maxps: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vmaxps %ymm1, %ymm0, %ymm0 # sched: [3:1.00] +; ZNVER1-NEXT: vmaxps (%rdi), %ymm0, %ymm0 # sched: [8:1.00] +; ZNVER1-NEXT: retq # sched: [4:1.00] + %1 = call <8 x float> @llvm.x86.avx.max.ps.256(<8 x float> %a0, <8 x float> %a1) + %2 = load <8 x float>, <8 x float> *%a2, align 32 + %3 = call <8 x float> @llvm.x86.avx.max.ps.256(<8 x float> %1, <8 x float> %2) + ret <8 x float> %3 +} +declare <8 x float> @llvm.x86.avx.max.ps.256(<8 x float>, <8 x float>) nounwind readnone + +define <4 x double> @test_minpd(<4 x double> %a0, <4 x double> %a1, <4 x double> *%a2) { +; SANDY-LABEL: test_minpd: +; SANDY: # BB#0: +; SANDY-NEXT: vminpd %ymm1, %ymm0, %ymm0 # sched: [3:1.00] +; SANDY-NEXT: vminpd (%rdi), %ymm0, %ymm0 # sched: [7:1.00] +; SANDY-NEXT: retq # sched: [5:1.00] +; +; HASWELL-LABEL: test_minpd: +; HASWELL: # BB#0: +; HASWELL-NEXT: vminpd %ymm1, %ymm0, %ymm0 # sched: [3:1.00] +; HASWELL-NEXT: vminpd (%rdi), %ymm0, %ymm0 # sched: [7:1.00] +; HASWELL-NEXT: retq # sched: [1:1.00] +; +; BTVER2-LABEL: test_minpd: +; BTVER2: # BB#0: +; BTVER2-NEXT: vminpd %ymm1, %ymm0, %ymm0 # sched: [3:1.00] +; BTVER2-NEXT: vminpd (%rdi), %ymm0, %ymm0 # sched: [8:1.00] +; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_minpd: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vminpd %ymm1, %ymm0, %ymm0 # sched: [3:1.00] +; ZNVER1-NEXT: vminpd (%rdi), %ymm0, %ymm0 # sched: [8:1.00] +; ZNVER1-NEXT: retq # sched: [4:1.00] + %1 = call <4 x double> @llvm.x86.avx.min.pd.256(<4 x double> %a0, <4 x double> %a1) + %2 = load <4 x double>, <4 x double> *%a2, align 32 + %3 = call <4 x double> @llvm.x86.avx.min.pd.256(<4 x double> %1, <4 x double> %2) + ret <4 x double> %3 +} +declare <4 x double> @llvm.x86.avx.min.pd.256(<4 x double>, <4 x double>) nounwind readnone + +define <8 x float> @test_minps(<8 x float> %a0, <8 x float> %a1, <8 x float> *%a2) { +; SANDY-LABEL: test_minps: +; SANDY: # BB#0: +; SANDY-NEXT: vminps %ymm1, %ymm0, %ymm0 # sched: [3:1.00] +; SANDY-NEXT: vminps (%rdi), %ymm0, %ymm0 # sched: [7:1.00] +; SANDY-NEXT: retq # sched: [5:1.00] +; +; HASWELL-LABEL: test_minps: +; HASWELL: # BB#0: +; HASWELL-NEXT: vminps %ymm1, %ymm0, %ymm0 # sched: [3:1.00] +; HASWELL-NEXT: vminps (%rdi), %ymm0, %ymm0 # sched: [7:1.00] +; HASWELL-NEXT: retq # sched: [1:1.00] +; +; BTVER2-LABEL: test_minps: +; BTVER2: # BB#0: +; BTVER2-NEXT: vminps %ymm1, %ymm0, %ymm0 # sched: [3:1.00] +; BTVER2-NEXT: vminps (%rdi), %ymm0, %ymm0 # sched: [8:1.00] +; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_minps: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vminps %ymm1, %ymm0, %ymm0 # sched: [3:1.00] +; ZNVER1-NEXT: vminps (%rdi), %ymm0, %ymm0 # sched: [8:1.00] +; ZNVER1-NEXT: retq # sched: [4:1.00] + %1 = call <8 x float> @llvm.x86.avx.min.ps.256(<8 x float> %a0, <8 x float> %a1) + %2 = load <8 x float>, <8 x float> *%a2, align 32 + %3 = call <8 x float> 
@llvm.x86.avx.min.ps.256(<8 x float> %1, <8 x float> %2) + ret <8 x float> %3 +} +declare <8 x float> @llvm.x86.avx.min.ps.256(<8 x float>, <8 x float>) nounwind readnone + +define <4 x double> @test_movapd(<4 x double> *%a0, <4 x double> *%a1) { +; SANDY-LABEL: test_movapd: +; SANDY: # BB#0: +; SANDY-NEXT: vmovapd (%rdi), %ymm0 # sched: [4:0.50] +; SANDY-NEXT: vaddpd %ymm0, %ymm0, %ymm0 # sched: [3:1.00] +; SANDY-NEXT: vmovapd %ymm0, (%rsi) # sched: [1:1.00] +; SANDY-NEXT: retq # sched: [5:1.00] +; +; HASWELL-LABEL: test_movapd: +; HASWELL: # BB#0: +; HASWELL-NEXT: vmovapd (%rdi), %ymm0 # sched: [4:0.50] +; HASWELL-NEXT: vaddpd %ymm0, %ymm0, %ymm0 # sched: [3:1.00] +; HASWELL-NEXT: vmovapd %ymm0, (%rsi) # sched: [1:1.00] +; HASWELL-NEXT: retq # sched: [1:1.00] +; +; BTVER2-LABEL: test_movapd: +; BTVER2: # BB#0: +; BTVER2-NEXT: vmovapd (%rdi), %ymm0 # sched: [5:1.00] +; BTVER2-NEXT: vaddpd %ymm0, %ymm0, %ymm0 # sched: [3:1.00] +; BTVER2-NEXT: vmovapd %ymm0, (%rsi) # sched: [1:1.00] +; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_movapd: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vmovapd (%rdi), %ymm0 # sched: [5:1.00] +; ZNVER1-NEXT: vaddpd %ymm0, %ymm0, %ymm0 # sched: [3:1.00] +; ZNVER1-NEXT: vmovapd %ymm0, (%rsi) # sched: [1:1.00] +; ZNVER1-NEXT: retq # sched: [4:1.00] + %1 = load <4 x double>, <4 x double> *%a0, align 32 + %2 = fadd <4 x double> %1, %1 + store <4 x double> %2, <4 x double> *%a1, align 32 + ret <4 x double> %2 +} + +define <8 x float> @test_movaps(<8 x float> *%a0, <8 x float> *%a1) { +; SANDY-LABEL: test_movaps: +; SANDY: # BB#0: +; SANDY-NEXT: vmovaps (%rdi), %ymm0 # sched: [4:0.50] +; SANDY-NEXT: vaddps %ymm0, %ymm0, %ymm0 # sched: [3:1.00] +; SANDY-NEXT: vmovaps %ymm0, (%rsi) # sched: [1:1.00] +; SANDY-NEXT: retq # sched: [5:1.00] +; +; HASWELL-LABEL: test_movaps: +; HASWELL: # BB#0: +; HASWELL-NEXT: vmovaps (%rdi), %ymm0 # sched: [4:0.50] +; HASWELL-NEXT: vaddps %ymm0, %ymm0, %ymm0 # sched: [3:1.00] +; HASWELL-NEXT: vmovaps %ymm0, (%rsi) # sched: [1:1.00] +; HASWELL-NEXT: retq # sched: [1:1.00] +; +; BTVER2-LABEL: test_movaps: +; BTVER2: # BB#0: +; BTVER2-NEXT: vmovaps (%rdi), %ymm0 # sched: [5:1.00] +; BTVER2-NEXT: vaddps %ymm0, %ymm0, %ymm0 # sched: [3:1.00] +; BTVER2-NEXT: vmovaps %ymm0, (%rsi) # sched: [1:1.00] +; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_movaps: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vmovaps (%rdi), %ymm0 # sched: [5:1.00] +; ZNVER1-NEXT: vaddps %ymm0, %ymm0, %ymm0 # sched: [3:1.00] +; ZNVER1-NEXT: vmovaps %ymm0, (%rsi) # sched: [1:1.00] +; ZNVER1-NEXT: retq # sched: [4:1.00] + %1 = load <8 x float>, <8 x float> *%a0, align 32 + %2 = fadd <8 x float> %1, %1 + store <8 x float> %2, <8 x float> *%a1, align 32 + ret <8 x float> %2 +} + +define <4 x double> @test_movddup(<4 x double> %a0, <4 x double> *%a1) { +; SANDY-LABEL: test_movddup: +; SANDY: # BB#0: +; SANDY-NEXT: vmovddup {{.*#+}} ymm0 = ymm0[0,0,2,2] sched: [1:1.00] +; SANDY-NEXT: vmovddup {{.*#+}} ymm1 = mem[0,0,2,2] sched: [4:0.50] +; SANDY-NEXT: vaddpd %ymm1, %ymm0, %ymm0 # sched: [3:1.00] +; SANDY-NEXT: retq # sched: [5:1.00] +; +; HASWELL-LABEL: test_movddup: +; HASWELL: # BB#0: +; HASWELL-NEXT: vmovddup {{.*#+}} ymm0 = ymm0[0,0,2,2] sched: [1:1.00] +; HASWELL-NEXT: vmovddup {{.*#+}} ymm1 = mem[0,0,2,2] sched: [4:0.50] +; HASWELL-NEXT: vaddpd %ymm1, %ymm0, %ymm0 # sched: [3:1.00] +; HASWELL-NEXT: retq # sched: [1:1.00] +; +; BTVER2-LABEL: test_movddup: +; BTVER2: # BB#0: +; BTVER2-NEXT: vmovddup {{.*#+}} ymm1 = mem[0,0,2,2] sched: [5:1.00] +; BTVER2-NEXT: 
vmovddup {{.*#+}} ymm0 = ymm0[0,0,2,2] sched: [1:0.50] +; BTVER2-NEXT: vaddpd %ymm1, %ymm0, %ymm0 # sched: [3:1.00] +; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_movddup: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vmovddup {{.*#+}} ymm1 = mem[0,0,2,2] sched: [5:1.00] +; ZNVER1-NEXT: vmovddup {{.*#+}} ymm0 = ymm0[0,0,2,2] sched: [1:0.50] +; ZNVER1-NEXT: vaddpd %ymm1, %ymm0, %ymm0 # sched: [3:1.00] +; ZNVER1-NEXT: retq # sched: [4:1.00] + %1 = shufflevector <4 x double> %a0, <4 x double> undef, <4 x i32> + %2 = load <4 x double>, <4 x double> *%a1, align 32 + %3 = shufflevector <4 x double> %2, <4 x double> undef, <4 x i32> + %4 = fadd <4 x double> %1, %3 + ret <4 x double> %4 +} + +define i32 @test_movmskpd(<4 x double> %a0) { +; SANDY-LABEL: test_movmskpd: +; SANDY: # BB#0: +; SANDY-NEXT: vmovmskpd %ymm0, %eax # sched: [1:0.33] +; SANDY-NEXT: vzeroupper # sched: [?:0.000000e+00] +; SANDY-NEXT: retq # sched: [5:1.00] +; +; HASWELL-LABEL: test_movmskpd: +; HASWELL: # BB#0: +; HASWELL-NEXT: vmovmskpd %ymm0, %eax # sched: [2:1.00] +; HASWELL-NEXT: vzeroupper # sched: [1:0.00] +; HASWELL-NEXT: retq # sched: [1:1.00] +; +; BTVER2-LABEL: test_movmskpd: +; BTVER2: # BB#0: +; BTVER2-NEXT: vmovmskpd %ymm0, %eax # sched: [1:0.50] +; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_movmskpd: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vmovmskpd %ymm0, %eax # sched: [1:0.50] +; ZNVER1-NEXT: vzeroupper # sched: [?:0.000000e+00] +; ZNVER1-NEXT: retq # sched: [4:1.00] + %1 = call i32 @llvm.x86.avx.movmsk.pd.256(<4 x double> %a0) + ret i32 %1 +} +declare i32 @llvm.x86.avx.movmsk.pd.256(<4 x double>) nounwind readnone + +define i32 @test_movmskps(<8 x float> %a0) { +; SANDY-LABEL: test_movmskps: +; SANDY: # BB#0: +; SANDY-NEXT: vmovmskps %ymm0, %eax # sched: [1:0.33] +; SANDY-NEXT: vzeroupper # sched: [?:0.000000e+00] +; SANDY-NEXT: retq # sched: [5:1.00] +; +; HASWELL-LABEL: test_movmskps: +; HASWELL: # BB#0: +; HASWELL-NEXT: vmovmskps %ymm0, %eax # sched: [2:1.00] +; HASWELL-NEXT: vzeroupper # sched: [1:0.00] +; HASWELL-NEXT: retq # sched: [1:1.00] +; +; BTVER2-LABEL: test_movmskps: +; BTVER2: # BB#0: +; BTVER2-NEXT: vmovmskps %ymm0, %eax # sched: [1:0.50] +; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_movmskps: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vmovmskps %ymm0, %eax # sched: [1:0.50] +; ZNVER1-NEXT: vzeroupper # sched: [?:0.000000e+00] +; ZNVER1-NEXT: retq # sched: [4:1.00] + %1 = call i32 @llvm.x86.avx.movmsk.ps.256(<8 x float> %a0) + ret i32 %1 +} +declare i32 @llvm.x86.avx.movmsk.ps.256(<8 x float>) nounwind readnone + +define <4 x double> @test_movntpd(<4 x double> %a0, <4 x double> *%a1) { +; SANDY-LABEL: test_movntpd: +; SANDY: # BB#0: +; SANDY-NEXT: vaddpd %ymm0, %ymm0, %ymm0 # sched: [3:1.00] +; SANDY-NEXT: vmovntpd %ymm0, (%rdi) # sched: [1:1.00] +; SANDY-NEXT: retq # sched: [5:1.00] +; +; HASWELL-LABEL: test_movntpd: +; HASWELL: # BB#0: +; HASWELL-NEXT: vaddpd %ymm0, %ymm0, %ymm0 # sched: [3:1.00] +; HASWELL-NEXT: vmovntpd %ymm0, (%rdi) # sched: [1:1.00] +; HASWELL-NEXT: retq # sched: [1:1.00] +; +; BTVER2-LABEL: test_movntpd: +; BTVER2: # BB#0: +; BTVER2-NEXT: vaddpd %ymm0, %ymm0, %ymm0 # sched: [3:1.00] +; BTVER2-NEXT: vmovntpd %ymm0, (%rdi) # sched: [1:1.00] +; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_movntpd: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vaddpd %ymm0, %ymm0, %ymm0 # sched: [3:1.00] +; ZNVER1-NEXT: vmovntpd %ymm0, (%rdi) # sched: [1:1.00] +; ZNVER1-NEXT: retq # sched: [4:1.00] + %1 = fadd <4 x double> %a0, %a0 + store <4 x double> %1, 
<4 x double> *%a1, align 32, !nontemporal !0 + ret <4 x double> %1 +} + +define <8 x float> @test_movntps(<8 x float> %a0, <8 x float> *%a1) { +; SANDY-LABEL: test_movntps: +; SANDY: # BB#0: +; SANDY-NEXT: vaddps %ymm0, %ymm0, %ymm0 # sched: [3:1.00] +; SANDY-NEXT: vmovntps %ymm0, (%rdi) # sched: [1:1.00] +; SANDY-NEXT: retq # sched: [5:1.00] +; +; HASWELL-LABEL: test_movntps: +; HASWELL: # BB#0: +; HASWELL-NEXT: vaddps %ymm0, %ymm0, %ymm0 # sched: [3:1.00] +; HASWELL-NEXT: vmovntps %ymm0, (%rdi) # sched: [1:1.00] +; HASWELL-NEXT: retq # sched: [1:1.00] +; +; BTVER2-LABEL: test_movntps: +; BTVER2: # BB#0: +; BTVER2-NEXT: vaddps %ymm0, %ymm0, %ymm0 # sched: [3:1.00] +; BTVER2-NEXT: vmovntps %ymm0, (%rdi) # sched: [1:1.00] +; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_movntps: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vaddps %ymm0, %ymm0, %ymm0 # sched: [3:1.00] +; ZNVER1-NEXT: vmovntps %ymm0, (%rdi) # sched: [1:1.00] +; ZNVER1-NEXT: retq # sched: [4:1.00] + %1 = fadd <8 x float> %a0, %a0 + store <8 x float> %1, <8 x float> *%a1, align 32, !nontemporal !0 + ret <8 x float> %1 +} + +define <8 x float> @test_movshdup(<8 x float> %a0, <8 x float> *%a1) { +; SANDY-LABEL: test_movshdup: +; SANDY: # BB#0: +; SANDY-NEXT: vmovshdup {{.*#+}} ymm0 = ymm0[1,1,3,3,5,5,7,7] sched: [1:1.00] +; SANDY-NEXT: vmovshdup {{.*#+}} ymm1 = mem[1,1,3,3,5,5,7,7] sched: [4:0.50] +; SANDY-NEXT: vaddps %ymm1, %ymm0, %ymm0 # sched: [3:1.00] +; SANDY-NEXT: retq # sched: [5:1.00] +; +; HASWELL-LABEL: test_movshdup: +; HASWELL: # BB#0: +; HASWELL-NEXT: vmovshdup {{.*#+}} ymm0 = ymm0[1,1,3,3,5,5,7,7] sched: [1:1.00] +; HASWELL-NEXT: vmovshdup {{.*#+}} ymm1 = mem[1,1,3,3,5,5,7,7] sched: [4:0.50] +; HASWELL-NEXT: vaddps %ymm1, %ymm0, %ymm0 # sched: [3:1.00] +; HASWELL-NEXT: retq # sched: [1:1.00] +; +; BTVER2-LABEL: test_movshdup: +; BTVER2: # BB#0: +; BTVER2-NEXT: vmovshdup {{.*#+}} ymm1 = mem[1,1,3,3,5,5,7,7] sched: [5:1.00] +; BTVER2-NEXT: vmovshdup {{.*#+}} ymm0 = ymm0[1,1,3,3,5,5,7,7] sched: [1:0.50] +; BTVER2-NEXT: vaddps %ymm1, %ymm0, %ymm0 # sched: [3:1.00] +; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_movshdup: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vmovshdup {{.*#+}} ymm1 = mem[1,1,3,3,5,5,7,7] sched: [5:1.00] +; ZNVER1-NEXT: vmovshdup {{.*#+}} ymm0 = ymm0[1,1,3,3,5,5,7,7] sched: [1:0.50] +; ZNVER1-NEXT: vaddps %ymm1, %ymm0, %ymm0 # sched: [3:1.00] +; ZNVER1-NEXT: retq # sched: [4:1.00] + %1 = shufflevector <8 x float> %a0, <8 x float> undef, <8 x i32> + %2 = load <8 x float>, <8 x float> *%a1, align 32 + %3 = shufflevector <8 x float> %2, <8 x float> undef, <8 x i32> + %4 = fadd <8 x float> %1, %3 + ret <8 x float> %4 +} + +define <8 x float> @test_movsldup(<8 x float> %a0, <8 x float> *%a1) { +; SANDY-LABEL: test_movsldup: +; SANDY: # BB#0: +; SANDY-NEXT: vmovsldup {{.*#+}} ymm0 = ymm0[0,0,2,2,4,4,6,6] sched: [1:1.00] +; SANDY-NEXT: vmovsldup {{.*#+}} ymm1 = mem[0,0,2,2,4,4,6,6] sched: [4:0.50] +; SANDY-NEXT: vaddps %ymm1, %ymm0, %ymm0 # sched: [3:1.00] +; SANDY-NEXT: retq # sched: [5:1.00] +; +; HASWELL-LABEL: test_movsldup: +; HASWELL: # BB#0: +; HASWELL-NEXT: vmovsldup {{.*#+}} ymm0 = ymm0[0,0,2,2,4,4,6,6] sched: [1:1.00] +; HASWELL-NEXT: vmovsldup {{.*#+}} ymm1 = mem[0,0,2,2,4,4,6,6] sched: [4:0.50] +; HASWELL-NEXT: vaddps %ymm1, %ymm0, %ymm0 # sched: [3:1.00] +; HASWELL-NEXT: retq # sched: [1:1.00] +; +; BTVER2-LABEL: test_movsldup: +; BTVER2: # BB#0: +; BTVER2-NEXT: vmovsldup {{.*#+}} ymm1 = mem[0,0,2,2,4,4,6,6] sched: [5:1.00] +; BTVER2-NEXT: vmovsldup {{.*#+}} ymm0 = 
ymm0[0,0,2,2,4,4,6,6] sched: [1:0.50] +; BTVER2-NEXT: vaddps %ymm1, %ymm0, %ymm0 # sched: [3:1.00] +; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_movsldup: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vmovsldup {{.*#+}} ymm1 = mem[0,0,2,2,4,4,6,6] sched: [5:1.00] +; ZNVER1-NEXT: vmovsldup {{.*#+}} ymm0 = ymm0[0,0,2,2,4,4,6,6] sched: [1:0.50] +; ZNVER1-NEXT: vaddps %ymm1, %ymm0, %ymm0 # sched: [3:1.00] +; ZNVER1-NEXT: retq # sched: [4:1.00] + %1 = shufflevector <8 x float> %a0, <8 x float> undef, <8 x i32> + %2 = load <8 x float>, <8 x float> *%a1, align 32 + %3 = shufflevector <8 x float> %2, <8 x float> undef, <8 x i32> + %4 = fadd <8 x float> %1, %3 + ret <8 x float> %4 +} + +define <4 x double> @test_movupd(<4 x double> *%a0, <4 x double> *%a1) { +; SANDY-LABEL: test_movupd: +; SANDY: # BB#0: +; SANDY-NEXT: vmovups (%rdi), %xmm0 # sched: [4:0.50] +; SANDY-NEXT: vinsertf128 $1, 16(%rdi), %ymm0, %ymm0 # sched: [5:1.00] +; SANDY-NEXT: vaddpd %ymm0, %ymm0, %ymm0 # sched: [3:1.00] +; SANDY-NEXT: vextractf128 $1, %ymm0, 16(%rsi) # sched: [1:1.00] +; SANDY-NEXT: vmovupd %xmm0, (%rsi) # sched: [1:1.00] +; SANDY-NEXT: retq # sched: [5:1.00] +; +; HASWELL-LABEL: test_movupd: +; HASWELL: # BB#0: +; HASWELL-NEXT: vmovupd (%rdi), %ymm0 # sched: [4:0.50] +; HASWELL-NEXT: vaddpd %ymm0, %ymm0, %ymm0 # sched: [3:1.00] +; HASWELL-NEXT: vmovupd %ymm0, (%rsi) # sched: [1:1.00] +; HASWELL-NEXT: retq # sched: [1:1.00] +; +; BTVER2-LABEL: test_movupd: +; BTVER2: # BB#0: +; BTVER2-NEXT: vmovupd (%rdi), %ymm0 # sched: [5:1.00] +; BTVER2-NEXT: vaddpd %ymm0, %ymm0, %ymm0 # sched: [3:1.00] +; BTVER2-NEXT: vmovupd %ymm0, (%rsi) # sched: [1:1.00] +; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_movupd: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vmovupd (%rdi), %ymm0 # sched: [5:1.00] +; ZNVER1-NEXT: vaddpd %ymm0, %ymm0, %ymm0 # sched: [3:1.00] +; ZNVER1-NEXT: vmovupd %ymm0, (%rsi) # sched: [1:1.00] +; ZNVER1-NEXT: retq # sched: [4:1.00] + %1 = load <4 x double>, <4 x double> *%a0, align 1 + %2 = fadd <4 x double> %1, %1 + store <4 x double> %2, <4 x double> *%a1, align 1 + ret <4 x double> %2 +} + +define <8 x float> @test_movups(<8 x float> *%a0, <8 x float> *%a1) { +; SANDY-LABEL: test_movups: +; SANDY: # BB#0: +; SANDY-NEXT: vmovups (%rdi), %xmm0 # sched: [4:0.50] +; SANDY-NEXT: vinsertf128 $1, 16(%rdi), %ymm0, %ymm0 # sched: [5:1.00] +; SANDY-NEXT: vaddps %ymm0, %ymm0, %ymm0 # sched: [3:1.00] +; SANDY-NEXT: vextractf128 $1, %ymm0, 16(%rsi) # sched: [1:1.00] +; SANDY-NEXT: vmovups %xmm0, (%rsi) # sched: [1:1.00] +; SANDY-NEXT: retq # sched: [5:1.00] +; +; HASWELL-LABEL: test_movups: +; HASWELL: # BB#0: +; HASWELL-NEXT: vmovups (%rdi), %ymm0 # sched: [4:0.50] +; HASWELL-NEXT: vaddps %ymm0, %ymm0, %ymm0 # sched: [3:1.00] +; HASWELL-NEXT: vmovups %ymm0, (%rsi) # sched: [1:1.00] +; HASWELL-NEXT: retq # sched: [1:1.00] +; +; BTVER2-LABEL: test_movups: +; BTVER2: # BB#0: +; BTVER2-NEXT: vmovups (%rdi), %ymm0 # sched: [5:1.00] +; BTVER2-NEXT: vaddps %ymm0, %ymm0, %ymm0 # sched: [3:1.00] +; BTVER2-NEXT: vmovups %ymm0, (%rsi) # sched: [1:1.00] +; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_movups: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vmovups (%rdi), %ymm0 # sched: [5:1.00] +; ZNVER1-NEXT: vaddps %ymm0, %ymm0, %ymm0 # sched: [3:1.00] +; ZNVER1-NEXT: vmovups %ymm0, (%rsi) # sched: [1:1.00] +; ZNVER1-NEXT: retq # sched: [4:1.00] + %1 = load <8 x float>, <8 x float> *%a0, align 1 + %2 = fadd <8 x float> %1, %1 + store <8 x float> %2, <8 x float> *%a1, align 1 + ret <8 x float> %2 +} + +define <4 x 
double> @test_mulpd(<4 x double> %a0, <4 x double> %a1, <4 x double> *%a2) { +; SANDY-LABEL: test_mulpd: +; SANDY: # BB#0: +; SANDY-NEXT: vmulpd %ymm1, %ymm0, %ymm0 # sched: [5:1.00] +; SANDY-NEXT: vmulpd (%rdi), %ymm0, %ymm0 # sched: [9:1.00] +; SANDY-NEXT: retq # sched: [5:1.00] +; +; HASWELL-LABEL: test_mulpd: +; HASWELL: # BB#0: +; HASWELL-NEXT: vmulpd %ymm1, %ymm0, %ymm0 # sched: [5:1.00] +; HASWELL-NEXT: vmulpd (%rdi), %ymm0, %ymm0 # sched: [9:1.00] +; HASWELL-NEXT: retq # sched: [1:1.00] +; +; BTVER2-LABEL: test_mulpd: +; BTVER2: # BB#0: +; BTVER2-NEXT: vmulpd %ymm1, %ymm0, %ymm0 # sched: [2:1.00] +; BTVER2-NEXT: vmulpd (%rdi), %ymm0, %ymm0 # sched: [7:1.00] +; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_mulpd: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vmulpd %ymm1, %ymm0, %ymm0 # sched: [2:1.00] +; ZNVER1-NEXT: vmulpd (%rdi), %ymm0, %ymm0 # sched: [7:1.00] +; ZNVER1-NEXT: retq # sched: [4:1.00] + %1 = fmul <4 x double> %a0, %a1 + %2 = load <4 x double>, <4 x double> *%a2, align 32 + %3 = fmul <4 x double> %1, %2 + ret <4 x double> %3 +} + +define <8 x float> @test_mulps(<8 x float> %a0, <8 x float> %a1, <8 x float> *%a2) { +; SANDY-LABEL: test_mulps: +; SANDY: # BB#0: +; SANDY-NEXT: vmulps %ymm1, %ymm0, %ymm0 # sched: [5:1.00] +; SANDY-NEXT: vmulps (%rdi), %ymm0, %ymm0 # sched: [9:1.00] +; SANDY-NEXT: retq # sched: [5:1.00] +; +; HASWELL-LABEL: test_mulps: +; HASWELL: # BB#0: +; HASWELL-NEXT: vmulps %ymm1, %ymm0, %ymm0 # sched: [5:1.00] +; HASWELL-NEXT: vmulps (%rdi), %ymm0, %ymm0 # sched: [9:1.00] +; HASWELL-NEXT: retq # sched: [1:1.00] +; +; BTVER2-LABEL: test_mulps: +; BTVER2: # BB#0: +; BTVER2-NEXT: vmulps %ymm1, %ymm0, %ymm0 # sched: [2:1.00] +; BTVER2-NEXT: vmulps (%rdi), %ymm0, %ymm0 # sched: [7:1.00] +; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_mulps: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vmulps %ymm1, %ymm0, %ymm0 # sched: [2:1.00] +; ZNVER1-NEXT: vmulps (%rdi), %ymm0, %ymm0 # sched: [7:1.00] +; ZNVER1-NEXT: retq # sched: [4:1.00] + %1 = fmul <8 x float> %a0, %a1 + %2 = load <8 x float>, <8 x float> *%a2, align 32 + %3 = fmul <8 x float> %1, %2 + ret <8 x float> %3 +} + +define <4 x double> @orpd(<4 x double> %a0, <4 x double> %a1, <4 x double> *%a2) { +; SANDY-LABEL: orpd: +; SANDY: # BB#0: +; SANDY-NEXT: vorpd %ymm1, %ymm0, %ymm0 # sched: [1:0.33] +; SANDY-NEXT: vorpd (%rdi), %ymm0, %ymm0 # sched: [5:0.50] +; SANDY-NEXT: vaddpd %ymm0, %ymm1, %ymm0 # sched: [3:1.00] +; SANDY-NEXT: retq # sched: [5:1.00] +; +; HASWELL-LABEL: orpd: +; HASWELL: # BB#0: +; HASWELL-NEXT: vorpd %ymm1, %ymm0, %ymm0 # sched: [1:1.00] +; HASWELL-NEXT: vorpd (%rdi), %ymm0, %ymm0 # sched: [5:1.00] +; HASWELL-NEXT: vaddpd %ymm0, %ymm1, %ymm0 # sched: [3:1.00] +; HASWELL-NEXT: retq # sched: [1:1.00] +; +; BTVER2-LABEL: orpd: +; BTVER2: # BB#0: +; BTVER2-NEXT: vorpd %ymm1, %ymm0, %ymm0 # sched: [1:0.50] +; BTVER2-NEXT: vorpd (%rdi), %ymm0, %ymm0 # sched: [6:1.00] +; BTVER2-NEXT: vaddpd %ymm0, %ymm1, %ymm0 # sched: [3:1.00] +; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: orpd: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vorpd %ymm1, %ymm0, %ymm0 # sched: [1:0.50] +; ZNVER1-NEXT: vorpd (%rdi), %ymm0, %ymm0 # sched: [6:1.00] +; ZNVER1-NEXT: vaddpd %ymm0, %ymm1, %ymm0 # sched: [3:1.00] +; ZNVER1-NEXT: retq # sched: [4:1.00] + %1 = bitcast <4 x double> %a0 to <4 x i64> + %2 = bitcast <4 x double> %a1 to <4 x i64> + %3 = or <4 x i64> %1, %2 + %4 = load <4 x double>, <4 x double> *%a2, align 32 + %5 = bitcast <4 x double> %4 to <4 x i64> + %6 = or <4 x i64> %3, %5 + %7 = bitcast <4 
x i64> %6 to <4 x double> + %8 = fadd <4 x double> %a1, %7 + ret <4 x double> %8 +} + +define <8 x float> @test_orps(<8 x float> %a0, <8 x float> %a1, <8 x float> *%a2) { +; SANDY-LABEL: test_orps: +; SANDY: # BB#0: +; SANDY-NEXT: vorps %ymm1, %ymm0, %ymm0 # sched: [1:0.33] +; SANDY-NEXT: vorps (%rdi), %ymm0, %ymm0 # sched: [5:0.50] +; SANDY-NEXT: vaddps %ymm0, %ymm1, %ymm0 # sched: [3:1.00] +; SANDY-NEXT: retq # sched: [5:1.00] +; +; HASWELL-LABEL: test_orps: +; HASWELL: # BB#0: +; HASWELL-NEXT: vorps %ymm1, %ymm0, %ymm0 # sched: [1:1.00] +; HASWELL-NEXT: vorps (%rdi), %ymm0, %ymm0 # sched: [5:1.00] +; HASWELL-NEXT: vaddps %ymm0, %ymm1, %ymm0 # sched: [3:1.00] +; HASWELL-NEXT: retq # sched: [1:1.00] +; +; BTVER2-LABEL: test_orps: +; BTVER2: # BB#0: +; BTVER2-NEXT: vorps %ymm1, %ymm0, %ymm0 # sched: [1:0.50] +; BTVER2-NEXT: vorps (%rdi), %ymm0, %ymm0 # sched: [6:1.00] +; BTVER2-NEXT: vaddps %ymm0, %ymm1, %ymm0 # sched: [3:1.00] +; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_orps: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vorps %ymm1, %ymm0, %ymm0 # sched: [1:0.50] +; ZNVER1-NEXT: vorps (%rdi), %ymm0, %ymm0 # sched: [6:1.00] +; ZNVER1-NEXT: vaddps %ymm0, %ymm1, %ymm0 # sched: [3:1.00] +; ZNVER1-NEXT: retq # sched: [4:1.00] + %1 = bitcast <8 x float> %a0 to <4 x i64> + %2 = bitcast <8 x float> %a1 to <4 x i64> + %3 = or <4 x i64> %1, %2 + %4 = load <8 x float>, <8 x float> *%a2, align 32 + %5 = bitcast <8 x float> %4 to <4 x i64> + %6 = or <4 x i64> %3, %5 + %7 = bitcast <4 x i64> %6 to <8 x float> + %8 = fadd <8 x float> %a1, %7 + ret <8 x float> %8 +} + +define <2 x double> @test_permilpd(<2 x double> %a0, <2 x double> *%a1) { +; SANDY-LABEL: test_permilpd: +; SANDY: # BB#0: +; SANDY-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] sched: [1:1.00] +; SANDY-NEXT: vpermilpd {{.*#+}} xmm1 = mem[1,0] sched: [5:1.00] +; SANDY-NEXT: vaddpd %xmm1, %xmm0, %xmm0 # sched: [3:1.00] +; SANDY-NEXT: retq # sched: [5:1.00] +; +; HASWELL-LABEL: test_permilpd: +; HASWELL: # BB#0: +; HASWELL-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] sched: [1:1.00] +; HASWELL-NEXT: vpermilpd {{.*#+}} xmm1 = mem[1,0] sched: [5:1.00] +; HASWELL-NEXT: vaddpd %xmm1, %xmm0, %xmm0 # sched: [3:1.00] +; HASWELL-NEXT: retq # sched: [1:1.00] +; +; BTVER2-LABEL: test_permilpd: +; BTVER2: # BB#0: +; BTVER2-NEXT: vpermilpd {{.*#+}} xmm1 = mem[1,0] sched: [6:1.00] +; BTVER2-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] sched: [1:0.50] +; BTVER2-NEXT: vaddpd %xmm1, %xmm0, %xmm0 # sched: [3:1.00] +; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_permilpd: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vpermilpd {{.*#+}} xmm1 = mem[1,0] sched: [6:1.00] +; ZNVER1-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] sched: [1:0.50] +; ZNVER1-NEXT: vaddpd %xmm1, %xmm0, %xmm0 # sched: [3:1.00] +; ZNVER1-NEXT: retq # sched: [4:1.00] + %1 = shufflevector <2 x double> %a0, <2 x double> undef, <2 x i32> + %2 = load <2 x double>, <2 x double> *%a1, align 16 + %3 = shufflevector <2 x double> %2, <2 x double> undef, <2 x i32> + %4 = fadd <2 x double> %1, %3 + ret <2 x double> %4 +} + +define <4 x double> @test_permilpd_ymm(<4 x double> %a0, <4 x double> *%a1) { +; SANDY-LABEL: test_permilpd_ymm: +; SANDY: # BB#0: +; SANDY-NEXT: vpermilpd {{.*#+}} ymm0 = ymm0[1,0,2,3] sched: [1:1.00] +; SANDY-NEXT: vpermilpd {{.*#+}} ymm1 = mem[1,0,2,3] sched: [5:1.00] +; SANDY-NEXT: vaddpd %ymm1, %ymm0, %ymm0 # sched: [3:1.00] +; SANDY-NEXT: retq # sched: [5:1.00] +; +; HASWELL-LABEL: test_permilpd_ymm: +; HASWELL: # BB#0: +; HASWELL-NEXT: vpermilpd {{.*#+}} ymm0 = 
ymm0[1,0,2,3] sched: [1:1.00] +; HASWELL-NEXT: vpermilpd {{.*#+}} ymm1 = mem[1,0,2,3] sched: [5:1.00] +; HASWELL-NEXT: vaddpd %ymm1, %ymm0, %ymm0 # sched: [3:1.00] +; HASWELL-NEXT: retq # sched: [1:1.00] +; +; BTVER2-LABEL: test_permilpd_ymm: +; BTVER2: # BB#0: +; BTVER2-NEXT: vpermilpd {{.*#+}} ymm1 = mem[1,0,2,3] sched: [6:1.00] +; BTVER2-NEXT: vpermilpd {{.*#+}} ymm0 = ymm0[1,0,2,3] sched: [1:0.50] +; BTVER2-NEXT: vaddpd %ymm1, %ymm0, %ymm0 # sched: [3:1.00] +; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_permilpd_ymm: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vpermilpd {{.*#+}} ymm1 = mem[1,0,2,3] sched: [6:1.00] +; ZNVER1-NEXT: vpermilpd {{.*#+}} ymm0 = ymm0[1,0,2,3] sched: [1:0.50] +; ZNVER1-NEXT: vaddpd %ymm1, %ymm0, %ymm0 # sched: [3:1.00] +; ZNVER1-NEXT: retq # sched: [4:1.00] + %1 = shufflevector <4 x double> %a0, <4 x double> undef, <4 x i32> + %2 = load <4 x double>, <4 x double> *%a1, align 32 + %3 = shufflevector <4 x double> %2, <4 x double> undef, <4 x i32> + %4 = fadd <4 x double> %1, %3 + ret <4 x double> %4 +} + +define <4 x float> @test_permilps(<4 x float> %a0, <4 x float> *%a1) { +; SANDY-LABEL: test_permilps: +; SANDY: # BB#0: +; SANDY-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,2,1,0] sched: [1:1.00] +; SANDY-NEXT: vpermilps {{.*#+}} xmm1 = mem[3,2,1,0] sched: [5:1.00] +; SANDY-NEXT: vaddps %xmm1, %xmm0, %xmm0 # sched: [3:1.00] +; SANDY-NEXT: retq # sched: [5:1.00] +; +; HASWELL-LABEL: test_permilps: +; HASWELL: # BB#0: +; HASWELL-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,2,1,0] sched: [1:1.00] +; HASWELL-NEXT: vpermilps {{.*#+}} xmm1 = mem[3,2,1,0] sched: [5:1.00] +; HASWELL-NEXT: vaddps %xmm1, %xmm0, %xmm0 # sched: [3:1.00] +; HASWELL-NEXT: retq # sched: [1:1.00] +; +; BTVER2-LABEL: test_permilps: +; BTVER2: # BB#0: +; BTVER2-NEXT: vpermilps {{.*#+}} xmm1 = mem[3,2,1,0] sched: [6:1.00] +; BTVER2-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,2,1,0] sched: [1:0.50] +; BTVER2-NEXT: vaddps %xmm1, %xmm0, %xmm0 # sched: [3:1.00] +; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_permilps: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vpermilps {{.*#+}} xmm1 = mem[3,2,1,0] sched: [6:1.00] +; ZNVER1-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,2,1,0] sched: [1:0.50] +; ZNVER1-NEXT: vaddps %xmm1, %xmm0, %xmm0 # sched: [3:1.00] +; ZNVER1-NEXT: retq # sched: [4:1.00] + %1 = shufflevector <4 x float> %a0, <4 x float> undef, <4 x i32> + %2 = load <4 x float>, <4 x float> *%a1, align 16 + %3 = shufflevector <4 x float> %2, <4 x float> undef, <4 x i32> + %4 = fadd <4 x float> %1, %3 + ret <4 x float> %4 +} + +define <8 x float> @test_permilps_ymm(<8 x float> %a0, <8 x float> *%a1) { +; SANDY-LABEL: test_permilps_ymm: +; SANDY: # BB#0: +; SANDY-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4] sched: [1:1.00] +; SANDY-NEXT: vpermilps {{.*#+}} ymm1 = mem[3,2,1,0,7,6,5,4] sched: [5:1.00] +; SANDY-NEXT: vaddps %ymm1, %ymm0, %ymm0 # sched: [3:1.00] +; SANDY-NEXT: retq # sched: [5:1.00] +; +; HASWELL-LABEL: test_permilps_ymm: +; HASWELL: # BB#0: +; HASWELL-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4] sched: [1:1.00] +; HASWELL-NEXT: vpermilps {{.*#+}} ymm1 = mem[3,2,1,0,7,6,5,4] sched: [5:1.00] +; HASWELL-NEXT: vaddps %ymm1, %ymm0, %ymm0 # sched: [3:1.00] +; HASWELL-NEXT: retq # sched: [1:1.00] +; +; BTVER2-LABEL: test_permilps_ymm: +; BTVER2: # BB#0: +; BTVER2-NEXT: vpermilps {{.*#+}} ymm1 = mem[3,2,1,0,7,6,5,4] sched: [6:1.00] +; BTVER2-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4] sched: [1:0.50] +; BTVER2-NEXT: vaddps %ymm1, %ymm0, %ymm0 # sched: [3:1.00] +; 
BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_permilps_ymm: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vpermilps {{.*#+}} ymm1 = mem[3,2,1,0,7,6,5,4] sched: [6:1.00] +; ZNVER1-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4] sched: [1:0.50] +; ZNVER1-NEXT: vaddps %ymm1, %ymm0, %ymm0 # sched: [3:1.00] +; ZNVER1-NEXT: retq # sched: [4:1.00] + %1 = shufflevector <8 x float> %a0, <8 x float> undef, <8 x i32> + %2 = load <8 x float>, <8 x float> *%a1, align 32 + %3 = shufflevector <8 x float> %2, <8 x float> undef, <8 x i32> + %4 = fadd <8 x float> %1, %3 + ret <8 x float> %4 +} + +define <2 x double> @test_permilvarpd(<2 x double> %a0, <2 x i64> %a1, <2 x i64> *%a2) { +; SANDY-LABEL: test_permilvarpd: +; SANDY: # BB#0: +; SANDY-NEXT: vpermilpd %xmm1, %xmm0, %xmm0 # sched: [1:1.00] +; SANDY-NEXT: vpermilpd (%rdi), %xmm0, %xmm0 # sched: [5:1.00] +; SANDY-NEXT: retq # sched: [5:1.00] +; +; HASWELL-LABEL: test_permilvarpd: +; HASWELL: # BB#0: +; HASWELL-NEXT: vpermilpd %xmm1, %xmm0, %xmm0 # sched: [1:1.00] +; HASWELL-NEXT: vpermilpd (%rdi), %xmm0, %xmm0 # sched: [5:1.00] +; HASWELL-NEXT: retq # sched: [1:1.00] +; +; BTVER2-LABEL: test_permilvarpd: +; BTVER2: # BB#0: +; BTVER2-NEXT: vpermilpd %xmm1, %xmm0, %xmm0 # sched: [1:0.50] +; BTVER2-NEXT: vpermilpd (%rdi), %xmm0, %xmm0 # sched: [6:1.00] +; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_permilvarpd: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vpermilpd %xmm1, %xmm0, %xmm0 # sched: [1:0.50] +; ZNVER1-NEXT: vpermilpd (%rdi), %xmm0, %xmm0 # sched: [6:1.00] +; ZNVER1-NEXT: retq # sched: [4:1.00] + %1 = call <2 x double> @llvm.x86.avx.vpermilvar.pd(<2 x double> %a0, <2 x i64> %a1) + %2 = load <2 x i64>, <2 x i64> *%a2, align 16 + %3 = call <2 x double> @llvm.x86.avx.vpermilvar.pd(<2 x double> %1, <2 x i64> %2) + ret <2 x double> %3 +} +declare <2 x double> @llvm.x86.avx.vpermilvar.pd(<2 x double>, <2 x i64>) nounwind readnone + +define <4 x double> @test_permilvarpd_ymm(<4 x double> %a0, <4 x i64> %a1, <4 x i64> *%a2) { +; SANDY-LABEL: test_permilvarpd_ymm: +; SANDY: # BB#0: +; SANDY-NEXT: vpermilpd %ymm1, %ymm0, %ymm0 # sched: [1:1.00] +; SANDY-NEXT: vpermilpd (%rdi), %ymm0, %ymm0 # sched: [5:1.00] +; SANDY-NEXT: retq # sched: [5:1.00] +; +; HASWELL-LABEL: test_permilvarpd_ymm: +; HASWELL: # BB#0: +; HASWELL-NEXT: vpermilpd %ymm1, %ymm0, %ymm0 # sched: [1:1.00] +; HASWELL-NEXT: vpermilpd (%rdi), %ymm0, %ymm0 # sched: [5:1.00] +; HASWELL-NEXT: retq # sched: [1:1.00] +; +; BTVER2-LABEL: test_permilvarpd_ymm: +; BTVER2: # BB#0: +; BTVER2-NEXT: vpermilpd %ymm1, %ymm0, %ymm0 # sched: [1:0.50] +; BTVER2-NEXT: vpermilpd (%rdi), %ymm0, %ymm0 # sched: [6:1.00] +; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_permilvarpd_ymm: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vpermilpd %ymm1, %ymm0, %ymm0 # sched: [1:0.50] +; ZNVER1-NEXT: vpermilpd (%rdi), %ymm0, %ymm0 # sched: [6:1.00] +; ZNVER1-NEXT: retq # sched: [4:1.00] + %1 = call <4 x double> @llvm.x86.avx.vpermilvar.pd.256(<4 x double> %a0, <4 x i64> %a1) + %2 = load <4 x i64>, <4 x i64> *%a2, align 32 + %3 = call <4 x double> @llvm.x86.avx.vpermilvar.pd.256(<4 x double> %1, <4 x i64> %2) + ret <4 x double> %3 +} +declare <4 x double> @llvm.x86.avx.vpermilvar.pd.256(<4 x double>, <4 x i64>) nounwind readnone + +define <4 x float> @test_permilvarps(<4 x float> %a0, <4 x i32> %a1, <4 x i32> *%a2) { +; SANDY-LABEL: test_permilvarps: +; SANDY: # BB#0: +; SANDY-NEXT: vpermilps %xmm1, %xmm0, %xmm0 # sched: [1:1.00] +; SANDY-NEXT: vpermilps (%rdi), %xmm0, %xmm0 # sched: [5:1.00] +; 
SANDY-NEXT: retq # sched: [5:1.00] +; +; HASWELL-LABEL: test_permilvarps: +; HASWELL: # BB#0: +; HASWELL-NEXT: vpermilps %xmm1, %xmm0, %xmm0 # sched: [1:1.00] +; HASWELL-NEXT: vpermilps (%rdi), %xmm0, %xmm0 # sched: [5:1.00] +; HASWELL-NEXT: retq # sched: [1:1.00] +; +; BTVER2-LABEL: test_permilvarps: +; BTVER2: # BB#0: +; BTVER2-NEXT: vpermilps %xmm1, %xmm0, %xmm0 # sched: [1:0.50] +; BTVER2-NEXT: vpermilps (%rdi), %xmm0, %xmm0 # sched: [6:1.00] +; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_permilvarps: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vpermilps %xmm1, %xmm0, %xmm0 # sched: [1:0.50] +; ZNVER1-NEXT: vpermilps (%rdi), %xmm0, %xmm0 # sched: [6:1.00] +; ZNVER1-NEXT: retq # sched: [4:1.00] + %1 = call <4 x float> @llvm.x86.avx.vpermilvar.ps(<4 x float> %a0, <4 x i32> %a1) + %2 = load <4 x i32>, <4 x i32> *%a2, align 16 + %3 = call <4 x float> @llvm.x86.avx.vpermilvar.ps(<4 x float> %1, <4 x i32> %2) + ret <4 x float> %3 +} +declare <4 x float> @llvm.x86.avx.vpermilvar.ps(<4 x float>, <4 x i32>) nounwind readnone + +define <8 x float> @test_permilvarps_ymm(<8 x float> %a0, <8 x i32> %a1, <8 x i32> *%a2) { +; SANDY-LABEL: test_permilvarps_ymm: +; SANDY: # BB#0: +; SANDY-NEXT: vpermilps %ymm1, %ymm0, %ymm0 # sched: [1:1.00] +; SANDY-NEXT: vpermilps (%rdi), %ymm0, %ymm0 # sched: [5:1.00] +; SANDY-NEXT: retq # sched: [5:1.00] +; +; HASWELL-LABEL: test_permilvarps_ymm: +; HASWELL: # BB#0: +; HASWELL-NEXT: vpermilps %ymm1, %ymm0, %ymm0 # sched: [1:1.00] +; HASWELL-NEXT: vpermilps (%rdi), %ymm0, %ymm0 # sched: [5:1.00] +; HASWELL-NEXT: retq # sched: [1:1.00] +; +; BTVER2-LABEL: test_permilvarps_ymm: +; BTVER2: # BB#0: +; BTVER2-NEXT: vpermilps %ymm1, %ymm0, %ymm0 # sched: [1:0.50] +; BTVER2-NEXT: vpermilps (%rdi), %ymm0, %ymm0 # sched: [6:1.00] +; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_permilvarps_ymm: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vpermilps %ymm1, %ymm0, %ymm0 # sched: [1:0.50] +; ZNVER1-NEXT: vpermilps (%rdi), %ymm0, %ymm0 # sched: [6:1.00] +; ZNVER1-NEXT: retq # sched: [4:1.00] + %1 = call <8 x float> @llvm.x86.avx.vpermilvar.ps.256(<8 x float> %a0, <8 x i32> %a1) + %2 = load <8 x i32>, <8 x i32> *%a2, align 32 + %3 = call <8 x float> @llvm.x86.avx.vpermilvar.ps.256(<8 x float> %1, <8 x i32> %2) + ret <8 x float> %3 +} +declare <8 x float> @llvm.x86.avx.vpermilvar.ps.256(<8 x float>, <8 x i32>) nounwind readnone + +define <8 x float> @test_rcpps(<8 x float> %a0, <8 x float> *%a1) { +; SANDY-LABEL: test_rcpps: +; SANDY: # BB#0: +; SANDY-NEXT: vrcpps %ymm0, %ymm0 # sched: [5:1.00] +; SANDY-NEXT: vrcpps (%rdi), %ymm1 # sched: [9:1.00] +; SANDY-NEXT: vaddps %ymm1, %ymm0, %ymm0 # sched: [3:1.00] +; SANDY-NEXT: retq # sched: [5:1.00] +; +; HASWELL-LABEL: test_rcpps: +; HASWELL: # BB#0: +; HASWELL-NEXT: vrcpps (%rdi), %ymm1 # sched: [11:2.00] +; HASWELL-NEXT: vrcpps %ymm0, %ymm0 # sched: [7:2.00] +; HASWELL-NEXT: vaddps %ymm1, %ymm0, %ymm0 # sched: [3:1.00] +; HASWELL-NEXT: retq # sched: [1:1.00] +; +; BTVER2-LABEL: test_rcpps: +; BTVER2: # BB#0: +; BTVER2-NEXT: vrcpps (%rdi), %ymm1 # sched: [7:1.00] +; BTVER2-NEXT: vrcpps %ymm0, %ymm0 # sched: [2:1.00] +; BTVER2-NEXT: vaddps %ymm1, %ymm0, %ymm0 # sched: [3:1.00] +; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_rcpps: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vrcpps (%rdi), %ymm1 # sched: [7:1.00] +; ZNVER1-NEXT: vrcpps %ymm0, %ymm0 # sched: [2:1.00] +; ZNVER1-NEXT: vaddps %ymm1, %ymm0, %ymm0 # sched: [3:1.00] +; ZNVER1-NEXT: retq # sched: [4:1.00] + %1 = call <8 x float> @llvm.x86.avx.rcp.ps.256(<8 x 
float> %a0) + %2 = load <8 x float>, <8 x float> *%a1, align 32 + %3 = call <8 x float> @llvm.x86.avx.rcp.ps.256(<8 x float> %2) + %4 = fadd <8 x float> %1, %3 + ret <8 x float> %4 +} +declare <8 x float> @llvm.x86.avx.rcp.ps.256(<8 x float>) nounwind readnone + +define <4 x double> @test_roundpd(<4 x double> %a0, <4 x double> *%a1) { +; SANDY-LABEL: test_roundpd: +; SANDY: # BB#0: +; SANDY-NEXT: vroundpd $7, %ymm0, %ymm0 # sched: [3:1.00] +; SANDY-NEXT: vroundpd $7, (%rdi), %ymm1 # sched: [7:1.00] +; SANDY-NEXT: vaddpd %ymm1, %ymm0, %ymm0 # sched: [3:1.00] +; SANDY-NEXT: retq # sched: [5:1.00] +; +; HASWELL-LABEL: test_roundpd: +; HASWELL: # BB#0: +; HASWELL-NEXT: vroundpd $7, %ymm0, %ymm0 # sched: [6:2.00] +; HASWELL-NEXT: vroundpd $7, (%rdi), %ymm1 # sched: [10:2.00] +; HASWELL-NEXT: vaddpd %ymm1, %ymm0, %ymm0 # sched: [3:1.00] +; HASWELL-NEXT: retq # sched: [1:1.00] +; +; BTVER2-LABEL: test_roundpd: +; BTVER2: # BB#0: +; BTVER2-NEXT: vroundpd $7, (%rdi), %ymm1 # sched: [8:1.00] +; BTVER2-NEXT: vroundpd $7, %ymm0, %ymm0 # sched: [3:1.00] +; BTVER2-NEXT: vaddpd %ymm1, %ymm0, %ymm0 # sched: [3:1.00] +; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_roundpd: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vroundpd $7, (%rdi), %ymm1 # sched: [8:1.00] +; ZNVER1-NEXT: vroundpd $7, %ymm0, %ymm0 # sched: [3:1.00] +; ZNVER1-NEXT: vaddpd %ymm1, %ymm0, %ymm0 # sched: [3:1.00] +; ZNVER1-NEXT: retq # sched: [4:1.00] + %1 = call <4 x double> @llvm.x86.avx.round.pd.256(<4 x double> %a0, i32 7) + %2 = load <4 x double>, <4 x double> *%a1, align 32 + %3 = call <4 x double> @llvm.x86.avx.round.pd.256(<4 x double> %2, i32 7) + %4 = fadd <4 x double> %1, %3 + ret <4 x double> %4 +} +declare <4 x double> @llvm.x86.avx.round.pd.256(<4 x double>, i32) nounwind readnone + +define <8 x float> @test_roundps(<8 x float> %a0, <8 x float> *%a1) { +; SANDY-LABEL: test_roundps: +; SANDY: # BB#0: +; SANDY-NEXT: vroundps $7, %ymm0, %ymm0 # sched: [3:1.00] +; SANDY-NEXT: vroundps $7, (%rdi), %ymm1 # sched: [7:1.00] +; SANDY-NEXT: vaddps %ymm1, %ymm0, %ymm0 # sched: [3:1.00] +; SANDY-NEXT: retq # sched: [5:1.00] +; +; HASWELL-LABEL: test_roundps: +; HASWELL: # BB#0: +; HASWELL-NEXT: vroundps $7, %ymm0, %ymm0 # sched: [6:2.00] +; HASWELL-NEXT: vroundps $7, (%rdi), %ymm1 # sched: [10:2.00] +; HASWELL-NEXT: vaddps %ymm1, %ymm0, %ymm0 # sched: [3:1.00] +; HASWELL-NEXT: retq # sched: [1:1.00] +; +; BTVER2-LABEL: test_roundps: +; BTVER2: # BB#0: +; BTVER2-NEXT: vroundps $7, (%rdi), %ymm1 # sched: [8:1.00] +; BTVER2-NEXT: vroundps $7, %ymm0, %ymm0 # sched: [3:1.00] +; BTVER2-NEXT: vaddps %ymm1, %ymm0, %ymm0 # sched: [3:1.00] +; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_roundps: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vroundps $7, (%rdi), %ymm1 # sched: [8:1.00] +; ZNVER1-NEXT: vroundps $7, %ymm0, %ymm0 # sched: [3:1.00] +; ZNVER1-NEXT: vaddps %ymm1, %ymm0, %ymm0 # sched: [3:1.00] +; ZNVER1-NEXT: retq # sched: [4:1.00] + %1 = call <8 x float> @llvm.x86.avx.round.ps.256(<8 x float> %a0, i32 7) + %2 = load <8 x float>, <8 x float> *%a1, align 32 + %3 = call <8 x float> @llvm.x86.avx.round.ps.256(<8 x float> %2, i32 7) + %4 = fadd <8 x float> %1, %3 + ret <8 x float> %4 +} +declare <8 x float> @llvm.x86.avx.round.ps.256(<8 x float>, i32) nounwind readnone + +define <8 x float> @test_rsqrtps(<8 x float> %a0, <8 x float> *%a1) { +; SANDY-LABEL: test_rsqrtps: +; SANDY: # BB#0: +; SANDY-NEXT: vrsqrtps %ymm0, %ymm0 # sched: [5:1.00] +; SANDY-NEXT: vrsqrtps (%rdi), %ymm1 # sched: [9:1.00] +; SANDY-NEXT: vaddps %ymm1, 
%ymm0, %ymm0 # sched: [3:1.00] +; SANDY-NEXT: retq # sched: [5:1.00] +; +; HASWELL-LABEL: test_rsqrtps: +; HASWELL: # BB#0: +; HASWELL-NEXT: vrsqrtps (%rdi), %ymm1 # sched: [11:2.00] +; HASWELL-NEXT: vrsqrtps %ymm0, %ymm0 # sched: [7:2.00] +; HASWELL-NEXT: vaddps %ymm1, %ymm0, %ymm0 # sched: [3:1.00] +; HASWELL-NEXT: retq # sched: [1:1.00] +; +; BTVER2-LABEL: test_rsqrtps: +; BTVER2: # BB#0: +; BTVER2-NEXT: vrsqrtps (%rdi), %ymm1 # sched: [7:1.00] +; BTVER2-NEXT: vrsqrtps %ymm0, %ymm0 # sched: [2:1.00] +; BTVER2-NEXT: vaddps %ymm1, %ymm0, %ymm0 # sched: [3:1.00] +; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_rsqrtps: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vrsqrtps (%rdi), %ymm1 # sched: [7:1.00] +; ZNVER1-NEXT: vrsqrtps %ymm0, %ymm0 # sched: [2:1.00] +; ZNVER1-NEXT: vaddps %ymm1, %ymm0, %ymm0 # sched: [3:1.00] +; ZNVER1-NEXT: retq # sched: [4:1.00] + %1 = call <8 x float> @llvm.x86.avx.rsqrt.ps.256(<8 x float> %a0) + %2 = load <8 x float>, <8 x float> *%a1, align 32 + %3 = call <8 x float> @llvm.x86.avx.rsqrt.ps.256(<8 x float> %2) + %4 = fadd <8 x float> %1, %3 + ret <8 x float> %4 +} +declare <8 x float> @llvm.x86.avx.rsqrt.ps.256(<8 x float>) nounwind readnone + +define <4 x double> @test_shufpd(<4 x double> %a0, <4 x double> %a1, <4 x double> *%a2) { +; SANDY-LABEL: test_shufpd: +; SANDY: # BB#0: +; SANDY-NEXT: vshufpd {{.*#+}} ymm0 = ymm0[1],ymm1[0],ymm0[2],ymm1[3] sched: [1:1.00] +; SANDY-NEXT: vshufpd {{.*#+}} ymm1 = ymm1[1],mem[0],ymm1[2],mem[3] sched: [5:1.00] +; SANDY-NEXT: vaddpd %ymm1, %ymm0, %ymm0 # sched: [3:1.00] +; SANDY-NEXT: retq # sched: [5:1.00] +; +; HASWELL-LABEL: test_shufpd: +; HASWELL: # BB#0: +; HASWELL-NEXT: vshufpd {{.*#+}} ymm0 = ymm0[1],ymm1[0],ymm0[2],ymm1[3] sched: [1:1.00] +; HASWELL-NEXT: vshufpd {{.*#+}} ymm1 = ymm1[1],mem[0],ymm1[2],mem[3] sched: [5:1.00] +; HASWELL-NEXT: vaddpd %ymm1, %ymm0, %ymm0 # sched: [3:1.00] +; HASWELL-NEXT: retq # sched: [1:1.00] +; +; BTVER2-LABEL: test_shufpd: +; BTVER2: # BB#0: +; BTVER2-NEXT: vshufpd {{.*#+}} ymm0 = ymm0[1],ymm1[0],ymm0[2],ymm1[3] sched: [1:0.50] +; BTVER2-NEXT: vshufpd {{.*#+}} ymm1 = ymm1[1],mem[0],ymm1[2],mem[3] sched: [6:1.00] +; BTVER2-NEXT: vaddpd %ymm1, %ymm0, %ymm0 # sched: [3:1.00] +; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_shufpd: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vshufpd {{.*#+}} ymm0 = ymm0[1],ymm1[0],ymm0[2],ymm1[3] sched: [1:0.50] +; ZNVER1-NEXT: vshufpd {{.*#+}} ymm1 = ymm1[1],mem[0],ymm1[2],mem[3] sched: [6:1.00] +; ZNVER1-NEXT: vaddpd %ymm1, %ymm0, %ymm0 # sched: [3:1.00] +; ZNVER1-NEXT: retq # sched: [4:1.00] + %1 = shufflevector <4 x double> %a0, <4 x double> %a1, <4 x i32> + %2 = load <4 x double>, <4 x double> *%a2, align 32 + %3 = shufflevector <4 x double> %a1, <4 x double> %2, <4 x i32> + %4 = fadd <4 x double> %1, %3 + ret <4 x double> %4 +} + +define <8 x float> @test_shufps(<8 x float> %a0, <8 x float> %a1, <8 x float> *%a2) nounwind { +; SANDY-LABEL: test_shufps: +; SANDY: # BB#0: +; SANDY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,0],ymm1[0,0],ymm0[4,4],ymm1[4,4] sched: [1:1.00] +; SANDY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,3],mem[0,0],ymm0[4,7],mem[4,4] sched: [5:1.00] +; SANDY-NEXT: retq # sched: [5:1.00] +; +; HASWELL-LABEL: test_shufps: +; HASWELL: # BB#0: +; HASWELL-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,0],ymm1[0,0],ymm0[4,4],ymm1[4,4] sched: [1:1.00] +; HASWELL-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,3],mem[0,0],ymm0[4,7],mem[4,4] sched: [5:1.00] +; HASWELL-NEXT: retq # sched: [1:1.00] +; +; BTVER2-LABEL: test_shufps: +; BTVER2: # BB#0: +; 
BTVER2-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,0],ymm1[0,0],ymm0[4,4],ymm1[4,4] sched: [1:0.50] +; BTVER2-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,3],mem[0,0],ymm0[4,7],mem[4,4] sched: [6:1.00] +; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_shufps: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,0],ymm1[0,0],ymm0[4,4],ymm1[4,4] sched: [1:0.50] +; ZNVER1-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,3],mem[0,0],ymm0[4,7],mem[4,4] sched: [6:1.00] +; ZNVER1-NEXT: retq # sched: [4:1.00] + %1 = shufflevector <8 x float> %a0, <8 x float> %a1, <8 x i32> + %2 = load <8 x float>, <8 x float> *%a2, align 32 + %3 = shufflevector <8 x float> %1, <8 x float> %2, <8 x i32> + ret <8 x float> %3 +} + +define <4 x double> @test_sqrtpd(<4 x double> %a0, <4 x double> *%a1) { +; SANDY-LABEL: test_sqrtpd: +; SANDY: # BB#0: +; SANDY-NEXT: vsqrtpd %ymm0, %ymm0 # sched: [15:1.00] +; SANDY-NEXT: vsqrtpd (%rdi), %ymm1 # sched: [19:1.00] +; SANDY-NEXT: vaddpd %ymm1, %ymm0, %ymm0 # sched: [3:1.00] +; SANDY-NEXT: retq # sched: [5:1.00] +; +; HASWELL-LABEL: test_sqrtpd: +; HASWELL: # BB#0: +; HASWELL-NEXT: vsqrtpd (%rdi), %ymm1 # sched: [32:2.00] +; HASWELL-NEXT: vsqrtpd %ymm0, %ymm0 # sched: [28:2.00] +; HASWELL-NEXT: vaddpd %ymm1, %ymm0, %ymm0 # sched: [3:1.00] +; HASWELL-NEXT: retq # sched: [1:1.00] +; +; BTVER2-LABEL: test_sqrtpd: +; BTVER2: # BB#0: +; BTVER2-NEXT: vsqrtpd (%rdi), %ymm1 # sched: [26:21.00] +; BTVER2-NEXT: vsqrtpd %ymm0, %ymm0 # sched: [21:21.00] +; BTVER2-NEXT: vaddpd %ymm1, %ymm0, %ymm0 # sched: [3:1.00] +; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_sqrtpd: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vsqrtpd (%rdi), %ymm1 # sched: [26:21.00] +; ZNVER1-NEXT: vsqrtpd %ymm0, %ymm0 # sched: [21:21.00] +; ZNVER1-NEXT: vaddpd %ymm1, %ymm0, %ymm0 # sched: [3:1.00] +; ZNVER1-NEXT: retq # sched: [4:1.00] + %1 = call <4 x double> @llvm.x86.avx.sqrt.pd.256(<4 x double> %a0) + %2 = load <4 x double>, <4 x double> *%a1, align 32 + %3 = call <4 x double> @llvm.x86.avx.sqrt.pd.256(<4 x double> %2) + %4 = fadd <4 x double> %1, %3 + ret <4 x double> %4 +} +declare <4 x double> @llvm.x86.avx.sqrt.pd.256(<4 x double>) nounwind readnone + +define <8 x float> @test_sqrtps(<8 x float> %a0, <8 x float> *%a1) { +; SANDY-LABEL: test_sqrtps: +; SANDY: # BB#0: +; SANDY-NEXT: vsqrtps %ymm0, %ymm0 # sched: [15:1.00] +; SANDY-NEXT: vsqrtps (%rdi), %ymm1 # sched: [19:1.00] +; SANDY-NEXT: vaddps %ymm1, %ymm0, %ymm0 # sched: [3:1.00] +; SANDY-NEXT: retq # sched: [5:1.00] +; +; HASWELL-LABEL: test_sqrtps: +; HASWELL: # BB#0: +; HASWELL-NEXT: vsqrtps (%rdi), %ymm1 # sched: [23:2.00] +; HASWELL-NEXT: vsqrtps %ymm0, %ymm0 # sched: [19:2.00] +; HASWELL-NEXT: vaddps %ymm1, %ymm0, %ymm0 # sched: [3:1.00] +; HASWELL-NEXT: retq # sched: [1:1.00] +; +; BTVER2-LABEL: test_sqrtps: +; BTVER2: # BB#0: +; BTVER2-NEXT: vsqrtps (%rdi), %ymm1 # sched: [26:21.00] +; BTVER2-NEXT: vsqrtps %ymm0, %ymm0 # sched: [21:21.00] +; BTVER2-NEXT: vaddps %ymm1, %ymm0, %ymm0 # sched: [3:1.00] +; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_sqrtps: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vsqrtps (%rdi), %ymm1 # sched: [26:21.00] +; ZNVER1-NEXT: vsqrtps %ymm0, %ymm0 # sched: [21:21.00] +; ZNVER1-NEXT: vaddps %ymm1, %ymm0, %ymm0 # sched: [3:1.00] +; ZNVER1-NEXT: retq # sched: [4:1.00] + %1 = call <8 x float> @llvm.x86.avx.sqrt.ps.256(<8 x float> %a0) + %2 = load <8 x float>, <8 x float> *%a1, align 32 + %3 = call <8 x float> @llvm.x86.avx.sqrt.ps.256(<8 x float> %2) + %4 = fadd <8 x float> %1, %3 + ret <8 x float> %4 +} 
+declare <8 x float> @llvm.x86.avx.sqrt.ps.256(<8 x float>) nounwind readnone + +define <4 x double> @test_subpd(<4 x double> %a0, <4 x double> %a1, <4 x double> *%a2) { +; SANDY-LABEL: test_subpd: +; SANDY: # BB#0: +; SANDY-NEXT: vsubpd %ymm1, %ymm0, %ymm0 # sched: [3:1.00] +; SANDY-NEXT: vsubpd (%rdi), %ymm0, %ymm0 # sched: [7:1.00] +; SANDY-NEXT: retq # sched: [5:1.00] +; +; HASWELL-LABEL: test_subpd: +; HASWELL: # BB#0: +; HASWELL-NEXT: vsubpd %ymm1, %ymm0, %ymm0 # sched: [3:1.00] +; HASWELL-NEXT: vsubpd (%rdi), %ymm0, %ymm0 # sched: [7:1.00] +; HASWELL-NEXT: retq # sched: [1:1.00] +; +; BTVER2-LABEL: test_subpd: +; BTVER2: # BB#0: +; BTVER2-NEXT: vsubpd %ymm1, %ymm0, %ymm0 # sched: [3:1.00] +; BTVER2-NEXT: vsubpd (%rdi), %ymm0, %ymm0 # sched: [8:1.00] +; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_subpd: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vsubpd %ymm1, %ymm0, %ymm0 # sched: [3:1.00] +; ZNVER1-NEXT: vsubpd (%rdi), %ymm0, %ymm0 # sched: [8:1.00] +; ZNVER1-NEXT: retq # sched: [4:1.00] + %1 = fsub <4 x double> %a0, %a1 + %2 = load <4 x double>, <4 x double> *%a2, align 32 + %3 = fsub <4 x double> %1, %2 + ret <4 x double> %3 +} + +define <8 x float> @test_subps(<8 x float> %a0, <8 x float> %a1, <8 x float> *%a2) { +; SANDY-LABEL: test_subps: +; SANDY: # BB#0: +; SANDY-NEXT: vsubps %ymm1, %ymm0, %ymm0 # sched: [3:1.00] +; SANDY-NEXT: vsubps (%rdi), %ymm0, %ymm0 # sched: [7:1.00] +; SANDY-NEXT: retq # sched: [5:1.00] +; +; HASWELL-LABEL: test_subps: +; HASWELL: # BB#0: +; HASWELL-NEXT: vsubps %ymm1, %ymm0, %ymm0 # sched: [3:1.00] +; HASWELL-NEXT: vsubps (%rdi), %ymm0, %ymm0 # sched: [7:1.00] +; HASWELL-NEXT: retq # sched: [1:1.00] +; +; BTVER2-LABEL: test_subps: +; BTVER2: # BB#0: +; BTVER2-NEXT: vsubps %ymm1, %ymm0, %ymm0 # sched: [3:1.00] +; BTVER2-NEXT: vsubps (%rdi), %ymm0, %ymm0 # sched: [8:1.00] +; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_subps: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vsubps %ymm1, %ymm0, %ymm0 # sched: [3:1.00] +; ZNVER1-NEXT: vsubps (%rdi), %ymm0, %ymm0 # sched: [8:1.00] +; ZNVER1-NEXT: retq # sched: [4:1.00] + %1 = fsub <8 x float> %a0, %a1 + %2 = load <8 x float>, <8 x float> *%a2, align 32 + %3 = fsub <8 x float> %1, %2 + ret <8 x float> %3 +} + +define i32 @test_testpd(<2 x double> %a0, <2 x double> %a1, <2 x double> *%a2) { +; SANDY-LABEL: test_testpd: +; SANDY: # BB#0: +; SANDY-NEXT: xorl %eax, %eax # sched: [1:0.33] +; SANDY-NEXT: vtestpd %xmm1, %xmm0 # sched: [1:0.33] +; SANDY-NEXT: setb %al # sched: [1:0.33] +; SANDY-NEXT: vtestpd (%rdi), %xmm0 # sched: [5:0.50] +; SANDY-NEXT: adcl $0, %eax # sched: [1:0.33] +; SANDY-NEXT: retq # sched: [5:1.00] +; +; HASWELL-LABEL: test_testpd: +; HASWELL: # BB#0: +; HASWELL-NEXT: xorl %eax, %eax # sched: [1:0.25] +; HASWELL-NEXT: vtestpd %xmm1, %xmm0 # sched: [1:0.33] +; HASWELL-NEXT: setb %al # sched: [1:0.50] +; HASWELL-NEXT: vtestpd (%rdi), %xmm0 # sched: [5:0.50] +; HASWELL-NEXT: adcl $0, %eax # sched: [2:0.50] +; HASWELL-NEXT: retq # sched: [1:1.00] +; +; BTVER2-LABEL: test_testpd: +; BTVER2: # BB#0: +; BTVER2-NEXT: xorl %eax, %eax # sched: [1:0.50] +; BTVER2-NEXT: vtestpd %xmm1, %xmm0 # sched: [1:0.50] +; BTVER2-NEXT: setb %al # sched: [1:0.50] +; BTVER2-NEXT: vtestpd (%rdi), %xmm0 # sched: [6:1.00] +; BTVER2-NEXT: adcl $0, %eax # sched: [1:0.50] +; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_testpd: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: xorl %eax, %eax # sched: [1:0.50] +; ZNVER1-NEXT: vtestpd %xmm1, %xmm0 # sched: [1:0.50] +; ZNVER1-NEXT: setb %al # sched: [1:0.50] +; 
ZNVER1-NEXT: vtestpd (%rdi), %xmm0 # sched: [6:1.00] +; ZNVER1-NEXT: adcl $0, %eax # sched: [1:0.50] +; ZNVER1-NEXT: retq # sched: [4:1.00] + %1 = call i32 @llvm.x86.avx.vtestc.pd(<2 x double> %a0, <2 x double> %a1) + %2 = load <2 x double>, <2 x double> *%a2, align 16 + %3 = call i32 @llvm.x86.avx.vtestc.pd(<2 x double> %a0, <2 x double> %2) + %4 = add i32 %1, %3 + ret i32 %4 +} +declare i32 @llvm.x86.avx.vtestc.pd(<2 x double>, <2 x double>) nounwind readnone + +define i32 @test_testpd_ymm(<4 x double> %a0, <4 x double> %a1, <4 x double> *%a2) { +; SANDY-LABEL: test_testpd_ymm: +; SANDY: # BB#0: +; SANDY-NEXT: xorl %eax, %eax # sched: [1:0.33] +; SANDY-NEXT: vtestpd %ymm1, %ymm0 # sched: [1:0.33] +; SANDY-NEXT: setb %al # sched: [1:0.33] +; SANDY-NEXT: vtestpd (%rdi), %ymm0 # sched: [5:0.50] +; SANDY-NEXT: adcl $0, %eax # sched: [1:0.33] +; SANDY-NEXT: vzeroupper # sched: [?:0.000000e+00] +; SANDY-NEXT: retq # sched: [5:1.00] +; +; HASWELL-LABEL: test_testpd_ymm: +; HASWELL: # BB#0: +; HASWELL-NEXT: xorl %eax, %eax # sched: [1:0.25] +; HASWELL-NEXT: vtestpd %ymm1, %ymm0 # sched: [1:0.33] +; HASWELL-NEXT: setb %al # sched: [1:0.50] +; HASWELL-NEXT: vtestpd (%rdi), %ymm0 # sched: [5:0.50] +; HASWELL-NEXT: adcl $0, %eax # sched: [2:0.50] +; HASWELL-NEXT: vzeroupper # sched: [1:0.00] +; HASWELL-NEXT: retq # sched: [1:1.00] +; +; BTVER2-LABEL: test_testpd_ymm: +; BTVER2: # BB#0: +; BTVER2-NEXT: xorl %eax, %eax # sched: [1:0.50] +; BTVER2-NEXT: vtestpd %ymm1, %ymm0 # sched: [1:0.50] +; BTVER2-NEXT: setb %al # sched: [1:0.50] +; BTVER2-NEXT: vtestpd (%rdi), %ymm0 # sched: [6:1.00] +; BTVER2-NEXT: adcl $0, %eax # sched: [1:0.50] +; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_testpd_ymm: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: xorl %eax, %eax # sched: [1:0.50] +; ZNVER1-NEXT: vtestpd %ymm1, %ymm0 # sched: [1:0.50] +; ZNVER1-NEXT: setb %al # sched: [1:0.50] +; ZNVER1-NEXT: vtestpd (%rdi), %ymm0 # sched: [6:1.00] +; ZNVER1-NEXT: adcl $0, %eax # sched: [1:0.50] +; ZNVER1-NEXT: vzeroupper # sched: [?:0.000000e+00] +; ZNVER1-NEXT: retq # sched: [4:1.00] + %1 = call i32 @llvm.x86.avx.vtestc.pd.256(<4 x double> %a0, <4 x double> %a1) + %2 = load <4 x double>, <4 x double> *%a2, align 32 + %3 = call i32 @llvm.x86.avx.vtestc.pd.256(<4 x double> %a0, <4 x double> %2) + %4 = add i32 %1, %3 + ret i32 %4 +} +declare i32 @llvm.x86.avx.vtestc.pd.256(<4 x double>, <4 x double>) nounwind readnone + +define i32 @test_testps(<4 x float> %a0, <4 x float> %a1, <4 x float> *%a2) { +; SANDY-LABEL: test_testps: +; SANDY: # BB#0: +; SANDY-NEXT: xorl %eax, %eax # sched: [1:0.33] +; SANDY-NEXT: vtestps %xmm1, %xmm0 # sched: [1:0.33] +; SANDY-NEXT: setb %al # sched: [1:0.33] +; SANDY-NEXT: vtestps (%rdi), %xmm0 # sched: [5:0.50] +; SANDY-NEXT: adcl $0, %eax # sched: [1:0.33] +; SANDY-NEXT: retq # sched: [5:1.00] +; +; HASWELL-LABEL: test_testps: +; HASWELL: # BB#0: +; HASWELL-NEXT: xorl %eax, %eax # sched: [1:0.25] +; HASWELL-NEXT: vtestps %xmm1, %xmm0 # sched: [1:0.33] +; HASWELL-NEXT: setb %al # sched: [1:0.50] +; HASWELL-NEXT: vtestps (%rdi), %xmm0 # sched: [5:0.50] +; HASWELL-NEXT: adcl $0, %eax # sched: [2:0.50] +; HASWELL-NEXT: retq # sched: [1:1.00] +; +; BTVER2-LABEL: test_testps: +; BTVER2: # BB#0: +; BTVER2-NEXT: xorl %eax, %eax # sched: [1:0.50] +; BTVER2-NEXT: vtestps %xmm1, %xmm0 # sched: [1:0.50] +; BTVER2-NEXT: setb %al # sched: [1:0.50] +; BTVER2-NEXT: vtestps (%rdi), %xmm0 # sched: [6:1.00] +; BTVER2-NEXT: adcl $0, %eax # sched: [1:0.50] +; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: 
test_testps: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: xorl %eax, %eax # sched: [1:0.50] +; ZNVER1-NEXT: vtestps %xmm1, %xmm0 # sched: [1:0.50] +; ZNVER1-NEXT: setb %al # sched: [1:0.50] +; ZNVER1-NEXT: vtestps (%rdi), %xmm0 # sched: [6:1.00] +; ZNVER1-NEXT: adcl $0, %eax # sched: [1:0.50] +; ZNVER1-NEXT: retq # sched: [4:1.00] + %1 = call i32 @llvm.x86.avx.vtestc.ps(<4 x float> %a0, <4 x float> %a1) + %2 = load <4 x float>, <4 x float> *%a2, align 16 + %3 = call i32 @llvm.x86.avx.vtestc.ps(<4 x float> %a0, <4 x float> %2) + %4 = add i32 %1, %3 + ret i32 %4 +} +declare i32 @llvm.x86.avx.vtestc.ps(<4 x float>, <4 x float>) nounwind readnone + +define i32 @test_testps_ymm(<8 x float> %a0, <8 x float> %a1, <8 x float> *%a2) { +; SANDY-LABEL: test_testps_ymm: +; SANDY: # BB#0: +; SANDY-NEXT: xorl %eax, %eax # sched: [1:0.33] +; SANDY-NEXT: vtestps %ymm1, %ymm0 # sched: [1:0.33] +; SANDY-NEXT: setb %al # sched: [1:0.33] +; SANDY-NEXT: vtestps (%rdi), %ymm0 # sched: [5:0.50] +; SANDY-NEXT: adcl $0, %eax # sched: [1:0.33] +; SANDY-NEXT: vzeroupper # sched: [?:0.000000e+00] +; SANDY-NEXT: retq # sched: [5:1.00] +; +; HASWELL-LABEL: test_testps_ymm: +; HASWELL: # BB#0: +; HASWELL-NEXT: xorl %eax, %eax # sched: [1:0.25] +; HASWELL-NEXT: vtestps %ymm1, %ymm0 # sched: [1:0.33] +; HASWELL-NEXT: setb %al # sched: [1:0.50] +; HASWELL-NEXT: vtestps (%rdi), %ymm0 # sched: [5:0.50] +; HASWELL-NEXT: adcl $0, %eax # sched: [2:0.50] +; HASWELL-NEXT: vzeroupper # sched: [1:0.00] +; HASWELL-NEXT: retq # sched: [1:1.00] +; +; BTVER2-LABEL: test_testps_ymm: +; BTVER2: # BB#0: +; BTVER2-NEXT: xorl %eax, %eax # sched: [1:0.50] +; BTVER2-NEXT: vtestps %ymm1, %ymm0 # sched: [1:0.50] +; BTVER2-NEXT: setb %al # sched: [1:0.50] +; BTVER2-NEXT: vtestps (%rdi), %ymm0 # sched: [6:1.00] +; BTVER2-NEXT: adcl $0, %eax # sched: [1:0.50] +; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_testps_ymm: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: xorl %eax, %eax # sched: [1:0.50] +; ZNVER1-NEXT: vtestps %ymm1, %ymm0 # sched: [1:0.50] +; ZNVER1-NEXT: setb %al # sched: [1:0.50] +; ZNVER1-NEXT: vtestps (%rdi), %ymm0 # sched: [6:1.00] +; ZNVER1-NEXT: adcl $0, %eax # sched: [1:0.50] +; ZNVER1-NEXT: vzeroupper # sched: [?:0.000000e+00] +; ZNVER1-NEXT: retq # sched: [4:1.00] + %1 = call i32 @llvm.x86.avx.vtestc.ps.256(<8 x float> %a0, <8 x float> %a1) + %2 = load <8 x float>, <8 x float> *%a2, align 32 + %3 = call i32 @llvm.x86.avx.vtestc.ps.256(<8 x float> %a0, <8 x float> %2) + %4 = add i32 %1, %3 + ret i32 %4 +} +declare i32 @llvm.x86.avx.vtestc.ps.256(<8 x float>, <8 x float>) nounwind readnone + +define <4 x double> @test_unpckhpd(<4 x double> %a0, <4 x double> %a1, <4 x double> *%a2) { +; SANDY-LABEL: test_unpckhpd: +; SANDY: # BB#0: +; SANDY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3] sched: [1:1.00] +; SANDY-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm1[1],mem[1],ymm1[3],mem[3] sched: [5:1.00] +; SANDY-NEXT: vaddpd %ymm1, %ymm0, %ymm0 # sched: [3:1.00] +; SANDY-NEXT: retq # sched: [5:1.00] +; +; HASWELL-LABEL: test_unpckhpd: +; HASWELL: # BB#0: +; HASWELL-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3] sched: [1:1.00] +; HASWELL-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm1[1],mem[1],ymm1[3],mem[3] sched: [5:1.00] +; HASWELL-NEXT: vaddpd %ymm1, %ymm0, %ymm0 # sched: [3:1.00] +; HASWELL-NEXT: retq # sched: [1:1.00] +; +; BTVER2-LABEL: test_unpckhpd: +; BTVER2: # BB#0: +; BTVER2-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3] sched: [1:0.50] +; BTVER2-NEXT: vunpckhpd {{.*#+}} ymm1 = 
ymm1[1],mem[1],ymm1[3],mem[3] sched: [6:1.00] +; BTVER2-NEXT: vaddpd %ymm1, %ymm0, %ymm0 # sched: [3:1.00] +; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_unpckhpd: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3] sched: [1:0.50] +; ZNVER1-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm1[1],mem[1],ymm1[3],mem[3] sched: [6:1.00] +; ZNVER1-NEXT: vaddpd %ymm1, %ymm0, %ymm0 # sched: [3:1.00] +; ZNVER1-NEXT: retq # sched: [4:1.00] + %1 = shufflevector <4 x double> %a0, <4 x double> %a1, <4 x i32> + %2 = load <4 x double>, <4 x double> *%a2, align 32 + %3 = shufflevector <4 x double> %a1, <4 x double> %2, <4 x i32> + %4 = fadd <4 x double> %1, %3 + ret <4 x double> %4 +} + +define <8 x float> @test_unpckhps(<8 x float> %a0, <8 x float> %a1, <8 x float> *%a2) nounwind { +; SANDY-LABEL: test_unpckhps: +; SANDY: # BB#0: +; SANDY-NEXT: vunpckhps {{.*#+}} ymm0 = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7] sched: [1:1.00] +; SANDY-NEXT: vunpckhps {{.*#+}} ymm0 = ymm0[2],mem[2],ymm0[3],mem[3],ymm0[6],mem[6],ymm0[7],mem[7] sched: [5:1.00] +; SANDY-NEXT: retq # sched: [5:1.00] +; +; HASWELL-LABEL: test_unpckhps: +; HASWELL: # BB#0: +; HASWELL-NEXT: vunpckhps {{.*#+}} ymm0 = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7] sched: [1:1.00] +; HASWELL-NEXT: vunpckhps {{.*#+}} ymm0 = ymm0[2],mem[2],ymm0[3],mem[3],ymm0[6],mem[6],ymm0[7],mem[7] sched: [5:1.00] +; HASWELL-NEXT: retq # sched: [1:1.00] +; +; BTVER2-LABEL: test_unpckhps: +; BTVER2: # BB#0: +; BTVER2-NEXT: vunpckhps {{.*#+}} ymm0 = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7] sched: [1:0.50] +; BTVER2-NEXT: vunpckhps {{.*#+}} ymm0 = ymm0[2],mem[2],ymm0[3],mem[3],ymm0[6],mem[6],ymm0[7],mem[7] sched: [6:1.00] +; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_unpckhps: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vunpckhps {{.*#+}} ymm0 = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7] sched: [1:0.50] +; ZNVER1-NEXT: vunpckhps {{.*#+}} ymm0 = ymm0[2],mem[2],ymm0[3],mem[3],ymm0[6],mem[6],ymm0[7],mem[7] sched: [6:1.00] +; ZNVER1-NEXT: retq # sched: [4:1.00] + %1 = shufflevector <8 x float> %a0, <8 x float> %a1, <8 x i32> + %2 = load <8 x float>, <8 x float> *%a2, align 32 + %3 = shufflevector <8 x float> %1, <8 x float> %2, <8 x i32> + ret <8 x float> %3 +} + +define <4 x double> @test_unpcklpd(<4 x double> %a0, <4 x double> %a1, <4 x double> *%a2) { +; SANDY-LABEL: test_unpcklpd: +; SANDY: # BB#0: +; SANDY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] sched: [1:1.00] +; SANDY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm1[0],mem[0],ymm1[2],mem[2] sched: [5:1.00] +; SANDY-NEXT: vaddpd %ymm1, %ymm0, %ymm0 # sched: [3:1.00] +; SANDY-NEXT: retq # sched: [5:1.00] +; +; HASWELL-LABEL: test_unpcklpd: +; HASWELL: # BB#0: +; HASWELL-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] sched: [1:1.00] +; HASWELL-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm1[0],mem[0],ymm1[2],mem[2] sched: [5:1.00] +; HASWELL-NEXT: vaddpd %ymm1, %ymm0, %ymm0 # sched: [3:1.00] +; HASWELL-NEXT: retq # sched: [1:1.00] +; +; BTVER2-LABEL: test_unpcklpd: +; BTVER2: # BB#0: +; BTVER2-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] sched: [1:0.50] +; BTVER2-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm1[0],mem[0],ymm1[2],mem[2] sched: [6:1.00] +; BTVER2-NEXT: vaddpd %ymm1, %ymm0, %ymm0 # sched: [3:1.00] +; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_unpcklpd: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vunpcklpd {{.*#+}} ymm0 = 
ymm0[0],ymm1[0],ymm0[2],ymm1[2] sched: [1:0.50] +; ZNVER1-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm1[0],mem[0],ymm1[2],mem[2] sched: [6:1.00] +; ZNVER1-NEXT: vaddpd %ymm1, %ymm0, %ymm0 # sched: [3:1.00] +; ZNVER1-NEXT: retq # sched: [4:1.00] + %1 = shufflevector <4 x double> %a0, <4 x double> %a1, <4 x i32> + %2 = load <4 x double>, <4 x double> *%a2, align 32 + %3 = shufflevector <4 x double> %a1, <4 x double> %2, <4 x i32> + %4 = fadd <4 x double> %1, %3 + ret <4 x double> %4 +} + +define <8 x float> @test_unpcklps(<8 x float> %a0, <8 x float> %a1, <8 x float> *%a2) nounwind { +; SANDY-LABEL: test_unpcklps: +; SANDY: # BB#0: +; SANDY-NEXT: vunpcklps {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5] sched: [1:1.00] +; SANDY-NEXT: vunpcklps {{.*#+}} ymm0 = ymm0[0],mem[0],ymm0[1],mem[1],ymm0[4],mem[4],ymm0[5],mem[5] sched: [5:1.00] +; SANDY-NEXT: retq # sched: [5:1.00] +; +; HASWELL-LABEL: test_unpcklps: +; HASWELL: # BB#0: +; HASWELL-NEXT: vunpcklps {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5] sched: [1:1.00] +; HASWELL-NEXT: vunpcklps {{.*#+}} ymm0 = ymm0[0],mem[0],ymm0[1],mem[1],ymm0[4],mem[4],ymm0[5],mem[5] sched: [5:1.00] +; HASWELL-NEXT: retq # sched: [1:1.00] +; +; BTVER2-LABEL: test_unpcklps: +; BTVER2: # BB#0: +; BTVER2-NEXT: vunpcklps {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5] sched: [1:0.50] +; BTVER2-NEXT: vunpcklps {{.*#+}} ymm0 = ymm0[0],mem[0],ymm0[1],mem[1],ymm0[4],mem[4],ymm0[5],mem[5] sched: [6:1.00] +; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_unpcklps: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vunpcklps {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5] sched: [1:0.50] +; ZNVER1-NEXT: vunpcklps {{.*#+}} ymm0 = ymm0[0],mem[0],ymm0[1],mem[1],ymm0[4],mem[4],ymm0[5],mem[5] sched: [6:1.00] +; ZNVER1-NEXT: retq # sched: [4:1.00] + %1 = shufflevector <8 x float> %a0, <8 x float> %a1, <8 x i32> + %2 = load <8 x float>, <8 x float> *%a2, align 32 + %3 = shufflevector <8 x float> %1, <8 x float> %2, <8 x i32> + ret <8 x float> %3 +} + +define <4 x double> @test_xorpd(<4 x double> %a0, <4 x double> %a1, <4 x double> *%a2) { +; SANDY-LABEL: test_xorpd: +; SANDY: # BB#0: +; SANDY-NEXT: vxorpd %ymm1, %ymm0, %ymm0 # sched: [1:0.33] +; SANDY-NEXT: vxorpd (%rdi), %ymm0, %ymm0 # sched: [5:0.50] +; SANDY-NEXT: vaddpd %ymm0, %ymm1, %ymm0 # sched: [3:1.00] +; SANDY-NEXT: retq # sched: [5:1.00] +; +; HASWELL-LABEL: test_xorpd: +; HASWELL: # BB#0: +; HASWELL-NEXT: vxorpd %ymm1, %ymm0, %ymm0 # sched: [1:1.00] +; HASWELL-NEXT: vxorpd (%rdi), %ymm0, %ymm0 # sched: [5:1.00] +; HASWELL-NEXT: vaddpd %ymm0, %ymm1, %ymm0 # sched: [3:1.00] +; HASWELL-NEXT: retq # sched: [1:1.00] +; +; BTVER2-LABEL: test_xorpd: +; BTVER2: # BB#0: +; BTVER2-NEXT: vxorpd %ymm1, %ymm0, %ymm0 # sched: [1:0.50] +; BTVER2-NEXT: vxorpd (%rdi), %ymm0, %ymm0 # sched: [6:1.00] +; BTVER2-NEXT: vaddpd %ymm0, %ymm1, %ymm0 # sched: [3:1.00] +; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_xorpd: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vxorpd %ymm1, %ymm0, %ymm0 # sched: [1:0.50] +; ZNVER1-NEXT: vxorpd (%rdi), %ymm0, %ymm0 # sched: [6:1.00] +; ZNVER1-NEXT: vaddpd %ymm0, %ymm1, %ymm0 # sched: [3:1.00] +; ZNVER1-NEXT: retq # sched: [4:1.00] + %1 = bitcast <4 x double> %a0 to <4 x i64> + %2 = bitcast <4 x double> %a1 to <4 x i64> + %3 = xor <4 x i64> %1, %2 + %4 = load <4 x double>, <4 x double> *%a2, align 32 + %5 = bitcast <4 x double> %4 to <4 x i64> + %6 = xor <4 x i64> %3, %5 + %7 = 
bitcast <4 x i64> %6 to <4 x double> + %8 = fadd <4 x double> %a1, %7 + ret <4 x double> %8 +} + +define <8 x float> @test_xorps(<8 x float> %a0, <8 x float> %a1, <8 x float> *%a2) { +; SANDY-LABEL: test_xorps: +; SANDY: # BB#0: +; SANDY-NEXT: vxorps %ymm1, %ymm0, %ymm0 # sched: [1:0.33] +; SANDY-NEXT: vxorps (%rdi), %ymm0, %ymm0 # sched: [5:0.50] +; SANDY-NEXT: vaddps %ymm0, %ymm1, %ymm0 # sched: [3:1.00] +; SANDY-NEXT: retq # sched: [5:1.00] +; +; HASWELL-LABEL: test_xorps: +; HASWELL: # BB#0: +; HASWELL-NEXT: vxorps %ymm1, %ymm0, %ymm0 # sched: [1:1.00] +; HASWELL-NEXT: vxorps (%rdi), %ymm0, %ymm0 # sched: [5:1.00] +; HASWELL-NEXT: vaddps %ymm0, %ymm1, %ymm0 # sched: [3:1.00] +; HASWELL-NEXT: retq # sched: [1:1.00] +; +; BTVER2-LABEL: test_xorps: +; BTVER2: # BB#0: +; BTVER2-NEXT: vxorps %ymm1, %ymm0, %ymm0 # sched: [1:0.50] +; BTVER2-NEXT: vxorps (%rdi), %ymm0, %ymm0 # sched: [6:1.00] +; BTVER2-NEXT: vaddps %ymm0, %ymm1, %ymm0 # sched: [3:1.00] +; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_xorps: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vxorps %ymm1, %ymm0, %ymm0 # sched: [1:0.50] +; ZNVER1-NEXT: vxorps (%rdi), %ymm0, %ymm0 # sched: [6:1.00] +; ZNVER1-NEXT: vaddps %ymm0, %ymm1, %ymm0 # sched: [3:1.00] +; ZNVER1-NEXT: retq # sched: [4:1.00] + %1 = bitcast <8 x float> %a0 to <4 x i64> + %2 = bitcast <8 x float> %a1 to <4 x i64> + %3 = xor <4 x i64> %1, %2 + %4 = load <8 x float>, <8 x float> *%a2, align 32 + %5 = bitcast <8 x float> %4 to <4 x i64> + %6 = xor <4 x i64> %3, %5 + %7 = bitcast <4 x i64> %6 to <8 x float> + %8 = fadd <8 x float> %a1, %7 + ret <8 x float> %8 +} + +!0 = !{i32 1} diff --git a/test/CodeGen/X86/bitcast2.ll b/test/CodeGen/X86/bitcast2.ll index 12aa863..b75db95 100644 --- a/test/CodeGen/X86/bitcast2.ll +++ b/test/CodeGen/X86/bitcast2.ll @@ -1,4 +1,4 @@ -; RUN: llc < %s -march=x86-64 -mattr=-avx | grep movd | count 2 +; RUN: llc < %s -march=x86-64 -mattr=-avx | grep movq | count 2 ; RUN: llc < %s -march=x86-64 -mattr=-avx | not grep rsp define i64 @test1(double %A) { diff --git a/test/CodeGen/X86/bool-ext-inc.ll b/test/CodeGen/X86/bool-ext-inc.ll index d0967c1..1b69b55 100644 --- a/test/CodeGen/X86/bool-ext-inc.ll +++ b/test/CodeGen/X86/bool-ext-inc.ll @@ -1,5 +1,5 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc < %s -mtriple=x86_64-unknown-unknown | FileCheck %s +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=avx2 | FileCheck %s ; FIXME: add (sext i1 X), 1 -> zext (not i1 X) @@ -20,13 +20,93 @@ define i32 @sext_inc(i1 zeroext %x) nounwind { define <4 x i32> @sext_inc_vec(<4 x i1> %x) nounwind { ; CHECK-LABEL: sext_inc_vec: ; CHECK: # BB#0: -; CHECK-NEXT: pslld $31, %xmm0 -; CHECK-NEXT: psrad $31, %xmm0 -; CHECK-NEXT: paddd {{.*}}(%rip), %xmm0 +; CHECK-NEXT: vpslld $31, %xmm0, %xmm0 +; CHECK-NEXT: vpsrad $31, %xmm0, %xmm0 +; CHECK-NEXT: vpbroadcastd {{.*}}(%rip), %xmm1 +; CHECK-NEXT: vpaddd %xmm1, %xmm0, %xmm0 ; CHECK-NEXT: retq %ext = sext <4 x i1> %x to <4 x i32> %add = add <4 x i32> %ext, ret <4 x i32> %add } +define <4 x i32> @cmpgt_sext_inc_vec(<4 x i32> %x, <4 x i32> %y) nounwind { +; CHECK-LABEL: cmpgt_sext_inc_vec: +; CHECK: # BB#0: +; CHECK-NEXT: vpcmpgtd %xmm1, %xmm0, %xmm0 +; CHECK-NEXT: vpbroadcastd {{.*}}(%rip), %xmm1 +; CHECK-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; CHECK-NEXT: retq + %cmp = icmp sgt <4 x i32> %x, %y + %ext = sext <4 x i1> %cmp to <4 x i32> + %add = add <4 x i32> %ext, + ret <4 x i32> %add +} + +define <4 x i32> @cmpne_sext_inc_vec(<4 x i32> %x, <4 x i32> %y) nounwind { +; 
CHECK-LABEL: cmpne_sext_inc_vec: +; CHECK: # BB#0: +; CHECK-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 +; CHECK-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 +; CHECK-NEXT: vpxor %xmm1, %xmm0, %xmm0 +; CHECK-NEXT: vpbroadcastd {{.*}}(%rip), %xmm1 +; CHECK-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; CHECK-NEXT: retq + %cmp = icmp ne <4 x i32> %x, %y + %ext = sext <4 x i1> %cmp to <4 x i32> + %add = add <4 x i32> %ext, + ret <4 x i32> %add +} + +define <4 x i64> @cmpgt_sext_inc_vec256(<4 x i64> %x, <4 x i64> %y) nounwind { +; CHECK-LABEL: cmpgt_sext_inc_vec256: +; CHECK: # BB#0: +; CHECK-NEXT: vpcmpgtq %ymm1, %ymm0, %ymm0 +; CHECK-NEXT: vpbroadcastq {{.*}}(%rip), %ymm1 +; CHECK-NEXT: vpaddq %ymm1, %ymm0, %ymm0 +; CHECK-NEXT: retq + %cmp = icmp sgt <4 x i64> %x, %y + %ext = sext <4 x i1> %cmp to <4 x i64> + %add = add <4 x i64> %ext, + ret <4 x i64> %add +} + +define i32 @bool_logic_and_math(i32 %a, i32 %b, i32 %c, i32 %d) nounwind { +; CHECK-LABEL: bool_logic_and_math: +; CHECK: # BB#0: +; CHECK-NEXT: cmpl %esi, %edi +; CHECK-NEXT: setne %al +; CHECK-NEXT: cmpl %ecx, %edx +; CHECK-NEXT: setne %cl +; CHECK-NEXT: andb %al, %cl +; CHECK-NEXT: movzbl %cl, %ecx +; CHECK-NEXT: movl $1, %eax +; CHECK-NEXT: subl %ecx, %eax +; CHECK-NEXT: retq + %cmp1 = icmp ne i32 %a, %b + %cmp2 = icmp ne i32 %c, %d + %and = and i1 %cmp1, %cmp2 + %ext = sext i1 %and to i32 + %add = add i32 %ext, 1 + ret i32 %add +} + +define <4 x i32> @bool_logic_and_math_vec(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c, <4 x i32> %d) nounwind { +; CHECK-LABEL: bool_logic_and_math_vec: +; CHECK: # BB#0: +; CHECK-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 +; CHECK-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 +; CHECK-NEXT: vpcmpeqd %xmm3, %xmm2, %xmm2 +; CHECK-NEXT: vpxor %xmm1, %xmm2, %xmm1 +; CHECK-NEXT: vpandn %xmm1, %xmm0, %xmm0 +; CHECK-NEXT: vpbroadcastd {{.*}}(%rip), %xmm1 +; CHECK-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; CHECK-NEXT: retq + %cmp1 = icmp ne <4 x i32> %a, %b + %cmp2 = icmp ne <4 x i32> %c, %d + %and = and <4 x i1> %cmp1, %cmp2 + %ext = sext <4 x i1> %and to <4 x i32> + %add = add <4 x i32> %ext, + ret <4 x i32> %add +} diff --git a/test/CodeGen/X86/clear_upper_vector_element_bits.ll b/test/CodeGen/X86/clear_upper_vector_element_bits.ll index c425e3a..ae0f440 100644 --- a/test/CodeGen/X86/clear_upper_vector_element_bits.ll +++ b/test/CodeGen/X86/clear_upper_vector_element_bits.ll @@ -928,7 +928,7 @@ define <16 x i8> @_clearupper16xi8b(<16 x i8>) nounwind { ; SSE-NEXT: pushq %r14 ; SSE-NEXT: pushq %rbx ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; SSE-NEXT: movd %xmm0, %rcx +; SSE-NEXT: movq %xmm0, %rcx ; SSE-NEXT: movq %rcx, %r8 ; SSE-NEXT: movq %rcx, %r9 ; SSE-NEXT: movq %rcx, %r10 @@ -938,7 +938,7 @@ define <16 x i8> @_clearupper16xi8b(<16 x i8>) nounwind { ; SSE-NEXT: movq %rcx, %rdi ; SSE-NEXT: andb $15, %cl ; SSE-NEXT: movb %cl, -{{[0-9]+}}(%rsp) -; SSE-NEXT: movd %xmm1, %rcx +; SSE-NEXT: movq %xmm1, %rcx ; SSE-NEXT: shrq $56, %rdi ; SSE-NEXT: andb $15, %dil ; SSE-NEXT: movb %dil, -{{[0-9]+}}(%rsp) @@ -1106,7 +1106,7 @@ define <32 x i8> @_clearupper32xi8b(<32 x i8>) nounwind { ; SSE-NEXT: pushq %r14 ; SSE-NEXT: pushq %rbx ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,0,1] -; SSE-NEXT: movd %xmm0, %rcx +; SSE-NEXT: movq %xmm0, %rcx ; SSE-NEXT: movq %rcx, %r8 ; SSE-NEXT: movq %rcx, %r9 ; SSE-NEXT: movq %rcx, %r10 @@ -1116,7 +1116,7 @@ define <32 x i8> @_clearupper32xi8b(<32 x i8>) nounwind { ; SSE-NEXT: movq %rcx, %rdi ; SSE-NEXT: andb $15, %cl ; SSE-NEXT: movb %cl, -{{[0-9]+}}(%rsp) -; SSE-NEXT: movd %xmm2, %rcx +; SSE-NEXT: movq %xmm2, %rcx ; SSE-NEXT: shrq 
$56, %rdi ; SSE-NEXT: andb $15, %dil ; SSE-NEXT: movb %dil, -{{[0-9]+}}(%rsp) diff --git a/test/CodeGen/X86/combine-srl.ll b/test/CodeGen/X86/combine-srl.ll index 44c4510..706e890 100644 --- a/test/CodeGen/X86/combine-srl.ll +++ b/test/CodeGen/X86/combine-srl.ll @@ -223,18 +223,17 @@ define <4 x i32> @combine_vec_lshr_lshr_zero1(<4 x i32> %x) { define <4 x i32> @combine_vec_lshr_trunc_lshr0(<4 x i64> %x) { ; SSE-LABEL: combine_vec_lshr_trunc_lshr0: ; SSE: # BB#0: -; SSE-NEXT: psrlq $32, %xmm1 -; SSE-NEXT: psrlq $32, %xmm0 +; SSE-NEXT: psrlq $48, %xmm1 +; SSE-NEXT: psrlq $48, %xmm0 ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2] -; SSE-NEXT: psrld $16, %xmm0 ; SSE-NEXT: retq ; ; AVX-LABEL: combine_vec_lshr_trunc_lshr0: ; AVX: # BB#0: -; AVX-NEXT: vpsrlq $32, %ymm0, %ymm0 +; AVX-NEXT: vpsrlq $48, %ymm0, %ymm0 ; AVX-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7] ; AVX-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] -; AVX-NEXT: vpsrld $16, %xmm0, %xmm0 +; AVX-NEXT: # kill: %XMM0 %XMM0 %YMM0 ; AVX-NEXT: vzeroupper ; AVX-NEXT: retq %1 = lshr <4 x i64> %x, diff --git a/test/CodeGen/X86/combine-udiv.ll b/test/CodeGen/X86/combine-udiv.ll index 71f6c3e..e1e8499 100644 --- a/test/CodeGen/X86/combine-udiv.ll +++ b/test/CodeGen/X86/combine-udiv.ll @@ -76,6 +76,53 @@ define <4 x i32> @combine_vec_udiv_by_pow2b(<4 x i32> %x) { ret <4 x i32> %1 } +define <4 x i32> @combine_vec_udiv_by_pow2c(<4 x i32> %x, <4 x i32> %y) { +; SSE-LABEL: combine_vec_udiv_by_pow2c: +; SSE: # BB#0: +; SSE-NEXT: movdqa %xmm1, %xmm2 +; SSE-NEXT: psrldq {{.*#+}} xmm2 = xmm2[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; SSE-NEXT: movdqa %xmm0, %xmm3 +; SSE-NEXT: psrld %xmm2, %xmm3 +; SSE-NEXT: movdqa %xmm1, %xmm2 +; SSE-NEXT: psrlq $32, %xmm2 +; SSE-NEXT: movdqa %xmm0, %xmm4 +; SSE-NEXT: psrld %xmm2, %xmm4 +; SSE-NEXT: pblendw {{.*#+}} xmm4 = xmm4[0,1,2,3],xmm3[4,5,6,7] +; SSE-NEXT: pxor %xmm2, %xmm2 +; SSE-NEXT: pmovzxdq {{.*#+}} xmm3 = xmm1[0],zero,xmm1[1],zero +; SSE-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm2[2],xmm1[3],xmm2[3] +; SSE-NEXT: movdqa %xmm0, %xmm2 +; SSE-NEXT: psrld %xmm1, %xmm2 +; SSE-NEXT: psrld %xmm3, %xmm0 +; SSE-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm2[4,5,6,7] +; SSE-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm4[2,3],xmm0[4,5],xmm4[6,7] +; SSE-NEXT: retq +; +; AVX1-LABEL: combine_vec_udiv_by_pow2c: +; AVX1: # BB#0: +; AVX1-NEXT: vpsrldq {{.*#+}} xmm2 = xmm1[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX1-NEXT: vpsrld %xmm2, %xmm0, %xmm2 +; AVX1-NEXT: vpsrlq $32, %xmm1, %xmm3 +; AVX1-NEXT: vpsrld %xmm3, %xmm0, %xmm3 +; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1,2,3],xmm2[4,5,6,7] +; AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3 +; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm3 = xmm1[2],xmm3[2],xmm1[3],xmm3[3] +; AVX1-NEXT: vpsrld %xmm3, %xmm0, %xmm3 +; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero +; AVX1-NEXT: vpsrld %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm3[4,5,6,7] +; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7] +; AVX1-NEXT: retq +; +; AVX2-LABEL: combine_vec_udiv_by_pow2c: +; AVX2: # BB#0: +; AVX2-NEXT: vpsrlvd %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: retq + %1 = shl <4 x i32> , %y + %2 = udiv <4 x i32> %x, %1 + ret <4 x i32> %2 +} + ; fold (udiv x, (shl c, y)) -> x >>u (log2(c)+y) iff c is power of 2 define <4 x i32> @combine_vec_udiv_by_shl_pow2a(<4 x i32> %x, <4 x i32> %y) { ; SSE-LABEL: combine_vec_udiv_by_shl_pow2a: diff --git a/test/CodeGen/X86/combine-urem.ll 
b/test/CodeGen/X86/combine-urem.ll index f412e9c..91da268 100644 --- a/test/CodeGen/X86/combine-urem.ll +++ b/test/CodeGen/X86/combine-urem.ll @@ -64,6 +64,99 @@ define <4 x i32> @combine_vec_urem_by_pow2b(<4 x i32> %x) { ret <4 x i32> %1 } +define <4 x i32> @combine_vec_urem_by_pow2c(<4 x i32> %x, <4 x i32> %y) { +; SSE-LABEL: combine_vec_urem_by_pow2c: +; SSE: # BB#0: +; SSE-NEXT: pslld $23, %xmm1 +; SSE-NEXT: paddd {{.*}}(%rip), %xmm1 +; SSE-NEXT: cvttps2dq %xmm1, %xmm1 +; SSE-NEXT: pcmpeqd %xmm2, %xmm2 +; SSE-NEXT: paddd %xmm1, %xmm2 +; SSE-NEXT: pand %xmm2, %xmm0 +; SSE-NEXT: retq +; +; AVX1-LABEL: combine_vec_urem_by_pow2c: +; AVX1: # BB#0: +; AVX1-NEXT: vpslld $23, %xmm1, %xmm1 +; AVX1-NEXT: vpaddd {{.*}}(%rip), %xmm1, %xmm1 +; AVX1-NEXT: vcvttps2dq %xmm1, %xmm1 +; AVX1-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2 +; AVX1-NEXT: vpaddd %xmm2, %xmm1, %xmm1 +; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: combine_vec_urem_by_pow2c: +; AVX2: # BB#0: +; AVX2-NEXT: vpbroadcastd {{.*}}(%rip), %xmm2 +; AVX2-NEXT: vpsllvd %xmm1, %xmm2, %xmm1 +; AVX2-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2 +; AVX2-NEXT: vpaddd %xmm2, %xmm1, %xmm1 +; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: retq + %1 = shl <4 x i32> , %y + %2 = urem <4 x i32> %x, %1 + ret <4 x i32> %2 +} + +define <4 x i32> @combine_vec_urem_by_pow2d(<4 x i32> %x, <4 x i32> %y) { +; SSE-LABEL: combine_vec_urem_by_pow2d: +; SSE: # BB#0: +; SSE-NEXT: movdqa %xmm1, %xmm2 +; SSE-NEXT: psrldq {{.*#+}} xmm2 = xmm2[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; SSE-NEXT: movdqa {{.*#+}} xmm3 = [2147483648,2147483648,2147483648,2147483648] +; SSE-NEXT: movdqa %xmm3, %xmm4 +; SSE-NEXT: psrld %xmm2, %xmm4 +; SSE-NEXT: movdqa %xmm1, %xmm2 +; SSE-NEXT: psrlq $32, %xmm2 +; SSE-NEXT: movdqa %xmm3, %xmm5 +; SSE-NEXT: psrld %xmm2, %xmm5 +; SSE-NEXT: pblendw {{.*#+}} xmm5 = xmm5[0,1,2,3],xmm4[4,5,6,7] +; SSE-NEXT: pxor %xmm2, %xmm2 +; SSE-NEXT: pmovzxdq {{.*#+}} xmm4 = xmm1[0],zero,xmm1[1],zero +; SSE-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm2[2],xmm1[3],xmm2[3] +; SSE-NEXT: movdqa %xmm3, %xmm2 +; SSE-NEXT: psrld %xmm1, %xmm2 +; SSE-NEXT: psrld %xmm4, %xmm3 +; SSE-NEXT: pblendw {{.*#+}} xmm3 = xmm3[0,1,2,3],xmm2[4,5,6,7] +; SSE-NEXT: pblendw {{.*#+}} xmm3 = xmm3[0,1],xmm5[2,3],xmm3[4,5],xmm5[6,7] +; SSE-NEXT: pcmpeqd %xmm1, %xmm1 +; SSE-NEXT: paddd %xmm3, %xmm1 +; SSE-NEXT: pand %xmm1, %xmm0 +; SSE-NEXT: retq +; +; AVX1-LABEL: combine_vec_urem_by_pow2d: +; AVX1: # BB#0: +; AVX1-NEXT: vpsrldq {{.*#+}} xmm2 = xmm1[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [2147483648,2147483648,2147483648,2147483648] +; AVX1-NEXT: vpsrld %xmm2, %xmm3, %xmm2 +; AVX1-NEXT: vpsrlq $32, %xmm1, %xmm4 +; AVX1-NEXT: vpsrld %xmm4, %xmm3, %xmm4 +; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm4[0,1,2,3],xmm2[4,5,6,7] +; AVX1-NEXT: vpxor %xmm4, %xmm4, %xmm4 +; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm4 = xmm1[2],xmm4[2],xmm1[3],xmm4[3] +; AVX1-NEXT: vpsrld %xmm4, %xmm3, %xmm4 +; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero +; AVX1-NEXT: vpsrld %xmm1, %xmm3, %xmm1 +; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm4[4,5,6,7] +; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7] +; AVX1-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2 +; AVX1-NEXT: vpaddd %xmm2, %xmm1, %xmm1 +; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: combine_vec_urem_by_pow2d: +; AVX2: # BB#0: +; AVX2-NEXT: vpbroadcastd {{.*}}(%rip), %xmm2 +; 
AVX2-NEXT: vpsrlvd %xmm1, %xmm2, %xmm1 +; AVX2-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2 +; AVX2-NEXT: vpaddd %xmm2, %xmm1, %xmm1 +; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: retq + %1 = lshr <4 x i32> , %y + %2 = urem <4 x i32> %x, %1 + ret <4 x i32> %2 +} + ; fold (urem x, (shl pow2, y)) -> (and x, (add (shl pow2, y), -1)) define <4 x i32> @combine_vec_urem_by_shl_pow2a(<4 x i32> %x, <4 x i32> %y) { ; SSE-LABEL: combine_vec_urem_by_shl_pow2a: diff --git a/test/CodeGen/X86/constant-hoisting-bfi.ll b/test/CodeGen/X86/constant-hoisting-bfi.ll new file mode 100644 index 0000000..83589b7 --- /dev/null +++ b/test/CodeGen/X86/constant-hoisting-bfi.ll @@ -0,0 +1,115 @@ +; RUN: opt -consthoist -mtriple=x86_64-unknown-linux-gnu -consthoist-with-block-frequency=true -S < %s | FileCheck %s + +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" + +; Check when BFI is enabled for constant hoisting, constant 214748364701 +; will not be hoisted to the func entry. +; CHECK-LABEL: @foo( +; CHECK: entry: +; CHECK-NOT: bitcast i64 214748364701 to i64 +; CHECK: if.then: + +; Function Attrs: norecurse nounwind uwtable +define i64 @foo(i64* nocapture %a) { +entry: + %arrayidx = getelementptr inbounds i64, i64* %a, i64 9 + %t0 = load i64, i64* %arrayidx, align 8 + %cmp = icmp slt i64 %t0, 564 + br i1 %cmp, label %if.then, label %if.else5 + +if.then: ; preds = %entry + %arrayidx1 = getelementptr inbounds i64, i64* %a, i64 5 + %t1 = load i64, i64* %arrayidx1, align 8 + %cmp2 = icmp slt i64 %t1, 1009 + br i1 %cmp2, label %if.then3, label %return + +if.then3: ; preds = %if.then + %arrayidx4 = getelementptr inbounds i64, i64* %a, i64 6 + %t2 = load i64, i64* %arrayidx4, align 8 + %inc = add nsw i64 %t2, 1 + store i64 %inc, i64* %arrayidx4, align 8 + br label %return + +if.else5: ; preds = %entry + %arrayidx6 = getelementptr inbounds i64, i64* %a, i64 6 + %t3 = load i64, i64* %arrayidx6, align 8 + %cmp7 = icmp slt i64 %t3, 3512 + br i1 %cmp7, label %if.then8, label %return + +if.then8: ; preds = %if.else5 + %arrayidx9 = getelementptr inbounds i64, i64* %a, i64 7 + %t4 = load i64, i64* %arrayidx9, align 8 + %inc10 = add nsw i64 %t4, 1 + store i64 %inc10, i64* %arrayidx9, align 8 + br label %return + +return: ; preds = %if.else5, %if.then, %if.then8, %if.then3 + %retval.0 = phi i64 [ 214748364701, %if.then3 ], [ 214748364701, %if.then8 ], [ 250148364702, %if.then ], [ 256148364704, %if.else5 ] + ret i64 %retval.0 +} + +; Check when BFI is enabled for constant hoisting, constant 214748364701 +; in while.body will be hoisted to while.body.preheader. 214748364701 in +; if.then16 and if.else10 will be merged and hoisted to the beginning of +; if.else10 because if.else10 dominates if.then16. 
+; CHECK-LABEL: @goo( +; CHECK: entry: +; CHECK-NOT: bitcast i64 214748364701 to i64 +; CHECK: while.body.preheader: +; CHECK-NEXT: bitcast i64 214748364701 to i64 +; CHECK-NOT: bitcast i64 214748364701 to i64 +; CHECK: if.else10: +; CHECK-NEXT: bitcast i64 214748364701 to i64 +; CHECK-NOT: bitcast i64 214748364701 to i64 +define i64 @goo(i64* nocapture %a) { +entry: + %arrayidx = getelementptr inbounds i64, i64* %a, i64 9 + %t0 = load i64, i64* %arrayidx, align 8 + %cmp = icmp ult i64 %t0, 56 + br i1 %cmp, label %if.then, label %if.else10, !prof !0 + +if.then: ; preds = %entry + %arrayidx1 = getelementptr inbounds i64, i64* %a, i64 5 + %t1 = load i64, i64* %arrayidx1, align 8 + %cmp2 = icmp ult i64 %t1, 10 + br i1 %cmp2, label %while.cond.preheader, label %return, !prof !0 + +while.cond.preheader: ; preds = %if.then + %arrayidx7 = getelementptr inbounds i64, i64* %a, i64 6 + %t2 = load i64, i64* %arrayidx7, align 8 + %cmp823 = icmp ugt i64 %t2, 10000 + br i1 %cmp823, label %while.body.preheader, label %return + +while.body.preheader: ; preds = %while.cond.preheader + br label %while.body + +while.body: ; preds = %while.body.preheader, %while.body + %t3 = phi i64 [ %add, %while.body ], [ %t2, %while.body.preheader ] + %add = add i64 %t3, 214748364701 + %cmp8 = icmp ugt i64 %add, 10000 + br i1 %cmp8, label %while.body, label %while.cond.return.loopexit_crit_edge + +if.else10: ; preds = %entry + %arrayidx11 = getelementptr inbounds i64, i64* %a, i64 6 + %t4 = load i64, i64* %arrayidx11, align 8 + %add2 = add i64 %t4, 214748364701 + %cmp12 = icmp ult i64 %add2, 35 + br i1 %cmp12, label %if.then16, label %return, !prof !0 + +if.then16: ; preds = %if.else10 + %arrayidx17 = getelementptr inbounds i64, i64* %a, i64 7 + %t5 = load i64, i64* %arrayidx17, align 8 + %inc = add i64 %t5, 1 + store i64 %inc, i64* %arrayidx17, align 8 + br label %return + +while.cond.return.loopexit_crit_edge: ; preds = %while.body + store i64 %add, i64* %arrayidx7, align 8 + br label %return + +return: ; preds = %while.cond.preheader, %while.cond.return.loopexit_crit_edge, %if.else10, %if.then, %if.then16 + %retval.0 = phi i64 [ 214748364701, %if.then16 ], [ 0, %if.then ], [ 0, %if.else10 ], [ 0, %while.cond.return.loopexit_crit_edge ], [ 0, %while.cond.preheader ] + ret i64 %retval.0 +} + +!0 = !{!"branch_weights", i32 1, i32 2000} diff --git a/test/CodeGen/X86/dagcombine-cse.ll b/test/CodeGen/X86/dagcombine-cse.ll index a283bcc..726e30f 100644 --- a/test/CodeGen/X86/dagcombine-cse.ll +++ b/test/CodeGen/X86/dagcombine-cse.ll @@ -30,7 +30,7 @@ define i32 @t(i8* %ref_frame_ptr, i32 %ref_frame_stride, i32 %idxX, i32 %idxY) n ; X64-NEXT: shlq $32, %rcx ; X64-NEXT: movl (%rdi,%rax), %eax ; X64-NEXT: orq %rcx, %rax -; X64-NEXT: movd %rax, %xmm0 +; X64-NEXT: movq %rax, %xmm0 ; X64-NEXT: movq {{.*#+}} xmm0 = xmm0[0],zero ; X64-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,1,2,4,5,6,7] ; X64-NEXT: movd %xmm0, %eax diff --git a/test/CodeGen/X86/dwarf-headers.ll b/test/CodeGen/X86/dwarf-headers.ll index 612807d..c2111f6 100644 --- a/test/CodeGen/X86/dwarf-headers.ll +++ b/test/CodeGen/X86/dwarf-headers.ll @@ -1,16 +1,16 @@ -; RUN: llc -split-dwarf=Disable -dwarf-version=4 -generate-type-units \ +; RUN: llc -dwarf-version=4 -generate-type-units \ ; RUN: -filetype=obj -O0 -mtriple=x86_64-unknown-linux-gnu < %s \ ; RUN: | llvm-dwarfdump - | FileCheck %s --check-prefix=SINGLE-4 -; RUN: llc -split-dwarf=Enable -dwarf-version=4 -generate-type-units \ +; RUN: llc -split-dwarf-file=foo.dwo -dwarf-version=4 -generate-type-units \ ; RUN: 
-filetype=obj -O0 -mtriple=x86_64-unknown-linux-gnu < %s \ ; RUN: | llvm-dwarfdump - | FileCheck %s --check-prefix=SPLIT-4 -; RUN: llc -split-dwarf=Disable -dwarf-version=5 -generate-type-units \ +; RUN: llc -dwarf-version=5 -generate-type-units \ ; RUN: -filetype=obj -O0 -mtriple=x86_64-unknown-linux-gnu < %s \ ; RUN: | llvm-dwarfdump - | FileCheck %s --check-prefix=SINGLE-5 -; RUN: llc -split-dwarf=Enable -dwarf-version=5 -generate-type-units \ +; RUN: llc -split-dwarf-file=foo.dwo -dwarf-version=5 -generate-type-units \ ; RUN: -filetype=obj -O0 -mtriple=x86_64-unknown-linux-gnu < %s \ ; RUN: | llvm-dwarfdump - | FileCheck %s --check-prefix=SPLIT-5 diff --git a/test/CodeGen/X86/eh-frame-unreachable.ll b/test/CodeGen/X86/eh-frame-unreachable.ll new file mode 100644 index 0000000..a7abc8a --- /dev/null +++ b/test/CodeGen/X86/eh-frame-unreachable.ll @@ -0,0 +1,11 @@ +; RUN: llc < %s -mtriple=x86_64-apple-darwin | FileCheck %s +; Test that we don't emit a row that extends beyond the FDE's range_size. +; +; CHECK: movq %rsp, %rbp +; CHECK-NEXT: .cfi_endproc +; CHECK-NOT: .cfi + +define void @f() #0 { + unreachable +} +attributes #0 = { "no-frame-pointer-elim"="true" } diff --git a/test/CodeGen/X86/empty-function.ll b/test/CodeGen/X86/empty-function.ll new file mode 100644 index 0000000..92bebd0 --- /dev/null +++ b/test/CodeGen/X86/empty-function.ll @@ -0,0 +1,22 @@ +; RUN: llc < %s -mtriple=i686-pc-win32 | FileCheck -check-prefix=CHECK -check-prefix=WIN32 %s +; RUN: llc < %s -mtriple=x86_64-pc-win32 | FileCheck -check-prefix=CHECK -check-prefix=WIN64 %s +; RUN: llc < %s -mtriple=i386-linux-gnu | FileCheck -check-prefix=LINUX %s + +target datalayout = "e-m:x-p:32:32-i64:64-f80:32-n8:16:32-a:0:32-S32" +target triple = "i686-pc-windows-msvc18.0.0" + +; Don't emit empty functions on Windows; it can lead to duplicate entries +; (multiple functions sharing the same RVA) in the Guard CF Function Table which +; the kernel refuses to load. + +define void @f() { +entry: + unreachable + +; CHECK-LABEL: f: +; WIN32: nop +; WIN64: ud2 +; LINUX-NOT: nop +; LINUX-NOT: ud2 + +} diff --git a/test/CodeGen/X86/empty-functions.ll b/test/CodeGen/X86/empty-functions.ll index 735df2a..0c13953 100644 --- a/test/CodeGen/X86/empty-functions.ll +++ b/test/CodeGen/X86/empty-functions.ll @@ -23,8 +23,6 @@ entry: ; CHECK-FP-NEXT: : ; CHECK-FP-NEXT: .cfi_offset %rbp, -16 ; CHECK-FP-NEXT: movq %rsp, %rbp -; CHECK-FP-NEXT: : -; CHECK-FP-NEXT: .cfi_def_cfa_register %rbp ; CHECK-FP-NEXT: .cfi_endproc ; An empty function is perfectly fine on ELF. @@ -35,9 +33,7 @@ entry: ; LINUX-NO-FP-NEXT: .size func, .L{{.*}}-func ; LINUX-NO-FP-NEXT: .cfi_endproc -; A cfi directive can point to the end of a function. It (and in fact the -; entire body) could be optimized out because of the unreachable, but we -; don't do it right now. +; A cfi directive cannot point to the end of a function. 
; LINUX-FP: func: ; LINUX-FP-NEXT: .cfi_startproc ; LINUX-FP-NEXT: {{^}}# @@ -48,7 +44,5 @@ entry: ; LINUX-FP-NEXT: .cfi_offset %rbp, -16 ; LINUX-FP-NEXT: movq %rsp, %rbp ; LINUX-FP-NEXT:{{^}}.L{{.*}}:{{$}} -; LINUX-FP-NEXT: .cfi_def_cfa_register %rbp -; LINUX-FP-NEXT:{{^}}.L{{.*}}:{{$}} ; LINUX-FP-NEXT: .size func, .Lfunc_end0-func ; LINUX-FP-NEXT: .cfi_endproc diff --git a/test/CodeGen/X86/extractelement-index.ll b/test/CodeGen/X86/extractelement-index.ll index e36e33f..228ce70 100644 --- a/test/CodeGen/X86/extractelement-index.ll +++ b/test/CodeGen/X86/extractelement-index.ll @@ -320,7 +320,7 @@ define i32 @extractelement_v8i32_7(<8 x i32> %a) nounwind { define i64 @extractelement_v2i64_0(<2 x i64> %a, i256 %i) nounwind { ; SSE-LABEL: extractelement_v2i64_0: ; SSE: # BB#0: -; SSE-NEXT: movd %xmm0, %rax +; SSE-NEXT: movq %xmm0, %rax ; SSE-NEXT: retq ; ; AVX-LABEL: extractelement_v2i64_0: @@ -335,7 +335,7 @@ define i64 @extractelement_v2i64_1(<2 x i64> %a, i256 %i) nounwind { ; SSE2-LABEL: extractelement_v2i64_1: ; SSE2: # BB#0: ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] -; SSE2-NEXT: movd %xmm0, %rax +; SSE2-NEXT: movq %xmm0, %rax ; SSE2-NEXT: retq ; ; SSE41-LABEL: extractelement_v2i64_1: @@ -355,7 +355,7 @@ define i64 @extractelement_v4i64_1(<4 x i64> %a, i256 %i) nounwind { ; SSE2-LABEL: extractelement_v4i64_1: ; SSE2: # BB#0: ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] -; SSE2-NEXT: movd %xmm0, %rax +; SSE2-NEXT: movq %xmm0, %rax ; SSE2-NEXT: retq ; ; SSE41-LABEL: extractelement_v4i64_1: @@ -376,7 +376,7 @@ define i64 @extractelement_v4i64_3(<4 x i64> %a, i256 %i) nounwind { ; SSE2-LABEL: extractelement_v4i64_3: ; SSE2: # BB#0: ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1] -; SSE2-NEXT: movd %xmm0, %rax +; SSE2-NEXT: movq %xmm0, %rax ; SSE2-NEXT: retq ; ; SSE41-LABEL: extractelement_v4i64_3: diff --git a/test/CodeGen/X86/fold-tied-op.ll b/test/CodeGen/X86/fold-tied-op.ll index 5c48119..d68236e 100644 --- a/test/CodeGen/X86/fold-tied-op.ll +++ b/test/CodeGen/X86/fold-tied-op.ll @@ -7,7 +7,6 @@ target triple = "i386--netbsd" ; CHECK-LABEL: fn1 ; CHECK: addl {{.*#+}} 4-byte Folded Reload -; CHECK: addl {{.*#+}} 4-byte Folded Reload ; CHECK: imull {{.*#+}} 4-byte Folded Reload ; CHECK: orl {{.*#+}} 4-byte Folded Reload ; CHECK: retl diff --git a/test/CodeGen/X86/gather-addresses.ll b/test/CodeGen/X86/gather-addresses.ll index f7d4eb3..c310967 100644 --- a/test/CodeGen/X86/gather-addresses.ll +++ b/test/CodeGen/X86/gather-addresses.ll @@ -11,7 +11,7 @@ ; LIN: movdqa (%rsi), %xmm0 ; LIN: pand (%rdx), %xmm0 ; LIN: pextrq $1, %xmm0, %r[[REG4:.+]] -; LIN: movd %xmm0, %r[[REG2:.+]] +; LIN: movq %xmm0, %r[[REG2:.+]] ; LIN: movslq %e[[REG2]], %r[[REG1:.+]] ; LIN: sarq $32, %r[[REG2]] ; LIN: movslq %e[[REG4]], %r[[REG3:.+]] @@ -24,7 +24,7 @@ ; WIN: movdqa (%rdx), %xmm0 ; WIN: pand (%r8), %xmm0 ; WIN: pextrq $1, %xmm0, %r[[REG4:.+]] -; WIN: movd %xmm0, %r[[REG2:.+]] +; WIN: movq %xmm0, %r[[REG2:.+]] ; WIN: movslq %e[[REG2]], %r[[REG1:.+]] ; WIN: sarq $32, %r[[REG2]] ; WIN: movslq %e[[REG4]], %r[[REG3:.+]] diff --git a/test/CodeGen/X86/i256-add.ll b/test/CodeGen/X86/i256-add.ll index a745f65..7b26568 100644 --- a/test/CodeGen/X86/i256-add.ll +++ b/test/CodeGen/X86/i256-add.ll @@ -12,34 +12,35 @@ define void @add(i256* %p, i256* %q) nounwind { ; X32-NEXT: subl $12, %esp ; X32-NEXT: movl {{[0-9]+}}(%esp), %eax ; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X32-NEXT: movl 8(%ecx), %edx -; X32-NEXT: movl (%ecx), %ebx -; X32-NEXT: movl 4(%ecx), %edi +; X32-NEXT: movl 8(%ecx), %edi +; 
X32-NEXT: movl (%ecx), %edx +; X32-NEXT: movl 4(%ecx), %ebx ; X32-NEXT: movl 28(%eax), %esi ; X32-NEXT: movl %esi, {{[0-9]+}}(%esp) # 4-byte Spill ; X32-NEXT: movl 24(%eax), %ebp -; X32-NEXT: addl (%eax), %ebx -; X32-NEXT: adcl 4(%eax), %edi -; X32-NEXT: adcl 8(%eax), %edx +; X32-NEXT: addl (%eax), %edx ; X32-NEXT: movl %edx, {{[0-9]+}}(%esp) # 4-byte Spill -; X32-NEXT: movl 20(%eax), %esi +; X32-NEXT: adcl 4(%eax), %ebx +; X32-NEXT: adcl 8(%eax), %edi +; X32-NEXT: movl %edi, (%esp) # 4-byte Spill +; X32-NEXT: movl 20(%eax), %edi ; X32-NEXT: movl 12(%eax), %edx -; X32-NEXT: movl 16(%eax), %eax +; X32-NEXT: movl 16(%eax), %esi ; X32-NEXT: adcl 12(%ecx), %edx -; X32-NEXT: adcl 16(%ecx), %eax -; X32-NEXT: adcl 20(%ecx), %esi -; X32-NEXT: adcl 24(%ecx), %ebp -; X32-NEXT: movl %ebp, (%esp) # 4-byte Spill +; X32-NEXT: adcl 16(%ecx), %esi +; X32-NEXT: adcl 20(%ecx), %edi +; X32-NEXT: movl %ebp, %eax +; X32-NEXT: adcl 24(%ecx), %eax ; X32-NEXT: movl {{[0-9]+}}(%esp), %ebp # 4-byte Reload ; X32-NEXT: adcl %ebp, 28(%ecx) +; X32-NEXT: movl (%esp), %ebp # 4-byte Reload +; X32-NEXT: movl %ebp, 8(%ecx) +; X32-NEXT: movl %ebx, 4(%ecx) +; X32-NEXT: movl {{[0-9]+}}(%esp), %ebx # 4-byte Reload ; X32-NEXT: movl %ebx, (%ecx) -; X32-NEXT: movl %edi, 4(%ecx) -; X32-NEXT: movl {{[0-9]+}}(%esp), %edi # 4-byte Reload -; X32-NEXT: movl %edi, 8(%ecx) ; X32-NEXT: movl %edx, 12(%ecx) -; X32-NEXT: movl %eax, 16(%ecx) -; X32-NEXT: movl %esi, 20(%ecx) -; X32-NEXT: movl (%esp), %eax # 4-byte Reload +; X32-NEXT: movl %esi, 16(%ecx) +; X32-NEXT: movl %edi, 20(%ecx) ; X32-NEXT: movl %eax, 24(%ecx) ; X32-NEXT: addl $12, %esp ; X32-NEXT: popl %esi @@ -58,9 +59,9 @@ define void @add(i256* %p, i256* %q) nounwind { ; X64-NEXT: adcq 8(%rsi), %rdx ; X64-NEXT: adcq 16(%rsi), %rax ; X64-NEXT: adcq %r8, 24(%rdi) -; X64-NEXT: movq %rcx, (%rdi) -; X64-NEXT: movq %rdx, 8(%rdi) ; X64-NEXT: movq %rax, 16(%rdi) +; X64-NEXT: movq %rdx, 8(%rdi) +; X64-NEXT: movq %rcx, (%rdi) ; X64-NEXT: retq %a = load i256, i256* %p %b = load i256, i256* %q @@ -96,9 +97,9 @@ define void @sub(i256* %p, i256* %q) nounwind { ; X32-NEXT: sbbl 24(%esi), %eax ; X32-NEXT: movl 28(%esi), %esi ; X32-NEXT: sbbl %esi, 28(%ecx) -; X32-NEXT: movl %ebx, (%ecx) -; X32-NEXT: movl %ebp, 4(%ecx) ; X32-NEXT: movl %edi, 8(%ecx) +; X32-NEXT: movl %ebp, 4(%ecx) +; X32-NEXT: movl %ebx, (%ecx) ; X32-NEXT: movl {{[0-9]+}}(%esp), %esi # 4-byte Reload ; X32-NEXT: movl %esi, 12(%ecx) ; X32-NEXT: movl (%esp), %esi # 4-byte Reload @@ -122,9 +123,9 @@ define void @sub(i256* %p, i256* %q) nounwind { ; X64-NEXT: sbbq 8(%rsi), %rdx ; X64-NEXT: sbbq 16(%rsi), %rax ; X64-NEXT: sbbq %r8, 24(%rdi) -; X64-NEXT: movq %rcx, (%rdi) -; X64-NEXT: movq %rdx, 8(%rdi) ; X64-NEXT: movq %rax, 16(%rdi) +; X64-NEXT: movq %rdx, 8(%rdi) +; X64-NEXT: movq %rcx, (%rdi) ; X64-NEXT: retq %a = load i256, i256* %p %b = load i256, i256* %q diff --git a/test/CodeGen/X86/i64-to-float.ll b/test/CodeGen/X86/i64-to-float.ll index 3da1a36..f2fbff1 100644 --- a/test/CodeGen/X86/i64-to-float.ll +++ b/test/CodeGen/X86/i64-to-float.ll @@ -251,11 +251,11 @@ define <2 x double> @clamp_sitofp_2i64_2f64(<2 x i64> %a) nounwind { ; X64-SSE-NEXT: pandn %xmm3, %xmm0 ; X64-SSE-NEXT: pand {{.*}}(%rip), %xmm1 ; X64-SSE-NEXT: por %xmm0, %xmm1 -; X64-SSE-NEXT: movd %xmm1, %rax +; X64-SSE-NEXT: movq %xmm1, %rax ; X64-SSE-NEXT: xorps %xmm0, %xmm0 ; X64-SSE-NEXT: cvtsi2sdq %rax, %xmm0 ; X64-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1] -; X64-SSE-NEXT: movd %xmm1, %rax +; X64-SSE-NEXT: movq %xmm1, %rax ; X64-SSE-NEXT: xorps %xmm1, %xmm1 ; 
X64-SSE-NEXT: cvtsi2sdq %rax, %xmm1 ; X64-SSE-NEXT: unpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0] diff --git a/test/CodeGen/X86/insertelement-duplicates.ll b/test/CodeGen/X86/insertelement-duplicates.ll new file mode 100644 index 0000000..b073433 --- /dev/null +++ b/test/CodeGen/X86/insertelement-duplicates.ll @@ -0,0 +1,58 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefix=SSE-32 +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefix=SSE-64 +; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+avx | FileCheck %s --check-prefix=AVX-32 +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefix=AVX-64 + +define void @PR15298(<4 x float>* nocapture %source, <8 x float>* nocapture %dest) nounwind noinline { +; SSE-32-LABEL: PR15298: +; SSE-32: # BB#0: # %L.entry +; SSE-32-NEXT: movl {{[0-9]+}}(%esp), %eax +; SSE-32-NEXT: movl {{[0-9]+}}(%esp), %ecx +; SSE-32-NEXT: movaps 304(%ecx), %xmm0 +; SSE-32-NEXT: xorps %xmm1, %xmm1 +; SSE-32-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,0],xmm1[0,1] +; SSE-32-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0,1,3] +; SSE-32-NEXT: movups %xmm1, 624(%eax) +; SSE-32-NEXT: movups %xmm0, 608(%eax) +; SSE-32-NEXT: retl +; +; SSE-64-LABEL: PR15298: +; SSE-64: # BB#0: # %L.entry +; SSE-64-NEXT: movaps 304(%rdi), %xmm0 +; SSE-64-NEXT: xorps %xmm1, %xmm1 +; SSE-64-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,0],xmm1[0,1] +; SSE-64-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0,1,3] +; SSE-64-NEXT: movups %xmm1, 624(%rsi) +; SSE-64-NEXT: movups %xmm0, 608(%rsi) +; SSE-64-NEXT: retq +; +; AVX-32-LABEL: PR15298: +; AVX-32: # BB#0: # %L.entry +; AVX-32-NEXT: movl {{[0-9]+}}(%esp), %eax +; AVX-32-NEXT: movl {{[0-9]+}}(%esp), %ecx +; AVX-32-NEXT: vbroadcastss 304(%ecx), %xmm0 +; AVX-32-NEXT: vxorps %ymm1, %ymm1, %ymm1 +; AVX-32-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1,2],ymm1[3,4,5,6,7] +; AVX-32-NEXT: vmovups %ymm0, 608(%eax) +; AVX-32-NEXT: vzeroupper +; AVX-32-NEXT: retl +; +; AVX-64-LABEL: PR15298: +; AVX-64: # BB#0: # %L.entry +; AVX-64-NEXT: vbroadcastss 304(%rdi), %xmm0 +; AVX-64-NEXT: vxorps %ymm1, %ymm1, %ymm1 +; AVX-64-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1,2],ymm1[3,4,5,6,7] +; AVX-64-NEXT: vmovups %ymm0, 608(%rsi) +; AVX-64-NEXT: vzeroupper +; AVX-64-NEXT: retq +L.entry: + %0 = getelementptr inbounds <4 x float>, <4 x float>* %source, i32 19 + %1 = load <4 x float>, <4 x float>* %0, align 16 + %2 = extractelement <4 x float> %1, i32 0 + %3 = insertelement <8 x float> , float %2, i32 2 + %4 = insertelement <8 x float> %3, float %2, i32 1 + %5 = getelementptr <8 x float>, <8 x float>* %dest, i32 19 + store <8 x float> %4, <8 x float>* %5, align 4 + ret void +} diff --git a/test/CodeGen/X86/isint.ll b/test/CodeGen/X86/isint.ll index ea38d9e..89e5f94 100644 --- a/test/CodeGen/X86/isint.ll +++ b/test/CodeGen/X86/isint.ll @@ -1,8 +1,7 @@ -; RUN: llc < %s -mtriple=x86_64-pc-unknown -mattr=+sse2 | FileCheck %s -; RUN: llc < %s -mtriple=i686-pc-unknown -mattr=+sse2 | FileCheck %s +; RUN: llc < %s -mtriple=x86_64-pc-unknown -mattr=+sse2 | FileCheck -check-prefix=CHECK -check-prefix=CHECK64 %s ; PR19059 -; RUN: llc < %s -mtriple=i686-pc-unknown -mattr=+sse2 | FileCheck -check-prefix=CHECK32 %s +; RUN: llc < %s -mtriple=i686-pc-unknown -mattr=+sse2 | FileCheck -check-prefix=CHECK -check-prefix=CHECK32 %s define i32 @isint_return(double %d) nounwind { ; CHECK-LABEL: isint_return: @@ -15,7 +14,8 @@ define i32 
@isint_return(double %d) nounwind { %c = fcmp oeq double %d, %e ; CHECK32-NOT: movd {{.*}}, %r{{.*}} ; CHECK32-NOT: andq -; CHECK-NEXT: movd +; CHECK32-NEXT: movd +; CHECK64-NEXT: movq ; CHECK-NEXT: andl %z = zext i1 %c to i32 ret i32 %z diff --git a/test/CodeGen/X86/lower-bitcast.ll b/test/CodeGen/X86/lower-bitcast.ll index 62020c2..79f90f4 100644 --- a/test/CodeGen/X86/lower-bitcast.ll +++ b/test/CodeGen/X86/lower-bitcast.ll @@ -44,16 +44,16 @@ define double @test2(double %A, double %B) { define i64 @test3(i64 %A) { ; CHECK-LABEL: test3: ; CHECK: # BB#0: -; CHECK-NEXT: movd %rdi, %xmm0 +; CHECK-NEXT: movq %rdi, %xmm0 ; CHECK-NEXT: addps {{.*}}(%rip), %xmm0 -; CHECK-NEXT: movd %xmm0, %rax +; CHECK-NEXT: movq %xmm0, %rax ; CHECK-NEXT: retq ; ; CHECK-WIDE-LABEL: test3: ; CHECK-WIDE: # BB#0: -; CHECK-WIDE-NEXT: movd %rdi, %xmm0 +; CHECK-WIDE-NEXT: movq %rdi, %xmm0 ; CHECK-WIDE-NEXT: addps {{.*}}(%rip), %xmm0 -; CHECK-WIDE-NEXT: movd %xmm0, %rax +; CHECK-WIDE-NEXT: movq %xmm0, %rax ; CHECK-WIDE-NEXT: retq %1 = bitcast i64 %A to <2 x float> %add = fadd <2 x float> %1, @@ -67,18 +67,18 @@ define i64 @test3(i64 %A) { define i64 @test4(i64 %A) { ; CHECK-LABEL: test4: ; CHECK: # BB#0: -; CHECK-NEXT: movd %rdi, %xmm0 +; CHECK-NEXT: movq %rdi, %xmm0 ; CHECK-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,1,3] ; CHECK-NEXT: paddd {{.*}}(%rip), %xmm0 ; CHECK-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] -; CHECK-NEXT: movd %xmm0, %rax +; CHECK-NEXT: movq %xmm0, %rax ; CHECK-NEXT: retq ; ; CHECK-WIDE-LABEL: test4: ; CHECK-WIDE: # BB#0: -; CHECK-WIDE-NEXT: movd %rdi, %xmm0 +; CHECK-WIDE-NEXT: movq %rdi, %xmm0 ; CHECK-WIDE-NEXT: paddd {{.*}}(%rip), %xmm0 -; CHECK-WIDE-NEXT: movd %xmm0, %rax +; CHECK-WIDE-NEXT: movq %xmm0, %rax ; CHECK-WIDE-NEXT: retq %1 = bitcast i64 %A to <2 x i32> %add = add <2 x i32> %1, diff --git a/test/CodeGen/X86/memcpy-struct-by-value.ll b/test/CodeGen/X86/memcpy-struct-by-value.ll new file mode 100644 index 0000000..2e7a64d --- /dev/null +++ b/test/CodeGen/X86/memcpy-struct-by-value.ll @@ -0,0 +1,48 @@ +; RUN: llc -mtriple=x86_64-linux-gnu -mattr=-ermsb < %s -o - | FileCheck %s --check-prefix=ALL --check-prefix=NOFAST +; RUN: llc -mtriple=x86_64-linux-gnu -mattr=+ermsb < %s -o - | FileCheck %s --check-prefix=ALL --check-prefix=FAST +; RUN: llc -mtriple=i686-linux-gnu -mattr=-ermsb < %s -o - | FileCheck %s --check-prefix=ALL --check-prefix=NOFAST32 +; RUN: llc -mtriple=i686-linux-gnu -mattr=+ermsb < %s -o - | FileCheck %s --check-prefix=ALL --check-prefix=FAST +; RUN: llc -mtriple=x86_64-linux-gnu -mcpu=generic < %s -o - | FileCheck %s --check-prefix=ALL --check-prefix=NOFAST +; RUN: llc -mtriple=x86_64-linux-gnu -mcpu=haswell < %s -o - | FileCheck %s --check-prefix=ALL --check-prefix=FAST +; RUN: llc -mtriple=x86_64-linux-gnu -mcpu=skylake < %s -o - | FileCheck %s --check-prefix=ALL --check-prefix=FAST +; FIXME: The documentation states that ivybridge has ermsb, but this is not +; enabled right now since I could not confirm by testing. 
+; RUN: llc -mtriple=x86_64-linux-gnu -mcpu=ivybridge < %s -o - | FileCheck %s --check-prefix=ALL --check-prefix=NOFAST + +%struct.large = type { [4096 x i8] } + +declare void @foo(%struct.large* align 8 byval) nounwind + +define void @test1(%struct.large* nocapture %x) nounwind { + call void @foo(%struct.large* align 8 byval %x) + ret void + +; ALL-LABEL: test1: +; NOFAST: rep;movsq +; NOFAST32: rep;movsl +; FAST: rep;movsb +} + +define void @test2(%struct.large* nocapture %x) nounwind minsize { + call void @foo(%struct.large* align 8 byval %x) + ret void + +; ALL-LABEL: test2: +; NOFAST: rep;movsq +; NOFAST32: rep;movsl +; FAST: rep;movsb +} + +%struct.large_oddsize = type { [4095 x i8] } + +declare void @foo_oddsize(%struct.large_oddsize* align 8 byval) nounwind + +define void @test3(%struct.large_oddsize* nocapture %x) nounwind minsize { + call void @foo_oddsize(%struct.large_oddsize* align 8 byval %x) + ret void + +; ALL-LABEL: test3: +; NOFAST: rep;movsb +; NOFAST32: rep;movsb +; FAST: rep;movsb +} diff --git a/test/CodeGen/X86/merge_store.ll b/test/CodeGen/X86/merge_store.ll index dcb7bd0..f4c4c6d 100644 --- a/test/CodeGen/X86/merge_store.ll +++ b/test/CodeGen/X86/merge_store.ll @@ -29,17 +29,8 @@ entry: ret void } - - ;; CHECK-LABEL: indexed-store-merge - -;; We should be able to merge the 4 consecutive stores. -;; FIXMECHECK: movl $0, 2(%rsi,%rdi) - -;; CHECK: movb $0, 2(%rsi,%rdi) -;; CHECK: movb $0, 3(%rsi,%rdi) -;; CHECK: movb $0, 4(%rsi,%rdi) -;; CHECK: movb $0, 5(%rsi,%rdi) +;; CHECK: movl $0, 2(%rsi,%rdi) ;; CHECK: movb $0, (%rsi) define void @indexed-store-merge(i64 %p, i8* %v) { entry: diff --git a/test/CodeGen/X86/mmx-bitcast.ll b/test/CodeGen/X86/mmx-bitcast.ll index 9128e5c..30cf474 100644 --- a/test/CodeGen/X86/mmx-bitcast.ll +++ b/test/CodeGen/X86/mmx-bitcast.ll @@ -80,7 +80,7 @@ define i64 @t5(i32 %a, i32 %b) nounwind readnone { ; CHECK-NEXT: movd %esi, %xmm0 ; CHECK-NEXT: movd %edi, %xmm1 ; CHECK-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; CHECK-NEXT: movd %xmm1, %rax +; CHECK-NEXT: movq %xmm1, %rax ; CHECK-NEXT: retq %v0 = insertelement <2 x i32> undef, i32 %a, i32 0 %v1 = insertelement <2 x i32> %v0, i32 %b, i32 1 diff --git a/test/CodeGen/X86/mmx-cvt.ll b/test/CodeGen/X86/mmx-cvt.ll index 8f2da95..fd6c508 100644 --- a/test/CodeGen/X86/mmx-cvt.ll +++ b/test/CodeGen/X86/mmx-cvt.ll @@ -347,7 +347,7 @@ define <4 x float> @cvt_v2i32_v2f32(<1 x i64>*) nounwind { ; X64-NEXT: movq (%rdi), %mm0 ; X64-NEXT: paddd %mm0, %mm0 ; X64-NEXT: movd %mm0, %rax -; X64-NEXT: movd %rax, %xmm0 +; X64-NEXT: movq %rax, %xmm0 ; X64-NEXT: cvtdq2ps %xmm0, %xmm0 ; X64-NEXT: retq %2 = bitcast <1 x i64>* %0 to x86_mmx* diff --git a/test/CodeGen/X86/mod128.ll b/test/CodeGen/X86/mod128.ll index 4fdee11..ae28fab 100644 --- a/test/CodeGen/X86/mod128.ll +++ b/test/CodeGen/X86/mod128.ll @@ -18,7 +18,7 @@ define i64 @mod128(i128 %x) { ; WIN64-DAG: movq $0, 40(%rsp) ; WIN64-DAG: movq $3, 32(%rsp) ; WIN64: callq __modti3 - ; WIN64: movd %xmm0, %rax + ; WIN64: movq %xmm0, %rax %1 = srem i128 %x, 3 %2 = trunc i128 %1 to i64 diff --git a/test/CodeGen/X86/movmsk.ll b/test/CodeGen/X86/movmsk.ll index 1caa22a..e40f64e 100644 --- a/test/CodeGen/X86/movmsk.ll +++ b/test/CodeGen/X86/movmsk.ll @@ -100,7 +100,7 @@ entry: define void @float_call_signbit(double %n) { ; CHECK-LABEL: float_call_signbit: ; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: movd %xmm0, %rdi +; CHECK-NEXT: movq %xmm0, %rdi ; CHECK-NEXT: shrq $63, %rdi ; CHECK-NEXT: ## kill: %EDI %EDI %RDI ; CHECK-NEXT: jmp 
_float_call_signbit_callee ## TAILCALL
diff --git a/test/CodeGen/X86/nontemporal-2.ll b/test/CodeGen/X86/nontemporal-2.ll
index d1bb8d3..337e625 100644
--- a/test/CodeGen/X86/nontemporal-2.ll
+++ b/test/CodeGen/X86/nontemporal-2.ll
@@ -596,14 +596,14 @@ define void @test_extract_i64(<2 x i64> %arg, i64* %dst) {
 ; SSE2-LABEL: test_extract_i64:
 ; SSE2: # BB#0:
 ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
-; SSE2-NEXT: movd %xmm0, %rax
+; SSE2-NEXT: movq %xmm0, %rax
 ; SSE2-NEXT: movntiq %rax, (%rdi)
 ; SSE2-NEXT: retq
 ;
 ; SSE4A-LABEL: test_extract_i64:
 ; SSE4A: # BB#0:
 ; SSE4A-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
-; SSE4A-NEXT: movd %xmm0, %rax
+; SSE4A-NEXT: movq %xmm0, %rax
 ; SSE4A-NEXT: movntiq %rax, (%rdi)
 ; SSE4A-NEXT: retq
 ;
diff --git a/test/CodeGen/X86/post-ra-sched-with-debug.mir b/test/CodeGen/X86/post-ra-sched-with-debug.mir
new file mode 100644
index 0000000..ba5c859
--- /dev/null
+++ b/test/CodeGen/X86/post-ra-sched-with-debug.mir
@@ -0,0 +1,322 @@
+# RUN: llc -mtriple=x86_64-unknown-unknown -mcpu=btver2 -run-pass=post-RA-sched -o - %s | FileCheck %s
+
+# Test that multiple DBG_VALUE's following an instruction whose register needs
+# to be changed during the post-RA scheduler pass are updated correctly.
+
+# Test case was derived from the output from the following command and
+# the source code below:
+#
+# clang -S -emit-llvm -target x86_64 -march=btver2 -O2 -g -o - |
+# llc -stop-before=post-RA-sched -o -
+#
+# Source code reduced from the original 8MB source file:
+#
+# struct a;
+# class b {
+# public:
+# a *c = ap;
+# unsigned *d() { return (unsigned *)c; }
+# a *ap;
+# };
+# enum { e = 2 };
+# template <typename f> f *g(f *h, f *i) {
+# long j = long(i), k = -!h;
+# return reinterpret_cast<f *>(long(h) | k & j);
+# }
+# class l {
+# public:
+# l(int);
+# int m;
+# };
+# unsigned *n;
+# unsigned o;
+# class p {
+# public:
+# int aa();
+# unsigned *q() {
+# n = r.d();
+# return g(n, &o);
+# }
+# b r;
+# };
+# class s : l {
+# public:
+# p t;
+# s(int h) : l(h), ab(t), ac(~0 << h) { ae(); }
+# p &ab;
+# int ac;
+# void ae() {
+# const unsigned *v;
+# const unsigned u = 0;
+# v = ab.q();
+# const unsigned *x = g(v, &u);
+# int w = x[m] & ac;
+# while (w) {
+# int z = (ab.aa() - 1) / e;
+# if (m <= z)
+# return;
+# }
+# }
+# };
+# class ad {
+# public:
+# ~ad() {
+# for (y();;)
+# ;
+# }
+# class y {
+# public:
+# y() : af(0) {}
+# s af;
+# };
+# };
+# class ag {
+# ad ah;
+# };
+# enum ai {};
+# class aj {
+# public:
+# aj(unsigned(ai));
+# ag ak;
+# };
+# struct al {
+# static unsigned am(ai);
+# };
+# template <int> struct an : al { static aj ao; };
+# template <> aj an<0>::ao(am);
+
+--- |
+
+ %class.s = type <{ %class.l, [4 x i8], %class.p, %class.p*, i32, [4 x i8] }>
+ %class.l = type { i32 }
+ %class.p = type { %class.b }
+ %class.b = type { %struct.a*, %struct.a* }
+ %struct.a = type opaque
+
+ @n = local_unnamed_addr global i32* null, align 8
+ @o = global i32 0, align 4
+
+ define linkonce_odr void @_ZN1sC2Ei(%class.s*, i32) unnamed_addr #0 align 2 !dbg !4 {
+ %3 = alloca i32, align 4
+ %4 = bitcast %class.s* %0 to %class.l*
+ tail call void @_ZN1lC2Ei(%class.l* %4, i32 %1)
+ %5 = getelementptr inbounds %class.s, %class.s* %0, i64 0, i32 2
+ tail call void @llvm.dbg.value(metadata %class.p* %5, i64 0, metadata !10, metadata !17), !dbg !18
+ tail call void @llvm.dbg.value(metadata %class.p* %5, i64 0, metadata !20, metadata !17), !dbg !27
+ %6 = getelementptr inbounds %class.s, %class.s* %0, i64 0, i32 2, i32 0, i32 1
+ %7 = bitcast %struct.a** %6 to i64*
+ %8 = load i64, i64* %7,
align 8 + %9 = bitcast %class.p* %5 to i64* + store i64 %8, i64* %9, align 8 + %10 = getelementptr inbounds %class.s, %class.s* %0, i64 0, i32 3 + store %class.p* %5, %class.p** %10, align 8 + %11 = getelementptr inbounds %class.s, %class.s* %0, i64 0, i32 4 + %12 = shl i32 -1, %1 + store i32 %12, i32* %11, align 8 + store i32 0, i32* %3, align 4 + %13 = bitcast %class.p* %5 to i32** + %14 = load i32*, i32** %13, align 8 + store i32* %14, i32** @n, align 8 + %15 = icmp eq i32* %14, null + %16 = ptrtoint i32* %14 to i64 + %17 = select i1 %15, i64 ptrtoint (i32* @o to i64), i64 0 + %18 = or i64 %17, %16 + tail call void @llvm.dbg.value(metadata i32* %3, i64 0, metadata !29, metadata !35), !dbg !36 + tail call void @llvm.dbg.value(metadata i32* %3, i64 0, metadata !39, metadata !17), !dbg !44 + %19 = ptrtoint i32* %3 to i64 + call void @llvm.dbg.value(metadata i64 %19, i64 0, metadata !46, metadata !17), !dbg !48 + %20 = icmp eq i64 %18, 0 + %21 = select i1 %20, i64 %19, i64 0 + %22 = or i64 %21, %18 + %23 = inttoptr i64 %22 to i32* + %24 = bitcast %class.s* %0 to i32* + %25 = load i32, i32* %24, align 8 + %26 = sext i32 %25 to i64 + %27 = getelementptr inbounds i32, i32* %23, i64 %26 + %28 = load i32, i32* %27, align 4 + %29 = and i32 %12, %28 + %30 = icmp eq i32 %29, 0 + br i1 %30, label %47, label %31 + + ;